def calc_perplexity(model_type, file):
    with open(file, "r") as f:
        text = f.read()
    tokenizer = GPT2TokenizerFast.from_pretrained(model_type)
    encodings = tokenizer(text, return_tensors='pt')
    model = GPT2LMHeadModel.from_pretrained("./data/models/" + model_type)
    max_length = model.config.n_positions
    stride = 512

    lls = []
    for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, encodings.input_ids.size(1))
        trg_len = end_loc - i  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100  # only score the final trg_len tokens

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            log_likelihood = outputs[0] * trg_len  # outputs[0] is the mean NLL

        lls.append(log_likelihood)

    ppl = torch.exp(torch.stack(lls).sum() / end_loc)
    return ppl
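# A minimal usage sketch for calc_perplexity above; "eval.txt" and the bare
# "gpt2" checkpoint name are illustrative placeholders, not paths from the
# original project (the function expects fine-tuned weights under ./data/models/).
if __name__ == "__main__":
    ppl = calc_perplexity("gpt2", "eval.txt")
    print(f"perplexity: {ppl.item():.2f}")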
def train_tokenizer(data_path, wiki_text_file_path):
    # ToDo := Load if weights exist, else set up
    tokenizer_en = GPT2TokenizerFast.from_pretrained("gpt2")
    tokenizer_en.pad_token = tokenizer_en.eos_token
    vocab_size = tokenizer_en.vocab_size
    max_length = 1024

    tokenizer_es = ByteLevelBPETokenizer()
    tokenizer_es.train(
        files=[str(wiki_text_file_path)],
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[EOF_TOKEN]
    )
    tokenizer_es.enable_truncation(max_length=max_length)

    tokenizer_es_path = data_path/"BLBPE_tokenizer_es"
    tokenizer_es_path.mkdir(exist_ok=True, parents=True)
    tokenizer_es.save_model(str(tokenizer_es_path))
    tokenizer_es = GPT2TokenizerFast.from_pretrained(
        str(tokenizer_es_path), pad_token=EOF_TOKEN
    )
    tokenizer_es.model_max_length = max_length

    # tokenizer_es = ByteLevelBPETokenizer(
    #     vocab_file=str(tokenizer_es_path/"vocab.json"),
    #     merges_file=str(tokenizer_es_path/"merges.txt"),
    # )
    # tokenizer_es.enable_truncation(max_length=1024)

    # ToDo := is this necessary
    # tokenizer_en.pad_token = tokenizer_en.eos_token
    return tokenizer_en, tokenizer_es
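# Hedged usage sketch for train_tokenizer; the directory layout below is an
# assumption for illustration, not taken from the original project (EOF_TOKEN
# is assumed to be defined at module level).
from pathlib import Path

tokenizer_en, tokenizer_es = train_tokenizer(
    data_path=Path("data"),
    wiki_text_file_path=Path("data/eswiki.txt"),
)
print(tokenizer_es.model_max_length)  # 1024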
def load_tokenizer(location):
    if tf.io.gfile.exists(os.path.join(location, "merges.txt")):
        # use tf gfile in case the dictionary is remote
        fastok = GPT2TokenizerFast.from_pretrained(location)
        fastok.add_special_tokens({"eos_token": "<|endoftext|>"})
    else:
        if location.startswith("/"):
            raise ValueError("invalid location %s" % location)
        else:
            fastok = GPT2TokenizerFast.from_pretrained(location)
    return fastok
def generatePerplexity(generatedFilename, reportFilename):
    device = 'cuda'
    model_id = 'gpt2-medium'
    model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
    model.eval()
    tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

    scoreSum = 0
    totalCount = 0
    reportFile = open(reportFilename, 'w')
    #totalData = ["ivent to the sound of a couple of fuses : 29 per cent of the fine is for the use of this."]
    with open(generatedFilename, 'r') as fi:
        for index, line in enumerate(fi):
            if index % 5000 == 0:
                print(index)
            if '<ERROR>' in line:
                reportFile.write(line)
            else:
                input_sentence = line.strip()
                input_ids = torch.tensor(tokenizer.encode(input_sentence)).unsqueeze(0)
                input_ids = input_ids.to(device)
                with torch.no_grad():
                    outputs = model(input_ids, labels=input_ids)
                loss, logits = outputs[:2]
                ppl = math.exp(loss)
                scoreSum += ppl
                totalCount += 1
                reportFile.write(str(ppl) + '\n')
    reportFile.close()
    print(scoreSum / totalCount)
    print(index)
    print('DONE')
    return None
def load(self, counterfactual_model, evaluation_model="gpt2-medium"):
    counterfactual_model = counterfactual_model.strip().lower()
    self.MODELS_ARE_UNCASED = True
    if counterfactual_model == "sst-2" or counterfactual_model == "sst2":
        bert_mlm_name = 'bert-large-uncased'
        base_classifier_name = 'textattack/bert-base-uncased-SST-2'
    elif counterfactual_model == "imdb":
        bert_mlm_name = 'bert-large-uncased'
        base_classifier_name = 'textattack/bert-base-uncased-imdb'
    elif counterfactual_model == "ag_news":
        bert_mlm_name = 'bert-large-uncased'
        base_classifier_name = 'textattack/bert-base-uncased-ag-news'
    elif counterfactual_model == "german":
        self.MODELS_ARE_UNCASED = False
        bert_mlm_name = 'bert-base-german-cased'
        base_classifier_name = 'oliverguhr/german-sentiment-bert'
    else:
        assert False, "invalid dataset name"

    self.tokenizer = BertTokenizerFast.from_pretrained(bert_mlm_name)
    self.model = BertForMaskedLM.from_pretrained(bert_mlm_name).to(device)
    self.sentiment_tokenizer = AutoTokenizer.from_pretrained(
        base_classifier_name)
    self.sentiment_model = BertForSequenceClassification.from_pretrained(
        base_classifier_name).to(device)
    if evaluation_model is not None:
        self.perplexity_model = GPT2LMHeadModel.from_pretrained(
            evaluation_model).to(device)
        self.perplexity_tokenizer = GPT2TokenizerFast.from_pretrained(
            evaluation_model)
def __init__(self, config, dataset):
    super(GPT2Seq, self).__init__(config, dataset)

    self.pretrained_model_path = config['pretrained_model_path']
    self.tokenizer = GPT2TokenizerFast.from_pretrained(
        self.pretrained_model_path, pad_token='[PAD]')
    self.configuration = GPT2Config.from_pretrained(
        self.pretrained_model_path, pad_token_id=self.padding_token_idx)

    self.model = GPT2LMHeadModel.from_pretrained(
        self.pretrained_model_path, config=self.configuration)
    self.model.resize_token_embeddings(len(self.tokenizer))

    if config['task_type'] == "summarization":
        self.task_text = "TL;DR:"
    elif config['task_type'] == "translation":
        self.task_text = "story:"
    elif config['task_type'] == "multi_dialog":
        self.task_text = "question:"
    else:
        raise NotImplementedError(
            "Only summarization, translation and multi_dialog are supported.")

    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                    reduction='none')
def test_gpt2(self):
    for tokenizer_name in GPT2Tokenizer.pretrained_vocab_files_map["vocab_file"].keys():
        tokenizer_p = GPT2Tokenizer.from_pretrained(tokenizer_name)
        tokenizer_r = GPT2TokenizerFast.from_pretrained(tokenizer_name)

        # Check we have the same number of added_tokens for both pair and non-pair inputs.
        self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False))
        self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True))

        # Check we have the correct max_length for both pair and non-pair inputs.
        self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
        self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)

        # Assert the set of special tokens match.
        self.assertSequenceEqual(
            tokenizer_p.special_tokens_map.items(),
            tokenizer_r.special_tokens_map.items(),
            "GPT2 tokenizers don't have the same set of special_tokens",
        )

        # Assure tokenization overlap between python and rust impl.
        self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0)

        # Ensure add_tokens and add_special_tokens return the correct vocab size
        self.assert_add_tokens(tokenizer_r)

        # Check for offsets mapping
        self.assert_offsets_mapping(tokenizer_r)

        # Check for dynamic encoding sequence handling in batch_encode_plus
        self.assertRaises(ValueError, self.assert_batch_encode_dynamic_overflowing, tokenizer_r)

        # Check alignment for build_inputs_with_special_tokens
        self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
def get_encoder(model_path=None):
    if model_path is None:
        #model_path = 'gs://gpt-2/models/117M/'
        #model_path = os.path.dirname(__file__)
        from transformers import GPT2TokenizerFast
        tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
        return tokenizer
    with bucket_file(bucket_path(model_path, 'encoder.json')) as (vocab_path, vocab_data):
        with bucket_file(bucket_path(model_path, 'vocab.bpe')) as (bpe_merges_path, bpe_data):
            encoder = json.loads(vocab_data.decode('utf8'))
            if use_high_speed_tokenizer:
                tokenizer = HighSpeedTokenizer(vocab_path=vocab_path, bpe_merges_path=bpe_merges_path)
                tokenizer.encoder = encoder
                return tokenizer
            bpe_data = bpe_data.decode('utf8')
            bpe_merges = [
                tuple(merge_str.split())
                for merge_str in bpe_data.split('\n')[1:-1]
            ]
            return Encoder(
                encoder=encoder,
                bpe_merges=bpe_merges,
            )
def makeUnilabelModel(self, modelName, num_labels=10, root='', **kwargs):
    if modelName == 'distilbert-base-uncased':
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')
        model = DistilBertForSequenceClassification.from_pretrained(
            root + "distilbert-base-uncased", num_labels=num_labels, **kwargs)
    elif modelName == 'gpt2':
        tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model = GPT2ForSequenceClassification.from_pretrained(
            root + "gpt2", num_labels=num_labels, **kwargs)
        model.resize_token_embeddings(len(tokenizer))  # add padding token
        model.config.pad_token_id = tokenizer('[PAD]').input_ids[0]
    elif modelName == 'bertweet':
        tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
        model = AutoModelForSequenceClassification.from_pretrained(
            root + "vinai/bertweet-base", num_labels=num_labels, **kwargs)
    elif modelName == 'distilroberta-base':
        tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
        model = AutoModelForSequenceClassification.from_pretrained(
            root + "distilroberta-base", num_labels=num_labels, **kwargs)
    elif modelName == 'lstm':
        tokenizer = AutoTokenizer.from_pretrained(
            'distilbert-base-uncased')
        model = LSTMCclassifier(128, 64, 2, tokenizer.vocab_size, num_labels)
    else:
        # fail loudly instead of raising NameError on the return below
        raise ValueError(f"unknown modelName: {modelName}")
    return tokenizer, model
def __init__(self, model_id="gpt2", model_name_or_path="gpt2"):
    self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
    if torch.cuda.is_available():
        self.model = self.model.cuda()
    self.tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
    self.metric_name = "fwppl" if model_name_or_path == model_id else "ft-fwppl"
def tokenize_ita_file(tokenizer: GPT2TokenizerFast, src_path,
                      eot_token='<|endoftext|>', min_length=30, eot=[0]):
    token_ids = []
    doc = ''
    with open(src_path) as f:
        for line in tqdm(f):
            if line == eot_token + '\n':
                if len(doc) >= min_length:
                    token_ids.extend(tokenizer.encode(doc.strip()) + eot)
                doc = ''
                continue
            doc += line
    if doc != '':
        token_ids.extend(tokenizer.encode(doc.strip()) + eot)
    return token_ids
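# Usage sketch for tokenize_ita_file; the corpus path and output file are
# illustrative assumptions. Passing the tokenizer's real eos id via `eot`
# avoids the placeholder default of [0].
import numpy as np
from transformers import GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
ids = tokenize_ita_file(tok, "corpus_ita.txt", eot=[tok.eos_token_id])
np.save("corpus_ita_ids.npy", np.asarray(ids, dtype=np.int32))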
def get_bert_tokenizer(bert_model_type):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        if '-cased' in bert_model_type:
            do_lower_case = False
        else:
            do_lower_case = True  # default
        return BertTokenizerFast(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                                 do_lower_case=do_lower_case)
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        return RobertaTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
    elif bert_model_type in ['xlnet-base-cased']:
        if '-uncased' in bert_model_type:
            do_lower_case = True
        else:
            do_lower_case = False  # default
        return XLNetTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                              do_lower_case=do_lower_case)
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        return AlbertTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        tokenizer = GPT2TokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
        # https://github.com/huggingface/transformers/issues/3859
        tokenizer.pad_token = tokenizer.eos_token
        return tokenizer
    elif bert_model_type in ['transfo-xl']:
        return TransfoXLTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        if '-cased' in bert_model_type:
            do_lower_case = False
        else:
            do_lower_case = True  # default
        return DistilBertTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            do_lower_case=do_lower_case)
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')
def main(
    tokenizer,
    text_path,
    out_path,
    val_chance,
    max_length,
    batch_size,
):
    tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer)

    with open(text_path) as f, h5py.File(out_path, "w", libver="latest") as hdf5_f:
        texts = []
        hdf5_f.swmr_mode = True  # need concurrent reads from pytorch

        train_dataset = hdf5_f.create_dataset(
            "train",
            (0, max_length),
            maxshape=(None, max_length),
            dtype=np.int32,
            chunks=(CHUNK_SIZE, max_length),
        )
        val_dataset = hdf5_f.create_dataset(
            "val",
            (0, max_length),
            maxshape=(None, max_length),
            dtype=np.int32,
            chunks=(CHUNK_SIZE, max_length),
        )

        current_text = ""
        train_samples = []
        val_samples = []
        for line in tqdm(f):
            if line.isspace():
                texts.append(current_text)
                current_text = ""
                if len(texts) == batch_size:
                    batch_train_samples, batch_val_samples = get_samples(
                        tokenizer, texts, val_chance, max_length,
                    )
                    train_samples += batch_train_samples
                    val_samples += batch_val_samples
                    write_samples(train_samples, train_dataset, max_length)
                    write_samples(val_samples, val_dataset, max_length)
                    texts = []
            else:
                current_text += line

        if len(current_text) > 0:
            texts.append(current_text)
        if len(texts) > 0:
            batch_train_samples, batch_val_samples = get_samples(
                tokenizer, texts, val_chance, max_length
            )
            # fold the final partial batch into the buffers before the last flush
            train_samples += batch_train_samples
            val_samples += batch_val_samples
            write_samples(train_samples, train_dataset, max_length)
            write_samples(val_samples, val_dataset, max_length)
def __init__(
    self,
    tokenizer_name: str,
    model_name: str,
    utterance_window: int = 4,
):
    self.tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_name)
    self.model = TFAutoModelForCausalLM.from_pretrained(model_name, from_pt=True)
    self.utterance_window = utterance_window
def load_vocab(lang, model=''):
    if lang in LANG_TOKENIZERS:
        tokenizer = GPT2TokenizerFast.from_pretrained(LANG_TOKENIZERS[lang])
    else:
        tok_path = Path(model) / 'tokenizer.json'
        if not tok_path.exists():
            tok_path = Path('data') / lang / 'vocabularies' / 'tokenizer.json'
        tokenizer = Tokenizer.from_file(str(tok_path))
    vocab = tokenizer.get_vocab()
    return sorted(vocab, key=vocab.get)
def main(args: Namespace) -> None:
    if args.seed_everything:
        seed_everything(0)  # For reproducibility

    # Initialize tokenizer the same way we did when training (in MemesDataModule)
    tokenizer = GPT2TokenizerFast.from_pretrained(args.gpt2_model_type)
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    # Validate
    memes_module = MemesDataModule(args)
    model = GPT2.load_from_checkpoint(args.checkpoint, args=args, tokenizer=tokenizer)
    Trainer().test(model, datamodule=memes_module)
def __init__(self, index_fname, batch_size, restore_state=None, text_tokens=256):
    self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    self.tokenizer.pad_token = "<|endoftext|>"
    # registers a separator token and overrides the pad token set above
    self.tokenizer.add_special_tokens({'sep_token': '<|sep|>', 'pad_token': '<|pad|>'})

    def map_fn(example):
        tokenizer = self.tokenizer

        def decode(x):
            return tokenizer(["<|endoftext|>" + i.decode() for i in x])["input_ids"]

        texts = [
            decode(example["context_page_description"]),
            decode(example["context_section_description"]),
            decode(example["caption_reference_description"]),
            decode(example["caption_alt_text_description"]),
            decode(example["caption_attribution_description"]),
        ]

        output = []
        for text, dalle in zip(zip(*texts), example["dalle"]):
            # keep the last text_tokens - 1 text tokens, pad to fixed length,
            # then append <|sep|> and the offset DALL-E image token ids
            all_text = list(itertools.chain(*text))[-text_tokens + 1:]
            all_text += [tokenizer.pad_token_id] * ((text_tokens - 1) - len(all_text))
            assert len(all_text) == text_tokens - 1
            all_tokens = all_text + [tokenizer.sep_token_id] + list(dalle + tokenizer.vocab_size + 1)
            output.append(all_tokens)

        return np.array(output)

    def tf_parse(example_proto):
        features = {
            "page_title": tf.io.FixedLenFeature([], tf.string),
            "section_title": tf.io.FixedLenFeature([], tf.string),
            "hierarchical_section_title": tf.io.FixedLenFeature([], tf.string),
            "caption_reference_description": tf.io.FixedLenFeature([], tf.string),
            "caption_attribution_description": tf.io.FixedLenFeature([], tf.string),
            "caption_alt_text_description": tf.io.FixedLenFeature([], tf.string),
            "mime_type": tf.io.FixedLenFeature([], tf.string),
            "context_page_description": tf.io.FixedLenFeature([], tf.string),
            "context_section_description": tf.io.FixedLenFeature([], tf.string),
            "dalle": tf.io.FixedLenFeature([1024], tf.int64),
        }
        parsed_features = tf.io.parse_single_example(example_proto, features)
        return parsed_features

    super().__init__(index_fname, batch_size, tf_parse, map_fn, restore_state=restore_state)
def main(text_path, out_directory):
    Path(out_directory).mkdir(exist_ok=True, parents=True)
    english_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    german_tokenizer = tokenizers.ByteLevelBPETokenizer()
    german_tokenizer.train(
        [text_path],
        vocab_size=english_tokenizer.vocab_size,
        special_tokens=["<|endoftext|>"],
        show_progress=True,
    )
    german_tokenizer.save_model(out_directory)
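# A sketch of reloading the tokenizer trained by main above as a
# GPT2TokenizerFast, mirroring the save_model/from_pretrained round trip used
# in train_tokenizer earlier in this section; the output directory name is a
# placeholder.
from transformers import GPT2TokenizerFast

german_fast = GPT2TokenizerFast.from_pretrained("out/german_bpe")
print(german_fast.vocab_size)  # should match the GPT-2 vocab size it was trained to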
def __init__(self, vocab_file=None, fast=True):
    name = 'HFGPT2Tokenizer'
    if fast:
        name += "Fast"
    super().__init__(name)
    if vocab_file is None:
        vocab_file = 'gpt2'
    if fast:
        self.tokenizer = GPT2TokenizerFast.from_pretrained(vocab_file)
    else:
        self.tokenizer = GPT2Tokenizer.from_pretrained(vocab_file)
    self.tokenizer.add_special_tokens({'pad_token': '<|padding|>'})
    self.eod_id = self.tokenizer.eos_token_id
    self.pad_id = self.tokenizer.pad_token_id
def main(cfg):
    # load the tokenizer
    tokenizer = GPT2TokenizerFast.from_pretrained(cfg.PATH.tokenizer)
    model = AutoModelForCausalLM.from_pretrained(cfg.PATH.output_dir).cuda()

    bos = tokenizer.bos_token
    eos = tokenizer.eos_token
    sep = tokenizer.sep_token

    talk_log = []
    while True:
        # user input stage ("종료" means "quit")
        user_input = ""
        while True:
            utterance = input("me: ")
            if not utterance:
                break
            elif utterance == "종료":
                exit()
            user_input += (f" {sep} " if user_input else "") + utterance
        talk_log.append(f"{bos}{user_input}{eos}")

        # model prediction
        input_text = "".join(talk_log[-4:] if talk_log else "") + bos
        input_ids = tokenizer.encode(input_text, return_tensors="pt").cuda()
        input_len = input_ids.shape[-1]
        output = model.generate(
            input_ids,
            max_length=256,
            # do_sample=True,
            top_k=10,
            top_p=0.95,
            no_repeat_ngram_size=2,
            early_stopping=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=0,
        )[0, input_len - 1:].cpu()
        talk_log.append(tokenizer.decode(output))
        model_output = tokenizer.decode(output[1:-1])

        # print the model's reply
        for utterance in model_output.split("<sep>"):
            print(f"AI: {utterance.strip()}")
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    logger = tf.get_logger()
    logger.propagate = False

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    gpt2_tok = GPT2TokenizerFast.from_pretrained(FLAGS.tokenizer_dir)
    writer = tf.python_io.TFRecordWriter(FLAGS.output_file + ".tfrecord")
    eos_id = gpt2_tok.eos_token_id

    all_examples = []
    for input_file in input_files:
        queue = []
        example = []
        with tf.gfile.GFile(input_file, "r") as reader:
            for line in reader.readlines():
                if line == "\n":
                    queue.append(eos_id)
                else:
                    line = line.replace("\n", " ")
                    line = line.strip()
                    enc_line = gpt2_tok.encode(line)
                    queue.extend(enc_line)
                if len(queue) > FLAGS.max_len + 1:
                    example = [queue.pop(0) for _ in range(FLAGS.max_len + 1)]
                    assert len(example) == FLAGS.max_len + 1
                    all_examples.append(example)

    for i, ex in enumerate(all_examples):
        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(ex)
        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        writer.write(tf_example.SerializeToString())
        if i < FLAGS.num_examples_print:
            tf.logging.info("*** Example ***")
            tf.logging.info("Length: %d" % len(ex))
            tf.logging.info("Tokens: %s" % gpt2_tok.decode(ex))
            tf.logging.info("ids: %s" % " ".join([str(x) for x in ex]))

    tf.logging.info("Wrote %d total instances", len(all_examples))
def get_tokenizer(tokenizer_type=None, from_pretrained=True, add_padding_token=False):
    # check for None first so we never call .lower() on a missing type
    if tokenizer_type is None or (tokenizer_type.lower() == "hf_gpt2tokenizerfast" and from_pretrained):
        tok = GPT2TokenizerFast.from_pretrained('gpt2')
        if add_padding_token:
            tok.add_special_tokens({'pad_token': '<|padding|>'})
        return tok
    elif tokenizer_type.lower() == "hf_gpt2tokenizer" and from_pretrained:
        tok = GPT2Tokenizer.from_pretrained('gpt2')
        if add_padding_token:
            tok.add_special_tokens({'pad_token': '<|padding|>'})
        return tok
    else:
        raise NotImplementedError('TODO: add custom tokenizers')
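# Usage sketch for get_tokenizer; the printed pad id of 50257 follows from
# add_special_tokens appending one token after GPT-2's 50,257-entry vocab
# (the same behavior fetch_encoder's comment notes later in this section).
tok = get_tokenizer("hf_gpt2tokenizerfast", add_padding_token=True)
print(tok.pad_token, tok.pad_token_id)  # <|padding|> 50257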
def create_tfrecords(files, args):
    GPT2TokenizerFast.max_model_input_sizes['gpt2'] = 1e20  # disables a misleading warning
    enc = GPT2TokenizerFast.from_pretrained('gpt2')

    random.seed(args.seed)

    data_to_prepend = []
    all_sequences_across_epochs = []
    ep_len = None

    for ep_ix in range(args.n_repack_epochs):
        tokenized_files_array = []

        if args.preserve_data_order:
            files = sorted(files)
        else:
            random.shuffle(files)

        print(f'starting epoch {ep_ix}\n\t{len(all_sequences_across_epochs)} sequences so far\n\t{len(data_to_prepend)} tokens rolled over from last epoch\n\tfirst file this ep is {files[0]}')

        for f in tqdm(files, mininterval=10, smoothing=0):
            for tokenized_files in archive_to_tokens(f, enc, args, prefix=data_to_prepend):
                # if the last chunk < chunk size, take it and append it to the beginning of the next file
                data_to_prepend = []
                n_tokens = len(tokenized_files[-1])
                if n_tokens < 2049:  # 2048-token context plus one shifted target
                    data = tokenized_files.pop(-1)
                    data_to_prepend = data
                tokenized_files_array.extend(tokenized_files)

        if not args.preserve_data_order:
            random.shuffle(tokenized_files_array)

        if args.min_unique_tokens > 0:
            tokenized_files_array = list(
                enforce_min_unique(tokenized_files_array, args.min_unique_tokens, enc, args.verbose))

        all_sequences_across_epochs.extend(tokenized_files_array)

        if ep_ix == 0:
            ep_len = len(tokenized_files_array)

    total_sequence_len = len(all_sequences_across_epochs)
    fp = os.path.join(args.output_dir, f"{args.name}_{total_sequence_len}.tfrecords")
    write_tfrecord(all_sequences_across_epochs, fp)
def __init__(self, api, model_name, batch_size=1):
    self.api = api
    self.id = model_name
    self.ckpt = tf.train.latest_checkpoint(os.path.join(api.model_path, model_name))
    if self.ckpt is None:
        raise ValueError("Couldn't load checkpoint for {model_name} from {path}".format(
            model_name=model_name, path=os.path.join(api.model_path, model_name)))
    self.graph = tf.Graph()
    self.config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
    self.session = tf.Session(graph=self.graph, config=self.config)
    #self.encoder = encoder.get_encoder(model_name, self.api.model_path)
    self.encoder = GPT2TokenizerFast.from_pretrained("gpt2")
    self.hparams = model.default_hparams()

    with open(os.path.join(self.api.model_path, model_name, 'hparams.json')) as f:
        params = json_load(f)
        self.hparams.override_from_dict(params)

    with self.session.as_default() as sess, self.graph.as_default() as graph:
        pp(self.session.list_devices())
        if 'CUDA_VISIBLE_DEVICES' in os.environ:
            print('Using /gpu:0 on device {}'.format(os.environ['CUDA_VISIBLE_DEVICES']))
        with tf.device('/gpu:0' if 'CUDA_VISIBLE_DEVICES' in os.environ else None):
            self.batch_size = batch_size
            self.context = tf.placeholder(tf.int32, [self.batch_size, None], name="context")
            self.length = tf.placeholder(tf.int32, (), name="length")
            self.temperature = tf.placeholder(tf.float32, (), name="temperature")
            self.top_k = tf.placeholder(tf.int32, (), name="top_k")
            self.top_p = tf.placeholder(tf.float32, (), name="top_p")
            self.frequency_penalty = tf.placeholder(tf.float32, (), name="frequency_penalty")
            #np.random.seed(seed)
            #tf.set_random_seed(seed)
            self.output = sample.sample_sequence(
                hparams=self.hparams,
                length=self.length,
                context=self.context,
                batch_size=self.batch_size,
                temperature=self.temperature,
                top_k=self.top_k,
                top_p=self.top_p,
                frequency_penalty=self.frequency_penalty,
            )
            var_list = tf.trainable_variables()
            self.saver = tf.train.Saver(var_list=var_list)
            for v in var_list:
                print(self.ckpt, v)
            pp(self.hparams)
            print('Restoring from {!r}'.format(self.ckpt))
            self.saver.restore(sess, self.ckpt)
def get_tokenizer(train_data, vocab_size):
    """
    Trains and returns a byte-level BPE tokenizer. If a cached tokenizer with
    these parameters exists, it is loaded instead of training a new tokenizer.

    :param train_data: list of dataset files
    :param vocab_size: BPE vocab size
    :return: GPT2TokenizerFast with the requested parameters.
    """
    assert vocab_size >= 257, 'vocab size must cover all possible bytes and one special token'

    # calculate the name of the cached file
    m = hashlib.md5()
    m.update(str(vocab_size).encode())
    for file in train_data:
        m.update(file.encode())
    cache_id = m.hexdigest()
    cached_tokenizer_file = os.path.join(CACHE_DIR, 'tokenizer_{}'.format(cache_id))

    train_new_tokenizer = not os.path.exists(cached_tokenizer_file)
    if train_new_tokenizer:
        start = time.time()
        os.makedirs(cached_tokenizer_file)
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(
            train_data,
            vocab_size=vocab_size,
            special_tokens=['<|endoftext|>'],
            show_progress=False,
        )
        tokenizer.save_model(cached_tokenizer_file)
        logger.info(f"Trained tokenizer {cached_tokenizer_file} [took %.3f s]", time.time() - start)

    start = time.time()
    tokenizer = GPT2TokenizerFast.from_pretrained(cached_tokenizer_file)
    tokenizer.cache_id = cache_id
    if not train_new_tokenizer:
        logger.info(
            f"Loaded tokenizer from {cached_tokenizer_file} [took %.3f s]", time.time() - start)

    return tokenizer
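# Usage sketch for the cached get_tokenizer; file names are placeholders.
# A second call with the same arguments hits the md5-keyed cache instead of
# retraining.
tok = get_tokenizer(['train.txt', 'valid.txt'], vocab_size=8000)
tok_again = get_tokenizer(['train.txt', 'valid.txt'], vocab_size=8000)
assert tok.cache_id == tok_again.cache_id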
def __init__(self, gpt2_pretrained_model="gpt2-medium", gpt2_gpu_id=-1, **kargs):
    """Initialize GPT2 model."""
    super(GPT2GrammarQualityMetric, self).__init__()
    logger.info("load gpt2 model.")
    self._tokenizer = GPT2TokenizerFast.from_pretrained(
        utils.get_transformers(gpt2_pretrained_model))
    if gpt2_gpu_id == -1:
        logger.warning("GPT2 metric is running on CPU.")
        self._device = torch.device("cpu")
    else:
        logger.info("GPT2 metric is running on GPU %d.", gpt2_gpu_id)
        self._device = torch.device("cuda:%d" % gpt2_gpu_id)
    self._model = GPT2LMHeadModel.from_pretrained(
        utils.get_transformers(gpt2_pretrained_model)).to(self._device)
def fetch_encoder(params):
    no_dataset = params.get('no_dataset', False)
    if no_dataset:
        return None

    # Get the first value from the dict
    dataset = next(iter(params['dataset_configs'].values()))
    path = dataset["tokenizer_path"]
    is_pretrained = dataset.get("tokenizer_is_pretrained", False)

    if is_pretrained:
        tok = GPT2TokenizerFast.from_pretrained(path)
        # Will add a padding token id of 50257 at run-time
        tok.add_special_tokens({'pad_token': '<|padding|>'})
        return tok

    return Tokenizer.from_file(path)
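# Illustrative shape of the params dict fetch_encoder expects, inferred from
# the lookups above; the keys and values here are assumptions for the sketch,
# not a documented schema.
params = {
    "no_dataset": False,
    "dataset_configs": {
        "my_dataset": {
            "tokenizer_path": "gpt2",
            "tokenizer_is_pretrained": True,
        }
    },
}
enc = fetch_encoder(params)  # returns a GPT2TokenizerFast with <|padding|> added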
def __init__(self, args: Namespace) -> None:
    super().__init__()
    self.gpu_boole = torch.cuda.is_available()
    if args.accelerator == 'ddp_spawn':
        self.num_cpus = 0
    elif args.accelerator is None:
        self.num_cpus = cpu_count()
    else:
        self.num_cpus = 1
    # There should be no parallelism: stop warnings
    # environ['TOKENIZERS_PARALLELISM'] = 'false' (maybe it should actually be 'true'?)
    self.tokenizer = GPT2TokenizerFast.from_pretrained(
        args.gpt2_model_type)
    # Make sure pad token is also <|endoftext|>
    self.tokenizer.add_special_tokens(
        {'pad_token': self.tokenizer.eos_token})
    # Define custom collate function for data loader to tokenize batch properly
    self.collate_fn = lambda batch: self.tokenizer(
        batch, return_tensors='pt', padding=True, truncation=True)
    # Save hyperparameters using hack b/c this is a data module
    self.hparams = args
def load(self):
    self._tokenizer = GPT2TokenizerFast.from_pretrained(
        "gpt2")  # use_fast=True
    self._special_word = '<|bed|>'
    special_tokens_dict = {
        'additional_special_tokens': [self._special_word]
    }
    self._tokenizer.add_special_tokens(special_tokens_dict)
    self._tokenizer.pad_token = self._tokenizer.eos_token
    self._special_token = self.get_token(self._special_word)
    print(
        f"Special token has the number: {self._special_token}. Hopefully this does not vary!"
    )
    if os.path.exists(self._model_path):
        print('loading')
        self._model = GPT2LMHeadModel.from_pretrained(
            self._model_path, pad_token_id=self._tokenizer.eos_token_id)
    return self
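# Sketch: instead of hoping the special-token id stays stable (see the print
# in load above), pin it with an assertion; 50257 is the id add_special_tokens
# assigns to the first token added on top of GPT-2's 50,257-entry vocabulary.
from transformers import GPT2TokenizerFast

_tok = GPT2TokenizerFast.from_pretrained("gpt2")
_tok.add_special_tokens({'additional_special_tokens': ['<|bed|>']})
assert _tok.convert_tokens_to_ids('<|bed|>') == 50257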
def __init__(self):
    self.params = {
        "layers": 28,
        "d_model": 4096,
        "n_heads": 16,
        "n_vocab": 50400,
        "norm": "layernorm",
        "pe": "rotary",
        "pe_rotary_dims": 64,
        "seq": 2048,
        "cores_per_replica": 8,
        "per_replica_batch": 1,
        "sampler": nucleaus_sample,
        "optimizer": optax.scale(0)
    }
    self.tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    self.queue_ids = {}
    self.qidx = 0
    self.queue = Queue()
    self.network = None
    self.lock = threading.Lock()
    self._alive_time = timer()