def evaluation(args):
    # start experiment
    batch_size = 1
    fname = args.file
    DATASET = args.dataset
    model_pt = args.model_pt
    method = nltk.translate.bleu_score.SmoothingFunction(0.000000001).method1

    # load data
    tokenizer = GPT2Tokenizer.from_pretrained("./%s/gpt" % DATASET)
    tokenizer.bos_token = '<BOS>'
    tokenizer.pad_token = "<PAD>"
    tokenizer.add_tokens(['<negative>'])
    tokenizer.add_tokens(['<positive>'])
    tokenizer.add_tokens(['<PAD>'])
    tokenizer.add_tokens(['<BOS>'])

    if DATASET == "formality_family":
        with open("./%s/formality-gpt.test.json" % (DATASET), "r") as f:
            data = json.load(f)
    else:
        with open("./%s/sentiment-gpt.test.json" % (DATASET), "r") as f:
            data = json.load(f)

    if DATASET != "imdb":
        test_data = Dataloader.GPTRefLoader(data, tokenizer, batch_size, args.cuda)
    else:
        test_data = Dataloader.GPTLoader(data, tokenizer, batch_size, args.cuda)

    # build model
    generator = GPT2LMHeadModel.from_pretrained("./%s/gpt" % DATASET)
    generator.resize_token_embeddings(len(tokenizer))
    if args.model_pt is not None:
        generator.load_state_dict(torch.load(args.model_pt))
    if args.cuda:
        generator = generator.cuda()
    generator.eval()

    generate_output(generator, args, test_data, tokenizer,
                    BATCH_SIZE=batch_size, fname=fname, dname=DATASET)

    if DATASET == "yelp":
        result = evaluate_file_yelp(fname, torch.device('cuda:%d' % args.gpuid),
                                    learned=False, is_test=True)
    elif DATASET == "amazon":
        result = evaluate_file_amazon(fname, torch.device('cuda:%d' % args.gpuid),
                                      learned=False, is_test=True)
    elif DATASET == "imdb":
        result = evaluate_file_imdb(fname, torch.device('cuda:%d' % args.gpuid),
                                    learned=False, is_test=True)
    else:
        result = evaluate_file_formality(fname, torch.device('cuda:%d' % args.gpuid),
                                         learned=False, is_test=True)
    print(result)
    return
epsilon = 1e-8

# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

SAVE_PATH = "/mnt/nfs/work1/llcao/zonghaiyao/LM/"

# I'm not really doing anything with the config, but here it is
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token='<|endoftext|>')  # gpt2-medium

# instantiate the model
model = rerankGPT2LMHeadModel_token_type_embeddings01.from_pretrained(
    "gpt2",
    config=configuration,
    MAX_LEN=MAX_LEN,
    CAN_NUM=CAN_NUM,
    num_of_rerank=num_of_rerank)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
def __init__(self, model_path, tokenizer_path):
    self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_path)
def train(batch_size=20,
          epochs=10,
          lr=0.00002,
          warmup=10000,
          checkpoint=True,
          samples=10,
          blocksize=512):
    from transformers import (
        GPT2Tokenizer, GPT2LMHeadModel, CONFIG_MAPPING,
        MODEL_WITH_LM_HEAD_MAPPING, AutoConfig, AutoModelWithLMHead,
        AutoTokenizer, DataCollatorForLanguageModeling,
        get_linear_schedule_with_warmup, HfArgumentParser,
        LineByLineTextDataset, PreTrainedTokenizer, TextDataset, Trainer,
        TrainingArguments, set_seed, WEIGHTS_NAME, CONFIG_NAME)

    print("Models Will Be Checkpointed : ", checkpoint, " Saving Training Params")
    save_training_params(batch_size, epochs, lr, warmup, checkpoint, samples,
                         blocksize, output_dir)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup,
                                                num_training_steps=-1)

    script_count = 0
    sum_loss = 0.0
    batch_count = 0

    language_dataset = ProgrammingLanguagesDataset(tokenizer=tokenizer,
                                                   samples=samples,
                                                   block_size=blocksize)
    script_loader = DataLoader(language_dataset)

    for epoch in range(epochs):
        if epoch % 2 == 0 and epoch != 0 and checkpoint:
            checkpoint_model(model, tokenizer, output_dir + str(epoch))
        print("Starting Epoch : ", epoch)

        for _, script in enumerate(script_loader):
            outputs = model(script.to(device), labels=script.to(device))
            loss, logits = outputs[:2]
            loss.backward()
            sum_loss = sum_loss + loss.detach().data
            script_count = script_count + 1

            if script_count == batch_size:
                script_count = 0
                batch_count += 1
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            if batch_count == 200:
                model.eval()
                print(f"sum loss {sum_loss}")
                sample_outputs = model.generate(bos_token_id=random.randint(1, 30000),
                                                do_sample=True,
                                                top_k=50,
                                                max_length=200,
                                                top_p=0.95,
                                                num_return_sequences=1)
                print("Output:\n" + 100 * '-')
                for i, sample_output in enumerate(sample_outputs):
                    print("{}: {}".format(
                        i, tokenizer.decode(sample_output, skip_special_tokens=True)))
                batch_count = 0
                sum_loss = 0.0
                model.train()

    checkpoint_model(model, tokenizer, output_dir + str(epochs))
def main():
    parser = argparse.ArgumentParser(
        description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)."
    )
    parser.add_argument("--file_path", type=str, default="data/dump.txt",
                        help="The path to the data.")
    parser.add_argument("--tokenizer_type", type=str, default="bert",
                        choices=["bert", "roberta", "gpt2"])
    parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased",
                        help="The tokenizer to use.")
    parser.add_argument("--dump_file", type=str, default="data/dump",
                        help="The dump file prefix.")
    args = parser.parse_args()

    logger.info(f"Loading Tokenizer ({args.tokenizer_name})")
    if args.tokenizer_type == "bert":
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map["cls_token"]  # `[CLS]`
        sep = tokenizer.special_tokens_map["sep_token"]  # `[SEP]`
    elif args.tokenizer_type == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map["cls_token"]  # `<s>`
        sep = tokenizer.special_tokens_map["sep_token"]  # `</s>`
    elif args.tokenizer_type == "bertweet":
        tokenizer = BertweetTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map["cls_token"]  # `<s>`
        sep = tokenizer.special_tokens_map["sep_token"]
    elif args.tokenizer_type == "gpt2":
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map["bos_token"]  # `<|endoftext|>`
        sep = tokenizer.special_tokens_map["eos_token"]  # `<|endoftext|>`

    logger.info(f"Loading text from {args.file_path}")
    with open(args.file_path, "r", encoding="utf8") as fp:
        data = fp.readlines()

    logger.info("Start encoding")
    logger.info(f"{len(data)} examples to process.")

    rslt = []
    iter = 0
    interval = 10000
    start = time.time()
    for text in data:
        text = f"{bos} {text.strip()} {sep}"
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        rslt.append(token_ids)

        iter += 1
        if iter % interval == 0:
            end = time.time()
            logger.info(f"{iter} examples processed. - {(end-start):.2f}s/{interval}expl")
            start = time.time()
    logger.info("Finished binarization")
    logger.info(f"{len(data)} examples processed.")

    dp_file = f"{args.dump_file}.pickle"
    vocab_size = tokenizer.vocab_size
    if vocab_size < (1 << 16):
        rslt_ = [np.uint16(d) for d in rslt]
    else:
        rslt_ = [np.int32(d) for d in rslt]
    random.shuffle(rslt_)

    logger.info(f"Dump to {dp_file}")
    with open(dp_file, "wb") as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
def create_app():
    app = Flask(__name__)

    def sample_sequence(model,
                        length,
                        context,
                        num_samples=1,
                        temperature=1,
                        top_k=0,
                        top_p=0.9,
                        repetition_penalty=1.0,
                        device="cuda"):
        context = torch.tensor(context, dtype=torch.long, device=device)
        context = context.unsqueeze(0).repeat(num_samples, 1)
        generated = context
        with torch.no_grad():
            for _ in trange(length):
                inputs = {"input_ids": generated}
                outputs = model(**inputs)
                next_token_logits = outputs[0][0, -1, :] / (
                    temperature if temperature > 0 else 1.0)
                for _ in set(generated.view(-1).tolist()):
                    next_token_logits[_] /= repetition_penalty
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=top_k, top_p=top_p)
                if temperature == 0:
                    next_token = torch.argmax(filtered_logits).unsqueeze(0)
                else:
                    next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                                   num_samples=1)
                generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
        return generated

    def get_output(model, input_text, tokenizer):
        indexed_tokens = tokenizer.encode(input_text)
        output = sample_sequence(model, num_words, indexed_tokens, device=device)
        return tokenizer.decode(output[0, 0:].tolist(),
                                clean_up_tokenization_spaces=True,
                                skip_special_tokens=True)

    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
    model = GPT2LMHeadModel.from_pretrained("distilgpt2")
    model.eval()

    @app.route('/predict', methods=['POST'])
    def predict():
        if request.method == 'POST':
            lines = request.get_json(force=True)
            input_text = lines['input_text']
            time_now = time.time()
            output_text = get_output(model, input_text, tokenizer)
            time_to_predict = time.time() - time_now
            return jsonify({
                'input_text': input_text,
                'output_text': output_text,
                'prediction_time': time_to_predict
            })

    return app
# Downloads model(s) during docker build
from transformers import GPT2LMHeadModel, GPT2Tokenizer

GPT2LMHeadModel.from_pretrained('gpt2').save_pretrained('./gpt2')
GPT2Tokenizer.from_pretrained('gpt2').save_pretrained('./gpt2')
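# A minimal runtime sketch (not part of the original snippet): once the image is
# built, the cached copies can be loaded from the local ./gpt2 folder instead of
# being downloaded from the hub. Assumes the folder produced above is baked into
# the image's working directory.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('./gpt2')
model = GPT2LMHeadModel.from_pretrained('./gpt2')
model.eval()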
def generate_text(seed, length, prompt_text):
    # set up
    model_path = '../output10k'
    device = 'cpu'

    # set seed
    set_seed(seed)

    # Initialize the model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)

    genLength = adjust_length_to_model(
        length, max_sequence_length=model.config.max_position_embeddings)

    # Encode prompt
    prompt_text = 'Question: ' + prompt_text + ' Answer: '
    encoded_prompt = tokenizer.encode(prompt_text,
                                      add_special_tokens=False,
                                      return_tensors="pt")
    encoded_prompt = encoded_prompt.to(device)

    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=genLength + len(encoded_prompt[0]),
        temperature=1,
        top_k=0,
        top_p=0.9,
        repetition_penalty=1,
        do_sample=True,
        num_return_sequences=1,
    )

    # Remove the batch dimension when returning multiple sequences
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

        # remove prompt
        text = text[len(tokenizer.decode(encoded_prompt[0],
                                         clean_up_tokenization_spaces=True)):]

        # Remove all text after the stop token
        text = text[:text.find('Answer:') if text.find('Answer:') > 0 else None]

        # Add the prompt at the beginning of the sequence. Remove the excess text
        # that was used for pre-processing.
        total_sequence = prompt_text + text
        generated_sequences.append(total_sequence)
        print(total_sequence)

    return generated_sequences
def load_gpt2() -> Any:
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2',
                                            pad_token_id=tokenizer.eos_token_id)
    return model, tokenizer
def train(data_folder):
    checkpoint = False  # set to True if continuing to train our model, o/w False
    # set to True to chat with the unaltered GPT-2 model (at bottom of notebook)
    baseline = False
    model_file = '/gpt-2_epoch_0'

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2DoubleHeadsModel.from_pretrained('gpt2')

    csv_file = data_folder + '/processed_data_final.csv'

    genre_dict = {'comedy': '<comedy>', 'sport': '<sport>', 'biography': '<biography>',
                  'romance': '<romance>', 'action': '<action>', 'adventure': '<adventure>',
                  'drama': '<drama>', 'sci-fi': '<sci-fi>', 'family': '<family>',
                  'fantasy': '<fantasy>', 'musical': '<musical>', 'crime': '<crime>',
                  'thriller': '<thriller>', 'short': '<short>', 'western': '<western>',
                  'documentary': '<documentary>', 'horror': '<horror>',
                  'animation': '<animation>', 'film-noir': '<film-noir>',
                  'music': '<music>', 'war': '<war>', 'mystery': '<mystery>'}
    genres = genre_dict.keys()
    special_tokens = ["<speaker1>", "<speaker2>"] + \
        ["<" + genre + ">" for genre in genres]
    SPECIAL_TOKENS = {"bos_token": "<bos>", "eos_token": "<eos>",
                      "additional_special_tokens": special_tokens,
                      "pad_token": "<pad>"}

    if not baseline:
        tokenizer.add_special_tokens(SPECIAL_TOKENS)
        model.resize_token_embeddings(len(tokenizer))

    if not baseline:
        ngpu = 0
        for param in model.parameters():
            param.requires_grad = False
        # Parameters of newly constructed modules have requires_grad=True by default
        model.lm_head = nn.Linear(model.lm_head.in_features, len(tokenizer))
        model.multiple_choice_head.summary = nn.Linear(
            model.multiple_choice_head.summary.in_features, 1, bias=True)
        # retrain final fc layer and mc layer for language modeling task
        device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0) else "cpu")
        model = model.to(device)

    if checkpoint:
        model.load_state_dict(torch.load(model_file))

    pkl_file = data_folder + '/dialogue_data.pkl'
    dataset = DialogueDataset(pkl_file=pkl_file)
    data_size = dataset.__len__()
    batch_size = 4
    train_size = .8
    shuffle_dataset = True
    # random_seed = random.randint(1, 10000)
    random_seed = 42

    # use indexing info from dataset for splitting groups
    gss = GroupShuffleSplit(n_splits=1, train_size=train_size,
                            random_state=random_seed)  # group stratified CV
    df = get_df_data(csv_file)
    for train_idx, val_idx in gss.split(df, df['sentence_2'], df['index']):
        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetRandomSampler(val_idx)

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                    sampler=valid_sampler)

    # params
    lm_losses = []
    mc_losses = []
    total_losses = []
    lm_losses_val = []
    mc_losses_val = []
    total_losses_val = []
    iters = 0
    lm_coef = 2.0
    mc_coef = 1.0
    num_epochs = 3
    lr = 6.25e-5
    max_grad_norm = 1.0
    num_training_steps = (data_size // batch_size) * num_epochs
    warmup_proportion = 0.1
    num_warmup_steps = num_training_steps * .1
    grad_accum_steps = 8

    # In Transformers, optimizer and schedules are split and instantiated like this:
    # To reproduce BertAdam specific behavior set correct_bias=False
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps, num_training_steps)  # PyTorch scheduler
    # scheduler = PiecewiseLinear(optimizer, "lr", [(0, lr), (num_epochs * len(train_loader), 0.0)])

    print("Starting Training Loop...")
    min_total_loss = 4000

    # For each epoch
    for epoch in range(num_epochs):
        # checkpoints
        if epoch > 0:
            torch.save(model.state_dict(), "/gpt-2_epoch_{}".format(epoch))

        # For each batch in the dataloader
        for i, data in enumerate(train_loader, 0):
            model.train()
            input_ids = data[0]
            token_type_ids = data[1]
            mc_token_ids = data[2]
            lm_labels = data[3]
            mc_labels = data[4]

            output = model(input_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels,
                           token_type_ids=token_type_ids, lm_labels=lm_labels)
            lm_loss = output[0]
            mc_loss = output[1]
            total_loss = lm_loss * lm_coef + mc_loss * mc_coef / grad_accum_steps
            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            if i % grad_accum_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            # Output training stats
            if i % 50 == 0:
                print('[%d/%d][%d/%d]\tLoss LM: %.4f\tLoss MC: %.4f\tLoss total:%.4f'
                      % (epoch, num_epochs, i, len(train_loader),
                         lm_loss.item(), mc_loss.item(), total_loss.item()))

            # Save Losses for plotting later
            lm_losses.append(lm_loss.item())
            mc_losses.append(mc_loss.item())
            total_losses.append(total_loss.item())

            curr_total_loss = total_loss.item()
            if curr_total_loss <= min_total_loss:
                min_total_loss = curr_total_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                run.log('best_min_loss', np.float(min_total_loss))

            iters += 1

    return model
import os
from .template import get_template
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

output_dir = os.path.abspath(
    os.path.join(os.path.realpath(__file__), '..', 'model_weights'))

model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = model.cuda()


def generate_meme(seed, id):
    model.eval()
    prompt = f"<|startoftext|> [{get_template(int(id))['label']}] {seed}"
    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    generated = generated.cuda()

    sample_outputs = model.generate(
        generated,
        # bos_token_id=random.randint(1,30000),
        do_sample=True,
        top_k=50,
        max_length=300,
        top_p=0.95,
        num_return_sequences=1)

    return tokenizer.decode(sample_outputs[0][2:-1], skip_special_tokens=False)
def get_tokenizer(self, **kwargs):
    kwargs.update(self.special_tokens_map)
    return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
def test_padding_if_pad_token_set_slow(self):
    tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname, pad_token="<pad>")

    # Simple input
    s = "This is a simple input"
    s2 = ["This is a simple input looooooooong", "This is a simple input"]
    p = ("This is a simple input", "This is a pair")
    p2 = [
        ("This is a simple input loooooong", "This is a simple input"),
        ("This is a simple pair loooooong", "This is a simple pair"),
    ]

    pad_token_id = tokenizer.pad_token_id

    out_s = tokenizer(s, padding="max_length", max_length=30, return_tensors="np")
    out_s2 = tokenizer(s2, padding=True, truncate=True, return_tensors="np")
    out_p = tokenizer(*p, padding="max_length", max_length=60, return_tensors="np")
    out_p2 = tokenizer(p2, padding=True, truncate=True, return_tensors="np")

    # s
    # test single string max_length padding
    self.assertEqual(out_s["input_ids"].shape[-1], 30)
    self.assertTrue(pad_token_id in out_s["input_ids"])
    self.assertTrue(0 in out_s["attention_mask"])

    # s2
    # test automatic padding
    self.assertEqual(out_s2["input_ids"].shape[-1], 33)
    # long slice doesn't have padding
    self.assertFalse(pad_token_id in out_s2["input_ids"][0])
    self.assertFalse(0 in out_s2["attention_mask"][0])
    # short slice does have padding
    self.assertTrue(pad_token_id in out_s2["input_ids"][1])
    self.assertTrue(0 in out_s2["attention_mask"][1])

    # p
    # test single pair max_length padding
    self.assertEqual(out_p["input_ids"].shape[-1], 60)
    self.assertTrue(pad_token_id in out_p["input_ids"])
    self.assertTrue(0 in out_p["attention_mask"])

    # p2
    # test automatic padding pair
    self.assertEqual(out_p2["input_ids"].shape[-1], 52)
    # long slice pair doesn't have padding
    self.assertFalse(pad_token_id in out_p2["input_ids"][0])
    self.assertFalse(0 in out_p2["attention_mask"][0])
    # short slice pair does have padding
    self.assertTrue(pad_token_id in out_p2["input_ids"][1])
    self.assertTrue(0 in out_p2["attention_mask"][1])
if "maxr" not in opt.data.keys(): opt.data.maxr = 5 if opt.data.rel == "language" else 1 x = "data/conceptnet/processed/generation/rel_language-trainsize_100-devversion_12-maxe1_200-maxe2_200.pickle" path = x.format(utils.make_name_string(opt.data)) data_loader = data.make_data_loader(opt) loaded = data_loader.load_data(path) data_loader.opt = opt data_loader.batch_size = opt.train.dynamic.bs print("Done.") text_encoder = GPT2Tokenizer.from_pretrained('gpt2') special_tokens = {"cls_token": "[CLS]", "unk_token": "[UNK]"} text_encoder = GPT2Tokenizer.from_pretrained("gpt2", cls_token="[CLS]", unk_token="[UNK]", mask='["MASK"]', separator='["SEP"]', start_of_sentence='["SOS"]', end_of_sentence='["EOS"]') text_encoder.add_special_tokens(special_tokens) #categories = data.conceptnet_data.conceptnet_relations special = [data.start_token, data.end_token] #special += ["<{}>".format(cat) for cat in categories]
f"Saving features into cached file {cached_features_file}") with open(cached_features_file, "wb") as cache: pickle.dump(self.examples, cache, protocol=pickle.HIGHEST_PROTOCOL) def __len__(self): return len(self.examples) def __getitem__(self, item): return torch.tensor(self.examples[item], dtype=torch.long) if __name__ == "__main__": tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium") model = GPT2LMHeadModel.from_pretrained("gpt2-medium") tokenizer.pad_token = tokenizer.eos_token model.train() sc = DescriptionData_v2(tokenizer, file_path=FILE_PATH) # criterion = nn.CrossEntropyLoss() # optimizer = torch.optim.Adam(model.parameters(), lr=3e-5, eps=1e-08) # trainer = create_supervised_trainer(model, optimizer, criterion) # train_loader = DataLoader(dataset=sc, batch_size=64, shuffle=True, num_workers=0) # trainer.run(train_loader, 1) # optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) # loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
def load_gpt2_large() -> Any:
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
    model = GPT2LMHeadModel.from_pretrained('gpt2-large',
                                            pad_token_id=tokenizer.eos_token_id)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    return model, tokenizer
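# A hedged usage sketch (not from the original source) showing how the loader
# above would typically be used for sampling. The prompt string is illustrative;
# the key point is that inputs must be moved to the same device as the model.
import torch

model, tokenizer = load_gpt2_large()
device = next(model.parameters()).device
input_ids = tokenizer.encode("The meaning of life is", return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=50, do_sample=True, top_p=0.95)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))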
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
import random

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Needed to avoid cross-domain issues
response_header = {'Access-Control-Allow-Origin': '*'}

# Load artifacts
model = GPT2LMHeadModel.from_pretrained('app/output')
tokenizer = GPT2Tokenizer.from_pretrained('app/output')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

    Args:
        logits: logits distribution shape (batch size x vocabulary size)
        top_k > 0: keep only top k tokens with highest probability (top-k filtering).
        top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
def main():
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    json_file_parser = ArgumentParser()
    json_file_parser.add_argument("--config_file", type=str, default=None)
    json_file_parser.add_argument("--tpu_num_cores", type=int, default=None)
    json_parser_args = json_file_parser.parse_args()

    parser = HfArgumentParser([TrainingArguments, ExtraArgs])
    if json_parser_args.config_file is None:
        training_args, extra_args = parser.parse_args_into_dataclasses()
    else:
        training_args, extra_args = parser.parse_json_file(json_parser_args.config_file)

    with h5pickle.File("data/train.hdf5", "r", libver="latest", swmr=True,
                       skip_cache=False) as f:
        train_dataset = f["train"]
        val_dataset = f["val"]

        if extra_args.max_n_train is not None:
            train_dataset = train_dataset[:extra_args.max_n_train]
        if extra_args.max_n_val is not None:
            val_dataset = val_dataset[:extra_args.max_n_val]

        model = get_model(extra_args)
        tokenizer = GPT2Tokenizer(
            "data/german_tokenizer_cc/vocab.json",
            "data/german_tokenizer_cc/merges.txt",
        )
        tokenizer.pad_token = tokenizer.eos_token

        name = generate_slug(2)

        if json_parser_args.tpu_num_cores is not None:
            training_args.tpu_num_cores = json_parser_args.tpu_num_cores

        training_args.remove_unused_columns = False
        steps_per_epoch = int(
            len(train_dataset) / training_args.per_device_train_batch_size /
            training_args.gradient_accumulation_steps / training_args.tpu_num_cores)
        training_args.steps_per_epoch = steps_per_epoch
        training_args.eval_steps = steps_per_epoch
        training_args.save_steps = (
            steps_per_epoch * training_args.num_train_epochs
        )  # only save once at the end to save space
        training_args.run_name = name
        training_args.output_dir = os.path.join("checkpoints", name)

        trainer = GPT2Trainer(
            model,
            training_args,
            extra_args=extra_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            callbacks=[GPT2WandbCallback],
        )
        trainer.remove_callback(WandbCallback)
        trainer.train()

    print("Done!")
import pandas as pd
import numpy as np
import torch
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import messenger_utils as butils
import json
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from pathlib import Path
import messenger_models as models

# %%
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

training_set_path = '../data/datasets/cami_training.csv'
data = pd.read_csv(training_set_path)
# data = data[0:500]

parameter_dict = {}
# Currently huggingface defaults for training GPT2 (except more epochs)
parameter_dict['training_set_path'] = training_set_path
parameter_dict['epochs'] = 10
parameter_dict['num_worker'] = 2
parameter_dict['batch_size'] = 2
parameter_dict['learning_rate'] = 5e-5
parameter_dict['weight_decay'] = 0
parameter_dict['eps'] = 1e-8
parameter_dict['warmup_steps'] = 0
parameter_dict['filename'] = 'cami_bot_072320'
def main():
    bleu_list = list()
    length_history = list()
    rouge_history = list()
    embedding_list = list()
    dist1_list = list()
    meteor_list = list()
    conv_idx_match = 0
    convs_top_answer = list()
    convs_ground_truth = list()
    num_answers = 1

    if dataset != "cornell":
        if model_name == "DialoGPT":
            vocab = GPT2Tokenizer.from_pretrained('gpt2')
        else:
            vocab = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        special_tokens = {
            'pad_token': PAD_TOKEN,
            'bos_token': SOS_TOKEN,
            'eos_token': EOS_TOKEN,
            'sep_token': SEP_TOKEN,
        }
        vocab.add_special_tokens(special_tokens)

        state_dict = torch.load(checkpoint_path)
        embedding_weight_name = None
        for key in state_dict.keys():
            if key.endswith("tok_embedding.weight"):
                embedding_weight_name = key
                break
            elif key.endswith("transformer.tokens_embed.weight"):
                embedding_weight_name = key
                break
            elif key.endswith("encoder.embedding.weight"):
                embedding_weight_name = key
                num_answers = int(target_file_path.split('_')[-2])
                break
            elif key.endswith("wte.weight"):
                embedding_weight_name = key
        assert embedding_weight_name != None
        weight_tensor = state_dict[embedding_weight_name]
        embedding = nn.Embedding.from_pretrained(weight_tensor).to("cpu")
    else:
        with open(id2word_path, 'rb') as f:
            id2word = pickle.load(f)
            word2id = {v: k for k, v in id2word.items()}
        with open(pretrained_wv_path, 'rb') as f:
            weight_tensor = to_var(torch.FloatTensor(pickle.load(f)))
        embedding = nn.Embedding.from_pretrained(weight_tensor, freeze=False).to("cpu")

    with codecs.open(target_file_path, "r", "utf-8") as csv_f:
        for line in csv_f:
            try:
                conv_idx = int(line.strip().split()[-1])
            except:
                print(line)
                print(conv_idx)
            if conv_idx_match != conv_idx:
                print("What?!")
                return
            conv_idx_match += 1

            context_utter = csv_f.readline().strip()
            # print(context_utter)

            answers = list()
            # for _ in range(num_answers):
            answers.append(csv_f.readline().strip())
            # print(answers)

            if '<eos>' in answers[-1]:
                top_answer = answers[-1].split('<eos>')[0].strip()
            else:
                top_answer = answers[-1].strip()

            ground_truth_utter = csv_f.readline().strip()
            if ground_truth_utter.split()[-1].startswith('u'):
                ground_truth_utter = ' '.join(ground_truth_utter.split()[:-1])
            if '<eos>' in ground_truth_utter:
                ground_truth_utter = ground_truth_utter.split('<eos>')[0]

            length_history.append(len(top_answer.split()))

            if context_utter == "" or top_answer == "" or ground_truth_utter == "":
                continue

            dist1_list += top_answer.split()

            try:
                ground_truth_utter_ids = vocab.encode(ground_truth_utter)
                top_answer_utter_ids = vocab.encode(top_answer)
                embedding_list.append(
                    embedding_compute(ground_truth_utter_ids,
                                      top_answer_utter_ids, embedding))
            except ValueError:
                embedding_list.append(0)

            try:
                bleu_list.append(bleu_compute(ground_truth_utter, top_answer))
            except ZeroDivisionError:
                bleu_list.append(0)

            try:
                rouge_history.append(rouge_compute(ground_truth_utter, top_answer))
            except ValueError:
                rouge_history.append(np.zeros(3))

            meteor_list.append(meteor_compute(ground_truth_utter, top_answer))

    length_mat = np.array(length_history)
    bleu_mat = np.array(bleu_list)
    rouge_mat = np.stack(rouge_history, axis=0)
    embedding_mat = np.array(embedding_list)
    meteor_mat = np.array(meteor_list)

    avg_length = np.mean(length_mat)
    avg_bleu = np.mean(bleu_mat)
    avg_rouge = np.mean(rouge_mat, axis=0)
    avg_embedding = np.mean(embedding_mat)
    avg_meteor = np.mean(meteor_mat)

    stderr_bleu = sem(bleu_mat, axis=0)
    stderr_length = sem(length_mat)
    stderr_rouge = sem(rouge_mat, axis=0)
    stderr_embedding = sem(embedding_mat, axis=0)
    stderr_meteor = sem(meteor_mat, axis=0)

    dist1 = dist_compute(dist1_list)
    dist2 = dist_compute(dist1_list, 2)

    output_str_list = list()
    output_str_list.append(["Length", avg_length, stderr_length])
    output_str_list.append(["BLEU", avg_bleu, stderr_bleu])
    output_str_list.append(["Embedding", avg_embedding, stderr_embedding])
    output_str_list.append(["METEOR", avg_meteor, stderr_meteor])
    output_str_list.append(["Dist1", dist1, '-'])
    output_str_list.append(["Dist2", dist2, '-'])
    for one_name, one_avg, one_stderr in zip(rouge_names(), avg_rouge, stderr_rouge):
        output_str_list.append([one_name, one_avg, one_stderr])

    output_str = tabulate.tabulate(output_str_list,
                                   headers=["Metric", "Average", "Standard Error"])
    print(output_str)
def run_pplm_example(pretrained_model="gpt2-medium", cond_text="", uncond=False, num_samples=1, bag_of_words=None, discrim=None, discrim_weights=None, discrim_meta=None, class_label=-1, length=100, stepsize=0.02, temperature=1.0, top_k=10, sample=False, num_iterations=3, grad_length=10000, horizon_length=1, window_length=0, decay=False, gamma=1.5, gm_scale=0.9, kl_scale=0.01, seed=0, no_cuda=False, colorama=False): # set Random seed torch.manual_seed(seed) np.random.seed(seed) # set the device device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu" if discrim == 'generic': set_generic_model_params(discrim_weights, discrim_meta) if discrim is not None: pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][ "pretrained_model"] print("discrim = {}, pretrained_model set " "to discriminator's = {}".format(discrim, pretrained_model)) # load pretrained model model = GPT2LMHeadModel.from_pretrained(pretrained_model, output_hidden_states=True) model.to(device) model.eval() # load tokenizer tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model) # Freeze GPT-2 weights for param in model.parameters(): param.requires_grad = False # figure out conditioning text if uncond: tokenized_cond_text = tokenizer.encode([tokenizer.bos_token]) else: raw_text = cond_text while not raw_text: print("Did you forget to add `--cond_text`? ") raw_text = input("Model prompt >>> ") tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text) print("= Prefix of sentence =") print(tokenizer.decode(tokenized_cond_text)) print() # generate unperturbed and perturbed texts # full_text_generation returns: # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation( model=model, tokenizer=tokenizer, context=tokenized_cond_text, device=device, num_samples=num_samples, bag_of_words=bag_of_words, discrim=discrim, class_label=class_label, length=length, stepsize=stepsize, temperature=temperature, top_k=top_k, sample=sample, num_iterations=num_iterations, grad_length=grad_length, horizon_length=horizon_length, window_length=window_length, decay=decay, gamma=gamma, gm_scale=gm_scale, kl_scale=kl_scale, ) # untokenize unperturbed text unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0]) print("=" * 80) print("= Unperturbed generated text =") print(unpert_gen_text) print() generated_texts = [] bow_word_ids = set() if bag_of_words and colorama: bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer) for single_bow_list in bow_indices: # filtering all words in the list composed of more than 1 token filtered = list(filter(lambda x: len(x) <= 1, single_bow_list)) # w[0] because we are sure w has only 1 item because previous fitler bow_word_ids.update(w[0] for w in filtered) # iterate through the perturbed texts for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts): try: # untokenize unperturbed text if colorama: import colorama pert_gen_text = '' for word_id in pert_gen_tok_text.tolist()[0]: if word_id in bow_word_ids: pert_gen_text += '{}{}{}'.format( colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL) else: pert_gen_text += tokenizer.decode([word_id]) else: pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0]) print("= Perturbed generated text {} =".format(i + 1)) print(pert_gen_text) print() except: pass # keep the prefix, perturbed seq, original seq for each index generated_texts.append( (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text)) return
    for metric, values in meta_info.items():
        data_file.write(f"{metric}\n")
        data_file.write(f"{describe(values)}\n")
        data_file.write(f"{percentile(values, [50, 75, 90, 95, 99])}\n\n")

    print(f"Results saved to {results_path}")


def analyze_dataset(dataset):
    dataset_meta = {
        "train": {
            "title": "Train"
        },
        "valid_freq": {
            "title": "Valid Frequent"
        }
    }

    for split, meta in dataset_meta.items():
        data_split = dataset[split]
        analyze_split(data_split, meta)


if __name__ == '__main__':
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
    dataset_path = 'processed_output'
    dataset_cache = './dataset_cache'
    dataset = get_dataset(tokenizer, dataset_path, dataset_cache)
    analyze_dataset(dataset)
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings
model = GPT2LMHeadModel.from_pretrained(
    "gpt2", pad_token_id=tokenizer.eos_token_id).cuda()

text = "Replace me by any text you'd like."
encoded_input = tokenizer.encode(text, return_tensors='pt')
print(encoded_input)
predictions, _ = model(encoded_input.cuda())
print(predictions.size())

sentences = {length: set() for length in range(20)}
counter = 0
with open(
        "/jagupard27/scr0/mhahn/memory/char-lm-ud-stationary_12_SuperLong_WithAutoencoder_WithEx_Samples_Short_Combination_Subseq_VeryLong_WithSurp12_NormJudg_Short_CondGPT2.py_749792590_Model.txt",
        "r") as inFile:
    next(inFile)
    for line in inFile:
        counter += 1
        line = line.strip().split("\t")
        _, _, _, _, _, _, _, sentence, _, _, nextWord = line
        sentence = (sentence.strip().split(" ")[1:-1] + [nextWord.strip()])
        sentences[len(sentence)].add(" ".join(sentence).strip())
        if counter % 1000 == 0:
            print(counter / 9537985,
                  sum([len(x) for _, x in sentences.items()]) / counter)
        # break
        )
        print(f"max memory B: {torch.cuda.max_memory_allocated(model_B.device)}")
        logger.info(f"max memory B: {torch.cuda.max_memory_allocated(model_B.device)}")

        torch.save(
            (self.model_A.state_dict(), self.model_B.state_dict()),
            f"Checkpoint/{self.trained_steps}_steps_{np.mean(all_rewards)}_reward_model_A.pth")


NEW_MODEL_A_DIR = None  # "Checkpoint/20_steps_0.049586776859504134_reward_model_A.pth"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # torch.load(tokenizer_dir)

DEVICE1 = torch.device("cuda:1")
DEVICE2 = torch.device("cuda:0")
SPLIT_INTO = 2

model_A, model_B = load_model(cfg,
                              device1=DEVICE1,
                              device2=DEVICE2,
                              split_into=SPLIT_INTO,
                              model_A_dir=NEW_MODEL_A_DIR)
pdb.set_trace()

PAD_TOKEN = tokenizer.encoder["<|endoftext|>"]

clip_range = 0.2
entropy_coef = 1e-5
min_entropy = 10.0  # depends on the task
from transformers import GPT2Tokenizer

BATCH_SIZE = 16
EPOCHS = 4
LEARNING_RATE = 2e-5
MAX_LEN = 64
TRAIN_PATH = "/kaggle/input/short-jokes/shortjokes.csv"  # ADD PATH TO YOUR DATASET HERE
MODEL_FOLDER = "/kaggle/working/trained_models"  # ADD PATH TO WHERE YOU WANT TO SAVE YOUR MODEL

Tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
def __init__(self):
    super(GPT2, self).__init__()
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.model = GPT2Model.from_pretrained('gpt2')
    self.length = 1024 * 20
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm

from lib import objects
from lib import wikitext_to_html
from lib import constants as c

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

tokenizer = GPT2Tokenizer.from_pretrained(c.GPT2_NAME)
model = GPT2LMHeadModel.from_pretrained(c.GPT2_NAME)
model = model.to(device)

EOD_ID = tokenizer.encode("<|endoftext|>")[0]
EQUAL_ID = tokenizer.encode(" = ")[0]
NEW_LINE_ID = 198
HEADER_ID = 796

test_article = """
 = Toronto Raptors = 
 Toronto Raptors are the best team in the world 
 = = History = = 
 Founded in 1996, they had to endure Vince Carter before winning the 2018-2019 NBA Championship
"""


def generate_text(input_str,
                  text_len=c.MAX_TEXT_LENGTH,
                  end_of_text_id=EOD_ID,
                  top_random=5,
                  test=False,
                  memory=c.DEFAULT_MODEL_MEMORY):
    if test:
                default='structural/grammar.avg')
args = ap.parse_args()

algo = args.algo
k = args.k
layer = args.layer
out_dir = args.out_dir
model_type = args.model_type
structural = args.structural
grammar_file = args.grammar_file

if not os.path.exists(out_dir):
    os.makedirs(out_dir)

tokenizer = GPT2Tokenizer.from_pretrained(model_type)
model = Model(device='cuda')
DEVICE = 'cuda'

if structural:
    grammar = read_grammar(grammar_file)
    templates = get_template_list(structure='across_obj_rel', grammar=grammar)
else:
    grammar = None
    templates = get_template_list()

if args.algo == 'topk':
    marg_contrib_path = out_dir + "/marg_contrib.pickle"
    if os.path.exists(marg_contrib_path):
        print('Using cached marginal contribution')
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import time

# Load the vocabulary
tokenizer = GPT2Tokenizer.from_pretrained('/workspace/xulun/out4/')

# Input text to be completed
text = 'function walk(dir, fn) { if (fs.existsSync(dir)) { let stat ='
# text = 'if (stat.isDirectory()) {fs.readdirSync(dir).'
# text = 'mediaFileText.color ='
# text = 'mediaFileText.top ='
predicted_text = text

# Load the pretrained weights into the model
model = GPT2LMHeadModel.from_pretrained('/workspace/xulun/out4/')
# Switch to eval mode so the dropout used in training mode is not applied
model.eval()
# model.to('cuda')

# Each step only completes one token, so completing a sentence takes several
# iterations; 30 is an arbitrary choice on my part
for i in range(0, 30):
    ticks = time.time()
    print(time.localtime(time.time()))
    # Feed the previous prediction back in as the input, i.e. autoregression
    indexed_tokens = tokenizer.encode(predicted_text)
    # Convert the token indices into a PyTorch tensor
    tokens_tensor = torch.tensor([indexed_tokens])
# This splits text into sentences. Fun fact, it's hard to know what the sentences
# are in the following:
# "You notice that Dr. Taco is shaped like his name. Nevertheless, you hope
# he'll know how to deal with your misshapen toe."
from nltk.corpus import gutenberg
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

# This is the original GPT-2 model, and it's a bit huge (so stuff goes slow) and
# it's not that great for generating flowing, coherent text.
# model = GPT2LMHeadModel.from_pretrained('gpt2')
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# This model is pretty good, and tuned more for story-like text. It's a bit
# faster than the standard GPT-2 model as well.
model = GPT2LMHeadModel.from_pretrained('pranavpsv/genre-story-generator-v2')
tokenizer = GPT2Tokenizer.from_pretrained('pranavpsv/genre-story-generator-v2')

# Some models may have this string in their output. We want to scrub it out.
stop_token = '<|endoftext|>'

# Default settings
default_full_sentences = False  # allow for generated sentences to be incomplete
default_cycles = 1  # must be set to at least 2 if full_sentences is True
default_cycle_counter = 0
default_story = "I enjoy walking with my cute dog"
default_base_length = 30
default_timestamps = True
default_temperature = 0.7
default_top_k = 50
default_top_p = 0.95
default_min_length = 10