def create_and_check_double_lm_head_model(
    self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
):
    model = GPT2DoubleHeadsModel(config)
    model.to(torch_device)
    model.eval()

    multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()

    inputs = {
        "input_ids": multiple_choice_inputs_ids,
        "mc_token_ids": mc_token_ids,
        "attention_mask": multiple_choice_input_mask,
        "token_type_ids": multiple_choice_token_type_ids,
        "lm_labels": multiple_choice_inputs_ids,
    }

    loss, lm_logits, mc_logits, _ = model(**inputs)

    result = {"loss": loss, "lm_logits": lm_logits, "mc_logits": mc_logits}
    self.parent.assertListEqual(list(result["loss"].size()), [])
    self.parent.assertListEqual(
        list(result["lm_logits"].size()),
        [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
    )
    self.parent.assertListEqual(list(result["mc_logits"].size()), [self.batch_size, self.num_choices])

def create_and_check_double_lm_head_model(
    self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
):
    model = GPT2DoubleHeadsModel(config)
    model.to(torch_device)
    model.eval()

    multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()

    inputs = {
        "input_ids": multiple_choice_inputs_ids,
        "mc_token_ids": mc_token_ids,
        "attention_mask": multiple_choice_input_mask,
        "token_type_ids": multiple_choice_token_type_ids,
        "labels": multiple_choice_inputs_ids,
    }

    result = model(**inputs)
    self.parent.assertEqual(result.loss.shape, ())
    self.parent.assertEqual(
        result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size)
    )
    self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices))

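# A minimal, self-contained sketch (not part of the tests above) of the double-heads API the two
# checks exercise. The tiny randomly initialised config is an assumption so the snippet runs
# without downloading weights, and a transformers v4-style return (return_dict=True) is assumed.
# The LM head returns per-token logits; the multiple-choice head returns one score per candidate.
import torch
from transformers import GPT2Config, GPT2DoubleHeadsModel

config = GPT2Config(vocab_size=100, n_embd=32, n_layer=2, n_head=2)
model = GPT2DoubleHeadsModel(config).eval()

batch_size, num_choices, seq_length = 2, 3, 5
input_ids = torch.randint(0, config.vocab_size, (batch_size, num_choices, seq_length))
mc_token_ids = torch.full((batch_size, num_choices), seq_length - 1, dtype=torch.long)

with torch.no_grad():
    out = model(input_ids=input_ids, mc_token_ids=mc_token_ids)

print(out.logits.shape)     # (batch_size, num_choices, seq_length, vocab_size)
print(out.mc_logits.shape)  # (batch_size, num_choices)
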
def test_batch_generation_2heads(self):
    model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
    model.to(torch_device)
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    tokenizer.padding_side = "left"

    # This tokenizer has no pad token, so we have to set it in some way
    # Define PAD Token = EOS Token = 50256
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

    # use different length sentences to test batching
    sentences = [
        "Hello, my dog is a little",
        "Today, I",
    ]

    inputs = tokenizer(sentences, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"].to(torch_device)
    token_type_ids = torch.cat(
        [
            input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0),
            input_ids.new_full((input_ids.shape[0], 1), 500),
        ],
        dim=-1,
    )

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=inputs["attention_mask"].to(torch_device),
    )

    outputs_tt = model.generate(
        input_ids=input_ids,
        attention_mask=inputs["attention_mask"].to(torch_device),
        token_type_ids=token_type_ids,
    )

    inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
    output_non_padded = model.generate(input_ids=inputs_non_padded)

    num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
    inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
    output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)

    batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True)
    non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
    padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)

    expected_output_sentence = [
        "Hello, my dog is a little bit of a mess. I'm not sure if he's going",
        "Today, I'm going to be doing a lot of research on this. I",
    ]
    self.assertListEqual(expected_output_sentence, batch_out_sentence)
    self.assertTrue(batch_out_sentence_tt != batch_out_sentence)  # token_type_ids should change output
    self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])

def __init__(self, hparams: Namespace):
    super().__init__()
    self.hparams = hparams

    # GPT2 is going to be frozen and fixed!
    # because of that we hide it inside the DataModule
    self.gpt2 = GPT2DoubleHeadsModel.from_pretrained(self.hparams.pretrained_model)
    self.tokenizer = Tokenizer(self.hparams.pretrained_model)
    # Resize embeddings to include the added tokens
    self.gpt2.resize_token_embeddings(self.tokenizer.vocab_size)

def get_model_tokenizer():
    global model
    global tokenizer
    if model is None:
        # Load trained model
        model = GPT2DoubleHeadsModel.from_pretrained(trained_model_path)
        # Convert model parameter tensors to device
        model.to("cpu")
        # Load trained Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained(trained_model_path)
    return model, tokenizer

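# Usage sketch for the lazy loader above (an assumption, not from the original file): it relies on
# the module-level globals `model`, `tokenizer` and `trained_model_path` that the loader expects.
# Repeated calls reuse the cached objects, so only the first call pays the loading cost.
import torch

model, tokenizer = get_model_tokenizer()
prompt_ids = tokenizer.encode("Hello, my dog is", return_tensors="pt")
with torch.no_grad():
    generated = model.generate(prompt_ids, max_length=30, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
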
def main(args):
    """
    Execute the summarization from a fine-tuned GPT2 model (given in the CLI arguments)
    and write the summary.txt file.
    """
    model = GPT2DoubleHeadsModel.from_pretrained(args.model_directory)
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_directory)

    # Re-register the special tokens used during fine-tuning
    # (<|keyword|> / <|summarize|> markers plus bos/eos/pad)
    special_tokens = {
        'bos_token': '<|startoftext|>',
        'eos_token': '<|endoftext|>',
        'pad_token': '<pad>',
        'additional_special_tokens': ['<|keyword|>', '<|summarize|>']
    }
    tokenizer.add_special_tokens(special_tokens)
    assert len(tokenizer) == 50261, "tokenizer size is not 50261"
    model.resize_token_embeddings(len(tokenizer))
    print(' ')

    with open(args.input_file, 'r') as file1:
        input_text = file1.read()

    model = model.to(device)

    input_text = '<|startoftext|> ' + input_text + ' <|summarize|>'
    input_token = tokenizer.encode(input_text)
    input_token_torch = torch.tensor(input_token, dtype=torch.long)

    generated_output = model.generate(
        input_ids=input_token_torch.unsqueeze(0).to(device),
        max_length=args.max_length + len(input_token),
        min_length=args.min_length + len(input_token),
        temperature=args.temperature,
        # unused for decoder-only GPT-2, kept as an id rather than a raw string
        decoder_start_token_id=tokenizer.convert_tokens_to_ids('<|summarize|>'),
        top_k=args.top_k,
        top_p=args.top_p,
        repetition_penalty=None,
        do_sample=True,
        num_return_sequences=args.num_return_sequences)

    batch_answer = []
    for item in generated_output:
        batch_answer.append(tokenizer.decode(item[len(input_token):], skip_special_tokens=True))

    with open("summary.txt", "a") as f:
        f.writelines(batch_answer)

def __init__(self, context):
    super(TransferLearning, self).__init__(context)
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    self.model = GPT2DoubleHeadsModel.from_pretrained('gpt2-medium')

    # ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
    # ``mask_token``, ``additional_special_tokens``
    self.special_tokens = {
        'bos_token': "<bos>",
        'eos_token': "<eos>",
        'additional_special_tokens': ["<speaker1>", "<speaker2>"],
        'pad_token': "<pad>"
    }
    self.tokenizer.add_special_tokens(self.special_tokens)
    # Use len(tokenizer): vocab_size does not count the special tokens added above.
    self.model.resize_token_embeddings(len(self.tokenizer))

def __init__(self, hparams: Namespace):
    super().__init__()
    self.hparams = hparams

    # GPT2 is going to be frozen and fixed!
    # because of that we hide it inside the DataModule
    self.gpt2 = GPT2DoubleHeadsModel.from_pretrained(self.hparams.pretrained_model)
    self.tokenizer = Tokenizer(self.hparams.pretrained_model)
    # Resize embeddings to include the added tokens
    self.gpt2.resize_token_embeddings(self.tokenizer.vocab_size)

    ## Quantize
    if self.hparams.quantize:
        emb_qconf = torch.quantization.float_qparams_weight_only_qconfig
        self.gpt2.transformer.wte.qconfig = emb_qconf
        self.gpt2.transformer.wpe.qconfig = emb_qconf

def run():
    parser = ArgumentParser()
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        raise ValueError("Requiring a finetuned model_checkpoint")

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    # `tokenizer` is expected to be defined elsewhere in this module; it is used below
    # by add_special_tokens_ and for tokenizing the prompt.
    config = GPT2Config(vocab_size=50003)
    model = GPT2DoubleHeadsModel(config)
    if args.model_checkpoint:
        print("\nLoad model from", args.model_checkpoint)
        model.load_state_dict(torch.load(args.model_checkpoint), strict=False)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    history = ''
    print('\nPlease input a sentence to chat with the chatbot!')
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history = tokenizer.tokenize(raw_text)
        with torch.no_grad():
            out_ids = sample_sequence(history, tokenizer, model, args)
        print(tokenizer.convert_ids_to_tokens(out_ids))

# Functions and Models Prepared
#===============================================================================================#
device = torch.device("cpu")
GPT2_directory = 'Models'

tokenizer_GPT2 = GPT2Tokenizer.from_pretrained(GPT2_directory)
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<|keyword|>', '<|summarize|>']
}
tokenizer_GPT2.add_special_tokens(special_tokens)

GPT2_generator = GPT2DoubleHeadsModel.from_pretrained(GPT2_directory)

# Build the prompt before any device placement so GPT2_input_torch exists when it is moved.
list_keywords = get_keywords(text)
GPT2_input = tokenizer_GPT2.encode('<|startoftext|> ' + title + list_keywords + ' <|summarize|> ')
GPT2_input_torch = torch.tensor(GPT2_input, dtype=torch.long)

use_GPU_GPT_generator = False
if use_GPU_GPT_generator:
    GPT2_generator = GPT2_generator.to(device)
    GPT2_input_torch = GPT2_input_torch.to(device)

temperature = 1
greedy_search = False

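# A hedged sketch of how the prepared prompt could be fed to the generator. The max_length offset
# and the top_k/top_p values are illustrative assumptions, not taken from the original script;
# temperature and greedy_search are the variables defined above.
generated = GPT2_generator.generate(
    input_ids=GPT2_input_torch.unsqueeze(0).to(device),
    max_length=GPT2_input_torch.shape[-1] + 60,
    do_sample=not greedy_search,
    temperature=temperature,
    top_k=50,
    top_p=0.95,
    pad_token_id=tokenizer_GPT2.pad_token_id,
)
# Keep only the continuation after the prompt
summary = tokenizer_GPT2.decode(generated[0][GPT2_input_torch.shape[-1]:], skip_special_tokens=True)
print(summary)
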
def main(args):
    """
    Execute the training given the arguments in the CLI.
    Output: write the PyTorch model file and config files,
    plus the training and validation statistics (in .json).
    """
    train_dict = {'lm_loss': [], 'mc_loss': [], 'total_loss': []}
    val_dict = {'lm_loss': [], 'mc_loss': [], 'total_loss': []}

    if args.model_directory is None:
        model = GPT2DoubleHeadsModel.from_pretrained('distilgpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
        special_tokens = {
            'bos_token': '<|startoftext|>',
            'eos_token': '<|endoftext|>',
            'pad_token': '<pad>',
            'additional_special_tokens': ['<|keyword|>', '<|summarize|>']
        }
        # Register the special tokens before resizing, otherwise len(tokenizer) stays at 50257.
        tokenizer.add_special_tokens(special_tokens)
        print('total length of vocab should be 50261 = ', len(tokenizer))
        model.resize_token_embeddings(len(tokenizer))
        print('resize the model embedding layer')
    else:
        model = GPT2DoubleHeadsModel.from_pretrained(args.model_directory)
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_directory)
        # The saved tokenizer already contains these special tokens.
        special_tokens = {
            'bos_token': '<|startoftext|>',
            'eos_token': '<|endoftext|>',
            'pad_token': '<pad>',
            'additional_special_tokens': ['<|keyword|>', '<|summarize|>']
        }
        print('total length of vocab should be 50261 = ', len(tokenizer))

    print(' ')

    train_dataset = torch.load(args.train_data)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=1)
    print('finished downloading train dataset')

    val_dataset = torch.load(args.val_data)
    val_sampler = RandomSampler(val_dataset)
    val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=1)
    print('finished downloading validation dataset')

    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.eps, correct_bias=True)
    # Schedule over all epochs so the learning rate does not hit zero after the first epoch.
    total_steps = len(train_dataloader) * args.epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.scheduler_warmup, num_training_steps=total_steps)

    for epoch in range(args.epochs):
        start = timeit.default_timer()
        start_iter = timeit.default_timer()

        for iterations, batch in enumerate(train_dataloader):
            lm_loss, mc_loss, total_loss = train(args, batch, iterations, model, optimizer, scheduler)
            train_dict['lm_loss'].append(lm_loss)
            train_dict['mc_loss'].append(mc_loss)
            train_dict['total_loss'].append(total_loss)

            if iterations % args.print_every == 0:
                stop_iter = timeit.default_timer()
                print("Trainer Results - iteration {} - LM loss: {:.2f} MC loss: {:.2f} total loss: {:.2f} report time: {:.1f} sec"
                      .format(iterations, train_dict['lm_loss'][-1], train_dict['mc_loss'][-1],
                              train_dict['total_loss'][-1], stop_iter - start_iter))
                start_iter = timeit.default_timer()

        print('end-of-training-epoch')
        stop = timeit.default_timer()
        print("Trainer Results - epoch {} - LM loss: {:.2f} MC loss: {:.2f} total loss: {:.2f} report time: {:.1f} sec"
              .format(epoch, train_dict['lm_loss'][-1], train_dict['mc_loss'][-1],
                      train_dict['total_loss'][-1], stop - start))
        print(' ')

        for iterations, batch in enumerate(val_dataloader):
            lm_loss, mc_loss, total_loss = evaluate(args, batch, model)
            val_dict['lm_loss'].append(lm_loss)
            val_dict['mc_loss'].append(mc_loss)
            val_dict['total_loss'].append(total_loss)

        print('end-of-validation-epoch')
        stop_eval = timeit.default_timer()
        print("Evaluator Results - epoch {} - LM loss: {:.2f} MC loss: {:.2f} total loss: {:.2f} report time: {:.1f} sec"
              .format(epoch, val_dict['lm_loss'][-1], val_dict['mc_loss'][-1],
                      val_dict['total_loss'][-1], stop_eval - stop))
        print(' ')

    model.config.to_json_file(args.model_name + '/config.json')
    tokenizer.save_vocabulary(args.model_name)
    model_file = args.model_name + '/pytorch_model.bin'
    torch.save(model.state_dict(), model_file)

    with open(args.model_name + '/training_loss_' + str(args.epochs) + '_epoch.json', 'w') as fp:
        json.dump(train_dict, fp)

    with open(args.model_name + '/validation_loss_' + str(args.epochs) + '_epoch.json', 'w') as fq:
        json.dump(val_dict, fq)

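# Reload sketch (an assumption, not part of the script above): because config.json, the vocab
# files and pytorch_model.bin are all written under the model directory, the fine-tuned model
# can later be restored with from_pretrained on that directory. Note that tokenizer.save_pretrained()
# would additionally persist the added special tokens, which save_vocabulary() alone does not.
from transformers import GPT2DoubleHeadsModel, GPT2Tokenizer

model = GPT2DoubleHeadsModel.from_pretrained('path/to/model_name')  # hypothetical directory
tokenizer = GPT2Tokenizer.from_pretrained('path/to/model_name')
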
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="gpt2", help="pretrained model name")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--train_dataset", type=str, default="")
    parser.add_argument("--eval_dataset", type=str, default="")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_train_epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=16)
    parser.add_argument("--eval_batch_size", type=int, default=16)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", type=int, default=1)
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--lm_coef", type=float, default=0.5)
    parser.add_argument("--n_valid", type=int, default=374)
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ["_start_", "_delimiter_", "_classify_"]
    try:
        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
    except Exception:
        model = GPT2DoubleHeadsModel.from_pretrained(args.model_name)
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_name)
        tokenizer.add_tokens(special_tokens)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Load and encode the datasets
    def tokenize_and_encode(obj):
        """
        Tokenize and encode a nested object
        """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
        for dataset in encoded_datasets
        for story, cont1, cont2, _ in dataset
    )
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]  # LM loss * coef + MC loss
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (
                    loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, "module") else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_vocabulary(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        logger.info("Saving model to %s", args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits, _ = model(
                    input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to("cpu").numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.__version__

import transformers
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel, AdamW

print('use transformers version = ', transformers.__version__)  # make sure it is 2.6.0

load_model = False
load_previous_weight = False
resize_model = False

### 1 Pretrained Model setup ###
################################
model = GPT2DoubleHeadsModel.from_pretrained('distilgpt2')
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<|keyword|>', '<|summarize|>']
}
print(len(tokenizer), 'total length of vocab')  # expect 50257

#special_tokens2 = {'bos_token':'<|startoftext|>','eos_token':'<|endoftext|>','keyword_token':'<|keyword|>','summary_token':'<|summarize|>'}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))
# Update the model embeddings with the new vocabulary size;
# the newly added tokens occupy the last ids of the vocabulary.
resize_model = True

print(len(tokenizer), 'total length of vocab')
print(tokenizer.bos_token_id, 'bos_token')
print(tokenizer.eos_token_id, 'eos_token')

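# Quick sanity check (illustrative, not part of the original setup cell): after add_special_tokens
# and resize_token_embeddings, the new markers are single tokens appended at the end of the
# vocabulary (ids 50257-50260), so a keyword/summarize prompt round-trips cleanly.
prompt = '<|startoftext|> article title <|keyword|> dogs <|summarize|>'
ids = tokenizer.encode(prompt)
print(ids[0], ids[-1])  # bos id and the <|summarize|> id, both >= 50257
print(tokenizer.convert_ids_to_tokens([ids[0], ids[-1]]))
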
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--model_checkpoint", type=str, default="gpt2", help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
    parser.add_argument("--train_batch_size", type=int, default=16, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--init_model", default="model/pytorch_kogpt2_676e9bcfa7.params", type=str,
                        help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # this warning is printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    # `tokenizer` is expected to be defined elsewhere in this module; it is used below
    # by add_special_tokens_ and get_data_loaders.
    config = GPT2Config(vocab_size=50000)
    model = GPT2DoubleHeadsModel(config)
    if args.init_model:
        print("Load model from ", args.init_model)
        model.load_state_dict(torch.load(args.init_model), strict=False)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        (lm_loss), (mc_loss), *_ = model(input_ids,
                                         token_type_ids=token_type_ids,
                                         mc_token_ids=mc_token_ids,
                                         mc_labels=mc_labels,
                                         lm_labels=lm_labels)
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
            lm_logits, mc_logits, *_ = model(
                input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
            )
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))
    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED,
                                    lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1])),
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args),
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model,
    # configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.init_model)
        tb_logger = TensorboardLogger(log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME),
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()

def __init__(self, pretrained_model_name_or_path, config):
    super(GPT2ForMultipleChoice, self).__init__()
    self.gpt2 = GPT2DoubleHeadsModel.from_pretrained(pretrained_model_name_or_path, config=config)

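# A hedged sketch of how the wrapper above might be completed; only __init__ appears in the
# original, so the forward signature and return values here are assumptions, not the author's code.
import torch.nn as nn
from transformers import GPT2DoubleHeadsModel

class GPT2ForMultipleChoice(nn.Module):
    def __init__(self, pretrained_model_name_or_path, config):
        super(GPT2ForMultipleChoice, self).__init__()
        self.gpt2 = GPT2DoubleHeadsModel.from_pretrained(pretrained_model_name_or_path, config=config)

    def forward(self, input_ids, mc_token_ids=None, attention_mask=None, token_type_ids=None, mc_labels=None):
        # Delegate to the wrapped double-heads model and surface only the multiple-choice outputs.
        outputs = self.gpt2(input_ids=input_ids,
                            mc_token_ids=mc_token_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            mc_labels=mc_labels,
                            return_dict=True)
        if mc_labels is not None:
            return outputs.mc_loss, outputs.mc_logits
        return outputs.mc_logits
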
def train(data_folder):
    checkpoint = False  # set to True if continuing to train our model, o/w False
    # set to True to chat with the unaltered GPT-2 model (at bottom of notebook)
    baseline = False
    model_file = '/gpt-2_epoch_0'

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2DoubleHeadsModel.from_pretrained('gpt2')

    csv_file = data_folder + '/processed_data_final.csv'

    genre_dict = {'comedy': '<comedy>', 'sport': '<sport>', 'biography': '<biography>',
                  'romance': '<romance>', 'action': '<action>', 'adventure': '<adventure>',
                  'drama': '<drama>', 'sci-fi': '<sci-fi>', 'family': '<family>',
                  'fantasy': '<fantasy>', 'musical': '<musical>', 'crime': '<crime>',
                  'thriller': '<thriller>', 'short': '<short>', 'western': '<western>',
                  'documentary': '<documentary>', 'horror': '<horror>', 'animation': '<animation>',
                  'film-noir': '<film-noir>', 'music': '<music>', 'war': '<war>', 'mystery': '<mystery>'}
    genres = genre_dict.keys()

    special_tokens = ["<speaker1>", "<speaker2>"] + ["<" + genre + ">" for genre in genres]
    SPECIAL_TOKENS = {"bos_token": "<bos>", "eos_token": "<eos>",
                      "additional_special_tokens": special_tokens, "pad_token": "<pad>"}

    if not baseline:
        tokenizer.add_special_tokens(SPECIAL_TOKENS)
        model.resize_token_embeddings(len(tokenizer))

    if not baseline:
        ngpu = 0
        for param in model.parameters():
            param.requires_grad = False
        # Parameters of newly constructed modules have requires_grad=True by default:
        # retrain the final LM head and the multiple-choice head for the language modeling task.
        model.lm_head = nn.Linear(model.lm_head.in_features, len(tokenizer))
        model.multiple_choice_head.summary = nn.Linear(
            model.multiple_choice_head.summary.in_features, 1, bias=True)

        device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0) else "cpu")
        model = model.to(device)

        if checkpoint:
            model.load_state_dict(torch.load(model_file))

    pkl_file = data_folder + '/dialogue_data.pkl'
    dataset = DialogueDataset(pkl_file=pkl_file)
    data_size = dataset.__len__()

    batch_size = 4
    train_size = .8
    shuffle_dataset = True
    # random_seed = random.randint(1, 10000)
    random_seed = 42

    # use indexing info from dataset for splitting groups
    gss = GroupShuffleSplit(n_splits=1, train_size=train_size, random_state=random_seed)  # group stratified CV
    df = get_df_data(csv_file)
    for train_idx, val_idx in gss.split(df, df['sentence_2'], df['index']):
        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetRandomSampler(val_idx)

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)

    # params
    lm_losses = []
    mc_losses = []
    total_losses = []
    lm_losses_val = []
    mc_losses_val = []
    total_losses_val = []
    iters = 0
    lm_coef = 2.0
    mc_coef = 1.0
    num_epochs = 3
    lr = 6.25e-5
    max_grad_norm = 1.0
    num_training_steps = (data_size // batch_size) * num_epochs
    warmup_proportion = 0.1
    num_warmup_steps = num_training_steps * .1
    grad_accum_steps = 8

    # In Transformers, optimizer and schedules are split and instantiated like this:
    # To reproduce BertAdam specific behavior set correct_bias=False
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)  # PyTorch scheduler
    # scheduler = PiecewiseLinear(optimizer, "lr", [(0, lr), (num_epochs * len(train_loader), 0.0)])

    print("Starting Training Loop...")
    min_total_loss = 4000

    # For each epoch
    for epoch in range(num_epochs):
        # checkpoints
        if epoch > 0:
            torch.save(model.state_dict(), "/gpt-2_epoch_{}".format(epoch))
"/gpt-2_epoch_{}".format(epoch)) # For each batch in the dataloader for i, data in enumerate(train_loader, 0): model.train() input_ids = data[0] token_type_ids = data[1] mc_token_ids = data[2] lm_labels = data[3] mc_labels = data[4] output = model(input_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, token_type_ids=token_type_ids, lm_labels=lm_labels) lm_loss = output[0] mc_loss = output[1] total_loss = lm_loss * lm_coef + mc_loss * mc_coef / grad_accum_steps total_loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) if i % grad_accum_steps == 0: optimizer.step() scheduler.step() optimizer.zero_grad() # Output training stats if i % 50 == 0: print('[%d/%d][%d/%d]\tLoss LM: %.4f\tLoss MC: %.4f\tLoss total:%.4f' % (epoch, num_epochs, i, len(train_loader), lm_loss.item(), mc_loss.item(), total_loss.item())) # Save Losses for plotting later lm_losses.append(lm_loss.item()) mc_losses.append(mc_loss.item()) total_losses.append(total_loss.item()) curr_total_loss = total_loss.item() if curr_total_loss <= min_total_loss: min_total_loss = curr_total_loss best_model_wts = copy.deepcopy(model.state_dict()) run.log('best_min_loss', np.float(min_total_loss)) iters += 1 break break return model
def run():
    parser = ArgumentParser()
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)",
                        choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    # `tokenizer` is expected to be defined elsewhere in this module; it is used below
    # by add_special_tokens_ and for tokenizing the prompt.
    config = GPT2Config(vocab_size=50003)
    model = GPT2DoubleHeadsModel(config)
    if args.model_checkpoint:
        print("\tLoad model from ", args.model_checkpoint)
        model.load_state_dict(torch.load(args.model_checkpoint), strict=False)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    history = ''
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history = tokenizer.tokenize(raw_text)

        result_set = set()
        for _ in range(0, 10):
            with torch.no_grad():
                out_ids = sample_sequence(history, tokenizer, model, args)
            # convert_ids_to_tokens returns a list, which is not hashable; store a joined string
            result_set.add(" ".join(tokenizer.convert_ids_to_tokens(out_ids)))
        for result in result_set:
            print(result)