def main():
    """Fine-tune and/or evaluate a GPT-2 double-heads model on the ROCStories cloze task.

    Command-line driven: parses args, seeds all RNGs, loads/encodes the dataset,
    optionally trains (joint LM + multiple-choice objective), saves the model to
    ``--output_dir``, and optionally evaluates accuracy on the dev set.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='gpt2', help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed every RNG so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('{} is on use...'.format(device))
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model.
    # This loading function also adds new tokens and embeddings called `special tokens`;
    # these new embeddings will be fine-tuned on the RocStories dataset.
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = [tokenizer.convert_tokens_to_ids(token) for token in special_tokens]
    model = GPT2DoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets (download the hosted ROCStories archive when no paths given).
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """Tokenize and encode a nested object: str -> ids, int passes through, else recurse."""
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return [tokenize_and_encode(o) for o in obj]

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer:
    # truncated story + the longer continuation + 3 special tokens, capped below.
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
                       for dataset in encoded_datasets
                       for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare input tensors and dataloaders.
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             # FIX: honor the --weight_decay flag (was hard-coded to 0.01, inconsistent
             # with the weight_decay passed to OpenAIAdam below).
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                # Joint objective: weighted language-modeling loss + multiple-choice loss.
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average (0.7/0.3) of the loss for the progress bar.
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss,
                                                                          optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)
            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'train_loss': train_loss}
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    """Fine-tune a GPT-2 LM on parallel source/target text with gradient accumulation,
    periodic evaluation, best-checkpoint saving, and sample generation.

    Command-line driven; requires CUDA (see note at the device selection below).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default='tuned_gpt2', type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--source_eval', type=str, default='')
    parser.add_argument('--target_eval', type=str, default='')
    parser.add_argument('--source_train', type=str, default='')
    parser.add_argument('--target_train', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--effective_batch_size', type=int, default=64)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--bsz', type=int, default=20)
    parser.add_argument('--bptt', type=int, default=40)
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    # print(args)

    model_type = 'gpt2'

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed every RNG so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): this hard-requires CUDA and fails on CPU-only machines — confirm intended.
    device = torch.device(type='cuda')
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    # if not args.do_train and not args.do_eval:
    #     raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2').to('cuda')
    model.to(device)

    bptt = args.bptt
    bsz = args.bsz
    batches_eval, labels_eval, nbatch_eval = load_dataset(args.source_eval, args.target_eval,
                                                          tokenizer, bptt, bsz)
    batches_train, labels_train, nbatch_train = load_dataset(args.source_train, args.target_train,
                                                             tokenizer, bptt, bsz)

    # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    print('here 3')
    num_train_optimization_steps = nbatch_train * args.num_train_epochs
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    eval_loss_min = None
    print('here 4')
    model.to(device)
    nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
    model.train()
    for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_steps = 0
        for i_batch in tqdm(list(range(nbatch_train)), desc='Evaluating epoch {}'.format(epoch_i)):
            batch = batches_train[i_batch]
            batch = batch.cuda()
            lm_labels = labels_train[i_batch].cuda()
            if batch.numel() == 0:
                break

            # Compute the LM loss manually so padding positions (label -1) can be masked out.
            loss_fct = CrossEntropyLoss(reduction='none')
            lm_logits, _ = model(batch)
            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = batch[:, 1:].contiguous()
            shift_labels_mask = (lm_labels[:, 1:].contiguous().view(-1) != -1).float()
            loss_mat = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            # avg over non-masked indices
            loss = (loss_mat * shift_labels_mask).view(-1).sum() / shift_labels_mask.sum()
            loss.backward()

            # Only step the model once 'effective_batch_size' examples have accumulated.
            if (i_batch * args.train_batch_size) % args.effective_batch_size == 0 and i_batch != 0:
                optimizer.step()
                optimizer.zero_grad()

            tr_loss += loss.item()
            exp_average_loss = loss.item() if exp_average_loss is None \
                else 0.7 * exp_average_loss + 0.3 * loss.item()
            nb_tr_steps += 1

            ###
            # Evaluations
            ###
            if i_batch % 1000 == 0:
                eval_loss = eval_model(model, nbatch_eval, batches_eval, labels_eval, bsz)
                # If eval loss improves, save the model.
                if eval_loss_min is None or eval_loss < eval_loss_min:
                    eval_loss_min = eval_loss
                    model_to_save = model
                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    to_json_file(model_to_save.config, output_config_file)
                # FIX: original was print('eval_loss {}',format(eval_loss)) — a comma instead
                # of '.', which called the builtin format() and printed the raw '{}' template.
                print('eval_loss {}'.format(eval_loss))
                model.train()

            if i_batch % 200 == 0:
                # Log the smoothed loss and generate a sample from the current model.
                print("Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss,
                                                                optimizer.get_lr()[0]))
                model.eval()
                if model_type == 'gpt':
                    encode = lambda a: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(a))
                    decode = tokenizer.decode
                elif model_type == 'gpt2':
                    encode = tokenizer.encode
                    decode = tokenizer.decode
                generate_from_model(encode, decode, model=model, model_type=model_type)
                model.train()
def train_model(epochs=10,
                num_gradients_accumulation=4,
                batch_size=8,
                gpu_id=0,
                lr=1e-4,
                load_dir='decoder_model',
                decoder_model='original_pretrained_model_for_bertGPT.pth'):
    """Train a BertGPT encoder-decoder model with gradient accumulation.

    Loads weights from ``decoder_model``, trains on ``train_data.pth``, reports
    validation perplexity on ``validate_data.pth`` each epoch, and writes a
    per-epoch checkpoint under ``load_dir``.

    :param epochs: number of training epochs
    :param num_gradients_accumulation: batches to accumulate before an optimizer step
    :param batch_size: per-batch example count for both loaders
    :param gpu_id: CUDA device index (the model is always placed on GPU)
    :param lr: learning rate for OpenAIAdam
    :param load_dir: directory (relative to cwd) where checkpoints are saved
    :param decoder_model: path to the pretrained state dict to start from
    """
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------LOAD MODEL-----------------
    print('load the model....')
    model = BertGPT()
    model.load_state_dict(torch.load(decoder_model))
    # model = nn.DataParallel(model, device_ids = [0])
    model = model.to(device)
    print('load success')
    # ------------------------END LOAD MODEL--------------

    # ------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("train_data.pth")
    train_dataset = MyDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size,
                                  num_workers=2, collate_fn=collate_fn)
    val_data = torch.load("validate_data.pth")
    val_dataset = MyDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset, shuffle=True, batch_size=batch_size,
                                num_workers=2, collate_fn=collate_fn)
    # ------------------------END LOAD TRAIN DATA--------------

    # ------------------------SET OPTIMIZER-------------------
    num_train_optimization_steps = len(train_dataset) * epochs // batch_size // num_gradients_accumulation
    param_optimizer = list(model.named_parameters())
    # No weight decay on biases and LayerNorm parameters; frozen params excluded.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay) and p.requires_grad],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay) and p.requires_grad],
         'weight_decay': 0.0},
    ]
    print('train')
    print(len(optimizer_grouped_parameters[0]['params']))
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=lr,
                           warmup=0.01,
                           max_grad_norm=1.0,
                           weight_decay=0.01,
                           t_total=num_train_optimization_steps)
    # ------------------------END SET OPTIMIZER--------------

    # ------------------------START TRAINING-------------------
    update_count = 0
    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        # ------------------------training------------------------
        model.train()
        losses = 0
        times = 0
        for batch in tqdm(train_dataloader, desc='dirs'):
            batch = [item.to(device) for item in batch]
            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
            logits = model(encoder_input, mask_encoder_input, decoder_input, mask_decoder_input)
            # Shift so logits at position t predict decoder token t+1.
            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()
            loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
            loss.backward()
            losses += loss.item()
            times += 1
            update_count += 1
            # FIX: step exactly every `num_gradients_accumulation` batches. The original
            # condition `== num_gradients_accumulation - 1` fired one batch early on the
            # very first step, so it accumulated one fewer batch than intended.
            if update_count % num_gradients_accumulation == 0:
                optimizer.step()
                optimizer.zero_grad()
        end = time.time()
        print('-' * 20 + f'epoch {epoch}' + '-' * 20)
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        # ------------------------validate------------------------
        model.eval()
        perplexity = 0
        batch_count = 0
        print('start calculate the perplexity....')
        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                batch = [item.to(device) for item in batch]
                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
                logits = model(encoder_input, mask_encoder_input, decoder_input, mask_decoder_input)
                out = logits[:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()
                loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
                # NOTE(review): this averages per-batch exp(loss), not exp(mean loss) — confirm intended.
                perplexity += np.exp(loss.item())
                batch_count += 1
        print(f'validate perplexity: {perplexity / batch_count}')

        # Save a checkpoint after each epoch.
        direct_path = os.path.join(os.path.abspath('.'), load_dir)
        if not os.path.exists(direct_path):
            os.mkdir(direct_path)
        torch.save(model.state_dict(), os.path.join(direct_path, str(epoch) + "model.pth"))
def main():
    """Fine-tune an OpenAI GPT LM-head model on sentences carrying sentiment/control
    special tokens (<POS>/<NEG>/<CON_START>/<START>/<END>), then optionally evaluate.

    Only tokens after <START> contribute to the LM loss; sentences longer than
    --max_seq_length or lacking <START> are dropped.
    """
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=1)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--max_seq_length', type=int, default=110)
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Set the seed for random, numpy, PyTorch
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned
    special_tokens = ['<POS>', '<NEG>', '<CON_START>', '<START>', '<END>']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    start_token_id = tokenizer.convert_tokens_to_ids(['<START>'])[0]
    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Load and encode dataset
    def tokenize_and_encode(file_path):
        """Tokenize each line of the input file (truncated to 512 tokens) and encode to ids.

        :param file_path: Path of the input file, dtype: str
        :return: encoded dataset dtype: list
        """
        with open(file_path, 'r') as in_fp:
            lines = in_fp.read().splitlines()
        # NOTE: `tokenized_dataset` aliases `lines` and is overwritten entry by entry.
        tokenized_dataset = lines
        for i, line in enumerate(tqdm(lines)):
            token = tokenizer.tokenize(line)[:512]
            tokenized_dataset[i] = tokenizer.convert_tokens_to_ids(token)
        return tokenized_dataset

    logger.info("Encoding dataset...")
    train_dataset = tokenize_and_encode(args.train_dataset)
    eval_dataset = tokenize_and_encode(args.eval_dataset)
    print("Training samples = {}".format(len(train_dataset)))
    print("Validation samples = {}".format(len(eval_dataset)))
    print("Example = {}".format(train_dataset[0]))
    time.sleep(2)

    # Compute the max input length for the Transformer.
    # Remove all sentences longer than max_seq_length or missing a <START> token.
    train_dataset = [x for x in train_dataset
                     if len(x) <= args.max_seq_length and start_token_id in x]
    eval_dataset = [x for x in eval_dataset
                    if len(x) <= args.max_seq_length and start_token_id in x]
    input_length = max(max(len(t) for t in train_dataset),
                       max(len(q) for q in eval_dataset))
    if n_gpu > 1:
        input_length = min(input_length, model.module.config.n_positions)
    else:
        input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
    print("Input Length = {}".format(input_length))

    def pre_process_dataset(encoded_dataset, input_length, start_token_id):
        """Create torch tensors of input ids and LM labels.

        :param encoded_dataset: Input dataset, dtype: list
        :param input_length: Maximum length of sentence from training and eval dataset, dtype: int
        :param start_token_id: id of the '<START>' token, dtype: int
        :return: tuple (input_ids, lm_labels), each of shape [len(encoded_dataset), input_length]
        """
        n_batch = len(encoded_dataset)
        input_ids = np.zeros(shape=(n_batch, input_length), dtype=np.int64)
        lm_labels = np.full(shape=(n_batch, input_length), fill_value=-1, dtype=np.int64)
        for i, tokens in enumerate(encoded_dataset):
            try:
                start_id_index = tokens.index(start_token_id)
                input_ids[i, :len(tokens)] = tokens
                start_id_index = tokens.index(start_token_id)
                # LM loss calculated only for tokens after <START> token in the sentence
                lm_labels[i, start_id_index:len(tokens) - 1] = tokens[start_id_index + 1:len(tokens)]
            except ValueError:
                print("Index {} doesn't have start token".format(i))
        input_ids = torch.tensor(input_ids)
        lm_labels = torch.tensor(lm_labels)
        tensor_dataset = (input_ids, lm_labels)
        return tensor_dataset

    # Prepare input tensors and dataloders
    train_tensor_dataset = pre_process_dataset(train_dataset, input_length,
                                               start_token_id=start_token_id)
    eval_tensor_dataset = pre_process_dataset(eval_dataset, input_length,
                                              start_token_id=start_token_id)
    print("Training Example Input ids= {}".format(train_tensor_dataset[0][0]))
    print("Training Example Language Modeling ids = {}".format(train_tensor_dataset[1][0]))
    time.sleep(10)
    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         # FIX: honor the --weight_decay flag (was hard-coded to 0.01).
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                if n_gpu > 1:
                    loss.mean().backward()
                else:
                    loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                if n_gpu > 1:
                    tmp_loss = loss.mean().item()
                else:
                    tmp_loss = loss.item()
                # FIX: accumulate tr_loss — the original never did, so the reported
                # train_loss in the eval results was always 0.
                tr_loss += tmp_loss
                exp_average_loss = tmp_loss if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * tmp_loss
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss,
                                                                          optimizer.get_lr()[0])

            # Save a per-epoch checkpoint.
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(args.output_dir,
                                             "pytorch_model_zero_grad_{}.bin".format(epoch + 1))
            torch.save(model_to_save.state_dict(), output_model_file)
            # FIX: the original reloaded this checkpoint into a brand-new
            # OpenAIGPTLMHeadModel and rebound `model` to it, while `optimizer` kept
            # references to the OLD model's parameters — so epochs after the first
            # never updated the model actually being trained. We keep training the
            # same model; the on-disk checkpoint above is unchanged.

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, lm_labels = batch
            with torch.no_grad():
                lm_loss = model(input_ids, lm_labels=lm_labels)
            eval_loss += lm_loss.mean().item()
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss, 'train_loss': train_loss}
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    """Fine-tune an OpenAI-GPT sequence classifier on a .tsv task (default: ag_news).

    Pipeline: parse CLI args -> set up (optionally distributed) device and seeds ->
    load task processor/tokenizer/model -> optionally train with gradient
    accumulation and TensorBoard logging -> optionally evaluate and write
    eval_results.txt (plus the MNLI mismatched set when task_name == "mnli").
    """
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--data_dir", default='/hdd/user4/gpt_classification/dataset/ag_news', type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name')
    parser.add_argument("--task_name", default='ag_news', type=str, help="The name of the task to train.")
    parser.add_argument("--output_dir", default='/hdd/user4/gpt_classification/experiment/ag_news', type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")
    # NOTE(review): no type= here, so a value passed on the CLI arrives as str,
    # while the default stays int 1 — confirm OpenAIAdam accepts both.
    parser.add_argument("--max_grad_norm", default=1)
    parser.add_argument('--weight_decay', type=float, default=0.0)
    ## Other parameters
    parser.add_argument("--cache_dir", default='/hdd/user4/gpt_classification/pretrained', type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    # NOTE(review): default=True combined with action='store_true' means these two
    # flags can never be turned off from the command line.
    parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", default=True, action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=9.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir', default=True, action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Device selection: single-process (possibly multi-GPU) vs. one process per GPU.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # n_gpu = torch.cuda.device_count()
        n_gpu = 1  # hard-coded to 1 even when more GPUs are visible (DataParallel branch below is dead)
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    # Per-forward micro-batch size; one optimizer step spans gradient_accumulation_steps of these.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG we use for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    # Resolve the task-specific example processor and output mode (classification/regression).
    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name](args.data_dir)
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
    # Tokenizer/model gain extra embeddings for these special tokens, fine-tuned during training.
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    model = OpenAIGPTForClassification.from_pretrained(args.model_name,
                                                       num_special_tokens=len(special_tokens),
                                                       num_labels=num_labels)
    if args.local_rank == 0:
        torch.distributed.barrier()
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss = 0
    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()
        # Prepare data loader
        train_examples = processor.get_train_examples()
        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
            list(filter(None, args.model_name.split('/'))).pop(), str(args.max_seq_length), str(task_name)))
        # Feature caching: reuse pickled features when present, otherwise build and cache them.
        # NOTE(review): bare `except:` hides any failure mode (corrupt cache, interrupts) by rebuilding.
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except:
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info(" Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        # NOTE(review): t_total counts micro-batches, not optimizer steps; with
        # gradient_accumulation_steps > 1 the LR schedule ends early — confirm intent.
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        model.train()
        for _ in range(int(args.num_train_epochs)):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, _, label_ids = batch  # segment_ids unused by this model
                # define a new function to compute loss values for both output_modes
                logits = model.forward(input_ids, input_mask)
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Step only once per full accumulation window.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)
        # NOTE(review): tb_writer only exists on rank -1/0; other ranks would
        # NameError here if they reached this line — confirm rank gating upstream.
        tb_writer.close()

    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTForClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples()
        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
            list(filter(None, args.model_name.split('/'))).pop(), str(args.max_seq_length), str(task_name)))
        # Same cache-or-build pattern as for the training features.
        try:
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        except:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info(" Saving eval features into cached file %s", cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                logits = model.forward(input_ids, input_mask)
            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            # Accumulate logits/labels across batches into single numpy arrays.
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(
                    preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            # Dump the top-5 predicted label indices per example to gpt_top5.pkl (cwd).
            output_odp = []
            for arr in preds:
                t = (-arr).argsort()[:5]
                output_odp.append(t.tolist())
            file_path = 'D:/바탕화면/(논문)multi-pretraining/NYT'  # NOTE(review): assigned but never used
            with open('gpt_top5.pkl','wb') as f:
                pickle.dump(output_odp,f)
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)
        print('preds:',preds,'label:',out_label_ids)
        loss = tr_loss / global_step if args.do_train else None
        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM: re-run evaluation on the mismatched dev set.
        if task_name == "mnli":
            task_name = "mnli-mm"
            # NOTE(review): constructed without data_dir, unlike the earlier
            # processors[task_name](args.data_dir) — confirm this processor's signature.
            processor = processors[task_name]()
            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')
            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info(" Num examples = %d", len(eval_examples))
            logger.info(" Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None
            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    # NOTE(review): this call uses the BERT-style keyword signature,
                    # unlike model.forward(input_ids, input_mask) above — confirm it
                    # matches OpenAIGPTForClassification.forward.
                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(
                        preds[0], logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(
                        out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)
            loss = tr_loss / global_step if args.do_train else None
            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss
            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
def train_model(epochs=10, num_gradients_accumulation=4, batch_size=4, gpu_id=0, lr=1e-5, load_dir='decoder_model'):
    """Fine-tune the decoder LM of an encoder/decoder pair with gradient accumulation.

    Loads "encoder.pth"/"decoder.pth" weights and "train_data.pth"/"validate_data.pth"
    tensor datasets from the current directory, trains only the decoder's parameters
    with OpenAIAdam, reports validation perplexity after every epoch, and writes a
    per-epoch decoder checkpoint into `load_dir`.

    Args:
        epochs: number of passes over the training set.
        num_gradients_accumulation: micro-batches accumulated per optimizer step.
        batch_size: micro-batch size per forward pass.
        gpu_id: CUDA device index to train on.
        lr: learning rate for OpenAIAdam.
        load_dir: directory that receives "<epoch>decoder.pth" checkpoints.
    """
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------ LOAD MODEL ------------------------
    print('load the model....')
    encoder = TransformerEncoder()
    decoder = TransformerDecoderLM()
    encoder.load_state_dict(torch.load("encoder.pth"))
    decoder.load_state_dict(torch.load("decoder.pth"))
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    print('load success')

    # ------------------------ LOAD TRAIN DATA ------------------------
    train_data = torch.load("train_data.pth")
    train_dataset = TensorDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
    val_data = torch.load("validate_data.pth")
    val_dataset = TensorDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset, shuffle=True, batch_size=batch_size)

    # ------------------------ SET OPTIMIZER ------------------------
    # One optimizer step is taken per `num_gradients_accumulation` micro-batches.
    num_train_optimization_steps = len(train_dataset) * epochs // batch_size // num_gradients_accumulation
    # Only the decoder is optimized; encoder weights are never updated.
    param_optimizer = list(decoder.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=lr,
                           warmup=0.01,
                           max_grad_norm=1.0,
                           weight_decay=0.01,
                           t_total=num_train_optimization_steps)

    # FIX: make sure the checkpoint directory exists before the first torch.save,
    # which would otherwise fail with FileNotFoundError.
    os.makedirs(os.path.join(os.path.abspath('.'), load_dir), exist_ok=True)

    # ------------------------ START TRAINING ------------------------
    update_count = 0
    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        # ------------------------ training ------------------------
        decoder.train()
        losses = 0
        times = 0
        for batch in train_dataloader:
            batch = [item.to(device) for item in batch]
            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            _, past = encoder(encoder_input, mask_encoder_input)
            mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
            logits, _ = decoder(decoder_input, mask, past=past, past_length=0)

            # Shift for next-token prediction: position t predicts token t+1.
            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()
            loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")

            # BUG FIX: scale the loss so accumulated gradients *average* over the
            # window instead of summing — previously the effective step size grew
            # linearly with num_gradients_accumulation.
            (loss / num_gradients_accumulation).backward()
            losses += loss.item()  # log the unscaled per-token loss
            times += 1
            update_count += 1
            # BUG FIX: step once per full accumulation window. The old test
            # `% n == n - 1` made the very first step fire one micro-batch early.
            if update_count % num_gradients_accumulation == 0:
                optimizer.step()
                optimizer.zero_grad()
        # BUG FIX: discard any partial accumulation so stale gradients don't
        # leak into the next epoch's first optimizer step.
        optimizer.zero_grad()

        end = time.time()
        print('-' * 20 + f'epoch {epoch}' + '-' * 20)
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        # ------------------------ validate ------------------------
        decoder.eval()
        perplexity = 0
        batch_count = 0
        print('start calculate the perplexity....')
        with torch.no_grad():
            for batch in val_dataloader:
                batch = [item.to(device) for item in batch]
                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
                _, past = encoder(encoder_input, mask_encoder_input)
                mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
                logits, _ = decoder(decoder_input, mask, past=past, past_length=0)
                out = logits[:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()
                loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
                # Mean of per-batch perplexities (not perplexity of the mean loss).
                perplexity += np.exp(loss.item())
                batch_count += 1
        print(f'validate perplexity: {perplexity / batch_count}')

        torch.save(
            decoder.state_dict(),
            os.path.join(os.path.abspath('.'), load_dir, str(epoch) + "decoder.pth"))
def main():
    """Fine-tune an OpenAI-GPT double-heads classifier on ATIS intent/slot data.

    Pipeline: parse CLI args -> seed RNGs and pick device -> load labels,
    tokenizer and model -> encode the train/eval datasets and build dataloaders ->
    optionally train (with per-epoch evaluation logged to log.csv) and save the
    model -> optionally run a final evaluation, writing per-example predictions
    to prediction.txt and metrics to eval_results.txt.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name')
    parser.add_argument('--task', type=str, default='intent', choices=['intent', 'slot'],
                        help="Intent or slot prediction")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    # NOTE(review): lm_coef is effectively unused — the LM-loss term is commented
    # out in the training loop below.
    parser.add_argument('--lm_coef', type=float, default=0.0)
    parser.add_argument('--probabilistic_masks', action='store_true')
    parser.add_argument('--attn_bias', action='store_true')
    parser.add_argument('--linearize', action='store_true')
    args = parser.parse_args()
    print(args)

    # Seed every RNG we use for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # One label per line in the task-specific label file (intent or slot).
    label_list = list()
    for line in open(LABEL_FILES[args.task]):
        label_list.append(line.strip())

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsClsModel.from_pretrained(args.model_name, num_labels=len(label_list),
                                                         num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        elif isinstance(obj, np.ndarray):
            # Pass precomputed arrays (e.g. masks/biases) through untouched.
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_atis_dataset(args.train_dataset, label_list, tokenizer,
                                      args.probabilistic_masks, args.linearize)
    eval_dataset = load_atis_dataset(args.eval_dataset, label_list, tokenizer,
                                     args.probabilistic_masks, args.linearize, plot=False)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer (leave room for 2 special tokens).
    max_length = model.config.n_positions - 2
    input_length = max(len(utt[:max_length]) + 2 \
                       for dataset in encoded_datasets for utt, _, _, _, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length,
                                           *special_tokens_ids, len(label_list))
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        results = []
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids,
                               attn_bias=attn_bias if args.attn_bias else None)
                # loss = args.lm_coef * losses[0] + losses[1]
                loss = losses[1]  # classification-head loss only; LM loss disabled
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss for the progress bar.
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
            # Per-epoch evaluation on the dev set.
            # NOTE(review): model.train() is never called again after this, so from
            # the second epoch onward training runs in eval mode (dropout disabled)
            # — confirm whether that is intended.
            model.eval()
            eval_loss = 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_labels = [], []
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
                with torch.no_grad():
                    # Two forwards: one with labels for the loss, one without for logits.
                    _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids,
                                       attn_bias=attn_bias if args.attn_bias else None)
                    _, mc_logits = model(input_ids, mc_token_ids, position_ids=pos_ids,
                                         attn_bias=attn_bias if args.attn_bias else None)
                mc_logits = mc_logits.detach().cpu().numpy()
                mc_labels = mc_labels.to('cpu').numpy()
                eval_loss += mc_loss.mean().item()
                all_logits.append(mc_logits)
                all_labels.append(mc_labels)
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1
            eval_loss = eval_loss / nb_eval_steps
            all_logits = np.concatenate(all_logits, axis=0)
            all_labels = np.concatenate(all_labels, axis=0)
            eval_f1 = f1(all_logits, all_labels)
            eval_acc = accuracy(all_logits, all_labels) / nb_eval_examples
            train_loss = tr_loss/nb_tr_steps if args.do_train else None
            result = {'eval_loss': eval_loss, 'eval_f1': eval_f1, 'eval_accuracy': eval_acc,
                      'train_loss': train_loss}
            print(result)
            results.append(result)
        # Rewrite the full per-epoch metrics log after training.
        with open(os.path.join(args.output_dir, "log.csv"), "w") as csvfile:
            writer = csv.DictWriter(
                csvfile, ["train_loss", "eval_loss", "eval_accuracy", "eval_f1"]
            )
            writer.writeheader()
            writer.writerows(results)

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsClsModel.from_pretrained(
            args.output_dir,num_labels=len(label_list), num_special_tokens=len(special_tokens))
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        all_logits, all_labels = [], []
        # Per-example predictions dump (utterance, predicted multi-hot, gold multi-hot).
        fw = open("prediction.txt", "w")
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids,
                                   attn_bias=attn_bias if args.attn_bias else None)
                _, mc_logits = model(input_ids, mc_token_ids, position_ids=pos_ids,
                                     attn_bias=attn_bias if args.attn_bias else None)
            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            # Threshold logits at 0.5 to obtain multi-hot predictions.
            # NOTE(review): 0.5 on raw logits presumes the head outputs probabilities
            # (e.g. post-sigmoid) — confirm against the model implementation.
            for i, (o, l) in enumerate(zip((mc_logits>=0.5).astype(np.int32), mc_labels.astype(np.int32))):
                # if np.any(o != l):
                #     pred = [label_list[idx] for idx, val in enumerate(o) if val == 1]
                #     true = [label_list[idx] for idx, val in enumerate(l) if val == 1]
                pred = o
                true = l
                fw.write(f"{eval_dataset[nb_eval_examples+i][0]}\n{pred}\n{true}\n\n")
            eval_loss += mc_loss.mean().item()
            all_logits.append(mc_logits)
            all_labels.append(mc_labels)
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        fw.close()
        eval_loss = eval_loss / nb_eval_steps
        all_logits = np.concatenate(all_logits, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)
        eval_f1 = f1(all_logits, all_labels)
        eval_acc = accuracy(all_logits, all_labels) / nb_eval_examples
        train_loss = tr_loss/nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss, 'eval_f1': eval_f1, 'eval_accuracy': eval_acc,
                  'train_loss': train_loss}
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def train(self): if self.debug_mode: self.epochs = 1 # 加载 dataloader train_loader, valid_loader = self.create_dataloader() # 训练 self.seed_everything() lr = 2e-5 accumulation_steps = math.ceil(self.batch_size / self.base_batch_size) # 加载预训练模型 print("Load pre-trained model") model = GPT2NeuralNet.from_pretrained(self.gpt2_model_path, cache_dir=None) model.zero_grad() model = model.to(self.device) """ # 不同的参数组设置不同的 weight_decay param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] """ epoch_steps = int(self.train_len * 0.5 / self.base_batch_size / accumulation_steps) num_train_optimization_steps = int(self.epochs * epoch_steps) valid_every = math.floor(epoch_steps * accumulation_steps / 5) optimizer = OpenAIAdam(model.parameters(), lr=lr, warmup=0.05, t_total=num_train_optimization_steps) # 渐变学习速率 #scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) # 开始训练 print("Train") best_auc_score_1 = 0 best_auc_score_2 = 0 best_auc_score_3 = 0 best_auc_score_4 = 0 f_log = open("train_log.txt", "w") for epoch in range(self.epochs): model.train() optimizer.zero_grad() # 加载每个 batch 并训练 train_start_time = time.time() for i, batch_data in enumerate(train_loader): x_batch = batch_data[0] y_batch = batch_data[1] target_weight_batch = batch_data[2] aux_weight_batch = batch_data[3] identity_weight_batch = batch_data[4] np_weight_batch = batch_data[5] np_identity_weight_batch = batch_data[6] y_pred = model(x_batch.to(self.device)) target_loss, aux_loss, identity_loss, np_loss = self.custom_loss( y_pred, y_batch, epoch, target_weight_batch, aux_weight_batch, identity_weight_batch, 
np_weight_batch) loss = target_loss + aux_loss + identity_loss + np_loss with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if (i + 1) % accumulation_steps == 0: optimizer.step() optimizer.zero_grad() # 验证 if (i + 1) % valid_every == 0: model.eval() stage = int((i + 1) / valid_every) train_stage_duration = int( (time.time() - train_start_time) / 60) valid_start_time = time.time() y_pred = np.zeros((len(self.train_df) - self.train_len)) for j, valid_batch_data in enumerate(valid_loader): x_batch = valid_batch_data[0] batch_y_pred = self.sigmoid( model(x_batch.to( self.device)).detach().cpu().numpy())[:, 0] y_pred[j * self.base_batch_size:(j + 1) * self.base_batch_size] = batch_y_pred # 计算得分 auc_score = self.evaluator.get_final_metric(y_pred) valid_duration = int((time.time() - valid_start_time) / 60) train_start_time = time.time() f_log.write( "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f\n" % (epoch, stage, train_stage_duration, valid_duration, auc_score)) print( "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f" % (epoch, stage, train_stage_duration, valid_duration, auc_score)) if auc_score > best_auc_score_4: state_dict = model.state_dict() if auc_score > best_auc_score_1: best_auc_score_1 = auc_score torch.save(state_dict, "model1.bin") elif auc_score > best_auc_score_2: best_auc_score_2 = auc_score torch.save(state_dict, "model2.bin") elif auc_score > best_auc_score_3: best_auc_score_3 = auc_score torch.save(state_dict, "model3.bin") else: best_auc_score_4 = auc_score torch.save(state_dict, "model4.bin") with open("model_score.txt", "w") as f: f.write( "model1: %.4f model2: %.4f model3: %.4f model4: %.4f" % (best_auc_score_1, best_auc_score_2, best_auc_score_3, best_auc_score_4)) print( "model1: %.4f model2: %.4f model3: %.4f model4: %.4f" % (best_auc_score_1, best_auc_score_2, best_auc_score_3, best_auc_score_4)) model.train() if self.last is True: state_dict 
= model.state_dict() torch.save(state_dict, "model_last.bin") # del 训练相关输入和模型 training_history = [train_loader, valid_loader, model, optimizer] for variable in training_history: del variable gc.collect()
# model_B.train() for batch in pbar: batch = batch[0] # without relative position # if sum([len(item) for item in batch[1]]) > 1024: # continue record_loss = train_one_iter(batch, update_count, fp16=False) update_count += 1 if update_count % num_gradients_accumulation == num_gradients_accumulation - 1: # update for gradient accumulation optimizer.step() optimizer.zero_grad() # speed measure end = time.time() speed = batch_size * num_gradients_accumulation / (end - start) start = end # show progress pbar.set_postfix(loss=record_loss, speed=speed) "Evaluation" model_A.eval() # model_B.eval() val_acc, val_f1 = validate(val_dataloader) print(f"val f1: {val_f1}, valid acc: {val_acc}") is_best_so_far = val_f1 > best_f1
def main():
    """Fine-tune OpenAIGPTDoubleHeadsModel on SWAG/CODAH multiple-choice data.

    Parses CLI args, optionally trains (with gradient accumulation), saves the
    model to ``output_dir/pytorch_model.bin``, then optionally evaluates and
    writes ``eval_results.txt`` plus a JSON dump of per-example model outputs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .csv files (or other data files) for the task."
    )
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--answer_only",
        default=False,
        action='store_true',
        help="Whether to run with answers only (blank out question).")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--load_model_from",
        default=None,
        type=str,
        help=
        "The saved model file to load before doing any training or eval (if both --do_train and --do_eval are specified, the saved model will be loaded, then trained, then the trained model will be evaluated)."
    )
    parser.add_argument(
        '--train_filename',
        type=str,
        default='train.csv',
        help="Filename to load train data from (relative to data_dir)")
    parser.add_argument(
        '--eval_filename',
        type=str,
        default='val.csv',
        help="File to load eval data from (relative to data_dir)")
    parser.add_argument(
        '--data_format',
        type=str,
        choices=['swag', 'codah'],
        default='swag',
        help=
        "Format of the train and eval files (original SWAG CSV format vs our TSV format)"
    )
    parser.add_argument(
        '--model_labels_save_filename',
        type=str,
        default='model_labels.json',
        help=
        "JSON file to save model outputs/labels to (relative to output_dir)")
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=32)
    parser.add_argument('--eval_batch_size', type=int, default=8)
    # NOTE(review): declared type=int, so fractional clip values are rejected
    # at the CLI — consider type=float (value is passed to OpenAIAdam).
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=8,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    # Effective per-step micro-batch size; the requested batch size is reached
    # by accumulating gradients over gradient_accumulation_steps micro-batches.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed all RNGs for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Eval-only runs default to loading the checkpoint written by a previous
    # training run in the same output_dir.
    if args.do_eval and (not args.do_train) and args.load_model_from is None:
        args.load_model_from = os.path.join(args.output_dir,
                                            'pytorch_model.bin')

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    config = model.config
    if args.load_model_from:
        # Replace the freshly initialized model with saved fine-tuned weights.
        model_state_dict = torch.load(args.load_model_from)
        model = OpenAIGPTDoubleHeadsModel(config)
        model.load_state_dict(model_state_dict)
    model.to(device)

    # Load and encode the datasets
    logger.info("Loading datasets...")
    datasets = []
    dataset_keys = dict()
    if args.do_train:
        train_dataset = read_swag_examples(os.path.join(
            args.data_dir, args.train_filename),
                                           is_training=True,
                                           answer_only=args.answer_only,
                                           data_format=args.data_format)
        train_dataset = [
            EncodedSwagExample(ex, tokenizer)
            for ex in tqdm(train_dataset, desc='Encoding train')
        ]
        dataset_keys['train'] = len(datasets)
        datasets.append(train_dataset)
    if args.do_eval:
        eval_dataset = read_swag_examples(os.path.join(args.data_dir,
                                                       args.eval_filename),
                                          is_training=True,
                                          answer_only=args.answer_only,
                                          data_format=args.data_format)
        eval_dataset = [
            EncodedSwagExample(ex, tokenizer)
            for ex in tqdm(eval_dataset, desc='Encoding eval')
        ]
        dataset_keys['eval'] = len(datasets)
        datasets.append(eval_dataset)

    # Compute the max input length for the Transformer
    # (context + best ending + 3 special tokens must fit in n_positions)
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(swagex.context_tokens[:max_length]) + len(swagex.start_ending_tokens[:max_length]) + max(len(ending[:max_length]) for ending in swagex.endings_tokens) + 3 \
                       for dataset in datasets for swagex in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model
    print('---')
    print('Input length: {}\n'.format(input_length))
    print('---')

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(datasets, input_length, max_length,
                                           *special_tokens_ids)
    if args.do_train:
        train_tensor_dataset = tensor_datasets[dataset_keys['train']]
    if args.do_eval:
        eval_tensor_dataset = tensor_datasets[dataset_keys['eval']]

    # Prepare optimizer
    if args.do_train:
        train_data = TensorDataset(*train_tensor_dataset)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # No weight decay for biases and LayerNorm weights
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        #num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
        num_train_optimization_steps = int(
            len(train_data) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                # losses = (language-modeling loss, multiple-choice loss)
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_steps += 1
                # Exponential moving average of the loss for the progress bar
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

    # Save a trained model
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_eval:
        eval_data = TensorDataset(*eval_tensor_dataset)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Load a trained model that you have fine-tuned
        if args.do_train:
            model_state_dict = torch.load(output_model_file)
            model = OpenAIGPTDoubleHeadsModel(config)
            model.load_state_dict(model_state_dict)
            model.to(device)
        model.eval()

        all_model_outputs = []
        data_index = 0
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                # Two forward passes: one with labels for the loss,
                # one without for the raw classification logits.
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels,
                                   mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)
            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            # Record per-example logits/labels for the JSON dump
            for i in range(input_ids.size(0)):
                output_obj = dict()
                output_obj['logits'] = [float(x) for x in mc_logits[i]]
                output_obj['true_label'] = int(mc_labels[i])
                output_obj['model_label'] = int(np.argmax(mc_logits[i]))
                output_obj['swag_data'] = datasets[
                    dataset_keys['eval']][data_index].raw_example.to_dict()
                all_model_outputs.append(output_obj)
                data_index += 1
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
        with open(
                os.path.join(args.output_dir,
                             args.model_labels_save_filename), 'w') as f:
            json.dump(all_model_outputs, f)
class ModelClassifier(object):
    """Multi-task dialog-act classifier on top of two GPT-2 LMs.

    Wraps two language models (model_A for the persuader side, model_B for the
    persuadee side) and three SequenceSummary classification heads:
    clf_A / clf_B predict per-sentence dialog acts for each speaker, clf_TF is
    a binary head scoring whether a candidate utterance should be picked.
    The LM `past` cache is threaded through the whole dialog so each sentence
    is classified in context.

    Fix vs. previous revision: in train(), the "best B" check compared
    val_f1_TF against best_f1_B (copy/paste bug); it now compares val_f1_B.
    """

    def __init__(self, config, which_to_train, model_A, model_B, tokenizer,
                 device1, device2):
        # config.num_labels = le.classes_.shape[0] # label encode
        # super().__init__()
        self.config = config
        # Label encoders mapping act strings <-> class ids for each speaker
        self.le_A = load_pkl("training/data/labelencoder_A.pkl")
        self.le_B = load_pkl("training/data/labelencoder_B.pkl")
        self.clf_A = SequenceSummary(num_labels=self.le_A.classes_.shape[0],
                                     config=config)
        self.clf_B = SequenceSummary(num_labels=self.le_B.classes_.shape[0],
                                     config=config)
        self.clf_TF = SequenceSummary(num_labels=2, config=config)
        # self.apply(self.init_weight)
        self.past = None       # running LM cache for interactive use
        self.history = []      # raw sentences fed through set_past()
        # model
        self.model_A = model_A
        self.model_B = model_B
        self.tokenizer = tokenizer
        self.cls_token_id = tokenizer.cls_token_id
        self.device1 = device1
        self.device2 = device2
        self.to_device(self.device1)
        # define loss
        self.criterion = nn.CrossEntropyLoss()
        # optimizer parameters
        self.num_gradients_accumulation = 1
        self.batch_size = 1
        self.batch_size_TF = 8
        # load training data
        self.load_data()

    def reload(self):
        """Reset the interactive state (LM cache and sentence history)."""
        self.past = None
        self.history = []

    def to_device(self, device):
        """Move the three classification heads to `device` and record it on them."""
        self.clf_A = self.clf_A.to(device)
        self.clf_B = self.clf_B.to(device)
        self.clf_TF = self.clf_TF.to(device)
        self.clf_A.device = device
        self.clf_B.device = device
        self.clf_TF.device = device
        # self.model_A = self.model_A.to(self.device)
        # self.model_B = self.model_B.to(self.device)

    def load_data(self):
        """Load the act-classification and TF (pick-or-not) datasets and build dataloaders."""
        self.train_data = load_pkl("training/data/train_data.pkl")
        self.val_data = load_pkl("training/data/val_data.pkl")
        self.train_data_TF, self.val_data_TF = torch.load("demonstration/old_model/demonstration_train_with_text_only.pkl", map_location="cpu"), \
                                               torch.load("demonstration/old_model/demonstration_val_with_text_only.pkl", map_location="cpu")
        self.train_dataset = PersuadeDataset(self.train_data, self.tokenizer)
        self.val_dataset = PersuadeDataset(self.val_data, self.tokenizer)
        self.train_dataset_TF, self.val_dataset_TF = TFDataset(self.train_data_TF, self.tokenizer), \
                                                     TFDataset(self.val_data_TF, self.tokenizer)
        self.train_dataloader = DataLoader(dataset=self.train_dataset,
                                           shuffle=True,
                                           batch_size=self.batch_size,
                                           collate_fn=self.train_dataset.collate)
        # NOTE(review): val dataloader reuses train_dataset.collate — presumably
        # the collate functions are identical; confirm against PersuadeDataset.
        self.val_dataloader = DataLoader(dataset=self.val_dataset,
                                         shuffle=False,
                                         batch_size=self.batch_size,
                                         collate_fn=self.train_dataset.collate)
        self.train_dataloader_TF = DataLoader(dataset=self.train_dataset_TF,
                                              shuffle=True,
                                              batch_size=self.batch_size_TF,
                                              collate_fn=self.train_dataset_TF.collate)
        self.val_dataloader_TF = DataLoader(dataset=self.val_dataset_TF,
                                            shuffle=False,
                                            batch_size=self.batch_size_TF,
                                            collate_fn=self.val_dataset_TF.collate)

    def load_model(self, all_model_dir=None, clf_A_dir=None, clf_B_dir=None,
                   clf_TF_dir=None):
        """Load saved weights: either a bundled 5-tuple checkpoint (`all_model_dir`)
        or individual classifier-head checkpoints."""
        if all_model_dir is None:
            if clf_A_dir:
                clf_A_state = torch.load(clf_A_dir)
                self.clf_A.load_state_dict(clf_A_state)
                print(f"clf_A loaded")
            if clf_B_dir:
                clf_B_state = torch.load(clf_B_dir)
                self.clf_B.load_state_dict(clf_B_state)
                print(f"clf_B loaded")
            if clf_TF_dir:
                clf_TF_state = torch.load(clf_TF_dir)
                self.clf_TF.load_state_dict(clf_TF_state)
                print(f"clf_TF loaded")
        else:
            model_A_state, model_B_state, clf_A_state, clf_B_state, clf_TF_state = torch.load(all_model_dir)
            self.model_A.load_state_dict(model_A_state)
            self.model_B.load_state_dict(model_B_state)
            self.clf_A.load_state_dict(clf_A_state)
            self.clf_B.load_state_dict(clf_B_state)
            self.clf_TF.load_state_dict(clf_TF_state)
            print(f"all models loaded")

    def train(self, which_to_train, num_epochs=10):
        """Jointly fine-tune the LMs and the selected classifier heads.

        which_to_train: iterable/str containing any of "A", "B", "TF" — selects
        which heads (in addition to both LMs) get optimized and which losses
        drive training inside train_one_iter.
        Saves a bundled checkpoint whenever any task reaches a new best f1.
        """
        # optimizer: both LMs always participate; heads are opt-in
        param_optimizer = list(self.model_A.named_parameters()) + \
                          list(self.model_B.named_parameters())
        if "A" in which_to_train:
            print("clf_A to optimize")
            param_optimizer += list(self.clf_A.named_parameters())
        if "B" in which_to_train:
            print("clf_B to optimize")
            param_optimizer += list(self.clf_B.named_parameters())
        if "TF" in which_to_train:
            print("clf_TF to optimize")
            param_optimizer += list(self.clf_TF.named_parameters())
        # No weight decay for biases and LayerNorm weights
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(self.train_dataset) * num_epochs // self.batch_size // self.num_gradients_accumulation
        self.optimizer = OpenAIAdam(optimizer_grouped_parameters,
                                    lr=2e-5,
                                    warmup=0.1,
                                    max_grad_norm=1.0,
                                    weight_decay=0.01,
                                    t_total=num_train_optimization_steps)

        update_count = 0
        progress_bar = tqdm.tqdm
        start = time.time()
        best_acc_A = -float('Inf')
        best_f1_A = -float('Inf')
        best_acc_B = -float('Inf')
        best_f1_B = -float('Inf')
        best_acc_TF = -float('Inf')
        best_f1_TF = -float('Inf')

        for ep in tqdm.tqdm(range(num_epochs)):
            # set train mode
            self.model_A.train()
            self.model_B.train()
            self.clf_A.train()
            self.clf_B.train()
            self.clf_TF.train()

            "Training"
            pbar = progress_bar(self.train_dataloader)
            # TF batches are cycled alongside the (usually longer) dialog batches
            train_dataloader_TF_list = list(self.train_dataloader_TF)
            for i, batch in enumerate(pbar):
                batch = batch[0]
                batch_TF = train_dataloader_TF_list[i % len(train_dataloader_TF_list)]
                # without relative position
                # if sum([len(item) for item in batch[1]]) > 1024:
                #     input("1024 here!")
                #     continue
                record_loss = self.train_one_iter(batch, batch_TF,
                                                  update_count,
                                                  which_to_train,
                                                  fp16=False)
                update_count += 1

                if update_count % self.num_gradients_accumulation == self.num_gradients_accumulation - 1:
                    # update for gradient accumulation
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                # speed measure
                end = time.time()
                speed = self.batch_size * self.num_gradients_accumulation / (end - start)
                start = end

                # show progress
                pbar.set_postfix(loss=record_loss, speed=speed)

            "Evaluation"
            self.model_A.eval()
            self.model_B.eval()
            self.clf_A.eval()
            self.clf_B.eval()
            self.clf_TF.eval()
            (val_acc_A, val_f1_A), (val_acc_B, val_f1_B), (val_acc_TF, val_f1_TF) = self.validate(self.val_dataloader, self.val_dataloader_TF, ep, which_to_train)
            print(f"A: val f1: {val_f1_A}, valid acc: {val_acc_A}")
            print(f"B: val f1: {val_f1_B}, valid acc: {val_acc_B}")
            print(f"TF: val f1: {val_f1_TF}, valid acc: {val_acc_TF}")

            is_best_so_far_TF = val_f1_TF > best_f1_TF
            is_best_so_far_A = val_f1_A > best_f1_A
            # BUGFIX: compared val_f1_TF (not val_f1_B) against best_f1_B before
            is_best_so_far_B = val_f1_B > best_f1_B
            if is_best_so_far_TF:
                best_acc_TF = val_acc_TF
                best_f1_TF = val_f1_TF
            if is_best_so_far_A:
                best_acc_A = val_acc_A
                best_f1_A = val_f1_A
            if is_best_so_far_B:
                best_acc_B = val_acc_B
                best_f1_B = val_f1_B
            # Save the bundled checkpoint at most once per epoch
            SAVED = False
            if is_best_so_far_TF and not SAVED:
                SAVED = True
                torch.save((self.model_A.state_dict(), self.model_B.state_dict(),
                            self.clf_A.state_dict(), self.clf_B.state_dict(), self.clf_TF.state_dict()),
                           f"Checkpoint_act_clf/epoch{ep}_multitask_TF_best_acc_{val_acc_TF}_f1_{val_f1_TF}_A_acc_{val_acc_A}_f1_{val_f1_A}_B_acc_{val_acc_B}_f1_{val_f1_B}.pth")
            if is_best_so_far_A and not SAVED:
                SAVED = True
                torch.save((self.model_A.state_dict(), self.model_B.state_dict(),
                            self.clf_A.state_dict(), self.clf_B.state_dict(), self.clf_TF.state_dict()),
                           f"Checkpoint_act_clf/epoch{ep}_multitask_TF_best_acc_{val_acc_TF}_f1_{val_f1_TF}_A_acc_{val_acc_A}_f1_{val_f1_A}_B_acc_{val_acc_B}_f1_{val_f1_B}.pth")
            if is_best_so_far_B and not SAVED:
                SAVED = True
                torch.save((self.model_A.state_dict(), self.model_B.state_dict(),
                            self.clf_A.state_dict(), self.clf_B.state_dict(), self.clf_TF.state_dict()),
                           f"Checkpoint_act_clf/epoch{ep}_multitask_TF_best_acc_{val_acc_TF}_f1_{val_f1_TF}_A_acc_{val_acc_A}_f1_{val_f1_A}_B_acc_{val_acc_B}_f1_{val_f1_B}.pth")
            # if which_to_train == "A":
            #     torch.save(model_A.state_dict(), f"Checkpoint_act_clf/A/best_acc_{best_acc}_f1_{best_f1}.pth")
            # elif which_to_train == "B":
            #     torch.save(model_A.state_dict(), f"Checkpoint_act_clf/B/best_acc_{best_acc}_f1_{best_f1}.pth")
            # checkpointer.save_checkpoint(ep, model_A.state_dict(), {"None": None}, is_best_so_far)

        print("finally")
        print("A: \nbest acc: {}, best f1: {}".format(best_acc_A, best_f1_A))
        print("B: \nbest acc: {}, best f1: {}".format(best_acc_B, best_f1_B))
        print("TF: \nbest acc: {}, best f1: {}".format(best_acc_TF, best_f1_TF))

    def validate(self, dataloader, dataloader_TF, ep, which_to_train):
        """Run one validation pass; returns ((acc_A, f1_A), (acc_B, f1_B), (acc_TF, f1_TF)).

        Also dumps per-sentence predictions to CSVs under Checkpoint_act_clf/.
        """
        from sklearn.metrics import f1_score
        from sklearn.metrics import confusion_matrix
        from utils import print_cm

        # evaluation mode
        self.model_A.eval()
        self.model_B.eval()
        self.clf_A.eval()
        self.clf_B.eval()
        self.clf_TF.eval()

        def get_numbers_for_one_task(sents, logits, acts, x, y_true, y_pred, total, correct):
            # Accumulate predictions/targets for one task into running lists/counters
            _, predicted_acts = torch.max(logits, 1)
            x.extend(sents)
            y_true.extend(acts.tolist()[0])
            y_pred.extend(predicted_acts.tolist())
            total += len(acts.tolist()[0])
            correct += (predicted_acts == acts).sum().item()
            return x, y_true, y_pred, total, correct

        progress_bar = tqdm.tqdm
        with torch.no_grad():
            pbar = progress_bar(dataloader)
            dataloader_TF_list = list(dataloader_TF)
            x_A, y_true_A, y_pred_A, correct_A, total_A = [], [], [], 0, 0
            x_B, y_true_B, y_pred_B, correct_B, total_B = [], [], [], 0, 0
            x_TF, y_true_TF, y_pred_TF, correct_TF, total_TF = [], [], [], 0, 0
            for i, batch in enumerate(pbar):
                batch = batch[0]
                batch_TF = dataloader_TF_list[i % len(dataloader_TF_list)]
                # if sum([len(item) for item in batch[1]]) > 1024:
                #     continue
                sents_A, logits_A, acts_A,\
                sents_B, logits_B, acts_B,\
                sents_TF, logits_TF, acts_TF = self.train_one_iter(batch, batch_TF, None, which_to_train, fp16=False, is_validation=True)
                x_A, y_true_A, y_pred_A, total_A, correct_A = get_numbers_for_one_task(sents_A, logits_A, acts_A,
                                                                                       x_A, y_true_A, y_pred_A, total_A, correct_A)
                x_B, y_true_B, y_pred_B, total_B, correct_B = get_numbers_for_one_task(sents_B, logits_B, acts_B,
                                                                                       x_B, y_true_B, y_pred_B, total_B, correct_B)
                x_TF, y_true_TF, y_pred_TF, total_TF, correct_TF = get_numbers_for_one_task(sents_TF, logits_TF, acts_TF,
                                                                                            x_TF, y_true_TF, y_pred_TF, total_TF, correct_TF)
            f1_A = f1_score(y_true_A, y_pred_A, average="weighted")
            f1_B = f1_score(y_true_B, y_pred_B, average="weighted")
            f1_TF = f1_score(y_true_TF, y_pred_TF, average="binary")
            # pdb.set_trace()
            pd.DataFrame(zip(x_A, self.le_A.inverse_transform(y_true_A).tolist(), self.le_A.inverse_transform(y_pred_A).tolist()),
                         columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/A/act_classifier_val_results_epoch{ep}.csv", index=None)
            print(f"A: Epoch {ep} Validation accuracy: {correct_A/total_A}, f1: {f1_A}")
            pd.DataFrame(zip(x_B, self.le_B.inverse_transform(y_true_B).tolist(), self.le_B.inverse_transform(y_pred_B).tolist()),
                         columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/B/act_classifier_val_results_epoch{ep}.csv", index=None)
            print(f"B: Epoch {ep} Validation accuracy: {correct_B/total_B}, f1: {f1_B}")
            pd.DataFrame(zip(x_TF, y_true_TF, y_pred_TF),
                         columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/TF/act_classifier_val_results_epoch{ep}.csv", index=None)
            print(f"TF: Epoch {ep} Validation accuracy: {correct_TF/total_TF}, f1: {f1_TF}")
            # print_cm(confusion_matrix(y_true, y_pred, labels=range(len(le.classes_))), labels=[l[:] for l in le.classes_.tolist()])
        return (correct_A/total_A, f1_A), (correct_B/total_B, f1_B), (correct_TF/total_TF, f1_TF)

    def set_past(self, sent, which_task):
        "sent: str, a whole sent"
        # assert sent.startswith("A:") or sent.startswith("B:")
        if sent.startswith("A:") or sent.startswith("B:"):
            # Callers are expected to pass the sentence WITHOUT a speaker
            # prefix; stop in the debugger, then strip it.
            pdb.set_trace()
            sent = sent[2:]
        if which_task == "A":
            lm_model = self.model_A
            prefix = "A:"
            device = lm_model.device
        elif which_task == "B":
            lm_model = self.model_B
            prefix = "B:"
            device = lm_model.device
        elif which_task == "TF":
            lm_model = self.model_A
            prefix = "A:"
            # candidate_sent = prefix+" ".join(separate_sents)
            device = lm_model.device
        # encode sent and advance the running LM cache
        self.history.append(prefix+sent)
        sent = self.tokenizer.encode(prefix) + self.tokenizer.encode(sent) + self.train_dataset.turn_ending
        sent = torch.LongTensor(sent).unsqueeze(0).to(device)
        past = self.move_to_device(self.past, lm_model)
        _, past, _ = lm_model(sent, past)
        self.past = past

    def predict(self, separate_sents, which_task):
        "separate_sents: list of sentences with no prefix"
        past = self.past
        if which_task == "A":
            lm_model = self.model_A
            clf_head = self.clf_A
            le = self.le_A
            prefix = "A:"
            device = lm_model.device
        elif which_task == "B":
            lm_model = self.model_B
            clf_head = self.clf_B
            le = self.le_B
            prefix = "B:"
            device = lm_model.device
        elif which_task == "TF":
            lm_model = self.model_A
            clf_head = self.clf_TF
            prefix = "A:"
            candidate_sent = " ".join(separate_sents)
            device = lm_model.device

        # evaluation mode
        self.model_A.eval()
        self.model_B.eval()
        self.clf_A.eval()
        self.clf_B.eval()
        self.clf_TF.eval()

        with torch.no_grad():
            if which_task in ["A", "B"]:
                # Classify each sentence in turn, threading the LM cache through
                all_logits = []
                for i, sent in enumerate(separate_sents):
                    if i == 0:
                        sent = self.tokenizer.encode(prefix) + self.tokenizer.encode(sent)
                    else:
                        sent = self.tokenizer.encode(" "+sent)
                    # pdb.set_trace()
                    sent = torch.LongTensor(sent).unsqueeze(0).to(device)
                    past = self.move_to_device(past, lm_model)
                    logits, past, hidden_states = lm_model(sent, past)
                    # encode [CLS]
                    cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(device)
                    _, _, hidden_states = lm_model(cls_token_tensor, past)
                    hidden_states = self.move_to_device(hidden_states, clf_head)
                    mc_logits = clf_head(hidden_states[-1], cls_index=None).squeeze(-1)
                    all_logits.append(mc_logits)
                # finish tail
                end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(device)
                past = self.move_to_device(past, lm_model)
                _, past, _ = lm_model(end_input, past)
                # get labels
                all_logits = torch.cat(all_logits, dim=0)
                # pdb.set_trace()
                _, predicted_acts = torch.max(all_logits, 1)
                predicted_acts = predicted_acts.tolist()
                predicted_acts = le.inverse_transform(predicted_acts).tolist()
                return predicted_acts, past
            elif which_task == "TF":
                # encode candidate
                candidate = self.tokenizer.encode(prefix) + self.tokenizer.encode(candidate_sent)
                # pdb.set_trace()
                candidate = torch.LongTensor(candidate).unsqueeze(0).to(device)
                past = self.move_to_device(past, self.model_A)
                logits, past, hidden_states = self.model_A(candidate, past)
                # encode [CLS]
                cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(device)
                _, _, hidden_states = self.model_A(cls_token_tensor, past)
                hidden_states = self.move_to_device(hidden_states, self.clf_TF)
                mc_logits = self.clf_TF(hidden_states[-1], cls_index=None).squeeze(-1)
                # pdb.set_trace()
                _, predicted_acts = torch.max(mc_logits, 1)
                predicted_acts = predicted_acts.tolist()
                assert len(predicted_acts) == 1
                return predicted_acts[0], past

    def train_one_iter(self, batch, batch_TF, update_count, which_to_train,
                       fp16=False, is_validation=False):
        """Process one dialog batch + one TF batch.

        In training mode, computes loss_A + loss_B + loss_TF and backpropagates,
        returning the (un-scaled) loss value. With is_validation=True, returns
        the raw sentences/logits/targets for all three tasks instead of
        backpropagating.
        """
        # role_ids, whole_sents, separate_sents, acts = batch
        past = None
        all_sents_A, all_logits_A, all_acts_A = [], [], []
        all_sents_B, all_logits_B, all_acts_B = [], [], []
        for i, (role_id, whole_sent, separate_sents, acts) in enumerate(zip(*batch)):
            if role_id == 0:
                # Speaker A turn (model_A on device1)
                whole_sent = torch.LongTensor(whole_sent).unsqueeze(0).to(self.device1)
                try:
                    assert self.tokenizer.decode(whole_sent[0][:2].tolist()) == "A:"
                except:
                    pdb.set_trace()
                if "A" in which_to_train:
                    # Advance the cache over the whole turn once, then classify
                    # each sentence separately from the pre-turn cache.
                    past = self.move_to_device(past, self.model_A)
                    _, real_past, _ = self.model_A(whole_sent, past)
                    for act, sent in zip(acts, separate_sents):
                        all_sents_A.append(self.tokenizer.decode(sent))
                        # pdb.set_trace()
                        # 'A:HI I would like to tell you About a childrens charity called Save the CHildren.'
                        sent = torch.LongTensor(sent).unsqueeze(0).to(self.device1)
                        past = self.move_to_device(past, self.model_A)
                        logits, past, hidden_states = self.model_A(sent, past)
                        # pdb.set_trace()
                        # encode [CLS]
                        cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device1)
                        past = self.move_to_device(past, self.model_A)
                        _, _, hidden_states = self.model_A(cls_token_tensor, past)
                        mc_logits = self.clf_A(hidden_states[-1], cls_index=None).squeeze(-1)
                        all_logits_A.append(mc_logits)
                        all_acts_A.append(act)
                    # pdb.set_trace()
                    past = real_past
                    # # finish tail
                    # end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(self.device1)
                    # _, past, _ = self.model_A(end_input, past)
                else:
                    past = self.move_to_device(past, self.model_A)
                    _, past, hidden_states = self.model_A(whole_sent, past)
            else:
                # Speaker B turn (model_B on device2)
                whole_sent = torch.LongTensor(whole_sent).unsqueeze(0).to(self.device2)
                try:
                    assert self.tokenizer.decode(whole_sent[0][:2].tolist()) == "B:"
                except:
                    pdb.set_trace()
                if "B" in which_to_train:
                    past = self.move_to_device(past, self.model_B)
                    _, real_past, _ = self.model_B(whole_sent, past)
                    for act, sent in zip(acts, separate_sents):
                        all_sents_B.append(self.tokenizer.decode(sent))
                        # pdb.set_trace()
                        # 'B:ok please do'
                        sent = torch.LongTensor(sent).unsqueeze(0).to(self.device2)
                        past = self.move_to_device(past, self.model_B)
                        logits, past, hidden_states = self.model_B(sent, past)
                        # encode [CLS]
                        cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device2)
                        _, _, hidden_states = self.model_B(cls_token_tensor, past)
                        hidden_states = self.move_to_device(hidden_states, self.clf_B)
                        mc_logits = self.clf_B(hidden_states[-1], cls_index=None).squeeze(-1)
                        all_logits_B.append(mc_logits)
                        all_acts_B.append(act)
                    past = real_past
                    # finish tail
                    # end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(self.device2)
                    # past = self.move_to_device(past, self.model_B)
                    # _, past, _ = self.model_B(end_input, past)
                else:
                    past = self.move_to_device(past, self.model_B)
                    _, past, hidden_states = self.model_B(whole_sent, past)
        all_logits_A = torch.cat(all_logits_A, dim=0)
        all_acts_A = torch.tensor(all_acts_A).unsqueeze(0).to(self.device1)
        # pdb.set_trace()
        loss_A = self.criterion(all_logits_A.view(-1, all_logits_A.size(-1)), all_acts_A.view(-1))
        all_logits_B = torch.cat(all_logits_B, dim=0)
        all_acts_B = torch.tensor(all_acts_B).unsqueeze(0).to(self.device1)
        loss_B = self.criterion(all_logits_B.view(-1, all_logits_B.size(-1)), all_acts_B.view(-1))

        # TF task: score each candidate utterance given its dialog context
        all_contexts_candidate_TF = []
        all_logits_TF = []
        all_acts_TF = []
        for one_dial in batch_TF:
            past = None
            contexts, candidate, pick_or_not = one_dial
            all_contexts_candidate_TF.append((" ".join([self.tokenizer.decode(c) for c in contexts]),
                                              self.tokenizer.decode(candidate)))
            # get past: even turns through model_A, odd turns through model_B
            for i, context in enumerate(contexts):
                if i % 2 == 0:
                    # pdb.set_trace()
                    # 'A:Would you like to know more about the charity Save the Children?\n\n\n'
                    context = torch.LongTensor(context).unsqueeze(0).to(self.device1)
                    past = self.move_to_device(past, self.model_A)
                    logits, past, hidden_states = self.model_A(context, past)
                else:
                    # pdb.set_trace()
                    # 'B:hello I am great.\n\n\n'
                    context = torch.LongTensor(context).unsqueeze(0).to(self.device2)
                    past = self.move_to_device(past, self.model_B)
                    logits, past, hidden_states = self.model_B(context, past)
            # encode candidate
            candidate = torch.LongTensor(candidate).unsqueeze(0).to(self.device1)
            past = self.move_to_device(past, self.model_A)
            logits, past, hidden_states = self.model_A(candidate, past)
            # encode [CLS]
            cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device1)
            _, _, hidden_states = self.model_A(cls_token_tensor, past)
            mc_logits = self.clf_TF(hidden_states[-1], cls_index=None).squeeze(-1)
            all_logits_TF.append(mc_logits)
            all_acts_TF.append(pick_or_not)
        all_logits_TF = torch.cat(all_logits_TF, dim=0)
        all_acts_TF = torch.tensor(all_acts_TF).unsqueeze(0).to(self.device1)
        loss_TF = self.criterion(all_logits_TF.view(-1, all_logits_TF.size(-1)), all_acts_TF.view(-1))

        if is_validation:
            return all_sents_A, all_logits_A, all_acts_A,\
                   all_sents_B, all_logits_B, all_acts_B,\
                   all_contexts_candidate_TF, all_logits_TF, all_acts_TF

        loss = loss_A.to(self.device1) + loss_B.to(self.device1) + loss_TF.to(self.device1)
        loss /= self.num_gradients_accumulation
        if fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        record_loss = loss.item() * self.num_gradients_accumulation
        return record_loss  # , perplexity

    def move_to_device(self, past, target):
        """Move a `past` cache onto `target.device` if it lives elsewhere; None passes through."""
        if past is not None and target.device != past[0].device:
            past = [p.to(target.device) for p in past]
        return past
def main():
    """Train a GPT-2 language model from scratch on a tokenized dataset.

    Reads hyper-parameters from ``parse_args()``, sets up TensorBoard logging
    under ``{args.logdir_root}/{args.run_name}-<timestamp>``, optionally wraps
    the optimizer in apex's FP16_Optimizer, and checkpoints the model at the
    end of every epoch.

    Side effects: mutates the module globals ``global_example_count``,
    ``global_token_count``, ``event_writer`` and ``logdir``; creates the log
    and output directories; writes TensorBoard events and checkpoints.
    """
    global global_example_count, global_token_count, event_writer, logdir
    args = parse_args()

    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.device = device
    use_cuda = (str(device) == 'cuda')
    if not use_cuda:
        # fp16 needs CUDA.  Only warn when the user actually asked for fp16;
        # previously the warning was printed on every CPU run.
        if args.fp16:
            print(f'WARNING: --fp16 requires --cuda, have {device}, ignoring --fp16 option')
        args.fp16 = False
    else:
        try:
            from apex.fp16_utils import FP16_Optimizer
        except ImportError:
            # Was a bare `except:`, which would also have swallowed
            # KeyboardInterrupt/SystemExit.
            print('WARNING: apex not installed, ignoring --fp16 option')
            args.fp16 = False

    logdir = f'{args.logdir_root}/{args.run_name}-{current_timestamp()}'
    # os.makedirs replaces `os.system(f'mkdir -p ...')`: portable and immune
    # to shell interpretation of special characters in the paths.
    os.makedirs(logdir, exist_ok=True)
    os.makedirs(args.output_dir, exist_ok=True)
    assert os.path.exists(args.data), f"Didn't find {args.data}"

    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    if args.fp16:
        model = model.half()
    model.to(device)

    # Setup TensorBoard logging.
    global_example_count = 0
    global_token_count = 0
    print(f"Logging to {logdir}")
    event_writer = SummaryWriter(logdir)
    log_tb("first", time.time())

    data_loader = get_data_loader(args.data, enc, args.batch_size, args)

    # Prepare optimizer.  We use OpenAIAdam because that's what run_openai_gpt
    # used; biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    num_train_optimization_steps = len(data_loader) * args.num_train_epochs
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if use_cuda and args.fp16:
        # If dynamic_loss_scale is False, static_loss_scale will be used.
        # If dynamic_loss_scale is True, it takes precedence over static_loss_scale.
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=1,
                                   dynamic_loss_scale=0,
                                   dynamic_loss_args={'init_scale': 2**16})

    # Reset all model weights so we can train from scratch.
    model.apply(model.init_weights)
    model.train()
    for current_epoch in range(args.num_train_epochs):
        data_loader_iter = iter(data_loader)
        for step in range(len(data_loader)):
            start_batch_ts = time.time()
            with timeit('dataloader'):
                batch = next(data_loader_iter)
            with timeit('batch.to'):
                batch = batch.to(device)
            with timeit('loss'):
                # LM objective: the batch serves as both input and target.
                loss = model(batch, lm_labels=batch)
            with timeit('loss.backward'):
                loss.backward()
            with timeit('optimizer.step'):
                optimizer.step()
                optimizer.zero_grad()
            end_batch_ts = time.time()

            # Throughput bookkeeping for this single batch.
            batch_time = end_batch_ts - start_batch_ts
            total_tokens = args.context_length * args.batch_size
            time_per_token = batch_time / total_tokens
            time_per_sample = batch_time / args.batch_size
            log_tb('times/tokens_per_sec', 1 / time_per_token)
            log_tb('times/samples_per_sec', 1 / time_per_sample)
            log_tb('times/step', 1000 * batch_time)

            if step % args.print_freq == 0:
                log_tb("memory/allocated_gb", torch.cuda.memory_allocated() / 1e9)
                log_tb("memory/max_allocated_gb", torch.cuda.max_memory_allocated() / 1e9)
                log_tb("memory/cached_gb", torch.cuda.memory_cached() / 1e9)
                log_tb("memory/max_cached_gb", torch.cuda.max_memory_cached() / 1e9)
                print('loss', loss.item())
                # FP16_Optimizer doesn't support get_lr, so lr is not logged.
                log_tb('loss', loss.item())
                with timeit('sample'):
                    sample = print_samples(
                        model, enc, args,
                        # Context is a random sample from the dataset.
                        context_tokens=next(iter(data_loader)),
                        batch_size=1,
                        length=20,
                        nsamples=1,
                        temperature=1,
                        top_k=40)
                event_writer.add_text('sample', sample, global_example_count)

            # TODO: replace with len(batch)
            global_example_count += args.batch_size
            global_token_count += total_tokens

        # Checkpoint at the end of each epoch.
        print("Checkpointing at epoch ", current_epoch)
        checkpoint(model, args)
def main():
    """Fine-tune OpenAIGPTDoubleHeadsModel on CommonsenseQA-style data.

    Splits --train_dataset 90/10 into train/dev, evaluates on --eval_dataset
    as test, and keeps the checkpoint with the best dev accuracy.  Training
    stops early after 10 epochs without a dev improvement.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed every RNG in play so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        # NOTE(review): only downloads; the fetched path is never used below.
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object (str -> ids, int passthrough,
        recurse into iterables). """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_csqa_dataset(args.train_dataset)
    print("Splitting train 90-10 into train-dev.")
    dev_dataset = train_dataset[int(len(train_dataset) * 0.9):]
    train_dataset = train_dataset[:int(len(train_dataset) * 0.9)]
    test_dataset = load_csqa_dataset(args.eval_dataset)
    datasets = (train_dataset, dev_dataset, test_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer:
    # question + longest answer + 3 special tokens, capped below.
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(question[:max_length]) +
        max(len(answer1[:max_length]), len(answer2[:max_length]),
            len(answer3[:max_length])) + 3 for dataset in encoded_datasets
        for question, answer1, answer2, answer3, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset = tensor_datasets[0]
    dev_tensor_dataset = tensor_datasets[1]
    test_tensor_dataset = tensor_datasets[2]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    dev_data = TensorDataset(*dev_tensor_dataset)
    dev_sampler = RandomSampler(dev_data)
    dev_dataloader = DataLoader(dev_data,
                                sampler=dev_sampler,
                                batch_size=args.train_batch_size)

    test_data = TensorDataset(*test_tensor_dataset)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data,
                                 sampler=test_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer: biases and LayerNorm params get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        best_dev_accuracy = 0
        test_acc_best_dev = 0
        best_dev_epoch = 0
        no_up = 0  # epochs since the last dev-accuracy improvement
        tqdm_epoch = tqdm(range(args.num_train_epochs), desc="Epoch")
        for epoch in tqdm_epoch:
            model.train()
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                # Joint objective: LM loss (weighted by lm_coef) + multiple-choice loss.
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss for the progress bar.
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

            # train_loss, train_accuracy = evaluate(model, device, train_dataloader, desc="Evaluate Train")
            dev_loss, dev_accuracy = evaluate(model,
                                              device,
                                              dev_dataloader,
                                              desc="Evaluate Dev")
            test_loss, test_accuracy = evaluate(model,
                                                device,
                                                test_dataloader,
                                                desc="Evaluate Test")
            train_loss = tr_loss / nb_tr_steps if args.do_train else None

            if dev_accuracy >= best_dev_accuracy:
                # New best model.
                best_dev_accuracy = dev_accuracy
                test_acc_best_dev = test_accuracy
                best_dev_epoch = epoch + 1
                no_up = 0
                # Save the new best model.
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(args.output_dir,
                                                 "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
            else:
                no_up += 1

            tqdm.write("\t ***** Eval results (Epoch %s) *****" %
                       str(epoch + 1))
            # tqdm.write("\t train_accuracy = %s" % str(train_accuracy))
            tqdm.write("\t dev_accuracy = %s" % str(dev_accuracy))
            tqdm.write("")
            tqdm.write("\t best_dev_accuracy = %s" % str(best_dev_accuracy))
            tqdm.write("\t test_acc_best_dev = %s" % str(test_acc_best_dev))
            tqdm.write("\t best_dev_epoch = %s" % str(best_dev_epoch))
            tqdm.write("\t no_up = %s" % str(no_up))
            tqdm.write("")

            # Early stop after 10 epochs without dev improvement.
            if no_up >= 10:
                tqdm_epoch.close()
                break
def main():
    """Fine-tune an OpenAI GPT LM on ConvAI2 conversations and report perplexity.

    Loads a pickled ConvAI2 parse, splits it 90/10 into train/eval, optionally
    trains with OpenAIAdam, saves weights/config to --output_dir, and in eval
    mode reloads the saved checkpoint and writes eval_results.txt.
    """
    # Pre-train model: eval_ppl = 104.29582476475977
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    # args = parser.parse_args()
    # NOTE(review): CLI parsing is bypassed — arguments are hard-coded here.
    # The second --dataset overrides the first (argparse keeps the last value).
    args = parser.parse_args([
        #'--do_train',
        '--do_eval',
        '--dataset=../data/convai2/train_both_original.txt',
        '--dataset=data/convai2/convai2_data.models',
        '--output_dir=./language-quality-subreward/gpt_output/'
    ])
    print(args)

    # This commented code was used for parsing and pickling data from the original data file.
    '''
    data = Parser(persona_limit=None, set_relation=None)
    print('Parsing...')
    data.parse(args.dataset)
    file_utils.save_model('data/convai2', data, '.models', 'convai2_data')
    '''
    # Load the pickled parse, flatten all conversations into one utterance
    # list, and split 90/10 into train/eval before freeing the original.
    data = file_utils.read_model('', args.dataset, '')
    data = list(chain(*data.conversation))
    #data = data[: 10]
    train_data_org = data[:int(0.9 * len(data))]
    eval_data_org = data[int(0.9 * len(data)):]
    del data
    print('')

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed every RNG in play so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, cache_dir="./cache/", special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTLMHeadModel.from_pretrained(
        args.model_name,
        cache_dir="./cache/",
        num_special_tokens=len(special_tokens))
    model.to(device)

    '''
    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
    '''

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object (str -> ids, int passthrough,
        recurse into iterables). """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = train_data_org
    eval_dataset = eval_data_org
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer:
    # longest (truncated) sentence plus 2 special tokens, capped below.
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(sent[:max_length]) + 2 for dataset in encoded_datasets
        for sent in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer: biases and LayerNorm params get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss for the progress bar.
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        config = model.config
        torch.save(model_to_save.state_dict(), output_model_file)
        # Yue: save the config:
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        # Load a trained model that you have fine-tuned
        '''
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)
        '''

    if args.do_eval:
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        config = OpenAIGPTConfig(output_config_file)
        # Load a trained model that you have fine-tuned
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)
        model.eval()
        eval_ppl = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        # NOTE(review): this loop builds autograd graphs during evaluation —
        # wrapping it in torch.no_grad() would cut memory use; confirm intent.
        # Also note ppl is averaged as mean of per-batch exp(loss), not
        # exp(mean loss).
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, lm_labels = batch
            loss = model(input_ids, lm_labels=lm_labels)
            eval_ppl += math.exp(loss.item())
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        eval_ppl = eval_ppl / nb_eval_steps
        # tr_loss/nb_tr_steps only exist when do_train ran; the conditional
        # expression guards the division in eval-only runs.
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_ppl': eval_ppl, 'train_loss': train_loss}
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))