def create_optim(model, args, rl=False):
    """
    Not applied.
    :param model: model whose parameters will be optimized
    :param args: parsed arguments providing ``lr`` and ``reinforce_lr``
    :param rl: if True, use the reinforcement-learning learning rate
    :return: an OpenAIAdam optimizer
    """
    if not rl:
        optimizer = OpenAIAdam(model.parameters(), lr=args.lr)
    else:
        optimizer = OpenAIAdam(model.parameters(), lr=args.reinforce_lr)
    return optimizer
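# Minimal sketch (assumption: the snippets in this file rely on the
# pytorch-pretrained-bert package for OpenAIAdam; individual repositories may
# instead vendor their own copy of the class, so the import path can differ).
from pytorch_pretrained_bert import OpenAIAdam
import torch

tiny_model = torch.nn.Linear(4, 4)
# Without warmup/t_total the optimizer effectively runs at a constant learning rate,
# which is how create_optim above uses it.
tiny_optimizer = OpenAIAdam(tiny_model.parameters(), lr=6.25e-5)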
def get_optimizer(model: GPT2LMHeadModel, data_loader: Any, num_epochs: int, lr: float):
    params = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in params if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
    ]
    num_train_optimization_steps = len(data_loader) * num_epochs
    optimizer = OpenAIAdam(
        optimizer_grouped_parameters,
        lr=lr,
        t_total=num_train_optimization_steps,
        # the following group of parameters is taken from train_gpt2.py
        warmup=0.002,
        max_grad_norm=1.0,
        weight_decay=0.01,
        schedule="warmup_linear",
        b2=.99)
    return optimizer
def test_openai_sched_init(self):
    m = torch.nn.Linear(50, 50)
    optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
    self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
    optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
    self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
    optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
    self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule))
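# Hedged sketch (not part of the test above): roughly the "warmup_linear" shape the
# last assertion checks for, reproduced with plain PyTorch so the curve can be
# inspected without OpenAIAdam's internal schedule classes. Values mirror the test.
import torch

m = torch.nn.Linear(50, 50)
opt = torch.optim.Adam(m.parameters(), lr=0.001)
t_total, warmup = 1000, 0.1

def warmup_linear(step):
    progress = step / t_total
    if progress < warmup:
        return progress / warmup                         # linear ramp-up to the base lr
    return max(0.0, (1.0 - progress) / (1.0 - warmup))   # then linear decay towards zero

sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda=warmup_linear)
# after each opt.step(), call sched.step() to advance the schedule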
def get_optimizer(model, args, data_loader):
    # We use OpenAIAdam because that's what run_openai_gpt used
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
    ]
    num_train_optimization_steps = len(data_loader) * args.num_train_epochs
    if args.optimizer == 'openai':
        optimizer = OpenAIAdam(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            warmup=args.warmup_proportion,
            max_grad_norm=args.max_grad_norm,
            weight_decay=args.weight_decay,
            schedule=args.lr_schedule,
            b2=.99,  # instead of .999
            t_total=num_train_optimization_steps)
    else:
        optimizer = torch.optim.Adam(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            betas=(0.9, 0.99),
            eps=1e-08,
            weight_decay=args.weight_decay,
            amsgrad=False)
        # give the plain Adam fallback the same get_lr() interface as OpenAIAdam
        optimizer.get_lr = lambda: [p['lr'] for p in optimizer.param_groups]
    return optimizer
def prep_optimizer(model, train_dataloader, epochs, learning_rate, warmup_proportion,
                   max_grad_norm, weight_decay):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    num_train_optimization_steps = len(train_dataloader) * epochs
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=learning_rate,
                           warmup=warmup_proportion,
                           max_grad_norm=max_grad_norm,
                           weight_decay=weight_decay,
                           t_total=num_train_optimization_steps)
    return optimizer
def __init__(self, model, opt):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    parameters_with_decay = []
    parameters_with_decay_names = []
    parameters_without_decay = []
    parameters_without_decay_names = []
    base_parameters = []
    base_parameters_names = []

    for n, p in model.named_parameters():
        if p.requires_grad:
            # fine-tune BERT
            if any(t in n for t in ["transformer"]):
                if any(t in n for t in no_decay):
                    parameters_without_decay.append(p)
                    parameters_without_decay_names.append(n)
                else:
                    parameters_with_decay.append(p)
                    parameters_with_decay_names.append(n)
            else:
                base_parameters.append(p)
                base_parameters_names.append(n)

    weight_decay = opt['weight_decay']
    bert_learning_rate = opt['gpt_lr']
    base_learning_rate = opt['lr']
    optimizer_grouped_parameters = [
        {'params': parameters_with_decay, 'weight_decay': weight_decay, 'lr': bert_learning_rate},
        {'params': parameters_without_decay, 'weight_decay': 0.0, 'lr': bert_learning_rate},
        {'params': base_parameters, 'weight_decay': weight_decay, 'lr': base_learning_rate}
    ]

    # print('The following parameters will be optimized WITH decay:')
    print(_ellipse(parameters_with_decay_names, 5, ' , '))
    print('The following parameters will be optimized WITHOUT decay:')
    print(_ellipse(parameters_without_decay_names, 5, ' , '))
    print('The following parameters will be optimized NORMALLY:')
    print(_ellipse(base_parameters_names, 5, ' , '))

    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=opt['gpt_lr'],
                           warmup=opt['warmup_proportion'],
                           max_grad_norm=opt['gradient_clip'],
                           t_total=opt.get('optimizer_step', -1))
    self.optimizer = optimizer
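# Minimal sketch (not from the snippet above) of how the per-group 'lr' entries used
# there behave in PyTorch: a group's own 'lr' overrides the optimizer-level default.
import torch

encoder = torch.nn.Linear(10, 10)  # stands in for the pretrained transformer
head = torch.nn.Linear(10, 2)      # stands in for freshly initialized task layers

opt = torch.optim.SGD(
    [
        {'params': encoder.parameters(), 'lr': 6.25e-5},  # small lr for pretrained weights
        {'params': head.parameters(), 'lr': 1e-3},        # larger lr for the new layers
    ],
    lr=1e-3,  # default, only used by groups that do not set their own 'lr'
)
print([g['lr'] for g in opt.param_groups])  # [6.25e-05, 0.001]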
def buildOptimizer(self, neural, epochs, batch_size, accumulation_steps, lr=2e-5, warmup=0.05):
    """ build bert optimizer """
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    param_optimizer = list(neural.named_parameters())
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
    ]
    num_train_optimization_steps = int(epochs * len(self.sentences) / batch_size / accumulation_steps)
    if self.optimizer == 'BertAdam':
        return BertAdam(optimizer_grouped_parameters,
                        lr=lr,
                        warmup=warmup,
                        t_total=num_train_optimization_steps)
    else:
        return OpenAIAdam(optimizer_grouped_parameters,
                          lr=lr,
                          warmup=warmup,
                          t_total=num_train_optimization_steps)
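# Illustrative arithmetic only (these numbers are made up, not taken from the class
# above): with gradient accumulation, t_total counts optimizer updates, not batches.
epochs, num_sentences, batch_size, accumulation_steps = 3, 10_000, 16, 4
num_train_optimization_steps = int(epochs * num_sentences / batch_size / accumulation_steps)
print(num_train_optimization_steps)  # 468, i.e. one update per 16 * 4 = 64 examples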
def train_model(epochs=10,
                num_gradients_accumulation=4,
                batch_size=4,
                gpu_id=0,
                lr=1e-5,
                load_dir='decoder_model'):
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------LOAD MODEL-----------------
    print('load the model....')
    encoder = TransformerEncoder()
    decoder = TransformerDecoderLM()
    encoder.load_state_dict(torch.load("encoder.pth"))
    decoder.load_state_dict(torch.load("decoder.pth"))
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    print('load success')
    # ------------------------END LOAD MODEL--------------

    # ------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("train_data.pth")
    train_dataset = TensorDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
    val_data = torch.load("validate_data.pth")
    val_dataset = TensorDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset, shuffle=True, batch_size=batch_size)
    # ------------------------END LOAD TRAIN DATA--------------

    # ------------------------SET OPTIMIZER-------------------
    num_train_optimization_steps = len(train_dataset) * epochs // batch_size // num_gradients_accumulation
    param_optimizer = list(decoder.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
    ]
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=lr,
                           warmup=0.01,
                           max_grad_norm=1.0,
                           weight_decay=0.01,
                           t_total=num_train_optimization_steps)
    # ------------------------END SET OPTIMIZER--------------

    # ------------------------START TRAINING-------------------
    update_count = 0
    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        # ------------------------training------------------------
        decoder.train()
        losses = 0
        times = 0
        for batch in train_dataloader:
            batch = [item.to(device) for item in batch]
            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
            _, past = encoder(encoder_input, mask_encoder_input)
            mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
            logits, _ = decoder(decoder_input, mask, past=past, past_length=0)

            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()
            loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
            loss.backward()

            losses += loss.item()
            times += 1
            update_count += 1
            if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
                optimizer.step()
                optimizer.zero_grad()

        end = time.time()
        print('-' * 20 + f'epoch {epoch}' + '-' * 20)
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        # ------------------------validate------------------------
        decoder.eval()
        perplexity = 0
        batch_count = 0
        print('start calculate the perplexity....')
        with torch.no_grad():
            for batch in val_dataloader:
                batch = [item.to(device) for item in batch]
                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
                _, past = encoder(encoder_input, mask_encoder_input)
                mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
                logits, _ = decoder(decoder_input, mask, past=past, past_length=0)

                out = logits[:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()
                loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
                perplexity += np.exp(loss.item())
                batch_count += 1

        print(f'validate perplexity: {perplexity / batch_count}')

        torch.save(
            decoder.state_dict(),
            os.path.join(os.path.abspath('.'), load_dir, str(epoch) + "decoder.pth"))
# optimizer
num_epochs = 10
num_gradients_accumulation = 1
num_train_optimization_steps = len(train_dataset) * num_epochs // batch_size // num_gradients_accumulation

param_optimizer = list(model_A.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

optimizer = OpenAIAdam(optimizer_grouped_parameters,
                       lr=2e-5,
                       warmup=0.1,
                       max_grad_norm=1.0,
                       weight_decay=0.01,
                       t_total=num_train_optimization_steps)

# In[12]:

# support fp16
# [model_A, model_B], optimizer = amp.initialize([model_A, model_B], optimizer, opt_level="O1")

# In[13]:

import tqdm

update_count = 0
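# Minimal sketch of the gradient-accumulation loop this notebook cell prepares for.
# train_dataloader, device and the model_A(...) -> scalar loss call are assumptions
# here, not taken from the notebook itself.
for step, batch in enumerate(train_dataloader):
    loss = model_A(*[t.to(device) for t in batch])   # assumed to return a scalar loss
    (loss / num_gradients_accumulation).backward()   # average gradients over the window
    if (step + 1) % num_gradients_accumulation == 0:
        optimizer.step()   # OpenAIAdam applies its max_grad_norm clipping inside step()
        optimizer.zero_grad()
        update_count += 1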
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    '''
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
    '''
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_recipes_dataset(args.train_dataset)
    # eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset,)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions - 2
    input_length = max(len(story[:max_length]) + 2 for dataset in encoded_datasets for story in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset = tensor_datasets[0]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
    '''
    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
    '''

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                print(batch)
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                loss = model(input_ids, mc_token_ids, lm_labels=lm_labels)
                print(loss)
                '''
                loss.backward()
                optimizer.step()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
                '''

    # Save a trained model
    '''
###
params = list(filter(lambda x: x.requires_grad, model.parameters())) + list(criterion.parameters())
total_params = sum(p.data.nelement() for p in params if p.requires_grad)

if args.mode == 'GPT':
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'ln_']  # Add 'ln_1' to test if it's better
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay) and 'transformer' in n],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay) and 'transformer' in n],
         'weight_decay': 0.0}
    ]
    num_train_optimization_steps = train_data.size(0) * args.epochs // args.batch_size
    optimizer_gpt = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)
    params = [p for n, p in param_optimizer if 'transformer' not in n]

tools.print_log(args.save, args)
tools.print_log(args.save, 'Model total parameters:{}'.format(total_params))

if args.mode == 'GPT':
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    with open('GPT_index.pkl', 'rb') as handle:
        gptdic = pickle.load(handle)

###############################################################################
# Training code
###############################################################################
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='gpt2', help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('{} is on use...'.format(device))
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    # model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model = GPT2DoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    # GPT2DoubleHeadsModel.set_num_special_tokens(model, len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
                       for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default='/hdd/user4/gpt_classification/dataset/ag_news', type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name')
    parser.add_argument("--task_name", default='ag_news', type=str, help="The name of the task to train.")
    parser.add_argument("--output_dir", default='/hdd/user4/gpt_classification/experiment/ag_news', type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--max_grad_norm", default=1)
    parser.add_argument('--weight_decay', type=float, default=0.0)

    ## Other parameters
    parser.add_argument("--cache_dir", default='/hdd/user4/gpt_classification/pretrained', type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", default=True, action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=9.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir', default=True, action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # n_gpu = torch.cuda.device_count()
        n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name](args.data_dir)
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    model = OpenAIGPTForClassification.from_pretrained(args.model_name,
                                                       num_special_tokens=len(special_tokens),
                                                       num_labels=num_labels)

    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss = 0

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        # Prepare data loader
        train_examples = processor.get_train_examples()
        cached_train_features_file = os.path.join(
            args.data_dir,
            'train_{0}_{1}_{2}'.format(list(filter(None, args.model_name.split('/'))).pop(),
                                       str(args.max_seq_length),
                                       str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except:
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for _ in range(int(args.num_train_epochs)):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, _, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model.forward(input_ids, input_mask)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)

        tb_writer.close()

    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTForClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples()
        cached_eval_features_file = os.path.join(
            args.data_dir,
            'dev_{0}_{1}_{2}'.format(list(filter(None, args.model_name.split('/'))).pop(),
                                     str(args.max_seq_length),
                                     str(task_name)))
        try:
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        except:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model.forward(input_ids, input_mask)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]

        if output_mode == "classification":
            output_odp = []
            for arr in preds:
                t = (-arr).argsort()[:5]
                output_odp.append(t.tolist())
            file_path = 'D:/바탕화면/(논문)multi-pretraining/NYT'
            with open('gpt_top5.pkl', 'wb') as f:
                pickle.dump(output_odp, f)
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)
        print('preds:', preds, 'label:', out_label_ids)

        loss = tr_loss / global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)
            loss = tr_loss / global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name')
    parser.add_argument('--task', type=str, default='intent', choices=['intent', 'slot'],
                        help="Intent or slot prediction")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.0)
    parser.add_argument('--probabilistic_masks', action='store_true')
    parser.add_argument('--attn_bias', action='store_true')
    parser.add_argument('--linearize', action='store_true')
    args = parser.parse_args()
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    label_list = list()
    for line in open(LABEL_FILES[args.task]):
        label_list.append(line.strip())

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsClsModel.from_pretrained(args.model_name,
                                                         num_labels=len(label_list),
                                                         num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        elif isinstance(obj, np.ndarray):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_atis_dataset(args.train_dataset, label_list, tokenizer,
                                      args.probabilistic_masks, args.linearize)
    eval_dataset = load_atis_dataset(args.eval_dataset, label_list, tokenizer,
                                     args.probabilistic_masks, args.linearize, plot=False)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions - 2
    input_length = max(len(utt[:max_length]) + 2
                       for dataset in encoded_datasets for utt, _, _, _, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length,
                                           *special_tokens_ids, len(label_list))
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        results = []
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids,
                               attn_bias=attn_bias if args.attn_bias else None)
                # loss = args.lm_coef * losses[0] + losses[1]
                loss = losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])

            model.eval()
            eval_loss = 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_labels = [], []
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
                with torch.no_grad():
                    _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids,
                                       attn_bias=attn_bias if args.attn_bias else None)
                    _, mc_logits = model(input_ids, mc_token_ids, position_ids=pos_ids,
                                         attn_bias=attn_bias if args.attn_bias else None)

                mc_logits = mc_logits.detach().cpu().numpy()
                mc_labels = mc_labels.to('cpu').numpy()

                eval_loss += mc_loss.mean().item()
                all_logits.append(mc_logits)
                all_labels.append(mc_labels)

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            all_logits = np.concatenate(all_logits, axis=0)
            all_labels = np.concatenate(all_labels, axis=0)
            eval_f1 = f1(all_logits, all_labels)
            eval_acc = accuracy(all_logits, all_labels) / nb_eval_examples
            train_loss = tr_loss / nb_tr_steps if args.do_train else None
            result = {'eval_loss': eval_loss,
                      'eval_f1': eval_f1,
                      'eval_accuracy': eval_acc,
                      'train_loss': train_loss}
            print(result)
            results.append(result)

        with open(os.path.join(args.output_dir, "log.csv"), "w") as csvfile:
            writer = csv.DictWriter(csvfile, ["train_loss", "eval_loss", "eval_accuracy", "eval_f1"])
            writer.writeheader()
            writer.writerows(results)

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsClsModel.from_pretrained(args.output_dir,
                                                             num_labels=len(label_list),
                                                             num_special_tokens=len(special_tokens))
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        all_logits, all_labels = [], []
        fw = open("prediction.txt", "w")
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids,
                                   attn_bias=attn_bias if args.attn_bias else None)
                _, mc_logits = model(input_ids, mc_token_ids, position_ids=pos_ids,
                                     attn_bias=attn_bias if args.attn_bias else None)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()

            for i, (o, l) in enumerate(zip((mc_logits >= 0.5).astype(np.int32), mc_labels.astype(np.int32))):
                # if np.any(o != l):
                #     pred = [label_list[idx] for idx, val in enumerate(o) if val == 1]
                #     true = [label_list[idx] for idx, val in enumerate(l) if val == 1]
                pred = o
                true = l
                fw.write(f"{eval_dataset[nb_eval_examples+i][0]}\n{pred}\n{true}\n\n")

            eval_loss += mc_loss.mean().item()
            all_logits.append(mc_logits)
            all_labels.append(mc_labels)

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        fw.close()
        eval_loss = eval_loss / nb_eval_steps
        all_logits = np.concatenate(all_logits, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)
        eval_f1 = f1(all_logits, all_labels)
        eval_acc = accuracy(all_logits, all_labels) / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_f1': eval_f1,
                  'eval_accuracy': eval_acc,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def train(seed, depth, maxlen, batch_size, accumulation_steps, model_name):
    config.seed = seed
    config.max_sequence_length = maxlen
    config.batch_size = batch_size
    config.accumulation_steps = accumulation_steps

    if depth != 24:
        config.bert_weight = f"../bert_weight/uncased_L-{depth}_H-768_A-12/"
    else:
        config.bert_weight = f"../bert_weight/uncased_L-{depth}_H-1024_A-16/"

    if model_name == 'bert':
        config.features = f"../bert_features_{maxlen}/"
    elif model_name == 'gpt2':
        config.features = f"../features_{maxlen}_gpt/"
    else:
        config.features = f"../features_{maxlen}_xlnet/"

    config.experiment = f"{depth}layers"
    config.checkpoint = f"{config.logdir}/{config.today}/{model_name}_{config.experiment}_" \
                        f"{config.batch_size}bs_{config.accumulation_steps}accum_{config.seed}seed_{config.max_sequence_length}/"
    print_config(config)

    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed(config.seed)
    torch.backends.cudnn.deterministic = True

    # Data loaders
    train_loader, valid_loader, valid_df, loss_weight = get_data_loaders(config)
    loaders = {"train": train_loader, "valid": valid_loader}

    # Criterion
    criterion = CustomLoss(loss_weight)

    # Model and optimizer
    if model_name == 'bert':
        print("BERT MODEL")
        model = BertForTokenClassificationMultiOutput2.from_pretrained(
            config.bert_weight,
            cache_dir=None,
            num_aux_labels=config.n_aux_targets)

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01
            },
            {
                'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }
        ]
        num_train_optimization_steps = np.ceil(
            len(train_loader.dataset) / config.batch_size / config.accumulation_steps) * config.epochs
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=config.lr,
                             warmup=0.01,
                             t_total=num_train_optimization_steps)
    elif model_name == 'gpt2':
        print("GPT2 MODEL")
        model = GPT2ClassificationMultioutput.from_pretrained(
            config.gpt2_weight,
            cache_dir=None,
            num_aux_labels=config.n_aux_targets)

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01
            },
            {
                'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }
        ]
        num_train_optimization_steps = np.ceil(
            len(train_loader.dataset) / config.batch_size / config.accumulation_steps) * config.epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=config.lr,
                               warmup=0.01,
                               t_total=num_train_optimization_steps)
    elif model_name == 'xlnet':
        model = XLNetWithMultiOutput.from_pretrained(
            config.xlnet_weight,
            clf_dropout=0.4,
            n_class=6
            # num_aux_labels=config.n_aux_targets
        )

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01
            },
            {
                'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }
        ]
        num_train_optimization_steps = np.ceil(
            len(train_loader.dataset) / config.batch_size / config.accumulation_steps) * config.epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=config.lr,
                               warmup=0.01,
                               t_total=num_train_optimization_steps)
    else:
        raise NotImplementedError("Model is not implemented")

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
    model = model.cuda()

    from apex import amp
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    # if distributed_rank > -1:
    #     from apex.parallel import DistributedDataParallel
    #     model = DistributedDataParallel(model)
    model = torch.nn.DataParallel(model)

    if config.resume:
        checkpoint = torch.load(config.checkpoint + "/checkpoints/best.pth")
        import pdb
        pdb.set_trace()
        new_state_dict = {}
        old_state_dict = checkpoint['model_state_dict']
        for k, v in old_state_dict.items():
            new_state_dict["module." + k] = v
        model.load_state_dict(new_state_dict)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        criterion.load_state_dict(checkpoint['criterion_state_dict'])
        print("!!! Loaded checkpoint ", config.checkpoint + "/checkpoints/best.pth")

    identity_valid = valid_df[config.identity_columns].copy()
    target_valid = valid_df.target.values
    auc_callback = AucCallback(identity=identity_valid, target=target_valid)
    checkpoint_callback = IterationCheckpointCallback(
        save_n_last=2000,
        num_iters=10000,
    )

    # model runner
    runner = ModelRunner()

    # model training
    runner.train(model=model,
                 criterion=criterion,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 main_metric='auc',
                 minimize_metric=False,
                 logdir=config.checkpoint,
                 num_epochs=config.epochs,
                 verbose=True,
                 fp16={"opt_level": "O1"},
                 callbacks=[auc_callback, checkpoint_callback])
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name') parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--answer_only", default=False, action='store_true', help="Whether to run with answers only (blank out question).") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--load_model_from", default=None, type=str, help= "The saved model file to load before doing any training or eval (if both --do_train and --do_eval are specified, the saved model will be loaded, then trained, then the trained model will be evaluated)." ) parser.add_argument( '--train_filename', type=str, default='train.csv', help="Filename to load train data from (relative to data_dir)") parser.add_argument( '--eval_filename', type=str, default='val.csv', help="File to load eval data from (relative to data_dir)") parser.add_argument( '--data_format', type=str, choices=['swag', 'codah'], default='swag', help= "Format of the train and eval files (original SWAG CSV format vs our TSV format)" ) parser.add_argument( '--model_labels_save_filename', type=str, default='model_labels.json', help= "JSON file to save model outputs/labels to (relative to output_dir)") parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=32) parser.add_argument('--eval_batch_size', type=int, default=8) parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument('--warmup_proportion', type=float, default=0.002) parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.5) parser.add_argument('--n_valid', type=int, default=374) parser.add_argument( '--gradient_accumulation_steps', type=int, default=8, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_eval and (not args.do_train) and args.load_model_from is None: args.load_model_from = os.path.join(args.output_dir, 'pytorch_model.bin') # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset special_tokens = ['_start_', '_delimiter_', '_classify_'] tokenizer = OpenAIGPTTokenizer.from_pretrained( args.model_name, special_tokens=special_tokens) special_tokens_ids = list( tokenizer.convert_tokens_to_ids(token) for token in special_tokens) model = OpenAIGPTDoubleHeadsModel.from_pretrained( args.model_name, num_special_tokens=len(special_tokens)) config = model.config if args.load_model_from: model_state_dict = torch.load(args.load_model_from) model = OpenAIGPTDoubleHeadsModel(config) model.load_state_dict(model_state_dict) model.to(device) # Load and encode the datasets logger.info("Loading datasets...") datasets = [] dataset_keys = dict() if args.do_train: train_dataset = read_swag_examples(os.path.join( args.data_dir, args.train_filename), is_training=True, answer_only=args.answer_only, data_format=args.data_format) train_dataset = [ EncodedSwagExample(ex, tokenizer) for ex in tqdm(train_dataset, desc='Encoding train') ] dataset_keys['train'] = len(datasets) datasets.append(train_dataset) if args.do_eval: eval_dataset = read_swag_examples(os.path.join(args.data_dir, args.eval_filename), is_training=True, answer_only=args.answer_only, data_format=args.data_format) eval_dataset = [ EncodedSwagExample(ex, tokenizer) for ex in tqdm(eval_dataset, desc='Encoding eval') ] dataset_keys['eval'] = len(datasets) datasets.append(eval_dataset) # Compute the max input length for the Transformer max_length = model.config.n_positions // 2 - 2 input_length = max(len(swagex.context_tokens[:max_length]) + len(swagex.start_ending_tokens[:max_length]) + max(len(ending[:max_length]) for ending in swagex.endings_tokens) + 3 \ for dataset in datasets for swagex in dataset) input_length = min(input_length, model.config.n_positions ) # Max size of input for the pre-trained model print('---') print('Input length: {}\n'.format(input_length)) print('---') # Prepare inputs tensors and dataloaders tensor_datasets = pre_process_datasets(datasets, 
input_length, max_length, *special_tokens_ids) if args.do_train: train_tensor_dataset = tensor_datasets[dataset_keys['train']] if args.do_eval: eval_tensor_dataset = tensor_datasets[dataset_keys['eval']] # Prepare optimizer if args.do_train: train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] #num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size num_train_optimization_steps = int( len(train_data) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay, t_total=num_train_optimization_steps) nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch losses = model(input_ids, mc_token_ids, lm_labels, mc_labels) loss = args.lm_coef * losses[0] + losses[1] if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_steps += 1 exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format( exp_average_loss, optimizer.get_lr()[0]) if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() # Save a trained model output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self torch.save(model_to_save.state_dict(), output_model_file) if args.do_eval: eval_data = TensorDataset(*eval_tensor_dataset) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Load a trained model that you have fine-tuned if args.do_train: model_state_dict = torch.load(output_model_file) model = OpenAIGPTDoubleHeadsModel(config) model.load_state_dict(model_state_dict) model.to(device) model.eval() all_model_outputs = [] data_index = 0 eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch with torch.no_grad(): _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels) _, mc_logits = model(input_ids, mc_token_ids) mc_logits = mc_logits.detach().cpu().numpy() mc_labels = mc_labels.to('cpu').numpy() tmp_eval_accuracy = accuracy(mc_logits, mc_labels) eval_loss += mc_loss.mean().item() eval_accuracy += tmp_eval_accuracy for i in range(input_ids.size(0)): output_obj = dict() output_obj['logits'] = [float(x) for x in mc_logits[i]] 
output_obj['true_label'] = int(mc_labels[i]) output_obj['model_label'] = int(np.argmax(mc_logits[i])) output_obj['swag_data'] = datasets[ dataset_keys['eval']][data_index].raw_example.to_dict() all_model_outputs.append(output_obj) data_index += 1 nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples train_loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'train_loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) with open( os.path.join(args.output_dir, args.model_labels_save_filename), 'w') as f: json.dump(all_model_outputs, f)
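# `accuracy(mc_logits, mc_labels)` is called in the eval loop above on detached
# numpy arrays but is not defined in this snippet; a minimal version consistent
# with that call site (an assumption, not the original helper) would be:

import numpy as np

def accuracy(logits: np.ndarray, labels: np.ndarray) -> int:
    """Count correct multiple-choice predictions in a batch.

    logits: (batch, num_choices) scores; labels: (batch,) gold indices.
    Returns a count so it can be summed over batches and divided by
    nb_eval_examples, as the loop above does.
    """
    predictions = np.argmax(logits, axis=1)
    return int((predictions == labels).sum())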
class ModelClassifier(object): def __init__(self, config, which_to_train, model_A, model_B, tokenizer, device1, device2): # config.num_labels = le.classes_.shape[0] # label encode # super().__init__() self.config = config self.le_A = load_pkl("training/data/labelencoder_A.pkl") self.le_B = load_pkl("training/data/labelencoder_B.pkl") self.clf_A = SequenceSummary(num_labels=self.le_A.classes_.shape[0], config=config) self.clf_B = SequenceSummary(num_labels=self.le_B.classes_.shape[0], config=config) self.clf_TF = SequenceSummary(num_labels=2, config=config) # self.apply(self.init_weight) self.past = None self.history = [] # model self.model_A = model_A self.model_B = model_B self.tokenizer = tokenizer self.cls_token_id = tokenizer.cls_token_id self.device1 = device1 self.device2 = device2 self.to_device(self.device1) # define loss self.criterion = nn.CrossEntropyLoss() # optimizer parameters self.num_gradients_accumulation = 1 self.batch_size = 1 self.batch_size_TF = 8 # load training data self.load_data() def reload(self): self.past = None self.history = [] def to_device(self, device): # to device self.clf_A = self.clf_A.to(device) self.clf_B = self.clf_B.to(device) self.clf_TF = self.clf_TF.to(device) self.clf_A.device = device self.clf_B.device = device self.clf_TF.device = device # self.model_A = self.model_A.to(self.device) # self.model_B = self.model_B.to(self.device) def load_data(self): # load training data self.train_data = load_pkl("training/data/train_data.pkl") self.val_data = load_pkl("training/data/val_data.pkl") self.train_data_TF, self.val_data_TF = torch.load("demonstration/old_model/demonstration_train_with_text_only.pkl", map_location="cpu"), \ torch.load("demonstration/old_model/demonstration_val_with_text_only.pkl", map_location="cpu") self.train_dataset = PersuadeDataset(self.train_data, self.tokenizer) self.val_dataset = PersuadeDataset(self.val_data, self.tokenizer) self.train_dataset_TF, self.val_dataset_TF = TFDataset(self.train_data_TF, self.tokenizer), \ TFDataset(self.val_data_TF, self.tokenizer) self.train_dataloader = DataLoader(dataset=self.train_dataset, shuffle=True, batch_size=self.batch_size, collate_fn=self.train_dataset.collate) self.val_dataloader = DataLoader(dataset=self.val_dataset, shuffle=False, batch_size=self.batch_size, collate_fn=self.train_dataset.collate) self.train_dataloader_TF = DataLoader(dataset=self.train_dataset_TF, shuffle=True, batch_size=self.batch_size_TF, collate_fn=self.train_dataset_TF.collate) self.val_dataloader_TF = DataLoader(dataset=self.val_dataset_TF, shuffle=False, batch_size=self.batch_size_TF, collate_fn=self.val_dataset_TF.collate) def load_model(self, all_model_dir=None, clf_A_dir=None, clf_B_dir=None, clf_TF_dir=None): if all_model_dir is None: if clf_A_dir: clf_A_state = torch.load(clf_A_dir) self.clf_A.load_state_dict(clf_A_state) print(f"clf_A loaded") if clf_B_dir: clf_B_state = torch.load(clf_B_dir) self.clf_B.load_state_dict(clf_B_state) print(f"clf_B loaded") if clf_TF_dir: clf_TF_state = torch.load(clf_TF_dir) self.clf_TF.load_state_dict(clf_TF_state) print(f"clf_TF loaded") else: model_A_state, model_B_state, clf_A_state, clf_B_state, clf_TF_state = torch.load(all_model_dir) self.model_A.load_state_dict(model_A_state) self.model_B.load_state_dict(model_B_state) self.clf_A.load_state_dict(clf_A_state) self.clf_B.load_state_dict(clf_B_state) self.clf_TF.load_state_dict(clf_TF_state) print(f"all models loaded") def train(self, which_to_train, num_epochs=10): # optimizer param_optimizer = 
list(self.model_A.named_parameters()) + \ list(self.model_B.named_parameters()) if "A" in which_to_train: print("clf_A to optimize") param_optimizer += list(self.clf_A.named_parameters()) if "B" in which_to_train: print("clf_B to optimize") param_optimizer += list(self.clf_B.named_parameters()) if "TF" in which_to_train: print("clf_TF to optimize") param_optimizer += list(self.clf_TF.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] num_train_optimization_steps = len(self.train_dataset) * num_epochs // self.batch_size // self.num_gradients_accumulation self.optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=2e-5, warmup=0.1, max_grad_norm=1.0, weight_decay=0.01, t_total=num_train_optimization_steps) update_count = 0 progress_bar = tqdm.tqdm start = time.time() best_acc_A = -float('Inf') best_f1_A = -float('Inf') best_acc_B = -float('Inf') best_f1_B = -float('Inf') best_acc_TF = -float('Inf') best_f1_TF = -float('Inf') for ep in tqdm.tqdm(range(num_epochs)): # set train mode self.model_A.train() self.model_B.train() self.clf_A.train() self.clf_B.train() self.clf_TF.train() "Training" pbar = progress_bar(self.train_dataloader) train_dataloader_TF_list = list(self.train_dataloader_TF) for i, batch in enumerate(pbar): batch = batch[0] batch_TF = train_dataloader_TF_list[i%len(train_dataloader_TF_list)] # without relative position # if sum([len(item) for item in batch[1]]) > 1024: # input("1024 here!") # continue record_loss = self.train_one_iter(batch, batch_TF, update_count, which_to_train, fp16=False) update_count += 1 if update_count % self.num_gradients_accumulation == self.num_gradients_accumulation - 1: # update for gradient accumulation self.optimizer.step() self.optimizer.zero_grad() # speed measure end = time.time() speed = self.batch_size * self.num_gradients_accumulation / (end - start) start = end # show progress pbar.set_postfix(loss=record_loss, speed=speed) "Evaluation" self.model_A.eval() self.model_B.eval() self.clf_A.eval() self.clf_B.eval() self.clf_TF.eval() (val_acc_A, val_f1_A), (val_acc_B, val_f1_B), (val_acc_TF, val_f1_TF) = self.validate(self.val_dataloader, self.val_dataloader_TF, ep, which_to_train) print(f"A: val f1: {val_f1_A}, valid acc: {val_acc_A}") print(f"B: val f1: {val_f1_B}, valid acc: {val_acc_B}") print(f"TF: val f1: {val_f1_TF}, valid acc: {val_acc_TF}") is_best_so_far_TF = val_f1_TF > best_f1_TF is_best_so_far_A = val_f1_A > best_f1_A is_best_so_far_B = val_f1_TF > best_f1_B if is_best_so_far_TF: best_acc_TF = val_acc_TF best_f1_TF = val_f1_TF if is_best_so_far_A: best_acc_A = val_acc_A best_f1_A = val_f1_A if is_best_so_far_B: best_acc_B = val_acc_B best_f1_B = val_f1_B SAVED = False if is_best_so_far_TF and not SAVED: SAVED = True torch.save((self.model_A.state_dict(), self.model_B.state_dict(), self.clf_A.state_dict(), self.clf_B.state_dict(), self.clf_TF.state_dict()), f"Checkpoint_act_clf/epoch{ep}_multitask_TF_best_acc_{val_acc_TF}_f1_{val_f1_TF}_A_acc_{val_acc_A}_f1_{val_f1_A}_B_acc_{val_acc_B}_f1_{val_f1_B}.pth") if is_best_so_far_A and not SAVED: SAVED = True torch.save((self.model_A.state_dict(), self.model_B.state_dict(), self.clf_A.state_dict(), self.clf_B.state_dict(), self.clf_TF.state_dict()), 
f"Checkpoint_act_clf/epoch{ep}_multitask_TF_best_acc_{val_acc_TF}_f1_{val_f1_TF}_A_acc_{val_acc_A}_f1_{val_f1_A}_B_acc_{val_acc_B}_f1_{val_f1_B}.pth") if is_best_so_far_B and not SAVED: SAVED = True torch.save((self.model_A.state_dict(), self.model_B.state_dict(), self.clf_A.state_dict(), self.clf_B.state_dict(), self.clf_TF.state_dict()), f"Checkpoint_act_clf/epoch{ep}_multitask_TF_best_acc_{val_acc_TF}_f1_{val_f1_TF}_A_acc_{val_acc_A}_f1_{val_f1_A}_B_acc_{val_acc_B}_f1_{val_f1_B}.pth") # if which_to_train == "A": # torch.save(model_A.state_dict(), f"Checkpoint_act_clf/A/best_acc_{best_acc}_f1_{best_f1}.pth") # elif which_to_train == "B": # torch.save(model_A.state_dict(), f"Checkpoint_act_clf/B/best_acc_{best_acc}_f1_{best_f1}.pth") # checkpointer.save_checkpoint(ep, model_A.state_dict(), {"None": None}, is_best_so_far) print("finally") print("A: \nbest acc: {}, best f1: {}".format(best_acc_A, best_f1_A)) print("B: \nbest acc: {}, best f1: {}".format(best_acc_B, best_f1_B)) print("TF: \nbest acc: {}, best f1: {}".format(best_acc_TF, best_f1_TF)) def validate(self, dataloader, dataloader_TF, ep, which_to_train): from sklearn.metrics import f1_score from sklearn.metrics import confusion_matrix from utils import print_cm # evaluation mode self.model_A.eval() self.model_B.eval() self.clf_A.eval() self.clf_B.eval() self.clf_TF.eval() def get_numbers_for_one_task(sents, logits, acts, x, y_true, y_pred, total, correct): _, predicted_acts = torch.max(logits, 1) x.extend(sents) y_true.extend(acts.tolist()[0]) y_pred.extend(predicted_acts.tolist()) total += len(acts.tolist()[0]) correct += (predicted_acts == acts).sum().item() return x, y_true, y_pred, total, correct progress_bar = tqdm.tqdm with torch.no_grad(): pbar = progress_bar(dataloader) dataloader_TF_list = list(dataloader_TF) correct = 0 total = 0 x_A, y_true_A, y_pred_A, correct_A, total_A = [], [], [], 0, 0 x_B, y_true_B, y_pred_B, correct_B, total_B = [], [], [], 0, 0 x_TF, y_true_TF, y_pred_TF, correct_TF, total_TF = [], [], [], 0, 0 for i, batch in enumerate(pbar): batch = batch[0] batch_TF = dataloader_TF_list[i%len(dataloader_TF_list)] # if sum([len(item) for item in batch[1]]) > 1024: # continue sents_A, logits_A, acts_A,\ sents_B, logits_B, acts_B,\ sents_TF, logits_TF, acts_TF = self.train_one_iter(batch, batch_TF, None, which_to_train, fp16=False, is_validation=True) x_A, y_true_A, y_pred_A, total_A, correct_A = get_numbers_for_one_task(sents_A, logits_A, acts_A,\ x_A, y_true_A, y_pred_A, total_A, correct_A) x_B, y_true_B, y_pred_B, total_B, correct_B = get_numbers_for_one_task(sents_B, logits_B, acts_B,\ x_B, y_true_B, y_pred_B, total_B, correct_B) x_TF, y_true_TF, y_pred_TF, total_TF, correct_TF = get_numbers_for_one_task(sents_TF, logits_TF, acts_TF,\ x_TF, y_true_TF, y_pred_TF, total_TF, correct_TF) f1_A = f1_score(y_true_A, y_pred_A, average="weighted") f1_B = f1_score(y_true_B, y_pred_B, average="weighted") f1_TF = f1_score(y_true_TF, y_pred_TF, average="binary") # pdb.set_trace() pd.DataFrame(zip(x_A, self.le_A.inverse_transform(y_true_A).tolist(), self.le_A.inverse_transform(y_pred_A).tolist()), columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/A/act_classifier_val_results_epoch{ep}.csv", index=None) print(f"A: Epoch {ep} Validation accuracy: {correct_A/total_A}, f1: {f1_A}") pd.DataFrame(zip(x_B, self.le_B.inverse_transform(y_true_B).tolist(), self.le_B.inverse_transform(y_pred_B).tolist()), columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/B/act_classifier_val_results_epoch{ep}.csv", 
index=None) print(f"B: Epoch {ep} Validation accuracy: {correct_B/total_B}, f1: {f1_B}") pd.DataFrame(zip(x_TF, y_true_TF, y_pred_TF), columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/TF/act_classifier_val_results_epoch{ep}.csv", index=None) print(f"TF: Epoch {ep} Validation accuracy: {correct_TF/total_TF}, f1: {f1_TF}") # print_cm(confusion_matrix(y_true, y_pred, labels=range(len(le.classes_))), labels=[l[:] for l in le.classes_.tolist()]) return (correct_A/total_A, f1_A), (correct_B/total_B, f1_B), (correct_TF/total_TF, f1_TF) def set_past(self, sent, which_task): "sent: str, a whole sent" # assert sent.startswith("A:") or sent.startswith("B:") if sent.startswith("A:") or sent.startswith("B:"): pdb.set_trace() sent = sent[2:] if which_task == "A": lm_model = self.model_A prefix = "A:" device = lm_model.device elif which_task == "B": lm_model = self.model_B prefix = "B:" device = lm_model.device elif which_task == "TF": lm_model = self.model_A prefix = "A:" # candidate_sent = prefix+" ".join(separate_sents) device = lm_model.device # encode sent self.history.append(prefix+sent) sent = self.tokenizer.encode(prefix) + self.tokenizer.encode(sent) + self.train_dataset.turn_ending sent = torch.LongTensor(sent).unsqueeze(0).to(device) past = self.move_to_device(self.past, lm_model) _, past, _ = lm_model(sent, past) self.past = past def predict(self, separate_sents, which_task): "separate_sents: list of sentences with no prefix" past = self.past if which_task == "A": lm_model = self.model_A clf_head = self.clf_A le = self.le_A prefix = "A:" device = lm_model.device elif which_task == "B": lm_model = self.model_B clf_head = self.clf_B le = self.le_B prefix = "B:" device = lm_model.device elif which_task == "TF": lm_model = self.model_A clf_head = self.clf_TF prefix = "A:" candidate_sent = " ".join(separate_sents) device = lm_model.device # evaluation mode self.model_A.eval() self.model_B.eval() self.clf_A.eval() self.clf_B.eval() self.clf_TF.eval() with torch.no_grad(): if which_task in ["A", "B"]: all_logits = [] for i, sent in enumerate(separate_sents): if i == 0: sent = self.tokenizer.encode(prefix) + self.tokenizer.encode(sent) else: sent = self.tokenizer.encode(" "+sent) # pdb.set_trace() sent = torch.LongTensor(sent).unsqueeze(0).to(device) past = self.move_to_device(past, lm_model) logits, past, hidden_states = lm_model(sent, past) # encode [CLS] cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(device) _, _, hidden_states = lm_model(cls_token_tensor, past) hidden_states = self.move_to_device(hidden_states, clf_head) mc_logits = clf_head(hidden_states[-1], cls_index=None).squeeze(-1) all_logits.append(mc_logits) # finish tail end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(device) past = self.move_to_device(past, lm_model) _, past, _ = lm_model(end_input, past) # get labels all_logits = torch.cat(all_logits, dim=0) # pdb.set_trace() _, predicted_acts = torch.max(all_logits, 1) predicted_acts = predicted_acts.tolist() predicted_acts = le.inverse_transform(predicted_acts).tolist() return predicted_acts, past elif which_task == "TF": # encode candidate candidate = self.tokenizer.encode(prefix) + self.tokenizer.encode(candidate_sent) # pdb.set_trace() candidate = torch.LongTensor(candidate).unsqueeze(0).to(device) past = self.move_to_device(past, self.model_A) logits, past, hidden_states = self.model_A(candidate, past) # encode [CLS] cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(device) _, _, 
hidden_states = self.model_A(cls_token_tensor, past) hidden_states = self.move_to_device(hidden_states, self.clf_TF) mc_logits = self.clf_TF(hidden_states[-1], cls_index=None).squeeze(-1) # pdb.set_trace() _, predicted_acts = torch.max(mc_logits, 1) predicted_acts = predicted_acts.tolist() assert len(predicted_acts) == 1 return predicted_acts[0], past def train_one_iter(self, batch, batch_TF, update_count, which_to_train, fp16=False, is_validation=False): # role_ids, whole_sents, separate_sents, acts = batch past = None all_sents_A, all_logits_A, all_acts_A = [], [], [] all_sents_B, all_logits_B, all_acts_B = [], [], [] for i, (role_id, whole_sent, separate_sents, acts) in enumerate(zip(*batch)): if role_id == 0: whole_sent = torch.LongTensor(whole_sent).unsqueeze(0).to(self.device1) try: assert self.tokenizer.decode(whole_sent[0][:2].tolist()) == "A:" except: pdb.set_trace() if "A" in which_to_train: past = self.move_to_device(past, self.model_A) _, real_past, _ = self.model_A(whole_sent, past) for act, sent in zip(acts, separate_sents): all_sents_A.append(self.tokenizer.decode(sent)) # pdb.set_trace() # 'A:HI I would like to tell you About a childrens charity called Save the CHildren.' sent = torch.LongTensor(sent).unsqueeze(0).to(self.device1) past = self.move_to_device(past, self.model_A) logits, past, hidden_states = self.model_A(sent, past) # pdb.set_trace() # encode [CLS] cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device1) past = self.move_to_device(past, self.model_A) _, _, hidden_states = self.model_A(cls_token_tensor, past) mc_logits = self.clf_A(hidden_states[-1], cls_index=None).squeeze(-1) all_logits_A.append(mc_logits) all_acts_A.append(act) # pdb.set_trace() past = real_past # # finish tail # end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(self.device1) # _, past, _ = self.model_A(end_input, past) else: past = self.move_to_device(past, self.model_A) _, past, hidden_states = self.model_A(whole_sent, past) else: whole_sent = torch.LongTensor(whole_sent).unsqueeze(0).to(self.device2) try: assert self.tokenizer.decode(whole_sent[0][:2].tolist()) == "B:" except: pdb.set_trace() if "B" in which_to_train: past = self.move_to_device(past, self.model_B) _, real_past, _ = self.model_B(whole_sent, past) for act, sent in zip(acts, separate_sents): all_sents_B.append(self.tokenizer.decode(sent)) # pdb.set_trace() #'B:ok please do' sent = torch.LongTensor(sent).unsqueeze(0).to(self.device2) past = self.move_to_device(past, self.model_B) logits, past, hidden_states = self.model_B(sent, past) # encode [CLS] cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device2) _, _, hidden_states = self.model_B(cls_token_tensor, past) hidden_states = self.move_to_device(hidden_states, self.clf_B) mc_logits = self.clf_B(hidden_states[-1], cls_index=None).squeeze(-1) all_logits_B.append(mc_logits) all_acts_B.append(act) past = real_past # finish tail # end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(self.device2) # past = self.move_to_device(past, self.model_B) # _, past, _ = self.model_B(end_input, past) else: past = self.move_to_device(past, self.model_B) _, past, hidden_states = self.model_B(whole_sent, past) all_logits_A = torch.cat(all_logits_A, dim=0) all_acts_A = torch.tensor(all_acts_A).unsqueeze(0).to(self.device1) # pdb.set_trace() loss_A = self.criterion(all_logits_A.view(-1, all_logits_A.size(-1)), all_acts_A.view(-1)) all_logits_B = torch.cat(all_logits_B, dim=0) 
all_acts_B = torch.tensor(all_acts_B).unsqueeze(0).to(self.device1) loss_B = self.criterion(all_logits_B.view(-1, all_logits_B.size(-1)), all_acts_B.view(-1)) # TF task all_contexts_candidate_TF = [] all_logits_TF = [] all_acts_TF = [] for one_dial in batch_TF: past = None contexts, candidate, pick_or_not = one_dial all_contexts_candidate_TF.append((" ".join([self.tokenizer.decode(c) for c in contexts]), self.tokenizer.decode(candidate))) # get past for i, context in enumerate(contexts): if i%2 == 0: # pdb.set_trace() #'A:Would you like to know more about the charity Save the Children?\n\n\n' context = torch.LongTensor(context).unsqueeze(0).to(self.device1) past = self.move_to_device(past, self.model_A) logits, past, hidden_states = self.model_A(context, past) else: # pdb.set_trace() #'B:hello I am great.\n\n\n' context = torch.LongTensor(context).unsqueeze(0).to(self.device2) past = self.move_to_device(past, self.model_B) logits, past, hidden_states = self.model_B(context, past) # encode candidate # pdb.set_trace() # "A:Save the Children is an international non-governmental organization that promotes children's rights, provides relief and helps support children in developing countries." candidate = torch.LongTensor(candidate).unsqueeze(0).to(self.device1) past = self.move_to_device(past, self.model_A) logits, past, hidden_states = self.model_A(candidate, past) # encode [CLS] cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device1) _, _, hidden_states = self.model_A(cls_token_tensor, past) mc_logits = self.clf_TF(hidden_states[-1], cls_index=None).squeeze(-1) all_logits_TF.append(mc_logits) all_acts_TF.append(pick_or_not) all_logits_TF = torch.cat(all_logits_TF, dim=0) all_acts_TF = torch.tensor(all_acts_TF).unsqueeze(0).to(self.device1) loss_TF = self.criterion(all_logits_TF.view(-1, all_logits_TF.size(-1)), all_acts_TF.view(-1)) if is_validation: return all_sents_A, all_logits_A, all_acts_A,\ all_sents_B, all_logits_B, all_acts_B,\ all_contexts_candidate_TF, all_logits_TF, all_acts_TF loss = loss_A.to(self.device1) + loss_B.to(self.device1) + loss_TF.to(self.device1) loss /= self.num_gradients_accumulation if fp16: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() record_loss = loss.item() * self.num_gradients_accumulation return record_loss#, perplexity def move_to_device(self, past, target): if past is not None and target.device != past[0].device: past = [p.to(target.device) for p in past] return past
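# A compact sketch of the per-head "best so far" bookkeeping used in train()
# above, with each head keyed on its own validation F1 and the six separate
# best_* variables folded into a dictionary (an illustrative refactor, not the
# original code):

def update_best(best: dict, head: str, acc: float, f1: float) -> bool:
    """Update best[head] in place and report whether this epoch improved it."""
    improved = f1 > best[head]["f1"]
    if improved:
        best[head] = {"acc": acc, "f1": f1}
    return improved

best = {h: {"acc": float("-inf"), "f1": float("-inf")} for h in ("A", "B", "TF")}
if update_best(best, "TF", acc=0.91, f1=0.88):  # hypothetical epoch metrics
    pass  # save the checkpoint once here, as the SAVED flag does above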
def train(): config_file = "configs/train_full_pipeline_config.json" config = Config.from_json_file(config_file) # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", config.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(config)) # Initialize distributed training if needed config.distributed = (config.local_rank != -1) if config.distributed: torch.cuda.set_device(config.local_rank) config.device = torch.device("cuda", config.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info( "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning" ) tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadLMEmotionRecognitionModel model = model_class.from_pretrained(config.model_checkpoint) tokenizer.set_special_tokens(SPECIAL_TOKENS) model.set_num_special_tokens(len(SPECIAL_TOKENS)) model.to(config.device) optimizer = OpenAIAdam(model.parameters(), lr=config.lr) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if config.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) if config.distributed: model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders( config, tokenizer) # Evaluation function and evaluator (evaluator output is the input of the metrics) model.eval() num_correct = 0 num_all = len(val_loader) for batch in val_loader: with torch.no_grad(): batch = tuple( input_tensor.to(config.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = batch model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids, token_emotion_ids=token_emotion_ids) lm_logits, mc_logits = model_outputs[0], model_outputs[ 1] # So we can also use GPT2 outputs indices = torch.argmax(mc_logits, dim=1) correct = torch.eq(indices, mc_labels).view(-1) num_correct += torch.sum(correct).item() print(num_correct / num_all)
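# In the evaluation loop above, num_all is len(val_loader), i.e. the number of
# batches; if the validation batch size is greater than one, the usual
# convention is to divide by the number of examples instead. A small helper
# for that accounting (a sketch, not the original code):

import torch

def batch_correct(mc_logits: torch.Tensor, mc_labels: torch.Tensor):
    """Return (num_correct, num_examples) for one batch of choice logits."""
    predictions = torch.argmax(mc_logits, dim=1)
    return int(torch.eq(predictions, mc_labels).sum().item()), mc_labels.size(0)

# accumulate `correct += c; seen += n` per batch and report correct / seen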
def train( *, model, criterion, x_train, y_train, epochs, yield_steps, bucket, lr, batch_size: int, accumulation_steps: int, pad_idx: int, ): train_dataset = TensorDataset(torch.tensor(x_train, dtype=torch.long), torch.tensor(y_train, dtype=torch.float)) model.zero_grad() model = model.to(device) param_optimizer = list(model.named_parameters()) num_train_optimization_steps = int(epochs * len(train_dataset) / (batch_size * accumulation_steps)) if isinstance(model, BertForSequenceClassification): no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ { 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }, ] optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=0.05, t_total=num_train_optimization_steps) elif isinstance(model, GPT2ClassificationHeadModel): optimizer = OpenAIAdam([p for _, p in param_optimizer], lr=lr, warmup=0.1, t_total=num_train_optimization_steps) else: raise ValueError model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) model.train() if bucket: sampler = RandomSampler(train_dataset) batch_sampler = BucketBatchSampler(sampler, batch_size, drop_last=False, pad_idx=pad_idx) train_loader = DataLoader(train_dataset, batch_sampler=batch_sampler) else: train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) smoothed_loss = None step = 0 epoch_pbar = tqdm.trange(epochs) def _state(): return model, optimizer, epoch_pbar, smoothed_loss, step * batch_size print(f'Starting training for ' f'{num_train_optimization_steps * accumulation_steps:,} steps, ' f'checkpoint interval {yield_steps:,}') yield _state() torch.cuda.empty_cache() for _ in epoch_pbar: optimizer.zero_grad() pbar = tqdm.tqdm(train_loader, leave=False) for x_batch, y_batch in pbar: step += 1 if bucket: x_batch, y_batch = trim_tensors([x_batch, y_batch], pad_idx) x_batch = x_batch.to(device) y_batch = y_batch.to(device) try: y_pred = model(x_batch, attention_mask=x_batch > 0, labels=None) loss = criterion(y_pred, y_batch) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if step % accumulation_steps == 0: optimizer.step() optimizer.zero_grad() except RuntimeError as e: if 'CUDA out of memory' in str(e): print('ignoring', e) torch.cuda.empty_cache() continue raise if smoothed_loss is not None: smoothed_loss = 0.98 * smoothed_loss + 0.02 * loss.item() else: smoothed_loss = loss.item() pbar.set_postfix(loss=f'{smoothed_loss:.4f}') if step % yield_steps == 0: yield _state() yield _state() torch.cuda.empty_cache()
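# trim_tensors() is called in the bucketed branch above but not defined in this
# snippet; one plausible minimal version consistent with that call, assuming
# right-padded token ids in the first tensor (an assumption -- the real helper
# may differ):

import torch

def trim_tensors(tensors, pad_idx):
    """Trim the token-id tensor (first element) to its longest non-pad row."""
    tokens = tensors[0]
    max_len = max(1, int((tokens != pad_idx).sum(dim=1).max().item()))
    return [tokens[:, :max_len]] + list(tensors[1:])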
def train(): config_file = "configs/train_daily_dialog_emotion_detection_config.json" config = Config.from_json_file(config_file) # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) logger.warning("Running process %d", config.local_rank) logger.info("Arguments: %s", pformat(config)) # Initialize distributed training if needed config.distributed = (config.local_rank != -1) if config.distributed: torch.cuda.set_device(config.local_rank) config.device = torch.device("cuda", config.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info( "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning" ) tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) model_class = OpenAIGPTForEmotionDetection model = model_class.from_pretrained(config.model_checkpoint) tokenizer.set_special_tokens(SPECIAL_TOKENS) model.set_num_special_tokens(len(SPECIAL_TOKENS)) model.to(config.device) optimizer = OpenAIAdam(model.parameters(), lr=config.lr) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if config.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) if config.distributed: model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders( config, tokenizer) model.eval() n_emotions = 0 num_correct = 0 positives = 0 all_true_positives = 0 num_all = len(val_loader) for batch in val_loader: with torch.no_grad(): batch = tuple( input_tensor.to(config.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids) lm_logits, mc_logits = model_outputs[0], model_outputs[ 1] # So we can also use GPT2 outputs indices = torch.argmax(mc_logits, dim=1) correct = torch.eq(indices, mc_labels).view(-1) num_correct += torch.sum(correct).item() num_classes = mc_logits.size(1) mc_labels = to_onehot(mc_labels.view(-1), num_classes=num_classes) indices = torch.argmax(mc_logits, dim=1).view(-1) mc_logits = to_onehot(indices, num_classes=num_classes) mc_labels = mc_labels.type_as(mc_logits) correct = mc_labels * mc_logits all_positives = mc_logits.sum(dim=0).type( torch.DoubleTensor) # Convert from int cuda/cpu to double cpu if correct.sum() == 0: true_positives = torch.zeros_like(all_positives) else: true_positives = correct.sum(dim=0) true_positives = true_positives.type(torch.DoubleTensor) positives += all_positives all_true_positives += true_positives print(num_correct / num_all) print(all_true_positives / positives) print(n_emotions)
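# The loop above accumulates one-hot true positives and predicted positives per
# class, so the final all_true_positives / positives ratio is a per-class
# precision. A self-contained sketch of the same quantity from integer
# prediction and label tensors (illustrative, using torch's one_hot rather than
# the to_onehot used above):

import torch
import torch.nn.functional as F

def per_class_precision(preds: torch.Tensor, labels: torch.Tensor,
                        num_classes: int) -> torch.Tensor:
    """Precision per class: true positives / predicted positives."""
    pred_onehot = F.one_hot(preds, num_classes).double()
    true_onehot = F.one_hot(labels, num_classes).double()
    true_positives = (pred_onehot * true_onehot).sum(dim=0)
    predicted_positives = pred_onehot.sum(dim=0).clamp(min=1)  # avoid 0-division
    return true_positives / predicted_positives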
def train_model(epochs=10, num_gradients_accumulation=4, batch_size=8, gpu_id=0, lr=1e-4, load_dir='decoder_model', decoder_model='original_pretrained_model_for_bertGPT.pth'): # make sure your model is on GPU device = torch.device(f"cuda:{gpu_id}") #------------------------LOAD MODEL----------------- print('load the model....') model = BertGPT() model.load_state_dict(torch.load(decoder_model)) # model = nn.DataParallel(model, device_ids = [0]) model = model.to(device) print('load success') #------------------------END LOAD MODEL-------------- #------------------------LOAD TRAIN DATA------------------ train_data = torch.load("train_data.pth") train_dataset = MyDataset(*train_data) train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size, num_workers=2, collate_fn=collate_fn) val_data = torch.load("validate_data.pth") val_dataset = MyDataset(*val_data) val_dataloader = DataLoader(dataset=val_dataset, shuffle=True, batch_size=batch_size, num_workers=2, collate_fn=collate_fn) #------------------------END LOAD TRAIN DATA-------------- #------------------------SET OPTIMIZER------------------- num_train_optimization_steps = len( train_dataset) * epochs // batch_size // num_gradients_accumulation param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and p.requires_grad ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) and p.requires_grad ], 'weight_decay': 0.0 }] print('train') print(len(optimizer_grouped_parameters[0]['params'])) optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=lr, warmup=0.01, max_grad_norm=1.0, weight_decay=0.01, t_total=num_train_optimization_steps) #------------------------END SET OPTIMIZER-------------- #------------------------START TRAINING------------------- update_count = 0 start = time.time() print('start training....') for epoch in range(epochs): #------------------------training------------------------ model.train() losses = 0 times = 0 for batch in tqdm(train_dataloader, desc='dirs'): batch = [item.to(device) for item in batch] encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch logits = model(encoder_input, mask_encoder_input, decoder_input, mask_decoder_input) out = logits[:, :-1].contiguous() target = decoder_input[:, 1:].contiguous() target_mask = mask_decoder_input[:, 1:].contiguous() loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token") loss.backward() losses += loss.item() times += 1 update_count += 1 if update_count % num_gradients_accumulation == num_gradients_accumulation - 1: optimizer.step() optimizer.zero_grad() end = time.time() print('-' * 20 + f'epoch {epoch}' + '-' * 20) print(f'time: {(end - start)}') print(f'loss: {losses / times}') start = end #------------------------validate------------------------ model.eval() perplexity = 0 batch_count = 0 print('start calculate the perplexity....') with torch.no_grad(): for batch in tqdm(val_dataloader): batch = [item.to(device) for item in batch] encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch logits = model(encoder_input, mask_encoder_input, decoder_input, mask_decoder_input) out = logits[:, :-1].contiguous() target = decoder_input[:, 1:].contiguous() target_mask = mask_decoder_input[:, 1:].contiguous() loss = util.sequence_cross_entropy_with_logits(out, 
target, target_mask, average="token") perplexity += np.exp(loss.item()) batch_count += 1 print(f'validate perplexity: {perplexity / batch_count}') direct_path = os.path.join(os.path.abspath('.'), load_dir) if not os.path.exists(direct_path): os.mkdir(direct_path) torch.save(model.state_dict(), os.path.join(direct_path, str(epoch) + "model.pth"))
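# The validation loop above averages exp(loss) over batches; corpus-level
# perplexity is more commonly the exponential of the token-weighted mean loss.
# A small sketch of that alternative accounting (an aside, not the original
# code):

import math

def corpus_perplexity(batch_losses, batch_token_counts):
    """exp of the average per-token cross-entropy over the whole corpus."""
    total_tokens = sum(batch_token_counts)
    total_nll = sum(loss * n for loss, n in zip(batch_losses, batch_token_counts))
    return math.exp(total_nll / total_tokens)

# e.g. mean token losses 2.0 and 3.0 over 100 and 50 tokens -> exp(7/3) ~ 10.31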
def main(): # Parse the arguments parser = argparse.ArgumentParser() parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name') parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--train_dataset', type=str, default='') parser.add_argument('--eval_dataset', type=str, default='') parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=1) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--eval_batch_size', type=int, default=16) parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument('--warmup_proportion', type=float, default=0.002) parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--n_valid', type=int, default=374) parser.add_argument('--max_seq_length', type=int, default=110) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Set the seed for random, numpy, PyTorch random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned special_tokens = ['<POS>', '<NEG>', '<CON_START>', '<START>', '<END>'] tokenizer = OpenAIGPTTokenizer.from_pretrained( args.model_name, special_tokens=special_tokens) start_token_id = tokenizer.convert_tokens_to_ids(['<START>'])[0] model = OpenAIGPTLMHeadModel.from_pretrained( args.model_name, num_special_tokens=len(special_tokens)) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Load and encode dataset def tokenize_and_encode(file_path): ''' This method tokenizes the input data and encodes it using the OpenAIGPTTokenizer :param file_path: Path of the input file, dtype: str :return: encoded dataset dtype: list ''' with open(file_path, 'r') as in_fp: lines = in_fp.read().splitlines() tokenized_dataset = lines for i, line in enumerate(tqdm(lines)): token = tokenizer.tokenize(line)[:512] tokenized_dataset[i] = tokenizer.convert_tokens_to_ids(token) return tokenized_dataset logger.info("Encoding dataset...") train_dataset = 
tokenize_and_encode(args.train_dataset) eval_dataset = tokenize_and_encode(args.eval_dataset) print("Training samples = {}".format(len(train_dataset))) print("Validation samples = {}".format(len(eval_dataset))) print("Example = {}".format(train_dataset[0])) time.sleep(2) # Compute the mex input length for the Transformer train_dataset = [ x for x in train_dataset if len(x) <= args.max_seq_length and start_token_id in x ] # Remove all sentence longer than max_seq_length eval_dataset = [ x for x in eval_dataset if len(x) <= args.max_seq_length and start_token_id in x ] input_length = max(max(len(t) for t in train_dataset), max(len(q) for q in eval_dataset)) if n_gpu > 1: input_length = min(input_length, model.module.config.n_positions) else: input_length = min(input_length, model.config.n_positions ) # Max size of input for the pre-trained model print("Input Length = {}".format(input_length)) def pre_process_dataset(encoded_dataset, input_length, start_token_id): """ This method is to create torch tensor of input ids and lm labels :param encoded_dataset: Input dataset, dtype: list :param input_length: Maximum length of sentence from training and eval dataset, dtype: int :param start_token_id: id of the '<START>' token, dtype: int :return: torch.tensor of size [len(encoded_dataset), 2] """ n_batch = len(encoded_dataset) input_ids = np.zeros(shape=(n_batch, input_length), dtype=np.int64) lm_labels = np.full(shape=(n_batch, input_length), fill_value=-1, dtype=np.int64) for i, tokens in enumerate(encoded_dataset): try: #tokens = tokens[:input_length] start_id_index = tokens.index(start_token_id) input_ids[i, :len(tokens)] = tokens start_id_index = tokens.index(start_token_id) lm_labels[i, start_id_index:len(tokens) - 1] = tokens[start_id_index + 1:len(tokens)] # LM loss calculate only for tokens after <START> token in the sentence #lm_labels[i, :len(tokens)-1] = tokens[1:] except ValueError: print("Index {} doesn't have start token".format(i)) input_ids = torch.tensor(input_ids) lm_labels = torch.tensor(lm_labels) tensor_dataset = (input_ids, lm_labels) #tensor_dataset.append(torch.tensor(d) for d in all_inputs) return tensor_dataset # Prepare input tensors and dataloders train_tensor_dataset = pre_process_dataset(train_dataset, input_length, start_token_id=start_token_id) eval_tensor_dataset = pre_process_dataset(eval_dataset, input_length, start_token_id=start_token_id) print("Training Example Input ids= {}".format(train_tensor_dataset[0][0])) print("Training Example Language Modeling ids = {}".format( train_tensor_dataset[1][0])) time.sleep(10) train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_data = TensorDataset(*eval_tensor_dataset) eval_sampler = RandomSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = len( train_data) * args.num_train_epochs // args.train_batch_size optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, 
max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay, t_total=num_train_optimization_steps) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, lm_labels = batch loss = model(input_ids, lm_labels=lm_labels) if n_gpu > 1: loss.mean().backward() else: loss.backward() optimizer.step() optimizer.zero_grad() if n_gpu > 1: tmp_loss = loss.mean().item() else: tmp_loss = loss.item() exp_average_loss = tmp_loss if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * tmp_loss nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format( exp_average_loss, optimizer.get_lr()[0]) model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model_zero_grad_{}.bin".format(epoch + 1)) config = model.module.config if hasattr(model, 'module') else model.config torch.save(model_to_save.state_dict(), output_model_file) model_state_dict = torch.load(output_model_file) model = OpenAIGPTLMHeadModel(config) model.load_state_dict(model_state_dict) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_eval: model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, lm_labels = batch with torch.no_grad(): lm_loss = model(input_ids, lm_labels=lm_labels) eval_loss += lm_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps train_loss = tr_loss / nb_tr_steps if args.do_train else None result = {'eval_loss': eval_loss, 'train_loss': train_loss} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
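# The progress-bar loss above is an exponential moving average with 0.7/0.3
# coefficients; the same smoothing as a tiny helper (illustrative only):

def smooth_loss(previous, current, momentum=0.7):
    """Exponentially smoothed loss for display; `previous` is None on step 0."""
    return current if previous is None else momentum * previous + (1 - momentum) * current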
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name or path to local checkpoint')
    parser.add_argument('--setting', type=str, default='explain_predict')
    parser.add_argument('--eval_preds_prefix', type=str, default='preds_')
    parser.add_argument("--n_train_print", type=int, default=10)
    parser.add_argument("--n_gen", type=int, default=20)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional', action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_eval_train", action='store_true',
                        help="Whether to run eval on the training set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--num_eval_print', type=int, default=15)
    parser.add_argument('--train_batch_size', type=int, default=36)
    parser.add_argument('--eval_batch_size', type=int, default=60)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=1e-6)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--data', type=str, default='/stage/examples/commonsenseqa/')
    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_test` must be True."
) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) special_tokens = [ '_start_</w>', 'or</w>', '_answer_</w>', '_classify_</w>', '_end_</w>' ] tokenizer = OpenAIGPTTokenizer.from_pretrained( args.model_name, special_tokens=special_tokens) special_tokens_ids = list( tokenizer.convert_tokens_to_ids(token) for token in special_tokens) model = OpenAIGPTLMHeadModel.from_pretrained( args.model_name, num_special_tokens=len(special_tokens)) model.to(device) datasets = parse_cqa(args.data, args.setting) numericalized = [ CommonsenseExample.numericalize_list( CommonsenseExample.tokenize_list(d, tokenizer), tokenizer) for d in datasets ] tensor_datasets = pre_process_datasets(numericalized, *special_tokens_ids) # train_tensor_dataset, eval_tensor_dataset, test_tensor_dataset = tensor_datasets[0], tensor_datasets[1], tensor_datasets[2] train_sampler, train_data = None, None if args.do_train or args.do_eval_train: train_tensor_dataset = tensor_datasets[0] train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) if args.do_eval_train: train_sampler = SequentialSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) if args.do_eval: if args.do_eval_train: eval_data = train_data eval_sampler = train_sampler else: eval_tensor_dataset = tensor_datasets[1] eval_data = TensorDataset(*eval_tensor_dataset) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) if args.do_test: test_tensor_dataset = tensor_datasets[-1] test_data = TensorDataset(*test_tensor_dataset) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) # Prepare optimizer if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = len( train_data) * args.num_train_epochs // args.train_batch_size optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay, t_total=num_train_optimization_steps) def trim_unks(x): try: unk_id = x.index('_end_</w>') return x[:unk_id] except: return x def detokenize(x): y = ''.join(trim_unks(x)) y = y.replace('</w>', ' ') y = y.replace(' .', '.') y = y.replace(' ,', ',') y = y.replace(' ?', '?') y = y.replace(' !', '!') y = y.replace(' \' ', '\'') y = y.replace(' \'re', '\'re') y = y.replace(' \'s', '\'s') y = y.replace(' n\'t', 'n\'t') return y def detok_batch(x): if not isinstance(x, list): x = x.tolist() return [ detokenize( tokenizer.convert_ids_to_tokens([z for z in y if z >= 0])) for y in x ] if args.do_train: best_eval = 0 nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None model.train() for _ in range(int(args.num_train_epochs)): tr_loss, train_ppl, n_train_examples = 0, 0, 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") train_pred_strs, train_lab_strs = [], [] for step, batch in enumerate(tqdm_bar): inputs = batch[0].to(device) labels = batch[1].to(device) loss = model(inputs, lm_labels=labels) train_ppl += loss.item() * inputs.size(0) n_train_examples += 
inputs.size(0) loss.backward() optimizer.step() tr_loss += loss.item() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) nb_tr_steps += 1 if args.n_train_print > 0: with torch.no_grad(): preds = sample(model, batch[2], 10, device) pred_str = detok_batch(preds) label_str = detok_batch(labels) train_lab_strs.extend(label_str) train_pred_strs.extend(pred_str) input_str = detok_batch(inputs) for print_idx in range( min(args.n_train_print, inputs.size(0))): print('INPT: ', input_str[print_idx]) print('GOLD: ', label_str[print_idx]) print('PRED: ', pred_str[print_idx]) print() train_bleu = None if args.n_train_print > 0: train_bleu = computeBLEU(train_pred_strs, [[x] for x in train_lab_strs]) train_ppl = math.exp(train_ppl / n_train_examples) if args.do_eval: model.eval() eval_loss, eval_em, eval_ppl = 0, 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 label_strs, prediction_strs = [], [] n_words = 0 for batch in eval_dataloader: inputs = batch[0].to(device) labels = batch[1].to(device) with torch.no_grad(): loss = model(inputs, lm_labels=labels) preds = sample(model, batch[2], args.n_gen, device) eval_loss += loss.item() eval_ppl += loss.item() * inputs.size(0) nb_eval_examples += inputs.size(0) nb_eval_steps += 1 pred_str = detok_batch(preds) label_str = detok_batch(labels) label_strs.extend(label_str) prediction_strs.extend(pred_str) input_str = detok_batch(inputs) eval_em += sum( [x == y for x, y in zip(pred_str, label_str)]) for print_idx in range( min(inputs.size(0), args.num_eval_print)): print('INPT: ', input_str[print_idx]) print('GOLD: ', label_str[print_idx]) print('PRED: ', pred_str[print_idx]) print() eval_bleu = computeBLEU(prediction_strs, [[x] for x in label_strs]) eval_ppl = math.exp(eval_ppl / nb_eval_examples) eval_em = eval_em / nb_eval_examples eval_loss = eval_loss / nb_eval_steps train_loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_em': eval_em, 'eval_bleu': eval_bleu, 'eval_ppl': eval_ppl, 'train_loss': train_loss, 'train_bleu': train_bleu, 'train_ppl': train_ppl } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if eval_bleu > best_eval: best_eval = eval_bleu # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") config = model.config torch.save(model_to_save.state_dict(), output_model_file) if args.do_eval: # Load a trained model that you have fine-tuned output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") model_state_dict = torch.load(output_model_file) model = OpenAIGPTLMHeadModel(model.config) model.load_state_dict(model_state_dict) # uncomment to try out the default not finue-tuned model # model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens), cache_dir=os.path.dirname(args.data)) model.to(device) model.eval() eval_loss, eval_em, eval_ppl = 0, 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 label_strs, prediction_strs = [], [] n_words = 0 for batch in eval_dataloader: inputs = batch[0].to(device) labels = batch[1].to(device) with torch.no_grad(): loss = model(inputs, lm_labels=labels) preds = sample(model, batch[2], args.n_gen, device) eval_loss += loss.item() eval_ppl += 
loss.item() * inputs.size(0) nb_eval_examples += inputs.size(0) nb_eval_steps += 1 pred_str = detok_batch(preds) label_str = detok_batch(labels) label_strs.extend(label_str) prediction_strs.extend(pred_str) input_str = detok_batch(inputs) eval_em += sum([x == y for x, y in zip(pred_str, label_str)]) for print_idx in range(min(inputs.size(0), args.num_eval_print)): print('INPT: ', input_str[print_idx]) print('GOLD: ', label_str[print_idx]) print('PRED: ', pred_str[print_idx]) print() eval_bleu = computeBLEU(prediction_strs, [[x] for x in label_strs]) eval_ppl = math.exp(eval_ppl / nb_eval_examples) eval_em = eval_em / nb_eval_examples eval_loss = eval_loss / nb_eval_steps train_loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_em': eval_em, 'eval_bleu': eval_bleu, 'eval_ppl': eval_ppl, 'train_loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: logger.info("***** Best Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) output_preds_file = os.path.join( args.output_dir, f"{args.eval_preds_prefix}_{args.setting}.txt") with open(output_preds_file, 'w') as writer: logger.info("Writing predictions") for p in prediction_strs: writer.write(p + '\n') if args.do_test: # Load a trained model that you have fine-tuned output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") model_state_dict = torch.load(output_model_file) model = OpenAIGPTLMHeadModel(model.config) model.load_state_dict(model_state_dict) model.to(device) model.eval() eval_loss, eval_em, eval_ppl = 0, 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 label_strs, prediction_strs = [], [] n_words = 0 for batch in test_dataloader: inputs = batch[0].to(device) with torch.no_grad(): preds = sample(model, batch[1], args.n_gen, device) pred_str = detok_batch(preds) prediction_strs.extend(pred_str) output_preds_file = os.path.join( args.output_dir, f"{args.test_preds_prefix}_{args.setting}.txt") with open(output_preds_file, 'w') as writer: logger.info("Writing predictions") for p in prediction_strs: writer.write(f'"{p.strip()}"\n')
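# --- Editor's sketch (toy input only, not from the original file): what the
# detokenize()/trim_unks() helpers above do to a list of GPT BPE tokens -
# everything after '_end_</w>' is dropped, '</w>' becomes a word boundary, and
# the space before punctuation is removed. demo_detokenize is a made-up name.
def demo_detokenize(tokens):
    try:
        tokens = tokens[:tokens.index('_end_</w>')]   # trim_unks
    except ValueError:
        pass
    text = ''.join(tokens).replace('</w>', ' ')
    for punct in ('.', ',', '?', '!'):
        text = text.replace(' ' + punct, punct)
    return text.replace(" n't", "n't").replace(" 's", "'s").strip()


print(demo_detokenize(
    ['the</w>', 'answer</w>', 'is</w>', 'yes</w>', '.</w>', '_end_</w>', 'pad</w>']))
# -> "the answer is yes."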
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name') parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--output_dir", default='tuned_gpt2', type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") parser.add_argument('--train_dataset', type=str, default='') parser.add_argument('--source_eval', type=str, default='') parser.add_argument('--target_eval', type=str, default='') parser.add_argument('--source_train', type=str, default='') parser.add_argument('--target_train', type=str, default='') parser.add_argument('--eval_dataset', type=str, default='') parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=10) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--effective_batch_size',type=int, default=64) parser.add_argument('--eval_batch_size', type=int, default=16) parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument('--warmup_proportion', type=float, default=0.002) parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--n_valid', type=int, default=374) parser.add_argument('--bsz', type=int, default = 20) parser.add_argument('--bptt', type=int, default = 40) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() # print(args) model_type = 'gpt2' if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device(type='cuda') n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) # if not args.do_train and not args.do_eval: # raise ValueError("At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2').to('cuda') model.to(device) #file_train = args.train_dataset #'cnn_train.txt' #file_eval = args.eval_dataset #'cnn_valid.txt' bptt = args.bptt bsz = args.bsz # X_eval, nbatch_eval = load_dataset(file_eval, tokenizer, bptt, bsz) # X_train, nbatch_train = load_dataset(file_train, tokenizer, bptt, bsz) batches_eval, labels_eval, nbatch_eval = load_dataset(args.source_eval, args.target_eval, tokenizer, bptt, bsz) batches_train, labels_train, nbatch_train = load_dataset(args.source_train, args.target_train, tokenizer, bptt, bsz) # Prepare optimizer # param_optimizer = list(model.parameters()) param_optimizer = list(model.named_parameters()) no_decay = 
['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] print('here 3') # num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size num_train_optimization_steps = nbatch_train * args.num_train_epochs optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay, t_total=num_train_optimization_steps) eval_loss_min = None print('here 4') model.to(device) nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None model.train() for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 for i_batch in tqdm(list(range(nbatch_train)), desc='Evaluating epoch {}'.format(epoch_i)): batch = batches_train[i_batch]#X_train[:, i_batch*bsz:(1+i_batch)*bsz].permute(1,0) batch = batch.cuda() lm_labels = labels_train[i_batch].cuda() if batch.numel() == 0: break #loss = model(batch, lm_labels = labels_train[i_batch].cuda()) # TRY DOING IT MANUALLY loss_fct = CrossEntropyLoss(reduction = 'none') lm_logits,_ = model(batch) shift_logits = lm_logits[:, :-1, :].contiguous() shift_labels = batch[:,1:].contiguous() shift_labels_mask = (lm_labels[:,1:].contiguous().view(-1) != -1).float() loss_mat = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) loss = (loss_mat*shift_labels_mask).view(-1).sum()/shift_labels_mask.sum() # avg over non-masked indices loss.backward() # only step the model if you've gone through 'effective_batch_size' examples if (i_batch*args.train_batch_size) % args.effective_batch_size == 0 and i_batch != 0: optimizer.step() optimizer.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item() nb_tr_steps += 1 ### # Evaluations ### if i_batch % 1000 == 0: # get eval score eval_loss = eval_model(model, nbatch_eval,batches_eval,labels_eval, bsz) # if eval_loss improves, save model if eval_loss_min is None or eval_loss < eval_loss_min: eval_loss_min = eval_loss # save model if eval loss is lower model_to_save = model # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) to_json_file(model_to_save.config,output_config_file) print('eval_loss {}',format(eval_loss)) model.train() if i_batch % 200 == 0: # try generating from model print("Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])) model.eval() if model_type == 'gpt': encode = lambda a: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(a)) decode = tokenizer.decode elif model_type == 'gpt2': encode = tokenizer.encode decode = tokenizer.decode generate_from_model(encode, decode, model = model,model_type = model_type) model.train()
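# --- Editor's sketch (random tensors, assumed shapes): the manual masked LM loss
# computed in main() above. Logits at position t are scored against the token at
# t + 1, and positions whose label is -1 are excluded from the average. In the
# script the targets come from the shifted input ids and the mask from the shifted
# labels; here one labels tensor plays both roles, clamped only so the unreduced
# loss can be evaluated at masked positions before the mask zeroes them out.
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, vocab = 2, 6, 11
lm_logits = torch.randn(batch_size, seq_len, vocab)
labels = torch.randint(0, vocab, (batch_size, seq_len))
labels[:, :3] = -1                                   # mask the prompt tokens

shift_logits = lm_logits[:, :-1, :].contiguous()
shift_labels = labels[:, 1:].contiguous()
mask = (shift_labels.view(-1) != -1).float()

loss_mat = CrossEntropyLoss(reduction='none')(
    shift_logits.view(-1, vocab), shift_labels.view(-1).clamp(min=0))
manual = (loss_mat * mask).sum() / mask.sum()        # average over unmasked positions

reference = CrossEntropyLoss(ignore_index=-1)(
    shift_logits.view(-1, vocab), shift_labels.view(-1))
print(torch.allclose(manual, reference))             # True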
def train(self):
    if self.debug_mode:
        self.epochs = 1
    # Load the dataloaders
    train_loader, valid_loader = self.create_dataloader()
    # Training
    self.seed_everything()
    lr = 2e-5
    accumulation_steps = math.ceil(self.batch_size / self.base_batch_size)
    # Load the pre-trained model
    print("Load pre-trained model")
    model = GPT2NeuralNet.from_pretrained(self.gpt2_model_path, cache_dir=None)
    model.zero_grad()
    model = model.to(self.device)
    """
    # Use a different weight_decay for different parameter groups
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    """
    epoch_steps = int(self.train_len * 0.5 / self.base_batch_size / accumulation_steps)
    num_train_optimization_steps = int(self.epochs * epoch_steps)
    valid_every = math.floor(epoch_steps * accumulation_steps / 5)
    optimizer = OpenAIAdam(model.parameters(),
                           lr=lr,
                           warmup=0.05,
                           t_total=num_train_optimization_steps)
    # Gradually decaying learning rate
    # scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
    # Start training
    print("Train")
    best_auc_score_1 = 0
    best_auc_score_2 = 0
    best_auc_score_3 = 0
    best_auc_score_4 = 0
    f_log = open("train_log.txt", "w")
    for epoch in range(self.epochs):
        model.train()
        optimizer.zero_grad()
        # Load each batch and train on it
        train_start_time = time.time()
        for i, batch_data in enumerate(train_loader):
            x_batch = batch_data[0]
            y_batch = batch_data[1]
            target_weight_batch = batch_data[2]
            aux_weight_batch = batch_data[3]
            identity_weight_batch = batch_data[4]
            np_weight_batch = batch_data[5]
            np_identity_weight_batch = batch_data[6]
            y_pred = model(x_batch.to(self.device))
            target_loss, aux_loss, identity_loss, np_loss = self.custom_loss(
                y_pred, y_batch, epoch, target_weight_batch, aux_weight_batch,
                identity_weight_batch, np_weight_batch)
            loss = target_loss + aux_loss + identity_loss + np_loss
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            # Validation
            if (i + 1) % valid_every == 0:
                model.eval()
                stage = int((i + 1) / valid_every)
                train_stage_duration = int((time.time() - train_start_time) / 60)
                valid_start_time = time.time()
                y_pred = np.zeros((len(self.train_df) - self.train_len))
                for j, valid_batch_data in enumerate(valid_loader):
                    x_batch = valid_batch_data[0]
                    batch_y_pred = self.sigmoid(
                        model(x_batch.to(self.device)).detach().cpu().numpy())[:, 0]
                    y_pred[j * self.base_batch_size:(j + 1) * self.base_batch_size] = batch_y_pred
                # Compute the validation score
                auc_score = self.evaluator.get_final_metric(y_pred)
                valid_duration = int((time.time() - valid_start_time) / 60)
                train_start_time = time.time()
                f_log.write(
                    "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f\n"
                    % (epoch, stage, train_stage_duration, valid_duration, auc_score))
                print(
                    "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f"
                    % (epoch, stage, train_stage_duration, valid_duration, auc_score))
                if auc_score > best_auc_score_4:
                    state_dict = model.state_dict()
                    if auc_score > best_auc_score_1:
                        best_auc_score_1 = auc_score
                        torch.save(state_dict, "model1.bin")
                    elif auc_score > best_auc_score_2:
                        best_auc_score_2 = auc_score
                        torch.save(state_dict, "model2.bin")
                    elif auc_score > best_auc_score_3:
                        best_auc_score_3 = auc_score
                        torch.save(state_dict, "model3.bin")
                    else:
                        best_auc_score_4 = auc_score
                        torch.save(state_dict, "model4.bin")
                    with open("model_score.txt", "w") as f:
                        f.write("model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                                % (best_auc_score_1, best_auc_score_2,
                                   best_auc_score_3, best_auc_score_4))
                    print("model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                          % (best_auc_score_1, best_auc_score_2,
                             best_auc_score_3, best_auc_score_4))
                model.train()
    if self.last is True:
        state_dict = model.state_dict()
        torch.save(state_dict, "model_last.bin")
    # Delete the training inputs and the model
    training_history = [train_loader, valid_loader, model, optimizer]
    for variable in training_history:
        del variable
    gc.collect()
def train(): config_file = "configs/train_daily_dialog_emotion_action_config.json" config = Config.from_json_file(config_file) # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", config.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(config)) # Initialize distributed training if needed config.distributed = (config.local_rank != -1) if config.distributed: torch.cuda.set_device(config.local_rank) config.device = torch.device("cuda", config.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info( "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning" ) tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadsModel model = model_class.from_pretrained(config.model_checkpoint) tokenizer.set_special_tokens(SPECIAL_TOKENS) model.set_num_special_tokens(len(SPECIAL_TOKENS)) model.to(config.device) optimizer = OpenAIAdam(model.parameters(), lr=config.lr) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if config.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) if config.distributed: model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders( config, tokenizer) # Training function and trainer def update(engine, batch): model.train() input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple( input_tensor.to(config.device) for input_tensor in batch) lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids) loss = (lm_loss * config.lm_coef + mc_loss * config.mc_coef) / config.gradient_accumulation_steps if config.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm) if engine.state.iteration % config.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple( input_tensor.to(config.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = batch #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids, token_emotion_ids=token_emotion_ids, token_action_ids=token_action_ids) lm_logits, mc_logits = model_outputs[0], model_outputs[ 1] # So we can also use GPT2 outputs lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view( -1, 
lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if config.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if config.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if config.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = { "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1])) } metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config), "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if config.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) tb_logger = TensorboardLogger(log_dir=config.log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" take care of distributed encapsulation torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file( os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) tokenizer.save_vocabulary(tb_logger.writer.log_dir) # Run the training trainer.run(train_loader, max_epochs=config.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if config.local_rank in [-1, 0] and config.n_epochs > 0: os.rename( checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to 
have better access to saved file paths (cleaner) tb_logger.close()
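# --- Editor's sketch (plain Python, no ignite dependency, illustrative numbers):
# the schedule that PiecewiseLinear(optimizer, "lr", [(0, lr), (T, 0.0)]) applies
# in the trainer above - a straight line from lr at iteration 0 down to 0.0 at
# iteration T = n_epochs * len(train_loader).
def linear_decay_lr(iteration, base_lr, total_iterations):
    frac = min(max(iteration / float(total_iterations), 0.0), 1.0)
    return base_lr * (1.0 - frac)


base_lr, total = 6.25e-5, 20 * 1000     # stand-in for n_epochs * len(train_loader)
for it in (0, 5000, 10000, 20000):
    print(it, linear_decay_lr(it, base_lr, total))
# 0 -> 6.25e-05, 5000 -> ~4.69e-05, 10000 -> 3.125e-05, 20000 -> 0.0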
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="/home/rohola/logs",
                        help="Path, url or short name of the model")
    # parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=2,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=1,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=20,
                        help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--log_dir", type=str, default="", help="Log directory")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info( "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning" ) tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel model = model_class.from_pretrained(args.model_checkpoint) tokenizer.set_special_tokens(SPECIAL_TOKENS) model.set_num_special_tokens(len(SPECIAL_TOKENS)) model.to(args.device) optimizer = OpenAIAdam(model.parameters(), lr=args.lr) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) # train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer) # # def inference(engine, batch): # model.eval() # with torch.no_grad(): # batch = tuple(input_tensor.to(args.device) for input_tensor in batch) # input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch # model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids) # lm_logits, mc_logits, label = model_outputs[0], model_outputs[1], model_outputs[2] # if label!=19: # print(tokenizer.decode(input_ids[0, -1, :].tolist())) # print(tokenizer.decode(input_ids[0, label, :].tolist())) # lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1)) # lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) # return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) # # evaluator = Engine(inference) # # evaluator.run(val_loader) def tokenize_sentence(s): bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids( SPECIAL_TOKENS[:-1]) s1_ids = [bos] + tokenizer.convert_tokens_to_ids( tokenizer.tokenize(s)) + [eos] input_ids = torch.Tensor(s1_ids).type( torch.int64).unsqueeze(0).unsqueeze(0) return input_ids def compare_sentences_using_openai_embedding(): s1 = "I love biking" s2 = "I want to buy a bicycle" s3 = "It is wrong" input_ids = tokenize_sentence(s1) hidden_states1 = model(input_ids, mc_token_ids=None) hidden_states1 = hidden_states1.squeeze(0).squeeze(0).mean(dim=0) input_ids = tokenize_sentence(s2) hidden_states2 = model(input_ids, mc_token_ids=None) hidden_states2 = hidden_states2.squeeze(0).squeeze(0).mean(dim=0) input_ids = tokenize_sentence(s3) hidden_states3 = model(input_ids, mc_token_ids=None) hidden_states3 = hidden_states3.squeeze(0).squeeze(0).mean(dim=0) print() compare_sentences_using_openai_embedding()
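# --- Editor's sketch (an assumption, not in the original): the
# compare_sentences_using_openai_embedding() helper above mean-pools hidden states
# for three sentences and then stops at a bare print(). One plausible comparison
# is cosine similarity between the pooled vectors; random tensors stand in for
# hidden_states1/2/3 here.
import torch
import torch.nn.functional as F

hidden_states1 = torch.randn(768)   # stand-ins for the mean-pooled embeddings
hidden_states2 = torch.randn(768)
hidden_states3 = torch.randn(768)

sim_12 = F.cosine_similarity(hidden_states1, hidden_states2, dim=0).item()
sim_13 = F.cosine_similarity(hidden_states1, hidden_states3, dim=0).item()
print("sim(s1, s2) = %.3f, sim(s1, s3) = %.3f" % (sim_12, sim_13))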
def train(): parser = ArgumentParser() parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model") parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training") parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history") parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation") parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps") parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate") parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient") parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs") parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences") parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Running process %d", args.local_rank) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model_class = GPT2LMHeadModel if "gpt2" in args.model_checkpoint else OpenAIGPTLMHeadModel model = model_class.from_pretrained(args.model_checkpoint) tokenizer.set_special_tokens(SPECIAL_TOKENS) model.set_num_special_tokens(len(SPECIAL_TOKENS)) model.to(args.device) optimizer = OpenAIAdam(model.parameters(), lr=args.lr) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer) # Training function and trainer def update(engine, batch): model.train() batch = tuple(input_tensor.to(args.device) for input_tensor in batch) lm_loss, mc_loss = model(*batch) loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple(input_tensor.to(args.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids) lm_logits, mc_logits = model_outputs[0], model_outputs[1] # So we can also use GPT2 outputs lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: 
evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))} metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args), "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)}) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) tb_logger = TensorboardLogger(log_dir=None) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation torch.save(args, tb_logger.writer.log_dir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) tokenizer.save_vocabulary(tb_logger.writer.log_dir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
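# --- Editor's sketch (tiny tensors): why inference() above shifts
# lm_logits[..., :-1, :] against lm_labels[..., 1:] - the logit at position t is
# scored against the token at position t + 1, and ignore_index=-1 drops
# padded/masked positions from the NLL, whose exp gives the average_ppl metric.
import torch

vocab = 7
lm_logits = torch.randn(1, 5, vocab)            # predictions for 5 positions
lm_labels = torch.tensor([[3, 6, 2, -1, -1]])   # -1 marks ignored positions

logits_shifted = lm_logits[..., :-1, :].contiguous().view(-1, vocab)  # positions 0..3
labels_shifted = lm_labels[..., 1:].contiguous().view(-1)             # tokens 1..4

nll = torch.nn.CrossEntropyLoss(ignore_index=-1)(logits_shifted, labels_shifted)
print(nll.item(), torch.exp(nll).item())   # average NLL and its perplexity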
[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = int(FACTOR * 2 * EPOCHS * len(train_loader.tensors[0]) / batch_size / accumulation_steps) optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=lr, warmup=WARMUP, t_total=num_train_optimization_steps) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) ####################### # multi-gpu ####################### model = nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4, 5]) model = model.train() del param_optimizer, optimizer_grouped_parameters
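# --- Editor's sketch (illustrative numbers only, not taken from the snippet above):
# how the t_total passed to OpenAIAdam and its warmup proportion translate into
# optimizer steps. FACTOR, EPOCHS, WARMUP and the dataset/batch sizes are stand-in
# values; the formula mirrors the num_train_optimization_steps expression above.
FACTOR, EPOCHS, WARMUP = 1, 2, 0.002
n_examples, batch_size, accumulation_steps = 400_000, 32, 4

num_train_optimization_steps = int(
    FACTOR * 2 * EPOCHS * n_examples / batch_size / accumulation_steps)
num_warmup_steps = int(WARMUP * num_train_optimization_steps)

print(num_train_optimization_steps, num_warmup_steps)
# 12500 optimizer steps in total, of which the first 25 warm the learning rate up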