def main(test_file, vocab_file, embeddings_file, pretrained_file,
         max_length=50, gpu_index=0, batch_size=128):
    """
    Test the ESIM model with pretrained weights on some dataset.

    Args:
        test_file: The path to a file containing preprocessed NLI data.
        vocab_file: The path to the vocabulary file used to index the data.
        embeddings_file: The path to pretrained word embeddings to load
            into the model.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        max_length: The maximum length of the sequences fed to the model.
            Defaults to 50.
        gpu_index: Index of the GPU to use when CUDA is available.
            Defaults to 0.
        batch_size: The size of the batches used for testing. Defaults
            to 128.
    """
    device = torch.device("cuda:{}".format(gpu_index)
                          if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    # Always map the checkpoint onto the selected device. The original code
    # skipped map_location on Linux, which crashes when a GPU-trained
    # checkpoint is tested on a CPU-only machine.
    checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    hidden_size = checkpoint["model"]["projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["classification.6.weight"].size(0)
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = LCQMC_Dataset(test_file, vocab_file, max_length)
    # No shuffling at test time: metrics are order-independent and a fixed
    # order keeps runs reproducible.
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    print("\t* Building model...")
    model = ESIM(hidden_size, embeddings=embeddings,
                 num_classes=num_classes, device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing ESIM model on device: {} ".format(device), 20 * "=")
    batch_time, total_time, accuracy, auc = test(model, test_loader)
    print("\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%, auc: {:.4f}\n".format(batch_time, total_time, (accuracy*100), auc))
def train(preproc_dir, n_classes, max_length, hidden_units, dropout,
          batch_size, epochs, output_dir):
    """
    Train the ESIM model on some dataset and save the learned weights.

    Args:
        preproc_dir: The directory where the preprocessed data is saved.
        n_classes: The number of classes in the problem.
        max_length: The maximum length of the sentences in the premises
            and hypotheses of the dataset.
        hidden_units: The number of hidden units to use in the various
            layers of the model.
        dropout: The dropout rate to use in the model.
        batch_size: The size of the batches to use for training.
        epochs: The number of epochs to apply during training.
        output_dir: The path to the directory where the weights learned
            during training must be saved.
    """
    print("Loading training and validation data...")
    train_premises, train_hyps, train_labels = prepare_data(
        preproc_dir, 'train', n_classes, max_length)
    valid_premises, valid_hyps, valid_labels = prepare_data(
        preproc_dir, 'dev', n_classes, max_length)
    # train_premises is a list of zero-padded word-index sequences, e.g.:
    #   [[5, 6, 7, ..., 0, 0], [17, 18, ..., 0, 0], ...]
    print("Loading embedding weights...")
    embedding_weights = load_embeddings(
        os.path.join(preproc_dir, "embedding_weights.pkl"))
    # Build the Keras model from the ESIM configuration.
    model = ESIM(n_classes, embedding_weights, max_length,
                 hidden_units, dropout).build_model()
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Keep only the checkpoint with the best validation accuracy so far.
    weights_path = os.path.join(output_dir,
                                "weights-{epoch:02d}-{val_acc:.2f}.hdf5")
    best_ckpt = ModelCheckpoint(weights_path,
                                monitor='val_acc',
                                verbose=1,
                                save_best_only=True,
                                mode='max')
    model.fit(x=[train_premises, train_hyps],
              y=train_labels,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=([valid_premises, valid_hyps], valid_labels),
              callbacks=[best_ckpt],
              shuffle=True)
def model_load_test(test_df, vocab_file, embeddings_file, pretrained_file,
                    test_prediction_dir, test_prediction_name, mode,
                    num_labels, max_length=50, gpu_index=0, batch_size=128):
    """
    Load a trained ESIM checkpoint, evaluate it on a test dataframe and
    write the predictions to a CSV file.

    Args:
        test_df: Dataframe holding the test examples.
        vocab_file: Path to the vocabulary file used to index the data.
        embeddings_file: Path to pretrained word embeddings.
        pretrained_file: Path to the checkpoint to evaluate.
        test_prediction_dir: Directory where predictions are written
            (created if missing).
        test_prediction_name: File name of the prediction CSV.
        mode: Dataset mode flag forwarded to My_Dataset.
        num_labels: Number of output labels for the model.
        max_length: Maximum sequence length. Defaults to 50.
        gpu_index: Index of the GPU to use when CUDA is available.
            Defaults to 0.
        batch_size: Batch size for testing. Defaults to 128.
    """
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    # The original platform check had two byte-identical branches, so the
    # checkpoint is simply always mapped onto the selected device.
    checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    hidden_size = checkpoint["model"]["projection.0.weight"].size(0)
    # num_classes is recovered for reference only; the model is sized by
    # the num_labels argument below.
    num_classes = checkpoint["model"]["classification.6.weight"].size(0)
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    print("\t* Building model...")
    model = ESIM(hidden_size, embeddings=embeddings,
                 num_labels=num_labels, device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing ESIM model on device: {} ".format(device), 20 * "=")
    batch_time, total_time, accuracy, predictions = test(model, test_loader)
    print(
        "\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%\n"
        .format(batch_time, total_time, (accuracy * 100)))
    test_prediction = pd.DataFrame({'prediction': predictions})
    if not os.path.exists(test_prediction_dir):
        os.makedirs(test_prediction_dir)
    test_prediction.to_csv(os.path.join(test_prediction_dir,
                                        test_prediction_name),
                           index=False)
def main(args):
    """
    Generate predictions for the test set with a pretrained ESIM model and
    write them to a tab-separated submission file.

    Args:
        args: Namespace providing at least result, pretrained_file,
            test_file, vocab_file, max_length, batch_size, embed_file,
            device and submit_example_path attributes.
    """
    print(20 * "=", " Preparing for training ", 20 * "=")
    if not os.path.exists(args.result):
        os.makedirs(args.result)
    # -------------------- Load pretrained model ------------------- #
    checkpoints = torch.load(args.pretrained_file)
    # Model hyper-parameters could also be recovered from the checkpoint:
    # hidden_size = checkpoints["model"]["projection.0.weight"].size(0)
    # num_classes = checkpoints["model"]["classification.6.weight"].size(0)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    test_data = LCQMC_dataset(args.test_file, args.vocab_file,
                              args.max_length, test_flag=True)
    test_loader = DataLoader(test_data, batch_size=args.batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = load_embeddings(args.embed_file)
    model = ESIM(args, embeddings=embeddings).to(args.device)
    model.load_state_dict(checkpoints["model"])
    print(20 * "=", " Testing ESIM model on device: {} ".format(args.device), 20 * "=")
    all_predict = predict(model, test_loader)
    # --------------------- Write the submission file -------------------- #
    # Build the index column in one vectorized call; the original appended
    # one element at a time with np.append, which copies the whole array on
    # every call (O(n^2)).
    df_test = pd.DataFrame(columns=['index', 'prediction'])
    df_test['index'] = np.arange(len(all_predict), dtype=int)
    df_test['prediction'] = all_predict
    df_test.to_csv(args.submit_example_path,
                   index=False,
                   columns=['index', 'prediction'],
                   sep='\t')
def load(args, checkpoint_dir):
    """
    Load a trained model of the configured type from a checkpoint directory.

    Strips any DataParallel 'module.' key prefix from the saved state dict,
    instantiates the model selected by args.model_type ('bert', 'bow',
    'decom_att' or 'esim'), loads the weights and moves it to args.device.

    Args:
        args: Namespace with model_type, vocab_size, embed_size,
            hidden_size, num_labels, device (and word_mat for 'decom_att').
        checkpoint_dir: Directory containing 'checkpoint.pth' (and, for
            'bert', 'config.bin').

    Returns:
        The loaded model, on args.device.

    Raises:
        ValueError: If args.model_type is not one of the supported types.
    """
    state_dict = torch.load(os.path.join(checkpoint_dir, 'checkpoint.pth'))
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        # Strip the 'module.' prefix added by DataParallel. Only strip an
        # actual prefix: the original substring test (`'module' in k`)
        # would also truncate keys that merely contain 'module' elsewhere.
        if k.startswith('module.'):
            namekey = k[len('module.'):]
        else:
            namekey = k
        new_state_dict[namekey] = v
    if args.model_type == 'bert':
        config = BertConfig.from_json_file(
            os.path.join(checkpoint_dir, 'config.bin'))
        model = BertForSequenceClassification(config)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'bow':
        model = BOWModel(new_state_dict['embedding.weight'],
                         n_vocab=args.vocab_size,
                         embed_size=args.embed_size,
                         hidden_size=args.hidden_size,
                         num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'decom_att':
        model = DecompAttentionModel(args.word_mat,
                                     n_vocab=args.vocab_size,
                                     embed_size=args.embed_size,
                                     hidden_size=args.hidden_size,
                                     num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'esim':
        model = ESIM(vocab_size=args.vocab_size,
                     embedding_dim=args.embed_size,
                     hidden_size=args.hidden_size,
                     embeddings=None,
                     padding_idx=0,
                     dropout=0.1,
                     num_classes=args.num_labels,
                     device=args.device)
        model.load_state_dict(new_state_dict)
    else:
        raise ValueError('model type is not found!')
    return model.to(args.device)
def main(train_file, dev_file, vocab_file, target_dir, max_length=50,
         hidden_size=300, dropout=0.2, num_classes=2, epochs=1,
         batch_size=256, lr=0.0005, patience=5, max_grad_norm=10.0,
         gpu_index=0, checkpoint=None):
    """
    Train the ESIM model on the LCQMC dataset, with validation, LR
    scheduling, early stopping and per-epoch checkpointing.

    Args:
        train_file: Path to the preprocessed training data.
        dev_file: Path to the preprocessed validation data.
        vocab_file: Path to the vocabulary file.
        target_dir: Directory where checkpoints are saved.
        max_length: Maximum sentence length. Defaults to 50.
        hidden_size: Size of the model's hidden layers. Defaults to 300.
        dropout: Dropout rate. Defaults to 0.2.
        num_classes: Number of output classes. Defaults to 2.
        epochs: Maximum number of training epochs. Defaults to 1.
        batch_size: Batch size. Defaults to 256.
        lr: Learning rate for Adam. Defaults to 0.0005.
        patience: Epochs without improvement before early stopping.
        max_grad_norm: Gradient clipping norm. Defaults to 10.0.
        gpu_index: GPU index (unused while training is forced onto CPU).
        checkpoint: Optional checkpoint path to resume training from.
    """
    # Training is currently forced onto the CPU; re-enable the commented
    # line to use "cuda:{gpu_index}" when a GPU is available.
    # device = torch.device("cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are stored.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = LCQMC_Dataset(train_file, vocab_file, max_length)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = LCQMC_Dataset(dev_file, vocab_file, max_length)
    # Validation needs no shuffling; a fixed order keeps runs reproducible.
    dev_loader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    # embeddings = load_embeddings(embeddings_file)
    model = ESIM(hidden_size, dropout=dropout, num_labels=num_classes,
                 device=device).to(device)
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Only optimize parameters that require gradients.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # Reduce the LR when validation accuracy (stepped below) plateaus.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, auc = validate(model, dev_loader, criterion)
    print(
        "\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
        .format(valid_loss, (valid_accuracy * 100), auc))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print(
            "-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
            .format(epoch_time, epoch_loss, (epoch_accuracy * 100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best model. The optimizer state is included so the
            # resume path above (which reads checkpoint["optimizer"]) works
            # from this file too — the original save omitted it.
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses
                }, os.path.join(target_dir, "best.pth.tar"))
        # Save the model at each epoch.
        torch.save(
            {
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,
                "optimizer": optimizer.state_dict(),
                "epochs_count": epochs_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses
            }, os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
def main():
    """
    Command-line entry point for training, evaluating and testing an ESIM
    model on CSV question-pair data.

    Parses arguments, seeds RNGs, then optionally trains with gradient
    accumulation and periodic F1 evaluation on the dev set (saving the best
    model to pytorch_model.bin under a run-specific output directory), and
    optionally reloads that file to report F1 on the test set.
    """
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--embeddings_file", default=None, type=str, required=True)
    parser.add_argument("--output_dir", default=None, type=str, required=True)
    parser.add_argument("--train_language", default=None, type=str, required=True)
    parser.add_argument("--train_steps", default=-1, type=int, required=True)
    parser.add_argument("--eval_steps", default=-1, type=int, required=True)
    parser.add_argument(
        "--load_word2vec",
        action='store_true',
        help=
        'if true, load word2vec file for the first time; if false, load generated word-vector csv file'
    )
    parser.add_argument("--generate_word2vec_csv",
                        action='store_true',
                        help='if true, generate word2vec csv file')
    ## normal parameters
    parser.add_argument("--embedding_size", default=300, type=int)
    parser.add_argument("--query_maxlen", default=30, type=int)
    parser.add_argument("--hidden_size", default=300, type=int)
    parser.add_argument("--learning_rate", default=5e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_classes", default=2, type=int)
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--dropout", default=0.2, type=float)
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_eval_train", action='store_true',
                        help="Whether to run eval on the train set.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=10, type=int)
    parser.add_argument("--per_gpu_train_batch_size", default=10, type=int)
    parser.add_argument("--seed", default=1, type=int)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--gradient_accumulation_steps", default=1, type=int)
    args = parser.parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # device = torch.device("cpu")
    args.device = device
    # Set seed
    set_seed(args)
    logger.info("Training/evaluation parameters %s", args)
    # Effective eval batch size scales with the number of visible GPUs.
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Training
    if args.do_train:
        # build model
        logger.info("*** building model ***")
        embeddings = load_embeddings(args)
        model = ESIM(args.hidden_size,
                     embeddings=embeddings,
                     dropout=args.dropout,
                     num_classes=args.num_classes,
                     device=args.device)
        model.to(args.device)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        args.train_batch_size = args.per_gpu_train_batch_size * max(
            1, args.n_gpu)
        logger.info("*** Loading training data ***")
        train_data = ATEC_Dataset(os.path.join(args.data_dir, 'train.csv'),
                                  os.path.join(args.data_dir, 'vocab.csv'),
                                  args.query_maxlen)
        train_loader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=args.train_batch_size)
        logger.info("*** Loading validation data ***")
        dev_data = ATEC_Dataset(os.path.join(args.data_dir, 'dev.csv'),
                                os.path.join(args.data_dir, 'vocab.csv'),
                                args.query_maxlen)
        dev_loader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=args.eval_batch_size)
        num_train_optimization_steps = args.train_steps
        # Keep only the parameters that require gradient updates.
        parameters = filter(lambda p: p.requires_grad, model.parameters())
        # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
        optimizer = torch.optim.Adam(parameters, lr=args.learning_rate)
        # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               mode="max",
                                                               factor=0.85,
                                                               patience=0)
        criterion = nn.CrossEntropyLoss()
        global_step = 0
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_data))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Gradient Accumulation steps = %d",
                    args.gradient_accumulation_steps)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        best_acc = 0
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        # Training is step-based, not epoch-based: cycle() restarts the
        # loader so `train_steps` batches can be drawn regardless of size.
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
        train_loader = cycle(train_loader)
        output_dir = args.output_dir + "eval_results_{}_{}_{}_{}_{}_{}".format(
            'ESIM', str(args.query_maxlen), str(args.learning_rate),
            str(args.train_batch_size), str(args.train_language),
            str(args.train_steps))
        try:
            os.makedirs(output_dir)
        except:
            pass
        output_eval_file = os.path.join(output_dir, 'eval_result.txt')
        with open(output_eval_file, "w") as writer:
            writer.write('*' * 80 + '\n')
        for step in bar:
            batch = next(train_loader)
            batch = tuple(t.to(device) for t in batch)
            q1, q1_lens, q2, q2_lens, labels = batch
            # Regular (non-adversarial) training step.
            optimizer.zero_grad()
            logits, probs = model(q1, q1_lens, q2, q2_lens)
            loss = criterion(logits, labels)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            # Running mean of the (un-divided) loss since the last report.
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps /
                (nb_tr_steps + 1), 4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += q1.size(0)
            nb_tr_steps += 1
            loss.backward()
            # Adversarial training (disabled):
            # fgm.attack()  # add an adversarial perturbation on the embedding
            # loss_adv = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
            # if args.n_gpu > 1:
            #     loss_adv = loss_adv.mean()  # mean() to average on multi-gpu.
            # if args.gradient_accumulation_steps > 1:
            #     loss_adv = loss_adv / args.gradient_accumulation_steps
            # loss_adv.backward()  # backprop, accumulating the adversarial gradient on top of the normal grad
            # fgm.restore()  # restore the embedding parameters
            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                # scheduler.step()
                optimizer.step()
                global_step += 1
            if (step + 1) % (args.eval_steps *
                             args.gradient_accumulation_steps) == 0:
                # Reset the running-loss counters at each report interval.
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info(" %s = %s", 'global_step', str(global_step))
                logger.info(" %s = %s", 'train loss', str(train_loss))
            if args.do_eval and (step + 1) % (
                    args.eval_steps * args.gradient_accumulation_steps) == 0:
                if args.do_eval_train:
                    file_list = ['train.csv', 'dev.csv']
                else:
                    file_list = ['dev.csv']
                # NOTE(review): the loop below always iterates dev_loader /
                # dev_data regardless of `file`, so 'train.csv' is never
                # actually evaluated — confirm whether that is intended.
                for file in file_list:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []  # NOTE(review): collected nowhere; unused.
                    logger.info("***** Running evaluation *****")
                    logger.info(" Num examples = %d", len(dev_data))
                    logger.info(" Batch size = %d", args.eval_batch_size)
                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    # NOTE(review): eval batches are not moved to args.device
                    # here (unlike the training loop) — on GPU this likely
                    # fails unless the Dataset already yields device tensors.
                    for q1, q1_lens, q2, q2_lens, labels in tqdm(dev_loader):
                        with torch.no_grad():
                            logits, probs = model(q1, q1_lens, q2, q2_lens)
                        probs = probs.detach().cpu().numpy()
                        # print(logits.shape, probs.shape)
                        # label_ids = labels.to('cpu').numpy()
                        inference_labels.append(np.argmax(probs, 1))
                        gold_labels.append(labels)
                        # eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += logits.size(0)
                        nb_eval_steps += 1
                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_labels = np.concatenate(inference_labels, 0)
                    model.train()
                    # eval_loss is never accumulated (the line above is
                    # commented out), so this stays 0.
                    eval_loss = eval_loss / nb_eval_steps
                    # Despite the name, this is an F1 score.
                    eval_accuracy = get_f1(inference_labels, gold_labels)
                    result = {
                        # 'eval_loss': eval_loss,
                        'eval_accuracy': eval_accuracy,
                        'global_step': global_step,
                        'train_loss': train_loss
                    }
                    if 'dev' in file:
                        with open(output_eval_file, "a") as writer:
                            writer.write(file + '\n')
                            for key in sorted(result.keys()):
                                logger.info(" %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" % (key, str(result[key])))
                            writer.write('*' * 80)
                            writer.write('\n')
                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("Best ACC", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model, 'module') else model  # Only save the model it-self
                        output_model_file = os.path.join(
                            output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
        with open(output_eval_file, "a") as writer:
            writer.write('bert_acc: %f' % best_acc)
    if args.do_test:
        # Without a training run, the model is loaded straight from the
        # user-provided output_dir instead of the run-specific one.
        if args.do_train == False:
            output_dir = args.output_dir
        # build model
        logger.info("*** building model ***")
        embeddings = load_embeddings(args)
        model = ESIM(args.hidden_size,
                     embeddings=embeddings,
                     dropout=args.dropout,
                     num_classes=args.num_classes,
                     device=args.device)
        model.load_state_dict(
            torch.load(os.path.join(output_dir, 'pytorch_model.bin')))
        model.to(args.device)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        inference_labels = []
        gold_labels = []
        logger.info("*** Loading testing data ***")
        dev_data = ATEC_Dataset(os.path.join(args.data_dir, 'test.csv'),
                                os.path.join(args.data_dir, 'vocab.csv'),
                                args.query_maxlen)
        dev_loader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=args.eval_batch_size)
        logger.info(" *** Run Prediction ***")
        logger.info(" Num examples = %d", len(dev_data))
        logger.info(" Batch size = %d", args.eval_batch_size)
        model.eval()
        # NOTE(review): as in the eval loop above, test batches are not
        # moved to args.device — confirm before running on GPU.
        for q1, q1_lens, q2, q2_lens, labels in tqdm(dev_loader):
            with torch.no_grad():
                logits, probs = model(q1, q1_lens, q2, q2_lens)
            probs = probs.detach().cpu().numpy()
            inference_labels.append(np.argmax(probs, 1))
            gold_labels.append(labels)
        gold_labels = np.concatenate(gold_labels, 0)
        logits = np.concatenate(inference_labels, 0)
        test_f1 = get_f1(logits, gold_labels)
        logger.info('predict f1:{}'.format(str(test_f1)))
random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.device != -1: torch.cuda.manual_seed(args.seed) logger = get_logger() logger.info(pprint.pformat(vars(args))) dataset_cls, embedding, train_loader, test_loader, dev_loader \ = DatasetFactory.get_dataset(args.dataset, args.word_vectors_dir, args.word_vectors_file, args.batch_size, args.device) filter_widths = list(range(1, args.max_window_size + 1)) + [np.inf] ext_feats = dataset_cls.EXT_FEATS if args.sparse_features else 0 model = ESIM(embedding_size=args.word_vectors_dim, device=device, num_units=args.word_vectors_dim, num_classes=dataset_cls.NUM_CLASSES, dropout=args.dropout, max_sentence_length=args.maxlen) model = model.to(device) embedding = embedding.to(device) optimizer = None if args.optimizer == 'adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.regularization, eps=args.epsilon) elif args.optimizer == 'sgd': optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.regularization) else: raise ValueError('optimizer not recognized: it should be either adam or sgd') train_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, train_loader, args.batch_size, args.device) test_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, test_loader, args.batch_size,
def main(train_file, valid_file, embeddings_file, target_dir,
         hidden_size=300, dropout=0.5, num_classes=3, epochs=50,
         batch_size=32, lr=0.0004, patience=5, max_grad_norm=10.0,
         checkpoint=None):
    """
    Train the ESIM model on the SNLI dataset.

    Args:
        train_file: A path to some preprocessed data that must be used
            to train the model.
        valid_file: A path to some preprocessed data that must be used
            to validate the model.
        embeddings_file: A path to some preprocessed word embeddings that
            must be used to initialise the model.
        target_dir: The path to a directory where the trained model must
            be saved.
        hidden_size: The size of the hidden layers in the model. Defaults
            to 300.
        dropout: The dropout rate to use in the model. Defaults to 0.5.
        num_classes: The number of classes in the output of the model.
            Defaults to 3.
        epochs: The maximum number of epochs for training. Defaults to 50.
        batch_size: The size of the batches for training. Defaults to 32.
        lr: The learning rate for the optimizer. Defaults to 0.0004.
        patience: The patience to use for early stopping. Defaults to 5.
        max_grad_norm: Gradient clipping norm. Defaults to 10.0.
        checkpoint: A checkpoint from which to continue training. If None,
            training starts from scratch. Defaults to None.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(20 * "=", " Preparing for training ", 20 * "=")
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(train_file, "rb") as pkl:
        train_data = NLIDataset(pickle.load(pkl))
    print("Training data length: ", len(train_data))
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    with open(valid_file, "rb") as pkl:
        valid_data = NLIDataset(pickle.load(pkl))
    print("Validation data length: ", len(valid_data))
    valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    with open(embeddings_file, "rb") as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float)\
            .to(device)
    print(embeddings.size())
    esim_model = ESIM(embeddings.shape[0], embeddings.shape[1], hidden_size,
                      embeddings, dropout, num_classes, device).to(device)
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # BUG FIX: the optimizer previously referenced an undefined name
    # 'l_rate' (NameError at runtime); the function parameter is 'lr'.
    optimizer = optim.Adam(esim_model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.5,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(esim_model,
                                                       train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
def main():
    """
    Train the ESIM model, with validation, LR scheduling, early stopping
    and best-model checkpointing. Configuration is read from the
    module-level `args` namespace.
    """
    device = args.device
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are stored.
    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)
    # -------------------- Data loading ------------------- #
    print("Loading data......")
    train_loader, dev_loader, test_loader, SEN1, SEN2 = load_data(
        args.batch_size, device)
    embedding = SEN1.vectors
    vocab_size = len(embedding)
    print("vocab_size:", vocab_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    model = ESIM(args.hidden_size,
                 embedding=embedding,
                 dropout=args.dropout,
                 num_labels=args.num_classes,
                 device=device).to(device)
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Only optimize parameters that require gradients.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=args.lr)
    # BUG FIX: the scheduler is stepped with validation *accuracy* below,
    # so it must monitor in "max" mode. The original "min" mode would decay
    # the learning rate every time accuracy improved.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.1,
                                                           patience=10)
    best_score = 0.0
    # Resume model/optimizer state from a checkpoint if one is configured.
    if args.ckp:
        checkpoint = torch.load(os.path.join(args.target_dir, args.ckp))
        best_score = checkpoint["best_score"]
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
    _, valid_loss, valid_accuracy = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device), 20 * "=")
    patience_counter = 0
    for epoch in range(args.num_epoch):
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       args.max_grad_norm,
                                                       device)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(
            model, dev_loader, criterion, device)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            print("save model!!!!")
            best_score = epoch_accuracy
            patience_counter = 0
            torch.save(
                {
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                }, os.path.join(args.target_dir, "best.pth.tar"))
        if patience_counter >= 5:
            print("-> Early stopping: patience limit reached, stopping...")
            break
    # (A commented-out test-inference section was removed here; the test
    # path lives behind the saved "best.pth.tar" checkpoint.)
    # Drop references to the data once training is done.
    del train_loader
    del dev_loader
    del test_loader
    del SEN1
    del SEN2
    del embedding
def main(args):
    """Train an ESIM model on the LCQMC data described by ``args``.

    Builds the train/dev loaders, optionally resumes from ``args.checkpoint``,
    then runs the train/validate loop with LR scheduling and early stopping
    on validation accuracy. Saves both a rolling per-epoch checkpoint and the
    best checkpoint under ``args.target_dir``.
    """
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are saved.
    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = LCQMC_dataset(args.train_file, args.vocab_file,
                               args.max_length, test_flag=False)
    train_loader = DataLoader(train_data,
                              batch_size=args.batch_size,
                              shuffle=True)
    print("\t* Loading valid data...")
    dev_data = LCQMC_dataset(args.dev_file, args.vocab_file, args.max_length,
                             test_flag=False)
    dev_loader = DataLoader(dev_data, batch_size=args.batch_size, shuffle=True)

    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = load_embeddings(args.embed_file)
    model = ESIM(args, embeddings=embeddings).to(args.device)

    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()  # cross-entropy loss
    # Only optimize parameters that require gradients.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=args.lr)  # optimizer
    # LR schedule: decay when validation accuracy plateaus.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='max',
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if args.checkpoint:
        # Load checkpoint data so training continues where it left off.
        checkpoints = torch.load(args.checkpoint)
        start_epoch = checkpoints["epoch"] + 1
        best_score = checkpoints["best_score"]
        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))
        model.load_state_dict(checkpoints["model"])  # model weights
        optimizer.load_state_dict(checkpoints["optimizer"])
        epochs_count = checkpoints["epochs_count"]
        train_losses = checkpoints["train_losses"]
        valid_losses = checkpoints["valid_losses"]
        # Only compute the pre-training validation metrics when resuming
        # from a previously saved checkpoint.
        _, valid_loss, valid_accuracy, auc = validate(model, dev_loader,
                                                      criterion)
        print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
              .format(valid_loss, (valid_accuracy * 100), auc))

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=",
          "Training ESIM model on device: {}".format(args.device), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, args.epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch,
                                                       args.max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        # FIX: validate on the dev set; the original mistakenly passed
        # ``train_loader`` here, so "validation" metrics came from training
        # data and misled both early stopping and the LR scheduler.
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100),
                      epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best model. FIX: include the optimizer state, since
            # the resume path above reads checkpoints["optimizer"] and would
            # otherwise fail with a KeyError on this file.
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses},
                os.path.join(args.target_dir, "new_best.pth.tar"))
        # Save the model at each epoch (optional rolling checkpoint).
        torch.save(
            {
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,
                "optimizer": optimizer.state_dict(),
                "epochs_count": epochs_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses},
            os.path.join(args.target_dir, "new_esim_{}.pth.tar".format(epoch)))
        if patience_counter >= args.patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
if torch.cuda.is_available(): if not args.cuda: print("WARNING: You have a CUDA device, so you should probably run with --cuda") else: torch.cuda.manual_seed(args.seed) # Load Dictionary assert os.path.exists(args.train_data) assert os.path.exists(args.val_data) dictionary = Dictionary(join_path(data_dir,'data/atec_nlp_sim_train.csv')) args.vocab_size = len(dictionary) best_val_loss = None best_f1 = None n_token = len(dictionary) model = ESIM(args) if torch.cuda.is_available(): model = model.cuda() print(model) print('Begin to load data.') train_data = MyDataset(args.train_data, args.sequence_length, dictionary.word2idx, args.char_model) val_data = MyDataset(args.val_data, args.sequence_length, dictionary.word2idx, args.char_model) train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=16) val_loader = DataLoader(val_data, batch_size=1, shuffle=False) try: for epoch in range(args.epochs): train(train_loader, val_loader, model, args) except KeyboardInterrupt: print('-' * 89) print('Exit from training early.')
# train_hypo = arrayToTensor(h_trian) # eval_prem = arrayToTensor(p_eval) # eval_hypo = arrayToTensor(p_eval) # 生成数据集 train_dataset = tf.data.Dataset.from_tensor_slices((p_train, h_trian, y_trian)) eval_dataset = tf.data.Dataset.from_tensor_slices((p_eval, h_eval, y_eval)) # 分成多个batch train_dataset = train_dataset.shuffle(len(p_train)).batch(args.batch_size, drop_remainder=True) eval_dataset = eval_dataset.shuffle(len(p_eval)).batch(args.batch_size, drop_remainder=True) # 载入模型 model = ESIM() # 初始化优化器 optimizer = tf.keras.optimizers.Adam(args.lr) # 对于文本匹配的模型,使用二元交叉熵和二元准确率评价函数 train_metric = tf.keras.metrics.SparseCategoricalAccuracy( name='train_accuracy') # loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) train_loss = tf.keras.metrics.SparseCategoricalCrossentropy(name='train_loss') # 初始化模型保存机制 ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model) ckpt.restore(tf.train.latest_checkpoint(args.model_path)) ckpt_manager = tf.train.CheckpointManager(ckpt, args.model_path,
def model_train_validate_test(train_df, dev_df, test_df, embeddings_file,
                              vocab_file, target_dir, mode,
                              num_labels=2,
                              max_length=50,
                              hidden_size=200,
                              dropout=0.2,
                              epochs=50,
                              batch_size=256,
                              lr=0.0005,
                              patience=5,
                              max_grad_norm=10.0,
                              gpu_index=0,
                              if_save_model=False,
                              checkpoint=None):
    """Train an ESIM model, validating every epoch and predicting on test.

    Args:
        train_df / dev_df / test_df: DataFrames with train/dev/test samples.
        embeddings_file: Path to pre-trained embeddings, or None to let the
            model initialize its own embedding table.
        vocab_file: Vocabulary file consumed by My_Dataset.
        target_dir: Directory for the best checkpoint and test predictions.
        mode: Dataset mode forwarded to My_Dataset.
        num_labels, max_length, hidden_size, dropout: Model hyper-parameters.
        epochs, batch_size, lr, patience, max_grad_norm: Training settings;
            early stopping triggers after `patience` non-improving epochs.
        gpu_index: CUDA device index used when a GPU is available.
        if_save_model: Whether to save the best checkpoint each improvement.
        checkpoint: Optional checkpoint path to resume training from.
    """
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are saved.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = My_Dataset(train_df, vocab_file, max_length, mode)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = My_Dataset(dev_df, vocab_file, max_length, mode)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    if (embeddings_file is not None):
        embeddings = load_embeddings(embeddings_file)
    else:
        embeddings = None
    model = ESIM(hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_labels=num_labels,
                 device=device).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print(f'{total_trainable_params:,} training parameters.')

    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Only optimize parameters that require gradients.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # LR schedule: decay when validation accuracy plateaus.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, _, = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, _, = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            if (if_save_model):
                # FIX: include the optimizer state, since the resume path
                # above reads checkpoint["optimizer"] and would otherwise
                # fail with a KeyError when restarting from this file.
                torch.save(
                    {
                        "epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "optimizer": optimizer.state_dict(),
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses
                    },
                    os.path.join(target_dir, "best.pth.tar"))
                print("save model successfully!\n")
            print("* Test for epoch {}:".format(epoch))
            _, _, test_accuracy, predictions = validate(
                model, test_loader, criterion)
            # FIX: scale to percent like every other accuracy print; the
            # original printed the raw fraction followed by a "%" sign.
            print("Test accuracy: {:.4f}%\n".format(test_accuracy * 100))
            test_prediction = pd.DataFrame({'prediction': predictions})
            test_prediction.to_csv(os.path.join(target_dir,
                                                "test_prediction.csv"),
                                   index=False)
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
seq2_list = [] target_list = [] with open(url, "r", encoding="utf8") as f: for item in jsonlines.Reader(f): seq1_list.append(item["sentence1"]) seq2_list.append(item["sentence2"]) target_list.append(item["gold_label"]) examples = [data.Example.fromlist(list(item), fields) for item in zip(seq1_list, seq2_list, target_list)] super(MyDataset, self).__init__(examples, fields, **kwargs) if __name__ == "__main__": args = parser.parse_args() seq_field = data.Field(lower=True) target_field = data.Field(unk_token=None, pad_token=None) logging.info("Start prepare dataset") train_dataset = MyDataset(args.train_url, seq_field, target_field) valid_dataset = MyDataset(args.valid_url, seq_field, target_field) seq_field.build_vocab(train_dataset, valid_dataset) target_field.build_vocab(train_dataset, valid_dataset) train_iter, valid_iter = data.Iterator.splits((train_dataset, valid_dataset), batch_sizes=(args.batch_size, args.batch_size), repeat=False) args.class_num = len(target_field.vocab) args.embed_num = len(seq_field.vocab) logging.debug(target_field.vocab.stoi) logging.info("Success") model = ESIM(args) train.train(train_iter, valid_iter, model, args)
def main(test_q1_file, test_q2_file, test_labels_file, pretrained_file,
         gpu_index=0, batch_size=64):
    """
    Test the ESIM model with pretrained weights on some dataset.

    Args:
        test_q1_file: Path to a .npy file with the first questions.
        test_q2_file: Path to a .npy file with the second questions.
        test_labels_file: Path to a .npy file with the test labels.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script; the model hyper-parameters (vocab size,
            embedding dim, hidden size, number of classes) are recovered
            from the shapes of the saved weights.
        gpu_index: Index of the GPU to use when CUDA is available.
            Defaults to 0.
        batch_size: The size of the batches used for testing.
            Defaults to 64.
    """
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        # FIX: map the checkpoint onto the device selected above; the
        # hard-coded "cuda:0" crashed on machines without a GPU.
        checkpoint = torch.load(pretrained_file, map_location=device)

    # Retrieving model parameters from checkpoint.
    vocab_size = checkpoint["model"]["word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]['word_embedding.weight'].size(1)
    hidden_size = checkpoint["model"]["projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["classification.6.weight"].size(0)

    print("\t* Loading test data...")
    test_q1 = np.load(test_q1_file)
    test_q2 = np.load(test_q2_file)
    test_labels = np.load(test_labels_file)
    # test_labels = label_transformer(test_labels)
    test_data = {"q1": test_q1, "q2": test_q2, "labels": test_labels}
    test_data = QQPDataset(test_data)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)
    model.load_state_dict(checkpoint["model"])

    print(20 * "=", " Testing ESIM model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy = test(model, test_loader)
    print()
    print(
        "-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%"
        .format(batch_time, total_time, (accuracy * 100)))
    print()
def main():
    """Entry point: parse CLI arguments, build the requested model
    (BERT / BOW / decomposable attention / ESIM) for the chosen NLI task,
    then run training and/or evaluation as requested."""
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument('--data_dir', default=None, type=str, required=True,
                        help="The input data dir.")
    parser.add_argument(
        '--model_type', default=None, type=str, required=True,
        help="Model type selected in [bert, xlnet, xlm, bow, decom_att]")
    parser.add_argument(
        '--model_name_or_path', default='bert-base-uncased', type=str,
        help="Shortcut name is selected in [bert-base-uncased, ]")
    parser.add_argument('--task_name', default='snli', type=str,
                        help="The name of task is selected in [snli]")
    parser.add_argument(
        '--output_dir', default='../out', type=str,
        help="The output directory where the model predictions and checkpoints will be written.")
    # other parameters
    parser.add_argument("--cache_dir", default='../cache', type=str,
                        help="Store the cache files.")
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after tokenization.")
    parser.add_argument("--batch_size", default=32, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm. Avoiding over-fitting.")
    parser.add_argument("--num_train_epochs", default=60, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--seed", default=42, type=int,
                        help="Random seed for initializaiton.")
    parser.add_argument("--train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--eval", action='store_true',
                        help="Whether to run eval on dev set.")
    parser.add_argument("--ckpt", default=-1, type=int,
                        help="Which ckpt to load.")
    parser.add_argument("--from_scratch", action='store_true',
                        help="Whether to train from scratch.")
    parser.add_argument("--train_type", default='normal', type=str,
                        help="Train type is selected in [normal, rs].")
    args = parser.parse_args()

    if not os.path.exists(args.data_dir):
        raise ValueError("input data dir is not exist.")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.device = device
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning("model type: %s, task name: %s, device: %s, ",
                   args.model_type, args.task_name, device)
    # set seed
    set_seed(args)
    # Prepare task
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % args.task_name)
    task_class = processors[args.task_name]()
    label_list = task_class.get_labels()
    num_labels = len(label_list)
    args.num_labels = num_labels

    # load vocab. Non-BERT models use a cached vocabulary + embedding matrix;
    # the cache is built from GloVe vectors on first use.
    if args.model_type != 'bert':
        if os.path.exists(args.cache_dir +
                          '/{}_vocab_train.pkl'.format(args.task_name)):
            with open(
                    args.cache_dir +
                    '/{}_vocab_train.pkl'.format(args.task_name), 'rb') as f:
                vocab = pickle.load(f)
            index2word = vocab['index2word']
            word2index = vocab['word2index']
            word_mat = vocab['word_mat']
        else:
            # No cache yet: build the vocabulary from the GloVe file.
            glove_path = '../data/glove/glove.840B.300d.txt'
            index2word, word2index, word_mat = load_vocab(args, glove_path)
        args.word_mat = word_mat
        args.vocab_size = len(index2word)

    # load model.
    model = None
    if args.model_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                                  do_lower_case=True)
        args.vocab_size = tokenizer.vocab_size
        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=num_labels,
                                            finetuning_task=args.task_name)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
    elif args.model_type == 'bow':
        args.embed_size = 300
        args.hidden_size = 100
        model = BOWModel(word_mat,
                         n_vocab=args.vocab_size,
                         embed_size=args.embed_size,
                         hidden_size=args.hidden_size,
                         num_classes=args.num_labels)
    elif args.model_type == 'decom_att':
        # No using
        args.embed_size = 300
        args.hidden_size = 100
        model = DecompAttentionModel(word_mat,
                                     n_vocab=args.vocab_size,
                                     embed_size=args.embed_size,
                                     hidden_size=args.hidden_size,
                                     num_classes=args.num_labels)
    elif args.model_type == 'esim':
        args.embed_size = 300
        args.hidden_size = 100
        model = ESIM(vocab_size=args.vocab_size,
                     embedding_dim=args.embed_size,
                     hidden_size=args.hidden_size,
                     embeddings=torch.tensor(word_mat).float(),
                     padding_idx=0,
                     dropout=0.1,
                     num_classes=args.num_labels,
                     device=args.device)
    else:
        raise ValueError('model type is not found!')
    model.to(device)
    logger.info("Training/evaluation parameters %s", args)
    # Create output directory if needed
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Create cache directory if needed
    if not os.path.exists(args.cache_dir):
        os.makedirs(args.cache_dir)

    # Datasets: BERT models use the BERT-tokenized loader, everything else
    # the word-index loader.
    train_dataset = None
    eval_dataset = None
    test_dataset = None
    if args.train:
        train_dataset = load_and_cache_normal_example(
            args, word2index, 'train') if args.model_type not in [
                'bert'
            ] else load_and_cache_bert_example(args, tokenizer, 'train')
        eval_dataset = load_and_cache_normal_example(
            args, word2index, 'eval') if args.model_type not in [
                'bert'
            ] else load_and_cache_bert_example(args, tokenizer, 'eval')
    if args.eval:
        test_dataset = load_and_cache_normal_example(
            args, word2index, 'test') if args.model_type not in [
                'bert'
            ] else load_and_cache_bert_example(args, tokenizer, 'test')

    # Training
    if args.train:
        if args.from_scratch:  # default False
            global_step, train_loss = normal_train(args, model, train_dataset,
                                                   eval_dataset)
        else:
            # Resume from the newest checkpoint (or the one requested with
            # --ckpt), then keep training from the following epoch.
            if args.ckpt < 0:
                checkpoints = glob.glob(args.output_dir +
                                        '/normal_{}_{}_checkpoint-*'.format(
                                            args.task_name, args.model_type))
                checkpoints.sort(key=lambda x: int(x.split('-')[-1]))
                checkpoint = checkpoints[-1]
                ckpt = int(checkpoint.split('-')[-1])
            else:
                checkpoint = os.path.join(
                    args.output_dir,
                    'normal_{}_{}_checkpoint-{}'.format(
                        args.task_name, args.model_type, args.ckpt))
                ckpt = args.ckpt
            model = load(args, checkpoint)
            print("Load model from {}".format(checkpoint))
            global_step, train_loss = normal_train(args, model, train_dataset,
                                                   eval_dataset, ckpt + 1)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    train_loss)

    # Evaluation
    if args.eval:
        # Pick the newest checkpoint unless one is requested explicitly.
        if args.ckpt < 0:
            checkpoints = glob.glob(
                args.output_dir + '/{}_{}_{}_checkpoint-*'.format(
                    args.train_type, args.task_name, args.model_type))
            checkpoints.sort(key=lambda x: int(x.split('-')[-1]))
            checkpoint = checkpoints[-1]
        else:
            checkpoint = os.path.join(
                args.output_dir,
                '{}_{}_{}_checkpoint-{}'.format(args.train_type,
                                                args.task_name,
                                                args.model_type, args.ckpt))
        model = load(args, checkpoint)
        print("Evaluation result, load model from {}".format(checkpoint))
        acc = evaluate(args, model, test_dataset)
        print("acc={:.4f}".format(acc))
def main():
    """Parse CLI arguments, prepare the data and train an ESIM model.

    The training file is split into train/dev according to ``--ratio``.
    Training can resume from ``--checkpoint`` (a full pickled model saved by
    ``torch.save(model, ...)``); the starting epoch is recovered from the
    checkpoint filename, which has the form ``{epoch}_{step}.pt``.
    """
    parser = ArgumentParser()
    parser.add_argument("--epoch", type=int, required=True)
    parser.add_argument("--seed", type=int, required=True)
    parser.add_argument("--emb_file", type=str, required=True)
    parser.add_argument("--checkpoint", type=str, required=True)
    parser.add_argument("--save_dir", type=str, required=True)
    parser.add_argument("--train_file", type=str, required=True)
    parser.add_argument("--log_file", type=str, required=False)
    parser.add_argument("--ratio", type=str, required=True)
    parser.add_argument("--vocab_size", type=int, required=True)
    parser.add_argument("--emb_size", type=int, required=True)
    parser.add_argument("--learning_rate", type=float, required=True)
    parser.add_argument("--batch_size", type=int, required=True)
    parser.add_argument("--max_length", type=int, required=True)
    parser.add_argument("--max_grad_norm", type=int, required=True)
    args = parser.parse_args()

    # "0.9,0.1"-style train/dev split ratio.
    split_ratio = [float(val) for val in args.ratio.split(",")]
    has_cuda = torch.cuda.is_available()
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
    DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
    logging.basicConfig(filename=args.log_file,
                        level=logging.INFO,
                        format=LOG_FORMAT,
                        datefmt=DATE_FORMAT)

    logging.info("start preparing data")
    data_preprocessor = DataPreprocess()
    emb, word_idx_map = data_preprocessor.build_emb_vocab(args.emb_file)
    data_preprocessor.load(args.train_file, use_mask=False, is_test=False)
    train_dataset, dev_dataset = data_preprocessor.generate_train_dev_dataset(
        ratio=split_ratio)
    train_dataset, dev_dataset = CompDataSet(
        train_dataset,
        word_idx_map,
        max_len=args.max_length,
        emb_size=args.emb_size), CompDataSet(dev_dataset,
                                             word_idx_map,
                                             max_len=args.max_length,
                                             emb_size=args.emb_size)
    train_dataset = DataLoader(train_dataset,
                               batch_size=args.batch_size,
                               shuffle=True)
    dev_dataset = DataLoader(dev_dataset,
                             batch_size=args.batch_size,
                             shuffle=True)

    logging.info("init model")
    start_epoch = 0
    if args.checkpoint:
        model = torch.load(args.checkpoint)
        # FIX: use a raw string and escape the dot. The original
        # "\d+(?=\_\d+.pt)" relied on the invalid escape "\_"
        # (a DeprecationWarning, and a SyntaxError in future Python) and
        # let "." match any character instead of a literal dot.
        start_epoch = re.findall(r"\d+(?=_\d+\.pt)", args.checkpoint)
        start_epoch = int(start_epoch[0]) + 1
    else:
        model = ESIM(args.vocab_size,
                     args.emb_size,
                     emb,
                     max_len=args.max_length)
    # NOTE(review): the optimizer is created fresh here even when resuming,
    # so Adam moment estimates are not restored — confirm this is intended.
    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    criterion = FocalLoss()
    if has_cuda:
        model = model.cuda()

    logging.info("start training")
    neg_auc, pos_auc = validate(model, dev_dataset)
    logging.info(f"pre-train neg_auc {str(neg_auc)} pos_auc {str(pos_auc)}")
    for epoch in range(start_epoch, args.epoch):
        running_loss = 0.0
        for step, data in enumerate(train_dataset):
            model.train()
            start_time = time.time()
            optimizer.zero_grad()
            outputs = model(data["premise"], data["premise_mask"],
                            data["hypothese"], data["hypothese_mask"])
            loss = criterion(outputs["probs"], data["label"])
            loss.backward()
            clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            end_time = time.time()
            running_loss += loss.item()
            # Log the average loss every 100 steps.
            if step % 100 == 99:
                logging.info(
                    f"epoch: {epoch}, step: {step}, time: {end_time - start_time} loss: {running_loss / 100}"
                )
                running_loss = 0
            # Validate and checkpoint every 500 steps.
            if step % 500 == 499:
                neg_auc, pos_auc = validate(model, dev_dataset)
                logging.info(
                    f"pre-train neg_auc {str(neg_auc)} pos_auc {str(pos_auc)}")
                torch.save(model, Path(args.save_dir) / f"{epoch}_{step}.pt")
HIDDEN_DIM = 100 LINEAR_SIZE = 200 DROPOUT = 0.5 BATCH_SIZE = 128 device = torch.device('cuda') train_iterator, valid_iterator = data.BucketIterator.splits( (train_data, valid_data), batch_size=BATCH_SIZE, sort_key=lambda x: len(x.text1), device=device, shuffle=True) pretrained_embeddings = TEXT.vocab.vectors model = ESIM(pretrained_embeddings, VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, LINEAR_SIZE, DROPOUT) optimizer = optim.Adam(model.parameters()) criterion = FocalLoss(2) model = model.to(device) criterion = criterion.to(device) def categorical_accuracy(preds, y): """ Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8 """ max_preds = preds.argmax( dim=1, keepdim=True) # get the index of the max probability correct = max_preds.squeeze(1).eq(y) return correct.sum() / torch.FloatTensor([y.shape[0]])
def main():
    """Entry point: parse CLI arguments, load the trained model for the
    chosen task, then run randomized-smoothing evaluation (synonym
    perturbation sets + randomized test data)."""
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of task is selected in [imdb, amazon]")
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir.")
    parser.add_argument("--cache_dir", default='../cache', type=str,
                        help="The cache data dir.")
    parser.add_argument(
        '--model_type', default=None, type=str, required=True,
        help="Model type selected in [bert, xlnet, xlm, cnn, lstm]")
    parser.add_argument(
        '--model_name_or_path', default='bert-base-uncased', type=str,
        help="Shortcut name is selected in [bert-base-uncased, ]")
    parser.add_argument(
        '--output_dir', default='../out', type=str,
        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument(
        "--skip", default=20, type=int,
        help="Evaluate one testing point every skip testing point.")
    parser.add_argument("--num_random_sample", default=5000, type=int,
                        help="The number of random samples of each texts.")
    parser.add_argument("--similarity_threshold", default=0.8, type=float,
                        help="The similarity constraint to be "
                        "considered as synonym.")
    parser.add_argument("--perturbation_constraint", default=100, type=int,
                        help="The maximum size of perturbation "
                        "set of each word.")
    parser.add_argument(
        "--mc_error", default=0.01, type=float,
        help="Monte Carlo Error based on concentration inequality.")
    parser.add_argument("--train_type", default='normal', type=str,
                        help="Train type is selected in [normal, rs].")
    # other parameters
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after tokenization.")
    parser.add_argument("--batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--ckpt", default=-1, type=int,
                        help="Which ckpt to load.")
    parser.add_argument("--seed", default=42, type=int,
                        help="Random seed for initializaiton.")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.device = device
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning("model type: %s, task name: %s, device: %s, train_type: %s",
                   args.model_type, args.task_name, device, args.train_type)
    set_seed(args)
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % args.task_name)
    task_class = processors[args.task_name]()
    label_list = task_class.get_labels()
    num_labels = len(label_list)
    args.num_labels = num_labels

    # load vocab. Non-BERT models read a cached vocabulary + embedding matrix.
    word2index = None
    if args.model_type != 'bert':
        with open(
                args.cache_dir + '/{}_vocab_train.pkl'.format(args.task_name),
                'rb') as f:
            vocab = pickle.load(f)
        index2word = vocab['index2word']
        word2index = vocab['word2index']
        word_mat = vocab['word_mat']
        args.word_mat = word_mat
        args.vocab_size = len(index2word)

    # Build the requested model architecture.
    tokenizer = None
    if args.model_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                                  do_lower_case=True)
        args.vocab_size = tokenizer.vocab_size
        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=num_labels,
                                            finetuning_task=args.task_name)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
    elif args.model_type == 'bow':
        args.embed_size = 300
        args.hidden_size = 100
        model = BOWModel(word_mat,
                         n_vocab=args.vocab_size,
                         embed_size=args.embed_size,
                         hidden_size=args.hidden_size,
                         num_classes=args.num_labels)
    elif args.model_type == 'decom_att':
        # No using
        args.embed_size = 300
        args.hidden_size = 100
        model = DecompAttentionModel(word_mat,
                                     n_vocab=args.vocab_size,
                                     embed_size=args.embed_size,
                                     hidden_size=args.hidden_size,
                                     num_classes=args.num_labels)
    elif args.model_type == 'esim':
        args.embed_size = 300
        args.hidden_size = 100
        model = ESIM(vocab_size=args.vocab_size,
                     embedding_dim=args.embed_size,
                     hidden_size=args.hidden_size,
                     embeddings=torch.tensor(word_mat).float(),
                     padding_idx=0,
                     dropout=0.1,
                     num_classes=args.num_labels,
                     device=args.device)
    else:
        raise ValueError('model type is not found!')
    model.to(device)

    # Pre-computed synonym perturbation sets for randomized smoothing,
    # keyed by similarity threshold and perturbation-set size.
    similarity_threshold = args.similarity_threshold
    perturbation_constraint = args.perturbation_constraint
    perturbation_file = args.cache_dir + '/' + args.task_name + '_perturbation_constraint_pca' + str(
        similarity_threshold) + "_" + str(perturbation_constraint) + '.pkl'
    with open(perturbation_file, 'rb') as f:
        perturb = pickle.load(f)

    # random smooth
    random_smooth = WordSubstitute(perturb)
    # generate randomized data
    randomize_testset(args, random_smooth, similarity_threshold,
                      perturbation_constraint)
    # calculate total variation
    calculate_tv_perturb(args, perturb)

    # Evaluation: pick the newest checkpoint unless one is requested
    # explicitly via --ckpt.
    if args.ckpt < 0:
        checkpoints = glob.glob(
            args.output_dir + '/{}_{}_{}_checkpoint-*'.format(
                args.train_type, args.task_name, args.model_type))
        checkpoints.sort(key=lambda x: int(x.split('-')[-1]))
        checkpoint = checkpoints[-1]
    else:
        checkpoint = os.path.join(
            args.output_dir,
            '{}_{}_{}_checkpoint-{}'.format(args.train_type, args.task_name,
                                            args.model_type, args.ckpt))
    print("Evaluation result, load model from {}".format(checkpoint))
    model = load(args, checkpoint)
    randomized_evaluate(args, model, tokenizer, word2index)
parser.add_argument('--max_len', type=int, default=100) parser.add_argument('--batch_size', type=int, default=32) parser.add_argument('--lr', type=float, default=0.0004) parser.add_argument('--embedding_dim', type=int, default=300) parser.add_argument('--hidden_dim', type=int, default=300) parser.add_argument('--dropout', type=float, default=0.5) parser.add_argument('--weight_decay', type=float, default=0.1) parser.add_argument('--num_epochs', type=int, default=20) parser.add_argument('--model_path', type=str, default='./model/best.bin1') args = parser.parse_args() label2idx = {'entailment': 0, 'neutral': 1, 'contradiction': 2} train_iter, dev_iter, vocab = data_process.load_data(args, device) # 定义模型、优化器、损失函数 net = ESIM(args, vocab) net.to(device) crition = nn.CrossEntropyLoss() optimizer = optim.Adam(net.parameters(), lr=args.lr) #验证集的准确率 def val_test(net, data_iter, crition): acc_sum, loss_sum, n, batch_num = 0.0, 0.0, 0, 0 net.eval() for batch in data_iter: sent1, sent2 = batch.sentence1[0], batch.sentence2[0] mask1 = (sent1 == 1) mask2 = (sent2 == 1) y = batch.label y = y.to(device)
def main(train_q1_file, train_q2_file, train_labels_file, dev_q1_file, dev_q2_file, dev_labels_file, embeddings_file, target_dir, hidden_size=128, dropout=0.5, num_classes=2, epochs=15, batch_size=64, lr=0.001, patience=5, max_grad_norm=10.0, gpu_index=0, checkpoint=None):
    """
    Train the ESIM model on a preprocessed question-pair dataset and save
    checkpoints to 'target_dir'.

    Args:
        train_q1_file, train_q2_file, train_labels_file: Paths to '.npy'
            files with the preprocessed training questions and labels.
        dev_q1_file, dev_q2_file, dev_labels_file: Paths to '.npy' files
            with the preprocessed validation questions and labels.
        embeddings_file: Path to a '.npy' file holding the embedding matrix.
        target_dir: Directory where the learned weights are saved.
        hidden_size: Size of the model's hidden layers. Defaults to 128.
        dropout: Dropout rate to use in the model. Defaults to 0.5.
        num_classes: Number of classes in the model output. Defaults to 2.
        epochs: Maximum number of training epochs. Defaults to 15.
        batch_size: Batch size for training and validation. Defaults to 64.
        lr: Learning rate for the Adam optimizer. Defaults to 0.001.
        patience: Number of epochs without validation-accuracy improvement
            tolerated before early stopping. Defaults to 5.
        max_grad_norm: Gradient-clipping threshold forwarded to 'train'.
            Defaults to 10.0.
        gpu_index: Index of the CUDA device to use when one is available.
        checkpoint: Optional path to a checkpoint to resume training from.
    """
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are saved.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_q1 = np.load(train_q1_file)
    train_q2 = np.load(train_q2_file)
    train_labels = np.load(train_labels_file)
    # train_labels = label_transformer(train_labels)
    train_data = {"q1": train_q1, "q2": train_q2, "labels": train_labels}
    train_data = QQPDataset(train_data)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_q1 = np.load(dev_q1_file)
    dev_q2 = np.load(dev_q2_file)
    dev_labels = np.load(dev_labels_file)
    # dev_labels = label_transformer(dev_labels)
    dev_data = {"q1": dev_q1, "q2": dev_q2, "labels": dev_labels}
    dev_data = QQPDataset(dev_data)
    # NOTE(review): shuffling the validation loader does not change the
    # reported metrics, but is unusual — confirm it is intentional.
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = torch.tensor(np.load(embeddings_file),
                              dtype=torch.float).to(device)
    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 device=device).to(device)
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Shrink the learning rate when validation accuracy stops improving
    # (mode="max" because the tracked quantity is an accuracy).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best model. The optimizer is not saved to avoid having
            # a checkpoint file that is too heavy to be shared. To resume
            # training from the best model, use the 'esim_*.pth.tar'
            # checkpoints instead.
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses
                }, os.path.join(target_dir, "best.pth.tar"))
        # Save the model at each epoch.
        torch.save(
            {
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,
                "optimizer": optimizer.state_dict(),
                "epochs_count": epochs_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses
            }, os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
def main(test_file, pretrained_file, embeddings_file, batch_size=1):
    """
    Load a pretrained ESIM model and dump the sentence representations it
    produces on a preprocessed NLI dataset to 'test_nv_repr.pickle'.

    Args:
        test_file: Path to a pickle file containing preprocessed NLI data.
        pretrained_file: Path to a checkpoint produced by the 'train_model'
            script.
        embeddings_file: Path to a pickle file holding the embedding matrix
            used to build the model.
        batch_size: Size of the batches used when running the model.
            Defaults to 1.
    """
    debug_file = open('test_debug.txt', 'w')
    try:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        #device = torch.device("cpu")
        print(20 * "=", " Preparing for generating representations ", 20 * "=")
        checkpoint = torch.load(pretrained_file)
        # Retrieving model parameters from checkpoint. The embedding shape is
        # taken from 'embeddings_file' below, so only the projection and
        # classification sizes are read here.
        hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
        num_classes = checkpoint["model"]["_classification.4.weight"].size(0)
        print("\t* Loading the data...")
        with open(test_file, "rb") as pkl:
            test_data = NLIDataset(pickle.load(pkl))
        print(test_data, file=debug_file)
        # No shuffling: representations must stay aligned with the dataset.
        test_loader = DataLoader(test_data, shuffle=False,
                                 batch_size=batch_size)
        print("\t* Building model...")
        # Loading the embedding weights separately; the context manager
        # guarantees the file is closed even on failure.
        with open(embeddings_file, "rb") as pkl:
            embeddings = torch.tensor(pickle.load(pkl),
                                      dtype=torch.float).to(device)
        model = ESIM(embeddings.shape[0],
                     embeddings.shape[1],
                     hidden_size,
                     embeddings=embeddings,
                     num_classes=num_classes,
                     device=device).to(device)
        # Custom state-dict loading: entry 0 of the checkpoint (the word
        # embedding, already supplied via 'embeddings' above) is skipped and
        # every other tensor is copied into the freshly built model.
        pretrained_dict = checkpoint["model"]
        own_state = model.state_dict()
        for i, (name, param) in enumerate(pretrained_dict.items()):
            if i == 0:
                continue
            if isinstance(param, Parameter):
                # backwards compatibility for serialized parameters
                param = param.data
            own_state[name].copy_(param)
        print(
            20 * "=",
            " Loading the representations from ESIM model on device: {} ".format(
                device), 20 * "=")
        batch_time, total_time, save_rep = test(model, test_loader)
        print("-> Average batch processing time: {:.4f}s, total test time:"
              " {:.4f}s,%".format(batch_time, total_time))
        with open('test_save_rep_details.txt', 'w') as file_debug:
            print('len of save_rep is' + str(len(save_rep)), file=file_debug)
            try:
                print('save_rep sample key is' + str(list(save_rep.keys())[0]),
                      file=file_debug)
                print('save_rep sample value is' +
                      str(list(save_rep.values())[0]),
                      file=file_debug)
            except IndexError:
                # 'save_rep' may be empty; sampling one entry is best-effort
                # debug output only, so an empty mapping is not an error.
                pass
    finally:
        debug_file.close()
    # Dump save_rep as a pickle file
    with open('test_nv_repr.pickle', 'wb') as handle:
        pickle.dump(save_rep, handle, protocol=pickle.HIGHEST_PROTOCOL)