def test_esim():
    sent1, sent2, labels = io_utils.read_snli(config.SNLI_VALID)
    embedding = embedding_utils.Embedding(config.EMBEDDING_DATA,
                                          sent1 + sent2,
                                          config.SNLI_MAX_LENGTH)
    label_dict = data_utils.create_label_dict(labels)

    sent1_len = np.array(list(map(embedding.len_transform, sent1)))
    sent2_len = np.array(list(map(embedding.len_transform, sent2)))
    sent1 = np.array(list(map(embedding.text_transform, sent1)))
    sent2 = np.array(list(map(embedding.text_transform, sent2)))
    labels = np.array(list(map(lambda x: label_dict[x], labels)))

    train_set = list(zip(sent1, sent2, sent1_len, sent2_len, labels))
    valid_set = list(zip(sent1, sent2, sent1_len, sent2_len))
    valid = (valid_set, labels)

    kwargs = {
        "num_classes": len(label_dict),
        "vocab_size": embedding.vocab_size,
        "embedding_size": embedding.embedding_dim,
        "seq_len": config.SNLI_MAX_LENGTH,
        "word_embeddings": embedding.embedding,
        "hparams": {
            "num_units": 30,
            "input_dropout": 0.9,
            "output_dropout": 0.9,
            "state_dropout": 0.9,
            "hidden_layers": 1,
            "hidden_units": 50,
            "hidden_dropout": 0.9,
            "lr": 0.001,
            "l2_reg_lambda": 0.0001,
            "batch_size": 256,
            "num_epochs": 20,
        }
    }

    model = ESIM(**kwargs)
    sess = create_session()
    model.init(sess)
    model.fit(sess, train_set, valid)
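# test_esim() above calls create_session(), which is not shown in this snippet.
# A minimal sketch of such a helper, assuming a TF1-style session that only
# allocates GPU memory on demand (the option values are illustrative, not the
# project's actual implementation):
import tensorflow as tf

def create_session():
    """Create a TF1 session that grows its GPU memory allocation as needed."""
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    return tf.Session(config=sess_config)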
def from_config(cls, config, rng):
    """
    Builds dataset, vocabs, embeddings, streams and models from config.

    :param config: `dict` containing configuration options.
    :param rng: random number generator
    :return: `Experiment` object with all experiment components.
    """
    # 0. Load dataset
    dataset = NLIData.from_config(config['dataset'])

    # 1. Load vocabularies
    vocabs = {}
    for name, vocab_config in config['vocabs'].items():
        vocab_config['file_or_data'] = os.path.join(
            dataset.path, vocab_config['file_or_data'])
        vocabs[name] = NLIVocabulary.from_config(config=vocab_config)

    # 2. Load embeddings
    embeddings = {}
    for name, emb_config in config['embeddings'].items():
        emb_config['file'] = os.path.join(EMBEDDINGS_DIR, emb_config['file'])
        embeddings[name] = NLIEmbedding.from_config(config=emb_config,
                                                    rng=rng,
                                                    vocabs=vocabs)

    # 3. Batch transformers
    batch_transformers = []
    for bt_config in config['batch_transformers']:
        bt_config = copy.deepcopy(bt_config)
        if 'vocab' in bt_config:
            bt_config['vocab'] = vocabs.get(bt_config['vocab'])
        transformer = NLITransformer.from_config(bt_config)
        batch_transformers.append(transformer)

    class StreamRegistry(dict):
        def __init__(self):
            super(StreamRegistry, self).__init__()
            self.__dict__ = self

    # 4. Load streams
    streams = StreamRegistry()
    for name in dataset.parts:
        streams[name] = NLIStream.from_config(
            config=config['streams'][name],
            dataset=dataset.part(name),
            rng=rng,
            batch_transformers=batch_transformers)

    # 5. Build model
    model = ESIM.from_config(config=config['model'], embeddings=embeddings)
    logger.info(model.summary())

    return cls(config=config,
               model=model,
               dataset=dataset,
               vocabs=vocabs,
               embeddings=embeddings,
               batch_transformers=batch_transformers,
               streams=streams)
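# For reference, from_config() above expects a nested configuration dict whose
# top-level keys match what the method reads: 'dataset', 'vocabs', 'embeddings',
# 'batch_transformers', 'streams' and 'model'. A hypothetical, heavily abbreviated
# example of that shape (the inner fields are assumptions for illustration, not
# the project's actual schema):
example_config = {
    "dataset": {"name": "snli"},
    "vocabs": {"main": {"file_or_data": "vocab.txt"}},
    "embeddings": {"main": {"file": "glove.840B.300d.txt", "vocab": "main"}},
    "batch_transformers": [{"name": "pad", "vocab": "main"}],
    "streams": {"train": {"batch_size": 32, "shuffle": True},
                "dev": {"batch_size": 32, "shuffle": False}},
    "model": {"dim": 300, "dropout": 0.5},
}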
def main():
    # capture the config path from the run arguments,
    # then process the json configuration file
    try:
        args = get_args()
        config = process_config(args.config)
    except Exception as e:
        print("missing or invalid arguments: %s" % e)
        exit(0)

    # create your data generator
    # data = SemDataGenerator(config)
    # data = MSRPGenerator(config)
    data = ATECGenerator(config)

    if args.step == "build_data":
        print("build data ...")
        data.build_data()
    elif args.step == "train":
        # create the experiment dirs
        create_dirs([config.summary_dir, config.checkpoint_dir])
        # create tensorflow session
        sess = tf.Session()
        # load word2vec
        config.embedding = data.get_trimmed_glove_vectors()
        model = SelfAttESIM(config)
        # create tensorboard logger
        logger = Logger(sess, config)
        # create trainer and pass all the previous components to it
        trainer = SentSemTrainer(sess, model, data, config, logger)
        # load model if it exists
        # model.load(sess)
        # here you train your model
        trainer.train()
    elif args.step == "tune":
        import itertools
        tune_num = 0
        param_names = list(config["parameter_tune"].keys())
        print(param_names)
        cand_params = [
            config["parameter_tune"][pname] for pname in param_names
        ]
        print(cand_params)
        # grid search over every combination of candidate parameter values
        for params in itertools.product(*cand_params):
            print(params)
            for i, param_name in enumerate(param_names):
                config[param_name] = params[i]
            # print(config)
            data = ATECGenerator(config)
            create_dirs([config.summary_dir, config.checkpoint_dir])
            sess = tf.Session()
            config.embedding = data.get_trimmed_glove_vectors()
            model = ESIM(config)
            logger = Logger(sess, config)
            trainer = SentSemTrainer(sess, model, data, config, logger)
            trainer.train()
            tf.reset_default_graph()
    else:
        print("unsupported step!")
    char_embedding_matrix = load_char_embed(bimpm_params['max_features'],
                                            bimpm_params['embed_size'])
    bimpm_params['embedding_matrix'] = char_embedding_matrix
    params = bimpm_params
    backend = BiMPM(params)
elif model_name == "esim":
    esim_params = base_params
    esim_params['mlp_num_layers'] = 1
    esim_params['mlp_num_units'] = 256
    esim_params['mlp_num_fan_out'] = 128
    esim_params['lstm_units'] = 64
    esim_params['dropout_rate'] = 0.3
    esim_params['embed_size'] = 100
    char_embedding_matrix = load_char_embed(esim_params['max_features'],
                                            esim_params['embed_size'])
    esim_params['embedding_matrix'] = char_embedding_matrix
    params = esim_params
    backend = ESIM(params)
elif model_name == "match_pyramid":
    mp_params = base_params
    mp_params['matching_type'] = 'dot'
    mp_params['num_blocks'] = 2
    mp_params['kernel_count'] = [16, 32]
    mp_params['kernel_size'] = [[3, 3], [3, 3]]
    mp_params['pool_size'] = [3, 3]
    mp_params['mlp_num_layers'] = 1
    mp_params['mlp_num_units'] = 128
    mp_params['mlp_num_fan_out'] = 128
    mp_params['embed_size'] = 100
    char_embedding_matrix = load_char_embed(mp_params['max_features'],
                                            mp_params['embed_size'])
    mp_params['embedding_matrix'] = char_embedding_matrix
    params = mp_params
    backend = MatchPyramid(params)
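# load_char_embed(max_features, embed_size) is used above but not defined in this
# snippet. A minimal stand-in, assuming it returns a (max_features, embed_size)
# character embedding matrix (here randomly initialised rather than loaded from
# pretrained vectors):
import numpy as np

def load_char_embed(max_features, embed_size):
    """Return a character embedding matrix of shape (max_features, embed_size)."""
    rng = np.random.RandomState(42)
    matrix = rng.normal(scale=0.1, size=(max_features, embed_size)).astype("float32")
    # Row 0 is conventionally reserved for padding and kept at zero.
    matrix[0] = 0.0
    return matrix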
def predict():
    print('Loading test data ...')
    start_time = time.time()
    # [q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev, q1_test, q2_test, y_test, vocab_size, q1_mask_train,
    #  q2_mask_train, q1_mask_dev, q2_mask_dev, q1_mask_test, q2_mask_test] = load_pkl_set(args.pkl_files, mask=True)
    # del q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev, q1_mask_train, q2_mask_train, q1_mask_dev, q2_mask_dev
    q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev, q1_test, q2_test, y_test, vocab_size = load_pkl_set(
        args.pkl_files)
    del q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev

    # ESIM model init
    model = ESIM(sequence_length=args.max_q_len,
                 num_classes=args.num_classes,
                 embedding_dim=args.embedding_dim,
                 vocab_size=vocab_size,
                 max_length=args.max_q_len,
                 hidden_dim=args.hidden_size,
                 learning_rate=args.learning_rate,
                 optimizer=args.optim)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(session, save_path=save_path)

    print('Testing ...')
    # loss_test, acc_test = evaluate(q1_test, q2_test, y_test, q1_mask_test, q2_mask_test, session, model)
    loss_test, acc_test = evaluate(q1_test, q2_test, y_test, session, model=model)
    print('Test loss:{0:6.2}, Test acc:{1:7.2%}'.format(loss_test, acc_test))

    # test_batches = batch_iter_per_epoch_mask(q1_test, q2_test, q1_mask_test, q2_mask_test, y_test, shuffle=False)
    test_batches = batch_iter_per_epoch(q1_test, q2_test, y_test, shuffle=False)
    all_predictions = []
    all_predict_prob = []
    count = 0
    # for q1_test_batch, q2_test_batch, q1_mask_batch, q2_mask_batch, y_test_batch in test_batches:
    #     batch_predictions, batch_predict_probs = session.run(
    #         [model.predict, model.probs],
    #         feed_dict={
    #             model.input_q1: q1_test_batch,
    #             model.input_q2: q2_test_batch,
    #             model.q1_mask: q1_mask_batch,
    #             model.q2_mask: q2_mask_batch,
    #             model.dropout_keep_prob: 1.0
    #         })
    for q1_test_batch, q2_test_batch, y_test_batch in test_batches:
        batch_predictions, batch_predict_probs = session.run(
            [model.y_pred, model.probs],
            feed_dict={
                model.input_q1: q1_test_batch,
                model.input_q2: q2_test_batch,
                model.dropout_keep_prob: 1.0
            })
        all_predictions = np.concatenate([all_predictions, batch_predictions])
        if count == 0:
            all_predict_prob = batch_predict_probs
        else:
            all_predict_prob = np.concatenate(
                [all_predict_prob, batch_predict_probs])
        count = 1

    y_test = [float(temp) for temp in y_test]

    # Evaluation metrics
    print('Precision, Recall, F1-Score ...')
    print(metrics.classification_report(y_test, all_predictions,
                                        target_names=['not match', 'match']))

    # Confusion matrix
    print('Confusion Matrix ...')
    print(metrics.confusion_matrix(y_test, all_predictions))

    # Write probabilities to csv
    # out_dir = os.path.join(args.save_dir, 'predict_prob_csv')
    # print('Saving evaluation to {0}'.format(out_dir))
    # with open(out_dir, 'w') as f:
    #     csv.writer(f).writerows(all_predict_prob)

    time_dif = get_time_dif(start_time)
    print('Time usage:', time_dif)
def train():
    # Loading data
    print('Loading data ...')
    start_time = time.time()
    # [q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev, q1_test, q2_test, y_test, vocab_size, q1_mask_train,
    #  q2_mask_train, q1_mask_dev, q2_mask_dev, q1_mask_test, q2_mask_test] = load_pkl_set(args.pkl_files, mask=True)
    # del q1_test, q2_test, q1_mask_test, q2_mask_test, y_test
    q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev, q1_test, q2_test, y_test, vocab_size = load_pkl_set(
        args.pkl_files)
    del q1_test, q2_test, y_test
    time_dif = get_time_dif(start_time)
    print('Time usage:', time_dif)

    print('Configuring TensorBoard and Saver ...')
    tensorboard_dir = args.tensorboard_dir
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    # ESIM model init
    model = ESIM(sequence_length=args.max_q_len,
                 num_classes=args.num_classes,
                 embedding_dim=args.embedding_dim,
                 vocab_size=vocab_size,
                 max_length=args.max_q_len,
                 hidden_dim=args.hidden_size,
                 learning_rate=args.learning_rate,
                 optimizer=args.optim)

    tf.summary.scalar('loss', model.loss)
    tf.summary.scalar('accuracy', model.accuracy)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # Configuring Saver
    saver = tf.train.Saver()
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    # Create session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and validating ...')
    start_time = time.time()
    total_batch = 0
    best_acc_dev = 0.0
    last_improved = 0
    require_improvement = 30000  # early stopping threshold (in batches)
    tag = False

    for epoch in range(args.epochs):
        print('Epoch:', epoch + 1)
        # batch_train = batch_iter_per_epoch_mask(q1_train, q2_train,
        #                                         q1_mask_train, q2_mask_train,
        #                                         y_train, args.batch_size)
        batch_train = batch_iter_per_epoch(q1_train, q2_train, y_train,
                                           args.batch_size)
        for q1_batch, q2_batch, y_batch in batch_train:
            feed_dict = feed_data(q1_batch, q2_batch, y_batch,
                                  args.dropout_keep_prob, model=model)
            if total_batch % args.checkpoint_every == 0:
                # write scalar summaries to tensorboard
                summary = session.run(merged_summary, feed_dict)
                writer.add_summary(summary, total_batch)
            if total_batch % args.evaluate_every == 0:
                # report performance on the train and dev sets
                feed_dict[model.dropout_keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.accuracy],
                                                    feed_dict=feed_dict)
                # loss_dev, acc_dev = evaluate(q1_dev, q2_dev, y_dev, q1_mask_dev, q2_mask_dev, session, model)
                loss_dev, acc_dev = evaluate(q1_dev, q2_dev, y_dev, session,
                                             model=model)
                if acc_dev > best_acc_dev:
                    # save best result
                    best_acc_dev = acc_dev
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''
                time_dif = get_time_dif(start_time)
                print('Iter: {0:>6}, Train Loss: {1:6.2}, Train Acc: {2:7.2%}, Val loss:{3:6.2}, '
                      'Val acc:{4:7.2%}, Time:{5}{6}'.format(
                          total_batch, loss_train, acc_train, loss_dev,
                          acc_dev, time_dif, improved_str))
                # restore the training dropout rate before the optimizer step
                feed_dict[model.dropout_keep_prob] = args.dropout_keep_prob

            session.run(model.optimizer, feed_dict)
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # no improvement for a long time, stop training early
                print('No optimization for a long time, auto-stopping ...')
                tag = True
                break
        if tag:  # early stopping
            break
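# Both train() and predict() rely on feed_data() and evaluate(), which live
# elsewhere in the project. A plausible sketch consistent with how they are
# called here; model.input_y and the use of batch_iter_per_epoch inside
# evaluate() are assumptions based on the surrounding code, not the exact
# original helpers:
def feed_data(q1_batch, q2_batch, y_batch, dropout_keep_prob, model):
    """Build the feed dict for one batch."""
    return {
        model.input_q1: q1_batch,
        model.input_q2: q2_batch,
        model.input_y: y_batch,
        model.dropout_keep_prob: dropout_keep_prob,
    }

def evaluate(q1, q2, y, session, model):
    """Run the model over a full split and return the mean loss and accuracy."""
    total_loss, total_acc, total_len = 0.0, 0.0, len(q1)
    for q1_batch, q2_batch, y_batch in batch_iter_per_epoch(q1, q2, y, shuffle=False):
        batch_len = len(q1_batch)
        feed_dict = feed_data(q1_batch, q2_batch, y_batch, 1.0, model=model)
        loss, acc = session.run([model.loss, model.accuracy], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len
    return total_loss / total_len, total_acc / total_len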
def main(train_file,
         valid_file,
         embeddings_file,
         target_dir,
         hidden_size=300,
         dropout=0.5,
         num_classes=3,
         epochs=64,
         batch_size=32,
         lr=0.0004,
         patience=5,
         max_grad_norm=10.0,
         checkpoint=None):
    """
    Train the ESIM model on the SNLI dataset.

    Args:
        train_file: A path to some preprocessed data that must be used
            to train the model.
        valid_file: A path to some preprocessed data that must be used
            to validate the model.
        embeddings_file: A path to some preprocessed word embeddings that
            must be used to initialise the model.
        target_dir: The path to a directory where the trained model must
            be saved.
        hidden_size: The size of the hidden layers in the model. Defaults
            to 300.
        dropout: The dropout rate to use in the model. Defaults to 0.5.
        num_classes: The number of classes in the output of the model.
            Defaults to 3.
        epochs: The maximum number of epochs for training. Defaults to 64.
        batch_size: The size of the batches for training. Defaults to 32.
        lr: The learning rate for the optimizer. Defaults to 0.0004.
        patience: The patience to use for early stopping. Defaults to 5.
        max_grad_norm: The maximum norm to which gradients are clipped
            during training. Defaults to 10.0.
        checkpoint: A checkpoint from which to continue training. If None,
            training starts from scratch. Defaults to None.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for training ", 20 * "=")

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(train_file, "rb") as pkl:
        train_data = NLIDataset(pickle.load(pkl))

    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

    print("\t* Loading validation data...")
    with open(valid_file, "rb") as pkl:
        valid_data = NLIDataset(pickle.load(pkl))

    valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    with open(embeddings_file, "rb") as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float)\
            .to(device)

    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 device=device).to(device)

    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.5,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1

    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []

    # Continuing training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]

        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))

        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]

    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy = validate(model, valid_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%"
          .format(valid_loss, (valid_accuracy*100)))

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device),
          20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs+1):
        epochs_count.append(epoch)

        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model,
                                                       train_loader,
                                                       optimizer,
                                                       criterion,
                                                       epoch,
                                                       max_grad_norm)

        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(model,
                                                          valid_loader,
                                                          criterion)

        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)

        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best model. The optimizer is not saved to avoid having
            # a checkpoint file that is too heavy to be shared. To resume
            # training from the best model, use the 'esim_*.pth.tar'
            # checkpoints instead.
            torch.save({"epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses},
                       os.path.join(target_dir, "best.pth.tar"))

        # Save the model at each epoch.
        torch.save({"epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses},
                   os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break

    # Plotting of the loss curves for the train and validation sets.
    plt.figure()
    plt.plot(epochs_count, train_losses, "-r")
    plt.plot(epochs_count, valid_losses, "-b")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.legend(["Training loss", "Validation loss"])
    plt.title("Cross entropy loss")
    plt.show()
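# The checkpoints written above store the model weights together with a little
# training history. A minimal sketch of restoring the best checkpoint into an
# already-constructed ESIM instance for inference (load_best_model is a
# hypothetical helper, assuming the same constructor arguments and embeddings
# used during training):
import os
import torch

def load_best_model(target_dir, model, device):
    """Restore the weights saved as 'best.pth.tar' into an existing ESIM model."""
    checkpoint = torch.load(os.path.join(target_dir, "best.pth.tar"),
                            map_location=device)
    model.load_state_dict(checkpoint["model"])
    model.eval()
    print("Restored model from epoch {} (best validation accuracy: {:.4f})"
          .format(checkpoint["epoch"], checkpoint["best_score"]))
    return model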