Example No. 1
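A test routine that reads the SNLI validation split, builds an embedding and a label dictionary, converts the sentence pairs to index sequences, and fits an ESIM model on them with a small hyperparameter set.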
def test_esim():
    sent1, sent2, labels = io_utils.read_snli(config.SNLI_VALID)

    embedding = embedding_utils.Embedding(config.EMBEDDING_DATA, sent1 + sent2,
                                          config.SNLI_MAX_LENGTH)

    label_dict = data_utils.create_label_dict(labels)

    # Map sentences to length arrays and token-index sequences; map labels to integer ids
    sent1_len = np.array(list(map(embedding.len_transform, sent1)))
    sent2_len = np.array(list(map(embedding.len_transform, sent2)))
    sent1 = np.array(list(map(embedding.text_transform, sent1)))
    sent2 = np.array(list(map(embedding.text_transform, sent2)))
    labels = np.array(list(map(lambda x: label_dict[x], labels)))

    train_set = list(zip(sent1, sent2, sent1_len, sent2_len, labels))
    valid_set = list(zip(sent1, sent2, sent1_len, sent2_len))
    valid = (valid_set, labels)

    kwargs = {
        "num_classes": len(label_dict),
        "vocab_size": embedding.vocab_size,
        "embedding_size": embedding.embedding_dim,
        "seq_len": config.SNLI_MAX_LENGTH,
        "word_embeddings": embedding.embedding,
        "hparams": {
            "num_units": 30,
            "input_dropout": 0.9,
            "output_dropout": 0.9,
            "state_dropout": 0.9,
            "hidden_layers": 1,
            "hidden_units": 50,
            "hidden_dropout": 0.9,
            "lr": 0.001,
            "l2_reg_lambda": 0.0001,
            "batch_size": 256,
            "num_epochs": 20,
        }
    }

    model = ESIM(**kwargs)
    sess = create_session()
    model.init(sess)
    model.fit(sess, train_set, valid)
Example No. 2
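A `from_config` classmethod that assembles an experiment from a configuration dictionary: dataset, vocabularies, embeddings, batch transformers, data streams, and the ESIM model itself.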
    @classmethod
    def from_config(cls, config, rng):
        """
        Builds the dataset, vocabularies, embeddings, streams, and model from config.

        :param config: `dict` containing configuration options.
        :param rng: random number generator
        :return: `Experiment` object with all experiment components.
        """
        # 0. Load dataset
        dataset = NLIData.from_config(config['dataset'])

        # 1. Load vocabularies
        vocabs = {}
        for name, vocab_config in config['vocabs'].items():
            vocab_config['file_or_data'] = os.path.join(
                dataset.path, vocab_config['file_or_data'])
            vocabs[name] = NLIVocabulary.from_config(config=vocab_config)

        # 2. Load embeddings
        embeddings = {}
        for name, emb_config in config['embeddings'].items():
            emb_config['file'] = os.path.join(EMBEDDINGS_DIR,
                                              emb_config['file'])
            embeddings[name] = NLIEmbedding.from_config(config=emb_config,
                                                        rng=rng,
                                                        vocabs=vocabs)

        # 3. Batch transformers
        batch_transformers = []
        for bt_config in config['batch_transformers']:
            bt_config = copy.deepcopy(bt_config)
            if 'vocab' in bt_config:
                bt_config['vocab'] = vocabs.get(bt_config['vocab'])
            transformer = NLITransformer.from_config(bt_config)
            batch_transformers.append(transformer)

        class StreamRegistry(dict):
            def __init__(self):
                super(StreamRegistry, self).__init__()
                self.__dict__ = self

        # 4. Load streams
        streams = StreamRegistry()
        for name in dataset.parts:
            streams[name] = NLIStream.from_config(
                config=config['streams'][name],
                dataset=dataset.part(name),
                rng=rng,
                batch_transformers=batch_transformers)

        # 5. Build model
        model = ESIM.from_config(config=config['model'], embeddings=embeddings)
        logger.info(model.summary())

        return cls(config=config,
                   model=model,
                   dataset=dataset,
                   vocabs=vocabs,
                   embeddings=embeddings,
                   batch_transformers=batch_transformers,
                   streams=streams)
Example No. 3
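A command-line entry point that, depending on the chosen step, builds the data, trains a SelfAttESIM model with a TensorBoard logger, or grid-searches ESIM hyperparameter combinations with itertools.product, resetting the TensorFlow graph between runs.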
def main():
    # capture the config path from the run arguments
    # then process the json configuration file
    try:
        args = get_args()
        config = process_config(args.config)

    except Exception as e:
        print("Missing or invalid arguments: %s" % e)
        exit(1)

    # create your data generator
    #data = SemDataGenerator(config)
    #data = MSRPGenerator(config)
    data = ATECGenerator(config)

    if args.step == "build_data":
        print("build data....")
        data.build_data()
    elif args.step == "train":
        # create the experiments dirs
        create_dirs([config.summary_dir, config.checkpoint_dir])
        # create tensorflow session
        sess = tf.Session()

        # load word2vec
        config.embedding = data.get_trimmed_glove_vectors()

        model = SelfAttESIM(config)
        # create tensorboard logger
        logger = Logger(sess, config)
        # create trainer and pass all the previous components to it
        trainer = SentSemTrainer(sess, model, data, config, logger)
        #load model if exists
        #model.load(sess)
        # here you train your model
        trainer.train()
    elif args.step == "tune":

        # Grid-search over every combination of the tuning parameters
        import itertools
        tune_num = 0
        param_names = config["parameter_tune"].keys()
        print(param_names)
        cand_params = [
            config["parameter_tune"][pname] for pname in param_names
        ]
        print(cand_params)
        for params in itertools.product(*cand_params):
            print(params)
            for i, param_name in enumerate(param_names):
                config[param_name] = params[i]
            #print(config)
            data = ATECGenerator(config)
            create_dirs([config.summary_dir, config.checkpoint_dir])
            sess = tf.Session()
            config.embedding = data.get_trimmed_glove_vectors()
            model = ESIM(config)
            logger = Logger(sess, config)
            trainer = SentSemTrainer(sess, model, data, config, logger)
            trainer.train()
            tf.reset_default_graph()
    else:
        print("no support step!!")
Example No. 4
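A fragment of a model-selection elif chain that fills in per-model hyperparameters, loads a character embedding matrix, and instantiates the BiMPM, ESIM, or MatchPyramid backend.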
    char_embedding_matrix = load_char_embed(bimpm_params['max_features'], bimpm_params['embed_size'])
    bimpm_params['embedding_matrix'] = char_embedding_matrix
    params = bimpm_params
    backend = BiMPM(params)
elif model_name == "esim":
    esim_params = base_params
    esim_params['mlp_num_layers'] = 1
    esim_params['mlp_num_units'] = 256
    esim_params['mlp_num_fan_out'] = 128
    esim_params['lstm_units'] = 64
    esim_params['dropout_rate'] = 0.3
    esim_params['embed_size'] = 100
    char_embedding_matrix = load_char_embed(esim_params['max_features'], esim_params['embed_size'])
    esim_params['embedding_matrix'] = char_embedding_matrix
    params = esim_params
    backend = ESIM(params)
elif model_name == "match_pyramid":
    mp_params = base_params
    mp_params['matching_type'] = 'dot'
    mp_params['num_blocks'] = 2
    mp_params['kernel_count'] = [16, 32]
    mp_params['kernel_size'] = [[3, 3], [3, 3]]
    mp_params['pool_size'] = [3, 3]
    mp_params['mlp_num_layers'] = 1
    mp_params['mlp_num_units'] = 128
    mp_params['mlp_num_fan_out'] = 128
    mp_params['embed_size'] = 100
    char_embedding_matrix = load_char_embed(mp_params['max_features'], mp_params['embed_size'])
    mp_params['embedding_matrix'] = char_embedding_matrix
    params = mp_params
    backend = MatchPyramid(params)
Example No. 5
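A prediction routine that restores a trained TensorFlow ESIM model from a checkpoint, evaluates it on the test set batch by batch, and prints a classification report and confusion matrix.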
def predict():
    print('Loading test data ...')
    start_time = time.time()
    # [q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev, q1_test, q2_test, y_test, vocab_size, q1_mask_train,
    #  q2_mask_train, q1_mask_dev, q2_mask_dev, q1_mask_test, q2_mask_test] = load_pkl_set(args.pkl_files, mask=True)
    #
    # del q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev, q1_mask_train, q2_mask_train, q1_mask_dev, q2_mask_dev

    q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev, q1_test, q2_test, y_test, vocab_size = load_pkl_set(
        args.pkl_files)

    del q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev

    # ESIM model init
    model = ESIM(sequence_length=args.max_q_len,
                 num_classes=args.num_classes,
                 embedding_dim=args.embedding_dim,
                 vocab_size=vocab_size,
                 max_length=args.max_q_len,
                 hidden_dim=args.hidden_size,
                 learning_rate=args.learning_rate,
                 optimizer=args.optim)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(session, save_path=save_path)

    print('Testing ...')
    # loss_test, acc_test = evaluate(q1_test, q2_test, y_test, q1_mask_test, q2_mask_test, session, model)
    loss_test, acc_test = evaluate(q1_test,
                                   q2_test,
                                   y_test,
                                   session,
                                   model=model)
    print('Test loss:{0:6.2}, Test acc:{1:7.2%}'.format(loss_test, acc_test))

    # test_batches = batch_iter_per_epoch_mask(q1_test, q2_test, q1_mask_test, q2_mask_test, y_test, shuffle=False)
    test_batches = batch_iter_per_epoch(q1_test,
                                        q2_test,
                                        y_test,
                                        shuffle=False)
    # Accumulate predictions and class probabilities across test batches
    all_predictions = []
    all_predict_prob = []
    count = 0
    # for q1_test_batch, q2_test_batch, q1_mask_batch, q2_mask_batch, y_test_batch in test_batches:
    #     batch_predictions, batch_predict_probs = session.run([model.predict, model.probs],
    #                                                          feed_dict={
    #                                                              model.input_q1: q1_test_batch,
    #                                                              model.input_q2: q2_test_batch,
    #                                                              model.q1_mask: q1_mask_batch,
    #                                                              model.q2_mask: q2_mask_batch,
    #                                                              model.dropout_keep_prob: 1.0
    #                                                          })
    for q1_test_batch, q2_test_batch, y_test_batch in test_batches:
        batch_predictions, batch_predict_probs = session.run(
            [model.y_pred, model.probs],
            feed_dict={
                model.input_q1: q1_test_batch,
                model.input_q2: q2_test_batch,
                model.dropout_keep_prob: 1.0
            })
        all_predictions = np.concatenate([all_predictions, batch_predictions])
        if count == 0:
            all_predict_prob = batch_predict_probs
        else:
            all_predict_prob = np.concatenate(
                [all_predict_prob, batch_predict_probs])
        count = 1
    y_test = [float(temp) for temp in y_test]

    # Evaluation indices
    print('Precision, Recall, F1-Score ...')
    print(
        metrics.classification_report(y_test,
                                      all_predictions,
                                      target_names=['not match', 'match']))

    # Confusion Matrix
    print('Confusion Matrix ...')
    print(metrics.confusion_matrix(y_test, all_predictions))

    # Write probability to csv
    # out_dir = os.path.join(args.save_dir, 'predict_prob_csv')
    # print('Saving evaluation to {0}'.format(out_dir))
    # with open(out_dir, 'w') as f:
    #     csv.writer(f).writerows(all_predict_prob)

    time_dif = get_time_dif(start_time)
    print('Time usage:', time_dif)
Example No. 6
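A TensorFlow training loop for ESIM with TensorBoard summaries, checkpointing on the best validation accuracy, and early stopping after a fixed number of batches without improvement.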
def train():
    # Loading data
    print('Loading data ...')
    start_time = time.time()
    # [q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev, q1_test, q2_test, y_test, vocab_size, q1_mask_train,
    #  q2_mask_train, q1_mask_dev, q2_mask_dev, q1_mask_test, q2_mask_test] = load_pkl_set(args.pkl_files, mask=True)

    # del q1_test, q2_test, q1_mask_test, q2_mask_test, y_test

    q1_train, q2_train, y_train, q1_dev, q2_dev, y_dev, q1_test, q2_test, y_test, vocab_size = load_pkl_set(
        args.pkl_files)

    del q1_test, q2_test, y_test

    time_dif = get_time_dif(start_time)
    print('Time usage:', time_dif)

    print('Configuring TensorBoard and Saver ...')
    tensorboard_dir = args.tensorboard_dir
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    # ESIM model init
    model = ESIM(sequence_length=args.max_q_len,
                 num_classes=args.num_classes,
                 embedding_dim=args.embedding_dim,
                 vocab_size=vocab_size,
                 max_length=args.max_q_len,
                 hidden_dim=args.hidden_size,
                 learning_rate=args.learning_rate,
                 optimizer=args.optim)
    tf.summary.scalar('loss', model.loss)
    tf.summary.scalar('accuracy', model.accuracy)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # Configuring Saver
    saver = tf.train.Saver()
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    # Create Session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and validation ...')
    start_time = time.time()
    total_batch = 0
    best_acc_dev = 0.0
    last_improved = 0
    require_improvement = 30000  # early stopping: max batches without improvement

    tag = False  # early-stopping flag
    for epoch in range(args.epochs):
        print('Epoch:', epoch + 1)
        # batch_train = batch_iter_per_epoch_mask(q1_train, q2_train,
        #                                         q1_mask_train, q2_mask_train,
        #                                         y_train, args.batch_size)

        batch_train = batch_iter_per_epoch(q1_train, q2_train, y_train,
                                           args.batch_size)
        for q1_batch, q2_batch, y_batch in batch_train:
            feed_dict = feed_data(q1_batch,
                                  q2_batch,
                                  y_batch,
                                  args.dropout_keep_prob,
                                  model=model)
            if total_batch % args.checkpoint_every == 0:
                # write to tensorboard scalar
                summary = session.run(merged_summary, feed_dict)
                writer.add_summary(summary, total_batch)

            if total_batch % args.evaluate_every == 0:
                # print performance on train set and dev set
                feed_dict[model.dropout_keep_prob] = 1.0
                loss_train, acc_train = session.run(
                    [model.loss, model.accuracy], feed_dict=feed_dict)
                # loss_dev, acc_dev = evaluate(q1_dev, q2_dev, y_dev, q1_mask_dev, q2_mask_dev, session, model)
                loss_dev, acc_dev = evaluate(q1_dev,
                                             q2_dev,
                                             y_dev,
                                             session,
                                             model=model)

                if acc_dev > best_acc_dev:
                    # save best result
                    best_acc_dev = acc_dev
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                print(
                    'Iter: {0:>6}, Train Loss: {1:6.2}, Train Acc: {2:7.2%}, Val loss:{3:6.2}, '
                    'Val acc:{4:7.2%}, Time:{5}{6}'.format(
                        total_batch, loss_train, acc_train, loss_dev, acc_dev,
                        time_dif, improved_str))

            session.run(model.optimizer, feed_dict)
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # having no improvement for a long time
                print('No optimization for a long time, auto-stopping ...')
                tag = True
                break
        if tag:  # early stopping
            break
Example No. 7
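A PyTorch training entry point that trains ESIM on SNLI with a ReduceLROnPlateau scheduler, patience-based early stopping, per-epoch checkpoints, and a final plot of the training and validation loss curves.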
def main(train_file,
         valid_file,
         embeddings_file,
         target_dir,
         hidden_size=300,
         dropout=0.5,
         num_classes=3,
         epochs=64,
         batch_size=32,
         lr=0.0004,
         patience=5,
         max_grad_norm=10.0,
         checkpoint=None):
    """
    Train the ESIM model on the SNLI dataset.
    Args:
        train_file: A path to some preprocessed data that must be used
            to train the model.
        valid_file: A path to some preprocessed data that must be used
            to validate the model.
        embeddings_file: A path to some preprocessed word embeddings that
            must be used to initialise the model.
        target_dir: The path to a directory where the trained model must
            be saved.
        hidden_size: The size of the hidden layers in the model. Defaults
            to 300.
        dropout: The dropout rate to use in the model. Defaults to 0.5.
        num_classes: The number of classes in the output of the model.
            Defaults to 3.
        epochs: The maximum number of epochs for training. Defaults to 64.
        batch_size: The size of the batches for training. Defaults to 32.
        lr: The learning rate for the optimizer. Defaults to 0.0004.
        patience: The patience to use for early stopping. Defaults to 5.
        max_grad_norm: The maximum gradient norm, used to clip gradients
            during training. Defaults to 10.0.
        checkpoint: A checkpoint from which to continue training. If None,
            training starts from scratch. Defaults to None.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for training ", 20 * "=")

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(train_file, "rb") as pkl:
        train_data = NLIDataset(pickle.load(pkl))

    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

    print("\t* Loading validation data...")
    with open(valid_file, "rb") as pkl:
        valid_data = NLIDataset(pickle.load(pkl))

    valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    with open(embeddings_file, "rb") as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float)\
                     .to(device)

    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 device=device).to(device)

    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.5,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1

    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []

    # Continuing training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]

        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))

        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]

    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy = validate(model,
                                             valid_loader,
                                             criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%"
          .format(valid_loss, (valid_accuracy*100)))

    # -------------------- Training epochs ------------------- #
    print("\n",
          20 * "=",
          "Training ESIM model on device: {}".format(device),
          20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs+1):
        epochs_count.append(epoch)

        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model,
                                                       train_loader,
                                                       optimizer,
                                                       criterion,
                                                       epoch,
                                                       max_grad_norm)

        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(model,
                                                          valid_loader,
                                                          criterion)

        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)

        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best model. The optimizer is not saved to avoid having
            # a checkpoint file that is too heavy to be shared. To resume
            # training from the best model, use the 'esim_*.pth.tar'
            # checkpoints instead.
            torch.save({"epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses},
                       os.path.join(target_dir, "best.pth.tar"))

        # Save the model at each epoch.
        torch.save({"epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses},
                   os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break

    # Plotting of the loss curves for the train and validation sets.
    plt.figure()
    plt.plot(epochs_count, train_losses, "-r")
    plt.plot(epochs_count, valid_losses, "-b")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.legend(["Training loss", "Validation loss"])
    plt.title("Cross entropy loss")
    plt.show()
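Example No. 7's main() takes paths to preprocessed pickles plus training hyperparameters. A minimal sketch of invoking it from the command line, assuming a hypothetical argparse wrapper and illustrative file names (neither comes from the example above):

if __name__ == "__main__":
    # Hypothetical CLI wrapper around the main() above; file names and flags
    # are illustrative assumptions, not part of the original example.
    import argparse

    parser = argparse.ArgumentParser(description="Train ESIM on preprocessed SNLI data")
    parser.add_argument("--train_file", default="train_data.pkl")
    parser.add_argument("--valid_file", default="dev_data.pkl")
    parser.add_argument("--embeddings_file", default="embeddings.pkl")
    parser.add_argument("--target_dir", default="checkpoints/")
    parser.add_argument("--checkpoint", default=None)
    args = parser.parse_args()

    main(args.train_file,
         args.valid_file,
         args.embeddings_file,
         args.target_dir,
         checkpoint=args.checkpoint)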