Example #1
0
    def setUp(self):
        """Build the data generators, embedding matrices and BiMPM config.

        Writes the train/validation/test fixture files through the test
        case's helper methods, reads each of them in ``"word+character"``
        mode with a fresh ``DataManager``, and stores on ``self``:

        - ``get_*_gen`` / ``*_size`` for the train, validation and test data,
        - the word and character embedding matrices,
        - ``config_dict`` with the model hyperparameters,
        - the number of batches needed to cover each data split.
        """
        super(TestBiMPM, self).setUp()
        self.write_duplicate_questions_train_file()
        self.write_duplicate_questions_validation_file()
        self.write_duplicate_questions_test_file()
        self.data_manager = DataManager(STSInstance)
        self.batch_size = 3
        self.get_train_gen, self.train_size = self.data_manager.get_train_data_from_file(
            [self.TRAIN_FILE], mode="word+character")
        self.get_val_gen, self.val_size = self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE], mode="word+character")
        self.get_test_gen, self.test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], mode="word+character")

        self.embedding_manager = EmbeddingManager(
            self.data_manager.data_indexer)
        self.word_embedding_dim = 5
        self.word_embedding_matrix = self.embedding_manager.get_embedding_matrix(
            self.word_embedding_dim)
        self.char_embedding_dim = 2
        # The character matrix must be built from the "characters" namespace
        # so that it agrees with the "char_vocab_size" entry in the config
        # below; the default namespace is the one used for the word matrix.
        self.char_embedding_matrix = self.embedding_manager.get_embedding_matrix(
            self.char_embedding_dim, namespace="characters")
        self.char_rnn_hidden_size = 6
        self.context_rnn_hidden_size = 3
        self.aggregation_rnn_hidden_size = 4
        self.dropout_ratio = 0.1
        self.config_dict = {
            "mode": "train",
            "word_vocab_size": self.data_manager.data_indexer.get_vocab_size(),
            "word_embedding_dim": self.word_embedding_dim,
            "word_embedding_matrix": self.word_embedding_matrix,
            "char_vocab_size": self.data_manager.data_indexer.get_vocab_size(
                namespace="characters"),
            "char_embedding_dim": self.char_embedding_dim,
            "char_embedding_matrix": self.char_embedding_matrix,
            "char_rnn_hidden_size": self.char_rnn_hidden_size,
            "fine_tune_embeddings": False,
            "context_rnn_hidden_size": self.context_rnn_hidden_size,
            "aggregation_rnn_hidden_size": self.aggregation_rnn_hidden_size,
            "dropout_ratio": self.dropout_ratio,
        }

        # Round up so that a final, smaller batch is still counted.
        self.num_train_steps_per_epoch = int(
            math.ceil(self.train_size / self.batch_size))
        self.num_val_steps = int(math.ceil(self.val_size / self.batch_size))
        self.num_test_steps = int(math.ceil(self.test_size / self.batch_size))
    def setUp(self):
        """Prepare fixtures for the Siamese matching BiLSTM tests.

        Stores the data generators and sizes for the three data splits, the
        word embedding matrix, the model ``config_dict`` and the number of
        batches needed to cover each split on ``self``.
        """
        super(TestSiameseMatchingBiLSTM, self).setUp()

        # Create the fixture files through the test case's helper methods.
        self.write_duplicate_questions_train_file()
        self.write_duplicate_questions_validation_file()
        self.write_duplicate_questions_test_file()

        manager = DataManager(STSInstance)
        self.data_manager = manager
        self.batch_size = 2

        train_data = manager.get_train_data_from_file([self.TRAIN_FILE])
        self.get_train_gen, self.train_size = train_data
        val_data = manager.get_validation_data_from_file([self.VALIDATION_FILE])
        self.get_val_gen, self.val_size = val_data
        test_data = manager.get_test_data_from_file([self.TEST_FILE])
        self.get_test_gen, self.test_size = test_data

        self.embedding_manager = EmbeddingManager(manager.data_indexer)
        self.word_embedding_dim = 5
        self.embedding_matrix = self.embedding_manager.get_embedding_matrix(
            self.word_embedding_dim)

        # Model hyperparameters.
        self.rnn_hidden_size = 6
        self.output_keep_prob = 1.0
        self.share_encoder_weights = True
        vocab_size = manager.data_indexer.get_vocab_size()
        self.config_dict = {
            "mode": "train",
            "word_vocab_size": vocab_size,
            "word_embedding_dim": self.word_embedding_dim,
            "fine_tune_embeddings": False,
            "word_embedding_matrix": self.embedding_matrix,
            "rnn_hidden_size": self.rnn_hidden_size,
            "output_keep_prob": self.output_keep_prob,
            "share_encoder_weights": self.share_encoder_weights,
        }

        def steps_for(num_instances):
            # Round up so a trailing partial batch still counts as a step.
            return int(math.ceil(num_instances / self.batch_size))

        self.num_train_steps_per_epoch = steps_for(self.train_size)
        self.num_val_steps = steps_for(self.val_size)
        self.num_test_steps = steps_for(self.test_size)
    def test_generate_train_batches(self):
        """Check that batching the train data is correct and repeatable.

        Two batch generators are created from the same instance-generator
        factory with a batch size of 2. With 3 train instances, each must
        yield one full batch, one batch holding the remaining instance, and
        then be exhausted; both generators must yield identical contents.
        """
        get_train_gen, train_size = self.data_manager.get_train_data_from_file(
            [self.TRAIN_FILE])
        batch_gen = DataManager.get_batch_generator(get_train_gen, 2)
        new_batch_gen = DataManager.get_batch_generator(get_train_gen, 2)

        # Assert that the new generator is a different object
        # than the old generator.
        assert new_batch_gen is not batch_gen
        assert train_size == 3

        inputs, labels = next(batch_gen)
        new_inputs, new_labels = next(new_batch_gen)
        assert len(inputs) == len(new_inputs) == 2
        assert len(labels) == len(new_labels) == 1

        # Ensure output matches ground truth
        assert_allclose(inputs[0], np.array([[2, 0], [5, 0]]))
        assert_allclose(inputs[1], np.array([[3, 4], [6, 0]]))
        assert_allclose(labels[0], np.array([[1, 0], [0, 1]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])
        # Compare against the *other* generator's labels; comparing
        # labels[0] with itself would always pass.
        assert_allclose(labels[0], new_labels[0])

        inputs, labels = next(batch_gen)
        new_inputs, new_labels = next(new_batch_gen)
        assert len(inputs) == len(new_inputs) == 2
        assert len(labels) == len(new_labels) == 1

        # Ensure output matches ground truth
        assert_allclose(inputs[0], np.array([[7, 0]]))
        assert_allclose(inputs[1], np.array([[8, 0]]))
        assert_allclose(labels[0], np.array([[1, 0]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])
        assert_allclose(labels[0], new_labels[0])

        # Each generator should raise a StopIteration. They are checked in
        # separate blocks because the first raise ends an assertRaises
        # block, so a second call inside it would never run.
        with self.assertRaises(StopIteration):
            next(batch_gen)
        with self.assertRaises(StopIteration):
            next(new_batch_gen)
    def setUp(self):
        """Prepare fixtures for the Siamese BiLSTM tests.

        After the fixture files are written, the train, validation and test
        data are read with a new ``DataManager``. The resulting generators
        and sizes, the word embedding matrix, the model ``config_dict`` and
        the per-split step counts are stored on ``self``.
        """
        super(TestSiameseBiLSTM, self).setUp()

        # Create the fixture files through the test case's helper methods.
        self.write_duplicate_questions_train_file()
        self.write_duplicate_questions_validation_file()
        self.write_duplicate_questions_test_file()

        self.data_manager = DataManager(STSInstance)
        self.batch_size = 2

        loaded_train = self.data_manager.get_train_data_from_file(
            [self.TRAIN_FILE])
        self.get_train_gen, self.train_size = loaded_train
        loaded_val = self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE])
        self.get_val_gen, self.val_size = loaded_val
        loaded_test = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE])
        self.get_test_gen, self.test_size = loaded_test

        indexer = self.data_manager.data_indexer
        self.embedding_manager = EmbeddingManager(indexer)
        self.word_embedding_dim = 5
        self.embedding_matrix = self.embedding_manager.get_embedding_matrix(
            self.word_embedding_dim)

        # Model hyperparameters.
        self.rnn_hidden_size = 6
        self.rnn_output_mode = "last"
        self.output_keep_prob = 1.0
        self.share_encoder_weights = True

        config = {"mode": "train"}
        config["word_vocab_size"] = indexer.get_vocab_size()
        config["word_embedding_dim"] = self.word_embedding_dim
        config["fine_tune_embeddings"] = False
        config["word_embedding_matrix"] = self.embedding_matrix
        config["rnn_hidden_size"] = self.rnn_hidden_size
        config["rnn_output_mode"] = self.rnn_output_mode
        config["output_keep_prob"] = self.output_keep_prob
        config["share_encoder_weights"] = self.share_encoder_weights
        self.config_dict = config

        # Round up so a trailing partial batch still counts as a step.
        per_batch = self.batch_size
        self.num_train_steps_per_epoch = int(
            math.ceil(self.train_size / per_batch))
        self.num_val_steps = int(math.ceil(self.val_size / per_batch))
        self.num_test_steps = int(math.ceil(self.test_size / per_batch))
Example #5
0
def main():
    """Train the BiMPM model, or predict with a trained one, from the CLI.

    Command-line arguments select the mode (``train`` or ``predict``), the
    data files, the hyperparameters and the output locations.

    In ``train`` mode the train and validation files are read with a new
    ``DataManager``, which is pickled into the checkpoint directory before
    training starts. In ``predict`` mode a previously pickled ``DataManager``
    is loaded from ``--dataindexer_load_path``, the test file is read with
    it, and the probabilities from the second column of the model's
    predictions are written to a CSV file in the log directory.

    In both modes the parsed arguments, plus the vocabulary sizes, are
    written as JSON to the log directory.
    """
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)

    # Parse config arguments
    argparser = argparse.ArgumentParser(
        description=("Run the Bilateral Multi-Perspective "
                     "Matching (biMPM) model on the paraphrase "
                     "identification task."))
    argparser.add_argument("mode",
                           type=str,
                           choices=["train", "predict"],
                           help=("One of {train|predict}, to "
                                 "indicate what you want the model to do. "
                                 "If you pick \"predict\", then you must also "
                                 "supply the path to a pretrained model and "
                                 "DataIndexer to load."))
    argparser.add_argument("--model_load_dir",
                           type=str,
                           help=("The path to a directory with checkpoints to "
                                 "load for evaluation or prediction. The "
                                 "latest checkpoint will be loaded."))
    argparser.add_argument("--dataindexer_load_path",
                           type=str,
                           help=("The path to the dataindexer fit on the "
                                 "train data, so we can properly index the "
                                 "test data for evaluation or prediction."))
    argparser.add_argument("--train_file",
                           type=str,
                           default=os.path.join(
                               project_dir, "data/processed/quora/"
                               "train_cleaned_train_split.csv"),
                           help="Path to a file to train on.")
    argparser.add_argument(
        "--val_file",
        type=str,
        default=os.path.join(
            project_dir, "data/processed/quora/"
            "train_cleaned_val_split.csv"),
        help="Path to a file to monitor validation acc. on.")
    argparser.add_argument("--test_file",
                           type=str,
                           default=os.path.join(
                               project_dir, "data/processed/quora/"
                               "test_final.csv"),
                           help="Path to a file to predict on.")
    argparser.add_argument("--batch_size",
                           type=int,
                           default=64,
                           help="Number of instances per batch.")
    argparser.add_argument("--num_epochs",
                           type=int,
                           default=10,
                           help=("Number of epochs to perform in "
                                 "training."))
    argparser.add_argument("--early_stopping_patience",
                           type=int,
                           default=0,
                           help=("number of epochs with no validation "
                                 "accuracy improvement after which training "
                                 "will be stopped"))
    argparser.add_argument("--num_sentence_words",
                           type=int,
                           default=100,
                           help=("The maximum length of a sentence. Longer "
                                 "sentences will be truncated, and shorter "
                                 "ones will be padded."))
    argparser.add_argument("--num_word_characters",
                           type=int,
                           default=10,
                           help=("The maximum length of a word. Longer "
                                 "words will be truncated, and shorter "
                                 "ones will be padded."))
    argparser.add_argument("--word_embedding_dim",
                           type=int,
                           default=300,
                           help="Dimensionality of the word embedding layer")
    argparser.add_argument(
        "--pretrained_word_embeddings_file_path",
        type=str,
        help="Path to a file with pretrained word embeddings.",
        default=os.path.join(project_dir, "data/external/",
                             "glove.6B.300d.txt"))
    argparser.add_argument("--char_embedding_dim",
                           type=int,
                           default=20,
                           help="Dimensionality of the char embedding layer")
    argparser.add_argument("--fine_tune_embeddings",
                           action="store_true",
                           help=("Whether to train the embedding layer "
                                 "(if True), or keep it fixed (False)."))
    argparser.add_argument("--char_rnn_hidden_size",
                           type=int,
                           default=50,
                           help=("The output dimension of the character "
                                 "encoder RNN."))
    argparser.add_argument("--context_rnn_hidden_size",
                           type=int,
                           default=100,
                           help=("The output dimension of the context "
                                 "encoding RNN."))
    argparser.add_argument("--aggregation_rnn_hidden_size",
                           type=int,
                           default=100,
                           help=("The output dimension of the aggregation "
                                 "encoding RNN."))
    argparser.add_argument("--dropout_ratio",
                           type=float,
                           default=0.1,
                           help=("The proportion of RNN outputs to "
                                 "drop out."))
    argparser.add_argument("--log_period",
                           type=int,
                           default=10,
                           help=("Number of steps between each summary "
                                 "op evaluation."))
    argparser.add_argument("--val_period",
                           type=int,
                           default=2500,
                           help=("Number of steps between each evaluation of "
                                 "validation performance."))
    argparser.add_argument("--log_dir",
                           type=str,
                           default=os.path.join(project_dir, "logs/"),
                           help=("Directory to save logs to."))
    argparser.add_argument("--save_period",
                           type=int,
                           default=2500,
                           help=("Number of steps between each "
                                 "model checkpoint"))
    argparser.add_argument("--save_dir",
                           type=str,
                           default=os.path.join(project_dir, "models/"),
                           help=("Directory to save model checkpoints to."))
    argparser.add_argument("--run_id",
                           type=str,
                           required=True,
                           help=("Identifying run ID for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train run_id"))
    argparser.add_argument("--model_name",
                           type=str,
                           required=True,
                           help=("Identifying model name for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train model_name"))
    argparser.add_argument("--reweight_predictions_for_kaggle",
                           action="store_true",
                           help=("Only relevant when predicting. Whether to "
                                 "reweight the prediction probabilities to "
                                 "account for class proportion discrepancy "
                                 "between train and test."))

    config = argparser.parse_args()

    model_name = config.model_name
    run_id = config.run_id
    mode = config.mode

    # Get the data.
    batch_size = config.batch_size
    if mode == "train":
        # Read the train data from a file, and use it to index the
        # validation data
        data_manager = DataManager(STSInstance)
        max_lengths = {
            "num_sentence_words": config.num_sentence_words,
            "num_word_characters": config.num_word_characters
        }
        get_train_data_gen, train_data_size = data_manager.get_train_data_from_file(
            [config.train_file],
            # Each call gets its own dict, as in the inline literals this
            # replaces, in case the callee keeps or mutates it.
            max_lengths=dict(max_lengths),
            mode="word+character")
        get_val_data_gen, val_data_size = data_manager.get_validation_data_from_file(
            [config.val_file],
            max_lengths=dict(max_lengths),
            mode="word+character")
    else:
        # Fail with a usage message instead of an opaque TypeError from
        # open(None) when the required path is missing.
        if config.dataindexer_load_path is None:
            argparser.error("--dataindexer_load_path is required when the "
                            "mode is \"predict\".")
        # Load the fitted DataManager, and use it to index the test data
        logger.info("Loading pickled DataManager from {}".format(
            config.dataindexer_load_path))
        # NOTE(security): unpickling can execute arbitrary code; only load
        # DataManager pickles that come from a trusted training run.
        with open(config.dataindexer_load_path, "rb") as data_manager_file:
            data_manager = pickle.load(data_manager_file)
        get_test_data_gen, test_data_size = data_manager.get_test_data_from_file(
            [config.test_file], mode="word+character")

    vars(config)["word_vocab_size"] = data_manager.data_indexer.get_vocab_size(
    )
    vars(config)["char_vocab_size"] = data_manager.data_indexer.get_vocab_size(
        namespace="characters")

    # Log the run parameters.
    log_dir = config.log_dir
    log_path = os.path.join(log_dir, model_name, run_id.zfill(2))
    logger.info("Writing logs to {}".format(log_path))
    if not os.path.exists(log_path):
        logger.info("log path {} does not exist, "
                    "creating it".format(log_path))
        os.makedirs(log_path)
    params_path = os.path.join(log_path, mode + "params.json")
    logger.info("Writing params to {}".format(params_path))
    # The params are dumped here, before the embedding matrices are added
    # to the config below (presumably because those are arrays that json
    # cannot serialize -- keep this ordering).
    with open(params_path, 'w') as params_file:
        json.dump(vars(config), params_file, indent=4)

    # Get the embeddings.
    embedding_manager = EmbeddingManager(data_manager.data_indexer)
    word_embedding_matrix = embedding_manager.get_embedding_matrix(
        config.word_embedding_dim, config.pretrained_word_embeddings_file_path)
    vars(config)["word_embedding_matrix"] = word_embedding_matrix
    char_embedding_matrix = embedding_manager.get_embedding_matrix(
        config.char_embedding_dim, namespace="characters")
    vars(config)["char_embedding_matrix"] = char_embedding_matrix

    # Initialize the model.
    model = BiMPM(vars(config))
    model.build_graph()

    if mode == "train":
        # Train the model.
        num_epochs = config.num_epochs
        # Round up so that a final, smaller batch still counts as a step.
        num_train_steps_per_epoch = int(math.ceil(train_data_size /
                                                  batch_size))
        num_val_steps = int(math.ceil(val_data_size / batch_size))
        log_period = config.log_period
        val_period = config.val_period

        save_period = config.save_period
        save_dir = os.path.join(config.save_dir, model_name,
                                run_id.zfill(2) + "/")
        save_path = os.path.join(save_dir, model_name + "-" + run_id.zfill(2))

        logger.info("Checkpoints will be written to {}".format(save_dir))
        if not os.path.exists(save_dir):
            logger.info("save path {} does not exist, "
                        "creating it".format(save_dir))
            os.makedirs(save_dir)

        logger.info("Saving fitted DataManager to {}".format(save_dir))
        data_manager_pickle_name = "{}-{}-DataManager.pkl".format(
            model_name, run_id.zfill(2))
        data_manager_pickle_path = os.path.join(save_dir,
                                                data_manager_pickle_name)
        # Use a context manager so the pickle is flushed and closed before
        # training starts.
        with open(data_manager_pickle_path, "wb") as data_manager_file:
            pickle.dump(data_manager, data_manager_file)
        patience = config.early_stopping_patience
        model.train(get_train_instance_generator=get_train_data_gen,
                    get_val_instance_generator=get_val_data_gen,
                    batch_size=batch_size,
                    num_train_steps_per_epoch=num_train_steps_per_epoch,
                    num_epochs=num_epochs,
                    num_val_steps=num_val_steps,
                    save_path=save_path,
                    log_path=log_path,
                    log_period=log_period,
                    val_period=val_period,
                    save_period=save_period,
                    patience=patience)
    else:
        # Predict with the model
        model_load_dir = config.model_load_dir
        num_test_steps = int(math.ceil(test_data_size / batch_size))
        # Numpy array of shape (num_test_examples, 2)
        raw_predictions = model.predict(
            get_test_instance_generator=get_test_data_gen,
            model_load_dir=model_load_dir,
            batch_size=batch_size,
            num_test_steps=num_test_steps)
        # Remove the first column, so we're left with just the probabilities
        # that a question is a duplicate.
        is_duplicate_probabilities = np.delete(raw_predictions, 0, 1)

        # The class balance between kaggle train and test seems different.
        # This edits prediction probability to account for the discrepancy.
        # See: https://www.kaggle.com/c/quora-question-pairs/discussion/31179
        if config.reweight_predictions_for_kaggle:
            # NOTE(review): 0.37 and 0.165 are presumably the duplicate
            # proportions in train and test from the linked discussion --
            # confirm before changing.
            positive_weight = 0.165 / 0.37
            negative_weight = (1 - 0.165) / (1 - 0.37)
            is_duplicate_probabilities = (
                (positive_weight * is_duplicate_probabilities) /
                (positive_weight * is_duplicate_probabilities +
                 negative_weight * (1 - is_duplicate_probabilities)))

        # Write the predictions to an output submission file
        output_predictions_path = os.path.join(
            log_path,
            model_name + "-" + run_id.zfill(2) + "-output_predictions.csv")
        logger.info(
            "Writing predictions to {}".format(output_predictions_path))
        is_duplicate_df = pd.DataFrame(is_duplicate_probabilities)
        is_duplicate_df.to_csv(output_predictions_path,
                               index_label="test_id",
                               header=["is_duplicate"])
 def setUp(self):
     """Write the train and test fixture files and read the train data.

     The result of reading the train file is discarded; the call is
     presumably made so the ``DataManager`` is fitted on the train data
     before any test data is read -- confirm against ``DataManager``.
     """
     super(TestDataManagerTest, self).setUp()
     self.write_duplicate_questions_train_file()
     self.write_duplicate_questions_test_file()
     manager = DataManager(STSInstance)
     self.data_manager = manager
     train_files = [self.TRAIN_FILE]
     manager.get_train_data_from_file(train_files)
class TestDataManagerTest(DuplicateTestCase):
    """Tests for reading and batching test data with a ``DataManager``.

    Every test reads ``TEST_FILE`` through a ``DataManager`` that has
    already read the train file in ``setUp``, and checks the indexed
    inputs yielded by the returned generator.
    """

    @overrides
    def setUp(self):
        """Write the fixture files and read the train data."""
        super(TestDataManagerTest, self).setUp()
        self.write_duplicate_questions_train_file()
        self.write_duplicate_questions_test_file()
        self.data_manager = DataManager(STSInstance)
        # The return value is not needed here; presumably this call fits
        # the data manager's indexer on the train data -- TODO confirm.
        self.data_manager.get_train_data_from_file([self.TRAIN_FILE])

    def test_get_test_data_default(self):
        """Default options yield padded word indices, and are repeatable."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE])
        assert test_size == 3
        test_gen = get_test_gen()
        inputs1, labels1 = next(test_gen)
        assert_allclose(inputs1[0], np.array([2, 1]))
        assert_allclose(inputs1[1], np.array([1, 0]))

        inputs2, labels2 = next(test_gen)
        assert_allclose(inputs2[0], np.array([4, 0]))
        assert_allclose(inputs2[1], np.array([5, 1]))

        inputs3, labels3 = next(test_gen)
        assert_allclose(inputs3[0], np.array([6, 0]))
        assert_allclose(inputs3[1], np.array([7, 0]))

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

        # Test that we can make a new test generator
        new_test_gen = get_test_gen()
        # Verify that the new and old generator are not the same object
        assert new_test_gen is not test_gen
        new_inputs1, new_labels1 = next(new_test_gen)
        assert_allclose(new_inputs1, inputs1)
        assert_allclose(new_labels1, labels1)
        new_inputs2, new_labels2 = next(new_test_gen)
        assert_allclose(new_inputs2, inputs2)
        assert_allclose(new_labels2, labels2)
        new_inputs3, new_labels3 = next(new_test_gen)
        assert_allclose(new_inputs3, inputs3)
        assert_allclose(new_labels3, labels3)
        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(new_test_gen)

    def test_get_test_data_default_character(self):
        """``mode="character"`` yields per-word character index matrices."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], mode="character")
        test_gen = get_test_gen()
        assert test_size == 3
        inputs1, labels = next(test_gen)
        assert_allclose(inputs1[0], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 10, 0, 0, 0],
                                              [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 10]]))
        assert_allclose(inputs1[1], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 11],
                                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0

        inputs2, labels = next(test_gen)
        assert_allclose(inputs2[0], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 12, 19, 17, 18],
                                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs2[1], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 13, 0, 0, 0],
                                              [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 12]]))
        assert len(labels) == 0

        inputs3, labels = next(test_gen)
        assert_allclose(inputs3[0], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 14, 0, 0, 0],
                                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs3[1], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 15, 0, 0, 0],
                                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0
        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

    def test_get_test_data_default_word_and_character(self):
        """``mode="word+character"`` yields four inputs per instance.

        The expected order is word indices then character indices for the
        first sentence, followed by the same pair for the second sentence.
        """
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], mode="word+character")
        test_gen = get_test_gen()
        assert test_size == 3
        inputs1, labels = next(test_gen)
        assert_allclose(inputs1[0], np.array([2, 1]))
        assert_allclose(inputs1[1], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 10, 0, 0, 0],
                                              [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 10]]))
        assert_allclose(inputs1[2], np.array([1, 0]))
        assert_allclose(inputs1[3], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 11],
                                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0

        inputs2, labels = next(test_gen)
        assert_allclose(inputs2[0], np.array([4, 0]))
        assert_allclose(inputs2[1], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 12, 19, 17, 18],
                                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs2[2], np.array([5, 1]))
        assert_allclose(inputs2[3], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 13, 0, 0, 0],
                                              [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 12]]))
        assert len(labels) == 0

        inputs3, labels = next(test_gen)
        assert_allclose(inputs3[0], np.array([6, 0]))
        assert_allclose(inputs3[1], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 14, 0, 0, 0],
                                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs3[2], np.array([7, 0]))
        assert_allclose(inputs3[3], np.array([[6, 9, 2, 7, 8, 3, 5, 4, 15, 0, 0, 0],
                                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

    def test_get_test_data_pad_with_max_lens(self):
        """A ``num_sentence_words`` limit of 1 yields one index per sentence."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE],
            max_lengths={"num_sentence_words": 1})
        test_gen = get_test_gen()
        assert test_size == 3

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([2]))
        assert_allclose(inputs[1], np.array([1]))
        assert len(labels) == 0

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([4]))
        assert_allclose(inputs[1], np.array([5]))
        assert len(labels) == 0

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([6]))
        assert_allclose(inputs[1], np.array([7]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

    def test_get_test_data_with_max_instances(self):
        """``max_instances=2`` limits the data to the first two instances."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE],
            max_instances=2)
        test_gen = get_test_gen()
        assert test_size == 2

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([2, 1]))
        assert_allclose(inputs[1], np.array([1, 0]))
        assert len(labels) == 0

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([4, 0]))
        assert_allclose(inputs[1], np.array([5, 1]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

    def test_get_test_data_errors(self):
        """Invalid option combinations must raise ``ValueError``."""
        # max_lengths given together with pad=False.
        with self.assertRaises(ValueError):
            self.data_manager.get_test_data_from_file(
                [self.TEST_FILE],
                max_lengths={"num_sentence_words": 1},
                pad=False)
        # max_lengths with an unrecognized key.
        with self.assertRaises(ValueError):
            self.data_manager.get_test_data_from_file(
                [self.TEST_FILE],
                max_lengths={"some wrong key": 1})

    def test_get_test_data_no_pad(self):
        """``pad=False`` yields index sequences at their own lengths."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE],
            pad=False)
        test_gen = get_test_gen()
        assert test_size == 3

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([2, 1, 2]))
        assert_allclose(inputs[1], np.array([1]))
        assert len(labels) == 0

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([4]))
        assert_allclose(inputs[1], np.array([5, 1]))
        assert len(labels) == 0

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([6]))
        assert_allclose(inputs[1], np.array([7]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

    def test_generate_test_batches(self):
        """Batching the test data is correct and repeatable.

        With 3 test instances and a batch size of 2, each generator must
        yield a batch of two instances, a batch of one, and then stop.
        """
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE])
        batch_gen = self.data_manager.get_batch_generator(get_test_gen, 2)
        new_batch_gen = DataManager.get_batch_generator(get_test_gen, 2)

        # Assert that the new generator is a different object
        # than the old generator.
        assert new_batch_gen is not batch_gen
        assert test_size == 3

        inputs, labels = next(batch_gen)
        new_inputs, new_labels = next(new_batch_gen)
        assert len(inputs) == 2
        assert len(labels) == 0

        # Ensure output matches ground truth
        assert_allclose(inputs[0], np.array([[2, 1], [4, 0]]))
        assert_allclose(inputs[1], np.array([[1, 0], [5, 1]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])

        inputs, labels = next(batch_gen)
        new_inputs, new_labels = next(new_batch_gen)
        assert len(inputs) == 2
        assert len(labels) == 0

        # Ensure output matches ground truth
        assert_allclose(inputs[0], np.array([[6, 0]]))
        assert_allclose(inputs[1], np.array([[7, 0]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])

        # Check exhaustion of each generator in its own block: the first
        # raise ends an assertRaises block, so a second call placed in the
        # same block would never run.
        with self.assertRaises(StopIteration):
            next(batch_gen)
        with self.assertRaises(StopIteration):
            next(new_batch_gen)
 def setUp(self):
     """Write the train fixture file and create a fresh ``DataManager``."""
     super(TestDataManagerTrain, self).setUp()
     self.write_duplicate_questions_train_file()
     manager = DataManager(STSInstance)
     self.data_manager = manager
def main():
    """Train the Siamese matching BiLSTM, or predict with a trained one.

    Command-line entry point. The positional ``mode`` argument selects
    the behaviour:

    * ``train``: index the train and validation files with a new
      ``DataManager``, build the model, pickle the fitted ``DataManager``
      next to the checkpoints and run training.
    * ``predict``: load the pickled ``DataManager`` given by
      ``--dataindexer_load_path``, index the test file, run prediction
      with the checkpoints in ``--model_load_dir`` and write the
      duplicate probabilities to a CSV file in the log directory.

    In both modes the parsed parameters are dumped to
    ``<log_dir>/<model_name>/<run_id>/<mode>params.json``.

    Exits with status 2 through ``argparse`` when the arguments are
    invalid, including ``predict`` without ``--dataindexer_load_path``.
    """
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)

    # Parse config arguments
    argparser = argparse.ArgumentParser(
        description=("Run the Siamese BiLSTM model with an added "
                     "matching layer for paraphrase detection."))
    argparser.add_argument("mode", type=str,
                           choices=["train", "predict"],
                           help=("One of {train|predict}, to "
                                 "indicate what you want the model to do. "
                                 "If you pick \"predict\", then you must also "
                                 "supply the path to a pretrained model and "
                                 "DataIndexer to load."))
    argparser.add_argument("--model_load_dir", type=str,
                           help=("The path to a directory with checkpoints to "
                                 "load for evaluation or prediction. The "
                                 "latest checkpoint will be loaded."))
    argparser.add_argument("--dataindexer_load_path", type=str,
                           help=("The path to the DataIndexer fit on the "
                                 "train data, so we can properly index the "
                                 "test data for evaluation or prediction."))
    argparser.add_argument("--train_file", type=str,
                           default=os.path.join(project_dir,
                                                "data/processed/quora/"
                                                "train_cleaned_train_split.csv"),
                           help="Path to a file to train on.")
    argparser.add_argument("--val_file", type=str,
                           default=os.path.join(project_dir,
                                                "data/processed/quora/"
                                                "train_cleaned_val_split.csv"),
                           help="Path to a file to monitor validation acc. on.")
    argparser.add_argument("--test_file", type=str,
                           default=os.path.join(project_dir,
                                                "data/processed/quora/"
                                                "test_final.csv"))
    argparser.add_argument("--batch_size", type=int, default=128,
                           help="Number of instances per batch.")
    argparser.add_argument("--num_epochs", type=int, default=10,
                           help=("Number of epochs to perform in "
                                 "training."))
    argparser.add_argument("--early_stopping_patience", type=int, default=0,
                           help=("number of epochs with no validation "
                                 "accuracy improvement after which training "
                                 "will be stopped"))
    argparser.add_argument("--num_sentence_words", type=int, default=30,
                           help=("The maximum length of a sentence. Longer "
                                 "sentences will be truncated, and shorter "
                                 "ones will be padded."))
    argparser.add_argument("--word_embedding_dim", type=int, default=300,
                           help="Dimensionality of the word embedding layer")
    argparser.add_argument("--pretrained_embeddings_file_path", type=str,
                           help="Path to a file with pretrained embeddings.",
                           default=os.path.join(project_dir,
                                                "data/external/",
                                                "glove.6B.300d.txt"))
    argparser.add_argument("--fine_tune_embeddings", action="store_true",
                           help=("Whether to train the embedding layer "
                                 "(if True), or keep it fixed (False)."))
    argparser.add_argument("--rnn_hidden_size", type=int, default=256,
                           help=("The output dimension of the RNN."))
    argparser.add_argument("--share_encoder_weights", action="store_true",
                           help=("Whether to use the same encoder on both "
                                 "input sentences (thus sharing weights), "
                                 "or a different one for each sentence"))
    argparser.add_argument("--output_keep_prob", type=float, default=1.0,
                           help=("The proportion of RNN outputs to keep, "
                                 "where the rest are dropped out."))
    argparser.add_argument("--log_period", type=int, default=10,
                           help=("Number of steps between each summary "
                                 "op evaluation."))
    argparser.add_argument("--val_period", type=int, default=250,
                           help=("Number of steps between each evaluation of "
                                 "validation performance."))
    argparser.add_argument("--log_dir", type=str,
                           default=os.path.join(project_dir,
                                                "logs/"),
                           help=("Directory to save logs to."))
    argparser.add_argument("--save_period", type=int, default=250,
                           help=("Number of steps between each "
                                 "model checkpoint"))
    argparser.add_argument("--save_dir", type=str,
                           default=os.path.join(project_dir,
                                                "models/"),
                           help=("Directory to save model checkpoints to."))
    argparser.add_argument("--run_id", type=str, required=True,
                           help=("Identifying run ID for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train run_id"))
    argparser.add_argument("--model_name", type=str, required=True,
                           help=("Identifying model name for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train model_name"))
    argparser.add_argument("--reweight_predictions_for_kaggle", action="store_true",
                           help=("Only relevant when predicting. Whether to "
                                 "reweight the prediction probabilities to "
                                 "account for class proportion discrepancy "
                                 "between train and test."))

    config = argparser.parse_args()

    model_name = config.model_name
    run_id = config.run_id
    mode = config.mode

    # Fail early with a usage message instead of a TypeError from
    # open(None) further down.
    if mode == "predict" and not config.dataindexer_load_path:
        argparser.error("--dataindexer_load_path is required when "
                        "mode is \"predict\"")

    # Get the data.
    batch_size = config.batch_size
    if mode == "train":
        # Read the train data from a file, and use it to index the
        # validation data
        data_manager = DataManager(STSInstance)
        num_sentence_words = config.num_sentence_words
        get_train_data_gen, train_data_size = data_manager.get_train_data_from_file(
            [config.train_file], max_lengths={"num_sentence_words": num_sentence_words})
        get_val_data_gen, val_data_size = data_manager.get_validation_data_from_file(
            [config.val_file], max_lengths={"num_sentence_words": num_sentence_words})
    else:
        # Load the fitted DataManager, and use it to index the test data
        logger.info("Loading pickled DataManager from {}".format(
            config.dataindexer_load_path))
        # NOTE: unpickling can execute arbitrary code; only load
        # DataManager files produced by a trusted training run.
        with open(config.dataindexer_load_path, "rb") as data_manager_file:
            data_manager = pickle.load(data_manager_file)
        get_test_data_gen, test_data_size = data_manager.get_test_data_from_file(
            [config.test_file])

    vars(config)["word_vocab_size"] = data_manager.data_indexer.get_vocab_size()

    # Log the run parameters.
    log_dir = config.log_dir
    log_path = os.path.join(log_dir, model_name, run_id.zfill(2))
    logger.info("Writing logs to {}".format(log_path))
    if not os.path.exists(log_path):
        logger.info("log path {} does not exist, "
                    "creating it".format(log_path))
        os.makedirs(log_path)
    params_path = os.path.join(log_path, mode + "params.json")
    logger.info("Writing params to {}".format(params_path))
    # The params are dumped before the embedding matrix is added to the
    # config below, so only the plain argument values are serialized.
    with open(params_path, 'w') as params_file:
        json.dump(vars(config), params_file, indent=4)

    # Get the embeddings.
    embedding_manager = EmbeddingManager(data_manager.data_indexer)
    embedding_matrix = embedding_manager.get_embedding_matrix(
        config.word_embedding_dim,
        config.pretrained_embeddings_file_path)
    vars(config)["word_embedding_matrix"] = embedding_matrix

    # Initialize the model.
    model = SiameseMatchingBiLSTM(vars(config))
    model.build_graph()

    if mode == "train":
        # Train the model.
        num_epochs = config.num_epochs
        num_train_steps_per_epoch = int(math.ceil(train_data_size / batch_size))
        num_val_steps = int(math.ceil(val_data_size / batch_size))
        log_period = config.log_period
        val_period = config.val_period

        save_period = config.save_period
        save_dir = os.path.join(config.save_dir, model_name, run_id.zfill(2) + "/")
        save_path = os.path.join(save_dir, model_name + "-" + run_id.zfill(2))

        logger.info("Checkpoints will be written to {}".format(save_dir))
        if not os.path.exists(save_dir):
            logger.info("save path {} does not exist, "
                        "creating it".format(save_dir))
            os.makedirs(save_dir)

        logger.info("Saving fitted DataManager to {}".format(save_dir))
        data_manager_pickle_name = "{}-{}-DataManager.pkl".format(model_name,
                                                                  run_id.zfill(2))
        # Use a context manager so the pickle is flushed and closed
        # before the (long) training run starts.
        data_manager_pickle_path = os.path.join(save_dir,
                                                data_manager_pickle_name)
        with open(data_manager_pickle_path, "wb") as data_manager_file:
            pickle.dump(data_manager, data_manager_file)

        patience = config.early_stopping_patience
        model.train(get_train_instance_generator=get_train_data_gen,
                    get_val_instance_generator=get_val_data_gen,
                    batch_size=batch_size,
                    num_train_steps_per_epoch=num_train_steps_per_epoch,
                    num_epochs=num_epochs,
                    num_val_steps=num_val_steps,
                    save_path=save_path,
                    log_path=log_path,
                    log_period=log_period,
                    val_period=val_period,
                    save_period=save_period,
                    patience=patience)
    else:
        # Predict with the model
        model_load_dir = config.model_load_dir
        num_test_steps = int(math.ceil(test_data_size / batch_size))
        # Numpy array of shape (num_test_examples, 2)
        raw_predictions = model.predict(get_test_instance_generator=get_test_data_gen,
                                        model_load_dir=model_load_dir,
                                        batch_size=batch_size,
                                        num_test_steps=num_test_steps)

        # Remove the first column, so we're left with just the probabilities
        # that a question is a duplicate.
        is_duplicate_probabilities = np.delete(raw_predictions, 0, 1)

        # The class balance between kaggle train and test seems different.
        # This edits prediction probability to account for the discrepancy.
        # See: https://www.kaggle.com/c/quora-question-pairs/discussion/31179
        if config.reweight_predictions_for_kaggle:
            positive_weight = 0.165 / 0.37
            negative_weight = (1 - 0.165) / (1 - 0.37)
            is_duplicate_probabilities = ((positive_weight * is_duplicate_probabilities) /
                                          (positive_weight * is_duplicate_probabilities +
                                           negative_weight *
                                           (1 - is_duplicate_probabilities)))

        # Write the predictions to an output submission file
        output_predictions_path = os.path.join(log_path, model_name + "-" +
                                               run_id.zfill(2) +
                                               "-output_predictions.csv")
        logger.info("Writing predictions to {}".format(output_predictions_path))
        is_duplicate_df = pd.DataFrame(is_duplicate_probabilities)
        is_duplicate_df.to_csv(output_predictions_path, index_label="test_id",
                               header=["is_duplicate"])
Example #10
0
class TestBiMPM(DuplicateTestCase):
    """Smoke test for BiMPM: train briefly, then reload and predict."""

    @overrides
    def setUp(self):
        """Write the fixture files, index them and assemble the model config."""
        super(TestBiMPM, self).setUp()
        for write_fixture_file in (
                self.write_duplicate_questions_train_file,
                self.write_duplicate_questions_validation_file,
                self.write_duplicate_questions_test_file):
            write_fixture_file()

        # Small hyperparameters keep the test fast.
        self.batch_size = 3
        self.word_embedding_dim = 5
        self.char_embedding_dim = 2
        self.char_rnn_hidden_size = 6
        self.context_rnn_hidden_size = 3
        self.aggregation_rnn_hidden_size = 4
        self.dropout_ratio = 0.1

        # Train data must be read first: it is what fits the indexer that
        # the validation and test data are then indexed with.
        indexing_mode = "word+character"
        manager = DataManager(STSInstance)
        self.data_manager = manager
        train_data = manager.get_train_data_from_file(
            [self.TRAIN_FILE], mode=indexing_mode)
        self.get_train_gen, self.train_size = train_data
        val_data = manager.get_validation_data_from_file(
            [self.VALIDATION_FILE], mode=indexing_mode)
        self.get_val_gen, self.val_size = val_data
        test_data = manager.get_test_data_from_file(
            [self.TEST_FILE], mode=indexing_mode)
        self.get_test_gen, self.test_size = test_data

        indexer = manager.data_indexer
        self.embedding_manager = EmbeddingManager(indexer)
        self.word_embedding_matrix = self.embedding_manager.get_embedding_matrix(
            self.word_embedding_dim)
        self.char_embedding_matrix = self.embedding_manager.get_embedding_matrix(
            self.char_embedding_dim)

        self.config_dict = {
            "mode": "train",
            "word_vocab_size": indexer.get_vocab_size(),
            "word_embedding_dim": self.word_embedding_dim,
            "word_embedding_matrix": self.word_embedding_matrix,
            "char_vocab_size": indexer.get_vocab_size(namespace="characters"),
            "char_embedding_dim": self.char_embedding_dim,
            "char_embedding_matrix": self.char_embedding_matrix,
            "char_rnn_hidden_size": self.char_rnn_hidden_size,
            "fine_tune_embeddings": False,
            "context_rnn_hidden_size": self.context_rnn_hidden_size,
            "aggregation_rnn_hidden_size": self.aggregation_rnn_hidden_size,
            "dropout_ratio": self.dropout_ratio
        }

        def steps_needed(num_instances):
            # Batches required to cover every instance once.
            return int(math.ceil(num_instances / self.batch_size))

        self.num_train_steps_per_epoch = steps_needed(self.train_size)
        self.num_val_steps = steps_needed(self.val_size)
        self.num_test_steps = steps_needed(self.test_size)

    def test_default_does_not_crash(self):
        """Train for two epochs, then rebuild the model and predict."""
        # Initialize and train the model
        trained_model = BiMPM(self.config_dict)
        trained_model.build_graph()
        train_kwargs = {
            "get_train_instance_generator": self.get_train_gen,
            "get_val_instance_generator": self.get_val_gen,
            "batch_size": self.batch_size,
            "num_train_steps_per_epoch": self.num_train_steps_per_epoch,
            "num_epochs": 2,
            "num_val_steps": self.num_val_steps,
            "save_path": self.TEST_DIR,
            "log_path": self.TEST_DIR,
            "log_period": 2,
            "val_period": 2,
            "save_period": 2,
            "patience": 0,
        }
        trained_model.train(**train_kwargs)

        tf.reset_default_graph()

        # Load and predict with the model; the test-mode config carries
        # no embedding matrices.
        self.config_dict["mode"] = "test"
        for matrix_key in ("word_embedding_matrix", "char_embedding_matrix"):
            del self.config_dict[matrix_key]
        restored_model = BiMPM(self.config_dict)
        restored_model.build_graph()
        restored_model.predict(
            get_test_instance_generator=self.get_test_gen,
            model_load_dir=self.TEST_DIR,
            batch_size=self.batch_size,
            num_test_steps=self.num_test_steps)
Example #11
0
def main():
    """Train the baseline Siamese BiLSTM, or predict with a trained one.

    Command-line entry point; options may also come from a config file
    (``configargparse``). The default config file path is relative, so it
    is presumably resolved against the current working directory.

    All file locations are derived by ``construct_paths`` from the
    parsed arguments. The positional ``mode`` selects the behaviour:

    * ``train``: index the train and validation files with a new
      ``DataManager`` over ``CodeInstance``, build the model, pickle the
      fitted ``DataManager`` and run training.
    * ``predict``: load the pickled ``DataManager``, index the test file,
      run prediction, append the duplicate probability as an extra column
      to the rows of the test file, write the result (no header, no
      index) to the predictions file and pass that file to
      ``plot_pairs``.

    In both modes the parsed parameters are dumped to
    ``<log path>/<mode>params.json``.
    """
    default_run_id = "01"
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)

    # Parse config arguments
    argparser = configargparse.ArgumentParser(
        default_config_files=['../../config/default.yml'],
        description=("Run a baseline Siamese BiLSTM model "
                     "for paraphrase identification."))
    argparser.add_argument("mode",
                           type=str,
                           choices=["train", "predict"],
                           help=("One of {train|predict}, to "
                                 "indicate what you want the model to do. "
                                 "If you pick \"predict\", then you must also "
                                 "supply the path to a pretrained model and "
                                 "DataIndexer to load."))
    argparser.add_argument("--config_file",
                           is_config_file_arg=True,
                           help="The path to a config file.")
    argparser.add_argument(
        "--data_file_dir",
        type=str,
        default=os.path.join(project_dir, "data/processed/"),
        help="Path of the dir to the (train, val, test).csv files.")
    argparser.add_argument("--train_filename",
                           type=str,
                           default="train.csv",
                           help="Basename of the train file.")
    argparser.add_argument("--val_filename",
                           type=str,
                           default="train.csv",
                           help="Basename of the validation file.")
    argparser.add_argument("--test_filename",
                           type=str,
                           default="test.csv",
                           help="Basename of the test file.")
    argparser.add_argument("--batch_size",
                           type=int,
                           default=128,
                           help="Number of instances per batch.")
    argparser.add_argument("--num_epochs",
                           type=int,
                           default=10,
                           help=("Number of epochs to perform in "
                                 "training."))
    argparser.add_argument("--early_stopping_patience",
                           type=int,
                           default=0,
                           help=("number of epochs with no validation "
                                 "accuracy improvement after which training "
                                 "will be stopped"))
    argparser.add_argument("--num_sentence_words",
                           type=int,
                           default=30,
                           help=("The maximum length of a sentence. Longer "
                                 "sentences will be truncated, and shorter "
                                 "ones will be padded."))
    argparser.add_argument("--embedding_file_path_template",
                           type=str,
                           help="Path to a file with pretrained embeddings.",
                           default=os.path.join(project_dir,
                                                "data/external/bcb",
                                                "{name}.{dim}d.txt"))
    argparser.add_argument("--embedding_file_name",
                           type=str,
                           help="Name of the embedding file.")
    argparser.add_argument("--word_embedding_dim",
                           type=int,
                           default=8,
                           help="Dimensionality of the word embedding layer")
    argparser.add_argument("--fine_tune_embeddings",
                           action="store_true",
                           help=("Whether to train the embedding layer "
                                 "(if True), or keep it fixed (False)."))
    argparser.add_argument("--rnn_hidden_size",
                           type=int,
                           default=256,
                           help=("The output dimension of the RNN."))
    argparser.add_argument("--share_encoder_weights",
                           action="store_true",
                           help=("Whether to use the same encoder on both "
                                 "input sentences (thus sharing weights), "
                                 "or a different one for each sentence"))
    argparser.add_argument("--rnn_output_mode",
                           type=str,
                           default="last",
                           choices=["mean_pool", "last"],
                           help=("How to calculate the final sentence "
                                 "representation from the RNN outputs. "
                                 "\"mean_pool\" indicates that the outputs "
                                 "will be averaged (with respect to padding), "
                                 "and \"last\" indicates that the last "
                                 "relevant output will be used as the "
                                 "sentence representation."))
    argparser.add_argument("--output_keep_prob",
                           type=float,
                           default=1.0,
                           help=("The proportion of RNN outputs to keep, "
                                 "where the rest are dropped out."))
    argparser.add_argument("--log_period",
                           type=int,
                           default=10,
                           help=("Number of steps between each summary "
                                 "op evaluation."))
    argparser.add_argument("--val_period",
                           type=int,
                           default=250,
                           help=("Number of steps between each evaluation of "
                                 "validation performance."))
    argparser.add_argument("--log_dir",
                           type=str,
                           default=os.path.join(project_dir, "logs/"),
                           help=("Directory to save logs to."))
    argparser.add_argument("--save_period",
                           type=int,
                           default=250,
                           help=("Number of steps between each "
                                 "model checkpoint"))
    argparser.add_argument("--model_save_root",
                           type=str,
                           default=os.path.join(project_dir, "models/"),
                           help=("Directory to save model checkpoints to."))
    argparser.add_argument("--token_file_dir",
                           type=str,
                           help=("Directory to token files."))
    argparser.add_argument("--token_file_ext",
                           type=str,
                           help=("File extensions of token files."))
    argparser.add_argument("--run_id",
                           type=str,
                           default=default_run_id,
                           help=("Identifying run ID for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train run_id"))
    argparser.add_argument("--model_name",
                           type=str,
                           help=("Identifying model name for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train model_name"))
    argparser.add_argument("--reweight_predictions_for_kaggle",
                           action="store_true",
                           help=("Only relevant when predicting. Whether to "
                                 "reweight the prediction probabilities to "
                                 "account for class proportion discrepancy "
                                 "between train and test."))

    config = argparser.parse_args()

    model_name = config.model_name
    run_id = config.run_id.zfill(2)
    mode = config.mode
    batch_size = config.batch_size

    paths = construct_paths(
        model_name, run_id, config.data_file_dir, config.train_filename,
        config.val_filename, config.test_filename, config.model_save_root,
        config.log_dir, config.embedding_file_path_template,
        config.embedding_file_name, config.word_embedding_dim)
    model_save_file_path = paths['model_save_file_path']
    model_save_dir = paths['model_save_dir']
    data_manager_pickle_file_path = paths['data_manager_pickle_file_path']

    if mode == "train":
        # Read the train data from a file, and use it to index the validation data

        # TODO: determine the instance type (e.g. STSInstance) from config
        CodeInstance.set_token_file(config.token_file_dir,
                                    config.token_file_ext)
        data_manager = DataManager(CodeInstance)
        num_sentence_words = config.num_sentence_words
        get_train_data_gen, train_data_size = data_manager.get_train_data_from_file(
            [paths['train_file_path']],
            max_lengths={"num_sentence_words": num_sentence_words})
        get_val_data_gen, val_data_size = data_manager.get_validation_data_from_file(
            [paths['val_file_path']],
            max_lengths={"num_sentence_words": num_sentence_words})
    else:
        # Load the fitted DataManager, and use it to index the test data
        logger.info("Loading pickled DataManager "
                    "from {}".format(data_manager_pickle_file_path))
        # NOTE: unpickling can execute arbitrary code; only load
        # DataManager files produced by a trusted training run.
        with open(data_manager_pickle_file_path, "rb") as data_manager_file:
            data_manager = pickle.load(data_manager_file)
        test_data_gen, test_data_size = data_manager.get_test_data_from_file(
            [paths['test_file_path']])

    vars(config)["word_vocab_size"] = data_manager.data_indexer.get_vocab_size(
    )

    # Log the run parameters.
    log_path = paths['log_file_path']
    logger.info("Writing logs to {}".format(log_path))
    if not os.path.exists(log_path):
        logger.info("log path {} does not exist, "
                    "creating it".format(log_path))
        os.makedirs(log_path)

    params_path = os.path.join(log_path, mode + "params.json")
    logger.info("Writing params to {}".format(params_path))
    # The params are dumped before the embedding matrix is added to the
    # config below, so only the plain argument values are serialized.
    with open(params_path, 'w') as params_file:
        json.dump(vars(config), params_file, indent=4)

    # Get the embeddings.
    embedding_manager = EmbeddingManager(data_manager.data_indexer)
    embedding_matrix = embedding_manager.get_embedding_matrix(
        config.word_embedding_dim, paths['embedding_file_path'])
    vars(config)["word_embedding_matrix"] = embedding_matrix

    # Initialize the model.
    model = SiameseBiLSTM(vars(config))
    model.build_graph()

    if mode == "train":
        # Train the model.
        num_epochs = config.num_epochs
        num_train_steps_per_epoch = int(math.ceil(train_data_size /
                                                  batch_size))
        num_val_steps = int(math.ceil(val_data_size / batch_size))
        log_period = config.log_period
        val_period = config.val_period

        save_period = config.save_period

        logger.info("Checkpoints will be written to {}".format(model_save_dir))
        if not os.path.exists(model_save_dir):
            logger.info("save path {} does not exist, "
                        "creating it".format(model_save_dir))
            os.makedirs(model_save_dir)

        logger.info("Saving fitted DataManager to {}".format(model_save_dir))
        # Use a context manager so the pickle is flushed and closed
        # before the (long) training run starts.
        with open(data_manager_pickle_file_path, "wb") as data_manager_file:
            pickle.dump(data_manager, data_manager_file)

        patience = config.early_stopping_patience
        model.train(get_train_instance_generator=get_train_data_gen,
                    get_val_instance_generator=get_val_data_gen,
                    batch_size=batch_size,
                    num_train_steps_per_epoch=num_train_steps_per_epoch,
                    num_epochs=num_epochs,
                    num_val_steps=num_val_steps,
                    save_path=model_save_file_path,
                    log_path=log_path,
                    log_period=log_period,
                    val_period=val_period,
                    save_period=save_period,
                    patience=patience)
    else:
        # Predict with the model
        num_test_steps = int(math.ceil(test_data_size / batch_size))
        # raw_predictions: numpy array of shape (num_test_examples, 2).
        # The encodings returned alongside are not written out.
        raw_predictions, encodings = model.predict(
            get_test_instance_generator=test_data_gen,
            model_load_dir=model_save_dir,
            batch_size=batch_size,
            num_test_steps=num_test_steps)
        # Remove the first column, so we're left with just the probabilities
        # that a question is a duplicate.
        is_duplicate_probabilities = np.delete(raw_predictions, 0, 1)

        # The class balance between kaggle train and test seems different.
        # This edits prediction probability to account for the discrepancy.
        # See: https://www.kaggle.com/c/quora-question-pairs/discussion/31179
        if config.reweight_predictions_for_kaggle:
            positive_weight = 0.165 / 0.37
            negative_weight = (1 - 0.165) / (1 - 0.37)
            is_duplicate_probabilities = (
                (positive_weight * is_duplicate_probabilities) /
                (positive_weight * is_duplicate_probabilities +
                 negative_weight * (1 - is_duplicate_probabilities)))

        # Write the predictions to an output submission file
        predictions_file_path = paths['predictions_file_path']
        logger.info("Writing predictions to {}".format(predictions_file_path))
        is_duplicate_df = pd.DataFrame(is_duplicate_probabilities)

        # Append the probability column to the rows of the test file.
        pair_info_df = pd.read_csv(paths['test_file_path'], header=None)
        result = pd.DataFrame(np.hstack((pair_info_df, is_duplicate_df)))

        result.to_csv(predictions_file_path, index=False, header=False)
        plot_pairs(predictions_file_path)
class TestSiameseBiLSTM(DuplicateTestCase):
    """Smoke tests that train, save, reload and predict with SiameseBiLSTM.

    Each test tweaks one entry of ``self.config_dict`` (or none) and then
    runs the same train -> reset graph -> reload -> predict round trip; the
    tests pass as long as nothing raises.
    """

    @overrides
    def setUp(self):
        """Write the fixture files and build the data generators and config."""
        super(TestSiameseBiLSTM, self).setUp()
        self.write_duplicate_questions_train_file()
        self.write_duplicate_questions_validation_file()
        self.write_duplicate_questions_test_file()
        self.data_manager = DataManager(STSInstance)
        self.batch_size = 2
        self.get_train_gen, self.train_size = self.data_manager.get_train_data_from_file(
            [self.TRAIN_FILE])
        self.get_val_gen, self.val_size = self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE])
        self.get_test_gen, self.test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE])

        self.embedding_manager = EmbeddingManager(self.data_manager.data_indexer)
        self.word_embedding_dim = 5
        self.embedding_matrix = self.embedding_manager.get_embedding_matrix(
            self.word_embedding_dim)
        self.rnn_hidden_size = 6
        self.rnn_output_mode = "last"
        self.output_keep_prob = 1.0
        self.share_encoder_weights = True
        # Baseline model configuration; individual tests override single keys.
        self.config_dict = {
            "mode": "train",
            "word_vocab_size": self.data_manager.data_indexer.get_vocab_size(),
            "word_embedding_dim": self.word_embedding_dim,
            "fine_tune_embeddings": False,
            "word_embedding_matrix": self.embedding_matrix,
            "rnn_hidden_size": self.rnn_hidden_size,
            "rnn_output_mode": self.rnn_output_mode,
            "output_keep_prob": self.output_keep_prob,
            "share_encoder_weights": self.share_encoder_weights
        }
        # Round up so a final, partially filled batch still counts as a step.
        self.num_train_steps_per_epoch = int(math.ceil(self.train_size / self.batch_size))
        self.num_val_steps = int(math.ceil(self.val_size / self.batch_size))
        self.num_test_steps = int(math.ceil(self.test_size / self.batch_size))

    def _train_then_reload_and_predict(self):
        """Train a model from ``self.config_dict``, then reload it and predict.

        Mutates ``self.config_dict``: switches "mode" to "test" and removes
        "word_embedding_matrix" before the second model is constructed.
        """
        # Initialize the model
        model = SiameseBiLSTM(self.config_dict)
        model.build_graph()
        # Train the model
        model.train(get_train_instance_generator=self.get_train_gen,
                    get_val_instance_generator=self.get_val_gen,
                    batch_size=self.batch_size,
                    num_train_steps_per_epoch=self.num_train_steps_per_epoch,
                    num_epochs=2,
                    num_val_steps=self.num_val_steps,
                    save_path=self.TEST_DIR,
                    log_path=self.TEST_DIR,
                    log_period=2,
                    val_period=2,
                    save_period=2,
                    patience=0)

        tf.reset_default_graph()
        # Load and predict with the model
        self.config_dict["mode"] = "test"
        del self.config_dict["word_embedding_matrix"]
        loaded_model = SiameseBiLSTM(self.config_dict)
        loaded_model.build_graph()
        loaded_model.predict(get_test_instance_generator=self.get_test_gen,
                             model_load_dir=self.TEST_DIR,
                             batch_size=self.batch_size,
                             num_test_steps=self.num_test_steps)

    def test_default_does_not_crash(self):
        """Round trip with the unmodified baseline configuration."""
        self._train_then_reload_and_predict()

    def test_mean_pool_does_not_crash(self):
        """Round trip with ``rnn_output_mode`` set to "mean_pool"."""
        self.config_dict["rnn_output_mode"] = "mean_pool"
        self._train_then_reload_and_predict()

    def test_non_sharing_encoders_does_not_crash(self):
        """Round trip with ``share_encoder_weights`` disabled."""
        self.config_dict["share_encoder_weights"] = False
        self._train_then_reload_and_predict()
Example #13
0
def main():
    """Train or predict with a SiameseMatchingBiLSTM from the command line.

    Parses the command-line arguments, writes them as JSON into the run's
    log directory, builds the embedding matrix and the model graph, and then
    depending on the positional ``mode`` argument either:

    * ``train``: indexes the train/validation files, pickles the fitted
      DataManager next to the checkpoints and trains the model, or
    * ``predict``: unpickles a previously saved DataManager, indexes the
      test file, runs prediction and writes a ``test_id,result`` CSV into
      the run's log directory.
    """
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)

    # Parse config arguments
    argparser = argparse.ArgumentParser(
        description=("Run the Siamese BiLSTM model with an added "
                     "matching layer for paraphrase detection."))
    argparser.add_argument("mode",
                           type=str,
                           choices=["train", "predict"],
                           help=("One of {train|predict}, to "
                                 "indicate what you want the model to do. "
                                 "If you pick \"predict\", then you must also "
                                 "supply the path to a pretrained model and "
                                 "DataIndexer to load."))
    argparser.add_argument("--model_load_dir",
                           type=str,
                           default=os.path.join(project_dir, "models/"),
                           help=("The path to a directory with checkpoints to "
                                 "load for evaluation or prediction. The "
                                 "latest checkpoint will be loaded."))
    argparser.add_argument("--dataindexer_load_path",
                           type=str,
                           default=os.path.join(project_dir, "models/"),
                           help=("The path to the DataIndexer fit on the "
                                 "train data, so we can properly index the "
                                 "test data for evaluation or prediction."))
    argparser.add_argument("--train_file",
                           type=str,
                           default=os.path.join(
                               project_dir, "data/processed/weizhong/"
                               "weizhong_train_train_split.txt"),
                           help="Path to a file to train on.")
    argparser.add_argument("--val_file",
                           type=str,
                           default=os.path.join(
                               project_dir, "data/processed/weizhong/"
                               "weizhong_train_val_split.txt"))
    argparser.add_argument("--test_file",
                           type=str,
                           default=os.path.join(project_dir, "data/raw/"
                                                "weizhong_dev.txt"))
    argparser.add_argument("--batch_size",
                           type=int,
                           default=512,
                           help="Number of instances per batch.")
    argparser.add_argument("--num_epochs",
                           type=int,
                           default=200,
                           help=("Number of epochs to perform in "
                                 "training."))
    argparser.add_argument("--early_stopping_patience",
                           type=int,
                           default=100,
                           help=("number of epochs with no validation "
                                 "accuracy improvement after which training "
                                 "will be stopped"))
    argparser.add_argument("--num_sentence_words",
                           type=int,
                           default=25,
                           help=("The maximum length of a sentence. Longer "
                                 "sentences will be truncated, and shorter "
                                 "ones will be padded."))
    argparser.add_argument("--word_embedding_dim",
                           type=int,
                           default=300,
                           help="Dimensionality of the word embedding layer")
    argparser.add_argument("--pretrained_embeddings_file_path",
                           type=str,
                           help="Path to a file with pretrained embeddings.",
                           default=os.path.join(
                               project_dir, "data/external/",
                               "word2vec_finance_train_use.txt"))
    argparser.add_argument("--fine_tune_embeddings",
                           action="store_true",
                           help=("Whether to train the embedding layer "
                                 "(if True), or keep it fixed (False)."))
    argparser.add_argument("--rnn_hidden_size",
                           type=int,
                           default=256,
                           help=("The output dimension of the RNN."))
    argparser.add_argument("--share_encoder_weights",
                           action="store_true",
                           help=("Whether to use the same encoder on both "
                                 "input sentences (thus sharing weights), "
                                 "or a different one for each sentence"))
    argparser.add_argument("--output_keep_prob",
                           type=float,
                           default=0.9,
                           help=("The proportion of RNN outputs to keep, "
                                 "where the rest are dropped out."))
    argparser.add_argument("--log_period",
                           type=int,
                           default=10,
                           help=("Number of steps between each summary "
                                 "op evaluation."))
    argparser.add_argument("--val_period",
                           type=int,
                           default=250,
                           help=("Number of steps between each evaluation of "
                                 "validation performance."))
    argparser.add_argument("--log_dir",
                           type=str,
                           default=os.path.join(project_dir, "logs/"),
                           help=("Directory to save logs to."))
    argparser.add_argument("--save_period",
                           type=int,
                           default=250,
                           help=("Number of steps between each "
                                 "model checkpoint"))
    argparser.add_argument("--save_dir",
                           type=str,
                           default=os.path.join(project_dir, "models/"),
                           help=("Directory to save model checkpoints to."))
    argparser.add_argument("--run_id",
                           type=str,
                           required=True,
                           help=("Identifying run ID for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train run_id"))
    argparser.add_argument("--model_name",
                           type=str,
                           required=True,
                           help=("Identifying model name for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train model_name"))
    # NOTE(review): this flag is not read directly anywhere in this function;
    # it only travels to the model and the params JSON via vars(config).
    argparser.add_argument("--reweight_predictions_for_kaggle",
                           action="store_true",
                           help=("Only relevant when predicting. Whether to "
                                 "reweight the prediction probabilities to "
                                 "account for class proportion discrepancy "
                                 "between train and test."))

    config = argparser.parse_args()

    model_name = config.model_name
    run_id = config.run_id
    mode = config.mode
    logger.info("share_encoder_weights: {}".format(
        config.share_encoder_weights))

    # Get the data.
    batch_size = config.batch_size
    if mode == "train":
        # Read the train data from a file, and use it to index the
        # validation data
        data_manager = DataManager(WeizhongInstance)
        num_sentence_words = config.num_sentence_words
        get_train_data_gen, train_data_size = data_manager.get_train_data_from_file(
            [config.train_file],
            max_lengths={"num_sentence_words": num_sentence_words})
        get_val_data_gen, val_data_size = data_manager.get_validation_data_from_file(
            [config.val_file],
            max_lengths={"num_sentence_words": num_sentence_words})
    else:
        # Load the fitted DataManager, and use it to index the test data.
        # The CLI value is a directory; expand it in place to the pickle that
        # train mode writes (<dir>/<model_name>/<run_id>/<name>-<id>-DataManager.pkl)
        # so the resolved path is also what lands in the params JSON below.
        config.dataindexer_load_path = os.path.join(
            config.dataindexer_load_path, config.model_name,
            config.run_id.zfill(2), config.model_name + "-" +
            config.run_id.zfill(2) + "-" + "DataManager.pkl")
        logger.info("Loading pickled DataManager from {}".format(
            config.dataindexer_load_path))
        # NOTE(review): unpickling executes arbitrary code from the file;
        # only point this at DataManager pickles you produced yourself.
        with open(config.dataindexer_load_path, "rb") as data_manager_file:
            data_manager = pickle.load(data_manager_file)
        get_test_data_gen, test_data_size = data_manager.get_test_data_from_file(
            [config.test_file])

    vars(config)["word_vocab_size"] = data_manager.data_indexer.get_vocab_size(
    )

    # Log the run parameters.
    log_dir = config.log_dir
    log_path = os.path.join(log_dir, model_name, run_id.zfill(2))
    logger.info("Writing logs to {}".format(log_path))
    if not os.path.exists(log_path):
        logger.info("log path {} does not exist, "
                    "creating it".format(log_path))
        os.makedirs(log_path)
    params_path = os.path.join(log_path, mode + "params.json")
    logger.info("Writing params to {}".format(params_path))
    # Dump before the embedding matrix is attached to the config below, so
    # the JSON only holds the argparse values plus word_vocab_size.
    with open(params_path, 'w') as params_file:
        json.dump(vars(config), params_file, indent=4)

    # Get the embeddings.
    embedding_manager = EmbeddingManager(data_manager.data_indexer)
    embedding_matrix = embedding_manager.get_embedding_matrix(
        config.word_embedding_dim, config.pretrained_embeddings_file_path)
    vars(config)["word_embedding_matrix"] = embedding_matrix

    # Initialize the model.
    model = SiameseMatchingBiLSTM(vars(config))
    model.build_graph()

    if mode == "train":
        # Train the model.
        num_epochs = config.num_epochs
        # Round up so a final, partially filled batch still counts as a step.
        num_train_steps_per_epoch = int(math.ceil(train_data_size /
                                                  batch_size))
        num_val_steps = int(math.ceil(val_data_size / batch_size))
        log_period = config.log_period
        val_period = config.val_period

        save_period = config.save_period
        save_dir = os.path.join(config.save_dir, model_name,
                                run_id.zfill(2) + "/")
        save_path = os.path.join(save_dir, model_name + "-" + run_id.zfill(2))

        logger.info("Checkpoints will be written to {}".format(save_dir))
        if not os.path.exists(save_dir):
            logger.info("save path {} does not exist, "
                        "creating it".format(save_dir))
            os.makedirs(save_dir)

        # Persist the fitted DataManager so predict mode can index the test
        # data with it.
        logger.info("Saving fitted DataManager to {}".format(save_dir))
        data_manager_pickle_name = "{}-{}-DataManager.pkl".format(
            model_name, run_id.zfill(2))
        data_manager_pickle_path = os.path.join(save_dir,
                                                data_manager_pickle_name)
        with open(data_manager_pickle_path, "wb") as data_manager_file:
            pickle.dump(data_manager, data_manager_file)

        patience = config.early_stopping_patience
        model.train(get_train_instance_generator=get_train_data_gen,
                    get_val_instance_generator=get_val_data_gen,
                    batch_size=batch_size,
                    num_train_steps_per_epoch=num_train_steps_per_epoch,
                    num_epochs=num_epochs,
                    num_val_steps=num_val_steps,
                    save_path=save_path,
                    log_path=log_path,
                    log_period=log_period,
                    val_period=val_period,
                    save_period=save_period,
                    patience=patience)
    else:
        # Predict with the model
        model_load_dir = os.path.join(config.model_load_dir, config.model_name,
                                      config.run_id.zfill(2))

        num_test_steps = int(math.ceil(test_data_size / batch_size))
        # predict() yields the predictions together with the matching line
        # ids; both are indexed in parallel when writing the output below.
        raw_predictions, lineids = model.predict(
            get_test_instance_generator=get_test_data_gen,
            model_load_dir=model_load_dir,
            batch_size=batch_size,
            num_test_steps=num_test_steps)

        # Write the predictions to an output submission file
        output_predictions_path = os.path.join(
            log_path,
            model_name + "-" + run_id.zfill(2) + "-output_predictions.csv")
        logger.info(
            "Writing predictions to {}".format(output_predictions_path))

        with open(output_predictions_path, "w") as output_file:
            output_file.write("test_id,result\n")
            for index, lineid in enumerate(lineids):
                output_file.write(
                    str(lineid) + "," + str(raw_predictions[index]) + "\n")