def setUp(self):
    """Prepare fixture files, data generators, embeddings and the BiMPM config.

    Writes the train/validation/test fixture files, reads each of them in
    ``"word+character"`` mode, builds the word and character embedding
    matrices, and stores on ``self`` the model config dictionary and the
    number of batches needed to cover each split.
    """
    super(TestBiMPM, self).setUp()
    self.write_duplicate_questions_train_file()
    self.write_duplicate_questions_validation_file()
    self.write_duplicate_questions_test_file()
    self.data_manager = DataManager(STSInstance)
    self.batch_size = 3
    self.get_train_gen, self.train_size = (
        self.data_manager.get_train_data_from_file(
            [self.TRAIN_FILE], mode="word+character"))
    self.get_val_gen, self.val_size = (
        self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE], mode="word+character"))
    self.get_test_gen, self.test_size = (
        self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], mode="word+character"))

    self.embedding_manager = EmbeddingManager(
        self.data_manager.data_indexer)
    self.word_embedding_dim = 5
    self.word_embedding_matrix = (
        self.embedding_manager.get_embedding_matrix(
            self.word_embedding_dim))
    self.char_embedding_dim = 2
    # Build the character matrix from the "characters" namespace so that it
    # agrees with the "char_vocab_size" entry of the config below.
    self.char_embedding_matrix = (
        self.embedding_manager.get_embedding_matrix(
            self.char_embedding_dim, namespace="characters"))
    self.char_rnn_hidden_size = 6
    self.context_rnn_hidden_size = 3
    self.aggregation_rnn_hidden_size = 4
    self.dropout_ratio = 0.1

    self.config_dict = {
        "mode": "train",
        "word_vocab_size": self.data_manager.data_indexer.get_vocab_size(),
        "word_embedding_dim": self.word_embedding_dim,
        "word_embedding_matrix": self.word_embedding_matrix,
        "char_vocab_size": self.data_manager.data_indexer.get_vocab_size(
            namespace="characters"),
        "char_embedding_dim": self.char_embedding_dim,
        "char_embedding_matrix": self.char_embedding_matrix,
        "char_rnn_hidden_size": self.char_rnn_hidden_size,
        "fine_tune_embeddings": False,
        "context_rnn_hidden_size": self.context_rnn_hidden_size,
        "aggregation_rnn_hidden_size": self.aggregation_rnn_hidden_size,
        "dropout_ratio": self.dropout_ratio
    }

    # Round up so that a final, smaller batch is still counted as a step.
    self.num_train_steps_per_epoch = int(
        math.ceil(self.train_size / self.batch_size))
    self.num_val_steps = int(math.ceil(self.val_size / self.batch_size))
    self.num_test_steps = int(math.ceil(self.test_size / self.batch_size))
def setUp(self):
    """Create the fixtures shared by the SiameseMatchingBiLSTM tests.

    After this runs, ``self`` holds the data manager, one generator factory
    and size per split, the word embedding matrix, the model config
    dictionary and the per-split step counts for a batch size of 2.
    """
    super(TestSiameseMatchingBiLSTM, self).setUp()
    for write_fixture in (self.write_duplicate_questions_train_file,
                          self.write_duplicate_questions_validation_file,
                          self.write_duplicate_questions_test_file):
        write_fixture()

    manager = DataManager(STSInstance)
    self.data_manager = manager
    self.batch_size = 2

    # Splits are read in train, validation, test order.
    train_data = manager.get_train_data_from_file([self.TRAIN_FILE])
    self.get_train_gen, self.train_size = train_data
    val_data = manager.get_validation_data_from_file([self.VALIDATION_FILE])
    self.get_val_gen, self.val_size = val_data
    test_data = manager.get_test_data_from_file([self.TEST_FILE])
    self.get_test_gen, self.test_size = test_data

    self.embedding_manager = EmbeddingManager(manager.data_indexer)
    self.word_embedding_dim = 5
    self.embedding_matrix = self.embedding_manager.get_embedding_matrix(
        self.word_embedding_dim)
    self.rnn_hidden_size = 6
    self.output_keep_prob = 1.0
    self.share_encoder_weights = True

    word_vocab_size = manager.data_indexer.get_vocab_size()
    self.config_dict = {
        "mode": "train",
        "word_vocab_size": word_vocab_size,
        "word_embedding_dim": self.word_embedding_dim,
        "fine_tune_embeddings": False,
        "word_embedding_matrix": self.embedding_matrix,
        "rnn_hidden_size": self.rnn_hidden_size,
        "output_keep_prob": self.output_keep_prob,
        "share_encoder_weights": self.share_encoder_weights
    }

    def count_steps(num_instances):
        # Number of batches needed to cover ``num_instances``, rounded up.
        return int(math.ceil(num_instances / self.batch_size))

    self.num_train_steps_per_epoch = count_steps(self.train_size)
    self.num_val_steps = count_steps(self.val_size)
    self.num_test_steps = count_steps(self.test_size)
def test_generate_train_batches(self):
    """Check batching of the train data with a batch size of 2.

    Two batch generators are built from the same generator factory. The test
    verifies that they are distinct objects, that each yields the expected
    first (two instances) and second (one instance) batch, that both yield
    identical inputs and labels, and that both are exhausted afterwards.
    """
    get_train_gen, train_size = self.data_manager.get_train_data_from_file(
        [self.TRAIN_FILE])
    batch_gen = DataManager.get_batch_generator(get_train_gen, 2)
    new_batch_gen = DataManager.get_batch_generator(get_train_gen, 2)

    # Assert that the new generator is a different object
    # than the old generator.
    assert new_batch_gen is not batch_gen
    assert train_size == 3

    inputs, labels = next(batch_gen)
    new_inputs, new_labels = next(new_batch_gen)
    assert len(inputs) == len(new_inputs) == 2
    assert len(labels) == len(new_labels) == 1

    # Ensure output matches ground truth
    assert_allclose(inputs[0], np.array([[2, 0], [5, 0]]))
    assert_allclose(inputs[1], np.array([[3, 4], [6, 0]]))
    assert_allclose(labels[0], np.array([[1, 0], [0, 1]]))
    # Ensure both generators produce same results.
    assert_allclose(inputs[0], new_inputs[0])
    assert_allclose(inputs[1], new_inputs[1])
    # Compare against the *other* generator's labels; comparing labels[0]
    # with itself would always pass.
    assert_allclose(labels[0], new_labels[0])

    inputs, labels = next(batch_gen)
    new_inputs, new_labels = next(new_batch_gen)
    assert len(inputs) == len(new_inputs) == 2
    assert len(labels) == len(new_labels) == 1

    # Ensure output matches ground truth
    assert_allclose(inputs[0], np.array([[7, 0]]))
    assert_allclose(inputs[1], np.array([[8, 0]]))
    assert_allclose(labels[0], np.array([[1, 0]]))
    # Ensure both generators produce same results.
    assert_allclose(inputs[0], new_inputs[0])
    assert_allclose(inputs[1], new_inputs[1])
    assert_allclose(labels[0], new_labels[0])

    # Both generators should raise a StopIteration. Each gets its own
    # context: inside a shared one, the first raise would end the block and
    # the second generator would never be checked.
    with self.assertRaises(StopIteration):
        next(batch_gen)
    with self.assertRaises(StopIteration):
        next(new_batch_gen)
def setUp(self):
    """Create the fixtures shared by the SiameseBiLSTM tests.

    Stores on ``self`` the data manager, a generator factory and size for
    each split, the word embedding matrix, the model config dictionary and
    the number of steps per split for a batch size of 2.
    """
    super(TestSiameseBiLSTM, self).setUp()
    self.write_duplicate_questions_train_file()
    self.write_duplicate_questions_validation_file()
    self.write_duplicate_questions_test_file()

    manager = DataManager(STSInstance)
    self.data_manager = manager
    self.batch_size = 2

    # Splits are read in train, validation, test order.
    (self.get_train_gen,
     self.train_size) = manager.get_train_data_from_file([self.TRAIN_FILE])
    (self.get_val_gen,
     self.val_size) = manager.get_validation_data_from_file(
         [self.VALIDATION_FILE])
    (self.get_test_gen,
     self.test_size) = manager.get_test_data_from_file([self.TEST_FILE])

    embedding_manager = EmbeddingManager(manager.data_indexer)
    self.embedding_manager = embedding_manager
    self.word_embedding_dim = 5
    self.embedding_matrix = embedding_manager.get_embedding_matrix(
        self.word_embedding_dim)
    self.rnn_hidden_size = 6
    self.rnn_output_mode = "last"
    self.output_keep_prob = 1.0
    self.share_encoder_weights = True

    config = {"mode": "train"}
    config["word_vocab_size"] = manager.data_indexer.get_vocab_size()
    config["word_embedding_dim"] = self.word_embedding_dim
    config["fine_tune_embeddings"] = False
    config["word_embedding_matrix"] = self.embedding_matrix
    config["rnn_hidden_size"] = self.rnn_hidden_size
    config["rnn_output_mode"] = self.rnn_output_mode
    config["output_keep_prob"] = self.output_keep_prob
    config["share_encoder_weights"] = self.share_encoder_weights
    self.config_dict = config

    # Step counts are rounded up so a partial last batch counts as a step.
    per_batch = self.batch_size
    self.num_train_steps_per_epoch = int(
        math.ceil(self.train_size / per_batch))
    self.num_val_steps = int(math.ceil(self.val_size / per_batch))
    self.num_test_steps = int(math.ceil(self.test_size / per_batch))
def main():
    """Train the BiMPM model, or predict with a previously trained one.

    The command line selects the mode (``train`` or ``predict``), the data
    files, the model hyperparameters and the log/checkpoint locations.

    In ``train`` mode the train and validation files are read, the run
    parameters are written as JSON to the log directory, the fitted
    DataManager is pickled into the checkpoint directory and the model is
    trained. In ``predict`` mode the pickled DataManager is loaded, the test
    file is read, and the predicted duplicate probabilities are written to a
    CSV file in the log directory.

    Exits with a usage error if ``predict`` is requested without
    ``--dataindexer_load_path``.
    """
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)

    # Parse config arguments
    argparser = argparse.ArgumentParser(
        description=("Run the Bilateral Multi-Perspective "
                     "Matching (biMPM) model on the paraphrase "
                     "identification task."))
    argparser.add_argument("mode", type=str,
                           choices=["train", "predict"],
                           help=("One of {train|predict}, to "
                                 "indicate what you want the model to do. "
                                 "If you pick \"predict\", then you must also "
                                 "supply the path to a pretrained model and "
                                 "DataIndexer to load."))
    argparser.add_argument("--model_load_dir", type=str,
                           help=("The path to a directory with checkpoints to "
                                 "load for evaluation or prediction. The "
                                 "latest checkpoint will be loaded."))
    argparser.add_argument("--dataindexer_load_path", type=str,
                           help=("The path to the dataindexer fit on the "
                                 "train data, so we can properly index the "
                                 "test data for evaluation or prediction."))
    argparser.add_argument("--train_file", type=str,
                           default=os.path.join(
                               project_dir, "data/processed/quora/"
                               "train_cleaned_train_split.csv"),
                           help="Path to a file to train on.")
    argparser.add_argument(
        "--val_file", type=str,
        default=os.path.join(
            project_dir, "data/processed/quora/"
            "train_cleaned_val_split.csv"),
        help="Path to a file to monitor validation acc. on.")
    argparser.add_argument("--test_file", type=str,
                           default=os.path.join(
                               project_dir, "data/processed/quora/"
                               "test_final.csv"))
    argparser.add_argument("--batch_size", type=int, default=64,
                           help="Number of instances per batch.")
    argparser.add_argument("--num_epochs", type=int, default=10,
                           help=("Number of epochs to perform in "
                                 "training."))
    argparser.add_argument("--early_stopping_patience", type=int, default=0,
                           help=("number of epochs with no validation "
                                 "accuracy improvement after which training "
                                 "will be stopped"))
    argparser.add_argument("--num_sentence_words", type=int, default=100,
                           help=("The maximum length of a sentence. Longer "
                                 "sentences will be truncated, and shorter "
                                 "ones will be padded."))
    argparser.add_argument("--num_word_characters", type=int, default=10,
                           help=("The maximum length of a word. Longer "
                                 "words will be truncated, and shorter "
                                 "ones will be padded."))
    argparser.add_argument("--word_embedding_dim", type=int, default=300,
                           help="Dimensionality of the word embedding layer")
    argparser.add_argument(
        "--pretrained_word_embeddings_file_path", type=str,
        help="Path to a file with pretrained word embeddings.",
        default=os.path.join(project_dir, "data/external/",
                             "glove.6B.300d.txt"))
    argparser.add_argument("--char_embedding_dim", type=int, default=20,
                           help="Dimensionality of the char embedding layer")
    argparser.add_argument("--fine_tune_embeddings", action="store_true",
                           help=("Whether to train the embedding layer "
                                 "(if True), or keep it fixed (False)."))
    argparser.add_argument("--char_rnn_hidden_size", type=int, default=50,
                           help=("The output dimension of the character "
                                 "encoder RNN."))
    argparser.add_argument("--context_rnn_hidden_size", type=int, default=100,
                           help=("The output dimension of the context "
                                 "encoding RNN."))
    argparser.add_argument("--aggregation_rnn_hidden_size", type=int,
                           default=100,
                           help=("The output dimension of the aggregation "
                                 "encoding RNN."))
    argparser.add_argument("--dropout_ratio", type=float, default=0.1,
                           help=("The proportion of RNN outputs to "
                                 "drop out."))
    argparser.add_argument("--log_period", type=int, default=10,
                           help=("Number of steps between each summary "
                                 "op evaluation."))
    argparser.add_argument("--val_period", type=int, default=2500,
                           help=("Number of steps between each evaluation of "
                                 "validation performance."))
    argparser.add_argument("--log_dir", type=str,
                           default=os.path.join(project_dir, "logs/"),
                           help=("Directory to save logs to."))
    argparser.add_argument("--save_period", type=int, default=2500,
                           help=("Number of steps between each "
                                 "model checkpoint"))
    argparser.add_argument("--save_dir", type=str,
                           default=os.path.join(project_dir, "models/"),
                           help=("Directory to save model checkpoints to."))
    argparser.add_argument("--run_id", type=str, required=True,
                           help=("Identifying run ID for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train run_id"))
    argparser.add_argument("--model_name", type=str, required=True,
                           help=("Identifying model name for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train model_name"))
    argparser.add_argument("--reweight_predictions_for_kaggle",
                           action="store_true",
                           help=("Only relevant when predicting. Whether to "
                                 "reweight the prediction probabilities to "
                                 "account for class proportion discrepancy "
                                 "between train and test."))

    config = argparser.parse_args()
    if config.mode == "predict" and not config.dataindexer_load_path:
        # Report a usage error up front rather than failing later inside
        # open() with an unhelpful TypeError.
        argparser.error("--dataindexer_load_path is required when mode is "
                        "\"predict\"")

    model_name = config.model_name
    run_id = config.run_id
    mode = config.mode

    # Get the data.
    batch_size = config.batch_size
    if mode == "train":
        # Read the train data from a file, and use it to index the
        # validation data
        data_manager = DataManager(STSInstance)
        num_sentence_words = config.num_sentence_words
        num_word_characters = config.num_word_characters
        get_train_data_gen, train_data_size = (
            data_manager.get_train_data_from_file(
                [config.train_file],
                max_lengths={
                    "num_sentence_words": num_sentence_words,
                    "num_word_characters": num_word_characters
                },
                mode="word+character"))
        get_val_data_gen, val_data_size = (
            data_manager.get_validation_data_from_file(
                [config.val_file],
                max_lengths={
                    "num_sentence_words": num_sentence_words,
                    "num_word_characters": num_word_characters
                },
                mode="word+character"))
    else:
        # Load the fitted DataManager, and use it to index the test data
        logger.info("Loading pickled DataManager from {}".format(
            config.dataindexer_load_path))
        # NOTE: unpickling can execute arbitrary code; only point
        # --dataindexer_load_path at files produced by a trusted train run.
        with open(config.dataindexer_load_path, "rb") as pickle_file:
            data_manager = pickle.load(pickle_file)
        get_test_data_gen, test_data_size = (
            data_manager.get_test_data_from_file(
                [config.test_file], mode="word+character"))

    vars(config)["word_vocab_size"] = (
        data_manager.data_indexer.get_vocab_size())
    vars(config)["char_vocab_size"] = (
        data_manager.data_indexer.get_vocab_size(namespace="characters"))

    # Log the run parameters. This happens before the embedding matrices are
    # added to the config, so they are not part of the JSON dump.
    log_dir = config.log_dir
    log_path = os.path.join(log_dir, model_name, run_id.zfill(2))
    logger.info("Writing logs to {}".format(log_path))
    if not os.path.exists(log_path):
        logger.info("log path {} does not exist, "
                    "creating it".format(log_path))
        os.makedirs(log_path)
    params_path = os.path.join(log_path, mode + "params.json")
    logger.info("Writing params to {}".format(params_path))
    with open(params_path, 'w') as params_file:
        json.dump(vars(config), params_file, indent=4)

    # Get the embeddings.
    embedding_manager = EmbeddingManager(data_manager.data_indexer)
    word_embedding_matrix = embedding_manager.get_embedding_matrix(
        config.word_embedding_dim,
        config.pretrained_word_embeddings_file_path)
    vars(config)["word_embedding_matrix"] = word_embedding_matrix
    char_embedding_matrix = embedding_manager.get_embedding_matrix(
        config.char_embedding_dim,
        namespace="characters")
    vars(config)["char_embedding_matrix"] = char_embedding_matrix

    # Initialize the model.
    model = BiMPM(vars(config))
    model.build_graph()

    if mode == "train":
        # Train the model.
        num_epochs = config.num_epochs
        num_train_steps_per_epoch = int(
            math.ceil(train_data_size / batch_size))
        num_val_steps = int(math.ceil(val_data_size / batch_size))
        log_period = config.log_period
        val_period = config.val_period

        save_period = config.save_period
        save_dir = os.path.join(config.save_dir, model_name,
                                run_id.zfill(2) + "/")
        save_path = os.path.join(save_dir, model_name + "-" + run_id.zfill(2))

        logger.info("Checkpoints will be written to {}".format(save_dir))
        if not os.path.exists(save_dir):
            logger.info("save path {} does not exist, "
                        "creating it".format(save_dir))
            os.makedirs(save_dir)

        logger.info("Saving fitted DataManager to {}".format(save_dir))
        data_manager_pickle_name = "{}-{}-DataManager.pkl".format(
            model_name, run_id.zfill(2))
        # Use a context manager so the pickle is flushed and closed before
        # training starts.
        with open(os.path.join(save_dir, data_manager_pickle_name),
                  "wb") as pickle_file:
            pickle.dump(data_manager, pickle_file)

        patience = config.early_stopping_patience
        model.train(get_train_instance_generator=get_train_data_gen,
                    get_val_instance_generator=get_val_data_gen,
                    batch_size=batch_size,
                    num_train_steps_per_epoch=num_train_steps_per_epoch,
                    num_epochs=num_epochs,
                    num_val_steps=num_val_steps,
                    save_path=save_path,
                    log_path=log_path,
                    log_period=log_period,
                    val_period=val_period,
                    save_period=save_period,
                    patience=patience)
    else:
        # Predict with the model
        model_load_dir = config.model_load_dir
        num_test_steps = int(math.ceil(test_data_size / batch_size))
        # Numpy array of shape (num_test_examples, 2)
        raw_predictions = model.predict(
            get_test_instance_generator=get_test_data_gen,
            model_load_dir=model_load_dir,
            batch_size=batch_size,
            num_test_steps=num_test_steps)
        # Remove the first column, so we're left with just the probabilities
        # that a question is a duplicate.
        is_duplicate_probabilities = np.delete(raw_predictions, 0, 1)

        # The class balance between kaggle train and test seems different.
        # This edits prediction probability to account for the discrepancy.
        # See: https://www.kaggle.com/c/quora-question-pairs/discussion/31179
        if config.reweight_predictions_for_kaggle:
            positive_weight = 0.165 / 0.37
            negative_weight = (1 - 0.165) / (1 - 0.37)
            is_duplicate_probabilities = (
                (positive_weight * is_duplicate_probabilities) /
                (positive_weight * is_duplicate_probabilities +
                 negative_weight * (1 - is_duplicate_probabilities)))

        # Write the predictions to an output submission file
        output_predictions_path = os.path.join(
            log_path,
            model_name + "-" + run_id.zfill(2) + "-output_predictions.csv")
        logger.info(
            "Writing predictions to {}".format(output_predictions_path))
        is_duplicate_df = pd.DataFrame(is_duplicate_probabilities)
        is_duplicate_df.to_csv(output_predictions_path,
                               index_label="test_id",
                               header=["is_duplicate"])
def setUp(self):
    """Write the train and test fixture files and read the train data.

    The resulting DataManager is stored as ``self.data_manager``. The return
    value of the train read is discarded.
    """
    super(TestDataManagerTest, self).setUp()
    for write_fixture in (self.write_duplicate_questions_train_file,
                          self.write_duplicate_questions_test_file):
        write_fixture()

    manager = DataManager(STSInstance)
    self.data_manager = manager
    # The train file is read before any test reads the test file;
    # presumably this fits the indexer -- confirm against DataManager.
    manager.get_train_data_from_file([self.TRAIN_FILE])
class TestDataManagerTest(DuplicateTestCase):
    """Tests for reading and batching test data with a DataManager.

    Every test starts from a DataManager that has already read the train
    fixture file (see ``setUp``) and then reads the test fixture file with
    different options, comparing the yielded arrays against fixed values.
    """

    @overrides
    def setUp(self):
        """Write the train/test fixture files and read the train data."""
        super(TestDataManagerTest, self).setUp()
        self.write_duplicate_questions_train_file()
        self.write_duplicate_questions_test_file()
        self.data_manager = DataManager(STSInstance)
        self.data_manager.get_train_data_from_file([self.TRAIN_FILE])

    def test_get_test_data_default(self):
        """Default options yield three padded instances, repeatably."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE])
        assert test_size == 3
        test_gen = get_test_gen()
        inputs1, labels1 = next(test_gen)
        assert_allclose(inputs1[0], np.array([2, 1]))
        assert_allclose(inputs1[1], np.array([1, 0]))

        inputs2, labels2 = next(test_gen)
        assert_allclose(inputs2[0], np.array([4, 0]))
        assert_allclose(inputs2[1], np.array([5, 1]))

        inputs3, labels3 = next(test_gen)
        assert_allclose(inputs3[0], np.array([6, 0]))
        assert_allclose(inputs3[1], np.array([7, 0]))

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

        # Test that we can make a new test generator
        new_test_gen = get_test_gen()
        # Verify that the new and old generator are not the same object
        assert new_test_gen is not test_gen
        new_inputs1, new_labels1 = next(new_test_gen)
        assert_allclose(new_inputs1, inputs1)
        assert_allclose(new_labels1, labels1)
        new_inputs2, new_labels2 = next(new_test_gen)
        assert_allclose(new_inputs2, inputs2)
        assert_allclose(new_labels2, labels2)
        new_inputs3, new_labels3 = next(new_test_gen)
        assert_allclose(new_inputs3, inputs3)
        assert_allclose(new_labels3, labels3)
        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(new_test_gen)

    def test_get_test_data_default_character(self):
        """``mode="character"`` yields character-index matrices only."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], mode="character")
        test_gen = get_test_gen()
        assert test_size == 3

        inputs1, labels = next(test_gen)
        assert_allclose(inputs1[0],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 10, 0, 0, 0],
                                  [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 10]]))
        assert_allclose(inputs1[1],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 11],
                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0

        inputs2, labels = next(test_gen)
        assert_allclose(inputs2[0],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 12, 19, 17, 18],
                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs2[1],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 13, 0, 0, 0],
                                  [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 12]]))
        assert len(labels) == 0

        inputs3, labels = next(test_gen)
        assert_allclose(inputs3[0],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 14, 0, 0, 0],
                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs3[1],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 15, 0, 0, 0],
                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

    def test_get_test_data_default_word_and_character(self):
        """``mode="word+character"`` interleaves word and character inputs."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], mode="word+character")
        test_gen = get_test_gen()
        assert test_size == 3

        inputs1, labels = next(test_gen)
        assert_allclose(inputs1[0], np.array([2, 1]))
        assert_allclose(inputs1[1],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 10, 0, 0, 0],
                                  [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 10]]))
        assert_allclose(inputs1[2], np.array([1, 0]))
        assert_allclose(inputs1[3],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 11],
                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0

        inputs2, labels = next(test_gen)
        assert_allclose(inputs2[0], np.array([4, 0]))
        assert_allclose(inputs2[1],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 12, 19, 17, 18],
                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs2[2], np.array([5, 1]))
        assert_allclose(inputs2[3],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 13, 0, 0, 0],
                                  [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 12]]))
        assert len(labels) == 0

        inputs3, labels = next(test_gen)
        assert_allclose(inputs3[0], np.array([6, 0]))
        assert_allclose(inputs3[1],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 14, 0, 0, 0],
                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs3[2], np.array([7, 0]))
        assert_allclose(inputs3[3],
                        np.array([[6, 9, 2, 7, 8, 3, 5, 4, 15, 0, 0, 0],
                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

    def test_get_test_data_pad_with_max_lens(self):
        """A ``num_sentence_words`` of 1 yields one word per sentence."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE],
            max_lengths={"num_sentence_words": 1})
        test_gen = get_test_gen()
        assert test_size == 3

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([2]))
        assert_allclose(inputs[1], np.array([1]))
        assert len(labels) == 0

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([4]))
        assert_allclose(inputs[1], np.array([5]))
        assert len(labels) == 0

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([6]))
        assert_allclose(inputs[1], np.array([7]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

    def test_get_test_data_with_max_instances(self):
        """``max_instances=2`` limits the data to the first two instances."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE],
            max_instances=2)
        test_gen = get_test_gen()
        assert test_size == 2

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([2, 1]))
        assert_allclose(inputs[1], np.array([1, 0]))
        assert len(labels) == 0

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([4, 0]))
        assert_allclose(inputs[1], np.array([5, 1]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

    def test_get_test_data_errors(self):
        """Invalid option combinations raise ``ValueError``."""
        # max_lengths given together with pad=False.
        with self.assertRaises(ValueError):
            self.data_manager.get_test_data_from_file(
                [self.TEST_FILE],
                max_lengths={"num_sentence_words": 1},
                pad=False)
        # Unknown key in max_lengths.
        with self.assertRaises(ValueError):
            self.data_manager.get_test_data_from_file(
                [self.TEST_FILE],
                max_lengths={"some wrong key": 1})

    def test_get_test_data_no_pad(self):
        """``pad=False`` yields sentences at their own lengths."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE],
            pad=False)
        test_gen = get_test_gen()
        assert test_size == 3

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([2, 1, 2]))
        assert_allclose(inputs[1], np.array([1]))
        assert len(labels) == 0

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([4]))
        assert_allclose(inputs[1], np.array([5, 1]))
        assert len(labels) == 0

        inputs, labels = next(test_gen)
        assert_allclose(inputs[0], np.array([6]))
        assert_allclose(inputs[1], np.array([7]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            next(test_gen)

    def test_generate_test_batches(self):
        """Two batch generators over the test data are independent and agree."""
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE])
        batch_gen = self.data_manager.get_batch_generator(get_test_gen, 2)
        new_batch_gen = DataManager.get_batch_generator(get_test_gen, 2)

        # Assert that the new generator is a different object
        # than the old generator.
        assert new_batch_gen is not batch_gen
        assert test_size == 3

        inputs, labels = next(batch_gen)
        new_inputs, new_labels = next(new_batch_gen)
        assert len(inputs) == 2
        assert len(labels) == 0

        # Ensure output matches ground truth
        assert_allclose(inputs[0], np.array([[2, 1], [4, 0]]))
        assert_allclose(inputs[1], np.array([[1, 0], [5, 1]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])

        inputs, labels = next(batch_gen)
        new_inputs, new_labels = next(new_batch_gen)
        assert len(inputs) == 2
        assert len(labels) == 0

        # Ensure output matches ground truth
        assert_allclose(inputs[0], np.array([[6, 0]]))
        assert_allclose(inputs[1], np.array([[7, 0]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])

        # Each generator gets its own context: inside a shared one, the
        # first raise would end the block and the second generator would
        # never be checked.
        with self.assertRaises(StopIteration):
            next(batch_gen)
        with self.assertRaises(StopIteration):
            next(new_batch_gen)
def setUp(self):
    """Write the train fixture file and create a fresh DataManager.

    The manager is stored as ``self.data_manager``; no data is read here.
    """
    super(TestDataManagerTrain, self).setUp()
    self.write_duplicate_questions_train_file()
    manager = DataManager(STSInstance)
    self.data_manager = manager
def main():
    """Train the Siamese matching BiLSTM model, or predict with a trained one.

    The command line selects the mode (``train`` or ``predict``), the data
    files, the model hyperparameters and the log/checkpoint locations.

    In ``train`` mode the train and validation files are read, the run
    parameters are written as JSON to the log directory, the fitted
    DataManager is pickled into the checkpoint directory and the model is
    trained. In ``predict`` mode the pickled DataManager is loaded, the test
    file is read, and the predicted duplicate probabilities are written to a
    CSV file in the log directory.

    Exits with a usage error if ``predict`` is requested without
    ``--dataindexer_load_path``.
    """
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)

    # Parse config arguments
    argparser = argparse.ArgumentParser(
        description=("Run the Siamese BiLSTM model with an added "
                     "matching layer for paraphase detection."))
    argparser.add_argument("mode", type=str,
                           choices=["train", "predict"],
                           help=("One of {train|predict}, to "
                                 "indicate what you want the model to do. "
                                 "If you pick \"predict\", then you must also "
                                 "supply the path to a pretrained model and "
                                 "DataIndexer to load."))
    argparser.add_argument("--model_load_dir", type=str,
                           help=("The path to a directory with checkpoints to "
                                 "load for evaluation or prediction. The "
                                 "latest checkpoint will be loaded."))
    argparser.add_argument("--dataindexer_load_path", type=str,
                           help=("The path to the DataIndexer fit on the "
                                 "train data, so we can properly index the "
                                 "test data for evaluation or prediction."))
    argparser.add_argument("--train_file", type=str,
                           default=os.path.join(
                               project_dir, "data/processed/quora/"
                               "train_cleaned_train_split.csv"),
                           help="Path to a file to train on.")
    argparser.add_argument("--val_file", type=str,
                           default=os.path.join(
                               project_dir, "data/processed/quora/"
                               "train_cleaned_val_split.csv"),
                           help="Path to a file to monitor validation acc. on.")
    argparser.add_argument("--test_file", type=str,
                           default=os.path.join(
                               project_dir, "data/processed/quora/"
                               "test_final.csv"))
    argparser.add_argument("--batch_size", type=int, default=128,
                           help="Number of instances per batch.")
    argparser.add_argument("--num_epochs", type=int, default=10,
                           help=("Number of epochs to perform in "
                                 "training."))
    argparser.add_argument("--early_stopping_patience", type=int, default=0,
                           help=("number of epochs with no validation "
                                 "accuracy improvement after which training "
                                 "will be stopped"))
    argparser.add_argument("--num_sentence_words", type=int, default=30,
                           help=("The maximum length of a sentence. Longer "
                                 "sentences will be truncated, and shorter "
                                 "ones will be padded."))
    argparser.add_argument("--word_embedding_dim", type=int, default=300,
                           help="Dimensionality of the word embedding layer")
    argparser.add_argument("--pretrained_embeddings_file_path", type=str,
                           help="Path to a file with pretrained embeddings.",
                           default=os.path.join(project_dir, "data/external/",
                                                "glove.6B.300d.txt"))
    argparser.add_argument("--fine_tune_embeddings", action="store_true",
                           help=("Whether to train the embedding layer "
                                 "(if True), or keep it fixed (False)."))
    argparser.add_argument("--rnn_hidden_size", type=int, default=256,
                           help=("The output dimension of the RNN."))
    argparser.add_argument("--share_encoder_weights", action="store_true",
                           help=("Whether to use the same encoder on both "
                                 "input sentences (thus sharing weights), "
                                 "or a different one for each sentence"))
    argparser.add_argument("--output_keep_prob", type=float, default=1.0,
                           help=("The proportion of RNN outputs to keep, "
                                 "where the rest are dropped out."))
    argparser.add_argument("--log_period", type=int, default=10,
                           help=("Number of steps between each summary "
                                 "op evaluation."))
    argparser.add_argument("--val_period", type=int, default=250,
                           help=("Number of steps between each evaluation of "
                                 "validation performance."))
    argparser.add_argument("--log_dir", type=str,
                           default=os.path.join(project_dir, "logs/"),
                           help=("Directory to save logs to."))
    argparser.add_argument("--save_period", type=int, default=250,
                           help=("Number of steps between each "
                                 "model checkpoint"))
    argparser.add_argument("--save_dir", type=str,
                           default=os.path.join(project_dir, "models/"),
                           help=("Directory to save model checkpoints to."))
    argparser.add_argument("--run_id", type=str, required=True,
                           help=("Identifying run ID for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train run_id"))
    argparser.add_argument("--model_name", type=str, required=True,
                           help=("Identifying model name for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train model_name"))
    argparser.add_argument("--reweight_predictions_for_kaggle",
                           action="store_true",
                           help=("Only relevant when predicting. Whether to "
                                 "reweight the prediction probabilities to "
                                 "account for class proportion discrepancy "
                                 "between train and test."))

    config = argparser.parse_args()
    if config.mode == "predict" and not config.dataindexer_load_path:
        # Report a usage error up front rather than failing later inside
        # open() with an unhelpful TypeError.
        argparser.error("--dataindexer_load_path is required when mode is "
                        "\"predict\"")

    model_name = config.model_name
    run_id = config.run_id
    mode = config.mode

    # Get the data.
    batch_size = config.batch_size
    if mode == "train":
        # Read the train data from a file, and use it to index the
        # validation data
        data_manager = DataManager(STSInstance)
        num_sentence_words = config.num_sentence_words
        get_train_data_gen, train_data_size = (
            data_manager.get_train_data_from_file(
                [config.train_file],
                max_lengths={"num_sentence_words": num_sentence_words}))
        get_val_data_gen, val_data_size = (
            data_manager.get_validation_data_from_file(
                [config.val_file],
                max_lengths={"num_sentence_words": num_sentence_words}))
    else:
        # Load the fitted DataManager, and use it to index the test data
        logger.info("Loading pickled DataManager from {}".format(
            config.dataindexer_load_path))
        # NOTE: unpickling can execute arbitrary code; only point
        # --dataindexer_load_path at files produced by a trusted train run.
        with open(config.dataindexer_load_path, "rb") as pickle_file:
            data_manager = pickle.load(pickle_file)
        get_test_data_gen, test_data_size = (
            data_manager.get_test_data_from_file([config.test_file]))

    vars(config)["word_vocab_size"] = (
        data_manager.data_indexer.get_vocab_size())

    # Log the run parameters. This happens before the embedding matrix is
    # added to the config, so it is not part of the JSON dump.
    log_dir = config.log_dir
    log_path = os.path.join(log_dir, model_name, run_id.zfill(2))
    logger.info("Writing logs to {}".format(log_path))
    if not os.path.exists(log_path):
        logger.info("log path {} does not exist, "
                    "creating it".format(log_path))
        os.makedirs(log_path)
    params_path = os.path.join(log_path, mode + "params.json")
    logger.info("Writing params to {}".format(params_path))
    with open(params_path, 'w') as params_file:
        json.dump(vars(config), params_file, indent=4)

    # Get the embeddings.
    embedding_manager = EmbeddingManager(data_manager.data_indexer)
    embedding_matrix = embedding_manager.get_embedding_matrix(
        config.word_embedding_dim,
        config.pretrained_embeddings_file_path)
    vars(config)["word_embedding_matrix"] = embedding_matrix

    # Initialize the model.
    model = SiameseMatchingBiLSTM(vars(config))
    model.build_graph()

    if mode == "train":
        # Train the model.
        num_epochs = config.num_epochs
        num_train_steps_per_epoch = int(
            math.ceil(train_data_size / batch_size))
        num_val_steps = int(math.ceil(val_data_size / batch_size))
        log_period = config.log_period
        val_period = config.val_period

        save_period = config.save_period
        save_dir = os.path.join(config.save_dir, model_name,
                                run_id.zfill(2) + "/")
        save_path = os.path.join(save_dir, model_name + "-" + run_id.zfill(2))

        logger.info("Checkpoints will be written to {}".format(save_dir))
        if not os.path.exists(save_dir):
            logger.info("save path {} does not exist, "
                        "creating it".format(save_dir))
            os.makedirs(save_dir)

        logger.info("Saving fitted DataManager to {}".format(save_dir))
        data_manager_pickle_name = "{}-{}-DataManager.pkl".format(
            model_name, run_id.zfill(2))
        # Use a context manager so the pickle is flushed and closed before
        # training starts.
        with open(os.path.join(save_dir, data_manager_pickle_name),
                  "wb") as pickle_file:
            pickle.dump(data_manager, pickle_file)

        patience = config.early_stopping_patience
        model.train(get_train_instance_generator=get_train_data_gen,
                    get_val_instance_generator=get_val_data_gen,
                    batch_size=batch_size,
                    num_train_steps_per_epoch=num_train_steps_per_epoch,
                    num_epochs=num_epochs,
                    num_val_steps=num_val_steps,
                    save_path=save_path,
                    log_path=log_path,
                    log_period=log_period,
                    val_period=val_period,
                    save_period=save_period,
                    patience=patience)
    else:
        # Predict with the model
        model_load_dir = config.model_load_dir
        num_test_steps = int(math.ceil(test_data_size / batch_size))
        # Numpy array of shape (num_test_examples, 2)
        raw_predictions = model.predict(
            get_test_instance_generator=get_test_data_gen,
            model_load_dir=model_load_dir,
            batch_size=batch_size,
            num_test_steps=num_test_steps)
        # Remove the first column, so we're left with just the probabilities
        # that a question is a duplicate.
        is_duplicate_probabilities = np.delete(raw_predictions, 0, 1)

        # The class balance between kaggle train and test seems different.
        # This edits prediction probability to account for the discrepancy.
        # See: https://www.kaggle.com/c/quora-question-pairs/discussion/31179
        if config.reweight_predictions_for_kaggle:
            positive_weight = 0.165 / 0.37
            negative_weight = (1 - 0.165) / (1 - 0.37)
            is_duplicate_probabilities = (
                (positive_weight * is_duplicate_probabilities) /
                (positive_weight * is_duplicate_probabilities +
                 negative_weight * (1 - is_duplicate_probabilities)))

        # Write the predictions to an output submission file
        output_predictions_path = os.path.join(
            log_path,
            model_name + "-" + run_id.zfill(2) + "-output_predictions.csv")
        logger.info(
            "Writing predictions to {}".format(output_predictions_path))
        is_duplicate_df = pd.DataFrame(is_duplicate_probabilities)
        is_duplicate_df.to_csv(output_predictions_path,
                               index_label="test_id",
                               header=["is_duplicate"])
class TestBiMPM(DuplicateTestCase):
    """Smoke test for BiMPM: train briefly, then rebuild the model and predict."""

    @overrides
    def setUp(self):
        """Create the fixture data, index it, and assemble the model config."""
        super(TestBiMPM, self).setUp()

        # Fixture helpers called on self; they are not defined in this class.
        self.write_duplicate_questions_train_file()
        self.write_duplicate_questions_validation_file()
        self.write_duplicate_questions_test_file()

        self.batch_size = 3
        self.data_manager = DataManager(STSInstance)

        # Every split is read in the same word+character mode.
        read_mode = "word+character"
        train_gen, train_size = self.data_manager.get_train_data_from_file(
            [self.TRAIN_FILE], mode=read_mode)
        self.get_train_gen = train_gen
        self.train_size = train_size

        val_gen, val_size = self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE], mode=read_mode)
        self.get_val_gen = val_gen
        self.val_size = val_size

        test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], mode=read_mode)
        self.get_test_gen = test_gen
        self.test_size = test_size

        # Embedding matrices for the word and character embedding sizes.
        indexer = self.data_manager.data_indexer
        self.embedding_manager = EmbeddingManager(indexer)
        self.word_embedding_dim = 5
        self.word_embedding_matrix = (
            self.embedding_manager.get_embedding_matrix(
                self.word_embedding_dim))
        self.char_embedding_dim = 2
        self.char_embedding_matrix = (
            self.embedding_manager.get_embedding_matrix(
                self.char_embedding_dim))

        # Model hyperparameters, kept tiny so the test runs quickly.
        self.char_rnn_hidden_size = 6
        self.context_rnn_hidden_size = 3
        self.aggregation_rnn_hidden_size = 4
        self.dropout_ratio = 0.1

        word_vocab_size = indexer.get_vocab_size()
        char_vocab_size = indexer.get_vocab_size(namespace="characters")
        self.config_dict = {
            "mode": "train",
            "word_vocab_size": word_vocab_size,
            "word_embedding_dim": self.word_embedding_dim,
            "word_embedding_matrix": self.word_embedding_matrix,
            "char_vocab_size": char_vocab_size,
            "char_embedding_dim": self.char_embedding_dim,
            "char_embedding_matrix": self.char_embedding_matrix,
            "char_rnn_hidden_size": self.char_rnn_hidden_size,
            "fine_tune_embeddings": False,
            "context_rnn_hidden_size": self.context_rnn_hidden_size,
            "aggregation_rnn_hidden_size": self.aggregation_rnn_hidden_size,
            "dropout_ratio": self.dropout_ratio
        }

        def steps_for(num_instances):
            # Number of batches needed to cover num_instances, rounding up.
            return int(math.ceil(num_instances / self.batch_size))

        self.num_train_steps_per_epoch = steps_for(self.train_size)
        self.num_val_steps = steps_for(self.val_size)
        self.num_test_steps = steps_for(self.test_size)

    def test_default_does_not_crash(self):
        """Train for two epochs, then build a fresh test-mode model and predict."""
        # Initialize and train the model.
        trained_model = BiMPM(self.config_dict)
        trained_model.build_graph()
        train_options = {
            "get_train_instance_generator": self.get_train_gen,
            "get_val_instance_generator": self.get_val_gen,
            "batch_size": self.batch_size,
            "num_train_steps_per_epoch": self.num_train_steps_per_epoch,
            "num_epochs": 2,
            "num_val_steps": self.num_val_steps,
            "save_path": self.TEST_DIR,
            "log_path": self.TEST_DIR,
            "log_period": 2,
            "val_period": 2,
            "save_period": 2,
            "patience": 0,
        }
        trained_model.train(**train_options)

        tf.reset_default_graph()

        # Switch the config to test mode and drop both embedding matrices
        # before building the model that loads from TEST_DIR.
        self.config_dict["mode"] = "test"
        for matrix_key in ("word_embedding_matrix", "char_embedding_matrix"):
            del self.config_dict[matrix_key]

        restored_model = BiMPM(self.config_dict)
        restored_model.build_graph()
        predict_options = {
            "get_test_instance_generator": self.get_test_gen,
            "model_load_dir": self.TEST_DIR,
            "batch_size": self.batch_size,
            "num_test_steps": self.num_test_steps,
        }
        restored_model.predict(**predict_options)
def main():
    """Train a SiameseBiLSTM model, or predict with a previously trained one.

    Arguments come from the command line and/or a config file (parsed with
    configargparse). All file locations are derived by ``construct_paths``.

    In "train" mode a new DataManager is fit on the train file and used to
    index the validation file; it is pickled so that a later "predict" run
    can reuse it, and the model is then trained.

    In "predict" mode the pickled DataManager is loaded and used to index
    the test file. The predicted duplicate probabilities are appended as an
    extra column to the rows of the test file and written as a header-less
    CSV, which is finally passed to ``plot_pairs``.
    """
    default_run_id = "01"
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)

    # Parse config arguments
    argparser = configargparse.ArgumentParser(
        default_config_files=['../../config/default.yml'],
        description=("Run a baseline Siamese BiLSTM model "
                     "for paraphrase identification."))
    argparser.add_argument("mode", type=str,
                           choices=["train", "predict"],
                           help=("One of {train|predict}, to "
                                 "indicate what you want the model to do. "
                                 "If you pick \"predict\", then you must also "
                                 "supply the path to a pretrained model and "
                                 "DataIndexer to load."))
    argparser.add_argument("--config_file", is_config_file_arg=True,
                           help="The path to a config file.")
    argparser.add_argument(
        "--data_file_dir", type=str,
        default=os.path.join(project_dir, "data/processed/"),
        help="Path of the dir to the (train, val, test).csv files.")
    argparser.add_argument("--train_filename", type=str,
                           default="train.csv",
                           help="Basename of the train file.")
    # NOTE(review): the validation default is the *train* file name, exactly
    # as in the original code -- confirm whether that is intentional.
    argparser.add_argument("--val_filename", type=str,
                           default="train.csv",
                           help="Basename of the validation file.")
    argparser.add_argument("--test_filename", type=str,
                           default="test.csv",
                           help="Basename of the test file.")
    argparser.add_argument("--batch_size", type=int, default=128,
                           help="Number of instances per batch.")
    argparser.add_argument("--num_epochs", type=int, default=10,
                           help=("Number of epochs to perform in "
                                 "training."))
    argparser.add_argument("--early_stopping_patience", type=int, default=0,
                           help=("number of epochs with no validation "
                                 "accuracy improvement after which training "
                                 "will be stopped"))
    argparser.add_argument("--num_sentence_words", type=int, default=30,
                           help=("The maximum length of a sentence. Longer "
                                 "sentences will be truncated, and shorter "
                                 "ones will be padded."))
    argparser.add_argument("--embedding_file_path_template", type=str,
                           help="Path to a file with pretrained embeddings.",
                           default=os.path.join(project_dir,
                                                "data/external/bcb",
                                                "{name}.{dim}d.txt"))
    argparser.add_argument("--embedding_file_name", type=str,
                           help="Name of the embedding file.")
    argparser.add_argument("--word_embedding_dim", type=int, default=8,
                           help="Dimensionality of the word embedding layer")
    argparser.add_argument("--fine_tune_embeddings", action="store_true",
                           help=("Whether to train the embedding layer "
                                 "(if True), or keep it fixed (False)."))
    argparser.add_argument("--rnn_hidden_size", type=int, default=256,
                           help=("The output dimension of the RNN."))
    argparser.add_argument("--share_encoder_weights", action="store_true",
                           help=("Whether to use the same encoder on both "
                                 "input sentences (thus sharing weights), "
                                 "or a different one for each sentence"))
    argparser.add_argument("--rnn_output_mode", type=str, default="last",
                           choices=["mean_pool", "last"],
                           help=("How to calculate the final sentence "
                                 "representation from the RNN outputs. "
                                 "\"mean_pool\" indicates that the outputs "
                                 "will be averaged (with respect to padding), "
                                 "and \"last\" indicates that the last "
                                 "relevant output will be used as the "
                                 "sentence representation."))
    argparser.add_argument("--output_keep_prob", type=float, default=1.0,
                           help=("The proportion of RNN outputs to keep, "
                                 "where the rest are dropped out."))
    argparser.add_argument("--log_period", type=int, default=10,
                           help=("Number of steps between each summary "
                                 "op evaluation."))
    argparser.add_argument("--val_period", type=int, default=250,
                           help=("Number of steps between each evaluation of "
                                 "validation performance."))
    argparser.add_argument("--log_dir", type=str,
                           default=os.path.join(project_dir, "logs/"),
                           help=("Directory to save logs to."))
    argparser.add_argument("--save_period", type=int, default=250,
                           help=("Number of steps between each "
                                 "model checkpoint"))
    argparser.add_argument("--model_save_root", type=str,
                           default=os.path.join(project_dir, "models/"),
                           help=("Directory to save model checkpoints to."))
    argparser.add_argument("--token_file_dir", type=str,
                           help=("Directory to token files."))
    argparser.add_argument("--token_file_ext", type=str,
                           help=("File extentions of token files."))
    argparser.add_argument("--run_id", type=str, default=default_run_id,
                           help=("Identifying run ID for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train run_id"))
    # The original adjacent literals were missing the space after "If",
    # which rendered as "Ifpredicting" in --help.
    argparser.add_argument("--model_name", type=str,
                           help=("Identifying model name for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train run_id"))
    argparser.add_argument("--reweight_predictions_for_kaggle",
                           action="store_true",
                           help=("Only relevant when predicting. Whether to "
                                 "reweight the prediction probabilities to "
                                 "account for class proportion discrepancy "
                                 "between train and test."))

    config = argparser.parse_args()

    model_name = config.model_name
    run_id = config.run_id.zfill(2)
    mode = config.mode
    batch_size = config.batch_size

    paths = construct_paths(
        model_name, run_id, config.data_file_dir, config.train_filename,
        config.val_filename, config.test_filename, config.model_save_root,
        config.log_dir, config.embedding_file_path_template,
        config.embedding_file_name, config.word_embedding_dim)
    model_save_file_path = paths['model_save_file_path']
    model_save_dir = paths['model_save_dir']
    data_manager_pickle_file_path = paths['data_manager_pickle_file_path']

    if mode == "train":
        # Read the train data from a file, and use it to index the
        # validation data.
        # TODO: determine the instance type from config
        # (an alternative here was DataManager(STSInstance)).
        CodeInstance.set_token_file(config.token_file_dir,
                                    config.token_file_ext)
        data_manager = DataManager(CodeInstance)
        num_sentence_words = config.num_sentence_words
        get_train_data_gen, train_data_size = (
            data_manager.get_train_data_from_file(
                [paths['train_file_path']],
                max_lengths={"num_sentence_words": num_sentence_words}))
        get_val_data_gen, val_data_size = (
            data_manager.get_validation_data_from_file(
                [paths['val_file_path']],
                max_lengths={"num_sentence_words": num_sentence_words}))
    else:
        # Load the fitted DataManager, and use it to index the test data
        logger.info("Loading pickled DataManager "
                    "from {}".format(data_manager_pickle_file_path))
        # NOTE: unpickling can execute arbitrary code; only load DataManager
        # pickles written by a trusted training run.
        with open(data_manager_pickle_file_path, "rb") as pickle_file:
            data_manager = pickle.load(pickle_file)
        test_data_gen, test_data_size = data_manager.get_test_data_from_file(
            [paths['test_file_path']])

    vars(config)["word_vocab_size"] = (
        data_manager.data_indexer.get_vocab_size())

    # Log the run parameters. This must happen before the embedding matrix
    # is added to the config below, so the matrix is not part of the dump.
    log_path = paths['log_file_path']
    logger.info("Writing logs to {}".format(log_path))
    if not os.path.exists(log_path):
        logger.info("log path {} does not exist, "
                    "creating it".format(log_path))
        os.makedirs(log_path)
    params_path = os.path.join(log_path, mode + "params.json")
    logger.info("Writing params to {}".format(params_path))
    with open(params_path, 'w') as params_file:
        json.dump(vars(config), params_file, indent=4)

    # Get the embeddings.
    embedding_manager = EmbeddingManager(data_manager.data_indexer)
    embedding_matrix = embedding_manager.get_embedding_matrix(
        config.word_embedding_dim, paths['embedding_file_path'])
    vars(config)["word_embedding_matrix"] = embedding_matrix

    # Initialize the model.
    model = SiameseBiLSTM(vars(config))
    model.build_graph()

    if mode == "train":
        # Train the model.
        num_epochs = config.num_epochs
        num_train_steps_per_epoch = int(
            math.ceil(train_data_size / batch_size))
        num_val_steps = int(math.ceil(val_data_size / batch_size))
        log_period = config.log_period
        val_period = config.val_period
        save_period = config.save_period

        logger.info("Checkpoints will be written to {}".format(model_save_dir))
        if not os.path.exists(model_save_dir):
            logger.info("save path {} does not exist, "
                        "creating it".format(model_save_dir))
            os.makedirs(model_save_dir)

        logger.info("Saving fitted DataManager to {}".format(model_save_dir))
        # Use a context manager so the pickle is flushed and closed before
        # the (potentially long) training run starts.
        with open(data_manager_pickle_file_path, "wb") as pickle_file:
            pickle.dump(data_manager, pickle_file)

        patience = config.early_stopping_patience
        model.train(get_train_instance_generator=get_train_data_gen,
                    get_val_instance_generator=get_val_data_gen,
                    batch_size=batch_size,
                    num_train_steps_per_epoch=num_train_steps_per_epoch,
                    num_epochs=num_epochs,
                    num_val_steps=num_val_steps,
                    save_path=model_save_file_path,
                    log_path=log_path,
                    log_period=log_period,
                    val_period=val_period,
                    save_period=save_period,
                    patience=patience)
    else:
        # Predict with the model
        num_test_steps = int(math.ceil(test_data_size / batch_size))
        # raw_predictions: Numpy array of shape (num_test_examples, 2).
        # The encodings returned alongside it are currently not written out.
        raw_predictions, _encodings = model.predict(
            get_test_instance_generator=test_data_gen,
            model_load_dir=model_save_dir,
            batch_size=batch_size,
            num_test_steps=num_test_steps)
        # Remove the first column, so we're left with just the probabilities
        # that a question is a duplicate.
        is_duplicate_probabilities = np.delete(raw_predictions, 0, 1)

        # The class balance between kaggle train and test seems different.
        # This edits prediction probability to account for the discrepancy.
        # See: https://www.kaggle.com/c/quora-question-pairs/discussion/31179
        if config.reweight_predictions_for_kaggle:
            positive_weight = 0.165 / 0.37
            negative_weight = (1 - 0.165) / (1 - 0.37)
            is_duplicate_probabilities = (
                (positive_weight * is_duplicate_probabilities) /
                (positive_weight * is_duplicate_probabilities +
                 negative_weight * (1 - is_duplicate_probabilities)))

        # Write the test pairs with the predicted probability appended as
        # the last column (no index, no header row).
        predictions_file_path = paths['predictions_file_path']
        logger.info("Writing predictions to {}".format(predictions_file_path))
        is_duplicate_df = pd.DataFrame(is_duplicate_probabilities)
        pair_info_df = pd.read_csv(paths['test_file_path'], header=None)
        result = pd.DataFrame(np.hstack((pair_info_df, is_duplicate_df)))
        result.to_csv(predictions_file_path, index=False, header=False)
        plot_pairs(predictions_file_path)
class TestSiameseBiLSTM(DuplicateTestCase):
    """Smoke tests for SiameseBiLSTM under several configurations.

    Each test tweaks ``self.config_dict`` (or leaves the defaults) and then
    runs the same train / rebuild / predict sequence, which lives in
    ``_train_then_reload_and_predict`` so it is written only once.
    """

    @overrides
    def setUp(self):
        """Create the fixture data, index it, and assemble the model config."""
        super(TestSiameseBiLSTM, self).setUp()
        self.write_duplicate_questions_train_file()
        self.write_duplicate_questions_validation_file()
        self.write_duplicate_questions_test_file()
        self.data_manager = DataManager(STSInstance)
        self.batch_size = 2
        self.get_train_gen, self.train_size = (
            self.data_manager.get_train_data_from_file([self.TRAIN_FILE]))
        self.get_val_gen, self.val_size = (
            self.data_manager.get_validation_data_from_file(
                [self.VALIDATION_FILE]))
        self.get_test_gen, self.test_size = (
            self.data_manager.get_test_data_from_file([self.TEST_FILE]))

        self.embedding_manager = EmbeddingManager(
            self.data_manager.data_indexer)
        self.word_embedding_dim = 5
        self.embedding_matrix = self.embedding_manager.get_embedding_matrix(
            self.word_embedding_dim)
        self.rnn_hidden_size = 6
        self.rnn_output_mode = "last"
        self.output_keep_prob = 1.0
        self.share_encoder_weights = True
        self.config_dict = {
            "mode": "train",
            "word_vocab_size": self.data_manager.data_indexer.get_vocab_size(),
            "word_embedding_dim": self.word_embedding_dim,
            "fine_tune_embeddings": False,
            "word_embedding_matrix": self.embedding_matrix,
            "rnn_hidden_size": self.rnn_hidden_size,
            "rnn_output_mode": self.rnn_output_mode,
            "output_keep_prob": self.output_keep_prob,
            "share_encoder_weights": self.share_encoder_weights
        }

        # Batches needed to cover each split, rounding up.
        self.num_train_steps_per_epoch = int(
            math.ceil(self.train_size / self.batch_size))
        self.num_val_steps = int(math.ceil(self.val_size / self.batch_size))
        self.num_test_steps = int(math.ceil(self.test_size / self.batch_size))

    def _train_then_reload_and_predict(self):
        """Train with the current config, then predict with a rebuilt model.

        The name does not start with ``test`` so that it is only run when a
        test method calls it. Mutates ``self.config_dict``: the mode is set
        to "test" and the embedding matrix entry is removed.
        """
        # Initialize the model
        model = SiameseBiLSTM(self.config_dict)
        model.build_graph()
        # Train the model
        model.train(get_train_instance_generator=self.get_train_gen,
                    get_val_instance_generator=self.get_val_gen,
                    batch_size=self.batch_size,
                    num_train_steps_per_epoch=self.num_train_steps_per_epoch,
                    num_epochs=2,
                    num_val_steps=self.num_val_steps,
                    save_path=self.TEST_DIR,
                    log_path=self.TEST_DIR,
                    log_period=2,
                    val_period=2,
                    save_period=2,
                    patience=0)

        tf.reset_default_graph()
        # Load and predict with the model
        self.config_dict["mode"] = "test"
        del self.config_dict["word_embedding_matrix"]
        loaded_model = SiameseBiLSTM(self.config_dict)
        loaded_model.build_graph()
        loaded_model.predict(get_test_instance_generator=self.get_test_gen,
                             model_load_dir=self.TEST_DIR,
                             batch_size=self.batch_size,
                             num_test_steps=self.num_test_steps)

    def test_default_does_not_crash(self):
        """The default configuration trains and predicts without raising."""
        self._train_then_reload_and_predict()

    def test_mean_pool_does_not_crash(self):
        """The "mean_pool" RNN output mode trains and predicts without raising."""
        self.config_dict["rnn_output_mode"] = "mean_pool"
        self._train_then_reload_and_predict()

    def test_non_sharing_encoders_does_not_crash(self):
        """Separate (non-shared) encoders train and predict without raising."""
        self.config_dict["share_encoder_weights"] = False
        self._train_then_reload_and_predict()
def main():
    """Train a SiameseMatchingBiLSTM model, or predict with a trained one.

    In "train" mode a new DataManager is fit on ``--train_file`` and used to
    index ``--val_file``; it is pickled into the checkpoint directory
    (``<save_dir>/<model_name>/<run_id>/``) and the model is trained.

    In "predict" mode the pickled DataManager is loaded from
    ``<dataindexer_load_path>/<model_name>/<run_id>/``, ``--test_file`` is
    indexed with it, and one ``test_id,result`` row per returned line id is
    written to a CSV inside the log directory.

    The run id is zero-padded to two characters wherever it is used in a
    path or file name.
    """
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)

    # Parse config arguments
    argparser = argparse.ArgumentParser(
        description=("Run the Siamese BiLSTM model with an added "
                     "matching layer for paraphase detection."))
    argparser.add_argument("mode", type=str,
                           choices=["train", "predict"],
                           help=("One of {train|predict}, to "
                                 "indicate what you want the model to do. "
                                 "If you pick \"predict\", then you must also "
                                 "supply the path to a pretrained model and "
                                 "DataIndexer to load."))
    argparser.add_argument("--model_load_dir", type=str,
                           default=os.path.join(project_dir,
                                                "models/"),
                           help=("The path to a directory with checkpoints to "
                                 "load for evaluation or prediction. The "
                                 "latest checkpoint will be loaded."))
    argparser.add_argument("--dataindexer_load_path", type=str,
                           default=os.path.join(project_dir,
                                                "models/"),
                           help=("The path to the DataIndexer fit on the "
                                 "train data, so we can properly index the "
                                 "test data for evaluation or prediction."))
    argparser.add_argument("--train_file", type=str,
                           default=os.path.join(
                               project_dir,
                               "data/processed/weizhong/"
                               "weizhong_train_train_split.txt"),
                           help="Path to a file to train on.")
    argparser.add_argument("--val_file", type=str,
                           default=os.path.join(
                               project_dir,
                               "data/processed/weizhong/"
                               "weizhong_train_val_split.txt"))
    argparser.add_argument("--test_file", type=str,
                           default=os.path.join(project_dir,
                                                "data/raw/"
                                                "weizhong_dev.txt"))
    argparser.add_argument("--batch_size", type=int, default=512,
                           help="Number of instances per batch.")
    argparser.add_argument("--num_epochs", type=int, default=200,
                           help=("Number of epochs to perform in "
                                 "training."))
    argparser.add_argument("--early_stopping_patience", type=int, default=100,
                           help=("number of epochs with no validation "
                                 "accuracy improvement after which training "
                                 "will be stopped"))
    argparser.add_argument("--num_sentence_words", type=int, default=25,
                           help=("The maximum length of a sentence. Longer "
                                 "sentences will be truncated, and shorter "
                                 "ones will be padded."))
    argparser.add_argument("--word_embedding_dim", type=int, default=300,
                           help="Dimensionality of the word embedding layer")
    argparser.add_argument("--pretrained_embeddings_file_path", type=str,
                           help="Path to a file with pretrained embeddings.",
                           default=os.path.join(
                               project_dir, "data/external/",
                               "word2vec_finance_train_use.txt"))
    argparser.add_argument("--fine_tune_embeddings", action="store_true",
                           help=("Whether to train the embedding layer "
                                 "(if True), or keep it fixed (False)."))
    argparser.add_argument("--rnn_hidden_size", type=int, default=256,
                           help=("The output dimension of the RNN."))
    argparser.add_argument("--share_encoder_weights", action="store_true",
                           help=("Whether to use the same encoder on both "
                                 "input sentences (thus sharing weights), "
                                 "or a different one for each sentence"))
    argparser.add_argument("--output_keep_prob", type=float, default=0.9,
                           help=("The proportion of RNN outputs to keep, "
                                 "where the rest are dropped out."))
    argparser.add_argument("--log_period", type=int, default=10,
                           help=("Number of steps between each summary "
                                 "op evaluation."))
    argparser.add_argument("--val_period", type=int, default=250,
                           help=("Number of steps between each evaluation of "
                                 "validation performance."))
    argparser.add_argument("--log_dir", type=str,
                           default=os.path.join(project_dir, "logs/"),
                           help=("Directory to save logs to."))
    argparser.add_argument("--save_period", type=int, default=250,
                           help=("Number of steps between each "
                                 "model checkpoint"))
    argparser.add_argument("--save_dir", type=str,
                           default=os.path.join(project_dir, "models/"),
                           help=("Directory to save model checkpoints to."))
    argparser.add_argument("--run_id", type=str, required=True,
                           help=("Identifying run ID for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train run_id"))
    # The original adjacent literals were missing the space after "If",
    # which rendered as "Ifpredicting" in --help.
    argparser.add_argument("--model_name", type=str, required=True,
                           help=("Identifying model name for this run. If "
                                 "predicting, you probably want this "
                                 "to be the same as the train run_id"))
    # NOTE(review): this flag is parsed but nothing in this function reads
    # it; the predict branch below writes the raw predictions unmodified.
    argparser.add_argument("--reweight_predictions_for_kaggle",
                           action="store_true",
                           help=("Only relevant when predicting. Whether to "
                                 "reweight the prediction probabilities to "
                                 "account for class proportion discrepancy "
                                 "between train and test."))

    config = argparser.parse_args()

    model_name = config.model_name
    run_id = config.run_id
    # Zero-padded form used in every path and file name below.
    padded_run_id = run_id.zfill(2)
    mode = config.mode
    # Previously a bare debug print; routed through the logger like every
    # other message in this function.
    logger.info("share_encoder_weights: {}".format(
        config.share_encoder_weights))

    # Get the data.
    batch_size = config.batch_size
    if mode == "train":
        # Read the train data from a file, and use it to index the
        # validation data
        data_manager = DataManager(WeizhongInstance)
        num_sentence_words = config.num_sentence_words
        get_train_data_gen, train_data_size = (
            data_manager.get_train_data_from_file(
                [config.train_file],
                max_lengths={"num_sentence_words": num_sentence_words}))
        get_val_data_gen, val_data_size = (
            data_manager.get_validation_data_from_file(
                [config.val_file],
                max_lengths={"num_sentence_words": num_sentence_words}))
    else:
        # Load the fitted DataManager, and use it to index the test data.
        # The config entry is overwritten with the resolved file path (as
        # before), so the params dump below records the file actually read.
        config.dataindexer_load_path = os.path.join(
            config.dataindexer_load_path, config.model_name,
            padded_run_id,
            config.model_name + "-" + padded_run_id + "-" +
            "DataManager.pkl")
        # Log after resolving the path so the message names the pickle
        # file itself rather than only its root directory.
        logger.info("Loading pickled DataManager from {}".format(
            config.dataindexer_load_path))
        # NOTE: unpickling can execute arbitrary code; only load DataManager
        # pickles written by a trusted training run.
        with open(config.dataindexer_load_path, "rb") as pickle_file:
            data_manager = pickle.load(pickle_file)
        get_test_data_gen, test_data_size = (
            data_manager.get_test_data_from_file([config.test_file]))

    vars(config)["word_vocab_size"] = (
        data_manager.data_indexer.get_vocab_size())

    # Log the run parameters. This must happen before the embedding matrix
    # is added to the config below, so the matrix is not part of the dump.
    log_dir = config.log_dir
    log_path = os.path.join(log_dir, model_name, padded_run_id)
    logger.info("Writing logs to {}".format(log_path))
    if not os.path.exists(log_path):
        logger.info("log path {} does not exist, "
                    "creating it".format(log_path))
        os.makedirs(log_path)
    params_path = os.path.join(log_path, mode + "params.json")
    logger.info("Writing params to {}".format(params_path))
    with open(params_path, 'w') as params_file:
        json.dump(vars(config), params_file, indent=4)

    # Get the embeddings.
    embedding_manager = EmbeddingManager(data_manager.data_indexer)
    embedding_matrix = embedding_manager.get_embedding_matrix(
        config.word_embedding_dim,
        config.pretrained_embeddings_file_path)
    vars(config)["word_embedding_matrix"] = embedding_matrix

    # Initialize the model.
    model = SiameseMatchingBiLSTM(vars(config))
    model.build_graph()

    if mode == "train":
        # Train the model.
        num_epochs = config.num_epochs
        num_train_steps_per_epoch = int(
            math.ceil(train_data_size / batch_size))
        num_val_steps = int(math.ceil(val_data_size / batch_size))
        log_period = config.log_period
        val_period = config.val_period

        save_period = config.save_period
        save_dir = os.path.join(config.save_dir, model_name,
                                padded_run_id + "/")
        save_path = os.path.join(save_dir, model_name + "-" + padded_run_id)

        logger.info("Checkpoints will be written to {}".format(save_dir))
        if not os.path.exists(save_dir):
            logger.info("save path {} does not exist, "
                        "creating it".format(save_dir))
            os.makedirs(save_dir)

        logger.info("Saving fitted DataManager to {}".format(save_dir))
        data_manager_pickle_name = "{}-{}-DataManager.pkl".format(
            model_name, padded_run_id)
        # Use a context manager so the pickle is flushed and closed before
        # the (potentially long) training run starts.
        with open(os.path.join(save_dir, data_manager_pickle_name),
                  "wb") as pickle_file:
            pickle.dump(data_manager, pickle_file)

        patience = config.early_stopping_patience
        model.train(get_train_instance_generator=get_train_data_gen,
                    get_val_instance_generator=get_val_data_gen,
                    batch_size=batch_size,
                    num_train_steps_per_epoch=num_train_steps_per_epoch,
                    num_epochs=num_epochs,
                    num_val_steps=num_val_steps,
                    save_path=save_path,
                    log_path=log_path,
                    log_period=log_period,
                    val_period=val_period,
                    save_period=save_period,
                    patience=patience)
    else:
        # Predict with the model
        model_load_dir = os.path.join(config.model_load_dir,
                                      config.model_name,
                                      padded_run_id)
        num_test_steps = int(math.ceil(test_data_size / batch_size))
        # NOTE(review): the original comment described raw_predictions as a
        # Numpy array of shape (num_test_examples, 2); each entry is written
        # below via str(), so confirm the shape against model.predict.
        raw_predictions, lineids = model.predict(
            get_test_instance_generator=get_test_data_gen,
            model_load_dir=model_load_dir,
            batch_size=batch_size,
            num_test_steps=num_test_steps)

        # Write the predictions to an output submission file
        output_predictions_path = os.path.join(
            log_path,
            model_name + "-" + padded_run_id + "-output_predictions.csv")
        # Announce the destination before writing, so the message is
        # accurate even if the write fails part-way.
        logger.info(
            "Writing predictions to {}".format(output_predictions_path))
        with open(output_predictions_path, "w") as output_file:
            output_file.write("test_id,result\n")
            for index, lineid in enumerate(lineids):
                output_file.write(
                    str(lineid) + "," + str(raw_predictions[index]) + "\n")