def train_parser(parser, optimizer, dataset, n_epochs=1, n_train_insts=1000, name="eng"):
    """Train a parser for n_epochs, evaluating on the dev set and
    checkpointing the model after every epoch."""
    for epoch in range(n_epochs):
        model_name = name + "-bakeoff-" + str(epoch + 1) + ".model"
        print("Epoch {}".format(epoch + 1), "Model:", model_name)

        parser.train()  # turn on dropout layers if they are there
        parsing.train(dataset.training_data[:n_train_insts], parser, optimizer, verbose=True)

        print("Dev Evaluation")
        parser.eval()  # turn them off for evaluation
        parsing.evaluate(dataset.dev_data, parser, verbose=True)
        print("F-Score: {}".format(
            evaluation.compute_metric(parser, dataset.dev_data, evaluation.fscore)))
        print("Attachment Score: {}".format(
            evaluation.compute_attachment(parser, dataset.dev_data)))
        print("\n")

        print("Saving Model:")
        torch.save(parser.state_dict(), model_name)
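# Usage sketch for train_parser (an illustration, not the bakeoff run):
# the component classes mirror the d3 test setup further below, and the
# SGD learning rate here is an assumed placeholder.
#
#   parser = TransitionParser(SimpleFeatureExtractor(),
#                             VanillaWordEmbedding(word_to_ix, TEST_EMBEDDING_DIM),
#                             FFActionChooser(TEST_EMBEDDING_DIM * NUM_FEATURES),
#                             FFCombiner(TEST_EMBEDDING_DIM))
#   optimizer = optim.SGD(parser.parameters(), lr=0.01)
#   train_parser(parser, optimizer, en_dataset, n_epochs=4, name="eng")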
def train_english(model_file_name=None):
    """Train the English bakeoff model using pretrained polyglot embeddings."""
    # Hyperparameters
    bakeoff_ETA_0_en = 0.01
    bakeoff_DROPOUT_en = 0.5
    bakeoff_LSTM_NUM_LAYERS_en = 1

    # Load the pickled polyglot embeddings and convert the (words, vectors)
    # tuple into a word -> vector dict
    pretrained_embeds = pickle.load(open("data/polyglot-en.pkl", 'rb'), encoding='latin1')
    pretrained_embeds = transform_tuple_to_dict(pretrained_embeds)

    en_dataset = Dataset(EN_TRAIN_FILE, EN_DEV_FILE, EN_TEST_FILE)
    word_to_ix_en = {word: i for i, word in enumerate(en_dataset.vocab)}
    en_dev_data = [inst.sentence for inst in en_dataset.dev_data]

    train(ETA=bakeoff_ETA_0_en,
          DROPOUT=bakeoff_DROPOUT_en,
          LSTM_NUM_LAYERS=bakeoff_LSTM_NUM_LAYERS_en,
          n_epochs=4,
          dataset=en_dataset,
          word_to_ix=word_to_ix_en,
          pretrained_embeds=pretrained_embeds,
          output_preds_filename="bakeoff-dev-en.preds",
          dev_data=en_dev_data,
          name="eng",
          model_file_name=model_file_name,
          bakeoff_csv_name="KAGGLE-bakeoff-preds-en.csv",
          output_test_filename="bakeoff-test-en.preds")
def train_norwegian():
    """Train the Norwegian bakeoff model."""
    # Hyperparameters
    bakeoff_ETA_0_nr = 0.01
    bakeoff_DROPOUT_nr = 0.5
    bakeoff_LSTM_NUM_LAYERS_nr = 1

    pretrained_embeds_nr = pickle.load(
        open(PRETRAINED_EMBEDS_FILE_NR, 'rb'),
        encoding='latin1')  # NOT DOING ANYTHING FOR NORWEGIAN
    pretrained_embeds = transform_tuple_to_dict(pretrained_embeds_nr)

    nr_dataset = Dataset(NR_TRAIN_FILE, NR_DEV_FILE, NR_TEST_FILE)
    word_to_ix_nr = {word: i for i, word in enumerate(nr_dataset.vocab)}
    nr_dev_data = [inst.sentence for inst in nr_dataset.dev_data]

    train(ETA=bakeoff_ETA_0_nr,
          DROPOUT=bakeoff_DROPOUT_nr,
          LSTM_NUM_LAYERS=bakeoff_LSTM_NUM_LAYERS_nr,
          n_epochs=5,
          dataset=nr_dataset,
          word_to_ix=word_to_ix_nr,
          pretrained_embeds=pretrained_embeds,
          output_preds_filename="bakeoff-dev-nr.preds",
          dev_data=nr_dev_data,
          name="norweg",
          bakeoff_csv_name="KAGGLE-bakeoff-preds-nr.csv",
          output_test_filename="bakeoff-test-nr.preds")
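# Entry-point sketch (an assumption; the excerpt does not show how the
# trainers are invoked). Both calls require the data files referenced
# above to be present on disk:
#
#   if __name__ == "__main__":
#       train_english()
#       train_norwegian()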
def test_predict_after_train_d3_1():
    """ 1 point(s) """
    global test_sent, gold, word_to_ix, vocab
    torch.manual_seed(1)

    # Build a parser from the simple d3 components
    feat_extract = SimpleFeatureExtractor()
    word_embed = VanillaWordEmbedding(word_to_ix, TEST_EMBEDDING_DIM)
    act_chooser = FFActionChooser(TEST_EMBEDDING_DIM * NUM_FEATURES)
    combiner = FFCombiner(TEST_EMBEDDING_DIM)
    parser = TransitionParser(feat_extract, word_embed, act_chooser, combiner)

    # Train on the single (sentence, oracle actions) pair
    for _ in range(75):
        train([(test_sent[:-1], gold)],
              parser,
              optim.SGD(parser.parameters(), lr=0.01),
              verbose=False)

    # Predict and compare against the gold dependency graph
    pred = parser.predict(test_sent[:-1])
    gold_graph = dependency_graph_from_oracle(test_sent[:-1], gold)
    assert pred == gold_graph
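# To run just this check (assumption: pytest is the test harness, as the
# test_* naming suggests):
#
#   pytest -k test_predict_after_train_d3_1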