def train_bi_ltsm(train_data, n_epochs=1):
    train_data = list(train_data)  # because we will shuffle in-place
    vocab_words, vocab_tags = make_vocabs(train_data)
    model = BiLSTM_Tagger(50, 300, len(vocab_words), len(vocab_tags))
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    for epoch in range(n_epochs):
        random.shuffle(train_data)
        c = 0
        for sentence, tags in batchify(train_data):
            c += 1
            print(f"{c}/{len(train_data)}", end="\r")
            # Step 1. PyTorch accumulates gradients, so clear them out before each instance.
            model.zero_grad()
            # Step 2. Run the forward pass.
            tag_scores = model(sentence)
            # Step 3. Compute the loss and gradients, then update the parameters
            # by calling optimizer.step().
            loss = loss_function(tag_scores, tags)
            loss.backward()
            optimizer.step()
    return model
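# A minimal sketch of what the BiLSTM_Tagger module used above might look like; the class
# name, layer layout, and sizes below are assumptions, not the repo's actual implementation.
# The constructor signature (embedding_dim, hidden_dim, vocab_size, num_tags) mirrors the call
# BiLSTM_Tagger(50, 300, ...) above, and the forward pass ends in log_softmax because the
# training loop pairs it with nn.NLLLoss.
import torch.nn as nn
import torch.nn.functional as F

class BiLSTM_TaggerSketch(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_tags):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        # The bidirectional LSTM concatenates forward and backward states, hence 2 * hidden_dim.
        self.hidden2tag = nn.Linear(2 * hidden_dim, num_tags)

    def forward(self, sentence):
        # sentence: LongTensor of word ids with shape (sequence_length,)
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        return F.log_softmax(tag_space, dim=1)  # log-probabilities for NLLLoss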
def train_perceptron(train_data, n_epochs=1):
    word_vocab, tag_vocab = make_vocabs(train_data)
    parser = PerceptronParser(word_vocab, tag_vocab)
    trainer = PerceptronTrainer(parser.model)
    for sample in samples(train_data, parser):
        features, gold_move = sample
        trainer.update(features, gold_move)
    trainer.finalize()
    return parser
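# A hedged sketch of the averaged-perceptron logic that PerceptronTrainer is assumed to
# implement on top of the parser's linear model. The class below is hypothetical and
# self-contained: update() rewards the gold class and penalises the current prediction,
# and finalize() replaces each weight with its average over all update steps, which is the
# standard trick for making the perceptron more stable on held-out data.
from collections import defaultdict

class AveragedPerceptronSketch:
    def __init__(self, classes):
        self.classes = list(classes)
        self.weights = defaultdict(float)   # keyed by (class, feature)
        self._totals = defaultdict(float)   # accumulated weight mass for averaging
        self._tstamps = defaultdict(int)    # step at which a weight last changed
        self._step = 0

    def predict(self, features):
        scores = {c: sum(self.weights[(c, f)] for f in features) for c in self.classes}
        return max(scores, key=scores.get)

    def _bump(self, key, delta):
        # Lazy averaging: credit the old value for every step it has been in effect.
        self._totals[key] += (self._step - self._tstamps[key]) * self.weights[key]
        self._tstamps[key] = self._step
        self.weights[key] += delta

    def update(self, features, gold):
        self._step += 1
        pred = self.predict(features)
        if pred != gold:
            for f in features:
                self._bump((gold, f), +1.0)
                self._bump((pred, f), -1.0)

    def finalize(self):
        for key in list(self.weights):
            self._totals[key] += (self._step - self._tstamps[key]) * self.weights[key]
            self.weights[key] = self._totals[key] / max(self._step, 1)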
def train_eisner(train_data, n_epochs=1):
    word_vocab, tag_vocab = make_vocabs(train_data)
    parser = Eisner(word_vocab)
    trainer = PerceptronTrainer(parser.model)
    for sample in samples(train_data, parser):
        # TODO: Add training loop
        pass
    trainer.finalize()
    return parser
def train_neural(train_data, n_epochs=1, batch_size=300):
    train_data = list(train_data)  # because we will shuffle in-place
    vocab_words, vocab_tags = make_vocabs(train_data)
    classifier = NeuralParser(vocab_words, vocab_tags)
    optimizer = optim.Adam(classifier.model.parameters())
    for epoch in range(1, n_epochs + 1):
        random.shuffle(train_data)
        for bx, by in batchify(train_data, batch_size, classifier, n_epochs):
            optimizer.zero_grad()
            output = classifier.model(bx)
            loss = F.cross_entropy(output, by)
            loss.backward()
            optimizer.step()
    return classifier
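# A minimal sketch of the feed-forward network that NeuralParser is assumed to wrap; the
# class name, feature window, and layer sizes are hypothetical. Each batch row is a fixed
# number of word ids describing the current parser configuration; the ids are embedded,
# concatenated, and passed through one hidden layer to produce raw scores over the possible
# transitions, which is why the training loop above applies F.cross_entropy to the output.
import torch
import torch.nn as nn

class ParserMLPSketch(nn.Module):
    def __init__(self, vocab_size, num_moves, window_size=3,
                 embedding_dim=50, hidden_dim=200):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden = nn.Linear(window_size * embedding_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, num_moves)

    def forward(self, batch):
        # batch: LongTensor of shape (batch_size, window_size)
        embedded = self.embedding(batch)                    # (batch, window, embedding_dim)
        flat = embedded.view(batch.size(0), -1)             # concatenate the window
        return self.output(torch.relu(self.hidden(flat)))   # raw logits for cross_entropy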
def eval_feature_parser(train_file, dev_file):
    with bz2.open(train_file, 'rt', encoding="utf-8") as source:
        train_data = list(read_data(source))
    with bz2.open(dev_file, 'rt', encoding="utf-8") as source:
        dev_data = list(read_data(source))

    train_data = filter_data(train_data, [1, 3, 6])
    dev_data = filter_data(dev_data, [1, 3, 6])

    vocab_words, vocab_tags = make_vocabs(train_data)

    perceptron_parser = train_perceptron(train_data, n_epochs=EPOCHS)
    print("UAS score for feature engineered perceptron:")
    print("{:.4f}".format(uas(perceptron_parser, dev_data)))
    print()
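# A hedged sketch of how the uas() helper used above might be computed; the predict()
# interface and the (word, tag, head) sentence layout are assumptions. UAS is simply the
# fraction of non-root tokens whose predicted head matches the gold head.
def uas_sketch(parser, gold_sentences):
    correct, total = 0, 0
    for sentence in gold_sentences:
        words, tags, gold_heads = zip(*sentence)
        pred_heads = parser.predict(words, tags)
        for i in range(1, len(words)):  # skip the artificial root at position 0
            total += 1
            if pred_heads[i] == gold_heads[i]:
                correct += 1
    return correct / total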
def eval_tagger(train_file, dev_file):
    with bz2.open(train_file, 'rt', encoding="utf-8") as source:
        train_data = list(read_data(source))
    with bz2.open(dev_file, 'rt', encoding="utf-8") as source:
        dev_data = list(read_data(source))

    train_data = filter_data(train_data, [1, 3])
    dev_data = filter_data(dev_data, [1, 3])

    vocab_words, vocab_tags = make_vocabs(train_data)
    encoded_train_data = encode(train_data, vocab_words, vocab_tags)
    encoded_dev_data = encode(dev_data, vocab_words, vocab_tags)

    bi_lstm_tagger = train_bi_ltsm(encoded_train_data, n_epochs=EPOCHS)
    print("Tagger accuracy for BiLSTM:")
    print("{:.4f}".format(accuracy(bi_lstm_tagger, encoded_dev_data)))
    print()
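# A hedged sketch of the accuracy() helper used above; the predict() interface is an
# assumption. Tagging accuracy is the fraction of tokens whose predicted tag matches the
# gold tag over the encoded development data.
def accuracy_sketch(tagger, gold_sentences):
    correct, total = 0, 0
    for sentence in gold_sentences:
        words, gold_tags = zip(*sentence)
        pred_tags = tagger.predict(words)
        for pred, gold in zip(pred_tags, gold_tags):
            total += 1
            if pred == gold:
                correct += 1
    return correct / total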
def train_perceptron(train_data, n_epochs=1, encoded=True):
    train_data = list(train_data)  # because we will shuffle in-place
    vocab_words, vocab_tags = make_vocabs(train_data)
    tagger = GoldTagger(vocab_words, vocab_tags, encoded)
    if encoded:
        trainer = PerceptronTrainer(tagger.model)
    else:
        trainer = PerceptronTrainer(tagger.model, vocab_tags)  # MOST FREQ

    # Collect prefixes and suffixes of length 1-4 as additional features.
    pre_sufix = {PAD: 0}
    for sentence in train_data:
        for w, _ in sentence:
            for n in range(1, 5):
                if w[:n] not in pre_sufix:
                    pre_sufix[w[:n]] = len(pre_sufix)
            for n in range(1, 5):
                if w[-n:] not in pre_sufix:
                    pre_sufix[w[-n:]] = len(pre_sufix)
    tagger.pre_sufix = pre_sufix

    for _ in range(n_epochs):
        random.shuffle(train_data)
        for sentence in train_data:
            words, gold_tags = zip(*sentence)
            pred_tags = []
            for i, gold_tag in enumerate(gold_tags):
                # Look-ahead feature: the gold tag of the next token (PAD at the end).
                next_tag = sentence[i + 1][1] if i + 1 < len(sentence) else PAD
                features = tagger.featurize(words, i, pred_tags, next_tag)
                trainer.update(features, gold_tag)
                pred_tags.append(gold_tag)
    trainer.finalize()
    return tagger
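# A hedged sketch of the kind of feature template the tagger.featurize call above is assumed
# to build; the concrete feature names are hypothetical. It combines the current word and its
# neighbours, the previously predicted tag, the look-ahead gold tag passed in by the training
# loop, and the length-1 to length-4 prefixes and suffixes collected in pre_sufix above.
def featurize_sketch(words, i, pred_tags, next_tag, pad="<pad>"):
    word = words[i]
    features = [
        ("word", word),
        ("prev_word", words[i - 1] if i > 0 else pad),
        ("next_word", words[i + 1] if i + 1 < len(words) else pad),
        ("prev_tag", pred_tags[i - 1] if i > 0 else pad),
        ("next_tag", next_tag),
    ]
    # Prefix and suffix features of length 1-4, mirroring the pre_sufix dictionary.
    for n in range(1, 5):
        features.append(("prefix", word[:n]))
        features.append(("suffix", word[-n:]))
    return features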
def train_perceptron(train_data, n_epochs=1, encoded=True):
    train_data = list(train_data)  # because we will shuffle in-place
    vocab_words, vocab_tags = make_vocabs(train_data)
    tagger = PerceptronTagger(vocab_words, vocab_tags, encoded)
    if encoded:
        trainer = PerceptronTrainer(tagger.model)
    else:
        trainer = PerceptronTrainer(tagger.model, vocab_tags)
    for _ in range(n_epochs):
        random.shuffle(train_data)
        for sentence in train_data:
            words, gold_tags = zip(*sentence)
            pred_tags = []
            for i, gold_tag in enumerate(gold_tags):
                features = tagger.featurize(words, i, pred_tags)
                trainer.update(features, gold_tag)
                pred_tags.append(gold_tag)
    trainer.finalize()
    return tagger
def eval_parser(train_file, dev_file):
    with bz2.open(train_file, 'rt', encoding="utf-8") as source:
        train_data = list(read_data(source))
    with bz2.open(dev_file, 'rt', encoding="utf-8") as source:
        dev_data = list(read_data(source))

    train_data = filter_data(train_data, [1, 3, 6])
    dev_data = filter_data(dev_data, [1, 3, 6])

    vocab_words, vocab_tags = make_vocabs(train_data)

    perceptron_parser = train_perceptron(train_data, n_epochs=EPOCHS)
    print("UAS score for perceptron:")
    print("{:.4f}".format(uas(perceptron_parser, dev_data)))
    print()
    # L4 read gives: ~0.6698
    # Our read gives: ~0.6643

    neural_parser = train_neural(train_data, n_epochs=EPOCHS)
    print("UAS score for neural:")
    print("{:.4f}".format(uas(neural_parser, dev_data)))
    print()
def eval_tagger(train_file, dev_file):
    with bz2.open(train_file, 'rt', encoding="utf-8") as source:
        train_data = list(read_data(source))
    with bz2.open(dev_file, 'rt', encoding="utf-8") as source:
        dev_data = list(read_data(source))

    train_data = filter_data(train_data, [1, 3])
    dev_data = filter_data(dev_data, [1, 3])

    vocab_words, vocab_tags = make_vocabs(train_data)
    encoded_train_data = encode(train_data, vocab_words, vocab_tags)
    encoded_dev_data = encode(dev_data, vocab_words, vocab_tags)

    perceptron_tagger = train_perceptron(encoded_train_data, n_epochs=EPOCHS)
    print("Tagger accuracy for perceptron:")
    print("{:.4f}".format(accuracy(perceptron_tagger, encoded_dev_data)))
    print()
    # L3 read gives: 0.8736
    # Our read gives: 0.8736

    neural_tagger = train_neural(encoded_train_data, n_epochs=EPOCHS)
    print("Tagger accuracy for neural:")
    print("{:.4f}".format(accuracy(neural_tagger, encoded_dev_data)))
    print()
from lib import Eisner
from src import eisner_parser_trainer
from util.utils import read_data, filter_data, make_vocabs
from util.parser_utils import uas, output
import bz2

with bz2.open("files/train.conllu.bz2", 'rt') as source:
    train_data = list(read_data(source))
with bz2.open("files/dev.conllu.bz2", 'rt') as source:
    dev_data = list(read_data(source))

train_data = filter_data(train_data, [1, 3, 6])
dev_data = filter_data(dev_data, [1, 3, 6])

vocab_words, vocab_tags = make_vocabs(train_data)

# eisner_parser_trainer.train_eisner(train_data)
eisner = Eisner.Eisner(vocab_words)
tree = eisner.build_dependency_tree(train_data[0])
print(tree)
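# A self-contained sketch of Eisner's O(n^3) projective decoding, which is the step that
# Eisner.build_dependency_tree presumably performs. The functions below are hypothetical:
# they work on an explicit score matrix rather than the parser's internal feature model.
# scores[h, m] is the score of an arc from head h to modifier m, with token 0 acting as the
# artificial root; the result is one head index per token (heads[0] stays -1).
import numpy as np

def eisner_decode_sketch(scores):
    n = scores.shape[0]        # number of tokens, including the root at position 0
    N = n - 1                  # number of real words; spans cover positions 0..N
    complete = np.zeros((n, n, 2))       # d=0: head on the right, d=1: head on the left
    incomplete = np.zeros((n, n, 2))
    complete_bp = -np.ones((n, n, 2), dtype=int)
    incomplete_bp = -np.ones((n, n, 2), dtype=int)

    for k in range(1, n):
        for s in range(n - k):
            t = s + k
            # Incomplete spans: two adjacent complete halves joined by the arc t->s or s->t.
            inc_vals = complete[s, s:t, 1] + complete[(s + 1):(t + 1), t, 0]
            incomplete[s, t, 0] = np.max(inc_vals + scores[t, s])
            incomplete_bp[s, t, 0] = s + np.argmax(inc_vals + scores[t, s])
            incomplete[s, t, 1] = np.max(inc_vals + scores[s, t])
            incomplete_bp[s, t, 1] = s + np.argmax(inc_vals + scores[s, t])
            # Complete spans: a complete half absorbed into an adjacent incomplete one.
            left_vals = complete[s, s:t, 0] + incomplete[s:t, t, 0]
            complete[s, t, 0] = np.max(left_vals)
            complete_bp[s, t, 0] = s + np.argmax(left_vals)
            right_vals = incomplete[s, (s + 1):(t + 1), 1] + complete[(s + 1):(t + 1), t, 1]
            complete[s, t, 1] = np.max(right_vals)
            complete_bp[s, t, 1] = s + 1 + np.argmax(right_vals)

    heads = [-1] * n
    _backtrack(incomplete_bp, complete_bp, 0, N, 1, True, heads)
    return heads

def _backtrack(incomplete_bp, complete_bp, s, t, direction, is_complete, heads):
    # Follow the backpointers, recording an arc each time an incomplete span is expanded.
    if s == t:
        return
    if is_complete:
        r = complete_bp[s, t, direction]
        if direction == 0:
            _backtrack(incomplete_bp, complete_bp, s, r, 0, True, heads)
            _backtrack(incomplete_bp, complete_bp, r, t, 0, False, heads)
        else:
            _backtrack(incomplete_bp, complete_bp, s, r, 1, False, heads)
            _backtrack(incomplete_bp, complete_bp, r, t, 1, True, heads)
    else:
        r = incomplete_bp[s, t, direction]
        if direction == 0:
            heads[s] = t
        else:
            heads[t] = s
        _backtrack(incomplete_bp, complete_bp, s, r, 1, True, heads)
        _backtrack(incomplete_bp, complete_bp, r + 1, t, 0, True, heads)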