Example #1
def train_bi_ltsm(train_data, n_epochs=1):
    train_data = list(train_data)
    vocab_words, vocab_tags = make_vocabs(train_data)
    model = BiLSTM_Tagger(50, 300, len(vocab_words), len(vocab_tags))
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    for epoch in range(n_epochs):
        random.shuffle(train_data)
        c = 0
        for sentence, tags in batchify(train_data):
            c += 1
            print(f"{c}/{len(train_data)}", end="\r")
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Run our forward pass.
            tag_scores = model(sentence)

            # Step 3. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, tags)
            loss.backward()
            optimizer.step()
    return model
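The tagger above depends on a batchify helper that is not shown in this example. A minimal sketch, assuming the training data is already encoded (each sentence a list of (word id, tag id) pairs) and that the model is trained one sentence at a time, might look like this; the function body and tensor shapes are assumptions, not part of the original code:

import torch

def batchify(data):
    # Hypothetical helper: yield one encoded sentence per step as a pair of
    # LongTensors (word ids, tag ids), matching what NLLLoss expects together
    # with per-token log-probabilities from the tagger.
    for sentence in data:
        word_ids, tag_ids = zip(*sentence)
        yield (torch.tensor(word_ids, dtype=torch.long),
               torch.tensor(tag_ids, dtype=torch.long))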
Example #2
def train_perceptron(train_data, n_epochs=1):
    word_vocab, tag_vocab = make_vocabs(train_data)
    parser = PerceptronParser(word_vocab, tag_vocab)
    trainer = PerceptronTrainer(parser.model)

    for sample in samples(train_data, parser):
        features, gold_move = sample
        trainer.update(features, gold_move)
    trainer.finalize()
    return parser
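PerceptronTrainer itself is not included in these snippets. A minimal sketch of an averaged perceptron with the same update/finalize interface, assuming sparse features and a weight table keyed by (feature, class) pairs, could look as follows; the class name, constructor arguments, and weight layout are all assumptions:

from collections import defaultdict

class AveragedPerceptron:
    # Hypothetical trainer sketch: multi-class perceptron with weight averaging.
    def __init__(self, classes):
        self.weights = defaultdict(float)   # keyed by (feature, class)
        self.classes = list(classes)
        self._totals = defaultdict(float)   # running sums for averaging
        self._stamps = defaultdict(int)     # step of the last change per key
        self._step = 0

    def predict(self, features):
        scores = {c: sum(self.weights.get((f, c), 0.0) for f in features)
                  for c in self.classes}
        return max(scores, key=scores.get)

    def _change(self, key, delta):
        # Lazy averaging: credit the old weight for the steps it was in effect.
        self._totals[key] += (self._step - self._stamps[key]) * self.weights[key]
        self._stamps[key] = self._step
        self.weights[key] += delta

    def update(self, features, gold):
        self._step += 1
        pred = self.predict(features)
        if pred != gold:
            for f in features:
                self._change((f, gold), +1.0)
                self._change((f, pred), -1.0)

    def finalize(self):
        # Replace every weight by its average over all update steps.
        for key in list(self.weights):
            self._totals[key] += (self._step - self._stamps[key]) * self.weights[key]
            self.weights[key] = self._totals[key] / max(self._step, 1)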
Example #3
def train_eisner(train_data, n_epochs=1):
    word_vocab, tag_vocab = make_vocabs(train_data)
    parser = Eisner(word_vocab)
    trainer = PerceptronTrainer(parser.model)

    for sample in samples(train_data, parser):
        # Mirror the perceptron parser above: samples() is assumed to yield
        # (features, gold move) pairs, so perform one perceptron update per sample.
        features, gold_move = sample
        trainer.update(features, gold_move)
    trainer.finalize()
    return parser
Example #4
def train_neural(train_data, n_epochs=1, batch_size=300):
    train_data = list(train_data)  # because we will shuffle in-place
    vocab_words, vocab_tags = make_vocabs(train_data)
    classifier = NeuralParser(vocab_words, vocab_tags)
    optimizer = optim.Adam(classifier.model.parameters())
    for epoch in range(1, n_epochs+1):
        random.shuffle(train_data)
        for bx, by in batchify(train_data, batch_size, classifier, n_epochs):
            optimizer.zero_grad()
            output = classifier.model(bx)
            loss = F.cross_entropy(output, by)
            loss.backward()
            optimizer.step()
    return classifier
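NeuralParser and its .model attribute are defined elsewhere. A minimal sketch of a model that fits the loop above, taking batches of feature-ID vectors and returning unnormalized transition scores (which is what F.cross_entropy expects), could be a small feed-forward network; the class name and all layer sizes are assumptions:

import torch
import torch.nn as nn

class FeedForwardModel(nn.Module):
    # Hypothetical stand-in for classifier.model: embed a fixed window of
    # word/tag ids and score the possible transitions with one hidden layer.
    def __init__(self, vocab_size, n_features, n_classes, emb_dim=50, hidden_dim=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.hidden = nn.Linear(n_features * emb_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, n_classes)

    def forward(self, bx):
        # bx: LongTensor of shape (batch_size, n_features)
        embedded = self.embedding(bx).flatten(start_dim=1)
        return self.output(torch.relu(self.hidden(embedded)))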
Example #5
def eval_feature_parser(train_file, dev_file):
    with bz2.open(train_file, 'rt', encoding="utf-8") as source:
        train_data = list(read_data(source))

    with bz2.open(dev_file, 'rt', encoding="utf-8") as source:
        dev_data = list(read_data(source))

    train_data = filter_data(train_data, [1, 3, 6])
    dev_data = filter_data(dev_data, [1, 3, 6])
    vocab_words, vocab_tags = make_vocabs(train_data)

    perceptron_parser = train_perceptron(train_data, n_epochs=EPOCHS)
    print("UAS score for feature engineered perceptron:")
    print("{:.4f}".format(uas(perceptron_parser, dev_data)))
    print()
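The uas helper (unlabeled attachment score) is not shown either. A minimal sketch, assuming each dev sentence is a list of (word, tag, head) triples, as suggested by filter_data(..., [1, 3, 6]), and that the parser exposes a predict(words, tags) method returning one head index per token (a hypothetical interface), could be:

def uas(parser, gold_data):
    # Unlabeled attachment score: the fraction of tokens whose predicted head
    # matches the gold head. The token at index 0 is assumed to be the
    # artificial root and is skipped.
    correct = total = 0
    for sentence in gold_data:
        words, tags, heads = zip(*sentence)
        pred_heads = parser.predict(words, tags)   # hypothetical interface
        for i, (gold_head, pred_head) in enumerate(zip(heads, pred_heads)):
            if i == 0:
                continue
            total += 1
            correct += int(gold_head == pred_head)
    return correct / total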
Example #6
def eval_tagger(train_file, dev_file):
    with bz2.open(train_file, 'rt', encoding="utf-8") as source:
        train_data = list(read_data(source))

    with bz2.open(dev_file, 'rt', encoding="utf-8") as source:
        dev_data = list(read_data(source))

    train_data = filter_data(train_data, [1, 3])
    dev_data = filter_data(dev_data, [1, 3])
    vocab_words, vocab_tags = make_vocabs(train_data)
    encoded_train_data = encode(train_data, vocab_words, vocab_tags)
    encoded_dev_data = encode(dev_data, vocab_words, vocab_tags)

    bi_lstm_tagger = train_bi_ltsm(encoded_train_data, n_epochs=EPOCHS)
    print("Tagger accuracy for BiLSTM:")
    print("{:.4f}".format(accuracy(bi_lstm_tagger, encoded_dev_data)))
    print()
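Likewise, accuracy is assumed here to compare predicted and gold tags token by token over the encoded dev data; the predict method name is a hypothetical interface:

def accuracy(tagger, gold_data):
    # Per-token tagging accuracy over encoded (word id, tag id) sentences.
    correct = total = 0
    for sentence in gold_data:
        words, gold_tags = zip(*sentence)
        pred_tags = tagger.predict(words)          # hypothetical interface
        for gold_tag, pred_tag in zip(gold_tags, pred_tags):
            total += 1
            correct += int(gold_tag == pred_tag)
    return correct / total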
Example #7
def train_perceptron(train_data, n_epochs=1, encoded=True):
    train_data = list(train_data)  # because we will shuffle in-place
    vocab_words, vocab_tags = make_vocabs(train_data)

    tagger = GoldTagger(vocab_words, vocab_tags, encoded)
    if encoded:
        trainer = PerceptronTrainer(tagger.model)
    else:
        trainer = PerceptronTrainer(tagger.model, vocab_tags)

    # Index every prefix and suffix (lengths 1-4) observed in the training words.
    pre_sufix = {PAD: 0}
    for sentence in train_data:
        for w, tag in sentence:
            for n in range(1, 5):
                prefix = w[:n]
                if prefix not in pre_sufix:
                    pre_sufix[prefix] = len(pre_sufix)
            for n in range(1, 5):
                suffix = w[-n:]
                if suffix not in pre_sufix:
                    pre_sufix[suffix] = len(pre_sufix)

    tagger.pre_sufix = pre_sufix
    for _ in range(n_epochs):
        random.shuffle(train_data)
        for sentence in train_data:
            words, gold_tags = zip(*sentence)
            pred_tags = []
            for i, gold_tag in enumerate(gold_tags):
                features = tagger.featurize(
                    words, i, pred_tags,
                    sentence[i + 1][1] if i + 1 < len(sentence) else PAD)
                trainer.update(features, gold_tag)
                pred_tags.append(gold_tag)
    trainer.finalize()
    return tagger
Example #8
def train_perceptron(train_data, n_epochs=1, encoded=True):
    train_data = list(train_data)  # because we will shuffle in-place
    vocab_words, vocab_tags = make_vocabs(train_data)

    tagger = PerceptronTagger(vocab_words, vocab_tags, encoded)
    if encoded:
        trainer = PerceptronTrainer(tagger.model)
    else:
        trainer = PerceptronTrainer(tagger.model, vocab_tags)

    for _ in range(n_epochs):
        random.shuffle(train_data)
        for sentence in train_data:
            words, gold_tags = zip(*sentence)
            pred_tags = []
            for i, gold_tag in enumerate(gold_tags):
                features = tagger.featurize(words, i, pred_tags)
                trainer.update(features, gold_tag)
                pred_tags.append(gold_tag)
    trainer.finalize()
    return tagger
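The featurize call is where the tagger's feature set is defined, but its body is not part of this example. Below is a minimal sketch of what such a feature function could return (current word, neighbouring words, previously predicted tag); all feature names and the padding value are assumptions:

def featurize(words, i, pred_tags, pad="<pad>"):
    # Hypothetical feature extractor: a few sparse, hashable features
    # describing position i and the tag predicted for the previous token.
    prev_word = words[i - 1] if i > 0 else pad
    next_word = words[i + 1] if i + 1 < len(words) else pad
    prev_tag = pred_tags[i - 1] if i > 0 else pad
    return [
        ("word", words[i]),
        ("prev_word", prev_word),
        ("next_word", next_word),
        ("prev_tag", prev_tag),
    ]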
Example #9
def eval_parser(train_file, dev_file):
    with bz2.open(train_file, 'rt', encoding="utf-8") as source:
        train_data = list(read_data(source))

    with bz2.open(dev_file, 'rt', encoding="utf-8") as source:
        dev_data = list(read_data(source))

    train_data = filter_data(train_data, [1, 3, 6])
    dev_data = filter_data(dev_data, [1, 3, 6])
    vocab_words, vocab_tags = make_vocabs(train_data)

    perceptron_parser = train_perceptron(train_data, n_epochs=EPOCHS)
    print("UAS score for perceptron:")
    print("{:.4f}".format(uas(perceptron_parser, dev_data)))
    print()
    # L4 read gives: ~0.6698
    # Our read gives: ~0.6643

    neural_parser = train_neural(train_data, n_epochs=EPOCHS)
    print("UAS score for neural:")
    print("{:.4f}".format(uas(neural_parser, dev_data)))
    print()
Example #10
def eval_tagger(train_file, dev_file):
    with bz2.open(train_file, 'rt', encoding="utf-8") as source:
        train_data = list(read_data(source))

    with bz2.open(dev_file, 'rt', encoding="utf-8") as source:
        dev_data = list(read_data(source))

    train_data = filter_data(train_data, [1, 3])
    dev_data = filter_data(dev_data, [1, 3])
    vocab_words, vocab_tags = make_vocabs(train_data)
    encoded_train_data = encode(train_data, vocab_words, vocab_tags)
    encoded_dev_data = encode(dev_data, vocab_words, vocab_tags)

    perceptron_tagger = train_perceptron(encoded_train_data, n_epochs=EPOCHS)
    print("Tagger accuracy for perceptron:")
    print("{:.4f}".format(accuracy(perceptron_tagger, encoded_dev_data)))
    print()
    # L3 read gives: 0.8736
    # Our read gives: 0.8736

    neural_tagger = train_neural(encoded_train_data, n_epochs=EPOCHS)
    print("Tagger accuracy for neural:")
    print("{:.4f}".format(accuracy(neural_tagger, encoded_dev_data)))
    print()
Example #11
from lib import Eisner
from src import eisner_parser_trainer
from util.utils import read_data, filter_data, make_vocabs
from util.parser_utils import uas, output
import bz2

with bz2.open("files/train.conllu.bz2", 'rt') as source:
    train_data = list(read_data(source))

with bz2.open("files/dev.conllu.bz2", 'rt') as source:
    dev_data = list(read_data(source))

train_data = filter_data(train_data, [1, 3, 6])
dev_data = filter_data(dev_data, [1, 3, 6])

vocab_words, vocab_tags = make_vocabs(train_data)

#eisner_parser_trainer.train_eisner(train_data)
eisner = Eisner.Eisner(vocab_words)
tree = eisner.build_dependency_tree(train_data[0])
print(tree)