Example #1
import numpy as np

# `conllu` (a simple CoNLL-U reader/writer) and `TaggingDataset` are
# project-local modules that accompany this example; `TaggingDataset` is
# assumed to be defined or imported elsewhere in the same project.
import conllu


def evaluate_tagger_and_writeout(tagger):
    """Tag CoNLL-U data from stdin with `tagger` and write the result to stdout."""
    stdin = conllu.reader()
    stdout = conllu.writer()
    for sentence in stdin:
        # Map every word to its vocabulary id, falling back to the '#OOV'
        # id for words unseen during training.
        x = []
        for word in sentence:
            x.append(tagger.vocab.get(TaggingDataset.word_obj_to_str(word),
                                      tagger.vocab['#OOV']))

        x = np.array([x], dtype='int32')

        # Predict one tag id per word and convert the ids back to tag strings.
        y_hat = tagger.predict(x)[0]
        y_hat_str = [tagger.tags.rev(tag_id) for tag_id in y_hat]

        for word, utag in zip(sentence, y_hat_str):
            word.upos = utag

        stdout.write_sentence(sentence)
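The example assumes that `tagger.vocab` maps word strings to integer ids and contains an `'#OOV'` fallback entry, and that `tagger.tags.rev` maps tag ids back to tag strings. A minimal sketch of such interfaces follows; the `Vocab` class and its `rev` method are hypothetical illustrations, not part of the original project.

class Vocab(dict):
    """A string-to-id mapping with a naive inverse lookup."""
    def rev(self, idx):
        # Hypothetical inverse lookup: id -> string.
        for key, value in self.items():
            if value == idx:
                return key
        raise KeyError(idx)

vocab = Vocab({'#OOV': 0, 'the': 1, 'cat': 2})
tags = Vocab({'DET': 0, 'NOUN': 1, 'VERB': 2})
print(vocab.get('dog', vocab['#OOV']))  # unseen word falls back to the '#OOV' id 0
print(tags.rev(1))                      # prints 'NOUN'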
Example #2
import collections
import sys

import conllu

dictionary = {}

# Process all arguments as training files.
for arg in sys.argv[1:]:
    reader = conllu.reader(arg)
    sentence = []
    while reader.next_sentence(sentence):
        for word in sentence:
            # Count how often each (lemma, upos, lpos, feats) analysis
            # occurs for this word form.
            dictionary.setdefault(
                word.form, collections.defaultdict(lambda: 0))["\t".join(
                    [word.lemma, word.upos, word.lpos, word.feats])] += 1

# Find the most frequent analysis, using the lexicographically smaller one on ties.
for form in dictionary:
    best, best_count = '', 0
    for analysis, count in dictionary[form].items():
        if count > best_count or (count == best_count and analysis < best):
            best, best_count = analysis, count
    dictionary[form] = best

# Analyse all data passed on standard input, writing to standard output.
stdin = conllu.reader()
stdout = conllu.writer()
sentence = []
while stdin.next_sentence(sentence):
    for word in sentence:
        # Unknown forms get empty lemma, upos, lpos and feats.
        word.lemma, word.upos, word.lpos, word.feats = dictionary.get(
            word.form, '\t\t\t').split('\t')
    stdout.write_sentence(sentence)
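For illustration, the "most frequent analysis, lexicographically smaller on ties" selection above can also be expressed as a single min over (negative count, analysis) keys. A small self-contained sketch with made-up counts for one word form:

import collections

# Toy counts for a single word form; in the example above these are gathered
# from the training files given on the command line.
counts = collections.defaultdict(int)
for analysis in ["be\tAUX\tVB\t_", "be\tVERB\tVB\t_", "be\tAUX\tVB\t_"]:
    counts[analysis] += 1

# Highest count wins; on ties the lexicographically smaller analysis wins,
# mirroring the explicit loop in the example.
best = min(counts, key=lambda a: (-counts[a], a))
print(best)  # be<TAB>AUX<TAB>VB<TAB>_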