import numpy as np

import conllu
# TaggingDataset is assumed to be provided by the surrounding project; it
# supplies word_obj_to_str for converting a CoNLL-U word to its string key.


def evaluate_tagger_and_writeout(tagger):
    # Read CoNLL-U sentences from standard input, tag them, and write the
    # tagged sentences to standard output.
    stdin = conllu.reader()
    stdout = conllu.writer()

    for sentence in stdin:
        # Map every word to its vocabulary id, falling back to the OOV id.
        x = []
        for word in sentence:
            x.append(tagger.vocab.get(TaggingDataset.word_obj_to_str(word),
                                      tagger.vocab['#OOV']))
        x = np.array([x], dtype='int32')

        # Predict tag ids for the whole sentence and convert them back to strings.
        y_hat = tagger.predict(x)[0]
        y_hat_str = [tagger.tags.rev(tag_id) for tag_id in y_hat]

        # Store the predicted universal POS tag on each word and write the sentence out.
        for word, utag in zip(sentence, y_hat_str):
            word.upos = utag
        stdout.write_sentence(sentence)
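# A minimal sketch (not the project's actual classes) of the interface that
# evaluate_tagger_and_writeout assumes: `tagger.vocab` behaves like a dict from
# word strings to integer ids with a '#OOV' entry, and `tagger.tags.rev` maps a
# tag id back to its string. The Vocabulary name and the add method are
# hypothetical illustrations.
class Vocabulary(object):
    def __init__(self):
        self._str_to_id = {'#OOV': 0}
        self._id_to_str = ['#OOV']

    def add(self, string):
        # Assign the next free id to an unseen string.
        if string not in self._str_to_id:
            self._str_to_id[string] = len(self._id_to_str)
            self._id_to_str.append(string)
        return self._str_to_id[string]

    def get(self, string, default=None):
        return self._str_to_id.get(string, default)

    def __getitem__(self, string):
        return self._str_to_id[string]

    def rev(self, string_id):
        # Inverse mapping used when converting predicted tag ids back to strings.
        return self._id_to_str[string_id]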
import collections
import sys

import conllu

dictionary = {}

# Process all arguments as training files: count how often each analysis
# (lemma, UPOS, language-specific POS, morphological features) occurs for every form.
for arg in sys.argv[1:]:
    reader = conllu.reader(arg)
    sentence = []
    while reader.next_sentence(sentence):
        for word in sentence:
            dictionary.setdefault(word.form, collections.defaultdict(lambda: 0))[
                "\t".join([word.lemma, word.upos, word.lpos, word.feats])] += 1

# Find the most frequent analysis for every form, preferring the
# lexicographically smaller analysis when counts are equal.
for form in dictionary:
    best, best_count = '', 0
    for analysis, count in dictionary[form].items():
        if count > best_count or (count == best_count and analysis < best):
            best, best_count = analysis, count
    dictionary[form] = best

# Analyse all data passed on standard input and write the result to standard output.
stdin = conllu.reader()
stdout = conllu.writer()
sentence = []
while stdin.next_sentence(sentence):
    for word in sentence:
        word.lemma, word.upos, word.lpos, word.feats = dictionary.get(
            word.form, '\t\t\t').split('\t')
    stdout.write_sentence(sentence)
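# A small self-contained illustration (toy in-memory counts, not part of the
# script above) of the selection rule used to collapse `dictionary`: pick the
# analysis with the highest count, breaking ties in favour of the
# lexicographically smaller analysis string.
toy_counts = {
    'can': {'can\tAUX\tMD\t_': 7, 'can\tNOUN\tNN\tNumber=Sing': 2},
    'beat': {'beat\tVERB\tVB\t_': 3, 'beat\tNOUN\tNN\tNumber=Sing': 3},
}
for form, analyses in toy_counts.items():
    best, best_count = '', 0
    for analysis, count in analyses.items():
        if count > best_count or (count == best_count and analysis < best):
            best, best_count = analysis, count
    print(form, '->', best)
# 'can'  -> the AUX analysis (strictly higher count)
# 'beat' -> the NOUN analysis (counts tie, 'NOUN' sorts before 'VERB')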