Example #1
    def load_from_file(fname, vocab=None, alphabet=None, tags=None):
        """Load dataset from the given file."""
        reader = conllu.reader(fname)

        learn_tags, learn_vocab, tagset, vocab, alphabet = \
                TaggingDataset.initialize_vocab_and_tags(tags, vocab, alphabet)

        seqs = []
        for sentence in reader:
            words = []
            tags = []
            chars = []
            lemma_chars = []

            for word in sentence:
                word_id, char_ids, tag_id, lemma_char_ids = \
                        TaggingDataset.get_word_and_tag_id(word, vocab, alphabet, tagset,
                                                           learn_vocab, learn_tags)

                words.append(word_id)
                chars.append(char_ids)
                tags.append(tag_id)
                lemma_chars.append(lemma_char_ids)

            seqs.append((words, chars, tags, lemma_chars))

        res = TaggingDataset(seqs, vocab, alphabet, tagset)

        return res
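The vocab, alphabet, and tags parameters let a dataset built from training data share its mappings with later splits. A minimal usage sketch based only on the signature above (the file names are hypothetical, and the train.vocab/.alphabet/.tags attribute names are assumed from the constructor call in the example):

# Build all mappings from the training split, then pass them to the dev
# split so word/char/tag ids stay consistent across both datasets.
train = TaggingDataset.load_from_file('train.conllu')
dev = TaggingDataset.load_from_file('dev.conllu', vocab=train.vocab,
                                    alphabet=train.alphabet, tags=train.tags)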
Example #2
def evaluate_tagger_and_writeout(tagger):
    stdin = conllu.reader()
    stdout = conllu.writer()
    for sentence in stdin:
        x = []
        for word in sentence:
            x.append(
                tagger.vocab.get(TaggingDataset.word_obj_to_str(word),
                                 tagger.vocab['#OOV']))

        x = np.array([x], dtype='int32')

        y_hat = tagger.predict(x)[0]
        y_hat_str = [tagger.tags.rev(tag_id) for tag_id in y_hat]

        for word, utag in zip(sentence, y_hat_str):
            word.upos = utag

        stdout.write_sentence(sentence)
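The function assumes a specific tagger interface: a vocab mapping with a '#OOV' fallback entry, a predict method over a batch of int32 word-id arrays, and a tags object whose rev method maps tag ids back to strings. A minimal stub satisfying that contract (a sketch; DummyTagger and everything inside it are hypothetical, inferred only from the calls above):

import numpy as np

class DummyTagger:
    # Minimal stand-in for the interface evaluate_tagger_and_writeout uses.
    class _Tags:
        def __init__(self, id_to_tag):
            self.id_to_tag = id_to_tag

        def rev(self, tag_id):
            # Map a numeric tag id back to its string form.
            return self.id_to_tag[tag_id]

    def __init__(self):
        self.vocab = {'the': 0, 'cat': 1, '#OOV': 2}
        self.tags = self._Tags(['DET', 'NOUN', 'X'])

    def predict(self, x):
        # A real model would return per-word predictions; this tags everything 'X'.
        return np.full(x.shape, 2, dtype='int32')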
Example #3
    def load_from_file(fname, vocab=None, tags=None):
        """Load dataset from the given file."""
        reader = conllu.reader(fname)

        learn_tags, learn_vocab, tags, vocab = TaggingDataset.initialize_vocab_and_tags(
            tags, vocab)

        seqs = []
        for sentence in reader:
            x = []
            y = []
            for word in sentence:
                word_id, tag_id = TaggingDataset.get_word_and_tag_id(
                    word, vocab, tags, learn_vocab, learn_tags)

                x.append(word_id)
                y.append(tag_id)

            seqs.append((x, y))

        res = TaggingDataset(seqs, vocab, tags)

        return res
Example #4
#!/usr/bin/env python

# This file is part of RH_NNTagging <http://github.com/ufal/rh_nntagging/>.
#
# Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University in Prague, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import collections
import sys

import conllu

dictionary = {}

# Process all arguments as training files
for arg in sys.argv[1:]:
    reader = conllu.reader(arg)
    sentence = []
    while reader.next_sentence(sentence):
        for word in sentence:
            analysis = "\t".join([word.lemma, word.upos, word.lpos, word.feats])
            dictionary.setdefault(word.form, collections.defaultdict(int))[analysis] += 1

# Find most frequent analysis, using the lexicographically smaller when equal
for form in dictionary:
    best, best_count = '', 0
    for analysis, count in dictionary[form].items():
        if count > best_count or (count == best_count and analysis < best):
            best, best_count = analysis, count
    dictionary[form] = best

# Analyse all data passed on standard input to standard output
stdin = conllu.reader()
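
The example is cut off at this point. A minimal sketch of how the lookup loop might continue, reusing the reader/writer calls seen in the other examples (the handling of forms missing from the dictionary is an assumption):

stdout = conllu.writer()
sentence = []
while stdin.next_sentence(sentence):
    for word in sentence:
        if word.form in dictionary:
            # Restore the most frequent analysis recorded for this form.
            word.lemma, word.upos, word.lpos, word.feats = \
                dictionary[word.form].split("\t")
    stdout.write_sentence(sentence)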
Example #5
#!/usr/bin/env python

# This file is part of RH_NNTagging <http://github.com/ufal/rh_nntagging/>.
#
# Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University in Prague, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Remove all annotations from CoNLL-U file except for form.
# Reads from files given as arguments and writes to standard output.

import sys
import conllu

stdout = conllu.writer()
sentence = []
for arg in sys.argv[1:]:
    reader = conllu.reader(arg)
    while reader.next_sentence(sentence):
        for word in sentence:
            word.lemma, word.upos, word.lpos, word.feats = '', '', '', ''
            word.head, word.deprel, word.deps, word.misc = -1, '', '', ''
        stdout.write_sentence(sentence)
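
The script acts as a filter over one or more CoNLL-U files, e.g. python strip_annotations.py train.conllu > train_forms.conllu (the script name here is hypothetical); the sentences, with everything but the forms cleared, go to standard output.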