Example no. 1
def test_load(self):
    data_dir = English.default_data_dir()
    if path.exists(path.join(data_dir, 'vocab')):
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    if path.exists(path.join(data_dir, 'deps')):
        parser = Parser.from_dir(path.join(data_dir, 'deps'),
                                 vocab.strings, ArcEager)
Example no. 2
    def test_thinc_load(self):
        data_dir = English.default_data_dir()
        model_loc = path.join(data_dir, 'deps', 'model')

        # n classes. moves.n_moves above
        # n features. len(templates) + 1 above
        model = LinearModel(92, 116)
        model.load(model_loc)
Example no. 3
    def test_thinc_load(self):
        data_dir = English.default_data_dir()
        model_loc = path.join(data_dir, 'deps', 'model')

        # n classes. moves.n_moves above
        # n features. len(templates) + 1 above
        if path.exists(model_loc):
            model = LinearModel(92, 116)
            model.load(model_loc)
Example no. 4
    def test_load_careful(self):
        config_data = {"labels": {"0": {"": True}, "1": {"": True}, "2": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "dobj": True, "neg": True, "csubjpass": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "relcl": True, "quantmod": True, "acomp": True, "compound": True, "pcomp": True, "intj": True, "poss": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "amod": True, "dative": True, "pobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True, "acl": True}, "3": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "acl": True, "poss": True, "neg": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "amod": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "quantmod": True, "acomp": True, "pcomp": True, "intj": True, "relcl": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "dobj": True, "dative": True, "pobj": True, "iobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True}, "4": {"ROOT": True}}, "seed": 0, "features": "basic", "beam_width": 1}

        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))

        moves = ArcEager(vocab.strings, config_data['labels'])
        templates = get_templates(config_data['features'])

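        # One output class per parser transition, plus the feature templates and
        # the directory holding the pre-trained dependency model.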
        model = Model(moves.n_moves, templates, path.join(data_dir, 'deps'))

        parser = Parser(vocab.strings, moves, model)
Example no. 5
def count_freqs(input_loc, output_loc):
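    # Tokenize every comment, tally token frequencies by ORTH id, then write a
    # tab-separated "<count>\t<token>" file, skipping whitespace-only tokens.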
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    with codecs.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
Example no. 6
def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(vocab,
                    path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    with io.open(output_loc, 'w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
Example no. 7
def test_load(self):
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    parser = Parser.from_dir(path.join(data_dir, 'deps'), vocab.strings, ArcEager)
Example no. 8
def test_load(self):
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab)
Example no. 9
def test_load(self):
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer'))
Example no. 10
def test_load(self):
    vocab = Vocab.from_dir(path.join(English.default_data_dir(), 'vocab'))
Example no. 11
    def test_load_careful(self):
        config_data = {
            "labels": {
                "0": {
                    "": True
                },
                "1": {
                    "": True
                },
                "2": {
                    "cc": True,
                    "agent": True,
                    "ccomp": True,
                    "prt": True,
                    "meta": True,
                    "nsubjpass": True,
                    "csubj": True,
                    "conj": True,
                    "dobj": True,
                    "neg": True,
                    "csubjpass": True,
                    "mark": True,
                    "auxpass": True,
                    "advcl": True,
                    "aux": True,
                    "ROOT": True,
                    "prep": True,
                    "parataxis": True,
                    "xcomp": True,
                    "nsubj": True,
                    "nummod": True,
                    "advmod": True,
                    "punct": True,
                    "relcl": True,
                    "quantmod": True,
                    "acomp": True,
                    "compound": True,
                    "pcomp": True,
                    "intj": True,
                    "poss": True,
                    "npadvmod": True,
                    "case": True,
                    "attr": True,
                    "dep": True,
                    "appos": True,
                    "det": True,
                    "nmod": True,
                    "amod": True,
                    "dative": True,
                    "pobj": True,
                    "expl": True,
                    "predet": True,
                    "preconj": True,
                    "oprd": True,
                    "acl": True
                },
                "3": {
                    "cc": True,
                    "agent": True,
                    "ccomp": True,
                    "prt": True,
                    "meta": True,
                    "nsubjpass": True,
                    "csubj": True,
                    "conj": True,
                    "acl": True,
                    "poss": True,
                    "neg": True,
                    "mark": True,
                    "auxpass": True,
                    "advcl": True,
                    "aux": True,
                    "amod": True,
                    "ROOT": True,
                    "prep": True,
                    "parataxis": True,
                    "xcomp": True,
                    "nsubj": True,
                    "nummod": True,
                    "advmod": True,
                    "punct": True,
                    "quantmod": True,
                    "acomp": True,
                    "pcomp": True,
                    "intj": True,
                    "relcl": True,
                    "npadvmod": True,
                    "case": True,
                    "attr": True,
                    "dep": True,
                    "appos": True,
                    "det": True,
                    "nmod": True,
                    "dobj": True,
                    "dative": True,
                    "pobj": True,
                    "iobj": True,
                    "expl": True,
                    "predet": True,
                    "preconj": True,
                    "oprd": True
                },
                "4": {
                    "ROOT": True
                }
            },
            "seed": 0,
            "features": "basic",
            "beam_width": 1
        }

        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))

        moves = ArcEager(vocab.strings, config_data['labels'])
        templates = get_templates(config_data['features'])

        model = Model(moves.n_moves, templates, path.join(data_dir, 'deps'))

        parser = Parser(vocab.strings, moves, model)
Example no. 12
    def test_load(self):
        data_dir = English.default_data_dir()

        if path.exists(path.join(data_dir, 'vocab')):
            vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
            tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab)
Example no. 13
def test_load(self):
    data_dir = English.default_data_dir()
    if path.exists(path.join(data_dir, 'vocab')):
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        tokenizer = Tokenizer.from_dir(vocab,
                                       path.join(data_dir, 'tokenizer'))
Example no. 14
def test_load(self):
    if path.exists(path.join(English.default_data_dir(), 'vocab')):
        vocab = Vocab.from_dir(
            path.join(English.default_data_dir(), 'vocab'))