Example #1
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
    vocab1 = Vocab(strings=strings)
    vocab2 = Vocab()
    vocab1[strings[0]].norm_ = lex_attr
    assert vocab1[strings[0]].norm_ == lex_attr
    assert vocab2[strings[0]].norm_ != lex_attr
    with make_tempdir() as d:
        file_path = d / "vocab"
        vocab1.to_disk(file_path)
        vocab2 = vocab2.from_disk(file_path)
    assert vocab2[strings[0]].norm_ == lex_attr
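This test comes from spaCy's test suite, so the strings and lex_attr arguments are supplied by pytest and make_tempdir is a helper from the suite's utilities, none of which appear on this page. A minimal sketch of definitions that would let the test above run on its own (the fixture values are assumptions, not the ones spaCy actually parametrizes):

import shutil
import tempfile
from contextlib import contextmanager
from pathlib import Path

import pytest
from spacy.vocab import Vocab


@pytest.fixture
def strings():
    # any list of words works; the real suite exercises several sets
    return ["apple", "orange"]


@pytest.fixture
def lex_attr():
    # value assigned to the lexeme's norm_ attribute in the test
    return "banana"


@contextmanager
def make_tempdir():
    # yield a temporary directory as a pathlib.Path and remove it afterwards
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        shutil.rmtree(str(d))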
def test_lookups_to_from_disk_via_vocab():
    table_name = "test"
    vocab = Vocab()
    vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
    assert table_name in vocab.lookups
    with make_tempdir() as tmpdir:
        vocab.to_disk(tmpdir)
        new_vocab = Vocab()
        new_vocab.from_disk(tmpdir)
    assert len(new_vocab.lookups) == len(vocab.lookups)
    assert table_name in new_vocab.lookups
    table = new_vocab.lookups.get_table(table_name)
    assert len(table) == 2
    assert table["hello"] == "world"
Example #4
def test_serialize_vocab_roundtrip_disk(strings1, strings2):
    vocab1 = Vocab(strings=strings1)
    vocab2 = Vocab(strings=strings2)
    with make_tempdir() as d:
        file_path1 = d / "vocab1"
        file_path2 = d / "vocab2"
        vocab1.to_disk(file_path1)
        vocab2.to_disk(file_path2)
        vocab1_d = Vocab().from_disk(file_path1)
        vocab2_d = Vocab().from_disk(file_path2)
        assert list(vocab1_d) == list(vocab1)
        assert list(vocab2_d) == list(vocab2)
        if strings1 == strings2:
            assert list(vocab1_d) == list(vocab2_d)
        else:
            assert list(vocab1_d) != list(vocab2_d)
Example #6
def test_serialize_vocab_roundtrip_disk(strings1, strings2):
    vocab1 = Vocab(strings=strings1)
    vocab2 = Vocab(strings=strings2)
    with make_tempdir() as d:
        file_path1 = d / "vocab1"
        file_path2 = d / "vocab2"
        vocab1.to_disk(file_path1)
        vocab2.to_disk(file_path2)
        vocab1_d = Vocab().from_disk(file_path1)
        vocab2_d = Vocab().from_disk(file_path2)
        # check strings rather than lexemes, which are only reloaded on demand
        assert strings1 == [s for s in vocab1_d.strings]
        assert strings2 == [s for s in vocab2_d.strings]
        if strings1 == strings2:
            assert [s for s in vocab1_d.strings] == [s for s in vocab2_d.strings]
        else:
            assert [s for s in vocab1_d.strings] != [s for s in vocab2_d.strings]
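The comment in this variant is the point: after from_disk the string table is restored eagerly, while lexemes are only recreated when they are accessed, so comparing vocab.strings is the reliable check. A small illustration of that on-demand behaviour, continuing with the names from the test above:

vocab1_d = Vocab().from_disk(file_path1)
assert strings1[0] in vocab1_d.strings     # the strings are back immediately
lex = vocab1_d[strings1[0]]                # indexing the vocab loads the lexeme on demand
assert lex.orth_ == strings1[0]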
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('input_path')
    ap.add_argument('output_path')
    ap.add_argument('--append', action='store_true')
    args = ap.parse_args()

    if args.append:
        logging.info('Loading existing model...')
        model = Vocab().from_disk(args.output_path)
    else:
        model = Vocab()

    logging.info('Loading vectors into spacy...')
    load_vectors_into_model(args.input_path, model)

    logging.info('Writing model to disk...')
    model.to_disk(args.output_path)

    logging.info('Done!')
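main() above depends on a load_vectors_into_model helper that is defined elsewhere in that project and not shown here. As an assumption about what such a helper does, here is a minimal sketch that reads a word2vec-style text file (one "word value value ..." line per word) and stores each vector on the Vocab with set_vector:

import numpy

def load_vectors_into_model(vectors_path, vocab):
    # hypothetical reimplementation; the project's real helper may differ
    with open(vectors_path, encoding="utf8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            if len(parts) == 2:
                continue               # skip an optional "row_count dimensions" header line
            word, values = parts[0], parts[1:]
            vocab.set_vector(word, numpy.asarray(values, dtype="float32"))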
Example #8
        "man",
        "woman",
        "cousin",
        "neice",
        "king",
        "queen",
        "dude",
        "guy",
        "gal",
        "fire",
        "dog",
        "cat",
        "mouse",
        "red",
        "bluee",
        "green",
        "yellow",
        "water",
        "person",
        "family",
        "brother",
        "sister",
    ]
    nlp = spacy.load("en_core_web_md")
    vec_data = {w: nlp(w).vector for w in words}
    vocab = Vocab(strings=words)
    for word, vector in vec_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab, meta={"lang": "en"})
    vocab.to_disk("custom_test_vocab")
        "neice",
        "king",
        "queen",
        "dude",
        "guy",
        "gal",
        "fire",
        "dog",
        "cat",
        "mouse",
        "red",
        "bluee",
        "green",
        "yellow",
        "water",
        "person",
        "family",
        "brother",
        "sister",
    ]
    nlp = spacy.load("en_core_web_md")
    vec_data = {w: nlp(w).vector for w in words}
    vocab = Vocab(strings=words)
    for word, vector in vec_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab, meta={"lang": "en"})
    vocab.to_disk("tests/custom_test_vocab")
    print("local vocab saved for spacy")
    nlp.to_disk("tests/custom_test_lang")
    print("local nlp saved for spacy")