def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
    """A norm lex attr set on one vocab survives a to_disk/from_disk roundtrip."""
    source = Vocab(strings=strings)
    target = Vocab()
    # Customize the first string's norm only on the source vocab.
    source[strings[0]].norm_ = lex_attr
    assert source[strings[0]].norm_ == lex_attr
    assert target[strings[0]].norm_ != lex_attr
    with make_tempdir() as d:
        vocab_path = d / "vocab"
        source.to_disk(vocab_path)
        target = target.from_disk(vocab_path)
        assert target[strings[0]].norm_ == lex_attr
def test_lookups_to_from_disk_via_vocab():
    """Lookup tables attached to a vocab are included in Vocab disk serialization."""
    table_name = "test"
    vocab = Vocab()
    vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
    assert table_name in vocab.lookups
    with make_tempdir() as tmpdir:
        vocab.to_disk(tmpdir)
        reloaded = Vocab()
        reloaded.from_disk(tmpdir)
        # The reloaded vocab carries the same tables with the same contents.
        assert len(reloaded.lookups) == len(vocab.lookups)
        assert table_name in reloaded.lookups
        table = reloaded.lookups.get_table(table_name)
        assert len(table) == 2
        assert table["hello"] == "world"
def test_serialize_vocab_roundtrip_disk(strings1, strings2):
    """Vocab lexeme contents survive a to_disk/from_disk roundtrip.

    NOTE(review): another definition in this file reuses this test name and
    will shadow this one at import time — confirm which version is intended.
    """
    vocab1 = Vocab(strings=strings1)
    vocab2 =Vocab(strings=strings2)
    with make_tempdir() as d:
        path1 = d / "vocab1"
        path2 = d / "vocab2"
        vocab1.to_disk(path1)
        vocab2.to_disk(path2)
        reloaded1 = Vocab().from_disk(path1)
        reloaded2 = Vocab().from_disk(path2)
        # Each reloaded vocab matches its source ...
        assert list(reloaded1) == list(vocab1)
        assert list(reloaded2) == list(vocab2)
        # ... and the two reloaded vocabs agree exactly when the inputs did.
        if strings1 == strings2:
            assert list(reloaded1) == list(reloaded2)
        else:
            assert list(reloaded1) != list(reloaded2)
def test_serialize_vocab_roundtrip_disk(strings1, strings2):
    """The string store survives a to_disk/from_disk roundtrip.

    Checks strings rather than lexemes, which are only reloaded on demand.
    NOTE(review): this redefines test_serialize_vocab_roundtrip_disk and
    shadows any earlier version — confirm the duplication is intentional.
    """
    v1 = Vocab(strings=strings1)
    v2 = Vocab(strings=strings2)
    with make_tempdir() as d:
        path1 = d / "vocab1"
        path2 = d / "vocab2"
        v1.to_disk(path1)
        v2.to_disk(path2)
        loaded1 = Vocab().from_disk(path1)
        loaded2 = Vocab().from_disk(path2)
        assert strings1 == list(loaded1.strings)
        assert strings2 == list(loaded2.strings)
        if strings1 == strings2:
            assert list(loaded1.strings) == list(loaded2.strings)
        else:
            assert list(loaded1.strings) != list(loaded2.strings)
def main():
    """CLI entry point: load word vectors into a vocab and write it to disk.

    With --append, the vocab previously written to output_path is loaded
    first so the new vectors extend it instead of replacing it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_path')
    parser.add_argument('output_path')
    parser.add_argument('--append', action='store_true')
    args = parser.parse_args()

    model = Vocab()
    if args.append:
        logging.info('Loading existing model...')
        model = model.from_disk(args.output_path)

    logging.info('Loading vectors into spacy...')
    load_vectors_into_model(args.input_path, model)

    logging.info('Writing model to disk...')
    model.to_disk(args.output_path)
    logging.info('Done!')
"man", "woman", "cousin", "neice", "king", "queen", "dude", "guy", "gal", "fire", "dog", "cat", "mouse", "red", "bluee", "green", "yellow", "water", "person", "family", "brother", "sister", ] nlp = spacy.load("en_core_web_md") vec_data = {w: nlp(w).vector for w in words} vocab = Vocab(strings=words) for word, vector in vec_data.items(): vocab.set_vector(word, vector) nlp = Language(vocab=vocab, meta={"lang": "en"}) vocab.to_disk("custom_test_vocab")
"neice", "king", "queen", "dude", "guy", "gal", "fire", "dog", "cat", "mouse", "red", "bluee", "green", "yellow", "water", "person", "family", "brother", "sister", ] nlp = spacy.load("en_core_web_md") vec_data = {w: nlp(w).vector for w in words} vocab = Vocab(strings=words) for word, vector in vec_data.items(): vocab.set_vector(word, vector) nlp = Language(vocab=vocab, meta={"lang": "en"}) vocab.to_disk("tests/custom_test_vocab") print("local vocab saved for spacy") nlp.to_disk("tests/custom_test_lang") print("local nlp saved for spacy")