def setup_vocab(src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()
    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    for word in clusters:
        if word not in probs:
            probs[word] = -17.0
    lexicon = []
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or float(prob) >= -17:
            entry['prob'] = float(prob)
            cluster = clusters.get(word, '0')
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry['cluster'] = int(cluster[::-1], 2)
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
def setup_vocab(src_dir, dst_dir): if not dst_dir.exists(): dst_dir.mkdir() vectors_src = src_dir / "vectors.tgz" if vectors_src.exists(): write_binary_vectors(str(vectors_src), str(dst_dir / "vec.bin")) else: print("Warning: Word vectors file not found") vocab = Vocab(data_dir=None, get_lex_props=get_lex_props) clusters = _read_clusters(src_dir / "clusters.txt") probs = _read_probs(src_dir / "words.sgt.prob") if not probs: min_prob = 0.0 else: min_prob = min(probs.values()) for word in clusters: if word not in probs: probs[word] = min_prob lexicon = [] for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): entry = get_lex_props(word) if word in clusters or float(prob) >= -17: entry["prob"] = float(prob) cluster = clusters.get(word, "0") # Decode as a little-endian string, so that we can do & 15 to get # the first 4 bits. See _parse_features.pyx entry["cluster"] = int(cluster[::-1], 2) orth_senses = set() lemmas = [] vocab[word] = entry vocab.dump(str(dst_dir / "lexemes.bin")) vocab.strings.dump(str(dst_dir / "strings.txt"))
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
    vocab1 = Vocab(strings=strings)
    vocab2 = Vocab()
    vocab1[strings[0]].norm_ = lex_attr
    assert vocab1[strings[0]].norm_ == lex_attr
    assert vocab2[strings[0]].norm_ != lex_attr
    vocab2 = vocab2.from_bytes(vocab1.to_bytes())
    assert vocab2[strings[0]].norm_ == lex_attr
def test_vocab_add_vector():
    vocab = Vocab()
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    cat = vocab["cat"]
    assert list(cat.vector) == [1.0, 1.0, 1.0]
    dog = vocab["dog"]
    assert list(dog.vector) == [2.0, 2.0, 2.0]
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
    vocab1 = Vocab(strings=strings)
    vocab2 = Vocab()
    vocab1[strings[0]].norm_ = lex_attr
    assert vocab1[strings[0]].norm_ == lex_attr
    assert vocab2[strings[0]].norm_ != lex_attr
    with make_tempdir() as d:
        file_path = d / "vocab"
        vocab1.to_disk(file_path)
        vocab2 = vocab2.from_disk(file_path)
    assert vocab2[strings[0]].norm_ == lex_attr
def test_pickle_vocab(text1, text2):
    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
    vocab.set_vector("dog", numpy.ones((5,), dtype="f"))
    lex1 = vocab[text1]
    lex2 = vocab[text2]
    assert lex1.norm_ == text1[:-1]
    assert lex2.norm_ == text2[:-1]
    data = srsly.pickle_dumps(vocab)
    unpickled = srsly.pickle_loads(data)
    assert unpickled[text1].orth == lex1.orth
    assert unpickled[text2].orth == lex2.orth
    assert unpickled[text1].norm == lex1.norm
    assert unpickled[text2].norm == lex2.norm
    assert unpickled[text1].norm != unpickled[text2].norm
    assert unpickled.vectors is not None
    assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0]
def load_vocab(path):
    path = Path(path)
    if not path.exists():
        raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
    if not path.is_dir():
        raise IOError("Cannot load vocab from %s\nNot a directory" % path)
    return Vocab.load(path)
def setup_vocab(lex_attr_getters, tag_map, src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()
    vectors_src = src_dir / 'vectors.bz2'
    if vectors_src.exists():
        write_binary_vectors(vectors_src.as_posix(), (dst_dir / 'vec.bin').as_posix())
    else:
        print("Warning: Word vectors file not found")
    vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=tag_map)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
    if not probs:
        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
    if not probs:
        oov_prob = -20
    else:
        oov_prob = min(probs.values())
    for word in clusters:
        if word not in probs:
            probs[word] = oov_prob
    lexicon = []
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
        # First encode the strings into the StringStore. This way, we can map
        # the orth IDs to frequency ranks
        orth = vocab.strings[word]
    # Now actually load the vocab
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        if word in clusters:
            lexeme.cluster = int(clusters[word][::-1], 2)
        else:
            lexeme.cluster = 0
    vocab.dump((dst_dir / 'lexemes.bin').as_posix())
    with (dst_dir / 'strings.json').open('w') as file_:
        vocab.strings.dump(file_)
    with (dst_dir / 'oov_prob').open('w') as file_:
        file_.write('%f' % oov_prob)
def test_load_careful(self):
    config_data = {"labels": {"0": {"": True}, "1": {"": True}, "2": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "dobj": True, "neg": True, "csubjpass": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "relcl": True, "quantmod": True, "acomp": True, "compound": True, "pcomp": True, "intj": True, "poss": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "amod": True, "dative": True, "pobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True, "acl": True}, "3": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "acl": True, "poss": True, "neg": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "amod": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "quantmod": True, "acomp": True, "pcomp": True, "intj": True, "relcl": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "dobj": True, "dative": True, "pobj": True, "iobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True}, "4": {"ROOT": True}}, "seed": 0, "features": "basic", "beam_width": 1}
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    moves = ArcEager(vocab.strings, config_data['labels'])
    templates = get_templates(config_data['features'])
    model = Model(moves.n_moves, templates, path.join(data_dir, 'deps'))
    parser = Parser(vocab.strings, moves, model)
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    vocab1 = Vocab(strings=strings1)
    vocab2 = Vocab(strings=strings2)
    vocab1_b = vocab1.to_bytes()
    vocab2_b = vocab2.to_bytes()
    if strings1 == strings2:
        assert vocab1_b == vocab2_b
    else:
        assert vocab1_b != vocab2_b
    vocab1 = vocab1.from_bytes(vocab1_b)
    assert vocab1.to_bytes() == vocab1_b
    new_vocab1 = Vocab().from_bytes(vocab1_b)
    assert new_vocab1.to_bytes() == vocab1_b
    assert len(new_vocab1) == len(strings1)
    assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1)
def setup_vocab(src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()
    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    senses = _read_senses(src_dir / 'supersenses.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    for word in set(clusters).union(set(senses)):
        if word not in probs:
            probs[word] = -17.0
    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
    lexicon = []
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or float(prob) >= -17:
            entry['prob'] = float(prob)
            cluster = clusters.get(word, '0')
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry['cluster'] = int(cluster[::-1], 2)
            orth_senses = set()
            lemmas = []
            for pos in [NOUN, VERB, ADJ]:
                for lemma in lemmatizer(word.lower(), pos):
                    lemmas.append(lemma)
                    orth_senses.update(senses[lemma][pos])
            if word.lower() == 'dogging':
                print(word)
                print(lemmas)
                print([spacy.senses.STRINGS[si] for si in orth_senses])
            entry['senses'] = list(sorted(orth_senses))
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()
    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    else:
        print("Warning: Word vectors file not found")
    vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
    if not probs:
        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
    if not probs:
        oov_prob = -20
    else:
        oov_prob = min(probs.values())
    for word in clusters:
        if word not in probs:
            probs[word] = oov_prob
    lexicon = []
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        if word in clusters:
            lexeme.cluster = int(clusters[word][::-1], 2)
        else:
            lexeme.cluster = 0
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
    with (dst_dir / 'oov_prob').open('w') as file_:
        file_.write('%f' % oov_prob)
def test_doc_token_api_vectors():
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector("apples", vector=numpy.asarray([0.0, 2.0], dtype="f"))
    vocab.set_vector("oranges", vector=numpy.asarray([0.0, 1.0], dtype="f"))
    doc = Doc(vocab, words=["apples", "oranges", "oov"])
    assert doc.has_vector
    assert doc[0].has_vector
    assert doc[1].has_vector
    assert not doc[2].has_vector
    apples_norm = (0 * 0 + 2 * 2) ** 0.5
    oranges_norm = (0 * 0 + 1 * 1) ** 0.5
    cosine = ((0 * 0) + (2 * 1)) / (apples_norm * oranges_norm)
    assert doc[0].similarity(doc[1]) == cosine
def test_Example_from_dict_with_empty_entities():
    annots = {
        "words": ["I", "like", "New", "York", "and", "Berlin", "."],
        "entities": [],
    }
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    # entities as empty list sets everything to O
    assert example.reference.has_annotation("ENT_IOB")
    assert len(list(example.reference.ents)) == 0
    assert all(token.ent_iob_ == "O" for token in example.reference)
    # various unset/missing entities leaves entities unset
    annots["entities"] = None
    example = Example.from_dict(predicted, annots)
    assert not example.reference.has_annotation("ENT_IOB")
    annots.pop("entities", None)
    example = Example.from_dict(predicted, annots)
    assert not example.reference.has_annotation("ENT_IOB")
def test_issue1799(): """Test sentence boundaries are deserialized correctly, even for non-projective sentences.""" heads_deps = numpy.asarray( [ [1, 397], [4, 436], [2, 426], [1, 402], [0, 8206900633647566924], [18446744073709551615, 440], [18446744073709551614, 442], ], dtype="uint64", ) doc = Doc(Vocab(), words="Just what I was looking for .".split()) doc.vocab.strings.add("ROOT") doc = doc.from_array([HEAD, DEP], heads_deps) assert len(list(doc.sents)) == 1
def case_spacy(spacy_load_mock):
    import re
    data = re.sub(" +", "\t", """
# sent_id = testtext.1
1 Dies Dies PRON PDS _ 2 sb _ _
2 ist sein AUX VAFIN _ 0 ROOT _ _
3 ein einen DET ART _ 4 nk _ _
4 Test Test NOUN NN _ 2 pd _ _
5 . . PUNCT $. _ 2 punct _ _
""")
    result = conllu.parse(data)
    words = ["Dies", "ist", "ein", "Test", "."]
    vocab = Vocab(strings=words)
    heads = [1, 1, 3, 1, 1]
    tags = ["PDS", "VAFIN", "ART", "NN", "$."]
    pos = ["PRON", "AUX", "DET", "NOUN", "PUNCT"]
    lemmas = ["Dies", "sein", "einen", "Test", "."]
    deps = ["sb", "ROOT", "nk", "pd", "punct"]
    doc = Doc(vocab, words, pos=pos, tags=tags, lemmas=lemmas, deps=deps, heads=heads)
    spacy_annotator = spacy_load_mock.return_value
    spacy_annotator.return_value = doc
    annotator = wikiannotator.Annotator.createAnnotator('spacy', {'model_name': 'model_name'})
    spacy_load_mock.assert_called_once_with('model_name')
    return (
        annotator,
        wikiannotator.SpacyAnnotator,
        {
            'text': 'Dies ist ein Test.',
            'textname': 'testtext',
            'parse': result,
        },
    )
def test_issue7056():
    """Test that the Unshift transition works properly, and doesn't cause
    sentence segmentation errors."""
    vocab = Vocab()
    ae = ArcEager(
        vocab.strings,
        ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]))
    doc = Doc(vocab, words="Severe pain , after trauma".split())
    state = ae.init_batch([doc])[0]
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "L-amod")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "R-pobj")
    ae.apply_transition(state, "D")
    ae.apply_transition(state, "D")
    ae.apply_transition(state, "D")
    assert not state.eol()
def test_issue850_basic():
    """Test Matcher matches with '*' operator and Boolean flag"""
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
    matcher.add("FarAway", None, pattern)
    doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
    match = matcher(doc)
    assert len(match) == 1
    ent_id, start, end = match[0]
    assert start == 0
    assert end == 4
def init_vocab():
    return Vocab(
        lex_attr_getters={
            LOWER: lambda string: string.lower(),
            SHAPE: orth_funcs.word_shape,
            PREFIX: lambda string: string[0],
            SUFFIX: lambda string: string[-3:],
            CLUSTER: lambda string: 0,
            IS_ALPHA: orth_funcs.is_alpha,
            IS_ASCII: orth_funcs.is_ascii,
            IS_DIGIT: lambda string: string.isdigit(),
            IS_LOWER: orth_funcs.is_lower,
            IS_PUNCT: orth_funcs.is_punct,
            IS_SPACE: lambda string: string.isspace(),
            IS_TITLE: orth_funcs.is_title,
            IS_UPPER: orth_funcs.is_upper,
            IS_STOP: lambda string: False,
            IS_OOV: lambda string: True,
        })
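# A minimal usage sketch (hypothetical helper, not part of the original module):
# looking a string up on the Vocab returned by init_vocab() materializes a Lexeme
# whose attributes are computed on the fly by the lex_attr_getters defined above.
def demo_lexeme_attrs(word="Apples"):
    vocab = init_vocab()
    lex = vocab[word]
    # lower_/prefix_/suffix_ are filled by the LOWER/PREFIX/SUFFIX getters,
    # is_digit by the IS_DIGIT getter.
    return lex.lower_, lex.prefix_, lex.suffix_, lex.is_digit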
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Optional[Language]):
    """Initialize the pipe for training, using data examples if available.

    get_examples (Callable[[], Iterable[Example]]): Optional function that
        returns gold-standard Example objects.
    nlp (Language): The current nlp object.

    DOCS: https://nightly.spacy.io/api/transformer#initialize
    """
    validate_get_examples(get_examples, "Transformer.initialize")
    docs = [Doc(Vocab(), words=["hello"])]
    self.model.initialize(X=docs)
    if nlp is not None:
        for i, (name1, proc1) in enumerate(nlp.pipeline):
            if proc1 is self:
                for name2, proc2 in nlp.pipeline[i:]:
                    self.find_listeners(proc2)
                break
def test_vocab_prune_vectors():
    vocab = Vocab(vectors_name="test_vocab_prune_vectors")
    _ = vocab["cat"]  # noqa: F841
    _ = vocab["dog"]  # noqa: F841
    _ = vocab["kitten"]  # noqa: F841
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = [1.0, 1.2, 1.1]
    data[1] = [0.3, 1.3, 1.0]
    data[2] = [0.9, 1.22, 1.05]
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    vocab.set_vector("kitten", data[2])
    remap = vocab.prune_vectors(2, batch_size=2)
    assert list(remap.keys()) == ["kitten"]
    neighbour, similarity = list(remap.values())[0]
    assert neighbour == "cat", remap
    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
def test_vocab_prune_vectors():
    vocab = Vocab()
    _ = vocab["cat"]  # noqa: F841
    _ = vocab["dog"]  # noqa: F841
    _ = vocab["kitten"]  # noqa: F841
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    data[2] = 1.1
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    vocab.set_vector("kitten", data[2])
    remap = vocab.prune_vectors(2)
    assert list(remap.keys()) == ["kitten"]
    neighbour, similarity = list(remap.values())[0]
    assert neighbour == "cat", remap
    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
def test_Example_missing_heads():
    vocab = Vocab()
    words = ["I", "like", "London", "and", "Berlin", "."]
    deps = ["nsubj", "ROOT", "dobj", None, "conj", "punct"]
    heads = [1, 1, 1, None, 2, 1]
    annots = {"words": words, "heads": heads, "deps": deps}
    predicted = Doc(vocab, words=words)
    example = Example.from_dict(predicted, annots)
    parsed_heads = [t.head.i for t in example.reference]
    assert parsed_heads[0] == heads[0]
    assert parsed_heads[1] == heads[1]
    assert parsed_heads[2] == heads[2]
    assert parsed_heads[4] == heads[4]
    assert parsed_heads[5] == heads[5]
    expected = [True, True, True, False, True, True]
    assert [t.has_head() for t in example.reference] == expected
    # Ensure that the missing head doesn't create an artificial new sentence start
    expected = [True, False, False, False, False, False]
    assert example.get_aligned_sent_starts() == expected
def main(output_dir):
    ensure_dir(output_dir)
    ensure_dir(output_dir, "pos")
    ensure_dir(output_dir, "vocab")
    vocab = Vocab(tag_map=TAG_MAP)
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger.blank(vocab, Tagger.default_templates())
    for i in range(5):
        for words, tags in DATA:
            tokens = tokenizer.tokens_from_list(words)
            tagger.train(tokens, tags)
        random.shuffle(DATA)
    tagger.model.end_training()
    tagger.model.dump(path.join(output_dir, 'pos', 'model'))
    with io.open(path.join(output_dir, 'vocab', 'strings.json'), 'w', encoding='utf8') as file_:
        tagger.vocab.strings.dump(file_)
def __call__(self, current_file: ProtocolFile) -> Doc:
    with open(self.path) as file:
        current_transcription = file.read().split('\n')
    tokens, attributes = [], []
    for line in current_transcription:
        if line == '':
            continue
        _, speaker, start, end, text, confidence = line.split()
        start, end, confidence = map(float, (start, end, confidence))
        tokens.append(text)
        attributes.append((speaker, start, end, confidence))
    current_transcription = Doc(Vocab(), tokens)
    for token, (speaker, time_start, time_end, alignment_confidence) in zip(current_transcription, attributes):
        token._.speaker = speaker
        token._.time_start = time_start
        token._.time_end = time_end
        token._.alignment_confidence = alignment_confidence
    return current_transcription
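# The loader above (and the speaker-only loader further down) assigns custom
# attributes via token._, which only works if the extensions are registered
# beforehand. A minimal registration sketch (assumed setup; the defaults here
# are an assumption, the original registration code is not shown):
from spacy.tokens import Token

for ext_name in ("speaker", "time_start", "time_end", "alignment_confidence"):
    if not Token.has_extension(ext_name):
        Token.set_extension(ext_name, default=None)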
def test_empty_doc():
    width = 128
    embed_size = 2000
    vocab = Vocab()
    doc = Doc(vocab, words=[])
    tok2vec = build_Tok2Vec_model(
        MultiHashEmbed(
            width=width,
            rows=[embed_size, embed_size, embed_size, embed_size],
            include_static_vectors=False,
            attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
        ),
        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
    )
    tok2vec.initialize()
    vectors, backprop = tok2vec.begin_update([doc])
    assert len(vectors) == 1
    assert vectors[0].shape == (0, width)
def test_serialize_vocab_roundtrip_disk(strings1, strings2):
    vocab1 = Vocab(strings=strings1)
    vocab2 = Vocab(strings=strings2)
    with make_tempdir() as d:
        file_path1 = d / "vocab1"
        file_path2 = d / "vocab2"
        vocab1.to_disk(file_path1)
        vocab2.to_disk(file_path2)
        vocab1_d = Vocab().from_disk(file_path1)
        vocab2_d = Vocab().from_disk(file_path2)
        assert list(vocab1_d) == list(vocab1)
        assert list(vocab2_d) == list(vocab2)
        if strings1 == strings2:
            assert list(vocab1_d) == list(vocab2_d)
        else:
            assert list(vocab1_d) != list(vocab2_d)
def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab
    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)
        vocab2 = Vocab().from_disk(vocab_dir)
        print("lang", vocab2.lang)
        nlp2 = spacy.blank("en", vocab=vocab2)
        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = spacy.load(nlp_dir)
        assert nlp3.lang == "en"
def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
    # adding aliases
    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
    adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
    candidates = mykb.get_alias_candidates("adam")
    assert len(candidates) == 1
    assert candidates[0].entity == q2_hash
    assert candidates[0].entity_ == "Q2"
    assert candidates[0].alias == adam_hash
    assert candidates[0].alias_ == "adam"
    with make_tempdir() as d:
        mykb.to_disk(d / "kb")
        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
        kb_new_vocab.from_disk(d / "kb")
        candidates = kb_new_vocab.get_alias_candidates("adam")
        assert len(candidates) == 1
        assert candidates[0].entity == q2_hash
        assert candidates[0].entity_ == "Q2"
        assert candidates[0].alias == adam_hash
        assert candidates[0].alias_ == "adam"
        assert kb_new_vocab.get_vector("Q2") == [2]
        assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
def __init__(self):
    self.nlp = spacy.load("en_core_sci_lg", disable=["tagger"])
    self.nlp.max_length = 2000000
    # We also need to detect language, or else we'll be parsing non-english text
    # as if it were English.
    self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    # Add the abbreviation pipe to the spacy pipeline. Only need to run this once.
    abbreviation_pipe = AbbreviationDetector(self.nlp)
    self.nlp.add_pipe(abbreviation_pipe)
    # Our linker will look up named entities/concepts in the UMLS graph and normalize
    # the data for us.
    self.linker = UmlsEntityLinker(resolve_abbreviations=True)
    self.nlp.add_pipe(self.linker)
    new_vector = self.nlp(
        """Positive-sense single‐stranded ribonucleic acid virus, subgenus
        sarbecovirus of the genus Betacoronavirus. Also known as severe acute
        respiratory syndrome coronavirus 2, also known by 2019 novel coronavirus.
        It is contagious in humans and is the cause of the ongoing pandemic of
        coronavirus disease. Coronavirus disease 2019 is a zoonotic infectious
        disease.""").vector
    vector_data = {
        "COVID-19": new_vector,
        "2019-nCoV": new_vector,
        "SARS-CoV-2": new_vector,
    }
    vocab = Vocab()
    for word, vector in vector_data.items():
        self.nlp.vocab.set_vector(word, vector)
    return
def test_graph_walk():
    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    graph = Graph(
        doc,
        name="hello",
        nodes=[(0,), (1,), (2,), (3,)],
        edges=[(0, 1), (0, 2), (0, 3), (3, 0)],
        labels=None,
        weights=None,
    )
    node0, node1, node2, node3 = list(graph.nodes)
    assert [tuple(h) for h in node0.heads()] == [(3,)]
    assert [tuple(h) for h in node1.heads()] == [(0,)]
    assert [tuple(h) for h in node0.walk_heads()] == [(3,), (0,)]
    assert [tuple(h) for h in node1.walk_heads()] == [(0,), (3,), (0,)]
    assert [tuple(h) for h in node2.walk_heads()] == [(0,), (3,), (0,)]
    assert [tuple(h) for h in node3.walk_heads()] == [(0,), (3,)]
    assert [tuple(t) for t in node0.walk_tails()] == [(1,), (2,), (3,), (0,)]
    assert [tuple(t) for t in node1.walk_tails()] == []
    assert [tuple(t) for t in node2.walk_tails()] == []
    assert [tuple(t) for t in node3.walk_tails()] == [(0,), (1,), (2,), (3,)]
def test_extract_tokens(task_head, allennlp_tokens):
    tokenizer = Tokenizer(Vocab())
    input_tokens = list(tokenizer("test this sentence."))
    if allennlp_tokens:
        input_tokens = [spacy_to_allennlp_token(tok) for tok in input_tokens]
    tf = TextField(input_tokens, None)
    instance = Instance({"test": tf})
    tokens = task_head._extract_tokens(instance)
    assert all([isinstance(tok, Token) for tok in tokens])
    assert all(itok.text == otok.text for itok, otok in zip(input_tokens, tokens))
    assert all(itok.idx == otok.start for itok, otok in zip(input_tokens, tokens))
    if allennlp_tokens:
        assert all(itok.idx_end == otok.end for itok, otok in zip(input_tokens, tokens))
    else:
        assert all(itok.idx + len(itok.text) == otok.end for itok, otok in zip(input_tokens, tokens))
    assert all([tok.field == "test" for tok in tokens])
def read_spacy_docs(filepath, vocab_filepath):
    """
    Reads serialized spacy docs from a file into memory.

    Parameters
    ----------
    filepath: str
        File path to serialized spacy docs
    vocab_filepath: str
        File path to the serialized spacy Vocab the docs were created with

    Returns
    -------
    list of spacy.tokens.doc.Doc
        List of spacy Docs loaded from file
    """
    from spacy.tokens import DocBin
    from spacy.vocab import Vocab

    with open(vocab_filepath, 'rb') as f:
        vocab = Vocab().from_bytes(f.read())
    with open(filepath, 'rb') as f:
        data = f.read()
    doc_bin = DocBin().from_bytes(data)
    docs = list(doc_bin.get_docs(vocab))
    return docs
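# A minimal write-side sketch to complement read_spacy_docs above (hypothetical
# helper; the original serialization code is not shown here). It writes the two
# files the reader expects: a DocBin byte blob and the shared Vocab as bytes.
def write_spacy_docs(docs, filepath, vocab_filepath):
    from spacy.tokens import DocBin

    doc_bin = DocBin(store_user_data=True)
    for doc in docs:
        doc_bin.add(doc)
    with open(filepath, 'wb') as f:
        f.write(doc_bin.to_bytes())
    # assumes a non-empty list of docs sharing one Vocab
    with open(vocab_filepath, 'wb') as f:
        f.write(docs[0].vocab.to_bytes())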
def __call__(self, current_file: ProtocolFile) -> Doc:
    with open(self.path) as file:
        current_transcription = file.read().split('\n')
    tokens, speakers = [], []
    for line in current_transcription:
        # line should not be empty
        if line == '':
            continue
        line = line.split()
        # there should be at least one speaker and one token per line
        if len(line) < 2:
            continue
        speaker = line[0]
        for token in line[1:]:
            speakers.append(speaker)
            tokens.append(token)
    current_transcription = Doc(Vocab(), tokens)
    for token, speaker in zip(current_transcription, speakers):
        token._.speaker = speaker
    return current_transcription
def _make_task_prediction(
    self,
    single_forward_output: Dict,
    instance: Instance,
) -> TokenClassificationPrediction:
    # The dims are: top_k, tags
    tags: List[List[str]] = self._make_tags(single_forward_output["viterbi_paths"])
    # construct a spacy Doc
    pre_tokenized = not isinstance(single_forward_output["raw_text"], str)
    if pre_tokenized:
        # compose doc from tokens
        doc = Doc(Vocab(), words=single_forward_output["raw_text"])
    else:
        doc = self.backbone.tokenizer.nlp(single_forward_output["raw_text"])
    return TokenClassificationPrediction(
        tags=tags,
        scores=[score for tags, score in single_forward_output["viterbi_paths"]],
        entities=self._make_entities(doc, tags, pre_tokenized),
    )
def test_oracle_bad_tokenization(vocab, arc_eager):
    words_deps_heads = """
    [catalase] dep is
    : punct is
    that nsubj is
    is root is
    bad comp is
    """
    gold_words = []
    gold_deps = []
    gold_heads = []
    for line in words_deps_heads.strip().split("\n"):
        line = line.strip()
        if not line:
            continue
        word, dep, head = line.split()
        gold_words.append(word)
        gold_deps.append(dep)
        gold_heads.append(head)
    gold_heads = [gold_words.index(head) for head in gold_heads]
    for dep in gold_deps:
        arc_eager.add_action(2, dep)  # Left
        arc_eager.add_action(3, dep)  # Right
    reference = Doc(Vocab(), words=gold_words, deps=gold_deps, heads=gold_heads)
    predicted = Doc(reference.vocab, words=["[", "catalase", "]", ":", "that", "is", "bad"])
    example = Example(predicted=predicted, reference=reference)
    ae_oracle_actions = arc_eager.get_oracle_sequence(example, _debug=False)
    ae_oracle_actions = [arc_eager.get_class_name(i) for i in ae_oracle_actions]
    assert ae_oracle_actions
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('input_path')
    ap.add_argument('output_path')
    ap.add_argument('--append', action='store_true')
    args = ap.parse_args()
    if args.append:
        logging.info('Loading existing model...')
        model = Vocab().from_disk(args.output_path)
    else:
        model = Vocab()
    logging.info('Loading vectors into spacy...')
    load_vectors_into_model(args.input_path, model)
    logging.info('Writing model to disk...')
    model.to_disk(args.output_path)
    logging.info('Done!')
import jsonlines
import spacy
import spacy.language
from spacy.tokens import Doc
from spacy.scorer import Scorer
from spacy.vocab import Vocab

# global variables
vocab = Vocab()
scorer = Scorer()

# path to jsonl overlap files
path_coco = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_coco.jsonl"
path_graf = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_leo.jsonl"
path_hoff = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_jona.jsonl"
path_jthn = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_jonathan.jsonl"

# functions
def jsonl_to_list(path):
    """takes path to jsonl file and returns list of dicts"""
    with jsonlines.open(path) as reader:
        list_of_dicts = list(reader)
    print("----")
    print(list_of_dicts)
    return list_of_dicts

def create_ent_set(span_list):
def test_deserialize_vocab_seen_entries(strings, lex_attr):
    # Reported in #2153
    vocab = Vocab(strings=strings)
    length = len(vocab)
    vocab.from_bytes(vocab.to_bytes())
    assert len(vocab) == length
def test_doc_api_has_vector():
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector("kitten", vector=numpy.asarray([0.0, 2.0], dtype="f"))
    doc = Doc(vocab, words=["kitten"])
    assert doc.has_vector
def get_random_doc(n_words):
    vocab = Vocab()
    # Make the words numbers, so that they're easy to track.
    numbers = [str(i) for i in range(0, n_words)]
    return Doc(vocab, words=numbers)
def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
    nlp = English()
    nlp_plain = English()
    # load both vec and hashvec tables
    with make_tempdir() as tmpdir:
        p = tmpdir / "test.hashvec"
        with open(p, "w") as fileh:
            fileh.write(floret_vectors_hashvec_str)
        convert_vectors(nlp, p, truncate=0, prune=-1, mode="floret")
        p = tmpdir / "test.vec"
        with open(p, "w") as fileh:
            fileh.write(floret_vectors_vec_str)
        convert_vectors(nlp_plain, p, truncate=0, prune=-1)

    word = "der"
    # ngrams: full padded word + padded 2-grams + padded 3-grams
    ngrams = nlp.vocab.vectors._get_ngrams(word)
    assert ngrams == ["<der>", "<d", "de", "er", "r>", "<de", "der", "er>"]
    # rows: 2 rows per ngram
    rows = OPS.xp.asarray(
        [
            h % nlp.vocab.vectors.shape[0]
            for ngram in ngrams
            for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
        ],
        dtype="uint32",
    )
    assert_equal(
        OPS.to_numpy(rows),
        numpy.asarray([5, 6, 7, 5, 8, 2, 8, 9, 3, 3, 4, 6, 7, 3, 0, 2]),
    )
    assert len(rows) == len(ngrams) * nlp.vocab.vectors.hash_count

    # all vectors are equivalent for plain static table vs. hash ngrams
    for word in nlp_plain.vocab.vectors:
        word = nlp_plain.vocab.strings.as_string(word)
        assert_almost_equal(nlp.vocab[word].vector, nlp_plain.vocab[word].vector, decimal=3)
    # every word has a vector
    assert nlp.vocab[word * 5].has_vector
    # n_keys is -1 for floret
    assert nlp_plain.vocab.vectors.n_keys > 0
    assert nlp.vocab.vectors.n_keys == -1
    # check that single and batched vector lookups are identical
    words = [s for s in nlp_plain.vocab.vectors]
    single_vecs = OPS.to_numpy(OPS.asarray([nlp.vocab[word].vector for word in words]))
    batch_vecs = OPS.to_numpy(nlp.vocab.vectors.get_batch(words))
    assert_equal(single_vecs, batch_vecs)
    # an empty key returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab[""].vector),
        numpy.zeros((nlp.vocab.vectors.shape[0],)),
    )
    # an empty batch returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
        numpy.zeros((1, nlp.vocab.vectors.shape[0])),
    )
    # an empty key within a batch returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
        numpy.zeros((nlp.vocab.vectors.shape[0],)),
    )
    # the loaded ngram vector table cannot be modified
    # except for clear: warning, then return without modifications
    vector = list(range(nlp.vocab.vectors.shape[1]))
    orig_bytes = nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.set_vector("the", vector)
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab[word].vector = vector
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.vectors.add("the", row=6)
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.vectors.resize(shape=(100, 10))
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.raises(ValueError):
        nlp.vocab.vectors.clear()
    # data and settings are serialized correctly
    with make_tempdir() as d:
        nlp.vocab.to_disk(d)
        vocab_r = Vocab()
        vocab_r.from_disk(d)
        assert nlp.vocab.vectors.to_bytes() == vocab_r.vectors.to_bytes()
        assert_equal(OPS.to_numpy(nlp.vocab.vectors.data), OPS.to_numpy(vocab_r.vectors.data))
        assert_equal(nlp.vocab.vectors._get_cfg(), vocab_r.vectors._get_cfg())
        assert_almost_equal(
            OPS.to_numpy(nlp.vocab[word].vector),
            OPS.to_numpy(vocab_r[word].vector),
            decimal=6,
        )
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
def test_issue1807(): """Test vocab.set_vector also adds the word to the vocab.""" vocab = Vocab() assert "hello" not in vocab vocab.set_vector("hello", numpy.ones((50,), dtype="f")) assert "hello" in vocab
def test_load(self): data_dir = English.default_data_dir() vocab = Vocab.from_dir(path.join(data_dir, 'vocab')) parser = Parser.from_dir(path.join(data_dir, 'deps'), vocab.strings, ArcEager)
def main(json_loc: Path, train_file: Path, dev_file: Path, test_file: Path, test_split=0.189, train_split=0.709):
    """Creating the corpus from the Prodigy annotations."""
    Doc.set_extension("rel", default={})
    vocab = Vocab()
    docs = {"train": [], "dev": [], "test": []}
    ids = {"train": set(), "dev": set(), "test": set()}
    count_all = {"train": 0, "dev": 0, "test": 0}
    count_pos = {"train": 0, "dev": 0, "test": 0}
    long_rel_count = 0  # how many relations were cut because the tokens are too far apart
    error_count_rel = 0  # how often the label is something other than ARG0, ARG1, ARG
    with json_loc.open("r", encoding="utf8") as jsonfile:
        length_training_data = len([True for line in jsonfile if json.loads(line)["answer"] == "accept"])
    msg.info(f"Number of accepted recipes: {length_training_data}")
    with json_loc.open("r", encoding="utf8") as jsonfile:
        for line in jsonfile:
            example = json.loads(line)  # one recipe
            span_starts = set()
            if example["answer"] == "accept":
                neg = 0
                pos = 0
                try:
                    # Parse the tokens -> example["tokens"] = list of dicts
                    words = [t["text"] for t in example["tokens"]]  # list containing all words
                    spaces = [t["ws"] for t in example["tokens"]]  # list containing whether whitespace follows each word (ws = True/False)
                    doc = Doc(vocab, words=words, spaces=spaces)
                    # Parse the entities
                    spans = example["spans"]  # list of dicts containing entities
                    entities = []
                    span_end_to_start = {}
                    ents_dict = {}
                    for span in spans:  # every detected span
                        # "start" = index of the span's start character in the doc
                        entity = doc.char_span(span["start"], span["end"], label=span["label"])
                        # end token of span as key for its start token (start token = token index in doc)
                        span_end_to_start[span["token_end"]] = span["token_start"]
                        entities.append(entity)  # appended to list
                        span_starts.add(span["token_start"])  # added to set
                        ents_dict[span["token_start"]] = (span["label"], span["token_start"])
                    doc.ents = entities  # entity list assigned as doc entities
                    # Parse the relations
                    rels = {}
                    # create token combinations
                    for x1 in span_starts:
                        # VERBS_TO_OTHER 1a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  # filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in ["Z", "TOOL", "ATTR", "TEMP", "DAUER", "ZEITP", "PRÄP"]:  # filter entity type
                                        # DIFF_FRONT_BACK 1a
                                        if DIFF_FRONT_BACK == True:
                                            if ((x1 - x2) >= 0 and (x1 - x2) <= BACK) or ((x1 - x2) < 0 and (x1 - x2) >= FRONT * -1):
                                                rels[(x1, x2)] = {}
                                            else:
                                                pass
                                        # DIFF_FRONT_BACK 1b
                                        else:
                                            if abs(ents_dict[x1][1] - ents_dict[x2][1]) <= TOKEN_LENGTH:  # filter token distance (match with config?)
                                                rels[(x1, x2)] = {}  # every possible span combination becomes key for individual dict (1,1), (1,2) ...
                        # VERBS_TO_OTHER 1b
                        else:
                            for x2 in span_starts:
                                # DIFF_FRONT_BACK 2a
                                if DIFF_FRONT_BACK == True:
                                    if ((x1 - x2) >= 0 and (x1 - x2) <= BACK) or ((x1 - x2) < 0 and (x1 - x2) >= FRONT * -1):
                                        rels[(x1, x2)] = {}
                                    else:
                                        pass
                                # DIFF_FRONT_BACK 2b
                                else:
                                    if abs(ents_dict[x1][1] - ents_dict[x2][1]) <= TOKEN_LENGTH:  # filter token distance (match with config?)
                                        rels[(x1, x2)] = {}  # every possible span combination becomes key for individual dict (1,1), (1,2) ...
                    relations = example["relations"]  # relations is list of dicts
                    for relation in relations:
                        # the 'head' and 'child' annotations refer to the end token in the span
                        # but we want the first token
                        start = span_end_to_start[relation["head"]]  # token index of the head span's start token
                        end = span_end_to_start[relation["child"]]  # token index of the child span's start token
                        label = relation["label"]
                        # DETAILED_ARGS 1a
                        if DETAILED_ARGS == True:
                            if label == "ARG0":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG0[ents_dict[end][0]]  # assign new label based on span type
                            elif label == "ARG1":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG1[ents_dict[end][0]]
                            elif label == "ARG":
                                if ents_dict[end][0] in ["Z", "TOOL"]:
                                    if ents_dict[end][0] == "Z":
                                        label = "Arg0Z"
                                    elif ents_dict[end][0] == "TOOL":
                                        label = "Arg1Tool"
                                else:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                            else:
                                error_count_rel += 1
                        # DETAILED_ARGS 1b
                        else:
                            label = MAP_LABELS_STANDARD[label]  # MAP_LABELS = dict containing label as key
                        # Positive relations are being added
                        try:
                            if label not in rels[(start, end)]:  # check if label already exists for token combination
                                rels[(start, end)][label] = 1.0  # initialize label as new key with value 1.0
                                pos += 1  # positive case
                        except:
                            long_rel_count += 1  # error only if relation exists in annotation but isn't a valid token combi (too long/not starting from verb)
                            pass
                    # The annotation is complete, so fill in zero's where the data is missing
                    for x1 in span_starts:
                        # VERBS_TO_OTHER 2a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  # filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in ["Z", "TOOL", "ATTR", "TEMP", "DAUER", "ZEITP", "PRÄP"]:  # filter entity type
                                        # DIFF_FRONT_BACK 2a
                                        if DIFF_FRONT_BACK == True:
                                            if ((x1 - x2) >= 0 and (x1 - x2) <= BACK) or ((x1 - x2) < 0 and (x1 - x2) >= FRONT * -1):
                                                # DETAILED_ARGS 2a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(MAP_LABELS_ARG0.values()) + list(MAP_LABELS_ARG1.values()) + list(MAP_LABELS_ARG.values())
                                                    for label in merged_labels:
                                                        if label not in rels[(x1, x2)]:  # if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(x1, x2)][label] = 0.0
                                                # DETAILED_ARGS 2b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values():  # for every label
                                                        if label not in rels[(x1, x2)]:  # if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(x1, x2)][label] = 0.0
                                        # DIFF_FRONT_BACK 2b
                                        else:
                                            if abs(ents_dict[x1][1] - ents_dict[x2][1]) <= TOKEN_LENGTH:  # filter token distance (match with config?)
                                                # DETAILED_ARGS 3a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(MAP_LABELS_ARG0.values()) + list(MAP_LABELS_ARG1.values()) + list(MAP_LABELS_ARG.values())
                                                    for label in merged_labels:
                                                        if label not in rels[(x1, x2)]:  # if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(x1, x2)][label] = 0.0
                                                # DETAILED_ARGS 3b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values():  # for every label
                                                        if label not in rels[(x1, x2)]:  # if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(x1, x2)][label] = 0.0  # span combination with label as key gets 0 as value
                        # VERBS_TO_OTHER 2b
                        else:
                            for x2 in span_starts:
                                # DIFF_FRONT_BACK 3a
                                if DIFF_FRONT_BACK == True:
                                    if ((x1 - x2) >= 0 and (x1 - x2) <= BACK) or ((x1 - x2) < 0 and (x1 - x2) >= FRONT * -1):
                                        # DETAILED_ARGS 4a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(MAP_LABELS_ARG0.values()) + list(MAP_LABELS_ARG1.values()) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(x1, x2)]:  # if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                        # DETAILED_ARGS 4b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values():  # for every label
                                                if label not in rels[(x1, x2)]:  # if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                # DIFF_FRONT_BACK 3b
                                else:
                                    if abs(ents_dict[x1][1] - ents_dict[x2][1]) <= TOKEN_LENGTH:  # filter token distance (match with config?)
                                        # DETAILED_ARGS 5a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(MAP_LABELS_ARG0.values()) + list(MAP_LABELS_ARG1.values()) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(x1, x2)]:  # if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                        # DETAILED_ARGS 5b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values():  # for every label
                                                if label not in rels[(x1, x2)]:  # if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                    # print(rels)
                    doc._.rel = rels  # rels = {(1,1): {Arg0 : 1, Arg1 : 0, Arg : 0}, (1,2): {Arg0 : 0, ...}}
                    # only keeping documents with at least 1 positive case (if doc isn't annotated, relations = empty list)
                    if pos > 0:
                        recipe_id = example["_input_hash"]
                        if len(docs["train"]) < round(train_split * length_training_data):
                            ids["train"].add(recipe_id)
                            docs["train"].append(doc)
                            count_pos["train"] += pos
                            count_all["train"] += pos + neg
                        elif len(docs["test"]) < round(test_split * length_training_data):
                            ids["test"].add(recipe_id)
                            docs["test"].append(doc)
                            count_pos["test"] += pos
                            count_all["test"] += pos + neg
                        else:
                            ids["dev"].add(recipe_id)
                            docs["dev"].append(doc)
                            count_pos["dev"] += pos
                            count_all["dev"] += pos + neg
                except KeyError as e:
                    msg.fail(f"Skipping doc because of key error: {e} in {example['_input_hash']}")
    msg.info(f"{long_rel_count} relations have been cut because tokens are too far apart.")
    docbin = DocBin(docs=docs["train"], store_user_data=True)
    docbin.to_disk(train_file)
    msg.info(
        f"{len(docs['train'])} training recipes from {len(ids['train'])} unique recipes, "
        f"{count_pos['train']}/{count_all['train']} pos instances.")
    docbin = DocBin(docs=docs["dev"], store_user_data=True)
    docbin.to_disk(dev_file)
    msg.info(
        f"{len(docs['dev'])} dev recipes from {len(ids['dev'])} unique recipes, "
        f"{count_pos['dev']}/{count_all['dev']} pos instances.")
    docbin = DocBin(docs=docs["test"], store_user_data=True)
    docbin.to_disk(test_file)
    msg.info(
        f"{len(docs['test'])} test recipes from {len(ids['test'])} unique recipes, "
        f"{count_pos['test']}/{count_all['test']} pos instances.")
def test_load(self):
    vocab = Vocab.from_dir(path.join(English.default_data_dir(), 'vocab'))
def test_issue1807(): """Test vocab.set_vector also adds the word to the vocab.""" vocab = Vocab(vectors_name="test_issue1807") assert "hello" not in vocab vocab.set_vector("hello", numpy.ones((50, ), dtype="f")) assert "hello" in vocab
def test_load(self):
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer'))
def vocab():
    return Vocab()
def test_load(self):
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab)