def test_factories_merge_noun_chunks(doc):
    assert len(doc) == 7
    nlp = Language()
    merge_noun_chunks = nlp.create_pipe("merge_noun_chunks")
    merge_noun_chunks(doc)
    assert len(doc) == 6
    assert doc[2].text == "New York"

def test_factories_merge_ents(doc):
    assert len(doc) == 7
    assert len(list(doc.ents)) == 1
    nlp = Language()
    merge_entities = nlp.create_pipe("merge_entities")
    merge_entities(doc)
    assert len(doc) == 6
    assert len(list(doc.ents)) == 1
    assert doc[2].text == "New York"

def test_list_of_docs_pickles_efficiently():
    nlp = Language()
    for i in range(10000):
        _ = nlp.vocab[unicode_(i)]  # noqa: F841
    one_pickled = pickle.dumps(nlp("0"), -1)
    docs = list(nlp.pipe(unicode_(i) for i in range(100)))
    many_pickled = pickle.dumps(docs, -1)
    assert len(many_pickled) < (len(one_pickled) * 2)
    many_unpickled = pickle.loads(many_pickled)
    assert many_unpickled[0].text == "0"
    assert many_unpickled[-1].text == "99"
    assert len(many_unpickled) == 100

def test_issue1654():
    nlp = Language(Vocab())
    assert not nlp.pipeline
    nlp.add_pipe(lambda doc: doc, name="1")
    nlp.add_pipe(lambda doc: doc, name="2", after="1")
    nlp.add_pipe(lambda doc: doc, name="3", after="2")
    assert nlp.pipe_names == ["1", "2", "3"]
    nlp2 = Language(Vocab())
    assert not nlp2.pipeline
    nlp2.add_pipe(lambda doc: doc, name="3")
    nlp2.add_pipe(lambda doc: doc, name="2", before="3")
    nlp2.add_pipe(lambda doc: doc, name="1", before="2")
    assert nlp2.pipe_names == ["1", "2", "3"]

def test_issue1915():
    cfg = {"hidden_depth": 2}  # should error out
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.get_pipe("ner").add_label("answer")
    with pytest.raises(ValueError):
        nlp.begin_training(**cfg)

def test_issue1967(label):
    nlp = Language()
    config = {}
    ner = nlp.create_pipe("ner", config=config)
    example = Example.from_dict(
        Doc(ner.vocab, words=["word"]),
        {
            "ids": [0],
            "words": ["word"],
            "tags": ["tag"],
            "heads": [0],
            "deps": ["dep"],
            "entities": [label],
        },
    )
    assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1]

def test_textcat_learns_multilabel():
    random.seed(5)
    numpy.random.seed(5)
    docs = []
    nlp = Language()
    letters = ["a", "b", "c"]
    for w1 in letters:
        for w2 in letters:
            cats = {letter: float(w2 == letter) for letter in letters}
            docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
    random.shuffle(docs)
    textcat = TextCategorizer(nlp.vocab, width=8)
    for letter in letters:
        textcat.add_label(letter)
    optimizer = textcat.initialize(lambda: [])
    for i in range(30):
        losses = {}
        # use each doc's own cats dict rather than the stale loop variable
        examples = [Example.from_dict(doc, {"cats": cats}) for doc, cats in docs]
        textcat.update(examples, sgd=optimizer, losses=losses)
        random.shuffle(docs)
    for w1 in letters:
        for w2 in letters:
            doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
            truth = {letter: w2 == letter for letter in letters}
            textcat(doc)
            for cat, score in doc.cats.items():
                if not truth[cat]:
                    assert score < 0.5
                else:
                    assert score > 0.5

def _extract_doc_matches(
    self,
    lang: Language,
    doc: Doc,
    keywords: Sequence[str],
    scores: Sequence[float],
) -> Dict[str, DocMatch]:
    '''
    Extract and format match info for all keywords in a given document.

    Keywords are matched with a PhraseMatcher on the LOWER token attribute,
    i.e. case-insensitively, both at the document and sentence level.
    '''
    matcher = PhraseMatcher(lang.vocab, attr='LOWER')
    patterns = [lang.make_doc(str(kw)) for kw in keywords]
    matcher.add("Keywords", patterns)
    sents = self._extract_sentence_matches(doc, keywords, matcher, attr='LOWER')
    matches: Dict[str, DocMatch] = {
        kw: DocMatch(doc, kw, score, sents[kw])
        for kw, score in zip(keywords, scores)
    }
    return matches

def test_pipe_factories_config_excludes_nlp():
    """Test that the extra values we temporarily add to component config
    blocks/functions are removed and not copied around.
    """
    name = "test_pipe_factories_config_excludes_nlp"
    func = lambda nlp, name: lambda doc: doc
    Language.factory(name, func=func)
    config = {
        "nlp": {"lang": "en", "pipeline": [name]},
        "components": {name: {"factory": name}},
    }
    nlp = English.from_config(config)
    assert nlp.pipe_names == [name]
    pipe_cfg = nlp.get_pipe_config(name)
    assert pipe_cfg == {"factory": name}
    assert nlp._pipe_configs[name] == {"factory": name}

def test_user_data_from_disk():
    nlp = Language()
    doc = nlp("Hello")
    doc.user_data[(0, 1)] = False
    b = doc.to_bytes()
    doc2 = doc.__class__(doc.vocab).from_bytes(b)
    assert doc2.user_data[(0, 1)] is False

def corpus(nlp: Language):
    for original_example in original_examples:
        doc = nlp.make_doc(original_example[0])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            spacy_example = Example.from_dict(doc, original_example[1])
        yield spacy_example

def test_user_data_unpickles():
    nlp = Language()
    doc = nlp("Hello")
    doc.user_data[(0, 1)] = False
    b = pickle.dumps(doc)
    doc2 = pickle.loads(b)
    assert doc2.user_data[(0, 1)] is False

def get_smiles_language():
    """
    Get SMILES language.

    Returns:
        a spacy.language.Language representing SMILES.
    """
    valid_values = list(filter(lambda k: k != PADDING_ATOM, ATOM_MAPPING.keys()))
    vocabulary = Vocab(strings=valid_values)

    def make_doc(smiles):
        """
        Make a SMILES document.

        Arguments:
            smiles (str): a SMILES representing a molecule.

        Returns:
            a spacy.tokens.Doc representing the molecule.
        """
        if len(smiles) == 0:
            # fall back to a single random atom token for empty input
            # (wrapped in a list, since Doc expects a list of words)
            tokens = [np.random.choice(valid_values)]
        else:
            tokens = [
                token for token in ATOM_REGEX.split(smiles) if token
            ][:MAX_LENGTH]
        return Doc(vocabulary, words=tokens, spaces=[False] * len(tokens))

    return Language(vocabulary, make_doc)

def test_add_pipe(nlp: Language):
    """
    Works as a pipeline component and can be disabled.
    """
    # given
    nlp.add_pipe("topicrank", last=True)

    # works as a pipeline component
    # when
    text = "linear constraints over the"
    doc = nlp(text)
    phrases = [p.text for p in doc._.phrases]

    # then
    assert len(doc._.phrases) > 0
    assert any(map(lambda x: "constraints" in x, phrases))

    # identifies phrases not in noun chunks
    # when
    text = """\
everything you need to know about student loan interest rates, variable \
and fixed rates, capitalization, amortization, student loan refinancing \
and more.\
"""
    doc = nlp(text)
    phrases = [p.text for p in doc._.phrases]

    # then
    assert len(doc._.phrases) >= 2

    # resolves Py 3.5 dict KeyError
    # when
    text = "linear constraints over the set of natural numbers"
    doc = nlp(text)
    phrases = [p.text for p in doc._.phrases]

    # then
    assert any(map(lambda x: "constraints" in x, phrases))

    # pipeline can be disabled
    # when
    with nlp.select_pipes(disable=["topicrank"]):
        doc = nlp(text)

    # then
    assert len(doc._.phrases) == 0

def test_ner_warns_no_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list

def from_dir(cls, tag_map, model_dir):
    vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)

    cfg = Config.read(path.join(model_dir, 'deps'), 'config')
    parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
    return cls(vocab, tokenizer, tagger, parser)

def matcher(nlp: Language) -> FuzzyMatcher:
    """Fuzzy matcher with patterns added."""
    animals = ["Heifer", "chicken"]
    sounds = ["mooo"]
    names = ["Steven"]
    matcher = FuzzyMatcher(nlp.vocab)
    matcher.add(
        "ANIMAL",
        [nlp.make_doc(animal) for animal in animals],
        kwargs=[{"ignore_case": False}, {}],
    )
    matcher.add("SOUND", [nlp.make_doc(sound) for sound in sounds])
    matcher.add("NAME", [nlp.make_doc(name) for name in names], on_match=add_name_ent)
    return matcher

def vocab():
    vocab = Vocab(Language.default_lex_attrs())
    lex = vocab['dog']
    assert vocab[vocab.strings['dog']].orth_ == 'dog'
    lex = vocab['the']
    lex = vocab['quick']
    lex = vocab['jumped']
    return vocab

def test__scan_doc_returns_matches_over_min_r1(
    searcher: FuzzySearcher, nlp: Language, scan_example: Doc
) -> None:
    """It returns all spans of len(query) in doc if ratio >= min_r1."""
    query = nlp.make_doc("Shirley")
    assert searcher._scan_doc(
        scan_example, query, fuzzy_func="simple", min_r1=30, ignore_case=True
    ) == {4: 86}

def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.

    There are two issues here:

    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we end
       up out-of-scale, causing it to learn too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]
    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for itn in range(100):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            nlp.update([raw_text], [{"entities": entity_offsets}])
    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            if (start, end) in ents:
                assert ents[(start, end)] == label
                break
        else:
            if entity_offsets:
                raise Exception(ents)

def test__adjust_left_right_positions_finds_better_match(
    searcher: FuzzySearcher, nlp: Language
) -> None:
    """It optimizes the initial match to find a better match."""
    doc = nlp.make_doc("Patient was prescribed Zithromax tablets.")
    query = nlp.make_doc("zithromax tablet")
    match_values = {0: 30, 2: 50, 3: 97, 4: 50}
    assert searcher._adjust_left_right_positions(
        doc,
        query,
        match_values,
        pos=3,
        fuzzy_func="simple",
        min_r2=70,
        ignore_case=True,
        flex=2,
    ) == (3, 5, 97)

def test__adjust_left_right_positions_with_no_flex(
    searcher: FuzzySearcher, nlp: Language
) -> None:
    """It returns the initial match when flex value = 0."""
    doc = nlp.make_doc("Patient was prescribed Zithroma tablets.")
    query = nlp.make_doc("zithromax")
    match_values = {3: 94}
    assert searcher._adjust_left_right_positions(
        doc,
        query,
        match_values,
        pos=3,
        fuzzy_func="simple",
        min_r2=70,
        ignore_case=True,
        flex=0,
    ) == (3, 4, 94)

def test__scan_doc_returns_all_matches_with_no_min_r1(
    searcher: FuzzySearcher, nlp: Language, scan_example: Doc
) -> None:
    """It returns all spans of len(query) in doc if min_r1 = 0."""
    query = nlp.make_doc("Shirley")
    assert searcher._scan_doc(
        scan_example, query, fuzzy_func="simple", min_r1=0, ignore_case=True
    ) == {0: 0, 1: 0, 2: 18, 3: 22, 4: 86}

def main(use_gpu=False, nb_epoch=50):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb()
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)
    nlp = Language()

    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    train_X = [nlp.make_doc(x) for x in train_X]
    dev_X = [nlp.make_doc(x) for x in dev_X]
    model = build_model(2, 1)

    print("Begin training")
    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        epoch_loss = [0.0]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.0)

        trainer.each_epoch.append(report_progress)
        trainer.nb_epoch = nb_epoch
        trainer.dropout = 0.0
        trainer.batch_size = 128
        trainer.dropout_decay = 0.0
        for X, y in trainer.iterate(train_X[:1000], train_y[:1000]):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            loss = ((yh - y) ** 2.0).sum() / y.shape[0]
            backprop((yh - y) / y.shape[0], optimizer)
            epoch_loss[-1] += loss

    with model.use_params(optimizer.averages):
        print("Avg dev.: %.3f" % model.evaluate(dev_X, dev_y))
        with open("out.pickle", "wb") as file_:
            pickle.dump(model, file_, -1)

def test_simple_train():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()
    for i in range(5):
        for text, answer in [
            ("aaaa", 1.0),
            ("bbbb", 0),
            ("aa", 1.0),
            ("bbbbbbbbb", 0.0),
            ("aaaaaa", 1),
        ]:
            nlp.update([text], [{"cats": {"answer": answer}}])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5

def test_pickle_single_doc():
    nlp = Language()
    doc = nlp("pickle roundtrip")
    doc._context = 3
    data = pickle.dumps(doc, 1)
    doc2 = pickle.loads(data)
    assert doc2.text == "pickle roundtrip"
    assert doc2._context == 3

def test_serialize_language_exclude(meta_data):
    name = "name-in-fixture"
    nlp = Language(meta=meta_data)
    assert nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes())
    assert new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
    assert not new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
    assert not new_nlp.meta["name"] == name

def loadWordFeatures(dom, loc="wordfeats", loadpickle=False, savepickle=False):
    if loadpickle:
        # we load saved word features from our pickle file
        # the word features were generated as below
        with open(os.path.join(dirname, 'data', 'wordfeatures.pkl'), 'rb') as f:
            featdict = pickle.load(f)
        return featdict[dom]

    nlp = Language().from_disk(loc)
    taskwords = {
        'household': [
            [' '],
            ['Pick and Place glass'],
            ['Pick and Place plastic can'],
            ['Pick and Place lemon'],
            ['Pick and Place plastic bottle'],
            ['Pick and Place apple'],
            ['Pick and Place plastic cup'],
            ['Navigate while avoiding moving people'],
            ['Navigate to the main room door'],
            ['Navigate while following a person'],
            ['Navigate to the dining table'],
            ['Navigate while avoiding obstacles'],
            ['Navigate to the living room'],
        ],
        'driving': [
            [' '],
            ['Parking backwards cars and people around, misaligned'],
            ['Parking backwards empty lot, misaligned'],
            ['Parking backwards cars and people around, aligned'],
            ['Parking forwards empty lot, aligned'],
            ['Parking forwards cars and people around, misaligned'],
            ['Parking forwards empty lot, misaligned'],
            ['Navigating lane merge with other moving vehicles'],
            ['Navigating lane merge on a clear road'],
            ['Navigating traffic-circle with other moving vehicles'],
            ['Navigating traffic-circle on a clear road'],
            ['Navigating T-junction with other moving vehicles'],
            ['Navigating T-junction on a clear road'],
        ],
    }
    featdict = {}
    for d, task_word_list in taskwords.items():
        wordfeatures = []
        for i in range(len(task_word_list)):
            print(task_word_list[i][0])
            wordfeatures.append(nlp(task_word_list[i][0]).vector)
        wordfeatures = np.array(wordfeatures)
        featdict[d] = wordfeatures
    wordfeatures = featdict[dom]
    # save the data
    if savepickle:
        with open(os.path.join(dirname, 'data', 'wordfeatures.pkl'), 'wb') as f:
            pickle.dump(featdict, f, protocol=pickle.HIGHEST_PROTOCOL)
    return wordfeatures

def test_beam_parse():
    nlp = Language()
    nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
    nlp.parser.add_label("nsubj")
    nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
    doc = nlp.make_doc("Australia is a country")
    nlp.parser(doc, beam_width=2)

def test_tagger_warns_no_lemma_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    tagger = nlp.create_pipe("tagger")
    with pytest.warns(UserWarning):
        tagger.begin_training()
    nlp.add_pipe(tagger)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lemma_lookup")
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list

def test_pipe_class_component_config():
    name = "test_class_component_config"

    @Language.factory(name)
    class Component:
        def __init__(
            self, nlp: Language, name: str, value1: StrictInt, value2: StrictStr
        ):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = True

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @English.factory(name)
    class ComponentEN:
        def __init__(
            self, nlp: Language, name: str, value1: StrictInt, value2: StrictStr
        ):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = False

        def __call__(self, doc: Doc) -> Doc:
            return doc

    nlp = Language()
    with pytest.raises(ConfigValidationError):  # no config provided
        nlp.add_pipe(name)
    with pytest.raises(ConfigValidationError):  # invalid config
        nlp.add_pipe(name, config={"value1": "10", "value2": "hello"})
    nlp.add_pipe(name, config={"value1": 10, "value2": "hello"})
    pipe = nlp.get_pipe(name)
    assert isinstance(pipe.nlp, Language)
    assert pipe.value1 == 10
    assert pipe.value2 == "hello"
    assert pipe.is_base is True

    nlp_en = English()
    with pytest.raises(ConfigValidationError):  # invalid config
        nlp_en.add_pipe(name, config={"value1": "10", "value2": "hello"})
    nlp_en.add_pipe(name, config={"value1": 10, "value2": "hello"})
    pipe = nlp_en.get_pipe(name)
    assert isinstance(pipe.nlp, English)
    assert pipe.value1 == 10
    assert pipe.value2 == "hello"
    assert pipe.is_base is False

def test_implicit_label():
    nlp = Language()
    nlp.add_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    nlp.initialize(get_examples=lambda: train_examples)

def normalize_np(np: str, nlp_md: Language) -> str:
    """Normalize a noun phrase by lemmatizing its tokens and dropping stopwords."""
    normalized_np = " ".join([
        token.lemma_
        for token in nlp_md.tokenizer(np)
        if token.text not in NLTK_STOPWORDS
    ])
    return normalized_np

def test_language_source_and_vectors(nlp2):
    nlp = Language(Vocab())
    textcat = nlp.add_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    nlp.initialize()
    long_string = "thisisalongstring"
    assert long_string not in nlp.vocab.strings
    assert long_string not in nlp2.vocab.strings
    nlp.vocab.strings.add(long_string)
    assert nlp.vocab.vectors.to_bytes() != nlp2.vocab.vectors.to_bytes()
    vectors_bytes = nlp.vocab.vectors.to_bytes()
    with pytest.warns(UserWarning):
        nlp2.add_pipe("textcat", name="textcat2", source=nlp)
    # strings should be added
    assert long_string in nlp2.vocab.strings
    # vectors should remain unmodified
    assert nlp.vocab.vectors.to_bytes() == vectors_bytes

def test_language_pipe_error_handler_pipe(en_vocab, n_process):
    """Test the error handling of a component's pipe method"""
    Language.component("my_perhaps_sentences", func=perhaps_set_sentences)
    Language.component("assert_sents_error", func=assert_sents_error)
    ops = get_current_ops()
    if isinstance(ops, NumpyOps) or n_process < 2:
        texts = [f"{str(i)} is enough. Done" for i in range(100)]
        nlp = English()
        nlp.add_pipe("my_perhaps_sentences")
        nlp.add_pipe("assert_sents_error")
        nlp.initialize()
        with pytest.raises(ValueError):
            # assert_sents_error requires sentence boundaries, will throw an error otherwise
            docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
        nlp.set_error_handler(ignore_error)
        docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
        # we lose/ignore the failing 4,40-49 docs
        assert len(docs) == 89

def build_pipeline(disable: list = []):
    """
    Create the pipeline for the creation of (sentence, reference) tuples.

    Returns:
        - nlp: spaCy pipeline instance
    """
    nlp = load_spacy_model("de_core_news_sm")

    # Matching section references using ReferenceMatcher class
    Language.component("reference_matcher", func=match_reference)

    nlp.add_pipe("sentencizer")
    nlp.add_pipe("reference_matcher", before="tagger")
    nlp.disable_pipes(*disable)

    print("\nActivated pipes:")
    print(nlp.pipe_names)
    return nlp

def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    assert new_ruler.patterns == ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite

def lang():
    vector_data = {
        k: np.random.normal(0, 1, (2,))
        for k in ["red", "blue", "cat", "dog", "green", "purple"]
    }
    vector_data['cat'] += 10
    vector_data['dog'] += 10
    vocab = Vocab(strings=vector_data.keys())
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab)
    return SpacyLanguage(nlp)

def register_benepar_component_factory():
    # Starting with spaCy 3.0, nlp.add_pipe no longer directly accepts
    # BeneparComponent instances. We must instead register a component factory.
    import spacy

    if spacy.__version__.startswith("2"):
        return

    from spacy.language import Language

    Language.factory(
        "benepar",
        default_config={
            "subbatch_max_tokens": 500,
            "disable_tagger": False,
        },
        func=create_benepar_component,
    )

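# Hedged usage sketch (not part of the snippet above): once the "benepar" factory
# is registered, it can be added to a spaCy 3 pipeline with nlp.add_pipe. The
# base pipeline "en_core_web_sm" and the benepar model name "benepar_en3" are
# assumptions for illustration only, and the sketch assumes that
# create_benepar_component accepts a "model" config entry, as the benepar
# integration commonly does; substitute whatever models your setup actually uses.
import spacy

nlp = spacy.load("en_core_web_sm")                         # assumed base pipeline
register_benepar_component_factory()                       # register the factory defined above
nlp.add_pipe("benepar", config={"model": "benepar_en3"})   # assumed benepar model name
doc = nlp("The time for action is now.")
sent = list(doc.sents)[0]
print(sent._.parse_string)                                 # constituency parse added by benepar
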
def create_pipeline(nlp: Language, cfg: omegaconf.DictConfig) -> List[Pipe]:
    if not isinstance(cfg, omegaconf.DictConfig):
        cfg = OmegaConf.create(cfg)
    pipes = []
    for name, pipe_config in cfg.items():
        pipe_config = OmegaConf.to_container(pipe_config or OmegaConf.create({}))
        pipes.append(nlp.create_pipe(name, config=pipe_config or dict()))
    return pipes

def main(train_loc, dev_loc, model_dir, tag_map_loc):
    with open(tag_map_loc) as file_:
        tag_map = json.loads(file_.read())
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    with (model_dir / 'deps' / 'config.json').open('w') as file_:
        json.dump(
            {'pseudoprojective': True, 'labels': actions, 'features': features},
            file_)

    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            for tag in tags:
                assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features)

    for itn in range(15):
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                parser.update(doc, gold)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))

    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

def test_serialize_with_custom_tokenizer():
    """Test that serialization with custom tokenizer works without token_match.
    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
    """
    prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""")
    suffix_re = re.compile(r"""""")
    infix_re = re.compile(r"""[~]""")

    def custom_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
        )

    nlp = Language()
    nlp.tokenizer = custom_tokenizer(nlp)
    with make_tempdir() as d:
        nlp.to_disk(d)

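# Hedged companion sketch: loading the pipeline saved above. A common pattern is
# to rebuild the custom tokenizer on a fresh Language before calling from_disk,
# since the callables themselves are not part of the serialized code. This
# sketch assumes a custom_tokenizer helper like the one nested in the test is
# available at module scope, and `d` mirrors the tempdir path used there.
def load_with_custom_tokenizer(d):
    nlp2 = Language()
    nlp2.tokenizer = custom_tokenizer(nlp2)  # assumed module-level helper
    return nlp2.from_disk(d)
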
def test_serialize_language_exclude(meta_data):
    name = "name-in-fixture"
    nlp = Language(meta=meta_data)
    assert nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes())
    assert new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
    assert not new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
    assert not new_nlp.meta["name"] == name
    with pytest.raises(ValueError):
        nlp.to_bytes(meta=False)
    with pytest.raises(ValueError):
        Language().from_bytes(nlp.to_bytes(), meta=False)

def test_serialize_language_meta_disk(meta_data):
    language = Language(meta=meta_data)
    with make_tempdir() as d:
        language.to_disk(d)
        new_language = Language().from_disk(d)
    assert new_language.meta == language.meta