def test_matcher_pipe(nlp: Language) -> None:
    """It returns a stream of Doc objects."""
    doc_stream = (
        nlp.make_doc("test doc 1: Corvold"),
        nlp.make_doc("test doc 2: Prosh"),
    )
    matcher = FuzzyMatcher(nlp.vocab)
    output = matcher.pipe(doc_stream)
    assert list(output) == list(doc_stream)
def test_matcher_pipe_with_context(nlp: Language) -> None:
    """It returns a stream of Doc objects as tuples with context."""
    doc_stream = (
        (nlp.make_doc("test doc 1: United States"), "Country"),
        (nlp.make_doc("test doc 2: US"), "Country"),
    )
    matcher = RegexMatcher(nlp.vocab)
    output = matcher.pipe(doc_stream, as_tuples=True)
    assert list(output) == list(doc_stream)
def test_matcher_pipe_with_context(nlp: Language) -> None:
    """It returns a stream of Doc objects as tuples with context."""
    doc_stream = (
        (nlp.make_doc("test doc 1: Corvold"), "Jund"),
        (nlp.make_doc("test doc 2: Prosh"), "Jund"),
    )
    matcher = FuzzyMatcher(nlp.vocab)
    output = matcher.pipe(doc_stream, as_tuples=True)
    assert list(output) == list(doc_stream)
def test_matcher_pipe(nlp: Language) -> None:
    """It returns a stream of Doc objects."""
    doc_stream = (
        nlp.make_doc("test doc 1: United States"),
        nlp.make_doc("test doc 2: US"),
    )
    matcher = RegexMatcher(nlp.vocab)
    output = matcher.pipe(doc_stream)
    assert list(output) == list(doc_stream)
def test_matcher_pipe_with_matches(nlp: Language) -> None:
    """It returns a stream of Doc objects and matches as tuples."""
    doc_stream = (
        nlp.make_doc("test doc 1: United States"),
        nlp.make_doc("test doc 2: US"),
    )
    matcher = RegexMatcher(nlp.vocab)
    matcher.add("GPE", ["[Uu](nited|\\.?) ?[Ss](tates|\\.?)"])
    output = matcher.pipe(doc_stream, return_matches=True)
    matches = [entry[1] for entry in output]
    assert matches == [[("GPE", 4, 6)], [("GPE", 4, 5)]]
def test_matcher_pipe_with_matches(nlp: Language) -> None:
    """It returns a stream of Doc objects and matches as tuples."""
    doc_stream = (
        nlp.make_doc("test doc 1: Corvold"),
        nlp.make_doc("test doc 2: Prosh"),
    )
    matcher = FuzzyMatcher(nlp.vocab)
    matcher.add("DRAGON", [nlp.make_doc("Korvold"), nlp.make_doc("Prossh")])
    output = matcher.pipe(doc_stream, return_matches=True)
    matches = [entry[1] for entry in output]
    assert matches == [[("DRAGON", 4, 5, 86)], [("DRAGON", 4, 5, 91)]]
def test_matcher_pipe_with_matches_and_context(nlp: Language) -> None:
    """It returns a stream of Doc objects, matches, and context as a tuple."""
    doc_stream = (
        (nlp.make_doc("test doc 1: United States"), "Country"),
        (nlp.make_doc("test doc 2: US"), "Country"),
    )
    matcher = RegexMatcher(nlp.vocab)
    matcher.add("GPE", ["[Uu](nited|\\.?) ?[Ss](tates|\\.?)"])
    output = matcher.pipe(doc_stream, return_matches=True, as_tuples=True)
    matches = [(entry[0][1], entry[1]) for entry in output]
    assert matches == [([("GPE", 4, 6)], "Country"), ([("GPE", 4, 5)], "Country")]
def test_matcher_pipe_with_matches_and_context(nlp: Language) -> None:
    """It returns a stream of Doc objects, matches, and context as tuples."""
    doc_stream = (
        (nlp.make_doc("test doc 1: Corvold"), "Jund"),
        (nlp.make_doc("test doc 2: Prosh"), "Jund"),
    )
    matcher = FuzzyMatcher(nlp.vocab)
    matcher.add("DRAGON", [nlp.make_doc("Korvold"), nlp.make_doc("Prossh")])
    output = matcher.pipe(doc_stream, return_matches=True, as_tuples=True)
    matches = [(entry[0][1], entry[1]) for entry in output]
    assert matches == [
        ([("DRAGON", 4, 5, 86)], "Jund"),
        ([("DRAGON", 4, 5, 91)], "Jund"),
    ]
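# The eight pipe tests above all rely on an `nlp` fixture and the spaczz
# matcher classes. A minimal sketch of the conftest they assume is below;
# this is a hedged reconstruction, not the project's actual conftest, and
# the import paths reflect spaczz's published API.
import pytest
import spacy
from spacy.language import Language
from spaczz.matcher import FuzzyMatcher, RegexMatcher


@pytest.fixture
def nlp() -> Language:
    """Blank English pipeline for creating test Docs."""
    return spacy.blank("en")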
def _extract_doc_matches(
    self,
    lang: Language,
    doc: Doc,
    keywords: Sequence[str],
    scores: Sequence[float],
) -> Dict[str, DocMatch]:
    """Extract and format info for all keywords in a given document.

    Matching is case-insensitive: both the PhraseMatcher and the
    sentence search use the "LOWER" token attribute.
    """
    matcher = PhraseMatcher(lang.vocab, attr="LOWER")
    patterns = [lang.make_doc(str(kw)) for kw in keywords]
    matcher.add("Keywords", patterns)
    sents = self._extract_sentence_matches(doc, keywords, matcher, attr="LOWER")
    matches: Dict[str, DocMatch] = {
        kw: DocMatch(doc, kw, score, sents[kw])
        for kw, score in zip(keywords, scores)
    }
    return matches
def test_beam_parse():
    nlp = Language()
    nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
    nlp.parser.add_label("nsubj")
    nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
    doc = nlp.make_doc("Australia is a country")
    nlp.parser(doc, beam_width=2)
def corpus(nlp: Language):
    for original_example in original_examples:
        doc = nlp.make_doc(original_example[0])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            spacy_example = Example.from_dict(doc, original_example[1])
        yield spacy_example
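# Hypothetical shape of the module-level original_examples that corpus()
# above iterates over: (text, annotations) pairs in the dict format
# accepted by Example.from_dict. The real data will differ.
original_examples = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London.", {"entities": [(7, 13, "LOC")]}),
]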
def test__adjust_left_right_positions_with_no_flex(
    searcher: FuzzySearcher, nlp: Language
) -> None:
    """It returns the initial match when flex value = 0."""
    doc = nlp.make_doc("Patient was prescribed Zithroma tablets.")
    query = nlp.make_doc("zithromax")
    match_values = {3: 94}
    assert searcher._adjust_left_right_positions(
        doc,
        query,
        match_values,
        pos=3,
        fuzzy_func="simple",
        min_r2=70,
        ignore_case=True,
        flex=0,
    ) == (3, 4, 94)
def test__adjust_left_right_positions_finds_better_match(
    searcher: FuzzySearcher, nlp: Language
) -> None:
    """It optimizes the initial match to find a better match."""
    doc = nlp.make_doc("Patient was prescribed Zithromax tablets.")
    query = nlp.make_doc("zithromax tablet")
    match_values = {0: 30, 2: 50, 3: 97, 4: 50}
    assert searcher._adjust_left_right_positions(
        doc,
        query,
        match_values,
        pos=3,
        fuzzy_func="simple",
        min_r2=70,
        ignore_case=True,
        flex=2,
    ) == (3, 5, 97)
def test_implicit_label():
    nlp = Language()
    nlp.add_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    nlp.initialize(get_examples=lambda: train_examples)
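# A plausible TRAIN_DATA for the implicit-label tagger test above,
# following spaCy's (text, {"tags": [...]}) convention; the actual
# constant in the test module may differ.
TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
]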
def test__calc_flex_warns_if_flex_longer_than_query(
    nlp: Language, searcher: FuzzySearcher
) -> None:
    """It provides UserWarning if flex > len(query)."""
    query = nlp.make_doc("Test query.")
    with pytest.warns(FlexWarning):
        searcher._calc_flex(query, 5)
def test__scan_doc_with_no_matches(
    searcher: FuzzySearcher, nlp: Language, scan_example: Doc
) -> None:
    """It returns None if no matches >= min_r1."""
    query = nlp.make_doc("xenomorph")
    assert (
        searcher._scan_doc(
            scan_example, query, fuzzy_func="simple", min_r1=30, ignore_case=True
        )
        is None
    )
def main(use_gpu=False, nb_epoch=50):
    # Optionally swap in CuPy-backed ops to run on GPU.
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb()
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)
    nlp = Language()
    # Hold out the last 1000 training examples as a dev set.
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    # Tokenize only; no pipeline components are run.
    train_X = [nlp.make_doc(x) for x in train_X]
    dev_X = [nlp.make_doc(x) for x in dev_X]
    model = build_model(2, 1)
    print("Begin training")
    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        epoch_loss = [0.0]

        def report_progress():
            # Evaluate with the averaged parameters, then start a fresh
            # loss accumulator for the next epoch.
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.0)

        trainer.each_epoch.append(report_progress)
        trainer.nb_epoch = nb_epoch
        trainer.dropout = 0.0
        trainer.batch_size = 128
        trainer.dropout_decay = 0.0
        for X, y in trainer.iterate(train_X[:1000], train_y[:1000]):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # Mean squared error loss; backprop the gradient of the mean.
            loss = ((yh - y) ** 2.0).sum() / y.shape[0]
            backprop((yh - y) / y.shape[0], optimizer)
            epoch_loss[-1] += loss
        # Final evaluation and serialization, again with averaged params.
        with model.use_params(optimizer.averages):
            print("Avg dev.: %.3f" % model.evaluate(dev_X, dev_y))
            with open("out.pickle", "wb") as file_:
                pickle.dump(model, file_, -1)
def test__scan_doc_returns_all_matches_with_no_min_r1(
    searcher: FuzzySearcher, nlp: Language, scan_example: Doc
) -> None:
    """It returns all spans of len(query) in doc if min_r1 = 0."""
    query = nlp.make_doc("Shirley")
    assert searcher._scan_doc(
        scan_example, query, fuzzy_func="simple", min_r1=0, ignore_case=True
    ) == {0: 0, 1: 0, 2: 18, 3: 22, 4: 86}
def test_beam_parse(examples, beam_width):
    nlp = Language()
    parser = nlp.add_pipe("beam_parser")
    parser.cfg["beam_width"] = beam_width
    parser.add_label("nsubj")
    parser.initialize(lambda: examples)
    doc = nlp.make_doc("Australia is a country")
    parser(doc)
def matcher(nlp: Language) -> FuzzyMatcher:
    """Fuzzy matcher with patterns added."""
    animals = ["Heifer", "chicken"]
    sounds = ["mooo"]
    names = ["Steven"]
    matcher = FuzzyMatcher(nlp.vocab)
    matcher.add(
        "ANIMAL",
        [nlp.make_doc(animal) for animal in animals],
        kwargs=[{"ignore_case": False}, {}],
    )
    matcher.add("SOUND", [nlp.make_doc(sound) for sound in sounds])
    matcher.add("NAME", [nlp.make_doc(name) for name in names], on_match=add_name_ent)
    return matcher
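# The NAME pattern in the fixture above registers an on_match callback
# that is not shown. A minimal sketch of what add_name_ent might look
# like (hypothetical; it assumes spaCy's standard (matcher, doc, i,
# matches) callback signature and the (label, start, end, ratio) match
# tuples seen in the assertions above):
from spacy.tokens import Span


def add_name_ent(matcher, doc, i, matches):
    """Add a NAME entity to the doc for the i-th match."""
    _label, start, end, _ratio = matches[i]
    entity = Span(doc, start, end, label="NAME")
    doc.ents += (entity,)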
def test_error_with_multi_labels():
    nlp = Language()
    nlp.add_pipe("textcat")
    train_examples = []
    for text, annotations in TRAIN_DATA_MULTI_LABEL:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    with pytest.raises(ValueError):
        nlp.initialize(get_examples=lambda: train_examples)
def test__scan_doc_returns_matches_over_min_r1(
    searcher: FuzzySearcher, nlp: Language, scan_example: Doc
) -> None:
    """It returns all spans of len(query) in doc if ratio >= min_r1."""
    query = nlp.make_doc("Shirley")
    assert searcher._scan_doc(
        scan_example, query, fuzzy_func="simple", min_r1=30, ignore_case=True
    ) == {4: 86}
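# The FuzzySearcher tests above rely on two fixtures not shown here.
# A hedged sketch follows: the import path may vary by spaczz version,
# and the scan_example text is a placeholder -- the real conftest uses
# some text whose token 4 fuzzy-matches "Shirley" at a simple ratio of
# 86, per the assertions above.
from spacy.tokens import Doc
from spaczz.search import FuzzySearcher  # path may vary by spaczz version


@pytest.fixture
def searcher(nlp: Language) -> FuzzySearcher:
    """Fuzzy searcher for testing."""
    return FuzzySearcher(nlp.vocab)


@pytest.fixture
def scan_example(nlp: Language) -> Doc:
    """Doc for _scan_doc tests (placeholder text)."""
    return nlp.make_doc("Don't call me Shirlley.")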
def test_implicit_labels():
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    assert len(spancat.labels) == 0
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.labels == ("PERSON", "LOC")
def test_ner_labels_added_implicitly_on_update():
    nlp = Language()
    ner = nlp.add_pipe("ner")
    for label in ["A", "B", "C"]:
        ner.add_label(label)
    nlp.initialize()
    doc = Doc(nlp.vocab, words=["hello", "world"], ents=["B-D", "O"])
    example = Example(nlp.make_doc(doc.text), doc)
    assert "D" not in ner.labels
    nlp.update([example])
    assert "D" in ner.labels
def test_factories_doc_cleaner():
    nlp = Language()
    nlp.add_pipe("doc_cleaner")
    doc = nlp.make_doc("text")
    doc.tensor = [1, 2, 3]
    doc = nlp(doc)
    assert doc.tensor is None

    nlp = Language()
    nlp.add_pipe("doc_cleaner", config={"silent": False})
    with pytest.warns(UserWarning):
        doc = nlp("text")

    Doc.set_extension("test_attr", default=-1)
    nlp = Language()
    nlp.add_pipe("doc_cleaner", config={"attrs": {"_.test_attr": 0}})
    doc = nlp.make_doc("text")
    doc._.test_attr = 100
    doc = nlp(doc)
    assert doc._.test_attr == 0
def test_add_with_more_patterns_than_explicit_kwargs_warns(
    matcher: FuzzyMatcher, nlp: Language
) -> None:
    """It will warn when more patterns are added than explicit kwargs."""
    with pytest.warns(KwargsWarning):
        matcher.add(
            "TEST",
            [nlp.make_doc("Test1"), nlp.make_doc("Test2")],
            [{"ignore_case": False}],
        )
def simple_nlp():
    nlp = Language()
    nlp.add_pipe("transformer")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize()
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    return nlp
def test_initialize_examples():
    nlp = Language()
    nlp.add_pipe("senter")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.initialize()
    nlp.initialize(get_examples=lambda: train_examples)
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: None)
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=train_examples)
def test_transformer_pipeline_simple():
    """Test that a simple pipeline with just a transformer at least runs."""
    nlp = Language()
    nlp.add_pipe("transformer")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize()
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    doc = nlp("We're interested at underwater basket weaving.")
    assert doc
def test_make_spangroup(max_positive, nr_results):
    fix_random_seed(0)
    nlp = Language()
    spancat = nlp.add_pipe(
        "spancat",
        config={"spans_key": SPAN_KEY, "threshold": 0.5, "max_positive": max_positive},
    )
    doc = nlp.make_doc("Greater London")
    ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
    indices = ngram_suggester([doc])[0].dataXd
    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
    labels = ["Thing", "City", "Person", "GreatCity"]
    scores = numpy.asarray(
        [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
    )
    spangroup = spancat._make_span_group(doc, indices, scores, labels)
    assert len(spangroup) == nr_results

    # first span is always the second token "London"
    assert spangroup[0].text == "London"
    assert spangroup[0].label_ == "City"
    assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)

    # second span depends on the number of positives that were allowed
    assert spangroup[1].text == "Greater London"
    if max_positive == 1:
        assert spangroup[1].label_ == "GreatCity"
        assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
    else:
        assert spangroup[1].label_ == "Thing"
        assert_almost_equal(0.8, spangroup.attrs["scores"][1], 5)
    if nr_results > 2:
        assert spangroup[2].text == "Greater London"
        if max_positive == 2:
            assert spangroup[2].label_ == "GreatCity"
            assert_almost_equal(0.9, spangroup.attrs["scores"][2], 5)
        else:
            assert spangroup[2].label_ == "City"
            assert_almost_equal(0.7, spangroup.attrs["scores"][2], 5)

    assert spangroup[-1].text == "Greater London"
    assert spangroup[-1].label_ == "GreatCity"
    assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)