Example #1
def test_matcher_pipe(nlp: Language) -> None:
    """It returns a stream of Doc objects."""
    doc_stream = (
        nlp.make_doc("test doc 1: Corvold"),
        nlp.make_doc("test doc 2: Prosh"),
    )
    matcher = FuzzyMatcher(nlp.vocab)
    output = matcher.pipe(doc_stream)
    assert list(output) == list(doc_stream)
Example #2
def test_matcher_pipe_with_context(nlp: Language) -> None:
    """It returns a stream of Doc objects as tuples with context."""
    doc_stream = (
        (nlp.make_doc("test doc 1: United States"), "Country"),
        (nlp.make_doc("test doc 2: US"), "Country"),
    )
    matcher = RegexMatcher(nlp.vocab)
    output = matcher.pipe(doc_stream, as_tuples=True)
    assert list(output) == list(doc_stream)
Example #3
def test_matcher_pipe_with_context(nlp: Language) -> None:
    """It returns a stream of Doc objects as tuples with context."""
    doc_stream = (
        (nlp.make_doc("test doc 1: Corvold"), "Jund"),
        (nlp.make_doc("test doc 2: Prosh"), "Jund"),
    )
    matcher = FuzzyMatcher(nlp.vocab)
    output = matcher.pipe(doc_stream, as_tuples=True)
    assert list(output) == list(doc_stream)
Example #4
def test_matcher_pipe(nlp: Language) -> None:
    """It returns a stream of Doc objects."""
    doc_stream = (
        nlp.make_doc("test doc 1: United States"),
        nlp.make_doc("test doc 2: US"),
    )
    matcher = RegexMatcher(nlp.vocab)
    output = matcher.pipe(doc_stream)
    assert list(output) == list(doc_stream)
Example #5
def test_matcher_pipe_with_matches(nlp: Language) -> None:
    """It returns a stream of Doc objects and matches as tuples."""
    doc_stream = (
        nlp.make_doc("test doc 1: United States"),
        nlp.make_doc("test doc 2: US"),
    )
    matcher = RegexMatcher(nlp.vocab)
    matcher.add("GPE", ["[Uu](nited|\\.?) ?[Ss](tates|\\.?)"])
    output = matcher.pipe(doc_stream, return_matches=True)
    matches = [entry[1] for entry in output]
    assert matches == [[("GPE", 4, 6)], [("GPE", 4, 5)]]
Example #6
def test_matcher_pipe_with_matches(nlp: Language) -> None:
    """It returns a stream of Doc objects and matches as tuples."""
    doc_stream = (
        nlp.make_doc("test doc 1: Corvold"),
        nlp.make_doc("test doc 2: Prosh"),
    )
    matcher = FuzzyMatcher(nlp.vocab)
    matcher.add("DRAGON", [nlp.make_doc("Korvold"), nlp.make_doc("Prossh")])
    output = matcher.pipe(doc_stream, return_matches=True)
    matches = [entry[1] for entry in output]
    assert matches == [[("DRAGON", 4, 5, 86)], [("DRAGON", 4, 5, 91)]]
Example #7
def test_matcher_pipe_with_matches_and_context(nlp: Language) -> None:
    """It returns a stream of Doc objects, matches, and context as a tuple."""
    doc_stream = (
        (nlp.make_doc("test doc 1: United States"), "Country"),
        (nlp.make_doc("test doc 2: US"), "Country"),
    )
    matcher = RegexMatcher(nlp.vocab)
    matcher.add("GPE", ["[Uu](nited|\\.?) ?[Ss](tates|\\.?)"])
    output = matcher.pipe(doc_stream, return_matches=True, as_tuples=True)
    matches = [(entry[0][1], entry[1]) for entry in output]
    assert matches == [([("GPE", 4, 6)], "Country"),
                       ([("GPE", 4, 5)], "Country")]
Example #8
def test_matcher_pipe_with_matches_and_context(nlp: Language) -> None:
    """It returns a stream of Doc objects and matches and context as tuples."""
    doc_stream = (
        (nlp.make_doc("test doc 1: Corvold"), "Jund"),
        (nlp.make_doc("test doc 2: Prosh"), "Jund"),
    )
    matcher = FuzzyMatcher(nlp.vocab)
    matcher.add("DRAGON", [nlp.make_doc("Korvold"), nlp.make_doc("Prossh")])
    output = matcher.pipe(doc_stream, return_matches=True, as_tuples=True)
    matches = [(entry[0][1], entry[1]) for entry in output]
    assert matches == [
        ([("DRAGON", 4, 5, 86)], "Jund"),
        ([("DRAGON", 4, 5, 91)], "Jund"),
    ]
Example #9
def main(use_gpu=False, nb_epoch=50):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb()
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)

    nlp = Language()

    # Hold out the last 1000 training examples as a development set.
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    train_X = [nlp.make_doc(x) for x in train_X]
    dev_X = [nlp.make_doc(x) for x in dev_X]

    model = build_model(2, 1)

    print("Begin training")
    with model.begin_training(train_X, train_y,
                              L2=1e-6) as (trainer, optimizer):
        epoch_loss = [0.]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], model.evaluate(dev_X, dev_y),
                      trainer.dropout)
            epoch_loss.append(0.)

        trainer.each_epoch.append(report_progress)
        trainer.nb_epoch = nb_epoch
        trainer.dropout = 0.0
        trainer.batch_size = 128
        trainer.dropout_decay = 0.0
        for X, y in trainer.iterate(train_X[:1000], train_y[:1000]):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            loss = ((yh - y)**2.).sum() / y.shape[0]
            backprop((yh - y) / y.shape[0], optimizer)
            epoch_loss[-1] += loss
        with model.use_params(optimizer.averages):
            print('Avg dev.: %.3f' % model.evaluate(dev_X, dev_y))
            with open('out.pickle', 'wb') as file_:
                pickle.dump(model, file_, -1)
Example #10
    def _extract_doc_matches(
        self,
        lang: Language,
        doc: Doc,
        keywords: Sequence[str],
        scores: Sequence[float],
    ) -> Dict[str, DocMatch]:
        """Extract and format info for all keywords in a given document.

        Sentence matching uses the 'LOWER' token attribute.
        """

        matcher = PhraseMatcher(lang.vocab, attr='LOWER')
        patterns = [lang.make_doc(str(kw)) for kw in keywords]
        matcher.add("Keywords", patterns)
        sents = self._extract_sentence_matches(doc,
                                               keywords,
                                               matcher,
                                               attr='LOWER')

        matches: Dict[str, DocMatch] = {
            kw: DocMatch(doc, kw, score, sents[kw])
            for kw, score in zip(keywords, scores)
        }

        return matches
Example #11
def test_beam_parse():
    nlp = Language()
    nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
    nlp.parser.add_label("nsubj")
    nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
    doc = nlp.make_doc("Australia is a country")
    nlp.parser(doc, beam_width=2)
Example #12
def corpus(nlp: Language):
    for original_example in original_examples:
        doc = nlp.make_doc(original_example[0])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            spacy_example = Example.from_dict(doc, original_example[1])
        yield spacy_example
Example #13
def test__adjust_left_right_positions_with_no_flex(searcher: FuzzySearcher,
                                                   nlp: Language) -> None:
    """It returns the intial match when flex value = 0."""
    doc = nlp.make_doc("Patient was prescribed Zithroma tablets.")
    query = nlp.make_doc("zithromax")
    match_values = {3: 94}
    assert searcher._adjust_left_right_positions(
        doc,
        query,
        match_values,
        pos=3,
        fuzzy_func="simple",
        min_r2=70,
        ignore_case=True,
        flex=0,
    ) == (3, 4, 94)
Example #14
def test__adjust_left_right_positions_finds_better_match(
        searcher: FuzzySearcher, nlp: Language) -> None:
    """It optimizes the initial match to find a better match."""
    doc = nlp.make_doc("Patient was prescribed Zithromax tablets.")
    query = nlp.make_doc("zithromax tablet")
    match_values = {0: 30, 2: 50, 3: 97, 4: 50}
    assert searcher._adjust_left_right_positions(
        doc,
        query,
        match_values,
        pos=3,
        fuzzy_func="simple",
        min_r2=70,
        ignore_case=True,
        flex=2,
    ) == (3, 5, 97)
Example #15
def test_implicit_label():
    nlp = Language()
    nlp.add_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    nlp.initialize(get_examples=lambda: train_examples)
Example #16
def test_beam_parse():
    nlp = Language()
    nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
    nlp.parser.add_label("nsubj")
    nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
    doc = nlp.make_doc("Australia is a country")
    nlp.parser(doc, beam_width=2)
Example #17
def test__calc_flex_warns_if_flex_longer_than_query(
    nlp: Language, searcher: FuzzySearcher
) -> None:
    """It provides UserWarning if flex > len(query)."""
    query = nlp.make_doc("Test query.")
    with pytest.warns(FlexWarning):
        searcher._calc_flex(query, 5)
Example #18
def test__scan_doc_with_no_matches(searcher: FuzzySearcher, nlp: Language,
                                   scan_example: Doc) -> None:
    """It returns None if no matches >= min_r1."""
    query = nlp.make_doc("xenomorph")
    assert (searcher._scan_doc(
        scan_example, query, fuzzy_func="simple", min_r1=30, ignore_case=True)
            is None)
Example #19
def main(use_gpu=False, nb_epoch=50):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb()
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)

    nlp = Language()

    # Hold out the last 1000 training examples as a development set.
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    train_X = [nlp.make_doc(x) for x in train_X]
    dev_X = [nlp.make_doc(x) for x in dev_X]

    model = build_model(2, 1)

    print("Begin training")
    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        epoch_loss = [0.0]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.0)

        trainer.each_epoch.append(report_progress)
        trainer.nb_epoch = nb_epoch
        trainer.dropout = 0.0
        trainer.batch_size = 128
        trainer.dropout_decay = 0.0
        for X, y in trainer.iterate(train_X[:1000], train_y[:1000]):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            loss = ((yh - y) ** 2.0).sum() / y.shape[0]
            backprop((yh - y) / y.shape[0], optimizer)
            epoch_loss[-1] += loss
        with model.use_params(optimizer.averages):
            print("Avg dev.: %.3f" % model.evaluate(dev_X, dev_y))
            with open("out.pickle", "wb") as file_:
                pickle.dump(model, file_, -1)
Example #20
def test__scan_doc_returns_all_matches_with_no_min_r1(
    searcher: FuzzySearcher, nlp: Language, scan_example: Doc
) -> None:
    """It returns all spans of len(query) in doc if min_r1 = 0."""
    query = nlp.make_doc("Shirley")
    assert searcher._scan_doc(
        scan_example, query, fuzzy_func="simple", min_r1=0, ignore_case=True
    ) == {0: 0, 1: 0, 2: 18, 3: 22, 4: 86}
Example #21
def test_beam_parse(examples, beam_width):
    nlp = Language()
    parser = nlp.add_pipe("beam_parser")
    parser.cfg["beam_width"] = beam_width
    parser.add_label("nsubj")
    parser.initialize(lambda: examples)
    doc = nlp.make_doc("Australia is a country")
    parser(doc)
Example #22
def matcher(nlp: Language) -> FuzzyMatcher:
    """Fuzzy matcher with patterns added."""
    animals = ["Heifer", "chicken"]
    sounds = ["mooo"]
    names = ["Steven"]
    matcher = FuzzyMatcher(nlp.vocab)
    matcher.add(
        "ANIMAL",
        [nlp.make_doc(animal) for animal in animals],
        kwargs=[{"ignore_case": False}, {}],
    )
    matcher.add("SOUND", [nlp.make_doc(sound) for sound in sounds])
    matcher.add("NAME", [nlp.make_doc(name) for name in names], on_match=add_name_ent)
    return matcher
Example #23
def test_error_with_multi_labels():
    nlp = Language()
    nlp.add_pipe("textcat")
    train_examples = []
    for text, annotations in TRAIN_DATA_MULTI_LABEL:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    with pytest.raises(ValueError):
        nlp.initialize(get_examples=lambda: train_examples)
Example #24
def test__scan_doc_returns_matches_over_min_r1(
    searcher: FuzzySearcher, nlp: Language, scan_example: Doc
) -> None:
    """It returns all spans of len(query) in doc if ratio >= min_r1."""
    query = nlp.make_doc("Shirley")
    assert searcher._scan_doc(
        scan_example, query, fuzzy_func="simple", min_r1=30, ignore_case=True
    ) == {4: 86}
Example #25
def test_implicit_labels():
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    assert len(spancat.labels) == 0
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.labels == ("PERSON", "LOC")
Example #26
def test_ner_labels_added_implicitly_on_update():
    nlp = Language()
    ner = nlp.add_pipe("ner")
    for label in ["A", "B", "C"]:
        ner.add_label(label)
    nlp.initialize()
    doc = Doc(nlp.vocab, words=["hello", "world"], ents=["B-D", "O"])
    example = Example(nlp.make_doc(doc.text), doc)
    assert "D" not in ner.labels
    nlp.update([example])
    assert "D" in ner.labels
Example #27
def test_factories_doc_cleaner():
    nlp = Language()
    nlp.add_pipe("doc_cleaner")
    doc = nlp.make_doc("text")
    doc.tensor = [1, 2, 3]
    doc = nlp(doc)
    assert doc.tensor is None

    nlp = Language()
    nlp.add_pipe("doc_cleaner", config={"silent": False})
    with pytest.warns(UserWarning):
        doc = nlp("text")

    Doc.set_extension("test_attr", default=-1)
    nlp = Language()
    nlp.add_pipe("doc_cleaner", config={"attrs": {"_.test_attr": 0}})
    doc = nlp.make_doc("text")
    doc._.test_attr = 100
    doc = nlp(doc)
    assert doc._.test_attr == 0
Example #28
def test_add_with_more_patterns_than_explicit_kwargs_warns(
        matcher: FuzzyMatcher, nlp: Language) -> None:
    """It will warn when more patterns are added than explicit kwargs."""
    with pytest.warns(KwargsWarning):
        matcher.add(
            "TEST",
            [nlp.make_doc("Test1"),
             nlp.make_doc("Test2")],
            [{
                "ignore_case": False
            }],
        )
Example #29
def simple_nlp():
    nlp = Language()
    nlp.add_pipe("transformer")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

    optimizer = nlp.initialize()
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    return nlp
Example #30
def test_initialize_examples():
    nlp = Language()
    nlp.add_pipe("senter")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.initialize()
    nlp.initialize(get_examples=lambda: train_examples)
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: None)
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=train_examples)
Example #31
def test_transformer_pipeline_simple():
    """Test that a simple pipeline with just a transformer at least runs"""
    nlp = Language()
    nlp.add_pipe("transformer")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

    optimizer = nlp.initialize()
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    doc = nlp("We're interested at underwater basket weaving.")
    assert doc
Example #32
def test_make_spangroup(max_positive, nr_results):
    fix_random_seed(0)
    nlp = Language()
    spancat = nlp.add_pipe(
        "spancat",
        config={
            "spans_key": SPAN_KEY,
            "threshold": 0.5,
            "max_positive": max_positive
        },
    )
    doc = nlp.make_doc("Greater London")
    ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(
        sizes=[1, 2])
    indices = ngram_suggester([doc])[0].dataXd
    assert_array_equal(OPS.to_numpy(indices),
                       numpy.asarray([[0, 1], [1, 2], [0, 2]]))
    labels = ["Thing", "City", "Person", "GreatCity"]
    scores = numpy.asarray(
        [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]],
        dtype="f")
    spangroup = spancat._make_span_group(doc, indices, scores, labels)
    assert len(spangroup) == nr_results

    # first span is always the second token "London"
    assert spangroup[0].text == "London"
    assert spangroup[0].label_ == "City"
    assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)

    # second span depends on the number of positives that were allowed
    assert spangroup[1].text == "Greater London"
    if max_positive == 1:
        assert spangroup[1].label_ == "GreatCity"
        assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
    else:
        assert spangroup[1].label_ == "Thing"
        assert_almost_equal(0.8, spangroup.attrs["scores"][1], 5)

    if nr_results > 2:
        assert spangroup[2].text == "Greater London"
        if max_positive == 2:
            assert spangroup[2].label_ == "GreatCity"
            assert_almost_equal(0.9, spangroup.attrs["scores"][2], 5)
        else:
            assert spangroup[2].label_ == "City"
            assert_almost_equal(0.7, spangroup.attrs["scores"][2], 5)

    assert spangroup[-1].text == "Greater London"
    assert spangroup[-1].label_ == "GreatCity"
    assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)