Example 1
def test_resize_same_results(name, textcat_config):
    # Ensure that the resized textcat classifiers still produce the same results for old labels
    fix_random_seed(0)
    nlp = English()
    pipe_config = {"model": textcat_config}
    textcat = nlp.add_pipe(name, config=pipe_config)

    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.maybe_get_dim("nO") in [2, None]

    for i in range(5):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # test the trained model before resizing
    test_text = "I am happy."
    doc = nlp(test_text)
    assert len(doc.cats) == 2
    pos_pred = doc.cats["POSITIVE"]
    neg_pred = doc.cats["NEGATIVE"]

    # test the trained model again after resizing
    textcat.add_label("NEUTRAL")
    doc = nlp(test_text)
    assert len(doc.cats) == 3
    assert doc.cats["POSITIVE"] == pos_pred
    assert doc.cats["NEGATIVE"] == neg_pred
    assert doc.cats["NEUTRAL"] <= 1

    for i in range(5):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # test the trained model again after training further with new label
    doc = nlp(test_text)
    assert len(doc.cats) == 3
    assert doc.cats["POSITIVE"] != pos_pred
    assert doc.cats["NEGATIVE"] != neg_pred
    for cat in doc.cats:
        assert doc.cats[cat] <= 1
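
Most of these snippets appear to come from spaCy's v3 test suite, so module-level imports, fixtures, and training data are not shown. Below is a minimal sketch of the context this first test (and Example 3, which reuses the same data) relies on; the import paths are standard spaCy v3, while the exact TRAIN_DATA_SINGLE_LABEL contents and the parametrization are assumptions based on the assertions above.

import pytest
from spacy.lang.en import English
from spacy.training import Example
from spacy.util import fix_random_seed

# Two-class, single-label data in the shape the POSITIVE/NEGATIVE assertions expect
TRAIN_DATA_SINGLE_LABEL = [
    ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]

# The test is parametrized over (name, textcat_config) pairs, for instance:
# @pytest.mark.parametrize("name,textcat_config", [
#     ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True,
#                  "ngram_size": 1, "no_output_layer": False}),
# ])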
Example 2
def test_get_span_characteristics_return_value():
    nlp = English()
    spans_key = "sc"

    pred = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")]
    ref = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
    eg = Example(pred, ref)

    examples = [eg]
    data = _compile_gold(examples, ["spancat"], nlp, True)
    span_characteristics = _get_span_characteristics(
        examples=examples, compiled_gold=data, spans_key=spans_key
    )

    assert {"sd", "bd", "lengths"}.issubset(span_characteristics.keys())
    assert span_characteristics["min_length"] == 1
    assert span_characteristics["max_length"] == 3
Example 3
def test_textcat_multi_threshold():
    # Ensure the scorer can be called with a different threshold
    nlp = English()
    nlp.add_pipe("textcat_multilabel")

    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    nlp.initialize(get_examples=lambda: train_examples)

    # score the model (it's not actually trained but that doesn't matter)
    scores = nlp.evaluate(train_examples)
    assert 0 <= scores["cats_score"] <= 1

    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 1.0})
    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 0

    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
Example 4
def test_split_sents(merged_dict):
    nlp = English()
    example = Example.from_dict(
        Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]),
        merged_dict,
    )
    assert example.text == "Hi there everyone It is just me"
    split_examples = example.split_sents()
    assert len(split_examples) == 2
    assert split_examples[0].text == "Hi there everyone "
    assert split_examples[1].text == "It is just me"
    token_annotation_1 = split_examples[0].to_dict()["token_annotation"]
    assert token_annotation_1["ORTH"] == ["Hi", "there", "everyone"]
    assert token_annotation_1["TAG"] == ["INTJ", "ADV", "PRON"]
    assert token_annotation_1["SENT_START"] == [1, 0, 0]
    token_annotation_2 = split_examples[1].to_dict()["token_annotation"]
    assert token_annotation_2["ORTH"] == ["It", "is", "just", "me"]
    assert token_annotation_2["TAG"] == ["PRON", "AUX", "ADV", "PRON"]
    assert token_annotation_2["SENT_START"] == [1, 0, 0, 0]
Example 5
def test_tok2vec_listener_callback():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    assert nlp.pipe_names == ["tok2vec", "tagger"]
    tagger = nlp.get_pipe("tagger")
    tok2vec = nlp.get_pipe("tok2vec")
    nlp._link_components()
    docs = [nlp.make_doc("A random sentence")]
    tok2vec.model.initialize(X=docs)
    gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
    label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")]
    tagger.model.initialize(X=docs, Y=label_sample)
    docs = [nlp.make_doc("Another entirely random sentence")]
    tok2vec.update([Example.from_dict(x, {}) for x in docs])
    Y, get_dX = tagger.model.begin_update(docs)
    # ensure that the backprop call works (and doesn't hit a 'None' callback)
    assert get_dX(Y) is not None
Example 6
def test_train_empty():
    """Test that training an empty text does not throw errors."""
    train_data = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
        ("", {"entities": []}),
    ]

    nlp = English()
    train_examples = []
    for t in train_data:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
    nlp.initialize()
    for itn in range(2):
        losses = {}
        batches = util.minibatch(train_examples, size=8)
        for batch in batches:
            nlp.update(batch, losses=losses)
Example 7
def test_train_negative_deprecated():
    """Test that the deprecated negative entity format raises a custom error."""
    train_data = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "!PERSON")]}),
    ]

    nlp = English()
    train_examples = []
    for t in train_data:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
    nlp.initialize()
    for itn in range(2):
        losses = {}
        batches = util.minibatch(train_examples, size=8)
        for batch in batches:
            with pytest.raises(ValueError):
                nlp.update(batch, losses=losses)
Example 8
def test_issue2800():
    """Test issue that arises when too many labels are added to NER model.
    Used to cause segfault.
    """
    nlp = English()
    train_data = []
    train_data.extend(
        [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
    )
    entity_types = [str(i) for i in range(1000)]
    ner = nlp.add_pipe("ner")
    for entity_type in list(entity_types):
        ner.add_label(entity_type)
    optimizer = nlp.initialize()
    for i in range(20):
        losses = {}
        random.shuffle(train_data)
        for example in train_data:
            nlp.update([example], sgd=optimizer, losses=losses, drop=0.5)
Example 9
def test_issue4030():
    """Test whether textcat works fine with empty doc"""
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")
    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(
            Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
    # add a text categorizer component
    model = {
        "@architectures": "spacy.TextCatBOW.v1",
        "exclusive_classes": True,
        "ngram_size": 2,
        "no_output_layer": False,
    }
    textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
    for label in unique_classes:
        textcat.add_label(label)
    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.initialize()
        for i in range(3):
            losses = {}
            batches = util.minibatch(train_data,
                                     size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(examples=batch,
                           sgd=optimizer,
                           drop=0.1,
                           losses=losses)
    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0
Example 10
def test_initialize_examples():
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    train_examples = []
    for tag in TAGS:
        tagger.add_label(tag)
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.initialize()
    nlp.initialize(get_examples=lambda: train_examples)
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: None)
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: train_examples[0])
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: [])
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=train_examples)
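
TAGS and TRAIN_DATA are module-level constants of the tagger test file. A minimal sketch consistent with the training data that reappears inline in Example 28 below; the real constants may differ.

from spacy.language import Language
from spacy.training import Example

TAGS = ("N", "V", "J")

TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
]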
Example 11
def test_incomplete_data():
    # Test that the lemmatizer works with incomplete information
    nlp = English()
    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
    lemmatizer.min_tree_freq = 1
    train_examples = []
    for t in PARTIAL_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["trainable_lemmatizer"] < 0.00001

    # test the trained model
    test_text = "She likes blue eggs"
    doc = nlp(test_text)
    assert doc[1].lemma_ == "like"
    assert doc[2].lemma_ == "blue"
Example 12
def test_tokenization(sented_doc):
    scorer = Scorer()
    gold = {"sent_starts": [t.sent_start for t in sented_doc]}
    example = Example.from_dict(sented_doc, gold)
    scores = scorer.score([example])
    assert scores["token_acc"] == 1.0

    nlp = English()
    example.predicted = Doc(
        nlp.vocab,
        words=["One", "sentence.", "Two", "sentences.", "Three", "sentences."],
        spaces=[True, True, True, True, True, False],
    )
    example.predicted[1].is_sent_start = False
    scores = scorer.score([example])
    assert scores["token_acc"] == approx(0.66666666)
    assert scores["token_p"] == 0.5
    assert scores["token_r"] == approx(0.33333333)
    assert scores["token_f"] == 0.4
Example 13
def test_annotates_on_update():
    # The custom component checks for sentence annotation
    @Language.factory("assert_sents", default_config={})
    def assert_sents(nlp, name):
        return AssertSents(name)

    class AssertSents:
        def __init__(self, name, **cfg):
            self.name = name

        def __call__(self, doc):
            if not doc.has_annotation("SENT_START"):
                raise ValueError("No sents")
            return doc

        def update(self, examples, *, drop=0.0, sgd=None, losses=None):
            for example in examples:
                if not example.predicted.has_annotation("SENT_START"):
                    raise ValueError("No sents")
            return {}

    nlp = English()
    nlp.add_pipe("sentencizer")
    nlp.add_pipe("assert_sents")

    # When the pipeline runs, annotations are set
    nlp("This is a sentence.")

    examples = []
    for text in ["a a", "b b", "c c"]:
        examples.append(Example(nlp.make_doc(text), nlp(text)))

    for example in examples:
        assert not example.predicted.has_annotation("SENT_START")

    # If updating without setting annotations, assert_sents will raise an error
    with pytest.raises(ValueError):
        nlp.update(examples)

    # Updating while setting annotations for the sentencizer succeeds
    nlp.update(examples, annotates=["sentencizer"])
Example 14
def test_beam_valid_parse(neg_key):
    """Regression test for previously flakey behaviour"""
    nlp = English()
    beam_width = 16
    beam_density = 0.0001
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
        "incorrect_spans_key": neg_key,
    }
    nlp.add_pipe("beam_ner", config=config)
    # fmt: off
    tokens = [
        'FEDERAL', 'NATIONAL', 'MORTGAGE', 'ASSOCIATION', '(', 'Fannie', 'Mae',
        '):', 'Posted', 'yields', 'on', '30', 'year', 'mortgage',
        'commitments', 'for', 'delivery', 'within', '30', 'days', '(',
        'priced', 'at', 'par', ')', '9.75', '%', ',', 'standard',
        'conventional', 'fixed', '-', 'rate', 'mortgages', ';', '8.70', '%',
        ',', '6/2', 'rate', 'capped', 'one', '-', 'year', 'adjustable', 'rate',
        'mortgages', '.', 'Source', ':', 'Telerate', 'Systems', 'Inc.'
    ]
    iob = [
        'B-ORG', 'I-ORG', 'I-ORG', 'L-ORG', 'O', 'B-ORG', 'L-ORG', 'O', 'O',
        'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-DATE',
        'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O',
        'U-CARDINAL', 'O', 'O', 'B-DATE', 'I-DATE', 'L-DATE', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O'
    ]
    # fmt: on

    doc = Doc(nlp.vocab, words=tokens)
    example = Example.from_dict(doc, {"ner": iob})
    neg_span = Span(doc, 50, 53, "ORG")
    example.reference.spans[neg_key] = [neg_span]

    optimizer = nlp.initialize()

    for i in range(5):
        losses = {}
        nlp.update([example], sgd=optimizer, losses=losses)
    assert "beam_ner" in losses
Example 15
def test_beam_overfitting_IO():
    # Simple test to try to quickly overfit the Beam NER component
    nlp = English()
    beam_width = 16
    beam_density = 0.0001
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
    }
    ner = nlp.add_pipe("beam_ner", config=config)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()

    # run overfitting
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["beam_ner"] < 0.0001

    # test the scores from the beam
    test_text = "I like London."
    docs = [nlp.make_doc(test_text)]
    beams = ner.predict(docs)
    entity_scores = ner.scored_ents(beams)[0]
    assert entity_scores[(2, 3, "LOC")] == 1.0
    assert entity_scores[(2, 3, "PERSON")] == 0.0

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        docs2 = [nlp2.make_doc(test_text)]
        ner2 = nlp2.get_pipe("beam_ner")
        beams2 = ner2.predict(docs2)
        entity_scores2 = ner2.scored_ents(beams2)[0]
        assert entity_scores2[(2, 3, "LOC")] == 1.0
        assert entity_scores2[(2, 3, "PERSON")] == 0.0
Example 16
def test_update_with_annotates():
    name = "test_with_annotates"
    results = {}

    def make_component(name):
        results[name] = ""

        def component(doc):
            nonlocal results
            results[name] += doc.text
            return doc

        return component

    Language.component(f"{name}1", func=make_component(f"{name}1"))
    Language.component(f"{name}2", func=make_component(f"{name}2"))

    components = set([f"{name}1", f"{name}2"])

    nlp = English()
    texts = ["a", "bb", "ccc"]
    examples = []
    for text in texts:
        examples.append(Example(nlp.make_doc(text), nlp.make_doc(text)))

    for components_to_annotate in [
        [],
        [f"{name}1"],
        [f"{name}1", f"{name}2"],
        [f"{name}2", f"{name}1"],
    ]:
        for key in results:
            results[key] = ""
        nlp = English(vocab=nlp.vocab)
        nlp.add_pipe(f"{name}1")
        nlp.add_pipe(f"{name}2")
        nlp.update(examples, annotates=components_to_annotate)
        for component in components_to_annotate:
            assert results[component] == "".join(eg.predicted.text
                                                 for eg in examples)
        for component in components - set(components_to_annotate):
            assert results[component] == ""
Example 17
    async def train(self, sources: Sources):
        train_examples = await self._preprocess_data(sources)
        for _, entities in train_examples:
            for ent in entities.get("entities"):
                self.ner.add_label(ent[2])

        # get names of other pipes to disable them during training
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in self.nlp.pipe_names if pipe not in pipe_exceptions
        ]
        # only train NER
        with self.nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
            # show warnings for misaligned entity spans once
            warnings.filterwarnings("once",
                                    category=UserWarning,
                                    module="spacy")
            if self.parent.config.model_name_or_path is None:
                self.nlp.begin_training()
            for itn in range(self.parent.config.n_iter):
                random.shuffle(train_examples)
                losses = {}
                batches = minibatch(train_examples,
                                    size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    examples = []
                    for doc, gold_dict in batch:
                        doc = self.nlp.make_doc(doc)
                        examples.append(Example.from_dict(doc, gold_dict))
                    self.nlp.update(
                        examples,
                        drop=self.parent.config.dropout,
                        losses=losses,
                    )
                self.logger.debug(f"Losses: {losses}")

        if self.parent.config.directory is not None:
            if not self.parent.config.directory.exists():
                self.parent.config.directory.mkdir(parents=True)
            self.nlp.to_disk(self.parent.config.directory)
            self.logger.debug(
                f"Saved model to {self.parent.config.directory.name}")
Example 18
def parser(vocab):
    vocab.strings.add("ROOT")
    cfg = {"model": DEFAULT_PARSER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    parser = DependencyParser(vocab, model)
    parser.cfg["token_vector_width"] = 4
    parser.cfg["hidden_width"] = 32
    # parser.add_label('right')
    parser.add_label("left")
    parser.initialize(lambda: [_parser_example(parser)])
    sgd = Adam(0.001)

    for i in range(10):
        losses = {}
        doc = Doc(vocab, words=["a", "b", "c", "d"])
        example = Example.from_dict(
            doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
        )
        parser.update([example], sgd=sgd, losses=losses)
    return parser
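
This fixture depends on a few module-level pieces that are not shown: the default parser model, an optimizer, and a small _parser_example helper. A sketch under the assumption that the helper simply mirrors the annotations used inside the fixture; the import paths follow standard spaCy v3 and Thinc conventions.

from thinc.api import Adam
from spacy import registry
from spacy.pipeline import DependencyParser
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy.tokens import Doc
from spacy.training import Example

def _parser_example(parser):
    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
    gold = {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
    return Example.from_dict(doc, gold)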
Example 19
def test_negative_samples_two_word_input(tsys, vocab, neg_key):
    """Test that we don't get stuck in a two word input when we have a negative
    span. This could happen if we don't have the right check on the B action.
    """
    tsys.cfg["neg_key"] = neg_key
    doc = Doc(vocab, words=["A", "B"])
    entity_annots = [None, None]
    example = Example.from_dict(doc, {"entities": entity_annots})
    # These mean that the oracle sequence shouldn't have O for the first
    # word, and it shouldn't analyse it as B-PERSON, L-PERSON
    example.y.spans[neg_key] = [
        Span(example.y, 0, 1, label="O"),
        Span(example.y, 0, 2, label="PERSON"),
    ]
    act_classes = tsys.get_oracle_sequence(example)
    names = [tsys.get_class_name(act) for act in act_classes]
    assert names
    assert names[0] != "O"
    assert names[0] != "B-PERSON"
    assert names[1] != "L-PERSON"
Example 20
def test_replace_listeners_from_config():
    orig_config = Config().from_str(cfg_string_multi)
    nlp = util.load_model_from_config(orig_config, auto_fill=True)
    annots = {"tags": ["V", "Z"], "entities": [(0, 1, "A"), (1, 2, "B")]}
    examples = [Example.from_dict(nlp.make_doc("x y"), annots)]
    nlp.initialize(lambda: examples)
    tok2vec = nlp.get_pipe("tok2vec")
    tagger = nlp.get_pipe("tagger")
    ner = nlp.get_pipe("ner")
    assert tok2vec.listening_components == ["tagger", "ner"]
    assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk())
    assert any(isinstance(node, Tok2VecListener) for node in tagger.model.walk())
    with make_tempdir() as dir_path:
        nlp.to_disk(dir_path)
        base_model = str(dir_path)
        new_config = {
            "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]},
            "components": {
                "tok2vec": {"source": base_model},
                "tagger": {
                    "source": base_model,
                    "replace_listeners": ["model.tok2vec"],
                },
                "ner": {"source": base_model},
            },
        }
        new_nlp = util.load_model_from_config(new_config, auto_fill=True)
    new_nlp.initialize(lambda: examples)
    tok2vec = new_nlp.get_pipe("tok2vec")
    tagger = new_nlp.get_pipe("tagger")
    ner = new_nlp.get_pipe("ner")
    assert tok2vec.listening_components == ["ner"]
    assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk())
    assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk())
    t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"]
    assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
    assert new_nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg
    assert (
        new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"]
        == "spacy.Tok2VecListener.v1"
    )
Example 21
def test_oracle_moves_whitespace(en_vocab):
    words = [
        "production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s",
        "radar"
    ]
    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]

    doc = Doc(en_vocab, words=words)
    example = Example.from_dict(doc, {"entities": biluo_tags})

    moves = BiluoPushDown(en_vocab.strings)
    move_types = ("M", "B", "I", "L", "U", "O")
    for tag in biluo_tags:
        if tag is None:
            continue
        elif tag == "O":
            moves.add_action(move_types.index("O"), "")
        else:
            action, label = tag.split("-")
            moves.add_action(move_types.index(action), label)
    moves.get_oracle_sequence(example)
Example 22
def test_oracle_moves_missing_B(en_vocab):
    words = ["B", "52", "Bomber"]
    biluo_tags = [None, None, "L-PRODUCT"]

    doc = Doc(en_vocab, words=words)
    example = Example.from_dict(doc, {"words": words, "entities": biluo_tags})

    moves = BiluoPushDown(en_vocab.strings)
    move_types = ("M", "B", "I", "L", "U", "O")
    for tag in biluo_tags:
        if tag is None:
            continue
        elif tag == "O":
            moves.add_action(move_types.index("O"), "")
        else:
            action, label = tag.split("-")
            moves.add_action(move_types.index("B"), label)
            moves.add_action(move_types.index("I"), label)
            moves.add_action(move_types.index("L"), label)
            moves.add_action(move_types.index("U"), label)
    moves.get_oracle_sequence(example)
Example 23
def test_model_config_inline(model):
    nlp = spacy.load("en_core_web_sm")
    conf = {"sklearn_model": model, "label": "pos", "classes": ["pos", "neg"]}
    nlp.add_pipe("sklearn-cat", config=conf)

    texts = [
        "you are a nice person", "this is a great movie",
        "i do not like coffee"
    ]
    labels = ["pos", "pos", "neg"]

    with nlp.select_pipes(enable="sklearn-cat"):
        optimizer = nlp.resume_training()
        for itn in range(100):
            for t, lab in zip(texts, labels):
                doc = nlp.make_doc(t)
                example = Example.from_dict(doc, {"cats": {"pos": lab}})
                nlp.update([example], sgd=optimizer)

    assert len(nlp("you are a nice person").cats.keys()) > 0
    assert len(nlp("coffee i do not like").cats.keys()) > 0
Example 24
def test_debug_data_compile_gold_for_spans():
    nlp = English()
    spans_key = "sc"

    pred = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")]
    ref = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
    eg = Example(pred, ref)

    data = _compile_gold([eg], ["spancat"], nlp, True)

    assert data["spancat"][spans_key] == Counter({"ORG": 1, "GPE": 1})
    assert data["spans_length"][spans_key] == {"ORG": [3], "GPE": [1]}
    assert data["spans_per_type"][spans_key] == {
        "ORG": [Span(ref, 3, 6, "ORG")],
        "GPE": [Span(ref, 5, 6, "GPE")],
    }
    assert data["sb_per_type"][spans_key] == {
        "ORG": {"start": [ref[2:3]], "end": [ref[6:7]]},
        "GPE": {"start": [ref[4:5]], "end": [ref[6:7]]},
    }
Example 25
def spacy_model_with_data():
    # Create a blank model and set up the spaCy pipeline
    nlp = spacy.blank("en")
    if IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0:
        from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL

        model = {
            "@architectures": "spacy.TextCatCNN.v1",
            "exclusive_classes": True,
            "tok2vec": DEFAULT_TOK2VEC_MODEL,
        }
        textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
    else:
        textcat = nlp.create_pipe("textcat",
                                  config={
                                      "exclusive_classes": True,
                                      "architecture": "simple_cnn"
                                  })
        nlp.add_pipe(textcat, last=True)

    # Train the model to distinguish between computer graphics and baseball in the 20 newsgroups dataset
    categories = ["comp.graphics", "rec.sport.baseball"]
    for cat in categories:
        textcat.add_label(cat)

    # Split train/test and train the model
    train_x, train_y, test_x, _ = _get_train_test_dataset(categories)
    train_data = list(zip(train_x, [{"cats": cats} for cats in train_y]))

    if IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0:
        from spacy.training import Example

        train_data = [
            Example.from_dict(nlp.make_doc(text), cats)
            for text, cats in train_data
        ]

    _train_model(nlp, train_data)
    return ModelWithData(nlp, pd.DataFrame(test_x))
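
_get_train_test_dataset and _train_model are helpers belonging to the surrounding test module and are not shown. A rough, illustrative sketch of the training helper for the spaCy v3 branch only, assuming train_data is already a list of Example objects; this is not the original implementation.

import random
from spacy.util import minibatch

def _train_model(nlp, train_data, n_iter=5):
    optimizer = nlp.initialize(lambda: train_data)
    for _ in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        for batch in minibatch(train_data, size=8):
            nlp.update(batch, sgd=optimizer, losses=losses)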
Example 26
def test_attributeruler_score(nlp, pattern_dicts):
    # initialize with patterns
    ruler = nlp.add_pipe("attribute_ruler")
    ruler.initialize(lambda: [], patterns=pattern_dicts)
    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"
    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
    doc = nlp.make_doc("This is a test.")
    dev_examples = [
        Example.from_dict(doc, {"lemmas": ["this", "is", "a", "cat", "."]})
    ]
    scores = nlp.evaluate(dev_examples)
    # "cat" is the only correct lemma
    assert scores["lemma_acc"] == pytest.approx(0.2)
    # no morphs are set
    assert scores["morph_acc"] is None
    nlp.remove_pipe("attribute_ruler")

    # test with custom scorer
    @registry.misc("weird_scorer.v1")
    def make_weird_scorer():
        def weird_scorer(examples, weird_score, **kwargs):
            return {"weird_score": weird_score}

        return weird_scorer

    ruler = nlp.add_pipe("attribute_ruler",
                         config={"scorer": {
                             "@misc": "weird_scorer.v1"
                         }})
    ruler.initialize(lambda: [], patterns=pattern_dicts)
    scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
    assert scores["weird_score"] == 0.12345
    assert "token_acc" in scores
    assert "lemma_acc" not in scores
    scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
    assert scores["weird_score"] == 0.23456
Example 27
def test_incomplete_data(pipe_name):
    # Test that the parser works with incomplete information
    nlp = English()
    parser = nlp.add_pipe(pipe_name)
    train_examples = []
    for text, annotations in PARTIAL_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            if dep is not None:
                parser.add_label(dep)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(150):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses[pipe_name] < 0.0001

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    assert doc[0].dep_ == "nsubj"
    assert doc[2].dep_ == "dobj"
    assert doc[0].head.i == 1
    assert doc[2].head.i == 1
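
PARTIAL_DATA here is the parser test's own module-level constant (unrelated to the lemmatizer data in Example 11): partially annotated dependencies, with None marking tokens that carry no gold head or label. A hedged sketch consistent with the assertions above, not necessarily the original data.

PARTIAL_DATA = [
    (
        "I like London.",
        {"heads": [1, 1, 1, None], "deps": ["nsubj", "ROOT", "dobj", None]},
    ),
]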
Example 28
def test_issue7029():
    """Test that an empty document doesn't mess up an entire batch."""
    TRAIN_DATA = [
        ("I like green eggs", {
            "tags": ["N", "V", "J", "N"]
        }),
        ("Eat blue ham", {
            "tags": ["V", "J", "N"]
        }),
    ]
    nlp = English.from_config(load_config_from_str(CONFIG_7029))
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]
    docs1 = list(nlp.pipe(texts, batch_size=1))
    docs2 = list(nlp.pipe(texts, batch_size=4))
    assert [doc[0].tag_
            for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]
Example 29
def test_aligned_spans_y2x(en_vocab, en_tokenizer):
    words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
    spaces = [True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    prefix = "Mr and Mrs Smith flew to "
    entities = [
        (0, len("Mr and Mrs Smith"), "PERSON"),
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    # fmt: off
    tokens_ref = [
        "Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco",
        "Valley", "."
    ]
    # fmt: on
    example = Example.from_dict(doc, {
        "words": tokens_ref,
        "entities": entities
    })
    ents_ref = example.reference.ents
    assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
    ents_y2x = example.get_aligned_spans_y2x(ents_ref)
    assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]
Example 30
def spacy_model() -> spacy.language.Language:
    examples: t.List[t.Any] = []
    model = spacy.blank("en")
    if "ner" not in model.pipe_names:
        ner = model.add_pipe("ner", last=True)
    else:
        ner = model.get_pipe("ner")

    for text, annotations in train_data:
        examples.append(Example.from_dict(model.make_doc(text), annotations))  # noqa
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in model.pipe_names if pipe != "ner"]

    with model.disable_pipes(*other_pipes):
        optimizer = model.begin_training()
        for _ in range(10):
            random.shuffle(examples)
            for batch in minibatch(examples, size=8):
                model.update(batch, sgd=optimizer)

    return model
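
As in Example 6, train_data is assumed to be a module-level list of (text, {"entities": [(start_char, end_char, label)]}) pairs. A minimal sketch of that shape together with the imports this fixture relies on; the second sentence is illustrative only.

import random
import typing as t

import spacy
from spacy.training import Example
from spacy.util import minibatch

train_data = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]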