Example #1
def test_factories_merge_noun_chunks(doc):
    assert len(doc) == 7
    nlp = Language()
    merge_noun_chunks = nlp.create_pipe("merge_noun_chunks")
    merge_noun_chunks(doc)
    assert len(doc) == 6
    assert doc[2].text == "New York"
Example #2
def test_factories_merge_ents(doc):
    assert len(doc) == 7
    assert len(list(doc.ents)) == 1
    nlp = Language()
    merge_entities = nlp.create_pipe("merge_entities")
    merge_entities(doc)
    assert len(doc) == 6
    assert len(list(doc.ents)) == 1
    assert doc[2].text == "New York"
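These two tests rely on a doc fixture that is not shown. A minimal sketch of what it presumably provides (the exact sentence, dependencies and POS tags here are assumptions; the tests only need a 7-token doc in which "New York" is both a noun chunk and the single entity), using spaCy v3's Doc keyword arguments:

import pytest
from spacy.lang.en import English
from spacy.tokens import Doc

@pytest.fixture
def doc():
    vocab = English().vocab
    words = ["I", "like", "New", "York", "in", "Autumn", "."]
    heads = [1, 1, 3, 1, 1, 4, 1]
    deps = ["nsubj", "ROOT", "compound", "dobj", "prep", "pobj", "punct"]
    pos = ["PRON", "VERB", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
    ents = ["O", "O", "B-GPE", "I-GPE", "O", "O", "O"]
    return Doc(vocab, words=words, heads=heads, deps=deps, pos=pos, ents=ents)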
Example #3
def test_list_of_docs_pickles_efficiently():
    nlp = Language()
    for i in range(10000):
        _ = nlp.vocab[unicode_(i)]  # noqa: F841
    one_pickled = pickle.dumps(nlp("0"), -1)
    docs = list(nlp.pipe(unicode_(i) for i in range(100)))
    many_pickled = pickle.dumps(docs, -1)
    assert len(many_pickled) < (len(one_pickled) * 2)
    many_unpickled = pickle.loads(many_pickled)
    assert many_unpickled[0].text == "0"
    assert many_unpickled[-1].text == "99"
    assert len(many_unpickled) == 100
Example #4
def test_issue1654():
    nlp = Language(Vocab())
    assert not nlp.pipeline
    nlp.add_pipe(lambda doc: doc, name="1")
    nlp.add_pipe(lambda doc: doc, name="2", after="1")
    nlp.add_pipe(lambda doc: doc, name="3", after="2")
    assert nlp.pipe_names == ["1", "2", "3"]
    nlp2 = Language(Vocab())
    assert not nlp2.pipeline
    nlp2.add_pipe(lambda doc: doc, name="3")
    nlp2.add_pipe(lambda doc: doc, name="2", before="3")
    nlp2.add_pipe(lambda doc: doc, name="1", before="2")
    assert nlp2.pipe_names == ["1", "2", "3"]
Example #5
def test_issue1915():
    cfg = {"hidden_depth": 2}  # should error out
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.get_pipe("ner").add_label("answer")
    with pytest.raises(ValueError):
        nlp.begin_training(**cfg)
Example #6
def test_issue1967(label):
    nlp = Language()
    config = {}
    ner = nlp.create_pipe("ner", config=config)
    example = Example.from_dict(
        Doc(ner.vocab, words=["word"]),
        {
            "ids": [0],
            "words": ["word"],
            "tags": ["tag"],
            "heads": [0],
            "deps": ["dep"],
            "entities": [label],
        },
    )
    assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1]
Example #7
def test_textcat_learns_multilabel():
    random.seed(5)
    numpy.random.seed(5)
    docs = []
    nlp = Language()
    letters = ["a", "b", "c"]
    for w1 in letters:
        for w2 in letters:
            cats = {letter: float(w2 == letter) for letter in letters}
            docs.append((Doc(nlp.vocab,
                             words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
    random.shuffle(docs)
    textcat = TextCategorizer(nlp.vocab, width=8)
    for letter in letters:
        textcat.add_label(letter)
    optimizer = textcat.initialize(lambda: [])
    for i in range(30):
        losses = {}
        examples = [
            Example.from_dict(doc, {"cats": cats}) for doc, cats in docs
        ]
        textcat.update(examples, sgd=optimizer, losses=losses)
        random.shuffle(docs)
    for w1 in letters:
        for w2 in letters:
            doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
            truth = {letter: w2 == letter for letter in letters}
            textcat(doc)
            for cat, score in doc.cats.items():
                if not truth[cat]:
                    assert score < 0.5
                else:
                    assert score > 0.5
Example #8
    def _extract_doc_matches(
        self,
        lang: Language,
        doc: Doc,
        keywords: Sequence[str],
        scores: Sequence[float],
    ) -> Dict[str, DocMatch]:
        '''
        Extract and format match info for every keyword in a given document.
        Matching uses the 'LOWER' token attribute, i.e. it is case-insensitive.
        '''

        matcher = PhraseMatcher(lang.vocab, attr='LOWER')
        patterns = [lang.make_doc(str(kw)) for kw in keywords]
        matcher.add("Keywords", patterns)
        sents = self._extract_sentence_matches(doc,
                                               keywords,
                                               matcher,
                                               attr='LOWER')

        matches: Dict[str, DocMatch] = {
            kw: DocMatch(doc, kw, score, sents[kw])
            for kw, score in zip(keywords, scores)
        }

        return matches
Example #9
def test_pipe_factories_config_excludes_nlp():
    """Test that the extra values we temporarily add to component config
    blocks/functions are removed and not copied around.
    """
    name = "test_pipe_factories_config_excludes_nlp"
    func = lambda nlp, name: lambda doc: doc
    Language.factory(name, func=func)
    config = {
        "nlp": {"lang": "en", "pipeline": [name]},
        "components": {name: {"factory": name}},
    }
    nlp = English.from_config(config)
    assert nlp.pipe_names == [name]
    pipe_cfg = nlp.get_pipe_config(name)
    assert pipe_cfg == {"factory": name}
    assert nlp._pipe_configs[name] == {"factory": name}
Example #10
def test_user_data_from_disk():
    nlp = Language()
    doc = nlp("Hello")
    doc.user_data[(0, 1)] = False
    b = doc.to_bytes()
    doc2 = doc.__class__(doc.vocab).from_bytes(b)
    assert doc2.user_data[(0, 1)] is False
Example #11
def corpus(nlp: Language):
    for original_example in original_examples:
        doc = nlp.make_doc(original_example[0])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            spacy_example = Example.from_dict(doc, original_example[1])
        yield spacy_example
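The generator above iterates over original_examples, which is not shown; it is assumed to be a sequence of (text, annotations) pairs in the dictionary format accepted by Example.from_dict, for instance:

original_examples = [
    ("Berlin is a city", {"entities": [(0, 6, "LOC")]}),
    ("I like New York", {"entities": [(7, 15, "GPE")]}),
]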
Example #12
def test_user_data_unpickles():
    nlp = Language()
    doc = nlp("Hello")
    doc.user_data[(0, 1)] = False
    b = pickle.dumps(doc)
    doc2 = pickle.loads(b)
    assert doc2.user_data[(0, 1)] is False
Example #13
def get_smiles_language():
    """
    Get SMILES language.
    
    Returns:
        a spacy.language.Language representing SMILES.
    """
    valid_values = list(filter(lambda k: k != PADDING_ATOM, ATOM_MAPPING.keys()))
    vocabulary = Vocab(strings=valid_values)
    def make_doc(smiles):
        """
        Make a SMILES document.

        Arguments:
            smiles (str): a SMILES representing a molecule.
        Returns:
            a spacy.tokens.Doc representing the molecule.
        """
        if len(smiles) == 0:
            # fall back to a single random atom token for empty input
            tokens = [np.random.choice(valid_values)]
        else:
            tokens = [
                token
                for token in ATOM_REGEX.split(smiles)
                if token
            ][:MAX_LENGTH]
        return Doc(vocabulary, words=tokens, spaces=[False]*len(tokens))
    return Language(vocabulary, make_doc)
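A brief usage sketch for the returned object, assuming the spaCy 2-style Language(vocab, make_doc) constructor used above and a suitable ATOM_REGEX:

smiles_nlp = get_smiles_language()
doc = smiles_nlp("CCO")  # ethanol
print([token.text for token in doc])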
Example #14
def test_add_pipe(nlp: Language):
    """
Works as a pipeline component and can be disabled.
    """
    # given
    nlp.add_pipe("topicrank", last=True)

    # works as a pipeline component
    # when
    text = "linear constraints over the"
    doc = nlp(text)
    phrases = [p.text for p in doc._.phrases]

    # then
    assert len(doc._.phrases) > 0
    assert any(map(lambda x: "constraints" in x, phrases))

    # identifies phrases not in noun chunks
    # when
    text = """\
everything you need to know about student loan interest rates, variable \
and fixed rates, capitalization, amortization, student loan refinancing \
and more.\
"""

    doc = nlp(text)
    phrases = [p.text for p in doc._.phrases]

    # then
    assert len(doc._.phrases) >= 2

    # resolves Py 3.5 dict KeyError
    # when
    text = "linear constraints over the set of natural numbers"
    doc = nlp(text)
    phrases = [p.text for p in doc._.phrases]

    # then
    assert any(map(lambda x: "constraints" in x, phrases))

    # pipeline can be disabled
    # when
    with nlp.select_pipes(disable=["topicrank"]):
        doc = nlp(text)

        # then
        assert len(doc._.phrases) == 0
Example #15
def test_ner_warns_no_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list
Example #16
    def from_dir(cls, tag_map, model_dir):
        vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)

        cfg = Config.read(path.join(model_dir, 'deps'), 'config')
        parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
        return cls(vocab, tokenizer, tagger, parser)
Example #17
def matcher(nlp: Language) -> FuzzyMatcher:
    """Fuzzy matcher with patterns added."""
    animals = ["Heifer", "chicken"]
    sounds = ["mooo"]
    names = ["Steven"]
    matcher = FuzzyMatcher(nlp.vocab)
    matcher.add(
        "ANIMAL",
        [nlp.make_doc(animal) for animal in animals],
        kwargs=[{"ignore_case": False}, {}],
    )
    matcher.add("SOUND", [nlp.make_doc(sound) for sound in sounds])
    matcher.add("NAME", [nlp.make_doc(name) for name in names],
                on_match=add_name_ent)
    return matcher
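The add_name_ent callback passed to on_match above is not shown; a hypothetical sketch using the (matcher, doc, i, matches) callback signature:

from spacy.tokens import Span

def add_name_ent(matcher, doc, i, matches):
    # label the i-th match as a NAME entity on the doc
    match_id, start, end = matches[i][:3]
    entity = Span(doc, start, end, label="NAME")
    doc.ents = list(doc.ents) + [entity]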
Example #18
def vocab():
    vocab = Vocab(Language.default_lex_attrs())
    lex = vocab['dog']
    assert vocab[vocab.strings['dog']].orth_ == 'dog'
    lex = vocab['the']
    lex = vocab['quick']
    lex = vocab['jumped']
    return vocab
Example #19
def test__scan_doc_returns_matches_over_min_r1(
    searcher: FuzzySearcher, nlp: Language, scan_example: Doc
) -> None:
    """It returns all spans of len(query) in doc if ratio >= min_r1."""
    query = nlp.make_doc("Shirley")
    assert searcher._scan_doc(
        scan_example, query, fuzzy_func="simple", min_r1=30, ignore_case=True
    ) == {4: 86}
Example #20
def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]

    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for itn in range(100):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            nlp.update([raw_text], [{"entities": entity_offsets}])

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            if (start, end) in ents:
                assert ents[(start, end)] == label
                break
        else:
            if entity_offsets:
                raise Exception(ents)
Example #21
def test__adjust_left_right_positions_finds_better_match(
    searcher: FuzzySearcher, nlp: Language
) -> None:
    """It optimizes the initial match to find a better match."""
    doc = nlp.make_doc("Patient was prescribed Zithromax tablets.")
    query = nlp.make_doc("zithromax tablet")
    match_values = {0: 30, 2: 50, 3: 97, 4: 50}
    assert searcher._adjust_left_right_positions(
        doc,
        query,
        match_values,
        pos=3,
        fuzzy_func="simple",
        min_r2=70,
        ignore_case=True,
        flex=2,
    ) == (3, 5, 97)
Example #22
def test__adjust_left_right_positions_with_no_flex(
    searcher: FuzzySearcher, nlp: Language
) -> None:
    """It returns the intial match when flex value = 0."""
    doc = nlp.make_doc("Patient was prescribed Zithroma tablets.")
    query = nlp.make_doc("zithromax")
    match_values = {3: 94}
    assert searcher._adjust_left_right_positions(
        doc,
        query,
        match_values,
        pos=3,
        fuzzy_func="simple",
        min_r2=70,
        ignore_case=True,
        flex=0,
    ) == (3, 4, 94)
Example #23
def test__scan_doc_returns_all_matches_with_no_min_r1(
    searcher: FuzzySearcher, nlp: Language, scan_example: Doc
) -> None:
    """It returns all spans of len(query) in doc if min_r1 = 0."""
    query = nlp.make_doc("Shirley")
    assert searcher._scan_doc(
        scan_example, query, fuzzy_func="simple", min_r1=0, ignore_case=True
    ) == {0: 0, 1: 0, 2: 18, 3: 22, 4: 86}
Example #24
def main(use_gpu=False, nb_epoch=50):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb()
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)

    nlp = Language()

    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    train_X = [nlp.make_doc(x) for x in train_X]
    dev_X = [nlp.make_doc(x) for x in dev_X]

    model = build_model(2, 1)

    print("Begin training")
    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        epoch_loss = [0.0]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.0)

        trainer.each_epoch.append(report_progress)
        trainer.nb_epoch = nb_epoch
        trainer.dropout = 0.0
        trainer.batch_size = 128
        trainer.dropout_decay = 0.0
        for X, y in trainer.iterate(train_X[:1000], train_y[:1000]):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            loss = ((yh - y) ** 2.0).sum() / y.shape[0]
            backprop((yh - y) / y.shape[0], optimizer)
            epoch_loss[-1] += loss
        with model.use_params(optimizer.averages):
            print("Avg dev.: %.3f" % model.evaluate(dev_X, dev_y))
            with open("out.pickle", "wb") as file_:
                pickle.dump(model, file_, -1)
Example #25
def test_simple_train():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()
    for i in range(5):
        for text, answer in [
            ("aaaa", 1.0),
            ("bbbb", 0),
            ("aa", 1.0),
            ("bbbbbbbbb", 0.0),
            ("aaaaaa", 1),
        ]:
            nlp.update([text], [{"cats": {"answer": answer}}])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
Example #26
def test_pickle_single_doc():
    nlp = Language()
    doc = nlp("pickle roundtrip")
    doc._context = 3
    data = pickle.dumps(doc, 1)
    doc2 = pickle.loads(data)
    assert doc2.text == "pickle roundtrip"
    assert doc2._context == 3
Example #27
def test_serialize_language_exclude(meta_data):
    name = "name-in-fixture"
    nlp = Language(meta=meta_data)
    assert nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes())
    assert new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
    assert not new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
    assert not new_nlp.meta["name"] == name
Example #28
def loadWordFeatures(dom, loc="wordfeats", loadpickle=False, savepickle=False):
    if loadpickle:
        # load previously saved word features from the pickle file;
        # they were generated by the code below
        with open(os.path.join(dirname, 'data', 'wordfeatures.pkl'),
                  'rb') as f:
            featdict = pickle.load(f)
        return featdict[dom]

    nlp = Language().from_disk(loc)

    taskwords = {
        'household': [[' '], ['Pick and Place glass'],
                      ['Pick and Place plastic can'], ['Pick and Place lemon'],
                      ['Pick and Place plastic bottle'],
                      ['Pick and Place apple'], ['Pick and Place plastic cup'],
                      ['Navigate while avoiding moving people'],
                      ['Navigate to the main room door'],
                      ['Navigate while following a person'],
                      ['Navigate to the dining table'],
                      ['Navigate while avoiding obstacles'],
                      ['Navigate to the living room']],
        'driving': [
            [' '],
            ['Parking backwards cars and people around, misaligned'],
            ['Parking backwards empty lot, misaligned'],
            ['Parking backwards cars and people around, aligned'],
            ['Parking forwards empty lot, aligned'],
            ['Parking forwards cars and people around, misaligned'],
            ['Parking forwards empty lot, misaligned'],
            ['Navigating lane merge with other moving vehicles'],
            ['Navigating lane merge on a clear road'],
            ['Navigating traffic-circle with other moving vehicles'],
            ['Navigating traffic-circle on a clear road'],
            ['Navigating T-junction with other moving vehicles'],
            ['Navigating T-junction on a clear road'],
        ]
    }

    featdict = {}
    for d, task_word_list in taskwords.items():
        wordfeatures = []
        for i in range(len(task_word_list)):
            print(task_word_list[i][0])
            wordfeatures.append(nlp(task_word_list[i][0]).vector)

        wordfeatures = np.array(wordfeatures)
        featdict[d] = wordfeatures

    wordfeatures = featdict[dom]

    # save the data
    if savepickle:
        with open(os.path.join(dirname, 'data', 'wordfeatures.pkl'),
                  'wb') as f:
            pickle.dump(featdict, f, protocol=pickle.HIGHEST_PROTOCOL)

    return wordfeatures
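A brief usage sketch, assuming a vector-capable spaCy model has been saved to the "wordfeats" directory:

features = loadWordFeatures("household", loc="wordfeats")
print(features.shape)  # one vector per task description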
Example #29
def test_beam_parse():
    nlp = Language()
    nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
    nlp.parser.add_label("nsubj")
    nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
    doc = nlp.make_doc("Australia is a country")
    nlp.parser(doc, beam_width=2)
Example #30
def test_tagger_warns_no_lemma_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    tagger = nlp.create_pipe("tagger")
    with pytest.warns(UserWarning):
        tagger.begin_training()
    nlp.add_pipe(tagger)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lemma_lookup")
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list
Example #31
def test_pipe_class_component_config():
    name = "test_class_component_config"

    @Language.factory(name)
    class Component:
        def __init__(
            self, nlp: Language, name: str, value1: StrictInt, value2: StrictStr
        ):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = True

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @English.factory(name)
    class ComponentEN:
        def __init__(
            self, nlp: Language, name: str, value1: StrictInt, value2: StrictStr
        ):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = False

        def __call__(self, doc: Doc) -> Doc:
            return doc

    nlp = Language()
    with pytest.raises(ConfigValidationError):  # no config provided
        nlp.add_pipe(name)
    with pytest.raises(ConfigValidationError):  # invalid config
        nlp.add_pipe(name, config={"value1": "10", "value2": "hello"})
    nlp.add_pipe(name, config={"value1": 10, "value2": "hello"})
    pipe = nlp.get_pipe(name)
    assert isinstance(pipe.nlp, Language)
    assert pipe.value1 == 10
    assert pipe.value2 == "hello"
    assert pipe.is_base is True

    nlp_en = English()
    with pytest.raises(ConfigValidationError):  # invalid config
        nlp_en.add_pipe(name, config={"value1": "10", "value2": "hello"})
    nlp_en.add_pipe(name, config={"value1": 10, "value2": "hello"})
    pipe = nlp_en.get_pipe(name)
    assert isinstance(pipe.nlp, English)
    assert pipe.value1 == 10
    assert pipe.value2 == "hello"
    assert pipe.is_base is False
Example #32
def test_implicit_label():
    nlp = Language()
    nlp.add_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    nlp.initialize(get_examples=lambda: train_examples)
Example #33
def normalize_np(np: str, nlp_md: Language) -> str:
    """
    Normalize a noun phrase: lemmatize its tokens and drop NLTK stopwords.
    """
    normalized_np = " ".join([
        token.lemma_ for token in nlp_md.tokenizer(np)
        if token.text not in NLTK_STOPWORDS
    ])
    return normalized_np
Example #34
def test_language_source_and_vectors(nlp2):
    nlp = Language(Vocab())
    textcat = nlp.add_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    nlp.initialize()
    long_string = "thisisalongstring"
    assert long_string not in nlp.vocab.strings
    assert long_string not in nlp2.vocab.strings
    nlp.vocab.strings.add(long_string)
    assert nlp.vocab.vectors.to_bytes() != nlp2.vocab.vectors.to_bytes()
    vectors_bytes = nlp.vocab.vectors.to_bytes()
    with pytest.warns(UserWarning):
        nlp2.add_pipe("textcat", name="textcat2", source=nlp)
    # strings should be added
    assert long_string in nlp2.vocab.strings
    # vectors should remain unmodified
    assert nlp.vocab.vectors.to_bytes() == vectors_bytes
Example #35
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
    """Test the error handling of a component's pipe method"""
    Language.component("my_perhaps_sentences", func=perhaps_set_sentences)
    Language.component("assert_sents_error", func=assert_sents_error)
    ops = get_current_ops()
    if isinstance(ops, NumpyOps) or n_process < 2:
        texts = [f"{str(i)} is enough. Done" for i in range(100)]
        nlp = English()
        nlp.add_pipe("my_perhaps_sentences")
        nlp.add_pipe("assert_sents_error")
        nlp.initialize()
        with pytest.raises(ValueError):
            # assert_sents_error requires sentence boundaries, will throw an error otherwise
            docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
        nlp.set_error_handler(ignore_error)
        docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
        # we lose/ignore the 11 failing docs (texts 4 and 40-49)
        assert len(docs) == 89
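The helpers referenced above (perhaps_set_sentences, assert_sents_error, ignore_error) are not shown; plausible definitions consistent with the assertions, written here as assumptions rather than the project's exact code:

def perhaps_set_sentences(doc):
    # leave texts starting with "4" without sentence boundaries
    if not doc.text.startswith("4"):
        doc[-1].is_sent_start = True
    return doc

def assert_sents_error(doc):
    if not doc.has_annotation("SENT_START"):
        raise ValueError("no sentence boundaries set")
    return doc

def ignore_error(proc_name, proc, docs, e):
    # error handlers receive (component name, component, batch of docs, exception)
    pass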
Example #36
def build_pipeline(disable: list = []):
    """
    Function that creates the pipeline for the creation of (sentence, reference) tuples.
    Returns:
    - nlp: spaCy pipeline instance
    """
    nlp = load_spacy_model("de_core_news_sm")

    # Matching section references using ReferenceMatcher class
    Language.component("reference_matcher", func=match_reference)

    nlp.add_pipe("sentencizer")
    nlp.add_pipe("reference_matcher", before="tagger")
    nlp.disable_pipes(*disable)

    print("\nActivated pipes:")
    print(nlp.pipe_names)
    return nlp
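A usage sketch, assuming the de_core_news_sm model is installed:

nlp = build_pipeline(disable=["parser"])
doc = nlp("Die Regelung in § 5 Abs. 1 gilt entsprechend.")
print([sent.text for sent in doc.sents])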
Example #37
def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    assert new_ruler.patterns == ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite
Example #38
def lang():
    vector_data = {k: np.random.normal(0, 1, (2,)) for k in ["red", "blue", "cat", "dog", "green", "purple"]}
    vector_data['cat'] += 10
    vector_data['dog'] += 10
    vocab = Vocab(strings=vector_data.keys())
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab)
    return SpacyLanguage(nlp)
Example #39
def register_benepar_component_factory():
    # Starting with spaCy 3.0, nlp.add_pipe no longer directly accepts
    # BeneparComponent instances. We must instead register a component factory.
    import spacy

    if spacy.__version__.startswith("2"):
        return

    from spacy.language import Language

    Language.factory(
        "benepar",
        default_config={
            "subbatch_max_tokens": 500,
            "disable_tagger": False,
        },
        func=create_benepar_component,
    )
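With the factory registered, the component can then be added by name under spaCy 3; a hedged sketch (the "benepar_en3" model name is an assumption and must be downloaded separately):

import spacy

register_benepar_component_factory()
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})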
Example #40
def create_pipeline(nlp: Language, cfg: omegaconf.DictConfig) -> List[Pipe]:
    if not isinstance(cfg, omegaconf.DictConfig):
        cfg = OmegaConf.create(cfg)

    pipes = []
    for name, pipe_config in cfg.items():
        pipe_config = OmegaConf.to_container(pipe_config or OmegaConf.create({}))
        pipes.append(nlp.create_pipe(name, config=pipe_config or dict()))
    return pipes
Example #41
def main(train_loc, dev_loc, model_dir, tag_map_loc):
    with open(tag_map_loc) as file_:
        tag_map = json.loads(file_.read())
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')
    
    model_dir = pathlib.Path(model_dir)
    with (model_dir / 'deps' / 'config.json').open('w') as file_:
        json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)

    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            for tag in tags:
                assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features)
    
    for itn in range(15):
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                parser.update(doc, gold)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
Example #42
def test_serialize_with_custom_tokenizer():
    """Test that serialization with custom tokenizer works without token_match.
    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
    """
    prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""")
    suffix_re = re.compile(r"""""")
    infix_re = re.compile(r"""[~]""")

    def custom_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
        )

    nlp = Language()
    nlp.tokenizer = custom_tokenizer(nlp)
    with make_tempdir() as d:
        nlp.to_disk(d)
Example #43
def test_beam_parse():
    nlp = Language()
    nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
    nlp.parser.add_label("nsubj")
    nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
    doc = nlp.make_doc("Australia is a country")
    nlp.parser(doc, beam_width=2)
Example #44
def test_serialize_language_exclude(meta_data):
    name = "name-in-fixture"
    nlp = Language(meta=meta_data)
    assert nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes())
    assert new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
    assert not new_nlp.meta["name"] == name
    new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
    assert not new_nlp.meta["name"] == name
    with pytest.raises(ValueError):
        nlp.to_bytes(meta=False)
    with pytest.raises(ValueError):
        Language().from_bytes(nlp.to_bytes(), meta=False)
Example #45
def test_serialize_language_meta_disk(meta_data):
    language = Language(meta=meta_data)
    with make_tempdir() as d:
        language.to_disk(d)
        new_language = Language().from_disk(d)
    assert new_language.meta == language.meta