Code example #1
def test_pipe_factories_language_specific():
    """Test that language sub-classes can have their own factories, with
    fallbacks to the base factories."""
    name1 = "specific_component1"
    name2 = "specific_component2"
    Language.component(name1, func=lambda: "base")
    English.component(name1, func=lambda: "en")
    German.component(name2, func=lambda: "de")

    assert Language.has_factory(name1)
    assert not Language.has_factory(name2)
    assert English.has_factory(name1)
    assert not English.has_factory(name2)
    assert German.has_factory(name1)
    assert German.has_factory(name2)

    nlp = Language()
    assert nlp.create_pipe(name1)() == "base"
    with pytest.raises(ValueError):
        nlp.create_pipe(name2)
    nlp_en = English()
    assert nlp_en.create_pipe(name1)() == "en"
    with pytest.raises(ValueError):
        nlp_en.create_pipe(name2)
    nlp_de = German()
    assert nlp_de.create_pipe(name1)() == "base"
    assert nlp_de.create_pipe(name2)() == "de"
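
Note that the lambdas registered above take no arguments and return a marker string, which is why the test can call nlp.create_pipe(name1)() directly. A real component registered with Language.component is a doc-to-doc callable; a minimal sketch (the name print_doc_length is made up for illustration):

from spacy.language import Language

@Language.component("print_doc_length")
def print_doc_length(doc):
    # a typical stateless pipeline component: take a Doc, return the Doc
    print("tokens:", len(doc))
    return doc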
Code example #2
def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
    """Test the error handling of nlp.pipe with input as tuples"""
    Language.component("my_evil_component", func=evil_component)
    ops = get_current_ops()
    if isinstance(ops, NumpyOps) or n_process < 2:
        nlp = English()
        nlp.add_pipe("my_evil_component")
        texts = [
            ("TEXT 111", 111),
            ("TEXT 222", 222),
            ("TEXT 333", 333),
            ("TEXT 342", 342),
            ("TEXT 666", 666),
        ]
        with pytest.raises(ValueError):
            list(nlp.pipe(texts, as_tuples=True))
        nlp.set_error_handler(warn_error)
        logger = logging.getLogger("spacy")
        with mock.patch.object(logger, "warning") as mock_warning:
            tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
            # HACK/TODO? the warnings in child processes don't seem to be
            # detected by the mock logger
            if n_process == 1:
                mock_warning.assert_called()
                assert mock_warning.call_count == 2
                assert len(tuples) + mock_warning.call_count == len(texts)
            assert (tuples[0][0].text, tuples[0][1]) == ("TEXT 111", 111)
            assert (tuples[1][0].text, tuples[1][1]) == ("TEXT 333", 333)
            assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
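
The helpers evil_component and warn_error used here (and in the next example) are defined elsewhere in spaCy's test_language.py and are not part of this excerpt. A plausible, hypothetical reconstruction, consistent with the assertions above (two warnings, and only the docs whose text contains "2" are dropped) and with spaCy's error-handler signature of (component name, component, docs, exception):

import logging

def evil_component(doc):
    # hypothetical reconstruction: fail on "TEXT 222" and "TEXT 342"
    if "2" in doc.text:
        raise ValueError("no dice")
    return doc

def warn_error(proc_name, proc, docs, e):
    # error handlers receive the component name, the component, the batch
    # of docs, and the raised exception
    logging.getLogger("spacy").warning(f"Trouble with component {proc_name}.")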
Code example #3
File: test_language.py  Project: snimrod/lda_test1
def test_language_pipe_error_handler_custom(en_vocab, n_process):
    """Test the error handling of a custom component that has no pipe method"""
    Language.component("my_evil_component", func=evil_component)
    ops = get_current_ops()
    if isinstance(ops, NumpyOps) or n_process < 2:
        nlp = English()
        nlp.add_pipe("my_evil_component")
        texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"]
        with pytest.raises(ValueError):
            # the evil custom component throws an error
            list(nlp.pipe(texts))

        nlp.set_error_handler(warn_error)
        logger = logging.getLogger("spacy")
        with mock.patch.object(logger, "warning") as mock_warning:
            # each error raised by the evil custom component triggers a
            # warning for the corresponding bad doc
            docs = list(nlp.pipe(texts, n_process=n_process))
            # HACK/TODO? the warnings in child processes don't seem to be
            # detected by the mock logger
            if n_process == 1:
                mock_warning.assert_called()
                assert mock_warning.call_count == 2
                assert len(docs) + mock_warning.call_count == len(texts)
            assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
Code example #4
    def add_meta_cat(self, meta_cat, name):
        component_name = spacy.util.get_object_name(meta_cat)
        Language.component(name=component_name, func=meta_cat)
        self.nlp.add_pipe(component_name, name=name, last=True)

        # Only the meta_anns field is needed; it will be a dictionary
        # of {category_name: value, ...}
        Span.set_extension('meta_anns', default=None, force=True)
Code example #5
    def add_token_normalizer(self, config, spell_checker=None):
        token_normalizer = TokenNormalizer(spell_checker=spell_checker,
                                           config=config)
        component_name = spacy.util.get_object_name(token_normalizer)
        Language.component(name=component_name, func=token_normalizer)
        self.nlp.add_pipe(component_name, name='token_normalizer', last=True)

        # Add custom fields needed for this usecase
        Token.set_extension('norm', default=None, force=True)
Code example #6
def load():
  nlp = Thai()
  if SPACY_V3:
    Language.component("thai_tagger", func=ThaiTagger(nlp))
    nlp.add_pipe("thai_tagger")
    Language.component("thai_parser", func=ThaiParser(nlp))
    nlp.add_pipe("thai_parser")
  else:
    nlp.add_pipe(ThaiTagger(nlp))
    nlp.add_pipe(ThaiParser(nlp))
  return nlp
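
This example registers already-constructed callables (ThaiTagger(nlp), ThaiParser(nlp)) by passing them as func=, because the components need access to the nlp object. For a stateless component, the more common spaCy v3 pattern is the decorator form; a minimal sketch (the name thai_noop is made up):

from spacy.language import Language

@Language.component("thai_noop")
def thai_noop(doc):
    # stateless doc -> doc component; stateful components are usually
    # registered via Language.factory rather than by passing an instance
    return doc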
Code example #7
    def add_linker(self, linker):
        r''' Add an entity linker to the pipeline; this will also add the necessary fields
        to the Span object.

        linker (object/function):
            Any object/function created based on the requirements for a spaCy pipeline component. Have
            a look at https://spacy.io/usage/processing-pipelines#custom-components
        '''
        component_name = spacy.util.get_object_name(linker)
        Language.component(name=component_name, func=linker)
        self.nlp.add_pipe(component_name, name='cat_linker', last=True)
        Span.set_extension('cui', default=-1, force=True)
        Span.set_extension('context_similarity', default=-1, force=True)
Code example #8
    def add_ner(self, ner):
        r''' Add NER from CAT to the pipeline; this will also add the necessary fields
        to the Doc and Span objects.

        '''
        component_name = spacy.util.get_object_name(ner)
        Language.component(name=component_name, func=ner)
        self.nlp.add_pipe(component_name, name='cat_ner', last=True)

        Doc.set_extension('ents', default=[], force=True)
        Span.set_extension('confidence', default=-1, force=True)
        Span.set_extension('id', default=0, force=True)
        # Do not set this property if a vocabulary approach is not used; this name must
        # refer to a name2cuis in the cdb.
        Span.set_extension('detected_name', default=None, force=True)
        Span.set_extension('link_candidates', default=None, force=True)
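
The Doc.set_extension and Span.set_extension calls register custom attributes that become readable under the ._ namespace once text has passed through the pipeline. A hedged usage sketch, assuming nlp refers to the wrapped self.nlp pipeline above and that the 'cat_ner' component actually populates these fields:

# hypothetical usage once the pipeline above has been assembled
doc = nlp("some clinical text")
for ent in doc._.ents:  # filled by the 'cat_ner' component
    print(ent.text, ent._.id, ent._.confidence, ent._.detected_name)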
Code example #9
File: test_language.py  Project: snimrod/lda_test1
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
    """Test the error handling of a component's pipe method"""
    Language.component("my_perhaps_sentences", func=perhaps_set_sentences)
    Language.component("assert_sents_error", func=assert_sents_error)
    ops = get_current_ops()
    if isinstance(ops, NumpyOps) or n_process < 2:
        texts = [f"{str(i)} is enough. Done" for i in range(100)]
        nlp = English()
        nlp.add_pipe("my_perhaps_sentences")
        nlp.add_pipe("assert_sents_error")
        nlp.initialize()
        with pytest.raises(ValueError):
            # assert_sents_error requires sentence boundaries and raises an error otherwise
            docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
        nlp.set_error_handler(ignore_error)
        docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
        # we lose/ignore the 11 failing docs (texts 4 and 40-49)
        assert len(docs) == 89
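
perhaps_set_sentences and assert_sents_error are defined elsewhere in the test module. A plausible, hypothetical reconstruction, consistent with the count above (the texts starting with "4", i.e. 4 and 40-49, are the 11 that fail):

def perhaps_set_sentences(doc):
    # hypothetical reconstruction: skip sentence boundaries for texts
    # starting with "4" so that the next component fails on them
    if not doc.text.startswith("4"):
        doc[-1].is_sent_start = True
    return doc

def assert_sents_error(doc):
    # hypothetical reconstruction: require sentence boundaries
    if not doc.has_annotation("SENT_START"):
        raise ValueError("no sents")
    return doc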
Code example #10
def build_pipeline(disable: list = []):
    """
    Creates the pipeline used to build (sentence, reference) tuples.
    Returns:
    - nlp: spaCy pipeline instance
    """
    nlp = load_spacy_model("de_core_news_sm")

    # Matching section references using ReferenceMatcher class
    Language.component("reference_matcher", func=match_reference)

    nlp.add_pipe("sentencizer")
    nlp.add_pipe("reference_matcher", before="tagger")
    nlp.disable_pipes(*disable)

    print("\nActivated pipes:")
    print(nlp.pipe_names)
    return nlp
Code example #11
def test_pipe_factories_decorator_idempotent(i, func, func2):
    """Check that decorator can be run multiple times if the function is the
    same. This is especially relevant for live reloading because we don't
    want spaCy to raise an error if a module registering components is reloaded.
    """
    name = f"test_pipe_factories_decorator_idempotent_{i}"
    for i in range(5):
        Language.factory(name, func=func)
    nlp = Language()
    nlp.add_pipe(name)
    Language.factory(name, func=func)
    # Make sure it also works for the component decorator, which creates the
    # factory function
    name2 = f"{name}2"
    for i in range(5):
        Language.component(name2, func=func2)
    nlp = Language()
    nlp.add_pipe(name)
    Language.component(name2, func=func2)
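
The same idempotence matters for the decorator syntax: a module that registers components at import time re-runs its decorators when it is live-reloaded, and spaCy should accept the repeated registration of the same function without raising. A minimal sketch (the component name is made up):

from spacy.language import Language

@Language.component("my_reloadable_component")
def my_reloadable_component(doc):
    # re-importing this module re-executes the decorator with the same
    # function; per the test above, this must not raise
    return doc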
Code example #12
def test_update_with_annotates():
    name = "test_with_annotates"
    results = {}

    def make_component(name):
        results[name] = ""

        def component(doc):
            nonlocal results
            results[name] += doc.text
            return doc

        return component

    Language.component(f"{name}1", func=make_component(f"{name}1"))
    Language.component(f"{name}2", func=make_component(f"{name}2"))

    components = set([f"{name}1", f"{name}2"])

    nlp = English()
    texts = ["a", "bb", "ccc"]
    examples = []
    for text in texts:
        examples.append(Example(nlp.make_doc(text), nlp.make_doc(text)))

    for components_to_annotate in [
        [],
        [f"{name}1"],
        [f"{name}1", f"{name}2"],
        [f"{name}2", f"{name}1"],
    ]:
        for key in results:
            results[key] = ""
        nlp = English(vocab=nlp.vocab)
        nlp.add_pipe(f"{name}1")
        nlp.add_pipe(f"{name}2")
        nlp.update(examples, annotates=components_to_annotate)
        for component in components_to_annotate:
            assert results[component] == "".join(eg.predicted.text
                                                 for eg in examples)
        for component in components - set(components_to_annotate):
            assert results[component] == ""
Code example #13
def nlp2(nlp, sample_vectors):
    Language.component("test_language_vector_modification_pipe",
                       func=vector_modification_pipe)
    Language.component("test_language_userdata_pipe", func=userdata_pipe)
    Language.component("test_language_ner_pipe", func=ner_pipe)
    add_vecs_to_vocab(nlp.vocab, sample_vectors)
    nlp.add_pipe("test_language_vector_modification_pipe")
    nlp.add_pipe("test_language_ner_pipe")
    nlp.add_pipe("test_language_userdata_pipe")
    return nlp
Code example #14
def test_add_pipe_last(nlp, name1, name2):
    Language.component("new_pipe2", func=lambda doc: doc)
    nlp.add_pipe("new_pipe2", name=name2)
    nlp.add_pipe("new_pipe", name=name1, last=True)
    assert nlp.pipeline[0][0] != name1
    assert nlp.pipeline[-1][0] == name1
Code example #15
def test_disable_enable_pipes():
    name = "test_disable_enable_pipes"
    results = {}

    def make_component(name):
        results[name] = ""

        def component(doc):
            nonlocal results
            results[name] = doc.text
            return doc

        return component

    c1 = Language.component(f"{name}1", func=make_component(f"{name}1"))
    c2 = Language.component(f"{name}2", func=make_component(f"{name}2"))

    nlp = Language()
    nlp.add_pipe(f"{name}1")
    nlp.add_pipe(f"{name}2")
    assert results[f"{name}1"] == ""
    assert results[f"{name}2"] == ""
    assert nlp.pipeline == [(f"{name}1", c1), (f"{name}2", c2)]
    assert nlp.pipe_names == [f"{name}1", f"{name}2"]
    nlp.disable_pipe(f"{name}1")
    assert nlp.disabled == [f"{name}1"]
    assert nlp.component_names == [f"{name}1", f"{name}2"]
    assert nlp.pipe_names == [f"{name}2"]
    assert nlp.config["nlp"]["disabled"] == [f"{name}1"]
    nlp("hello")
    assert results[f"{name}1"] == ""  # didn't run
    assert results[f"{name}2"] == "hello"  # ran
    nlp.enable_pipe(f"{name}1")
    assert nlp.disabled == []
    assert nlp.pipe_names == [f"{name}1", f"{name}2"]
    assert nlp.config["nlp"]["disabled"] == []
    nlp("world")
    assert results[f"{name}1"] == "world"
    assert results[f"{name}2"] == "world"
    nlp.disable_pipe(f"{name}2")
    nlp.remove_pipe(f"{name}2")
    assert nlp.components == [(f"{name}1", c1)]
    assert nlp.pipeline == [(f"{name}1", c1)]
    assert nlp.component_names == [f"{name}1"]
    assert nlp.pipe_names == [f"{name}1"]
    assert nlp.disabled == []
    assert nlp.config["nlp"]["disabled"] == []
    nlp.rename_pipe(f"{name}1", name)
    assert nlp.components == [(name, c1)]
    assert nlp.component_names == [name]
    nlp("!")
    assert results[f"{name}1"] == "!"
    assert results[f"{name}2"] == "world"
    with pytest.raises(ValueError):
        nlp.disable_pipe(f"{name}2")
    nlp.disable_pipe(name)
    assert nlp.component_names == [name]
    assert nlp.pipe_names == []
    assert nlp.config["nlp"]["disabled"] == [name]
    nlp("?")
    assert results[f"{name}1"] == "!"
Code example #16
def test_add_lots_of_pipes(nlp, n_pipes):
    Language.component("n_pipes", func=lambda doc: doc)
    for i in range(n_pipes):
        nlp.add_pipe("n_pipes", name=f"pipe_{i}")
    assert len(nlp.pipe_names) == n_pipes
Code example #17
            meta_token.is_sent_start = False
        # The leading '=' is a sentence boundary
        doc[i - 9].is_sent_start = True
        # Any token following the metadata is also a new sentence.
        doc[i + 1].is_sent_start = True
    return doc


NLP = spacy.load('nl_core_news_sm')
try:
    NLP.add_pipe(_metadata_sentence_segmentation,
                 before="parser")  # Insert before the parser
except ValueError:
    # spacy>=3
    from spacy.language import Language
    Language.component('meta-sentence-segmentation')(
        _metadata_sentence_segmentation)  # pylint: disable=E1101
    NLP.add_pipe('meta-sentence-segmentation',
                 before="parser")  # Insert before the parser

for case in TOKENIZER_SPECIAL_CASES:
    NLP.tokenizer.add_special_case(case, [{ORTH: case}])
    NLP.tokenizer.add_special_case(case.lower(), [{ORTH: case.lower()}])

infixes = NLP.Defaults.infixes + [r'\(', r'\)', r'(?<=[\D])\/(?=[\D])']
infix_regex = spacy.util.compile_infix_regex(infixes)
NLP.tokenizer.infix_finditer = infix_regex.finditer


class TokenizerOns(Tokenizer):
    def parse_text(self, text: str) -> spacy.tokens.doc.Doc:
        """Custom spacy tokenizer for the 'ons' corpus that takes care of special metadata tokens.