Example No. 1
    def add_tagger(self, tagger, name, additional_fields=None):
        r''' Add any kind of tagger for tokens.

        Args:
            tagger (`object/function`):
                Any object/function that takes a spaCy Doc as input, does
                something with it, and returns the same Doc.
            name (`str`):
                Name for this component in the pipeline.
            additional_fields (`List[str]`):
                Fields to be added to the `_` properties of a token.
        '''
        component_factory_name = spacy.util.get_object_name(tagger)
        Language.factory(name=component_factory_name,
                         default_config={"config": self.config},
                         func=tagger)
        self.nlp.add_pipe(component_factory_name,
                          name='tag_' + name,
                          first=True)
        # Add custom fields needed for this use case
        Token.set_extension('to_skip', default=False, force=True)

        # Add any additional fields that are required
        for field in additional_fields or []:
            Token.set_extension(field, default=False, force=True)
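A minimal usage sketch with hypothetical names, assuming an instance `pipe` of the surrounding class and following the docstring's contract of a callable that takes a Doc and returns it:

def skip_numbers_tagger(doc):
    # Mark numeric tokens via the custom `to_skip` extension set above
    for token in doc:
        if token.like_num:
            token._.to_skip = True
    return doc

pipe.add_tagger(tagger=skip_numbers_tagger, name='skip_numbers',
                additional_fields=['is_cleaned'])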
Example No. 2
def test_language_factories_scores():
    name = "test_language_factories_scores"
    func = lambda nlp, name: lambda doc: doc
    weights1 = {"a1": 0.5, "a2": 0.5}
    weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
    Language.factory(f"{name}1", default_score_weights=weights1, func=func)
    Language.factory(f"{name}2", default_score_weights=weights2, func=func)
    meta1 = Language.get_factory_meta(f"{name}1")
    assert meta1.default_score_weights == weights1
    meta2 = Language.get_factory_meta(f"{name}2")
    assert meta2.default_score_weights == weights2
    nlp = Language()
    nlp._config["training"]["score_weights"] = {}
    nlp.add_pipe(f"{name}1")
    nlp.add_pipe(f"{name}2")
    cfg = nlp.config["training"]
    expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
    assert cfg["score_weights"] == expected_weights
    # Test with custom defaults
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = 0.0
    config["training"]["score_weights"]["b3"] = 1.3
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
    expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
    assert score_weights == expected
    # Test with null values
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = None
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
    expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
    assert score_weights == expected
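The expected_weights values above come from spaCy combining each component's default_score_weights and renormalizing so the union still sums to 1.0; a rough standalone sketch of that combination (simplified; not spaCy's actual implementation):

def combine_score_weights(tables):
    # Scale each component's table by 1/num_components so the union
    # of all weights still sums to 1.0 (rounded as in the test)
    n = len(tables)
    return {key: round(w / n, 2) for table in tables for key, w in table.items()}

assert combine_score_weights([
    {"a1": 0.5, "a2": 0.5},
    {"b1": 0.2, "b2": 0.7, "b3": 0.1},
]) == {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}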
Example No. 3
    def __init__(self):
        # Load spacy model
        self.spacy_obj = spacy.load(
            "en_core_web_sm",
            disable=["tok2vec", "tagger", "attribute_ruler", "lemmatizer"])

        # Language Detector Function
        def get_lang_detector(nlp, name):
            return LanguageDetector(seed=42)

        # Create instance for language detection
        Language.factory("language_detector", func=get_lang_detector)
        self.spacy_obj.add_pipe('language_detector', last=True)
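Once added, the detector writes its result to the Doc; a small usage sketch, assuming an instance `obj` of the class above and spacy_langdetect's `doc._.language` extension:

doc = obj.spacy_obj("This is clearly an English sentence.")
print(doc._.language)   # e.g. {'language': 'en', 'score': 0.99}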
Example No. 4
def test_pipe_factories_config_excludes_nlp():
    """Test that the extra values we temporarily add to component config
    blocks/functions are removed and not copied around.
    """
    name = "test_pipe_factories_config_excludes_nlp"
    func = lambda nlp, name: lambda doc: doc
    Language.factory(name, func=func)
    config = {
        "nlp": {"lang": "en", "pipeline": [name]},
        "components": {name: {"factory": name}},
    }
    nlp = English.from_config(config)
    assert nlp.pipe_names == [name]
    pipe_cfg = nlp.get_pipe_config(name)
    assert pipe_cfg == {"factory": name}
    assert nlp._pipe_configs[name] == {"factory": name}
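Because the temporary values are stripped, the config should round-trip cleanly; a quick sketch of that check, reusing the test's nlp and name:

nlp2 = English.from_config(nlp.config)
assert nlp2.pipe_names == [name]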
Example No. 5
def register_benepar_component_factory():
    # Starting with spaCy 3.0, nlp.add_pipe no longer directly accepts
    # BeneparComponent instances. We must instead register a component factory.
    import spacy

    if spacy.__version__.startswith("2"):
        return

    from spacy.language import Language

    Language.factory(
        "benepar",
        default_config={
            "subbatch_max_tokens": 500,
            "disable_tagger": False,
        },
        func=create_benepar_component,
    )
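Once the factory is registered, the component can be added by name with overrides for the default_config keys; a hedged usage sketch (assumes an installed English model and that create_benepar_component accepts these defaults):

import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("benepar", config={"subbatch_max_tokens": 500})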
Example No. 6
def test_pipe_factories_decorator_idempotent(i, func, func2):
    """Check that decorator can be run multiple times if the function is the
    same. This is especially relevant for live reloading because we don't
    want spaCy to raise an error if a module registering components is reloaded.
    """
    name = f"test_pipe_factories_decorator_idempotent_{i}"
    for i in range(5):
        Language.factory(name, func=func)
    nlp = Language()
    nlp.add_pipe(name)
    Language.factory(name, func=func)
    # Make sure it also works for component decorator, which creates the
    # factory function
    name2 = f"{name}2"
    for i in range(5):
        Language.component(name2, func=func2)
    nlp = Language()
    nlp.add_pipe(name2)
    Language.component(name2, func=func2)
Example No. 7
def test_component_factories_class_func():
    """Test that class components can implement a from_nlp classmethod that
    gives them access to the nlp object and config via the factory."""
    class TestComponent5:
        def __call__(self, doc):
            return doc

    mock = Mock()
    mock.return_value = TestComponent5()

    def test_component5_factory(nlp, foo: str = "bar", name="c5"):
        return mock(nlp, foo=foo)

    Language.factory("c5", func=test_componen5_factory)
    assert Language.has_factory("c5")
    nlp = Language()
    nlp.add_pipe("c5", config={"foo": "bar"})
    assert nlp("hello world")
    mock.assert_called_once_with(nlp, foo="bar")
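The same registration can also be written with the @Language.factory decorator form that spaCy 3 provides; a minimal equivalent sketch under a hypothetical name "c6":

@Language.factory("c6", default_config={"foo": "bar"})
def create_c6_component(nlp: Language, name: str, foo: str):
    # Reuses the TestComponent5 class defined in the test above
    return TestComponent5()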
Example No. 8
            'APRESENTAR': 0.0,
            'FALAR SOBRE SEMEAR': 0.0,
            'FALAR SOBRE ADA': 0.0,
            'MÚSICA': 0.0,
            'SOLETRAR': 0.0,
            'DANÇAR': 1.0
        })

    baseDeDadosFinal.append([texto, dic.copy()])
"""
# Check the dictionary contents
print(len(baseDeDadosFinal))
print(baseDeDadosFinal[233][0])
print(baseDeDadosFinal[233][1])
"""
"""
Language.factory("language_detector", func=get_lang_detector)
nlp.add_pipe('language_detector')
"""

# Creating the classifier
modelo = spacy.blank('pt')
print(modelo.pipe_names)
"""
categorias = modelo.create_pipe("textcat")
categorias.add_label("ACORDAR")
categorias.add_label("DORMIR")
categorias.add_label("GIRAR")
categorias.add_label("LEVANTAR BRAÇOS")
categorias.add_label("PIADA")
categorias.add_label("FALAR O NOME")
Example No. 9
    def spacy_factory(
        nlp: object,
        name: str,
        callbacks: dict,
        sets: dict,
        map_doc: str,
        sort_length: bool,
        rules: str,
    ) -> SpacyCore:
        map_fn = registry.get(*map_doc.split("."))
        callbacks = {
            key: registry.get(*value.split("."))
            for key, value in callbacks.items()
        }
        core = SpacyCore(callbacks, sets, map_fn, sort_length)
        core.load(rules)
        return core

    Language.factory(
        "hmrb",
        default_config={
            "callbacks": {},
            "sets": {},
            "map_doc": _default_map,
            "sort_length": False,
            "rules": "",
        },
        func=spacy_factory,
    )
except (ImportError, AttributeError):
    logging.debug("disabling support for spaCy 3.0+")
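Once the factory is registered, the component is configured through the same keys as its default_config; a hedged usage sketch, where `grammar` stands in for a string of hmrb rule definitions:

import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("hmrb", config={"rules": grammar})  # grammar: hypothetical rules string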
Example No. 10
def test_initialize_arguments():
    name = "test_initialize_arguments"

    class CustomTokenizer:
        def __init__(self, tokenizer):
            self.tokenizer = tokenizer
            self.from_initialize = None

        def __call__(self, text):
            return self.tokenizer(text)

        def initialize(self, get_examples, nlp, custom: int):
            self.from_initialize = custom

    class Component:
        def __init__(self):
            self.from_initialize = None

        def initialize(self,
                       get_examples,
                       nlp,
                       custom1: str,
                       custom2: StrictBool = False):
            self.from_initialize = (custom1, custom2)

    Language.factory(name, func=lambda nlp, name: Component())

    nlp = English()
    nlp.tokenizer = CustomTokenizer(nlp.tokenizer)
    example = Example.from_dict(nlp("x"), {})
    get_examples = lambda: [example]
    nlp.add_pipe(name)
    # The settings here will typically come from the [initialize] block
    init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
    nlp.config["initialize"].update(init_cfg)
    with pytest.raises(ConfigValidationError) as e:
        # Empty config for component, no required custom1 argument
        nlp.initialize(get_examples)
    errors = e.value.errors
    assert len(errors) == 1
    assert errors[0]["loc"] == ("custom1", )
    assert errors[0]["type"] == "value_error.missing"
    init_cfg = {
        "tokenizer": {
            "custom": 1
        },
        "components": {
            name: {
                "custom1": "x",
                "custom2": 1
            }
        },
    }
    nlp.config["initialize"].update(init_cfg)
    with pytest.raises(ConfigValidationError) as e:
        # Wrong type of custom 2
        nlp.initialize(get_examples)
    errors = e.value.errors
    assert len(errors) == 1
    assert errors[0]["loc"] == ("custom2", )
    assert errors[0]["type"] == "value_error.strictbool"
    init_cfg = {
        "tokenizer": {
            "custom": 1
        },
        "components": {
            name: {
                "custom1": "x"
            }
        },
    }
    nlp.config["initialize"].update(init_cfg)
    nlp.initialize(get_examples)
    assert nlp.tokenizer.from_initialize == 1
    pipe = nlp.get_pipe(name)
    assert pipe.from_initialize == ("x", False)
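As the comment in the test notes, these settings normally come from the [initialize] block of a config.cfg; the final init_cfg above corresponds roughly to this fragment (a sketch; the component section is keyed by the pipe name):

[initialize.tokenizer]
custom = 1

[initialize.components.test_initialize_arguments]
custom1 = "x"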
Example No. 11
for key, value in MAPPING_TO_FUNCTION.items():

    def create_term_extraction_component(nlp: Language,
                                         name: str,
                                         force,
                                         args,
                                         kwargs,
                                         local_value=value):
        # Bind value per iteration to avoid the cell-variable-defined-in-loop
        # issue (PYL-W0640); see the sketch after this example
        return TermExtractionPipeline(nlp, local_value, force, *args, **kwargs)

    Language.factory(
        key,
        func=copy.copy(create_term_extraction_component),
        default_config={
            "force": True,
            "args": [],
            "kwargs": {}
        },
    )


class TermExtractionPipeline:
    """
    This is for adding PyATE as a spaCy pipeline component.
    """
    def __init__(self,
                 nlp,
                 func: Callable[..., pd.Series] = combo_basic,
                 force: bool = True,
                 *args,
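The `local_value=value` default above works around Python's late-binding closures: without it, every factory defined in the loop would see the final loop value. A self-contained sketch of the pitfall:

makers = [lambda: v for v in range(3)]
print([f() for f in makers])        # [2, 2, 2] - all see the last value

makers = [lambda v=v: v for v in range(3)]
print([f() for f in makers])        # [0, 1, 2] - value bound per iteration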
Example No. 12
import spacy
from spacy.matcher import Matcher
from spacy_langdetect import LanguageDetector
from spacy.language import Language
import pandas as pd
import difflib
from pdfminer.high_level import extract_text  # needed by text_from_pdf below

nlp = spacy.load('en_core_web_md')
matcher = Matcher(nlp.vocab)


def create_lang_detector(nlp, name):
    return LanguageDetector()


Language.factory("language_detector", func=create_lang_detector)
nlp.add_pipe("language_detector", last=True)


def text_from_pdf(pdf_file):
    return extract_text(pdf_file)


def extract_names(text):
    doc = nlp(text)

    if doc._.language['language'] == 'en':
        nlp_new = spacy.load('en_core_web_md')
    else:
        nlp_new = spacy.load('pl_core_news_md')