def add_tagger(self, tagger, name, additional_fields=None):
    r''' Add any kind of a tagger for tokens.

    Args:
        tagger (`object/function`):
            Any object/function that takes a spacy doc as an input, does
            something and returns the same doc.
        name (`str`):
            Name for this component in the pipeline.
        additional_fields (`List[str]`):
            Fields to be added to the `_` properties of a token.
    '''
    # Avoid a shared mutable default argument
    additional_fields = additional_fields or []

    component_factory_name = spacy.util.get_object_name(tagger)
    Language.factory(name=component_factory_name, default_config={"config": self.config}, func=tagger)
    self.nlp.add_pipe(component_factory_name, name='tag_' + name, first=True)

    # Add custom fields needed for this use case
    Token.set_extension('to_skip', default=False, force=True)

    # Add any additional fields that are required
    for field in additional_fields:
        Token.set_extension(field, default=False, force=True)
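# --- Usage sketch (not from the source above) ---------------------------------
# Illustrates how add_tagger is meant to be called. Because the tagger is
# registered through Language.factory with default_config={"config": ...},
# spaCy invokes it as a factory with (nlp, name, config) and expects the
# per-doc callable back. `skip_stopword_tagger` and `pipeline` (the object
# exposing add_tagger) are hypothetical names.
def skip_stopword_tagger(nlp, name, config):
    def tag(doc):
        # Mark stop words so downstream components can skip them via the
        # `to_skip` extension registered by add_tagger
        for token in doc:
            if token.is_stop:
                token._.to_skip = True
        return doc
    return tag

pipeline.add_tagger(skip_stopword_tagger, name='skip_stopwords')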
def test_language_factories_scores():
    name = "test_language_factories_scores"
    func = lambda nlp, name: lambda doc: doc
    weights1 = {"a1": 0.5, "a2": 0.5}
    weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
    Language.factory(f"{name}1", default_score_weights=weights1, func=func)
    Language.factory(f"{name}2", default_score_weights=weights2, func=func)
    meta1 = Language.get_factory_meta(f"{name}1")
    assert meta1.default_score_weights == weights1
    meta2 = Language.get_factory_meta(f"{name}2")
    assert meta2.default_score_weights == weights2
    nlp = Language()
    nlp._config["training"]["score_weights"] = {}
    nlp.add_pipe(f"{name}1")
    nlp.add_pipe(f"{name}2")
    cfg = nlp.config["training"]
    expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
    assert cfg["score_weights"] == expected_weights
    # Test with custom defaults
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = 0.0
    config["training"]["score_weights"]["b3"] = 1.3
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
    expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
    assert score_weights == expected
    # Test with null values
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = None
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
    expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
    assert score_weights == expected
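# Worked arithmetic behind the expected values above (derived from the asserts
# themselves): the default_score_weights of all components are pooled and
# normalized to sum to 1.0. weights1 and weights2 each sum to 1.0, so the
# pooled total is 2.0 and e.g. a1 = 0.5 / 2.0 = 0.25, b2 = 0.7 / 2.0 = 0.35.
# After the overrides (a1=0.0, b3=1.3) the pooled values
# 0.0 + 0.25 + 0.1 + 0.35 + 1.3 again sum to 2.0, giving a2 = 0.25 / 2.0
# ≈ 0.12 and b3 = 1.3 / 2.0 = 0.65. Setting a1=None excludes it from
# normalization; the remaining rounded weights sum to 0.99, so renormalizing
# yields b3 = 0.65 / 0.99 ≈ 0.66 while the smaller weights keep their
# rounded values.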
def __init__(self):
    # Load spacy model
    self.spacy_obj = spacy.load(
        "en_core_web_sm",
        disable=["tok2vec", "tagger", "attribute_ruler", "lemmatizer"])

    # Language detector factory function
    def get_lang_detector(nlp, name):
        return LanguageDetector(seed=42)

    # Create instance for language detection
    Language.factory("language_detector", func=get_lang_detector)
    self.spacy_obj.add_pipe('language_detector', last=True)
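# --- Usage sketch (not from the source above) ---------------------------------
# Assumes the class above was instantiated as `detector` and that
# LanguageDetector comes from spacy_langdetect, which stores its result on the
# Doc's `_.language` attribute.
doc = detector.spacy_obj("This is clearly an English sentence.")
print(doc._.language)  # e.g. {'language': 'en', 'score': 0.99...}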
def test_pipe_factories_config_excludes_nlp():
    """Test that the extra values we temporarily add to component config
    blocks/functions are removed and not copied around.
    """
    name = "test_pipe_factories_config_excludes_nlp"
    func = lambda nlp, name: lambda doc: doc
    Language.factory(name, func=func)
    config = {
        "nlp": {"lang": "en", "pipeline": [name]},
        "components": {name: {"factory": name}},
    }
    nlp = English.from_config(config)
    assert nlp.pipe_names == [name]
    pipe_cfg = nlp.get_pipe_config(name)
    assert pipe_cfg == {"factory": name}
    assert nlp._pipe_configs[name] == {"factory": name}
def register_benepar_component_factory():
    # Starting with spaCy 3.0, nlp.add_pipe no longer directly accepts
    # BeneparComponent instances. We must instead register a component factory.
    import spacy

    if spacy.__version__.startswith("2"):
        return

    from spacy.language import Language

    Language.factory(
        "benepar",
        default_config={
            "subbatch_max_tokens": 500,
            "disable_tagger": False,
        },
        func=create_benepar_component,
    )
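# --- Usage sketch (not from the source above) ---------------------------------
# Once the factory is registered, benepar is added by name. The "model" config
# key and the `parse_string` extension follow benepar's documented spaCy 3
# usage; the exact model name is an assumption.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})
doc = nlp("The quick brown fox jumps over the lazy dog.")
sent = list(doc.sents)[0]
print(sent._.parse_string)  # bracketed constituency parse of the sentence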
def test_pipe_factories_decorator_idempotent(i, func, func2):
    """Check that decorator can be run multiple times if the function is the
    same. This is especially relevant for live reloading because we don't want
    spaCy to raise an error if a module registering components is reloaded.
    """
    name = f"test_pipe_factories_decorator_idempotent_{i}"
    for i in range(5):
        Language.factory(name, func=func)
    nlp = Language()
    nlp.add_pipe(name)
    Language.factory(name, func=func)
    # Make sure it also works for the component decorator, which creates the
    # factory function
    name2 = f"{name}2"
    for i in range(5):
        Language.component(name2, func=func2)
    nlp = Language()
    nlp.add_pipe(name2)
    Language.component(name2, func=func2)
def test_component_factories_class_func():
    """Test that class components can implement a from_nlp classmethod that
    gives them access to the nlp object and config via the factory."""

    class TestComponent5:
        def __call__(self, doc):
            return doc

    mock = Mock()
    mock.return_value = TestComponent5()

    def test_component5_factory(nlp, foo: str = "bar", name="c5"):
        return mock(nlp, foo=foo)

    Language.factory("c5", func=test_component5_factory)
    assert Language.has_factory("c5")
    nlp = Language()
    nlp.add_pipe("c5", config={"foo": "bar"})
    assert nlp("hello world")
    mock.assert_called_once_with(nlp, foo="bar")
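# --- Sketch (not from the source above) ----------------------------------------
# The docstring mentions a from_nlp classmethod, but the test itself goes
# through a mocked factory. A minimal, hypothetical shape of such a class
# component, wired up via Language.factory:
class ComponentWithFromNlp:
    def __init__(self, nlp, foo: str = "bar"):
        self.foo = foo

    @classmethod
    def from_nlp(cls, nlp, foo: str = "bar"):
        # Gives the class access to the nlp object at construction time
        return cls(nlp, foo=foo)

    def __call__(self, doc):
        return doc

Language.factory(
    "c5_from_nlp",
    func=lambda nlp, name, foo="bar": ComponentWithFromNlp.from_nlp(nlp, foo=foo),
)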
    'APRESENTAR': 0.0,
    'FALAR SOBRE SEMEAR': 0.0,
    'FALAR SOBRE ADA': 0.0,
    'MÚSICA': 0.0,
    'SOLETRAR': 0.0,
    'DANÇAR': 1.0
})
baseDeDadosFinal.append([texto, dic.copy()])

"""
# Sanity check on the dictionary
print(len(baseDeDadosFinal))
print(baseDeDadosFinal[233][0])
print(baseDeDadosFinal[233][1])
"""

"""
Language.factory("language_detector", func=get_lang_detector)
nlp.add_pipe('language_detector')
"""

# Create the classifier
modelo = spacy.blank('pt')
print(modelo.pipe_names)

"""
categorias = modelo.create_pipe("textcat")
categorias.add_label("ACORDAR")
categorias.add_label("DORMIR")
categorias.add_label("GIRAR")
categorias.add_label("LEVANTAR BRAÇOS")
categorias.add_label("PIADA")
categorias.add_label("FALAR O NOME")
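# --- Sketch (not from the source above) ----------------------------------------
# The commented-out block uses spaCy 2's create_pipe API. Under spaCy 3, the
# equivalent (an assumption about the intended pipeline) is add_pipe; the
# label set is taken from the commented code and the dictionary above:
categorias = modelo.add_pipe("textcat")
for rotulo in ["ACORDAR", "DORMIR", "GIRAR", "LEVANTAR BRAÇOS", "PIADA", "FALAR O NOME"]:
    categorias.add_label(rotulo)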
    def spacy_factory(
        nlp: object,
        name: str,
        callbacks: dict,
        sets: dict,
        map_doc: str,
        sort_length: bool,
        rules: str,
    ) -> SpacyCore:
        map_fn = registry.get(*map_doc.split("."))
        callbacks = {
            key: registry.get(*value.split("."))
            for key, value in callbacks.items()
        }
        core = SpacyCore(callbacks, sets, map_fn, sort_length)
        core.load(rules)
        return core

    Language.factory(
        "hmrb",
        default_config={
            "callbacks": {},
            "sets": {},
            "map_doc": _default_map,
            "sort_length": False,
            "rules": "",
        },
        func=spacy_factory,
    )
except (ImportError, AttributeError):
    logging.debug("disabling support for spaCy 3.0+")
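# --- Usage sketch (not from the source above) ---------------------------------
# Shape-only illustration of configuring the "hmrb" component; the config keys
# mirror the default_config above, but a real hmrb grammar string and
# registered callback paths must replace the placeholders before this runs.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(
    "hmrb",
    config={
        "rules": "...",   # placeholder for an hmrb grammar string
        "callbacks": {},  # e.g. {"label": "registry.path.to.callback"}
        "sets": {},
    },
)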
def test_initialize_arguments():
    name = "test_initialize_arguments"

    class CustomTokenizer:
        def __init__(self, tokenizer):
            self.tokenizer = tokenizer
            self.from_initialize = None

        def __call__(self, text):
            return self.tokenizer(text)

        def initialize(self, get_examples, nlp, custom: int):
            self.from_initialize = custom

    class Component:
        def __init__(self):
            self.from_initialize = None

        def initialize(self, get_examples, nlp, custom1: str, custom2: StrictBool = False):
            self.from_initialize = (custom1, custom2)

    Language.factory(name, func=lambda nlp, name: Component())
    nlp = English()
    nlp.tokenizer = CustomTokenizer(nlp.tokenizer)
    example = Example.from_dict(nlp("x"), {})
    get_examples = lambda: [example]
    nlp.add_pipe(name)
    # The settings here will typically come from the [initialize] block
    init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
    nlp.config["initialize"].update(init_cfg)
    with pytest.raises(ConfigValidationError) as e:
        # Empty config for component, no required custom1 argument
        nlp.initialize(get_examples)
    errors = e.value.errors
    assert len(errors) == 1
    assert errors[0]["loc"] == ("custom1",)
    assert errors[0]["type"] == "value_error.missing"
    init_cfg = {
        "tokenizer": {"custom": 1},
        "components": {name: {"custom1": "x", "custom2": 1}},
    }
    nlp.config["initialize"].update(init_cfg)
    with pytest.raises(ConfigValidationError) as e:
        # Wrong type for custom2
        nlp.initialize(get_examples)
    errors = e.value.errors
    assert len(errors) == 1
    assert errors[0]["loc"] == ("custom2",)
    assert errors[0]["type"] == "value_error.strictbool"
    init_cfg = {
        "tokenizer": {"custom": 1},
        "components": {name: {"custom1": "x"}},
    }
    nlp.config["initialize"].update(init_cfg)
    nlp.initialize(get_examples)
    assert nlp.tokenizer.from_initialize == 1
    pipe = nlp.get_pipe(name)
    assert pipe.from_initialize == ("x", False)
for key, value in MAPPING_TO_FUNCTION.items():

    def create_term_extraction_component(nlp: Language, name: str, force, args,
                                         kwargs, local_value=value):
        # Bind `value` at definition time via a default argument to avoid the
        # cell-variable-defined-in-loop pitfall (PYL-W0640)
        return TermExtractionPipeline(nlp, local_value, force, *args, **kwargs)

    Language.factory(
        key,
        func=copy.copy(create_term_extraction_component),
        default_config={"force": True, "args": [], "kwargs": {}},
    )


class TermExtractionPipeline:
    """This is for adding PyATE as a spaCy pipeline component."""

    def __init__(self,
                 nlp,
                 func: Callable[..., pd.Series] = combo_basic,
                 force: bool = True,
                 *args,
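# --- Usage sketch (not from the source above) ---------------------------------
# Each key of MAPPING_TO_FUNCTION becomes a factory name; "combo_basic" is
# assumed to be one of them (it is the default `func` above). PyATE's
# convention is to expose the resulting pandas Series on the Doc under the
# pipe's name.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("combo_basic")
doc = nlp("Term extraction finds domain-specific terms in running text.")
print(doc._.combo_basic.sort_values(ascending=False).head())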
import spacy
from spacy.matcher import Matcher
from spacy_langdetect import LanguageDetector
from spacy.language import Language
import pandas as pd
import difflib
from pdfminer.high_level import extract_text  # assumed source of extract_text used below

nlp = spacy.load('en_core_web_md')
matcher = Matcher(nlp.vocab)


def create_lang_detector(nlp, name):
    return LanguageDetector()


Language.factory("language_detector", func=create_lang_detector)
nlp.add_pipe("language_detector", last=True)


def text_from_pdf(pdf_file):
    return extract_text(pdf_file)


def extract_names(text):
    doc = nlp(text)
    if doc._.language['language'] == 'en':
        nlp_new = spacy.load('en_core_web_md')
    else:
        nlp_new = spacy.load('pl_core_news_md')
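# --- Usage sketch (not from the source above) ---------------------------------
# Call shape for the helpers above; "resume.pdf" is a hypothetical path, and
# extract_names is truncated above, so only the intended flow is shown.
text = text_from_pdf("resume.pdf")
extract_names(text)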