def _add_custom_spacy_extensions(self):
        """Register the custom getter extensions on Token, Span and Doc.

        Every name is looked up on its target class first and is only
        registered when that lookup returns None, so an extension that is
        already present is left untouched.
        """
        def install(target, name, getter):
            # Skip names the target class already has an extension for.
            if target.get_extension(name) is None:
                target.set_extension(name, getter=getter, force=True)

        # "is" attributes are getter extensions on Token.
        for name, getter in self.is_attrs_name2func:
            install(Token, name, getter)

        # "has" attributes are installed on both Span and Doc.
        for target in (Span, Doc):
            for name, getter in self.has_attrs_name2func:
                install(target, name, getter)

        # Attribute getters for Span (i.e. Doc.ents).
        for name, getter in self.get_attrs_name2func:
            install(Span, name, getter)
Example #2
0
    def __init__(self,
                 nlp: Language,
                 attr: str = "spaczz_ent",
                 **cfg: Any) -> None:
        """Initialize the spaczz ruler with a Language object and cfg parameters.

        All spaczz ruler cfg parameters are prepended with "spaczz_".
        If spaczz_patterns is supplied here, they need to be a list of spaczz patterns:
        dictionaries with a "label", "pattern", "type", and optional "kwargs" key.
        For example:
        {'label': 'ORG', 'pattern': 'Apple', 'type': 'fuzzy', 'kwargs': {'min_r2': 90}}.

        Args:
            nlp: The shared nlp object to pass the vocab to the matchers
                (not currently used by spaczz matchers) and process fuzzy patterns.
            attr: Name of custom Span attribute that denotes whether an
                entity was added via the spaczz ruler or not.
                Default is "spaczz_ent".
            **cfg: Other config parameters. The SpaczzRuler makes heavy use
                of cfg to pass additional parameters down to the matchers.
                spaczz config parameters start with "spaczz_" to keep them
                from colliding with other cfg components.
                SpaczzRuler cfg components include (with "spaczz_" prepended to them):
                overwrite_ents (bool): Whether to overwrite existing Doc.ents
                    with new matches. Default is False.
                ent_id_sep (str): String to separate entity labels and ids on.
                    Default is DEFAULT_ENT_ID_SEP.
                regex_config (Union[str, RegexConfig]): Config to use with the
                    regex matcher. Default is "default". See RegexMatcher/RegexSearcher
                    documentation for available parameter details.
                fuzzy_defaults (Dict[str, Any]): Modified default parameters to use with
                    the fuzzy matcher. Default is an empty dictionary -
                    utilizing defaults.
                regex_defaults (Dict[str, Any]): Modified default parameters to use with
                    the regex matcher. Default is an empty dictionary -
                    utilizing defaults. See RegexMatcher/RegexSearcher documentation
                    for parameter details.
                patterns (Iterable[Dict[str, Any]]): Patterns to initialize
                    the ruler with. Default is None.
                If SpaczzRuler is loaded as part of a model pipeline,
                cfg will include all keyword arguments passed to spacy.load.

        Raises:
            TypeError: If spaczz_fuzzy_defaults or spaczz_regex_defaults is
                supplied in cfg and is not a dictionary.
        """
        # Register the flag attribute only when the lookup finds no existing
        # extension under that name.
        if not Span.get_extension(attr):
            Span.set_extension(attr, default=False)
        self.nlp = nlp
        self.fuzzy_patterns: DefaultDict[str, DefaultDict[
            str, Any]] = defaultdict(lambda: defaultdict(list))
        self.regex_patterns: DefaultDict[str, DefaultDict[
            str, Any]] = defaultdict(lambda: defaultdict(list))
        self.ent_id_sep = cfg.get("spaczz_ent_id_sep", DEFAULT_ENT_ID_SEP)
        self._ent_ids: Dict[Any, Any] = defaultdict(dict)
        self.overwrite = cfg.get("spaczz_overwrite_ents", False)

        # Collect the per-matcher default overrides, validating their type.
        default_names = ("spaczz_fuzzy_defaults", "spaczz_regex_defaults")
        self.defaults = {}
        for name in default_names:
            if name not in cfg:
                continue
            if not isinstance(cfg[name], dict):
                # Adjacent literals are concatenated into ONE message string;
                # passing a tuple here would render as a tuple repr instead.
                raise TypeError(
                    "Defaults must be a dictionary of keyword arguments, "
                    f"not {type(cfg[name])}."
                )
            self.defaults[name] = cfg[name]

        self.fuzzy_matcher = FuzzyMatcher(
            nlp.vocab,
            **self.defaults.get("spaczz_fuzzy_defaults", {}),
        )
        self.regex_matcher = RegexMatcher(
            nlp.vocab,
            cfg.get("spaczz_regex_config", "default"),
            **self.defaults.get("spaczz_regex_defaults", {}),
        )
        patterns = cfg.get("spaczz_patterns")
        if patterns is not None:
            self.add_patterns(patterns)