Esempio n. 1
0
    def __init__(
        self,
        nlp,
        match_dict=None,
        forms_lookup=None,
        custom_match_hooks: Optional[ModuleType] = None,
        allow_multiple_whitespaces=False,
        lemmatizer="pyInflect"
    ):
        """Build the matcher, the inflector and the custom ``Span`` extensions.

        Args:
            nlp: spaCy pipeline; its ``vocab`` is used to create the ``Matcher``.
            match_dict: mapping of rule name to rule properties. When falsy,
                the result of ``get_match_dict()`` is used instead.
            forms_lookup: lookup handed to the ``Inflector``. When falsy, the
                result of ``get_forms_lookup()`` is used instead.
            custom_match_hooks: optional module of user-supplied match hooks.
            allow_multiple_whitespaces: stored on the instance unchanged.
            lemmatizer: forwarded to the ``Inflector``.

        Side effects:
            Registers (with ``force=True``) one ``Span`` extension for every
            rule property that is not listed in ``expected_properties``.
        """
        self.default_match_hooks = default_match_hooks
        self.custom_match_hooks = custom_match_hooks
        self.nlp = nlp
        self.match_dict = match_dict if match_dict else get_match_dict()
        self.forms_lookup = forms_lookup if forms_lookup else get_forms_lookup()
        self.allow_multiple_whitespaces = allow_multiple_whitespaces

        self.matcher = Matcher(self.nlp.vocab)
        self._init_matcher()
        self.spans: List[Span] = []
        self.inflector = Inflector(nlp=self.nlp, forms_lookup=self.forms_lookup, lemmatizer=lemmatizer)

        # Set custom extensions for any unexpected keys found in the match_dict.
        # Materialise the keys into a set once so the membership test in the
        # loop below does not re-walk the lazy sequence for every property.
        novel_properties = set(
            seq(self.match_dict.values())
            .flat_map(lambda x: x.keys())
            .distinct()
            .difference(expected_properties)
        )
        novel_prop_defaults: Dict[str, Any] = {}
        for rule in self.match_dict.values():
            for key, value in rule.items():
                # Only the first occurrence of a novel key decides its default.
                if key not in novel_properties or key in novel_prop_defaults:
                    continue
                # bool must be tested before int: bool is a subclass of int,
                # so the int branch would otherwise swallow True/False and
                # yield a default of 0 instead of False.
                if isinstance(value, bool):
                    novel_prop_defaults[key] = False
                elif isinstance(value, str):
                    novel_prop_defaults[key] = ""
                elif isinstance(value, list):
                    novel_prop_defaults[key] = []
                elif isinstance(value, dict):
                    novel_prop_defaults[key] = {}
                elif isinstance(value, int):
                    novel_prop_defaults[key] = 0
                elif isinstance(value, float):
                    novel_prop_defaults[key] = 0.0
                else:
                    # just default to whatever value we find
                    print(key, value)
                    novel_prop_defaults[key] = value

        for prop, default in novel_prop_defaults.items():
            Span.set_extension(prop, default=default, force=True)
        self.novel_prop_defaults = novel_prop_defaults
Esempio n. 2
0
 def __init__(
     self,
     nlp,
     match_dict=None,
     forms_lookup=None,
     custom_match_hooks: Optional[ModuleType] = None,
     allow_multiple_whitespaces=False,
     max_suggestions_count=1000,
     lm_path=None,
     filter_suggestions=False,
     default_max_count=None,
     debug=False,
 ):
     """Build the matcher, suggestion generator, scorer and span pipeline.

     Args:
         nlp: spaCy pipeline; its ``vocab`` is used to create the ``Matcher``
             and it is also handed to the ``SuggestionGenerator``.
         match_dict: mapping of rule name to rule properties. When falsy,
             the result of ``get_match_dict()`` is used instead.
         forms_lookup: lookup for the ``SuggestionGenerator``. When falsy,
             the result of ``get_forms_lookup()`` is used instead.
         custom_match_hooks: optional module of user-supplied match hooks.
         allow_multiple_whitespaces: stored on the instance unchanged.
         max_suggestions_count: stored on the instance unchanged.
         lm_path: forwarded to ``self._set_scorer``.
         filter_suggestions: forwarded to the ``SuggestionGenerator``.
         default_max_count: forwarded to the ``SuggestionGenerator``.
         debug: stored on the instance unchanged.
     """
     self.debug = debug
     self.logger = logging.getLogger("replaCy")
     self.default_match_hooks = default_match_hooks
     self.custom_match_hooks = custom_match_hooks
     self.nlp = nlp
     self.match_dict = match_dict if match_dict else get_match_dict()
     self.allow_multiple_whitespaces = allow_multiple_whitespaces
     self.matcher = Matcher(self.nlp.vocab)
     self._init_matcher()
     self.spans: List[Span] = []
     self.max_suggestions_count = max_suggestions_count
     self.forms_lookup = forms_lookup if forms_lookup else get_forms_lookup()
     # Pass the resolved lookup, not the raw argument: the raw argument may
     # be None, in which case the fallback computed just above would never
     # reach the generator.
     self.suggestion_gen = SuggestionGenerator(
         nlp,
         self.forms_lookup,
         filter_suggestions,
         default_max_count,
     )
     self.novel_prop_defaults = get_novel_prop_defaults(self.match_dict)
     # The scorer must exist before the pipeline is built, because the
     # pipeline references self.scorer.sort_suggestions.
     self._set_scorer(lm_path)
     # The matcher is not part of the pipeline because it does not have the
     # signature List[Span] -> None.
     self.pipeline: List[Tuple[str, PipelineComponent]] = [
         ("sorter", self.scorer.sort_suggestions),
         ("filter", self.max_count_filter),
         ("joiner", join_suggestions),
     ]
Esempio n. 3
0
def test_valid_format():
    """Check that the dict from ``get_match_dict()`` passes validation."""
    ReplaceMatcher.validate_match_dict(get_match_dict())
Esempio n. 4
0
then call the test functions with @pytest.mark.parametrize
That way, failures log which test case failed, not just one in a long list

I would do this, but I am pretty sure I did it once a few PRs ago, and I guess it got overwritten
"""
import pytest
import spacy
from replacy import ReplaceMatcher
from replacy.db import get_match_dict
from functional import seq

# Short alias for the pytest expected-failure marker.
xfail = pytest.mark.xfail

# Load the spaCy pipeline once at import time; it is shared by the matcher
# below and by the module-level code that follows.
nlp = spacy.load("en_core_web_sm")

# Matcher under test, built from the match dict returned by get_match_dict().
match_dict = get_match_dict()
r_matcher = ReplaceMatcher(nlp, match_dict)


# Module-level accumulators; presumably filled by the rule loop that follows
# (the part that would populate them is not visible here -- confirm).
rule_all_suggs_pos = []
rule_all_suggs_neg = []

for rule_name in r_matcher.match_dict:
    rule_suggestions = []
    for suggestion in r_matcher.match_dict[rule_name]["suggestions"]:
        rule_suggestions.append(" ".join([t["TEXT"] for t in suggestion]))

    rule_suggestions = (
        seq(rule_suggestions)
        .map(lambda phrase: nlp(phrase))
        .map(lambda doc: " ".join([token.lemma_ for token in doc]))