def __init__(
    self,
    nlp,
    match_dict=None,
    forms_lookup=None,
    custom_match_hooks: Optional[ModuleType] = None,
    allow_multiple_whitespaces=False,
    lemmatizer="pyInflect",
):
    self.default_match_hooks = default_match_hooks
    self.custom_match_hooks = custom_match_hooks
    self.nlp = nlp
    self.match_dict = match_dict if match_dict else get_match_dict()
    self.forms_lookup = forms_lookup if forms_lookup else get_forms_lookup()
    self.allow_multiple_whitespaces = allow_multiple_whitespaces
    self.matcher = Matcher(self.nlp.vocab)
    self._init_matcher()
    self.spans: List[Span] = []
    self.inflector = Inflector(
        nlp=self.nlp, forms_lookup=self.forms_lookup, lemmatizer=lemmatizer
    )

    # Set custom extensions for any unexpected keys found in the match_dict.
    novel_properties = (
        seq(self.match_dict.values())
        .flat_map(lambda x: x.keys())
        .distinct()
        .difference(expected_properties)
    )
    novel_prop_defaults: Dict[str, Any] = {}
    for x in self.match_dict.values():
        for k, v in x.items():
            if k in novel_properties and k not in novel_prop_defaults:
                # Check bool before int: isinstance(True, int) is True, so the
                # int branch would otherwise shadow booleans.
                if isinstance(v, bool):
                    novel_prop_defaults[k] = False
                elif isinstance(v, str):
                    novel_prop_defaults[k] = ""
                elif isinstance(v, list):
                    novel_prop_defaults[k] = []
                elif isinstance(v, dict):
                    novel_prop_defaults[k] = {}
                elif isinstance(v, int):
                    novel_prop_defaults[k] = 0
                elif isinstance(v, float):
                    novel_prop_defaults[k] = 0.0
                else:
                    # Just default to whatever value we find.
                    novel_prop_defaults[k] = v
    for prop, default in novel_prop_defaults.items():
        Span.set_extension(prop, default=default, force=True)
    self.novel_prop_defaults = novel_prop_defaults
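# Usage sketch for the extension mechanism above. The rule name
# "sample-rule", its pattern, and the "severity" key are hypothetical, and
# the expected keys follow replaCy's match_dict schema; any key beyond the
# expected properties is registered as a Span extension whose default is
# derived from the value's type.
import spacy
from replacy import ReplaceMatcher

sketch_match_dict = {
    "sample-rule": {
        "patterns": [{"LOWER": "sampel"}],
        "suggestions": [[{"TEXT": "sample"}]],
        "severity": 2,  # novel key -> Span extension with default 0 (int)
    }
}
nlp = spacy.load("en_core_web_sm")
r_matcher = ReplaceMatcher(nlp, sketch_match_dict)
# Every span the matcher returns now exposes span._.severity,
# defaulting to 0 for rules that do not set it.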
def __init__(
    self,
    nlp,
    match_dict=None,
    forms_lookup=None,
    custom_match_hooks: Optional[ModuleType] = None,
    allow_multiple_whitespaces=False,
    max_suggestions_count=1000,
    lm_path=None,
    filter_suggestions=False,
    default_max_count=None,
    debug=False,
):
    self.debug = debug
    self.logger = logging.getLogger("replaCy")
    self.default_match_hooks = default_match_hooks
    self.custom_match_hooks = custom_match_hooks
    self.nlp = nlp
    self.match_dict = match_dict if match_dict else get_match_dict()
    self.allow_multiple_whitespaces = allow_multiple_whitespaces
    self.matcher = Matcher(self.nlp.vocab)
    self._init_matcher()
    self.spans: List[Span] = []
    self.max_suggestions_count = max_suggestions_count
    self.forms_lookup = forms_lookup if forms_lookup else get_forms_lookup()
    # Pass self.forms_lookup (not the raw parameter), so the default lookup
    # is used when none is supplied.
    self.suggestion_gen = SuggestionGenerator(
        nlp, self.forms_lookup, filter_suggestions, default_max_count
    )
    self.novel_prop_defaults = get_novel_prop_defaults(self.match_dict)
    self._set_scorer(lm_path)
    # The pipeline doesn't include the matcher, since it doesn't have the
    # signature List[Span] -> None.
    self.pipeline: List[Tuple[str, PipelineComponent]] = [
        ("sorter", self.scorer.sort_suggestions),
        ("filter", self.max_count_filter),
        ("joiner", join_suggestions),
    ]
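# Sketch of extending the pipeline above with a custom stage, using the
# List[Span] -> None signature noted in the comment. The "debug_log" stage
# is hypothetical, not part of replaCy; ReplaceMatcher(nlp) relies on the
# default match_dict, as the signature above allows.
from typing import List

import spacy
from spacy.tokens import Span

from replacy import ReplaceMatcher


def debug_log(spans: List[Span]) -> None:
    # Print each candidate span before the sorter runs.
    for span in spans:
        print(f"match: {span.text!r} [{span.start}:{span.end}]")


nlp = spacy.load("en_core_web_sm")
r_matcher = ReplaceMatcher(nlp)
r_matcher.pipeline.insert(0, ("debug_log", debug_log))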
def test_valid_format():
    match_dict = get_match_dict()
    ReplaceMatcher.validate_match_dict(match_dict)
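# The same validation applies to user-supplied rules before loading them
# into a ReplaceMatcher. A sketch: "my_rules.json" is a hypothetical path,
# and the file is assumed to follow the match_dict schema.
import json


def test_valid_custom_format():
    with open("my_rules.json") as f:
        custom_match_dict = json.load(f)
    # Expected to raise if the schema is violated.
    ReplaceMatcher.validate_match_dict(custom_match_dict)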
then call the test functions with @pytest.mark.parametrize.
That way, failures log which test case failed, not just one entry in a long list.
I would do this, but I am pretty sure I did it once a few PRs ago,
and I guess it got overwritten.
"""
import pytest
import spacy
from functional import seq

from replacy import ReplaceMatcher
from replacy.db import get_match_dict

xfail = pytest.mark.xfail

nlp = spacy.load("en_core_web_sm")

match_dict = get_match_dict()
r_matcher = ReplaceMatcher(nlp, match_dict)

rule_all_suggs_pos = []
rule_all_suggs_neg = []

for rule_name in r_matcher.match_dict:
    rule_suggestions = []
    for suggestion in r_matcher.match_dict[rule_name]["suggestions"]:
        rule_suggestions.append(" ".join([t["TEXT"] for t in suggestion]))
    rule_suggestions = (
        seq(rule_suggestions)
        .map(lambda phrase: nlp(phrase))
        .map(lambda doc: " ".join([token.lemma_ for token in doc]))
    )
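# Sketch of the parametrization the docstring proposes: collect
# (rule_name, sentence) pairs up front so pytest reports each failing case
# by name. Assumes each rule carries a "test" block with "positive"
# example sentences, as in replaCy's packaged match_dict, and that the
# matcher instance is callable on a sentence; the coarse non-empty
# assertion is illustrative, not the repo's actual check.
positive_cases = [
    (rule_name, sentence)
    for rule_name, rule in r_matcher.match_dict.items()
    for sentence in rule.get("test", {}).get("positive", [])
]


@pytest.mark.parametrize("rule_name,sentence", positive_cases)
def test_positive_example_fires(rule_name, sentence):
    # A positive example should produce at least one suggestion span.
    assert len(r_matcher(sentence)) > 0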