def __init__(self, d: Dict, nlp) -> None:
    """
    Initialize a pattern, construct spacy token for matching according to type

    Args:
        d: Dict
        nlp

    Returns:
    """

    self.type = d["type"]
    self.in_output = tf_transfer(d["is_in_output"])
    self.max = d["maximum"]
    self.min = d["minimum"]
    self.prefix = d["prefix"]
    self.suffix = d["suffix"]
    self.full_shape = d.get("shapes")

    if self.type == "word":
        self.spacy_token_lst = self._construct_word_token(d, nlp)
    elif self.type == "shape":
        self.spacy_token_lst = self._construct_shape_token(d)
    elif self.type == "number":
        self.spacy_token_lst = self._construct_number_token(d, nlp)
    elif self.type == "punctuation":
        self.spacy_token_lst = self._construct_punctuation_token(d, nlp)
    elif self.type == "linebreak":
        self.spacy_token_lst = self._construct_linebreak_token(d)
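# Illustrative only: a minimal "word"-type pattern dict of the shape consumed by the
# __init__ above and by _construct_word_token below. Only keys actually read in this
# file are listed; the concrete values (including the "true"/"false" strings passed
# through tf_transfer) are assumptions, not taken from the project's documentation.
_EXAMPLE_WORD_PATTERN = {
    "type": "word",
    "token": ["university"],
    "match_all_forms": "true",
    "capitalization": [],
    "part_of_speech": [],
    "length": [],
    "contain_digit": "false",
    "is_in_vocabulary": "true",
    "is_out_of_vocabulary": "false",
    "is_required": "true",
    "is_in_output": "true",
    "maximum": "",
    "minimum": "",
    "prefix": "",
    "suffix": "",
    "shapes": [],
}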
def __init__(self, d: Dict, nlp) -> None:
    """
    Store the information for each Rule and create the list of Patterns for the rule

    Args:
        d: Dict
        nlp

    Returns:
    """

    self.dependencies = d["dependencies"] if "dependencies" in d else []
    self.description = d["description"] if "description" in d else ""
    self.active = tf_transfer(d["is_active"])
    self.identifier = d["identifier"]
    self.output_format = d["output_format"]
    self.polarity = tf_transfer(d["polarity"])
    self.patterns = []
    for a_pattern in d["pattern"]:
        this_pattern = Pattern(a_pattern, nlp)
        self.patterns.append(this_pattern)
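# Illustrative only: a rule dict wrapping the pattern sketch above. The keys mirror
# what this __init__ reads ("dependencies" and "description" are optional); the
# values, including the empty output_format placeholder, are assumptions.
_EXAMPLE_RULE = {
    "identifier": "example_rule",
    "description": "match the word 'university' in any form",
    "is_active": "true",
    "polarity": "true",
    "output_format": "",
    "dependencies": [],
    "pattern": [_EXAMPLE_WORD_PATTERN],  # see the pattern sketch after Pattern.__init__ above
}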
@staticmethod
def _add_common_constrain(token_lst: List[Dict], d: Dict) -> List[Dict]:
    """
    Add the common constraints shared by every token type, like "is_required"

    Args:
        token_lst: List[Dict]
        d: Dict

    Returns: List[Dict]
    """

    result = []
    for a_token in token_lst:
        if not tf_transfer(d["is_required"]):
            # optional token: "?" lets the spaCy Matcher match it zero or one time
            a_token["OP"] = "?"
        result.append(a_token)

    return result
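# Sketch of the effect of _add_common_constrain (values are illustrative): when the
# rule marks the token as not required, the spaCy Matcher operator "OP": "?" is added
# so the token may match zero or one time, e.g.
#   _add_common_constrain([{attrs.LOWER: "university"}], {"is_required": "false"})
#   -> [{attrs.LOWER: "university", "OP": "?"}]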
def _construct_word_token(self, d: Dict, nlp) -> List[Dict]:
    """
    Construct a word token

    Args:
        d: Dict
        nlp

    Returns: List[Dict]
    """

    result = []

    if len(d["token"]) == 1:
        # a single surface form: match its lemma (all forms) or its lowercased text
        if tf_transfer(d["match_all_forms"]):
            this_token = {attrs.LEMMA: nlp(d["token"][0])[0].lemma_}
        else:
            this_token = {attrs.LOWER: d["token"][0].lower()}
        result.append(this_token)
        if d["capitalization"]:
            result = self._add_capitalization_constrain(result, d["capitalization"], d["token"])

    elif not d["token"]:
        # no surface form given: match any word, optionally constrained by digits,
        # vocabulary membership, length and capitalization
        if tf_transfer(d["contain_digit"]):
            this_token = {attrs.IS_ASCII: True, attrs.IS_PUNCT: False}
        else:
            this_token = {attrs.IS_ALPHA: True}
        if tf_transfer(d["is_out_of_vocabulary"]) and not tf_transfer(d["is_in_vocabulary"]):
            this_token[attrs.IS_OOV] = True
        elif not tf_transfer(d["is_out_of_vocabulary"]) and tf_transfer(d["is_in_vocabulary"]):
            this_token[attrs.IS_OOV] = False
        result.append(this_token)
        if d["length"]:
            result = self._add_length_constrain(result, d["length"])
        if d["capitalization"]:
            result = self._add_capitalization_constrain(result, d["capitalization"], d["token"])

    else:
        # several surface forms
        if "match_all_forms" in d and not tf_transfer(d["match_all_forms"]):
            # exact forms only: register a custom lexeme flag that is True for any
            # token text in the given set, and match on that flag
            global FLAG_ID
            token_set = set(d["token"])

            def is_selected_token(x):
                return x in token_set

            FLAG_DICT[FLAG_ID] = nlp.vocab.add_flag(is_selected_token)
            this_token = {FLAG_DICT[FLAG_ID]: True}
            FLAG_ID += 1
            result.append(this_token)
        else:
            # match all forms: one alternative token spec per distinct lemma
            token_set = [nlp(x)[0].lemma_ for x in set(d["token"])]
            for a_lemma in token_set:
                this_token = {attrs.LEMMA: a_lemma}
                result.append(this_token)
        if d["capitalization"]:
            result = self._add_capitalization_constrain(result, d["capitalization"], d["token"])

    result = self._add_common_constrain(result, d)
    if d["part_of_speech"]:
        result = self._add_pos_constrain(result, d["part_of_speech"])

    return result
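# Sketch of the custom-flag trick used in _construct_word_token, assuming a spaCy
# version that still provides Vocab.add_flag (it was removed in spaCy v3): a boolean
# lexeme flag is registered that is True only for the chosen surface forms, and the
# returned flag id is used as the attribute key of the token spec, e.g.
#   flag_id = nlp.vocab.add_flag(lambda text: text in {"cat", "dog"})  # illustrative
#   token_spec = {flag_id: True}  # matches any token whose text is "cat" or "dog"
# FLAG_DICT / FLAG_ID keep the mapping from this module's counter to the spaCy flag id.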