Example #1
    def __init__(self, d: Dict, nlp) -> None:
        """
        Initialize a pattern, construct spacy token for matching according to type
        Args:
            d: Dict
            nlp

        Returns:
        """

        self.type = d["type"]
        self.in_output = tf_transfer(d["is_in_output"])
        self.max = d["maximum"]
        self.min = d["minimum"]
        self.prefix = d["prefix"]
        self.suffix = d["suffix"]
        self.full_shape = d.get("shapes")

        if self.type == "word":
            self.spacy_token_lst = self._construct_word_token(d, nlp)
        elif self.type == "shape":
            self.spacy_token_lst = self._construct_shape_token(d)
        elif self.type == "number":
            self.spacy_token_lst = self._construct_number_token(d, nlp)
        elif self.type == "punctuation":
            self.spacy_token_lst = self._construct_punctuation_token(d, nlp)
        elif self.type == "linebreak":
            self.spacy_token_lst = self._construct_linebreak_token(d)
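
The constructor only reads keys from the dictionary it is given, so a Pattern can be built directly from a plain dict. Below is a minimal sketch of such a call for a word-type specification, assuming the keys the code above accesses; the exact schema (and whether flags are encoded as booleans or as the strings "true"/"false" that tf_transfer appears to normalize) is defined elsewhere and is not shown here.

import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: any spaCy pipeline with a vocabulary and lemmas

word_pattern_spec = {
    "type": "word",
    "is_in_output": "true",      # assumption: tf_transfer accepts "true"/"false" strings
    "maximum": "1",
    "minimum": "1",
    "prefix": "",
    "suffix": "",
    "token": ["dog"],
    "match_all_forms": "true",
    "capitalization": [],
    "part_of_speech": [],
    "is_required": "true",
}

pattern = Pattern(word_pattern_spec, nlp)
print(pattern.spacy_token_lst)  # a list of spaCy Matcher token specs, e.g. one LEMMA entry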
Example #2
    def __init__(self, d: Dict, nlp) -> None:
        """
        Storing information for each Rule, create list of Pattern for a rule
        Args:
            d: Dict
            nlp

        Returns:
        """

        self.dependencies = d.get("dependencies", [])
        self.description = d.get("description", "")
        self.active = tf_transfer(d["is_active"])
        self.identifier = d["identifier"]
        self.output_format = d["output_format"]
        self.polarity = tf_transfer(d["polarity"])
        self.patterns = []
        for a_pattern in d["pattern"]:
            this_pattern = Pattern(a_pattern, nlp)
            self.patterns.append(this_pattern)
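
Read together with Example #1, a Rule is mostly a named container of Pattern objects built from its "pattern" list. A hedged sketch of the rule specification it expects, reusing word_pattern_spec from the previous example; the identifier and output_format values here are invented for illustration.

rule_spec = {
    "identifier": "dog_rule",          # hypothetical name
    "description": "match the word dog",
    "is_active": "true",
    "polarity": "true",
    "output_format": "{}",             # placeholder; the real output format syntax is not shown here
    "dependencies": [],
    "pattern": [word_pattern_spec],    # list of pattern dicts, one Pattern per entry
}

rule = Rule(rule_spec, nlp)
print(len(rule.patterns))  # -> 1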
Example #3
    def _add_common_constrain(token_lst: List[Dict], d: Dict) -> List[Dict]:
        """
        Add common constrain for every token type, like "is_required"
        Args:
            token_lst: List[Dict]
            d: Dict

        Returns: List[Dict]
        """

        result = []
        for a_token in token_lst:
            if not tf_transfer(d["is_required"]):
                a_token["OP"] = "?"
            result.append(a_token)
        return result
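
For context, "OP": "?" is the standard spaCy Matcher operator for an optional token (match zero or one occurrence), which is how a non-required pattern element is expressed. A self-contained illustration, independent of the classes above (the Matcher.add signature shown is the spaCy v3 form; in v2 it is matcher.add("OPT_DEMO", None, pattern)):

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# "very" carries "OP": "?", so both phrases below match.
pattern = [{"LOWER": "a"}, {"LOWER": "very", "OP": "?"}, {"LOWER": "good"}, {"LOWER": "dog"}]
matcher.add("OPT_DEMO", [pattern])

for text in ("a good dog", "a very good dog"):
    print(text, "->", bool(matcher(nlp(text))))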
Example #4
    def _construct_word_token(self, d: Dict, nlp) -> List[Dict]:
        """
        Construct a word token
        Args:
            d: Dict
            nlp

        Returns: List[Dict]
        """

        result = []
        if len(d["token"]) == 1:
            if tf_transfer(d["match_all_forms"]):
                this_token = {attrs.LEMMA: nlp(d["token"][0])[0].lemma_}
            else:
                this_token = {attrs.LOWER: d["token"][0].lower()}
            result.append(this_token)
            if d["capitalization"]:
                result = self._add_capitalization_constrain(result, d["capitalization"], d["token"])

        elif not d["token"]:
            if tf_transfer(d["contain_digit"]):
                this_token = {attrs.IS_ASCII: True, attrs.IS_PUNCT: False}
            else:
                this_token = {attrs.IS_ALPHA: True}
            if tf_transfer(d["is_out_of_vocabulary"]) and not tf_transfer(d["is_in_vocabulary"]):
                this_token[attrs.IS_OOV] = True
            elif not tf_transfer(d["is_out_of_vocabulary"]) and tf_transfer(d["is_in_vocabulary"]):
                this_token[attrs.IS_OOV] = False
            result.append(this_token)
            if d["length"]:
                result = self._add_length_constrain(result, d["length"])
            if d["capitalization"]:
                result = self._add_capitalization_constrain(result, d["capitalization"], d["token"])

        else:
            if "match_all_forms" in d and not tf_transfer(d["match_all_forms"]):
                global FLAG_ID
                token_set = set(d["token"])

                def is_selected_token(x):
                    return x in token_set

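                # spaCy v2 API: register a custom boolean flag on the vocabulary; the returned
                # flag id can then be used as a Matcher attribute that is True only for the
                # listed surface forms.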
                FLAG_DICT[FLAG_ID] = nlp.vocab.add_flag(is_selected_token)
                this_token = {FLAG_DICT[FLAG_ID]: True}
                FLAG_ID += 1
                result.append(this_token)

            else:
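                # match_all_forms: match on lemmas so that inflected forms ("dog", "dogs") all hit.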
                token_set = [nlp(x)[0].lemma_ for x in set(d["token"])]
                for a_lemma in token_set:
                    this_token = {attrs.LEMMA: a_lemma}
                    result.append(this_token)

            if d["capitalization"]:
                result = self._add_capitalization_constrain(result, d["capitalization"], d["token"])

        result = self._add_common_constrain(result, d)
        if d["part_of_speech"]:
            result = self._add_pos_constrain(result, d["part_of_speech"])

        return result
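
To make the three branches above concrete, the sketch below lists word-pattern specifications that exercise each one, with the roughly expected result as comments. The attribute keys in the real output are the spacy.attrs constants used in the code, and the values shown assume tf_transfer normalizes "true"/"false" strings.

# Single token with match_all_forms -> match on the lemma of the token.
single = {"token": ["dogs"], "match_all_forms": "true", "capitalization": [],
          "is_required": "true", "part_of_speech": []}
# roughly: [{LEMMA: "dog"}]

# Empty token list -> a wildcard word, here also made optional via is_required.
wildcard = {"token": [], "contain_digit": "false", "is_out_of_vocabulary": "false",
            "is_in_vocabulary": "false", "length": [], "capitalization": [],
            "is_required": "false", "part_of_speech": []}
# roughly: [{IS_ALPHA: True, "OP": "?"}]

# Several tokens with match_all_forms -> one alternative token spec per lemma.
multi = {"token": ["cat", "dog"], "match_all_forms": "true", "capitalization": [],
         "is_required": "true", "part_of_speech": []}
# roughly: [{LEMMA: "cat"}, {LEMMA: "dog"}]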