Example #1
    def try_merge_modifier_token(self,
                                 extract_result: ExtractResult,
                                 pattern: Pattern,
                                 source: str,
                                 potential_ambiguity: bool = False) -> bool:
        """Try to absorb a modifier token adjacent to extract_result.

        Looks for a modifier match immediately before the extraction (and,
        when config.check_both_before_after is set, immediately after it).
        On success the extraction is widened in place, its metadata updated,
        and True is returned.

        :param extract_result: extraction to (possibly) widen in place.
        :param pattern: modifier token pattern given to has_token_index.
        :param source: full original text the extraction came from.
        :param potential_ambiguity: when True, skip merging for ambiguous
            ranges, e.g. the "from" of "from ... to ..." must not add a mod.
        :return: True when a modifier token was merged, else False.
        """
        before_str = source[0:extract_result.start]
        # BUG FIX: the text *after* the extraction starts at start + length.
        # The original slice source[start:length] was empty (or wrong)
        # whenever length <= start, so the after-check below never matched.
        after_str = source[extract_result.start + extract_result.length:]

        # Avoid adding mod for ambiguity cases, such as "from" in
        # "from ... to ..." should not add mod.
        if potential_ambiguity and self.config.ambiguous_range_modifier_prefix and \
                regex.search(self.config.ambiguous_range_modifier_prefix, before_str):
            matches = list(
                regex.finditer(self.config.potential_ambiguous_range_regex,
                               source))
            if matches:
                # True when any ambiguous-range match overlaps the extraction.
                return any(match.start() < extract_result.start +
                           extract_result.length
                           and match.end() > extract_result.start
                           for match in matches)

        token = self.has_token_index(before_str.strip(), pattern)
        if token.matched:
            # Widen the extraction leftwards to swallow the modifier token.
            mod_len = len(before_str) - token.index
            extract_result.length += mod_len
            extract_result.start -= mod_len
            extract_result.text = source[extract_result.
                                         start:extract_result.start +
                                         extract_result.length]

            extract_result.meta_data = self.assign_mod_metadata(
                extract_result.meta_data)
            return True
        elif self.config.check_both_before_after:
            # Also look for a modifier token after the extraction.
            token = self.has_token_index(after_str.strip(), pattern)
            if token.matched:
                # Widen rightwards: token end plus any leading whitespace
                # that strip() removed from after_str.
                mod_len = token.index + len(after_str) - len(after_str.strip())
                extract_result.length += mod_len
                extract_result.text = source[extract_result.
                                             start:extract_result.start +
                                             extract_result.length]
                extract_result.data = Constants.HAS_MOD
                extract_result.meta_data = self.assign_mod_metadata(
                    extract_result.meta_data)

                return True

        return False
Example #2
    def try_merge_modifier_token(self,
                                 er: ExtractResult,
                                 pattern: Pattern,
                                 source: str,
                                 potentialAmbiguity: bool = False) -> bool:
        """Try to absorb a modifier token preceding er into the extraction.

        When a modifier token is found immediately before the extraction,
        er is widened in place (start, length, text), its metadata updated,
        and True is returned.

        :param er: extraction to (possibly) widen in place.
        :param pattern: modifier token pattern given to has_token_index.
        :param source: full original text the extraction came from.
        :param potentialAmbiguity: when True, ambiguous ranges are filtered
            instead of merged, e.g. "from" in "from ... to ..." must not
            add a mod.
        :return: True when a modifier token was merged, else the result of
            the ambiguity filter or False.
        """
        before_str = source[0:er.start]

        # Avoid adding mod for ambiguity cases, such as "from" in
        # "from ... to ..." should not add mod.
        if potentialAmbiguity and self.config.ambiguous_range_modifier_prefix and regex.search(
                self.config.ambiguous_range_modifier_prefix, before_str):
            matches = list(
                regex.finditer(self.config.potential_ambiguous_range_regex,
                               source))
            # A non-empty list is truthy; `matches and len(matches)` was
            # redundant.
            if matches:
                return self._filter_item(er, matches)

        token = self.has_token_index(before_str.strip(), pattern)
        if token.matched:
            # Widen the extraction leftwards to swallow the modifier token.
            mod_len = len(before_str) - token.index
            er.length += mod_len
            er.start -= mod_len
            er.text = source[er.start:er.start + er.length]

            er.meta_data = self.assign_mod_metadata(er.meta_data)
            return True

        return False
Example #3
def merge_all_tokens(tokens: List[Token], source: str,
                     extractor_name: str) -> List[ExtractResult]:
    """Resolve overlapping tokens and convert the survivors to results.

    A candidate that lies entirely within, or starts strictly inside, an
    already accepted token is dropped; a candidate that fully covers an
    accepted token replaces it. Each surviving token is then materialised
    as an ExtractResult tagged with extractor_name.
    """
    accepted: List[Token] = []

    # Process candidates left-to-right; None entries are discarded.
    for candidate in sorted(filter(None, tokens), key=lambda t: t.start):
        keep = True

        for pos, existing in enumerate(accepted):
            if not keep:
                break

            # Candidate is fully contained in an accepted token: drop it.
            if existing.start <= candidate.start and candidate.end <= existing.end:
                keep = False

            # Candidate starts strictly inside an accepted token: drop it.
            if existing.start < candidate.start < existing.end:
                keep = False

            # Candidate fully covers an accepted token: take its place.
            if candidate.start <= existing.start and existing.end <= candidate.end:
                keep = False
                accepted[pos] = candidate

        if keep:
            accepted.append(candidate)

    results = []
    for merged in accepted:
        extracted = ExtractResult()
        extracted.start = merged.start
        extracted.length = merged.length
        extracted.text = source[merged.start:merged.start + merged.length]
        extracted.type = extractor_name
        extracted.data = None
        extracted.meta_data = merged.metadata
        results.append(extracted)

    return results