def extract(self, source: str) -> List[ExtractResult]:
        """Return one ``ExtractResult`` per contiguous matched span of ``source``.

        Every pattern in ``self.regexes`` is run over ``source`` and the
        characters it covers are flagged.  Each maximal run of flagged
        characters is reported only when a single regex match spans exactly
        that run; the value paired with that match's pattern becomes the
        result's ``data``.

        A run whose stripped text starts (or ends) with
        ``Constants.IPV6_ELLIPSIS`` is skipped when the character right before
        (or right after) the run is a digit or a non-CJK letter.

        :param source: text to scan.
        :return: results in order of appearance; an empty list when
            ``self._pre_check_str(source)`` is false.
        """
        result: List[ExtractResult] = list()
        if not self._pre_check_str(source):
            return result

        matched: List[bool] = [False] * len(source)

        # Keep the value configured for each pattern next to its matches.
        match_source: Dict[Match, str] = dict()
        for entry in self.regexes:
            for m in re.finditer(entry.re, source):
                matched[m.start():m.end()] = [True] * (m.end() - m.start())
                match_source[m] = entry.val

        def glued_to_word(ch: str) -> bool:
            # True for a digit or a non-CJK letter.
            return ch.isdigit() or (
                ch.isalpha() and not SimpleTokenizer().is_cjk(c=ch))

        last = -1
        for i in range(len(source)):
            if not matched[i]:
                last = i
                continue
            if i + 1 < len(source) and matched[i + 1]:
                # Still inside the current run; act only on its last char.
                continue

            start = last + 1
            length = i - last
            substring = source[start:start + length].strip()

            if substring.startswith(Constants.IPV6_ELLIPSIS) and start > 0 \
                    and glued_to_word(source[start - 1]):
                continue

            # The original ran the CJK test on source[start - 1] here, i.e. on
            # the character before the run, while the digit/letter test used
            # the character after it.  All three tests now look at the
            # character that follows the run.
            if substring.endswith(Constants.IPV6_ELLIPSIS) \
                    and i + 1 < len(source) \
                    and glued_to_word(source[i + 1]):
                continue

            src_match = next(
                (x for x in match_source
                 if x.start() == start and x.end() - x.start() == length),
                None)

            if src_match is not None:
                value = ExtractResult()
                value.start = start
                value.length = length
                value.text = substring
                value.type = self._extract_type
                value.data = match_source.get(src_match, None)
                result.append(value)
        return result
Beispiel #2
0
    def strip_inequality(extract_result: ExtractResult, regexp: Pattern, in_prefix: bool):
        """Remove the part of ``extract_result.text`` matched by ``regexp``.

        Nothing happens when ``regexp`` does not match the text.  Otherwise
        the result is updated in place: every match is removed, the text is
        stripped of surrounding whitespace, ``length`` is set to the new text
        length and ``data`` is reset to ``''``.

        :param extract_result: result to modify in place.
        :param regexp: pattern describing the text to remove.
        :param in_prefix: when true, ``start`` is advanced by the number of
            characters removed.
        """
        if regex.search(regexp, extract_result.text):
            original_length = len(extract_result.text)
            # The original called str(regexp).replace(text, ''), which edits
            # the pattern's string representation rather than the text.
            extract_result.text = regex.sub(
                regexp, '', extract_result.text).strip()
            if in_prefix:
                extract_result.start += original_length - len(extract_result.text)

            extract_result.length = len(extract_result.text)
            extract_result.data = ''
    def extract(self, source: str):
        """Extract choice matches from ``source`` and score them.

        Each pattern in ``self.config.regexes_map`` is matched against the
        lower-cased source.  A match is kept when ``self.match_value`` yields
        a score above zero for at least one token position; the best score is
        stored in a ``ChoiceExtractDataResult`` on the result's ``data``.

        :param source: text to scan; ``None`` or blank text yields ``[]``.
        :return: all kept results sorted by ``start``, or only the
            best-scoring one when ``self.config.only_top_match`` is set.
        """
        results: List[ExtractResult] = list()

        # Guard before touching ``source``: the original lower-cased it first,
        # so ``None`` raised AttributeError instead of returning [].
        if source is None or source.strip() == '':
            return results

        partial_results: List[ExtractResult] = list()
        trimmed_source = source.lower()
        source_tokens = self.__tokenize(trimmed_source)

        for (regexp, type_extracted) in self.config.regexes_map.items():
            for match in RegExpUtility.get_matches(regexp, trimmed_source):
                match_tokens = self.__tokenize(match)
                top_score = 0.0

                for i in range(len(source_tokens)):
                    score = self.match_value(source_tokens, match_tokens, i)
                    top_score = max(top_score, score)

                if top_score > 0.0:
                    value = ExtractResult()
                    # NOTE(review): index() returns the first occurrence, so
                    # repeated occurrences of the same text share one start.
                    start = trimmed_source.index(match)
                    length = len(match)
                    value.start = start
                    value.length = length
                    value.text = source[start:start + length].strip()
                    value.type = type_extracted
                    value.data = ChoiceExtractDataResult(source, top_score)

                    partial_results.append(value)

        if len(partial_results) == 0:
            return results

        partial_results = sorted(partial_results, key=lambda res: res.start)

        if self.config.only_top_match:
            top_score = 0.0
            top_result_index = 0
            for i, partial in enumerate(partial_results):
                data = ChoiceExtractDataResult(source, partial.data.score)
                if data.score > top_score:
                    top_score = data.score
                    top_result_index = i

            # NOTE(review): ``top_result`` is a fresh object that is never
            # attached to the returned item, so ``other_matches`` is not
            # visible to callers.  Kept as in the original; confirm intent.
            top_result = ChoiceExtractDataResult(
                partial_results[top_result_index].data.source,
                partial_results[top_result_index].data.score)
            top_result.other_matches = partial_results
            results.append(partial_results[top_result_index])
        else:
            results = partial_results

        return results
Beispiel #4
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract spans of ``source`` matched by ``self.regexes``.

        Characters covered by any match are flagged; each maximal flagged run
        is reported when one regex match spans exactly that run.  When
        ``self._negative_number_terms`` is set and matches the text before a
        run, the reported span is extended back to the start of that match.
        The collected results are passed through ``self._filter_ambiguity``
        before being returned.

        :param source: text to scan; ``None`` or blank text yields ``[]``.
        :return: the filtered extraction results.
        """
        # ``== 0`` instead of ``is 0``: identity comparison with an int
        # literal relies on interpreter caching and emits a SyntaxWarning.
        if source is None or len(source.strip()) == 0:
            return list()
        result: List[ExtractResult] = list()
        match_source = dict()
        matched: List[bool] = [False] * len(source)

        for entry in self.regexes:
            for m in regex.finditer(entry.re, source):
                matched[m.start():m.end()] = [True] * (m.end() - m.start())
                # Keep Source Data for extra information
                match_source[m] = entry.val

        last = -1
        for i in range(len(source)):
            if not matched[i]:
                last = i
                continue
            if i + 1 < len(source) and matched[i + 1]:
                # Not at the end of the run yet.
                continue

            start = last + 1
            length = i - last
            substr = source[start:start + length].strip()
            # Looked up with the run's own bounds, before any negative-term
            # extension changes ``start`` and ``length``.
            src_match = next(
                (x for x in match_source
                 if x.start() == start and x.end() - x.start() == length),
                None)

            # extract negative numbers
            if self._negative_number_terms is not None:
                match = regex.search(self._negative_number_terms,
                                     source[0:start])
                if match is not None:
                    start = match.start()
                    length = length + match.end() - match.start()
                    substr = source[start:start + length].strip()

            if src_match is not None:
                value = ExtractResult()
                value.start = start
                value.length = length
                value.text = substr
                value.type = self._extract_type
                value.data = match_source.get(src_match, None)
                result.append(value)

        result = self._filter_ambiguity(result, source)
        return result
    def try_merge_modifier_token(self,
                                 extract_result: ExtractResult,
                                 pattern: Pattern,
                                 source: str,
                                 potential_ambiguity: bool = False) -> bool:
        """Try to absorb a modifier token next to ``extract_result``.

        The text before the result is checked with ``self.has_token_index``;
        on a hit the result's ``start``, ``length`` and ``text`` are widened
        to include the token and ``meta_data`` is updated through
        ``self.assign_mod_metadata``.  When nothing is found before and
        ``self.config.check_both_before_after`` is set, the text after the
        result is checked as well.

        :param extract_result: result to widen in place.
        :param pattern: modifier pattern handed to ``has_token_index``.
        :param source: full text the result was extracted from.
        :param potential_ambiguity: when true and the text before the result
            matches ``self.config.ambiguous_range_modifier_prefix``, the
            outcome is decided by whether any
            ``potential_ambiguous_range_regex`` match overlaps the result.
        :return: ``True`` when a modifier was merged (or, in the ambiguity
            branch, when an overlapping match exists), else ``False``.
        """
        before_str = source[0:extract_result.start]

        # Avoid adding mod for ambiguity cases, such as "from" in "from ... to ..." should not add mod
        if potential_ambiguity and self.config.ambiguous_range_modifier_prefix and \
                regex.search(self.config.ambiguous_range_modifier_prefix, before_str):
            matches = list(
                regex.finditer(self.config.potential_ambiguous_range_regex,
                               source))
            if matches:
                result_end = extract_result.start + extract_result.length
                return any(match.start() < result_end
                           and match.end() > extract_result.start
                           for match in matches)

        token = self.has_token_index(before_str.strip(), pattern)
        if token.matched:
            # ``token.index`` is relative to the stripped text, so add back
            # the leading whitespace that strip() removed.  The original
            # ignored it and moved ``start`` too far left.
            leading_ws = len(before_str) - len(before_str.lstrip())
            mod_len = len(before_str) - (token.index + leading_ws)
            extract_result.length += mod_len
            extract_result.start -= mod_len
            extract_result.text = source[extract_result.
                                         start:extract_result.start +
                                         extract_result.length]

            extract_result.meta_data = self.assign_mod_metadata(
                extract_result.meta_data)
            return True
        elif self.config.check_both_before_after:
            # check also after_str
            # The original sliced source[start:length], which is not the text
            # following the result.
            after_str = source[extract_result.start + extract_result.length:]
            token = self.has_token_index(after_str.strip(), pattern)
            if token.matched:
                # Only leading whitespace sits between the result and the
                # stripped text, so only that is added to the offset.
                # NOTE(review): assumes ``token.index`` is the offset to
                # extend by for a trailing token -- confirm against
                # ``has_token_index``.
                mod_len = token.index + len(after_str) - len(after_str.lstrip())
                extract_result.length += mod_len
                extract_result.text = source[extract_result.
                                             start:extract_result.start +
                                             extract_result.length]
                extract_result.data = Constants.HAS_MOD
                extract_result.meta_data = self.assign_mod_metadata(
                    extract_result.meta_data)

                return True

        return False
    def extract(self, source: str) -> List[ExtractResult]:
        """Collect the spans of ``source`` covered by valid regex matches.

        Matches of ``self.regexes`` that pass ``self._is_valid_match`` flag
        the characters they cover.  Every maximal flagged run is emitted as
        an ``ExtractResult`` when one accepted match spans exactly that run;
        ``data`` carries the value paired with that match's pattern.

        :param source: text to scan.
        :return: results in order of appearance; an empty list when
            ``self._pre_check_str(source)`` is false.
        """
        extracted: List[ExtractResult] = []
        if not self._pre_check_str(source):
            return extracted

        total = len(source)
        covered: List[bool] = [False] * total
        value_by_match: Dict[Match, str] = {}

        # Run every pattern first, then validate and record the hits.
        hits_per_pattern = [(list(re.finditer(entry.re, source)), entry.val)
                            for entry in self.regexes]
        for hits, pattern_value in hits_per_pattern:
            for hit in hits:
                if not self._is_valid_match(hit):
                    continue
                covered[hit.start():hit.end()] = \
                    [True] * (hit.end() - hit.start())
                # Keep Source Data for extra information
                value_by_match[hit] = pattern_value

        pos = 0
        while pos < total:
            if not covered[pos]:
                pos += 1
                continue

            # Walk to the end of the current flagged run.
            run_start = pos
            while pos < total and covered[pos]:
                pos += 1
            run_length = pos - run_start

            exact = None
            for candidate in value_by_match:
                if candidate.start() == run_start and \
                        candidate.end() - candidate.start() == run_length:
                    exact = candidate
                    break
            if exact is None:
                continue

            item = ExtractResult()
            item.start = run_start
            item.length = run_length
            item.text = source[run_start:run_start + run_length].strip()
            item.type = self._extract_type
            item.data = value_by_match.get(exact, None)
            extracted.append(item)

        return extracted
Beispiel #7
0
    def parse(self, source: ExtractResult) -> Optional[ParseResult]:
        """Parse ``source`` with the parent parser and add a ``%`` suffix.

        When ``source.data`` is a list, ``source`` is rewritten in place
        before parsing: its text becomes the first list item and its data
        becomes the ``data`` of the second item.  A non-empty
        ``resolution_str`` that does not end in ``%`` is stripped and gets
        ``%`` appended.  The returned result keeps the original text in
        ``text`` and the text that was parsed in ``data``.
        """
        original_text = source.text

        # do replace text & data from extended info
        extended = source.data
        if isinstance(extended, list):
            source.text = extended[0]
            source.data = extended[1].data

        parsed: ParseResult = super().parse(source)

        resolution = parsed.resolution_str
        if resolution:
            trimmed = resolution.strip()
            if not trimmed.endswith('%'):
                parsed.resolution_str = trimmed + '%'

        parsed.data = source.text
        parsed.text = original_text

        return parsed
def merge_all_tokens(tokens: List[Token], source: str,
                     extractor_name: str) -> List[ExtractResult]:
    """Resolve overlapping tokens and turn the survivors into results.

    Falsy entries are dropped and the rest are visited in ``start`` order.
    A token is discarded when it lies inside, or starts strictly inside, a
    token already kept.  A token that covers a kept token replaces it.
    Each surviving token becomes an ``ExtractResult`` typed
    ``extractor_name`` whose text is the matching slice of ``source``.
    """
    kept: List[Token] = []
    ordered = sorted(filter(None, tokens), key=lambda t: t.start)

    for candidate in ordered:
        is_new = True

        for slot, existing in enumerate(kept):
            contained = (candidate.start >= existing.start
                         and candidate.end <= existing.end)
            starts_inside = existing.start < candidate.start < existing.end
            covers = (candidate.start <= existing.start
                      and candidate.end >= existing.end)

            if covers:
                # The wider (or identical-span) token takes over the slot.
                kept[slot] = candidate

            if contained or starts_inside or covers:
                is_new = False
                break

        if is_new:
            kept.append(candidate)

    extracted = []
    for survivor in kept:
        begin = survivor.start
        size = survivor.length

        item = ExtractResult()
        item.start = begin
        item.length = size
        item.text = source[begin:begin + size]
        item.type = extractor_name
        item.data = None
        item.meta_data = survivor.metadata
        extracted.append(item)

    return extracted
    def __merged_compound_units(self, source: str):
        """Extract number-with-unit results and merge adjacent ones into groups.

        The results of ``NumberWithUnitExtractor(self.config).extract`` are
        passed through ``self.__merge_pure_number``.  Neighbouring results are
        then assigned group ids; results sharing an id are merged into one
        result whose ``data`` is the list of its members.  Groups with a single
        member are unwrapped again and results of type ``Constants.SYS_NUM``
        are dropped.

        :param source: text to extract from.
        :return: list of merged extraction results.
        """
        ers = NumberWithUnitExtractor(self.config).extract(source)
        ers = self.__merge_pure_number(source, ers)

        result = []
        # groups[i] is the group id of ers[i]; equal neighbouring ids get merged.
        groups = [0] * len(ers)

        # Pass 1: decide for each neighbouring pair whether it shares a group.
        idx = 0
        while idx < len(ers) - 1:
            # Different types and neither side is a plain number: skip the pair.
            # groups[idx + 1] is left at its initial value of 0 here.
            if ers[idx].type != ers[idx + 1].type and not ers[idx].type == Constants.SYS_NUM and not ers[idx + 1].type == Constants.SYS_NUM:
                idx = idx + 1
                continue

            # The inner result's data does not start with "Integer": new group.
            if isinstance(ers[idx].data, ExtractResult) and not str(ers[idx].data.data).startswith("Integer"):
                groups[idx + 1] = groups[idx] + 1
                idx = idx + 1
                continue

            # Text between the end of this result and the start of the next one.
            middle_begin = ers[idx].start + ers[idx].length
            middle_end = ers[idx + 1].start

            middle_str = source[middle_begin: middle_begin + (middle_end -
                                                              middle_begin)].strip().lower()

            # Separated by whitespace
            if not middle_str:
                groups[idx + 1] = groups[idx]
                idx = idx + 1
                continue

            # Separated by connector
            match = self.config.compound_unit_connector_regex.match(middle_str)
            if match is not None:
                # match.string is the whole text handed to match(), i.e. middle_str.
                splitted_match = match.string.split(" ")
            # splitted_match is only read when match is truthy (short-circuit).
            if match and match.pos == 0 and len(splitted_match[0]) == len(middle_str):
                groups[idx + 1] = groups[idx]
            else:
                groups[idx + 1] = groups[idx] + 1

            idx = idx + 1

        # Pass 2: build one result per group and widen it over its members.
        idx = 0
        while idx < len(ers):
            if idx == 0 or groups[idx] != groups[idx - 1]:
                # First member of a group: reuse ers[idx] as the group result and
                # keep a copy of its original fields as the first data entry.
                tmp_extract_result = ers[idx]
                tmp = ExtractResult()
                tmp.data = ers[idx].data
                tmp.length = ers[idx].length
                tmp.start = ers[idx].start
                tmp.text = ers[idx].text
                tmp.type = ers[idx].type
                tmp_extract_result.data = [tmp]

                result.append(tmp_extract_result)

            # reduce extract results in same group
            if idx + 1 < len(ers) and groups[idx + 1] == groups[idx]:
                group = groups[idx]

                # NOTE(review): the group id is used as an index into result;
                # this assumes ids line up with positions in result -- confirm.
                period_begin = result[group].start
                period_end = ers[idx + 1].start + ers[idx + 1].length

                result[group].length = period_end - period_begin
                result[group].text = source[period_begin:period_begin + (period_end - period_begin)]
                result[group].type = Constants.SYS_UNIT_CURRENCY
                if isinstance(result[group].data, list):
                    result[group].data.append(ers[idx + 1])

            idx = idx + 1

        # Pass 3: a group with a single member is replaced by that member's copy.
        idx = 0
        while idx < len(result):
            inner_data = result[idx].data
            if len(inner_data) == 1:
                result[idx] = inner_data[0]
            idx = idx + 1

        # Plain numbers are not returned.
        result = [x for x in result if not x.type == Constants.SYS_NUM]

        return result
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract numbers together with their prefix and/or suffix units.

        Unit candidates come from ``self.prefix_matcher`` and
        ``self.suffix_matcher``; numbers come from
        ``self.config.unit_num_extractor``.  For each number the closest
        prefix unit and the longest directly following suffix unit (optionally
        separated by whitespace or ``self.config.connector_token``) are
        attached.  For the dimension type, candidates lying inside a
        ``self.config.non_unit_regex`` match are dropped.  When
        ``self.separate_regex`` is set, stand-alone units are added through
        ``self._extract_separate_units`` and the list is passed through
        ``self._filter_ambiguity``.  Finally
        ``self.config.expand_half_suffix`` is applied.

        In each result, ``data`` is the number's ``ExtractResult`` with its
        ``start`` rewritten relative to the result.

        :param source: text to scan.
        :return: extraction results; ``[]`` when ``self._pre_check_str`` fails.
        """
        if not self._pre_check_str(source):
            return []

        non_unit_match = None
        numbers = None

        mapping_prefix: Dict[float, PrefixUnitResult] = dict()
        result = []
        # NOTE(review): never reset per number; once any prefix+suffix result
        # was produced, later prefix-only results are skipped.  Kept as is.
        prefix_matched = False
        prefix_match: List[MatchResult] = sorted(self.prefix_matcher.find(source), key=lambda o: o.start)
        suffix_match: List[MatchResult] = sorted(self.suffix_matcher.find(source), key=lambda o: o.start)

        if len(prefix_match) > 0 or len(suffix_match) > 0:

            numbers = sorted(self.config.unit_num_extractor.extract(source), key=lambda o: o.start)

            # Compare the type constants by value; ``is`` only worked while
            # both sides happened to be the same string object.
            if len(numbers) > 0 and self.config.extract_type == Constants.SYS_UNIT_CURRENCY \
                    and len(prefix_match) > 0 and len(suffix_match) > 0:

                for number in numbers:
                    start = number.start
                    length = number.length
                    has_prefix = any((mr.start + mr.length) == start for mr in prefix_match)
                    has_suffix = any(mr.start == (start + length) for mr in suffix_match)
                    if has_prefix and has_suffix and "," in number.text:
                        # Replace the comma with a space (same length) so the
                        # number is split on re-extraction.
                        comma_index = number.start + number.text.index(",")
                        source = source[:comma_index] + " " + source[comma_index + 1:]

                numbers = sorted(self.config.unit_num_extractor.extract(source), key=lambda o: o.start)

            # Special case for cases where number multipliers clash with unit
            ambiguous_multiplier_regex = self.config.ambiguous_unit_number_multiplier_regex
            if ambiguous_multiplier_regex is not None:

                for num in numbers:
                    match = list(filter(lambda x: x.group(), regex.finditer(
                        ambiguous_multiplier_regex, num.text)))
                    if match and len(match) == 1:
                        # Shorten the number by the length of the single match.
                        new_length = num.length - \
                            (match[0].span()[1] - match[0].span()[0])
                        num.text = num.text[0:new_length]
                        num.length = new_length

            for number in numbers:
                if number.start is None or number.length is None:
                    continue
                start = int(number.start)
                length = int(number.length)
                max_find_pref = min(self.max_prefix_match_len, number.start)
                max_find_suff = len(source) - start - length

                if max_find_pref != 0:
                    last_index = start
                    best_match = None

                    for m in prefix_match:
                        if m.length > 0 and m.end > start:
                            break

                        # Only whitespace may separate the prefix from the number.
                        if m.length > 0 and source[m.start:last_index].strip() == m.text:
                            best_match = m
                            break

                    if best_match is not None:
                        off_set = last_index - best_match.start
                        unit_str = source[best_match.start:best_match.start + off_set]
                        self.add_element(mapping_prefix, number.start, (PrefixUnitResult(off_set, unit_str)))
                prefix_unit = mapping_prefix.get(start, None)
                if max_find_suff > 0:

                    max_len = 0
                    first_index = start + length

                    for m in suffix_match:

                        if m.length > 0 and m.start >= first_index:

                            end_pos = m.start + m.length - first_index
                            if max_len < end_pos:
                                mid_str = source[first_index:m.start].strip()
                                if not mid_str or mid_str == self.config.connector_token:
                                    max_len = end_pos

                    if max_len != 0:
                        substr = source[start: start + length + max_len]
                        er = ExtractResult()

                        er.start = start
                        er.length = length + max_len
                        er.text = substr
                        er.type = self.config.extract_type

                        if prefix_unit is not None:
                            prefix_matched = True
                            er.start -= prefix_unit[0].offset
                            er.length += prefix_unit[0].offset
                            er.text = prefix_unit[0].unit + er.text

                        # Relative position will be used in Parser
                        number.start = start - er.start
                        er.data = number

                        # Special treatment, handle cases like '2:00 pm', '00 pm' is not dimension
                        is_not_unit = False

                        if er.type == Constants.SYS_UNIT_DIMENSION:
                            if non_unit_match is None:
                                non_unit_match = list(self.config.non_unit_regex.finditer(source))
                            # The original also ran source.lower().index(...)
                            # on every match and ignored the value; that call
                            # could raise ValueError on mixed-case text, so it
                            # is gone.
                            is_not_unit = any(
                                er.start >= time.start()
                                and er.start + er.length <= time.start() + len(time.group())
                                for time in non_unit_match)

                        if is_not_unit:
                            continue

                        result.append(er)

                if prefix_unit and not prefix_matched:
                    er = ExtractResult()
                    er.start = number.start - prefix_unit[0].offset
                    er.length = number.length + prefix_unit[0].offset
                    er.text = prefix_unit[0].unit + number.text
                    er.type = self.config.extract_type

                    # Relative position will be used in Parser
                    number.start = start - er.start
                    er.data = number
                    result.append(er)

        # Extract Separate unit
        if self.separate_regex:
            if non_unit_match is None:
                # The original wrapped ``list(regex.match(source))`` in a bare
                # ``except``; a match object is not iterable, so that always
                # fell back to [].  Use finditer, as the dimension branch
                # does, and stay best-effort when no pattern is configured.
                non_unit_regex = getattr(self.config, 'non_unit_regex', None)
                if non_unit_regex is not None:
                    non_unit_match = list(non_unit_regex.finditer(source))
                else:
                    non_unit_match = []

            self._extract_separate_units(source, result, non_unit_match)

            # Remove common ambiguous cases
            result = self._filter_ambiguity(result, source)

        # Expand Chinese phrase to the `half` patterns when it follows closely origin phrase.
        self.config.expand_half_suffix(source, result, numbers)

        return result
Beispiel #11
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract numbers with their units using prefix and suffix regexes.

        A first pass looks, for every number from
        ``self.config.unit_num_extractor``, for a prefix unit in the text
        immediately before it (at most ``self.max_prefix_match_len``
        characters).  A second pass looks for the longest suffix unit that
        follows the number directly, or after whitespace or
        ``self.config.connector_token``.  Results are built from number plus
        suffix (with the prefix prepended when present) or from prefix plus
        number.  For the dimension type, candidates lying inside a
        ``self.config.pm_non_unit_regex`` match are dropped.  When
        ``self.separate_regex`` is set the list is passed through
        ``self._extract_separate_units``.

        In each result, ``data`` is the number's ``ExtractResult`` with its
        ``start`` rewritten relative to the result.
        """
        if not self._pre_check_str(source):
            return list()

        total_len = len(source)
        covered: List[bool] = [False] * total_len
        prefix_by_start: Dict[float, PrefixUnitResult] = dict()
        numbers: List[ExtractResult] = self.config.unit_num_extractor.extract(
            source)
        extracted: List[ExtractResult] = list()

        # Pass 1: find a prefix unit for each number.
        if self.max_prefix_match_len != 0:
            for number in numbers:
                if number.start is None or number.length is None:
                    continue
                window = min(self.max_prefix_match_len, number.start)
                if window == 0:
                    continue

                left_text: str = source[number.start - window:number.start]
                left_end = len(left_text)
                chosen: Match = None
                for prefix_pattern in self.prefix_regex:
                    for candidate in regex.finditer(prefix_pattern, left_text):
                        if not len(candidate.group()):
                            continue
                        # The candidate must reach the number, ignoring
                        # surrounding whitespace.
                        tail = left_text[candidate.start():left_end].strip()
                        if tail != candidate.group():
                            continue
                        if chosen is None or \
                                chosen.start() >= candidate.start():
                            chosen = candidate
                if chosen:
                    prefix_by_start[number.start] = PrefixUnitResult(
                        offset=left_end - chosen.start(),
                        unit=left_text[chosen.start():left_end])

        # Pass 2: find a suffix unit and build the results.
        for number in numbers:
            if number.start is None or number.length is None:
                continue
            begin = number.start
            span = number.length
            remaining = total_len - begin - span

            prefix_unit: PrefixUnitResult = prefix_by_start.get(begin, None)

            if remaining > 0:
                right_text = source[begin + span:total_len]
                suffix_hits = [
                    hit
                    for suffix_pattern in self.suffix_regex
                    for hit in regex.finditer(suffix_pattern, right_text)
                    if hit.group()
                ]

                unit_len = 0
                for hit in suffix_hits:
                    hit_end = hit.start() + len(hit.group())
                    gap = right_text[:min(hit.start(),
                                          len(right_text))].strip()
                    if unit_len < hit_end and (
                            not gap or gap == self.config.connector_token):
                        unit_len = hit_end

                if unit_len != 0:
                    for offset in range(span + unit_len):
                        covered[begin + offset] = True

                    unit_result = ExtractResult()
                    unit_result.start = begin
                    unit_result.length = span + unit_len
                    unit_result.text = source[begin:begin + span + unit_len]
                    unit_result.type = self.config.extract_type

                    if prefix_unit:
                        unit_result.start -= prefix_unit.offset
                        unit_result.length += prefix_unit.offset
                        unit_result.text = prefix_unit.unit + unit_result.text

                    number.start = begin - unit_result.start
                    unit_result.data = number

                    inside_non_unit = False
                    if unit_result.type == Constants.SYS_UNIT_DIMENSION:
                        inside_non_unit = any(
                            unit_result.start >= non_unit.start()
                            and unit_result.end <= non_unit.end()
                            for non_unit in
                            self.config.pm_non_unit_regex.finditer(source))

                    if not inside_non_unit:
                        extracted.append(unit_result)
                    continue

            if prefix_unit:
                prefixed = ExtractResult()
                prefixed.start = number.start - prefix_unit.offset
                prefixed.length = number.length + prefix_unit.offset
                prefixed.text = prefix_unit.unit + number.text
                prefixed.type = self.config.extract_type

                number.start = begin - prefixed.start
                prefixed.data = number
                extracted.append(prefixed)

        if self.separate_regex:
            extracted = self._extract_separate_units(source, extracted)

        return extracted