Esempio n. 1
0
    def frac_parse(self, source: ExtractResult) -> ParseResult:
        """Parse an extracted fraction (optionally mixed) into a ParseResult.

        The text is split with ``config.frac_split_regex``. Exactly three
        pieces are read as integer part, denominator and numerator, in that
        order. Otherwise the first two pieces are the denominator and the
        numerator, and the integer part defaults to ``config.zero_char``.

        The fraction is subtracted from the integer part when the integer
        text matches ``config.negative_number_sign_regex`` and added
        otherwise.
        """
        result = ParseResult(source)

        text = source.text
        pieces = regex.split(self.config.frac_split_regex, text)

        if len(pieces) == 3:
            int_text, demo_text, num_text = pieces
        else:
            int_text = self.config.zero_char
            demo_text = pieces[0]
            num_text = pieces[1]

        # Conversion order (integer, numerator, denominator) is kept as is.
        int_value = Decimal(self.get_value_from_part(int_text))
        num_value = Decimal(self.get_value_from_part(num_text))
        demo_value = Decimal(self.get_value_from_part(demo_text))

        is_negative = regex.search(
            self.config.negative_number_sign_regex, int_text) is not None
        fraction = num_value / demo_value
        result.value = int_value - fraction if is_negative else int_value + fraction

        result.resolution_str = self.__format(result.value)
        return result
Esempio n. 2
0
    def ord_parse(self, source: ExtractResult) -> ParseResult:
        """Parse an extracted ordinal into a ParseResult.

        The first character of the text is dropped. The remainder goes to
        ``get_digit_value`` (with power 1) when it matches
        ``config.digit_num_regex`` and to ``get_int_value`` otherwise.
        """
        result = ParseResult(source)
        body = source.text[1:]

        has_digits = regex.search(self.config.digit_num_regex, body) is not None
        result.value = (self.get_digit_value(body, 1) if has_digits
                        else self.get_int_value(body))

        result.resolution_str = self.__format(result.value)
        return result
Esempio n. 3
0
    def _text_number_parse(self, ext_result: ExtractResult) -> ParseResult:
        """Parse a number written out in words into a ParseResult.

        The lower-cased text first has ``config.half_a_dozen_regex`` replaced
        by ``config.half_a_dozen_text``. It is then split on the non-None
        written decimal separators. The first segment is resolved as the
        integer part; when there are exactly two segments the second is
        resolved as the decimal part and added. Only ``value`` is set here.
        """
        result = ParseResult(ext_result)

        normalized = regex.sub(self.config.half_a_dozen_regex,
                               self.config.half_a_dozen_text,
                               ext_result.text.lower())
        separators = [
            sep for sep in self.config.written_decimal_separator_texts
            if sep is not None
        ]
        segments = self.__split_multi(normalized, separators)

        int_part = segments[0]
        int_words = []
        if int_part:
            int_words = [
                found.group().lower()
                for found in regex.finditer(self.text_number_regex, int_part)
            ]
        int_part_real = self.__get_int_value(int_words)

        point_part_real = Decimal(0)
        if len(segments) == 2:
            point_words = [
                found.group().lower()
                for found in regex.finditer(self.text_number_regex, segments[1])
            ]
            point_part_real += self.__get_point_value(point_words)

        result.value = int_part_real + Decimal(point_part_real)
        return result
Esempio n. 4
0
    def parse(self, source: ExtractResult) -> Optional[ParseResult]:
        """Resolve a number-with-unit extraction into a ``UnitValue``.

        ``source.data`` holds the embedded number extraction when it is an
        ``ExtractResult``; otherwise the text is treated as a bare unit. The
        text outside the number span is collected into unit keys, and the
        last key (minus a leading ``config.connector_token``) is looked up in
        ``config.unit_map``, first as written and then lower-cased.

        Returns the ParseResult with its text lower-cased. ``value`` and
        ``resolution_str`` are only assigned when a unit was resolved; if no
        unit key could be collected the result is returned without them
        instead of raising ``IndexError``.
        """
        ret = ParseResult(source)
        if source.data and isinstance(source.data, ExtractResult):
            number_result = source.data
        else:  # if there is no unitResult, means there is just unit
            number_result = ExtractResult()
            number_result.start = -1
            number_result.length = 0
            number_result.text = None
            number_result.type = None

        # key contains units
        key = source.text
        unit_key_build = ''
        unit_keys = []
        i = 0
        while i <= len(key):
            if i == len(key):
                # End of text: flush whatever unit text is pending.
                if unit_key_build:
                    self.__add_if_not_contained(
                        unit_keys, unit_key_build.strip())
            # number_result.start is a relative position
            elif i == number_result.start:
                if unit_key_build:
                    self.__add_if_not_contained(
                        unit_keys, unit_key_build.strip())
                    unit_key_build = ''
                # Jump to the last character of the number; the increment
                # below then moves past it.
                if number_result.length:
                    i = number_result.start + number_result.length - 1
            else:
                unit_key_build += key[i]
            i += 1

        # Unit type depends on last unit in suffix. Guard against an empty
        # key list (e.g. the number spans the whole text).
        if unit_keys and key and self.config.unit_map:
            last_unit = unit_keys[-1]
            normalized_last_unit = last_unit.lower()
            connector = self.config.connector_token
            if connector and normalized_last_unit.startswith(connector):
                normalized_last_unit = normalized_last_unit[len(
                    connector):].strip()
                last_unit = last_unit[len(connector):].strip()

            unit_value = None
            if last_unit in self.config.unit_map:
                unit_value = self.config.unit_map[last_unit]
            elif normalized_last_unit in self.config.unit_map:
                unit_value = self.config.unit_map[normalized_last_unit]

            if unit_value:
                num_value = self.config.internal_number_parser.parse(
                    number_result) if number_result.text else None
                resolution_str = num_value.resolution_str if num_value else None

                ret.value = UnitValue(
                    number=resolution_str,
                    unit=unit_value)
                # A missing number must not be rendered as the text "None";
                # the strip() then removes the leading separator.
                number_text = resolution_str or ''
                ret.resolution_str = f'{number_text} {unit_value}'.strip()

        ret.text = ret.text.lower()

        return ret
Esempio n. 5
0
 def parse(self, ext_result):
     """Build the parse result for a choice extraction.

     The value is looked up in ``config.resolutions`` by the result type,
     and the data carries the extraction score together with the converted
     other matches.
     """
     parsed = ParseResult(ext_result)
     extract_data = ChoiceExtractDataResult(ext_result.data)

     parsed.value = self.config.resolutions.get(parsed.type)

     score = extract_data.score
     other_results = []
     for other in extract_data.other_matches:
         other_results.append(self.__to_other_match_result(other))
     parsed.data = ChoiceParseDataResult(score, other_results)
     return parsed
Esempio n. 6
0
    def _power_number_parse(self, ext_result: ExtractResult) -> ParseResult:
        """Parse a power expression such as ``1e10`` or ``1.1^-23``.

        The upper-cased text is scanned character by character into two
        operands separated by ``^`` or ``E``. Without a ``^`` in the original
        text the result is ``a * 10 ** b``; with one it is ``a ** b``. Both
        are computed through the current decimal context.
        """
        result = ParseResult(ext_result)

        handle = ext_result.text.upper()
        exponent = '^' not in ext_result.text

        # [1] 1e10
        # [2] 1.1^-23
        operands = []
        scale = 10
        in_fraction = False
        negative = False
        current = 0
        last_index = len(handle) - 1

        for index, char in enumerate(handle):
            if char in ('^', 'E'):
                # Operator reached: store the finished operand and reset.
                operands.append(-current if negative else current)
                current = 0
                scale = 10
                in_fraction = False
                negative = False
            elif char.isdigit():
                if in_fraction:
                    current = current + scale * int(char)
                    scale *= 0.1
                else:
                    current = current * scale + int(char)
            elif char == self.config.decimal_separator_char:
                in_fraction = True
                scale = 0.1
            elif char == '-':
                negative = not negative
            elif char == '+':
                # Note: this also skips the end-of-input flush below.
                continue
            if index == last_index:
                operands.append(-current if negative else current)

        base = Decimal(operands.pop(0))
        power = Decimal(operands.pop(0))
        context = getcontext()
        if exponent:
            result_value = context.multiply(
                base, context.power(Decimal(10), power))
        else:
            result_value = context.power(base, power)

        result.value = result_value
        result.resolution_str = str(result_value)

        return result
Esempio n. 7
0
 def parse(self, source: ExtractResult):
     """Copy the extraction fields and score the text with ``score_guid``.

     The resolution string is the extracted text itself; the value is
     whatever ``score_guid`` returns for that text.
     """
     parsed = ParseResult(source)
     parsed.start, parsed.length = source.start, source.length
     parsed.text = source.text
     parsed.type = source.type
     parsed.resolution_str = source.text
     parsed.value = self.score_guid(source.text)
     return parsed
Esempio n. 8
0
    def dou_parse(self, source: ExtractResult) -> ParseResult:
        """Parse an extracted decimal number into a ParseResult.

        When the original text matches ``config.double_and_round_regex`` the
        last character of the unit-replaced text selects a power from
        ``config.round_number_map_char`` and the rest is read as digits.
        Otherwise the text is split on ``config.point_regex`` into integer
        and point parts, which are subtracted when the integer part carries a
        negative sign and added otherwise.
        """
        result = ParseResult(source)

        replaced = self.replace_unit(source.text)

        # The round-number check runs on the original text, not on the
        # unit-replaced text.
        round_match = regex.search(self.config.double_and_round_regex, source.text)
        if round_match is not None:
            power = self.config.round_number_map_char[replaced[-1:]]
            result.value = self.get_digit_value(replaced[:-1], power)
        else:
            pieces = regex.split(self.config.point_regex, replaced)
            # An empty integer part stands for zero.
            int_text = '零' if pieces[0] == '' else pieces[0]

            sign_match = regex.search(
                self.config.negative_number_sign_regex, int_text)
            int_value = self.get_int_value(int_text)
            point_value = self.get_point_value(pieces[1])
            if sign_match is not None:
                result.value = int_value - point_value
            else:
                result.value = int_value + point_value

        result.resolution_str = self.__format(result.value)
        return result
Esempio n. 9
0
    def _digit_number_parse(self, ext_result: ExtractResult) -> ParseResult:
        """Parse a number written with digits into a ParseResult.

        Every match of ``config.digital_number_regex`` in the lower-cased
        text is looked up in ``config.round_number_map``, multiplied into the
        power, and removed from the text (together with the whitespace before
        it). The remaining text and the accumulated power are passed to
        ``_get_digital_value``. Only ``value`` is computed here;
        ``resolution_str`` is not assigned.
        """
        result = ParseResult()
        result.start = ext_result.start
        result.length = ext_result.length
        result.text = ext_result.text
        result.type = ext_result.type
        result.meta_data = MetaData(
        ) if not result.meta_data else result.meta_data

        # [1] 24
        # [2] 12 32/33
        # [3] 1,000,000
        # [4] 234.567
        # [5] 44/55
        # [6] 2 hundred
        # dot occurred.

        power = 1
        start_index = 0
        handle = ext_result.text.lower()

        for match in regex.finditer(self.config.digital_number_regex, handle):
            token = match.group()
            power *= self.config.round_number_map.get(token)

            # Cut every occurrence of the token out of the text, trimming the
            # whitespace in front of it. The slice must skip the length of
            # the matched text; len() of the match object itself does not
            # give that.
            tmp_index = handle.find(token, start_index)
            while tmp_index >= 0:
                front = handle[0:tmp_index].rstrip()
                start_index = len(front)
                handle = front + handle[tmp_index + len(token):]
                tmp_index = handle.find(token, start_index)

        # Scale used in the calculate of double
        result.value = self._get_digital_value(handle, power)

        return result
Esempio n. 10
0
    def __merge_compound_unit(self,
                              compound_result: ExtractResult) -> ParseResult:
        """Merge the parts of a compound currency extraction.

        ``compound_result.data`` is walked item by item. A group starts at an
        item of type ``Constants.SYS_UNIT_CURRENCY``; following items are
        folded into it either as bare numbers (``Constants.SYS_NUM``, added
        as hundredths) or as fraction units that belong to the main unit's
        ISO code. Anything else closes the group and is re-examined as a
        possible start of a new one.

        Returns a ParseResult built from ``compound_result`` whose ``value``
        is the list of merged results.
        """
        results = []
        compound_unit = compound_result.data

        count = 0
        result = None
        number_value = ''
        main_unit_value = ''
        main_unit_iso_code = ''
        fraction_units_string = ''

        idx = 0

        while idx < len(compound_unit):
            extract_result = compound_unit[idx]
            parse_result = self.number_with_unit_parser.parse(extract_result)
            parse_result_value = parse_result.value
            try:
                unit_value = parse_result_value.unit if parse_result_value else None
            except AttributeError:
                unit_value = None
            # Process a new group
            if count == 0:
                if not extract_result.type == Constants.SYS_UNIT_CURRENCY:
                    idx = idx + 1
                    continue

                # Initialize a new result
                result = ParseResult()
                result.start = extract_result.start
                result.length = extract_result.length
                result.text = extract_result.text
                result.type = extract_result.type

                main_unit_value = unit_value
                if parse_result_value and parse_result_value.number:
                    number_value = float(parse_result_value.number)
                result.resolution_str = parse_result.resolution_str

                main_unit_iso_code = self.config.currency_name_to_iso_code_map.get(
                    unit_value, None)
                # If the main unit can't be recognized, finish process this group.
                if not main_unit_iso_code:
                    result.value = UnitValue(
                        self.__get_number_value(number_value), main_unit_value)
                    results.append(result)
                    result = None
                    idx = idx + 1
                    continue

                fraction_units_string = self.config.currency_fraction_mapping.get(
                    main_unit_iso_code)
            else:
                # A bare number after the main unit counts as hundredths.
                if extract_result.type == Constants.SYS_NUM:
                    number_value = number_value + \
                        float(parse_result.value) * (1 / 100)
                    result.resolution_str = result.resolution_str + ' ' + str(
                        parse_result.resolution_str or '')
                    result.length = parse_result.start + parse_result.length - result.start
                    count = count + 1
                    idx = idx + 1
                    continue

                fraction_unit_code = self.config.currency_fraction_code_list.get(
                    unit_value, None)
                fraction_num_value = self.config.currency_fraction_num_map.get(
                    parse_result_value.unit,
                    None) if parse_result_value else None

                if fraction_unit_code and fraction_num_value != 0 and self.__check_units_string_contains(
                        fraction_unit_code, fraction_units_string):
                    number_value = number_value + (
                        float(parse_result_value.number) *
                        (1 / fraction_num_value) if parse_result_value else 0)
                    # Same None-tolerant concatenation as the bare-number
                    # branch above.
                    result.resolution_str = result.resolution_str + ' ' + str(
                        parse_result.resolution_str or '')
                    result.length = parse_result.start + parse_result.length - result.start
                else:
                    # The item does not belong to the current group: close
                    # the group and look at the same item again (idx is not
                    # advanced) as a possible start of a new one.
                    if result:
                        result = self.__create_currency_result(
                            result, main_unit_iso_code, number_value,
                            main_unit_value)
                        results.append(result)
                        result = None

                    count = 0
                    number_value = ''
                    continue

            count = count + 1
            idx = idx + 1

        # Flush the group that was still open when the items ran out.
        if result:
            result = self.__create_currency_result(result, main_unit_iso_code,
                                                   number_value,
                                                   main_unit_value)
            results.append(result)

        self.__resolve_text(results, compound_result.text,
                            compound_result.start)

        ret = ParseResult(compound_result)

        ret.value = results
        return ret
Esempio n. 11
0
 def int_parse(self, source: ExtractResult) -> ParseResult:
     """Parse an extracted integer: value from ``get_int_value``, formatted resolution."""
     parsed = ParseResult(source)
     text = source.text
     parsed.value = self.get_int_value(text)
     parsed.resolution_str = self.__format(parsed.value)
     return parsed
Esempio n. 12
0
    def per_parse(self, source: ExtractResult) -> ParseResult:
        """Parse an extracted percentage into a ParseResult.

        The branch is chosen by the keys present in ``source.data``:

        * ``'Spe'``: special written forms, resolved from the matches of
          ``config.spe_get_number_regex`` (or fixed values for a few
          literal texts).
        * ``'Num'``: digit forms with an optional magnitude suffix letter.
        * otherwise: the ``config.percentage_regex`` match is split on
          ``config.point_regex`` into integer and point parts.

        The resolution string is the formatted value followed by ``%``.
        """
        result = ParseResult(source)
        source_text = source.text
        power = 1

        def leading_number(char: str) -> int:
            # Pair character -> 5, a ten character -> 10, else table lookup.
            if char == self.config.pair_char:
                return 5
            if char in self.config.ten_chars:
                return 10
            return self.config.zero_to_nine_map[char]

        if 'Spe' in source.data:
            source_text = self.replace_full_with_half(source_text)
            source_text = self.replace_unit(source_text)

            if source_text in ('半額', '半折'):
                result.value = 50
            elif source_text in ('10成', '10割', '十割'):
                result.value = 100
            else:
                matches = list(regex.finditer(
                    self.config.spe_get_number_regex, source_text))
                if len(matches) == 2:
                    int_number = leading_number(matches[0].group()[0])

                    point_number_char = matches[1].group()[0]
                    if point_number_char == '半':
                        point_number = 0.5
                    else:
                        point_number = self.config.zero_to_nine_map[point_number_char] * 0.1

                    result.value = (int_number + point_number) * 10
                elif len(matches) == 5:
                    # Deal the Japanese percentage case like "xxx割xxx分xxx厘", get the integer value and convert into result.
                    int_number_char = matches[0].group()[0]
                    point_number_char = matches[1].group()[0]
                    dot_number_char = matches[3].group()[0]

                    digits = self.config.zero_to_nine_map
                    point_number = digits[point_number_char] * 0.1
                    dot_number = digits[dot_number_char] * 0.01
                    int_number = digits[int_number_char]

                    result.value = (
                        int_number + point_number + dot_number) * 10
                else:
                    result.value = leading_number(matches[0].group()[0]) * 10

        elif 'Num' in source.data:
            double_match = regex.search(
                self.config.percentage_regex, source_text)
            double_text = double_match.group()

            # First suffix group found in the text decides the power.
            # NOTE(review): each list repeats the same letters; the repeats
            # may originally have been full-width variants - confirm.
            suffix_powers = (
                (['k', 'K', 'k', 'K'], 1000),
                (['M', 'M'], 1000000),
                (['G', 'G'], 1000000000),
                (['T', 'T'], 1000000000000),
            )
            for suffixes, suffix_power in suffix_powers:
                if any(suffix in double_text for suffix in suffixes):
                    power = suffix_power
                    break
            result.value = self.get_digit_value(double_text, power)

        else:
            double_match = regex.search(
                self.config.percentage_regex, source_text)
            double_text = self.replace_unit(double_match.group())

            pieces = regex.split(self.config.point_regex, double_text)
            # An empty integer part stands for zero.
            int_text = self.config.zero_char if pieces[0] == '' else pieces[0]

            double_value = self.get_int_value(int_text)
            if len(pieces) == 2:
                is_negative = regex.search(
                    self.config.negative_number_sign_regex, int_text) is not None
                point_value = self.get_point_value(pieces[1])
                if is_negative:
                    double_value -= point_value
                else:
                    double_value += point_value
            result.value = double_value

        result.resolution_str = self.__format(result.value) + '%'
        return result
Esempio n. 13
0
    def _frac_like_number_parse(self,
                                ext_result: ExtractResult) -> ParseResult:
        """Parse a fraction-like extraction into a numeric ParseResult.

        Two forms are handled:

        * Text containing ``config.fraction_marker_token``: the value of the
          text before the marker is divided by the value of the text after
          it.
        * Otherwise the text is split on spaces and normalised with
          ``config.normalize_token_set``. The trailing words form the
          denominator and the leading words the numerator, optionally
          preceded by an integer part (mixed number). Only this form
          converts the final value to ``float``.

        Only ``start``, ``length``, ``text``, ``type`` and ``value`` are set
        on the result; ``resolution_str`` is not assigned here.
        """
        result = ParseResult()
        result.start = ext_result.start
        result.length = ext_result.length
        result.text = ext_result.text
        result.type = ext_result.type

        result_text = ext_result.text.lower()
        # NOTE(review): the marker token is used as a regex pattern here but
        # as a literal substring in find() below - confirm it never contains
        # regex metacharacters.
        if regex.search(self.config.fraction_marker_token, result_text):
            over_index = result_text.find(self.config.fraction_marker_token)
            small_part = result_text[0:over_index].strip()
            big_part = result_text[over_index +
                                   len(self.config.fraction_marker_token
                                       ):len(result_text)].strip()
            # Each side is read as digits when its first character is a
            # digit, and as written words otherwise.
            small_value = self._get_digital_value(
                small_part, 1) if self._is_digit(
                    small_part[0]) else self.__get_int_value(
                        self.__get_matches(small_part))
            big_value = self._get_digital_value(big_part, 1) if self._is_digit(
                big_part[0]) else self.__get_int_value(
                    self.__get_matches(big_part))

            result.value = small_value / big_value
        else:
            # Drop empty strings produced by consecutive spaces.
            words = list(filter(lambda x: x, result_text.split(' ')))
            frac_words = self.config.normalize_token_set(words, result)

            # Split fraction with integer
            # After the scan below, split_index is the position of the first
            # word of the denominator.
            split_index = len(frac_words) - 1
            current_value = self.config.resolve_composite_number(
                frac_words[split_index])
            round_value = 1

            # Walk backwards from the second-to-last word. Note that the loop
            # variable itself is adjusted before each break.
            for split_index in range(len(frac_words) - 2, -1, -1):
                # Separator words are skipped without being resolved.
                if (frac_words[split_index]
                        in self.config.written_fraction_separator_texts
                        or frac_words[split_index]
                        in self.config.written_integer_separator_texts):
                    continue
                previous_value = current_value
                current_value = self.config.resolve_composite_number(
                    frac_words[split_index])

                sm_hundreds = 100

                # previous: hundred
                # current: one
                if ((previous_value >= sm_hundreds
                     and previous_value > current_value) or
                    (previous_value < sm_hundreds
                     and self.__is_composable(current_value, previous_value))):
                    if (previous_value < sm_hundreds
                            and current_value >= round_value):
                        round_value = current_value
                    elif (previous_value < sm_hundreds
                          and current_value < round_value):
                        split_index += 1
                        break

                    # current is the first word
                    if split_index == 0:
                        # scan, skip the first word
                        split_index = 1
                        while split_index <= len(frac_words) - 2:
                            # e.g. one hundred thousand
                            # frac[i+1] % 100 and frac[i] % 100 = 0
                            if (self.config.resolve_composite_number(
                                    frac_words[split_index]) >= sm_hundreds
                                    and not frac_words[split_index + 1] in self
                                    .config.written_fraction_separator_texts
                                    and self.config.resolve_composite_number(
                                        frac_words[split_index + 1]) <
                                    sm_hundreds):
                                split_index += 1
                                break
                            split_index += 1
                        break
                    continue
                # The current word does not combine with the following one:
                # the denominator starts right after it.
                split_index += 1
                break

            # Collect the denominator words; a hyphenated word is expanded
            # into its two halves with a '-' item between them.
            frac_part = []
            for i in range(split_index, len(frac_words)):
                if frac_words[i].find('-') > -1:
                    split = frac_words[i].split('-')
                    frac_part.append(split[0])
                    frac_part.append('-')
                    frac_part.append(split[1])
                else:
                    frac_part.append(frac_words[i])

            # Everything before the denominator remains in frac_words.
            frac_words = frac_words[:split_index]

            # denomi = denominator
            denomi_value = self.__get_int_value(frac_part)
            # Split mixed number with fraction
            numer_value = 0
            int_value = 0

            # Search backwards for a fraction separator word (the last word
            # is not considered); the words after it form the numerator.
            mixed_index = len(frac_words)
            for i in range(len(frac_words) - 1, -1, -1):
                if (i < len(frac_words) - 1 and frac_words[i]
                        in self.config.written_fraction_separator_texts):
                    numer_str = ' '.join(frac_words[i + 1:len(frac_words)])
                    numer_value = self.__get_int_value(
                        self.__get_matches(numer_str))
                    mixed_index = i + 1
                    break

            int_str = ' '.join(frac_words[0:mixed_index])
            int_value = self.__get_int_value(self.__get_matches(int_str))

            # Find mixed number
            # A separator was found (mixed_index moved) and the numerator is
            # smaller than the denominator: treat as integer plus fraction.
            if (mixed_index != len(frac_words) and numer_value < denomi_value):
                # int_value + numer_value / denomi_value
                result.value = int_value + numer_value / denomi_value
            else:
                # (int_value + numer_value) / denomi_value
                result.value = (int_value + numer_value) / denomi_value

            # Convert to float for fixed float point vs. exponential notation consistency /w C#/TS/JS
            result.value = float(result.value)
        return result
Esempio n. 14
0
 def parse(self, source: ExtractResult):
     """Wrap the extraction and score its text with ``score_phone_number``.

     The resolution string is the extracted text itself.
     """
     parsed = ParseResult(source)
     parsed.resolution_str = source.text
     score = self.score_phone_number(source.text)
     parsed.value = score
     return parsed
Esempio n. 15
0
    def per_parse_chs(self, source: ExtractResult) -> ParseResult:
        """Parse an extracted Chinese percentage into a ParseResult.

        The branch is chosen by the keys present in ``source.data``:

        * ``'Spe'``: special written forms, resolved from the matches of
          ``config.spe_get_number_regex`` (or fixed values for two literal
          texts).
        * ``'Num'``: digit forms with an optional magnitude suffix letter.
        * otherwise: the ``config.percentage_regex`` match is split on
          ``config.point_regex_chs`` into integer and point parts.

        The resolution string is the formatted value followed by ``%``.
        """
        result = ParseResult(source)
        source_text = source.text
        power = 1

        def leading_number(char: str) -> int:
            # '对' -> 5, '十' / '拾' -> 10, else table lookup.
            if char == '对':
                return 5
            if char in ('十', '拾'):
                return 10
            return self.config.zero_to_nine_map_chs[char]

        if 'Spe' in source.data:
            source_text = self.replace_full_with_half(source_text)
            source_text = self.replace_unit(source_text)

            if source_text == '半折':
                result.value = 50
            elif source_text == '10成':
                result.value = 100
            else:
                matches = list(regex.finditer(self.config.spe_get_number_regex, source_text))
                if len(matches) == 2:
                    int_number = leading_number(matches[0].group()[0])

                    point_number_char = matches[1].group()[0]
                    if point_number_char == '半':
                        point_number = 0.5
                    else:
                        point_number = self.config.zero_to_nine_map_chs[point_number_char] * 0.1

                    result.value = (int_number + point_number) * 10
                else:
                    result.value = leading_number(matches[0].group()[0]) * 10

        elif 'Num' in source.data:
            double_match = regex.search(self.config.percentage_regex, source_text)
            double_text = double_match.group()

            # First suffix group found in the text decides the power.
            # NOTE(review): each list repeats the same letters; the repeats
            # may originally have been full-width variants - confirm.
            suffix_powers = (
                (['k', 'K', 'k', 'K'], 1000),
                (['M', 'M'], 1000000),
                (['G', 'G'], 1000000000),
                (['T', 'T'], 1000000000000),
            )
            for suffixes, suffix_power in suffix_powers:
                if any(suffix in double_text for suffix in suffixes):
                    power = suffix_power
                    break
            result.value = self.get_digit_value_chs(double_text, power)

        else:
            double_match = regex.search(self.config.percentage_regex, source_text)
            double_text = self.replace_unit(double_match.group())

            pieces = regex.split(self.config.point_regex_chs, double_text)
            # An empty integer part stands for zero.
            int_text = '零' if pieces[0] == '' else pieces[0]

            double_value = self.get_int_value_chs(int_text)
            if len(pieces) == 2:
                is_negative = regex.search(
                    self.config.negative_number_sign_regex, int_text) is not None
                point_value = self.get_point_value_chs(pieces[1])
                if is_negative:
                    double_value -= point_value
                else:
                    double_value += point_value
            result.value = double_value

        result.resolution_str = self.__format(result.value) + '%'
        return result