Example #1
0
    def frac_parse(self, source: ExtractResult) -> ParseResult:
        """Parse a fraction expression into a numeric value.

        The text is split with ``config.frac_split_regex``. Exactly three
        pieces are read as (integer part, denominator, numerator); any other
        split is read as (denominator, numerator) with the integer part
        defaulting to ``config.zero_char``.

        :param source: extraction result whose ``text`` holds the fraction.
        :return: a ParseResult carrying ``value`` and ``resolution_str``.
        """
        result = ParseResult(source)
        pieces = regex.split(self.config.frac_split_regex, source.text)

        if len(pieces) == 3:
            whole_text, denominator_text, numerator_text = pieces
        else:
            whole_text = self.config.zero_char
            denominator_text, numerator_text = pieces[0], pieces[1]

        whole = Decimal(self.get_value_from_part(whole_text))
        numerator = Decimal(self.get_value_from_part(numerator_text))
        denominator = Decimal(self.get_value_from_part(denominator_text))

        # A negative sign on the integer part means the fraction is
        # subtracted from it rather than added.
        is_negative = regex.search(
            self.config.negative_number_sign_regex, whole_text) is not None
        if is_negative:
            result.value = whole - numerator / denominator
        else:
            result.value = whole + numerator / denominator

        result.resolution_str = self.__format(result.value)
        return result
Example #2
0
    def parse(self, source: ExtractResult) -> Optional[ParseResult]:
        """Resolve a number-with-unit extraction into a ``UnitValue``.

        The characters of ``source.text`` that lie outside the number span
        (``source.data`` when it is an ``ExtractResult``) are collected into
        unit keys. The last key selects the unit: it is looked up in
        ``config.unit_map`` as-is first, then lower-cased, after removing a
        leading ``config.connector_token`` if present.

        When no unit text is found (empty text, or text fully covered by the
        number span) the result is returned without ``value`` or
        ``resolution_str`` instead of raising ``IndexError``.

        :param source: extraction result; ``data`` may hold the number part.
        :return: a ParseResult whose ``text`` is lower-cased.
        """
        ret = ParseResult(source)
        if source.data and isinstance(source.data, ExtractResult):
            number_result = source.data
        else:  # no number part was extracted: the text is just a unit
            number_result = ExtractResult()
            number_result.start = -1
            number_result.length = 0
            number_result.text = None
            number_result.type = None
        # key contains units
        key = source.text
        unit_key_build = ''
        unit_keys = []
        i = 0
        while i <= len(key):
            if i == len(key):
                # End of text: flush whatever unit text is pending.
                if unit_key_build:
                    self.__add_if_not_contained(
                        unit_keys, unit_key_build.strip())
            # number_result.start is a relative position
            elif i == number_result.start:
                if unit_key_build:
                    self.__add_if_not_contained(
                        unit_keys, unit_key_build.strip())
                    unit_key_build = ''
                if number_result.length:
                    # Jump to the last character of the number span; the
                    # increment below then moves past it.
                    i = number_result.start + number_result.length - 1
            else:
                unit_key_build += key[i]
            i += 1

        # Guard on unit_keys: indexing an empty list here used to raise
        # before the key/unit_map check was ever reached.
        if unit_keys and key and self.config.unit_map:
            # Unit type depends on last unit in suffix.
            last_unit = unit_keys[-1]
            normalized_last_unit = last_unit.lower()
            connector = self.config.connector_token
            if connector and normalized_last_unit.startswith(connector):
                normalized_last_unit = normalized_last_unit[len(
                    connector):].strip()
                last_unit = last_unit[len(connector):].strip()

            unit_value = None
            if last_unit in self.config.unit_map:
                unit_value = self.config.unit_map[last_unit]
            elif normalized_last_unit in self.config.unit_map:
                unit_value = self.config.unit_map[normalized_last_unit]
            if unit_value:
                # The number is parsed only when a number text exists.
                num_value = self.config.internal_number_parser.parse(
                    number_result) if number_result.text else None
                resolution_str = num_value.resolution_str if num_value else None

                ret.value = UnitValue(
                    number=resolution_str,
                    unit=unit_value)
                ret.resolution_str = f'{resolution_str} {unit_value}'.strip()

        ret.text = ret.text.lower()

        return ret
Example #3
0
    def _power_number_parse(self, ext_result: ExtractResult) -> ParseResult:
        """Parse a power / scientific-notation number such as 1e10 or 1.1^-23.

        The upper-cased text is scanned character by character; ``^`` or
        ``E`` ends the current operand and starts the next one. If the text
        contains ``^`` the result is ``first ** second``; otherwise it is
        ``first * 10 ** second``.

        Fractional digits are accumulated with ``Decimal`` rather than binary
        floats, so a mantissa such as 1.1 is not polluted by float
        representation error when it is converted to ``Decimal``.

        :param ext_result: extraction result whose ``text`` holds the number.
        :return: a ParseResult with ``value`` (Decimal) and ``resolution_str``.
        :raises IndexError: if the text yields fewer than two operands.
        """
        result = ParseResult(ext_result)

        handle = ext_result.text.upper()
        # Without '^' the text is read as mantissa E exponent.
        exponent = '^' not in ext_result.text

        # [1] 1e10
        # [2] 1.1^-23
        one_tenth = Decimal('0.1')
        operands = []
        scale = 10
        dot = False
        negative = False
        tmp = 0

        for c in handle:
            if c in ('^', 'E'):
                # Operator reached: store the finished operand and reset.
                operands.append(-tmp if negative else tmp)
                tmp = 0
                scale = 10
                dot = False
                negative = False
            elif c.isdigit():
                if dot:
                    # scale is an exact Decimal power of 0.1 here.
                    tmp = tmp + scale * int(c)
                    scale *= one_tenth
                else:
                    tmp = tmp * scale + int(c)
            elif c == self.config.decimal_separator_char:
                dot = True
                scale = one_tenth
            elif c == '-':
                negative = not negative
            # '+' and any other character carry no information.

        # Store the trailing operand once the whole text has been consumed,
        # regardless of which character came last.
        operands.append(-tmp if negative else tmp)

        a = Decimal(operands[0])
        b = Decimal(operands[1])
        if exponent:
            result_value = getcontext().multiply(
                a,
                getcontext().power(Decimal(10), b))
        else:
            result_value = getcontext().power(a, b)

        result.value = result_value
        result.resolution_str = str(result_value)

        return result
Example #4
0
 def parse(self, source: ExtractResult):
     """Build a ParseResult mirroring ``source`` and attach a score.

     ``start``, ``length``, ``text`` and ``type`` are copied from ``source``,
     ``resolution_str`` is the source text, and ``value`` is whatever
     ``self.score_guid`` returns for that text.
     """
     parsed = ParseResult(source)
     parsed.resolution_str = source.text
     parsed.start, parsed.length = source.start, source.length
     parsed.text, parsed.type = source.text, source.type
     parsed.value = self.score_guid(source.text)
     return parsed
Example #5
0
 def parse(self, ext_result: ExtractResult):
     """Build a ParseResult mirroring ``ext_result``.

     ``start``, ``length``, ``text``, ``type`` and ``data`` are copied from
     ``ext_result``; ``resolution_str`` is the text passed through
     ``self.drop_leading_zeros``.
     """
     parsed = ParseResult(ext_result)
     parsed.start, parsed.length = ext_result.start, ext_result.length
     parsed.text, parsed.type = ext_result.text, ext_result.type
     parsed.resolution_str = self.drop_leading_zeros(ext_result.text)
     parsed.data = ext_result.data
     return parsed
Example #6
0
    def ord_parse(self, source: ExtractResult) -> ParseResult:
        """Parse an ordinal expression into its numeric value.

        The first character of the text is skipped (presumably an ordinal
        prefix -- confirm against the extractor). The remainder is read with
        ``get_digit_value`` when it matches ``config.digit_num_regex`` and
        with ``get_int_value`` otherwise.

        :param source: extraction result whose ``text`` holds the ordinal.
        :return: a ParseResult carrying ``value`` and ``resolution_str``.
        """
        result = ParseResult(source)
        body = source.text[1:]

        contains_digits = regex.search(
            self.config.digit_num_regex, body) is not None
        result.value = (
            self.get_digit_value(body, 1)
            if contains_digits
            else self.get_int_value(body)
        )

        result.resolution_str = self.__format(result.value)
        return result
Example #7
0
    def dou_parse(self, source: ExtractResult) -> ParseResult:
        """Parse a decimal-number expression into its numeric value.

        If the raw text matches ``config.double_and_round_regex``, the last
        character of the unit-replaced text selects a power from
        ``config.round_number_map_char`` and the rest is read with
        ``get_digit_value``. Otherwise the text is split at
        ``config.point_regex`` into an integer part and a point part, which
        are combined according to the sign of the integer part.

        :param source: extraction result whose ``text`` holds the number.
        :return: a ParseResult carrying ``value`` and ``resolution_str``.
        """
        result = ParseResult(source)
        text = self.replace_unit(source.text)

        # The regex is matched against the raw text, not the replaced text.
        is_rounded = regex.search(
            self.config.double_and_round_regex, source.text) is not None

        if is_rounded:
            multiplier = self.config.round_number_map_char[text[-1:]]
            result.value = self.get_digit_value(text[:-1], multiplier)
        else:
            pieces = regex.split(self.config.point_regex, text)
            # A missing integer part is treated as zero.
            if pieces[0] == '':
                pieces[0] = '零'
            is_negative = regex.search(
                self.config.negative_number_sign_regex, pieces[0]) is not None
            integer_part = self.get_int_value(pieces[0])
            point_part = self.get_point_value(pieces[1])
            if is_negative:
                result.value = integer_part - point_part
            else:
                result.value = integer_part + point_part

        result.resolution_str = self.__format(result.value)
        return result
Example #8
0
    def __merge_compound_unit(self,
                              compound_result: ExtractResult) -> ParseResult:
        """Merge the parts of a compound currency expression into results.

        ``compound_result.data`` is walked in order. A currency-typed part
        opens a group (main unit and number). Following parts are folded
        into that group when they are plain numbers (added as hundredths) or
        fraction units accepted for the main unit's ISO code. Any other part
        closes the group and is re-examined as a possible new group.

        :param compound_result: extraction result whose ``data`` is the
            sequence of extracted parts.
        :return: a ParseResult for ``compound_result`` whose ``value`` is the
            list of merged results.
        """
        results = []
        compound_unit = compound_result.data

        # Number of parts already consumed by the current group.
        count = 0
        result = None
        number_value = ''
        main_unit_value = ''
        main_unit_iso_code = ''
        # Same name as the variable assigned and read below, so it is always
        # bound.
        fraction_units_string = ''

        idx = 0

        while idx < len(compound_unit):
            extract_result = compound_unit[idx]
            parse_result = self.number_with_unit_parser.parse(extract_result)
            parse_result_value = parse_result.value
            try:
                unit_value = parse_result_value.unit if parse_result_value else None
            except AttributeError:
                # The parsed value has no unit attribute.
                unit_value = None
            # Process a new group
            if count == 0:
                # Only a currency part can open a group; skip anything else.
                if not extract_result.type == Constants.SYS_UNIT_CURRENCY:
                    idx = idx + 1
                    continue

                # Initialize a new result
                result = ParseResult()
                result.start = extract_result.start
                result.length = extract_result.length
                result.text = extract_result.text
                result.type = extract_result.type

                main_unit_value = unit_value
                if parse_result_value and parse_result_value.number:
                    number_value = float(parse_result_value.number)
                result.resolution_str = parse_result.resolution_str

                main_unit_iso_code = self.config.currency_name_to_iso_code_map.get(
                    unit_value, None)
                # If the main unit can't be recognized, finish process this group.
                if not main_unit_iso_code:
                    result.value = UnitValue(
                        self.__get_number_value(number_value), main_unit_value)
                    results.append(result)
                    result = None
                    idx = idx + 1
                    continue

                fraction_units_string = self.config.currency_fraction_mapping.get(
                    main_unit_iso_code)
            else:
                if extract_result.type == Constants.SYS_NUM:
                    # A bare number after the main unit counts as hundredths.
                    # NOTE(review): number_value is still '' here when the
                    # main part carried no number, which would make this
                    # addition fail -- confirm whether that input can occur.
                    number_value = number_value + \
                        float(parse_result.value) * (1 / 100)
                    result.resolution_str = result.resolution_str + ' ' + str(
                        parse_result.resolution_str or '')
                    result.length = parse_result.start + parse_result.length - result.start
                    count = count + 1
                    idx = idx + 1
                    continue

                fraction_unit_code = self.config.currency_fraction_code_list.get(
                    unit_value, None)
                fraction_num_value = self.config.currency_fraction_num_map.get(
                    parse_result_value.unit,
                    None) if parse_result_value else None

                # fraction_num_value must be a usable divisor: both a missing
                # entry (None) and 0 close the group instead of dividing.
                if fraction_unit_code and fraction_num_value and self.__check_units_string_contains(
                        fraction_unit_code, fraction_units_string):
                    number_value = number_value + (
                        float(parse_result_value.number) *
                        (1 / fraction_num_value) if parse_result_value else 0)
                    result.resolution_str = result.resolution_str + ' ' + parse_result.resolution_str
                    result.length = parse_result.start + parse_result.length - result.start
                else:
                    # This part does not belong to the group: close the group
                    # and re-examine the same part (idx is not advanced).
                    if result:
                        result = self.__create_currency_result(
                            result, main_unit_iso_code, number_value,
                            main_unit_value)
                        results.append(result)
                        result = None

                    count = 0
                    number_value = ''
                    continue

            count = count + 1
            idx = idx + 1

        # Flush a group that was still open when the parts ran out.
        if result:
            result = self.__create_currency_result(result, main_unit_iso_code,
                                                   number_value,
                                                   main_unit_value)
            results.append(result)

        self.__resolve_text(results, compound_result.text,
                            compound_result.start)

        ret = ParseResult(compound_result)

        ret.value = results
        return ret
Example #9
0
 def int_parse(self, source: ExtractResult) -> ParseResult:
     """Parse an integer expression.

     ``value`` is ``self.get_int_value`` applied to the source text and
     ``resolution_str`` is that value formatted by the class formatter.
     """
     parsed = ParseResult(source)
     parsed.value = self.get_int_value(source.text)
     formatted = self.__format(parsed.value)
     parsed.resolution_str = formatted
     return parsed
Example #10
0
    def per_parse(self, source: ExtractResult) -> ParseResult:
        """Parse a percentage expression; ``resolution_str`` ends with '%'.

        Three forms are told apart by ``source.data``:

        * contains ``'Spe'``: special wordings, either one of a few fixed
          strings or characters located by ``config.spe_get_number_regex``;
        * contains ``'Num'``: a digit form read with ``get_digit_value``,
          scaled by a k/M/G/T suffix if one occurs in the matched text;
        * anything else: an integer part and optional point part split at
          ``config.point_regex``.

        :param source: extraction result; ``data`` selects the form.
        :return: a ParseResult carrying ``value`` and ``resolution_str``.
        """
        result = ParseResult(source)
        text = source.text

        def leading_number(char):
            # pair_char counts as 5, a ten character as 10, anything else is
            # looked up in the zero-to-nine map.
            if char == self.config.pair_char:
                return 5
            if char in self.config.ten_chars:
                return 10
            return self.config.zero_to_nine_map[char]

        if 'Spe' in source.data:
            text = self.replace_unit(self.replace_full_with_half(text))

            if text in ('半額', '半折'):
                result.value = 50
            elif text in ('10成', '10割', '十割'):
                result.value = 100
            else:
                found = list(regex.finditer(
                    self.config.spe_get_number_regex, text))
                if len(found) == 2:
                    whole = leading_number(found[0].group()[0])

                    tenth_char = found[1].group()[0]
                    if tenth_char == '半':
                        tenths = 0.5
                    else:
                        tenths = self.config.zero_to_nine_map[tenth_char] * 0.1

                    result.value = (whole + tenths) * 10
                elif len(found) == 5:
                    # Deal the Japanese percentage case like "xxx割xxx分xxx厘", get the integer value and convert into result.
                    whole_char = found[0].group()[0]
                    tenth_char = found[1].group()[0]
                    hundredth_char = found[3].group()[0]

                    tenths = self.config.zero_to_nine_map[tenth_char] * 0.1
                    hundredths = self.config.zero_to_nine_map[hundredth_char] * 0.01

                    whole = self.config.zero_to_nine_map[whole_char]

                    result.value = (whole + tenths + hundredths) * 10
                else:
                    result.value = leading_number(found[0].group()[0]) * 10

        elif 'Num' in source.data:
            number_text = regex.search(
                self.config.percentage_regex, text).group()

            # The first suffix group found in the text fixes the multiplier.
            # NOTE(review): each list repeats its entries; the repeats are
            # presumably full-width variants -- confirm against the original
            # file encoding.
            power = 1
            for suffixes, factor in (
                    (['k', 'K', 'k', 'K'], 1000),
                    (['M', 'M'], 1000000),
                    (['G', 'G'], 1000000000),
                    (['T', 'T'], 1000000000000),
            ):
                if any(suffix in number_text for suffix in suffixes):
                    power = factor
                    break
            result.value = self.get_digit_value(number_text, power)

        else:
            number_text = self.replace_unit(regex.search(
                self.config.percentage_regex, text).group())

            pieces = regex.split(self.config.point_regex, number_text)
            # A missing integer part is treated as zero.
            if pieces[0] == '':
                pieces[0] = self.config.zero_char

            number = self.get_int_value(pieces[0])
            if len(pieces) == 2:
                is_negative = regex.search(
                    self.config.negative_number_sign_regex,
                    pieces[0]) is not None
                point_part = self.get_point_value(pieces[1])
                number = number - point_part if is_negative else number + point_part
            result.value = number

        result.resolution_str = self.__format(result.value) + '%'
        return result
Example #11
0
 def parse(self, source: ExtractResult):
     """Build a ParseResult for ``source`` and attach a score.

     ``resolution_str`` is the source text unchanged and ``value`` is
     whatever ``self.score_phone_number`` returns for that text.
     """
     parsed = ParseResult(source)
     raw_text = source.text
     parsed.resolution_str = raw_text
     parsed.value = self.score_phone_number(source.text)
     return parsed
Example #12
0
 def parse(self, source: ExtractResult) -> Optional[ParseResult]:
     """Wrap ``source`` in a ParseResult whose resolution is its own text."""
     parsed = ParseResult(source)
     resolution = parsed.text
     parsed.resolution_str = resolution
     return parsed
Example #13
0
    def per_parse_chs(self, source: ExtractResult) -> ParseResult:
        """Parse a percentage expression; ``resolution_str`` ends with '%'.

        Three forms are told apart by ``source.data``:

        * contains ``'Spe'``: special wordings, either one of two fixed
          strings or characters located by ``config.spe_get_number_regex``;
        * contains ``'Num'``: a digit form read with ``get_digit_value_chs``,
          scaled by a k/M/G/T suffix if one occurs in the matched text;
        * anything else: an integer part and optional point part split at
          ``config.point_regex_chs``.

        :param source: extraction result; ``data`` selects the form.
        :return: a ParseResult carrying ``value`` and ``resolution_str``.
        """
        result = ParseResult(source)
        text = source.text

        def leading_number(char):
            # '对' counts as 5, '十' / '拾' as 10, anything else is looked up
            # in the zero-to-nine map.
            if char == '对':
                return 5
            if char == '十' or char == '拾':
                return 10
            return self.config.zero_to_nine_map_chs[char]

        if 'Spe' in source.data:
            text = self.replace_unit(self.replace_full_with_half(text))

            if text == '半折':
                result.value = 50
            elif text == '10成':
                result.value = 100
            else:
                found = list(regex.finditer(
                    self.config.spe_get_number_regex, text))
                if len(found) == 2:
                    whole = leading_number(found[0].group()[0])

                    tenth_char = found[1].group()[0]
                    if tenth_char == '半':
                        tenths = 0.5
                    else:
                        tenths = self.config.zero_to_nine_map_chs[tenth_char] * 0.1

                    result.value = (whole + tenths) * 10
                else:
                    result.value = leading_number(found[0].group()[0]) * 10

        elif 'Num' in source.data:
            number_text = regex.search(
                self.config.percentage_regex, text).group()

            # The first suffix group found in the text fixes the multiplier.
            # NOTE(review): each list repeats its entries; the repeats are
            # presumably full-width variants -- confirm against the original
            # file encoding.
            power = 1
            for suffixes, factor in (
                    (['k', 'K', 'k', 'K'], 1000),
                    (['M', 'M'], 1000000),
                    (['G', 'G'], 1000000000),
                    (['T', 'T'], 1000000000000),
            ):
                if any(suffix in number_text for suffix in suffixes):
                    power = factor
                    break
            result.value = self.get_digit_value_chs(number_text, power)

        else:
            number_text = self.replace_unit(regex.search(
                self.config.percentage_regex, text).group())

            pieces = regex.split(self.config.point_regex_chs, number_text)
            # A missing integer part is treated as zero.
            if pieces[0] == '':
                pieces[0] = '零'

            number = self.get_int_value_chs(pieces[0])
            if len(pieces) == 2:
                is_negative = regex.search(
                    self.config.negative_number_sign_regex,
                    pieces[0]) is not None
                point_part = self.get_point_value_chs(pieces[1])
                number = number - point_part if is_negative else number + point_part
            result.value = number

        result.resolution_str = self.__format(result.value) + '%'
        return result