def frac_parse(self, source: ExtractResult) -> ParseResult:
    """Parse a fraction expression, optionally preceded by an integer part.

    The text is split with ``config.frac_split_regex``. Exactly three pieces
    are read as (integer part, denominator, numerator). Otherwise the first
    two pieces are read as (denominator, numerator) and the integer part
    defaults to ``config.zero_char``.

    Returns:
        A ParseResult whose ``value`` is ``integer + numerator / denominator``
        (``-`` instead of ``+`` when the integer part matches
        ``config.negative_number_sign_regex``) and whose ``resolution_str``
        is the formatted value.
    """
    result = ParseResult(source)
    pieces = regex.split(self.config.frac_split_regex, source.text)

    if len(pieces) == 3:
        whole_text, denominator_text, numerator_text = pieces
    else:
        whole_text = self.config.zero_char
        denominator_text, numerator_text = pieces[0], pieces[1]

    # Conversion order (integer, numerator, denominator) is kept as-is.
    whole = Decimal(self.get_value_from_part(whole_text))
    numerator = Decimal(self.get_value_from_part(numerator_text))
    denominator = Decimal(self.get_value_from_part(denominator_text))

    is_negative = regex.search(
        self.config.negative_number_sign_regex, whole_text) is not None
    fraction = numerator / denominator
    result.value = whole - fraction if is_negative else whole + fraction

    result.resolution_str = self.__format(result.value)
    return result
def parse(self, source: ExtractResult) -> Optional[ParseResult]:
    """Resolve a number-with-unit extraction into a ``UnitValue``.

    ``source.text`` is scanned character by character. Characters outside
    the span of the attached number result (``source.data``) are collected
    into unit keys; the last collected key decides the unit. When that key
    (or its lower-cased form) is found in ``config.unit_map``, the result
    gets a ``UnitValue`` and a ``"<number> <unit>"`` resolution string.

    Args:
        source: Extraction to resolve. ``source.data`` may hold the
            ExtractResult of the number; otherwise the text is treated as
            a bare unit.

    Returns:
        A ParseResult built from ``source`` with lower-cased ``text``.
        ``value`` and ``resolution_str`` are only set when a unit is
        resolved. If the scan yields no unit text at all, the result is
        returned unresolved instead of raising ``IndexError``.
    """
    ret = ParseResult(source)

    if source.data and isinstance(source.data, ExtractResult):
        number_result = source.data
    else:
        # if there is no unitResult, means there is just unit
        number_result = ExtractResult()
        number_result.start = -1
        number_result.length = 0
        number_result.text = None
        number_result.type = None

    # key contains units
    key = source.text
    unit_key_build = ''
    unit_keys = []
    i = 0
    while i <= len(key):
        if i == len(key):
            # End of text: flush whatever unit text is pending.
            if unit_key_build:
                self.__add_if_not_contained(
                    unit_keys, unit_key_build.strip())
        # number_result.start is a relative position
        elif i == number_result.start:
            if unit_key_build:
                self.__add_if_not_contained(
                    unit_keys, unit_key_build.strip())
                unit_key_build = ''
            if number_result.length:
                # Jump to the last character of the number; the shared
                # increment below then moves past it.
                i = number_result.start + number_result.length - 1
        else:
            unit_key_build += key[i]
        i += 1

    # Unit type depends on last unit in suffix. Guard against an empty
    # list: previously unit_keys[-1] raised IndexError when the text held
    # no unit characters (empty text, or text fully covered by the number).
    if unit_keys:
        last_unit = unit_keys[-1]
        normalized_last_unit = last_unit.lower()
        connector_token = self.config.connector_token
        if connector_token and normalized_last_unit.startswith(connector_token):
            normalized_last_unit = normalized_last_unit[len(
                connector_token):].strip()
            last_unit = last_unit[len(connector_token):].strip()

        if key and self.config.unit_map:
            unit_value = None
            # Exact key first, then the lower-cased key.
            if last_unit in self.config.unit_map:
                unit_value = self.config.unit_map[last_unit]
            elif normalized_last_unit in self.config.unit_map:
                unit_value = self.config.unit_map[normalized_last_unit]

            if unit_value:
                num_value = self.config.internal_number_parser.parse(
                    number_result) if number_result.text else None
                resolution_str = num_value.resolution_str if num_value else None

                ret.value = UnitValue(
                    number=resolution_str, unit=unit_value)
                ret.resolution_str = f'{resolution_str} {unit_value}'.strip()

    ret.text = ret.text.lower()

    return ret
def _power_number_parse(self, ext_result: ExtractResult) -> ParseResult:
    """Parse a two-operand power expression such as ``1e10`` or ``1.1^-23``.

    The upper-cased text is scanned once. Digits build the current operand,
    ``config.decimal_separator_char`` switches to fractional accumulation,
    ``-`` toggles the sign, and ``^`` / ``E`` close the current operand.
    The first two operands ``a`` and ``b`` are then combined:

    * text without ``^``: ``a * 10 ** b``
    * text with ``^``:    ``a ** b``

    Returns:
        A ParseResult whose ``value`` is the resulting Decimal and whose
        ``resolution_str`` is ``str(value)``.
    """
    result = ParseResult(ext_result)
    handle = ext_result.text.upper()
    # [1] 1e10
    # [2] 1.1^-23
    is_scientific = '^' not in ext_result.text

    operands = []
    scale = 10
    after_dot = False
    negative = False
    current = 0
    last_index = len(handle) - 1

    for index, char in enumerate(handle):
        if char == '^' or char == 'E':
            # Operator: store the finished operand and reset the state.
            operands.append(-current if negative else current)
            current = 0
            scale = 10
            after_dot = False
            negative = False
        elif char.isdigit():
            if after_dot:
                current = current + scale * int(char)
                scale *= 0.1
            else:
                current = current * scale + int(char)
        elif char == self.config.decimal_separator_char:
            after_dot = True
            scale = 0.1
        elif char == '-':
            negative = not negative
        elif char == '+':
            # Skips the rest of the iteration, including the end-of-text
            # flush below when '+' is the final character.
            continue

        if index == last_index:
            operands.append(-current if negative else current)

    base = Decimal(operands.pop(0))
    exponent = Decimal(operands.pop(0))

    context = getcontext()
    if is_scientific:
        value = context.multiply(base, context.power(Decimal(10), exponent))
    else:
        value = context.power(base, exponent)

    result.value = value
    result.resolution_str = str(value)
    return result
def parse(self, source: ExtractResult):
    """Build a ParseResult for ``source`` and score its text.

    The position, text and type are copied from ``source``, the resolution
    string is the raw text, and ``value`` is ``self.score_guid(text)``.
    """
    parsed = ParseResult(source)
    text = source.text

    parsed.resolution_str = text
    parsed.start = source.start
    parsed.length = source.length
    parsed.text = text
    parsed.type = source.type
    parsed.value = self.score_guid(text)

    return parsed
def parse(self, ext_result: ExtractResult):
    """Build a ParseResult mirroring ``ext_result``.

    The position, text, type and data are copied over. The resolution
    string is ``self.drop_leading_zeros(ext_result.text)``.
    """
    parsed = ParseResult(ext_result)
    text = ext_result.text

    parsed.start = ext_result.start
    parsed.length = ext_result.length
    parsed.text = text
    parsed.type = ext_result.type
    parsed.resolution_str = self.drop_leading_zeros(text)
    parsed.data = ext_result.data

    return parsed
def ord_parse(self, source: ExtractResult) -> ParseResult:
    """Parse an ordinal expression.

    The first character of the text is dropped. The remainder is read with
    ``get_digit_value(..., 1)`` when it matches ``config.digit_num_regex``
    and with ``get_int_value`` otherwise.

    Returns:
        A ParseResult carrying the numeric ``value`` and its formatted
        ``resolution_str``.
    """
    result = ParseResult(source)
    # Everything after the leading character is the number itself.
    number_text = source.text[1:]

    has_digits = regex.search(
        self.config.digit_num_regex, number_text) is not None
    result.value = (
        self.get_digit_value(number_text, 1)
        if has_digits
        else self.get_int_value(number_text)
    )

    result.resolution_str = self.__format(result.value)
    return result
def dou_parse(self, source: ExtractResult) -> ParseResult:
    """Parse a decimal ("double") expression.

    Two shapes are handled:

    * The raw text matches ``config.double_and_round_regex``: the last
      character of the unit-replaced text is looked up in
      ``config.round_number_map_char`` and used as the power for
      ``get_digit_value`` on the rest.
    * Otherwise the unit-replaced text is split on ``config.point_regex``
      into an integer part and a fractional part. The fractional part is
      subtracted when the integer part matches
      ``config.negative_number_sign_regex`` and added otherwise.

    Returns:
        A ParseResult carrying the numeric ``value`` and its formatted
        ``resolution_str``.
    """
    result = ParseResult(source)
    source_text = self.replace_unit(source.text)

    matches_round = regex.search(
        self.config.double_and_round_regex, source.text) is not None

    if matches_round:
        power = self.config.round_number_map_char[source_text[-1:]]
        result.value = self.get_digit_value(source_text[:-1], power)
    else:
        pieces = regex.split(self.config.point_regex, source_text)
        integer_text = pieces[0]
        if integer_text == '':
            # An empty integer part is read as zero.
            # NOTE(review): the zero character is hard-coded here; confirm
            # whether a configurable zero character should be used instead.
            integer_text = '零'

        is_negative = regex.search(
            self.config.negative_number_sign_regex, integer_text) is not None
        integer_value = self.get_int_value(integer_text)
        point_value = self.get_point_value(pieces[1])
        result.value = (
            integer_value - point_value
            if is_negative
            else integer_value + point_value
        )

    result.resolution_str = self.__format(result.value)
    return result
def __merge_compound_unit(self, compound_result: ExtractResult) -> ParseResult:
    """Merge the pieces of a compound unit extraction into grouped results.

    ``compound_result.data`` is walked in order; every element is parsed
    with ``self.number_with_unit_parser``. A group starts at an element of
    type ``Constants.SYS_UNIT_CURRENCY`` (other elements are skipped while
    no group is open). Following elements are folded into the open group
    either as a bare number (``Constants.SYS_NUM``, counted as 1/100) or as
    a fraction unit accepted by ``__check_units_string_contains``. Any
    other element closes the group and is examined again as a possible
    start of the next group.

    Returns:
        A ParseResult built from ``compound_result`` whose ``value`` is the
        list of merged results, after ``__resolve_text`` has been applied
        to that list.
    """
    results = []
    compound_unit = compound_result.data

    # count == 0 means "no group open"; it is incremented for every
    # element merged into the current group.
    count = 0
    result = None
    number_value = ''
    main_unit_value = ''
    main_unit_iso_code = ''
    # NOTE(review): this name is never read in this block; the loop below
    # assigns and reads `fraction_units_string` instead - possible typo.
    fraction_unit_string = ''

    idx = 0

    while idx < len(compound_unit):
        extract_result = compound_unit[idx]
        parse_result = self.number_with_unit_parser.parse(extract_result)
        parse_result_value = parse_result.value
        # The parsed value may lack a `unit` attribute; treat that as
        # "no unit".
        try:
            unit_value = parse_result_value.unit if parse_result_value else None
        except AttributeError:
            unit_value = None

        # Process a new group
        if count == 0:
            if not extract_result.type == Constants.SYS_UNIT_CURRENCY:
                idx = idx + 1
                continue

            # Initialize a new result
            result = ParseResult()
            result.start = extract_result.start
            result.length = extract_result.length
            result.text = extract_result.text
            result.type = extract_result.type

            main_unit_value = unit_value
            # NOTE(review): when the main unit carries no number,
            # number_value keeps its '' placeholder; the additions in the
            # else-branch below would then raise TypeError - confirm.
            if parse_result_value and parse_result_value.number:
                number_value = float(parse_result_value.number)

            result.resolution_str = parse_result.resolution_str

            main_unit_iso_code = self.config.currency_name_to_iso_code_map.get(
                unit_value, None)

            # If the main unit can't be recognized, finish process this group.
            if not main_unit_iso_code:
                result.value = UnitValue(
                    self.__get_number_value(number_value), main_unit_value)
                results.append(result)
                result = None
                idx = idx + 1
                continue

            fraction_units_string = self.config.currency_fraction_mapping.get(
                main_unit_iso_code)
        else:
            # A bare number after the main unit is added as hundredths.
            if extract_result.type == Constants.SYS_NUM:
                number_value = number_value + \
                    float(parse_result.value) * (1 / 100)
                result.resolution_str = result.resolution_str + ' ' + str(
                    parse_result.resolution_str or '')
                result.length = parse_result.start + parse_result.length - result.start
                count = count + 1
                idx = idx + 1
                continue

            fraction_unit_code = self.config.currency_fraction_code_list.get(
                unit_value, None)
            fraction_num_value = self.config.currency_fraction_num_map.get(
                parse_result_value.unit, None) if parse_result_value else None

            # NOTE(review): a missing map entry yields None, which passes
            # the `!= 0` test; `1 / fraction_num_value` would then raise
            # TypeError - confirm the maps always agree.
            if fraction_unit_code and fraction_num_value != 0 and self.__check_units_string_contains(
                    fraction_unit_code, fraction_units_string):
                number_value = number_value + (
                    float(parse_result_value.number) * (1 / fraction_num_value) if parse_result_value else 0)
                result.resolution_str = result.resolution_str + ' ' + parse_result.resolution_str
                result.length = parse_result.start + parse_result.length - result.start
            else:
                # The element does not belong to the open group: close the
                # group and revisit the same element (idx is not advanced)
                # with count reset to 0.
                if result:
                    result = self.__create_currency_result(
                        result, main_unit_iso_code, number_value, main_unit_value)
                    results.append(result)
                    result = None

                count = 0
                number_value = ''
                continue

        count = count + 1
        idx = idx + 1

    # Close the group that was still open when the input ran out.
    if result:
        result = self.__create_currency_result(result, main_unit_iso_code, number_value, main_unit_value)
        results.append(result)

    self.__resolve_text(results, compound_result.text, compound_result.start)

    ret = ParseResult(compound_result)

    ret.value = results
    return ret
def int_parse(self, source: ExtractResult) -> ParseResult:
    """Parse an integer expression.

    ``value`` is ``self.get_int_value(source.text)`` and ``resolution_str``
    is that value passed through the parser's formatter.
    """
    parsed = ParseResult(source)
    number = self.get_int_value(source.text)
    parsed.value = number
    parsed.resolution_str = self.__format(parsed.value)
    return parsed
def per_parse(self, source: ExtractResult) -> ParseResult:
    """Parse a percentage expression.

    The branch is chosen from ``source.data``:

    * contains ``'Spe'``: special wording. After full-width and unit
      replacement, a few fixed strings map to 50 or 100; otherwise the
      matches of ``config.spe_get_number_regex`` are combined and
      multiplied by 10.
    * contains ``'Num'``: the ``config.percentage_regex`` match is read
      with ``get_digit_value``, scaled by a k/M/G/T suffix when one is
      present in the matched text.
    * anything else: the ``config.percentage_regex`` match is split on
      ``config.point_regex`` and read as integer part plus or minus the
      fractional part.

    Returns:
        A ParseResult carrying the numeric ``value`` and a
        ``resolution_str`` made of the formatted value followed by ``'%'``.
    """
    result = ParseResult(source)
    source_text = source.text
    power = 1

    def leading_number(char):
        # Value of the leading character of a match.
        if char == self.config.pair_char:
            return 5
        if char in self.config.ten_chars:
            return 10
        return self.config.zero_to_nine_map[char]

    if 'Spe' in source.data:
        source_text = self.replace_full_with_half(source_text)
        source_text = self.replace_unit(source_text)

        if source_text in ('半額', '半折'):
            result.value = 50
        elif source_text in ('10成', '10割', '十割'):
            result.value = 100
        else:
            matches = list(regex.finditer(
                self.config.spe_get_number_regex, source_text))

            if len(matches) == 2:
                int_number = leading_number(matches[0].group()[0])

                point_char = matches[1].group()[0]
                if point_char == '半':
                    point_number = 0.5
                else:
                    point_number = self.config.zero_to_nine_map[point_char] * 0.1

                result.value = (int_number + point_number) * 10
            elif len(matches) == 5:
                # Japanese percentage of the form "xxx割xxx分xxx厘": the
                # first, second and fourth matches supply the digits.
                int_char = matches[0].group()[0]
                point_char = matches[1].group()[0]
                dot_char = matches[3].group()[0]

                digit_map = self.config.zero_to_nine_map
                point_number = digit_map[point_char] * 0.1
                dot_number = digit_map[dot_char] * 0.01
                int_number = digit_map[int_char]

                result.value = (int_number + point_number + dot_number) * 10
            else:
                result.value = leading_number(matches[0].group()[0]) * 10
    elif 'Num' in source.data:
        double_text = regex.search(
            self.config.percentage_regex, source_text).group()

        # NOTE(review): each suffix list repeats its entries; presumably
        # the repeats were meant to be full-width variants - confirm.
        suffix_powers = (
            (['k', 'K', 'k', 'K'], 1000),
            (['M', 'M'], 1000000),
            (['G', 'G'], 1000000000),
            (['T', 'T'], 1000000000000),
        )
        for suffixes, factor in suffix_powers:
            if any(suffix in double_text for suffix in suffixes):
                power = factor
                break

        result.value = self.get_digit_value(double_text, power)
    else:
        double_match = regex.search(
            self.config.percentage_regex, source_text)
        double_text = self.replace_unit(double_match.group())

        pieces = regex.split(self.config.point_regex, double_text)
        if pieces[0] == '':
            pieces[0] = self.config.zero_char
        integer_text = pieces[0]

        double_value = self.get_int_value(integer_text)
        if len(pieces) == 2:
            is_negative = regex.search(
                self.config.negative_number_sign_regex, integer_text) is not None
            point_value = self.get_point_value(pieces[1])
            if is_negative:
                double_value = double_value - point_value
            else:
                double_value = double_value + point_value

        result.value = double_value

    result.resolution_str = self.__format(result.value) + '%'
    return result
def parse(self, source: ExtractResult):
    """Build a ParseResult for ``source`` and score its text.

    The resolution string is the raw text and ``value`` is
    ``self.score_phone_number(text)``.
    """
    parsed = ParseResult(source)
    text = source.text
    parsed.resolution_str = text
    parsed.value = self.score_phone_number(text)
    return parsed
def parse(self, source: ExtractResult) -> Optional[ParseResult]:
    """Wrap ``source`` in a ParseResult whose resolution is its own text."""
    parsed = ParseResult(source)
    # The resolution string is taken from the ParseResult's text, not
    # from ``source`` directly.
    resolution = parsed.text
    parsed.resolution_str = resolution
    return parsed
def per_parse_chs(self, source: ExtractResult) -> ParseResult:
    """Parse a percentage expression (``_chs`` helper variant).

    The branch is chosen from ``source.data``:

    * contains ``'Spe'``: special wording. After full-width and unit
      replacement, ``'半折'`` maps to 50 and ``'10成'`` to 100; otherwise
      the matches of ``config.spe_get_number_regex`` are combined and
      multiplied by 10.
    * contains ``'Num'``: the ``config.percentage_regex`` match is read
      with ``get_digit_value_chs``, scaled by a k/M/G/T suffix when one is
      present in the matched text.
    * anything else: the ``config.percentage_regex`` match is split on
      ``config.point_regex_chs`` and read as integer part plus or minus
      the fractional part.

    Returns:
        A ParseResult carrying the numeric ``value`` and a
        ``resolution_str`` made of the formatted value followed by ``'%'``.
    """
    result = ParseResult(source)
    source_text = source.text
    power = 1

    def leading_number(char):
        # Value of the leading character of a match.
        if char == '对':
            return 5
        if char in ('十', '拾'):
            return 10
        return self.config.zero_to_nine_map_chs[char]

    if 'Spe' in source.data:
        source_text = self.replace_full_with_half(source_text)
        source_text = self.replace_unit(source_text)

        if source_text == '半折':
            result.value = 50
        elif source_text == '10成':
            result.value = 100
        else:
            matches = list(regex.finditer(
                self.config.spe_get_number_regex, source_text))

            if len(matches) == 2:
                int_number = leading_number(matches[0].group()[0])

                point_char = matches[1].group()[0]
                if point_char == '半':
                    point_number = 0.5
                else:
                    point_number = self.config.zero_to_nine_map_chs[point_char] * 0.1

                result.value = (int_number + point_number) * 10
            else:
                result.value = leading_number(matches[0].group()[0]) * 10
    elif 'Num' in source.data:
        double_text = regex.search(
            self.config.percentage_regex, source_text).group()

        # NOTE(review): each suffix list repeats its entries; presumably
        # the repeats were meant to be full-width variants - confirm.
        suffix_powers = (
            (['k', 'K', 'k', 'K'], 1000),
            (['M', 'M'], 1000000),
            (['G', 'G'], 1000000000),
            (['T', 'T'], 1000000000000),
        )
        for suffixes, factor in suffix_powers:
            if any(suffix in double_text for suffix in suffixes):
                power = factor
                break

        result.value = self.get_digit_value_chs(double_text, power)
    else:
        double_match = regex.search(
            self.config.percentage_regex, source_text)
        double_text = self.replace_unit(double_match.group())

        pieces = regex.split(self.config.point_regex_chs, double_text)
        if pieces[0] == '':
            pieces[0] = '零'
        integer_text = pieces[0]

        double_value = self.get_int_value_chs(integer_text)
        if len(pieces) == 2:
            is_negative = regex.search(
                self.config.negative_number_sign_regex, integer_text) is not None
            point_value = self.get_point_value_chs(pieces[1])
            if is_negative:
                double_value = double_value - point_value
            else:
                double_value = double_value + point_value

        result.value = double_value

    result.resolution_str = self.__format(result.value) + '%'
    return result