def parse(self, ext_result: ExtractResult):
    """Build a ParseResult from *ext_result*.

    The span fields (start, length, text, type) and ``data`` are copied
    over unchanged; ``resolution_str`` is whatever
    ``self.drop_leading_zeros`` returns for the extracted text.
    """
    parsed = ParseResult(ext_result)

    # Copy the span description verbatim, in the same order as before.
    for field in ('start', 'length', 'text', 'type'):
        setattr(parsed, field, getattr(ext_result, field))

    parsed.resolution_str = self.drop_leading_zeros(ext_result.text)
    parsed.data = ext_result.data
    return parsed
def parse(self, source: ExtractResult):
    """Build a ParseResult from *source*.

    The extracted text is used as-is for ``resolution_str``; ``value`` is
    the result of ``self.score_guid`` applied to that same text.
    """
    parsed = ParseResult(source)
    text = source.text

    parsed.resolution_str = text
    parsed.start, parsed.length = source.start, source.length
    parsed.text, parsed.type = text, source.type
    parsed.value = self.score_guid(text)
    return parsed
def _digit_number_parse(self, ext_result: ExtractResult) -> ParseResult:
    """Parse a number written with digits into a ParseResult.

    Inputs of the following shapes are listed by the original author as
    the cases this method is meant for:
        [1] 24
        [2] 12 32/33
        [3] 1,000,000
        [4] 234.567
        [5] 44/55
        [6] 2 hundred

    Every token matched by ``self.config.digital_number_regex`` is looked
    up in ``self.config.round_number_map``; the looked-up factors are
    multiplied into ``power`` and the token (plus the whitespace directly
    in front of it) is cut out of the text. What remains is handed to
    ``self._get_digital_value`` together with ``power``.

    Args:
        ext_result: extraction to parse; ``start``, ``length``, ``text``
            and ``type`` are copied onto the result.

    Returns:
        A ParseResult whose ``value`` is
        ``self._get_digital_value(remaining_text, power)``.
    """
    result = ParseResult()
    result.start = ext_result.start
    result.length = ext_result.length
    result.text = ext_result.text
    result.type = ext_result.type
    result.meta_data = MetaData() if not result.meta_data else result.meta_data

    power = 1
    start_index = 0
    handle = ext_result.text.lower()

    # The matches refer to the original lowered text; ``handle`` is rebound
    # (never mutated) below, so iterating lazily is safe.
    for match in regex.finditer(self.config.digital_number_regex, handle):
        token = match.group()
        # NOTE(review): a token missing from round_number_map yields None
        # here and the multiplication below then fails - confirm the regex
        # and the map are always kept in sync.
        rep = self.config.round_number_map.get(token)
        power *= rep

        # Remove every occurrence of the token at or after start_index,
        # together with the whitespace that precedes it.
        # The slice must skip the matched *text*; len() of the match object
        # itself is not that length (and is unsupported by stdlib ``re``).
        token_length = len(token)
        tmp_index = handle.find(token, start_index)
        while tmp_index >= 0:
            front = handle[0:tmp_index].rstrip()
            start_index = len(front)
            handle = front + handle[tmp_index + token_length:]
            tmp_index = handle.find(token, start_index)

    # ``power`` is the scale applied when the remaining digits are evaluated.
    result.value = self._get_digital_value(handle, power)
    return result
def __merge_compound_unit(self, compound_result: ExtractResult) -> ParseResult:
    """Merge the parts of a compound currency extraction into grouped results.

    ``compound_result.data`` is walked item by item; each item is parsed
    with ``self.number_with_unit_parser``. A group starts at an item of
    type ``Constants.SYS_UNIT_CURRENCY`` and absorbs the items that follow:

    * an item of type ``Constants.SYS_NUM`` adds ``value / 100``;
    * an item whose unit has a code in ``currency_fraction_code_list``
      that is accepted by ``__check_units_string_contains`` for the main
      unit adds ``number / fraction_num_value``;
    * anything else closes the group, and the same item is then
      reconsidered as a possible start of a new group.

    If the main unit has no entry in ``currency_name_to_iso_code_map`` the
    item is emitted on its own as a ``UnitValue``.

    Args:
        compound_result: extraction whose ``data`` is the sequence of
            sub-extractions to merge.

    Returns:
        ``ParseResult(compound_result)`` with ``value`` set to the list of
        per-group results (after ``__resolve_text`` has been applied).
    """
    results = []
    compound_unit = compound_result.data

    count = 0  # number of items already absorbed into the current group
    result = None
    # '' means "no number seen yet"; it is passed unchanged to the helper
    # methods, so the sentinel itself is left as it was.
    number_value = ''
    main_unit_value = ''
    main_unit_iso_code = ''
    fraction_units_string = ''

    idx = 0
    while idx < len(compound_unit):
        extract_result = compound_unit[idx]
        parse_result = self.number_with_unit_parser.parse(extract_result)
        parse_result_value = parse_result.value

        try:
            unit_value = parse_result_value.unit if parse_result_value else None
        except AttributeError:
            # The parsed value is not a unit value (no ``unit`` attribute).
            unit_value = None

        if count == 0:
            # Start of a new group: only a currency item may open one.
            if not extract_result.type == Constants.SYS_UNIT_CURRENCY:
                idx = idx + 1
                continue

            result = ParseResult()
            result.start = extract_result.start
            result.length = extract_result.length
            result.text = extract_result.text
            result.type = extract_result.type

            main_unit_value = unit_value
            if parse_result_value and parse_result_value.number:
                number_value = float(parse_result_value.number)

            result.resolution_str = parse_result.resolution_str

            main_unit_iso_code = self.config.currency_name_to_iso_code_map.get(
                unit_value, None)

            # If the main unit can't be recognized, finish this group here.
            if not main_unit_iso_code:
                result.value = UnitValue(
                    self.__get_number_value(number_value), main_unit_value)
                results.append(result)
                result = None
                idx = idx + 1
                continue

            fraction_units_string = self.config.currency_fraction_mapping.get(
                main_unit_iso_code)
        else:
            if extract_result.type == Constants.SYS_NUM:
                # A bare number after the main unit counts as hundredths.
                # ``or 0`` covers a main unit that carried no number, where
                # number_value is still '' and '' + float would raise.
                number_value = (number_value or 0) + \
                    float(parse_result.value) * (1 / 100)
                result.resolution_str = result.resolution_str + ' ' + str(
                    parse_result.resolution_str or '')
                result.length = parse_result.start + parse_result.length - result.start
                count = count + 1
                idx = idx + 1
                continue

            fraction_unit_code = self.config.currency_fraction_code_list.get(
                unit_value, None)
            fraction_num_value = self.config.currency_fraction_num_map.get(
                parse_result_value.unit, None) if parse_result_value else None

            # A missing (None) or zero divisor must not reach the division
            # below, so the divisor is tested for truthiness.
            if (fraction_unit_code and fraction_num_value
                    and self.__check_units_string_contains(
                        fraction_unit_code, fraction_units_string)):
                number_value = (number_value or 0) + (
                    float(parse_result_value.number) * (1 / fraction_num_value)
                    if parse_result_value else 0)
                result.resolution_str = result.resolution_str + ' ' + parse_result.resolution_str
                result.length = parse_result.start + parse_result.length - result.start
            else:
                # The item does not belong to the current group: close the
                # group and look at this same item again (idx is unchanged).
                if result:
                    result = self.__create_currency_result(
                        result, main_unit_iso_code, number_value, main_unit_value)
                    results.append(result)
                    result = None

                count = 0
                number_value = ''
                continue

        count = count + 1
        idx = idx + 1

    # Flush the group that was still open when the input ran out.
    if result:
        result = self.__create_currency_result(
            result, main_unit_iso_code, number_value, main_unit_value)
        results.append(result)

    self.__resolve_text(results, compound_result.text, compound_result.start)

    ret = ParseResult(compound_result)
    ret.value = results
    return ret
def _frac_like_number_parse(self, ext_result: ExtractResult) -> ParseResult:
    """Parse a fraction-like expression into a ParseResult with a float value.

    Two paths exist:

    * The text matches ``self.config.fraction_marker_token``: the text is
      cut at the marker, both sides are evaluated, and the value is
      ``left / right``.
    * Otherwise the text is split into words, normalised through
      ``self.config.normalize_token_set``, and divided into an optional
      integer part, an optional numerator and a denominator (the trailing
      words).

    Args:
        ext_result: extraction to parse; ``start``, ``length``, ``text``
            and ``type`` are copied onto the result.

    Returns:
        A ParseResult whose ``value`` has been converted with ``float()``.
    """
    result = ParseResult()
    result.start = ext_result.start
    result.length = ext_result.length
    result.text = ext_result.text
    result.type = ext_result.type
    result_text = ext_result.text.lower()

    # NOTE(review): the marker is used as a regex pattern in search() but as
    # a literal in find() below - confirm the token contains no regex
    # metacharacters, otherwise find() could return -1 here.
    if regex.search(self.config.fraction_marker_token, result_text):
        over_index = result_text.find(self.config.fraction_marker_token)
        # small_part: text before the marker (the dividend);
        # big_part: text after the marker (the divisor).
        small_part = result_text[0:over_index].strip()
        big_part = result_text[over_index +
                               len(self.config.fraction_marker_token
                                   ):len(result_text)].strip()
        # Each side is evaluated as digits when its first character is a
        # digit, and through the word matcher otherwise.
        # NOTE(review): an empty side makes the [0] index raise IndexError.
        small_value = self._get_digital_value(
            small_part, 1) if self._is_digit(
                small_part[0]) else self.__get_int_value(
                    self.__get_matches(small_part))
        big_value = self._get_digital_value(big_part, 1) if self._is_digit(
            big_part[0]) else self.__get_int_value(
                self.__get_matches(big_part))

        result.value = small_value / big_value
    else:
        # Split on single spaces and drop the empty strings that repeated
        # spaces produce.
        words = list(filter(lambda x: x, result_text.split(' ')))
        frac_words = self.config.normalize_token_set(words, result)

        # Find split_index, the position where the denominator words begin.
        # The scan walks backwards from the second-to-last word; the last
        # word seeds current_value. split_index is deliberately reused
        # after the loop, so its value at each break/exit matters.
        split_index = len(frac_words) - 1
        current_value = self.config.resolve_composite_number(
            frac_words[split_index])
        round_value = 1

        for split_index in range(len(frac_words) - 2, -1, -1):
            # Separator words carry no numeric value; skip them.
            if (frac_words[split_index]
                    in self.config.written_fraction_separator_texts
                    or frac_words[split_index]
                    in self.config.written_integer_separator_texts):
                continue

            # previous_value: the word to the right (examined earlier);
            # current_value: the word at split_index.
            previous_value = current_value
            current_value = self.config.resolve_composite_number(
                frac_words[split_index])

            sm_hundreds = 100

            # Original author's example: previous = "hundred", current = "one".
            # The current word is kept with the words to its right when
            # either the right-hand value is >= 100 and larger than the
            # current one, or it is < 100 and __is_composable accepts the
            # pair.
            if ((previous_value >= sm_hundreds
                 and previous_value > current_value)
                    or (previous_value < sm_hundreds
                        and self.__is_composable(current_value,
                                                 previous_value))):
                if (previous_value < sm_hundreds
                        and current_value >= round_value):
                    round_value = current_value
                elif (previous_value < sm_hundreds
                      and current_value < round_value):
                    # The denominator starts right after the current word.
                    split_index += 1
                    break

                # The current word is the first word: every word so far
                # attached to the denominator, so rescan forwards instead.
                if split_index == 0:
                    # Forward scan, skipping the first word.
                    split_index = 1
                    while split_index <= len(frac_words) - 2:
                        # Original author's example: "one hundred thousand".
                        # Split after a word valued >= 100 that is followed
                        # by a non-separator word valued < 100.
                        if (self.config.resolve_composite_number(
                                frac_words[split_index]) >= sm_hundreds
                                and not frac_words[split_index + 1] in self
                                .config.written_fraction_separator_texts
                                and self.config.resolve_composite_number(
                                    frac_words[split_index + 1]) <
                                sm_hundreds):
                            split_index += 1
                            break
                        split_index += 1
                    break
                continue
            # The current word does not attach to the words on its right:
            # the denominator starts at the next position.
            split_index += 1
            break

        # Collect the denominator words; a hyphenated word contributes its
        # first two pieces with a literal '-' token between them.
        frac_part = []
        for i in range(split_index, len(frac_words)):
            if frac_words[i].find('-') > -1:
                split = frac_words[i].split('-')
                frac_part.append(split[0])
                frac_part.append('-')
                frac_part.append(split[1])
            else:
                frac_part.append(frac_words[i])

        # Everything before split_index is the integer/numerator portion.
        frac_words = frac_words[:split_index]

        # denomi_value is the denominator.
        denomi_value = self.__get_int_value(frac_part)
        # Separate a leading integer from the numerator, if both are present.
        numer_value = 0
        int_value = 0

        mixed_index = len(frac_words)
        # Search backwards for a fraction separator word that is not the
        # last word; the words after it form the numerator.
        for i in range(len(frac_words) - 1, -1, -1):
            if (i < len(frac_words) - 1 and frac_words[i]
                    in self.config.written_fraction_separator_texts):
                numer_str = ' '.join(frac_words[i + 1:len(frac_words)])
                numer_value = self.__get_int_value(
                    self.__get_matches(numer_str))
                mixed_index = i + 1
                break

        # Note that the slice ends at mixed_index, so when a separator was
        # found the separator word itself is part of int_str.
        int_str = ' '.join(frac_words[0:mixed_index])
        int_value = self.__get_int_value(self.__get_matches(int_str))

        # A separator was found and the numerator is smaller than the
        # denominator: treat the input as integer plus proper fraction.
        if (mixed_index != len(frac_words) and numer_value < denomi_value):
            # int_value + numer_value / denomi_value
            result.value = int_value + numer_value / denomi_value
        else:
            # (int_value + numer_value) / denomi_value
            result.value = (int_value + numer_value) / denomi_value

    # Convert to float for fixed float point vs. exponential notation
    # consistency with the C#/TS/JS implementations.
    result.value = float(result.value)
    return result