def test_amount_annotation(self):
    """Check locale storage, repr output and citation of an AmountAnnotation."""
    annotation = AmountAnnotation(coords=(2, 12), value=2.3, locale='pg')

    # The locale handed to the constructor must be readable back unchanged.
    self.assertEqual('pg', annotation.locale)

    # The textual representation must not be empty.
    representation = annotation.__repr__()
    self.assertGreater(len(representation), 0)

    # For locale 'pg' and value 2.3 the citation must be '/pg/amount/2.3'.
    citation = annotation.get_cite()
    self.assertEqual('/pg/amount/2.3', citation)
def parse_annotations(self,
                      text: str,
                      float_digits: int = 4,
                      return_sources: bool = True
                      ) -> Generator[AmountAnnotation, None, None]:
    """
    Yield an AmountAnnotation for every amount recognised in the text.

    Candidates are the matches of ``self.NUM_PTN_RE``; a candidate is dropped
    when it fully matches ``self.WRONG_FULLMATCH_RE``, when ``self.text2num``
    raises (the exception is printed) or when it returns None.

    :param text: text to scan
    :param float_digits: digits passed to ``quantize_by_float_digit``;
        a falsy value (None or 0) leaves the amount unquantized
    :param return_sources: when true, store the source fragment in the
        annotation's ``text`` attribute, extended by a following unit or by
        an allowed preceding token when one is found
    :return: generator of AmountAnnotation objects
    """
    for num_match in self.NUM_PTN_RE.finditer(text):
        start, end = num_match.span()
        source = num_match.group()

        if self.WRONG_FULLMATCH_RE.fullmatch(source):
            continue

        try:
            amount = self.text2num(source)
        except Exception as e:
            print(e)
            continue
        if amount is None:
            continue

        if float_digits:
            amount = quantize_by_float_digit(amount=amount,
                                             float_digits=float_digits)

        ant = AmountAnnotation(coords=(start, end),
                               value=amount,
                               locale=self.language)

        if return_sources:
            # Look for a unit right after the number: the last phrase returned
            # by get_np that the trailing text starts with wins.
            # NOTE(review): get_np presumably yields noun phrases - confirm.
            unit = ''
            trailing = text[end:]
            if trailing:
                for phrase in get_np(trailing):
                    if trailing.startswith(phrase):
                        unit = phrase

            if unit:
                source = ' '.join([source.strip(), unit])
            else:
                # No unit follows: check whether the token right before the
                # number is one of the allowed preceding units.
                leading = text[:start]
                leading_tokens = nltk.word_tokenize(leading)
                if leading_tokens and \
                        leading_tokens[-1].lower() in allowed_prev_units:
                    # Keep a single space only if one separated the token
                    # from the number in the original text.
                    glue = ' ' if leading.endswith(' ') else ''
                    source = leading_tokens[-1] + glue + source.rstrip()

            ant.text = source.strip()

        yield ant
def get_amount_annotations(text: str,
                           extended_sources=True,
                           float_digits=4) \
        -> Generator[AmountAnnotation, None, None]:
    """
    Yield an AmountAnnotation for every amount recognised in the text.

    Candidates are the matches of ``NUM_PTN_RE``; a candidate is skipped when
    it fully matches ``AND_RE``, or when ``text2num`` raises or returns None.

    :param text: text to scan
    :param extended_sources: when true, the annotation text is the matched
        fragment extended by a following unit or by an allowed preceding
        token (if found); otherwise it is the raw matched fragment
    :param float_digits: round float amounts to N digits; a falsy value
        (None or 0) disables rounding
    :return: generator of AmountAnnotation objects
    """
    for match in NUM_PTN_RE.finditer(text):
        start, end = match.span()
        found_item = match.group()

        if AND_RE.fullmatch(found_item):
            continue

        try:
            amount = text2num(found_item)
        except Exception:
            # Best-effort parsing: an unparsable candidate is skipped.
            # Deliberately not a bare ``except`` so that KeyboardInterrupt
            # and SystemExit still propagate.
            continue
        if amount is None:
            continue

        # Only float results are rounded; other numeric types pass through.
        if isinstance(amount, float) and float_digits:
            amount = round(amount, float_digits)

        if not extended_sources:
            yield AmountAnnotation(coords=(start, end),
                                   value=amount,
                                   text=match.group())
            continue

        # Look for a unit right after the number: the last phrase returned by
        # get_np that the trailing text starts with wins.
        # NOTE(review): get_np presumably yields (noun phrase, ...) pairs -
        # confirm against its definition.
        unit = ''
        next_text = text[end:]
        if next_text:
            for phrase, _ in get_np(next_text):
                if next_text.startswith(phrase):
                    unit = phrase

        if unit:
            found_item = ' '.join([found_item.strip(), unit])
        else:
            # No unit follows: check whether the token right before the
            # number is one of the allowed preceding units.
            prev_text = text[:start]
            prev_text_tags = nltk.word_tokenize(prev_text)
            if prev_text_tags and \
                    prev_text_tags[-1].lower() in allowed_prev_units:
                # prev_text_tags is non-empty here, so start >= 1 and the
                # index below cannot wrap around to the end of the text.
                sep = ' ' if text[start - 1] == ' ' else ''
                found_item = sep.join(
                    [prev_text_tags[-1], found_item.rstrip()])

        yield AmountAnnotation(coords=(start, end),
                               value=amount,
                               text=found_item.strip())
def get_amount_annotations(
        text: str,
        extended_sources: bool = True,
        float_digits: int = 4,
) -> Generator[AmountAnnotation, None, None]:
    """
    Yield an AmountAnnotation for every amount recognised in the text.

    Candidates are the matches of ``NUM_PTN_RE``. A trailing fraction symbol
    found by ``FRACTION_TAIL_RE`` and present in ``fraction_smb_to_string`` is
    replaced by its mapped string before parsing. A candidate is skipped when
    it fully matches ``AND_RE``, or when ``text2num`` raises or returns None.

    :param text: text to scan
    :param extended_sources: when true, the annotation text is the (possibly
        fraction-expanded) fragment extended by a following unit or by an
        allowed preceding token (if found); otherwise it is the raw match
    :param float_digits: digits passed to ``quantize_by_float_digit``;
        a falsy value (None or 0) leaves the amount unquantized
    :return: generator of AmountAnnotation objects
    """
    for match in NUM_PTN_RE.finditer(text):
        start, end = match.span()
        found_item = match.group()

        # Replace the first known fraction symbol (and everything after it)
        # with its mapped string.
        for fraction_tail in FRACTION_TAIL_RE.finditer(found_item):
            fraction_tail_smb = fraction_tail.group().strip(' ')
            if fraction_tail_smb in fraction_smb_to_string:
                fraction_ending = fraction_smb_to_string[fraction_tail_smb]
                found_item = found_item[:fraction_tail.span()[0]]
                found_item += fraction_ending
                break

        if AND_RE.fullmatch(found_item):
            continue

        try:
            amount: Optional[Decimal] = text2num(found_item)
        except Exception:
            # Best-effort parsing: an unparsable candidate is skipped.
            # Deliberately not a bare ``except`` so that KeyboardInterrupt
            # and SystemExit still propagate.
            continue
        if amount is None:
            continue

        if float_digits:
            amount = quantize_by_float_digit(
                amount=amount, float_digits=float_digits)

        if not extended_sources:
            yield AmountAnnotation(coords=(start, end),
                                   value=amount,
                                   text=match.group())
            continue

        # Look for a unit right after the number: the last phrase returned by
        # get_np that the trailing text starts with wins.
        # NOTE(review): get_np presumably yields (noun phrase, ...) pairs -
        # confirm against its definition.
        unit = ''
        next_text = text[end:]
        if next_text:
            for phrase, _ in get_np(next_text):
                if next_text.startswith(phrase):
                    unit = phrase

        if unit:
            found_item = ' '.join([found_item.strip(), unit])
        else:
            # No unit follows: check whether the token right before the
            # number is one of the allowed preceding units.
            prev_text = text[:start]
            prev_text_tags = nltk.word_tokenize(prev_text)
            if prev_text_tags and \
                    prev_text_tags[-1].lower() in allowed_prev_units:
                # prev_text_tags is non-empty here, so start >= 1 and the
                # index below cannot wrap around to the end of the text.
                sep = ' ' if text[start - 1] == ' ' else ''
                found_item = sep.join(
                    [prev_text_tags[-1], found_item.rstrip()])

        yield AmountAnnotation(coords=(start, end),
                               value=amount,
                               text=found_item.strip())