コード例 #1
0
    def test_amount_annotation(self):
        """Check the locale, repr and citation string of an AmountAnnotation."""
        annotation = AmountAnnotation(coords=(2, 12), value=2.3, locale='pg')

        # The locale passed to the constructor is exposed unchanged.
        self.assertEqual('pg', annotation.locale)

        # The textual representation must not be empty.
        representation = repr(annotation)
        self.assertGreater(len(representation), 0)

        # Citation string expected for locale 'pg' and value 2.3.
        self.assertEqual('/pg/amount/2.3', annotation.get_cite())
コード例 #2
0
    def parse_annotations(self,
                          text: str,
                          float_digits: int = 4,
                          return_sources: bool = True
                          ) -> Generator[AmountAnnotation, None, None]:
        """
        Yield an AmountAnnotation for each amount candidate found in the text.

        Candidates are the matches of ``self.NUM_PTN_RE``; a candidate is
        dropped when it fully matches ``self.WRONG_FULLMATCH_RE``, when
        ``self.text2num`` raises (the error is printed) or returns None.

        :param text: text to scan
        :param float_digits: digits passed to ``quantize_by_float_digit``;
            a falsy value (None or 0) skips the quantization step
        :param return_sources: when true, also set ``text`` on the annotation
            to the matched source, extended with a unit phrase that follows
            the match or, failing that, with a preceding token listed in
            ``allowed_prev_units``
        :return: generator of AmountAnnotation objects
        """
        for num_match in self.NUM_PTN_RE.finditer(text):
            source = num_match.group()
            if self.WRONG_FULLMATCH_RE.fullmatch(source):
                continue

            try:
                amount = self.text2num(source)
            except Exception as err:
                # NOTE(review): conversion errors go to stdout and the
                # candidate is skipped; consider a logger instead.
                print(err)
                continue
            if amount is None:
                continue

            if float_digits:
                amount = quantize_by_float_digit(
                    amount=amount, float_digits=float_digits)

            span = num_match.span()
            start, end = span
            ant = AmountAnnotation(coords=span,
                                   value=amount,
                                   locale=self.language)

            if return_sources:
                unit = ''
                tail = text[end:]
                if tail:
                    # No early exit: the last phrase that prefixes the
                    # following text is the one kept.
                    for phrase in get_np(tail):
                        if tail.startswith(phrase):
                            unit = phrase

                if unit:
                    source = ' '.join((source.strip(), unit))
                else:
                    # No unit after the amount: look at the token right
                    # before the match instead.
                    tokens = nltk.word_tokenize(text[:start])
                    if tokens and tokens[-1].lower() in allowed_prev_units:
                        # Keep a single space only if one directly precedes
                        # the match in the original text.
                        glue = ' ' if text[start - 1] == ' ' else ''
                        source = glue.join((tokens[-1], source.rstrip()))

                ant.text = source.strip()

            yield ant
コード例 #3
0
def get_amount_annotations(text: str,
                           extended_sources=True,
                           float_digits=4) \
        -> Generator[AmountAnnotation, None, None]:
    """
    Find possible amount references in the text.

    Every NUM_PTN_RE match that text2num converts to a value is yielded as
    an AmountAnnotation whose coords are the span of the match. Matches that
    fully match AND_RE, fail to convert, or convert to None are skipped.

    :param text: text to search
    :param extended_sources: if true, the annotation text is the matched
        source extended with a unit: a phrase returned by get_np that the
        following text starts with or, when there is none, the preceding
        token if it is listed in allowed_prev_units; if false, the annotation
        text is the raw match
    :param float_digits: round float values to N digits; a falsy value
        (None or 0) disables rounding
    :return: generator of AmountAnnotation objects
    """
    for match in NUM_PTN_RE.finditer(text):
        found_item = match.group()
        if AND_RE.fullmatch(found_item):
            continue
        try:
            amount = text2num(found_item)
        except Exception:
            # Best effort: a candidate that cannot be converted is skipped.
            # Exception (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate to the caller.
            continue
        if amount is None:
            continue
        if isinstance(amount, float) and float_digits:
            amount = round(amount, float_digits)

        if extended_sources:
            start, end = match.span()
            unit = ''
            next_text = text[end:]
            if next_text:
                # get_np yields pairs; only the first element is used.
                # No early exit: the last phrase that prefixes the following
                # text is the one kept.
                for phrase, _ in get_np(next_text):
                    if next_text.startswith(phrase):
                        unit = phrase

            if unit:
                found_item = ' '.join([found_item.strip(), unit])
            else:
                # No unit after the amount: fall back to the token right
                # before the match.
                prev_text_tags = nltk.word_tokenize(text[:start])
                if prev_text_tags and \
                        prev_text_tags[-1].lower() in allowed_prev_units:
                    # Keep a single space only if one directly precedes the
                    # match in the original text.
                    sep = ' ' if text[start - 1] == ' ' else ''
                    found_item = sep.join(
                        [prev_text_tags[-1], found_item.rstrip()])

            found_item = found_item.strip()

        # Without extended sources found_item is still the raw match text.
        yield AmountAnnotation(coords=match.span(),
                               value=amount,
                               text=found_item)
コード例 #4
0
def get_amount_annotations(
    text: str,
    extended_sources: bool = True,
    float_digits: int = 4,
) -> Generator[AmountAnnotation, None, None]:
    """
    Find possible amount references in the text.

    Every NUM_PTN_RE match that text2num converts to a value is yielded as
    an AmountAnnotation whose coords are the span of the match. Matches that
    fully match AND_RE (after fraction substitution), fail to convert, or
    convert to None are skipped.

    :param text: text to search
    :param extended_sources: if true, the annotation text is the (fraction-
        substituted) source extended with a unit: a phrase returned by get_np
        that the following text starts with or, when there is none, the
        preceding token if it is listed in allowed_prev_units; if false, the
        annotation text is the raw match
    :param float_digits: digits passed to quantize_by_float_digit; a falsy
        value (None or 0) skips the quantization step
    :return: generator of AmountAnnotation objects
    """
    for match in NUM_PTN_RE.finditer(text):
        found_item = match.group()

        # Only the first fraction-tail match is considered. When its symbol
        # is known, everything from the match start onwards is replaced with
        # the mapped string.
        fraction_tail = FRACTION_TAIL_RE.search(found_item)
        if fraction_tail is not None:
            fraction_tail_smb = fraction_tail.group().strip(' ')
            if fraction_tail_smb in fraction_smb_to_string:
                fraction_ending = fraction_smb_to_string[fraction_tail_smb]
                found_item = found_item[:fraction_tail.start()] \
                    + fraction_ending

        if AND_RE.fullmatch(found_item):
            continue
        try:
            amount: Optional[Decimal] = text2num(found_item)
        except Exception:
            # Best effort: a candidate that cannot be converted is skipped.
            # Exception (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate to the caller.
            continue
        if amount is None:
            continue

        if float_digits:
            amount = quantize_by_float_digit(
                amount=amount, float_digits=float_digits)

        if not extended_sources:
            # The raw match is reported here, not the fraction-substituted
            # text.
            yield AmountAnnotation(coords=match.span(),
                                   value=amount,
                                   text=match.group())
            continue

        start, end = match.span()
        unit = ''
        next_text = text[end:]
        if next_text:
            # get_np yields pairs; only the first element is used.
            # No early exit: the last phrase that prefixes the following
            # text is the one kept.
            for phrase, _ in get_np(next_text):
                if next_text.startswith(phrase):
                    unit = phrase

        if unit:
            found_item = ' '.join([found_item.strip(), unit])
        else:
            # No unit after the amount: fall back to the token right before
            # the match.
            prev_text_tags = nltk.word_tokenize(text[:start])
            if prev_text_tags and \
                    prev_text_tags[-1].lower() in allowed_prev_units:
                # Keep a single space only if one directly precedes the
                # match in the original text.
                sep = ' ' if text[start - 1] == ' ' else ''
                found_item = sep.join(
                    [prev_text_tags[-1], found_item.rstrip()])

        yield AmountAnnotation(coords=match.span(),
                               value=amount,
                               text=found_item.strip())