Beispiel #1
0
    def en_parsers_speed(self):
        """Run every English extractor over one long sample text via check_time.

        The sample text and the geo-entity dictionary are loaded first, then
        each extractor is handed to ``self.check_time`` together with its
        label and the shared ``times`` dict. Finally the test asserts that
        the ``'get_amounts'`` label ended up in ``times``.
        """
        sample_path = os.path.join(lexnlp_test_path, 'long_parsed_text.txt')
        with codecs.open(sample_path, 'r', encoding='utf-8') as fr:
            text = fr.read()

        ge_path = os.path.join(lexnlp_test_path, 'lexnlp/extract/en/tests/test_geoentities/')
        geo_config = list(DictionaryEntry.load_entities_from_files(
            ge_path + 'geoentities.csv',
            ge_path + 'geoaliases.csv'))

        # (label, callable) pairs, executed in this exact order. Each callable
        # wraps the extractor's result in list() before returning it.
        benchmarks = (
            ('get_amounts', lambda s: list(get_amounts(s))),
            ('get_acts', lambda s: list(get_acts(s))),
            ('get_citations', lambda s: list(get_citations(s))),
            ('get_conditions', lambda s: list(get_conditions(s))),
            ('get_constraints', lambda s: list(get_constraints(s))),
            ('get_copyright', lambda s: list(get_copyright(s))),
            # NOTE: the label is 'get_courts' but the callee is _get_courts
            ('get_courts', lambda s: list(_get_courts(s))),
            ('get_cusip', lambda s: list(get_cusip(s))),
            ('get_dates', lambda s: list(get_dates(s))),
            ('get_definitions', lambda s: list(get_definitions(s))),
            ('get_distances', lambda s: list(get_distances(s))),
            ('get_durations', lambda s: list(get_durations(s))),
            ('get_geoentities', lambda s: list(get_geoentities(s, geo_config))),
            ('get_money', lambda s: list(get_money(s))),
            ('get_percents', lambda s: list(get_percents(s))),
            ('get_pii', lambda s: list(get_pii(s))),
            ('get_ratios', lambda s: list(get_ratios(s))),
            ('get_regulations', lambda s: list(get_regulations(s))),
            ('get_trademarks', lambda s: list(get_trademarks(s))),
            ('get_urls', lambda s: list(get_urls(s))),
        )

        times = {}  # type: Dict[str, float]
        for label, runner in benchmarks:
            self.check_time(text, runner, label, times)

        self.assertTrue('get_amounts' in times)
Beispiel #2
0
def get_money_annotations(text: str, float_digits=4) \
        -> Generator[MoneyAnnotation, None, None]:
    """Yield a MoneyAnnotation for each usable CURRENCY_PTN_RE match in *text*.

    A match is skipped when it has neither a prefix, a postfix nor a trigger
    word, or when its amount fragment does not parse to exactly one amount.
    The currency is resolved from the prefix first, then from the postfix,
    and falls back to DEFAULT_CURRENCY when nothing is resolved.

    :param text: text to scan
    :param float_digits: forwarded to get_amounts
    """
    # Characters trimmed from the matched source text; '$' is deliberately kept.
    strip_chars = string.punctuation.replace('$', '') + string.whitespace

    for match in CURRENCY_PTN_RE.finditer(text):
        groups = match.capturesdict()
        prefixes = groups['prefix']
        postfixes = groups['postfix']
        if not (prefixes or postfixes or groups['trigger_word']):
            continue

        parsed = list(
            get_amounts(groups['amount'][0], float_digits=float_digits))
        if len(parsed) != 1:
            continue

        currency = None
        if prefixes:
            key = prefixes[0].lower()
            currency = (CURRENCY_SYMBOL_MAP.get(key)
                        or CURRENCY_PREFIX_MAP.get(key)
                        or key.upper())
        elif postfixes:
            currency = (CURRENCY_TOKEN_MAP.get(postfixes[0].lower())
                        or postfixes[0].upper())

        yield MoneyAnnotation(coords=match.span(),
                              amount=parsed[0],
                              text=groups['text'][0].strip(strip_chars),
                              currency=currency or DEFAULT_CURRENCY)
def get_money(text, return_sources=False, float_digits=4) -> Generator:
    """Yield (amount, currency) tuples for CURRENCY_PTN_RE matches in *text*.

    Matches without a prefix and without a postfix are skipped, as are
    matches whose amount fragment does not parse to exactly one amount.

    :param text: text to scan
    :param return_sources: when truthy, append the stripped source text as a
        third tuple element
    :param float_digits: forwarded to get_amounts
    """
    # Characters trimmed from the matched source text; '$' is deliberately kept.
    strip_chars = string.punctuation.replace('$', '') + string.whitespace

    for match in CURRENCY_PTN_RE.finditer(text):
        groups = match.capturesdict()
        prefixes, postfixes = groups['prefix'], groups['postfix']
        if not prefixes and not postfixes:
            continue

        parsed = list(
            get_amounts(groups['amount'][0], float_digits=float_digits))
        if len(parsed) != 1:
            continue

        if prefixes:
            key = prefixes[0].lower()
            currency = (CURRENCY_SYMBOL_MAP.get(key)
                        or CURRENCY_PREFIX_MAP.get(key)
                        or key.upper())
        else:
            currency = (CURRENCY_TOKEN_MAP.get(postfixes[0].lower())
                        or postfixes[0].upper())

        if return_sources:
            yield parsed[0], currency, groups['text'][0].strip(strip_chars)
        else:
            yield parsed[0], currency
 def _extract_variants_from_text(self, field, text: str, **kwargs):
     """Return the numeric amounts found in *text*, or None if there are none.

     Only float/int values are kept; a value equal to its int() conversion
     is returned as that int, anything else is returned unchanged.
     """
     amounts = get_amounts(text, return_sources=False)
     if not amounts:
         return None
     variants = []
     for value in amounts:
         if not isinstance(value, (float, int)):
             continue
         whole = int(value)
         variants.append(whole if whole == value else value)
     return variants if variants else None
Beispiel #5
0
    def get_all_annotations(cls,
                            text: str,
                            float_digits=4) \
            -> List[DurationAnnotation]:
        """Collect a DurationAnnotation for each duration match in *text*.

        The text is lower-cased before matching against cls.DURATION_PTN_RE.
        Matches whose number fragment does not parse to exactly one amount
        are skipped.

        :param text: text to scan
        :param float_digits: forwarded to get_amounts; when truthy the amount
            is also rounded to this many digits
        :return: list of annotations in match order
        """
        annotations = []
        for match in cls.DURATION_PTN_RE.finditer(text.lower()):
            source_text, number_text, duration_type = match.groups()
            parsed = list(get_amounts(number_text, float_digits=float_digits))
            if len(parsed) != 1:
                continue

            amount = round(parsed[0], float_digits) if float_digits else parsed[0]
            # The map is indexed with the matched unit, before normalisation.
            duration_days = cls.DURATION_MAP[duration_type] * amount
            unit = 'anniversary' if duration_type == 'anniversaries' else duration_type

            annotations.append(
                DurationAnnotation(coords=match.span(),
                                   amount=amount,
                                   duration_type=unit,
                                   duration_days=duration_days,
                                   text=source_text.strip()))
        return annotations
 def get_all_annotations(
     cls,
     text: str,
     float_digits: int = 4,
 ) -> List[DurationAnnotation]:
     """Collect a DurationAnnotation for each duration match in *text*.

     The text is lower-cased before matching against cls.DURATION_PTN_RE.
     The day count is the amount scaled by the numerator/denominator of the
     cls.DURATION_MAP entry for the matched unit, wrapped in Decimal and,
     when float_digits is truthy, passed through quantize_by_float_digit.

     :param text: text to scan
     :param float_digits: forwarded to get_amounts and, if truthy, to
         quantize_by_float_digit
     :return: list of annotations in match order
     """
     found: List[DurationAnnotation] = []
     for match in cls.DURATION_PTN_RE.finditer(text.lower()):
         source_text, number_text, duration_type = match.groups()
         parsed = list(get_amounts(number_text, float_digits=float_digits))
         if len(parsed) != 1:
             continue
         amount = parsed[0]

         day_ratio: Fraction = cls.DURATION_MAP[duration_type]
         duration_days = Decimal(
             day_ratio.numerator * amount / day_ratio.denominator)
         if float_digits:
             duration_days = quantize_by_float_digit(
                 amount=duration_days, float_digits=float_digits)

         unit = 'anniversary' if duration_type == 'anniversaries' else duration_type
         found.append(DurationAnnotation(
             coords=match.span(),
             amount=amount,
             duration_type=unit,
             duration_days=duration_days,
             text=source_text.strip()))
     return found
Beispiel #7
0
def get_ratios(text, return_sources=False, float_digits=4) -> Generator:
    """Yield (left, right, left/right) tuples for ratio matches in *text*.

    The text is lower-cased before matching against RATIO_PTN_RE. A match is
    skipped when either side does not parse to exactly one amount, or when
    either side is zero.

    :param text: text to scan
    :param return_sources: when truthy, append the stripped source text as a
        fourth tuple element
    :param float_digits: forwarded to get_amounts; when truthy both sides are
        also rounded to this many digits
    """
    for source_text, ratio_1_text, ratio_2_text in RATIO_PTN_RE.findall(
            text.lower()):
        amount_1 = list(get_amounts(ratio_1_text, float_digits=float_digits))
        amount_2 = list(get_amounts(ratio_2_text, float_digits=float_digits))
        if len(amount_1) != 1 or len(amount_2) != 1:
            continue
        amount_1 = amount_1[0]
        amount_2 = amount_2[0]
        if amount_1 == 0 or amount_2 == 0:
            continue
        if float_digits:
            amount_1 = round(amount_1, float_digits)
            amount_2 = round(amount_2, float_digits)
            # Rounding can collapse a tiny non-zero divisor to 0; skip the
            # match instead of raising ZeroDivisionError below.
            if amount_2 == 0:
                continue
        total = float(amount_1) / amount_2
        item = (amount_1, amount_2, total)
        if return_sources:
            item += (source_text.strip(), )
        yield item
Beispiel #8
0
 def extraction_function(self, field, possible_value, text):
     """Turn *possible_value* (or, failing that, *text*) into a number.

     Returns None when there is neither a value nor any text. Otherwise the
     value is converted with float(); if that conversion fails, amounts are
     extracted from str(possible_value) (or from *text* when the value is
     falsy) and one of them is chosen by ValueExtractionHint.get_value using
     field.item_number. Returns None when no amount is found.
     """
     if possible_value is None and not text:
         return None
     try:
         return float(possible_value)
     except (TypeError, ValueError, OverflowError):
         # Only conversion failures trigger the text fallback; a bare
         # ``except:`` would also swallow KeyboardInterrupt / SystemExit.
         pass
     possible_value = str(possible_value) if possible_value else text
     floats = list(get_amounts(possible_value, return_sources=False))
     return ValueExtractionHint.get_value(
         floats, field.item_number) if floats else None
Beispiel #9
0
def get_ratio_annotations(text: str, float_digits=4) \
        -> Generator[RatioAnnotation, None, None]:
    """Yield a RatioAnnotation for each usable ratio match in *text*.

    The text is lower-cased before matching against RATIO_PTN_RE. A match is
    skipped when either side does not parse to exactly one amount, or when
    either side is zero.

    :param text: text to scan
    :param float_digits: forwarded to get_amounts; when truthy both sides are
        also rounded to this many digits
    """
    for match in RATIO_PTN_RE.finditer(text.lower()):
        source_text, ratio_1_text, ratio_2_text = match.groups()
        amount_1 = list(get_amounts(ratio_1_text, float_digits=float_digits))
        amount_2 = list(get_amounts(ratio_2_text, float_digits=float_digits))
        if len(amount_1) != 1 or len(amount_2) != 1:
            continue
        amount_1 = amount_1[0]
        amount_2 = amount_2[0]
        if amount_1 == 0 or amount_2 == 0:
            continue
        if float_digits:
            amount_1 = round(amount_1, float_digits)
            amount_2 = round(amount_2, float_digits)
            # Rounding can collapse a tiny non-zero divisor to 0; skip the
            # match instead of raising ZeroDivisionError below.
            if amount_2 == 0:
                continue
        total = float(amount_1) / amount_2
        ant = RatioAnnotation(coords=match.span(),
                              text=source_text.strip(),
                              left=amount_1,
                              right=amount_2,
                              ratio=total)
        yield ant
Beispiel #10
0
def get_distances(text, return_sources=False, float_digits=4) -> Generator:
    """Yield (amount, distance_type) tuples for distance matches in *text*.

    The text is lower-cased before matching against DISTANCE_PTN_RE. Matches
    whose number fragment does not parse to exactly one amount are skipped.
    The distance type is looked up in DISTANCE_SYMBOL_MAP first, then in
    DISTANCE_TOKEN_MAP.

    :param text: text to scan
    :param return_sources: when truthy, append the stripped source text as a
        third tuple element
    :param float_digits: forwarded to get_amounts; when truthy the amount is
        also rounded to this many digits
    """
    for source_text, number_text, distance_item in DISTANCE_PTN_RE.findall(text.lower()):
        parsed = list(get_amounts(number_text, float_digits=float_digits))
        if len(parsed) != 1:
            continue

        distance_type = DISTANCE_SYMBOL_MAP.get(distance_item)
        if not distance_type:
            distance_type = DISTANCE_TOKEN_MAP.get(distance_item)

        amount = round(parsed[0], float_digits) if float_digits else parsed[0]

        if return_sources:
            yield amount, distance_type, source_text.strip()
        else:
            yield amount, distance_type
Beispiel #11
0
def get_distance_annotations(
        text: str,
        float_digits: int = 4) -> Generator[DistanceAnnotation, None, None]:
    """Yield a DistanceAnnotation for each usable distance match in *text*.

    The text is lower-cased before matching against DISTANCE_PTN_RE. Only
    matches whose number fragment parses to exactly one amount are emitted.
    The distance type is looked up in DISTANCE_SYMBOL_MAP first, then in
    DISTANCE_TOKEN_MAP.

    :param text: text to scan
    :param float_digits: forwarded to get_amounts
    """
    for match in DISTANCE_PTN_RE.finditer(text.lower()):
        source_text, number_text, distance_item = match.groups()
        parsed = list(get_amounts(number_text, float_digits=float_digits))
        if len(parsed) == 1:
            unit = DISTANCE_SYMBOL_MAP.get(distance_item)
            if not unit:
                unit = DISTANCE_TOKEN_MAP.get(distance_item)
            yield DistanceAnnotation(coords=match.span(),
                                     amount=parsed[0],
                                     distance_type=unit,
                                     text=source_text.strip())
Beispiel #12
0
 def test_amounts(self):
     """Check the amounts extracted from a sample loan-amendment paragraph.

     The extracted values are rendered with str() and joined with ', ' so
     the whole sequence can be compared against one expected string.
     """
     sample = """
     2. Amendment to Interest Rate. Beginning on February 1, 1998, and
             continuing until July 18, 2002, which is the fifth anniversary of the Loan
             conversion date, interest shall be fixed at an annual rate of 7.38%, which rate
             is equal to 200 basis points above the Bank's five-year ""Treasury Constant
             Rate"" in effect on January 23, 1998. In accordance with the Agreement, the
             interest rate shall be adjusted again on July 18, 2002.
     """
     expected = '2.0, 1.0, 1998.0, 18.0, 2002.0, 5, 7.38, 200.0, 5, 23.0, 1998.0, 18.0, 2002.0'
     rendered = ', '.join(str(value) for value in get_amounts(sample))
     self.assertEqual(expected, rendered)
def get_durations(text, return_sources=False, float_digits=4) -> Generator:
    """Yield (duration_type, amount, duration_days) tuples found in *text*.

    The text is lower-cased before matching against DURATION_PTN_RE. Matches
    whose number fragment does not parse to exactly one amount are skipped.
    duration_days is the DURATION_MAP entry for the matched unit multiplied
    by the amount; the unit 'anniversaries' is reported as 'anniversary'.

    :param text: text to scan
    :param return_sources: when truthy, append the stripped source text as a
        fourth tuple element
    :param float_digits: forwarded to get_amounts; when truthy the amount is
        also rounded to this many digits
    """
    for source_text, number_text, duration_type in DURATION_PTN_RE.findall(text.lower()):
        parsed = list(get_amounts(number_text, float_digits=float_digits))
        if len(parsed) != 1:
            continue

        amount = round(parsed[0], float_digits) if float_digits else parsed[0]
        # The map is indexed with the matched unit, before normalisation.
        duration_days = DURATION_MAP[duration_type] * amount
        unit = 'anniversary' if duration_type == 'anniversaries' else duration_type

        if return_sources:
            yield unit, amount, duration_days, source_text.strip()
        else:
            yield unit, amount, duration_days
Beispiel #14
0
 def parse(self, text, text_unit_id, _text_unit_lang,
           **kwargs) -> ParseResults:
     """Build AmountUsage records for the distinct amounts found in *text*.

     Amounts are extracted with their source strings; each distinct
     extracted item becomes one AmountUsage carrying how many times it
     occurred. The source string is truncated to 300 characters, or set to
     None when it is falsy.

     Returns a ParseResults keyed by AmountUsage, or None when no amounts
     were found. Records are emitted in first-occurrence order.
     """
     from collections import Counter

     # One pass to count occurrences, instead of list.count() per unique
     # item, which is quadratic in the number of extracted amounts.
     occurrences = Counter(
         amounts.get_amounts(text,
                             return_sources=True,
                             extended_sources=False))
     if not occurrences:
         return None
     return ParseResults({
         AmountUsage: [
             AmountUsage(text_unit_id=text_unit_id,
                         amount=item[0],
                         amount_str=item[1][:300] if item[1] else None,
                         count=count)
             for item, count in occurrences.items()
         ]
     })
 def _extract_variants_from_text(self, field, text: str, **kwargs):
     """Return the amounts found in *text* as a list, or None if there are none.

     The extractor's result is materialised before the emptiness test: a
     lazy iterable such as a generator is always truthy, so testing it
     directly would hand back an empty list instead of None.
     """
     amounts = get_amounts(text, return_sources=False)
     if not amounts:
         return None
     variants = list(amounts)
     return variants or None
Beispiel #16
0
 def _extract_variants_from_text(self, field, text: str):
     """Return the amounts in *text* whose is_integer() is true, or None.

     None is returned both when the extractor result is falsy and when no
     extracted value passes the is_integer() filter.
     """
     amounts = get_amounts(text, return_sources=False)
     if not amounts:
         return None
     whole_numbers = []
     for value in amounts:
         if value.is_integer():
             whole_numbers.append(value)
     return whole_numbers if whole_numbers else None