from typing import Generator
from lexnlp.extract.en.amounts import get_amounts, NUM_PTN

# Module metadata.
__author__ = "ContraxSuite, LLC; LexPredict, LLC"
__copyright__ = "Copyright 2015-2017, ContraxSuite, LLC"
__license__ = "https://github.com/LexPredict/lexpredict-lexnlp/blob/master/LICENSE"
__version__ = "0.1.4"
__maintainer__ = "LexPredict, LLC"
__email__ = "*****@*****.**"

# Verbose-mode regex template for a ratio: a number, then "to" or ":", then a
# second number. Group 1 wraps the whole ratio, the following groups wrap each
# side. The trailing negative lookahead rejects a match that is followed by
# "a"/"p", an optional character and "m" (presumably to keep clock times such
# as "10:30 am" from being read as ratios -- confirm).
#
# Both sides are derived from the shared NUM_PTN by deleting literal fragments
# (a no-op if NUM_PTN does not contain them):
#   * the optional ".../100" fragment is removed from both sides;
#   * the trailing "(?:\W|$)" delimiter is removed from the left side only.
RATIO_PTN = r""" (({num_ptn_1})\s* (?:to|\:)\s* ({num_ptn_2}))(?!\s*[ap].?m(?:\W|$)) """.format(num_ptn_1=NUM_PTN.replace('(?:(?:no|\\d{1,2})/100)?', '').replace('(?:\\W|$)', ''),
 num_ptn_2=NUM_PTN.replace('(?:(?:no|\\d{1,2})/100)?', ''))
# Compiled once at import time. re.VERBOSE makes the unescaped whitespace in
# the template insignificant.
# NOTE(review): `re` is not imported in the visible part of this file --
# confirm it is bound elsewhere.
RATIO_PTN_RE = re.compile(
 RATIO_PTN,
 re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)


# Generator over ratio matches found in `text`.
# Only the start of this function is visible in this chunk; `return_sources`
# is not used in the visible part and the values yielded are not shown here.
def get_ratios(text, return_sources=False, float_digits=4) -> Generator:
    # The text is lower-cased before matching (the pattern is also compiled
    # with re.IGNORECASE). Each findall() item is unpacked into the whole
    # matched ratio and its two sides.
    for source_text, ratio_1_text, ratio_2_text in RATIO_PTN_RE.findall(
 text.lower()):
        # Parse each side independently with the shared amount extractor.
        amount_1 = list(get_amounts(ratio_1_text, float_digits=float_digits))
        amount_2 = list(get_amounts(ratio_2_text, float_digits=float_digits))
        # Skip the match unless each side produced exactly one amount.
        if len(amount_1) != 1 or len(amount_2) != 1:
            continue
        # Replace each one-element list by its single amount.
        amount_1 = amount_1[0]
        amount_2 = amount_2[0]
        # Zero on either side is special-cased; the handling itself lies
        # beyond the end of the visible chunk.
        if amount_1 == 0 or amount_2 == 0:
# Abbreviated unit symbols mapped to the canonical unit name.
DISTANCE_SYMBOL_MAP = {
 "km": "kilometer",
 "mi": "mile",
 }
# Spelled-out unit words (plural and singular) mapped to the canonical unit
# name. The plural form of each unit is listed before the singular one, and
# this insertion order is the order used in the regex alternation below.
DISTANCE_TOKEN_MAP = {
 "kilometers": "kilometer",
 "kilometer": "kilometer",
 "miles": "mile",
 "mile": "mile",
 }
# Verbose-mode regex template for a distance: a number followed by a unit word
# or unit symbol, and then a non-word character or end of input. Group 1 wraps
# number plus unit; the next groups wrap the number and the unit.
# The number part is NUM_PTN with its "(?:\W|$)" and "(?<=\W|^)" fragments
# deleted (a no-op if NUM_PTN does not contain them). Unit words are tried
# before unit symbols in the alternation.
DISTANCE_PTN = r""" (({num_ptn})\s* ({distance_tokens}|{distance_symbols}))(?:\W|$) """.format(num_ptn=NUM_PTN.replace('(?:\\W|$)', '').replace('(?<=\\W|^)', ''),
 distance_symbols='|'.join(DISTANCE_SYMBOL_MAP),
 distance_tokens='|'.join(DISTANCE_TOKEN_MAP))
# Compiled once at import time. re.VERBOSE makes the unescaped whitespace in
# the template insignificant.
DISTANCE_PTN_RE = re.compile(
 DISTANCE_PTN,
 re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)


# Generator over distances found in `text`, built on top of
# get_distance_annotations() (defined outside the visible chunk).
# Per the return annotation, items are either (amount, unit) or
# (amount, unit, source text) tuples.
# Only the `return_sources=True` branch is visible in this chunk.
def get_distances(
 text: str,
 return_sources: bool = False,
 float_digits: int = 4
 ) -> Generator[Union[Tuple[Decimal, str], Tuple[Decimal, str, str]], None, None]:
    for ant in get_distance_annotations(text, float_digits):
        if return_sources:
            # Three-element form: amount, distance type and the annotation's
            # text.
            yield ant.amount, ant.distance_type, ant.text
# Spelled-out currency names mapped to currency codes. For each name with both
# forms, the plural precedes the singular, and the two-word "chinese yuan(s)"
# entries precede the bare "yuan(s)" entries. This insertion order is the
# order used when the names are joined into the regex alternation below.
CURRENCY_TOKEN_MAP = OrderedDict([('chinese yuans', 'CNY'),
 ('chinese yuan', 'CNY'),
 ('dollars', 'USD'),
 ('dollar', 'USD'),
 ('euros', 'EUR'),
 ('euro', 'EUR'),
 ('pounds', 'GBP'),
 ('pound', 'GBP'),
 ('renminbi', 'CNY'),
 ('yens', 'JPY'),
 ('yen', 'JPY'),
 ('yuans', 'CNY'),
 ('yuan', 'CNY')])
# Union of the values of the symbol, token and prefix maps
# (CURRENCY_SYMBOL_MAP and CURRENCY_PREFIX_MAP are defined outside the visible
# chunk).
CURRENCY_ABBR_LIST = set(
 list(CURRENCY_SYMBOL_MAP.values()) +
 list(CURRENCY_TOKEN_MAP.values()) +
 list(CURRENCY_PREFIX_MAP.values()))
# Strings accepted in front of an amount: the prefix-map keys plus the
# symbol-map values.
# NOTE(review): this is a set, so the order of the '|'.join() alternation
# built from it below is not fixed -- confirm no entry is a prefix of another.
CURRENCY_PREFIXES = set(
 list(CURRENCY_PREFIX_MAP.keys()) +
 list(CURRENCY_SYMBOL_MAP.values()))
# NUM_PTN with its "(?<=\W|^)" fragment deleted (a no-op if NUM_PTN does not
# contain it).
CURR_NUM_PTN = NUM_PTN.replace('(?<=\\W|^)', '')
# Verbose-mode regex template with two alternatives inside the `text` group:
#   1. a prefix string or a single currency-symbol character, then the amount;
#   2. the amount, then a currency name or abbreviation followed by a non-word
#      character or end of input.
# NOTE(review): the group name `amount` appears in both alternatives. The
# standard-library `re` module rejects duplicate group names, so this
# presumably relies on `re` being bound to the third-party `regex` package --
# confirm against the file's imports.
# The .format() call is cut off at the end of the visible chunk.
CURRENCY_PTN = r""" (?P<text> (?P<prefix>{currency_prefixes}|[{currency_symbols}])\s* (?P<amount>{num_ptn_1}) | (?P<amount>{num_ptn_2})\s* (?P<postfix>{currency_tokens}|{currency_abbreviations})(?:\W|$)) """.format(
 num_ptn_1=CURR_NUM_PTN,
 num_ptn_2=CURR_NUM_PTN,
 currency_prefixes='|'.join(CURRENCY_PREFIXES),
 # Symbols are escaped because they are placed inside a character class.
 currency_symbols=''.join([re.escape(i) for i in CURRENCY_SYMBOL_MAP]),
 # A literal space inside a multi-word name is replaced by \s+ so any run
 # of whitespace matches.
 currency_tokens='|'.join(
 [i.replace(' ', '\\s+') for i in CURRENCY_TOKEN_MAP]),