def test_compare_date_string(self):
    # Checks that merging the tokens of a lease paragraph yields the expected
    # '#'-joined match strings, and that the current finder returns at least
    # as many strict candidates as the legacy DateFinderOld.
    source_text = """ In the event the real estate taxes levied or assessed against the land and building of which the premises are a part in future tax years are greater than the real estate taxes for the base tax year, the TENANT, shall pay within thirty (30) days after submission of the bill to TENANT for the increase in real estate taxes, as additional rent a proportionate share of such increases, which proportionate share shall be computed at 22.08% of the increase in taxes, but shall exclude any fine, penalty, or interest charge for late or non-payment of taxes by LANDLORD. The base tax year shall be July 1, 1994 to June 30, 1995. """
    finder = DateFinder()
    merged_tokens = finder.merge_tokens(finder.tokenize_string(source_text))

    expected_joined = """at 22.08# July 1, 1994 to June 30, 1995."""
    actual_joined = '#'.join(tok.match_str for tok in merged_tokens).strip()
    self.assertEqual(expected_joined, actual_joined)

    new_strings = list(finder.extract_date_strings(source_text, strict=True))
    legacy_finder = DateFinderOld()
    legacy_strings = list(legacy_finder.extract_date_strings(source_text, strict=True))
    # The two tokenizers use slightly different logic, so only a lower bound
    # on the number of candidates is asserted.
    self.assertGreaterEqual(len(new_strings), len(legacy_strings))
def test_get_tokens(self):
    # Tokenizing the sample sentence must give 11 tokens; the second element
    # of each token is its group label ('' for the last one, 'delimiters'
    # for the token at index 1).
    sample = "At 1997, 20 FEB here, in"
    token_list = DateFinder().tokenize_string(sample)

    self.assertEqual(11, len(token_list))
    last_label = token_list[10][1]
    self.assertEqual('', last_label)
    second_label = token_list[1][1]
    self.assertEqual('delimiters', second_label)
def test_merge_tokens(self):
    # Merging the tokens of the sample sentence must produce one match that
    # covers characters 0..16 and records 'At' as its first extra token.
    sample = "At 1997, 20 FEB here, in"
    finder = DateFinder()
    merged_tokens = finder.merge_tokens(finder.tokenize_string(sample))

    self.assertEqual(1, len(merged_tokens))
    only_match = merged_tokens[0]
    self.assertEqual('At 1997, 20 FEB ', only_match.match_str)
    self.assertEqual((0, 16), only_match.indices)
    first_extra = only_match.captures['extra_tokens'][0]
    self.assertEqual('At', first_extra.strip())
def test_parse_time(self):
    # Timing guard: extracting non-strict date strings from the long sample
    # document must finish in under 15 seconds.
    here = os.path.dirname(os.path.realpath(__file__))
    sample_path = here + '/../../../../test_data/long_parsed_text.txt'
    with codecs.open(sample_path, 'r', encoding='utf-8') as handle:
        document = handle.read()

    # Base date is midnight on January 1st of the current year.
    year_start = datetime.datetime.now().replace(
        day=1, month=1, hour=0, minute=0, second=0, microsecond=0)
    finder = DateFinder(base_date=year_start)

    started_at = time.time()
    # list() forces the generator to run to completion inside the timed span.
    _ = list(finder.extract_date_strings(document, strict=False))
    elapsed = time.time() - started_at
    self.assertLess(elapsed, 15)
def test_parse_str(self):
    # A table-like fragment (ruler of dashes followed by numbers) must still
    # yield at least one non-strict candidate date string.
    table_fragment = """ ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ - 569 - 15 - - - - - - - - - + 1,195 1,339 3,019 1,820 13,831 """

    # Base date is midnight on January 1st of the current year.
    year_start = datetime.datetime.now().replace(
        day=1, month=1, hour=0, minute=0, second=0, microsecond=0)

    # Find potential dates
    finder = DateFinder(base_date=year_start)
    candidates = list(finder.extract_date_strings(table_fragment, strict=False))
    self.assertGreater(len(candidates), 0)
def test_get_date_strings(self):
    # Strict extraction over the loan-amendment paragraph must find four
    # candidates; the second one is inspected in detail (text, span, groups).
    paragraph = """ 2. Amendment to Interest Rate. Beginning on February 1, 1998, and continuing until July 18, 2002, which is the fifth anniversary of the Loan conversion date, interest shall be fixed at an annual rate of 7.38%, which rate is equal to 200 basis points above the Bank's five-year ""Treasury Constant Rate"" in effect on January 23, 1998. In accordance with the Agreement, the interest rate shall be adjusted again on July 18, 2002. """
    finder = DateFinder()
    found = list(finder.extract_date_strings(paragraph, True))
    self.assertEqual(4, len(found))

    second = found[1]
    self.assertEqual('until July 18, 2002', second[0])
    self.assertEqual((117, 139), second[1])

    captured = second[2]
    self.assertEqual([], captured['time'])
    digit_groups = captured['digits']
    self.assertEqual('18', digit_groups[0])
    self.assertEqual('2002', digit_groups[1])
def get_raw_dates(text, strict=False, base_date=None, return_source=False) -> Generator:
    """
    Find "raw" or potential date matches prior to false positive classification.

    Candidate strings produced by ``DateFinder.extract_date_strings`` are passed
    through a sequence of heuristic filters (double months, floats, fractions,
    bare numbers, ...) and the survivors are parsed with
    ``DateFinder.parse_date_string``. If the full candidate cannot be parsed,
    progressively shorter versions (whitespace-separated tokens trimmed from
    the end, then from the start) are tried.

    :param text: raw text to search
    :param strict: whether to return only complete or strict matches
    :param base_date: base date to use for implied or partial matches; when
        falsy, midnight on January 1st of the current year is used
    :param return_source: when true, yield ``(date, index)`` tuples, where
        ``index`` is the position value reported by the finder for the match,
        instead of bare dates
    :return: generator of parsed values; a ``datetime.datetime`` whose hour and
        minute are both zero is reduced to a ``datetime.date``
    """
    # Setup base date
    if not base_date:
        base_date = datetime.datetime.now().replace(
            day=1, month=1, hour=0, minute=0, second=0, microsecond=0)

    # Find potential dates
    date_finder = DateFinder(base_date=base_date)

    # Map every extra token except 't' to a blank in the finder's replacements.
    # NOTE(review): if REPLACEMENTS is a class-level dict this mutates state
    # shared by all DateFinder instances - confirm against DateFinder.
    for extra_token in date_finder.EXTRA_TOKENS_PATTERN.split('|'):
        if extra_token != 't':
            date_finder.REPLACEMENTS[extra_token] = ' '

    # Materialize all candidates up front: the "day of" cleanup below needs to
    # look back at (and mutate) the previous candidate.
    possible_dates = [(date_string, index, date_props)
                      for date_string, index, date_props
                      in date_finder.extract_date_strings(text, strict=strict)]

    # possible_matched[k] records whether candidate k was accepted; exactly one
    # entry is appended per loop iteration so it stays aligned with
    # possible_dates.
    possible_matched = []

    # Delimiters that do not, on their own, indicate a date.
    bad_delims = {",", " ", "\n", "\t"}

    for i, possible_date in enumerate(possible_dates):
        date_string = possible_date[0]
        index = possible_date[1]
        date_props = possible_date[2]

        # Cleanup "day of" strings: when the previous candidate was rejected and
        # consisted of a single ordinal ("1st", "22nd", ...), prepend its number
        # to the current candidate.
        if "of" in date_props["extra_tokens"] or "OF" in date_props["extra_tokens"]:
            # Only look back when there is a previous candidate; indexing with
            # i - 1 at i == 0 would silently wrap around to the last one.
            if i > 0 and not possible_matched[i - 1]:
                prev_modifiers = possible_dates[i - 1][2]["digits_modifier"]
                if len(prev_modifiers) == 1:
                    date_props["digits_modifier"].extend(prev_modifiers)
                    ordinal = prev_modifiers.pop()
                    for suffix in ("st", "nd", "rd", "th"):
                        ordinal = ordinal.replace(suffix, "")
                    date_string = ordinal + date_string

        # Counts used by the filters below
        num_dig_mod = len(date_props["digits_modifier"])
        num_dig = len(date_props["digits"])
        num_days = len(date_props["days"])
        num_month = len(date_props["months"])
        num_slash = date_props["delimiters"].count("/")
        num_hyphen = date_props["delimiters"].count("-")
        num_point = date_props["delimiters"].count(".")

        # Remove double months
        if num_month > 1:
            possible_matched.append(False)
            continue

        # Remove wrong months like Dec*ided or Mar*tin
        if num_month == 1 and date_props['extra_tokens'] \
                and (date_props['months'][0] + date_props['extra_tokens'][-1]) in date_string:
            possible_matched.append(False)
            continue

        # Check strange strings (digit modifiers without any digits)
        if num_dig_mod > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOW only
        if num_days > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOM only
        if num_month == 0 and num_dig_mod == 0 and num_dig <= 1:
            possible_matched.append(False)
            continue

        # Skip odd date like "1 10"
        if re.match(r'\d{1,2}\s+\d{1,2}', date_string):
            possible_matched.append(False)
            continue

        # Skip floats
        if num_point and not num_month and not re.match(r'\d{2}\.\d{2}\.\d{2,4}', date_string):
            possible_matched.append(False)
            continue

        # Skip odd months from string like "Nil 62. Marquee"
        if re.search(r'\d{2,4}\.\s*[A-Za-z]', date_string):
            possible_matched.append(False)
            continue

        # Skip fractions
        if (num_slash == 1 or num_hyphen == 1) and num_dig > 2:
            possible_matched.append(False)
            continue

        # Skip three-digit blocks and double zero years
        if any(len(digit) == 3 or digit.startswith("00") for digit in date_props["digits"]):
            possible_matched.append(False)
            continue

        # Skip " may " alone
        if num_dig == 0 and num_days == 0 and "".join(date_props["months"]).lower() == "may":
            possible_matched.append(False)
            continue

        # Cleanup: strip extra tokens from the candidate, longest first so a
        # short token cannot break up a longer one; "to" / "t" are kept.
        for token in sorted(date_props["extra_tokens"], key=len, reverse=True):
            if token.lower() in ["to", "t"]:
                continue
            date_string = date_string.replace(token, "")
        date_string = date_string.strip()
        date_props["extra_tokens"] = []

        # Skip strings too long
        if len(date_string) > DATE_MAX_LENGTH:
            possible_matched.append(False)
            continue

        # Skip numbers only (no month and nothing but "soft" delimiters)
        match_delims = set("".join(date_props["delimiters"]))
        if not (match_delims - bad_delims) and num_month == 0:
            possible_matched.append(False)
            continue

        # Parse and skip nones. cutter == 0 tries the full string; for larger
        # cutter values, `cutter` tokens are dropped from the end (direction 0)
        # and then from the start (direction 1).
        date = None
        try:
            date_string_tokens = date_string.split()
            for cutter in range(len(date_string_tokens)):
                for direction in (0, 1):
                    if cutter > 0:
                        if direction:
                            _date_string_tokens = date_string_tokens[cutter:]
                        else:
                            _date_string_tokens = date_string_tokens[:-cutter]
                        date_string = ' '.join(_date_string_tokens)
                    try:
                        date = date_finder.parse_date_string(date_string, date_props)
                    except Exception:
                        # Any parser failure means "no date" for this variant.
                        # Exception (not a bare except) so KeyboardInterrupt and
                        # SystemExit still propagate.
                        date = None
                    if date:
                        break
                else:
                    continue  # executed if the loop ended normally (no break)
                break  # executed if 'continue' was skipped (break)
        except TypeError:
            possible_matched.append(False)
            continue

        if not date:
            possible_matched.append(False)
            continue

        # for case when datetime.datetime(2001, 1, 22, 20, 1, tzinfo=tzoffset(None, -104400))
        if hasattr(date, 'tzinfo'):
            try:
                _ = date.isoformat()
            except ValueError:
                possible_matched.append(False)
                continue
        possible_matched.append(True)

        # Reduce to a plain date when hour and minute are both zero
        if isinstance(date, datetime.datetime) and date.hour == 0 and date.minute == 0:
            date = date.date()

        # Append
        if return_source:
            yield (date, index)
        else:
            yield date
def test_get_date_time(self):
    # A date followed by a time and a timezone abbreviation must be merged
    # into a single match.
    sample = "March 20, 2015 3:30 pm GMT "
    finder = DateFinder()
    merged_tokens = finder.merge_tokens(finder.tokenize_string(sample))
    self.assertEqual(1, len(merged_tokens))