Esempio n. 1
0
    def test_compare_date_string(self):
        """Check token merging on a lease excerpt and compare new vs. old finder output."""
        text = """
               In the event the real estate taxes levied or assessed against the land
               and building of which the premises are a part in future tax years are
               greater than the real estate taxes for the base tax year, the TENANT,
               shall pay within thirty (30) days after submission of the bill to TENANT for the increase in
               real estate taxes, as additional rent a proportionate share of such
               increases, which proportionate share shall be computed at 22.08% of the
               increase in taxes, but shall exclude any fine, penalty, or interest
               charge for late or non-payment of taxes by LANDLORD. The base tax year
               shall be July 1, 1994 to June 30, 1995.
               """
        finder = DateFinder()
        merged_tokens = finder.merge_tokens(finder.tokenize_string(text))

        # The merged match strings, joined with '#', must reproduce the expected fragments
        expected_joined = """at 22.08# July 1, 1994 to June 30, 1995."""
        actual_joined = '#'.join(token.match_str for token in merged_tokens).strip()
        self.assertEqual(expected_joined, actual_joined)

        new_strings = list(finder.extract_date_strings(text, strict=True))
        old_strings = list(DateFinderOld().extract_date_strings(text, strict=True))

        # The two tokenizers use slightly different logic, so only require that
        # the new finder returns at least as many candidates as the old one
        self.assertGreaterEqual(len(new_strings), len(old_strings))
Esempio n. 2
0
 def test_get_tokens(self):
     """Tokenizing a short phrase yields 11 tokens with the expected group labels."""
     finder = DateFinder()
     token_list = finder.tokenize_string("At 1997, 20 FEB here, in")

     self.assertEqual(11, len(token_list))
     # Element [1] of a token is compared against its group label:
     # empty for the last token, 'delimiters' for the second one
     last_token = token_list[10]
     self.assertEqual('', last_token[1])
     second_token = token_list[1]
     self.assertEqual('delimiters', second_token[1])
Esempio n. 3
0
 def test_merge_tokens(self):
     """Merging the tokens of a short phrase produces a single date candidate."""
     finder = DateFinder()
     raw_tokens = finder.tokenize_string("At 1997, 20 FEB here, in")
     candidates = finder.merge_tokens(raw_tokens)

     self.assertEqual(1, len(candidates))

     # Inspect the only candidate: matched text, span and captured extra tokens
     candidate = candidates[0]
     self.assertEqual('At 1997, 20 FEB ', candidate.match_str)
     self.assertEqual((0, 16), candidate.indices)
     first_extra = candidate.captures['extra_tokens'][0]
     self.assertEqual('At', first_extra.strip())
    def test_parse_time(self):
        """Extracting date strings from a long text file must finish in under 15 seconds."""
        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_path = dir_path + '/../../../../test_data/long_parsed_text.txt'
        with codecs.open(file_path, 'r', encoding='utf-8') as fr:
            text = fr.read()

        # Anchor implied/partial dates at midnight, January 1st of the current year
        base_date = datetime.datetime.now().replace(
            day=1, month=1, hour=0, minute=0, second=0, microsecond=0)
        date_finder = DateFinder(base_date=base_date)

        # Use a monotonic, high-resolution clock: time.time() follows the wall
        # clock and can jump (e.g. NTP adjustment), which would make the bound flaky
        started = time.perf_counter()
        # Consume the generator fully so the whole extraction is timed
        _ = list(date_finder.extract_date_strings(text, strict=False))
        elapsed = time.perf_counter() - started
        self.assertLess(elapsed, 15)
    def test_parse_str(self):
        text = """
        ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 -                     569                -                     15                  -                     -                     -                     -                     -                     -                     -                     -                     -                     +
 1,195             1,339             3,019             1,820             13,831
        """
        base_date = datetime.datetime.now().replace(
            day=1, month=1, hour=0, minute=0, second=0, microsecond=0)

        # Find potential dates
        date_finder = DateFinder(base_date=base_date)
        possible_dates = list(date_finder.extract_date_strings(text, strict=False))
        self.assertGreater(len(possible_dates), 0)
Esempio n. 6
0
 def test_get_date_strings(self):
     """Strict extraction on a loan amendment clause finds four date strings."""
     text = """
             2. Amendment to Interest Rate. Beginning on February 1, 1998, and
                     continuing until July 18, 2002, which is the fifth anniversary of the Loan
                     conversion date, interest shall be fixed at an annual rate of 7.38%, which rate
                     is equal to 200 basis points above the Bank's five-year ""Treasury Constant
                     Rate"" in effect on January 23, 1998. In accordance with the Agreement, the
                     interest rate shall be adjusted again on July 18, 2002.
             """
     finder = DateFinder()
     found = list(finder.extract_date_strings(text, True))
     self.assertEqual(4, len(found))

     # Inspect the second match: matched text, span and captured groups
     second = found[1]
     self.assertEqual('until July 18, 2002', second[0])
     self.assertEqual((117, 139), second[1])

     captures = second[2]
     self.assertEqual([], captures['time'])
     digits = captures['digits']
     self.assertEqual('18', digits[0])
     self.assertEqual('2002', digits[1])
Esempio n. 7
0
def get_raw_dates(text, strict=False, base_date=None, return_source=False) -> Generator:
    """
    Find "raw" or potential date matches prior to false positive classification.

    Candidate strings come from ``DateFinder.extract_date_strings``. Each candidate
    passes through a chain of heuristic filters (double months, bare day-of-week,
    floats, fractions, three-digit blocks, ...) and is then parsed with
    ``DateFinder.parse_date_string``; when parsing fails, the candidate is retried
    with whitespace-separated tokens trimmed from its end or its start.

    :param text: raw text to search
    :param strict: whether to return only complete or strict matches
    :param base_date: base date to use for implied or partial matches; when falsy,
        midnight on January 1st of the current year is used
    :param return_source: if true, yield ``(date, index)`` tuples, where ``index`` is
        the position value reported by the finder for the match; otherwise yield
        the date alone
    :return: generator of parsed dates; ``datetime`` values whose hour and minute
        are both zero are reduced to ``date`` objects
    """
    # Setup base date
    if not base_date:
        base_date = datetime.datetime.now().replace(
            day=1, month=1, hour=0, minute=0, second=0, microsecond=0)

    # Find potential dates
    date_finder = DateFinder(base_date=base_date)

    # Map every extra token except "t" to a single space in the finder's replacements.
    # NOTE(review): if REPLACEMENTS is a class-level dict this mutates shared state - confirm.
    for extra_token in date_finder.EXTRA_TOKENS_PATTERN.split('|'):
        if extra_token != 't':
            date_finder.REPLACEMENTS[extra_token] = ' '

    # Materialize all candidates up front: the "day of" cleanup below looks back
    # at the previous candidate.
    possible_dates = list(date_finder.extract_date_strings(text, strict=strict))
    # possible_matched[k] records whether candidate k was accepted; every path
    # through the loop body appends exactly one entry.
    possible_matched = []

    for i, (date_string, index, date_props) in enumerate(possible_dates):
        # Cleanup "day of" strings: when the previous candidate was rejected and
        # carried exactly one digits modifier (e.g. "15th"), move that modifier
        # onto this candidate and prepend its numeric part to the string.
        # The i > 0 guard comes first so the first candidate never inspects
        # possible_dates[-1] (the last candidate).
        if i > 0 and ("of" in date_props["extra_tokens"] or "OF" in date_props["extra_tokens"]):
            prev_modifiers = possible_dates[i - 1][2]["digits_modifier"]
            if not possible_matched[i - 1] and len(prev_modifiers) == 1:
                date_props["digits_modifier"].extend(prev_modifiers)
                ordinal = prev_modifiers.pop()
                for suffix in ("st", "nd", "rd", "th"):
                    ordinal = ordinal.replace(suffix, "")
                date_string = ordinal + date_string

        # Counts used by the filters below
        num_dig_mod = len(date_props["digits_modifier"])
        num_dig = len(date_props["digits"])
        num_days = len(date_props["days"])
        num_month = len(date_props["months"])
        num_slash = date_props["delimiters"].count("/")
        num_hyphen = date_props["delimiters"].count("-")
        num_point = date_props["delimiters"].count(".")

        # Remove double months
        if num_month > 1:
            possible_matched.append(False)
            continue

        # Remove wrong months like Dec*ided or Mar*tin
        if num_month == 1 and date_props['extra_tokens'] \
                and (date_props['months'][0] + date_props['extra_tokens'][-1]) in date_string:
            possible_matched.append(False)
            continue

        # Check strange strings: digit modifiers without any digits
        if num_dig_mod > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOW only
        if num_days > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOM only
        if num_month == 0 and num_dig_mod == 0 and num_dig <= 1:
            possible_matched.append(False)
            continue

        # Skip odd date like "1 10"
        if re.match(r'\d{1,2}\s+\d{1,2}', date_string):
            possible_matched.append(False)
            continue

        # Skip floats
        if num_point and not num_month and not re.match(r'\d{2}\.\d{2}\.\d{2,4}', date_string):
            possible_matched.append(False)
            continue

        # Skip odd months from string like "Nil 62. Marquee"
        if re.search(r'\d{2,4}\.\s*[A-Za-z]', date_string):
            possible_matched.append(False)
            continue

        # Skip fractions
        if (num_slash == 1 or num_hyphen == 1) and num_dig > 2:
            possible_matched.append(False)
            continue

        # Skip three-digit blocks and double zero years
        if any(len(digit) == 3 or digit.startswith("00") for digit in date_props["digits"]):
            possible_matched.append(False)
            continue

        # Skip " may " alone
        if num_dig == 0 and num_days == 0 and "".join(date_props["months"]).lower() == "may":
            possible_matched.append(False)
            continue

        # Cleanup: strip extra tokens from the string, longest first, keeping "to"/"t"
        for token in sorted(date_props["extra_tokens"], key=len, reverse=True):
            if token.lower() in ["to", "t"]:
                continue
            date_string = date_string.replace(token, "")
        date_string = date_string.strip()
        date_props["extra_tokens"] = []

        # Skip strings too long
        if len(date_string) > DATE_MAX_LENGTH:
            possible_matched.append(False)
            continue

        # Skip numbers only: no month and no delimiter other than comma/whitespace
        match_delims = set("".join(date_props["delimiters"]))
        bad_delims = {",", " ", "\n", "\t"}
        len_diff_set = len(match_delims - bad_delims)
        if len_diff_set == 0 and num_month == 0:
            possible_matched.append(False)
            continue

        # Parse and skip nones. The full string is tried first; afterwards, for a
        # growing `cutter`, that many tokens are dropped from the end (direction 0)
        # and then from the start (direction 1) until something parses.
        date = None
        try:
            date_string_tokens = date_string.split()
            for cutter in range(len(date_string_tokens)):
                for direction in (0, 1):
                    if cutter > 0:
                        if direction:
                            _date_string_tokens = date_string_tokens[cutter:]
                        else:
                            _date_string_tokens = date_string_tokens[:-cutter]
                        date_string = ' '.join(_date_string_tokens)
                    try:
                        date = date_finder.parse_date_string(date_string, date_props)
                    except Exception:
                        # Best effort: any parser failure means "not a date", but
                        # KeyboardInterrupt/SystemExit must still propagate.
                        date = None
                    if date:
                        break
                else:
                    continue  # executed if the loop ended normally (no break)
                break  # executed if 'continue' was skipped (break)
        except TypeError:
            possible_matched.append(False)
            continue

        if not date:
            possible_matched.append(False)
            continue

        # for case when datetime.datetime(2001, 1, 22, 20, 1, tzinfo=tzoffset(None, -104400)):
        # such a value cannot be serialized, so reject it.
        if hasattr(date, 'tzinfo'):
            try:
                _ = date.isoformat()
            except ValueError:
                possible_matched.append(False)
                continue
        possible_matched.append(True)

        # Reduce datetimes with zero hour and minute to plain dates
        if isinstance(date, datetime.datetime) and date.hour == 0 and date.minute == 0:
            date = date.date()

        # Append
        if return_source:
            yield (date, index)
        else:
            yield date
Esempio n. 8
0
 def test_get_date_time(self):
     """A date followed by a time and timezone merges into a single candidate."""
     finder = DateFinder()
     raw_tokens = finder.tokenize_string("March 20, 2015 3:30 pm GMT ")
     candidates = finder.merge_tokens(raw_tokens)
     self.assertEqual(1, len(candidates))