def test_parse_date_string_find_replace(date_string, expected_parse_arg, expected_captures, expected_date):
    """
    Check that ``DateFinder.parse_date_string`` calls ``parser.parse`` with
    the expected argument and returns the expected value.

    :param date_string: date text passed to ``parse_date_string``
    :param expected_parse_arg: argument ``parser.parse`` must be called with
    :param expected_captures: captures passed alongside ``date_string``
    :param expected_date: value ``parse_date_string`` is expected to return
    :return:
    """
    dt = datefinder.DateFinder()
    # wraps= keeps the real parser.parse behaviour while recording its calls.
    with mock.patch.object(parser, 'parse', wraps=parser.parse) as spy:
        actual_datetime = dt.parse_date_string(date_string, expected_captures)
        spy.assert_called_with(expected_parse_arg)
        logger.debug("actual={}  expected={}".format(actual_datetime, expected_date))
        assert actual_datetime == expected_date
def test_extract_date_strings(date_string, expected_match_date_string):
    """
    Every string extracted from ``date_string`` must equal the expected
    match and must carry at least one captured timezone.

    :param date_string: text handed to ``extract_date_strings``
    :param expected_match_date_string: text each extracted match must equal
    :return:
    """
    finder = datefinder.DateFinder()
    matches = finder.extract_date_strings(date_string)
    for found_text, _span, found_captures in matches:
        message = "actual={}  expected={}".format(found_text, expected_match_date_string)
        logger.debug(message)
        assert found_text == expected_match_date_string
        timezones = found_captures.get('timezones', [])
        assert len(timezones) > 0
def test_find_and_replace(date_string, expected_replaced_string, captures, expected_tz_string):
    """
    ``DateFinder._find_and_replace`` must return the expected
    ``(date_string, tz_string)`` pair and leave ``REPLACEMENTS`` untouched.

    :param date_string: text handed to ``_find_and_replace``
    :param expected_replaced_string: expected first return value
    :param captures: captures handed to ``_find_and_replace``
    :param expected_tz_string: expected second return value
    :return:
    """
    finder = datefinder.DateFinder()
    # Shallow snapshot taken before the call, compared against afterwards.
    replacements_before = copy.copy(finder.REPLACEMENTS)

    replaced_string, tz_string = finder._find_and_replace(date_string, captures)

    # the call must not have mutated finder.REPLACEMENTS
    assert finder.REPLACEMENTS == replacements_before

    # the returned pair must match the expectations
    assert replaced_string == expected_replaced_string
    assert tz_string == expected_tz_string
def test_extract_date_strings_with_strict_option(date_string, expected_match_date_string):
    """
    With ``strict=True`` every extracted string must equal the expected one.

    Inputs that produce no strict match run zero loop iterations and
    therefore pass without any assertion being evaluated.

    :param date_string: text handed to ``extract_date_strings``
    :param expected_match_date_string: text each strict match must equal
    :return:
    """
    finder = datefinder.DateFinder()
    strict_matches = finder.extract_date_strings(date_string, strict=True)
    for found_text, _span, _captures in strict_matches:
        message = "actual={}  expected={}".format(found_text, expected_match_date_string)
        logger.debug(message)
        assert found_text == expected_match_date_string
def identifyDatetime(text, typeName="DATETIME_TYPE"):
    """
    Replace each strictly matched, parseable date string in ``text`` with
    ``formatTypeName(typeName)``.

    The function is best-effort: if anything raises while searching or
    replacing, the failure is logged at debug level and ``text`` is returned
    as it stands at that point (replacements already made are kept).

    :param text: text to scan for date strings
    :param typeName: name handed to ``formatTypeName`` to build the
        replacement for each date string
    :return: ``text`` with the matched date strings replaced
    """
    import logging  # local import: the file's import block is not visible here

    try:
        finder = datefinder.DateFinder()
        finds = finder.extract_date_strings(text, strict=True)
        for date_string, _indices, captures in finds:
            parsed_date = finder.parse_date_string(date_string, captures)
            # Only substitute matches that actually parse.
            if parsed_date is not None:
                text = text.replace(date_string, formatTypeName(typeName))
    except Exception:
        # Stay best-effort, but leave a trace instead of failing silently.
        logging.getLogger(__name__).debug(
            "identifyDatetime failed; returning text unchanged from this point",
            exc_info=True)
    return text
    def extract_datereference(self, file_obj, date):
        """
        Collect the valid dates found in ``file_obj`` into
        ``self.date_referred_list``.

        :param file_obj: iterable of text lines to scan
        :param date: value converted by ``self.str_to_dtfmt`` and used as the
            ``base_date`` of the date finder
        :return: None; results are stored in ``self.date_referred_list``
        """
        self.date_referred_list = []
        base_date = self.str_to_dtfmt(date)
        date_finder = DF.DateFinder(base_date=base_date)

        for line in file_obj:
            # Build the iterator once per line and advance it with the
            # built-in next(); the ``.next()`` method only exists on
            # Python 2 iterators.
            found_dates = iter(date_finder.find_dates(line))
            while True:
                try:
                    candidate = next(found_dates)
                    one, flag = self.valid_date(candidate)
                    if flag:
                        self.date_referred_list.append(one)
                except ValueError:
                    # Skip this candidate and keep consuming the iterator.
                    continue
                except StopIteration:
                    break
def test_parse_date_string_find_replace_nonexistent_tzinfo(date_string, expected_parse_arg, expected_captures, expected_date):
    '''
    mimic what happens when dateutil.tz.gettz tries
    to find a non-existent tzinfo string; gettz is mocked to return None
    because some operating systems might resolve 'CST' and 'IRST'

    :param date_string: date text passed to ``parse_date_string``
    :param expected_parse_arg: not used by this test
    :param expected_captures: captures passed alongside ``date_string``;
        its first ``'timezones'`` entry is the argument gettz must receive
    :param expected_date: value ``parse_date_string`` is expected to return
    :return:
    '''
    dt = datefinder.DateFinder()
    with mock.patch.object(tz, 'gettz', wraps=tz.gettz) as mock_gettz:
        # An explicit return_value takes precedence over the wrapped function,
        # so every lookup behaves as "timezone not found".
        mock_gettz.return_value = None
        actual_datetime = dt.parse_date_string(date_string, expected_captures)
        mock_gettz.assert_called_with(expected_captures['timezones'][0])
        logger.debug("actual={}  expected={}".format(actual_datetime, expected_date))
        assert actual_datetime == expected_date
Exemple #8
0
def test_parse_date_string_find_replace_nonexistent_tzinfo(
        date_string, expected_parse_arg, expected_captures, expected_date):
    '''
    Parsing a date whose tzinfo string dateutil cannot resolve must emit
    ``parser.UnknownTimezoneWarning`` and still return the expected value.
    Some operating systems might resolve 'CST' and 'IRST'.

    :param date_string: date text passed to ``parse_date_string``
    :param expected_parse_arg: not used by this test
    :param expected_captures: captures passed alongside ``date_string``
    :param expected_date: value ``parse_date_string`` is expected to return
    :return:
    '''
    with pytest.warns(parser.UnknownTimezoneWarning):
        # Make sure matching warnings are always reported inside this block.
        warnings.simplefilter('always')
        finder = datefinder.DateFinder()
        parsed = finder.parse_date_string(date_string, expected_captures)
        message = "actual={}  expected={}".format(parsed, expected_date)
        logger.debug(message)
        assert parsed == expected_date
Exemple #9
0
def test_tz_gettz_for_all_patterns():
    """
    determine which pattern matching tz_strings
    dateutil.tz.gettz will not handle

    The candidates are the alternatives of ``NA_TIMEZONES_PATTERN`` and
    ``TIMEZONES_PATTERN``; the two resulting lists are only logged at debug
    level, nothing is asserted.

    :warning: currently tz.gettz only matches 14 of regex timezones of our ~400
    [ GOOD MATCHES ]: ['PST', 'EST', 'MST', 'CET', 'EET', 'EST', 'GMT', 'HST', 'MET', 'MST', 'PDT', 'PST', 'UTC', 'WET']
    """
    bad_tz_strings = []
    good_tz_strings = []
    finder = datefinder.DateFinder()
    # Raw strings: the separator is the literal text ``|\s`` (backslash + s),
    # and ``\s`` is not a valid escape in a normal string literal.
    test_tz_strings = finder.NA_TIMEZONES_PATTERN.split(
        '|') + finder.TIMEZONES_PATTERN.split(r'|\s')
    for tz_string in test_tz_strings:
        if tz_string in finder.TIMEZONE_REPLACEMENTS:
            tz_string = finder.TIMEZONE_REPLACEMENTS[tz_string]
        # Drop any literal ``\s`` left over from the pattern before the lookup.
        tz_object = tz.gettz(tz_string.replace(r'\s', ''))
        if tz_object is None:
            bad_tz_strings.append(tz_string)
        else:
            good_tz_strings.append(tz_string)
    logger.debug("[ BAD TZINFO ]: {}".format(bad_tz_strings))
    logger.debug("[ GOOD TZINFO ]: {}".format(good_tz_strings))
Exemple #10
0
def get_raw_dates(text,
                  strict=False,
                  base_date=None,
                  return_source=False) -> Generator:
    """
    Find "raw" or potential date matches prior to false positive classification.

    Candidates come from ``DateFinder.extract_date_strings``. Each candidate
    goes through a chain of heuristic filters and is then parsed with
    ``DateFinder.parse_date_string``; only candidates that parse are yielded.

    :param text: raw text to search
    :param strict: whether to return only complete or strict matches
        (forwarded to ``extract_date_strings``)
    :param base_date: base date to use for implied or partial matches;
        defaults to midnight on January 1st of the current year
    :param return_source: when true, yield ``(date, index)`` tuples, where
        ``index`` is the second item ``extract_date_strings`` reported for
        the match; otherwise yield only the date
    :return: generator of parsed dates; a ``datetime`` whose hour and minute
        are both zero is yielded as a plain ``date``
    """
    # Setup base date
    if not base_date:
        base_date = datetime.datetime.now().replace(day=1,
                                                    month=1,
                                                    hour=0,
                                                    minute=0,
                                                    second=0,
                                                    microsecond=0)

    # Find potential dates
    date_finder = datefinder.DateFinder(base_date=base_date)

    # Map every extra token except 't' to a single space in REPLACEMENTS.
    # NOTE(review): if REPLACEMENTS is a class-level dict this changes it for
    # every DateFinder instance - confirm that is intended.
    for extra_token in date_finder.EXTRA_TOKENS_PATTERN.split('|'):
        if extra_token != 't':
            date_finder.REPLACEMENTS[extra_token] = ' '

    # Materialise all candidates first: the "day of" cleanup below needs to
    # look back at the previous candidate.
    possible_dates = [(date_string, index, date_props)
                      for date_string, index, date_props in
                      date_finder.extract_date_strings(text, strict=strict)]
    # possible_matched[k] records whether candidate k was accepted; exactly
    # one entry is appended per loop iteration.
    possible_matched = []
    for i, possible_date in enumerate(possible_dates):
        # Get
        date_string = possible_date[0]
        index = possible_date[1]
        date_props = possible_date[2]

        # Cleanup "day of" strings: when the previous candidate was rejected
        # and held exactly one digits modifier, prepend that modifier (with
        # its ordinal suffix stripped) to this candidate.
        if "of" in date_props["extra_tokens"] or "OF" in date_props[
                "extra_tokens"]:
            # For i == 0 this reads the last candidate, but the value is only
            # used behind the ``i > 0`` guard below.
            num_dig_mod = len(possible_dates[i - 1][2]["digits_modifier"])
            if i > 0 and not possible_matched[i - 1] and num_dig_mod == 1:
                date_props["digits_modifier"].extend(
                    possible_dates[i - 1][2]["digits_modifier"])
                date_string = possible_dates[i - 1][2]["digits_modifier"].pop().replace("st", "").replace("nd", "") \
                                  .replace("rd", "").replace("th", "") + date_string

        # Counts used by the filters below
        num_dig_mod = len(date_props["digits_modifier"])
        num_dig = len(date_props["digits"])
        num_days = len(date_props["days"])
        num_month = len(date_props["months"])
        num_slash = date_props["delimiters"].count("/")
        num_hyphen = date_props["delimiters"].count("-")

        # Remove double months
        if num_month > 1:
            possible_matched.append(False)
            continue

        # Remove wrong months like Dec*ided or Mar*tin
        if num_month == 1 and date_props['extra_tokens'] \
                and (date_props['months'][0] + date_props['extra_tokens'][-1]) in date_string:
            possible_matched.append(False)
            continue

        # Check strange strings: digit modifiers without any digits
        if num_dig_mod > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOW only
        if num_days > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOM only
        if num_month == 0 and num_dig_mod == 0 and num_dig <= 1:
            possible_matched.append(False)
            continue

        # Skip fractions
        if (num_slash == 1 or num_hyphen == 1) and num_dig > 2:
            possible_matched.append(False)
            continue

        # Skip three-digit blocks and double zero years
        found_triple = False
        found_dz = False
        for digit in date_props["digits"]:
            if len(digit) == 3:
                found_triple = True
            if digit.startswith("00"):
                found_dz = True
        if found_triple or found_dz:
            possible_matched.append(False)
            continue

        # Skip " may " alone
        if num_dig == 0 and num_days == 0 and "".join(
                date_props["months"]).lower() == "may":
            possible_matched.append(False)
            continue

        # Cleanup: strip extra tokens from the string, longest first; "to"
        # and "t" are kept.
        for token in sorted(date_props["extra_tokens"], key=len, reverse=True):
            if token.lower() in ["to", "t"]:
                continue
            date_string = date_string.replace(token, "")
        date_string = date_string.strip()
        date_props["extra_tokens"] = []

        # Skip strings too long
        if len(date_string) > DATE_MAX_LENGTH:
            possible_matched.append(False)
            continue

        # Skip numbers only: no month and no delimiter other than
        # comma/whitespace
        match_delims = set("".join(date_props["delimiters"]))
        bad_delims = {",", " ", "\n", "\t"}
        len_diff_set = len(match_delims - bad_delims)
        if len_diff_set == 0 and num_month == 0:
            possible_matched.append(False)
            continue

        # Parse and skip nones. The full string is tried first; after that
        # ``cutter`` tokens are trimmed from the end (direction 0) or from the
        # start (direction 1) until something parses.
        date = None
        try:
            date_string_tokens = date_string.split()
            for cutter in range(len(date_string_tokens)):
                for direction in (0, 1):
                    if cutter > 0:
                        if direction:
                            _date_string_tokens = date_string_tokens[cutter:]
                        else:
                            _date_string_tokens = date_string_tokens[:-cutter]
                        date_string = ' '.join(_date_string_tokens)
                    try:
                        date = date_finder.parse_date_string(
                            date_string, date_props)
                    except Exception:
                        # Any parser failure just means "no date here";
                        # KeyboardInterrupt/SystemExit are not swallowed.
                        date = None
                    if date:
                        break
                else:
                    continue  # executed if the loop ended normally (no break)
                break  # executed if 'continue' was skipped (break)
        except TypeError:
            possible_matched.append(False)
            continue

        if not date:
            possible_matched.append(False)
            continue
        else:
            # for case when datetime.datetime(2001, 1, 22, 20, 1, tzinfo=tzoffset(None, -104400))
            if hasattr(date, 'tzinfo'):
                try:
                    _ = date.isoformat()
                except ValueError:
                    possible_matched.append(False)
                    continue
            possible_matched.append(True)

        # A datetime at hour 0, minute 0 is reported as a plain date.
        if isinstance(
                date,
                datetime.datetime) and date.hour == 0 and date.minute == 0:
            date = date.date()
        # Append
        if return_source:
            yield (date, index)
        else:
            yield date
Exemple #11
0
def test_add_tzinfo(naive_datetime_obj, timezone_string):
    """
    ``DateFinder._add_tzinfo`` must return the naive datetime with its
    tzinfo set to ``tz.gettz(timezone_string)``.

    :param naive_datetime_obj: datetime handed to ``_add_tzinfo``
    :param timezone_string: timezone name handed to ``_add_tzinfo``
    :return:
    """
    wanted_tzinfo = tz.gettz(timezone_string)
    expected = naive_datetime_obj.replace(tzinfo=wanted_tzinfo)
    result = datefinder.DateFinder()._add_tzinfo(naive_datetime_obj, timezone_string)
    assert result == expected
def get_raw_dates(text,
                  strict=False,
                  base_date=None,
                  return_source=False) -> Generator:
    """
    Find "raw" or potential date matches prior to false positive classification.

    Candidates come from ``DateFinder.extract_date_strings``. Each candidate
    goes through a chain of heuristic filters and is then parsed with
    ``DateFinder.parse_date_string``; only candidates that parse are yielded.

    :param text: raw text to search
    :param strict: whether to return only complete or strict matches
        (forwarded to ``extract_date_strings``)
    :param base_date: base date to use for implied or partial matches;
        defaults to January 1st of the current year as a ``datetime.date``
    :param return_source: when true, yield ``(date, index)`` tuples, where
        ``index`` is the second item ``extract_date_strings`` reported for
        the match; otherwise yield only the parsed value
    :return: generator of the values returned by ``parse_date_string``
    """
    # Setup base date
    # NOTE(review): the default is a date, not a datetime - confirm that
    # DateFinder accepts this for strings that carry a time component.
    if not base_date:
        base_date = datetime.date(datetime.date.today().year, 1, 1)

    # Find potential dates
    date_finder = datefinder.DateFinder(base_date=base_date)

    # Materialise all candidates first: the "day of" cleanup below needs to
    # look back at the previous candidate.
    possible_dates = [(date_string, index, date_props)
                      for date_string, index, date_props in
                      date_finder.extract_date_strings(text, strict=strict)]
    # possible_matched[k] records whether candidate k was accepted; exactly
    # one entry is appended per loop iteration.
    possible_matched = []

    for i, possible_date in enumerate(possible_dates):
        # Unpack the candidate
        date_string = possible_date[0]
        index = possible_date[1]
        date_props = possible_date[2]

        # Cleanup "day of" strings: when the previous candidate was rejected
        # and held exactly one digits modifier, prepend that modifier (with
        # its ordinal suffix stripped) to this candidate.
        if "of" in date_props["extra_tokens"] or "OF" in date_props[
                "extra_tokens"]:
            # For i == 0 this reads the last candidate, but the value is only
            # used behind the ``i > 0`` guard below.
            num_dig_mod = len(possible_dates[i - 1][2]["digits_modifier"])
            if i > 0 and not possible_matched[i - 1] and num_dig_mod == 1:
                date_props["digits_modifier"].extend(
                    possible_dates[i - 1][2]["digits_modifier"])
                date_string = possible_dates[i - 1][2]["digits_modifier"].pop().replace("st", "").replace("nd", "") \
                                  .replace("rd", "").replace("th", "") + date_string

        # Counts used by the filters below
        num_dig_mod = len(date_props["digits_modifier"])
        num_dig = len(date_props["digits"])
        num_days = len(date_props["days"])
        num_month = len(date_props["months"])
        num_slash = date_props["delimiters"].count("/")
        num_hyphen = date_props["delimiters"].count("-")

        # Remove double months
        if num_month > 1:
            possible_matched.append(False)
            continue

        # Remove wrong months like Dec*ided or Mar*tin
        if num_month == 1 and date_props['extra_tokens'] \
                and (date_props['months'][0] + date_props['extra_tokens'][-1]) in date_string:
            possible_matched.append(False)
            continue

        # Check strange strings: digit modifiers without any digits
        if num_dig_mod > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOW only
        if num_days > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOM only
        if num_month == 0 and num_dig_mod == 0 and num_dig <= 1:
            possible_matched.append(False)
            continue

        # Skip fractions
        if (num_slash == 1 or num_hyphen == 1) and num_dig > 2:
            possible_matched.append(False)
            continue

        # Skip three-digit blocks and double zero years
        found_triple = False
        found_dz = False
        for digit in date_props["digits"]:
            if len(digit) == 3:
                found_triple = True
            if digit.startswith("00"):
                found_dz = True
        if found_triple or found_dz:
            possible_matched.append(False)
            continue

        # Skip " may " alone
        if num_dig == 0 and num_days == 0 and "".join(
                date_props["months"]).lower() == "may":
            possible_matched.append(False)
            continue

        # Cleanup: strip every extra token except "to" from the string, then
        # clear the token list on the props.
        for token in date_props["extra_tokens"]:
            if token.lower() in ["to"]:
                continue
            date_string = date_string.replace(token, "")
        date_string = date_string.strip()
        date_props["extra_tokens"] = []

        # Skip strings too long
        if len(date_string) > DATE_MAX_LENGTH:
            possible_matched.append(False)
            continue

        # Skip numbers only: no month and no delimiter other than
        # comma/whitespace
        match_delims = set("".join(date_props["delimiters"]))
        bad_delims = {",", " ", "\n", "\t"}
        len_diff_set = len(match_delims - bad_delims)
        if len_diff_set == 0 and num_month == 0:
            possible_matched.append(False)
            continue

        # Parse and skip nones; a TypeError from the parser rejects the
        # candidate, any other exception propagates to the caller.
        try:
            date = date_finder.parse_date_string(date_string, date_props)
        except TypeError:
            possible_matched.append(False)
            continue

        if not date:
            possible_matched.append(False)
            continue
        else:
            possible_matched.append(True)

        # Yield the result, together with the match index when requested
        if return_source:
            yield (date, index)
        else:
            yield date
Exemple #13
0
from pymarc import MARCReader
import datefinder
import re
import hashlib
# Module-level DateFinder instance, built once at import time.
dt = datefinder.DateFinder()


def dateparser(s):
    """
    Return the first run of four consecutive digits in ``s`` as a string.

    An optional leading ``c`` and whitespace before the digits are accepted
    but not returned. Bracketed values such as ``[1999]`` are covered by the
    same pattern, so no separate bracket pattern is needed.

    :param s: text to search, or None
    :return: the four-digit string, or None when ``s`` is None or contains
        no run of four digits
    """
    if s is None:
        return None
    match = re.search(r'c?\s*(\d{4})', s)
    return match.group(1) if match else None


# Module-level dict, empty at import time.
# NOTE(review): its readers/writers are not visible in this part of the file -
# presumably a memo for the parsing helpers; confirm against the rest of the file.
cache = {}


def authorparser(s):
    if s is not None:
        m = re.match(
            r'^(?P<last>\w+),\s*(?P<first>\w+)(,\s*(?P<born>\d{4})-(?P<died>\d{4}))?',
            s)
        if m:
            a = ''
            b = ''
            if m.group('born'):