Beispiel #1
0
def robust_date_parser(d):
    """
    Robust wrapper around some date parsing libs, making a best effort to
    return a single ISO 8601 date (YYYY-MM-DD) from the input string.

    The string is first given to dateparser.to_iso8601().  That result is
    returned as-is unless it is None or out_of_range(d) is true, in which
    case two fallbacks are tried in order:

    1. dateutil's parser with fuzzy=True, used for picking dates out of
       nearly arbitrary strings, at the cost of being forgiving of invalid
       dates in those strings.  A result whose year equals
       DEFAULT_DATETIME.year is discarded and None is returned (presumably
       the year was filled in from the default rather than found in the
       input -- TODO confirm).
    2. If dateutil raises, timelib.strtodatetime(), used for its ability to
       make at least some sense of invalid dates,
       e.g. 2012/02/31 -> 2012/03/03

    Returns the date portion of the fallback result when a fallback
    succeeds; otherwise whatever dateparser.to_iso8601() produced, which is
    None if it failed too.
    """
    isodate = dateparser.to_iso8601(d)
    if isodate is not None and not out_of_range(d):
        return isodate

    # Keep the fallback result separate from `isodate`: the latter is a
    # string, and calling .isoformat() on it when every fallback failed
    # would raise AttributeError.
    parsed = None
    try:
        parsed = dateutil_parse(d, fuzzy=True, default=DEFAULT_DATETIME)
        if parsed.year == DEFAULT_DATETIME.year:
            return None
    except Exception:
        try:
            parsed = timelib.strtodatetime(d, now=DEFAULT_DATETIME_SECS)
        except ValueError:
            # timelib could not make sense of the string either.
            pass
        except Exception as e:
            logger.error(e)

    if parsed:
        # Keep only the date part of "YYYY-MM-DDTHH:MM:SS".
        return parsed.isoformat().split("T", 1)[0]

    return isodate
Beispiel #2
0
def robust_date_parser(d):
    """
    Robust wrapper around some date parsing libs, making a best effort to
    return a single ISO 8601 date (YYYY-MM-DD) from the input string.

    The string is first given to dateparser.to_iso8601().  That result is
    returned as-is unless it is None or out_of_range(d) is true, in which
    case two fallbacks are tried in order:

    1. dateutil's parser with fuzzy=True, used for picking dates out of
       nearly arbitrary strings, at the cost of being forgiving of invalid
       dates in those strings.  A result whose year equals
       DEFAULT_DATETIME.year is discarded and None is returned (presumably
       the year was filled in from the default rather than found in the
       input -- TODO confirm).
    2. If dateutil raises, timelib.strtodatetime(), used for its ability to
       make at least some sense of invalid dates,
       e.g. 2012/02/31 -> 2012/03/03

    Returns the date portion of the fallback result when a fallback
    succeeds; otherwise whatever dateparser.to_iso8601() produced, which is
    None if it failed too.
    """
    isodate = dateparser.to_iso8601(d)
    if isodate is not None and not out_of_range(d):
        return isodate

    # Keep the fallback result separate from `isodate`: the latter is a
    # string, and calling .isoformat() on it when every fallback failed
    # would raise AttributeError.
    parsed = None
    try:
        parsed = dateutil_parse(d, fuzzy=True, default=DEFAULT_DATETIME)
        if parsed.year == DEFAULT_DATETIME.year:
            return None
    except Exception:
        try:
            parsed = timelib.strtodatetime(d, now=DEFAULT_DATETIME_SECS)
        except ValueError:
            # timelib could not make sense of the string either.
            pass
        except Exception as e:
            logger.error(e)

    if parsed:
        # Keep only the date part of "YYYY-MM-DDTHH:MM:SS".
        return parsed.isoformat().split("T", 1)[0]

    return isodate
Beispiel #3
0
def smart_parse_date(date):
    '''
    Accepts a string or unicode date to be parsed and returns the result of
    iso8601.parse_date() (a datetime.datetime, per the original author), or
    None when the date cannot be parsed.

    The input is stripped of surrounding whitespace, normalised with
    dateparser.to_iso8601() and then handed to iso8601.parse_date().

    A non-exhaustive list of dates that can be parsed (i.e. some date
    formats not listed here may also work):

    W3C dates, documented here:

    http://www.w3.org/TR/NOTE-datetime

    A subset of undelimited ISO-8601 dates work (as prevalent in LC MODS).

    YYYYDDMM
    YYYYDDMMhhmmss

    NOTE(review): the two patterns above are copied from the original
    docstring; they may have been meant as YYYYMMDD -- confirm.

    In general the dates have to be internationally unambiguous, Y2K-safe
    One exception is support for US convention, Y2K-safe year dates.

    MM/DD/YYYY
    '''
    date = date.strip()
    #FIXME: Yes, layers on layers.  Streamline it.
    try:
        return iso8601.parse_date(dateparser.to_iso8601(date))
    except (KeyboardInterrupt, SystemExit):
        # Never swallow interpreter-exit requests.
        raise
    except Exception:
        # Best effort: any parsing failure is reported as None.
        return None
Beispiel #4
0
def test_time_strings():
    """
    Check to_iso8601() against every entry in TEST_DATES.

    TEST_DATES is read line by line.  Blank lines and lines starting with
    "#" are skipped.  A line starting with "ISO:" introduces a new reference
    date, which must itself round-trip through to_iso8601() unchanged.
    Every other line must convert to the most recent reference date.
    """
    reference_date = None
    for line in TEST_DATES.splitlines():
        if not line or line.startswith("#"):
            # Ignore blank lines and comments
            continue
        if line.startswith("ISO:"):
            # This is a new date reference.
            # Check its value and save it for future comparisons
            _, reference_date = line.split()
            # Test for round-trip parsing
            iso_date = to_iso8601(reference_date)
            # could not be parsed
            assert iso_date is not None, reference_date
            # did not match
            assert iso_date == reference_date, (iso_date, reference_date)
            # Single-argument print(...) gives the same output on
            # Python 2 and Python 3.
            print("New date: %s" % (reference_date,))
            continue

        print("Test %s" % (line,))
        date = to_iso8601(line)
        assert date == reference_date, (line, date, reference_date)
Beispiel #5
0
def robust_date_parser(d):
    """
    Robust wrapper around some date parsing libs, making a best effort to
    return a single ISO 8601 date (YYYY-MM-DD) from the input string.

    Parsing is attempted in this order:

    1. If the string matches edtf_date_and_time it is parsed directly with
       dateutil and formatted.
    2. dateparser.to_iso8601().  Its result is returned as-is unless it is
       None or out_of_range(d) is true.
    3. dateutil's parser with fuzzy=True, used for picking dates out of
       nearly arbitrary strings, at the cost of being forgiving of invalid
       dates in those strings.  A result whose year equals
       DEFAULT_DATETIME.year is discarded.
    4. If dateutil raises, timelib.strtodatetime(), used for its ability to
       make at least some sense of invalid dates,
       e.g. 2012/02/31 -> 2012/03/03

    Returns the formatted fallback date when step 3 or 4 succeeds, otherwise
    the result of step 2, which is None if that failed too.
    """
    def format_date(dt):
        # Format by hand, since datetime.datetime.strftime() only works
        # with years >= 1900.
        return "%d-%02d-%02d" % (dt.year, dt.month, dt.day)

    # Bound up front so every path below can test it safely; previously it
    # could be unbound when timelib raised something other than ValueError.
    dateinfo = None

    # Check for EDTF timestamp first, because it is simple.
    if edtf_date_and_time.match(d):
        try:
            return format_date(dateutil_parse(d))
        except (TypeError, ValueError):
            # Not parseable by dateutil_parse(); fall through to the more
            # forgiving parsers instead of propagating the error.
            pass

    isodate = dateparser.to_iso8601(d)
    if isodate is None or out_of_range(d):
        try:
            dateinfo = dateutil_parse(d, fuzzy=True, default=DEFAULT_DATETIME)
            if dateinfo.year == DEFAULT_DATETIME.year:
                dateinfo = None
        except Exception:
            try:
                dateinfo = timelib.strtodatetime(d, now=DEFAULT_DATETIME_SECS)
            except ValueError:
                dateinfo = None
            except Exception as e:
                dateinfo = None
                logger.error("Exception %s in %s", e, __name__)

        if dateinfo:
            return format_date(dateinfo)

    return isodate
def robust_date_parser(d):
    """
    Robust wrapper around some date parsing libs, making a best effort to
    return a single ISO 8601 date (YYYY-MM-DD) from the input string.

    Parsing is attempted in this order:

    1. If the string matches edtf_date_and_time it is parsed directly with
       dateutil and formatted.
    2. dateparser.to_iso8601().  Its result is returned as-is unless it is
       None or out_of_range(d) is true.
    3. dateutil's parser with fuzzy=True, used for picking dates out of
       nearly arbitrary strings, at the cost of being forgiving of invalid
       dates in those strings.  A result whose year equals
       DEFAULT_DATETIME.year is discarded.
    4. If dateutil raises, timelib.strtodatetime(), used for its ability to
       make at least some sense of invalid dates,
       e.g. 2012/02/31 -> 2012/03/03

    Returns the formatted fallback date when step 3 or 4 succeeds, otherwise
    the result of step 2, which is None if that failed too.
    """
    def format_date(dt):
        # Format by hand, since datetime.datetime.strftime() only works
        # with years >= 1900.
        return "%d-%02d-%02d" % (dt.year, dt.month, dt.day)

    # Bound up front so every path below can test it safely; previously it
    # could be unbound when timelib raised something other than ValueError.
    dateinfo = None

    # Check for EDTF timestamp first, because it is simple.
    if edtf_date_and_time.match(d):
        try:
            return format_date(dateutil_parse(d))
        except (TypeError, ValueError):
            # Not parseable by dateutil_parse(); fall through to the more
            # forgiving parsers instead of propagating the error.
            pass

    isodate = dateparser.to_iso8601(d)
    if isodate is None or out_of_range(d):
        try:
            dateinfo = dateutil_parse(d, fuzzy=True, default=DEFAULT_DATETIME)
            if dateinfo.year == DEFAULT_DATETIME.year:
                dateinfo = None
        except Exception:
            try:
                dateinfo = timelib.strtodatetime(d, now=DEFAULT_DATETIME_SECS)
            except ValueError:
                dateinfo = None
            except Exception as e:
                dateinfo = None
                logger.error("Exception %s in %s", e, __name__)

        if dateinfo:
            return format_date(dateinfo)

    return isodate
Beispiel #7
0
def parse_date_or_range(d):
    """
    Split a free-form date string into the begin and end of a date range.

    Returns a tuple (a, b).  Each element is a string or None; both are None
    when nothing could be extracted.  A single date yields the same value
    for both ends.

    The string is tested against the module-level patterns year_range,
    day_range, decade_date, circa_range, month_year, decade_date_s and
    between_date (defined outside this function), and individual dates are
    normalised with robust_date_parser().  Strings mentioning BC, AD or AH
    are not handled yet and yield (None, None).
    """
    #TODO: Handle dates with BC, AD, AH
    #      Handle ranges like 1920s - 1930s
    #      Handle ranges like 11th - 12th century
    a, b = None, None

    hyphen_parts = d.split("-")
    slash_parts = d.split("/")

    if re.search(r"B\.?C\.?|A\.?D\.?|A\.?H\.?", d.upper()):
        # Era-qualified dates are unsupported (see TODO); leave a, b as None.
        pass
    elif year_range.match(d):
        match = year_range.match(d)
        a, b = sorted((match.group("year1"), match.group("year2")))
    elif len(hyphen_parts) % 2 == 0 or len(slash_parts) % 2 == 0:
        # Handle ranges: an even number of pieces means the string can be
        # cut into two halves around the delimiter.
        if len(hyphen_parts) % 2 == 0:
            delim, parts = "-", hyphen_parts
        else:
            delim, parts = "/", slash_parts

        if day_range.match(d):
            # ie 1970-08-01/02
            match = day_range.match(d)
            a = "%s-%s-%s" % (match.group("year"), match.group("month"),
                              match.group("day_begin"))
            b = "%s-%s-%s" % (match.group("year"), match.group("month"),
                              match.group("day_end"))
        elif decade_date.match(d):
            match = decade_date.match(d)
            a = match.group("year") + "0"
            b = match.group("year") + "9"
        elif len(parts) == 2 and any(0 < len(s) < 4 for s in parts):
            # ie 1970-90, 1970/90, 1970-9, 1970/9, 9/1979
            match = circa_range.match(d)
            if match:
                century = match.group("century")
                year_begin = century + match.group("year_begin")
                year_end = century + match.group("year_end")
                if int(year_begin) < int(year_end):
                    # ie 1970-90
                    a = robust_date_parser(year_begin)
                    b = robust_date_parser(year_end)
                else:
                    # ie 1970-9
                    (y, m) = parts
                    # If the second number is a month, format it to two digits
                    # and use "-" as the delim for consistency in the
                    # dateparser.to_iso8601 result
                    if 1 <= int(m) <= 12:
                        single = "%s-%02d" % (y, int(m))
                    else:
                        # ie 1970-13
                        # Just use the year
                        single = y

                    parsed = robust_date_parser(single)
                    a, b = parsed, parsed
            else:
                match = month_year.match(d)
                if match:
                    single = "%s-%02d" % (match.group("year"),
                                          int(match.group("month")))
                    parsed = robust_date_parser(single)
                    a, b = parsed, parsed
        elif "" in parts:
            # ie 1970- or -1970
            if parts[0]:
                a, b = parts[0], None
            else:
                a, b = None, parts[1]
        else:
            # ie 1970-01-01-1971-01-01, 1970Fall/August, 1970April/May, or
            # wordy date like "mid 11th century AH/AD 17th century (Mughal)"
            # Floor division keeps the slice index an int on Python 2 and 3.
            half = len(parts) // 2
            begin = delim.join(parts[:half])
            end = delim.join(parts[half:])

            # Check if month in begin or end
            m1 = re.sub(r"[-\d/]", "", begin)
            m2 = re.sub(r"[-\d/]", "", end)
            if m1 or m2:
                # ie 2004July/August, 2004Fall/Winter, or wordy date
                begin, end = None, None

                # Extract year
                for v in parts:
                    y = re.sub(r"(?i)[a-z]", "", v)
                    if len(y) == 4:
                        begin = y + m1.capitalize()
                        end = y + m2.capitalize()
                        if not dateparser.to_iso8601(begin) or \
                                not dateparser.to_iso8601(end):
                            # Fall back to the bare year for both ends.
                            begin, end = y, y
                        break

            if begin:
                a, b = robust_date_parser(begin), robust_date_parser(end)
    elif decade_date_s.match(d):
        match = decade_date_s.match(d)
        year_begin = match.group("year")
        year_end = match.group("year")[:3] + "9"
        a, b = year_begin, year_end
    elif between_date.match(d):
        match = between_date.match(d)
        year1 = int(match.group("year1"))
        year2 = int(match.group("year2"))
        a, b = str(min(year1, year2)), str(max(year1, year2))
    else:
        parsed = robust_date_parser(d)
        a, b = parsed, parsed

    return a, b
Beispiel #8
0
def parse_date_or_range(d):
    """
    Split a free-form date string into the begin and end of a date range.

    Returns a tuple (a, b).  Each element is a string or None; both are None
    when nothing could be extracted.  A single date yields the same value
    for both ends.

    The string is tested against the module-level patterns
    edtf_date_and_time, year_range, day_range, decade_date, circa_range,
    month_year, decade_date_s and between_date (defined outside this
    function), and individual dates are normalised with
    robust_date_parser().  "-", "/" and ".." are recognised as range
    delimiters.  Strings mentioning BC, AD or AH are not handled yet and
    yield (None, None).
    """
    #TODO: Handle dates with BC, AD, AH
    #      Handle ranges like 1920s - 1930s
    #      Handle ranges like 11th - 12th century
    if re.search(r"B\.?C\.?|A\.?D\.?|A\.?H\.?", d.upper()):
        # Era-qualified dates are unsupported (see TODO).  Return early:
        # a bare `pass` here would let them be parsed as ordinary dates.
        return None, None

    a, b = None, None
    is_edtf_timestamp = edtf_date_and_time.match(d)
    hyphen_split = d.split("-")
    slash_split = d.split("/")
    ellipse_split = d.split("..")
    is_hyphen_split = (len(hyphen_split) % 2 == 0)
    is_slash_split = (len(slash_split) % 2 == 0)
    is_ellipse_split = (len(ellipse_split) % 2 == 0)
    if year_range.match(d):
        match = year_range.match(d)
        a, b = sorted((match.group("year1"), match.group("year2")))
    elif (is_hyphen_split or is_slash_split or is_ellipse_split) \
            and not is_edtf_timestamp:
        # We passed over EDTF timestamps because they contain hyphens and we
        # can handle them below.  Note that we don't deal with ranges of
        # timestamps.
        #
        # Handle ranges
        if is_hyphen_split:
            delim, split_result = "-", hyphen_split
        elif is_slash_split:
            delim, split_result = "/", slash_split
        else:
            delim, split_result = "..", ellipse_split

        if day_range.match(d):
            # ie 1970-08-01/02
            match = day_range.match(d)
            a = "%s-%s-%s" % (match.group("year"), match.group("month"),
                              match.group("day_begin"))
            b = "%s-%s-%s" % (match.group("year"), match.group("month"),
                              match.group("day_end"))
        elif decade_date.match(d):
            match = decade_date.match(d)
            a = match.group("year") + "0"
            b = match.group("year") + "9"
        elif len(split_result) == 2 and \
                any(0 < len(s) < 4 for s in split_result):
            # ie 1970-90, 1970/90, 1970-9, 1970/9, 9/1979
            match = circa_range.match(d)
            if match:
                century = match.group("century")
                year_begin = century + match.group("year_begin")
                year_end = century + match.group("year_end")
                if int(year_begin) < int(year_end):
                    # ie 1970-90
                    a = robust_date_parser(year_begin)
                    b = robust_date_parser(year_end)
                else:
                    # ie 1970-9
                    (y, m) = split_result
                    # If the second number is a month, format it to two digits
                    # and use "-" as the delim for consistency in the
                    # dateparser.to_iso8601 result
                    if 1 <= int(m) <= 12:
                        single = "%s-%02d" % (y, int(m))
                    else:
                        # ie 1970-13
                        # Just use the year
                        single = y

                    parsed = robust_date_parser(single)
                    a, b = parsed, parsed
            else:
                match = month_year.match(d)
                if match:
                    single = "%s-%02d" % (match.group("year"),
                                          int(match.group("month")))
                    parsed = robust_date_parser(single)
                    a, b = parsed, parsed
        elif "" in split_result:
            # ie 1970- or -1970 (but not 19uu- nor -19uu)
            s = split_result
            if len(s[0]) == 4 and "u" not in s[0]:
                a, b = s[0], None
            elif len(s[1]) == 4 and "u" not in s[1]:
                a, b = None, s[1]
            else:
                a, b = None, None
        else:
            # ie 1970-01-01-1971-01-01, 1970 Fall/August, 1970 April/May, or
            # wordy date like "mid 11th century AH/AD 17th century (Mughal)"
            parts = d.replace(" ", "").split(delim)
            # Floor division keeps the slice index an int on Python 2 and 3.
            half = len(parts) // 2
            begin = delim.join(parts[:half])
            end = delim.join(parts[half:])

            # Check if month in begin or end
            m1 = re.sub(r"[-\d/]", "", begin)
            m2 = re.sub(r"[-\d/]", "", end)
            if m1 or m2:
                # ie 2004July/August, 2004Fall/Winter, or wordy date
                begin, end = None, None

                # Extract year
                for v in parts:
                    y = re.sub(r"(?i)[a-z]", "", v)
                    if len(y) == 4:
                        begin = y + m1.capitalize()
                        end = y + m2.capitalize()
                        if not dateparser.to_iso8601(begin) or \
                                not dateparser.to_iso8601(end):
                            # Fall back to the bare year for both ends.
                            begin, end = y, y
                        break

            if begin:
                a, b = robust_date_parser(begin), robust_date_parser(end)
    elif decade_date_s.match(d):
        match = decade_date_s.match(d)
        year_begin = match.group("year")
        year_end = match.group("year")[:3] + "9"
        a, b = year_begin, year_end
    elif between_date.match(d):
        match = between_date.match(d)
        year1 = int(match.group("year1"))
        year2 = int(match.group("year2"))
        a, b = str(min(year1, year2)), str(max(year1, year2))
    else:
        # This picks up a variety of things, in addition to timestamps.
        parsed = robust_date_parser(d)
        a, b = parsed, parsed

    return a, b
Beispiel #9
0
def parse_date_or_range(d):
    """
    Split a free-form date string into the begin and end of a date range.

    Returns a tuple (a, b).  Each element is a string or None; both are None
    when nothing could be extracted.  A single date yields the same value
    for both ends.

    The string is tested against the module-level patterns year_range,
    day_range, decade_date, circa_range, month_year, decade_date_s and
    between_date (defined outside this function), and individual dates are
    normalised with robust_date_parser().  Strings mentioning BC, AD or AH
    are not handled yet and yield (None, None).
    """
    #TODO: Handle dates with BC, AD, AH
    #      Handle ranges like 1920s - 1930s
    #      Handle ranges like 11th - 12th century
    a, b = None, None

    hyphen_parts = d.split("-")
    slash_parts = d.split("/")

    if re.search(r"B\.?C\.?|A\.?D\.?|A\.?H\.?", d.upper()):
        # Era-qualified dates are unsupported (see TODO); leave a, b as None.
        pass
    elif year_range.match(d):
        match = year_range.match(d)
        a, b = sorted((match.group("year1"), match.group("year2")))
    elif len(hyphen_parts) % 2 == 0 or len(slash_parts) % 2 == 0:
        # Handle ranges: an even number of pieces means the string can be
        # cut into two halves around the delimiter.
        if len(hyphen_parts) % 2 == 0:
            delim, parts = "-", hyphen_parts
        else:
            delim, parts = "/", slash_parts

        if day_range.match(d):
            # ie 1970-08-01/02
            match = day_range.match(d)
            a = "%s-%s-%s" % (match.group("year"), match.group("month"),
                              match.group("day_begin"))
            b = "%s-%s-%s" % (match.group("year"), match.group("month"),
                              match.group("day_end"))
        elif decade_date.match(d):
            match = decade_date.match(d)
            a = match.group("year") + "0"
            b = match.group("year") + "9"
        elif len(parts) == 2 and any(0 < len(s) < 4 for s in parts):
            # ie 1970-90, 1970/90, 1970-9, 1970/9, 9/1979
            match = circa_range.match(d)
            if match:
                century = match.group("century")
                year_begin = century + match.group("year_begin")
                year_end = century + match.group("year_end")
                if int(year_begin) < int(year_end):
                    # ie 1970-90
                    a = robust_date_parser(year_begin)
                    b = robust_date_parser(year_end)
                else:
                    # ie 1970-9
                    (y, m) = parts
                    # If the second number is a month, format it to two digits
                    # and use "-" as the delim for consistency in the
                    # dateparser.to_iso8601 result
                    if 1 <= int(m) <= 12:
                        single = "%s-%02d" % (y, int(m))
                    else:
                        # ie 1970-13
                        # Just use the year
                        single = y

                    parsed = robust_date_parser(single)
                    a, b = parsed, parsed
            else:
                match = month_year.match(d)
                if match:
                    single = "%s-%02d" % (match.group("year"),
                                          int(match.group("month")))
                    parsed = robust_date_parser(single)
                    a, b = parsed, parsed
        elif "" in parts:
            # ie 1970- or -1970
            if parts[0]:
                a, b = parts[0], None
            else:
                a, b = None, parts[1]
        else:
            # ie 1970-01-01-1971-01-01, 1970Fall/August, 1970April/May, or
            # wordy date like "mid 11th century AH/AD 17th century (Mughal)"
            # Floor division keeps the slice index an int on Python 2 and 3.
            half = len(parts) // 2
            begin = delim.join(parts[:half])
            end = delim.join(parts[half:])

            # Check if month in begin or end
            m1 = re.sub(r"[-\d/]", "", begin)
            m2 = re.sub(r"[-\d/]", "", end)
            if m1 or m2:
                # ie 2004July/August, 2004Fall/Winter, or wordy date
                begin, end = None, None

                # Extract year
                for v in parts:
                    y = re.sub(r"(?i)[a-z]", "", v)
                    if len(y) == 4:
                        begin = y + m1.capitalize()
                        end = y + m2.capitalize()
                        if not dateparser.to_iso8601(begin) or \
                                not dateparser.to_iso8601(end):
                            # Fall back to the bare year for both ends.
                            begin, end = y, y
                        break

            if begin:
                a, b = robust_date_parser(begin), robust_date_parser(end)
    elif decade_date_s.match(d):
        match = decade_date_s.match(d)
        year_begin = match.group("year")
        year_end = match.group("year")[:3] + "9"
        a, b = year_begin, year_end
    elif between_date.match(d):
        match = between_date.match(d)
        year1 = int(match.group("year1"))
        year2 = int(match.group("year2"))
        a, b = str(min(year1, year2)), str(max(year1, year2))
    else:
        parsed = robust_date_parser(d)
        a, b = parsed, parsed

    return a, b
def parse_date_or_range(d):
    """
    Split a free-form date string into the begin and end of a date range.

    Returns a tuple (a, b).  Each element is a string or None; both are None
    when nothing could be extracted.  A single date yields the same value
    for both ends.

    The string is tested against the module-level patterns
    edtf_date_and_time, year_range, day_range, decade_date, circa_range,
    month_year, decade_date_s and between_date (defined outside this
    function), and individual dates are normalised with
    robust_date_parser().  "-", "/" and ".." are recognised as range
    delimiters.  Strings mentioning BC, AD or AH are not handled yet and
    yield (None, None).
    """
    # TODO: Handle dates with BC, AD, AH
    #      Handle ranges like 1920s - 1930s
    #      Handle ranges like 11th - 12th century
    if re.search(r"B\.?C\.?|A\.?D\.?|A\.?H\.?", d.upper()):
        # Era-qualified dates are unsupported (see TODO).  Return early:
        # a bare `pass` here would let them be parsed as ordinary dates.
        return None, None

    a, b = None, None
    is_edtf_timestamp = edtf_date_and_time.match(d)
    hyphen_split = d.split("-")
    slash_split = d.split("/")
    ellipse_split = d.split("..")
    is_hyphen_split = (len(hyphen_split) % 2 == 0)
    is_slash_split = (len(slash_split) % 2 == 0)
    is_ellipse_split = (len(ellipse_split) % 2 == 0)
    if year_range.match(d):
        match = year_range.match(d)
        a, b = sorted((match.group("year1"), match.group("year2")))
    elif (is_hyphen_split or is_slash_split or is_ellipse_split) \
            and not is_edtf_timestamp:
        # We passed over EDTF timestamps because they contain hyphens and we
        # can handle them below.  Note that we don't deal with ranges of
        # timestamps.
        #
        # Handle ranges
        if is_hyphen_split:
            delim, split_result = "-", hyphen_split
        elif is_slash_split:
            delim, split_result = "/", slash_split
        else:
            delim, split_result = "..", ellipse_split

        if day_range.match(d):
            # ie 1970-08-01/02
            match = day_range.match(d)
            a = "%s-%s-%s" % (match.group("year"), match.group("month"),
                              match.group("day_begin"))
            b = "%s-%s-%s" % (match.group("year"), match.group("month"),
                              match.group("day_end"))
        elif decade_date.match(d):
            match = decade_date.match(d)
            a = match.group("year") + "0"
            b = match.group("year") + "9"
        elif len(split_result) == 2 and \
                any(0 < len(s) < 4 for s in split_result):
            # ie 1970-90, 1970/90, 1970-9, 1970/9, 9/1979
            match = circa_range.match(d)
            if match:
                century = match.group("century")
                year_begin = century + match.group("year_begin")
                year_end = century + match.group("year_end")
                if int(year_begin) < int(year_end):
                    # ie 1970-90
                    a = robust_date_parser(year_begin)
                    b = robust_date_parser(year_end)
                else:
                    # ie 1970-9
                    (y, m) = split_result
                    # If the second number is a month, format it to two digits
                    # and use "-" as the delim for consistency in the
                    # dateparser.to_iso8601 result
                    if 1 <= int(m) <= 12:
                        single = "%s-%02d" % (y, int(m))
                    else:
                        # ie 1970-13
                        # Just use the year
                        single = y

                    parsed = robust_date_parser(single)
                    a, b = parsed, parsed
            else:
                match = month_year.match(d)
                if match:
                    single = "%s-%02d" % (match.group("year"),
                                          int(match.group("month")))
                    parsed = robust_date_parser(single)
                    a, b = parsed, parsed
        elif "" in split_result:
            # ie 1970- or -1970 (but not 19uu- nor -19uu)
            s = split_result
            if len(s[0]) == 4 and "u" not in s[0]:
                a, b = s[0], None
            elif len(s[1]) == 4 and "u" not in s[1]:
                a, b = None, s[1]
            else:
                a, b = None, None
        else:
            # ie 1970-01-01-1971-01-01, 1970 Fall/August, 1970 April/May, or
            # wordy date like "mid 11th century AH/AD 17th century (Mughal)"
            parts = d.replace(" ", "").split(delim)
            # Floor division keeps the slice index an int on Python 2 and 3.
            half = len(parts) // 2
            begin = delim.join(parts[:half])
            end = delim.join(parts[half:])

            # Check if month in begin or end
            m1 = re.sub(r"[-\d/]", "", begin)
            m2 = re.sub(r"[-\d/]", "", end)
            if m1 or m2:
                # ie 2004July/August, 2004Fall/Winter, or wordy date
                begin, end = None, None

                # Extract year
                for v in parts:
                    y = re.sub(r"(?i)[a-z]", "", v)
                    if len(y) == 4:
                        begin = y + m1.capitalize()
                        end = y + m2.capitalize()
                        if not dateparser.to_iso8601(begin) or \
                                not dateparser.to_iso8601(end):
                            # Fall back to the bare year for both ends.
                            begin, end = y, y
                        break

            if begin:
                a, b = robust_date_parser(begin), robust_date_parser(end)
    elif decade_date_s.match(d):
        match = decade_date_s.match(d)
        year_begin = match.group("year")
        year_end = match.group("year")[:3] + "9"
        a, b = year_begin, year_end
    elif between_date.match(d):
        match = between_date.match(d)
        year1 = int(match.group("year1"))
        year2 = int(match.group("year2"))
        a, b = str(min(year1, year2)), str(max(year1, year2))
    else:
        # This picks up a variety of things, in addition to timestamps.
        parsed = robust_date_parser(d)
        a, b = parsed, parsed

    return a, b