Beispiel #1
0
def build2DigitYear(s, chrono_id, chrono_list, flags):
    """Build a two-digit-year entity and any finer sub-intervals found in ``s``.

    When ``has2DigitYear(s)`` reports a match and ``flags["fourdigityear"]`` is
    not set, a ``ChronoTwoDigitYearOperator`` is created. The same element is
    then probed, in order, for month, day, hour, minute and second. Probing
    stops at the first level that is not found or whose flag is already set.

    Every level that is found sets its flag. An entity is created only when
    its value passes the range check (month <= 12, day <= 31, hour <= 24,
    minute <= 60, second <= 60). A new entity is registered as the
    sub-interval of the entity one level above it, provided that entity was
    created; a level that fails its range check simply leaves a gap instead
    of raising ``UnboundLocalError``.

    Args:
        s: Reference token. Must provide ``getSpan()`` returning a
            ``(start, end)`` pair and be accepted by the ``has*`` detectors.
        chrono_id: Next free numeric entity id.
        chrono_list: List that receives the new entities (mutated in place).
        flags: Dict of "already found" markers. ``fourdigityear``, ``month``,
            ``day``, ``hour``, ``minute`` and ``second`` are read; the last
            five may be set to ``True``.

    Returns:
        The tuple ``(chrono_list, chrono_id, flags)``, with ``chrono_id``
        advanced by one per created entity. Entities are appended finest
        first: second, minute, hour, day, month, year.
    """
    found, text, startSpan, endSpan = has2DigitYear(s)
    if not found or flags["fourdigityear"]:
        return chrono_list, chrono_id, flags

    # In most cases this will be at the end of the Span
    ref_StartSpan, _ = s.getSpan()

    def to_abs(rel_start, rel_end):
        # Translate a match span relative to ``s`` into absolute coordinates.
        abs_start = ref_StartSpan + rel_start
        return abs_start, abs_start + abs(rel_end - rel_start)

    abs_start, abs_end = to_abs(startSpan, endSpan)
    year_entity = chrono.ChronoTwoDigitYearOperator(
        entityID=str(chrono_id) + "entity",
        start_span=abs_start,
        end_span=abs_end,
        value=text)
    chrono_id += 1

    # One row per granularity, coarsest first:
    #   (flag key, detector, text -> number, inclusive upper bound, factory).
    # Everything is wrapped in a lambda so a helper is only looked up when
    # its level is actually reached.
    levels = (
        ("month",
         lambda: hasMonthOfYear(s),
         lambda t: utils.getMonthNumber(t),
         12,
         lambda n, **common: chrono.chronoMonthOfYearEntity(
             month_type=calendar.month_name[n], **common)),
        ("day",
         lambda: hasDayOfMonth(s),
         int,
         31,
         lambda n, **common: chrono.ChronoDayOfMonthEntity(
             value=n, **common)),
        ("hour",
         lambda: hasHourOfDay(s),
         int,
         24,
         lambda n, **common: chrono.ChronoHourOfDayEntity(
             value=n, **common)),
        ("minute",
         lambda: hasMinuteOfHour(s),
         int,
         60,
         lambda n, **common: chrono.ChronoMinuteOfHourEntity(
             value=n, **common)),
        ("second",
         lambda: hasSecondOfMinute(s),
         int,
         60,
         lambda n, **common: chrono.ChronoSecondOfMinuteEntity(
             value=n, **common)),
    )

    # ``entities`` is ordered coarsest to finest; ``None`` marks a level that
    # was detected but whose value failed the range check.
    entities = [year_entity]
    parent = year_entity
    for flag, detect, parse, upper, make in levels:
        found, level_text, rel_start, rel_end = detect()
        if not found or flags[flag]:
            break
        flags[flag] = True
        abs_start, abs_end = to_abs(rel_start, rel_end)

        number = parse(level_text)
        entity = None
        if number <= upper:
            entity = make(
                number,
                entityID=str(chrono_id) + "entity",
                start_span=abs_start,
                end_span=abs_end)
            chrono_id += 1
            # Only link when the enclosing interval was actually built.
            if parent is not None:
                parent.set_sub_interval(entity.get_id())
        entities.append(entity)
        parent = entity

    # Emit finest granularity first, skipping the gaps.
    chrono_list.extend(e for e in reversed(entities) if e is not None)

    return chrono_list, chrono_id, flags
Beispiel #2
0
def _parseCondensedDate(text, layouts, year_bounds):
    """Return ``(layout, year, month, day)`` for the first layout fitting ``text``.

    Each layout is three ``(start, end)`` slices locating the year, month and
    day digits inside ``text``. A layout fits when all three slices convert to
    a number, the month is 1-12, the day is 1-31 and, when ``year_bounds`` is
    given, the year lies inside that inclusive ``(low, high)`` range.
    Returns ``None`` when no layout fits.
    """
    for layout in layouts:
        year, month, day = (
            utils.getNumberFromText(text[a:b]) for a, b in layout)
        if year is None or month is None or day is None:
            continue
        if year_bounds is not None and not (
                year_bounds[0] <= year <= year_bounds[1]):
            continue
        # Lower bounds matter: calendar.month_name[0] is the empty string.
        if 1 <= month <= 12 and 1 <= day <= 31:
            return layout, year, month, day
    return None


def _appendYearMonthDay(chrono_list, chrono_id, year_factory, base, parsed):
    """Append linked year -> month -> day entities; return the next free id.

    ``parsed`` is a result of ``_parseCondensedDate``; ``base`` is the
    absolute offset of the token's first character. Ids are handed out in the
    order year, month, day regardless of where each part sits in the token.
    """
    ((y0, y1), (m0, m1), (d0, d1)), year, month, day = parsed

    year_entity = year_factory(
        entityID=str(chrono_id) + "entity",
        start_span=base + y0,
        end_span=base + y1,
        value=year)
    chrono_id += 1

    month_entity = chrono.chronoMonthOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=base + m0,
        end_span=base + m1,
        month_type=calendar.month_name[month])
    chrono_id += 1
    year_entity.set_sub_interval(month_entity.get_id())

    day_entity = chrono.ChronoDayOfMonthEntity(
        entityID=str(chrono_id) + "entity",
        start_span=base + d0,
        end_span=base + d1,
        value=day)
    chrono_id += 1
    month_entity.set_sub_interval(day_entity.get_id())

    chrono_list.extend((year_entity, month_entity, day_entity))
    return chrono_id


def buildNumericDate(s, chrono_id, chrono_list, flags):
    """Build entities for purely numeric dates found in the text of ``s``.

    The text is lower-cased, stripped of leading/trailing ``.`` and ``,`` and
    split on single spaces. Each token is then handled by length:

    * 4 characters: taken as a year when it converts to a number between 1500
      and 2050 and neither ``flags["fourdigityear"]`` nor
      ``flags["loneDigitYear"]`` is set. Note that 24-hour times in this
      range are interpreted as years; a token with a timezone attached, such
      as 1800EDT, is not parsed here.
    * 8 numeric characters: a condensed date, tried as yyyymmdd and then as
      mmddyyyy (year 1500-2050).
    * 6 numeric characters: a condensed date, tried as mmddyy and then as
      yymmdd. Dates such as 12-01-2006 (120106 vs 061201) are not
      distinguishable, so the first layout that validates wins.

    Spans are offset by the token's position inside the element text, so a
    date that is not the first token still gets its own coordinates.

    Args:
        s: Reference token providing ``getText()`` and ``getSpan()``.
        chrono_id: Next free numeric entity id.
        chrono_list: List that receives the new entities (mutated in place).
        flags: Dict of "already found" markers; ``fourdigityear`` and
            ``loneDigitYear`` are read and ``loneDigitYear`` may be set.

    Returns:
        The tuple ``(chrono_list, chrono_id, flags)``, with ``chrono_id``
        advanced by one per created entity.
    """
    text_lower = s.getText().lower()
    text_norm = text_lower.strip(".,")
    ref_StartSpan, _ = s.getSpan()

    # Offset of the current token inside the element text; it starts after
    # whatever leading punctuation strip() removed.
    token_offset = len(text_lower) - len(text_lower.lstrip(".,"))

    # (start, end) slices for year, month, day in each supported layout.
    yyyymmdd = ((0, 4), (4, 6), (6, 8))
    mmddyyyy = ((4, 8), (0, 2), (2, 4))
    mmddyy = ((4, 6), (0, 2), (2, 4))
    yymmdd = ((0, 2), (2, 4), (4, 6))

    for text in text_norm.split(" "):
        base = ref_StartSpan + token_offset
        # split(" ") consumes exactly one space per token boundary.
        token_offset += len(text) + 1

        if len(text) == 4:
            # A 4 digit number is assumed to be a year if between 1500 and 2050.
            num = utils.getNumberFromText(text)
            if (num is not None and 1500 <= num <= 2050
                    and not flags["fourdigityear"]
                    and not flags["loneDigitYear"]):
                flags["loneDigitYear"] = True
                chrono_year_entity = chrono.ChronoYearEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=base,
                    end_span=base + 4,
                    value=num)
                chrono_id += 1
                chrono_list.append(chrono_year_entity)
            continue

        if len(text) not in (6, 8):
            continue
        if utils.getNumberFromText(text) is None:
            continue

        if len(text) == 8:
            # Condensed date format like 19980303 or 03031998.
            parsed = _parseCondensedDate(
                text, (yyyymmdd, mmddyyyy), (1500, 2050))
            year_factory = chrono.ChronoYearEntity
        else:
            # Condensed date format like 030399 or 990303.
            parsed = _parseCondensedDate(text, (mmddyy, yymmdd), None)
            year_factory = chrono.ChronoTwoDigitYearOperator

        if parsed is not None:
            chrono_id = _appendYearMonthDay(
                chrono_list, chrono_id, year_factory, base, parsed)

    return chrono_list, chrono_id, flags


####
# END_MODULE
####