Exemple #1
0
def hasModifierText(tpentity):
    """Find the first modifier word (e.g. "early", "nearly", "fiscal") in a phrase.

    The phrase text is lower-cased, commas are removed and the result is split
    on single spaces. Tokens are then checked in order against a fixed list of
    modifier words.

    Args:
        tpentity: Object exposing ``getText()`` that returns the phrase string.

    Returns:
        A 4-tuple ``(found, token, start_idx, end_idx)``. On a hit ``found`` is
        True, ``token`` is the matching token and the indices are whatever
        ``calculateSpan`` reports for that token within the normalised text.
        When no token matches the result is ``(False, None, None, None)``.
    """
    text_norm = tpentity.getText().lower().translate(
        str.maketrans("", "", ","))

    modifiers = (
        "nearly", "almost", "<", "late", "mid", "fiscal", "fy", "over",
        "early", "approximately", "beginning"
    )

    # Earlier revisions returned False as soon as the *first* token failed to
    # match, so a modifier in any later position was never seen. Scan every
    # token and only give up once the whole phrase has been examined.
    for token in text_norm.split(" "):
        # The previous pair of substring tests (some modifier inside the token
        # AND the token inside some modifier) accepts exactly the listed words
        # for this vocabulary, so a direct membership test is used instead.
        if token in modifiers:
            # Compute the span once instead of once per index.
            start_idx, end_idx = calculateSpan(text_norm, token)
            return True, token, start_idx, end_idx

    return False, None, None, None
Exemple #2
0
def hasPartOfWeek(tpentity):
    """Report whether a phrase names the weekend.

    All ``string.punctuation`` characters are deleted from the phrase (case is
    left untouched) and the remainder is split on single spaces. The phrase
    qualifies only when exactly one of "weekend" / "weekends" appears among
    the tokens.

    Args:
        tpentity: Object exposing ``getText()`` that returns the phrase string.

    Returns:
        ``(True, "Weekend", start_idx, end_idx)`` with the indices supplied by
        ``calculateSpan`` on the punctuation-free text, or
        ``(False, None, None, None)``.
    """
    stripped = tpentity.getText().translate(
        str.maketrans("", "", string.punctuation))

    hits = {"weekend", "weekends"}.intersection(stripped.split(" "))

    # Zero hits means nothing to report; two hits (both spellings present) is
    # treated as ambiguous and rejected as well.
    if len(hits) != 1:
        return False, None, None, None

    (word,) = hits
    span_start, span_end = calculateSpan(stripped, word)
    return True, "Weekend", span_start, span_end
Exemple #3
0
def hasAMPM(tpentity):
    """Detect an AM or PM marker in a phrase.

    Commas are removed from the phrase (case is preserved) and the result is
    split on single spaces. Tokens are inspected in order; within a token the
    AM spellings are tried before the PM spellings.

    Args:
        tpentity: Object exposing ``getText()`` that returns the phrase string.

    Returns:
        ``(True, "AM" | "PM", start_idx, end_idx)`` for the first token that
        contains a marker, where the indices come from ``calculateSpan`` applied
        to the comma-free text and the matched marker; otherwise
        ``(False, None, None, None)``.
    """
    text_norm = tpentity.getText().translate(str.maketrans("", "", ","))

    # Raw strings: "\." inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    markers = (
        ("AM", r'AM|A\.M\.|am|a\.m\.'),
        ("PM", r'PM|P\.M\.|pm|p\.m\.'),
    )

    for token in text_norm.split(" "):
        for label, pattern in markers:
            # NOTE(review): the search is unanchored, so a marker embedded in a
            # longer word (e.g. the "am" in "amount") is accepted too.
            found = re.search(pattern, token)
            if found:
                start_idx, end_idx = calculateSpan(text_norm, found.group(0))
                return True, label, start_idx, end_idx

    return False, None, None, None
def has24HourTime(tpentity, flags):
    """Look for a 24-hour clock time such as ``0830`` or ``1400EST`` in a phrase.

    The phrase is split on single spaces and tokens are examined in order:

    * a token of exactly four characters is split into a two-character hour
      and a two-character minute, each converted with
      ``utils.getNumberFromText``;
    * any other token is accepted when it contains up to four digits followed
      by one of the listed time-zone abbreviations (upper case only).

    Args:
        tpentity: Object exposing ``getText()`` that returns the phrase string.
        flags: Mapping with a ``"loneDigitYear"`` entry; when that entry is
            truthy no search is performed.

    Returns:
        ``(True, matched_text, start_idx, end_idx)`` for the first accepted
        token, with indices from ``calculateSpan`` on the original text.
        ``(False, None, None, None)`` when the flag is set, when nothing
        matches, or as soon as a four-character numeric token has an
        out-of-range hour or minute.
    """
    stext = tpentity.getText()

    if flags["loneDigitYear"]:
        return False, None, None, None

    tz_pattern = re.compile(
        r'\d{0,4}(AST|EST|EDT|CST|CDT|MST|MDT|PST|PDT|AKST|HST|HAST|HADT|SST|SDT|GMT|CHST|UTC)'
    )

    for token in stext.split(" "):
        if len(token) == 4:
            # Presumably getNumberFromText yields None for non-numeric text;
            # such tokens are simply skipped.
            if utils.getNumberFromText(token) is None:
                continue
            hour = utils.getNumberFromText(token[:2])
            minute = utils.getNumberFromText(token[2:])
            if hour is None or minute is None:
                continue
            # Minutes run 00-59; the former "> 60" test let values such as
            # "1960" through as a clock time. Hour 24 is still tolerated.
            if minute >= 60 or hour > 24:
                return False, None, None, None
            start_idx, end_idx = calculateSpan(stext, token)
            return True, token, start_idx, end_idx

        # Only tokens that are not four characters long get the zone test.
        tz_match = tz_pattern.search(token)
        if tz_match is not None:
            stamp = tz_match.group(0)
            start_idx, end_idx = calculateSpan(stext, stamp)
            return True, stamp, start_idx, end_idx

    return False, None, None, None
Exemple #5
0
def hasPartOfDay(tpentity):
    """Detect a part-of-day word (morning, noon, tonight, ...) in a phrase.

    The phrase is lower-cased, every ``string.punctuation`` character is
    deleted and the remainder is split on single spaces. The phrase qualifies
    only when exactly one distinct vocabulary word is present; two or more
    different words are treated as ambiguous.

    Args:
        tpentity: Object exposing ``getText()`` that returns the phrase string.

    Returns:
        ``(True, label, start_idx, end_idx)`` where ``label`` is one of
        "Morning", "Dawn", "Evening", "Afternoon", "Night", "Noon",
        "Midnight" or "Unknown" (used for "bedtime"); otherwise
        ``(False, None, None, None)``. For "night", "tonight" and "overnight"
        the indices cover the first occurrence of the substring "night" in the
        normalised text; for every other word they come from
        ``calculateSpan``.
    """
    text_norm = tpentity.getText().lower().translate(
        str.maketrans("", "", string.punctuation))

    # Vocabulary and the label each word maps to. "evenings" was previously
    # missing from the vocabulary ("evening" was listed twice instead), so the
    # plural could never be recognised.
    labels = {
        "morning": "Morning",
        "mornings": "Morning",
        "dawn": "Dawn",
        "evening": "Evening",
        "evenings": "Evening",
        "dusk": "Evening",
        "eve": "Evening",
        "afternoon": "Afternoon",
        "afternoons": "Afternoon",
        "nights": "Night",
        "noon": "Noon",
        "bedtime": "Unknown",
        "midnight": "Midnight",
        "night": "Night",
        "overnight": "Night",
        "tonight": "Night",
    }

    hits = set(text_norm.split(" ")).intersection(labels)
    if len(hits) != 1:
        return False, None, None, None

    (term,) = hits

    if term in ("night", "overnight", "tonight"):
        # Only the "night" part is annotated, so the span is located directly
        # and no calculateSpan lookup is needed for these words.
        bare = re.search("night", text_norm)
        return True, "Night", bare.start(), bare.end()

    start_idx, end_idx = calculateSpan(text_norm, term)
    return True, labels[term], start_idx, end_idx
Exemple #6
0
def hasEmbeddedPeriodInterval(tpentity):
    """Detect a period word glued to a leading number, e.g. ``3days`` or ``2hrs``.

    Every ``string.punctuation`` character in the phrase is replaced by a
    space (case is preserved) and the result is split on single spaces. For
    each token, each period stem is searched for; when the stem starts after
    position 0 the text in front of it is passed to
    ``utils.getNumberFromText`` and must come back as an ``int``.

    Args:
        tpentity: Object exposing ``getText()`` that returns the phrase string.

    Returns:
        ``(True, label, start_idx, end_idx, number_text)`` where ``label`` is
        "Day", "Week", "Month" or "Hour", the indices come from
        ``calculateSpan`` on the normalised text and the period suffix, and
        ``number_text`` is the raw text that preceded the suffix. Otherwise
        ``(False, None, None, None, None)``.
    """
    text_norm = tpentity.getText().translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    # Stems searched for inside each token, in priority order.
    stems = ("day", "week", "month", "hour", "days", "weeks", "months",
             "hours", "hrs")

    # Label for the text from the stem to the end of the token. "hrs" was
    # searched for but had no label before, so "2hrs" could never match.
    labels = {
        "day": "Day", "daily": "Day", "days": "Day",
        "week": "Week", "weekly": "Week", "weeks": "Week",
        "month": "Month", "monthly": "Month", "months": "Month",
        "hour": "Hour", "hourly": "Hour", "hours": "Hour", "hrs": "Hour",
    }

    for token in text_norm.split(" "):
        for stem in stems:
            idx = token.find(stem)
            if idx <= 0:
                # Stem absent, or the token starts with it (no number in front).
                continue

            prefix, suffix = token[:idx], token[idx:]
            if not isinstance(utils.getNumberFromText(prefix), int):
                # A non-numeric prefix (e.g. "every" in "everyday") used to
                # abort the whole search; keep scanning the remaining stems
                # and tokens instead.
                continue

            label = labels.get(suffix)
            if label is not None:
                start_idx, end_idx = calculateSpan(text_norm, suffix)
                return True, label, start_idx, end_idx, prefix

    return False, None, None, None, None
Exemple #7
0
def hasSeasonOfYear(tpentity, ref_list):
    """Detect a season name (summer, winter, fall, spring) used as a noun.

    The phrase is lower-cased, every ``string.punctuation`` character is
    replaced by a space, the result is stripped and split on single spaces.
    The phrase qualifies only when exactly one distinct season word (singular
    or plural) is present and the reference token covering it carries the
    part-of-speech tag "NN".

    Args:
        tpentity: Object exposing ``getSpan()`` (start, end) and ``getText()``.
        ref_list: Sequence of reference tokens; the element selected through
            ``utils.getRefIdx`` must expose ``getPos()``.

    Returns:
        ``(True, label, start_idx, end_idx)`` with ``label`` one of "Summer",
        "Winter", "Fall", "Spring", or ``(False, None, None, None)``. The
        indices are phrase-relative and always describe the singular season
        word, even when the plural form was the token that matched.
    """
    ref_start, _ = tpentity.getSpan()

    text_norm = tpentity.getText().lower().translate(
        str.maketrans(string.punctuation,
                      ' ' * len(string.punctuation))).strip()

    # Each accepted token mapped to the singular word whose span is reported.
    singular_of = {
        "summer": "summer", "summers": "summer",
        "winter": "winter", "winters": "winter",
        "fall": "fall", "falls": "fall",
        "spring": "spring", "springs": "spring",
    }

    hits = set(text_norm.split(" ")).intersection(singular_of)
    if len(hits) != 1:
        # None found, or several different season words: nothing to report.
        return False, None, None, None

    season = singular_of[hits.pop()]
    start_idx, end_idx = calculateSpan(text_norm, season)

    # Shift to document coordinates to find the matching reference token.
    postag = ref_list[utils.getRefIdx(ref_list, ref_start + start_idx,
                                      ref_start + end_idx)].getPos()

    # Only the "NN" tag is accepted; any other tag rejects the phrase.
    if postag == "NN":
        return True, season.capitalize(), start_idx, end_idx

    return False, None, None, None
Exemple #8
0
def hasPeriodInterval(tpentity):
    """Detect a stand-alone period/interval word (day, weekly, hrs, ...).

    Phrases containing "date/time" (after lower-casing) are rejected outright.
    Otherwise every ``string.punctuation`` character is replaced by a space,
    the text is stripped and split on single spaces, and the tokens are
    compared with a fixed vocabulary:

    * exactly one distinct vocabulary word present -> that word is reported;
    * several present -> reported only if exactly one of them belongs to the
      short preferred list (daily/weekly/monthly/yearly and the plurals
      days/weeks/months/years).

    Args:
        tpentity: Object exposing ``getText()`` that returns the phrase string.

    Returns:
        ``(True, label, start_idx, end_idx, False)`` with the indices taken
        from ``calculateSpan`` on the normalised text (the last element is
        always False on success), or ``(False, None, None, None, None)``.
    """
    no_match = (False, None, None, None, None)

    text = tpentity.getText().lower()

    # These specific mentions must not be annotated.
    if re.search("date/time", text):
        return no_match

    text_norm = text.translate(
        str.maketrans(string.punctuation,
                      ' ' * len(string.punctuation))).strip()

    label_groups = (
        ("Day", ("day", "daily", "days", "yesterday", "tomorrow",
                 "yesterdays", "tomorrows", "today", "todays")),
        ("Week", ("week", "weekly", "weeks")),
        ("Month", ("month", "monthly", "months")),
        ("Year", ("year", "yearly", "years", "annual")),
        ("Century", ("century", "centuries")),
        ("Decade", ("decade", "decades")),
        ("Minute", ("minute", "minutes", "min", "mins")),
        ("Second", ("second", "seconds")),
        ("Hour", ("hour", "hourly", "hours", "hr", "hrs")),
        ("Unknown", ("time", "shortly", "soon", "briefly", "awhile",
                     "future", "lately", "quarter")),
    )
    label_of = {
        word: label for label, words in label_groups for word in words
    }

    present = set(text_norm.split(" ")).intersection(label_of)

    if len(present) == 1:
        candidates = present
    elif len(present) > 1:
        # Several period words: fall back to the preferred subset and accept
        # it only when it narrows the choice down to a single word.
        candidates = present & {
            "daily", "weekly", "monthly", "yearly", "weeks", "days",
            "months", "years"
        }
    else:
        candidates = set()

    if len(candidates) != 1:
        return no_match

    (chosen,) = candidates
    start_idx, end_idx = calculateSpan(text_norm, chosen)
    return True, label_of[chosen], start_idx, end_idx, False
Exemple #9
0
def hasTextMonth(tpentity, ref_list):
    """Detect a month written out as text (full name or abbreviation).

    Three passes are made over the lower-cased phrase, in this order:

    1. full month names, with all ``string.punctuation`` blanked out;
    2. dotted abbreviations ("jan.", "sept.", ...), with punctuation other
       than "." blanked out;
    3. bare abbreviations ("jan", "sept", ...), tokenised as in pass 2.

    "may" only appears in the full-name pass. In each pass only the first
    token that is a month word is considered; it is accepted when the
    reference token covering it is tagged "NNP", otherwise the next pass runs.

    Args:
        tpentity: Object exposing ``getSpan()`` (start, end) and ``getText()``.
        ref_list: Sequence of reference tokens; the element selected through
            ``utils.getRefIdx`` must expose ``getPos()``.

    Returns:
        ``(True, month_name, start_idx, end_idx)`` with the capitalised full
        month name and phrase-relative indices from ``calculateSpan`` on the
        lower-cased text, or ``(False, None, None, None)``.
    """
    ref_start, _ = tpentity.getSpan()
    text_lower = tpentity.getText().lower()

    def tokens_with_blanked(chars):
        # Replace each of `chars` by a space, trim, and split on single spaces.
        return text_lower.translate(
            str.maketrans(chars, ' ' * len(chars))).strip().split(" ")

    # string.punctuation without the full stop, so "jan." stays one token.
    keep_dots = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'

    full_names = {
        "january": "January", "february": "February", "march": "March",
        "april": "April", "may": "May", "june": "June", "july": "July",
        "august": "August", "september": "September", "october": "October",
        "november": "November", "december": "December",
    }
    dotted_abbr = {
        "jan.": "January", "feb.": "February", "mar.": "March",
        "apr.": "April", "jun.": "June", "jul.": "July", "aug.": "August",
        "sept.": "September", "sep.": "September", "oct.": "October",
        "nov.": "November", "dec.": "December",
    }
    bare_abbr = {
        "jan": "January", "feb": "February", "mar": "March", "apr": "April",
        "jun": "June", "jul": "July", "aug": "August", "sept": "September",
        "sep": "September", "oct": "October", "nov": "November",
        "dec": "December",
    }

    passes = (
        (string.punctuation, full_names),
        (keep_dots, dotted_abbr),
        (keep_dots, bare_abbr),
    )

    for blanked, table in passes:
        # First token that is exactly one of this pass's month words.
        hit = next(
            (tok for tok in tokens_with_blanked(blanked) if tok in table),
            None)
        if hit is None:
            continue

        # The span is looked up in the un-normalised lower-case text, then
        # shifted to document coordinates to find the reference token.
        start_idx, end_idx = calculateSpan(text_lower, hit)
        postag = ref_list[utils.getRefIdx(ref_list, ref_start + start_idx,
                                          ref_start + end_idx)].getPos()

        if postag == "NNP":
            return True, table[hit], start_idx, end_idx
        # NOTE(review): a first hit that is not tagged NNP ends this pass;
        # later tokens of the same pass are not examined.

    return False, None, None, None
Exemple #10
0
def buildTextMonthAndDay(s,
                         chrono_id,
                         chrono_list,
                         flags,
                         dct=None,
                         ref_list=None):
    """Build a month entity plus any trailing day/year and leading modifier.

    Nothing happens unless ``hasTextMonth`` finds a month in the phrase and
    ``flags["month"]`` is still unset. Otherwise a month-of-year entity is
    created and:

    * the text after the month is parsed for a day-of-month (numbers <= 31)
      and/or a year (1500-2050); a year entity gets the month as its
      sub-interval;
    * if the month does not start the phrase, ``hasModifier`` is consulted
      and a This/Next/Last operator referring to the month is added;
    * the month entity itself is appended last.

    Args:
        s: Phrase object exposing ``getSpan()`` (start, end) and ``getText()``.
        chrono_id: Next free numeric id; each created entity consumes one.
        chrono_list: List that receives the new entities (mutated in place).
        flags: Mapping with "month", "day", "fourdigityear" and
            "loneDigitYear" entries; "month", "day" and "fourdigityear" may be
            set to True here.
        dct: Accepted for interface compatibility; currently unused.
        ref_list: Reference tokens forwarded to ``hasTextMonth``.

    Returns:
        The tuple ``(chrono_list, chrono_id, flags)`` with the updated id.
    """
    found, month_name, idxstart, idxend = hasTextMonth(s, ref_list)
    if not found or flags["month"]:
        return chrono_list, chrono_id, flags

    flags["month"] = True
    ref_start, _ = s.getSpan()
    month_entity = chrono.chronoMonthOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=ref_start + idxstart,
        end_span=ref_start + idxend,
        month_type=month_name)
    chrono_id += 1

    phrase = s.getText()

    def make_day(needle, value, entity_num):
        # Day-of-month entity spanning `needle` inside the phrase.
        rel_start, rel_end = calculateSpan(phrase, needle)
        return chrono.ChronoDayOfMonthEntity(
            entityID=str(entity_num) + "entity",
            start_span=ref_start + rel_start,
            end_span=ref_start + rel_end,
            value=value)

    def make_year(needle, value, entity_num):
        # Year entity spanning `needle`, with the month as its sub-interval.
        rel_start, rel_end = calculateSpan(phrase, needle)
        year_entity = chrono.ChronoYearEntity(
            entityID=str(entity_num) + "entity",
            start_span=ref_start + rel_start,
            end_span=ref_start + rel_end,
            value=value)
        chrono_list.append(year_entity)
        year_entity.set_sub_interval(month_entity.get_id())
        return year_entity

    def is_unclaimed_year(value):
        # Plausible year range, and no year recorded for this phrase so far.
        return (value >= 1500 and value <= 2050
                and not flags["fourdigityear"]
                and not flags["loneDigitYear"])

    # idxend is the end of the month word; anything beyond it is trailing text.
    if idxend < len(phrase):
        trailing = phrase[idxend:].strip(",.").strip()
        whole = utils.getNumberFromText(trailing)

        if whole is not None:
            # The entire remainder is one number: a day or a year.
            if whole <= 31 and not flags["day"]:
                flags["day"] = True
                # The day is located via str(whole), not the raw trailing text.
                chrono_list.append(make_day(str(whole), whole, chrono_id))
                chrono_id += 1
            elif is_unclaimed_year(whole):
                flags["fourdigityear"] = True
                make_year(trailing, whole, chrono_id)
                chrono_id += 1
        else:
            # Mixed remainder: blank out punctuation and test each token.
            spaced = trailing.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation)))
            for token in WhitespaceTokenizer().tokenize(spaced):
                value = utils.getNumberFromText(token)
                if value is None:
                    continue
                if value <= 31:
                    # NOTE(review): unlike the single-number case above, this
                    # path neither checks nor sets flags["day"] - confirm
                    # whether that asymmetry is intended.
                    chrono_list.append(make_day(token, value, chrono_id))
                    chrono_id += 1
                elif is_unclaimed_year(value):
                    flags["fourdigityear"] = True
                    make_year(token, value, chrono_id)
                    chrono_id += 1

    # Text in front of the month may hold a this/next/last style modifier.
    if idxstart > 0:
        has_mod, mod_type, mod_start, mod_end = hasModifier(s)
        if has_mod and mod_type in ("This", "Next", "Last"):
            op_kwargs = {
                "entityID": str(chrono_id) + "entity",
                "start_span": ref_start + mod_start,
                "end_span": ref_start + mod_end,
                "repeating_interval": month_entity.get_id(),
            }
            if mod_type == "This":
                operator = chrono.ChronoThisOperator(**op_kwargs)
            elif mod_type == "Next":
                operator = chrono.ChronoNextOperator(**op_kwargs)
            else:
                operator = chrono.ChronoLastOperator(
                    semantics="Interval-Not-Included", **op_kwargs)
            chrono_list.append(operator)
            chrono_id += 1

    chrono_list.append(month_entity)

    return chrono_list, chrono_id, flags
Exemple #11
0
def hasEmbeddedPeriodInterval(tpentity):
    """Find a period/interval word fused to a leading number (e.g. "5days").

    The entity text is split on spaces after every punctuation character
    has been replaced by a space.  Each token is then searched for one of
    the known period/interval terms at a non-zero offset; the characters
    before the term must be recognised as a number by
    ``utils.getNumberFromText`` (i.e. it returns an ``int``), and the
    remainder of the token (from the term to the end of the token) must be
    one of the categorised words.

    Matching is case-sensitive: the text is not lower-cased here.

    :param tpentity: object exposing ``getText()`` that returns the phrase
        text to inspect.
    :return: a 5-tuple ``(found, type, start, end, number_text)``.  On a
        match ``found`` is True, ``type`` is one of "Day", "Week", "Month",
        "Year", "Century", "Decade", "Minute", "Second", "Hour" or
        "Unknown", ``start``/``end`` are the two values returned by
        ``calculateSpan(text_norm, matched_word)``, and ``number_text`` is
        the raw prefix string.  Otherwise ``(False, None, None, None, None)``.
    """
    text = tpentity.getText()
    # Swap each punctuation character for one space (rather than deleting
    # it) so the normalised text keeps the same length as the original.
    text_norm = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    text_list = text_norm.split(" ")

    # NOTE(review): this reminder is emitted on every call; kept as-is so
    # the function's observable output is unchanged.
    print(
        "TOFIX: PeriodInterval.py @ line 388: convert to using the dictionary."
    )
    # Terms searched for inside each token, in priority order.
    terms = [
        "decades", "decade", "yesterday", "yesterdays", "today", "todays",
        "tomorrow", "tomorrows", "day", "week", "month", "year", "daily",
        "weekly", "monthly", "yearly", "century", "minute", "second", "hour",
        "hourly", "days", "weeks", "months", "years", "centuries", "century",
        "minutes", "seconds", "hours", "time", "shortly", "soon", "briefly",
        "awhile", "future", "lately", "annual", "hr", "hrs", "min", "mins",
        "quarter"
    ]  #, "date"]

    # Category reported for each recognised trailing word.  Words that are
    # in ``terms`` but absent here ("annual", "hr", "hrs", "min", "mins",
    # "quarter") are searched for but never produce a match.
    term_types = {
        "day": "Day", "daily": "Day", "days": "Day",
        "yesterday": "Day", "tomorrow": "Day", "yesterdays": "Day",
        "tomorrows": "Day", "today": "Day", "todays": "Day",
        "week": "Week", "weekly": "Week", "weeks": "Week",
        "month": "Month", "monthly": "Month", "months": "Month",
        "year": "Year", "yearly": "Year", "years": "Year",
        "century": "Century", "centuries": "Century",
        "decade": "Decade", "decades": "Decade",
        "minute": "Minute", "minutes": "Minute",
        "second": "Second", "seconds": "Second",
        "hour": "Hour", "hourly": "Hour", "hours": "Hour",
        "time": "Unknown", "shortly": "Unknown", "soon": "Unknown",
        "briefly": "Unknown", "awhile": "Unknown", "future": "Unknown",
        "lately": "Unknown",
    }

    for t in text_list:
        for r in terms:
            idx = t.find(r)
            if idx <= 0:
                # Term absent, or it starts the token so there is no prefix.
                continue

            sub1 = t[:idx]  # candidate number text
            sub2 = t[idx:]  # term plus anything following it in the token

            if not isinstance(utils.getNumberFromText(sub1), int):
                # A non-numeric prefix only rules out this (token, term)
                # pair.  Previously this returned False for the whole
                # phrase, so e.g. "yesterday 5days" was rejected because
                # "day" inside "yesterday" has the prefix "yester".
                continue

            term_type = term_types.get(sub2)
            if term_type is not None:
                start_idx, end_idx = calculateSpan(text_norm, sub2)
                return True, term_type, start_idx, end_idx, sub1

    return False, None, None, None, None