コード例 #1
0
ファイル: TextYear.py プロジェクト: jacobkantrowitz/Chrono
def hasTextYear(tpentity):
    """Check whether a temporal phrase spells out a year in words.

    The phrase text has commas/periods stripped from both ends, every
    remaining punctuation character replaced by a space, and is then passed
    to ``utils.getNumberFromText`` token by token and as a whole.

    Args:
        tpentity: phrase object; only ``getText()`` is used.

    Returns:
        ``(True, value, start, end)`` when the whole phrase converts to a
        number between 1500 and 2050 inclusive, where ``start``/``end`` are
        the character offsets of the stripped phrase inside ``getText()``;
        otherwise ``(False, None, None, None)``.
    """
    full_text = tpentity.getText()
    # NOTE: str.strip removes the listed characters from BOTH ends.
    text1 = full_text.strip(",.")
    # Replace all other punctuation with spaces so tokens split cleanly.
    text = text1.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    # The phrase has to start with letters/whitespace; a phrase that starts
    # with a digit yields an empty match and is rejected here.
    m = re.search(r'[a-zA-Z,\s]*', text)
    if m.group(0) == '':
        return False, None, None, None

    # Every whitespace-separated token must itself be a number word.
    for t in WhitespaceTokenizer().tokenize(text):
        if utils.getNumberFromText(t) is None:
            return False, None, None, None

    val = utils.getNumberFromText(text)
    if val is None or not (1500 <= val <= 2050):
        return False, None, None, None

    # text1 is a literal substring of full_text; locate it with str.find so
    # regex metacharacters in the phrase cannot break or distort the search.
    start = full_text.find(text1)
    return True, val, start, start + len(text1)
コード例 #2
0
def hasDateOrTime(text):
    """Cheap length-based screen for date/time-like tokens in ``text``.

    Punctuation (everything in the local ``punct`` string, which leaves out
    the period and the backslash) is replaced with spaces, and the result is
    split on single spaces.

    Args:
        text: the raw string to inspect.

    Returns:
        True if any token is 6 or 8 characters long, or is 4 characters long
        and converts (via ``utils.getNumberFromText``) to a number between
        1800 and 2050 inclusive; False otherwise.
    """
    punct = "!\"#$%&\'()*+,-/:;<=>?@[]^_`{|}~"
    text_norm = text.translate(str.maketrans(punct, ' ' * len(punct))).strip()

    for token in text_norm.split(' '):
        size = len(token)
        if size == 4:
            # Not every 4-character token is a year: require a number in a
            # plausible range. A None result (non-numeric token) is skipped
            # instead of being compared with an int, which raises TypeError.
            num = utils.getNumberFromText(token)
            if num is not None and 1800 <= num <= 2050:
                return True
        elif size in (6, 8):
            # 6 characters could be yymmdd / mmddyy, 8 could be yyyymmdd /
            # mmddyyyy; the ranges cannot be narrowed without context.
            # NOTE(review): the token content is not checked here, so any
            # 6- or 8-character word also returns True -- confirm intended.
            return True

    return False
コード例 #3
0
ファイル: MonthYear.py プロジェクト: nehadil/TACChrono
def hasYear(tpentity, flags):
    """Find a 4-digit year inside a formatted date token of a phrase.

    The phrase text is lower-cased, commas are replaced by spaces, the result
    is stripped and split on single spaces. Each token is tested, in order,
    for:

    * ``d{1,2}<sep>(d{1,2}|mon)<sep>yyyy`` (year last),
    * ``yyyy<sep>(d{1,2}|mon)<sep>d{1,2}`` (year first, tokens longer than 7),
    * ``c.yyyy`` (6-character tokens, year between 1500 and 2050),

    where ``<sep>`` is one of ``-``, ``/`` or ``:``.

    Args:
        tpentity: phrase object; only ``getText()`` is used.
        flags: passed through unchanged as the last element of the result.

    Returns:
        ``(True, year, start, end, flags)`` for the first token that matches,
        else ``(False, None, None, None, flags)``. ``year`` is the matched
        digit string for the two date formats and the value returned by
        ``utils.getNumberFromText`` for ``c.yyyy``. ``start``/``end`` are
        offsets into the normalised text.
        NOTE(review): the normalised text is ``strip()``-ed, so the offsets
        differ from ``getText()`` offsets when the phrase has leading
        whitespace -- confirm callers expect that.
    """
    text_lower = tpentity.getText().lower()
    # Only commas are removed; the date separators must survive.
    text_norm = text_lower.translate(str.maketrans(",", ' ')).strip()

    for text in text_norm.split(" "):
        # Start of this token in the normalised string, so that match
        # offsets inside the token can be turned into phrase offsets.
        text_start, _ = Chrono.utils.calculateSpan(text_norm, text)

        # Year at the end of the date.
        result = re.search(
            r'([0-9]{1,2})[-/:]([0-9]{1,2}|[A-Za-z]{3,4})[-/:]([0-9]{4})',
            text)
        if result:
            start_idx, end_idx = result.span(3)
            return (True, result.group(3), text_start + start_idx,
                    text_start + end_idx, flags)

        if len(text) > 7:
            # Year at the start of the date.
            result = re.search(
                r'([0-9]{4})[-/:]([0-9]{1,2}|[A-Za-z]{3,4})[-/:]([0-9]{1,2})',
                text)
            if result:
                # Use the match position inside the token; the date does not
                # have to begin at the first character of the token.
                start_idx, end_idx = result.span(1)
                return (True, result.group(1), text_start + start_idx,
                        text_start + end_idx, flags)
        elif len(text) == 6:
            # Special case: circa dates written as c.yyyy
            result = re.search(r"c\.([0-9]{4})", text)
            if result:
                rval = utils.getNumberFromText(result.group(1))
                if rval is not None and 1500 <= rval <= 2050:
                    start_idx, end_idx = result.span(1)
                    # Offsets are token-relative; shift them like the other
                    # branches do.
                    return (True, rval, text_start + start_idx,
                            text_start + end_idx, flags)

    # No 4-digit year expression was found in any token.
    return False, None, None, None, flags
コード例 #4
0
def extract_numeric_feature(reftok_list, reftok_idx, obs_list):
    """Flag whether a token directly next to the reference token is numeric.

    The token before and the token after ``reftok_idx`` (clamped to the list
    bounds) are checked in that order; a neighbour counts as numeric when
    ``utils.getNumberFromText`` returns an ``int`` for its text. A clamped
    position that coincides with ``reftok_idx`` is ignored.

    Args:
        reftok_list: tokens exposing ``getText()``.
        reftok_idx: index of the token being described.
        obs_list: mapping updated in place with key ``'feat_numeric'``.

    Returns:
        ``obs_list`` with ``'feat_numeric'`` set to 1 or 0.
    """
    last_idx = len(reftok_list) - 1
    neighbours = (max(reftok_idx - 1, 0), min(reftok_idx + 1, last_idx))

    # any() stops at the first numeric neighbour, so the "after" token is
    # only examined when the "before" token did not qualify.
    numeric_neighbour = any(
        pos != reftok_idx and isinstance(
            utils.getNumberFromText(reftok_list[pos].getText()), int)
        for pos in neighbours)

    obs_list.update({'feat_numeric': 1 if numeric_neighbour else 0})
    return obs_list
コード例 #5
0
def has24HourTime(text):
    """Report whether ``text`` contains a 4-character 24-hour time (hhmm).

    Punctuation listed in the local ``punct`` string is replaced by spaces
    and the text is split on single spaces. The first 4-character token for
    which the whole token, its first two characters and its last two
    characters all convert via ``utils.getNumberFromText`` decides the
    result.

    Args:
        text: the raw string to inspect.

    Returns:
        True if that token has hour < 24 and minute < 60, False if it does
        not or if no such token exists.
    """
    punct = "!\"#$%&\'()*+,-/:;<=>?@[]^_`{|}~"
    spaced = text.translate(str.maketrans(punct, ' ' * len(punct))).strip()

    for token in spaced.split(' '):
        if len(token) != 4:
            continue
        if utils.getNumberFromText(token) is None:
            continue

        hour = utils.getNumberFromText(token[:2])
        minute = utils.getNumberFromText(token[2:])
        if hour is None or minute is None:
            continue

        # First fully numeric candidate wins, valid or not.
        return not ((minute >= 60) or (hour >= 24))

    return False
コード例 #6
0
def has24HourTime(tpentity, flags):
    """Find a 24-hour time (hhmm) or a time-zone suffixed time in a phrase.

    The phrase text is split on single spaces and the tokens are examined in
    order:

    * a 4-character token that converts to a number (whole token, first two
      and last two characters, via ``utils.getNumberFromText``) is treated as
      hhmm; the first such token decides the result;
    * any other token containing up to four digits followed by one of the
      listed time-zone abbreviations (e.g. ``1800EDT``) is returned as is.

    Args:
        tpentity: phrase object; only ``getText()`` is used.
        flags: mapping; nothing is searched when ``flags["loneDigitYear"]``
            is truthy.

    Returns:
        ``(True, matched_text, start, end)`` with offsets from
        ``calculateSpan`` over the phrase text, or
        ``(False, None, None, None)``.
    """
    stext = tpentity.getText()

    # A lone 4-digit year was already claimed for this phrase.
    if flags["loneDigitYear"]:
        return False, None, None, None

    for text in stext.split(" "):
        if len(text) == 4:
            if utils.getNumberFromText(text) is None:
                continue
            hour = utils.getNumberFromText(text[:2])
            minute = utils.getNumberFromText(text[2:])
            if hour is None or minute is None:
                continue
            # Minutes run 00-59. Hour 24 is tolerated only as "2400"
            # (end of day); anything beyond that is not a clock time.
            if minute >= 60 or hour > 24 or (hour == 24 and minute != 0):
                return False, None, None, None
            start_idx, end_idx = calculateSpan(stext, text)
            return True, text, start_idx, end_idx

        tz_format = re.search(
            r'\d{0,4}(AST|EST|EDT|CST|CDT|MST|MDT|PST|PDT|AKST|HST|HAST|HADT|SST|SDT|GMT|CHST|UTC)',
            text)
        if tz_format is not None:
            # The digits in front of the zone are not range-checked here.
            matched = tz_format[0]
            start_idx, end_idx = calculateSpan(stext, matched)
            return True, matched, start_idx, end_idx

    # No 24-hour time expression was found.
    return False, None, None, None
コード例 #7
0
def hasEmbeddedPeriodInterval(tpentity):
    """Find a period/interval word glued to a number, e.g. ``5days``.

    Punctuation in the phrase text is replaced by spaces and the text is
    split on single spaces (case is preserved). For each token, each known
    term is looked up as a substring; when it is found after the first
    character, the part before it must convert to an ``int`` via
    ``utils.getNumberFromText`` and the part from the term onwards must be a
    known period word.

    Args:
        tpentity: phrase object; only ``getText()`` is used.

    Returns:
        ``(True, label, start, end, number_text)`` for the first hit, where
        ``label`` is ``"Day"``, ``"Week"``, ``"Month"`` or ``"Hour"``,
        ``start``/``end`` come from ``calculateSpan`` over the normalised
        text and ``number_text`` is the prefix before the period word;
        otherwise ``(False, None, None, None, None)``.
    """
    text = tpentity.getText()
    # remove all punctuation
    text_norm = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    # Substrings that signal an embedded period/interval word.
    terms = [
        "day", "week", "month", "hour", "days", "weeks", "months", "hours",
        "hrs"
    ]
    # Full suffix -> returned label. "hrs" is listed in `terms`, so it needs
    # a label here to ever produce a result.
    labels = {
        "day": "Day", "daily": "Day", "days": "Day",
        "week": "Week", "weekly": "Week", "weeks": "Week",
        "month": "Month", "monthly": "Month", "months": "Month",
        "hour": "Hour", "hourly": "Hour", "hours": "Hour", "hrs": "Hour",
    }

    for t in text_norm.split(" "):
        for r in terms:
            idx = t.find(r)
            if idx <= 0:
                # Term absent, or it starts the token (no number in front).
                continue

            sub1 = t[:idx]
            sub2 = t[idx:]
            # A non-numeric prefix (e.g. "to" in "today") only disqualifies
            # this term/token pair; later tokens may still match.
            if not isinstance(utils.getNumberFromText(sub1), int):
                continue

            label = labels.get(sub2)
            if label is not None:
                start_idx, end_idx = calculateSpan(text_norm, sub2)
                return True, label, start_idx, end_idx, sub1

    return False, None, None, None, None
コード例 #8
0
def extract_bow_features(reftok_list, reftok_idx, window, obs_dict, obs_list):
    """Add bag-of-words features for the tokens around a reference token.

    Every token within ``window`` positions on either side of ``reftok_idx``
    (the reference token itself excluded) contributes one key: the ``int``
    returned by ``utils.getNumberFromText`` when the token text converts to
    one, otherwise the token text itself.

    Args:
        reftok_list: tokens exposing ``getText()``.
        reftok_idx: index of the token being described.
        window: number of neighbours to include on each side.
        obs_dict: mapping updated in place; each key is set to 0.
        obs_list: mapping updated in place; each key is set to 1.

    Returns:
        ``(obs_list, obs_dict)``.
    """
    start = max(reftok_idx - window, 0)
    # range() excludes its stop value, so clamp to len(reftok_list); clamping
    # to len - 1 silently dropped the final token from every window.
    end = min(reftok_idx + window + 1, len(reftok_list))

    for r in range(start, end):
        if r == reftok_idx:
            continue
        tok_text = reftok_list[r].getText()
        num_check = utils.getNumberFromText(tok_text)
        key = num_check if isinstance(num_check, int) else tok_text
        obs_list.update({key: 1})
        obs_dict.update({key: 0})

    return (obs_list, obs_dict)
コード例 #9
0
def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier,
                        feats):
    """Build period / calendar-interval entities for one temporal phrase.

    ``hasPeriodInterval(s)`` is consulted first and one of four paths is
    taken:

    1. A term was found and the phrase text matches one of the fixed words
       (yesterday, tomorrow, today, daily, /min, /week): a
       ``ChronoCalendarIntervalEntity`` is created, plus a
       ``ChronoLastOperator`` when the text contains "yesterday".
    2. A term was found and its value is ``"Unknown"``: a
       ``ChronoPeriodEntity`` is created.
    3. Any other term: the classifier decides between a
       ``ChronoPeriodEntity`` (class 1) and a
       ``ChronoCalendarIntervalEntity``; This/Next/Last operators and a
       ``ChronoNumber`` are added when present.
    4. No term found: ``hasEmbeddedPeriodInterval(s)`` is tried and, on
       success, classified the same way and linked to its number.

    Args:
        s: phrase object; ``getSpan()`` and ``getText()`` are used.
        chrono_id: numeric counter used to build entity IDs
            (``str(chrono_id) + "entity"``); incremented once per entity
            created.
        chrono_list: list the new entities are appended to (mutated).
        ref_list: reference tokens; ``getText()`` / ``getSpan()`` are used.
        classifier: indexable pair -- ``classifier[0]`` is the model object,
            ``classifier[1]`` a type tag (``"NN"``, ``"SVM"``, ``"RF"`` or
            anything else).
        feats: feature mapping; a copy is handed to
            ``utils.extract_prediction_features``.

    Returns:
        ``(chrono_list, chrono_id)`` with the updated counter.
    """

    # Copy used only by the embedded-term branch at the bottom.
    features = feats.copy()
    ref_Sspan, ref_Espan = s.getSpan()
    #print("In buildPeriodInterval(), TimePhrase Text: " + s.getText())
    boo, val, idxstart, idxend, plural = hasPeriodInterval(s)

    # FIND terms that are always marked as calendar intervals!
    if boo and re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            s.getText()):
        # idxstart/idxend are phrase-relative; shift them by the phrase start.
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id = chrono_id + 1

        if re.search("yesterday|yesterdays", s.getText()):

            # chrono_id was already incremented, so (chrono_id - 1) is the ID
            # number given to my_entity just above.
            my_last_entity = chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                repeating_interval=str(chrono_id - 1) + "entity")
            chrono_id = chrono_id + 1
            chrono_list.append(my_last_entity)

        chrono_list.append(my_entity)

    # FIND terms that are always marked as periods!
    elif boo and val == "Unknown":
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoPeriodEntity(entityID=str(chrono_id) +
                                              "entity",
                                              start_span=abs_Sspan,
                                              end_span=abs_Espan,
                                              period_type=val,
                                              number=None)
        chrono_id = chrono_id + 1
        chrono_list.append(my_entity)

    elif boo:
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # get index of overlapping reference token
        #ref_idx = -1
        #for i in range(0,len(ref_list)):
        #    if(utils.overlap(ref_list[i].getSpan(),(abs_Sspan,abs_Espan))):
        #        ref_idx = i
        #        break

        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)

        # extract ML features
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # classify into period or interval
        if classifier[1] == "NN":
            my_class = ChronoKeras.keras_classify(
                classifier[0], np.array(list(my_features.values())))
            #print("Class: " + str(my_class) + " : Start: " + str(abs_Sspan) + " : End: "+ str(abs_Espan))
        elif classifier[1] in ("SVM", "RF"):
            feat_array = [int(i) for i in my_features.values()]
            my_class = classifier[0].predict([feat_array])[0]
        else:
            my_class = classifier[0].classify(my_features)
            #print("Class: " + str(my_class) + " : Start: " + str(abs_Sspan) + " : End: "+ str(abs_Espan))

        # if 1 then it is a period, if 0 then it is an interval
        if my_class == 1:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
            chrono_id = chrono_id + 1
            ### Check to see if this period has a "this" in front of it
            # NOTE(review): when ref_idx is 0, ref_idx - 1 is -1 and Python
            # indexes the LAST token of ref_list -- confirm that is intended.
            prior_tok = ref_list[ref_idx - 1].getText().lower()
            if prior_tok.translate(
                    str.maketrans(string.punctuation,
                                  ' ' * len(string.punctuation))) == "this":
                # add a This entity and link it to the period.
                # NOTE(review): the pattern here is prior_tok and the searched
                # string is the literal "this" (argument order looks swapped,
                # though the guard above makes both equal).
                start_span, end_span = re.search(prior_tok, "this").span(0)
                prior_start, prior_end = ref_list[ref_idx - 1].getSpan()

                chrono_this_entity = chrono.ChronoThisOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=prior_start + start_span,
                    end_span=prior_start + end_span)
                chrono_id = chrono_id + 1
                chrono_this_entity.set_period(my_entity.get_id())
                chrono_list.append(chrono_this_entity)

            else:
                # check for a Next/Last modifier word
                hasMod, mod_type, mod_start, mod_end = hasModifier(s)

                if (hasMod):
                    if mod_type == "Next":
                        chrono_list.append(
                            chrono.ChronoNextOperator(
                                entityID=str(chrono_id) + "entity",
                                start_span=ref_Sspan + mod_start,
                                end_span=ref_Sspan + mod_end,
                                period=my_entity.get_id()))
                        chrono_id = chrono_id + 1

                    if mod_type == "Last":
                        chrono_list.append(
                            chrono.ChronoLastOperator(
                                entityID=str(chrono_id) + "entity",
                                start_span=ref_Sspan + mod_start,
                                end_span=ref_Sspan + mod_end,
                                period=my_entity.get_id(),
                                semantics="Interval-Not-Included"))
                        chrono_id = chrono_id + 1

        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
            chrono_id = chrono_id + 1
            ### Check to see if this calendar interval has a "this" in front of it
            # NOTE(review): same ref_idx - 1 wrap-around caveat as in the
            # period branch above.
            prior_tok = ref_list[ref_idx - 1].getText().lower()
            if prior_tok.translate(
                    str.maketrans(string.punctuation,
                                  ' ' * len(string.punctuation))) == "this":
                # add a This entity and link it to the interval.
                start_span, end_span = re.search(prior_tok, "this").span(0)
                prior_start, prior_end = ref_list[ref_idx - 1].getSpan()

                chrono_this_entity = chrono.ChronoThisOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=prior_start + start_span,
                    end_span=prior_start + end_span)
                chrono_id = chrono_id + 1
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
                chrono_list.append(chrono_this_entity)
            else:
                # check for a Next/Last modifier word
                hasMod, mod_type, mod_start, mod_end = hasModifier(s)
                if (hasMod):
                    if mod_type == "Next":
                        chrono_list.append(
                            chrono.ChronoNextOperator(
                                entityID=str(chrono_id) + "entity",
                                start_span=ref_Sspan + mod_start,
                                end_span=ref_Sspan + mod_end,
                                repeating_interval=my_entity.get_id()))
                        chrono_id = chrono_id + 1

                    if mod_type == "Last":
                        chrono_list.append(
                            chrono.ChronoLastOperator(
                                entityID=str(chrono_id) + "entity",
                                start_span=ref_Sspan + mod_start,
                                end_span=ref_Sspan + mod_end,
                                repeating_interval=my_entity.get_id(),
                                semantics="Interval-Not-Included"))
                        chrono_id = chrono_id + 1

        #check to see if it has a number associated with it.  We assume the number comes before the interval string
        if idxstart > 0:
            substr = s.getText()[0:idxstart]
            # Only the first run of one or two digits is picked up.
            m = re.search('([0-9]{1,2})', substr)
            if m is not None:
                num_val = m.group(0)
                abs_Sspan = ref_Sspan + m.span(0)[0]
                abs_Espan = ref_Sspan + m.span(0)[1]

                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    value=num_val)
                chrono_id = chrono_id + 1

                #add the number entity to the list
                chrono_list.append(my_number_entity)
                my_entity.set_number(my_number_entity.get_id())
            #else search for a text number
            else:
                texNumVal = utils.getNumberFromText(substr)
                if texNumVal is not None:
                    #create the number entity
                    # The span covers the phrase start up to one character
                    # before the interval word.
                    my_number_entity = chrono.ChronoNumber(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan,
                        end_span=ref_Sspan + (idxstart - 1),
                        value=texNumVal)
                    chrono_id = chrono_id + 1
                    #append to list
                    chrono_list.append(my_number_entity)
                    #link to interval entity
                    my_entity.set_number(my_number_entity.get_id())

        chrono_list.append(my_entity)

    else:
        # No stand-alone term: look for one glued to a number (e.g. "5days").
        boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
        if (boo2):
            abs_Sspan = ref_Sspan + idxstart
            abs_Espan = ref_Sspan + idxend

            # get index of overlapping reference token
            # NOTE(review): ref_idx stays -1 when no token overlaps.
            ref_idx = -1
            for i in range(0, len(ref_list)):
                if (utils.overlap(ref_list[i].getSpan(),
                                  (abs_Sspan, abs_Espan))):
                    ref_idx = i
                    break

            # extract ML features
            my_features = utils.extract_prediction_features(
                ref_list, ref_idx, features)

            # classify into period or interval
            # NOTE(review): unlike the branch above there is no "SVM"/"RF"
            # case here; those tags fall through to .classify().
            if (classifier[1] == "NN"):
                my_class = ChronoKeras.keras_classify(
                    classifier[0], np.array(list(my_features.values())))
                #print("Class: " + str(my_class) + " : Start: " + str(abs_Sspan) + " : End: "+ str(abs_Espan))
            else:
                my_class = classifier[0].classify(my_features)
                #print("Class: " + str(my_class) + " : Start: " + str(abs_Sspan) + " : End: "+ str(abs_Espan))

            # if 1 then it is a period, if 0 then it is an interval
            if (my_class == 1):
                my_entity = chrono.ChronoPeriodEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    period_type=getPeriodValue(val),
                    number=None)
                chrono_id = chrono_id + 1
            else:
                my_entity = chrono.ChronoCalendarIntervalEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    calendar_type=val)
                chrono_id = chrono_id + 1

            #Extract the number and identify the span of numstr

            substr = s.getText(
            )[:idxstart]  ## extract entire first part of TimePhrase phrase
            m = re.search(
                '([0-9]{1,2})', substr
            )  #search for an integer in the subphrase and extract its coordinates
            if m is not None:
                num_val = m.group(0)
                abs_Sspan = ref_Sspan + m.span(0)[0]
                abs_Espan = ref_Sspan + m.span(0)[1]

                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    value=num_val)
                chrono_id = chrono_id + 1

                #add the number entity to the list
                chrono_list.append(my_number_entity)
                #link to interval entity
                my_entity.set_number(my_number_entity.get_id())
            #else search for a text number
            else:
                texNumVal = utils.getNumberFromText(numstr)
                if texNumVal is not None:
                    # NOTE(review): numstr is used as a regex pattern without
                    # escaping.
                    m = re.search(
                        numstr,
                        substr)  #search for the number string in the subphrase
                    if m is not None:
                        abs_Sspan = ref_Sspan + m.span(0)[0]
                        abs_Espan = ref_Sspan + m.span(0)[1]
                        #create the number entity
                        my_number_entity = chrono.ChronoNumber(
                            entityID=str(chrono_id) + "entity",
                            start_span=abs_Sspan,
                            end_span=abs_Espan,
                            value=texNumVal)
                        chrono_id = chrono_id + 1
                        #append to list
                        chrono_list.append(my_number_entity)
                        #link to interval entity
                        my_entity.set_number(my_number_entity.get_id())

            chrono_list.append(my_entity)

    return chrono_list, chrono_id
コード例 #10
0
ファイル: NumericDate.py プロジェクト: NLPatVCU/TACChrono
def _validMonthDay(month, day):
    """Return True when month is 1-12 and day is 1-31 (None is invalid)."""
    return (month is not None and day is not None and 1 <= month <= 12
            and 1 <= day <= 31)


def _appendDateEntities(chrono_list, chrono_id, span_start, year_cls, year,
                        year_off, month, month_off, day, day_off):
    """Create linked year -> month -> day entities and append them in order.

    Each ``*_off`` argument is a ``(start, end)`` offset pair relative to
    ``span_start``. IDs are assigned year, month, day. Returns the next
    unused ``chrono_id``.
    """
    chrono_year_entity = year_cls(entityID=str(chrono_id) + "entity",
                                  start_span=span_start + year_off[0],
                                  end_span=span_start + year_off[1],
                                  value=year)
    chrono_id = chrono_id + 1

    chrono_month_entity = chrono.chronoMonthOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=span_start + month_off[0],
        end_span=span_start + month_off[1],
        month_type=calendar.month_name[month])
    chrono_id = chrono_id + 1
    chrono_year_entity.set_sub_interval(chrono_month_entity.get_id())

    chrono_day_entity = chrono.ChronoDayOfMonthEntity(
        entityID=str(chrono_id) + "entity",
        start_span=span_start + day_off[0],
        end_span=span_start + day_off[1],
        value=day)
    chrono_id = chrono_id + 1
    chrono_month_entity.set_sub_interval(chrono_day_entity.get_id())

    chrono_list.append(chrono_year_entity)
    chrono_list.append(chrono_month_entity)
    chrono_list.append(chrono_day_entity)
    return chrono_id


def buildNumericDate(s, chrono_id, chrono_list, flags):
    """Build year/month/day entities from purely numeric tokens of a phrase.

    The phrase text is lower-cased, stripped of leading/trailing periods and
    commas and split on single spaces. Per token:

    * 4 characters, numeric, 1500-2050, and neither ``flags["fourdigityear"]``
      nor ``flags["loneDigitYear"]`` set: a lone ``ChronoYearEntity`` is
      created and ``flags["loneDigitYear"]`` is set to True. (24-hour times
      in that range are therefore read as years.)
    * 8 characters, numeric: tried as yyyymmdd, then as mmddyyyy.
    * 6 characters, numeric: tried as mmddyy, then as yymmdd; the two
      layouts cannot always be told apart.

    Args:
        s: phrase object; ``getText()`` and ``getSpan()`` are used.
        chrono_id: numeric counter used to build entity IDs; incremented once
            per entity created.
        chrono_list: list the new entities are appended to (mutated).
        flags: mapping of parser flags (read and updated as described above).

    Returns:
        ``(chrono_list, chrono_id, flags)``.

    NOTE(review): spans for the 6- and 8-character formats are computed from
    the phrase start, i.e. they assume the token is the first thing in the
    phrase -- confirm against callers.
    """
    text_lower = s.getText().lower()
    text_norm = text_lower.strip(".,")

    for text in text_norm.split(" "):
        if len(text) == 4:
            num = utils.getNumberFromText(text)
            if (num is not None and 1500 <= num <= 2050
                    and not flags["fourdigityear"]
                    and not flags["loneDigitYear"]):
                flags["loneDigitYear"] = True
                ref_StartSpan, ref_EndSpan = s.getSpan()
                # The token is a literal substring of the lower-cased text;
                # str.find avoids treating it as a regex pattern.
                start_idx = text_lower.find(text)
                end_idx = start_idx + len(text)

                chrono_year_entity = chrono.ChronoYearEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_StartSpan + start_idx,
                    end_span=ref_StartSpan + end_idx,
                    value=num)
                chrono_id = chrono_id + 1
                chrono_list.append(chrono_year_entity)

        # Condensed date formats like 19980303 or 03031998.
        elif len(text) == 8 and utils.getNumberFromText(text) is not None:
            y = utils.getNumberFromText(text[0:4])
            m = utils.getNumberFromText(text[4:6])
            d = utils.getNumberFromText(text[6:8])
            if y is not None:
                if 1500 <= y <= 2050 and _validMonthDay(m, d):
                    # yyyymmdd
                    ref_StartSpan, ref_EndSpan = s.getSpan()
                    chrono_id = _appendDateEntities(
                        chrono_list, chrono_id, ref_StartSpan,
                        chrono.ChronoYearEntity, y, (0, 4), m, (4, 6), d,
                        (6, 8))
                else:
                    # mmddyyyy -- use the values parsed for THIS layout
                    # (y2/m2/d2), not the yyyymmdd ones.
                    y2 = utils.getNumberFromText(text[4:8])
                    m2 = utils.getNumberFromText(text[0:2])
                    d2 = utils.getNumberFromText(text[2:4])
                    if (y2 is not None and 1500 <= y2 <= 2050
                            and _validMonthDay(m2, d2)):
                        ref_StartSpan, ref_EndSpan = s.getSpan()
                        chrono_id = _appendDateEntities(
                            chrono_list, chrono_id, ref_StartSpan,
                            chrono.ChronoYearEntity, y2, (4, 8), m2, (0, 2),
                            d2, (2, 4))

        # Condensed date formats like 030399 or 990303.
        # Dates such as 12-01-2006 (120106 vs 061201) are not distinguishable.
        elif len(text) == 6 and utils.getNumberFromText(text) is not None:
            y = utils.getNumberFromText(text[4:6])
            m = utils.getNumberFromText(text[0:2])
            d = utils.getNumberFromText(text[2:4])
            if y is not None and m is not None and d is not None:
                if _validMonthDay(m, d):
                    # mmddyy
                    ref_StartSpan, ref_EndSpan = s.getSpan()
                    chrono_id = _appendDateEntities(
                        chrono_list, chrono_id, ref_StartSpan,
                        chrono.ChronoTwoDigitYearOperator, y, (4, 6), m,
                        (0, 2), d, (2, 4))
                else:
                    # yymmdd
                    y2 = utils.getNumberFromText(text[0:2])
                    m2 = utils.getNumberFromText(text[2:4])
                    d2 = utils.getNumberFromText(text[4:6])
                    if y2 is not None and _validMonthDay(m2, d2):
                        ref_StartSpan, ref_EndSpan = s.getSpan()
                        chrono_id = _appendDateEntities(
                            chrono_list, chrono_id, ref_StartSpan,
                            chrono.ChronoTwoDigitYearOperator, y2, (0, 2),
                            m2, (2, 4), d2, (4, 6))

    return chrono_list, chrono_id, flags


####
# END_MODULE
####
コード例 #11
0
ファイル: AMPM.py プロジェクト: AmyOlex/Chrono
def buildAMPM(s, chrono_id, chrono_list, flags):
    """Build the AM/PM entity for a time phrase plus any hour/minute before it.

    ``hasAMPM(s)`` is asked whether the phrase contains an AM/PM marker.  If it
    does, an AMPM-of-day entity is appended to ``chrono_list``.  When there is
    text in front of the marker and no hour has been flagged yet, that leading
    text is searched for a time:

    * 1-2 digits no greater than 12 become an hour entity;
    * 3-4 digits are split into hour digits and a trailing two-digit minute;
      the minute entity is created when the minute is below 60, and the hour
      entity (linked to that minute) when the hour is no greater than 12;
    * with no digits at all, ``utils.getNumberFromText`` is tried on the
      leading text and a hit becomes the hour.

    Args:
        s: Time phrase object; only ``getSpan()`` and ``getText()`` are used.
        chrono_id: Running counter used to form entity IDs
            (``str(chrono_id) + "entity"``).
        chrono_list: List that new entities are appended to (mutated in place).
        flags: Dict read at key ``"hour"`` and updated at keys ``"hour"`` and
            ``"minute"`` when the corresponding entity is created.

    Returns:
        Tuple ``(chrono_list, chrono_id)`` with ``chrono_id`` advanced by one
        for every entity created.
    """
    ref_Sspan, ref_Espan = s.getSpan()

    boo, val, idxstart, idxend = hasAMPM(s)
    if not boo:
        return chrono_list, chrono_id

    my_AMPM_entity = chrono.ChronoAMPMOfDayEntity(
        entityID=str(chrono_id) + "entity",
        start_span=ref_Sspan + idxstart,
        end_span=ref_Sspan + idxend,
        ampm_type=val)
    chrono_id = chrono_id + 1
    chrono_list.append(my_AMPM_entity)

    # The time is assumed to come before the AM/PM string.  idxstart is the
    # first index of the marker, so it is only greater than 0 when something
    # precedes it.
    if not idxstart > 0 or flags['hour']:
        return chrono_list, chrono_id

    substr = s.getText()[0:idxstart]
    m = re.search('([0-9]{1,4})', substr)

    if m is None:
        # No digits: fall back to a spelled-out number.
        texNumVal = utils.getNumberFromText(substr)
        if texNumVal is not None:
            # The end span drops one character, i.e. it assumes a single
            # separator between the number text and the AM/PM marker.
            chrono_id = _append_ampm_hour(
                chrono_list, chrono_id, flags,
                start_span=ref_Sspan,
                end_span=ref_Sspan + (idxstart - 1),
                value=texNumVal,
                ampm=my_AMPM_entity.get_id())
        return chrono_list, chrono_id

    time_val = m.group(0)
    # Absolute span of the digit run.  Everything below is derived from the
    # match inside the phrase text, so the spans stay correct when the digits
    # are not the first characters of the phrase.
    digits_Sspan = ref_Sspan + m.start(0)
    digits_Espan = ref_Sspan + m.end(0)

    if len(time_val) <= 2:
        if int(time_val) <= 12:
            chrono_id = _append_ampm_hour(
                chrono_list, chrono_id, flags,
                start_span=digits_Sspan,
                end_span=digits_Espan,
                value=time_val,
                ampm=my_AMPM_entity.get_id())
        return chrono_list, chrono_id

    # 3 or 4 digits (the regex caps the run at 4): the last two digits are the
    # minute, whatever is in front of them is the hour.
    hour_txt, minute_txt = time_val[:-2], time_val[-2:]
    if int(minute_txt) < 60:
        my_minute_entity = chrono.ChronoMinuteOfHourEntity(
            entityID=str(chrono_id) + "entity",
            start_span=digits_Espan - 2,
            end_span=digits_Espan,
            value=minute_txt)
        chrono_id = chrono_id + 1
        chrono_list.append(my_minute_entity)
        flags["minute"] = True

        if int(hour_txt) <= 12:
            # Entities are linked by ID, as with the ampm reference.
            chrono_id = _append_ampm_hour(
                chrono_list, chrono_id, flags,
                start_span=digits_Sspan,
                end_span=digits_Espan - 2,
                value=hour_txt,
                ampm=my_AMPM_entity.get_id(),
                sub_interval=my_minute_entity.get_id())

    return chrono_list, chrono_id


def _append_ampm_hour(chrono_list, chrono_id, flags, **hour_kwargs):
    """Append an hour-of-day entity, flag the hour as found, return the next ID."""
    chrono_list.append(
        chrono.ChronoHourOfDayEntity(entityID=str(chrono_id) + "entity",
                                     **hour_kwargs))
    flags["hour"] = True
    return chrono_id + 1
コード例 #12
0
ファイル: AMPM.py プロジェクト: nehadil/TACChrono
def buildAMPM(s, chrono_id, chrono_list, flags):
    """Create the AM/PM entity for a time phrase and the hour in front of it.

    The phrase is handed to ``hasAMPM``; when it reports a marker, an
    AMPM-of-day entity covering that marker is appended to ``chrono_list``.
    If text precedes the marker and ``flags['hour']`` is not set, the first
    run of one or two digits in that text becomes an hour entity linked to
    the AM/PM entity.  Without digits, ``utils.getNumberFromText`` is tried
    on the leading text instead.

    Args:
        s: Time phrase object; ``getSpan()`` and ``getText()`` are used.
        chrono_id: Running counter used to form entity IDs.
        chrono_list: List receiving the new entities (mutated in place).
        flags: Dict read at key ``'hour'``; this version never writes to it.

    Returns:
        Tuple ``(chrono_list, chrono_id)``.
    """
    phrase_start, phrase_end = s.getSpan()

    found, marker, marker_from, marker_to = hasAMPM(s)
    if not found:
        return chrono_list, chrono_id

    ampm_entity = chrono.ChronoAMPMOfDayEntity(
        entityID=str(chrono_id) + "entity",
        start_span=phrase_start + marker_from,
        end_span=phrase_start + marker_to,
        ampm_type=marker)
    chrono_id += 1
    chrono_list.append(ampm_entity)

    # The hour is expected in front of the marker; nothing to do when the
    # marker opens the phrase or an hour was already flagged.
    has_leading_text = marker_from > 0
    if not has_leading_text or flags['hour']:
        return chrono_list, chrono_id

    leading_text = s.getText()[0:marker_from]
    digits = re.search('([0-9]{1,2})', leading_text)

    if digits is None:
        # No digits: try a spelled-out number covering the leading text.
        spelled_value = utils.getNumberFromText(leading_text)
        if spelled_value is not None and not flags['hour']:
            chrono_list.append(
                chrono.ChronoHourOfDayEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=phrase_start,
                    end_span=phrase_start + (marker_from - 1),
                    value=spelled_value,
                    ampm=ampm_entity.get_id()))
            chrono_id += 1
        return chrono_list, chrono_id

    digit_from, digit_to = digits.span(0)
    hour_entity = chrono.ChronoHourOfDayEntity(
        entityID=str(chrono_id) + "entity",
        start_span=phrase_start + digit_from,
        end_span=phrase_start + digit_to,
        value=digits.group(0),
        ampm=ampm_entity.get_id())
    chrono_id += 1
    chrono_list.append(hour_entity)

    return chrono_list, chrono_id
コード例 #13
0
def buildTextMonthAndDay(s,
                         chrono_id,
                         chrono_list,
                         flags,
                         dct=None,
                         ref_list=None):
    """Build a month-of-year entity from a text month plus nearby day/year.

    ``hasTextMonth(s, ref_list)`` reports whether the phrase names a month.
    If it does and ``flags["month"]`` is not yet set, a month entity is
    created and the text AFTER the month is examined:

    * if the whole remainder parses as one number, a value of 31 or less
      becomes the day (unless ``flags["day"]`` is set) and a value in
      1500-2050 becomes the year (unless a year flag is set);
    * otherwise punctuation is replaced by spaces and every whitespace token
      is tested the same way.

    Text BEFORE the month is only checked, via ``hasModifier``, for a
    This/Next/Last modifier, which becomes an operator pointing at the month.
    The month entity itself is appended last.

    Args:
        s: Time phrase object; ``getSpan()`` and ``getText()`` are used.
        chrono_id: Running counter used to form entity IDs.
        chrono_list: List receiving the new entities (mutated in place).
        flags: Dict read/updated at keys ``"month"``, ``"day"``,
            ``"fourdigityear"`` and read at ``"loneDigitYear"``.
        dct: Accepted for interface compatibility; currently unused.
        ref_list: Passed through to ``hasTextMonth``.

    Returns:
        Tuple ``(chrono_list, chrono_id, flags)``.
    """
    boo, val, idxstart, idxend = hasTextMonth(s, ref_list)
    if not boo or flags["month"]:
        return chrono_list, chrono_id, flags

    flags["month"] = True
    ref_Sspan, ref_Espan = s.getSpan()
    my_month_entity = chrono.chronoMonthOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=ref_Sspan + idxstart,
        end_span=ref_Sspan + idxend,
        month_type=val)
    chrono_id = chrono_id + 1

    # idxend is the last index of the month; anything beyond it is trailing
    # text that may hold the day and/or year.
    if idxend < len(s.getText()):
        substr = s.getText()[idxend:].strip(",.").strip()

        num = utils.getNumberFromText(substr)
        if num is not None:
            # The whole remainder is a single number.
            if num <= 31 and not flags["day"]:
                flags["day"] = True
                chrono_id = _append_month_day(s, str(num), num, ref_Sspan,
                                              chrono_id, chrono_list)
            elif _is_unclaimed_year(num, flags):
                chrono_id = _append_month_year(s, substr, num, ref_Sspan,
                                               my_month_entity, chrono_id,
                                               chrono_list, flags)
        else:
            # Replace punctuation with spaces and test each token on its own.
            substr = substr.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation)))
            for token in WhitespaceTokenizer().tokenize(substr):
                num = utils.getNumberFromText(token)
                if num is None:
                    continue
                if num <= 31:
                    # NOTE(review): unlike the single-number path above, this
                    # path neither checks nor sets flags["day"], so several
                    # day entities can be produced - confirm this is intended.
                    chrono_id = _append_month_day(s, token, num, ref_Sspan,
                                                  chrono_id, chrono_list)
                elif _is_unclaimed_year(num, flags):
                    chrono_id = _append_month_year(s, token, num, ref_Sspan,
                                                   my_month_entity, chrono_id,
                                                   chrono_list, flags)

    # If the month does not start the phrase there is leading text to parse.
    if idxstart > 0:
        hasMod, mod_type, mod_start, mod_end = hasModifier(s)
        if hasMod:
            operator_args = dict(
                start_span=ref_Sspan + mod_start,
                end_span=ref_Sspan + mod_end,
                repeating_interval=my_month_entity.get_id())
            if mod_type == "This":
                chrono_list.append(
                    chrono.ChronoThisOperator(
                        entityID=str(chrono_id) + "entity", **operator_args))
                chrono_id = chrono_id + 1
            elif mod_type == "Next":
                chrono_list.append(
                    chrono.ChronoNextOperator(
                        entityID=str(chrono_id) + "entity", **operator_args))
                chrono_id = chrono_id + 1
            elif mod_type == "Last":
                chrono_list.append(
                    chrono.ChronoLastOperator(
                        entityID=str(chrono_id) + "entity",
                        semantics="Interval-Not-Included",
                        **operator_args))
                chrono_id = chrono_id + 1

    chrono_list.append(my_month_entity)

    return chrono_list, chrono_id, flags


def _is_unclaimed_year(num, flags):
    """Return True when num is in 1500-2050 and no year flag is set yet."""
    return (num >= 1500 and num <= 2050 and not flags["fourdigityear"]
            and not flags["loneDigitYear"])


def _append_month_day(s, span_text, num, ref_Sspan, chrono_id, chrono_list):
    """Append a day-of-month entity located via span_text; return the next ID."""
    day_startidx, day_endidx = calculateSpan(s.getText(), span_text)
    chrono_list.append(
        chrono.ChronoDayOfMonthEntity(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan + day_startidx,
            end_span=ref_Sspan + day_endidx,
            value=num))
    return chrono_id + 1


def _append_month_year(s, span_text, num, ref_Sspan, my_month_entity,
                       chrono_id, chrono_list, flags):
    """Append a year entity whose sub-interval is the month; return the next ID."""
    flags["fourdigityear"] = True
    year_startidx, year_endidx = calculateSpan(s.getText(), span_text)
    my_year_entity = chrono.ChronoYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=ref_Sspan + year_startidx,
        end_span=ref_Sspan + year_endidx,
        value=num)
    chrono_list.append(my_year_entity)
    my_year_entity.set_sub_interval(my_month_entity.get_id())
    return chrono_id + 1
コード例 #14
0
def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier,
                        feats):
    """Build a Period or Calendar-Interval entity (and helpers) for a phrase.

    ``hasPeriodInterval(s)`` is tried first:

    * phrases containing yesterday/tomorrow/today/daily//min//week always
      become a calendar interval ("yesterday" additionally gets a Last
      operator);
    * a value of ``"Unknown"`` always becomes a period;
    * anything else is classified with ``classifier``; class 1 is a period,
      any other class a calendar interval.  A preceding "this" reference
      token, or a Next/Last modifier reported by ``hasNextLastThis``, becomes
      an operator pointing at the new entity, and a number directly in front
      of the term is attached to it.

    If ``hasPeriodInterval`` finds nothing, ``hasEmbeddedPeriodInterval`` is
    tried; a hit is classified the same way and gets its preceding number
    attached, but no This/Next/Last operator.

    Args:
        s: Time phrase object; ``getSpan()`` and ``getText()`` are used.
        chrono_id: Running counter used to form entity IDs.
        chrono_list: List receiving the new entities (mutated in place).
        ref_list: Reference tokens; each provides ``getSpan()``/``getText()``.
        classifier: Indexable pair ``(model, kind)`` where kind ``"NN"``,
            ``"SVM"``/``"RF"`` or anything else selects how the model is
            called.
        feats: Feature dict template; a copy is passed to
            ``utils.extract_prediction_features``.

    Returns:
        Tuple ``(chrono_list, chrono_id)``.
    """
    ref_Sspan, ref_Espan = s.getSpan()
    boo, val, idxstart, idxend, plural = hasPeriodInterval(s)

    if not boo:
        # No stand-alone term: look for one glued to a number.
        boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
        if not boo2:
            return chrono_list, chrono_id

        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # index of the first overlapping reference token, -1 if none
        ref_idx = -1
        for i, ref_tok in enumerate(ref_list):
            if utils.overlap(ref_tok.getSpan(), (abs_Sspan, abs_Espan)):
                ref_idx = i
                break

        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # if 1 then it is a period, otherwise it is an interval
        if _classify_period_or_interval(classifier, my_features) == 1:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val)
        chrono_id = chrono_id + 1

        chrono_id = _attach_preceding_number(s, idxstart, ref_Sspan,
                                             my_entity, chrono_id,
                                             chrono_list)
        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    abs_Sspan = ref_Sspan + idxstart
    abs_Espan = ref_Sspan + idxend

    # Terms that are always marked as calendar intervals.
    if re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            s.getText()):
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id = chrono_id + 1

        if re.search("yesterday|yesterdays", s.getText()):
            # chrono_id - 1 is the counter value the interval was built with.
            my_last_entity = chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                repeating_interval=str(chrono_id - 1) + "entity")
            chrono_id = chrono_id + 1
            chrono_list.append(my_last_entity)

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # Terms that are always marked as periods.
    if val == "Unknown":
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=val,
            number=None)
        chrono_id = chrono_id + 1
        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # Everything else is decided by the classifier.
    ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
    my_features = utils.extract_prediction_features(ref_list, ref_idx,
                                                    feats.copy())

    # if 1 then it is a period, otherwise it is an interval
    is_period = _classify_period_or_interval(classifier, my_features) == 1
    if is_period:
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=getPeriodValue(val),
            number=None)
    else:
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
    chrono_id = chrono_id + 1

    chrono_id = _add_this_next_last_operator(s, ref_list, ref_idx, ref_Sspan,
                                             my_entity, is_period, chrono_id,
                                             chrono_list)
    chrono_id = _attach_preceding_number(s, idxstart, ref_Sspan, my_entity,
                                         chrono_id, chrono_list)
    chrono_list.append(my_entity)

    return chrono_list, chrono_id


def _classify_period_or_interval(classifier, my_features):
    """Run the (model, kind) classifier on the features and return its class."""
    model, kind = classifier[0], classifier[1]
    if kind == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if kind in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _add_this_next_last_operator(s, ref_list, ref_idx, ref_Sspan, my_entity,
                                 is_period, chrono_id, chrono_list):
    """Append a This/Next/Last operator pointing at my_entity; return next ID."""
    # Only look at the prior token when there is one: index 0 has no
    # predecessor, and a non-positive index minus one would silently wrap
    # around to the end of ref_list.
    if ref_idx > 0:
        prior_ref = ref_list[ref_idx - 1]
        prior_tok = prior_ref.getText().lower()
        # Punctuation is mapped to spaces (not removed), so this only matches
        # a prior token that is exactly "this".
        if prior_tok.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation))) == "this":
            # add a This entity and link it to the period/interval
            start_span, end_span = re.search("this", prior_tok).span(0)
            prior_start, prior_end = prior_ref.getSpan()

            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start + start_span,
                end_span=prior_start + end_span)
            chrono_id = chrono_id + 1
            if is_period:
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
            return chrono_id

    # No "this" in front: check the phrase itself for a Next/Last word.
    hasMod, mod_type, mod_start, mod_end = hasNextLastThis(s)
    if hasMod:
        if is_period:
            link = {"period": my_entity.get_id()}
        else:
            link = {"repeating_interval": my_entity.get_id()}

        if mod_type == "Next":
            chrono_list.append(
                chrono.ChronoNextOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    **link))
            chrono_id = chrono_id + 1

        if mod_type == "Last":
            chrono_list.append(
                chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    semantics="Interval-Not-Included",
                    **link))
            chrono_id = chrono_id + 1

    return chrono_id


def _attach_preceding_number(s, idxstart, ref_Sspan, my_entity, chrono_id,
                             chrono_list):
    """Attach the number just before the term to my_entity; return next ID."""
    # The number is assumed to come before the interval string.  The previous
    # reference token cannot be used because phrases like "42-year-old" are a
    # single reference token, so the span is worked out with index arithmetic
    # on the phrase text.  Phrases like "six-months" have no space before the
    # term, hence the has_space test.
    if not idxstart > 0:
        return chrono_id

    leading = s.getText()[0:idxstart]
    has_space = leading.endswith(' ')
    # purposefully split on a single space
    pieces = leading.strip(' ').split(' ')

    prevtok = pieces[-1]
    if len(pieces) > 1:
        # length of everything before prevtok, plus the joining space
        rest_of_phrase_length = len(' '.join(pieces[:-1])) + 1
    else:
        rest_of_phrase_length = 0
    tok_Sspan = ref_Sspan + rest_of_phrase_length

    m = re.search('([0-9]{1,2})', prevtok)
    if m is not None:
        number_value = m.group(0)
        abs_Sspan = tok_Sspan + m.span(0)[0]
        abs_Espan = tok_Sspan + m.span(0)[1]
    else:
        # else search for a text number
        number_value = utils.getNumberFromText(prevtok)
        if number_value is None:
            return chrono_id
        abs_Sspan = tok_Sspan
        # Without a trailing space the last character of prevtok is treated
        # as a joiner (e.g. the hyphen in "six-") and left out of the span.
        if has_space:
            abs_Espan = tok_Sspan + len(prevtok)
        else:
            abs_Espan = tok_Sspan + len(prevtok) - 1

    my_number_entity = chrono.ChronoNumber(
        entityID=str(chrono_id) + "entity",
        start_span=abs_Sspan,
        end_span=abs_Espan,
        value=number_value)
    chrono_id = chrono_id + 1

    chrono_list.append(my_number_entity)
    my_entity.set_number(my_number_entity.get_id())
    return chrono_id
コード例 #15
0
def hasEmbeddedPeriodInterval(tpentity):
    """Look for a period/interval term glued to a leading number (e.g. "3weeks").

    Punctuation in the phrase text is replaced by spaces, the result is split
    on single spaces, and every token is checked against a fixed list of
    period/interval terms.  A token matches when a term occurs in it at an
    index greater than zero, the text before the term is recognised as an
    integer by ``utils.getNumberFromText``, and the text from the term to the
    end of the token is one of the terms that has a label below.

    Args:
        tpentity: object exposing ``getText()`` that returns the phrase text.

    Returns:
        ``(True, label, start_idx, end_idx, number_text)`` for the first
        match, where ``label`` is one of "Day", "Week", "Month", "Year",
        "Century", "Decade", "Minute", "Second", "Hour" or "Unknown",
        ``start_idx``/``end_idx`` are whatever ``calculateSpan`` returns for
        the matched suffix within the punctuation-normalised text, and
        ``number_text`` is the raw prefix string.  Otherwise
        ``(False, None, None, None, None)``.
    """
    text = tpentity.getText()
    # replace every punctuation character with a space, then tokenise
    text_norm = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    text_list = text_norm.split(" ")

    # NOTE(review): this notice is printed on every call; kept unchanged
    # because it is runtime output.
    print(
        "TOFIX: PeriodInterval.py @ line 388: convert to using the dictionary."
    )
    terms = [
        "decades", "decade", "yesterday", "yesterdays", "today", "todays",
        "tomorrow", "tomorrows", "day", "week", "month", "year", "daily",
        "weekly", "monthly", "yearly", "century", "minute", "second", "hour",
        "hourly", "days", "weeks", "months", "years", "centuries", "century",
        "minutes", "seconds", "hours", "time", "shortly", "soon", "briefly",
        "awhile", "future", "lately", "annual", "hr", "hrs", "min", "mins",
        "quarter"
    ]

    # Label reported for each recognised suffix.  Terms that are searched for
    # but absent here ("annual", "hr", "hrs", "min", "mins", "quarter") never
    # produce a match, exactly as before.
    term_labels = {
        "day": "Day", "daily": "Day", "days": "Day", "yesterday": "Day",
        "tomorrow": "Day", "yesterdays": "Day", "tomorrows": "Day",
        "today": "Day", "todays": "Day",
        "week": "Week", "weekly": "Week", "weeks": "Week",
        "month": "Month", "monthly": "Month", "months": "Month",
        "year": "Year", "yearly": "Year", "years": "Year",
        "century": "Century", "centuries": "Century",
        "decade": "Decade", "decades": "Decade",
        "minute": "Minute", "minutes": "Minute",
        "second": "Second", "seconds": "Second",
        "hour": "Hour", "hourly": "Hour", "hours": "Hour",
        "time": "Unknown", "shortly": "Unknown", "soon": "Unknown",
        "briefly": "Unknown", "awhile": "Unknown", "future": "Unknown",
        "lately": "Unknown",
    }

    for t in text_list:
        for r in terms:
            idx = t.find(r)
            if idx <= 0:
                # term absent, or it starts the token so there is no prefix
                continue

            sub1 = t[:idx]
            if not isinstance(utils.getNumberFromText(sub1), int):
                # A non-numeric prefix (e.g. "some" in "someday") only rules
                # out this term for this token.  Previously the function
                # returned False here, which stopped later terms and later
                # tokens from ever being examined.
                continue

            # everything from the term to the end of the token
            this_term = t[idx:]
            start_idx, end_idx = calculateSpan(text_norm, this_term)
            label = term_labels.get(this_term)
            if label is not None:
                return True, label, start_idx, end_idx, sub1

    return False, None, None, None, None
コード例 #16
0
def buildSeasonOfYear(s, chrono_id, chrono_list, ref_list):
    """Append season-of-year entities found in phrase ``s`` to ``chrono_list``.

    When ``hasSeasonOfYear`` reports a hit, a ``ChronoSeasonOfYearEntity`` is
    created for it.  If ``hasNextLastThis`` reports a "This", "Next" or "Last"
    modifier, the matching operator is appended and linked to the season via
    ``repeating_interval``.  If the text before the season contains a number
    (digits first, otherwise whatever ``utils.getNumberFromText`` recognises),
    a ``ChronoNumber`` is appended and linked with ``set_number``.  The season
    entity itself is appended last.

    Args:
        s: phrase object exposing ``getText()`` and ``getSpan()``.
        chrono_id: running integer used to build entity ids; advanced once
            per entity created here.
        chrono_list: list that receives the new entities (mutated in place).
        ref_list: passed through to ``hasSeasonOfYear``.

    Returns:
        ``(chrono_list, chrono_id)`` with the updated id counter.
    """
    found, season, idxstart, idxend = hasSeasonOfYear(s, ref_list)
    if not found:
        return chrono_list, chrono_id

    ref_Sspan, ref_Espan = s.getSpan()
    abs_Sspan = ref_Sspan + idxstart
    abs_Espan = ref_Sspan + idxend
    my_entity = chrono.ChronoSeasonOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=abs_Sspan,
        end_span=abs_Espan,
        season_type=season)
    chrono_id += 1

    # Modifier handling.
    # NOTE(review): the operator is given the season's own span; the
    # mod_start/mod_end values returned by hasNextLastThis are not used --
    # confirm this is intended.
    has_mod, mod_type, mod_start, mod_end = hasNextLastThis(s)
    if has_mod:
        operator_names = (
            ("This", "ChronoThisOperator"),
            ("Next", "ChronoNextOperator"),
            ("Last", "ChronoLastOperator"),
        )
        for label, class_name in operator_names:
            if mod_type == label:
                operator_cls = getattr(chrono, class_name)
                chrono_list.append(
                    operator_cls(entityID=str(chrono_id) + "entity",
                                 start_span=abs_Sspan,
                                 end_span=abs_Espan,
                                 repeating_interval=my_entity.get_id()))
                chrono_id += 1

    # A number, if any, is assumed to sit in front of the season text.
    if idxstart > 0:
        prefix = s.getText()[0:idxstart]
        digit_match = re.search('([0-9]{1,2})', prefix)
        if digit_match is None:
            # no digits: fall back to a spelled-out number
            word_value = utils.getNumberFromText(prefix)
            if word_value is not None:
                number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan,
                    end_span=ref_Sspan + (idxstart - 1),
                    value=word_value)
                chrono_id += 1
                chrono_list.append(number_entity)
                my_entity.set_number(number_entity.get_id())
        else:
            num_start, num_end = digit_match.span(0)
            number_entity = chrono.ChronoNumber(
                entityID=str(chrono_id) + "entity",
                start_span=ref_Sspan + num_start,
                end_span=ref_Sspan + num_end,
                value=digit_match.group(0))
            chrono_id += 1
            chrono_list.append(number_entity)
            my_entity.set_number(number_entity.get_id())

    chrono_list.append(my_entity)
    return chrono_list, chrono_id
コード例 #17
0
ファイル: DoseDuration.py プロジェクト: nehadil/TACChrono
def _classify_dose_features(classifier, my_features):
    """Return the class predicted for one feature dictionary.

    ``classifier[0]`` is the model object and ``classifier[1]`` a tag that
    selects how it is invoked: "NN" goes through
    ``ChronoKeras.keras_classify``, "SVM"/"RF" call ``predict`` on a single
    row of int-coerced feature values, and anything else calls the model's
    ``classify`` method with the dictionary itself.
    """
    model, kind = classifier[0], classifier[1]
    if kind == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if kind in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _make_dose_entity(s, chrono_id, start_span, end_span, val, is_period):
    """Create the ChronoDoseDurationEntity for phrase ``s``.

    For class 1 (``is_period``) the dose type is ``getDoseDurationValue(val)``;
    otherwise the raw ``val`` is stored.
    """
    dose_type = getDoseDurationValue(val) if is_period else val
    return chrono.ChronoDoseDurationEntity(
        entityID=str(chrono_id) + "entity",
        start_span=start_span,
        end_span=end_span,
        dose_type=dose_type,
        number=None,
        text=s.getText())


def _add_dose_modifier(s, ref_Sspan, my_entity, ref_list, ref_idx, chrono_id,
                       chrono_list, is_period):
    """Append a This/Next/Last operator for ``my_entity`` and return chrono_id.

    The operator is linked through ``period`` when ``is_period`` is true and
    through ``repeating_interval`` otherwise.
    """
    link_name = "period" if is_period else "repeating_interval"

    # Only a token that really precedes the phrase may supply "this".  With
    # ref_idx 0 (or -1) the expression ref_list[ref_idx - 1] would wrap around
    # to the end of the list and inspect an unrelated token.
    if ref_idx > 0:
        prior_ref = ref_list[ref_idx - 1]
        prior_tok = prior_ref.getText().lower()
        if prior_tok.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation))) == "this":
            # prior_tok equals "this" here, so the span is that of the word
            start_span, end_span = re.search("this", prior_tok).span(0)
            prior_start = prior_ref.getSpan()[0]

            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start + start_span,
                end_span=prior_start + end_span)
            chrono_id = chrono_id + 1
            if is_period:
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
            return chrono_id

    # no preceding "this": look for a Next/Last modifier inside the phrase
    hasMod, mod_type, mod_start, mod_end = hasModifier(s)
    if hasMod:
        if mod_type == "Next":
            chrono_list.append(
                chrono.ChronoNextOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    **{link_name: my_entity.get_id()}))
            chrono_id = chrono_id + 1
        elif mod_type == "Last":
            chrono_list.append(
                chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    semantics="Interval-Not-Included",
                    **{link_name: my_entity.get_id()}))
            chrono_id = chrono_id + 1
    return chrono_id


def _set_dose_number(my_entity, chrono_id, start_span, end_span, value):
    """Build a ChronoNumber for the span and link its id to ``my_entity``.

    NOTE(review): the number entity is neither appended to the output list
    nor given an id of its own (chrono_id is not advanced), matching the
    previous code where those steps were commented out -- confirm this is
    intended.
    """
    my_number_entity = chrono.ChronoNumber(
        entityID=str(chrono_id) + "entity",
        start_span=start_span,
        end_span=end_span,
        value=value)
    my_entity.set_number(my_number_entity.get_id())


def buildDoseDuration(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Append a dose-duration entity for phrase ``s`` when it qualifies.

    The phrase is left alone (inputs returned unchanged) when it is empty,
    when ``isDoseDuration`` accepts its first word, when it contains one of a
    set of disqualifying substrings, when it starts with ``q``/``Q`` followed
    by a digit, when a matching reference token is neither numeric nor
    accepted by ``tt.hasDoseDuration``, or when no matching numeric reference
    token is found.

    Otherwise ``hasDoseDuration`` is tried first and
    ``hasEmbeddedPeriodInterval`` second.  On a hit the phrase is classified,
    a ``ChronoDoseDurationEntity`` is created, a This/Next/Last operator is
    added (first path only), a leading number is linked via ``set_number``,
    and the entity is appended to ``chrono_list``.

    Args:
        s: phrase object exposing ``getText()`` and ``getSpan()``.
        chrono_id: running integer used to build entity ids.
        chrono_list: list that receives new entities (mutated in place).
        ref_list: reference tokens exposing ``getText()``, ``getSpan()`` and
            ``isNumeric()``.
        classifier: indexable ``(model, kind)`` pair, see
            ``_classify_dose_features``.
        feats: feature dictionary; a copy is handed to feature extraction.

    Returns:
        ``(chrono_list, chrono_id)`` with the updated id counter.
    """
    ref_Sspan, _ = s.getSpan()
    text = s.getText()
    lowered = text.lower()
    parts = text.split()

    # An empty phrase has no first word to test and cannot be a dose duration.
    if not parts:
        return chrono_list, chrono_id

    # various checks to ensure that this phrase is actually a dose duration
    if isDoseDuration(parts[0]):
        return chrono_list, chrono_id
    if any(word in lowered for word in ("every", "time", "per", "once",
                                        "twice", "past", "ago")):
        return chrono_list, chrono_id
    if re.match(r"^q\d|^Q\d", text):
        return chrono_list, chrono_id
    # these markers are matched case-sensitively against the raw text
    if any(marker in text for marker in ("/", "[**", "**]", "ly", "(", ")",
                                         "RANDOM")):
        return chrono_list, chrono_id

    containsnum = False
    for part in parts:
        part_lower = part.lower()
        for ref in ref_list:
            ref_text = ref.getText().lower()
            if ref_text != part_lower:
                continue
            if ref.isNumeric():
                containsnum = True
                break
            if not tt.hasDoseDuration(ref_text):
                return chrono_list, chrono_id

    if not containsnum:
        return chrono_list, chrono_id

    digits_pattern = '([0-9]{1,2})'

    boo, val, idxstart, idxend, _plural = hasDoseDuration(s)
    if boo:
        abs_Sspan = ref_Sspan
        abs_Espan = ref_Sspan + idxend

        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)

        # extract ML features and classify: 1 is a period, otherwise interval
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())
        is_period = bool(
            _classify_dose_features(classifier, my_features) == 1)

        my_entity = _make_dose_entity(s, chrono_id, abs_Sspan, abs_Espan, val,
                                      is_period)
        chrono_id = chrono_id + 1

        chrono_id = _add_dose_modifier(s, ref_Sspan, my_entity, ref_list,
                                       ref_idx, chrono_id, chrono_list,
                                       is_period)

        # We assume the number comes before the interval string.
        if idxstart > 0:
            substr = text[0:idxstart]
            m = re.search(digits_pattern, substr)
            if m is not None:
                _set_dose_number(my_entity, chrono_id,
                                 ref_Sspan + m.span(0)[0],
                                 ref_Sspan + m.span(0)[1], m.group(0))
            else:
                # else search for a text number
                texNumVal = utils.getNumberFromText(substr)
                if texNumVal is not None:
                    _set_dose_number(my_entity, chrono_id, ref_Sspan,
                                     ref_Sspan + (idxstart - 1), texNumVal)

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
    if not boo2:
        return chrono_list, chrono_id

    abs_Sspan = ref_Sspan
    abs_Espan = ref_Sspan + idxend

    # index of the first overlapping reference token, -1 when there is none
    ref_idx = next(
        (i for i, ref in enumerate(ref_list)
         if utils.overlap(ref.getSpan(), (abs_Sspan, abs_Espan))), -1)

    my_features = utils.extract_prediction_features(ref_list, ref_idx,
                                                    feats.copy())
    # Uses the same dispatch as the first path, so "SVM"/"RF" models are
    # handled here as well instead of falling through to .classify().
    is_period = bool(_classify_dose_features(classifier, my_features) == 1)

    my_entity = _make_dose_entity(s, chrono_id, abs_Sspan, abs_Espan, val,
                                  is_period)
    chrono_id = chrono_id + 1

    # Extract the number and identify the span of numstr
    substr = text[:idxstart]
    m = re.search(digits_pattern, substr)
    if m is not None:
        _set_dose_number(my_entity, chrono_id, ref_Sspan + m.span(0)[0],
                         ref_Sspan + m.span(0)[1], m.group(0))
    else:
        # else search for a text number
        texNumVal = utils.getNumberFromText(numstr)
        if texNumVal is not None:
            # locate the number string inside the leading part of the phrase
            m = re.search(numstr, substr)
            if m is not None:
                _set_dose_number(my_entity, chrono_id,
                                 ref_Sspan + m.span(0)[0],
                                 ref_Sspan + m.span(0)[1], texNumVal)

    chrono_list.append(my_entity)
    return chrono_list, chrono_id