Esempio n. 1
0
def hasNthFromStart(tpentity, ref_list):
    """Look for an ordinal token in a phrase that sits next to a temporal token.

    The phrase text is lower-cased, every punctuation character is replaced
    by a space, and the result is split on single spaces.  Each piece is
    passed to ``utils.isOrdinal``; for the first piece that yields a value
    and whose neighbouring reference token (before or after) reports
    ``isTemporal()``, the function returns a positive result.

    Args:
        tpentity: phrase object exposing ``getSpan()`` (start, end) and
            ``getText()``.
        ref_list: sequence of reference tokens exposing ``isTemporal()``.

    Returns:
        ``(True, val, start_idx, end_idx)`` where ``val`` is whatever
        ``utils.isOrdinal`` returned and the indices come from
        ``calculateSpan`` on the normalised text, or
        ``(False, None, None, None)`` when nothing qualifies.
    """
    phrase_start, _ = tpentity.getSpan()

    # Lower-case and blank out punctuation so tokens split cleanly on spaces.
    blank_punct = str.maketrans(string.punctuation,
                                ' ' * len(string.punctuation))
    text_norm = tpentity.getText().lower().translate(blank_punct)

    for token in text_norm.split(" "):
        val = utils.isOrdinal(token)
        if val is None:
            continue

        start_idx, end_idx = Chrono.utils.calculateSpan(text_norm, token)
        # Locate the reference token covering this ordinal (spans are made
        # absolute by adding the phrase's own start offset).
        idx = utils.getRefIdx(ref_list, phrase_start + start_idx,
                              phrase_start + end_idx)

        # Only look at neighbours that really exist: idx - 1 must not wrap
        # around to the end of the list and idx + 1 must not run past it.
        neighbours = [i for i in (idx - 1, idx + 1)
                      if 0 <= i < len(ref_list)]
        if any(ref_list[i].isTemporal() for i in neighbours):
            return True, val, start_idx, end_idx

    return False, None, None, None


####
# END_MODULE
####
Esempio n. 2
0
def hasSeasonOfYear(tpentity, ref_list):
    """Detect a single season-of-year word (noun use only) in a phrase.

    The phrase text is lower-cased, punctuation is replaced by spaces, the
    result is stripped and split on single spaces.  If exactly one distinct
    season word (singular or plural) is present, its singular form is located
    with ``calculateSpan`` and the matching reference token's POS tag is
    checked.

    Args:
        tpentity: phrase object exposing ``getSpan()`` (start, end) and
            ``getText()``.
        ref_list: sequence of reference tokens exposing ``getPos()``.

    Returns:
        ``(True, label, start_idx, end_idx)`` with ``label`` one of
        ``"Summer"``, ``"Winter"``, ``"Fall"``, ``"Spring"`` when the token is
        tagged ``"NN"``; otherwise ``(False, None, None, None)``.  Zero or
        several distinct season words also give the negative result.
    """
    refStart_span, _ = tpentity.getSpan()

    blank_punct = str.maketrans(string.punctuation,
                                ' ' * len(string.punctuation))
    text_norm = tpentity.getText().lower().translate(blank_punct).strip()
    tokens = set(text_norm.split(" "))

    # Singular season word -> label reported to the caller.
    season_labels = {
        "summer": "Summer",
        "winter": "Winter",
        "fall": "Fall",
        "spring": "Spring",
    }
    # Accept the plural surface form too, but always search for the singular.
    singular_of = {}
    for name in season_labels:
        singular_of[name] = name
        singular_of[name + "s"] = name

    # Which tokens of the phrase are season words?
    matches = tokens & set(singular_of)

    # Exactly one distinct season word is required; with none or several we
    # do not know what to do with the phrase.
    if len(matches) != 1:
        return False, None, None, None

    season = singular_of[next(iter(matches))]
    start_idx, end_idx = calculateSpan(text_norm, season)
    ref_idx = utils.getRefIdx(ref_list, refStart_span + start_idx,
                              refStart_span + end_idx)

    # Only the "NN" tag is accepted (presumably to skip verb uses such as
    # "fall" or "spring" -- confirm against the tagger's output).
    if ref_list[ref_idx].getPos() == "NN":
        return True, season_labels[season], start_idx, end_idx

    return False, None, None, None
Esempio n. 3
0
def _classifyPeriodOrInterval(classifier, my_features):
    """Run the supplied classifier on one feature dict and return its label.

    ``classifier`` is indexed as ``(model, kind)``; ``kind`` selects how the
    model is invoked ("NN", "SVM"/"RF", anything else -> ``.classify``).
    """
    model, kind = classifier[0], classifier[1]
    if kind == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if kind in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _appendNumberEntity(value, start_span, end_span, my_entity, chrono_id,
                        chrono_list):
    """Append a ChronoNumber, link it to ``my_entity``, return the next id."""
    my_number_entity = chrono.ChronoNumber(
        entityID=str(chrono_id) + "entity",
        start_span=start_span,
        end_span=end_span,
        value=value)
    chrono_list.append(my_number_entity)
    my_entity.set_number(my_number_entity.get_id())
    return chrono_id + 1


def _appendOperatorFor(s, ref_Sspan, ref_list, ref_idx, my_entity, link_kw,
                       chrono_id, chrono_list):
    """Append a This / Next / Last operator pointing at ``my_entity`` if needed.

    ``link_kw`` is ``"period"`` or ``"repeating_interval"`` and names the
    operator property that receives the entity id.  Returns the next free id.
    """
    # A literal "this" right before the entity wins over any modifier word.
    # ref_idx must be > 0, otherwise ref_idx - 1 would wrap to the list end.
    if ref_idx > 0:
        prior = ref_list[ref_idx - 1]
        prior_tok = prior.getText().lower()
        if prior_tok.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation))) == "this":
            # The comparison only succeeds when the token is exactly "this",
            # so the operator spans the whole prior token.
            prior_start, prior_end = prior.getSpan()
            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start,
                end_span=prior_start + len("this"))
            chrono_id = chrono_id + 1
            if link_kw == "period":
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
            return chrono_id

    # Otherwise look for a Next / Last modifier word inside the phrase.
    hasMod, mod_type, mod_start, mod_end = hasModifier(s)
    if hasMod and mod_type in ("Next", "Last"):
        op_kwargs = {
            "entityID": str(chrono_id) + "entity",
            "start_span": ref_Sspan + mod_start,
            "end_span": ref_Sspan + mod_end,
            link_kw: my_entity.get_id(),
        }
        if mod_type == "Next":
            chrono_list.append(chrono.ChronoNextOperator(**op_kwargs))
        else:
            chrono_list.append(
                chrono.ChronoLastOperator(semantics="Interval-Not-Included",
                                          **op_kwargs))
        chrono_id = chrono_id + 1
    return chrono_id


def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier,
                        feats):
    """Create Period / Calendar-Interval entities (plus helpers) for a phrase.

    Depending on what ``hasPeriodInterval`` / ``hasEmbeddedPeriodInterval``
    report for ``s``, this appends to ``chrono_list``:

    * a Calendar-Interval for fixed words (yesterday, today, daily, ...), with
      an extra Last operator for "yesterday";
    * a Period with type "Unknown" when the detected value is "Unknown";
    * otherwise a Period (class 1) or Calendar-Interval (any other class) as
      chosen by the classifier, optionally preceded by a This / Next / Last
      operator and a Number entity found before the interval word;
    * for embedded forms, the classified entity plus its Number entity.

    Args:
        s: phrase object exposing ``getSpan()`` and ``getText()``.
        chrono_id: next free numeric id; ids are rendered as ``"<n>entity"``.
        chrono_list: list the new entities are appended to (mutated in place).
        ref_list: reference tokens exposing ``getSpan()`` and ``getText()``.
        classifier: indexable ``(model, kind)`` pair; see
            ``_classifyPeriodOrInterval`` for how ``kind`` is used.
        feats: feature dict template; a copy is handed to
            ``utils.extract_prediction_features``.

    Returns:
        ``(chrono_list, chrono_id)`` with the id advanced past every entity
        that was created.
    """
    features = feats.copy()
    ref_Sspan, ref_Espan = s.getSpan()
    boo, val, idxstart, idxend, plural = hasPeriodInterval(s)

    if not boo:
        # No stand-alone period/interval word: try the embedded form.
        boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
        if not boo2:
            return chrono_list, chrono_id

        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # Index of the first reference token overlapping the span; stays -1
        # when nothing overlaps.
        ref_idx = -1
        for i, ref_tok in enumerate(ref_list):
            if utils.overlap(ref_tok.getSpan(), (abs_Sspan, abs_Espan)):
                ref_idx = i
                break

        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, features)

        # Same dispatch as the stand-alone branch so "SVM"/"RF" models are
        # called through predict() here as well.
        if _classifyPeriodOrInterval(classifier, my_features) == 1:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val)
        chrono_id = chrono_id + 1

        # The number lives in the part of the phrase before the interval.
        substr = s.getText()[:idxstart]
        m = re.search('([0-9]{1,2})', substr)
        if m is not None:
            chrono_id = _appendNumberEntity(
                m.group(0), ref_Sspan + m.span(0)[0],
                ref_Sspan + m.span(0)[1], my_entity, chrono_id, chrono_list)
        else:
            # No digits: fall back to a spelled-out number.
            texNumVal = utils.getNumberFromText(numstr)
            if texNumVal is not None:
                # NOTE(review): numstr is used as a regex pattern unescaped;
                # confirm it can never contain regex metacharacters.
                m = re.search(numstr, substr)
                if m is not None:
                    chrono_id = _appendNumberEntity(
                        texNumVal, ref_Sspan + m.span(0)[0],
                        ref_Sspan + m.span(0)[1], my_entity, chrono_id,
                        chrono_list)

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    abs_Sspan = ref_Sspan + idxstart
    abs_Espan = ref_Sspan + idxend

    # Terms that are always marked as calendar intervals.
    if re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            s.getText()):
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id = chrono_id + 1

        if re.search("yesterday|yesterdays", s.getText()):
            # The Last operator refers back to the interval created just
            # above, whose id is chrono_id - 1.
            my_last_entity = chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                repeating_interval=str(chrono_id - 1) + "entity")
            chrono_id = chrono_id + 1
            chrono_list.append(my_last_entity)

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # Terms that are always marked as periods.
    if val == "Unknown":
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=val,
            number=None)
        chrono_id = chrono_id + 1
        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # Everything else is decided by the classifier.
    ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
    my_features = utils.extract_prediction_features(ref_list, ref_idx,
                                                    features)

    # Class 1 is a period; any other class is treated as an interval.
    if _classifyPeriodOrInterval(classifier, my_features) == 1:
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=getPeriodValue(val),
            number=None)
        link_kw = "period"
    else:
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        link_kw = "repeating_interval"
    chrono_id = chrono_id + 1

    chrono_id = _appendOperatorFor(s, ref_Sspan, ref_list, ref_idx, my_entity,
                                   link_kw, chrono_id, chrono_list)

    # A number, if any, is assumed to come before the interval string.
    if idxstart > 0:
        substr = s.getText()[0:idxstart]
        m = re.search('([0-9]{1,2})', substr)
        if m is not None:
            chrono_id = _appendNumberEntity(
                m.group(0), ref_Sspan + m.span(0)[0],
                ref_Sspan + m.span(0)[1], my_entity, chrono_id, chrono_list)
        else:
            texNumVal = utils.getNumberFromText(substr)
            if texNumVal is not None:
                # A spelled-out number is given the span of everything before
                # the interval word, minus the separating character.
                chrono_id = _appendNumberEntity(
                    texNumVal, ref_Sspan, ref_Sspan + (idxstart - 1),
                    my_entity, chrono_id, chrono_list)

    chrono_list.append(my_entity)
    return chrono_list, chrono_id
Esempio n. 4
0
def _appendRepeatingOperator(operator_cls, start_span, end_span, target,
                             chrono_id, chrono_list):
    """Append ``operator_cls`` pointing at ``target``; return the next id."""
    chrono_list.append(
        operator_cls(entityID=str(chrono_id) + "entity",
                     start_span=start_span,
                     end_span=end_span,
                     repeating_interval=target.get_id()))
    return chrono_id + 1


def buildSubIntervals(chrono_list, chrono_id, dct, ref_list):
    """Link the entities of one phrase together and prune orphaned ones.

    Steps, in order:

    1. Record the index of the last entity of each relevant type.
    2. If there is no year, a document time ``dct`` is given and no
       This/Next/Last operator exists yet, add an implicit Last or Next
       operator for a month (by comparing with ``dct``) and for a day of week
       (by the tense of the closest preceding verb tag).
    3. Chain sub-intervals (second -> minute -> hour -> day -> month -> year,
       plus day-of-week / part-of-day) and attach NthFromStart to a period or
       interval.
    4. Attach AM/PM and time zone to the hour and a modifier to the period or
       interval; entities that have nothing to attach to are removed.

    Removals are collected and applied only after all linking is done, so the
    indices recorded in step 1 stay valid throughout.

    Args:
        chrono_list: list of entities for the phrase (mutated in place).
        chrono_id: next free numeric id; ids are rendered as ``"<n>entity"``.
        dct: document creation time exposing ``month`` and ``day``, or None.
        ref_list: reference tokens exposing ``getPos()``.

    Returns:
        ``(chrono_list, chrono_id)``.
    """
    # Entity type -> slot name.  A later entity of the same slot overrides an
    # earlier one.
    slot_of_type = {
        "Two-Digit-Year": "year",
        "Year": "year",
        "Month-Of-Year": "month",
        "Day-Of-Month": "day",
        "Hour-Of-Day": "hour",
        "Minute-Of-Hour": "minute",
        "Second-Of-Minute": "second",
        "Part-Of-Day": "daypart",
        "Day-Of-Week": "dayweek",
        "Calendar-Interval": "interval",
        "Period": "period",
        "NthFromStart": "nth",
        "Next": "nxt",
        "This": "this",
        "Time-Zone": "tz",
        "AMPM-Of-Day": "ampm",
        "Modifier": "modifier",
        "Last": "last",
    }
    found = {}
    for position, entity in enumerate(chrono_list):
        slot = slot_of_type.get(entity.get_type())
        if slot is not None:
            found[slot] = position

    year = found.get("year")
    month = found.get("month")
    day = found.get("day")
    hour = found.get("hour")
    minute = found.get("minute")
    second = found.get("second")
    daypart = found.get("daypart")
    dayweek = found.get("dayweek")
    interval = found.get("interval")
    period = found.get("period")
    nth = found.get("nth")
    tz = found.get("tz")
    ampm = found.get("ampm")
    modifier = found.get("modifier")

    ## Add implicit NEXT and LAST operators for dates written without a year
    ## and without an explicit This/Next/Last.
    no_operator = not ({"this", "nxt", "last"} & set(found))
    if year is None and dct is not None and no_operator:
        if month is not None:
            month_entity = chrono_list[month]
            mStart = month_entity.get_start_span()
            mEnd = month_entity.get_end_span()
            my_month = utils.getMonthNumber(month_entity.get_month_type())

            operator_cls = None
            if day is not None and my_month == dct.month:
                # Same month as the document: decide on the day of month.
                day_value = chrono_list[day].get_value()
                if day_value <= dct.day:
                    operator_cls = chrono.ChronoLastOperator
                elif day_value > dct.day:
                    operator_cls = chrono.ChronoNextOperator
            elif my_month < dct.month:
                operator_cls = chrono.ChronoLastOperator
            elif my_month > dct.month:
                operator_cls = chrono.ChronoNextOperator

            if operator_cls is not None:
                chrono_id = _appendRepeatingOperator(
                    operator_cls, mStart, mEnd, month_entity, chrono_id,
                    chrono_list)

        if dayweek is not None:
            ## A past day may be referenced without it being explicit, so use
            ## the tense of the closest preceding verb to choose Last or Next.
            dayweek_entity = chrono_list[dayweek]
            mStart = dayweek_entity.get_start_span()
            mEnd = dayweek_entity.get_end_span()

            # Walk backwards from the day-of-week token to the first verb tag.
            # NOTE(review): the scan stops before index 0, so the very first
            # reference token is never examined -- confirm this is intended.
            ref = utils.getRefIdx(ref_list, mStart, mEnd)
            while ref != 0:
                pos = ref_list[ref].getPos()
                if "VB" in pos:
                    if pos in ["VBD", "VBN"]:
                        # past tense -> Last
                        chrono_id = _appendRepeatingOperator(
                            chrono.ChronoLastOperator, mStart, mEnd,
                            dayweek_entity, chrono_id, chrono_list)
                    elif pos in ["VB", "VBG", "VBP", "VBZ"]:
                        # present tense -> Next
                        chrono_id = _appendRepeatingOperator(
                            chrono.ChronoNextOperator, mStart, mEnd,
                            dayweek_entity, chrono_id, chrono_list)
                    break
                ref -= 1

    ## Now assign all sub-intervals
    if second is not None and minute is not None:
        chrono_list[minute].set_sub_interval(chrono_list[second].get_id())
    if minute is not None and hour is not None:
        chrono_list[hour].set_sub_interval(chrono_list[minute].get_id())
    if hour is not None and day is not None:
        chrono_list[day].set_sub_interval(chrono_list[hour].get_id())
    if day is not None and month is not None:
        chrono_list[month].set_sub_interval(chrono_list[day].get_id())
    if month is not None and year is not None:
        chrono_list[year].set_sub_interval(chrono_list[month].get_id())
    if dayweek is not None and hour is not None:
        chrono_list[dayweek].set_sub_interval(chrono_list[hour].get_id())
    if dayweek is not None and daypart is not None and hour is None:
        chrono_list[dayweek].set_sub_interval(chrono_list[daypart].get_id())
    if day is not None and daypart is not None and hour is None:
        chrono_list[day].set_sub_interval(chrono_list[daypart].get_id())
    if nth is not None and period is not None:
        chrono_list[nth].set_period(chrono_list[period].get_id())
    elif nth is not None and interval is not None:
        chrono_list[nth].set_repeating_interval(chrono_list[interval].get_id())

    # Indices of entities with nothing to attach to.  They are deleted only
    # at the very end: deleting earlier would shift the list and invalidate
    # every index recorded above.
    orphaned = []

    if ampm is not None:
        if hour is not None:
            chrono_list[hour].set_ampm(chrono_list[ampm].get_id())
        else:
            orphaned.append(ampm)

    if tz is not None:
        if hour is not None:
            chrono_list[hour].set_time_zone(chrono_list[tz].get_id())
        else:
            orphaned.append(tz)

    # Link modifiers: a period is preferred over an interval.
    if modifier is not None:
        if period is not None:
            chrono_list[period].set_modifier(chrono_list[modifier].get_id())
        elif interval is not None:
            chrono_list[interval].set_modifier(chrono_list[modifier].get_id())
        else:
            orphaned.append(modifier)

    # Highest index first so the remaining indices are unaffected.
    for position in sorted(orphaned, reverse=True):
        del chrono_list[position]

    ## Not done here (tried and rejected per the original notes): removing a
    ## lone Last entity, and removing an NthFromStart that has no period or
    ## interval.  Both removed false positives at the cost of recall.

    return chrono_list, chrono_id
Esempio n. 5
0
def buildDoseDuration(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build a dose-duration entity (and any linked operator) for one phrase.

    The phrase is first screened by a series of textual checks; if any of
    them fires, nothing is added.  Otherwise the phrase is tested with
    ``hasDoseDuration`` and, failing that, ``hasEmbeddedPeriodInterval``.
    On a hit the classifier decides between the two flavours of entity:
    label ``1`` is treated as a period (``dose_type`` is passed through
    ``getDoseDurationValue``), any other label as an interval (``dose_type``
    is the raw value).

    Args:
        s: phrase object; ``getText()`` and ``getSpan()`` are used.
        chrono_id: integer used to build the next ``"<n>entity"`` ID.
        chrono_list: list of entities; new entities are appended in place.
        ref_list: reference tokens; ``getText()``, ``getSpan()`` and
            ``isNumeric()`` are used on them.
        classifier: indexable pair, model at ``[0]`` and a type label at
            ``[1]`` ("NN", "SVM", "RF"; anything else uses ``classify``).
        feats: feature dict template; a copy is handed to feature extraction.

    Returns:
        The tuple ``(chrono_list, chrono_id)`` with ``chrono_id`` advanced by
        one for every entity that was created.
    """
    ref_Sspan, ref_Espan = s.getSpan()
    text = s.getText()
    lowered = text.lower()
    parts = text.split()

    # --- various checks to ensure that this phrase is actually a dose duration

    # An empty / whitespace-only phrase has no first token to inspect and can
    # never contain a number, so it is rejected up front instead of raising
    # IndexError on parts[0].
    if not parts:
        return chrono_list, chrono_id
    if isDoseDuration(parts[0]):
        return chrono_list, chrono_id
    # Case-insensitive stop words.
    if any(word in lowered for word in
           ("every", "time", "per", "once", "twice", "past", "ago")):
        return chrono_list, chrono_id
    # Case-sensitive markers.
    if any(mark in text for mark in
           ("/", "[**", "**]", "ly", "(", ")", "RANDOM")):
        return chrono_list, chrono_id
    # Phrases starting with "q<digit>" / "Q<digit>" are rejected.
    if re.match(r"^q\d|^Q\d", text):
        return chrono_list, chrono_id

    # Every word of the phrase is compared against the reference tokens with
    # the same text: a numeric match is required somewhere, an ordinal
    # rejects the phrase, and any non-numeric match must itself be
    # recognised by tt.hasDoseDuration.
    punct = re.compile('[' + string.punctuation + ']')
    containsnum = False
    for part in parts:
        token = punct.sub('', part).strip().lower()
        for ref in ref_list:
            ref_text = ref.getText()
            if ref_text.lower() != token:
                continue
            if ref.isNumeric():
                containsnum = True
                if utils.isOrdinal(ref_text):
                    return chrono_list, chrono_id
                break
            if not tt.hasDoseDuration(ref_text.lower()):
                return chrono_list, chrono_id
    if not containsnum:
        return chrono_list, chrono_id

    # --- primary path: the phrase itself is a dose duration
    boo, val, _, _, _ = hasDoseDuration(s)
    if boo:
        ref_idx = utils.getRefIdx(ref_list, ref_Sspan, ref_Espan)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())
        is_period = _classifyDoseDuration(classifier, my_features) == 1

        my_entity = chrono.ChronoDoseDurationEntity(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan,
            end_span=ref_Espan,
            dose_type=getDoseDurationValue(val) if is_period else val,
            number=None,
            text=text)
        chrono_id = chrono_id + 1

        # Operators are appended before the entity they point at.
        chrono_id = _linkDoseDurationOperator(
            s, my_entity, is_period, ref_Sspan, ref_idx, chrono_id,
            chrono_list, ref_list)
        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # --- fallback path: an embedded period/interval
    boo2, val, _, _, _ = hasEmbeddedPeriodInterval(s)
    if boo2:
        # index of the first reference token overlapping the phrase, else -1
        ref_idx = next(
            (i for i, ref in enumerate(ref_list)
             if utils.overlap(ref.getSpan(), (ref_Sspan, ref_Espan))),
            -1)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())
        # Same dispatch as the primary path, so "SVM"/"RF" models go through
        # predict() here as well instead of falling through to classify().
        is_period = _classifyDoseDuration(classifier, my_features) == 1

        my_entity = chrono.ChronoDoseDurationEntity(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan,
            end_span=ref_Espan,
            dose_type=getDoseDurationValue(val) if is_period else val,
            number=None,
            text=text)
        chrono_id = chrono_id + 1
        chrono_list.append(my_entity)

    return chrono_list, chrono_id


def _classifyDoseDuration(classifier, my_features):
    """Run the configured model on one feature dict and return its label."""
    model, model_type = classifier[0], classifier[1]
    if model_type == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if model_type in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _linkDoseDurationOperator(s, my_entity, is_period, ref_Sspan, ref_idx,
                              chrono_id, chrono_list, ref_list):
    """Append a This/Next/Last operator linked to my_entity, if one applies.

    A preceding reference token equal to "this" wins; otherwise the phrase is
    checked with ``hasModifier`` for a "Next" or "Last" modifier.  Period
    entities are linked through ``period``, interval entities through
    ``repeating_interval``.  Returns the next unused chrono_id.
    """
    entity_id = my_entity.get_id()

    # Only look at the preceding token when there is one: with ref_idx == 0
    # the expression ref_list[ref_idx - 1] would wrap around to the last
    # token of ref_list.
    prior_tok = ref_list[ref_idx - 1].getText().lower() if ref_idx > 0 else ""
    if prior_tok == "this":
        # add a This entity and link it to the dose duration
        prior_start, _ = ref_list[ref_idx - 1].getSpan()
        chrono_this_entity = chrono.ChronoThisOperator(
            entityID=str(chrono_id) + "entity",
            start_span=prior_start,
            end_span=prior_start + len(prior_tok))
        chrono_id = chrono_id + 1
        if is_period:
            chrono_this_entity.set_period(entity_id)
        else:
            chrono_this_entity.set_repeating_interval(entity_id)
        chrono_list.append(chrono_this_entity)
        return chrono_id

    # check for a Next / Last word inside the phrase
    hasMod, mod_type, mod_start, mod_end = hasModifier(s)
    if not hasMod:
        return chrono_id

    link = {("period" if is_period else "repeating_interval"): entity_id}
    if mod_type == "Next":
        chrono_list.append(
            chrono.ChronoNextOperator(
                entityID=str(chrono_id) + "entity",
                start_span=ref_Sspan + mod_start,
                end_span=ref_Sspan + mod_end,
                **link))
        chrono_id = chrono_id + 1
    elif mod_type == "Last":
        chrono_list.append(
            chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=ref_Sspan + mod_start,
                end_span=ref_Sspan + mod_end,
                semantics="Interval-Not-Included",
                **link))
        chrono_id = chrono_id + 1
    return chrono_id
Esempio n. 6
0
def hasTextMonth(tpentity, ref_list):
    """Look for a month written out as text inside a phrase.

    Three passes are made over the lower-cased phrase, in this order:
    full month names, abbreviations ending in a period, and abbreviations
    without a period.  In each pass only the first matching token is
    considered, and it is accepted only when the reference token located
    through ``utils.getRefIdx`` carries the POS tag "NNP"; otherwise the
    next pass is tried.

    Args:
        tpentity: phrase object; ``getSpan()`` and ``getText()`` are used.
        ref_list: reference tokens; ``getPos()`` is used on the located one.

    Returns:
        ``(True, month_name, start_idx, end_idx)`` on success, where the
        indices come from ``calculateSpan`` on the lower-cased phrase, or
        ``(False, None, None, None)`` when no pass succeeds.
    """
    refStart_span, _ = tpentity.getSpan()
    text_lower = tpentity.getText().lower()

    # Tokens with every punctuation character blanked out (full-name pass).
    all_punc = string.punctuation
    plain_tokens = text_lower.translate(
        str.maketrans(all_punc, ' ' * len(all_punc))).strip().split(" ")

    # Tokens with everything but the period blanked out (abbreviation passes).
    adj_punc = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'
    dotted_tokens = text_lower.translate(
        str.maketrans(adj_punc, ' ' * len(adj_punc))).strip().split(" ")

    # Lookup tables; key order is the order in which candidates are tried.
    full_month = {
        "january": "January", "february": "February", "march": "March",
        "april": "April", "may": "May", "june": "June", "july": "July",
        "august": "August", "september": "September", "october": "October",
        "november": "November", "december": "December",
    }
    abbr_with_dot = {
        "jan.": "January", "feb.": "February", "mar.": "March",
        "apr.": "April", "jun.": "June", "jul.": "July", "aug.": "August",
        "sept.": "September", "sep.": "September", "oct.": "October",
        "nov.": "November", "dec.": "December",
    }
    abbr_no_dot = {
        "jan": "January", "feb": "February", "mar": "March", "apr": "April",
        "jun": "June", "jul": "July", "aug": "August", "sept": "September",
        "sep": "September", "oct": "October", "nov": "November",
        "dec": "December",
    }

    passes = (
        (plain_tokens, full_month),
        (dotted_tokens, abbr_with_dot),
        (dotted_tokens, abbr_no_dot),
    )
    for tokens, table in passes:
        found = _firstMonthToken(tokens, table)
        if found is None:
            continue

        # Locate the matched string in the phrase and check the POS tag of
        # the reference token at that absolute position.
        start_idx, end_idx = calculateSpan(text_lower, found)
        ref_pos = utils.getRefIdx(ref_list, refStart_span + start_idx,
                                  refStart_span + end_idx)
        if ref_list[ref_pos].getPos() == "NNP":
            return True, table[found], start_idx, end_idx
        # Not a proper noun: this pass is finished, fall through to the next.

    return False, None, None, None


def _firstMonthToken(tokens, month_names):
    """Return the month key matched by the first qualifying token, or None.

    A token qualifies when it is a substring of some key and some key is a
    substring of it; the key returned is the first one contained in the token.
    """
    for tok in tokens:
        if not any(tok in name for name in month_names):
            continue
        for name in month_names:
            if name in tok:
                return name
    return None
Esempio n. 7
0
def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier,
                        feats):

    features = feats.copy()
    ref_Sspan, ref_Espan = s.getSpan()
    boo, val, idxstart, idxend, plural = hasPeriodInterval(s)

    # FIND terms that are always marked as calendar intervals!
    if boo and re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            s.getText()):
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id = chrono_id + 1

        if re.search("yesterday|yesterdays", s.getText()):

            my_last_entity = chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                repeating_interval=str(chrono_id - 1) + "entity")
            chrono_id = chrono_id + 1
            chrono_list.append(my_last_entity)

        chrono_list.append(my_entity)

    # FIND terms that are always marked as periods!
    elif boo and val == "Unknown":
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoPeriodEntity(entityID=str(chrono_id) +
                                              "entity",
                                              start_span=abs_Sspan,
                                              end_span=abs_Espan,
                                              period_type=val,
                                              number=None)
        chrono_id = chrono_id + 1
        chrono_list.append(my_entity)

    elif boo:
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # get index of overlapping reference token
        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)

        # extract ML features
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # classify into period or interval
        if classifier[1] == "NN":
            my_class = ChronoKeras.keras_classify(
                classifier[0], np.array(list(my_features.values())))
            #print("Class: " + str(my_class) + " : Start: " + str(abs_Sspan) + " : End: "+ str(abs_Espan))
        elif classifier[1] in ("SVM", "RF"):
            feat_array = [int(i) for i in my_features.values()]
            my_class = classifier[0].predict([feat_array])[0]
        else:
            my_class = classifier[0].classify(my_features)
            #print("Class: " + str(my_class) + " : Start: " + str(abs_Sspan) + " : End: "+ str(abs_Espan))

        # if 1 then it is a period, if 0 then it is an interval
        if my_class == 1:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
            chrono_id = chrono_id + 1
            ### Check to see if this calendar interval has a "this" in front of it
            prior_tok = ref_list[ref_idx - 1].getText().lower()
            if prior_tok.translate(
                    str.maketrans(string.punctuation,
                                  ' ' * len(string.punctuation))) == "this":
                # add a This entitiy and link it to the interval.
                start_span, end_span = re.search(prior_tok, "this").span(0)
                prior_start, prior_end = ref_list[ref_idx - 1].getSpan()

                chrono_this_entity = chrono.ChronoThisOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=prior_start + start_span,
                    end_span=prior_start + end_span)
                chrono_id = chrono_id + 1
                chrono_this_entity.set_period(my_entity.get_id())
                chrono_list.append(chrono_this_entity)

            else:
                # check for a Last Word
                hasMod, mod_type, mod_start, mod_end = hasNextLastThis(s)

                if (hasMod):
                    if mod_type == "Next":
                        chrono_list.append(
                            chrono.ChronoNextOperator(
                                entityID=str(chrono_id) + "entity",
                                start_span=ref_Sspan + mod_start,
                                end_span=ref_Sspan + mod_end,
                                period=my_entity.get_id()))
                        chrono_id = chrono_id + 1

                    if mod_type == "Last":
                        chrono_list.append(
                            chrono.ChronoLastOperator(
                                entityID=str(chrono_id) + "entity",
                                start_span=ref_Sspan + mod_start,
                                end_span=ref_Sspan + mod_end,
                                period=my_entity.get_id(),
                                semantics="Interval-Not-Included"))
                        chrono_id = chrono_id + 1

        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
            chrono_id = chrono_id + 1
            ### Check to see if this calendar interval has a "this" in front of it
            prior_tok = ref_list[ref_idx - 1].getText().lower()
            if prior_tok.translate(
                    str.maketrans(string.punctuation,
                                  ' ' * len(string.punctuation))) == "this":
                # add a This entitiy and link it to the interval.
                start_span, end_span = re.search(prior_tok, "this").span(0)
                prior_start, prior_end = ref_list[ref_idx - 1].getSpan()

                chrono_this_entity = chrono.ChronoThisOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=prior_start + start_span,
                    end_span=prior_start + end_span)
                chrono_id = chrono_id + 1
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
                chrono_list.append(chrono_this_entity)
            else:
                # check for a Last Word
                hasMod, mod_type, mod_start, mod_end = hasNextLastThis(s)
                if (hasMod):
                    if mod_type == "Next":
                        chrono_list.append(
                            chrono.ChronoNextOperator(
                                entityID=str(chrono_id) + "entity",
                                start_span=ref_Sspan + mod_start,
                                end_span=ref_Sspan + mod_end,
                                repeating_interval=my_entity.get_id()))
                        chrono_id = chrono_id + 1

                    if mod_type == "Last":
                        chrono_list.append(
                            chrono.ChronoLastOperator(
                                entityID=str(chrono_id) + "entity",
                                start_span=ref_Sspan + mod_start,
                                end_span=ref_Sspan + mod_end,
                                repeating_interval=my_entity.get_id(),
                                semantics="Interval-Not-Included"))
                        chrono_id = chrono_id + 1

        #check to see if it has a number associated with it.  We assume the number comes before the interval string
        #to figure out if the number we find is close to the interval token the end of the number string needs to be within 2 characters of the start of the interval token.
        #I tried just extracting the previous reference token, but that doesn't work because phrases like "42-year-old" are actually one reference token.
        # So I decided I had to do it the hard way with index arithmetic.  The one concern about this method is that I assume there is a space at the end.  This could cause some issues down the line.
        # Yep, we are getting the spans wrong for phrases like "six-months".  I am going to test for a space as the last character before just assuming there was one.
        if idxstart > 0:
            ## get the absolute span of the interval token
            abs_Sspan = ref_Sspan + idxstart
            abs_Espan = ref_Sspan + idxend

            ## purposfully split on a single space
            substr = s.getText()[0:idxstart]
            # test to see if last character is a space and set a flag.
            has_space = True if substr[len(substr) - 1] == ' ' else False
            substr = substr.strip(' ').split(' ')

            ## get the previous token
            prevtok = substr[len(substr) - 1]
            prev_sSpan = idxstart - len(
                prevtok) - 1 if has_space else idxstart - len(prevtok)
            prev_eSpan = idxstart - 1

            ## get the rest of the substring joined by a space
            if len(substr) > 1:
                rest_of_phrase = ' '.join(substr[0:len(substr) - 1])
                rest_of_phrase_length = len(rest_of_phrase) + 1

            else:
                rest_of_phrase_length = 0

            m = re.search('([0-9]{1,2})', prevtok)
            if m is not None:
                num_val = m.group(0)
                abs_Sspan = ref_Sspan + rest_of_phrase_length + m.span(0)[0]
                abs_Espan = ref_Sspan + rest_of_phrase_length + m.span(0)[1]

                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    value=num_val)
                chrono_id = chrono_id + 1

                #add the number entity to the list
                chrono_list.append(my_number_entity)
                my_entity.set_number(my_number_entity.get_id())
            #else search for a text number
            else:
                texNumVal = utils.getNumberFromText(prevtok)
                if texNumVal is not None:
                    abs_Sspan = ref_Sspan + rest_of_phrase_length
                    abs_Espan = ref_Sspan + rest_of_phrase_length + len(
                        prevtok
                    ) if has_space else ref_Sspan + rest_of_phrase_length + len(
                        prevtok) - 1

                    #create the number entity
                    my_number_entity = chrono.ChronoNumber(
                        entityID=str(chrono_id) + "entity",
                        start_span=abs_Sspan,
                        end_span=abs_Espan,
                        value=texNumVal)
                    chrono_id = chrono_id + 1
                    #append to list
                    chrono_list.append(my_number_entity)
                    #link to interval entity
                    my_entity.set_number(my_number_entity.get_id())

        chrono_list.append(my_entity)

    else:
        boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
        if (boo2):
            abs_Sspan = ref_Sspan + idxstart
            abs_Espan = ref_Sspan + idxend

            # get index of overlapping reference token
            ref_idx = -1
            for i in range(0, len(ref_list)):
                if (utils.overlap(ref_list[i].getSpan(),
                                  (abs_Sspan, abs_Espan))):
                    ref_idx = i
                    break

            # extract ML features
            my_features = utils.extract_prediction_features(
                ref_list, ref_idx, features)

            # classify into period or interval
            if (classifier[1] == "NN"):
                my_class = ChronoKeras.keras_classify(
                    classifier[0], np.array(list(my_features.values())))
                #print("Class: " + str(my_class) + " : Start: " + str(abs_Sspan) + " : End: "+ str(abs_Espan))
            elif classifier[1] in ("SVM", "RF"):
                feat_array = [int(i) for i in my_features.values()]
                my_class = classifier[0].predict([feat_array])[0]
            else:
                my_class = classifier[0].classify(my_features)
                #print("Class: " + str(my_class) + " : Start: " + str(abs_Sspan) + " : End: "+ str(abs_Espan))

            # if 1 then it is a period, if 0 then it is an interval
            if (my_class == 1):
                my_entity = chrono.ChronoPeriodEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    period_type=getPeriodValue(val),
                    number=None)
                chrono_id = chrono_id + 1
            else:
                my_entity = chrono.ChronoCalendarIntervalEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    calendar_type=val)
                chrono_id = chrono_id + 1

            #Extract the number and identify the span of numstr
            if idxstart > 0:
                ## get the absolute span of the interval token
                abs_Sspan = ref_Sspan + idxstart
                abs_Espan = ref_Sspan + idxend

                ## purposfully split on a single space
                substr = s.getText()[0:idxstart]
                # test to see if last character is a space and set a flag.
                has_space = True if substr[len(substr) - 1] == ' ' else False
                substr = substr.strip(' ').split(' ')

                ## get the previous token
                prevtok = substr[len(substr) - 1]
                prev_sSpan = idxstart - len(
                    prevtok) - 1 if has_space else idxstart - len(prevtok)
                prev_eSpan = idxstart - 1

                ## get the rest of the substring joined by a space
                if len(substr) > 1:
                    rest_of_phrase = ' '.join(substr[0:len(substr) - 1])
                    rest_of_phrase_length = len(rest_of_phrase) + 1

                else:
                    rest_of_phrase_length = 0

                ## now calculate the relative span of prevtok
                #rel_Sspan = rest_of_phrase_length
                #rel_Espan = rest_of_phrase_length + len(prevtok)

                m = re.search('([0-9]{1,2})', prevtok)
                if m is not None:
                    num_val = m.group(0)
                    abs_Sspan = ref_Sspan + rest_of_phrase_length + m.span(
                        0)[0]
                    abs_Espan = ref_Sspan + rest_of_phrase_length + m.span(
                        0)[1]

                    my_number_entity = chrono.ChronoNumber(
                        entityID=str(chrono_id) + "entity",
                        start_span=abs_Sspan,
                        end_span=abs_Espan,
                        value=num_val)
                    chrono_id = chrono_id + 1

                    #add the number entity to the list
                    chrono_list.append(my_number_entity)
                    my_entity.set_number(my_number_entity.get_id())
                #else search for a text number
                else:
                    texNumVal = utils.getNumberFromText(prevtok)
                    if texNumVal is not None:
                        abs_Sspan = ref_Sspan + rest_of_phrase_length
                        abs_Espan = ref_Sspan + rest_of_phrase_length + len(
                            prevtok
                        ) if has_space else ref_Sspan + rest_of_phrase_length + len(
                            prevtok) - 1
                        #create the number entity
                        my_number_entity = chrono.ChronoNumber(
                            entityID=str(chrono_id) + "entity",
                            start_span=abs_Sspan,
                            end_span=abs_Espan,
                            value=texNumVal)
                        chrono_id = chrono_id + 1
                        #append to list
                        chrono_list.append(my_number_entity)
                        #link to interval entity
                        my_entity.set_number(my_number_entity.get_id())

            chrono_list.append(my_entity)

    return chrono_list, chrono_id