def buildSeasonOfYear(s, chrono_id, chrono_list, ref_list):
    """Create the season-of-year entity (and any linked entities) for a phrase.

    When ``hasSeasonOfYear`` reports a match, a ``ChronoSeasonOfYearEntity``
    is built for the matched span.  A This/Next/Last operator is added when
    ``hasNextLastThis`` reports one, and a ``ChronoNumber`` is added when a
    number (digits first, otherwise a spelled-out number) is found in the
    text that precedes the season word.

    Args:
        s: Phrase object providing ``getSpan()`` and ``getText()``.
        chrono_id: Next free numeric entity id.
        chrono_list: List that new entities are appended to (mutated in place).
        ref_list: Reference tokens, forwarded to ``hasSeasonOfYear``.

    Returns:
        The tuple ``(chrono_list, chrono_id)`` with the id advanced past every
        entity created here.
    """
    found, season, rel_start, rel_end = hasSeasonOfYear(s, ref_list)
    if not found:
        return chrono_list, chrono_id

    ref_Sspan, _ = s.getSpan()
    season_start = ref_Sspan + rel_start
    season_end = ref_Sspan + rel_end
    season_entity = chrono.ChronoSeasonOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=season_start,
        end_span=season_end,
        season_type=season)
    chrono_id += 1

    # Modifier check.  The operator is given the span of the season word
    # itself; the modifier offsets returned by hasNextLastThis are not used.
    hasMod, mod_type, mod_start, mod_end = hasNextLastThis(s)
    if hasMod:
        if mod_type == "This":
            operator_cls = chrono.ChronoThisOperator
        elif mod_type == "Next":
            operator_cls = chrono.ChronoNextOperator
        elif mod_type == "Last":
            operator_cls = chrono.ChronoLastOperator
        else:
            operator_cls = None

        if operator_cls is not None:
            chrono_list.append(
                operator_cls(
                    entityID=str(chrono_id) + "entity",
                    start_span=season_start,
                    end_span=season_end,
                    repeating_interval=season_entity.get_id()))
            chrono_id += 1

    # Number check.  The number is assumed to come before the season word.
    if rel_start > 0:
        prefix = s.getText()[0:rel_start]
        number_entity = None
        digits = re.search('([0-9]{1,2})', prefix)
        if digits is not None:
            number_entity = chrono.ChronoNumber(
                entityID=str(chrono_id) + "entity",
                start_span=ref_Sspan + digits.span(0)[0],
                end_span=ref_Sspan + digits.span(0)[1],
                value=digits.group(0))
        else:
            # No digits: fall back to a spelled-out number.  Its span is taken
            # to be the whole prefix minus one trailing character.
            text_value = utils.getNumberFromText(prefix)
            if text_value is not None:
                number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan,
                    end_span=ref_Sspan + (rel_start - 1),
                    value=text_value)

        if number_entity is not None:
            chrono_id += 1
            chrono_list.append(number_entity)
            season_entity.set_number(number_entity.get_id())

    chrono_list.append(season_entity)
    return chrono_list, chrono_id
def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build Period / Calendar-Interval entities for one phrase.

    The phrase is handled by the first rule that applies:

    1. A match whose text contains an "always an interval" term (yesterday,
       tomorrow, today, daily, /min, /week) becomes a calendar interval; a
       Last operator is added for "yesterday".
    2. A match whose value is ``"Unknown"`` becomes a period.
    3. Any other match is classified as a period (class 1) or a calendar
       interval, then linked to a preceding "this" token or, failing that,
       to a Next/Last modifier reported by ``hasNextLastThis``.
    4. With no direct match, ``hasEmbeddedPeriodInterval`` is tried and its
       match is classified the same way (without operator handling).

    For rules 3 and 4 the token directly before the interval word is then
    inspected for a number (digits first, otherwise a spelled-out number) and
    a linked ``ChronoNumber`` is created when one is found.

    Args:
        s: Phrase object providing ``getSpan()`` and ``getText()``.
        chrono_id: Next free numeric entity id.
        chrono_list: List that new entities are appended to (mutated in place).
        ref_list: Reference tokens providing ``getText()`` and ``getSpan()``.
        classifier: Pair ``(model, kind)``.  ``kind`` ``"NN"`` goes through
            ``ChronoKeras.keras_classify``, ``"SVM"``/``"RF"`` through
            ``model.predict``, anything else through ``model.classify``.
        feats: Feature template; copied before being handed to the feature
            extractor.

    Returns:
        The tuple ``(chrono_list, chrono_id)`` with the id advanced past every
        entity created here.
    """
    # NOTE(review): this file contains a second ``def buildPeriodInterval``
    # further down; the later definition rebinds the name, so this version is
    # shadowed at import time unless one of the two is renamed or removed.

    def _classify(feature_map):
        """Return the predicted class for *feature_map* (1 means period)."""
        model, kind = classifier[0], classifier[1]
        if kind == "NN":
            return ChronoKeras.keras_classify(
                model, np.array(list(feature_map.values())))
        if kind in ("SVM", "RF"):
            return model.predict([[int(v) for v in feature_map.values()]])[0]
        return model.classify(feature_map)

    features = feats.copy()
    ref_Sspan, ref_Espan = s.getSpan()
    boo, val, idxstart, idxend, plural = hasPeriodInterval(s)
    text = s.getText()

    # Rule 1: terms that are always marked as calendar intervals.
    if boo and re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            text):
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id += 1

        if re.search("yesterday|yesterdays", text):
            # The Last operator points back at the interval created just above.
            my_last_entity = chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                repeating_interval=str(chrono_id - 1) + "entity")
            chrono_id += 1
            chrono_list.append(my_last_entity)

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # Rule 2: terms that are always marked as periods.
    if boo and val == "Unknown":
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan + idxstart,
            end_span=ref_Sspan + idxend,
            period_type=val,
            number=None)
        chrono_id += 1
        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    if boo:
        # Rule 3: let the classifier decide between period and interval.
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, features)
        is_period = _classify(my_features) == 1

        if is_period:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
        chrono_id += 1

        # Look at the token in front of the interval word.  Only a real
        # predecessor counts: with ref_idx <= 0, ``ref_idx - 1`` would wrap
        # around to the end of ref_list and inspect an unrelated token.
        prior_ref = ref_list[ref_idx - 1] if ref_idx > 0 else None
        prior_tok = prior_ref.getText().lower() if prior_ref is not None else ""

        # Punctuation is replaced by blanks (not removed), so in effect only a
        # bare "this" token passes this comparison.
        if prior_tok.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation))) == "this":
            # Locate "this" inside the token (pattern first, then the text).
            start_span, end_span = re.search("this", prior_tok).span(0)
            prior_start, prior_end = prior_ref.getSpan()
            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start + start_span,
                end_span=prior_start + end_span)
            chrono_id += 1
            if is_period:
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
        else:
            # No "this" in front: check for a Next/Last modifier instead.
            hasMod, mod_type, mod_start, mod_end = hasNextLastThis(s)
            if hasMod:
                # Periods and intervals are linked through different keywords.
                if is_period:
                    link = {"period": my_entity.get_id()}
                else:
                    link = {"repeating_interval": my_entity.get_id()}

                if mod_type == "Next":
                    chrono_list.append(
                        chrono.ChronoNextOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            **link))
                    chrono_id += 1
                elif mod_type == "Last":
                    chrono_list.append(
                        chrono.ChronoLastOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            semantics="Interval-Not-Included",
                            **link))
                    chrono_id += 1
    else:
        # Rule 4: look for a period/interval embedded in a larger token.
        boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
        if not boo2:
            return chrono_list, chrono_id

        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # Index of the first reference token overlapping the match (-1 if none).
        ref_idx = -1
        for i, ref_tok in enumerate(ref_list):
            if utils.overlap(ref_tok.getSpan(), (abs_Sspan, abs_Espan)):
                ref_idx = i
                break

        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, features)
        if _classify(my_features) == 1:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val)
        chrono_id += 1

    # Number check (rules 3 and 4).  The number is assumed to sit in the token
    # directly before the interval word.  The previous reference token cannot
    # be used because phrases such as "42-year-old" are a single token, so the
    # span is worked out with index arithmetic on the phrase text.
    if idxstart > 0:
        prefix = text[:idxstart]
        # "six months" has a blank before the interval word, "six-months" has
        # another separator character that is still part of the last token.
        has_space = prefix.endswith(' ')
        trimmed = prefix.rstrip(' ')
        # Offset of the last blank-delimited token, measured in the untrimmed
        # phrase so that leading blanks cannot shift the resulting span.
        tok_start = trimmed.rfind(' ') + 1
        prevtok = trimmed[tok_start:]

        number_entity = None
        m = re.search('([0-9]{1,2})', prevtok)
        if m is not None:
            number_entity = chrono.ChronoNumber(
                entityID=str(chrono_id) + "entity",
                start_span=ref_Sspan + tok_start + m.span(0)[0],
                end_span=ref_Sspan + tok_start + m.span(0)[1],
                value=m.group(0))
        else:
            # No digits: fall back to a spelled-out number.
            texNumVal = utils.getNumberFromText(prevtok)
            if texNumVal is not None:
                num_start = ref_Sspan + tok_start
                num_end = num_start + len(prevtok)
                if not has_space:
                    # Drop the trailing separator (e.g. the "-" in "six-").
                    num_end -= 1
                number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=num_start,
                    end_span=num_end,
                    value=texNumVal)

        if number_entity is not None:
            chrono_id += 1
            chrono_list.append(number_entity)
            my_entity.set_number(number_entity.get_id())

    chrono_list.append(my_entity)
    return chrono_list, chrono_id
def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build Period / Calendar-Interval entities for one phrase.

    The phrase is handled by the first rule that applies:

    1. A match whose text contains an "always an interval" term (yesterday,
       tomorrow, today, daily, /min, /week) becomes a calendar interval; a
       Last operator is added for "yesterday".
    2. A match whose value is ``"Unknown"`` becomes a period.
    3. Any other match is classified as a period (class 1) or a calendar
       interval, linked to a preceding "this" token or, failing that, to a
       Next/Last modifier reported by ``hasModifier``.  A number found
       anywhere in the text before the interval word is attached.
    4. With no direct match, ``hasEmbeddedPeriodInterval`` is tried; its match
       is classified the same way and the number it reports is attached.

    Args:
        s: Phrase object providing ``getSpan()`` and ``getText()``.
        chrono_id: Next free numeric entity id.
        chrono_list: List that new entities are appended to (mutated in place).
        ref_list: Reference tokens providing ``getText()`` and ``getSpan()``.
        classifier: Pair ``(model, kind)``.  ``kind`` ``"NN"`` goes through
            ``ChronoKeras.keras_classify``, ``"SVM"``/``"RF"`` through
            ``model.predict``, anything else through ``model.classify``.
        feats: Feature template; copied before being handed to the feature
            extractor.

    Returns:
        The tuple ``(chrono_list, chrono_id)`` with the id advanced past every
        entity created here.
    """
    # NOTE(review): an earlier ``def buildPeriodInterval`` exists in this
    # file; this later definition rebinds the name and is the one callers get.

    def _classify(feature_map):
        """Return the predicted class for *feature_map* (1 means period)."""
        model, kind = classifier[0], classifier[1]
        if kind == "NN":
            return ChronoKeras.keras_classify(
                model, np.array(list(feature_map.values())))
        if kind in ("SVM", "RF"):
            return model.predict([[int(v) for v in feature_map.values()]])[0]
        return model.classify(feature_map)

    features = feats.copy()
    ref_Sspan, ref_Espan = s.getSpan()
    boo, val, idxstart, idxend, plural = hasPeriodInterval(s)
    text = s.getText()

    # Rule 1: terms that are always marked as calendar intervals.
    if boo and re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            text):
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id += 1

        if re.search("yesterday|yesterdays", text):
            # The Last operator points back at the interval created just above.
            my_last_entity = chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                repeating_interval=str(chrono_id - 1) + "entity")
            chrono_id += 1
            chrono_list.append(my_last_entity)

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # Rule 2: terms that are always marked as periods.
    if boo and val == "Unknown":
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan + idxstart,
            end_span=ref_Sspan + idxend,
            period_type=val,
            number=None)
        chrono_id += 1
        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    if boo:
        # Rule 3: let the classifier decide between period and interval.
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, features)
        is_period = _classify(my_features) == 1

        if is_period:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
        chrono_id += 1

        # Look at the token in front of the interval word.  Only a real
        # predecessor counts: with ref_idx <= 0, ``ref_idx - 1`` would wrap
        # around to the end of ref_list and inspect an unrelated token.
        prior_ref = ref_list[ref_idx - 1] if ref_idx > 0 else None
        prior_tok = prior_ref.getText().lower() if prior_ref is not None else ""

        # Punctuation is replaced by blanks (not removed), so in effect only a
        # bare "this" token passes this comparison.
        if prior_tok.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation))) == "this":
            # Locate "this" inside the token (pattern first, then the text).
            start_span, end_span = re.search("this", prior_tok).span(0)
            prior_start, prior_end = prior_ref.getSpan()
            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start + start_span,
                end_span=prior_start + end_span)
            chrono_id += 1
            if is_period:
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
        else:
            # No "this" in front: check for a Next/Last modifier instead.
            hasMod, mod_type, mod_start, mod_end = hasModifier(s)
            if hasMod:
                # Periods and intervals are linked through different keywords.
                if is_period:
                    link = {"period": my_entity.get_id()}
                else:
                    link = {"repeating_interval": my_entity.get_id()}

                if mod_type == "Next":
                    chrono_list.append(
                        chrono.ChronoNextOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            **link))
                    chrono_id += 1
                elif mod_type == "Last":
                    chrono_list.append(
                        chrono.ChronoLastOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            semantics="Interval-Not-Included",
                            **link))
                    chrono_id += 1

        # Number check.  The number is assumed to come before the interval
        # word; the whole preceding text is searched.
        if idxstart > 0:
            substr = text[0:idxstart]
            number_entity = None
            m = re.search('([0-9]{1,2})', substr)
            if m is not None:
                number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + m.span(0)[0],
                    end_span=ref_Sspan + m.span(0)[1],
                    value=m.group(0))
            else:
                # No digits: fall back to a spelled-out number.  Its span is
                # the whole prefix minus one trailing character.
                texNumVal = utils.getNumberFromText(substr)
                if texNumVal is not None:
                    number_entity = chrono.ChronoNumber(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan,
                        end_span=ref_Sspan + (idxstart - 1),
                        value=texNumVal)

            if number_entity is not None:
                chrono_id += 1
                chrono_list.append(number_entity)
                my_entity.set_number(number_entity.get_id())

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # Rule 4: look for a period/interval embedded in a larger token.
    boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
    if not boo2:
        return chrono_list, chrono_id

    abs_Sspan = ref_Sspan + idxstart
    abs_Espan = ref_Sspan + idxend

    # Index of the first reference token overlapping the match (-1 if none).
    ref_idx = -1
    for i, ref_tok in enumerate(ref_list):
        if utils.overlap(ref_tok.getSpan(), (abs_Sspan, abs_Espan)):
            ref_idx = i
            break

    my_features = utils.extract_prediction_features(ref_list, ref_idx, features)
    # Same classifier dispatch as rule 3, including the "SVM"/"RF" path.
    if _classify(my_features) == 1:
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=getPeriodValue(val),
            number=None)
    else:
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val)
    chrono_id += 1

    # Extract the number and identify its span in the first part of the phrase.
    substr = text[:idxstart]
    number_entity = None
    m = re.search('([0-9]{1,2})', substr)
    if m is not None:
        number_entity = chrono.ChronoNumber(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan + m.span(0)[0],
            end_span=ref_Sspan + m.span(0)[1],
            value=m.group(0))
    else:
        texNumVal = utils.getNumberFromText(numstr)
        if texNumVal is not None:
            # numstr is literal phrase text, so escape it before using it as
            # a pattern to find its position.
            m = re.search(re.escape(numstr), substr)
            if m is not None:
                number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + m.span(0)[0],
                    end_span=ref_Sspan + m.span(0)[1],
                    value=texNumVal)

    if number_entity is not None:
        chrono_id += 1
        chrono_list.append(number_entity)
        my_entity.set_number(number_entity.get_id())

    chrono_list.append(my_entity)
    return chrono_list, chrono_id
def buildDoseDuration(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build a dose-duration entity (plus linked operators) for one phrase.

    The phrase is first screened: it is rejected (nothing is added) when it
    is empty, when its first word satisfies ``isDoseDuration``, when it
    contains any of a list of disqualifying substrings, or when no reference
    token matching one of its words is numeric.  Surviving phrases are
    matched with ``hasDoseDuration`` or, failing that, with
    ``hasEmbeddedPeriodInterval``; the classifier result only selects how the
    ``dose_type`` value is derived and how operators are linked.

    Args:
        s: Phrase object providing ``getSpan()`` and ``getText()``.
        chrono_id: Next free numeric entity id.
        chrono_list: List that new entities are appended to (mutated in place).
        ref_list: Reference tokens providing ``getText()``, ``getSpan()`` and
            ``isNumeric()``.
        classifier: Pair ``(model, kind)``.  ``kind`` ``"NN"`` goes through
            ``ChronoKeras.keras_classify``, ``"SVM"``/``"RF"`` through
            ``model.predict``, anything else through ``model.classify``.
        feats: Feature template; copied before being handed to the feature
            extractor.

    Returns:
        The tuple ``(chrono_list, chrono_id)``.
    """

    def _classify(feature_map):
        """Return the predicted class for *feature_map*."""
        model, kind = classifier[0], classifier[1]
        if kind == "NN":
            return ChronoKeras.keras_classify(
                model, np.array(list(feature_map.values())))
        if kind in ("SVM", "RF"):
            return model.predict([[int(v) for v in feature_map.values()]])[0]
        return model.classify(feature_map)

    features = feats.copy()
    ref_Sspan, ref_Espan = s.getSpan()
    text = s.getText()
    lowered = text.lower()
    parts = text.split()
    bad = re.compile(r"^q\d|^Q\d")

    # --- Screening: make sure the phrase really is a dose duration. ---
    # An empty or blank phrase has no first word to test.
    if not parts:
        return chrono_list, chrono_id
    if isDoseDuration(parts[0]):
        return chrono_list, chrono_id
    if bad.match(text):
        return chrono_list, chrono_id
    # Case-insensitive disqualifiers.
    if any(word in lowered
           for word in ("every", "time", "per", "once", "twice", "past", "ago")):
        return chrono_list, chrono_id
    # Case-sensitive disqualifiers.
    if any(marker in text
           for marker in ("/", "[**", "**]", "ly", "(", ")", "RANDOM")):
        return chrono_list, chrono_id

    # Every reference token whose text equals a word of the phrase must be
    # numeric or a dose-duration word, and at least one must be numeric.
    containsnum = False
    for part in parts:
        part_lower = part.lower()
        for ref in ref_list:
            ref_text = ref.getText().lower()
            if ref_text == part_lower:
                if ref.isNumeric():
                    containsnum = True
                    break
                elif not tt.hasDoseDuration(ref_text):
                    return chrono_list, chrono_id
    if not containsnum:
        return chrono_list, chrono_id

    boo, val, idxstart, idxend, plural = hasDoseDuration(s)
    if boo:
        # The entity span starts at the beginning of the phrase, not at the
        # matched unit word.
        abs_Sspan = ref_Sspan
        abs_Espan = ref_Sspan + idxend
        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, features)
        is_period = _classify(my_features) == 1

        my_entity = chrono.ChronoDoseDurationEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            dose_type=getDoseDurationValue(val) if is_period else val,
            number=None,
            text=text)
        chrono_id += 1

        # Look at the token in front of the match.  Only a real predecessor
        # counts: with ref_idx <= 0, ``ref_idx - 1`` would wrap around to the
        # end of ref_list and inspect an unrelated token.
        prior_ref = ref_list[ref_idx - 1] if ref_idx > 0 else None
        prior_tok = prior_ref.getText().lower() if prior_ref is not None else ""

        # Punctuation is replaced by blanks (not removed), so in effect only a
        # bare "this" token passes this comparison.
        if prior_tok.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation))) == "this":
            # Locate "this" inside the token (pattern first, then the text).
            start_span, end_span = re.search("this", prior_tok).span(0)
            prior_start, prior_end = prior_ref.getSpan()
            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start + start_span,
                end_span=prior_start + end_span)
            chrono_id += 1
            if is_period:
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
        else:
            # No "this" in front: check for a Next/Last modifier instead.
            hasMod, mod_type, mod_start, mod_end = hasModifier(s)
            if hasMod:
                if is_period:
                    link = {"period": my_entity.get_id()}
                else:
                    link = {"repeating_interval": my_entity.get_id()}

                if mod_type == "Next":
                    chrono_list.append(
                        chrono.ChronoNextOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            **link))
                    chrono_id += 1
                elif mod_type == "Last":
                    chrono_list.append(
                        chrono.ChronoLastOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            semantics="Interval-Not-Included",
                            **link))
                    chrono_id += 1

        # Number check.  The number is assumed to come before the unit word.
        # NOTE(review): the number entity is linked through set_number but is
        # neither appended to chrono_list nor given its own id (chrono_id is
        # not advanced), so its id equals the one the next entity will
        # receive.  The append/increment were commented out in the original;
        # confirm this is intended.
        if idxstart > 0:
            substr = text[0:idxstart]
            m = re.search('([0-9]{1,2})', substr)
            if m is not None:
                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + m.span(0)[0],
                    end_span=ref_Sspan + m.span(0)[1],
                    value=m.group(0))
                my_entity.set_number(my_number_entity.get_id())
            else:
                # No digits: fall back to a spelled-out number.
                texNumVal = utils.getNumberFromText(substr)
                if texNumVal is not None:
                    my_number_entity = chrono.ChronoNumber(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan,
                        end_span=ref_Sspan + (idxstart - 1),
                        value=texNumVal)
                    my_entity.set_number(my_number_entity.get_id())

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # No direct match: look for a unit embedded in a larger token.
    boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
    if not boo2:
        return chrono_list, chrono_id

    abs_Sspan = ref_Sspan
    abs_Espan = ref_Sspan + idxend

    # Index of the first reference token overlapping the match (-1 if none).
    ref_idx = -1
    for i, ref_tok in enumerate(ref_list):
        if utils.overlap(ref_tok.getSpan(), (abs_Sspan, abs_Espan)):
            ref_idx = i
            break

    my_features = utils.extract_prediction_features(ref_list, ref_idx, features)
    # Same classifier dispatch as the direct-match path, including "SVM"/"RF".
    is_period = _classify(my_features) == 1
    my_entity = chrono.ChronoDoseDurationEntity(
        entityID=str(chrono_id) + "entity",
        start_span=abs_Sspan,
        end_span=abs_Espan,
        dose_type=getDoseDurationValue(val) if is_period else val,
        number=None,
        text=text)
    chrono_id += 1

    # Extract the number and identify its span in the first part of the
    # phrase.  NOTE(review): as in the direct-match path, the number entity is
    # only linked, never appended or given its own id — confirm.
    substr = text[:idxstart]
    m = re.search('([0-9]{1,2})', substr)
    if m is not None:
        my_number_entity = chrono.ChronoNumber(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan + m.span(0)[0],
            end_span=ref_Sspan + m.span(0)[1],
            value=m.group(0))
        my_entity.set_number(my_number_entity.get_id())
    else:
        texNumVal = utils.getNumberFromText(numstr)
        if texNumVal is not None:
            # numstr is literal phrase text, so escape it before using it as
            # a pattern to find its position.
            m = re.search(re.escape(numstr), substr)
            if m is not None:
                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + m.span(0)[0],
                    end_span=ref_Sspan + m.span(0)[1],
                    value=texNumVal)
                my_entity.set_number(my_number_entity.get_id())

    chrono_list.append(my_entity)
    return chrono_list, chrono_id