def buildMonthOfYear(s, chrono_id, chrono_list, flags):
    """Add a month-of-year entity for a numeric month found in the phrase.

    ``hasMonthOfYear(s)`` supplies a found-indicator, the matched text and the
    match offsets relative to the phrase.  Nothing happens when no month is
    found or when ``flags["month"]`` is already set.  Otherwise the flag is
    set and, if the matched text converts to an integer no greater than 12, a
    ``chronoMonthOfYearEntity`` is appended to ``chrono_list``.

    Args:
        s: phrase object; only ``getSpan()`` is used here directly.
        chrono_id: numeric id used to label the next entity.
        chrono_list: list receiving new entities (mutated in place).
        flags: dict of "already handled" markers; ``flags["month"]`` may be
            switched to True.

    Returns:
        ``(chrono_list, chrono_id, flags)``; ``chrono_id`` is advanced by one
        only when an entity was created.
    """
    found, month_text, rel_start, rel_end = hasMonthOfYear(s)
    if not found or flags["month"]:
        return chrono_list, chrono_id, flags

    # The flag is raised as soon as a month is detected, before validation.
    flags["month"] = True
    phrase_start, _phrase_end = s.getSpan()
    month_start = phrase_start + rel_start
    month_end = month_start + abs(rel_end - rel_start)

    if int(month_text) > 12:
        return chrono_list, chrono_id, flags

    month_entity = chrono.chronoMonthOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=month_start,
        end_span=month_end,
        month_type=calendar.month_name[utils.getMonthNumber(month_text)])
    chrono_list.append(month_entity)
    return chrono_list, chrono_id + 1, flags
def _in_year_window(year):
    """Return True when *year* is a number within 1500-2050 inclusive."""
    return year is not None and 1500 <= year <= 2050


def _is_valid_month_day(month, day):
    """Return True when *month* is 1-12 and *day* is 1-31; None never passes."""
    return (month is not None and day is not None
            and 1 <= month <= 12 and 1 <= day <= 31)


def _append_year_month_day(chrono_list, chrono_id, year_factory, token_start,
                           year, month, day, year_at, month_at, day_at):
    """Append linked year -> month -> day entities and return the next id.

    ``year_at``, ``month_at`` and ``day_at`` are ``(start, end)`` character
    offsets inside the token; ``token_start`` is the token's absolute offset.
    Ids are consumed in the order year, month, day and the entities are
    appended in that same order.
    """
    year_entity = year_factory(
        entityID=str(chrono_id) + "entity",
        start_span=token_start + year_at[0],
        end_span=token_start + year_at[1],
        value=year)
    chrono_id = chrono_id + 1

    month_entity = chrono.chronoMonthOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=token_start + month_at[0],
        end_span=token_start + month_at[1],
        month_type=calendar.month_name[month])
    chrono_id = chrono_id + 1
    year_entity.set_sub_interval(month_entity.get_id())

    day_entity = chrono.ChronoDayOfMonthEntity(
        entityID=str(chrono_id) + "entity",
        start_span=token_start + day_at[0],
        end_span=token_start + day_at[1],
        value=day)
    chrono_id = chrono_id + 1
    month_entity.set_sub_interval(day_entity.get_id())

    chrono_list.extend((year_entity, month_entity, day_entity))
    return chrono_id


def buildNumericDate(s, chrono_id, chrono_list, flags):
    """Parse purely numeric date tokens (lone years and condensed dates).

    The phrase text is stripped of leading/trailing ``.`` and ``,``, split on
    single spaces and each lower-cased token is inspected by length:

    * 4 characters: a number in 1500-2050 becomes a ``ChronoYearEntity``
      unless ``flags["fourdigityear"]`` or ``flags["loneDigitYear"]`` is
      already set.  24-hour times in that range are therefore read as years;
      a token with a time zone attached (e.g. ``1800EDT``) is longer than four
      characters and is not handled here.
    * 8 characters: tried as ``yyyymmdd`` and, if the ranges do not fit, as
      ``mmddyyyy``.
    * 6 characters: tried as ``mmddyy`` and, if the ranges do not fit, as
      ``yymmdd``.  Dates such as 12-01-2006 (``120106`` vs ``061201``) cannot
      be told apart; the first layout that fits wins.

    Compared with the previous revision this version
      - uses the ``mmddyyyy`` values for the year and day entities (it used to
        reuse the values of the failed ``yyyymmdd`` parse),
      - never compares ``None`` with an integer,
      - rejects month/day ``0`` (``calendar.month_name[0]`` is an empty
        string),
      - anchors every span at the token's own position in the phrase instead
        of assuming the token starts the phrase.

    Args:
        s: phrase object exposing ``getText()`` and ``getSpan()``.
        chrono_id: numeric id used to label the next entity.
        chrono_list: list receiving new entities (mutated in place).
        flags: dict of "already handled" markers; ``flags["loneDigitYear"]``
            may be switched to True.

    Returns:
        ``(chrono_list, chrono_id, flags)`` with ``chrono_id`` advanced once
        per created entity.
    """
    raw_text = s.getText()
    # Offset of the first token: whatever leading "." / "," strip() removes.
    next_offset = len(raw_text) - len(raw_text.lstrip(".,"))

    for raw_token in raw_text.strip(".,").split(" "):
        token_offset = next_offset
        # split(" ") consumes exactly one space between neighbouring tokens.
        next_offset += len(raw_token) + 1
        text = raw_token.lower()

        if len(text) == 4:
            num = utils.getNumberFromText(text)
            if (_in_year_window(num) and not flags["fourdigityear"]
                    and not flags["loneDigitYear"]):
                flags["loneDigitYear"] = True
                ref_start, _ref_end = s.getSpan()
                token_start = ref_start + token_offset
                chrono_list.append(
                    chrono.ChronoYearEntity(
                        entityID=str(chrono_id) + "entity",
                        start_span=token_start,
                        end_span=token_start + len(raw_token),
                        value=num))
                chrono_id = chrono_id + 1

        ## condensed date formats like 19980303 or 03031998
        elif len(text) == 8 and utils.getNumberFromText(text) is not None:
            # Layout yyyymmdd
            y = utils.getNumberFromText(text[0:4])
            m = utils.getNumberFromText(text[4:6])
            d = utils.getNumberFromText(text[6:8])
            if y is None:
                # As before, no fallback is attempted without a leading number.
                continue
            if _in_year_window(y) and _is_valid_month_day(m, d):
                ref_start, _ref_end = s.getSpan()
                chrono_id = _append_year_month_day(
                    chrono_list, chrono_id, chrono.ChronoYearEntity,
                    ref_start + token_offset, y, m, d,
                    year_at=(0, 4), month_at=(4, 6), day_at=(6, 8))
            else:
                # Layout mmddyyyy
                y2 = utils.getNumberFromText(text[4:8])
                m2 = utils.getNumberFromText(text[0:2])
                d2 = utils.getNumberFromText(text[2:4])
                if _in_year_window(y2) and _is_valid_month_day(m2, d2):
                    ref_start, _ref_end = s.getSpan()
                    chrono_id = _append_year_month_day(
                        chrono_list, chrono_id, chrono.ChronoYearEntity,
                        ref_start + token_offset, y2, m2, d2,
                        year_at=(4, 8), month_at=(0, 2), day_at=(2, 4))

        ## condensed date formats like 030399 or 990303
        elif len(text) == 6 and utils.getNumberFromText(text) is not None:
            # Layout mmddyy
            y = utils.getNumberFromText(text[4:6])
            m = utils.getNumberFromText(text[0:2])
            d = utils.getNumberFromText(text[2:4])
            if y is None or m is None or d is None:
                # As before, no fallback is attempted if any part is missing.
                continue
            if _is_valid_month_day(m, d):
                ref_start, _ref_end = s.getSpan()
                chrono_id = _append_year_month_day(
                    chrono_list, chrono_id, chrono.ChronoTwoDigitYearOperator,
                    ref_start + token_offset, y, m, d,
                    year_at=(4, 6), month_at=(0, 2), day_at=(2, 4))
            else:
                # Layout yymmdd
                y2 = utils.getNumberFromText(text[0:2])
                m2 = utils.getNumberFromText(text[2:4])
                d2 = utils.getNumberFromText(text[4:6])
                if y2 is not None and _is_valid_month_day(m2, d2):
                    ref_start, _ref_end = s.getSpan()
                    chrono_id = _append_year_month_day(
                        chrono_list, chrono_id,
                        chrono.ChronoTwoDigitYearOperator,
                        ref_start + token_offset, y2, m2, d2,
                        year_at=(0, 2), month_at=(2, 4), day_at=(4, 6))

    return chrono_list, chrono_id, flags


####
# END_MODULE
####
def build2DigitYear(s, chrono_id, chrono_list, flags):
    """Build a two-digit-year operator plus any finer units in the same phrase.

    When ``has2DigitYear(s)`` reports a match and ``flags["fourdigityear"]`` is
    not set, a ``ChronoTwoDigitYearOperator`` is created.  The phrase is then
    probed, coarse to fine, for month, day, hour, minute and second.  Each
    unit that is found, not already flagged and within range becomes an
    entity registered as the sub-interval of the next coarser one.

    Descent stops at the first unit that is missing, already flagged or out
    of range, because every finer entity must hang off its parent.  The
    previous revision kept going after a failed range check and then used an
    entity that had never been created (``UnboundLocalError``).

    Entities are appended to ``chrono_list`` finest first (second, minute,
    hour, day, month, year), matching the established output order.

    Args:
        s: phrase object; ``getSpan()`` is used here and ``s`` is handed to
            the ``has*`` detectors.
        chrono_id: numeric id used to label the next entity.
        chrono_list: list receiving new entities (mutated in place).
        flags: dict of "already handled" markers; the ``month``, ``day``,
            ``hour``, ``minute`` and ``second`` entries are set to True as
            soon as the respective unit is detected, before range validation.

    Returns:
        ``(chrono_list, chrono_id, flags)`` with ``chrono_id`` advanced once
        per created entity.
    """
    found, year_text, rel_start, rel_end = has2DigitYear(s)
    if not found or flags["fourdigityear"]:
        return chrono_list, chrono_id, flags

    # In most cases the year sits at the end of the phrase span.
    phrase_start, _phrase_end = s.getSpan()

    def absolute(start_idx, end_idx):
        """Translate phrase-relative offsets into absolute span bounds."""
        begin = phrase_start + start_idx
        return begin, begin + abs(end_idx - start_idx)

    span_start, span_end = absolute(rel_start, rel_end)
    year_entity = chrono.ChronoTwoDigitYearOperator(
        entityID=str(chrono_id) + "entity",
        start_span=span_start,
        end_span=span_end,
        value=year_text)
    chrono_id = chrono_id + 1

    # Coarse-to-fine chain; chain[-1] is always the parent of the next unit.
    chain = [year_entity]

    found, month_text, rel_start, rel_end = hasMonthOfYear(s)
    if found and not flags["month"]:
        flags["month"] = True
        month_number = utils.getMonthNumber(month_text)
        # calendar.month_name[0] is "", so 0 is rejected along with None.
        if month_number is not None and 1 <= month_number <= 12:
            span_start, span_end = absolute(rel_start, rel_end)
            month_entity = chrono.chronoMonthOfYearEntity(
                entityID=str(chrono_id) + "entity",
                start_span=span_start,
                end_span=span_end,
                month_type=calendar.month_name[month_number])
            chrono_id = chrono_id + 1
            year_entity.set_sub_interval(month_entity.get_id())
            chain.append(month_entity)

            # (flag key, detector, inclusive upper bound, entity class)
            finer_units = (
                ("day", hasDayOfMonth, 31, chrono.ChronoDayOfMonthEntity),
                ("hour", hasHourOfDay, 24, chrono.ChronoHourOfDayEntity),
                ("minute", hasMinuteOfHour, 60,
                 chrono.ChronoMinuteOfHourEntity),
                ("second", hasSecondOfMinute, 60,
                 chrono.ChronoSecondOfMinuteEntity),
            )
            for flag_key, detector, upper_bound, entity_cls in finer_units:
                found, unit_text, rel_start, rel_end = detector(s)
                if not found or flags[flag_key]:
                    break
                flags[flag_key] = True
                unit_value = int(unit_text)
                if unit_value > upper_bound:
                    break
                span_start, span_end = absolute(rel_start, rel_end)
                unit_entity = entity_cls(
                    entityID=str(chrono_id) + "entity",
                    start_span=span_start,
                    end_span=span_end,
                    value=unit_value)
                chrono_id = chrono_id + 1
                chain[-1].set_sub_interval(unit_entity.get_id())
                chain.append(unit_entity)

    # Finest unit first, the year operator last.
    chrono_list.extend(reversed(chain))
    return chrono_list, chrono_id, flags
def buildTextMonthAndDay(s, chrono_id, chrono_list, flags, dct=None,
                         ref_list=None):
    """Build a month entity from a textual month plus nearby day/year/modifier.

    Nothing happens unless ``hasTextMonth(s, ref_list)`` reports a match and
    ``flags["month"]`` is not yet set.  Otherwise:

    1. A ``chronoMonthOfYearEntity`` is created for the matched month.
    2. Text after the month (if any) is stripped of ``,``/``.`` and blanks.
       * If the whole remainder is a number: a value <= 31 becomes a day
         entity when ``flags["day"]`` is unset (the flag is then set); else a
         value in 1500-2050 becomes a year entity when neither
         ``flags["fourdigityear"]`` nor ``flags["loneDigitYear"]`` is set
         (``fourdigityear`` is then set) with the month as its sub-interval.
       * Otherwise punctuation is replaced by blanks and every whitespace
         token is examined with the same numeric rules, except that the day
         rule neither checks nor sets ``flags["day"]`` on this path.
    3. If the month does not start the phrase, ``hasModifier(s)`` is consulted
       and a This/Next/Last operator referring to the month may be appended.
    4. The month entity itself is appended last.

    NOTE: the earlier text of this function carried two ``if False:`` blocks
    (unreachable) that compared the parsed month/day in ``dct.year`` against
    ``dct`` and appended a Next or Last operator over the month span.  They
    never executed and are not reproduced; ``dct`` is consequently unused.

    Args:
        s: phrase object exposing ``getText()`` and ``getSpan()``.
        chrono_id: numeric id used to label the next entity.
        chrono_list: list receiving new entities (mutated in place).
        flags: dict of "already handled" markers (see above).
        dct: accepted for interface compatibility; not read.
        ref_list: forwarded unchanged to ``hasTextMonth``.

    Returns:
        ``(chrono_list, chrono_id, flags)`` with ``chrono_id`` advanced once
        per created entity.
    """
    found, month_type, month_from, month_to = hasTextMonth(s, ref_list)
    if not found or flags["month"]:
        return chrono_list, chrono_id, flags

    flags["month"] = True
    phrase_start, _phrase_end = s.getSpan()
    phrase_text = s.getText()

    month_entity = chrono.chronoMonthOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=phrase_start + month_from,
        end_span=phrase_start + month_to,
        month_type=month_type)
    chrono_id = chrono_id + 1

    def located_entity(entity_cls, needle, value, entity_no):
        """Create *entity_cls* spanning where *needle* sits in the phrase."""
        rel_from, rel_to = calculateSpan(phrase_text, needle)
        return entity_cls(
            entityID=str(entity_no) + "entity",
            start_span=phrase_start + rel_from,
            end_span=phrase_start + rel_to,
            value=value)

    # month_to is the index just past the month; anything beyond it is
    # trailing text that may hold a day and/or a year.
    if month_to < len(phrase_text):
        trailing = phrase_text[month_to:].strip(",.").strip()
        whole_number = utils.getNumberFromText(trailing)

        if whole_number is not None:
            # The entire remainder is a single number.
            if whole_number <= 31 and not flags["day"]:
                flags["day"] = True
                # The day span is located via the number's string form.
                chrono_list.append(
                    located_entity(chrono.ChronoDayOfMonthEntity,
                                   str(whole_number), whole_number,
                                   chrono_id))
                chrono_id = chrono_id + 1
            elif (1500 <= whole_number <= 2050
                  and not flags["fourdigityear"]
                  and not flags["loneDigitYear"]):
                flags["fourdigityear"] = True
                year_entity = located_entity(
                    chrono.ChronoYearEntity, trailing, whole_number,
                    chrono_id)
                chrono_list.append(year_entity)
                year_entity.set_sub_interval(month_entity.get_id())
                chrono_id = chrono_id + 1
        else:
            # Mixed remainder: blank out punctuation and inspect each token.
            blanked = trailing.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation)))
            for token in WhitespaceTokenizer().tokenize(blanked):
                token_number = utils.getNumberFromText(token)
                if token_number is None:
                    continue
                if token_number <= 31:
                    # flags["day"] is intentionally not consulted here.
                    chrono_list.append(
                        located_entity(chrono.ChronoDayOfMonthEntity, token,
                                       token_number, chrono_id))
                    chrono_id = chrono_id + 1
                elif (1500 <= token_number <= 2050
                      and not flags["fourdigityear"]
                      and not flags["loneDigitYear"]):
                    flags["fourdigityear"] = True
                    year_entity = located_entity(
                        chrono.ChronoYearEntity, token, token_number,
                        chrono_id)
                    chrono_list.append(year_entity)
                    year_entity.set_sub_interval(month_entity.get_id())
                    chrono_id = chrono_id + 1

    # A month that does not open the phrase may be preceded by a modifier.
    if month_from > 0:
        has_mod, mod_type, mod_from, mod_to = hasModifier(s)
        if has_mod:
            operator_cls = None
            extra_kwargs = {}
            if mod_type == "This":
                operator_cls = chrono.ChronoThisOperator
            elif mod_type == "Next":
                operator_cls = chrono.ChronoNextOperator
            elif mod_type == "Last":
                operator_cls = chrono.ChronoLastOperator
                extra_kwargs = {"semantics": "Interval-Not-Included"}

            if operator_cls is not None:
                chrono_list.append(
                    operator_cls(
                        entityID=str(chrono_id) + "entity",
                        start_span=phrase_start + mod_from,
                        end_span=phrase_start + mod_to,
                        repeating_interval=month_entity.get_id(),
                        **extra_kwargs))
                chrono_id = chrono_id + 1

    chrono_list.append(month_entity)
    return chrono_list, chrono_id, flags