def getTransaction(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the transaction request.

    INPUT: booking, cancel, status
    OUTPUT: book, cancel, status
    '''
    logger.info("ENTERING TRANSACTION IDENTIFICATION MODULE")
    for key in extendedNERAnalyzedParse.keys():
        word = extendedNERAnalyzedParse[key]["word"]
        lemma = extendedNERAnalyzedParse[key]["lemma"]
        transaction = getTransactionItems(lemma)
        if transaction:
            annotateParse(extendedNERAnalyzedParse, key, transaction,
                          transaction, transactionMarker)
            logger.debug("Transaction Identified = '%s' from word = '%s'",
                         transaction, word)
    logger.info("TRANSACTION IDENTIFICATION DONE")
def findCity(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the city code corresponding
    to the city at each city token. The city list is loaded from the
    database matching the first token of the city name.

    INPUT: new delhi, new francisco, mumbai
    OUTPUT: DEL, no, BOM
    '''
    logger.info("ENTERING CITY IDENTIFICATION MODULE")
    ner_city = []
    for key in extendedNERAnalyzedParse.keys():
        if extendedNERAnalyzedParse[key]["NER"] == "0" and \
                extendedNERAnalyzedParse[key]["POS"] in ["NN", "NNP", "JJ"]:
            token = extendedNERAnalyzedParse[key]["word"]
            matched_cities = findCityByName(token)
            for city_tuple in matched_cities:
                city_code = city_tuple[0]
                city = city_tuple[1]
                if matchLongestString(extendedNERAnalyzedParse, key, city):
                    annotateParse(extendedNERAnalyzedParse, key, city,
                                  city_code, cityMarker)
                    ner_city.append(city_code)
                    logger.debug("City Identified = '%s' with code = '%s'",
                                 city, city_code)
    logger.info("CITY IDENTIFICATION DONE")
    return ner_city
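# A minimal usage sketch for findCity (hypothetical data; a real parse comes
# from the CoreNLP wrapper, and findCityByName hits the city database). Keys
# are 1-based token indices, matching the key arithmetic used elsewhere:
#
#   sample_parse = {
#       1: {"word": "new", "lemma": "new", "POS": "JJ", "NER": "0",
#           "NormalizedNER": "0"},
#       2: {"word": "delhi", "lemma": "delhi", "POS": "NNP", "NER": "0",
#           "NormalizedNER": "0"},
#   }
#   findCity(sample_parse)   # -> ["DEL"], tokens annotated with cityMarker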
def getTransport(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the transport type.

    INPUT: flight, air, bus, rail
    OUTPUT: flight, flight, bus, train
    '''
    logger.info("ENTERING TRANSPORT IDENTIFICATION MODULE")
    for key in extendedNERAnalyzedParse.keys():
        word = extendedNERAnalyzedParse[key]["word"]
        lemma = extendedNERAnalyzedParse[key]["lemma"]
        transport = getTransportItems(lemma)
        if transport:
            annotateParse(extendedNERAnalyzedParse, key, transport,
                          transport, transportMarker)
            logger.debug("Transport Identified = '%s' from word = '%s'",
                         transport, word)
    logger.info("TRANSPORT IDENTIFICATION DONE")
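# Usage sketch for the two lexicon-driven annotators above (hypothetical
# parse; getTransactionItems/getTransportItems look the lemma up in the DB):
#
#   sample_parse = {
#       1: {"word": "book", "lemma": "book", "POS": "VB", "NER": "0",
#           "NormalizedNER": "0"},
#       2: {"word": "a", "lemma": "a", "POS": "DT", "NER": "0",
#           "NormalizedNER": "0"},
#       3: {"word": "flight", "lemma": "flight", "POS": "NN", "NER": "0",
#           "NormalizedNER": "0"},
#   }
#   getTransaction(sample_parse)   # token 1 -> transactionMarker, "book"
#   getTransport(sample_parse)     # token 3 -> transportMarker, "flight"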
def getOtherPreference(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates user preferences such as
    shortest, quickest, or cheapest flights.

    INPUT: shortest, quickest, cheapest flights
    OUTPUT: fastest, fastest, cheapest
    '''
    logger.info("ENTERING OTHER PREFERENCES IDENTIFICATION MODULE")
    for key in extendedNERAnalyzedParse.keys():
        token = extendedNERAnalyzedParse[key]["word"]
        matched_preferences = findPreferences(token)
        for preference_tuple in matched_preferences:
            preference_name = preference_tuple[0]
            preference_type = preference_tuple[1]
            if matchLongestString(extendedNERAnalyzedParse, key,
                                  preference_name):
                annotateParse(extendedNERAnalyzedParse, key, preference_name,
                              preference_type, otherPreferenceMarker)
                logger.debug(
                    "Other preferences Identified = '%s' with type = '%s'",
                    preference_name, preference_type)
    logger.info("OTHER PREFERENCES IDENTIFICATION DONE")
def findOrganization(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the organization at each
    organization token. The organization list is loaded from the database
    matching the first token of the organization name.

    INPUT: Indigo, spicejet, mumbai
    OUTPUT: Indigo, spicejet, no
    '''
    logger.info("ENTERING ORGANIZATION IDENTIFICATION MODULE")
    ner_organization = []
    for key in extendedNERAnalyzedParse.keys():
        if extendedNERAnalyzedParse[key]["POS"] in ["NN", "NNP", "JJ"]:
            token = extendedNERAnalyzedParse[key]["word"]
            matched_organization = findOrganizationByName(token)
            for organization_tuple in matched_organization:
                organization = organization_tuple[1]
                if matchLongestString(extendedNERAnalyzedParse, key,
                                      organization):
                    annotateParse(extendedNERAnalyzedParse, key, organization,
                                  organization, organizationMarker)
                    ner_organization.append(organization)
                    logger.debug("Organization Identified = '%s'",
                                 organization)
    logger.info("ORGANIZATION IDENTIFICATION DONE")
    return ner_organization
def getNumSeats(extendedNERAnalyzedParse, last_requested_DF):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the number of seats.

    INPUT: 2 tickets; 3 adults, 2 children
    OUTPUT: 2, {3,2}
    '''
    logger.info("ENTERING NUM SEATS IDENTIFICATION MODULE")
    template = {"adults": 0, "children": 0, "infants": 0}
    seatsList = findNumberToken(extendedNERAnalyzedParse)
    if len(seatsList) > 0:
        if len(seatsList) == 1 and \
                last_requested_DF.lower() == numSeatMarker.lower():
            # A single bare number is a direct answer to the seats prompt
            index = seatsList[0]
            indexWord = extendedNERAnalyzedParse[index]["word"]
            template['adults'] = extendedNERAnalyzedParse[index]["NormalizedNER"]
            annotateParse(extendedNERAnalyzedParse, index, indexWord,
                          template, numSeatMarker)
            logger.debug("Num Seats Identified = '%s'", str(template))
        else:
            # Otherwise look for seat identifiers like "adults"/"children"
            seat_flag = 0
            for key in extendedNERAnalyzedParse.keys():
                word = extendedNERAnalyzedParse[key]["word"]
                lemma = extendedNERAnalyzedParse[key]["lemma"]
                seat_word = isSeatIdentifier(lemma)
                if seat_word:
                    seat_flag = 1
                    break
            if seat_flag:
                classDict = matchSeatIdentifier(extendedNERAnalyzedParse)
                map_list = mapClassToNumber(extendedNERAnalyzedParse,
                                            seatsList, classDict, template)
                if map_list:
                    for key in classDict.keys():
                        word = extendedNERAnalyzedParse[key]["word"]
                        annotateParse(extendedNERAnalyzedParse, key, word,
                                      template, numSeatMarker)
                    logger.debug("Num Seats Identified = '%s'", str(template))
    logger.info("NUM SEATS IDENTIFICATION DONE")
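# Usage sketch for the second branch above. parse_for() is hypothetical
# shorthand for the CoreNLP parse of that text; mapClassToNumber fills the
# template in place:
#
#   getNumSeats(parse_for("3 adults , 2 children"), "source")
#   # -> template == {"adults": "3", "children": "2", "infants": 0}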
def TravelType(extendedNERAnalyzedParse, last_requested_DF):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the round_trip code
    corresponding to the round_trip at each round_trip token. The
    round_trip list is loaded from the database matching the first token
    of the round_trip name.

    INPUT: one way, 2-way, returning
    OUTPUT: 0, 1, 1
    '''
    logger.info("ENTERING ROUND TRIP IDENTIFICATION MODULE")
    checkFlag = 1
    if last_requested_DF == travelTypeMarker.lower():
        # A bare "yes"/"no" is a direct answer to the round trip prompt
        key = 1
        word = extendedNERAnalyzedParse[key]['word'].lower()
        if word == "yes":
            annotateParse(extendedNERAnalyzedParse, key, word, "1",
                          travelTypeMarker)
            checkFlag = 0
            logger.debug("Round Trip identifier = '%s' with code = '%s'",
                         word, "1")
        elif word == "no":
            annotateParse(extendedNERAnalyzedParse, key, word, "0",
                          travelTypeMarker)
            checkFlag = 0
            logger.debug("Round Trip identifier = '%s' with code = '%s'",
                         word, "0")
    if checkFlag:
        for key in extendedNERAnalyzedParse.keys():
            word = extendedNERAnalyzedParse[key]["word"]
            lemma = extendedNERAnalyzedParse[key]["lemma"]
            matched_round_trip = findRoundTrip(word, lemma)
            for round_trip_tuple in matched_round_trip:
                round_trip_code = round_trip_tuple[0]
                round_trip = round_trip_tuple[1]
                if matchLongestString(extendedNERAnalyzedParse, key,
                                      round_trip):
                    annotateParse(extendedNERAnalyzedParse, key, round_trip,
                                  round_trip_code, travelTypeMarker)
                    logger.debug(
                        "Round Trip identifier = '%s' with code = '%s'",
                        round_trip, round_trip_code)
    logger.info("ROUND TRIP IDENTIFICATION DONE")
def getTimePreference(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the semantic time periods
    like morning, late evening.

    INPUT: late evening, afternoon, mumbai
    OUTPUT: yes, yes, no
    '''
    logger.info("ENTERING TIME PREFERENCE MODULE")
    for key in extendedNERAnalyzedParse.keys():
        token = extendedNERAnalyzedParse[key]["word"]
        matched_period = getTimePeriod(token)
        if matched_period:
            time_preference = {"exact": "", "range": {"min": "", "max": ""}}
            # Period boundaries as fractional hours (seconds_in_hour = 3600)
            start_time = matched_period[0].seconds * 1.0 / seconds_in_hour
            end_time = matched_period[1].seconds * 1.0 / seconds_in_hour
            previous_key = key - 1
            match_modifier = ()
            if previous_key in extendedNERAnalyzedParse:
                previous_token = extendedNERAnalyzedParse[previous_key]["word"]
                match_modifier = isTimeModifier(previous_token)
            # New time ranges
            if match_modifier:
                # Calculate new time period
                start_percent = match_modifier[0]
                end_percent = match_modifier[1]
                start_time, end_time = calculateTime(start_time, end_time,
                                                     start_percent,
                                                     end_percent)
            time_preference["range"]["min"] = start_time
            time_preference["range"]["max"] = end_time
            extendedNERAnalyzedParse[key]["NER"] = timePreferenceMarker
            extendedNERAnalyzedParse[key]["NormalizedNER"] = time_preference
            logger.debug("Time Preference Identified between %.2f and %.2f",
                         start_time, end_time)
    logger.info("TIME PREFERENCE DONE")
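# Hedged sketch of the modifier arithmetic above. The repo's calculateTime
# lives elsewhere; this standalone version assumes it linearly rescales a
# period by the modifier's (start_percent, end_percent) fractions.
def _calculate_time_sketch(start_time, end_time, start_percent, end_percent):
    span = end_time - start_time
    return (start_time + span * start_percent,
            start_time + span * end_percent)

# "late evening": evening ~ 18.00-22.00 h, "late" ~ last half of the period
# _calculate_time_sketch(18.0, 22.0, 0.5, 1.0) -> (20.0, 22.0)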
def getNumStops(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the number of stops.

    INPUT: 2 stops, less than 3 stops
    OUTPUT: 2, {0,2}
    '''
    logger.info("ENTERING NUM STOPS IDENTIFICATION MODULE")
    range_dict = {"exact": "", "range": {"min": "", "max": ""}}
    stopsList = findNumberToken(extendedNERAnalyzedParse)
    if len(stopsList) > 0:
        for key in extendedNERAnalyzedParse.keys():
            word = extendedNERAnalyzedParse[key]["word"]
            lemma = extendedNERAnalyzedParse[key]["lemma"]
            stop_word = isStopItem(lemma)
            if stop_word:
                index = compareStopList(key, stopsList)
                if index:
                    indexWord = extendedNERAnalyzedParse[index]["word"]
                    indexWordValue = extendedNERAnalyzedParse[index]["NormalizedNER"]
                    # "no stops" means exactly zero stops
                    if indexWord.lower() == "no":
                        range_dict["exact"] = "0"
                    else:
                        range_dict["exact"] = indexWordValue
                    annotateParse(extendedNERAnalyzedParse, index, indexWord,
                                  range_dict, numStopMarker)
                    logger.debug(
                        "Num Stops Identified = '%s' from word = '%s'",
                        range_dict["exact"], stop_word)
    logger.info("NUM STOPS IDENTIFICATION DONE")
def normalize(query):
    '''
    (String) -> String

    Takes a noisy query as input and returns the normalized query.

    INPUT: I wanna go to hyderabad 2morrow
    OUTPUT: I want to go to hyderabad tomorrow
    '''
    normalized_query = ""
    logger.info("ENTERING NOISY NORMALIZER MODULE")
    try:
        logger.debug("Query = " + query)
        tokens = query.split()
        for token in tokens:
            normalized_word = get_normalized_word(token)
            normalized_query += normalized_word + " "
        normalized_query = normalized_query.strip()
        logger.info("NORMALIZATION DONE\n")
        return normalized_query
    except:
        logger.error(sys.exc_info()[1])
        logger.info("NORMALIZATION DONE\n")
        return query
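# Usage sketch: get_normalized_word is the repo's per-token lookup; with a
# mapping like {"wanna": "want to", "2morrow": "tomorrow"} the behaviour
# matches the docstring:
#
#   normalize("I wanna go to hyderabad 2morrow")
#   # -> "I want to go to hyderabad tomorrow"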
def normalize(word):
    '''
    (String) -> String

    Takes a noisy word as input and returns the normalized word.

    INPUT: nah, yeah
    OUTPUT: no, yes
    '''
    logger.info("ENTERING SPELL NOISY NORMALIZER MODULE")
    try:
        normalized_word = findWordByAbbreviation(word)
        return_word = ""
        if normalized_word != '':
            logger.debug("'%s' --> '%s' ", word, normalized_word)
            return_word = normalized_word
        logger.info("SPELL NORMALIZATION DONE\n")
        # An empty string tells the caller that no normalization was found
        return return_word
    except:
        logger.error(sys.exc_info()[1])
        logger.info("SPELL NORMALIZATION DONE\n")
        return word
def correctAbbreviation(query):
    '''
    (String) -> String

    Takes a query with abbreviations and resolves them into their
    counterparts.

    INPUT: I want to travel from hyd to blr on 2 oct.
    OUTPUT: I want to travel from Hyderabad to Bangalore on 2 October.
    '''
    abbreviated_query = ""
    logger.info("ENTERING ABBREVIATION CORRECTION MODULE")
    try:
        logger.debug("Query = " + query)
        tokens = query.split()
        for token in tokens:
            expanded_word = expandWord(token)
            abbreviated_query += expanded_word + " "
        abbreviated_query = abbreviated_query.strip()
        logger.info("ABBREVIATION CORRECTION DONE\n")
        return abbreviated_query
    except:
        logger.error(sys.exc_info()[1])
        logger.info("ABBREVIATION CORRECTION DONE\n")
        return query
def updateRangeInNER(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and updates the values of NERs which are
    marked as exact but are actually range values, e.g. before 27th October.

    INPUT: less than Rs 10,000, around 4 p.m., mumbai
    OUTPUT: {"exact":null, "range":{"max":10000,"min":0}},
            {"exact":null, "range":{"max":16,"min":0}}, no
    '''
    logger.info("ENTERING RANGE HANDLER MODULE")
    for key in extendedNERAnalyzedParse.keys():
        token = extendedNERAnalyzedParse[key]["word"]
        matched_range_identifiers = getRangeIdentifier(token)
        for range_identifier in matched_range_identifiers:
            word = range_identifier[0]
            category = range_identifier[1]
            if matchLongestString(extendedNERAnalyzedParse, key, word):
                # Get NER key => where the identifier ends
                word_length = len(word.split())
                ner_key = key + word_length
                if ner_key in extendedNERAnalyzedParse.keys():
                    ner_token = extendedNERAnalyzedParse[ner_key]['NER']
                    if ner_token != "0":
                        convertToRange(extendedNERAnalyzedParse, ner_key,
                                       category)
                        logger.debug("Range Identified = '%s' '%s'",
                                     ner_token, category)
    logger.info("RANGE HANDLER DONE")
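# Hedged sketch of what convertToRange is expected to do per category (the
# real helper lives elsewhere in the repo): an exact value becomes an open
# range bounded on one side.
def _convert_to_range_sketch(normalized_ner, category):
    value = normalized_ner
    if category == "max":      # "less than", "before", ...
        return {"exact": "", "range": {"min": "0", "max": value}}
    if category == "min":      # "more than", "after", ...
        return {"exact": "", "range": {"min": value, "max": ""}}
    return {"exact": value, "range": {"min": "", "max": ""}}

# _convert_to_range_sketch("10000", "max")
# -> {"exact": "", "range": {"min": "0", "max": "10000"}}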
def getNumResults(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the number of flight
    results to display.

    INPUT: 2 flights, 3 cheapest flights
    OUTPUT: 2, 3
    '''
    logger.info("ENTERING NUM RESULTS IDENTIFICATION MODULE")
    resultsList = findNumberToken(extendedNERAnalyzedParse)
    if len(resultsList) > 0:
        for key in extendedNERAnalyzedParse.keys():
            word = extendedNERAnalyzedParse[key]["word"]
            lemma = extendedNERAnalyzedParse[key]["lemma"]
            result_word = isResultItem(lemma)
            if result_word:
                index = compareResultList(key, resultsList)
                if index:
                    indexWord = extendedNERAnalyzedParse[index]["word"]
                    indexWordValue = extendedNERAnalyzedParse[index]["NormalizedNER"]
                    annotateParse(extendedNERAnalyzedParse, index, indexWord,
                                  indexWordValue, numResultMarker)
                    logger.debug(
                        "Num Results Identified = '%s' from word = '%s'",
                        indexWordValue, result_word)
    logger.info("NUM RESULTS IDENTIFICATION DONE")
def completeDate(query):
    '''
    (String) -> String

    Takes a query with incomplete dates (missing month) and fills the month
    in using Server Time. Also strips ordinal suffixes ('th' etc.) so the
    term becomes a plain number for the gap filling module.

    INPUT: I will be travelling on 25th.
    OUTPUT: I will be travelling on 25 October.
    '''
    logger.info("ENTERING GAP FILLER MODULE")
    try:
        logger.debug("Query = " + query)
        gap_filled_query = ""
        tokens = query.split()
        new_tokens = []
        for index in range(0, len(tokens)):
            token = tokens[index]
            date = isDateToken(token)
            if date:
                new_tokens.append(str(date))
                if not isMonthSpecified(index, tokens):
                    month = getComingMonth(date)
                    new_tokens.append(month)
            else:
                new_tokens.append(token)
        gap_filled_query = " ".join(new_tokens)
        logger.info("GAP FILLING DONE\n")
        return gap_filled_query
    except:
        logger.error(sys.exc_info()[1])
        logger.info("GAP FILLING DONE\n")
        return query
def splitNumberString(query):
    '''
    (String) -> String

    Takes a query where multiple words are clubbed together in a single
    token and separates such tokens into multiple words.

    INPUT: My budget is Rs.50 and extra luggage 10-15kgs.
    OUTPUT: My budget is Rs 50 and extra luggage 10 - 15 kgs.

    Cases to handle:
        Rs.50        -> Rs 50
        Rs50         -> Rs 50
        10-15kgs     -> 10 - 15 Kgs
        10Kgs-15kgs  -> 10 Kgs - 15 Kgs
        10Kg.        -> 10 Kg
        10.1         -> 10.1
        10-12-2015   -> 10-12-2015
        10.12.2015   -> 10.12.2015
        END.         -> END .
        one-way      -> one way
        1-tier       -> 1 tier
        4:00am       -> 4:00 am
        going.I      -> going . I
        // Handle ticket/pnr no. and don't split them

    Rules (in order):
        1. Split '-' ---> 10-15 -> 10 - 15; if tier, way -> remove '-';
           handle date case.
        2. Case '.': (i) two numbers: do nothing, (ii) two words: split,
           (iii) one word-one num: split and remove '.'.
        3. Split NUM and String. If last char == '.': if word in dict ->
           remove '.', else full stop. If split == 'nd' (for date),
           delete token.
    '''
    splitted_query = ""
    logger.info("ENTERING SPLITTER MODULE")
    try:
        logger.debug("Query = " + query)
        tokens = query.split()
        for token in tokens:
            splitted_word = split_word(token)
            splitted_query += splitted_word + " "
        splitted_query = splitted_query.strip()
        logger.info("SPLITTING DONE\n")
        return splitted_query
    except:
        logger.error(sys.exc_info()[1])
        logger.info("SPLITTING DONE\n")
        return query
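# Minimal regex sketch of rule 3 above (splitting NUM from String). The
# repo's split_word also handles '-', '.', dates and PNR numbers, which this
# toy version deliberately omits:
import re

def _split_num_string_sketch(token):
    # Insert a space at every digit<->letter boundary
    return re.sub(r"(?<=\d)(?=[A-Za-z])|(?<=[A-Za-z])(?=\d)", " ", token)

# _split_num_string_sketch("Rs50")   -> "Rs 50"
# _split_num_string_sketch("10kgs")  -> "10 kgs"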
def unigramSpellCheck(query, PWLdict):
    '''
    (String) -> String

    Takes a noisy query with ungrammatical/Out of Vocab words as input and
    returns the spell corrected query.

    INPUT: I want to buk a flight from hydrabad to banglore.
    OUTPUT: I want to book a flight from Hyderabad to Bangalore.
    '''
    logger.info("ENTERING SPELL CHECKER MODULE")
    try:
        logger.debug("Query = " + query)
        word_list = nltk.word_tokenize(query)
        pos_list = nltk.pos_tag(word_list)
        replacerDict = SpellingReplacer()
        replacerPWL = SpellingReplacer(PWLdict)
        checked_list = []
        for item in pos_list:
            word = item[0]
            pos = item[1]
            # Collapse character repetitions, e.g. "pleeease" -> "please"
            truncate_word = re.sub(r'(.)\1+', r'\1', word)
            normalized_word = normalize(truncate_word)
            # If word is a special char, don't spell check it
            if re.match("([^\w@#])", word):
                checked_list.append(word)
            elif normalized_word:
                checked_list.append(normalized_word)
            elif replacerPWL.check(truncate_word):
                correctedWord = truncate_word.title()
                checked_list.append(correctedWord)
            elif not replacerDict.check(word):
                correctedWord = ""
                dist = 100
                # Do not replace words from PWL if len(word) <= 3
                if len(truncate_word) > 3:
                    correctedWordPWL = replacerPWL.replace(truncate_word)
                    distPWL = edit_distance(truncate_word, correctedWordPWL)
                else:
                    distPWL = dist
                    correctedWordPWL = truncate_word
                correctedWordDict = replacerDict.replace(word)
                distDict = edit_distance(word, correctedWordDict)
                # Prefer the PWL candidate only if it is strictly closer
                if distPWL > distDict or correctedWordPWL == truncate_word:
                    correctedWord = correctedWordDict
                else:
                    correctedWord = correctedWordPWL.title()
                if correctedWord == "":
                    correctedWord = word
                else:
                    logger.debug("'%s' --> '%s' ", word, correctedWord)
                checked_list.append(correctedWord)
            else:
                checked_list.append(word)
        spell_checked_query = " ".join(checked_list)
        logger.info("SPELL CORRECTION DONE\n")
        return spell_checked_query
    except:
        logger.error(sys.exc_info()[1])
        logger.info("SPELL CORRECTION DONE\n")
        return query
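# Usage sketch: PWLdict is the path to a personal word list file (see
# PWL_FILE in travelNLP below) that supplies domain words such as city
# names, so "hydrabad" can resolve to "Hyderabad" rather than to the nearest
# dictionary word:
#
#   unigramSpellCheck("I want to buk a flight from hydrabad", PWL_FILE)
#   # -> "I want to book a flight from Hyderabad" (given a suitable PWL)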
def fillArguments(query):
    '''
    (String) -> String

    Takes a query with range items missing units, and returns the gaps
    filled with units.

    INPUT: I will be travelling between 23rd and 25th October.
    OUTPUT: I will be travelling between 23rd October and 25th October.

    Cases to handle:
        10 - 15 Kilograms   -> 10 Kilograms - 15 Kilograms
        10 to 15 Kilograms  -> 10 Kilograms to 15 Kilograms
        10 and 15 Kilograms -> 10 Kilograms and 15 Kilograms
        rupees 10 - 15      -> rupees 10 - rupees 15
        rupees 10 to 15     -> rupees 10 to rupees 15
        rupees 10 and 15    -> rupees 10 and rupees 15
    '''
    logger.info("ENTERING GAP FILLER MODULE")
    logger.debug("Query = " + query)
    gap_filled_query = ""
    tokens = query.split()
    numberedTokens = getNumberTokens(tokens)
    to_insert = {}
    for index in range(1, len(numberedTokens)):
        # Two numbers separated by exactly one token, e.g. "10 to 15"
        if numberedTokens[index] - numberedTokens[index - 1] == 2:
            if isRangeIdentifier(tokens[numberedTokens[index] - 1]):
                boundary = range(0, len(tokens))
                if (numberedTokens[index - 1] - 1) in boundary:
                    prev_word = tokens[numberedTokens[index - 1] - 1]
                    if isUnit(prev_word):
                        # Copy unit to query
                        to_insert[numberedTokens[index]] = prev_word
                if (numberedTokens[index] + 1) in boundary:
                    next_word = tokens[numberedTokens[index] + 1]
                    if isUnit(next_word):
                        # Copy unit to query
                        to_insert[numberedTokens[index - 1] + 1] = next_word
    gap_filled_tokens = []
    # Dict keys are unordered; sort so insertions happen left to right
    insert_keys = sorted(to_insert.keys())
    insert_length = len(insert_keys)
    count = 0
    for index in range(0, len(tokens)):
        if count < insert_length and index == insert_keys[count]:
            gap_filled_tokens.append(to_insert[index])
            logger.debug("Inserted '%s' at index = %d", to_insert[index],
                         index)
            count += 1
        gap_filled_tokens.append(tokens[index])
    gap_filled_query = ' '.join(gap_filled_tokens)
    logger.info("GAP FILLING DONE\n")
    return gap_filled_query
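# Worked example of the unit-copying logic above, assuming the repo's
# isUnit/isRangeIdentifier helpers recognise "rupees" and "to":
#
#   fillArguments("my budget is rupees 4000 to 6000")
#   # numbers at indices 4 and 6, "to" between them, unit "rupees" before
#   # the first number -> copied in front of the second:
#   # -> "my budget is rupees 4000 to rupees 6000"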
def travelNLP(query, category, last_requested_DF):
    '''
    (String, String, String) -> Object

    Takes the input query, category and last requested DF and annotates the
    NERs in the query.

    INPUT: ('hyd to blr', 'travel', 'source')
    OUTPUT: {source: HYD, destination: BLR}
    '''
    allExtendedNerDF = {}
    logger.info("ENTERING TRAVEL MODULE")
    try:
        logger.debug(query + " " + last_requested_DF + "\n")
        query = query.lower()
        noiseRemovedQuery = preprocessing_tools.noisy_word_normalizer.normalizer.normalize(
            query)
        print "Normalize = ", noiseRemovedQuery
        logger.debug("Normalize = " + noiseRemovedQuery)
        splittedQuery = preprocessing_tools.number_string_splitter.number_string_splitter.splitNumberString(
            noiseRemovedQuery)
        print "Splitted = ", splittedQuery
        logger.debug("Splitted = " + splittedQuery)
        abbreviatedQuery = preprocessing_tools.abbreviation_checker.abbreviation_corrector.correctAbbreviation(
            splittedQuery)
        print "Abbreviated = ", abbreviatedQuery
        logger.debug("Abbreviated = " + abbreviatedQuery)
        spellCheckedQuery = preprocessing_tools.spell_checker.spell_checker.spellCheck(
            abbreviatedQuery, PWL_FILE)
        print "Spellchecked = ", spellCheckedQuery
        logger.debug("Spellchecked = " + spellCheckedQuery)
        monthFilledQuery = preprocessing_tools.month_filler.month_filler.completeDate(
            spellCheckedQuery)
        print "MonthFilledQuery = ", monthFilledQuery
        logger.debug("MonthFilledQuery = " + monthFilledQuery)
        gapFilledQuery = preprocessing_tools.argument_filler.argument_filler.fillArguments(
            monthFilledQuery)
        print "GapFilledQuery = ", gapFilledQuery
        logger.debug("GapFilledQuery = " + gapFilledQuery)
        normalizedQuery = gapFilledQuery
        print "Final Normalized Query = ", gapFilledQuery
        print
        logger.debug("Final Normalized Query = " + gapFilledQuery)
        NERAnalyzedParse, chunkedParse = preprocessing_tools.corenlp.corenlp.identifyNER(
            normalizedQuery)
        print "NER Parse = ", NERAnalyzedParse
        print "Chunking = ", chunkedParse
        for index in range(0, len(chunkedParse)):
            extendedNerDF = preprocessing_tools.extended_ner.travel.travel_extended_ner.identifyExtendedNER(
                normalizedQuery, category, NERAnalyzedParse[index],
                last_requested_DF)
            disambiguatedDF = preprocessing_tools.category_disambiguator.category_disambiguator.disambiguateCategories(
                normalizedQuery, category, NERAnalyzedParse[index],
                chunkedParse[index], last_requested_DF)
            singleExtendedNerDF = preprocessing_tools.category_disambiguator.category_disambiguator.mergeDictionaries(
                extendedNerDF, disambiguatedDF)
            allExtendedNerDF = mergeDictionaries(allExtendedNerDF,
                                                 singleExtendedNerDF)
        if "0" in allExtendedNerDF.keys():
            del allExtendedNerDF["0"]
        print "Final Analyzed NERs = ", allExtendedNerDF
    except:
        logger.error(sys.exc_info()[1])
    finally:
        logger.info("LEAVING TRAVEL MODULE")
    return allExtendedNerDF
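# End-to-end usage sketch. Module-level configuration (PWL_FILE, the
# preprocessing_tools package, a running CoreNLP server) is assumed to be
# set up as elsewhere in the repo; the exact output keys depend on the DF
# markers:
#
#   travelNLP("hyd to blr on 25th", "travel", "source")
#   # -> e.g. {"SOURCE": "HYD", "DESTINATION": "BLR", "DATE": "2015-10-25"}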
def normalizeTime(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and converts Stanford SUTime duration and
    date convention to our convention.

    INPUT: PT2H, 2015-09-28-WXX-1
    OUTPUT: 2, 2015-10-05
    '''
    logger.info("ENTERING TIME NORMALIZATION IDENTIFICATION MODULE")
    for key in extendedNERAnalyzedParse.keys():
        ner = extendedNERAnalyzedParse[key]["NER"].lower()
        normalizedValue = extendedNERAnalyzedParse[key]["NormalizedNER"]
        if ner in time_NER:
            if ner == "duration":
                value = ""
                range_dict = {"exact": "", "range": {"min": "", "max": ""}}
                # Drop SUTime comparison markers like ">PT2H" / "<PT2H"
                normalizedValue = re.sub("\>|\<", r"", normalizedValue)
                if normalizedValue[0:2] == "PT":
                    # Sub-day durations, e.g. "PT2H": strip the "PT" prefix
                    # (slicing from the end breaks for two-digit amounts
                    # such as "PT12H")
                    value = normalizedValue[2:]
                elif normalizedValue[0:1] == "P":
                    # Day-or-longer durations, e.g. "P2D"
                    value = normalizedValue[1:]
                unit = value[-1:]
                amount = value[0:-1]
                if isNumber(amount):
                    amount = int(amount)
                    hours = calculateTime(unit, amount)
                    range_dict["exact"] = hours
                    extendedNERAnalyzedParse[key]["NormalizedNER"] = range_dict
            elif ner == "date":
                tokens = normalizedValue.split("-")
                length_token = len(tokens)
                if length_token >= 3 and tokens[2][0:2] == "WE":
                    logger.debug("Weekend Identified")
                elif length_token > 4 and tokens[3][0:1] == "W":
                    # Week-relative date, e.g. "2015-09-28-WXX-1" == Monday
                    offset = tokens[4]
                    addWeek = 0
                    if isNumber(offset):
                        offset = int(offset)
                        today = datetime.datetime.today().weekday() + 1
                        # If the requested weekday has already passed this
                        # week, roll over to next week
                        if today > offset:
                            addWeek = 1
                        week_date = calculateDate(tokens, addWeek)
                        extendedNERAnalyzedParse[key]["NormalizedNER"] = \
                            week_date.strftime("%Y-%m-%d")
    logger.info("TIME NORMALIZATION DONE")
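# Hedged sketch of the duration path above, assuming only hour/day units;
# the repo's calculateTime presumably handles more:
def _sutime_duration_to_hours_sketch(value):
    value = value.lstrip("<>")
    body = value[2:] if value.startswith("PT") else value[1:]
    unit, amount = body[-1], int(body[:-1])
    return amount * 24 if unit == "D" else amount

# _sutime_duration_to_hours_sketch("PT2H") -> 2
# _sutime_duration_to_hours_sketch("P2D")  -> 48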