Code Example #1
def normalize(query):

	'''
	(String) -> String

	Takes a noisy query as input and returns normalized query.

	INPUT: I wanna go to hyderabad 2morrow
	OUTPUT: I want to go to hyderabad tomorrow 

	'''
	
	normalized_query = ""
	logger.info("ENTERING NOISY NORMALIZER MODULE")

	try:
		logger.debug("Query = " + query)

		tokens = query.split()
		for token in tokens:
			normalized_word = get_normalized_word(token)
			normalized_query += normalized_word + " "
		
		normalized_query = normalized_query.strip()
	
		logger.info("NORMALIZATION DONE\n")
		return normalized_query
	
	except Exception:
		logger.error(sys.exc_info()[1])
		logger.info("NORMALIZATION DONE\n")
		return query
Code Example #2
def getTransaction(extendedNERAnalyzedParse):
    '''
	(Object) -> None

	Takes the NER Parse as input and annotates the transaction request.

	INPUT: booking, cancel, status
	OUTPUT: book, cancel, status

	'''

    logger.info("ENTERING TRANSACTION IDENTIFICATION MODULE")

    for key in extendedNERAnalyzedParse.keys():
        word = extendedNERAnalyzedParse[key]["word"]
        lemma = extendedNERAnalyzedParse[key]["lemma"]

        transaction = getTransactionItems(lemma)
        if transaction:
            annotateParse(extendedNERAnalyzedParse, key, transaction,
                          transaction, transactionMarker)
            logger.debug("Transaction Identified = '%s' from word = '%s'",
                         transaction, word)

    logger.info("TRANSACTION IDENTIFICATION DONE")
Code Example #3
def getTransport(extendedNERAnalyzedParse):
    '''
	(Object) -> None

	Takes the NER Parse as input and annotates the transport type.

	INPUT: flight, air, bus, rail
	OUTPUT: flight, flight, bus, train

	'''

    logger.info("ENTERING TRANSPORT IDENTIFICATION MODULE")

    for key in extendedNERAnalyzedParse.keys():
        word = extendedNERAnalyzedParse[key]["word"]
        lemma = extendedNERAnalyzedParse[key]["lemma"]

        transport = getTransportItems(lemma)
        if transport:
            annotateParse(extendedNERAnalyzedParse, key, transport, transport,
                          transportMarker)
            logger.debug("Transport Identified = '%s' from word = '%s'",
                         transport, word)

    logger.info("TRANSPORT IDENTIFICATION DONE")
Code Example #4
def findOrganization(extendedNERAnalyzedParse):
    '''
	(Object) -> List

	Takes the NER Parse as input and annotates the organization
	corresponding to the organization name at each organization token.
	The organization list is loaded from the database matching the first token of the organization name.

	INPUT: Indigo, spicejet, mumbai
	OUTPUT: Indigo, spicejet, no

	'''

    logger.info("ENTERING ORGANIZATION IDENTIFICATION MODULE")

    ner_organization = []

    for key in extendedNERAnalyzedParse.keys():
        if extendedNERAnalyzedParse[key]["POS"] in ["NN", "NNP", "JJ"]:
            token = extendedNERAnalyzedParse[key]["word"]
            matched_organization = findOrganizationByName(token)
            for organization_tuple in matched_organization:
                organization = organization_tuple[1]
                if matchLongestString(extendedNERAnalyzedParse, key,
                                      organization):
                    annotateParse(extendedNERAnalyzedParse, key, organization,
                                  organization, organizationMarker)
                    ner_organization.append(organization)
                    logger.debug("Organization Identified = '%s'",
                                 organization)

    logger.info("ORGANIZATION IDENTIFICATION DONE")
    return ner_organization
Code Example #5
def findCity(extendedNERAnalyzedParse):
    '''
	(Object) -> List

	Takes the NER Parse as input and annotates the city code corresponding to the city at each city token.
	The city list is loaded from the database matching the first token of city name.

	INPUT: new delhi, new francisco, mumbai
	OUTPUT: DEL, no, BOM

	'''

    logger.info("ENTERING CITY IDENTIFICATION MODULE")

    ner_city = []

    for key in extendedNERAnalyzedParse.keys():
        entry = extendedNERAnalyzedParse[key]
        if entry["NER"] == "0" and entry["POS"] in ["NN", "NNP", "JJ"]:
            token = entry["word"]
            matched_cities = findCityByName(token)
            for city_tuple in matched_cities:
                city_code = city_tuple[0]
                city = city_tuple[1]
                if matchLongestString(extendedNERAnalyzedParse, key, city):
                    annotateParse(extendedNERAnalyzedParse, key, city,
                                  city_code, cityMarker)
                    ner_city.append(city_code)
                    logger.debug("City Identified = '%s' with code = '%s'",
                                 city, city_code)

    logger.info("CITY IDENTIFICATION DONE")
    return ner_city
Code Example #6
def normalize(word):
    '''
	(String) -> String

	Takes a noisy word as input and returns the normalized word,
	or '' when no normalization entry exists.

	INPUT: nah, yeah, going
	OUTPUT: no, yes, ''

	'''

    logger.info("ENTERING SPELL NOISY NORMALIZER MODULE")

    try:
        normalized_word = findWordByAbbreviation(word)
        return_word = ""
        if normalized_word != '':
            logger.debug("'%s' --> '%s' ", word, normalized_word)
            return_word = normalized_word

        logger.info("SPELL NORMALIZATION DONE\n")
        return return_word

    except Exception:
        logger.error(sys.exc_info()[1])
        logger.info("SPELL NORMALIZATION DONE\n")
        return word
Code Example #7
def correctAbbreviation(query):
	'''
	(String) -> String

	Takes a query with abbreviations and resolves them to their expanded counterparts.

	INPUT: I want to travel from hyd to blr on 2 oct.
	OUTPUT: I want to travel from Hyderabad to Bangalore on 2 October.

	'''

	abbreviated_query = ""
	logger.info("ENTERING ABBREVIATION CORRECTION MODULE")
	
	try:
		logger.debug("Query = " + query)

		tokens = query.split()
		for token in tokens:
			expanded_word = expandWord(token)
			abbreviated_query += expanded_word + " "
		
		abbreviated_query = abbreviated_query.strip()
		
		logger.info("ABBREVIATION CORRECTION DONE\n")
		return abbreviated_query
	
	except Exception:
		logger.error(sys.exc_info()[1])
		logger.info("ABBREVIATION CORRECTION DONE\n")
		return query
Code Example #8
def getOtherPreference(extendedNERAnalyzedParse):
    '''
	(Object) -> None

	Takes the NER Parse as input and annotates user preferences such as
	shortest, quickest, or cheapest flights.

	INPUT: shortest, quickest, cheapest flights
	OUTPUT: fastest, fastest, cheapest

	'''

    logger.info("ENTERING OTHER PREFERENCES IDENTIFICATION MODULE")

    for key in extendedNERAnalyzedParse.keys():
        token = extendedNERAnalyzedParse[key]["word"]
        matched_preferences = findPreferences(token)
        for preference_tuple in matched_preferences:
            preference_name = preference_tuple[0]
            preference_type = preference_tuple[1]

            if matchLongestString(extendedNERAnalyzedParse, key,
                                  preference_name):
                annotateParse(extendedNERAnalyzedParse, key, preference_name,
                              preference_type, otherPreferenceMarker)
                logger.debug(
                    "Other preferences Identified = '%s' with type = '%s'",
                    preference_name, preference_type)

    logger.info("OTHER PREFERENCES IDENTIFICATION DONE")
Code Example #9
def getNumSeats(extendedNERAnalyzedParse, last_requested_DF):
    '''
	(Object, String) -> None

	Takes the NER Parse as input and annotates the number of seats.

	INPUT: 2 tickets; 3 adults, 2 children
	OUTPUT: 2, {3,2}

	'''

    logger.info("ENTERING NUM SEATS IDENTIFICATION MODULE")

    template = {"adults": 0, "children": 0, "infants": 0}

    seatsList = findNumberToken(extendedNERAnalyzedParse)

    if len(seatsList) > 0:
        if (len(seatsList) == 1
                and last_requested_DF.lower() == numSeatMarker.lower()):
            index = seatsList[0]
            indexWord = extendedNERAnalyzedParse[index]["word"]

            template['adults'] = extendedNERAnalyzedParse[index][
                "NormalizedNER"]
            annotateParse(extendedNERAnalyzedParse, index, indexWord, template,
                          numSeatMarker)
            logger.debug("Num Seats Identified = '%s'", str(template))

        else:
            seat_flag = 0
            for key in extendedNERAnalyzedParse.keys():
                word = extendedNERAnalyzedParse[key]["word"]
                lemma = extendedNERAnalyzedParse[key]["lemma"]

                seat_word = isSeatIdentifier(lemma)
                if seat_word:
                    seat_flag = 1
                    break

            if seat_flag:
                classDict = matchSeatIdentifier(extendedNERAnalyzedParse)

                map_list = mapClassToNumber(extendedNERAnalyzedParse,
                                            seatsList, classDict, template)

                if map_list:
                    for key in classDict.keys():
                        word = extendedNERAnalyzedParse[key]["word"]
                        annotateParse(extendedNERAnalyzedParse, key, word,
                                      template, numSeatMarker)

                logger.debug("Num Seats Identified = '%s'", str(template))

    logger.info("NUM SEATS IDENTIFICATION DONE")
Code Example #10
def completeDate(query):
    '''
	(String) -> String

	Takes a query with incomplete dates (missing month) and fills in the
	month using server time. It also strips ordinal suffixes such as 'th'
	so the term becomes a plain number for the gap-filling module.

	INPUT: I will be travelling on 25th.
	OUTPUT: I will be travelling on 25 October.

	'''

    logger.info("ENTERING GAP FILLER MODULE")

    try:
        logger.debug("Query = " + query)

        gap_filled_query = ""

        tokens = query.split()
        new_tokens = []

        skip_flag = 0
        skip_tokens = []

        for index in range(0, len(tokens)):
            token = tokens[index]
            date = isDateToken(token)
            if date:
                new_tokens.append(str(date))
                if not isMonthSpecified(index, tokens):
                    month = getComingMonth(date)
                    new_tokens.append(month)
                # else:
                # 	skip_tokens, month = isRelativeMonthSpecified(index,tokens)
                # 	if skip_tokens:
                # 		new_tokens.append(month)
                # 		skip_flag = 1
            else:
                # if skip_flag and index not in skip_tokens:
                # 	skip_flag = 0
                # 	skip_tokens = []
                # if not skip_flag:
                new_tokens.append(token)

        gap_filled_query = " ".join(new_tokens)

        logger.info("GAP FILLING DONE\n")
        return gap_filled_query

    except Exception:
        logger.error(sys.exc_info()[1])
        logger.info("GAP FILLING DONE\n")
        return query
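
A minimal sketch of the assumed getComingMonth helper: pick the month in which the given day-of-month next occurs relative to server time, i.e. the current month if the day is still ahead, otherwise the next month. The behaviour is an assumption; the original helper is not shown in this listing.

from datetime import date

def getComingMonth(day):
    # Current month if the day has not passed yet, otherwise the next month.
    today = date.today()
    month = today.month if int(day) >= today.day else today.month % 12 + 1
    return date(2000, month, 1).strftime("%B")  # month name, e.g. "October"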
Code Example #11
def splitNumberString(query):
    '''
	(String) -> String

	Takes a query where multiple words are clubbed together in a single
	token and separates such tokens into multiple words.

	INPUT: My budget is Rs.50 and extra luggage 10-15kgs.
	OUTPUT: My budget is Rs 50 and extra luggage 10 - 15 kgs.

	Cases to handle:
	Rs.50	->	Rs 50
	Rs50	->	Rs 50
	10-15kgs	-> 10 - 15 Kgs
	10Kgs-15kgs	-> 10 Kgs - 15 Kgs
	10Kg. 	-> 10 Kg
	10.1	-> 10.1
	10-12-2015	-> 10-12-2015
	10.12.2015	-> 10.12.2015
	END.	-> END .
	one-way -> one way
	1-tier	-> 1 tier
	4:00am	-> 4:00 am
	going.I 	-> going . I
	// Handle ticket/pnr no. and don't split them

	Rules (in order):
	1. Split '-' ---> 10-15 -> 10 - 15, if tier, way -> remove '-', handle date case
	2. Case '.', (i) two numbers: do nothing, (ii) two words: split, (iii) one word-one num: split and remove '.'
	3. Split NUM and String. If last char == '.', if word in dict -> remove '.', else full stop. If split == 'nd' (for date), delete token
	'''

    splitted_query = ""
    logger.info("ENTERING SPLITTER MODULE")

    try:
        logger.debug("Query = " + query)

        tokens = query.split()
        for token in tokens:
            splitted_word = split_word(token)
            splitted_query += splitted_word + " "

        splitted_query = splitted_query.strip()

        logger.info("SPLITTING DONE\n")
        return splitted_query

    except Exception:
        logger.error(sys.exc_info()[1])
        logger.info("SPLITTING DONE\n")
        return query
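
An illustrative sketch of rule 1 (hyphen splitting) only, under stated assumptions: full dates like 10-12-2015 stay whole, 'one-way' and '1-tier' drop the hyphen, and a hyphen between two numbers becomes its own token. The real split_hypen/split_dot helpers are not part of this listing.

import re

DATE_RE = re.compile(r"^\d{1,2}-\d{1,2}-\d{2,4}$")

def split_hyphen_sketch(token):
    if DATE_RE.match(token):
        return token                                 # 10-12-2015 -> unchanged
    if re.search(r"-(way|tier)$", token, re.I):
        return token.replace("-", " ")               # one-way -> one way
    return re.sub(r"(?<=\d)-(?=\d)", " - ", token)   # 10-15kgs -> 10 - 15kgs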
Code Example #12
def isUnit(word):
    '''
	(String) -> Boolean

	Takes a word and returns whether it is a unit term or not.

	INPUT: Kilograms,October,no
	OUTPUT: True,True,False

	'''

    word = word.lower()
    if checkUnitIdentifier(word):
        logger.debug("'%s' is a Unit Identifier term", word)
        return True
    return False
Code Example #13
def getTimePreference(extendedNERAnalyzedParse):
    '''
	(Object) -> None

	Takes the NER Parse as input and annotates the semantic time periods like morning, late evening.

	INPUT: late evening, afternoon, mumbai
	OUTPUT: yes, yes, no

	'''

    logger.info("ENTERING TIME PREFERENCE MODULE")

    for key in extendedNERAnalyzedParse.keys():
        token = extendedNERAnalyzedParse[key]["word"]
        matched_period = getTimePeriod(token)
        if matched_period:
            time_preference = {"exact": "", "range": {"min": "", "max": ""}}

            start_time = matched_period[0].seconds * 1.0 / seconds_in_hour
            end_time = matched_period[1].seconds * 1.0 / seconds_in_hour

            previous_key = key - 1
            match_modifier = ()
            if previous_key in extendedNERAnalyzedParse:
                previous_token = extendedNERAnalyzedParse[previous_key]["word"]
                match_modifier = isTimeModifier(previous_token)

            # New time ranges
            if match_modifier:
                # Calculate new time period
                start_percent = match_modifier[0]
                end_percent = match_modifier[1]
                start_time, end_time = calculateTime(start_time, end_time,
                                                     start_percent,
                                                     end_percent)

            time_preference["range"]["min"] = start_time
            time_preference["range"]["max"] = end_time
            extendedNERAnalyzedParse[key]["NER"] = timePreferenceMarker
            extendedNERAnalyzedParse[key]["NormalizedNER"] = time_preference
            logger.debug("Time Preference Identified between %.2f and %.2f",
                         start_time, end_time)

    logger.info("TIME PREFERENCE DONE")
Code Example #14
def split_word(word):
    # Split one token on hyphens, then on dots (see the splitNumberString rules).
    try:
        new_word = ""
        splitted_hypen_terms = split_hypen(word)
        # print "Hypen = ",
        # print splitted_hypen_terms
        for term in splitted_hypen_terms:
            splitted_dot_terms = split_dot(term)
            new_word += splitted_dot_terms + " "

        new_word = new_word.strip()
        if new_word != word:
            logger.debug("%s --> %s", word, new_word)
        return new_word

    except Exception:
        logger.error(sys.exc_info()[1])
        return word
Code Example #15
def isRangeIdentifier(word):
    '''
	(String) -> Boolean

	Takes a word and returns whether it is a range identifier or not.

	INPUT: -,to,and,no
	OUTPUT: True,True,True,False

	'''

    try:
        word = word.lower()
        if checkRangeIdentifier(word):
            logger.debug("'%s' is a Range Identifier term", word)
            return True
        return False
    except Exception:
        return False
Code Example #16
def getNumStops(extendedNERAnalyzedParse):
    '''
	(Object) -> None

	Takes the NER Parse as input and annotates the number of stops.

	INPUT: 2 stops, less than 3 stops
	OUTPUT: 2, {0,2}

	'''

    logger.info("ENTERING NUM STOPS IDENTIFICATION MODULE")

    range_dict = {"exact": "", "range": {"min": "", "max": ""}}

    stopsList = findNumberToken(extendedNERAnalyzedParse)

    if (len(stopsList) > 0):
        for key in extendedNERAnalyzedParse.keys():
            word = extendedNERAnalyzedParse[key]["word"]
            lemma = extendedNERAnalyzedParse[key]["lemma"]

            stop_word = isStopItem(lemma)
            if stop_word:
                index = compareStopList(key, stopsList)

                if index:
                    indexWord = extendedNERAnalyzedParse[index]["word"]
                    indexWordValue = extendedNERAnalyzedParse[index][
                        "NormalizedNER"]
                    if indexWord.lower() == "no":
                        range_dict["exact"] = "0"
                    else:
                        range_dict["exact"] = indexWordValue

                    annotateParse(extendedNERAnalyzedParse, index, indexWord,
                                  range_dict, numStopMarker)
                    logger.debug(
                        "Num Stops Identified = '%s' from word = '%s'",
                        range_dict["exact"], stop_word)

    logger.info("NUM STOPS IDENTIFICATION DONE")
Code Example #17
def expandWord(word):
	'''
	(String) -> String

	Takes an abbreviated word as input and returns the expanded word.

	INPUT: hyd, kg, rs
	OUTPUT: hyderabad, kilogram, rupees

	'''

	try:
		expanded_word = findWordByAbbreviation(word)
		if expanded_word != '':
			logger.debug("'%s' --> '%s' ", word, expanded_word)
			return expanded_word
		return word
	except Exception:
		logger.error(sys.exc_info()[1])
		return word
Code Example #18
def get_normalized_word(word):
	'''
	(String) -> String

	Takes a noisy word as input and returns the normalized word,
	or the word unchanged when no normalization entry exists.

	INPUT: b4, 2morrow, uttar, going
	OUTPUT: before, tomorrow, uttar, going

	'''

	try:
		normalized_word = findWordByAbbreviation(word)
		if normalized_word != '':
			logger.debug("'%s' --> '%s' ", word, normalized_word)
			return normalized_word
		return word
	except Exception:
		logger.error(sys.exc_info()[1])
		return word
Code Example #19
def getNumResults(extendedNERAnalyzedParse):
    '''
	(Object) -> None

	Takes the NER Parse as input and annotates the number of flight results to display.

	INPUT: 2 flights, 3 cheapest flights
	OUTPUT: 2, 3

	'''

    logger.info("ENTERING NUM RESULTS IDENTIFICATION MODULE")

    resultsList = findNumberToken(extendedNERAnalyzedParse)

    if (len(resultsList) > 0):
        for key in extendedNERAnalyzedParse.keys():
            word = extendedNERAnalyzedParse[key]["word"]
            lemma = extendedNERAnalyzedParse[key]["lemma"]

            result_word = isResultItem(lemma)
            if result_word:

                index = compareResultList(key, resultsList)

                if index:
                    indexWord = extendedNERAnalyzedParse[index]["word"]
                    indexWordValue = extendedNERAnalyzedParse[index][
                        "NormalizedNER"]

                    annotateParse(extendedNERAnalyzedParse, index, indexWord,
                                  indexWordValue, numResultMarker)
                    logger.debug(
                        "Num Results Identified = '%s' from word = '%s'",
                        indexWordValue, result_word)

    logger.info("NUM RESULTS IDENTIFICATION DONE")
Code Example #20
def TravelType(extendedNERAnalyzedParse, last_requested_DF):
    '''
	(Object, String) -> None

	Takes the NER Parse as input and annotates the round-trip code
	corresponding to the round-trip phrase at each matching token.
	The round-trip phrase list is loaded from the database matching the first token of the phrase.

	INPUT: one way, 2-way, returning
	OUTPUT: 0,1,1
	'''

    logger.info("ENTERING ROUND TRIP IDENTIFICATION MODULE")

    checkFlag = 1

    if last_requested_DF == travelTypeMarker.lower():
        key = 1
        word = extendedNERAnalyzedParse[key]['word'].lower()

        if (word == "yes"):
            annotateParse(extendedNERAnalyzedParse, key, word, "1",
                          travelTypeMarker)
            checkFlag = 0
            logger.debug("Round Trip identifier = '%s' with code = '%s'", word,
                         "1")

        elif (word == "no"):
            annotateParse(extendedNERAnalyzedParse, key, word, "0",
                          travelTypeMarker)
            checkFlag = 0
            logger.debug("Round Trip identifier = '%s' with code = '%s'", word,
                         "0")

    if checkFlag:
        for key in extendedNERAnalyzedParse.keys():
            word = extendedNERAnalyzedParse[key]["word"]
            lemma = extendedNERAnalyzedParse[key]["lemma"]

            matched_round_trip = findRoundTrip(word, lemma)
            for round_trip_tuple in matched_round_trip:
                round_trip_code = round_trip_tuple[0]
                round_trip = round_trip_tuple[1]

                if matchLongestString(extendedNERAnalyzedParse, key,
                                      round_trip):
                    annotateParse(extendedNERAnalyzedParse, key, round_trip,
                                  round_trip_code, travelTypeMarker)
                    logger.debug(
                        "Round Trip identifier = '%s' with code = '%s'",
                        round_trip, round_trip_code)

    logger.info("ROUND TRIP IDENTIFICATION DONE")
Code Example #21
def unigramSpellCheck(query, PWLdict):
    '''
	(String, String) -> String

	Takes a noisy query with misspelled or out-of-vocabulary words as
	input and returns the spell-corrected query.

	INPUT: I want to buk a flight from hydrabad to banglore.
	OUTPUT: I want to book a flight from Hyderabad to Bangalore.

	'''

    logger.info("ENTERING SPELL CHECKER MODULE")

    try:
        logger.debug("Query = " + query)

        word_list = nltk.word_tokenize(query)
        pos_list = nltk.pos_tag(word_list)

        replacerDict = SpellingReplacer()
        # print replacerDict.check("mumbai")

        replacerPWL = SpellingReplacer(PWLdict)
        # print replacerPWL.check('mumbai')

        checked_list = []
        for item in pos_list:
            word = item[0]
            pos = item[1]

            truncate_word = re.sub(r'(.)\1+', r'\1', word)
            normalized_word = normalize(truncate_word)

            # If word is a special char, don't spell check it
            if re.match(r"([^\w@#])", word):
                checked_list.append(word)

            elif normalized_word:
                checked_list.append(normalized_word)

            elif replacerPWL.check(truncate_word):
                correctedWord = truncate_word.title()
                checked_list.append(correctedWord)

            elif not replacerDict.check(word):
                correctedWord = ""
                dist = 100

                # Do not replace words from PWL if len(word) <= 3
                if len(truncate_word) > 3:
                    correctedWordPWL = replacerPWL.replace(truncate_word)
                    distPWL = edit_distance(truncate_word, correctedWordPWL)
                else:
                    distPWL = dist
                    correctedWordPWL = truncate_word

                correctedWordDict = replacerDict.replace(word)
                distDict = edit_distance(word, correctedWordDict)

                if distPWL > distDict or correctedWordPWL == truncate_word:
                    correctedWord = correctedWordDict
                else:
                    correctedWord = correctedWordPWL.title()

                if correctedWord == "":
                    correctedWord = word
                else:
                    logger.debug("'%s' --> '%s' ", word, correctedWord)

                checked_list.append(correctedWord)
            else:
                checked_list.append(word)

        spell_checked_query = " ".join(checked_list)

        logger.info("SPELL CORRECTION DONE\n")
        return spell_checked_query

    except Exception:
        logger.error(sys.exc_info()[1])
        logger.info("SPELL CORRECTION DONE\n")
        return query
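
A plausible sketch of the SpellingReplacer class used above, following the common pyenchant + edit-distance pattern. Routing a personal word list (PWL) file through enchant.DictWithPWL is an assumption; the original class definition is not part of this listing.

import enchant
from nltk.metrics import edit_distance

class SpellingReplacer(object):
    def __init__(self, pwl_file=None, dict_name='en', max_dist=2):
        # With a PWL file, checks and suggestions also cover the personal word list.
        if pwl_file:
            self.spell_dict = enchant.DictWithPWL(dict_name, pwl_file)
        else:
            self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist

    def check(self, word):
        return self.spell_dict.check(word)

    def replace(self, word):
        # Keep the word if it is already correct or no close suggestion exists.
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word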
Code Example #22
def fillArguments(query):
    '''
	(String) -> String

	Takes a query with range items missing units, and returns
	the gaps filled with units.

	INPUT: I will be travelling between 23rd and 25th October.
	OUTPUT: I will be travelling between 23rd October and 25th October.

	Cases to handle:
	10 - 15 Kilograms	-> 10 Kilograms - 15 Kilograms
	10 to 15 Kilograms	-> 10 Kilograms to 15 Kilograms
	10 and 15 Kilograms	-> 10 Kilograms and 15 Kilograms
	rupees 10 - 15	-> rupees 10 - rupees 15
	rupees 10 to 15	-> rupees 10 to rupees 15
	rupees 10 and 15	-> rupees 10 and rupees 15
	
	'''

    logger.info("ENTERING GAP FILLER MODULE")
    logger.debug("Query = " + query)

    gap_filled_query = ""

    tokens = query.split()

    numberedTokens = getNumberTokens(tokens)

    to_insert = {}

    for index in range(1, len(numberedTokens)):
        if numberedTokens[index] - numberedTokens[index - 1] == 2:
            if isRangeIdentifier(tokens[numberedTokens[index] - 1]):

                boundary = range(0, len(tokens))
                if (numberedTokens[index - 1] - 1) in boundary:
                    prev_word = tokens[numberedTokens[index - 1] - 1]
                    if isUnit(prev_word):
                        # Copy unit to query
                        to_insert[numberedTokens[index]] = prev_word

                if (numberedTokens[index] + 1) in boundary:
                    next_word = tokens[numberedTokens[index] + 1]
                    if isUnit(next_word):
                        # Copy unit to query
                        to_insert[numberedTokens[index - 1] + 1] = next_word

    gap_filled_tokens = []
    insert_keys = sorted(to_insert.keys())
    insert_length = len(insert_keys)
    count = 0

    for index in range(0, len(tokens)):
        if count < insert_length and index == insert_keys[count]:
            gap_filled_tokens.append(to_insert[index])
            logger.debug("Inserted '%s' at index = %d", to_insert[index],
                         index)
            count += 1
        gap_filled_tokens.append(tokens[index])

    gap_filled_query = ' '.join(gap_filled_tokens)

    logger.info("GAP FILLING DONE\n")
    return gap_filled_query
Code Example #23
def travelNLP(query, category, last_requested_DF):
    '''
	(String,String,String) -> Object

	Takes the input query, category and last requested DF
	and annotates the NERs in the query.

	INPUT: ('hyd to blr', 'travel', 'source')
	OUTPUT: {source: HYD, destination: BLR}

	'''
    # logger = logging.getLogger(__name__)
    allExtendedNerDF = {}
    logger.info("ENTERING TRAVEL MODULE")

    try:
        logger.debug(query + " " + last_requested_DF + "\n")

        query = query.lower()

        noiseRemovedQuery = preprocessing_tools.noisy_word_normalizer.normalizer.normalize(
            query)
        print "Normalize = ", noiseRemovedQuery
        logger.debug("Normalize = " + noiseRemovedQuery)

        splittedQuery = preprocessing_tools.number_string_splitter.number_string_splitter.splitNumberString(
            noiseRemovedQuery)
        print "Splitted = ", splittedQuery
        logger.debug("Splitted = " + splittedQuery)

        abbreviatedQuery = preprocessing_tools.abbreviation_checker.abbreviation_corrector.correctAbbreviation(
            splittedQuery)
        print "Abbreviated = ", abbreviatedQuery
        logger.debug("Abbreviated = " + abbreviatedQuery)

        spellCheckedQuery = preprocessing_tools.spell_checker.spell_checker.spellCheck(
            abbreviatedQuery, PWL_FILE)
        print "Spellchecked = ", spellCheckedQuery
        logger.debug("Spellchecked = " + spellCheckedQuery)

        monthFilledQuery = preprocessing_tools.month_filler.month_filler.completeDate(
            spellCheckedQuery)
        print "MonthFilledQuery = ", monthFilledQuery
        logger.debug("MonthFilledQuery = " + monthFilledQuery)

        gapFilledQuery = preprocessing_tools.argument_filler.argument_filler.fillArguments(
            monthFilledQuery)
        print "GapFilledQuery = ", gapFilledQuery
        logger.debug("GapFilledQuery = " + gapFilledQuery)

        normalizedQuery = gapFilledQuery
        print "Final Normalized Query = ", gapFilledQuery
        print
        logger.debug("Final Normalized Query = " + gapFilledQuery)

        NERAnalyzedParse, chunkedParse = preprocessing_tools.corenlp.corenlp.identifyNER(
            normalizedQuery)
        print "NER Parse = ", NERAnalyzedParse
        print "Chunking = ", chunkedParse

        for index in range(0, len(chunkedParse)):
            # print NERAnalyzedParse[index], chunkedParse[index]
            extendedNerDF = preprocessing_tools.extended_ner.travel.travel_extended_ner.identifyExtendedNER(
                normalizedQuery, category, NERAnalyzedParse[index],
                last_requested_DF)

            disambiguatedDF = preprocessing_tools.category_disambiguator.category_disambiguator.disambiguateCategories(
                normalizedQuery, category, NERAnalyzedParse[index],
                chunkedParse[index], last_requested_DF)
            # print "Disambiguated = ",
            # print disambiguatedDF

            singleExtendedNerDF = preprocessing_tools.category_disambiguator.category_disambiguator.mergeDictionaries(
                extendedNerDF, disambiguatedDF)
            allExtendedNerDF = mergeDictionaries(allExtendedNerDF,
                                                 singleExtendedNerDF)

        if "0" in allExtendedNerDF.keys():
            del allExtendedNerDF["0"]

        print "Final Analyzed NERs = ", allExtendedNerDF

    except Exception:
        # print "Unexpected error:", sys.exc_info()
        logger.error(sys.exc_info()[1])

    finally:
        # Return whatever has been annotated so far, even after an error.
        logger.info("LEAVING TRAVEL MODULE")
        return allExtendedNerDF
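
An illustrative end-to-end call matching the docstring example; it assumes the preprocessing_tools package, PWL_FILE, and a running CoreNLP service are already configured.

result = travelNLP("hyd to blr", "travel", "source")
print result  # e.g. {"source": "HYD", "destination": "BLR"}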