def getTransaction(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the transaction request.

    INPUT: booking, cancel, status
    OUTPUT: book, cancel, status
    '''
    logger.info("ENTERING TRANSACTION IDENTIFICATION MODULE")
    for key in extendedNERAnalyzedParse.keys():
        word = extendedNERAnalyzedParse[key]["word"]
        lemma = extendedNERAnalyzedParse[key]["lemma"]
        transaction = getTransactionItems(lemma)
        if transaction:
            annotateParse(extendedNERAnalyzedParse, key, transaction,
                          transaction, transactionMarker)
            logger.debug("Transaction Identified = '%s' from word = '%s'",
                         transaction, word)
    logger.info("TRANSACTION IDENTIFICATION DONE")
def findCity(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the city code corresponding
    to the city at each city token. The city list is loaded from the
    database matching the first token of the city name.

    INPUT: new delhi, new francisco, mumbai
    OUTPUT: DEL, no, BOM
    '''
    logger.info("ENTERING CITY IDENTIFICATION MODULE")
    ner_city = []
    for key in extendedNERAnalyzedParse.keys():
        if extendedNERAnalyzedParse[key]["NER"] == "0" and \
                extendedNERAnalyzedParse[key]["POS"] in ["NN", "NNP", "JJ"]:
            token = extendedNERAnalyzedParse[key]["word"]
            matched_cities = findCityByName(token)
            for city_tuple in matched_cities:
                city_code = city_tuple[0]
                city = city_tuple[1]
                if matchLongestString(extendedNERAnalyzedParse, key, city):
                    annotateParse(extendedNERAnalyzedParse, key, city,
                                  city_code, cityMarker)
                    ner_city.append(city_code)
                    logger.debug("City Identified = '%s' with code = '%s'",
                                 city, city_code)
    logger.info("CITY IDENTIFICATION DONE")
    return ner_city
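# A minimal usage sketch for findCity (hypothetical data; a real parse comes
# from the CoreNLP wrapper, and findCityByName hits the city database). Keys
# are 1-based token indices, matching the key arithmetic used elsewhere:
#
#   sample_parse = {
#       1: {"word": "new", "lemma": "new", "POS": "JJ", "NER": "0",
#           "NormalizedNER": "0"},
#       2: {"word": "delhi", "lemma": "delhi", "POS": "NNP", "NER": "0",
#           "NormalizedNER": "0"},
#   }
#   findCity(sample_parse)   # -> ["DEL"], tokens annotated with cityMarker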
def getTransport(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the transport type.

    INPUT: flight, air, bus, rail
    OUTPUT: flight, flight, bus, train
    '''
    logger.info("ENTERING TRANSPORT IDENTIFICATION MODULE")
    for key in extendedNERAnalyzedParse.keys():
        word = extendedNERAnalyzedParse[key]["word"]
        lemma = extendedNERAnalyzedParse[key]["lemma"]
        transport = getTransportItems(lemma)
        if transport:
            annotateParse(extendedNERAnalyzedParse, key, transport,
                          transport, transportMarker)
            logger.debug("Transport Identified = '%s' from word = '%s'",
                         transport, word)
    logger.info("TRANSPORT IDENTIFICATION DONE")
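# Usage sketch for the two lexicon-driven annotators above (hypothetical
# parse; getTransactionItems/getTransportItems look the lemma up in the DB):
#
#   sample_parse = {
#       1: {"word": "book", "lemma": "book", "POS": "VB", "NER": "0",
#           "NormalizedNER": "0"},
#       2: {"word": "a", "lemma": "a", "POS": "DT", "NER": "0",
#           "NormalizedNER": "0"},
#       3: {"word": "flight", "lemma": "flight", "POS": "NN", "NER": "0",
#           "NormalizedNER": "0"},
#   }
#   getTransaction(sample_parse)   # token 1 -> transactionMarker, "book"
#   getTransport(sample_parse)     # token 3 -> transportMarker, "flight"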
def getOtherPreference(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates user preferences such as
    shortest, quickest, or cheapest flights.

    INPUT: shortest, quickest, cheapest flights
    OUTPUT: fastest, fastest, cheapest
    '''
    logger.info("ENTERING OTHER PREFERENCES IDENTIFICATION MODULE")
    for key in extendedNERAnalyzedParse.keys():
        token = extendedNERAnalyzedParse[key]["word"]
        matched_preferences = findPreferences(token)
        for preference_tuple in matched_preferences:
            preference_name = preference_tuple[0]
            preference_type = preference_tuple[1]
            if matchLongestString(extendedNERAnalyzedParse, key,
                                  preference_name):
                annotateParse(extendedNERAnalyzedParse, key, preference_name,
                              preference_type, otherPreferenceMarker)
                logger.debug(
                    "Other preferences Identified = '%s' with type = '%s'",
                    preference_name, preference_type)
    logger.info("OTHER PREFERENCES IDENTIFICATION DONE")
def findOrganization(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the organization at each
    organization token. The organization list is loaded from the database
    matching the first token of the organization name.

    INPUT: Indigo, spicejet, mumbai
    OUTPUT: Indigo, spicejet, no
    '''
    logger.info("ENTERING ORGANIZATION IDENTIFICATION MODULE")
    ner_organization = []
    for key in extendedNERAnalyzedParse.keys():
        if extendedNERAnalyzedParse[key]["POS"] in ["NN", "NNP", "JJ"]:
            token = extendedNERAnalyzedParse[key]["word"]
            matched_organization = findOrganizationByName(token)
            for organization_tuple in matched_organization:
                organization = organization_tuple[1]
                if matchLongestString(extendedNERAnalyzedParse, key,
                                      organization):
                    annotateParse(extendedNERAnalyzedParse, key, organization,
                                  organization, organizationMarker)
                    ner_organization.append(organization)
                    logger.debug("Organization Identified = '%s'",
                                 organization)
    logger.info("ORGANIZATION IDENTIFICATION DONE")
    return ner_organization
def getNumSeats(extendedNERAnalyzedParse, last_requested_DF):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the number of seats.

    INPUT: 2 tickets; 3 adults, 2 children
    OUTPUT: 2, {3,2}
    '''
    logger.info("ENTERING NUM SEATS IDENTIFICATION MODULE")
    template = {"adults": 0, "children": 0, "infants": 0}
    seatsList = findNumberToken(extendedNERAnalyzedParse)
    if len(seatsList) > 0:
        if len(seatsList) == 1 and \
                last_requested_DF.lower() == numSeatMarker.lower():
            # A single bare number is a direct answer to the seats prompt
            index = seatsList[0]
            indexWord = extendedNERAnalyzedParse[index]["word"]
            template['adults'] = extendedNERAnalyzedParse[index]["NormalizedNER"]
            annotateParse(extendedNERAnalyzedParse, index, indexWord,
                          template, numSeatMarker)
            logger.debug("Num Seats Identified = '%s'", str(template))
        else:
            # Otherwise look for seat identifiers like "adults"/"children"
            seat_flag = 0
            for key in extendedNERAnalyzedParse.keys():
                word = extendedNERAnalyzedParse[key]["word"]
                lemma = extendedNERAnalyzedParse[key]["lemma"]
                seat_word = isSeatIdentifier(lemma)
                if seat_word:
                    seat_flag = 1
                    break
            if seat_flag:
                classDict = matchSeatIdentifier(extendedNERAnalyzedParse)
                map_list = mapClassToNumber(extendedNERAnalyzedParse,
                                            seatsList, classDict, template)
                if map_list:
                    for key in classDict.keys():
                        word = extendedNERAnalyzedParse[key]["word"]
                        annotateParse(extendedNERAnalyzedParse, key, word,
                                      template, numSeatMarker)
                    logger.debug("Num Seats Identified = '%s'", str(template))
    logger.info("NUM SEATS IDENTIFICATION DONE")
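# Usage sketch for the second branch above. parse_for() is hypothetical
# shorthand for the CoreNLP parse of that text; mapClassToNumber fills the
# template in place:
#
#   getNumSeats(parse_for("3 adults , 2 children"), "source")
#   # -> template == {"adults": "3", "children": "2", "infants": 0}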
def TravelType(extendedNERAnalyzedParse, last_requested_DF):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the round_trip code
    corresponding to the round_trip at each round_trip token. The
    round_trip list is loaded from the database matching the first token
    of the round_trip name.

    INPUT: one way, 2-way, returning
    OUTPUT: 0, 1, 1
    '''
    logger.info("ENTERING ROUND TRIP IDENTIFICATION MODULE")
    checkFlag = 1
    if last_requested_DF == travelTypeMarker.lower():
        # A bare "yes"/"no" is a direct answer to the round trip prompt
        key = 1
        word = extendedNERAnalyzedParse[key]['word'].lower()
        if word == "yes":
            annotateParse(extendedNERAnalyzedParse, key, word, "1",
                          travelTypeMarker)
            checkFlag = 0
            logger.debug("Round Trip identifier = '%s' with code = '%s'",
                         word, "1")
        elif word == "no":
            annotateParse(extendedNERAnalyzedParse, key, word, "0",
                          travelTypeMarker)
            checkFlag = 0
            logger.debug("Round Trip identifier = '%s' with code = '%s'",
                         word, "0")
    if checkFlag:
        for key in extendedNERAnalyzedParse.keys():
            word = extendedNERAnalyzedParse[key]["word"]
            lemma = extendedNERAnalyzedParse[key]["lemma"]
            matched_round_trip = findRoundTrip(word, lemma)
            for round_trip_tuple in matched_round_trip:
                round_trip_code = round_trip_tuple[0]
                round_trip = round_trip_tuple[1]
                if matchLongestString(extendedNERAnalyzedParse, key,
                                      round_trip):
                    annotateParse(extendedNERAnalyzedParse, key, round_trip,
                                  round_trip_code, travelTypeMarker)
                    logger.debug(
                        "Round Trip identifier = '%s' with code = '%s'",
                        round_trip, round_trip_code)
    logger.info("ROUND TRIP IDENTIFICATION DONE")
def getTimePreference(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the semantic time periods
    like morning, late evening.

    INPUT: late evening, afternoon, mumbai
    OUTPUT: yes, yes, no
    '''
    logger.info("ENTERING TIME PREFERENCE MODULE")
    for key in extendedNERAnalyzedParse.keys():
        token = extendedNERAnalyzedParse[key]["word"]
        matched_period = getTimePeriod(token)
        if matched_period:
            time_preference = {"exact": "", "range": {"min": "", "max": ""}}
            # Period boundaries as fractional hours (seconds_in_hour = 3600)
            start_time = matched_period[0].seconds * 1.0 / seconds_in_hour
            end_time = matched_period[1].seconds * 1.0 / seconds_in_hour
            previous_key = key - 1
            match_modifier = ()
            if previous_key in extendedNERAnalyzedParse:
                previous_token = extendedNERAnalyzedParse[previous_key]["word"]
                match_modifier = isTimeModifier(previous_token)
            # New time ranges
            if match_modifier:
                # Calculate new time period
                start_percent = match_modifier[0]
                end_percent = match_modifier[1]
                start_time, end_time = calculateTime(start_time, end_time,
                                                     start_percent,
                                                     end_percent)
            time_preference["range"]["min"] = start_time
            time_preference["range"]["max"] = end_time
            extendedNERAnalyzedParse[key]["NER"] = timePreferenceMarker
            extendedNERAnalyzedParse[key]["NormalizedNER"] = time_preference
            logger.debug("Time Preference Identified between %.2f and %.2f",
                         start_time, end_time)
    logger.info("TIME PREFERENCE DONE")
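# Hedged sketch of the modifier arithmetic above. The repo's calculateTime
# lives elsewhere; this standalone version assumes it linearly rescales a
# period by the modifier's (start_percent, end_percent) fractions.
def _calculate_time_sketch(start_time, end_time, start_percent, end_percent):
    span = end_time - start_time
    return (start_time + span * start_percent,
            start_time + span * end_percent)

# "late evening": evening ~ 18.00-22.00 h, "late" ~ last half of the period
# _calculate_time_sketch(18.0, 22.0, 0.5, 1.0) -> (20.0, 22.0)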
def getNumStops(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the number of stops.

    INPUT: 2 stops, less than 3 stops
    OUTPUT: 2, {0,2}
    '''
    logger.info("ENTERING NUM STOPS IDENTIFICATION MODULE")
    range_dict = {"exact": "", "range": {"min": "", "max": ""}}
    stopsList = findNumberToken(extendedNERAnalyzedParse)
    if len(stopsList) > 0:
        for key in extendedNERAnalyzedParse.keys():
            word = extendedNERAnalyzedParse[key]["word"]
            lemma = extendedNERAnalyzedParse[key]["lemma"]
            stop_word = isStopItem(lemma)
            if stop_word:
                index = compareStopList(key, stopsList)
                if index:
                    indexWord = extendedNERAnalyzedParse[index]["word"]
                    indexWordValue = extendedNERAnalyzedParse[index]["NormalizedNER"]
                    # "no stops" means exactly zero stops
                    if indexWord.lower() == "no":
                        range_dict["exact"] = "0"
                    else:
                        range_dict["exact"] = indexWordValue
                    annotateParse(extendedNERAnalyzedParse, index, indexWord,
                                  range_dict, numStopMarker)
                    logger.debug(
                        "Num Stops Identified = '%s' from word = '%s'",
                        range_dict["exact"], stop_word)
    logger.info("NUM STOPS IDENTIFICATION DONE")
def normalize(query):
    '''
    (String) -> String

    Takes a noisy query as input and returns the normalized query.

    INPUT: I wanna go to hyderabad 2morrow
    OUTPUT: I want to go to hyderabad tomorrow
    '''
    normalized_query = ""
    logger.info("ENTERING NOISY NORMALIZER MODULE")
    try:
        logger.debug("Query = " + query)
        tokens = query.split()
        for token in tokens:
            normalized_word = get_normalized_word(token)
            normalized_query += normalized_word + " "
        normalized_query = normalized_query.strip()
        logger.info("NORMALIZATION DONE\n")
        return normalized_query
    except:
        logger.error(sys.exc_info()[1])
        logger.info("NORMALIZATION DONE\n")
        return query
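# Usage sketch: get_normalized_word is the repo's per-token lookup; with a
# mapping like {"wanna": "want to", "2morrow": "tomorrow"} the behaviour
# matches the docstring:
#
#   normalize("I wanna go to hyderabad 2morrow")
#   # -> "I want to go to hyderabad tomorrow"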
def normalize(word):
    '''
    (String) -> String

    Takes a noisy word as input and returns the normalized word.

    INPUT: nah, yeah
    OUTPUT: no, yes
    '''
    logger.info("ENTERING SPELL NOISY NORMALIZER MODULE")
    try:
        normalized_word = findWordByAbbreviation(word)
        return_word = ""
        if normalized_word != '':
            logger.debug("'%s' --> '%s' ", word, normalized_word)
            return_word = normalized_word
        logger.info("SPELL NORMALIZATION DONE\n")
        # An empty string tells the caller that no normalization was found
        return return_word
    except:
        logger.error(sys.exc_info()[1])
        logger.info("SPELL NORMALIZATION DONE\n")
        return word
def correctAbbreviation(query):
    '''
    (String) -> String

    Takes a query with abbreviations and resolves them into their
    counterparts.

    INPUT: I want to travel from hyd to blr on 2 oct.
    OUTPUT: I want to travel from Hyderabad to Bangalore on 2 October.
    '''
    abbreviated_query = ""
    logger.info("ENTERING ABBREVIATION CORRECTION MODULE")
    try:
        logger.debug("Query = " + query)
        tokens = query.split()
        for token in tokens:
            expanded_word = expandWord(token)
            abbreviated_query += expanded_word + " "
        abbreviated_query = abbreviated_query.strip()
        logger.info("ABBREVIATION CORRECTION DONE\n")
        return abbreviated_query
    except:
        logger.error(sys.exc_info()[1])
        logger.info("ABBREVIATION CORRECTION DONE\n")
        return query
def updateRangeInNER(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and updates the values of NERs which are
    marked as exact but are actually range values, e.g. before 27th October.

    INPUT: less than Rs 10,000, around 4 p.m., mumbai
    OUTPUT: {"exact":null, "range":{"max":10000,"min":0}},
            {"exact":null, "range":{"max":16,"min":0}}, no
    '''
    logger.info("ENTERING RANGE HANDLER MODULE")
    for key in extendedNERAnalyzedParse.keys():
        token = extendedNERAnalyzedParse[key]["word"]
        matched_range_identifiers = getRangeIdentifier(token)
        for range_identifier in matched_range_identifiers:
            word = range_identifier[0]
            category = range_identifier[1]
            if matchLongestString(extendedNERAnalyzedParse, key, word):
                # Get NER key => where the identifier ends
                word_length = len(word.split())
                ner_key = key + word_length
                if ner_key in extendedNERAnalyzedParse.keys():
                    ner_token = extendedNERAnalyzedParse[ner_key]['NER']
                    if ner_token != "0":
                        convertToRange(extendedNERAnalyzedParse, ner_key,
                                       category)
                        logger.debug("Range Identified = '%s' '%s'",
                                     ner_token, category)
    logger.info("RANGE HANDLER DONE")
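# Hedged sketch of what convertToRange is expected to do per category (the
# real helper lives elsewhere in the repo): an exact value becomes an open
# range bounded on one side.
def _convert_to_range_sketch(normalized_ner, category):
    value = normalized_ner
    if category == "max":      # "less than", "before", ...
        return {"exact": "", "range": {"min": "0", "max": value}}
    if category == "min":      # "more than", "after", ...
        return {"exact": "", "range": {"min": value, "max": ""}}
    return {"exact": value, "range": {"min": "", "max": ""}}

# _convert_to_range_sketch("10000", "max")
# -> {"exact": "", "range": {"min": "0", "max": "10000"}}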
def getNumResults(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and annotates the number of flight
    results to display.

    INPUT: 2 flights, 3 cheapest flights
    OUTPUT: 2, 3
    '''
    logger.info("ENTERING NUM RESULTS IDENTIFICATION MODULE")
    resultsList = findNumberToken(extendedNERAnalyzedParse)
    if len(resultsList) > 0:
        for key in extendedNERAnalyzedParse.keys():
            word = extendedNERAnalyzedParse[key]["word"]
            lemma = extendedNERAnalyzedParse[key]["lemma"]
            result_word = isResultItem(lemma)
            if result_word:
                index = compareResultList(key, resultsList)
                if index:
                    indexWord = extendedNERAnalyzedParse[index]["word"]
                    indexWordValue = extendedNERAnalyzedParse[index]["NormalizedNER"]
                    annotateParse(extendedNERAnalyzedParse, index, indexWord,
                                  indexWordValue, numResultMarker)
                    logger.debug(
                        "Num Results Identified = '%s' from word = '%s'",
                        indexWordValue, result_word)
    logger.info("NUM RESULTS IDENTIFICATION DONE")
def completeDate(query):
    '''
    (String) -> String

    Takes a query with incomplete dates (missing month) and fills the month
    in using Server Time. Also strips ordinal suffixes ('th' etc.) so the
    term becomes a plain number for the gap filling module.

    INPUT: I will be travelling on 25th.
    OUTPUT: I will be travelling on 25 October.
    '''
    logger.info("ENTERING GAP FILLER MODULE")
    try:
        logger.debug("Query = " + query)
        gap_filled_query = ""
        tokens = query.split()
        new_tokens = []
        for index in range(0, len(tokens)):
            token = tokens[index]
            date = isDateToken(token)
            if date:
                new_tokens.append(str(date))
                if not isMonthSpecified(index, tokens):
                    month = getComingMonth(date)
                    new_tokens.append(month)
            else:
                new_tokens.append(token)
        gap_filled_query = " ".join(new_tokens)
        logger.info("GAP FILLING DONE\n")
        return gap_filled_query
    except:
        logger.error(sys.exc_info()[1])
        logger.info("GAP FILLING DONE\n")
        return query
def splitNumberString(query):
    '''
    (String) -> String

    Takes a query where multiple words are clubbed together in a single
    token and separates such tokens into multiple words.

    INPUT: My budget is Rs.50 and extra luggage 10-15kgs.
    OUTPUT: My budget is Rs 50 and extra luggage 10 - 15 kgs.

    Cases to handle:
        Rs.50        -> Rs 50
        Rs50         -> Rs 50
        10-15kgs     -> 10 - 15 Kgs
        10Kgs-15kgs  -> 10 Kgs - 15 Kgs
        10Kg.        -> 10 Kg
        10.1         -> 10.1
        10-12-2015   -> 10-12-2015
        10.12.2015   -> 10.12.2015
        END.         -> END .
        one-way      -> one way
        1-tier       -> 1 tier
        4:00am       -> 4:00 am
        going.I      -> going . I
        // Handle ticket/pnr no. and don't split them

    Rules (in order):
        1. Split '-' ---> 10-15 -> 10 - 15; if tier, way -> remove '-';
           handle date case.
        2. Case '.': (i) two numbers: do nothing, (ii) two words: split,
           (iii) one word-one num: split and remove '.'.
        3. Split NUM and String. If last char == '.': if word in dict ->
           remove '.', else full stop. If split == 'nd' (for date),
           delete token.
    '''
    splitted_query = ""
    logger.info("ENTERING SPLITTER MODULE")
    try:
        logger.debug("Query = " + query)
        tokens = query.split()
        for token in tokens:
            splitted_word = split_word(token)
            splitted_query += splitted_word + " "
        splitted_query = splitted_query.strip()
        logger.info("SPLITTING DONE\n")
        return splitted_query
    except:
        logger.error(sys.exc_info()[1])
        logger.info("SPLITTING DONE\n")
        return query
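# Minimal regex sketch of rule 3 above (splitting NUM from String). The
# repo's split_word also handles '-', '.', dates and PNR numbers, which this
# toy version deliberately omits:
import re

def _split_num_string_sketch(token):
    # Insert a space at every digit<->letter boundary
    return re.sub(r"(?<=\d)(?=[A-Za-z])|(?<=[A-Za-z])(?=\d)", " ", token)

# _split_num_string_sketch("Rs50")   -> "Rs 50"
# _split_num_string_sketch("10kgs")  -> "10 kgs"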
def unigramSpellCheck(query, PWLdict):
    '''
    (String) -> String

    Takes a noisy query with ungrammatical/Out of Vocab words as input and
    returns the spell corrected query.

    INPUT: I want to buk a flight from hydrabad to banglore.
    OUTPUT: I want to book a flight from Hyderabad to Bangalore.
    '''
    logger.info("ENTERING SPELL CHECKER MODULE")
    try:
        logger.debug("Query = " + query)
        word_list = nltk.word_tokenize(query)
        pos_list = nltk.pos_tag(word_list)
        replacerDict = SpellingReplacer()
        replacerPWL = SpellingReplacer(PWLdict)
        checked_list = []
        for item in pos_list:
            word = item[0]
            pos = item[1]
            # Collapse character repetitions, e.g. "pleeease" -> "please"
            truncate_word = re.sub(r'(.)\1+', r'\1', word)
            normalized_word = normalize(truncate_word)
            # If word is a special char, don't spell check it
            if re.match("([^\w@#])", word):
                checked_list.append(word)
            elif normalized_word:
                checked_list.append(normalized_word)
            elif replacerPWL.check(truncate_word):
                correctedWord = truncate_word.title()
                checked_list.append(correctedWord)
            elif not replacerDict.check(word):
                correctedWord = ""
                dist = 100
                # Do not replace words from PWL if len(word) <= 3
                if len(truncate_word) > 3:
                    correctedWordPWL = replacerPWL.replace(truncate_word)
                    distPWL = edit_distance(truncate_word, correctedWordPWL)
                else:
                    distPWL = dist
                    correctedWordPWL = truncate_word
                correctedWordDict = replacerDict.replace(word)
                distDict = edit_distance(word, correctedWordDict)
                # Prefer the PWL candidate only if it is strictly closer
                if distPWL > distDict or correctedWordPWL == truncate_word:
                    correctedWord = correctedWordDict
                else:
                    correctedWord = correctedWordPWL.title()
                if correctedWord == "":
                    correctedWord = word
                else:
                    logger.debug("'%s' --> '%s' ", word, correctedWord)
                checked_list.append(correctedWord)
            else:
                checked_list.append(word)
        spell_checked_query = " ".join(checked_list)
        logger.info("SPELL CORRECTION DONE\n")
        return spell_checked_query
    except:
        logger.error(sys.exc_info()[1])
        logger.info("SPELL CORRECTION DONE\n")
        return query
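# Usage sketch: PWLdict is the path to a personal word list file (see
# PWL_FILE in travelNLP below) that supplies domain words such as city
# names, so "hydrabad" can resolve to "Hyderabad" rather than to the nearest
# dictionary word:
#
#   unigramSpellCheck("I want to buk a flight from hydrabad", PWL_FILE)
#   # -> "I want to book a flight from Hyderabad" (given a suitable PWL)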
def fillArguments(query):
    '''
    (String) -> String

    Takes a query with range items missing units, and returns the gaps
    filled with units.

    INPUT: I will be travelling between 23rd and 25th October.
    OUTPUT: I will be travelling between 23rd October and 25th October.

    Cases to handle:
        10 - 15 Kilograms   -> 10 Kilograms - 15 Kilograms
        10 to 15 Kilograms  -> 10 Kilograms to 15 Kilograms
        10 and 15 Kilograms -> 10 Kilograms and 15 Kilograms
        rupees 10 - 15      -> rupees 10 - rupees 15
        rupees 10 to 15     -> rupees 10 to rupees 15
        rupees 10 and 15    -> rupees 10 and rupees 15
    '''
    logger.info("ENTERING GAP FILLER MODULE")
    logger.debug("Query = " + query)
    gap_filled_query = ""
    tokens = query.split()
    numberedTokens = getNumberTokens(tokens)
    to_insert = {}
    for index in range(1, len(numberedTokens)):
        # Two numbers separated by exactly one token, e.g. "10 to 15"
        if numberedTokens[index] - numberedTokens[index - 1] == 2:
            if isRangeIdentifier(tokens[numberedTokens[index] - 1]):
                boundary = range(0, len(tokens))
                if (numberedTokens[index - 1] - 1) in boundary:
                    prev_word = tokens[numberedTokens[index - 1] - 1]
                    if isUnit(prev_word):
                        # Copy unit to query
                        to_insert[numberedTokens[index]] = prev_word
                if (numberedTokens[index] + 1) in boundary:
                    next_word = tokens[numberedTokens[index] + 1]
                    if isUnit(next_word):
                        # Copy unit to query
                        to_insert[numberedTokens[index - 1] + 1] = next_word
    gap_filled_tokens = []
    # Dict keys are unordered; sort so insertions happen left to right
    insert_keys = sorted(to_insert.keys())
    insert_length = len(insert_keys)
    count = 0
    for index in range(0, len(tokens)):
        if count < insert_length and index == insert_keys[count]:
            gap_filled_tokens.append(to_insert[index])
            logger.debug("Inserted '%s' at index = %d", to_insert[index],
                         index)
            count += 1
        gap_filled_tokens.append(tokens[index])
    gap_filled_query = ' '.join(gap_filled_tokens)
    logger.info("GAP FILLING DONE\n")
    return gap_filled_query
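# Worked example of the unit-copying logic above, assuming the repo's
# isUnit/isRangeIdentifier helpers recognise "rupees" and "to":
#
#   fillArguments("my budget is rupees 4000 to 6000")
#   # numbers at indices 4 and 6, "to" between them, unit "rupees" before
#   # the first number -> copied in front of the second:
#   # -> "my budget is rupees 4000 to rupees 6000"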
def travelNLP(query, category, last_requested_DF):
    '''
    (String, String, String) -> Object

    Takes the input query, category and last requested DF and annotates the
    NERs in the query.

    INPUT: ('hyd to blr', 'travel', 'source')
    OUTPUT: {source: HYD, destination: BLR}
    '''
    allExtendedNerDF = {}
    logger.info("ENTERING TRAVEL MODULE")
    try:
        logger.debug(query + " " + last_requested_DF + "\n")
        query = query.lower()
        noiseRemovedQuery = preprocessing_tools.noisy_word_normalizer.normalizer.normalize(
            query)
        print "Normalize = ", noiseRemovedQuery
        logger.debug("Normalize = " + noiseRemovedQuery)
        splittedQuery = preprocessing_tools.number_string_splitter.number_string_splitter.splitNumberString(
            noiseRemovedQuery)
        print "Splitted = ", splittedQuery
        logger.debug("Splitted = " + splittedQuery)
        abbreviatedQuery = preprocessing_tools.abbreviation_checker.abbreviation_corrector.correctAbbreviation(
            splittedQuery)
        print "Abbreviated = ", abbreviatedQuery
        logger.debug("Abbreviated = " + abbreviatedQuery)
        spellCheckedQuery = preprocessing_tools.spell_checker.spell_checker.spellCheck(
            abbreviatedQuery, PWL_FILE)
        print "Spellchecked = ", spellCheckedQuery
        logger.debug("Spellchecked = " + spellCheckedQuery)
        monthFilledQuery = preprocessing_tools.month_filler.month_filler.completeDate(
            spellCheckedQuery)
        print "MonthFilledQuery = ", monthFilledQuery
        logger.debug("MonthFilledQuery = " + monthFilledQuery)
        gapFilledQuery = preprocessing_tools.argument_filler.argument_filler.fillArguments(
            monthFilledQuery)
        print "GapFilledQuery = ", gapFilledQuery
        logger.debug("GapFilledQuery = " + gapFilledQuery)
        normalizedQuery = gapFilledQuery
        print "Final Normalized Query = ", gapFilledQuery
        print
        logger.debug("Final Normalized Query = " + gapFilledQuery)
        NERAnalyzedParse, chunkedParse = preprocessing_tools.corenlp.corenlp.identifyNER(
            normalizedQuery)
        print "NER Parse = ", NERAnalyzedParse
        print "Chunking = ", chunkedParse
        for index in range(0, len(chunkedParse)):
            extendedNerDF = preprocessing_tools.extended_ner.travel.travel_extended_ner.identifyExtendedNER(
                normalizedQuery, category, NERAnalyzedParse[index],
                last_requested_DF)
            disambiguatedDF = preprocessing_tools.category_disambiguator.category_disambiguator.disambiguateCategories(
                normalizedQuery, category, NERAnalyzedParse[index],
                chunkedParse[index], last_requested_DF)
            singleExtendedNerDF = preprocessing_tools.category_disambiguator.category_disambiguator.mergeDictionaries(
                extendedNerDF, disambiguatedDF)
            allExtendedNerDF = mergeDictionaries(allExtendedNerDF,
                                                 singleExtendedNerDF)
        if "0" in allExtendedNerDF.keys():
            del allExtendedNerDF["0"]
        print "Final Analyzed NERs = ", allExtendedNerDF
    except:
        logger.error(sys.exc_info()[1])
    finally:
        logger.info("LEAVING TRAVEL MODULE")
    return allExtendedNerDF
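# End-to-end usage sketch. Module-level configuration (PWL_FILE, the
# preprocessing_tools package, a running CoreNLP server) is assumed to be
# set up as elsewhere in the repo; the exact output keys depend on the DF
# markers:
#
#   travelNLP("hyd to blr on 25th", "travel", "source")
#   # -> e.g. {"SOURCE": "HYD", "DESTINATION": "BLR", "DATE": "2015-10-25"}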
def normalizeTime(extendedNERAnalyzedParse):
    '''
    (Object) -> Object

    Takes the NER Parse as input and converts Stanford SUTime duration and
    date convention to our convention.

    INPUT: PT2H, 2015-09-28-WXX-1
    OUTPUT: 2, 2015-10-05
    '''
    logger.info("ENTERING TIME NORMALIZATION IDENTIFICATION MODULE")
    for key in extendedNERAnalyzedParse.keys():
        ner = extendedNERAnalyzedParse[key]["NER"].lower()
        normalizedValue = extendedNERAnalyzedParse[key]["NormalizedNER"]
        if ner in time_NER:
            if ner == "duration":
                value = ""
                range_dict = {"exact": "", "range": {"min": "", "max": ""}}
                # Drop SUTime comparison markers like ">PT2H" / "<PT2H"
                normalizedValue = re.sub("\>|\<", r"", normalizedValue)
                if normalizedValue[0:2] == "PT":
                    # Sub-day durations, e.g. "PT2H": strip the "PT" prefix
                    # (slicing from the end breaks for two-digit amounts
                    # such as "PT12H")
                    value = normalizedValue[2:]
                elif normalizedValue[0:1] == "P":
                    # Day-or-longer durations, e.g. "P2D"
                    value = normalizedValue[1:]
                unit = value[-1:]
                amount = value[0:-1]
                if isNumber(amount):
                    amount = int(amount)
                    hours = calculateTime(unit, amount)
                    range_dict["exact"] = hours
                    extendedNERAnalyzedParse[key]["NormalizedNER"] = range_dict
            elif ner == "date":
                tokens = normalizedValue.split("-")
                length_token = len(tokens)
                if length_token >= 3 and tokens[2][0:2] == "WE":
                    logger.debug("Weekend Identified")
                elif length_token > 4 and tokens[3][0:1] == "W":
                    # Week-relative date, e.g. "2015-09-28-WXX-1" == Monday
                    offset = tokens[4]
                    addWeek = 0
                    if isNumber(offset):
                        offset = int(offset)
                        today = datetime.datetime.today().weekday() + 1
                        # If the requested weekday has already passed this
                        # week, roll over to next week
                        if today > offset:
                            addWeek = 1
                        week_date = calculateDate(tokens, addWeek)
                        extendedNERAnalyzedParse[key]["NormalizedNER"] = \
                            week_date.strftime("%Y-%m-%d")
    logger.info("TIME NORMALIZATION DONE")
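# Hedged sketch of the duration path above, assuming only hour/day units;
# the repo's calculateTime presumably handles more:
def _sutime_duration_to_hours_sketch(value):
    value = value.lstrip("<>")
    body = value[2:] if value.startswith("PT") else value[1:]
    unit, amount = body[-1], int(body[:-1])
    return amount * 24 if unit == "D" else amount

# _sutime_duration_to_hours_sketch("PT2H") -> 2
# _sutime_duration_to_hours_sketch("P2D")  -> 48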