Code example #1
 def adj_noun_pairs_linear(self, term, pos):
     if self.last_term_pos is None:
         self.last_term_pos = (term, pos)
         return 1
     # If there is an adjacent adjective and noun in the query
     if wn.is_adjective(self.last_term_pos[1]) and wn.is_noun(pos):
         # Get term locations inside this document
         posting1 = self.vector_collection.get_term_posting_for_doc(term, self.doc.id)
         posting2 = self.vector_collection.get_term_posting_for_doc(self.last_term_pos[0], self.doc.id)
         # Determine if both terms appear in the document
         if posting1 is None or posting2 is None:
             return 1
         # Boost if ADJ and NOUN appear in same sentence and same order as query
         sentences = zip(posting1.sentence, posting2.sentence)
         for idx, s1_s2 in enumerate(sentences):
             s1 = s1_s2[0]
             s2 = s1_s2[1]
             if s1 == s2 and posting1.offsets[idx] > posting2.offsets[idx]:
                 m = self.adj_noun_pairs_m
                 b = self.adj_noun_pairs_b
                 idxNn = posting1.offsets[idx]
                 idxAdj = posting2.offsets[idx]
                 # print("Adj:" + str(self.last_term_pos[0]) + " Nn:" + term + " y:" + str(max(m * (idxNn-idxAdj) + (b-m), 1)) + " ___ ")
                 return max(m * (idxNn-idxAdj) + (b-m), 1)
     return 1
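A minimal standalone sketch (not from the project) of the linear boost returned above: with hypothetical slope m and intercept b (the method reads them from adj_noun_pairs_m and adj_noun_pairs_b), the boost equals b when the noun directly follows the adjective, changes by m per extra word of distance, and is floored at 1.

# Standalone illustration with assumed values; the real m and b come from configuration.
def linear_pair_boost(noun_offset, adj_offset, m=-0.5, b=3.0):
    distance = noun_offset - adj_offset
    return max(m * distance + (b - m), 1)

for d in (1, 2, 3, 10):
    print(d, linear_pair_boost(noun_offset=d, adj_offset=0))
# -> 1 3.0, 2 2.5, 3 2.0, 10 1 (floored at 1)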
Code example #2
File: lang.py Project: braynebuddy/PyBrayne
def get_verbs(tree):
    """
    Find all of the possible verbs in a parse tree. Return them in order of likelihood
    if possible
    """
    res = []
    verbs = ['VP', 'BE', 'BEZ', 'HV']
    # First, we'll simply look for the parsed verb(s) in 'tree'
    for phrase in tree:
        try:
            phrase.node
        except AttributeError:
            # it's not a node, so treat it like a tuple
            if phrase[1] in verbs:
                res.append(phrase[0])
        else:
            # phrase.node is defined, so see if it's a verb
            if phrase.node in verbs:
                s = ''
                for word in phrase.leaves():
                    s = s + word[0] + ' '
                res.append(s.strip())
    # Now look for other words in 'tree' that could be verbs
    for phrase in tree:
        try:
            phrase.leaves()
        except AttributeError:
            # it's not a node, so treat it like a tuple
            words = WordNet.wordinfo(phrase[0])
            for w in words:
                if w[1].lower() == 'v':
                    res.append(w[0])
        else:
            # phrase.leaves() is defined, so see if it might be a verb
            for word in phrase.leaves():
                words = WordNet.wordinfo(word[0])
                for w in words:
                    if w[1].lower() == 'v':
                        res.append(w[0])
    # Magic return line removes duplicates (from  http://code.activestate.com/recipes/52560)!
    #return [ u for u in res if u not in locals()['_[1]'] ]
    return unique(res)
Code example #3
 def adv_verb_pairs(self, term, pos):
     if self.last_term_pos is None:
         self.last_term_pos = (term, pos)
         return False
     # If there is an adjacent adverb and verb in the query
     if wn.is_adverb(self.last_term_pos[1]) and wn.is_verb(pos):
         # Get term locations inside this document
         posting1 = self.vector_collection.get_term_posting_for_doc(term, self.doc.id)
         posting2 = self.vector_collection.get_term_posting_for_doc(self.last_term_pos[0], self.doc.id)
         # Determine if both terms appear in the document
         if posting1 is None or posting2 is None:
             return False
         # Boost if ADV and VERB appear in same sentence and same order as query
         sentences = zip(posting1.sentence, posting2.sentence)
         for idx, s1_s2 in enumerate(sentences):
             s1 = s1_s2[0]
             s2 = s1_s2[1]
             if s1 == s2 and posting1.offsets[idx] > posting2.offsets[idx]:
                 return True
     return False
Code example #4
 def adj_noun_2gram(self, term, pos):
     if self.last_term_pos is None:
         self.last_term_pos = (term, pos)
         return False
     # If there is an adjacent adjective and noun in the query
     if wn.is_adjective(self.last_term_pos[1]) and wn.is_noun(pos):
         # Get term locations inside this document
         posting1 = self.vector_collection.get_term_posting_for_doc(term, self.doc.id)
         posting2 = self.vector_collection.get_term_posting_for_doc(self.last_term_pos[0], self.doc.id)
         # Determine if both terms appear in the document
         if posting1 is None or posting2 is None:
             return False
         # Boost if ADJ and NOUN appear in same sentence and right after each other as they do in the query
         sentences = zip(posting1.sentence, posting2.sentence)
         for idx, s1_s2 in enumerate(sentences):
             s1 = s1_s2[0]
             s2 = s1_s2[1]
             if s1 == s2 and ((posting1.offsets[idx] - posting2.offsets[idx]) == 1):
                 return True
     return False
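The pair heuristics above differ only in their offset test: adv_verb_pairs accepts any same-sentence occurrence in query order, adj_noun_pairs_linear scales its boost with the distance between the two words, and adj_noun_2gram requires the noun to follow the adjective immediately. A standalone sketch (not from the project) of that strictest check, written against plain lists instead of the posting objects:

# Each occurrence i of a term is described by a sentence id and a word offset.
def is_adjacent_pair(noun_sentences, noun_offsets, adj_sentences, adj_offsets):
    """True if the noun appears directly after the adjective in some sentence."""
    for i, (s_noun, s_adj) in enumerate(zip(noun_sentences, adj_sentences)):
        if s_noun == s_adj and noun_offsets[i] - adj_offsets[i] == 1:
            return True
    return False

# "red car": adjective at offset 4, noun at offset 5, both in sentence 2
print(is_adjacent_pair([2], [5], [2], [4]))  # True
print(is_adjacent_pair([2], [7], [2], [4]))  # False (two words apart)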
Code example #5
    def execute(self) -> float:
        okapi_sum = 0
        terms = []

        # Need to know idf scores ahead of time
        if self.is_sub_idf_top or self.is_w2v_sub_idf_top or self.is_sub_api_idf_top:
            is_top_idf_map = self.calc_idfs(self.sub_idf_top, top=True)
        if self.is_sub_idf_bottom or self.is_w2v_sub_idf_bottom or self.is_sub_api_idf_bottom:
            is_bottom_idf_map = self.calc_idfs(self.sub_idf_bottom, top=False)

        # Traverse query terms in order of how they appear
        # Do not want to double score the same term
        for term, pos, subs in zip(self.query.terms, self.query.terms_pos, self.query.terms_sub):
            product = 0
            if term not in terms:
                product = self.okapi(term)

            boosts = []  # Independent collection of boosts
            sub_boosts = []  # Substitution boosts

            if self.is_early:
                boosts.append(boost(product, self.early_term(term)))
            if self.is_early_noun:
                if wn.is_noun(pos):
                    boosts.append(boost(product, self.early_term_noun(term)))
            if self.is_early_verb:
                if wn.is_verb(pos):
                    boosts.append(boost(product, self.early_term_verb(term)))
            if self.is_early_adj:
                if wn.is_adjective(pos):
                    boosts.append(boost(product, self.early_term_adj(term)))
            if self.is_early_adv:
                if wn.is_adverb(pos):
                    boosts.append(boost(product, self.early_term_adv(term)))
            if self.is_early_noun_adj: # Boost both adjectives and nouns
                if wn.is_noun(pos) or wn.is_adjective(pos):
                    boosts.append(boost(product, self.early_term(term)))
            if self.is_early_verb_adv: # Boost both verbs and adv
                if wn.is_verb(pos) or wn.is_adverb(pos):
                    boosts.append(boost(product, self.early_term(term)))
            if self.is_early_not_noun:
                if not wn.is_noun(pos):
                    boosts.append(boost(product, self.early_term(term)))
            if self.is_early_not_verb:
                if not wn.is_verb(pos):
                    boosts.append(boost(product, self.early_term(term)))
            if self.is_early_not_adj:
                if not wn.is_adjective(pos):
                    boosts.append(boost(product, self.early_term(term)))
            if self.is_early_not_adv:
                if not wn.is_adverb(pos):
                    boosts.append(boost(product, self.early_term(term)))
            if self.is_early_not_verb_adv:
                if not wn.is_verb(pos) and not wn.is_adverb(pos):
                    boosts.append(boost(product, self.early_term(term)))
            if self.is_early_not_noun_adj:
                if not wn.is_noun(pos) and not wn.is_adjective(pos):
                    boosts.append(boost(product, self.early_term(term)))

            if self.is_early_q:
                boosts.append(boost(product, self.early_term_q(term)))
            if self.is_early_q_noun:
                if wn.is_noun(pos):
                    boosts.append(boost(product, self.early_term_q(term)))
            if self.is_early_q_verb:
                if wn.is_verb(pos):
                    boosts.append(boost(product, self.early_term_q(term)))

            if self.is_noun:
                if wn.is_noun(pos):
                    boosts.append(boost(product, self.noun_influence))
            if self.is_adj:
                if wn.is_adjective(pos):
                    boosts.append(boost(product, self.adj_influence))
            if self.is_verb:
                if wn.is_verb(pos):
                    boosts.append(boost(product, self.verb_influence))
            if self.is_adv:
                if wn.is_adverb(pos):
                    boosts.append(boost(product, self.adv_influence))

            if self.is_close_pairs:
                boosts.append(boost(product, self.close_pairs(term)))
                self.last_term = term
            if self.is_adj_noun_pairs:
                # If the last adjective in the query is before the current noun
                if self.adj_noun_pairs(term, pos):
                    boosts.append(boost(product, self.adj_noun_pairs_influence))
                self.last_term_pos = (term, pos)
            if self.is_adj_noun_linear_pairs:
                boosts.append(boost(product, self.adj_noun_pairs_linear(term, pos)))
                self.last_term_pos = (term, pos)
            if self.is_adv_verb_pairs:
                # If the last adverb in the query is before the current verb
                if self.adv_verb_pairs(term, pos):
                    boosts.append(boost(product, self.adv_verb_pairs_influence))
                self.last_term_pos = (term, pos)
            if self.is_adv_verb_linear_pairs:
                boosts.append(boost(product, self.adv_verb_pairs_linear(term, pos)))
                self.last_term_pos = (term, pos)

            if self.is_bigram:
                # If the last term in the query is right before the current term
                if self.bigram(term, pos):
                    boosts.append(boost(product, self.bigram_influence))
                self.last_term_pos = (term, pos)
            if self.is_adj_noun_2gram:
                # If the last adjective in the query is right before the current noun
                if self.adj_noun_2gram(term, pos):
                    boosts.append(boost(product, self.adj_noun_2gram_influence))
                self.last_term_pos = (term, pos)
            if self.is_adv_verb_2gram:
                # If the last adverb in the query is right before the current verb
                if self.adv_verb_2gram(term, pos):
                    boosts.append(boost(product, self.adv_verb_2gram_influence))
                self.last_term_pos = (term, pos)

            if self.is_sub_all:
                self.substitute(sub_boosts, subs)
            if self.is_sub_noun:
                if wn.is_noun(pos):
                    self.substitute(sub_boosts, subs)
            if self.is_sub_verb:
                if wn.is_verb(pos):
                    self.substitute(sub_boosts, subs)
            if self.is_sub_adj:
                if wn.is_adjective(pos):
                    self.substitute(sub_boosts, subs)
            if self.is_sub_adv:
                if wn.is_adverb(pos):
                    self.substitute(sub_boosts, subs)
            if self.is_sub_idf_top:  # substitute for the terms with the top idf scores
                if is_top_idf_map[term]:
                    self.substitute(sub_boosts, subs)
            if self.is_sub_idf_bottom:  # substitute for the terms with the bottom idf scores
                if is_bottom_idf_map[term]:
                    self.substitute(sub_boosts, subs)

            if self.is_sub_api_all:
                self.substitute_wn_api(sub_boosts, term)
            if self.is_sub_api_noun:
                if wn.is_noun(pos):
                    self.substitute_wn_api(sub_boosts, term)
            if self.is_sub_api_verb:
                if wn.is_verb(pos):
                    self.substitute_wn_api(sub_boosts, term)
            if self.is_sub_api_adj:
                if wn.is_adjective(pos):
                    self.substitute_wn_api(sub_boosts, term)
            if self.is_sub_api_adv:
                if wn.is_adverb(pos):
                    self.substitute_wn_api(sub_boosts, term)
            if self.is_sub_api_idf_top:
                if is_top_idf_map[term]:
                    self.substitute_wn_api(sub_boosts, term)
            if self.is_sub_api_idf_bottom:
                if is_bottom_idf_map[term]:
                    self.substitute_wn_api(sub_boosts, term)

            if self.is_w2v_sub_all:
                self.substitute_w2v(sub_boosts, term)
            if self.is_w2v_sub_noun:
                if wn.is_noun(pos):
                    self.substitute_w2v(sub_boosts, term)
            if self.is_w2v_sub_verb:
                if wn.is_verb(pos):
                    self.substitute_w2v(sub_boosts, term)
            if self.is_w2v_sub_adj:
                if wn.is_adjective(pos):
                    self.substitute_w2v(sub_boosts, term)
            if self.is_w2v_sub_adv:
                if wn.is_adverb(pos):
                    self.substitute_w2v(sub_boosts, term)
            if self.is_w2v_sub_idf_top:  # substitute for the terms with the top idf scores
                if is_top_idf_map[term]:
                    self.substitute_w2v(sub_boosts, term)
            if self.is_w2v_sub_idf_bottom:  # substitute for the terms with the bottom idf scores
                if is_bottom_idf_map[term]:
                    self.substitute_w2v(sub_boosts, term)

            if self.is_remove_adj:  # Needs to be last
                if wn.is_adjective(pos):
                    self.remove_adj_boosts = product + sum(boosts) + sum(sub_boosts)
                    self.remove_last_term_adj = term
                    terms.append(term)
                    continue
                # If we just found an adj and the next term is a noun found in both q and d
                elif wn.is_noun(pos) and self.remove_last_term_adj is not None\
                        and self.same_sentence(self.remove_last_term_adj, term):
                    boosts.append(self.remove_adj_boosts)
                    self.remove_adj_boosts = 0
                    self.remove_last_term_adj = None
                else:
                    self.remove_adj_boosts = 0
                    self.remove_last_term_adj = None
            if self.is_remove_adv:  # Needs to be last
                if wn.is_adverb(pos):
                    self.remove_adv_boosts = product + sum(boosts) + sum(sub_boosts)
                    self.remove_last_term_adv = term
                    terms.append(term)
                    continue
                # If we just found an adv and the next term is a verb found in both q and d
                elif wn.is_verb(pos) and self.remove_last_term_adv is not None\
                        and self.same_sentence(self.remove_last_term_adv, term):
                    boosts.append(self.remove_adv_boosts)
                    self.remove_adv_boosts = 0
                    self.remove_last_term_adv = None
                else:
                    self.remove_adv_boosts = 0
                    self.remove_last_term_adv = None

            terms.append(term)
            okapi_sum += product + sum(boosts) + sum(sub_boosts)
        return okapi_sum
Code example #6
# Unmodified Cosine:                                                MAP=0.5302462663875682
# Unmodified Okapi:                                                 MAP=0.5353128722733899
# is_early_noun_adj I=2.4                                           MAP=0.541351091646442
# is_early_noun_adj I=2.4, is_adj_noun_linear_pairs b=1.5           MAP=0.541373479735338


import math
import sys
import DocumentVector
import QueryVector
import VectorCollection
import WordNet as wn
from Word2Vec import Word2Vec


wordnet = wn.WordNet()


class DistanceFunction:
    def __init__(self, vector_collection: VectorCollection):
        self.vector_collection = vector_collection
        self.query = None
        self.doc = None

    def set_query(self, query_tv: QueryVector):
        self.query = query_tv

    def set_doc(self, doc_tv: DocumentVector):
        self.doc = doc_tv
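A hypothetical usage sketch (not from the project) of how a DistanceFunction-style scorer would typically be driven, assuming a concrete subclass that implements the execute() method shown in code example #5:

def rank_documents(scorer, query_vector, document_vectors):
    # Score every document for one query and return (score, doc) pairs, best first.
    scorer.set_query(query_vector)
    scored = []
    for doc_vector in document_vectors:
        scorer.set_doc(doc_vector)
        scored.append((scorer.execute(), doc_vector))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored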

Code example #7
File: choicesNHS.py Project: damodamr/ac-webServies
def readFile():
    input_file = open(
        "C:\\Users\\Sergio\\Dropbox\\QMUL\\Data\\choicesNHS\\nhsChoices.txt",
        "r")
    #input_file = open("C:\\Users\\Sergio\\Dropbox\\QMUL\\Data\\choicesNHS\\nhsChoicesDiagnosis.txt", "r")
    #input_file = open("C:\\Users\\Sergio\\Dropbox\\QMUL\\Data\\choicesNHS\\nhsChoicesDiabetesWhole.txt", "r")
    lines = input_file.readlines()
    input_file.close()

    annotationsX = []
    annotationsSLR = []
    annotationsNER = []

    for x in lines:

        annotationX = x
        annotationSLR = annotator.getAnnotations(x, dep_parse=True)['srl']
        #annotationNER = annotator.getAnnotations(x,dep_parse=True)['ner']
        annotationsX.append(annotationX)
        annotationsSLR.append(annotationSLR)
        #annotationsNER.append(annotationNER)

    size = len(annotationsSLR)
    print size

    A0 = 0
    A1 = 0
    pbroles = []
    annotationsA0 = []
    annotationsA1 = []

    for an in range(5):
        print annotationsX[an]
        print annotationsSLR[an]
        sizeIn = len(annotationsSLR[an])
        #print sizeIn
        for an2 in range(sizeIn):

            print "--------------------------------------------------------------------------------------------------------"

            print annotationsSLR[an][an2]["V"]
            w = Word(annotationsSLR[an][an2]["V"]).lemmatize("v")
            #print w
            #print wn.synset(w+'.v.01')

            try:
                for role in propbank.roleset(w + '.01').findall("roles/role"):
                    print(role.attrib['f'], role.attrib['n'],
                          role.attrib['descr'])
                    pbroles.append(role.attrib['descr'])
                #for role in propbank.roleset(w+'.01').findall("aliases/alias"):
                #print(role.attrib['framenet'], role.attrib['pos'], role.attrib['verbnet'])
            except:
                pass

            try:
                print(
                    wn.lemma(w + '.v.01.' + w).derivationally_related_forms())
            except:
                pass

            if "A0" in annotationsSLR[an][an2]:
                print annotationsSLR[an][an2]["A0"]
                A0 = annotationsSLR[an][an2]["A0"]
                #try:
                #A0 = TextBlob(A0, np_extractor=extractor)
                #A0 = A0.noun_phrases[0]
                #print A0
                #except:
                #pass
                try:
                    annotationsA0 = WordNet.spotlightSearch(A0)
                    annotationsA0 = annotationsA0[0].get('URI')
                except:
                    annotationsA0 = "unknown"
                    pass

            if "A1" in annotationsSLR[an][an2]:
                print annotationsSLR[an][an2]["A1"]
                A1 = annotationsSLR[an][an2]["A1"]
                #try:
                #A1 = TextBlob(A1, np_extractor=extractor)
                #A1 = A1.noun_phrases[0]
                #print A1
                #except:
                #pass
                try:
                    annotationsA1 = WordNet.spotlightSearch(A1)
                    annotationsA1 = annotationsA1[0].get('URI')
                except:
                    annotationsA1 = "unknown"
                    pass

            print pbroles

            print "--------------------------------------------------------------------------------------------------------"

            CreateGraphNeo4J.createGraph(w, A0, A1, pbroles, annotationsA0,
                                         annotationsA1)
            del pbroles[:]
            annotationsA0 = []
            annotationsA1 = []
            A0 = 0
            A1 = 0
Code example #8
File: lang.py Project: braynebuddy/PyBrayne
def define(word):
    res = []
    parsed_words = WordNet.wordinfo(word)
    for entry in parsed_words:
        res.append("%s (%s) [%s] %s" % (entry[0], entry[1], entry[2].name, entry[4]))
    return res
Code example #9
File: lang.py Project: braynebuddy/PyBrayne
def qparse1(sentence):
    words = nltk.wordpunct_tokenize(sentence)
    #wdict = {'Sentence': words}
    wdict = {}
    pos = SpecialWords()

    # translation key for WordNet POS tags
    WN_part_name = {'n':'N','v':'V','a':'Adj','s':'Adj','r':'Adv'}
    BR_part_name = {}
    

    # Look up the POS of the words
    for w in words:
        # get special POS
        for p in pos.keys():
            if (w in pos[p]) or (w.lower() in pos[p]): 
                if wdict.has_key(w):
                    if not (p in wdict[w]):
                        wdict[w].append(p)
                else:
                    wdict[w] = [p]
        # get WordNet POS
        parts = WordNet.getPOS(w)
        for p in parts:
            pn = WN_part_name[p]
            if wdict.has_key(w):
                if not (pn in wdict[w]):
                    wdict[w].append(pn)
            else:
                wdict[w] = [pn]

    print str(wdict)
    
    chunks = {'NP':[], 'VP':[], 'Unknown':[]}

    # Chunk up the NPs going from left to right using these rules:
    # NP = Det-N, (Adj)-N, Det-(Adj)-N, PropN, ProN, N
    NP = []
    newNP = ['PropN', 'ProN', 'N', 'Det', 'Adj']
    nextNP = []
    # VP / Unknown state was missing from this snippet; assumed initializations
    VP = []
    newVP = ['V', 'Adv']
    nextVP = []
    Unknown = []

    for w in words:
        print
        print 'chunks =', str(chunks)
        print 'NP: Processing "%s" ...' % w
        looking = True  # set False once the word has been chunked

        # Try to add the word to the current NP chunk
        for p in nextNP:
            if p in wdict[w]:
                looking = False
                NP.append((w,p)) # Add word as a tuple
                if (p == 'Det') or (p == 'Adj'):
                    nextNP = ['N', 'Adj']
                else:
                    nextNP = []
        #if looking:
            
        # Try to add the word to the current VP chunk
        #if looking:
        for p in nextVP:
            if p in wdict[w]:
                looking = False
                VP.append(w)
                nextVP = []            
                break            
        
        # Try to start a new NP chunk
        #if looking:
        for p in newNP:
            if p in wdict[w]:
                looking = False
                # Save NP/VP if necessary     
                if len(NP)>0:
                    chunks['NP'].append(NP)
                #if len(VP)>0:
                #    chunks['VP'].append(VP)
                #VP = []
                #nextVP = []
                NP = [w]
                if (p == 'Det') or (p == 'Adj'):
                    nextNP = ['N', 'Adj']
                else:
                    nextNP = []            
                break            
                    
        # Try to start a new VP chunk
        #if looking:
        for p in newVP:
            if p in wdict[w]:
                looking = False
                # Save NP/VP if necessary     
                if len(nextVP)==0 and len(VP)>0:
                    chunks['VP'].append(VP)
                #if len(NP)>0:
                #    chunks['NP'].append(NP)
                #NP = []
                #nextNP = []
                VP = [w]
                if p == 'Adv':
                    nextVP = ['V']
                else:
                    nextVP = ['Adv']
                break            
    
        # Put it in unknown
        if looking:
            Unknown.append(w)
            nextNP = []
            nextVP = []            
            # Save and reset NP/VP if necessary     
            if len(NP)>0:
                chunks['NP'].append(NP)
                NP = []
            if len(VP)>0:
                chunks['VP'].append(VP)
                VP = []

        print 'NP =', str(NP)
        print 'VP =', str(VP)
        print 'Unknown =', str(Unknown)

        print 'chunks =', str(chunks)

    if len(NP)>0:
        chunks['NP'].append(NP)
    if len(VP)>0:
        chunks['VP'].append(VP)

    return chunks
Code example #10
#annotator.getAnnotations(wikipedia.summary("London", sentences=1))['chunk']

#word = wn.synset(searchFor+'.n.01')
#print word.hypernyms()

annotator = Annotator()
result = re.sub('\(.*?\)', "", wiki.string)
result = re.sub("\/.+?\/", "", result)
#print result
dep_parse = annotator.getAnnotations(result, dep_parse=True)['dep_parse']
dp_list = dep_parse.split('\n')
#print dp_list

spotlightTerms = WordNet.spotlightSearch(
    "London is the capital and most populous city of England and the United Kingdom."
)
print dp_list


def dpbediaQuery(query):
    #test = json.load(urllib2.urlopen("http://www.freesound.org/apiv2/search/text/?query=" + term + "&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
    #test2 = json.load(urllib2.urlopen("https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name=" + term))
    #pprint(test)
    #pprint(test2)
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery("""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
        PREFIX dbres: <http://dbpedia.org/resource/>
Code example #11
def process_question(question):

    all_businesses = read_business_file()

    #Finds the business name
    name_from_question = extract_super_key_value(question, business_names)

    extracted_business = extract_business_dictionaries(all_businesses, NAME,
                                                       name_from_question)
    print "Length after filtering by names: ", len(extracted_business)
    question_without_names = remove_super_key_values(question,
                                                     name_from_question)

    #Finds the neighborhood
    neighborhood_from_question = extract_super_key_value(
        question_without_names, business_neighborhood)
    #print "neighborhood", neighborhood_from_question

    extracted_business = extract_business_dictionaries(
        extracted_business, NEIGHBORHOOD, neighborhood_from_question)
    print "Len after filtering by neighborhood: ", len(extracted_business)
    question_without_neighborhood = remove_super_key_values(
        question_without_names, neighborhood_from_question)

    clean_question = remove_stopwords_punctuations(
        question_without_neighborhood)
    #print clean_question
    similarity_index = WordNet.tuning(clean_question,
                                      business_categories,
                                      type_of_super_key='category')
    categories_from_question = WordNet.extract_categories(
        clean_question, business_categories, similarity_index)
    #print "categories_from_question", categories_from_question
    print "categories from question", categories_from_question
    extracted_business = extract_business_dictionaries(
        extracted_business, CATEGORIES, categories_from_question)
    #print "Extracted Businesses", extracted_business[0:5]
    # business_id_subset=[]
    # for business in extracted_business:
    # 	business_id_subset.append(business['business_id'])
    # pickle.dump(business_id_subset, open(pickle_business_id_q1, 'w'))

    print "Length after filtering by categories: ", len(extracted_business)

    #extracted_business=filter_businesses_using_reviews.get_similarity(clean_question, extracted_business, categories_from_question)
    #print "Length after filtering by user reviews: ", len(extracted_business)

    extracted_misc_attributes = extract_misc_attributes(
        extracted_business, categories_from_question)

    print "extracted attributes", extracted_misc_attributes
    similarity_index = WordNet.tuning(clean_question,
                                      extracted_misc_attributes,
                                      type_of_super_key='attributes')
    misc_attributes_from_question = WordNet.extract_categories(
        clean_question, extracted_misc_attributes, similarity_index)

    extracted_business = extract_candidate_businesses(
        extracted_business, misc_attributes_from_question)

    extracted_business, yn = extract_misc_attribute_businesses(
        misc_attributes_from_question, extracted_business, question)

    print "**Length after filtering by attributes", len(extracted_business)

    return extracted_business, yn


#question = "What is the best place to have Sushi near Downtown?"
#print process_question(question)