def adj_noun_pairs_linear(self, term, pos):
    if self.last_term_pos is None:
        self.last_term_pos = (term, pos)
        return 1

    # If there is an adjacent adjective and noun in the query
    if wn.is_adjective(self.last_term_pos[1]) and wn.is_noun(pos):
        # Get term locations inside this document
        posting1 = self.vector_collection.get_term_posting_for_doc(term, self.doc.id)
        posting2 = self.vector_collection.get_term_posting_for_doc(self.last_term_pos[0], self.doc.id)

        # Determine if both terms appear in the document
        if posting1 is None or posting2 is None:
            return 1

        # Boost if ADJ and NOUN appear in the same sentence and in the same order as the query
        sentences = zip(posting1.sentence, posting2.sentence)
        for idx, (s1, s2) in enumerate(sentences):
            if s1 == s2 and posting1.offsets[idx] > posting2.offsets[idx]:
                m = self.adj_noun_pairs_m
                b = self.adj_noun_pairs_b
                idxNn = posting1.offsets[idx]
                idxAdj = posting2.offsets[idx]
                # Linear boost as a function of pair distance d = idxNn - idxAdj:
                # y = m*d + (b - m), so y = b when the pair is adjacent (d = 1),
                # floored at 1 so the boost never penalizes.
                return max(m * (idxNn - idxAdj) + (b - m), 1)

    return 1
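# A quick worked example of the linear boost above, as a standalone sketch.
# The values of m and b here are hypothetical; the real ones come from the
# scorer's adj_noun_pairs_m / adj_noun_pairs_b configuration.
def _linear_boost_demo():
    m, b = -0.5, 1.5  # assumed: b is the boost at distance 1, m the per-token slope

    def linear_boost(idxNn, idxAdj):
        # Same formula as adj_noun_pairs_linear: y = m*d + (b - m), floored at 1
        return max(m * (idxNn - idxAdj) + (b - m), 1)

    print(linear_boost(5, 4))  # distance 1 -> 1.5 (the full boost b)
    print(linear_boost(7, 4))  # distance 3 -> 1 (decayed down to the floor)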
def get_verbs(tree):
    """
    Find all of the possible verbs in a parse tree.
    Return them in order of likelihood if possible.
    """
    res = []
    verbs = ['VP', 'BE', 'BEZ', 'HV']

    # First, we'll simply look for the parsed verb(s) in 'tree'
    for phrase in tree:
        try:
            phrase.node
        except AttributeError:
            # it's not a node, so treat it like a tuple
            if phrase[1] in verbs:
                res.append(phrase[0])
        else:
            # phrase.node is defined, so see if it's a verb
            if phrase.node in verbs:
                s = ''
                for word in phrase.leaves():
                    s = s + word[0] + ' '
                res.append(s.strip())

    # Now look for other words in 'tree' that could be verbs
    for phrase in tree:
        try:
            phrase.leaves()
        except AttributeError:
            # it's not a node, so treat it like a tuple
            words = WordNet.wordinfo(phrase[0])
            for w in words:
                if w[1].lower() == 'v':
                    res.append(w[0])
        else:
            # phrase.leaves() is defined, so see if it might be a verb
            for word in phrase.leaves():
                words = WordNet.wordinfo(word[0])
                for w in words:
                    if w[1].lower() == 'v':
                        res.append(w[0])

    # Magic return line removes duplicates (from http://code.activestate.com/recipes/52560)!
    #return [u for u in res if u not in locals()['_[1]']]
    return unique(res)
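# get_verbs relies on a unique() helper that is not shown in this file. If it
# is not already defined or imported elsewhere, a minimal order-preserving
# sketch (assumed behavior, matching the "order of likelihood" docstring):
def unique(items):
    seen = set()
    result = []
    for item in items:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result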
def adv_verb_pairs(self, term, pos):
    if self.last_term_pos is None:
        self.last_term_pos = (term, pos)
        return False

    # If there is an adjacent adverb and verb in the query
    if wn.is_adverb(self.last_term_pos[1]) and wn.is_verb(pos):
        # Get term locations inside this document
        posting1 = self.vector_collection.get_term_posting_for_doc(term, self.doc.id)
        posting2 = self.vector_collection.get_term_posting_for_doc(self.last_term_pos[0], self.doc.id)

        # Determine if both terms appear in the document
        if posting1 is None or posting2 is None:
            return False

        # Boost if ADV and VERB appear in the same sentence and in the same order as the query
        sentences = zip(posting1.sentence, posting2.sentence)
        for idx, (s1, s2) in enumerate(sentences):
            if s1 == s2 and posting1.offsets[idx] > posting2.offsets[idx]:
                return True

    return False
def adj_noun_2gram(self, term, pos):
    if self.last_term_pos is None:
        self.last_term_pos = (term, pos)
        return False

    # If there is an adjacent adjective and noun in the query
    if wn.is_adjective(self.last_term_pos[1]) and wn.is_noun(pos):
        # Get term locations inside this document
        posting1 = self.vector_collection.get_term_posting_for_doc(term, self.doc.id)
        posting2 = self.vector_collection.get_term_posting_for_doc(self.last_term_pos[0], self.doc.id)

        # Determine if both terms appear in the document
        if posting1 is None or posting2 is None:
            return False

        # Boost if ADJ and NOUN appear in the same sentence and directly after
        # each other, as they do in the query
        sentences = zip(posting1.sentence, posting2.sentence)
        for idx, (s1, s2) in enumerate(sentences):
            if s1 == s2 and (posting1.offsets[idx] - posting2.offsets[idx]) == 1:
                return True

    return False
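# Standalone illustration of the 2-gram offset check above, using hypothetical
# postings. The Posting shape (parallel sentence/offsets lists) is an assumption
# based on the attributes the methods above read.
def _adj_noun_2gram_demo():
    from collections import namedtuple
    Posting = namedtuple('Posting', ['sentence', 'offsets'])

    noun_posting = Posting(sentence=[2], offsets=[5])  # e.g. "car", token 5 of sentence 2
    adj_posting = Posting(sentence=[2], offsets=[4])   # e.g. "red", token 4 of sentence 2

    for s1, s2, o1, o2 in zip(noun_posting.sentence, adj_posting.sentence,
                              noun_posting.offsets, adj_posting.offsets):
        # Same sentence, and the noun exactly one token after the adjective -> boost fires
        print(s1 == s2 and o1 - o2 == 1)  # True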
def execute(self) -> float:
    okapi_sum = 0
    terms = []

    # Need to know idf scores ahead of time
    if self.is_sub_idf_top or self.is_w2v_sub_idf_top or self.is_sub_api_idf_top:
        is_top_idf_map = self.calc_idfs(self.sub_idf_top, top=True)
    if self.is_sub_idf_bottom or self.is_w2v_sub_idf_bottom or self.is_sub_api_idf_bottom:
        is_bottom_idf_map = self.calc_idfs(self.sub_idf_bottom, top=False)

    # Traverse query terms in the order they appear.
    # Do not double-score the same term.
    for term, pos, subs in zip(self.query.terms, self.query.terms_pos, self.query.terms_sub):
        product = 0
        if term not in terms:
            product = self.okapi(term)

        boosts = []      # Independent collection of boosts
        sub_boosts = []  # Substitution boosts

        # Early-occurrence boosts (position in the document)
        if self.is_early:
            boosts.append(boost(product, self.early_term(term)))
        if self.is_early_noun and wn.is_noun(pos):
            boosts.append(boost(product, self.early_term_noun(term)))
        if self.is_early_verb and wn.is_verb(pos):
            boosts.append(boost(product, self.early_term_verb(term)))
        if self.is_early_adj and wn.is_adjective(pos):
            boosts.append(boost(product, self.early_term_adj(term)))
        if self.is_early_adv and wn.is_adverb(pos):
            boosts.append(boost(product, self.early_term_adv(term)))
        if self.is_early_noun_adj and (wn.is_noun(pos) or wn.is_adjective(pos)):
            # Boost both adjectives and nouns
            boosts.append(boost(product, self.early_term(term)))
        if self.is_early_verb_adv and (wn.is_verb(pos) or wn.is_adverb(pos)):
            # Boost both verbs and adverbs
            boosts.append(boost(product, self.early_term(term)))
        if self.is_early_not_noun and not wn.is_noun(pos):
            boosts.append(boost(product, self.early_term(term)))
        if self.is_early_not_verb and not wn.is_verb(pos):
            boosts.append(boost(product, self.early_term(term)))
        if self.is_early_not_adj and not wn.is_adjective(pos):
            boosts.append(boost(product, self.early_term(term)))
        if self.is_early_not_adv and not wn.is_adverb(pos):
            boosts.append(boost(product, self.early_term(term)))
        if self.is_early_not_verb_adv and not wn.is_verb(pos) and not wn.is_adverb(pos):
            boosts.append(boost(product, self.early_term(term)))
        if self.is_early_not_noun_adj and not wn.is_noun(pos) and not wn.is_adjective(pos):
            boosts.append(boost(product, self.early_term(term)))

        # Early-occurrence boosts (position in the query)
        if self.is_early_q:
            boosts.append(boost(product, self.early_term_q(term)))
        if self.is_early_q_noun and wn.is_noun(pos):
            boosts.append(boost(product, self.early_term_q(term)))
        if self.is_early_q_verb and wn.is_verb(pos):
            boosts.append(boost(product, self.early_term_q(term)))

        # Flat part-of-speech boosts
        if self.is_noun and wn.is_noun(pos):
            boosts.append(boost(product, self.noun_influence))
        if self.is_adj and wn.is_adjective(pos):
            boosts.append(boost(product, self.adj_influence))
        if self.is_verb and wn.is_verb(pos):
            boosts.append(boost(product, self.verb_influence))
        if self.is_adv and wn.is_adverb(pos):
            boosts.append(boost(product, self.adv_influence))

        # Pairwise boosts
        if self.is_close_pairs:
            boosts.append(boost(product, self.close_pairs(term)))
            self.last_term = term
        if self.is_adj_noun_pairs:
            # If the last adjective in the query is before the current noun
            if self.adj_noun_pairs(term, pos):
                boosts.append(boost(product, self.adj_noun_pairs_influence))
            self.last_term_pos = (term, pos)
        if self.is_adj_noun_linear_pairs:
            boosts.append(boost(product, self.adj_noun_pairs_linear(term, pos)))
            self.last_term_pos = (term, pos)
        if self.is_adv_verb_pairs:
            # If the last adverb in the query is before the current verb
            if self.adv_verb_pairs(term, pos):
                boosts.append(boost(product, self.adv_verb_pairs_influence))
            self.last_term_pos = (term, pos)
        if self.is_adv_verb_linear_pairs:
            boosts.append(boost(product, self.adv_verb_pairs_linear(term, pos)))
            self.last_term_pos = (term, pos)
        if self.is_bigram:
            # If the last term in the query is right before the current term
            if self.bigram(term, pos):
                boosts.append(boost(product, self.bigram_influence))
            self.last_term_pos = (term, pos)
        if self.is_adj_noun_2gram:
            # If the last adjective in the query is right before the current noun
            if self.adj_noun_2gram(term, pos):
                boosts.append(boost(product, self.adj_noun_2gram_influence))
            self.last_term_pos = (term, pos)
        if self.is_adv_verb_2gram:
            # If the last adverb in the query is right before the current verb
            if self.adv_verb_2gram(term, pos):
                boosts.append(boost(product, self.adv_verb_2gram_influence))
            self.last_term_pos = (term, pos)

        # Substitution boosts (pre-computed substitutions)
        if self.is_sub_all:
            self.substitute(sub_boosts, subs)
        if self.is_sub_noun and wn.is_noun(pos):
            self.substitute(sub_boosts, subs)
        if self.is_sub_verb and wn.is_verb(pos):
            self.substitute(sub_boosts, subs)
        if self.is_sub_adj and wn.is_adjective(pos):
            self.substitute(sub_boosts, subs)
        if self.is_sub_adv and wn.is_adverb(pos):
            self.substitute(sub_boosts, subs)
        if self.is_sub_idf_top and is_top_idf_map[term]:
            # Substitute for the terms with the top idf scores
            self.substitute(sub_boosts, subs)
        if self.is_sub_idf_bottom and is_bottom_idf_map[term]:
            # Substitute for the terms with the bottom idf scores
            self.substitute(sub_boosts, subs)

        # Substitution boosts (WordNet API)
        if self.is_sub_api_all:
            self.substitute_wn_api(sub_boosts, term)
        if self.is_sub_api_noun and wn.is_noun(pos):
            self.substitute_wn_api(sub_boosts, term)
        if self.is_sub_api_verb and wn.is_verb(pos):
            self.substitute_wn_api(sub_boosts, term)
        if self.is_sub_api_adj and wn.is_adjective(pos):
            self.substitute_wn_api(sub_boosts, term)
        if self.is_sub_api_adv and wn.is_adverb(pos):
            self.substitute_wn_api(sub_boosts, term)
        if self.is_sub_api_idf_top and is_top_idf_map[term]:
            self.substitute_wn_api(sub_boosts, term)
        if self.is_sub_api_idf_bottom and is_bottom_idf_map[term]:
            self.substitute_wn_api(sub_boosts, term)

        # Substitution boosts (word2vec)
        if self.is_w2v_sub_all:
            self.substitute_w2v(sub_boosts, term)
        if self.is_w2v_sub_noun and wn.is_noun(pos):
            self.substitute_w2v(sub_boosts, term)
        if self.is_w2v_sub_verb and wn.is_verb(pos):
            self.substitute_w2v(sub_boosts, term)
        if self.is_w2v_sub_adj and wn.is_adjective(pos):
            self.substitute_w2v(sub_boosts, term)
        if self.is_w2v_sub_adv and wn.is_adverb(pos):
            self.substitute_w2v(sub_boosts, term)
        if self.is_w2v_sub_idf_top and is_top_idf_map[term]:
            # Substitute for the terms with the top idf scores
            self.substitute_w2v(sub_boosts, term)
        if self.is_w2v_sub_idf_bottom and is_bottom_idf_map[term]:
            # Substitute for the terms with the bottom idf scores
            self.substitute_w2v(sub_boosts, term)

        if self.is_remove_adj:  # Needs to be last
            if wn.is_adjective(pos):
                # Defer the adjective's score until we know whether its noun follows
                self.remove_adj_boosts = product + sum(boosts) + sum(sub_boosts)
                self.remove_last_term_adj = term
                terms.append(term)
                continue
            # If we just found an adj and the next term is a noun found in both q and d
            elif wn.is_noun(pos) and self.remove_last_term_adj is not None \
                    and self.same_sentence(self.remove_last_term_adj, term):
                boosts.append(self.remove_adj_boosts)
                self.remove_adj_boosts = 0
                self.remove_last_term_adj = None
            else:
                self.remove_adj_boosts = 0
                self.remove_last_term_adj = None

        if self.is_remove_adv:  # Needs to be last
            if wn.is_adverb(pos):
                # Defer the adverb's score until we know whether its verb follows
                self.remove_adv_boosts = product + sum(boosts) + sum(sub_boosts)
                self.remove_last_term_adv = term
                terms.append(term)
                continue
            # If we just found an adv and the next term is a verb found in both q and d
            elif wn.is_verb(pos) and self.remove_last_term_adv is not None \
                    and self.same_sentence(self.remove_last_term_adv, term):
                boosts.append(self.remove_adv_boosts)
                self.remove_adv_boosts = 0
                self.remove_last_term_adv = None
            else:
                self.remove_adv_boosts = 0
                self.remove_last_term_adv = None

        terms.append(term)
        okapi_sum += product + sum(boosts) + sum(sub_boosts)

    return okapi_sum
# Unmodified Cosine: MAP=0.5302462663875682
# Unmodified Okapi:  MAP=0.5353128722733899
# is_early_noun_adj I=2.4                                  MAP=0.541351091646442
# is_early_noun_adj I=2.4, is_adj_noun_linear_pairs b=1.5  MAP=0.541373479735338

import math
import sys

import DocumentVector
import QueryVector
import VectorCollection
import WordNet as wn
from Word2Vec import Word2Vec

wordnet = wn.WordNet()


class DistanceFunction:
    def __init__(self, vector_collection: VectorCollection):
        self.vector_collection = vector_collection
        self.query = None
        self.doc = None

    def set_query(self, query_tv: QueryVector):
        self.query = query_tv

    def set_doc(self, doc_tv: DocumentVector):
        self.doc = doc_tv
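# Minimal usage sketch of the scoring flow. OkapiFunction and collection.docs
# are assumptions (a hypothetical subclass and accessor); set_query/set_doc/
# execute match the class above.
def rank(collection, query_vector):
    scorer = OkapiFunction(collection)  # hypothetical subclass implementing execute()
    scorer.set_query(query_vector)
    scores = []
    for doc in collection.docs:  # assumed accessor for the indexed documents
        scorer.set_doc(doc)
        scores.append((doc.id, scorer.execute()))
    # Rank documents by descending Okapi score
    return sorted(scores, key=lambda pair: pair[1], reverse=True)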
def readFile():
    input_file = open("C:\\Users\\Sergio\\Dropbox\\QMUL\\Data\\choicesNHS\\nhsChoices.txt", "r")
    #input_file = open("C:\\Users\\Sergio\\Dropbox\\QMUL\\Data\\choicesNHS\\nhsChoicesDiagnosis.txt", "r")
    #input_file = open("C:\\Users\\Sergio\\Dropbox\\QMUL\\Data\\choicesNHS\\nhsChoicesDiabetesWhole.txt", "r")
    lines = input_file.readlines()
    input_file.close()

    annotationsX = []
    annotationsSLR = []
    annotationsNER = []
    for x in lines:
        annotationX = x
        annotationSLR = annotator.getAnnotations(x, dep_parse=True)['srl']
        #annotationNER = annotator.getAnnotations(x, dep_parse=True)['ner']
        annotationsX.append(annotationX)
        annotationsSLR.append(annotationSLR)
        #annotationsNER.append(annotationNER)

    size = len(annotationsSLR)
    print size

    A0 = 0
    A1 = 0
    pbroles = []
    annotationsA0 = []
    annotationsA1 = []
    for an in range(5):
        print annotationsX[an]
        print annotationsSLR[an]
        sizeIn = len(annotationsSLR[an])
        #print sizeIn
        for an2 in range(sizeIn):
            print "--------------------------------------------------------------------------------------------------------"
            print annotationsSLR[an][an2]["V"]
            w = Word(annotationsSLR[an][an2]["V"]).lemmatize("v")
            #print w
            #print wn.synset(w + '.v.01')
            try:
                for role in propbank.roleset(w + '.01').findall("roles/role"):
                    print(role.attrib['f'], role.attrib['n'], role.attrib['descr'])
                    pbroles.append(role.attrib['descr'])
                #for role in propbank.roleset(w + '.01').findall("aliases/alias"):
                #    print(role.attrib['framenet'], role.attrib['pos'], role.attrib['verbnet'])
            except:
                pass
            try:
                print(wn.lemma(w + '.v.01.' + w).derivationally_related_forms())
            except:
                pass

            if "A0" in annotationsSLR[an][an2]:
                print annotationsSLR[an][an2]["A0"]
                A0 = annotationsSLR[an][an2]["A0"]
                #try:
                #    A0 = TextBlob(A0, np_extractor=extractor)
                #    A0 = A0.noun_phrases[0]
                #    print A0
                #except:
                #    pass
                try:
                    annotationsA0 = WordNet.spotlightSearch(A0)
                    annotationsA0 = annotationsA0[0].get('URI')
                except:
                    annotationsA0 = "unknown"

            if "A1" in annotationsSLR[an][an2]:
                print annotationsSLR[an][an2]["A1"]
                A1 = annotationsSLR[an][an2]["A1"]
                #try:
                #    A1 = TextBlob(A1, np_extractor=extractor)
                #    A1 = A1.noun_phrases[0]
                #    print A1
                #except:
                #    pass
                try:
                    annotationsA1 = WordNet.spotlightSearch(A1)
                    annotationsA1 = annotationsA1[0].get('URI')
                except:
                    annotationsA1 = "unknown"

            print pbroles
            print "--------------------------------------------------------------------------------------------------------"
            CreateGraphNeo4J.createGraph(w, A0, A1, pbroles, annotationsA0, annotationsA1)
            del pbroles[:]
            annotationsA0 = []
            annotationsA1 = []
            A0 = 0
            A1 = 0
def define(word):
    res = []
    parsed_words = WordNet.wordinfo(word)
    for entry in parsed_words:
        res.append("%s (%s) [%s] %s" % (entry[0], entry[1], entry[2].name, entry[4]))
    return res
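# Hypothetical invocation; the exact strings depend on what WordNet.wordinfo
# returns for the word.
#for line in define('run'):
#    print(line)  # e.g. "run (v) [run.v.01] move fast by using one's feet"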
def qparse1(sentence):
    words = nltk.wordpunct_tokenize(sentence)
    #wdict = {'Sentence': words}
    wdict = {}
    pos = SpecialWords()

    # translation key for WordNet POS tags
    WN_part_name = {'n': 'N', 'v': 'V', 'a': 'Adj', 's': 'Adj', 'r': 'Adv'}
    BR_part_name = {}

    # Look up the POS of the words
    for w in words:
        # get special POS
        for p in pos.keys():
            if (w in pos[p]) or (w.lower() in pos[p]):
                if w in wdict:
                    if not (p in wdict[w]):
                        wdict[w].append(p)
                else:
                    wdict[w] = [p]
        # get WordNet POS
        parts = WordNet.getPOS(w)
        for p in parts:
            pn = WN_part_name[p]
            if w in wdict:
                if not (pn in wdict[w]):
                    wdict[w].append(pn)
            else:
                wdict[w] = [pn]
    print str(wdict)

    chunks = {'NP': [], 'VP': [], 'Unknown': []}

    # Chunk up the NPs going from left to right using these rules:
    # NP = Det-N, (Adj)-N, Det-(Adj)-N, PropN, ProN, N
    # NOTE: NP was initialized to {} and VP/nextVP/newVP/Unknown were never
    # initialized in the original; the values below are assumed from how the
    # chunking loop uses them.
    NP = []
    VP = []
    Unknown = []
    newNP = ['PropN', 'ProN', 'N', 'Det', 'Adj']
    newVP = ['V', 'Adv']  # assumed: mirrors the nextVP transitions below
    nextNP = []
    nextVP = []

    for w in words:
        print
        print 'chunks =', str(chunks)
        print 'NP: Processing "%s" ...' % w
        chunked = False
        wparts = wdict.get(w, [])  # guard against words with no POS entry

        # Try to add the word to the current NP chunk
        for p in nextNP:
            if p in wparts:
                chunked = True
                NP.append((w, p))  # Add word as a tuple
                if (p == 'Det') or (p == 'Adj'):
                    nextNP = ['N', 'Adj']
                else:
                    nextNP = []

        # NOTE: 'looking' was never initialized in the original (the guards
        # below were commented out); assume a word that extended the current
        # NP chunk is no longer looking for a home.
        looking = not chunked

        # Try to add the word to the current VP chunk
        if looking:
            for p in nextVP:
                if p in wparts:
                    looking = False
                    VP.append(w)
                    nextVP = []
                    break

        # Try to start a new NP chunk
        if looking:
            for p in newNP:
                if p in wparts:
                    looking = False
                    # Save the NP if necessary
                    if len(NP) > 0:
                        chunks['NP'].append(NP)
                    NP = [w]
                    if (p == 'Det') or (p == 'Adj'):
                        nextNP = ['N', 'Adj']
                    else:
                        nextNP = []
                    break

        # Try to start a new VP chunk
        if looking:
            for p in newVP:
                if p in wparts:
                    looking = False
                    # Save the VP if necessary
                    if len(nextVP) == 0 and len(VP) > 0:
                        chunks['VP'].append(VP)
                    VP = [w]
                    if p == 'Adv':
                        nextVP = ['V']
                    else:
                        nextVP = ['Adv']
                    break

        # Put it in unknown
        if looking:
            Unknown.append(w)
            nextNP = []
            nextVP = []
            # Save and reset NP/VP if necessary
            if len(NP) > 0:
                chunks['NP'].append(NP)
                NP = []
            if len(VP) > 0:
                chunks['VP'].append(VP)
                VP = []

        print 'NP =', str(NP)
        print 'VP =', str(VP)
        print 'Unknown =', str(Unknown)
        print 'chunks =', str(chunks)

    if len(NP) > 0:
        chunks['NP'].append(NP)
    if len(VP) > 0:
        chunks['VP'].append(VP)
    # Assumed fix: the original collected Unknown but never stored it in chunks
    chunks['Unknown'] = Unknown
    return chunks
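# Hypothetical run of the chunker; the exact chunks depend on SpecialWords and
# WordNet.getPOS, so only the shape of the result is shown here.
#chunks = qparse1('the red car stopped')
#print str(chunks)  # {'NP': [...], 'VP': [...], 'Unknown': [...]}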
#annotator.getAnnotations(wikipedia.summary("London", sentences=1))['chunk']
#word = wn.synset(searchFor + '.n.01')
#print word.hypernyms()

annotator = Annotator()
result = re.sub('\(.*?\)', "", wiki.string)
result = re.sub("\/.+?\/", "", result)
#print result

dep_parse = annotator.getAnnotations(result, dep_parse=True)['dep_parse']
dp_list = dep_parse.split('\n')
#print dp_list

spotlightTerms = WordNet.spotlightSearch(
    "London is the capital and most populous city of England and the United Kingdom.")

print dp_list


def dpbediaQuery(query):
    #test = json.load(urllib2.urlopen("http://www.freesound.org/apiv2/search/text/?query=" + term + "&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
    #test2 = json.load(urllib2.urlopen("https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name=" + term))
    #pprint(test)
    #pprint(test2)
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery("""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
        PREFIX dbres: <http://dbpedia.org/resource/>
def process_question(question):
    all_businesses = read_business_file()

    # Find the business name
    name_from_question = extract_super_key_value(question, business_names)
    extracted_business = extract_business_dictionaries(all_businesses, NAME, name_from_question)
    print "Length after filtering by names: ", len(extracted_business)
    question_without_names = remove_super_key_values(question, name_from_question)

    # Find the neighborhood
    neighborhood_from_question = extract_super_key_value(
        question_without_names, business_neighborhood)
    #print "neighborhood", neighborhood_from_question
    extracted_business = extract_business_dictionaries(
        extracted_business, NEIGHBORHOOD, neighborhood_from_question)
    print "Length after filtering by neighborhood: ", len(extracted_business)
    question_without_neighborhood = remove_super_key_values(
        question_without_names, neighborhood_from_question)

    clean_question = remove_stopwords_punctuations(question_without_neighborhood)
    #print clean_question

    # Find the categories
    similarity_index = WordNet.tuning(clean_question, business_categories,
                                      type_of_super_key='category')
    categories_from_question = WordNet.extract_categories(
        clean_question, business_categories, similarity_index)
    print "Categories from question: ", categories_from_question
    extracted_business = extract_business_dictionaries(
        extracted_business, CATEGORIES, categories_from_question)
    #print "Extracted Businesses", extracted_business[0:5]
    # business_id_subset = []
    # for business in extracted_business:
    #     business_id_subset.append(business['business_id'])
    # pickle.dump(business_id_subset, open(pickle_business_id_q1, 'w'))
    print "Length after filtering by categories: ", len(extracted_business)
    #extracted_business = filter_businesses_using_reviews.get_similarity(clean_question, extracted_business, categories_from_question)
    #print "Length after filtering by user reviews: ", len(extracted_business)

    # Find the remaining (miscellaneous) attributes
    extracted_misc_attributes = extract_misc_attributes(
        extracted_business, categories_from_question)
    print "Extracted attributes: ", extracted_misc_attributes
    similarity_index = WordNet.tuning(clean_question, extracted_misc_attributes,
                                      type_of_super_key='attributes')
    misc_attributes_from_question = WordNet.extract_categories(
        clean_question, extracted_misc_attributes, similarity_index)
    extracted_business = extract_candidate_businesses(
        extracted_business, misc_attributes_from_question)
    extracted_business, yn = extract_misc_attribute_businesses(
        misc_attributes_from_question, extracted_business, question)
    print "Length after filtering by attributes: ", len(extracted_business)

    return extracted_business, yn

#question = "What is the best place to have Sushi near Downtown?"
#print process_question(question)