def getWeightsForQuery(self, query):
    """Return, for each document, the tf-idf weight of every indexed term
    that also appears in the (stemmed) query; other terms get weight 0."""
    indexTf = self.index.getIndex()
    words = query.lower().split()
    words_stem = []
    for w in words:
        words_stem.append(porter.stem(w))
    resultat = dict()
    docs = indexTf.keys()
    for d in docs:
        doc = dict()
        terms = indexTf[d].keys()
        for t in terms:
            if t in words_stem:
                # Log-scaled term frequency times a smoothed idf.
                doc[t] = (1 + np.log(indexTf[d][t])) * np.log(
                    1 + len(docs) / (1 + len(self.index.getTfsForStem(t).keys())))
            else:
                doc[t] = 0
        resultat[d] = doc
    return resultat

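# A minimal stand-alone sketch of the weight formula used above, assuming the
# same log-tf / smoothed-idf combination; tfidf_weight, n_docs and doc_freq are
# illustrative names, not part of the class API.
import numpy as np

def tfidf_weight(tf, n_docs, doc_freq):
    """Log-scaled tf times a smoothed idf: (1 + ln tf) * ln(1 + N / (1 + df))."""
    return (1 + np.log(tf)) * np.log(1 + n_docs / (1 + doc_freq))

# Example: a term occurring 3 times in a document, in a collection of 100
# documents of which 10 contain the term: tfidf_weight(3, 100, 10) ~ 4.85.
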
def getWeightsForQuery(self, query):
    """Weight each indexed term by its number of occurrences in the
    (stemmed) query; terms absent from the query get weight 0."""
    words = query.lower().split()
    words_stem = []
    for w in words:
        words_stem.append(porter.stem(w))
    counter = dict(collections.Counter(words_stem))
    resultat = dict()
    indexTf = self.index.getIndex()
    docs = indexTf.keys()
    for d in docs:
        doc = dict()
        terms = indexTf[d].keys()
        for t in terms:
            if t in words_stem:
                doc[t] = counter[t]
            else:
                doc[t] = 0
        resultat[d] = doc
    return resultat

def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """Return a {synset: signature words} dict built from each sense's
    definition, examples, lemma names and (optionally) hyper/hyponyms."""
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        ss_definition = synset_properties(ss, 'definition')
        # Split the definition into words; concatenating the raw string would
        # add individual characters to the signature.
        signature += ss_definition.split()
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

def getScores(self, query, pertinence=None):
    """Compute document scores with Okapi BM25."""
    words = query.lower().split()
    words_stem = []
    for w in words:
        words_stem.append(porter.stem(w))
    index = self.index.getIndex()
    index_inverse = self.index.getIndexInverse()
    idf = {t: self.getIdf(t, index, index_inverse, pertinence) for t in words_stem}
    length_docs = {doc: sum([int(tf) for tf in index[doc].values()]) for doc in index.keys()}
    mean_length_docs = np.mean([int(val) for val in length_docs.values()])
    docs = index.keys()
    docsScores = {}
    for d in docs:
        score = 0
        for t in words_stem:
            score += idf[t] * index[d].get(t, 0) / (index[d].get(t, 0)
                     + self.K1 * (1 - self.B + self.B * (length_docs[d] / mean_length_docs)))
        docsScores[d] = score
    return docsScores

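# A stand-alone sketch of the per-term BM25 contribution computed above, with
# illustrative parameter names (k1 and b stand in for the constants stored in
# self.K1 and self.B); this mirrors the formula, not the class API.
def bm25_term_score(tf, idf, doc_len, avg_doc_len, k1=1.2, b=0.75):
    """Okapi BM25 contribution of one query term to one document's score."""
    return idf * tf / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))

# Example: tf=2, idf=1.5, document twice the average length:
# bm25_term_score(2, 1.5, 200, 100) = 1.5 * 2 / (2 + 1.2 * 1.75) ~ 0.73.
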
def adapted_lesk(context_sentence, ambiguous_word, \
                 pos=None, lemma=True, stem=True, hyperhypo=True, \
                 stop=True, context_is_lemmatized=False, \
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the wordnet hierarchies
    to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(set(ss_mem_holonyms + ss_part_holonyms +
                                  ss_sub_holonyms + ss_mem_meronyms +
                                  ss_part_meronyms + ss_sub_meronyms + ss_simto))

        signature = list([j for j in chain(
            *[synset_properties(i, 'lemma_names') for i in related_senses])
            if j not in EN_STOPWORDS])

        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                  nbest=nbest, keepscore=keepscore, \
                                  normalizescore=normalizescore)
    return best_sense

def adapted_lesk(context_sentence, ambiguous_word, \
                 pos=None, lemma=True, stem=True, hyperhypo=True, \
                 stop=True, context_is_lemmatized=False, \
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the wordnet hierarchies
    to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        related_senses = list(set(ss.member_holonyms() + ss.member_meronyms() +
                                  ss.part_meronyms() + ss.part_holonyms() +
                                  ss.similar_tos() + ss.substance_holonyms() +
                                  ss.substance_meronyms()))
        try:
            signature = list([j for j in chain(*[i.lemma_names() for i in
                              related_senses]) if j not in stopwords.words('english')])
        except:
            signature = list([j for j in chain(*[i.lemma_names for i in
                              related_senses]) if j not in stopwords.words('english')])
        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = [porter.stem(i) for i in context_sentence.split()]
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ambiguous_word, ss_sign, \
                                  nbest=nbest, keepscore=keepscore, \
                                  normalizescore=normalizescore)
    return best_sense

def adapted_lesk(context_sentence, ambiguous_word, \
                 pos=None, lemma=True, stem=True, hyperhypo=True, \
                 stop=True, context_is_lemmatized=False, \
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the wordnet hierarchies
    to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(set(ss_mem_holonyms + ss_part_holonyms +
                                  ss_sub_holonyms + ss_mem_meronyms +
                                  ss_part_meronyms + ss_sub_meronyms +
                                  ss_simto))

        signature = list([j for j in chain(*[synset_properties(i, 'lemma_names')
                                             for i in related_senses])
                          if j not in EN_STOPWORDS])

        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                  nbest=nbest, keepscore=keepscore, \
                                  normalizescore=normalizescore)
    return best_sense

def counter(doc):
    """
    Process a document.
    Args:
        - doc: the document to process
    Return:
        - (word, occurrence count) for every word in the document
    """
    # Stem every lowercased token, then drop stop words.
    l = [pt.stem(w.lower()) for w in doc.split(" ")]
    stemmer = tr.PorterStemmer()
    l = [word for word in l if word not in stemmer.stopWords]
    return dict(collections.Counter(l).items())

def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature
    words of a sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        # If POS is specified.
        try:
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition.
        try:
            signature += ss.definition().split()
        except:
            signature += ss.definition.split()
        # Includes examples
        try:
            signature += list(chain(*[i.split() for i in ss.examples()]))
        except:
            signature += list(chain(*[i.split() for i in ss.examples]))
        # Includes lemma_names.
        try:
            signature += ss.lemma_names()
        except:
            signature += ss.lemma_names
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            try:
                signature += list(chain(*[i.lemma_names() for i
                                          in ss.hypernyms() + ss.hyponyms()]))
            except:
                signature += list(chain(*[i.lemma_names for i
                                          in ss.hypernyms() + ss.hyponyms()]))
        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in stopwords.words('english')]
        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        signature = [i.lower() for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

def getScores(self, query):
    """Score documents with the vector space model: cosine similarity when
    self.normalized is True, plain dot product otherwise."""
    weighter = self.getWeighter()
    words = query.lower().split()
    self.norm_doc = {}  # Document norms, cached for reuse
    words_stem = []
    for w in words:
        words_stem.append(porter.stem(w))
    if self.normalized:
        # Cosine method
        docs = weighter.getWeightsForQuery(query)
        keysDocs = docs.keys()
        docsScores = {}
        for k in keysDocs:
            score = 0
            weighterDoc = weighter.getWeightsForDoc(k)
            for stem in words_stem:
                if stem in docs[k].keys():
                    score += docs[k][stem] * weighterDoc[stem]
            norm_vect1 = np.linalg.norm(np.array(list(docs[k].values())), ord=2)
            if k in self.norm_doc.keys():
                norm_vect2 = self.norm_doc[k]
            else:
                norm_vect2 = np.linalg.norm(np.array(list(weighterDoc.values())), ord=2)
                self.norm_doc[k] = norm_vect2
            # Cosine similarity divides the dot product by the product of the norms.
            docsScores[k] = score / (norm_vect1 * norm_vect2)
    else:
        # Dot product
        docs = weighter.getWeightsForQuery(query)
        keysDocs = docs.keys()
        docsScores = {}
        for k in keysDocs:
            score = 0
            weighterDoc = weighter.getWeightsForDoc(k)
            for stem in words_stem:
                if stem in docs[k].keys():
                    score += docs[k][stem] * weighterDoc[stem]
            docsScores[k] = score
    return docsScores

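# A minimal sketch of the cosine score used in the normalized branch above,
# written directly over two {term: weight} dicts; cosine_score is an
# illustrative helper, not a method of the class.
import numpy as np

def cosine_score(query_weights, doc_weights):
    """Dot product over shared terms divided by the product of the two norms."""
    dot = sum(w * doc_weights[t] for t, w in query_weights.items() if t in doc_weights)
    norm_q = np.linalg.norm(list(query_weights.values()))
    norm_d = np.linalg.norm(list(doc_weights.values()))
    return dot / (norm_q * norm_d) if norm_q and norm_d else 0.0
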
def cosine_lesk(
    context_sentence,
    ambiguous_word,
    pos=None,
    lemma=True,
    stem=True,
    hyperhypo=True,
    stop=True,
    context_is_lemmatized=False,
    nbest=False,
):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature) if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]

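# cos_sim above is imported from elsewhere; as an assumption about what such a
# string similarity computes, here is a minimal bag-of-words cosine over two
# whitespace-tokenized strings (illustrative only, not the actual implementation).
from collections import Counter
import math

def bow_cosine(a, b):
    """Cosine similarity between the token-count vectors of two strings."""
    va, vb = Counter(a.split()), Counter(b.split())
    dot = sum(va[t] * vb[t] for t in va.keys() & vb.keys())
    norm = math.sqrt(sum(c * c for c in va.values())) * \
           math.sqrt(sum(c * c for c in vb.values()))
    return dot / norm if norm else 0.0
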
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature
    words of a sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:  # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition.
        ss_definition = synset_properties(ss, 'definition')
        signature += ss_definition.split()
        # Includes examples
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))
        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature
    words of a sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:  # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition, split into words; concatenating the raw string
        # would add individual characters to the signature.
        ss_definition = synset_properties(ss, 'definition')
        signature += ss_definition.split()
        # Includes examples
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))
        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

def cosine_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature) \
                     if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]

def adapted_lesk(context_sentence, ambiguous_word, \
                 pos=None, lemma=True, stem=True, hyperhypo=True, \
                 stop=True, context_is_lemmatized=False, \
                 nbest=False, keepscore=False, normalizescore=False):
    """Adapted Lesk: extend each sense's simple signature with lemma names of
    related senses (holonyms, meronyms, similar_tos), then compare overlaps
    with the context."""
    # Ensure ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(set(ss_mem_holonyms + ss_part_holonyms +
                                  ss_sub_holonyms + ss_mem_meronyms +
                                  ss_part_meronyms + ss_sub_meronyms +
                                  ss_simto))

        signature = list([j for j in chain(*[synset_properties(i, 'lemma_names')
                                             for i in related_senses])
                          if j not in EN_STOPWORDS])

        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # if stem == True: signature = [porter.stem(i) for i in signature]
        ss_sign[ss] += signature

    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                  nbest=nbest, keepscore=keepscore, \
                                  normalizescore=normalizescore)
    return best_sense

def simple_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=False, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    Simple Lesk uses more signature words than the original Lesk algorithm
    (1986) but fewer than Adapted Lesk (Banerjee and Pedersen, 2002).
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word).lower()
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = [porter.stem(i.lower()) for i in context_sentence.split()]
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ambiguous_word, ss_sign, \
                                  nbest=nbest, keepscore=keepscore, \
                                  normalizescore=normalizescore)
    return best_sense

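# compare_overlaps is defined elsewhere; as an illustration of the Lesk idea it
# relies on, a hypothetical overlap count between the context and each sense
# signature (an assumption, not the actual pywsd function) could look like:
def count_overlaps(context_tokens, synsets_signatures):
    """Return the synset whose signature shares the most tokens with the context."""
    scores = {ss: len(set(context_tokens) & set(sig))
              for ss, sig in synsets_signatures.items()}
    return max(scores, key=scores.get) if scores else None
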
def getScores(self, query):
    """
    Parameters
    ----------
    query : String

    Returns
    -------
    The document scores as (doc, score) pairs in a dict.
    The score is based on Jelinek-Mercer smoothing.
    """
    words = query.lower().split()
    words_stem = []
    for w in words:
        words_stem.append(porter.stem(w))
    index = self.index.getIndex()
    docs = index.keys()
    docsScores = {}
    probaCollection = {}
    sizeOfCollection = self.index.getCollectionSize()
    # Collection probability of each query term: total occurrences / collection size.
    for t in words_stem:
        probaCollection[t] = sum([doc.get(t, 0) for doc in index.values()]) / sizeOfCollection
    for d in docs:
        score = 1
        for t in words_stem:
            score *= (1 - self.lambda_) * index[d].get(t, 0) + self.lambda_ * probaCollection[t]
        docsScores[d] = score
    return docsScores

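# A stand-alone sketch of the Jelinek-Mercer mixture multiplied up above;
# term_weight_in_doc mirrors index[d].get(t, 0) and p_collection mirrors
# probaCollection[t] (illustrative names only, not part of the class API).
def jm_term_probability(term_weight_in_doc, p_collection, lambda_):
    """Linear interpolation between the document estimate and the collection estimate."""
    return (1 - lambda_) * term_weight_in_doc + lambda_ * p_collection

# Example: jm_term_probability(0.2, 0.05, 0.5) = 0.5*0.2 + 0.5*0.05 = 0.125.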