# Module setup. The functions below reference several names (wn, chain,
# porter, EN_STOPWORDS, and a few helpers) without defining them. The imports
# and objects here are an assumed setup that makes this section self-contained;
# in the original source they are defined alongside these functions.
from itertools import chain

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

porter = PorterStemmer()
wnl = WordNetLemmatizer()
EN_STOPWORDS = set(stopwords.words('english'))
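# The helper functions below are called throughout this section but are
# defined elsewhere in the original source. These are minimal stand-ins,
# written to be consistent with how they are used here; the original
# implementations may differ in detail.

def synset_properties(ss, parameter):
    """Fetch a Synset property by name. Handles NLTK versions where synset
    properties changed between plain attributes and methods."""
    attr = getattr(ss, parameter)
    return attr() if callable(attr) else attr


def lemmatize(word):
    """Reduce a single word to a WordNet lemma (stand-in helper)."""
    return wnl.lemmatize(word)


def lemmatize_sentence(sentence):
    """Tokenize a raw sentence and lemmatize each token (stand-in helper)."""
    return [lemmatize(w) for w in word_tokenize(sentence.lower())]


def compare_overlaps_greedy(context, synsets_signatures):
    """Return the sense whose signature shares the most words with the context."""
    max_overlaps = 0
    lesk_sense = None
    for ss, signature in synsets_signatures.items():
        overlaps = set(signature).intersection(context)
        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
    return lesk_sense


def compare_overlaps(context, synsets_signatures,
                     nbest=False, keepscore=False, normalizescore=False):
    """Rank senses by the number of words their signatures share with the
    context. Returns the single best sense by default, or the full ranking
    if nbest=True; keepscore attaches the overlap counts, and normalizescore
    rescales them so that they sum to 1."""
    ranked = sorted(((len(set(sig).intersection(context)), ss)
                     for ss, sig in synsets_signatures.items()),
                    key=lambda pair: pair[0], reverse=True)
    if normalizescore:
        total = float(sum(score for score, _ in ranked)) or 1.0
        ranked = [(score / total, ss) for score, ss in ranked]
    if not keepscore:  # Strip the scores, keeping only the Synsets.
        ranked = [ss for _, ss in ranked]
    return ranked if nbest else ranked[0]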
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False,
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:  # If POS is specified, skip senses with a different POS.
            if pos and str(ss.pos()) != pos:
                continue
        except (AttributeError, TypeError):
            # Some NLTK versions expose pos as an attribute, not a method.
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition.
        ss_definition = synset_properties(ss, 'definition')
        signature += ss_definition.split()
        # Includes examples.
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(chain(*[i.lemma_names()
                                      for i in ss_hypohypernyms]))
        # Optional: removes stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optionally match stems.
        if stem:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures
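# Quick look at the signatures simple_signature builds for the senses of
# 'bank' (assumes the NLTK 'wordnet', 'stopwords' and 'punkt' data are
# downloaded; the exact contents depend on the installed WordNet version):
if __name__ == '__main__':
    signatures = simple_signature('bank')
    for ss, sig in list(signatures.items())[:3]:
        print(ss, sig[:8])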
def adapted_lesk(context_sentence, ambiguous_word,
                 pos=None, lemma=True, stem=True, hyperhypo=True,
                 stop=True, context_is_lemmatized=False,
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pedersen (2002). It makes use of lexical items
    from semantically related senses within the WordNet hierarchies to
    generate more lexical items for each sense.
    See www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If the ambiguous word is not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos.
        ss_simto = synset_properties(ss, 'similar_tos')
        related_senses = list(set(ss_mem_holonyms + ss_part_holonyms +
                                  ss_sub_holonyms + ss_mem_meronyms +
                                  ss_part_meronyms + ss_sub_meronyms +
                                  ss_simto))
        signature = [j for j in chain(*[synset_properties(i, 'lemma_names')
                                        for i in related_senses])
                     if j not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optionally match stems.
        if stem:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense
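# Usage sketch for adapted_lesk; the sentence is illustrative and the winning
# Synset depends on the installed WordNet data and on the helpers above:
if __name__ == '__main__':
    print(adapted_lesk('I went to the bank to deposit my money',
                       'bank', pos='n'))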
def original_lesk(context_sentence, ambiguous_word, dictionary=None):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary that contains the definitions of the different
    senses of each word. See http://dl.acm.org/citation.cfm?id=318728
    """
    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary:  # If no dictionary is provided, use the WordNet definitions.
        dictionary = {}
        for ss in wn.synsets(ambiguous_word):
            ss_definition = synset_properties(ss, 'definition')
            # Tokenize the definition so that overlaps are counted on words,
            # not on the characters of the gloss string.
            dictionary[ss] = ss_definition.split()
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense
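# Usage sketch for original_lesk, which compares only the raw WordNet glosses
# against the (unlemmatized, whitespace-split) context:
if __name__ == '__main__':
    print(original_lesk('I went to the bank to deposit my money', 'bank'))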