def max_similarity(context_sentence, ambiguous_word, option="path",
                   lemma=True, context_is_lemmatized=False, pos=None, best=True):
    """
    Perform WSD by maximizing the sum of maximum similarity between possible
    synsets of all words in the context sentence and the possible synsets of
    the ambiguous words (see http://goo.gl/XMq2BI):
    {argmax}_{synset(a)}(\sum_{i}^{n}{{max}_{synset(i)}(sim(i,a))})
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
    result = {}
    for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):
                continue
        except:
            if pos and pos != str(i.pos):
                continue
        result[i] = sum(max([sim(i, k, option) for k in wn.synsets(j)] + [0])
                        for j in context_sentence)
    if option in ["res", "resnik"]:  # lower score = more similar
        result = sorted([(v, k) for k, v in result.items()])
    else:  # higher score = more similar
        result = sorted([(v, k) for k, v in result.items()], reverse=True)
    if best:
        return result[0][1]
    return result

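# Illustrative usage (added sketch, not from the original source). It assumes the
# helpers referenced above (lemmatize, word_tokenize, sim, wn) are in scope, as in
# a pywsd-style module; the sentence and the "path" option are arbitrary choices.
context = "I went to the bank to deposit my money."
answer = max_similarity(context, "bank", option="path", pos="n")
print(answer)  # a wn.Synset object, or None if the target word were missing from WordNet
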
def adapted_lesk(context_sentence, ambiguous_word,
                 pos=None, lemma=True, stem=True, hyperhypo=True,
                 stop=True, context_is_lemmatized=False,
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the WordNet hierarchies
    to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(set(ss_mem_holonyms + ss_part_holonyms +
                                  ss_sub_holonyms + ss_mem_meronyms +
                                  ss_part_meronyms + ss_sub_meronyms + ss_simto))

        signature = list([j for j in chain(*[synset_properties(i, 'lemma_names')
                                             for i in related_senses])
                          if j not in EN_STOPWORDS])

        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense

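# Illustrative usage (added sketch, not from the original source). simple_signature,
# synset_properties, compare_overlaps and the WordNet bindings are assumed to be in
# scope, as in the surrounding module; the example sentence is made up.
sent = "I went to the bank to deposit my money."
sense = adapted_lesk(sent, "bank", pos="n")
print(sense)  # best-matching Synset
ranked = adapted_lesk(sent, "bank", pos="n", nbest=True, keepscore=True)
print(ranked)  # ranked candidate senses; the exact shape depends on compare_overlaps
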
def get_alignment_complexity_scores(s0, s1):
    """
    Run Sultan's aligner on two sentences and return the list that for each word
    in the first sentence specifies whether it was changed/simplified (1), kept
    unchanged (2) or cannot be linked to any other word in the sentence (0).

    :param s0: the first sentence as a list of tokens
    :param s1: the second sentence as a string
    :return: see above
    """
    s0 = [x.lower() for x in s0]
    s1 = s1.lower()

    # check if the alignment has been performed before
    dict_key = " ".join(s0) + SEPARATOR + s1
    if dict_key in ALIGN_DICT:
        return ALIGN_DICT[dict_key]

    result = np.full(len(s0), UNK)
    ALIGNMENT_STATS["total"] += 1
    try:
        # tokenize and lemmatize the sentences
        s0_tok = tokenize(" ".join(s0))
        s1_tok = tokenize(s1)
        s0_lem = lemmatize(s0_tok)
        s1_lem = lemmatize(s1_tok)
        pairs = align(s0_tok, s1_tok)  # pairs of sentences aligned by Sultan's word aligner
    except:
        ALIGN_DICT[dict_key] = result
        ALIGNMENT_STATS["unsuccessful"] += 1
        return result

    # iterate over aligned pairs and fill the result array
    for i in range(len(pairs[0])):
        w0, w1 = pairs[1][i][0].lower(), pairs[1][i][1].lower()
        if w0 in STOPWORDS or w1 in STOPWORDS:
            # such an alignment doesn't matter
            continue
        if w0 == w1 or s0_lem.get(w0, 'w0') == s1_lem.get(w1, 'w1'):
            # the alignment is valid, but it only indicates that the word was kept as it is
            id = get_index(s0, w0, i, pairs)
            if id == -1:
                continue
            result[id] = SIMPLE
        else:
            id = get_index(s0, w0, i, pairs)
            if id == -1:
                continue
            result[id] = COMPLEX

    ALIGN_DICT[dict_key] = result
    return result

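# Illustrative usage (added sketch, not from the original source). It assumes the
# module-level pieces referenced above (tokenize, lemmatize, align, ALIGN_DICT,
# ALIGNMENT_STATS, STOPWORDS and the UNK/SIMPLE/COMPLEX constants) are configured;
# the sentence pair is made up.
complex_tokens = ["the", "physician", "examined", "the", "patient"]
simple_sentence = "the doctor checked the patient"
labels = get_alignment_complexity_scores(complex_tokens, simple_sentence)
print(labels)  # one UNK/SIMPLE/COMPLEX code per token of the first sentence
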
def cosine_lesk(context_sentence, ambiguous_word, pos=None,
                lemma=True, stem=True, hyperhypo=True, stop=True,
                context_is_lemmatized=False, nbest=False):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature) if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]

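# Illustrative usage (added sketch, not from the original source). cos_sim,
# simple_signature and the other helpers are assumed to be importable as in the
# surrounding module; the sentence is only an example.
sent = "I went to the bank to deposit my money."
print(cosine_lesk(sent, "bank"))              # single best Synset
print(cosine_lesk(sent, "bank", nbest=True))  # all senses ranked by cosine score
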
def simple_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=False, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, keepscore=False, normalizescore=False):
    """
    Simple Lesk sits somewhere in between: it uses more signature words than the
    original Lesk algorithm (1986) but fewer than Adapted Lesk
    (Banerjee and Pedersen, 2002).
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense

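# Illustrative usage (added sketch, not from the original source), assuming the same
# pywsd-style helpers as above are available:
sent = "I went to the bank to deposit my money."
print(simple_lesk(sent, "bank", pos="n"))  # best-matching Synset for "bank"
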
def _tokenize(self, text='The text must be passed to this function as a string!'):
    from utils import lemmatize
    if not self.udpipe_model:
        udpipe_model_path = os.path.join(BASE_DIR, 'model', 'udpipe_syntagrus.model')
        if not os.path.isfile(udpipe_model_path):
            msg = 'UDPipe model not found!'
            logging.critical(msg)
            raise IOError(msg)
        self.udpipe_model = Model.load(udpipe_model_path)
    t = time()
    process_pipeline = Pipeline(self.udpipe_model, 'tokenize',
                                Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    result = []
    for line in nltk.sent_tokenize(text):
        # line = unify_sym(line.strip())  # your own text-cleaning function could go here
        output = lemmatize(process_pipeline, text=line)
        result.extend(output)
    self.tagged_counter += 1
    log(f'{self.tagged_counter} of {self.tagged_max} created, for {round(time() - t, 2)}s')
    return result

def create_test(self, verb_dict, verb_list):
    self.change_comma()
    triplets = self.create_oieresult()
    return_text = ""
    for sentence in triplets:
        if len(sentence) == 0:
            self.parsed.append('str(len(self.phrase_corpus))+" -1"')
            continue
        text = re.sub('\[[^\s]*', '', sentence[0]['description'])
        text = re.sub('\]', '', text).split()
        tags = [False] * len(sentence[0]['tags'])
        for triplet in sentence:
            arg_points = [x in ['I-ARG0', 'B-ARG0', 'I-ARG1', 'B-ARG1'] for x in triplet['tags']]
            abort = False
            for others in sentence:
                for place in range(len(others['tags'])):
                    if others['tags'][place][-2:] == '-V' and arg_points[place]:
                        abort = True
                        break
                if abort:
                    break
            if abort:
                continue
            subject = ' '.join([text[x] for x in range(len(text))
                                if triplet['tags'][x] in ['I-ARG0', 'B-ARG0']])
            objekt = ' '.join([text[x] for x in range(len(text))
                               if triplet['tags'][x] in ['I-ARG1', 'B-ARG1']])
            verb = triplet['verb']
            verb = utils.lemmatize(triplet['verb'])
            if verb in auxillary_verbs:
                continue
            if len(subject) == 0:
                continue
            if len(objekt) == 0:
                continue
            verb = verb.upper()
            if verb not in verb_dict.keys():
                print(verb)
                continue  # if verb does not exist in verb_dict it can not be used to create
            verb_id = verb_dict[verb]
            max_id = len(self.phrase_corpus)
            subject_id, subject = self.deduplicate(subject)
            if subject_id == max_id:
                self.parsed.append(str(subject_id))
            tags = [str(subject_id) if triplet['tags'][x] in ['I-ARG0', 'B-ARG0'] else tags[x]
                    for x in range(len(text))]
            max_id = len(self.phrase_corpus)
            objekt_id, objekt = self.deduplicate(objekt)
            if objekt_id == max_id:
                self.parsed.append(str(objekt_id))
            tags = [str(objekt_id) if triplet['tags'][x] in ['I-ARG1', 'B-ARG1'] else tags[x]
                    for x in range(len(text))]
            if (subject, objekt, verb) not in self.triplet:
                self.triplet.append((subject, objekt, verb))
                self.triplet_id.append((subject_id, verb_id, objekt_id))
            self.parsed.append("str(len(self.phrase_corpus)+" + str(len(self.triplet)) + ")")
        self.parsed.append('str(len(self.phrase_corpus))+" -1"')
        text = ['<phrase_' + str(tags[x]) + '>' if tags[x] else text[x] for x in range(len(text))]
        text.append(None)
        text = [text[x] for x in range(len(text) - 1)
                if (text[x] != text[x + 1] or text[x][0] != '<')]
        return_text = return_text + ' ' + ' '.join(text)
    return self.phrase_corpus, self.triplet_id, return_text[1:], [eval(x, {"self": self}) for x in self.parsed]

def adapted_lesk(context_sentence, ambiguous_word,
                 pos=None, lemma=True, stem=True, hyperhypo=True,
                 stop=True, context_is_lemmatized=False,
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the WordNet hierarchies
    to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        related_senses = list(set(ss.member_holonyms() + ss.member_meronyms() +
                                  ss.part_meronyms() + ss.part_holonyms() +
                                  ss.similar_tos() + ss.substance_holonyms() +
                                  ss.substance_meronyms()))
        try:
            # Newer NLTK versions expose lemma_names as a method ...
            signature = list([j for j in chain(*[i.lemma_names() for i in related_senses])
                              if j not in stopwords.words('english')])
        except:
            # ... older versions expose it as a plain attribute.
            signature = list([j for j in chain(*[i.lemma_names for i in related_senses])
                              if j not in stopwords.words('english')])
        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense

def simple_baseline_similarity(s1, s2):
    """
    Find the sequence similarity between two sentences, considering both raw
    words and lemmas
    """
    # Tokenize by sentences into words in lower case
    tokenized_sentence_1 = nltk.word_tokenize(s1.lower())
    tokenized_sentence_2 = nltk.word_tokenize(s2.lower())

    tagged_sentence_1 = pos_tag(tokenized_sentence_1)  # [ (word, POS_TAG), ...]
    tagged_sentence_2 = pos_tag(tokenized_sentence_2)  # [ (word, POS_TAG), ...]

    lemmas_sentence_1 = [lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_1
                         if not tagged_word in stop_words]
    lemmas_sentence_2 = [lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_2
                         if not tagged_word in stop_words]  # [LEMMA_1, ...]

    word_seq_match = difflib.SequenceMatcher(None, tokenized_sentence_1, tokenized_sentence_2)
    word_match = word_seq_match.find_longest_match(0, len(tokenized_sentence_1),
                                                   0, len(tokenized_sentence_2))

    lemm_seq_match = difflib.SequenceMatcher(None, lemmas_sentence_1, lemmas_sentence_2)
    lemm_match = lemm_seq_match.find_longest_match(0, len(lemmas_sentence_1),
                                                   0, len(lemmas_sentence_2))

    word_sim = word_match.size / (max(len(tokenized_sentence_1), len(tokenized_sentence_2)) + 0.001)
    lemm_sim = lemm_match.size / (max(len(lemmas_sentence_1), len(lemmas_sentence_2)) + 0.001)

    return word_sim, lemm_sim

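# Illustrative usage (added sketch, not from the original source). nltk, pos_tag,
# lemmatize, wnl and stop_words are assumed to be defined at module level as above.
w_sim, l_sim = simple_baseline_similarity("A man is playing a guitar.",
                                          "A man plays the guitar.")
print(w_sim, l_sim)  # longest-common-run ratios over raw tokens and over lemmas
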
def lemmatize_answers(df):
    """
    Given a pandas dataframe that contains a column called answer, adds a
    lemmatized_answer column with the lemmatized text from this column and
    returns the dataframe

    :param df: a pandas dataframe
    """
    # lemmas = [lemmatize(speech) for speech in tqdm(df['answer'])]
    tqdm.pandas()
    lemmas = df['answer'].progress_apply(lambda x: lemmatize(x))
    df['lemmatized_answer'] = lemmas
    return df

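# Illustrative usage (added sketch, not from the original source); tqdm and lemmatize
# are assumed to be imported at module level, and the column values are made up.
import pandas as pd
answers = pd.DataFrame({"answer": ["The cats were running", "She has written books"]})
print(lemmatize_answers(answers)[["answer", "lemmatized_answer"]])
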
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False,
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        # If POS is specified.
        try:
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue

        signature = []
        # Includes definition.
        try:
            signature += ss.definition().split()
        except:
            signature += ss.definition.split()
        # Includes examples
        try:
            signature += list(chain(*[i.split() for i in ss.examples()]))
        except:
            signature += list(chain(*[i.split() for i in ss.examples]))
        # Includes lemma_names.
        try:
            signature += ss.lemma_names()
        except:
            signature += ss.lemma_names

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            try:
                signature += list(chain(*[i.lemma_names() for i
                                          in ss.hypernyms() + ss.hyponyms()]))
            except:
                signature += list(chain(*[i.lemma_names for i
                                          in ss.hypernyms() + ss.hyponyms()]))

        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in stopwords.words('english')]
        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        signature = [i.lower() for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

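# Illustrative usage (added sketch, not from the original source), assuming the NLTK
# WordNet reader (wn), lemmatize and porter are in scope as in the module above:
signatures = simple_signature("bank", pos="n")
for synset, words in signatures.items():
    print(synset, words[:10])  # first few signature words per noun sense of "bank"
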
def original_lesk(context_sentence, ambiguous_word, dictionary=None):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definition of the different
    senses of each word. See http://dl.acm.org/citation.cfm?id=318728
    """
    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary:  # If dictionary is not provided, use the WN definition.
        dictionary = {}
        for ss in wn.synsets(ambiguous_word):
            ss_definition = synset_properties(ss, 'definition')
            dictionary[ss] = ss_definition
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense

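# Illustrative usage (added sketch, not from the original source). Without an explicit
# dictionary the WordNet glosses are used, so wn, synset_properties and
# compare_overlaps_greedy are assumed to be available as above.
print(original_lesk("I went to the bank to deposit my money.", "bank"))
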
def clusterSentence(self, sentence):
    """
    clusters the given sentence with an existing cluster or creates a new cluster.
    sentence - sentence to be clustered
    """
    words = utils.tokenize(sentence.lower())
    lems = utils.lemmatize(words)
    terms = utils.filterStopWords(lems)
    tf = dict(Counter(terms))
    self.clusterize(tf, sentence)
    # Every time a new sentence is clusterized, save latest clusters
    self.saveClusters()

def lemmas_similarity(s1, s2, filter_stop_words=True):
    """
    Jaccard similarity between lemmatized sentences
    """
    # Tokenize by sentences into words in lower case
    tokenized_sentence_1 = nltk.word_tokenize(s1.lower())
    tokenized_sentence_2 = nltk.word_tokenize(s2.lower())

    if filter_stop_words:
        tokenized_sentence_1 = [token for token in tokenized_sentence_1
                                if token not in stop_words]
        tokenized_sentence_2 = [token for token in tokenized_sentence_2
                                if token not in stop_words]

    tagged_sentence_1 = pos_tag(tokenized_sentence_1)  # [ (word, POS_TAG), ...]
    tagged_sentence_2 = pos_tag(tokenized_sentence_2)  # [ (word, POS_TAG), ...]

    lemmas_sentence_1 = [lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_1]
    lemmas_sentence_2 = [lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_2]  # [LEMMA_1, ...]

    # Compute similarity
    if len(lemmas_sentence_1) > 0 and len(lemmas_sentence_2) > 0:
        similarity = 1 - jaccard_distance(set(lemmas_sentence_1), set(lemmas_sentence_2))
        return similarity
    else:
        return 0

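# Illustrative usage (added sketch, not from the original source), with the same
# module-level assumptions as the example after simple_baseline_similarity:
print(lemmas_similarity("A man is playing a guitar.", "A man plays the guitar."))
print(lemmas_similarity("A man is playing a guitar.", "A man plays the guitar.",
                        filter_stop_words=False))
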
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False,
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:  # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue

        signature = []
        # Includes definition.
        ss_definition = synset_properties(ss, 'definition')
        signature += ss_definition.split()
        # Includes examples
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))

        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

def original_lesk(context_sentence, ambiguous_word, dictionary=None):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definition of the different
    senses of each word. See http://goo.gl/8TB15wb
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If dictionary is not provided, use the WN definition.
    if not dictionary:
        dictionary = {}
        for ss in wn.synsets(ambiguous_word):
            try:
                ss_definition = ss.definition().split()
            except:
                ss_definition = ss.definition.split()
            dictionary[ss] = ss_definition
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense

def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False,
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:  # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue

        signature = []
        # Includes definition (split into words, so the list is not extended
        # character by character).
        ss_definition = synset_properties(ss, 'definition')
        signature += ss_definition.split()
        # Includes examples
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))

        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

def cl(s):
    # return rmv_smol_wds(rmv_digits(rmv_stp(cln(s))))
    a = clean(s,
              fix_unicode=True,
              to_ascii=True,
              lower=True,
              no_line_breaks=True,
              no_urls=True,
              no_emails=True,
              no_phone_numbers=True,
              no_numbers=True,
              no_emoji=True,
              replace_with_url=" ",
              replace_with_email=" ",
              replace_with_phone_number=" ",
              replace_with_number=" ",
              lang="en")
    return rmv_smol_wds(lemmatize(rmv_stp(a)))

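# Illustrative usage (added sketch, not from the original source); rmv_smol_wds,
# rmv_stp and lemmatize are assumed to be helper functions defined elsewhere in
# this module, and clean comes from the clean-text package.
print(cl("Check https://example.com or mail me@example.com, it's 100% worth it!"))
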