def wordnetLinking(self):
    for trp in self.triples:
        sub, rel, obj = trp['triple']
        raw_sub, raw_rel, raw_obj = trp['raw_triple']
        sub_id, rel_id, obj_id = self.ent2id[sub], self.rel2id[rel], self.ent2id[obj]

        for sentence in trp['src_sentences']:
            # sent = [wrd.lower() for wrd in sentence.split()]
            sent = sentence.split()

            ''' 92 is the length of list returned by dir when lesk is successful '''
            self.ent2wnet[sub_id] = self.ent2wnet.get(sub_id, set())
            res = lesk(sent, raw_sub)
            if len(dir(res)) == 92:
                self.ent2wnet[sub_id].add(res.name())

            self.ent2wnet[obj_id] = self.ent2wnet.get(obj_id, set())
            res = lesk(sent, raw_obj)
            if len(dir(res)) == 92:
                self.ent2wnet[obj_id].add(res.name())

            self.rel2wnet[rel_id] = self.rel2wnet.get(rel_id, set())
            res = lesk(sent, raw_rel)
            if len(dir(res)) == 92:
                self.rel2wnet[rel_id].add(res.name())

    self.setHeading('Wordnet Entity Clusters')
    self.printCluster(self.ent2wnet, self.id2ent, 'm2ol')
    # for ent in self.ent_list: self.ent2wnet[self.ent2id[ent]] = [ele.name() for ele in lesk(ent)]
    # for rel in self.rel_list: self.rel2wnet[self.rel2id[rel]] = [ele.name() for ele in wordnet.synsets(rel)]

    self.setHeading('Wordnet Relation Clusters')
    self.printCluster(self.rel2wnet, self.id2rel, 'm2ol')

def caseC(question_string, question_string_pos_tagged, noun_phrases_in_question,
          likely_sentence_string, likely_sentence_string_tokenized, noun_phrases_in_likely_sentence):
    second_word_in_question_tag = question_string_pos_tagged[1][1]
    if (second_word_in_question_tag in noun_tags):
        second_word_in_question_supersense = lesk(
            question_string, question_string_pos_tagged[1][0], 'n')
        second_word_in_question_supersense = second_word_in_question_supersense.lexname()

        super_sense_matches_in_likely_sentence = []
        for word in likely_sentence_string_tokenized:
            temp_supersense = lesk(likely_sentence_string, word, 'n')
            if (temp_supersense is None):
                continue
            temp_supersense = temp_supersense.lexname()
            if (temp_supersense == second_word_in_question_supersense):
                super_sense_matches_in_likely_sentence.append(word)

        if (len(super_sense_matches_in_likely_sentence) == 1):
            return super_sense_matches_in_likely_sentence[0]
        if (len(super_sense_matches_in_likely_sentence) > 1):
            first_np_in_likely_sentence = noun_phrases_in_likely_sentence[0]
            answer = closest_phrase_absolute(
                likely_sentence_string, first_np_in_likely_sentence,
                super_sense_matches_in_likely_sentence)
            return answer
        return None
    else:
        return None

def createAllNyms(self, sentence):
    wordTokens = word_tokenize(sentence)
    pos = pos_tag(wordTokens)
    hyper = {}
    hypo = {}
    mero = {}
    holo = {}
    index = 0
    for tokenOrig in wordTokens:
        # print(tokenOrig)
        if (pos[index][1] in self.wordnet_tag_map):
            token = lesk(wordTokens, tokenOrig, self.wordnet_tag_map[pos[index][1]])
        else:
            token = lesk(wordTokens, tokenOrig)
        index = index + 1
        # print(token)
        hyper[token] = []
        hypo[token] = []
        mero[token] = []
        holo[token] = []
        if (token):
            if token.hypernyms():
                hyper[token] = token.hypernyms()
            if token.hyponyms():
                hypo[token] = token.hyponyms()
            mero[token] = token.part_meronyms()
            holo[token] = token.part_holonyms()
    return hyper, hypo, mero, holo

def simplified_lesk(word, sent3, sent1, sent2):
    a = lesk(sent1, 'bank', 'n')
    b = lesk(sent2, 'bank', 'n')
    c = lesk(sent3, 'bank', 'n')
    if a == b and b == c:
        best_sence = b
    elif a == b and a == c:
        best_sence = a
    elif b == c and a == c:
        best_sence = c

    max_overlap = 0
    count_final = 0
    for word in sent3:
        count = 0
        if word in sent1:
            count = count + 1
        if count > count_final:
            max_overlap = 1
    for word in sent3:
        count = 0
        if word in sent2:
            count = count + 1
        if count > count_final:
            max_overlap = 2
    return max_overlap

def my_lesk(tagged_strings, desired_word):
    """
    @tagged_strings: the words of a sentence in the format 'word1_pos1 word2_pos2 ...'
    @desired_word: the word we want disambiguated
    Returns:
        - synset returned by lesk with part of speech (more accurate)
        - synset returned by lesk without pos specified if no pos (less accurate)
        - None if lesk returns nothing
    """
    normal_string = ''
    desired_tag = ''
    for tagged_string in tagged_strings:
        word, tag = tagged_string.rsplit("_", 1)

        # Reject non-ASCII characters
        try:
            word = word.decode('ascii')
        except (UnicodeDecodeError, UnicodeEncodeError):
            continue

        if word == desired_word:
            desired_tag = tag
        normal_string += word + ' '

    # ignore proper nouns and punctuation
    if desired_tag == 'NNP' or desired_tag == 'NNPS' or desired_tag in string.punctuation:
        return None

    # if the POS can be resolved to a wordnet POS, call lesk with POS,
    # else call lesk without POS
    wn_pos = reduce_pos_tagset(desired_tag)
    if not wn_pos:
        return lesk(normal_string, desired_word)
    else:
        return lesk(normal_string, desired_word, wn_pos)

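# The function above expects a pre-tagged 'word_POS' token list and a project-specific
# reduce_pos_tagset() helper. A minimal, self-contained sketch of the same idea using
# plain NLTK (the sentence and tags below are illustrative assumptions, not project data):
from nltk.wsd import lesk

tagged = ["I_PRP", "sat_VBD", "on_IN", "the_DT", "bank_NN", "of_IN", "the_DT", "river_NN"]
words = [t.rsplit("_", 1)[0] for t in tagged]  # strip the POS suffixes
print(lesk(words, "bank", "n"))                # 'NN' maps to the WordNet POS 'n'
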
def createAllNyms(self, sentence):
    wordTokens = word_tokenize(sentence)
    pos = pos_tag(wordTokens)
    hyper = {}
    hypo = {}
    mero = {}
    holo = {}
    index = 0
    hyperd = {}
    hypod = {}
    merod = {}
    holod = {}
    for tokenOrig in wordTokens:
        # print(tokenOrig)
        if (pos[index][1] in self.wordnet_tag_map):
            token = lesk(wordTokens, tokenOrig, self.wordnet_tag_map[pos[index][1]])
        else:
            token = lesk(wordTokens, tokenOrig)
        index = index + 1
        # print(token)
        hyper[token] = []
        hypo[token] = []
        mero[token] = []
        holo[token] = []
        if (token):
            print("")
            print(tokenOrig)
            print(token)
            if token.hypernyms():
                # hyper.append(token.hypernyms())
                hyperd[token] = token.hypernyms()
                print("hypernyms")
                print(token.hypernyms())
                print("")
            else:
                print("there are no hypernyms")
            if token.hyponyms():
                # hypo[token] = token.hyponyms()
                hypod[token] = token.hyponyms()
                print("hyponyms")
                print(token.hyponyms())
                print("")
            else:
                print("there are no hyponyms")
            # mero[token] = token.part_meronyms()
            merod[token] = token.part_meronyms()
            print("meronyms")
            print(token.part_meronyms())
            print("")
            # holo[token] = token.part_holonyms()
            holod[token] = token.part_holonyms()
            print("holonyms")
            print(token.part_holonyms())
            print("")
    return hyperd, hypod, merod, holod

def lesk_similarity(triple_1, triple_2):
    triple_1 = triple_1.lower().split(' ')
    triple_2 = triple_2.lower().split(' ')
    triple_1 = [wnl.lemmatize(w.replace('^-1', '')) for w in triple_1 if w not in stop]
    triple_2 = [wnl.lemmatize(w.replace('^-1', '')) for w in triple_2 if w not in stop]
    triple_1 = [x for x in triple_1 if x != '']
    triple_2 = [x for x in triple_2 if x != '']

    count = 0
    for wrd_1 in triple_1:
        if count == 1:
            break
        for wrd_2 in triple_2:
            if wrd_1 == wrd_2 or wrd_1 in wrd_2 or wrd_2 in wrd_1:
                count += 1
                index_1 = triple_1.index(wrd_1)
                index_2 = triple_2.index(wrd_2)
                break

    if count == 0:
        return 0

    syn_1 = lesk(triple_1, triple_1[index_1])
    syn_2 = lesk(triple_2, triple_2[index_2])
    if syn_1 == syn_2:
        return 1
    else:
        return 0

def wsd(sent, subj, obj):
    '''
    Performs word sense disambiguation via the lesk() method provided for WordNet,
    returning the synsets of the disambiguated words.
    :param sent: sentence from the corpus;
    :param subj: subject of the sentence;
    :param obj: object of the sentence;
    :return: synsets for the subject and the object.
    '''
    possible_subj = ["i", "you", "he", "she", "it", "we", "they"]
    sing_subj = ["i", "he", "she", "it"]
    plural_subj = ["we", "you", "they"]

    if subj in possible_subj or subj is None:
        if subj in plural_subj:
            ris = wn.synsets('people')[0]
        else:
            ris = wn.synsets('person')[0]
    elif subj is not None:
        ris = lesk(sent, subj)
        if ris is None:
            ris = wn.synsets('people')[0]
    else:
        ris = None

    if obj is not None:
        ris1 = lesk(sent, obj)
        if ris1 is None:
            # ris1 = wn.synsets('thing')[0]
            ris1 = wn.synsets('food')[0]
    else:
        ris1 = wn.synsets('food')[0]
        # ris1 = wn.synsets('thing')[0]

    # print("Ris ", ris, "Ris1 ", ris1)
    return ris, ris1

def compute_supersense(tuple, sentence):
    if tuple[2] in personal_pronoun_tags:
        return 'noun.person' if tuple[0] != 'it' else 'noun.entity'
    if tuple[0] == 'who':
        return 'noun.person'
    if tuple[0] == 'what':
        return 'noun.entity'
    # Ex. in sentence: interest in how what people eat affects their health
    # ('what', 'obj', 'WP') what sense returns no sense
    # force to entity

    if tag_to_wnpos_map[tuple[2]] is not None:
        # get the list of WordNet POS mapped to the Penn tag and compute the synset with lesk for each of them
        possible_syns_by_pos = [
            x for x in [
                lesk(sentence, tuple[0], pos=pos)
                for pos in tag_to_wnpos_map[tuple[2]]
            ] if x is not None
        ]
        # use lesk to disambiguate among the synsets found
        filler_sense = lesk(sentence, tuple[0], synsets=possible_syns_by_pos)
    else:
        filler_sense = lesk(sentence, tuple[0])

    filler_supersense = filler_sense.lexname() if filler_sense is not None else None
    return filler_supersense

def __setup_lesk(cls):
    try:
        lesk(word_tokenize('This device is used to jam the signal'), 'jam')
    except LookupError:
        nltk.download('punkt')
        nltk.download('wordnet')
        nltk.download('omw-1.4')
        lesk(word_tokenize('This device is used to jam the signal'), 'jam')

def assign_word_senses(self, word, partOfSpeech, sentenceIndex):
    if partOfSpeech == VERBS:
        wordSense = wn.synsets(word.word, pos=wn.VERB)
        if wordSense:
            word.wordSense = lesk(self.sentences[sentenceIndex], word.word, 'v')
    elif partOfSpeech == NOUNS:
        wordSense = wn.synsets(word.word, pos=wn.NOUN)
        if wordSense:
            word.wordSense = lesk(self.sentences[sentenceIndex], word.word, 'n')

def disambiguate():
    """Returns the best synset if a word is ambiguous"""
    lesks = []
    nouns = o_tag()
    text = one_line()
    for noun in nouns:
        if len(wn.synsets(noun[0])) > 1:
            if lesk(text, noun[0], 'n'):
                lesks.append((noun[0], lesk(text, noun[0], 'n')))
            elif lesk(text, noun[0], 'n'):
                lesks.append((noun[0], wn.synsets(noun[0])))
    return lesks

def apply_lesk(offset_sentic_dict):
    print('Applying Lesk Algorithm Started...')
    # making another offset_sentic_dict for the rest of the concepts and concatenating it to the existing one.
    # input: offset_sentic_dict["00044455-n"] = ['0.1', '0.1', '0.1', '0.1', '#joy', '#surprise', 'positive', '0.726', 'appearance', 'start', 'casus_belli', 'beginning', 'egress'] // semantics might not be included
    # output: last_offset_sentic_dict["00044455-n"] = ['0.1', '0.1', '0.1', '0.1', '#joy', '#surprise', 'positive', '0.726', 'appearance', 'start', 'casus_belli', 'beginning', 'egress']
    with open('vocabulary/affectnet_dict.pkl', 'rb') as f:
        aff_fr_dict = pickle.load(f)

    fr_offset_dict = dict()
    deleted_offset = []  # direct mapped words were deleted before

    for word, value in senticnet.items():
        context = word
        found = False
        for i in range(8, 13):
            context += senticnet[word][i] + ' '
        try:
            synset = lesk(context, word)
            offset = str(synset.offset()).zfill(8) + '-' + synset.pos()
            found = True
        except AttributeError:  # not found
            # sequentially, because it is arranged by c. similarity
            for v in value[8:13]:
                try:
                    synset = lesk(context, v)
                    offset = str(synset.offset()).zfill(8) + '-' + synset.pos()
                    found = True
                except AttributeError:
                    continue
        if found == False:
            continue

        # Direct mapped offset is not considered
        if offset in offset_sentic_dict:
            continue
        if offset in deleted_offset:
            continue

        if offset not in offset_sentic_dict:
            offset_sentic_dict[offset] = value
            fr_offset_dict[offset] = aff_fr_dict[word]
        else:
            if offset_sentic_dict[offset][6] != value[6]:
                del offset_sentic_dict[offset]
                deleted_offset.append(offset)
                continue
            else:
                offset_sentic_dict[offset] = weighted_sentic(offset_sentic_dict[offset], fr_offset_dict[offset],
                                                             value, aff_fr_dict[word])
                fr_offset_dict[offset] += aff_fr_dict[word]
    return offset_sentic_dict

def wsd(listOfWords, word, pos=None):
    if pos == None:
        syn = lesk(listOfWords, word)
        return [syn] if isSynset(syn) else []
    else:
        syns = []
        for x in pos:
            syn = lesk(listOfWords, word, x)
            if isSynset(syn):
                syns.append(syn)
        return syns

def batch_matrix(self, dataset, begin_idx, end_idx):
    indice = range(begin_idx, end_idx)
    r = np.zeros((end_idx - begin_idx, self.sequence_length, self.sequence_length, 5))
    premise = np.array([dataset[i]['sentence1_binary_parse_index_sequence'] for i in indice])
    hypothesis = np.array([dataset[i]['sentence2_binary_parse_index_sequence'] for i in indice])
    premise_def, hypothesis_def = np.zeros((2, end_idx - begin_idx, self.sequence_length, self.sequence_length))
    mask_p, mask_h = np.ones((2, end_idx - begin_idx, self.sequence_length))
    mask_p_def, mask_h_def = np.ones((2, end_idx - begin_idx, self.sequence_length, self.sequence_length))
    labels = np.array([dataset[i]['label'] for i in indice])
    genres = np.array([dataset[i]['genre'] for i in indice])

    for i in range(begin_idx, end_idx):
        pre, hyp = dataset[i]['sentence1'].split()[:self.sequence_length], dataset[i]['sentence2'].split()[:self.sequence_length]
        mask_p[i - begin_idx, :len(pre)] = 1
        mask_h[i - begin_idx, :len(hyp)] = 1

        for i_p, p in enumerate(pre):
            # lesk: find the most probable synset of word p given p and the whole sentence pre
            tmp = lesk(pre, p)
            if tmp is not None:
                # get p's definition
                tmp = tmp.definition().strip('\'()').split()
            else:
                continue
            # turn definition words into corresponding indices
            premise_def[i - begin_idx][i_p][:len(tmp)] = ([word_indices[i] if i in word_indices else 0 for i in tmp])[:self.sequence_length]
            mask_p_def[i - begin_idx][i_p][:len(tmp)] = 1

        for i_h, h in enumerate(hyp):
            tmp = lesk(hyp, h)
            if tmp is not None:
                tmp = tmp.definition().strip('\'()').split()
            else:
                continue
            hypothesis_def[i - begin_idx][i_h][:len(tmp)] = ([word_indices[i] if i in word_indices else 0 for i in tmp])[:self.sequence_length]
            mask_h_def[i - begin_idx][i_h][:len(tmp)] = 1

        for i_p, p in enumerate(pre):
            p_syn = lesk(pre, p)
            if p_syn is None:
                continue
            for i_h, h in enumerate(hyp):
                h_syn = lesk(hyp, h)
                if h_syn is None:
                    continue
                r[i - begin_idx][i_p][i_h][0] = is_synonym(p_syn, h_syn)
                r[i - begin_idx][i_p][i_h][1] = is_antonym(p_syn, h_syn)
                r[i - begin_idx][i_p][i_h][2] = is_hypernym([p_syn], h_syn, 1)
                r[i - begin_idx][i_h][i_p][3] = is_hypernym([p_syn], h_syn, 1)
                r[i - begin_idx][i_p][i_h][4] = is_co_hypernym(p_syn, h_syn)

    return r, premise, hypothesis, premise_def, hypothesis_def, labels, mask_p, mask_h, mask_p_def, mask_h_def, genres

def sentiment_analysis_wsd(text_n_tagged_text):
    pos_tagged_text = text_n_tagged_text[0]
    text = text_n_tagged_text[1]
    pos_arr = []
    neg_arr = []
    subj_arr = []
    for obj in pos_tagged_text:
        if return_pos_sentiwordnet(obj[1]) == 0:
            continue
        pos = return_pos_sentiwordnet(obj[1])
        if lesk(text, obj[0], pos):
            syn = lesk(text, obj[0], pos)
            polarity = polarity_score_1(syn)
            subj = subjectivity_score_1(syn)
        else:
            polarity = polarity_score_2(obj[0], pos)
            subj = subjectivity_score_2(obj[0], pos)
        subj_arr.append(subj)
        if polarity > 0.0:
            pos_arr.append(polarity)
        elif polarity < 0.0:
            neg_arr.append(polarity)
        else:
            continue

    # print pos_arr
    if np.array(pos_arr).size == 0:
        pos_mean_score = 0.0
    else:
        pos_mean_score = round(np.mean(np.array(pos_arr)), 1)
    if np.array(neg_arr).size == 0:
        neg_mean_score = 0.0
    else:
        neg_mean_score = round(np.mean(np.array(neg_arr)), 1)
    subj_mean_score = round(np.mean(np.array(subj_arr)), 1)
    temp_neg_score = neg_mean_score * -1.0

    # if (pos_mean_score, neg_mean_score, subj_mean_score):
    #     return (pos_mean_score, neg_mean_score, subj_mean_score)
    # else:
    #     return (0, 0.0)
    if pos_mean_score > temp_neg_score:
        return ('1', pos_mean_score + neg_mean_score, subj_mean_score)
    elif pos_mean_score < temp_neg_score:
        return ('-1', pos_mean_score + neg_mean_score, subj_mean_score)
    else:
        return ('0', 0.0, subj_mean_score)

def get_semantic_features(tagged_tok, line):
    '''
    return features like synonyms, hypernyms, hyponyms, meronyms, holonyms
    extracted from each word of the sentence
    '''
    lemma_sen = set()
    hyper_sen = set()
    hypo_sen = set()
    mero_sen = set()
    holo_sen = set()
    for word, tag in tagged_tok:
        if tag[:2] in WN_TAG_LIST:
            sense = lesk(line, word, pos=WN_TAG_LIST.get(tag[:2]))
            if not sense:
                continue
            for lem in sense.lemmas():
                lemma_sen.add(lem.name())
            for hyper in sense.hypernyms()[:30]:
                hyper_sen.add(hyper.name())
            for hypo in sense.hyponyms()[:30]:
                hypo_sen.add(hypo.name())
            for mero in sense.part_meronyms()[:30]:
                mero_sen.add(mero.name())
            for holo in sense.member_holonyms()[:30]:
                holo_sen.add(holo.name())
    return (' '.join(lemma_sen), ' '.join(hyper_sen), ' '.join(hypo_sen),
            ' '.join(mero_sen), ' '.join(holo_sen))

def getFeatures(tokensTagged, line):
    lemmaS = set()
    hyperS = set()
    hypoS = set()
    meroS = set()
    holoS = set()
    for word, tag in tokensTagged:
        if tag[:2] in nltkWnMap:  # and tag != 'NNP':
            sense = lesk(line, word, pos=nltkWnMap.get(tag[:2]))
            # sense = lesk(line, word)
            if not sense:
                continue
            for lem in sense.lemmas():
                lemmaS.add(lem.name())
            for hyper in sense.hypernyms()[:featureMaxLimit]:
                hyperS.add(hyper.name())
            for hypo in sense.hyponyms()[:featureMaxLimit]:
                hypoS.add(hypo.name())
            for mero in sense.part_meronyms()[:featureMaxLimit]:
                meroS.add(mero.name())
            for holo in sense.member_holonyms()[:featureMaxLimit]:
                holoS.add(holo.name())
    return (' '.join(lemmaS), ' '.join(hyperS), ' '.join(hypoS),
            ' '.join(meroS), ' '.join(holoS))

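# Both feature extractors above rely on a Penn-to-WordNet tag map (WN_TAG_LIST /
# nltkWnMap) that is defined elsewhere in their projects. A minimal sketch of such
# a map, shown here only as an assumption about what those names contain:
from nltk.corpus import wordnet as wn

PENN_TO_WN = {'NN': wn.NOUN, 'VB': wn.VERB, 'JJ': wn.ADJ, 'RB': wn.ADV}
print(PENN_TO_WN.get('VBD'[:2]))  # -> 'v', suitable for lesk(..., pos='v')
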
def word_synonyms(word, sentence):
    synset = lesk(sentence, word)
    if synset is None:
        return None
    lemmas = synset.lemmas()
    lemmas = [str(lemma.name()) for lemma in lemmas]
    return lemmas

def __subtree__(self, row):
    """
    Find subtree pattern like X -> R <- Y and generation heuristic based score,
    where X: Subject, R: Relation, Y: Object
    """
    corpus = row["corpus"]
    output = []
    tokens = row["tokens"]
    # iterate through all the tokens in the input sentence
    for i, sent in enumerate(corpus):
        r = ''
        for tok in sent:
            # # extract subject
            # if tok.dep_.find("subjpass") == True:
            #     y = tok.text
            # # extract object
            # if tok.dep_.endswith("obj") == True:
            #     x = tok.text
            # extract relation
            if tok.dep_ == "ROOT":
                r = lesk(tokens[i], tok.text, get_wordnet_pos(tok.tag_))
                if r is not None:
                    r = r.name()
                break
        output.append(r)
    return output

def get_replacement(self, token_with_tag, tokens):
    word = token_with_tag[0]
    synset_lock = Lock()
    synset_lock.acquire()
    synset = lesk(tokens, word)
    synset_lock.release()
    if synset is None:
        return None
    lemmas = synset.lemmas()
    # find a replacement
    replacement = None
    for lemma in lemmas:
        lemma_name = lemma.name()
        lemma_pos = lemma.synset().pos()
        if (lemma_pos == 'n') and lemma_name != word:
            replacement = lemma_name
            break
    if replacement is None:
        # replacement = word
        return None
    return replacement.replace("_", " ")

def canonicalise(extractions):
    for ext in extractions:
        ext = extractions[ext]
        # relation synsets
        rel_synsets = set([])
        rel_root = set([])
        if ext['object']:
            sentence = ext['subject'] + ' ' + ext['relation'] + ext['object']
        else:
            sentence = ext['subject'] + ' ' + ext['relation']
        doc = nlp(ext['relation'])
        for token in doc:
            if token.pos_ == 'VERB' and token.text not in ['will', 'shall', 'may', 'must', 'can', 'could']:
                try:
                    rel_synsets.add(lesk(sentence, token.text, 'v').name())
                    rel_root.add(token.lemma_)
                except:
                    print('ERROR:', token.lemma_)
        ext['rel_synsets'] = list(rel_synsets)

        # entity canonicalisation
        ext['subject'] = entity_canonicalisation(ext['subject'])
        ext['object'] = entity_canonicalisation(ext['object'])
        for m in ext['modifiers'] + ext['subject_modifiers']:
            m['m_obj'] = entity_canonicalisation(m['m_obj'])
    return extractions

def findCategories(tokens, tags, nouns, verbs):
    for word in tokens:
        if not lesk(tokens, word):
            continue
        if lesk(tokens, word).pos() == 'n':
            category = lesk(tokens, word).lexname()
            if category not in nouns.keys():
                nouns[category] = 1
            else:
                nouns[category] += 1
        elif lesk(tokens, word).pos() == 'v':
            category = lesk(tokens, word).lexname()
            if category not in verbs.keys():
                verbs[category] = 1
            else:
                verbs[category] += 1

def get_tokens_POS(sentence_token_complexity_pairs):
    stemmer = SnowballStemmer(LANGUAGE)
    tokens_POS = [(sentence, lesk(sentence, str(token)), complexity)
                  for sentence, token, complexity in sentence_token_complexity_pairs]
    for i in range(len(tokens_POS)):
        sentence, token, complexity = tokens_POS[i]
        if token is None:
            word = sentence_token_complexity_pairs[i][1]
            token = lesk(sentence, stemmer.stem(word))
            if token is None:
                tokens_POS[i] = (sentence, word, complexity)
            else:
                tokens_POS[i] = (sentence, token, complexity)
    return [(sentence, token, complexity) for sentence, token, complexity in tokens_POS]

def transform_tag(tag, word, words):
    synset = lesk(words, word, "n")
    if synset:
        if tag == "ORGANIZATION" or tag == "PERSON":
            return tag[:3]
        elif tag == "LOCATION":
            paths = synset.hypernym_paths()
            for path in paths:
                for synset in path:
                    name = synset.name()
                    if "city" in name or "town" in name:
                        return "CIT"
                    elif "country" in name or "state" in name:
                        return "COU"
            return "NAT"
        elif tag == "MISC":
            paths = synset.hypernym_paths()
            for path in paths:
                for synset in path:
                    name = synset.name()
                    if "animal" in name:
                        return "ANI"
                    elif "sport" in name:
                        return "SPO"
                    elif "entertainment" in name:
                        return "ENT"
            return ""
        else:
            return ""
    else:
        return ""

def query_expanded(list_of_words, weight, word_similarity=None):
    """
    Expand the query using various word relations (synonyms, hypernyms or hyponyms)
    """
    count = 0
    expanded_query = []
    for x in list_of_words:
        expanded_query.extend([x for i in range(weight)])
        # WSD
        syn = lesk(list_of_words, x)
        try:
            for l in syn.lemmas():
                # if (count < 3):
                if l.name() not in expanded_query:
                    expanded_query.append(l.name())
                    count += 1
            # for hyp in syn.hypernyms():
            #     for hyp_lemma in hyp.lemmas():
            #         if hyp_lemma.name() not in expanded_query:
            #             expanded_query.append(hyp_lemma.name())
            #             count += 1
            # for hyp in syn.hyponyms():
            #     for hyp_lemma in hyp.lemmas():
            #         if hyp_lemma.name() not in expanded_query:
            #             expanded_query.append(hyp_lemma.name())
            #             count += 1
        except:
            pass
    return expanded_query

def ExtendText(fileName, tagger=PerceptronTagger()):
    with io.open(fileName, 'r') as w:
        text = TextBlob(w.read(), pos_tagger=tagger)
    extended_text = []
    for sent in text.sentences:
        for word in sent.pos_tags:
            # word = "bank"
            penn_tags = ['JJ', 'NN', 'V']
            extending = False
            for tag in penn_tags:
                if tag in word[1]:
                    extending = True
                    pos = tag[0].lower()
                    try:
                        l = lesk(sent.string, word[0].lower(), pos)
                        syns = l._lemma_names
                        for syn in syns:
                            extended_text.append(syn)
                        break
                    except:
                        extended_text.append(word[0].lower())
            if not extending:
                extended_text.append(word[0].lower())
    extended_text = ' '.join([word for word in extended_text if word not in cachedStopWords]).lstrip()
    return extended_text

def main(questionLoc, categoriesLoc, featuresLoc):
    categories, subCategories = [], []
    categories, subCategories = getCategories(categoriesLoc)
    questions = loadQuestions(questionLoc)
    features = getFeatureLists(featuresLoc)
    finalString = ''
    for i in range(len(questions)):
        disambedSens = None
        whWord = features[i][0]
        headWord = features[i][1]
        if headWord == 'null':
            headWord = None
        label = features[i][2]
        label = convertPOS(label)
        question = questions[i].strip()
        uni = GetUnigram(question)
        bi = GetBigram(question)
        tri = GetTrigram(question)
        wordShape = GetWordShape(uni)
        if headWord != None and ':' not in headWord:
            disambigSense = wsd.lesk(question, headWord, pos=label)
            if disambigSense:
                directHypernym = getHypernym(question, 5, disambigSense)
                indirectHypernym = mostSimilarCategory(disambigSense, subCategories)
        print whWord, headWord  # , disambigSense, wordShape, uni, bi, tri

def desambiguar(men):
    # Replace the characters so they do not cause problems
    global cont
    # Split the text into sentences
    frases = sentence_tokenizer.tokenize(men)
    for frase in frases:
        nfrase = frase  # New sentence
        # Process the sentence
        doc = nlp(frase)
        cont = 0
        # Look for the ambiguous words in the sentence
        for token in doc:
            cont = nfrase.find(token.text, cont)
            # Try to disambiguate
            synset = lesk(doc, token.text)
            if synset and len(wn.synsets(token.text)) > 1:
                nfrase = creaPildora(token.text, synset.name(), nfrase)
            else:
                cont += len(token.text)
        men = men.replace(frase, nfrase, 1)
    return men

def synonym_paraphrase(words, xpos):
    verb_labels = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adj_labels = ['JJ', 'JJR', 'JJS']
    noun_labels = ['NN', 'NNP', 'NNS', 'NNPS']
    adv_labels = ['RB', 'RBR', 'RBS']
    synonyms = {}
    for i in range(len(words)):
        ### get pos tag
        pos_tag = None
        if (xpos[i] in verb_labels):
            pos_tag = 'v'
        if (xpos[i] in adj_labels):
            pos_tag = 'a'
        if (xpos[i] in noun_labels):
            pos_tag = 'n'
        if (xpos[i] in adv_labels):
            pos_tag = 'r'
        if (pos_tag == None):
            continue

        ### get morphy of word
        morphy = wn.morphy(words[i], pos_tag)
        if (morphy == None):
            continue

        ### get all synonyms
        meaning = lesk(' '.join(words), morphy, pos_tag)
        if (str(meaning) == 'None'):
            continue
        syns = meaning.lemma_names()
        synonyms[morphy] = syns
    return synonyms

def obtener_mejor_definicion(tokens, sustantivo):
    definiciones = encontrar_definicion(sustantivo)
    if len(definiciones) == 1:
        return definiciones[0]['definicion']
    else:
        synset = lesk(tokens, sustantivo, 'n')
        return synset.definition()

def mostcommonsyns(row):
    # print "Evaluating new row"
    record, stop_words = row
    text = record['text']
    # print "------------------\n\n", text
    # print "hi len: ", len(text)
    stopWords = get_stopwords()
    sentList = nltk.sent_tokenize(text)
    # print "hi2"
    wordsInSentsPos = [nltk.pos_tag(nltk.word_tokenize(s)) for s in sentList]
    # print "hi3"
    wordsInSentsWnPos = [[(w[0], penn2morphy(w[1])) for w in s if w[0].lower() not in stopWords]
                         for s in wordsInSentsPos]
    # print "h4"
    # the above returns a list of sentences where each sentence is a list of
    # (word-as-string, pos tag) tuples. Stop words are removed here because pos_tag
    # uses grammatical structure but lesk does not.
    # This would also be the place to lemmatize, which will help lesk out.
    synsetsList = [lesk(s, w[0], w[1]) for s in wordsInSentsWnPos for w in s]
    # print "go"
    # print synsetsList, "\n\n"
    # return [3,2,2,2]
    # res = FreqDist([x for x in synsetsList if x is not None])
    res = [s.name() for s in synsetsList if s is not None]
    return res

def main():
    wikiDict = pickle.load(open('wikis.pickle', 'rb'))
    amountList = []
    for value in wikiDict.values():
        value = re.sub(r'\[[0-9]*\]', "", value)
        text = sent_tokenize(value)
        for sent in text:
            tokenized = word_tokenize(sent)
            pos = pos_tag(tokenized)
            for token, tag in pos:
                if (tag == 'NNPS' or tag == 'NNP' or tag == 'NNS' or tag == 'NN'):
                    tag = "n"
                    if len(wn.synsets(token, tag)) > 1:
                        print(token, lesk(sent, token, tag))
                        print("All possible senses:")
                        n = 0
                        for ss in wn.synsets(token, tag):
                            print(ss, ss.definition())
                            n += 1
                        amountList.append(int(n))
                        # print()
    # print(amountList)
    c2 = Counter(amountList)
    print(c2)

def ExtendText(fileName, tagger=PerceptronTagger()):
    with io.open(fileName, 'r') as w:
        text = TextBlob(w.read(), pos_tagger=tagger)
    extended_text = []
    for sent in text.sentences:
        for word in sent.pos_tags:
            # word = "bank"
            penn_tags = ['JJ', 'NN', 'V']
            extending = False
            for tag in penn_tags:
                if tag in word[1]:
                    extending = True
                    pos = tag[0].lower()
                    try:
                        l = lesk(sent.string, word[0].lower(), pos)
                        syns = l._lemma_names
                        for syn in syns:
                            extended_text.append(syn)
                        break
                    except:
                        extended_text.append(word[0].lower())
            if not extending:
                extended_text.append(word[0].lower())
    extended_text = ' '.join([word for word in extended_text if word not in cachedStopWords]).lstrip()
    return extended_text

def abstraction_score(text, uselesk=False):
    """Takes in a list of sentences, tags the parts of speech of each word-tokenized
    sentence, looks for parts of speech beginning with N (a type of noun), and returns
    the mean abstraction score by calling calculate_scores.
    Optionally uses the default lesk algorithm from NLTK.
    """
    nouns = []
    for sent in text:
        nouns.append([token for token, pos in pos_tag(word_tokenize(sent)) if pos.startswith('N')])
    # print nouns
    nouns = [item for sublist in nouns for item in sublist]
    scores = []
    for i in nouns:
        if uselesk:
            y = lesk(sent, i)
            if y is not None:
                # print i, calculate_scores(y)
                scores.append(calculate_scores(y))
        else:
            by_word = []
            synsets = wn.synsets(i, "n")
            for synset in synsets:
                by_word.append(calculate_scores(synset))
            absword = np.mean(by_word)
            scores.append(absword)
            # print absword
    scores = np.array(scores)
    # print scores
    return scores[~np.isnan(scores)].mean()

def analyser(doc):
    expand_tokens = []
    expand_Lemms = []
    synsets = []
    # apply the preprocessing and tokenization steps
    sent_text = nltk.sent_tokenize(doc)
    doc_clean = self.build_preprocessor()(doc)
    tokens = self.build_tokenizer()(doc_clean)
    # use CountVectorizer's _word_ngrams built-in method
    # to remove stop words and extract n-grams
    n_grams = list(set(self._word_ngrams(tokens, self.get_stop_words())))
    for x in n_grams:
        my_regex = r"\b(?=\w)" + re.escape(x) + r"\b(?!\w)"
        matched_sent = [s for s in sent_text if len(re.findall(my_regex, s, re.IGNORECASE)) > 0]
        expand_tokens.append(x)
        expand_Lemms.append(matched_sent)
    Expanded_Token_Lemmd = pd.DataFrame({'Word': expand_tokens, 'Sentences': expand_Lemms})
    for i in range(0, len(Expanded_Token_Lemmd.index)):
        for row_len in range(0, len(Expanded_Token_Lemmd.iloc[i]['Sentences'])):
            x = lesk(Expanded_Token_Lemmd.iloc[i]['Sentences'][row_len],
                     Expanded_Token_Lemmd.iloc[i]['Word'].lower().replace(' ', '_'))
            if not x:
                synsets.append(Expanded_Token_Lemmd.iloc[i]['Word'].lower())
            else:
                synsets.append(str(x)[8:-2])
    return (synsets)

def lexsense(word, context=''):
    if context:
        sense = lesk(context, word, pos=wn.NOUN)
        if sense:
            return {sense}
    return set(wn.synsets(word, pos=wn.NOUN))

def calculate_probable_synset(self, sentence: List[str]):
    if self.probable_synset:
        return
    self.probable_synset = lesk(sentence, self.token.text,
                                pos=_token_pos_to_nltk_pos(self.token))

def main(file):
    # Stuff for unicode decode errors
    reload(sys)
    sys.setdefaultencoding("utf-8")

    # Get the text of the URL
    text = getText(file[1])
    tokens = nltk.word_tokenize(text)

    # POS tag words
    taggedWords = nltk.pos_tag(tokens)

    # Filter all nouns
    nouns = [(word, tag) for word, tag in taggedWords if tag.startswith("N")]

    # Set values for the answers
    amountOfPolysemousWords = 0
    amountOfSenses = 0
    listofsenses = []

    # Get wordnet synsets
    for word, tag in nouns:
        # Get the amount of senses
        senses = len(wordnet.synsets(word, "n"))
        # Check if the word is polysemous
        if senses > 1:
            # Count the polysemous words and senses
            amountOfPolysemousWords = amountOfPolysemousWords + 1
            amountOfSenses = amountOfSenses + senses
            listofsenses.append(senses)

    # Answer for question 1
    print("For this file, there are {} polysemous word".format(amountOfPolysemousWords))

    # Answer for question 3
    averageSenses = amountOfSenses / amountOfPolysemousWords
    print("For this file, the average senses are {} per polysemous word".format(averageSenses))

    # Answer for question 4
    result = Counter(listofsenses)
    print(result)

    # Answer for question 5
    words = ["cars", "quantity", "carbon", "states", "change", "life"]
    pos = "n"
    textObject = nltk.Text(tokens)
    for sent in sent_tokenize(text):
        for word in words:
            context = textObject.concordance(word)
            print("\n\n" + str(context))
            print("\n\n The result of algorithm is: " + str(lesk(sent, word, pos)))
            print("\n\n All possible senses for " + word + ":")
            for ss in wordnet.synsets(word, "n"):
                print(ss, ss.definition())

def disambiguate(context, word, pos):
    """
    Word sense disambiguation using the Lesk algorithm
    @context: a string containing the word whose meaning we want to disambiguate
    @word: the word we want to disambiguate
    @pos: the part of speech
    Returns: the Synset of the most likely meaning of the word
    """
    return lesk(context, word, pos)

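# A quick usage sketch for a thin wrapper like disambiguate() above, using only NLTK
# (the sentence and expected output are illustrative; lesk also accepts a token list):
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize

context = word_tokenize("I went to the bank to deposit my money")
print(lesk(context, "bank", "n"))  # typically Synset('savings_bank.n.02')
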
def leskify(file):
    with(open(file, 'r')) as f:
        contents = f.read()
    words, describingWord = wordify(file)
    meaning = {}
    for word in words:
        meaning[word] = lesk(contents, word)
        # meaning.append((word, lesk(contents, word)))
    return meaning, describingWord

def define_func(nick, match, target):
    word = match.group('word')
    sentence = match.group('sentence')
    meaning = match.group('meaning')
    lang = match.group('lang')
    sentiment = match.group('sentiment') != None
    if sentence:
        separator = re.compile("[.\s,]+")
        words = re.split(separator, sentence)
        for w in words:
            synsets = wn.synsets(w)
            if synsets != []:
                word = w
                break
        if synsets != []:
            synset = lesk(words, word)
            define_synset(nick, word, synset, target, lang, sentiment)
        else:
            mb.tell(nick + ": what the flying f**k does that even mean", target)
        return
    if word:
        synsets = wn.synsets(word)
        if synsets == []:
            mb.tell(nick + ': no idea what "' + word + '" is. probably something gay. like you', target)
        elif len(synsets) == 1:
            define_synset(nick, word, synsets[0], target, lang, sentiment)
        else:
            choose = []
            temp_choose = []
            for synset in synsets:
                name = None
                for hypernym in synset.hypernyms():
                    for lemma in hypernym.lemmas():
                        name = lemma.name()
                        break
                    break
                if not name:
                    for lemma in synset.lemmas():
                        name = lemma.name()
                        break
                temp_name = name
                if name in temp_choose:
                    name = name + "(" + str(temp_choose.count(name)) + ")"
                if meaning:
                    if meaning.upper() == name.upper():
                        define_synset(nick, word, synset, target, lang, sentiment)
                        return
                temp_choose.append(temp_name)
                choose.append(name)
            words = "|".join([re.escape(x).replace("_", "[_\s]+") for x in choose])
            message = ", ".join(choose)
            pattern = "^(?:murderb[o0]t[,\s:!]+)?(?:(?:(?:(?:who|what)(?:\s+am|\s+is|\s+are|'s|'re|'m|s|re)(?:\s+a|\s+the)?)|(?:define))\s+)?(?:" + word + "\s+)?(?:as\s+in\s+)?(?P<word>{0})$".format(words)
            clarify = re.compile(pattern, flags=re.IGNORECASE)
            response = {'nick': nick, 'func': clarify_func, 'pattern': clarify,
                        'param': {'sent': sentiment, 'lang': lang, 'synsets': synsets,
                                  'words': [x.upper().replace("_", "") for x in choose],
                                  'word': word},
                        'target': target}
            mb.responses['define'] = response
            mb.tell(nick + ": " + word + " as in " + message + "?", target)
    return

def extract_nouns_info(sentence):
    nouns_info = []
    tokens = word_tokenize(sentence)
    tagged_words = pos_tag(tokens)
    for (word, tag) in tagged_words:
        if ('NN' in tag):
            pos = 'n'
            wsd = lesk(tokens, word, pos)
            syns = wsd.lemma_names()
            nouns_info.append((word, wsd, syns))
    return nouns_info

def lexclass(word, context=''):
    rslt = set()
    if context:
        sense = lesk(context, word, pos=wn.NOUN)
        if sense:
            rslt.add(sense.lexname())
            return rslt
    sets = wn.synsets(word, pos=wn.NOUN)
    for s in sets:
        rslt.add(s.lexname())
    return rslt

def disambiguate(self, wsd_instance):
    """
    Disambiguates the given instance, returning the predicted lemma sense key.
    @param wsd_instance - WSDInstance to disambiguate
    @return Sense key as a string if a sense could be found, None otherwise
    """
    syn = lesk(wsd_instance.context, wsd_instance.lemma, 'n')
    if syn is not None:
        return to_sense_key(syn)
    return None

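# to_sense_key() above is project-specific. One plausible way to derive WordNet sense
# keys from a Synset with plain NLTK (a sketch, not the original project's helper):
from nltk.wsd import lesk

syn = lesk("I went to the bank to deposit my money".split(), "bank", "n")
if syn is not None:
    # each Lemma carries a sense key; pick the one matching the target lemma if present
    print([lemma.key() for lemma in syn.lemmas()])  # e.g. ['savings_bank%1:06:00::', ...]
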
def getWordSentimentTuple(self, word, pos, wordlist):
    if wordlist != "sentiwordnet":
        raise InvalidDictionaryException("Invalid dictionary " + wordlist +
                                         ", please use sentiwordnet")
    else:
        simplePOS = convertPOSTagToSimplePOS(pos)
        if pos:
            wordSense = lesk(self.tokens, word, simplePOS)
            if wordSense:
                sentiSynsetWord = swn.senti_synset(wordSense.name())
                if sentiSynsetWord:
                    return (sentiSynsetWord.pos_score(),
                            sentiSynsetWord.neg_score(),
                            sentiSynsetWord.obj_score())
        return (0, 0, 0)

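# A small, self-contained illustration of the lesk -> SentiWordNet lookup pattern used
# above (requires the NLTK 'sentiwordnet' corpus; the sentence and scores are illustrative):
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk

sense = lesk("the food was good and the service friendly".split(), "good", "a")
if sense is not None:
    s = swn.senti_synset(sense.name())
    print(s.pos_score(), s.neg_score(), s.obj_score())
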
def synonym_picker3(word, wpos, sentence, npos):
    # print(word, pos, sentence)
    # print(word, " ", npos)
    start_synset = lesk(sentence, word, wpos)
    synonyms = []
    # start_synset.res_similarity()
    if (start_synset):
        # print(start_synset.lemmas())
        for l in start_synset.lemmas():
            if l.name() not in synonyms and l.name() != word:
                synonyms.append(l.name().replace('_', ' '))
        if len(synonyms) > 0:
            return random.choice(list(synonyms))
        else:
            return word
    else:
        return word

def analyze(self, doc):
    res = []
    for sentence in self.normalizer.sent_tokenize(doc):
        tagged_sentence = self.tagger.tag(self.normalizer.split_and_normalize(sentence))
        lemmatized_doc = []
        for w, pos in tagged_sentence:
            try:
                pos_ = pos[:1]
                wn_postag = self.translation_dict[pos_]
            except KeyError:
                wn_postag = None
            if wn_postag:
                lemmatized_doc.append(self.lem.lemmatize(w, wn_postag))
        for w in lemmatized_doc:
            sense = wsd.lesk(lemmatized_doc, w)
            if sense:
                res.append(sense.name())
    return res

def get_Candidate_Frequency_from_wordnet(word, tag, context):
    wordnet_tag = get_wordnet_pos(tag)
    sent = list(context)
    # print("context is")
    # print(sent)
    # syns = lesk(sent, word, wordnet_tag)
    syns = lesk(sent, word)
    res = []
    if syns:
        for l in syns.lemmas():
            if l:
                lemma_name = str(l.name())
                st = LancasterStemmer()
                if ((st.stem(word) != st.stem(lemma_name))):
                    res.append(lemma_name)
    candidate_list = list(set(res))
    # print("for word: " + word)
    # print(candidate_list)
    '''
    if candidate_list:
        for c in candidate_list:
            allsyns1 = set(ss for ss in wordnet.synsets(c))
            print(allsyns1)
            allsyns2 = set(ss for ss in wordnet.synsets(word))
            print(allsyns2)
            best = max((wordnet.wup_similarity(s1, s2) or 0, s1, s2)
                       for s1, s2 in product(allsyns1, allsyns2))
            print(best)
    '''
    '''
    for c in candidate_list:
        wordsFromList1 = wordnet.synsets(word)
        wordsFromList2 = wordnet.synsets(c)
        if wordsFromList1 and wordsFromList2:  # Thanks to @alexis' note
            s = wordsFromList1[0].wup_similarity(wordsFromList2[0])
            similarity_list.append(s)
    '''
    return candidate_list

def main():
    path = "group9/"
    dirs = ["p34", "p35"]
    number_of_ss = 0
    synsets = []
    for directory in dirs:
        for directory2 in os.listdir(path + directory):
            for filename in os.listdir(path + directory + "/" + directory2):
                if filename.endswith(".tok.off.pos.ent"):
                    with open(os.path.join(path, directory + "/" + directory2, filename), 'r') as fname:
                        output = open(os.path.join(path, directory + "/" + directory2, "output_disambiguation.txt"), 'w')
                        ambiguous_words = []
                        ambiguous_lines = []
                        text_words = []
                        for line in fname:
                            split_line = line.split()
                            text_words.append(split_line)
                            l = line.split()
                            if l[4] == "NN" or l[4] == "NNP":
                                if len(wordnet.synsets(l[3], "n")) > 1:
                                    number_of_ss += len(wordnet.synsets(l[3], "n"))
                                    synsets.append(len(wordnet.synsets(l[3], "n")))
                                    ambiguous_words.append((l[2], l[3]))
                        for word in ambiguous_words:
                            start = int(str(word[0][0]) + "001")
                            end = start + 999
                            lines = []
                            for l in text_words:
                                if start <= int(l[2]) <= end:
                                    lines.append(l[3])
                            ambiguous_lines.append(lines)
                        for i in range(len(ambiguous_words)):
                            ss = lesk(ambiguous_lines[i], ambiguous_words[i][1], "n")
                            outputwrite = str((ss, ss.definition())) + "\n"
                            output.write(outputwrite)
    c = Counter(synsets)
    print(sorted(c.items(), key=lambda pair: pair[0], reverse=True))
    print(number_of_ss)

def mostSimilarCategory(theSense, theCategories):
    maxValue = 0
    catName = ''
    posList = None
    for cat in theCategories:
        for key, values in cat.iteritems():
            posList = nltk.pos_tag(values)
            index = 0
            contextSentence = ' '.join(values)
            for word in values:  # this is each word in the category
                catSense = wsd.lesk(contextSentence, word)
                if catSense:
                    similarityValue = wn.path_similarity(catSense, theSense)
                    if similarityValue != None:
                        if similarityValue > maxValue:
                            maxValue = similarityValue
                            catName = key
                index += 1
    return catName

def getSenseLocs(words, sentence):
    senseLocs = {}
    token_sent = nltk.word_tokenize(sentence)
    positions = matcher.getPositions(words, token_sent)
    tagged = [(word, get_wordnet_pos(pos)) for word, pos in nltk.pos_tag(token_sent)]
    for key, value in positions.items():
        word = tagged[value][0]
        pos = tagged[value][1]
        # print(word)
        # print(pos)
        syns = wordnet.synsets(word, pos=pos)
        # print(syns)
        # print(token_sent)
        if len(syns) > 0:
            sense = lesk(token_sent, word, pos)
            if sense:
                senseLocs[str(pos) + "_senseLoc"] = syns.index(sense)
    return senseLocs

def combineTags(sentence):
    """ Adds the tags to the testfile """
    # pbar = ProgressBar()
    refDict = defaultdict(list)
    sentList = []
    # for sentence in taggedText:
    for i, words in enumerate(sentence):
        if i != 0:
            prevword = sentence[i-1][0]
            prevtag = sentence[i-1][1]
        else:
            prevword = prevtag = ''
        if words[1] == prevtag:
            newTuple = (newTuple[0] + ' ' + words[0], words[1])
            sentList.pop()
            sentList.append(newTuple)
            refDict[i+1, words].append(newTuple)
        else:
            newTuple = (words[0], words[1])
            sentList.append(newTuple)
            refDict[i+1, words].append(newTuple)
    for i, words in enumerate(sentList):
        if i != 0 and i < len(sentList) - 1:
            sent = sentList[i-1][0], words[0], sentList[i+1][0]
            if words[1] != 'O':
                mwords = words[0].replace(' ', '_')
                if len(wn.synsets(mwords, 'n')) > 1:
                    leskDec = lesk(word_tokenize(' '.join(sent)), mwords, 'n')
                    for value in refDict.values():
                        if value[0] == words and len(value) < 2:
                            value.append([mwords, leskDec, leskDec.definition()])
                else:
                    for ss in wn.synsets(mwords, 'n'):
                        for value in refDict.values():
                            if value[0] == words and len(value) < 2:
                                value.append([mwords, ss, ss.definition()])
    return refDict

import sys
import pprint
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from collections import Counter

meaning = []

# with open('resume.txt', 'r') as resume:
#     for line in resume:
#         # pprint.pprint(line.split())
#         # tokens = nltk
#         pprint.pprint(1)

if __name__ == '__main__':
    job_des_words = []
    resume_words = []
    if (3 == len(sys.argv)):
        script, job_des_file, resume_file = sys.argv
        with(open(job_des_file)) as jdf:
            description = jdf.read()
            job_des_words = description.split(" ")
        with(open(resume_file)) as rf:
            resume = rf.read()
            resume_words = resume.split(" ")
        print(job_des_words[0])
        print(lesk(description, job_des_words[0]))
    else:
        print("No input present, exiting.")

def disAmbi(sents, ambi_nouns):
    noun_dict = dict()
    for noun in ambi_nouns:
        noun_dict[noun] = lesk(sents, noun, "n")
    return noun_dict

def similarity(string1, string2):
    # split the strings into sentences and sentences into words
    sentences1 = [d for d in re.split('\.\W', string1)]
    sentences2 = [d for d in re.split('\.\W', string2)]
    # sentences1 = [d for d in string1.split('|')]
    # sentences2 = [d for d in string2.split('|')]
    sentences1 = [d.split('|') for d in sentences1]
    sentences1 = [d for e in sentences1 for d in e]
    sentences2 = [d.split('|') for d in sentences2]
    sentences2 = [d for e in sentences2 for d in e]

    # remove stop words
    stop = stopwords.words('english')
    # clean_sentence = [j for j in sentence if j not in stop]
    # sentence1 = [j for j in sentence1 if j not in stop]
    # sentence2 = [j for j in sentence2 if j not in stop]

    # remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    # remove_punct_map = dict((ord(char), None) for char in string.punctuation)
    # regex.sub('', s)
    # clean_sentence_npunct = [c.translate(remove_punct_map) for c in clean_sentence]
    sentences1 = [regex.sub('', c) for c in sentences1]
    sentences2 = [regex.sub('', c) for c in sentences2]
    sentences1 = [c.replace('\n', '') for c in sentences1]
    sentences2 = [c.replace('\n', '') for c in sentences2]
    sentences1 = [c.replace('\t', '') for c in sentences1]
    sentences2 = [c.replace('\t', '') for c in sentences2]
    # print(sentences1)
    # print(sentences2)

    totalsimilarity = 0
    for sent1 in sentences1:
        sentence1 = sent1.split()
        for k, ktag in nltk.pos_tag(sentence1):
            maxsimilarity = 0
            syn1 = lesk(sentence1, k)
            # print(k + " and" + ktag)
            if syn1 is None:
                # print("syn1 is none")
                for i, sym1 in enumerate(wn.synsets(k)):
                    syn1 = sym1
            if syn1 is not None:
                for sent2 in sentences2:
                    sentence2 = sent2.split()
                    for j, jtag in nltk.pos_tag(sentence2):
                        # print(syn1.name() + " and " + syn2.name())
                        sim = 0
                        a = 0
                        # print(j + " and " + jtag)
                        # if (ktag is jtag):
                        if (a == 0):
                            syn2 = lesk(sentence2, j)
                            # print(syn1.name() + " and " + syn2.name())
                            if syn2 is None:
                                # print("syn2 is none")
                                for i, sym2 in enumerate(wn.synsets(j)):
                                    syn2 = sym2
                            if syn2 is not None:
                                # print(j + " is none")
                                # else:
                                #     print(syn1.name() + " and " + syn2.name())
                                ps = syn1.path_similarity(syn2)
                                ws = syn1.wup_similarity(syn2)
                                ls = 0
                                if (syn1.name().split('.')[1] == syn2.name().split('.')[1]):
                                    # print(syn1.name() + " and " + syn2.name())
                                    ls = syn1.lch_similarity(syn2)
                                if ps is not None:
                                    sim = sim + ps
                                if ws is not None:
                                    sim = sim + ws
                                if ls is not None:
                                    sim = sim + ls
                                if maxsimilarity < sim:
                                    maxsimilarity = sim
            totalsimilarity = totalsimilarity + maxsimilarity
            # print("\nscore" + str(totalsimilarity))
    return totalsimilarity

def wiki_lookup(search_pass, tag_pass):
    """
    This function looks up a word or bigram with a tag on wikipedia and returns the
    best possible results
    :param search_pass: the word or bigram to lookup
    :param tag_pass: The tag that belongs to the search_pass
    :return: Returns a list with 3 elements, 3 links or less.
    """
    search = search_pass
    tag = tag_pass
    search_lower = search.lower()
    # These tags will return one link
    tagcheck = ["COUNTRY", "STATE", "CITY", "TOWN", "NATURAL_PLACE", "PERSON",
                "ORGANISATION", "ANIMAL", "SPORT"]
    # Since the link returned with president is wrong often, this prevents president from being linked.
    if search_lower != "president":
        # If the search contains just one word.
        if len(search.split(" ")) == 1:
            # Try to get synset of the search, if not possible set synset to None
            try:
                search_syn = wordnet.synsets(search, pos="n")[0]
                search_syn = str(search_syn)
            except IndexError:
                search_syn = None
        # If the search contains multiple words, replace the spaces with _
        else:
            search_clean = search.split(" ")
            search_clean = "_".join(search_clean)
            syn = wordnet.synsets(search_clean, pos="n")
            if len(syn) == 0:
                search_syn = None
            else:
                search_syn = str(wordnet.synsets(search_clean, pos="n")[0])

        wiki_results = []
        url_list = []
        result_syns = []
        to_return = []

        # These tags won't be added to the wiki lookup
        if tag != "NATURAL_PLACE" and tag != "ANIMAL" and tag != "ENTERTAINMENT" and tag != "COUNTRY" and tag != "CITY":
            search = search + " " + tag
            search_results = wikipedia.search(search)
        else:
            search_results = wikipedia.search(search)

        # If search results are found.
        if len(search_results) != 0:
            # Get a summary of all the results found.
            for result in search_results:
                try:
                    wiki_results.append([result, wikipedia.summary(result, sentences=2)])
                except wikipedia.exceptions.DisambiguationError as e:
                    for result_e in e:
                        wiki_results.append([result_e, wikipedia.summary(result, sentences=2)])
                except wikipedia.exceptions.PageError:
                    pass

            # Cleanup the search results, so a synset can be created
            for result in wiki_results:
                result_words = result[0].split(" ")
                if len(result_words) >= 1:
                    # Cleanup the search results
                    result_clean = "_".join(result_words)
                    # Lookup the synset of the search result, using the summary of the search word
                    ss = lesk(result[1], result_clean, "n")
                    try:
                        if ss == None:
                            result.append("-")
                        else:
                            result.append(str(ss))
                            result_syns.append(str(ss))
                    except AttributeError:
                        result.append("-")
                        result_syns.append("-")
                else:
                    result.append("-")
                # Create a url for all the search results
                page = wikipedia.page(result[0])
                result.append(page.url)
                url_list.append(page.url)
            print(search, search_results, url_list)

            # If a synset was found, compare the found synset with the synset of the search.
            if search_syn != None:
                if search_syn in result_syns:
                    for result in wiki_results:
                        if result[2] == search_syn:
                            to_return = [result[3], "-", "-"]
                # Else return the first link
                else:
                    to_return = [url_list[0], "-", "-"]
            # If the tag is in the list with tags that return one link, return one link
            elif tag in tagcheck:
                to_return = [url_list[0], "-", "-"]
            # Else return up to 3 links, if possible
            else:
                if len(url_list) >= 3:
                    to_return = [url_list[0], url_list[1], url_list[2]]
                elif len(url_list) == 2:
                    to_return = [url_list[0], url_list[1], "-"]
                else:
                    to_return = [url_list[0], "-", "-"]
        else:
            to_return = ["-", "-", "-"]
    else:
        to_return = ["-", "-", "-"]
    return to_return

def RecursiveGlossOverlap_Classify(text):
    definitiongraphedges = defaultdict(list)
    definitiongraphedgelabels = defaultdict(list)

    # ---------------------------------------------------------------------------------
    # 2. Compute intrinsic merit (either using linear or quadratic overlap)
    # ---------------------------------------------------------------------------------
    tokenized = nltk.word_tokenize(text)
    fdist1 = FreqDist(tokenized)
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = stopwords + [u' ', u'or', u'and', u'who', u'he', u'she', u'whom', u'well', u'is', u'was', u'were', u'are', u'there', u'where', u'when', u'may', u'The', u'the', u'In', u'in', u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J', u'K', u'L', u'M', u'N', u'O', u'P', u'Q', u'R', u'S', u'T', u'U', u'V', u'W', u'X', u'Y', u'Z']
    puncts = [u' ', u'.', u'"', u',', u'{', u'}', u'+', u'-', u'*', u'/', u'%', u'&', u'(', u')', u'[', u']', u'=', u'@', u'#', u':', u'|', u';', u'\'s']
    # at present tfidf filter is not applied
    # freqterms1 = [w for w in fdist1.keys() if w not in stopwords and w not in puncts and (fdist1.freq(w) * compute_idf(corpus, w))]
    freqterms1 = [w.decode("utf-8") for w in fdist1.keys() if w not in stopwords and w not in puncts]

    current_level = 1
    nodewithmaxparents = ''
    noofparents = 0
    maxparents = 0
    relatedness = 0
    first_convergence_level = 1
    tokensofthislevel = []
    convergingterms = []
    convergingparents = []
    tokensofprevlevel = []
    prevlevelsynsets = []
    commontokens = []
    vertices = 0
    edges = 0
    overlap = 0
    iter = 0
    from nltk.corpus import wordnet as wn

    # recurse down to required depth and update intrinsic merit score
    # relatedness is either sum(overlaps) or sum((overlapping_parents)*(overlaps)^2), also called convergence factor
    while current_level < 3:
        # crucial - gather nodes which converge/overlap (have more than 1 parent)
        if current_level > 1:
            print current_level
            for x in freqterms1:
                for y in parents(x, prevlevelsynsets):
                    ylemmanames = y.lemma_names()
                    # for yl in ylemmanames:
                    #     definitiongraphedges[x].append(yl)
                    definitiongraphedges[x].append(ylemmanames[0])
                    definitiongraphedgelabels[x + " - " + ylemmanames[0]].append(" is a subinstance of ")
                    definitiongraphedgelabels[ylemmanames[0] + " - " + x].append(" is a superinstance of ")
            convergingterms = [w for w in freqterms1 if len(parents(w, prevlevelsynsets)) > 1]
            for kw in freqterms1:
                convergingparents = convergingparents + ([w for w in parents(kw, prevlevelsynsets) if len(parents(kw, prevlevelsynsets)) > 1])
            for kw in freqterms1:
                noofparents = len(parents(kw, prevlevelsynsets))
                if noofparents > maxparents:
                    maxparents = noofparents
                    nodewithmaxparents = kw
        for keyword in freqterms1:
            # WSD - invokes Lesk's algorithm adapted to recursive gloss overlap - best_matching_synset()
            # disamb_synset = best_matching_synset(set(doc1), wn.synsets(keyword))
            if use_pywsd_lesk:
                disamb_synset = simple_lesk(" ".join(freqterms1), keyword)
            if use_nltk_lesk:
                disamb_synset = lesk(freqterms1, keyword)
            else:
                disamb_synset = best_matching_synset(freqterms1, wn.synsets(keyword))
            prevlevelsynsets = prevlevelsynsets + [disamb_synset]
            if len(wn.synsets(keyword)) != 0:
                disamb_synset_def = disamb_synset.definition()
                tokens = nltk.word_tokenize(disamb_synset_def)
                fdist_tokens = FreqDist(tokens)
                # at present frequency filter is not applied
                # if keyword in convergingterms:
                tokensofthislevel = tokensofthislevel + ([w for w in fdist_tokens.keys() if w not in stopwords and w not in puncts and fdist_tokens.freq(w)])
        listcount = len(tokensofthislevel)
        setcount = len(set(tokensofthislevel))
        overlap = listcount - setcount
        if overlap > 0 and iter == 0:
            first_convergence_level = current_level
            iter = 1
        # choose between two relatedness/convergence criteria :-
        # 1) simple linear overlap or 2) zipf distributed quadratic overlap
        # relatedness = relatedness + len(convergingparents)*overlap
        relatedness = relatedness + overlap + len(convergingparents)
        # relatedness = relatedness + ((len(convergingparents)*overlap*overlap) + 1)
        # find out common tokens of this and previous level so that same token does not get grasped again -
        # relatedness must be increased since repetition of keywords in two successive levels is a sign of
        # interrelatedness (a backedge from child-of-one-of-siblings to one-of-siblings). Remove vertices and edges
        # corresponding to common tokens
        commontokens = set(tokensofthislevel).intersection(set(tokensofprevlevel))
        tokensofthislevel = set(tokensofthislevel).difference(commontokens)
        relatedness = relatedness + len(commontokens)
        # decrease the vertices count to address common tokens removed above - edges should remain same since they
        # would just point elsewhere
        vertices = vertices + setcount - len(commontokens)
        edges = edges + listcount
        current_level = current_level + 1
        freqterms1 = set(tokensofthislevel)
        tokensofprevlevel = tokensofthislevel
        tokensofthislevel = []

    intrinsic_merit = vertices * edges * relatedness / first_convergence_level
    print definitiongraphedges

    nxg = nx.DiGraph()
    pos = nx.spring_layout(nxg)
    # pos = nx.shell_layout(nxg)
    # pos = nx.random_layout(nxg)
    # pos = nx.spectral_layout(nxg)
    # nx.draw_graphviz(nxg, prog="neato")
    for k, v in definitiongraphedges.iteritems():
        for l in v:
            nxg.add_edge(k, l)
            nxg.add_edge(l, k)
    # nx.draw_networkx(nxg)
    # plt.show()
    nxg.remove_edges_from(nxg.selfloop_edges())

    # print "Core number =", nx.core_number(nxg)
    sorted_core_nxg = sorted(nx.core_number(nxg).items(), key=operator.itemgetter(1), reverse=True)
    print "Core number (sorted) :", sorted_core_nxg
    print "============================================================================================================="
    print "Unsupervised Classification based on top percentile Core numbers of the definition graph (subgraph of WordNet)"
    print "============================================================================================================="
    no_of_classes = len(nx.core_number(nxg))
    top_percentile = 0
    max_core_number = 0
    max_core_number_class = ""
    for n in sorted_core_nxg:
        print "This document belongs to class:", n[0], ", core number=", n[1]
        if top_percentile < no_of_classes * 0.50:
            top_percentile += 1
        else:
            break
        if n[1] > max_core_number:
            max_core_number = n[1]
            max_core_number_class = n[0]
    print "max_core_number", max_core_number
    print "==================================================================="
    print "Betweenness Centrality of Recursive Gloss Overlap graph vertices"
    print "==================================================================="
    bc = nx.betweenness_centrality(nxg)
    sorted_bc = sorted(bc.items(), key=operator.itemgetter(1), reverse=True)
    print sorted_bc
    print "==================================================================="
    print "Closeness Centrality of Recursive Gloss Overlap graph vertices"
    print "==================================================================="
    cc = nx.closeness_centrality(nxg)
    sorted_cc = sorted(cc.items(), key=operator.itemgetter(1), reverse=True)
    print sorted_cc
    print "==================================================================="
    print "Degree Centrality of Recursive Gloss Overlap graph vertices"
    print "==================================================================="
    dc = nx.degree_centrality(nxg)
    sorted_dc = sorted(dc.items(), key=operator.itemgetter(1), reverse=True)
    print sorted_dc
    print "==================================================================="
    print "Page Rank of the vertices of RGO Definition Graph (a form of Eigenvector Centrality)"
    print "==================================================================="
    sorted_pagerank_nxg = sorted(nx.pagerank(nxg).items(), key=operator.itemgetter(1), reverse=True)
    print sorted_pagerank_nxg
    return (sorted_core_nxg, sorted_pagerank_nxg)