# NOTE: Python 2 module. The imports below are assumed from the calls in
# this file (they may already sit at the top of the full module);
# POSWrapper, FileProcess, WordnetProcess, WordnetHandler and PARAMETERS
# are project-local modules.
import nltk
from nltk.corpus import wordnet as wn
from nltk.metrics import jaccard_distance
from nltk.stem import WordNetLemmatizer

import POSWrapper
import FileProcess
import WordnetProcess
import WordnetHandler
import PARAMETERS

wordnet_lemmatizer = WordNetLemmatizer()

# POS tags of the content words kept for set-based similarity; the empty
# tag keeps tokens the tagger could not label.
CONTENT_POS = ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')


def get_content_words(sentence):
    # Tokenize and POS-tag a sentence, keeping only content words.
    # (Extracted helper: this exact filter was duplicated throughout the file.)
    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(sentence))
    return [word for word, pos in tagged_sent if pos in CONTENT_POS]


def jaccard(sen_1, sen_2):
    # Jaccard distance between the content-word sets of two sentences.
    sen_set_1 = set(get_content_words(sen_1))
    sen_set_2 = set(get_content_words(sen_2))
    return jaccard_distance(sen_set_1, sen_set_2)
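# Usage sketch (hedged: assumes the NLTK tokenizer/tagger data and WordNet
# are installed, and that POSWrapper.pos_tag mirrors nltk.pos_tag). The two
# definitions below are hypothetical; 0.0 means identical content-word
# sets, 1.0 means disjoint ones.
def _demo_jaccard():
    d1 = "a large sum of money"
    d2 = "an amount of money kept in a bank"
    print jaccard(d1, d2)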
def similarity_by_jaccard(ox_defis, wn_defis):
    # Pairwise Jaccard similarity (1 - distance) between every WordNet
    # definition (rows) and every Oxford definition (columns), computed
    # over lemmatized content words.
    matrix_similarity_jaccard = [[0 for _ in range(len(ox_defis))]
                                 for _ in range(len(wn_defis))]

    for iWnWord in range(len(wn_defis)):
        words = [wordnet_lemmatizer.lemmatize(word)
                 for word in get_content_words(wn_defis[iWnWord])]
        wn_set = set(words)

        for iDictWord in range(len(ox_defis)):
            words = [wordnet_lemmatizer.lemmatize(word)
                     for word in get_content_words(ox_defis[iDictWord])]
            dict_set = set(words)

            matrix_similarity_jaccard[iWnWord][iDictWord] = \
                1 - jaccard_distance(wn_set, dict_set)

    return matrix_similarity_jaccard
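# Usage sketch with hypothetical definition lists: rows index wn_defis,
# columns index ox_defis, and each cell holds a similarity in [0, 1].
def _demo_similarity_by_jaccard():
    ox = ["a place where money is kept",
          "the side of a river"]
    wn_defs = ["a financial institution that accepts deposits",
               "sloping land beside a body of water"]
    for row in similarity_by_jaccard(ox, wn_defs):
        print row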
def get_nouns(dict_words):
    # Collect the nouns of each entry's short definition ("sd") and
    # definition ("d"), skipping the headword itself ("en") and "sth".
    dict_words_nouns = []
    for iWord in range(len(dict_words)):
        print iWord
        dict_words_nouns.append([])
        wordDict = dict_words[iWord]

        for field in ("sd", "d"):
            tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict[field]))
            nouns = [word for word, pos in tagged_sent if pos in ("NN", "NNS")]
            for noun in nouns:
                if (noun != wordDict["en"] and noun != "sth"
                        and noun not in dict_words_nouns[iWord]):
                    dict_words_nouns[iWord].append(noun)

        print wordDict["tv"]
        print dict_words_nouns[iWord]

    return dict_words_nouns
def split_and_POS(sen):
    # Tokenize a sentence and return its POS-tag sequence only.
    tokens = nltk.wordpunct_tokenize(sen)
    tagged_words = POSWrapper.pos_tag(tokens)
    return [pos for word, pos in tagged_words]
def preprocess_sentence_to_nouns(sentence):
    # Tokenize and POS-tag a sentence, keeping only its nouns.
    tokens = nltk.wordpunct_tokenize(sentence)
    tagged_words = POSWrapper.pos_tag(tokens)
    return [word for word, pos in tagged_words if check_pos_noun(pos)]
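# Usage sketch (hedged: check_pos_noun is defined elsewhere in the project
# and is assumed to return True for Penn noun tags such as 'NN'/'NNS'):
def _demo_preprocess_sentence_to_nouns():
    print preprocess_sentence_to_nouns("the bank of the river")  # e.g. ['bank', 'river']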
def read_nouns():
    # Build a dict over all WordNet noun synsets. Each key packs the synset
    # name, its definition and its lemma names; the value collects the
    # synset's lemmas plus the lemmas of its hypernyms and hyponyms and the
    # (lemmatized) nouns of its definition.
    dict_wn = {}
    for synset in wn.all_synsets('n'):
        key = synset.name() + "-" + synset.definition()
        lemmas = [str(lemma.name()) for lemma in synset.lemmas()]
        key += "="
        for lemma in lemmas:
            key = key + "-" + lemma
        dict_wn[key] = lemmas

        # hypernym lemmas
        for hypernym in synset.hypernyms():
            for lemma in hypernym.lemmas():
                dict_wn[key].append(lemma.name())

        # hyponym lemmas
        for hyponym in synset.hyponyms():
            for lemma in hyponym.lemmas():
                dict_wn[key].append(lemma.name())

        # nouns of the definition, lemmatized
        tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(synset.definition()))
        nouns = [word for word, pos in tagged_sent if pos == 'NN']
        for noun in nouns:
            dict_wn[key].append(wordnet_lemmatizer.lemmatize(noun))

    return dict_wn
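# Usage sketch: read_nouns() walks every noun synset in WordNet (tens of
# thousands of entries), so expect it to take a while on a full run.
def _demo_read_nouns():
    dict_wn = read_nouns()
    print len(dict_wn)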
def _closest_synset(candidate_synsets, target_synsets):
    # Helper extracted from the duplicated selection loops below: from the
    # candidate synsets return the one with the highest path similarity to
    # any target synset, defaulting to the first candidate.
    synMax = candidate_synsets[0]
    p_max = 0
    for candidate in candidate_synsets:
        for target in target_synsets:
            p = target.path_similarity(candidate)
            if p is not None and p > p_max:
                p_max = p
                synMax = candidate
    return synMax


def _best_root_synset(candidate_synsets, target_synsets, n_best=1):
    # Helper: pick the candidate whose summed top-n_best path similarities
    # to the target synsets is highest. (The original copies of this loop
    # had an off-by-one that skipped the last candidate similarity; the
    # full range is used here, and None similarities count as 0.)
    iSDMax = 0
    pSD_max = 0
    for iSyn in range(len(candidate_synsets)):
        arr_p = [candidate_synsets[iSyn].path_similarity(s) or 0.0
                 for s in target_synsets]
        arr_p = sorted(arr_p, reverse=True)
        pSD = sum(arr_p[:n_best])
        if pSD > pSD_max:
            pSD_max = pSD
            iSDMax = iSyn
    return candidate_synsets[iSDMax]


def get_nbest_synsets_n_v_with_word_vn(dict_words, word_concept):
    # For every dictionary entry, collect nouns (from "sd"/"d" plus the xh0
    # cross-reference) and verbs (from "d"), then map each to the WordNet
    # synset that best matches the synsets of word_concept.
    dict_words_nouns = []
    dict_synsets_nouns = []
    wn_words = wn.synsets(word_concept, pos='n')

    for iWord in range(len(dict_words)):
        print iWord
        dict_words_nouns.append([])
        dict_synsets_nouns.append([])
        wordDict = dict_words[iWord]

        if 'tv' not in wordDict or 'd' not in wordDict:
            continue

        # - - - nouns from the short definition ("sd"), falling back to "d"
        nouns = []
        if 'sd' in wordDict:
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["sd"]))
            nouns = [word for word, pos in tagged_sent
                     if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]
            if len(nouns) == 0:
                tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
                nouns = [word for word, pos in tagged_sent
                         if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]
        elif wordDict["d"] is not None:
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
            # the original tested (word != 'sth' or word != 'etc'), which is
            # always true; 'and' is the intended filter
            nouns = [word for word, pos in tagged_sent
                     if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]
        else:
            continue

        for noun in nouns:
            noun = wordnet_lemmatizer.lemmatize(noun, pos='n')
            if noun is None:
                continue
            if noun not in ('sth', 'etc') and noun not in dict_words_nouns[iWord]:
                dict_words_nouns[iWord].append(noun)

        if len(dict_words_nouns[iWord]) == 0:
            continue
        print dict_words_nouns[iWord]

        # candidate root synsets from the collected nouns
        synsetsSD = []
        for word in dict_words_nouns[iWord]:
            synsetsSD.extend(wn.synsets(word, pos='n'))
        if len(synsetsSD) == 0:
            continue

        # - - - nouns from the definition ("d"), plus the xh0 cross-reference
        tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
        nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS')]
        if wordDict.get('xh0') is not None and wordDict['xh0'] != 'nn':
            nouns.append(wordDict['xh0'])

        for noun in nouns:
            noun = wordnet_lemmatizer.lemmatize(noun, pos='n')
            if noun is None:
                continue
            if (noun.encode('utf8') != word_concept and noun != "sth"
                    and noun not in dict_words_nouns[iWord]):
                dict_words_nouns[iWord].append(noun)

        print wordDict["tv"]
        print dict_words_nouns[iWord]

        # - - - noun synsets: best match against the word_concept synsets
        for noun in dict_words_nouns[iWord]:
            synsets_noun = wn.synsets(noun, pos='n')
            if len(synsets_noun) <= 0:
                continue
            synMax = _closest_synset(synsets_noun, wn_words)
            if synMax not in dict_synsets_nouns[iWord]:
                dict_synsets_nouns[iWord].append(synMax)

        # - - - verbs from the definition ("d")
        tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
        verbs = [word for word, pos in tagged_sent if pos in ('VB', 'VBN', 'VBD')]
        for verb in verbs:
            verb = wordnet_lemmatizer.lemmatize(verb, pos='v')
            if verb is None:
                continue
            if (verb.encode('utf8') != word_concept and verb != "sth"
                    and verb not in dict_words_nouns[iWord]):
                dict_words_nouns[iWord].append(verb)

        print wordDict["tv"]
        print dict_words_nouns[iWord]

        # - - - verb synsets: best match against a root synset chosen by
        # top-2 similarity to the word_concept synsets
        synsetRoot = _best_root_synset(synsetsSD, wn_words, n_best=2)
        print "synsetroot"
        print synsetRoot

        for verb in dict_words_nouns[iWord]:
            synsets_verb = wn.synsets(verb, pos='v')
            if len(synsets_verb) <= 0:
                continue
            synMax = _closest_synset(synsets_verb, [synsetRoot])
            if synMax not in dict_synsets_nouns[iWord]:
                dict_synsets_nouns[iWord].append(synMax)

        print "dict_synsets_nouns"
        print dict_synsets_nouns[iWord]

    return dict_synsets_nouns
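# Usage sketch with a hypothetical Oxford-style entry (keys as used above:
# "tv" headword label, "sd" short definition, "d" definition):
def _demo_get_nbest_synsets_n_v_with_word_vn():
    entry = {"tv": "bank (noun)",
             "sd": "organization",
             "d": "an organization that keeps money for customers"}
    print get_nbest_synsets_n_v_with_word_vn([entry], "bank")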
def _print_similarity_matrix(matrix):
    # Pretty-print a 2D matrix with tab-aligned columns (extracted helper:
    # this block was repeated four times below).
    print "----------------------------------------------------"
    s = [[str(e) for e in row] for row in matrix]
    lens = [max(map(len, col)) for col in zip(*s)]
    fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
    print '\n'.join(fmt.format(*row) for row in s)


def _nbest_average_similarity(dict_synsets, wn_synsets):
    # Score one dictionary sense against one WordNet sense: for each
    # dictionary synset, average its 8 best path similarities to the
    # WordNet synsets; then average those scores, weighting the best one
    # 5x. (The original's two copies of this loop differed only by an
    # off-by-one in the loop bound, unified here; the branches that all
    # multiplied by 1 are collapsed.)
    arr_p_word = []
    for dict_synset in dict_synsets:
        arr_p = []
        for wn_synset in wn_synsets:
            p = dict_synset.path_similarity(wn_synset)
            if p is not None:
                arr_p.append(p)
        arr_p = sorted(arr_p, reverse=True)
        p_sum = 0.
        count = 0.0001  # avoids division by zero when no similarity exists
        for i in range(min(len(arr_p), 8)):
            p_sum += arr_p[i]
            count += 1
        arr_p_word.append(p_sum / count)

    arr_p_word = sorted(arr_p_word, reverse=True)
    p_total = 0.
    count = 5  # smoothing: biases the average toward 0 for short lists
    for i in range(min(len(arr_p_word), 10)):
        p_total += arr_p_word[i] * (5. if i == 0 else 1.)
        count += 1
    return p_total / count


def similarity_by_synsets_synsets_nbest_withword_average(WORD, dict_words):
    # Symmetric similarity between the WordNet senses of WORD and the
    # dictionary senses: nbest path-similarity averages in both directions,
    # blended with Jaccard similarity of the definitions, written to CSV.

    # - - - dictionary side
    dict_words_synsets = get_nbest_synsets_n_v_with_word(dict_words, WORD)

    # - - - WordNet side
    wn_words = wn.synsets(WORD, pos='n')
    print "wn_words -------"
    print wn_words
    wn_words_synsets = WordnetProcess.get_synsets_n_v(WORD, wn_words)
    print wn_words_synsets

    # forward pass: WordNet sense (rows) vs dictionary sense (columns)
    matrix_similarity = [[0 for _ in range(len(dict_words))]
                         for _ in range(len(wn_words))]
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            matrix_similarity[iWnWord][iDictWord] = _nbest_average_similarity(
                dict_words_synsets[iDictWord], wn_words_synsets[iWnWord])
    _print_similarity_matrix(matrix_similarity)

    # reverse pass: swap the two sides and score again
    wn_words = dict_words
    wn_words_synsets = get_nbest_synsets_n_v_with_word(wn_words, WORD)
    dict_words = wn.synsets(WORD, pos='n')
    dict_words_synsets = WordnetProcess.get_synsets_n_v(WORD, dict_words)

    matrix_similarity_reverse = [[0 for _ in range(len(dict_words))]
                                 for _ in range(len(wn_words))]
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            matrix_similarity_reverse[iWnWord][iDictWord] = _nbest_average_similarity(
                dict_words_synsets[iDictWord], wn_words_synsets[iWnWord])
    _print_similarity_matrix(matrix_similarity_reverse)

    # swap back and average the two directions
    dict_words = wn_words
    wn_words = wn.synsets(WORD, pos='n')
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            matrix_similarity[iWnWord][iDictWord] = (
                matrix_similarity[iWnWord][iDictWord]
                + matrix_similarity_reverse[iDictWord][iWnWord]) / 2
    _print_similarity_matrix(matrix_similarity)

    # - - - Jaccard distance between definitions (dict_words is keyed by
    # stringified indices here, as in the original)
    matrix_similarity_jaccard = [[0 for _ in range(len(dict_words))]
                                 for _ in range(len(wn_words))]
    for iWnWord in range(len(wn_words)):
        words = [wordnet_lemmatizer.lemmatize(word)
                 for word in get_content_words(wn_words[iWnWord].definition())]
        wn_set = set(words)

        for iDictWord in range(len(dict_words)):
            if dict_words[str(iDictWord)].get("d") is None:
                matrix_similarity_jaccard[iWnWord][iDictWord] = 1
                continue
            words = [wordnet_lemmatizer.lemmatize(word)
                     for word in get_content_words(dict_words[str(iDictWord)]["d"])]
            dict_set = set(words)
            matrix_similarity_jaccard[iWnWord][iDictWord] = \
                jaccard_distance(wn_set, dict_set)

    # blend the path-based score with Jaccard similarity (weights 10:2)
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            matrix_similarity[iWnWord][iDictWord] = (
                matrix_similarity[iWnWord][iDictWord] * 10
                + 2 * (1 - matrix_similarity_jaccard[iWnWord][iDictWord])) / 12
    _print_similarity_matrix(matrix_similarity)

    # - - - write file: definitions as the first column, "tv" labels as the
    # header row
    for i in range(len(wn_words)):
        matrix_similarity[i].insert(0, wn_words[i].definition())

    arrRowDict = ["--"]
    for i in range(len(dict_words)):
        if dict_words[str(i)].get('tv') is None:
            dict_words[str(i)]['tv'] = "--"
        arrRowDict.append(dict_words[str(i)]["tv"].encode('utf8'))

    FileProcess.write_to_excel_file(
        "Results/" + WORD + "_synsets_synsets_nbest_withword_average.csv",
        arrRowDict, matrix_similarity)
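# Worked example of the final blend above (weights 10 and 2): a path-based
# score of 0.30 combined with a Jaccard distance of 0.75 gives
# (0.30 * 10 + 2 * (1 - 0.75)) / 12 = 3.5 / 12 ~= 0.29.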
def get_nouns(word_origin, wn_words):
    # For each synset of word_origin collect related words: its own lemmas,
    # its hyponyms' lemmas and the nouns of its definition.
    wn_words_nouns = []
    for iWord in range(len(wn_words)):
        print "- - - - - - - - - - - - - - - - - - - - - - - - - - -"
        print iWord
        wn_words_nouns.append([])
        wordDict = wn_words[iWord]

        # lemmas of the synset itself
        print "synsets -------"
        synset_of_word = wn.synset(wordDict.name())
        for lemma in synset_of_word.lemmas():
            lemma_name = lemma.name()
            if lemma_name != word_origin:
                print lemma_name
                wn_words_nouns[iWord].append(lemma_name)

        # hyponym lemmas (the original compared against the literal "bank";
        # word_origin is the intended filter)
        print "\nhyponyms ------"
        for hyponym in synset_of_word.hyponyms():
            for lemma in hyponym.lemmas():
                lemma_name = lemma.name()
                if (lemma_name != word_origin
                        and lemma_name not in wn_words_nouns[iWord]):
                    print lemma_name
                    wn_words_nouns[iWord].append(lemma_name)

        # nouns of the definition
        print "\ndefinition ------"
        tagged_sent = POSWrapper.pos_tag(
            nltk.word_tokenize(synset_of_word.definition()))
        nouns = [word for word, pos in tagged_sent if pos == 'NN']
        for noun in nouns:
            if (noun != word_origin and noun != "sth"
                    and noun not in wn_words_nouns[iWord]):
                print noun
                wn_words_nouns[iWord].append(noun)

    return wn_words_nouns
def get_synsets_n_v(word_origin, wn_words):
    # For each synset of word_origin collect related synsets: the synset
    # itself, its part meronyms and hyponyms, and the best-matching synsets
    # of the nouns/adjectives and verbs in its definition.
    wn_words_synset = []
    for iWord in range(len(wn_words)):
        print "- - - - - - - - - - - - - - - - - - - - - - - - - - -"
        print iWord
        wn_words_synset.append([])
        wordDict = wn_words[iWord]

        # the synset itself
        synset_of_word = wn.synset(wordDict.name())
        wn_words_synset[iWord].append(synset_of_word)
        print synset_of_word

        # part meronyms
        print "---"
        for meronym in synset_of_word.part_meronyms():
            print meronym
            wn_words_synset[iWord].append(meronym)

        # hyponyms
        print "---"
        for hyponym in synset_of_word.hyponyms():
            print hyponym
            wn_words_synset[iWord].append(hyponym)

        # nouns and adjectives of the definition
        print "\ndefinition ------"
        tagged_sent = POSWrapper.pos_tag(
            nltk.wordpunct_tokenize(synset_of_word.definition()))
        print tagged_sent
        nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS', 'JJ')]
        for noun in nouns:
            noun = wordnet_lemmatizer.lemmatize(noun, pos='n')
            if noun is None:
                continue
            synsetsDictNoun = wn.synsets(noun, pos='n')
            if len(synsetsDictNoun) > 0:
                # best-matching synset of the noun (the original left the
                # synsetMax assignment commented out, so it always took the
                # first synset despite computing p_max)
                synsetMax = _closest_synset(synsetsDictNoun, [synset_of_word])
                print synsetMax
                if synsetMax not in wn_words_synset[iWord]:
                    wn_words_synset[iWord].append(synsetMax)

        # verbs of the definition
        tagged_sent = POSWrapper.pos_tag(
            nltk.word_tokenize(synset_of_word.definition()))
        verbs = [word for word, pos in tagged_sent if pos in ('VB', 'VBD', 'VBN')]
        for verb in verbs:
            # lemmatize as a verb (the original passed pos='n')
            verb = wordnet_lemmatizer.lemmatize(verb, pos='v')
            if verb is None:
                continue
            synsetsDictVerb = wn.synsets(verb, pos='v')
            if len(synsetsDictVerb) > 0:
                synsetMax = _closest_synset(synsetsDictVerb, [synset_of_word])
                print synsetMax
                if synsetMax not in wn_words_synset[iWord]:
                    wn_words_synset[iWord].append(synsetMax)

    return wn_words_synset
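# Usage sketch: expand the first two noun senses of "bank" into their
# related synsets (meronyms, hyponyms, definition nouns/verbs).
def _demo_get_synsets_n_v():
    senses = wn.synsets("bank", pos="n")[:2]
    for synset_list in get_synsets_n_v("bank", senses):
        print synset_list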
def split_words(sen):
    # Tokenize a sentence and return its (word, POS) pairs.
    tokens = nltk.wordpunct_tokenize(sen)
    return POSWrapper.pos_tag(tokens)
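# Usage sketch: split_words returns (token, POS) pairs, e.g.
# [('the', 'DT'), ('river', 'NN'), ('bank', 'NN')] with a Penn-style tagger.
def _demo_split_words():
    print split_words("the river bank")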
def get_synsets_for_word_in_wn(word_origin, wn_synsets_for_word_origin):
    # For each synset of word_origin build a feature list of related
    # synsets plus a parallel list of weights: 1.5 for the synset itself,
    # 1.2 for relation synsets, 1.0 for synsets from the definition.
    # PARAMETERS flags toggle each relation type.
    wn_synsets_for_words = []
    p_synsets_for_words = []

    for iWord in range(len(wn_synsets_for_word_origin)):
        wn_synsets_for_words.append([])
        p_synsets_for_words.append([])
        wordDict = wn_synsets_for_word_origin[iWord]

        # the synset itself
        synset_of_word = wn.synset(wordDict.name())
        wn_synsets_for_words[iWord].append(synset_of_word)
        p_synsets_for_words[iWord].append(1.5)

        # hypernyms
        if PARAMETERS.DICT_WN_FEATURE_RELATION_hypernyms == 1:
            for hypernym in synset_of_word.hypernyms():
                wn_synsets_for_words[iWord].append(hypernym)
                p_synsets_for_words[iWord].append(1.2)

        # part meronyms
        if PARAMETERS.DICT_WN_FEATURE_RELATION_part_meronyms == 1:
            for meronym in synset_of_word.part_meronyms():
                wn_synsets_for_words[iWord].append(meronym)
                p_synsets_for_words[iWord].append(1.2)

        # member holonyms
        if PARAMETERS.DICT_WN_FEATURE_RELATION_member_holonyms == 1:
            for holonym in synset_of_word.member_holonyms():
                wn_synsets_for_words[iWord].append(holonym)
                p_synsets_for_words[iWord].append(1.2)

        # hyponyms
        if PARAMETERS.DICT_WN_FEATURE_RELATION_hyponyms == 1:
            for hyponym in synset_of_word.hyponyms():
                wn_synsets_for_words[iWord].append(hyponym)
                p_synsets_for_words[iWord].append(1.2)

        # definition
        if PARAMETERS.DICT_WN_FEATURE_RELATION_definition == 1:
            tagged_sent = POSWrapper.pos_tag(
                nltk.wordpunct_tokenize(synset_of_word.definition()))

            if PARAMETERS.POS_FEATURE_n == 1:
                nouns = [word for word, pos in tagged_sent
                         if pos in ('NN', 'NNS', 'JJ')]
                for noun in nouns:
                    noun = wordnet_lemmatizer.lemmatize(noun, pos='n')
                    if noun is None:
                        continue
                    if noun != word_origin and noun != "sth":
                        synsetsDictNoun = WordnetHandler.get_synsets_for_word(noun, "n")
                        if len(synsetsDictNoun) > 0:
                            synsetMax = _closest_synset(synsetsDictNoun,
                                                        [synset_of_word])
                            if synsetMax not in wn_synsets_for_words[iWord]:
                                wn_synsets_for_words[iWord].append(synsetMax)
                                p_synsets_for_words[iWord].append(1.)

            if PARAMETERS.POS_FEATURE_v == 1:
                verbs = [word for word, pos in tagged_sent
                         if pos in ('VB', 'VBD', 'VBN')]
                for verb in verbs:
                    verb = wordnet_lemmatizer.lemmatize(verb, pos='v')
                    if verb is None:
                        continue
                    # the original compared against the literal "bank";
                    # word_origin is the intended filter
                    if verb != word_origin:
                        synsetsDictVerb = WordnetHandler.get_synsets_for_word(verb, "v")
                        if len(synsetsDictVerb) > 0:
                            synsetMax = _closest_synset(synsetsDictVerb,
                                                        [synset_of_word])
                            if synsetMax not in wn_synsets_for_words[iWord]:
                                wn_synsets_for_words[iWord].append(synsetMax)
                                p_synsets_for_words[iWord].append(1.)

    return wn_synsets_for_words, p_synsets_for_words
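# Usage sketch (hedged: assumes the PARAMETERS relation flags are set to 1):
# each sense of "bank" yields a feature list of synsets and a parallel
# weight list (1.5 self, 1.2 relations, 1.0 definition words).
def _demo_get_synsets_for_word_in_wn():
    senses = wn.synsets("bank", pos="n")[:1]
    feats, weights = get_synsets_for_word_in_wn("bank", senses)
    print feats[0]
    print weights[0]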
def get_nbest_synsets_for_word_in_oxford(dict_words, word_concept):
    # For every Oxford entry (keyed by stringified index) collect nouns
    # (from "sd"/"d" and the xh cross-references) and verbs (from "d"),
    # then map each to the WordNet synset that best matches the synsets of
    # word_concept. PARAMETERS flags toggle the feature sources.
    dict_words_nouns = []
    dict_words_verbs = []
    dict_synsets_for_words = []
    wn_words = WordnetHandler.get_synsets_for_word(word_concept, 'n')

    for iWord in range(len(dict_words)):
        dict_words_nouns.append([])
        dict_words_verbs.append([])
        dict_synsets_for_words.append([])
        wordDict = dict_words[str(iWord)]

        if 'tv' not in wordDict or 'd' not in wordDict:
            continue

        # - - - nouns from the short definition ("sd"), falling back to "d"
        nouns = []
        if 'sd' in wordDict and PARAMETERS.DICT_OX_FEATURE_RELATION_sd == 1:
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["sd"]))
            nouns = [word for word, pos in tagged_sent
                     if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]
            if len(nouns) == 0:
                tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
                nouns = [word for word, pos in tagged_sent
                         if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]
        elif wordDict["d"] is not None:
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
            # the original tested (word != 'sth' or word != 'etc'), which is
            # always true; 'and' is the intended filter
            nouns = [word for word, pos in tagged_sent
                     if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]
        else:
            continue

        for noun in nouns:
            noun = wordnet_lemmatizer.lemmatize(noun, pos='n')
            if noun is None:
                continue
            if noun not in ('sth', 'etc') and noun not in dict_words_nouns[iWord]:
                dict_words_nouns[iWord].append(noun)

        if len(dict_words_nouns[iWord]) == 0:
            continue

        synsetsSD = []
        for word in dict_words_nouns[iWord]:
            synsetsSD.extend(WordnetHandler.get_synsets_for_word(word, 'n'))
        if len(synsetsSD) == 0:
            continue

        # - - - nouns from the definition ("d") and the xh cross-references
        if PARAMETERS.DICT_OX_FEATURE_RELATION_d == 1:
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
            nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS')]

            if PARAMETERS.DICT_OX_FEATURE_RELATION_xh == 1:
                if wordDict.get('xh0') is not None and wordDict['xh0'] != 'nn':
                    nouns.append(wordDict['xh0'])
                if wordDict.get('xh1') is not None:
                    nouns.append(wordDict['xh1'])
                if wordDict.get('xh2') is not None:
                    nouns.append(wordDict['xh2'])

            for noun in nouns:
                noun = wordnet_lemmatizer.lemmatize(noun, pos='n')
                if noun is None:
                    continue
                if (noun.encode('utf8') != word_concept and noun != "sth"
                        and noun not in dict_words_nouns[iWord]):
                    dict_words_nouns[iWord].append(noun)

        # - - - noun synsets: best match against the word_concept synsets
        for noun in dict_words_nouns[iWord]:
            synsets_noun = WordnetHandler.get_synsets_for_word(noun, 'n')
            if len(synsets_noun) <= 0:
                continue
            synMax = _closest_synset(synsets_noun, wn_words)
            if synMax not in dict_synsets_for_words[iWord]:
                dict_synsets_for_words[iWord].append(synMax)

        # - - - verbs from the definition ("d")
        if PARAMETERS.POS_FEATURE_v:
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
            verbs = [word for word, pos in tagged_sent
                     if pos in ('VB', 'VBN', 'VBD')]
            for verb in verbs:
                verb = wordnet_lemmatizer.lemmatize(verb, pos='v')
                if verb is None:
                    continue
                if (verb.encode('utf8') != word_concept and verb != "sth"
                        and verb not in dict_words_verbs[iWord]):
                    dict_words_verbs[iWord].append(verb)

            # verb synsets: best match against the word_concept synsets
            for verb in dict_words_verbs[iWord]:
                synsets_verb = WordnetHandler.get_synsets_for_word(verb, 'v')
                if len(synsets_verb) <= 0:
                    continue
                synMax = _closest_synset(synsets_verb, wn_words)
                if synMax not in dict_synsets_for_words[iWord]:
                    dict_synsets_for_words[iWord].append(synMax)

    return dict_synsets_for_words
def get_synsets(dict_words):
    # Collect nouns from each entry's "sd" and "d" fields, choose a root
    # synset from the last "sd" noun, then map every collected noun to its
    # synset closest to that root.
    dict_words_nouns = []
    dict_synsets_nouns = []
    for iWord in range(len(dict_words)):
        print iWord
        dict_words_nouns.append([])
        dict_synsets_nouns.append([])
        wordDict = dict_words[iWord]

        # - - - nouns from the short definition ("sd")
        tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["sd"]))
        nouns = [word for word, pos in tagged_sent if pos in ("NN", "NNS")]
        for noun in nouns:
            if noun not in ("sth", "etc") and noun not in dict_words_nouns[iWord]:
                dict_words_nouns[iWord].append(noun)
        print dict_words_nouns[iWord]

        # candidate root synsets come from the last "sd" noun
        synsetsSD = wn.synsets(dict_words_nouns[iWord][-1], pos="n")

        # - - - nouns from the definition ("d")
        tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["d"]))
        nouns = [word for word, pos in tagged_sent if pos in ("NN", "NNS")]
        for noun in nouns:
            if (noun != wordDict["en"] and noun != "sth"
                    and noun not in dict_words_nouns[iWord]):
                dict_words_nouns[iWord].append(noun)

        print wordDict["tv"]
        print dict_words_nouns[iWord]

        # - - - root synset: the candidate with the highest summed
        # best-similarity over the collected nouns
        iSDMax = 0
        pSD_max = 0
        for iSyn in range(len(synsetsSD)):
            synsetSD = synsetsSD[iSyn]
            pSD = 0
            for iNoun in range(1, len(dict_words_nouns[iWord])):
                synsets_noun = wn.synsets(dict_words_nouns[iWord][iNoun], pos="n")
                p_noun_max = 0
                for synset_noun in synsets_noun:
                    p = synsetSD.path_similarity(synset_noun)
                    if p is not None and p > p_noun_max:
                        p_noun_max = p
                pSD += p_noun_max
            if pSD > pSD_max:
                pSD_max = pSD
                iSDMax = iSyn
        # the original hard-coded synsetsSD[0] here, discarding the
        # selection above; synsetsSD[iSDMax] is the intended root
        synsetRoot = synsetsSD[iSDMax]
        print "synsetroot"
        print synsetRoot

        for noun in dict_words_nouns[iWord]:
            synsets_noun = wn.synsets(noun, pos="n")
            if len(synsets_noun) <= 0:
                continue
            dict_synsets_nouns[iWord].append(
                _closest_synset(synsets_noun, [synsetRoot]))

        print "dict_synsets_nouns"
        print dict_synsets_nouns[iWord]

    return dict_synsets_nouns
def get_nbest_synsets_n_v_x_with_word(dict_words, word_concept):
    # Like get_nbest_synsets_n_v_with_word_vn, but normalizes words with
    # wn.morphy and also mines the "x1" example field. Collected nouns and
    # verbs are mapped to their synsets closest to a root synset chosen
    # from the "sd" noun synsets by top-2 similarity to the word_concept
    # synsets.
    dict_words_nouns = []
    dict_synsets_nouns = []
    wn_words = wn.synsets(word_concept, pos="n")

    for iWord in range(len(dict_words)):
        print iWord
        dict_words_nouns.append([])
        dict_synsets_nouns.append([])
        wordDict = dict_words[iWord]

        # - - - nouns from the short definition ("sd")
        tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["sd"]))
        nouns = [word for word, pos in tagged_sent if pos in ("NN", "NNS")]
        for noun in nouns:
            noun = wn.morphy(noun)
            if noun is None:
                continue
            if noun not in ("sth", "etc") and noun not in dict_words_nouns[iWord]:
                dict_words_nouns[iWord].append(noun)
        print dict_words_nouns[iWord]

        # candidate root synsets come from the last "sd" noun; since they do
        # not change below, the root is chosen once (the original repeated
        # the identical selection before every synset pass)
        synsetsSD = wn.synsets(dict_words_nouns[iWord][-1], pos="n")
        synsetRoot = _best_root_synset(synsetsSD, wn_words, n_best=2)
        print "synsetroot"
        print synsetRoot

        def collect_words(text, pos_tags):
            # POS-filter the text, normalize with wn.morphy and append
            # words not collected yet for this entry
            tagged = POSWrapper.pos_tag(nltk.word_tokenize(text))
            for word, pos in tagged:
                if pos not in pos_tags:
                    continue
                word = wn.morphy(word)
                if word is None:
                    continue
                if (word != wordDict["en"] and word != "sth"
                        and word not in dict_words_nouns[iWord]):
                    dict_words_nouns[iWord].append(word)

        def collect_synsets(wn_pos):
            # map every collected word to its synset closest to the root;
            # the word's first (most frequent) synset is kept as well
            for word in dict_words_nouns[iWord]:
                synsets_word = wn.synsets(word, pos=wn_pos)
                if len(synsets_word) <= 0:
                    continue
                synMax = _closest_synset(synsets_word, [synsetRoot])
                if synMax not in dict_synsets_nouns[iWord]:
                    dict_synsets_nouns[iWord].append(synMax)
                if synsets_word[0] not in dict_synsets_nouns[iWord]:
                    dict_synsets_nouns[iWord].append(synsets_word[0])

        # - - - nouns, then verbs, from the definition ("d")
        collect_words(wordDict["d"], ("NN", "NNS"))
        print wordDict["tv"]
        print dict_words_nouns[iWord]
        collect_synsets("n")

        collect_words(wordDict["d"], ("VB", "VBN", "VBD"))
        collect_synsets("v")

        # - - - nouns, then verbs, from the example field ("x1")
        if "x1" in wordDict:
            collect_words(wordDict["x1"], ("NN", "NNS"))
            collect_synsets("n")

            collect_words(wordDict["x1"], ("VB", "VBN", "VBD"))
            collect_synsets("v")

        print "dict_synsets_nouns"
        print dict_synsets_nouns[iWord]

    return dict_synsets_nouns
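# Usage sketch with a hypothetical entry (keys as used above: "en" English
# headword, "tv" label, "sd" short definition, "d" definition, "x1" example):
def _demo_get_nbest_synsets_n_v_x_with_word():
    entry = {"en": "bank", "tv": "bank (noun)", "sd": "organization",
             "d": "an organization that keeps money for customers",
             "x1": "I deposited money at the bank"}
    print get_nbest_synsets_n_v_x_with_word([entry], "bank")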