def get_greatest_synsets_similarity_between(synsets_wn, nouns): synset_wn_max = None p_max = 0 if len(nouns) != 0: for synset_wn in synsets_wn: p_noun = 0 for noun in nouns: synsets_of_noun = WordnetHandler.get_synsets_for_word(noun, 'n') if len(synsets_of_noun) > 0: p_each_noun = 0 for synset_of_noun in synsets_of_noun: # p = synset_wn.path_similarity(synset_of_noun) p = WordnetHandler.cal_similarity(synset_wn, synset_of_noun) p_each_noun += p p_each_noun = p_each_noun/len(synsets_of_noun) p_noun += p_each_noun p = p_noun/len(nouns) if p > p_max: synset_wn_max = synset_wn else: print "no nouns" return synset_wn_max
def get_gloss_synset_for(synset):
    """Build (and memoize) an extended gloss for *synset*.

    The gloss is the synset's definition followed by the lemma strings of
    its hypernyms, hyponyms, part-meronyms and member-holonyms, then its
    example sentences — each chunk terminated by ". ".  Results are cached
    in the module-level __dict_gloss_for_synset__ keyed by synset name.
    """
    key = synset.name()
    if key not in __dict_gloss_for_synset__:
        synsets_gloss = ""
        definition = synset.definition()
        synsets_gloss += definition + ". "
        # hypernyms
        for hypernym in synset.hypernyms():
            synsets_gloss += WordnetHandler.get_lemma_synset(hypernym) + ". "
        # hyponyms
        for hyponym in synset.hyponyms():
            synsets_gloss += WordnetHandler.get_lemma_synset(hyponym) + ". "
        # part-meronyms and member-holonyms
        for meronym in synset.part_meronyms():
            synsets_gloss += WordnetHandler.get_lemma_synset(meronym) + ". "
        for holonym in synset.member_holonyms():
            synsets_gloss += WordnetHandler.get_lemma_synset(holonym) + ". "
        # usage examples
        for example in synset.examples():
            synsets_gloss += example + ". "
        __dict_gloss_for_synset__[key] = synsets_gloss
    return __dict_gloss_for_synset__[key]
def synsets_for_word(word_pos):
    """Return WordNet synsets for a (word, POS-tag) pair.

    Noun tags map to 'n' synsets, verb tags to 'v' synsets; a verb tag
    takes precedence when both predicates match, and an unrecognized tag
    yields an empty list.
    """
    word, pos = word_pos
    result = []
    # Checked in this order so a verb match overrides a noun match,
    # exactly as the original chained ifs did.
    for predicate, wn_tag in ((pos_is_noun, "n"), (pos_is_verb, "v")):
        if predicate(pos):
            result = WordnetHandler.get_synsets_for_word(word, wn_tag)
    return result
def sim_ox_wn_value_main_synsets(word):
    """Similarity matrix (WordNet senses x Oxford senses) for *word* based
    on value vectors plus a transposed definition-similarity matrix.

    NOTE(review): spatial.distance.cosine is a *distance* (0 = identical),
    yet it is stored directly in m2d_sim as if it were a similarity —
    confirm against the consumers of this matrix.
    """
    dict_vectors_wn = WordnetParseDefinition.get_dict_vectors_value_for(word)
    synsets_wn = WordnetHandler.get_synsets_for_word(word,'n')
    dict_vectors_ox = OxParseDefinition.get_vectors_value_for_word(word, synsets_wn)
    (keys_wn, vectors_wn) = Util.get_keys_values_of_dict(dict_vectors_wn)
    (keys_ox, vectors_ox) = Util.get_keys_values_of_dict(dict_vectors_ox)
    m2d_sim_defi_temp = sim_ox_wn_defi_WDS_via_main_syns(word)
    DebugHandler.print_2d_matrix(m2d_sim_defi_temp)
    # Transpose temp (wn x ox) into m2d_sim_defi (ox x wn).
    m2d_sim_defi = [[0 for x in range(len(vectors_wn))] for x in range(len(vectors_ox))]
    for i in range(len(vectors_wn)):
        for j in range(len(vectors_ox)):
            m2d_sim_defi[j][i] = m2d_sim_defi_temp[i][j]
    m2d_sim = [[0 for x in range(len(vectors_ox))] for x in range(len(vectors_wn))]
    for i in range(len(vectors_wn)):
        vector_wn = vectors_wn[i]
        print vector_wn
        for j in range(len(vectors_ox)):
            vector_ox = vectors_ox[j]
            # Compare the Oxford sense's row of definition similarities
            # against the WordNet value vector.
            cosine = spatial.distance.cosine(m2d_sim_defi[j], vector_wn)
            m2d_sim[i][j] = cosine
        print "\n"
    # Debug dump of the Oxford vectors.
    for j in range(len(vectors_ox)):
        vector_ox = vectors_ox[j]
        print vector_ox
    return m2d_sim
def sim_ox_wn_defi_WDS_via_main_syns(word):
    """Similarity matrix between WordNet and Oxford definitions of *word*,
    computed via synset vectors and sim_wn_ox_vector.
    """
    dict_vectors_wn = WordnetParseDefinition.get_dict_vectors_synsets_for_word(word)
    synsets_wn = WordnetHandler.get_synsets_for_word(word,'n')
    dict_vectors_ox = OxParseDefinition.get_dict_vectors_synsets_for_word(word, synsets_wn)
    (keys_wn, vectors_wn) = Util.get_keys_values_of_dict(dict_vectors_wn)
    (keys_ox, vectors_ox) = Util.get_keys_values_of_dict(dict_vectors_ox)
    m2d_sim = sim_wn_ox_vector(vectors_ox, vectors_wn)
#    cal_sim_ngrams(word)
    # Disabled: CSV export of the matrix (Results/vector_definition/<word>.csv)
    # with keys_wn as row headers and keys_ox as column headers, via
    # FileProcess.append_to_excel_file.
    return m2d_sim
def parse_ox_wn_defi_to_input(word):
    """Append every (WordNet definition, Oxford definition) pair for *word*
    to the sentence-pair input file, tab-separated, one pair per line.
    """
    wn_defs = WordnetHandler.get_definitions_for_word(word)
    ox_defs = OxfordParser.get_definitions_of_word_for_svm(word)
    for wn_def in wn_defs:
        for ox_def in ox_defs:
            FileProcess.append_value_to_file(wn_def + "\t" + ox_def,
                                             __filename_input_sen__)
def sim_2_word(word_1, word_2):
    """Similarity between two (word, POS) pairs.

    __WSD_type__ == 1: best score over every synset pair.
    __WSD_type__ == 0: score of the two first (most frequent) synsets,
    or 0 when either word has no synsets.
    """
    syns_a = synsets_for_word(word_1)
    syns_b = synsets_for_word(word_2)
    best = 0
    if __WSD_type__ == 1:
        # Exhaustive pairwise search for the maximum similarity.
        for sa in syns_a:
            for sb in syns_b:
                score = WordnetHandler.cal_similarity(sa, sb)
                if score > best:
                    best = score
    if __WSD_type__ == 0:
        if not syns_a or not syns_b:
            return 0
        best = WordnetHandler.cal_similarity(syns_a[0], syns_b[0])
    return best
def get_gloss_for_jacc(word):
    """Map each noun-sense definition of *word* to its extended gloss,
    preserving WordNet sense order.
    """
    glosses = OrderedDict()
    for synset in WordnetHandler.get_synsets_for_word(word, 'n'):
        glosses[synset.definition()] = get_gloss_synset_for(synset)
    return glosses
def get_dict_vectors_value_for(word):
    """Map each noun-sense definition of *word* to its value vector
    (computed against the full synset list), preserving sense order.
    """
    synsets = WordnetHandler.get_synsets_for_word(word, 'n')
    vectors = OrderedDict()
    for syn in synsets:
        vectors[syn.definition()] = get_value_synset_for(syn, synsets)
    return vectors
def sim_for_synset_and_synsetvector(a_synset, vector):
    """Best similarity between a_synset and any synset in *vector*.

    *vector* is a sequence of (synset, weight) pairs; the weight is
    currently ignored (the weighting line was disabled upstream).
    Pairs for which cal_similarity returns None are skipped.
    """
    best = 0
    for candidate, _weight in vector:
        score = WordnetHandler.cal_similarity(a_synset, candidate)
        if score is not None and score > best:
            best = score
    return best
def get_dict_vectors_words_for_word(word):
    """Map each noun-sense definition of *word* to its preprocessed
    word list, preserving WordNet sense order.
    """
    vectors = OrderedDict()
    for synset in WordnetHandler.get_synsets_for_word(word, 'n'):
        defn = synset.definition()
        vectors[defn] = PreprocessDefinition.preprocess_sentence(defn)
    return vectors
def sim_ox_wn_defi_WDS_via_1_main_syn(word):
    """Similarity matrix between WordNet and Oxford definition synset
    vectors of *word* (single main-synset variant).
    """
    wn_dict = WordnetParseDefinition.get_dict_vectors_synsets_for_word(word)
    synsets_wn = WordnetHandler.get_synsets_for_word(word, 'n')
    ox_dict = OxParseDefinition.get_dict_vectors_synsets_for_word(word, synsets_wn)
    _keys_wn, vectors_wn = Util.get_keys_values_of_dict(wn_dict)
    _keys_ox, vectors_ox = Util.get_keys_values_of_dict(ox_dict)
    return sim_wn_ox_vector(vectors_ox, vectors_wn)
def sim_ox_wn_defi_WDS_via_align_all(word):
    """Similarity matrix via word alignment: for each WordNet definition,
    disambiguate its words against the pooled Oxford words, then score each
    Oxford definition against those synsets.

    NOTE(review): the loop reassigns `words_ox` (originally the Oxford
    dict) to a flat word list — harmless here since the dict is no longer
    needed, but confusing; a rename would be safer.
    """
    words_wn = WordnetParseDefinition.get_dict_vectors_words_for_word(word)
    (keys_wn, vectors_wn) = Util.get_keys_values_of_dict(words_wn)
    words_ox = OxParseDefinition.get_dict_vectors_word_for_word(word)
    (keys_ox, vectors_ox) = Util.get_keys_values_of_dict(words_ox)
    m2d_sim = [[0 for x in range(len(keys_ox))] for x in range(len(keys_wn))]
    for i in range(len(keys_wn)):
        vector_wn = vectors_wn[i]
        # Pool all Oxford definition words into one list.
        words_ox = []
        for j in range(len(keys_ox)):
            words_ox += vectors_ox[j]
        # Disambiguate the WordNet definition words against the pool.
        synsets_wn = WordnetHandler.get_nearest_synsets_words_words_order(vector_wn, words_ox)
        for j in range(len(keys_ox)):
            synsets_ox = WordnetHandler.get_nearest_synsets_words_synsets_order(vectors_ox[j], synsets_wn)
            m2d_sim[i][j] = sim_2_vector(synsets_wn, synsets_ox)
    cal_sim_ngrams(word)
    return m2d_sim
def sim_ox_wn_defi_WDS_via_align(word):
    """Similarity matrix via per-definition word alignment.

    Each WordNet definition i is paired with the i-th noun synset of
    *word* (assumes the two lists stay index-aligned — TODO confirm) and
    scored against every Oxford definition's word list.
    """
    words_wn = WordnetParseDefinition.get_dict_vectors_words_for_word(word)
    (keys_wn, vectors_wn) = Util.get_keys_values_of_dict(words_wn)
    words_ox = OxParseDefinition.get_dict_vectors_word_for_word(word)
    (keys_ox, vectors_ox) = Util.get_keys_values_of_dict(words_ox)
    synsets_wn = WordnetHandler.get_synsets_for_word(word, 'n')
    m2d_sim = [[0 for x in range(len(keys_ox))] for x in range(len(keys_wn))]
    for i in range(len(keys_wn)):
        vector_wn = vectors_wn[i]
        synset_wn = synsets_wn[i]
        for j in range(len(keys_ox)):
            vector_ox = vectors_ox[j]
            m2d_sim[i][j] = WordnetHandler.sim_for_words_words_no_order(vector_ox, vector_wn, synset_wn)
            # Disabled alternative: align the two word lists first, then
            # score with sim_2_vector.
    cal_sim_ngrams(word)
    return m2d_sim
def get_greatest_synset_similarity_between(synset_1, noun_2):
    """Return a synset for the (word, pos) pair *noun_2*, or None when the
    word has no noun/verb synsets.

    NOTE(review): the similarity-weighted selection (scoring each candidate
    against synset_1 with frequency smoothing) is commented out below, so
    the live code simply returns the word's first — most frequent — synset;
    synset_1 and total_count are currently unused.  Confirm this
    simplification is intentional before cleaning up.
    """
    synset_max = None
    (word, pos) = noun_2
    synsets_of_noun = []
    # Pool noun and verb synsets regardless of the POS tag.
    synsets_of_noun_1 = WordnetHandler.get_synsets_for_word(word, 'n')
    synsets_of_noun_2 = WordnetHandler.get_synsets_for_word(word, 'v')
    synsets_of_noun = synsets_of_noun_1 + synsets_of_noun_2
    # Smoothed total frequency count — dead while the scoring below stays
    # disabled.
    total_count = 0.1 + len(synsets_of_noun)*__SMOOTH_WEIGHT__
    for synset_of_noun in synsets_of_noun:
        total_count += WordnetHandler.get_freq_count_of_synset(synset_of_noun)
    if len(synsets_of_noun) > 0:
        synset_max = synsets_of_noun[0]
    # Disabled: pick the candidate maximizing
    # cal_similarity(synset_1, candidate) * (freq_count / total_count).
    return synset_max
def get_m2d_sim_for_word_from_svm_result(word):
    """Build the WordNet-x-Oxford similarity matrix for *word* from the
    SVM result file, or return None when either side has no definitions.
    """
    wn_defs = WordnetHandler.get_definitions_for_word(word)
    ox_defs = OxfordParser.get_definitions_of_word_for_svm(word)
    if not wn_defs or not ox_defs:
        return None
    matrix = []
    for wn_def in wn_defs:
        wn_text = str(wn_def)
        matrix.append([ReadSVMResult.get_sim_for_sens(wn_text, str(ox_def))
                       for ox_def in ox_defs])
    return matrix
def wordnet_based_synset(syn_wn, sen_ox):
    """Average best-similarity between the WordNet synset *syn_wn* and the
    words of the Oxford sentence *sen_ox*.

    Only words that score above 0 count toward the average; the 0.0001 /
    0.001 constants avoid exact zeros and division by zero.

    NOTE(review): `synsets_for_word` elsewhere in this file takes a
    (word, pos) tuple, but here a bare word is passed — confirm which
    variant is actually in scope at this call site.
    """
    sim = 0.0001
    words_ox = split_words(sen_ox)
    count = 0
    for word in words_ox:
        p_max = 0
        synsets_1 = synsets_for_word(word)
        for synset in synsets_1:
            p = WordnetHandler.cal_similarity(synset, syn_wn)
            if p > p_max:
                p_max = p
        if p_max != 0:
            count += 1
            sim += p_max
    sim /= count + 0.001
    return sim
def create_input_sens_test(dict_ox):
    """Write SVM test sentence triples for every word from "blockage"
    onward (earlier dictionary entries are skipped — presumably a resume
    point from a previous run; TODO confirm).

    Each output line is "wn_def \t ox_def \t context", where context is
    either all WordNet definitions (single Oxford sense, several WordNet
    senses) or all Oxford definitions (otherwise), tab-joined.
    """
    flag_can_go = False
    for word in dict_ox:
        if word == "blockage":
            flag_can_go = True
        if flag_can_go == False:
            continue
        if len(dict_ox[word]) == 0:
            continue
        defis_wn = WordnetHandler.get_definitions_for_word(word)
        defis_ox = OxfordParser.get_definitions_of_word_for_svm(word)
        # 1-to-1 words carry no ambiguity — nothing to test.
        if len(defis_ox) == 1 and len(defis_wn) == 1:
            continue
        if len(defis_ox) == 1 and len(defis_wn) > 1:
            # Context = every WordNet definition, tab-joined.
            all_defi_wn = ""
            for defi_wn in defis_wn:
                all_defi_wn += defi_wn + "\t"
            if all_defi_wn != "":
                all_defi_wn = all_defi_wn[:-1]
            for defi_wn in defis_wn:
                for defi_ox in defis_ox:
                    value = defi_wn + "\t" + defi_ox + "\t" + all_defi_wn
                    FileProcess.append_value_to_file(value, __filename_input_sen_test__)
        else:
            # Context = every Oxford definition, tab-joined.
            for defi_wn in defis_wn:
                all_defi_ox = ""
                for defi_ox in defis_ox:
                    all_defi_ox += defi_ox + "\t"
                if all_defi_ox != "":
                    all_defi_ox = all_defi_ox[:-1]
                for defi_ox in defis_ox:
                    value = defi_wn + "\t" + defi_ox + "\t" + all_defi_ox
                    FileProcess.append_value_to_file(value, __filename_input_sen_test__)
def sim_ox_wn_defi_WDS_via_curr_main_syn(word):
    """Similarity matrix where each Oxford vector is recomputed against the
    *current* WordNet synset only (one synset at a time).

    NOTE(review): rows are sized by len(definitions) but columns are filled
    up to len(vectors_ox) — assumes the two counts match; verify.
    """
    dict_vectors_wn = WordnetParseDefinition.get_dict_vectors_synsets_for_word(word)
    (keys_wn, vectors_wn) = Util.get_keys_values_of_dict(dict_vectors_wn)
    synsets_wn = WordnetHandler.get_synsets_for_word(word, 'n')
    definitions = OxfordParser.get_definitions_of_word(word)
    m2d_sim = [[0 for x in range(len(definitions))] for x in range(len(vectors_wn))]
    for i in range(len(vectors_wn)):
        vector_wn = vectors_wn[i]
        # Re-derive Oxford vectors relative to this single WordNet synset.
        dict_vectors_ox = OxParseDefinition.get_dict_vectors_synsets_for_word(word, [synsets_wn[i]])
        (keys_ox, vectors_ox) = Util.get_keys_values_of_dict(dict_vectors_ox)
        for j in range(len(vectors_ox)):
            vector_ox = vectors_ox[j]
            m2d_sim[i][j] = sim_2_vector(vector_ox, vector_wn)
    cal_sim_ngrams(word)
    return m2d_sim
def get_value_synset_for(cur_synset, synsets):
    """Value vector of *cur_synset* over the candidate list *synsets*.

    Each entry is the mean similarity between a candidate synset and the
    best-matching synsets of the nouns extracted from cur_synset's
    definition (0 when no noun matched).
    """
    nouns = PreprocessDefinition.preprocess_sentence(cur_synset.definition())
    values = []
    for candidate in synsets:
        matched = 0
        total = 0
        for noun in nouns:
            best_syn = get_greatest_synset_similarity_between(candidate, noun)
            if best_syn is None:
                continue
            matched += 1
            sim = WordnetHandler.cal_similarity(candidate, best_syn)
            if sim is not None:
                total += sim
        values.append(total / matched if matched else total)
    return values
def create_input_for_test_svm():
    """Generate the SVM test inputs: for every ambiguous Oxford noun
    (skipping trivial 1-to-1 words) write the sentence pairs and their
    feature vectors to the test files.
    """
    dict_ox = OxfordParser.get_dict_nouns()
    # Left over from a resume-at-"brook" debugging hack (checks commented
    # out below); currently unused.
    flag_can_go = False
    for word in dict_ox:
#        if word == "brook":
#            flag_can_go = True
#        if flag_can_go == False:
#            continue
        if len(dict_ox[word]) == 0:
            continue
        syns_wn = WordnetHandler.get_synsets_for_word(word, 'n')
        syns_ox = dict_ox[word]
        # Unambiguous words carry no information for the classifier.
        if len(syns_ox) == 1 and len(syns_wn) == 1:
            continue
        write_sens_for_reading(syns_wn, syns_ox, __filename_input_sen_test__)
        cal_features_and_write_to_file_for(syns_wn, syns_ox, __filename_input_test_feature_values__)
def get_definition_value_with_synsetwn(definition, synsets_wn): synsets_value = [] # nouns = PreprocessDefinition.preprocess_sentence_to_nouns(definition) nouns = PreprocessDefinition.preprocess_sentence(definition) # nouns = list(set(nouns)) for synset in synsets_wn: count = 0 p = 0 for noun in nouns: synset_max = get_greatest_synset_similarity_between([synset], noun) if synset_max is not None: count += 1 sim = WordnetHandler.cal_similarity(synset, synset_max) if sim != None: p += sim if count != 0: p = p/count synsets_value.append(p) return synsets_value
def similarity_nbest_withword_average(WORD, dict_words):
    """Symmetrized similarity matrix between WORD's WordNet senses and the
    dictionary senses, blended with a Jaccard term.

    Returns (matrix, wn_words); ([], []) when the forward matrix is None.
    The forward and reverse matrices are averaged cell-wise, then mixed
    with (1 - jaccard) using PARAMETERS.JACCARD_WEIGHT.
    """
    wn_words = WordnetHandler.get_synsets_for_word(WORD, 'n');
    matrix_similarity = similarity_by_synsets_synsets_nbest_withword_dict_wn(WORD, dict_words, wn_words)
    if matrix_similarity == None:
        return [],[]
    matrix_similarity_reverse = similarity_by_synsets_synsets_nbest_withword_wn_dict(WORD, wn_words, dict_words)
    # Average the forward (wn x dict) and reverse (dict x wn) matrices.
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord] + matrix_similarity_reverse[iDictWord][iWnWord];
            matrix_similarity[iWnWord][iDictWord] /= 2;
    matrix_similarity_jaccard = similarity_by_jaccard(WORD, dict_words, wn_words)
    # Blend in the Jaccard distance (1 - jaccard) with the configured weight.
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord]*(1-PARAMETERS.JACCARD_WEIGHT) + PARAMETERS.JACCARD_WEIGHT*(1-matrix_similarity_jaccard[iWnWord][iDictWord]);
    return matrix_similarity, wn_words
def pair_0_1_reducing_m2d_sim(matrix_similarity, num_rows, num_cols, word):
    """Promote confident cells of the similarity matrix to exactly 1.

    1x1: promote when above CHOICE_1_1_MIN.  Nx1: promote the best row
    when it clearly dominates the runner-up or clears CHOICE_1_COL_MIN_FIRST.
    NxM: iteratively reduce via the status array until stable.
    Mutates and returns matrix_similarity.
    """
    if num_rows == 1 and num_cols == 1 and matrix_similarity[0][0] > Parameters.PARAMETERS_CHOICE_0_1.CHOICE_1_1_MIN:
        matrix_similarity[0][0] = 1;
    if num_rows > 1 and num_cols == 1:
        col = []
        for iWnWord in range(num_rows):
            col.append(matrix_similarity[iWnWord][0])
        # Indices of the two largest values in the single column.
        order = heapq.nlargest(2, range(num_rows), col.__getitem__);
        if matrix_similarity[order[0]][0] >= Parameters.PARAMETERS_CHOICE_0_1.CHOICE_1_COL_RANGE_FIRST*matrix_similarity[order[1]][0] or \
           matrix_similarity[order[0]][0] > Parameters.PARAMETERS_CHOICE_0_1.CHOICE_1_COL_MIN_FIRST:
            matrix_similarity[order[0]][0] = 1;
    if num_rows >= 1 and num_cols > 1:
        synsets_wn = WordnetHandler.get_synsets_for_word(word,'n')
        status_synsets = create_status_array(synsets_wn)
        updated = reducing_m2d_sim(matrix_similarity, status_synsets)
        # Recompute and reduce until no further cell changes.
        while updated == 1:
            m2d = sim_ox_wn_defi_WDS_via_main_syns_for_reduce(synsets_wn, status_synsets, word)
            updated = reducing_m2d_sim(m2d, status_synsets)
            match_matrix_sim_with_temp_matrix(matrix_similarity, m2d)
    return matrix_similarity
def create_input_for_train_svm():
    """Generate the SVM training inputs (labels, sentence pairs, feature
    vectors) for Oxford nouns that also appear in the gold data.

    NOTE(review): processing stops entirely (early return) when the word
    "brook" is reached — this looks like a deliberate train/test split
    boundary or leftover debug cut-off; confirm before removing.
    """
    dict_ox = OxfordParser.get_dict_nouns()
    dict_gold = CompareWithGold.goldData
    for word in dict_ox:
        if len(dict_ox[word]) == 0 or word not in dict_gold:
            continue
        if word == "brook":
            return
        syns_wn = WordnetHandler.get_synsets_for_word(word, 'n')
        syns_ox = dict_ox[word]
        # Unambiguous 1-to-1 words add nothing to training.
        if len(syns_ox) == 1 and len(syns_wn) == 1:
            continue
        write_label_for_svm(syns_wn, syns_ox, dict_gold[word])
        write_sens_for_reading(syns_wn, syns_ox, __filename_input_sen_train__)
        cal_features_and_write_to_file_for(syns_wn, syns_ox, __filename_input_train_feature_values__)
def get_synsets_for_word_in_wn(word_origin, wn_synsets_for_word_origin):
    """Expand each WordNet sense of word_origin into a weighted synset set.

    For every sense, collect: the sense itself (weight 1.5); its hypernyms,
    part-meronyms, member-holonyms and hyponyms (weight 1.2, each gated by
    a PARAMETERS.DICT_WN_FEATURE_RELATION_* flag); and, when definition
    expansion is enabled, the best-matching synset of every noun/verb in
    the sense's definition (weight 1.0).

    Returns (wn_synsets_for_words, p_synsets_for_words): parallel lists of
    per-sense synset lists and weight lists.

    NOTE(review): the verb filter compares against the literal "bank"
    instead of word_origin (the noun branch uses word_origin) — looks like
    a leftover from debugging with the word "bank"; confirm.
    """
    wn_synsets_for_words = [];
    p_synsets_for_words = [];
    for iWord in range(len(wn_synsets_for_word_origin)):
        wn_synsets_for_words.append([]);
        p_synsets_for_words.append([]);
        wordDict = wn_synsets_for_word_origin[iWord];
        # The sense itself, with the highest weight.
        synset_of_word = wn.synset(wordDict.name());
        wn_synsets_for_words[iWord].append(synset_of_word);
        p_synsets_for_words[iWord].append(1.5);
        # hypernyms
        if PARAMETERS.DICT_WN_FEATURE_RELATION_hypernyms == 1:
            for hypernym in wn.synset(wordDict.name()).hypernyms():
                wn_synsets_for_words[iWord].append(hypernym);
                p_synsets_for_words[iWord].append(1.2);
        # part-meronyms
        if PARAMETERS.DICT_WN_FEATURE_RELATION_part_meronyms == 1:
            for meronym in wn.synset(wordDict.name()).part_meronyms():
                wn_synsets_for_words[iWord].append(meronym);
                p_synsets_for_words[iWord].append(1.2);
        # member-holonyms
        if PARAMETERS.DICT_WN_FEATURE_RELATION_member_holonyms == 1:
            for holonym in wn.synset(wordDict.name()).member_holonyms():
                wn_synsets_for_words[iWord].append(holonym);
                p_synsets_for_words[iWord].append(1.2);
        # hyponyms
        if PARAMETERS.DICT_WN_FEATURE_RELATION_hyponyms == 1:
            for hyponym in wn.synset(wordDict.name()).hyponyms():
                wn_synsets_for_words[iWord].append(hyponym);
                p_synsets_for_words[iWord].append(1.2);
        # definition expansion: POS-tag the sense's definition and add the
        # best-matching synset of each content word.
        if PARAMETERS.DICT_WN_FEATURE_RELATION_definition == 1:
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn.synset(wordDict.name()).definition()));
            if PARAMETERS.POS_FEATURE_n == 1:
                # Nouns (and adjectives, per the JJ tag) from the definition.
                nouns = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ')];
                for noun in nouns:
                    noun = wordnet_lemmatizer.lemmatize(noun, pos='n');
                    if noun == None:
                        continue
                    if noun != word_origin and noun != "sth":
                        synsetsDictNoun = WordnetHandler.get_synsets_for_word(noun, "n");
                        if len(synsetsDictNoun) > 0:
                            # Pick the noun synset closest to this sense.
                            synsetMax = synsetsDictNoun[0];
                            p_max = 0;
                            for synsetNoun in synsetsDictNoun:
                                p = synsetNoun.path_similarity(synset_of_word);
                                if p > p_max:
                                    p_max = p;
                                    synsetMax = synsetNoun
                            if synsetMax not in wn_synsets_for_words[iWord]:
                                wn_synsets_for_words[iWord].append(synsetMax);
                                p_synsets_for_words[iWord].append(1.);
            if PARAMETERS.POS_FEATURE_v == 1:
                # Verbs from the definition.
                verbs = [word for word,pos in tagged_sent if (pos == 'VB' or pos == 'VBD' or pos == 'VBN')];
                for verb in verbs:
                    verb = wordnet_lemmatizer.lemmatize(verb, pos='v');
                    if verb == None:
                        continue
                    if verb != "bank":
                        synsetsDictVerb = WordnetHandler.get_synsets_for_word(verb, "v");
                        if len(synsetsDictVerb) > 0:
                            # Pick the verb synset closest to this sense.
                            synsetMax = synsetsDictVerb[0];
                            p_max = 0;
                            for synsetVerb in synsetsDictVerb:
                                p = synsetVerb.path_similarity(synset_of_word);
                                if p > p_max:
                                    p_max = p;
                                    synsetMax = synsetVerb
                            if synsetMax not in wn_synsets_for_words[iWord]:
                                wn_synsets_for_words[iWord].append(synsetMax);
                                p_synsets_for_words[iWord].append(1.);
    return wn_synsets_for_words,p_synsets_for_words;
def cal_feature_values_for(syn_wn, syn_ox):
    """Compute the SVM feature vector for a (WordNet sense, Oxford sense)
    pair.

    Live features, in order: Levenshtein and Jaccard on definitions;
    Levenshtein and Jaccard on extended glosses; a gloss Jaccard score
    augmented with 2–5-gram word overlaps; a WordNet-based WSD similarity
    of the definitions; and an LSA tf-idf similarity of the definitions.
    Many alternative features (POS shallow-syntactic, hypernym/hyponym,
    examples, gloss LSA, …) are disabled below.
    """
    feature_values = []
    # Raw texts for the two senses.
    defi_wn = WordnetHandler.get_defi_for_syn(syn_wn)
    defi_ox = OxfordParser.get_defi_for_syn(syn_ox)
    gloss_wn = WordnetHandler.get_gloss_for_syn(syn_wn)
    gloss_ox = OxfordParser.get_gloss_for_syn(syn_ox)
    lemma_wn = WordnetHandler.get_lemma_for_synset(syn_wn)
    sd_ox = OxfordParser.get_short_defi_for_syn(syn_ox)
    ex_wn = WordnetHandler.get_ex_for_syn(syn_wn)
    ex_ox = OxfordParser.get_ex_for_syn(syn_ox)
    cl_ox = OxfordParser.get_collocation_for_syn(syn_ox)
    hyper_wn = WordnetHandler.get_hyper_defi_for_synset(syn_wn)
    mero_wn = WordnetHandler.get_mero_defi_for_synset(syn_wn)
    # --- Literal features on the definitions.  The 1-x / 1.00001-x flips
    # turn distances into similarities (the epsilon avoids exact zero).
    literal_leven_value = 1-Literal.levenshtein(defi_wn, defi_ox)
    feature_values.append(literal_leven_value)
    literal_jacc_value = 1.00001-Literal.jaccard(defi_wn, defi_ox)
    feature_values.append(literal_jacc_value)
    # --- Literal features on the extended glosses.
    literal_leven_value = 1-Literal.levenshtein(gloss_wn, gloss_ox)
    feature_values.append(literal_leven_value)
    literal_jacc_value = 1.00001-Literal.jaccard(gloss_wn, gloss_ox)
    feature_values.append(literal_jacc_value)
    # --- Gloss similarity augmented with 2..5-gram word overlap.
    literal_leven_ngram = literal_leven_value
    literal_jacc_ngram = literal_jacc_value
    ngrams_value = Ngrams.ngrams_word_for(gloss_wn, gloss_ox, 2)
    literal_jacc_ngram += ngrams_value
    literal_leven_ngram += ngrams_value
    ngrams_value = Ngrams.ngrams_word_for(gloss_wn, gloss_ox, 3)
    literal_jacc_ngram += ngrams_value
    literal_leven_ngram += ngrams_value
    ngrams_value = Ngrams.ngrams_word_for(gloss_wn, gloss_ox, 4)
    literal_jacc_ngram += ngrams_value
    literal_leven_ngram += ngrams_value
    ngrams_value = Ngrams.ngrams_word_for(gloss_wn, gloss_ox, 5)
    literal_jacc_ngram += ngrams_value
    literal_leven_ngram += ngrams_value
    feature_values.append(literal_jacc_ngram)
#    feature_values.append(literal_leven_ngram)
    # Computed but currently unused by any live feature — TODO confirm it
    # can be removed along with the disabled jaro-winkler feature.
    gloss_split_wn = Literal.split_and_stem(gloss_wn)
    # Disabled: jaro-winkler, example Jaccard, shallow-syntactic POS
    # Jaccard (1..4-grams) features.
    # --- WordNet-based WSD similarity of the definitions.
    wn_value = WordnetBased.wordnet_based(defi_wn, defi_ox, 0)
    feature_values.append(wn_value)
    # Disabled: hypernym/hyponym-definition variants, example-based and
    # gloss-based WordNet features.
    # --- LSA tf-idf similarity of the definitions.
    lsa_tfidf = LSA.sim_tfidf(defi_wn, defi_ox)
    feature_values.append(lsa_tfidf)
    # Disabled: LSA on hypernym/gloss/lemma/example texts.
    return feature_values
def get_nbest_synsets_for_word_in_oxford(dict_words, word_concept):
    """Expand each Oxford sense of word_concept into a set of WordNet
    synsets for its definition words.

    For every Oxford sense entry (a dict with keys like 'tv', 'd', 'sd',
    'xh0'..'xh2'): extract nouns from the short definition (falling back to
    the full definition), optionally add full-definition nouns and the xh
    headwords, then for each noun keep the synset closest (path similarity)
    to the word's own WordNet senses.  When PARAMETERS.POS_FEATURE_v is
    set, do the same for verbs.  Returns one synset list per Oxford sense.

    NOTE(review): synsetRoot / the synsetsSD "root" selection loops are
    computed but the final choice uses wn_words directly (the synsetRoot
    comparisons are commented out) — apparently dead code kept for
    experiments; confirm before cleanup.
    """
    dict_words_nouns = [];
    dict_words_verbs = [];
    dict_synsets_for_words = [];
    wn_words = WordnetHandler.get_synsets_for_word(word_concept, 'n');
    p_synsets_for_words = [];
    for iWord in range(len(dict_words)):
        dict_words_nouns.append([]);
        dict_words_verbs.append([]);
        dict_synsets_for_words.append([]);
        p_synsets_for_words.append([]);
        # Oxford sense entries are keyed by their stringified index.
        wordDict = dict_words[str(iWord)];
        # Skip entries without a thesaurus value or a definition.
        if not wordDict.has_key('tv'):
            continue
        if not wordDict.has_key('d'):
            continue
        nouns = [];
        if wordDict.has_key("sd") and PARAMETERS.DICT_OX_FEATURE_RELATION_sd == 1:
            # Prefer nouns from the short definition; fall back to the
            # full definition when it yields none.
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["sd"]));
            nouns = [word for word,pos in tagged_sent if ((pos == 'NN' or pos == 'NNS') and (word != 'sth' and word != 'etc'))];
            if len(nouns) == 0:
                tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]));
                nouns = [word for word,pos in tagged_sent if ((pos == 'NN' or pos == 'NNS') and (word != 'sth' and word != 'etc'))];
        elif wordDict.has_key("d") and wordDict["d"] != None:
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]));
            # NOTE(review): this branch uses "!= 'sth' or != 'etc'" — always
            # true, unlike the "and" in the branches above; likely a typo.
            nouns = [word for word,pos in tagged_sent if ((pos == 'NN' or pos == 'NNS') and (word != 'sth' or word != 'etc'))];
        else:
            continue
        for noun in nouns:
            noun = wordnet_lemmatizer.lemmatize(noun, pos='n');
            if noun == None:
                continue
            if noun != "sth" and noun != 'etc' and noun not in dict_words_nouns[iWord]:
                dict_words_nouns[iWord].append(noun);
        if len(dict_words_nouns[iWord]) == 0:
            continue
        # Pool every noun's 'n' synsets as root candidates.
        synsetsSD = [];
        for word in dict_words_nouns[iWord]:
            synsets = WordnetHandler.get_synsets_for_word(word, 'n');
            for synset in synsets:
                synsetsSD.append(synset)
        if len(synsetsSD) == 0:
            continue
        if PARAMETERS.DICT_OX_FEATURE_RELATION_d == 1:
            # Add full-definition nouns and optional xh headwords.
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]));
            nouns = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS')];
            if PARAMETERS.DICT_OX_FEATURE_RELATION_xh == 1:
                if wordDict.has_key('xh0') and wordDict['xh0'] is not None and wordDict['xh0'] != 'nn':
                    nouns.append(wordDict['xh0']);
                if wordDict.has_key('xh1') and wordDict['xh1'] is not None:
                    nouns.append(wordDict['xh1']);
                if wordDict.has_key('xh2') and wordDict['xh2'] is not None:
                    nouns.append(wordDict['xh2']);
            for noun in nouns:
                noun = wordnet_lemmatizer.lemmatize(noun, pos='n');
                if noun == None:
                    continue
                if noun.encode('utf8') != word_concept and noun != "sth" and noun not in dict_words_nouns[iWord]:
                    dict_words_nouns[iWord].append(noun);
        # Root-synset selection over synsetsSD: score each candidate by the
        # top-1 of its path similarities to wn_words.  (Result only feeds
        # the dead synsetRoot — see docstring note.)
        iSDMax = 0;
        pSD_max = 0;
        for iSyn in range(len(synsetsSD)):
            synsetSD = synsetsSD[iSyn];
            pSD = 0;
            arr_p = [];
            for synset in wn_words:
                p = synsetSD.path_similarity(synset);
                p_noun_max = p;
                arr_p.append(p_noun_max);
            arr_p = sorted(arr_p, reverse=True);
            for i in xrange(0, len(arr_p)-1):
                if i <= 0:
                    pSD += arr_p[i];
            if pSD > pSD_max:
                pSD_max = pSD;
                iSDMax = iSyn;
        synsetRoot = synsetsSD[iSDMax];
        # For each collected noun keep its synset closest to any WordNet
        # sense of word_concept.
        for noun in dict_words_nouns[iWord]:
            synsets_noun = WordnetHandler.get_synsets_for_word(noun, 'n');
            if len(synsets_noun) <= 0:
                continue;
            p_noun_max = 0;
            synMax = synsets_noun[0];
            for synset_noun in synsets_noun:
                for synset in wn_words:
                    p = synset.path_similarity(synset_noun);
                    if p > p_noun_max:
                        p_noun_max = p;
                        synMax = synset_noun;
            if synMax not in dict_synsets_for_words[iWord]:
                dict_synsets_for_words[iWord].append(synMax);
        if PARAMETERS.POS_FEATURE_v:
            # Same pipeline for verbs from the full definition.
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]));
            verbs = [word for word,pos in tagged_sent if (pos == 'VB' or pos == 'VBN' or pos == 'VBD')];
            for verb in verbs:
                verb = wordnet_lemmatizer.lemmatize(verb, pos='v');
                if verb == None:
                    continue
                if verb.encode('utf8') != word_concept and verb != "sth" and verb not in dict_words_verbs[iWord]:
                    dict_words_verbs[iWord].append(verb);
            # Root selection again, this time summing the top-2 similarities.
            iSDMax = 0;
            pSD_max = 0;
            for iSyn in range(len(synsetsSD)):
                synsetSD = synsetsSD[iSyn];
                pSD = 0;
                arr_p = [];
                for synset in wn_words:
                    p = synsetSD.path_similarity(synset);
                    p_verb_max = p;
                    arr_p.append(p_verb_max);
                arr_p = sorted(arr_p, reverse=True);
                for i in xrange(0, len(arr_p)-1):
                    if i <= 1:
                        pSD += arr_p[i];
                if pSD > pSD_max:
                    pSD_max = pSD;
                    iSDMax = iSyn;
            synsetRoot = synsetsSD[iSDMax];
            for verb in dict_words_verbs[iWord]:
                synsets_verb = WordnetHandler.get_synsets_for_word(verb, 'v');
                if len(synsets_verb) <= 0:
                    continue;
                p_verb_max = 0;
                synMax = synsets_verb[0];
                for synset_verb in synsets_verb:
                    for synset in wn_words:
                        p = synset.path_similarity(synset_verb);
                        if p > p_verb_max:
                            p_verb_max = p;
                            synMax = synset_verb;
                if synMax not in dict_synsets_for_words[iWord]:
                    dict_synsets_for_words[iWord].append(synMax);
    return dict_synsets_for_words;
def sim_ox_wn_via_svm():
    """Evaluate SVM-based sense alignment over the whole Oxford dictionary
    against the gold data; print and persist precision/recall/F/accuracy.

    Returns (f_score, deep-copied current parameter set).  The 0.00001
    seeds on the tp/tn/fn/fp accumulators smooth away division by zero.
    """
    total_tp = 0.00001
    total_tn = 0.00001
    total_fn = 0.00001
    total_fp = 0.00001
    total_pair = 0
    dict_ox = OxfordParser.get_dict_nouns()
    # Leftover resume-at-"brook" debug flag; the checks are commented out.
    flag_can_go = False
    for word in dict_ox:
#        if word == "brook":
#            flag_can_go = True
#        if flag_can_go == False:
#            continue
        word_syns_ox = dict_ox[word]
        wn_synsets = WordnetHandler.get_synsets_for_word(word, "n")
        m2d_sim = [[0 for x in range(len(word_syns_ox))] for x in range(len(wn_synsets))]
        if len(word_syns_ox) == 1 and len(wn_synsets) == 1:
            # Unambiguous word: trivially aligned.
            m2d_sim[0][0] = 1
        else:
            m2d_sim = get_m2d_sim_for_word_from_svm_result(word)
            if m2d_sim == None:
                continue
        # Binarize the matrix and count predicted pairs.
        m2d_sim = choose_pair_0_1(m2d_sim, len(m2d_sim), len(m2d_sim[0]))
        pair = count_pair(m2d_sim)
        total_pair += pair
        (tp, tn, fn, fp) = CompareWithGold.compareGoldWithResult_without_cal_result(m2d_sim,word)
        # tp == -1 signals the word is absent from the gold data.
        if tp != -1:
            total_tp += tp
            total_tn += tn
            total_fn += fn
            total_fp += fp
    precision = total_tp / (total_tp + total_fp)
    recall = total_tp / (total_tp + total_fn)
    accuracy = (total_tp + total_tn) / (total_tp + total_tn + total_fp + total_fn)
    f_score = 0
    if precision != 0 or recall != 0:
        f_score = 2*(precision*recall)/(precision + recall)
    print "total:"
    print total_pair
    print total_tp
    print total_tn
    print total_fn
    print total_fp
    print precision
    print recall
    print f_score
    print accuracy
    Parameters.append_result_to_file( precision, recall, f_score, accuracy)
    current_params = Parameters.get_current_params()
    # Deep-copy so later parameter mutation can't alter the returned set.
    current_params = copy.deepcopy(current_params)
    return f_score, current_params