def context_sim(new_tokens, trained_model, type_range, metric='maxsimc'):
    """Score word-pair similarity using each word's sentence context.

    For every token pair: collect all WordNet synsets for both words,
    build an averaged context vector from each word's sentence (via
    context_parser), keep only the synsets that exist in the embedding
    model, and score the pair with the chosen contextual metric.

    Args:
        new_tokens: iterable of token objects exposing word1/word2 and
            sent1/sent2 (the context sentences).
        trained_model: embedding model used for vector lookups.
        type_range: passed through unchanged to the similarity metric.
        metric: one of 'maxsimc', 'avgsimc', 'globalsimc'.

    Returns:
        List of bench_data.Token_Data(word_a, word_b, sim_value).

    Raises:
        ValueError: if `metric` is not a recognized name. (Previously an
            unknown metric left sim_value unbound and the loop crashed
            with UnboundLocalError.)
    """
    moretokens = []
    for new_token in new_tokens:
        word_a = new_token.word1
        word_b = new_token.word2
        # list of synset pairs
        synsets_a = sp.synset_all(word_a)
        synsets_b = sp.synset_all(word_b)
        # average vector for the context for each word
        context_a = context_parser(word_a, new_token.sent1, trained_model)
        context_b = context_parser(word_b, new_token.sent2, trained_model)
        # keep only synsets that exist in the model
        vec_syna = sp.validate_synsets_model(word_a, synsets_a, trained_model)
        vec_synb = sp.validate_synsets_model(word_b, synsets_b, trained_model)
        if metric == 'maxsimc':
            sim_value = maxSimC(vec_syna, context_a, vec_synb, context_b, type_range)
        elif metric == 'avgsimc':
            sim_value = avgSimC(vec_syna, context_a, vec_synb, context_b, type_range)
        elif metric == 'globalsimc':
            sim_value = globalSimC(context_a, context_b, type_range)
        else:
            # fail fast instead of UnboundLocalError further down
            raise ValueError("unknown metric: %r" % (metric,))
        token_prime = bench_data.Token_Data(word_a, word_b, sim_value)
        moretokens.append(token_prime)
    return moretokens
def build_synset_packages(word, *pos):
    """Wrap each WordNet synset of *word* in a bench_data.WNData record.

    When no *pos arguments are given, every synset of the word is used;
    otherwise sp.synset_pos restricts the lookup. NOTE(review): the
    *pos tuple is handed to sp.synset_pos as-is (not unpacked) —
    presumably that helper expects a sequence; verify against its
    definition. Words absent from the model are not handled here.

    Returns:
        List of WNData(synset, offset, pos, definition) packages.
    """
    if pos:
        # restrict to the requested parts of speech
        candidates = sp.synset_pos(word, pos)
    else:
        # all parts of speech
        candidates = sp.synset_all(word)
    return [
        bench_data.WNData(syn, syn.offset(), syn.pos(), syn.definition())
        for syn in candidates
    ]
def nocontext_sim(tokens, trained_model, type_range, metric='avgsim'):
    """Score word-pair similarity without using sentence context.

    For every token pair: collect all WordNet synsets for both words,
    keep only the synsets present in the embedding model, and score the
    pair with the chosen context-free metric.

    Args:
        tokens: iterable of token objects exposing word1/word2.
        trained_model: embedding model used for vector lookups.
        type_range: passed through unchanged to the similarity metric.
        metric: one of 'maxsim', 'avgsim', 'globalsim'.

    Returns:
        List of bench_data.Token_Data(word_a, word_b, sim_value).

    Raises:
        ValueError: if `metric` is not a recognized name. (Previously an
            unknown metric left sim_value unbound and the loop crashed
            with UnboundLocalError.)
    """
    moretokens = []
    for token in tokens:
        word_a = token.word1
        word_b = token.word2
        synsets_a = sp.synset_all(word_a)
        synsets_b = sp.synset_all(word_b)
        # keep only synsets that exist in the model
        vec_syna = sp.validate_synsets_model(word_a, synsets_a, trained_model)
        vec_synb = sp.validate_synsets_model(word_b, synsets_b, trained_model)
        if metric == 'maxsim':
            sim_value = maxSim(vec_syna, vec_synb, type_range)
        elif metric == 'avgsim':
            sim_value = avgSim(vec_syna, vec_synb, type_range)
        elif metric == 'globalsim':
            sim_value = globalSim(vec_syna, vec_synb, type_range)
        else:
            # fail fast instead of UnboundLocalError further down
            raise ValueError("unknown metric: %r" % (metric,))
        token_prime = bench_data.Token_Data(word_a, word_b, sim_value)
        moretokens.append(token_prime)
    return moretokens
def context_parser(anchor_word, text_items, trained_model):
    """Average the model vectors of every synset key found in a sentence.

    Each context word is expanded into all of its WordNet synsets; each
    synset is turned into a model lookup key via sp.key_parser, and keys
    missing from the model are silently skipped. The element-wise mean
    of all collected vectors (axis=0) is returned.

    NOTE(review): anchor_word is currently unused — the disabled line
    below suggests it was meant to exclude the target word from its own
    context. If no in-model key is found at all, numpy.average runs on
    an empty list; confirm callers guarantee a non-empty context.
    """
    collected = []
    for item in text_items:
        # if item == anchor_word: continue  # (disabled) skip the target/anchor word to avoid bias
        for syn in sp.synset_all(item):
            lookup = sp.key_parser(item, syn)
            try:
                # gather every in-model vector; averaged once at the end
                collected.append(trained_model.word_vec(lookup))
            except KeyError:
                pass  # key not in the model
    return numpy.average(collected, axis=0)
def build_synset_packages_refi(word, embed_model, *pos):
    """Build WNData records for *word* with embedding vectors attached.

    Synsets come from every part of speech unless *pos restricts them.
    For each synset, a model key is derived with sp.key_parser and the
    matching vector (looked up through sp.retrieve_synsetvec) is stored
    on the WNData package's `vector` attribute.

    Returns:
        List of vector-annotated bench_data.WNData packages.
    """
    candidates = sp.synset_pos(word, pos) if pos else sp.synset_all(word)
    packages = []
    for syn in candidates:
        wn_key = sp.key_parser(word, syn)
        package = bench_data.WNData(syn, syn.offset(), syn.pos(), syn.definition())
        package.vector = sp.retrieve_synsetvec(wn_key, embed_model)
        packages.append(package)
    return packages