def get_top15(word, vocab_list, func):
    """Return a JSON response with the 15 most similar and the 15 most
    different vocabulary entries for *word*.

    word: query string.
    vocab_list: iterable of (id, text) pairs.
    func: similarity function (word, text) -> float; scores <= -100 are
          treated as "no similarity available" and dropped.
    """
    # Hoist the loop-invariant lemmatization of the query word; the
    # original recomputed it (and the similarity) for every entry.
    word_key = ' '.join(lemmatize_an_idea(word))
    sim_vec = []
    for w in vocab_list:
        # Skip entries that lemmatize to the same phrase as the query.
        if ' '.join(lemmatize_an_idea(w[1])) == word_key:
            continue
        sim = func(word, w[1])  # compute once instead of twice
        if sim > -100:
            sim_vec.append({'id': w[0], 'text': w[1], 'similarity': sim})
    sim_vec.sort(key=lambda t: t['similarity'])
    # "Similar" candidates exclude near-duplicates (similarity >= 0.5).
    for_sim = [t for t in sim_vec if t['similarity'] < 0.5]
    return jsonify(
        word=word,
        similar=[t for t in reversed(for_sim[-15:])],  # highest first
        different=sim_vec[:15])
def spacyPhraseSim(p1, p2):
    """Cosine similarity between two phrases.

    Each phrase is lemmatized, re-joined, run through the spaCy
    pipeline, and represented by the mean of its per-token vectors.
    """
    # TODO: find a more reasonable way to aggregate vector
    phrase_vectors = []
    for phrase in (p1, p2):
        doc = nlp(unicode(' '.join(lemmatize_an_idea(phrase))))
        phrase_vectors.append(np.mean([tok.repvec for tok in doc], axis=0))
    return float(cossim(phrase_vectors[0], phrase_vectors[1]))
def get_glove_sim_set(topic):
    """Flask handler: for a posted word, return up to 5 pre-built 3-word
    sets that are either similar or different to it.

    topic: 'weddingTheme' selects theme_dict_set, anything else selects
           prop_dict_set.
    Reads JSON body: {'word': {'id', 'text'}, 'operation': 'similar'|...}.
    Returns jsonify(word=..., similar=[...], different=[...]); only the
    list matching 'operation' is populated.
    """
    data = request.get_json()
    word = data['word']['text']
    func = gloveSim
    this_dict_set = theme_dict_set if topic == 'weddingTheme' else prop_dict_set
    vocab_list = this_dict_set['words']

    # Hoist the loop-invariant lemmatization of the query word and
    # compute each similarity only once (the original did both per
    # entry, twice).
    word_key = ' '.join(lemmatize_an_idea(word))
    sim_vec = []
    for w in vocab_list:
        if ' '.join(lemmatize_an_idea(w[1])) == word_key:
            continue  # entry lemmatizes to the query itself
        sim = func(word, w[1])
        if sim > -100:  # sentinel for "no similarity available"
            sim_vec.append({'id': w[0], 'text': w[1], 'similarity': sim})
    sim_vec.sort(key=lambda t: t['similarity'])
    for_sim = [t for t in sim_vec if t['similarity'] < 0.5]

    def _sets_for(seed_words):
        # For each seed word pick one random pre-built set from
        # set_dict and resolve its member indices back to
        # (id, text) dicts. Replaces the two duplicated branch bodies.
        sets = []
        for s in seed_words:
            s_idx = vocab_list.index((s['id'], s['text']))
            members = random.choice(this_dict_set['set_dict'][s_idx])
            sets.append(tuple(
                {'id': vocab_list[m][0], 'text': vocab_list[m][1]}
                for m in members[:3]))  # original read members[0..2]
        return sets

    similar_sets = []
    different_sets = []
    if data['operation'] == 'similar':
        # Highest-scoring candidates below the 0.5 cutoff, best first.
        similar_sets = _sets_for(list(reversed(for_sim[-5:])))
    else:
        # Lowest-scoring (most different) candidates.
        different_sets = _sets_for(sim_vec[:5])

    return jsonify(
        word=data['word'],
        similar=similar_sets,
        different=different_sets)
def vec_for_sentence(self, sentence):
    """Return this model's vector for *sentence* by lemmatizing it and
    delegating to vec_for_tokens.
    """
    # NOTE(review): the False flag presumably disables an option of the
    # lemmatizer — confirm against lemmatize_an_idea's signature.
    return self.vec_for_tokens(lemmatize_an_idea(sentence, False))
def get_sorted_similar(word, vocab_list, func):
    """Return vocabulary entries similar to *word*, sorted ascending by
    similarity, keeping only entries scoring below 0.5.

    word: query string.
    vocab_list: iterable of (id, text) pairs.
    func: similarity function (word, text) -> float; scores <= -100 are
          treated as "no similarity available" and dropped.
    Returns: list of {'id', 'text', 'similarity'} dicts.
    """
    # Hoist the loop-invariant lemmatization of the query word; the
    # original recomputed it (and the similarity) for every entry.
    word_key = ' '.join(lemmatize_an_idea(word))
    results = []
    for w in vocab_list:
        # Skip entries that lemmatize to the same phrase as the query.
        if ' '.join(lemmatize_an_idea(w[1])) == word_key:
            continue
        sim = func(word, w[1])  # compute once instead of twice
        if sim > -100:
            results.append({'id': w[0], 'text': w[1], 'similarity': sim})
    results.sort(key=lambda t: t['similarity'])
    return [t for t in results if t['similarity'] < 0.5]