def similar_poems_idx(query: str, poem_model, w2v_model, topn=5, use_associations=False) -> list:
    """Rank poems by semantic similarity to a free-text query.

    Args:
        query: Query text; split on whitespace and passed through
            ``sem.canonize_words``.
        poem_model: Mapping providing ``'matrices'`` (one matrix per poem)
            and, when ``use_associations`` is true, ``'a_matrices'`` indexed
            by the same poem position.
        w2v_model: Word-vector model handed to the ``sem`` helpers.
        topn: Maximum number of results to return.
        use_associations: If true, the query bag is extended with its
            semantic associations and each poem matrix is stacked with its
            association matrix before comparison. Poems whose own matrix is
            empty are skipped in this mode.

    Returns:
        Up to ``topn`` ``(poem_idx, similarity)`` tuples, highest similarity
        first. An empty list when the query produces an empty matrix.
    """
    # NOTE(review): a second definition of `similar_poems_idx` appears later
    # in this file and replaces this one at import time -- confirm which
    # version is meant to be kept.
    query_bag = sem.canonize_words(query.split())
    if use_associations:
        query_bag += sem.semantic_association(query_bag, w2v_model, topn=5)

    query_mx = sem.bag_to_matrix(query_bag, w2v_model)
    # Nothing to compare against when the query yields no vectors
    # (presumably no query word is known to the model -- TODO confirm).
    if len(query_mx) == 0:
        return []

    if use_associations:
        similars = [
            (i, sem.semantic_similarity_fast(
                query_mx, np.vstack((mx, poem_model['a_matrices'][i]))))
            for i, mx in enumerate(poem_model['matrices'])
            if len(mx) > 0
        ]
    else:
        similars = [
            (i, sem.semantic_similarity_fast(query_mx, mx))
            for i, mx in enumerate(poem_model['matrices'])
        ]

    similars.sort(key=lambda pair: pair[1], reverse=True)
    return similars[:topn]
def similar_poems_idx(query: str, poem_model, w2v_model, topn=5, use_associations=False) -> list:
    """Return the best-matching poems for ``query`` as ``[(poem_idx, sim)]``.

    The query is split on whitespace, canonized with ``sem.canonize_words``
    and converted to a matrix with ``sem.bag_to_matrix``. If that matrix is
    empty, an empty list is returned.

    With ``use_associations`` the query bag is first extended by
    ``sem.semantic_association`` and each non-empty poem matrix is stacked
    with the matching entry of ``poem_model['a_matrices']``; similarity is
    computed with ``sem.semantic_similarity_fast``. Without it, every entry
    of ``poem_model['matrices']`` is scored with
    ``sem.semantic_similarity_fast_log``.

    The ``topn`` highest-scoring pairs are selected with ``heapq.nlargest``.
    """
    # NOTE(review): an earlier definition of the same name precedes this one
    # in the file and is overridden by it -- confirm that is intended.
    words = sem.canonize_words(query.split())
    if use_associations:
        words += sem.semantic_association(words, w2v_model, topn=5)

    query_mx = sem.bag_to_matrix(words, w2v_model)
    if len(query_mx) == 0:
        return []

    scored = []
    if use_associations:
        for idx, poem_mx in enumerate(poem_model['matrices']):
            if len(poem_mx) == 0:
                continue
            combined = np.vstack((poem_mx, poem_model['a_matrices'][idx]))
            scored.append((idx, sem.semantic_similarity_fast(query_mx, combined)))
    else:
        for idx, poem_mx in enumerate(poem_model['matrices']):
            scored.append((idx, sem.semantic_similarity_fast_log(query_mx, poem_mx)))

    return heapq.nlargest(topn, scored, key=lambda pair: pair[1])
def make_poems_model(file_name: str, semantics=True) -> dict:
    """Build the poems model dictionary from a poems file.

    Poems are loaded with ``read_poems`` and turned into bags plus a
    vocabulary by ``make_bags``. When ``semantics`` is true, the word2vec
    model at ``sem.WORD2VEC_MODEL_FILE`` is loaded and used to compute a
    semantic density and a semantic association list for every bag;
    otherwise both of those lists stay empty. Progress messages are printed
    along the way.

    Returns:
        A dict with the keys ``'poems'``, ``'bags'``, ``'vocabulary'``,
        ``'density'``, ``'associations'`` and ``'rates'`` (one ``0.0`` per
        poem).
    """
    print("making poems model...")
    poems = read_poems(file_name)
    print('poem count:', len(poems))
    bags, vocabulary = make_bags(poems)

    densities = []
    associations = []
    if semantics:
        print("loading w2v_model...")
        w2v = sem.load_w2v_model(sem.WORD2VEC_MODEL_FILE)
        print("adding semantics to model...")
        densities = [
            sem.semantic_density(bag, w2v, unknown_coef=-0.001)
            for bag in bags
        ]
        associations = [sem.semantic_association(bag, w2v) for bag in bags]

    # Every poem starts with a zero rating.
    initial_rates = [0.0] * len(poems)
    print("model created")
    return {
        'poems': poems,
        'bags': bags,
        'vocabulary': vocabulary,
        'density': densities,
        'associations': associations,
        'rates': initial_rates,
    }