def calculate_left_right_similarityForpairing(left_vect, left_tfidf_part, left_part, left_extract,
                                               right_vect, right_tfidf_part, right_part, right_extract,
                                               titles):
    # transform the left side of the extract into a tfidf vector
    left_tfidf_extract = left_vect.transform([left_extract])
    # similarity of the left side with all the left sides in the database
    left_similarity_scores = cosine_similarity(left_tfidf_extract, left_tfidf_part)
    # transform the right side of the extract into a tfidf vector
    right_tfidf_extract = right_vect.transform([right_extract])
    # similarity of the right side with all the right sides in the database
    right_similarity_scores = cosine_similarity(right_tfidf_extract, right_tfidf_part)
    # total similarity score for each entry in the database
    similarity_scores = left_similarity_scores + right_similarity_scores
    # indexes of the top 5 scores (not yet in sorted order)
    ind = np.argpartition(similarity_scores, -5)[0][-5:]
    # the top 5 scores (not yet in sorted order)
    selected_scores = similarity_scores[0][ind]
    # indexes in descending order of score
    sorted_ind = ind[np.argsort(selected_scores)][::-1]
    titleAndScores = ([titles[sorted_ind[0]]] + [sorted_ind[0]]
                      + [int(left_similarity_scores[0][i] * 100) for i in sorted_ind]
                      + [int(right_similarity_scores[0][i] * 100) for i in sorted_ind])
    return titleAndScores
def print_recommendations_kmeans(df, km, svd_trans, album_idx, n=25, min_n=2000):
    '''
    Prints list of recommended albums with kmeans preselect

    Args:
        df: dataframe with Pitchfork reviews
        km: fitted sklearn KMeans object
        svd_trans: the low dimensional representation of each review
        album_idx: the iloc value of the album for which to generate reccs
        n: number of albums to recommend
        min_n: min number of samples to preselect with kmeans
    Returns:
        None
    '''
    sims_clusters = cosine_similarity(svd_trans[album_idx, :].reshape(1, -1),
                                      km.cluster_centers_).flatten()
    cluster_assgns = km.predict(svd_trans)
    # collect candidate albums from the closest clusters until min_n is reached
    idx = []
    for cluster in np.argsort(sims_clusters)[::-1]:
        idx.extend(np.where(cluster_assgns == cluster)[0])
        if len(idx) > min_n:
            break
    sel = np.bool_(np.ones(len(svd_trans)))
    sel[idx] = 0
    sims = cosine_similarity(svd_trans[album_idx, :].reshape(1, -1), svd_trans)
    sims[:, sel] = -1
    df_temp = df.iloc[np.argsort(sims).flatten()[-n:]]
    df_temp['sim_scores'] = np.sort(sims.flatten())[-n:]
    print df_temp[['url', 'genres', 'sim_scores']][::-1]
def train(self, images, texts, K, lr_img, lr_txt, batch_size=16, verbose=False):
    assert images.shape[0] == texts.shape[0], "Must have same number of images and texts"
    n_train = images.shape[0]
    n_batches = n_train // batch_size
    for batch_id in range(n_batches):
        if verbose:
            fwrite('Mini-batch : %2d/%2d\r' % (batch_id, n_batches))
            sys.stdout.flush()
        begin = batch_id * batch_size
        end = (batch_id + 1) * batch_size
        for idx_p in range(begin, end):
            self.forward_count = 0
            im = images[idx_p]
            txt = texts[idx_p]
            x = np.dot(im, self.W_img)
            y = np.dot(txt, self.W_txt)
            t_txt = np.tile(txt.reshape(-1, 1), self.n_hid).T
            t_im = np.tile(im.reshape(-1, 1), self.n_hid).T
            s = cosine_similarity(x.reshape(1, -1), y.reshape(1, -1))[0][0]
            n = 0
            while self.forward_count < K:
                # safeguard against an endless negative-sampling loop
                n += 1
                if n >= 100:
                    break
                idx_n = np.random.randint(n_train)
                txt_n = texts[idx_n]
                y_n = np.dot(txt_n, self.W_txt)
                s_n = cosine_similarity(x.reshape(1, -1), y_n.reshape(1, -1))[0][0]
                # hinge-style loss on the negative pair
                J = 0.5 + s_n - s
                if J > 0.:
                    t_txt_n = np.tile(txt_n.reshape(-1, 1), self.n_hid).T
                    self.update_grads(x, y, s, y_n, s_n, t_im, t_txt, t_txt_n)
        self.backward(lr_img, lr_txt, batch_size)
    if verbose:
        fwrite('\n')
def create_tf_idf_sim_matrix(title_rev_log, desc_rev_log, cr_area_top_level, title_file_name):
    # tfidf_vectorizer = TfidfVectorizer(stop_words='english', decode_error='ignore')
    tfidf_vectorizer = TfidfVectorizer(decode_error='ignore')
    title_rev_log_tfidf_matrix = tfidf_vectorizer.fit_transform(title_rev_log)
    desc_rev_log_tfidf_matrix = tfidf_vectorizer.fit_transform(desc_rev_log)
    cr_area_top_level_tfidf_matrix = tfidf_vectorizer.fit_transform(cr_area_top_level)
    title_file_name_tfidf_matrix = tfidf_vectorizer.fit_transform(title_file_name)
    # similarity of the first document against every document, per field
    title_rev_log_sim_matrix = cosine_similarity(title_rev_log_tfidf_matrix[0:1],
                                                 title_rev_log_tfidf_matrix)
    desc_rev_log_sim_matrix = cosine_similarity(desc_rev_log_tfidf_matrix[0:1],
                                                desc_rev_log_tfidf_matrix)
    cr_area_top_level_sim_matrix = cosine_similarity(cr_area_top_level_tfidf_matrix[0:1],
                                                     cr_area_top_level_tfidf_matrix)
    title_file_name_sim_matrix = cosine_similarity(title_file_name_tfidf_matrix[0:1],
                                                   title_file_name_tfidf_matrix)
    return (title_rev_log_sim_matrix, desc_rev_log_sim_matrix,
            cr_area_top_level_sim_matrix, title_file_name_sim_matrix)
def rank_tweets(tweets):
    vectorizer = CountVectorizer(min_df=1)
    X = vectorizer.fit_transform(tweets)
    vectors = X.toarray()
    # centroid of all tweet vectors (float division to avoid truncating counts)
    sumvectors = [0] * len(vectors[0])
    for v in vectors:
        for i, val in enumerate(v):
            sumvectors[i] += val
    centroid = np.array([x / float(len(vectors)) for x in sumvectors]).reshape(1, -1)
    # cosine similarity of every tweet to the centroid
    dists = []
    for i, vector in enumerate(vectors):
        dists.append([i, cosine_similarity(vector.reshape(1, -1), centroid)[0][0]])
    ranked_tweets = []
    ranked_vectors = []
    dists.sort(key=lambda x: x[1], reverse=True)
    for v in dists:
        vector = vectors[v[0]]
        sim = False
        # skip tweets that duplicate an already selected tweet
        for v2 in ranked_vectors:
            if cosine_similarity(vector.reshape(1, -1), v2.reshape(1, -1))[0][0] == 1.0:
                sim = True
        if not sim:
            ranked_tweets.append(tweets[v[0]])
            ranked_vectors.append(vector)
        if len(ranked_tweets) == 10:
            break
    return ranked_tweets
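# Hedged usage sketch, not part of the original source: exercising rank_tweets()
# on a handful of made-up tweets (values are illustrative only). Assumes numpy,
# CountVectorizer and cosine_similarity are importable in this module.
demo_tweets = ["free pizza downtown today",
               "free pizza downtown today",
               "traffic jam on the bridge",
               "concert tickets on sale now"]
print(rank_tweets(demo_tweets))  # duplicates are dropped, result ranked by centrality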
def find(request):
    if request.method == 'GET':
        query = request.GET.get('query')
        s = SessionStore()
        db.sessionHistory.update({'session_key': s.session_key},
                                 {'$push': {"query": [query]}}, upsert=True)
        from sklearn.metrics.pairwise import cosine_similarity
        from sklearn.feature_extraction.text import TfidfVectorizer
        dic = passQuery(query)
        docs = dic['docs']
        ids = dic['ids']
        print len(ids)
        all_url = []
        urls = db.crawledScienceCollection.find()
        for url in urls:
            all_url.append(url)
        selected_url = []
        for i in range(0, len(ids)):
            selected_url.append(all_url[ids[i]]['url'])
        print selected_url
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(docs)
        print tfidf_matrix.shape
        cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
        template = loader.get_template('results.html')
        context = {'docs': docs, 'url': selected_url, 'zip': zip(docs, selected_url)}
        return HttpResponse(template.render(context, request))
def gloveSolver_multiplication():
    count = 0
    global deviation
    deviation = 0.001
    for i, linet in enumerate(lineText):
        word = str(linet[2]).lower()
        if vectorDictionary.has_key(str(linet[0]).lower()):
            a = np.array(vectorDictionary[str(linet[0]).lower()])
            if vectorDictionary.has_key(str(linet[1]).lower()):
                b = np.array(vectorDictionary[str(linet[1]).lower()])
                if vectorDictionary.has_key(str(linet[2]).lower()):
                    c = np.array(vectorDictionary[str(linet[2]).lower()])
                    # shift the cosine similarities from [-1, 1] into [0, 1]
                    aresult = (cosine_similarity(a.reshape(1, -1), vecMatrix[0:30000]) + 1) / 2
                    bresult = (cosine_similarity(b.reshape(1, -1), vecMatrix[0:30000]) + 1) / 2
                    cresult = (cosine_similarity(c.reshape(1, -1), vecMatrix[0:30000]) + 1) / 2
                    # avoid division by zero in the multiplicative objective
                    if not aresult.all():
                        aresult = aresult + deviation
                    similarity = cresult * bresult / aresult
                    indexOfSimilarity = np.argmax(similarity)
                    if wordDictionary[indexOfSimilarity] == str(linet[3]).lower():
                        count = count + 1
    multiplicationModal = float(count) / float(len(lineText))
    print f, multiplicationModal
def sim_score(path_problem, lexicon_8gram, lexicon_3gram, lexicon_bigram, lexicon_unigram):
    sim_score = {}
    for path, subdirs, files in os.walk(path_problem):
        for name_dir in subdirs:
            print(name_dir)
            vec_feature = []
            sim_score_fw = []
            sim_score_stylo = []
            sim_score_8gram = []
            sim_score_3gram = []
            sim_score_bigram = []
            sim_score_unigram = []
            # pick the stopword list from the problem's language prefix
            lang = name_dir[:2]
            if lang == 'EN':
                fw_file = './stopwords/english.txt'
            elif lang == 'DU':
                fw_file = './stopwords/dutch.txt'
            elif lang == 'GR':
                fw_file = './stopwords/greek.txt'
            elif lang == 'SP':
                fw_file = './stopwords/spanish.txt'
            dir_path = os.path.join(path_problem, name_dir)
            for name1 in glob.glob(dir_path + "/unknown.txt"):
                file_path_unknown = os.path.join(dir_path, name1)
                fw_unknown = feature_extractor.freq_function_word(file_path_unknown, fw_file)
                stylo_unknown = feature_extractor.stylometric_features(file_path_unknown, lang)
                eight_gr_unknown = feature_extractor.tfidf(file_path_unknown, 8, 'char', lexicon_8gram)
                three_gr_unknown = feature_extractor.tfidf(file_path_unknown, 3, 'char', lexicon_3gram)
                bigram_unknown = feature_extractor.tfidf(file_path_unknown, 2, 'word', lexicon_bigram)
                unigram_unknown = feature_extractor.tfidf(file_path_unknown, 1, 'word', lexicon_unigram)
                for name2 in glob.glob(dir_path + "/known??.txt"):
                    file_path_known = os.path.join(dir_path, name2)
                    fw_known = feature_extractor.freq_function_word(file_path_known, fw_file)
                    stylo_known = feature_extractor.stylometric_features(file_path_known, lang)
                    eight_gr_known = feature_extractor.tfidf(file_path_known, 8, 'char', lexicon_8gram)
                    three_gr_known = feature_extractor.tfidf(file_path_known, 3, 'char', lexicon_3gram)
                    bigram_known = feature_extractor.tfidf(file_path_known, 2, 'word', lexicon_bigram)
                    unigram_known = feature_extractor.tfidf(file_path_known, 1, 'word', lexicon_unigram)
                    sim_score_fw.append(minmax_sim(fw_unknown, fw_known))
                    sim_score_stylo.append(vec_diff(stylo_unknown, stylo_known))
                    sim_score_8gram.append(cosine_similarity(eight_gr_unknown, eight_gr_known))
                    sim_score_3gram.append(cosine_similarity(three_gr_unknown, three_gr_known))
                    sim_score_bigram.append(cosine_similarity(bigram_unknown, bigram_known))
                    sim_score_unigram.append(cosine_similarity(unigram_unknown, unigram_known))
            # average each feature's similarity over all known documents
            vec_feature.append(np.mean(sim_score_stylo))
            vec_feature.append(np.mean(sim_score_fw))
            vec_feature.append(np.mean(sim_score_8gram))
            vec_feature.append(np.mean(sim_score_3gram))
            vec_feature.append(np.mean(sim_score_bigram))
            vec_feature.append(np.mean(sim_score_unigram))
            sim_score[name_dir] = vec_feature
    sort = OrderedDict(sorted(sim_score.items(), key=lambda s: s[0]))
    return sort
def main(output_dir, use_2015F, query_nums, sim_cutoff=.5, use_semsim=False):
    results = []
    for query_num in query_nums:
        event = [e for e in cuttsum.events.get_events()
                 if e.query_num == query_num][0]
        print event
        gold_probs = False
        df = get_input_stream(event, gold_probs, use_2015F=use_2015F, truncate=1)
        df = df.loc[df["stems"].apply(len) >= 10]
        df = df.reset_index(drop=True)
        print df[["update id", "sent text"]]
        if use_semsim:
            semsims = get_all_semsim()
            X_l = semsims[event.type].transform(
                df["stems"].apply(lambda x: ' '.join(x)).tolist())
            K = cosine_similarity(X_l)
        else:
            # bag-of-words term counts per sentence
            Xtf = []
            for stems in df["stems"].tolist():
                sc = {}
                for stem in stems:
                    sc[stem] = sc.get(stem, 0) + 1
                Xtf.append(sc)
            dv = DictVectorizer()
            Xtf = dv.fit_transform(Xtf)
            print Xtf
            K = cosine_similarity(Xtf)
        print K
        # greedily keep sentences not too similar to any previously kept sentence
        S = [0]
        for s in range(1, len(df)):
            max_prev_sim = K[s, :s].max()
            if max_prev_sim < sim_cutoff:
                S.append(s)
        for sent_text in df.iloc[S]["pretty text"].tolist():
            print sent_text
        for _, row in df.iloc[S].iterrows():
            d = row.to_dict()
            d["query id"] = query_num
            d["conf"] = .5
            d["team"] = "CUNLP"
            d["run id"] = "{}.c{}".format("sem" if use_semsim else "bow", sim_cutoff)
            results.append(d)
    df = pd.DataFrame(results, columns=["query id", "team", "run id",
                                        "stream id", "sent id", "timestamp", "conf"])
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    o = os.path.join(output_dir, "{}.c{}.tsv".format(
        "sem" if use_semsim else "bow", sim_cutoff))
    df.to_csv(o, sep="\t", header=False, index=False)
def cosine_sim(vec1, vec2):
    try:
        s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
    except:
        try:
            s = cosine_similarity(vec1, vec2)[0][0]
        except:
            s = MISSING_VALUE_NUMERIC
    return s
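# Hedged usage sketch, not part of the original source: calling cosine_sim() on
# two toy 1-D vectors. The vector values are illustrative only; numpy and a
# module-level MISSING_VALUE_NUMERIC sentinel are assumed to exist here.
import numpy as np

demo_a = np.array([1.0, 0.0, 2.0])
demo_b = np.array([0.5, 1.0, 1.0])
print(cosine_sim(demo_a, demo_b))  # a float in [-1, 1]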
def getCos_topic(tokens):
    List = get_keyword()
    component = List[0] + List[1] + List[2] + List[3]
    print(component)
    function = List[4] + List[5]
    data = List[6]
    rootcause = List[7] + List[8] + List[9]
    type = []
    t1 = get_topicvector(component, tokens)
    t2 = get_topicvector(function, tokens)
    t3 = get_topicvector(data, tokens)
    t4 = get_topicvector(rootcause, tokens)
    a1 = createCaseArray(component)
    a2 = createCaseArray(function)
    a3 = createCaseArray(data)
    a4 = createCaseArray(rootcause)
    print("-------------" + "topic similarity -----------------------------")
    minnum = 0
    if t1 != []:
        num1 = cosine_similarity(t1, a1)
        print(num1)
    else:
        num1 = 0
    if t2 != []:
        num2 = cosine_similarity(t2, a2)
    else:
        num2 = 0
    if t3 != []:
        num3 = cosine_similarity(t3, a3)
    else:
        num3 = 0
    if t4 != []:
        num4 = cosine_similarity(t4, a4)
    else:
        num4 = 0
    sum = num1 + num2 + num3 + num4
    minnum = min(num1, num2, num3, num4)
    if (sum != 0):
        print(minnum)
        if (num1 == minnum):
            type.append("component")
        if (num2 == minnum):
            type.append("function")
        if (num3 == minnum):
            type.append("data")
        if (num4 == minnum):
            type.append("rootcause")
    else:
        return ["other"]
    print(type)
    return type
def checkTriangleSanity(keypoint1, keypoint2, keypoint3):
    # CHECK 1: Compute distances between pairs of points
    distX_12 = keypoint1.pt[0] - keypoint2.pt[0]
    distY_12 = keypoint1.pt[1] - keypoint2.pt[1]
    distX_23 = keypoint2.pt[0] - keypoint3.pt[0]
    distY_23 = keypoint2.pt[1] - keypoint3.pt[1]
    distX_31 = keypoint3.pt[0] - keypoint1.pt[0]
    distY_31 = keypoint3.pt[1] - keypoint1.pt[1]
    dist_12 = abs(distX_12) + abs(distY_12)
    dist_23 = abs(distX_23) + abs(distY_23)
    dist_31 = abs(distX_31) + abs(distY_31)
    # placeholder for the unused return slots
    _ = '_'
    if dist_12 < 5 or dist_23 < 5 or dist_31 < 5:
        return False, _, _, _, _, _
    # CHECK 2: Compute distance ratios between pairs of points
    ratio1 = dist_12 / float(dist_31)
    ratio2 = dist_12 / float(dist_23)
    if ratio1 < 0.33 or ratio1 > 3 or ratio2 < 0.33 or ratio2 > 3:
        return False, _, _, _, _, _
    # CHECK 3: Compute the angle between every two lines
    # delta_1 = angle between vec 1->2 and vec 1->3
    vec_12 = [-distX_12, -distY_12]
    vec_13 = [distX_31, distY_31]
    cos_delta_1 = cosine_similarity([vec_12], [vec_13])[0][0]
    delta_1 = math.degrees(math.acos(cos_delta_1))
    if delta_1 < 15:
        return False, _, _, _, _, _
    # delta_2 = angle between vec 2->3 and vec 2->1
    vec_23 = [-distX_23, -distY_23]
    vec_21 = [distX_12, distY_12]
    cos_delta_2 = cosine_similarity([vec_23], [vec_21])[0][0]
    delta_2 = math.degrees(math.acos(cos_delta_2))
    if delta_2 < 15:
        return False, _, _, _, _, _
    # the third angle of the triangle formed by the three input points
    # delta_3 = angle between vec 3->1 and vec 3->2
    delta_3 = 180 - delta_1 - delta_2
    if delta_3 < 15:
        return False, _, _, _, _, _
    # compute the 5-tuple representation for this triangle
    # atan2() -> range (-pi, pi)
    # keypoint.angle -> range (-pi, pi) OR (0, 2*pi) (NOT SURE) !!!
    alpha = keypoint1.angle + delta_1 - math.degrees(math.atan2(distY_31, distX_31))
    beta = keypoint2.angle + delta_2 - math.degrees(math.atan2(distY_12, distX_12))
    gamma = keypoint3.angle + delta_3 - math.degrees(math.atan2(distY_23, distX_23))
    return True, delta_1, delta_2, alpha, beta, gamma
def ReturnRank(self, Data_2Rank):
    """
    Returns the rank indexes [-1 to +1] of the data based on cosine similarity
    with the Interested and NotInterested vectors.

    Input: Data_2Rank, the feedparser output of the data to rank
    Output: Rank indexes [-1 to +1] of the input data based on cosine similarity.
            -1 is the highest rank and +1 is the lowest rank.
    """
    Text_2Rank = (entry.title + entry.summary for entry in Data_2Rank.entries)
    Vectors_2Rank = self.vectorizer.transform(Text_2Rank)
    InterestedCosineRank = cosine_similarity(self.InterestedVector, Vectors_2Rank)[0]
    NotInterestedCosineRank = cosine_similarity(self.NotInterestedVector, Vectors_2Rank)[0]
    return NotInterestedCosineRank - InterestedCosineRank
def __call__(self, X1, X2):
    rows = []
    for key_1, value_1 in X1.iteritems():
        if self.xstats == 1:
            x1 = np.array([centroid(e) for e in value_1]).flatten()
        elif self.xstats == 2:
            x1 = np.array([dispersion(e) for e in value_1]).flatten()
        else:
            x1_cen = np.array([centroid(e) for e in value_1]).flatten()
            x1_dis = np.array([dispersion(e) for e in value_1]).flatten()
        columns = []
        for key_2, value_2 in X2.iteritems():
            if self.xstats == 1:
                x2 = np.array([centroid(e) for e in value_2]).flatten()
            elif self.xstats == 2:
                x2 = np.array([dispersion(e) for e in value_2]).flatten()
            else:
                x2_cen = np.array([centroid(e) for e in value_2]).flatten()
                x2_dis = np.array([dispersion(e) for e in value_2]).flatten()
            if self.similarity == 1:
                if self.xstats == 3:
                    value_cen = polynomial_kernel(x1_cen, x2_cen).flatten()[0]
                    value_dis = polynomial_kernel(x1_dis, x2_dis).flatten()[0]
                    value = (value_cen + value_dis) / 2
                else:
                    value = polynomial_kernel(x1, x2).flatten()[0]
            else:
                if self.xstats == 3:
                    value_cen = cosine_similarity(x1_cen, x2_cen).flatten()[0]
                    value_dis = cosine_similarity(x1_dis, x2_dis).flatten()[0]
                    value = (value_cen + value_dis) / 2
                else:
                    value = cosine_similarity(x1, x2).flatten()[0]
            if self.domain_adapt:
                # double the value for cross-domain pairs (domain boundary at key 10500)
                if (key_1 < 10500 and key_2 < 10500) or (key_1 > 10500 and key_2 > 10500):
                    columns.append(value)
                else:
                    columns.append(2 * value)
            else:
                columns.append(value)
        rows.append(columns)
    m = np.asarray(rows)
    print m.shape
    return m
def similarity(ratings, kind='user'):
    if kind == 'user':
        # treat each row as one data point -> N x N user-user similarity
        sim = cosine_similarity(ratings)
        assert (sim.shape[0] == ratings.shape[0])
    elif kind == 'item':
        # transpose so each item is a data point -> p x p item-item similarity
        sim = cosine_similarity(ratings.T)
        assert (sim.shape[0] == ratings.shape[1])
    sim[np.isnan(sim)] = 0  # guard against NaNs (e.g. all-zero rating vectors)
    np.fill_diagonal(sim, 0)  # a user/item should not count as its own neighbor
    return sim
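# Hedged usage sketch, not part of the original source: a tiny made-up ratings
# matrix (3 users x 4 items) run through similarity() in both modes. Assumes
# numpy and cosine_similarity are importable in this module.
import numpy as np

demo_ratings = np.array([[5., 3., 0., 1.],
                         [4., 0., 0., 1.],
                         [1., 1., 0., 5.]])
demo_user_sim = similarity(demo_ratings, kind='user')  # (3, 3), zero diagonal
demo_item_sim = similarity(demo_ratings, kind='item')  # (4, 4), zero diagonal
print(demo_user_sim.shape, demo_item_sim.shape)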
def compute_maximum_similarity(input_user_tweets_file):
    similarity_values = []
    tweets_grouped_by_user.insert(0, input_user_tweets_file)
    user_tweets = [open(user_tweets) for user_tweets in tweets_grouped_by_user]
    tfidf_files = TfidfVectorizer(input='file').fit_transform(user_tweets)
    # compare the input user's tweets (row 0) against every other user
    for i in range(1, tfidf_files.shape[0]):
        print cosine_similarity(tfidf_files[0], tfidf_files[i])
        similarity_values.append(cosine_similarity(tfidf_files[0], tfidf_files[i]))
    most_similar_measure = max(similarity_values)
    # most_similar_doc_index = similarity_values.index(most_similar_measure)
    return most_similar_measure
def make_cosine_list(matrix):
    # Returns a list whose nth entry is the cosine similarity of the nth and
    # (n+1)th utterance in matrix (the final utterance cannot have a cosine score).
    cosines = []
    if switch == 'tfidf':
        # matrix type: scipy.sparse.lil.lil_matrix
        n_vectors = matrix.shape[0]
        for i in xrange(n_vectors - 1):
            cosines.append(cosine_similarity(matrix[i:i+1], matrix[i+1:i+2])[0][0])
    elif switch == 'lda' or switch == 'doc2vec':
        # matrix type: list
        n_vectors = len(matrix)
        for i in xrange(n_vectors - 1):
            cosines.append(cosine_similarity(matrix[i].reshape(1, -1),
                                             matrix[i+1].reshape(1, -1))[0][0])
    return cosines
def similarities_without_duplicates(tfidf_matrix, length):
    include = [1] * length
    # similarity of every document to the first (query) document
    similarities_base = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix).flatten()
    for i in range(length):
        if include[i] == 0:
            continue
        similarities = cosine_similarity(tfidf_matrix[i:i + 1], tfidf_matrix).flatten()
        # mark near-duplicates of document i so they are excluded from the ranking
        for j in range(2, length):
            if similarities[j] > 0.98 and j != i:
                include[j] = 0
    for i in range(length):
        if include[i] == 0:
            similarities_base[i] = 0
    return similarities_base
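# Hedged usage sketch, not part of the original source: a small TF-IDF matrix
# with a near-duplicate document, ranked against the first document. The texts
# are illustrative only; cosine_similarity is assumed importable in this module.
from sklearn.feature_extraction.text import TfidfVectorizer

demo_docs = ["the cat sat on the mat",
             "a completely different text",
             "the cat sat on the mat"]  # duplicate of the first document
demo_tfidf = TfidfVectorizer().fit_transform(demo_docs)
demo_scores = similarities_without_duplicates(demo_tfidf, len(demo_docs))
print(demo_scores)  # the duplicate's score (index 2) is zeroed out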
def extract_numerical_feature(seg):
    list = []
    num_query = len(seg[11].strip().split("|"))
    num_keyword = len(seg[12].strip().split("|"))
    num_title = len(seg[13].strip().split("|"))
    num_description = len(seg[14].strip().split("|"))
    list.append(str(process_Id_Feature("num_query", " ")) + ":" + str(num_query))
    list.append(str(process_Id_Feature("num_keyword", " ")) + ":" + str(num_keyword))
    list.append(str(process_Id_Feature("num_description", " ")) + ":" + str(num_description))
    list.append(str(process_Id_Feature("num_title", " ")) + ":" + str(num_title))
    tfidf_vectorizer = TfidfVectorizer()
    corpus = seg[11:15]  # query, keyword, title, description
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    # pairwise cosine similarities between the four text fields
    query_similar_keyword = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    query_similar_title = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[2:3])
    query_similar_description = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[3:])
    keyword_similar_title = cosine_similarity(tfidf_matrix[1:2], tfidf_matrix[2:3])
    # title vs description (row 2 vs row 3, per the tfidf_similarity(tfidf[2], tfidf[3]) intent)
    title_similar_description = cosine_similarity(tfidf_matrix[2:3], tfidf_matrix[3:])
    list.append(str(process_Id_Feature(query_similar_keyword, " ")) + ":" + str(query_similar_keyword))
    list.append(str(process_Id_Feature(query_similar_title, " ")) + ":" + str(query_similar_title))
    list.append(str(process_Id_Feature(query_similar_description, " ")) + ":" + str(query_similar_description))
    list.append(str(process_Id_Feature(keyword_similar_title, " ")) + ":" + str(keyword_similar_title))
    list.append(str(process_Id_Feature(title_similar_description, " ")) + ":" + str(title_similar_description))
    depth = float(seg[4])
    position = float(seg[5])
    relative_pos = float((depth - position) * 10.0 / depth)
    list.append(str(process_Id_Feature("relative_pos_num", " ")) + ":" + str(relative_pos))
    return list
def make_cosine_list(matrix):
    # Returns a list whose nth entry is the cosine similarity of the nth and (n+1)th utt in matrix
    cosines = []
    if switch == 'doc2vec' or switch == 'lda':
        # Matrix is a numpy array
        n_vectors = len(matrix)
        for i in xrange(n_vectors - 1):
            cosines.append(cosine_similarity(matrix[i].reshape(1, -1),
                                             matrix[i+1].reshape(1, -1))[0][0])
    elif switch == 'tfidf':
        n_vectors = matrix.shape[0]
        # Iterate over each utt in doc_matrix, score with the subsequent utt
        # (the final utt cannot have a cosine score)
        for i in xrange(n_vectors - 1):
            # Index the SciPy matrix by slice to extract vectors
            cosines.append(cosine_similarity(matrix[i:i+1], matrix[i+1:i+2])[0][0])
    return cosines
def user_similarities_one_to_many(user_car_feat, df_cars_feat, df_cars_scraped, n_predict):
    cosines = []
    # parallelize if this takes long
    for car_all in df_cars_feat.featurized:
        cosines_all_one_car = []
        for car in car_all:
            cosines_all_one_car.append(cosine_similarity(user_car_feat, car)[0][0])
        cosines.append(max(cosines_all_one_car))
    cosines = np.array(cosines)
    indexes = cosines.argsort()[::-1]
    df_cars_top = df_cars_feat.ix[indexes][:n_predict]
    df_cars_top = pd.merge(df_cars_top, df_cars_scraped, on='link')
    car_links = df_cars_top.link.values
    car_img_links = df_cars_top.img_x.apply(lambda x: x[0]).values
    car_model_year = df_cars_top.model_year.values.astype(int)
    car_make_and_model = df_cars_top.make_and_model.values
    car_price = df_cars_top.price.values
    # replace missing prices with 0.0
    car_price_clean = []
    for price in car_price:
        if np.isnan(price):
            car_price_clean.append(0.0)
        else:
            car_price_clean.append(price)
    car_price = car_price_clean
    result = zip(car_links, car_img_links, car_model_year, car_make_and_model, car_price)
    return result
def readJson3(jpath, s, finalTerms, finalTermsIDF, queryTFIDFs):
    reviews = {}
    authors = {}
    dates = {}
    j = 0
    cosinesAll = {}
    for key0 in queryTFIDFs.keys():
        cosinesAll[key0] = []
    for f in os.listdir(jpath):
        fpath = os.path.join(jpath, f)
        if os.path.isfile(fpath):
            jfile = open(fpath).read()
            jsondata = json.loads(jfile)
            try:
                for k in range(len(jsondata['Reviews'])):
                    try:
                        reviews[s + str(j)] = jsondata['Reviews'][k]['Content']
                        authors[s + str(j)] = jsondata['Reviews'][k]['Author']
                        dates[s + str(j)] = jsondata['Reviews'][k]['Date']
                        tokens = tokenizer.tokenize(reviews[s + str(j)])
                        stemmedTokens = []
                        stemmedTokenF = []
                        bigram = []
                        for t in range(len(tokens)):
                            # collapse pure numbers into a NUM token before stemming
                            try:
                                tk = int(tokens[t])
                                tk = "NUM"
                            except ValueError:
                                tk = tokens[t]
                            stemmedToken = stemmer.stem(tk.lower())
                            stemmedTokens.append(stemmedToken)
                            if stemmedToken in finalTerms:
                                stemmedTokenF.append(stemmedToken)
                        for m in range(len(stemmedTokens) - 1):
                            tm = stemmedTokens[m] + '-' + stemmedTokens[m + 1]
                            if tm in finalTerms:
                                bigram.append(tm)
                        unibigram = stemmedTokenF + bigram
                        c1 = Counter(unibigram)
                        # tf-idf vector of this review over the fixed vocabulary
                        tfidfEachReview = []
                        for x in range(len(finalTerms)):
                            if c1[finalTerms[x]] > 0:
                                tf = 1 + np.log(c1[finalTerms[x]])
                            else:
                                tf = 0
                            tfidf = tf * finalTermsIDF[x]
                            tfidfEachReview.append(tfidf)
                        for key0, value0 in queryTFIDFs.iteritems():
                            cosine = cosine_similarity(value0, tfidfEachReview)
                            infoDoc = (cosine, reviews[s + str(j)],
                                       authors[s + str(j)], dates[s + str(j)])
                            cosinesAll[key0].append(infoDoc)
                        j += 1
                    except ValueError:
                        print 'Cannot find Review Content!'
            except ValueError:
                print 'Cannot find Review!'
    # keep the top 3 reviews per query by cosine similarity
    cosines = {}
    for k, value in cosinesAll.iteritems():
        cosines[k] = sorted(cosinesAll[k], key=itemgetter(0), reverse=True)[:3]
    return cosines
def t_test_accuracy(topic_id, n_runs, estimator_params_votes_per_doc_tuples):
    """ Test if accuracy for estimators with given parameters is significantly
        better than that of the first estimator in the tuple
    """
    texts, vote_lists, truths = texts_vote_lists_truths_by_topic_id[topic_id]
    vectorizer = TfidfVectorizer()
    text_similarity = cosine_similarity(vectorizer.fit_transform(texts))
    accuracy_arrays = []
    for estimator, args, votes_per_doc in estimator_params_votes_per_doc_tuples:
        stop_idx = votes_per_doc * len(texts)
        # Now get n_runs accuracies and put them into numpy arrays
        accuracies = Parallel(n_jobs=4)(
            delayed(get_accuracy_sequence)(estimator, stop_idx, texts, vote_lists,
                                           truths, text_similarity, idx, True, *args)
            for idx in xrange(n_runs))
        accuracy_arrays.append(np.array(filter(lambda x: x is not None, accuracies)))
    # Baseline
    result_row = []
    result_row.append("%0.2f" % np.mean(accuracy_arrays[0]))
    # T-tests
    for accuracy_array in accuracy_arrays[1:]:
        _, pval = ttest_ind(accuracy_array, accuracy_arrays[0], equal_var=False)
        significance_indicator = lambda p: "*" if p < 0.01 else " "
        is_better = "$" if np.mean(accuracy_array) > np.mean(accuracy_arrays[0]) else " "
        result_row.append("%0.2f %s %s" % (np.mean(accuracy_array),
                                           significance_indicator(pval), is_better))
    return "|".join(result_row)
def MMR(docs, count):
    # Setup
    select_lst = [docs.pop(0)]
    candidates = []
    tfidf_vectorizer = TfidfVectorizer()
    relevance_weight = 0.9
    # Start recalculating scores
    while len(select_lst) != len(docs):
        select_sen = []
        for i in select_lst:
            select_sen.append(i.sentence)
        for candidate in docs:
            old_score = candidate.rating
            stemmed_sen = stemming([candidate])
            stemmed_lst = stemming(select_lst)
            tfidf_matrix = tfidf_vectorizer.fit_transform(stemmed_lst)
            target = tfidf_vectorizer.transform(stemmed_sen)
            # penalize candidates that are similar to already selected sentences
            similarities = cosine_similarity(target, tfidf_matrix).flatten()
            similarities.sort()
            similarity = similarities[-1]
            new_score = old_score * relevance_weight - similarity * (1 - relevance_weight)
            candidate.rating = new_score
        docs = sorted(docs, key=attrgetter("rating"), reverse=True)
        select_lst.append(docs.pop(0))
    return select_lst
def classify(self, x):
    """Transforms and classifies x"""
    x = x.lower()
    x_matrix = self.input_transform_fnc([x])
    x = cosine_similarity(self.tfidf_matrix, x_matrix)
    idx = numpy.where(x == max(x))[0][0]
    return self.y[idx]
def concept_to_concept_threshold_char(concept, name):
    conList = []
    # compare the full concept against every character-level prefix of it
    docs = [(name + concept)]
    for x in range(0, len(concept)):
        docs.append((name + concept[0:x]))
    tfidf_matrix = tfidf_vectorizer.fit_transform(docs)
    matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    for row in matrix:
        for x in row[1:]:
            conList.append(x)
    mean = statistics.mean(conList)
    stdev = statistics.pstdev(conList)
    thld = 1 - (Num_Deviations * stdev)
    # thld1 = thld - abs(mean_confidence_interval(mean, stdev))
    return thld
def create_similarity_matrix_text(self, features=None):
    if features is None:
        features = self.text_features
    if features is None:
        print 'You must provide the text features as argument or run extract_text_features() first'
    else:
        self.text_similarity_matrix = cosine_similarity(features)
def writeOutput(tfidf_matrix_train, relname, othername, othername1, realname, realname1,
                authorMap, listsize, outname):
    temp = []
    for j in range(tfidf_matrix_train.shape[0]):
        targetname = relname[j]
        targetname = targetname.strip()
        for n in range(len(othername1)):
            othername1[n] = othername1[n].strip()
            if (targetname == othername1[n]):
                print j, targetname
                temp.append(j)
                result = cosine_similarity(tfidf_matrix_train[j:j+1], tfidf_matrix_train)
                # indexes of all documents sorted by similarity to document j
                index = [i[0] for i in sorted(enumerate(result[0]), key=lambda x: x[1], reverse=True)]
                recList = index[1:listsize+1]
                if (j in authorMap.keys()):
                    coauther = authorMap.get(j)
                    coauther = list(coauther)
                    recList = deleteCoauthor(coauther, recList, index)
                recomd = []
                similar = []
                for k in recList:
                    recomd.append(relname[k])
                    similar.append(result[0][k])
                with open('../RecommendationResult/' + outname + 'Model/' + relname[j] + '.txt', 'w') as fn:
                    fn.write(realname1[n] + " ID:" + str(j) + "\n\n")
                    for n1 in xrange(len(recomd)):
                        for m in range(len(othername)):
                            person = recomd[n1]
                            person = person.strip()
                            othername[m] = othername[m].strip()
                            if (person == othername[m]):
                                fn.write(str(n1+1) + ". " + str(realname[m]) + ": " + str(similar[n1]) + "\n")
    return temp
def print_cluster_sim(centers):
    sim_matrix = cosine_similarity(centers, centers)
    # header row: cluster indexes
    print('\t'.join('{}'.format(j) for j, _ in enumerate(sim_matrix)))
    # print only the upper triangle of the similarity matrix
    for i, row in enumerate(sim_matrix):
        print('\t'.join(('{:.2f}'.format(x) if i < j else '    ')
                        for j, x in enumerate(row)), i, sep='\t')
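# Hedged usage sketch, not part of the original source: printing the pairwise
# cosine similarities of a few random cluster centers. The center matrix is
# made up; cosine_similarity is assumed importable in this module.
import numpy as np

demo_centers = np.random.RandomState(0).rand(4, 8)  # 4 centers, 8 features
print_cluster_sim(demo_centers)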
def hCSimilarity(train_fn, test_fn, save_fn):
    '''
    calculate History and Candidate matrix similarity
    n x 100 , m x 100 -> n x m
    #@TODO : there are a lot of ways to calculate similarity
    '''
    fp = open(train_fn, 'r')
    train = pickle.load(fp).toarray()
    fp = open(test_fn, 'r')
    test = pickle.load(fp).toarray()
    result = np.zeros((len(train), len(test)))
    # vectorized pairwise cosine similarity replaces the older element-wise loop
    result = cosine_similarity(train, test)
    saveItem(result, save_fn)
def first_level_grouping(feature_map_dict, encoded_list_rearrange_concat, mask_arr,
                         all_keys, keys_1d, keys_2d, keys_3d=[]):
    height = 32
    width = 20
    relation_all_df = pd.DataFrame(0, columns=all_keys, index=all_keys)
    num_data = len(encoded_list_rearrange_concat[0])
    for n in range(num_data):
        print('n: ', n)
        for ds_name1 in all_keys:
            # 1D case
            if ds_name1 in keys_1d:
                temp_arr1 = feature_map_dict[ds_name1][n, :]  # (24, 1, 1, 1)
                # duplicate the 1D series over the spatial grid: (24, 1) -> (32, 20, 24)
                temp_1d_dup = np.repeat(temp_arr1, 32, axis=1)
                temp_1d_dup = np.repeat(temp_1d_dup, 20, axis=2)  # (24, 32, 20, 1)
                temp_1d_dup = np.squeeze(temp_1d_dup, axis=-1)  # (24, 32, 20)
                temp_1d_dup = np.moveaxis(temp_1d_dup, 0, -1)  # (32, 20, 24)
                dim1 = temp_arr1.shape[0]  # number of layers in the 2d data
                for ds_name2 in all_keys:
                    # 1D VS 1D
                    if ds_name2 in keys_1d:
                        ave_SR = 0
                        temp_arr2 = feature_map_dict[ds_name2][n, :]
                        sim_sparse = cosine_similarity(temp_arr1.reshape(1, -1),
                                                       temp_arr2.reshape(1, -1))
                        ave_SR = sim_sparse[0][0]
                        relation_all_df.loc[ds_name1, ds_name2] += ave_SR
                    # 2D VS 1D
                    # 2D: (32, 20, 1); 1D duplicate: (32, 20, 24) has no spatial variation,
                    # and duplicating 2D along the last axis has no temporal variation, so
                    # the flatten-and-compare step is currently disabled and contributes 0.
                    if ds_name2 in keys_2d:
                        relation_all_df.loc[ds_name1, ds_name2] += 0
                    # 3D VS 1D: duplicate 1D to 3D, flatten and compare
                    if ds_name2 in keys_3d:
                        temp_arr2 = feature_map_dict[ds_name2][n, :, :, :, :]  # 3d, e.g. (24, 32, 20, 1)
                        temp_arr2 = np.squeeze(temp_arr2, axis=-1)  # (24, 32, 20)
                        temp_arr2 = np.moveaxis(temp_arr2, 0, -1)  # (32, 20, 24)
                        ave_SR = 0  # average spearman correlation
                        compress_arr2 = remove_outside_cells(temp_arr2, mask_arr)
                        compress_arr1 = remove_outside_cells(temp_1d_dup, mask_arr)
                        sim_sparse = cosine_similarity(compress_arr1.reshape(1, -1),
                                                       compress_arr2.reshape(1, -1))
                        ave_SR = sim_sparse[0][0]
                        relation_all_df.loc[ds_name1, ds_name2] += ave_SR
            # 2D case
            if ds_name1 in keys_2d:
                temp_arr1 = feature_map_dict[ds_name1][n, :, :, :]  # (32, 20, 1)
                for ds_name2 in all_keys:
                    # 2D VS 1D: reuse the symmetric entry
                    if ds_name2 in keys_1d:
                        relation_all_df.loc[ds_name1, ds_name2] = relation_all_df.loc[ds_name2, ds_name1]
                    # 2D VS 2D: flatten the masked grids and compare
                    if ds_name2 in keys_2d:
                        ave_SR = 0  # average spearman correlation
                        temp_arr2 = feature_map_dict[ds_name2][n, :, :, :]
                        compress_arr2 = remove_outside_cells(temp_arr2, mask_arr)
                        compress_arr1 = remove_outside_cells(temp_arr1, mask_arr)
                        sim_sparse = cosine_similarity(compress_arr1.reshape(1, -1),
                                                       compress_arr2.reshape(1, -1))
                        ave_SR = sim_sparse[0][0]
                        relation_all_df.loc[ds_name1, ds_name2] += ave_SR
                    # 2D VS 3D: 2D feature maps have a 3rd dimension of 1, 3D maps have more,
                    # so average the 3D map along its 3rd dimension, then flatten and compare
                    if ds_name2 in keys_3d:
                        temp_arr2 = feature_map_dict[ds_name2][n, :, :, :, :]  # (24, 32, 20, 1)
                        temp_arr2 = np.squeeze(temp_arr2, axis=-1)  # (24, 32, 20)
                        temp_arr2 = np.moveaxis(temp_arr2, 0, -1)  # (32, 20, 24)
                        # average along the third dimension
                        temp_arr2_mean = np.mean(temp_arr2, axis=-1)
                        temp_arr2_mean_dup = np.expand_dims(temp_arr2_mean, axis=-1)  # (32, 20, 1)
                        compress_arr2 = remove_outside_cells(temp_arr2_mean_dup, mask_arr)
                        compress_arr1 = remove_outside_cells(temp_arr1, mask_arr)
                        ave_SR = 0  # average spearman correlation
                        sim_sparse = cosine_similarity(compress_arr1.reshape(1, -1),
                                                       compress_arr2.reshape(1, -1))
                        ave_SR = sim_sparse[0][0]
                        relation_all_df.loc[ds_name1, ds_name2] += ave_SR
            # 3D case
            if ds_name1 in keys_3d:
                temp_arr1 = feature_map_dict[ds_name1][n, :, :, :, :]  # (24, 32, 20, 1)
                temp_arr1 = np.squeeze(temp_arr1, axis=-1)  # (24, 32, 20)
                temp_arr1 = np.moveaxis(temp_arr1, 0, -1)  # (32, 20, 24)
                for ds_name2 in all_keys:
                    # 3D VS 1D: reuse the symmetric entry
                    if ds_name2 in keys_1d:
                        relation_all_df.loc[ds_name1, ds_name2] = relation_all_df.loc[ds_name2, ds_name1]
                    # 3D VS 2D: reuse the symmetric entry
                    if ds_name2 in keys_2d:
                        temp_arr2 = feature_map_dict[ds_name2]
                        relation_all_df.loc[ds_name1, ds_name2] = relation_all_df.loc[ds_name2, ds_name1]
                    # 3D VS 3D: flatten and compare, because the 3rd dimension carries temporal information
                    if ds_name2 in keys_3d:
                        temp_arr2 = feature_map_dict[ds_name2][n, :, :, :, :]
                        temp_arr2 = np.squeeze(temp_arr2, axis=-1)  # (24, 32, 20)
                        temp_arr2 = np.moveaxis(temp_arr2, 0, -1)  # (32, 20, 24)
                        ave_SR = 0  # average spearman correlation
                        compress_arr2 = remove_outside_cells(temp_arr2, mask_arr)
                        compress_arr1 = remove_outside_cells(temp_arr1, mask_arr)
                        sim_sparse = cosine_similarity(compress_arr1.reshape(1, -1),
                                                       compress_arr2.reshape(1, -1))
                        ave_SR = float(sim_sparse[0][0])
                        relation_all_df.loc[ds_name1, ds_name2] += ave_SR
    relation_all_df = relation_all_df / num_data
    return relation_all_df
def run(test, n_songs, n_tags, spr_list, tag_tid_id):
    start = time.time()
    train_user_songs_A, train_user_tags_A,\
        test_title, title_sp, gnr_sp, test_gnr_sp,\
        title_gnr, test_title_gnr = spr_list
    res = []
    for i in range(len(test)):
        dat = test.iloc[i]
        pid = i
        songs_already = dat["songs"]
        tags_already = dat["tags_id"]
        if len(dat['songs']) != 0 and len(dat['tags_id']) != 0:
            p = np.zeros((n_songs, 1))
            p[dat['songs']] = 1
            val_song = cosine_similarity(train_user_songs_A, p.T)
            pp = np.zeros((n_tags, 1))
            pp[dat['tags_id']] = 1
            val_tag = cosine_similarity(train_user_tags_A, pp.T)
            val_title_genre = cosine_similarity(title_gnr, test_title_gnr[i:(i + 1)])
            val = val_song * val_tag * val_title_genre
        elif len(dat['songs']) != 0:
            p = np.zeros((n_songs, 1))
            p[dat['songs']] = 1
            val_song = cosine_similarity(train_user_songs_A, p.T)
            val_title_genre = cosine_similarity(title_gnr, test_title_gnr[i:(i + 1)])
            val = val_song * val_title_genre
        elif len(dat['tags_id']) != 0:
            p = np.zeros((n_tags, 1))
            p[dat['tags_id']] = 1
            val = cosine_similarity(train_user_tags_A, p.T)
            if len(dat['plylst_title']) != 0:
                val_title = cosine_similarity(title_sp, test_title[i:(i + 1)])
                val = val * val_title
        else:
            val = cosine_similarity(title_sp, test_title[i:(i + 1)])
        # rows are songs, columns are users: weighting by similar users gives
        # high scores to the songs those similar users listened to
        cand_song = train_user_songs_A.T.tocsr().dot(val)
        # take the top 300 songs by score
        cand_song_idx = cand_song.reshape(-1).argsort()[-300:][::-1]
        # drop songs already in the playlist and keep 100
        cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:100]
        # same procedure for tags
        cand_tag = train_user_tags_A.T.tocsr().dot(val)
        cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1]
        cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10]
        rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]
        res.append({
            "id": test.loc[pid, 'id'],
            "songs": list(cand_song_idx),
            "tags": rec_tag_idx
        })
        if i % 1000 == 0:
            print("{} time :".format(i), time.time() - start)
    write_json(res, "pre_tag.json")
def getCosineSimilarity(self, vec_Pair):
    sim_array = cosine_similarity(vec_Pair[0], vec_Pair[1])
    return sim_array[0][0]
def find_similarity(test_description_modified):
    ## make sparse array of the filtered test description
    test_document_vector = vectorizer.transform([test_description_modified])
    test_document_encoded = (test_document_vector.toarray())
    ## Cosine similarity:
    ## all_documents_similarity stores the similarity and the primary key/index together,
    ## because we will sort the list to select the top similar descriptions and need the indexes
    all_documents_similarity = []
    for i in range(len(all_documents_encoded)):
        all_documents_similarity.append(
            [cosine_similarity(all_documents_encoded[i], test_document_encoded), i])
    ## Select similar documents: select the top X% of the sorted values
    ## NOTE - Is there a better way to decide the percentage than trial and error?
    Xpercent = 0.15  ## Top 10 documents
    topXpercent = int(len(all_documents_similarity) * (Xpercent / 100))
    ## Sort only as many similarities as needed - don't sort all 7000 or so entries,
    ## only enough to get the topXpercent similarity values
    all_documents_similarity_sorted_topXpercent = sortTopXpercent(
        all_documents_similarity, topXpercent)
    print("\nRating of most similar app:",
          all_documents_similarity_sorted_topXpercent[0][0][0][0])
    ## If the highest similarity is low, ask the user to add more description for better
    ## results, to handle single-word or single-line descriptions. To handle persistent
    ## users, add a button if they want analytics with only that much description?
    if (all_documents_similarity_sorted_topXpercent[0][0][0][0] < 0.35):
        print("For better analytics, enter more description specific to your app idea")
    ## Link datasets and find the weighted average of ratings and other details
    total_weight = 0
    total_weighted_rating = 0
    users_by_rating_dict = {"1.0": 0, "1.5": 0, "2.0": 0, "2.5": 0, "3.0": 0,
                            "3.5": 0, "4.0": 0, "4.5": 0, "5.0": 0}
    ## For the equalized 'number of users' - the Apple App Store lets you rate with integers from 1 - 5
    users_by_rating_equalized_dict = {"One": 0, "Two": 0, "Three": 0, "Four": 0, "Five": 0}
    users_by_ageGroup_dict = {"Children_5": 0, "Teenager_13": 0, "Adult_18": 0, "Elderly_50": 0}
    total_users_that_rated = 0
    ## Arbitrary installs factor, chosen by intuition - the Apple dataset has no install counts,
    ## so we assume one person in every 'total_users_that_rated/installs_factor' persons rates the app
    installs_factor = 250  ## depends on topXpercent
    for i in range(len(all_documents_similarity_sorted_topXpercent)):
        document_rating = rating_array[(all_documents_similarity_sorted_topXpercent[i][1])]
        document_rating_count = rating_count_array[(all_documents_similarity_sorted_topXpercent[i][1])]
        if document_rating_count == 0:
            continue
        if document_rating == 0:
            continue
        document_name = track_name_array[(all_documents_similarity_sorted_topXpercent[i][1])]
        document_id = all_documents_similarity_sorted_topXpercent[i][1]
        print("id:", document_id, "name:", document_name, "rating:", document_rating,
              "rating_count:", document_rating_count, "similarity:",
              all_documents_similarity_sorted_topXpercent[i][0][0][0])
        ## Final average rating = weighted average of the ratings of the topXpercent similar documents,
        ## with document weight = similarity score multiplied by document_rating_count
        document_weight = all_documents_similarity_sorted_topXpercent[i][0][0][0] * document_rating_count
        document_weighted_rating = document_weight * document_rating
        total_weighted_rating = total_weighted_rating + document_weighted_rating
        total_weight = total_weight + document_weight
        ## For the actual graph of "number of users" by "rating given"
        this_rating = str(document_rating)
        users_at_this_rating = users_by_rating_dict[this_rating]
        users_by_rating_dict[this_rating] = int(users_at_this_rating + document_weight)
        ## For the equalized graph of "number of users" by "rating given":
        ## for the 5 different ratings we keep 5 dicts of usage percentages. The percentages are
        ## arbitrary averages chosen by intuition - the Apple dataset has no per-rating user counts.
        users_by_rating_equalized_dict = users_by_rating_equalized_dict_modify_by_percentage(
            this_rating, users_by_rating_equalized_dict, document_weight)
        ## Keep count of total_users_that_rated a particular app
        total_users_that_rated = total_users_that_rated + document_weight
        ## For the graph of "number of users" by "age group":
        ## for the 4 different content ratings we keep 4 dicts of usage percentages for the 4 age groups.
        ## Again arbitrary averages - the Apple dataset has no per-age-group ratings or installs.
        age_group = age_group_array[(all_documents_similarity_sorted_topXpercent[i][1])]
        users_by_ageGroup_dict = users_by_ageGroup_dict_modify_by_percentage(
            age_group, users_by_ageGroup_dict, document_weight)
        print("users:", document_weight, "rating:", document_rating, "Content rating:", age_group)
    ## Final average rating
    final_rating = total_weighted_rating / total_weight
    ## Total users by rating - actual (output not modified):
    print("\nUsers by rating - actual:", users_by_rating_dict)
    ## total_users_that_rated as the total number of ratings given
    print("\nTotal users that are likely to rate: ", int(total_users_that_rated))
    ## Total users by rating - equalized (output modified):
    ## users_by_rating_dict is not equalized, e.g. it could end up as
    ## {'1.0': 0, '1.5': 0, '2.0': 0, '2.5': 0, '3.0': 503946, '3.5': 0, '4.0': 27265, '4.5': 9559, '5.0': 0}
    ## with no users at rating 1.0, 2.0, and so on, which would not give a distributed graph.
    ## We therefore equalize the graph so the peaks get distributed and the bar graph is smoother
    ## (arguably manipulation of the dataset, but the Kaggle dataset has no user count per rating increment).
    ## NOTE - Could not find a library for this, so write a function for equalization?
    print("\nUsers by rating - equalized:", users_by_rating_equalized_dict)
    ## Total predicted installs
    factor = total_users_that_rated / installs_factor
    print("\nTotal installs: ", int(total_users_that_rated * factor))
    ## Total predicted "users that rated" ordered by age
    print("\nUsers that rated, ordered by age:", users_by_ageGroup_dict)
    ## Total predicted "installs" ordered by age
    for x in users_by_ageGroup_dict:
        temp4 = users_by_ageGroup_dict[x]
        users_by_ageGroup_dict[x] = int(temp4 * factor)
    print("\nInstalls, ordered by age:", users_by_ageGroup_dict)
    ## Use the genre of the description with the highest similarity
    prime_genre = genre[all_documents_similarity_sorted_topXpercent[0][1]]
    print("\nPrime Genre: ", prime_genre)
    ## Rounded-off final rating
    print("\nPredicted rating: ", round(final_rating, 2))  ## 2 decimal places
    if (final_rating >= 4):
        selling_ability = '"Selling_Ability" : "Excellent"'
    elif (final_rating >= 3 and final_rating < 4):
        selling_ability = '"Selling_Ability" : "Good"'
    elif (final_rating >= 2 and final_rating < 3):
        selling_ability = '"Selling_Ability" : "Average"'
    elif (final_rating >= 1 and final_rating < 2):
        selling_ability = '"Selling_Ability" : "Poor"'
    ## Build a string containing all the output data in JSON form
    frontend_json = ""
    frontend_json += '{ "Predicted_Rating" : ' + '"' + str(round(final_rating, 2)) + '",'
    frontend_json += ' ' + selling_ability
    frontend_json += ', "Detected_Genre" : ' + '"' + prime_genre + '",'
    frontend_json += ' "Total_Installs" : ' + '"' + str(int(total_users_that_rated * factor)) + '",'
    frontend_json += ' "Total_Users_That_Rated" : ' + '"' + str(int(total_users_that_rated)) + '",'
    ## changing single quotes in the dict keys to double quotes
    json_graph_dict_ratings = json.dumps(users_by_rating_equalized_dict)
    json_graph_dict_age_group = json.dumps(users_by_ageGroup_dict)
    frontend_json += ' "Graph_Users_By_Ratings" : ' + '[ ' + json_graph_dict_ratings + ' ],'
    frontend_json += ' "Graph_Installs_By_Age_Group" : ' + '[ ' + json_graph_dict_age_group + ' ],'
    top_3_string_concat = "{ "
    for k in range(0, 3):
        top_3_string_concat += '"' + 'One' + str(k + 1) + '" : [ '
        top_3_document_name = track_name_array[(all_documents_similarity_sorted_topXpercent[k][1])]
        top_3_document_rating = rating_array[(all_documents_similarity_sorted_topXpercent[k][1])]
        top_3_document_rating_count = rating_count_array[(all_documents_similarity_sorted_topXpercent[k][1])]
        top_3_this_document_installs = top_3_document_rating_count * (top_3_document_rating_count / installs_factor)
        this_description_trunc = (unmodified_description_array[
            (all_documents_similarity_sorted_topXpercent[k][1])][0:350]).replace("\n", " ") + "..."
        top_3_dict_concat = ('{ "Name" : "Name: ' + top_3_document_name
                             + '", "Rating" : "Rating: ' + str(top_3_document_rating)
                             + '", "Similarity_Score" : "Similarity Score: '
                             + str(round(all_documents_similarity_sorted_topXpercent[k][0][0][0] * 100))
                             + '%", "This_Description" : "Description: ' + this_description_trunc + '" }')
        top_3_string_concat += top_3_dict_concat + ' ]'
        if (k != 2):
            top_3_string_concat += ', '
    top_3_string_concat += " }"
    frontend_json += ' "Top_3_Similar_Apps" : ' + '[ ' + top_3_string_concat + ' ]'
    frontend_json += ' }'
    print("FRONTEND\n")
    return frontend_json
def task1cFunc(userid): # read mltags, mlrating, mlmovies, movie-actor mltagsFile = pd.read_csv('mltags.csv') mlratingsFile = pd.read_csv('mlratings.csv') genomeFile = pd.read_csv('genome-tags.csv') movieFile = pd.read_csv('smallmlmovies.csv') # Exgtract tag from tagid genomeFile['tagid'] = genomeFile['tagId'] del genomeFile['tagId'] mltagsFile = pd.merge(mltagsFile, genomeFile, on='tagid') s = movieFile["genres"].str.split('|', expand=True).stack() i = s.index.get_level_values(0) movieFile = movieFile.loc[i].copy() movieFile["genres"] = s.values # Extract movie from movieid del movieFile['year'] mlratingsFile = pd.merge(mlratingsFile, movieFile, on='movieid') mltagsFile = pd.merge(mltagsFile, movieFile, on='movieid') mltagsFileUser = mltagsFile.loc[mltagsFile['userid'] == userid] tagUserMovies = mltagsFileUser['moviename'].values mlratingsFileUser = mlratingsFile.loc[mlratingsFile['userid'] == userid] ratingUserMovies = mlratingsFileUser['moviename'].values tagRatingUserMovies = list(set(tagUserMovies) | set(ratingUserMovies)) mltagsFileUser['timestamp'] = pd.to_datetime(mltagsFileUser['timestamp']) mltagsFileUser['timestamp'] = (mltagsFileUser['timestamp'] - dt.datetime(1970, 1, 1)).dt.total_seconds() mltagsFileUser['timestamp'] = \ ((mltagsFileUser['timestamp'] - mltagsFileUser['timestamp'].min()) / (mltagsFileUser['timestamp'].max() - mltagsFileUser['timestamp'].min()+1))+1 mlratingsFileUser['timestamp'] = pd.to_datetime( mlratingsFileUser['timestamp']) mlratingsFileUser['timestamp'] = ( mlratingsFileUser['timestamp'] - dt.datetime(1970, 1, 1)).dt.total_seconds() mlratingsFileUser['timestamp'] = \ ((mlratingsFileUser['timestamp'] - mlratingsFileUser['timestamp'].min()) / (mlratingsFileUser['timestamp'].max() - mlratingsFileUser['timestamp'].min()+1))+1 commonTagRating = list(set(tagUserMovies) & set(ratingUserMovies)) uncommonTag = list(set(tagUserMovies) ^ set(commonTagRating)) uncommonRating = list(set(ratingUserMovies) ^ set(commonTagRating)) timeWeights = {} for i in range(len(commonTagRating)): tag = mltagsFileUser.loc[mltagsFileUser['moviename'] == commonTagRating[i]]['timestamp'].values[0] rating = mlratingsFileUser.loc[ mlratingsFileUser['moviename'] == commonTagRating[i]]['timestamp'].values[0] if tag > rating: timeWeights[commonTagRating[i]] = tag else: timeWeights[commonTagRating[i]] = rating for i in range(len(uncommonRating)): rating = mlratingsFileUser.loc[ mlratingsFileUser['moviename'] == uncommonRating[i]]['timestamp'].values[0] timeWeights[uncommonRating[i]] = rating for i in range(len(uncommonTag)): tag = mltagsFileUser.loc[mltagsFileUser['moviename'] == uncommonTag[i]]['timestamp'].values[0] timeWeights[uncommonTag[i]] = tag # # deleting columns that are not required del mlratingsFile['timestamp'] del mlratingsFile['imdbid'] del mlratingsFile['userid'] del mltagsFile['timestamp'] del mltagsFile['userid'] del mltagsFile['tagid'] # creating a dictionary with movieid as key and a list of all tags associated with tha movie and removing duplicates movieGenreDict = { k: g['genres'].tolist() for k, g in movieFile.groupby('moviename') } movieGenreDict = {k: list(set(j)) for k, j in movieGenreDict.items()} # creating a dictionary with movieid as key and a list of all ratings given by a user for that particular movie and removing duplicates movieRatingDict = { k: g['rating'].tolist() for k, g in mlratingsFile.groupby('moviename') } movieRatingDict = {k: list(set(j)) for k, j in movieRatingDict.items()} # computing the average rating for all movies and storing in a dictionary 
avgRating = mlratingsFile.groupby('moviename').mean().reset_index() avgRatingDict = { k: g['rating'].tolist() for k, g in avgRating.groupby('moviename') } # List of unique movies, genres and ratings movieList = mlratingsFile.moviename.unique() movieList = np.asarray(movieList) movieListDict = dict(enumerate(movieList)) genreList = movieFile.genres.unique() genreList = np.asarray(genreList) genreListDict = dict(enumerate(genreList)) ratingList = mlratingsFile.rating.unique() ratingList = np.asarray(ratingList) ratingListDict = dict(enumerate(ratingList)) movieListDictInverse = invertDictionary(movieListDict) genreListDictInverse = invertDictionary(genreListDict) ratingListDictInverse = invertDictionary(ratingListDict) movieNotWatched = list(set(movieList) ^ set(tagRatingUserMovies)) # declaring a tensor with three modes - with movie, tags and ratings T = np.zeros((movieList.shape[0], genreList.shape[0], ratingList.shape[0])) arrayofvalues = [] for i in movieList: if i in movieRatingDict: if i in movieGenreDict: movieTags = movieGenreDict[i] rList = movieRatingDict[i] for j in movieTags: for k in rList: mIndex = movieListDictInverse[i] gIndex = genreListDictInverse[j] rIndex = ratingListDictInverse[k] avgRatingValue = avgRatingDict[i][0] if k >= avgRatingValue: T[mIndex, gIndex, rIndex] = 1 arrayofvalues.append([mIndex, gIndex, rIndex]) else: T[mIndex, gIndex, rIndex] = 0 # building the tensor using sktensor tensor = dtensor(T) # applying CP-decomposition with ALS(Alternating Least Squares) U, fit, itr, exectimes, P = cp_als(tensor, 5, init='random') latent_semantics_movie = pd.DataFrame( columns=['movie', 'ls1', 'ls2', 'ls3', 'ls4', 'ls5']) latent_semantics_movie['movie'] = movieList latent_semantics_movie['ls1'] = U[0][:, 0] latent_semantics_movie['ls2'] = U[0][:, 1] latent_semantics_movie['ls3'] = U[0][:, 2] latent_semantics_movie['ls4'] = U[0][:, 3] latent_semantics_movie['ls5'] = U[0][:, 4] x = latent_semantics_movie.loc[latent_semantics_movie['movie'].isin( tagRatingUserMovies)].values for i in range(len(x)): for j in range(1, len(x[0])): x[i][j] = x[i][j] * timeWeights.get(x[i][0]) y = latent_semantics_movie.loc[latent_semantics_movie['movie'].isin( movieNotWatched)].values cossim = cosine_similarity(x[:, 1:], y[:, 1:]) simDF = pd.DataFrame(cossim, index=tagRatingUserMovies, columns=movieNotWatched) simDF.to_csv('cos.csv') temp = simDF.values.tolist() sorted_movies_for_each_watched_movieDict = [] for i in range(len(temp)): sorted_movies_for_each_watched_movie = np.argsort(temp[i]) sorted_movies_for_each_watched_movieDict.append( sorted_movies_for_each_watched_movie.tolist()[:10]) sortedMoviesRavel = [ item for sublist in sorted_movies_for_each_watched_movieDict for item in sublist ] freq = {} for i in range(len(sorted_movies_for_each_watched_movieDict)): for j in range(len(sorted_movies_for_each_watched_movieDict[0])): freq[sorted_movies_for_each_watched_movieDict[i][j]] = 0 for i in range(len(sorted_movies_for_each_watched_movieDict)): for j in range(len(sorted_movies_for_each_watched_movieDict[0])): freq[sorted_movies_for_each_watched_movieDict[i][j]] += (10 - j) freq = OrderedDict(sorted(freq.items(), reverse=True, key=lambda t: t[1])) freq = freq.items() recommendedMovies = [] for i in range(10): index = freq[i][0] recommendedMovies.append(y[index][0]) relevant = [] notRelevant = [] choice = 'y' while choice != 'n': rel_dict = {} selected_dict = {} N = 5 R = 0 for i in range(len(recommendedMovies)): print "If ", recommendedMovies[ i], " is relevant, enter 1. 
If it is not relevant, enter 0" relevant.append(int(raw_input())) rel_dict[recommendedMovies[i]] = relevant[i] if relevant[i] == 1: R = R + 1 else: notRelevant.append(recommendedMovies[i]) genreset = set() for movie in recommendedMovies: genres_list = movieGenreDict[movie] selected_dict[movie] = genres_list genreset = genreset.union(set(genres_list)) genreTop5 = list(genreset) ri = [] ni = [] for i in range(0, len(genreTop5)): ri.append(0) ni.append(0) for m in recommendedMovies: for i in range(0, len(genreTop5)): l1 = selected_dict[m] rval = rel_dict[m] if genreTop5[i] in l1: ni[i] = ni[i] + 1 if rval == 1: ri[i] = ri[i] + 1 pr_feedback = {} for i in range(0, len(genreTop5)): try: numerator = ri[i] / (R - ri[i]) denominator = (ni[i] - ri[i]) / (N - R - ni[i] + ri[i]) pr = math.log((numerator / denominator), 2) except: numerator = (ri[i] + 0.5) / (R - ri[i] + 1) denominator = (ni[i] - ri[i] + 0.5) / (N - R - ni[i] + ri[i] + 1) pr = math.log((numerator / denominator), 2) pr_feedback[genreTop5[i]] = pr for key, value in pr_feedback.iteritems(): pr_feedback[key] = (pr_feedback[key] - min( pr_feedback.values())) / max(pr_feedback.values()) pr_dict = {} for i in movieList: if i in movieRatingDict: if i in movieGenreDict: movieTags = movieGenreDict[i] rList = movieRatingDict[i] for j in movieTags: for k in rList: mIndex = movieListDictInverse[i] tIndex = genreListDictInverse[j] rIndex = ratingListDictInverse[k] avgRatingValue = avgRatingDict[i][0] if k >= avgRatingValue: if j in genreTop5: T[mIndex, tIndex, rIndex] *= pr_feedback[j] tensor = dtensor(T) # applying CP-decomposition with ALS(Alternating Least Squares) U, fit, itr, exectimes, P = cp_als(tensor, 5, init='random') latent_semantics_movie = pd.DataFrame( columns=['movie', 'ls1', 'ls2', 'ls3', 'ls4', 'ls5']) latent_semantics_movie['movie'] = movieList latent_semantics_movie['ls1'] = U[0][:, 0] latent_semantics_movie['ls2'] = U[0][:, 1] latent_semantics_movie['ls3'] = U[0][:, 2] latent_semantics_movie['ls4'] = U[0][:, 3] latent_semantics_movie['ls5'] = U[0][:, 4] x = latent_semantics_movie.loc[latent_semantics_movie['movie'].isin( tagRatingUserMovies)].values for i in range(len(x)): for j in range(1, len(x[0])): x[i][j] = x[i][j] * timeWeights.get(x[i][0]) y = latent_semantics_movie.loc[latent_semantics_movie['movie'].isin( movieNotWatched)].values cossim = cosine_similarity(x[:, 1:], y[:, 1:]) simDF = pd.DataFrame(cossim, index=tagRatingUserMovies, columns=movieNotWatched) temp = simDF.values.tolist() sorted_movies_for_each_watched_movieDict = [] for i in range(len(temp)): sorted_movies_for_each_watched_movie = np.argsort(temp[i]) sorted_movies_for_each_watched_movieDict.append( sorted_movies_for_each_watched_movie.tolist()[:10]) sortedMoviesRavel = [ item for sublist in sorted_movies_for_each_watched_movieDict for item in sublist ] freq = {} for i in range(len(sorted_movies_for_each_watched_movieDict)): for j in range(len(sorted_movies_for_each_watched_movieDict[0])): freq[sorted_movies_for_each_watched_movieDict[i][j]] = 0 for i in range(len(sorted_movies_for_each_watched_movieDict)): for j in range(len(sorted_movies_for_each_watched_movieDict[0])): freq[sorted_movies_for_each_watched_movieDict[i][j]] += (10 - j) freq = OrderedDict( sorted(freq.items(), reverse=True, key=lambda t: t[1])) freq = freq.items() recommendedMovies = [] for i in range(10): index = freq[i][0] recommendedMovies.append(y[index][0]) print recommendedMovies relevant = [] print('Do you want to continue? 
Enter Y for yes and N for No') choice = raw_input().lower() while choice not in ['y', 'n']: print('invalid input') choice = raw_input().lower()
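# Standalone sketch (not part of the script above, written for Python 3) of the
# probabilistic relevance-feedback weight it computes per genre, including the
# 0.5 smoothing fallback. The numbers in the example call are made up.
import math

def feedback_weight(ri, ni, R, N):
    """ri: relevant recommended movies containing the genre,
       ni: all recommended movies containing the genre,
       R:  number of relevant recommendations, N: total recommendations."""
    try:
        numerator = ri / (R - ri)
        denominator = (ni - ri) / (N - R - ni + ri)
        return math.log(numerator / denominator, 2)
    except (ZeroDivisionError, ValueError):
        numerator = (ri + 0.5) / (R - ri + 1)
        denominator = (ni - ri + 0.5) / (N - R - ni + ri + 1)
        return math.log(numerator / denominator, 2)

print(feedback_weight(ri=3, ni=4, R=4, N=10))  # genre over-represented among relevant picks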
def buildSimilarityMatrix(): #### Ideas for Optimization: # DONE if you come across a duplicate, for ex: (m1,m2) and (m2,m1) then look up this value and # WatchedBoth = ( "MATCH (U:USER) WHERE (U:USER)-[:Has_rated]->(:MOVIE{id:{A}}) " "AND (U:USER)-[:Has_rated]->(:MOVIE{id:{B}}) RETURN U.id") findRating = "MATCH (USER {id:{user_id}})-[r:Has_rated]->(MOVIE{id:{movie_id}}) RETURN r.rating" numMovies = graph.evaluate("MATCH (m:MOVIE) RETURN COUNT(m)") m1_ratings = [] m2_ratings = [] angle_in_degrees = 0 Row = [] matrix = [] pr = cProfile.Profile() # timing the queries for m1 in range(1, 3): #numMovies+1): pr.enable() for m2 in range(1, numMovies + 1): if m2 < m1: angle_in_degrees = matrix[m2 - 1][m1 - 1] # on diagonal if m1 == m2: angle_in_degrees = 0 else: # Find 'users' who've watched both m1 and m2 users = graph.run(WatchedBoth, { "A": m1, "B": m2 }).data() #[0]['U.id'] if len(users) == 0: angle_in_degrees = 90 else: # create arrays of m1's and m2's ratings for u in users: m1rating = graph.evaluate(findRating, { "user_id": u['U.id'], "movie_id": m1 }) m1_ratings.append(m1rating) m2rating = graph.evaluate(findRating, { "user_id": u['U.id'], "movie_id": m2 }) m2_ratings.append(m2rating) # create vector v1 andv2 v1 = np.array(m1_ratings).reshape(1, -1) v2 = np.array(m2_ratings).reshape(1, -1) # calculate cosine similarity similarity = cosine_similarity(v1, v2) similarity = np.clip(similarity, -1, 1) angle_in_radians = math.acos(similarity) angle_in_degrees = math.degrees(angle_in_radians) m1_ratings = [] m2_ratings = [] users = [] Row.append(angle_in_degrees) pr.disable() pr.print_stats() matrix.append(Row) Row = [] df = pd.DataFrame(matrix) print(df) return
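# Minimal illustration (not part of the function above) of the similarity-to-angle
# conversion buildSimilarityMatrix relies on: identical rating vectors give 0 degrees,
# and movie pairs with no shared raters are treated as 90 degrees apart.
import math
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

v1 = np.array([5, 3, 4]).reshape(1, -1)
v2 = np.array([4, 2, 4]).reshape(1, -1)
sim = float(np.clip(cosine_similarity(v1, v2), -1, 1)[0, 0])
print(math.degrees(math.acos(sim)))  # a small angle -> the two movies are rated similarly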
def docSimilarity(invIndex, query1, query2):
    documentsMatrix = sparseMatrix(invIndex, query1)
    queryVector = sparseMatrix(invIndex, query2)
    similarity = cosine_similarity(documentsMatrix, queryVector)
    return similarity
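# Hedged usage sketch: `sparseMatrix` and `invIndex` are not shown in this snippet,
# so this illustrates the same idea with a TfidfVectorizer standing in for the
# inverted-index representation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["the cat sat on the mat", "dogs chase cats", "stock markets fell today"]
vectorizer = TfidfVectorizer()
documentsMatrix = vectorizer.fit_transform(docs)
queryVector = vectorizer.transform(["cat on a mat"])
print(cosine_similarity(documentsMatrix, queryVector))  # one similarity score per document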
def return_top_ranked_sentences(news_content): import numpy as np import pandas as pd from nltk import tokenize def process_text(news_content): #inputfilepath = inputfilepath = "C:\\Users\\ekrigos\\Desktop\\DataS\\REVA\\finalyearProj\\pdf_analysis\\inputs_healthHazard\\" #filename = 'server_noise_health_hazard.pdf.txt' # fileName1 = inputfilepath + filename # # file = open(fileName1,"r") # # fullText = file.read() # file.close() tokens_lst = tokenize.sent_tokenize(news_content) #print(tokens_lst[:1]) return tokens_lst sentences = process_text(news_content) #print(sentences[:1]) #clean the data # remove punctuations, numbers and special characters clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ") # make alphabets lowercase clean_sentences = [s.lower() for s in clean_sentences] #remove stopwords #nltk.download('stopwords') from nltk.corpus import stopwords stop_words = stopwords.words('english') # function to remove stopwords def remove_stopwords(sen): sen_new = " ".join([i for i in sen if i not in stop_words]) return sen_new # remove stopwords from the sentences clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences] #extract the word embeddings word_embeddings = {} f = open('c:\\datasets\\glove\\glove.6B.100d.txt', encoding='utf-8') for line in f: values = line.split() word = values[0] coefs = np.asarray(values[1:], dtype='float32') word_embeddings[word] = coefs f.close() len(word_embeddings) #let's create vectors for our sentences sentence_vectors = [] for i in clean_sentences: if len(i) != 0: v = sum( [word_embeddings.get(w, np.zeros((100, ))) for w in i.split()]) / (len(i.split()) + 0.001) else: v = np.zeros((100, )) sentence_vectors.append(v) #similarity matrix representation #Let’s first define a zero matrix of dimensions (n * n). #We will initialize this matrix with cosine similarity scores of the sentences. #Here, n is the number of sentences. # similarity matrix sim_mat = np.zeros([len(sentences), len(sentences)]) #We will use Cosine Similarity to compute the similarity between a pair of sentences. from sklearn.metrics.pairwise import cosine_similarity #And initialize the matrix with cosine similarity scores. for i in range(len(sentences)): for j in range(len(sentences)): if i != j: sim_mat[i][j] = cosine_similarity( sentence_vectors[i].reshape(1, 100), sentence_vectors[j].reshape(1, 100))[0, 0] # ============================================================================= # let’s convert the similarity matrix sim_mat into a graph. # The nodes of this graph will represent the sentences and # the edges will represent the similarity scores between the sentences. # On this graph, we will apply the PageRank algorithm to arrive at the sentence rankings. 
# ============================================================================= import networkx as nx nx_graph = nx.from_numpy_array(sim_mat) scores = nx.pagerank(nx_graph) #Summary Extraction #extract the top N sentences based on their rankings for summary generation ranked_sentences = sorted( ((scores[i], s) for i, s in enumerate(sentences)), reverse=True) top_ranked_sentences = [] # Extract top 10 sentences as the summary for i in range(10): #print(ranked_sentences[i][1]) top_ranked_sentences.append(ranked_sentences[i][1]) print("Inside Summary function") print(top_ranked_sentences) return top_ranked_sentences #dump into a file #with open(output_file_name_top_ranked, 'w') as filehandle: # for listitem in top_ranked_sentences: # filehandle.write('%s\n' % listitem) ########## #https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/ #########
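# Hypothetical usage sketch: assumes the NLTK punkt/stopwords data and the GloVe
# file path hard-coded inside return_top_ranked_sentences are available, and that
# the article contains at least 10 sentences.
if __name__ == "__main__":
    with open("sample_article.txt", encoding="utf-8") as f:   # hypothetical input file
        article_text = f.read()
    summary_sentences = return_top_ranked_sentences(article_text)
    for sentence in summary_sentences:
        print(sentence)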
def get_loadings(agg_doc_vecs_path, agg_dic_vecs_path, out_path, num_features, delimiter='\t'): """Get loadings between each document vector in agg-doc_vecs_path and each dictionary dimension in agg_dic_vecs_path""" n_docs = float(file_length.file_len(agg_doc_vecs_path)) prog_counter = 0 counter = 0 dic_vecs = pd.read_csv(agg_dic_vecs_path, sep=delimiter) dic_vecs = dic_vecs.to_dict(orient='list') with open(agg_doc_vecs_path, 'rb') as doc_vecs, open(out_path, 'wb') as out_file: doc_vecs_reader = csv.reader(doc_vecs, delimiter='\t') doc_vecs_reader.next() writer = csv.writer(out_file, delimiter='\t') fieldnames_out = ['ID'] + dic_vecs.keys() writer.writerow(fieldnames_out) for doc_vec in doc_vecs_reader: prog_counter += 1 counter += 1 doc_id = doc_vec[0] out_row = [doc_id] for dic_vec in dic_vecs.keys(): doc_vec = [float(x) for x in doc_vec[-num_features:]] dic_similarity = cosine_similarity(doc_vec, dic_vecs[dic_vec])[0][0] out_row.append(dic_similarity) writer.writerow(out_row) if prog_counter >= 0.05 * n_docs: prog_counter = 0 update_progress(counter / (n_docs - 1)) print 'Finished calculating document loadings' #get_loadings('out_test.txt', 'dic_vecs_out_test.tsv', 'hope.tsv') # # if __name__ == "__main__": # # # This is not finished. # # if sys.argv[1] is 'make_dic_vecs': # # model, num_features, model_word_set = load_model(model_path=sys.argv[2]) # dic_terms = getDicTerms(sys.argv[3]) # dic_vecs = getAggDicVec(dic_terms) # writeDicVecs(dic_vecs=dic_vecs, out_path=sys.argv[4]) # # elif sys.argv[2] is 'make_doc_vecs': # # model, num_features, model_word_set = load_model(model_path=sys.argv[2]) # getAggDocVecs(docs_path=sys.argv[3], out_path=sys.argv[4], text_col=sys.argv[5]) # # # elif sys.argv[3] is 'get_loadings': # # model, num_features, model_word_set = load_model(model_path=sys.argv[2]) # get_loadings(agg_doc_vecs=sys.argv[3], agg_dic_vecs=sys.argv[4], out_path=sys.argv[5])
def eval_emb_metrics(hypothesis, references, emb=None): from sklearn.metrics.pairwise import cosine_similarity from nltk.tokenize import word_tokenize import numpy as np if emb is None: emb = Embedding() emb_hyps = [] avg_emb_hyps = [] extreme_emb_hyps = [] for hyp in hypothesis: embs = [emb.vec(word) for word in word_tokenize(hyp)] avg_emb = np.sum(embs, axis=0) / np.linalg.norm(np.sum(embs, axis=0)) assert not np.any(np.isnan(avg_emb)) maxemb = np.max(embs, axis=0) minemb = np.min(embs, axis=0) extreme_emb = list( map( lambda x, y: x if ((x > y or x < -y) and y > 0) or ( (x < y or x > -y) and y < 0) else y, maxemb, minemb)) emb_hyps.append(embs) avg_emb_hyps.append(avg_emb) extreme_emb_hyps.append(extreme_emb) emb_refs = [] avg_emb_refs = [] extreme_emb_refs = [] for refsource in references: emb_refsource = [] avg_emb_refsource = [] extreme_emb_refsource = [] for ref in refsource: embs = [emb.vec(word) for word in word_tokenize(ref)] avg_emb = np.sum(embs, axis=0) / np.linalg.norm( np.sum(embs, axis=0)) assert not np.any(np.isnan(avg_emb)) maxemb = np.max(embs, axis=0) minemb = np.min(embs, axis=0) extreme_emb = list( map( lambda x, y: x if ((x > y or x < -y) and y > 0) or ( (x < y or x > -y) and y < 0) else y, maxemb, minemb)) emb_refsource.append(embs) avg_emb_refsource.append(avg_emb) extreme_emb_refsource.append(extreme_emb) emb_refs.append(emb_refsource) avg_emb_refs.append(avg_emb_refsource) extreme_emb_refs.append(extreme_emb_refsource) cos_similarity = list( map(lambda refv: cosine_similarity(refv, avg_emb_hyps).diagonal(), avg_emb_refs)) cos_similarity = np.max(cos_similarity, axis=0).mean() average = "EmbeddingAverageCosineSimilairty: %0.6f" % (cos_similarity) cos_similarity = list( map(lambda refv: cosine_similarity(refv, extreme_emb_hyps).diagonal(), extreme_emb_refs)) cos_similarity = np.max(cos_similarity, axis=0).mean() extrema = "VectorExtremaCosineSimilarity: %0.6f" % (cos_similarity) scores = [] for emb_refsource in emb_refs: score_source = [] for emb_ref, emb_hyp in zip(emb_refsource, emb_hyps): simi_matrix = cosine_similarity(emb_ref, emb_hyp) dir1 = simi_matrix.max(axis=0).mean() dir2 = simi_matrix.max(axis=1).mean() score_source.append((dir1 + dir2) / 2) scores.append(score_source) scores = np.max(scores, axis=0).mean() greedy = "GreedyMatchingScore: %0.6f" % (scores) rval = "\n".join([average, extrema, greedy]) return rval
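# eval_emb_metrics only needs an object exposing .vec(word) -> 1-D numpy array;
# the real Embedding class is not shown here, so this is a hypothetical stand-in
# backed by an in-memory dictionary (zero vector for out-of-vocabulary words).
import numpy as np

class DictEmbedding(object):
    def __init__(self, word2vec, dim):
        self.word2vec = word2vec   # dict: word -> np.ndarray of length dim
        self.dim = dim

    def vec(self, word):
        return self.word2vec.get(word, np.zeros(self.dim))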
def main(output_dir, sim_threshold, bucket_size): if not os.path.exists(output_dir): os.makedirs(output_dir) dev_qids = set([ 19, 23, 27, 34, 35, ] + [7, 24]) summary_data = [] K_data = [] for event in cuttsum.events.get_events(): if event.query_num in dev_qids: continue print event semsim = event2semsim(event) istream = get_input_stream(event, False, extractor="goose", thresh=.8, delay=None, topk=20) prev_time = 0 cache = None clusters = [] max_h = len(event.list_event_hours()) - 1 for h, hour in enumerate(event.list_event_hours()): if h % bucket_size != 0 and h != max_h: continue current_time = epoch(hour) input_sents = istream[ (istream["timestamp"] < current_time) & \ (istream["timestamp"] >= prev_time)] len_select = input_sents["lemmas stopped"].apply(len) > 10 input_sents = input_sents[len_select] if len(input_sents) <= 1: continue stems = input_sents["stems"].apply(lambda x: ' '.join(x)).tolist() X = semsim.transform(stems) K = -(1 - cosine_similarity(X)) K_ma = np.ma.masked_array(K, np.eye(K.shape[0])) Kmin = np.ma.min(K_ma) Kmax = np.ma.max(K_ma) median = np.ma.median(K_ma)[0] print "SYS TIME:", hour, "# SENTS:", K.shape[0], print "min/median/max pref: {}/{}/{}".format(Kmin, median, Kmax) # ap = AffinityPropagation(affinity="precomputed", verbose=True, max_iter=1000) ap.fit(K) labels = ap.labels_ if ap.cluster_centers_indices_ != None: for c in ap.cluster_centers_indices_: if cache == None: cache = X[c] updates_df = \ input_sents.reset_index(drop=True).iloc[c] updates_df["query id"] = event.query_num updates_df["system timestamp"] = current_time summary_data.append(updates_df[[ "query id", "stream id", "sent id", "system timestamp", "sent text" ]].to_frame().T) else: Ksum = cosine_similarity(cache, X[c]) if Ksum.max() < sim_threshold: cache = np.vstack([cache, X[c]]) updates_df = \ input_sents.reset_index(drop=True).iloc[c] updates_df["query id"] = event.query_num updates_df["system timestamp"] = current_time summary_data.append(updates_df[[ "query id", "stream id", "sent id", "system timestamp", "sent text" ]].to_frame().T) prev_time = current_time df = pd.DataFrame(K_data, columns=["min", "max", "median"]) print df print df.mean() print df.std() print df.max() df = pd.concat(summary_data) df["conf"] = .5 df["team id"] = "AP" df["run id"] = "sim{}_bs{}".format(sim_threshold, bucket_size) print df of = os.path.join( output_dir, "ap." + "sim{}_bs{}.tsv".format(sim_threshold, bucket_size)) cols = [ "query id", "team id", "run id", "stream id", "sent id", "system timestamp", "conf" ] df[cols].to_csv(of, sep="\t", header=False, index=False)
def combinedfeatures(row):
    return row['keywords'] + ' ' + row['cast'] + ' ' + row['genres'] + ' ' + row['director']

for feature in features:
    df[feature] = df[feature].fillna('')

df['combinedfeature'] = df.apply(combinedfeatures, axis=1)
#print(df['combinedfeature'])

cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combinedfeature'])
cosine_sim = cosine_similarity(count_matrix)

def get_index_from_title(title):
    return df[df.title == title]['index'].values[0]

def get_title_from_index(index):
    return df[df.index == index]['title'].values[0]

movie_user_liked = input('enter the movie name : ')
movie_index = get_index_from_title(movie_user_liked)
similar_movies = list(enumerate(cosine_sim[movie_index]))
# sort candidate movies by similarity score, highest first
sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
def similarity(vec1, vec2):
    vec1 = vec1.reshape(1, -1)
    vec2 = vec2.reshape(1, -1)
    return cosine_similarity(vec1, vec2)[0][0]
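# Quick illustrative check (assumes numpy and the sklearn import used above):
# for 1-D vectors the helper equals the dot product divided by the product of norms.
import numpy as np

u = np.array([1.0, 2.0, 3.0])
v = np.array([2.0, 4.0, 6.0])
manual = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
print(similarity(u, v), manual)  # parallel vectors -> both values are 1.0 (up to float error)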
def first_level_grouping_within_group(feature_map_dict, encoded_list_rearrange_concat, mask_arr, all_keys, keys_1d, keys_2d, keys_3d=[]): height = 32 width = 20 relation_1d_df = pd.DataFrame(0, columns=keys_1d, index=keys_1d) relation_2d_df = pd.DataFrame(0, columns=keys_2d, index=keys_2d) relation_3d_df = pd.DataFrame(0, columns=keys_3d, index=keys_3d) num_data = len(encoded_list_rearrange_concat[0]) timestep = 24 # num_data for n in range(num_data): print('n: ', n) for ds_name1 in all_keys: # 1D case if ds_name1 in keys_1d: temp_arr1 = feature_map_dict[ds_name1][n, :] # (24, 1, 1, 1) # (24, 1) - > [32, 20, 24] dim1 = temp_arr1.shape[0] # number of layers in the 2d data # dim1 = temp_arr1.shape[-1] # number of layers in the 2d data for ds_name2 in all_keys: # 1D VS 1D if ds_name2 in keys_1d: temp_arr2 = feature_map_dict[ds_name2][n, :] sim_sparse = cosine_similarity( temp_arr1.reshape(1, -1), temp_arr2.reshape(1, -1)) ave_SR = sim_sparse[0][0] relation_1d_df.loc[ds_name1, ds_name2] += ave_SR # 2D case if ds_name1 in keys_2d: temp_arr1 = feature_map_dict[ds_name1][ n, :, :, :] # [32, 20, 1] # duplicate to [32, 20, 24] temp_arr1_mean_dup = np.repeat(temp_arr1, timestep, axis=-1) for ds_name2 in all_keys: # 2D Vs 2D # all duplicate to 3D if ds_name2 in keys_2d: ave_SR = 0 # average spearman correlation temp_arr2 = feature_map_dict[ds_name2][n, :, :, :] compress_arr2 = remove_outside_cells( temp_arr2, mask_arr) compress_arr1 = remove_outside_cells( temp_arr1, mask_arr) sim_sparse = cosine_similarity( compress_arr1.reshape(1, -1), compress_arr2.reshape(1, -1)) ave_SR = sim_sparse[0][0] relation_2d_df.loc[ds_name1, ds_name2] += ave_SR # 3D if ds_name1 in keys_3d: temp_arr1 = feature_map_dict[ds_name1][ n, :, :, :, :] # [24, 32, 20, 1] temp_arr1 = np.squeeze(temp_arr1, axis=-1) #[24, 32, 20] temp_arr1 = np.moveaxis(temp_arr1, 0, -1) # (32, 20, 24) for ds_name2 in all_keys: # 3D VS 3D # flatten and compare. Because 3rd dimension contains # temporal information if ds_name2 in keys_3d: temp_arr2 = feature_map_dict[ds_name2][n, :, :, :, :] temp_arr2 = np.squeeze(temp_arr2, axis=-1) #[24, 32, 20] temp_arr2 = np.moveaxis(temp_arr2, 0, -1) # (32, 20, 24) ave_SR = 0 # average spearman correlation compress_arr2 = remove_outside_cells( temp_arr2, mask_arr) compress_arr1 = remove_outside_cells( temp_arr1, mask_arr) sim_sparse = cosine_similarity( compress_arr1.reshape(1, -1), compress_arr2.reshape(1, -1)) ave_SR = float(sim_sparse[0][0]) relation_3d_df.loc[ds_name1, ds_name2] += ave_SR relation_1d_df = relation_1d_df / num_data relation_2d_df = relation_2d_df / num_data relation_3d_df = relation_3d_df / num_data return relation_1d_df, relation_2d_df, relation_3d_df
def dataframe_cosine_similarity(dataframe_1, dataframe_2):
    # Renamed: the original wrapper was itself called cosine_similarity, so it
    # shadowed the sklearn function and recursed on itself instead of calling it.
    return float(cosine_similarity(dataframe_1.values, dataframe_2.values))
def get_cosine_sim(*strs): vectors = [t for t in get_vectors(*strs)] return cosine_similarity(vectors)
def topic_wtv(wtv_model, n_top_words, topic_model): ret_val = [] for topic in topic_model.components_: ret_val.append(np.sum(np.array([wtv_model[item[0]]*item[1] for item in [(tf_feature_names[i],topic[i]) for i \ in np.argsort(topic)[:-n_top_words - 1:-1]]]),axis=0)/np.sum(np.sort(topic)[:-n_top_words - 1:-1])) return np.array(ret_val) #Word for which reviews are requested category = 'entertainment' #Column name for True column category_label = 'show' #Weighted average word2vec vector topic_av_wtv = topic_wtv(tweet_w2v, lda.components_.shape[1], lda) #Calculate similarity of word2vec vector of word with LDA topic topic_sim = cosine_similarity(topic_av_wtv, tweet_w2v[category].reshape(1, -1)) #Taking only top 3 important topic topic_sim[np.argsort(topic_sim.reshape((1, -1)))[0][::-1][3:]] = 0 #Checking for review similarity with LDA topic distribution of review rewiew_category = cosine_similarity(topic_sim.reshape((1, -1)), BoW_lda) rest1['cat_sim_unscale'] = rewiew_category[0] #Scaling review importance between 0 and 1 rest1['cat_sim'] = (rest1.cat_sim_unscale - min(rest1.cat_sim_unscale)) / ( max(rest1.cat_sim_unscale) - min(rest1.cat_sim_unscale)) #Creating column for ground truth rest1['Pizza_Italian'] = rest1.categories.apply(lambda a: 1 if (\ ('Pizza' in a)\ |('Italian' in a)) else 0) rest1['beverage'] = rest1.categories.apply(lambda a: 1 if (\ ('Tea Rooms' in a)|\ ('Wineries' in a)|\
def recommendation_drink_of_contents_based(LiquorNum="", stopword="", top=6):
    # The pairwise similarity matrices are already stored in Redis;
    # they are recomputed whenever a new liquor is added or an existing one is removed.
    ## Must be fetched from the DB.
    drink_dataframe = pd.DataFrame(models.Liquor.objects.all().values())
    drink_dataframe.set_index(drink_dataframe['liquornumber'], inplace=True)
    drink_dataframe.drop(columns=['liquornumber'], inplace=True)
    indexList = list(drink_dataframe['liquorname'])
    target = int(LiquorNum) - 1
    keyword = indexList[target]
    print(keyword + " : computing similar liquors .... ")
    # Exclude liquors containing an ingredient the user cannot drink
    if len(stopword) > 0:
        drink_dataframe.drop(index=[
            i for i, item in enumerate(drink_dataframe["liquoringredient"])
            if len(list(set([stopword]) & set(item.split(",")))) > 0
        ], inplace=True)
    # Compute similarity from the numeric columns only, excluding origin region,
    # category and ingredients
    exceptList = [
        "liquorname", "liquorarea", "liquoringredient", "url", 'liquorcategory'
    ]
    drink_dataframe_without_literal = drink_dataframe.drop(columns=exceptList)
    # Normalize the data with MinMax scaling
    scaleList = [
        item for item in drink_dataframe_without_literal.columns
        if item not in exceptList
    ]  # columns selected for normalization
    scaler = MinMaxScaler()
    drink_dataframe_without_literal[scaleList] = scaler.fit_transform(
        drink_dataframe_without_literal[scaleList])
    drink_datafrmae_with_normalization = drink_dataframe_without_literal[scaleList]
    # Dictionary holding the Pearson & cosine similarity results
    similarity_dict = dict()
    # Recommend the top 5 traditional liquors by Pearson similarity
    pearson_similarity_metrix = drink_datafrmae_with_normalization.T.corr(
        method="pearson").to_numpy()
    topid = sorted(range(len(pearson_similarity_metrix[target])),
                   key=lambda i: pearson_similarity_metrix[target][i])[-top:]
    return [i + 1 for i in reversed(topid)][1:]
    # For Testing
    # Recommend the top 5 traditional liquors by cosine similarity
    cosine_similarity_metrix = cosine_similarity(
        drink_datafrmae_with_normalization)
    # The item most similar to the query liquor will be itself
    index = LiquorNum
    topid = sorted(range(len(cosine_similarity_metrix[index])),
                   key=lambda i: cosine_similarity_metrix[index][i])[-top:]
    recommendation_drink_of_contents_based_top_five = []
    for i in range(top - 2, 0, -1):
        recommendation_drink_of_contents_based_top_five.append([
            np.array(indexList[2:])[topid][:-1][i],
            round(cosine_similarity_metrix[index][topid][:-1][i] * 100, 3)
        ])
    similarity_dict["cosine"] = recommendation_drink_of_contents_based_top_five
    return similarity_dict
def pipeline_test(train, test, lim_unigram): """ Process test set Returns: test_set: list, of numpy arrays """ # Initialise heads = [] heads_track = {} bodies = [] bodies_track = {} body_ids = [] test_heads = [] test_heads_track = {} test_bodies = [] test_bodies_track = {} test_body_ids = [] # Identify unique heads and bodies for instance in train.instances: head = instance['Headline'] body_id = instance['Body ID'] if head not in heads_track: heads.append(head) heads_track[head] = 1 if body_id not in bodies_track: bodies.append(train.bodies[body_id]) bodies_track[body_id] = 1 body_ids.append(body_id) for instance in test.instances: head = instance['Headline'] body_id = instance['Body ID'] if head not in test_heads_track: test_heads.append(head) test_heads_track[head] = 1 if body_id not in test_bodies_track: test_bodies.append(test.bodies[body_id]) test_bodies_track[body_id] = 1 test_body_ids.append(body_id) # Create vectorizers and BOW and TF arrays for train set bow_vectorizer = CountVectorizer(max_features=lim_unigram, stop_words=stop_words) bow = bow_vectorizer.fit_transform(heads + bodies) tfreq_vectorizer = TfidfTransformer(use_idf=False).fit(bow) tfidf_vectorizer = TfidfVectorizer(max_features=lim_unigram, stop_words=stop_words).\ fit(heads + bodies + test_heads + test_bodies) # Initialise test_set = [] heads_track = {} bodies_track = {} cos_track = {} # Process test set for instance in test.instances: head = instance['Headline'] body_id = instance['Body ID'] if head not in heads_track: head_bow = bow_vectorizer.transform([head]).toarray() head_tf = tfreq_vectorizer.transform( head_bow).toarray()[0].reshape(1, -1) head_tfidf = tfidf_vectorizer.transform([head]).toarray().reshape( 1, -1) heads_track[head] = (head_tf, head_tfidf) else: head_tf = heads_track[head][0] head_tfidf = heads_track[head][1] if body_id not in bodies_track: body_bow = bow_vectorizer.transform([test.bodies[body_id] ]).toarray() body_tf = tfreq_vectorizer.transform( body_bow).toarray()[0].reshape(1, -1) body_tfidf = tfidf_vectorizer.transform( [test.bodies[body_id]]).toarray().reshape(1, -1) bodies_track[body_id] = (body_tf, body_tfidf) else: body_tf = bodies_track[body_id][0] body_tfidf = bodies_track[body_id][1] if (head, body_id) not in cos_track: tfidf_cos = cosine_similarity(head_tfidf, body_tfidf)[0].reshape(1, 1) cos_track[(head, body_id)] = tfidf_cos else: tfidf_cos = cos_track[(head, body_id)] feat_vec = np.squeeze(np.c_[head_tf, body_tf, tfidf_cos]) test_set.append(feat_vec) return test_set
#similary_sc.to_csv(direc + 'similarity_score') #%% similary_sc = pd.read_csv(join(direc, 'similarity_score'), names = ['score']) #%% from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from sklearn.cluster import KMeans from sklearn.decomposition import PCA, KernelPCA tv = TfidfVectorizer(min_df=5, use_idf=True) tv_matrix = tv.fit_transform(sentences) tv_matrix = tv_matrix.toarray() vocab = tv.get_feature_names() similarity_matrix = cosine_similarity(tv_matrix) similarity_df = pd.DataFrame(similarity_matrix) #%% ##topic modeling #lda = LatentDirichletAllocation(n_components=2, max_iter=2, random_state=0) #dt_matrix = lda.fit_transform(tv_matrix) #features = pd.DataFrame(dt_matrix, columns=['T1', 'T2']) #tt_matrix = lda.components_ # #for topic_weights in tt_matrix: # topic = [(token, weight) for token, weight in zip(vocab, topic_weights)] # topic = sorted(topic, key=lambda x: -x[1]) # topic = [item for item in topic if item[1] > 0.9]
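# KMeans and PCA are imported above but not used in this fragment; a plausible
# next step (a sketch, not the author's code) clusters the TF-IDF vectors and
# inspects how the sentences group together. Assumes `sentences` is the list of
# strings the vectorizer was fitted on.
km = KMeans(n_clusters=3, random_state=0)
cluster_labels = km.fit_predict(tv_matrix)
clusters_df = pd.DataFrame({'sentence': sentences, 'cluster': cluster_labels})
print(clusters_df.groupby('cluster').size())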
def visualize_heatmap(topic_model, topics: List[int] = None, top_n_topics: int = None, n_clusters: int = None, width: int = 800, height: int = 800) -> go.Figure: """ Visualize a heatmap of the topic's similarity matrix Based on the cosine similarity matrix between topic embeddings, a heatmap is created showing the similarity between topics. Arguments: topic_model: A fitted BERTopic instance. topics: A selection of topics to visualize. top_n_topics: Only select the top n most frequent topics. n_clusters: Create n clusters and order the similarity matrix by those clusters. width: The width of the figure. height: The height of the figure. Returns: fig: A plotly figure Usage: To visualize the similarity matrix of topics simply run: ```python topic_model.visualize_heatmap() ``` Or if you want to save the resulting figure: ```python fig = topic_model.visualize_heatmap() fig.write_html("path/to/file.html") ``` <iframe src="../../getting_started/visualization/heatmap.html" style="width:1000px; height: 720px; border: 0px;""></iframe> """ # Select topic embeddings if topic_model.topic_embeddings is not None: embeddings = np.array(topic_model.topic_embeddings) else: embeddings = topic_model.c_tf_idf # Select topics based on top_n and topics args if topics is not None: topics = list(topics) elif top_n_topics is not None: topics = sorted(topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1]) else: topics = sorted(list(topic_model.get_topics().keys())) # Order heatmap by similar clusters of topics if n_clusters: if n_clusters >= len(set(topics)): raise ValueError("Make sure to set `n_clusters` lower than " "the total number of unique topics.") embeddings = embeddings[[topic + 1 for topic in topics]] distance_matrix = cosine_similarity(embeddings) Z = linkage(distance_matrix, 'ward') clusters = fcluster(Z, t=n_clusters, criterion='maxclust') # Extract new order of topics mapping = {cluster: [] for cluster in clusters} for topic, cluster in zip(topics, clusters): mapping[cluster].append(topic) mapping = [cluster for cluster in mapping.values()] sorted_topics = [topic for cluster in mapping for topic in cluster] else: sorted_topics = topics # Select embeddings indices = np.array([topics.index(topic) for topic in sorted_topics]) embeddings = embeddings[indices] distance_matrix = cosine_similarity(embeddings) # Create nicer labels new_labels = [[[str(topic), None]] + topic_model.get_topic(topic) for topic in sorted_topics] new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels] new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels] fig = px.imshow(distance_matrix, labels=dict(color="Similarity Score"), x=new_labels, y=new_labels, color_continuous_scale='GnBu' ) fig.update_layout( title={ 'text': "<b>Similarity Matrix", 'y': .95, 'x': 0.55, 'xanchor': 'center', 'yanchor': 'top', 'font': dict( size=22, color="Black") }, width=width, height=height, hoverlabel=dict( bgcolor="white", font_size=16, font_family="Rockwell" ), ) fig.update_layout(showlegend=True) fig.update_layout(legend_title_text='Trend') return fig
def Redundancy(xi, xj, count_vect, tfidf):
    # xi and xj are two sentences in the summary
    return cosine_similarity(getTfidf(xi, count_vect, tfidf),
                             getTfidf(xj, count_vect, tfidf)).flatten()[0]
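# `getTfidf` is not shown in this snippet; it only needs to return a TF-IDF row
# vector for one piece of text, so a minimal assumed version could be:
def getTfidf(text, count_vect, tfidf):
    # count_vect: a fitted CountVectorizer, tfidf: a fitted TfidfTransformer
    return tfidf.transform(count_vect.transform([text]))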
import streamlit as st
from stream import rec_sim, simulate_matches2, recommendations3, match3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import *

df_scaled2 = pd.read_csv('df_scaled2.csv', index_col='Team')
team_list = df_scaled2.index.tolist()
df_stats_poss3 = pd.read_csv('df_stats_poss3.csv', index_col='Team')
cos_sim1 = cosine_similarity(df_scaled2)
df_leagues = pd.read_csv('df_stats_leagues.csv', index_col=0)
indices = pd.Series(df_stats_poss3.index)
df_merged2 = pd.read_csv('df_merged2.csv')
df_merged2 = df_merged2.drop(df_merged2.columns[0], axis=1)
col_list1 = df_merged2.columns[62:91]
col_list2 = df_merged2.columns[6:62]
col_list2 = col_list2.append(df_merged2.columns[1:3])
results = df_merged2.drop(['home_goals', 'away_goals', 'Home', 'Away'], axis=1)
results['winner'] = None
# label the match outcome; use .loc to avoid chained-assignment writes that pandas may drop
results.loc[results.score > 0, 'winner'] = 1
results.loc[results.score < 0, 'winner'] = 2
results.loc[results.score == 0, 'winner'] = 0
y = results['winner'].astype(int)
X = results[col_list1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)
lr = LogisticRegression(C=0.1)
lr.fit(X_train, y_train)

def simulate_matches2(team, team2, n_matches=50):
def occ_vec(u, v):
    return cosine_similarity(u, v)[0][0]
def get_cos_sim(dset, n_cats, dtype, dset_name, version, sim_type, IPC_dict=None): """ This will take a dataset and calculate the cosine similiarity within and between classes, producing a csv with results and updating a main doc. :param dset: data to be tested, csv, (pd or np array?) :param n_cats: number of classes (items per-class calculated as items/classes) :param dtype: binary, chan_dist or chanProp. only needed for labelling :param dset_name: of dataset eg HBHW, HBLW, LBHW, LBLW :param version: number with 2 versions of each type :param sim_type: Describe the similarity e.g., HBHW or vary etc :param IPC_dict: defalt = None. if the number of items per class is not equal, enter a dict """ print("\nrunning ** get_cos_sim()**") file_path = "/home/nm13850/Documents/PhD/python_v2/experiments/" \ "within_between_dist_july2020/New_data/" if running_on_laptop(): file_path = '/Users/nickmartin/Library/Mobile Documents/com~apple~CloudDocs/' \ 'Documents/PhD/python_v2/experiments/' \ 'within_between_dist_july2020/New_data/' save_path = os.path.join(file_path, 'similarity_details') # # enter either 'cos_sim, 'cos_dist' or 'taxi' distance = 'cos_sim' dataset = np.asarray(dset) items, features = np.shape(dataset) print(f'\ndataset: {dataset}') print(f'items, features: {items}, {features}') # add IPC dict here if class_sizes are not equal if IPC_dict is None: cat_size = int(items / n_cats) IPC_dict = {i: cat_size for i in range(n_cats)} print(f'\nequal size IPC dict\n{IPC_dict}') else: print("using IPC dict") # separate out the individual classes # start with class inidices list containing zero, index of the first class class_indices = [0] IPC_vals = list(IPC_dict.values()) print(f'\nIPC_vals: {IPC_vals}') for i in range(n_cats): next_val = class_indices[-1] + IPC_vals[i] class_indices.append(next_val) # list of items numbers to start each class start_indices = class_indices[:n_cats] # print(f'\nstart_indices: {start_indices}') # list of indices to end each class end_indices = class_indices[1:] # print(f'end_indices: {end_indices}') # 1. define classes as slices of dataset array class_list = [] names_list = [] for cat in range(n_cats): this_name = f'class_{cat}' names_list.append(this_name) this_class = dataset[start_indices[cat]:end_indices[cat], :] class_list.append(this_class) # print(f'\n{this_name}\n{this_class}\n') # within class similarities # 3. make empty list to store results. 
within_list = [] for index, this_cat in enumerate(class_list): # print(f'\ngetting within class cos_sim for {names_list[index]}') # will do all pairwise comparrisons within the given category if distance in [ 'cos_sim', 'cosine_similarity', 'cosine_sim', 'cos_similarity' ]: within_cat = cosine_similarity(this_cat) # the SIMILARITY between two identical vectors will be 1 elif distance in [ 'cos_dist', 'cosine_distance', 'cosine_dist', 'cos_distance' ]: within_cat = cosine_distances(this_cat) # this DISTANCE between two identical vectors will be 0 # Cosine_distance = 1 - cosine_similarity elif distance in ['manhattan', 'taxi']: within_cat = manhattan_distances(this_cat) else: raise ValueError('must input a valid distance name') # print(within_cat) # just take the triangle since this analysis compares items with themselves triangle_indices = np.triu_indices(IPC_dict[index], 1) values_for_descriptives = (within_cat[triangle_indices]) # print(values_for_descriptives) data_similarity_descriptives = scipy.stats.describe( values_for_descriptives, axis=None) mean_sim = str(np.round(data_similarity_descriptives.mean, decimals=2)) print( f"\nWithin group mean {distance} for {names_list[index]}: {mean_sim}" ) within_list.append(mean_sim) print(f'\nwithin_list ({distance}): {within_list}\n') # between class similarities. print('\nbetween class similarities') ''' For each pair of classes - get the similarities of each item in one class to each item in the other class. - take the average of the whole matrix (not just the triangle) to get the mean similaritiy between these two classes. These mean between class similarities go into an n_cats x n_cats-1 matrix. (n_cats-1 because I am not going to have diagonals comparing classes with themselves. Each row shows a classes similarity to all other classes. - Take the average of each row to a get a class's mean between class similarity. Example below shows 4 classes (rows) and the values show which other class is being compared. e.g., class1 is compared with classes 2, 3, 4. Class2 is compared with classes 1, 3, 4. 
compA compB compC class1: 2 3 4 class2: 1 3 4 class3: 1 2 4 class4: 1 2 3 ''' class_pairs_list = list(combinations(class_list, 2)) class_names_list = list(combinations(names_list, 2)) class_index_list = list(combinations(range(n_cats), 2)) print( f'running {len(class_index_list)} between class comparrrions.\n{class_index_list}' ) between_array = np.zeros(shape=(n_cats, n_cats - 1)) for index, cat_pair in enumerate(class_pairs_list): cat_a = cat_pair[0] cat_name_a = class_names_list[index][0] cat_b = cat_pair[1] cat_name_b = class_names_list[index][1] print(f'\nbetween class {distance} for: {cat_name_a} and {cat_name_b}') # # do all pairwise comparrisons between the classes if distance in [ 'cos_sim', 'cosine_similarity', 'cosine_sim', 'cos_similarity' ]: between_pairs_matrix = cosine_similarity(X=cat_a, Y=cat_b) elif distance in [ 'cos_dist', 'cosine_distance', 'cosine_dist', 'cos_distance' ]: between_pairs_matrix = cosine_distances(X=cat_a, Y=cat_b) elif distance in ['manhattan', 'taxi']: between_pairs_matrix = manhattan_distances(X=cat_a, Y=cat_b) else: raise ValueError('must input a valid distance name') print(f'{between_pairs_matrix}') mean_between_pair = np.mean(between_pairs_matrix) print(f'mean_between_pair: {mean_between_pair}') # append to between array in both (ofset) diagonals idxA, idxB = class_index_list[index] print(f'add to matrix position: {idxA}, {idxB}') between_array[idxA, idxB - 1] = mean_between_pair between_array[idxB, idxA] = mean_between_pair print(f"\nbetween_array:\n{between_array}") print(f'\nmean between class {distance}') between_list = [] for index in range(n_cats): this_row = between_array[index] this_mean = np.mean(this_row) between_list.append(this_mean) print(index, this_mean) print("I want to get the mean of the between list and the within list") dset_between_mean = np.mean(between_list) dset_between_sd = np.std(between_list) print( f"dataset mean between class distance: {dset_between_mean} std.dev: {dset_between_sd}" ) print(f"check within list:\n{within_list}") within_list_num = [float(i) for i in within_list] print(f"check within_list_num:\n{within_list_num}") dset_within_mean = np.mean(within_list_num) dset_within_sd = np.std(within_list_num) print( f"dataset mean within class distance: {dset_within_mean} std.dev: {dset_within_sd}" ) # # save output. 
'''for each class: mean within mean between paired between ''' names_list.append('Dset_means') names_list.append('Dset_sd') within_list.append(dset_within_mean) within_list.append(dset_within_sd) between_list.append(dset_between_mean) between_list.append(dset_between_sd) class_sim_dict = { 'class': names_list, 'between': between_list, 'within': within_list } class_sim_df = pd.DataFrame(class_sim_dict) print(class_sim_df) csv_name = f'{dset_name}_{distance}.csv' csv_path = os.path.join(save_path, csv_name) class_sim_df.to_csv( csv_path, index_label='class', ) # check if similiarity summary exists similarity_info = [ dtype, dset_name, sim_type, version, n_cats, dset_between_mean, dset_between_sd, dset_within_mean, dset_within_sd ] print(f"similarity_info:\n{similarity_info}") # check if training_info.csv exists summary_name = 'similarity_summary.csv' print(f"\nlooking for file:\n{os.path.join(save_path, summary_name)}") if not os.path.isfile(os.path.join(save_path, summary_name)): print("making summary page") headers = [ "dtype", "dset_name", 'sim_type', "version", "n_cats", "mean_b", "sd_b", "mean_w", "sd_w" ] similarity_overview = open(os.path.join(save_path, summary_name), 'w') mywriter = csv.writer(similarity_overview) mywriter.writerow(headers) else: print("appending to summary page") similarity_overview = open(os.path.join(save_path, summary_name), 'a') mywriter = csv.writer(similarity_overview) mywriter.writerow(similarity_info) similarity_overview.close() return_dict = { "dtype": dtype, "dset_name": dset_name, 'sim_type': sim_type, "version": version, "n_cats": n_cats, "dset_between_mean": dset_between_mean, "dset_between_sd": dset_between_sd, "dset_within_mean": dset_within_mean, "dset_within_sd": dset_within_sd } return return_dict
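# Hypothetical call, illustrative values only: a random binary dataset with
# 3 classes of 4 items each. Assumes the module-level imports the function relies
# on (numpy, scipy, csv, os, ...) and the hard-coded save_path directory exist.
toy_dset = np.random.randint(0, 2, size=(12, 8))
sim_summary = get_cos_sim(dset=toy_dset, n_cats=3, dtype='binary',
                          dset_name='toy_check', version=1, sim_type='vary')
print(sim_summary['dset_within_mean'], sim_summary['dset_between_mean'])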
def Relevant(xi, count_vect, tfidf, documents):
    # xi is a sentence in the summary
    return cosine_similarity(
        getTfidf(xi, count_vect, tfidf),
        getTfidf(' '.join(documents), count_vect, tfidf)).flatten()[0] + getPosition(xi, documents)
# update weights after example for e in range(0, len(input_feed)): #for all items in input x = sess.run(optimizer, feed_dict={ train_inputs: [input_feed[e]], train_labels: [output_feed[e]] }) # collect vectors inp_vectors = {} for v in range(0, len(vocab)): inp_vectors[vocab[v]] = sess.run( embed, feed_dict={train_inputs: [word_to_index[vocab[v]]]}) # calculate similarities for v in inp_vectors: for vv in inp_vectors: sim_dict[v][vv].append( cosine_similarity(inp_vectors[v], inp_vectors[vv])[0][0]) print('Bass - Acoustic: ', np.mean(sim_dict['bass']['acoustic'])) print('Bass - Trout: ', np.mean(sim_dict['bass']['trout'])) print('Bass - Acoustic Std: ', np.std(sim_dict['bass']['acoustic'])) print('Bass - Trout Std: ', np.std(sim_dict['bass']['trout'])) #dframe = pd.DataFrame(sim_dict) #dframe.to_pickle('Random_Isub_100runs.pkl') sess.close()
def get_response(q):
    my_q = vectorizer.transform([q])
    cs = cosine_similarity(my_q, vec)
    rs = pd.Series(cs[0]).sort_values(ascending=False)
    rsi = rs.index[0]
    return convo_frame.iloc[rsi]['a']
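# The globals used by get_response (vectorizer, vec, convo_frame) are not defined
# in this snippet; a plausible setup (assumed here, not the original) fits a TF-IDF
# vectorizer on the question column of a small question/answer DataFrame.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

convo_frame = pd.DataFrame({'q': ['hi there', 'what is your name'],
                            'a': ['hello!', 'I am a bot']})   # toy data
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
vec = vectorizer.fit_transform(convo_frame['q'])
print(get_response('hi there'))   # should return 'hello!'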
def main(): synopses = [] Id = [] title = [] tags = [] for questions in question_cursor: synopses.append(questions["Body"]) Id.append(questions["Id"]) title.append(questions["Title"]) tags.append(questions["Tags"]) #use extend so it's a big flat list of vocab totalvocab_stemmed = [] totalvocab_tokenized = [] for i in synopses: i = text_preprocess(i) i = re.sub(r'python|Python|[^A-Za-z0-9. ]+',' ',i) allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list allwords_tokenized = tokenize_only(i) totalvocab_tokenized.extend(allwords_tokenized) vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed) print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame') print(vocab_frame.head()) print() from sklearn.feature_extraction.text import TfidfVectorizer #define vectorizer parameters tfidf_vectorizer = TfidfVectorizer(max_df=0.5, max_features=200000, min_df=0.2, stop_words='english', use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3)) tfidf_matrix = tfidf_vectorizer.fit_transform(synopses) #fit the vectorizer to synopses print(tfidf_matrix.shape) terms = tfidf_vectorizer.get_feature_names() from sklearn.metrics.pairwise import cosine_similarity dist = 1 - cosine_similarity(tfidf_matrix) print print from sklearn.cluster import KMeans num_clusters = 5 km = KMeans(n_clusters=num_clusters) km.fit(tfidf_matrix) clusters = km.labels_.tolist() #from sklearn.externals import joblib #uncomment the below to save your model #since I've already run my model I am loading from the pickle # joblib.dump(km, 'doc_cluster.pkl') # km = joblib.load('doc_cluster.pkl') #clusters = km.labels_.tolist() #uncomment the below to save your model since I've already run my model I am loading from the pickle joblib.dump(km, 'doc_cluster.pkl') km = joblib.load('doc_cluster.pkl') clusters = km.labels_.tolist() #posts = {'Title': title, "Id": Id, 'synopsis': synopses, 'cluster': clusters} posts = {"Id": Id, 'synopsis': synopses, 'cluster': clusters} #frame = pd.DataFrame(posts, index = [clusters] , columns = ['Title', 'Id', 'cluster']) frame = pd.DataFrame(posts, index = [clusters] , columns = ['Id', 'cluster']) print(frame['cluster'].value_counts()) #number of films per cluster (clusters from 0 to 4) print("Top terms per cluster:") print() #sort cluster centers by proximity to centroid order_centroids = km.cluster_centers_.argsort()[:, ::-1] for i in range(num_clusters): print("Cluster %d words: " % i, end='') for ind in order_centroids[i, :]: #replace 6 with n words per cluster print(vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0], end=',') #print(' %s' % frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',') print() #add whitespace print() #add whitespace print("Cluster %d ids:" % i, end='') for id in frame.ix[i]['Id'].values.tolist(): print(' %s,' % id, end=' ') print() #add whitespace print() #add whitespace
else: return '' # Apply clean_data function to your features. features = ['cast', 'keywords', 'director', 'genres'] for feature in features: metadata[feature] = metadata[feature].apply(clean_data) def create_soup(x): return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres']) #.replace(u'\xa0', u'') metadata['soup'] = metadata.apply(create_soup, axis=1) # import CountVectorizer and create the count matrix from sklearn.feature_extraction.text import CountVectorizer count = CountVectorizer(stop_words='english') count_matrix = count.fit_transform(metadata['soup']) # compute the cosine similarity matrix based on the count_matrix from sklearn.metrics.pairwise import cosine_similarity cosine_sim2 = cosine_similarity(count_matrix, count_matrix) # reset index of your main dataframe and construct reverse mapping as before metadata = metadata.reset_index() indices = pd.Series(metadata.index, index=metadata['title']) print(get_recommendations('The Dark Knight Rises', cosine_sim2)) print(get_recommendations('The Godfather', cosine_sim2))
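# `get_recommendations` is called above but not defined in this excerpt; a common
# version (assumed here, not necessarily the author's) looks the title up in
# `indices`, ranks every other movie by its cosine similarity row and returns the
# ten most similar titles.
def get_recommendations(title, cosine_sim):
    idx = indices[title]
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda x: x[1], reverse=True)
    movie_indices = [i for i, _ in sim_scores[1:11]]   # skip the movie itself
    return metadata['title'].iloc[movie_indices]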