# Requires: import numpy as np
#           from sklearn.metrics.pairwise import euclidean_distances as ed
def assign(x1, x2, c1, c2):
    """Assign each point to its nearest center across two views/center sets."""
    r = ed(x1, c1)                          # distances of view 1 to centers c1
    r2 = ed(x2, c2)                         # distances of view 2 to centers c2
    r3 = np.concatenate((r, r2), axis=1)    # joint distance matrix over both center sets
    z = np.argmin(r3, axis=1)               # nearest center over the combined sets
    # Map indices that fall into the c2 block back into the c1 index range.
    res = np.array([zz if zz < c1.shape[0] else zz - c1.shape[0] for zz in z])
    # The second return value flags points whose per-view nearest-center indices differ.
    return res, np.argmin(r, axis=1) != np.argmin(r2, axis=1)
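# A minimal usage sketch for assign() on synthetic data; the shapes and values
# below are illustrative assumptions, not from the original code:
def _demo_assign():
    rng = np.random.default_rng(0)
    x1, x2 = rng.random((100, 5)), rng.random((100, 5))  # two views of 100 points
    c1, c2 = rng.random((3, 5)), rng.random((4, 5))      # two center sets
    labels, disagree = assign(x1, x2, c1, c2)
    print(labels[:10], disagree.mean())  # cluster ids and per-view disagreement rate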
# Requires (aliases assumed, consistent with the other snippets in this file):
#   from sklearn.metrics.pairwise import cosine_similarity as cs
#   from sklearn.metrics.pairwise import manhattan_distances as md
#   from sklearn.metrics.pairwise import euclidean_distances as ed
#   jsc: a Jaccard score function; minkowski_dis: a DistanceMetric instance
def get_similarity_values(q1_csc, q2_csc):
    cosine_sim = []
    manhattan_dis = []
    euclidean_dis = []
    jaccard_dis = []
    minkowski_dis_list = []
    for i, j in zip(q1_csc, q2_csc):
        cosine_sim.append(cs(i, j)[0][0])
        manhattan_dis.append(md(i, j)[0][0])
        euclidean_dis.append(ed(i, j)[0][0])
        i_ = i.toarray()
        j_ = j.toarray()
        try:
            jaccard_dis.append(jsc(i_, j_))
        except Exception:  # jsc can fail (e.g. on all-zero rows); fall back to 0
            jaccard_dis.append(0)
        minkowski_dis_list.append(minkowski_dis.pairwise(i_, j_)[0][0])
    return cosine_sim, manhattan_dis, euclidean_dis, jaccard_dis, minkowski_dis_list
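# One plausible way to provide the jsc / minkowski_dis helpers assumed above
# (a sketch; p=3 is an arbitrary illustrative choice, and in newer sklearn
# DistanceMetric lives in sklearn.metrics instead of sklearn.neighbors):
from sklearn.metrics import jaccard_score
from sklearn.neighbors import DistanceMetric

minkowski_dis = DistanceMetric.get_metric('minkowski', p=3)

def jsc(a, b):
    # jaccard_score expects binary label vectors, so binarize the dense rows first
    return jaccard_score((a.ravel() > 0).astype(int), (b.ravel() > 0).astype(int))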
def compare_heatmap(y11, y12, y21, y22, mata, matb):
    from lapsolver import solve_dense
    from sklearn.metrics.pairwise import euclidean_distances as ed
    from natto.process.hungutil import make_canvas_and_spacemaps
    from natto.out.draw import quickdoubleheatmap

    distances = ed(mata, matb)
    hungmatch = solve_dense(distances)

    def prephtmap(y1, y2):
        # returns: canvas, y1map2, y2map2, row, col
        a, b, c = make_canvas_and_spacemaps(y1, y2, hungmatch, normalize=False)
        d, e = solve_dense(c)
        return c, a, b, d, e

    comp1 = prephtmap(y11, y12)
    comp2 = prephtmap(y21, y22)
    quickdoubleheatmap(comp1, comp2)

    def calcmissmatches(stuff):
        # zero out the matched cells; whatever mass remains is misassigned
        canv = stuff[0]
        r, c = stuff[-2:]
        for rr, cc in zip(r, c):
            canv[rr, cc] = 0
        return canv.sum()

    print("clust1 misplaced:", calcmissmatches(comp1))
    print("clust2 misplaced:", calcmissmatches(comp2))
    # rand is assumed to be a Rand-index helper imported elsewhere,
    # e.g. sklearn.metrics.adjusted_rand_score
    print("set1 randindex:", rand(y11, y21))
    print("set2 randindex:", rand(y12, y22))
def euclidean_distance(w1, w2, wv):
    # vector_getter is assumed to return a word vector, or an int sentinel
    # when the word is out of vocabulary
    v1 = vector_getter(w1, wv)
    v2 = vector_getter(w2, wv)
    distance = None
    # only compute if both lookups returned vectors (e.g. of dimension 300)
    if not isinstance(v1, int) and not isinstance(v2, int):
        distance = ed(v1, v2)[0][0]
    return distance
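# A hypothetical vector_getter compatible with the sentinel check above (an
# assumption: the original helper is not shown). wv is taken to be a
# gensim-style KeyedVectors mapping; returning the int 0 signals an
# out-of-vocabulary word.
def vector_getter(word, wv):
    try:
        return wv[word].reshape(1, -1)  # 2-D row vector, as ed() expects
    except KeyError:
        return 0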
def get_similarity_values(res_csc, jd_csc):
    # compares every row of res_csc against the single vector jd_csc
    cosine_sim = []
    manhattan_dis = []
    euclidean_dis = []
    j = jd_csc
    for i in res_csc:
        cosine_sim.append(cs(i, j)[0][0])
        manhattan_dis.append(md(i, j)[0][0])
        euclidean_dis.append(ed(i, j)[0][0])
    return cosine_sim, manhattan_dis, euclidean_dis
def bm25_dist(row, dist_type, bm25_model, average_idf, feature_dim):
    assert dist_type in ['cs', 'ed', 'md'], 'dist type error'
    q1 = row['q1_w'].split()
    q2 = row['q2_w'].split()
    q1_bm25 = bm25_model.get_scores(q1, average_idf)
    q2_bm25 = bm25_model.get_scores(q2, average_idf)
    # one BM25 score per corpus document, reshaped to a (1, feature_dim) row
    q1_bm25 = np.reshape(np.array(q1_bm25), (-1, feature_dim))
    q2_bm25 = np.reshape(np.array(q2_bm25), (-1, feature_dim))
    if dist_type == 'cs':
        score = cs(q1_bm25, q2_bm25).flatten()[0]
    elif dist_type == 'ed':
        score = ed(q1_bm25, q2_bm25).flatten()[0]
    elif dist_type == 'md':
        score = md(q1_bm25, q2_bm25).flatten()[0]
    return score
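# A hedged usage sketch, assuming bm25_model follows the legacy
# gensim.summarization.bm25.BM25 API (removed in gensim 4.x), where
# get_scores(query, average_idf) returns one score per corpus document.
# The tiny corpus here is purely illustrative.
def _demo_bm25_dist():
    from gensim.summarization.bm25 import BM25
    corpus = [['how', 'are', 'you'], ['what', 'is', 'bm25'], ['hello', 'world']]
    bm25_model = BM25(corpus)
    average_idf = sum(bm25_model.idf.values()) / len(bm25_model.idf)
    row = {'q1_w': 'how are you', 'q2_w': 'hello world'}
    # feature_dim must equal the corpus size (one score per document)
    print(bm25_dist(row, 'cs', bm25_model, average_idf, feature_dim=len(corpus)))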
def kmeans(features, k, num_iters=100):
    """ Use kmeans algorithm to group features into k clusters.

    K-Means algorithm can be broken down into following steps:
        1. Randomly initialize cluster centers
        2. Assign each point to the closest center
        3. Compute new center of each cluster
        4. Stop if cluster assignments did not change
        5. Go to step 2

    Args:
        features - Array of N features vectors. Each row represents a feature
            vector.
        k - Number of clusters to form.
        num_iters - Maximum number of iterations the algorithm will run.

    Returns:
        assignments - Array representing cluster assignment of each point.
            (e.g. i-th point is assigned to cluster assignments[i])
    """
    N, D = features.shape
    assert N >= k, 'Number of clusters cannot be greater than number of points'

    # Randomly initialize cluster centers
    idxs = np.random.choice(N, size=k, replace=False)
    centers = features[idxs]
    assignments = np.zeros(N)

    for n in range(num_iters):
        ### YOUR CODE HERE
        matrix = ed(features, centers)      # point-to-center distances
        tmp = np.argmin(matrix, axis=1)     # nearest center per point
        if np.all(tmp == assignments):      # converged: assignments are stable
            break
        assignments = tmp
        new_centers = np.zeros_like(centers)
        for i in range(k):
            assigned_i = features[assignments == i]
            new_centers[i] = np.mean(assigned_i, axis=0)
            # medoid alternative:
            # m = ed(assigned_i, assigned_i)
            # new_centers[i] = assigned_i[np.argmin(np.sum(m, axis=0))]
        centers = new_centers
        ### END YOUR CODE

    return assignments
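# A quick sanity check for kmeans() on synthetic blobs (illustrative values):
def _demo_kmeans():
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
    labels = kmeans(X, k=3)
    print(np.bincount(labels.astype(int)))  # rough cluster sizes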
# Requires: from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
def extract_tfidf_feature(self, df):
    q1_w_vec = self.tfidf_vectorizer.transform(df['q1_w'].values.tolist())
    q2_w_vec = self.tfidf_vectorizer.transform(df['q2_w'].values.tolist())

    # pairwise similarities/distances on the raw TF-IDF vectors, row by row
    df['tfidf_cs'] = np.concatenate([
        cs(q1_w_vec[i], q2_w_vec[i]).flatten() for i in range(q1_w_vec.shape[0])
    ])
    df['tfidf_ed'] = np.concatenate([
        ed(q1_w_vec[i], q2_w_vec[i]).flatten() for i in range(q1_w_vec.shape[0])
    ])
    df['tfidf_md'] = np.concatenate([
        md(q1_w_vec[i], q2_w_vec[i]).flatten() for i in range(q1_w_vec.shape[0])
    ])

    # stack both question sets so SVD/LDA share one topic space
    corpus_tfidf = np.concatenate([q1_w_vec.toarray(), q2_w_vec.toarray()], axis=0)

    svd_model = TruncatedSVD(n_components=5)
    svd_model.fit(corpus_tfidf)
    svd_topic = svd_model.transform(corpus_tfidf)
    q1_w_svd_feature = svd_topic[:q1_w_vec.shape[0]]
    q2_w_svd_feature = svd_topic[q1_w_vec.shape[0]:]
    df['svd_cs'] = np.concatenate([
        cs(q1_w_svd_feature[i].reshape(-1, 5),
           q2_w_svd_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_svd_feature.shape[0])
    ])
    df['svd_ed'] = np.concatenate([
        ed(q1_w_svd_feature[i].reshape(-1, 5),
           q2_w_svd_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_svd_feature.shape[0])
    ])
    df['svd_md'] = np.concatenate([
        md(q1_w_svd_feature[i].reshape(-1, 5),
           q2_w_svd_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_svd_feature.shape[0])
    ])

    lda_model = LatentDirichletAllocation(n_components=5, random_state=0)
    lda_model.fit(corpus_tfidf)
    lda_topic = lda_model.transform(corpus_tfidf)
    q1_w_lda_feature = lda_topic[:q1_w_vec.shape[0]]
    q2_w_lda_feature = lda_topic[q1_w_vec.shape[0]:]
    df['lda_cs'] = np.concatenate([
        cs(q1_w_lda_feature[i].reshape(-1, 5),
           q2_w_lda_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_lda_feature.shape[0])
    ])
    df['lda_ed'] = np.concatenate([
        ed(q1_w_lda_feature[i].reshape(-1, 5),
           q2_w_lda_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_lda_feature.shape[0])
    ])
    df['lda_md'] = np.concatenate([
        md(q1_w_lda_feature[i].reshape(-1, 5),
           q2_w_lda_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_lda_feature.shape[0])
    ])
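# Side note: the row-by-row cs/ed loops above can be vectorized. A hedged
# sketch using sklearn's paired helpers (q1_vec/q2_vec are illustrative names;
# paired_cosine_distances returns distances, hence the 1 - x conversion):
def paired_tfidf_features(q1_vec, q2_vec):
    from sklearn.metrics.pairwise import (paired_cosine_distances,
                                          paired_euclidean_distances)
    return (1.0 - paired_cosine_distances(q1_vec, q2_vec),
            paired_euclidean_distances(q1_vec, q2_vec))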
def similarity_matrix(self, sim='cos'):
    features = self.features
    if features is None:
        return None
    if sim == 'dot':
        sim = np.dot(features, features.T)
    elif sim == 'cos':
        norm = np.linalg.norm(features, axis=1)[np.newaxis]
        sim = np.dot(features, features.T) / np.dot(norm.T, norm)
    elif sim == 'kmeans':
        # +1 for points in the same cluster, -1 otherwise
        cluster = kmeans(features, K=2)[np.newaxis]
        cluster[cluster == 0] = -1
        sim = np.dot(cluster.T, cluster)
    elif sim == 'comm':
        N = len(self.clusters)
        theta, _ = self.get_params()
        sim = theta.dot(theta.T)
        # caution: theta.dot(theta.T) is symmetric, so this comparison is
        # always true and sim ends up all ones
        sim = (sim == sim.T) * 1
        sim[sim < 1] = -1
    elif sim == 'euclide_old':
        from sklearn.metrics.pairwise import euclidean_distances as ed
        dist = ed(features)
        K = self.parameters_['k']
        devs = self.parameters_['devs'][0]
        # threshold pairwise distances at 2*devs/K
        sim = np.zeros(dist.shape)
        sim[dist <= 2.0 * devs / K] = 1
        sim[dist > 2.0 * devs / K] = -1
    elif sim == 'euclide_abs':
        from sklearn.metrics.pairwise import euclidean_distances as ed
        # threshold the absolute difference of each of the first two
        # coordinates separately, then combine
        N = len(features)
        K = self.parameters_['k']
        devs = self.parameters_['devs'][0]
        a = np.repeat(features[:, 0][None], N, 0).T
        b = np.repeat(features[:, 0][None], N, 0)
        sim1 = np.abs(a - b)
        a = np.repeat(features[:, 1][None], N, 0).T
        b = np.repeat(features[:, 1][None], N, 0)
        sim2 = np.abs(a - b)
        sim3 = np.zeros((N, N))
        sim3[sim1 <= 2.0 * devs / K] = 1
        sim3[sim1 > 2.0 * devs / K] = -1
        sim4 = np.zeros((N, N))
        sim4[sim2 <= 2.0 * devs / K] = 1
        sim4[sim2 > 2.0 * devs / K] = -1
        sim = sim4 + sim3
        sim[sim >= 0] = 1
        sim[sim < 0] = -1
    elif sim == 'euclide_dist':
        from sklearn.metrics.pairwise import euclidean_distances as ed
        N = len(features)
        K = self.parameters_['k']
        devs = self.parameters_['devs'][0]
        sim1 = ed(np.repeat(features[:, 0][None], 2, 0).T)
        # fixed: second coordinate (the original reused features[:, 0] here,
        # an apparent copy-paste slip)
        sim2 = ed(np.repeat(features[:, 1][None], 2, 0).T)
        sim3 = np.zeros((N, N))
        sim3[sim1 <= 2.0 * devs / K] = 1
        sim3[sim1 > 2.0 * devs / K] = -1
        sim4 = np.zeros((N, N))
        sim4[sim2 <= 2.0 * devs / K] = 1
        sim4[sim2 > 2.0 * devs / K] = -1
        sim = sim4 + sim3
        sim[sim >= 0] = 1
        sim[sim < 0] = -1
    return sim
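# Sanity check for the 'cos' branch above: it should agree with sklearn's
# cosine_similarity (a sketch; assumes features is a dense 2-D array with no
# zero rows):
def _check_cos_branch(features):
    from sklearn.metrics.pairwise import cosine_similarity
    norm = np.linalg.norm(features, axis=1)[np.newaxis]
    sim = np.dot(features, features.T) / np.dot(norm.T, norm)
    assert np.allclose(sim, cosine_similarity(features))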
import numpy as np
import pickle  # was cPickle (Python 2 only)
from sklearn.metrics.pairwise import euclidean_distances as ed
from tqdm import tqdm

featureDic = {}
with open('item_feature.dat', 'rb') as f:  # binary mode for pickle
    featureDic = pickle.load(f)

y = []
pred_y = []
dataDir = '/home/zhaokui/research/KDD/data/taobao/'
testData = open(dataDir + 'pro_test_set.txt').readlines()

# debug preview: print the first 10 stored feature vectors
i = 0
for item in featureDic:
    print(featureDic[item])
    i += 1
    if i == 10:
        break  # was exit(), which stopped the script before the loop below

i = 0  # reset so the preview below also stops after 10 lines
for line in tqdm(testData):
    tmp = line.split()
    itemA = tmp[0].split('/')[1].split('.')[0]
    itemB = tmp[1].split('/')[1].split('.')[0]
    # features are stored as 1-D arrays; reshape to rows as euclidean_distances expects
    print(ed(np.array(featureDic[itemA]).reshape(1, -1),
             np.array(featureDic[itemB]).reshape(1, -1)))
    i += 1
    if i == 10:
        break
def my_euclidean_distance(word1, word2, wv):
    # ed() returns a 1x1 matrix here; take the scalar before rounding
    distance = ed(my_vector_getter(word1, wv), my_vector_getter(word2, wv))[0][0]
    return round(distance, 4)
# Assumed imports for this script (auc is taken to be sklearn's ROC AUC;
# net is assumed to be a caffe classifier built elsewhere):
import copy
import pickle
import numpy as np
import caffe
from tqdm import tqdm
from sklearn.metrics.pairwise import euclidean_distances as ed
from sklearn.metrics import roc_auc_score as auc

itemDir = '/home/zhaokui/research/KDD/data/taobao/'
itemData = open(itemDir + 'pro_test_item.txt').readlines()

# extract one feature vector per item image from the net's 'loss3f' blob
featureDic = {}
for item in tqdm(itemData):
    name = item.split('/')[1].split('.')[0]
    image = caffe.io.load_image(itemDir + item.split()[0])
    net.predict([image], False)
    featureDic[name] = copy.deepcopy(net.blobs['loss3f'].data[0])

y = []
pred_y = []
dataDir = '/home/zhaokui/research/KDD/data/taobao/'
testData = open(dataDir + 'pro_test_set.txt').readlines()
for line in tqdm(testData):
    tmp = line.split()
    itemA = tmp[0].split('/')[1].split('.')[0]
    itemB = tmp[1].split('/')[1].split('.')[0]
    y.append(int(tmp[2]))
    pred_y.append(
        ed(np.array(featureDic[itemA]).reshape(1, -1),
           np.array(featureDic[itemB]).reshape(1, -1))[0][0])

with open('predict.dat', 'wb') as f:  # binary mode for pickle
    pickle.dump((y, pred_y), f)
with open('item_feature.dat', 'wb') as f:
    pickle.dump(featureDic, f)
print(auc(y, pred_y))
def euclidean_similarity(ratings):
    # note: ed() returns pairwise *distances*; despite the name, no
    # distance-to-similarity conversion is applied here
    return ed(ratings)
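# If an actual similarity is wanted, a common conversion is 1 / (1 + distance)
# (a sketch, not part of the original code):
def euclidean_similarity_scaled(ratings):
    return 1.0 / (1.0 + ed(ratings))  # 1.0 for identical rows, -> 0 as distance grows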
def hierarchical_clustering(features, k):
    """ Run the hierarchical agglomerative clustering algorithm.

    The algorithm is conceptually simple:

    Assign each point to its own cluster
    While the number of clusters is greater than k:
        Compute the distance between all pairs of clusters
        Merge the pair of clusters that are closest to each other

    We will use Euclidean distance to define distance between clusters.

    Recomputing the centroids of all clusters and the distances between all
    pairs of centroids at each step of the loop would be very slow. Thankfully
    most of the distances and centroids remain the same in successive
    iterations of the outer loop; therefore we can speed up the computation by
    only recomputing the centroid and distances for the new merged cluster.

    Even with this trick, this algorithm will consume a lot of memory and run
    very slowly when clustering large set of points. In practice, you probably
    do not want to use this algorithm to cluster more than 10,000 points.

    Args:
        features - Array of N features vectors. Each row represents a feature
            vector.
        k - Number of clusters to form.

    Returns:
        assignments - Array representing cluster assignment of each point.
            (e.g. i-th point is assigned to cluster assignments[i])
    """
    from scipy.spatial.distance import cdist

    N, D = features.shape
    assert N >= k, 'Number of clusters cannot be greater than number of points'

    # Assign each point to its own cluster
    assignments = np.arange(N)     # always kept within [0, n_clusters)
    centers = np.copy(features)    # always holds exactly n_clusters rows
    n_clusters = N

    matrix = ed(centers, centers)  # pairwise distances; shrunk as clusters merge

    while n_clusters > k:
        ### YOUR CODE HERE
        # mask the lower triangle and diagonal so argmin only sees each pair once
        r = np.arange(matrix.shape[0])
        matrix[r[:, None] >= r] = np.max(matrix)
        # indices of the two closest centers
        center1, center2 = np.unravel_index(np.argmin(matrix), matrix.shape)
        if center1 > center2:
            center1, center2 = center2, center1
        assignments[assignments == center2] = center1
        # relabel so assignments stay within [0, n_clusters)
        assignments[assignments > center2] -= 1
        # delete row and column center2 to shrink the matrix
        matrix = np.delete(np.delete(matrix, center2, 1), center2, 0)
        centers = np.delete(centers, center2, 0)
        # recompute the merged cluster's centroid
        centers[center1] = np.mean(features[assignments == center1], axis=0)
        # recompute the matrix row and column for the merged cluster
        matrix[center1] = cdist(centers[center1].reshape(1, D), centers)
        matrix[:, center1] = matrix[center1]
        n_clusters -= 1
        ### END YOUR CODE

    return assignments
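# A small sanity check for hierarchical_clustering() (illustrative data):
def _demo_hierarchical_clustering():
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=60, centers=4, random_state=0)
    labels = hierarchical_clustering(X, k=4)
    print(np.bincount(labels))  # sizes of the 4 recovered clusters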
def euclideanDistance(a, b):
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.euclidean_distances.html
    # reshape the 1-D inputs to single-row matrices, as ed() expects 2-D
    aa = a.reshape(1, len(a))
    bb = b.reshape(1, len(b))
    return ed(aa, bb)[0][0]  # unwrap the 1x1 result to a scalar
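# Quick sanity check (illustrative): the distance from the origin to (3, 4)
# should be 5.
def _demo_euclideanDistance():
    import numpy as np
    print(euclideanDistance(np.array([0.0, 0.0]), np.array([3.0, 4.0])))  # 5.0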