def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-1], [2]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # sparse matrix case
    Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_equal(Dsp, D)
    assert_array_equal(Esp, E)
    # We don't want np.matrix here
    assert_equal(type(Dsp), np.ndarray)
    assert_equal(type(Esp), np.ndarray)

    # Non-euclidean scikit-learn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Same metric on sparse input; the argmin-only variant is checked too
    # (it was previously computed but never asserted).
    D, E = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    D2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
def test_pairwise_distances_argmin_min():
    """Check pairwise minimum distances computation for any metric."""
    X = [[0], [1]]
    Y = [[-1], [2]]
    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y)

    # Every toy-data case below expects the same indices and distances.
    want_idx = [0, 1]
    want_dist = [1., 1.]

    # euclidean metric, dense input
    idx, dist_min = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    idx_only = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(idx, want_idx)
    assert_array_almost_equal(idx_only, want_idx)
    assert_array_almost_equal(dist_min, want_dist)

    # euclidean metric, sparse input: must match the dense result and
    # come back as plain ndarrays rather than np.matrix
    idx_sp, dist_sp = pairwise_distances_argmin_min(Xsp, Ysp,
                                                    metric="euclidean")
    assert_array_equal(idx_sp, idx)
    assert_array_equal(dist_sp, dist_min)
    assert_equal(type(idx_sp), np.ndarray)
    assert_equal(type(dist_sp), np.ndarray)

    # Non-euclidean sklearn metric, dense input
    idx, dist_min = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    idx_only = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(idx, want_idx)
    assert_array_almost_equal(idx_only, want_idx)
    assert_array_almost_equal(dist_min, want_dist)

    # ... and the sparse case, which is expected to be rejected
    assert_raises(ValueError, pairwise_distances_argmin_min,
                  Xsp, Ysp, metric="manhattan")

    # Non-euclidean Scipy distance, first as a callable, then as a string
    for scipy_metric in (minkowski, "minkowski"):
        idx, dist_min = pairwise_distances_argmin_min(
            X, Y, metric=scipy_metric, metric_kwargs={"p": 2})
        assert_array_almost_equal(idx, want_idx)
        assert_array_almost_equal(dist_min, want_dist)

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    full = pairwise_distances(X, Y, metric="manhattan")
    naive_ind = full.argmin(axis=0)
    naive_val = full[naive_ind, range(len(naive_ind))]

    chunked_ind, chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(naive_ind, chunked_ind, decimal=7)
    np.testing.assert_almost_equal(naive_val, chunked_val, decimal=7)
def fit(self, x_):
    """Cluster the rows of ``x_`` by iterating assign/recompute steps.

    Parameters
    ----------
    x_ : pandas.DataFrame
        Feature table. A "cluster" column is written to it on every
        iteration (the caller's frame is modified, as before). If the
        frame already carries a "cluster" column from an earlier run it is
        ignored as a feature.

    Returns
    -------
    pandas.DataFrame
        The final cluster centers, also stored in ``self.cluster_centers_``.
    """
    # Only real features may take part in sampling and distance
    # computation. Previously a pre-existing "cluster" column leaked into
    # the sampled initial centers and the distance call then failed on
    # mismatched feature counts.
    feature_cols = [col for col in x_.columns.tolist() if col != "cluster"]
    features = x_[feature_cols]

    self.cluster_centers_ = features.sample(self.k_)
    n_iter_ = 0
    while n_iter_ < self.max_iter_:
        # 1. clustering: x -> ck
        self.labels_ = x_["cluster"] = pairwise_distances_argmin(
            features, self.cluster_centers_, metric="euclidean")
        # 2. recalculate means_; sorting by the first feature keeps the row
        #    order of the centers stable between iterations
        cluster_centers_ = x_.groupby(by=["cluster"])[feature_cols].mean(
        ).sort_values(by=feature_cols[0])
        # distance(centers): total squared shift between old and new centers
        dis_ = ((self.cluster_centers_.values -
                 cluster_centers_.values) ** 2).sum()
        if dis_ < self.tol_:
            print("n_iter_ is %d, means_ is %s" %
                  (n_iter_, self.cluster_centers_))
            return self.cluster_centers_
        self.cluster_centers_ = cluster_centers_
        n_iter_ += 1
    print("n_iter_ is %d cluster_centers_ is %s" %
          (n_iter_, self.cluster_centers_.values))
    return self.cluster_centers_
def run(self):
    """Repeatedly compare MiniBatchKMeans labels against KMeans labels.

    Reads ``self.k_means``, ``self.mbk``, ``self.X``, ``self.n_clusters``,
    ``self.labels_true`` and ``self.t0``; writes ``self.run``,
    ``self.colors`` and ``self.t_mini_batch``. Loops for as long as
    ``self.run`` is truthy and prints the number of samples on which the
    two labelings disagree whenever that number changes.
    """
    # NOTE(review): this assignment replaces the bound method ``run`` on the
    # instance with a bool; presumably some other code sets ``self.run`` to
    # False to stop the loop below -- confirm.
    self.run = True
    self.colors = ['#4EACC5', '#FF9C34', '#4E9A06']
    # KMeans
    # NOTE(review): np.sort(..., axis=0) sorts every coordinate column
    # independently, so for data with more than one feature the rows are no
    # longer the fitted centers -- confirm this is intended.
    k_means_cluster_centers = np.sort(self.k_means.cluster_centers_, axis=0)
    k_means_labels = pairwise_distances_argmin(self.X, k_means_cluster_centers)
    # The two values computed per cluster here are not used afterwards.
    for k, col in zip(range(self.n_clusters), self.colors):
        my_members = k_means_labels == k
        cluster_center = k_means_cluster_centers[k]
    self.t_mini_batch = time.time() - self.t0
    lastN_diff = 0
    while (self.run):
        # MiniBatchKMeans
        mbk_means_cluster_centers = np.sort(self.mbk.cluster_centers_, axis=0)
        mbk_means_labels = pairwise_distances_argmin(
            self.X, mbk_means_cluster_centers)
        # For each KMeans center, index of the closest MiniBatchKMeans center.
        order = pairwise_distances_argmin(k_means_cluster_centers,
                                          mbk_means_cluster_centers)
        for k, col in zip(range(self.n_clusters), self.colors):
            my_members = mbk_means_labels == order[k]
            cluster_center = mbk_means_cluster_centers[order[k]]
        # Initialise the different array to all False
        # (this is only all-False while no sample is labelled 4)
        different = (mbk_means_labels == 4)
        # NOTE(review): these counters start at 0..n_clusters-1 (np.arange),
        # not at zero, and are integer arrays, so the ratio stored in ``err``
        # is truncated on assignment -- confirm.
        nbK = np.arange(self.n_clusters)
        err = np.arange(self.n_clusters)
        nbL = np.arange(self.n_clusters)
        for k in range(self.n_clusters):
            # Mark samples whose membership in cluster k differs between the
            # two labelings (after matching clusters through ``order``).
            different += ((k_means_labels == k) != (mbk_means_labels == order[k]))
            i = 0
            for s in mbk_means_labels:
                if s == self.labels_true[i]:
                    nbK[k] += 1
                if self.labels_true[i] == k:
                    nbL[k] += 1
                i += 1
            err[k] = nbK[k] / nbL[k]
        identic = np.logical_not(different)
        n_diff = len(self.X[different, ])
        # Report only when the count changed, and by less than half the
        # number of samples.
        if lastN_diff != n_diff and (abs(lastN_diff - n_diff) < len(self.X) / 2):
            print('')
            '''for k in range(self.n_clusters): print('Error cluster %d : %f'%(k ,(nbK[k]/ nbL[k])))'''
            print('Difference K-Mean - Mini-batch: %d' % n_diff)
            # len() of the boolean mask is simply the number of samples.
            ratio = n_diff / len(mbk_means_labels == 4)
            print('Ratio: %f' % ratio)
            lastN_diff = n_diff
def test_pairwise_distances_argmin_min():
    """Check pairwise minimum distances computation for any metric."""
    X = [[0], [1]]
    Y = [[-1], [2]]

    # All toy-data cases share one expected answer.
    want_idx = [0, 1]
    want_dist = [1., 1.]

    # euclidean metric first, then a non-euclidean sklearn metric
    for sk_metric in ("euclidean", "manhattan"):
        idx, dist_min = pairwise_distances_argmin_min(X, Y, metric=sk_metric)
        idx_only = pairwise_distances_argmin(X, Y, metric=sk_metric)
        assert_array_almost_equal(idx, want_idx)
        assert_array_almost_equal(idx_only, want_idx)
        assert_array_almost_equal(dist_min, want_dist)

    # Non-euclidean Scipy distance, first as a callable, then as a string
    for scipy_metric in (minkowski, "minkowski"):
        idx, dist_min = pairwise_distances_argmin_min(
            X, Y, metric=scipy_metric, metric_kwargs={"p": 2})
        assert_array_almost_equal(idx, want_idx)
        assert_array_almost_equal(dist_min, want_dist)

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    full = pairwise_distances(X, Y, metric="manhattan")
    naive_ind = full.argmin(axis=0)
    naive_val = full[naive_ind, range(len(naive_ind))]

    chunked_ind, chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(naive_ind, chunked_ind, decimal=7)
    np.testing.assert_almost_equal(naive_val, chunked_val, decimal=7)
def computeHoGFeatures(XData, K=50):
    """Build a K-bin histogram of clustered HOG descriptors per input item.

    Parameters
    ----------
    XData : array-like with a ``shape`` attribute
        One entry per item along the first axis; each entry is handed to
        ``getHOGFeatures``.
    K : int, default 50
        Number of clusters, and therefore histogram bins per item. The
        default matches the value that used to be hard-coded here.

    Returns
    -------
    numpy.ndarray of shape (XData.shape[0], K)
        Row ``i`` counts how many descriptors of item ``i`` fell into each
        cluster.
    """
    results = Parallel(n_jobs=num_cores)(
        delayed(getHOGFeatures)(XData[i]) for i in range(XData.shape[0]))
    hog_descriptor = np.array(results)
    print("hog: ", hog_descriptor.shape)

    # The descriptors are viewed as 16 vectors of length 9 per item.
    hog_descriptor = np.reshape(hog_descriptor, (XData.shape[0], 16, 9))
    n_items, n_vectors, vector_len = hog_descriptor.shape

    # Flatten to one row per descriptor vector: (n_items * n_vectors, 9).
    hog_ordered = np.transpose(hog_descriptor, (2, 0, 1))
    reshapedResponses = np.reshape(
        hog_ordered, (vector_len, n_items * n_vectors)).T

    clusters = ComputeKMeans(reshapedResponses, K)
    print("Cluster center dimension: ", clusters.shape)

    labels = pairwise_distances_argmin(reshapedResponses, clusters)
    labels = np.reshape(labels, (n_items, n_vectors))

    # Row-wise histogram in a single bincount: shift every row's bin
    # indices into its own block of K slots, count, then fold back.
    bins = np.linspace(1, K, K)
    idx = np.searchsorted(bins, labels, 'right')
    scaled_idx = K * np.arange(labels.shape[0])[:, None] + idx
    limit = K * labels.shape[0]
    counts = np.bincount(scaled_idx.ravel(), minlength=limit + 1)[:-1]
    hist_hog = np.reshape(counts, (labels.shape[0], K))
    return hist_hog
def computeImageFeatures(XData, K=50):
    """Build a K-bin histogram of clustered filter responses.

    The filter responses returned by ``getFilterResponses`` are clustered
    into ``K`` groups with ``ComputeKMeans``; every response is assigned to
    its nearest center and the assignments are counted row by row.

    (Original note: choose fewer training samples to run the code faster;
    run the entire dataset on Ada.)
    """
    sample_count = XData.shape[0]
    print("Input data dimensions: ", XData[:sample_count].shape)

    responses = getFilterResponses(XData[:sample_count])
    print("Filter Responses dimension: ", responses.shape)

    dim0, dim1, dim2 = responses.shape[:3]
    flat_responses = np.reshape(responses, (dim0, dim1 * dim2)).T

    centers = ComputeKMeans(flat_responses, K)
    print("Cluster center dimension: ", centers.shape)

    assignments = pairwise_distances_argmin(flat_responses, centers)
    assignments = np.reshape(assignments, (dim1, dim2))

    # Row-wise histogram computed with a single bincount, ref:
    # https://stackoverflow.com/questions/44152436/calculate-histograms-along-axis/44155607#44155607
    n_rows = assignments.shape[0]
    bin_edges = np.linspace(1, K, K)
    bin_idx = np.searchsorted(bin_edges, assignments, 'right')
    row_offsets = K * np.arange(n_rows)[:, None]
    flat_idx = (row_offsets + bin_idx).ravel()
    total_bins = K * n_rows
    counts = np.bincount(flat_idx, minlength=total_bins + 1)[:-1]
    hist = np.reshape(counts, (n_rows, K))
    print("Histogram dimension: ", hist.shape)
    return hist
def get_best_thread(self, question, tag_name):
    """Return the id of the thread most similar to the question.

    Only threads stored under ``tag_name`` are searched, and similarity is
    measured with the cosine metric.

    Parameters
    ----------
    question : str
        The question asked
    tag_name : str
        The tag for the question

    Returns
    -------
    The entry of ``thread_ids`` belonging to the closest thread embedding.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

    # The question is embedded with the same width as the thread vectors.
    embedding_width = thread_embeddings.shape[1]
    question_vec = question_to_vec(question=question,
                                   embeddings=self.word_embeddings,
                                   dim=embedding_width)

    # A leading axis turns the vector into a one-row query.
    query = question_vec[None, ...]
    nearest = pairwise_distances_argmin(query, thread_embeddings,
                                        metric='cosine')
    return thread_ids[nearest][0]
def predict(self, X):
    """Predict the closest cluster for each sample in X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_query, n_features), \
            or (n_query, n_indexed) if metric == 'precomputed'
        New data to predict.

    Returns
    -------
    labels : array, shape = (n_query,)
        Index of the cluster each sample belongs to.
    """
    X = check_array(X, accept_sparse=["csr", "csc"])

    if self.metric != "precomputed":
        check_is_fitted(self, "cluster_centers_")
        # Each sample goes to the center it is closest to under the
        # estimator's metric.
        return pairwise_distances_argmin(X,
                                         Y=self.cluster_centers_,
                                         metric=self.metric)

    # Precomputed distances: compare only the columns of the medoids.
    check_is_fitted(self, "medoid_indices_")
    distances_to_medoids = X[:, self.medoid_indices_]
    return np.argmin(distances_to_medoids, axis=1)
def get_best_thread(self, question, tag_name):
    """Return the id of the thread most similar to the question.

    The search is performed across the threads with a given tag:
    fetch the candidates stored for the tag, turn the question into a
    vector, and pick the closest candidate.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)
    print("thread_ids:", thread_ids)
    print("thread_embeddings:", thread_embeddings)

    # A single sample must be passed as one row.
    query = question_to_vec(question, self.word_embeddings,
                            self.embeddings_dim).reshape(1, -1)

    # Closest candidate under the default metric of the distance helper.
    nearest = pairwise_distances_argmin(query, thread_embeddings)
    winner = nearest[0]
    print("best_thread:", winner)
    return thread_ids[winner]
def get_best_thread(self, question, tag_name):
    """Return the id of the thread most similar to the question.

    The search is performed across the threads with a given tag.

    Parameters
    ----------
    question : str
        The question text.
    tag_name : str
        Tag whose thread embeddings are searched.

    Returns
    -------
    The entry of ``thread_ids`` at the position of the closest embedding.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

    question_vec = question_to_vec(
        question, self.word_embeddings, self.embeddings_dim)

    print(thread_embeddings.shape)
    print(thread_ids.shape)

    # The distance helper needs 2-D input: pass the question as a single
    # row. With axis=0 the result holds, for that one row, the index of
    # the closest row of ``thread_embeddings``.
    best_thread = pairwise_distances_argmin(
        thread_embeddings, question_vec.reshape(1, -1), axis=0)[0]

    print(best_thread)
    return thread_ids[best_thread]
def aggpool(self):
    """Aggregate neighbour features and return one graph node per cluster.

    Steps: collect node attributes, run random walks to gather neighbour
    features, aggregate them, derive a cluster count from ``Cluster``
    (canopy thresholds 120/100), run KMeans with that count, and return
    the node whose aggregated feature is closest to each KMeans center.
    """
    graph = self.Gmemory

    # Per-node feature: the 'attributes' entry of every node.
    self_vec = [graph.nodes[node]['attributes'] for node in list(graph.nodes())]

    # All nodes, in graph iteration order.
    Gnodes = list(graph.nodes())

    # Random walks over the graph yield the neighbour feature matrix.
    n_walks = 4
    _pairs, feature_matrix = self.run_random_walks(
        graph, Gnodes, self.memory_word_size, n_walks)
    neigh_vecs = tf.cast(feature_matrix, tf.float32)

    # Aggregate first (original note: parameter-free, sum and average).
    outputs = self.aggregator.aggwithoutpara(self_vec, neigh_vecs)

    # Down-sample by clustering: the number of canopies decides how many
    # KMeans clusters are used.
    # (original note: this should be changed in the original function
    # rather than being placed directly here)
    features_by_node = dict(zip(Gnodes, list(outputs)))
    self.gc = Cluster()
    self.gc.setThreshold(120, 100)
    canopies = self.gc.clustering(features_by_node)

    k_means = KMeans(n_clusters=len(canopies))
    k_means.fit_predict(outputs)

    # Map every center back to a real node: the one with the nearest
    # aggregated feature.
    nearest_rows = pairwise_distances_argmin(
        k_means.cluster_centers_, outputs, metric='euclidean')
    return [Gnodes[row] for row in nearest_rows]
def get_best_thread(self, question, tag_name):
    """Return the id of the thread most similar to the question.

    The search is performed across the threads with a given tag, using
    the cosine metric.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

    question_vec = question_to_vec(
        question, self.word_embeddings, self.embeddings_dim)
    query = question_vec.reshape(1, self.embeddings_dim)

    nearest = pairwise_distances_argmin(X=query,
                                        Y=thread_embeddings,
                                        metric='cosine')
    # Exactly one query row was passed, so exactly one index comes back.
    (best_thread,) = nearest
    return thread_ids[best_thread]
def get_best_thread(self, question, tag_name):
    """Return the id of the thread most similar to the question.

    The search is performed across the threads with a given tag. The
    winner is the thread with the highest cosine similarity; intermediate
    values are printed along the way.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

    raw_vec = question_to_vec(question, self.word_embeddings,
                              self.embeddings_dim)
    question_vec = np.reshape(raw_vec, (1, self.embeddings_dim))

    print(thread_embeddings)
    print('Qvec')
    print(question_vec)
    print('dist_argmin_ans',
          pairwise_distances_argmin(thread_embeddings, question_vec))

    # One similarity per thread, in a single column.
    sim_vals = cosine_similarity(thread_embeddings, question_vec)
    best_thread = np.argmax(sim_vals[:, 0])

    print('best_thread', best_thread)
    print('sim_vals shape', sim_vals.shape)
    print('0th element of thread_ids', thread_ids.iloc[0])
    answer = thread_ids.iloc[best_thread]
    print('answer', answer)
    return answer
def get_best_thread(self, question, tag_name):
    """Return the id of the thread most similar to the question.

    The search is performed across the threads with a given tag.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

    # Embed the question (width 300) and shape it as a one-row query.
    embedded = question_to_vec(question, self.word_embeddings, 300)
    query = embedded.reshape(1, -1)

    nearest = pairwise_distances_argmin(query, thread_embeddings)
    winner = nearest[0]
    return thread_ids.values[winner]
def get_best_thread(self, question, tag_name):
    """Answer a question with a canned reply or the most similar thread.

    The search is performed across the threads with a given tag.

    Parameters
    ----------
    question : str
        The question text.
    tag_name : str
        Tag whose thread embeddings are searched.

    Returns
    -------
    str
        The reply of ``self.programming.Main`` unless it is the fallback
        text; otherwise a link to the closest thread when its cosine
        similarity is at least 0.45, else a referral message.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)
    question_vec = question_to_vec(question, self.word_embeddings,
                                   self.embeddings_dim)

    # Compute the cosine distances once and take both the winner and its
    # score from the same row.
    distances = pairwise_distances(
        X=question_vec.reshape(1, self.embeddings_dim),
        Y=thread_embeddings,
        metric='cosine')[0]
    best_thread = int(np.argmin(distances))

    # Cosine distance is 1 - cosine similarity. The threshold below is a
    # similarity threshold, so convert; comparing the raw distance with
    # ">=" would recommend only poorly matching threads.
    best_thread_similarity = 1.0 - distances[best_thread]

    reply = self.programming.Main(question)
    if reply != "Please refer kammand prompt discord or ask you mentor for more info :)":
        return reply

    if best_thread_similarity >= 0.45:
        thread_id = thread_ids[best_thread]
        return f'I think its about {tag_name}\n This thread might help you: https://stackoverflow.com/questions/{thread_id}'
    return "Please refer to kammand prompt discord or ask for your mentor"
def get_best_thread(self, question, tag_name):
    """Return the id of the thread most similar to the question.

    The search is performed across the threads with a given tag, using
    the cosine metric; intermediate values are printed along the way.
    """
    for shown in ("@@@@@@@@", question, tag_name, type(tag_name), "@@@@@@@@"):
        print(shown)

    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

    # Normalise the text, embed it (width 50) and shape it as one row.
    prepared = text_prepare(question)
    question_vec = question_to_vec(prepared, self.word_embeddings,
                                   50).reshape(1, -1)

    print('---')
    print("vecs:")
    print(question_vec[:, :5])
    print('~~')
    print(thread_embeddings[:3, :5])
    print('---')

    nearest = pairwise_distances_argmin(question_vec, thread_embeddings,
                                        metric='cosine')
    best_thread = nearest[0]
    print(best_thread)

    chosen = thread_ids[best_thread]
    print(chosen)
    return chosen
def doKmeans(matrix, k, metric='cosine', batch_size=1024):
    """Fit MiniBatchKMeans on ``matrix`` and label each row by nearest center.

    The centers come from a ``k``-cluster MiniBatchKMeans fit; the returned
    labels are the index of the closest center under ``metric``.
    """
    clusterer = MiniBatchKMeans(
        init='k-means++',
        n_clusters=k,
        batch_size=batch_size,
        n_init=10,
        max_no_improvement=10,
        verbose=0,
    )
    clusterer.fit(matrix)
    centers = clusterer.cluster_centers_
    return pairwise_distances_argmin(X=matrix, Y=centers, metric=metric)
def k_means_function(clust_data, figsize=(18, 6), **kwargs):
    """Cluster ``clust_data`` with KMeans and plot consecutive feature pairs.

    Parameters
    ----------
    clust_data : 2-D array
        Samples in rows; feature ``j`` is plotted against feature ``j + 1``.
    figsize : tuple, default (18, 6)
        Passed to ``plt.subplots``.
    **kwargs
        Forwarded to ``KMeans``.

    Returns
    -------
    (labels, ax)
        The nearest-center label of every sample and the axes object(s)
        created by ``plt.subplots``.
    """
    k_means = KMeans(**kwargs)
    k_means.fit(clust_data)
    k_means_cluster_centers = k_means.cluster_centers_
    k_means_labels = pairwise_distances_argmin(clust_data,
                                               k_means_cluster_centers)

    plots_count = clust_data.shape[1] - 1
    _fig, ax = plt.subplots(1, plots_count, figsize=figsize)

    # Iterate over the centers that were actually fitted. Reading
    # 'n_clusters' from kwargs broke whenever the caller relied on the
    # estimator's default.
    for i, cluster_center in enumerate(k_means_cluster_centers):
        my_members = k_means_labels == i
        for j in range(plots_count):
            # plt.subplots returns a bare Axes when there is one plot.
            axis = ax if plots_count == 1 else ax[j]
            # Markers only; the marker is given once, by keyword.
            axis.plot(clust_data[my_members, j],
                      clust_data[my_members, j + 1],
                      linestyle='None',
                      marker='o',
                      markerfacecolor=colors[i],
                      markeredgecolor=colors[i],
                      markersize=4)
            axis.plot(cluster_center[j],
                      cluster_center[j + 1],
                      'o',
                      markerfacecolor=colors[i],
                      markeredgecolor='k',
                      markersize=8)
    return k_means_labels, ax
def get_best_thread(self, question, tag_name):
    """Return the id of the thread most similar to the question.

    The search is performed across the threads with a given tag. When
    ``MAX_TRDS_TO_LOAD`` is set, only a random subset of that many threads
    (drawn without replacement) is searched.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

    if MAX_TRDS_TO_LOAD is not None:
        # Draw a predefined number of candidate positions.
        positions = range(0, thread_ids.shape[0])
        subset_size = min(len(positions), MAX_TRDS_TO_LOAD)
        chosen = np.random.choice(positions, size=subset_size, replace=False)
        thread_ids = thread_ids[chosen, ]
        thread_embeddings = thread_embeddings[chosen, ]

    question_vec = question_to_vec(question, self.word_embeddings,
                                   self.embeddings_dim)
    query = question_vec.reshape(1, -1)

    nearest = pairwise_distances_argmin(query,
                                        thread_embeddings,
                                        axis=1,
                                        metric='cosine')
    return thread_ids[nearest[0]]
def fit(
    self,
    x_,
):
    """Learn prototype vectors with Learning Vector Quantization (LVQ).

    Parameters
    ----------
    x_ : pandas.DataFrame
        Numeric feature columns plus a "label" column.

    Returns
    -------
    numpy.ndarray
        The prototype vectors, also stored in ``self.vectors_``; their
        class labels are stored in ``self.labels_``.

    Raises
    ------
    ValueError
        If ``self.q_`` is smaller than the number of distinct labels, in
        which case no draw of prototypes could cover every class.
    """
    feature_cols = [col for col in x_.columns if col != "label"]
    all_label = set(x_["label"].values)
    if self.q_ < len(all_label):
        # The coverage loop below could never terminate.
        raise ValueError(
            "q_ (%d) must be at least the number of classes (%d)"
            % (self.q_, len(all_label)))

    # Initial prototypes: random rows, redrawn until every class of
    # samples is covered.
    vectors_ = x_.sample(self.q_)
    while set(vectors_["label"].values) != all_label:
        vectors_ = x_.sample(self.q_)
    self.vectors_ = vectors_[feature_cols].values.astype(float)
    self.labels_ = vectors_["label"].values

    n_iter_ = 0
    while n_iter_ < self.max_iter_:
        drawn = x_.sample(1)
        features = drawn[feature_cols].values[0].astype(float)
        target = drawn["label"].values[0]

        # Only the prototype closest to the sample is updated.
        nearest = pairwise_distances_argmin([features], self.vectors_)[0]
        step = self.eta_ * (features - self.vectors_[nearest])
        if self.labels_[nearest] != target:
            # Wrong class: push the prototype away from the sample
            # instead of pulling it closer.
            step = -step
        self.vectors_[nearest] += step

        # Stop once an update barely moves the prototype.
        if (step ** 2).sum() < self.tol_:
            print(n_iter_, self.vectors_)
            return self.vectors_
        n_iter_ += 1
    print(n_iter_, self.vectors_)
    return self.vectors_
def main():
    """Cluster a CSV file with KMeans and save the labelled rows.

    Command line: ``-f/--filename`` (input CSV with a ``date_time`` column)
    and ``-k/--k`` (number of clusters). The ``date_time`` column is left
    out of the clustering and re-attached before the result is handed to
    ``transform_save``.
    """
    parser = argparse.ArgumentParser(description='K-Means in Python')
    parser.add_argument('-f',
                        '--filename',
                        help='Name of the file',
                        required=True)
    parser.add_argument('-k',
                        '--k',
                        help='The number of clusters',
                        required=True,
                        type=int)
    args = parser.parse_args()
    filename = args.filename
    k = args.k

    df = pd.read_csv(filename, converters={'date_time': parse_dates})
    date_time = df['date_time']
    # ``axis`` must be passed by keyword on current pandas.
    df = df.drop('date_time', axis=1)

    start = time.time()
    k_means = KMeans(init='random', n_clusters=k).fit(df)  # init='k-means++'
    print("[KMEANS] Finish all in {} seconds".format(time.time() - start))

    # Give the centers a deterministic order by sorting whole rows
    # (first feature, then second, ...). Sorting each column on its own
    # would mix coordinates of different centers.
    centers = k_means.cluster_centers_
    k_means_cluster_centers = centers[np.lexsort(centers.T[::-1])]
    k_means_labels = pairwise_distances_argmin(df.values,
                                               k_means_cluster_centers)

    df['date_time'] = date_time
    df['cluster'] = k_means_labels
    output_name = "/var/www/project/k_means_result_{}.txt".format(k)
    transform_save(df, output_name)
def simple_center_adjustment(self, cluster_labels, cluster_sizes):
    """Move points to their nearest cluster center while capacity allows.

    Works on copies of both inputs. Points are visited cluster by cluster
    in the order of ``cluster_sizes.index``; a point is reassigned when a
    different center is nearest and that cluster holds fewer than
    ``self.cluster_max_size`` points.

    Returns the adjusted ``(labels, sizes)`` pair.
    """
    labels = cluster_labels.copy()
    sizes = cluster_sizes.copy()

    # One geographic center per cluster id, computed from the input labels.
    centers = [
        self.calculate_geo_cluster_center(cluster_labels, cluster_id)
        for cluster_id in list(range(self.num_cluster))
    ]
    nearest_center = pairwise.pairwise_distances_argmin(
        self.geo_data, centers)

    # Visiting order: every member of the first cluster in
    # cluster_sizes.index, then the second, and so on.
    visit_order = []
    for cluster_id in cluster_sizes.index:
        visit_order.extend(list(np.where(labels == cluster_id)[0]))

    for point in visit_order:
        target = nearest_center[point].astype(int)
        current = labels[point].astype(int)
        if current == target:
            continue
        if sizes[target] < self.cluster_max_size:
            sizes[target] += 1
            sizes[current] -= 1
            labels[point] = target
    return labels, sizes
def filter_level_pruning(percents):
    """Replace conv filters by their KMeans cluster centers and validate.

    For every parameter whose name contains "conv", the filters are
    flattened to one row per output channel, clustered with KMeans, and
    each filter is overwritten with the center of its cluster. The
    per-layer cluster counts come from ``calc_num_clusters``.

    Parameters
    ----------
    percents
        Passed through to ``calc_num_clusters``.

    Returns
    -------
    Whatever ``validate`` returns for the modified model on the CIFAR-10
    test split.
    """
    global args, best_prec1
    args = parser.parse_args()

    model = torch.nn.DataParallel(resnet.__dict__[args.arch]())
    model.cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            # Report the file that was loaded.
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    val_loader = torch.utils.data.DataLoader(datasets.CIFAR10(
        root='./data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
        download=True),
                                             batch_size=128,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    criterion = nn.CrossEntropyLoss().cuda()

    num_clusters_full = calc_num_clusters(model, None, percents)
    print(num_clusters_full)

    # ``i`` indexes num_clusters_full and advances once per conv parameter.
    i = 0
    with torch.no_grad():
        for name, param in model.named_parameters():
            if "conv" in name:
                num_channels, b, c, d = param.shape
                num_clusters = num_clusters_full[i]
                # One row per output channel.
                X = param.view(num_channels, -1).cpu()
                k_means = KMeans(init='k-means++',
                                 n_clusters=num_clusters,
                                 n_init=10)
                k_means.fit(X)
                cluster_centers = torch.from_numpy(k_means.cluster_centers_)
                cluster_ids_x = pairwise_distances_argmin(X, cluster_centers)
                # Overwrite every filter with the center of its cluster.
                for n in range(num_channels):
                    param[n] = cluster_centers[cluster_ids_x[n]].view(b, c, d)
                i += 1
    return validate(val_loader, model, criterion)
def nearest_neighbor(s_samples, s_classes):
    """Find the closest class vector for every sample.

    :param s_samples: (n_samples, attributes_dim)
    :param s_classes: (n_classes, attributes_dim)
    :return: (n_samples, ), for each sample the index of the class that is
        nearest to it in the S space
    """
    return np.array(pairwise_distances_argmin(s_samples, s_classes))
def array_to_label(X_array, n_clusters=3, batch_size=45):
    """Label every row of ``X_array`` with a MiniBatchKMeans cluster index.

    Parameters
    ----------
    X_array : 2-D array
        Samples in rows.
    n_clusters : int, default 3
        Number of clusters to fit.
    batch_size : int, default 45
        Mini-batch size of the estimator (previously fixed at this value).

    Returns
    -------
    numpy.ndarray
        For each row, the index of its nearest center, where centers are
        numbered in ascending row order (first feature, then second, ...).
    """
    mbk = MiniBatchKMeans(init='k-means++',
                          n_clusters=n_clusters,
                          batch_size=batch_size,
                          n_init=10,
                          max_no_improvement=10,
                          verbose=0)
    mbk.fit(X_array)

    # Order whole center rows. Sorting each column independently would
    # combine coordinates of different centers whenever there is more than
    # one feature. For single-feature data both orderings coincide.
    centers = mbk.cluster_centers_
    mbk_means_cluster_centers = centers[np.lexsort(centers.T[::-1])]
    mbk_means_labels = pairwise_distances_argmin(X_array,
                                                 mbk_means_cluster_centers)
    return mbk_means_labels
def test_pairwise_distances_argmin_min():
    """Check pairwise minimum distances computation for any metric."""
    X = [[0], [1]]
    Y = [[-1], [2]]

    # Expected nearest indices and distances for the toy data.
    nearest_expected = [0, 1]
    distance_expected = [1., 1.]

    # sklearn metrics: euclidean, then a non-euclidean one
    for metric_name in ("euclidean", "manhattan"):
        D, E = pairwise_distances_argmin_min(X, Y, metric=metric_name)
        D2 = pairwise_distances_argmin(X, Y, metric=metric_name)
        assert_array_almost_equal(D, nearest_expected)
        assert_array_almost_equal(D2, nearest_expected)
        assert_array_almost_equal(E, distance_expected)

    # Non-euclidean Scipy distance: callable form, then string form
    for scipy_metric in (minkowski, "minkowski"):
        D, E = pairwise_distances_argmin_min(X, Y, metric=scipy_metric,
                                             metric_kwargs={"p": 2})
        assert_array_almost_equal(D, nearest_expected)
        assert_array_almost_equal(E, distance_expected)

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dense = pairwise_distances(X, Y, metric="manhattan")
    ref_ind = dense.argmin(axis=0)
    ref_val = dense[ref_ind, range(len(ref_ind))]

    got_ind, got_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(ref_ind, got_ind, decimal=7)
    np.testing.assert_almost_equal(ref_val, got_val, decimal=7)
def get_best_scheme(self, question, scheme_name):
    """Return the id of the stored scheme entry closest to the question.

    The candidates are the embeddings loaded for ``scheme_name``; the
    question is embedded and compared with them using the default metric
    of the distance helper.
    """
    scheme_ids, scheme_embeddings = self.__load_embeddings_by_scheme(scheme_name)

    embedded = question_to_vec(question, self.word_embeddings,
                               self.embeddings_dim)
    # A single query has to be passed as one row.
    query = embedded.reshape(1, -1)

    nearest = pairwise_distances_argmin(query, scheme_embeddings)
    return scheme_ids[nearest[0]]
def find_clusters(x, n):
    """Cluster ``x`` into ``n`` groups and label every row.

    Parameters
    ----------
    x : 2-D array
        Samples in rows.
    n : int
        Number of clusters.

    Returns
    -------
    (centers, indexes)
        The fitted centers in ascending row order (first feature, then
        second, ...) and, for each row of ``x``, the index of its nearest
        center in that ordering.
    """
    # find the clusters
    k_means = KMeans(n_clusters=n, n_init=100)
    k_means.fit(x)

    # Order whole center rows. Sorting each column independently would
    # combine coordinates of different centers for multi-feature data.
    # For a single feature both orderings coincide.
    centers = k_means.cluster_centers_
    k_means_centers = centers[np.lexsort(centers.T[::-1])]

    # determine which elements fall into each cluster
    k_means_indexes = pairwise_distances_argmin(x, k_means_centers)
    return k_means_centers, k_means_indexes
def get_best_thread(self, question, tag_name):
    """Return the id of the thread most similar to the question.

    The search is performed across the threads with a given tag, using
    the cosine metric.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

    embedded = question_to_vec(question, self.word_embeddings,
                               self.embeddings_dim)
    # Add a leading axis so the vector is a one-row query.
    query = embedded[None, :]

    nearest = pairwise_distances_argmin(query, thread_embeddings,
                                        metric='cosine')
    winner = nearest[0]
    return thread_ids[winner]
def get_best_category(self, question, category_name):
    """Return the id of the stored category entry closest to the question.

    The candidates are the embeddings loaded for ``category_name``; the
    question is embedded and compared with them using the default metric
    of the distance helper.
    """
    category_ids, category_embeddings = self.__load_embeddings_by_category(category_name)

    embedded = question_to_vec(question, self.word_embeddings,
                               self.embeddings_dim)
    # A single query has to be passed as one row.
    query = embedded.reshape(1, -1)

    nearest = pairwise_distances_argmin(query, category_embeddings)
    return category_ids[nearest[0]]
def find_nearest(self, X, means):
    """Group the entries of ``X`` by their nearest entry of ``means``.

    Returns a list with one bucket per mean; bucket ``j`` holds, in input
    order, every ``X[i]`` whose nearest mean is ``means[j]``.
    """
    buckets = [[] for _ in means]
    assignments = pairwise_distances_argmin(X, means)
    for position, bucket_no in enumerate(assignments):
        buckets[bucket_no].append(X[position])
    return buckets
def _k_means_discriminator(self, batch_size=45):
    """Split the log10 of ``self.ax`` into two clusters.

    The values are flattened to a single column, clustered with a
    two-cluster MiniBatchKMeans, and every value is labelled with the
    index of its nearest center after the centers were sorted in ascending
    order. The labels are stored in ``self.clusters``.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.metrics.pairwise import pairwise_distances_argmin

    clusterer = MiniBatchKMeans(
        init='k-means++',
        n_clusters=2,
        batch_size=batch_size,
        n_init=10,
        max_no_improvement=10,
        verbose=0,
    )
    log_values = np.log10(self.ax.reshape(-1, 1))
    clusterer.fit(log_values)

    # Single column, so this simply orders the two centers low to high.
    ordered_centers = np.sort(clusterer.cluster_centers_, axis=0)
    self.clusters = pairwise_distances_argmin(log_values, ordered_centers)
def patch_partition(self, patch_list, n_clusters=30, patch_size=(21, 21)):
    """Cluster a list of patches and return the centers and the labels.

    :param patch_list: patches to cluster; each is flattened to
        ``self.patch_size * self.patch_size * self.patch_depth`` values
    :param n_clusters: number of KMeans clusters
    :param patch_size: (height, width) used to reshape the returned centers
    :return: (centers reshaped to (-1, patch_size[0], patch_size[1]),
        label of every patch); labels index the centers in ascending row
        order
    """
    patch_data = np.array(patch_list)
    patch_data = np.reshape(
        patch_data,
        (-1, self.patch_size * self.patch_size * self.patch_depth))

    k_means = KMeans(n_clusters=n_clusters)
    t0 = time.time()
    k_means.fit(patch_data)

    # Order whole center rows. Sorting each column independently would
    # assemble every "center" from pixels of different real centers.
    centers = k_means.cluster_centers_
    k_means_cluster_centers = centers[np.lexsort(centers.T[::-1])]
    k_means_labels = pairwise_distances_argmin(patch_data,
                                               k_means_cluster_centers)

    t_batch = time.time() - t0
    print("%d个patch进行Kmean聚类,分成%d类,耗时%d秒" %
          (patch_data.shape[0], n_clusters, t_batch))
    return np.reshape(np.array(k_means_cluster_centers),
                      (-1, patch_size[0], patch_size[1])), k_means_labels
def _k_means_discriminator(self, batch_size=45):
    """Cluster ``log10(self.ax)`` into ``self.nsources + 1`` groups.

    The cluster whose center has the smallest row sum is treated as
    "silence" and placed first. Results are stored on the instance:
    ``self.centers`` holds the ordered centers and ``self.clusters`` the
    index of the nearest center for every row of the clustered data.

    :param batch_size: mini-batch size forwarded to MiniBatchKMeans.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.metrics.pairwise import pairwise_distances_argmin

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=self.nsources+1,
                          batch_size=batch_size, n_init=10,
                          max_no_improvement=10, verbose=0)
    X = np.log10(self.ax)
    mbk.fit(X)

    cc = np.zeros(mbk.cluster_centers_.shape)

    # index of cluster corresponding to silence
    idx_silence = np.argmin(np.sum(mbk.cluster_centers_, axis=1))
    cc[0, :] = mbk.cluster_centers_[idx_silence, :]

    # A real list is required: on Python 3 ``range`` objects have no
    # ``remove`` method (this is a no-op change on Python 2).
    idx_free = list(range(cc.shape[0]))
    idx_free.remove(idx_silence)

    # centers expressed relative to the silence center
    cred = mbk.cluster_centers_ - cc[0, :]

    # remaining indexes, sort them by channel
    used_chan = []
    # NOTE(review): never incremented, so every unmatched cluster is written
    # to the same (last) row - confirm whether a ``+= 1`` is missing.
    last_unmatched = 0
    while idx_free:
        crem = cred[idx_free, :]
        # strongest remaining (cluster, channel) entry
        r, idx_chan = np.unravel_index(crem.argmax(), crem.shape)
        idx_center = idx_free[r]
        if idx_chan not in used_chan:
            this_center = idx_chan + 1
        else:
            # append to end of list
            this_center = cc.shape[0] - last_unmatched - 1
            sys.stderr.write('Cluster {} not matched to channel\n'.format(idx_center))
        cc[this_center, :] = mbk.cluster_centers_[idx_center, :]
        used_chan.append(idx_chan)
        idx_free.remove(idx_center)

    # NOTE(review): this overwrites rows 1.. with the non-silence centers in
    # their original order, discarding the channel ordering computed by the
    # loop above. Kept as in the original; confirm which ordering is wanted.
    cc[1:, :] = np.delete(mbk.cluster_centers_, idx_silence, axis=0)

    self.clusters = pairwise_distances_argmin(X, cc)
    self.centers = cc
cmap=plt.cm.Paired, aspect='auto', origin='lower') plt.plot(tsne_emb[:, 0], tsne_emb[:, 1], 'k.', markersize=2) # Plot the centroids as a white X centroids = k_means.cluster_centers_ plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10) k_means_cluster_centers = np.sort(k_means.cluster_centers_, axis=0) k_means_labels = pairwise_distances_argmin(tsne_emb, k_means_cluster_centers) # KMeans # for k in range(n_clusters): # my_members = k_means_labels == k # cluster_center = k_means_cluster_centers[k] # plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=10) mc_words = frequency.most_common(200) mc_words = [w[0] for w in mc_words] final_points = [] final_voc = [] for word, values in dico.items():
def Gross_K_means(Data_list, n_clusters, n_init, max_iter, weight):
    """Plot the input data, cluster it with K-means and plot the clusters.

    The first entry of ``Data_list`` is skipped; the remaining entries are
    read as rows 0..6 of an array and turned into 7-column data points where
    columns 3..6 are multiplied by ``weight``. Judging by the plot titles the
    columns are: Score1, Score2, Norm_totalAction, user degree (abs),
    user degree (weighted), tag degree (abs), tag degree (weighted).

    Two figures are produced and saved:
    ``../output/Tag_DataDemo.png`` (raw data) and
    ``../output/Tag_Kmeans_N{n_clusters}_W{weight}.png`` (clustering result).

    NOTE: this function uses Python 2 ``print`` statements.

    :param Data_list: input rows; ``Data_list[1:]`` must provide at least
        seven equally long rows.
    :param n_clusters: number of K-means clusters.
    :param n_init: ``n_init`` forwarded to KMeans.
    :param max_iter: ``max_iter`` forwarded to KMeans.
    :param weight: multiplier applied to the four degree columns and to the
        tick labels of the degree axes.
    """
    # top 11 colors; zip() below stops at the shorter of (n_clusters, 11)
    colors = ['firebrick', 'orange', 'red', 'yellow','green', 'tan', 'skyblue', 'blue', 'violet', 'grey','magenta']
    ##############################################################################
    # convert to numpy array (the first entry of Data_list is dropped)
    temp_data = Data_list
    Data_TH = np.array(temp_data[1:][:])
    # rewrite array format into data points: one row per sample,
    # degree columns (3..6) scaled by `weight`
    DataPoints = []
    for i in range(len(Data_TH[0])):
        DataPoints.append([Data_TH[0,i],Data_TH[1,i],Data_TH[2,i],weight*Data_TH[3,i],weight*Data_TH[4,i], weight*Data_TH[5,i],weight*Data_TH[6,i]])
    DataPoints = np.array(DataPoints)
    ##############################################################################
    # plot data for demonstration
    fig = plt.figure(figsize=(16, 16))
    fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
    # Score1 and Score2
    ax = fig.add_subplot(2, 3, 1)
    ax.plot(DataPoints[:, 0], DataPoints[:, 1], 'w', markerfacecolor='blue', marker='.')
    ax.set_title('Score1 vs Score2')
    label_range = [i-2 for i in range(14)]
    ax.set_xticks(label_range)
    ax.set_xticklabels(label_range)
    ax.set_yticks(label_range)
    ax.set_yticklabels(label_range)
    # user_degree_abs and user_degree_weighted
    ax = fig.add_subplot(2, 3, 2)
    ax.plot(DataPoints[:, 3], DataPoints[:, 4], 'w', markerfacecolor='blue', marker='.')
    ax.set_title('User_Degree_abs vs User_Degree_weighted')
    label_range = [weight*(i+2) for i in range(10)]
    ax.set_xticks(label_range)
    ax.set_xticklabels(label_range)
    ax.set_yticks(label_range)
    ax.set_yticklabels(label_range)
    # Tag_Degree_abs and Tag_Degree_weighted
    ax = fig.add_subplot(2, 3, 3)
    ax.plot(DataPoints[:, 5], DataPoints[:, 6], 'w', markerfacecolor='blue', marker='.')
    ax.set_title('Tag_Degree_abs vs Tag_Degree_weighted')
    label_range = [weight*(i+2) for i in range(10)]
    ax.set_xticks(label_range)
    ax.set_xticklabels(label_range)
    ax.set_yticks(label_range)
    ax.set_yticklabels(label_range)
    # Score1 and Norm_totalAction
    ax = fig.add_subplot(2, 3, 4)
    ax.plot(DataPoints[:, 0], DataPoints[:, 2], 'w', markerfacecolor='blue', marker='.')
    ax.set_title('Score1 vs Norm_totalAction')
    label_x = [i-2 for i in range(14)]
    label_y = [i+2 for i in range(10)]
    ax.set_xticks(label_x)
    ax.set_xticklabels(label_x)
    ax.set_yticks(label_y)
    ax.set_yticklabels(label_y)
    # Score2 and Norm_totalAction
    ax = fig.add_subplot(2, 3, 5)
    ax.plot(DataPoints[:, 1], DataPoints[:, 2], 'w', markerfacecolor='blue', marker='.')
    ax.set_title('Score2 vs Norm_totalAction')
    label_x = [i-2 for i in range(14)]
    label_y = [i+2 for i in range(10)]
    ax.set_xticks(label_x)
    ax.set_xticklabels(label_x)
    ax.set_yticks(label_y)
    ax.set_yticklabels(label_y)
    # tag_degree_abs and user_degree_abs
    ax = fig.add_subplot(2, 3, 6)
    ax.plot(DataPoints[:, 5], DataPoints[:, 3], 'w', markerfacecolor='blue', marker='.')
    ax.set_title('tag_degree_abs vs user_degree_abs')
    label_x = [weight*(i+2) for i in range(10)]
    label_y = [weight*(i+2) for i in range(10)]
    ax.set_xticks(label_x)
    ax.set_xticklabels(label_x)
    ax.set_yticks(label_y)
    ax.set_yticklabels(label_y)
    plt.savefig('../output/Tag_DataDemo.png')
    plt.show()
    ##############################################################################
    ##############################################################################
    # K-means with Scores ONLY (columns 0 and 1)
    print "K-means with Scores ONLY"
    k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=n_init, max_iter = max_iter)
    k_means.fit(DataPoints[:,:2])
    Scores_labels = k_means.labels_
    Scores_cluster_centers = k_means.cluster_centers_
    Scores_labels_unique = np.unique(Scores_labels)
    # K-means with Scores and user_degree_abs and tag_degree_abs
    # (same estimator object is re-fitted on columns 0, 1, 3, 5)
    print "K-means with Scores and user_degree_abs and tag_degree_abs"
    index = [0,1,3,5]
    k_means.fit(DataPoints[:,index])
    Scores_bothDegree_labels = k_means.labels_
    Scores_bothDegree_centers = k_means.cluster_centers_
    Scores_bothDegree_unique = np.unique(Scores_bothDegree_labels)
    ##############################################################################
    # We want to have the same colors for the same cluster in both
    # clusterings. Pair each scores-only center with the closest
    # scores+degrees center, comparing on the two score coordinates only.
    # NOTE(review): argmin pairing is not guaranteed to be one-to-one.
    order_STU = pairwise_distances_argmin(Scores_cluster_centers, Scores_bothDegree_centers[:,0:2])
    print "\nScores_cluster_centers:\n ", Scores_cluster_centers
    print "\nScores_bothDegree_centers:\n ", Scores_bothDegree_centers[order_STU]
    ##############################################################################
    ##############################################################################
    index = [0,1,3,5]
    # PCA on the score and absolute-degree columns (0, 1, 3, 5)
    pca = PCA(n_components=4)
    X_r = pca.fit(DataPoints[:,index]).transform(DataPoints[:,index])
    ##############################################################################
    ##############################################################################
    # Plot result
    fig = plt.figure(figsize=(16, 16))
    fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
    # Scores: full data set
    ax = fig.add_subplot(2, 3, 1)
    for k, col in zip(range(n_clusters), colors):
        # NOTE(review): my_members / cluster_center are computed but unused;
        # every iteration draws the complete data set in blue.
        my_members = Scores_labels == k
        cluster_center = Scores_cluster_centers[k]
        ax.plot(DataPoints[:, 0], DataPoints[:, 1], 'o', markerfacecolor='blue', markeredgecolor='k', markersize=6)
    ax.set_title('Full Data Set, Score1 vs Score2')
    label_range = [i-2 for i in range(14)]
    ax.set_xticks(label_range)
    ax.set_xticklabels(label_range)
    ax.set_yticks(label_range)
    ax.set_yticklabels(label_range)
    # Scores and both_Degrees clustering, shown in the score plane
    ax = fig.add_subplot(2, 3, 2)
    for k, col in zip(range(n_clusters), colors):
        my_members = Scores_bothDegree_labels == order_STU[k]
        cluster_center = Scores_bothDegree_centers[order_STU[k]]
        ax.plot(DataPoints[my_members, 0], DataPoints[my_members, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
    ax.set_title('K-Means by Scores & Degrees, Score1 vs Score2')
    label_range = [i-2 for i in range(14)]
    ax.set_xticks(label_range)
    ax.set_xticklabels(label_range)
    ax.set_yticks(label_range)
    ax.set_yticklabels(label_range)
    # Scores and both_Degrees clustering, user_degree vs tag_degree
    ax = fig.add_subplot(2, 3, 3)
    for k, col in zip(range(n_clusters), colors):
        my_members = Scores_bothDegree_labels == order_STU[k]
        cluster_center = Scores_bothDegree_centers[order_STU[k]]
        ax.plot(DataPoints[my_members, 3], DataPoints[my_members, 5], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
    ax.set_title('K-Means by Scores & Degrees, user_degree vs tag_degree')
    label_range = [weight*(i+2) for i in range(10)]
    ax.set_xticks(label_range)
    ax.set_xticklabels(label_range)
    ax.set_yticks(label_range)
    ax.set_yticklabels(label_range)
    # Full data set, user_degree vs tag_degree
    ax = fig.add_subplot(2, 3, 4)
    for k, col in zip(range(n_clusters), colors):
        # NOTE(review): as in panel 1, the mask is unused and the full data
        # set is redrawn in blue on every iteration.
        my_members = Scores_labels == k
        ax.plot(DataPoints[:, 3], DataPoints[:, 5], 'o', markerfacecolor='blue', markeredgecolor='k', markersize=6)
    ax.set_title('Full Data Set, user_degree vs tag_degree')
    # Scores and both_Degrees clustering, PCA component 1 vs 2
    ax = fig.add_subplot(2, 3, 5)
    for k, col in zip(range(n_clusters), colors):
        my_members = Scores_bothDegree_labels == order_STU[k]
        ax.plot(X_r[my_members, 0], X_r[my_members, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
    ax.set_title('K-Means by Score & Degrees, PCA compnent 1 vs 2')
    # Full data set, PCA component 1 vs 2
    ax = fig.add_subplot(2, 3, 6)
    ax.plot(X_r[:, 0], X_r[:, 1], 'o', markerfacecolor='blue', markeredgecolor='k', markersize=6)
    ax.set_title('Full Data Set, PCA compnent 1 vs 2')
    ####################################################################
    plt.savefig('../output/Tag_Kmeans_N{}_W{}.png'.format(n_clusters, weight))
    plt.show()
def _k_mean(self,samples,pits):
    """Cluster ``samples`` with KMeans and MiniBatchKMeans and report per pit.

    Side effects visible in this method:

    * ``self._scores`` is set to the KMeans prediction for every sample.
    * ``self.result`` is set to a frame with, per pit, the sum (``scores``)
      and the number (``count``) of KMeans labels; it is written to
      ``final_result.csv`` in ``OUTPUT_DIR``.
    * The per-sample label/pit tables of both algorithms are written to
      ``pre_result_simple_k_means_freedman.csv`` and
      ``pre_result_batch_k_means_freedman.csv`` in ``OUTPUT_DIR``.
    * Both clusterings are plotted side by side and shown.

    NOTE: this method uses Python 2 ``print`` statements.

    :param samples: data to cluster; columns 0 and 1 are used for plotting.
    :param pits: one pit identifier per sample (paired with the labels by
        position via a shared 1-based index).
    """
    k_means = KMeans(init='k-means++', n_clusters=N_CLUSTERS, n_init=100)
    k_means.fit(samples)
    k_means_labels = k_means.labels_
    k_means_cluster_centers = k_means.cluster_centers_
    classif=k_means.predict(samples)
    ####SAVE THE MODEL
    #joblib.dump(k_means, 'kmean.pkl')
    ###RELOAD THE MODEL
    #k_means = joblib.load('kmean.pkl')
    batch_size = 100
    self._scores = classif
    # Label and pit columns share a 1-based index so concat aligns them
    # row by row.
    classif_df=pd.DataFrame(classif,index=np.arange(1,len(classif)+1))
    classif_df.columns = ['scores']
    pit_df=pd.DataFrame(pits,index=np.arange(1,len(pits)+1))
    pit_df.columns = ['pits']
    result_per_pit = pd.concat([classif_df, pit_df], axis=1,verify_integrity=False)
    # Per pit: sum of cluster labels and number of samples.
    self.result = pd.concat([result_per_pit.groupby('pits')['scores'].sum(), result_per_pit.groupby('pits')['scores'].count()], axis=1,verify_integrity=False)
    self.result.columns = ['scores','count']
    self.result.to_csv(os.path.join(OUTPUT_DIR, "final_result.csv"),sep=",")
    result_per_pit.to_csv(os.path.join(OUTPUT_DIR, "pre_result_simple_k_means_freedman.csv"),sep=",")
    # Rebuild the table with a 0-based index before grouping for display.
    result_per_pit= result_per_pit[['scores','pits']].values
    result_per_pit_df=pd.DataFrame(result_per_pit,index=np.arange(len(result_per_pit)))
    result_per_pit_df.columns =['scores','pits']
    grouped = result_per_pit_df.groupby('pits')
    for pit,cluster in grouped:
        print cluster
    print result_per_pit_df
    ############################################################################################
    # Compute clustering with MiniBatchKMeans
    ############################################################################################
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=N_CLUSTERS, batch_size=batch_size,
                          n_init=100, max_no_improvement=10, verbose=0)
    t0 = time()
    mbk.fit(samples)
    t_mini_batch = time() - t0
    mbk_means_labels = mbk.labels_
    mbk_means_cluster_centers = mbk.cluster_centers_
    mbk_means_labels_unique = np.unique(mbk_means_labels)
    print mbk.labels_
    classif=mbk.predict(samples)
    print classif
    # Same label/pit table as above, this time for the mini-batch result.
    classif_df=pd.DataFrame(classif,index=np.arange(1,len(classif)+1))
    classif_df.columns = ['scores']
    pit_df=pd.DataFrame(pits,index=np.arange(1,len(pits)+1))
    pit_df.columns = ['pits']
    result_per_pit = pd.concat([classif_df, pit_df], axis=1,verify_integrity=False)
    print "result per pit in Mini Batch KMeans function"
    print result_per_pit
    result_per_pit.to_csv(os.path.join(OUTPUT_DIR, "pre_result_batch_k_means_freedman.csv"),sep=",")
    result_per_pit= result_per_pit[['scores','pits']].values
    result_per_pit_df=pd.DataFrame(result_per_pit,index=np.arange(len(result_per_pit)))
    result_per_pit_df.columns =['scores','pits']
    grouped = result_per_pit_df.groupby('pits')
    for pit,cluster in grouped:
        print cluster
    print result_per_pit_df
    ##############################################################################
    # Plot result
    fig = plt.figure(figsize=(8, 3))
    fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
    # NOTE(review): the third and fourth colors are identical, so clusters
    # 2 and 3 cannot be told apart in the plot.
    colors = ['#4EACC5', '#FF9C34', '#4E9A06','#4E9A06']#,'#555555']
    # KMeans
    ax = fig.add_subplot(1, 3, 1)
    for k, col in zip(range(N_CLUSTERS), colors):
        my_members = k_means_labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(samples[my_members, 0], samples[my_members, 1], 'w', markerfacecolor=col, marker='.')
        ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
    ax.set_title('KMeans')
    ax.set_xticks(())
    ax.set_yticks(())
    # We want to have the same colors for the same cluster from the
    # MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
    # closest one: order[k] is the MiniBatchKMeans center nearest to KMeans
    # center k.
    order = pairwise_distances_argmin(k_means_cluster_centers, mbk_means_cluster_centers)
    # MiniBatchKMeans
    ax = fig.add_subplot(1, 3, 2)
    for k, col in zip(range(N_CLUSTERS), colors):
        my_members = mbk_means_labels == order[k]
        cluster_center = mbk_means_cluster_centers[order[k]]
        ax.plot(samples[my_members, 0], samples[my_members, 1], 'w', markerfacecolor=col, marker='.')
        ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
    ax.set_title('MiniBatchKMeans')
    ax.set_xticks(())
    ax.set_yticks(())
    plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % (t_mini_batch, mbk.inertia_))
    plt.show()
def test_pairwise_distances_argmin_min():
    """Check the argmin / argmin_min helpers for several metrics.

    Small hand-checked inputs are run through dense and sparse code paths
    for the euclidean and manhattan metrics, through both the callable and
    the string form of minkowski, and finally a random problem is compared
    against a naive argmin over the full distance matrix.
    """
    X = [[0], [1]]
    Y = [[-2], [3]]
    X_sparse = dok_matrix(X)
    Y_sparse = csr_matrix(Y, dtype=np.float32)

    want_idx = [0, 1]
    want_dist = [2, 2]
    want_dist_sq = [4, 4]

    for metric in ("euclidean", "manhattan"):
        # dense inputs, both helper functions
        idx, dist = pairwise_distances_argmin_min(X, Y, metric=metric)
        idx_only = pairwise_distances_argmin(X, Y, metric=metric)
        assert_array_almost_equal(idx, want_idx)
        assert_array_almost_equal(idx_only, want_idx)
        assert_array_almost_equal(dist, want_dist)

        # sparse inputs
        idx_sp, dist_sp = pairwise_distances_argmin_min(
            X_sparse, Y_sparse, metric=metric)
        assert_array_almost_equal(idx_sp, want_idx)
        assert_array_almost_equal(dist_sp, want_dist)

        if metric == "euclidean":
            # We don't want np.matrix here
            assert_equal(type(idx_sp), np.ndarray)
            assert_equal(type(dist_sp), np.ndarray)

            # squared euclidean distances
            idx, dist = pairwise_distances_argmin_min(
                X, Y, metric="euclidean", metric_kwargs={"squared": True})
            assert_array_almost_equal(idx, want_idx)
            assert_array_almost_equal(dist, want_dist_sq)

    # Scipy distance given first as a callable, then as a string
    for scipy_metric in (minkowski, "minkowski"):
        idx, dist = pairwise_distances_argmin_min(
            X, Y, metric=scipy_metric, metric_kwargs={"p": 2})
        assert_array_almost_equal(idx, want_idx)
        assert_array_almost_equal(dist, want_dist)

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    full_dist = pairwise_distances(X, Y, metric="manhattan")
    naive_idx = full_dist.argmin(axis=0)
    naive_val = full_dist[naive_idx, np.arange(len(naive_idx))]

    chunked_idx, chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan")
    np.testing.assert_almost_equal(naive_idx, chunked_idx, decimal=7)
    np.testing.assert_almost_equal(naive_val, chunked_val, decimal=7)
def _plot_cluster_panel(ax, points, labels, label_ids, colors, title, centers=None):
    """Draw one cluster scatter panel on ``ax``.

    For every ``(label_id, color)`` pair the rows of ``points`` whose label
    equals ``label_id`` are drawn in that color (columns 0 and 1 of
    ``points``). When ``centers`` is given, row ``i`` of ``centers`` is drawn
    as a large marker in the ``i``-th color. Ticks are removed.
    """
    for pos, (label_id, col) in enumerate(zip(label_ids, colors)):
        members = labels == label_id
        ax.plot(points[members, 0], points[members, 1], 'w',
                markerfacecolor=col, marker='.')
        if centers is not None:
            ax.plot(centers[pos, 0], centers[pos, 1], 'o', markerfacecolor=col,
                    markeredgecolor='k', markersize=6)
    ax.set_title(title)
    ax.set_xticks(())
    ax.set_yticks(())


def Gross_K_means():
    """Load tag/step scores from MySQL, cluster them and plot the result.

    Four score columns are read from the ``tag_compare_all`` table. Three
    K-means clusterings are computed (tag scores, step scores, all four
    columns) and compared in a 2x3 figure that is saved to
    ``../output/Gross_Kmeans_setNcluster_{n_clusters}.png``.

    :return: path of the saved figure.
    """
    #######################################################################
    MySQL_DBkey2 = {'host':'localhost', 'user':'', 'password':'', 'db':'','charset':'utf8mb4'}
    # command
    comd_Score_TH = (
        "select tagScore_T, tagSCore_H, step1_score_t, step1_score_h\n"
        "from tag_compare_all\n"
        "where step1_score_t > 0 or step1_score_h > 0 or tagScore_t >= 5 or tagScore_h >= 5;\n"
    )
    columns = ('tagScore_T', 'tagSCore_H', 'step1_score_t', 'step1_score_h')

    # Connect to the database
    connection = pymysql.connect(host=MySQL_DBkey2['host'],
                                 user=MySQL_DBkey2['user'],
                                 password=MySQL_DBkey2['password'],
                                 db=MySQL_DBkey2['db'],
                                 charset=MySQL_DBkey2['charset'],
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            cursor.execute(comd_Score_TH)
            # result is a list of dicts keyed by column name
            result = cursor.fetchall()
    finally:
        # Close in ``finally`` so the connection is released even when the
        # query raises.
        connection.close()

    # one list per selected column, in ``columns`` order
    temp_data = [[item[name] for item in result] for name in columns]

    #######################################################################
    # data check
    Data_TH = np.array(temp_data)
    for row_x, row_y in ((0, 1), (2, 3)):
        plt.scatter(Data_TH[row_x], Data_TH[row_y], color='black')
        axes = plt.gca()
        axes.set_xlim([-1,11])
        axes.set_ylim([-1,11])
        plt.show()

    ##################################################################
    # rewrite array format into data points (one row per sample);
    # x trump, y hillary
    Data_tagScore_TH = Data_TH[0:2].T.copy()    # only tagScores
    Data_stepScore_TH = Data_TH[2:4].T.copy()   # only StepScores
    Data_4D_TH = Data_TH.T.copy()               # tagscore then stepscore

    # re-fill empty (== 0) step-score entries with the matching tag score
    for col in (0, 1):
        empty = Data_stepScore_TH[:, col] == 0
        Data_stepScore_TH[empty, col] = Data_tagScore_TH[empty, col]
        Data_4D_TH[empty, col + 2] = Data_tagScore_TH[empty, col]

    ##############################################################################
    # post refill data check
    fig = plt.figure()
    refill_panels = (
        (Data_tagScore_TH, 'tagScore'),
        (Data_stepScore_TH, 'StepScore'),
        (Data_4D_TH[:, 2:4], 'StepScore'),
    )
    for panel_no, (points, title) in enumerate(refill_panels, start=1):
        ax = fig.add_subplot(3, 1, panel_no)
        ax.plot(points[:, 0], points[:, 1], 'w', markerfacecolor='blue', marker='.')
        ax.set_title(title)
        ax.set_xticks(())
        ax.set_yticks(())
    plt.show()

    ##############################################################################
    # K-means
    n_clusters = 8
    n_init = 500
    max_iter = 500
    # top 11 colors
    colors = ['firebrick', 'red', 'orange', 'yellow','tan', 'green', 'skyblue', 'blue', 'violet', 'magenta','black']

    # The same estimator is re-fitted three times; labels and centers are
    # captured after each fit.
    k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=n_init, max_iter = max_iter)

    # Compute tagScores with K-means
    k_means.fit(Data_tagScore_TH)
    TS_labels = k_means.labels_
    TS_cluster_centers = k_means.cluster_centers_

    # Compute StepScores with K-means
    k_means.fit(Data_stepScore_TH)
    SS_labels = k_means.labels_
    SS_cluster_centers = k_means.cluster_centers_

    # Compute in 4D with K-means
    k_means.fit(Data_4D_TH)
    full4D_labels = k_means.labels_
    full4D_cluster_centers = k_means.cluster_centers_

    ##############################################################################
    # We want the same colors for matching clusters across the three
    # clusterings, so pair every tagScore center with the closest center of
    # the other two. The two pairings are kept in separate variables:
    # previously the StepScore pairing was overwritten by the 4D one and the
    # StepScore panel was drawn with the wrong ordering.
    order_SS = pairwise_distances_argmin(TS_cluster_centers, SS_cluster_centers)
    order_4D = pairwise_distances_argmin(TS_cluster_centers, full4D_cluster_centers[:,0:2])
    # Formatted so that the output is the same on Python 2 and Python 3.
    print("tagScore centers:  %s" % (TS_cluster_centers,))
    print("StepScore centers:  %s" % (SS_cluster_centers,))
    print("full 4D centers:  %s" % (full4D_cluster_centers,))

    ##############################################################################
    # Plot result
    fig = plt.figure(figsize=(16, 16))
    fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
    cluster_ids = range(n_clusters)
    centers_4D = full4D_cluster_centers[order_4D]

    # tagScore
    _plot_cluster_panel(fig.add_subplot(2, 3, 1), Data_tagScore_TH, TS_labels,
                        cluster_ids, colors, 'tagScore',
                        centers=TS_cluster_centers)
    # StepScore, colored like the paired tagScore cluster
    _plot_cluster_panel(fig.add_subplot(2, 3, 2), Data_stepScore_TH, SS_labels,
                        order_SS, colors, 'StepScore',
                        centers=SS_cluster_centers[order_SS])
    # migration: tagScore clusters shown in the StepScore frame
    _plot_cluster_panel(fig.add_subplot(2, 3, 3), Data_4D_TH[:, 2:4], TS_labels,
                        cluster_ids, colors,
                        'Step.S clusters migrating in Step.S frame')
    # 4D, TagScore
    _plot_cluster_panel(fig.add_subplot(2, 3, 4), Data_4D_TH[:, 0:2], full4D_labels,
                        order_4D, colors, 'full 4D, in TagScore',
                        centers=centers_4D[:, 0:2])
    # 4D, StepScore
    _plot_cluster_panel(fig.add_subplot(2, 3, 5), Data_4D_TH[:, 2:4], full4D_labels,
                        order_4D, colors, 'full 4D, in StepScore',
                        centers=centers_4D[:, 2:4])

    ####################################################################
    # samples whose tagScore cluster and paired 4D cluster disagree
    ax = fig.add_subplot(2, 3, 6)
    different = np.zeros(full4D_labels.shape, dtype=bool)
    for k in cluster_ids:
        different |= ((TS_labels == k) != (full4D_labels == order_4D[k]))
    identic = np.logical_not(different)
    ax.plot(Data_4D_TH[identic, 0], Data_4D_TH[identic, 1], 'w', markerfacecolor='#bbbbbb', marker='.')
    ax.plot(Data_4D_TH[different, 0], Data_4D_TH[different, 1], 'w', markerfacecolor='m', marker='.')
    ax.set_title('4D.S diff Tag.S in Tag.S frame')
    ax.set_xticks(())
    ax.set_yticks(())

    out_path = '../output/Gross_Kmeans_setNcluster_{}.png'.format(n_clusters)
    plt.savefig(out_path)
    plt.show()
    ####################################################################
    return out_path
mbk.fit(X)
t_mini_batch = time.time() - t0

# #############################################################################
# Plot result

fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
# The fitted centers are used as they are: np.sort(..., axis=0) would sort
# each coordinate column independently and could combine coordinates of
# different centroids into points that are not cluster centers. The pairing
# between the two models is done by ``order`` below.
k_means_cluster_centers = k_means.cluster_centers_
mbk_means_cluster_centers = mbk.cluster_centers_
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
mbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)
# order[k] is the MiniBatchKMeans center closest to KMeans center k
order = pairwise_distances_argmin(k_means_cluster_centers,
                                  mbk_means_cluster_centers)

# KMeans
ax = fig.add_subplot(1, 3, 1)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
ax.set_xticks(())
# 预测分类 norm1 = multivariate_normal(mu1, sigma1) norm2 = multivariate_normal(mu2, sigma2) tau1 = norm1.pdf(data) tau2 = norm2.pdf(data) fig = plt.figure(figsize=(10, 5), facecolor='w') ax = fig.add_subplot(121, projection='3d') ax.scatter(data[:, 0], data[:, 1], data[:, 2], c='b', s=30, marker='o', edgecolors='k', depthshade=True) ax.set_xlabel('X') ax.set_ylabel('Y') ax.set_zlabel('Z') ax.set_title('原始数据', fontsize=15) ax = fig.add_subplot(122, projection='3d') order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='euclidean') print(order) if order[0] == 0: c1 = tau1 > tau2 else: c1 = tau1 < tau2 c2 = ~c1 acc = np.mean(y == c1) print('准确率:%.2f%%' % (100*acc)) ax.scatter(data[c1, 0], data[c1, 1], data[c1, 2], c='r', s=30, marker='o', edgecolors='k', depthshade=True) ax.scatter(data[c2, 0], data[c2, 1], data[c2, 2], c='g', s=30, marker='^', edgecolors='k', depthshade=True) ax.set_xlabel('X') ax.set_ylabel('Y') ax.set_zlabel('Z') ax.set_title('EM算法分类', fontsize=15) plt.suptitle('EM算法的实现', fontsize=18)
# Keep the fitted MiniBatchKMeans labels and centers under short names.
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)

##############################################################################
# Plot result

fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one: order[k] is the index of the MiniBatchKMeans center nearest
# to KMeans center k.
order = pairwise_distances_argmin(k_means_cluster_centers, mbk_means_cluster_centers)

# KMeans: draw each cluster's members and its center in the cluster color.
# zip() stops at the shorter of n_clusters and the color list.
ax = fig.add_subplot(1, 3, 1)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w', markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
ax.set_xticks(())
ax.set_yticks(())
# Annotate the panel with the KMeans training time and inertia.
plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % ( t_batch, k_means.inertia_))
y = pd.Categorical(data[4]).codes n_components = 3 feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]] plt.figure(figsize=(8, 6), facecolor='w') for k, pair in enumerate(feature_pairs, start=1): x = x_prime[pair] m = np.array([np.mean(x[y == i], axis=0) for i in range(3)]) # 均值的实际值 print('实际均值 = \n', m) gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0) gmm.fit(x) print('预测均值 = \n', gmm.means_) print('预测方差 = \n', gmm.covariances_) y_hat = gmm.predict(x) order = pairwise_distances_argmin(m, gmm.means_, axis=1, metric='euclidean') print('顺序:\t', order) n_sample = y.size n_types = 3 change = np.empty((n_types, n_sample), dtype=np.bool) for i in range(n_types): change[i] = y_hat == order[i] for i in range(n_types): y_hat[change[i]] = i acc = '准确率:%.2f%%' % (100*np.mean(y_hat == y)) print(acc) cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0', '#A0A0FF']) cm_dark = mpl.colors.ListedColormap(['r', 'g', '#6060FF']) x1_min, x2_min = x.min()