def neuron_spectral_cluster_direct(neuron):
    """
    Perform spectral clustering over neurons in the penultimate (conv5_3) layer.

    Each neuron's activation is flattened into a feature vector, the pairwise
    cosine similarity between neurons is computed, the similarities are
    exponentiated so every affinity is strictly positive, and the resulting
    affinity matrix is split into two clusters.

    :param neuron: [numpy.ndarray] The activation of neurons in the penultimate
        (conv5_3) layer; the first axis indexes neurons, e.g. shape (512, 1, 1)
    :return: [numpy.ndarray] cluster label (0 or 1) of every neuron, one entry
        per neuron (length 512 for a (512, 1, 1) input)
    """
    activations = np.asarray(neuron, dtype=float)
    # One row per neuron.  np.squeeze would collapse (512, 1, 1) to (512,),
    # which breaks the axis-wise norms below, so reshape explicitly instead.
    ui = activations.reshape(activations.shape[0], -1)

    # cosine = <A, B> / (|A| * |B|).  The denominator must be the *outer*
    # product of the row norms; a matmul of the two norm vectors is a scalar.
    norms = np.linalg.norm(ui, 2, axis=1)
    denominator = np.outer(norms, norms)
    # An all-zero activation has no direction: treat its similarity as 0
    # rather than feeding NaN to the eigen-solver.
    denominator[denominator == 0] = np.inf
    cosine_similarity_matrix = np.matmul(ui, ui.T) / denominator  # (n, n)
    cosine_similarity_matrix = np.exp(cosine_similarity_matrix)

    # Perform spectral clustering on the similarity matrix
    sc = SpectralClustering(n_clusters=2, affinity='precomputed', n_init=100)
    sc.fit(cosine_similarity_matrix)
    return sc.labels_
class spectralClustering(BaseEstimator, ClusterMixin, TransformerMixin):
    """Estimator-style wrapper around ``SpectralClustering``.

    Parameters
    ----------
    n_clusters : number of clusters to find.
    gamma : kernel coefficient forwarded to ``SpectralClustering``.
    n_neighbors : neighbour count forwarded to ``SpectralClustering``.

    The hyper-parameters are stored under their constructor names so that
    ``get_params`` / ``set_params`` round-trip; ``k`` remains available as an
    alias of ``n_clusters`` for existing callers.
    """

    def __init__(self, n_clusters=2, gamma=1, n_neighbors=10):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.n_neighbors = n_neighbors

    @property
    def k(self):
        """Backward-compatible alias of ``n_clusters``."""
        return self.n_clusters

    @k.setter
    def k(self, value):
        self.n_clusters = value

    def fit(self, X, y=None):
        """Fit a fresh ``SpectralClustering`` model on ``X`` and return self."""
        # Forward every stored hyper-parameter; previously gamma and
        # n_neighbors were accepted but silently ignored.
        self.cluster = SpectralClustering(n_clusters=self.n_clusters,
                                          gamma=self.gamma,
                                          n_neighbors=self.n_neighbors)
        self.cluster.fit(X)
        return self

    def predict(self, X):
        """Return cluster labels for ``X``.

        Calls ``fit_predict`` on the wrapped model, i.e. the model is re-fitted
        on ``X``.  ``fit`` must have been called first so that ``self.cluster``
        exists.
        """
        return self.cluster.fit_predict(X)

    def get_params(self, deep=True):
        """Return the hyper-parameters keyed by constructor argument name."""
        return {
            "n_clusters": self.n_clusters,
            "gamma": self.gamma,
            "n_neighbors": self.n_neighbors,
        }

    def set_params(self, **parameters):
        """Set hyper-parameters by name and return self."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
def psc_distance_matrix(distance='spearman', clustering='spectral'):
    """Fill ``dist_mat`` column-vs-column, split the columns in two, analyse each.

    Works on the module-level ``df`` (columns are the items being compared) and
    writes into the module-level ``dist_mat``.

    :param distance: ``'spearman'`` stores |Spearman correlation| rounded to 4
        decimals; any other value stores the Euclidean distance.
    :param clustering: ``'spectral'`` uses ``SpectralClustering``; any other
        value uses average-linkage ``AgglomerativeClustering``.  Both receive
        ``dist_mat.values`` as a precomputed matrix.
    :return: None; ``pca_and_conf_matrix_per_group`` is called once per group.
    """
    n_columns = df.shape[1]
    use_spearman = distance == 'spearman'

    for row_pos in range(n_columns):
        for col_pos in range(n_columns):
            left = np.array(df.iloc[:, row_pos]).astype(float)
            right = np.array(df.iloc[:, col_pos]).astype(float)
            if use_spearman:
                # Spearman correlation (absolute value, 4 decimals)
                entry = abs(round(scipy.stats.spearmanr(left, right)[0], 4))
            else:
                # Euclidean distance
                entry = np.linalg.norm(left - right)
            dist_mat.at[df.columns[row_pos], df.columns[col_pos]] = entry

    if clustering == 'spectral':
        model = SpectralClustering(n_clusters=2,
                                   affinity='precomputed',
                                   assign_labels='discretize',
                                   random_state=0)
    else:
        model = AgglomerativeClustering(affinity='precomputed',
                                        linkage='average')
    model.fit(dist_mat.values)

    # Column names per cluster label (exactly two groups are expected).
    bact_label = {0: [], 1: []}
    for position in range(n_columns):
        bact_label[model.labels_[position]].append(df.columns[position])

    for group in (0, 1):
        pca_and_conf_matrix_per_group(df[bact_label[group]])
def spectral_cluster_combined(data: np.ndarray, epi_data: np.ndarray,
                              num_clusters: int):
    """Spectrally cluster the rows of ``epi_data`` and binarize the labels.

    :param data: currently unused (only ``epi_data`` contributes).
    :param epi_data: one row per sample; pairwise Canberra distances between
        rows form the matrix handed to the clusterer.
    :param num_clusters: number of clusters requested.
    :return: whatever ``binarize_vector(labels, num_clusters)`` returns.
    """
    # NOTE(review): this is a *distance* matrix, yet it is passed with
    # affinity="precomputed", which scikit-learn interprets as similarities.
    # Confirm whether a kernel transform was intended.
    canberra = squareform(pdist(epi_data, 'canberra'))
    model = SpectralClustering(n_clusters=num_clusters,
                               affinity="precomputed")
    labels = model.fit(canberra).labels_
    return binarize_vector(labels, num_clusters)
def call_spectral(num_cluster, mode_, data, update_flag):
    """Spectral clustering on a symmetrised 10-nearest-neighbour graph.

    :param num_cluster: number of clusters.
    :param mode_: unused; kept for interface compatibility.
    :param data: feature matrix; standardised before the graph is built.
    :param update_flag: if truthy, return the raw label array only.
    :return: the label array when ``update_flag`` is truthy; otherwise a tuple
        ``(label_dict, unique_dict)`` mapping ``str(position)`` to the label
        (as float) of each sample and of each distinct label respectively.
    """
    X = StandardScaler().fit_transform(data)
    spectral = SpectralClustering(n_clusters=num_cluster,
                                  eigen_solver='arpack',
                                  affinity='precomputed')
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # kneighbors_graph is directed; average with the transpose to symmetrise.
    connectivity = 0.5 * (connectivity + connectivity.T)
    spectral.fit(connectivity)
    labels = spectral.labels_
    if update_flag:
        return labels

    label_dict = {str(pos): float(label) for pos, label in enumerate(labels)}
    print(label_dict)

    unique_dict = {}
    for pos, uniq in enumerate(np.unique(labels)):
        print(uniq)
        unique_dict[str(pos)] = float(uniq)
    print(unique_dict)
    return label_dict, unique_dict
def SpectralClusteringAlgorithm(X, k):
    """Cluster ``X`` into ``k`` groups and return the per-sample labels.

    :param X: data handed to ``SpectralClustering.fit`` unchanged.
    :param k: value used for ``n_clusters``.
    :return: the fitted model's ``labels_``.
    """
    # Parameter n_clusters: integer, optional.
    # The dimension of the projection subspace.
    model = SpectralClustering(n_clusters=k)
    return model.fit(X).labels_
def e2cp_fit(similarity_matrix, ML, CL, n_clusters):
    """
    apply constraint-propagation clustering e2cp on a given matrix.

    :param similarity_matrix: similarity matrix or affinity matrix of the dataset
    :param ML: must-link constraint set at the format of [[xx, yy], [yy, zz] .... ]
        (list or array of index pairs; may be empty)
    :param CL: cannot-link constraint set, same format as ``ML``
    :param n_clusters: #clusters
    :return: cluster label of every sample (``labels_`` of the fitted model)
    """
    N = similarity_matrix.shape[0]
    # Accept plain lists as documented; reshape so an empty set becomes (0, 2).
    ML = np.asarray(ML, dtype=int).reshape(-1, 2)
    CL = np.asarray(CL, dtype=int).reshape(-1, 2)

    nbrs = NearestNeighbors(n_neighbors=_k_E2CP + 1,
                            algorithm='brute').fit(similarity_matrix)
    # NOTE(review): kneighbors() without an argument already leaves each point
    # out of its own neighbour list, so the [:, 1:] slice below discards the
    # closest real neighbour.  Kept as-is; confirm the intent.
    _, indices = nbrs.kneighbors()

    # Sparse k-NN weight matrix, normalised by the diagonal of the similarity.
    W = np.zeros(similarity_matrix.shape)
    ind1 = np.repeat(np.arange(N), _k_E2CP)
    ind2 = indices[:, 1:].reshape(-1).astype('int')
    W[ind1, ind2] = similarity_matrix[ind1, ind2] / (
        np.sqrt(similarity_matrix[ind1, ind1]) *
        np.sqrt(similarity_matrix[ind2, ind2]))
    W = (W + W.transpose()) / 2

    Dsqrt = np.diag(np.sum(W, axis=1)**-0.5)
    Lbar = np.dot(np.dot(Dsqrt, W), Dsqrt)

    # Initial constraint matrix: +1 for must-link, -1 for cannot-link.
    # (np.zeros needs a shape, not the matrix itself.)
    Z = np.zeros(similarity_matrix.shape)
    Z[ML[:, 0], ML[:, 1]] = 1
    Z[CL[:, 0], CL[:, 1]] = -1

    # approximation of Fbar instead of the propagation iteration.
    # NOTE(review): the fixed point of F = alpha * Lbar * F + (1 - alpha) * Z
    # is (1 - alpha) * inv(I - alpha * Lbar) * Z; no inverse is taken here.
    # Left unchanged because it alters results; confirm against the reference.
    temp = (1 - _alpha) * (np.eye(Lbar.shape[0]) - _alpha * Lbar)
    Fbar = np.dot(np.dot(temp, Z), temp.conj().T)
    Fbar = Fbar / np.max(np.abs(Fbar.reshape(-1)))

    # recover: raise affinities where propagated evidence says "same cluster",
    # shrink them where it says "different cluster".
    Wbar = np.zeros(similarity_matrix.shape)
    mlInd = Fbar >= 0
    Wbar[mlInd] = 1 - (1 - Fbar[mlInd]) * (1 - W[mlInd])
    clInd = Fbar < 0
    Wbar[clInd] = (1 + Fbar[clInd]) * W[clInd]

    specClus = SpectralClustering(n_clusters=n_clusters,
                                  affinity='precomputed')
    specClus.fit(Wbar)
    return specClus.labels_
def spectral_clustering(G, graph_name, num_clusters):
    """Split graph ``G`` into clusters and write each cluster as a GEXF file.

    :param G: networkx graph to partition.
    :param graph_name: used as the output sub-directory under
        ``Constants.SPECTRAL_PATH`` and as the file-name prefix.
    :param num_clusters: number of clusters to request.
    :return: list of subgraphs, one per non-empty cluster, in order of first
        appearance of the cluster label.
    """
    # TODO: find a way to figure out the number of clusters automatically.
    write_directory = os.path.join(Constants.SPECTRAL_PATH, graph_name)
    if not os.path.exists(write_directory):
        os.makedirs(write_directory)

    # Materialise the node order once: on networkx >= 2, G.nodes() is a view
    # whose [] is a lookup by node key, not by position.
    nodeList = list(G.nodes())
    matrix_data = nx.to_numpy_matrix(G, nodelist=nodeList)

    # Honour the requested cluster count (it used to be hard-coded to 2).
    spectral = SpectralClustering(n_clusters=num_clusters,
                                  eigen_solver='arpack',
                                  affinity="rbf")
    spectral.fit(matrix_data)

    clusters = {}
    for node, nodeLabel in zip(nodeList, spectral.labels_):
        clusters.setdefault(nodeLabel, []).append(node)

    subgraphs = []
    for clusterIndex, members in enumerate(clusters.values()):
        subgraph = G.subgraph(members)
        subgraphs.append(subgraph)
        nx.write_gexf(
            subgraph,
            os.path.join(write_directory,
                         graph_name + str(clusterIndex) + "_I" +
                         Constants.GEXF_FORMAT))
    return subgraphs
class SP2CcommunityClassifier():
    """Split a graph into two communities with spectral clustering.

    After ``fit`` the instance exposes the +1/-1 membership vector ``s``, the
    two induced subgraphs, and ``Q`` computed as ``s^T B s / (4 m)``.
    """

    def __init__(self, graph):
        adjacency = to_numpy_matrix(graph)
        degrees = np.sum(adjacency, axis=1)
        half_total_degree = np.sum(degrees) / 2

        self.G = graph
        self.A = adjacency
        self.k = degrees                # row sums of the adjacency matrix
        self.m = half_total_degree      # half of the summed degrees
        # B = A - k k^T / (2m)
        self.B = adjacency - np.dot(degrees, degrees.transpose()) / (
            2 * half_total_degree)
        self.sc = SpectralClustering(2, affinity='precomputed')
        self.Q = 0
        self.category = {node: [] for node in graph.nodes}
        self.s = None
        self.G_positive = None
        self.G_negative = None
        self.done = False

    def fit(self):
        """Cluster the adjacency matrix and derive ``s``, subgraphs and ``Q``.

        Sets ``done`` to True when the resulting ``Q`` is negative.
        """
        self.sc.fit(self.A)

        # Each label's binary digits are mapped 0 -> -1 and 1 -> +1.
        signed_digits = [[2 * int(digit) - 1 for digit in bin(label)[2:]]
                         for label in self.sc.labels_]
        self.category = dict(zip(list(self.G.nodes), signed_digits))
        self.s = np.array(
            [self.category[node][0] for node in self.G.nodes])

        node_array = np.array(self.G.nodes)
        self.G_positive = self.G.subgraph(node_array[self.s == 1])
        self.G_negative = self.G.subgraph(node_array[self.s == -1])

        self.Q = np.einsum("i,ij,j", self.s, self.B, self.s) / (4 * self.m)
        if self.Q < 0:
            self.done = True
def spectral_cluster():
    """Spectrally cluster the data from ``get_data()`` and sort images by cluster.

    Prints the members of each of the two clusters, then copies image
    ``<i+1>.jpeg`` into the sub-folder named after the label of sample ``i``.
    """
    # Spectral clustering
    adj_mat, unidata = get_data()
    cluster_num = 2
    sc = SpectralClustering(cluster_num,
                            affinity='precomputed',
                            n_init=3000,
                            assign_labels='discretize')
    sc.fit(adj_mat)
    labels = sc.labels_

    print('spectral clustering')
    print('sc长度', len(labels))

    # Sample indices per cluster, plus the (float) member counts.
    class_array = []
    class_length = np.zeros(cluster_num)
    for ci in range(cluster_num):
        members = [pos for pos, assigned in enumerate(labels)
                   if assigned == ci]
        class_array.append(members)
        class_length[ci] = class_length[ci] + len(members)

    for ci in range(cluster_num):
        print('类编号 = ', ci, '类个数 =', class_length[ci])
        print('类序号 = ', class_array[ci])
        print('-----------------------------------')

    # Files are numbered from 1, samples from 0.
    for pos, assigned in enumerate(labels):
        file_name = str(pos + 1) + '.jpeg'
        srcfile = './13D归一化图像/' + file_name
        dstfile = './13D分类/' + str(assigned) + '/' + file_name
        mycopyfile(srcfile, dstfile)
def prepare_spectral_clustering_features(X, n_clusters):
    '''
    Inputs:
        X: data matrix or dataframe; each data instance is expected to be a
           row in the matrix or dataframe
        n_clusters: number of clusters
    Outputs:
        return: a one-hot encoding of the cluster labels, as produced by
                OneHotEncoder.transform. For example, if there are 6 data
                points, belonging to clusters [0,0,1,1,2,2], then the
                encoding is
                    [1,0,0]
                    [1,0,0]
                    [0,1,0]
                    [0,1,0]
                    [0,0,1]
                    [0,0,1]
    '''
    clusterer = SpectralClustering(n_clusters=n_clusters,
                                   n_init=10,
                                   assign_labels="discretize",
                                   random_state=0)
    # The encoder expects a 2-D column, one label per row.
    label_column = np.reshape(clusterer.fit(X).labels_, (-1, 1), 'F')
    encoder = OneHotEncoder(handle_unknown='error')
    return encoder.fit(label_column).transform(label_column)
def sklearn_test2():
    """Cluster the Wine data set into 3 groups and plot it in 2-D PCA space.

    Marker colour encodes the predicted cluster, marker shape the true label.
    Reads ``data.csv`` and ``label.csv`` from a hard-coded directory.
    """
    read_path = 'F:\\result2019-2\\result0812\\datasets\\Wine\\'
    # The builtin str/float/int replace np.str/np.float/np.int, which were
    # plain aliases of the builtins and were removed in NumPy 1.24.
    data_reader = np.loadtxt(read_path + 'data.csv', dtype=str,
                             delimiter=',')
    label_reader = np.loadtxt(read_path + 'label.csv', dtype=str,
                              delimiter=',')
    X = data_reader[:, :].astype(float)
    label_true = label_reader.astype(int)
    X = PreProcess.normalize(X)
    n = X.shape[0]

    k = 3
    sc = SpectralClustering(n_clusters=k)
    sc.fit(X)
    label = sc.labels_

    pca = PCA.PCA(X, 2)
    Y = pca.fit_transform()

    colors = ['c', 'm', 'y', 'b', 'r', 'g']
    shapes = ['s', 'o', '^', 'p', '+', '*']
    for i in range(0, n):
        plt.scatter(Y[i, 0], Y[i, 1],
                    c=colors[int(label[i])],
                    marker=shapes[int(label_true[i])])
    plt.show()
def SepectralClustering(data, actualLabels):
    """Spectral clustering (10 clusters) on a 2-D PCA projection of ``data``.

    Prints timing and clustering-quality metrics against ``actualLabels``, the
    labels themselves and the number of distinct labels, then shows a scatter
    plot of the projection coloured by cluster.

    :param data: feature matrix, one sample per row.
    :param actualLabels: ground-truth labels, one per sample.
    """
    pca = PCA(n_components=2).fit(data)
    pca_2d = pca.transform(data)
    spectral = SpectralClustering(n_clusters=10,
                                  eigen_solver='arpack',
                                  affinity="nearest_neighbors")
    t0 = time()
    spectral.fit(pca_2d)
    elapsed = time() - t0
    labels = spectral.labels_

    print('% 9s' % 'init' ' time h**o compl v-meas ARI AMI silhouette')
    # One specifier per value: name, seconds, then six scores.  The previous
    # template had only seven specifiers for eight values (TypeError) and
    # formatted the homogeneity score with %i.
    print('% 9s %.2fs %.3f %.3f %.3f %.3f %.3f %.3f' %
          ('Spectral', elapsed,
           metrics.homogeneity_score(actualLabels, labels),
           metrics.completeness_score(actualLabels, labels),
           metrics.v_measure_score(actualLabels, labels),
           metrics.adjusted_rand_score(actualLabels, labels),
           metrics.adjusted_mutual_info_score(actualLabels, labels),
           metrics.silhouette_score(data, labels,
                                    metric='euclidean',
                                    sample_size=10000)))
    print(labels)
    print(len(np.unique(labels)))

    scatter = plt.scatter(pca_2d[:, 0], pca_2d[:, 1], c=labels, marker='*')
    plt.colorbar(scatter)
    plt.title('Spectral Clustering')
    plt.show()
def run_SpectralClustering(args):
    """Cluster a profile matrix with nearest-neighbour spectral clustering.

    :param args: sequence whose first two items are the data to cluster and
        the number of clusters; further items are ignored.
    :return: ``labels_`` of the fitted model.
    """
    profile, cluster_count = args[:2]
    model = SpectralClustering(affinity='nearest_neighbors',
                               n_clusters=cluster_count,
                               n_init=1000,
                               gamma=0.5,
                               n_neighbors=170,
                               assign_labels='discretize')
    return model.fit(profile).labels_
def fast_app_spe_cluster(data, label, k, n_cluster):
    """Fast approximate spectral clustering through k-means representatives.

    ``data`` is summarised by ``k`` k-means centres, the centres are clustered
    spectrally into ``n_cluster`` groups, and every sample inherits the group
    of its closest centre.

    :param data: 2-D array, one sample per row.
    :param label: unused; kept for interface compatibility.
    :param k: number of representative points (k-means centres).
    :param n_cluster: number of spectral clusters.
    :return: list with the cluster label of every sample.
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start_time = time.perf_counter()

    # k-means gives the representative points (centre points)
    k_means = KMeans(n_clusters=k)
    k_means.fit(data)
    y_centers = k_means.cluster_centers_

    # correspondence table: index of the closest centre for every sample
    # (argmin keeps the first minimum, like the original strict '>' scan)
    x_to_centers_table = [
        int(np.argmin(np.sum((y_centers - data[i, :]) ** 2, axis=1)))
        for i in range(len(data))
    ]

    # spectral clustering of the representatives
    spe_cluster = SpectralClustering(n_clusters=n_cluster)
    spe_cluster.fit(y_centers)
    spe_label = spe_cluster.labels_

    # propagate the representative's label back to each sample
    x_label = [spe_label[center] for center in x_to_centers_table]

    spend_time = time.perf_counter() - start_time
    print("spend time is %f seconds" % spend_time)
    return x_label
def main(cm_file, perm_file, steps, labels_file, limit_classes=None):
    """Run optimization and generate output.

    Loads a confusion matrix from ``cm_file`` (JSON), clusters its rows into
    14 groups and prints the silhouette score and the class labels per group.

    :param cm_file: path to a JSON file holding the confusion matrix.
    :param perm_file: unused here.
    :param steps: unused here.
    :param labels_file: path to a JSON list of class labels; when the file does
        not exist, ``0 .. len(cm) - 1`` are used.
    :param limit_classes: unused here.
    """
    # Load confusion matrix
    with open(cm_file) as f:
        cm = np.array(json.load(f))

    # Load labels
    if os.path.isfile(labels_file):
        with open(labels_file, "r") as f:
            labels = json.load(f)
    else:
        labels = list(range(len(cm)))

    n_clusters = 14  # hyperparameter
    spectral = SpectralClustering(n_clusters=n_clusters,
                                  eigen_solver='arpack',
                                  affinity="nearest_neighbors")
    spectral.fit(cm)
    # np.int was an alias of the builtin int and was removed in NumPy 1.24.
    y_pred = spectral.labels_.astype(int)

    sscore = silhouette_score(cm, y_pred)
    print("silhouette_score={} with {} clusters".format(sscore, n_clusters))

    grouping = [[] for _ in range(n_clusters)]
    for label, y in zip(labels, y_pred):
        grouping[y].append(label)
    for group in grouping:
        print(" {}: {}".format(len(group), group))
def run():
    """Train FHIM on synthetic data and plot true vs. learnt interactions.

    The ground-truth matrix ``w`` is min-max scaled and clustered spectrally;
    cluster-grouped copies of ``w`` and of the learnt ``a a^T`` are built, but
    the two figures show the ungrouped matrices.
    """
    # generate synthetic data
    x, y, w, beta = sd(n=1000, p=100, k=1, sp_beta=0.8, sp_alpha=0.8)
    tr_x, tst_x, tr_y, tst_y = cv.train_test_split(x, y, test_size=0.2)

    # Train the fhim model.
    fhim = FHIM(lbd_beta=100, lbd_alpha=100)
    fhim.fit(tr_x, tst_x, tr_y, tst_y, KK=1, debug=True)
    ww = np.dot(fhim.a, fhim.a.T)

    lowest, highest = np.min(w), np.max(w)
    w_pos = (w - lowest) / (highest - lowest)

    # cluster w
    sc = SpectralClustering(affinity='precomputed')
    sc.fit(w_pos)

    def _grouped_by_cluster(source):
        """Copy ``source`` with rows, then columns, laid out cluster by cluster."""
        grouped = np.zeros(w.shape)
        for along_rows in (True, False):
            offset = 0
            for cluster_id in np.unique(sc.labels_):
                members = sc.labels_ == cluster_id
                size = np.sum(members)
                if along_rows:
                    grouped[offset:offset + size, :] = source[members, :]
                else:
                    grouped[:, offset:offset + size] = source[:, members]
                offset += size
        return grouped

    wc = _grouped_by_cluster(w)
    wwc = _grouped_by_cluster(ww)

    cmap = mcolors.ListedColormap([(0, 0, 1), (0, 1, 0), (1, 0, 0)])
    plt.set_cmap('bwr')

    plt.title("Groundtruth Interaction Effects", fontsize=20)
    plt.grid(True)
    plt.imshow(w, vmin=-5, vmax=5)
    plt.colorbar()
    plt.show()

    plt.title("Learnt Interaction Effects", fontsize=20)
    plt.imshow(ww, vmin=-5, vmax=5)
    plt.grid(True)
    plt.colorbar()
    plt.show()
    return
def spectralClustering(self, similarity_measure_list, n_clusters=2):
    """Group items by spectral clustering of their pairwise similarities.

    :param similarity_measure_list: iterable of ``(item_a, item_b, value)``
        triples; the similarity is treated as symmetric and pairs that are not
        listed get similarity 0.
    :param n_clusters: number of clusters to request.
    :return: list of clusters, each a list of items, ordered by cluster label;
        labels to which no item was assigned are skipped.
    """
    sim_dict = {}
    edge_set = set()
    for (file1, file2, val) in similarity_measure_list:
        sim_dict[(file1, file2)] = val
        sim_dict[(file2, file1)] = val
        edge_set.add(file1)
        edge_set.add(file2)
    edge_list = list(edge_set)

    # Dense affinity matrix in edge_list order; unknown pairs default to 0.
    affinity_matrix = np.array(
        [[sim_dict.get((row_item, col_item), 0) for col_item in edge_list]
         for row_item in edge_list])

    sc = SpectralClustering(n_clusters, affinity='precomputed', n_init=100)
    sc.fit(affinity_matrix)
    labels = sc.labels_

    # Map each label that actually occurs to an output slot, so a missing
    # label can no longer index past the end of the result list.
    slot_of = {label: slot for slot, label in enumerate(sorted(set(labels)))}
    cluster_set = [[] for _ in slot_of]
    for label, item in zip(labels, edge_list):
        cluster_set[slot_of[label]].append(item)
    return cluster_set
def detect_regions(self, users):
    '''
    Cluster customers into regions with spectral clustering.

    The affinity matrix comes from ``self.get_affinity_matrix`` and the number
    of clusters from ``self.eigen_decomposition``.

    :param users: geo-coordinates of all customers' locations.
    :return: the value of ``self.format_regions(labels, users)``, which is also
        stored on ``self.regions``
    '''
    self.logger.debug("Clustering settlements")

    affinities = self.get_affinity_matrix(users, k=100)
    estimated_clusters, _eigenvalues = self.eigen_decomposition(affinities,
                                                                topK=50)
    K = estimated_clusters * 1  # Adjustment factor
    self.logger.debug("Optimal K for Region Clustering " + str(K))

    model = SpectralClustering(n_clusters=K,
                               random_state=0,
                               affinity='precomputed')
    model.fit(affinities)

    # Drop the affinity matrix explicitly (memory-leak workaround).
    del affinities

    self.regions = self.format_regions(model.labels_, users)
    return self.regions
def cluster(self, k):
    """Spectrally cluster ``self.adj_mat`` into ``k`` groups and report AMI.

    Prints the labels, the sources in every cluster (skipped when
    ``self.is_large_network`` is set) and the adjusted mutual information
    between the cluster labels and the trust-score ground truth.  The ground
    truth, restricted to nodes that have a trust score, is stored on
    ``self.gt``.

    :param k: number of clusters.
    """
    sc = SpectralClustering(k, affinity='precomputed', n_init=100)
    sc.fit(self.adj_mat)
    labels = sc.labels_

    print('spectral clustering')
    print(labels)
    print(len(labels))

    if not self.is_large_network:
        for cluster_id in range(0, k):
            cluster_nodes = [
                self.node_id_to_source_map[self.index_to_node[idx]]
                for idx, assigned in enumerate(labels)
                if assigned == cluster_id
            ]
            print(str(cluster_id) + ":")
            print("\t" + str(cluster_nodes))

    # get ground truth for node (only node ids with labels)
    self.gt = []
    labeled_sc_labels = []
    for idx in range(0, len(self.index_to_node)):
        node_source = self.node_id_to_source_map[self.index_to_node[idx]]
        node_source_trust_score = self.get_trust_score_for_source(node_source)
        if node_source_trust_score is None:
            continue
        print(node_source_trust_score)
        rounded_trust_score = round(node_source_trust_score, 1)
        self.gt.append(10 * rounded_trust_score)
        labeled_sc_labels.append(labels[idx])

    print("AMI metrics:{}".format(
        metrics.adjusted_mutual_info_score(self.gt, labeled_sc_labels)))
def main():
    '''Find artists related to an input artist and draw them as a clustered graph.'''
    # Command-line interface: (name, extra keyword arguments) per argument.
    cli = argparse.ArgumentParser(
        description="Builds a graph of related artists colored by genre")
    argument_specs = (
        ("artist",
         {"help": "The artist to construct the graph around"}),
        ("num_artists",
         {"type": int, "help": "Number of artists to include in the graph"}),
        ("num_clusters",
         {"type": int, "help": "Number of clusters to show in the graph"}),
    )
    for arg_name, options in argument_specs:
        cli.add_argument(arg_name, **options)
    args = cli.parse_args()

    # Collect the related artists and build the graph.
    related, info = get_artists(args.artist, args.num_artists)
    artist_graph = build_graph(related)

    # Spectral clustering on the adjacency matrix.
    adjacency = nx.to_numpy_matrix(artist_graph)
    sc = SpectralClustering(args.num_clusters,
                            affinity='precomputed',
                            n_init=100,
                            assign_labels='discretize')
    sc.fit(adjacency)

    # Draw the graph.
    show_graph(info, artist_graph, sc)
def SC(k, data, parameter):
    """Run ``SpectralClustering`` configured from a parameter mapping.

    :param k: number of clusters.
    :param data: data passed to ``fit_predict``.
    :param parameter: mapping with the keys ``eigen_solver``, ``n_init``,
        ``random_state``, ``gamma``, ``affinity``, ``n_neighbors``,
        ``eigen_tol``, ``assign_labels``, ``degree``, ``kernel_params`` and
        ``n_jobs``; ``coef0`` is optional and defaults to 1.
    :return: cluster label of every sample.
    """
    model = SpectralClustering(
        n_clusters=k,
        eigen_solver=parameter['eigen_solver'],
        random_state=parameter['random_state'],
        n_init=parameter['n_init'],
        gamma=parameter['gamma'],
        affinity=parameter['affinity'],
        n_neighbors=parameter['n_neighbors'],
        eigen_tol=parameter['eigen_tol'],
        assign_labels=parameter['assign_labels'],
        degree=parameter['degree'],
        # Previously read from parameter['eigen_tol'] by mistake.  .get keeps
        # mappings without a 'coef0' entry working.
        coef0=parameter.get('coef0', 1),
        kernel_params=parameter['kernel_params'],
        n_jobs=parameter['n_jobs'],
    )
    # fit_predict fits the model itself; a separate fit() beforehand would
    # only repeat the whole computation.
    return model.fit_predict(data)
def spectralclustering(params):
    """Cluster a distance matrix loaded from disk into 10 spectral clusters.

    :param params: mapping whose ``"distance_path"`` entry (a string) is the
        path of a text file readable by ``np.loadtxt``.
    :return: tuple with the cluster label of every row.
    """
    distance_path = '' + params["distance_path"]
    print(distance_path)

    distance = np.loadtxt(distance_path, dtype=np.float32)
    print(distance.shape)

    # Gaussian kernel turns distances into affinities.
    delta = 2
    affinity = np.exp(-distance ** 2 / (2. * delta ** 2))

    # default settings, except that the affinity is supplied precomputed
    sp = SpectralClustering(n_clusters=10, affinity='precomputed')
    print(sp)
    sp.fit(affinity)

    labels = sp.labels_
    print(labels, labels.shape)

    # number of clusters, not counting a -1 label if one is present
    noise_labels = 1 if -1 in labels else 0
    no_clusters = len(set(labels)) - noise_labels
    print(no_clusters, "no_clusters")

    return tuple(labels.tolist())
def split_superinstance(self, si, k):
    """Split super-instance ``si`` into at most ``k`` children.

    The sub-matrix of ``self.data`` restricted to ``si.indices`` is clustered
    as a precomputed affinity.  Clusters that contain at least one training
    index become new super-instances; the members of the remaining clusters
    are appended to the new super-instance whose representative has the
    largest ``self.data`` value with respect to that cluster's prototype.

    :param si: super-instance to split; its ``children`` attribute is set.
    :param k: number of clusters to request.
    :return: list of the newly created super-instances.
    :raises ValueError: from ``max`` when no cluster contains a training index
        while some cluster is left to be merged.
    """
    data_to_cluster = self.data[np.ix_(si.indices, si.indices)]
    spec = SpectralClustering(k, affinity="precomputed")
    spec.fit(data_to_cluster)
    # np.int was an alias of the builtin int and was removed in NumPy 1.24.
    split_labels = spec.labels_.astype(int)

    training = []
    no_training = []
    for new_si_idx in set(split_labels):
        # go from super instance indices to global ones
        cur_indices = [si.indices[idx]
                       for idx, c in enumerate(split_labels)
                       if c == new_si_idx]
        if any(x in self.train_indices for x in cur_indices):
            training.append(
                SuperInstance_DTW(self.data, cur_indices, self.train_indices,
                                  si))
        else:
            no_training.append(
                (cur_indices, get_prototype(self.data, cur_indices)))

    for indices, centroid in no_training:
        closest_train = max(
            training,
            key=lambda x: self.data[x.representative_idx, centroid])
        closest_train.indices.extend(indices)

    si.children = training
    return training
def SpectralClusteringFunc(K, dataset, rightdataset):
    """Cluster ``dataset`` with cosine-affinity spectral clustering and print scores.

    Prints the cluster count suggested by ``eigenDecomposition``, the purity of
    the clustering and the sum of per-cluster F1 scores.

    :param K: number of clusters.
    :param dataset: samples to cluster.
    :param rightdataset: true class of every sample; the majority vote below
        reads only rows 0 and 1 of the contingency matrix, i.e. it assumes a
        binary 0/1 ground truth.
    """
    cluster = SpectralClustering(n_clusters=K, affinity='cosine')
    cluster.fit(dataset)
    labels = cluster.labels_

    affinity_matrix = cluster.affinity_matrix_
    k, _, _ = eigenDecomposition(affinity_matrix)
    print(f'Optimal number of clusters are: {k}')

    contingency_matrix = metrics.cluster.contingency_matrix(
        rightdataset, labels)
    purity = np.sum(np.amax(contingency_matrix, axis=0)) / len(dataset)
    print("Purity for %d Clusters is: %f" % (K, purity))

    # Majority class of every cluster
    clustersCategories = [
        0 if contingency_matrix[0][i] > contingency_matrix[1][i] else 1
        for i in range(K)
    ]

    # F-measure, accumulated over the clusters
    TotalFMeasure = 0
    for i in range(K):
        category = clustersCategories[i]
        TruePositive = 0
        FalsePositive = 0
        FalseNegative = 0
        for j in range(len(dataset)):
            # only samples assigned to the cluster under examination count
            if labels[j] != i:
                continue
            agrees = rightdataset[j] == category
            if category == 1:
                if agrees:
                    TruePositive = TruePositive + 1
                else:
                    FalsePositive = FalsePositive + 1
            elif category == 0 and not agrees:
                FalseNegative = FalseNegative + 1

        # Only TruePositive == 0 makes precision/recall degenerate.  The old
        # guard also required FalsePositive != 0, which scored a perfectly
        # pure cluster as F1 = 0.
        if TruePositive != 0:
            precision = TruePositive / (TruePositive + FalsePositive)
            recall = TruePositive / (TruePositive + FalseNegative)
            F1 = 2 / ((1 / precision) + (1 / recall))
        else:
            F1 = 0
        TotalFMeasure = TotalFMeasure + F1

    print("Total F-Measure for %d Clusters is: %f" % (K, TotalFMeasure))
class UmapSpectral:
    """UMAP embedding followed by nearest-neighbour spectral clustering."""

    def __init__(self,
                 nclust,
                 umapdim=2,
                 umapN=10,
                 umapMd=float(0),
                 umapMetric='euclidean',
                 random_state=0):
        """
        :param nclust: number of clusters.
        :param umapdim: ``n_components`` of the UMAP embedding.
        :param umapN: ``n_neighbors`` of UMAP.
        :param umapMd: ``min_dist`` of UMAP.
        :param umapMetric: ``metric`` of UMAP.
        :param random_state: seed shared by UMAP and the clusterer.
        """
        self.nclust = nclust
        # change this bit for changing the manifold learner
        self.manifoldInEmbedding = umap.UMAP(random_state=random_state,
                                             metric=umapMetric,
                                             n_components=umapdim,
                                             n_neighbors=umapN,
                                             min_dist=umapMd)
        # change this bit to change the clustering mechanism
        self.clusterManifold = SpectralClustering(
            n_clusters=nclust,
            affinity='nearest_neighbors',
            random_state=random_state)
        self.hle = None  # last embedding, set by predict()

    def predict(self, hl):
        """Embed ``hl`` with UMAP, cluster the embedding, return the labels.

        If the clustering method or the manifold learner is changed, this
        method has to be adapted as well.
        """
        self.hle = self.manifoldInEmbedding.fit_transform(hl)
        # fit_predict fits the clusterer itself; the former extra fit() call
        # only doubled the runtime.
        y_pred = self.clusterManifold.fit_predict(self.hle)
        return (y_pred)
def trainModel(data, clusterNum):
    """Fit and return an RBF (gamma=100) spectral clustering model.

    :param data: data to fit.
    :param clusterNum: number of clusters.
    :return: the fitted ``SpectralClustering`` instance.
    """
    settings = dict(n_clusters=clusterNum,
                    affinity="rbf",
                    gamma=100,
                    assign_labels="kmeans")
    return SpectralClustering(**settings).fit(data)
def spectral_vader(tweetlist, vectorized_tweets, sim_measure = vader_pos_sim, max_n = 20):
    """Perform spectral clustering with VADER and silhouette analysis.

    Cluster counts 2 .. max_n - 1 are tried in order; the search stops at the
    first count whose average silhouette score does not improve on the
    previous one, and the previous prediction is returned.

    :param tweetlist: tweets used to build the VADER affinity matrix.
    :param vectorized_tweets: feature vectors used for the silhouette score.
    :param sim_measure: similarity function given to ``vader_affinity_matrix``.
    :param max_n: exclusive upper bound on the number of clusters tried.
    :return: labels of the best clustering found, or None when no clustering
        was accepted (e.g. ``max_n <= 2``).
    """
    affinity_matrix = vader_affinity_matrix(tweetlist, similarity=sim_measure)

    sil_scr_prev = -1
    # Defined up front so an empty search returns None, not an unbound name.
    sil_pred_prev = None
    for n in range(2, max_n):
        print('testing ', n, ' clusters')

        # cluster (fit_predict fits the model; no separate fit() is needed)
        clf = SpectralClustering(n_clusters=n, affinity='precomputed')
        tweet_pred = clf.fit_predict(affinity_matrix)

        # cluster silhouette scores
        silhouette_avg = silhouette_score(vectorized_tweets, tweet_pred)
        print('Silhouette average ', silhouette_avg)

        # stop as soon as the score no longer improves; keep the previous one
        if silhouette_avg <= sil_scr_prev:
            break
        sil_scr_prev = silhouette_avg
        sil_pred_prev = tweet_pred
    return sil_pred_prev
def specclustering():
    """Cluster the graph from ``buildGraph()`` into 30 spectral clusters.

    Writes every weighted adjacency entry to ``Graph.txt`` and the cluster of
    every node to ``Cluster.txt``.  Nodes are formatted with ``%d``.
    """
    np.random.seed(1)

    # Get your mentioned graph
    G = buildGraph()

    # networkx 1.x offers adjacency_iter(); 2.x renamed it to adjacency().
    iter_adjacency = getattr(G, 'adjacency_iter', None) or G.adjacency
    with open('Graph.txt', 'w') as graph_file:
        for n, nbrs in iter_adjacency():
            for nbr, eattr in nbrs.items():
                graph_file.write('(%d, %d, %f)\n' % (n, nbr, eattr['weight']))

    # Get the adjacency matrix
    adj_mat = nx.adjacency_matrix(G)
    print(adj_mat)

    # Cluster
    sc = SpectralClustering(30, affinity='precomputed', n_neighbors=10,
                            n_init=10)
    sc.fit(adj_mat)

    print('spectral clustering')
    # Iterate the nodes instead of indexing G.nodes()[i]: on networkx >= 2
    # that subscript is a lookup by node key, not by position.
    with open('Cluster.txt', 'w') as cluster_file:
        for node, label in zip(G.nodes(), sc.labels_):
            cluster_file.write('%d ==> %d\n' % (node, label))
def cluster_with_spectral_custering(X):
    """Standardise ``X`` and split it into two spectral clusters.

    :param X: feature matrix, one sample per row.
    :return: cluster label of every sample.
    """
    standardized = StandardScaler().fit_transform(X)
    model = SpectralClustering(n_clusters=2)
    return model.fit(standardized).labels_
def ibd_distance_matrix(distance='spearman', clustering='spectral'):
    """Fill ``dist_mat`` column-vs-column, split the columns in two, analyse each.

    Relies on module-level names: ``df`` (columns are the items compared),
    ``dist_mat`` (written in place), ``level``, ``dict_bact``, ``taxonomy`` and
    ``pca_and_conf_matrix_per_group``.

    :param distance: ``'spearman'`` stores |Spearman correlation| rounded to 4
        decimals; any other value stores the Euclidean distance.
    :param clustering: ``'spectral'`` uses ``SpectralClustering``; any other
        value uses average-linkage ``AgglomerativeClustering``.
    :return: None.  The size of each group is printed and
        ``pca_and_conf_matrix_per_group`` is called once per group.
    """
    # NOTE(review): the nesting of the taxonomy section below was reconstructed
    # from a flattened source; confirm which statement the `else` belongs to.
    for i in range(0, df.shape[1]):
        for j in range(0, df.shape[1]):
            #Spearman correlation
            if distance == 'spearman':
                dist_mat.at[df.columns[i], df.columns[j]] = abs(
                    round(
                        scipy.stats.spearmanr(
                            np.array(df.iloc[:, i]).astype(float),
                            np.array(df.iloc[:, j]).astype(float))[0], 4))
            #Euclidean distance
            else:
                dist_mat.at[df.columns[i], df.columns[j]] = np.linalg.norm(
                    np.array(df.iloc[:, i]).astype(float) -
                    np.array(df.iloc[:, j]).astype(float))
    # From here on `clustering` holds the estimator, not the option string.
    if clustering == 'spectral':
        clustering = SpectralClustering(n_clusters=2,
                                        affinity='precomputed',
                                        assign_labels='discretize',
                                        random_state=0)
    else:
        clustering = AgglomerativeClustering(affinity='precomputed',
                                             linkage='average')
    clustering.fit(dist_mat.values)
    # Column names per cluster label (exactly two groups are expected).
    bact_label = {0: [], 1: []}
    for i in range(0, df.shape[1]):
        bact_label[clustering.labels_[i]].append(df.columns[i])
    # NOTE(review): bact_label_name is built below but never read or returned.
    bact_label_name = {0: [], 1: []}
    bact_label_tmp = {0: [], 1: []}
    bact_level = level - 1
    for k in [0, 1]:
        # Collect every dict_bact key whose value list contains a column of
        # group k.
        for i in bact_label[k]:
            for key, value in dict_bact.items():
                for j in value:
                    if i == j:
                        bact_label_tmp[k].append(key)
        bact_label_tmp[k] = set(bact_label_tmp[k])
        for i in bact_label_tmp[k]:
            if i != 'else':
                # First taxonomy entry whose field at bact_level equals the
                # key; its ';'-separated prefix is stored joined with ','.
                for j in taxonomy:
                    try:
                        if j.split(';')[bact_level] == i:
                            bact_label_name[k].append(','.join(
                                j.split(';')[0:bact_level + 1]))
                            break
                    # Any error on an entry (e.g. too few fields) skips it.
                    except:
                        continue
            else:
                bact_label_name[k].append('else')
        bact_label_name[k] = set(bact_label_name[k])
    df0 = df[bact_label[0]]
    df1 = df[bact_label[1]]
    print(len(bact_label[0]))
    pca_and_conf_matrix_per_group(df0)
    print(len(bact_label[1]))
    pca_and_conf_matrix_per_group(df1)
def suggested_terminals_spectral(graph, terminal_count):
    """Suggests a set of terminal vertices for the given graph.

    The terminals are suggested according to a two-step procedure. First, we
    perform a spectral clustering on the graph with terminal_count clusters.
    Then, within each cluster, we suggest the vertex which has the highest
    degree (ties are broken by comparing the vertices themselves).

    Args:
        graph: the graph in which to suggest the terminals.
        terminal_count: the number of terminals to suggest.

    Returns:
        terminals: the suggested terminal vertices in the graph.
        total_degree: total degree of the terminal vertices in the graph.

    Raises:
        ValueError: if the clustering leaves one of the clusters empty.
    """
    adj_matrix = nx.to_numpy_matrix(graph)
    sc = SpectralClustering(n_clusters=terminal_count, affinity="precomputed")
    sc.fit(adj_matrix)

    # Node -> cluster label, built once.  The former list(graph).index(node)
    # lookup rebuilt and scanned the node list for every node of every cluster.
    cluster_of = dict(zip(graph, sc.labels_))

    deg = graph.degree()
    terminals = []
    total_degree = 0
    for c in range(terminal_count):
        restricted_nodes = [(degree, node) for node, degree in deg
                            if cluster_of[node] == c]
        if not restricted_nodes:
            raise ValueError(
                "spectral clustering produced no vertices for cluster %d" % c)
        best_degree, best_node = max(restricted_nodes)
        total_degree += best_degree
        terminals.append(best_node)
    return terminals, total_degree
def do_clustering(cluster_num, mrna_corr_mat, mirna_corr_mat,
                  mrna_corr_weight, sample_id_list):
    """Cluster samples on a weighted blend of mRNA and miRNA matrices.

    Each correlation matrix is turned into ``1 - corr``, passed through
    ``calculate_corr_mat`` and blended with weight ``mrna_corr_weight``; the
    blend is clustered as a precomputed affinity.

    :param cluster_num: number of clusters.
    :param mrna_corr_mat: sample-by-sample mRNA correlation matrix.
    :param mirna_corr_mat: sample-by-sample miRNA correlation matrix.
    :param mrna_corr_weight: weight ``a`` of the mRNA part; miRNA gets ``1 - a``.
    :param sample_id_list: sample identifiers, in matrix order.
    :return: ``(normal_mat, clustering_result)`` where ``clustering_result`` is
        a two-column string array with the header row ``SampleID``, ``Label``.
    """
    mrna_normal_mat = calculate_corr_mat(1 - mrna_corr_mat)
    mirna_normal_mat = calculate_corr_mat(1 - mirna_corr_mat)

    a = mrna_corr_weight
    normal_mat = a * mrna_normal_mat + (1 - a) * mirna_normal_mat

    cluster = SpectralClustering(n_clusters=cluster_num,
                                 affinity='precomputed',
                                 n_init=100)
    cluster.fit(normal_mat)
    # astype returns a new array; keep the result (it used to be discarded and
    # the conversion was left to implicit promotion inside hstack).
    predict_label = cluster.labels_.astype(str)

    sample_id_col = np.hstack((np.array(["SampleID"]), sample_id_list))
    label_col = np.hstack((np.array(["Label"]), predict_label))
    clustering_result = np.hstack(
        (sample_id_col.reshape(-1, 1), label_col.reshape(-1, 1)))
    return normal_mat, clustering_result
def run(self, features, number_of_clusters=2, restarts=10, delta=3.0):
    """Cluster ``features`` spectrally and return the labels in a one-item list.

    :param features: samples to cluster.
    :param number_of_clusters: number of clusters; 1 short-circuits to an
        all-zero int32 label vector.
    :param restarts: ``n_init`` of the clusterer.
    :param delta: passed to ``get_similarity`` when building the similarities.
    :return: ``[labels]`` with one label per sample.
    """
    if number_of_clusters == 1:
        result = numpy.zeros(len(features), dtype=numpy.int32)
        return [result]

    # The constructor keyword is n_clusters (the old ``k`` spelling is no
    # longer accepted), and a ready-made similarity matrix has to be declared
    # as precomputed or it would be re-kernelised as raw features.
    classifier = SpectralClustering(n_clusters=number_of_clusters,
                                    n_init=restarts,
                                    affinity='precomputed')
    similarity = get_similarity(features, delta)
    classifier.fit(similarity)
    return [classifier.labels_]
def test_affinities():
    """Both built-in affinities must separate two well-spaced blobs perfectly."""
    X, y = make_blobs(n_samples=40,
                      random_state=1,
                      centers=[[1, 1], [-1, -1]],
                      cluster_std=0.4)
    # nearest neighbors affinity first, then the default affinity with gamma=2
    for options in ({"affinity": "nearest_neighbors"}, {"gamma": 2}):
        model = SpectralClustering(n_clusters=2, random_state=0, **options)
        predicted = model.fit(X).labels_
        assert_equal(adjusted_rand_score(y, predicted), 1)
def run_clustering(methods, cases):
    """Cluster methods on their graphical-lasso covariance and print the result.

    :param methods: sequence of items whose ``[0]`` and ``[1]`` are printed as
        name and true group; ``[1]`` is also the reference for the Rand score.
    :param cases: data matrix given to ``GraphLassoCV.fit``.
    """
    true_method_groups = [m[1] for m in methods]

    edge_model = GraphLassoCV(alphas=4, n_refinements=5, n_jobs=3,
                              max_iter=100)
    edge_model.fit(cases)
    CV = edge_model.covariance_

    num_clusters = 3
    spectral = SpectralClustering(n_clusters=num_clusters,
                                  affinity='precomputed')
    spectral.fit(np.asarray(CV))

    for i, m in enumerate(methods):
        print("%s:%d\t%s" % (m[1], spectral.labels_[i], m[0]))
    print("Adj. Rand Score: %f" %
          adjusted_rand_score(spectral.labels_, true_method_groups))
def eval_k(max_k):
    """Plot the adjusted Rand index of spectral clustering for k = 2 .. max_k.

    Uses the module-level data ``x`` and reference labels ``y``.

    :param max_k: largest number of clusters to evaluate (inclusive).
    """
    a_score, idx = [], []
    for k in range(2, max_k + 1):
        print('k={}'.format(k))
        est = SpectralClustering(n_clusters=k, affinity='nearest_neighbors')
        est.fit(x)
        ari = metrics.adjusted_rand_score(y, est.labels_)
        print(ari)
        a_score.append(ari)
        idx.append(k)

    pl.plot(idx, a_score)
    pl.xlabel('# of clusters')
    pl.ylabel('ARI')
    pl.show()
def spectral(X, num_clusters):
    """Run spectral clustering on the standardised values of ``X``.

    ``X`` must expose ``.values`` (e.g. a pandas DataFrame).  The values are
    scaled with ``preprocessing.scale`` and clustered with a
    nearest-neighbours affinity (4 neighbours), the ARPACK eigen solver and
    ``discretize`` label assignment.

    Returns the array of cluster labels, one per row of ``X``.
    """
    model = SpectralClustering(
        n_clusters=num_clusters,
        eigen_solver="arpack",
        affinity="nearest_neighbors",
        n_neighbors=4,
        assign_labels="discretize",
    )
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
    # ndarray and exists in every pandas version.
    cleanX = preprocessing.scale(X.values)
    model.fit(cleanX)
    return model.labels_
def spectral(x, num_clusters):
    """Fit an RBF spectral clustering on ``x``.

    Returns ``(model, (None, labels, n_found))`` where ``n_found`` is the
    number of distinct labels actually produced.
    """
    settings = dict(
        affinity='rbf',  # 'rbf'
        n_clusters=num_clusters,
        n_init=10,
        assign_labels='kmeans',
        gamma=1.0,
        degree=3,
        coef0=1,
    )
    model = SpectralClustering(**settings)
    model.fit(x)
    assignments = model.labels_
    return model, (None, assignments, len(np.unique(assignments)))
def test_affinities():
    """Check two working affinities and the error raised for an unknown one."""
    blobs, expected = make_blobs(n_samples=40, random_state=1,
                                 centers=[[1, 1], [-1, -1]], cluster_std=0.4)

    working_models = (
        # nearest neighbors affinity
        SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                           random_state=0),
        # default affinity with gamma=2
        SpectralClustering(n_clusters=2, gamma=2, random_state=0),
    )
    for estimator in working_models:
        found = estimator.fit(blobs).labels_
        assert_equal(adjusted_rand_score(expected, found), 1)

    # raise error on unknown affinity
    broken = SpectralClustering(n_clusters=2, affinity='<unknown>')
    assert_raises(ValueError, broken.fit, blobs)
def test_affinities():
    """Perfect recovery for two affinities; ValueError for an unknown one."""
    # Note: in the following, random_state has been selected to have
    # a dataset that yields a stable eigen decomposition both when built
    # on OSX and Linux
    data, target = make_blobs(n_samples=40, random_state=2,
                              centers=[[1, 1], [-1, -1]], cluster_std=0.4)

    def fitted_labels(**options):
        # Build a two-cluster model with a fixed seed and fit it on `data`.
        estimator = SpectralClustering(n_clusters=2, random_state=0, **options)
        return estimator.fit(data).labels_

    # nearest neighbors affinity
    assert_equal(
        adjusted_rand_score(target, fitted_labels(affinity="nearest_neighbors")),
        1)
    assert_equal(adjusted_rand_score(target, fitted_labels(gamma=2)), 1)

    # raise error on unknown affinity
    unknown = SpectralClustering(n_clusters=2, affinity="<unknown>")
    assert_raises(ValueError, unknown.fit, data)
def test_affinities():
    """Exercise string, kernel-name and callable affinities."""
    # Note: in the following, random_state has been selected to have
    # a dataset that yields a stable eigen decomposition both when built
    # on OSX and Linux
    blobs, truth = make_blobs(n_samples=20, random_state=0,
                              centers=[[1, 1], [-1, -1]], cluster_std=0.01)

    # nearest neighbors affinity
    knn = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                             random_state=0)
    assert_warns_message(UserWarning, 'not fully connected', knn.fit, blobs)
    assert_equal(adjusted_rand_score(truth, knn.labels_), 1)

    with_gamma = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    found = with_gamma.fit(blobs).labels_
    assert_equal(adjusted_rand_score(truth, found), 1)

    samples = check_random_state(10).rand(10, 5) * 10
    expected_shape = (samples.shape[0],)

    for kernel_name in kernel_metrics():
        # Additive chi^2 gives a negative similarity matrix which
        # doesn't make sense for spectral clustering
        if kernel_name == 'additive_chi2':
            continue
        by_kernel = SpectralClustering(n_clusters=2, affinity=kernel_name,
                                       random_state=0)
        assert_equal(expected_shape, by_kernel.fit(samples).labels_.shape)

    constant = SpectralClustering(n_clusters=2, affinity=lambda a, b: 1,
                                  random_state=0)
    assert_equal(expected_shape, constant.fit(samples).labels_.shape)

    def histogram(a, b, **kwargs):
        """Histogram kernel implemented as a callable."""
        assert_equal(kwargs, {})  # no kernel_params that we didn't ask for
        return np.minimum(a, b).sum()

    by_callable = SpectralClustering(n_clusters=2, affinity=histogram,
                                     random_state=0)
    assert_equal(expected_shape, by_callable.fit(samples).labels_.shape)

    # raise error on unknown affinity
    unknown = SpectralClustering(n_clusters=2, affinity='<unknown>')
    assert_raises(ValueError, unknown.fit, samples)
def doClustering(self):
    """Cluster the photos into 4 groups and dump one CSV line per photo.

    Photos come from ``self.getClusteringData()``; the feature vector of a
    photo is ``list(self.getCoordinates(photo))``.  The result is written to
    ``<self.file_name_prefix>evening_msp_meanshift.csv``.
    """
    photos = self.getClusteringData()
    features = [list(self.getCoordinates(p)) for p in photos]

    # Earlier experiments used KMeans(n_clusters=10, init='k-means++',
    # max_iter=100) and MeanShift(); spectral clustering is the current choice.
    algo = SpectralClustering(4)
    algo.fit(np.asarray(features))

    # The original opened the file with the Python-2-only file() builtin and
    # never closed it; the context manager guarantees flush and close.
    with open(self.file_name_prefix+'evening_msp_meanshift.csv', 'w') as f:
        for idx in range(len(photos)):
            p = photos[idx]
            # NOTE(review): no separator is written between the label and the
            # URL -- looks unintended, kept as-is so the output format does
            # not change; confirm with whoever consumes this file.
            f.write(str(p['location']['latitude'])+','
                    +str(p['location']['longitude'])+','
                    +str(algo.labels_[idx])
                    +p['images']['standard_resolution']['url']+'\n')
def initializeW_clustering(n,relationFileName, nClusters):
    """Build a cluster-level weight matrix from a tab-separated relation file.

    The first line of ``relationFileName`` is skipped.  Every other line
    contributes +1 to ``W[a][b]`` when both of its first two integer fields
    are ``<= n``.  ``W`` (identity plus counts) is clustered, the first ``n``
    labels are written to ``relationFileName + '.cluster'`` and the weights
    between different clusters are accumulated and row-normalised with
    ``normalizeByRow``.

    Returns ``(NormalizednewW.T, newW, label)``.
    """
    W = np.identity(n+1)
    with open(relationFileName) as relations:
        relations.readline()  # skip the first line
        for raw in relations:
            fields = raw.split('\t')
            src = int(fields[0])
            if src > n:
                continue
            dst = int(fields[1])
            if dst > n:
                continue
            W[src][dst] += 1

    # A KMeans variant (KMeans(n_clusters=nClusters).fit(W)) and a
    # precomputed-affinity variant were left disabled by the original author.
    spc = SpectralClustering(n_clusters=nClusters)
    spc.fit(W)  # What is the meaning
    label = spc.labels_

    with open(relationFileName+'.cluster','w') as out:
        out.writelines(str(label[i])+'\n' for i in range(n))

    NeighborW = np.zeros(shape=(nClusters, nClusters))
    for i in range(n):
        row_cluster = label[i]
        for j in range(n):
            col_cluster = label[j]
            if row_cluster == col_cluster:
                NeighborW[row_cluster][col_cluster] = 0
            else:
                NeighborW[row_cluster][col_cluster] += W[i][j]

    newW = np.identity(nClusters) + normalizeByRow(NeighborW)
    print('newW %s' % (newW,))
    transposed = normalizeByRow(newW).T
    print('NormalizednewW %s' % (transposed,))
    return transposed, newW, label
def rbf(max_k):
    """Plot ARI against cluster count for RBF gammas 1e-5 .. 1.

    For every ``k`` in ``2..max_k`` and every gamma, an RBF
    ``SpectralClustering`` is fitted on the module-level ``x`` and scored
    against the module-level ``y``.  Each gamma and its best score are
    printed, and one curve per gamma is drawn.
    """
    gamma_set = [math.pow(10, exponent) for exponent in xrange(-5, 1)]
    a_score = [[] for _ in gamma_set]
    idx = []

    for k in xrange(2, max_k + 1):
        print('k={}'.format(k))
        for scores, gamma in zip(a_score, gamma_set):
            est = SpectralClustering(n_clusters=k, affinity='rbf', gamma=gamma)
            est.fit(x)
            scores.append(metrics.adjusted_rand_score(y, est.labels_))
        idx.append(k)

    for gamma, scores in zip(gamma_set, a_score):
        print(gamma)
        print(np.max(scores))
        pl.plot(idx, scores, label='gamma={}'.format(gamma))

    pl.legend(loc=4,prop={'size':12})
    pl.xlabel('# of clusters')
    pl.ylabel('ARI')
    pl.show()
def main():
    """Compare four clusterings of the segmentation data set by silhouette.

    Reads columns 1..18 of every sufficiently long CSV row, drops the first
    collected row, then prints the silhouette score (euclidean) obtained by
    DensityPeaks, K-Means, spectral clustering and a spherical GMM.  The last
    three all use 5 clusters.

    Returns 0.
    """
    percentageDensityDistance = 0.35
    data = []
    with open('/home/casep/Dropbox/Docencia/UTFSM/MsC/Tesis/Data/segmentation.data', 'rb') as csvfile:
        visionData = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in visionData:
            # Columns 1..18 are read, so a row needs at least 19 fields.  The
            # original guard (len(row) > 12) let 13..18-field rows through
            # and then failed with IndexError on row[18].
            if len(row) > 18:
                data.append(row[1:19])

    # The first collected row is discarded before clustering.
    clusterData = np.array(data)[1:,:]

    clustersNumber, labels = dp.predict(clusterData, percentageDensityDistance)
    print('clustersNumber %s' % (clustersNumber,))
    print('fit DensityPeaks %s'
          % (metrics.silhouette_score(clusterData, labels, metric='euclidean'),))

    clustersNumber = 5
    km = KMeans(init='k-means++', n_clusters=clustersNumber, n_init=10,n_jobs=-1)
    km.fit(clusterData)
    labels = km.labels_
    print('fit K-Means %s'
          % (metrics.silhouette_score(clusterData, labels, metric='euclidean'),))

    sc = SpectralClustering(n_clusters=clustersNumber, eigen_solver=None,
                            random_state=None, n_init=10, gamma=1.0,
                            affinity='nearest_neighbors', n_neighbors=10,
                            eigen_tol=0.0, assign_labels='kmeans', degree=3,
                            coef0=1, kernel_params=None)
    sc.fit(clusterData)
    labels = sc.labels_
    print('fit Spectral %s'
          % (metrics.silhouette_score(clusterData, labels, metric='euclidean'),))

    # The CSV fields are strings; the GMM is given an explicit float array.
    clusterData=np.array(clusterData,dtype=float)
    gmix = mixture.GMM(n_components=clustersNumber, covariance_type='spherical')
    gmix.fit(clusterData)
    labels = gmix.predict(clusterData)
    print('fit GMM %s'
          % (metrics.silhouette_score(clusterData, labels, metric='euclidean'),))
    return 0
def _fit_spectral(self, x): # FIXME: broken still D = euclidean_distances(x, x) A = HomoscedasticClusteringNode.gauss_heat_kernel(D) # clustering for c in xrange(len(self.crange)): k = self.crange[c] for r in xrange(self.repeats): # init if self.debug is True: print '\t[%s][c:%d][r:%d]' % ( self.clus_type, self.crange[c], r + 1), idx = c * self.repeats + r # evaluate model model = SpectralClustering(k=k) model.fit(A) self._labels[idx] = model.labels_ means = sp.zeros((k, x.shape[1])) for i in xrange(k): means[i] = x[model.labels_ == i].mean(0) self._parameters[idx] = means
def test_n_components():
    """n_components defaults to n_clusters and changes the result otherwise."""
    # Test that after adding n_components, result is different and
    # n_components = n_clusters by default
    points, _ = make_blobs(n_samples=20, random_state=0,
                           centers=[[1, 1], [-1, -1]], cluster_std=0.01)

    baseline = SpectralClustering(n_clusters=2, random_state=0)
    baseline.fit(points)
    labels = baseline.labels_

    # set n_components = n_cluster and test if result is the same
    explicit = SpectralClustering(n_clusters=2, n_components=2,
                                  random_state=0)
    labels_same_ncomp = explicit.fit(points).labels_
    # test that n_components=n_clusters by default
    assert_array_equal(labels, labels_same_ncomp)

    # test that n_components affect result
    # n_clusters=8 by default, and set n_components=2
    default_clusters = SpectralClustering(n_components=2, random_state=0)
    labels_diff_ncomp = default_clusters.fit(points).labels_
    assert not np.array_equal(labels, labels_diff_ncomp)
def get_label_res(similar_matrix, n_subs):
    """Return spectral-clustering labels for a precomputed similarity matrix.

    The matrix is wrapped in ``lil_matrix`` and passed to the functional
    ``spectral_clustering`` API with the ARPACK eigen solver.

    :param similar_matrix: similarity (affinity) matrix.
    :param n_subs: number of clusters to produce.
    :return: whatever ``spectral_clustering`` returns (the label array).
    """
    # The original wrapped this call in ``if True:`` followed by ``elif False``
    # and ``else`` branches plus post-processing that could never execute.
    # The disabled alternatives were:
    #   AffinityPropagation(damping=0.75)
    #   SpectralClustering(n_clusters=n_subs, affinity='precomputed',
    #                      eigen_solver='arpack')
    #   SpectralClustering(n_clusters=n_subs, affinity='nearest_neighbors',
    #                      eigen_solver='arpack')
    # each followed by asserting 1 < number of distinct labels < 10.
    return spectral_clustering(lil_matrix(similar_matrix),
                               n_clusters=n_subs,
                               eigen_solver='arpack')
def MultiDimensionalClusteringSPCL(Xmatrix, time, xdata, eigen_solver = 'arpack', n_clusters=2, ax = None, show=False):
    """Standardise ``Xmatrix``, cluster it spectrally and optionally plot.

    :param Xmatrix: feature matrix; it is standardised with StandardScaler.
    :param time: x values of the plotted signal (only used when ``show``).
    :param xdata: y values of the plotted signal (only used when ``show``).
    :param eigen_solver: forwarded to ``SpectralClustering``.
    :param n_clusters: number of clusters.
    :param ax: axes to draw on; when ``None`` and ``show`` is true, a new
        figure is created.
    :param show: draw the signal coloured by cluster when true.
    :return: ``(X, y_pred)`` -- the standardised matrix and integer labels.
    """
    # Seeds NumPy's global RNG; seed() returns None, so nothing is kept.
    np.random.seed(0)
    colors = np.array(list('bgrcmykbgrcmykbgrcmykbgrcmyk'))
    colors = np.hstack([colors] * 20)

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(Xmatrix)

    # algorithm SpectralClustering
    SC = SpectralClustering(n_clusters=n_clusters,
                            eigen_solver=eigen_solver,
                            affinity="nearest_neighbors")
    # Apply algorithm
    fit = SC.fit(X)
    # np.int was only an alias of the builtin and no longer exists in NumPy.
    y_pred = fit.labels_.astype(int)

    # Representation
    if show:
        point_colors = colors[y_pred].tolist()
        if ax is not None:
            # The original tested ``ax == None`` here, which dereferenced
            # None and made the branch below unreachable.
            ax.set_title('Clustering Tech: ' + "SpectralClustering; " +
                         'Number of Clusters = ' + str(n_clusters),
                         fontsize=15)
            ax.plot(time, xdata, color='lightgray', alpha=0.4)
            ax.scatter(time, xdata, color=point_colors, s=10)
            ax.set_xlabel("time (ms)")
            ax.set_ylabel("Amplitude")
        else:
            fig, axis = plt.subplots(1, 1)
            fig.tight_layout()
            axis.plot(time, xdata, color='lightgray', alpha=0.4)
            axis.scatter(time, xdata, color=point_colors, s=10)
            axis.set_xlabel("time (ms)")
            axis.set_ylabel("Amplitude")
    return X, y_pred
sys.exit('Usage: python spectral.py dataset k') ## Data preprocessing data = parse_tab(sys.argv[1]) k = int(sys.argv[2]) classes = [example[-1] for example in data] examples = data_to_na(data) distances = euclidean_distances(examples, examples) # Apply gaussian kernel as suggested in the documentation: gamma = 0.5 # == 1 / num_features (heuristic) similarity_matrix = numpy.exp(-distances * gamma) ## Clustering sc = SpectralClustering(k=k, random_state=0) sc.fit(similarity_matrix) labels = sc.labels_ ## Performance evaluation ari = adjusted_rand_score(labels, classes) homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(labels, classes) print('ARI: {0}'.format(ari)) print('Homogeneity: {0}'.format(homogeneity)) print('Completeness: {0}'.format(completeness)) print('V-measure: {0}'.format(v_measure)) addToResult('Spectral', ari, homogeneity, completeness, v_measure) draw.scatter(examples, labels) print(os.path.splitext(os.path.basename(sys.argv[1]))[0]) draw.setImgTitle('spectral_' + os.path.splitext(os.path.basename(sys.argv[1]))[0]) draw.showImage()
def cluster_reproducibility(self, repeats=None, clusters=50):
    """
    Spectrally cluster a stack of co-occurrence arrays, one array at a time.

    The array at position ``idx`` is clustered into ``idx + 1`` clusters
    using it as a precomputed affinity.

    Optional input:
        repeats  - indexable stack of square co-occurrence arrays.  When not
                   supplied, ``self.repeats`` is used.
        clusters - how many leading entries of ``repeats`` to process.

    Returns: None ----BUT---- sets the following attributes on self:

        1. self.reproduction_matrices: uint8 array of shape
        ``(clusters,) + repeats.shape[1:]``.  Entry ``idx`` is the ``idx``-th
        array with rows and columns reordered so that members of the same
        cluster are adjacent (largest cluster first), rescaled to 0..255.

        2. self.reproduction_analysis: a list with one dictionary per
        processed array.  Each dictionary has two keys:
        'members' maps a cluster id (a string, unique across the whole run)
        to the list of row indices in that cluster; 'sizes' is a list of
        ``[cluster id, size]`` pairs sorted by decreasing size.
    """

    def _find(where, what):
        """ Indices of ``where`` equal to the label in ``what[0]``. """
        return np.where(where == what[0])[0].tolist()

    from sklearn.cluster import SpectralClustering
    from collections import Counter

    # ``repeats == None`` compares elementwise when repeats is an ndarray,
    # which makes the ``if`` ambiguous; an identity test is what is meant.
    if repeats is None:
        repeats = self.repeats

    spectral = SpectralClustering(n_clusters=1, affinity="precomputed")
    cluster = 0  # running offset that keeps cluster ids unique across arrays

    shape = (clusters,)+repeats.shape[1:]
    self.reproduction_matrices = np.zeros(shape, np.uint8)
    self.reproduction_analysis = []

    for idx, repeat in enumerate(repeats[:clusters]):

        # run the spectral clustering on the current repeat array.
        # this is the rate limiting step, and already uses all
        # available cpu cores.
        spectral.set_params(n_clusters=idx+1)
        spectral.fit(repeat)
        labels = spectral.labels_

        # order the clusters by size. keys in members are strings
        # as required for json dumps
        count = Counter(spectral.labels_)
        by_size = [(k, v) for k, v in count.items()]
        by_size.sort(key=lambda x: -x[1])
        members = {str(t[0]+cluster):_find(labels, t) for t in by_size}
        order = np.hstack([members[str(t[0]+cluster)] for t in by_size])

        # rearrange rows, then columns, into cluster order
        rearr = repeat[order].transpose()[order]
        sizes = [[str(k), len(v)] for k, v in members.items()]
        sizes.sort(key=lambda x: -x[1])

        # members are the row indices per cluster; sizes are the number of
        # rows in each cluster, sorted by size
        tmp = {'members':members, 'sizes':sizes}
        rescale = (rearr*255./rearr.max()).astype(np.uint8)

        self.reproduction_matrices[idx] = rescale
        self.reproduction_analysis.append(tmp)

        cluster += idx+1
textcoords = 'offset points', ha = 'right', va = 'bottom', bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5), arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
# NOTE(review): the line above is the tail of a call that starts before this
# fragment; the nesting of everything below relative to it is not visible.

# Title, axis labels and export of the first figure.
plt.title('The Workhorse Bus Stops of Pasadena ARTS \n - Spectral Clustering in Two Dimensions -' )
plt.xlabel("Average Delay in minutes")
plt.ylabel("Logarithm of total passenger count")
plt.savefig('station.png')
print("Valuable Bus Stops: \n")
# Rows whose predictedClass equals 1 are reported.
print(stationFrame[stationFrame['predictedClass'] == 1])
# pca visualization, not as sexy as one above
pcaDecomp = PCA(n_components=2)
reduced_data = pcaDecomp.fit_transform(stationFrame)
spectral.fit(reduced_data)
print(reduced_data)
# Mesh grid with step h covering the reduced data plus a margin of 1.
h = 0.3
x_min, x_max = reduced_data[:,0].min() - 1, reduced_data[:,0].max() + 1
y_min, y_max = reduced_data[:,1].min() - 1, reduced_data[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# NOTE(review): fit_predict is called on the grid points themselves, so Z
# looks like a fresh clustering of the grid rather than a prediction from
# the fit on reduced_data above -- confirm this is intended.
Z = spectral.fit_predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
fig1 = plt.figure()
# Grid labels as a background image, data points drawn on top.
plt.imshow(Z, interpolation='nearest',extent=(xx.min(), xx.max(), yy.min(), yy.max()), cmap=plt.cm.Paired, aspect='auto', origin='lower')
plt.plot(reduced_data[:,0], reduced_data[:,1], 'k.', markersize=8)
plt.title('cluster')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
# Empty tuples remove the tick marks.
plt.xticks(())
plt.yticks(())
def main():
    """Cluster units by a window of their temporal STA curve and export a CSV.

    For every sub-directory of ``--sourceFolder`` one row is built from the
    temporal curve at the unit's coordinates plus seven extra columns
    (a radius, b radius, angle, x, y, ellipse area, unit name).  The columns
    from 45% to 90% of ``framesNumber`` are clustered with the algorithm
    selected by ``--clusteringAlgorithm``.  ``outputFile.csv`` is written to
    ``--outputFolder``; each row is the unit's data followed by its cluster
    id and the index of the curve's peak in a 1000x-upsampled spline.

    Exits early when the source folder is missing or the output folder
    cannot be created.  Returns 0.
    """
    parser = argparse.ArgumentParser(prog='clusteringTime8.py',
        description='Performs clustering, Gaussian Mixture, KMeans or Spectral',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--sourceFolder',
        help='Source folder',
        type=str, required=True)
    parser.add_argument('--outputFolder',
        help='Output folder',
        type=str, required=True)
    parser.add_argument('--clustersNumber',
        help='Number of clusters',
        type=int, default='3', choices=[2,3,4,5,6,7,8,9,10,11,12,13,14,15], required=False)
    parser.add_argument('--framesNumber',
        help='Number of frames used in STA analysis',
        type=int, default='20', required=False)
    parser.add_argument('--blockSize',
        help='Size of each block in micrometres',
        type=int, default='50', required=False)
    parser.add_argument('--clusteringAlgorithm',
        help='Clustering algorithm to use: K-Means, Spectral Clustering, GMM',
        type=str, default='kmeans', choices=['kmeans','spectral','gmm','densityPeaks'], required=False)
    parser.add_argument('--percentageDensityDistance',
        help='Percentage used to calculate the distance',
        type=float, default='2', required=False)
    args = parser.parse_args()

    #Source folder of the files with the timestamps
    sourceFolder = rfe.fixPath(args.sourceFolder)
    if not os.path.exists(sourceFolder):
        print('')
        print('Source folder does not exists ' + sourceFolder)
        print('')
        sys.exit()

    #Output folder for the graphics
    outputFolder = rfe.fixPath(args.outputFolder)
    if not os.path.exists(outputFolder):
        try:
            os.makedirs(outputFolder)
        except OSError:
            # Only filesystem failures are expected here; a bare except would
            # also swallow KeyboardInterrupt and programming errors.
            print('')
            print('Unable to create folder ' + outputFolder)
            print('')
            sys.exit()

    #Clusters number for the kmeans algorithm
    clustersNumber = args.clustersNumber

    #Frames used in STA analysis
    framesNumber = args.framesNumber

    #Size of each block in micrometres
    blockSize = args.blockSize

    #Clustering Algorithm
    clusteringAlgorithm = args.clusteringAlgorithm

    #dataCluster stores the data to be used for the clustering process
    #the size is equal to the number of frames, aka, the time component
    #plus 7: the two radii of the ellipse, angle, x position, y position,
    #area and unit name
    dataCluster = zeros((1,framesNumber+7))
    units = []
    dato = empty((1,1))
    for unitFile in os.listdir(sourceFolder):
        if os.path.isdir(sourceFolder+unitFile):
            dato = empty((1,1))
            unitName = unitFile.rsplit('_', 1)[0]
            #print unitName
            dataUnit, coordinates = rfe.loadSTACurve(sourceFolder,unitFile,unitName)
            xSize = dataUnit.shape[0]
            ySize = dataUnit.shape[1]
            fitResult = rfe.loadFitMatrix(sourceFolder,unitFile)
            dataUnitTemporal = dataUnit[coordinates[0][0],[coordinates[1][0]],:]
            #Time data from FITResult
            #dataUnitTemporal = rfe.loadVectorAmp(sourceFolder,unitFile).T

            #A radius of the RF ellipse
            aRadius = fitResult[0][2]
            dato[0] = aRadius
            dataUnitCompleta = concatenate((dataUnitTemporal,dato),1)
            #B radius of the RF ellipse
            bRadius = fitResult[0][3]
            dato[0] = bRadius
            dataUnitCompleta = concatenate((dataUnitCompleta,dato),1)
            #angle of the RF ellipse
            angle = fitResult[0][1]
            dato[0] = angle
            dataUnitCompleta = concatenate((dataUnitCompleta,dato),1)
            #X coordinate of the RF ellipse
            xCoordinate = fitResult[0][4]
            #print 'xCoordinate',xCoordinate
            dato[0] = xCoordinate
            dataUnitCompleta = concatenate((dataUnitCompleta,dato),1)
            #Y coordinate of the RF ellipse
            yCoordinate = fitResult[0][5]
            #print 'yCoordinate',yCoordinate
            dato[0] = yCoordinate
            dataUnitCompleta = concatenate((dataUnitCompleta,dato),1)
            #Area of the RF ellipse
            area = aRadius*bRadius*pi
            dato[0] = area
            dataUnitCompleta = concatenate((dataUnitCompleta,dato),1)
            #UnitName
            dato=empty(1, dtype='|S16')
            dato[0]=unitName
            dataUnitCompleta = concatenate((dataUnitCompleta,dato.reshape(1, 1)),1)

            dataCluster = append(dataCluster,dataUnitCompleta, axis=0)
            units.append(unitName)
    # remove the first row of zeroes
    dataCluster = dataCluster[1:,:]

    # Temporal part only would be dataCluster[:,0:framesNumber]; the
    # clustering uses the 45%..90% window of the frames instead.
    # Slice bounds must be integers: float bounds are rejected by current
    # NumPy, and int() reproduces the truncation older releases applied.
    data = dataCluster[:,int(framesNumber*.45):int(framesNumber*.9)]
    data = data.astype(float64, copy=False)

    # Calculates the next 5-step for the y-coordinate
    maxData = ceil(amax(data)/5)*5
    minData = floor(amin(data)/5)*5

    if clusteringAlgorithm == 'spectral':
        from sklearn.cluster import SpectralClustering
        sc = SpectralClustering(n_clusters=clustersNumber, eigen_solver=None,
                random_state=None, n_init=10, gamma=1.0, affinity='nearest_neighbors',
                n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3,
                coef0=1, kernel_params=None)
        sc.fit(data)
        labels = sc.labels_
    elif clusteringAlgorithm == 'gmm':
        from sklearn import mixture
        gmix = mixture.GMM(n_components=clustersNumber, covariance_type='spherical')
        gmix.fit(data)
        labels = gmix.predict(data)
    elif clusteringAlgorithm == 'densityPeaks':
        import densityPeaks as dp
        percentageDensityDistance = args.percentageDensityDistance
        # densityPeaks decides the number of clusters itself.
        clustersNumber, labels = dp.predict(data, percentageDensityDistance)
    else:
        from sklearn.cluster import KMeans
        km = KMeans(init='k-means++', n_clusters=clustersNumber, n_init=10,n_jobs=-1)
        km.fit(data)
        labels = km.labels_

    # Output table: the framesNumber+7 columns of dataCluster, plus cluster
    # id and peak index.  The first (uninitialised) row is dropped below.
    dataFile = empty((1,framesNumber+9),dtype='|S16')
    datos = empty((1,framesNumber+7),dtype='|S16')
    dato = empty((1,1),dtype='|S16')
    for clusterId in range(clustersNumber):
        for unitId in range(dataCluster.shape[0]):
            if labels[unitId] == clusterId:
                dato[0] = clusterId
                dataFileTmp = concatenate(([dataCluster[unitId,:]],dato),1)
                # Interpolating spline through the temporal curve, sampled
                # 1000 times more densely than the frames.
                x = linspace(1, framesNumber, framesNumber)
                s = UnivariateSpline(x, dataCluster[unitId,0:framesNumber], s=0)
                xs = linspace(1, framesNumber, framesNumber*1000)
                ys = s(xs)

                # The peak is whichever extreme lies farther from the mean.
                media = mean(ys)
                maximo = amax(ys)
                minimo = amin(ys)
                maximaDistancia = absolute(maximo-media)
                minimaDistancia = absolute(minimo-media)
                peakTempCurve = minimo
                if maximaDistancia > minimaDistancia:
                    peakTempCurve = maximo

                # Index of the first sample equal to the peak value.
                dato[0] = unique(where(peakTempCurve==ys)[0])[0]
                dataFileTmp = concatenate((dataFileTmp,dato),1)
                dataFile = append(dataFile, dataFileTmp, axis=0)

    # remove the first (placeholder) row
    dataFile = dataFile[1:,:]
    savetxt(outputFolder+'outputFile.csv',dataFile, fmt='%s', delimiter=',', newline='\n')

    return 0
#line = aline[0][0] for line, _, _ in aline: print 'LINE=', line print datapath + '/WHOLE/trazos.' + line + '.mat' matpeaks = scipy.io.loadmat(datapath + '/WHOLE/trazos.' + line + '.mat') print matpeaks['Trazos'].shape data = matpeaks['Trazos'] normalize(data) if alg == 'spectral': spectral = SpectralClustering(n_clusters=nc, assign_labels='discretize', affinity='nearest_neighbors', n_neighbors=30) elif alg == 'kmeans': spectral = KMeans(n_clusters=nc, n_jobs=-1) spectral.fit(data) lab = spectral.labels_ centers = np.zeros((nc, data.shape[1])) for i in range(data.shape[0]): centers[lab[i]] += data[i] print len(lab) l = [lab[i] for i in range(len(lab))] c = Counter(l) print c
#kmeans km = KMeans(n_clusters = CLNO) km_fit = km.fit(dfun) km_clusters = km.labels_.tolist() print no_unique(km_clusters) #affinity propagation ap = AffinityPropagation() ap_fit = ap.fit(dfun) ap_clusters = ap.labels_.tolist() print no_unique(ap_clusters) #spectral clustering sc = SpectralClustering(CLNO) sc_fit = sc.fit(dfun) sc_clusters = sc.labels_.tolist() print "spectral", no_unique(sc_clusters) #ward ac = AgglomerativeClustering(CLNO, connectivity = conn, linkage = 'ward') ac_fit = ac.fit(dfun) ac_clusters = ac.labels_.tolist() print no_unique(ac_clusters) #output pd data = {"km":km_clusters, "ap":ap_clusters, "sc":sc_clusters, "ac":ac_clusters} df_cl = pd.DataFrame(data = data) df_cl.to_csv("clusters.csv")
""" Preprocessing """
import mypreprocessing as prp
# Row-wise normalisation of the module-level `data`.
data = prp.RowWiseNorm(data)

# One score list per metric, filled for k = 2..10.
silh = []
comp = []
# The identifier had been garbled to ``h**o`` (not a valid assignment
# target); restored to ``homo`` to match the homogeneity scores it holds.
homo = []
vmea = []

from sklearn.cluster import SpectralClustering
from sklearn import metrics

for k in range(2, 11):
    print(k)
    km = SpectralClustering(n_clusters=k)
    km.fit(data)

    # Silhouette needs only the data; the other three compare the labels
    # with the reference assignment `projects_true`.
    silh.append(metrics.silhouette_score(data, km.labels_))
    comp.append(metrics.completeness_score(projects_true, km.labels_))
    homo.append(metrics.homogeneity_score(projects_true, km.labels_))
    vmea.append(metrics.v_measure_score(projects_true, km.labels_))

plt.style.use('fivethirtyeight')
plt.plot(range(2,11), silh)
plt.plot(range(2,11), comp)
plt.plot(range(2,11), homo)
plt.plot(range(2,11), vmea)
plt.title('Spectral clustering, Row-wise Normalization')
plt.xlabel('k clusters')
plt.ylabel('Silhouette Coefficient')
plt.legend(['silhouette', 'completeness', 'homogeneity', 'v-measure'], loc='upper right')
plt.show()
def main():
    """Spectrally cluster receptive-field units and plot/save the result.

    For every sub-directory of ``--sourceFolder`` one row is built from the
    gaussian-filtered temporal curve at the unit's coordinates plus five
    ellipse parameters (a radius, b radius, angle, x, y).  The first
    ``framesNumber + 2`` columns are clustered; plots and a CSV of the
    cluster ids are written to ``--outputFolder``.  With ``--doPCA`` the
    clustering is repeated on the PCA-reduced data.

    ``clustersColours`` is not defined in this function; it is expected at
    module level.  Returns 0.
    """
    parser = argparse.ArgumentParser(prog='kmeans_scikit.py',
        description='Performs K-means using scikit-learn',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--sourceFolder',
        help='Source folder',
        type=str, required=True)
    parser.add_argument('--outputFolder',
        help='Output folder',
        type=str, required=True)
    parser.add_argument('--clustersNumber',
        help='Number of clusters',
        type=int, default='5', choices=[3,4,5,6,7,8,9,10,11,12,13,14,15], required=False)
    parser.add_argument('--framesNumber',
        help='Number of frames used in STA analysis',
        type=int, default='20', required=False)
    parser.add_argument('--pcaComponents',
        help='Number of components for PCA',
        type=int, default='4', required=False)
    # NOTE(review): type=bool turns any non-empty string (including "False")
    # into True -- left unchanged to keep the CLI as it is.
    parser.add_argument('--doPCA',
        help='Performs clusterings with PCA or not',
        type=bool, default=False, required=False)
    args = parser.parse_args()

    #Source folder of the files with the timestamps
    sourceFolder = rfe.fixPath(args.sourceFolder)
    if not os.path.exists(sourceFolder):
        print('')
        print('Source folder does not exists ' + sourceFolder)
        sys.exit()

    #Output folder for the graphics
    outputFolder = rfe.fixPath(args.outputFolder)
    if not os.path.exists(outputFolder):
        try:
            os.makedirs(outputFolder)
        except OSError:
            # Only filesystem failures are expected here; a bare except would
            # also swallow KeyboardInterrupt and programming errors.
            print('')
            print('Unable to create folder ' + outputFolder)
            sys.exit()

    #Clusters number for the kmeans algorithm
    clustersNumber = args.clustersNumber

    #Frames used in STA analysis
    framesNumber = args.framesNumber

    #dataCluster stores the data to be used for the clustering process
    #the size is equal to the number of frames, aka, the time component
    #plus 5 as we are incorporating the 2 dimensions of the ellipse,
    #x position, y position and angle
    dataCluster = np.zeros((1,framesNumber+5))
    units=[]
    dato=np.zeros((1,1))
    for unitFile in os.listdir(sourceFolder):
        if os.path.isdir(sourceFolder+unitFile):
            unitName = unitFile.rsplit('_', 1)[0]
            dataUnit, coordinates = rfe.loadSTACurve(sourceFolder,unitFile,unitName)
            xSize = dataUnit.shape[0]
            ySize = dataUnit.shape[1]
            fitResult = rfe.loadFitMatrix(sourceFolder,unitFile)
            #should we use the not-gaussian-fitted data for clustering?
            dataUnitGauss = scipy.ndimage.gaussian_filter(dataUnit[coordinates[0][0],[coordinates[1][0]],:],2)
            #A radius of the RF ellipse
            dato[0]=fitResult[0][2]
            dataUnitCompleta = np.concatenate((dataUnitGauss,dato),1)
            #B radius of the RF ellipse
            dato[0]=fitResult[0][3]
            dataUnitCompleta = np.concatenate((dataUnitCompleta,dato),1)
            #angle of the RF ellipse
            dato[0]=fitResult[0][1]
            dataUnitCompleta = np.concatenate((dataUnitCompleta,dato),1)
            #X coordinate of the RF ellipse
            dato[0]=fitResult[0][4]
            dataUnitCompleta = np.concatenate((dataUnitCompleta,dato),1)
            #Y coordinate of the RF ellipse
            dato[0]=fitResult[0][5]
            dataUnitCompleta = np.concatenate((dataUnitCompleta,dato),1)
            dataCluster = np.append(dataCluster,dataUnitCompleta, axis=0)
            units.append(unitName)
    # remove the first row of zeroes
    dataCluster = dataCluster[1:,:]

    # Clustering input: the temporal curve plus the two radii.
    data = dataCluster[:,0:framesNumber+2]
    sc = SpectralClustering(n_clusters=clustersNumber, eigen_solver=None, random_state=None, n_init=10, gamma=1.0, affinity='nearest_neighbors', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, kernel_params=None)
    sc.fit(data)
    labels = sc.labels_
    fit = metrics.silhouette_score(data, labels, metric='euclidean')
    rfe.graficaCluster(labels, dataCluster[:,0:framesNumber-1], outputFolder+'no_pca.png',clustersColours, fit)

    # generate graphics of all ellipses
    # NOTE(review): nesting reconstructed from a whitespace-collapsed source;
    # the two plotting calls are assumed to run once per cluster.
    for clusterId in range(clustersNumber):
        dataGrilla = np.zeros((1,framesNumber+5))
        for unitId in range(dataCluster.shape[0]):
            if labels[unitId] == clusterId:
                datos=np.zeros((1,framesNumber+5))
                datos[0]=dataCluster[unitId,:]
                dataGrilla = np.append(dataGrilla,datos, axis=0)
        # remove the first row of zeroes
        dataGrilla = dataGrilla[1:,:]
        rfe.graficaGrilla(dataGrilla,outputFolder+'Grilla_'+str(clusterId)+'.png',clustersColours[clusterId],framesNumber,xSize,ySize)
        rfe.graficaCluster(labels, dataGrilla[:,0:framesNumber-1], outputFolder+'cluster_'+str(clusterId)+'.png',clustersColours[clusterId])

    rfe.guardaClustersIDs(outputFolder,units,labels,outputFolder+'clustering_no_pca.csv')

    if args.doPCA:
        pca = PCA(n_components=args.pcaComponents)
        newData = pca.fit_transform(data)
        sc.fit(newData)
        # The original kept scoring and saving the pre-PCA labels here, so
        # the "pca" outputs never reflected the PCA clustering.
        labels = sc.labels_
        fit = metrics.silhouette_score(newData, labels, metric='euclidean')
        rfe.graficaCluster(labels, dataCluster[:,0:framesNumber-1], outputFolder+'pca.png',clustersColours,fit)
        rfe.guardaClustersIDs(outputFolder,units,labels,outputFolder+'clustering_pca.csv')

    return 0
if __name__ == "__main__":
    players = {}
    data = []
    names = []

    # Build data from file.  The context manager closes the file, which the
    # original never did.
    with open("kda_200.txt", "r") as data_file:
        for line in data_file:
            fields = line.split(",")
            # NOTE(review): fields[4] is used twice -- possibly a typo for a
            # different column; kept as-is, confirm against the file layout.
            data.append([float(fields[1]), float(fields[3]), float(fields[4]), float(fields[4])])
            names.append(fields[0])

    # Create and fit model.  fit_predict() fits the model itself, so the
    # separate fit() call that preceded it only repeated the work.
    clus = SpectralClustering(n_clusters=5,eigen_solver='arpack',affinity= "nearest_neighbors")
    labels = clus.fit_predict(data)

    # Sort the fitted data into 5 boxes, one for each role
    boxes = [[],[],[],[],[]]
    for x in range(len(data)):
        pred = labels[x]
        name = names[x]
        # names like "Amazing (Maurice Stuckenschneider)" are too long, cut at first space
        if " " in name:
            name = name[0:name.find(" ")+1]
        boxes[pred].append(name.ljust(10))

    # Get size of largest cluster so you can pad the others
    sizes = [len(box) for box in boxes]