def scikit_pca(model, rel_wds, plot_lims, title, cluster="kmeans"):
    """Cluster all word vectors of a word2vec model and plot them in 2-D PCA space.

    Args:
        model: word2vec model; indexed by key (``model[key]``) to get a vector.
        rel_wds: forwarded unchanged to ``scatter_plot``.
        plot_lims: forwarded unchanged to ``scatter_plot``.
        title: forwarded unchanged to ``scatter_plot``.
        cluster: ``"kmeans"`` (8 clusters) or ``"spectral"`` (estimator defaults).

    Returns:
        ``explained_variance_ratio_`` of the fitted 2-component PCA.

    Raises:
        ValueError: if ``cluster`` is not a supported method name.
    """
    # Validate first: an unknown name used to leave ``labels`` unbound and only
    # surfaced as a NameError at plot time, after all the fitting was done.
    if cluster not in ("kmeans", "spectral"):
        raise ValueError(
            "cluster must be 'kmeans' or 'spectral', got %r" % (cluster,))

    X, keys = make_data_matrix(model)
    for i, key in enumerate(keys):
        X[i,] = model[key]

    # Clustering runs on the raw vectors; only the PCA input is standardised.
    if cluster == "kmeans":
        labels = KMeans(n_clusters=8).fit_predict(X)
    else:
        labels = SpectralClustering().fit_predict(X)

    # PCA
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=2)
    X_transf = sklearn_pca.fit_transform(X_std)
    scatter_plot(X_transf[:, 0], X_transf[:, 1], rel_wds, labels, title,
                 keys, plot_lims)
    return sklearn_pca.explained_variance_ratio_
def spectral_clustering(matrix, N):
    """Spectrally cluster ``matrix`` into ``N`` groups.

    Returns a list of ``N`` lists; list ``c`` holds the row indices of
    ``matrix`` that were assigned to cluster ``c``.
    """
    assignments = SpectralClustering(n_clusters=N).fit_predict(matrix)
    members = []
    for _ in range(N):
        members.append([])
    for row_index, cluster_id in enumerate(assignments):
        members[cluster_id].append(row_index)
    return members
def create_word2vec_cluster(word2vec_model):
    """Fit spectral clustering on a word2vec model's vectors and pickle the estimator.

    Args:
        word2vec_model: object exposing the word-vector matrix as ``syn0``.

    Returns:
        The fitted ``SpectralClustering`` estimator (also written to disk).
    """
    word_vectors = word2vec_model.syn0
    # Roughly one cluster per 1000 words. Floor division keeps this an int on
    # Python 3, and the lower bound avoids n_clusters=0 for small vocabularies.
    num_clusters = max(1, word_vectors.shape[0] // 1000)
    spectral_cluster_model = SpectralClustering(n_clusters=num_clusters)
    spectral_cluster_model.fit(word_vectors)
    # NOTE(review): the file name says k_means but a spectral model is stored.
    with open(r"C:\Ofir\Tau\Machine Learning\Project\project\k_means_model.pkl", "wb") as out_file:
        pickle.dump(spectral_cluster_model, out_file)
    return spectral_cluster_model
def call_spectral(num_cluster ,mode_, data, update_flag):
    """Spectral clustering on a symmetrised 10-nearest-neighbour graph of ``data``.

    Args:
        num_cluster: number of clusters to find.
        mode_: unused by this function.
        data: feature matrix; standardised before the graph is built.
        update_flag: when truthy, return the raw label array only.

    Returns:
        ``labels`` when ``update_flag`` is truthy; otherwise a tuple
        ``(label_dict, unique_dict)`` mapping stringified positions to float
        labels (per sample, and per distinct label respectively).
    """
    X = StandardScaler().fit_transform(data)
    spectral = SpectralClustering(n_clusters=num_cluster,
                                  eigen_solver='arpack',
                                  affinity='precomputed')
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # kNN graphs are not symmetric; average with the transpose.
    connectivity = 0.5 * (connectivity + connectivity.T)
    spectral.fit(connectivity)
    labels = spectral.labels_
    if update_flag:
        return labels

    label_dict = {str(i): float(label) for i, label in enumerate(labels)}
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(label_dict)

    unique_dict = {}
    for i, uniq in enumerate(np.unique(labels)):
        print(uniq)
        unique_dict[str(i)] = float(uniq)
    print(unique_dict)
    return label_dict, unique_dict
def main(cm_file, perm_file, steps, labels_file, limit_classes=None):
    """Cluster the rows of a confusion matrix and print the resulting groups.

    Args:
        cm_file: path to a JSON file holding the confusion matrix.
        perm_file: unused by this function.
        steps: unused by this function.
        labels_file: path to a JSON list of class labels; if the file does
            not exist, the indices ``0..len(cm)-1`` are used instead.
        limit_classes: unused by this function.
    """
    # Load confusion matrix
    with open(cm_file) as f:
        cm = json.load(f)
    cm = np.array(cm)

    # Load labels
    if os.path.isfile(labels_file):
        with open(labels_file, "r") as f:
            labels = json.load(f)
    else:
        labels = list(range(len(cm)))

    n_clusters = 14  # hyperparameter
    spectral = SpectralClustering(n_clusters=n_clusters,
                                  eigen_solver='arpack',
                                  affinity="nearest_neighbors")
    spectral.fit(cm)
    if hasattr(spectral, 'labels_'):
        # np.int was only an alias of the builtin and was removed in NumPy 1.24.
        y_pred = spectral.labels_.astype(int)
    else:
        y_pred = spectral.predict(cm)
    sscore = silhouette_score(cm, y_pred)
    print("silhouette_score={} with {} clusters"
          .format(sscore, n_clusters))

    grouping = [[] for _ in range(n_clusters)]
    for label, y in zip(labels, y_pred):
        grouping[y].append(label)
    for group in grouping:
        print(" {}: {}".format(len(group), group))
def spectral_clustering(G, graph_name, num_clusters):
    """Split graph ``G`` into spectral clusters and write each one as a GEXF file.

    Args:
        G: networkx graph to partition.
        graph_name: used for the output sub-directory and file-name prefix.
        num_clusters: currently ignored; see the note at the estimator.

    Returns:
        List of subgraphs of ``G``, one per cluster found.
    """
    # Find a way to figure out clusters number automatically
    write_directory = os.path.join(Constants.SPECTRAL_PATH, graph_name)
    if not os.path.exists(write_directory):
        os.makedirs(write_directory)

    # Materialise the nodes: on networkx >= 2 ``G.nodes()`` is a NodeView whose
    # ``[]`` is a lookup by node key, not by position, so indexing it with the
    # row index below would return the wrong object (or raise).
    node_list = list(G.nodes())
    matrix_data = nx.to_numpy_matrix(G, nodelist=node_list)

    # NOTE(review): n_clusters is hard-coded to 2 and ``num_clusters`` is
    # ignored -- confirm whether that is intentional before changing it.
    spectral = SpectralClustering(n_clusters=2,
                                  eigen_solver='arpack',
                                  affinity="rbf")
    spectral.fit(matrix_data)

    clusters = {}
    for node_index, node_label in enumerate(spectral.labels_):
        clusters.setdefault(node_label, []).append(node_list[node_index])

    subgraphs = []
    for cluster_index, cluster_label in enumerate(clusters):
        subgraph = G.subgraph(clusters[cluster_label])
        subgraphs.append(subgraph)
        nx.write_gexf(subgraph, os.path.join(write_directory,graph_name+str(cluster_index)+"_I"+Constants.GEXF_FORMAT))
    return subgraphs
def fast_app_spe_cluster(data, label, k, n_cluster):
    """Approximate spectral clustering through k-means representative points.

    The data is summarised by ``k`` k-means centers, the centers are
    spectrally clustered into ``n_cluster`` groups, and each sample inherits
    the group of its nearest center.

    Args:
        data: 2-D array of samples (rows).
        label: unused by this function.
        k: number of k-means representative points.
        n_cluster: number of spectral clusters.

    Returns:
        List with one cluster label per row of ``data``.
    """
    # time.clock() was removed in Python 3.8; perf_counter is its replacement.
    start_time = time.perf_counter()

    # k-means: get the representative points (center points)
    k_means = KMeans(n_clusters=k)
    k_means.fit(data)
    y_centers = k_means.cluster_centers_

    # Correspondence table: index of the nearest center for every sample.
    # argmin returns the first minimum, matching the strict ``>`` comparison
    # of the former hand-written scan.
    x_to_centers_table = [
        int(np.argmin(np.sum((y_centers - row) ** 2, axis=1)))
        for row in np.asarray(data)
    ]

    # spectral clustering of the representative points
    spe_cluster = SpectralClustering(n_clusters=n_cluster)
    spe_cluster.fit(y_centers)
    spe_label = spe_cluster.labels_

    # propagate the center labels back to the samples
    x_label = [spe_label[center] for center in x_to_centers_table]

    spend_time = time.perf_counter() - start_time
    print("spend time is %f seconds" % spend_time)
    return x_label
def compute_centroid_set(self, **kwargs):
    """Return unit-norm cluster centroids stacked over all document subsets.

    For every subset produced by ``subset_iterator`` the all-zero rows are
    dropped, the rest is spectrally clustered on its cosine affinity into
    ``self.subcluster_kn`` groups, and each group's mean vector is scaled to
    unit length. The per-subset centroid matrices are stacked vertically.
    """
    batches = subset_iterator(X=self.docv,
                              m=self.subcluster_m,
                              repeats=self.subcluster_repeats)
    n_groups = self.subcluster_kn
    model = SpectralClustering(n_clusters=n_groups, affinity="precomputed")

    stacked = []
    for batch in batches:
        # Drop rows that are zero vectors
        is_zero_row = (batch ** 2).sum(axis=1) == 0
        batch = batch[~is_zero_row]

        assignments = model.fit_predict(cosine_affinity(batch))

        # One normalised centroid per cluster
        (_, dim) = batch.shape
        centroids = np.zeros((n_groups, dim))
        for group in range(n_groups):
            center = batch[assignments == group].mean(axis=0)
            center /= np.linalg.norm(center)
            centroids[group] = center
        stacked.append(centroids)

    return np.vstack(stacked)
def spectral_clustering(k, X, G, W=None, run_times=5):
    """Spectral clustering on the weighted affinity ``sqrt(W) G sqrt(W)``.

    Args:
        k: number of clusters.
        X: data; only ``len(X)`` is used, to size the default weights.
        G: precomputed affinity matrix.
        W: weight matrix; defaults to the identity of size ``len(X)``.
        run_times: passed to ``SpectralClustering`` as ``n_init``.

    Returns:
        Array of predicted cluster labels.
    """
    # Identity check: comparing types (or using ``==`` on an array) is not
    # the right way to detect an omitted argument.
    if W is None:
        W = np.eye(len(X))
    # Element-wise square root.
    W2 = np.sqrt(W)
    Gtilde = W2.dot(G.dot(W2))
    sc = SpectralClustering(k, affinity='precomputed', n_init=run_times)
    return sc.fit_predict(Gtilde)
def run(self, features, number_of_clusters=2, restarts=10, delta=3.0):
    """Cluster ``features`` and return the labels wrapped in a one-element list.

    Args:
        features: samples to cluster; passed to ``get_similarity``.
        number_of_clusters: number of clusters. With 1, every sample gets
            label 0 and no estimator is fitted.
        restarts: passed to ``SpectralClustering`` as ``n_init``.
        delta: passed to ``get_similarity``.

    Returns:
        ``[labels]`` where ``labels`` has one entry per sample.
    """
    if number_of_clusters == 1:
        return [numpy.zeros(len(features), dtype=numpy.int32)]

    # ``k`` is the long-removed spelling of this argument; current
    # scikit-learn only accepts ``n_clusters``.
    classifier = SpectralClustering(n_clusters=number_of_clusters,
                                    n_init=restarts)
    similarity = get_similarity(features, delta)
    # NOTE(review): the estimator keeps its default affinity, so the
    # similarity matrix is treated as a feature matrix -- confirm whether
    # affinity='precomputed' was intended.
    classifier.fit(similarity)
    return [classifier.labels_]
def spectral_clustering2(similarity, concepts=2, euclid=False):
    """Spectrally cluster ``similarity`` into ``concepts`` groups.

    Args:
        similarity: feature matrix when ``euclid`` is true, otherwise a
            precomputed similarity matrix.
        concepts: number of clusters.
        euclid: choose the nearest-neighbours affinity on raw features
            instead of treating the input as precomputed.

    Returns:
        Array of predicted cluster labels.
    """
    if euclid:
        model = SpectralClustering(n_clusters=concepts,
                                   affinity='nearest_neighbors')
        return model.fit_predict(similarity)

    model = SpectralClustering(n_clusters=concepts, affinity='precomputed')
    # Negative similarities are clamped to zero. Work on a copy so the
    # caller's matrix is not modified as a side effect.
    similarity = similarity.copy()
    similarity[similarity < 0] = 0
    return model.fit_predict(similarity)
def get_coregulatory_states(corr_matrices, similarity_matrix, n_clusters):
    """Cluster on a precomputed similarity and average ``corr_matrices`` per cluster.

    Returns:
        ``(coreg_states, labels)`` where ``coreg_states`` maps each cluster
        label to the mean over axis 0 of the matrices assigned to it.
    """
    model = SpectralClustering(n_clusters=n_clusters, affinity='precomputed')
    labels = model.fit_predict(similarity_matrix)
    coreg_states = {
        state: corr_matrices[labels == state, :, :].mean(axis=0)
        for state in np.unique(labels)
    }
    return coreg_states, labels
def spectral(k, X, G, run_times=10):
    """Cluster the precomputed affinity ``G`` into ``k`` groups with sklearn.

    ``run_times`` is forwarded as ``n_init``, the number of differently
    initialised runs. ``X`` is accepted but not used.
    """
    estimator = SpectralClustering(k, affinity='precomputed',
                                   n_init=run_times)
    return estimator.fit_predict(G)
def dist_spectral(x, y):
    """Cluster the 2-D points ``(x[s], y[s])`` into three spectral clusters.

    The number of points is taken from the module-level ``dataset``
    (``dataset.shape[0]``), not from the arguments.
    """
    n_points = dataset.shape[0]
    points = np.array([np.array([x[s], y[s]]) for s in range(n_points)])
    model = SpectralClustering(n_clusters=3,
                               eigen_solver='arpack',
                               affinity="nearest_neighbors")
    return model.fit_predict(points)
def spectral_clustering(S,X,config):
    '''
    Run cosine-affinity spectral clustering on X and return the labels.

    The cluster count is read from config["n_clusters"]. The S argument is
    accepted but not used by this function.
    '''
    from sklearn.cluster import SpectralClustering

    n_clusters = int(config["n_clusters"])
    model = SpectralClustering(affinity='cosine', n_clusters=n_clusters)
    labels = model.fit_predict(X)
    return labels
def test_affinities():
    """Both nearest-neighbours and RBF (gamma=2) affinities recover two blobs."""
    X, y = make_blobs(n_samples=40, random_state=1,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.4)
    estimators = (
        # nearest neighbors affinity
        SpectralClustering(n_clusters=2, affinity="nearest_neighbors",
                           random_state=0),
        # default (rbf) affinity with an explicit gamma
        SpectralClustering(n_clusters=2, gamma=2, random_state=0),
    )
    for sp in estimators:
        predicted = sp.fit(X).labels_
        assert_equal(adjusted_rand_score(y, predicted), 1)
def cluster_faces_CNN(name = '9_8913259@N03', img_list = 'faces_list.txt'):
    """Cluster faces from a precomputed similarity matrix and write an HTML report.

    Loads ``<root>/<model_name>similarity_matrix.cPickle`` (``model_name`` is
    a module-level name), spectrally clusters it into at most 8 groups,
    prints per-group similarity statistics, keeps the groups whose mean
    pairwise similarity exceeds 0.5 as "important persons" (sorted by size,
    largest first) and passes everything to ``create_face_group_html_CNN``.

    Args:
        name: album directory name under the hard-coded root folder.
        img_list: file inside that directory listing one image path per line.
    """
    root = '/Users/wangyufei/Documents/Study/intern_adobe/face_recognition_CNN/'+name + '/'
    # Pickles are binary: open in 'rb' and let the context manager close the
    # handle even if loading fails.
    with open(root + model_name + 'similarity_matrix.cPickle', 'rb') as pickle_file:
        affinity_matrix = cPickle.load(pickle_file)

    n_samples = affinity_matrix.shape[0]
    clusterer = SpectralClustering(affinity='precomputed',
                                   n_clusters=min(8, n_samples - 1),
                                   eigen_solver='arpack',
                                   n_neighbors=min(5, n_samples))
    a = clusterer.fit_predict(affinity_matrix)

    # cluster id -> indices of the faces in that cluster
    groups = {}
    for face_index, cluster_id in enumerate(a):
        groups.setdefault(cluster_id, []).append(face_index)

    unique_person_id = []
    for kk in groups:
        # np.inf instead of np.Inf: the capitalised alias was removed in NumPy 2.0.
        min_similarity = np.inf
        max_similarity = -np.inf
        mean_similarity = 0
        this_group_ids = groups[kk]
        group_size = len(this_group_ids)
        for j in range(group_size):
            for i in range(j + 1, group_size):
                similarity = affinity_matrix[this_group_ids[i], this_group_ids[j]]
                if similarity < min_similarity:
                    min_similarity = similarity
                if similarity > max_similarity:
                    max_similarity = similarity
                mean_similarity += similarity
        # Number of unordered pairs; max(1, ...) protects singleton groups.
        mean_similarity /= max(1, group_size * (group_size - 1) // 2)
        # One pre-formatted argument prints identically on Python 2 and 3.
        print("%s %s %s %s" % (group_size, mean_similarity,
                               max_similarity, min_similarity))
        if mean_similarity > 0.5:
            unique_person_id.append(kk)

    important_person = [[i, len(groups[i])] for i in unique_person_id]
    important_person.sort(key = lambda x:x[1], reverse=True)

    in_path = root + img_list
    imgs_list = []
    with open(in_path, 'r') as data:
        for line in data:
            # Strip only the newline: slicing off the last character would
            # eat a real character on a final line without a newline.
            imgs_list.append(line.rstrip('\n').split('/')[-1])

    # cluster id -> image file names in that cluster
    face_groups = {}
    for cluster_id, img_name in zip(a, imgs_list):
        face_groups.setdefault(cluster_id, []).append(img_name)

    create_face_group_html_CNN(name, face_groups, important_person)
def spectral(k, X, G, z, run_times=10):
    """Cluster the precomputed affinity ``G`` and score it against ``z``.

    ``run_times`` is forwarded as ``n_init``; ``X`` is accepted but unused.

    Returns:
        ``(accuracy, variation_information)`` computed by the ``metric``
        module between ``z`` and the predicted labels.
    """
    estimator = SpectralClustering(k, affinity='precomputed',
                                   n_init=run_times)
    predicted = estimator.fit_predict(G)
    accuracy = metric.accuracy(z, predicted)
    variation = metric.variation_information(z, predicted)
    return accuracy, variation
def run(self, k):
    """Cluster ``self.X`` into ``k`` groups; store and return the allocation.

    When ``self.data_is_kernel`` is set, ``self.X`` is used as a precomputed
    affinity and also stored as ``self.kernel``. Otherwise the estimator's
    default affinity is used and its ``affinity_matrix_`` becomes
    ``self.kernel``.
    """
    if not self.data_is_kernel:
        estimator = SpectralClustering(n_clusters=k, gamma=self.gammav)
        self.allocation = estimator.fit_predict(self.X)
        self.kernel = estimator.affinity_matrix_
        return self.allocation

    estimator = SpectralClustering(n_clusters=k, gamma=self.gammav,
                                   affinity='precomputed')
    self.allocation = estimator.fit_predict(self.X)
    self.kernel = self.X
    return self.allocation
def spectral_clustering(crime_rows, column_names, num_clusters, affinity='rbf', n_neighbors=0, assign_labels='kmeans'):
    """Spectrally cluster crime coordinates and format the result.

    crime_rows : sequence of rows
        The first two entries of each row are used as the coordinates to
        cluster; the remaining entries are carried along as crime info.
    column_names :
        Forwarded unchanged to ``_format_clustering``.
    num_clusters : integer
        Number of clusters; passed to ``SpectralClustering`` as
        ``n_clusters`` and to ``_format_clustering``.
    affinity, n_neighbors, assign_labels :
        Forwarded unchanged to ``SpectralClustering``.

    Returns whatever ``_format_clustering`` returns.
    """
    crime_xy = []
    crime_info = []
    for crime in crime_rows:
        crime_xy.append(crime[0:2])
        crime_info.append(crime[2:])

    model = SpectralClustering(
        n_clusters=num_clusters,
        affinity=affinity,
        n_neighbors=n_neighbors,
        assign_labels=assign_labels)

    print("Running spectral clustering....")
    print("length crimexy")
    print(len(crime_xy))
    # NOTE(review): labels are computed for a sample of 3000 points but are
    # formatted below against the full crime_xy / crime_info lists -- verify
    # that random_sampling keeps them aligned.
    sample = random_sampling(crime_xy, num_samples=3000)
    labels = model.fit_predict(sample)

    print("Formatting......")
    return _format_clustering(labels, crime_xy, crime_info, column_names,
                              num_clusters=num_clusters)
def predictSpectralClustering(X, y, n=2, val='rbf'):
    """Spectrally cluster the first 600 samples and score the result.

    Returns:
        ``(results, score, gini)``. For ``n == 2`` the score is the better
        of the direct and label-flipped ``calculate_score``; for any other
        ``n`` it is 0. ``val`` is accepted but not used.
    """
    # NOTE(review): the shuffled copies are never used; the slices below are
    # taken from the original, unshuffled X and y.
    _shuffled_X, _shuffled_y = shuffle(X, y, random_state=0)
    X = X[:600,]
    y = y[:600,]

    results = SpectralClustering(n_clusters=n).fit_predict(X)
    gini = compute_gini(results)
    if n != 2:
        return (results, 0, gini)

    same = calculate_score(results, y)
    opp = calculate_score(results, y, True)
    return (results, max(same, opp), gini)
def run_clustering(methods, cases):
    """Cluster methods via a graphical-lasso covariance and print the result.

    Args:
        methods: sequence of pairs; ``m[0]`` is printed as the name and
            ``m[1]`` is used as the true group.
        cases: data matrix the ``GraphLassoCV`` model is fitted on.

    Prints one ``group:cluster<TAB>name`` line per method, then the adjusted
    Rand score between the predicted clusters and the true groups.
    """
    true_method_groups = [m[1] for m in methods]
    edge_model = GraphLassoCV(alphas=4, n_refinements=5, n_jobs=3,
                              max_iter=100)
    edge_model.fit(cases)
    CV = edge_model.covariance_

    num_clusters = 3
    spectral = SpectralClustering(n_clusters=num_clusters,
                                  affinity='precomputed')
    spectral.fit(np.asarray(CV))

    # A single pre-formatted argument prints identically on Python 2 and 3.
    for i, m in enumerate(methods):
        print("%s:%d\t%s" % (m[1], spectral.labels_[i], m[0]))
    print("Adj. Rand Score: %f" % adjusted_rand_score(spectral.labels_,
                                                       true_method_groups))
def spectral_clustering(vectors: list, num_rows, k):
    """Cluster samples whose features are spread across ``vectors``.

    Sample ``s`` is made of element ``s`` of every vector, giving a
    ``num_rows x len(vectors)`` matrix that is split into ``k`` clusters
    with a nearest-neighbours affinity.
    """
    rows = [np.array([v[s] for v in vectors]) for s in range(num_rows)]
    matrix = np.array(rows)
    model = SpectralClustering(n_clusters=k,
                               eigen_solver='arpack',
                               affinity="nearest_neighbors")
    return model.fit_predict(matrix)
def eval_k(max_k):
    """Plot the adjusted Rand index of spectral clustering for k = 2..max_k.

    Uses the module-level data ``x`` and reference labels ``y``. Prints
    each k and its ARI, then shows a plot of ARI against k.
    """
    a_score, idx = [], []
    # range and single-argument print(...) work on both Python 2 and 3.
    for k in range(2, max_k + 1):
        print('k={}'.format(k))
        est = SpectralClustering(n_clusters=k, affinity='nearest_neighbors')
        est.fit(x)
        ari = metrics.adjusted_rand_score(y, est.labels_)
        print(ari)
        a_score.append(ari)
        idx.append(k)

    pl.plot(idx, a_score)
    pl.xlabel('# of clusters')
    pl.ylabel('ARI')
    pl.show()
def _small_partition(self, data):
    """Split ``data`` in two with spectral clustering on its similarity matrix.

    The similarity comes from ``self._get_similarity``. Returns the label
    array produced by ``fit_predict``; the sizes of labels 0 and 1 are
    logged at debug level.
    """
    _logger.debug("Running _small_partition on %s observations", len(data))
    similarity = self._get_similarity(data, sparse = self.sparse_similarity)

    _logger.debug("Spectral clustering")
    splitter = SpectralClustering(n_clusters = 2,
                                  affinity = 'precomputed',
                                  assign_labels = 'discretize')
    partition = splitter.fit_predict(similarity)
    _logger.debug("Done spectral clustering")

    size_zero = len(partition[partition == 0])
    size_one = len(partition[partition == 1])
    _logger.debug("Result of _small_partition: #0: {}, #1: {}"
                  .format(size_zero, size_one))
    return partition
def spectral(X, num_clusters):
    """Spectral clustering on the standardised columns of ``X``.

    Args:
        X: pandas DataFrame of features.
        num_clusters: number of clusters to find.

    Returns:
        Array of cluster labels, one per row of ``X``.
    """
    model = SpectralClustering(
        n_clusters=num_clusters,
        eigen_solver="arpack",
        affinity="nearest_neighbors",
        n_neighbors=4,
        assign_labels="discretize",
    )
    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same
    # ndarray and exists in every pandas version.
    cleanX = preprocessing.scale(X.values)
    model.fit(cleanX)
    return model.labels_
def spectral(x, num_clusters):
    """Fit RBF spectral clustering on ``x``.

    Returns:
        ``(estimator, (None, labels, k))`` where ``k`` is the number of
        distinct labels actually found.
    """
    settings = dict(
        affinity='rbf',
        n_clusters=num_clusters,
        n_init=10,
        assign_labels='kmeans',
        gamma=1.0,
        degree=3,
        coef0=1,
    )
    spec = SpectralClustering(**settings)
    spec.fit(x)
    assignments = spec.labels_
    n_found = len(np.unique(assignments))
    return spec, (None, assignments, n_found)
def test_affinities():
    """Valid affinities recover two blobs; an unknown affinity raises ValueError."""
    X, y = make_blobs(n_samples=40, random_state=1,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.4)

    working_estimators = (
        # nearest neighbors affinity
        SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                           random_state=0),
        # default (rbf) affinity with an explicit gamma
        SpectralClustering(n_clusters=2, gamma=2, random_state=0),
    )
    for sp in working_estimators:
        predicted = sp.fit(X).labels_
        assert_equal(adjusted_rand_score(y, predicted), 1)

    # raise error on unknown affinity
    broken = SpectralClustering(n_clusters=2, affinity='<unknown>')
    assert_raises(ValueError, broken.fit, X)
def test_affinities():
    """Exercise the string, kernel-name and callable affinities of SpectralClustering."""
    # Note: in the following, random_state has been selected to have
    # a dataset that yields a stable eigen decomposition both when built
    # on OSX and Linux
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)

    # nearest neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                            random_state=0)
    assert_warns_message(UserWarning, 'not fully connected', sp.fit, X)
    assert_equal(adjusted_rand_score(y, sp.labels_), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    assert_equal(adjusted_rand_score(y, sp.fit(X).labels_), 1)

    X = check_random_state(10).rand(10, 5) * 10
    expected_shape = (X.shape[0],)

    for kern in kernel_metrics():
        # Additive chi^2 gives a negative similarity matrix which
        # doesn't make sense for spectral clustering
        if kern == 'additive_chi2':
            continue
        sp = SpectralClustering(n_clusters=2, affinity=kern, random_state=0)
        assert_equal(expected_shape, sp.fit(X).labels_.shape)

    # constant callable affinity
    sp = SpectralClustering(n_clusters=2, affinity=lambda x, y: 1,
                            random_state=0)
    assert_equal(expected_shape, sp.fit(X).labels_.shape)

    def histogram(x, y, **kwargs):
        """Histogram kernel implemented as a callable."""
        assert_equal(kwargs, {})  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    sp = SpectralClustering(n_clusters=2, affinity=histogram, random_state=0)
    assert_equal(expected_shape, sp.fit(X).labels_.shape)

    # raise error on unknown affinity
    sp = SpectralClustering(n_clusters=2, affinity='<unknown>')
    assert_raises(ValueError, sp.fit, X)
def compute_spectral_clustering(n_vertex, edge_list, n_clusters):
    """Spectrally cluster a graph given as an edge list and time the fit.

    Args:
        n_vertex: number of vertices, passed to ``tf.compute_adjacency_matrix``.
        edge_list: edges, passed to ``tf.compute_adjacency_matrix``.
        n_clusters: number of clusters.

    Returns:
        ``(labels, clusters, exectime)``: the labels after
        ``tf.compute_normal_labels``, the clusters derived from them, and the
        wall-clock seconds spent in ``fit_predict`` only.
    """
    from sklearn.cluster import SpectralClustering

    clst = SpectralClustering(n_clusters, affinity="precomputed")
    adjacency_matrix = tf.compute_adjacency_matrix(n_vertex, edge_list)

    t = time.time()
    # The second positional argument of fit_predict is the ignored ``y``;
    # passing n_clusters there had no effect and was misleading.
    labels = clst.fit_predict(adjacency_matrix)
    exectime = time.time() - t

    labels = tf.compute_normal_labels(labels)
    clusters = tf.compute_clusters_from_labels(labels)
    return labels, clusters, exectime
def dbscan(self):
    """Run DBSCAN (eps=0.3) on ``self.values``, store the result, call ``self.output()``."""
    estimator = DBSCAN(eps=0.3)
    self.Clusters = estimator
    self.clusterIndex = estimator.fit_predict(self.values)
    # Core sample indices are deliberately not stored (previously disabled).
    self.output()
clustering = KMeans(n_clusters=k, random_state=0).fit(zOut) listResult = clustering.predict(zOut) elif args.clustering_method == 'LouvainB': listResult, size = generateLouvainCluster(edgeList) k = len(np.unique(listResult)) print('Louvain cluster: ' + str(k)) k = int(k * resolution) if k > 3 else 2 clustering = Birch(n_clusters=k).fit(zOut) listResult = clustering.predict(zOut) elif args.clustering_method == 'KMeans': clustering = KMeans(n_clusters=args.n_clusters, random_state=0).fit(zOut) listResult = clustering.predict(zOut) elif args.clustering_method == 'SpectralClustering': clustering = SpectralClustering(n_clusters=args.n_clusters, assign_labels="discretize", random_state=0).fit(zOut) listResult = clustering.labels_.tolist() elif args.clustering_method == 'AffinityPropagation': clustering = AffinityPropagation().fit(zOut) listResult = clustering.predict(zOut) elif args.clustering_method == 'AgglomerativeClustering': clustering = AgglomerativeClustering().fit(zOut) listResult = clustering.labels_.tolist() elif args.clustering_method == 'AgglomerativeClusteringK': clustering = AgglomerativeClustering( n_clusters=args.n_clusters).fit(zOut) listResult = clustering.labels_.tolist() elif args.clustering_method == 'Birch': clustering = Birch(n_clusters=args.n_clusters).fit(zOut) listResult = clustering.predict(zOut)
'SimplePP': SimplePPEncoder(), 'CESAMOEncoder': CESAMOEncoder(), 'CENG': CENGEncoder(verbose=0) } """END: Import encoders""" import random """START: Import models""" try: from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering # Birch DBSCAN except: raise Exception('Scikit-Learn 0.22.2+ not available') Models = { 'K-Means': KMeans(n_clusters=n_clusters), 'Spectral': SpectralClustering(n_clusters=n_clusters, eigen_solver='lobpcg'), 'Agglomerative': AgglomerativeClustering(n_clusters=n_clusters) } #'DBSCAN': DBSCAN(eps=0.3, min_samples=15)} """END: Import models""" # Performance evaluation function import numpy as np from sklearn.model_selection import train_test_split from sklearn.metrics import adjusted_mutual_info_score as ami from sklearn.metrics import calinski_harabasz_score as chs from sklearn.metrics import silhouette_score as sil import time def performance(encoder, models, K):
ax.matshow(a) locs, labels = plt.xticks(range(size), empty) plt.setp(labels, rotation=90) plt.yticks(range(size), empty) plt.show() # In[24]: from sklearn.cluster import SpectralClustering from sklearn.metrics import silhouette_score x_score = [] y_score = [] for i in range(2, 10): #Get scores for n_clusters from 2 to 10 tmp_clf = SpectralClustering(n_clusters=i, affinity='precomputed') tmp_clf.fit(a) score = silhouette_score(a, tmp_clf.labels_, metric='precomputed') x_score.append(i) y_score.append(score) plt.subplots(figsize=(10, 10)) plt.plot(x_score, y_score) plt.grid() plt.show() # In[25]: clusters_count = 3 clusters = [[] for i in range(clusters_count)] clf = SpectralClustering(n_clusters=clusters_count,
def spectral_cluster(k, X):
    """Return spectral-clustering labels for ``X`` using ``k`` clusters and gamma=0.1."""
    from sklearn.cluster import SpectralClustering

    estimator = SpectralClustering(n_clusters=k, gamma=0.1)
    return estimator.fit_predict(X)
# Show the dataset sns.set() fig, ax = plt.subplots(figsize=(12, 8)) ax.scatter(data[:, 0], data[:, 1]) ax.set_xlabel(r'$x_0$', fontsize=14) ax.set_ylabel(r'$x_1$', fontsize=14) plt.show() # Perform the clustering km = KMeans(n_clusters=2, random_state=1000) sc = SpectralClustering(n_clusters=2, affinity='rbf', gamma=2.0, random_state=1000) Y_pred_km = km.fit_predict(data) Y_pred_sc = sc.fit_predict(data) # Show the results fig, ax = plt.subplots(1, 3, figsize=(20, 6), sharey=True) ax[0].scatter(data[:, 0], data[:, 1], c='b', s=5) ax[1].scatter(data[Y_pred_sc == 0, 0], data[Y_pred_sc == 0, 1], marker='o', s=5, c='b',
def test_clustering(df, gmms, title="", save_to_file=False, highlight_point=None):
    # Project the records to 2-D, spectrally cluster the projection, then
    # score every cluster by its balance of normal vs. attack records and
    # re-check the "normal" clusters with a density test.
    #
    # df   : DataFrame with 'attack' and 'difficulty' columns (both dropped
    #        before projection; 'attack' is kept as the true label).
    # gmms : passed to reduction.gmm_reduction.
    # NOTE(review): title, save_to_file and highlight_point are not used in
    # the visible part of this function, and nothing is returned -- the
    # function may continue beyond what is shown here.

    # preprocessing
    df_train = copy.deepcopy(df)
    df_train.drop('attack', 1, inplace=True)
    df_train.drop('difficulty', 1, inplace=True)

    # from about 30 dimension to 2 dimension
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    # data_per_true_labels : try to make sort of dictionary per each label
    data_per_true_labels = []
    for i in range(len(attacks)):
        data_per_true_labels.append([])
    true_attack_types = df["attack"].values.tolist()
    for i, d in enumerate(cproj):
        data_per_true_labels[true_attack_types[i]].append(d)

    A = affinity.get_affinity_matrix(cproj, metric_method=distance.cosdist, knn=8)
    k = predict_k(A)
    print "supposed k : " + str(k)
    # Both the predicted k and the 1%-of-data value are overridden: the
    # cluster count actually used is the constant 12.
    lim = int(len(df) * 0.01)
    lim = 12
    # if lim < 3 or lim > 10 :
    #     lim = 10
    k = lim
    print "Total number of clusters : " + str(k)

    sc = SpectralClustering(n_clusters=k, affinity="precomputed", assign_labels="kmeans").fit(A)
    res = sc.labels_

    # cluster data set
    # clusters[c] is a vote: +1 per normal record, -1 per any other record.
    clusters = [0] * k
    clusters_data = []
    clusters_xmean = [-1] * k
    clusters_ymean = [-1] * k
    clusters_xstd = [-1] * k
    clusters_ystd = [-1] * k
    for i in range(k):
        clusters_data.append([])
    for i, p in enumerate(cproj):
        true_label = true_attack_types[i]
        if true_label == model.attack_normal:
            clusters[res[i]] = clusters[res[i]] + 1
        else:
            clusters[res[i]] = clusters[res[i]] - 1
        clusters_data[res[i]].append(p)

    # cluster recheck with density
    # Per-cluster mean and standard deviation of both projected coordinates.
    for i, cluster in enumerate(clusters):
        p = clusters_data[i]
        x = np.array([t[0] for t in p])
        y = np.array([t[1] for t in p])
        clusters_xmean[i] = np.mean(x)
        clusters_ymean[i] = np.mean(y)
        clusters_xstd[i] = np.std(x)
        clusters_ystd[i] = np.std(y)

    # ds[i] holds the density check result for clusters voted normal and
    # None for the others; a negative result overrides the vote with -99999.
    ds = []
    for i, cluster in enumerate(clusters):
        if cluster > 0:
            d = check_abnormal_with_density(clusters_xmean[i], clusters_ymean[i], clusters_xstd[i], clusters_ystd[i], len(clusters_data[i]))
            ds.append(d)
            if 0 > d:
                clusters[i] = -99999
        else:
            ds.append(None)
    print("ds")
    print ds
# (array([21, 23, 25, 33, 37, 41], dtype=int64),)0
# (array([ 7, 9, 11, 13, 15, 17, 27, 29, 31], dtype=int64),)1
# (array([42, 44], dtype=int64),)2
# (array([18, 38, 46], dtype=int64),)3
# (array([ 5, 19, 43, 45, 47], dtype=int64),)4
# (array([ 6, 8, 10, 12, 14, 16], dtype=int64),)5
# (array([20, 22, 24, 32, 36, 40], dtype=int64),)6
# (array([28, 30, 34], dtype=int64),)7
# (array([ 0, 2, 4, 26], dtype=int64),)8
# (array([ 1, 3, 35, 39], dtype=int64),)9

# Grid search over (n_neighbors, n_clusters); each combination is scored with
# the Calinski-Harabasz index and collected for the 3-D scatter plot below.
x1,y1,z1=[],[],[]
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and
# later removed the old spelling; use whichever this installation provides.
_ch_score = (getattr(metrics, "calinski_harabasz_score", None)
             or metrics.calinski_harabaz_score)
for n_neighbors in (11,12,13,14,15,16,17,18,19,20):  # 4,5,6,7,8,9,10,11,12,13,14,15,16
    for k in (10,11,12,13,14,15,16,17,18,19,20):
        y_pred = SpectralClustering(affinity='nearest_neighbors',n_clusters=k, n_neighbors=n_neighbors).fit_predict(X)
        # Score once and reuse it for both the log line and the plot data.
        score = _ch_score(X, y_pred)
        print ("Calinski-Harabasz Score with n_neighbors=", n_neighbors, "n_clusters=", k,"score:", score )
        x1.append(n_neighbors)
        y1.append(k)
        z1.append(score)
print(x1,y1,z1)
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
# Create the figure, the 3-D axes object and the 3-D scatter plot
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(x1,y1,z1)
class SpectralClusteringPrimitive(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    '''
    Primitive that applies the sklearn spectral clustering algorithm to a
    D3M dataframe.

    Inputs: D3M dataframe with features, target column(s) and a primary-key
    (index) column.
    Outputs: the input dataframe with one integer column of cluster labels
    appended. When the task_type hyperparameter is 'classification' that
    column is named 'cluster_labels' and tagged as an "Attribute"; otherwise
    it takes the name of the first target column and is tagged as a
    "PredictedTarget".
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "d13a4529-f0ba-44ee-a867-e0fdbb71d6e2",
        'version': __version__,
        # NOTE(review): the registered name is "tsne" although this primitive
        # does spectral clustering -- looks like a copy-paste leftover; confirm
        # before changing, since the name is part of the published metadata.
        'name': "tsne",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Clustering', 'Graph Clustering'],
        'source': {
            'name': __author__,
            'contact': __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/kungfuai/d3m-primitives",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.16"
            },
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri": "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.clustering.spectral_graph.SpectralClustering',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.SPECTRAL_CLUSTERING,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.CLUSTERING,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        # Build the underlying estimator once from the hyperparameters; the
        # primitive's random seed becomes the estimator's random_state.
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self.sc = SC(n_clusters=self.hyperparams['n_clusters'],
                     n_init=self.hyperparams['n_init'],
                     n_neighbors=self.hyperparams['n_neighbors'],
                     affinity=self.hyperparams['affinity'],
                     random_state=self.random_seed)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Cluster the feature columns of ``inputs`` and append the labels.

        Parameters
        ----------
        inputs : dataframe with feature columns, target column(s) and a
            primary-key column. ``timeout`` and ``iterations`` are not used.

        Returns
        ----------
        Outputs
            ``inputs`` (restricted to the rows with a non-empty target when
            any such row exists) with one integer cluster-label column
            appended, wrapped in a ``CallResult``.
        """
        # Locate the target columns, falling back from TrueTarget to Target
        # to SuggestedTarget.
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]

        # Features = everything except the first primary-key column and the
        # target columns.
        X_test = inputs.drop(columns=list(inputs)[index[0]])
        X_test = X_test.drop(columns=target_names).values

        # special semi-supervised case - during training, only produce rows with labels
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = select_rows(inputs, np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        sc_df = d3m_DataFrame(
            pandas.DataFrame(self.sc.fit_predict(X_test),
                             columns=['cluster_labels']))

        # just add last column of last column ('clusters')
        col_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type(1)
        if self.hyperparams['task_type'] == 'classification':
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            col_dict['name'] = 'cluster_labels'
        else:
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
            # NOTE(review): raises IndexError when no target column was found.
            col_dict['name'] = target_names[0]
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, 0),
                                               col_dict)

        # Describe the column dimension of the one-column label dataframe.
        df_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
        df_dict_1['length'] = 1
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, ),
                                               df_dict)
        return CallResult(inputs.append_columns(sc_df))
import numpy as np
import scipy
import random
from sklearn.cluster import SpectralClustering
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse

# Eight 2-D points, stored as rows and then put into a fixed order.
eight = np.array(([-3, -2, -2, -2, 1, 1, 2, 4],
                  [0, 4, -1, -2, 4, 2, -4, -3])).T
eight = eight[[7, 6, 2, 0, 3, 1, 5, 4], :]

random.seed(11)
sc = SpectralClustering(n_clusters=2, eigen_solver="arpack",
                        affinity="rbf", random_state=11).fit(eight)
# Eigendecomposition of the affinity matrix; the result is not kept.
scipy.linalg.eigh(sc.affinity_matrix_)

# Cluster 0: covariance of the member points and its eigen-decomposition
# (square roots of the eigenvalues, plus the eigenvectors).
points0 = eight[np.where(sc.labels_ == 0)]
covm = np.cov(points0[:, 0], points0[:, 1])
eigvals0, eigve = np.linalg.eig(covm)
eigva = np.sqrt(eigvals0)

# Cluster 1: same quantities.
points1 = eight[np.where(sc.labels_ == 1)]
covm1 = np.cov(points1[:, 0], points1[:, 1])
eigvals1, eigve1 = np.linalg.eig(covm1)
eigva1 = np.sqrt(eigvals1)

fig, ax = plt.subplots(figsize=(10, 10))
if (algo == 'MiniBatch1000'): kmeans = MiniBatchKMeans( n_clusters=n_cluster, batch_size=1000, ).fit(word_embeddings) kmeans.fit(word_embeddings), y_kmeans = kmeans.predict(word_embeddings) print(y_kmeans) pca(y_kmeans) if (algo == 'Spectral'): clustering = SpectralClustering(n_clusters=n_cluster, assign_labels="discretize", random_state=0).fit(word_embeddings) labels = clustering.labels_ print(labels) pca(labels) if (algo == 'Agglomerative'): cluster = AgglomerativeClustering(n_clusters=n_cluster, affinity='euclidean', linkage='ward') cluster.fit_predict(word_embeddings) labels = cluster.labels_ pca(labels) if (algo == 'BIRCH'): brc = Birch(n_clusters=n_cluster)
class PCA_and_Spectral():
    """Script-style pipeline that runs entirely at class-definition time.

    Steps, in order:
      1. Build a 6000 x 6000 adjacency matrix from ``Graph.csv``.
      2. Load the feature rows from ``Extracted_features.csv`` and reduce
         them to 800 PCA components.
      3. Spectral-cluster the adjacency matrix into 10 clusters.
      4. Use ``Seed.csv`` to relate clusters to digits, keep the spectral
         clusters for digits 1 and 6, and re-cluster the remaining points
         with KMeans seeded from the seed centroids.
      5. Label rows 6000..9999 of the reduced features by nearest cluster
         centre and write ``submission10.csv``.

    Every name assigned below becomes a class attribute.
    """

    # Create adjacency matrix
    # NOTE(review): the files are opened in 'rb' mode and handed to
    # csv.reader, which is the Python 2 convention -- confirm the target
    # interpreter before running under Python 3.
    with open('/Users/kat/Desktop/Kaggle/Graph.csv', 'rb') as csvfile1:
        graphreader = csv.reader(csvfile1, delimiter=' ', quotechar='|')
        adjgraph = np.empty((6000, 6000))
        adjgraph.fill(0)
        for row in graphreader:
            # The first space-delimited field holds "a,b": two 1-based node
            # ids. The edge is recorded in both directions.
            arr = row[0].split(",")
            adjgraph[int(arr[0]) - 1][int(arr[1]) - 1] = 1
            adjgraph[int(arr[1]) - 1][int(arr[0]) - 1] = 1

    # Get features data into newEF matrix
    with open('/Users/kat/Desktop/Kaggle/Extracted_features.csv',
              'rb') as csvfile3:
        EF = csv.reader(csvfile3, delimiter=' ', quotechar='|')
        newEF = []
        for row in EF:
            arr = row[0].split(",")
            arr2 = np.asarray(arr)
            # NOTE(review): ``np.float`` is an alias that recent NumPy
            # releases no longer provide -- confirm the NumPy version in use.
            arr3 = arr2.astype(np.float)
            newEF.append(arr3)

    # PCA reduce features data to 800 dim (instead of 1084)
    pca = PCA(n_components=800)
    red_pca = pca.fit_transform(newEF)

    # spectral clustering on adjacency matrix
    spectral = SpectralClustering(10, affinity="precomputed")
    new_plot = spectral.fit_predict(
        adjgraph)  # 6000 x 1 Array with cluster labels

    # digit -> list of [point id, reduced features, spectral cluster]
    matching = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: [],
        7: [],
        8: [],
        9: []
    }
    # get cluster matchings for first 60 points
    with open('/Users/kat/Desktop/Kaggle/Seed.csv', 'rb') as csvfile2:
        seedreader = csv.reader(csvfile2, delimiter=' ', quotechar='|')
        for row in seedreader:
            # Each seed row is "point_id,digit" with a 1-based point id.
            arr = row[0].split(",")
            findClust = new_plot[int(arr[0]) - 1]
            matching[int(arr[1])].append(
                [int(arr[0]), red_pca[int(arr[0]) - 1], findClust])

    # spectral cluster label -> reduced feature vectors of its members
    clusters = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: [],
        7: [],
        8: [],
        9: []
    }
    # Get points of each cluster
    for i in range(1, 6001):
        findClust = new_plot[i - 1]
        clusters[findClust].append(red_pca[i - 1])

    #for i in range(10):
    #   print "item is " + str(i)
    #   for item in matching[i]:
    #       print item[2]

    # Hard-coded digit -> spectral cluster label table.
    # NOTE(review): presumably derived by hand from the seed matchings of one
    # particular run; spectral label numbering is not guaranteed to be stable
    # across runs -- confirm.
    finalmatches = {0: 5, 1: 2, 2: 9, 3: 3, 4: 6, 5: 8, 6: 7, 7: 4, 8: 1, 9: 0}
    # match clusters to digits
    adjustedcluster = {}
    for i in range(10):
        index = finalmatches[i]
        adjustedcluster[i] = clusters[index]

    filtered_features = []
    filtered_features_idx = []
    # save clusters for digits 1 and 6
    cluster_2_digit_1 = adjustedcluster[1]
    cluster_7_digit_6 = adjustedcluster[6]
    # filter out clusters for digits 1 and 6
    # (spectral labels 2 and 7, matching finalmatches[1] and finalmatches[6])
    for i in range(len(new_plot)):
        if not new_plot[i] == 2 and not new_plot[i] == 7:
            filtered_features.append(red_pca[i])
            filtered_features_idx.append(i + 1)  # store the 1-based point id

    centroids_pca_8_clusters = []
    # get initial centroids of the 8 digits based on seed
    for i in range(10):
        newarray = []
        if not i == 1 and not i == 6:
            for j in range(len(matching[i])):
                newarray.append(np.asarray(matching[i][j][1]))
            newa = np.asarray(newarray)
            centroids_pca_8_clusters.append(newa.mean(axis=0))
    centroids_pca_8_clusters = np.asarray(centroids_pca_8_clusters)

    # do kmeans to clean up 8 clusters
    kmeans_8 = KMeans(
        n_clusters=8,
        init=centroids_pca_8_clusters).fit_predict(filtered_features)

    # digit -> list of [point id, reduced features, kmeans cluster]
    kmeans_matching = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: [],
        7: [],
        8: [],
        9: []
    }
    with open('/Users/kat/Desktop/Kaggle/Seed.csv', 'rb') as csvfile2:
        seedreader = csv.reader(csvfile2, delimiter=' ', quotechar='|')
        for row in seedreader:
            arr = row[0].split(",")
            if not int(arr[1]) == 1 and not int(arr[1]) == 6:
                try:
                    idx = filtered_features_idx.index(int(arr[0]))
                    kmeans_matching[int(arr[1])].append(
                        [int(arr[0]), red_pca[int(arr[0]) - 1], kmeans_8[idx]])
                except ValueError:
                    # The seed point was filtered out above (it fell in
                    # spectral cluster 2 or 7), so it has no kmeans label.
                    pass

    #for i in range(10):
    #    print "item is " + str(i)
    #    for j in range(len(kmeans_matching[i])):
    #        print kmeans_matching[i][j][2]

    # kmeans cluster label -> reduced feature vectors of its members
    # (only keys 0..7 are ever filled, since KMeans uses 8 clusters)
    clusters_kmeans = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: [],
        7: [],
        8: [],
        9: []
    }
    # Get points of each cluster from kmeans
    for i in range(len(kmeans_8)):
        findClust = kmeans_8[i]
        idx = filtered_features_idx[i]
        clusters_kmeans[findClust].append(red_pca[idx - 1])

    # digit -> kmeans cluster label; digits 1 and 6 are absent on purpose and
    # the remaining digits map, in order, onto the 8 initial centroids.
    finalmatches_kmeans = {0: 0, 2: 1, 3: 2, 4: 3, 5: 4, 7: 5, 8: 6, 9: 7}
    # match clusters to digits
    adjustedcluster_kmeans = {}
    for i in range(10):
        if not i == 1 and not i == 6:
            index = finalmatches_kmeans[i]
            adjustedcluster_kmeans[i] = clusters_kmeans[index]

    # get features of digit 1 from spectral
    cluster_2_digit_1 = np.asarray(cluster_2_digit_1)
    digit_1_centroid = cluster_2_digit_1.mean(axis=0)
    # get features of digit 6 from spectral
    cluster_7_digit_6 = np.asarray(cluster_7_digit_6)
    digit_6_centroid = cluster_7_digit_6.mean(axis=0)

    cluster_centers = []
    # calculate the cluster center for each cluster (digit)
    for i in range(10):
        if i == 1:
            cluster_centers.append(digit_1_centroid)
        elif i == 6:
            cluster_centers.append(digit_6_centroid)
        else:
            newa = np.asarray(adjustedcluster_kmeans[i])
            cluster_centers.append(newa.mean(axis=0))

    # Output table: a header row followed by 4000 [Id, Label] rows.
    finalclusters = [[0 for i in range(2)] for j in range(4001)]
    finalclusters[0][0] = 'Id'
    finalclusters[0][1] = 'Label'
    for i in range(1, 4001):
        finalclusters[i][0] = 6000 + i
        newdist = []
        for j in range(10):
            # Row ``i + 5999`` of red_pca is the point with 1-based id
            # ``6000 + i``.
            newdist.append(
                dist.euclidean(red_pca[i + 5999], cluster_centers[j]))
        # The predicted digit is the index of the nearest cluster centre.
        label = np.argmin(newdist)
        finalclusters[i][1] = label

    with open('submission10.csv', "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(finalclusters)
""" Created on Fri Jan 08 17:34:11 2016 @author: wu34 """ from sklearn.cluster import SpectralClustering import visSimilarityMat import utilise Domain = ['DietType', 'ActType'] # dist is to set the similarity measurement method, the default is TFIDFCosin # jaccard,novelJaccard,TFIDFCosin,TFIDFEclud,TFCosin,TFEclud dist = 'TFEclud' for domain in Domain: dietSimilarity_dict = {} if domain == 'DietItem': Similarity_dict = utilise.SimilarityDict(domain, dist) elif domain == 'ActItem': Similarity_dict = utilise.SimilarityDict(domain, dist) elif domain == 'DietType': Similarity_dict = utilise.SimilarityDict(domain, dist) elif domain == 'ActType': Similarity_dict = utilise.SimilarityDict(domain, dist) X = visSimilarityMat.similarityDict2array(Similarity_dict, 0) af = SpectralClustering(affinity="precomputed").fit(X) labels = af.labels_ print labels
y_ID_2 = []

# generate X_train using X_id
# NOTE(review): ``.ix`` is deprecated and absent from current pandas. Whether
# ``.iloc`` is the right replacement depends on the column labels of
# ``df_bipart`` (positional vs. label lookup) -- confirm before migrating.
ID_bipart = df_bipart.ix[:, 0].astype(str)
data_bipart = df_bipart.ix[:, 1:]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('number of nodes' + str(len(ID_bipart)))

# need to determine how num_group is obtained
num_group = num_cluster(data_bipart)
print('number of groups' + str(num_group))

random.seed(17)
# ``random.seed`` only seeds Python's ``random`` module; the estimator takes
# its seed from ``random_state``, so pass it explicitly to make the labels
# reproducible.
labels_bipart = SpectralClustering(num_group,
                                   gamma=0.7,
                                   affinity='rbf',
                                   random_state=17).fit(data_bipart).labels_

# add new column group to data
df_bipart['group'] = labels_bipart

#############
# group data by their 'group'
# divide by group i
# @i means i is a variable in group
global_y = 0
global_len = 0
global_truth = []
global_fitted = []
kmed = KMedoids(2).fit_predict(data) # In[10]: plt.scatter(data[:, 0], data[:, 1], c=kmed, s=5, cmap="autumn") # # Algoritmo del Clustering Espectral # In[11]: from sklearn.cluster import SpectralClustering # In[12]: clust = SpectralClustering(2).fit_predict(data) # In[13]: plt.scatter(data[:, 0], data[:, 1], c=clust, s=5, cmap="autumn") # * Podemos estimar la k: # * No: Propagación de la afinidad # * Si: Podemos usar la distancia Euclídea: # * Si: K-Means # * No: Buscar valores centrales: # * Si: K-Medoides # * No: Los datos son linealmente separables: # * Si: Clustering aglomerativo # * No: Clustering Espectral
# Three concentric rings: (cos t, sin t) scaled by 1, 2 and 3.
data1 = np.vstack((np.cos(t), np.sin(t))).T
data2 = np.vstack((2 * np.cos(t), 2 * np.sin(t))).T
data3 = np.vstack((3 * np.cos(t), 3 * np.sin(t))).T
data = np.vstack((data1, data2, data3))

n_clusters = 3
# Pairwise *squared* Euclidean distances (``squared=True``).
m = euclidean_distances(data, squared=True)

plt.figure(figsize=(12, 8), facecolor='w')
plt.suptitle(u'谱聚类', fontsize=20)
clrs = plt.cm.Spectral(np.linspace(0, 0.8, n_clusters))

# One subplot per kernel width ``s``.
for i, s in enumerate(np.logspace(-2, 0, 6)):
    print(s)
    # Gaussian affinity exp(-d^2 / s^2). ``m`` already holds d^2, so it must
    # not be squared again (that would use the fourth power of the distance).
    # The small constant keeps every affinity strictly positive.
    af = np.exp(-m / (s ** 2)) + 1e-6
    model = SpectralClustering(n_clusters=n_clusters,
                               affinity='precomputed',
                               assign_labels='kmeans',
                               random_state=1)
    y_hat = model.fit_predict(af)
    plt.subplot(2, 3, i + 1)
    for k, clr in enumerate(clrs):
        cur = (y_hat == k)
        # ``clr`` is one RGBA row; wrap it in a list so matplotlib treats it
        # as a single colour for all points rather than as per-point values
        # (the two are ambiguous when a cluster happens to hold 4 points).
        plt.scatter(data[cur, 0], data[cur, 1], s=40, c=[clr], edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing

# Training split of 20 Newsgroups; ``labels`` are the ground-truth targets.
newsgroups_train = fetch_20newsgroups(subset='train')
labels = newsgroups_train.target

# TF-IDF features, then an L1-normalised copy of each row.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')
X = vectorizer.fit_transform(newsgroups_train.data)
Y = preprocessing.normalize(X,
                            norm='l1',
                            axis=1,
                            copy=True,
                            return_norm=False)

# Spectral clustering with an RBF affinity, fitted on the normalised rows.
km = SpectralClustering(n_clusters=20, gamma=0.01, affinity='rbf')
km.fit(Y)

# Internal quality measure. It is evaluated on ``X`` (not ``Y``) over a
# random sample of 1000 points.
print("Silhouette Coefficient: %0.3f" %
      metrics.silhouette_score(X, km.labels_, sample_size=1000))

# External measures: each compares the ground truth with the found labels and
# is printed with its own format string, in this order.
_LABEL_SCORES = (
    ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
    ("NMI:%0.3f", metrics.normalized_mutual_info_score),
    ("AMI:%0.3f", metrics.adjusted_mutual_info_score),
    ("Homogeneity: %0.3f", metrics.homogeneity_score),
    ("Completeness: %0.3f", metrics.completeness_score),
    ("V-measure: %0.3f", metrics.v_measure_score),
    ("FMI:%0.3f", metrics.fowlkes_mallows_score),
)
for _fmt, _score in _LABEL_SCORES:
    print(_fmt % _score(labels, km.labels_))
# Replace columns 0, 2, 4 and 5 of the third and fourth frames with integer
# codes, using a fresh LabelEncoder per column.
for _frame in (traindata3, traindata4):
    for i in [0, 2, 4, 5]:
        le = preprocessing.LabelEncoder()
        _column = _frame.iloc[:, i]
        _frame.iloc[:, i] = le.fit(_column).transform(_column)

# Stack the four frames and attach a two-cluster spectral label to each row.
data = pd.concat([traindata1, traindata2, traindata3, traindata4])
model = SpectralClustering(n_clusters=2)
model.fit(data)
labels = model.labels_
data = data.assign(label=labels)

# First six columns are the features; column 6 is taken as the target
# (presumably the ``label`` column just added -- verify the frame width).
X = data.iloc[:, 0:6]
Y = data.iloc[:, 6]

# NOTE(review): ``trainX`` (the normalised features) is computed but the
# split below uses the un-normalised ``X`` -- confirm this is intended.
scaler = Normalizer().fit(X)
trainX = scaler.transform(X)

traindata = np.array(X)
trainlabel = np.array(Y)
(traindata, testdata,
 trainlabel, testlabel) = model_selection.train_test_split(traindata,
                                                           trainlabel,
                                                           test_size=0.5)
savemat('data_' + str(n) + '.mat', { 'train_x': train_x, 'train_y': train_y }) os.chdir('../') ## Perform KMeans km = KMeans(n_clusters=nClass, init='k-means++', n_init=10) ypred = km.fit_predict(train_x) nmi_km[n] = metrics.adjusted_mutual_info_score(train_y, ypred) ari_km[n] = metrics.adjusted_rand_score(train_y, ypred) ## Perform spectral clustering sc = SpectralClustering(n_clusters=nClass, n_init=10, gamma=0.1, affinity='rbf', assign_labels='kmeans') ypred = sc.fit_predict(train_x) nmi_sc[n] = metrics.adjusted_mutual_info_score(train_y, ypred) ari_sc[n] = metrics.adjusted_rand_score(train_y, ypred) train_set = train_x, train_y dataset = [train_set, train_set, train_set] f = gzip.open('toy.pkl.gz', 'wb') cPickle.dump(dataset, f, protocol=2) f.close() ## Perform non-joint SAE+KM nmi_nj[n], ari_nj[n] = test_SdC_NJ(lbd=0, finetune_lr=.01,
from kernels.dataset_generators import Generator

np.random.seed(0)
nsamples = 100
X, y = Generator.generate(dataset_name="manual_circles", n_samples=nsamples)

# Boolean masks for the two classes.
reds = y == 0
blues = y == 1

# Project-local kernel k-means with an RBF kernel.
# NOTE(review): the return value of ``fit`` is used as point colours below;
# this assumes ``KernelKMeans.fit`` returns labels rather than the estimator
# -- confirm.
kkm_model_rbf = KernelKMeans(n_clusters=2, max_iter=200, kernel="rbf")
kkm_clusters = kkm_model_rbf.fit(X)

# scikit-learn spectral clustering on a nearest-neighbours graph.
spectrual_clusters = SpectralClustering(n_clusters=2,
                                        affinity='nearest_neighbors',
                                        assign_labels='kmeans').fit_predict(X)
print('y: ', y)

# First two rows of a three-row figure: true labels, then kernel k-means.
plt.figure()
_panels = (
    ("Original Datasset", y),
    ("Kkmeans(rbf, gamma =0.1, clusters =2)", kkm_clusters),
)
for _row, (_title, _colours) in enumerate(_panels, start=1):
    plt.subplot(3, 1, _row)
    plt.title(_title)
    plt.scatter(X[:, 0], X[:, 1], s=15, linewidth=0, c=_colours, cmap='flag')
    plt.xlabel("$x_1$")
    plt.ylabel("$x_2$")
correction(X_train, y_train, km) # In[63]: from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering # In[64]: algorithms = [] algorithms.append(KMeans(n_clusters=2, random_state=1)) algorithms.append(AffinityPropagation()) algorithms.append( SpectralClustering(n_clusters=2, random_state=1, affinity='nearest_neighbors')) algorithms.append(AgglomerativeClustering(n_clusters=2)) # In[67]: data = [] for algo in algorithms: algo.fit(X_train) data.append(({ 'ARI': metrics.adjusted_rand_score(y_train, algo.labels_), 'AMI': metrics.adjusted_mutual_info_score(y_train, algo.labels_), 'Homogenity': metrics.homogeneity_score(y_train, algo.labels_),
# Time the LLE embedding and plot it with the elapsed time in the title.
t0 = time()
X_lle = clf.fit_transform(X_train_normalized)
plot_embedding_v2(X_lle, X_train, "LLE (time %.2fs)" % (time() - t0))

### CLUSTERING ###
# Wrap the embedding in a DataFrame once; it is reused for clustering and
# plotting (a second, redundant conversion after the plot was removed).
X_iso = pd.DataFrame(X_iso)
print(X_iso)
print(X_iso.columns)

# Build the clustering model: 5 clusters on a nearest-neighbours affinity.
spectral_model = SpectralClustering(n_clusters=5,
                                    affinity='nearest_neighbors')

# Fit the model and keep the predicted cluster label of every row.
labels_sp = spectral_model.fit_predict(X_iso)

# First two embedding columns, coloured by cluster label.
plt.scatter(X_iso.iloc[:, 0], X_iso.iloc[:, 1], c=labels_sp, cmap='rainbow')
plt.show()

# First column of ``df_origin``, skipping its first row.
df_labels = df_origin.iloc[1:, 0].values
print(df_labels)
print(df_labels[0])
metrics.silhouette_score(X, labels, metric='euclidean'), )) #**************************error analysis************************************** from sklearn.metrics.cluster import contingency_matrix x = labels #actual labels y = clusters #predicted labels error_analysis = contingency_matrix(x, y) #**************************Plot************************************************ import matplotlib.pyplot as plt import seaborn as sns; sns.set() # for plot styling import numpy as np # #from sklearn.datasets import make_moons #X,y = make_moons(200, noise=.05, random_state=0) # #labels = KMeans(2, random_state=0).fit_predict(X) #plt.scatter(X[:,0], X[:, 1], c=labels, s=50,cmap='viridis'); from sklearn.datasets import make_moons X,Y = make_moons(200, noise=.05, random_state=0) from sklearn.cluster import SpectralClustering model = SpectralClustering(n_clusters=10, affinity='nearest_neighbors', assign_labels='kmeans') plottinglabels = model.fit_predict(X) plt.scatter(X[:, 0], X[:, 1], c=plottinglabels, s=50, cmap='viridis');
adjacency_matrix[2, [0, 1, 5, 4]] = 1
# cluster 2
adjacency_matrix[6, [1, 9]] = 1
# cluster 3
adjacency_matrix[9, [12, 13]] = 1
adjacency_matrix[13, [9, 11, 12]] = 1

# Show the positions where the matrix differs from its transpose, then the
# matrix itself.
transp = adjacency_matrix.T
print(np.nonzero(adjacency_matrix - transp))
print(adjacency_matrix)

nb_datapoints = adjacency_matrix.shape[0]
dataset = list(range(nb_datapoints))

# CHANGE HERE
# choose a relevant number of clusters
nb_clusters = 2
sc = SpectralClustering(nb_clusters, affinity='precomputed')

# Fit on the adjacency matrix used directly as the affinity; the labels are
# read from ``sc.labels_`` below.
sc.fit(adjacency_matrix)

# Print the member indices of every cluster.
for cluster_index in range(nb_clusters):
    cluster = np.flatnonzero(sc.labels_ == cluster_index)
    print("cluster {}".format(cluster_index))
    print(cluster)
import numpy as np
from sklearn.cluster import SpectralClustering

fragment_names = list(fragments.keys())
nfragments = len(fragment_names)
n_clusters = 30

# Symmetric pairwise shape-overlap (Tanimoto) matrix. The diagonal keeps the
# initial value of 1; only the upper triangle is computed and then mirrored.
affinity_matrix = np.ones([nfragments, nfragments], np.float32)
for i in range(nfragments):
    shapeFunc = oeshape.OEAnalyticShapeFunc()
    shapeFunc.SetupRef(fragments[fragment_names[i]])
    result = oeshape.OEOverlapResults()
    for j in range(i + 1, nfragments):
        shapeFunc.Overlap(fragments[fragment_names[j]], result)
        overlap = result.GetTanimoto()
        affinity_matrix[i, j] = overlap
        affinity_matrix[j, i] = overlap

clustering = SpectralClustering(n_clusters=n_clusters,
                                affinity='precomputed').fit(affinity_matrix)

# Keep one representative fragment per cluster: the last fragment carrying
# each label. Only labels that actually occur are collected, so an unused
# cluster index can no longer leave an integer placeholder behind that would
# raise KeyError when looked up in ``fragments``.
_representatives = {}
for fragment_index, cluster_index in enumerate(clustering.labels_):
    _representatives[int(cluster_index)] = fragment_names[fragment_index]
unique_fragment_names = [
    _representatives[cluster_index]
    for cluster_index in sorted(_representatives)
]
fragments = {
    fragment_name: fragments[fragment_name]
    for fragment_name in unique_fragment_names
}

print('Computing overlap scores...')
OVERLAP_THRESHOLD = 0.4
molecules = list()
directories = glob('Files/x*')
max_overlap = 0.0
for directory in tqdm(directories):
    # The last path component names the docked fragment.
    _, docked_fragment = os.path.split(directory)
    with oechem.oemolistream(os.path.join(directory, 'poses.mol2')) as ifs:
        molecule = oechem.OEGraphMol()
        index = 1
classes = dataset[:, 0] dataset = np.delete(dataset, 0, axis=1) dataset = np.asarray(dataset, dtype=np.float) else: classes = dataset[:, -1] dataset = np.delete(dataset, -1, axis=1) dataset = np.asarray(dataset, dtype=np.float) return dataset, classes dataset, classes = loadData(filepath="./BERT/ATT_DPTC.txt", has_id=None, class_position='last') spectral = SpectralClustering(n_clusters=len(set(classes)), affinity="nearest_neighbors", n_neighbors=10, gamma=2.0) pred_y = spectral.fit_predict(dataset) print(pred_y) classify = defaultdict(list) for k, va in [(v, i) for i, v in enumerate(pred_y)]: classify[k].append(va) classify = dict(classify) print(classify) # accuracy acc = 0 for i in classify.values(): acc += Counter(np.array(classes)[i]).most_common(1)[0][1] print("准确率:%.10f" % (acc / len(pred_y)))
def spectral(self):
    """Run spectral clustering on ``self.values`` with ``self.k`` clusters.

    The estimator (nearest-neighbours affinity) is stored on
    ``self.Clusters`` and the predicted labels on ``self.clusterIndex``;
    ``self.output()`` is called afterwards.
    """
    estimator = SpectralClustering(affinity='nearest_neighbors',
                                   n_clusters=self.k)
    self.Clusters = estimator
    self.clusterIndex = estimator.fit_predict(self.values)
    self.output()
fig.scatter(X[y_pred == 3, 0], X[y_pred == 3, 1], X[y_pred == 3, 2], s=20, c='black', marker='o', label='Cluster4') # Accracy of 91 # # Using Spectral Clustering on links Dataset # In[37]: clustering = SpectralClustering(n_clusters=4, assign_labels="discretize", random_state=0).fit(Y_link) y_pred = clustering.labels_ # In[40]: np.save('result_94.2.npy', y_pred) # In[41]: X = Y_link f = plt.figure(1, figsize=(14, 14)) fig = f.add_subplot(1, 1, 1, projection='3d') fig.scatter(X[y_pred == 0, 0], X[y_pred == 0, 1], X[y_pred == 0, 2],
float(row[9]), float(row[10]), float(row[11]), float(row[12]), float(row[13]) ]) #X.append([float(row[2]), float(row[3])]) y.append(row[0]) #print(X) #print(y) # In[103]: from sklearn.cluster import SpectralClustering y_pred = SpectralClustering(n_clusters=3, affinity='poly', degree=2, gamma=0.0000955).fit_predict(X) from sklearn import metrics print "Adjusted Rand index", metrics.adjusted_rand_score(y, y_pred) print "Mutual Information based scores", metrics.adjusted_mutual_info_score( y, y_pred) print "V-measure", metrics.v_measure_score(y, y_pred) print "Calinski-Harabasz Score", metrics.calinski_harabaz_score(X, y_pred) result = y_pred print(y_pred) # In[96]: colors = [] colors.append('red')
def cluster(img, grups):
    """Return spectral-clustering labels for *img* split into *grups* groups.

    *img* is first passed through ``Utils.normalize``; the estimator uses the
    ``'amg'`` eigen solver.
    """
    prepared = Utils.normalize(img)
    return SpectralClustering(eigen_solver='amg',
                              n_clusters=grups).fit_predict(prepared)