def scikit_pca(model, rel_wds, plot_lims, title, cluster="kmeans"):
    """
    Given a word2vec model and a cluster (choice of "kmeans" or "spectral"),
    make a plot of all word-vectors in the model.
    """
    X, keys = make_data_matrix(model)

    for i, key in enumerate(keys):
        X[i,] = model[key]

    if cluster == "kmeans":
        k_means = KMeans(n_clusters=8)
        labels = k_means.fit_predict(X)
    elif cluster == "spectral":
        sp_clust = SpectralClustering()
        labels = sp_clust.fit_predict(X)

    # PCA
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=2)
    X_transf = sklearn_pca.fit_transform(X_std)

    scatter_plot(X_transf[:, 0], X_transf[:, 1], rel_wds, labels,
                 title, keys, plot_lims)

    return sklearn_pca.explained_variance_ratio_
def spectral_clustering2(similarity, concepts=2, euclid=False):
    if euclid:
        model = SpectralClustering(n_clusters=concepts,
                                   affinity='nearest_neighbors')
        return model.fit_predict(similarity)
    else:
        model = SpectralClustering(n_clusters=concepts,
                                   affinity='precomputed')
        # Precomputed affinities must be non-negative; clip negatives to zero
        similarity[similarity < 0] = 0
        return model.fit_predict(similarity)
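# Quick usage sketch for spectral_clustering2 above (synthetic data,
# illustration only): any symmetric, non-negative matrix can stand in for a
# real similarity matrix. Assumes SpectralClustering is imported as in the
# other snippets.
import numpy as np

_rng = np.random.default_rng(0)
_S = _rng.random((20, 20))
_S = (_S + _S.T) / 2.0  # symmetrize so it is a valid similarity matrix
print(spectral_clustering2(_S, concepts=3))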
def run(self, k):
    if self.data_is_kernel:
        clf = SpectralClustering(n_clusters=k, gamma=self.gammav,
                                 affinity='precomputed')
        self.allocation = clf.fit_predict(self.X)
        self.kernel = self.X
    else:
        # default affinity='rbf' builds the kernel from the data itself
        clf = SpectralClustering(n_clusters=k, gamma=self.gammav)
        self.allocation = clf.fit_predict(self.X)
        self.kernel = clf.affinity_matrix_
    return self.allocation
def compute_centroid_set(self, **kwargs):
    INPUT_ITR = subset_iterator(X=self.docv,
                                m=self.subcluster_m,
                                repeats=self.subcluster_repeats)

    kn = self.subcluster_kn
    clf = SpectralClustering(n_clusters=kn, affinity="precomputed")

    C = []
    for X in INPUT_ITR:
        # Remove any rows that have zero vectors
        bad_row_idx = (X ** 2).sum(axis=1) == 0
        X = X[~bad_row_idx]
        A = cosine_affinity(X)
        labels = clf.fit_predict(A)

        # Compute the centroids
        (N, dim) = X.shape
        centroids = np.zeros((kn, dim))
        for i in range(kn):
            idx = labels == i
            mu = X[idx].mean(axis=0)
            mu /= np.linalg.norm(mu)
            centroids[i] = mu
        C.append(centroids)

    return np.vstack(C)
def spectral_clustering(matrix, N):
    spectral = SpectralClustering(n_clusters=N)
    clusters = spectral.fit_predict(matrix)
    res = [[] for _ in range(N)]
    for i, c in enumerate(clusters):
        res[c].append(i)
    return res
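# Minimal sketch of the grouping behaviour of spectral_clustering above
# (synthetic blobs; the make_blobs parameters are illustrative only).
from sklearn.datasets import make_blobs

_X, _ = make_blobs(n_samples=30, centers=3, random_state=0)
_groups = spectral_clustering(_X, 3)
print([len(g) for g in _groups])  # three lists of row indices, one per cluster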
def create_word2vec_cluster(word2vec_model):
    word_vectors = word2vec_model.syn0
    # n_clusters must be an integer, so use floor division
    num_clusters = word_vectors.shape[0] // 1000
    spectral_cluster_model = SpectralClustering(n_clusters=num_clusters)
    idx = spectral_cluster_model.fit_predict(word_vectors)
    pickle.dump(spectral_cluster_model,
                open(r"C:\Ofir\Tau\Machine Learning\Project\project\k_means_model.pkl", "wb"))
    return spectral_cluster_model
def spectral_clustering(k, X, G, W=None, run_times=5):
    if W is None:
        W = np.eye(len(X))
    W2 = np.sqrt(W)
    Gtilde = W2.dot(G.dot(W2))
    sc = SpectralClustering(k, affinity='precomputed', n_init=run_times)
    zh = sc.fit_predict(Gtilde)
    return zh
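# Usage sketch (assumed inputs): G is a precomputed, non-negative affinity;
# with W=None the identity weighting leaves G unchanged.
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import rbf_kernel

_X, _ = make_blobs(n_samples=30, centers=2, random_state=1)
_G = rbf_kernel(_X, gamma=0.5)  # non-negative affinity matrix
print(spectral_clustering(2, _X, _G))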
def get_coregulatory_states(corr_matrices, similarity_matrix, n_clusters):
    spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed')
    labels = spectral.fit_predict(similarity_matrix)

    coreg_states = {}
    for ci in np.unique(labels):
        coreg_states[ci] = corr_matrices[labels == ci, :, :].mean(axis=0)
    return coreg_states, labels
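# Illustrative call with synthetic inputs: a stack of per-sample correlation
# matrices plus a symmetric, non-negative similarity between the samples.
import numpy as np

_rng = np.random.default_rng(0)
_corr = _rng.random((12, 4, 4))
_sim = _rng.random((12, 12))
_sim = (_sim + _sim.T) / 2.0
_states, _lab = get_coregulatory_states(_corr, _sim, n_clusters=2)
print(sorted(_states.keys()), _lab)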
def dist_spectral(x, y):
    # Stack the two coordinate vectors into an (n_samples, 2) array;
    # `dataset` is assumed to be defined at module level.
    plot = []
    for s in range(dataset.shape[0]):
        plot.append(np.array([x[s], y[s]]))
    plot = np.array(plot)
    spectral = SpectralClustering(n_clusters=3, eigen_solver='arpack',
                                  affinity="nearest_neighbors")
    clusters = spectral.fit_predict(plot)
    return clusters
def spectral(k, X, G, run_times=10):
    """Spectral clustering from the sklearn library.

    run_times is the number of times the algorithm is going to run with
    different initializations.
    """
    sc = SpectralClustering(k, affinity='precomputed', n_init=run_times)
    zh = sc.fit_predict(G)
    return zh
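# Sketch: G must already be a non-negative affinity matrix; X is unused by
# the function body and kept only to match the caller's signature.
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import rbf_kernel

_X, _ = make_blobs(n_samples=40, centers=3, random_state=2)
print(spectral(3, _X, rbf_kernel(_X)))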
def spectral_clustering(S, X, config):
    '''
    Computes spectral clustering of the input data X using a cosine affinity
    (the similarity matrix S is accepted for interface compatibility but is
    not used here). Returns the labels associated with the clustering.
    '''
    from sklearn.cluster import SpectralClustering

    nk = int(config["n_clusters"])
    clf = SpectralClustering(affinity='cosine', n_clusters=nk)
    return clf.fit_predict(X)
def cluster_faces_CNN(name='9_8913259@N03', img_list='faces_list.txt'):
    root = '/Users/wangyufei/Documents/Study/intern_adobe/face_recognition_CNN/' + name + '/'
    f = open(root + model_name + 'similarity_matrix.cPickle', 'r')
    affinity_matrix = cPickle.load(f)
    f.close()

    # use a separate name for the clusterer so the file handle is not shadowed
    sc = SpectralClustering(affinity='precomputed',
                            n_clusters=min(8, affinity_matrix.shape[0] - 1),
                            eigen_solver='arpack',
                            n_neighbors=min(5, affinity_matrix.shape[0]))
    a = sc.fit_predict(affinity_matrix)

    # Group face indices by cluster label
    groups = {}
    temp = zip(a, xrange(len(a)))
    for i in temp:
        if i[0] not in groups:
            groups[i[0]] = [i[1]]
        else:
            groups[i[0]].append(i[1])

    # Keep clusters whose mean pairwise similarity is high enough
    unique_person_id = []
    for kk in groups:
        min_similarity = np.Inf
        max_similarity = -np.Inf
        mean_similarity = 0
        this_group_ids = groups[kk]
        for j in xrange(len(this_group_ids)):
            for i in xrange(j + 1, len(this_group_ids)):
                temp = affinity_matrix[this_group_ids[i], this_group_ids[j]]
                if temp < min_similarity:
                    min_similarity = temp
                if temp > max_similarity:
                    max_similarity = temp
                mean_similarity += temp
        mean_similarity /= max(1, len(this_group_ids) * (len(this_group_ids) - 1) / 2)
        print len(this_group_ids), mean_similarity, max_similarity, min_similarity
        if mean_similarity > 0.5:
            unique_person_id.append(kk)

    important_person = []
    for i in unique_person_id:
        important_person.append([i, len(groups[i])])
    important_person.sort(key=lambda x: x[1], reverse=True)

    in_path = root + img_list
    imgs_list = []
    with open(in_path, 'r') as data:
        for line in data:
            line = line[:-1]
            imgs_list.append(line.split('/')[-1])

    temp = zip(a, imgs_list)
    face_groups = {}
    for i in temp:
        if i[0] not in face_groups:
            face_groups[i[0]] = [i[1]]
        else:
            face_groups[i[0]].append(i[1])
    create_face_group_html_CNN(name, face_groups, important_person)
def spectral(k, X, G, z, run_times=10):
    """Spectral clustering from the sklearn library.

    run_times is the number of times the algorithm is going to run with
    different initializations.
    """
    sc = SpectralClustering(k, affinity='precomputed', n_init=run_times)
    zh = sc.fit_predict(G)
    a = metric.accuracy(z, zh)
    v = metric.variation_information(z, zh)
    return a, v
def spectral_clustering(crime_rows, column_names, num_clusters,
                        affinity='rbf', n_neighbors=0, assign_labels='kmeans'):
    """
    n_clusters : integer, optional
        The dimension of the projection subspace.
    affinity : string, array-like or callable, default 'rbf'
        If a string, this may be one of 'nearest_neighbors', 'precomputed',
        'rbf' or one of the kernels supported by
        sklearn.metrics.pairwise_kernels. Only kernels that produce similarity
        scores (non-negative values that increase with similarity) should be
        used. This property is not checked by the clustering algorithm.
    gamma : float
        Scaling factor of RBF, polynomial, exponential chi^2 and sigmoid
        affinity kernel. Ignored for affinity='nearest_neighbors'.
    degree : float, default=3
        Degree of the polynomial kernel. Ignored by other kernels.
    coef0 : float, default=1
        Zero coefficient for polynomial and sigmoid kernels. Ignored by other
        kernels.
    n_neighbors : integer
        Number of neighbors to use when constructing the affinity matrix using
        the nearest neighbors method. Ignored for affinity='rbf'.
    n_init : int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia.
    assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
        The strategy to use to assign labels in the embedding space. There are
        two ways to assign labels after the laplacian embedding. k-means can
        be applied and is a popular choice, but it can also be sensitive to
        initialization. Discretization is another approach which is less
        sensitive to random initialization.
    kernel_params : dictionary of string to any, optional
        Parameters (keyword arguments) and values for a kernel passed as a
        callable object. Ignored by other kernels.
    """
    crime_xy = [crime[0:2] for crime in crime_rows]
    crime_info = [crime[2:] for crime in crime_rows]
    # crime_xy = [crime[1:] for crime in crime_rows]
    spectral_clustering = SpectralClustering(
        n_clusters=num_clusters,
        affinity=affinity,
        n_neighbors=n_neighbors,
        assign_labels=assign_labels)
    print("Running spectral clustering....")
    print("length crimexy")
    print(len(crime_xy))
    spectral_clustering_labels = spectral_clustering.fit_predict(
        random_sampling(crime_xy, num_samples=3000))
    print("Formatting......")
    return _format_clustering(spectral_clustering_labels, crime_xy, crime_info,
                              column_names, num_clusters=num_clusters)
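# The docstring above mirrors scikit-learn's SpectralClustering parameters;
# a minimal direct call with the same options on synthetic points
# (illustration only, independent of the crime-data helpers):
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.datasets import make_moons

_pts, _ = make_moons(n_samples=200, noise=0.05, random_state=0)
_labels = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                             n_neighbors=10,
                             assign_labels='discretize').fit_predict(_pts)
print(np.bincount(_labels))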
def predictSpectralClustering(X, y, n=2, val='rbf'):
    # shuffle before taking the first 600 samples
    X, y = shuffle(X, y, random_state=0)
    X = X[:600, ]
    y = y[:600, ]
    sc = SpectralClustering(n_clusters=n)
    results = sc.fit_predict(X)
    gini = compute_gini(results)
    if n == 2:
        same = calculate_score(results, y)
        opp = calculate_score(results, y, True)
        return (results, max(same, opp), gini)
    else:
        return (results, 0, gini)
def spectral_clustering(vectors: list, num_rows, k):
    # build a (num_rows x len(vectors)) matrix
    matrix = []
    for s in range(num_rows):
        row = []
        for v in vectors:
            row.append(v[s])
        matrix.append(np.array(row))
    matrix = np.array(matrix)
    spectral = SpectralClustering(n_clusters=k, eigen_solver='arpack',
                                  affinity="nearest_neighbors")
    clusters = spectral.fit_predict(matrix)
    return clusters
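# Sketch: each entry of `vectors` holds one feature across all rows, so three
# length-30 vectors become a 30x3 data matrix (synthetic values only).
import numpy as np

_rng = np.random.default_rng(3)
_vecs = [_rng.random(30) for _ in range(3)]
print(spectral_clustering(_vecs, num_rows=30, k=2))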
def _small_partition(self, data):
    _logger.debug("Running _small_partition on %s observations", len(data))
    similarity = self._get_similarity(data, sparse=self.sparse_similarity)
    _logger.debug("Spectral clustering")
    spc_obj = SpectralClustering(n_clusters=2, affinity='precomputed',
                                 assign_labels='discretize')
    partition = spc_obj.fit_predict(similarity)
    _logger.debug("Done spectral clustering")
    sizes = [len(partition[partition == x]) for x in [0, 1]]
    _logger.debug("Result of _small_partition: #0: {}, #1: {}".format(*sizes))
    return partition
def compute_spectral_clustering(n_vertex, edge_list, n_clusters):
    from sklearn.cluster import SpectralClustering

    clst = SpectralClustering(n_clusters, affinity="precomputed")
    adjacency_matrix = tf.compute_adjacency_matrix(n_vertex, edge_list)
    t = time.time()
    # fit_predict takes only the data matrix; n_clusters was already
    # passed to the constructor
    labels = clst.fit_predict(adjacency_matrix)
    exectime = time.time() - t
    labels = tf.compute_normal_labels(labels)
    clusters = tf.compute_clusters_from_labels(labels)
    return labels, clusters, exectime
def bench_cluster(X, y, pca_n_comp):
    n = len(np.unique(y))
    pca = PCA(pca_n_comp)
    X_ = pca.fit_transform(X)
    sc = SpectralClustering(n)
    km = KMeans(n)
    sc_pred = sc.fit_predict(X_)
    km_pred = km.fit_predict(X_)
    distances = PairwiseDistances(X_.tolist())
    distances = ExplicitDistances(distances)
    singlel_pred = fcluster(linkage(ssd.squareform(distances.distances)), n,
                            criterion='maxclust')
    print "single-linkage clustering prediction:", singlel_pred
    print "single-linkage clustering score:", adjusted_rand_score(y, singlel_pred), mutual_info_score(y, singlel_pred)
    print "spectral clustering prediction:", sc_pred
    print "spectral clustering score:", adjusted_rand_score(y, sc_pred), mutual_info_score(y, sc_pred)
    print "kmeans clustering prediction", km_pred
    print "kmeans clustering score:", adjusted_rand_score(y, km_pred), mutual_info_score(y, km_pred)
    print "ground truth labels", y
def cluster(
        self,
        rows_to_cluster=None,
        seed_rows=None,
        cluster_count=None,
        nearest_neighbors=10):
    if seed_rows is None:
        seed_rows = self._query_server.sample(
            [None for _ in self.feature_names],
            sample_count=SAMPLE_COUNT)
    row_limit = len(seed_rows) ** 2 + 1
    similar_string = StringIO(self.similar(seed_rows, row_limit=row_limit))
    similar = numpy.genfromtxt(similar_string, delimiter=',', skip_header=0)
    similar = similar.clip(0., 5.)
    similar = numpy.exp(similar)
    clustering = SpectralClustering(
        n_clusters=cluster_count,
        affinity='precomputed')
    labels = clustering.fit_predict(similar)
    if rows_to_cluster is None:
        return zip(labels, seed_rows)
    else:
        row_labels = []
        for row in rows_to_cluster:
            similar_scores = self.similar([row], seed_rows, row_limit=row_limit)
            similar_scores = numpy.genfromtxt(
                StringIO(similar_scores),
                delimiter=',',
                skip_header=0)
            assert len(similar_scores) == len(labels)
            label_scores = zip(similar_scores, labels)
            top = sorted(label_scores, reverse=True)[:nearest_neighbors]
            label_counts = Counter(zip(*top)[1]).items()
            top_label = sorted(label_counts, key=lambda x: -x[1])[0][0]
            row_labels.append(top_label)
        return zip(row_labels, rows_to_cluster)
def clusterSentencesandConsolidate():
    ClusterFile = open("../../Temp/SentencesToCluster.txt", 'r')
    documents = ClusterFile.readlines()
    ClusterFile.close()
    line_count = len(documents)

    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)

    noOfClusters = line_count / 10
    model = SpectralClustering(n_clusters=noOfClusters, eigen_solver='arpack',
                               eigen_tol=0.01, assign_labels='discretize')
    y = model.fit_predict(X)

    clusterSentenceIndex = []
    for i in xrange(len(y)):
        temp = []
        temp.append(y[i])
        temp.append(documents[i])
        clusterSentenceIndex.append(temp)
    clusterSentenceIndex.sort()

    # Writing to the file
    # outputIndexFile = open('../../Temp/sentence-sluster-sorted-index.txt','w')
    # for i in xrange(len(clusterSentenceIndex)):
    #     if int(clusterSentenceIndex[i][0]) >= 0:
    #         line = clusterSentenceIndex[i][1] + '$' + clusterSentenceIndex[i][0] + '\n'
    #         outputIndexFile.write(line)
    # outputIndexFile.close()

    # Consolidate into different clusters
    cluster_to_sentence_dict = defaultdict(list)
    for each_line in clusterSentenceIndex:
        cluster, sentence = each_line
        if cluster in cluster_to_sentence_dict:
            cluster_to_sentence_dict[cluster].append(sentence)
        else:
            cluster_to_sentence_dict[cluster] = [sentence]
    return cluster_to_sentence_dict
def Spectral(Aff, k):
    '''***************Imports****************'''
    ##################################################################
    import os, sys, inspect, time  # @UnusedImport
    sys.path.insert(0, 'C:\Users\user\Anaconda\Lib\site-packages')
    from sklearn.cluster import SpectralClustering  # @UnresolvedImport @UnusedImport
    ##################################################################
    '''***************Spectral***************'''
    ##################################################################
    print "clustering with Spectral clustering, k = " + str(k)
    end = time.time()
    estimator = SpectralClustering(n_clusters=k, affinity='precomputed')
    labels = estimator.fit_predict(Aff)
    ##################################################################
    end2 = time.time()
    print "model time is %s seconds " % str(int(end2 - end))
    print "%s clusters found" % str(len(set(labels)))
    return labels
def compute_meta_centroid_set(self, **kwargs):
    C = self.load_centroid_dataset("subcluster_centroids")
    print "Intermediate clusters", C.shape

    # By eye, it looks like the top 60%-80% of the
    # remaining clusters are stable...
    nc = int(self.subcluster_pcut * self.subcluster_kn)
    clf = SpectralClustering(n_clusters=nc, affinity="precomputed")

    S = cosine_affinity(C)
    labels = clf.fit_predict(S)

    meta_clusters = []
    meta_cluster_size = []
    for i in range(labels.max() + 1):
        idx = labels == i
        mu = C[idx].mean(axis=0)
        mu /= np.linalg.norm(mu)
        meta_clusters.append(mu)
        meta_cluster_size.append(idx.sum())

    return meta_clusters
# %%
df = pd.read_csv('../../data/Stage2DataFiles/RegularSeasonCompactResults.csv')
teams = pd.read_csv('../../data/Stage2DataFiles/Teams.csv')
df = pd.merge(df, teams[['TeamID', 'TeamName']],
              left_on='WTeamID', right_on='TeamID')
del df['TeamID']
df = df.rename(columns={'TeamName': 'TmName'})
df = pd.merge(df, teams[['TeamID', 'TeamName']],
              left_on='LTeamID', right_on='TeamID')
del df['TeamID']
df = df.rename(columns={'TeamName': 'OppName'})
df = df.loc[df['Season'] == 2018]

# %%
g = nx.Graph()
edges = [tuple(x) for x in df[['TmName', 'OppName']].to_numpy()]
g.add_edges_from(edges)
A = nx.to_numpy_matrix(g)
teamList = g.nodes()

# %%
clustering = SpectralClustering(affinity='precomputed')
labels = clustering.fit_predict(A)
results = pd.DataFrame({'TeamName': teamList, 'cluster': labels})

# %%
"""
@author: Jie.Hu
"""

# spectral clustering
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.cluster import SpectralClustering

# define dataset
X, _ = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1,
                           random_state=4)
# define the model
model = SpectralClustering(n_clusters=2)
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = np.unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = np.where(yhat == cluster)
    # create scatter of these samples
    plt.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
plt.show()
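# Optional follow-up to the demo above: a quick internal-quality check with
# silhouette (an addition here, not part of the original snippet).
from sklearn.metrics import silhouette_score
print("silhouette:", silhouette_score(X, yhat))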
def get_clusters(data, k):
    model = SpectralClustering(n_clusters=k, gamma=0.3)
    # model = DPGMM(n_components=k)
    return model.fit_predict(data)
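# Quick call on synthetic blobs; gamma=0.3 above is the snippet's own RBF width.
from sklearn.datasets import make_blobs

_data, _ = make_blobs(n_samples=50, centers=4, random_state=4)
print(get_clusters(_data, 4))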
plt.figure(2)
kmeans = KMeans(n_clusters=2)
kmeans.fit(points)
clusters_kmeans = kmeans.predict(points)
plt.scatter(x, y, c=clusters_kmeans, s=50)

# draw the cluster centers
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
plt.title("kmeans")

print("Click on the figure..")
plt.waitforbuttonpress()

plt.figure(3)
model = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                           assign_labels='kmeans')
labels = model.fit_predict(points)
plt.scatter(points[:, 0], points[:, 1], c=labels, s=50, cmap='viridis')
plt.title("Spectral Clustering")
plt.show()
## Clustering
print()
print("Compute clusters")

scores_spc_nn_pca = []
labels_spc_nn_pca = []
spc = SpectralClustering(n_clusters=5, affinity="nearest_neighbors",
                         n_neighbors=5, n_jobs=3)
for i, da in enumerate(data_pca):
    labels = spc.fit_predict(da)
    score = nmi_score(labels_true, labels)
    scores_spc_nn_pca.append(score)
    labels_spc_nn_pca.append(labels)
    print(i, score)
    if score > 0.99:
        print("x")

max_i = np.argmax(scores_spc_nn_pca)
max_labels = labels_spc_nn_pca[max_i]
print()
print("Best score:", scores_spc_nn_pca[max_i])
# the original snippet is truncated here; printing the winning labels
# is an assumption
print("Best labels:", max_labels)
def writeSpectralClustering(X, number, objectsNames):
    clustering = SpectralClustering(n_clusters=number,
                                    affinity='nearest_neighbors')
    results = np.array(clustering.fit_predict(X))
    resultDF = pd.DataFrame({IMAGES: objectsNames, CLUSTERS: results})
    resultDF.to_csv(FOLDER + "/" + SPECTRALCLUSTERING + "_" + str(number) + ".csv",
                    index=False)
def main():
    x, label = next(iter(all_loader))
    print('x:', x.shape, 'label:', label.shape)

    model = Lenet()
    criteon1 = nn.CrossEntropyLoss()
    criteon2 = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    print(model)

    for epoch in range(500):
        model.train()
        for batchidx, (x, label) in enumerate(all_loader):
            # x: [b,1,100,100]
            # label: [b]
            # print(label)
            if int(label) == -1:
                print(label)
                logits = model(x, -1, True)
                print(logits.shape)
                print(x.shape)
                loss = 0.00001 * criteon2(logits, x)
            else:
                logits = model(x, label, True)
                loss = criteon1(logits, label)
            # logits: [b, 10]
            # label: [b]
            # loss: tensor scalar

            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(epoch, loss.item())

        model.eval()
        with torch.no_grad():
            # test: use the conv and fc_unit1 parts of the network
            encoder = torch.randn(301, 184)
            label = torch.randn(301)
            for x, y in encoder_loader:
                with torch.no_grad():
                    x_encoder = model(x, y, False)
                # label.append(y)
                # encoder.append(x_encoder)
                label = y
                encoder = x_encoder
            encoder = encoder.numpy()
            label = label.numpy()
            # print(encoder.shape, label.shape)

            from sklearn.cluster import SpectralClustering
            from sklearn.metrics import adjusted_rand_score
            from sklearn.metrics import normalized_mutual_info_score
            from sklearn.metrics.pairwise import cosine_similarity

            # shift cosine similarity into [0, 1] for the precomputed affinity
            # simatrix = np.arange(len(encoder) ** 2, dtype=float).reshape(len(encoder), -1)
            simatrix = 0.5 * cosine_similarity(encoder) + 0.5
            SC = SpectralClustering(affinity='precomputed',
                                    assign_labels='discretize')  # , random_state=100
            label1 = SC.fit_predict(simatrix)
            print('epoch:', epoch)
            print('label:', label.shape)
            ARI = adjusted_rand_score(label, label1)
            NMI = normalized_mutual_info_score(label, label1)
            print("spectral clustering: ARI", ARI)
            print("spectral clustering: NMI", NMI)

            # k-means clustering
            from sklearn.cluster import KMeans
            from sklearn import metrics
            label1 = KMeans(n_clusters=11).fit_predict(encoder)
            ARI = adjusted_rand_score(label, label1)
            NMI = normalized_mutual_info_score(label, label1)
            print("k-means: ARI", ARI)
            print("k-means: NMI", NMI)
def main(FILE_ANALISE, N_CLUSTERS=2):
    # Feature Extraction
    def extract_features(y, sr, window, hop, n_mfcc):
        mfcc = librosa.feature.mfcc(
            y=y, sr=sr, hop_length=int(hop * sr), n_fft=int(window * sr),
            n_mfcc=n_mfcc, dct_type=2)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
        stacked = np.vstack((mfcc, mfcc_delta, mfcc_delta2))
        return stacked.T

    # code modified for compactness; original code:
    # https://github.com/wiseman/py-webrtcvad/blob/master/example.py
    def write_wave(path, audio, sample_rate):
        with contextlib.closing(wave.open(path, 'wb')) as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(audio)

    class Frame(object):
        def __init__(self, bytes, timestamp, duration):
            self.bytes = bytes
            self.timestamp = timestamp
            self.duration = duration

    def frame_generator(frame_duration_ms, audio, sample_rate):
        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
        offset = 0
        timestamp = 0.0
        duration = (float(n) / sample_rate) / 2.0
        while offset + n < len(audio):
            yield Frame(audio[offset:offset + n], timestamp, duration)
            timestamp += duration
            offset += n

    def vad_collector(
            sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
        num_padding_frames = int(padding_duration_ms / frame_duration_ms)
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        triggered = False
        voiced_frames = []
        for frame in frames:
            is_speech = vad.is_speech(frame.bytes, sample_rate)
            if not triggered:
                ring_buffer.append((frame, is_speech))
                num_voiced = len([f for f, speech in ring_buffer if speech])
                if num_voiced > 0.9 * ring_buffer.maxlen:
                    triggered = True
                    for f, s in ring_buffer:
                        voiced_frames.append(f)
                    ring_buffer.clear()
            else:
                voiced_frames.append(frame)
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len(
                    [f for f, speech in ring_buffer if not speech])
                if num_unvoiced > 0.9 * ring_buffer.maxlen:
                    triggered = False
                    yield b''.join([f.bytes for f in voiced_frames])
                    ring_buffer.clear()
                    voiced_frames = []
        if voiced_frames:
            yield b''.join([f.bytes for f in voiced_frames])

    def map_adaptation(
            gmm, data, max_iterations=300, likelihood_threshold=1e-20,
            relevance_factor=16):
        N = data.shape[0]
        D = data.shape[1]
        K = gmm.n_components

        mu_new = np.zeros((K, D))
        n_k = np.zeros((K, 1))

        mu_k = gmm.means_
        cov_k = gmm.covariances_
        pi_k = gmm.weights_

        old_likelihood = gmm.score(data)
        new_likelihood = 0
        iterations = 0
        while (abs(old_likelihood - new_likelihood) > likelihood_threshold
               and iterations < max_iterations):
            iterations += 1
            old_likelihood = new_likelihood
            z_n_k = gmm.predict_proba(data)
            n_k = np.sum(z_n_k, axis=0)

            for i in range(K):
                temp = np.zeros((1, D))
                for n in range(N):
                    temp += z_n_k[n][i] * data[n, :]
                mu_new[i] = (1 / n_k[i]) * temp

            adaptation_coefficient = n_k / (n_k + relevance_factor)
            for k in range(K):
                mu_k[k] = (
                    adaptation_coefficient[k] * mu_new[k]) + (
                    (1 - adaptation_coefficient[k]) * mu_k[k])
            gmm.means_ = mu_k

            log_likelihood = gmm.score(data)
            new_likelihood = log_likelihood
            print(log_likelihood)
        return gmm

    # Settings
    SR = 16000          # sample rate
    N_MFCC = 13         # number of MFCC to extract
    N_FFT = 0.032       # length of the FFT window in seconds
    HOP_LENGTH = 0.010  # number of samples between successive frames in sec
    N_COMPONENTS = 16   # number of gaussians
    COVARINACE_TYPE = 'full'  # cov type for GMM

    y = []
    # LOAD_SIGNAL = False
    LOAD_SIGNAL = True
    if LOAD_SIGNAL:
        y, sr = librosa.load(FILE_ANALISE, sr=SR)
        pre_emphasis = 0.97
        y = np.append(y[0], y[1:] - pre_emphasis * y[:-1])

    # MAKE_CHUNKS = False
    MAKE_CHUNKS = True
    if MAKE_CHUNKS:
        vad = webrtcvad.Vad(2)
        audio = np.int16(y / np.max(np.abs(y)) * 32768)
        frames = frame_generator(10, audio, sr)
        frames = list(frames)
        segments = vad_collector(sr, 50, 200, vad, frames)
        if not os.path.exists('data/chunks'):
            os.makedirs('data/chunks')
        for i, segment in enumerate(segments):
            chunk_name = 'data/chunks/chunk-%003d.wav' % (i,)
            write_wave(
                chunk_name, segment[0: len(segment) - int(100 * sr / 1000)], sr)

    # extract MFCC, first and second derivatives
    FEATURES_FROM_FILE = True
    # FEATURES_FROM_FILE = False
    feature_file_name = 'data/param/features_{0}.pkl'.format(N_MFCC)
    if FEATURES_FROM_FILE:
        ubm_features = pickle.load(open(feature_file_name, 'rb'))
    else:
        ubm_features = extract_features(
            np.array(y), sr, window=N_FFT, hop=HOP_LENGTH, n_mfcc=N_MFCC)
        ubm_features = preprocessing.scale(ubm_features)
        pickle.dump(ubm_features, open(feature_file_name, "wb"))

    # UBM Train
    UBM_FROM_FILE = True
    # UBM_FROM_FILE = False
    ubm_file_name = 'data/param/ubm_{0}_{1}_{2}MFCC.pkl'.format(
        N_COMPONENTS, COVARINACE_TYPE, N_MFCC)
    if UBM_FROM_FILE:
        ubm = pickle.load(open(ubm_file_name, 'rb'))
    else:
        ubm = GaussianMixture(
            n_components=N_COMPONENTS, covariance_type=COVARINACE_TYPE)
        ubm.fit(ubm_features)
        pickle.dump(ubm, open(ubm_file_name, "wb"))
    # print(ubm.score(ubm_features))

    SV = []
    num_chunk = len(listdir(os.getcwd() + '\data\chunks'))
    for i in range(num_chunk):
        clear_output(wait=True)
        fname = 'data/chunks/chunk-%003d.wav' % (i,)
        # print('UBM MAP adaptation for {0}'.format(fname))
        y_, sr_ = librosa.load(fname, sr=None)
        f_ = extract_features(
            y_, sr_, window=N_FFT, hop=HOP_LENGTH, n_mfcc=N_MFCC)
        f_ = preprocessing.scale(f_)
        gmm = copy.deepcopy(ubm)
        gmm = map_adaptation(gmm, f_, max_iterations=1, relevance_factor=16)
        sv = gmm.means_.flatten()
        try:
            sv = preprocessing.scale(sv)
        except:
            pass
        SV.append(sv)
    SV = np.array(SV)
    clear_output()
    # print(SV.shape)

    def rearrange(labels, n):
        seen = set()
        distinct = [x for x in labels if x not in seen and not seen.add(x)]
        correct = [i for i in range(n)]
        dict_ = dict(zip(distinct, correct))
        return [x if x not in dict_ else dict_[x] for x in labels]

    sc = SpectralClustering(n_clusters=N_CLUSTERS, affinity='cosine')
    labels = sc.fit_predict(SV)
    labels = rearrange(labels, N_CLUSTERS)
    print('Processing complete.')
    return labels
n_class = 5
Xs, labels = load_UCImultifeature(
    select_labeled=list(range(n_class)), views=[0, 1])

###############################################################################
# Singleview spectral clustering
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Cluster each view separately and compute the NMI

s_spectral = SpectralClustering(n_clusters=n_class,
                                random_state=RANDOM_SEED, n_init=100)

for i in range(len(Xs)):
    s_clusters = s_spectral.fit_predict(Xs[i])
    s_nmi = nmi_score(labels, s_clusters, average_method='arithmetic')
    print('Single-view View {0:d} NMI Score: {1:.3f}\n'.format(i + 1, s_nmi))

# Concatenate the multiple views into a single view and produce clusters
s_data = np.hstack(Xs)
s_clusters = s_spectral.fit_predict(s_data)

s_nmi = nmi_score(labels, s_clusters)
print('Single-view Concatenated NMI Score: {0:.3f}\n'.format(s_nmi))

###############################################################################
# Co-Regularized multiview spectral clustering
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Use the MultiviewSpectralClustering instance to cluster the data
import scipy as sp
from centroid import all_separate, initialize, plot_graph, find_location_smallmask
import numpy as np
from mayavi import mlab
from sklearn.cluster import SpectralClustering, AgglomerativeClustering
from sklearn.linear_model import LogisticRegression

nSubjects = 40
training_data = np.array([])
training_labels = np.array([])

R_all, vertices, faces, mask, rho, rho_1 = initialize()

nCluster = 3
SC = SpectralClustering(n_clusters=nCluster, affinity='precomputed')
labels = SC.fit_predict(rho)
label = np.zeros(vertices.shape[0], dtype=float)
label[mask] = labels + 1

temp_d = R_all[mask, :39 * 1200]
temp_rho = np.corrcoef(temp_d)
temp_rho[~np.isfinite(temp_rho)] = 0
temp_labels = SC.fit_predict(temp_rho)
temp_label = np.zeros(vertices.shape[0], dtype=float)
temp_label[mask] = temp_labels + 1

mlab.triangular_mesh(vertices[:, 0], vertices[:, 1], vertices[:, 2], faces,
                     representation='surface', opacity=1,
                     scalars=np.float64(temp_label))
mlab.gcf().scene.parallel_projection = True
def intuitive_semi_supervised(userId, file_path, inputlabels, k_min, k_max,
                              num_cluster, assignLabels, seed, method):
    # labels = pd.read_csv(label_path)
    # label_list = labels["Labels"].to_list()
    label_list = inputlabels.to_list()
    total_len = len(label_list)
    unknown_label = -1
    total_labeled = 0
    optimal_accuracy = 0
    optimal_k_min = 0
    optimal_k_max = 0
    kmer_table = pd.DataFrame(data={})
    output_df = pd.DataFrame(data={})
    # count the labeled samples (iterate over values, not indices)
    for lab in label_list:
        if lab != unknown_label:
            total_labeled = total_labeled + 1
    res = [0] * total_len
    if assignLabels == "none":
        for i in range(k_min, k_max + 1):
            for j in range(i, k_max + 1):
                temp_k_min = i
                temp_k_max = j
                kmer_table, output_df = get_kmer_table(file_path, temp_k_min,
                                                       temp_k_max)
                spectral_clustering = SpectralClustering(
                    n_clusters=num_cluster,
                    assign_labels="kmeans",
                    random_state=seed)
                labels = spectral_clustering.fit_predict(kmer_table)
                correct_count = 0
                for k in range(len(label_list)):
                    if label_list[k] != unknown_label:
                        if label_list[k] == labels[k]:
                            correct_count += 1
                temp_accuracy = correct_count / total_labeled
                if temp_accuracy > optimal_accuracy:
                    optimal_accuracy = temp_accuracy
                    optimal_k_min = i
                    optimal_k_max = j
                    res = labels
        for i in range(k_min, k_max + 1):
            for j in range(i, k_max + 1):
                temp_k_min = i
                temp_k_max = j
                kmer_table, output_df = get_kmer_table(file_path, temp_k_min,
                                                       temp_k_max)
                spectral_clustering = SpectralClustering(
                    n_clusters=num_cluster,
                    assign_labels="discretize",
                    random_state=seed)
                labels = spectral_clustering.fit_predict(kmer_table)
                correct_count = 0
                for k in range(len(label_list)):
                    if label_list[k] != unknown_label:
                        if label_list[k] == labels[k]:
                            correct_count += 1
                temp_accuracy = correct_count / total_labeled
                if temp_accuracy > optimal_accuracy:
                    optimal_accuracy = temp_accuracy
                    optimal_k_min = i
                    optimal_k_max = j
                    res = labels
        # update parameters for the front end
        new_params = {
            'accuracy': optimal_accuracy,
            'k_min': optimal_k_min,
            'k_max': optimal_k_max
        }
        update_parameters(userId, new_params)
    else:
        for i in range(k_min, k_max + 1):
            for j in range(i, k_max + 1):
                temp_k_min = i
                temp_k_max = j
                kmer_table, output_df = get_kmer_table(file_path, temp_k_min,
                                                       temp_k_max)
                spectral_clustering = SpectralClustering(
                    n_clusters=num_cluster,
                    assign_labels=assignLabels,
                    random_state=seed)
                labels = spectral_clustering.fit_predict(kmer_table)
                correct_count = 0
                temp_accuracy = 0
                for k in range(len(label_list)):
                    if label_list[k] != unknown_label:
                        if label_list[k] == labels[k]:
                            correct_count += 1
                temp_accuracy = correct_count / total_labeled
                if temp_accuracy > optimal_accuracy:
                    optimal_accuracy = temp_accuracy
                    optimal_k_min = i
                    optimal_k_max = j
                    res = labels
        # update parameters for the front end
        new_params = {
            'accuracy': optimal_accuracy,
            'k_min': optimal_k_min,
            'k_max': optimal_k_max
        }
        update_parameters(userId, new_params)
    plot_div = plotly_dash_show_plot(userId, kmer_table, res,
                                     "Semi-supervised Spectral Clustering",
                                     method)
    output_df.insert(0, "Labels", res)
    return [[output_df], [plot_div]]
def spectral_init(self):
    from sklearn.cluster import SpectralClustering

    spectral_clust = SpectralClustering(n_clusters=self.k,
                                        affinity='precomputed')
    # fit_predict both fits the model and returns labels, so a separate
    # fit() call would just cluster the same matrix twice
    self.Z = spectral_clust.fit_predict(self.A)
def supervised_clu(feature, rmMulti, trial):
    (part1Pos, part1Neg, part2Pos, part2Neg, part3Pos, part3Neg, part4Pos,
     part4Neg, part5Pos, part5Neg, globalPos, globalNeg) = data_selection(feature, rmMulti)
    sumpurity = 0
    sumfone = 0
    for i in range(0, trial):
        print '#', i + 1, 'trial!!!'
        pos_dataset = dic2List(globalPos)  # dic2List(part1Pos) + dic2List(part2Pos) + dic2List(part3Pos) + dic2List(part4Pos) + dic2List(part5Pos)
        neg_dataset = dic2List(globalNeg)  # dic2List(part1Neg) + dic2List(part2Neg) + dic2List(part3Neg) + dic2List(part4Neg) + dic2List(part5Neg)
        # print len(pos_dataset)
        num_pos_sample = int(0.3 * len(pos_dataset))
        num_neg_sample = num_pos_sample
        (posPicked, posNotPicked) = takingSamples(pos_dataset, num=num_pos_sample)
        (negPicked, negNotPicked) = takingSamples(neg_dataset, num=num_neg_sample)
        # print len(posPicked), len(negPicked)
        # print posPicked, posNotPicked
        # train_X = pd.DataFrame(mat2arr(list2Dic(posPicked).values() + list2Dic(negPicked).values()))
        train_X = pd.DataFrame(
            list2Dic(posPicked).values() + list2Dic(negPicked).values())
        train_y = np.array(
            [1 for i in range(len(list2Dic(posPicked).values()))] +
            [0 for i in range(len(list2Dic(negPicked).values()))])
        print len(train_X), len(train_y)
        reg = RFC(n_estimators=200, max_features='log2')
        model = reg.fit(train_X, train_y)
        # print 'model ready!'

        # print 'get affinity matrix...'
        matrixVal = {}
        for item in posPicked:
            matrixVal[str(item.keys()[0])] = 1
        for item in negPicked:
            matrixVal[str(item.keys()[0])] = 0
        test_X = posNotPicked + negNotPicked
        modelIn = list2Dic(test_X)
        test_Y = model.predict_proba(modelIn.values())[:, 1]
        for i in range(0, len(modelIn)):
            matrixVal[modelIn.keys()[i]] = test_Y[i]
        # print matrixVal.keys()
        # print map(eval, matrixVal.keys())
        # print matrixVal.values()

        row = []
        col = []
        docMap = {}
        mapDoc = {}
        size = 0
        for pair in map(eval, matrixVal.keys()):
            for doc in pair:
                if not docMap.has_key(doc):
                    docMap[doc] = size
                    mapDoc[size] = doc
                    size += 1
        # print mapDoc
        # print docMap
        for pair in map(eval, matrixVal.keys()):
            row.append(docMap[pair[0]])
            col.append(docMap[pair[1]])
        for pair in map(eval, matrixVal.keys()):
            row.append(docMap[pair[1]])
            col.append(docMap[pair[0]])
        data = matrixVal.values() + matrixVal.values()
        # print size
        affinity = csc_matrix((data, (row, col)), shape=(size, size)).toarray()
        # print 'affinity matrix get!'

        # print 'run clustering...'
        # groundTruth = json.loads(open('groundTruth.json').read())
        # groundTruth = json.loads(open('rmMultiGroundTruth.json').read())
        # some documents appear in one part only once, but multiple times in global;
        # rmMultiGroundTruthNew.json is for simply combining all parts only
        groundTruth = json.loads(open('rmMultiGroundTruthNew.json').read())
        # groundTruth = json.loads(open('part1CluInd.json').read())
        # groundTruth = json.loads(open('rmMultiPart5CluInd.json').read())
        num_clu = len(groundTruth)
        # print num_clu
        model = SC(n_clusters=num_clu, affinity='precomputed')
        res = model.fit_predict(affinity)
        # print res
        # print len(res), len(set(res))
        resDic = {}
        for i in range(len(res)):
            if not resDic.has_key(res[i]):
                resDic[res[i]] = []
                resDic[res[i]].append(mapDoc[i])
            else:
                resDic[res[i]].append(mapDoc[i])
        result = resDic.values()
        purVal = purity(result, groundTruth)
        (pre, rec, fone) = fmeasure(result, groundTruth)
        sumpurity += purVal
        sumfone += fone
        print 'purity %.4f' % purVal, 'precision: %.4f' % pre, 'recall: %.4f' % rec, 'f1: %.4f' % fone
    return (sumpurity, sumfone)
def train(args):
    parameters = vars(args)
    train_loader1, test_loader1 = args.loaders1
    train_loader2, test_loader2 = args.loaders2
    models = define_models(**parameters)
    initialize(models, args.reload, args.save_path, args.model_path)

    ssx = args.ssx.to(args.device)
    ssx.eval()
    zxs, labelsx = get_initial_zx(train_loader1, ssx, args.device)
    zys, labelsy = get_initial_zx(train_loader2, ssx, args.device)

    sc = SpectralClustering(args.nc, affinity='sigmoid', gamma=1.7)
    clusters = sc.fit_predict(zxs.cpu().numpy())
    clusters = torch.from_numpy(clusters).to(args.device)

    classifier = models['classifier'].to(args.device)
    discriminator = models['discriminator'].to(args.device)
    classifier.apply(he_init)
    discriminator.apply(he_init)
    print(classifier)
    print(discriminator)

    optim_discriminator = optim.Adam(discriminator.parameters(), lr=args.lr,
                                     betas=(args.beta1, args.beta2))
    optim_classifier = optim.Adam(classifier.parameters(), lr=args.lr,
                                  betas=(args.beta1, args.beta2))
    optims = {
        'optim_discriminator': optim_discriminator,
        'optim_classifier': optim_classifier
    }

    iteration = infer_iteration(
        list(models.keys())[0], args.reload, args.model_path, args.save_path)
    t0 = time.time()
    for i in range(iteration, args.iterations):
        classifier.train()
        discriminator.train()

        perm = torch.randperm(len(zxs))
        ix = perm[:args.train_batch_size]
        zx = zxs[ix]
        perm = torch.randperm(len(zys))
        iy = perm[:args.train_batch_size]
        zy = zys[iy]
        optim_discriminator.zero_grad()
        d_loss = disc_loss(zx, zy, discriminator, classifier.x,
                           classifier.mlp, args.device)
        d_loss.backward()
        optim_discriminator.step()

        perm = torch.randperm(len(zxs))
        ix = perm[:args.train_batch_size]
        zx = zxs[ix]
        label = clusters[ix].long()
        perm = torch.randperm(len(zys))
        iy = perm[:args.train_batch_size]
        zy = zys[iy]
        optim_classifier.zero_grad()
        c_loss = classification_loss(zx, label, classifier)
        tcw_loss = classification_target_loss(zy, classifier)
        dw_loss = embed_div_loss(zx, zy, discriminator, classifier.x,
                                 classifier.mlp, args.device)
        m_loss1 = mixup_loss(zx, classifier, args.device)
        m_loss2 = mixup_loss(zy, classifier, args.device)
        (args.cw * c_loss).backward()
        (args.tcw * tcw_loss).backward()
        (args.dw * dw_loss).backward()
        (args.smw * m_loss1).backward()
        (args.tmw * m_loss2).backward()
        optim_classifier.step()

        if i % args.evaluate == 0:
            print('Iter: %s' % i, time.time() - t0)
            classifier.eval()
            class_map = evaluate_cluster(args.visualiser, i, args.nc, zxs,
                                         labelsx, classifier, f'x', args.device)
            evaluate_cluster_accuracy(args.visualiser, i, zxs, labelsx,
                                      class_map, classifier, f'x', args.device)
            evaluate_cluster_accuracy(args.visualiser, i, zys, labelsy,
                                      class_map, classifier, f'y', args.device)
            save_path = args.save_path
            with open(os.path.join(save_path, 'c_loss'), 'a') as f:
                f.write(f'{i},{c_loss.cpu().item()}\n')
            with open(os.path.join(save_path, 'tcw_loss'), 'a') as f:
                f.write(f'{i},{tcw_loss.cpu().item()}\n')
            with open(os.path.join(save_path, 'dw_loss'), 'a') as f:
                f.write(f'{i},{dw_loss.cpu().item()}\n')
            with open(os.path.join(save_path, 'm_loss1'), 'a') as f:
                f.write(f'{i},{m_loss1.cpu().item()}\n')
            with open(os.path.join(save_path, 'm_loss2'), 'a') as f:
                f.write(f'{i},{m_loss2.cpu().item()}\n')
            with open(os.path.join(save_path, 'd_loss2'), 'a') as f:
                f.write(f'{i},{d_loss.cpu().item()}\n')
            args.visualiser.plot(c_loss.cpu().detach().numpy(),
                                 title='Source classifier loss', step=i)
            args.visualiser.plot(tcw_loss.cpu().detach().numpy(),
                                 title='Target classifier cross entropy', step=i)
            args.visualiser.plot(dw_loss.cpu().detach().numpy(),
                                 title='Classifier marginal divergence', step=i)
            args.visualiser.plot(m_loss1.cpu().detach().numpy(),
                                 title='Source mix up loss', step=i)
            args.visualiser.plot(m_loss2.cpu().detach().numpy(),
                                 title='Target mix up loss', step=i)
            args.visualiser.plot(d_loss.cpu().detach().numpy(),
                                 title='Discriminator loss', step=i)
            t0 = time.time()
            save_models(models, i, args.model_path, args.evaluate)
            save_models(optims, i, args.model_path, args.evaluate)
# divide the dataset into train and test sets in a 7 : 3 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)
# print(X_train[:, 2:])
# print(len(y))
# print(X_train)

# derive the clusters
n_clusters = 3  # len(np.unique(y_train))
clu = SpectralClustering(n_clusters=n_clusters, n_jobs=-1)
clu.fit(X_train[:, 2:])
y_labels_train = clu.labels_
y_labels_test = clu.fit_predict(X_test[:, 2:])

# `new` (the new user's datapoint) is assumed to be defined earlier
predict = []
predict.append(new)
predict = np.array(predict)

# train the dataset using a classification algorithm, with the clusters
# derived above as the target class / output column
clf = XGBClassifier(n_jobs=-1)
clf.fit(X_train[:, 2:], y_labels_train)
print(clf)

# predict the target class / output of the new user's datapoint
prediction = clf.predict(predict[:, 2:])
predict_class = prediction[0]
s = new.size
prediction_np = np.array(prediction)
class CIF_Dataset(Dataset):
    def __init__(self, part_data=None, norm_obj=None, normalization=None,
                 max_num_nbr=12, radius=8, dmin=0, step=0.2, cls_num=3,
                 root_dir='DATA/CIF-DATA/'):
        self.root_dir = root_dir
        self.max_num_nbr, self.radius = max_num_nbr, radius
        self.normalizer = norm_obj
        self.normalization = normalization
        self.full_data = part_data
        self.ari = AtomCustomJSONInitializer(self.root_dir + 'atom_init.json')
        self.gdf = GaussianDistance(dmin=dmin, dmax=self.radius, step=step)
        self.clusterizer = SPCL(n_clusters=cls_num, random_state=None,
                                assign_labels='discretize')
        self.clusterizer2 = KMeans(n_clusters=cls_num, random_state=None)
        self.encoder_elem = ELEM_Encoder()
        self.update_root = None

    def __len__(self):
        # the dataset is stored in full_data
        return len(self.full_data)

    @functools.lru_cache(maxsize=None)  # Cache loaded structures
    def __getitem__(self, idx):
        cif_id, target = self.full_data.iloc[idx]
        crystal = Structure.from_file(
            os.path.join(self.root_dir, cif_id + '.cif'))
        atom_fea = np.vstack([
            self.ari.get_atom_fea(crystal[i].specie.number)
            for i in range(len(crystal))
        ])
        atom_fea = torch.Tensor(atom_fea)

        all_nbrs = crystal.get_all_neighbors(self.radius, include_index=True)
        all_nbrs = [sorted(nbrs, key=lambda x: x[1]) for nbrs in all_nbrs]
        nbr_fea_idx, nbr_fea = [], []
        for nbr in all_nbrs:
            if len(nbr) < self.max_num_nbr:
                nbr_fea_idx.append(
                    list(map(lambda x: x[2], nbr)) +
                    [0] * (self.max_num_nbr - len(nbr)))
                nbr_fea.append(
                    list(map(lambda x: x[1], nbr)) +
                    [self.radius + 1.] * (self.max_num_nbr - len(nbr)))
            else:
                nbr_fea_idx.append(
                    list(map(lambda x: x[2], nbr[:self.max_num_nbr])))
                nbr_fea.append(
                    list(map(lambda x: x[1], nbr[:self.max_num_nbr])))
        nbr_fea_idx, nbr_fea = np.array(nbr_fea_idx), np.array(nbr_fea)
        nbr_fea = self.gdf.expand(nbr_fea)

        g_coords = crystal.cart_coords
        groups = [0] * len(g_coords)
        if len(g_coords) > 2:
            # fall back to KMeans when spectral clustering fails
            try:
                groups = self.clusterizer.fit_predict(g_coords)
            except:
                groups = self.clusterizer2.fit_predict(g_coords)
        groups = torch.tensor(groups).long()

        atom_fea = torch.Tensor(atom_fea)
        nbr_fea = torch.Tensor(nbr_fea)
        nbr_fea_idx = self.format_adj_matrix(torch.LongTensor(nbr_fea_idx))
        target = torch.Tensor([float(target)])
        coordinates = torch.tensor(g_coords)
        enc_compo = self.encoder_elem.encode(crystal.composition)
        return (atom_fea, nbr_fea, nbr_fea_idx), groups, enc_compo, \
            coordinates, target, cif_id, \
            [crystal[i].specie for i in range(len(crystal))]

    def format_adj_matrix(self, adj_matrix):
        size = len(adj_matrix)
        src_list = list(range(size))
        all_src_nodes = torch.tensor([[x] * adj_matrix.shape[1]
                                      for x in src_list]).view(-1).long().unsqueeze(0)
        all_dst_nodes = adj_matrix.view(-1).unsqueeze(0)
        return torch.cat((all_src_nodes, all_dst_nodes), dim=0)
"""Prepare an ML model using KMeans algorithm to cluster some sample input generated using make_moon function. Plot the clusters. Also plot the same points by clustering it with Spectral Clustering Model. """ from sklearn.datasets.samples_generator import make_moons from sklearn.cluster import KMeans from sklearn.cluster import SpectralClustering import sklearn.metrics import matplotlib.pyplot as plt import pandas as pd import numpy as np X,y_true = make_moons(n_samples = 300,noise = 0.05) kmeans = KMeans(n_clusters = 4) kmeans.fit(X) y_means = kmeans.predict(X) plt.scatter(X[ :,0], X[ :,1], s=50,c = y_means, cmap = 'viridis' ) #plt.show() model = SpectralClustering(2,affinity = 'nearest_neighbors') labels = model.fit_predict(X) plt.scatter(X[ :,0], X[ :,1], s=50,c = labels, cmap = 'viridis' ) plt.show()
def mgm_floyd(X, K, num_graph, num_node):
    """
    :param K: affinity matrix, (num_graph, num_graph, num_node^2, num_node^2)
    :param num_graph: number of graphs, int
    :param num_node: number of nodes, int
    :return: matching results, (num_graph, num_graph, num_node, num_node)
    """
    Lambda = 0
    affinity_matrix = cal_affinity_matrix(X, K, num_graph)
    max = np.max(affinity_matrix)
    min = np.min(affinity_matrix)
    affinity_matrix = (affinity_matrix - min) / (max - min)

    clu_number = 2
    cluster = SpectralClustering(n_clusters=clu_number, affinity='precomputed')
    labels_ = cluster.fit_predict(affinity_matrix)
    # print(labels_)
    clusters = [[] for i in range(clu_number)]
    for i in range(num_graph - 1):
        clusters[labels_[i]].append(i)
    index = [0]
    tmp = 0
    for i in range(len(clusters)):
        tmp += len(clusters[i])
        index.append(tmp)
    graph_rearrange = []
    for item in clusters:
        graph_rearrange.extend(item)
    # raise Exception('STOP!')

    for i in range(len(clusters)):
        for v in clusters[i]:
            begin = index[i]
            end = index[i + 1]
            rearrange = (graph_rearrange[begin:end] +
                         graph_rearrange[:begin] + graph_rearrange[end:])
            for x in rearrange:
                for y in rearrange:
                    # calculate S_org
                    J_xy_ori = single_affinity(X[x][y], K[x][y])
                    J_xy = (J_xy_ori - min) / (max - min)
                    S_org = J_xy
                    # calculate S_opt
                    X_xv_vy = np.matmul(X[x][v], X[v][y])
                    J_xv_vy = (single_affinity(X_xv_vy, K[x][y]) - min) / (max - min)
                    S_opt = J_xv_vy
                    # compare and update
                    if S_org < S_opt:
                        X[x][y] = np.matmul(X[x][v], X[v][y])
                        X[y][x] = np.matmul(X[y][v], X[v][x])
                    if J_xy_ori < min:
                        min = J_xy_ori
                    elif J_xy_ori > max:
                        max = J_xy_ori

    # set lambda and repeat the above process
    Lambda = 0.45
    affinity_matrix = cal_affinity_matrix(X, K, num_graph)
    max = np.max(affinity_matrix)
    min = np.min(affinity_matrix)
    # consistency_matrix = cal_pairwise_consistency_matrix(X, num_graph, num_node)
    # use unary consistency to speed up
    consistency_matrix = cal_unary_consistency_matrix(X, num_graph, num_node)
    flag = False  # use flag to check whether X is updated
    for i in range(len(clusters)):
        for v in clusters[i]:
            begin = index[i]
            end = index[i + 1]
            rearrange = (graph_rearrange[begin:end] +
                         graph_rearrange[:begin] + graph_rearrange[end:])
            if flag:
                # consistency_matrix = cal_pairwise_consistency_matrix(X, num_graph, num_node)
                # use unary consistency to speed up
                consistency_matrix = cal_unary_consistency_matrix(
                    X, num_graph, num_node)
                flag = False
            for x in rearrange:
                for y in rearrange:
                    J_xy_ori = single_affinity(X[x][y], K[x][y])
                    J_xy = (J_xy_ori - min) / (max - min)
                    Cp_xy = consistency_matrix[y]
                    S_org = (1 - Lambda) * J_xy + Lambda * Cp_xy
                    X_xv_vy = np.matmul(X[x][v], X[v][y])
                    J_xv_vy = (single_affinity(X_xv_vy, K[x][y]) - min) / (max - min)
                    # if using unary consistency
                    C_xv_vy = consistency_matrix[v]
                    # if using pairwise consistency:
                    # C_xv_vy = math.sqrt(consistency_matrix[x][v] * consistency_matrix[v][y])
                    S_opt = (1 - Lambda) * J_xv_vy + Lambda * C_xv_vy
                    if S_org < S_opt:
                        X[x][y] = np.matmul(X[x][v], X[v][y])
                        X[y][x] = np.matmul(X[y][v], X[v][x])
                        flag = True
                    if J_xy_ori < min:
                        min = J_xy_ori
                    elif J_xy_ori > max:
                        max = J_xy_ori
    return X
def TestCaltech():
    similarityMatrix, labels = DataLoad.LoadCaltech()
    spectralSimMatrix = sklearn.preprocessing.normalize(similarityMatrix)
    n = similarityMatrix.shape[0]
    nmiVals = np.zeros((21, 2))
    numConstraints = np.zeros((21))
    classRanges = DataGen.GenClassRanges(labels)
    normAssocAverages = []
    spectralAverages = []
    # The value inside range indicates the number of iterations to be averaged
    for j in range(1):
        for i in range(21):
            constraintMatrix = np.zeros((n, n))
            spectralConstraintMatrix = np.zeros((n, n))
            constraintMatrix = DataGen.GenerateConstraints(
                constraintMatrix, classRanges, i * 50, n, 4, False, False)
            spectralConstraintMatrix = DataGen.GenerateConstraints(
                spectralSimMatrix, classRanges, i * 50, n, 4, False, True)
            ssKernelKMeansAgent = SSKernelKMeans()
            spectralClusteringAgent = SpectralClustering(
                n_clusters=4, affinity='precomputed')
            spectralAffMatrix = spectralConstraintMatrix - csgraph.laplacian(
                spectralSimMatrix)
            spectralAffMatrix = (
                ssKernelKMeansAgent.findSigma(spectralAffMatrix) *
                np.identity(n)) + spectralAffMatrix
            ssClusterAssignments = ssKernelKMeansAgent.Cluster(
                similarityMatrix, constraintMatrix, 4)
            spectralClusteringAssignments = spectralClusteringAgent.fit_predict(
                spectralAffMatrix)
            nmiVals[i, 0] = max(
                0,
                sklearn.metrics.normalized_mutual_info_score(
                    DataGen.GetTestLabels(classRanges, n, labels.tolist()),
                    DataGen.GetTestLabels(classRanges, n, ssClusterAssignments)))
            nmiVals[i, 1] = sklearn.metrics.normalized_mutual_info_score(
                DataGen.GetTestLabels(classRanges, n, labels.tolist()),
                DataGen.GetTestLabels(classRanges, n,
                                      spectralClusteringAssignments.tolist()))
            numConstraints[i] = i * 50
            print('SS Kernel K Means NMI with ' + str(numConstraints[i]) +
                  ' constraints = ' + str(nmiVals[i, 0]))
            print('Spectral Clustering NMI with ' + str(numConstraints[i]) +
                  ' constraints = ' + str(nmiVals[i, 1]))
        normAssocAverages.append(nmiVals[:, 0])
        spectralAverages.append(nmiVals[:, 1])
    plt.plot(numConstraints, np.mean(normAssocAverages, axis=0), '--x')
    plt.plot(numConstraints, np.mean(spectralAverages, axis=0), ':s')
    plt.legend(['SS Kernel KMeans - Ratio Association', 'Spectral Clustering'],
               loc='upper left')
    plt.xlabel('Number of Constraints')
    plt.ylabel('NMI Value')
    plt.title('Caltech Data Set')
    plt.show()
# consider only 10000 data points (spectral clustering memory complexity):
ind = np.array(10000 * [1] + (X.shape[0] - 10000) * [0]).astype(bool)
ind = shuffle(ind)

data_thr10 = pd.DataFrame(X[ind])
data_thr10.columns = data.columns

scaler = StandardScaler()
X = scaler.fit_transform(X)
X = X[ind]

for n_clusters in range(2, 10):
    sc = SpectralClustering(n_clusters=n_clusters)
    preds = sc.fit_predict(X)
    print "components:", set(preds)
    print np.bincount(preds)

    data_thr10['preds'] = pd.Series(preds).astype("category")
    color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
                 "brown", "green", "orange"]
    title = str(np.bincount(preds))
    TOOLS = "wheel_zoom,box_zoom,reset,box_select,pan"
    plot_width = 900
    plot_height = 300
    x_name = 'rateCA'
    y_name = 'rate'
    xmin_p = np.percentile(data_thr10[x_name], 0.1)
def spectral(X, n):
    instance = SpectralClustering(n_clusters=n, affinity='linear')
    return instance.fit_predict(X)
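# Sanity-check sketch for the 'linear' kernel affinity: shift the features to
# be non-negative first, since a linear kernel on arbitrary data can produce
# negative "similarities" (synthetic blobs, illustration only).
from sklearn.datasets import make_blobs

_X, _ = make_blobs(n_samples=40, centers=2, random_state=5)
_X = _X - _X.min(axis=0)  # make all coordinates non-negative
print(spectral(_X, 2))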
def learn_mix_model_beta(category, K=4, kappa=5):
    with open(Dict['Dictionary'], 'rb') as fh:
        _, centers, _ = pickle.load(fh)

    sim_fname = os.path.join(Feat['cache_dir'], 'simmat',
                             'simmat_mthrh045_{}.pickle'.format(category))
    feat_fname = os.path.join(Feat['cache_dir'],
                              'feat_{}_train.pickle'.format(category))
    savename = os.path.join(
        root_dir, 'mix_model',
        'mmodel_{}_K{}_notrain_beta.pickle'.format(category, K))

    # Spectral clustering based on the similarity matrix
    with open(sim_fname, 'rb') as fh:
        mat_dis1, _ = pickle.load(fh)

    mat_dis = mat_dis1
    N = mat_dis.shape[0]
    print('total number of instances for obj {}: {}'.format(category, N))
    mat_full = mat_dis + mat_dis.T - np.ones((N, N))
    np.fill_diagonal(mat_full, 0)
    W_mat = 1. - mat_full
    print('W_mat stats: {}, {}'.format(np.mean(W_mat), np.std(W_mat)))

    K1 = 2
    cls_solver = SpectralClustering(n_clusters=K1, affinity='precomputed',
                                    random_state=666)
    lb = cls_solver.fit_predict(W_mat)

    K2 = 2
    idx2 = []
    W_mat2 = []
    lb2 = []
    for k in range(K1):
        idx2.append(np.where(lb == k)[0])
        W_mat2.append(W_mat[np.ix_(idx2[k], idx2[k])])
        print('W_mat_i stats: {}, {}'.format(np.mean(W_mat2[k]),
                                             np.std(W_mat2[k])))
        cls_solver = SpectralClustering(n_clusters=K2, affinity='precomputed',
                                        random_state=666)
        lb2.append(cls_solver.fit_predict(W_mat2[k]))

    rst_lbs1 = np.ones(len(idx2[0])) * -1
    rst_lbs1[np.where(lb2[0] == 0)[0]] = 0
    rst_lbs1[np.where(lb2[0] == 1)[0]] = 1
    rst_lbs2 = np.ones(len(idx2[1])) * -1
    rst_lbs2[np.where(lb2[1] == 0)[0]] = 2
    rst_lbs2[np.where(lb2[1] == 1)[0]] = 3
    rst_lbs = np.ones(N) * -1
    rst_lbs[idx2[0]] = rst_lbs1
    rst_lbs[idx2[1]] = rst_lbs2
    rst_lbs = rst_lbs.astype('int')
    del (mat_dis)

    for kk in range(K):
        print('cluster {} has {} samples'.format(kk, np.sum(rst_lbs == kk)))

    # Load the feature vector and compute VC encoding
    with open(feat_fname, 'rb') as fh:
        layer_feature = pickle.load(fh)
    assert (N == len(layer_feature))

    r_set = [None for nn in range(N)]
    for nn in range(N):
        iheight, iwidth = layer_feature[nn].shape[0:2]
        lff = layer_feature[nn].reshape(-1, featDim)
        lff_norm = lff / np.sqrt(np.sum(lff ** 2, 1)).reshape(-1, 1)
        r_set[nn] = cdist(lff_norm, centers, 'cosine').reshape(iheight, iwidth, -1)

    # transfer from distance space to firing rate space, center crop
    layer_feature_fr = [None for nn in range(N)]
    for nn in range(N):
        hnn, wnn = r_set[nn].shape[0:2]
        if hnn > 14:
            marg = (hnn - 14) // 2
            r_set[nn] = r_set[nn][marg:marg + 14, :, :]
        elif wnn > 14:
            marg = (wnn - 14) // 2
            r_set[nn] = r_set[nn][:, marg:marg + 14, :]
        layer_feature_fr[nn] = np.exp(-kappa * r_set[nn])

    del (layer_feature)
    del (r_set)

    all_train = [[] for kk in range(K)]
    for nn in range(N):
        if nn % 100 == 0:
            print(nn, end=' ', flush=True)
        all_train[rst_lbs[nn]].append(layer_feature_fr[nn].ravel())
    print('')

    all_alphas = [None for kk in range(K)]
    all_betas = [None for kk in range(K)]
    all_N = [0 for kk in range(K)]
    for kk in range(K):
        data_kk = np.array(all_train[kk])
        all_alphas[kk] = np.zeros(data_kk.shape[1])
        all_betas[kk] = np.zeros(data_kk.shape[1])
        for dd in range(data_kk.shape[1]):
            all_alphas[kk][dd], all_betas[kk][dd], _, _ = beta.fit(data_kk[:, dd])
        all_N[kk] = data_kk.shape[0]

    assert (N == np.sum(all_N))
    all_priors = np.array(all_N) / N

    with open(savename, 'wb') as fh:
        pickle.dump([all_alphas, all_betas, all_priors], fh)
def do_work(cluster_test, cluster_cnt):
    # Pass a list of tuples and a counter that increments each time we go
    # through the loop. The tuples are the data to be used by k-means,
    # and the PCA-derived features for graphing. We use k-means to fit a
    # model to the data, then store the predicted values and the two-feature
    # PCA solution in the data frame.
    for counter, data in enumerate([
            (X1, X_pca1), (X2, X_pca2), (X3, X_pca3), (X4, X_pca4)]):
        # Put the features into ypred.
        ypred['pca_f1' + '_sample' + str(counter)] = data[1][:, 0]
        ypred['pca_f2' + '_sample' + str(counter)] = data[1][:, 1]
        # Generate cluster predictions and store them for clusters 2 to 4.
        for nclust in range(2, 5):
            pred = KMeans(n_clusters=cluster_cnt,
                          random_state=42).fit_predict(data[0])
            ypred['clust' + str(nclust) + '_sample' + str(counter)] = pred

    # Get predicted clusters.
    if (cluster_test == KMEANS):
        print("In test - >", cluster_test)
        output[LABEL] = type_lbls[cluster_test]
        output[CLUSTERS] = cluster_cnt
        full_pred = KMeans(n_clusters=cluster_cnt,
                           random_state=42).fit_predict(X_norm)
        # Create a list of pairs, where each pair is the ground truth group
        # and the assigned cluster.
        y = df.iloc[:, 13]
        y = np.where(y > 0, 0, 1)
        c = list(itertools.product(y, full_pred))
        # Count how often each type of pair (a, b, c, or d) appears.
        RIcounts = [[x, c.count(x)] for x in set(c)]
        # Create the same counts but without the label, for easier math below.
        RIcounts_nolabel = [c.count(x) for x in set(c)]
        # Calculate the Rand Index.
        RIscore = (RIcounts_nolabel[3] + RIcounts_nolabel[2]) / np.sum(RIcounts_nolabel)
        output[RISCORE] = RIscore
        output[ARS] = metrics.adjusted_rand_score(y, full_pred)
        output[METS1] = metrics.silhouette_score(X_norm, type_lbls,
                                                 metric='sqeuclidean')

    if (cluster_test == MEANSHFT):
        print("In test - >", cluster_test)
        output[LABEL] = type_lbls[cluster_test]
        output[CLUSTERS] = cluster_cnt
        output[RISCORE] = 0
        output[ARS] = 0
        bandwidth = estimate_bandwidth(X_norm, quantile=0.2, n_samples=500)
        # Declare and fit the model.
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        full_pred = ms.fit(X_norm)
        # Extract cluster assignments for each data point.
        labels = ms.labels_
        y = df.iloc[:, 13]
        y = np.where(y > 0, 0, 1)
        y = print(pd.crosstab(y, labels))
        # Coordinates of the cluster centers.
        cluster_centers = ms.cluster_centers_
        output[RISCORE] = 999
        output[METS1] = metrics.silhouette_score(X_norm, labels,
                                                 metric='sqeuclidean')

    if (cluster_test == SPECTRAL):
        print("In test - >", cluster_test)
        output[LABEL] = type_lbls[cluster_test]
        output[CLUSTERS] = cluster_cnt
        # Declare and fit the model.
        sc = SpectralClustering(n_clusters=cluster_cnt)
        sc.fit(X_norm)
        # Predicted clusters.
        y = df.iloc[:, 13]
        y = np.where(y > 0, 0, 1)
        full_pred = sc.fit_predict(X_norm)
        print("spectral->", pd.crosstab(y, full_pred))
        # Create a list of pairs, where each pair is the ground truth group
        # and the assigned cluster.
        c = list(itertools.product(y, full_pred))
        # Count how often each type of pair (a, b, c, or d) appears.
        RIcounts = [[x, c.count(x)] for x in set(c)]
        print("spectral RI Count ->", RIcounts)
        output[ARS] = metrics.adjusted_rand_score(y, full_pred)
        # Create the same counts but without the label, for easier math below.
        RIcounts_nolabel = [c.count(x) for x in set(c)]
        # Calculate the Rand Index.
        RIscore = (RIcounts_nolabel[3] + RIcounts_nolabel[2]) / np.sum(RIcounts_nolabel)
        output[RISCORE] = RIscore
        # score the spectral predictions (not a leftover labels variable)
        output[METS1] = metrics.silhouette_score(X_norm, full_pred,
                                                 metric='sqeuclidean')

    if (cluster_test == AFFINITY):
        print("In test - >", cluster_test)
        output[LABEL] = type_lbls[cluster_test]
        output[CLUSTERS] = cluster_cnt
        # Compute Affinity Propagation
        input_x = np.array(X_norm)
        af = AffinityPropagation().fit(input_x)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        print("labels = ", labels)
        n_clusters_ = len(cluster_centers_indices)
        y = df.iloc[:, 13]
        y = np.where(y > 0, 0, 1)
        labels_true = y
        print('Estimated number of clusters: %d' % n_clusters_)
        print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
        print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
        print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
        print("Adjusted Rand Index: %0.3f"
              % metrics.adjusted_rand_score(labels_true, labels))
        print("Adjusted Mutual Information: %0.3f"
              % metrics.adjusted_mutual_info_score(labels_true, labels))
        output[RISCORE] = 0
        output[ARS] = metrics.adjusted_rand_score(labels_true, labels)
        output[METS1] = metrics.silhouette_score(X_norm, labels,
                                                 metric='sqeuclidean')
        visualizer_inter.show()
        st.pyplot()
    except:
        st.write("Fill all parameters.")

########################################
# Spectral Clustering
########################################
if ML_option == "Spectral Clustering":
    try:
        # Spectral parameters
        Nk = st.number_input("Number of clusters: ", min_value=1, step=1)
        SpecClus = SpectralClustering(n_clusters=Nk, affinity='nearest_neighbors',
                                      assign_labels='kmeans')
        pred = SpecClus.fit_predict(data_feature)

        st.subheader("Classification Report")
        st.text(classification_report(data_target, pred))

        # Confusion matrix
        plot_confusion_matrix(data_target, pred, figsize=(7, 5), cmap="PuBuGn")
        bottom, top = plt.ylim()
        plt.ylim(bottom + 0.5, top - 0.5)
        st.pyplot()

        # Elbow Method
        visualizer = KElbowVisualizer(SpecClus, k=(1, 10))
        visualizer.fit(data_feature)
        visualizer.show()
        st.pyplot()
# Three concentric circles; spectral clustering with a precomputed
# Gaussian affinity, sweeping the kernel width sigma.
data1 = np.vstack((np.cos(t), np.sin(t))).T
data2 = np.vstack((2 * np.cos(t), 2 * np.sin(t))).T
data3 = np.vstack((3 * np.cos(t), 3 * np.sin(t))).T
data = np.vstack((data1, data2, data3))
n_clusters = 3

# Pairwise squared Euclidean distances between all points.
m = euclidean_distances(data, squared=True)

plt.figure(figsize=(12, 8), facecolor='w')
plt.suptitle('Spectral Clustering', fontsize=16)
clrs = plt.cm.Spectral(np.linspace(0, 0.8, n_clusters))
for i, s in enumerate(np.logspace(-2, 0, 6)):
    print(s)
    # m already holds squared distances, so the Gaussian affinity is
    # exp(-m / s^2); the small offset keeps the matrix strictly positive.
    af = np.exp(-m / (s ** 2)) + 1e-6
    model = SpectralClustering(n_clusters=n_clusters, affinity='precomputed',
                               assign_labels='kmeans', random_state=1)
    y_hat = model.fit_predict(af)
    plt.subplot(2, 3, i + 1)
    for k, clr in enumerate(clrs):
        cur = (y_hat == k)
        plt.scatter(data[cur, 0], data[cur, 1], s=40, c=clr, edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(b=True, ls=':', color='#808080')
    plt.title(r'$\sigma$ = %.2f' % s, fontsize=13)
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()
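# For reference, the precomputed Gaussian affinity above matches (up to the
# 1e-6 offset) what SpectralClustering builds internally with affinity='rbf';
# a minimal equivalent sketch, assuming the data, n_clusters, and s from the
# loop above, with gamma = 1 / s**2:
model_rbf = SpectralClustering(n_clusters=n_clusters, affinity='rbf',
                               gamma=1.0 / s ** 2, assign_labels='kmeans',
                               random_state=1)
y_hat_rbf = model_rbf.fit_predict(data)  # clusters the raw points, not the affinity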
players = {}
data = []
names = []
data_file = open("kda_200.txt", "r")

# Build data from file (comma-separated: name, then numeric stats).
for line in data_file:
    fields = line.split(",")
    data.append([float(fields[1]), float(fields[3]),
                 float(fields[4]), float(fields[4])])
    names.append(fields[0])
data_file.close()

# Create and fit model; fit_predict both fits and returns labels.
clus = SpectralClustering(n_clusters=5, eigen_solver='arpack',
                          affinity="nearest_neighbors")
labels = clus.fit_predict(data)

# Sort the fitted data into 5 boxes, one for each role.
boxes = [[], [], [], [], []]
for x in range(len(data)):
    pred = labels[x]
    name = names[x]
    # Names like "Amazing (Maurice Stuckenschneider)" are too long; cut at first space.
    if " " in name:
        name = name[0:name.find(" ") + 1]
    boxes[pred].append(name.ljust(10))

# Get size of largest cluster so you can pad the others.
sizes = [len(b) for b in boxes]
biggest = max(sizes)
from sklearn import datasets
import numpy as np
from sklearn.cluster import SpectralClustering

iris = datasets.load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names
print(target_names)

# Adjacency matrix of a small undirected graph with three communities.
A = np.array([[0, 1, 1, 0, 0, 0, 0, 0, 1, 1],
              [1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
              [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
              [0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
              [0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
              [0, 0, 0, 1, 1, 0, 1, 1, 0, 0],
              [0, 0, 0, 0, 0, 1, 0, 1, 0, 0],
              [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
              [1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
              [1, 0, 0, 0, 0, 0, 0, 0, 1, 0]])

sc = SpectralClustering(3, affinity='precomputed', n_init=100,
                        assign_labels='discretize')
sc.fit_predict(A)
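# A short sketch to inspect which graph nodes land in each cluster
# (uses the sc fitted above):
for k in np.unique(sc.labels_):
    print("cluster %d: nodes %s" % (k, np.where(sc.labels_ == k)[0].tolist()))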
                marker=marker[i], label='%s' % i)
plt.legend()
plt.show()

## Perform spectral clustering
sc = SpectralClustering(n_clusters=nClass, n_init=10, gamma=0.1, affinity='rbf',
                        n_neighbors=3, assign_labels='kmeans', degree=3,
                        coef0=1, kernel_params=None)
ypred = sc.fit_predict(train_x)
# adjusted_mutual_info_score computes AMI, the adjusted variant of NMI.
nmi_sc = metrics.adjusted_mutual_info_score(train_y, ypred)
ari_sc = metrics.adjusted_rand_score(train_y, ypred)
print >> sys.stderr, ('AMI for spectral clustering: %.2f' % nmi_sc)
print >> sys.stderr, ('ARI for spectral clustering: %.2f' % ari_sc)

## Perform KMeans
km = KMeans(n_clusters=nClass, init='k-means++', n_init=10)
ypred = km.fit_predict(train_x)
nmi_km = metrics.adjusted_mutual_info_score(train_y, ypred)
ari_km = metrics.adjusted_rand_score(train_y, ypred)
print >> sys.stderr, ('AMI for Kmeans: %.2f' % nmi_km)
print >> sys.stderr, ('ARI for Kmeans: %.2f' % ari_km)

train_set = train_x, train_y
dataset = [train_set, train_set, train_set]
def show_clustered_dataset(X, Y):
    fig, ax = plt.subplots(1, 1, figsize=(30, 25))
    ax.grid()
    ax.set_xlabel('X')
    ax.set_ylabel('Y')

    for i in range(nb_samples):
        if Y[i] == 0:
            ax.scatter(X[i, 0], X[i, 1], marker='o', color='r')
        else:
            ax.scatter(X[i, 0], X[i, 1], marker='^', color='b')

    plt.show()


if __name__ == '__main__':
    warnings.simplefilter("ignore")

    # Create dataset
    X, Y = make_moons(n_samples=nb_samples, noise=0.05)

    # Show dataset
    show_dataset(X, Y)

    # Create and train Spectral Clustering
    sc = SpectralClustering(n_clusters=2, affinity='nearest_neighbors')
    Y = sc.fit_predict(X)

    # Show clustered dataset
    show_clustered_dataset(X, Y)
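# On two moons, k-means splits the data linearly while nearest-neighbors
# spectral clustering follows the two manifolds; a minimal side-by-side
# comparison sketch (generates its own data, independent of the script above):
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.datasets import make_moons
from sklearn.metrics import adjusted_rand_score

Xm, ym = make_moons(n_samples=1000, noise=0.05)
km_labels = KMeans(n_clusters=2, n_init=10).fit_predict(Xm)
sc_labels = SpectralClustering(n_clusters=2,
                               affinity='nearest_neighbors').fit_predict(Xm)
print('ARI k-means : %.3f' % adjusted_rand_score(ym, km_labels))
print('ARI spectral: %.3f' % adjusted_rand_score(ym, sc_labels))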
def cluster_faces(name, img_list='all-scores-faces-list-new'):
    root = root_all + 'face_recognition/' + '@'.join(name.split('-'))
    cnn_root = root_all + 'face_recognition_CNN/' + name + '/'

    f = open(cnn_root + 'waldo_normalized_combined.cPickle', 'r')
    combined_matrix = cPickle.load(f)
    f.close()

    # Normalize similarities to an affinity: A_ij / sqrt(A_ii * A_jj),
    # then rescale the result to [0, 1].
    diag = np.diag(combined_matrix)
    diag = diag[:, np.newaxis]
    normalize_matrix = np.dot(diag, np.transpose(diag))
    normalize_matrix = np.sqrt(normalize_matrix)
    affinity_matrix = np.divide(combined_matrix, normalize_matrix)
    min_ = np.min(affinity_matrix)
    max_ = np.max(affinity_matrix)
    affinity_matrix = (affinity_matrix - min_) / (max_ - min_)

    sc = SpectralClustering(affinity='precomputed',
                            n_clusters=min(8, affinity_matrix.shape[0] - 1),
                            eigen_solver='arpack',
                            n_neighbors=min(5, affinity_matrix.shape[0]))
    a = sc.fit_predict(affinity_matrix)

    groups = {}
    temp = zip(a, xrange(len(a)))
    for i in temp:
        if i[0] not in groups:
            groups[i[0]] = [i[1]]
        else:
            groups[i[0]].append(i[1])

    # A cluster is kept as a "unique person" if its mean pairwise
    # similarity is high enough and it contains more than one face.
    unique_person_id = []
    for kk in groups:
        min_similarity = np.Inf
        max_similarity = -np.Inf
        mean_similarity = 0
        this_group_ids = groups[kk]
        for j in xrange(len(this_group_ids)):
            for i in xrange(j + 1, len(this_group_ids)):
                temp = combined_matrix[this_group_ids[i], this_group_ids[j]]
                if temp < min_similarity:
                    min_similarity = temp
                if temp > max_similarity:
                    max_similarity = temp
                mean_similarity += temp
        mean_similarity /= max(1, len(this_group_ids) * (len(this_group_ids) - 1) / 2)
        print len(this_group_ids), mean_similarity, max_similarity, min_similarity
        print mean_similarity
        if mean_similarity > 0.4 and len(this_group_ids) > 1:
            unique_person_id.append(kk)

    important_person = []
    for i in unique_person_id:
        important_person.append([i, len(groups[i])])
    important_person.sort(key=lambda x: x[1], reverse=True)

    in_path = root + '-dir/' + img_list
    imgs_list = []
    with open(in_path, 'r') as data:
        for line in data:
            line = line[:-1]
            imgs_list.append(line.split('/')[-1])

    temp = zip(a, imgs_list)
    face_groups = {}
    for i in temp:
        if i[0] not in face_groups:
            face_groups[i[0]] = [i[1]]
        else:
            face_groups[i[0]].append(i[1])

    create_face_group_html(name, face_groups, important_person)

    f = open(cnn_root + 'waldo_group_combined.cPickle', 'w')
    cPickle.dump([face_groups, important_person], f)
    f.close()
def silhouette(X, n_clusters, algorithm, monthNo):
    fig, (ax1, ax2) = plt.subplots(1, 2)
    X = np.array(X)

    if algorithm == KMeans:
        clusterer = algorithm(n_clusters=n_clusters, random_state=10)
    elif algorithm == AgglomerativeClustering:
        clusterer = algorithm(n_clusters=n_clusters, linkage='ward')
    elif algorithm == SpectralClustering:
        clusterer = SpectralClustering(n_clusters=n_clusters)
    elif algorithm == AffinityPropagation:
        clusterer = AffinityPropagation(preference=-5.0, damping=0.95)
    elif algorithm == MeanShift:
        clusterer = MeanShift(0.175, cluster_all=False)

    cluster_labels = clusterer.fit_predict(X)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot.
    # The silhouette coefficient can range from -1 to 1, but in this example
    # all values lie within [-0.1, 1].
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the
    # formed clusters.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For algorithm =", algorithm, " n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them.
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for the next plot.
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for the average silhouette score of all the values.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd plot showing the actual clusters formed. Note that the scatter is
    # colored by the labels computed above, but the centers come from a fresh
    # KMeans fit, regardless of which algorithm was passed in.
    colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters.
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers.
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    if algorithm == KMeans:
        alg = ("Silhouette analysis for Preprocessing = real , algorithm = KMeans "
               + str(n_clusters) + " , month = " + str(monthNo)
               + ", average silhouette_score = " + str(silhouette_avg))
        plt.suptitle(alg, fontsize=14, fontweight='bold')
        plt.savefig('result/Kmean/real' + '_Kmeans' + str(n_clusters) + '_m' + str(monthNo))
    elif algorithm == AgglomerativeClustering:
        alg = ("Silhouette analysis for Preprocessing = real , algorithm = AgglomerativeClustering "
               + str(n_clusters) + " , month = " + str(monthNo)
               + ", average silhouette_score = " + str(silhouette_avg))
        plt.suptitle(alg, fontsize=14, fontweight='bold')
        plt.savefig('result/AgglomerativeClustering/real' + '_Agg' + str(n_clusters) + '_m' + str(monthNo))
    elif algorithm == AffinityPropagation:
        alg = ("Silhouette analysis for Preprocessing = real , algorithm = AffinityPropagation , month = "
               + str(monthNo) + ", average silhouette_score = " + str(silhouette_avg))
        plt.suptitle(alg, fontsize=14, fontweight='bold')
        plt.savefig('result/AffinityPropagation/real' + '_Aff_m' + str(monthNo))
    elif algorithm == MeanShift:
        alg = ("Silhouette analysis for Preprocessing = real , algorithm = MeanShift , month = "
               + str(monthNo) + ", average silhouette_score = " + str(silhouette_avg))
        plt.suptitle(alg, fontsize=14, fontweight='bold')
        plt.savefig('result/MeanShift/real' + '_MeanShift_m' + str(monthNo))
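# Hypothetical usage sketch (X_month is an assumed 2-D feature matrix, and
# the result/... output directories must already exist for savefig):
# silhouette(X_month, n_clusters=4, algorithm=KMeans, monthNo=3)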
        writer.writerow(rows[i] + [labels[i]])

# 5. We apply spectral clustering with 66 clusters.

# In[5]:

"""
spectral = SpectralClustering(n_clusters=66, eigen_solver='arpack',
                              affinity='nearest_neighbors', n_neighbors=10,
                              kernel_params={'radius': 0.095,
                                             'metric': 'euclidean',
                                             'mode': 'distance'},
                              n_init=20)
"""
spectral = SpectralClustering(n_clusters=66, eigen_solver='arpack',
                              affinity='nearest_neighbors', gamma=0.095)
labels = spectral.fit_predict(X)
unique_labels = set(labels)

# 6. We plot the results of spectral clustering.

# In[6]:

# Plot
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k != -1:
        class_member_mask = (labels == k)
        xy = X[class_member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markersize=6)
plt.title('Estimated number of clusters: %d' % len(unique_labels))
plt.show()
# One-hot encode column 17, then scale all features to [0, 1].
onehotencoder = OneHotEncoder(categorical_features=[17])
x = onehotencoder.fit_transform(x).toarray()

from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
x = min_max_scaler.fit_transform(x)
print(x)

from sklearn.cluster import SpectralClustering
spec = SpectralClustering(n_clusters=9, assign_labels="discretize", random_state=0)
spec_predict = spec.fit_predict(x)
print(spec_predict)

from sklearn import metrics
print("Silhouette Score: %0.3f"
      % metrics.silhouette_score(x, spec_predict, metric='euclidean'))
print("Calinski-Harabaz Index: %0.3f"
      % metrics.calinski_harabaz_score(x, spec_predict))

# Using the elbow method to find the optimal number of clusters
# from sklearn.cluster import KMeans
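# The commented-out elbow method relies on KMeans inertia, which
# SpectralClustering does not expose; a sketch that instead scans
# n_clusters with the silhouette score (assumes the scaled matrix x above):
from sklearn.metrics import silhouette_score

for k in range(2, 12):
    labels_k = SpectralClustering(n_clusters=k, assign_labels="discretize",
                                  random_state=0).fit_predict(x)
    print("k=%d silhouette=%.3f" % (k, silhouette_score(x, labels_k)))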
def spectral_plot(x, y, ncl=3, alabels="kmeans", title="Spectral Clustering"):
    scluster = SpectralClustering(n_clusters=ncl, affinity='rbf',
                                  assign_labels=alabels, random_state=0)
    pred_y = scluster.fit_predict(y)
    '''
    indx_low = 3
    indx_high = ul_ - 1
    tru_list = []
    colbox = scluster.labels_
    while indx_high < cluster_data_:
        for indx in range(indx_low, indx_high):
            if colbox[indx] != colbox[indx+1]:
                if colbox[indx] == colbox[indx-1] and colbox[indx] == colbox[indx-2]:
                    tru_list.append(indx+1)
        indx_low += ul_
        indx_high += ul_

    fig = plt.figure(figsize=(20, 8))
    plt.subplot(121)
    plt.scatter(y[:, 0], y[:, 1], c=scluster.labels_, cmap='coolwarm',
                label='Spectral, ' + str(alabels) + " " + str(ncl) + ' clusters')
    plt.legend(bbox_to_anchor=(0, 1.06), loc='upper left', ncol=1)
    plt.xlabel("VAE1", fontsize=10)
    plt.ylabel("VAE2", fontsize=10)
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)

    plt.subplot(122)
    plt.scatter(x[:, 0], x[:, 1], c=scluster.labels_, cmap='coolwarm',
                label='Spectral, ' + str(alabels) + " " + str(ncl) + ' clusters')
    plt.legend(bbox_to_anchor=(0, 1.06), loc='upper left', ncol=1)
    plt.ylabel('Temperature(' + u"\u2103" + ")", fontsize=10)
    plt.xlabel('Composition(%)', fontsize=10)
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)
    for i, val in enumerate(tru_list):
        plt.annotate(str(int(x[:, 0][val])) + '%',
                     xy=(x[:, 0][val], x[:, 1][val]),
                     xytext=(x[:, 0][val], 1 + x[:, 1][val]),
                     ha='center', arrowprops=dict(arrowstyle="->"))
    fig.suptitle(title, fontsize=16)
    plt.savefig(os.getcwd() + '/plots/' + str(n_latent) + "d_" + title[:8] + "_"
                + str(ncl) + "_" + str(alabels) + '.png')
    plt.close()
    '''
    label_data = [float(run_i)]
    relabel_data = [float(run_i)]

    # Relabel clusters in order of first appearance so runs are comparable.
    relabelled_list = []
    uni_vals = []
    for k in scluster.labels_:
        if k not in uni_vals:
            uni_vals.append(k)
    for k in scluster.labels_:
        relabelled_list.append(uni_vals.index(k))

    label_data.extend(scluster.labels_)
    relabel_data.extend(relabelled_list)

    if ncl == 2:
        if alabels == "kmeans":
            two_spectral.append(label_data)
            two_spectral.append(relabel_data)
        if alabels == "discretize":
            twod_spectral.append(label_data)
            twod_spectral.append(relabel_data)
    if ncl == 3:
        if alabels == "kmeans":
            three_spectral.append(label_data)
            three_spectral.append(relabel_data)
        if alabels == "discretize":
            threed_spectral.append(label_data)
            threed_spectral.append(relabel_data)
    if ncl == 4:
        if alabels == "kmeans":
            four_spectral.append(label_data)
            four_spectral.append(relabel_data)
        if alabels == "discretize":
            fourd_spectral.append(label_data)
            fourd_spectral.append(relabel_data)
for mc in range(MC1):
    print("\n\n\033[91mIteration", mc + 1, "\033[0m")
    trainX, testX, trainy, testy = train_test_split(
        X, y, train_size=0.8,
        random_state=np.random.randint(1000), stratify=y)
    NtrainX = normalize(trainX)
    NtestX = normalize(testX)
    train = pd.concat([trainX, trainy], axis=1)
    test = pd.concat([testX, testy], axis=1)
    Ntrain = pd.concat([pd.DataFrame(NtrainX, index=trainX.index), trainy], axis=1)
    Ntest = pd.concat([pd.DataFrame(NtestX, index=testX.index), testy], axis=1)

    sc = SpectralClustering(n_clusters=2, gamma=1.0, affinity="rbf",
                            random_state=np.random.randint(1000)).fit(NtrainX)
    cls = sc.labels_
    # Note: SpectralClustering has no predict(); fit_predict re-clusters
    # the test set from scratch.
    clste = sc.fit_predict(NtestX)

    # Map each cluster to its majority class (assumes the class label is
    # the first column of trainy).
    temp = trainy.copy()
    temp['cluster'] = cls  # B
    c = []
    for cluster in range(2):
        classlist = temp.loc[temp['cluster'] == cluster].iloc[:, 0].tolist()
        c.append(max(classlist, key=classlist.count))
    print("Clusters:", c)

    bivpredtr = []
    bivdftr = []
    tempdf = decfunc(NtrainX, cls)
    j = 0
    for i in cls:
    gt_label = []
    for tv in video_index:
        gt_label.append(video_2_action[tv])

    # non_over_label = []
    # for i in range(label_pred.shape[0]):
    #     non_over_label.append(int(label_pred[i]))

    gt_label = np.array(gt_label)
    gt_label = np.squeeze(gt_label)
    # print(gt_label.shape)

    from sklearn import metrics
    print("Adjusted rand score %.4f" % metrics.adjusted_rand_score(gt_label, label_pred))
    print("NMI %.4f" % metrics.normalized_mutual_info_score(gt_label, label_pred))
    return cluster_2_action, soft_cluster_2_action


num_subset_class = 100
subset_class, subset_index, subset_atten_fea = get_subset(
    num_subset_class, action_2_video, training_index, att_fea_v1)
affinity_matrix, sorted_video_index, sorted_video_fea = get_affinity(
    subset_index, subset_atten_fea, action_2_video)
subset_atten_fea = sorted_video_fea
subset_index = sorted_video_index

# Cluster the precomputed affinity matrix.
num_of_cluster = num_subset_class
estimator = SpectralClustering(n_clusters=num_of_cluster, random_state=0,
                               affinity='precomputed')
estimator.fit_predict(affinity_matrix)
label_pred = estimator.labels_
cluster_2_action, soft_cluster_2_action = get_cluster_performance(
    num_of_cluster, label_pred, subset_index, action_2_video, video_2_action)
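# Beyond ARI/NMI, clustering "accuracy" is often reported by matching each
# cluster to a class with the Hungarian algorithm; a minimal sketch using
# scipy (assumes integer-coded label arrays; clustering_accuracy is a
# hypothetical helper name):
import numpy as np
from scipy.optimize import linear_sum_assignment

def clustering_accuracy(y_true, y_pred):
    # Contingency table: rows are predicted clusters, columns true classes.
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for t, p in zip(y_true, y_pred):
        w[p, t] += 1
    # Hungarian matching maximizes the total matched count.
    row, col = linear_sum_assignment(w.max() - w)
    return w[row, col].sum() / float(y_pred.size)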