def fit(self, draw=False, p=5):
    self.p = p
    self.draw = draw
    hie = None
    if draw:
        # for drawing: compute the full tree so a dendrogram can be built
        hie = AC(n_clusters=None, compute_full_tree=True, distance_threshold=0)
    else:
        hie = AC(n_clusters=self.nc)
    t0 = time.time()
    hie.fit(self.vectors)
    print("Training cost %0.3fs" % (time.time() - t0))
    self.model = hie
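# Hedged follow-up sketch (not part of the original class): if the model above was
# fitted with draw=True, its children_ and distances_ attributes can be converted
# into a SciPy linkage matrix for dendrogram plotting. The helper name is illustrative.
import numpy as np
from scipy.cluster.hierarchy import dendrogram

def draw_dendrogram(model):
    # count how many original samples fall under each merge node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]).astype(float)
    dendrogram(linkage_matrix)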
def process_and_plot(df, shrink, ix=None):
    df = z_scale(df.T).T
    if ix is None:
        ix = AC(4).fit(df).labels_.argsort()  # a trick to make better heatmaps
    cap = np.min([np.max(df.values), np.abs(np.min(df.values))])
    df = np.clip(df, -1 * cap, cap)
    custom_heatmap(df, shrink, ix=ix)
def __init__(self, n_clusters=2, affinity='euclidean', memory=None,
             connectivity=None, compute_full_tree='auto', linkage='ward',
             distance_threshold=None, compute_distances=False):
    self.n_clusters = n_clusters
    self.memory = memory
    self.compute_distances = compute_distances
    self.affinity = affinity
    self.linkage = linkage
    self.distance_threshold = distance_threshold
    self.connectivity = connectivity
    self.compute_full_tree = compute_full_tree
    self.model = AC(compute_distances=self.compute_distances,
                    distance_threshold=self.distance_threshold,
                    affinity=self.affinity,
                    connectivity=self.connectivity,
                    linkage=self.linkage,
                    n_clusters=self.n_clusters,
                    memory=self.memory,
                    compute_full_tree=self.compute_full_tree)
def hc_cluster_score(X, g):
    """Calculate the silhouette score for every candidate number of clusters
    produced by hierarchical clustering.

    Input:
        g: graph object
        X: precomputed distance matrix
    Output:
        Dict mapping each node to its label in the best-scoring clustering
        List of silhouette scores
    """
    scores = []
    labels = []
    for i in range(2, len(X)):
        hierarchical_model = AC(n_clusters=i, affinity='precomputed',
                                linkage='average').fit(X)
        l = hierarchical_model.labels_
        s = silhouette_score(X, l, metric="precomputed")
        scores.append(s)
        labels.append(l)
    idx = np.argmax(scores)
    clust_lab = {
        list(g.nodes())[i]: list(labels[idx])[i]
        for i in range(len(labels[idx]))
    }
    return clust_lab, scores
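# Hedged usage sketch (illustrative only): the graph and the way the precomputed
# distance matrix is built here are assumptions, not part of the original source.
# Assumes hc_cluster_score and its imports (AC, silhouette_score, np) are in scope.
import networkx as nx
import numpy as np

g = nx.karate_club_graph()
nodes = list(g.nodes())
# pairwise shortest-path lengths as a precomputed distance matrix
X = np.array([[nx.shortest_path_length(g, a, b) for b in nodes] for a in nodes])
clust_lab, scores = hc_cluster_score(X, g)
print('best silhouette:', max(scores), 'with', len(set(clust_lab.values())), 'clusters')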
def final_plot(dataset, num=None):
    scaled_dataset = scale_dataset(dataset.T).T
    if num is None:
        num = AC(4).fit(scaled_dataset).labels_.argsort()
    cap = np.min([np.max(scaled_dataset.values),
                  np.abs(np.min(scaled_dataset.values))])
    scaled_dataset = np.clip(scaled_dataset, -1 * cap, cap)
    plotting_heatmap(scaled_dataset, num=num)
def HAC():
    hac = AC(n_clusters=32).fit(feature)
    pred_label = hac.labels_
    print('silhouette_score = ',
          metrics.silhouette_score(feature, pred_label, metric='euclidean'))
    print('homogeneity_score = ',
          metrics.homogeneity_score(label, pred_label.tolist()))
    # print(hac.labels_)
    folder_result('HAC', pred_label)
def clusterize(self):
    print("Starting clustering...")
    t1 = time.monotonic()
    agg_clus = AC(n_clusters=self.num_clus, affinity=self.affinity,
                  linkage=self.linkage)
    self.predictions = agg_clus.fit_predict(self.feature_matrix)
    print("Done training in {}s".format(
        timedelta(seconds=time.monotonic() - t1)))
def hierarchical(n, img):
    # flatten the image to a (pixels x channels) matrix
    Z_2 = img.reshape((-1, len(img[0][0])))
    # print(Z_2)
    # ac_model = AC(n_clusters=n, linkage='average', compute_full_tree='false', affinity='cosine')
    ac_model = AC(n_clusters=n)
    ac_labels = ac_model.fit_predict(Z_2)
    img_labels_3 = ac_labels.reshape((img.shape[0], img.shape[1]))
    return img_labels_3
def scale_and_plot(df, ix=None):
    '''
    A wrapper function that scales the values within each row of df
    and passes the result to plot_hmap.
    '''
    df_marginal_scaled = scale_df(df.T).T
    if ix is None:
        ix = AC(4).fit(df_marginal_scaled).labels_.argsort()
    cap = np.min([np.max(df_marginal_scaled.values),
                  np.abs(np.min(df_marginal_scaled.values))])
    df_marginal_scaled = np.clip(df_marginal_scaled, -1 * cap, cap)
    plot_hmap(df_marginal_scaled, ix=ix)
    return df_marginal_scaled
def clustering(k, x, series_avg, met):
    if met == "KM":
        res = KMeans(k).fit(series_avg)
        return res.cluster_centers_, res.labels_
    elif met == "AC":
        res = AC(n_clusters=k, linkage="complete").fit(series_avg)
        cent = np.array(
            [np.mean(series_avg[res.labels_ == i], axis=0) for i in range(k)])
        return cent, res.labels_
    elif met == "KS":
        label = DTW.KShape(k, series_avg)
        cent = np.array(
            [np.mean(series_avg[label == i], axis=0) for i in range(k)])
        return cent, label
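# Hedged usage sketch (illustrative only): series_avg is assumed to be a 2-D array of
# averaged series; x is unused by the "AC" branch. Assumes the function above and its
# imports (AC, np) are in scope.
import numpy as np

rng = np.random.default_rng(0)
series_avg = rng.normal(size=(50, 20))
centers, labels = clustering(k=3, x=None, series_avg=series_avg, met="AC")
print(centers.shape, np.bincount(labels))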
def clustering(filtered, th_clust):
    """
    Clusters data using Agglomerative Clustering with a distance threshold
    (a threshold of 30 is the usual default here).
    """
    from sklearn.cluster import AgglomerativeClustering as AC
    agC = AC(n_clusters=None, distance_threshold=th_clust, memory=None)
    X = np.array(filtered.iloc[:, :3])
    agC.fit(X)
    labels = np.array([agC.labels_]).T
    # res = np.concatenate((X, np.array([agC.labels_]).T), axis=1)
    amountclusters = len(set(agC.labels_))
    print('Amount of clusters: ' + str(amountclusters))
    return labels, amountclusters
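# Hedged usage sketch (illustrative only): filtered is assumed to be a DataFrame whose
# first three columns are coordinates; the threshold of 30 matches the docstring above.
import numpy as np
import pandas as pd

filtered = pd.DataFrame(np.random.rand(100, 3) * 100, columns=['x', 'y', 'z'])
labels, amountclusters = clustering(filtered, th_clust=30)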
def log_anomalyPRF_AC(cp, ground_truth, dataset, log_flag, SEED=1234):
    # Init clustering hyperparameters
    n_clusters = cp.getint('Hyperparameters', 'ClusterNum')
    cluster_init = cp.getint('Hyperparameters', 'ClusterInit')
    # KMeans model
    # km = KMeans(n_clusters=n_clusters, n_init=cluster_init, n_jobs=-1,
    #             random_state=SEED)
    km = AC(n_clusters=n_clusters)
    if isinstance(dataset, str):
        pred = km.fit_predict(np.load(dataset))
    else:
        pred = km.fit_predict(dataset)
    pred = assign_labels(pred, ground_truth)
    print(CR(ground_truth, pred))
def log_NMI_AC(cp, ground_truth, dataset, log_flag, SEED=1234):
    # Init clustering hyperparameters
    n_clusters = cp.getint('Hyperparameters', 'ClusterNum')
    cluster_init = cp.getint('Hyperparameters', 'ClusterInit')
    # KMeans model
    # km = KMeans(n_clusters=n_clusters, n_init=cluster_init, n_jobs=-1,
    #             random_state=SEED)
    km = AC(n_clusters=n_clusters)
    if isinstance(dataset, str):
        pred = km.fit_predict(np.load(dataset))
    else:
        pred = km.fit_predict(dataset)
    log('--------------- {} {} ------------------------'.format(
        log_flag, NMI(ground_truth, pred)))
def perform_clustering(npop: int, coordinates, simulation) -> 'np.ndarray[int]':
    """Perform agglomerative clustering for the simulation.

    Return the labels inferred for each row of the coordinate matrix.
    """
    if simulation.output_level >= 1:
        print('clustering will be performed on a ' + str(coordinates.shape) +
              ' matrix')
    clusterer = AC(n_clusters=npop, compute_full_tree=True, linkage="ward")
    lab_infered = clusterer.fit_predict(coordinates)
    return lab_infered
def agglomerative_propagation(matrix, n_cluster, words):
    start = t.time()
    affinity = AC(affinity="precomputed", n_clusters=n_cluster,
                  linkage="complete", compute_full_tree=True)
    affinity.fit(matrix)
    clusters = []
    for index in range(0, n_cluster):
        lista = []
        clusters.append(lista)
    for index in range(0, len(words)):
        clusters[affinity.labels_[index]].append(words[index])
    end = t.time()
    return affinity, clusters, end - start
def twoDimension(data, nclusters, linkage_type):
    arrs = []
    for line in data:
        arrs.append(xy(line))
    nparr = np.asarray(arrs)
    # getDendogram(nparr, linkage_type)
    # Agglomerative clustering
    hc = AC(n_clusters=nclusters, affinity='euclidean', linkage=linkage_type)
    y_hc = hc.fit_predict(nparr)
    print("CLUSTER ZERO:", nparr[y_hc == 0])
    print("CLUSTER ONE:", nparr[y_hc == 1])
    print("CLUSTER TWO:", nparr[y_hc == 2])
    print("CLUSTER THREE:", nparr[y_hc == 3])
    plt.scatter(nparr[y_hc == 0, 0], nparr[y_hc == 0, 1], s=100, c='red')
    plt.scatter(nparr[y_hc == 1, 0], nparr[y_hc == 1, 1], s=100, c='black')
    plt.scatter(nparr[y_hc == 2, 0], nparr[y_hc == 2, 1], s=100, c='blue')
    plt.scatter(nparr[y_hc == 3, 0], nparr[y_hc == 3, 1], s=100, c='cyan')
    plt.show()
def clustering(idTfidf, num_clu, term_num):
    docFeature = idTfidf
    vecTfidf = {}
    for file in idTfidf:
        row = np.zeros(len(idTfidf[file]))
        col = list(idTfidf[file].keys())
        val = list(idTfidf[file].values())
        vec = csc_matrix((np.array(val), (np.array(row), np.array(col))),
                         shape=(1, term_num))
        vecTfidf[file] = vec.todense().tolist()[0]
    # print(vecTfidf)
    features = list(vecTfidf.values())
    selection = 'GM'  # selecting model here!!! Options: AgglomerativeClustering as AC, SpectralClustering as SC, GMM, GM
    if selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)
    resDic = {}
    for i in range(len(res)):
        if res[i] not in resDic:
            resDic[res[i]] = []
        resDic[res[i]].append(int(list(docFeature.keys())[i]))
    result = list(resDic.values())
    # print(result)
    with open('gt_GMRes.json', 'w') as f:
        f.write(json.dumps(result))
    return result
def choose_k(X, k_range):
    X = X.T
    X = X[:32]
    print(X.shape)
    X_mean = sum(X) / len(X)
    chs = []
    n = len(X)
    for k in range(2, k_range):
        clf = AC(n_clusters=k, linkage='average')
        clf.fit(X)
        labels = clf.labels_
        centroids = np.zeros((k, len(X[0])))
        counts = np.zeros((k, 1))
        for i in range(n):
            for l in range(k):
                if l == labels[i]:
                    centroids[l] += X[i]
                    counts[l][0] += 1
        centroids /= counts
        W = 0
        B = 0
        for label in range(k):
            for i in range(len(X)):
                if labels[i] == label:
                    W += np.linalg.norm((X[i] - centroids[label]), 2) ** 2
            B += counts[label][0] * (np.linalg.norm((centroids[label] - X_mean), 2) ** 2)
        up = B / (k - 1)
        down = W / (n - k)
        chs.append(up / down)
    plt.figure()
    plt.plot([i + 2 for i in range(len(chs))], chs)
    plt.xlabel('k')
    plt.ylabel('ch value')
    plt.title('Choose best k')
    plt.show()
def getClustersSK(self, X, method="single"):
    """
    Get the model and labels for every possible clustering from the built-in
    sklearn function for agglomerative clustering. No connectivity matrix is
    used, since the data are (apparently) unstructured.
    k = [1, N-1], where N is the number of observations.

    :type X: array-like
    :param X: 2D array containing the x, y coordinates of the points to be
        clustered.
    :type method: str, optional
    :param method: method for linking clusters. Defaults to 'single'.
    """
    Mmax = len(X) - 1
    M = np.arange(1, Mmax + 1)
    L = {}
    for k in M:
        model = AC(n_clusters=k, linkage=method, affinity="euclidean").fit(X)
        L.update({k: model.labels_})
    return L
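# Hedged usage sketch (illustrative only): getClustersSK is treated here as a
# standalone function (self is unused), which is an assumption for demonstration.
import numpy as np

rng = np.random.default_rng(1)
pts = rng.normal(size=(20, 2))
labels_by_k = getClustersSK(None, pts, method="average")
print(len(labels_by_k), 'clusterings for k = 1 ..', len(pts) - 1)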
def fiveDimension(data, nclusters, linkage_type):
    arrs = []
    for line in data:
        coor = line.split()
        d1 = float(coor[0])
        d2 = float(coor[1])
        d3 = float(coor[2])
        d4 = float(coor[3])
        d5 = float(coor[4])
        arrs.append([d1, d2, d3, d4, d5])
    nparr = np.asarray(arrs)
    # getDendogram(nparr, linkage_type)
    hc = AC(n_clusters=nclusters, affinity='euclidean', linkage=linkage_type)
    y_hc = hc.fit_predict(nparr)
    print(nparr[y_hc == 0][0, 0])
    plt.scatter(nparr[y_hc == 0, 0], nparr[y_hc == 0, 1], s=100, c='red')
    plt.scatter(nparr[y_hc == 1, 0], nparr[y_hc == 1, 1], s=100, c='black')
    plt.scatter(nparr[y_hc == 2, 0], nparr[y_hc == 2, 1], s=100, c='blue')
    plt.scatter(nparr[y_hc == 3, 0], nparr[y_hc == 3, 1], s=100, c='cyan')
    plt.show()
def unsupervised_clu(feature, part, model_selection):
    if part:
        if feature == 'graph':
            docFeature = json.loads(open('rmMultiPart1WOZeroGraph.json').read())
        if feature == 'doc2vec':
            docFeature = json.loads(open('rmMultiPart1Doc2vec.json').read())
        if feature == 'comb':
            walk = json.loads(open('rmMultiPart1WOZeroGraph.json').read())
            dv = json.loads(open('rmMultiPart1Doc2vec.json').read())
            docFeature = {}
            for doc in walk:
                val = walk[doc] + dv[doc]
                docFeature[doc] = val
        groundTruth = json.loads(open('rmMultiPart1CluInd.json').read())
        num_clu = len(groundTruth)  # number of clusters in each part
    else:
        rmMulti = True  # False
        if rmMulti:
            if feature == 'graph':
                docFeature = json.loads(open('rmMultiCluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(open('rmMultiCluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(open('rmMultiCluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('rmMultiCluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('rmMultiGroundTruth.json').read())
            # number of clusters after removing documents appearing in multiple
            # clusters, #doc = 1274 (3 all 0s for walk)
            num_clu = len(groundTruth)
        else:
            if feature == 'graph':
                docFeature = json.loads(open('cluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(open('cluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(open('cluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('cluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('groundTruth.json').read())
            # number of clusters before removing documents appearing in multiple
            # clusters, #doc = 1393 (3 all 0s for walk)
            num_clu = len(groundTruth)
    features = list(docFeature.values())
    if model_selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if model_selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if model_selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if model_selection == 'KMeans':
        model = KMeans(n_clusters=num_clu)
    if model_selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)
    resDic = {}
    for i in range(len(res)):
        if res[i] not in resDic:
            resDic[res[i]] = []
        resDic[res[i]].append(int(list(docFeature.keys())[i]))
    result = list(resDic.values())
    return (result, groundTruth)
def unsupervised(numClu, graphEmb):
    print('Building unsupervised model...')
    model = AC(n_clusters=numClu, affinity='cosine', linkage='complete')
    res = model.fit_predict(list(graphEmb.values()))
    return res
import time

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.decomposition import PCA

tfidf = pd.read_csv('tfidf.csv')
data = tfidf.values[:, 1:]
numClass = 4
AC_model = AC(n_clusters=numClass, affinity="euclidean", linkage='ward')  # hierarchical clustering

pca = PCA(n_components=10)
TnewData = pca.fit_transform(data)  # reduce dimensionality first
t0 = time.time()
AC_model.fit(TnewData)  # then cluster
elapsed_time = time.time() - t0

pca = PCA(n_components=2)  # project to two components for a 2-D plot
newData = pca.fit_transform(data)  # map the N-dimensional data down to 2-D
result = AC_model.labels_  # labels_ holds the clustering result
plt.scatter(newData[:, 0], newData[:, 1], c=result, cmap=plt.cm.nipy_spectral)
plt.show()
print("time(s):", elapsed_time)
for x in l:
    x1 = float(x.split(" ")[0])
    x2 = float(x.split(" ")[1])
    points.append([x1, x2])
points = np.array(points)
plt.scatter(points[:, 0], points[:, 1])
plt.show()

kmean = KMeans(n_clusters=2, random_state=42).fit(points)
plt.scatter(points[:, 0], points[:, 1], c=kmean.labels_)
plt.show()

aclust = AC(n_clusters=2)
aclust.fit(points)
plt.scatter(points[:, 0], points[:, 1], c=aclust.labels_)
plt.show()

dbc = DBSCAN()
dbc.fit(points)
plt.scatter(points[:, 0], points[:, 1], c=dbc.labels_)
plt.show()

file.close()
file = open('./data/Ring/2D_points.txt')
l = file.readlines()
ListVal.append(ListF[a:b])
a = b
b = b + 7

# Drop NaN entries and (1, 1) descriptors
ListKey = [y for x, y in zip(ListVal, ListKey)
           if not (math.isnan(x[0]) or (x[0] == 1 and x[1] == 1))]
ListVal = [x for x in ListVal
           if not (math.isnan(x[0]) or (x[0] == 1 and x[1] == 1))]
DictF = {x: y for x, y in zip(ListKey, ListVal)}
os.chdir('/media/roman/10A2FE37A2FE20C0/Clustering/')  # path to image
print('Processed {} descriptors'.format(len(DictF)))

for i in range(2, 101):
    agg = AC(n_clusters=i, linkage='average')
    assignment = agg.fit_predict(ListVal)
    result = Counter(assignment)
    clustElem = {}
    for ind, val in enumerate(assignment):
        if val + 1 not in clustElem.keys():
            clustElem[val + 1] = [ListKey[ind]]
        else:
            clustElem[val + 1].append(ListKey[ind])
    clustMedian = {i[0]: i[1][len(i[1]) // 2] for i in clustElem.items()}
    print('========== {} level =========='.format(i - 1))
    print('{} clusters'.format(i))
    cE = list(clustElem.items())
    cE.sort()
    for j in cE:
    'xticks': (),
    'yticks': ()
})
for i, (component, ax) in enumerate(zip(y_people[ind], axes.ravel())):
    ax.imshow(component.reshape(image_shape), cmap='gray')
    ax.set_title("{}. component".format(i + 1))

dbs = DBS()
dbs.fit(X_people)
dbs_assignments = dbs.labels_
dbs_means = dbs.core_sample_indices_
print(len(dbs_means))

a = 100
ac = AC(n_clusters=a)
ac.fit(X_people)
ac_assignments = ac.labels_
ac_means = ac.n_clusters
print(ac_means)
for i in range(a):
    ind = ac_assignments == i
    ent = entropy(y_people[ind])
    print("Cluster {:d}, size = {:d}, entropy = {:.3f}".format(
        i, np.sum(ind), ent))
    if ent > 4 and np.sum(ind) > 10:
        fig, axes = plt.subplots(2, 5, figsize=(15, 8), subplot_kw={
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
    # Add axis labels
    plt.xlabel('Data Point')
    plt.ylabel('Distance')


"""# Agglomerative Clustering with TSNE"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering as AC

ac = AC(n_clusters=None, distance_threshold=0)
ac.fit(X)
plt.figure(figsize=(12, 6))
plot_dendrogram(ac)
plt.xticks([])
plt.show()

from sklearn.manifold import TSNE

tsne = TSNE(random_state=146)
Xtsne = tsne.fit_transform(X)
n = 2
ac = AC(n_clusters=n)
clusters = ac.fit_predict(X)
colors = GetColors(n)
h5_path = '/media/bigdata/Abuzar_Data/AM23/AM23_4Tastes_200316_134649/AM23_4Tastes_200316_134649_repacked.bk'
h5_file = tables.open_file(h5_path, 'r')
unit_descriptors = h5_file.root.unit_descriptor[:]
sorted_units_path = '/sorted_units'
unit_num = 3
this_unit_waves = h5_file.get_node(
    os.path.join(sorted_units_path, 'unit{0:03d}'.format(unit_num),
                 'waveforms'))[:]
this_unit_pca = pca(n_components=3).fit_transform(this_unit_waves)
ac_cluster = AC().fit(this_unit_pca)
kmeans_cluster = kmeans(n_clusters=3).fit(this_unit_pca)
clust_method = ac_cluster
mean_wavs = [(np.mean(this_unit_waves[clust_method.labels_ == clust], axis=0),
              np.std(this_unit_waves[clust_method.labels_ == clust], axis=0))
             for clust in np.sort(np.unique(clust_method.labels_))]
img_plot(this_unit_waves[np.argsort(kmeans_cluster.labels_)])
plt.show()
for wav in mean_wavs:
    plt.fill_between(range(len(wav[0])),
                     wav[0] + 2 * wav[1],
                     wav[0] - 2 * wav[1],
# importing the dataset
dataset = pd.read_csv('Mall_customers.csv')
X = dataset.iloc[:, [3, 4]].values

# using the dendrogram to find the optimal number of clusters
import scipy.cluster.hierarchy as sch

dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean Distance')
plt.show()

# fitting the hierarchical clustering to the dataset
from sklearn.cluster import AgglomerativeClustering as AC

hc = AC(n_clusters=5, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(X)

# visualizing the clusters
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Careful')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Standard')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='green', label='Target')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s=100, c='cyan', label='Careless')
# plt.imshow(k_means(10, new), cmap=plt.get_cmap('hot'))
# plt.colorbar()
# plt.show()

# Z_2 = b.reshape((-1, len(b[0][0])))
# print(Z_2)
# gmm_model = GMM(n_components=4, covariance_type='tied').fit(Z_2)
# gmm_labels = gmm_model.predict(Z_2)
# img_labels_2 = label.reshape((b.shape[0], b.shape[1]))

scale_percent = 40  # percent of original size
width = int(rgb.shape[1] * scale_percent / 100)
height = int(rgb.shape[0] * scale_percent / 100)
dim = (width, height)
resized = cv2.resize(rgb, dim, interpolation=cv2.INTER_AREA)
Z_2 = resized.reshape((-1, len(resized[0][0])))
print(Z_2)
ac_model = AC(n_clusters=14, linkage='average',
              compute_full_tree=False, affinity='cosine')
ac_labels = ac_model.fit_predict(Z_2)
img_labels_3 = ac_labels.reshape((resized.shape[0], resized.shape[1]))

plt.imshow(img_labels_3, cmap=plt.get_cmap('hot'))
plt.colorbar()
plt.show()
# cv2.imshow('res2', res2)
# cv2.waitKey(0)
# cv2.destroyAllWindows()