def calculateNumberOfIdealClusters(maxAmount, corpus):
    """Pick the cluster count with the best average silhouette score.

    Ward-linkage agglomerative clustering is fitted on ``corpus`` once for
    every cluster count in ``range(2, maxAmount)``; the count whose labelling
    yields the highest average silhouette score wins.

    Args:
        maxAmount: Exclusive upper bound on the number of clusters to try.
        corpus: Feature matrix passed to ``AgglomerativeClustering`` and
            ``silhouette_score``.

    Returns:
        The best-scoring cluster count. Falls back to 2 when no candidate
        scores above 0, including when ``maxAmount <= 2`` and nothing is
        evaluated.
    """
    print("Initializing silhouette analysis")

    silhouette_high = 0
    silhouette_high_n_clusters = 2

    for n_clusters in range(2, maxAmount):
        cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward",
                                          affinity="euclidean")
        cluster_labels = cluster.fit_predict(corpus)

        # Average silhouette over all samples: a summary of how dense and
        # well separated the clusters are. The per-sample values were
        # previously computed here as well and then thrown away, so that
        # call has been dropped.
        silhouette_avg = silhouette_score(corpus, cluster_labels)
        print("For n_clusters = %d, the average silhouette_score is: %.5f"
              % (n_clusters, silhouette_avg))

        if silhouette_avg > silhouette_high:
            silhouette_high = silhouette_avg
            silhouette_high_n_clusters = n_clusters

    print("Highest score = %f for n_clusters = %d"
          % (silhouette_high, silhouette_high_n_clusters))
    return silhouette_high_n_clusters
def programmer_3():
    """Cluster the standardized base-station table and plot each cluster.

    Reads ``data/standardized.xls``, fits three-cluster Ward agglomerative
    clustering on it, and writes one line chart per cluster to
    ``tmp/type_<n>.png``. Each chart draws the first four columns of every
    row assigned to that cluster.
    """
    standardizedfile = "data/standardized.xls"
    pic_output = "tmp/type_"
    k = 3
    # One matplotlib format string per cluster.
    style = ["ro-", "go-", "bo-"]
    xlabels = [u"工作日人均停留时间", u"凌晨人均停留时间", u"周末人均停留时间", u"日均人流量"]
    label_column = u"聚类类别"

    data = pd.read_excel(standardizedfile, index_col=u"基站编号")

    # Hierarchical clustering.
    model = AgglomerativeClustering(n_clusters=k, linkage="ward")
    model.fit(data)

    # Original data with the assigned cluster label appended as the last column.
    cluster_series = pd.Series(model.labels_, index=data.index)
    r = pd.concat([data, cluster_series], axis=1)
    r.columns = list(data.columns) + [label_column]

    # One figure per cluster, each drawn in its own style.
    for i in range(k):
        plt.figure()
        members = r[r[label_column] == i].iloc[:, :4]
        for j in range(len(members)):
            plt.plot(range(1, 5), members.iloc[j], style[i])
        plt.xticks(range(1, 5), xlabels, rotation=20)
        plt.title(u"商圈类别%s" % (i + 1))
        # Leave room at the bottom for the rotated tick labels.
        plt.subplots_adjust(bottom=0.15)
        plt.savefig(u"%s%s.png" % (pic_output, i + 1))
def clustering_tweets_hc(labeled_tweets, num_cluster):
    """Group tweets with connectivity-constrained Ward clustering.

    The tweets are vectorized with ``cst_vectorizer.StemmedTfidfVectorizer``
    (configured from the module-level ``param``), clustered under a
    1-nearest-neighbour connectivity graph, and the members of each cluster
    are concatenated into a single string.

    Args:
        labeled_tweets: Indexable sequence of tweet strings.
        num_cluster: Number of clusters to form.

    Returns:
        A list of ``num_cluster`` strings. Each holds the cluster's tweets in
        reverse index order, every tweet followed by one space.
    """
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph

    tfidf = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = tfidf.fit_transform(labeled_tweets).toarray()

    connectivity = kneighbors_graph(tweet_vec, 1, include_self=False)
    model = AgglomerativeClustering(linkage='ward', connectivity=connectivity,
                                    n_clusters=num_cluster)
    model.fit(tweet_vec)
    assignments = model.labels_

    clustered_tweets = []
    for cluster_id in range(num_cluster):
        member_ids = (assignments == cluster_id).nonzero()[0]
        # Later tweets come first, matching the original prepend-in-a-loop.
        merged = ''.join(labeled_tweets[sid] + ' ' for sid in member_ids[::-1])
        clustered_tweets.append(merged)
    return clustered_tweets
def cluster_agg(cluster_data):
    """Assign each row of the module-level ``df`` to one of 11 Ward clusters.

    Fits the clustering on ``cluster_data`` and stores the labels in
    ``df['tier']``, mutating the module-level ``df`` in place.

    Args:
        cluster_data: Feature matrix to cluster.
            NOTE(review): presumably row-aligned with ``df`` - confirm.

    Returns:
        The ``Player`` and ``tier`` columns of ``df``.
    """
    model = AgglomerativeClustering(n_clusters=11, linkage='ward')
    df['tier'] = model.fit(cluster_data).labels_
    return df[['Player', 'tier']]
def test_agglomerative_clustering_with_distance_threshold(linkage):
    """Check the clusters obtained when cutting by ``distance_threshold``.

    For both unstructured and grid-connected clustering, the number of
    clusters and the labels produced by ``AgglomerativeClustering`` must
    match what the raw tree builder yields when cut at the same threshold.

    Args:
        linkage: Linkage name; also used as the key into ``_TREE_BUILDERS``.
    """
    rng = np.random.RandomState(0)
    # ``np.bool`` was only an alias of the builtin and has been removed from
    # NumPy; the builtin ``bool`` gives the same dtype on every version.
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)

    # Test with the distance threshold set to 10.
    distance_threshold = 10
    for conn in [None, connectivity]:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            connectivity=conn,
            linkage=linkage)
        clustering.fit(X)
        clusters_produced = clustering.labels_
        num_clusters_produced = len(np.unique(clustering.labels_))

        # The clusters must match the point in the linkage tree where the
        # merge distance first reaches the threshold.
        tree_builder = _TREE_BUILDERS[linkage]
        children, n_components, n_leaves, parent, distances = tree_builder(
            X, connectivity=conn, n_clusters=None, return_distance=True)
        num_clusters_at_threshold = np.count_nonzero(
            distances >= distance_threshold) + 1

        # Number of clusters produced.
        assert num_clusters_at_threshold == num_clusters_produced

        # Cluster membership produced.
        clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced,
                                        children=children,
                                        n_leaves=n_leaves)
        assert np.array_equiv(clusters_produced, clusters_at_threshold)
def buckshot(k, mat):
    """Seed ``k`` centroids by clustering a random sample of ``mat``.

    Draws ``int(sqrt(k * n_rows))`` row indices with ``np.random.randint``
    (so rows may repeat), runs average-linkage agglomerative clustering on
    the sampled rows, and averages each resulting cluster.

    Args:
        k: Number of clusters / centroids.
        mat: 2-D array of samples, one per row.

    Returns:
        A list of ``k`` centroid vectors.
    """
    n_rows = mat.shape[0]
    size = int((k * n_rows) ** .5)
    print(size)

    inds = np.random.randint(0, n_rows, size)
    print(inds)

    samp = np.zeros((size, mat.shape[1]))
    for row, source_index in enumerate(inds):
        samp[row] = mat[source_index]

    # Agglomerative clustering on the sample.
    hier = AgglomerativeClustering(n_clusters=k, linkage='average',
                                   affinity='euclidean', compute_full_tree=True)
    flat = hier.fit_predict(samp)

    # Centroid of each cluster = mean of its sampled rows.
    centroids = []
    for j in range(k):
        member_rows = [i for i, label in enumerate(flat) if label == j]
        print(len(member_rows))
        points = np.array([samp[m] for m in member_rows])
        centroids.append(np.mean(points, axis=0))
    return centroids
def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):
    """Cluster the x-velocity traces of a time window with the chosen algorithm.

    Args:
        N_CLUSTERS: Requested cluster count. Used by ``'kmeans'`` and
            ``'AgglomerativeClustering'``; overwritten with the discovered
            count for ``'affinity_propagation'`` and ``'DBSCAN'``.
        clusterType: One of ``'kmeans'``, ``'affinity_propagation'``,
            ``'DBSCAN'`` or ``'AgglomerativeClustering'``.
        data_for_trial_type: 3-D array indexed as
            ``[trial, time_frame, channel]``.
        begin_time: Window start, multiplied by
            ``self.griddy.TIME_GRID_SPACING`` to get a frame index.
        end_time: Window end (exclusive), scaled the same way.

    Returns:
        ``(labels, N_CLUSTERS)``. ``labels`` is ``None`` when ``clusterType``
        is not recognized; an error line is printed in that case.
    """
    BEGIN_TIME_FRAME = begin_time * self.griddy.TIME_GRID_SPACING
    END_TIME_FRAME = end_time * self.griddy.TIME_GRID_SPACING
    data = data_for_trial_type[:, BEGIN_TIME_FRAME:END_TIME_FRAME, self.griddy.VEL_X]

    labels = None
    if clusterType == 'kmeans':
        kmeans = KMeans(n_clusters=N_CLUSTERS)
        kmeans.fit(data)
        labels = kmeans.labels_
    elif clusterType == 'affinity_propagation':
        ap = AffinityPropagation(damping=0.75)
        ap.fit(data)
        labels = ap.labels_
        # Count the clusters from the labels just computed. This used to read
        # ``self.labels``, which is not the result of this fit.
        N_CLUSTERS = np.max(labels) + 1
    elif clusterType == 'DBSCAN':
        dbscan = DBSCAN()
        dbscan.fit(data)
        labels = dbscan.labels_
        N_CLUSTERS = np.max(labels) + 1
        print('N_CLUSTERS=' + str(N_CLUSTERS))
    elif clusterType == 'AgglomerativeClustering':
        ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
        ac.fit(data)
        labels = ac.labels_
    else:
        print('ERROR: clusterType: ' + clusterType + ' is not recognized')
    return (labels, N_CLUSTERS)
def __generate_dummy_data():
    """Fit average-linkage cosine clustering on a fixed 5x10 sample matrix.

    The first three rows hold values of magnitude around 1 and the last two
    hold values around 100. The merge tree is sent to ``DEBUG`` as a list of
    ``{'node_id', 'left', 'right'}`` dicts, with node ids counting up from
    the number of samples.

    Returns:
        ``(model, labels)`` for the fitted ``AgglomerativeClustering``.
    """
    from sklearn.cluster import AgglomerativeClustering
    import itertools

    X = np.array([
        [-5.27453240e-01, -6.14130238e-01, -1.63611427e+00, -9.26556498e-01,
         7.82296885e-01, -1.06286220e+00, -1.24368729e+00, -1.16151964e+00,
         -2.25816923e-01, -3.32354552e-02],
        [-2.01273137e-01, 5.25758359e-01, 1.37940072e+00, -7.63256657e-01,
         -1.27275323e+00, -1.31618084e+00, -7.00167331e-01, 2.21410669e+00,
         9.15456567e-01, 7.93076923e-01],
        [1.53249104e-01, -5.48642411e-01, -1.06559060e+00, -3.05253203e-01,
         -1.93393495e+00, 1.39827978e-01, 1.73359830e-01, 2.85576854e-02,
         -1.19427027e+00, 1.04395610e+00],
        [1.00595172e+02, 1.01661346e+02, 1.00115635e+02, 9.86884249e+01,
         9.86506406e+01, 1.02214982e+02, 1.01144087e+02, 1.00642778e+02,
         1.01635339e+02, 9.88981171e+01],
        [1.01506262e+02, 1.00525318e+02, 9.93021764e+01, 9.92514163e+01,
         1.01199015e+02, 1.01771241e+02, 1.00464097e+02, 9.97482396e+01,
         9.96888274e+01, 9.88297336e+01],
    ])

    model = AgglomerativeClustering(linkage="average", affinity="cosine")
    model.fit(X)

    # Merge nodes are numbered after the leaves, which take ids 0..n_samples-1.
    node_ids = itertools.count(X.shape[0])
    merges = []
    for pair in model.children_:
        merges.append({'node_id': next(node_ids), 'left': pair[0], 'right': pair[1]})
    DEBUG(str(merges))

    return model, model.labels_
def sp_connectivity(self, X, connectivity, n_clusters, n):
    """Return Ward-linkage cluster labels for ``X`` under ``connectivity``.

    Args:
        X: Samples to cluster.
        connectivity: Connectivity constraint forwarded to
            ``AgglomerativeClustering``.
        n_clusters: Number of clusters to form.
        n: Unused. It previously sized a placeholder array that was
            overwritten immediately; kept so existing callers still work.

    Returns:
        The label array produced by ``fit_predict``.
    """
    model = AgglomerativeClustering(linkage="ward",
                                    connectivity=connectivity,
                                    n_clusters=n_clusters)
    return model.fit_predict(X, None)
def knn_connectivity(self, X):
    """Plot average/complete/ward clustering with and without kNN connectivity.

    Two figures are produced, one unconstrained and one constrained by a
    30-nearest-neighbour graph. Each has one scatter subplot per linkage,
    titled with the fit time. All figures are shown at the end.

    Args:
        X: 2-D array; columns 0 and 1 are used as plot coordinates.
    """
    knn_graph = kneighbors_graph(X, 30, include_self=False)
    n_clusters = 4

    for connectivity in (None, knn_graph):
        plt.figure(figsize=(10, 4))
        for index, linkage in enumerate(('average', 'complete', 'ward')):
            plt.subplot(1, 3, index + 1)
            model = AgglomerativeClustering(linkage=linkage,
                                            connectivity=connectivity,
                                            n_clusters=n_clusters)
            t0 = time.time()
            model.fit(X)
            elapsed_time = time.time() - t0
            # ``plt.cm.spectral`` no longer exists in current matplotlib;
            # ``nipy_spectral`` is the same colormap under its current name.
            plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
                        cmap=plt.cm.nipy_spectral)
            plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
                      fontdict=dict(verticalalignment='top'))
            plt.axis('equal')
            plt.axis('off')

        # Figure-level layout and title: set once per figure.
        plt.subplots_adjust(bottom=0, top=.89, wspace=0, left=0, right=1)
        plt.suptitle('n_cluster=%i, connectivity=%r' %
                     (n_clusters, connectivity is not None), size=17)

    plt.show()
def plot_mfi(self, outputfile='embeddings.pdf', nb_clusters=8, weights='NA'):
    """Save a 2-D t-SNE map of the embeddings of the items in ``self.mfi``.

    Each item present in ``self.w2v_model`` is drawn as its own text at its
    t-SNE position, coloured by its Ward cluster.

    Args:
        outputfile: Path the figure is written to.
        nb_clusters: Number of Ward clusters used for colouring.
        weights: Unused; kept for interface compatibility.
    """
    # Keep only the items that have an embedding. Labelling from the full
    # ``self.mfi`` list would shift every name after the first missing item
    # onto the wrong coordinates.
    labels = [w for w in self.mfi if w in self.w2v_model]
    X = np.asarray([self.w2v_model[w] for w in labels], dtype='float32')

    # Dimension reduction.
    tsne = TSNE(n_components=2)
    coor = tsne.fit_transform(X)

    plt.clf()
    sns.set_style('dark')
    # ``sns.plt`` was only an alias for ``matplotlib.pyplot`` and is gone
    # from current seaborn; ``plt`` is used directly instead.
    plt.rcParams['axes.linewidth'] = 0.4
    fig, ax1 = plt.subplots()

    # Invisible markers first, so the axes limits cover every point.
    x1, x2 = coor[:, 0], coor[:, 1]
    ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')

    # Clustering on the 2-D coordinates provides the colouring.
    clustering = AgglomerativeClustering(linkage='ward',
                                         affinity='euclidean',
                                         n_clusters=nb_clusters)
    clustering.fit(coor)

    # Add the names. ``nipy_spectral`` is the current name of the removed
    # ``spectral`` colormap.
    for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
        ax1.text(x, y, name, ha='center', va="center",
                 color=plt.cm.nipy_spectral(cluster_label / 10.),
                 fontdict={'family': 'Arial', 'size': 8})

    # Strip axis labels and ticks.
    ax1.set_xlabel('')
    ax1.set_ylabel('')
    ax1.set_xticklabels([])
    ax1.set_xticks([])
    ax1.set_yticklabels([])
    ax1.set_yticks([])

    plt.savefig(outputfile, bbox_inches=0)
def clustering_approach(self):
    '''
    Cluster the standardized user data with KMeans and agglomerative
    clustering and report each result through self.analyse_preds.
    IN: self.df_full and self.labels
    OUT: results to stdout
    '''
    print('Fitting clustering model')

    def _swap_binary(label):
        # 1 -> 0, anything else -> 1
        if label == 1:
            return 0
        return 1

    y = self.labels

    # scale data
    X = StandardScaler().fit_transform(self.df_full.values)

    # KMeans
    km_clf = KMeans(n_clusters=2, n_jobs=6)
    km_clf.fit(X)

    # swap labels as super-users are in cluster 0 (messy!!)
    temp = y.apply(_swap_binary)
    print('\nKMeans clustering: ')
    self.analyse_preds(temp, km_clf.labels_)

    # Agglomerative clustering
    print('\nAgglomerative clustering approach: ')
    ac_labels = AgglomerativeClustering().fit_predict(X)
    self.analyse_preds(y, ac_labels)

    return None
def Create_Ext_Agg_cluster(self, stem, stop, processing, remS):
    """Cluster the extended-text rows of ``self.ExtStringCSv``.

    Each CSV row is turned into one string by concatenating a set of its
    columns (chosen by ``self.POS``). The strings are preprocessed,
    count-vectorized and clustered with agglomerative clustering. The
    result is stored on the instance:

    * ``self.cluster_tup_list`` - output of ``self.tuple_Ext_cluster_doc``.
    * ``self.metric`` - silhouette score of the labelling.

    Args:
        stem: Forwarded to ``preprocess_text.preprocess``.
        stop: Forwarded to ``preprocess_text.preprocess``.
        processing: Unused; kept for interface compatibility.
        remS: When truthy, ``preprocess_text.removeS`` is applied as well.

    Raises:
        ValueError: If ``self.affinity`` is neither ``'euclidean'`` nor
            ``'cosine'``. This used to surface as an unbound-local error.
    """
    Allrow_dicts = data_pkg.FileHandling.read_csv(self.ExtStringCSv)

    if self.POS == "ALL_EXT":
        fields = ("Text_original", "Adj_Extended", "Noun_Extended", "Verb_Extended")
    else:
        fields = ("Adj", "Adj_Extended", "Noun", "Noun_Extended")
    Allstrings = ["".join(row_dict[field] for field in fields)
                  for row_dict in Allrow_dicts]

    Allstrings_process = [preprocess_text.preprocess(string_text, stem, stop)
                          for string_text in Allstrings]
    if remS:
        Allstrings_process = [preprocess_text.removeS(text)
                              for text in Allstrings_process]

    vectorizer = CountVectorizer()
    term_doc = vectorizer.fit_transform(Allstrings_process)

    if self.affinity == 'euclidean':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster,
                                              affinity='euclidean')
    elif self.affinity == 'cosine':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster,
                                              linkage='average',
                                              affinity='cosine')
    else:
        raise ValueError("unsupported affinity %r; expected 'euclidean' or 'cosine'"
                         % (self.affinity,))

    # Densify once; both the clustering and the silhouette score need it.
    dense_term_doc = term_doc.toarray()
    Res_Labels = Agg_cluster.fit_predict(dense_term_doc)
    self.cluster_tup_list = self.tuple_Ext_cluster_doc(Res_Labels, Allstrings, Allrow_dicts)

    print(type(term_doc))
    self.metric = metrics.silhouette_score(dense_term_doc, Res_Labels,
                                           metric=self.affinity)
    print(Res_Labels)
    print("n_samples: %d, n_features: %d" % term_doc.shape)
def clustering(data, params):
    """Agglomerative clustering of ``data`` plus per-cluster mean centers.

    Args:
        data: 2-D array of samples (it is indexed with a boolean row mask).
        params: Mapping of clustering parameters. ``'n_clusters'`` is
            required. ``'affinity'`` and ``'linkage'`` fall back to
            ``'euclidean'`` and ``'ward'`` when absent. Other keys are
            ignored.

    Returns:
        ``[cluster_centers, labels]``. ``cluster_centers`` has one row per
        cluster, the mean of its members, because agglomerative clustering
        does not provide centers itself.
    """
    # Read the parameters directly. They used to be turned into local
    # variables with ``exec``, which cannot create function locals on
    # Python 3 and executes whatever text ``params`` contains.
    n_clusters = params['n_clusters']
    affinity = params.get('affinity', 'euclidean')
    linkage = params.get('linkage', 'ward')

    clusters = AgglomerativeClustering(n_clusters=n_clusters,
                                       affinity=affinity,
                                       linkage=linkage)
    clusters.fit(data)

    cluster_centers = array([
        mean(data[clusters.labels_ == i], axis=0)
        for i in range(n_clusters)
    ])
    return [cluster_centers, clusters.labels_]
def agglom(data, n_clusters):
    """Return Ward cluster labels for ``data`` under a 30-NN connectivity graph.

    Args:
        data: Samples to cluster.
        n_clusters: Number of clusters to form.

    Returns:
        The label array produced by ``fit_predict``.
    """
    knn_graph = kneighbors_graph(data, 30, include_self=False)
    # linkage may be ward / average / complete for different results
    cluster = AgglomerativeClustering(n_clusters=n_clusters,
                                      connectivity=knn_graph,
                                      linkage='ward')
    # ``fit_predict`` fits the model itself; the extra ``fit`` call that
    # preceded it built the same tree twice.
    return cluster.fit_predict(data)
def train_agglomerative():
    """Fit agglomerative clustering on the module-level ``X`` and print the labels.

    The model is configured from the module-level ``num_clusters``,
    ``aggl_affinity`` and ``aggl_linkage``.
    """
    print("starting agglomerative clustering...")
    model = AgglomerativeClustering(n_clusters=num_clusters,
                                    affinity=aggl_affinity,
                                    linkage=aggl_linkage)
    fitted = model.fit(X)
    print(fitted.labels_)
def eval_dist(linkage='ward'):
    """Plot ARI and a cluster-distance statistic for k = 2..50.

    For every cluster count the module-level ``x`` is clustered, the labels
    are scored against the module-level ``y`` with the adjusted Rand index,
    and ``calc_distance`` supplies three distance statistics. The left axis
    shows ARI against k; the right axis shows the statistic at index 1.

    Args:
        linkage: Linkage criterion passed to ``AgglomerativeClustering``.
    """
    labels = ['Minimum', 'Maximum', 'Average']
    a_score = []
    idx = []
    d = [[] for _ in range(len(labels))]

    for k in range(2, 50 + 1):
        print('k={}'.format(k))
        est = AgglomerativeClustering(n_clusters=k, linkage=linkage)
        est.fit(x)
        ari_v = metrics.adjusted_rand_score(y, est.labels_)
        ds = calc_distance(k, est.labels_)
        for i, series in enumerate(d):
            series.append(ds[i])
        print(ari_v)
        a_score.append(ari_v)
        idx.append(k)

    fig, axes = plt.subplots(nrows=1, ncols=2)

    axes[0].plot(idx, a_score)
    # ``bottom`` is the supported keyword; ``ymin`` is no longer accepted by
    # current matplotlib.
    axes[0].set_ylim(bottom=0)
    axes[0].set_ylabel('ARI')
    axes[0].set_xlabel('# of clusters')

    # Only the second statistic is drawn. It is labelled so that the
    # ``legend()`` call has an entry to show.
    axes[1].plot(idx, d[1], label=labels[1])
    axes[1].legend()
    axes[1].set_ylabel('distance')
    axes[1].set_xlabel('# of clusters')

    plt.show()
def hierarchical(similarity, concepts=2, euclid=False):
    """Cluster ``similarity`` into ``concepts`` groups.

    Args:
        similarity: Input matrix. With ``euclid=False`` it is converted to a
            distance matrix as ``1 - similarity`` and clustered with
            complete linkage. With ``euclid=True`` it is passed unchanged to
            a default ``AgglomerativeClustering``.
        concepts: Number of clusters.
        euclid: Selects the default model instead of the precomputed one.

    Returns:
        The label array produced by ``fit_predict``.
    """
    if not euclid:
        precomputed = AgglomerativeClustering(n_clusters=concepts,
                                              affinity='precomputed',
                                              linkage='complete')
        return precomputed.fit_predict(1 - similarity)

    return AgglomerativeClustering(n_clusters=concepts).fit_predict(similarity)
def Word2VecReduction(senlist, w2vec, ratio):
    """Replace every word of every sentence with the id of its embedding cluster.

    The distinct words across ``senlist`` are clustered on their ``w2vec``
    vectors (average linkage, cosine affinity). Each sentence then gets
    ``sen_words`` set to the cluster ids (as strings) of its ``word_used``,
    and ``word_dict`` reset to an empty dict. The sentences are modified in
    place.

    Args:
        senlist: Sequence of sentence objects exposing ``word_used``.
        w2vec: Mapping from word to embedding vector; must contain every
            word used.
        ratio: Unused; kept for interface compatibility.

    Returns:
        None.
    """
    # Distinct words in first-seen order, with their vectors in parallel.
    idx2word = []
    word_matrix = []
    useword = set()
    for sentence in senlist:
        for word in sentence.word_used:
            if word not in useword:
                useword.add(word)
                idx2word.append(word)
                word_matrix.append(w2vec[word])

    wlen = len(idx2word)
    print("use words: %d" % wlen)

    if wlen < 2:
        # Nothing to cluster: a single word (if any) forms cluster 0.
        labels = [0] * wlen
    else:
        # At least 100 clusters or 90% of the vocabulary, whichever is
        # larger, but never more clusters than there are words. The
        # unclamped value cannot be fitted on a small vocabulary.
        nclusters = min(max(int(0.9 * wlen), 100), wlen)
        print(nclusters)
        AgloCluster = AgglomerativeClustering(n_clusters=nclusters,
                                              linkage="average",
                                              affinity='cosine')
        AgloCluster.fit(word_matrix)
        labels = AgloCluster.labels_

    word2label = dict(zip(idx2word, labels))
    for sentence in senlist:
        sentence.sen_words = [str(word2label[w]) for w in sentence.word_used]
        sentence.word_dict = {}
    return
def test_connectivity_propagation():
    """Connectivity must be propagated correctly while the ward tree merges.

    The fixture contains many duplicated points. If connectivity changes are
    not propagated correctly during merging, ``fit`` crashes with an
    ``IndexError``; the test passes when ``fit`` simply completes.
    """
    points = [
        (0.014, 0.120),
        (0.014, 0.099),
        (0.014, 0.097),
    ]
    points += [(0.017, 0.153)] * 2
    points += [(0.018, 0.153)] * 7
    points += [
        (0.018, 0.152),
        (0.018, 0.149),
        (0.018, 0.144),
    ]
    X = np.array(points)

    neighbours = kneighbors_graph(X, 10, include_self=False)
    model = AgglomerativeClustering(n_clusters=4, connectivity=neighbours,
                                    linkage="ward")
    model.fit(X)
def wardHierarchical(img):
    """Segment a downsampled copy of ``img`` into 15 regions and plot them.

    The image is resized to 10%, scaled to [0, 1], and its pixels are
    clustered with grid-connected Ward linkage. The region outlines are
    drawn over the downsampled image.

    Args:
        img: Image array accepted by ``sp.misc.imresize``.
            NOTE(review): ``imresize`` is absent from current SciPy; this
            function needs a SciPy version that still ships it.
    """
    print("Compute structured hierarchical clustering...")
    st = time.time()
    n_clusters = 15  # number of regions

    # Everything downstream works on the downsampled image. The pixels and
    # the grid graph used to come from the full-size ``img`` while the
    # labels were reshaped to the downsampled shape, which cannot match.
    face = sp.misc.imresize(img, 0.10) / 255.
    X = np.reshape(face, (-1, 1))
    connectivity = grid_to_graph(*face.shape)

    ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                                   connectivity=connectivity)
    ward.fit(X)
    label = np.reshape(ward.labels_, face.shape)

    print("Elapsed time: ", time.time() - st)
    print("Number of pixels: ", label.size)
    print("Number of clusters: ", np.unique(label).size)

    plt.figure(figsize=(5, 5))
    plt.imshow(face, cmap=plt.cm.gray)
    for l in range(n_clusters):
        # ``nipy_spectral`` is the current name of the removed ``spectral``
        # colormap. The former ``contours=1`` argument is dropped because
        # ``contour`` has no such parameter.
        plt.contour(label == l,
                    colors=[plt.cm.nipy_spectral(l / float(n_clusters)), ])
    plt.xticks(())
    plt.yticks(())
    plt.show()
def agglomerative_clusters(self, word_vectors):
    """Cluster ``word_vectors`` into 100 groups and return the labels.

    A ``BallTree`` is built first and used to derive a 1-nearest-neighbour
    connectivity graph, which then constrains average-linkage clustering.
    The elapsed time of each of the three stages is printed.

    Args:
        word_vectors: Matrix of vectors to cluster.

    Returns:
        The ``labels_`` array of the fitted model.
    """
    # Stage 1: BallTree over the vectors.
    tick = time.time()
    Ball_Tree = BallTree(word_vectors, leaf_size=200, metric="minkowski")
    print("BallTree object in " + str(time.time() - tick))

    # Stage 2: k-neighbours connectivity graph (``workers`` is module-level).
    tick = time.time()
    connectivity_graph = kneighbors_graph(
        Ball_Tree,
        n_neighbors=1,
        mode="connectivity",
        metric="minkowski",
        p=2,
        include_self=False,
        n_jobs=workers,
    )
    print("Pre-compute connectivity graph in " + str(time.time() - tick))

    # Stage 3: agglomerative clustering under that connectivity.
    tick = time.time()
    Agl = AgglomerativeClustering(
        n_clusters=100,
        affinity="minkowski",
        connectivity=connectivity_graph,
        compute_full_tree=True,
        linkage="average",
    )
    Agl.fit(word_vectors)
    print("Agglomerative clustering in " + str(time.time() - tick))

    return Agl.labels_
def agglomClus(distmat, k, sendData=False):
    '''
    Hierarchically cluster the items described by a pairwise distance matrix
    and show the resulting dendrogram.

    distmat: path to a CSV file holding the square pairwise distance matrix
    k: number of clusters (complete linkage, precomputed distances)
    sendData: when true, also write the merge tree to 'agglom_testing.csv',
        one row per merge as (node_id, left, right)

    Leaf names come from the first column of
    'n10_PCA/562TPDmatrixSim kmed 200_n10PCA.csv', skipping its first two rows.
    '''
    from sklearn.cluster import AgglomerativeClustering
    from matplotlib import pyplot as plt
    import itertools

    # Pairwise (generalized Manhattan) distances: read as strings, then
    # converted to floats.
    with open(distmat, 'r', newline='\n') as dist_file:
        diArr = np.array(list(csv.reader(dist_file))).astype(float)

    # Set up and fit the agglomerative clustering model.
    mclus = AgglomerativeClustering(n_clusters=k, affinity='precomputed',
                                    linkage='complete')
    clusfit = mclus.fit(diArr)

    # Chord names, in order, from the PCA-based data. Other topN runs used
    # '7470TPDmatrixSim kmed 50_n10PCA.csv' and
    # '2510TPDmatrixSim kmed 500_n10PCA.csv'.
    with open('n10_PCA/562TPDmatrixSim kmed 200_n10PCA.csv', 'r',
              newline='\n') as names_file:
        chdnameslst = [row[0]
                       for i, row in enumerate(csv.reader(names_file))
                       if i >= 2]

    # Optionally write out the agglomerative mergings as CSV.
    if sendData:
        # Merge nodes are numbered after the leaves.
        node_ids = itertools.count(diArr.shape[0])
        with open('agglom_testing.csv', 'w', newline='\n') as out_file:
            lw = csv.writer(out_file)
            for pair in clusfit.children_:
                # Explicit column order: node_id, left, right.
                lw.writerow([next(node_ids), pair[0], pair[1]])

    # Plot a dendrogram of the agglomerative hierarchical clustering.
    plt.title('Hierarchical Clustering Dendrogram')
    plot_dendrogram(clusfit, labels=chdnameslst, show_leaf_counts=True,
                    leaf_font_size=8, leaf_rotation=45)
    plt.show()
def clusterWithSimMatrix(simMatrix, num):
    """Fit complete-linkage clustering on a precomputed matrix.

    Args:
        simMatrix: Square matrix handed to ``fit`` with
            ``affinity='precomputed'``.
            NOTE(review): the name suggests similarities, whereas a
            precomputed affinity is treated as distances - confirm which
            the callers pass.
        num: Number of clusters.

    Returns:
        The fitted ``AgglomerativeClustering`` instance.
    """
    model = AgglomerativeClustering(n_clusters=num,
                                    affinity='precomputed',
                                    linkage='complete')
    return model.fit(simMatrix)
def agglomerative_clustering(self, samples):
    """Cluster ``samples`` using the instance's linkage, metric and cluster count.

    When ``self.metric`` is ``"gaussian"`` the callable
    ``similairty_metrics.gaussianSimGraph`` is used as the affinity;
    otherwise ``self.metric`` is passed through unchanged.

    Args:
        samples: Samples to cluster.

    Returns:
        The ``labels_`` array of the fitted model.
    """
    if self.metric == "gaussian":
        affinityArg = similairty_metrics.gaussianSimGraph
    else:
        affinityArg = self.metric

    model = AgglomerativeClustering(linkage=self.linkage,
                                    n_clusters=self.num_clusters,
                                    affinity=affinityArg)
    model.fit(samples)
    return model.labels_
def pca_ward_tree(self):
    """Plot a three-cluster Ward clustering of the PCA-reduced data.

    Runs ``self.pc_analysis()`` first when ``self.pca_reduced`` is falsy.
    The reduced data is then embedded in 2-D with ``SpectralEmbedding`` for
    display and clustered with Ward linkage. Both results go to
    ``self._plot_ward_tree``.

    Returns:
        The ``plt`` module, so the caller can show or save the figure.
    """
    # NOTE(review): if ``pca_reduced`` is a multi-element numpy array, this
    # truthiness test raises instead of returning False - confirm what
    # sentinel the attribute holds before the analysis has run.
    if not self.pca_reduced:
        self.pc_analysis()

    embedder = manifold.SpectralEmbedding(n_components=2)
    reduced_red = embedder.fit_transform(self.pca_reduced)

    ward = AgglomerativeClustering(linkage='ward', n_clusters=3)
    ward_labels = ward.fit(self.pca_reduced).labels_

    self._plot_ward_tree(reduced_red, self.pca_reduced, self.player_value, ward_labels)
    return plt
def agglomClusCat(distmat, k, crit):
    '''
    Hierarchically cluster the items described by a pairwise distance matrix
    and write flat cluster assignments to 'truncDend_memb_test.csv'.

    distmat: path to a CSV file holding the square pairwise distance matrix
    k: number of clusters for the sklearn model; also passed to fcluster as
        its threshold argument
    crit: criterion for fcluster ('distance' best option)

    Leaf names come from the first column of
    'n10_PCA/562TPDmatrixSim kmed 200_n10PCA.csv', skipping its first two rows.
    At most the first 200 (cluster, name) pairs are written, sorted by cluster.
    '''
    from scipy.cluster.hierarchy import fcluster
    from sklearn.cluster import AgglomerativeClustering
    import itertools

    # Pairwise (generalized Manhattan) distances: read as strings, then
    # converted to floats.
    with open(distmat, 'r', newline='\n') as dist_file:
        diArr = np.array(list(csv.reader(dist_file))).astype(float)

    # Set up and fit the agglomerative clustering model.
    mclus = AgglomerativeClustering(n_clusters=k, affinity='precomputed',
                                    linkage='complete')
    clusfit = mclus.fit(diArr)

    # Chord names, in order, from the PCA-based data. Other topN runs used
    # '7470TPDmatrixSim kmed 50_n10PCA.csv' and
    # '2510TPDmatrixSim kmed 500_n10PCA.csv'.
    with open('n10_PCA/562TPDmatrixSim kmed 200_n10PCA.csv', 'r',
              newline='\n') as names_file:
        chdnameslst = [row[0]
                       for i, row in enumerate(csv.reader(names_file))
                       if i >= 2]

    # NOTE(review): this relies on plot_dendrogram returning a linkage matrix
    # that fcluster accepts - confirm against its definition.
    linkage_matrix = plot_dendrogram(clusfit, labels=chdnameslst,
                                     show_leaf_counts=True, leaf_font_size=8,
                                     leaf_rotation=45)
    clusters = fcluster(linkage_matrix, k, criterion=crit)

    # Pair each leaf with its name. The 200-item cap matches the previous
    # hard-coded loop, but shorter inputs are now truncated instead of
    # raising IndexError.
    assigns = [[cluster_id, name]
               for cluster_id, name in itertools.islice(zip(clusters, chdnameslst), 200)]
    sassigns = sorted(assigns, key=lambda pair: pair[0])

    # Send out the leaf cluster membership data.
    with open('truncDend_memb_test.csv', 'w', newline='\n') as out_file:
        csv.writer(out_file).writerows(sassigns)
def get_topics(X_lsi, text_names, nk=1):
    """Assign every text to one of ``nk`` topics by clustering ``X_lsi``.

    Uses average-linkage agglomerative clustering with cosine affinity.

    Args:
        X_lsi: Feature matrix, one row per text.
        text_names: Names paired positionally with the rows of ``X_lsi``.
        nk: Number of topics (clusters).

    Returns:
        ``(paper_to_topic, topic_to_papers)``: a ``defaultdict(int)`` mapping
        each name to its topic id and a ``defaultdict(list)`` mapping each
        topic id to the names assigned to it.
    """
    clusterer = AgglomerativeClustering(n_clusters=nk, affinity='cosine',
                                        linkage='average')
    topics = clusterer.fit_predict(X_lsi)

    paper_to_topic = defaultdict(int)
    paper_to_topic.update(zip(text_names, topics))

    topic_to_papers = defaultdict(list)
    for paper, topic in zip(text_names, topics):
        topic_to_papers[topic].append(paper)

    return (paper_to_topic, topic_to_papers)
def CreateCluster(self):
    """Load the array at ``self.DistanceFile``, cluster it and print the labels.

    The loaded array and the resulting labels are printed; nothing is stored
    on the instance or returned. Clustering uses average linkage with
    ``self.affinity`` and ``self.num_cluster`` clusters.
    """
    # ``np.load`` opens the path itself. The handle that used to be opened
    # first with the Python-2-only ``file()`` builtin was never read.
    SimArray = np.load(self.DistanceFile)
    print(SimArray)

    AggClusterDistObj = AgglomerativeClustering(n_clusters=self.num_cluster,
                                                linkage='average',
                                                affinity=self.affinity)
    Res_Labels = AggClusterDistObj.fit_predict(SimArray)
    print(Res_Labels)
def hierarchical(X, num_clusters):
    """Hierarchical clustering on the standardized values of ``X``.

    Args:
        X: pandas DataFrame of features.
        num_clusters: Number of clusters to form.

    Returns:
        Array of cluster labels, one per row of ``X``.
    """
    model = AgglomerativeClustering(n_clusters=num_clusters)
    # ``DataFrame.as_matrix()`` no longer exists in pandas; ``.values``
    # returns the same ndarray on every version.
    cleanX = preprocessing.scale(X.values)
    model.fit(cleanX)
    return model.labels_
def cluster(X, n_clusters):
    """Return complete-linkage agglomerative cluster labels for ``X``.

    Args:
        X: Samples to cluster.
        n_clusters: Number of clusters to form.

    Returns:
        The label array produced by ``fit_predict``.
    """
    from sklearn.cluster import AgglomerativeClustering

    model = AgglomerativeClustering(n_clusters=n_clusters, linkage="complete")
    return model.fit_predict(X)
def run_algorithm_for_k(k, linkage):
    """Cluster the module-level ``x`` into ``k`` groups and record the silhouette.

    The Euclidean silhouette score of the labelling is appended to the
    module-level ``silhouette_scores`` list; nothing is returned.

    Args:
        k: Number of clusters.
        linkage: Linkage criterion passed to ``AgglomerativeClustering``.
    """
    model = AgglomerativeClustering(n_clusters=k, affinity='euclidean',
                                    linkage=linkage)
    model.fit(x)
    silhouette_scores.append(
        metrics.silhouette_score(x, model.labels_, metric='euclidean'))
("The festival was generally well received by locals, and businesses in the area would typically put up signs welcoming festival-goers to their town.", "Music"), ("As a result of the location of the music festival, numerous live albums and videos have been recorded or filmed in Bushnell, including the annual Cornerstone Festival DVD. ", "Music"), ("Cornerstone held its final festival in 2012 and no longer operates.", "Music"), ("Beginning in 1908, the Truman Pioneer Stud Farm in Bushnell was home to one of the largest horse shows in the Midwest.", "Horse show"), ("The show was well known for imported European horses.", "Horse show"), ("The Bushnell Horse Show features some of the best Belgian and Percheron hitches in the country. Teams have come from many different states and Canada to compete.", "Horse show"), ] sentences = [row[0] for row in corpus] corpus_embeddings = embedder.encode(sentences) num_clusters = len(set([row[1] for row in corpus])) #Sklearn clustering km = AgglomerativeClustering(n_clusters=num_clusters) km.fit(corpus_embeddings) cluster_assignment = km.labels_ clustered_sentences = [[] for i in range(num_clusters)] for sentence_id, cluster_id in enumerate(cluster_assignment): clustered_sentences[cluster_id].append(corpus[sentence_id]) for i, cluster in enumerate(clustered_sentences): print("Cluster ", i+1) for row in cluster: print("(Gold label: {}) - {}".format(row[1], row[0])) print("")
# Dendrogram of the average-linkage hierarchy built from ``upper_dists``.
plt.figure(figsize=(10, 7))
plt.title("Average Text Dendograms")
average_linkage = shc.linkage(upper_dists, method='average')
dend = shc.dendrogram(average_linkage)
# plt.show()

method = 'average'
from sklearn import metrics

# Per-threshold results of the sweep below.
points_average = []
labels_average = []
values_in_range = []
n_clusters_average = []
the_range = np.arange(0.05, 1, 0.05)

# Cut the hierarchy at every threshold, record the labelling and its
# silhouette score, and keep the labels as a new column per threshold.
# NOTE(review): ``sim_matrix`` is passed where precomputed distances are
# expected - confirm it holds distances rather than similarities.
for x in the_range:
    values_in_range.append(x)
    cluster_topics = AgglomerativeClustering(n_clusters=None,
                                             affinity='precomputed',
                                             linkage=method,
                                             distance_threshold=x)
    model_topics = cluster_topics.fit(sim_matrix)
    labels_average.append(model_topics.labels_)
    n_clusters_average.append(model_topics.n_clusters_)
    score = metrics.silhouette_score(sim_matrix, model_topics.labels_,
                                     metric='precomputed')
    print(x, score)
    points_average.append([x, score])
    df_notifications['cluster_' + str(x)] = model_topics.labels_

df_notifications.to_csv('data_silhoutte_scores.csv')

# Silhouette score against cut-off threshold.
plt.plot([point[0] for point in points_average],
         [point[1] for point in points_average])
plt.title('average method/ title+body hirarchical model silhoutte score')
plt.xlabel('cut off threshold')
plt.ylabel('Silhoutte Score')
plt.show()
type(df_norm)

# Complete-linkage hierarchy over the normalized data, shown as a dendrogram.
z = linkage(df_norm, method="complete", metric="euclidean")

plt.figure(figsize=(15, 5))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Index')
plt.ylabel('Distance')
sch.dendrogram(
    z,
    leaf_rotation=0.,   # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()

# Now apply AgglomerativeClustering, choosing 3 clusters from the dendrogram.
from sklearn.cluster import AgglomerativeClustering

h_complete = AgglomerativeClustering(n_clusters=3, linkage='complete',
                                     affinity="euclidean")
h_complete = h_complete.fit(df_norm)

# NOTE(review): the Series gets a default 0..n-1 index, so the assignment
# below aligns correctly only if ``frames`` uses the same index - confirm.
cluster_labels = pd.Series(h_complete.labels_)

# Add the labels as a new column, then move that column (position 17) to the front.
frames['clust'] = cluster_labels
frames = frames.iloc[:, [17] + list(range(17))]
frames

# Per-cluster median of the columns from position 2 onward.
frames.iloc[:, 2:].groupby(frames.clust).median()

## Cluster 2 is more suitable

# Write the labelled table to a CSV file.
frames.to_csv("PCA.csv", encoding="utf-8")
import numpy as np import pandas as pd from scipy import ndimage from scipy.cluster import hierarchy from scipy.spatial import distance_matrix from matplotlib import pyplot as plt from sklearn import manifold, datasets from sklearn.cluster import AgglomerativeClustering from sklearn.datasets.samples_generator import make_blobs X1, y1 = make_blobs(n_samples=50, centers=[[4, 4], [-2, -1], [1, 1], [10, 4]], cluster_std=0.9) plt.scatter(X1[:, 0], X1[:, 1], marker='o') agglom = AgglomerativeClustering(n_clusters=4, linkage='average') agglom.fit(X1, y1) plt.figure(figsize=(6, 4)) x_min, x_max = np.min(X1, axis=0), np.max(X1, axis=0) X1 = (X1 - x_min) / (x_max - x_min) for i in range(X1.shape[0]): plt.text(X1[i, 0], X1[i, 1], str(y1[i]), color=plt.cm.nipy_spectral(agglom.labels_[i] / 10.), fontdict={
import unittest import numpy as np from reval.best_nclust_cv import FindBestClustCV, _confint from sklearn.neighbors import KNeighborsClassifier from sklearn.cluster import AgglomerativeClustering import math # Modify to test other functions and parameters RNDLABELS_ITER = 10 CLASSIFIER = KNeighborsClassifier(n_neighbors=5) CLUSTERING = AgglomerativeClustering() NCLUST_RANGE = [2, 4] NFOLD = 2 class TestBestNclusterCV(unittest.TestCase): @classmethod def setUp(cls): cls.s = CLASSIFIER cls.c = CLUSTERING cls.nrand = RNDLABELS_ITER cls.nfold = NFOLD cls.nclust_range = NCLUST_RANGE cls.findbest = FindBestClustCV(cls.nfold, cls.nclust_range, cls.s, cls.c, cls.nrand) def test_best_nclust(self): data = np.array([[0] * 20, [1] * 20] * 20) strat_vect = np.array([0, 1] * 20) metrics, best_nclust, _ = self.findbest.best_nclust(data,
text_file = open("data/0208_3_" + mode + "Linkage_score.txt", "a+") text_file.write("\n************" + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "************\n") print("\n************" + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "************\n") text_file.close() ### Number of clusters for n_clusters in range(2, 31): ## linkage{"ward","complete","average","single"}, optional (default="ward") model = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage=mode) predict = pd.DataFrame(model.fit_predict(df)) predict.columns = ['predict'] # concatenate labels to df as a new column r = pd.concat([df, predict], axis=1) #print(r.sample(10)) # clusters silhouette_avg = silhouette_score(df.values, predict.values.ravel()) DBI_avg = davies_bouldin_score(df.values, predict.values.ravel()) text_file = open("data/0208_3_" + mode + "Linkage_score.txt", "a+") text_file.write("\n\nn_clusters =" + str(n_clusters) + "The average silhouette_score is :" + str(silhouette_avg))
def decision_function(self, X): return self.classifier_.decision_function(X) def plot_scatter(X, color, alpha=0.5): return plt.scatter(X[:, 0], X[:, 1], c=color, alpha=alpha, edgecolor='k') # Generate some training data from clustering X, y = make_blobs(n_samples=N_SAMPLES, cluster_std=[1.0, 1.0, 0.5], centers=[(-5, -5), (0, 0), (5, 5)], random_state=RANDOM_STATE) # Train a clustering algorithm on the training data and get the cluster labels clusterer = AgglomerativeClustering(n_clusters=3) cluster_labels = clusterer.fit_predict(X) plt.figure(figsize=(12, 4)) plt.subplot(131) plot_scatter(X, cluster_labels) plt.title("Ward Linkage") # Generate new samples and plot them along with the original dataset X_new, y_new = make_blobs(n_samples=10, centers=[(-7, -1), (-2, 4), (3, 6)], random_state=RANDOM_STATE) plt.subplot(132) plot_scatter(X, cluster_labels)
b = b + 7 # Del NaN and 1 ListKey = [ y for x, y in zip(ListVal, ListKey) if not (math.isnan(x[0]) or (x[0] == 1 and x[1] == 1)) ] ListVal = [ x for x in ListVal if not (math.isnan(x[0]) or (x[0] == 1 and x[1] == 1)) ] DictF = {x: y for x, y in zip(ListKey, ListVal)} print('Processed {} descriptors'.format(len(DictF))) for i in range(2, 7): agg = AC(n_clusters=i, linkage='ward') assignment = agg.fit_predict(ListVal) result = Counter(assignment) clustElem = {} for ind, val in enumerate(assignment): if val + 1 not in clustElem.keys(): clustElem[val + 1] = [ListKey[ind]] else: clustElem[val + 1].append(ListKey[ind]) clustMedian = {i[0]: i[1][len(i[1]) // 2] for i in clustElem.items()} print('========== {} lavel =========='.format(i - 1)) print('{} clusters'.format(i)) cE = list(clustElem.items()) cE.sort() for j in cE: print('Number of elements in {0} cluster: {1}'.format(j[0], len(j[1])))
y_true= data[:, 11] y_pred= classifier.predict(data[:, 0:11]) print("4.k-NN classifier: \n", recall_score(y_true, y_pred, average=None)) elif ans==4: sel=input("Select the algorithm ((h)ierarchicalor (k)-means): ") if sel=='h': from sklearn.cluster import AgglomerativeClustering data = np.genfromtxt("./winequality-red.csv", dtype= np.float32, delimiter = ";", skip_header= 1) X=data[:, 0:11] cluster=int(input("Input the number of clusters: ")) model = AgglomerativeClustering(n_clusters= cluster) model.fit(X) first=int(input("Input the number of the first wine: ")) second=int(input("Input the number of the second wine: ")) if model.labels_[first]== model.labels_[second]: print("Result : %d and %d are in the same cluster"%(first,second)) else: print("Result : %d and %d are in the different cluster"%(first,second)) if sel=='k': from sklearn.cluster import KMeans data = np.genfromtxt("./winequality-red.csv", dtype= np.float32, delimiter = ";", skip_header= 1) X=data[:, 0:11]
# Import libraries
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
#----------------------------------------------------

# Applying the agglomerative clustering model.
#
# Constructor reference:
#   AgglomerativeClustering(n_clusters=2, memory=None, connectivity=None,
#                           compute_full_tree='auto', linkage='ward')
#
# `affinity=` is intentionally not passed: it was deprecated in
# scikit-learn 1.2 and removed in 1.4, and Ward linkage only supports the
# (default) Euclidean distance anyway.
AggClusteringModel = AgglomerativeClustering(
    n_clusters=5,
    linkage='ward',  # it can be complete, average, single
)

# AgglomerativeClustering has no separate predict(); each fit_predict call
# re-fits the estimator on the data it is given, so the train and test
# labellings are independent of one another.
y_pred_train = AggClusteringModel.fit_predict(X_train)
y_pred_test = AggClusteringModel.fit_predict(X_test)

# Draw the hierarchical graph for the first 30 training samples
dendrogram = sch.dendrogram(sch.linkage(X_train[:30, :], method='ward'))  # it can be complete, average, single
plt.title('Training Set')
plt.xlabel('X Values')
plt.ylabel('Distances')
plt.show()

# Draw the hierarchical graph for the first 30 test samples
dendrogram = sch.dendrogram(sch.linkage(X_test[:30, :], method='ward'))  # it can be complete, average, single
plt.title('Test Set')
plt.xlabel('X Value')
plt.ylabel('Distances')
cluster_y = [i[1] for i in kmeans.cluster_centers_]
#plt.plot(x,y,'.',alpha=0.15)
#plt.plot(cluster_x,cluster_y,'o')
cluster_plot(15, y_km)
sns.kdeplot(x, y, cmap='Blues', shade=True, shade_lowest=False, bw=2, alpha=0.6)
plt.show()
plt.close()

#%%
# import hierarchical clustering libraries
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

# create dendrogram
dendrogram = sch.dendrogram(sch.linkage(points, method='ward'))

# create clusters
# `affinity='euclidean'` is omitted: the keyword was removed in
# scikit-learn 1.4 and Euclidean is the default (and the only metric Ward
# linkage accepts), so the clustering is unchanged.
hc = AgglomerativeClustering(n_clusters=15, linkage='ward')

# save clusters for chart
y_hc = hc.fit_predict(points)

#%%
cluster_plot(6, y_hc)
sns.kdeplot(x, y, cmap='Blues', shade=False, bw=2, alpha=0.5)
plt.show()
plt.close()
#veri kümesi büyüdükçe hiyerarşik bölütleme k-means den daha iyi çalışmaz. #büyük veri kümesi için uygun dağildir. #kütüphaneler import pandas as pd import numpy as np import matplotlib.pyplot as plt #2.1 veri yükleme data = pd.read_csv('../data/customer.csv') X = data.iloc[:, 3:].values #hierarchical clustering from sklearn.cluster import AgglomerativeClustering agc = AgglomerativeClustering(n_clusters=4, affinity='euclidean', linkage='ward') y_prediction = agc.fit_predict(X) print(y_prediction) #n_clusters kaç küme olacak #affinity mesafe ne ölçüsü ile alınacak #linkage clusterlar arası mesafe nasıl ölçülecek #ward kullanacaksak sadece euclidean ölçü birimi kullanılmak zorunda #fit inşa ediyor fit_predict hem inşa et hemde tahmin et #yapılan kümeleme işleminin grafiğini çizmek plt.scatter(X[y_prediction == 0, 0], X[y_prediction == 0, 1], s=100, c='red') plt.scatter(X[y_prediction == 1, 0], X[y_prediction == 1, 1], s=100, c='blue') plt.scatter(X[y_prediction == 2, 0], X[y_prediction == 2, 1], s=100, c='green') plt.scatter(X[y_prediction == 3, 0], X[y_prediction == 3, 1],
def Cluster(A):
    """Assign Ward-linkage cluster labels to the samples in ``A``.

    The number of clusters is read from the module-level ``cluster_number``.

    Args:
        A: Sequence (or array) of samples, one row per sample.

    Returns:
        The label array produced by ``fit_predict``, or ``None`` when ``A``
        holds fewer than two samples (nothing to cluster).
    """
    if len(A) <= 1:
        return None

    # `affinity=` is not passed: it was removed in scikit-learn 1.4, and
    # Euclidean (the default) is the only distance Ward linkage supports.
    hc = AgglomerativeClustering(n_clusters=cluster_number, linkage='ward')
    return hc.fit_predict(A)
location_of_images = "../../../images/"
sys.path.append(os.path.join(os.path.dirname(__file__), "../functions/"))

# Load the mask and keep slices 10..14 along the last axis.
data = np.load('cluster_mask.npy')
data_new = data[..., 10:15]

# One single-feature sample per voxel.
X = data_new.reshape(-1, 1)

# Connectivity graph over the voxel grid, passed to the clustering below
# as its `connectivity` constraint.
connectivity = grid_to_graph(n_x=data_new.shape[0],
                             n_y=data_new.shape[1],
                             n_z=data_new.shape[2])

st = time.time()
n_clusters = 7  # number of regions
ward = AgglomerativeClustering(
    n_clusters=n_clusters,
    linkage='ward',
    connectivity=connectivity,
).fit(X)
label = ward.labels_.reshape(data_new.shape)

label_mean = np.zeros(n_clusters)
center = []

# Find the average value and the centroid (mean voxel index) per cluster.
for j in range(n_clusters):
    mask = (label == j)
    index = np.where(mask)
    centroid = (np.mean(index[0]), np.mean(index[1]), np.mean(index[2]))
    center.append(centroid)
    label_mean[j] = np.mean(data_new[mask])

# Print the plots
for i in range(data_new.shape[-1]):
    plt.figure()
def main():
    """Cluster the 'test' features of a superclass and save the result.

    Usage: ``<script> <superclass>`` where ``<superclass>`` is a key of
    ``classNum`` (e.g. ``Animals`` or ``Fruits``).

    Reads ``features_<superclass>.pickle``, clusters every sample labelled
    ``'test'``, replaces each such sample's feature vector by the centre of
    its cluster, and writes:

    * ``images_cluster.json``       - cluster id -> list of images
    * ``images_cluster_eval.json``  - output of ``eval_cluster_result``
    * ``features_<superclass>_cluster.pickle`` - the updated feature data

    Exits with status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 2:
        print('Param error')
        sys.exit(1)
    superclass = sys.argv[1]

    # Clustering back-end: 'KMeans' or 'AC' (agglomerative, the active one).
    cluster = 'AC'
    classNum = {'Animals': 10, 'Fruits': 10}
    n_classes = classNum[superclass]

    features_path = 'features_%s.pickle' % (superclass)
    features_path_cluster = 'features_%s_cluster.pickle' % (superclass)

    # NOTE(review): pickle.load can execute arbitrary code; only use this
    # with feature files produced by a trusted pipeline.
    with open(features_path, 'rb') as fread:
        data_all = pickle.load(fread)
    features_all = data_all['features_all']
    labels_all = data_all['labels_all']
    images_all = data_all['images_all']

    test_labels_idxes = np.where(np.array(labels_all) == 'test')[0]
    test_features = list(np.array(features_all)[test_labels_idxes])
    test_images = list(np.array(images_all)[test_labels_idxes])

    if cluster == 'KMeans':
        clf = KMeans(n_clusters=n_classes, max_iter=300)
        clf.fit(test_features)
        # One batched predict instead of one call per sample.
        cluster_labels = clf.predict(test_features)
        cluster_centers = clf.cluster_centers_
    else:
        clf = AgglomerativeClustering(n_clusters=n_classes, linkage='complete')
        cluster_labels = clf.fit_predict(test_features)
        # The estimator exposes no centres, so use the mean feature vector
        # of each cluster's members.
        feature_matrix = np.array(test_features)
        cluster_centers = {}
        for i in range(n_classes):
            members = feature_matrix[np.array(cluster_labels) == i]
            cluster_centers[i] = list(np.mean(members, axis=0))

    images_cluster = {}
    for idx, image in enumerate(test_images):
        cluster_idx = cluster_labels[idx]
        features_all[test_labels_idxes[idx]] = cluster_centers[cluster_idx]
        images_cluster.setdefault(str(cluster_idx), []).append(image)

    with open('images_cluster.json', 'w') as fimages:
        fimages.write(json.dumps(images_cluster))

    with open('images_cluster_eval.json', 'w') as feval:
        feval.write(json.dumps(eval_cluster_result(superclass, images_cluster)))

    data_all = {
        'features_all': features_all,
        'labels_all': labels_all,
        'images_all': images_all
    }
    # Opened only now, so a failure earlier on no longer leaves an empty
    # (truncated) output pickle behind.
    with open(features_path_cluster, 'wb') as fsave:
        pickle.dump(data_all, fsave)
data=data) bench_AffinityPropagation(AffinityPropagation(convergence_iter=20), name="AP", data=data) bench_MeanShift(MeanShift(), name="MeanShift", data=data) # bench_SpectralClustering(SpectralClustering(),name="MeanShift", data=data) bench_SpectralClustering(SpectralClustering(n_clusters=n_digits), name="Spectral", data=data) bench_AgglomerativeClustering(AgglomerativeClustering(n_clusters=n_digits, linkage='ward', connectivity=None), name="Ward-hier", data=data) bench_AgglomerativeClustering(AgglomerativeClustering(n_clusters=n_digits, linkage='complete', connectivity=None), name="Agglomerative", data=data) bench_DBSCAN(DBSCAN(eps=5, min_samples=3), name="DBSCAN", data=data) bench_GaussianMixture(mixture.GaussianMixture(n_components=n_digits, covariance_type='full'), name="GaussMix",
import os

import matplotlib.cm as cm
import matplotlib.pyplot as plt  # was missing: `plt` is used below
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering

# Load the data set from the current working directory and keep columns
# 3 and 4 as the two clustering features.
path = os.path.join(os.getcwd(), 'shopping_data.csv')
customer_data = pd.read_csv(path)
data = customer_data.iloc[:, 3:5].values

n_clusters = 5
linkage_list = ['single', 'average', 'complete', 'ward']

# One scatter plot per linkage criterion, coloured by cluster label.
for l in linkage_list:
    # `affinity='euclidean'` is omitted: the keyword was removed in
    # scikit-learn 1.4 and Euclidean is the default distance for every
    # linkage, so the result is unchanged.
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage=l)
    # NOTE: fit() returns the fitted estimator, not a label array; the
    # labels are read from its `labels_` attribute below.
    cluster_labels = clusterer.fit(data)

    plt.figure()
    colors = cm.nipy_spectral(
        cluster_labels.labels_.astype(float) / n_clusters)
    plt.scatter(data[:, 0], data[:, 1], marker='.', s=70, lw=0, alpha=0.7,
                c=colors, edgecolors='k')
    plt.title(l)
""" import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib qt #importing data sets ds=pd.read_csv('Mall_Customers.csv') X=ds.iloc[:,[3,4]].values #plotting dendogram import scipy.cluster.hierarchy as sch dendogram=sch.dendrogram(sch.linkage(X, method='ward')) plt.title('Dendogram') plt.xlabel('salary') plt.ylabel('customerscore') #fittin model heirachical clusturing from sklearn.cluster import AgglomerativeClustering hc=AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=5) y_hc=hc.fit_predict(X) #plotting graph scatter(X[y_hc==0,0],X[y_hc==0,1],color='red',label='Cautious',s=100) plt.scatter(X[y_hc==1,0],X[y_hc==1,1],color='green',label='Standerd',s=100) plt.scatter(X[y_hc==2,0],X[y_hc==2,1],color='blue',label='target',s=100) plt.scatter(X[y_hc==3,0],X[y_hc==3,1],color='black',label='Careless',s=100) plt.scatter(X[y_hc==4,0],X[y_hc==4,1],color='magenta',label='Sensible',s=100) plt.legend() plt.show()
def clustering(self, affinity):
    """Build an unfitted, distance-threshold agglomerative clusterer.

    The number of clusters is left open (``n_clusters=None``); the cut is
    determined by ``self.threshold``, and ``self.linkage`` selects the
    linkage criterion.

    Args:
        affinity: Distance metric name (or callable) handed to scikit-learn.

    Returns:
        A configured ``AgglomerativeClustering`` instance.
    """
    common = dict(distance_threshold=self.threshold,
                  n_clusters=None,
                  linkage=self.linkage)
    try:
        # scikit-learn >= 1.2 names this parameter `metric`; `affinity`
        # was removed altogether in 1.4.
        return AgglomerativeClustering(metric=affinity, **common)
    except TypeError:
        # Older scikit-learn releases only know the `affinity` keyword.
        return AgglomerativeClustering(affinity=affinity, **common)
self.direction_count) / self.instance_count * 100 def to_string(self): print('feature: ', self.feature, '\ninstance_count: ', self.instance_count, '\ndirection_count: ', self.direction_count, '\nreliability: ', self.reliability, ' %') datafile = 'test.csv' m = Model(datafile) model = m.model direction = m.direction print(model.shape) n = 10 print('just before clustering') clusters = AgglomerativeClustering(linkage='average').fit(model[n:, :]) patterns = [] for x in range(model.shape[0]): if direction[x] == 1: direction_count = np.array([1, 0, 0]) elif direction[x] == -1: direction_count = np.array([0, 1, 0]) else: direction_count = np.array([0, 0, 1]) patterns.append(Pattern(model[x], 1, direction_count)) for x in clusters.children_: feature = np.average((patterns[x[0]].feature, patterns[x[1]].feature), axis=0) instance_count = patterns[x[0]].instance_count + patterns[ x[1]].instance_count
import datetime
from sklearn.cluster import AgglomerativeClustering

time_stamps = [
    datetime.datetime(2020, 7, 18, 5, 1, 3, 23),
    datetime.datetime(2020, 7, 19, 5, 1, 3, 222),
    datetime.datetime(2020, 7, 21, 0, 0, 0, 0),
    datetime.datetime(2020, 7, 21, 0, 0, 0, 0),
    datetime.datetime(2020, 7, 21, 3, 2, 1, 110),
    datetime.datetime(2020, 7, 23, 0, 0, 0, 0),
    datetime.datetime(2020, 7, 24, 0, 0, 0, 0),
    datetime.datetime(2020, 7, 25, 0, 0, 0, 0),
]
# y = incremental_average(coords)
# print(y)

# Alternative kept for reference (precomputed distance matrix):
# clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed', linkage='average', distance_threshold=1.0)

# Cluster the first four points and the remaining points separately.  A
# fresh estimator is built for each batch; the number of clusters is left
# open and decided by the distance threshold of 1.0.
for _batch in (coords[:4], coords[4:]):
    clustering = AgglomerativeClustering(n_clusters=None,
                                         linkage='average',
                                         distance_threshold=1.0)
    # print(sim_map)
    # print(dist_mat)
    y_pred = clustering.fit(_batch).labels_
    print('orig batch:', y_pred)

# Finally cluster all points together with the same settings.
clustering = AgglomerativeClustering(n_clusters=None,
                                     linkage='average',
                                     distance_threshold=1.0)
# print(sim_map)
# print(dist_mat)
y_pred = clustering.fit(coords).labels_
# Plotting the "Raw Data"
plt.subplot(1, 2, 1)
plt.title("Raw Data")
plot()

# Plotting the Assigned Points using the Hierarchical Clustering (with Euclidean distance)
plt.subplot(1, 2, 2)
plt.grid()

# Axis extents.  The Y limits/ticks previously reused the X coordinate
# (index 0), which could clip points whenever the Y range was larger;
# each axis now uses its own coordinate.
x_max = max(point[0] for point in coordinates.values())
y_max = max(point[1] for point in coordinates.values())

plt.xlim(0, x_max + 0.05)
plt.xticks(np.arange(0, x_max + 0.20, 0.10))
plt.xlabel("X-Axis")
plt.ylim(0, y_max + 0.05)
plt.yticks(np.arange(0, y_max + 0.20, 0.10))
plt.ylabel("Y-Axis")
plt.title("Assigned Points")

# `affinity='euclidean'` is omitted: the keyword was removed in
# scikit-learn 1.4 and Euclidean is the default (and the only distance
# Ward linkage supports).
clusters = AgglomerativeClustering(n_clusters=n_cluster, linkage='ward')
clusters.fit(coo_array)  # labels are read from `labels_` below
print(clusters.labels_)
plt.scatter(coo_array[:, 0], coo_array[:, 1], c=clusters.labels_,
            cmap='rainbow', edgecolors="black")

# Plotting the Dendrogram for showing the order and distances of merges during the hierarchical clustering.
plt.figure()
linked = linkage(coo_array, 'ward')
dendrogram(linked, orientation='top', distance_sort='descending',
           show_leaf_counts=True)
plt.title("Dendrogram: Ward Method")
plt.xlabel("Points")
plt.ylabel("Euclidean Distance")
plt.show()

# I was not sure which method to use between "ward" and "complete".
def clusterer(clusterer_id):
    """Render the clustering page for the selected algorithm.

    Args:
        clusterer_id: ``'1'`` for k-means (elbow plot + clusters) or ``'2'``
            for hierarchical clustering (dendrogram + clusters).  For any
            other value the page is rendered with both plots set to
            ``None`` instead of failing on unbound variables.

    Returns:
        The rendered ``clustering.html`` template.
    """
    clusterers = Clustering.query.all()
    algo = AlgoTypes.query.filter_by(algotype_id=3).first()

    ## Describe dataset
    dataset = pd.read_csv(
        '/home/ubuntu/workspace/static/datasets/Mall_Customers.csv')
    X = dataset.iloc[:, [3, 4]].values
    dataset_head = dataset.head(10)
    stats_data = dataset.iloc[:, 2:5]
    describe = stats_data.describe()
    rows = len(dataset.index)
    columns = len(dataset.columns)

    # Defaults used when no (known) algorithm has been chosen yet.
    plot_determine = None
    plot_url = None

    choice = clusterer_id
    if choice == '2':
        plot_determine, plot_url = _hierarchical_plots(X)
    elif choice == '1':
        plot_determine, plot_url = _kmeans_plots(X)

    return render_template('clustering.html',
                           data=dataset_head.to_html(),
                           describe=describe.to_html(),
                           plot_determine=plot_determine,
                           plot_url=plot_url,
                           rows=rows,
                           columns=columns,
                           clusterers=clusterers,
                           algo=algo,
                           user=current_user.username)


# (colour, legend label) for cluster ids 0..4, shared by both algorithms.
_SEGMENT_STYLES = (
    ('red', 'Savers'),
    ('blue', 'Average'),
    ('green', 'Target Group'),
    ('orange', 'Overspenders'),
    ('magenta', 'Careful'),
)


def _current_figure_as_base64():
    """Return the current matplotlib figure as base64-encoded PNG bytes."""
    buffer = BytesIO()
    plt.savefig(buffer, format='png')
    return base64.b64encode(buffer.getvalue())


def _scatter_segments(X, labels):
    """Scatter column 0 vs column 1 of X, one colour per cluster id."""
    for cluster_id, (colour, name) in enumerate(_SEGMENT_STYLES):
        members = X[labels == cluster_id]
        plt.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=name)


def _label_segment_axes():
    """Apply the title, axis labels and legend of the cluster scatter plot."""
    plt.title('Suggested Clusters')
    plt.xlabel('Annual income (k$)')
    plt.ylabel('Spending Score (1-100)', fontsize=12)
    plt.ylim(ymin=0)
    plt.legend(fontsize=9)


def _make_kmeans(n_clusters):
    """Build the k-means estimator used for both the elbow and final fit."""
    return KMeans(
        n_clusters=n_clusters,
        init='k-means++',  ## k-means++ to avoid random initialization trap
        max_iter=300,
        n_init=10,  ## algorithm runs with different initial centroids
        random_state=0)


def _hierarchical_plots(X):
    """Return (dendrogram, cluster scatter) images for Ward clustering."""
    ## Using the dendrogram to find the optimal number of clusters
    plt.gcf().clear()
    sch.dendrogram(sch.linkage(X, method='ward'))
    sns.set_style("darkgrid", {"axes.facecolor": ".9"})
    plt.title("Dendrogram")
    plt.xlabel('Customers')
    plt.ylabel('Euclidean distances')
    plot_determine = _current_figure_as_base64()

    ## Fitting Hierarchical Clustering to the dataset (optimal clusters = 5).
    ## `affinity=` is omitted: removed in scikit-learn 1.4, and Euclidean is
    ## the default (and the only distance Ward linkage supports).
    hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
    y_hc = hc.fit_predict(X)

    ## Visualising the clusters
    plt.gcf().clear()
    _scatter_segments(X, y_hc)
    sns.set_style("darkgrid", {"axes.facecolor": ".9"})
    _label_segment_axes()
    return plot_determine, _current_figure_as_base64()


def _kmeans_plots(X):
    """Return (elbow plot, cluster scatter) images for k-means."""
    ## Using the elbow method to find the optimal number of clusters
    wcss = []
    for i in range(1, 11):
        kmeans = _make_kmeans(i)
        kmeans.fit(X)
        wcss.append(kmeans.inertia_)  ## within-cluster sum of squares

    ## Visualising Elbow Method
    plt.gcf().clear()
    plt.plot(range(1, 11), wcss)
    plt.title('The Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    sns.set_style("darkgrid", {"axes.facecolor": ".9"})
    plot_determine = _current_figure_as_base64()

    ## Applying k-means to the mall dataset - from the plot the optimum is
    ## 5 clusters.
    kmeans = _make_kmeans(5)
    y_kmeans = kmeans.fit_predict(X)

    ## Visualising the clusters
    plt.gcf().clear()
    sns.set_style("darkgrid", {"axes.facecolor": ".9"})
    _scatter_segments(X, y_kmeans)
    plt.scatter(
        kmeans.cluster_centers_[:, 0],
        kmeans.cluster_centers_[:, 1],  ## cluster centers coordinates
        s=200,
        c='black',
        label='Centroids')
    _label_segment_axes()
    return plot_determine, _current_figure_as_base64()
'optics_xi', 'optics_dbscan', 'dbscan', 'agglomerative_clustering', 'affinity_propagation', 'spectral_clustering' ] clusterID_xi = OPTICS(metric='precomputed', max_eps=0.16, xi=0.05, algorithm='brute', min_samples=3).fit_predict(distanceMatrix) clusterID_op = OPTICS(metric='precomputed', max_eps=0.16, cluster_method='dbscan', min_samples=7).fit_predict(distanceMatrix) clusterID_db = DBSCAN(metric='precomputed', eps=0.1).fit_predict(distanceMatrix) clusterID_ag = AgglomerativeClustering( affinity='precomputed', linkage='average', n_clusters=2).fit_predict(distanceMatrix) clusterID_af = AffinityPropagation(affinity='precomputed', damping=0.7).fit_predict(1 - distanceMatrix) clusterID_sp = SpectralClustering(affinity='precomputed', n_clusters=2).fit_predict(1 - distanceMatrix) clusterIDs = [ clusterID_xi, clusterID_op, clusterID_db, clusterID_ag, clusterID_af, clusterID_sp ] # Evaluation for clusterID in clusterIDs: try: print( metrics.silhouette_score(distanceMatrix,
# Importing Dataset
data = pd.read_csv('Mall_Customers.csv')
x = data.iloc[:, [3, 4]].values

# Dendrogram Graph (To find the optimal number of clusters)
import scipy.cluster.hierarchy as sch
dendrograms = sch.dendrogram(sch.linkage(x, method='ward'))
plt.title('Dendrogram Model')
plt.xlabel('Pts')
plt.ylabel('Euclidean Distance')
plt.show()

# Applying hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
# `affinity='euclidean'` is omitted: the keyword was removed in
# scikit-learn 1.4 and Euclidean is the default (and the only distance
# Ward linkage supports), so the labels are unchanged.
algo = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_hc = algo.fit_predict(x)

# Visualising the HC
plt.scatter(x[y_hc == 0, 0], x[y_hc == 0, 1], s=30, c='red',
            label='Cluster 1')
plt.scatter(x[y_hc == 1, 0], x[y_hc == 1, 1], s=30, c='blue',
            label='Cluster 2')
plt.scatter(x[y_hc == 2, 0], x[y_hc == 2, 1], s=30, c='green',
            label='Cluster 3')
kmeans = KMeans(n_clusters=10, random_state=123)

# Fit
fit = kmeans.fit(X_scaled)

# Print inertia: sum of squared distances to closest cluster center
print("Sum of squared distances for 10 clusters is", kmeans.inertia_)

############## Hierarchical agglomerative clustering
# Make dendrograms -- tree diagrams that connect each datapoint by distance.
# Draw perpendicular lines through the dendrogram to select out the groups

# Create dendrogram
dendrogram = sch.dendrogram(sch.linkage(X_scaled, method='ward'))
# What is y axis? Measure of closeness of either individual data points or clusters
plt.show()

# Create clusters and fit.
# `affinity='euclidean'` is omitted: the keyword was removed in
# scikit-learn 1.4 and Euclidean is the default (and the only distance
# Ward linkage supports).  n_clusters is left at the library default.
hc = AgglomerativeClustering(linkage='ward')
hc.fit(X_scaled)

# Print number of clusters
print(hc.n_clusters_)

############## Determining K
# In general, two methods for determining K: silhouette method and elbow method
# Silhouette method uses silhouette coefficient, composed of mean distance between observation and all others in
# same cluster, and mean distance between each observation and all others in next nearest cluster.
# 1 is good, means observation is close to others in same cluster. -1 is bad.
# Elbow method - plot the sum of the square distance from each observation to the centroid against the number
# of clusters. The "elbow point" on the plot will be the optimal k point.

# Silhouette method
def segmentByClustering(rgbImage, colorSpace, clusteringMethod, numberOfClusters):
    """Segment an image by clustering its pixels.

    Args:
        rgbImage: Image array of shape (rows, cols, channels); the first
            three channels are used.
        colorSpace: ``"rgb"``, ``"lab"`` or ``"hsv"``, optionally followed
            by a ``+<suffix>`` (e.g. ``"lab+xy"``) to append each pixel's
            row/column position as two extra features.
        clusteringMethod: ``'kmeans'``, ``'gmm'`` or ``'hierarchical'``.
        numberOfClusters: Number of clusters / mixture components.

    Returns:
        Integer label array of shape (rows, cols).

    Raises:
        ValueError: Unknown colour space or clustering method.
        NotImplementedError: ``clusteringMethod == 'watershed'`` (the
            original branch was an empty stub that never produced a result).
    """
    import numpy as np

    # Validate up front so bad arguments fail with a clear message instead
    # of an unbound `seg` or a call on None after the expensive work.
    if clusteringMethod == 'watershed':
        raise NotImplementedError("'watershed' segmentation is not implemented")
    if clusteringMethod not in ('kmeans', 'gmm', 'hierarchical'):
        raise ValueError("unknown clustering method: %r" % (clusteringMethod,))

    # A two-part colour space ("<space>+<suffix>") requests the extra
    # position features.
    space = colorSpace.split("+")
    use_xy = len(space) == 2
    if use_xy:
        colorSpace = space[0]

    def RGB(image):
        return image

    def HSV(image):
        import cv2
        # NOTE(review): the parameter is called rgbImage but the conversion
        # code assumes BGR channel order - confirm against the caller.
        return cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    def LAB(image):
        import cv2  # previously missing here (only `skimage` was imported)
        return cv2.cvtColor(image, cv2.COLOR_BGR2LAB)

    converters = {"rgb": RGB, "lab": LAB, "hsv": HSV}
    if colorSpace not in converters:
        raise ValueError("unknown color space: %r" % (colorSpace,))
    newImage = np.asarray(converters[colorSpace](rgbImage))

    n_rows, n_cols = newImage.shape[:2]

    # One row per pixel, in row-major order.  (The original per-pixel loop
    # never advanced its row index in the colour-only case, so only row 0
    # of the feature matrix was ever written.)
    repMat = newImage[:, :, :3].reshape(n_rows * n_cols, 3).astype(np.float64)

    if use_xy:
        # Row/column index of every pixel, scaled by 255 as before.  Built
        # with np.indices so it is correct for non-square images and does
        # not wrap around at 256 like the former uint8 matrices did.
        row_idx, col_idx = np.indices((n_rows, n_cols))
        repMat = np.column_stack((repMat,
                                  row_idx.ravel() / 255.0,
                                  col_idx.ravel() / 255.0))

    k = numberOfClusters
    if clusteringMethod == 'kmeans':
        labels = KMeans(n_clusters=k).fit(repMat).labels_
    elif clusteringMethod == 'gmm':
        gmm = mixture.GaussianMixture(n_components=k).fit(repMat)
        labels = gmm.predict(repMat)
    else:  # 'hierarchical'
        from sklearn.cluster import AgglomerativeClustering
        # `affinity=` is omitted: removed in scikit-learn 1.4; Euclidean is
        # the default and the only distance Ward linkage supports.
        cluster = AgglomerativeClustering(n_clusters=k,
                                          linkage='ward').fit(repMat)
        labels = cluster.labels_

    seg = np.reshape(labels, (n_rows, n_cols))
    return seg
view2D.show(sm, col_sz=4, what='codebook')  # which_dim="all", denormalize=True)
plt.show()

view2D = View2D(20, 20, "", text_size=9)
view2D.show(sm, col_sz=2, what='codebook')  # which_dim="all", denormalize=True)
plt.show()

vhts = BmuHitsView(12, 12, "Hits Map", text_size=7)
vhts.show(
    sm,
    anotate=True,
    onlyzeros=False,
    labelsize=10,
    cmap="autumn",
    logaritmic=False,
)

## Hierarchical Clustering ##
# Average the rows of final_clusters per "Labels" value, then cluster
# those averages.
som_cluster = final_clusters.groupby("Labels").mean()
dend = shc.dendrogram(shc.linkage(som_cluster, method='ward'))
plt.title("Dendogram with SOM nodes", size=12)

_node_clusterer = AgglomerativeClustering(n_clusters=3)
som_cluster["h_cluster"] = _node_clusterer.fit_predict(som_cluster)

# Calculate centroids of clusters and inverse scaling for interpretation
h_cluster = som_cluster.groupby("h_cluster").mean()
h_cluster = pd.DataFrame(
    scaler.inverse_transform(X=h_cluster),
    columns=customer_related_num,
)

# Assign customer to cluster generated by hierarchical clustering:
# look up, for every row's "Labels" value, that label's hierarchical cluster.
final_clusters["h_cluster"] = [
    som_cluster.loc[node_label, "h_cluster"]
    for node_label in final_clusters["Labels"].values
]

# Silhouette graph
create_silgraph(df_cust_norm, final_clusters["h_cluster"])
plt.title("Silhouette graph customer clusters", size=12)
silhouette_avg = silhouette_score(df_cust_norm, final_clusters["h_cluster"])
print("the average silhouette_score is :", silhouette_avg)

df["c_cluster"] = final_clusters["h_cluster"]

#################################################################
################## Decision Tree classifier #####################
# Find most important features