def robust_single_linkage(metric, *, k_neighbors, alpha=None, force_ultrametric=False):
    r"""Robust single linkage algorithm of (Chaudhuri and Dasgupta 2010).

    Args
    ----
    metric : ndarray
        The distance array to transform, in condensed or square form.
    k_neighbors : int, keyword-only
        The number of neighbors to use in the transformation. Must be a
        non-negative integer.
    alpha : float, keyword-only, optional
        The connection parameter used in the transformation. At time
        :math:`r`, all edges within :math:`\alpha r` are connected.
        Default: :math:`\sqrt{2}`.
    force_ultrametric : bool, keyword-only, optional
        If `True`, the resulting clustering is forced to be ultrametric
        (each point is admitted at radius zero).

    Returns
    -------
    clustering : Clustering
        The robust single linkage clustering.
    """
    kwargs = dict(metric=metric, k_neighbors=k_neighbors, alpha=alpha)
    robust_metric, levels = robust_single_linkage_metric(**kwargs)
    if force_ultrametric:
        levels = None
    # note: _hier.single requires a condensed metric!
    linkage = _hier.single(robust_metric)
    return Clustering.from_linkage(linkage, levels=levels)
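# A minimal usage sketch (an assumption, not from the source): per the
# docstring, `metric` may be supplied in condensed form via scipy's pdist;
# `Clustering`, `robust_single_linkage_metric`, and `_hier` come from the
# surrounding library.
import numpy as np
from scipy.spatial.distance import pdist

rng = np.random.default_rng(0)
X = rng.normal(size=(30, 2))

clustering = robust_single_linkage(pdist(X), k_neighbors=5)
ultrametric = robust_single_linkage(pdist(X), k_neighbors=5, force_ultrametric=True)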
def find_closest_two_points(points):
    # use single-linkage clustering to find the closest two points: the first
    # row of the linkage matrix holds their indices (stored as floats, so cast)
    p = pdist(points)
    slc = single(p)
    i, j = int(slc[0][0]), int(slc[0][1])
    return np.array([points[i], points[j]])
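# Quick sanity check (illustrative, not from the source): the first merge of a
# single-linkage tree joins the globally closest pair, so the returned points
# should realize the minimum off-diagonal pairwise distance.
import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(1)
pts = rng.normal(size=(20, 2))
pair = find_closest_two_points(pts)

d = squareform(pdist(pts))
np.fill_diagonal(d, np.inf)
assert np.isclose(np.linalg.norm(pair[0] - pair[1]), d.min())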
def make_tree(X, C, method='single'):
    # C is the condensed distance matrix; ward operates on the raw observations
    if method == 'single':
        tree = to_tree(single(C))
    elif method == 'ward':
        tree = to_tree(ward(X))
    elif method == 'average':
        tree = to_tree(average(C))
    else:
        raise ValueError('unknown linkage method: %r' % method)
    return Tree(root=construct_node(tree))
def group_tuples(items=None, val_ind=None, dist_thresh=0.1, distance_matrix=None,
                 metric='jaccard', linkage='complete', sp_areas=None):
    '''
    items: a dict or list of tuples
    val_ind: the index of the item of interest within each tuple
    '''

    assert items is not None, 'items must be provided'

    # extract keys and the values to compare from either a dict or a list
    if isinstance(items, dict):
        keys = list(items.keys())
        values = list(items.values())
    elif isinstance(items, list):
        keys = list(range(len(items)))
        if isinstance(items[0], tuple):
            values = [item[val_ind] for item in items]
        else:
            values = items
    else:
        raise TypeError('items is not the right type')

    if distance_matrix is None:
        distance_matrix = compute_pairwise_distances(values, metric, sp_areas=sp_areas)

    # scipy linkage functions take the condensed form of the distance matrix
    if linkage == 'complete':
        lk = complete(squareform(distance_matrix))
    elif linkage == 'average':
        lk = average(squareform(distance_matrix))
    elif linkage == 'single':
        lk = single(squareform(distance_matrix))
    else:
        raise ValueError('unknown linkage: %r' % linkage)

    # T = fcluster(lk, 1.15, criterion='inconsistent')
    T = fcluster(lk, dist_thresh, criterion='distance')

    n_groups = len(set(T))
    groups = [np.where(T == group_id + 1)[0] for group_id in range(n_groups)]

    index_groups = [[keys[i] for i in g] for g in groups if len(g) > 0]
    # items[key] works for both dicts and lists (for lists the keys are indices)
    item_groups = [[items[k] for k in ig] for ig in index_groups]

    return index_groups, item_groups, distance_matrix
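# Hypothetical usage sketch: with a precomputed square distance matrix and a
# plain list of items, cutting at a distance of 0.5 groups the two nearby
# items together.
import numpy as np

D = np.array([[0.0, 0.3, 0.9],
              [0.3, 0.0, 0.8],
              [0.9, 0.8, 0.0]])
index_groups, item_groups, _ = group_tuples(items=['x', 'y', 'z'],
                                            distance_matrix=D,
                                            dist_thresh=0.5,
                                            linkage='single')
# index_groups -> [[0, 1], [2]], item_groups -> [['x', 'y'], ['z']]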
def cluster_agglomerative(self, n, with_topic):
    if with_topic is True:
        X = self.X_with_topics
    else:
        X = self.X_without_topics

    z = hierarchy.single(pdist(X.toarray(), 'cityblock'))
    self.clusterizer_labels = hierarchy.fcluster(z, n, criterion='maxclust')
    cluster_set = set(self.clusterizer_labels)
    self.n_clusters = len(cluster_set)

    if with_topic is True:
        # initialize a dict with cluster number as key to a list
        for i in cluster_set:
            self.clusters[i] = []
        # add the actual topics of the cluster to this dict:
        # label -> [actual topics of the documents clustered under this label]
        for i in range(len(self.topic_per_doc)):
            self.clusters[self.clusterizer_labels[i]].append(self.topic_per_doc[i])
        self.generate_piechart_pdf("hierarchical")
    else:
        self.calculate_and_print_silhouette(self.X_without_topics,
                                            self.clusterizer_labels,
                                            "hierarchical")
def CalculateClusterTree(self):
    fullMatrix = self.GenerateFullMatrix(self.results)
    dissMatrix = []
    labels = list(fullMatrix.keys())

    # build the condensed (upper-triangle) dissimilarity matrix
    for i in range(0, len(labels)):
        sampleNameI = labels[i]
        for j in range(i + 1, len(labels)):
            sampleNameJ = labels[j]
            dissMatrix.append(fullMatrix[sampleNameI][sampleNameJ])

    # calculate hierarchical cluster tree
    if self.radioSingleLinkage.GetValue():
        linkageMatrix = single(dissMatrix)
    elif self.radioUPGMA.GetValue():
        linkageMatrix = average(dissMatrix)
    elif self.radioCompleteLinkage.GetValue():
        linkageMatrix = complete(dissMatrix)
    elif self.radioWeighted.GetValue():
        linkageMatrix = weighted(dissMatrix)

    root = to_tree(linkageMatrix)

    # create Newick string
    return self.CreateNewickString(root, labels) + ';'
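# `CreateNewickString` is not shown above; a minimal standalone sketch of the
# same idea (branch lengths and name quoting deliberately omitted), walking
# the ClusterNode tree returned by to_tree:
from scipy.cluster.hierarchy import single, to_tree

def to_newick(node, labels):
    if node.is_leaf():
        return labels[node.get_id()]
    return '(%s,%s)' % (to_newick(node.get_left(), labels),
                        to_newick(node.get_right(), labels))

# condensed dissimilarities for samples A, B, C: d(A,B)=0.5, d(A,C)=2.0, d(B,C)=1.8
root = to_tree(single([0.5, 2.0, 1.8]))
print(to_newick(root, ['A', 'B', 'C']) + ';')  # e.g. (C,(A,B));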
def check_fcluster_monocrit(self, t):
    expectedT = hierarchy_test_data.fcluster_distance[t]
    Z = single(hierarchy_test_data.Q_X)
    T = fcluster(Z, t, criterion='monocrit', monocrit=maxdists(Z))
    assert_(is_isomorphic(T, expectedT))
def check_fcluster(self, t, criterion):
    # Tests fcluster(Z, criterion=criterion, t=t) on a random 3-cluster data set.
    expectedT = getattr(hierarchy_test_data, 'fcluster_' + criterion)[t]
    Z = single(hierarchy_test_data.Q_X)
    T = fcluster(Z, criterion=criterion, t=t)
    assert_(is_isomorphic(T, expectedT))
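# A small sketch (not part of the test suite) of the two most common fcluster
# criteria: on two well-separated blobs, cutting at a distance of 1.0 and
# asking for exactly two clusters agree.
import numpy as np
from scipy.cluster.hierarchy import single, fcluster
from scipy.spatial.distance import pdist

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.1, (10, 2)), rng.normal(5, 0.1, (10, 2))])
Z = single(pdist(X))

by_distance = fcluster(Z, t=1.0, criterion='distance')
by_maxclust = fcluster(Z, t=2, criterion='maxclust')
assert len(set(by_distance)) == len(set(by_maxclust)) == 2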
def filter_corr_hierarchical(cor_matrix, method='complete', draw_plot=False):
    """
    Creates a filtered correlation matrix using hierarchical clustering methods from an empirical
    correlation matrix, given that all values are non-negative [0 ~ 1].

    This function allows for three types of hierarchical clustering - complete, single, and average
    linkage clusters. Link to the hierarchical clustering methods documentation:
    `<https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html>`_

    It works as follows:

    First, the method creates a hierarchical clustering tree using scipy's hierarchical clustering
    methods from the empirical 2-D correlation matrix.

    Second, it extracts and stores each cluster's filtered value (alpha) and assigns it to its
    corresponding leaf.

    Finally, we create a new filtered matrix by assigning each of the correlations to their
    corresponding parent node's alpha value.

    :param cor_matrix: (np.array) Numpy array of an empirical correlation matrix.
    :param method: (str) Hierarchical clustering method to use. (``complete`` by default, ``single``, ``average``)
    :param draw_plot: (bool) Plots the hierarchical cluster tree. (False by default)
    :return: (np.array) The filtered correlation matrix.
    """

    # Check if all matrix elements are positive
    if np.any(cor_matrix < 0):
        warnings.warn('Not all elements in matrix are positive... Returning unfiltered matrix.',
                      UserWarning)
        return cor_matrix

    # Check if matrix is 2-D
    if len(cor_matrix.shape) == 2:
        cor_x, cor_y = cor_matrix.shape
    else:
        warnings.warn('Invalid matrix dimensions, input must be 2-D array... Returning unfiltered matrix.',
                      UserWarning)
        return cor_matrix

    # Check if matrix dimensions and diagonal values are valid
    if cor_x == cor_y and np.allclose(np.diag(cor_matrix), 1):  # using np.allclose as diag values might be 0.99999
        # Create a condensed correlation matrix from the upper triangle, dismissing the diagonal
        new_cor = cor_matrix[np.triu_indices(cor_matrix.shape[0], k=1)]
    else:
        warnings.warn('Invalid matrix, input must be a correlation matrix of size (m x m)... Returning unfiltered matrix.',
                      UserWarning)
        return cor_matrix

    # Compute the hierarchical clustering tree
    if method == 'complete':
        z_cluster = complete(new_cor)
    elif method == 'single':
        z_cluster = single(new_cor)
    elif method == 'average':
        z_cluster = average(new_cor)
    else:
        warnings.warn('Invalid method selected, please check docstring... Returning unfiltered matrix.',
                      UserWarning)
        return cor_matrix

    # Plot the hierarchical cluster tree
    if draw_plot:
        fig = plt.figure(figsize=(10, 6))
        axis = fig.add_subplot(111)
        dendrogram(z_cluster, ax=axis)
        plt.show()

    # Create a pd.DataFrame that acts as a dictionary where the index is the leaf node id and the
    # values are their corresponding cluster's alpha value
    alpha_values = z_cluster[:, 2]
    alphas = z_cluster[:, 0]
    df_alphas = pd.DataFrame(alpha_values, index=alphas)
    df_alphas.loc[z_cluster[0][1]] = alpha_values[0]

    # Create the filtered correlation matrix
    alphas_sorted = df_alphas.sort_index()
    alphas_x = np.tile(alphas_sorted.values, (1, len(alphas_sorted.values)))
    filt_corr = np.maximum(alphas_x, alphas_x.T)
    np.fill_diagonal(filt_corr, 1)

    return filt_corr
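# A quick check on a made-up 3x3 correlation matrix (values are illustrative):
# each off-diagonal entry is replaced by the merge level (alpha) of the cluster
# joining the pair, and the diagonal stays at 1.
import numpy as np

corr = np.array([[1.0, 0.7, 0.2],
                 [0.7, 1.0, 0.3],
                 [0.2, 0.3, 1.0]])
print(filter_corr_hierarchical(corr, method='single'))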
plt.show()

# You can also inspect the cross-tab frequency counts
print(pd.crosstab(kmeans_3['cluster'], kmeans_3['Cultivar'], margins=True))

from scipy.cluster import hierarchy

# Reload the wine dataset and drop the Cultivar column
wine = pd.read_csv('wine.csv')
wine = wine.drop('Cultivar', axis=1)

# There are many hierarchical clustering algorithms; the results can be
# plotted with matplotlib
import matplotlib.pyplot as plt

wine_complete = hierarchy.complete(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_complete)
plt.show()

wine_single = hierarchy.single(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_single)
plt.show()

wine_average = hierarchy.average(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_average)
plt.show()

wine_centroid = hierarchy.centroid(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_centroid)
plt.show()
len(similarity_constructor)

import scipy.spatial.distance as ssd
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

# embed the similarity values as a single coordinate per building and take
# the pairwise euclidean distances between them
dist = np.zeros((13, 13))
dist[:, 0] = similarity_constructor.values
dist1 = ssd.cdist(dist, dist, 'euclidean')
dist1

# sch.single expects the condensed form, so convert the square matrix first
clusters_study = sch.fcluster(sch.single(ssd.squareform(dist1)), 0.2,
                              criterion='distance')  # Here I create 4 groups
# Just left to create the 4 regressions
clusters_study

# Buildings with similar characteristics
similarity_constructor[clusters_study == 4]
similarity_constructor[clusters_study == 3]
similarity_constructor[clusters_study == 2]
similarity_constructor[clusters_study == 1]

df.columns
def linkage(dmat):
    # squareform converts the square distance matrix to the condensed form
    # needed by the scipy linkage methods
    condensed = squareform(dmat)
    linkmat = sch.single(condensed)
    return sch.fcluster(linkmat, 0.0001)
def scipyLinkage(dmat):
    # here dmat is assumed to already be in condensed form
    return sch.fcluster(sch.single(dmat), 0.01)
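# Both helpers above hinge on a scipy pitfall worth spelling out: the linkage
# functions expect the *condensed* vector, and silently treat a square matrix
# as an observation matrix. A round-trip sketch (illustrative only):
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import single

X = np.random.rand(5, 3)
condensed = pdist(X)            # shape (10,) for 5 points
square = squareform(condensed)  # 5x5 symmetric matrix, zero diagonal
assert np.allclose(squareform(square), condensed)  # squareform inverts itself

Z_ok = single(condensed)  # correct
Z_odd = single(square)    # runs (with a warning), but clusters the rows as 5-D observations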
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale

# scipy
from scipy.cluster import hierarchy

# Visualization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.style.use('ggplot')

PATH = '/home/ec2-user/environment/islr/10_unsupervised_learning/figs/'

X = np.random.standard_normal((50, 2))
X[:25, 0] = X[:25, 0] + 3
X[:25, 1] = X[:25, 1] - 4

fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 18))

# draw one dendrogram per linkage method
for linkage_matrix, ax in zip(
        [hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)],
        [ax1, ax2, ax3]):
    hierarchy.dendrogram(linkage_matrix, ax=ax, color_threshold=0)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage')

plt.savefig(PATH + 'hierarchy.png', dpi=300)
plt.close()
hclust_model = cluster.AgglomerativeClustering(n_clusters=2, linkage='average')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))

hclust_model = cluster.AgglomerativeClustering(n_clusters=2, linkage='complete')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))

print('''
*********************************************************************************************************************
scipy: dendrogram
*********************************************************************************************************************
''')

# from: https://github.com/JWarmenhoven/ISLR-python/blob/master/Notebooks/Chapter%2010.ipynb
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 18))
for linkage_matrix, ax in zip(
        [hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)],
        [ax1, ax2, ax3]):
    hierarchy.dendrogram(linkage_matrix, ax=ax, color_threshold=0)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage')
plt.show()
plt.figure()
newick = final_tree.to_newick()
tree = Phylo.read(StringIO(newick), 'newick')
# note: Phylo.draw_graphviz is only available in older Biopython releases
Phylo.draw_graphviz(tree, prog='neato')
plt.savefig("%s.png" % name, dpi=200, bbox_inches='tight')

# jitter the data slightly so no two points coincide
X += np.random.normal(scale=0.01, size=X.shape)

pca = PCA(2)
pca.fit(X)
# X = pca.transform(X)

N, D = X.shape
C = pdist(X)
tree = to_tree(single(C))

def construct_node(snode):
    # recursively convert a scipy ClusterNode into our Tree node types
    if snode.left is None and snode.right is None:
        return TreeLeaf(snode.get_id())
    node = TreeNode()
    node.add_child(construct_node(snode.left))
    node.add_child(construct_node(snode.right))
    return node

root = construct_node(tree)
linkage_tree = Tree(root=root)
plot_tree(linkage_tree, 'linkage_induced')

if args.tree:
def condense_clones_file_and_barcode_mapping_file_by_tcrdist(
        old_clones_file, new_clones_file, tcrdist_threshold, organism,
        output_distfile=None):
    tcrdist_calculator = TcrDistCalculator(organism)

    df = pd.read_csv(old_clones_file, sep='\t')

    # in conga we usually also have cdr3_nucseq but we don't need it for tcrdist;
    # we also don't need the jgene but hey
    tcrs = [((l.va_gene, l.ja_gene, l.cdr3a), (l.vb_gene, l.jb_gene, l.cdr3b))
            for l in df.itertuples()]
    ids = [l.clone_id for l in df.itertuples()]

    print(f'compute tcrdist distance matrix for {len(tcrs)} clonotypes')
    D = np.array([tcrdist_calculator(x, y) for x in tcrs for y in tcrs])\
          .reshape((len(tcrs), len(tcrs)))

    all_barcodes = pd.read_csv(old_clones_file + '.barcode_mapping.tsv', sep='\t')
    all_barcodes.set_index('clone_id', inplace=True)
    all_barcodes = all_barcodes['barcodes']
    assert type(all_barcodes) is pd.Series

    N = D.shape[0]
    assert df.shape[0] == N

    DT = squareform(D, force='tovector')

    # single linkage clustering of the distance matrix: any clonotypes with
    # dist < tcrdist_threshold should end up in the same cluster
    Z = hierarchy.single(DT)
    clusters = hierarchy.fcluster(Z, t=tcrdist_threshold, criterion='distance')
    clusters_set = sorted(set(clusters))

    new_clones_dfl = []
    new_bcmap_dfl = []
    cluster_centers = []

    for c in clusters_set:
        # choose a representative clone based on distance
        cmask = clusters == c
        members = np.nonzero(cmask)[0]
        assert len(members) == np.sum(cmask)
        if len(members) == 1:
            center = members[0]
        else:
            cdist = D[cmask, :][:, cmask]
            dists = np.sum(cdist, axis=1) / (len(members) - 1)
            icenter = np.argmin(dists)
            center = members[icenter]
            print('center_avgdist: {:3d} {:7.2f} avg {:7.2f}'
                  .format(len(members), dists[icenter], np.mean(dists)))
        cluster_centers.append(center)

        cdf = df[cmask]
        center_df = pd.Series(df.iloc[center])
        clone_size = sum(x.clone_size for _, x in cdf.iterrows())
        center_df.clone_size = clone_size
        new_clones_dfl.append(center_df)

        cbarcodes = []
        for _, row in cdf.iterrows():
            cbarcodes.extend(all_barcodes[row.clone_id].split(','))
        assert len(cbarcodes) == clone_size
        new_bcmap_dfl.append(dict(clone_id=center_df.clone_id,
                                  barcodes=','.join(cbarcodes)))

    new_clones_df = pd.DataFrame(new_clones_dfl)
    new_bcmap_df = pd.DataFrame(new_bcmap_dfl)['clone_id barcodes'.split()]  # ensure order
    new_clones_df.to_csv(new_clones_file, sep='\t', index=False)
    new_bcmap_df.to_csv(new_clones_file + '.barcode_mapping.tsv', sep='\t', index=False)

    if output_distfile is not None:
        new_D = D[cluster_centers, :][:, cluster_centers]
        np.savetxt(output_distfile, new_D.astype(float), fmt='%.1f')
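# Illustrative call (file names and the 50-unit threshold are hypothetical,
# not values from the source):
condense_clones_file_and_barcode_mapping_file_by_tcrdist(
    old_clones_file='clones.tsv',
    new_clones_file='clones_condensed.tsv',
    tcrdist_threshold=50.0,
    organism='human',
    output_distfile='condensed_tcrdist.txt')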
# accumulate the within-cluster SSE of the DTW distances for the elbow curve
for cluster in clusters:
    print(cluster)
    if cluster == current_cluster:
        s2 = time_series[serie_number]
        series_DTW = dtw.distance(s1, s2)
        calculated_sse = calculated_sse + series_DTW ** 2
    serie_number += 1

print(current_cluster)
current_cluster += 1
sse.append(calculated_sse)

pl.plot(range(3, 15), sse)
pl.title("Elbow Curve")
pl.xlabel("Number of clusters")
pl.ylabel("SSE")
pl.show()

print(calculated_sse)
print(cluster)

# cophenetic distances from the single-linkage tree over the features
from scipy.cluster.hierarchy import single, cophenet
from scipy.spatial.distance import pdist, squareform

Z = single(pdist(ret_var))
cophenet(Z)
print(squareform(cophenet(Z)))
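# A common follow-up (sketched here, not from the source): the cophenetic
# correlation coefficient scores how faithfully each linkage preserves the
# original pairwise distances, which helps pick between linkage methods.
import numpy as np
from scipy.cluster.hierarchy import single, complete, average, cophenet
from scipy.spatial.distance import pdist

rng = np.random.default_rng(0)
data = rng.normal(size=(30, 4))
d = pdist(data)
for name, method in (('single', single), ('complete', complete), ('average', average)):
    c, _ = cophenet(method(d), d)  # closer to 1 is a better fit
    print('%-8s cophenetic correlation: %.3f' % (name, c))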
def check_fcluster_maxclust_monocrit(self, t):
    expectedT = hierarchy_test_data.fcluster_maxclust[t]
    Z = single(hierarchy_test_data.Q_X)
    T = fcluster(Z, t, criterion='maxclust_monocrit', monocrit=maxdists(Z))
    assert_(is_isomorphic(T, expectedT))
ax1.scatter(X[:, 0], X[:, 1], s=20, c=km1.labels_, cmap=plt.cm.prism)
ax1.set_title('K-Means Clustering Results with K=2')
ax1.scatter(km1.cluster_centers_[:, 0], km1.cluster_centers_[:, 1],
            marker='+', s=100, c='k', linewidth=2)

ax2.scatter(X[:, 0], X[:, 1], s=40, c=km2.labels_, cmap=plt.cm.prism)
ax2.set_title('K-Means Clustering Results with K=3')
ax2.scatter(km2.cluster_centers_[:, 0], km2.cluster_centers_[:, 1],
            marker='+', s=100, c='k', linewidth=2);

'''
Hierarchical clustering
'''
from scipy.cluster import hierarchy

plt.figure()
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 18))
for linkage_matrix, ax in zip(
        [hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)],
        [ax1, ax2, ax3]):
    hierarchy.dendrogram(linkage_matrix, ax=ax, color_threshold=0)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage');

'''
Lab Genes NCI60
'''
'''
Each cell line is labeled with a cancer type. We do not make use of the
cancer types in performing PCA and clustering, as these are unsupervised
techniques. But after performing PCA and clustering, we will check to see
the extent to which these cancer types agree with the results of these
unsupervised techniques.
# <codecell>

np.fill_diagonal(tag_dist_matrix, 0)

# <codecell>

from scipy.spatial.distance import squareform

tag_dist = squareform(tag_dist_matrix)

# <codecell>

from scipy.cluster.hierarchy import single

# <codecell>

tag_linkage = single(tag_dist)

# <codecell>

np.save('tag_linkage', tag_linkage)

# <codecell>

from scipy.cluster.hierarchy import leaves_list, to_tree, dendrogram

# <codecell>

tag_closeness_id = leaves_list(tag_linkage)

# <codecell>
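# <codecell>

# What leaves_list buys us (illustrative): the dendrogram's left-to-right leaf
# order, which can reorder the distance matrix so similar tags sit adjacent.
reordered = tag_dist_matrix[np.ix_(tag_closeness_id, tag_closeness_id)]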
cS = u.select_atoms('name C312')  # select the atoms

r1 = []  # get positions
for ts in u.trajectory:
    r1.append((cS.positions))
    # print('loading positions frame %i of %i' % (u.trajectory.frame + 1, len(u.trajectory)))
r1 = np.array(r1)

dist = []  # get all pair distances formatted as flat upper triangles
for i in range(len(r1)):
    dist.append(MDAnalysis.analysis.distances.self_distance_array(r1[i], box=box))
    # print('loading distances frame %i of %i' % (i + 1, len(u.trajectory)))
dist = np.array(dist)
# print('clustering.')

z = []  # perform hierarchical single-linkage clustering
for i in range(len(dist)):
    z.append(single(dist[i]))
z = np.array(z)
# print('clustering..')

hierarchy = []  # get clusters using cutoff (in angstroms)
for i in range(len(z)):
    hierarchy.append(fcluster(z[i], cutoff, criterion='distance'))
hierarchy = np.array(hierarchy)

#
# select the indices of the atoms in each cluster
#
DPC = u.select_atoms('resname FOS12')

clusters = []
for j in range(-frame, -1):
    clusters1 = []
    for i in range(1, np.amax(hierarchy[j]) + 1):