def k_medoids_clustering(dist_array, k_clusters):
    """Run k-medoids on a distance matrix and return the cluster labels.

    Args:
        dist_array: distance matrix handed straight to ``kmedoids``.
        k_clusters: number of clusters requested (``nclusters``).

    Returns:
        The first element of the ``kmedoids`` result (the per-item cluster
        assignment). The error and nfound values are discarded.
    """
    # 100 passes are requested from kmedoids; only the labels are kept.
    outcome = kmedoids(dist_array, nclusters=k_clusters, npass=100)
    labels, _error, _nfound = outcome
    return labels
def pipe2():
    """Sample 1000 items, build the distance matrix and cluster into 5 groups.

    The distance matrix is computed twice: once before and once after
    ``seq_span`` is set to 80. The clustering result is not returned.
    """
    processor = init()
    processor.sampling(1000, random_sampling=True)

    # First pass uses whatever seq_span the processor starts with.
    processor.calc_distance_matrix()

    # Second pass recomputes the matrix with an explicit span of 80.
    processor.seq_span = 80
    processor.calc_distance_matrix()

    # Result is unpacked (so a 3-tuple is still required) but never used.
    labels, within_error, _ = kmedoids(
        processor.dist_matrix, nclusters=5, npass=20)
def cluster(self, nclusters, noise=False, npass=100, nreps=1):
    """Cluster with k-medoids and wrap the best run in a ``Partition``.

    Args:
        nclusters: number of clusters to form.
        noise: forwarded to ``self.get_dm``.
        npass: ``npass`` argument for every ``kmedoids`` call.
        nreps: how many independent ``kmedoids`` calls to make.

    Returns:
        ``Partition`` built from the labels of the run with the smallest
        error, or ``None`` (after printing a message) when Biopython is
        flagged as unavailable.
    """
    if Biopython_Unavailable:
        print('kmedoids not available without Biopython')
        return

    matrix = self.get_dm(noise)

    runs = []
    for _ in range(nreps):
        runs.append(kmedoids(matrix, nclusters=nclusters, npass=npass))

    # Each run is a tuple whose element 1 is the error; the stable sort
    # keeps the earliest run among ties.
    ranked = sorted(runs, key=lambda run: run[1])
    best_labels = ranked[0][0]
    return Partition(best_labels)
def run_kmedoids(self, dm, nclusters, npass=100, nreps=100):
    """Run k-medoids repeatedly and return the ordered best clustering.

    Args:
        dm: object exposing ``metric``, ``matrix`` and ``add_noise``.
        nclusters: number of clusters to form.
        npass: ``npass`` argument for every ``kmedoids`` call. Defaults to
            100, the value that used to be hard-coded.
        nreps: number of independent ``kmedoids`` calls. Defaults to 100,
            the value that used to be hard-coded.

    Returns:
        ``self.order(labels)`` for the run with the smallest error.

    Raises:
        ValueError: if ``nreps`` is smaller than 1.
    """
    if nreps < 1:
        raise ValueError('nreps must be at least 1')

    # Only the 'rf' metric gets noise added to its matrix.
    matrix = dm.add_noise(dm.matrix) if dm.metric == 'rf' else dm.matrix

    runs = [
        kmedoids(matrix, nclusters=nclusters, npass=npass)
        for _ in range(nreps)
    ]
    # Element 1 of each result is the error; min() returns the earliest run
    # among ties, as the original stable sort followed by [0] did.
    best = min(runs, key=lambda run: run[1])
    return self.order(best[0])
def run_kmedoids(self, dm, nclusters, npass=100, nreps=100):
    """Run k-medoids repeatedly and return the ordered best clustering.

    Args:
        dm: object exposing ``metric``, ``matrix`` and ``add_noise``.
        nclusters: number of clusters to form.
        npass: ``npass`` argument for every ``kmedoids`` call. Defaults to
            100, the value that used to be hard-coded.
        nreps: number of independent ``kmedoids`` calls. Defaults to 100,
            the value that used to be hard-coded.

    Returns:
        ``self.order(labels)`` for the run with the smallest error.

    Raises:
        ValueError: if ``nreps`` is smaller than 1.
    """
    if nreps < 1:
        raise ValueError('nreps must be at least 1')

    # Only the 'rf' metric gets noise added to its matrix.
    if dm.metric == 'rf':
        matrix = dm.add_noise(dm.matrix)
    else:
        matrix = dm.matrix

    runs = [
        kmedoids(matrix, nclusters=nclusters, npass=npass)
        for _ in range(nreps)
    ]
    # Element 1 of each result is the error; min() returns the earliest run
    # among ties, as the original stable sort followed by [0] did.
    best = min(runs, key=lambda run: run[1])
    return self.order(best[0])
def kmedoids(self, nclusters, noise=False, npass=100, nreps=1):
    """Cluster with k-medoids and wrap the best run in a ``Partition``.

    Inside this method the bare name ``kmedoids`` resolves to the
    module-level function, not to this method.

    Args:
        nclusters: number of clusters to form.
        noise: forwarded to ``self.get_dm``.
        npass: ``npass`` argument for every clustering call.
        nreps: how many independent clustering calls to make.

    Returns:
        ``Partition`` built from the labels of the lowest-error run, or
        ``None`` (after printing a message) when Biopython is flagged as
        unavailable.
    """
    if Biopython_Unavailable:
        print('kmedoids not available without Biopython')
        return

    matrix = self.get_dm(noise)

    attempts = []
    for _ in range(nreps):
        attempts.append(kmedoids(matrix, nclusters=nclusters, npass=npass))

    # Rank by error (tuple element 1); ties keep their original order.
    ranked = sorted(attempts, key=lambda attempt: attempt[1])
    winner = ranked[0]
    return Partition(winner[0])
def kmedoids(self, nclusters, noise=False, npass=100, nreps=100):
    """Cluster with k-medoids and wrap the best run in a ``Partition``.

    Inside this method the bare name ``kmedoids`` resolves to the
    module-level function, not to this method.

    Args:
        nclusters: number of clusters to form.
        noise: when true, cluster on ``self.distance_matrix.add_noise()``
            instead of ``self.distance_matrix``.
        npass: ``npass`` argument for every clustering call. Defaults to
            100, the value that used to be hard-coded.
        nreps: number of independent clustering calls. Defaults to 100,
            the value that used to be hard-coded.

    Returns:
        ``Partition`` built from the labels of the lowest-error run, or
        ``None`` (after printing a message) when Biopython is flagged as
        unavailable.

    Raises:
        ValueError: if ``nreps`` is smaller than 1.
    """
    if Biopython_Unavailable:
        # Call syntax with a single argument prints identically on
        # Python 2 and Python 3; the bare print statement was Python 2 only.
        print('kmedoids not available without Biopython')
        return

    if nreps < 1:
        raise ValueError('nreps must be at least 1')

    if noise:
        matrix = self.distance_matrix.add_noise()
    else:
        matrix = self.distance_matrix

    runs = [
        kmedoids(matrix, nclusters=nclusters, npass=npass)
        for _ in range(nreps)
    ]
    # Element 1 of each result is the error; min() returns the earliest run
    # among ties, as the original stable sort followed by [0] did.
    best = min(runs, key=lambda run: run[1])
    return Partition(best[0])
def kmedoids(self, nclusters, noise=False, npass=100, nreps=100):
    """Cluster with k-medoids and wrap the best run in a ``Partition``.

    Inside this method the bare name ``kmedoids`` resolves to the
    module-level function, not to this method.

    Args:
        nclusters: number of clusters to form.
        noise: when true, cluster on ``self.distance_matrix.add_noise()``
            instead of ``self.distance_matrix``.
        npass: ``npass`` argument for every clustering call. Defaults to
            100, the value that used to be hard-coded.
        nreps: number of independent clustering calls. Defaults to 100,
            the value that used to be hard-coded.

    Returns:
        ``Partition`` built from the labels of the lowest-error run, or
        ``None`` (after printing a message) when Biopython is flagged as
        unavailable.

    Raises:
        ValueError: if ``nreps`` is smaller than 1.
    """
    if Biopython_Unavailable:
        # Call syntax with a single argument prints identically on
        # Python 2 and Python 3; the bare print statement was Python 2 only.
        print('kmedoids not available without Biopython')
        return

    if nreps < 1:
        raise ValueError('nreps must be at least 1')

    matrix = (self.distance_matrix.add_noise()
              if noise else self.distance_matrix)

    runs = [
        kmedoids(matrix, nclusters=nclusters, npass=npass)
        for _ in range(nreps)
    ]
    # Element 1 of each result is the error; min() returns the earliest run
    # among ties, as the original stable sort followed by [0] did.
    best = min(runs, key=lambda run: run[1])
    return Partition(best[0])
def pipe1():
    """Plot the k-medoids error for k = 2..9 on a 1000-read sample.

    Builds an ``HLAProcessor`` from two FASTQ paths, samples 1000 items,
    computes the distance matrix with ``seq_span`` set to 80, then runs
    ``kmedoids`` once per candidate k. It prints ``nfound`` for each k and
    shows a plot of error against k. Nothing is returned.
    """
    processor = HLAProcessor(
        'data/child10_S4_L001_R1_001.fastq',
        'data/child10_S4_L001_R2_001.fastq',
    )
    processor.sampling(1000, random_sampling=True)
    processor.seq_span = 80
    processor.calc_distance_matrix()

    candidate_ks = list(range(2, 10))
    errors = []
    for k in candidate_ks:
        result = kmedoids(processor.dist_matrix, nclusters=k, npass=10)
        _, error, nfound = result
        print(nfound)
        errors.append(error)

    plt.plot(candidate_ks, errors)
    plt.show()
def elbow_clustering(distanceMatrix, elbowErr):
    """Pick a cluster count with an elbow heuristic over k-medoids runs.

    Runs ``kmedoids`` for every k from 2 to ``maxCluster + 1`` (with
    ``maxCluster = N - 2``, ``N`` being a module-level name), smooths the
    per-k values, saves an elbow plot, and selects the first k at which the
    smoothed change between consecutive values drops to or below
    ``mean(inertia_smooth) * elbowErr``.

    Args:
        distanceMatrix: distance matrix passed to ``kmedoids`` as
            ``distance``.
        elbowErr: multiplier applied to the mean smoothed inertia to form
            the selection threshold.

    Returns:
        Tuple ``(cluster, k, representatives)``: the ``kmedoids`` labels
        stored for the chosen k, the chosen k, and the distinct label
        values of ``cluster`` in first-seen order.

    Side effects:
        Draws on the current matplotlib figure and writes
        ``output/<pdb_id>_elbow_plot.png`` (``pdb_id`` is a module-level
        name).
    """
    maxCluster = N - 2
    clusters = [[] for x in range(maxCluster)]
    ks = np.zeros(maxCluster)
    err = np.zeros(maxCluster)
    # Slot k - 2 holds the result for k clusters.
    # NOTE(review): elsewhere kmedoids is unpacked as
    # (clusterid, error, nfound), so `ks` appears to hold the clustering
    # error and `err` the nfound count, despite the names - confirm.
    for k in range(2, maxCluster + 2):
        clusters[k - 2], ks[k - 2], err[k - 2] = kmedoids(
            distance=distanceMatrix, nclusters=int(k), npass=100)
    window_size_smooth = 5
    errorSmooth = []
    # Average of ks over the slice [i - 1, i + window_size_smooth), clipped
    # to the array bounds (one element back, the rest forward).
    for i in range(len(ks)):
        frag = ks[np.max([0, i - 1]):np.min([i + window_size_smooth, len(ks)])]
        errorSmooth.append(sum(frag) / len(frag))
    plt.plot(errorSmooth)
    plt.xlabel("Number of clusters")
    plt.ylabel("Smoothed error")
    plt.title("Elbow plot")
    plt.savefig("output/{}_elbow_plot.png".format(pdb_id))
    inertia = []
    inertia_smooth = []
    # Absolute change between consecutive smoothed values.
    for i in range(maxCluster - 1):
        inertia.append(np.abs(errorSmooth[i] - errorSmooth[i + 1]))
    window_size_smooth = 10
    # Same clipped-window averaging as above, with a window of 10.
    for i in range(len(inertia)):
        frag = inertia[np.max([0, i - 1]):np.min(
            [i + window_size_smooth, len(inertia)])]
        inertia_smooth.append(sum(frag) / len(frag))
    # First position whose smoothed inertia is at or below the threshold.
    # If none qualifies, `k` keeps the last value from the clustering loop
    # above (maxCluster + 1), so the largest clustering is used.
    for i in range(maxCluster - 1):
        if inertia_smooth[i] <= np.mean(inertia_smooth) * elbowErr:
            k = i + 2
            break
    cluster = clusters[k - 2]
    representatives = []
    # Distinct label values, first-seen order.
    # presumably these are medoid indices - verify against the kmedoids docs
    for rep in cluster:
        if rep not in representatives:
            representatives.append(rep)
    return cluster, k, representatives
return filter_array def plots_outlier(samples, outlier_x): outlier_y = [samples[x-1] for x in outlier_x] ax.plot(outlier_x, outlier_y, 'or') Y = samples samples = [(x, sample) for x, sample in enumerate(samples, start=1)] distance = distancematrix(samples) clusterid, error, nfound = kmedoids(distance, nclusters=groups, npass=10) clusterid = majority_filter(clusterid) """ #對資料做kmeans input = numpy.array(samples) whitened = whiten(input) ___, labels, ___ = k_means(X = whitened, n_clusters = groups) """ segments = [list() for i in range(len(clusterid)+1)] #產生五個新的獨立list #將不同群的資料放到不同列 for clusteri, sample in zip(clusterid, samples):
# NOTE(review): the original indentation of this span was lost, so the
# grouping below is reconstructed - confirm. The first two statements are
# the tail of a function whose header lies outside this view.
    filter_array = [find_majority(array, index) for index in range(len(array))]
    return filter_array


def plots_outlier(samples, outlier_x):
    """Mark outlier points on the axes object ``ax`` as red circles.

    Args:
        samples: indexable sequence of y values.
        outlier_x: iterable of 1-based positions into ``samples``.
    """
    # outlier_x is 1-based, hence the x - 1 lookup.
    outlier_y = [samples[x - 1] for x in outlier_x]
    ax.plot(outlier_x, outlier_y, 'or')


# Keep the raw values, then pair every sample with its 1-based position.
Y = samples
samples = [(x, sample) for x, sample in enumerate(samples, start=1)]
distance = distancematrix(samples)
clusterid, error, nfound = kmedoids(distance, nclusters=groups, npass=10)
# Post-process the labels with majority_filter.
clusterid = majority_filter(clusterid)
# The bare string below is not executed as code; it appears to be a
# disabled k-means alternative.
"""
#對資料做kmeans
input = numpy.array(samples)
whitened = whiten(input)
___, labels, ___ = k_means(X = whitened, n_clusters = groups)
"""
segments = [list() for i in range(len(clusterid) + 1)]  # create five new independent lists (note: the code creates len(clusterid) + 1 lists)
# Put the data of different clusters into different rows.
for clusteri, sample in zip(clusterid, samples):
    segments[clusteri].append(sample)
def put_partition(
    self,
    metric,
    linkage_method,
    nclasses,
    names,
    criterion='distance',
    prune=True,
):
    """Compute a partition into ``nclasses`` groups and store it.

    The cluster assignment is stored in ``self.partitions`` under the key
    ``(metric, linkage_method, nclasses)``. Nothing is returned.

    Args:
        metric: key into ``self.distance_matrices``.
        linkage_method: ``'kmedoids'``, ``'MDS'``, ``'spectral'`` and
            ``'affinity'`` select dedicated branches. Any other value is
            passed to ``linkage`` as the linkage method.
        nclasses: number of groups requested. ``1`` short-circuits to a
            single group labelled 1.
        names: item names; used for the size of the trivial partition and
            stored in ``self.plotting_info`` on the linkage branch.
        criterion: criterion forwarded to ``fcluster`` on the linkage
            branch.
            'maxclust' - set threshold as (maximum) number of groups to
                         cluster into
            'distance' - set threshold as cutpoint at which to separate
                         dendrogram into clusters (threshold in range
                         float(0,1))
            NOTE(review): ``threshold`` is only assigned when
            ``criterion == 'distance'``; any other criterion looks like it
            would fail on an unbound name - confirm.
        prune: forwarded to ``self.spectral`` on the spectral branch.
    """
    dm = self.distance_matrices[metric]
    compound_key = (metric, linkage_method, nclasses)
    # Trivial case: everything in one group, stored without reordering.
    if nclasses == 1:
        T = [1] * len(names)
        self.partitions[compound_key] = T
        return
    if linkage_method == 'kmedoids':
        # 100 independent runs; keep the labels of the run whose second
        # result element (the error) is smallest.
        p = []
        for i in range(100):
            p.append(kmedoids(dm, nclusters=nclasses, npass=100))
        T = sorted(p, key=lambda x: x[1])[0][0]
    elif linkage_method == 'MDS':
        # Double-centre, eigendecompose, take coordinates via
        # get_coords_by_cutoff with the value 95, then run KMeans.
        dbc = self.get_double_centre(dm)
        (eigvals, eigvecs, cve) = self.get_eigen(dbc, standardize=True)
        coords = self.get_coords_by_cutoff(eigvals, eigvecs, cve, 95,
                                           normalise=False)
        est = KMeans(n_clusters=nclasses)
        est.fit(coords)
        T = est.labels_
    elif linkage_method == 'spectral':
        # Eigendecompose the matrix returned by self.spectral, take
        # coordinates via get_coords_by_dimension, then run KMeans.
        laplacian = self.spectral(dm, prune=prune)
        (eigvals, eigvecs, cve) = self.get_eigen(laplacian,
                                                 standardize=False)
        coords = self.get_coords_by_dimension(eigvals, eigvecs, cve,
                                              nclasses, normalise=True)[0]
        est = KMeans(n_clusters=nclasses)
        est.fit(coords)
        T = est.labels_
    elif linkage_method == 'affinity':
        T = self.affinity_propagation(dm, metric, nclasses)
    else:
        if metric == 'sym':
            # Rebuild the matrix with small normal noise (sd 0.001) added
            # symmetrically to every off-diagonal pair. When dm + eps is
            # not positive, eps is subtracted instead.
            # presumably this breaks ties between equal distances - confirm
            size = len(dm)
            new = np.zeros((size, size))
            for i in range(size):
                for j in range(i + 1, size):
                    eps = np.random.normal(0, 0.001)
                    if dm[i, j] + eps > 0:
                        new[i, j] = new[j, i] = dm[i, j] + eps
                    else:
                        new[i, j] = new[j, i] = dm[i, j] - eps
            dm = new
        linkmat = linkage(dm, linkage_method)
        if criterion == 'distance':
            # Threshold is the midpoint of two column-2 entries of the
            # linkage matrix (assumed to be merge distances, as in SciPy's
            # linkage format - confirm).
            linkmat_size = len(linkmat)
            # nclasses == 1 already returned above, so the first branch is
            # only reached for nclasses <= 0.
            if nclasses <= 1:
                br_top = linkmat[linkmat_size - nclasses][2]
            else:
                br_top = linkmat[linkmat_size - nclasses + 1][2]
            if nclasses >= len(linkmat):
                br_bottom = 0
            else:
                br_bottom = linkmat[linkmat_size - nclasses][2]
            threshold = 0.5 * (br_top + br_bottom)
        # NOTE(review): indentation was lost in this view; these two
        # statements are placed after the `if criterion` block - confirm.
        T = fcluster(linkmat, threshold, criterion=criterion)
        self.plotting_info[compound_key] = (linkmat, names, threshold)
    T = self.order(T)  # puts cluster labels in ascending order
    self.partitions[compound_key] = T
def clustering(self, nclusters=20, min_elements=None):
    """Cluster ``self.dist_matrix`` with k-medoids and store the labels.

    Args:
        nclusters: number of clusters to form.
        min_elements: accepted but not used by this method.

    Side effects:
        Sets ``self.ids`` to the cluster assignment from ``kmedoids``.
    """
    labels, _error, _nfound = kmedoids(
        self.dist_matrix, nclusters=nclusters, npass=20)
    self.ids = labels
# Small Bio.Cluster demo: build a distance matrix from four rows of data,
# cluster it with k-medoids, and print the result.
from Bio.Cluster import distancematrix, kmedoids
import numpy as np

data = np.array([
    [0, 1, 2, 3],
    [4, 5, 6, 7],
    [8, 9, 10, 11],
    [1, 2, 3, 4],
])

# Distance matrix with the library's default settings; not used below.
matrix = distancematrix(data)
# (author's aside, translated: "what, why isn't the Korean/English key working")
# Distance matrix with dist='e' (presumably Euclidean - confirm against the
# Bio.Cluster documentation); this is the one that gets clustered.
distances = distancematrix(data, dist='e')

# kmedoids is called with its default nclusters and npass.
clusterid, error, nfound = kmedoids(distances)
print("clusterid:", clusterid, "error:", error, "nfound:", nfound)
def main():
    """Command-line entry point: cluster the models of a PDB trajectory.

    Parses the command line, loads the PDB file with ``Bio.PDB.PDBParser``,
    computes a pairwise RMSD matrix between models and, depending on the
    flags, runs hierarchical clustering and/or k-medoids clustering.

    Side effects:
        Prints a summary and the clustering results to stdout. Writes
        ``<pdb>.hierarchical_clustering_analysis.dat`` and/or
        ``<pdb>.k_medoids_analysis.dat`` with one ``'%i %i'`` line per
        entry. Exits through ``parser.error`` when an analysis is requested
        with a measure type other than ``rmsd``.
    """
    import argparse
    import textwrap

    parser = argparse.ArgumentParser(
        usage='Use "python %(prog)s --help" for more information',
        formatter_class=argparse.RawTextHelpFormatter,
        description=textwrap.dedent('''\
            This program employs Bio.Cluster to run clustering analysis on
            given PDB files containing a trajectory of models.'''))
    parser.add_argument('pdb', help='[required] input PDB file.')
    parser.add_argument(
        '--measure-type', default='rmsd', type=str,
        help=textwrap.dedent('''\
            Now, the program only supports RMSD.
            The program will calculate a matrix of scores of this measure
            type, in which each score stands for the structural difference
            between two models in the PDB structure.'''))
    parser.add_argument(
        '--rmsd-selection', default='CA', type=str,
        help=textwrap.dedent('''\
            This option specifies the selection used to align the models and
            calculate the RMSD values. Available selections are:
            'CA' (only select CA atoms from the structures),
            'backbone' (only select backbone atoms: N CA C),
            'backbone_O' (only select backbone atoms: N CA C O),
            'all' (select all protein atoms).
            The default value is CA.'''))
    parser.add_argument(
        '--hierarchical-clustering-analysis', default=False,
        action='store_true',
        help='If turned on, perform Hierarchical Clustering analysis.')
    parser.add_argument(
        '--hierarchical-clustering-analysis-linkage-type',
        default='average', type=str,
        help=textwrap.dedent('''\
            The following are methods for calculating the distance between
            the newly formed cluster u and each v.
            - method='single' assigns d(u,v)=min(dist(u[i],v[j])) for all
            points i in cluster u and j in cluster v. This is also known as
            the Nearest Point Algorithm.
            - method='complete' assigns d(u,v)=max(dist(u[i],v[j])) for all
            points i in cluster u and j in cluster v. This is also known by
            the Farthest Point Algorithm or Voor Hees Algorithm.
            - method='average' assigns d(u,v)=Sum_{ij}d(u[i],v[j])(|u|*|v|)
            for all points i and j where |u| and |v| are the cardinalities
            of clusters u and v, respectively.
            This is also called the UPGMA algorithm.
            '''))
    parser.add_argument(
        '--hierarchical-clustering-analysis-fcluster-threshold',
        default=0.5, type=float,
        help=textwrap.dedent('''\
            The threshold to apply when forming flat clusters.
            Here, the actual threshold used will be
            dist_mat.max() * fcluster_threshold.
            The criterion to use in forming flat clusters is set 'distance'
            so that the original observations in each flat cluster have no
            greater a cophenetic distance than the threshold.
            '''))
    parser.add_argument(
        '--k-medoids-analysis', default=False, action='store_true',
        help='If turned on, perform K-Medoids analysis.')
    parser.add_argument(
        '--k-medoids-analysis-nclusters', default=2, type=int,
        help='The number of clusters in K-Medoids analysis. '
             'The default value is 2.')
    #parser.add_argument('--pca-analysis', default=False, action='store_true',
    #    help = 'If turned on, perform Principal Component Analysis (PCA) analysis.')
    args = parser.parse_args()

    structure = Bio.PDB.PDBParser(QUIET=True).get_structure(
        'structure', args.pdb)
    nmodel = len(structure)
    # Count residues of the first model whose id starts with a blank field.
    nres = sum(1 for res in structure[0].get_residues()
               if res.get_id()[0] == ' ')

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('=' * 60)
    print('%i models detected in the pdb file.' % nmodel)
    print('%i residues in the model' % nres)
    print('=' * 60)

    if args.measure_type == 'rmsd':
        distance_matrix = calculate_RMSD_matrix(
            structure=structure, selection=args.rmsd_selection)
    elif args.hierarchical_clustering_analysis or args.k_medoids_analysis:
        # No distance matrix exists for other measure types, so a requested
        # analysis cannot run; report it instead of failing on an undefined
        # name.
        parser.error('unsupported --measure-type %r: only "rmsd" is '
                     'implemented' % args.measure_type)

    if args.hierarchical_clustering_analysis:
        fc = hierarchical_clustering_analysis(
            distance_matrix,
            linkage_type=args.hierarchical_clustering_analysis_linkage_type,
            fcluster_threshold=(
                args.hierarchical_clustering_analysis_fcluster_threshold))
        print('hierarchical_clustering_analysis')
        print(fc)
        with open(args.pdb + '.hierarchical_clustering_analysis.dat',
                  'w') as f:
            for tp in fc:
                # Same text as the old `print >> f`: the formatted pair
                # followed by a newline.
                f.write('%i %i\n' % (tp[0], tp[1]))

    if args.k_medoids_analysis:
        from Bio.Cluster import kmedoids
        # Reference notes for kmedoids, kept from the original source:
        #
        #   kmedoids(distance, nclusters=2, npass=1, initialid=None)
        #
        # Arguments:
        #  - distance: the distance matrix between the elements, given as
        #      1. a 2D Numerical Python array (only the left-lower part is
        #         accessed);
        #      2. a 1D Numerical Python array containing the distances
        #         consecutively;
        #      3. a list of rows containing the lower-triangular part of
        #         the distance matrix.
        #    Examples (all three describe the same distance matrix):
        #      distance = array([[0.0, 1.1, 2.3],
        #                        [1.1, 0.0, 4.5],
        #                        [2.3, 4.5, 0.0]])          (option #1)
        #      distance = array([1.1, 2.3, 4.5])            (option #2)
        #      distance = [array([]),
        #                  array([1.1]),
        #                  array([2.3, 4.5])]               (option #3)
        #  - nclusters: number of clusters (the 'k' in k-medoids)
        #  - npass: the number of times the k-medoids clustering algorithm
        #    is performed, each time with a different (random) initial
        #    condition.
        #  - initialid: the initial clustering from which the algorithm
        #    should start. If initialid is not given, the routine carries
        #    out npass repetitions of the EM algorithm, each time starting
        #    from a different random initial clustering. If initialid is
        #    given, the routine carries out the EM algorithm only once,
        #    starting from the initial clustering specified by initialid
        #    and without randomizing the order in which items are assigned
        #    to clusters (i.e., using the same order as in the data
        #    matrix). In that case, the k-means algorithm is fully
        #    deterministic.
        #
        # Return values:
        #  - clusterid: array containing the number of the cluster to which
        #    each gene/microarray was assigned in the best k-means
        #    clustering solution that was found in the npass runs;
        #  - error: the within-cluster sum of distances for the returned
        #    k-means clustering solution;
        #  - nfound: the number of times this solution was found.
        clusterid, error, nfound = kmedoids(
            distance=distance_matrix,
            nclusters=args.k_medoids_analysis_nclusters)
        print('k_medoids_analysis')
        print(clusterid)
        print(cluster_centroid_size(clusterid))
        with open(args.pdb + '.k_medoids_analysis.dat', 'w') as f:
            for tp in cluster_centroid_size(clusterid):
                f.write('%i %i\n' % (tp[0], tp[1]))
def test_distancematrix_kmedoids(self):
    """Check ``distancematrix`` and ``kmedoids`` against reference values.

    The functions come from whichever backend ``TestCluster.module`` names
    ('Bio.Cluster' or 'Pycluster'). The lower triangle of the weighted,
    masked distance matrix is compared entry by entry to three decimal
    places, then the k-medoids labels and error are checked.
    """
    if TestCluster.module == 'Bio.Cluster':
        from Bio.Cluster import distancematrix, kmedoids
    elif TestCluster.module == 'Pycluster':
        from Pycluster import distancematrix, kmedoids

    data = numpy.array([[2.2, 3.3, 4.4],
                        [2.1, 1.4, 5.6],
                        [7.8, 9.0, 1.2],
                        [4.5, 2.3, 1.5],
                        [4.2, 2.4, 1.9],
                        [3.6, 3.1, 9.3],
                        [2.3, 1.2, 3.9],
                        [4.2, 9.6, 9.3],
                        [1.7, 8.9, 1.1]])
    mask = numpy.array([[1, 1, 1],
                        [1, 1, 1],
                        [0, 1, 1],
                        [1, 1, 1],
                        [1, 1, 1],
                        [0, 1, 0],
                        [1, 1, 1],
                        [1, 0, 1],
                        [1, 1, 1]], int)
    weight = numpy.array([2.0, 1.0, 0.5])
    matrix = distancematrix(data, mask=mask, weight=weight)

    # expected_lower[r] lists matrix[r + 1][0 .. r]; entries are checked
    # row by row, left to right.
    expected_lower = [
        [1.243],
        [25.073, 44.960],
        [4.510, 5.924, 29.957],
        [3.410, 4.761, 29.203, 0.077],
        [0.040, 2.890, 34.810, 0.640, 0.490],
        [1.301, 0.447, 42.990, 3.934, 3.046, 3.610],
        [8.002, 6.266, 65.610, 12.240, 10.952, 0.000, 8.720],
        [10.659, 19.056, 0.010, 16.949, 15.734, 33.640, 18.266, 18.448],
    ]
    for row_index, expected_row in enumerate(expected_lower, start=1):
        for col_index, expected_value in enumerate(expected_row):
            self.assertAlmostEqual(matrix[row_index][col_index],
                                   expected_value, places=3)

    clusterid, error, nfound = kmedoids(matrix, npass=1000)

    # Expected label for each of the nine items, in order.
    expected_labels = [5, 5, 2, 5, 5, 5, 5, 5, 2]
    for item_index, expected_label in enumerate(expected_labels):
        self.assertEqual(clusterid[item_index], expected_label)
    self.assertAlmostEqual(error, 7.680, places=3)