def k_medoids_clustering(dist_array, k_clusters):
    """Run k-medoids on a distance matrix and return the cluster assignments.

    ``kmedoids`` is called with ``nclusters=k_clusters`` and ``npass=100``.
    It returns a ``(clusterid, error, nfound)`` triple; only ``clusterid``
    is handed back to the caller.
    """
    assignments, _error, _nfound = kmedoids(
        dist_array, nclusters=k_clusters, npass=100)
    return assignments
예제 #2
0
def pipe2():
	"""Sample 1000 items, build the distance matrix and run k-medoids with 5 clusters."""
	p = init()
	p.sampling(1000, random_sampling=True)
	# NOTE(review): the distance matrix is computed here and again right after
	# seq_span is set; the first call looks redundant unless it has side
	# effects the second one relies on -- confirm before removing.
	p.calc_distance_matrix()
	p.seq_span = 80
	p.calc_distance_matrix()

	nclusters = 5
	# clusterid and error are bound but not used in the visible part of this
	# function; the third return value (nfound) is discarded.
	clusterid, error, _ = kmedoids(p.dist_matrix, nclusters=nclusters, npass=20)
예제 #3
0
파일: clustering.py 프로젝트: kgori/treeCl
    def cluster(self, nclusters, noise=False, npass=100, nreps=1):
        """Run k-medoids `nreps` times and keep the lowest-error result.

        The distance matrix comes from ``self.get_dm(noise)``. Each repeat
        calls ``kmedoids`` with `nclusters` clusters and `npass` passes; the
        cluster assignment of the repeat with the smallest error is wrapped
        in a ``Partition`` and returned.

        Prints a message and returns None when Biopython is unavailable.
        """
        if Biopython_Unavailable:
            print('kmedoids not available without Biopython')
            return None

        matrix = self.get_dm(noise)

        runs = []
        for _ in range(nreps):
            runs.append(kmedoids(matrix, nclusters=nclusters, npass=npass))

        # Each run is (clusterid, error, nfound); order by error, ascending.
        ranked = sorted(runs, key=lambda run: run[1])
        best_assignment = ranked[0][0]
        return Partition(best_assignment)
예제 #4
0
    def run_kmedoids(self, dm, nclusters):
        """Return the ordered cluster labels of the best of 100 k-medoids runs.

        When ``dm.metric`` is 'rf' the matrix is first passed through
        ``dm.add_noise``; otherwise ``dm.matrix`` is used as is. Each run
        uses ``npass=100``; the run with the smallest error wins and its
        assignment is passed through ``self.order``.
        """
        matrix = dm.add_noise(dm.matrix) if dm.metric == 'rf' else dm.matrix

        attempts = []
        for _ in range(100):
            attempts.append(kmedoids(matrix, nclusters=nclusters, npass=100))

        # Each attempt is (clusterid, error, nfound); lowest error first.
        attempts.sort(key=lambda attempt: attempt[1])
        best_assignment = attempts[0][0]
        return self.order(best_assignment)
예제 #5
0
    def run_kmedoids(self, dm, nclusters):
        """Cluster `dm` with k-medoids (100 repeats) and return ordered labels.

        For the 'rf' metric the matrix is the output of
        ``dm.add_noise(dm.matrix)``; for any other metric it is ``dm.matrix``.
        Every repeat runs ``kmedoids`` with ``npass=100``. The assignment
        with the lowest error is returned after ``self.order`` is applied.
        """
        if dm.metric == 'rf':
            matrix = dm.add_noise(dm.matrix)
        else:
            matrix = dm.matrix

        results = []
        for _ in range(100):
            results.append(kmedoids(matrix, nclusters=nclusters, npass=100))

        # Tuples are (clusterid, error, nfound); index 1 is the error.
        by_error = sorted(results, key=lambda result: result[1])
        return self.order(by_error[0][0])
예제 #6
0
    def kmedoids(self, nclusters, noise=False, npass=100, nreps=1):
        """Return the best k-medoids partition found over `nreps` repeats.

        The matrix is obtained from ``self.get_dm(noise)``. Every repeat
        calls the module-level ``kmedoids`` function with `nclusters` and
        `npass`; the assignment with the smallest error is returned as a
        ``Partition``.

        Prints a message and returns None when Biopython is unavailable.
        """
        if Biopython_Unavailable:
            print('kmedoids not available without Biopython')
            return None

        matrix = self.get_dm(noise)

        repeats = []
        for _ in range(nreps):
            repeats.append(kmedoids(matrix, nclusters=nclusters, npass=npass))

        # Sort the (clusterid, error, nfound) tuples by their error term.
        repeats = sorted(repeats, key=lambda rep: rep[1])
        winner = repeats[0]
        return Partition(winner[0])
예제 #7
0
    def kmedoids(self, nclusters, noise=False):
        """Return the best k-medoids partition found over 100 repeats.

        nclusters - number of clusters passed to the ``kmedoids`` function
        noise     - when true, cluster ``self.distance_matrix.add_noise()``
                    instead of ``self.distance_matrix``

        Each repeat runs ``kmedoids`` with ``npass=100``. The assignment
        with the smallest error is returned wrapped in a ``Partition``.
        Prints a message and returns None when Biopython is unavailable.
        """
        if Biopython_Unavailable:
            # Parenthesised single-argument form: prints the same text on
            # Python 2 and is valid syntax on Python 3.
            print('kmedoids not available without Biopython')
            return None

        if noise:
            matrix = self.distance_matrix.add_noise()
        else:
            matrix = self.distance_matrix

        runs = [kmedoids(matrix, nclusters=nclusters, npass=100)
                for _ in range(100)]
        # Each run is (clusterid, error, nfound). min() picks the first
        # lowest-error run, which is what a stable sort followed by [0] gave.
        best = min(runs, key=lambda run: run[1])
        return Partition(best[0])
예제 #8
0
    def kmedoids(self, nclusters, noise=False):
        """Run k-medoids 100 times and return the lowest-error partition.

        nclusters - number of clusters passed to the ``kmedoids`` function
        noise     - when true, the matrix is
                    ``self.distance_matrix.add_noise()``; otherwise it is
                    ``self.distance_matrix``

        Every repeat uses ``npass=100``. Returns a ``Partition`` built from
        the winning assignment, or None (after printing a message) when
        Biopython is unavailable.
        """
        if Biopython_Unavailable:
            # print(...) with one argument behaves identically on Python 2
            # and, unlike the print statement, also parses on Python 3.
            print('kmedoids not available without Biopython')
            return None

        if noise:
            matrix = self.distance_matrix.add_noise()
        else:
            matrix = self.distance_matrix

        runs = [
            kmedoids(matrix, nclusters=nclusters, npass=100)
            for _ in range(100)
        ]
        # Tuples are (clusterid, error, nfound). min() returns the first
        # minimum, matching the previous stable sort + first element.
        best = min(runs, key=lambda run: run[1])
        return Partition(best[0])
예제 #9
0
def pipe1():
	"""Plot the k-medoids error against the cluster count for 2..9 clusters.

	Builds an HLAProcessor on two FASTQ files, calls
	sampling(1000, random_sampling=True), sets seq_span to 80 and computes
	the distance matrix. For every cluster count the `nfound` value returned
	by kmedoids is printed and the error is collected for the plot.
	"""
	processor = HLAProcessor(
		'data/child10_S4_L001_R1_001.fastq',
		'data/child10_S4_L001_R2_001.fastq',
	)

	processor.sampling(1000, random_sampling=True)
	processor.seq_span = 80
	processor.calc_distance_matrix()

	cluster_counts = list(range(2, 10))
	errors = []
	for count in cluster_counts:
		# kmedoids returns (clusterid, error, nfound); the ids are not needed.
		_, run_error, times_found = kmedoids(
			processor.dist_matrix, nclusters=count, npass=10)
		print(times_found)
		errors.append(run_error)

	plt.plot(cluster_counts, errors)
	plt.show()
def _windowed_mean(values, window):
    """Return, for every index i, the mean of values[max(0, i - 1):i + window]."""
    size = len(values)
    smoothed = []
    for i in range(size):
        frag = values[max(0, i - 1):min(i + window, size)]
        smoothed.append(sum(frag) / len(frag))
    return smoothed


def elbow_clustering(distanceMatrix, elbowErr):
    """Choose a cluster count with an elbow heuristic on the k-medoids error.

    k-medoids (``npass=100``) is run for every k in ``2 .. N - 1``, where
    ``N`` is read from module scope. The per-k error is smoothed and plotted
    to ``output/<pdb_id>_elbow_plot.png`` (``pdb_id`` is also a module-level
    name). The absolute differences between consecutive smoothed errors are
    smoothed again, and the first k whose smoothed difference is at most
    ``mean(smoothed differences) * elbowErr`` is selected. If no k
    qualifies, the largest k tried (``N - 1``) is used.

    Parameters:
        distanceMatrix: distance matrix forwarded to ``kmedoids``.
        elbowErr: multiplier applied to the mean smoothed difference to get
            the elbow threshold.

    Returns:
        (cluster, k, representatives): the ``kmedoids`` cluster assignment
        for the selected k, the selected k, and the distinct values of that
        assignment in order of first appearance.

    Raises:
        ValueError: if ``N - 2 < 1``, i.e. there is no k to try.
    """
    maxCluster = N - 2
    if maxCluster < 1:
        # Previously N == 2 fell through to an UnboundLocalError on `k`.
        raise ValueError(
            "elbow_clustering needs N >= 3 to try any cluster count "
            "(got N={!r})".format(N))

    # kmedoids returns (clusterid, error, nfound); keep ids and errors per k.
    clusters = []
    errors = np.zeros(maxCluster)
    for idx, k in enumerate(range(2, maxCluster + 2)):
        clusterid, errors[idx], _nfound = kmedoids(
            distance=distanceMatrix, nclusters=int(k), npass=100)
        clusters.append(clusterid)

    errorSmooth = _windowed_mean(errors, 5)
    plt.plot(errorSmooth)
    plt.xlabel("Number of clusters")
    plt.ylabel("Smoothed error")
    plt.title("Elbow plot")
    plt.savefig("output/{}_elbow_plot.png".format(pdb_id))

    # Magnitude of the drop in smoothed error between consecutive k values.
    inertia = [np.abs(errorSmooth[i] - errorSmooth[i + 1])
               for i in range(maxCluster - 1)]
    inertia_smooth = _windowed_mean(inertia, 10)

    # Explicit fallback: the largest k tried. The original relied on the
    # loop variable `k` leaking out of the k-medoids loop for this.
    k = maxCluster + 1
    if inertia_smooth:
        cutoff = np.mean(inertia_smooth) * elbowErr  # loop-invariant
        for i, drop in enumerate(inertia_smooth):
            if drop <= cutoff:
                k = i + 2
                break

    cluster = clusters[k - 2]

    # Distinct assignment values, first-seen order, in a single pass.
    seen = set()
    representatives = []
    for rep in cluster:
        if rep not in seen:
            seen.add(rep)
            representatives.append(rep)
    return cluster, k, representatives
예제 #11
0
    return filter_array
    
def plots_outlier(samples, outlier_x):
    """Plot the outlier points on the module-level ``ax`` using the 'or' format.

    `outlier_x` holds 1-based positions; the y value for position x is
    ``samples[x - 1]``.
    """
    outlier_y = []
    for position in outlier_x:
        outlier_y.append(samples[position - 1])

    ax.plot(outlier_x, outlier_y, 'or')


# Keep a reference to the raw values before `samples` is rebound below.
Y = samples
# Pair every value with its 1-based position: (position, value).
samples = [(x, sample) for x, sample in enumerate(samples, start=1)]

distance =  distancematrix(samples)



# kmedoids returns (clusterid, error, nfound); `groups` is defined outside
# this excerpt.
clusterid, error, nfound = kmedoids(distance, nclusters=groups, npass=10)

clusterid = majority_filter(clusterid)

# The bare string below is a no-op: disabled k-means code kept as a literal
# (its first line reads "run k-means on the data").
"""
#對資料做kmeans    
input = numpy.array(samples)
whitened = whiten(input)    
___, labels, ___ = k_means(X = whitened, n_clusters = groups)
"""

segments = [list() for i in range(len(clusterid)+1)] # create new, independent lists (the original note said "five"; the count here is len(clusterid) + 1)


# Put the data of each cluster into its own list
for clusteri, sample in zip(clusterid, samples):
예제 #12
0
    filter_array = [find_majority(array, index) for index in range(len(array))]
    return filter_array


def plots_outlier(samples, outlier_x):
    """Draw the outliers on the module-level ``ax`` with the 'or' format string.

    Each entry of `outlier_x` is a 1-based position into `samples`, so the
    matching y value is ``samples[x - 1]``.
    """
    heights = []
    for x_pos in outlier_x:
        heights.append(samples[x_pos - 1])

    ax.plot(outlier_x, heights, 'or')


# Keep a reference to the raw values before `samples` is rebound below.
Y = samples
# Pair every value with its 1-based position: (position, value).
samples = [(x, sample) for x, sample in enumerate(samples, start=1)]

distance = distancematrix(samples)

# kmedoids returns (clusterid, error, nfound); `groups` is defined outside
# this excerpt.
clusterid, error, nfound = kmedoids(distance, nclusters=groups, npass=10)

clusterid = majority_filter(clusterid)
# The bare string below is a no-op: disabled k-means code kept as a literal
# (its first line reads "run k-means on the data").
"""
#對資料做kmeans    
input = numpy.array(samples)
whitened = whiten(input)    
___, labels, ___ = k_means(X = whitened, n_clusters = groups)
"""

segments = [list() for i in range(len(clusterid) + 1)]  # create new, independent lists (the original note said "five"; the count here is len(clusterid) + 1)

# Put the data of each cluster into its own list.
# NOTE(review): this indexes `segments` by cluster id, so ids are presumably
# item indices below len(clusterid) + 1 -- confirm against majority_filter.
for clusteri, sample in zip(clusterid, samples):
    segments[clusteri].append(sample)
    def put_partition(
        self,
        metric,
        linkage_method,
        nclasses,
        names,
        criterion='distance',
        prune=True,
    ):
        """Compute a partition into `nclasses` groups and store it on self.

        The cluster assignments are written to
        ``self.partitions[(metric, linkage_method, nclasses)]``; the method
        itself returns None.

        metric         - key into ``self.distance_matrices``
        linkage_method - 'kmedoids', 'MDS', 'spectral' or 'affinity'; any
                         other value is passed to ``linkage`` as the
                         hierarchical linkage method
        nclasses       - number of groups requested; 1 short-circuits to a
                         single all-ones partition
        names          - item labels; sizes the single-class partition and
                         is stored in ``self.plotting_info`` for
                         hierarchical runs
        criterion      - hierarchical clustering only:
                         'distance' - cut the dendrogram midway between the
                                      two merge heights that bracket
                                      `nclasses` groups
                         otherwise  - (e.g. 'maxclust') `nclasses` itself is
                                      used as the ``fcluster`` threshold
        prune          - forwarded to ``self.spectral``
        """

        dm = self.distance_matrices[metric]
        compound_key = (metric, linkage_method, nclasses)
        if nclasses == 1:
            # Trivial partition; stored without going through self.order.
            T = [1] * len(names)
            self.partitions[compound_key] = T
            return

        if linkage_method == 'kmedoids':
            runs = [kmedoids(dm, nclusters=nclasses, npass=100)
                    for _ in range(100)]
            # Each run is (clusterid, error, nfound); keep the assignment of
            # the first lowest-error run.
            T = min(runs, key=lambda run: run[1])[0]
        elif linkage_method == 'MDS':

            dbc = self.get_double_centre(dm)
            (eigvals, eigvecs, cve) = self.get_eigen(dbc, standardize=True)
            coords = self.get_coords_by_cutoff(eigvals,
                                               eigvecs,
                                               cve,
                                               95,
                                               normalise=False)
            est = KMeans(n_clusters=nclasses)
            est.fit(coords)
            T = est.labels_
        elif linkage_method == 'spectral':

            laplacian = self.spectral(dm, prune=prune)
            (eigvals, eigvecs, cve) = self.get_eigen(laplacian,
                                                     standardize=False)
            coords = self.get_coords_by_dimension(eigvals,
                                                  eigvecs,
                                                  cve,
                                                  nclasses,
                                                  normalise=True)[0]
            est = KMeans(n_clusters=nclasses)
            est.fit(coords)
            T = est.labels_
        elif linkage_method == 'affinity':

            T = self.affinity_propagation(dm, metric, nclasses)
        else:
            if metric == 'sym':
                # Jitter every off-diagonal distance with N(0, 0.001) noise,
                # flipping the sign of the noise if it would make the
                # distance non-positive.
                size = len(dm)
                new = np.zeros((size, size))
                for i in range(size):
                    for j in range(i + 1, size):
                        eps = np.random.normal(0, 0.001)
                        if dm[i, j] + eps > 0:
                            new[i, j] = new[j, i] = dm[i, j] + eps
                        else:
                            new[i, j] = new[j, i] = dm[i, j] - eps
                dm = new
            # NOTE(review): dm is passed to linkage as is; if it is a square
            # distance matrix, scipy treats its rows as observations --
            # confirm this is intended.
            linkmat = linkage(dm, linkage_method)

            if criterion == 'distance':
                linkmat_size = len(linkmat)
                if nclasses <= 1:
                    br_top = linkmat[linkmat_size - nclasses][2]
                else:
                    br_top = linkmat[linkmat_size - nclasses + 1][2]
                if nclasses >= len(linkmat):
                    br_bottom = 0
                else:
                    br_bottom = linkmat[linkmat_size - nclasses][2]
                threshold = 0.5 * (br_top + br_bottom)
            else:
                # 'maxclust' and friends: the threshold is the requested
                # number of groups. Previously `threshold` was left unbound
                # on this path and fcluster was never reached.
                threshold = nclasses
            T = fcluster(linkmat, threshold, criterion=criterion)
            self.plotting_info[compound_key] = (linkmat, names, threshold)

        T = self.order(T)  # puts cluster labels in ascending order
        self.partitions[compound_key] = T
예제 #14
0
 def clustering(self, nclusters=20, min_elements=None):
     """Run k-medoids (npass=20) on ``self.dist_matrix`` and store the ids.

     The cluster assignment is saved to ``self.ids``; the error and nfound
     values returned by ``kmedoids`` are discarded. `min_elements` is not
     used in the visible body.
     """
     assignment, _error, _nfound = kmedoids(
         self.dist_matrix, nclusters=nclusters, npass=20)
     self.ids = assignment
예제 #15
0
from Bio.Cluster import kmedoids
from Bio.Cluster import distancematrix
import numpy as np
# Four items (rows) with four values each.
data = np.array([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [1, 2, 3, 4]])
# Distance matrix with the default distance; `matrix` is not used below.
matrix = distancematrix(data)
# (stray author note: the Korean/English input toggle key was not working)
distances = distancematrix(data, dist='e')
# kmedoids is called with its default nclusters and npass.
clusterid, error, nfound = kmedoids(distances)
print("clusterid:", clusterid, "error:", error, "nfound:", nfound)
def main():
    """Command-line entry point: cluster the models of a PDB trajectory.

    Parses the command line, loads the PDB file with ``Bio.PDB.PDBParser``,
    builds the pairwise RMSD matrix with ``calculate_RMSD_matrix`` and then
    runs whichever analyses were switched on:

    * ``--hierarchical-clustering-analysis`` writes
      ``<pdb>.hierarchical_clustering_analysis.dat``
    * ``--k-medoids-analysis`` writes ``<pdb>.k_medoids_analysis.dat``

    Each output line holds the first two fields of one entry of the analysis
    result, formatted as ``'%i %i'``. An unsupported ``--measure-type`` ends
    the program with an argparse usage error.
    """
    import argparse
    import textwrap
    parser = argparse.ArgumentParser(
        usage='Use "python %(prog)s --help" for more information',
        formatter_class=argparse.RawTextHelpFormatter,
        description=textwrap.dedent('''\
            This program employs Bio.Cluster to run clustering analysis 
            on given PDB files containing a trajectory of models.'''))
    parser.add_argument('pdb', help='[required] input PDB file.')
    parser.add_argument('--measure-type',
                        default='rmsd',
                        type=str,
                        help=textwrap.dedent('''\
            Now, the program only supports RMSD.
            The program will calculate a matrix of scores of this measure type, in which each 
            score stands for the structural difference between two models in the PDB structure.'''
                                             ))
    parser.add_argument('--rmsd-selection',
                        default='CA',
                        type=str,
                        help=textwrap.dedent('''\
            This option specifies the selection used to align the models and calculate the RMSD values.
            Available selections are: 
            'CA' (only select CA atoms from the structures), 
            'backbone' (only select backbone atoms: N CA C), 
            'backbone_O' (only select backbone atoms: N CA C O),
            'all' (select all protein atoms). 
            The default value is CA.'''))

    parser.add_argument(
        '--hierarchical-clustering-analysis',
        default=False,
        action='store_true',
        help='If turned on, perform Hierarchical Clustering analysis.')
    parser.add_argument('--hierarchical-clustering-analysis-linkage-type',
                        default='average',
                        type=str,
                        help=textwrap.dedent('''\
            The following are methods for calculating the distance between the newly formed cluster u and each v.
            - method='single' assigns 
                    d(u,v)=min(dist(u[i],v[j]))
            for all points i in cluster u and j in cluster v. 
            This is also known as the Nearest Point Algorithm.
            - method='complete' assigns 
                    d(u,v)=max(dist(u[i],v[j]))
            for all points i in cluster u and j in cluster v. 
            This is also known by the Farthest Point Algorithm or Voor Hees Algorithm.
            - method='average' assigns
                    d(u,v)=Sum_{ij}d(u[i],v[j])(|u|*|v|)
            for all points i and j where |u| and |v| are the cardinalities of clusters u and v, respectively. 
            This is also called the UPGMA algorithm. '''))
    parser.add_argument(
        '--hierarchical-clustering-analysis-fcluster-threshold',
        default=0.5,
        type=float,
        help=textwrap.dedent('''\
            The threshold to apply when forming flat clusters. 
            Here, the actual threshold used will be dist_mat.max() * fcluster_threshold. 
            The criterion to use in forming flat clusters is set 'distance' so that the original
            observations in each flat cluster have no greater a cophenetic distance than the threshold. '''
                             ))

    parser.add_argument('--k-medoids-analysis',
                        default=False,
                        action='store_true',
                        help='If turned on, perform K-Medoids analysis.')
    parser.add_argument(
        '--k-medoids-analysis-nclusters',
        default=2,
        type=int,
        help=
        'The number of clusters in K-Medoids analysis. The default value is 2.'
    )

    #parser.add_argument('--pca-analysis', default=False, action='store_true',
    #    help = 'If turned on, perform Principal Component Analysis (PCA) analysis.')

    args = parser.parse_args()

    # Fail fast with a usage error: previously an unsupported measure type
    # left `distance_matrix` unbound and surfaced later as a NameError.
    if args.measure_type != 'rmsd':
        parser.error('unsupported --measure-type %r; only "rmsd" is supported'
                     % args.measure_type)

    structure = Bio.PDB.PDBParser(QUIET=True).get_structure(
        'structure', args.pdb)
    nmodel = len(structure)
    # Count only residues whose hetero flag (first id field) is blank.
    nres = len(
        [res for res in structure[0].get_residues() if res.get_id()[0] == ' '])
    # Single-argument print(...) gives identical output on Python 2 and is
    # valid syntax on Python 3.
    print('=' * 60)
    print('%i models detected in the pdb file.' % nmodel)
    print('%i residues in the model' % nres)
    print('=' * 60)

    distance_matrix = calculate_RMSD_matrix(structure=structure,
                                            selection=args.rmsd_selection)

    if args.hierarchical_clustering_analysis:
        fc = hierarchical_clustering_analysis(
            distance_matrix,
            linkage_type=args.hierarchical_clustering_analysis_linkage_type,
            fcluster_threshold=args.
            hierarchical_clustering_analysis_fcluster_threshold)
        print('hierarchical_clustering_analysis')
        print(fc)
        with open(args.pdb + '.hierarchical_clustering_analysis.dat',
                  'w') as f:
            for tp in fc:
                f.write('%i %i\n' % (tp[0], tp[1]))

    if args.k_medoids_analysis:
        from Bio.Cluster import kmedoids
        # kmedoids(distance, nclusters=2, npass=1, initialid=None) returns
        # (clusterid, error, nfound): the cluster assignment of the best
        # solution, its within-cluster sum of distances, and how many times
        # that solution was found. npass is left at its default here.
        clusterid, error, nfound = kmedoids(
            distance=distance_matrix,
            nclusters=args.k_medoids_analysis_nclusters)
        # Computed once and reused for both the console and the file output.
        sizes = cluster_centroid_size(clusterid)
        print('k_medoids_analysis')
        print(clusterid)
        print(sizes)
        with open(args.pdb + '.k_medoids_analysis.dat', 'w') as f:
            for tp in sizes:
                f.write('%i %i\n' % (tp[0], tp[1]))
예제 #17
0
    def test_distancematrix_kmedoids(self):
        """Check the masked, weighted distance matrix and the k-medoids result.

        The functions under test are imported from whichever module
        ``TestCluster.module`` names ('Bio.Cluster' or 'Pycluster').
        """
        if TestCluster.module == 'Bio.Cluster':
            from Bio.Cluster import distancematrix, kmedoids
        elif TestCluster.module == 'Pycluster':
            from Pycluster import distancematrix, kmedoids

        data = numpy.array([[2.2, 3.3, 4.4],
                            [2.1, 1.4, 5.6],
                            [7.8, 9.0, 1.2],
                            [4.5, 2.3, 1.5],
                            [4.2, 2.4, 1.9],
                            [3.6, 3.1, 9.3],
                            [2.3, 1.2, 3.9],
                            [4.2, 9.6, 9.3],
                            [1.7, 8.9, 1.1]])
        mask = numpy.array([[1, 1, 1],
                            [1, 1, 1],
                            [0, 1, 1],
                            [1, 1, 1],
                            [1, 1, 1],
                            [0, 1, 0],
                            [1, 1, 1],
                            [1, 0, 1],
                            [1, 1, 1]], int)
        weight = numpy.array([2.0, 1.0, 0.5])
        matrix = distancematrix(data, mask=mask, weight=weight)

        # Expected lower triangle: entry k of the tuple for row r is the
        # distance between item r and item k (rows start at 1).
        expected_lower_triangle = [
            (1.243,),
            (25.073, 44.960),
            (4.510, 5.924, 29.957),
            (3.410, 4.761, 29.203, 0.077),
            (0.040, 2.890, 34.810, 0.640, 0.490),
            (1.301, 0.447, 42.990, 3.934, 3.046, 3.610),
            (8.002, 6.266, 65.610, 12.240, 10.952, 0.000, 8.720),
            (10.659, 19.056, 0.010, 16.949, 15.734, 33.640, 18.266, 18.448),
        ]
        for row, expected_row in enumerate(expected_lower_triangle, start=1):
            for col, expected_value in enumerate(expected_row):
                self.assertAlmostEqual(matrix[row][col], expected_value,
                                       places=3)

        clusterid, error, nfound = kmedoids(matrix, npass=1000)
        expected_ids = (5, 5, 2, 5, 5, 5, 5, 5, 2)
        for item, expected_id in enumerate(expected_ids):
            self.assertEqual(clusterid[item], expected_id)
        self.assertAlmostEqual(error, 7.680, places=3)
    def put_partition(
        self,
        metric,
        linkage_method,
        nclasses,
        names,
        criterion='distance',
        prune=True,
        ):
        """Compute a partition into `nclasses` groups and store it on self.

        The cluster assignments are written to
        ``self.partitions[(metric, linkage_method, nclasses)]``; the method
        itself returns None.

        metric         - key into ``self.distance_matrices``
        linkage_method - 'kmedoids', 'MDS', 'spectral' or 'affinity'; any
                         other value is passed to ``linkage`` as the
                         hierarchical linkage method
        nclasses       - number of groups requested; 1 short-circuits to a
                         single all-ones partition
        names          - item labels; sizes the single-class partition and
                         is stored in ``self.plotting_info`` for
                         hierarchical runs
        criterion      - hierarchical clustering only:
                         'distance' - cut the dendrogram midway between the
                                      two merge heights that bracket
                                      `nclasses` groups
                         otherwise  - (e.g. 'maxclust') `nclasses` itself is
                                      used as the ``fcluster`` threshold
        prune          - forwarded to ``self.spectral``
        """

        dm = self.distance_matrices[metric]
        compound_key = (metric, linkage_method, nclasses)
        if nclasses == 1:
            # Trivial partition; stored without going through self.order.
            T = [1] * len(names)
            self.partitions[compound_key] = T
            return

        if linkage_method == 'kmedoids':
            runs = [kmedoids(dm, nclusters=nclasses, npass=100)
                    for _ in range(100)]
            # Each run is (clusterid, error, nfound); keep the assignment of
            # the first lowest-error run.
            T = min(runs, key=lambda run: run[1])[0]
        elif linkage_method == 'MDS':

            dbc = self.get_double_centre(dm)
            (eigvals, eigvecs, cve) = self.get_eigen(dbc,
                    standardize=True)
            coords = self.get_coords_by_cutoff(eigvals, eigvecs, cve,
                    95, normalise=False)
            est = KMeans(n_clusters=nclasses)
            est.fit(coords)
            T = est.labels_
        elif linkage_method == 'spectral':

            laplacian = self.spectral(dm, prune=prune)
            (eigvals, eigvecs, cve) = self.get_eigen(laplacian,
                    standardize=False)
            coords = self.get_coords_by_dimension(eigvals, eigvecs,
                    cve, nclasses, normalise=True)[0]
            est = KMeans(n_clusters=nclasses)
            est.fit(coords)
            T = est.labels_
        elif linkage_method == 'affinity':

            T = self.affinity_propagation(dm, metric, nclasses)
        else:
            if metric == 'sym':
                # Jitter every off-diagonal distance with N(0, 0.001) noise,
                # flipping the sign of the noise if it would make the
                # distance non-positive.
                size = len(dm)
                new = np.zeros((size, size))
                for i in range(size):
                    for j in range(i + 1, size):
                        eps = np.random.normal(0, 0.001)
                        if dm[i, j] + eps > 0:
                            new[i, j] = new[j, i] = dm[i, j] + eps
                        else:
                            new[i, j] = new[j, i] = dm[i, j] - eps
                dm = new
            # NOTE(review): dm is passed to linkage as is; if it is a square
            # distance matrix, scipy treats its rows as observations --
            # confirm this is intended.
            linkmat = linkage(dm, linkage_method)

            if criterion == 'distance':
                linkmat_size = len(linkmat)
                if nclasses <= 1:
                    br_top = linkmat[linkmat_size - nclasses][2]
                else:
                    br_top = linkmat[linkmat_size - nclasses + 1][2]
                if nclasses >= len(linkmat):
                    br_bottom = 0
                else:
                    br_bottom = linkmat[linkmat_size - nclasses][2]
                threshold = 0.5 * (br_top + br_bottom)
            else:
                # 'maxclust' and friends: the threshold is the requested
                # number of groups. Previously `threshold` was left unbound
                # on this path and fcluster was never reached.
                threshold = nclasses
            T = fcluster(linkmat, threshold, criterion=criterion)
            self.plotting_info[compound_key] = (linkmat, names,
                    threshold)

        T = self.order(T)  # puts cluster labels in ascending order
        self.partitions[compound_key] = T