Example No. 1
def k_centers(X, n_clusters=8, metric='rmsd', random_state=None):
    """K-Centers clustering
    Cluster a vector or Trajectory dataset using a simple heuristic to minimize
    the maximum distance from any data point to its assigned cluster center.
    The runtime of this algorithm is O(kN), where k is the number of
    clusters and N is the size of the dataset, making it one of the least
    expensive clustering algorithms available.
    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features), or md.Trajectory
        The data to cluster.
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.
    metric : {"euclidean", "sqeuclidean", "cityblock", "chebyshev", "canberra",
              "braycurtis", "hamming", "jaccard", "cityblock", "rmsd"}
        The distance metric to use. metric = "rmsd" requires that sequences
        passed to ``fit()`` be ```md.Trajectory```; other distance metrics
        require ``np.ndarray``s.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    References
    ----------
    .. [1] Gonzalez, Teofilo F. "Clustering to minimize the maximum
       intercluster distance." Theor. Comput. Sci. 38 (1985): 293-306.
    .. [2] Beauchamp, Kyle A., et al. "MSMBuilder2: modeling conformational
       dynamics on the picosecond to millisecond scale." J. Chem. Theory.
       Comput. 7.10 (2011): 3412-3419.
    Returns
    -------
    cluster_centers_ : list of int
        Indices of the data points chosen as cluster centers.
    labels_ : array, [n_samples,]
        The label of each point is an integer in [0, n_clusters).
    """
    n_samples = len(X)
    if random_state == -1:
        seed = check_random_state(None).randint(0, n_samples)
    else:
        seed = random_state
    print("seed=", seed)
    cluster_centers_ = []
    cluster_centers_.append(seed)  # the first center is the (random) seed point
    distances_ = pairwise_distances(X, index=seed, metric=metric)
    labels_ = np.zeros(len(X), dtype=np.int32)

    for i in range(1, n_clusters):
        # set the point furthest from all existing centers as the new center
        max_index = np.argmax(distances_)
        cluster_centers_.append(max_index)

        if distances_[max_index] < 0:
            break

        new_distance_list = pairwise_distances(X, index=max_index, metric=metric)
        updated_indices = np.where(new_distance_list < distances_)[0]
        distances_[updated_indices] = new_distance_list[updated_indices]
        labels_[updated_indices] = i

    return cluster_centers_, labels_
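
A minimal usage sketch (added; not part of the original example): k_centers above depends on a project-specific pairwise_distances(X, index=..., metric=...) helper that returns the distance from X[index] to every point in X, and on scikit-learn's check_random_state. The stand-in helper below is hypothetical, for the Euclidean case only.

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.utils import check_random_state  # only needed when random_state == -1

def pairwise_distances(X, index, metric='euclidean'):
    # hypothetical stand-in: distances from X[index] to every row of X
    return cdist(X[index:index + 1], X, metric=metric)[0]

X = check_random_state(0).rand(500, 2)
centers, labels = k_centers(X, n_clusters=8, metric='euclidean', random_state=0)
print(centers)              # indices of the 8 chosen center points
print(np.bincount(labels))  # cluster sizes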
Example No. 2
def k_centers_assign(X,
                     centers=None,
                     n_clusters=8,
                     metric='rmsd',
                     random_state=None):
    """K-Centers clustering
    Cluster a vector or Trajectory dataset using a simple heuristic to minimize
    the maximum distance from any data point to its assigned cluster center.
    The runtime of this algorithm is O(kN), where k is the number of
    clusters and N is the size of the dataset, making it one of the least
    expensive clustering algorithms available.
    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features), or md.Trajectory
        The data to assign.
    centers : array-like or md.Trajectory
        The cluster centers to assign the data to.
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.
    metric : {"euclidean", "sqeuclidean", "cityblock", "chebyshev", "canberra",
              "braycurtis", "hamming", "jaccard", "rmsd"}
        The distance metric to use. metric = "rmsd" requires that sequences
        passed to ``fit()`` be ``md.Trajectory``; other distance metrics
        require ``np.ndarray``s.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    References
    ----------
    .. [1] Gonzalez, Teofilo F. "Clustering to minimize the maximum
       intercluster distance." Theor. Comput. Sci. 38 (1985): 293-306.
    .. [2] Beauchamp, Kyle A., et al. "MSMBuilder2: modeling conformational
       dynamics on the picosecond to millisecond scale." J. Chem. Theory.
       Comput. 7.10 (2011): 3412-3419.
    Returns
    -------
    labels_ : array, [n_samples,]
        The label of each point is an integer in [0, n_clusters).
    """
    n_samples = len(X)
    if centers is None:
        raise ValueError("No cluster centers provided!")

    n_centers = len(centers)
    print("N_Centers:", n_centers)
    print("N_samples:", n_samples)
    labels_ = np.zeros(n_samples, dtype=np.int32)
    #distances_ = np.zeros(n_centers, dtype=np.float32)
    for i in range(0, n_samples):
        distances_ = pairwise_distances(X=centers, Y=X, index=i, metric=metric)
        #distances_ = md.rmsd(centers, X, i, parallel=True, precentered=True)
        cluster_num = np.argmin(distances_)
        labels_[i] = cluster_num
    return labels_
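
A brief usage sketch (added). Here the project-specific helper is called as pairwise_distances(X=centers, Y=X, index=i, metric=metric) and must return the distance from Y[index] to every row of X, so the hypothetical stand-in differs from the one above accordingly.

import numpy as np
from scipy.spatial.distance import cdist

def pairwise_distances(X, Y, index, metric='euclidean'):
    # hypothetical stand-in: distances from Y[index] to every row of X
    return cdist(Y[index:index + 1], X, metric=metric)[0]

rng = np.random.RandomState(0)
data = rng.rand(200, 2)
centers = data[:5]  # pretend the first five points are the centers
labels = k_centers_assign(data, centers=centers, metric='euclidean')
print(np.bincount(labels))  # population of each center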
Example No. 3
def k_centers_assign(X, centers=None, n_clusters=8, metric='rmsd', random_state=None):
    """K-Centers clustering
    Cluster a vector or Trajectory dataset using a simple heuristic to minimize
    the maximum distance from any data point to its assigned cluster center.
    The runtime of this algorithm is O(kN), where k is the number of
    clusters and N is the size of the dataset, making it one of the least
    expensive clustering algorithms available.
    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features), or md.Trajectory
        The data to assign.
    centers : array-like or md.Trajectory
        The cluster centers to assign the data to.
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.
    metric : {"euclidean", "sqeuclidean", "cityblock", "chebyshev", "canberra",
              "braycurtis", "hamming", "jaccard", "rmsd"}
        The distance metric to use. metric = "rmsd" requires that sequences
        passed to ``fit()`` be ``md.Trajectory``; other distance metrics
        require ``np.ndarray``s.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    References
    ----------
    .. [1] Gonzalez, Teofilo F. "Clustering to minimize the maximum
       intercluster distance." Theor. Comput. Sci. 38 (1985): 293-306.
    .. [2] Beauchamp, Kyle A., et al. "MSMBuilder2: modeling conformational
       dynamics on the picosecond to millisecond scale." J. Chem. Theory.
       Comput. 7.10 (2011): 3412-3419.
    Returns
    -------
    labels_ : array, [n_samples,]
        The label of each point is an integer in [0, n_clusters).
    """
    n_samples = len(X)
    if centers is None:
        raise ValueError("No cluster centers provided!")

    n_centers = len(centers)
    print "N_Centers:", n_centers
    print "N_samples:", n_samples
    labels_ = np.zeros(n_samples, dtype=np.int32)
    #distances_ = np.zeros(n_centers, dtype=np.float32)
    for i in xrange(0, n_samples):
        distances_ = pairwise_distances(X=centers, Y=X, index=i, metric=metric)
        #distances_ = md.rmsd(centers, X, i, parallel=True, precentered=True)
        cluster_num = np.argmin(distances_)
        labels_[ i ] = cluster_num
    return labels_
Example No. 4
def run_knn(X, n_neighbors=100, n_samples=1000, metric='rmsd', algorithm='vp_tree'):
    #    X = check_array(X, accept_sparse='csr')
    #print "Calculating pairwise ", metric, " distances of ", n_samples, " samples..."
    t0 = time.time()
    if metric is "rmsd":
        samples = random.sample(X, n_samples)
        whole_samples= reduce(operator.add, (samples[i] for i in xrange(len(samples))))
    else:
        whole_samples = random.sample(X, n_samples)
    sample_dist_metric = pairwise_distances(whole_samples, whole_samples, metric=metric)
    t1 = time.time()
    #print "time:", t1-t0,
    #print "Done."

    # Calculate neighborhood for all samples. This leaves the original point
    # in, which needs to be considered later

    #print "Calculating knn..."
    t0 = time.time()
    if metric == 'rmsd':
        shape_x = np.shape(X.xyz)
        knn = knnn.vp_tree_parallel(np.reshape(X.xyz, shape_x[0] * shape_x[1] * shape_x[2]), shape_x[1] * 3, "rmsd_serial")
        distances_, indices = knn.query(np.linspace(0, len(X.xyz) - 1, len(X.xyz), dtype='int'), n_neighbors)
    else:
        if algorithm == 'vp_tree':
            shape_x = np.shape(X)
            #print "shape_x:", shape_x
            knn = knnn.vp_tree_parallel(np.reshape(X, shape_x[0] * shape_x[1]), shape_x[1], "euclidean_serial")
            distances_, indices = knn.query(np.linspace(0, len(X) - 1, len(X), dtype='int'), n_neighbors)
        else:
            neighbors_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm, metric=metric)
            neighbors_model.fit(X)
            distances_, indices = neighbors_model.kneighbors(X, n_neighbors=n_neighbors, return_distance=True)


    t1 = time.time()
    #print "time:", t1-t0,
    #print "Done."
    # Calculate distance between sample, and find dc
    # np.savetxt("./sample_dist_metric.txt", sample_dist_metric, fmt="%f")
    #np.savetxt("./distances_.txt", distances_, fmt="%f")
    #np.savetxt("./indices.txt", indices, fmt="%d")
    return sample_dist_metric, distances_, indices
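
For reference, a self-contained sketch (added) of just the scikit-learn fallback branch; the rmsd and vp_tree paths depend on the project's knnn extension module and are not reproduced here.

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(1000, 2)
neighbors_model = NearestNeighbors(n_neighbors=10, algorithm='kd_tree', metric='euclidean')
neighbors_model.fit(X)
distances_, indices = neighbors_model.kneighbors(X, n_neighbors=10, return_distance=True)
print(distances_.shape, indices.shape)  # (1000, 10); column 0 is each point itself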
Example No. 5
def main():
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t',
        '--trajListFns',
        default='trajlist',
        help='List of trajectory files to read in, separated by spaces.')
    cli.add_argument(
        '-a',
        '--atomListFns',
        default='atom_indices',
        help='List of atom index files to read in, separated by spaces.')
    cli.add_argument('-g',
                     '--topology',
                     default='native.pdb',
                     help='topology file.')
    cli.add_argument('-o',
                     '--homedir',
                     help='Home dir.',
                     default=".",
                     type=str)
    cli.add_argument('-e',
                     '--iext',
                     help='''The file extension of input trajectory
                     files.  Must be a filetype that mdtraj.load() can recognize.''',
                     default="xtc",
                     type=str)
    cli.add_argument('-n',
                     '--n_clusters',
                     help='''n_clusters.''',
                     default=100,
                     type=int)
    cli.add_argument('-m',
                     '--n_macro_states',
                     help='''n_macro_states.''',
                     default=6,
                     type=int)
    cli.add_argument('-s', '--stride', help='stride.', default=None, type=int)

    args = cli.parse_args()
    trajlistname = args.trajListFns
    atom_indicesname = args.atomListFns
    trajext = args.iext
    File_TOP = args.topology
    homedir = args.homedir
    n_clusters = args.n_clusters
    n_macro_states = args.n_macro_states
    stride = args.stride
    # ===========================================================================
    # Reading Trajs from XTC files
    #print "stride:", stride
    #trajreader = XTCReader(trajlistname, atom_indicesname, homedir, trajext, File_TOP, nSubSample=stride)
    #trajs = trajreader.trajs
    #print(trajs)
    #traj_len = trajreader.traj_len
    #np.savetxt("./traj_len.txt", traj_len, fmt="%d")

    if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt"):
        phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
        psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    else:
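        # NOTE (added): this branch uses trajreader/trajs from the XTCReader block
        # commented out above, so it will fail with NameError unless that block is restored.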
        #phi_angles, psi_angles = trajreader.get_phipsi(trajs, psi=[6, 8, 14, 16], phi=[4, 6, 8, 14])
        phi_angles, psi_angles = trajreader.get_phipsi(trajs,
                                                       psi=[5, 7, 13, 15],
                                                       phi=[3, 5, 7, 13])
        np.savetxt("./phi_angles.txt", phi_angles, fmt="%f")
        np.savetxt("./psi_angles.txt", psi_angles, fmt="%f")

    phi_psi = np.column_stack((phi_angles, psi_angles))
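    # phi/psi dihedral pairs form a 2-D Ramachandran-style feature space;
    # clustering in this plane groups frames by backbone conformation.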

    n_samples = 1000
    percent = 0.9
    import random
    whole_samples = random.sample(list(phi_psi), n_samples)
    #print whole_samples
    from metrics.pairwise import pairwise_distances
    sample_dist_metric = pairwise_distances(whole_samples,
                                            whole_samples,
                                            metric='euclidean')
    print(sample_dist_metric.shape)
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i + 1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.05, 0.025, 0.008]:  #,0.005, 0.003,
        #                   0.002, 0.001, 0.0008, 0.0005, 0.0003, 0.0002, 0.0001, 0.00005]:
        percent /= 10.0
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        dc = sorted_sample_dist[index]
        #print index, sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)
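    # Note (added): the loop above is just a percentile cut on the sampled pair
    # distances; with the /= 10.0 rescaling the listed percents correspond to the
    # 0.5%, 0.25%, and 0.08% quantiles, e.g. roughly np.percentile(sample_dist, 0.5)
    # for the first entry.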

    # from sklearn.neighbors import NearestNeighbors
    # print len(phi_psi)
    # neighborhoods_model = NearestNeighbors(n_neighbors=len(phi_psi), algorithm='kd_tree')
    # neighborhoods_model.fit(phi_psi)
    # #distances, indices = neighborhoods_model.kneighbors(phi_psi)
    # distances, indices = neighborhoods_model.kneighbors(phi_psi, 5)
    # print distances

    #print phi_psi
    # ===========================================================================
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    potential = True
    # potential = False
    #eps = eps_list[0]
    eps = 9.376904
    min_samples = 1
    len_frames = len(phi_psi)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = DBSCAN(eps=eps, min_samples=min_samples,
                algorithm='kd_tree').fit(phi_psi)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    old_assignments = db.labels_
    n_microstates = len(
        set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each states
    frame_bincount = np.bincount(
        old_assignments[old_assignments >= 0])  #remove outliers
    frame_freq_index_sorted = np.argsort(
        frame_bincount)[::-1]  # descending arg sort
    frame_freq_percent_sorted = frame_bincount[
        frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])
    old_frame_freq_percent_sorted = frame_freq_percent_sorted
    old_frame_freq_index_sorted = frame_freq_index_sorted

    iter_name = clustering_name + '0' + '_eps_' + str(
        eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
            n_microstates)
    plot_cluster(labels=old_assignments,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=iter_name,
                 potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    eps_list = [9.376904, 3.3741567, 0.87675905]
    min_samples_list = [1, 20, 20]
    #min_samples_list = [50, 30, 20, 15, 10, 8, 5, 2]
    n_min_samples = len(min_samples_list)
    #eps_list = [3.0, 2.0, 1.0, 0.8, 0.5]
    #min_samples_list = [3, 3, 3, 3, 3, 2, 2]

    #results = np.zeros((n_min_samples, n_iterations, len_frames), dtype=np.int32)
    results = np.zeros((n_iterations, len_frames), dtype=np.int32)
    for i in range(0, n_iterations):
        #for j in range(0, n_min_samples):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = DBSCAN(eps=eps, min_samples=min_samples).fit(phi_psi)
        '''
            core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
            core_samples_mask[db.core_sample_indices_] = True
            new_assignments = db.labels_
            if i < 7:
                remove_outliers = True
            else:
                remove_outliers = False
            assignments = merge_assignments(new_assignments, old_assignments, remove_outliers=remove_outliers)
            n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)
    
            # Calculating percentage of each states
            frame_bincount = np.bincount(assignments[assignments >= 0])  # remove outliers
            frame_freq_index_sorted = np.argsort(frame_bincount)[::-1]  # descending arg sort
            frame_freq_percent_sorted = frame_bincount[frame_freq_index_sorted] / np.float32(len_frames)
            frame_freq_percent_sorted = frame_freq_percent_sorted[0:10]
            frame_freq_index_sorted = frame_freq_index_sorted[0:10]
            print frame_freq_percent_sorted
            print frame_freq_index_sorted
            old_frame_freq_index_sorted = []
            for j in xrange(0, 10):
                index = np.argwhere(assignments==frame_freq_index_sorted[j])[0]
                old_frame_freq_index_sorted.append(old_assignments[index][0])
            print old_frame_freq_index_sorted
            '''
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        assignments = db.labels_
        n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)
        #results[j,i, :]= np.array(assignments)
        results[i, :] = np.array(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        #print('Estimated number of clusters: %d' % n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(
            eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
                n_microstates)
        plot_cluster(labels=assignments,
                     phi_angles=phi_angles,
                     psi_angles=psi_angles,
                     name=iter_name,
                     potential=potential)
        #old_assignments = assignments
    print(results)
    np.save("results.npy", results)
    #np.savetxt("results.csv", results, fmt="%d", delimiter=",")
    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt",
               min_samples_list,
               fmt="%d",
               delimiter=",")
Example No. 6
def k_centers(X, n_clusters=8, metric='rmsd', random_state=None):
    """K-Centers clustering
    Cluster a vector or Trajectory dataset using a simple heuristic to minimize
    the maximum distance from any data point to its assigned cluster center.
    The runtime of this algorithm is O(kN), where k is the number of
    clusters and N is the size of the dataset, making it one of the least
    expensive clustering algorithms available.
    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features), or md.Trajectory
        The data to cluster.
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.
    metric : {"euclidean", "sqeuclidean", "cityblock", "chebyshev", "canberra",
              "braycurtis", "hamming", "jaccard", "rmsd"}
        The distance metric to use. metric = "rmsd" requires that sequences
        passed to ``fit()`` be ``md.Trajectory``; other distance metrics
        require ``np.ndarray``s.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    References
    ----------
    .. [1] Gonzalez, Teofilo F. "Clustering to minimize the maximum
       intercluster distance." Theor. Comput. Sci. 38 (1985): 293-306.
    .. [2] Beauchamp, Kyle A., et al. "MSMBuilder2: modeling conformational
       dynamics on the picosecond to millisecond scale." J. Chem. Theory.
       Comput. 7.10 (2011): 3412-3419.
    Returns
    -------
    cluster_centers_ : list of int
        Indices of the data points chosen as cluster centers.
    labels_ : array, [n_samples,]
        The label of each point is an integer in [0, n_clusters).
    """
    n_samples = len(X)
    if random_state == -1:
        seed = check_random_state(None).randint(0, n_samples)
    else:
        seed = random_state
    print("seed=", seed)
    cluster_centers_ = []
    cluster_centers_.append(seed)  #seed = random
    distances_ = pairwise_distances(X, index=seed, metric=metric)
    labels_ = np.zeros(len(X), dtype=np.int32)

    for i in range(1, n_clusters):
        # set the point furthest from all existing centers as the new center
        max_index = np.argmax(distances_)
        cluster_centers_.append(max_index)

        if distances_[max_index] < 0:
            break

        new_distance_list = pairwise_distances(X,
                                               index=max_index,
                                               metric=metric)
        updated_indices = np.where(new_distance_list < distances_)[0]
        distances_[updated_indices] = new_distance_list[updated_indices]
        labels_[updated_indices] = i

    return cluster_centers_, labels_
Example No. 7
def main():
    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m',
                     '--min_samples',
                     help='min_samples',
                     default=5,
                     type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprob', default=10, type=int)

    # Download example dataset
    from msmbuilder.example_datasets import AlanineDipeptide
    ala2 = AlanineDipeptide(verbose=False)
    xyz = ala2.get().trajectories
    print(ala2.description())

    #xyz = [t[::10] for t in xyz]
    print("{} trajectories".format(len(xyz)))
    # msmbuilder does not keep track of units! You must keep track of your
    # data's timestep
    to_ns = 0.5
    print("with length {} ns".format(set(len(x) * to_ns for x in xyz)))

    from msmbuilder.featurizer import DihedralFeaturizer
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    diheds = featurizer.fit_transform(xyz)

    print(xyz[0].xyz.shape)
    print(diheds[0].shape)

    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(diheds)

    print(diheds[0].shape)
    print(scaled_diheds[0].shape)

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=2)
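    # NOTE (added): tICA finds the linear combinations of the input features that
    # decorrelate most slowly; lag_time is measured in frames and n_components is
    # the number of slow modes kept.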
    # fit and transform can be done in separate steps:
    tica_model.fit(diheds)

    tica_trajs = tica_model.transform(diheds)
    featurizer = DihedralFeaturizer(types=['phi', 'psi'], sincos=False)
    diheds = featurizer.fit_transform(xyz)
    print(diheds[0].shape)
    print(tica_trajs[0].shape)

    # ===========================================================================
    #if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt") is True:
    #    phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
    #    psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    #X = np.column_stack((phi_angles, psi_angles))
    #print(X.shape)
    phi_angles = np.degrees(diheds[0][:, 0])
    psi_angles = np.degrees(diheds[0][:, 1])
    print(phi_angles)
    X = tica_trajs[0].astype(np.float32)
    #print(X)
    n_size = X.shape[0]
    dimension = X.shape[1]

    #print(X.shape)

    # ===========================================================================
    args = cli.parse_args()
    eps = args.eps  # eps
    min_samples = args.min_samples  # min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' %
          (n_size, dimension, eps, min_samples))

    n_samples = 1000
    percent = 0.9
    import random
    whole_samples = random.sample(list(X), n_samples)
    #print whole_samples
    from metrics.pairwise import pairwise_distances
    sample_dist_metric = pairwise_distances(whole_samples,
                                            whole_samples,
                                            metric='l2')
    print(sample_dist_metric.shape)
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i + 1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.30, 0.20, 0.10]:  #,0.005, 0.003,
        #                   0.002, 0.001, 0.0008, 0.0005, 0.0003, 0.0002, 0.0001, 0.00005]:
        #percent /= 10.0
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        dc = sorted_sample_dist[index]
        #print index, sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)

    #print X
    # ===========================================================================
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    #potential = True
    remove_outliers = False
    potential = False
    eps = eps_list[0]
    min_samples = 1
    len_frames = len(X)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps,
                      min_samples=min_samples,
                      nlist=nlist,
                      nprobe=nprobe,
                      metric="l2",
                      GPU=False,
                      IVFFlat=IVFFlat)
    db.fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    old_assignments = db.labels_
    n_microstates = len(
        set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each states
    frame_bincount = np.bincount(
        old_assignments[old_assignments >= 0])  #remove outliers
    frame_freq_index_sorted = np.argsort(
        frame_bincount)[::-1]  # descending arg sort
    frame_freq_percent_sorted = frame_bincount[
        frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])
    old_frame_freq_percent_sorted = frame_freq_percent_sorted
    old_frame_freq_index_sorted = frame_freq_index_sorted
    iter_name = clustering_name + '0' + '_eps_' + str(
        eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
            n_microstates)
    plot_cluster(labels=old_assignments,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=iter_name,
                 potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [50, 30, 10]
    #min_samples_list = [50, 30, 20, 15, 10, 8, 5, 2]
    n_min_samples = len(min_samples_list)
    #eps_list = [3.0, 2.0, 1.0, 0.8, 0.5]
    #min_samples_list = [3, 3, 3, 3, 3, 2, 2]

    results = np.zeros((n_min_samples, n_iterations, len_frames),
                       dtype=np.int32)
    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps,
                          min_samples=min_samples,
                          nlist=nlist,
                          nprobe=nprobe,
                          metric="l2",
                          GPU=False,
                          IVFFlat=IVFFlat).fit(X)

        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        new_assignments = db.labels_
        if i == n_iterations - 1:
            remove_outliers = True
        #else:
        #    remove_outliers = False
        assignments = merge_assignments(new_assignments,
                                        old_assignments,
                                        remove_outliers=remove_outliers)
        n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)

        #results[j,i, :]= np.array(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        #print('Estimated number of clusters: %d' % n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(
            eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
                n_microstates)
        plot_cluster(labels=assignments,
                     phi_angles=phi_angles,
                     psi_angles=psi_angles,
                     name=iter_name,
                     potential=potential)
        #old_assignments = assignments
    #print(results)
    #np.save("results.npy", results)
    #np.savetxt("results.csv", results, fmt="%d", delimiter=",")
    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt",
               min_samples_list,
               fmt="%d",
               delimiter=",")
Example No. 8
def main():
    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m', '--min_samples', help='min_samples', default=5, type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprob', default=10, type=int)
    # ===========================================================================
    if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt"):
        phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
        psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    X = np.column_stack((phi_angles, psi_angles))
    print(X.shape)
    n_size = X.shape[0]
    dimension = X.shape[1]
    # ===========================================================================
    args = cli.parse_args()
    eps = args.eps  # eps
    min_samples = args.min_samples  # min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' % (n_size, dimension, eps, min_samples))

    n_samples = 1000
    percent = 0.9
    import random
    whole_samples = random.sample(list(X), n_samples)
    #print whole_samples
    from metrics.pairwise import pairwise_distances
    sample_dist_metric = pairwise_distances(whole_samples, whole_samples, metric='l2')
    print(sample_dist_metric.shape)
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i+1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist), np.max(sorted_sample_dist), np.min(sorted_sample_dist))


    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.20, 0.05, 0.020]:  #, 0.005, 0.003,
        #                   0.002, 0.001, 0.0008, 0.0005, 0.0003, 0.0002, 0.0001, 0.00005]:
        #percent /= 10.0
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        dc = sorted_sample_dist[index]
        #print index, sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)

    #print X
    # ===========================================================================
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    potential = True
    remove_outliers = False
    # potential = False
    eps = eps_list[0]
    min_samples = 1
    len_frames = len(X)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist, nprobe=nprobe, metric="l2", GPU=False, IVFFlat=IVFFlat)
    db.fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    old_assignments = db.labels_
    n_microstates = len(set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each states
    frame_bincount = np.bincount(old_assignments[old_assignments>=0]) #remove outliers
    frame_freq_index_sorted = np.argsort(frame_bincount)[::-1]  # descending arg sort
    frame_freq_percent_sorted = frame_bincount[frame_freq_index_sorted]/np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])
    old_frame_freq_percent_sorted = frame_freq_percent_sorted
    old_frame_freq_index_sorted = frame_freq_index_sorted

    iter_name = clustering_name + '0' + '_eps_' + str(eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(n_microstates)
    plot_cluster(labels=old_assignments, phi_angles=phi_angles, psi_angles=psi_angles, name=iter_name, potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [10, 20, 20]
    #min_samples_list = [50, 30, 20, 15, 10, 8, 5, 2]
    n_min_samples = len(min_samples_list)
    #eps_list = [3.0, 2.0, 1.0, 0.8, 0.5]
    #min_samples_list = [3, 3, 3, 3, 3, 2, 2]

    results = np.zeros((n_min_samples,n_iterations,len_frames), dtype=np.int32)
    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist, nprobe=nprobe, metric="l2", GPU=False, IVFFlat=IVFFlat).fit(X)

        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        new_assignments = db.labels_
        if i == n_iterations - 1:
            remove_outliers = True
        #else:
        #    remove_outliers = False
        assignments = merge_assignments(new_assignments, old_assignments, remove_outliers=remove_outliers)
        n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)

        #results[j,i, :]= np.array(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_samples:', min_samples, 'Estimated number of clusters:', n_microstates)
        #print('Estimated number of clusters: %d' % n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(n_microstates)
        plot_cluster(labels=assignments, phi_angles=phi_angles, psi_angles=psi_angles, name=iter_name, potential=potential)
        #old_assignments = assignments
    #print(results)
    #np.save("results.npy", results)
    #np.savetxt("results.csv", results, fmt="%d", delimiter=",")
    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt", min_samples_list, fmt="%d", delimiter=",")
Example No. 9
def main():
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t',
        '--trajListFns',
        default='trajlist',
        help='List of trajectory files to read in, separated by spaces.')
    cli.add_argument(
        '-a',
        '--atomListFns',
        default='atom_indices',
        help='List of atom index files to read in, separated by spaces.')
    cli.add_argument('-g',
                     '--topology',
                     default='native.pdb',
                     help='topology file.')
    cli.add_argument('-o',
                     '--homedir',
                     help='Home dir.',
                     default=".",
                     type=str)
    cli.add_argument('-e',
                     '--iext',
                     help='''The file extension of input trajectory
                     files.  Must be a filetype that mdtraj.load() can recognize.''',
                     default="xtc",
                     type=str)
    cli.add_argument('-n',
                     '--n_clusters',
                     help='''n_clusters.''',
                     default=100,
                     type=int)
    cli.add_argument('-m',
                     '--n_macro_states',
                     help='''n_macro_states.''',
                     default=6,
                     type=int)
    cli.add_argument('-s', '--stride', help='stride.', default=None, type=int)

    args = cli.parse_args()
    trajlistname = args.trajListFns
    atom_indicesname = args.atomListFns
    trajext = args.iext
    File_TOP = args.topology
    homedir = args.homedir
    n_clusters = args.n_clusters
    n_macro_states = args.n_macro_states
    stride = args.stride
    # ===========================================================================
    # Reading Trajs from XTC files
    #print "stride:", stride
    #trajreader = XTCReader(trajlistname, atom_indicesname, homedir, trajext, File_TOP, nSubSample=stride)
    #trajs = trajreader.trajs
    #print trajs
    #traj_len = trajreader.traj_len
    #np.savetxt("./traj_len.txt", traj_len, fmt="%d")

    if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt"):
        phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
        psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    else:
        phi_angles, psi_angles = trajreader.get_phipsi(trajs,
                                                       psi=[6, 8, 14, 16],
                                                       phi=[4, 6, 8, 14])
        #phi_angles, psi_angles = trajreader.get_phipsi(trajs, psi=[5, 7, 13, 15], phi=[3, 5, 7, 13])
        np.savetxt("./phi_angles.txt", phi_angles, fmt="%f")
        np.savetxt("./psi_angles.txt", psi_angles, fmt="%f")

    phi_psi = np.column_stack((phi_angles, psi_angles))

    n_samples = 1000
    percent = 0.9
    import random
    whole_samples = random.sample(list(phi_psi), n_samples)
    #print whole_samples
    from metrics.pairwise import pairwise_distances
    sample_dist_metric = pairwise_distances(whole_samples,
                                            whole_samples,
                                            metric='rmsd')
    print(sample_dist_metric.shape)
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i + 1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [
            0.40, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05, 0.025, 0.010, 0.008,
            0.005, 0.003, 0.001, 0.0005, 0.0003, 0.0001, 0.00005, 0.00001
    ]:
        percent /= 10.0
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        dc = sorted_sample_dist[index]
        #print index, sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)

    # from sklearn.neighbors import NearestNeighbors
    # print len(phi_psi)
    # neighborhoods_model = NearestNeighbors(n_neighbors=len(phi_psi), algorithm='kd_tree')
    # neighborhoods_model.fit(phi_psi)
    # #distances, indices = neighborhoods_model.kneighbors(phi_psi)
    # distances, indices = neighborhoods_model.kneighbors(phi_psi, 5)
    # print distances

    #print phi_psi
    # ===========================================================================
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    potential = True
    # potential = False
    eps = eps_list[0]
    min_samples = 5
    print "Running first calculation"
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='rmsd').fit(phi_psi)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    old_assignments = db.labels_
    n_microstates = len(
        set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)
    iter_name = clustering_name + '0' + '_eps_' + str(
        eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
            n_microstates)
    plot_cluster(labels=old_assignments,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=iter_name,
                 potential=potential)

    n_iterations = len(eps_list)
    print "n_iterations:", n_iterations
    #eps_list = [3.0, 2.0, 1.0, 0.8, 0.5]
    min_samples_list = [3, 3, 3, 3, 3, 2, 2]

    for i in range(1, n_iterations):
        eps = eps_list[i]
        #min_samples = min_samples_list[i]
        db = DBSCAN(eps=eps, min_samples=min_samples,
                    metric='rmsd').fit(phi_psi)
        print "Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:', min_samples
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        new_assignments = db.labels_
        old_assignments = merge_assignments(new_assignments, old_assignments)
        n_microstates = len(
            set(old_assignments)) - (1 if -1 in old_assignments else 0)
        print('Estimated number of clusters: %d' % n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(
            eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
                n_microstates)
        plot_cluster(labels=old_assignments,
                     phi_angles=phi_angles,
                     psi_angles=psi_angles,
                     name=iter_name,
                     potential=potential)
    labels = old_assignments
    print(labels)
    n_microstates = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    #cluster_centers_ = cluster.cluster_centers_
    # plot micro states
    clustering_name = "mr-dbscan_n_" + str(n_microstates)
    np.savetxt("assignments_" + clustering_name + ".txt", labels, fmt="%d")
    #np.savetxt("cluster_centers_"+clustering_name+".txt", cluster_centers_, fmt="%d")

    plot_cluster(labels=labels,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=clustering_name,
                 potential=potential)