Example 1
diheds_array = np.loadtxt('diheds.txt', dtype=np.float32)
phi_angles = diheds_array[:, 0]
psi_angles = diheds_array[:, 1]
print(phi_angles.shape)

n_size = X.shape[0]
dimension = X.shape[1]

labels = np.loadtxt(args.assignments, dtype=np.int32)

#print(labels)
n_microstates = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_microstates)
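# Note: DBSCAN labels noise points -1, which is why the count above subtracts
# one state whenever -1 appears among the labels.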

# plot micro states
name = args.assignments.split("/")[-1][:-4]
clustering_name = "ML-DBSCAN_n_" + name + "_" + str(n_microstates)
#np.savetxt("assignments.txt", labels, fmt="%d")
##np.savetxt("cluster_centers_"+clustering_name+".txt", cluster_centers_, fmt="%d")
plot_cluster(labels=labels,
             phi_angles=phi_angles,
             psi_angles=psi_angles,
             name=clustering_name + '_rama',
             potential=False)

plot_cluster(labels=labels,
             phi_angles=X[:, 0],
             psi_angles=X[:, 1],
             name=clustering_name + '_tica',
             potential=False)
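# plot_cluster() is never defined in these snippets. A minimal sketch of a
# compatible helper (signature assumed from the call sites; the original may
# differ, e.g. in how `potential` overlays a free-energy background):
import numpy as np
import matplotlib
matplotlib.use("Agg")  # render to file without a display
import matplotlib.pyplot as plt

def plot_cluster(labels, phi_angles, psi_angles, name, potential=False):
    """Scatter phi/psi pairs colored by cluster label; noise (-1) in gray."""
    labels = np.asarray(labels)
    fig, ax = plt.subplots()
    noise = labels == -1
    ax.scatter(phi_angles[~noise], psi_angles[~noise],
               c=labels[~noise], s=2, cmap="tab20")
    ax.scatter(phi_angles[noise], psi_angles[noise], c="lightgray", s=2)
    ax.set_xlabel("phi (deg)")
    ax.set_ylabel("psi (deg)")
    fig.savefig(name + ".png", dpi=150)
    plt.close(fig)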
t1 = time.time()
Sklearn_time = t1 - t0
print("Clustering using Scikit-Learn DBSCAN Time Cost:", t1 - t0)

sk_labels = sk_cluster.labels_
#print(sk_labels)
n_microstates = len(set(sk_labels)) - (1 if -1 in sk_labels else 0)
print('Estimated number of clusters: %d' % n_microstates)

# plot micro states
clustering_name = "Sklearn_dbscan_n_" + str(n_microstates)
np.savetxt("assignments.txt", sk_labels, fmt="%d")
#np.savetxt("cluster_centers_"+clustering_name+".txt", cluster_centers_, fmt="%d")
plot_cluster(labels=sk_labels,
             phi_angles=phi_angles,
             psi_angles=psi_angles,
             name='rama',
             potential=False)

plot_cluster(labels=sk_labels,
             phi_angles=X[:, 0],
             psi_angles=X[:, 1],
             name='tica',
             potential=False)

print(
    '---------------------------------------------------------------------------------'
)
##print('%f\t%f' % (CPU_time, Sklearn_time))
#print('%f\t%f\t%f' % (GPU_time, CPU_time, Sklearn_time))
print(
    '---------------------------------------------------------------------------------'
)
Example 3
CPU_IVFFlat_cluster = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist, nprobe=nprobe, metric="l2", GPU=False, IVFFlat=IVFFlat)
print(CPU_IVFFlat_cluster)
t0 = time.time()
CPU_IVFFlat_cluster.fit(X)
t1 = time.time()
CPU_time = t1 - t0
print("Clustering using Faiss CPU DBSCAN based on IVFFlat Time Cost:", t1 - t0)

CPU_IVFFlat_labels = CPU_IVFFlat_cluster.labels_
#print(CPU_IVFFlat_labels)
n_microstates = len(set(CPU_IVFFlat_labels)) - (1 if -1 in CPU_IVFFlat_labels else 0)
print('Estimated number of clusters: %d' % n_microstates)

# plot micro states
clustering_name = "CPU_Faiss_IVFFlat_dbscan_n_" + str(n_microstates)
plot_cluster(labels=CPU_IVFFlat_labels, phi_angles=phi_angles, psi_angles=psi_angles, name=clustering_name, potential=True)
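# For reference, a sketch of the Faiss IVFFlat range query that a Faiss-backed
# DBSCAN would build on (illustrative only; Faiss_DBSCAN's internals are not
# shown in this snippet). Note that with METRIC_L2 Faiss reports *squared*
# distances, so the search radius is eps**2.
import faiss

d = X.shape[1]
quantizer = faiss.IndexFlatL2(d)                  # coarse quantizer
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
index.train(X)                                    # learn the nlist partitions
index.add(X)
index.nprobe = nprobe                             # partitions probed per query
lims, dists, ids = index.range_search(X, eps ** 2)
# the eps-neighborhood of point i is ids[lims[i]:lims[i + 1]]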
'''
# ===========================================================================
# do Clustering using Scikit-Learn DBSCAN method
from sklearn.cluster import DBSCAN
sk_cluster = DBSCAN(eps=eps, min_samples=min_samples, metric="l2")
t0 = time.time()
sk_cluster.fit(X)
t1 = time.time()
Sklearn_time = t1 - t0
print("Clustering using Scikit-Learn DBSCAN Time Cost:", t1 - t0)

sk_labels = sk_cluster.labels_
#print(sk_labels)
n_microstates = len(set(sk_labels)) - (1 if -1 in sk_labels else 0)
print('Estimated number of clusters: %d' % n_microstates)
Example 4
def main():
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t',
        '--trajListFns',
        default='trajlist',
        help='List of trajectory files to read in, separated by spaces.')
    cli.add_argument(
        '-a',
        '--atomListFns',
        default='atom_indices',
        help='List of atom index files to read in, separated by spaces.')
    cli.add_argument('-g',
                     '--topology',
                     default='native.pdb',
                     help='topology file.')
    cli.add_argument('-o',
                     '--homedir',
                     help='Home dir.',
                     default=".",
                     type=str)
    cli.add_argument('-e',
                     '--iext',
                     help='''The file extension of input trajectory
                     files.  Must be a filetype that mdtraj.load() can recognize.''',
                     default="xtc",
                     type=str)
    cli.add_argument('-n',
                     '--n_clusters',
                     help='''n_clusters.''',
                     default=100,
                     type=int)
    cli.add_argument('-m',
                     '--n_macro_states',
                     help='''n_macro_states.''',
                     default=6,
                     type=int)
    cli.add_argument('-s', '--stride', help='stride.', default=None, type=int)
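    # Example invocation (the script name is a placeholder):
    #   python cluster_trajs.py -t trajlist -a atom_indices -g native.pdb -n 100 -m 6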

    args = cli.parse_args()
    trajlistname = args.trajListFns
    atom_indicesname = args.atomListFns
    trajext = args.iext
    File_TOP = args.topology
    homedir = args.homedir
    n_clusters = args.n_clusters
    n_macro_states = args.n_macro_states
    stride = args.stride
    # ===========================================================================
    # Reading Trajs from XTC files
    #print "stride:", stride
    #trajreader = XTCReader(trajlistname, atom_indicesname, homedir, trajext, File_TOP, nSubSample=stride)
    #trajs = trajreader.trajs
    #print(trajs)
    #traj_len = trajreader.traj_len
    #np.savetxt("./traj_len.txt", traj_len, fmt="%d")

    if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt"):
        phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
        psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    else:
        # NOTE: requires the XTCReader lines above to be uncommented so that
        # trajreader/trajs exist.
        #phi_angles, psi_angles = trajreader.get_phipsi(trajs, psi=[6, 8, 14, 16], phi=[4, 6, 8, 14])
        phi_angles, psi_angles = trajreader.get_phipsi(trajs,
                                                       psi=[5, 7, 13, 15],
                                                       phi=[3, 5, 7, 13])
        np.savetxt("./phi_angles.txt", phi_angles, fmt="%f")
        np.savetxt("./psi_angles.txt", psi_angles, fmt="%f")

    phi_psi = np.column_stack((phi_angles, psi_angles))

    n_samples = 1000
    percent = 0.9
    import random
    whole_samples = random.sample(list(phi_psi), n_samples)
    #print whole_samples
    from sklearn.metrics.pairwise import pairwise_distances  # assuming scikit-learn's pairwise module
    sample_dist_metric = pairwise_distances(whole_samples,
                                            whole_samples,
                                            metric='euclidean')
    print(sample_dist_metric.shape)
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i + 1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.05, 0.025, 0.008]:  #,0.005, 0.003,
        #                   0.002, 0.001, 0.0008, 0.0005, 0.0003, 0.0002, 0.0001, 0.00005]:
        percent /= 10.0
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        dc = sorted_sample_dist[index]
        #print index, sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)
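    # Roughly the same eps candidates, viewed as percentiles of the sampled
    # pairwise distances (numpy's interpolation differs slightly from the
    # direct indexing above):
    approx_eps = [float(np.percentile(sorted_sample_dist, 100.0 * p / 10.0))
                  for p in (0.05, 0.025, 0.008)]
    print("approx eps via percentiles:", approx_eps)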

    # from sklearn.neighbors import NearestNeighbors
    # print len(phi_psi)
    # neighborhoods_model = NearestNeighbors(n_neighbors=len(phi_psi), algorithm='kd_tree')
    # neighborhoods_model.fit(phi_psi)
    # #distances, indices = neighborhoods_model.kneighbors(phi_psi)
    # distances, indices = neighborhoods_model.kneighbors(phi_psi, 5)
    # print distances

    #print phi_psi
    # ===========================================================================
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    potential = True
    # potential = False
    #eps = eps_list[0]
    eps = 9.376904
    min_samples = 1
    len_frames = len(phi_psi)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = DBSCAN(eps=eps, min_samples=min_samples,
                algorithm='kd_tree').fit(phi_psi)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
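    # core_sample_indices_ lists DBSCAN's core points; this boolean mask is the
    # standard scikit-learn recipe for separating core from border/noise frames.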
    old_assignments = db.labels_
    n_microstates = len(
        set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each states
    frame_bincount = np.bincount(
        old_assignments[old_assignments >= 0])  #remove outliers
    frame_freq_index_sorted = np.argsort(
        frame_bincount)[::-1]  # descending arg sort
    frame_freq_percent_sorted = frame_bincount[
        frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])
    old_frame_freq_percent_sorted = frame_freq_percent_sorted
    old_frame_freq_index_sorted = frame_freq_index_sorted
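    # bincount plus the descending argsort ranks microstates by population, and
    # the normalized array holds each state's fraction of all frames.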

    iter_name = clustering_name + '0' + '_eps_' + str(
        eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
            n_microstates)
    plot_cluster(labels=old_assignments,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=iter_name,
                 potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    eps_list = [9.376904, 3.3741567, 0.87675905]
    min_samples_list = [1, 20, 20]
    #min_samples_list = [50, 30, 20, 15, 10, 8, 5, 2]
    n_min_samples = len(min_samples_list)
    #eps_list = [3.0, 2.0, 1.0, 0.8, 0.5]
    #min_samples_list = [3, 3, 3, 3, 3, 2, 2]

    #results = np.zeros((n_min_samples, n_iterations, len_frames), dtype=np.int32)
    results = np.zeros((n_iterations, len_frames), dtype=np.int32)
    for i in range(0, n_iterations):
        #for j in range(0, n_min_samples):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = DBSCAN(eps=eps, min_samples=min_samples).fit(phi_psi)
        '''
            core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
            core_samples_mask[db.core_sample_indices_] = True
            new_assignments = db.labels_
            if i < 7:
                remove_outliers = True
            else:
                remove_outliers = False
            assignments = merge_assignments(new_assignments, old_assignments, remove_outliers=remove_outliers)
            n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)
    
            # Calculating percentage of each states
            frame_bincount = np.bincount(assignments[assignments >= 0])  # remove outliers
            frame_freq_index_sorted = np.argsort(frame_bincount)[::-1]  # descending arg sort
            frame_freq_percent_sorted = frame_bincount[frame_freq_index_sorted] / np.float32(len_frames)
            frame_freq_percent_sorted = frame_freq_percent_sorted[0:10]
            frame_freq_index_sorted = frame_freq_index_sorted[0:10]
            print frame_freq_percent_sorted
            print frame_freq_index_sorted
            old_frame_freq_index_sorted = []
            for j in xrange(0, 10):
                index = np.argwhere(assignments==frame_freq_index_sorted[j])[0]
                old_frame_freq_index_sorted.append(old_assignments[index][0])
            print old_frame_freq_index_sorted
            '''
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        assignments = db.labels_
        n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)
        #results[j,i, :]= np.array(assignments)
        results[i, :] = np.array(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        #print('Estimated number of clusters: %d' % n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(
            eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
                n_microstates)
        plot_cluster(labels=assignments,
                     phi_angles=phi_angles,
                     psi_angles=psi_angles,
                     name=iter_name,
                     potential=potential)
        #old_assignments = assignments
    print(results)
    np.save("results.npy", results)
    #np.savetxt("results.csv", results, fmt="%d", delimiter=",")
    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt",
               min_samples_list,
               fmt="%d",
               delimiter=",")
Example 5


# ===========================================================================
# Reading phi angles and psi angles data from XTC files
if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt"):
    phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
    psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
else:
    trajreader = XTCReader(trajlistname, atom_indicesname, homedir, trajext, File_TOP)
    trajs = trajreader.trajs
    traj_len = trajreader.traj_len
    np.savetxt("./traj_len.txt", traj_len, fmt="%d")
    phi_angles, psi_angles = trajreader.get_phipsi(trajs, psi=[6, 8, 14, 16], phi=[4, 6, 8, 14])
    np.savetxt("./phi_angles.txt", phi_angles, fmt="%f")
    np.savetxt("./psi_angles.txt", psi_angles, fmt="%f")
# ===========================================================================
# Reading split assignments and the length of each traj
assignments_dir = args.assignments
labels = np.loadtxt(assignments_dir, dtype=np.int32)
traj_len = np.loadtxt(args.traj_len, dtype=np.int32)

#step=20
#dir = "DensityPeak_Dihedrals/"
name = assignments_dir[:-4] + 'Dihedrals'

plot_cluster(labels=labels, phi_angles=phi_angles, psi_angles=psi_angles, name=name)
#plot_each_cluster(labels=labels, phi_angles=phi_angles, psi_angles=psi_angles, name=dir+'Dihedrals', step=step)
#contour_cluster(labels=algorithm.MacroAssignments_, phi_angles=phi_angles, psi_angles=psi_angles, name=lumping_name)

Example 6
#cluster.fit(phi_psi)

cluster.fit(X)
labels = cluster.labels_
print(labels)

labels = np.concatenate(labels)
n_microstates = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_microstates)

#cluster_centers_ = cluster.cluster_centers_
# plot micro states
clustering_name = "kcenters_n_" + str(n_microstates)
#splited_assignments =split_assignments(labels, traj_len)
#np.savetxt("assignments_"+clustering_name+".txt", labels, fmt="%d")
np.savetxt("assignments_" + clustering_name + ".txt", labels, fmt="%d")
#np.savetxt("cluster_centers_"+clustering_name+".txt", cluster_centers_, fmt="%d")
plot_cluster(labels=labels,
             phi_angles=phi_angles,
             psi_angles=psi_angles,
             name=clustering_name)

X = np.concatenate(X)
plot_cluster(labels=labels,
             phi_angles=X[:, 0],
             psi_angles=X[:, 1],
             name='tica_clustering.png')

#trajs[cluster_centers_].save("cluster_centers.pdb")
#trajs_sub_atoms[cluster_centers_].save("cluster_centers_sub_atoms.pdb")
Example 7
                                                   psi=[6, 8, 14, 16],
                                                   phi=[4, 6, 8, 14])
    #phi_angles, psi_angles = trajreader.get_phipsi(trajs, psi=[5, 7, 13, 15], phi=[3, 5, 7, 13])
    np.savetxt("./phi_angles.txt", phi_angles, fmt="%f")
    np.savetxt("./psi_angles.txt", psi_angles, fmt="%f")

phi_psi = np.column_stack((phi_angles, psi_angles))
print(phi_psi)
# ===========================================================================
# do Clustering using DBSCAN method
cluster = DBSCAN(eps=10.0, min_samples=10, metric="euclidean")
print(cluster)
cluster.fit(phi_psi)
#cluster.fit(trajs)

labels = cluster.labels_
print(labels)
n_microstates = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_microstates)

#cluster_centers_ = cluster.cluster_centers_
# plot micro states
clustering_name = "dbscan_n_" + str(n_microstates)
np.savetxt("assignments_" + clustering_name + ".txt", labels, fmt="%d")
#np.savetxt("cluster_centers_"+clustering_name+".txt", cluster_centers_, fmt="%d")

plot_cluster(labels=labels,
             phi_angles=phi_angles,
             psi_angles=psi_angles,
             name=clustering_name)
Example 8
sk_cluster.fit(X)
t1 = time.time()
Sklearn_time = t1 - t0
print("Clustering using Scikit-Learn DBSCAN Time Cost:", t1 - t0)

sk_labels = sk_cluster.labels_
#print(sk_labels)
n_microstates = len(set(sk_labels)) - (1 if -1 in sk_labels else 0)
print('Estimated number of clusters: %d' % n_microstates)

# plot micro states
clustering_name = "Sklearn_dbscan_n_" + str(n_microstates)
np.savetxt("assignments.txt", sk_labels, fmt="%d")
plot_cluster(labels=sk_labels,
             phi_angles=phi_angles,
             psi_angles=psi_angles,
             name='rama',
             potential=True)

#plot_cluster(labels=sk_labels, phi_angles=X[:, 0], psi_angles=X[:, 1], name='tica', potential=False)
#np.savetxt("assignments_"+clustering_name+".txt", sk_labels, fmt="%d")
#np.savetxt("cluster_centers_"+clustering_name+".txt", cluster_centers_, fmt="%d")
print(
    '---------------------------------------------------------------------------------'
)
print('%f\t%f' % (CPU_time, Sklearn_time))
#print('%f\t%f\t%f' % (GPU_time, CPU_time, Sklearn_time))
print(
    '---------------------------------------------------------------------------------'
)
def main():
    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m',
                     '--min_samples',
                     help='min_samples',
                     default=5,
                     type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprobe', default=10, type=int)

    # Download example dataset
    from msmbuilder.example_datasets import AlanineDipeptide
    ala2 = AlanineDipeptide(verbose=False)
    xyz = ala2.get().trajectories
    print(ala2.description())

    #xyz = [t[::10] for t in xyz]
    print("{} trajectories".format(len(xyz)))
    # msmbuilder does not keep track of units! You must keep track of your
    # data's timestep
    to_ns = 0.5
    print("with length {} ns".format(set(len(x) * to_ns for x in xyz)))

    from msmbuilder.featurizer import DihedralFeaturizer
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    diheds = featurizer.fit_transform(xyz)

    print(xyz[0].xyz.shape)
    print(diheds[0].shape)

    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(diheds)

    print(diheds[0].shape)
    print(scaled_diheds[0].shape)

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=2)
    # fit and transform can be done in separate steps:
    tica_model.fit(diheds)

    tica_trajs = tica_model.transform(diheds)
    featurizer = DihedralFeaturizer(types=['phi', 'psi'], sincos=False)
    diheds = featurizer.fit_transform(xyz)
    print(diheds[0].shape)
    print(tica_trajs[0].shape)
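    # Refitting the featurizer with sincos=False recovers raw phi/psi angles in
    # radians; they are converted to degrees below for Ramachandran plotting.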

    # ===========================================================================
    #if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt") is True:
    #    phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
    #    psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    #X = np.column_stack((phi_angles, psi_angles))
    #print(X.shape)
    phi_angles = np.degrees(diheds[0][:, 0])
    psi_angles = np.degrees(diheds[0][:, 1])
    print(phi_angles)
    X = tica_trajs[0].astype(np.float32)
    #print(X)
    n_size = X.shape[0]
    dimension = X.shape[1]

    #print(X.shape)

    # ===========================================================================
    args = cli.parse_args()
    eps = args.eps  # eps
    min_samples = args.min_samples  # min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' %
          (n_size, dimension, eps, min_samples))

    n_samples = 1000
    percent = 0.9
    import random
    whole_samples = random.sample(list(X), n_samples)
    #print whole_samples
    from sklearn.metrics.pairwise import pairwise_distances  # assuming scikit-learn's pairwise module
    sample_dist_metric = pairwise_distances(whole_samples,
                                            whole_samples,
                                            metric='l2')
    print(sample_dist_metric.shape)
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i + 1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.30, 0.20, 0.10]:  #,0.005, 0.003,
        #                   0.002, 0.001, 0.0008, 0.0005, 0.0003, 0.0002, 0.0001, 0.00005]:
        #percent /= 10.0
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        dc = sorted_sample_dist[index]
        #print index, sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)

    #print X
    # ===========================================================================
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    #potential = True
    remove_outliers = False
    potential = False
    eps = eps_list[0]
    min_samples = 1
    len_frames = len(X)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps,
                      min_samples=min_samples,
                      nlist=nlist,
                      nprobe=nprobe,
                      metric="l2",
                      GPU=False,
                      IVFFlat=IVFFlat)
    db.fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    old_assignments = db.labels_
    n_microstates = len(
        set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each states
    frame_bincount = np.bincount(
        old_assignments[old_assignments >= 0])  #remove outliers
    frame_freq_index_sorted = np.argsort(
        frame_bincount)[::-1]  # descending arg sort
    frame_freq_percent_sorted = frame_bincount[
        frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])
    old_frame_freq_percent_sorted = frame_freq_percent_sorted
    old_frame_freq_index_sorted = frame_freq_index_sorted
    n_microstates = len(
        set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)
    iter_name = clustering_name + '0' + '_eps_' + str(
        eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
            n_microstates)
    plot_cluster(labels=old_assignments,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=iter_name,
                 potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [50, 30, 10]
    #min_samples_list = [50, 30, 20, 15, 10, 8, 5, 2]
    n_min_samples = len(min_samples_list)
    #eps_list = [3.0, 2.0, 1.0, 0.8, 0.5]
    #min_samples_list = [3, 3, 3, 3, 3, 2, 2]

    results = np.zeros((n_min_samples, n_iterations, len_frames),
                       dtype=np.int32)
    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps,
                          min_samples=min_samples,
                          nlist=nlist,
                          nprobe=nprobe,
                          metric="l2",
                          GPU=False,
                          IVFFlat=IVFFlat).fit(X)

        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        new_assignments = db.labels_
        if i == n_iterations - 1:
            remove_outliers = True
        #else:
        #    remove_outliers = False
        assignments = merge_assignments(new_assignments,
                                        old_assignments,
                                        remove_outliers=remove_outliers)
        n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)

        #results[j,i, :]= np.array(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        #print('Estimated number of clusters: %d' % n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(
            eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
                n_microstates)
        plot_cluster(labels=assignments,
                     phi_angles=phi_angles,
                     psi_angles=psi_angles,
                     name=iter_name,
                     potential=potential)
        #old_assignments = assignments
    #print(results)
    #np.save("results.npy", results)
    #np.savetxt("results.csv", results, fmt="%d", delimiter=",")
    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt",
               min_samples_list,
               fmt="%d",
               delimiter=",")
Example 10
def main():
    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m', '--min_samples', help='min_samples', default=5, type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprobe', default=10, type=int)
    # ===========================================================================
    if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt"):
        phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
        psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    X = np.column_stack((phi_angles, psi_angles))
    print(X.shape)
    n_size = X.shape[0]
    dimension = X.shape[1]
    # ===========================================================================
    args = cli.parse_args()
    eps = args.eps  # eps
    min_samples = args.min_samples  # min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' % (n_size, dimension, eps, min_samples))

    n_samples = 1000
    percent = 0.9
    import random
    whole_samples = random.sample(list(X), n_samples)
    #print whole_samples
    from sklearn.metrics.pairwise import pairwise_distances  # assuming scikit-learn's pairwise module
    sample_dist_metric = pairwise_distances(whole_samples, whole_samples, metric='l2')
    print(sample_dist_metric.shape)
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i+1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist), np.max(sorted_sample_dist), np.min(sorted_sample_dist))


    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.20, 0.05, 0.020]:  #,0.005, 0.003,
 #                   0.002, 0.001, 0.0008, 0.0005, 0.0003, 0.0002, 0.0001, 0.00005]:
        #percent /= 10.0
        index = int(round(len_samples*percent))
        if index == len_samples:
            index -= 1
        dc = sorted_sample_dist[index]
        #print index, sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)

    #print X
    # ===========================================================================
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    potential = True
    remove_outliers = False
    # potential = False
    eps = eps_list[0]
    min_samples = 1
    len_frames = len(X)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist, nprobe=nprobe, metric="l2", GPU=False, IVFFlat=IVFFlat)
    db.fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    old_assignments = db.labels_
    n_microstates = len(set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each states
    frame_bincount = np.bincount(old_assignments[old_assignments>=0]) #remove outliers
    frame_freq_index_sorted = np.argsort(frame_bincount)[::-1]  # descending arg sort
    frame_freq_percent_sorted = frame_bincount[frame_freq_index_sorted]/np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])
    old_frame_freq_percent_sorted = frame_freq_percent_sorted
    old_frame_freq_index_sorted = frame_freq_index_sorted

    iter_name = clustering_name + '0' + '_eps_' + str(eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(n_microstates)
    plot_cluster(labels=old_assignments, phi_angles=phi_angles, psi_angles=psi_angles, name=iter_name, potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [10, 20, 20]
    #min_samples_list = [50, 30, 20, 15, 10, 8, 5, 2]
    n_min_samples = len(min_samples_list)
    #eps_list = [3.0, 2.0, 1.0, 0.8, 0.5]
    #min_samples_list = [3, 3, 3, 3, 3, 2, 2]

    results = np.zeros((n_min_samples,n_iterations,len_frames), dtype=np.int32)
    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist, nprobe=nprobe, metric="l2", GPU=False, IVFFlat=IVFFlat).fit(X)

        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        new_assignments = db.labels_
        if i == n_iterations - 1:
            remove_outliers = True
        #else:
        #    remove_outliers = False
        assignments = merge_assignments(new_assignments, old_assignments, remove_outliers=remove_outliers)
        n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)

        #results[j,i, :]= np.array(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_samples:', min_samples, 'Estimated number of clusters:', n_microstates)
        #print('Estimated number of clusters: %d' % n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(n_microstates)
        plot_cluster(labels=assignments, phi_angles=phi_angles, psi_angles=psi_angles, name=iter_name, potential=potential)
        #old_assignments = assignments
    #print(results)
    #np.save("results.npy", results)
    #np.savetxt("results.csv", results, fmt="%d", delimiter=",")
    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt", min_samples_list, fmt="%d", delimiter=",")
Example 11
CPU_IVFFlat_labels = CPU_IVFFlat_cluster.labels_
print(CPU_IVFFlat_labels)
n_microstates = len(
    set(CPU_IVFFlat_labels)) - (1 if -1 in CPU_IVFFlat_labels else 0)
print('Estimated number of clusters: %d' % n_microstates)

#cluster_centers_ = cluster.cluster_centers_
# plot micro states
clustering_name = "CPU_Faiss_IVFFlat_dbscan_n_" + str(n_microstates)
np.savetxt("assignments_" + clustering_name + ".txt",
           CPU_IVFFlat_labels,
           fmt="%d")
#np.savetxt("cluster_centers_"+clustering_name+".txt", cluster_centers_, fmt="%d")

plot_cluster(labels=CPU_IVFFlat_labels,
             phi_angles=phi_angles,
             psi_angles=psi_angles,
             name=clustering_name)
# ===========================================================================
# do Clustering using Faiss CPU DBSCAN method
CPU_FlatL2_cluster = Faiss_DBSCAN(eps=eps,
                                  min_samples=min_samples,
                                  nlist=100,
                                  nprobe=5,
                                  metric="l2",
                                  GPU=False,
                                  IVFFlat=False)
print(CPU_FlatL2_cluster)
CPU_FlatL2_cluster.fit(phi_psi)
#cluster.fit(trajs)

CPU_FlatL2_labels = CPU_FlatL2_cluster.labels_
Example 12
#n_size = args.n_size     # database size
#np.random.seed(1234)             # make reproducible
#X = np.random.random((n_size, dimension)).astype('float32') * 10.0
#X[:, 0] += np.arange(n_size) / 100.0
#print(X)

#if args.stride is not None:
#    trajreader = VectorReader(trajlistName=trajlistName, homedir=homedir, trajExt='txt', stride=args.stride)
#else:
#    trajreader = VectorReader(trajlistName=trajlistName, homedir=homedir, trajExt='txt')
#X = trajreader.trajs

print(phi_angles.shape)

labels = np.loadtxt(args.assignments, dtype=np.int32)

#print(labels)
n_microstates = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_microstates)

# plot micro states
name = args.assignments.split("/")[-1][:-4]
clustering_name = "ML-DBSCAN_n_" + name + "_" + str(n_microstates)
#np.savetxt("assignments.txt", labels, fmt="%d")
##np.savetxt("cluster_centers_"+clustering_name+".txt", cluster_centers_, fmt="%d")
plot_cluster(labels=labels,
             phi_angles=phi_angles,
             psi_angles=psi_angles,
             name=clustering_name + '_rama',
             potential=True)
Example 13
def main():
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t',
        '--trajListFns',
        default='trajlist',
        help='List of trajectory files to read in, separated by spaces.')
    cli.add_argument(
        '-a',
        '--atomListFns',
        default='atom_indices',
        help='List of atom index files to read in, separated by spaces.')
    cli.add_argument('-g',
                     '--topology',
                     default='native.pdb',
                     help='topology file.')
    cli.add_argument('-o',
                     '--homedir',
                     help='Home dir.',
                     default=".",
                     type=str)
    cli.add_argument('-e',
                     '--iext',
                     help='''The file extension of input trajectory
                     files.  Must be a filetype that mdtraj.load() can recognize.''',
                     default="xtc",
                     type=str)
    cli.add_argument('-n',
                     '--n_clusters',
                     help='''n_clusters.''',
                     default=100,
                     type=int)
    cli.add_argument('-m',
                     '--n_macro_states',
                     help='''n_macro_states.''',
                     default=6,
                     type=int)
    cli.add_argument('-s', '--stride', help='stride.', default=None, type=int)

    args = cli.parse_args()
    trajlistname = args.trajListFns
    atom_indicesname = args.atomListFns
    trajext = args.iext
    File_TOP = args.topology
    homedir = args.homedir
    n_clusters = args.n_clusters
    n_macro_states = args.n_macro_states
    stride = args.stride
    # ===========================================================================
    # Reading Trajs from XTC files
    #print "stride:", stride
    #trajreader = XTCReader(trajlistname, atom_indicesname, homedir, trajext, File_TOP, nSubSample=stride)
    #trajs = trajreader.trajs
    #print trajs
    #traj_len = trajreader.traj_len
    #np.savetxt("./traj_len.txt", traj_len, fmt="%d")

    if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt"):
        phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
        psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    else:
        # NOTE: requires the XTCReader lines above to be uncommented so that
        # trajreader/trajs exist.
        phi_angles, psi_angles = trajreader.get_phipsi(trajs,
                                                       psi=[6, 8, 14, 16],
                                                       phi=[4, 6, 8, 14])
        #phi_angles, psi_angles = trajreader.get_phipsi(trajs, psi=[5, 7, 13, 15], phi=[3, 5, 7, 13])
        np.savetxt("./phi_angles.txt", phi_angles, fmt="%f")
        np.savetxt("./psi_angles.txt", psi_angles, fmt="%f")

    phi_psi = np.column_stack((phi_angles, psi_angles))

    n_samples = 1000
    percent = 0.9
    import random
    whole_samples = random.sample(list(phi_psi), n_samples)
    #print whole_samples
    from metrics.pairwise import pairwise_distances  # custom module; 'rmsd' is not a scikit-learn metric
    sample_dist_metric = pairwise_distances(whole_samples,
                                            whole_samples,
                                            metric='rmsd')
    print(sample_dist_metric.shape)
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i + 1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [
            0.40, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05, 0.025, 0.010, 0.008,
            0.005, 0.003, 0.001, 0.0005, 0.0003, 0.0001, 0.00005, 0.00001
    ]:
        percent /= 10.0
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        dc = sorted_sample_dist[index]
        #print index, sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)

    # from sklearn.neighbors import NearestNeighbors
    # print len(phi_psi)
    # neighborhoods_model = NearestNeighbors(n_neighbors=len(phi_psi), algorithm='kd_tree')
    # neighborhoods_model.fit(phi_psi)
    # #distances, indices = neighborhoods_model.kneighbors(phi_psi)
    # distances, indices = neighborhoods_model.kneighbors(phi_psi, 5)
    # print distances

    #print phi_psi
    # ===========================================================================
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    potential = True
    # potential = False
    eps = eps_list[0]
    min_samples = 5
    print "Running first calculation"
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='rmsd').fit(phi_psi)
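    # 'rmsd' is not a built-in scikit-learn metric; this presumably relies on a
    # DBSCAN variant (e.g. msmbuilder's) that accepts an RMSD metric.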
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    old_assignments = db.labels_
    n_microstates = len(
        set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)
    iter_name = clustering_name + '0' + '_eps_' + str(
        eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
            n_microstates)
    plot_cluster(labels=old_assignments,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=iter_name,
                 potential=potential)

    n_iterations = len(eps_list)
    print "n_iterations:", n_iterations
    #eps_list = [3.0, 2.0, 1.0, 0.8, 0.5]
    min_samples_list = [3, 3, 3, 3, 3, 2, 2]

    for i in xrange(1, n_iterations):
        eps = eps_list[i]
        #min_samples = min_samples_list[i]
        db = DBSCAN(eps=eps, min_samples=min_samples,
                    metric='rmsd').fit(phi_psi)
        print "Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:', min_samples
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        new_assignments = db.labels_
        old_assignments = merge_assignments(new_assignments, old_assignments)
        n_microstates = len(
            set(old_assignments)) - (1 if -1 in old_assignments else 0)
        print('Estimated number of clusters: %d' % n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(
            eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
                n_microstates)
        plot_cluster(labels=old_assignments,
                     phi_angles=phi_angles,
                     psi_angles=psi_angles,
                     name=iter_name,
                     potential=potential)
    labels = old_assignments
    print(labels)
    n_microstates = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    #cluster_centers_ = cluster.cluster_centers_
    # plot micro states
    clustering_name = "mr-dbscan_n_" + str(n_microstates)
    np.savetxt("assignments_" + clustering_name + ".txt", labels, fmt="%d")
    #np.savetxt("cluster_centers_"+clustering_name+".txt", cluster_centers_, fmt="%d")

    plot_cluster(labels=labels,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=clustering_name,
                 potential=potential)