Example #1
    def scale_data(self, scaler='Robust'):

        print('Scale featurized data has been called\n')
        print('-------------------------------\n')
        from msmbuilder.preprocessing import RobustScaler

        if scaler == 'Robust':
            scaler = RobustScaler()

        self.scaled_data = scaler.fit_transform(self.sim_seqs)

        print('scaled ', self.scaled_data[0].shape)
        print('Scaling featurized data finished successfully')
        print('-----------------------------------\n')
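A minimal usage sketch of the same scaling step outside the class (the dummy sim_seqs arrays below are assumptions, not data from the original code): msmbuilder's RobustScaler takes a list of per-trajectory feature arrays and returns a list of arrays with matching shapes.

# Sketch only: dummy per-trajectory feature arrays stand in for self.sim_seqs.
import numpy as np
from msmbuilder.preprocessing import RobustScaler

sim_seqs = [np.random.rand(100, 4), np.random.rand(80, 4)]
scaler = RobustScaler()
scaled_data = scaler.fit_transform(sim_seqs)
print(scaled_data[0].shape)  # (100, 4): one scaled array per input trajectory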
Example #2
def calculate_fitness(population_dihedral, diheds, score_global, i, lock):
    import pandas as pd
    import numpy as np
    pop_index = i
    new_diheds = []

    for i in range(0, len(diheds)):
        X = diheds[i]
        selected_features = X[:, population_dihedral]
        new_diheds.append(selected_features)
    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(new_diheds)
    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=5)
    tica_model.fit(scaled_diheds)
    tica_trajs = tica_model.transform(scaled_diheds)
    from msmbuilder.cluster import MiniBatchKMeans
    clusterer = MiniBatchKMeans(n_clusters=200, random_state=42)

    clustered_trajs = clusterer.fit_transform(tica_trajs)
    from msmbuilder.msm import MarkovStateModel
    msm = MarkovStateModel(lag_time=50, n_timescales=5)
    #msm.fit_transform(clustered_trajs)
    from sklearn.model_selection import KFold
    n_states = [4]
    cv = KFold(n_splits=5)
    results = []
    for n in n_states:
        msm.n_states_ = n
        for fold, (train_index, test_index) in enumerate(cv.split(clustered_trajs)):
            train_data = [clustered_trajs[i] for i in train_index]
            test_data = [clustered_trajs[i] for i in test_index]
            msm.fit(train_data)
            train_score = msm.score(train_data)
            test_score = msm.score(test_data)
            time_score = msm.timescales_[0]
            time_test_score = time_score + test_score
            print(time_score)
            print(test_score)
            av_score = time_test_score / 2
            results.append({
                'train_score': train_score,
                'test_score': test_score,
                'time_score': time_score,
                'av_score': av_score,
                'n_states': n,
                'fold': fold
            })
            print(msm.timescales_)
    results = pd.DataFrame(results)
    avgs = (results.groupby('n_states').aggregate(np.median).drop('fold',
                                                                  axis=1))
    best_nt = avgs['test_score'].idxmax()
    best_n = avgs['av_score'].idxmax()
    best_score = avgs.loc[best_n, 'av_score']
    best_scorent = avgs.loc[best_nt, 'test_score']
    print(best_scorent)
    lock.acquire()
    score_global.update({pop_index: best_scorent})
    lock.release()
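calculate_fitness writes its result into a shared dictionary under a lock, so the caller is expected to drive it with multiprocessing. A hedged sketch of that wiring, mirroring the main_modified loop later in this section (the wrapper name score_population is an assumption):

import multiprocessing

def score_population(population_dihedral, diheds):
    # Shared {population index: best test score} dict plus a lock, matching the
    # score_global and lock arguments that calculate_fitness expects.
    manager = multiprocessing.Manager()
    score = manager.dict()
    lock = multiprocessing.Lock()
    processes = [multiprocessing.Process(target=calculate_fitness,
                                         args=(population_dihedral[i], diheds, score, i, lock))
                 for i in range(len(population_dihedral))]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    return dict(score)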
def main():
    import argparse
    import numpy as np
    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m',
                     '--min_samples',
                     help='min_samples',
                     default=5,
                     type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprobe', default=10, type=int)

    # Download example dataset
    from msmbuilder.example_datasets import AlanineDipeptide
    ala2 = AlanineDipeptide(verbose=False)
    xyz = ala2.get().trajectories
    print(ala2.description())

    #xyz = [t[::10] for t in xyz]
    print("{} trajectories".format(len(xyz)))
    # msmbuilder does not keep track of units! You must keep track of your
    # data's timestep
    to_ns = 0.5
    print("with length {} ns".format(set(len(x) * to_ns for x in xyz)))

    from msmbuilder.featurizer import DihedralFeaturizer
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    diheds = featurizer.fit_transform(xyz)

    print(xyz[0].xyz.shape)
    print(diheds[0].shape)

    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(diheds)

    print(diheds[0].shape)
    print(scaled_diheds[0].shape)

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=2)
    # fit and transform can be done in separate steps:
    tica_model.fit(diheds)

    tica_trajs = tica_model.transform(diheds)
    featurizer = DihedralFeaturizer(types=['phi', 'psi'], sincos=False)
    diheds = featurizer.fit_transform(xyz)
    print(diheds[0].shape)
    print(tica_trajs[0].shape)

    # ===========================================================================
    #if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt") is True:
    #    phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
    #    psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    #X = np.column_stack((phi_angles, psi_angles))
    #print(X.shape)
    phi_angles = np.degrees(diheds[0][:, 0])
    psi_angles = np.degrees(diheds[0][:, 1])
    print(phi_angles)
    X = tica_trajs[0].astype(np.float32)
    #print(X)
    n_size = X.shape[0]
    dimension = X.shape[1]

    #print(X.shape)

    # ===========================================================================
    args = cli.parse_args()
    eps = args.eps  # eps
    min_samples = args.min_samples  # min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' %
          (n_size, dimension, eps, min_samples))

    n_samples = 1000
    percent = 0.9
    import random
    whole_samples = random.sample(list(X), n_samples)
    #print whole_samples
    # (a compact restatement of this eps estimation appears after main())
    from sklearn.metrics.pairwise import pairwise_distances
    sample_dist_metric = pairwise_distances(whole_samples,
                                            whole_samples,
                                            metric='l2')
    print(sample_dist_metric.shape)
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i + 1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    # other percentiles that have been used here: 0.005, 0.003, 0.002, 0.001,
    # 0.0008, 0.0005, 0.0003, 0.0002, 0.0001, 0.00005
    for percent in [0.30, 0.20, 0.10]:
        #percent /= 10.0
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        dc = sorted_sample_dist[index]
        #print index, sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)

    #print X
    # ===========================================================================
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    #potential = True
    remove_outliers = False
    potential = False
    eps = eps_list[0]
    min_samples = 1
    len_frames = len(X)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps,
                      min_samples=min_samples,
                      nlist=nlist,
                      nprobe=nprobe,
                      metric="l2",
                      GPU=False,
                      IVFFlat=IVFFlat)
    db.fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    old_assignments = db.labels_
    n_microstates = len(
        set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each states
    frame_bincount = np.bincount(
        old_assignments[old_assignments >= 0])  #remove outliers
    frame_freq_index_sorted = np.argsort(
        frame_bincount)[::-1]  # descending arg sort
    frame_freq_percent_sorted = frame_bincount[
        frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])
    old_frame_freq_percent_sorted = frame_freq_percent_sorted
    old_frame_freq_index_sorted = frame_freq_index_sorted
    n_microstates = len(
        set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)
    iter_name = clustering_name + '0' + '_eps_' + str(
        eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
            n_microstates)
    plot_cluster(labels=old_assignments,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=iter_name,
                 potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [50, 30, 10]
    #min_samples_list = [50, 30, 20, 15, 10, 8, 5, 2]
    n_min_samples = len(min_samples_list)
    #eps_list = [3.0, 2.0, 1.0, 0.8, 0.5]
    #min_samples_list = [3, 3, 3, 3, 3, 2, 2]

    results = np.zeros((n_min_samples, n_iterations, len_frames),
                       dtype=np.int32)
    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps,
                          min_samples=min_samples,
                          nlist=nlist,
                          nprobe=nprobe,
                          metric="l2",
                          GPU=False,
                          IVFFlat=IVFFlat).fit(X)

        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        new_assignments = db.labels_
        if i == n_iterations - 1:
            remove_outliers = True
        #else:
        #    remove_outliers = False
        assignments = merge_assignments(new_assignments,
                                        old_assignments,
                                        remove_outliers=remove_outliers)
        n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)

        #results[j,i, :]= np.array(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        #print('Estimated number of clusters: %d' % n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(
            eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
                n_microstates)
        plot_cluster(labels=assignments,
                     phi_angles=phi_angles,
                     psi_angles=psi_angles,
                     name=iter_name,
                     potential=potential)
        #old_assignments = assignments
    #print(results)
    #np.save("results.npy", results)
    #np.savetxt("results.csv", results, fmt="%d", delimiter=",")
    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt",
               min_samples_list,
               fmt="%d",
               delimiter=",")
Example #4

if __name__ == "__main__":

    trajectory_dir = '/Volumes/REA_Data/AADH/traj_5_rxts'
    topology_file = '/Users/robert_arbon/Code/AADH/Analysis/MSM_Reactants_Only/2agy_rxt.psf'
    reference_file = '/Users/robert_arbon/Code/AADH/Analysis/MSM_Reactants_Only/2agy_rxt.pdb'
    reference_traj = md.load(reference_file)

    # Load the meta data
    meta = load_metadata(traj_dir=trajectory_dir, top=topology_file)

    # Featurize
    feature = RawPositionsFeaturizer(ref_traj=reference_traj)
    ftrajs = featurize(featurizer=feature, meta_data=meta)

    # Summarize
    variance = np.var(combine(ftrajs), axis=0)
    plot_features(variance,
                  name='Variance.png',
                  feature_name='Variance',
                  ordered=False)

    # Normalize
    scaler = RobustScaler()
    strajs = scaler.fit_transform(ftrajs)

    # perform tICA
    tica_obj = tICA(n_components=10, lag_time=10, kinetic_mapping=True)
    tica_traj = tica_obj.fit_transform(strajs)
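A hedged follow-on sketch (an assumption, not part of the original script): clustering the tICA projection, mirroring the MiniBatchKMeans step used in Example #2 above.

# Assumes tica_traj is the list of projected trajectories produced above.
from msmbuilder.cluster import MiniBatchKMeans

clusterer = MiniBatchKMeans(n_clusters=100, random_state=42)
ctrajs = clusterer.fit_transform(tica_traj)
print('first clustered trajectory shape:', ctrajs[0].shape)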
Example #5
# Assumed imports for this excerpt; `featurizer` (a DihedralFeaturizer) and the
# trajectory dataset `ds` are defined earlier in the original script.
import pandas as pd
from msmbuilder.decomposition import tICA
from msmbuilder.utils import dump, verbosedump
#featurizer = DihedralFeaturizer(types=['chi1', 'chi2'], resids= 73,74,75,76,77,78,79,80,81,82,83)
diheds = featurizer.fit_transform(ds)
dump(diheds, "features.pkl")

#print(ds[0].shape)
print(diheds[0].shape)

# this basically maps every feature to atom indices.
df1 = pd.DataFrame(featurizer.describe_features(ds))
dump(df1, "feature_descriptor.pkl")

#Robust scaling
from msmbuilder.preprocessing import RobustScaler
scaler = RobustScaler()
scaled_diheds = scaler.fit_transform(diheds)

print(diheds[0].shape)
print(scaled_diheds[0].shape)

#Reducing dimension
tica_model = tICA(lag_time=1, n_components=10)
# fit and transform can be done in separate steps:
tica_model.fit(diheds)
tica_trajs = tica_model.transform(diheds)

print(diheds[0].shape)
print(tica_trajs[0].shape)

# let's dump the tICA model for future use
verbosedump(tica_model, "tica_mdl_flapchi1angle.pkl")
def main_modified(generations):
    import numpy as np
    from msmbuilder.preprocessing import RobustScaler
    import time
    import os
    import multiprocessing
    os.environ["OMP_NUM_THREADS"] = "1"
    import operator

    diheds = Get_dihedral_features_villin()
    scaler = RobustScaler()
    scaled_feature = scaler.fit_transform(diheds)
    # Laplacian_score returns the per-dihedral column means and the important features
    Val = Laplacian_score(scaled_feature)
    col_mean = Val[0]
    imp_features = Val[1]
    current_gen = 0
    for_each_gen_score = []
    population_each_gen = []
    population_dihedral = initial_population(imp_features)
    cross_probability = 0.8
    num_parents = int(cross_probability * len(population_dihedral))
    population_dihedral_duplicate = []
    numberOfThreads = multiprocessing.cpu_count()
    f = open("benzamidine_diheds_ga_score" + str(generations) + ".txt", "a")
    while current_gen < generations:
        manager = multiprocessing.Manager()
        score = manager.dict()
        processes = []
        lock = multiprocessing.Lock()
        for i in range(len(population_dihedral)):
            p = multiprocessing.Process(target=calculate_fitness,
                                        args=(population_dihedral[i], scaled_feature, score, i, lock))
            processes.append(p)
        #starttime = time.time()
        # chunks limits how many worker processes run at once; a sketch of it
        # appears after this function.
        for i in chunks(processes, numberOfThreads):
            p_count = 0
            for process in i:
                process.start()
                p_count = p_count + 1
            print("started processes: " + str(p_count))
            for process in i:
                process.join()
                p_count = p_count - 1
            print("joined processes: " + str(p_count))
            for process in i:
                process.terminate()
                p_count = p_count + 1
            print("terminated processes: " + str(p_count))
        scored_population = dict(sorted(score.items(), key=operator.itemgetter(1)))
        for_each_gen_score.append(scored_population)
        population_each_gen.append(population_dihedral)
        scored_population_list = list(scored_population.keys())
        parents = select_parents_rank_based(scored_population, population_dihedral, cross_probability)
        offsprings_1 = crossover(parents, population_dihedral)

        parents_binary = parents_binarize(parents, imp_features)
        count_mutation = len(population_dihedral) - len(offsprings_1)
        offsprings_2_binary = mutation_binary_offspring(parents_binary, 4, count_mutation)
        offsprings_2 = binary_to_pop_dih(offsprings_2_binary)
        for i in range(len(offsprings_2)):
            offsprings_2[i] = np.asarray(offsprings_2[i])
        for i in range(len(offsprings_1)):
            offsprings_1[i] = np.asarray(offsprings_1[i])
        offsprings = offsprings_1 + offsprings_2
        # offsprings.append(population_dihedral[scored_population_list[len(scored_population_list) - 1]])
        # offsprings.append(population_dihedral[scored_population_list[len(scored_population_list) - 2]])
        population_dihedral = offsprings
        current_gen = current_gen + 1
    print(for_each_gen_score, file=f)
    f.close()
    return for_each_gen_score, population_each_gen, scaled_feature, imp_features
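A hedged sketch of the chunks helper that the loop above relies on but the original does not define: yield successive slices of at most size items so that only numberOfThreads processes run at a time.

def chunks(seq, size):
    # Yield consecutive slices of `seq` with at most `size` elements each.
    for start in range(0, len(seq), size):
        yield seq[start:start + size]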
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# load_trajs, save_trajs and save_generic are assumed to come from
# msmbuilder.io, as in msmbuilder's project templates.
from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.preprocessing import RobustScaler
from utilities import plot_box

if __name__ == '__main__':

    # Load
    feature_name = 'Positions'
    meta, feature_trajs = load_trajs('Unscaled-{}-ftraj'.format(feature_name))

    # Select scaler
    featurizer = RobustScaler()

    # Fit the scaler, then transform each trajectory
    featurizer.fit(feature_trajs.values())
    scaled_trajs = {}
    for k, v in feature_trajs.items():
        scaled_trajs[k] = featurizer.partial_transform(v)

    # Plot scaled features
    ftrajs = np.concatenate([fx[::100] for fx in scaled_trajs.values()])
    fig, ax = plt.subplots(figsize=(15, 5))
    plot_box(ax, fxx=ftrajs, feature_name='Scaled {}'.format(feature_name))
    fig.tight_layout()
    fig.savefig("Scaled-{}-box.pdf".format(feature_name))

    # Save
    save_trajs(scaled_trajs, 'Scaled-{}-ftraj'.format(feature_name), meta)
    save_generic(featurizer, 'Scaled-{}-featurizer.pickl'.format(feature_name))
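A hedged sketch (an assumption, not in the original script) of reloading the saved artifacts in a later step with the matching msmbuilder.io helpers:

from msmbuilder.io import load_trajs, load_generic

meta, scaled_trajs = load_trajs('Scaled-Positions-ftraj')
scaler = load_generic('Scaled-Positions-featurizer.pickl')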