def test_function_featurizer():
    """Check that FunctionFeaturizer reproduces DihedralFeaturizer's phi.

    The phi angle is computed three ways (a function with keyword
    arguments, a wrapper function without arguments, and the built-in
    DihedralFeaturizer) and the results are compared.
    """
    first_traj = AlanineDipeptide().get_cached().trajectories[0]

    # atom quadruplet used as the phi dihedral of alanine
    phi_atoms = [[4, 6, 8, 14]]

    # 1) plain function, extra arguments forwarded through func_args
    with_args = FunctionFeaturizer(compute_dihedrals,
                                   func_args={"indices": phi_atoms})
    expected = with_args.transform([first_traj])

    # 2) function wrapping another function, no extra arguments
    def phi_only(trj):
        return compute_phi(trj)[1]

    wrapped = FunctionFeaturizer(phi_only).transform([first_traj])

    # 3) known-good result
    reference = DihedralFeaturizer(['phi'], sincos=False).transform(
        [first_traj])

    np.testing.assert_array_almost_equal(expected, wrapped)
    np.testing.assert_array_almost_equal(expected, reference)
def test_pipeline():
    """Smoke test: dihedral featurization piped into a 4-state VonMisesHMM."""
    trajs = AlanineDipeptide().get_cached().trajectories
    steps = [
        ('diheds', DihedralFeaturizer(['phi', 'psi'], sincos=False)),
        ('hmm', VonMisesHMM(n_states=4)),
    ]
    pipeline = Pipeline(steps)
    # Only checks that fitting/predicting and summarizing run without error.
    pipeline.fit_predict(trajs)
    pipeline.named_steps['hmm'].summarize()
def test_that_all_featurizers_run():
    """Smoke test: build several featurizers and transform the dataset."""
    # TODO: include all featurizers, perhaps with generator tests
    trajectories = AlanineDipeptide().get_cached().trajectories
    trj0 = trajectories[0][0]
    atom_indices, pair_indices = get_atompair_indices(trj0)

    # Each featurizer is built lazily, right before it is used, so that
    # construction and transformation alternate one featurizer at a time.
    # ContactFeaturizer is left out: it doesn't work on ALA dipeptide.
    builders = (
        lambda: AtomPairsFeaturizer(pair_indices),
        lambda: SuperposeFeaturizer(np.arange(15), trj0),
        lambda: DihedralFeaturizer(["phi", "psi"]),
        lambda: VonMisesFeaturizer(["phi", "psi"]),
        lambda: RMSDFeaturizer(trj0),
    )
    for build in builders:
        build().transform(trajectories)
def test_alanine_dipeptide():
    """AlphaAngleFeaturizer should give zero feature columns on this data."""
    # will produce 0 features because not enough peptides
    trajectories = AlanineDipeptide().get_cached().trajectories
    transformed = msmbuilder.featurizer.AlphaAngleFeaturizer().transform(
        trajectories)
    n_features = transformed[0].shape[1]
    assert n_features == 0
def setup():
    """Populate the module-level fixtures shared by the tests.

    Sets the globals ``X_double``/``Y_double`` (random float64 arrays),
    ``X_float``/``Y_float`` (float32 versions), ``X_rmsd``/``Y_rmsd``
    (centered trajectory slices) and ``X_indices`` (five random row
    indices in ``[0, 9]`` as ``np.intp``).
    """
    global X_double, Y_double, X_float, Y_float, X_rmsd, Y_rmsd, X_indices

    X_double = random.randn(10, 2)
    Y_double = random.randn(3, 2)
    X_float = random.randn(10, 2).astype(np.float32)
    Y_float = random.randn(3, 2).astype(np.float32)

    # Load the dataset once; each slice below is its own trajectory object
    # (mdtraj slicing copies by default), so centering one does not touch
    # the other.
    first_traj = AlanineDipeptide().get().trajectories[0]
    X_rmsd = first_traj[0:10]
    Y_rmsd = first_traj[30:33]
    X_rmsd.center_coordinates()
    Y_rmsd.center_coordinates()

    # random_integers(low=0, high=9) is deprecated; randint's upper bound is
    # exclusive, so 10 gives the same inclusive range [0, 9].
    X_indices = random.randint(0, 10, size=5).astype(np.intp)
def test_alanine_dipeptide():
    """Ward-linkage landmark clustering must work with the rmsd metric."""
    # keep n_landmarks small or this will get really slow
    n_clusters = 4
    first_hundred = AlanineDipeptide().get_cached().trajectories[0][0:100]
    model = LandmarkAgglomerative(n_clusters=n_clusters, n_landmarks=20,
                                  linkage='ward', metric='rmsd')
    labels = model.fit_predict(first_hundred)
    distinct_labels = np.unique(np.concatenate(labels))
    assert len(distinct_labels) <= n_clusters
def test_featureselector():
    """Selecting 'phi' must match the first featurizer in FEATS directly."""
    first_traj = AlanineDipeptide().get_cached().trajectories[0]
    selector = FeatureSelector(FEATS, which_feat='phi')
    # a single string is expected to be stored as a one-element list
    assert selector.which_feat == ['phi']

    selected = selector.partial_transform(first_traj)
    expected = FEATS[0][1].partial_transform(first_traj)
    np.testing.assert_array_almost_equal(expected, selected)
def test_pipeline():
    """Smoke test: backbone superposition piped into a 4-state GaussianHMM."""
    trajs = AlanineDipeptide().get_cached().trajectories
    backbone = trajs[0].topology.select('backbone')
    reference_frame = trajs[0][0]
    pipeline = Pipeline([
        ('diheds', SuperposeFeaturizer(backbone, reference_frame)),
        ('hmm', GaussianHMM(n_states=4)),
    ])
    # Only checks that fitting/predicting and summarizing run without error.
    pipeline.fit_predict(trajs)
    pipeline.named_steps['hmm'].summarize()
def test_SubsetAtomPairs_1():
    """SubsetAtomPairs with the full subset vs. AtomPairsFeaturizer."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    trj0 = trajectories[0][0]
    atom_indices, pair_indices = get_atompair_indices(trj0)

    reference = AtomPairsFeaturizer(pair_indices).transform(trajectories)

    subset_featurizer = SubsetAtomPairs(pair_indices, trj0)
    subset_featurizer.subset = np.arange(len(pair_indices))
    candidate = subset_featurizer.transform(trajectories)

    # eq() is called for every trajectory pair; its return value is unused.
    for got, want in zip(candidate, reference):
        eq(got, want)
def test_alanine_dipeptide_basic():
    """RMSDFeaturizer to frame 0 must agree with mdtraj's own rmsd.

    Takes the rmsd of the 0th alanine dipeptide trajectory relative to
    its 0th frame and compares it with the value computed straight from
    mdtraj.
    """
    trajectories = AlanineDipeptide().get_cached().trajectories
    first_traj = trajectories[0]
    reference_frame = first_traj[0]

    data = RMSDFeaturizer(reference_frame).transform(trajectories[0:1])
    expected = md.rmsd(first_traj, reference_frame)

    computed = data[0][:, 0]
    np.testing.assert_array_almost_equal(computed, expected, decimal=4)
def test_variancethreshold_vs_sklearn():
    """VarianceThreshold on a one-item list must match VarianceThresholdR."""
    first_traj = AlanineDipeptide().get_cached().trajectories[0]
    selector = FeatureSelector(FEATS)
    list_based = VarianceThreshold(0.1)
    array_based = VarianceThresholdR(0.1)

    features = selector.partial_transform(first_traj)
    # one transformer takes a list of arrays, the other a single array
    got = list_based.fit_transform([features])[0]
    want = array_based.fit_transform(features)
    np.testing.assert_array_almost_equal(want, got)
def test_two_refs_omitting_indices():
    """Omitting atom_indices must equal passing every atom index."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    first_traj = trajectories[0]
    references = first_traj[0:2]
    every_atom = np.arange(first_traj.n_atoms)

    explicit = RMSDFeaturizer(references, every_atom).transform(
        trajectories[0:1])
    implicit = RMSDFeaturizer(first_traj[0:2]).transform(trajectories[0:1])

    np.testing.assert_array_almost_equal(implicit[0], explicit[0],
                                         decimal=4)
def load_aladip():
    """Featurize the alanine dipeptide dataset two ways.

    Returns
    -------
    X : result of AtomPairsFeaturizer.fit_transform over all atom pairs
        (j, i) with j < i < 22
    Y : result of DihedralFeaturizer.fit_transform with default settings
    """
    from msmbuilder.example_datasets import AlanineDipeptide
    trajs = AlanineDipeptide().get().trajectories

    from msmbuilder.featurizer import AtomPairsFeaturizer
    # same ordering as the nested loops: outer index i, inner index j < i
    pairs = [(j, i) for i in range(22) for j in range(i)]
    X = AtomPairsFeaturizer(pairs).fit_transform(trajs)

    from msmbuilder.featurizer import DihedralFeaturizer
    Y = DihedralFeaturizer().fit_transform(trajs)
    return X, Y
def test_ala2():
    """Fit a 4-state GaussianHMM on the ALA2 data and check its timescales.

    Nothing fancy: mainly makes sure the code runs without erroring out,
    then checks that there are three timescales and that at least one of
    them exceeds 50.
    """
    trajectories = AlanineDipeptide().get_cached().trajectories
    topology = trajectories[0].topology
    indices = topology.select('symbol C or symbol O or symbol N')
    featurizer = SuperposeFeaturizer(indices, trajectories[0][0])
    sequences = featurizer.transform(trajectories)

    hmm = GaussianHMM(n_states=4, n_init=3, random_state=rs)
    hmm.fit(sequences)

    # The comparison belongs outside len(): `len(x == 3)` is just the length
    # of a boolean array and is truthy for any non-empty timescales_.
    assert len(hmm.timescales_) == 3
    assert np.any(hmm.timescales_ > 50)
def test_code_works():
    """Fit a 4-state VonMisesHMM on the ALA2 data and check its timescales.

    Nothing fancy: mainly makes sure the code runs without erroring out,
    then checks that there are three timescales and that at least one of
    them exceeds 50.
    """
    trajectories = AlanineDipeptide().get_cached().trajectories
    # NOTE(review): the second positional argument is a trajectory frame,
    # which looks copied from a SuperposeFeaturizer call -- confirm which
    # DihedralFeaturizer parameter it binds to.  Left unchanged here.
    featurizer = DihedralFeaturizer(['phi', 'psi'], trajectories[0][0])
    sequences = featurizer.transform(trajectories)

    hmm = VonMisesHMM(n_states=4, n_init=1)
    hmm.fit(sequences)

    # The comparison belongs outside len(): `len(x == 3)` is just the length
    # of a boolean array and is truthy for any non-empty timescales_.
    assert len(hmm.timescales_) == 3
    assert np.any(hmm.timescales_ > 50)
def test_SubsetAtomPairs_3():
    """A two-pair subset must NOT compare equal to the full featurization."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    trj0 = trajectories[0][0]
    atom_indices, pair_indices = get_atompair_indices(trj0)

    reference = AtomPairsFeaturizer(pair_indices).transform(trajectories)
    restricted = SubsetAtomPairs(pair_indices, trj0,
                                 subset=np.array([0, 1]))
    candidate = restricted.transform(trajectories)

    # eq() is expected to raise AssertionError on the mismatch.
    mismatch_detected = False
    try:
        for got, want in zip(candidate, reference):
            eq(got, want)
    except AssertionError:
        mismatch_detected = True

    if not mismatch_detected:
        raise AssertionError("Did not raise an assertion!")
def setup():
    """Populate the module-level fixtures shared by the tests.

    Sets the globals ``X_double``/``Y_double`` (random float64 arrays),
    ``X_float``/``Y_float`` (float32 versions), ``X_rmsd``/``Y_rmsd``
    (centered trajectory slices) and ``X_indices`` (five random row
    indices in ``[0, 9]``).
    """
    global X_double, Y_double, X_float, Y_float, X_rmsd, Y_rmsd, X_indices

    X_double = random.randn(10, 2)
    Y_double = random.randn(3, 2)
    X_float = random.randn(10, 2).astype(np.float32)
    # kept as np.random: this block does not establish that the bare
    # `random` name is numpy.random (presumably it is -- verify).
    Y_float = np.random.randn(3, 2).astype(np.float32)

    # Load the dataset once; each slice below is its own trajectory object
    # (mdtraj slicing copies by default), so centering one does not touch
    # the other.
    first_traj = AlanineDipeptide().get().trajectories[0]
    X_rmsd = first_traj[0:10]
    Y_rmsd = first_traj[30:33]
    X_rmsd.center_coordinates()
    Y_rmsd.center_coordinates()

    # random_integers(low=0, high=9) is deprecated; randint's upper bound is
    # exclusive, so 10 gives the same inclusive range [0, 9].  The integer
    # dtype is deliberately left at randint's default, as before.
    X_indices = random.randint(0, 10, size=5)
def setup():
    """Populate the module-level fixtures shared by the tests.

    Random arrays are drawn in a fixed order (double X, double Y, float X,
    float Y, then the indices) so that a seeded generator yields the same
    fixtures.
    """
    global X_double, Y_double, X_float, Y_float, X_rmsd, Y_rmsd, X_indices

    # float64 fixtures
    X_double = random.randn(10, 2)
    Y_double = random.randn(3, 2)

    # float32 fixtures
    x_single = random.randn(10, 2)
    X_float = x_single.astype(np.float32)
    y_single = random.randn(3, 2)
    Y_float = y_single.astype(np.float32)

    # trajectory fixtures: two separate slices, each centered in place
    x_source = AlanineDipeptide().get_cached().trajectories[0]
    X_rmsd = x_source[0:10]
    y_source = AlanineDipeptide().get_cached().trajectories[0]
    Y_rmsd = y_source[30:33]
    X_rmsd.center_coordinates()
    Y_rmsd.center_coordinates()

    # five row indices in [0, 10)
    drawn = random.randint(0, 10, size=5)
    X_indices = drawn.astype(np.intp)
def test_ktica_compare_to_tica():
    """Linear-kernel KernelTICA should roughly reproduce plain tICA."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    dihedrals = DihedralFeaturizer(sincos=True).transform(trajectories[0:1])
    # keep every 10th frame of the single featurized trajectory
    features = [dihedrals[0][::10]]

    linear = tICA(lag_time=1, n_components=2)
    kernel = KernelTICA(lag_time=1, kernel='linear', n_components=2,
                        random_state=42)

    expected = linear.fit_transform(features)[0]
    got = kernel.fit_transform(features)[0]

    assert_array_almost_equal(got, expected, decimal=1)
def test_pickle():
    """A VonMisesHMM must predict the same logprob after a pickle round trip."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    first_traj = trajectories[0]
    topology = first_traj.topology
    indices = topology.select('symbol C or symbol O or symbol N')
    featurizer = DihedralFeaturizer(['phi', 'psi'], first_traj[0])
    sequences = featurizer.transform(trajectories)

    model = VonMisesHMM(n_states=4, n_init=1)
    model.fit(sequences)
    logprob, hidden = model.predict(sequences)

    # dump to a temporary file, rewind, and load the model back
    with tempfile.TemporaryFile() as handle:
        pickle.dump(model, handle)
        handle.seek(0, 0)
        restored = pickle.load(handle)

    logprob2, hidden2 = restored.predict(sequences)
    assert (logprob == logprob2)
def test_1():
    """Fit a 4-state GaussianFusionHMM on the ALA2 data and check timescales.

    Nothing fancy: mainly makes sure the code runs without erroring out,
    then checks that there are three timescales and that at least one of
    them exceeds 50.
    """
    dataset = AlanineDipeptide().get()
    trajectories = dataset.trajectories
    topology = trajectories[0].topology
    indices = topology.select('symbol C or symbol O or symbol N')
    featurizer = SuperposeFeaturizer(indices, trajectories[0][0])
    sequences = featurizer.transform(trajectories)

    hmm = GaussianFusionHMM(n_states=4, n_features=sequences[0].shape[1],
                            n_init=1)
    hmm.fit(sequences)

    # The comparison belongs outside len(): `len(x == 3)` is just the length
    # of a boolean array and is truthy for any non-empty timescales_.
    assert len(hmm.timescales_) == 3
    assert np.any(hmm.timescales_ > 50)
def test_different_indices():
    """RMSDs over different atom subsets differ but keep the same shape."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    first_traj = trajectories[0]
    n_atoms = first_traj.n_atoms
    midpoint = n_atoms // 2

    lower_atoms = np.arange(midpoint)
    lower = RMSDFeaturizer(first_traj[0], lower_atoms).transform(
        trajectories[0:1])

    upper_atoms = np.arange(midpoint, n_atoms)
    upper = RMSDFeaturizer(first_traj[0], upper_atoms).transform(
        trajectories[0:1])

    assert lower[0].shape == upper[0].shape

    # janky way to show that the arrays shouldn't be equal here
    lower_total = sum(lower[0][:, 0])
    upper_total = sum(upper[0][:, 0])
    assert lower_total != upper_total
def test_two_refs_basic():
    """RMSDFeaturizer with two reference frames must match mdtraj.

    Frames 0 and 1 of the 0th trajectory are the references.  The rmsd of
    frame 0 to reference 0 must equal that of frame 1 to reference 1, the
    two cross terms must agree, and the full result must match mdtraj.
    """
    trajectories = AlanineDipeptide().get_cached().trajectories
    first_traj = trajectories[0]

    data = RMSDFeaturizer(first_traj[0:2]).transform(trajectories[0:1])
    rmsds = data[0]

    # one column of expected rmsds per reference frame
    true_rmsd = np.zeros((first_traj.n_frames, 2))
    for column in (0, 1):
        true_rmsd[:, column] = md.rmsd(first_traj, first_traj[column])

    np.testing.assert_almost_equal(rmsds[0, 0], rmsds[1, 1], decimal=3)
    np.testing.assert_almost_equal(rmsds[1, 0], rmsds[0, 1], decimal=3)
    np.testing.assert_array_almost_equal(rmsds, true_rmsd, decimal=4)
def test_feature_slicer():
    """FeatureSlicer keeps only the requested dihedral feature columns."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    base = DihedralFeaturizer()

    def check_slice(columns, absent_type):
        # Slice `columns` out of the dihedral features and verify that two
        # features remain, none of which belongs to `absent_type`.
        slicer = FeatureSlicer(base, indices=columns)
        sliced = slicer.transform(trajectories)
        assert sliced[0].shape[1] == 2

        table = pd.DataFrame(slicer.describe_features(trajectories[0]))
        assert len(table) == 2
        assert absent_type not in table.featuregroup[0]
        assert absent_type not in table.featuregroup[1]

    check_slice([0, 1], 'psi')
    check_slice([2, 3], 'phi')
def test_that_all_featurizers_run():
    """A SubsetFeatureUnion with one feature per member yields that many columns."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    trj0 = trajectories[0][0]
    atom_indices, pair_indices = get_atompair_indices(trj0)

    members = [
        ("pairs", SubsetAtomPairs(pair_indices, trj0, exponent=-1.0)),
        ("cosphi", SubsetCosPhiFeaturizer(trj0)),
        ("sinphi", SubsetSinPhiFeaturizer(trj0)),
        ("cospsi", SubsetCosPsiFeaturizer(trj0)),
        ("sinpsi", SubsetSinPsiFeaturizer(trj0)),
    ]
    union = SubsetFeatureUnion(members)

    # keep only feature 0 of every member featurizer
    union.subsets = [np.arange(1) for _ in range(union.n_featurizers)]

    transformed = union.transform(trajectories)
    eq(transformed[0].shape[1], 1 * union.n_featurizers)
def test_von_mises_featurizer():
    """VonMisesFeaturizer output has n_bins columns per dihedral type."""
    trajectories = AlanineDipeptide().get_cached().trajectories

    # (dihedral types, n_bins, expected number of feature columns)
    cases = (
        (["phi"], 18, 18),
        (["phi", "psi"], 18, 36),
        (["phi", "psi"], 10, 20),
    )
    for types, n_bins, n_columns in cases:
        transformed = VonMisesFeaturizer(types, n_bins=n_bins).transform(
            trajectories)
        n_frames = trajectories[0].n_frames
        assert transformed[0].shape == (n_frames, n_columns), (
            "unexpected shape returned: (%s, %s)" % transformed[0].shape)
def load(self):
    """Return the alanine dipeptide trajectories and ``None``.

    The dataset is fetched with ``verbose=False``; the second element of
    the returned pair is always ``None``.
    """
    from msmbuilder.example_datasets import AlanineDipeptide

    dataset = AlanineDipeptide(verbose=False).get()
    return dataset.trajectories, None
Copied from <http://msmbuilder.org/latest/examples/hmm-and-msm.html> """ import os from matplotlib.pyplot import * import matplotlib.pyplot as plt plt.style.use("ggplot") from msmbuilder.featurizer import SuperposeFeaturizer from msmbuilder.example_datasets import AlanineDipeptide from msmbuilder.hmm import GaussianHMM from msmbuilder.cluster import KCenters from msmbuilder.msm import MarkovStateModel dataset = AlanineDipeptide().get() trajectories = dataset.trajectories topology = trajectories[0].topology indices = [ atom.index for atom in topology.atoms if atom.element.symbol in ['C', 'O', 'N'] ] featurizer = SuperposeFeaturizer(indices, trajectories[0][0]) sequences = featurizer.transform(trajectories) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # ~~~~~~~~~~~~~~ HIDDEN MARKOV MODEL ~~~~~~~~~~~~~~~~~~~~~~ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lag_times = [1, 10, 20, 30, 40] hmm_ts0 = {}
def test_featureselector_transform():
    """transform() must return one item per input trajectory."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    selector = FeatureSelector(FEATS, which_feat='psi')
    transformed = selector.transform(trajectories)
    n_out, n_in = len(transformed), len(trajectories)
    assert n_out == n_in
import mdtraj as md
import numpy as np

# Largest RMSD of the sampled trajectory, as returned by md.rmsd(traj, traj).
traj = md.load("samples_from_model.h5")
sampled_rmsd = md.rmsd(traj, traj)
print(np.max(sampled_rmsd))

from msmbuilder.example_datasets import AlanineDipeptide

# Same check on the first alanine dipeptide trajectory, plus the RMSD
# computed against a reference made of its first ten frames.
traj = AlanineDipeptide().get().trajectories[0]
rmsd_vs_head = md.rmsd(traj, traj[:10])
print(rmsd_vs_head)
dataset_rmsd = md.rmsd(traj, traj)
print(np.max(dataset_rmsd))
def _count_clusters(labels):
    """Return the number of distinct labels, not counting the outlier label -1."""
    return len(set(labels)) - (1 if -1 in labels else 0)


def main():
    """Run multi-resolution DBSCAN on tICA-projected alanine dipeptide data.

    Steps:
      1. Parse the command line (done first so bad arguments or ``--help``
         do not wait for the dataset to load).
      2. Load the dataset, featurize phi/psi dihedrals and project onto
         two tICA components.
      3. Derive a list of ``eps`` values from quantiles of the pairwise
         distances between randomly sampled frames.
      4. Cluster once with the first ``eps``, then re-cluster with each
         further (eps, min_samples) pair and merge into the first
         assignment, plotting every iteration.
      5. Write ``eps_list.txt`` and ``min_samples_list.txt``.

    NOTE: ``--eps`` and ``--min_samples`` are only echoed in the parameter
    printout; clustering uses the quantile-derived ``eps`` values and the
    hard-coded ``min_samples`` values below.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m', '--min_samples', help='min_samples', default=5,
                     type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprob', default=10, type=int)
    args = cli.parse_args()
    eps = args.eps
    min_samples = args.min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True

    # Download example dataset
    from msmbuilder.example_datasets import AlanineDipeptide
    ala2 = AlanineDipeptide(verbose=False)
    xyz = ala2.get().trajectories
    print(ala2.description())
    print("{} trajectories".format(len(xyz)))

    # msmbuilder does not keep track of units! You must keep track of your
    # data's timestep
    to_ns = 0.5
    print("with length {} ns".format(set(len(x) * to_ns for x in xyz)))

    from msmbuilder.featurizer import DihedralFeaturizer
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    diheds = featurizer.fit_transform(xyz)
    print(xyz[0].xyz.shape)
    print(diheds[0].shape)

    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(diheds)
    print(diheds[0].shape)
    print(scaled_diheds[0].shape)

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=2)
    # fit and transform can be done in separate steps; tICA is fitted on the
    # unscaled dihedral features
    tica_model.fit(diheds)
    tica_trajs = tica_model.transform(diheds)

    # Raw angles (sincos=False) are only needed for plotting.
    featurizer = DihedralFeaturizer(types=['phi', 'psi'], sincos=False)
    diheds = featurizer.fit_transform(xyz)
    print(diheds[0].shape)
    print(tica_trajs[0].shape)

    phi_angles = np.degrees(diheds[0][:, 0])
    psi_angles = np.degrees(diheds[0][:, 1])
    print(phi_angles)

    # Only the first trajectory is clustered.
    X = tica_trajs[0].astype(np.float32)
    n_size = X.shape[0]
    dimension = X.shape[1]

    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' %
          (n_size, dimension, eps, min_samples))

    # ------------------------------------------------------------------
    # Estimate eps values from the distance distribution of a random sample.
    # random.sample raises if asked for more items than exist, so cap the
    # sample size at the number of frames.
    n_samples = min(1000, n_size)
    import random
    whole_samples = random.sample(list(X), n_samples)

    from metrics.pairwise import pairwise_distances
    sample_dist_metric = pairwise_distances(whole_samples, whole_samples,
                                            metric='l2')
    print(sample_dist_metric.shape)

    # Strict upper triangle (i < j) in row-major order: every unordered pair
    # exactly once, without a Python-level double loop.
    upper = np.triu_indices(n_samples, k=1)
    sample_dist = np.asarray(sample_dist_metric)[upper]
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.30, 0.20, 0.10]:
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        eps_list.append(sorted_sample_dist[index])
    print(eps_list)

    # ------------------------------------------------------------------
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    remove_outliers = False
    potential = False

    eps = eps_list[0]
    min_samples = 1
    len_frames = len(X)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist,
                      nprobe=nprobe, metric="l2", GPU=False,
                      IVFFlat=IVFFlat)
    db.fit(X)
    old_assignments = db.labels_
    n_microstates = _count_clusters(old_assignments)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each state (outliers, label -1, removed)
    frame_bincount = np.bincount(old_assignments[old_assignments >= 0])
    frame_freq_index_sorted = np.argsort(frame_bincount)[::-1]  # descending
    frame_freq_percent_sorted = (frame_bincount[frame_freq_index_sorted] /
                                 np.float32(len_frames))
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])

    # Printed a second time to keep the console output unchanged.
    print('Estimated number of clusters: %d' % n_microstates)
    iter_name = (clustering_name + '0' + '_eps_' + str(eps) +
                 '_min_samples_' + str(min_samples) + '_n_states_' +
                 str(n_microstates))
    plot_cluster(labels=old_assignments, phi_angles=phi_angles,
                 psi_angles=psi_angles, name=iter_name, potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [50, 30, 10]

    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist,
                          nprobe=nprobe, metric="l2", GPU=False,
                          IVFFlat=IVFFlat).fit(X)
        new_assignments = db.labels_

        # Outliers are removed only on the last iteration.  Compare by
        # value: `is` on ints only works by accident of small-int caching.
        if i == n_iterations - 1:
            remove_outliers = True

        # Every iteration is merged against the FIRST assignment;
        # old_assignments is intentionally not updated.
        assignments = merge_assignments(new_assignments, old_assignments,
                                        remove_outliers=remove_outliers)
        n_microstates = _count_clusters(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        iter_name = (clustering_name + str(i) + '_eps_' + str(eps) +
                     '_min_samples_' + str(min_samples) + '_n_states_' +
                     str(n_microstates))
        plot_cluster(labels=assignments, phi_angles=phi_angles,
                     psi_angles=psi_angles, name=iter_name,
                     potential=potential)

    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt", min_samples_list, fmt="%d",
               delimiter=",")