def test_featureselector():
    """Selecting only ``"phi"`` must reproduce the first featurizer in ``FEATS``."""
    trajectories = AlanineDipeptide().get_cached().trajectories

    selector = FeatureSelector(FEATS, which_feat="phi")
    # A bare string is expected to come back as a one-element list.
    assert selector.which_feat == ["phi"]

    first_traj = trajectories[0]
    selected = selector.partial_transform(first_traj)
    # FEATS is indexed as (name, featurizer) pairs; entry 0 is used as the
    # reference and is presumably the "phi" featurizer.
    reference = FEATS[0][1].partial_transform(first_traj)
    np.testing.assert_array_almost_equal(reference, selected)
def test_featureselector():
    """Check that a ``'phi'``-only selector matches ``FEATS[0]``'s featurizer."""
    trajectories = AlanineDipeptide().get_cached().trajectories

    phi_only = FeatureSelector(FEATS, which_feat='phi')
    # The single string passed in should be exposed as a list of names.
    assert phi_only.which_feat == ['phi']

    traj = trajectories[0]
    got = phi_only.partial_transform(traj)
    # Reference output comes straight from the featurizer stored in the first
    # (name, featurizer) entry of FEATS.
    want = FEATS[0][1].partial_transform(traj)
    np.testing.assert_array_almost_equal(want, got)
def test_variancethreshold_vs_sklearn():
    """Compare ``VarianceThreshold`` with the reference ``VarianceThresholdR``.

    Both selectors use a threshold of 0.1 and are fitted on the features of
    the first cached alanine dipeptide trajectory.
    """
    trajectories = AlanineDipeptide().get_cached().trajectories
    selector = FeatureSelector(FEATS)
    under_test = VarianceThreshold(0.1)
    # Presumably the scikit-learn implementation imported under an alias.
    reference = VarianceThresholdR(0.1)

    features = selector.partial_transform(trajectories[0])

    # VarianceThreshold is fed a one-element list and its first result is
    # taken; the reference is fed the bare array.
    filtered = under_test.fit_transform([features])[0]
    expected = reference.fit_transform(features)
    np.testing.assert_array_almost_equal(expected, filtered)
def test_which_feat_types():
    """Exercise the container types accepted and rejected for ``which_feat``."""
    # A tuple keeps its order and is exposed as a list.
    selector = FeatureSelector(FEATS, which_feat=('phi', 'psi'))
    assert selector.which_feat == ['phi', 'psi']

    # A set has no defined order, so either ordering is acceptable.
    selector = FeatureSelector(FEATS, which_feat={'phi', 'psi'})
    assert selector.which_feat in (['phi', 'psi'], ['psi', 'phi'])

    # A dict must be rejected with TypeError.
    try:
        FeatureSelector(FEATS, which_feat={'phi': 'psi'})
    except TypeError:
        pass
    else:
        assert False

    # An unknown feature name must be rejected with ValueError.
    try:
        FeatureSelector(FEATS, which_feat=['phiii'])
    except ValueError:
        pass
    else:
        assert False
def test_FeatureSelector_describe_features():
    """Check ``FeatureSelector.describe_features`` against its member featurizers.

    A contact featurizer and a dihedral featurizer are described separately
    and through a combined ``FeatureSelector``. The combined description must
    have as many rows as both parts together, and 40 randomly drawn rows must
    match the per-featurizer descriptions concatenated in ``feat_list`` order.
    """
    # NOTE(review): ``trajectories`` is not defined in this function; it is
    # presumably a module-level fixture -- confirm against the full file.
    traj = trajectories[np.random.randint(len(trajectories))]

    f_ca = ContactFeaturizer(scheme='CA', ignore_nonprotein=True)
    f_ca.transform([traj])  # run only to exercise transform; result unused
    ca_description = pd.DataFrame(f_ca.describe_features(traj))

    f_dih = DihedralFeaturizer()
    f_dih.transform([traj])  # run only to exercise transform; result unused
    dih_description = pd.DataFrame(f_dih.describe_features(traj))

    per_featurizer = {"ca": ca_description, "dih": dih_description}

    f_comb = FeatureSelector([('ca', f_ca), ('dih', f_dih)])
    f_comb.transform([traj])  # run only to exercise transform; result unused
    combined = pd.DataFrame(f_comb.describe_features(traj))

    assert len(combined) == len(ca_description) + len(dih_description)

    expected = pd.concat([per_featurizer[name] for name in f_comb.feat_list])

    # Spot-check 40 randomly chosen rows (drawn with replacement), comparing
    # every column of each.
    for row in np.random.choice(len(combined), 40):
        for column in combined.columns:
            assert eq(combined.iloc[row][column], expected.iloc[row][column])
def Get_combined_features_villin():
    """Featurize the villin trajectories with dihedral and contact features.

    Changes the working directory to the villin trajectory folder, removes
    any existing ``combined`` output directory, loads every ``*.xtc`` file
    with a stride of 5 and writes the combined features to ``combined/`` in
    ``dir-npy`` format.

    Returns
    -------
    The object returned by ``fit_transform_with`` for the combined features.
    """
    import os
    import shutil

    from msmbuilder.dataset import dataset
    from msmbuilder.feature_selection import FeatureSelector
    from msmbuilder.featurizer import ContactFeaturizer
    from msmbuilder.featurizer import DihedralFeaturizer

    named_featurizers = [
        ("di_villin", DihedralFeaturizer()),
        ("con_villin", ContactFeaturizer()),
    ]

    os.chdir('/homes/anuginueni/traj_villin')

    # Start from a clean output directory so stale results are not reused.
    if os.path.isdir('/homes/anuginueni/traj_villin/combined'):
        shutil.rmtree('/homes/anuginueni/traj_villin/combined')

    trajectories = dataset(
        "/homes/anuginueni/traj_villin/*.xtc",
        topology='/homes/anuginueni/traj_villin/filtered.pdb',
        stride=5,
    )

    selector = FeatureSelector(named_featurizers)
    return trajectories.fit_transform_with(
        selector,
        '/homes/anuginueni/traj_villin/combined/',
        fmt='dir-npy',
    )
#
# TIMESCALES
#
# The data will be loaded with a stride of 10 frames. Each frame is 50 ps, so
# the time per loaded frame will be 500 ps/frame, i.e. 0.5 ns/frame.
# Each trajectory is 1000 frames long.
# Lag time will be 40 frames (20 ns) based on a visual inspection of
# /Misc/MSM_lag_time.ipynb
# NOTE(review): int(40 / to_ns) evaluates to 80, not the 40 frames stated
# above -- confirm whether the comment or the expression is the intended one.
to_ns = 0.5
msm_lag = int(40 / to_ns)

#
# FEATURE INDICES
#
all_idx = np.load('indices_all.npy')

#
# OTHER PARAMETERS
#
ref_traj = md.load(
    '../Data/data/trajectory-1.xtc',
    top='../Data/data/fs-peptide.pdb',
)

featurizer = FeatureSelector(features=feats)

# Model: featurize -> variance cut -> robust scaling -> clustering -> MSM.
pipe = Pipeline([
    ('features', featurizer),
    ('variance_cut', VarianceThreshold()),
    ('scaling', RobustScaler()),
    ('cluster', MiniBatchKMeans()),
    ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False)),
])

save_generic(pipe, 'model.pickl')
#
# TIMESCALES
#
# The data will be loaded with a stride of 10 frames. Each frame is 50 ps, so
# the time per loaded frame will be 500 ps/frame, i.e. 0.5 ns/frame.
# Each trajectory is 1000 frames long.
# Lag time will be 40 frames (20 ns) based on a visual inspection of
# /Misc/MSM_lag_time.ipynb
# NOTE(review): int(40 / to_ns) evaluates to 80, not the 40 frames stated
# above -- confirm whether the comment or the expression is the intended one.
features = tica_unstructured_features
to_ns = 0.5
msm_lag = int(40 / to_ns)

#
# MODEL
#
# ``features`` is the same object as ``tica_unstructured_features`` (bound
# above), so it is used directly for the feature-selection step.
pipe = Pipeline([
    ('features', FeatureSelector(features=features)),
    ('variance_cut', VarianceThreshold()),
    ('scaling', RobustScaler()),
    ('tica', tICA(kinetic_mapping=True)),
    ('cluster', MiniBatchKMeans()),
    ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False,
                             n_timescales=2)),
])

#
# SAVE MODEL
#
savedir = 'rand-tica-all'
save_generic(pipe, '{}/model.pickl'.format(savedir))
print_feature_names(features, join(savedir, 'feature_list.txt'))
def test_featureselector_transform():
    """``transform`` must return one result per input trajectory."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    psi_only = FeatureSelector(FEATS, which_feat='psi')

    transformed = psi_only.transform(trajectories)

    assert len(transformed) == len(trajectories)
def test_featureselector_transform():
    """Check that ``transform`` yields as many outputs as trajectories given."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    selector = FeatureSelector(FEATS, which_feat="psi")

    outputs = selector.transform(trajectories)

    # Only the count is checked here, not the content of each output.
    assert len(outputs) == len(trajectories)
def test_featureselector_order():
    """``which_feat`` must follow the order in which featurizers are supplied."""
    forward = FeatureSelector(FEATS)
    backward = FeatureSelector(FEATS[::-1])

    # With no explicit selection, the names mirror the input order.
    assert forward.which_feat == ['phi', 'psi']
    assert backward.which_feat == ['psi', 'phi']