Example #1
def fit_protein_kmeans(yaml_file, mini=True, pca=False):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Keep only the parameters namespaced for the clustering step
    # (e.g. "cluster__n_clusters" becomes "n_clusters").
    current_mdl_params = {}
    for key, value in mdl_params.items():
        if key.startswith("cluster__"):
            current_mdl_params[key.split("cluster__")[1]] = value

    if mini:
        current_mdl_params["batch_size"] = 100 * current_mdl_params["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**current_mdl_params)
    else:
        kmeans_mdl = KMeans(**current_mdl_params)

    # Gather every protein's dimensionality-reduced trajectories,
    # in a stable natural-sort order.
    data = []
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            if pca:
                tica_data = verboseload("pca_data.pkl")
            else:
                tica_data = verboseload("tica_data.pkl")
            sorted_list = sorted(tica_data.keys(), key=keynat)
            data.extend([tica_data[i] for i in sorted_list])

    kmeans_mdl.fit(data)
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    verbosedump(kmeans_mdl, kmeans_mdl_path)
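A minimal driver sketch for the function above. The dict layout is inferred from the keys the function reads, and the helpers (enter_protein_mdl_dir, keynat, verboseload, verbosedump) are assumed to come from the surrounding project; all names and paths here are illustrative.

# hypothetical configuration mirroring the keys fit_protein_kmeans uses
yaml_file = {
    "mdl_dir": "./mdl_dir",
    "protein_list": ["protein_a", "protein_b"],
    "mdl_params": {
        "cluster__n_clusters": 100,   # "cluster__" prefix is stripped
        "cluster__random_state": 42,
        "tica__lag_time": 10,         # ignored here: no "cluster__" prefix
    },
}
fit_protein_kmeans(yaml_file, mini=True, pca=False)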
Example #2
class TestPlotUtils:

    def setUp(self):
        numpy.random.seed(12)
        self.ttrajs = {
            0 : numpy.random.rand(20, 3),
            1 : numpy.random.rand(20, 3),
        }
        self.clusterer = MiniBatchKMeans(n_clusters=2)
        self.clusterer.fit(list(self.ttrajs.values()))

    def test_plot_spawns(self):
        # `spawns` is a module-level list of (state, frame) pairs
        # defined elsewhere in the test module (see Example #13).
        ax = plot_spawns(
            inds=spawns,
            tica_trajs=self.ttrajs,
            ax=None
        )
        assert isinstance(ax, Axes)

    def test_plot_tica_landscape(self):
        f, ax = plot_tica_landscape(self.ttrajs)
        assert isinstance(ax, Axes)

    def test_plot_clusters(self):
        ax = plot_clusters(self.clusterer)
        assert isinstance(ax, Axes)
Example #3
def cluster_minikmeans(tica_dir,
                       data_dir,
                       traj_dir,
                       n_clusters,
                       clusterer_dir=None,
                       tICs=None):
    # Guard against the default clusterer_dir=None, which would make
    # os.path.exists() raise a TypeError.
    if clusterer_dir is not None and os.path.exists(clusterer_dir):
        reduced_data = load_file(data_dir)
        clusterer = verboseload(clusterer_dir)
        clusterer.labels_ = clusterer.transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
    else:
        print("Clustering by KMeans")
        try:
            reduced_data = verboseload(data_dir)
        except Exception:
            reduced_data = load_dataset(data_dir)
        if tICs is not None:
            X = []
            for traj in reduced_data:
                X.append(traj[:, tICs])
        else:
            X = reduced_data

        clusterer = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
        clusterer.fit_transform(X)
        verbosedump(clusterer, clusterer_dir)
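A hedged usage sketch for the function above; the paths are placeholders, and tICs selects a subset of tICA columns before clustering:

# cluster on the first three tICs only (all names/paths illustrative)
cluster_minikmeans("./tica", "./tica/tica_data.pkl", None,
                   n_clusters=300,
                   clusterer_dir="./tica/clusterer_300clusters.h5",
                   tICs=[0, 1, 2])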
Example #4
def fit_protein_kmeans(yaml_file, mini=True):
    # Earlier variant of Example #1 without the PCA option.
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {}
    for key, value in mdl_params.items():
        if key.startswith("cluster__"):
            current_mdl_params[key.split("cluster__")[1]] = value

    if mini:
        current_mdl_params["batch_size"] = 100 * current_mdl_params["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**current_mdl_params)
    else:
        kmeans_mdl = KMeans(**current_mdl_params)
    data = []

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            tica_data = verboseload("tica_data.pkl")
            # get all traj
            sorted_list = sorted(tica_data.keys(), key=keynat)
            data.extend([tica_data[i] for i in sorted_list])

    kmeans_mdl.fit(data)
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    verbosedump(kmeans_mdl, kmeans_mdl_path)
    return
Example #6
    def generate_clusters(self, ticad):
        """
        Updates the cluster data. Needs to be re-done each iteration as
        cluster from previous trajectories may change as we get more data.

        Returns: clustered dataset
        """
        clustr = MiniBatchKMeans(
            n_clusters=self.config.getint("model", "num_clusters"))
        clustered = clustr.fit_transform(ticad)
        if self.save_extras:
            utils.dump(clustr, "microstater.pkl")
        return clustered
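A standalone sketch of what this method does, assuming a ConfigParser-style config with a [model] section; the data shapes are illustrative:

import configparser
import numpy as np
from msmbuilder.cluster import MiniBatchKMeans

config = configparser.ConfigParser()
config["model"] = {"num_clusters": "100"}
ticad = [np.random.rand(500, 4) for _ in range(10)]  # 10 fake tICA trajectories
clustr = MiniBatchKMeans(n_clusters=config.getint("model", "num_clusters"))
clustered = clustr.fit_transform(ticad)  # list of per-trajectory label arrays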
Example #7
def cluster_minikmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    clusterer_dir = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)
    if os.path.exists(clusterer_dir):
        print("Already clustered")
    else:
        print("Clustering by KMeans")
        try:
            reduced_data = verboseload(data_dir)
        except Exception:
            reduced_data = load_dataset(data_dir)
        trajs = np.concatenate(reduced_data)  # concatenated copy (unused below)
        clusterer = MiniBatchKMeans(n_clusters=n_clusters)
        clusterer.fit_transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
Example #8
def cluster_features(features, clusterer, n_clusters=8):
    '''
    Input
    features : list of arrays, length n_trajs, each of shape (n_samples, n_features)
    clusterer : str, one of 'KMeans', 'KCenters', 'KMedoids',
        'MiniBatchKMeans', 'MiniBatchKMedoids'

    Output
    clst : msmbuilder.cluster object, with attributes
        cluster_centers_ : (n_clusters, n_features)
        labels_          : list of arrays, each of shape (n_samples, )
    '''
    if clusterer == 'KMeans':
        from msmbuilder.cluster import KMeans
        clst = KMeans(n_clusters=n_clusters)
    elif clusterer == 'KCenters':
        from msmbuilder.cluster import KCenters
        clst = KCenters(n_clusters=n_clusters)
    elif clusterer == 'KMedoids':
        from msmbuilder.cluster import KMedoids
        clst = KMedoids(n_clusters=n_clusters)
    elif clusterer == 'MiniBatchKMeans':
        from msmbuilder.cluster import MiniBatchKMeans
        clst = MiniBatchKMeans(n_clusters=n_clusters)
    elif clusterer == 'MiniBatchKMedoids':
        from msmbuilder.cluster import MiniBatchKMedoids
        clst = MiniBatchKMedoids(n_clusters=n_clusters)
    else:
        raise ValueError("Unknown clusterer: %s" % clusterer)
    clst.fit_transform(features)
    return clst
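A quick usage sketch with synthetic data, showing the shapes this helper expects and returns:

import numpy as np

# three fake trajectories with five features each
features = [np.random.rand(100, 5) for _ in range(3)]
clst = cluster_features(features, 'MiniBatchKMeans', n_clusters=8)
print(clst.cluster_centers_.shape)               # (8, 5)
print(len(clst.labels_), clst.labels_[0].shape)  # 3 (100,)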
Example #9
def get_pipeline(parameters):
    """
    Wrapper so that new instance of a pipeline can be instantiated for every fold. 
    :return: sklean.pipeline.Pipeline object
    """
    pipe = Pipeline([('variance_cut', VarianceThreshold()),
                     ('tica', tICA(kinetic_mapping=True)),
                     ('cluster', MiniBatchKMeans()),
                     ('msm', MarkovStateModel(use_gap='timescales', lag_time=50, verbose=True))])
    pipe.set_params(**parameters)

    return pipe
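The parameters follow scikit-learn's step__param naming convention. A hedged example of calling the wrapper; the values and the ftrajs variable are illustrative:

parameters = {
    'tica__lag_time': 100,
    'tica__n_components': 5,
    'cluster__n_clusters': 300,
}
pipe = get_pipeline(parameters)
pipe.fit(ftrajs)  # ftrajs: list of (n_frames, n_features) arrays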
Example #10
    def build_model(self, user_defined_model):
        """
        Load or build a model (Pipeline from scikit-learn) to do all the transforming and fitting
        :param user_defined_model: Either a string (to load from disk) or a Pipeline object to use as model
        :return model: Return the model back
        """
        if user_defined_model is None:
            if os.path.exists(self.model_pkl_fname):
                logger.info('Loading model pkl file {}'.format(
                    self.model_pkl_fname))
                model = load_generic(self.model_pkl_fname)
            else:
                logger.info('Building default model based on dihedrals')

                # Use a lag time of 1 ns for tICA and the MSM; if the stride
                # is too big for that, fall back to 1 frame and report how
                # long that is in ns.
                if self.app.meta is not None:
                    lag_time = max(1, int(1 / self.timestep))
                    logger.info(
                        'Using a lag time of {} ns for the tICA and MSM'.
                        format(lag_time * self.timestep))
                else:
                    self.timestep = None
                    lag_time = 1
                    logger.warning(
                        'Cannot determine timestep. Defaulting to 1 frame.')
                model = Pipeline([('feat', DihedralFeaturizer()),
                                  ('scaler', RobustScaler()),
                                  ('tICA',
                                   tICA(lag_time=lag_time,
                                        commute_mapping=True,
                                        n_components=10)),
                                  ('clusterer',
                                   MiniBatchKMeans(n_clusters=200)),
                                  ('msm',
                                   MarkovStateModel(lag_time=lag_time,
                                                    ergodic_cutoff='off',
                                                    reversible_type=None))])
        else:
            if not isinstance(user_defined_model, Pipeline):
                raise ValueError(
                    'model is not an sklearn.pipeline.Pipeline object')
            else:
                logger.info('Using user defined model')
                model = user_defined_model
        return model
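A short usage sketch for this method; app_obj stands in for an instance of the owning class, and the custom pipeline is an arbitrary example:

from sklearn.pipeline import Pipeline
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.msm import MarkovStateModel

custom = Pipeline([('feat', DihedralFeaturizer()),
                   ('msm', MarkovStateModel(lag_time=1))])
model = app_obj.build_model(custom)  # uses the user-defined pipeline
default = app_obj.build_model(None)  # loads from disk or builds the default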
Example #11
        # tICA is similar to principal component analysis
        tica_model = tICA(lag_time=int(args.lag),
                          n_components=int(args.components))
        # fit and transform can be done in separate steps:
        tica_model = scaled_diheds.fit_with(tica_model)
        tica_trajs = scaled_diheds.transform_with(tica_model,
                                                  'ticas/',
                                                  fmt='dir-npy')

        # Conformations need to be clustered into states (sometimes called microstates).
        # We cluster on the tICA projections to group conformations that interconvert
        # rapidly. Note that this maps each trajectory from the n_components-dimensional
        # tICA space onto a 1-dimensional cluster index.
        txx = np.concatenate(tica_trajs)
        #_ = msme.plot_histogram(txx)
        clusterer = MiniBatchKMeans(n_clusters=int(args.clusters),
                                    random_state=42)
        clustered_trajs = tica_trajs.fit_transform_with(clusterer,
                                                        'kmeans/',
                                                        fmt='dir-npy')
        #plt.figure()
        #plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap='viridis')
        #plt.scatter(clusterer.cluster_centers_[:,0], clusterer.cluster_centers_[:,1], s=100, c='w')
        #plt.savefig('microstate_clusters.png')

        # We can construct an MSM from the labeled trajectories
        msm = MarkovStateModel(lag_time=int(args.lag), n_timescales=20)
        msm.fit(clustered_trajs)
        assignments = clusterer.partial_transform(txx)
        assignments = msm.partial_transform(assignments)
        #msme.plot_free_energy(txx, obs=(0, 1), n_samples=10000,
        #                  pi=msm.populations_[assignments],
Example #12
def calculate_fitness(population_dihedral, diheds, score_global, i, lock):
    import pandas as pd
    import numpy as np
    pop_index = i
    new_diheds = []

    for i in range(0, len(diheds)):
        X = diheds[i]
        selected_features = X[:, population_dihedral]
        new_diheds.append(selected_features)
    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(new_diheds)
    # NOTE: the next line overwrites the scaled features, so the RobustScaler
    # step is effectively a no-op; delete it to actually use scaled data.
    scaled_diheds = new_diheds
    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=5)
    tica_model.fit(scaled_diheds)
    tica_trajs = tica_model.transform(scaled_diheds)
    from msmbuilder.cluster import MiniBatchKMeans
    clusterer = MiniBatchKMeans(n_clusters=200, random_state=42)

    clustered_trajs = clusterer.fit_transform(tica_trajs)
    from msmbuilder.msm import MarkovStateModel
    msm = MarkovStateModel(lag_time=50, n_timescales=5)
    #msm.fit_transform(clustered_trajs)
    from sklearn.cross_validation import KFold  # sklearn < 0.20 API
    n_states = [4]
    cv = KFold(len(clustered_trajs), n_folds=5)
    results = []
    for n in n_states:
        msm.n_states_ = n  # NOTE: sets a fitted attribute only; it does not re-parameterize the MSM
        for fold, (train_index, test_index) in enumerate(cv):
            train_data = [clustered_trajs[i] for i in train_index]
            test_data = [clustered_trajs[i] for i in test_index]
            msm.fit(train_data)
            train_score = msm.score(train_data)
            test_score = msm.score(test_data)
            time_score = msm.timescales_[0]
            time_test_score = time_score + test_score
            print(time_score)
            print(test_score)
            av_score = time_test_score / 2
            results.append({
                'train_score': train_score,
                'test_score': test_score,
                'time_score': time_score,
                'av_score': av_score,
                'n_states': n,
                'fold': fold
            })
            print(msm.timescales_)
    results = pd.DataFrame(results)
    avgs = (results.groupby('n_states').aggregate(np.median).drop('fold',
                                                                  axis=1))
    best_nt = avgs['test_score'].idxmax()
    best_n = avgs['av_score'].idxmax()
    best_score = avgs.loc[best_n, 'av_score']
    best_scorent = avgs.loc[best_nt, 'test_score']
    print(best_scorent)
    lock.acquire()
    score_global.update({pop_index: best_scorent})
    lock.release()
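The sklearn.cross_validation module was removed in scikit-learn 0.20; a sketch of the equivalent split with the modern API (not part of the original script):

from sklearn.model_selection import KFold

cv = KFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(cv.split(clustered_trajs)):
    train_data = [clustered_trajs[i] for i in train_index]
    test_data = [clustered_trajs[i] for i in test_index]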
Example #13
import logging
import os

from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.decomposition import tICA
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.io import NumberedRunsParser, gather_metadata
from msmbuilder.msm import MarkovStateModel
from msmbuilder.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# App is assumed to live next to create_folder in the package under test
from ..adaptive import App, create_folder

logging.disable(logging.CRITICAL)

parser = NumberedRunsParser(traj_fmt='run-{run}.nc',
                            top_fn='data_app/runs/structure.prmtop',
                            step_ps=200)
meta = gather_metadata('/'.join(['data_app/runs/', '*nc']), parser)

model = Pipeline([('feat', DihedralFeaturizer()), ('scaler', MinMaxScaler()),
                  ('tICA', tICA(lag_time=1, n_components=4)),
                  ('clusterer', MiniBatchKMeans(n_clusters=5)),
                  ('msm', MarkovStateModel(lag_time=1, n_timescales=4))])

spawns = [
    (0, 1),
]
epoch = 1


class TestAppBase:
    def __init__(self):
        self.app = App(generator_folder='data_app/generators',
                       data_folder='data_app/runs',
                       input_folder='data_app/inputs',
                       filtered_folder='data_app/filtered_trajs',
                       model_folder='data_app/model',
Example #14
#
# TIMESCALES
#
# The data will be loaded with a stride of 10 frames. Each frame is 50 ps, so
# each loaded frame represents 500 ps, i.e. 0.5 ns/frame.
# Each trajectory is 1000 frames long.
# A 40 ns MSM lag time was chosen from visual inspection of /Misc/MSM_lag_time.ipynb;
# the line below converts it to frames: 40 ns / 0.5 ns per frame = 80 frames.
to_ns = 0.5
msm_lag = int(40 / to_ns)

#
# FEATURE INDICES
#
all_idx = np.load('indices_all.npy')

#
# OTHER PARAMETERS
#
ref_traj = md.load('../Data/data/trajectory-1.xtc',
                   top='../Data/data/fs-peptide.pdb')

# `feats` (a list of (name, featurizer) pairs) is defined elsewhere in the original script
featurizer = FeatureSelector(features=feats)

pipe = Pipeline([('features', featurizer),
                 ('variance_cut', VarianceThreshold()),
                 ('scaling', RobustScaler()), ('cluster', MiniBatchKMeans()),
                 ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False))])

save_generic(pipe, 'model.pickl')
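The pickled pipeline can later be restored with the matching loader from msmbuilder.io (a one-line sketch):

from msmbuilder.io import load_generic
pipe = load_generic('model.pickl')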
Example #15
import pandas as pd
import pickle
import glob
import numpy as np
from msmbuilder.featurizer import ContactFeaturizer
from msmbuilder.dataset import dataset
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import verbosedump, verboseload

file = 'dataset_nark.best_nonredu.pkl'
alpha = pickle.load(open(file, 'rb'))  # binary mode for pickle files
print('#_trajs: ' + str(np.shape(alpha)[0]) + '\n'
      '#_CA_contacts: ' + str(np.shape(alpha[0])[1]))
tica_model = tICA(n_components=10, lag_time=1)
tica_trajs = tica_model.fit_transform(alpha)
clusterer = MiniBatchKMeans(n_clusters=400)
clustered_trajs = clusterer.fit_transform(tica_trajs)
msm = MarkovStateModel(lag_time=150, n_timescales=5)
assignments = msm.fit_transform(clustered_trajs)
data = np.concatenate(tica_trajs, axis=0)
pi_0 = msm.populations_[np.concatenate(assignments, axis=0)]


name = file[:-4]
verbosedump(tica_model, name + "-GA-tica_model.pkl")
verbosedump(tica_trajs, name + "-GA-tica_trajs.pkl")
verbosedump(clusterer, name + "-GA-mbkm_mdl.pkl")
verbosedump(clustered_trajs, name + "-GA-clustered_trajs.pkl")
verbosedump(msm, name + "-GA-msm.pkl")
verbosedump(assignments, name + "-GA-assignments.pkl")
verbosedump(data, name + "-GA-weighted-msme-tica-data.pkl")
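Any of the dumped artifacts can be pulled back for later analysis with the matching loader (a one-line sketch):

msm = verboseload(name + "-GA-msm.pkl")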
Example #16
# Imports assumed for this snippet; StandardScaler comes from
# msmbuilder.preprocessing so it accepts lists of per-trajectory arrays.
import numpy as np
import scipy.stats
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import Pipeline
from msmbuilder.preprocessing import StandardScaler
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel

# Concatenate features
# ftraj = num trajectories x np.array(n_frames, n_features)
ftraj = []
for traj in all_trajs:
    tmp = []
    for feat in traj:
        if feat.ndim == 1:
            feat = feat.reshape(-1, 1)
        tmp.append(feat)
    ftraj.append(np.concatenate(tmp, axis=1))

# Make Pipeline
cv_iter = ShuffleSplit(n_splits=5, test_size=0.5)

estimators = [('scale', StandardScaler()), ('tica', tICA()),
              ('cluster', MiniBatchKMeans(random_state=0)),
              ('msm', MarkovStateModel())]

param_grid = {
    'cluster__n_clusters': list(np.linspace(200, 500, num=2).astype(int)),
    'tica__n_components': list(np.linspace(10, 30, num=2).astype(int)),
    'tica__lag_time': list(np.linspace(200, 500, num=2).astype(int))
}

params = {
    # randint's high bound is exclusive, so (200, 201) pins n_clusters at 200;
    # the original (200, 200) has an empty range and cannot be sampled
    'cluster__n_clusters': scipy.stats.randint(low=200, high=201),
    'tica__n_components': scipy.stats.randint(low=2, high=40),
    'tica__lag_time': scipy.stats.randint(low=100, high=999)
}

pipe = Pipeline(estimators)
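A sketch of wiring these grids into scikit-learn's searchers; scoring relies on MarkovStateModel.score via the pipeline, and n_iter is an arbitrary choice:

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=cv_iter)
rand_search = RandomizedSearchCV(pipe, param_distributions=params,
                                 n_iter=10, cv=cv_iter)
rand_search.fit(ftraj)  # ftraj: the list of per-trajectory feature arrays built above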
Example #17
import numpy as np
import msmexplorer as msme
from msmbuilder.example_datasets import FsPeptide
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel

rs = np.random.RandomState(42)

# Load Fs Peptide Data
trajs = FsPeptide().get().trajectories

# Extract Backbone Dihedrals
featurizer = DihedralFeaturizer(types=['chi1'])
diheds = featurizer.fit_transform(trajs)

# Perform Dimensionality Reduction
tica_model = tICA(lag_time=2, n_components=2)
tica_trajs = tica_model.fit_transform(diheds)

# Perform Clustering
clusterer = MiniBatchKMeans(n_clusters=12, random_state=rs)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Construct MSM
msm = MarkovStateModel(lag_time=2)
assignments = msm.fit_transform(clustered_trajs)

# Plot Stacked Distributions
a = np.concatenate(assignments, axis=0)
d = np.concatenate(diheds, axis=0)

# Plot Stacked Distributions of the sine of each Chi1 angle
# within an arbitrary set of states {2, 5, 0}
path_data = [d[a == i][:, ::2] for i in [2, 5, 0]]
msme.plot_stackdist(path_data)
Example #18
import numpy as np
import matplotlib
import matplotlib.patches as mpatches
from seaborn.distributions import (_scipy_univariate_kde, _scipy_bivariate_kde)
from msmbuilder.dataset import dataset
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans

### Read trajectory files and preprocess the data
ds = dataset("*.nc", topology="s.pdb")
feat = DihedralFeaturizer(types=['phi', 'psi'])
ds_alpha = ds.fit_transform_with(feat, "dihed/", fmt='dir-npy')
ds_alpha = dataset("./dihed/")
print(len(ds_alpha), len(ds))
print(ds[0].xyz.shape)
ds_alpha = dataset("./dihed/")
tica_mdl = tICA(lag_time=10, n_components=2)
tica_features = ds_alpha.fit_transform_with(tica_mdl, out_ds='tica')
tica_features = dataset("./tica/")
kmeans_mdl = MiniBatchKMeans(10)
assignments = tica_features.fit_transform_with(kmeans_mdl, out_ds='assignments/')
assignments = dataset("assignments/")
import msmexplorer as msme
from matplotlib import pyplot as plt
tica_trajs = dataset("./tica/")

### Preprocessing the tICA data
j = 0
tica = []
for k in range(6):
    f1 = []
    for i in range(len(tica_trajs)):
        f = list(tica_trajs[i][j:j + 800])
        f1.append(f)
    f1 = np.array(f1)
Example #19
# (xyz is an msmbuilder dataset of loaded trajectories; num_clusters is defined upstream)
featurizer = DihedralFeaturizer(types=['phi', 'psi'])
diheds = xyz.fit_transform_with(featurizer, 'diheds/', fmt='dir-npy')

#tICA
from msmbuilder.decomposition import tICA
tica_model = tICA(lag_time=2, n_components=4)
# fit and transform can be done in separate steps:
tica_model = diheds.fit_with(tica_model)
tica_trajs = diheds.transform_with(tica_model, 'ticas/', fmt='dir-npy')

txx = np.concatenate(tica_trajs)

# clustering: can change hyperparameters
from msmbuilder.cluster import MiniBatchKMeans
#clusterer = MiniBatchKMeans(n_clusters=num_clusters)
clusterer = MiniBatchKMeans(n_clusters=num_clusters, max_no_improvement=1000, batch_size=num_clusters*10)
clustered_trajs = tica_trajs.fit_transform_with(
    clusterer, 'kmeans/', fmt='dir-npy'
)

# msm builder
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump
msm = MarkovStateModel(lag_time=2, n_timescales=20, ergodic_cutoff='off')
msm.fit(clustered_trajs)

# save tIC plot
import matplotlib
matplotlib.use('Agg')  # Must be set before importing matplotlib.pyplot
import matplotlib.pyplot as plt
plt.hexbin(txx[:, 0], txx[:, 1], bins='log', mincnt=1, cmap="bone_r")
plt.savefig('tica_plot.png')  # hypothetical output path; the snippet stops before saving
Example #20
import os
from glob import glob
import numpy as np
from multiprocessing import Pool
import pandas as pd
from msmbuilder.featurizer import DihedralFeaturizer, KappaAngleFeaturizer
from sklearn.model_selection import cross_val_score, cross_val_predict

# Globals
num_procs = 5
traj_dir = '/mnt/storage/home/ra15808/scratch/train'
# traj_dir = '/Users/robert_arbon/Datasets/DHFR/train'

pipe_fixed = Pipeline([('variance_cut', VarianceThreshold()),
                       ('tica', tICA(kinetic_mapping=True)),
                       ('cluster', MiniBatchKMeans()),
                       ('msm',
                        MarkovStateModel(n_timescales=2,
                                         lag_time=50,
                                         verbose=True))])

pipe_csp = Pipeline([('variance_cut', VarianceThreshold()),
                     ('tica', tICA(kinetic_mapping=True)),
                     ('cluster', MiniBatchKMeans()),
                     ('msm',
                      MarkovStateModel(use_gap='timescales',
                                       lag_time=50,
                                       verbose=True))])

best = pd.read_pickle('best_trials.pickl')
Example #21
"""Cluster tICA results

{{header}}

Meta
----
depends:
 - ttrajs
 - meta.pandas.pickl
"""
from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.cluster import MiniBatchKMeans

## Load
meta, ttrajs = load_trajs('ttrajs')

## Fit
dim = 5
kmeans = MiniBatchKMeans(n_clusters=500)
kmeans.fit([traj[:, :dim] for traj in ttrajs.values()])

## Transform
ktrajs = {}
for k, v in ttrajs.items():
    ktrajs[k] = kmeans.partial_transform(v[:, :dim])

## Save
print(kmeans.summarize())
save_trajs(ktrajs, 'ktrajs', meta)
save_generic(kmeans, 'kmeans.pickl')
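Downstream steps can restore these artifacts with the matching msmbuilder.io loaders (a short sketch):

from msmbuilder.io import load_trajs, load_generic
meta, ktrajs = load_trajs('ktrajs')
kmeans = load_generic('kmeans.pickl')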
Example #22
tica_model = tICA(lag_time=40, n_components=20)

# fit and transform can be done in separate steps:
tica_model = diheds.fit_with(tica_model)
tica_trajs = diheds.transform_with(tica_model, 'ticas/', fmt='dir-npy')

txx = np.concatenate(tica_trajs)

# save tICA
np.savetxt(folder + 'tICA_coord_+' + which_dataset + '.csv',
           txx,
           delimiter=',')

# clustering
from msmbuilder.cluster import MiniBatchKMeans
clusterer = MiniBatchKMeans(n_clusters=num_clusters)  # 100 for calmodulin
clustered_trajs = tica_trajs.fit_transform_with(clusterer,
                                                'kmeans/',
                                                fmt='dir-npy')

# msm builder
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump

if which_dataset == 'fspeptide':
    msm = MarkovStateModel(lag_time=2, n_timescales=20, ergodic_cutoff='on')
if which_dataset == 'apo_calmodulin':
    msm = MarkovStateModel(lag_time=20, n_timescales=20, ergodic_cutoff='on')

msm.fit(clustered_trajs)
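After fitting, the implied timescales can be inspected and the model saved with the dump helper imported above (a short sketch; the filename is illustrative):

print(msm.timescales_)  # implied timescales, same time units as lag_time
dump(msm, 'msm_%s.pkl' % which_dataset)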
Example #23
from msmbuilder.featurizer import DihedralFeaturizer
featurizer = DihedralFeaturizer(types=['phi', 'psi'])
diheds = xyz.fit_transform_with(featurizer, 'diheds/', fmt='dir-npy')

#tICA
from msmbuilder.decomposition import tICA
tica_model = tICA(lag_time=2, n_components=4)
# fit and transform can be done in separate steps:
tica_model = diheds.fit_with(tica_model)
tica_trajs = diheds.transform_with(tica_model, 'ticas/', fmt='dir-npy')

txx = np.concatenate(tica_trajs)

# clustering
from msmbuilder.cluster import MiniBatchKMeans
clusterer = MiniBatchKMeans(n_clusters=num_clusters)
clustered_trajs = tica_trajs.fit_transform_with(
    clusterer, 'kmeans/', fmt='dir-npy'
)

# msm builder
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump
msm = MarkovStateModel(lag_time=20, n_timescales=20, ergodic_cutoff='on')
msm.fit(clustered_trajs)

# Get MFPT
from msmbuilder.tpt import mfpts 
mfpt_matrix = mfpts(msm)

# Get flux matrix