Beispiel #1
0
    def test_provided_means(self):
        """TICA with a user-supplied mean must match TICA estimating the mean."""
        samples = np.random.random((300, 3))
        with_mean = tica(samples, mean=samples.mean(axis=0))
        without_mean = tica(samples)

        # Every estimated statistic has to agree between the two estimators.
        for attr in ('mean', 'cov', 'cov_tau'):
            np.testing.assert_allclose(getattr(with_mean, attr),
                                       getattr(without_mean, attr))
Beispiel #2
0
def project_and_cluster(trajfiles,
                        featurizer,
                        sparsify=False,
                        tica=True,
                        lag=100000,
                        scale=True,
                        var_cutoff=1.0,
                        ncluster=100,
                        cluster=True):
    """
    Load featurized trajectories, project them with TICA or PCA and
    optionally cluster the projection with k-means.

    Parameters
    ----------
    trajfiles
        Trajectory file(s), forwarded to ``coor.load``.
    featurizer
        Featurizer, forwarded to ``coor.load``.
    sparsify : bool
        If true, the loaded data is passed through ``remove_constant``.
    tica : bool
        Project with ``coor.tica`` if true, otherwise with ``coor.pca``.
    lag : int
        Lag time handed to ``coor.tica`` (unused for PCA).
    scale : bool
        If true, every projected trajectory is multiplied in place by the
        first ``trans_obj.dimension()`` eigenvalues.
    var_cutoff : float
        Variance cutoff handed to the projection.
    ncluster : int
        Number of k-means centers.
    cluster : bool
        If true (default), cluster the projected data.  This used to be
        read from an undefined name, which raised ``NameError``.

    Returns
    -------
    trans_obj, Y, clustering
        ``clustering`` is left out of the tuple when ``cluster`` is false.

    """
    X = coor.load(trajfiles, featurizer)
    if sparsify:
        X = remove_constant(X)
    if tica:
        trans_obj = coor.tica(X, lag=lag, var_cutoff=var_cutoff)
    else:
        trans_obj = coor.pca(X, dim=-1, var_cutoff=var_cutoff)
    Y = trans_obj.get_output()
    if scale:
        # The scaling factors do not depend on the trajectory.
        factors = trans_obj.eigenvalues[:trans_obj.dimension()]
        for y in Y:
            y *= factors
    if cluster:
        cl_obj = coor.cluster_kmeans(Y,
                                     k=ncluster,
                                     max_iter=3,
                                     fixed_seed=True)
        return trans_obj, Y, cl_obj
    return trans_obj, Y
    def test_write_to_csv_propagate_filenames(self):
        """Written files reuse the input file names and hold the TICA output."""
        from pyemma.coordinates import source, tica
        with TemporaryDirectory() as tmpdir:
            arrays = [np.random.random((20, 3))] * 3
            names = ('blah.npy', 'blub.npy', 'foo.npy')
            fns = [os.path.join(tmpdir, name) for name in names]
            for arr, fn in zip(arrays, fns):
                np.save(fn, arr)
            reader = source(fns)
            assert reader.filenames == fns
            tica_obj = tica(reader, lag=1, dim=2)
            tica_obj.write_to_csv(extension=".exotic", chunksize=3)

            # One output file per input file, same stem, new extension.
            written = sorted(
                os.path.abspath(p)
                for p in glob(tmpdir + os.path.sep + '*.exotic'))
            self.assertEqual(len(written), len(fns))
            exotic_fns = [fn.replace('.npy', '.exotic') for fn in fns]
            self.assertEqual(written, sorted(exotic_fns))

            # Reading the files back must reproduce the in-memory output.
            expected = tica_obj.get_output()
            actual = source(exotic_fns).get_output()
            assert len(actual) == len(fns)
            for got, want in zip(actual, expected):
                np.testing.assert_allclose(got, want)
Beispiel #4
0
    def __init__(self, data, lag, units='frames'):
        """Estimate TICA from ``data`` at lag time ``lag``.

        If ``data`` is a ``Metric``, projections are produced by
        ``_projectionGenerator`` and fed to a ``TICA`` estimator through
        ``partial_fit``; only ``units='frames'`` is accepted in that case.
        Otherwise ``lag`` is converted to frames with ``data.fstep`` and TICA
        is estimated on ``data.dat.tolist()``.

        Raises
        ------
        RuntimeError
            For a ``Metric`` with units other than frames, or when the
            converted lag is 0 frames.
        """
        from pyemma.coordinates import tica
        # data.dat.tolist() might be better?
        self.data = data

        if not isinstance(data, Metric):
            lag = unitconvert(units, 'frames', lag, data.fstep)
            if lag == 0:
                raise RuntimeError(
                    'Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.'
                )
            self.tic = tica(data.dat.tolist(), lag=lag)
            return

        if units != 'frames':
            raise RuntimeError(
                'Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.'
            )
        from pyemma.coordinates.transform.tica import TICA
        self.tic = TICA(lag)

        bar = ProgressBar(len(data.simulations))
        for batch in _projectionGenerator(data, _getNcpus()):
            for item in batch:
                # Only the first element of each generated item is fitted.
                self.tic.partial_fit(item[0])
            bar.progress(len(batch))
        bar.stop()
Beispiel #5
0
    def test_length_and_content_feature_reader_and_TICA(self):
        """Strided output of reader and TICA has the expected length/content."""
        for stride in range(1, 100, 23):
            reader = coor.source(self.trajnames, top=self.temppdb)
            transformer = coor.tica(data=reader, lag=2, dim=2)
            transformer.parametrize()

            # Sub-sampled output of both pipeline stages.
            tica_out = transformer.get_output(stride=stride)
            reader_out = reader.get_output(stride=stride)

            # Frame counts obtained in four different ways.
            n_tica = [chunk.shape[0] for chunk in tica_out]
            n_reader = [chunk.shape[0] for chunk in reader_out]
            n_reported = transformer.trajectory_lengths(stride=stride)
            n_expected = [(ref.shape[0]-1)//stride+1 for ref in self.data]

            np.testing.assert_equal(n_reported, n_expected)
            self.assertTrue(n_expected == n_tica)
            self.assertTrue(n_expected == n_reader)

            # The reader output must equal the flattened reference data.
            for ref, got in zip(self.data, reader_out):
                n_frames, n_inner = ref.shape[0], ref.shape[1]
                flat = ref.reshape((n_frames, n_inner*3))
                self.assertTrue(np.allclose(flat[::stride, :], got, atol=1E-3))
Beispiel #6
0
def make_TICA_decomposition(ticaObject,
                            folders,
                            folderPath,
                            lag,
                            overWriteObject=False,
                            kinetic_map=True,
                            commute_map=False):
    """Return a TICA estimator, loading it from ``ticaObject`` if possible.

    When the pickle file ``ticaObject`` exists and ``overWriteObject`` is
    false, the pickled estimator is returned.  Otherwise every
    ``<folderPath>/<epoch>/repeatedExtractedCoordinates/coord*`` file is read
    with ``np.loadtxt`` (per epoch, ordered by the integer between the last
    ``_`` and the 4-character suffix of the path), TICA is estimated on these
    trajectories, pickled to ``ticaObject`` and returned.
    """
    if not overWriteObject and os.path.exists(ticaObject):
        # NOTE: unpickling runs arbitrary code; only load trusted files.
        with open(ticaObject, "rb") as handle:
            return pickle.load(handle)

    def _file_index(path):
        return int(path[path.rfind("_") + 1:-4])

    trajs = []
    for epoch in folders:
        pattern = os.path.join(
            folderPath, "%s/repeatedExtractedCoordinates/coord*" % epoch)
        ordered = sorted(glob.glob(pattern), key=_file_index)
        trajs.extend(np.loadtxt(name) for name in ordered)

    estimator = coor.tica(data=trajs,
                          lag=lag,
                          kinetic_map=kinetic_map,
                          commute_map=commute_map)
    with open(ticaObject, "wb") as handle:
        pickle.dump(estimator, handle)
    return estimator
Beispiel #7
0
 def test_MD_data(self):
     """Smoke test: TICA on the small BPTI trajectory must not raise."""
     data_dir = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
     self.pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
     self.xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')
     reader = source(self.xtc_file, top=self.pdb_file)
     # Too little data for reasonable results; only exceptions matter here.
     tica(reader, lag=1)
Beispiel #8
0
 def setUpClass(cls):
     """Build data that is constant except for 10 random columns, run TICA."""
     n_frames, n_features, n_variable = 10000, 100, 10
     cls.data = np.ones((n_frames, n_features))
     cls.variable_columns = np.random.choice(n_features, n_variable,
                                             replace=False)
     cls.data[:, cls.variable_columns] = np.random.rand(n_frames, n_variable)
     # Start with one of the constant columns:
     constant_columns = np.setdiff1d(np.arange(cls.data.shape[1]),
                                     cls.variable_columns)
     cls.initial_columns = constant_columns[0:1]
     cls.tica_obj = tica(data=cls.data, lag=1)
Beispiel #9
0
 def test_MD_data(self):
     """Smoke test: TICA on the small BPTI trajectory must not raise."""
     here = os.path.split(__file__)[0]
     data_dir = os.path.join(here, 'data')
     self.pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
     self.xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')
     reader = source(self.xtc_file, top=self.pdb_file)
     # Too little data for reasonable results; only exceptions matter here.
     tica(reader, lag=1)
Beispiel #10
0
 def test_parametrize_with_stride(self):
     """Eigenvalues stay <= 1 for every stride that TICA accepts."""
     tau = 5
     upper_bound = 1.0 + 1.E-12
     for stride in range(1, 100, 23):
         reader = coor.source(self.trajnames, top=self.temppdb)
         try:
             estimator = coor.tica(reader, lag=tau, stride=stride, dim=2)
             self.assertTrue(np.all(estimator.eigenvalues <= upper_bound))
         except RuntimeError:
             # A RuntimeError is only tolerated when stride does not divide tau.
             assert tau % stride != 0
Beispiel #11
0
    def test_notify_changes_mixin(self):
        """Setting ``data_producer`` registers the consumer as stream child."""
        frames = np.random.random((30, 30))
        reader = coor.source(np.array(frames))

        parent = coor.tica(reader)
        from pyemma.coordinates.transform import TICA
        child = TICA(lag=10)
        # No children before the two estimators are connected ...
        assert len(parent._stream_children) == 0
        child.data_producer = parent
        # ... and the consumer is listed afterwards.
        assert parent._stream_children[0] == child
def run_sampling(args):
    """Re-sample frames by weight, run TICA on them and save the results.

    Loads ``args.file`` with the features added by
    ``feat.add_distances_ca()`` (topology ``Native.pdb``), draws 10000 frames
    with replacement using the normalized weights read from ``args.weights``,
    estimates a 10-dimensional TICA at lag 1 and writes ``output.dat`` and
    ``eigenvalues.dat`` into the working directory.  A timing line is printed
    after every stage.

    Timing uses ``time.time()``: ``time.clock()`` was removed in Python 3.8
    and the messages report elapsed time per stage.  ``print`` is called with
    a single parenthesized argument so the function runs on Python 2 and 3.
    """
    topology = "Native.pdb"
    ticadim = 10
    num_sample_frames = 10000

    fn = args.file  # file name
    wn = args.weights  # weights name

    weights = np.loadtxt(wn)
    weights = weights / np.sum(weights)
    # first time
    time1 = time.time()
    feat = coor.featurizer(topology)
    feat.add_distances_ca()
    X1 = coor.load(fn, feat, stride=1)
    # time for loading
    time2 = time.time()
    print("Took %f minutes to load a file" % ((time2 - time1) / 60.0))

    selected_frames = np.random.choice(np.shape(X1)[0], size=num_sample_frames, replace=True, p=weights)
    time3 = time.time()
    print("Took %f minutes to select new frames" % ((time3 - time2) / 60.0))
    # One fancy-indexing copy instead of a Python loop over all frames; the
    # result is stored as float64 like the zero-initialised buffer used before.
    sampled_frames = np.array(X1[selected_frames, :], dtype=float)
    time4 = time.time()
    print("Took %f minutes to load the new frames" % ((time4 - time3) / 60.0))

    # Sanity check: an exact zero means something was not written.
    # The message is still printed once per offending entry.
    error_msg = "ERROR, distance too short, something not written"
    n_zero = int(np.count_nonzero(sampled_frames == 0))
    for _ in range(n_zero):
        print(error_msg)
    if n_zero:
        with open("log.txt", "w") as f:
            f.write(error_msg)
    time5 = time.time()
    print("Took %f minutes to go through the debug check" % ((time5 - time4) / 60.0))

    tica_obj = coor.tica(sampled_frames, stride=1, lag=1, dim=ticadim)
    time6 = time.time()
    print("Took %f minutes to calculate the tica_object" % ((time6 - time5) / 60.0))
    outputs = tica_obj.get_output()[0]
    eigen = tica_obj.eigenvalues
    time7 = time.time()
    print("Took %f minutes to get the output of the tica_object" % ((time7 - time6) / 60.0))
    print("saving files")
    np.savetxt("output.dat", outputs)
    np.savetxt("eigenvalues.dat", eigen)
    print("files saved")
    time8 = time.time()
    print("Took %f minutes to write the output files" % ((time8 - time7) / 60.0))
Beispiel #13
0
def run_analysis(args):
    """Run TICA on pairwise distances of one trajectory, save and plot it.

    The first output trajectory and the eigenvalues are written to
    ``<title>_output_raw.dat`` and ``<title>_eigenvalues_raw.dat`` and handed
    to the ``tmeth`` plotting helpers with a time scale of
    ``args.time_step * args.stride``.
    """
    feat = coor.featurizer(args.topfile)
    first, last = args.range[0], args.range[1]
    pairs = tmeth.generate_pairs(first, last, args.step_size, args.cut_value)
    feat.add_distances(pairs)

    traj = coor.load(args.traj_file, feat, stride=args.stride)
    tica_obj = coor.tica(traj, stride=1, lag=args.lag, dim=args.ticadim)
    projected = tica_obj.get_output()[0]
    eigenvalues = tica_obj.eigenvalues

    np.savetxt("%s_output_raw.dat"%args.title, projected)
    np.savetxt("%s_eigenvalues_raw.dat"%args.title, eigenvalues)

    time_scale = args.time_step*args.stride
    tmeth.plot_eigen_series(eigenvalues, args.title, time_scale=time_scale)
    tmeth.plot_output(projected, args.title, time_scale=time_scale)
Beispiel #14
0
    def test_skipped_trajs(self):
        """With lag 11, the trajectories of length 10 and 11 are skipped."""
        feature_trajs = [np.arange(length) for length in (10, 11, 12, 13)]

        tica_obj = tica(data=feature_trajs, lag=11)
        skipped = tica_obj._skipped_trajs
        assert (len(skipped) == 2)
        assert np.allclose(skipped, [0, 1])
Beispiel #15
0
    def test_skipped_trajs(self):
        """No trajectory is recorded in ``_skipped_trajs`` at lag 11."""
        feature_trajs = [np.arange(length) for length in (10, 11, 12, 13)]

        tica_obj = tica(data=feature_trajs, lag=11)
        # we skip the trajs right away in the iterator
        n_skipped = len(tica_obj._skipped_trajs)
        assert (n_skipped == 0)
Beispiel #16
0
    def setUpClass(cls):
        """Save a source -> TICA -> k-means chain on BPTI to a temp file."""
        from pyemma.datasets import get_bpti_test_data

        bpti = get_bpti_test_data()
        reader = source(bpti['trajs'], top=bpti['top'])
        projection = tica(reader, lag=1)
        clustering = cluster_kmeans(projection)

        # The path is kept on the class so the tests can load the model.
        cls.model_file = tempfile.mktemp()
        clustering.save(cls.model_file, save_streaming_chain=True)
Beispiel #17
0
    def test_too_short_traj_partial_fit(self):
        """A chunk shorter than the lag is not used and triggers one warning."""
        chunks = [np.empty((n_frames, 3)) for n_frames in (20, 10)]
        lag = 11
        estimator = tica(lag=lag)
        from pyemma.util.testing_tools import MockLoggingHandler
        handler = MockLoggingHandler()
        estimator._covar.logger.addHandler(handler)
        for chunk in chunks:
            estimator.partial_fit(chunk)

        # Only the 20-frame chunk contributes lagged pairs.
        self.assertEqual(estimator._used_data, 20 - lag)
        warnings = handler.messages['warning']
        self.assertEqual(len(warnings), 1)
        self.assertIn("longer than lag", warnings[0])
    def test_write_h5(self):
        """``write_to_hdf5`` stores the transformed output and guards overwrite.

        The first part writes the output for several chunk sizes and strides
        and compares the datasets of the group with ``get_output``.  The second
        part checks that writing into an existing group/file raises
        ``ValueError`` unless ``overwrite=True`` is passed.

        HDF5 files are opened with an explicit mode: without one, h5py >= 3
        opens read-only and cannot create the fresh file needed below.
        """
        from pyemma.coordinates import tica
        dim = 10
        data = [np.random.random((np.random.randint(50, 150), dim)) for _ in range(4)]
        # Separate name so the estimator does not shadow the imported function.
        tica_obj = tica(data, lag=1)
        import tempfile
        import h5py
        out = tempfile.mktemp()
        group = '/test'

        def remove_output():
            # Cleanup must not hide the real failure if nothing was written.
            if os.path.exists(out):
                os.remove(out)

        def perform(chunksize, stride):
            try:
                transformed_output = tica_obj.get_output(chunk=chunksize, stride=stride)
                tica_obj.write_to_hdf5(out, group=group, chunksize=chunksize, stride=stride)

                with h5py.File(out, mode='r') as f:
                    assert len(f[group]) == len(data)
                    for (itraj, actual), desired in zip(f[group].items(), transformed_output):
                        np.testing.assert_equal(actual, desired, err_msg='failed for cs=%s, stride=%s'
                                                                         %(chunksize, stride))
            finally:
                remove_output()

        for cs in [0, 1, 3, 10, 42, 50]:
            for s in [1, 2, 3, 10]:
                perform(cs, s)

        # test overwrite
        try:
            tica_obj.write_to_hdf5(out, group=group)
            with self.assertRaises(ValueError):
                tica_obj.write_to_hdf5(out, group=group)

            os.remove(out)
            tica_obj.write_to_hdf5(out)
            with self.assertRaises(ValueError) as ctx:
                tica_obj.write_to_hdf5(out)
            assert 'Refusing to overwrite data' in ctx.exception.args[0]

            os.remove(out)
            tica_obj.write_to_hdf5(out, group=group)
            tica_obj.write_to_hdf5(out, group=group, overwrite=True)

            os.remove(out)
            # The file was just removed, so create it from scratch.
            with h5py.File(out, mode='w') as f:
                f.create_group('empty').create_dataset('0000', shape=(1,1))
            with self.assertRaises(ValueError):
                tica_obj.write_to_hdf5(out, group='empty')
            tica_obj.write_to_hdf5(out, group='empty', overwrite=True)
        finally:
            remove_output()
Beispiel #19
0
 def setUpClass(cls):
     """Expand a discrete double-well trajectory into Gaussian features.

     Each of the first 10000 frames is turned into 100 values
     ``exp(-(b[state] - b[j])**2 / (2 * sigma**2))`` on a grid ``b`` over
     [-1, 1]; TICA is then estimated on these features at lag 10.
     """
     from pyemma.datasets import load_2well_discrete
     dtraj = load_2well_discrete().dtraj_T100K_dt10[:10000]
     cls.T = dtraj.size
     nstates = 100
     centers = np.linspace(-1, 1, nstates)
     sigma = 0.15
     cls.Z = np.zeros((cls.T, nstates))
     for t in range(cls.T):
         # Grid value belonging to the state visited at time t.
         visited = centers[dtraj[t]]
         for j in range(nstates):
             cls.Z[t, j] = np.exp(-(visited - centers[j])**2 / (2 * sigma**2))
     cls.lag = 10
     cls.tica_obj = tica(data=cls.Z, lag=cls.lag)
Beispiel #20
0
    def test_feature_correlation_MD(self):
        """``feature_TIC_correlation`` matches ``mycorrcoef`` on BPTI data."""
        # Same input files as test_MD_data
        data_dir = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
        self.pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
        self.xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')
        reader = source(self.xtc_file, top=self.pdb_file)
        estimator = tica(reader, lag=1, kinetic_map=False)

        features = estimator.data_producer.get_output()[0]
        projection = estimator.get_output()[0]
        computed = estimator.feature_TIC_correlation
        reference = mycorrcoef(features, projection, estimator.lag)
        np.testing.assert_allclose(computed, reference, atol=1.E-8)
    def setUp(self):
        """Create three readers: MD source, random arrays, TICA on the latter."""
        self.readers = []
        data_dir = pkg_resources.resource_filename('pyemma.coordinates.tests', 'data')

        # 1) MD trajectories matching bpti_0*.xtc with all features added
        xtc_files = glob(data_dir + "/bpti_0*.xtc")
        md_reader = source(xtc_files, top=os.path.join(data_dir, 'bpti_ca.pdb'))
        self.readers.append(md_reader)
        md_reader.featurizer.add_all()

        # 2) random arrays with the same dimension and lengths as the MD data
        ndim = md_reader.ndim
        lengths = md_reader.trajectory_lengths()
        random_reader = source([np.random.random((n, ndim)) for n in lengths])
        self.readers.append(random_reader)

        # 3) TICA on top of the random arrays
        self.readers.append(tica(random_reader, dim=20))
Beispiel #22
0
    def test_feature_correlation_data(self):
        """``feature_TIC_correlation`` matches ``mycorrcoef`` on toy data."""
        # Three features: a ramp, the ramp plus noise, and pure noise.
        n_frames = 100
        features = np.zeros((n_frames, 3))
        features[:,0] = np.linspace(-.5,.5,n_frames)
        features[:,1] = (features[:,0]+np.random.randn(n_frames)*.5)**1
        features[:,2] = np.random.randn(n_frames)

        estimator = tica(data = features, dim = 3, kinetic_map=False)
        projection = estimator.get_output()[0]

        # Estimator result versus the reference helper.
        computed = estimator.feature_TIC_correlation
        reference = mycorrcoef(features, projection, estimator.lag)
        np.testing.assert_allclose(computed, reference, atol=1.E-8)
Beispiel #23
0
 def test_parametrize_with_stride(self):
     """Eigenvalues stay <= 1 when TICA is parametrized with a stride."""
     tau = 5
     upper_bound = 1.0 + 1.E-12
     for stride in xrange(1, 100, 5):
         reader = coor.feature_reader(self.trajnames, self.temppdb)
         # force_eigenvalues_le_one=True enables an internal consistency
         # check in TICA
         estimator = coor.tica(reader, lag=tau, dim=2,
                               force_eigenvalues_le_one=True)
         estimator.parametrize(stride=stride)
         self.assertTrue(np.all(estimator.eigenvalues <= upper_bound))
Beispiel #24
0
def multi_temperature_tram(feat, trajfiles, temperatures, dtrajs=None, stride=1, tica_lag=100,
        keep_tica_dims=20, n_clusters=100, tram_lag=100, engfile="Etot.dat", usecols=(1,), kb=0.0083145):
    """
    Estimate a multi-temperature TRAM model from trajectories and energies.

    Parameters
    ----------
    feat : obj, pyemma.coor.featurizer
        Featurizer object that already has the appropriate features added.
    trajfiles : list
        Names of trajectories to include in estimation.
    temperatures : list
        Temperatures of corresponding trajectories.
    dtrajs : list, optional
        Discrete trajectories. If None they are computed with tica followed
        by kmeans clustering.
    stride : int
        Stride passed to tica and clustering.
    tica_lag : int
        Lagtime to use for constructing tica.
    keep_tica_dims : int
        Number of dimensions to keep from tica.
    n_clusters : int
        Number of clusters for kmeans.
    tram_lag : int
        Lagtime passed to the thermodynamic estimator.
    engfile : str
        Energy file name, read from the directory of each trajectory. Names
        ending in "npy" are read with np.load, all others with np.loadtxt.
    usecols : tuple
        Columns read by np.loadtxt (unused for npy files).
    kb : float
        Boltzmann constant used to form 1/(kb*T) and kb*T.

    Returns
    -------
    dirs, dtrajs, tram
    """

    dirs = [os.path.dirname(name) for name in trajfiles]
    beta = [1./(kb*temp) for temp in temperatures]

    if dtrajs is None:
        inp = coor.source(trajfiles, feat)

        tica_obj = coor.tica(inp, lag=tica_lag, dim=keep_tica_dims, stride=stride)
        projected = tica_obj.get_output()

        clustering = coor.cluster_kmeans(data=projected, k=n_clusters, stride=stride)
        dtrajs = clustering.dtrajs

    # Pick the reader once; both variants take the full path of the file.
    if engfile.endswith("npy"):
        def read_energy(path):
            return np.load(path)
    else:
        def read_energy(path):
            return np.loadtxt(path, usecols=usecols)

    # dimensionless energy
    energy_trajs = [beta[i]*read_energy("{}/{}".format(folder, engfile))
                    for i, folder in enumerate(dirs)]
    temp_trajs = [kb*temperatures[i]*np.ones(energies.shape[0], float)
                  for i, energies in enumerate(energy_trajs)]

    tram = thermo.estimate_multi_temperature(energy_trajs, temp_trajs,
            dtrajs, energy_unit='kT', temp_unit='kT', estimator='tram',
            lag=tram_lag, maxiter=2000000, maxerr=1e-10)

    return dirs, dtrajs, tram
Beispiel #25
0
def model_file():
    """Yield the path of a saved source -> TICA -> k-means model on BPTI.

    The model is saved with ``save_streaming_chain=True`` to a temporary
    path.  After the consumer is done (or on error) the path is removed
    again, whether it turned out to be a regular file or a directory.
    ``shutil.rmtree(..., ignore_errors=True)`` alone silently does nothing
    for a regular file, which left the model file behind.
    """
    path = None
    try:
        from pyemma.datasets import get_bpti_test_data
        d = get_bpti_test_data()
        trajs, top = d['trajs'], d['top']
        s = source(trajs, top=top)

        t = tica(s, lag=1)

        c = cluster_kmeans(t)
        path = tempfile.mktemp()
        c.save(path, save_streaming_chain=True)

        yield path
    finally:
        if path is not None:
            import os
            if os.path.isdir(path):
                shutil.rmtree(path, ignore_errors=True)
            else:
                try:
                    os.remove(path)
                except OSError:
                    # Best-effort cleanup: the file may never have been written.
                    pass
Beispiel #26
0
    def test_feature_correlation_data(self):
        """``feature_TIC_correlation`` equals the ``np.corrcoef`` cross block."""
        # Three features: a ramp, the ramp plus noise, and pure noise.
        n_frames = 100
        features = np.zeros((n_frames, 3))
        features[:, 0] = np.linspace(-.5, .5, n_frames)
        features[:, 1] = (features[:, 0] + np.random.randn(n_frames) * .5)**1
        features[:, 2] = np.random.randn(n_frames)

        estimator = tica(data=features, dim=3, kinetic_map=False)
        projection = estimator.get_output()[0]

        computed = estimator.feature_TIC_correlation
        # corrcoef stacks features and projection; keep the block that relates
        # input rows to projection columns.
        full_corr = np.corrcoef(features.T, y=projection.T)
        n_input = estimator.data_producer.dimension()
        reference = full_corr[:n_input, n_input:]

        assert np.isclose(computed, reference).all()
Beispiel #27
0
    def test_too_short_traj_partial_fit(self):
        """A chunk shorter than the lag triggers exactly one warning."""
        chunks = [np.empty((n_frames, 3)) for n_frames in (20, 10)]
        lag = 11
        estimator = tica(lag=lag)
        from pyemma.util.testing_tools import MockLoggingHandler
        handler = MockLoggingHandler()
        import logging
        cov_logger = logging.getLogger('pyemma.coordinates.estimation.covariance')
        cov_logger.addHandler(handler)
        try:
            for chunk in chunks:
                estimator.partial_fit(chunk)

            warnings = handler.messages['warning']
            self.assertEqual(len(warnings), 1)
            self.assertIn("longer than lag", warnings[0])
        finally:
            # Always detach the handler again, also when an assertion fails.
            cov_logger.removeHandler(handler)
Beispiel #28
0
    def __init__(self, data, lag):
        """Estimate TICA from ``data`` at lag time ``lag``.

        For a ``Metric`` every simulation is projected on its own with
        ``data._projectSingle`` and fed to a ``TICA`` estimator through
        ``partial_fit``; otherwise TICA is estimated on ``data.dat.tolist()``.
        """
        from pyemma.coordinates import tica
        # data.dat.tolist() might be better?
        self.data = data

        if not isinstance(data, Metric):
            self.tic = tica(data.dat.tolist(), lag=lag)
            return

        from pyemma.coordinates.transform.tica import TICA
        self.tic = TICA(lag)

        n_sims = len(data.simulations)
        bar = ProgressBar(n_sims)
        for idx in range(n_sims):
            projected, _, _ = data._projectSingle(idx)
            # Fix for pyemma bug. Remove eventually: missing projections and
            # projections with fewer than `lag` frames are left out (the
            # progress bar is not advanced for them).
            if projected is not None and projected.shape[0] >= lag:
                self.tic.partial_fit(projected)
                bar.progress()
        bar.stop()
def run_sampling(args):
    """Re-sample trajectories by weight, run TICA on them and save the results.

    Draws 10000 trajectory indices below ``args.number_traj`` with
    replacement, using the normalized weights from ``args.weights``.  The
    files ``<args.filedir>/traj<i>.xtc`` are loaded with stride 10, a
    10-dimensional TICA at lag 5 is estimated, and ``output.dat``,
    ``eigenvalues.dat`` and ``selected_frames.dat`` are written into the
    working directory.  A timing line is printed after every stage.

    Timing uses ``time.time()``: ``time.clock()`` was removed in Python 3.8
    and the messages report elapsed time per stage.  ``print`` is called with
    a single parenthesized argument so the function runs on Python 2 and 3.
    """
    topology = args.topfile
    ticadim = 10
    num_sample_frames = 10000
    tica_lag_time = 5
    fn = args.filedir  # file name
    wn = args.weights  # weights name

    weights = np.loadtxt(wn)
    weights = weights / np.sum(weights)
    # first time
    time1 = time.time()
    feat = coor.featurizer(topology)
    feat.add_distances(tmeth.generate_pairs(5, 288, 4, 4))
    selected_frames = np.random.choice(args.number_traj, size=num_sample_frames, replace=True, p=weights)

    selected_frames.sort()
    selected_files = ["%s/traj%d.xtc" % (fn, i) for i in selected_frames]
    time2 = time.time()
    print("Took %f minutes to select new frames" % ((time2 - time1) / 60.0))
    sampled_frames = coor.load(selected_files, feat, stride=10)

    time3 = time.time()
    print("Took %f minutes to load the new frames" % ((time3 - time2) / 60.0))

    tica_obj = coor.tica(sampled_frames, stride=1, lag=tica_lag_time, dim=ticadim)
    time4 = time.time()
    print("Took %f minutes to calculate the tica_object" % ((time4 - time3) / 60.0))
    # Fetch the output once and join it in a single pass, instead of calling
    # get_output() per trajectory and growing the array with np.append.
    projected = tica_obj.get_output()
    all_outputs = np.concatenate(projected[:num_sample_frames], axis=0)
    eigen = tica_obj.eigenvalues
    print("saving files")
    np.savetxt("output.dat", all_outputs)
    np.savetxt("eigenvalues.dat", eigen)
    np.savetxt("selected_frames.dat", selected_frames)
    print("files saved")
    time5 = time.time()
    print("Took %f minutes to write the output files" % ((time5 - time4) / 60.0))
 def test_write_to_csv_propagate_filenames(self):
     """Files written by ``write_to_csv`` reuse the input file names."""
     from pyemma.coordinates import source, tica
     with TemporaryDirectory() as tmpdir:
         arrays = [np.random.random((20, 3))] * 3
         names = ('blah.npy', 'blub.npy', 'foo.npy')
         fns = [os.path.join(tmpdir, name) for name in names]
         for arr, fn in zip(arrays, fns):
             np.save(fn, arr)
         reader = source(fns)
         assert reader.filenames == fns
         tica_obj = tica(reader, lag=1)
         tica_obj.write_to_csv(extension=".exotic")

         # One output file per input file, same stem, new extension.
         written = sorted(
             os.path.abspath(p)
             for p in glob(tmpdir + os.path.sep + '*.exotic'))
         self.assertEqual(len(written), len(fns))
         expected_fns = sorted(fn.replace('.npy', '.exotic') for fn in fns)
         self.assertEqual(written, expected_fns)
Beispiel #31
0
    def __init__(self, data, lag, units='frames'):
        """Estimate TICA from ``data`` at lag time ``lag``.

        For a ``Metric`` the lag is converted from ``units`` to frames with
        ``data.fstep``; every simulation is then projected on its own with
        ``data._projectSingle`` and fed to a ``TICA`` estimator through
        ``partial_fit``.  Otherwise TICA is estimated on
        ``data.dat.tolist()``.

        NOTE(review): for non-``Metric`` data ``units`` is not applied and
        ``lag`` is used as given -- confirm this is intended.
        """
        from pyemma.coordinates import tica
        # data.dat.tolist() might be better?
        self.data = data

        if not isinstance(data, Metric):
            self.tic = tica(data.dat.tolist(), lag=lag)
            return

        from pyemma.coordinates.transform.tica import TICA
        lag = unitconvert(units, 'frames', lag, data.fstep)
        self.tic = TICA(lag)

        n_sims = len(data.simulations)
        bar = ProgressBar(n_sims)
        for idx in range(n_sims):
            projected, _, _ = data._projectSingle(idx)
            # Fix for pyemma bug. Remove eventually: missing projections and
            # projections with fewer than `lag` frames are left out (the
            # progress bar is not advanced for them).
            if projected is not None and projected.shape[0] >= lag:
                self.tic.partial_fit(projected)
                bar.progress()
        bar.stop()
Beispiel #32
0
    def testChunksizeResultsTica(self):
        """Mean and covariance must not depend on the reader's chunk size."""
        chunk = 40
        lag = 100
        np.random.seed(0)
        X = np.random.randn(23000, 3)

        # Reference run with the reader's default chunk size.
        reference = api.tica(data=DataInMemory(X), lag=lag, dim=1)
        ref_cov = reference.cov.copy()
        ref_mean = reference.mean.copy()

        # Second run on a fresh reader with an explicit chunk size.
        chunked_reader = DataInMemory(X)
        chunked_reader.chunksize = chunk
        chunked = tica(data=chunked_reader, lag=lag, dim=1)

        np.testing.assert_allclose(chunked.mean, ref_mean)
        np.testing.assert_allclose(chunked.cov, ref_cov)
Beispiel #33
0
    def testChunksizeResultsTica(self):
        """``mu`` and ``cov`` must not depend on the reader's chunk size."""
        chunk = 40
        lag = 100
        np.random.seed(0)
        X = np.random.randn(23000, 3)

        # Reference run with the reader's default chunk size.
        reference = api.tica(data=DataInMemory(X), lag=lag, dim=1)
        ref_cov = reference.cov.copy()
        ref_mean = reference.mu.copy()

        # Second run on a fresh reader with an explicit chunk size.
        chunked_reader = DataInMemory(X)
        chunked_reader.chunksize = chunk
        chunked = tica(data=chunked_reader, lag=lag, dim=1)

        np.testing.assert_allclose(chunked.mu, ref_mean)
        np.testing.assert_allclose(chunked.cov, ref_cov)
# Convert the msmbuilder featurizer into one usable by the pyemma source below.
featurizer = msmbuilder_to_pyemma(msmb_featurizer,traj)

################################################################################
# Define coordinates source
################################################################################

# Only files whose name ends in '0.h5' are picked up.
trajectory_files = glob(os.path.join(source_directory, '*0.h5'))
coordinates_source = coor.source(trajectory_files,featurizer)
print("There are %d frames total in %d trajectories." % (coordinates_source.n_frames_total(), coordinates_source.number_of_trajectories()))

################################################################################
# Do tICA
################################################################################

print('tICA...')
# The estimator is created without data and attached to the source through
# the pipeline call.
running_tica = coor.tica(lag=1600, dim=4)
coor.pipeline([coordinates_source,running_tica])

################################################################################
# Make eigenvalues plot
################################################################################

plt.clf()
# The squared tICA eigenvalues are what gets summed and plotted.
eigenvalues = (running_tica.eigenvalues)**2

sum_eigenvalues = np.sum(eigenvalues[0:2])

# print() call (not the Python 2 statement) to match the calls above.
print("This is the sum of the first two eigenvalues: %s." % sum_eigenvalues)

plt.plot(eigenvalues)
plt.xlim(0,4)
Beispiel #35
0
 def test_describe(self):
     """``describe`` returns text and also works on an unfitted estimator."""
     description = self.tica_obj.describe()
     is_text = (types.is_string(description)
                or types.is_list_of_string(description))
     assert is_text
     # describe on empty estimator
     tica(lag=1).describe()
Beispiel #36
0
# Script section: featurize, run tICA and k-means as one pipeline, then save
# the resulting discrete trajectories.
featurizer.add_backbone_torsions()

################################################################################
# Define coordinates source
################################################################################

# Only files whose name ends in '0.h5' are picked up by this pattern.
trajectory_files = glob(os.path.join(source_directory, '*0.h5'))
coordinates_source = coor.source(trajectory_files,featurizer)
print("There are %d frames total in %d trajectories." % (coordinates_source.n_frames_total(), coordinates_source.number_of_trajectories()))

################################################################################
# Do tICA
################################################################################

print('tICA...')
# Created without data; it is connected to the source in coor.pipeline below.
running_tica = coor.tica(lag=100, dim=100)

################################################################################
# Cluster
################################################################################

print('Clustering...')
# Also created without data; the pipeline call lists it after the tICA stage.
clustering = coor.cluster_kmeans(k=100, stride=50)
coor.pipeline([coordinates_source,running_tica,clustering])

dtrajs = clustering.dtrajs

# Save discrete trajectories.
clustering.save_dtrajs(output_format='npy', extension='.npy')

################################################################################
Beispiel #37
0
    topfile = tempdirs[0] + "/" + topname

    trajfiles = [ x + "/" + trajname for x in tempdirs ]

    # add features
    feat = coor.featurizer(topfile)
    feat, feature_info = util.sbm_contact_features(feat, pairwise_file, n_native_pairs)

    if not os.path.exists("msm"):
        os.mkdir("msm")

    if (not os.path.exists("msm/dtrajs.pkl")) or recluster:
        # cluster if necessary
        inp = coor.source(trajfiles, feat)
        tica_obj = coor.tica(inp, dim=tica_dims, lag=tica_lag, stride=stride)
        Y = tica_obj.get_output()
        cl = coor.cluster_kmeans(data=Y, k=n_clusters)
        dtrajs = cl.dtrajs

        os.chdir("msm")
        dirs = [ os.path.basename(os.path.dirname(x)) for x in trajfiles ]

        if not dontsavemsm:
            dtraj_info = { dirs[x]:dtrajs[x] for x in range(len(dirs)) }
            dtraj_info["dirs"] = dirs
            with open("dtrajs.pkl", 'wb') as fhandle:
                pickle.dump(dtraj_info, fhandle)
    else:
        os.chdir("msm")
        with open("dtrajs.pkl", 'rb') as fhandle:
Beispiel #38
0
    # Data source over all trajectory files, featurized with `feat`.
    reader = coor.source(trajnames, features=feat)

    # Estimate Markov state model
    # Previously hard-coded settings, kept for reference:
    #tica_lag = 20
    #keep_dims = 23

    # Optional diagnostic plots of the TICA timescales at several lag times.
    if not noplots:
        print("Plotting tica timescales vs lagtime...")
        plot_tica_stuff()

    #tica_lag = 50 # lagtime where TICA timescales are converged
    #keep_dims = 5 # num dims where cumulative variance reaches ~0.8

    # Create the TICA estimator without data and attach it to the reader via a
    # pipeline (presumably this also runs the estimation -- confirm against
    # the PyEMMA pipeline defaults).
    tica = coor.tica(lag=lagtime, stride=1)
    coor.pipeline([reader, tica])
    # Keep only the first `keep_dims` output dimensions.
    Y = tica.get_output(dimensions=range(keep_dims))

    # Disabled: persist the TICA timescales and the per-trajectory TICs.
    #np.save(msm_savedir + "/tica_ti.npy", tica.timescales)

    #print("Saving tica coordinates...")
    ##if not os.path.exists(msm_savedir + "/run_1_TIC_1.npy"):
    #for i in range(keep_dims):
    #    for n in range(len(Y)):
    #        # save TIC with indices of corresponding traj
    #        idx1, idx2 = traj_idxs[n]
    #        tic_saveas = msm_savedir + "/run_{}_{}_TIC_{}.npy".format(idx1, idx2, i+1)
    #        if not os.path.exists(tic_saveas) or resave_tic:
    #            np.save(tic_saveas, Y[n][:,i])
Beispiel #39
0
    def setUpClass(cls):
        """Precompute reference quantities and fit the TICA models under test.

        Everything is stored on ``cls``. Reference means, covariance matrices,
        eigenvalues, eigenvectors and timescales are computed directly with
        NumPy / ``scl`` for three variants (suffixes ``n``: non-reversible,
        ``s``: symmetrized, ``r``: re-weighted), followed by three ``tica``
        estimators fitted on the same data: ``koop_rev``, ``koop_eq`` and
        ``koop_eq_direct``.
        """
        # Basis set definition:
        # chi is a 20 x nf matrix; rows 2n and 2n+1 are set to 1 in column n.
        cls.nf = 10
        cls.chi = np.zeros((20, cls.nf), dtype=float)
        for n in range(cls.nf):
            cls.chi[2 * n:2 * (n + 1), n] = 1.0

        # Load simulations:
        f = np.load(
            pkg_resources.resource_filename(__name__,
                                            "data/test_data_koopman.npz"))
        # One array per key stored in the archive.
        trajs = [f[key] for key in f.keys()]
        # Each trajectory is used to index rows of chi
        # (assumes the stored arrays are integer index arrays -- TODO confirm).
        cls.data = [cls.chi[traj, :] for traj in trajs]

        # Lag time:
        cls.tau = 10
        # Truncation for small eigenvalues:
        cls.epsilon = 1e-6

        # Compute the means:
        # mean_x averages the first T - tau frames of every trajectory, mean_y
        # the last T - tau frames; `frames` counts the time-lagged pairs.
        cls.mean_x = np.zeros(cls.nf)
        cls.mean_y = np.zeros(cls.nf)
        cls.frames = 0
        for traj in cls.data:
            cls.mean_x += np.sum(traj[:-cls.tau, :], axis=0)
            cls.mean_y += np.sum(traj[cls.tau:, :], axis=0)
            cls.frames += traj[:-cls.tau, :].shape[0]
        cls.mean_x *= (1.0 / cls.frames)
        cls.mean_y *= (1.0 / cls.frames)
        # Symmetrized mean: average of the two one-sided means.
        cls.mean_rev = 0.5 * (cls.mean_x + cls.mean_y)

        # Compute correlations:
        # C0 / Ct: instantaneous and time-lagged matrices with mean_x removed.
        # C0_rev / Ct_rev: the same with mean_rev removed, summed over both
        # time directions (hence the 2 * frames normalisation below).
        cls.C0 = np.zeros((cls.nf, cls.nf))
        cls.Ct = np.zeros((cls.nf, cls.nf))
        cls.C0_rev = np.zeros((cls.nf, cls.nf))
        cls.Ct_rev = np.zeros((cls.nf, cls.nf))
        for traj in cls.data:
            itraj = (traj - cls.mean_x[None, :]).copy()
            cls.C0 += np.dot(itraj[:-cls.tau, :].T, itraj[:-cls.tau, :])
            cls.Ct += np.dot(itraj[:-cls.tau, :].T, itraj[cls.tau:, :])
            itraj = (traj - cls.mean_rev[None, :]).copy()
            cls.C0_rev += np.dot(itraj[:-cls.tau, :].T, itraj[:-cls.tau, :])\
                          + np.dot(itraj[cls.tau:, :].T, itraj[cls.tau:, :])
            cls.Ct_rev += np.dot(itraj[:-cls.tau, :].T, itraj[cls.tau:, :])\
                          + np.dot(itraj[cls.tau:, :].T, itraj[:-cls.tau, :])
        cls.C0 *= (1.0 / cls.frames)
        cls.Ct *= (1.0 / cls.frames)
        cls.C0_rev *= (1.0 / (2 * cls.frames))
        cls.Ct_rev *= (1.0 / (2 * cls.frames))

        # Compute whitening transformation:
        # (transform_C0 is defined elsewhere; epsilon is passed as its cutoff.)
        cls.R = transform_C0(cls.C0, cls.epsilon)
        cls.Rrev = transform_C0(cls.C0_rev, cls.epsilon)

        # Perform non-reversible diagonalization
        # Eigenvectors are mapped back through R and rescaled; the timescales
        # follow from the eigenvalue magnitudes as -tau / log|lambda|.
        cls.ln, cls.Rn = scl.eig(np.dot(cls.R.T, np.dot(cls.Ct, cls.R)))
        cls.ln, cls.Rn = sort_by_norm(cls.ln, cls.Rn)
        cls.Rn = np.dot(cls.R, cls.Rn)
        cls.Rn = scale_eigenvectors(cls.Rn)
        cls.tsn = -cls.tau / np.log(np.abs(cls.ln))

        # Same procedure for the symmetrized matrices.
        cls.ls, cls.Rs = scl.eig(
            np.dot(cls.Rrev.T, np.dot(cls.Ct_rev, cls.Rrev)))
        cls.ls, cls.Rs = sort_by_norm(cls.ls, cls.Rs)
        cls.Rs = np.dot(cls.Rrev, cls.Rs)
        cls.Rs = scale_eigenvectors(cls.Rs)
        cls.tss = -cls.tau / np.log(np.abs(cls.ls))

        # Compute non-reversible Koopman matrix:
        # The whitened Ct is extended by one row, (mean_y - mean_x) . R, and
        # one column that is zero except for a 1 in its last entry.
        cls.K = np.dot(cls.R.T, np.dot(cls.Ct, cls.R))
        cls.K = np.vstack((cls.K, np.dot((cls.mean_y - cls.mean_x), cls.R)))
        cls.K = np.hstack(
            (cls.K, np.eye(cls.K.shape[0], 1, k=-cls.K.shape[0] + 1)))
        cls.N1 = cls.K.shape[0]

        # Compute u-vector:
        # Real part of the first eigenvector of K.T after sort_by_norm,
        # rescaled so that its last component equals 1 (v selects that entry).
        ln, Un = scl.eig(cls.K.T)
        ln, Un = sort_by_norm(ln, Un)
        cls.u = np.real(Un[:, 0])
        v = np.eye(cls.N1, 1, k=-cls.N1 + 1)[:, 0]
        cls.u *= (1.0 / np.dot(cls.u, v))

        # Prepare weight object:
        # Express u in the input (un-whitened, un-centred) basis: a vector
        # part R . u[:-1] and a constant that absorbs the mean_x shift.
        u_mod = cls.u.copy()
        N = cls.R.shape[0]
        u_input = np.zeros(N + 1)
        u_input[0:N] = cls.R.dot(u_mod[0:-1])  # in input basis
        u_input[N] = u_mod[-1] - cls.mean_x.dot(cls.R.dot(u_mod[0:-1]))
        weight_obj = _KoopmanWeights(u_input[:-1], u_input[-1])

        # Compute weights over all data points:
        # Each frame is centred by mean_x, whitened with R, extended by a
        # constant 1 and projected onto u.
        cls.wtraj = []
        for traj in cls.data:
            traj = np.dot((traj - cls.mean_x[None, :]), cls.R).copy()
            traj = np.hstack((traj, np.ones((traj.shape[0], 1))))
            cls.wtraj.append(np.dot(traj, cls.u))

        # Compute equilibrium mean:
        # Both halves of every time-lagged pair are weighted with the weight
        # of the pair's first frame; q indexes the matching entry of wtraj.
        cls.mean_eq = np.zeros(cls.nf)
        q = 0
        for traj in cls.data:
            qwtraj = cls.wtraj[q]
            cls.mean_eq += np.sum((qwtraj[:-cls.tau, None] * traj[:-cls.tau, :]), axis=0)\
                           + np.sum((qwtraj[:-cls.tau, None] * traj[cls.tau:, :]), axis=0)
            q += 1
        cls.mean_eq *= (1.0 / (2 * cls.frames))

        # Compute reversible C0, Ct:
        # NOTE(review): the accumulators are (N1, N1) while the products added
        # below are (nf, nf), so this relies on N1 == nf -- confirm.
        cls.C0_eq = np.zeros((cls.N1, cls.N1))
        cls.Ct_eq = np.zeros((cls.N1, cls.N1))
        q = 0
        for traj in cls.data:
            qwtraj = cls.wtraj[q]
            traj = (traj - cls.mean_eq[None, :]).copy()
            cls.C0_eq += np.dot((qwtraj[:-cls.tau, None] * traj[:-cls.tau, :]).T, traj[:-cls.tau, :])\
                         + np.dot((qwtraj[:-cls.tau, None] * traj[cls.tau:, :]).T, traj[cls.tau:, :])
            cls.Ct_eq += np.dot((qwtraj[:-cls.tau, None] * traj[:-cls.tau, :]).T, traj[cls.tau:, :])\
                         + np.dot((qwtraj[:-cls.tau, None] * traj[cls.tau:, :]).T, traj[:-cls.tau, :])
            q += 1
        cls.C0_eq *= (1.0 / (2 * cls.frames))
        cls.Ct_eq *= (1.0 / (2 * cls.frames))

        # Solve re-weighted eigenvalue problem:
        S = transform_C0(cls.C0_eq, cls.epsilon)
        Ct_S = np.dot(S.T, np.dot(cls.Ct_eq, S))

        # Compute its eigenvalues:
        # (eigh is used here, unlike the eig calls for the problems above.)
        cls.lr, cls.Rr = scl.eigh(Ct_S)
        cls.lr, cls.Rr = sort_by_norm(cls.lr, cls.Rr)
        cls.Rr = np.dot(S, cls.Rr)
        cls.Rr = scale_eigenvectors(cls.Rr)
        cls.tsr = -cls.tau / np.log(np.abs(cls.lr))

        # Set up the model:
        # koop_rev: default weighting; koop_eq: weights estimated internally
        # via weights='koopman'.
        cls.koop_rev = tica(cls.data, lag=cls.tau, kinetic_map=False)
        cls.koop_eq = tica(cls.data,
                           lag=cls.tau,
                           kinetic_map=False,
                           weights='koopman')
        # Test the model by supplying weights directly:
        cls.koop_eq_direct = tica(cls.data,
                                  lag=cls.tau,
                                  weights=weight_obj,
                                  kinetic_map=False)
    # Report the shape of the input data X1.
    print np.shape(X1)
    # Candidate lag times: five log-spaced values, truncated to integers.
    # NOTE(review): np.logspace takes *exponents*, so (1, 100, 5) spans
    # 10**1 .. 10**100. Every value but the first exceeds the int64 range, so
    # the astype(int) cast below cannot represent them. Smaller bounds (e.g.
    # exponents 0..2 for lags 1..100) may have been intended -- confirm.
    possible_times = np.logspace(1,100,5)
    possible_times = possible_times.astype(int)
    # Remove duplicates created by the integer truncation, preserving order.
    lag_times = []
    for i in possible_times:
        if i not in lag_times:
            lag_times.append(i)

    print lag_times

    # One list per TICA dimension; each collects that dimension's eigenvalue
    # across the lag times processed so far.
    collected_eigenvalues=[]
    for i in range(ticadim):
        collected_eigenvalues.append([])
    # Debugging aid: uncomment to run a single lag time only.
    #lag_times = [10000]
    for i in lag_times:
        tica_obj = coor.tica(X1, stride=1, lag=i, dim=ticadim)
        # Only the output of the first trajectory is written out.
        outputs = tica_obj.get_output()[0]
        eigen = tica_obj.eigenvalues
        np.savetxt("output_L%d.dat"%i, outputs)
        np.savetxt("eigenvalues_L%d.dat"%i, eigen)
        for j in range(ticadim):
            collected_eigenvalues[j].append(eigen[j])
            # Called after every lag, with the lags and eigenvalues seen so
            # far for dimension j.
            run_plot(lag_times[:len(collected_eigenvalues[j])], collected_eigenvalues[j], j)
            
            
        
    

Beispiel #41
0
            # Contact pairs: read as integers, skipping the first row, then
            # shifted by -1 (presumably 1-based -> 0-based indices; TODO confirm).
            pairs = np.loadtxt("%s/native_contacts.ndx" % dirs[0],dtype=int,skiprows=1) - 1
            # Per-pair thresholds: column 4 of pairwise_params, taking every
            # second entry from index 1 up to 2 * n_pairs, plus a margin of 0.1.
            threshold = np.loadtxt("%s/pairwise_params" % dirs[0],usecols=(4,))[1:2*pairs.shape[0]:2] + 0.1
            scale = 0.3


        # Featurizer parameterizes a pipeline to read in trajectory in chunks.
        feat = coor.featurizer(topfile)
        feat.add_tanh_contacts(pairs,threshold=threshold,scale=scale,periodic=False)

        # Source trajectories
        logger.info("  sourcing trajectories: %s" % traj_list.__str__())
        inp = coor.source(traj_list, feat)

        # Stride has a drastic influence on the number of acceptable eigenvalues.
        logger.info("  computing TICA")
        tica_obj = coor.tica(inp, lag=lag, stride=stride, var_cutoff=0.9, kinetic_map=True)

        # Check if eigenvalues go negative at some point. Truncate before that if necessary.
        logger.info("  TICA done")
        logger.info("    number of dimensions: %d" % tica_obj.dimension())
        if tica_obj.dimension() == 1:
            # A single dimension is always kept, whatever the sign of its eigenvalue.
            keep_dims = 1
        else:
            # The sum over the boolean mask counts the negative eigenvalues.
            if sum(tica_obj.eigenvalues < 0) > 0:
                # Index of the first negative eigenvalue; keep only the
                # dimensions before it, capped at the TICA output dimension.
                first_neg_eigval = np.where(tica_obj.eigenvalues < 0)[0][0]
                keep_dims = min([tica_obj.dimension(),first_neg_eigval])
                logger.info("    first negative eigenvalue: %d" % first_neg_eigval)
            else:
                logger.info("    no negative eigenvalues")
                keep_dims = tica_obj.dimension()
Beispiel #42
0
def plot_tica_stuff():
    """Plot TICA convergence diagnostics over a fixed set of lag times.

    For every lag time a TICA estimator is created and attached to ``reader``
    through ``coor.pipeline``; its ``cumvar`` and ``timescales`` attributes
    are collected. Three figures are then written to ``msm_savedir``, each as
    PDF and PNG:

    * ``tica_its_vs_lag`` -- timescales as a function of the lag time,
    * ``tica_cumvar``     -- cumulative kinetic variance per index,
    * ``tica_its``        -- timescales per index, one series per lag time.

    The function takes no arguments and returns nothing. It reads ``reader``,
    ``f_str`` (used as figure title) and ``msm_savedir`` from the enclosing
    scope and leaves the created figures open.
    """
    # At most this many leading timescales are drawn in the timescale plots.
    max_its = 20

    # calculate TICA at different lagtimes
    tica_lags = np.array([1, 5, 10, 25, 50, 100, 200, 500, 1000])
    all_cumvar = []
    all_tica_ti = []
    for lag in tica_lags:
        tica_obj = coor.tica(lag=lag, stride=1)
        coor.pipeline([reader, tica_obj])

        all_cumvar.append(tica_obj.cumvar)
        all_tica_ti.append(tica_obj.timescales)

    all_cumvar = np.array(all_cumvar)
    all_tica_ti = np.array(all_tica_ti)

    # times vs lag
    # Clamp to the number of timescales actually available: with fewer than
    # `max_its` TICA dimensions a fixed range(20) would raise an IndexError.
    n_its = min(max_its, all_tica_ti.shape[1])
    plt.figure()
    for i in range(n_its):
        plt.plot(tica_lags, all_tica_ti[:, i])
    # Shade the region below the diagonal (timescale <= lag time).
    plt.fill_between(tica_lags, tica_lags, color='gray', lw=2)
    plt.grid(True, alpha=1, color='k', ls='--')
    plt.xlabel(r"Lag time $\tau$")
    plt.ylabel(r"TICA $t_i(\tau)$")
    plt.title(f_str)
    plt.savefig(msm_savedir + "/tica_its_vs_lag.pdf")
    plt.savefig(msm_savedir + "/tica_its_vs_lag.png")

    # cumulative variance
    plt.figure()
    for lag, cumvar in zip(tica_lags, all_cumvar):
        # 1-based index on the x axis.
        plt.plot(np.arange(1, len(cumvar) + 1), cumvar, label=str(lag))

    plt.legend(loc=4)
    plt.grid(True, alpha=1, color='k', ls='--')
    plt.ylim(0, 1)
    plt.xlabel("Index")
    plt.ylabel("Kinetic Variance")
    plt.title(f_str)
    plt.savefig(msm_savedir + "/tica_cumvar.pdf")
    plt.savefig(msm_savedir + "/tica_cumvar.png")

    # times vs index
    plt.figure()
    for lag, timescales in zip(tica_lags, all_tica_ti):
        # Slicing tolerates fewer than `max_its` timescales.
        plt.plot(timescales[:max_its], 'o', label=str(lag))

    plt.legend()
    plt.grid(True, alpha=1, color='k', ls='--')
    plt.xlabel("Index")
    plt.ylabel(r"TICA $t_i$")
    plt.title(f_str)
    plt.savefig(msm_savedir + "/tica_its.pdf")
    plt.savefig(msm_savedir + "/tica_its.png")
Beispiel #43
0
# Number of PCCA clusters.
n_sets = 3

# Report the dimension of the featurizer.
print 'feat dimension'
print feat.dimension()




dataset = []
nlist = []

# Always-true guard: the block below runs unconditionally.
if 1:
    n_clusters = 200
    # TICA estimator created without data; it receives its input through the
    # discretizer below.
    tica_obj = coor.tica( dim=2, lag=tica_lagtime, kinetic_map=True)

    # Despite its name, `input_data` holds the k-means clustering estimator.
    input_data = coor.cluster_kmeans( k=n_clusters, max_iter=50)

    # Chain source -> TICA -> clustering and run the estimation.
    disc = coor.discretizer(inp, tica_obj, input_data, stride=1, chunksize=10)
    disc.parametrize()
print tica_obj.cumvar
# Y is the TICA output.
Y = tica_obj.get_output()
print np.shape(Y)
#print 'Y[0]'
#print Y[0]
# The first axis of Y is reported as the number of trajectories.
print 'number of trajetories = ', np.shape(Y)[0]
#

#mapped_data is the TICA clustered data mapped to the microstates (so integer valued)