def test_provided_means(self):
    """A user-supplied mean must give the same statistics as an estimated one."""
    samples = np.random.random((300, 3))
    column_means = samples.mean(axis=0)

    # One estimator is handed the mean, the other one has to compute it.
    given = tica(samples, mean=column_means)
    estimated = tica(samples)

    for attribute in ('mean', 'cov', 'cov_tau'):
        np.testing.assert_allclose(getattr(given, attribute),
                                   getattr(estimated, attribute))
def project_and_cluster(trajfiles, featurizer, sparsify=False, tica=True,
                        lag=100000, scale=True, var_cutoff=1.0, ncluster=100,
                        cluster=True):
    """Load features, project them with TICA or PCA and optionally cluster.

    Parameters
    ----------
    trajfiles
        Trajectory file(s) handed to ``coor.load``.
    featurizer
        Featurizer handed to ``coor.load``.
    sparsify : bool
        If true, the loaded data is passed through ``remove_constant``.
    tica : bool
        If true use ``coor.tica``, otherwise ``coor.pca``.
    lag : int
        Lag time forwarded to ``coor.tica`` (unused for PCA).
    scale : bool
        If true, every projected trajectory is multiplied in place by the
        leading ``trans_obj.dimension()`` eigenvalues.
    var_cutoff : float
        Variance cutoff forwarded to the transformer.
    ncluster : int
        Number of k-means centers.
    cluster : bool
        If true (default) run k-means on the projection.  The original code
        tested an undefined name ``cluster`` here; it is now an explicit,
        backward-compatible keyword.

    Returns
    -------
    trans_obj, Y, clustering
        When ``cluster`` is true.
    trans_obj, Y
        Otherwise.
    """
    X = coor.load(trajfiles, featurizer)
    if sparsify:
        X = remove_constant(X)

    if tica:
        trans_obj = coor.tica(X, lag=lag, var_cutoff=var_cutoff)
    else:
        trans_obj = coor.pca(X, dim=-1, var_cutoff=var_cutoff)
    Y = trans_obj.get_output()

    if scale:
        # In-place multiply so the arrays held in Y are modified directly.
        for y in Y:
            y *= trans_obj.eigenvalues[:trans_obj.dimension()]

    if cluster:
        cl_obj = coor.cluster_kmeans(Y, k=ncluster, max_iter=3, fixed_seed=True)
        return trans_obj, Y, cl_obj
    return trans_obj, Y
def test_write_to_csv_propagate_filenames(self):
    """CSV output is named after the input files and holds the TICA output."""
    from pyemma.coordinates import source, tica
    with TemporaryDirectory() as td:
        # The same random array is stored under three different names.
        data = [np.random.random((20, 3))] * 3
        fns = [os.path.join(td, name)
               for name in ('blah.npy', 'blub.npy', 'foo.npy')]
        for array, path in zip(data, fns):
            np.save(path, array)

        reader = source(fns)
        assert reader.filenames == fns

        tica_obj = tica(reader, lag=1, dim=2)
        tica_obj.write_to_csv(extension=".exotic", chunksize=3)

        written = sorted(os.path.abspath(hit)
                         for hit in glob(td + os.path.sep + '*.exotic'))
        self.assertEqual(len(written), len(fns))
        self.assertEqual(written,
                         sorted(s.replace('.npy', '.exotic') for s in fns))

        # compare written results
        expected = tica_obj.get_output()
        exotic_files = [s.replace('.npy', '.exotic') for s in fns]
        actual = source(exotic_files).get_output()
        assert len(actual) == len(fns)
        for from_disk, in_memory in zip(actual, expected):
            np.testing.assert_allclose(from_disk, in_memory)
def __init__(self, data, lag, units='frames'):
    """Set up ``self.tic`` either incrementally from a Metric or from stored data.

    For a ``Metric`` the projections produced by ``_projectionGenerator`` are
    fed one by one to ``TICA.partial_fit``; only ``units='frames'`` is
    accepted in that case.  Otherwise ``lag`` is converted to frames with
    ``unitconvert`` and ``tica`` is run on ``data.dat.tolist()``.

    Raises
    ------
    RuntimeError
        If a Metric is combined with units other than frames, or if the
        converted lag is 0 frames.
    """
    from pyemma.coordinates import tica
    # data.dat.tolist() might be better?
    self.data = data

    if not isinstance(data, Metric):
        lag = unitconvert(units, 'frames', lag, data.fstep)
        if lag == 0:
            raise RuntimeError(
                'Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.'
            )
        self.tic = tica(data.dat.tolist(), lag=lag)
        return

    if units != 'frames':
        raise RuntimeError(
            'Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.'
        )
    from pyemma.coordinates.transform.tica import TICA
    self.tic = TICA(lag)
    bar = ProgressBar(len(data.simulations))
    for batch in _projectionGenerator(data, _getNcpus()):
        for entry in batch:
            # The first element of each entry is what gets fitted.
            self.tic.partial_fit(entry[0])
        bar.progress(len(batch))
    bar.stop()
def test_length_and_content_feature_reader_and_TICA(self):
    """Strided output of reader and TICA has the expected lengths and content."""
    for stride in range(1, 100, 23):
        reader = coor.source(self.trajnames, top=self.temppdb)
        transformer = coor.tica(data=reader, lag=2, dim=2)
        transformer.parametrize()

        # subsample data
        out_tica = transformer.get_output(stride=stride)
        out_reader = reader.get_output(stride=stride)

        # the same lengths, obtained in different ways
        len_tica = [chunk.shape[0] for chunk in out_tica]
        len_reader = [chunk.shape[0] for chunk in out_reader]
        len_trajs = transformer.trajectory_lengths(stride=stride)
        len_ref = [(traj.shape[0] - 1) // stride + 1 for traj in self.data]

        np.testing.assert_equal(len_trajs, len_ref)
        self.assertTrue(len_ref == len_tica)
        self.assertTrue(len_ref == len_reader)

        # content of the reader output against the flattened reference data
        for reference, produced in zip(self.data, out_reader):
            n_frames, n_atoms = reference.shape[0], reference.shape[1]
            flattened = reference.reshape((n_frames, n_atoms * 3))
            self.assertTrue(
                np.allclose(flattened[::stride, :], produced, atol=1E-3))
def make_TICA_decomposition(ticaObject, folders, folderPath, lag,
                            overWriteObject=False, kinetic_map=True,
                            commute_map=False):
    """Return a TICA model, loading it from ``ticaObject`` when possible.

    The model is (re-)estimated when ``overWriteObject`` is true or the
    pickle file ``ticaObject`` does not exist; it is then built from all
    ``coord*`` files below ``<folderPath>/<epoch>/repeatedExtractedCoordinates``
    for every epoch in ``folders`` and pickled to ``ticaObject``.
    """
    if not overWriteObject and os.path.exists(ticaObject):
        # NOTE: pickle.load executes arbitrary code; only use trusted files.
        with open(ticaObject, "rb") as stored:
            return pickle.load(stored)

    def _file_number(name):
        # integer between the last "_" and the 4-character suffix
        return int(name[name.rfind("_") + 1:-4])

    trajs = []
    for epoch in folders:
        pattern = os.path.join(
            folderPath, "%s/repeatedExtractedCoordinates/coord*" % epoch)
        for name in sorted(glob.glob(pattern), key=_file_number):
            trajs.append(np.loadtxt(name))

    model = coor.tica(data=trajs, lag=lag, kinetic_map=kinetic_map,
                      commute_map=commute_map)
    with open(ticaObject, "wb") as target:
        pickle.dump(model, target)
    return model
def test_MD_data(self):
    """Smoke test: TICA on the tiny BPTI trajectory must not raise.

    The data set is too small for meaningful results.
    """
    data_dir = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
    self.pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
    self.xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')
    reader = source(self.xtc_file, top=self.pdb_file)
    # only the absence of an exception matters here
    tica(reader, lag=1)
def setUpClass(cls):
    """Create data with 90 constant and 10 random columns and fit TICA on it."""
    n_frames, n_columns, n_variable = 10000, 100, 10

    cls.data = np.ones((n_frames, n_columns))
    cls.variable_columns = np.random.choice(n_columns, n_variable, replace=False)
    cls.data[:, cls.variable_columns] = np.random.rand(n_frames, n_variable)

    # Start with one of the constant columns:
    every_column = np.arange(cls.data.shape[1])
    constant_columns = np.setdiff1d(every_column, cls.variable_columns)
    cls.initial_columns = constant_columns[0:1]

    cls.tica_obj = tica(data=cls.data, lag=1)
def test_MD_data(self):
    """Smoke test: TICA on the tiny BPTI trajectory must not raise.

    The data set is too small for meaningful results.
    """
    here = os.path.split(__file__)[0]
    data_dir = os.path.join(here, 'data')
    self.pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
    self.xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')
    reader = source(self.xtc_file, top=self.pdb_file)
    # only the absence of an exception matters here
    tica(reader, lag=1)
def test_parametrize_with_stride(self):
    """Eigenvalues stay <= 1 for several strides.

    A RuntimeError is tolerated only when the lag is not a multiple of the
    stride.
    """
    tau = 5
    upper_bound = 1.0 + 1.E-12
    for stride in range(1, 100, 23):
        reader = coor.source(self.trajnames, top=self.temppdb)
        try:
            estimator = coor.tica(reader, lag=tau, stride=stride, dim=2)
            self.assertTrue(np.all(estimator.eigenvalues <= upper_bound))
        except RuntimeError:
            assert tau % stride != 0
def test_notify_changes_mixin(self):
    """Assigning ``data_producer`` registers the consumer as a stream child."""
    from pyemma.coordinates.transform import TICA

    X_t = np.random.random((30, 30))
    reader = coor.source(np.array(X_t))
    upstream = coor.tica(reader)
    downstream = TICA(lag=10)

    assert len(upstream._stream_children) == 0
    downstream.data_producer = upstream
    assert upstream._stream_children[0] == downstream
def run_sampling(args):
    """Resample frames by weight, run TICA on them and save the results.

    Reads the trajectory ``args.file`` (C-alpha distances, topology
    ``Native.pdb``) and the weights file ``args.weights``, draws 10000 frames
    with replacement according to the normalised weights, runs TICA
    (lag 1, 10 dimensions) on the drawn frames and writes ``output.dat`` and
    ``eigenvalues.dat``.  Timing information is printed along the way.

    If any sampled distance is exactly 0 an error is printed once and written
    to ``log.txt``.  The original printed and rewrote the file once per zero
    entry and left the file handle to the garbage collector.
    """
    topology = "Native.pdb"
    ticadim = 10
    num_sample_frames = 10000
    fn = args.file  # file name
    wn = args.weights  # weights name
    weights = np.loadtxt(wn)
    weights = weights / np.sum(weights)

    # NOTE(review): time.clock() was removed in Python 3.8; kept because the
    # surrounding script targets Python 2.
    time1 = time.clock()
    feat = coor.featurizer(topology)
    feat.add_distances_ca()
    X1 = coor.load(fn, feat, stride=1)
    time2 = time.clock()
    print("Took %f minutes to load a file" % ((time2 - time1) / 60.0))

    sampled_frames = np.zeros((num_sample_frames, np.shape(X1)[1]))
    selected_frames = np.random.choice(np.shape(X1)[0], size=num_sample_frames,
                                       replace=True, p=weights)
    time3 = time.clock()
    print("Took %f minutes to select new frames" % ((time3 - time2) / 60.0))

    # One fancy-indexing copy instead of a Python loop over every frame;
    # assigning into the preallocated array keeps its float64 dtype.
    sampled_frames[:] = X1[selected_frames, :]
    time4 = time.clock()
    print("Took %f minutes to load the new frames" % ((time4 - time3) / 60.0))

    # Sanity check: a distance of exactly zero means something was not written.
    if np.any(sampled_frames == 0):
        print("ERROR, distance too short, something not written")
        with open("log.txt", "w") as f:
            f.write("ERROR, distance too short, something not written")
    time5 = time.clock()
    print("Took %f minutes to go through the debug check" % ((time5 - time4) / 60.0))

    tica_obj = coor.tica(sampled_frames, stride=1, lag=1, dim=ticadim)
    time6 = time.clock()
    print("Took %f minutes to calculate the tica_object" % ((time6 - time5) / 60.0))

    outputs = tica_obj.get_output()[0]
    eigen = tica_obj.eigenvalues
    time7 = time.clock()
    print("Took %f minutes to get the output of the tica_object" % ((time7 - time6) / 60.0))

    print("saving files")
    np.savetxt("output.dat", outputs)
    np.savetxt("eigenvalues.dat", eigen)
    print("files saved")
    time8 = time.clock()
    print("Took %f minutes to write the output files" % ((time8 - time7) / 60.0))
def run_analysis(args):
    """Run TICA on pair distances of one trajectory, then save and plot results.

    Writes ``<title>_output_raw.dat`` and ``<title>_eigenvalues_raw.dat`` and
    hands eigenvalues and projected output to the ``tmeth`` plot helpers.
    """
    feat = coor.featurizer(args.topfile)
    first, last = args.range[0], args.range[1]
    feat.add_distances(
        tmeth.generate_pairs(first, last, args.step_size, args.cut_value))

    traj = coor.load(args.traj_file, feat, stride=args.stride)
    tica_obj = coor.tica(traj, stride=1, lag=args.lag, dim=args.ticadim)
    outputs = tica_obj.get_output()[0]
    eigen = tica_obj.eigenvalues

    np.savetxt("%s_output_raw.dat" % args.title, outputs)
    np.savetxt("%s_eigenvalues_raw.dat" % args.title, eigen)

    tmeth.plot_eigen_series(eigen, args.title,
                            time_scale=args.time_step * args.stride)
    tmeth.plot_output(outputs, args.title,
                      time_scale=args.time_step * args.stride)
def test_skipped_trajs(self):
    """With lag 11 the trajectories of length 10 and 11 are reported as skipped."""
    feature_trajs = [np.arange(length) for length in (10, 11, 12, 13)]
    tica_obj = tica(data=feature_trajs, lag=11)
    assert len(tica_obj._skipped_trajs) == 2
    assert np.allclose(tica_obj._skipped_trajs, [0, 1])
def test_skipped_trajs(self):
    """Too-short trajectories are dropped by the iterator, so none are recorded."""
    feature_trajs = [np.arange(length) for length in (10, 11, 12, 13)]
    tica_obj = tica(data=feature_trajs, lag=11)
    # we skip the trajs right away in the iterator
    n_recorded = len(tica_obj._skipped_trajs)
    assert n_recorded == 0
def setUpClass(cls):
    """Save a source -> tica -> k-means chain on the BPTI test data to a temp file."""
    from pyemma.datasets import get_bpti_test_data

    bpti = get_bpti_test_data()
    trajs, top = bpti['trajs'], bpti['top']
    reader = source(trajs, top=top)
    projection = tica(reader, lag=1)
    clustering = cluster_kmeans(projection)

    cls.model_file = tempfile.mktemp()
    clustering.save(cls.model_file, save_streaming_chain=True)
def test_too_short_traj_partial_fit(self):
    """partial_fit warns once about a trajectory that is not longer than lag.

    Two chunks (20 and 10 frames) are fitted with lag 11; only the first one
    contributes data and exactly one warning containing "longer than lag"
    must be logged.
    """
    data = [np.empty((20, 3)), np.empty((10, 3))]
    lag = 11
    tica_obj = tica(lag=lag)

    from pyemma.util.testing_tools import MockLoggingHandler
    log_handler = MockLoggingHandler()
    covar_logger = tica_obj._covar.logger
    covar_logger.addHandler(log_handler)
    try:
        for x in data:
            tica_obj.partial_fit(x)

        self.assertEqual(tica_obj._used_data, 20 - lag)
        self.assertEqual(len(log_handler.messages['warning']), 1)
        self.assertIn("longer than lag", log_handler.messages['warning'][0])
    finally:
        # Always detach the mock handler so it does not leak into other tests.
        covar_logger.removeHandler(log_handler)
def test_write_h5(self):
    """write_to_hdf5 stores the transformed output and honours ``overwrite``.

    First part: for several chunk sizes and strides the datasets written to
    the HDF5 group must equal ``get_output``.  Second part: writing twice to
    the same group must raise ValueError unless ``overwrite=True``.

    h5py files are opened with an explicit mode, because the default mode
    differs between h5py versions (read-only since h5py 3).
    """
    from pyemma.coordinates import tica
    import h5py
    import tempfile

    dim = 10
    data = [np.random.random((np.random.randint(50, 150), dim)) for _ in range(4)]
    tica_obj = tica(data, lag=1)

    out = tempfile.mktemp()
    group = '/test'

    def remove_output():
        # Tolerate a missing file so a failure before the write is not
        # masked by a secondary error raised during cleanup.
        if os.path.exists(out):
            os.remove(out)

    def perform(chunksize, stride):
        try:
            transformed_output = tica_obj.get_output(chunk=chunksize, stride=stride)
            tica_obj.write_to_hdf5(out, group=group, chunksize=chunksize, stride=stride)
            with h5py.File(out, 'r') as f:
                assert len(f[group]) == len(data)
                for (itraj, actual), desired in zip(f[group].items(), transformed_output):
                    np.testing.assert_equal(actual, desired,
                                            err_msg='failed for cs=%s, stride=%s' % (chunksize, stride))
        finally:
            remove_output()

    for cs in [0, 1, 3, 10, 42, 50]:
        for s in [1, 2, 3, 10]:
            perform(cs, s)

    # test overwrite
    try:
        tica_obj.write_to_hdf5(out, group=group)
        with self.assertRaises(ValueError):
            tica_obj.write_to_hdf5(out, group=group)

        os.remove(out)
        tica_obj.write_to_hdf5(out)
        with self.assertRaises(ValueError) as ctx:
            tica_obj.write_to_hdf5(out)
        assert 'Refusing to overwrite data' in ctx.exception.args[0]

        os.remove(out)
        tica_obj.write_to_hdf5(out, group=group)
        tica_obj.write_to_hdf5(out, group=group, overwrite=True)

        os.remove(out)
        # 'a' creates the file that was just removed (the old h5py default).
        with h5py.File(out, 'a') as f:
            f.create_group('empty').create_dataset('0000', shape=(1, 1))
        with self.assertRaises(ValueError):
            tica_obj.write_to_hdf5(out, group='empty')
        tica_obj.write_to_hdf5(out, group='empty', overwrite=True)
    finally:
        remove_output()
def setUpClass(cls):
    """Expand a discrete 2-well trajectory into Gaussian features and fit TICA.

    The first 10000 frames of ``dtraj_T100K_dt10`` are mapped onto 100
    Gaussian basis functions centred on an equidistant grid in [-1, 1]:
    ``Z[t, j] = exp(-(b[v[t]] - b[j])**2 / (2 * sigma**2))``.
    The feature matrix is built with one broadcast expression instead of a
    Python double loop over all 10000 x 100 entries.
    """
    from pyemma.datasets import load_2well_discrete
    dw = load_2well_discrete()
    v = dw.dtraj_T100K_dt10[:10000]
    cls.T = v.size

    nstates = 100
    b = np.linspace(-1, 1, nstates)
    sigma = 0.15

    # (T, 1) positions of the visited states minus (1, nstates) grid centres
    visited = b[v]
    cls.Z = np.exp(-(visited[:, None] - b[None, :]) ** 2 / (2 * sigma ** 2))

    cls.lag = 10
    cls.tica_obj = tica(data=cls.Z, lag=cls.lag)
def test_feature_correlation_MD(self):
    """feature_TIC_correlation matches ``mycorrcoef`` on the mini BPTI data."""
    # Same input files as test_MD_data
    data_dir = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
    self.pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
    self.xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')

    reader = source(self.xtc_file, top=self.pdb_file)
    ticamini = tica(reader, lag=1, kinetic_map=False)

    features = ticamini.data_producer.get_output()[0]
    projected = ticamini.get_output()[0]

    computed = ticamini.feature_TIC_correlation
    reference = mycorrcoef(features, projected, ticamini.lag)
    np.testing.assert_allclose(computed, reference, atol=1.E-8)
def setUp(self):
    """Collect three readers: MD files, random arrays, and TICA on the arrays."""
    self.readers = []
    data_dir = pkg_resources.resource_filename('pyemma.coordinates.tests', 'data')

    # three md trajs
    md_files = glob(data_dir + "/bpti_0*.xtc")
    topology = os.path.join(data_dir, 'bpti_ca.pdb')
    md_reader = source(md_files, top=topology)
    self.readers.append(md_reader)
    md_reader.featurizer.add_all()
    ndim = md_reader.ndim

    # three random arrays with the same lengths and dimension as the MD data
    random_arrays = [np.random.random((length, ndim))
                     for length in md_reader.trajectory_lengths()]
    array_reader = source(random_arrays)
    self.readers.append(array_reader)

    self.readers.append(tica(array_reader, dim=20))
def test_feature_correlation_data(self):
    """feature_TIC_correlation matches ``mycorrcoef`` on synthetic features."""
    # Three features: a ramp, the ramp plus noise, and pure noise.
    feature_traj = np.zeros((100, 3))
    n_frames = len(feature_traj)
    feature_traj[:, 0] = np.linspace(-.5, .5, n_frames)
    feature_traj[:, 1] = (feature_traj[:, 0] + np.random.randn(n_frames) * .5)**1
    feature_traj[:, 2] = np.random.randn(n_frames)

    # Tica
    tica_obj = tica(data=feature_traj, dim=3, kinetic_map=False)
    projected = tica_obj.get_output()[0]

    # Correlations: estimator property against the reference helper
    computed = tica_obj.feature_TIC_correlation
    reference = mycorrcoef(feature_traj, projected, tica_obj.lag)
    np.testing.assert_allclose(computed, reference, atol=1.E-8)
def test_parametrize_with_stride(self):
    """Eigenvalues stay <= 1 when TICA is parametrized with various strides."""
    tau = 5
    upper_bound = 1.0 + 1.E-12
    for stride in xrange(1, 100, 5):
        reader = coor.feature_reader(self.trajnames, self.temppdb)
        # force_eigenvalues_le_one=True enables an internal consistency check in TICA
        estimator = coor.tica(reader, lag=tau, dim=2,
                              force_eigenvalues_le_one=True)
        estimator.parametrize(stride=stride)
        self.assertTrue(np.all(estimator.eigenvalues <= upper_bound))
def multi_temperature_tram(feat, trajfiles, temperatures, dtrajs=None,
                           stride=1, tica_lag=100, keep_tica_dims=20,
                           n_clusters=100, tram_lag=100, engfile="Etot.dat",
                           usecols=(1,), kb=0.0083145):
    """Estimate a multi-temperature TRAM model from trajectories and energies.

    Parameters
    ----------
    feat : obj, pyemma.coor.featurizer
        Featurizer object that already has the appropriate features added.
    trajfiles : list
        Names of trajectories to include in estimation.
    temperatures : list
        Temperatures of corresponding trajectories.
    dtrajs : list, optional
        Discrete trajectories. When None they are computed with TICA
        followed by k-means.
    stride : int
        Number of frames to skip in tica and clustering.
    tica_lag : int
        Lagtime to use for constructing tica.
    keep_tica_dims : int
        Number of dimensions to keep from tica.
    n_clusters : int
        Number of clusters for kmeans.
    tram_lag : int
        Lag passed to the TRAM estimator.
    engfile : str
        Energy file name, looked up in the directory of every trajectory;
        read with ``np.load`` if it ends with "npy", else with ``np.loadtxt``.
    usecols : tuple
        Columns read by ``np.loadtxt``.
    kb : float
        Boltzmann constant used to form beta = 1 / (kb * T).

    Returns
    -------
    dirs, dtrajs, tram
    """
    dirs = [os.path.dirname(name) for name in trajfiles]
    beta = [1. / (kb * temperature) for temperature in temperatures]

    if dtrajs is None:
        inp = coor.source(trajfiles, feat)
        tica_obj = coor.tica(inp, lag=tica_lag, dim=keep_tica_dims, stride=stride)
        projected = tica_obj.get_output()
        clustering = coor.cluster_kmeans(data=projected, k=n_clusters, stride=stride)
        dtrajs = clustering.dtrajs

    # dimensionless energy
    if engfile.endswith("npy"):
        def read_energy(path):
            return np.load(path)
    else:
        def read_energy(path):
            return np.loadtxt(path, usecols=usecols)

    energy_trajs = []
    for idx, directory in enumerate(dirs):
        energy_trajs.append(
            beta[idx] * read_energy("{}/{}".format(directory, engfile)))

    temp_trajs = []
    for idx in range(len(dirs)):
        n_frames = energy_trajs[idx].shape[0]
        temp_trajs.append(kb * temperatures[idx] * np.ones(n_frames, float))

    # dTRAM approach
    tram = thermo.estimate_multi_temperature(
        energy_trajs, temp_trajs, dtrajs,
        energy_unit='kT', temp_unit='kT', estimator='tram',
        lag=tram_lag, maxiter=2000000, maxerr=1e-10)
    return dirs, dtrajs, tram
def model_file():
    """Yield the path of a saved source -> tica -> k-means chain, then delete it.

    The chain is estimated on the BPTI test data and saved with
    ``save_streaming_chain=True`` to a temporary path.  The path is removed
    on teardown whether it is a regular file or a directory.  The original
    cleanup called only ``shutil.rmtree(..., ignore_errors=True)``, which
    silently does nothing for a regular file.
    """
    import os

    path = None
    try:
        from pyemma.datasets import get_bpti_test_data
        d = get_bpti_test_data()
        trajs, top = d['trajs'], d['top']
        s = source(trajs, top=top)
        t = tica(s, lag=1)
        c = cluster_kmeans(t)
        path = tempfile.mktemp()
        c.save(path, save_streaming_chain=True)
        yield path
    finally:
        if path is not None:
            if os.path.isdir(path):
                shutil.rmtree(path, ignore_errors=True)
            else:
                # Best-effort cleanup, mirroring ignore_errors=True above.
                try:
                    os.remove(path)
                except OSError:
                    pass
def test_feature_correlation_data(self):
    """feature_TIC_correlation matches the cross block of ``np.corrcoef``."""
    # Three features: a ramp, the ramp plus noise, and pure noise.
    feature_traj = np.zeros((100, 3))
    n_frames = len(feature_traj)
    feature_traj[:, 0] = np.linspace(-.5, .5, n_frames)
    feature_traj[:, 1] = (feature_traj[:, 0] + np.random.randn(n_frames) * .5)**1
    feature_traj[:, 2] = np.random.randn(n_frames)

    # Tica
    tica_obj = tica(data=feature_traj, dim=3, kinetic_map=False)
    projected = tica_obj.get_output()[0]

    # Correlations: rows are input features, columns are TICs
    computed = tica_obj.feature_TIC_correlation
    full_matrix = np.corrcoef(feature_traj.T, y=projected.T)
    reference = full_matrix[:tica_obj.data_producer.dimension(),
                            tica_obj.data_producer.dimension():]
    assert np.isclose(computed, reference).all()
def test_too_short_traj_partial_fit(self):
    """partial_fit logs exactly one warning for a chunk not longer than lag."""
    import logging
    from pyemma.util.testing_tools import MockLoggingHandler

    lag = 11
    chunks = [np.empty((20, 3)), np.empty((10, 3))]
    tica_obj = tica(lag=lag)

    log_handler = MockLoggingHandler()
    covariance_logger = logging.getLogger('pyemma.coordinates.estimation.covariance')
    covariance_logger.addHandler(log_handler)
    try:
        for chunk in chunks:
            tica_obj.partial_fit(chunk)
        warnings_seen = log_handler.messages['warning']
        self.assertEqual(len(warnings_seen), 1)
        self.assertIn("longer than lag", warnings_seen[0])
    finally:
        covariance_logger.removeHandler(log_handler)
def __init__(self, data, lag):
    """Set up ``self.tic`` either incrementally from a Metric or from stored data.

    For a ``Metric`` every simulation is projected on its own and fed to
    ``TICA.partial_fit``; projections that are None or have fewer than
    ``lag`` rows are skipped.  Otherwise ``tica`` is run on
    ``data.dat.tolist()``.
    """
    from pyemma.coordinates import tica
    # data.dat.tolist() might be better?
    self.data = data

    if not isinstance(data, Metric):
        self.tic = tica(data.dat.tolist(), lag=lag)
        return

    from pyemma.coordinates.transform.tica import TICA
    self.tic = TICA(lag)
    bar = ProgressBar(len(data.simulations))
    for sim_index in range(len(data.simulations)):
        # Fix for pyemma bug. Remove eventually:
        projection, _, _ = data._projectSingle(sim_index)
        if projection is not None and projection.shape[0] >= lag:
            self.tic.partial_fit(projection)
            bar.progress()
    bar.stop()
def run_sampling(args):
    """Resample whole trajectories by weight, run TICA and save the results.

    Draws 10000 trajectory indices (with replacement, weights read from
    ``args.weights``) out of ``args.number_traj``, loads the matching
    ``<args.filedir>/traj<i>.xtc`` files with stride 10, runs TICA
    (lag 5, 10 dimensions) and writes ``output.dat``, ``eigenvalues.dat``
    and ``selected_frames.dat``.  Timing information is printed on the way.

    The per-trajectory outputs are fetched once and joined with a single
    ``np.concatenate``.  The original called ``get_output()`` once per
    trajectory and grew the result with ``np.append``, which is quadratic.
    """
    topology = args.topfile
    ticadim = 10
    num_sample_frames = 10000
    tica_lag_time = 5
    fn = args.filedir  # file name
    wn = args.weights  # weights name
    weights = np.loadtxt(wn)
    weights = weights / np.sum(weights)

    # NOTE(review): time.clock() was removed in Python 3.8; kept because the
    # surrounding script targets Python 2.
    time1 = time.clock()
    feat = coor.featurizer(topology)
    feat.add_distances(tmeth.generate_pairs(5, 288, 4, 4))

    selected_frames = np.random.choice(args.number_traj, size=num_sample_frames,
                                       replace=True, p=weights)
    selected_frames.sort()
    selected_files = ["%s/traj%d.xtc" % (fn, i) for i in selected_frames]
    time2 = time.clock()
    print("Took %f minutes to select new frames" % ((time2 - time1) / 60.0))

    sampled_frames = coor.load(selected_files, feat, stride=10)
    time3 = time.clock()
    print("Took %f minutes to load the new frames" % ((time3 - time2) / 60.0))

    tica_obj = coor.tica(sampled_frames, stride=1, lag=tica_lag_time, dim=ticadim)
    time4 = time.clock()
    print("Took %f minutes to calculate the tica_object" % ((time4 - time3) / 60.0))

    # One projection pass, then stack the first num_sample_frames outputs.
    per_traj_outputs = tica_obj.get_output()
    all_outputs = np.concatenate(per_traj_outputs[:num_sample_frames], axis=0)
    eigen = tica_obj.eigenvalues

    print("saving files")
    np.savetxt("output.dat", all_outputs)
    np.savetxt("eigenvalues.dat", eigen)
    np.savetxt("selected_frames.dat", selected_frames)
    print("files saved")
    time5 = time.clock()
    print("Took %f minutes to write the output files" % ((time5 - time4) / 60.0))
def test_write_to_csv_propagate_filenames(self):
    """CSV output files are named after the input files with the new extension."""
    from pyemma.coordinates import source, tica
    with TemporaryDirectory() as td:
        # The same random array is stored under three different names.
        data = [np.random.random((20, 3))] * 3
        fns = [os.path.join(td, name)
               for name in ('blah.npy', 'blub.npy', 'foo.npy')]
        for array, path in zip(data, fns):
            np.save(path, array)

        reader = source(fns)
        assert reader.filenames == fns

        tica_obj = tica(reader, lag=1)
        tica_obj.write_to_csv(extension=".exotic")

        written = sorted(os.path.abspath(hit)
                         for hit in glob(td + os.path.sep + '*.exotic'))
        self.assertEqual(len(written), len(fns))
        self.assertEqual(written,
                         sorted(s.replace('.npy', '.exotic') for s in fns))
def __init__(self, data, lag, units='frames'):
    """Set up ``self.tic`` either incrementally from a Metric or from stored data.

    For a ``Metric`` the lag is converted to frames with ``unitconvert``;
    every simulation is then projected on its own and fed to
    ``TICA.partial_fit``, skipping projections that are None or have fewer
    than ``lag`` rows.  Otherwise ``tica`` is run on ``data.dat.tolist()``
    with the lag as given.
    """
    from pyemma.coordinates import tica
    # data.dat.tolist() might be better?
    self.data = data

    if not isinstance(data, Metric):
        self.tic = tica(data.dat.tolist(), lag=lag)
        return

    from pyemma.coordinates.transform.tica import TICA
    lag = unitconvert(units, 'frames', lag, data.fstep)
    self.tic = TICA(lag)
    bar = ProgressBar(len(data.simulations))
    for sim_index in range(len(data.simulations)):
        # Fix for pyemma bug. Remove eventually:
        projection, _, _ = data._projectSingle(sim_index)
        if projection is not None and projection.shape[0] >= lag:
            self.tic.partial_fit(projection)
            bar.progress()
    bar.stop()
def testChunksizeResultsTica(self):
    """Mean and covariance must not depend on the reader's chunk size."""
    chunk, lag = 40, 100
    np.random.seed(0)
    X = np.random.randn(23000, 3)

    # un-chunked reference
    reference_reader = DataInMemory(X)
    reference = api.tica(data=reference_reader, lag=lag, dim=1)
    cov = reference.cov.copy()
    mean = reference.mean.copy()

    # ------- run again with new chunksize -------
    chunked_reader = DataInMemory(X)
    chunked_reader.chunksize = chunk
    chunked = tica(data=chunked_reader, lag=lag, dim=1)

    np.testing.assert_allclose(chunked.mean, mean)
    np.testing.assert_allclose(chunked.cov, cov)
def testChunksizeResultsTica(self):
    """Mean (``mu``) and covariance must not depend on the reader's chunk size."""
    chunk, lag = 40, 100
    np.random.seed(0)
    X = np.random.randn(23000, 3)

    # un-chunked reference
    reference_reader = DataInMemory(X)
    reference = api.tica(data=reference_reader, lag=lag, dim=1)
    cov = reference.cov.copy()
    mean = reference.mu.copy()

    # ------- run again with new chunksize -------
    chunked_reader = DataInMemory(X)
    chunked_reader.chunksize = chunk
    chunked = tica(data=chunked_reader, lag=lag, dim=1)

    np.testing.assert_allclose(chunked.mu, mean)
    np.testing.assert_allclose(chunked.cov, cov)
featurizer = msmbuilder_to_pyemma(msmb_featurizer,traj) ################################################################################ # Define coordinates source ################################################################################ trajectory_files = glob(os.path.join(source_directory, '*0.h5')) coordinates_source = coor.source(trajectory_files,featurizer) print("There are %d frames total in %d trajectories." % (coordinates_source.n_frames_total(), coordinates_source.number_of_trajectories())) ################################################################################ # Do tICA ################################################################################ print('tICA...') running_tica = coor.tica(lag=1600, dim=4) coor.pipeline([coordinates_source,running_tica]) ################################################################################ # Make eigenvalues plot ################################################################################ plt.clf() eigenvalues = (running_tica.eigenvalues)**2 sum_eigenvalues = np.sum(eigenvalues[0:2]) print "This is the sum of the first two eigenvalues: %s." % sum_eigenvalues plt.plot(eigenvalues) plt.xlim(0,4)
def test_describe(self):
    """describe() returns a string or list of strings, also when unfitted."""
    description = self.tica_obj.describe()
    is_text = types.is_string(description) or types.is_list_of_string(description)
    assert is_text
    # describe on empty estimator must not raise
    empty_estimator = tica(lag=1)
    empty_estimator.describe()
featurizer.add_backbone_torsions() ################################################################################ # Define coordinates source ################################################################################ trajectory_files = glob(os.path.join(source_directory, '*0.h5')) coordinates_source = coor.source(trajectory_files,featurizer) print("There are %d frames total in %d trajectories." % (coordinates_source.n_frames_total(), coordinates_source.number_of_trajectories())) ################################################################################ # Do tICA ################################################################################ print('tICA...') running_tica = coor.tica(lag=100, dim=100) ################################################################################ # Cluster ################################################################################ print('Clustering...') clustering = coor.cluster_kmeans(k=100, stride=50) coor.pipeline([coordinates_source,running_tica,clustering]) dtrajs = clustering.dtrajs # Save discrete trajectories. clustering.save_dtrajs(output_format='npy', extension='.npy') ################################################################################
topfile = tempdirs[0] + "/" + topname trajfiles = [ x + "/" + trajname for x in tempdirs ] # add features feat = coor.featurizer(topfile) feat, feature_info = util.sbm_contact_features(feat, pairwise_file, n_native_pairs) if not os.path.exists("msm"): os.mkdir("msm") if (not os.path.exists("msm/dtrajs.pkl")) or recluster: # cluster if necessary inp = coor.source(trajfiles, feat) tica_obj = coor.tica(inp, dim=tica_dims, lag=tica_lag, stride=stride) Y = tica_obj.get_output() cl = coor.cluster_kmeans(data=Y, k=n_clusters) dtrajs = cl.dtrajs os.chdir("msm") dirs = [ os.path.basename(os.path.dirname(x)) for x in trajfiles ] if not dontsavemsm: dtraj_info = { dirs[x]:dtrajs[x] for x in range(len(dirs)) } dtraj_info["dirs"] = dirs with open("dtrajs.pkl", 'wb') as fhandle: pickle.dump(dtraj_info, fhandle) else: os.chdir("msm") with open("dtrajs.pkl", 'rb') as fhandle:
reader = coor.source(trajnames, features=feat) # Estimate Markov state model #tica_lag = 20 #keep_dims = 23 #keep_dims = 23 if not noplots: print("Plotting tica timescales vs lagtime...") plot_tica_stuff() #tica_lag = 50 # lagtime where TICA timescales are converged #keep_dims = 5 # num dims where cumulative variance reaches ~0.8 tica = coor.tica(lag=lagtime, stride=1) coor.pipeline([reader, tica]) Y = tica.get_output(dimensions=range(keep_dims)) #np.save(msm_savedir + "/tica_ti.npy", tica.timescales) #print("Saving tica coordinates...") ##if not os.path.exists(msm_savedir + "/run_1_TIC_1.npy"): #for i in range(keep_dims): # for n in range(len(Y)): # # save TIC with indices of corresponding traj # idx1, idx2 = traj_idxs[n] # tic_saveas = msm_savedir + "/run_{}_{}_TIC_{}.npy".format(idx1, idx2, i+1) # if not os.path.exists(tic_saveas) or resave_tic: # np.save(tic_saveas, Y[n][:,i])
def setUpClass(cls):
    """Compute reference TICA/Koopman quantities by hand and fit the models.

    Everything is computed with plain NumPy/SciPy from the discrete
    trajectories in ``data/test_data_koopman.npz``: means, instantaneous and
    time-lagged correlation matrices (one-sided and symmetrized), whitening
    transformations, eigen-decompositions, the Koopman matrix with its
    leading left eigenvector ``u``, per-frame weights, and the re-weighted
    ("equilibrium") mean and correlations.  Finally three ``tica`` models
    are estimated on the same data for comparison in the tests.
    """
    # Basis set definition:
    # chi maps each of 20 discrete states onto one of nf indicator functions;
    # states 2n and 2n+1 both map to function n.
    cls.nf = 10
    cls.chi = np.zeros((20, cls.nf), dtype=float)
    for n in range(cls.nf):
        cls.chi[2 * n:2 * (n + 1), n] = 1.0

    # Load simulations:
    f = np.load(
        pkg_resources.resource_filename(__name__, "data/test_data_koopman.npz"))
    trajs = [f[key] for key in f.keys()]
    # Row lookup in chi turns every discrete trajectory into a feature trajectory.
    cls.data = [cls.chi[traj, :] for traj in trajs]

    # Lag time:
    cls.tau = 10
    # Truncation for small eigenvalues:
    cls.epsilon = 1e-6

    # Compute the means:
    # mean_x over the first T-tau frames, mean_y over the last T-tau frames.
    cls.mean_x = np.zeros(cls.nf)
    cls.mean_y = np.zeros(cls.nf)
    cls.frames = 0
    for traj in cls.data:
        cls.mean_x += np.sum(traj[:-cls.tau, :], axis=0)
        cls.mean_y += np.sum(traj[cls.tau:, :], axis=0)
        cls.frames += traj[:-cls.tau, :].shape[0]
    cls.mean_x *= (1.0 / cls.frames)
    cls.mean_y *= (1.0 / cls.frames)
    cls.mean_rev = 0.5 * (cls.mean_x + cls.mean_y)

    # Compute correlations:
    # C0/Ct use mean_x; the *_rev versions use mean_rev and are symmetrized.
    cls.C0 = np.zeros((cls.nf, cls.nf))
    cls.Ct = np.zeros((cls.nf, cls.nf))
    cls.C0_rev = np.zeros((cls.nf, cls.nf))
    cls.Ct_rev = np.zeros((cls.nf, cls.nf))
    for traj in cls.data:
        itraj = (traj - cls.mean_x[None, :]).copy()
        cls.C0 += np.dot(itraj[:-cls.tau, :].T, itraj[:-cls.tau, :])
        cls.Ct += np.dot(itraj[:-cls.tau, :].T, itraj[cls.tau:, :])
        itraj = (traj - cls.mean_rev[None, :]).copy()
        cls.C0_rev += np.dot(itraj[:-cls.tau, :].T, itraj[:-cls.tau, :])\
            + np.dot(itraj[cls.tau:, :].T, itraj[cls.tau:, :])
        cls.Ct_rev += np.dot(itraj[:-cls.tau, :].T, itraj[cls.tau:, :])\
            + np.dot(itraj[cls.tau:, :].T, itraj[:-cls.tau, :])
    cls.C0 *= (1.0 / cls.frames)
    cls.Ct *= (1.0 / cls.frames)
    # The symmetrized sums contain two terms per frame pair, hence 2 * frames.
    cls.C0_rev *= (1.0 / (2 * cls.frames))
    cls.Ct_rev *= (1.0 / (2 * cls.frames))

    # Compute whitening transformation:
    cls.R = transform_C0(cls.C0, cls.epsilon)
    cls.Rrev = transform_C0(cls.C0_rev, cls.epsilon)

    # Perform non-reversible diagonalization
    cls.ln, cls.Rn = scl.eig(np.dot(cls.R.T, np.dot(cls.Ct, cls.R)))
    cls.ln, cls.Rn = sort_by_norm(cls.ln, cls.Rn)
    cls.Rn = np.dot(cls.R, cls.Rn)
    cls.Rn = scale_eigenvectors(cls.Rn)
    cls.tsn = -cls.tau / np.log(np.abs(cls.ln))

    # Same steps for the symmetrized (reversible) matrices.
    cls.ls, cls.Rs = scl.eig(
        np.dot(cls.Rrev.T, np.dot(cls.Ct_rev, cls.Rrev)))
    cls.ls, cls.Rs = sort_by_norm(cls.ls, cls.Rs)
    cls.Rs = np.dot(cls.Rrev, cls.Rs)
    cls.Rs = scale_eigenvectors(cls.Rs)
    cls.tss = -cls.tau / np.log(np.abs(cls.ls))

    # Compute non-reversible Koopman matrix:
    # The whitened Ct is extended by one row (mean shift) and one column
    # (unit vector with a 1 in the last entry).
    cls.K = np.dot(cls.R.T, np.dot(cls.Ct, cls.R))
    cls.K = np.vstack((cls.K, np.dot((cls.mean_y - cls.mean_x), cls.R)))
    cls.K = np.hstack(
        (cls.K, np.eye(cls.K.shape[0], 1, k=-cls.K.shape[0] + 1)))
    cls.N1 = cls.K.shape[0]

    # Compute u-vector:
    # First left eigenvector of K after sort_by_norm, normalized so that its
    # last component equals 1.
    ln, Un = scl.eig(cls.K.T)
    ln, Un = sort_by_norm(ln, Un)
    cls.u = np.real(Un[:, 0])
    v = np.eye(cls.N1, 1, k=-cls.N1 + 1)[:, 0]
    cls.u *= (1.0 / np.dot(cls.u, v))

    # Prepare weight object:
    u_mod = cls.u.copy()
    N = cls.R.shape[0]
    u_input = np.zeros(N + 1)
    u_input[0:N] = cls.R.dot(u_mod[0:-1])  # in input basis
    # Constant term: last component of u minus the contribution of mean_x.
    u_input[N] = u_mod[-1] - cls.mean_x.dot(cls.R.dot(u_mod[0:-1]))
    weight_obj = _KoopmanWeights(u_input[:-1], u_input[-1])

    # Compute weights over all data points:
    cls.wtraj = []
    for traj in cls.data:
        traj = np.dot((traj - cls.mean_x[None, :]), cls.R).copy()
        # Append a column of ones to match the extended Koopman basis.
        traj = np.hstack((traj, np.ones((traj.shape[0], 1))))
        cls.wtraj.append(np.dot(traj, cls.u))

    # Compute equilibrium mean:
    cls.mean_eq = np.zeros(cls.nf)
    q = 0
    for traj in cls.data:
        qwtraj = cls.wtraj[q]
        cls.mean_eq += np.sum((qwtraj[:-cls.tau, None] * traj[:-cls.tau, :]), axis=0)\
            + np.sum((qwtraj[:-cls.tau, None] * traj[cls.tau:, :]), axis=0)
        q += 1
    cls.mean_eq *= (1.0 / (2 * cls.frames))

    # Compute reversible C0, Ct:
    # NOTE(review): these are allocated as N1 x N1 while the products added
    # below are nf x nf, so this relies on N1 == nf - confirm.
    cls.C0_eq = np.zeros((cls.N1, cls.N1))
    cls.Ct_eq = np.zeros((cls.N1, cls.N1))
    q = 0
    for traj in cls.data:
        qwtraj = cls.wtraj[q]
        traj = (traj - cls.mean_eq[None, :]).copy()
        cls.C0_eq += np.dot((qwtraj[:-cls.tau, None] * traj[:-cls.tau, :]).T, traj[:-cls.tau, :])\
            + np.dot((qwtraj[:-cls.tau, None] * traj[cls.tau:, :]).T, traj[cls.tau:, :])
        cls.Ct_eq += np.dot((qwtraj[:-cls.tau, None] * traj[:-cls.tau, :]).T, traj[cls.tau:, :])\
            + np.dot((qwtraj[:-cls.tau, None] * traj[cls.tau:, :]).T, traj[:-cls.tau, :])
        q += 1
    cls.C0_eq *= (1.0 / (2 * cls.frames))
    cls.Ct_eq *= (1.0 / (2 * cls.frames))

    # Solve re-weighted eigenvalue problem:
    S = transform_C0(cls.C0_eq, cls.epsilon)
    Ct_S = np.dot(S.T, np.dot(cls.Ct_eq, S))

    # Compute its eigenvalues:
    cls.lr, cls.Rr = scl.eigh(Ct_S)
    cls.lr, cls.Rr = sort_by_norm(cls.lr, cls.Rr)
    cls.Rr = np.dot(S, cls.Rr)
    cls.Rr = scale_eigenvectors(cls.Rr)
    cls.tsr = -cls.tau / np.log(np.abs(cls.lr))

    # Set up the model:
    cls.koop_rev = tica(cls.data, lag=cls.tau, kinetic_map=False)
    cls.koop_eq = tica(cls.data, lag=cls.tau, kinetic_map=False, weights='koopman')
    # Test the model by supplying weights directly:
    cls.koop_eq_direct = tica(cls.data, lag=cls.tau, weights=weight_obj, kinetic_map=False)
print np.shape(X1)

# Candidate lag times: log-spaced values, cast to int and de-duplicated
# while keeping their order.
# NOTE(review): np.logspace(1,100,5) spans 10**1 .. 10**100; the cast to int
# cannot represent the larger values. A range such as 10**0 .. 10**2 may have
# been intended - confirm.
possible_times = np.logspace(1,100,5)
possible_times = possible_times.astype(int)
lag_times = []
for i in possible_times:
    if i not in lag_times:
        lag_times.append(i)
print lag_times

# One list per TICA dimension; each collects that eigenvalue over all lag times.
collected_eigenvalues=[]
for i in range(ticadim):
    collected_eigenvalues.append([])

#debug
#lag_times = [10000]
#debugg

for i in lag_times:
    tica_obj = coor.tica(X1, stride=1, lag=i, dim=ticadim)
    outputs = tica_obj.get_output()[0]
    eigen = tica_obj.eigenvalues
    # Output and eigenvalues are saved per lag time, the lag is part of the name.
    np.savetxt("output_L%d.dat"%i, outputs)
    np.savetxt("eigenvalues_L%d.dat"%i, eigen)
    for j in range(ticadim):
        collected_eigenvalues[j].append(eigen[j])
        # Re-plot eigenvalue j against the lag times processed so far.
        run_plot(lag_times[:len(collected_eigenvalues[j])], collected_eigenvalues[j], j)
# Native contact pairs; 1 is subtracted from the indices read from the file.
pairs = np.loadtxt("%s/native_contacts.ndx" % dirs[0],dtype=int,skiprows=1) - 1
# Per-contact threshold: column 4 of pairwise_params, every second row starting
# at row 1, plus 0.1.
threshold = np.loadtxt("%s/pairwise_params" % dirs[0],usecols=(4,))[1:2*pairs.shape[0]:2] + 0.1
scale = 0.3

# Featurizer parameterizes a pipeline to read in trajectory in chunks.
feat = coor.featurizer(topfile)
feat.add_tanh_contacts(pairs,threshold=threshold,scale=scale,periodic=False)

# Source trajectories
logger.info(" sourcing trajectories: %s" % traj_list.__str__())
inp = coor.source(traj_list, feat)

# Stride has a drastic influence on the number of acceptable eigenvalues.
logger.info(" computing TICA")
tica_obj = coor.tica(inp, lag=lag, stride=stride, var_cutoff=0.9, kinetic_map=True)

# Check if eigenvalues go negative at some point. Truncate before that if necessary.
logger.info(" TICA done")
logger.info(" number of dimensions: %d" % tica_obj.dimension())
if tica_obj.dimension() == 1:
    keep_dims = 1
else:
    if sum(tica_obj.eigenvalues < 0) > 0:
        # Index of the first negative eigenvalue caps the number of kept dimensions.
        first_neg_eigval = np.where(tica_obj.eigenvalues < 0)[0][0]
        keep_dims = min([tica_obj.dimension(),first_neg_eigval])
        logger.info(" first negative eigenvalue: %d" % first_neg_eigval)
    else:
        logger.info(" no negative eigenvalues")
        keep_dims = tica_obj.dimension()
def plot_tica_stuff():
    """Run TICA at several lag times and save three diagnostic plots.

    Uses the module-level ``reader``, ``f_str`` and ``msm_savedir``.
    Saved (as .pdf and .png) below ``msm_savedir``: implied timescales
    against lag time, cumulative kinetic variance per lag time, and
    timescales against index.
    """
    tica_lags = np.array([1, 5, 10, 25, 50, 100, 200, 500, 1000])

    # calculate TICA at different lagtimes
    cumvar_per_lag = []
    timescales_per_lag = []
    for lag in tica_lags:
        estimator = coor.tica(lag=lag, stride=1)
        coor.pipeline([reader, estimator])
        cumvar_per_lag.append(estimator.cumvar)
        timescales_per_lag.append(estimator.timescales)
    all_cumvar = np.array(cumvar_per_lag)
    all_tica_ti = np.array(timescales_per_lag)

    def label_and_save(xlabel, ylabel, stem):
        # shared axis labelling, title and output for all three figures
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(f_str)
        plt.savefig(msm_savedir + stem + ".pdf")
        plt.savefig(msm_savedir + stem + ".png")

    # times vs lag
    plt.figure()
    for process in range(20):
        plt.plot(tica_lags, all_tica_ti[:, process])
    plt.fill_between(tica_lags, tica_lags, color='gray', lw=2)
    plt.grid(True, alpha=1, color='k', ls='--')
    label_and_save(r"Lag time $\tau$", r"TICA $t_i(\tau)$", "/tica_its_vs_lag")

    # cumulative variance
    plt.figure()
    for lag, cumvar in zip(tica_lags, all_cumvar):
        plt.plot(np.arange(1, len(cumvar) + 1), cumvar, label=str(lag))
    plt.legend(loc=4)
    plt.grid(True, alpha=1, color='k', ls='--')
    plt.ylim(0, 1)
    label_and_save("Index", "Kinetic Variance", "/tica_cumvar")

    # times vs index
    plt.figure()
    for row, lag in enumerate(tica_lags):
        plt.plot(all_tica_ti[row, :20], 'o', label=str(lag))
    plt.legend()
    plt.grid(True, alpha=1, color='k', ls='--')
    label_and_save("Index", r"TICA $t_i$", "/tica_its")
#number of PCCA clusters n_sets = 3 print 'feat dimension' print feat.dimension() dataset = [] nlist = [] if 1: n_clusters = 200 tica_obj = coor.tica( dim=2, lag=tica_lagtime, kinetic_map=True) input_data = coor.cluster_kmeans( k=n_clusters, max_iter=50) disc = coor.discretizer(inp, tica_obj, input_data, stride=1, chunksize=10) disc.parametrize() print tica_obj.cumvar #TICA output is Y Y = tica_obj.get_output() print np.shape(Y) #print 'Y[0]' #print Y[0] print 'number of trajetories = ', np.shape(Y)[0] # #mapped_data is the TICA clustered data mapped to the microstates (so integer valued)