def test_write_to_csv_propagate_filenames(self):
    """Check that ``write_to_csv`` names its output after the input files.

    Three ``.npy`` inputs are pushed through TICA and written with the
    ``.exotic`` extension. The test verifies that exactly one output file
    per input shows up in the same directory and that reading those files
    back reproduces the TICA output.
    """
    from pyemma.coordinates import source, tica

    with TemporaryDirectory() as td:
        input_names = [
            os.path.join(td, name) for name in ('blah.npy', 'blub.npy', 'foo.npy')
        ]
        # The same random array is stored under all three names.
        data = [np.random.random((20, 3))] * 3
        for array, name in zip(data, input_names):
            np.save(name, array)

        reader = source(input_names)
        assert reader.filenames == input_names

        tica_obj = tica(reader, lag=1, dim=2)
        tica_obj.write_to_csv(extension=".exotic", chunksize=3)

        # Expected output names: same path, extension swapped.
        exotic_names = [name.replace('.npy', '.exotic') for name in input_names]

        written = sorted(
            os.path.abspath(found) for found in glob(td + os.path.sep + '*.exotic')
        )
        self.assertEqual(len(written), len(input_names))
        self.assertEqual(written, sorted(exotic_names))

        # compare written results
        expected = tica_obj.get_output()
        actual = source(exotic_names).get_output()
        assert len(actual) == len(input_names)
        for read_back, reference in zip(actual, expected):
            np.testing.assert_allclose(read_back, reference)
def setUp(self):
    """Create two readers of matching shape: MD trajectories and random arrays."""
    self.readers = []
    data_dir = pkg_resources.resource_filename('pyemma.coordinates.tests', 'data')
    topology = os.path.join(data_dir, 'bpti_ca.pdb')

    # three md trajs
    md_reader = source(glob(data_dir + "/bpti_0*.xtc"), top=topology)
    self.readers.append(md_reader)
    md_reader.featurizer.add_all()

    # three random arrays with the same lengths and dimension as the MD reader
    ndim = md_reader.ndim
    random_arrays = [
        np.random.random((n_frames, ndim))
        for n_frames in md_reader.trajectory_lengths()
    ]
    self.readers.append(source(random_arrays))
def test_fragmented_trajs(self):
    """Merge two fragmented readers (two fragments each) and check the result."""
    first_fragment = np.arange(20)
    second_fragment = np.arange(20, 40)

    # Each reader gets its own single trajectory made of the two fragments.
    readers = tuple(
        source([(first_fragment, second_fragment)]) for _ in range(2)
    )
    merged = SourcesMerger(readers).get_output()

    # Both readers contribute the column 0..39, giving one (40, 2) array.
    expected = [np.tile(np.arange(40), (2, 1)).T]
    np.testing.assert_equal(merged, expected)
def test_lagged_iterator(self):
    """Check lagged iteration for chunk sizes at least as long as a trajectory.

    With ``chunk`` being ``None``, ``0`` or not smaller than the trajectory
    length, every trajectory is expected to arrive as a single (X, Y) pair
    where X holds the first ``length - lag`` frames and Y the frames from
    ``lag`` onwards.
    """
    import pyemma.coordinates as coor
    from pyemma.coordinates.tests.util import create_traj, get_top

    n_frames = 4720
    lag = 1000
    n_trajs = 15
    n_overlap = n_frames - lag

    top = get_top()
    created = [create_traj(top=top, length=n_frames) for _ in range(n_trajs)]
    traj_files = [entry[0] for entry in created]
    reference = [entry[1].reshape(-1, 9) for entry in created]

    reader = coor.source(traj_files, top=top, chunksize=5000)

    chunk_sizes = (None, 0, n_frames, n_frames + 1, n_frames + 1000)
    for chunk in chunk_sizes:
        it = reader.iterator(lag=lag, chunk=chunk, return_trajindex=True)
        with it:
            for itraj, X, Y in it:
                np.testing.assert_equal(X.shape, Y.shape)
                np.testing.assert_equal(X.shape[0], n_overlap)
                np.testing.assert_array_almost_equal(X, reference[itraj][:n_overlap])
                np.testing.assert_array_almost_equal(Y, reference[itraj][lag:])
def prepare_tica_inputs(datasets, topfile, features=None, selection=None, chunksize=10000, singletraj=False):
    """Build the PyEMMA source that feeds a TICA calculation.

    Parameters
    ----------
    datasets
        Passed unchanged to ``squish_tica_inputfiles`` together with the
        featurizer.
    topfile : mdtraj.Topology or str
        A topology object, or the path of a file ``mdtraj.load`` can read.
    features : dict, optional
        Feature description handed to ``apply_feat_part``. When empty or
        ``None``, inverse distances with a ``select_Ca`` selection are used.
    selection : str, optional
        mdtraj selection string; when given, the topology is reduced to the
        selected atoms before the featurizer is created.
    chunksize : int
        Chunk size forwarded to ``coor.source``.
    singletraj : bool
        When true, the input list is wrapped in another list before it is
        passed to ``coor.source``.

    Returns
    -------
    tuple
        ``(tica_inp, input_order)``: the source object and the ordering
        returned by ``squish_tica_inputfiles``.

    Raises
    ------
    IOError
        If ``topfile`` is neither a topology object nor an existing path.
    """
    if isinstance(topfile, mdtraj.Topology):
        topology = topfile
    elif os.path.exists(topfile):
        topology = mdtraj.load(topfile).topology
    else:
        # Previously this only printed and then failed later on an unbound
        # local variable; fail here with the real reason instead.
        raise IOError("Cannot find topology file: %s" % topfile)

    if selection:
        topology = topology.subset(topology.select(selection_string=selection))

    feat = coor.featurizer(topology)
    if not features:
        # then use inverse Ca distances
        # PyEMMA equivalent: `feat.add_inverse_distances(feat.select_backbone())`
        features = {'add_inverse_distances': {'select_Ca': None}}
    apply_feat_part(feat, features)

    ticainputs, input_order = squish_tica_inputfiles(datasets, feat)
    if singletraj:
        ticainputs = [ticainputs]

    tica_inp = coor.source(ticainputs, feat, chunksize=chunksize)
    return tica_inp, input_order
def test_fragmented_reader(self):
    """Round-trip a fragmented reader through ``self.compare`` and check its output."""
    from pyemma.coordinates.tests.util import create_traj
    from pyemma.util.files import TemporaryDirectory

    top_file = pkg_resources.resource_filename(__name__, 'data/test.pdb')
    trajfiles = []
    with TemporaryDirectory() as tmp_dir:
        for _ in range(3):
            created = create_traj(top_file, dir=tmp_dir)
            trajfiles.append(created[0])

        # three trajectories: one consisting of all three, one consisting of the first,
        # one consisting of the first and the last
        frag_trajs = [
            trajfiles,
            [trajfiles[0]],
            [trajfiles[0], trajfiles[2]],
        ]
        chunksize = 232
        source = coor.source(frag_trajs, top=top_file, chunksize=chunksize)

        params = {
            'chunksize': chunksize,
            'ndim': source.ndim,
            '_trajectories': trajfiles,
        }
        restored = self.compare(source, params)

        np.testing.assert_equal(source.get_output(), restored.get_output())
def test_h5_reader(self):
    """Round-trip an HDF5-backed reader through ``self.compare`` and check its output."""
    h5_path = pkg_resources.resource_filename(__name__, 'data/bpti_mini.h5')
    params = {'selection': '/coordinates'}

    source = coor.source(h5_path, **params)
    restored = self.compare(source, params)

    np.testing.assert_equal(source.get_output(), restored.get_output())
def setUp(self):
    """Prepare reader, random frame sets, output paths and a reference trajectory."""

    def _random_frame_set(n_members):
        # Column 0: trajectory index drawn from {0, 2};
        # column 1: frame index drawn from [0, 32).
        traj_column = np.random.permutation([0, 2] * n_members)[:n_members]
        frame_column = np.random.randint(32, size=n_members)
        return np.vstack((traj_column, frame_column)).T

    self.eps = 1e-10
    data_dir = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
    self.pdbfile = os.path.join(data_dir, 'bpti_ca.pdb')
    self.trajfiles = [
        os.path.join(data_dir, name)
        for name in ('bpti_001-033.xtc', 'bpti_034-066.xtc', 'bpti_067-100.xtc')
    ]

    # Create random sets of files and frames to be retrieved from trajfiles
    self.sets = [_random_frame_set(10), _random_frame_set(20)]

    # The suffix ends with a separator so that plain string concatenation
    # below yields paths inside the new directory.
    self.subdir = tempfile.mkdtemp(suffix='save_trajs_test/')

    # Instantiate the reader
    self.reader = coor.source(self.trajfiles, top=self.pdbfile)
    self.reader.chunksize = 30

    n_sets = len(self.sets)
    self.n_pass_files = [
        self.subdir + 'n_pass.set_%06u.xtc' % idx for idx in range(n_sets)
    ]
    self.one_pass_files = [
        self.subdir + '1_pass.set_%06u.xtc' % idx for idx in range(n_sets)
    ]

    self.traj_ref = save_traj_w_md_load_frame(self.reader, self.sets)
    self.strides = [2, 3, 5]
def test_fragmented_xtc(self):
    """Read fragmented xtc trajectories and compare against mdtraj-loaded data."""
    from pyemma.coordinates.tests.util import create_traj

    top_file = pkg_resources.resource_filename(__name__, 'data/test.pdb')
    trajfiles = []
    for _ in range(3):
        created = create_traj(top_file)
        trajfiles.append(created[0])

    try:
        # three trajectories: one consisting of all three, one consisting of the first,
        # one consisting of the first and the last
        fragments = [trajfiles, [trajfiles[0]], [trajfiles[0], trajfiles[2]]]
        source = coor.source(fragments, top=top_file)
        source.chunksize = 1000
        out = source.get_output(stride=1)

        # Reference: each file loaded on its own and flattened to 9 columns.
        reference = [
            mdtraj.load(fname, top=top_file).xyz.reshape(-1, 9)
            for fname in trajfiles[0:3]
        ]

        np.testing.assert_equal(out[0], np.vstack(reference))
        np.testing.assert_equal(out[1], reference[0])
        np.testing.assert_equal(out[2], np.vstack((reference[0], reference[2])))
    finally:
        # Best-effort removal of the generated trajectory files.
        for fname in trajfiles:
            try:
                os.unlink(fname)
            except EnvironmentError:
                pass
def partial_fit(self, X):
    """Incrementally update the covariances and mean.

    Parameters
    ----------
    X : array, list of arrays, PyEMMA reader
        input data.

    Returns
    -------
    The model object, updated with the new means and covariance matrices.

    Raises
    ------
    RuntimeError
        If ``self.dim`` is an integer larger than the input dimension.

    Notes
    -----
    The estimator is flagged as not estimated afterwards.
    """
    from pyemma.coordinates import source

    iterable = source(X)

    if isinstance(self.dim, int):
        indim = iterable.dimension()
        if self.dim > indim:
            raise RuntimeError(
                "requested more output dimensions (%i) than dimension"
                " of input data (%i)" % (self.dim, indim))

    self._covar = self._init_covar(partial=True)
    self._covar.partial_fit(iterable)

    covar = self._covar
    self.model.update_model_params(
        mean_0=covar.mean,  # TODO: inefficient, fixme
        mean_t=covar.mean_tau,
        C00=covar.C00_,
        C0t=covar.C0t_,
        Ctt=covar.Ctt_)

    self._estimated = False
    return self.model
def _test_fragment_reader(self, file_format, stride, lag, chunksize):
    """Read all test trajectories as one fragmented trajectory and verify the data.

    With ``lag > 0`` both the unlagged and the lagged stream are compared
    against the stacked reference data; otherwise only the plain stream is.
    """
    trajs = self.test_trajs[file_format]
    reader = coor.source([trajs], top=self.pdb_file, chunksize=chunksize)
    assert isinstance(reader, FragmentedTrajectoryReader)

    data = np.vstack(self.traj_data)
    itraj = None

    if not lag > 0:
        chunks = []
        for itraj, X in reader.iterator(stride=stride):
            chunks.append(X)
        assert chunks
        stacked = np.vstack(chunks)
        np.testing.assert_allclose(data[::stride], stacked, atol=self.eps)
    else:
        chunks = []
        lagged_chunks = []
        for itraj, X, Y in reader.iterator(stride=stride, lag=lag):
            chunks.append(X)
            lagged_chunks.append(Y)
        assert chunks
        assert lagged_chunks
        assert len(chunks) == len(lagged_chunks)

        stacked = np.vstack(chunks)
        stacked_lagged = np.vstack(lagged_chunks)
        # The unlagged reference is cut to the number of lagged frames.
        np.testing.assert_allclose(
            data[::stride][0:len(stacked_lagged)], stacked, atol=self.eps,
            err_msg="lag={}, stride={}, cs={}".format(lag, stride, chunksize))
        np.testing.assert_allclose(data[lag::stride], stacked_lagged, atol=self.eps)

    assert itraj == 0  # only one trajectory
def test_with_save_traj(self):
    """Check ``save_traj`` on a fragmented reader at and across fragment borders."""
    path = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
    pdb_file = os.path.join(path, 'bpti_ca.pdb')
    traj_files = [
        os.path.join(path, name)
        for name in ('bpti_001-033.xtc', 'bpti_034-066.xtc', 'bpti_067-100.xtc')
    ]

    source_frag = coor.source([traj_files], top=pdb_file)
    full_data = source_frag.get_output()[0]

    def _flatten(frames):
        # (n_frames, n_atoms, 3) -> (n_frames, n_atoms * 3), singleton axes dropped
        xyz = frames.xyz
        return xyz.reshape((xyz.shape[0], xyz.shape[1] * xyz.shape[2])).squeeze()

    last_frame_fragment_0 = [0, 32]
    first_frame_fragment_1 = [0, 33]
    first_frame_fragment_2 = [0, 66]

    cases = (
        # Frames in the first fragment:
        ([last_frame_fragment_0], 32),
        # Frames the first and second fragments
        ([last_frame_fragment_0, first_frame_fragment_1], np.array([32, 33])),
        # Frames only in the second fragment
        ([first_frame_fragment_1], 33),
        # Frames only in the second and third fragment
        ([first_frame_fragment_1, first_frame_fragment_2], np.array([33, 66])),
    )
    for frame_indices, expected_rows in cases:
        frames = coor.save_traj(source_frag, frame_indices, None)
        np.testing.assert_equal(_flatten(frames), full_data[expected_rows])
def test_non_matching_lengths(self):
    """A reader with one shortened trajectory must make ``SourcesMerger`` raise."""
    arrays = self.readers[1].data
    # Same data as the second reader, but the third trajectory cut to 20 frames.
    shortened = [arrays[0], arrays[1], arrays[2][:20]]
    self.readers.append(source(shortened))

    with self.assertRaises(ValueError) as ctx:
        SourcesMerger(self.readers)

    message = ctx.exception.args[0]
    self.assertIn('matching', message)
def test_assignment_multithread_minrsmd(self):
    """Assignments with two jobs and with one job must agree (minRMSD metric)."""
    # re-do assignment with multiple threads and compare results
    import pyemma.datasets as data

    d = data.get_bpti_test_data()
    reader = coor.source(d['trajs'], top=d['top'])

    # Three frames from each of the first three trajectories serve as centers.
    N_centers = 9
    frame_picks = ([0, 1, 7], [32, 1, 23], [17, 8, 15])
    centers = np.asarray(tuple(
        reader.ra_itraj_jagged[itraj, frames]
        for itraj, frames in enumerate(frame_picks)
    )).reshape((N_centers, -1))

    chunksize = 1000
    assignment_mp, assignment_sp = [
        coor.assign_to_centers(reader, centers, n_jobs=n_jobs,
                               chunksize=chunksize, metric='minRMSD')
        for n_jobs in (2, 1)
    ]

    np.testing.assert_equal(assignment_mp, assignment_sp)
def test_length_and_content_feature_reader_and_TICA(self):
    """Check strided output lengths of reader and TICA, and the reader content."""
    for stride in range(1, 100, 23):
        reader = coor.source(self.trajnames, top=self.temppdb)
        tica_obj = coor.tica(data=reader, lag=2, dim=2)
        # t.data_producer = r
        tica_obj.parametrize()

        # subsample data
        out_tica = tica_obj.get_output(stride=stride)
        out_reader = reader.get_output(stride=stride)

        # get length in different ways
        len_tica = [chunk.shape[0] for chunk in out_tica]
        len_reader = [chunk.shape[0] for chunk in out_reader]
        len_trajs = tica_obj.trajectory_lengths(stride=stride)
        len_ref = [(ref.shape[0] - 1) // stride + 1 for ref in self.data]

        # compare length
        np.testing.assert_equal(len_trajs, len_ref)
        self.assertTrue(len_ref == len_tica)
        self.assertTrue(len_ref == len_reader)

        # compare content (reader)
        for ref, produced in zip(self.data, out_reader):
            n_frames, n_atoms = ref.shape[0], ref.shape[1]
            flat_ref = ref.reshape((n_frames, n_atoms * 3))
            self.assertTrue(np.allclose(flat_ref[::stride, :], produced, atol=1E-3))
def test_partial_fit(self):
    """Feed TICA trajectory by trajectory and compare with a one-shot estimate."""
    from pyemma.coordinates import source

    reader = source(self.trajnames, top=self.temppdb)
    reader_output = reader.get_output()

    variants = (
        {'kinetic_map': False},
        {'kinetic_map': True},
        {'kinetic_map': False, 'commute_map': True},
    )
    for output_params in variants:
        params = {'lag': 10, 'dim': self.dim}
        params.update(output_params)

        tica_obj = tica(**params)

        tica_obj.partial_fit(reader_output[0])
        assert not tica_obj._estimated
        # access eigenvectors to force diagonalization
        tica_obj.eigenvectors
        assert tica_obj._estimated

        tica_obj.partial_fit(reader_output[1])
        assert not tica_obj._estimated
        # accessing the eigenvalues is checked to set the estimated flag again
        tica_obj.eigenvalues
        assert tica_obj._estimated

        for remaining in reader_output[2:]:
            tica_obj.partial_fit(remaining)

        # reference
        ref = tica(reader, **params)

        for attribute in ('cov', 'cov_tau', 'eigenvalues'):
            np.testing.assert_allclose(
                getattr(tica_obj, attribute), getattr(ref, attribute), atol=1e-15)
def setUp(self):
    """Prepare reader, random frame sets and output paths for the save_trajs tests.

    Sets ``eps``, ``pdbfile``, ``trajfiles``, ``sets`` (two random arrays of
    (trajectory index, frame index) rows), ``subdir`` (a fresh temporary
    directory), ``reader`` and the two lists of output file names.
    """
    self.eps = 1e-6
    path = os.path.join(os.path.split(__file__)[0], 'data')
    self.pdbfile = os.path.join(path, 'bpti_ca.pdb')
    self.trajfiles = [os.path.join(path, 'bpti_001-033.xtc'),
                      os.path.join(path, 'bpti_034-066.xtc'),
                      os.path.join(path, 'bpti_067-100.xtc')
                      ]

    # Create random sets of files and frames to be retrieved from trajfiles
    n_members_set1 = 10
    n_members_set2 = 20
    set_1 = np.vstack((np.random.permutation([0, 2] * n_members_set1)[:n_members_set1],
                       np.random.randint(32, size=n_members_set1))).T
    set_2 = np.vstack((np.random.permutation([0, 2] * n_members_set2)[:n_members_set2],
                       np.random.randint(32, size=n_members_set2))).T
    self.sets = [set_1, set_2]

    self.subdir = tempfile.mkdtemp(suffix='save_trajs_test')

    # Instantiate the reader
    self.reader = coor.source(self.trajfiles, top=self.pdbfile)
    self.reader.chunksize = 10

    # Join with the directory instead of concatenating strings: the mkdtemp
    # suffix has no trailing separator, so concatenation would place the
    # files next to the temporary directory rather than inside it.
    # ``range`` replaces ``xrange`` so this also runs on Python 3.
    self.n_pass_files = [os.path.join(self.subdir, 'n_pass.set_%06u.xtc' % ii)
                         for ii in range(len(self.sets))]
    self.one_pass_files = [os.path.join(self.subdir, '1_pass.set_%06u.xtc' % ii)
                           for ii in range(len(self.sets))]
def DoubleProducts(Y1, Y2, filename, U=None):
    """Evaluate all products between two given time series.

    Optionally, a linear transformation of the product basis is computed
    instead. One ``.npy`` file per trajectory is written, named
    ``filename + "_%d.npy"``.

    Parameters
    ----------
    Y1, Y2 : pyemma-reader
        Time series of basis functions. Both are iterated in lockstep.
    filename : str
        Name prefix used to save the data for the product time series.
    U : ndarray, shape (r, s), optional
        ``r`` must equal the product of the dimensions of ``Y1`` and ``Y2``;
        ``s`` is the number of linear combinations to be extracted.

    Returns
    -------
    pyemma-reader
        Reader over the written files, with the chunksize of ``Y1``.
    """
    # Dimensions of both time series, of the product basis and of the output:
    r1 = Y1.dimension()
    r2 = Y2.dimension()
    r = r1 * r2
    ro = r if U is None else U.shape[1]

    file_names = []

    def _save_trajectory(index, pieces):
        # Stack the pieces of one trajectory and write them to disk.
        name = filename + "_%d.npy" % index
        np.save(name, np.vstack(pieces))
        file_names.append(name)

    # Pieces of the current trajectory. They are collected in a list and
    # stacked once per trajectory, which avoids re-copying the accumulated
    # array for every chunk. The empty float array fixes the column count
    # and result dtype even if no chunk arrives.
    q = 0
    pieces = [np.zeros((0, ro))]

    # Compute the products chunk by chunk:
    for chunk1, chunk2 in zip(Y1.iterator(), Y2.iterator()):
        # Get the trajectory number and the data:
        traj_id = chunk1[0]
        piece0 = chunk1[1]
        piece1 = chunk2[1]
        # Check if the last trajectory is finished:
        if traj_id > q:
            _save_trajectory(q, pieces)
            pieces = [np.zeros((0, ro))]
            q += 1
        # Compute all the products (outer product per frame):
        chunkeval = np.einsum('ijk,imk->ijm',
                              piece0[:, :, np.newaxis], piece1[:, :, np.newaxis])
        chunkeval = np.reshape(chunkeval, (chunkeval.shape[0], r))
        # Apply linear transform if necessary:
        if U is not None:
            chunkeval = np.dot(chunkeval, U)
        pieces.append(chunkeval)

    # Save the last trajectory:
    _save_trajectory(q, pieces)

    # Build a new reader and return it:
    reader = pco.source(file_names)
    reader.chunksize = Y1.chunksize
    return reader
def test_MD_data(self):
    """Smoke test: TICA on a tiny MD data set must not raise."""
    # this is too little data to get reasonable results. We just test to avoid exceptions
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    self.pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
    self.xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')

    reader = source(self.xtc_file, top=self.pdb_file)
    # see if this doesn't raise
    ticamini = tica(reader, lag=1)
def partial_fit(self, X):
    """Update the estimate with the data in ``X`` and return ``self``.

    ``X`` is wrapped by ``pyemma.coordinates.source`` and handed to
    ``self._estimate`` in partial mode; the estimated flag is cleared
    afterwards.
    """
    from pyemma.coordinates import source

    self._estimate(source(X), partial=True)
    self._estimated = False

    return self
def __init__(self, topologyfile:str, trajfiles:list, workdir='automsm') -> None:
    """Store absolute input paths, create the work directory and open the source.

    Parameters
    ----------
    topologyfile : str
        Path of the topology file; stored as an absolute path.
    trajfiles : list
        Paths of the trajectory files; stored as absolute paths.
    workdir : str
        Working directory. It is created, including missing parent
        directories, if it does not exist yet.
    """
    super().__init__()
    self.topologyfile = os.path.abspath(topologyfile)
    self.trajfiles = [os.path.abspath(trajfile) for trajfile in trajfiles]
    self.workdir = workdir
    # makedirs with exist_ok avoids the check-then-create race and also
    # works for nested work directories.
    os.makedirs(workdir, exist_ok=True)
    self.src = source(self.trajfiles, top=self.topologyfile)
def test_MD_data(self):
    """Smoke test: TICA on a tiny MD data set must not raise."""
    # this is too little data to get reasonable results. We just test to avoid exceptions
    data_dir = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
    self.pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
    self.xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')

    reader = source(self.xtc_file, top=self.pdb_file)
    # see if this doesn't raise
    ticamini = tica(reader, lag=1)
def _test_base_reader(self, file_format, stride, skip, chunksize, transform):
    """Iterate a reader without lag and compare every chunk with the reference.

    Parameters
    ----------
    file_format
        Key into ``self.test_trajs`` selecting the trajectory files.
    stride, skip
        Forwarded to the iterator; the reference is ``data[skip::stride]``.
    chunksize
        Chunk size for reader and iterator. ``None`` means the limit from
        ``max_chunksize_from_config`` is used for the shape checks; ``0`` is
        checked as "whole trajectory in one chunk".
    transform
        When equal to ``'identity'`` the reader is wrapped by
        ``util.create_transform``.
    """
    # TODO: remove this, when mdtraj-2.0 is released.
    if file_format == 'dcd' and stride > 1 and skip_stride_handling_old_mdtraj:
        raise unittest.SkipTest('wait for mdtraj 2.0')
    trajs = self.test_trajs[file_format]
    reader = coor.source(trajs, top=self.pdb_file, chunksize=chunksize)
    if transform == 'identity':
        reader = util.create_transform(reader)
    if chunksize is not None:
        np.testing.assert_equal(reader.chunksize, chunksize)
    it = reader.iterator(stride=stride, skip=skip, lag=0, chunk=chunksize)
    assert it.chunksize is not None
    if chunksize is None:
        max_frames = max_chunksize_from_config(reader.output_type().itemsize)
        assert it.chunksize <= max_frames
        # now we set the chunksize to max_frames, to be able to compare the actual shapes of iterator output.
        chunksize = max_frames
    # Reference data and the trajectories that still have frames after skip/stride.
    traj_data = [data[skip::stride] for data in self.traj_data]
    valid_itraj = [i for i, x in enumerate(traj_data) if len(x) > 0]
    output = defaultdict(list)
    with it:
        current_itraj = None
        # t: frames seen in the current trajectory; t_total: frames seen overall.
        t = t_total = 0
        for itraj, chunk in it:
            # reset t upon next trajectory
            if itraj != current_itraj:
                current_itraj = itraj
                t = 0
            assert len(chunk) <= chunksize or chunksize == 0, '%s' % it
            # A full chunk is required while at least chunksize frames remain.
            if chunksize != 0 and len(traj_data[itraj]) - t >= chunksize:
                assert len(chunk) == chunksize
            elif chunksize == 0:
                assert len(chunk) == len(traj_data[itraj])
            output[itraj].append(chunk)
            t += len(chunk)
            t_total += len(chunk)
    # Exactly the non-empty trajectories must have produced output.
    for itraj in valid_itraj:
        assert itraj in output.keys()
    for itraj in output.keys():
        assert itraj in valid_itraj
        output[itraj] = np.vstack(output[itraj])
        np.testing.assert_allclose(output[itraj], traj_data[itraj], atol=self.eps)
    assert t_total == sum(len(x) for x in output.values())
    assert t_total == reader.n_frames_total(stride=stride, skip=skip)
def test_in_memory(self):
    """Smoke test: TICA with ``in_memory`` enabled parametrizes and yields output."""
    reader = source(np.random.random((100, 10)))

    tica_obj = api.tica(lag=10, dim=1)
    tica_obj.data_producer = reader
    tica_obj.in_memory = True

    tica_obj.parametrize()
    tica_obj.get_output()
def test_parametrize_with_stride(self):
    """TICA eigenvalues must not exceed one for several strides.

    A ``RuntimeError`` is tolerated only when the lag time is not a multiple
    of the stride.
    """
    tau = 5
    for stride in range(1, 100, 23):
        reader = coor.source(self.trajnames, top=self.temppdb)
        try:
            tica_obj = coor.tica(reader, lag=tau, stride=stride, dim=2)
            # force_eigenvalues_le_one=True enables an internal consistency check in TICA
            within_bound = np.all(tica_obj.eigenvalues <= 1.0 + 1.E-12)
            self.assertTrue(within_bound)
        except RuntimeError:
            assert tau % stride != 0
def test_pass_reader(self):
    """Smoke test: ``_frames_from_file`` accepts an explicitly passed reader."""
    from pyemma.coordinates import source

    reader = source(self.trajfiles, top=self.pdbfile)
    reader.in_memory = True

    # A single (first column, second column) index pair; randint(0, 1) is always 0.
    first_column = np.random.randint(0, 1)
    second_column = np.random.randint(0, 100)
    inds = np.vstack((first_column, second_column)).T

    traj_test = _frames_from_file(reader.filenames, self.pdbfile, inds, reader=reader)
def test_notify_changes_mixin(self):
    """Setting ``data_producer`` must register the consumer as a stream child."""
    from pyemma.coordinates.transform import TICA

    X_t = np.random.random((30, 30))
    upstream = coor.tica(coor.source(np.array(X_t)))
    downstream = TICA(lag=10)

    assert len(upstream._stream_children) == 0
    downstream.data_producer = upstream
    assert upstream._stream_children[0] == downstream
def _test_lagged_reader(self, file_format, stride, skip, chunksize, lag):
    """Iterate a reader with a lag and compare both streams with the reference.

    Parameters
    ----------
    file_format
        Key into ``self.test_trajs`` selecting the trajectory files.
    stride, skip, lag
        Forwarded to the iterator. The unlagged reference is
        ``data[skip::stride]``, the lagged one ``data[skip + lag::stride]``.
    chunksize
        Chunk size for reader and iterator. ``None`` means the limit from
        ``max_chunksize_from_config`` is used for the shape checks; ``0`` is
        checked as "whole trajectory in one chunk".
    """
    # TODO: remove this, when mdtraj-2.0 is released.
    if file_format == 'dcd' and stride > 1 and skip_stride_handling_old_mdtraj:
        raise unittest.SkipTest('wait for mdtraj 2.0')
    trajs = self.test_trajs[file_format]
    reader = coor.source(trajs, top=self.pdb_file, chunksize=chunksize)
    it = reader.iterator(stride=stride, skip=skip, lag=lag, chunk=chunksize)
    traj_data = [data[skip::stride] for data in self.traj_data]
    traj_data_lagged = [data[skip + lag::stride] for data in self.traj_data]
    # Only trajectories with lagged frames left are expected in the output.
    valid_itrajs = [i for i, x in enumerate(traj_data_lagged) if len(x) > 0]
    assert it.chunksize is not None
    if chunksize is None:
        chunksize = max_chunksize_from_config(reader.output_type().itemsize)
    with it:
        current_itraj = None
        # t: frames seen in the current trajectory; t_total: frames seen overall.
        t = t_total = 0
        collected = defaultdict(list)
        collected_lag = defaultdict(list)
        for itraj, chunk, chunk_lagged in it:
            # reset t upon next trajectory
            if itraj != current_itraj:
                current_itraj = itraj
                t = 0
            assert len(chunk) <= chunksize or chunksize == 0
            if chunksize != 0 and len(traj_data[itraj]) - t >= chunksize:
                assert len(chunk) <= chunksize
            elif chunksize == 0:
                assert len(chunk) == len(chunk_lagged) == len(traj_data_lagged[itraj])
            collected[itraj].append(chunk)
            collected_lag[itraj].append(chunk_lagged)
            t += len(chunk)
            t_total += len(chunk)
    for itraj in valid_itrajs:
        assert itraj in collected.keys()
        assert itraj in collected_lag.keys()
    assert set(collected.keys()) == set(collected_lag.keys())
    for itraj in collected.keys():
        assert itraj in valid_itrajs
        collected[itraj] = np.vstack(collected[itraj])
        collected_lag[itraj] = np.vstack(collected_lag[itraj])
        # unlagged data is truncated to the length of the lagged data.
        max_len = len(traj_data_lagged[itraj])
        np.testing.assert_allclose(collected[itraj], traj_data[itraj][:max_len], atol=self.eps)
        np.testing.assert_allclose(collected_lag[itraj], traj_data_lagged[itraj], atol=self.eps)
    assert t_total == sum(len(x) for x in collected.values())
    assert t_total == reader.n_frames_total(stride=stride, skip=skip+lag)
def setUpClass(cls):
    """Build a source -> TICA -> k-means chain on the BPTI data and save it.

    The path of the saved model is stored in ``cls.model_file``.
    """
    from pyemma.datasets import get_bpti_test_data

    bpti = get_bpti_test_data()
    reader = source(bpti['trajs'], top=bpti['top'])
    tica_obj = tica(reader, lag=1)
    clustering = cluster_kmeans(tica_obj)

    cls.model_file = tempfile.mktemp()
    clustering.save(cls.model_file, save_streaming_chain=True)
def Reweight(f,pi,filename,minval):
    """Reweight a basis trajectory by the inverse square root of pi.

    One ``.npy`` file per trajectory is written, named
    ``filename + "_%d.npy"``.

    Parameters
    ----------
    f : pyemma-reader
        Contains the evaluation of the basis functions.
    pi : pyemma-reader
        Contains the evaluation of the stationary distribution. Iterated in
        lockstep with ``f``.
    filename : str
        Filename prefix for the evaluation files to be produced.
    minval : float
        Minimal value (greater than zero) allowed for the stationary
        distribution. Rows whose first column is smaller than ``minval``
        are replaced by ``minval``.

    Returns
    -------
    pyemma-reader
        The reweighted basis, with the chunksize of ``f``.
    """
    # Get the basis set size:
    r0 = f.dimension()

    file_names = []

    def _save_trajectory(index, pieces):
        # Stack the pieces of one trajectory and write them to disk.
        name = filename + "_%d.npy" % index
        np.save(name, np.vstack(pieces))
        file_names.append(name)

    # Pieces of the current trajectory. They are collected in a list and
    # stacked once per trajectory, which avoids re-copying the accumulated
    # array for every chunk. The empty float array fixes the column count
    # and result dtype even if no chunk arrives.
    q = 0
    pieces = [np.zeros((0, r0))]

    # Process both time series chunk by chunk:
    for chunk_f, chunk_pi in zip(f.iterator(), pi.iterator()):
        # Get the trajectory number and private copies of the data:
        traj_id = chunk_f[0]
        piece0 = np.copy(chunk_f[1])
        piece1 = np.copy(chunk_pi[1])
        # Check if the last trajectory is finished:
        if traj_id > q:
            _save_trajectory(q, pieces)
            pieces = [np.zeros((0, r0))]
            q += 1
        # Replace too small and negative values:
        minind = piece1[:, 0] < minval
        piece1[minind, :] = minval
        # Re-weight the basis functions:
        pieces.append(piece0 / np.sqrt(piece1))

    # Save the last trajectory:
    _save_trajectory(q, pieces)

    # Build a new reader and return it:
    reader = pco.source(file_names)
    reader.chunksize = f.chunksize
    return reader
def test_with_fragmented_reader(self):
    """``save_traj`` on a partly fragmented reader must match the reference frames."""
    # intentionally group bpti dataset to a fake fragmented traj
    first, second, third = self.trajfiles[0], self.trajfiles[1], self.trajfiles[2]
    fragmented_reader = coor.source([[first, second], third], top=self.pdbfile)

    traj = save_traj(fragmented_reader, self.sets, None)
    traj_ref = save_traj_w_md_load_frame(self.reader, self.sets)

    # Check for diffs
    comparison = compare_coords_md_trajectory_objects(traj, traj_ref, atom=0)
    found_diff, errmsg = comparison
    self.assertFalse(found_diff, errmsg)
def pyemma_feat(args):
    """Featurize one trajectory with a featurizer method chosen by name.

    Parameters
    ----------
    args : tuple
        ``(irow, featurizer_name, tops, indices)`` where ``irow`` is an
        ``(i, row)`` pair, ``row`` provides ``'traj_fn'`` and ``'top_fn'``,
        ``tops`` maps topology file names to topologies, and ``indices`` is
        passed to the featurizer method as ``indexes``.

    Returns
    -------
    tuple or None
        ``(i, feat_traj)`` on success. ``None`` (after printing a message)
        when the featurizer has no method called ``featurizer_name``.
    """
    irow, featurizer_name, tops, indices = args
    i, row = irow
    traj, top = row['traj_fn'], tops[row['top_fn']]
    feat = featurizer(top)

    # Only the lookup is guarded: an AttributeError raised while adding the
    # feature or reading the data is a different problem and must not be
    # reported as an unknown featurizer.
    try:
        adder = getattr(feat, featurizer_name)
    except AttributeError:
        print("pyEMMA doesn't have {} as a featurizer".format(featurizer_name))
        return None

    adder(indexes=indices, cossin=True)
    feat_traj = np.squeeze(source(traj, features=feat).get_output(), axis=0)
    return i, feat_traj
def SaveEVFrames(dt, ev_traj, c, d, traj_inp=None, filename=None, topfile=None, nframes=None):
    """Select frames close to eigenfunction centers and optionally save them.

    Parameters
    ----------
    dt
        Physical time step; selected frame indices are multiplied by it.
    ev_traj : list
        Eigenfunction trajectories, passed to ``pco.source``.
    c : ndarray, shape (nc, M)
        Centers.
    d : ndarray, shape (nc,)
        Admissible distances to the centers.
    traj_inp : optional
        Underlying md-trajectories, passed to ``pco.save_traj``.
    filename : str, optional
        Name prefix of the center trajectories
        (``filename + "Center%d.xtc"``).
    topfile : str, optional
        Topology file.
    nframes : int, optional
        Number of frames drawn at random (with ``np.random.choice``) per
        center and per trajectory. ``None`` keeps all admissible frames.

    Returns
    -------
    list
        One entry per center; each entry is a list with one integer array of
        shape (n_selected, 2) per trajectory, holding the trajectory index
        and the scaled frame index.

    Notes
    -----
    Frames are written only if ``traj_inp``, ``filename`` and ``topfile``
    are all given. A trajectory without admissible frames contributes an
    empty index array, whether or not ``nframes`` is set.
    """
    # Get the number of trajectories:
    ntraj = len(ev_traj)
    # Get the number of centers and eigenfunctions:
    nc, M = c.shape

    # Create a reader of eigenfunction data:
    ef = pco.source(ev_traj)
    ef.chunksize = np.min(ef.trajectory_lengths())
    # Get the output into memory, leaving out the first ef:
    psidata = ef.get_output(dimensions=np.arange(1, M + 1, dtype=int))

    save_frames = (traj_inp is not None and filename is not None
                   and topfile is not None)

    cindices = []
    # Loop over the centers:
    for i in range(nc):
        # Create a list of possible frames:
        indices = []
        # Loop over the trajectory files:
        for itraj in range(ntraj):
            # Get the data for this traj:
            mdata = psidata[itraj]
            # Get the admissible frames for this trajectory:
            mind = np.where(np.any(np.abs(mdata - c[i, :]) <= d[i], axis=1))[0]
            # Make a random selection. np.random.choice raises on an empty
            # candidate set, so an empty selection is passed through as is.
            if nframes is not None and mind.size > 0:
                mind = np.random.choice(mind, (nframes,))
            mind = dt * mind
            # Put the information together:
            mindices = np.zeros((mind.shape[0], 2), dtype=int)
            mindices[:, 0] = itraj
            mindices[:, 1] = mind
            indices.append(mindices)
        # Save to traj:
        if save_frames:
            pco.save_traj(traj_inp, indices,
                          outfile=filename + "Center%d.xtc" % i, topfile=topfile)
        cindices.append(indices)
    return cindices
def CreateEVHistogram(ev_traj, bins, filename, m=np.array([1]), rg=None, kb=8.314e-3, T=300):
    """Create a histogram of one or two eigenfunctions and save the figure.

    Parameters
    ----------
    ev_traj : list
        Eigenfunction trajectories, passed to ``pco.source``.
    bins
        Bin specification forwarded to ``plt.hist`` / ``np.histogram2d``.
    filename : str
        File the figure is saved to.
    m : ndarray
        Indices of eigenfunctions to be histogrammed. One index gives a 1d
        histogram (by default index 1, the second eigenfunction); two
        indices give a 2d free-energy contour plot. Any other length only
        prints a message.
    rg : optional
        Range forwarded to ``plt.hist`` / ``np.histogram2d``.
    kb : float
        Constant multiplied with ``T`` to scale the negative log-probability.
    T : float
        Temperature multiplied with ``kb``.
    """
    # Get the number of trajectories:
    ntraj = len(ev_traj)
    # Create a reader of eigenfunction data:
    ef = pco.source(ev_traj)
    ef.chunksize = np.min(ef.trajectory_lengths())

    n_selected = m.shape[0]
    if n_selected in (1, 2):
        psidata = ef.get_output(dimensions=m)
        # Stack all data on top of each other. The empty float array fixes
        # the column count; the loop variable no longer shadows ``m``.
        psi = np.vstack([np.zeros((0, n_selected))]
                        + [psidata[itraj] for itraj in range(ntraj)])

    # Create the histogram depending on m:
    if n_selected == 1:
        # Show the histogram:
        plt.figure()
        plt.hist(psi, bins=bins, range=rg)
    elif n_selected == 2:
        # Show the histogram:
        plt.figure()
        # ``density`` replaces the removed ``normed`` keyword.
        H, xe, ye = np.histogram2d(psi[:, 0], psi[:, 1], bins=bins, range=rg,
                                   density=True)
        # Make it a free energy plot: density times bin area gives the
        # probability per bin.
        binwx = xe[1] - xe[0]
        binwy = ye[1] - ye[0]
        H = H * binwx * binwy
        # Empty bins get the smallest non-zero probability so the log is finite.
        ind = np.nonzero(H)
        thres = np.min(H[ind[0], ind[1]])
        H2 = thres * np.ones(H.shape)
        H2[ind[0], ind[1]] = H[ind[0], ind[1]]
        H2 = -kb * T * np.log(H2)
        X, Y = np.meshgrid(0.5 * (xe[1:] + xe[:-1]), 0.5 * (ye[1:] + ye[:-1]))
        plt.contourf(X, Y, H2.transpose())
        plt.colorbar()
    else:
        # Call form of print: valid on Python 2 and Python 3.
        print("Selection in m could not be used.")
    plt.savefig(filename)
    plt.show()
def test_feature_correlation_MD(self):
    """Compare ``feature_TIC_correlation`` with a directly computed correlation."""
    # Copying from the test_MD_data
    data_dir = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
    self.pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
    self.xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')

    reader = source(self.xtc_file, top=self.pdb_file)
    ticamini = tica(reader, lag=1, kinetic_map=False)

    # Input features and projected coordinates of the first trajectory.
    feature_traj = ticamini.data_producer.get_output()[0]
    tica_traj = ticamini.get_output()[0]

    test_corr = ticamini.feature_TIC_correlation
    true_corr = mycorrcoef(feature_traj, tica_traj, ticamini.lag)

    np.testing.assert_allclose(test_corr, true_corr, atol=1.E-8)
def partial_fit(self, X):
    """Incrementally update the estimates and return ``self``.

    Parameters
    ----------
    X : array, list of arrays, PyEMMA reader
        input data.

    Notes
    -----
    The estimator is flagged as estimated afterwards.
    """
    from pyemma.coordinates import source

    reader = source(X)
    self._estimate(reader, partial_fit=True)
    self._estimated = True

    return self
def multi_temperature_tram(feat, trajfiles, temperatures, dtrajs=None, stride=1, tica_lag=100, keep_tica_dims=20, n_clusters=100, tram_lag=100, engfile="Etot.dat", usecols=(1,), kb=0.0083145):
    """Run a multi-temperature TRAM estimation.

    Parameters
    ----------
    feat : obj, pyemma.coor.featurizer
        Featurizer object that already has the appropriate features added.
    trajfiles : list
        Names of trajectories to include in estimation.
    temperatures : list
        Temperatures of corresponding trajectories.
    dtrajs : optional
        Discrete trajectories. When ``None`` they are computed here with
        TICA followed by k-means.
    stride : int
        Number of frames to skip in tica and clustering.
    tica_lag : int
        Lagtime to use for constructing tica.
    keep_tica_dims : int
        Number of dimensions to keep from tica. Somewhat ambiguous.
    n_clusters : int
        Number of clusters for kmeans. Somewhat ambiguous.
    tram_lag : int
        Lag passed to the TRAM estimator.
    engfile : str
        Energy file name, looked up in the directory of each trajectory.
        Names ending in ``npy`` are read with ``np.load``, all others with
        ``np.loadtxt``.
    usecols : tuple
        Columns read by ``np.loadtxt``.
    kb : float
        Constant used to convert temperatures to energies.

    Returns
    -------
    tuple
        ``(dirs, dtrajs, tram)``.
    """
    dirs = [os.path.dirname(name) for name in trajfiles]
    beta = [1./(kb*temp) for temp in temperatures]

    if dtrajs is None:
        inp = coor.source(trajfiles, feat)
        tica_obj = coor.tica(inp, lag=tica_lag, dim=keep_tica_dims, stride=stride)
        projected = tica_obj.get_output()
        clustering = coor.cluster_kmeans(data=projected, k=n_clusters, stride=stride)
        dtrajs = clustering.dtrajs

    # Pick the loader once; the file format depends only on the file name.
    if engfile.endswith("npy"):
        def _load_energy(path):
            return np.load(path)
    else:
        def _load_energy(path):
            return np.loadtxt(path, usecols=usecols)

    # dimensionless energy
    energy_trajs = [
        beta[idx]*_load_energy("{}/{}".format(directory, engfile))
        for idx, directory in enumerate(dirs)
    ]

    # One constant kb*T value per energy frame.
    temp_trajs = [
        kb*temperatures[idx]*np.ones(energy_trajs[idx].shape[0], float)
        for idx in range(len(dirs))
    ]

    # dTRAM approach
    tram = thermo.estimate_multi_temperature(
        energy_trajs, temp_trajs, dtrajs,
        energy_unit='kT', temp_unit='kT', estimator='tram',
        lag=tram_lag, maxiter=2000000, maxerr=1e-10)

    return dirs, dtrajs, tram
def ApplyLinearTransform(Y, U, filename):
    """Apply the linear transformation U to the time series given by Y.

    The transformed data are written to disk, one ``.npy`` file per
    trajectory, and a new reader over those files is returned.

    Parameters
    ----------
    Y : pyemma-reader
        Contains the time series of basis functions.
    U : ndarray, shape (d, s)
        d must be identical to the dimension of Y and s is the number of
        linear combinations to be extracted.
    filename : str
        Name to be used to save the data for the new time series;
        trajectory q is stored in ``filename + "_q.npy"``.

    Returns
    -------
    pyemma-reader
        Contains the time series of all the linear transforms applied to Y.
        Its ``chunksize`` is copied from Y.
    """
    # Dimension of the new time series:
    n_out = U.shape[1]
    file_names = []

    def _save_trajectory(index, chunks):
        # Stack all chunks of one trajectory in a single call and save them.
        # The leading empty float array keeps the (0, n_out) shape for a
        # trajectory without chunks and the dtype promotion of the result.
        out_name = filename + "_%d.npy" % index
        np.save(out_name, np.vstack([np.zeros((0, n_out))] + chunks))
        file_names.append(out_name)

    q = 0
    chunks = []
    # Compute the products chunk by chunk:
    for piece in Y.iterator():
        # Get the trajectory number and the data:
        traj_id = piece[0]
        data = piece[1]
        # Check if the last trajectory is finished:
        # NOTE(review): this advances q by one per chunk, i.e. it assumes
        # the trajectory index never jumps by more than one between
        # consecutive chunks -- confirm for readers with empty trajectories.
        if traj_id > q:
            _save_trajectory(q, chunks)
            chunks = []
            q += 1
        # Apply the linear transform; stacking is deferred until the
        # trajectory is complete to avoid re-copying the growing array.
        chunks.append(np.dot(data, U))
    # Save the last trajectory:
    _save_trajectory(q, chunks)

    # Build a new reader and return it:
    reader = pco.source(file_names)
    reader.chunksize = Y.chunksize
    return reader
# Dimension (number of coordinates; one basis reader is built per coordinate):
d = 16

''' 2. Basis functions and directories:'''
# Parenthesised form: prints the same text under Python 2 and is also valid
# Python 3, where the bare print statement is a SyntaxError.
print("Preparing data:")
# Path of basis evaluations:
basispath = fundamental_path + "TTApplications/ALA10TT2/Evaluations/"
# Number of trajectories:
ntraj = 6
# List for basis readers:
basis = []
for i in range(d):
    # Create list of evaluation files for this coordinate
    # (one file per trajectory directory Traj0 .. Traj<ntraj-1>):
    file_list = [basispath+"Traj%d/Basis%d.npy"%(j,i) for j in range(ntraj)]
    # Create a reader for this basis:
    # NOTE(review): newer PyEMMA releases appear to spell this keyword
    # `chunksize` -- confirm against the installed version.
    ireader = pco.source(file_list,chunk_size=50000)
    # Append it:
    basis.append(ireader)
# Define a directory for intermediate files, interfaces, and results.
# ifilename is a file-name prefix (no trailing separator), the other two
# are directories:
ifacedir = fundamental_path + "TTApplications/ALA10TT2/Interfaces/"
ifilename = fundamental_path + "TTApplications/ALA10TT2/Intermediate/Intermediate"
resdir = fundamental_path + "TTApplications/ALA10TT2/ResultsCG/"

''' 3. Computational Settings:'''
# Lag time:
tau = 40
# Physical time step:
dt = 0.05
# Number of eigenfunctions:
M = 2
scale = 0.3 else: # Use native contact distance as threshold for native pairs. logger.info(" contacts between native pairs") pairs = np.loadtxt("%s/native_contacts.ndx" % dirs[0],dtype=int,skiprows=1) - 1 threshold = np.loadtxt("%s/pairwise_params" % dirs[0],usecols=(4,))[1:2*pairs.shape[0]:2] + 0.1 scale = 0.3 # Featurizer parameterizes a pipeline to read in trajectory in chunks. feat = coor.featurizer(topfile) feat.add_tanh_contacts(pairs,threshold=threshold,scale=scale,periodic=False) # Source trajectories logger.info(" sourcing trajectories: %s" % traj_list.__str__()) inp = coor.source(traj_list, feat) # Stride has a drastic influence on the number of acceptable eigenvalues. logger.info(" computing TICA") tica_obj = coor.tica(inp, lag=lag, stride=stride, var_cutoff=0.9, kinetic_map=True) # Check if eigenvalues go negative at some point. Truncate before that if necessary. logger.info(" TICA done") logger.info(" number of dimensions: %d" % tica_obj.dimension()) if tica_obj.dimension() == 1: keep_dims = 1 else: if sum(tica_obj.eigenvalues < 0) > 0: first_neg_eigval = np.where(tica_obj.eigenvalues < 0)[0][0] keep_dims = min([tica_obj.dimension(),first_neg_eigval]) logger.info(" first negative eigenvalue: %d" % first_neg_eigval)
# Build the discrete trajectories for temperature T, unless a cached
# msm/dtrajs.pkl exists and reclustering was not requested, and leave the
# process inside the "msm" directory afterwards.

# Three run directories per temperature, named T_<T>_1 .. T_<T>_3; the
# topology is read from the first one.
tempdirs = [ "T_{:.2f}_{}".format(T, x) for x in [1,2,3] ]
topfile = tempdirs[0] + "/" + topname
trajfiles = [ x + "/" + trajname for x in tempdirs ]

# add features
feat = coor.featurizer(topfile)
feat, feature_info = util.sbm_contact_features(feat, pairwise_file, n_native_pairs)

if not os.path.exists("msm"):
    os.mkdir("msm")

if (not os.path.exists("msm/dtrajs.pkl")) or recluster:
    # cluster if necessary: TICA on the featurized trajectories, then
    # k-means on the TICA output
    inp = coor.source(trajfiles, feat)
    tica_obj = coor.tica(inp, dim=tica_dims, lag=tica_lag, stride=stride)
    Y = tica_obj.get_output()
    cl = coor.cluster_kmeans(data=Y, k=n_clusters)
    dtrajs = cl.dtrajs

    # Change directory only after the trajectories (relative paths) have
    # been read; from here on "dtrajs.pkl" resolves to msm/dtrajs.pkl.
    os.chdir("msm")
    dirs = [ os.path.basename(os.path.dirname(x)) for x in trajfiles ]
    if not dontsavemsm:
        # Map each run directory name to its discrete trajectory; the extra
        # "dirs" entry records the directory names themselves.
        dtraj_info = { dirs[x]:dtrajs[x] for x in range(len(dirs)) }
        dtraj_info["dirs"] = dirs
        with open("dtrajs.pkl", 'wb') as fhandle:
            pickle.dump(dtraj_info, fhandle)
else:
    # Cached result present: nothing is computed or loaded here, only the
    # working directory is changed.
    # NOTE(review): presumably dtrajs.pkl is read back later -- confirm.
    os.chdir("msm")
traj[0].save_pdb(reference_pdb_filename) ################################################################################ # Initialize featurizer ################################################################################ print('Initializing backbone torsions featurizer...') featurizer = coor.featurizer(reference_pdb_filename) featurizer.add_backbone_torsions() ################################################################################ # Define coordinates source ################################################################################ trajectory_files = glob(os.path.join(source_directory, '*0.h5')) coordinates_source = coor.source(trajectory_files,featurizer) print("There are %d frames total in %d trajectories." % (coordinates_source.n_frames_total(), coordinates_source.number_of_trajectories())) ################################################################################ # Do tICA ################################################################################ print('tICA...') running_tica = coor.tica(lag=100, dim=100) ################################################################################ # Cluster ################################################################################ print('Clustering...') clustering = coor.cluster_kmeans(k=100, stride=50)