def test_length_and_content_feature_reader_and_TICA(self):
    for stride in range(1, 100, 23):
        r = coor.source(self.trajnames, top=self.temppdb)
        t = coor.tica(data=r, lag=2, dim=2)

        # subsample data
        out_tica = t.get_output(stride=stride)
        out_reader = r.get_output(stride=stride)

        # get lengths in different ways
        len_tica = [x.shape[0] for x in out_tica]
        len_reader = [x.shape[0] for x in out_reader]
        len_trajs = t.trajectory_lengths(stride=stride)
        len_ref = [(x.shape[0] - 1) // stride + 1 for x in self.data]

        # compare lengths
        np.testing.assert_equal(len_trajs, len_ref)
        self.assertTrue(len_ref == len_tica)
        self.assertTrue(len_ref == len_reader)

        # compare content (reader)
        for ref_data, test_data in zip(self.data, out_reader):
            ref_data_reshaped = ref_data.reshape((ref_data.shape[0], ref_data.shape[1] * 3))
            self.assertTrue(np.allclose(ref_data_reshaped[::stride, :], test_data, atol=1E-3))
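# A minimal standalone sketch (not part of the original suite) of why the
# reference length above is (n - 1) // stride + 1: it is exactly the number
# of frames that numpy's ``[::stride]`` slicing keeps.
def _sketch_strided_length():
    import numpy as np
    for n in (1, 7, 100):
        for stride in (1, 3, 23):
            assert np.arange(n)[::stride].shape[0] == (n - 1) // stride + 1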
def test_write_to_csv_propagate_filenames(self):
    from pyerna.coordinates import source, tica
    with TemporaryDirectory() as td:
        data = [np.random.random((20, 3))] * 3
        fns = [os.path.join(td, f)
               for f in ('blah.npy', 'blub.npy', 'foo.npy')]
        for x, fn in zip(data, fns):
            np.save(fn, x)
        reader = source(fns)
        assert reader.filenames == fns
        tica_obj = tica(reader, lag=1, dim=2)
        tica_obj.write_to_csv(extension=".exotic", chunksize=3)
        res = sorted(os.path.abspath(x) for x in glob(td + os.path.sep + '*.exotic'))
        self.assertEqual(len(res), len(fns))
        desired_fns = sorted(s.replace('.npy', '.exotic') for s in fns)
        self.assertEqual(res, desired_fns)

        # compare written results with the in-memory transformation
        expected = tica_obj.get_output()
        actual = source([s.replace('.npy', '.exotic') for s in fns]).get_output()
        assert len(actual) == len(fns)
        for a, e in zip(actual, expected):
            np.testing.assert_allclose(a, e)
def test_MD_data(self):
    # This is too little data to get reasonable results; we only test
    # that estimation runs without raising.
    path = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
    self.pdb_file = os.path.join(path, 'bpti_ca.pdb')
    self.xtc_file = os.path.join(path, 'bpti_mini.xtc')
    inp = source(self.xtc_file, top=self.pdb_file)
    # see if this doesn't raise
    ticamini = tica(inp, lag=1)
def setUpClass(cls):
    cls.data = np.ones((10000, 100))
    cls.variable_columns = np.random.choice(100, 10, replace=False)
    cls.data[:, cls.variable_columns] = np.random.rand(10000, 10)
    # Start with one of the constant columns:
    cls.initial_columns = np.setdiff1d(np.arange(cls.data.shape[1]),
                                       cls.variable_columns)[0:1]
    cls.tica_obj = tica(data=cls.data, lag=1)
def test_parametrize_with_stride(self):
    for stride in range(1, 100, 23):
        r = coor.source(self.trajnames, top=self.temppdb)
        tau = 5
        try:
            t = coor.tica(r, lag=tau, stride=stride, dim=2)
            # internal consistency check: TICA eigenvalues must not exceed one
            self.assertTrue(np.all(t.eigenvalues <= 1.0 + 1.E-12))
        except RuntimeError:
            assert tau % stride != 0
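# Note (added context, hedged): the except-branch above reflects the
# constraint that strided estimation only defines time-lagged pairs when the
# lag is a multiple of the stride, so for tau % stride != 0 a RuntimeError is
# the expected outcome rather than a failure.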
def setUpClass(cls):
    from pyerna.datasets import get_bpti_test_data
    d = get_bpti_test_data()
    trajs, top = d['trajs'], d['top']
    s = source(trajs, top=top)
    t = tica(s, lag=1)
    c = cluster_kmeans(t)
    cls.model_file = tempfile.mktemp()
    c.save(cls.model_file, save_streaming_chain=True)
def test_feature_correlation_MD(self):
    # Copying from test_MD_data
    path = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
    self.pdb_file = os.path.join(path, 'bpti_ca.pdb')
    self.xtc_file = os.path.join(path, 'bpti_mini.xtc')
    inp = source(self.xtc_file, top=self.pdb_file)
    ticamini = tica(inp, lag=1, kinetic_map=False)

    feature_traj = ticamini.data_producer.get_output()[0]
    tica_traj = ticamini.get_output()[0]
    test_corr = ticamini.feature_TIC_correlation
    true_corr = mycorrcoef(feature_traj, tica_traj, ticamini.lag)
    np.testing.assert_allclose(test_corr, true_corr, atol=1.E-8)
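# ``mycorrcoef`` is a helper defined elsewhere in this test module. A minimal
# sketch of the quantity the assertions above compare against -- the Pearson
# correlation between every input feature and every TIC -- could look like the
# following; the name ``_corr_features_vs_tics`` is hypothetical and the real
# helper may treat the lag argument differently.
def _corr_features_vs_tics(X, Y):
    import numpy as np
    nx = X.shape[1]
    # np.corrcoef stacks both variable sets; the upper-right block holds the
    # feature-vs-TIC correlations.
    return np.corrcoef(X.T, Y.T)[:nx, nx:]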
def setUpClass(cls):
    from pyerna.datasets import load_2well_discrete
    dw = load_2well_discrete()
    v = dw.dtraj_T100K_dt10[:10000]
    cls.T = v.size
    nstates = 100
    b = np.linspace(-1, 1, nstates)
    sigma = 0.15
    cls.Z = np.zeros((cls.T, nstates))
    for t in range(cls.T):
        for j in range(nstates):
            cls.Z[t, j] = np.exp(-(b[v[t]] - b[j]) ** 2 / (2 * sigma ** 2))
    cls.lag = 10
    cls.tica_obj = tica(data=cls.Z, lag=cls.lag)
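# Aside (sketch, not used by the tests): the nested loop above is equivalent
# to a single vectorized expression over the (frames, states) grid.
def _sketch_vectorized_basis():
    import numpy as np
    v = np.array([0, 3, 7, 3, 9])        # toy discrete trajectory
    b = np.linspace(-1, 1, 10)           # toy bin centers
    sigma = 0.15
    Z_loop = np.zeros((v.size, b.size))
    for t in range(v.size):
        for j in range(b.size):
            Z_loop[t, j] = np.exp(-(b[v[t]] - b[j]) ** 2 / (2 * sigma ** 2))
    Z_vec = np.exp(-(b[v][:, None] - b[None, :]) ** 2 / (2 * sigma ** 2))
    assert np.allclose(Z_loop, Z_vec)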
def test_feature_correlation_data(self):
    # Create features with some correlation
    feature_traj = np.zeros((100, 3))
    feature_traj[:, 0] = np.linspace(-.5, .5, len(feature_traj))
    feature_traj[:, 1] = feature_traj[:, 0] + np.random.randn(len(feature_traj)) * .5
    feature_traj[:, 2] = np.random.randn(len(feature_traj))

    # TICA
    tica_obj = tica(data=feature_traj, dim=3, kinetic_map=False)
    tica_traj = tica_obj.get_output()[0]

    # Compare the estimator's feature/TIC correlations against a reference
    test_corr = tica_obj.feature_TIC_correlation
    true_corr = mycorrcoef(feature_traj, tica_traj, tica_obj.lag)
    np.testing.assert_allclose(test_corr, true_corr, atol=1.E-8)
def setUp(self):
    self.readers = []
    data_dir = pkg_resources.resource_filename('pyerna.coordinates.tests', 'data')
    # three md trajs
    trajs = glob(data_dir + "/bpti_0*.xtc")
    top = os.path.join(data_dir, 'bpti_ca.pdb')
    self.readers.append(source(trajs, top=top))
    self.readers[0].featurizer.add_all()
    ndim = self.readers[0].ndim
    # three random arrays
    lengths = self.readers[0].trajectory_lengths()
    arrays = [np.random.random((length, ndim)) for length in lengths]
    self.readers.append(source(arrays))
    self.readers.append(tica(self.readers[-1], dim=20))
def test_too_short_traj_partial_fit(self):
    data = [np.empty((20, 3)), np.empty((10, 3))]
    lag = 11
    tica_obj = tica(lag=lag)
    from pyerna.util.testing_tools import MockLoggingHandler
    log_handler = MockLoggingHandler()
    import logging
    L = logging.getLogger('pyerna.coordinates.estimation.covariance')
    L.addHandler(log_handler)
    try:
        for x in data:
            tica_obj.partial_fit(x)
        self.assertEqual(len(log_handler.messages['warning']), 1)
        self.assertIn("longer than lag", log_handler.messages['warning'][0])
    finally:
        L.removeHandler(log_handler)
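# Note (added context): with lag=11, the second trajectory (10 frames) is not
# longer than the lag and thus contributes no time-lagged pairs, which is what
# triggers the single "longer than lag" warning asserted above.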
def testChunksizeResultsTica(self):
    chunk = 40
    lag = 100
    np.random.seed(0)
    X = np.random.randn(23000, 3)

    # un-chunked
    d = DataInMemory(X)
    tica_obj = api.tica(data=d, lag=lag, dim=1)
    cov = tica_obj.cov.copy()
    mean = tica_obj.mean.copy()

    # ------- run again with new chunksize -------
    d = DataInMemory(X)
    d.chunksize = chunk
    tica_obj = api.tica(data=d, lag=lag, dim=1)

    np.testing.assert_allclose(tica_obj.mean, mean)
    np.testing.assert_allclose(tica_obj.cov, cov)
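# Sketch (added illustration): the invariance checked above holds because
# streaming estimators accumulate sufficient statistics; e.g. a mean built
# from chunk-wise partial sums equals the mean over the full array no matter
# how the chunks are cut.
def _sketch_chunked_mean():
    import numpy as np
    X = np.random.randn(1000, 3)
    total, n = np.zeros(3), 0
    for chunk in np.array_split(X, 7):
        total += chunk.sum(axis=0)
        n += len(chunk)
    assert np.allclose(total / n, X.mean(axis=0))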
def test_too_short_trajs(self):
    trajs = [np.empty((100, 1))]
    with self.assertRaises(ValueError):
        tica(trajs, lag=100)
def setUpClass(cls):
    # Basis set definition:
    cls.nf = 10
    cls.chi = np.zeros((20, cls.nf), dtype=float)
    for n in range(cls.nf):
        cls.chi[2 * n:2 * (n + 1), n] = 1.0

    # Load simulations:
    f = np.load(pkg_resources.resource_filename(__name__, "data/test_data_koopman.npz"))
    trajs = [f[key] for key in f.keys()]
    cls.data = [cls.chi[traj, :] for traj in trajs]

    # Lag time:
    cls.tau = 10
    # Truncation for small eigenvalues:
    cls.epsilon = 1e-6

    # Compute the means:
    cls.mean_x = np.zeros(cls.nf)
    cls.mean_y = np.zeros(cls.nf)
    cls.frames = 0
    for traj in cls.data:
        cls.mean_x += np.sum(traj[:-cls.tau, :], axis=0)
        cls.mean_y += np.sum(traj[cls.tau:, :], axis=0)
        cls.frames += traj[:-cls.tau, :].shape[0]
    cls.mean_x *= (1.0 / cls.frames)
    cls.mean_y *= (1.0 / cls.frames)
    cls.mean_rev = 0.5 * (cls.mean_x + cls.mean_y)

    # Compute correlations:
    cls.C0 = np.zeros((cls.nf, cls.nf))
    cls.Ct = np.zeros((cls.nf, cls.nf))
    cls.C0_rev = np.zeros((cls.nf, cls.nf))
    cls.Ct_rev = np.zeros((cls.nf, cls.nf))
    for traj in cls.data:
        itraj = (traj - cls.mean_x[None, :]).copy()
        cls.C0 += np.dot(itraj[:-cls.tau, :].T, itraj[:-cls.tau, :])
        cls.Ct += np.dot(itraj[:-cls.tau, :].T, itraj[cls.tau:, :])
        itraj = (traj - cls.mean_rev[None, :]).copy()
        cls.C0_rev += np.dot(itraj[:-cls.tau, :].T, itraj[:-cls.tau, :]) \
            + np.dot(itraj[cls.tau:, :].T, itraj[cls.tau:, :])
        cls.Ct_rev += np.dot(itraj[:-cls.tau, :].T, itraj[cls.tau:, :]) \
            + np.dot(itraj[cls.tau:, :].T, itraj[:-cls.tau, :])
    cls.C0 *= (1.0 / cls.frames)
    cls.Ct *= (1.0 / cls.frames)
    cls.C0_rev *= (1.0 / (2 * cls.frames))
    cls.Ct_rev *= (1.0 / (2 * cls.frames))

    # Compute whitening transformation:
    cls.R = transform_C0(cls.C0, cls.epsilon)
    cls.Rrev = transform_C0(cls.C0_rev, cls.epsilon)

    # Perform non-reversible diagonalization:
    cls.ln, cls.Rn = scl.eig(np.dot(cls.R.T, np.dot(cls.Ct, cls.R)))
    cls.ln, cls.Rn = sort_by_norm(cls.ln, cls.Rn)
    cls.Rn = np.dot(cls.R, cls.Rn)
    cls.Rn = scale_eigenvectors(cls.Rn)
    cls.tsn = -cls.tau / np.log(np.abs(cls.ln))

    # ... and its reversible (symmetrized) counterpart:
    cls.ls, cls.Rs = scl.eig(np.dot(cls.Rrev.T, np.dot(cls.Ct_rev, cls.Rrev)))
    cls.ls, cls.Rs = sort_by_norm(cls.ls, cls.Rs)
    cls.Rs = np.dot(cls.Rrev, cls.Rs)
    cls.Rs = scale_eigenvectors(cls.Rs)
    cls.tss = -cls.tau / np.log(np.abs(cls.ls))

    # Compute non-reversible Koopman matrix:
    cls.K = np.dot(cls.R.T, np.dot(cls.Ct, cls.R))
    cls.K = np.vstack((cls.K, np.dot((cls.mean_y - cls.mean_x), cls.R)))
    cls.K = np.hstack((cls.K, np.eye(cls.K.shape[0], 1, k=-cls.K.shape[0] + 1)))
    cls.N1 = cls.K.shape[0]

    # Compute u-vector:
    ln, Un = scl.eig(cls.K.T)
    ln, Un = sort_by_norm(ln, Un)
    cls.u = np.real(Un[:, 0])
    v = np.eye(cls.N1, 1, k=-cls.N1 + 1)[:, 0]
    cls.u *= (1.0 / np.dot(cls.u, v))

    # Prepare weight object:
    u_mod = cls.u.copy()
    N = cls.R.shape[0]
    u_input = np.zeros(N + 1)
    u_input[0:N] = cls.R.dot(u_mod[0:-1])  # in input basis
    u_input[N] = u_mod[-1] - cls.mean_x.dot(cls.R.dot(u_mod[0:-1]))
    weight_obj = _KoopmanWeights(u_input[:-1], u_input[-1])

    # Compute weights over all data points:
    cls.wtraj = []
    for traj in cls.data:
        traj = np.dot((traj - cls.mean_x[None, :]), cls.R).copy()
        traj = np.hstack((traj, np.ones((traj.shape[0], 1))))
        cls.wtraj.append(np.dot(traj, cls.u))

    # Compute equilibrium mean:
    cls.mean_eq = np.zeros(cls.nf)
    for qwtraj, traj in zip(cls.wtraj, cls.data):
        cls.mean_eq += np.sum((qwtraj[:-cls.tau, None] * traj[:-cls.tau, :]), axis=0) \
            + np.sum((qwtraj[:-cls.tau, None] * traj[cls.tau:, :]), axis=0)
    cls.mean_eq *= (1.0 / (2 * cls.frames))

    # Compute reversible C0, Ct:
    cls.C0_eq = np.zeros((cls.N1, cls.N1))
    cls.Ct_eq = np.zeros((cls.N1, cls.N1))
    for qwtraj, traj in zip(cls.wtraj, cls.data):
        traj = (traj - cls.mean_eq[None, :]).copy()
        cls.C0_eq += np.dot((qwtraj[:-cls.tau, None] * traj[:-cls.tau, :]).T, traj[:-cls.tau, :]) \
            + np.dot((qwtraj[:-cls.tau, None] * traj[cls.tau:, :]).T, traj[cls.tau:, :])
        cls.Ct_eq += np.dot((qwtraj[:-cls.tau, None] * traj[:-cls.tau, :]).T, traj[cls.tau:, :]) \
            + np.dot((qwtraj[:-cls.tau, None] * traj[cls.tau:, :]).T, traj[:-cls.tau, :])
    cls.C0_eq *= (1.0 / (2 * cls.frames))
    cls.Ct_eq *= (1.0 / (2 * cls.frames))

    # Solve re-weighted eigenvalue problem:
    S = transform_C0(cls.C0_eq, cls.epsilon)
    Ct_S = np.dot(S.T, np.dot(cls.Ct_eq, S))
    # Compute its eigenvalues:
    cls.lr, cls.Rr = scl.eigh(Ct_S)
    cls.lr, cls.Rr = sort_by_norm(cls.lr, cls.Rr)
    cls.Rr = np.dot(S, cls.Rr)
    cls.Rr = scale_eigenvectors(cls.Rr)
    cls.tsr = -cls.tau / np.log(np.abs(cls.lr))

    # Set up the models:
    cls.koop_rev = tica(cls.data, lag=cls.tau, kinetic_map=False)
    cls.koop_eq = tica(cls.data, lag=cls.tau, kinetic_map=False, weights='koopman')
    # Test the model by supplying weights directly:
    cls.koop_eq_direct = tica(cls.data, lag=cls.tau, weights=weight_obj, kinetic_map=False)
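# Sketch (added illustration, hedged): the recurring pattern in setUpClass --
# whiten with R so that R.T @ C0 @ R becomes the identity on the retained
# subspace, diagonalize the whitened time-lagged covariance, and rotate the
# eigenvectors back -- solves the generalized eigenvalue problem
# Ct v = lambda * C0 v. Here a plain symmetric eigendecomposition stands in
# for the ``transform_C0`` helper used above.
def _sketch_whitened_eigenproblem():
    import numpy as np
    rng = np.random.RandomState(42)
    A = rng.randn(50, 5)
    C0 = A.T @ A / 50.0                  # toy instantaneous covariance (pos. def.)
    Ct = 0.5 * C0 + 0.05 * np.eye(5)     # toy symmetric time-lagged covariance
    s, U = np.linalg.eigh(C0)
    R = U / np.sqrt(s)                   # whitening: R.T @ C0 @ R == identity
    lam, V = np.linalg.eigh(R.T @ Ct @ R)
    vecs = R @ V                         # eigenvectors in the original basis
    # verify the generalized eigenvalue equation column by column
    assert np.allclose(Ct @ vecs, (C0 @ vecs) * lam, atol=1e-10)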
def test_describe(self):
    desc = self.tica_obj.describe()
    assert types.is_string(desc) or types.is_list_of_string(desc)
    # describe on an empty estimator should also work
    tica(lag=1).describe()
def test_default_cs(self):
    t = tica(chunksize=None)
    assert t.default_chunksize == t.chunksize == t._FALLBACK_CHUNKSIZE
def test_commute_map(self):
    # smoke test: construct TICA with the commute map enabled
    # (the kinetic map has to be disabled explicitly in that case)
    tica(np.arange(100), commute_map=True, kinetic_map=False)
def test_write_h5(self):
    from pyerna.coordinates import tica
    dim = 10
    data = [np.random.random((np.random.randint(50, 150), dim)) for _ in range(4)]
    tica_obj = tica(data, lag=1)
    import tempfile
    out = tempfile.mktemp()
    group = '/test'

    def perform(chunksize, stride):
        try:
            transformed_output = tica_obj.get_output(chunk=chunksize, stride=stride)
            tica_obj.write_to_hdf5(out, group=group, chunksize=chunksize, stride=stride)
            import h5py
            with h5py.File(out, mode='r') as f:
                assert len(f[group]) == len(data)
                for (itraj, actual), desired in zip(f[group].items(), transformed_output):
                    np.testing.assert_equal(actual, desired,
                                            err_msg='failed for cs=%s, stride=%s'
                                                    % (chunksize, stride))
        finally:
            os.remove(out)

    for cs in [0, 1, 3, 10, 42, 50]:
        for s in [1, 2, 3, 10]:
            perform(cs, s)

    # test overwrite
    try:
        tica_obj.write_to_hdf5(out, group=group)
        with self.assertRaises(ValueError):
            tica_obj.write_to_hdf5(out, group=group)
        os.remove(out)

        tica_obj.write_to_hdf5(out)
        with self.assertRaises(ValueError) as ctx:
            tica_obj.write_to_hdf5(out)
        assert 'Refusing to overwrite data' in ctx.exception.args[0]
        os.remove(out)

        tica_obj.write_to_hdf5(out, group=group)
        tica_obj.write_to_hdf5(out, group=group, overwrite=True)
        os.remove(out)

        import h5py
        with h5py.File(out, mode='a') as f:
            f.create_group('empty').create_dataset('0000', shape=(1, 1))
        with self.assertRaises(ValueError):
            tica_obj.write_to_hdf5(out, group='empty')
        tica_obj.write_to_hdf5(out, group='empty', overwrite=True)
    finally:
        os.remove(out)