def test_stride(self):
    reader = DataInMemory(self.d)
    strides = [1, 2, 3, 4, 5, 6, 7, 10, 11, 21, 23]
    for s in strides:
        output = reader.get_output(stride=s)[0]
        expected = self.d[::s]
        np.testing.assert_allclose(output, expected,
                                   err_msg="not equal for stride=%i" % s)
def testDataArray(self):
    frames_per_traj = 100
    dim = 3
    data = np.random.random((frames_per_traj, dim))
    d = DataInMemory(data)
    np.testing.assert_equal(d.trajectory_lengths(),
                            np.array([frames_per_traj]))
def test1dData(self):
    n = 3
    data = np.arange(n)
    reader = DataInMemory(data)
    self.assertEqual(reader.trajectory_lengths(), np.array([n]))
    self.assertEqual(reader.ndim, 1)
    self.assertEqual(reader.number_of_trajectories(), 1)
    self.assertEqual(reader.n_frames_total(), n)
def test1dDataList(self):
    n = 10
    data = [np.arange(n), np.arange(n)]
    reader = DataInMemory(data)
    np.testing.assert_equal(reader.trajectory_lengths(), np.array([n, n]))
    self.assertEqual(reader.ndim, 1)
    self.assertEqual(reader.number_of_trajectories(), 2)
    self.assertEqual(reader.n_frames_total(), 2 * n)
def testListOfArrays(self):
    frames_per_traj = 100
    dim = 3
    data = [np.random.random((frames_per_traj, dim)) for _ in range(3)]
    d = DataInMemory(data)
    self.assertEqual(d.dimension(), dim)
    np.testing.assert_equal(d.trajectory_lengths(),
                            np.array([frames_per_traj for _ in range(3)]))
def test_lagged_stridden_access(self):
    data = np.random.random((1000, 2)).astype(np.float32)
    reader = DataInMemory(data)
    strides = [2, 3, 5, 7, 15]
    lags = [1, 3, 7, 10, 30]
    for stride in strides:
        for lag in lags:
            chunks = []
            for _, _, Y in reader.iterator(stride=stride, lag=lag):
                chunks.append(Y)
            chunks = np.vstack(chunks)
            np.testing.assert_equal(chunks, data[lag::stride],
                                    err_msg="failed for stride=%s, lag=%s" % (stride, lag))
def test_duplicated_data_in_fit_transform(self):
    X = np.random.randn(100, 2)
    d = DataInMemory([X, X])
    tica = api.tica(data=d, lag=1, dim=1)
    out1 = tica.get_output()
    out2 = tica.fit_transform([X, X])
    np.testing.assert_array_almost_equal(out1, out2)
def test_big_k(self):
    x = np.random.random((300, 3))
    reader = DataInMemory(x)
    k = 151
    c = api.cluster_uniform_time(k=k)
    c.estimate(reader)
def test_2d_skip(self):
    x = np.random.random((300, 3))
    reader = DataInMemory(x)
    k = 2
    c = api.cluster_uniform_time(k=k, skip=100)
    c.estimate(reader)
def test_duplicated_data(self):
    # make some data that has one column repeated twice
    X = np.random.randn(100, 2)
    X = np.hstack((X, X[:, 0, np.newaxis]))

    d = DataInMemory(X)

    tica_obj = api.tica(data=d, lag=1, dim=1)

    assert tica_obj.eigenvectors.dtype == np.float64
    assert tica_obj.eigenvalues.dtype == np.float64
def test_time_lagged_chunked_access(self):
    n = 100
    data = [np.random.random((n, 3)), np.zeros((29, 3)),
            np.random.random((n - 50, 3))]
    reader = DataInMemory(data)
    self.assertEqual(reader.n_frames_total(), n + n - 50 + 29)

    # iterate over data
    it = reader.iterator(lag=30, return_trajindex=True)
    for itraj, X, Y in it:
        if itraj == 0:
            # self.assertEqual(X.shape, (100, 3)) <-- changed behavior: return only chunks of same size
            self.assertEqual(X.shape, (70, 3))
            self.assertEqual(Y.shape, (70, 3))
        elif itraj == 1:
            # the time-lagged chunk cannot be built due to the lag time
            self.assertEqual(X.shape, (0, 3))
            self.assertEqual(Y.shape, (0, 3))
        elif itraj == 2:
            self.assertEqual(X.shape, (20, 3))
            self.assertEqual(Y.shape, (20, 3))
def testChunksizeResultsTica(self):
    chunk = 40
    lag = 100
    np.random.seed(0)
    X = np.random.randn(23000, 3)

    # un-chunked
    d = DataInMemory(X)
    tica_obj = api.tica(data=d, lag=lag, dim=1)
    cov = tica_obj.cov.copy()
    mean = tica_obj.mean.copy()

    # ------- run again with new chunksize -------
    d = DataInMemory(X)
    d.chunksize = chunk
    tica_obj = api.tica(data=d, lag=lag, dim=1)

    np.testing.assert_allclose(tica_obj.mean, mean)
    np.testing.assert_allclose(tica_obj.cov, cov)
def test_lagged_iterator_1d_legacy(self):
    n = 30
    chunksize = 5
    lag = 9
    stride = 2
    data = [np.arange(n), np.arange(50), np.arange(33)]
    input_lens = [x.shape[0] for x in data]
    reader = DataInMemory(data, chunksize=chunksize)
    it = reader.iterator(chunk=chunksize, stride=stride, lag=lag)
    # lag > chunksize, so we expect a _LegacyLaggedIterator
    from pyerna.coordinates.data._base.iterable import _LegacyLaggedIterator
    self.assertIsInstance(it, _LegacyLaggedIterator)

    assert reader.chunksize == chunksize
    self.assertEqual(reader.n_frames_total(), sum(input_lens))

    # store results by traj
    chunked_trajs = [[] for _ in range(len(data))]
    chunked_lagged_trajs = [[] for _ in range(len(data))]

    # iterate over data
    for itraj, X, Y in reader.iterator(lag=lag, stride=stride):
        chunked_trajs[itraj].append(X)
        chunked_lagged_trajs[itraj].append(Y)

    trajs = [np.vstack(ichunks) for ichunks in chunked_trajs]
    lagged_trajs = [np.vstack(ichunks) for ichunks in chunked_lagged_trajs]

    # unlagged data
    for idx, (traj, input_traj) in enumerate(zip(trajs, data)):
        # do not consider chunks that have no lagged counterpart
        np.testing.assert_equal(traj.T.squeeze(),
                                input_traj[::stride][:len(lagged_trajs[idx])].squeeze(),
                                err_msg="failed for traj=%s" % idx)

    # lagged data
    for idx, (traj, input_traj) in enumerate(zip(lagged_trajs, data)):
        np.testing.assert_equal(traj.T.squeeze(),
                                input_traj[lag::stride].squeeze(),
                                err_msg="failed for traj=%s" % idx)
def test_skip(self):
    for skip in [0, 3, 13]:
        r1 = DataInMemory(self.d)
        out_with_skip = r1.get_output(skip=skip)[0]
        r2 = DataInMemory(self.d)
        out = r2.get_output()[0]
        np.testing.assert_almost_equal(
            out_with_skip, out[skip::],
            err_msg="The first %s rows were skipped, but that did not "
                    "match the rows with skip=0 and sliced by [%s::]" % (skip, skip))
def test_ndim_input(self):
    data = np.empty((4, 2, 2, 2))
    reader = DataInMemory(data)
    self.assertEqual(reader.ndim, 2 * 2 * 2)
    self.assertEqual(reader.number_of_trajectories(), 1)
    self.assertEqual(reader.n_frames_total(), 4)
    np.testing.assert_equal(reader.trajectory_lengths(),
                            np.array([reader.n_frames_total()]))
def test_skip_input_list(self):
    for skip in [0, 3, 13]:
        r1 = DataInMemory([self.d, self.d])
        out_with_skip = r1.get_output(skip=skip)
        r2 = DataInMemory([self.d, self.d])
        out = r2.get_output()
        np.testing.assert_almost_equal(
            out_with_skip[0], out[0][skip::],
            err_msg="The first %s rows of the first file were skipped, but that "
                    "did not match the rows with skip=0 and sliced by [%s::]"
                    % (skip, skip))
        np.testing.assert_almost_equal(
            out_with_skip[1], out[1][skip::],
            err_msg="The first %s rows of the second file were skipped, but that "
                    "did not match the rows with skip=0 and sliced by [%s::]"
                    % (skip, skip))
def get_output(self, dimensions=slice(0, None), stride=1, skip=0, chunk=None):
    """Maps all input data of this transformer and returns it as an array or list of arrays.

    Parameters
    ----------
    dimensions : list-like of indexes or slice, default=all
        indices of dimensions you would like to keep.
    stride : int, default=1
        only take every stride'th frame.
    skip : int, default=0
        initially skip the first n frames of each trajectory.
    chunk : int, default=None
        How many frames to process at once. If not given, obtain the chunk size
        from the source.

    Returns
    -------
    output : list of ndarray(T_i, d)
        the mapped data, where T_i is the number of time steps of input
        trajectory i, or ceil(T_i / stride) if stride > 1. d is the output
        dimension of this transformer. If the input consists of a list of
        trajectories, the output will also be a corresponding list of
        trajectories.
    """
    if isinstance(dimensions, int):
        ndim = 1
        dimensions = slice(dimensions, dimensions + 1)
    elif isinstance(dimensions, (list, np.ndarray, tuple, slice)):
        if hasattr(dimensions, 'ndim') and dimensions.ndim > 1:
            raise ValueError("dimension indices can't have more than one dimension")
        ndim = len(np.zeros(self.ndim)[dimensions])
    else:
        raise ValueError('unsupported type (%s) of "dimensions"' % type(dimensions))

    assert ndim > 0, "ndim was zero in %s" % self.__class__.__name__

    if chunk is None:
        chunk = self.chunksize

    # create iterator
    if self.in_memory and not self._mapping_to_mem_active:
        from pyerna.coordinates.data.data_in_memory import DataInMemory
        assert self._Y is not None
        it = DataInMemory(self._Y)._create_iterator(skip=skip, chunk=chunk,
                                                    stride=stride, return_trajindex=True)
    else:
        it = self._create_iterator(skip=skip, chunk=chunk, stride=stride,
                                   return_trajindex=True)

    with it:
        # allocate memory
        try:
            from pyerna import config
            if config.coordinates_check_output:
                trajs = [np.full((l, ndim), np.nan, dtype=self.output_type())
                         for l in it.trajectory_lengths()]
            else:
                # TODO: avoid having a copy here, if Y is already filled
                trajs = [np.empty((l, ndim), dtype=self.output_type())
                         for l in it.trajectory_lengths()]
        except MemoryError:
            self.logger.exception("Could not allocate enough memory to map all data."
                                  " Consider using a larger stride.")
            return

        if self._logger_is_active(self._loglevel_DEBUG):
            self.logger.debug("get_output(): dimensions=%s" % str(dimensions))
            self.logger.debug("get_output(): created output trajs with shapes: %s"
                              % [x.shape for x in trajs])
            self.logger.debug("nchunks :%s, chunksize=%s" % (it.n_chunks, it.chunksize))

        # fetch data
        from pyerna._base.progress import ProgressReporter
        pg = ProgressReporter()
        pg.register(it.n_chunks,
                    description='getting output of %s' % self.__class__.__name__)
        with pg.context(), it:
            for itraj, chunk in it:
                i = slice(it.pos, it.pos + len(chunk))
                assert i.stop - i.start > 0
                trajs[itraj][i, :] = chunk[:, dimensions]
                pg.update(1)

    if config.coordinates_check_output:
        for i, t in enumerate(trajs):
            finite = self._chunk_finite(t)
            if not np.all(finite):
                # determine position of unassigned frames
                frames = np.where(np.logical_not(finite))
                if not len(frames):
                    raise RuntimeError('nothing got assigned for traj {}'.format(i))
                raise RuntimeError('unassigned sections in traj {i} in range [{frames}]'
                                   .format(frames=frames, i=i))

    return trajs
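# A minimal usage sketch for get_output(); it relies only on the
# pyerna.coordinates.source factory shown in the iterator() docstring below.
# The helper name _example_get_output is illustrative, not part of the API.
def _example_get_output():
    import numpy as np
    from pyerna.coordinates import source

    data = [np.random.random((100, 3)), np.random.random((50, 3))]
    reader = source(data)
    # keep dimensions 0 and 1, take every 5th frame, skip the first 10 frames
    out = reader.get_output(dimensions=[0, 1], stride=5, skip=10)
    assert len(out) == 2            # one array per input trajectory
    assert out[0].shape == (18, 2)  # ceil((100 - 10) / 5) frames, 2 dims
    assert out[1].shape == (8, 2)   # ceil((50 - 10) / 5) frames, 2 dims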
def test_cols(self):
    reader = DataInMemory(self.d)
    cols = (2, 0)
    for x in reader.iterator(chunk=0, return_trajindex=False, cols=cols):
        np.testing.assert_equal(x, self.d[:, cols])
def testWrongArguments(self):
    with self.assertRaises(ValueError):
        DataInMemory("foo")
def iterator(self, stride=1, lag=0, chunk=None, return_trajindex=True, cols=None, skip=0):
    """Creates an iterator to stream over the (transformed) data.

    If your data is too large to fit into memory and you want to incrementally
    compute some quantities on it, you can create an iterator on a reader or
    transformer (e.g. TICA) to avoid memory overflows.

    Parameters
    ----------
    stride : int, default=1
        Take only every stride'th frame.
    lag : int, default=0
        Lag the iterator by this number of frames. If lag > 0, the iterator
        yields pairs of chunks (X, Y), where Y is lagged by this amount.
    chunk : int, default=None
        How many frames to process at once. If not given, obtain the chunk
        size from the source.
    return_trajindex : boolean, default=True
        Yield a tuple of (trajindex, data) if True, otherwise only the chunk
        of data.
    cols : array like, default=None
        Return only the given columns.
    skip : int, default=0
        Skip the first n frames of each trajectory.

    Returns
    -------
    iter : instance of DataSourceIterator
        an implementation of a DataSourceIterator to stream over the data

    Examples
    --------
    >>> from pyerna.coordinates import source; import numpy as np
    >>> data = [np.arange(3), np.arange(4, 7)]
    >>> reader = source(data)
    >>> iterator = reader.iterator(chunk=1)
    >>> for array_index, chunk in iterator:
    ...     print(array_index, chunk)
    0 [[0]]
    0 [[1]]
    0 [[2]]
    1 [[4]]
    1 [[5]]
    1 [[6]]
    """
    if self.in_memory:
        from pyerna.coordinates.data.data_in_memory import DataInMemory
        return DataInMemory(self._Y).iterator(
            lag=lag, chunk=chunk, stride=stride,
            return_trajindex=return_trajindex, skip=skip)

    chunk = chunk if chunk is not None else self.chunksize
    if lag > 0:
        if chunk == 0 or lag <= chunk:
            # lag fits into one chunk: a single underlying iterator suffices
            it = self._create_iterator(skip=skip, chunk=chunk, stride=1,
                                       return_trajindex=return_trajindex, cols=cols)
            it.return_traj_index = True
            return _LaggedIterator(it, lag, return_trajindex, stride)
        else:
            # lag exceeds the chunk size: use two iterators offset by lag
            it = self._create_iterator(skip=skip, chunk=chunk, stride=stride,
                                       return_trajindex=return_trajindex, cols=cols)
            it.return_traj_index = True
            it_lagged = self._create_iterator(skip=skip + lag, chunk=chunk, stride=stride,
                                              return_trajindex=True, cols=cols)
            return _LegacyLaggedIterator(it, it_lagged, return_trajindex)
    return self._create_iterator(skip=skip, chunk=chunk, stride=stride,
                                 return_trajindex=return_trajindex, cols=cols)
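# A minimal sketch of time-lagged iteration with iterator(lag=...), assuming
# the pyerna.coordinates.source factory from the docstring above; the helper
# name _example_lagged_iteration is illustrative only. With lag > 0 and
# return_trajindex=True, the iterator yields (trajindex, X, Y), where Y lags
# X by exactly `lag` frames and both chunks are truncated to the same length.
def _example_lagged_iteration():
    import numpy as np
    from pyerna.coordinates import source

    data = np.arange(10)[:, None]  # one trajectory, 10 frames, 1 dimension
    reader = source(data)
    for itraj, X, Y in reader.iterator(lag=2):
        # frame values differ by the lag, so Y == X + 2 holds in every chunk
        np.testing.assert_equal(Y, X + 2)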
def testNotEqualDims(self):
    """should raise, since trajectories of different dimension cannot be processed"""
    data = [np.zeros((10, 3)), np.zeros((10, 5))]
    with self.assertRaises(ValueError):
        DataInMemory(data)
def test_1d(self):
    x = np.random.random(1000)
    reader = DataInMemory(x)
    k = 2
    c = api.cluster_uniform_time(reader, k=k)