def test_last_chunk(self):
    r = DataInMemory(self.d)
    it = r.iterator(chunk=0)
    for itraj, X in it:
        assert it.last_chunk_in_traj
        if itraj == 2:
            assert it.last_chunk

def weights(self, value):
    from pyemma.coordinates.data import DataInMemory
    import types

    if is_float_vector(value):
        value = DataInMemory(value)
    elif isinstance(value, (list, tuple)):
        value = DataInMemory(value)
    elif isinstance(value, numbers.Integral):
        value = float(value) if value is not None else 1.0
    elif hasattr(value, 'weights') and type(getattr(value, 'weights')) == types.MethodType:
        from pyemma.coordinates.data._base.transformer import StreamingTransformer

        class compute_weights_streamer(StreamingTransformer):
            def __init__(self, func):
                super(compute_weights_streamer, self).__init__()
                self.func = func

            def dimension(self):
                return 1

            def _transform_array(self, X):
                return self.func.weights(X)

            def describe(self):
                pass

        value = compute_weights_streamer(value)

    self._weights = value

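# Illustrative sketch (not part of the original source) of the duck typing the
# setter above relies on: any object exposing a bound method weights(X) that
# returns one weight per frame is wrapped in the compute_weights_streamer
# adapter. The class below is hypothetical.
class _ExampleWeights(object):

    def weights(self, X):
        import numpy as np
        # one positive weight per frame of the chunk X
        return np.exp(-np.sum(X ** 2, axis=1))
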
def test_chunksize_max_memory(self):
    from pyemma.util.contexts import settings
    data = np.random.random((10000, 10))
    max_size = 1024
    with settings(default_chunksize=str(max_size)):
        r = DataInMemory(data)
        for itraj, x in r.iterator():
            self.assertLessEqual(x.nbytes, max_size)

def test_invalid_data_in_input_inf(self):
    self.d[1][-1] = np.inf
    r = DataInMemory(self.d, chunksize=5)
    it = r.iterator()
    from pyemma.coordinates.data._base.datasource import InvalidDataInStreamException
    with settings(coordinates_check_output=True):
        with self.assertRaises(InvalidDataInStreamException):
            for itraj, X in it:
                pass

def test_stride(self):
    r = DataInMemory(self.d)
    stride = np.arange(1, 17)
    i = 0
    it = r.iterator(stride=stride[i], chunk=1)
    for _ in it:
        i += 1
        i %= len(stride)
        it.stride = stride[i]
        assert it.stride == stride[i]

def test_pos(self):
    r = DataInMemory(self.d)
    r.chunksize = 17
    it = r.iterator()
    t = 0
    for itraj, X in it:
        assert t == it.pos
        t += len(X)
        if it.last_chunk_in_traj:
            t = 0

def test_iterator_context(self):
    dim = DataInMemory(np.array([1]))

    ctx = dim.iterator(stride=1).state
    assert ctx.stride == 1
    assert ctx.uniform_stride
    assert ctx.is_stride_sorted()
    assert ctx.traj_keys is None

    ctx = dim.iterator(stride=np.asarray([[0, 0], [0, 1], [0, 2]])).state
    assert not ctx.uniform_stride
    assert ctx.is_stride_sorted()
    np.testing.assert_array_equal(ctx.traj_keys, np.array([0]))

    # require sorted random access
    dim._needs_sorted_random_access_stride = True

    # sorted within trajectory, not sorted by trajectory key
    with self.assertRaises(ValueError):
        dim.iterator(stride=np.asarray([[1, 1], [1, 2], [1, 3],
                                        [0, 0], [0, 1], [0, 2]]))

    # sorted by trajectory key, not within trajectory
    with self.assertRaises(ValueError):
        dim.iterator(stride=np.asarray([[0, 0], [0, 1], [0, 2],
                                        [1, 1], [1, 5], [1, 3]]))

    np.testing.assert_array_equal(ctx.ra_indices_for_traj(0), np.array([0, 1, 2]))

def test_skip(self):
    r = DataInMemory(self.d)
    lagged_it = r.iterator(lag=5)
    assert lagged_it._it.skip == 0
    assert lagged_it._it_lagged.skip == 5

    it = r.iterator()
    for itraj, X in it:
        if itraj == 0:
            it.skip = 5
        if itraj == 1:
            assert it.skip == 5

def test_current_trajindex(self):
    r = DataInMemory(self.d)
    expected_itraj = 0
    for itraj, X in r.iterator(chunk=0):
        assert itraj == expected_itraj
        expected_itraj += 1

    expected_itraj = -1
    it = r.iterator(chunk=16)
    for itraj, X in it:
        if it.pos == 0:
            expected_itraj += 1
        assert itraj == expected_itraj == it.current_trajindex

def get_output(self, dimensions=slice(0, None), stride=1, skip=0, chunk=0):
    if isinstance(dimensions, int):
        ndim = 1
        dimensions = slice(dimensions, dimensions + 1)
    elif isinstance(dimensions, (list, np.ndarray, tuple, slice)):
        if hasattr(dimensions, 'ndim') and dimensions.ndim > 1:
            raise ValueError('dimension indices can\'t have more than one dimension')
        ndim = len(np.zeros(self.ndim)[dimensions])
    else:
        raise ValueError('unsupported type (%s) of "dimensions"' % type(dimensions))

    assert ndim > 0, "ndim was zero in %s" % self.__class__.__name__

    # create iterator
    if self.in_memory and not self._mapping_to_mem_active:
        from pyemma.coordinates.data.data_in_memory import DataInMemory
        assert self._Y is not None
        it = DataInMemory(self._Y)._create_iterator(skip=skip, chunk=chunk,
                                                    stride=stride, return_trajindex=True)
    else:
        it = self._create_iterator(skip=skip, chunk=chunk, stride=stride,
                                   return_trajindex=True)

    with it:
        # allocate memory
        try:
            # TODO: avoid having a copy here, if Y is already filled
            trajs = [np.empty((l, ndim), dtype=self.output_type())
                     for l in it.trajectory_lengths()]
        except MemoryError:
            self._logger.exception("Could not allocate enough memory to map all data."
                                   " Consider using a larger stride.")
            return

        if self._logger_is_active(self._loglevel_DEBUG):
            self._logger.debug("get_output(): dimensions=%s" % str(dimensions))
            self._logger.debug("get_output(): created output trajs with shapes: %s"
                               % [x.shape for x in trajs])

        # fetch data
        self._logger.debug("nchunks: %s, chunksize=%s" % (it._n_chunks, it.chunksize))
        self._progress_register(it._n_chunks,
                                description='getting output of %s' % self.__class__.__name__,
                                stage=1)
        for itraj, chunk in it:
            L = len(chunk)
            if L > 0:
                trajs[itraj][it.pos:it.pos + L, :] = chunk[:, dimensions]
            # update progress
            self._progress_update(1, stage=1)

    return trajs

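# Usage sketch for get_output() above (illustrative, not part of the original
# source): the result is one array per input trajectory, restricted to the
# requested dimensions and strided.
def _example_get_output():
    import numpy as np
    from pyemma.coordinates.data import DataInMemory

    reader = DataInMemory([np.random.random((80, 4)),
                           np.random.random((120, 4))])
    out = reader.get_output(dimensions=slice(0, 2), stride=2)
    assert len(out) == 2            # one array per trajectory
    assert out[0].shape == (40, 2)  # 80 frames strided by 2, first two dims
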
def test_chunksize(self):
    r = DataInMemory(self.d)
    cs = np.arange(1, 17)
    i = 0
    it = r.iterator(chunk=cs[i])
    for itraj, X in it:
        if not it.last_chunk_in_traj:
            assert len(X) == it.chunksize
        else:
            assert len(X) <= it.chunksize
        i += 1
        i %= len(cs)
        it.chunksize = cs[i]
        assert it.chunksize == cs[i]

def test_n_chunks(self):
    r = DataInMemory(self.d)

    it0 = r.iterator(chunk=0)
    assert it0._n_chunks == 3  # 3 trajs

    it1 = r.iterator(chunk=50)
    assert it1._n_chunks == 3 * 2  # 2 chunks per trajectory

    it2 = r.iterator(chunk=30)
    # 3 full chunks and 1 small chunk per trajectory
    assert it2._n_chunks == 3 * 4

    it3 = r.iterator(chunk=30)
    it3.skip = 10
    assert it3._n_chunks == 3 * 3  # 3 full chunks per traj

    it4 = r.iterator(chunk=30)
    it4.skip = 5
    # 3 full chunks and 1 chunk of 5 frames per trajectory
    assert it4._n_chunks == 3 * 4

    # test for lagged iterator
    for stride in range(1, 5):
        for lag in range(0, 18):
            it = r.iterator(lag=lag, chunk=30, stride=stride, return_trajindex=False)
            chunks = 0
            for _ in it:
                chunks += 1
            assert chunks == it._n_chunks

def iterator(self, stride=1, lag=0, chunk=None, return_trajindex=True, cols=None):
    if self.in_memory:
        from pyemma.coordinates.data.data_in_memory import DataInMemory
        return DataInMemory(self._Y).iterator(
            lag=lag, chunk=chunk, stride=stride, return_trajindex=return_trajindex
        )
    chunk = chunk if chunk is not None else self.default_chunksize
    it = self._create_iterator(skip=0, chunk=chunk, stride=stride,
                               return_trajindex=return_trajindex, cols=cols)
    if lag > 0:
        it.return_traj_index = True
        it_lagged = self._create_iterator(skip=lag, chunk=chunk, stride=stride,
                                          return_trajindex=True, cols=cols)
        return LaggedIterator(it, it_lagged, return_trajindex)
    return it

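# A minimal usage sketch for iterator() above (illustrative, not part of the
# original source): with lag > 0 the returned LaggedIterator yields each chunk
# together with its time-lagged counterpart.
def _example_lagged_iteration():
    import numpy as np
    from pyemma.coordinates.data import DataInMemory

    # one trajectory of 100 frames in 3 dimensions
    reader = DataInMemory(np.random.random((100, 3)))
    for itraj, X, Y in reader.iterator(lag=5, chunk=20):
        # Y holds the frames of trajectory itraj shifted forward by 5 steps,
        # so X and Y can be consumed pairwise, e.g. for time-lagged correlations
        assert len(X) == len(Y)
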
def test_discretizer(self):
    reader_gen = DataInMemory(data=self.generated_data)
    # check if exception safe
    api.discretizer(reader_gen)._chain[-1].get_output()
    api.discretizer(reader_gen, transform=api.tica())._chain[-1].get_output()
    api.discretizer(reader_gen, cluster=api.cluster_uniform_time())._chain[-1].get_output()
    api.discretizer(reader_gen, transform=api.pca(),
                    cluster=api.cluster_regspace(dmin=10))._chain[-1].get_output()

def estimate(self, X, chunksize=None, **kwargs):
    # ensure the input is able to provide a stream
    if not isinstance(X, Iterable):
        from pyemma.util import types
        array_list = types.ensure_traj_list(X)
        X = DataInMemory(array_list, chunksize=chunksize)
    # Because we want to use pipelining methods like get_output, we have to set a data producer.
    self.data_producer = X
    X.chunksize = chunksize
    # run estimation
    try:
        super(StreamingEstimator, self).estimate(X, **kwargs)
    except NotConvergedWarning as ncw:
        self.logger.info("Presumably finished estimation. Message: %s" % ncw)
    return self

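# Usage sketch (illustrative, not part of the original source): a raw ndarray
# is not an Iterable data source, so estimate() wraps it in DataInMemory before
# streaming. Any streaming estimator, e.g. k-means clustering, can therefore be
# fed plain arrays.
def _example_estimate_from_array():
    import numpy as np
    import pyemma.coordinates as coor

    data = np.random.random((1000, 3))
    # the array is wrapped in a DataInMemory reader internally
    return coor.cluster_kmeans(data, k=10)
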
def test_n_chunks_ra(self):
    """Check the chunk count of iterators with random-access strides."""
    r = DataInMemory(self.d)

    def gen_sorted_stride(n):
        frames = np.random.randint(0, 99, size=n)
        trajs = np.random.randint(0, 3, size=n)
        stride = np.sort(np.stack((trajs, frames)).T, axis=1)
        # sort by file and frame index
        sort_inds = np.lexsort((stride[:, 1], stride[:, 0]))
        return stride[sort_inds]

    strides = [gen_sorted_stride(np.random.randint(1, 99)) for _ in range(10)]
    lengths = [len(x) for x in strides]
    for chunk in range(0, 100):  # max(lengths)
        for stride in strides:
            it = r.iterator(chunk=chunk, stride=stride)
            self._count_chunks(it)

def _map_to_memory(self, stride=1):
    r"""Maps results to memory. Will be stored in attribute :attr:`_Y`."""
    if self._logger_is_active(self._loglevel_DEBUG):
        self._logger.debug("mapping to mem")
    assert self._in_memory

    self._mapping_to_mem_active = True
    self._Y = self.get_output(stride=stride)
    from pyemma.coordinates.data import DataInMemory
    self._Y_source = DataInMemory(self._Y)
    self._mapping_to_mem_active = False

def _map_to_memory(self, stride=1):
    r"""Maps results to memory. Will be stored in attribute :attr:`_Y`."""
    self._mapping_to_mem_active = True
    try:
        self._Y = self.get_output(stride=stride)
        from pyemma.coordinates.data import DataInMemory
        self._Y_source = DataInMemory(self._Y)
    finally:
        self._mapping_to_mem_active = False
    self._in_memory = True

def test_slice_random_access_linear(self):
    dim = DataInMemory(self.data)

    all_data = dim.ra_linear[:, :]
    # all data should be all data concatenated
    np.testing.assert_equal(all_data, np.concatenate(self.data))

    # select first 5 frames
    np.testing.assert_equal(dim.ra_linear[:5], self.data[0][:5])

    # select only dimensions 1:3 of every 2nd frame
    np.testing.assert_equal(dim.ra_linear[::2, 1:3], np.concatenate(self.data)[::2, 1:3])

def test_return_trajindex(self):
    r = DataInMemory(self.d)
    it = r.iterator(chunk=0)
    it.return_traj_index = True
    assert it.return_traj_index is True
    for tup in it:
        assert len(tup) == 2

    it.reset()
    it.return_traj_index = False
    assert it.return_traj_index is False
    itraj = 0
    for tup in it:
        np.testing.assert_equal(tup, self.d[itraj])
        itraj += 1

    for tup in r.iterator(return_trajindex=True):
        assert len(tup) == 2

    itraj = 0
    for tup in r.iterator(return_trajindex=False):
        np.testing.assert_equal(tup, self.d[itraj])
        itraj += 1

def test_slice_random_access_linear_itraj(self):
    dim = DataInMemory(self.data)

    all_data = dim.ra_itraj_linear[:, :, :]
    # all data should be all data concatenated
    np.testing.assert_equal(all_data, np.concatenate(self.data))

    # requesting 130 frames should yield the first two trajectories and half of the third
    np.testing.assert_equal(dim.ra_itraj_linear[:, :130], np.concatenate(self.data)[:130])

    # now request the first 30 frames of the last two trajectories
    np.testing.assert_equal(dim.ra_itraj_linear[[1, 2], :30],
                            np.concatenate((self.data[1], self.data[2]))[:30])

def test_replace_data_source(self):
    reader_xtc = api.source(self.traj_files, top=self.pdb_file)
    reader_gen = DataInMemory(data=self.generated_data)

    kmeans = api.kmeans(k=10)
    assert hasattr(kmeans, '_chunks')
    p = api.pipeline([reader_xtc, kmeans])
    out1 = kmeans.get_output()

    # replace source
    print(reader_gen)
    p.set_element(0, reader_gen)
    assert hasattr(kmeans, '_chunks')
    p.parametrize()
    out2 = kmeans.get_output()

    self.assertFalse(np.array_equal(out1, out2),
                     "Data source changed, so should the resulting clusters.")

def test_slice_random_access_jagged(self):
    dim = DataInMemory(self.data)

    all_data = dim.ra_itraj_jagged[:, :, :]
    for idx in range(3):
        np.testing.assert_equal(all_data[idx], self.data[idx])

    jagged = dim.ra_itraj_jagged[:, :30]
    for idx in range(3):
        np.testing.assert_equal(jagged[idx], self.data[idx][:30])

    jagged_last_dim = dim.ra_itraj_jagged[:, :, -1]
    for idx in range(3):
        np.testing.assert_equal(jagged_last_dim[idx], self.data[idx][:, -1])

def estimate(self, X, **kwargs):
    # ensure the input is able to provide a stream
    if not isinstance(X, Iterable):
        if isinstance(X, np.ndarray) or \
                (isinstance(X, (list, tuple)) and len(X) > 0
                 and all(isinstance(x, np.ndarray) for x in X)):
            X = DataInMemory(X, self.chunksize)
        else:
            raise ValueError("no np.ndarray or non-empty list of np.ndarrays given")
    # Because we want to use pipelining methods like get_output, we have to set a data producer.
    self.data_producer = X
    # run estimation
    try:
        super(StreamingEstimator, self).estimate(X, **kwargs)
    except NotConvergedWarning as ncw:
        self._logger.info("Presumably finished estimation. Message: %s" % ncw)
    return self

def test_slice_random_access(self):
    dim = DataInMemory(self.data)

    all_data = dim.ra_itraj_cuboid[:, :, :]
    # the remaining 80 frames of the first trajectory should be truncated
    np.testing.assert_equal(all_data.shape, (3, 20, self.dim))
    # should coincide with original data
    np.testing.assert_equal(all_data,
                            np.array((self.data[0][:20], self.data[1], self.data[2])))
    # we should be able to select the 1st trajectory
    np.testing.assert_equal(dim.ra_itraj_cuboid[0], np.array([self.data[0]]))
    # select only dimensions 1:3 of the 2nd trajectory with every 2nd frame
    np.testing.assert_equal(dim.ra_itraj_cuboid[1, ::2, 1:3],
                            np.array([self.data[1][::2, 1:3]]))
    # select only the last dimension of the 1st trajectory, every 17th frame
    np.testing.assert_equal(dim.ra_itraj_cuboid[0, ::17, -1],
                            np.array([np.array([self.data[0][::17, -1]]).T]))

def test_n_chunks(self):
    r = DataInMemory(self.d)

    it0 = r.iterator(chunk=0)
    assert it0.n_chunks == 3  # 3 trajs

    it1 = r.iterator(chunk=50)
    assert it1.n_chunks == 3 * 2  # 2 chunks per trajectory

    it2 = r.iterator(chunk=30)
    # 3 full chunks and 1 small chunk per trajectory
    assert it2.n_chunks == 3 * 4

    it3 = r.iterator(chunk=30)
    it3.skip = 10
    assert it3.n_chunks == 3 * 3  # 3 full chunks per traj

    it4 = r.iterator(chunk=30)
    it4.skip = 5
    # 3 full chunks and 1 chunk of 5 frames per trajectory
    assert it4.n_chunks == 3 * 4

    # test for lagged iterator
    for stride in range(1, 5):
        for lag in range(0, 18):
            it = r.iterator(lag=lag, chunk=30, stride=stride, return_trajindex=False)
            chunks = sum(1 for _ in it)
            np.testing.assert_equal(
                it.n_chunks, chunks,
                err_msg="Expected number of chunks did not agree with what the "
                        "iterator returned for stride=%s, lag=%s" % (stride, lag))
            assert chunks == it.n_chunks

def estimate(self, X, **kwargs):
    # TODO: X is either an Iterable or an array
    if not isinstance(X, Iterable):
        if isinstance(X, np.ndarray):
            X = DataInMemory(X, self.chunksize)
            self.data_producer = X
        else:
            raise ValueError("no array given")

    model = None
    # run estimation
    try:
        model = super(StreamingTransformer, self).estimate(X, **kwargs)
    except NotConvergedWarning as ncw:
        self._logger.info("Presumably finished estimation. Message: %s" % ncw)
    # memory mode? Then map all results. Avoid recursion here, if parametrization
    # is triggered from get_output
    if self.in_memory and not self._mapping_to_mem_active:
        self._map_to_memory()
    self._estimated = True
    return model

def estimate(self, X, **kwargs):
    if not isinstance(X, Iterable):
        if isinstance(X, np.ndarray) or \
                (isinstance(X, (list, tuple)) and len(X) > 0
                 and all(isinstance(x, np.ndarray) for x in X)):
            X = DataInMemory(X, self.chunksize)
            self.data_producer = X
        else:
            raise ValueError("no np.ndarray or non-empty list of np.ndarrays given")
    # run estimation
    try:
        super(StreamingTransformer, self).estimate(X, **kwargs)
    except NotConvergedWarning as ncw:
        self._logger.info("Presumably finished estimation. Message: %s" % ncw)
    # memory mode? Then map all results. Avoid recursion here, if parametrization
    # is triggered from get_output
    if self.in_memory and not self._mapping_to_mem_active:
        self._map_to_memory()
    self._estimated = True
    return self

def _get_reader_instance(self, instance_number):
    if instance_number == 0:
        return DataInMemory(self.data)
    elif instance_number == 1:
        return FeatureReader(self.data_feature_reader, topologyfile=self.topfile)

def test_is_random_accessible(self):
    dim = DataInMemory(self.data)
    frag = FragmentedTrajectoryReader([[self.data]])
    assert dim.is_random_accessible is True
    assert frag.is_random_accessible is False