def test_last_chunk(self):
    r = DataInMemory(self.d)
    it = r.iterator(chunk=0)
    for itraj, X in it:
        assert it.last_chunk_in_traj
        if itraj == 2:
            assert it.last_chunk
Example #2
    def weights(self, value):
        import numbers
        import types
        from pyemma.coordinates.data import DataInMemory
        from pyemma.util.types import is_float_vector

        if is_float_vector(value):
            value = DataInMemory(value)
        elif isinstance(value, (list, tuple)):
            value = DataInMemory(value)
        elif isinstance(value, numbers.Integral):
            value = float(value) if value is not None else 1.0
        elif hasattr(value, 'weights') and type(getattr(
                value, 'weights')) == types.MethodType:
            from pyemma.coordinates.data._base.transformer import StreamingTransformer

            class compute_weights_streamer(StreamingTransformer):
                def __init__(self, func):
                    super(compute_weights_streamer, self).__init__()
                    self.func = func

                def dimension(self):
                    return 1

                def _transform_array(self, X):
                    return self.func.weights(X)

                def describe(self):
                    pass

            value = compute_weights_streamer(value)

        self._weights = value
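
For reference, a minimal sketch of the wrapping this setter performs: every accepted input ends up as either a plain float or a DataInMemory source (the array shapes below are illustrative, not from the original):

import numpy as np
from pyemma.coordinates.data import DataInMemory

# a float vector and a list/tuple of arrays are both wrapped as a data source
weights_from_vector = DataInMemory(np.random.random(100))
weights_from_list = DataInMemory([np.random.random(100), np.random.random(80)])

# an integral weight is promoted to a plain float instead
scalar_weight = float(1)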
def test_chunksize_max_memory(self):
    from pyemma.util.contexts import settings
    data = np.random.random((10000, 10))
    max_size = 1024
    with settings(default_chunksize=str(max_size)):
        r = DataInMemory(data)
        for itraj, x in r.iterator():
            self.assertLessEqual(x.nbytes, max_size)
def test_invalid_data_in_input_inf(self):
    self.d[1][-1] = np.inf
    r = DataInMemory(self.d, chunksize=5)
    it = r.iterator()
    from pyemma.coordinates.data._base.datasource import InvalidDataInStreamException
    with settings(coordinates_check_output=True):
        with self.assertRaises(InvalidDataInStreamException):
            for itraj, X in it:
                pass
def test_stride(self):
    r = DataInMemory(self.d)
    stride = np.arange(1, 17)
    i = 0
    it = r.iterator(stride=stride[i], chunk=1)
    for _ in it:
        i += 1
        i %= len(stride)
        it.stride = stride[i]
        assert it.stride == stride[i]
def test_pos(self):
    r = DataInMemory(self.d)
    r.chunksize = 17
    it = r.iterator()
    t = 0
    for itraj, X in it:
        assert t == it.pos
        t += len(X)
        if it.last_chunk_in_traj:
            t = 0
    def test_iterator_context(self):
        dim = DataInMemory(np.array([1]))

        ctx = dim.iterator(stride=1).state
        assert ctx.stride == 1
        assert ctx.uniform_stride
        assert ctx.is_stride_sorted()
        assert ctx.traj_keys is None

        ctx = dim.iterator(stride=np.asarray([[0, 0], [0, 1], [0, 2]])).state
        assert not ctx.uniform_stride
        assert ctx.is_stride_sorted()
        np.testing.assert_array_equal(ctx.traj_keys, np.array([0]))

        # require sorted random access
        dim._needs_sorted_random_access_stride = True

        # sorted within trajectory, not sorted by trajectory key
        with self.assertRaises(ValueError):
            dim.iterator(stride=np.asarray([[1, 1], [1, 2], [1, 3], [0, 0],
                                            [0, 1], [0, 2]]))

        # sorted by trajectory key, not within trajectory
        with self.assertRaises(ValueError):
            dim.iterator(stride=np.asarray([[0, 0], [0, 1], [0, 2], [1, 1],
                                            [1, 5], [1, 3]]))

        np.testing.assert_array_equal(ctx.ra_indices_for_traj(0),
                                      np.array([0, 1, 2]))
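
The random-access stride used above is an (n, 2) integer array of (trajectory index, frame index) pairs, sorted by trajectory and then by frame. A minimal sketch of passing one to an iterator (data shapes illustrative):

import numpy as np
from pyemma.coordinates.data import DataInMemory

reader = DataInMemory([np.random.random((100, 3)) for _ in range(2)])
# frames 0, 5, 9 of trajectory 0 and frames 2, 3 of trajectory 1
ra_stride = np.array([[0, 0], [0, 5], [0, 9], [1, 2], [1, 3]])
for itraj, X in reader.iterator(stride=ra_stride):
    pass  # chunks contain only the selected frames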
    def test_skip(self):
        r = DataInMemory(self.d)
        lagged_it = r.iterator(lag=5)
        assert lagged_it._it.skip == 0
        assert lagged_it._it_lagged.skip == 5

        it = r.iterator()
        for itraj, X in it:
            if itraj == 0:
                it.skip = 5
            if itraj == 1:
                assert it.skip == 5
    def test_current_trajindex(self):
        r = DataInMemory(self.d)
        expected_itraj = 0
        for itraj, X in r.iterator(chunk=0):
            assert itraj == expected_itraj
            expected_itraj += 1

        expected_itraj = -1
        it = r.iterator(chunk=16)
        for itraj, X in it:
            if it.pos == 0:
                expected_itraj += 1
            assert itraj == expected_itraj == it.current_trajindex
Example #10
    def get_output(self, dimensions=slice(0, None), stride=1, skip=0, chunk=0):
        if isinstance(dimensions, int):
            ndim = 1
            dimensions = slice(dimensions, dimensions + 1)
        elif isinstance(dimensions, (list, np.ndarray, tuple, slice)):
            if hasattr(dimensions, 'ndim') and dimensions.ndim > 1:
                raise ValueError('dimension indices can\'t have more than one dimension')
            ndim = len(np.zeros(self.ndim)[dimensions])
        else:
            raise ValueError('unsupported type (%s) of "dimensions"' % type(dimensions))

        assert ndim > 0, "ndim was zero in %s" % self.__class__.__name__

        # create iterator
        if self.in_memory and not self._mapping_to_mem_active:
            from pyemma.coordinates.data.data_in_memory import DataInMemory
            assert self._Y is not None
            it = DataInMemory(self._Y)._create_iterator(skip=skip, chunk=chunk,
                                                        stride=stride, return_trajindex=True)
        else:
            it = self._create_iterator(skip=skip, chunk=chunk, stride=stride, return_trajindex=True)

        with it:
            # allocate memory
            try:
                # TODO: avoid having a copy here, if Y is already filled
                trajs = [np.empty((l, ndim), dtype=self.output_type())
                         for l in it.trajectory_lengths()]
            except MemoryError:
                self._logger.exception("Could not allocate enough memory to map all data."
                                       " Consider using a larger stride.")
                return

            if self._logger_is_active(self._loglevel_DEBUG):
                self._logger.debug("get_output(): dimensions=%s" % str(dimensions))
                self._logger.debug("get_output(): created output trajs with shapes: %s"
                                   % [x.shape for x in trajs])
            # fetch data
            self._logger.debug("n_chunks=%s, chunksize=%s" % (it._n_chunks, it.chunksize))
            self._progress_register(it._n_chunks,
                                    description='getting output of %s' % self.__class__.__name__,
                                    stage=1)
            for itraj, chunk in it:
                L = len(chunk)
                if L > 0:
                    trajs[itraj][it.pos:it.pos + L, :] = chunk[:, dimensions]

                # update progress
                self._progress_update(1, stage=1)

        return trajs
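
A usage sketch for get_output(), assuming a DataInMemory reader (which inherits this method); shapes are illustrative:

import numpy as np
from pyemma.coordinates.data import DataInMemory

reader = DataInMemory([np.random.random((100, 3)) for _ in range(3)])
all_trajs = reader.get_output()                       # one (100, 3) array per trajectory
sub = reader.get_output(dimensions=[0, 2], stride=2)  # every 2nd frame, dims 0 and 2
assert sub[0].shape == (50, 2)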
def test_chunksize(self):
    r = DataInMemory(self.d)
    cs = np.arange(1, 17)
    i = 0
    it = r.iterator(chunk=cs[i])
    for itraj, X in it:
        if not it.last_chunk_in_traj:
            assert len(X) == it.chunksize
        else:
            assert len(X) <= it.chunksize
        i += 1
        i %= len(cs)
        it.chunksize = cs[i]
        assert it.chunksize == cs[i]
    def test_n_chunks(self):
        r = DataInMemory(self.d)

        it0 = r.iterator(chunk=0)
        assert it0._n_chunks == 3  # 3 trajs

        it1 = r.iterator(chunk=50)
        assert it1._n_chunks == 3 * 2  # 2 chunks per trajectory

        it2 = r.iterator(chunk=30)
        # 3 full chunks and 1 small chunk per trajectory
        assert it2._n_chunks == 3 * 4

        it3 = r.iterator(chunk=30)
        it3.skip = 10
        assert it3._n_chunks == 3 * 3  # 3 full chunks per traj

        it4 = r.iterator(chunk=30)
        it4.skip = 5
        # 3 full chunks and 1 chunk of 5 frames per trajectory
        assert it4._n_chunks == 3 * 4

        # test for lagged iterator
        for stride in range(1, 5):
            for lag in range(0, 18):
                it = r.iterator(lag=lag,
                                chunk=30,
                                stride=stride,
                                return_trajindex=False)
                chunks = 0
                for _ in it:
                    chunks += 1
                assert chunks == it._n_chunks
Example #13
def iterator(self,
             stride=1,
             lag=0,
             chunk=None,
             return_trajindex=True,
             cols=None):
    if self.in_memory:
        from pyemma.coordinates.data.data_in_memory import DataInMemory
        return DataInMemory(self._Y).iterator(
            lag=lag,
            chunk=chunk,
            stride=stride,
            return_trajindex=return_trajindex)
    chunk = chunk if chunk is not None else self.default_chunksize
    it = self._create_iterator(skip=0,
                               chunk=chunk,
                               stride=stride,
                               return_trajindex=return_trajindex,
                               cols=cols)
    if lag > 0:
        it.return_traj_index = True
        it_lagged = self._create_iterator(skip=lag,
                                          chunk=chunk,
                                          stride=stride,
                                          return_trajindex=True,
                                          cols=cols)
        return LaggedIterator(it, it_lagged, return_trajindex)
    return it
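
A sketch of consuming the LaggedIterator returned above, assuming it yields (itraj, X, Y) triples when return_trajindex is True, with Y lagged by `lag` frames:

import numpy as np
from pyemma.coordinates.data import DataInMemory

reader = DataInMemory([np.random.random((100, 2)) for _ in range(2)])
it = reader.iterator(lag=5, chunk=20)
for itraj, X, Y in it:
    # chunks are trimmed so the instantaneous and lagged data stay aligned
    assert len(X) == len(Y)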
Example #14
def test_discretizer(self):
    reader_gen = DataInMemory(data=self.generated_data)
    # check if exception safe
    api.discretizer(reader_gen)._chain[-1].get_output()
    api.discretizer(reader_gen, transform=api.tica())._chain[-1].get_output()
    api.discretizer(reader_gen, cluster=api.cluster_uniform_time())._chain[-1].get_output()
    api.discretizer(reader_gen, transform=api.pca(), cluster=api.cluster_regspace(dmin=10))._chain[-1].get_output()
Example #15
def estimate(self, X, chunksize=None, **kwargs):
    # ensure the input is able to provide a stream
    if not isinstance(X, Iterable):
        from pyemma.util import types
        array_list = types.ensure_traj_list(X)
        X = DataInMemory(array_list, chunksize=chunksize)
    # Because we want to use pipelining methods like get_output, we have to set a data producer.
    self.data_producer = X
    X.chunksize = chunksize
    # run estimation
    try:
        super(StreamingEstimator, self).estimate(X, **kwargs)
    except NotConvergedWarning as ncw:
        self.logger.info("Presumably finished estimation. Message: %s" %
                         ncw)
    return self
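
For callers, the practical effect is that a raw ndarray (or list of ndarrays) can be passed straight to estimate(); a sketch assuming a concrete subclass such as pyemma's k-means estimator:

import numpy as np
from pyemma.coordinates import cluster_kmeans

X = np.random.random((1000, 2))
est = cluster_kmeans(k=5)  # construct without data
est.estimate(X)            # the raw array is wrapped in DataInMemory internally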
    def test_n_chunks_ra(self):
        """ """
        r = DataInMemory(self.d)

        def gen_sorted_stride(n):
            frames = np.random.randint(0, 99, size=n)
            trajs = np.random.randint(0, 3, size=n)

            stride = np.stack((trajs, frames)).T
            # sort by file and frame index
            sort_inds = np.lexsort((stride[:, 1], stride[:, 0]))
            return stride[sort_inds]

        strides = [gen_sorted_stride(np.random.randint(1, 99)) for _ in range(10)]
        lengths = [len(x) for x in strides]
        for chunk in range(0, 100):  # could also iterate up to max(lengths)
            for stride in strides:
                it = r.iterator(chunk=chunk, stride=stride)
                self._count_chunks(it)
Example #17
def _map_to_memory(self, stride=1):
    r"""Maps results to memory. Will be stored in attribute :attr:`_Y`."""
    if self._logger_is_active(self._loglevel_DEBUG):
        self._logger.debug("mapping to mem")
    assert self._in_memory
    self._mapping_to_mem_active = True
    self._Y = self.get_output(stride=stride)
    from pyemma.coordinates.data import DataInMemory
    self._Y_source = DataInMemory(self._Y)
    self._mapping_to_mem_active = False
Example #18
    def _map_to_memory(self, stride=1):
        r"""Maps results to memory. Will be stored in attribute :attr:`_Y`."""
        self._mapping_to_mem_active = True
        try:
            self._Y = self.get_output(stride=stride)
            from pyemma.coordinates.data import DataInMemory
            self._Y_source = DataInMemory(self._Y)
        finally:
            self._mapping_to_mem_active = False

        self._in_memory = True
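
In practice this mapping is usually triggered by flipping a transformer's in_memory flag rather than by calling _map_to_memory() directly; a sketch under that assumption, using TICA as an example:

import numpy as np
import pyemma.coordinates as coor

data = [np.random.random((1000, 5)) for _ in range(3)]
tica = coor.tica(coor.source(data), lag=10)

# setting in_memory invokes _map_to_memory(); subsequent output is served
# from the cached _Y through a DataInMemory source (_Y_source)
tica.in_memory = True
out = tica.get_output()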
    def test_slice_random_access_linear(self):
        dim = DataInMemory(self.data)

        all_data = dim.ra_linear[:, :]
        # all data should be all data concatenated
        np.testing.assert_equal(all_data, np.concatenate(self.data))
        # select first 5 frames
        np.testing.assert_equal(dim.ra_linear[:5], self.data[0][:5])
        # select only dimensions 1:3 of every 2nd frame
        np.testing.assert_equal(dim.ra_linear[::2, 1:3],
                                np.concatenate(self.data)[::2, 1:3])
    def test_return_trajindex(self):
        r = DataInMemory(self.d)
        it = r.iterator(chunk=0)
        it.return_traj_index = True
        assert it.return_traj_index is True
        for tup in it:
            assert len(tup) == 2
        it.reset()
        it.return_traj_index = False
        assert it.return_traj_index is False
        itraj = 0
        for tup in it:
            np.testing.assert_equal(tup, self.d[itraj])
            itraj += 1

        for tup in r.iterator(return_trajindex=True):
            assert len(tup) == 2
        itraj = 0
        for tup in r.iterator(return_trajindex=False):
            np.testing.assert_equal(tup, self.d[itraj])
            itraj += 1
    def test_slice_random_access_linear_itraj(self):
        dim = DataInMemory(self.data)

        all_data = dim.ra_itraj_linear[:, :, :]
        # all data should be all data concatenated
        np.testing.assert_equal(all_data, np.concatenate(self.data))

        # if requested 130 frames, this should yield the first two trajectories and half of the third
        np.testing.assert_equal(dim.ra_itraj_linear[:, :130],
                                np.concatenate(self.data)[:130])
        # now request first 30 frames of the last two trajectories
        np.testing.assert_equal(
            dim.ra_itraj_linear[[1, 2], :30],
            np.concatenate((self.data[1], self.data[2]))[:30])
Example #22
    def test_replace_data_source(self):
        reader_xtc = api.source(self.traj_files, top=self.pdb_file)
        reader_gen = DataInMemory(data=self.generated_data)

        kmeans = api.kmeans(k=10)
        assert hasattr(kmeans, '_chunks')
        p = api.pipeline([reader_xtc, kmeans])
        out1 = kmeans.get_output()
        # replace source
        print(reader_gen)
        p.set_element(0, reader_gen)
        assert hasattr(kmeans, '_chunks')
        p.parametrize()
        out2 = kmeans.get_output()
        self.assertFalse(np.array_equal(out1, out2), "Data source changed, so should the resulting clusters.")
    def test_slice_random_access_jagged(self):
        dim = DataInMemory(self.data)

        all_data = dim.ra_itraj_jagged[:, :, :]
        for idx in range(3):
            np.testing.assert_equal(all_data[idx], self.data[idx])

        jagged = dim.ra_itraj_jagged[:, :30]
        for idx in range(3):
            np.testing.assert_equal(jagged[idx], self.data[idx][:30])

        jagged_last_dim = dim.ra_itraj_jagged[:, :, -1]
        for idx in range(3):
            np.testing.assert_equal(jagged_last_dim[idx], self.data[idx][:, -1])
Example #24
def estimate(self, X, **kwargs):
    # ensure the input is able to provide a stream
    if not isinstance(X, Iterable):
        if isinstance(X, np.ndarray) or \
                (isinstance(X, (list, tuple)) and len(X) > 0 and all((isinstance(x, np.ndarray) for x in X))):
            X = DataInMemory(X, self.chunksize)
        else:
            raise ValueError(
                "no np.ndarray or non-empty list of np.ndarrays given")
    # Because we want to use pipelining methods like get_output, we have to set a data producer.
    self.data_producer = X
    # run estimation
    try:
        super(StreamingEstimator, self).estimate(X, **kwargs)
    except NotConvergedWarning as ncw:
        self._logger.info("Presumably finished estimation. Message: %s" %
                          ncw)
    return self
    def test_slice_random_access(self):
        dim = DataInMemory(self.data)

        all_data = dim.ra_itraj_cuboid[:, :, :]
        # the remaining 80 frames of the first trajectory should be truncated
        np.testing.assert_equal(all_data.shape, (3, 20, self.dim))
        # should coincide with original data
        np.testing.assert_equal(
            all_data, np.array(
                (self.data[0][:20], self.data[1], self.data[2])))
        # we should be able to select the 1st trajectory
        np.testing.assert_equal(dim.ra_itraj_cuboid[0],
                                np.array([self.data[0]]))
        # select only dimensions 1:3 of 2nd trajectory with every 2nd frame
        np.testing.assert_equal(dim.ra_itraj_cuboid[1, ::2, 1:3],
                                np.array([self.data[1][::2, 1:3]]))
        # select only last dimension of 1st trajectory every 17th frame
        np.testing.assert_equal(
            dim.ra_itraj_cuboid[0, ::17, -1],
            np.array([np.array([self.data[0][::17, -1]]).T]))
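
Taken together, the four random-access views exercised in these tests differ only in how the indices are interpreted; a summary sketch (equal-length trajectories assumed here for simplicity):

import numpy as np
from pyemma.coordinates.data import DataInMemory

dim = DataInMemory([np.random.random((100, 3)) for _ in range(3)])
a = dim.ra_linear[:10]                # frames indexed over the concatenation
b = dim.ra_itraj_linear[[1, 2], :30]  # first 30 frames of trajs 1 and 2 combined
c = dim.ra_itraj_jagged[:, :30]       # per-trajectory slices; lengths may differ
d = dim.ra_itraj_cuboid[:, :20, :]    # 3D result, here of shape (3, 20, 3)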
Example #26
    def test_n_chunks(self):
        r = DataInMemory(self.d)

        it0 = r.iterator(chunk=0)
        assert it0.n_chunks == 3  # 3 trajs

        it1 = r.iterator(chunk=50)
        assert it1.n_chunks == 3 * 2  # 2 chunks per trajectory

        it2 = r.iterator(chunk=30)
        # 3 full chunks and 1 small chunk per trajectory
        assert it2.n_chunks == 3 * 4

        it3 = r.iterator(chunk=30)
        it3.skip = 10
        assert it3.n_chunks == 3 * 3  # 3 full chunks per traj

        it4 = r.iterator(chunk=30)
        it4.skip = 5
        # 3 full chunks and 1 chunk of 5 frames per trajectory
        assert it4.n_chunks == 3 * 4

        # test for lagged iterator
        for stride in range(1, 5):
            for lag in range(0, 18):
                it = r.iterator(lag=lag,
                                chunk=30,
                                stride=stride,
                                return_trajindex=False)
                chunks = sum(1 for _ in it)
                np.testing.assert_equal(
                    it.n_chunks,
                    chunks,
                    err_msg=
                    "Expected number of chunks did not agree with what the iterator "
                    "returned for stride=%s, lag=%s" % (stride, lag))
                assert chunks == it.n_chunks
Example #27
    def estimate(self, X, **kwargs):
        # TODO: X is either an Iterable or an array
        if not isinstance(X, Iterable):
            if isinstance(X, np.ndarray):
                X = DataInMemory(X, self.chunksize)
                self.data_producer = X
            else:
                raise ValueError("no array given")

        model = None
        # run estimation
        try:
            model = super(StreamingTransformer, self).estimate(X, **kwargs)
        except NotConvergedWarning as ncw:
            self._logger.info("Presumely finished estimation. Message: %s" %
                              ncw)
        # memory mode? Then map all results. Avoid recursion here, if parametrization
        # is triggered from get_output
        if self.in_memory and not self._mapping_to_mem_active:
            self._map_to_memory()

        self._estimated = True

        return model
Example #28
    def estimate(self, X, **kwargs):
        if not isinstance(X, Iterable):
            if isinstance(X, np.ndarray) or \
                    (isinstance(X, (list, tuple)) and len(X) > 0 and all([isinstance(x, np.ndarray) for x in X])):
                X = DataInMemory(X, self.chunksize)
                self.data_producer = X
            else:
                raise ValueError(
                    "no np.ndarray or non-empty list of np.ndarrays given")

        # run estimation
        try:
            super(StreamingTransformer, self).estimate(X, **kwargs)
        except NotConvergedWarning as ncw:
            self._logger.info("Presumely finished estimation. Message: %s" %
                              ncw)
        # memory mode? Then map all results. Avoid recursion here, if parametrization
        # is triggered from get_output
        if self.in_memory and not self._mapping_to_mem_active:
            self._map_to_memory()

        self._estimated = True

        return self
def _get_reader_instance(self, instance_number):
    if instance_number == 0:
        return DataInMemory(self.data)
    elif instance_number == 1:
        return FeatureReader(self.data_feature_reader,
                             topologyfile=self.topfile)
def test_is_random_accessible(self):
    dim = DataInMemory(self.data)
    frag = FragmentedTrajectoryReader([[self.data]])
    assert dim.is_random_accessible is True
    assert frag.is_random_accessible is False
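
Since not every reader supports random access (the FragmentedTrajectoryReader above does not), guarding slicing behind the capability flag is a reasonable pattern; a minimal sketch:

import numpy as np
from pyemma.coordinates.data import DataInMemory

dim = DataInMemory([np.random.random((50, 2))])
if dim.is_random_accessible:
    first_traj = dim.ra_itraj_cuboid[0]  # safe: reader supports random access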