Example #1
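A list of 1-D arrays is read as several one-dimensional trajectories: two trajectories of length n, dimension 1, and 2 * n frames in total. All of the test snippets below assume numpy imported as np and DataInMemory imported from pyemma.coordinates.data.data_in_memory (the import path used in Examples 9 and 10).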
    def test1dDataList(self):
        n = 10
        data = [np.arange(n), np.arange(n)]
        reader = DataInMemory(data)

        self.assertEqual(reader.trajectory_lengths(), [n, n])
        self.assertEqual(reader.dimension(), 1)
        self.assertEqual(reader.number_of_trajectories(), 2)
        self.assertEqual(reader.n_frames_total(), 2 * n)
Example #2
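A single 2-D array of shape (frames_per_traj, dim) is read as one trajectory, so trajectory_lengths() returns a one-element list.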
    def testDataArray(self):
        frames_per_traj = 100
        dim = 3

        data = np.random.random((frames_per_traj, dim))
        d = DataInMemory(data)

        self.assertEqual(d.trajectory_lengths(), [frames_per_traj])
Example #3
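A single 1-D array becomes one trajectory of dimension 1.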
    def test1dData(self):
        n = 3
        data = np.arange(n)
        reader = DataInMemory(data)

        self.assertEqual(reader.trajectory_lengths(), [n])
        self.assertEqual(reader.dimension(), 1)
        self.assertEqual(reader.number_of_trajectories(), 1)
        self.assertEqual(reader.n_frames_total(), n)
Example #4
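A list of equally shaped 2-D arrays becomes one trajectory per array, all sharing the same dimension.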
    def testListOfArrays(self):

        frames_per_traj = 100
        dim = 3
        data = [np.random.random((frames_per_traj, dim)) for _ in range(3)]

        d = DataInMemory(data)

        self.assertEqual(d.dimension(), dim)

        self.assertEqual(
            d.trajectory_lengths(), [frames_per_traj for _ in range(3)])
Example #5
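TICA results must not depend on how the data is chunked: the mean and covariance estimated from an un-chunked run have to match those from a run with chunksize 40.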
    def testChunksizeResultsTica(self):
        chunk = 40
        lag = 100
        np.random.seed(0)
        X = np.random.randn(23000, 3)

        # un-chunked
        d = DataInMemory(X)

        tica_obj = api.tica(data=d, lag=lag, dim=1)

        cov = tica_obj.cov.copy()
        mean = tica_obj.mu.copy()

        # ------- run again with new chunksize -------
        d = DataInMemory(X)
        d.chunksize = chunk
        tica_obj = api.tica(data=d, lag=lag, dim=1)

        np.testing.assert_allclose(tica_obj.mu, mean)
        np.testing.assert_allclose(tica_obj.cov, cov)
Example #6
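The lagged iterator yields triples of (trajectory index, chunk X, time-lagged chunk Y). Stacking the chunks per trajectory must reproduce the input data, and the stacked lagged chunks must equal the input shifted by lag frames.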
    def test_lagged_iterator_2d(self):
        n = 57
        chunksize = 10
        lag = 1

        data = [np.arange(300).reshape((100, 3)),
                np.arange(29 * 3).reshape((29, 3)),
                np.arange(150).reshape((50, 3))]
        input_lens = [x.shape[0] for x in data]
        reader = DataInMemory(data)
        reader.chunksize = chunksize

        self.assertEqual(reader.n_frames_total(), sum(input_lens))

        # store results by traj
        chunks = [[] for _ in range(len(data))]
        lagged_chunks = [[] for _ in range(len(data))]

        # iterate over data
        for itraj, X, Y in reader.iterator(lag=lag):
            chunks[itraj].append(X)
            lagged_chunks[itraj].append(Y)

        trajs = [np.vstack(ichunks) for ichunks in chunks]
        lagged_trajs = [np.vstack(ichunks) for ichunks in lagged_chunks]

        # unlagged data
        for traj, input_traj in zip(trajs, data):
            np.testing.assert_equal(traj.reshape(input_traj.shape), input_traj)

        # lagged data
        lagged_0 = [d[lag:] for d in data]

        for traj, input_traj in zip(lagged_trajs, lagged_0):
            np.testing.assert_equal(traj.reshape(input_traj.shape), input_traj)
Example #7
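Chunked time-lagged access through the private _next_chunk(lag=...) API. For the second trajectory (29 frames) no lagged chunk can be built, because it is shorter than the lag of 30 frames.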
    def test_time_lagged_chunked_access(self):
        n = 100
        data = [np.random.random((n, 3)), np.zeros((29, 3)),
                np.random.random((n - 50, 3))]
        reader = DataInMemory(data)
        self.assertEqual(reader.n_frames_total(), n + n - 50 + 29)

        # iterate over data
        lag = 30
        t = 0
        itraj = 0
        last_chunk = False
        while not last_chunk:
            last_chunk_in_traj = False
            t = 0
            while not last_chunk_in_traj:
                X, Y = reader._next_chunk(lag=lag)
                if itraj == 0:
                    self.assertEqual(X.shape, (100, 3))
                    self.assertEqual(Y.shape, (70, 3))
                elif itraj == 1:
                    # the time-lagged chunk cannot be built: this trajectory (29 frames) is shorter than the lag (30)
                    self.assertEqual(X.shape, (29, 3))
                    self.assertEqual(Y.shape, (0, 3))
                elif itraj == 2:
                    self.assertEqual(X.shape, (50, 3))
                    self.assertEqual(Y.shape, (20, 3))
                L = np.shape(X)[0]
                # last chunk in traj?
                last_chunk_in_traj = (
                    t + L >= reader.trajectory_length(itraj))
                # last chunk?
                last_chunk = (
                    last_chunk_in_traj and itraj >= reader.number_of_trajectories() - 1)
                t += L
            # increment trajectory
            itraj += 1
Example #8
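Input with more than two dimensions is flattened per frame: a (4, 2, 2, 2) array yields one trajectory of 4 frames with dimension 2 * 2 * 2 = 8.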
    def test_ndim_input(self):
        data = np.empty((4, 2, 2, 2))

        reader = DataInMemory(data)

        self.assertEqual(reader.dimension(), 2 * 2 * 2)
        self.assertEqual(reader.number_of_trajectories(), 1)
        self.assertEqual(reader.n_frames_total(), 4)
        self.assertEqual(
            reader.trajectory_lengths(), [reader.n_frames_total()])
Example #9
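The iterator() method used by the examples above, as implemented in pyemma's data source base class: depending on lag and chunk size it dispatches to a plain iterator, a _LaggedIterator, or a _LegacyLaggedIterator.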
    def iterator(self,
                 stride=1,
                 lag=0,
                 chunk=None,
                 return_trajindex=True,
                 cols=None,
                 skip=0):
        """ creates an iterator to stream over the (transformed) data.

        If your data is too large to fit into memory and you want to incrementally compute
        some quantities on it, you can create an iterator on a reader or transformer (eg. TICA)
        to avoid memory overflows.

        Parameters
        ----------

        stride : int, default=1
            Take only every stride'th frame.
        lag : int, default=0
            Time lag in frames. If lag > 0, the iterator yields time-lagged pairs
            (X, Y), where Y contains the frames of X shifted forward by lag frames.
        chunk : int, default=None
            How many frames to process at once. If not given, obtain the chunk size
            from the source.
        return_trajindex : boolean, default=True
            If True, yield tuples of (trajindex, data); if False, yield only the data chunks.
        cols : array-like, default=None
            Return only the given columns.
        skip : int, default=0
            Skip the first 'n' frames of each trajectory.

        Returns
        -------
        iter : instance of DataSourceIterator
            an implementation of a DataSourceIterator to stream over the data

        Examples
        --------

        >>> from pyemma.coordinates import source; import numpy as np
        >>> data = [np.arange(3), np.arange(4, 7)]
        >>> reader = source(data)
        >>> iterator = reader.iterator(chunk=1)
        >>> for array_index, chunk in iterator:
        ...     print(array_index, chunk)
        0 [[0]]
        0 [[1]]
        0 [[2]]
        1 [[4]]
        1 [[5]]
        1 [[6]]
        """
        if self.in_memory:
            from pyemma.coordinates.data.data_in_memory import DataInMemory
            return DataInMemory(self._Y).iterator(
                lag=lag,
                chunk=chunk,
                stride=stride,
                return_trajindex=return_trajindex,
                skip=skip)
        chunk = chunk if chunk is not None else self.chunksize
        if 0 < lag <= chunk:
            it = self._create_iterator(skip=skip,
                                       chunk=chunk,
                                       stride=1,
                                       return_trajindex=return_trajindex,
                                       cols=cols)
            it.return_traj_index = True
            return _LaggedIterator(it, lag, return_trajindex, stride)
        elif lag > 0:
            it = self._create_iterator(skip=skip,
                                       chunk=chunk,
                                       stride=stride,
                                       return_trajindex=return_trajindex,
                                       cols=cols)
            it.return_traj_index = True
            it_lagged = self._create_iterator(skip=skip + lag,
                                              chunk=chunk,
                                              stride=stride,
                                              return_trajindex=True,
                                              cols=cols)
            return _LegacyLaggedIterator(it, it_lagged, return_trajindex)
        return self._create_iterator(skip=skip,
                                     chunk=chunk,
                                     stride=stride,
                                     return_trajindex=return_trajindex,
                                     cols=cols)
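A minimal usage sketch of the lagged branch (assuming numpy and pyemma are installed; the data and chunk values below are made up for illustration):

    import numpy as np
    from pyemma.coordinates import source

    data = [np.arange(30).reshape((10, 3)), np.arange(18).reshape((6, 3))]
    reader = source(data)

    # with lag > 0 the iterator yields (trajectory index, chunk, lagged chunk),
    # exactly as consumed in Example 6 above
    for itraj, X, Y in reader.iterator(lag=2, chunk=4):
        # Y holds the frames of trajectory itraj shifted forward by 2 frames;
        # near the end of a trajectory Y may be shorter than X
        assert X.shape[1] == Y.shape[1] == 3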
Example #10
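The get_output() method referenced by the transformer examples: it allocates one output array per trajectory and fills it chunk by chunk from an iterator, optionally checking that every frame was assigned.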
    def get_output(self,
                   dimensions=slice(0, None),
                   stride=1,
                   skip=0,
                   chunk=None):
        """Maps all input data of this transformer and returns it as an array or list of arrays

        Parameters
        ----------
        dimensions : list-like of indexes or slice, default=all
            Indices of the dimensions you would like to keep.
        stride : int, default=1
            Only take every n'th frame.
        skip : int, default=0
            Initially skip n frames of each file.
        chunk : int, default=None
            How many frames to process at once. If not given, obtain the chunk size
            from the source.

        Returns
        -------
        output : list of ndarray(T_i, d)
            the mapped data, where T_i is the number of time steps of input trajectory i or,
            if stride > 1, floor(T_i / stride). d is the output dimension of this transformer.
            If the input consists of a list of trajectories, the output will be a corresponding
            list of trajectories.

        """
        if isinstance(dimensions, int):
            ndim = 1
            dimensions = slice(dimensions, dimensions + 1)
        elif isinstance(dimensions, (list, np.ndarray, tuple, slice)):
            if hasattr(dimensions, 'ndim') and dimensions.ndim > 1:
                raise ValueError(
                    'dimension indices can\'t have more than one dimension')
            ndim = len(np.zeros(self.ndim)[dimensions])
        else:
            raise ValueError('unsupported type (%s) of "dimensions"' %
                             type(dimensions))

        assert ndim > 0, "ndim was zero in %s" % self.__class__.__name__

        if chunk is None:
            chunk = self.chunksize

        # create iterator
        if self.in_memory and not self._mapping_to_mem_active:
            from pyemma.coordinates.data.data_in_memory import DataInMemory
            assert self._Y is not None
            it = DataInMemory(self._Y)._create_iterator(skip=skip,
                                                        chunk=chunk,
                                                        stride=stride,
                                                        return_trajindex=True)
        else:
            it = self._create_iterator(skip=skip,
                                       chunk=chunk,
                                       stride=stride,
                                       return_trajindex=True)

        with it:
            # allocate memory
            try:
                from pyemma import config
                if config.coordinates_check_output:
                    trajs = [
                        np.full((l, ndim), np.nan, dtype=self.output_type())
                        for l in it.trajectory_lengths()
                    ]
                else:
                    # TODO: avoid having a copy here, if Y is already filled
                    trajs = [
                        np.empty((l, ndim), dtype=self.output_type())
                        for l in it.trajectory_lengths()
                    ]
            except MemoryError:
                self.logger.exception(
                    "Could not allocate enough memory to map all data."
                    " Consider using a larger stride.")
                return

            if self._logger_is_active(self._loglevel_DEBUG):
                self.logger.debug("get_output(): dimensions=%s" %
                                  str(dimensions))
                self.logger.debug(
                    "get_output(): created output trajs with shapes: %s" %
                    [x.shape for x in trajs])
                self.logger.debug("nchunks :%s, chunksize=%s" %
                                  (it.n_chunks, it.chunksize))
            # fetch data
            from pyemma._base.progress import ProgressReporter
            pg = ProgressReporter()
            pg.register(it.n_chunks,
                        description='getting output of %s' %
                        self.__class__.__name__)
            with pg.context(), it:
                for itraj, chunk in it:
                    i = slice(it.pos, it.pos + len(chunk))
                    assert i.stop - i.start > 0
                    trajs[itraj][i, :] = chunk[:, dimensions]
                    pg.update(1)

        if config.coordinates_check_output:
            for i, t in enumerate(trajs):
                finite = self._chunk_finite(t)
                if not np.all(finite):
                    # determine position
                    # np.where returns a tuple of index arrays; take the frame indices
                    frames = np.where(np.logical_not(finite))[0]
                    if not len(frames):
                        raise RuntimeError(
                            'nothing got assigned for traj {}'.format(i))
                    raise RuntimeError(
                        'unassigned sections in traj {i} in range [{frames}]'.
                        format(frames=frames, i=i))

        return trajs
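A minimal usage sketch (assuming a fitted TICA object as produced in Example 5; the variable name tica_obj is taken from there):

    # map all input data through the fitted transformer; the result is a list
    # with one ndarray per input trajectory
    output = tica_obj.get_output(dimensions=[0], stride=2)
    for Y in output:
        # each Y has a single column (dimension 0) and only every 2nd frame
        print(Y.shape)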
Example #11
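One-dimensional data can be fed straight into clustering; here uniform-time clustering with k=2 runs on a 1000-frame reader.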
    def test_1d(self):
        x = np.random.random(1000)
        reader = DataInMemory(x)

        k = 2
        c = api.cluster_uniform_time(reader, k=k)
Example #12
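Constructing a reader from an unsupported input type (a plain string) raises a ValueError.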
    def testWrongArguments(self):
        with self.assertRaises(ValueError):
            DataInMemory("foo")
Example #13
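The cols argument restricts iteration to the given columns, in the given order.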
    def test_cols(self):
        reader = DataInMemory(self.d)
        cols = (2, 0)
        for x in reader.iterator(chunk=0, return_trajindex=False, cols=cols):
            np.testing.assert_equal(x, self.d[:, cols])
Example #14
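Trajectories of different dimension cannot be combined in one reader; the constructor raises a ValueError.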
    def testNotEqualDims(self):
        """ should raise, since different dims can not be processed"""
        data = [np.zeros((10, 3)), np.zeros((10, 5))]

        with self.assertRaises(ValueError):
            DataInMemory(data)