Example 1
    def test_add_element(self):
        # start with empty pipeline without auto-parametrization
        p = api.pipeline([], run=False)
        # add some reader
        reader = api.source(self.traj_files, top=self.pdb_file)
        p.add_element(reader)
        p.parametrize()
        # get the result immediately
        out1 = reader.get_output()

        # add some kmeans
        kmeans = api.cluster_kmeans(k=15)
        p.add_element(kmeans)
        p.parametrize()
        # get the result immediately
        kmeans1 = kmeans.get_output()

        # get reader output again
        out2 = reader.get_output()
        p.add_element(api.kmeans(k=2))
        p.parametrize()

        # get kmeans output again
        kmeans2 = kmeans.get_output()
        # check if add_element changes the intermediate results
        np.testing.assert_array_equal(out1[0], out2[0])
        np.testing.assert_array_equal(out1[1], out2[1])
        np.testing.assert_array_equal(kmeans1[0], kmeans2[0])
        np.testing.assert_array_equal(kmeans1[1], kmeans2[1])
Example 2
 def test_read_multiple_files_topology_file(self):
     reader = api.source(self.traj_files, top=self.pdb_file)
     self.assertIsNotNone(reader, "The reader should not be none.")
     self.assertEqual(reader.topfile, self.pdb_file,
                      "Reader topology file and input topology file should coincide.")
     self.assertListEqual(reader.trajfiles, self.traj_files, "Reader trajectories and input"
                                                             " trajectories should coincide.")
     self.assertEqual(reader.featurizer.topologyfile, self.pdb_file, "Featurizers topology file and input "
                                                                     "topology file should coincide.")
Example 3
 def test_no_cluster(self):
     reader_xtc = api.source(self.traj_files, top=self.pdb_file)
     # only reader
     api.pipeline(reader_xtc)
     reader_xtc.get_output()
     # reader + pca / tica
     tica = api.tica()
     pca = api.pca()
     api.pipeline([reader_xtc, tica])._chain[-1].get_output()
     api.pipeline([reader_xtc, pca])._chain[-1].get_output()
Example 4
 def test_read_single_file_featurizer(self):
     featurizer = MDFeaturizer(self.pdb_file)
     reader = api.source(self.traj_files[0], features=featurizer)
     self.assertIsNotNone(reader, "The reader should not be none.")
     self.assertEqual(reader.topfile, self.pdb_file,
                      "Reader topology file and input topology file should coincide.")
     self.assertListEqual(reader.trajfiles, [self.traj_files[0]], "Reader trajectories and input"
                                                                  " trajectories should coincide.")
     self.assertEqual(reader.featurizer.topologyfile, self.pdb_file, "Featurizers topology file and input "
                                                                     "topology file should coincide.")
Example 5
    def testIteratorAccess(self):
        reader = api.source(self.trajfile, top=self.topfile)

        frames = 0
        data = []
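        # the reader iterates in chunks, yielding (trajectory_index, chunk) pairs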
        for i, X in reader:
            frames += X.shape[0]
            data.append(X)

        # restore shape of input
        data = np.array(data).reshape(self.xyz.shape)

        self.assertEqual(frames, reader.trajectory_lengths()[0])
        self.assertTrue(np.allclose(data, self.xyz))
Example 6
 def test_set_element(self):
     reader = api.source(self.traj_files, top=self.pdb_file)
     pca = api.pca()
     p = api.pipeline([reader, pca])
     self.assertTrue(p._is_parametrized())
     pca_out = pca.get_output()
     tica = api.tica(lag=self.generated_lag)
     # replace pca with tica
     p.set_element(1, tica)
     self.assertFalse(p._is_parametrized(), "After replacing an element, the pipeline should not be parametrized.")
     p.parametrize()
     tica_out = tica.get_output()
     # check if replacement actually happened
     self.assertFalse(np.array_equal(pca_out[0], tica_out[0]),
                      "The output should not be the same when the method got replaced.")
Example 7
    def test_replace_data_source(self):
        reader_xtc = api.source(self.traj_files, top=self.pdb_file)
        reader_gen = DataInMemory(data=self.generated_data)

        kmeans = api.kmeans(k=10)
        assert hasattr(kmeans, '_chunks')
        p = api.pipeline([reader_xtc, kmeans])
        out1 = kmeans.get_output()
        # replace source
        print(reader_gen)
        p.set_element(0, reader_gen)
        assert hasattr(kmeans, '_chunks')
        p.parametrize()
        out2 = kmeans.get_output()
        self.assertFalse(np.array_equal(out1, out2), "Data source changed, so should the resulting clusters.")
Example 8
 def test_is_parametrized(self):
     # construct pipeline with all possible transformers
     p = api.pipeline(
         [
             api.source(self.traj_files, top=self.pdb_file),
             api.tica(),
             api.pca(),
             api.cluster_kmeans(k=50),
             api.cluster_regspace(dmin=50),
             api.cluster_uniform_time(k=20)
         ], run=False
     )
     self.assertFalse(p._is_parametrized(), "If run=false, the pipeline should not be parametrized.")
     p.parametrize()
     self.assertTrue(p._is_parametrized(), "If parametrized was called, the pipeline should be parametrized.")
Example 9
    def testIteratorAccess(self):
        reader = api.source(self.trajfile, top=self.topfile)
        assert isinstance(reader, FeatureReader)

        frames = 0
        data = []
        for i, X in reader:
            assert isinstance(X, np.ndarray)
            frames += X.shape[0]
            data.append(X)

        self.assertEqual(frames, reader.trajectory_lengths()[0])
        data = np.vstack(data)
        # compare against the flattened (n_frames, 9) layout of the input
        self.assertTrue(np.allclose(data, self.xyz.reshape(-1, 9)))
Example 10
    def test_lagged_access_small_files(self):
        """ itraj 0 should be skipped, since it is too short."""
        top = self.topfile
        trajs = [
            create_traj(top=top, length=10, format='.xtc', dir=self.tmpdir)[0],
            create_traj(top=top, length=20, format='.xtc', dir=self.tmpdir)[0]
        ]

        reader = source(trajs, top=top)
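        # chunk=0 yields whole trajectories per iteration; with lag=11 the
        # 10-frame trajectory cannot form any lagged pair, so itraj 0 is skipped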
        it = reader.iterator(lag=11, chunk=0)
        res = {}
        with it:
            for itraj, x, y in it:
                res[itraj] = (x.shape, y.shape)

        self.assertNotIn(0, res)
        self.assertIn(1, res)
Example 11
 def test_is_parametrized(self):
     # construct pipeline with all possible transformers
     p = api.pipeline([
         api.source(self.traj_files, top=self.pdb_file),
         api.tica(),
         api.pca(),
         api.cluster_kmeans(k=50),
         api.cluster_regspace(dmin=50),
         api.cluster_uniform_time(k=20)
     ],
                      run=False)
     self.assertFalse(
         p._is_estimated(),
         "If run=false, the pipeline should not be parametrized.")
     p.parametrize()
     self.assertTrue(
         p._is_estimated(),
         "If parametrized was called, the pipeline should be parametrized.")
Example 12
    def test_old_db_conversion(self):
        # prior to version 2.1, the database only contained length entries (ints stored as strings);
        # check that the conversion happens
        with NamedTemporaryFile(suffix='.npy', delete=False) as f:
            db = TrajectoryInfoCache(None)
            fn = f.name
            np.save(fn, [1, 2, 3])
            f.close()  # on Windows, the file must be closed before it can be reopened
            reader = api.source(fn)
            hash = db._get_file_hash(fn)
            from pyemma.coordinates.data.util.traj_info_backends import DictDB
            db._database = DictDB()
            db._database.db_version = 0

            info = db[fn, reader]
            assert info.length == 3
            assert info.ndim == 1
            assert info.offsets == []
Example 13
    def test_lagged_stridden_access(self):
        reader = api.source([self.trajfile, self.trajfile2], top=self.topfile)
        reader.chunksize = 210
        strides = [2, 3, 5, 7, 15]
        lags = [1, 3, 7, 10, 30]
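        # each (stride, lag) pair must reproduce the lag-shifted, strided slice
        # xyz[lag::stride] of the flattened input (see the assertions below)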
        err_msg = "not equal for stride=%i, lag=%i"
        for stride in strides:
            for lag in lags:
                chunks = {itraj: [] for itraj in range(reader.number_of_trajectories())}
                for itraj, _, Y in reader.iterator(stride=stride, lag=lag):
                    chunks[itraj].append(Y)
                chunks[0] = np.vstack(chunks[0])
                np.testing.assert_almost_equal(
                        chunks[0], self.xyz.reshape(-1, 9)[lag::stride], err_msg=err_msg % (stride, lag))

                chunks[1] = np.vstack(chunks[1])
                np.testing.assert_almost_equal(
                        chunks[1], self.xyz2.reshape(-1, 9)[lag::stride], err_msg=err_msg % (stride, lag))
Example 14
    def test_fragmented_reader_random_access1(self):
        with TemporaryDirectory() as td:
            trajfiles = []
            for i in range(3):
                trajfiles.append(
                    create_traj(start=i * 10, dir=td, length=20)[0])
            topfile = get_top()
            trajfiles = [(trajfiles[0], trajfiles[1]), trajfiles[0],
                         trajfiles[2]]

            source = coor.source(trajfiles, top=topfile)
            assert isinstance(source, FragmentedTrajectoryReader)

            for r in source._readers:
                if not isinstance(r, (list, tuple)):
                    r = [r]
                for _r in r:
                    _r._return_traj_obj = True

            from collections import defaultdict
            for chunksize in [0, 2, 3, 100000]:
                frames = defaultdict(list)
                with source.iterator(chunk=chunksize,
                                     return_trajindex=True,
                                     stride=self.stride) as it:
                    for itraj, t in it:
                        frames[itraj].append(t)

                dest = []
                for itraj in frames.keys():
                    dest.append(frames[itraj][0])

                    for t in frames[itraj][1:]:
                        dest[-1] = dest[-1].join(t)

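                # self.stride is an (n, 2) random-access array of
                # (trajectory_index, frame_index) pairs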
                keys = np.unique(self.stride[:, 0])
                for i, coords in enumerate(dest):
                    if i in keys:
                        traj = mdtraj.load(trajfiles[i], top=topfile)
                        np.testing.assert_equal(
                            coords.xyz,
                            traj.xyz[np.array(
                                self.stride[self.stride[:, 0] == i][:, 1])],
                            err_msg="not equal for chunksize=%s" % chunksize)
Example 15
 def test_set_element(self):
     reader = api.source(self.traj_files, top=self.pdb_file)
     pca = api.pca()
     p = api.pipeline([reader, pca])
     self.assertTrue(p._is_estimated())
     pca_out = pca.get_output()
     tica = api.tica(lag=self.generated_lag)
     # replace pca with tica
     p.set_element(1, tica)
     self.assertFalse(
         p._is_estimated(),
         "After replacing an element, the pipeline should not be parametrized."
     )
     p.parametrize()
     tica_out = tica.get_output()
     # check if replacement actually happened
     self.assertFalse(
         np.array_equal(pca_out[0], tica_out[0]),
         "The output should not be the same when the method got replaced.")
Example 16
    def test_cols_with_features(self):
        trajs = glob.glob(
            pkg_resources.resource_filename('pyemma.coordinates.tests',
                                            'data/bpti_mini.xtc'))
        top = pkg_resources.resource_filename('pyemma.coordinates.tests',
                                              'data/bpti_ca.pdb')
        reader = api.source(trajs, top=top)
        feat = reader.featurizer
        inds = feat.pairs(feat.select('name CA'))
        reader.featurizer.add_distances(inds)
        # select the first and third distance columns
        cols = np.array((0, 2))
        ref = mdtraj.load(trajs, top=top)
        ref = mdtraj.compute_distances(ref, inds)
        ref = ref[:, cols]

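        # `cols` restricts the iterator output to the selected feature columns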
        it = reader.iterator(chunk=0, return_trajindex=False, cols=cols)
        with it:
            for x in it:
                np.testing.assert_equal(x, ref)
Example 17
    def setUpClass(cls):
        with numpy_random_seed(123):
            import msmtools.generation as msmgen

            # generate HMM with two Gaussians
            cls.P = np.array([[0.99, 0.01],
                              [0.01, 0.99]])
            cls.T = 40000
            means = [np.array([-1, 1]), np.array([1, -1])]
            widths = [np.array([0.3, 2]), np.array([0.3, 2])]
            # continuous trajectory
            cls.X = np.zeros((cls.T, 2))
            # hidden trajectory
            dtraj = msmgen.generate_traj(cls.P, cls.T)
            for t in range(cls.T):
                s = dtraj[t]
                cls.X[t, 0] = widths[s][0] * np.random.randn() + means[s][0]
                cls.X[t, 1] = widths[s][1] * np.random.randn() + means[s][1]
            # Set the lag time:
            cls.lag = 10
            # Compute mean free data:
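            # mref averages over both the first and the last T-lag frames, matching
            # the symmetrized (reversible) estimate below; mref_nr uses only the
            # first T-lag frames for the non-reversible estimate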
            mref = (np.sum(cls.X[:-cls.lag, :], axis=0) +
                    np.sum(cls.X[cls.lag:, :], axis=0)) / float(2*(cls.T-cls.lag))
            mref_nr = np.sum(cls.X[:-cls.lag, :], axis=0) / float(cls.T-cls.lag)
            cls.X_mf = cls.X - mref[None, :]
            cls.X_mf_nr = cls.X - mref_nr[None, :]
            # Compute correlation matrices:
            cls.cov_ref = (np.dot(cls.X_mf[:-cls.lag, :].T, cls.X_mf[:-cls.lag, :]) +
                           np.dot(cls.X_mf[cls.lag:, :].T, cls.X_mf[cls.lag:, :])) / float(2 * (cls.T - cls.lag))
            cls.cov_ref_nr = np.dot(cls.X_mf_nr[:-cls.lag, :].T, cls.X_mf_nr[:-cls.lag, :]) / float(cls.T - cls.lag)
            cls.cov_tau_ref = (np.dot(cls.X_mf[:-cls.lag, :].T, cls.X_mf[cls.lag:, :]) +
                               np.dot(cls.X_mf[cls.lag:, :].T, cls.X_mf[:-cls.lag, :])) / float(2 * (cls.T - cls.lag))
            cls.cov_tau_ref_nr = np.dot(cls.X_mf_nr[:-cls.lag, :].T, cls.X_mf_nr[cls.lag:, :]) / float(cls.T - cls.lag)

            # do unscaled TICA
            reader = api.source(cls.X, chunk_size=0)
            cls.tica_obj = api.tica(data=reader, lag=cls.lag, dim=1, kinetic_map=False)
            # non-reversible TICA
            cls.tica_obj_nr = api.tica(data=reader, lag=cls.lag, dim=1, kinetic_map=False, reversible=False)
Example 18
    def setUpClass(cls):
        with numpy_random_seed(123):
            import msmtools.generation as msmgen

            # generate HMM with two Gaussians
            cls.P = np.array([[0.99, 0.01],
                              [0.01, 0.99]])
            cls.T = 40000
            means = [np.array([-1, 1]), np.array([1, -1])]
            widths = [np.array([0.3, 2]), np.array([0.3, 2])]
            # continuous trajectory
            cls.X = np.zeros((cls.T, 2))
            # hidden trajectory
            dtraj = msmgen.generate_traj(cls.P, cls.T)
            for t in range(cls.T):
                s = dtraj[t]
                cls.X[t, 0] = widths[s][0] * np.random.randn() + means[s][0]
                cls.X[t, 1] = widths[s][1] * np.random.randn() + means[s][1]
            cls.lag = 10
            # do unscaled TICA
            reader = api.source(cls.X, chunk_size=0)
            cls.tica_obj = api.tica(data=reader, lag=cls.lag, dim=1, kinetic_map=False)
Example 19
    def test_RA_high_stride(self):
        """ Ensure a random-access pattern is used for high stride/chunksize combinations, to avoid memory issues. """
        from pyemma.coordinates.util.patches import iterload

        n = int(1e5)
        n_bytes = 3 * 3 * 8 * n  # ~7.2 MB (3 * 3 * 8 bytes per frame)
        savable_formats_mdtraj_18 = ('.xtc', '.trr', '.dcd', '.h5', '.binpos',
                                     '.nc', '.netcdf', '.ncdf', '.tng')
        for ext in savable_formats_mdtraj_18:
            traj = create_traj(length=n, dir=self.tmpdir, format=ext)[0]

            from unittest.mock import patch
            # temporarily overwrite the memory cutoff with a smaller value, to trigger the switch to RA stride.
            with patch(
                    'pyemma.coordinates.util.patches.iterload.MEMORY_CUTOFF',
                    n_bytes - 1):
                r = coor.source(traj, top=get_top())
                it = r.iterator(stride=1000, chunk=100000)
                next(it)
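                # with the lowered memory cutoff, this stride/chunk combination
                # should have triggered the switch to random-access iteration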
                assert iterload._DEACTIVATE_RANDOM_ACCESS_OPTIMIZATION or it._mditer.is_ra_iter

                out_ra = r.get_output(stride=1000, chunk=10000)
            it = r.iterator(stride=1)
            next(it)
            assert iterload._DEACTIVATE_RANDOM_ACCESS_OPTIMIZATION or not it._mditer.is_ra_iter
            out = r.get_output(stride=1000)
            np.testing.assert_equal(out_ra, out)

            # check max stride exceeding
            it = r.iterator(stride=iterload.MAX_STRIDE_SWITCH_TO_RA + 1)
            next(it)
            assert iterload._DEACTIVATE_RANDOM_ACCESS_OPTIMIZATION or it._mditer.is_ra_iter

            it = r.iterator(stride=iterload.MAX_STRIDE_SWITCH_TO_RA)
            next(it)
            assert iterload._DEACTIVATE_RANDOM_ACCESS_OPTIMIZATION or not it._mditer.is_ra_iter
Example 20
    def testTimeLaggedAccess(self):
        # each frame has 2 atoms with 3 coords = 6 coords per frame.
        # coords are sequential through all frames and start with 0.

        lags = [2, 200]

        chunksizes = [1, 100]

        for lag in lags:
            for chunksize in chunksizes:
                log.info("chunksize=%i\tlag=%i" % (chunksize, lag))

                lagged_chunks = []
                reader = api.source(self.trajfile, top=self.topfile)
                reader.chunksize = chunksize
                for _, _, y in reader.iterator(lag=lag):
                    lagged_chunks.append(y)

                coords = self.xyz.reshape((self.xyz.shape[0], -1))

                for ii, c in enumerate(lagged_chunks[:-1]):
                    # every chunk except the last should have at most `chunksize` frames
                    self.assertTrue(c.shape[0] <= chunksize)
                    # the ii-th lagged chunk covers frames
                    # [ii*chunksize + lag, ii*chunksize + lag + chunksize)
                    ind1 = ii * chunksize + lag
                    ind2 = ind1 + chunksize
                    #log.debug("coor slice[%i: %i]" % (ind1, ind2))
                    np.testing.assert_allclose(c, coords[ind1:ind2])

                # TODO: check the last lagged frame

                # the last lagged chunk is missing "lag" frames of the input,
                # unless it is padded to maintain the chunksize

                last_chunk = lagged_chunks[-1]
Example 22
    def test_exceptions(self):
        # inaccessible files
        not_existent = ''.join(
            chr(i) for i in np.random.randint(65, 90, size=10)) + '.npy'
        bad = [not_existent]  # should be inaccessible or non-existent
        with self.assertRaises(ValueError) as cm:
            api.source(bad)
        # assertions must come after the assertRaises block; inside it they
        # would never execute, because api.source raises first
        assert bad[0] in str(cm.exception)

        # empty files
        with NamedTemporaryFile(delete=False) as f:
            f.close()
            with self.assertRaises(ValueError) as cm:
                api.source(f.name)
            assert f.name in str(cm.exception)

        # bogus files
        with NamedTemporaryFile(suffix='.npy', delete=False) as f:
            x = np.array([1, 2, 3])
            np.save(f, x)
            with open(f.name, 'wb') as f2:
                f2.write(b'asdf')
            with self.assertRaises(IOError) as cm:
                api.source(f.name)
Example 23
 def test_obtain_csv_file_reader_csv(self):
     reader = api.source(self.csv)
     self.assertIsNotNone(reader, "Reader object should not be none.")
     self.assertTrue(isinstance(reader, CSVReader), "Should be a CSVReader.")
Example 24
 def setUpClass(cls):
     path = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
     cls.pdb_file = os.path.join(path, 'bpti_ca.pdb')
     cls.xtc_file = os.path.join(path, 'bpti_mini.xtc')
     cls.inp = api.source(cls.xtc_file, top=cls.pdb_file)
Example 25
 def test_format_loading_via_feature_reader(self):
     reader = source(traj_file, top=top, dir=self.tmpdir)
     reader.get_output()
Example 26
 def setUpClass(cls):
     path = os.path.join(os.path.split(__file__)[0], 'data')
     cls.pdb_file = os.path.join(path, 'bpti_ca.pdb')
     cls.xtc_file = os.path.join(path, 'bpti_mini.xtc')
     cls.inp = api.source(cls.xtc_file, top=cls.pdb_file)
Example 27
 def test_pdb_traj_unsupported(self):
     with self.assertRaises(ValueError) as c, tempfile.NamedTemporaryFile(
             suffix='.pdb') as ntf:
         api.source([ntf.name], top=self.bpti_pdbfile)
     # the assertion must run after the assertRaises block, otherwise it never executes
     assert 'PDB' in c.exception.args[0]
Example 28
 def test_no_transform(self):
     reader_xtc = api.source(self.traj_files, top=self.pdb_file)
     api.pipeline([reader_xtc, api.cluster_kmeans(k=10)])._chain[-1].get_output()
     api.pipeline([reader_xtc, api.cluster_regspace(dmin=10)])._chain[-1].get_output()
     api.pipeline([reader_xtc, api.cluster_uniform_time()])._chain[-1].get_output()
Example 30
 def test_obtain_numpy_file_reader_npz(self):
     reader = api.source(self.npz)
     self.assertIsNotNone(reader, "Reader object should not be none.")
     self.assertTrue(
         isinstance(reader, NumPyFileReader), "Should be a NumPyFileReader.")
Example 32
 def test_data_in_mem(self):
     # make sure cache is not used for data in memory!
     data = [np.empty((3, 3))] * 3
     api.source(data)
     self.assertEqual(self.db.num_entries, 0)
Example 33
 def test_data_in_memory_without_first_two_trajs(self):
     data_in_memory = coor.source(self.data, chunksize=10)
     out = data_in_memory.get_output(stride=self.stride2)
     np.testing.assert_array_almost_equal(out[2], [self.data[2][0]])
Example 34
 def test_data_in_mem(self):
     # make sure cache is not used for data in memory!
     data = [np.empty((3, 3))] * 3
     api.source(data)
     assert len(self.db._database) == 1
Example 35
    def __init__(self,
                 trajectories,
                 topologyfile=None,
                 chunksize=1000,
                 featurizer=None):
        self._args = (trajectories, topologyfile, chunksize, featurizer)
        # sanity checks
        assert isinstance(trajectories, (list, tuple)), \
            "input trajectories should be of list or tuple type"
        # if it contains no further list: treat as single trajectory
        if not any([isinstance(traj, (list, tuple)) for traj in trajectories]):
            trajectories = [trajectories]
        # if not list of lists, treat as single-element-fragment-trajectory
        trajectories = [
            traj if isinstance(traj, (list, tuple)) else [traj]
            for traj in trajectories
        ]
        # some trajectory should be provided
        assert len(trajectories) > 0, "no input trajectories provided"
        # call super
        super(FragmentedTrajectoryReader, self).__init__(chunksize=chunksize)
        self._is_reader = True
        # number of trajectories
        self._ntraj = len(trajectories)
        # store readers
        from pyemma.coordinates.api import source

        self._readers = [[
            source(input_item,
                   features=featurizer,
                   top=topologyfile,
                   chunksize=chunksize) for input_item in trajectories[itraj]
        ] for itraj in range(0, self._ntraj)]

        # check all readers have same dimension
        if len(set(itraj_r.ndim for r in self._readers for itraj_r in r)) != 1:
            # locate the offending reader:
            last_dim = -1
            for r in self._readers:
                for itraj_r in r:
                    if last_dim == -1:
                        last_dim = itraj_r.ndim
                    if itraj_r.ndim != last_dim:
                        raise ValueError(
                            "%s has different dimension (%i) than expected (%i)"
                            % (itraj_r.describe(), itraj_r.ndim, last_dim))

        from collections import defaultdict
        self._reader_by_filename = defaultdict(list)
        for r in self._readers:
            for itraj_r in r:
                for filename in itraj_r.filenames:
                    self._reader_by_filename[filename].append(itraj_r)

        # lengths array per reader
        self._reader_lengths = [[
            reader.trajectory_length(0, 1) for reader in self._readers[itraj]
        ] for itraj in range(0, self._ntraj)]
        # composite trajectory length
        self._lengths = [
            sum(self._reader_lengths[itraj])
            for itraj in range(0, self._ntraj)
        ]
        # mapping reader_index -> cumulative length
        self._cumulative_lengths = [
            np.cumsum(self._reader_lengths[itraj])
            for itraj in range(0, self._ntraj)
        ]
        # store trajectory files
        self._trajectories = trajectories
        self._filenames = trajectories
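
A minimal usage sketch for the constructor above, assuming the pyemma.coordinates.source entry point shown in the earlier examples (the file names are hypothetical placeholders):

    from pyemma.coordinates import source

    # a tuple/list entry groups fragments that form one trajectory in time;
    # a plain string entry is an ordinary single-file trajectory
    trajs = [('traj_part1.xtc', 'traj_part2.xtc'),  # one fragmented trajectory
             'traj_other.xtc']                      # one regular trajectory
    reader = source(trajs, top='structure.pdb')     # a FragmentedTrajectoryReader
    # per-trajectory lengths are the summed fragment lengths (cf. _lengths above)
    print(reader.trajectory_lengths())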