Example 1
    def setUp(self):
        self.work_dir = tempfile.mkdtemp("traj_cache_test")
        self.tmpfile = tempfile.mktemp(dir=self.work_dir)
        self.db = TrajectoryInfoCache(self.tmpfile)

        # overwrite TrajectoryInfoCache._instance with self.db...
        TrajectoryInfoCache._instance = self.db
Example 2
    def setUp(self):
        self.work_dir = tempfile.mkdtemp(prefix="traj_cache_test")
        self.tmpfile = tempfile.mktemp(dir=self.work_dir)
        self.db = TrajectoryInfoCache(self.tmpfile)

        # overwrite TrajectoryInfoCache._instance with self.db...
        TrajectoryInfoCache._instance = self.db
        config.use_trajectory_lengths_cache = True
Example 3
    def test_old_db_conversion(self):
        # prior to 2.1, the database only contained length entries (ints stored as strings);
        # check that the conversion takes place
        with NamedTemporaryFile(suffix='.npy', delete=False) as f:
            db = TrajectoryInfoCache(None)
            fn = f.name
            np.save(fn, [1, 2, 3])
            f.close()  # needed on Windows: the file cannot be reopened by name while still open
            reader = api.source(fn)
            hash = db._get_file_hash(fn)
            db._database = {hash: str(3)}

            info = db[fn, reader]
            assert info.length == 3
            assert info.ndim == 1
            assert info.offsets == []
Example 4
    def test_no_sqlite(self):
        # create a new instance (__init__ has to be called); install a temporary import hook that raises ImportError for sqlite3
        import sys
        del sys.modules['sqlite3']

        class meta_ldr(object):
            def find_module(self, fullname, path):
                if fullname.startswith('sqlite3'):
                    return self

            def load_module(self, fullname, path=None):
                raise ImportError()

        import warnings
        try:
            sys.meta_path.insert(0, meta_ldr())
            # import sqlite3
            with warnings.catch_warnings(record=True) as cw:
                db = TrajectoryInfoCache()
                self.assertNotIsInstance(db._database, SqliteDB)
            self.assertEqual(len(cw), 1)
            self.assertIn("sqlite3 package not available",
                          cw[0].message.args[0])
        finally:
            del sys.meta_path[0]
Example 5
    def setUp(self):
        self.tmpfile = tempfile.mktemp(dir=self.work_dir)
        self.db = TrajectoryInfoCache(self.tmpfile)

        assert len(self.db._database) == 1, len(self.db._database)
        assert 'db_version' in self.db._database
        assert int(self.db._database['db_version']) >= 1
Example 6
def load_topology_cached(top_file):
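    # Dispatch on the input type: a path string is loaded through the cache, keyed
    # by its file hash via TrajectoryInfoCache.compute_file_hash; Topology objects
    # are returned as-is and Trajectory objects are unwrapped to their topology.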
    if isinstance(top_file, str):
        return _load(top_file, TrajectoryInfoCache.compute_file_hash(top_file))
    if isinstance(top_file, Topology):
        return top_file
    if isinstance(top_file, Trajectory):
        return top_file.topology
    raise NotImplementedError()
Example 7
    def test_old_db_conversion(self):
        # prior to 2.1, the database only contained length entries (ints stored as strings);
        # check that the conversion takes place
        with NamedTemporaryFile(suffix='.npy', delete=False) as f:
            db = TrajectoryInfoCache(None)
            fn = f.name
            np.save(fn, [1, 2, 3])
            f.close()  # needed on Windows: the file cannot be reopened by name while still open
            reader = api.source(fn)
            hash = db._get_file_hash(fn)
            from pyemma.coordinates.data.util.traj_info_backends import DictDB
            db._database = DictDB()
            db._database.db_version = 0

            info = db[fn, reader]
            assert info.length == 3
            assert info.ndim == 1
            assert info.offsets == []
Example 8
    def test_corrupted_db(self):
        with NamedTemporaryFile(mode='w', suffix='.dat', delete=False) as f:
            f.write("makes no sense!!!!")
            f.close()
        name = f.name
        db = TrajectoryInfoCache(name)

        # ensure we can perform lookups on the broken db without exception.
        r = api.source(xtcfiles[0], top=pdbfile)
        db[xtcfiles[0], r]
Example 9
    def test_no_sqlite(self):
        def import_mock(name, *args):
            if name == 'sqlite3':
                raise ImportError("we pretend not to have this")
            return __import__(name, *args)

        from pyemma.coordinates.data.util import traj_info_cache
        with mock.patch('pyemma.coordinates.data.util.traj_info_cache.__import__',
                        side_effect=import_mock,
                        create=True):
            TrajectoryInfoCache._instance = None
            TrajectoryInfoCache(self.tmpfile)
Example 10
    def test_corrupted_db(self):
        with NamedTemporaryFile(mode='w', suffix='.dat', delete=False) as f:
            f.write("makes no sense!!!!")
            f.close()
        name = f.name
        import warnings
        with warnings.catch_warnings(record=True) as cm:
            warnings.simplefilter('always')
            db = TrajectoryInfoCache(name)
            assert len(cm) == 1
            assert "corrupted" in str(cm[-1].message)

        # ensure we can perform lookups on the broken db without exception.
        r = api.source(xtcfiles[0], top=pdbfile)
        db[xtcfiles[0], r]
Example 11
    def test_in_memory_db(self):
        """ new instance, not yet saved to disk, no lru cache avail """
        old_cfg_dir = config.cfg_dir
        try:
            config._cfg_dir = ''
            db = TrajectoryInfoCache()
            reader = pyemma.coordinates.source(xtcfiles, top=pdbfile)

            info = db[xtcfiles[0], reader]
            self.assertIsInstance(db._database, SqliteDB)

            directory = db._database._database_from_key(info.hash_value)
            assert directory is None
        finally:
            from pyemma.util.exceptions import ConfigDirectoryException
            try:
                config.cfg_dir = old_cfg_dir
            except ConfigDirectoryException:
                pass
Example 12
    def filenames(self, filename_list):
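        # Validate the given files, then gather per-file length, dimension and offsets,
        # via the TrajectoryInfoCache singleton when the length cache is enabled,
        # otherwise by reading each file directly.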

        if isinstance(filename_list, str):
            filename_list = [filename_list]

        uniq = set(filename_list)
        if len(uniq) != len(filename_list):
            self.logger.warning("duplicate files/arrays detected")
            filename_list = list(uniq)

        from pyemma.coordinates.data.data_in_memory import DataInMemory

        if self._is_reader:
            if isinstance(self, DataInMemory):
                import warnings
                warnings.warn('filenames are not being used for DataInMemory')
                return

            self._ntraj = len(filename_list)
            if self._ntraj == 0:
                raise ValueError("empty file list")

            # validate files
            for f in filename_list:
                try:
                    stat = os.stat(f)
                except EnvironmentError:
                    self.logger.exception('Error during access of file "%s"' %
                                          f)
                    raise ValueError('could not read file "%s"' % f)

                if not os.path.isfile(f):  # can be true for symlinks to directories
                    raise ValueError('"%s" is not a valid file' % f)

                if stat.st_size == 0:
                    raise ValueError('file "%s" is empty' % f)

            # number of trajectories/data sets
            self._filenames = filename_list
            # determine len and dim via cache lookup,
            lengths = []
            offsets = []
            ndims = []
            # avoid cyclic imports
            from pyemma.coordinates.data.util.traj_info_cache import TrajectoryInfoCache
            from pyemma._base.progress import ProgressReporter
            pg = ProgressReporter()
            pg.register(len(filename_list), 'Obtaining file info')
            with pg.context():
                for filename in filename_list:
                    if config.use_trajectory_lengths_cache:
                        info = TrajectoryInfoCache.instance()[filename, self]
                    else:
                        info = self._get_traj_info(filename)
                    # nested data set support.
                    if hasattr(info, 'children'):
                        lengths.append(info.length)
                        offsets.append(info.offsets)
                        ndims.append(info.ndim)
                        for c in info.children:
                            lengths.append(c.length)
                            offsets.append(c.offsets)
                            ndims.append(c.ndim)
                    else:
                        lengths.append(info.length)
                        offsets.append(info.offsets)
                        ndims.append(info.ndim)
                    if len(filename_list) > 3:
                        pg.update(1)

            # ensure all trajs have same dim
            if not np.unique(ndims).size == 1:
                # group files by their dimensions to give the user a hint
                ndims = np.array(ndims)
                filename_list = np.asarray(filename_list)
                sort_inds = np.argsort(ndims)
                import itertools, operator
                res = {}
                for dim, files in itertools.groupby(
                        zip(ndims[sort_inds], filename_list[sort_inds]),
                        operator.itemgetter(0)):
                    res[dim] = list(f[1] for f in files)

                raise ValueError(
                    "Input data has different dimensions ({dims})!"
                    " Files grouped by dimensions: {groups}".format(
                        dims=res.keys(), groups=res))

            self._ndim = ndims[0]
            self._lengths = lengths
            self._offsets = offsets

        else:
            # propagate this until we finally have a reader
            self.data_producer.filenames = filename_list
Example 13
    def test_get_instance(self):
        # test for exceptions in singleton creation
        inst = TrajectoryInfoCache.instance()
        inst.current_db_version
Example 14
class TestTrajectoryInfoCache(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.old_instance = TrajectoryInfoCache.instance()
        config.use_trajectory_lengths_cache = True

    def setUp(self):
        self.work_dir = tempfile.mkdtemp("traj_cache_test")
        self.tmpfile = tempfile.mktemp(dir=self.work_dir)
        self.db = TrajectoryInfoCache(self.tmpfile)

        # overwrite TrajectoryInfoCache._instance with self.db...
        TrajectoryInfoCache._instance = self.db

    def tearDown(self):
        self.db.close()
        os.unlink(self.tmpfile)

        import shutil
        shutil.rmtree(self.work_dir, ignore_errors=True)

    @classmethod
    def tearDownClass(cls):
        TrajectoryInfoCache._instance = cls.old_instance
        config.use_trajectory_lengths_cache = False

    def test_get_instance(self):
        # test for exceptions in singleton creation
        inst = TrajectoryInfoCache.instance()
        inst.current_db_version
        self.assertIs(inst, self.db)

    def test_store_load_traj_info(self):
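        # store the info for a freshly written file, close the cache, re-open it on
        # the same database file and check that the entry round-trips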
        x = np.random.random((10, 3))
        my_conf = config()
        my_conf.cfg_dir = self.work_dir
        with mock.patch('pyemma.coordinates.data.util.traj_info_cache.config',
                        my_conf):
            with NamedTemporaryFile(delete=False) as fh:
                np.savetxt(fh.name, x)
                reader = api.source(fh.name)
                info = self.db[fh.name, reader]
                self.db.close()
                self.db.__init__(self.db._database.filename)
                info2 = self.db[fh.name, reader]
                self.assertEqual(info2, info)

    def test_exceptions(self):
        # inaccessible files
        not_existant = ''.join(
            chr(i)
            for i in np.random.random_integers(65, 90, size=10)) + '.npy'
        bad = [not_existant]  # should be inaccessible or non-existent
        with self.assertRaises(ValueError) as cm:
            api.source(bad)
        assert bad[0] in str(cm.exception)

        # empty files
        with NamedTemporaryFile(delete=False) as f:
            f.close()
            with self.assertRaises(ValueError) as cm:
                api.source(f.name)
            assert f.name in str(cm.exception)

    def test_featurereader_xtc(self):
        # build the reader with the length cache disabled, so the db lookups below are cache misses
        with settings(use_trajectory_lengths_cache=False):
            reader = FeatureReader(xtcfiles, pdbfile)

        results = {}
        for f in xtcfiles:
            traj_info = self.db[f, reader]
            results[f] = traj_info.ndim, traj_info.length, traj_info.offsets

        expected = {}
        for f in xtcfiles:
            with mdtraj.open(f) as fh:
                length = len(fh)
                ndim = fh.read(1)[0].shape[1]
                offsets = fh.offsets if hasattr(fh, 'offsets') else []
                expected[f] = ndim, length, offsets

        np.testing.assert_equal(results, expected)

    def test_npy_reader(self):
        lengths_and_dims = [(7, 3), (23, 3), (27, 3)]
        data = [np.empty((n, dim)) for n, dim in lengths_and_dims]
        files = []
        with TemporaryDirectory() as td:
            for i, x in enumerate(data):
                fn = os.path.join(td, "%i.npy" % i)
                np.save(fn, x)
                files.append(fn)

            reader = NumPyFileReader(files)

            # cache it and compare
            results = {
                f: (self.db[f, reader].length, self.db[f, reader].ndim,
                    self.db[f, reader].offsets)
                for f in files
            }
            expected = {
                f: (len(data[i]), data[i].shape[1], [])
                for i, f in enumerate(files)
            }
            np.testing.assert_equal(results, expected)

    def test_csvreader(self):
        data = np.random.random((101, 3))
        fn = tempfile.mktemp()
        try:
            np.savetxt(fn, data)
            # calc offsets
            offsets = [0]
            with open(fn, PyCSVReader.DEFAULT_OPEN_MODE) as new_fh:
                while new_fh.readline():
                    offsets.append(new_fh.tell())
            reader = PyCSVReader(fn)
            assert reader.dimension() == 3
            trajinfo = reader._get_traj_info(fn)
            np.testing.assert_equal(offsets, trajinfo.offsets)
        finally:
            os.unlink(fn)

    def test_fragmented_reader(self):
        top_file = pkg_resources.resource_filename(__name__, 'data/test.pdb')
        trajfiles = []
        nframes = []
        with TemporaryDirectory() as wd:
            for _ in range(3):
                f, _, l = create_traj(top_file, dir=wd)
                trajfiles.append(f)
                nframes.append(l)
            # three trajectories: one consisting of all three, one consisting of the first,
            # one consisting of the first and the last
            reader = api.source(
                [trajfiles, [trajfiles[0]], [trajfiles[0], trajfiles[2]]],
                top=top_file)
            np.testing.assert_equal(
                reader.trajectory_lengths(),
                [sum(nframes), nframes[0], nframes[0] + nframes[2]])

    def test_feature_reader_xyz(self):
        traj = mdtraj.load(xtcfiles, top=pdbfile)
        length = len(traj)

        with NamedTemporaryFile(mode='wb', suffix='.xyz', delete=False) as f:
            fn = f.name
            traj.save_xyz(fn)
            f.close()
            reader = pyemma.coordinates.source(fn, top=pdbfile)
            self.assertEqual(reader.trajectory_length(0), length)

    def test_data_in_mem(self):
        # make sure cache is not used for data in memory!
        data = [np.empty((3, 3))] * 3
        api.source(data)
        self.assertEqual(self.db.num_entries, 0)

    def test_old_db_conversion(self):
        # prior to 2.1, the database only contained length entries (ints stored as strings);
        # check that the conversion takes place
        with NamedTemporaryFile(suffix='.npy', delete=False) as f:
            db = TrajectoryInfoCache(None)
            fn = f.name
            np.save(fn, [1, 2, 3])
            f.close()  # needed on Windows: the file cannot be reopened by name while still open
            reader = api.source(fn)
            hash = db._get_file_hash(fn)
            from pyemma.coordinates.data.util.traj_info_backends import DictDB
            db._database = DictDB()
            db._database.db_version = 0

            info = db[fn, reader]
            assert info.length == 3
            assert info.ndim == 1
            assert info.offsets == []

    def test_corrupted_db(self):
        with NamedTemporaryFile(mode='w', suffix='.dat', delete=False) as f:
            f.write("makes no sense!!!!")
            f.close()
        name = f.name
        import warnings
        with warnings.catch_warnings(record=True) as cm:
            warnings.simplefilter('always')
            db = TrajectoryInfoCache(name)
            assert len(cm) == 1
            assert "corrupted" in str(cm[-1].message)

        # ensure we can perform lookups on the broken db without exception.
        r = api.source(xtcfiles[0], top=pdbfile)
        db[xtcfiles[0], r]

    def test_n_entries(self):
        self.assertEqual(self.db.num_entries, 0)
        assert TrajectoryInfoCache._instance is self.db
        pyemma.coordinates.source(xtcfiles, top=pdbfile)
        self.assertEqual(self.db.num_entries, len(xtcfiles))

    def test_max_n_entries(self):
        data = [np.random.random((10, 3)) for _ in range(20)]
        max_entries = 10
        config.traj_info_max_entries = max_entries
        files = []
        with TemporaryDirectory() as td:
            for i, arr in enumerate(data):
                f = os.path.join(td, "%s.npy" % i)
                np.save(f, arr)
                files.append(f)
            pyemma.coordinates.source(files)
        self.assertLessEqual(self.db.num_entries, max_entries)
        self.assertGreater(self.db.num_entries, 0)

    def test_max_size(self):
        data = [np.random.random((150, 10)) for _ in range(150)]
        max_size = 1

        files = []
        config.show_progress_bars = False
        with TemporaryDirectory() as td, settings(traj_info_max_size=max_size):
            for i, arr in enumerate(data):
                f = os.path.join(td, "%s.txt" % i)
                # save as txt to enforce creation of offsets
                np.savetxt(f, arr)
                files.append(f)
            pyemma.coordinates.source(files)

        self.assertLessEqual(
            os.stat(self.db.database_filename).st_size / 1024,
            config.traj_info_max_size)
        self.assertGreater(self.db.num_entries, 0)

    def test_no_working_directory(self):
        import sqlite3
        # this is the case as long as the user has not yet created a config directory via config.save()
        self.db._database = SqliteDB(filename=None)

        # trigger caching
        pyemma.coordinates.source(xtcfiles, top=pdbfile)

    @unittest.skip("not yet functional")
    def test_no_sqlite(self):
        def import_mock(name, *args):
            if name == 'sqlite3':
                raise ImportError("we pretend not to have this")
            return __import__(name, *args)

        from pyemma.coordinates.data.util import traj_info_cache
        with mock.patch('pyemma.coordinates.data.util.traj_info_cache.__import__',
                        side_effect=import_mock,
                        create=True):
            TrajectoryInfoCache._instance = None
            TrajectoryInfoCache(self.tmpfile)
Example 15
    def filenames(self, filename_list):

        if isinstance(filename_list, string_types):
            filename_list = [filename_list]

        uniq = set(filename_list)
        if len(uniq) != len(filename_list):
            self.logger.warning("duplicate files/arrays detected")
            filename_list = list(uniq)

        from pyemma.coordinates.data.data_in_memory import DataInMemory

        if self._is_reader:
            if isinstance(self, DataInMemory):
                import warnings
                warnings.warn('filenames are not being used for DataInMemory')
                return

            self._ntraj = len(filename_list)
            if self._ntraj == 0:
                raise ValueError("empty file list")

            # validate files
            for f in filename_list:
                try:
                    stat = os.stat(f)
                except EnvironmentError:
                    self.logger.exception('Error during access of file "%s"' % f)
                    raise ValueError('could not read file "%s"' % f)

                if not os.path.isfile(f):  # can be true for symlinks to directories
                    raise ValueError('"%s" is not a valid file' % f)

                if stat.st_size == 0:
                    raise ValueError('file "%s" is empty' % f)

            # number of trajectories/data sets
            self._filenames = filename_list
            # determine len and dim via cache lookup,
            lengths = []
            offsets = []
            ndims = []
            # avoid cyclic imports
            from pyemma.coordinates.data.util.traj_info_cache import TrajectoryInfoCache
            if len(filename_list) > 3:
                self._progress_register(len(filename_list), 'Obtaining file info')
            for filename in filename_list:
                if config['use_trajectory_lengths_cache'] == 'True':
                    info = TrajectoryInfoCache.instance()[filename, self]
                else:
                    info = self._get_traj_info(filename)
                lengths.append(info.length)
                offsets.append(info.offsets)
                ndims.append(info.ndim)
                if len(filename_list) > 3:
                    self._progress_update(1)

            # ensure all trajs have same dim
            if not np.unique(ndims).size == 1:
                raise ValueError("input data has different dimensions!"
                                 " Dimensions are = %s" % zip(filename_list, ndims))

            self._ndim = ndims[0]
            self._lengths = lengths
            self._offsets = offsets

        else:
            # propagate this until we finally have a reader
            self.data_producer.filenames = filename_list
Example 16
class TestTrajectoryInfoCache(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.old_instance = TrajectoryInfoCache.instance()
        config.use_trajectory_lengths_cache = True

    def setUp(self):
        self.work_dir = tempfile.mkdtemp(prefix="traj_cache_test")
        self.tmpfile = tempfile.mktemp(dir=self.work_dir)
        self.db = TrajectoryInfoCache(self.tmpfile)

        # overwrite TrajectoryInfoCache._instance with self.db...
        TrajectoryInfoCache._instance = self.db
        config.use_trajectory_lengths_cache = True

    def tearDown(self):
        self.db.close()
        os.unlink(self.tmpfile)

        import shutil
        shutil.rmtree(self.work_dir, ignore_errors=True)

    @classmethod
    def tearDownClass(cls):
        TrajectoryInfoCache._instance = cls.old_instance
        config.use_trajectory_lengths_cache = False

    def test_get_instance(self):
        # test for exceptions in singleton creation
        inst = TrajectoryInfoCache.instance()
        inst.current_db_version
        self.assertIs(inst, self.db)

    def test_store_load_traj_info(self):
        x = np.random.random((10, 3))
        from pyemma.util._config import Config
        my_conf = Config()
        my_conf.cfg_dir = self.work_dir
        with mock.patch('pyemma.coordinates.data.util.traj_info_cache.config',
                        my_conf):
            with NamedTemporaryFile(delete=False) as fh:
                np.savetxt(fh.name, x)
                reader = api.source(fh.name)
                info = self.db[fh.name, reader]
                self.db.close()
                self.db.__init__(self.db._database.filename)
                info2 = self.db[fh.name, reader]
                self.assertEqual(info2, info)

    def test_exceptions(self):
        # inaccessible files
        not_existant = ''.join(
            chr(i) for i in np.random.randint(65, 90, size=10)) + '.npy'
        bad = [not_existant]  # should be inaccessible or non-existent
        with self.assertRaises(ValueError) as cm:
            api.source(bad)
        assert bad[0] in str(cm.exception)

        # empty files
        with NamedTemporaryFile(delete=False) as f:
            f.close()
            with self.assertRaises(ValueError) as cm:
                api.source(f.name)
            assert f.name in str(cm.exception)

        # bogus files
        with NamedTemporaryFile(suffix='.npy', delete=False) as f:
            x = np.array([1, 2, 3])
            np.save(f, x)
            with open(f.name, 'wb') as f2:
                f2.write(b'asdf')
            with self.assertRaises(IOError) as cm:
                api.source(f.name)

    def test_featurereader_xtc(self):
        # build the reader with the length cache disabled, so the db lookups below are cache misses
        with settings(use_trajectory_lengths_cache=False):
            reader = FeatureReader(xtcfiles, pdbfile)

        results = {}
        for f in xtcfiles:
            traj_info = self.db[f, reader]
            results[f] = traj_info.ndim, traj_info.length, traj_info.offsets

        expected = {}
        for f in xtcfiles:
            with mdtraj.open(f) as fh:
                length = len(fh)
                ndim = fh.read(1)[0].shape[1]
                offsets = fh.offsets if hasattr(fh, 'offsets') else []
                expected[f] = ndim, length, offsets

        np.testing.assert_equal(results, expected)

    def test_npy_reader(self):
        lengths_and_dims = [(7, 3), (23, 3), (27, 3)]
        data = [np.empty((n, dim)) for n, dim in lengths_and_dims]
        files = []
        with TemporaryDirectory() as td:
            for i, x in enumerate(data):
                fn = os.path.join(td, "%i.npy" % i)
                np.save(fn, x)
                files.append(fn)

            reader = NumPyFileReader(files)

            # cache it and compare
            results = {
                f: (self.db[f, reader].length, self.db[f, reader].ndim,
                    self.db[f, reader].offsets)
                for f in files
            }
            expected = {
                f: (len(data[i]), data[i].shape[1], [])
                for i, f in enumerate(files)
            }
            np.testing.assert_equal(results, expected)

    def test_csvreader(self):
        data = np.random.random((101, 3))
        fn = tempfile.mktemp()
        try:
            np.savetxt(fn, data)
            # calc offsets
            offsets = [0]
            with open(fn, PyCSVReader.DEFAULT_OPEN_MODE) as new_fh:
                while new_fh.readline():
                    offsets.append(new_fh.tell())
            reader = PyCSVReader(fn)
            assert reader.dimension() == 3
            trajinfo = reader._get_traj_info(fn)
            np.testing.assert_equal(offsets, trajinfo.offsets)
        finally:
            os.unlink(fn)

    def test_fragmented_reader(self):
        top_file = pkg_resources.resource_filename(__name__, 'data/test.pdb')
        trajfiles = []
        nframes = []
        with TemporaryDirectory() as wd:
            for _ in range(3):
                f, _, l = create_traj(top_file, dir=wd)
                trajfiles.append(f)
                nframes.append(l)
            # three trajectories: one consisting of all three, one consisting of the first,
            # one consisting of the first and the last
            reader = api.source(
                [trajfiles, [trajfiles[0]], [trajfiles[0], trajfiles[2]]],
                top=top_file)
            np.testing.assert_equal(
                reader.trajectory_lengths(),
                [sum(nframes), nframes[0], nframes[0] + nframes[2]])

    def test_feature_reader_xyz(self):
        traj = mdtraj.load(xtcfiles, top=pdbfile)
        length = len(traj)

        with NamedTemporaryFile(mode='wb', suffix='.xyz', delete=False) as f:
            fn = f.name
            traj.save_xyz(fn)
            f.close()
            reader = pyemma.coordinates.source(fn, top=pdbfile)
            self.assertEqual(reader.trajectory_length(0), length)

    def test_data_in_mem(self):
        # make sure cache is not used for data in memory!
        data = [np.empty((3, 3))] * 3
        api.source(data)
        self.assertEqual(self.db.num_entries, 0)

    def test_old_db_conversion(self):
        # prior to 2.1, the database only contained length entries (ints stored as strings);
        # check that the conversion takes place
        with NamedTemporaryFile(suffix='.npy', delete=False) as f:
            db = TrajectoryInfoCache(None)
            fn = f.name
            np.save(fn, [1, 2, 3])
            f.close()  # needed on Windows: the file cannot be reopened by name while still open
            reader = api.source(fn)
            hash = db._get_file_hash(fn)
            from pyemma.coordinates.data.util.traj_info_backends import DictDB
            db._database = DictDB()
            db._database.db_version = 0

            info = db[fn, reader]
            assert info.length == 3
            assert info.ndim == 1
            assert info.offsets == []

    def test_corrupted_db(self):
        with NamedTemporaryFile(mode='w', suffix='.dat', delete=False) as f:
            f.write("makes no sense!!!!")
            f.close()
        name = f.name
        import warnings
        with warnings.catch_warnings(record=True) as cm:
            warnings.simplefilter('always')
            db = TrajectoryInfoCache(name)
            assert len(cm) == 1
            assert "corrupted" in str(cm[-1].message)

        # ensure we can perform lookups on the broken db without exception.
        r = api.source(xtcfiles[0], top=pdbfile)
        db[xtcfiles[0], r]

    def test_n_entries(self):
        assert config.use_trajectory_lengths_cache
        self.assertEqual(self.db.num_entries, 0)
        assert TrajectoryInfoCache._instance is self.db
        pyemma.coordinates.source(xtcfiles, top=pdbfile)
        self.assertEqual(self.db.num_entries, len(xtcfiles))

    def test_max_n_entries(self):
        data = [np.random.random((10, 3)) for _ in range(20)]
        max_entries = 10
        config.traj_info_max_entries = max_entries
        files = []
        with TemporaryDirectory() as td:
            for i, arr in enumerate(data):
                f = os.path.join(td, "%s.npy" % i)
                np.save(f, arr)
                files.append(f)
            pyemma.coordinates.source(files)
        self.assertLessEqual(self.db.num_entries, max_entries)
        self.assertGreater(self.db.num_entries, 0)

    def test_max_size(self):
        data = [np.random.random((150, 10)) for _ in range(150)]
        max_size = 1

        files = []
        with TemporaryDirectory() as td, settings(traj_info_max_size=max_size,
                                                  show_progress_bars=False):
            for i, arr in enumerate(data):
                f = os.path.join(td, "%s.txt" % i)
                # save as txt to enforce creation of offsets
                np.savetxt(f, arr)
                files.append(f)
            pyemma.coordinates.source(files)

        self.assertLessEqual(
            os.stat(self.db.database_filename).st_size / 1024,
            config.traj_info_max_size)
        self.assertGreater(self.db.num_entries, 0)

    def test_no_working_directory(self):
        # this is the case as long as the user has not yet created a config directory via config.save()
        self.db._database = SqliteDB(filename=None)

        # trigger caching
        pyemma.coordinates.source(xtcfiles, top=pdbfile)

    def test_no_sqlite(self):
        # create a new instance (__init__ has to be called); install a temporary import hook that raises ImportError for sqlite3
        import sys
        del sys.modules['sqlite3']

        class meta_ldr(object):
            def find_module(self, fullname, path):
                if fullname.startswith('sqlite3'):
                    return self

            def load_module(self, fullname, path=None):
                raise ImportError()

        import warnings
        try:
            sys.meta_path.insert(0, meta_ldr())
            # import sqlite3
            with warnings.catch_warnings(record=True) as cw:
                db = TrajectoryInfoCache()
                self.assertNotIsInstance(db._database, SqliteDB)
            self.assertEqual(len(cw), 1)
            self.assertIn("sqlite3 package not available",
                          cw[0].message.args[0])
        finally:
            del sys.meta_path[0]

    def test_in_memory_db(self):
        """ new instance, not yet saved to disk, no lru cache avail """
        old_cfg_dir = config.cfg_dir
        try:
            config._cfg_dir = ''
            db = TrajectoryInfoCache()
            reader = pyemma.coordinates.source(xtcfiles, top=pdbfile)

            info = db[xtcfiles[0], reader]
            self.assertIsInstance(db._database, SqliteDB)

            directory = db._database._database_from_key(info.hash_value)
            assert directory is None
        finally:
            from pyemma.util.exceptions import ConfigDirectoryException
            try:
                config.cfg_dir = old_cfg_dir
            except ConfigDirectoryException:
                pass

    def test_stress(self):
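        # spawn several interpreter processes that read the same .npy files through
        # pyemma.coordinates.source while sharing one cache directory; the test fails
        # if any subprocess exits with a non-zero return code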
        arrays = [np.empty((5, 2))] * 100
        npy_files = [
            os.path.join(self.work_dir, '{}.npy'.format(i))
            for i in range(len(arrays))
        ]
        [np.save(f, x) for f, x in zip(npy_files, arrays)]
        env = os.environ.copy()
        env['PYEMMA_CFG_DIR'] = self.work_dir
        import subprocess
        import sys
        import time
        script = 'import pyemma; pyemma.coordinates.source({files})' \
            .format(files=npy_files)
        failed = False
        procs = [
            subprocess.Popen([sys.executable, '-c', script], env=env)
            for _ in range(10)
        ]
        error = None
        while procs:
            for proc in procs:
                retcode = proc.poll()
                if retcode is not None:
                    if retcode != 0:
                        pass
                        #stdout = proc.stdout.read()
                        #stderr = proc.stderr.read()
                        #error = '{};;{}'.format(stdout, stderr)
                    procs.remove(proc)
                    #break
                else:  # No process is done, wait a bit and check again.
                    time.sleep(.1)
                    continue

            # Here, `proc` has finished with return code `retcode`
            if retcode is not None and retcode != 0:
                print('process failed with {}'.format(retcode))
                failed = True
                break

        self.assertTrue(not failed, msg=error)
Example 17
    def setUpClass(cls):
        cls.old_instance = TrajectoryInfoCache.instance()
        config.use_trajectory_lengths_cache = True