Esempio n. 1
0
    def __init__(self, database_filename=None):
        """ Remember the database file name and pick a storage backend.

        Parameters
        ----------
        database_filename : str (optional)
            handed to the sqlite backend; the default is None.

        Notes
        -----
        Backend selection, as intended by the original author:

        * sqlite available, file name given: database stored in that file
        * sqlite available, no file name: in-memory sqlite database
        * sqlite not importable: plain dict backend (a warning is emitted)
        """
        self.database_filename = database_filename

        try:
            # sqlite3 is imported only to find out whether it is available;
            # an ImportError raised anywhere in this branch selects the
            # dict based fallback below.
            import sqlite3  # noqa: F401
            from pyemma.coordinates.data.util.traj_info_backends import SqliteDB
            backend = SqliteDB(database_filename)
        except ImportError:
            warnings.warn("sqlite3 package not available, persistant storage of trajectory info not possible!")
            from pyemma.coordinates.data.util.traj_info_backends import DictDB
            backend = DictDB()

        self._database = backend
Esempio n. 2
0
    def test_old_db_conversion(self):
        """ Accessing a file through a version-0 database must yield full info.

        Prior to 2.1 the database only contained lengths (int as string)
        entries. This test marks the backend as version 0 and checks that a
        lookup still returns an info object with length, ndim and offsets.
        """
        import os  # local import, only needed for the cleanup at the end

        # Only the file name is needed. Leaving the ``with`` block closes the
        # handle before numpy writes to the path (required on Windows).
        with NamedTemporaryFile(suffix='.npy', delete=False) as f:
            fn = f.name

        try:
            db = TrajectoryInfoCache(None)
            np.save(fn, [1, 2, 3])
            reader = api.source(fn)
            # the legacy hash function has to keep working on real files;
            # its result is not needed any further.
            db._get_file_hash(fn)
            from pyemma.coordinates.data.util.traj_info_backends import DictDB
            db._database = DictDB()
            db._database.db_version = 0

            info = db[fn, reader]
            assert info.length == 3
            assert info.ndim == 1
            assert info.offsets == []
        finally:
            # delete=False means nobody else removes the file for us.
            try:
                os.unlink(fn)
            except OSError:
                # best effort: the file may still be held open (e.g. by the
                # reader on Windows); a left-over temp file must not fail
                # the test.
                pass
Esempio n. 3
0
class TrajectoryInfoCache(object):
    """ stores trajectory info objects associated to a file based hash

    The hash is built from the base name of the file, its modification time,
    its size and the first 1024 bytes of its content
    (see :func:`compute_file_hash`).

    Parameters
    ----------
    database_filename : str (optional)
        if given the cache is being made persistent to this file. Otherwise the
        cache is lost after the process has finished.

    Notes
    -----
    Do not instantiate this yourself, but use the instance provided by this
    module (see :func:`instance`).

    """
    _instance = None
    DB_VERSION = 2
    # number of leading bytes of a file which are fed into its hash.
    # Changing this value changes every hash, i.e. invalidates stored entries.
    _HASH_PREFIX_SIZE = 1024

    @staticmethod
    def instance():
        """ Return the TrajectoryInfoCache singleton, creating it on first use.

        Returns
        -------
        cache : TrajectoryInfoCache
            the shared instance. It is backed by the file "traj_info.sqlite3"
            inside ``config.cfg_dir``, or gets no file name at all if no
            configuration directory is set.
        """
        if TrajectoryInfoCache._instance is None:
            # if we do not have a configuration directory yet, we do not want to store
            if config.cfg_dir:
                filename = os.path.join(config.cfg_dir, "traj_info.sqlite3")
            else:
                filename = None
            TrajectoryInfoCache._instance = TrajectoryInfoCache(filename)

        return TrajectoryInfoCache._instance

    def __init__(self, database_filename=None):
        """ Remember the database file name and select a storage backend.

        Parameters
        ----------
        database_filename : str (optional)
            passed on to the sqlite backend.
        """
        self.database_filename = database_filename

        # have no filename, use in memory sqlite db
        # have no sqlite module, use dict
        # have sqlite and file, create db with given filename

        try:
            # imported only to probe for availability of the sqlite module
            import sqlite3
            from pyemma.coordinates.data.util.traj_info_backends import SqliteDB
            self._database = SqliteDB(self.database_filename)
        except ImportError:
            warnings.warn(
                "sqlite3 package not available, persistant storage of trajectory info not possible!"
            )
            from pyemma.coordinates.data.util.traj_info_backends import DictDB
            self._database = DictDB()

    @property
    def current_db_version(self):
        """ version number reported by the storage backend. """
        return self._database.db_version

    @property
    def num_entries(self):
        """ number of entries reported by the storage backend. """
        return self._database.num_entries

    def _handle_csv(self, reader, filename, length):
        """ Re-determine the csv dialect for PyCSVReader instances.

        Does nothing for any other type of reader.

        Parameters
        ----------
        reader : object
            the reader which requested the info.
        filename : str
            file to open for dialect detection.
        length : int
            length stored in the cached info, forwarded to the reader.
        """
        # this is maybe a bit ugly, but so far we do not store the dialect of csv files in
        # the database, so we need to re-do this step in case of a cache hit.
        from pyemma.coordinates.data import PyCSVReader
        if not isinstance(reader, PyCSVReader):
            return
        with open(filename, PyCSVReader.DEFAULT_OPEN_MODE) as fh:
            reader._determine_dialect(fh, length)

    def __getitem__(self, filename_reader_tuple):
        """ Return the info of a file, computing and storing it on a cache miss.

        Parameters
        ----------
        filename_reader_tuple : tuple (filename, reader)
            filename is a str or a pathlib.Path; reader has to provide a
            ``_get_traj_info(filename)`` method, which is used on a miss.

        Returns
        -------
        info : TrajInfo
            the cached entry, or the freshly computed result of the reader.

        Raises
        ------
        IOError
            if the reader fails to compute the info. The original exception
            is attached as ``__cause__``.
        """
        filename, reader = filename_reader_tuple
        if isinstance(filename, Path):
            filename = str(filename)
        abs_path = os.path.abspath(filename)
        key = self.compute_file_hash(abs_path)
        try:
            info = self._database.get(key)
            if not isinstance(info, TrajInfo):
                # treat results we can not interpret like a cache miss
                raise KeyError()
            self._handle_csv(reader, filename, info.length)
            # if path has changed, update it
            if info.abs_path != abs_path:
                info.abs_path = abs_path
                self._database.update(info)
        # handle cache misses and not interpretable results by re-computation.
        # Note: this also handles UnknownDBFormatExceptions!
        # (NOTE(review): presumably a KeyError subclass - confirm in the backends)
        except KeyError:
            try:
                info = reader._get_traj_info(filename)
            # Only wrap ordinary errors: KeyboardInterrupt and SystemExit have
            # to propagate unchanged instead of being disguised as IOError.
            except Exception as e:
                raise IOError('Could not obtain info for file {f}. '
                              'Original error was {e}'.format(f=filename, e=e)) from e
            info.hash_value = key
            info.abs_path = abs_path
            # store info in db
            self.__setitem__(info)

            # save forcefully now
            if hasattr(self._database, 'sync'):
                self._database.sync()

        return info

    def _get_file_hash(self, filename):
        """ Legacy hash of a file, based on the builtin ``hash`` function.

        Combines base name, modification time, size and the first 1024 bytes
        of the file. Note that Python 3 salts the builtin hash of str and
        bytes per process by default, so the result is in general not
        comparable between interpreter runs; :func:`compute_file_hash` does
        not have this limitation.

        Parameters
        ----------
        filename : str

        Returns
        -------
        hash_value : str
            decimal string representation of the combined hash.
        """
        statinfo = os.stat(filename)

        # only remember file name without path, to re-identify it when its
        # moved
        hash_value = hash(os.path.basename(filename))
        hash_value ^= hash(statinfo.st_mtime)
        hash_value ^= hash(statinfo.st_size)

        # now read the leading bytes of the file and hash them
        with open(filename, mode='rb') as fh:
            data = fh.read(TrajectoryInfoCache._HASH_PREFIX_SIZE)

        hash_value ^= hash(data)
        return str(hash_value)

    @staticmethod
    def compute_file_hash(filename):
        """ MD5 based hash of a file, used as the key of the cache.

        The digest covers, in this order: the base name of the file (so the
        path does not matter), its modification time, its size and the first
        1024 bytes of its content.

        Parameters
        ----------
        filename : str

        Returns
        -------
        digest : str
            hexadecimal MD5 digest.
        """
        statinfo = os.stat(filename)
        # now read the leading bytes of the file and hash them
        with open(filename, mode='rb') as fh:
            data = fh.read(TrajectoryInfoCache._HASH_PREFIX_SIZE)

        hasher = hashlib.md5()
        hasher.update(os.path.basename(filename).encode('utf-8'))
        hasher.update(str(statinfo.st_mtime).encode('ascii'))
        hasher.update(str(statinfo.st_size).encode('ascii'))
        hasher.update(data)
        return hasher.hexdigest()

    def __setitem__(self, traj_info):
        """ Store the given info object in the backend.

        Note: this takes a single argument and is called explicitly as
        ``self.__setitem__(info)``; the item assignment syntax
        ``cache[key] = value`` does not work with this signature.
        """
        self._database.set(traj_info)

    def clear(self):
        """ Forward to the ``clear`` method of the backend. """
        self._database.clear()

    def close(self):
        """ you most likely never want to call this! """
        self._database.close()