def __init__(self, database_filename=None):
    """Select and create the storage backend of the cache.

    Parameters
    ----------
    database_filename : str (optional)
        forwarded to the sqlite backend. The comments of the original code
        state that without a file name an in-memory sqlite database is used.

    Notes
    -----
    If an ImportError occurs while setting up the sqlite backend, a warning
    is issued and a dictionary based backend is used instead.
    """
    self.database_filename = database_filename
    try:
        # imported only to find out whether the sqlite3 module is available
        import sqlite3  # noqa: F401
        from pyemma.coordinates.data.util.traj_info_backends import SqliteDB
        backend = SqliteDB(self.database_filename)
    except ImportError:
        warnings.warn(
            "sqlite3 package not available, persistant storage of trajectory info not possible!"
        )
        from pyemma.coordinates.data.util.traj_info_backends import DictDB
        backend = DictDB()
    self._database = backend
def test_old_db_conversion(self):
    """Check that a cache backed by a version 0 database still yields full info.

    Prior to 2.1 the database only contained lengths (int as string) entries,
    so a lookup against such a database has to end up with a complete info
    object (length, ndim, offsets) for the file.
    """
    import os  # local import, only needed to clean up the temporary file

    from pyemma.coordinates.data.util.traj_info_backends import DictDB

    # Only the unique file name is needed. Leaving the ``with`` block closes
    # the handle before the file is re-opened by name (needed on Windows).
    with NamedTemporaryFile(suffix='.npy', delete=False) as f:
        fn = f.name

    try:
        np.save(fn, [1, 2, 3])

        db = TrajectoryInfoCache(None)
        reader = api.source(fn)
        # the old style hash still has to be computable for the file
        assert isinstance(db._get_file_hash(fn), str)

        # replace the backend by an empty one claiming to be of version 0
        db._database = DictDB()
        db._database.db_version = 0

        info = db[fn, reader]
        assert info.length == 3
        assert info.ndim == 1
        assert info.offsets == []
    finally:
        # delete=False means nobody else removes the file. Best effort only:
        # on Windows a reader may still keep the file open.
        try:
            os.unlink(fn)
        except OSError:
            pass
class TrajectoryInfoCache(object):
    """ stores trajectory information associated to a file based hash

    The hash is built from the base name of the file, its modification time,
    its size and its first 1024 bytes (see :meth:`compute_file_hash`).

    Parameters
    ----------
    database_filename : str (optional)
        if given the cache is being made persistent to this file. Otherwise
        the cache is lost after the process has finished.

    Notes
    -----
    Do not instantiate this yourself, but use the instance provided by this
    module.
    """
    _instance = None
    DB_VERSION = 2
    # number of leading bytes of a file which enter its hash
    _HASH_PREFIX_BYTES = 1024

    @staticmethod
    def instance():
        """ :returns the TrajectoryInfoCache singleton instance"""
        if TrajectoryInfoCache._instance is None:
            # if we do not have a configuration directory yet, we do not want
            # to store anything on disk.
            if config.cfg_dir:
                filename = os.path.join(config.cfg_dir, "traj_info.sqlite3")
            else:
                filename = None
            TrajectoryInfoCache._instance = TrajectoryInfoCache(filename)
        return TrajectoryInfoCache._instance

    def __init__(self, database_filename=None):
        """ Create the cache and select its storage backend.

        Parameters
        ----------
        database_filename : str (optional)
            forwarded to the sqlite backend.
        """
        self.database_filename = database_filename

        # have no filename, use in memory sqlite db
        # have no sqlite module, use dict
        # have sqlite and file, create db with given filename
        try:
            # imported only to find out whether the sqlite3 module is available
            import sqlite3  # noqa: F401
            from pyemma.coordinates.data.util.traj_info_backends import SqliteDB
            self._database = SqliteDB(self.database_filename)
        except ImportError:
            warnings.warn(
                "sqlite3 package not available, persistant storage of trajectory info not possible!"
            )
            from pyemma.coordinates.data.util.traj_info_backends import DictDB
            self._database = DictDB()

    @property
    def current_db_version(self):
        """ version reported by the underlying database backend """
        return self._database.db_version

    @property
    def num_entries(self):
        """ number of entries reported by the underlying database backend """
        return self._database.num_entries

    def _handle_csv(self, reader, filename, length):
        """ Re-determine the csv dialect of the given reader.

        Does nothing, if reader is not a PyCSVReader.
        """
        # this is maybe a bit ugly, but so far we do not store the dialect of csv files in
        # the database, so we need to re-do this step in case of a cache hit.
        from pyemma.coordinates.data import PyCSVReader
        if not isinstance(reader, PyCSVReader):
            return

        with open(filename, PyCSVReader.DEFAULT_OPEN_MODE) as fh:
            reader._determine_dialect(fh, length)

    def __getitem__(self, filename_reader_tuple):
        """ Look up the info of a file and compute it on a cache miss.

        Parameters
        ----------
        filename_reader_tuple : tuple (filename, reader)
            filename is a str or a Path. The reader has to provide
            ``_get_traj_info(filename)``, which is used on a cache miss.

        Returns
        -------
        info : the cached or the newly computed info object.

        Raises
        ------
        IOError
            if the reader failed to obtain the info of the file. The original
            error is attached as the cause.
        """
        filename, reader = filename_reader_tuple
        if isinstance(filename, Path):
            filename = str(filename)
        abs_path = os.path.abspath(filename)
        key = self.compute_file_hash(abs_path)
        try:
            info = self._database.get(key)
            if not isinstance(info, TrajInfo):
                raise KeyError()
            self._handle_csv(reader, filename, info.length)
            # if path has changed, update it
            if info.abs_path != abs_path:
                info.abs_path = abs_path
                self._database.update(info)
        # handle cache misses and not interpretable results by re-computation.
        # Note: this also handles UnknownDBFormatExceptions!
        except KeyError:
            try:
                info = reader._get_traj_info(filename)
            # Only wrap ordinary errors. KeyboardInterrupt and SystemExit are
            # no failures of the file and have to pass through untouched.
            except Exception as e:
                raise IOError('Could not obtain info for file {f}. '
                              'Original error was {e}'.format(f=filename, e=e)) from e
            info.hash_value = key
            info.abs_path = abs_path
            # store info in db
            self.__setitem__(info)

            # save forcefully now
            if hasattr(self._database, 'sync'):
                self._database.sync()

        return info

    def _get_file_hash(self, filename):
        """ Hash of a file based on the builtin hash function.

        Combines base name, modification time, size and the first 1024 bytes
        of the file and returns the result as str. Lookups via
        ``__getitem__`` use :meth:`compute_file_hash` instead.

        NOTE(review): the builtin hash of str and bytes is salted per
        interpreter process by default, so these values are presumably not
        comparable between runs -- confirm before relying on them.
        """
        statinfo = os.stat(filename)
        # only remember file name without path, to re-identify it when its
        # moved
        hash_value = hash(os.path.basename(filename))
        hash_value ^= hash(statinfo.st_mtime)
        hash_value ^= hash(statinfo.st_size)

        # now read the first 1024 bytes and hash them
        with open(filename, mode='rb') as fh:
            data = fh.read(TrajectoryInfoCache._HASH_PREFIX_BYTES)

        hash_value ^= hash(data)
        return str(hash_value)

    @staticmethod
    def compute_file_hash(filename):
        """ MD5 hex digest identifying a file.

        The digest is fed, in this order, with the base name of the file
        (so the file can be re-identified after it has been moved), its
        modification time, its size and its first 1024 bytes.

        Parameters
        ----------
        filename : str
            path of an existing file.

        Returns
        -------
        digest : str
        """
        statinfo = os.stat(filename)

        # now read the first 1024 bytes and hash them
        with open(filename, mode='rb') as fh:
            data = fh.read(TrajectoryInfoCache._HASH_PREFIX_BYTES)

        hasher = hashlib.md5()
        hasher.update(os.path.basename(filename).encode('utf-8'))
        hasher.update(str(statinfo.st_mtime).encode('ascii'))
        hasher.update(str(statinfo.st_size).encode('ascii'))
        hasher.update(data)
        return hasher.hexdigest()

    def __setitem__(self, traj_info):
        """ store the given info object in the database.

        Note that, unlike the usual item assignment protocol, this takes the
        info object as its only argument.
        """
        self._database.set(traj_info)

    def clear(self):
        """ forwards to the clear method of the database backend """
        self._database.clear()

    def close(self):
        """ you most likely never want to call this! """
        self._database.close()