def __init__(self, sqlite_db, column_dispatcher, identifier=None):
    """Interpret `column_dispatcher`"""
    self.sqlite_db = sqlite_db
    self.identifier = identifier
    self.sqltransactions = SQLTransactions(sqlite_db, identifier)
    self._column_dispatcher = column_dispatcher
    self.name = None
    self._columns, _index_names = [], set()
    for n, p in column_dispatcher.items():
        validate_no_backtick(n, "column")
        validate_no_backtick(p, "table_part")
        if isinstance(n, SQLiteIndexName):
            _index_names.add(n)
        else:
            self._columns.append([n])
        if self.name is None:
            self.name = p
    if len(_index_names) == 0:
        msg = "StreamedDataTableWizard(): no index"
        raise GeneFabDatabaseException(msg, table=self.name)
    elif len(_index_names) > 1:
        msg = "StreamedDataTableWizard(): indexes of parts do not match"
        _kw = dict(table=self.name, index_names=_index_names)
        raise GeneFabDatabaseException(msg, **_kw)
    self._index_name = _index_names.pop()
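# Illustrative sketch (not part of the original module): `column_dispatcher` is
# expected to map column names -- plus exactly one SQLiteIndexName key -- to the
# names of the table parts that hold them; the first part name becomes self.name.
# The key and part names below are hypothetical.
#
#     from collections import OrderedDict
#     column_dispatcher = OrderedDict([
#         (SQLiteIndexName("index"), "TABLE:data://0"),   # index column
#         ("GLDS-1/rna-seq/sample-1", "TABLE:data://0"),  # data column, part 0
#         ("GLDS-1/rna-seq/sample-2", "TABLE:data://1"),  # data column, part 1
#     ])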
class TempSelect():
    """Temporary table or view generated from `query`"""

    def __init__(self, *, sqlite_db, query, targets, kind="TABLE",
                 _depends_on=None, msg=None):
        self.sqlite_db = sqlite_db
        self._depends_on = _depends_on  # keeps sources from being deleted early
        self.query, self.targets, self.kind = query, targets, kind
        self.name = "TEMP:" + random_unique_string(seed=query)
        self.sqltransactions = SQLTransactions(self.sqlite_db, self.name)
        with self.sqltransactions.exclusive("TempSelect") as (_, execute):
            if msg:
                GeneFabLogger.info(msg)
            try:
                execute(f"CREATE {self.kind} `{self.name}` as {query}")
            except OperationalError as e:
                reraise_operational_error(self, e)
            else:
                query_repr = repr(query.lstrip()[:200] + "...")
                msg = f"Created temporary SQLite {self.kind}"
                GeneFabLogger.info(f"{msg} {self.name} from\n {query_repr}")

    def __del__(self, desc="TempSelect/__del__"):
        with self.sqltransactions.exclusive(desc) as (_, execute):
            try:
                execute(f"DROP {self.kind} `{self.name}`")
            except OperationalError as e:
                msg = f"Failed to drop temporary {self.kind} {self.name}"
                GeneFabLogger.error(msg, exc_info=e)
            else:
                msg = f"Dropped temporary SQLite {self.kind} {self.name}"
                GeneFabLogger.info(msg)
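# Hedged usage sketch (the database path and query below are hypothetical, not
# part of the original module): TempSelect materializes `query` under a random
# "TEMP:"-prefixed name and drops it again when the object is garbage-collected.
#
#     selected = TempSelect(
#         sqlite_db="/tmp/data.sqlite",                    # hypothetical path
#         query="SELECT * FROM `source_table` LIMIT 100",  # hypothetical query
#         targets="*", kind="VIEW",
#         msg="Creating temporary view for downstream streaming",
#     )
#     print(selected.name)  # e.g. "TEMP:<random_unique_string>"
#     del selected          # __del__ issues the corresponding DROP VIEW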
class StreamedDataTable(StreamedTable):
    """Table streamed from SQLite query"""

    def __init__(self, *, sqlite_db, source_select, targets, query_filter,
                 na_rep=None):
        """Infer index names and columns, retain connection and query information"""
        from genefab3.db.sql.streamed_tables import SQLiteIndexName
        _split3 = lambda c: (c[0].split("/", 2) + ["*", "*"])[:3]
        self.sqlite_db = sqlite_db
        self.source_select = source_select
        self.sqltransactions = SQLTransactions(sqlite_db, source_select.name)
        self.targets = targets
        self.query_filter = query_filter
        self.na_rep = na_rep
        self.query = f"""
            SELECT {targets} FROM `{source_select.name}` {query_filter}
        """
        desc = "tables/StreamedDataTable"
        with self.sqltransactions.concurrent(desc) as (connection, execute):
            try:
                cursor = connection.cursor()
                cursor.execute(self.query)
                self._index_name = SQLiteIndexName(cursor.description[0][0])
                self._columns = [_split3(c) for c in cursor.description[1:]]
                _count_query = f"SELECT count(*) FROM ({self.query})"
                _nrows = (execute(_count_query).fetchone() or [0])[0]
                self.shape = (_nrows, len(self._columns))
            except OperationalError as e:
                reraise_operational_error(self, e)
        self.accessions = {c[0] for c in self._columns}
        self.n_index_levels = 1
        self.datatypes, self.gct_validity_set = set(), set()

    @property
    def gct_valid(self):
        """Test if valid for GCT, i.e. has exactly one datatype, and the datatype is supported"""
        return (
            (len(self.datatypes) == 1) and
            self.gct_validity_set and all(self.gct_validity_set)
        )

    def move_index_boundary(self, *, to):
        """Like pandas methods reset_index() and set_index(), but by numeric position"""
        if to == 0:
            self.n_index_levels = 0
            self.shape = (self.shape[0], len(self._columns) + 1)
        elif to == 1:
            self.n_index_levels = 1
            self.shape = (self.shape[0], len(self._columns))
        else:
            msg = "StreamedDataTable.move_index_boundary() only moves to 0 or 1"
            raise GeneFabConfigurationException(msg, to=to)

    @property
    def index_levels(self):
        """Iterate index level line by line"""
        if self.n_index_levels:
            yield ["*", "*", self._index_name]

    @property
    def column_levels(self):
        """Iterate column levels line by line"""
        if self.n_index_levels:
            yield from zip(*self._columns)
        else:
            yield from zip(["*", "*", self._index_name], *self._columns)

    @property
    def index(self):
        """Iterate index line by line, like in pandas"""
        if self.n_index_levels:
            index_query = f"SELECT `{self._index_name}` FROM ({self.query})"
            desc = "tables/StreamedDataTable/index"
            with self.sqltransactions.concurrent(desc) as (_, execute):
                try:
                    if self.na_rep is None:
                        yield from execute(index_query)
                    else:
                        _na_tup = (self.na_rep,)
                        for value, *_ in execute(index_query):
                            yield _na_tup if value is None else (value,)
                except OperationalError as e:
                    reraise_operational_error(self, e)
        else:
            yield from ([] for _ in range(self.shape[0]))

    @property
    def values(self):
        """Iterate values line by line, like in pandas"""
        desc = "tables/StreamedDataTable/values"
        try:
            if self.na_rep is None:
                if self.n_index_levels:
                    with self.sqltransactions.concurrent(desc) as (_, execute):
                        for _, *vv in execute(self.query):
                            yield vv
                else:
                    with self.sqltransactions.concurrent(desc) as (_, execute):
                        yield from execute(self.query)
            else:
                if self.shape[0] > 50:
                    msg = "StreamedDataTable with custom na_rep may be slow"
                    GeneFabLogger.warning(msg)
                if self.n_index_levels:
                    with self.sqltransactions.concurrent(desc) as (_, execute):
                        for _, *vv in execute(self.query):
                            yield [self.na_rep if v is None else v for v in vv]
                else:
                    with self.sqltransactions.concurrent(desc) as (_, execute):
                        for vv in execute(self.query):
                            yield [self.na_rep if v is None else v for v in vv]
        except OperationalError as e:
            reraise_operational_error(self, e)
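# Hedged usage sketch (assumes `selected` is a TempSelect-like object as in the
# sketch above; the path and filter are hypothetical): the table exposes its
# shape, header levels, index, and values as generators, so rows are streamed
# from SQLite rather than loaded into memory at once.
#
#     table = StreamedDataTable(
#         sqlite_db="/tmp/data.sqlite", source_select=selected,
#         targets="*", query_filter="", na_rep="NA",
#     )
#     print(table.shape)                       # (n_rows, n_columns)
#     for row_index, row in zip(table.index, table.values):
#         ...                                  # consume one streamed row at a time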
class SQLiteObject():
    """Universal wrapper for cached objects"""

    def __init__(self, *, sqlite_db, identifier=None, table_schemas=None):
        """Initialize SQLiteObject, ensure tables in `sqlite_db`"""
        self.sqlite_db, self.table_schemas = sqlite_db, table_schemas
        self.identifier = identifier
        self.changed = None
        self.sqltransactions = SQLTransactions(sqlite_db, identifier)
        desc = "tables/ensure_schema"
        with self.sqltransactions.concurrent(desc) as (_, execute):
            for table, schema in (table_schemas or {}).items():
                execute(
                    "CREATE TABLE IF NOT EXISTS `{}` ({})".format(
                        validate_no_backtick(table, "table"),
                        ", ".join(
                            "`" + validate_no_backtick(f, "field") + "` " + k
                            for f, k in schema.items()
                        ),
                    ),
                )

    @classmethod
    def iterparts(cls, table, connection, *, must_exist=True,
                  partname_mask="{table}://{i}"):
        """During an open connection, iterate all parts of `table` and their index and column names"""
        for i in count():
            partname = table if i == 0 else partname_mask.format(table=table, i=i)
            if must_exist:
                query = f"SELECT * FROM `{partname}` LIMIT 0"
                try:
                    cursor = connection.cursor()
                    cursor.execute(query).fetchall()
                except OperationalError:
                    break
                else:
                    index_name = SQLiteIndexName(cursor.description[0][0])
                    columns = [c[0] for c in cursor.description[1:]]
                    yield partname, index_name, columns
            else:
                yield partname, None, None

    @classmethod
    def drop_all_parts(cls, table, connection):
        """During an open connection, drop all parts of `table`"""
        _iterparts = cls.iterparts(table, connection, must_exist=True)
        for partname, *_ in list(_iterparts):
            try:
                connection.execute(f"DROP TABLE IF EXISTS `{partname}`")
            except Exception as e:
                GeneFabLogger.error(f"Could not drop {partname}", exc_info=e)
                raise
            else:
                GeneFabLogger.info(f"Dropped {partname} (if it existed)")

    def is_stale(self, *, timestamp_table=None, id_field=None, db_type=None,
                 ignore_conflicts=False):
        """Evaluates to True if underlying data in need of update, otherwise False"""
        if (timestamp_table is None) or (id_field is None):
            msg = "did not pass arguments to self.is_stale(), will never update"
            GeneFabLogger.warning(f"{type(self).__name__} {msg}")
        else:
            db_type = db_type or f"{type(self).__name__}"
            desc = f"{db_type}/is_stale"
            self_id_value = getattr(self, id_field)
            query = f"""SELECT `timestamp` FROM `{timestamp_table}`
                WHERE `{id_field}` == "{self_id_value}" """
            if ignore_conflicts:
                read_transaction = self.sqltransactions.unconditional
            else:
                read_transaction = self.sqltransactions.concurrent
            with read_transaction(desc) as (_, execute):
                ret = execute(query).fetchall()
                if len(ret) == 0:
                    _staleness = True
                elif (len(ret) == 1) and (len(ret[0]) == 1):
                    _staleness = (ret[0][0] < self.timestamp)
                else:
                    _staleness = None
            if (_staleness is None) and (not ignore_conflicts):
                with self.sqltransactions.exclusive(desc) as (connection, _):
                    msg = "Conflicting timestamp values for SQLiteObject"
                    GeneFabLogger.warning(f"{msg}\n ({self_id_value})")
                    self.drop(connection=connection)
                _staleness = True
            if _staleness is True:
                GeneFabLogger.info(f"{self_id_value} is stale, staging update")
            return _staleness

    def update(self):
        """Update underlying data in SQLite"""
        msg = "did not define self.update(), will never update"
        GeneFabLogger.warning(f"{type(self).__name__} {msg}")

    def retrieve(self):
        """Retrieve underlying data from SQLite"""
        msg = "did not define self.retrieve(), will always retrieve `None`"
        GeneFabLogger.warning(f"{type(self).__name__} {msg}")
        return None

    def cleanup(self):
        """Actions to be performed after request completion"""
        pass

    @property
    def data(self):
        """Main interface: returns data associated with this SQLiteObject;
        will have auto-updated itself in the process if necessary"""
        if self.is_stale():
            self.update()
            self.changed = True
        else:
            self.changed = False
        Thread(target=self.cleanup).start()
        return self.retrieve()
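# Hedged sketch of the intended subclassing pattern (the class, table, and field
# names below are hypothetical): subclasses override update() and retrieve(), and
# the .data property then transparently runs is_stale() -> update() -> retrieve()
# and schedules cleanup() in a background thread.
#
#     class CachedBlob(SQLiteObject):
#         def __init__(self, sqlite_db, identifier, timestamp):
#             self.timestamp = timestamp
#             super().__init__(
#                 sqlite_db=sqlite_db, identifier=identifier,
#                 table_schemas={"blobs": {"identifier": "TEXT", "blob": "BLOB"}},
#             )
#         def is_stale(self, **kwargs):
#             return super().is_stale(
#                 timestamp_table="timestamp_table", id_field="identifier",
#             )
#         def update(self): ...    # re-fetch and INSERT/REPLACE the blob
#         def retrieve(self): ...  # SELECT the blob and return it
#
#     blob = CachedBlob("/tmp/blobs.sqlite", "GLDS-1/file.csv", timestamp=0).data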
class ResponseCache():
    """LRU response cache; responses are identified by context.identity,
    dropped if underlying (meta)data changed"""

    def __init__(self, sqlite_dbs):
        self.sqlite_db = sqlite_dbs.response_cache["db"]
        self.maxdbsize = sqlite_dbs.response_cache["maxsize"] or float("inf")
        if self.sqlite_db is None:
            msg = "LRU SQL cache DISABLED by client parameter"
            _logw(f"ResponseCache():\n {msg}")
        else:
            self.sqltransactions = SQLTransactions(self.sqlite_db)
            desc = "response_cache/ensure_schema"
            with self.sqltransactions.concurrent(desc) as (_, execute):
                for table, schema in RESPONSE_CACHE_SCHEMAS:
                    execute(f"CREATE TABLE IF NOT EXISTS `{table}` {schema}")

    bypass_if_disabled = lambda f: wraps(f)(
        lambda self, *args, **kwargs: ResponseContainer(content=None)
        if self.sqlite_db is None else f(self, *args, **kwargs)
    )

    def _validate_content_type(self, response_container):
        """Check if type of passed content is supported by ResponseCache"""
        _is = lambda _type: isinstance(response_container.content, _type)
        if not response_container.obj.accessions:
            return "none or unrecognized accession names in object"
        elif _is(Response) or (not _is(Callable)):
            return "only data iterators are supported"
        else:
            return None

    def _itercompress(self, content, isinstance=isinstance, str=str,
                      bytes=bytes, Binary=Binary):
        """Iteratively compress chunks generated by callable `content`"""
        compressor = compressobj()
        for uncompressed_chunk in content():
            if isinstance(uncompressed_chunk, str):
                chunk = compressor.compress(uncompressed_chunk.encode())
            elif isinstance(uncompressed_chunk, bytes):
                chunk = compressor.compress(uncompressed_chunk)
            else:
                _type = type(uncompressed_chunk).__name__
                raise TypeError("Content chunk is not str or bytes", _type)
            if chunk:
                yield Binary(chunk)
        chunk = compressor.flush(Z_FINISH)
        if chunk:
            yield Binary(chunk)

    @bypass_if_disabled
    def put(self, response_container, context):
        """Store response object blob in response_cache table, if possible;
        this will happen in a parallel thread"""
        problem = self._validate_content_type(response_container)
        if problem:
            msg = f"{context.identity}\n {problem}"
            _logw(f"ResponseCache(), did not store:\n {msg}")
            return
        def _put(desc="response_cache/put"):
            retrieved_at = int(datetime.now().timestamp())
            with self.sqltransactions.exclusive(desc) as (_, execute):
                try:
                    execute("""DELETE FROM `response_cache`
                        WHERE `context_identity` == ?""", [context.identity])
                    _it = self._itercompress
                    for i, chunk in enumerate(_it(response_container.content)):
                        execute("""INSERT INTO `response_cache`
                            (context_identity, i, chunk, mimetype, retrieved_at)
                            VALUES (?,?,?,?,?)""", [
                            context.identity, i, chunk,
                            response_container.mimetype, retrieved_at,
                        ])
                    for accession in response_container.obj.accessions:
                        execute("""DELETE FROM `accessions_used`
                            WHERE `accession` == ?
                            AND `context_identity` == ?""",
                            [accession, context.identity])
                        execute("""INSERT INTO `accessions_used`
                            (accession, context_identity) VALUES (?, ?)""",
                            [accession, context.identity])
                except (OperationalError, ZlibError, TypeError) as e:
                    msg = f"{context.identity}, {e!r}"
                    _loge(f"ResponseCache(), could not store:\n {msg}")
                    raise
                else:
                    _logi(f"ResponseCache(), stored:\n {context.identity}")
        Thread(target=_put).start()

    def _drop_by_context_identity(self, execute, context_identity):
        """Drop responses with given context.identity"""
        execute("""DELETE FROM `accessions_used`
            WHERE `context_identity` == ?""", [context_identity])
        execute("""DELETE FROM `response_cache`
            WHERE `context_identity` == ?""", [context_identity])

    @bypass_if_disabled
    def drop(self, accession, desc="response_cache/drop"):
        """Drop responses for given accession"""
        with self.sqltransactions.exclusive(desc) as (_, execute):
            try:
                query = """SELECT `context_identity` FROM `accessions_used`
                    WHERE `accession` == ?"""
                identity_entries = execute(query, [accession]).fetchall()
                for context_identity, *_ in identity_entries:
                    self._drop_by_context_identity(execute, context_identity)
            except OperationalError as e:
                msg = "ResponseCache():\n could not drop responses for %s: %s"
                _loge(msg, accession, repr(e))
                raise
            else:
                msg = "ResponseCache():\n dropped %s cached response(s) for %s"
                _logi(msg, len(identity_entries), accession)

    @bypass_if_disabled
    def drop_all(self, desc="response_cache/drop_all"):
        """Drop all cached responses"""
        with self.sqltransactions.exclusive(desc) as (_, execute):
            try:
                execute("DELETE FROM `accessions_used`")
                execute("DELETE FROM `response_cache`")
            except OperationalError as e:
                _loge(f"ResponseCache().drop_all():\n failed with {e!r}")
                raise
            else:
                _logi("ResponseCache():\n dropped all cached Flask responses")

    def _iterdecompress(self, cid, desc="response_cache/_iterdecompress"):
        """Iteratively decompress chunks retrieved from database by `context_identity`"""
        decompressor = decompressobj()
        with self.sqltransactions.concurrent(desc) as (_, execute):
            query = """SELECT `mimetype` FROM `response_cache`
                WHERE `context_identity` == ? LIMIT 1"""
            mimetype, = execute(query, [cid]).fetchone() or [None]
            if mimetype is None:
                raise EOFError("No chunks found in response_cache")
            else:
                yield mimetype
            query = """SELECT `chunk`,`mimetype` FROM `response_cache`
                WHERE `context_identity` == ? ORDER BY `i` ASC"""
            for chunk, _mimetype in execute(query, [cid]):
                if mimetype != _mimetype:
                    raise OperationalError("Stored mimetypes of chunks differ")
                decompressed_chunk = decompressor.decompress(chunk)
                if decompressed_chunk:
                    yield decompressed_chunk.decode()
            decompressed_chunk = decompressor.flush()
            if decompressed_chunk:
                yield decompressed_chunk.decode()

    @apply_hack(bypass_uncached_views)
    @bypass_if_disabled
    def get(self, context):
        """Retrieve cached response object blob from response_cache table
        if possible; otherwise return empty ResponseContainer()"""
        try:
            for value in self._iterdecompress(context.identity):
                pass  # test retrieval and decompression before returning
        except EOFError:
            _logi(f"ResponseCache(), nothing yet for:\n {context.identity}")
            return ResponseContainer(content=None)
        except OperationalError as e:
            msg = "could not retrieve, staging replacement"
            _logw(f"ResponseCache() {msg}:\n {context.identity}, {e!r}")
            return ResponseContainer(content=None)
        except ZlibError as e:
            msg = "could not decompress, staging replacement"
            _logw(f"ResponseCache() {msg}:\n {context.identity}, {e!r}")
            return ResponseContainer(content=None)
        else:
            _logi(f"ResponseCache(), retrieving:\n {context.identity}")
            iterator = self._iterdecompress(context.identity)  # second pass
            # TODO PhoenixIterator?
            mimetype = next(iterator)
            return ResponseContainer(lambda: iterator, mimetype)

    @bypass_if_disabled
    def shrink(self, max_iter=100, max_skids=20, desc="response_cache/shrink"):
        """Drop oldest cached responses to keep file size on disk under `self.maxdbsize`"""
        # TODO: DRY: very similar to genefab3.db.sql.core SQLiteTable.cleanup()
        n_dropped, n_skids = 0, 0
        for _ in range(max_iter):
            current_size = path.getsize(self.sqlite_db)
            if (n_skids < max_skids) and (current_size > self.maxdbsize):
                with self.sqltransactions.concurrent(desc) as (_, execute):
                    query_oldest = """SELECT `context_identity`
                        FROM `response_cache` ORDER BY `retrieved_at` ASC"""
                    cid = (execute(query_oldest).fetchone() or [None])[0]
                    if cid is None:
                        break
                with self.sqltransactions.exclusive(desc) as (connection, execute):
                    try:
                        msg = f"ResponseCache.shrink():\n dropping {cid}"
                        GeneFabLogger.info(msg)
                        self._drop_by_context_identity(execute, cid)
                    except OperationalError as e:
                        msg = f"Rolling back shrinkage due to {e!r}"
                        GeneFabLogger.error(msg, exc_info=e)
                        connection.rollback()  # explicit, to be able to continue
                        break
                    else:
                        connection.commit()
                n_dropped += 1
                n_skids += (path.getsize(self.sqlite_db) >= current_size)
            else:
                break
        if n_dropped:
            _logi(f"ResponseCache():\n shrunk by {n_dropped} entries")
        elif path.getsize(self.sqlite_db) > self.maxdbsize:
            _logw("ResponseCache():\n could not drop entries to shrink")
        if n_skids:
            _logw(f"ResponseCache():\n file did not shrink {n_skids} times")
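# Hedged usage sketch (`sqlite_dbs`, `context`, and make_response() are stand-ins
# for the configuration object, request-context object, and response builder that
# the calling code would supply; they are not defined in this module):
#
#     response_cache = ResponseCache(sqlite_dbs)
#     cached = response_cache.get(context)                 # empty container on miss
#     if cached.content is None:
#         response_container = make_response(...)          # hypothetical builder
#         response_cache.put(response_container, context)  # stored in a thread
#     response_cache.shrink()                              # enforce maxsize on disk
#     response_cache.drop("GLDS-1")                        # invalidate per accession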