Example #1
 def values(self):
     """Iterate values line by line, like in pandas"""
     desc = "tables/StreamedDataTable/values"
     try:
         if self.na_rep is None:
             if self.n_index_levels:
                 with self.sqltransactions.concurrent(desc) as (_, execute):
                     for _, *vv in execute(self.query):
                         yield vv
             else:
                 with self.sqltransactions.concurrent(desc) as (_, execute):
                     yield from execute(self.query)
         else:
             if self.shape[0] > 50:
                 msg = "StreamedDataTable with custom na_rep may be slow"
                 GeneFabLogger.warning(msg)
             if self.n_index_levels:
                 with self.sqltransactions.concurrent(desc) as (_, execute):
                     for _, *vv in execute(self.query):
                         yield [self.na_rep if v is None else v for v in vv]
             else:
                 with self.sqltransactions.concurrent(desc) as (_, execute):
                     for vv in execute(self.query):
                         yield [self.na_rep if v is None else v for v in vv]
     except OperationalError as e:
         reraise_operational_error(self, e)
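The branching above keeps the common case (no custom `na_rep`) streaming rows untouched, while a custom `na_rep` pays for a per-row list comprehension. A minimal standalone sketch of the same streaming pattern, using only the standard library (the table and query are illustrative):

import sqlite3

def stream_values(connection, query, na_rep=None):
    """Yield rows one by one, substituting `na_rep` for SQL NULLs."""
    for row in connection.execute(query):
        if na_rep is None:
            yield list(row)
        else:
            yield [na_rep if v is None else v for v in row]

connection = sqlite3.connect(":memory:")
connection.execute("CREATE TABLE t (a, b)")
connection.executemany("INSERT INTO t VALUES (?, ?)", [(1, None), (None, 2)])
for row in stream_values(connection, "SELECT * FROM t", na_rep="NA"):
    print(row)  # [1, 'NA'], then ['NA', 2]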
Example #2
 def __download_as_pandas(self, chunksize, sniff_ahead=2**20):
     """Download and parse data from URL as a table"""
     with self.__tempfile() as tempfile:
         self.url = self.__copyfileobj(tempfile)
         with open(tempfile, mode="rb") as handle:
             magic = handle.read(3)
         if magic == b"\x1f\x8b\x08":
             compression = "gzip"
             from gzip import open as _open
         elif magic == b"\x42\x5a\x68":
             compression = "bz2"
             from bz2 import open as _open
         else:
             compression, _open = "infer", open
         try:
             with _open(tempfile, mode="rt", newline="") as handle:
                 sep = Sniffer().sniff(handle.read(sniff_ahead)).delimiter
             _reader_kw = dict(
                 sep=sep, compression=compression,
                 chunksize=chunksize, **self.pandas_kws,
             )
             for i, csv_chunk in enumerate(read_csv(tempfile, **_reader_kw)):
                 self.INPLACE_process(csv_chunk)
                 msg = f"interpreted table chunk {i}:\n  {tempfile}"
                 GeneFabLogger.info(f"{self.name}; {msg}")
                 yield csv_chunk
         except (IOError, UnicodeDecodeError, CSVError, PandasParserError):
             msg = "Not recognized as a table file"
             raise GeneFabFileException(msg, name=self.name, url=self.url)
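The three-byte sniff identifies gzip (bytes 1f 8b 08) and bzip2 (ASCII "BZh") payloads before the delimiter sniffing. The same dispatch in isolation, runnable with only the standard library:

import bz2, gzip, tempfile

MAGIC_TO_OPENER = {
    b"\x1f\x8b\x08": gzip.open,  # gzip magic + deflate method byte
    b"\x42\x5a\x68": bz2.open,   # bzip2 magic ("BZh")
}

def opener_for(path):
    """Pick a file opener by the first three (magic) bytes."""
    with open(path, mode="rb") as handle:
        magic = handle.read(3)
    return MAGIC_TO_OPENER.get(magic, open)

with tempfile.NamedTemporaryFile(suffix=".gz", delete=False) as t:
    t.write(gzip.compress(b"a\tb\n1\t2\n"))
with opener_for(t.name)(t.name, mode="rt") as handle:
    print(handle.read())  # transparently decompressed text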
Example #3
 def cleanup(self, max_iter=100, max_skids=20, desc="tables/cleanup"):
     """Check size of underlying database file, drop oldest tables to keep file size under `self.maxdbsize`"""
     n_dropped, n_skids = 0, 0
     for _ in range(max_iter):
         current_size = path.getsize(self.sqlite_db)
         if (n_skids < max_skids) and (current_size > self.maxdbsize):
             with self.sqltransactions.concurrent(desc) as (_, execute):
                 query_oldest = f"""SELECT `table`
                     FROM `{self.aux_table}` ORDER BY `retrieved_at` ASC"""
                 table = (execute(query_oldest).fetchone() or [None])[0]
                 if table is None:
                     break
             with self.sqltransactions.exclusive(desc) as (connection, _):
                 try:
                     GeneFabLogger.info(f"{desc} purging: {table}")
                     self.drop(connection=connection, other=table)
                 except OperationalError as e:
                     msg = f"Rolling back shrinkage due to {e!r}"
                     GeneFabLogger.error(msg, exc_info=e)
                     connection.rollback()  # explicit, to be able to continue
                     break
                 else:
                     connection.commit()
                     n_dropped += 1
             n_skids += (path.getsize(self.sqlite_db) >= current_size)
         else:
             break
     desc = f"SQLiteTable():\n  {self.sqlite_db}"
     if n_dropped:
         GeneFabLogger.info(f"{desc} shrunk by {n_dropped} entries")
     elif path.getsize(self.sqlite_db) > self.maxdbsize:
         GeneFabLogger.warning(f"{desc} could not be shrunk")
     if n_skids:
         GeneFabLogger.warning(f"{desc} did not shrink {n_skids} times")
Example #4
 def __del__(self, desc="TempSelect/__del__"):
     with self.sqltransactions.exclusive(desc) as (_, execute):
         try:
             execute(f"DROP {self.kind} `{self.name}`")
         except OperationalError as e:
             msg = f"Failed to drop temporary {self.kind} {self.name}"
             GeneFabLogger.error(msg, exc_info=e)
         else:
             msg = f"Dropped temporary SQLite {self.kind} {self.name}"
             GeneFabLogger.info(msg)
Example #5
 def drop_all_parts(cls, table, connection):
     """During an open connection, drop all parts of `table`"""
     _iterparts = cls.iterparts(table, connection, must_exist=True)
     for partname, *_ in list(_iterparts):
         try:
             connection.execute(f"DROP TABLE IF EXISTS `{partname}`")
         except Exception as e:
             GeneFabLogger.error(f"Could not drop {partname}", exc_info=e)
             raise
         else:
             GeneFabLogger.info(f"Dropped {partname} (if it existed)")
Example #6
 def drop(self, *, connection, other=None):
     identifier = other or self.identifier
     try:
         connection.execute(f"""DELETE FROM `{self.table}`
             WHERE `identifier` == "{identifier}" """)
     except Exception as e:
         msg = f"Could not delete from {self.table}: {identifier}"
         GeneFabLogger.error(msg, exc_info=e)
         raise
     else:
         GeneFabLogger.info(f"Deleted from {self.table}: {identifier}")
Example #7
def ensure_info_index(mongo_collections, locale):
    """Index `id.*` for sorting"""
    if "id" not in mongo_collections.metadata.index_information():
        msgmask = "Generating index for metadata collection ('{}'), key 'id'"
        id_fields = METADATA_AUX_TEMPLATE["id"].keys()
        GeneFabLogger.info(msgmask.format(mongo_collections.metadata.name))
        mongo_collections.metadata.create_index(
            name="id", keys=[(f"id.{f}", ASCENDING) for f in id_fields],
            collation={"locale": locale, "numericOrdering": True},
        )
        msgmask = "Index generated for metadata collection ('{}'), key 'id'"
        GeneFabLogger.info(msgmask.format(mongo_collections.metadata.name))
Example #8
 def drop(self, *, connection, other=None):
     table = other or self.table
     try:
         connection.execute(f"""DELETE FROM `{self.aux_table}`
             WHERE `table` == "{table}" """)
     except Exception as e:
         msg = f"Could not delete from {self.aux_table}: {table}"
         GeneFabLogger.error(msg, exc_info=e)
         raise
     else:
         GeneFabLogger.info(f"Deleted from {self.aux_table}: {table}")
     SQLiteObject.drop_all_parts(table, connection)
Example #9
 def update(self, desc="blobs/update"):
     """Run `self.__download_as_blob()` and insert result (optionally compressed) into `self.table` as BLOB"""
     blob = Binary(bytes(self.compressor(self.__download_as_blob())))
     retrieved_at = int(datetime.now().timestamp())
     with self.sqltransactions.exclusive(desc) as (connection, execute):
         if self.is_stale(ignore_conflicts=True) is False:
             return # data was updated while waiting to acquire lock
         self.drop(connection=connection)
         execute(f"""INSERT INTO `{self.table}`
             (`identifier`,`blob`,`timestamp`,`retrieved_at`)
             VALUES(?,?,?,?)""", [
             self.identifier, blob, self.timestamp, retrieved_at])
         msg = f"Inserted new blob into {self.table}"
         GeneFabLogger.info(f"{msg}:\n  {self.identifier}")
Example #10
 def get(self, *, context, limit=None, offset=0):
     """Interpret arguments and retrieve data as StreamedDataTable by running SQL queries"""
     data = StreamedDataTable(
         sqlite_db=self.sqlite_db,
         source_select=self.make_select(kind="VIEW"),
         targets=",".join((
             f"`{self._index_name}`",
             *(f"`{'/'.join(c)}`" for c in self.columns),
         )),
         query_filter=self._make_query_filter(context, limit, offset),
         na_rep=NaN,
     )
     msg = "staged to retrieve from SQLite as StreamedDataTable"
     GeneFabLogger.info(f"{self.name};\n  {msg}")
     return data
Example #11
 def run(self):
     """Continuously run MongoDB and SQLite3 cachers"""
     while True:
         ensure_info_index(self.mongo_collections, self.locale)
         accessions, success = self.recache_metadata()
         if success:
             update_metadata_value_lookup(self.mongo_collections, self._id)
             if accessions["updated"]:
                 self.response_cache.drop_all()
             else:
                 for acc in accessions["failed"] | accessions["dropped"]:
                     self.response_cache.drop(acc)
             self.response_cache.shrink()
             delay = self.full_update_interval
         else:
             delay = self.full_update_retry_delay
         GeneFabLogger.info(f"{self._id}:\n  Sleeping for {delay} seconds")
         sleep(delay)
Example #12
def run_mongo_action(action,
                     collection,
                     *,
                     query=None,
                     data=None,
                     documents=None):
    """Shortcut to replace/delete/insert all matching instances"""
    error_message, unused_arguments = None, None
    if action == "replace":
        if (query is not None) and (data is not None):
            collection.delete_many(query)
            collection.insert_one({**query, **data})
            if documents is not None:
                unused_arguments = "`documents`"
        else:
            error_message = "no `query` and/or `data` specified"
    elif action == "delete_many":
        if query is not None:
            collection.delete_many(query)
            if (data is not None) or (documents is not None):
                unused_arguments = "`data`, `documents`"
        else:
            error_message = "no `query` specified"
    elif action == "insert_many":
        if documents is not None:
            collection.insert_many(documents)
            if (query is not None) or (data is not None):
                unused_arguments = "`query`, `data`"
        else:
            error_message = "no `documents` specified"
    else:
        error_message = "unsupported action"
    if unused_arguments:
        message = "run_mongo_transaction('%s'): %s unused in this action"
        GeneFabLogger.warning(message, action, unused_arguments)
    if error_message:
        raise GeneFabDatabaseException(
            error_message,
            action=action,
            collection=collection,
            query=query,
            data=data,
            documents=documents,
        )
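Hypothetical usage, assuming a MongoDB server on localhost and illustrative database/collection names; "replace" behaves like an upsert over all documents matching `query`:

from pymongo import MongoClient

collection = MongoClient("mongodb://localhost:27017")["genefab3"]["records"]
run_mongo_action(
    "replace", collection,
    query={"kind": "dataset files", "accession": "GLDS-1"},
    data={"value": ["raw_counts.csv"]},
)
run_mongo_action("delete_many", collection, query={"accession": "GLDS-1"})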
Example #13
def update_metadata_value_lookup(mongo_collections, cacher_id, keys=("investigation", "study", "assay")):
    """Collect existing keys and values for lookups"""
    m = "{}:\n  reindexing metadata lookup records ('{}')"
    GeneFabLogger.info(m.format(cacher_id, mongo_collections.metadata_aux.name))
    index = deepcopy_keys(METADATA_AUX_TEMPLATE, *keys)
    INPLACE_update_metadata_value_lookup_keys(index, mongo_collections)
    INPLACE_update_metadata_value_lookup_values(index, mongo_collections)
    collection = mongo_collections.metadata_aux
    with collection.database.client.start_session() as session:
        with session.start_transaction():
            for isa_category in index:
                for subkey in index[isa_category]:
                    run_mongo_action(
                        action="replace", collection=collection,
                        query={"isa_category": isa_category, "subkey": subkey},
                        data={"content": index[isa_category][subkey]},
                    )
    m = "{}:\n  finished reindexing metadata lookup records ('{}')"
    GeneFabLogger.info(m.format(cacher_id, mongo_collections.metadata_aux.name))
Example #14
def speed_up_data_schema(get, self, *, context, limit=None, offset=0):
    """If context.schema == '1', replaces underlying query with quick retrieval of just values informative for schema"""
    if context.schema != "1":
        return get(self, context=context, limit=limit, offset=offset)
    elif context.data_columns or context.data_comparisons:
        msg = "Data schema does not support column subsetting / comparisons"
        sug = "Remove comparisons and/or column, row slicing from query"
        raise GeneFabFormatException(msg, suggestion=sug)
    else:
        from genefab3.db.sql.streamed_tables import (
            SQLiteIndexName,
            StreamedDataTableWizard_Single,
            StreamedDataTableWizard_OuterJoined,
        )
        GeneFabLogger.info(f"apply_hack(speed_up_data_schema) for {self.name}")
        sub_dfs, sub_indices = OrderedDict(), {}
        sub_columns, index_name = [], []

        def _extend_parts(obj):
            for partname, partcols in obj._inverse_column_dispatcher.items():
                if isinstance(partcols[0], SQLiteIndexName):
                    index_name.clear()
                    index_name.append(partcols[0])
                    sub_df = get_sub_df(obj, partname, partcols)
                else:
                    sub_df = get_sub_df(obj, partname,
                                        [*index_name, *partcols])
                sub_indices[partname] = get_part_index(obj, partname)
                sub_dfs[partname] = sub_df
                _ocr2f = obj._columns_raw2full
                sub_columns.extend(_ocr2f[c] for c in sub_df.columns)

        if isinstance(self, StreamedDataTableWizard_Single):
            _extend_parts(self)
        elif isinstance(self, StreamedDataTableWizard_OuterJoined):
            for obj in self.objs:
                _extend_parts(obj)
        else:
            msg = "Schema speedup applied to unsupported object type"
            raise GeneFabConfigurationException(msg, type=type(self))
        sub_merged = merge_subs(self, sub_dfs, sub_indices)
        return StreamedDataTableSub(sub_merged, sub_columns)
Example #15
def twolevel(obj, context, squash_preheader=False, frozen=0, indent=None):
    """Display StreamedTable with two-level columns using SlickGrid"""
    GeneFabLogger.info("HTML: converting StreamedTable into interactive table")
    obj.move_index_boundary(to=0)
    title_postfix = repr_quote(f"{context.view} {context.complete_kwargs}")

    def content():
        is_annotation_table = isinstance(obj, StreamedAnnotationTable)
        if is_annotation_table and (context.view != "status"):
            formatters = iterate_formatters(obj.columns, context)
        else:
            formatters = []
        if squash_preheader:
            columns = ((f"{c[0]}<br>{c[1]}", c[2]) for c in obj.columns)
            preheader_css = SQUASHED_PREHEADER_CSS
        else:
            columns, preheader_css = obj.columns, ""
        replacements = {
            "$APPNAME": f"{context.app_name}: {title_postfix}",
            "$URL_ROOT": context.url_root,
            "$SQUASH_PREHEADER": preheader_css,
            "$CSVLINK": build_url(context, drop={"format"}) + "format=csv",
            "$TSVLINK": build_url(context, drop={"format"}) + "format=tsv",
            "$JSONLINK": build_url(context, drop={"format"}) + "format=json",
            "$VIEWDEPENDENTLINKS": get_view_dependent_links(obj, context),
            "$ASSAYSVIEW": build_url(context, "assays"),
            "$SAMPLESVIEW": build_url(context, "samples"),
            "$DATAVIEW": build_url(context, "data"),
            "$COLUMNDATA": _iter_json_chunks(data=columns,
                                             length=obj.shape[1]),
            "$ROWDATA": _iter_json_chunks(data=obj.values,
                                          length=obj.shape[0]),
            "$CONTEXTURL": build_url(context),
            "$FORMATTERS": "\n".join(formatters),
            "$FROZENCOLUMN": "undefined" if frozen is None else str(frozen),
        }
        template_file = Path(__file__).parent / "dataframe.html"
        yield from _iter_html_chunks(template_file, replacements)

    return content, "text/html"
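The `$PLACEHOLDER` keys are spliced into an HTML template and streamed out chunk by chunk. `_iter_html_chunks` is not shown here; a simplified stand-in that handles plain string values only (the real replacements also include generator-backed values such as `$ROWDATA`) might look like:

def _iter_template_chunks(template_text, replacements):
    """Stream a template line by line, substituting $PLACEHOLDER keys."""
    for line in template_text.splitlines(keepends=True):
        for placeholder, value in replacements.items():
            if placeholder in line:
                line = line.replace(placeholder, value)
        yield line

template = "<title>$APPNAME</title>\n<a href='$CSVLINK'>csv</a>\n"
replacements = {"$APPNAME": "genefab3: /data/", "$CSVLINK": "/data/?format=csv"}
print("".join(_iter_template_chunks(template, replacements)))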
Example #16
 def update(self, to_sql_kws=dict(index=True, if_exists="append"), chunksize=256, desc="tables/update"):
     """Update `self.table` with result of `self.__download_as_pandas()`, update `self.aux_table` with timestamps"""
     columns, width, bounds = None, None, None
     with self.sqltransactions.exclusive(desc) as (connection, execute):
         if self.is_stale(ignore_conflicts=True) is False:
             return # data was updated while waiting to acquire lock
         self.drop(connection=connection)
         for csv_chunk in self.__download_as_pandas(chunksize=chunksize):
             try:
                 columns = csv_chunk.columns if columns is None else columns
                 if width is None:
                     width = csv_chunk.shape[1]
                     bounds = bounds or range(0, width, self.maxpartcols)
                 if (csv_chunk.shape[1] != width):
                     raise ValueError("Inconsistent chunk width")
                 if (csv_chunk.columns != columns).any():
                     raise ValueError("Inconsistent chunk column names")
                 parts = SQLiteObject.iterparts(
                     self.table, connection, must_exist=False,
                 )
                 for bound, (partname, *_) in zip(bounds, parts):
                     bounded = csv_chunk.iloc[:,bound:bound+self.maxpartcols]
                     bounded.to_sql(
                         partname, NoCommitConnection(connection),
                         **to_sql_kws, chunksize=chunksize,
                         method=ExecuteMany(partname, bounded.shape[1]),
                     )
                     msg = "Extended table for CachedTableFile"
                     GeneFabLogger.info(f"{msg}:\n  {self.name}, {partname}")
             except (OperationalError, PandasDatabaseError, ValueError) as e:
                 msg = "Failed to insert SQL chunk or chunk part"
                 _kw = dict(name=self.name, debug_info=repr(e))
                 raise GeneFabDatabaseException(msg, **_kw)
         execute(f"""INSERT INTO `{self.aux_table}`
             (`table`,`timestamp`,`retrieved_at`) VALUES(?,?,?)""", [
             self.table, self.timestamp, int(datetime.now().timestamp()),
         ])
         msg = "Finished extending; all parts inserted for CachedTableFile"
         GeneFabLogger.info(f"{msg}:\n  {self.name}\n  {self.table}")
Example #17
 def recache_single_dataset_metadata(self, accession, has_cache):
     """Check if dataset changed, update metadata cached in `self.mongo_collections.metadata`, report with result/errors"""
     try:
         files = ValueCheckedRecord(
             identifier=dict(kind="dataset files", accession=accession),
             collection=self.mongo_collections.records,
             value=self.adapter.get_files_by_accession(accession),
         )
         if files.changed or (not has_cache):
             best_sample_name_matches = self.adapter.best_sample_name_matches
             dataset = Dataset(
                 accession,
                 files.value,
                 self.sqlite_dbs,
                 best_sample_name_matches=best_sample_name_matches,
                 status_kwargs=self.status_kwargs,
             )
         else:
             dataset = None
     except Exception as e:
         msg = f"{self._id} @ {accession}:\n  {e!r}"
         if has_cache:
             status = "stale"
             report = f"failed to retrieve ({repr(e)}), kept stale"
             GeneFabLogger.warning(msg, exc_info=e)
         else:
             status, report = "failed", f"failed to retrieve ({repr(e)})"
             GeneFabLogger.error(msg, exc_info=e)
         return status, report, e
     if dataset is not None:  # files have changed OR needs to be re-inserted
         self.drop_single_dataset_metadata(accession)
         e = self.recache_single_dataset_samples(dataset)
         if e is not None:
             self.drop_single_dataset_metadata(accession)
             return "failed", f"failed to parse ({repr(e)})", e
         else:
             return "updated", "updated", None
     else:  # files have not changed
         return "fresh", "no action (fresh)", None
Example #18
 def __init__(self, identifier, collection, value):
     """Match existing documents by base64-encoded `value`, update if changed, report state in self.changed"""
     if not isinstance(identifier, dict):
         msg = "ValueCheckedRecord(): `identifier` is not a dictionary"
         raise GeneFabConfigurationException(msg, identifier=identifier)
     elif "base64value" in identifier:
         msg = "ValueCheckedRecord(): `identifier` uses a reserved key"
         _kw = dict(identifier=identifier, key="base64value")
         raise GeneFabConfigurationException(msg, **_kw)
     else:
         self.identifier, self.value = identifier, value
         try:
             dumped = dumps(value, sort_keys=True, default=funcdump)
             self.base64value = compress(encodebytes(dumped.encode()))
         except TypeError as e:
             msg, _erep = "ValueCheckedRecord(): TypeError", repr(e)
             _kw = dict(identifier=identifier,
                        value=value,
                        debug_info=_erep)
             raise GeneFabConfigurationException(msg, **_kw)
         else:
             self.changed, n_stale_entries = True, 0
             for entry in collection.find(identifier):
                 if entry["base64value"] == self.base64value:
                     self.changed = False
                 else:
                     n_stale_entries += 1
             if (n_stale_entries != 0) or self.changed:
                 msg = f"ValueCheckedRecord updated:\n  {identifier}"
                 GeneFabLogger.info(msg)
                 with collection.database.client.start_session() as session:
                     with session.start_transaction():
                         run_mongo_action(
                             "replace",
                             collection,
                             query=identifier,
                             data={"base64value": self.base64value},
                         )
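The fingerprint is a deterministic JSON dump (`sort_keys=True` pins dict key order), base64-encoded and compressed; `compress` and `encodebytes` are assumed to come from `zlib` and `base64`. The idea in isolation:

from base64 import encodebytes
from json import dumps
from zlib import compress

def fingerprint(value):
    """Deterministic, compressed fingerprint of a JSON-serializable value."""
    return compress(encodebytes(dumps(value, sort_keys=True).encode()))

print(fingerprint({"b": 1, "a": 2}) == fingerprint({"a": 2, "b": 1}))  # True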
Example #19
    def recache_metadata(self):
        """Instantiate each available dataset; if contents changed, dataset automatically updates db.metadata"""
        GeneFabLogger.info(f"{self._id}:\n  Checking metadata cache")
        try:
            collection = self.mongo_collections.metadata
            accessions = OrderedDict(
                cached=set(collection.distinct("id.accession")),
                live=set(self.adapter.get_accessions()),
                fresh=set(),
                updated=set(),
                stale=set(),
                dropped=set(),
                failed=set(),
            )
        except Exception as e:
            GeneFabLogger.error(f"{self._id}:\n  {e!r}", exc_info=e)
            return None, False

        def _iterate():
            for a in accessions["cached"] - accessions["live"]:
                yield (a, *self.drop_single_dataset_metadata(a))
            for a in accessions["live"]:
                has_cache = a in accessions["cached"]
                yield (a, *self.recache_single_dataset_metadata(a, has_cache))

        for accession, key, report, error in _iterate():
            accessions[key].add(accession)
            _kws = dict(
                **self.status_kwargs,
                status=key,
                accession=accession,
                prefix=self._id,
                info=f"{accession} {report}",
                error=error,
            )
            if key in {"dropped", "failed"}:
                drop_status(**_kws)
            update_status(**_kws)
            mongo_client = self.genefab3_client.mongo_client
            n_apps = sum(1 for _ in iterate_mongo_connections(mongo_client))
            msg = f"Total number of active MongoDB connections: {n_apps}"
            GeneFabLogger.info(msg)
            sleep(self.dataset_update_interval)
        GeneFabLogger.info(f"{self._id}, datasets:\n  " +
                           ", ".join(f"{k}={len(v)}"
                                     for k, v in accessions.items()))
        return accessions, True
Example #20
 def shrink(self, max_iter=100, max_skids=20, desc="response_cache/shrink"):
     """Drop oldest cached responses to keep file size on disk under `self.maxdbsize`"""
     # TODO: DRY: very similar to genefab3.db.sql.core SQLiteTable.cleanup()
     n_dropped, n_skids = 0, 0
     for _ in range(max_iter):
         current_size = path.getsize(self.sqlite_db)
         if (n_skids < max_skids) and (current_size > self.maxdbsize):
             with self.sqltransactions.concurrent(desc) as (_, execute):
                 query_oldest = """SELECT `context_identity`
                     FROM `response_cache` ORDER BY `retrieved_at` ASC"""
                 cid = (execute(query_oldest).fetchone() or [None])[0]
                 if cid is None:
                     break
             with self.sqltransactions.exclusive(desc) as (connection, execute):
                 try:
                     msg = f"ResponseCache.shrink():\n  dropping {cid}"
                     GeneFabLogger.info(msg)
                     self._drop_by_context_identity(execute, cid)
                 except OperationalError as e:
                     msg = f"Rolling back shrinkage due to {e!r}"
                     GeneFabLogger.error(msg, exc_info=e)
                     connection.rollback()  # explicit, to be able to continue
                     break
                 else:
                     connection.commit()
                     n_dropped += 1
             n_skids += (path.getsize(self.sqlite_db) >= current_size)
         else:
             break
     if n_dropped:
         _logi(f"ResponseCache():\n  shrunk by {n_dropped} entries")
     elif path.getsize(self.sqlite_db) > self.maxdbsize:
         _logw("ResponseCache():\n  could not drop entries to shrink")
     if n_skids:
         _logw(f"ResponseCache():\n  file did not shrink {n_skids} times")
Example #21
 def __init__(self,
              *,
              sqlite_db,
              query,
              targets,
              kind="TABLE",
              _depends_on=None,
              msg=None):
     self.sqlite_db = sqlite_db
     self._depends_on = _depends_on  # keeps sources from being deleted early
     self.query, self.targets, self.kind = query, targets, kind
     self.name = "TEMP:" + random_unique_string(seed=query)
     self.sqltransactions = SQLTransactions(self.sqlite_db, self.name)
     with self.sqltransactions.exclusive("TempSelect") as (_, execute):
         if msg:
             GeneFabLogger.info(msg)
         try:
             execute(f"CREATE {self.kind} `{self.name}` as {query}")
         except OperationalError as e:
             reraise_operational_error(self, e)
         else:
             query_repr = repr(query.lstrip()[:200] + "...")
             msg = f"Created temporary SQLite {self.kind}"
             GeneFabLogger.info(f"{msg} {self.name} from\n  {query_repr}")
Example #22
 def recache_single_dataset_samples(self, dataset):
     """Insert per-sample documents into MongoDB, return exception on error"""
     collection = self.mongo_collections.metadata
     with collection.database.client.start_session() as session:
         with session.start_transaction():
             try:
                 has_samples = False
                 for sample in dataset.samples:
                     collection.insert_one(
                         harmonize_document(
                             sample,
                             self.units_formatter,
                         ))
                     has_samples = True
                     if "Study" not in sample:
                         update_status(
                             **self.status_kwargs,
                             status="warning",
                             warning="Study entry missing",
                             accession=dataset.accession,
                             assay_name=sample.assay_name,
                             sample_name=sample.name,
                         )
                 if not has_samples:
                     update_status(
                         **self.status_kwargs,
                         status="warning",
                         warning="No samples",
                         accession=dataset.accession,
                     )
             except Exception as e:
                 msg = f"{self._id} @ {dataset.accession} samples:\n  {e!r}"
                 GeneFabLogger.error(msg, exc_info=e)
                 return e
             else:
                 return None
Example #23
 def __copyfileobj(self, tempfile):
     """Try all URLs and push data into temporary file"""
     for url in self.urls:
         with open(tempfile, mode="wb") as handle:
             GeneFabLogger.info(f"{self.name}; trying URL:\n  {url}")
             try:
                 with request_get(url, stream=True) as response:
                     response.raw.decode_content = True
                     msg = f"{self.name}:\n  streaming to {tempfile}"
                     GeneFabLogger.debug(msg)
                     copyfileobj(response.raw, handle)
             except (URLError, OSError) as e:
                 msg = f"{self.name}; tried URL and failed:\n  {url}"
                 GeneFabLogger.warning(msg, exc_info=e)
             else:
                 msg = f"{self.name}; successfully fetched data:\n  {url}"
                 GeneFabLogger.info(msg)
                 return url
     else:
         msg = "None of the URLs are reachable for file"
         _kw = dict(name=self.name, urls=self.urls)
         raise GeneFabDataManagerException(msg, **_kw)
Example #24
 def _ok_to_loop_metadata_cacher_thread(self, enabled):
     """Check if no other instances of genefab3 are talking to MongoDB database"""
     if not enabled:
         m = "MetadataCacherThread disabled by client parameter, NOT LOOPING"
         GeneFabLogger.info(f"{self.mongo_appname}:\n  {m}")
         return False
     else:
         for other in iterate_mongo_connections(self.mongo_client):
             if other < self.mongo_appname:
                 m = (f"Found other instance {other}, " +
                     "NOT LOOPING current instance")
                 GeneFabLogger.info(f"{self.mongo_appname}:\n  {m}")
                 return False
         else:
             m = "No other instances found, STARTING LOOP"
             GeneFabLogger.info(f"{self.mongo_appname}:\n  {m}")
             return True
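This is leader election by name: the instance whose appname sorts lexicographically smallest among live MongoDB connections keeps looping; every other instance stands down. The rule in isolation, with invented peer names:

def ok_to_loop(own_appname, peer_appnames):
    """Loop only if no peer sorts before us (smallest name wins)."""
    return not any(other < own_appname for other in peer_appnames)

peers = ["genefab3 (2)", "genefab3 (5)"]
print(ok_to_loop("genefab3 (1)", peers))  # True: smallest name
print(ok_to_loop("genefab3 (4)", peers))  # False: defers to "genefab3 (2)"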
Example #25
 def __download_as_blob(self):
     """Download data from URL as-is"""
     for url in self.urls:
         GeneFabLogger.info(f"{self.name}; trying URL:\n  {url}")
         try:
             with request_get(url) as response:
                 data = response.content
         except (URLError, OSError) as e:
             msg = f"{self.name}; tried URL and failed:\n  {url}"
             GeneFabLogger.warning(msg, exc_info=e)
         else:
             msg = f"{self.name}; successfully fetched blob:\n  {url}"
             GeneFabLogger.info(msg)
             self.url = url
             return data
     else:
         msg = "None of the URLs are reachable for file"
         _kw = dict(name=self.name, urls=self.urls)
         raise GeneFabDataManagerException(msg, **_kw)
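Examples #23 and #25 share one fallback pattern: try URLs in order, return on the first success, raise only after all fail. A condensed sketch; unlike the originals, it also treats HTTP error statuses as failures:

from requests import get as request_get
from requests.exceptions import RequestException

def fetch_first_reachable(urls):
    """Return (url, content) for the first URL that responds."""
    for url in urls:
        try:
            with request_get(url) as response:
                response.raise_for_status()
                return url, response.content
        except (RequestException, OSError):
            continue  # fall through to the next URL
    raise IOError(f"None of the URLs are reachable: {urls}")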
Example #26
 def is_stale(self,
              *,
              timestamp_table=None,
              id_field=None,
              db_type=None,
              ignore_conflicts=False):
     """Evaluates to True if underlying data in need of update, otherwise False"""
     if (timestamp_table is None) or (id_field is None):
         msg = "did not pass arguments to self.is_stale(), will never update"
         GeneFabLogger.warning(f"{type(self).__name__} {msg}")
         return False  # `desc` and `query` below would otherwise be undefined
     else:
         db_type = db_type or f"{type(self).__name__}"
         desc = f"{db_type}/is_stale"
         self_id_value = getattr(self, id_field)
         query = f"""SELECT `timestamp` FROM `{timestamp_table}`
             WHERE `{id_field}` == "{self_id_value}" """
     if ignore_conflicts:
         read_transaction = self.sqltransactions.unconditional
     else:
         read_transaction = self.sqltransactions.concurrent
     with read_transaction(desc) as (_, execute):
         ret = execute(query).fetchall()
         if len(ret) == 0:
             _staleness = True
         elif (len(ret) == 1) and (len(ret[0]) == 1):
             _staleness = (ret[0][0] < self.timestamp)
         else:
             _staleness = None
     if (_staleness is None) and (not ignore_conflicts):
         with self.sqltransactions.exclusive(desc) as (connection, _):
             msg = "Conflicting timestamp values for SQLiteObject"
             GeneFabLogger.warning(f"{msg}\n  ({self_id_value})")
             self.drop(connection=connection)
         _staleness = True
     if _staleness is True:
         GeneFabLogger.info(f"{self_id_value} is stale, staging update")
     return _staleness
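The staleness rule reduces to a timestamp comparison with three outcomes: no cached row (stale), exactly one row (compare timestamps), several rows (conflicting, undecidable; the original then drops the entry and re-fetches). A runnable reduction with illustrative names:

import sqlite3

connection = sqlite3.connect(":memory:")
connection.execute("CREATE TABLE ts (identifier TEXT, timestamp INTEGER)")
connection.execute("INSERT INTO ts VALUES ('GLDS-1/file.csv', 100)")

def is_stale(connection, identifier, source_timestamp):
    rows = connection.execute(
        "SELECT timestamp FROM ts WHERE identifier == ?", (identifier,),
    ).fetchall()
    if not rows:
        return True                  # never cached: fetch it
    elif len(rows) == 1:
        return rows[0][0] < source_timestamp
    else:
        return None                  # conflicting rows: undecidable

print(is_stale(connection, "GLDS-1/file.csv", 200))  # True: 100 < 200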
Example #27
 def update(self):
     """Update underlying data in SQLite"""
     msg = "did not define self.update(), will never update"
     GeneFabLogger.warning(f"{type(self).__name__} {msg}")
Example #28
 def retrieve(self):
     """Retrieve underlying data from SQLite"""
     msg = "did not define self.retrieve(), will always retrieve `None`"
     GeneFabLogger.warning(f"{type(self).__name__} {msg}")
     return None