Example #1
0
def combine_objects(objects, context, limit=None):
    """Combine data objects into one and post-process according to `context`.

    Returns None for no objects; a single object is passed through, multiple
    StreamedDataTableWizard objects are concatenated column-wise. Table
    results are constrained/rendered via the context; non-table results are
    returned as-is unless column operations were requested.
    """
    if not objects:
        return None
    if len(objects) == 1:
        combined = objects[0]
    elif all(isinstance(o, StreamedDataTableWizard) for o in objects):
        combined = StreamedDataTableWizard.concat(objects, axis=1)
    else:
        raise NotImplementedError("Merging non-table data objects")
    if isinstance(combined, StreamedDataTableWizard):
        # GCT output requires the full table; arbitrary column subsets break it
        if context.data_columns and (context.format == "gct"):
            msg = "GCT format is disabled for arbitrarily subset tables"
            _kw = dict(columns="|".join(context.data_columns))
            raise GeneFabFormatException(msg, **_kw)
        combined.constrain_columns(context=context)
        return combined.get(context=context)
    if context.data_columns or context.data_comparisons:
        raise GeneFabFileException(
            "Column operations on non-table data objects are not supported",
            columns=context.data_columns,
            comparisons=context.data_comparisons,
        )
    return combined
Example #2
0
 def __download_as_pandas(self, chunksize, sniff_ahead=2**20):
     """Download and parse data from URL as a table.

     Copies the remote file into a temporary file, detects compression from
     the file's magic bytes, sniffs the delimiter with csv.Sniffer, then
     yields the table chunk by chunk via pandas read_csv.

     Args:
         chunksize: rows per chunk, forwarded to pandas read_csv.
         sniff_ahead: number of characters to read for delimiter sniffing.

     Yields:
         Parsed table chunks, each passed through self.INPLACE_process first.

     Raises:
         GeneFabFileException: if the file cannot be interpreted as a table.
     """
     # NOTE: the tempfile context stays open for the generator's lifetime,
     # since chunks are yielded from inside the `with` block
     with self.__tempfile() as tempfile:
         # presumably __copyfileobj downloads into `tempfile` and returns the
         # effective URL — TODO confirm against its definition
         self.url = self.__copyfileobj(tempfile)
         # detect compression from leading magic bytes (gzip / bzip2)
         with open(tempfile, mode="rb") as handle:
             magic = handle.read(3)
         if magic == b"\x1f\x8b\x08":
             compression = "gzip"
             from gzip import open as _open
         elif magic == b"\x42\x5a\x68":
             compression = "bz2"
             from bz2 import open as _open
         else:
             # unknown magic: plain open, let pandas infer compression
             compression, _open = "infer", open
         try:
             # sniff delimiter from the start of the decompressed text
             with _open(tempfile, mode="rt", newline="") as handle:
                 sep = Sniffer().sniff(handle.read(sniff_ahead)).delimiter
             _reader_kw = dict(
                 sep=sep, compression=compression,
                 chunksize=chunksize, **self.pandas_kws,
             )
             for i, csv_chunk in enumerate(read_csv(tempfile, **_reader_kw)):
                 self.INPLACE_process(csv_chunk)
                 msg = f"interpreted table chunk {i}:\n  {tempfile}"
                 GeneFabLogger.info(f"{self.name}; {msg}")
                 yield csv_chunk
         except (IOError, UnicodeDecodeError, CSVError, PandasParserError):
             # NOTE(review): exception is re-raised unchained; consider
             # `raise ... from exc` to preserve the original traceback
             msg = "Not recognized as a table file"
             raise GeneFabFileException(msg, name=self.name, url=self.url)
Example #3
0
def fail_if_files_not_joinable(getset):
    """Check for ability to join data from requested files.

    Args:
        getset: callable mapping a nested metadata key path to the set of
            values found across all requested file descriptors; the caller
            passes an `lru_cache`-memoized closure, so returned sets are
            shared between calls and must not be mutated here.

    Raises:
        GeneFabFileException: if the requested files cannot be joined.
    """
    if len(getset("file", "datatype")) > 1:
        msg = "Cannot combine data of multiple datatypes"
        _kw = dict(datatypes=getset("file", "datatype"))
    elif len(getset("technology type")) > 1:
        msg = "Cannot combine data for multiple technology types"
        _kw = dict(technology_types=getset("technology type"))
    elif getset("file", "joinable") != {True}:
        msg = "Cannot combine files of this datatype from one or more assays"
        _kw = dict(
            # use next(iter(...)) instead of set.pop(): pop() would mutate
            # the cached set shared through the memoized `getset`
            datatype=next(iter(getset("file", "datatype"))),
            accessions=getset("accession"),
            assay_names=getset("assay name"),
            filenames=getset("file", "filename"),
        )
    elif getset("file", "type") != {"table"}:
        msg = "Cannot combine non-table files"
        _kw = dict(types=getset("file", "type"))
    elif len(getset("file", "index_name")) > 1:
        msg = "Cannot combine tables with conflicting index names"
        _kw = dict(index_names=getset("file", "index_name"))
    else:
        msg, _kw = None, {}
    if msg:
        raise GeneFabFileException(msg, **_kw)
 def _unique_column_passed2full(self, passed_name):
     """Match passed column name to unique full column name found in self.objs[*].columns"""
     matches_and_misses = {
         o._column_passed2full(passed_name, ignore_missing=True)
         for o in self.objs
     }
     matches = matches_and_misses - {None}
     if not matches:
         msg = "Requested column not in table"
         raise GeneFabFileException(msg, column=passed_name)
     elif len(matches) > 1:
         msg = "Ambiguous column requested"
         sug = "Use full syntax (columns.ACCESSION/ASSAY/COLUMN)"
         _kw = dict(column=passed_name, suggestion=sug)
         raise GeneFabFileException(msg, **_kw)
     else:
         return matches.pop()
 def columns(self, passed_columns):
     """Replace self._columns with passed_columns, provided every passed last-level name already exists"""
     requested = {c[-1] for c in passed_columns}
     available = {c[-1] for c in self._columns}
     if requested <= available:
         self._columns = passed_columns
     else:
         raise GeneFabFileException(
             "Setting foreign column(s) to StreamedDataTableWizard",
             columns=sorted(requested - available),
         )
 def _column_passed2full(self, passed_name, ignore_missing=False):
     """Match passed column name to full column name found in self.columns"""
     _raw_name_counts = Counter(c[-1] for c in self.columns)
     full_name = self._columns_slashed2full.get(
         passed_name,
         (self._columns_raw2full.get(passed_name) if
          _raw_name_counts.get(passed_name) == 1 else _raw_name_counts.get(
              passed_name, 0)),
     )
     if isinstance(full_name, Iterable):  # string or tuple or list
         return full_name
     else:  # number of occurrences in _raw_name_counts
         if full_name == 0:
             if ignore_missing:
                 return None
             else:
                 msg = "Requested column not in table"
                 raise GeneFabFileException(msg, column=passed_name)
         else:
             msg = "Ambiguous column requested"
             sug = "Use full syntax (columns.ACCESSION/ASSAY/COLUMN)"
             _kw = dict(column=passed_name, suggestion=sug)
             raise GeneFabFileException(msg, **_kw)
 def _sanitize_where(self, context):
     """Infer column names for SQLite WHERE as columns are presented in table or view"""
     passed2full = getattr(
         self,
         # defined in StreamedDataTableWizard_OuterJoined:
         "_unique_column_passed2full",
         # defined in StreamedDataTableWizard/StreamedDataTableWizard_Single:
         self._column_passed2full,
     )
     for dc in getattr(context, "data_comparisons", []):
         match = search(r'(`)([^`]*)(`)', dc)
         if not match:
             msg = "Not a valid column in data comparison"
             raise GeneFabFileException(msg, comparison=dc)
         else:
             sanitized_name = "/".join(passed2full(match.group(2)))
             yield sub(r'(`)([^`]*)(`)', f"`{sanitized_name}`", dc, count=1)
Example #8
0
def combined_data(descriptors, n_descriptors, context, mongo_collections,
                  sqlite_dbs, adapter):
    """Patch through to cached data for each file and combine them.

    Validates that files are joinable and cacheable, picks the appropriate
    SQLite cache backend by file type, fetches each file's formatted data in
    natural sort order, and combines the results into a single object.
    """
    @lru_cache(maxsize=None)
    def getset(*keys):
        # set of values found at the nested key path across all descriptors;
        # a missing path yields {} which collapses to None
        return {
            reduce(lambda d, k: d.get(k, {}), keys, d) or None
            for d in descriptors
        }
    if n_descriptors > 1:
        fail_if_files_not_joinable(getset)
    if getset("file", "cacheable") != {True}:
        raise GeneFabFileException(
            "Data marked as non-cacheable, cannot be returned in this format",
            suggestion="Use 'format=raw'",
            format=context.format,
        )
    _types = getset("file", "type")
    if _types == {"table"}:
        sqlite_db = sqlite_dbs.tables["db"]
        CachedFile = CachedTableFile
        identifier_prefix = "TABLE"
        _kws = dict(
            maxdbsize=sqlite_dbs.tables["maxsize"],
            index_col=0, INPLACE_process=True,
        )
    elif len(_types) == 1:
        sqlite_db = sqlite_dbs.blobs["db"]
        CachedFile = CachedBinaryFile
        identifier_prefix, _kws = "BLOB", {}
    else:
        raise NotImplementedError(f"Joining data of types {_types}")
    def _sort_key(descriptor):
        return descriptor.get("accession"), descriptor.get("assay name")
    data = combine_objects(
        context=context,
        objects=[
            get_formatted_data(
                descriptor, mongo_collections, sqlite_db, CachedFile,
                adapter, identifier_prefix, _kws,
            )
            for descriptor in natsorted(descriptors, key=_sort_key)
        ])
    if data is None:
        raise GeneFabDatabaseException("No data found in database")
    data.datatypes = getset("file", "datatype")
    data.gct_validity_set = getset("file", "gct_valid")
    return data
Example #9
0
def get(*, mongo_collections, locale, context, sqlite_dbs, adapter):
    """Return data corresponding to search parameters; merged if multiple underlying files are same type and joinable"""
    descriptors = PhoenixIterator(aggregate_file_descriptors_by_context(
        mongo_collections.metadata, locale=locale, context=context,
    ))
    # validate every descriptor while counting them
    n_descriptors = 0
    for n_descriptors, descriptor in enumerate(descriptors, 1):
        if ("file" in descriptor) and (not isinstance(descriptor["file"], dict)):
            raise GeneFabDatabaseException(
                "Query did not result in an unambiguous target file",
                debug_info=descriptor,
            )
        elif ("file" not in descriptor) or ("filename" not in descriptor["file"]):
            raise GeneFabDatabaseException(
                "File information missing for entry", entry=descriptor,
            )
    if n_descriptors == 0:
        raise HTTPError(
            quote(context.full_path), 404,
            "No file found matching specified constraints",
            hdrs=None, fp=None,
        )
    if context.format == "raw":
        # raw format can only redirect to exactly one underlying file
        if n_descriptors == 1:
            return file_redirect(next(descriptors))
        raise GeneFabFileException(
            ("Multiple files match query; " +
             "with format 'raw', only one file can be requested"),
            format="raw",
            files={d["file"]["filename"] for d in descriptors},
        )
    return combined_data(
        descriptors, n_descriptors, context,
        mongo_collections, sqlite_dbs, adapter,
    )