def combine_objects(objects, context, limit=None):
    """Combine retrieved data objects into one and post-process for output.

    Parameters:
        objects: list of data objects; more than one can only be combined
            if all are StreamedDataTableWizard instances
        context: request context providing `data_columns`,
            `data_comparisons`, and `format`
        limit: unused here; kept for backward-compatible interface

    Returns:
        None if `objects` is empty; the formatted table (via
        `combined.get`) for table wizards; otherwise the single raw object.

    Raises:
        NotImplementedError: when multiple non-table objects are passed
        GeneFabFormatException: when GCT format is requested together with
            an arbitrary column subset
        GeneFabFileException: when column operations are requested on
            non-table data
    """
    if not objects:  # nothing matched; caller decides how to report
        return None
    if len(objects) == 1:
        combined = objects[0]
    elif all(isinstance(obj, StreamedDataTableWizard) for obj in objects):
        # only streamed tables know how to merge; join column-wise
        combined = StreamedDataTableWizard.concat(objects, axis=1)
    else:
        raise NotImplementedError("Merging non-table data objects")
    if isinstance(combined, StreamedDataTableWizard):
        if context.data_columns and (context.format == "gct"):
            # GCT requires the full, well-defined column set
            msg = "GCT format is disabled for arbitrarily subset tables"
            _kw = dict(columns="|".join(context.data_columns))
            raise GeneFabFormatException(msg, **_kw)
        combined.constrain_columns(context=context)
        return combined.get(context=context)
    if context.data_columns or context.data_comparisons:
        raise GeneFabFileException(
            "Column operations on non-table data objects are not supported",
            columns=context.data_columns,
            comparisons=context.data_comparisons,
        )
    return combined
def __download_as_pandas(self, chunksize, sniff_ahead=2**20):
    """Download and parse data from URL as a table

    Parameters:
        chunksize: number of rows per DataFrame chunk passed to `read_csv`
        sniff_ahead: how many characters of the decompressed file to feed
            to csv.Sniffer for delimiter detection (default 2**20)

    Yields:
        pandas DataFrame chunks, each first run through `self.INPLACE_process`

    Raises:
        GeneFabFileException: if the downloaded file cannot be parsed as a table
    """
    with self.__tempfile() as tempfile:
        # download into the temporary file; the return value is stored as
        # self.url (presumably the URL actually fetched — see __copyfileobj)
        self.url = self.__copyfileobj(tempfile)
        with open(tempfile, mode="rb") as handle:
            # peek at the first bytes to detect compression by magic number
            magic = handle.read(3)
        if magic == b"\x1f\x8b\x08":  # gzip magic bytes
            compression = "gzip"
            from gzip import open as _open
        elif magic == b"\x42\x5a\x68":  # bzip2 magic bytes ("BZh")
            compression = "bz2"
            from bz2 import open as _open
        else:
            # unknown magic: let pandas infer compression, sniff as plain text
            compression, _open = "infer", open
        try:
            # sniff the delimiter from a (decompressed) text prefix
            with _open(tempfile, mode="rt", newline="") as handle:
                sep = Sniffer().sniff(handle.read(sniff_ahead)).delimiter
            _reader_kw = dict(
                sep=sep, compression=compression, chunksize=chunksize,
                **self.pandas_kws,
            )
            # stream the table chunk by chunk, processing in place as we go
            for i, csv_chunk in enumerate(read_csv(tempfile, **_reader_kw)):
                self.INPLACE_process(csv_chunk)
                msg = f"interpreted table chunk {i}:\n {tempfile}"
                GeneFabLogger.info(f"{self.name}; {msg}")
                yield csv_chunk
        except (IOError, UnicodeDecodeError, CSVError, PandasParserError):
            # any sniffing/decoding/parsing failure means this is not a table
            msg = "Not recognized as a table file"
            raise GeneFabFileException(msg, name=self.name, url=self.url)
def fail_if_files_not_joinable(getset):
    """Check for ability to join data from requested files; raise
    GeneFabFileException on the first failed constraint, otherwise return."""
    datatypes = getset("file", "datatype")
    if len(datatypes) > 1:
        raise GeneFabFileException(
            "Cannot combine data of multiple datatypes",
            datatypes=datatypes,
        )
    technology_types = getset("technology type")
    if len(technology_types) > 1:
        raise GeneFabFileException(
            "Cannot combine data for multiple technology types",
            technology_types=technology_types,
        )
    if getset("file", "joinable") != {True}:
        raise GeneFabFileException(
            "Cannot combine files of this datatype from one or more assays",
            datatype=getset("file", "datatype").pop(),
            accessions=getset("accession"),
            assay_names=getset("assay name"),
            filenames=getset("file", "filename"),
        )
    if getset("file", "type") != {"table"}:
        raise GeneFabFileException(
            "Cannot combine non-table files",
            types=getset("file", "type"),
        )
    if len(getset("file", "index_name")) > 1:
        raise GeneFabFileException(
            "Cannot combine tables with conflicting index names",
            index_names=getset("file", "index_name"),
        )
def _unique_column_passed2full(self, passed_name): """Match passed column name to unique full column name found in self.objs[*].columns""" matches_and_misses = { o._column_passed2full(passed_name, ignore_missing=True) for o in self.objs } matches = matches_and_misses - {None} if not matches: msg = "Requested column not in table" raise GeneFabFileException(msg, column=passed_name) elif len(matches) > 1: msg = "Ambiguous column requested" sug = "Use full syntax (columns.ACCESSION/ASSAY/COLUMN)" _kw = dict(column=passed_name, suggestion=sug) raise GeneFabFileException(msg, **_kw) else: return matches.pop()
def columns(self, passed_columns):
    """Assign `passed_columns` to self._columns, but only when every raw
    (last-level) name is already present among the current columns."""
    incoming = {c[-1] for c in passed_columns}
    known = {c[-1] for c in self._columns}
    unknown = incoming - known
    if unknown:
        raise GeneFabFileException(
            "Setting foreign column(s) to StreamedDataTableWizard",
            columns=sorted(unknown),
        )
    self._columns = passed_columns
def _column_passed2full(self, passed_name, ignore_missing=False): """Match passed column name to full column name found in self.columns""" _raw_name_counts = Counter(c[-1] for c in self.columns) full_name = self._columns_slashed2full.get( passed_name, (self._columns_raw2full.get(passed_name) if _raw_name_counts.get(passed_name) == 1 else _raw_name_counts.get( passed_name, 0)), ) if isinstance(full_name, Iterable): # string or tuple or list return full_name else: # number of occurrences in _raw_name_counts if full_name == 0: if ignore_missing: return None else: msg = "Requested column not in table" raise GeneFabFileException(msg, column=passed_name) else: msg = "Ambiguous column requested" sug = "Use full syntax (columns.ACCESSION/ASSAY/COLUMN)" _kw = dict(column=passed_name, suggestion=sug) raise GeneFabFileException(msg, **_kw)
def _sanitize_where(self, context): """Infer column names for SQLite WHERE as columns are presented in table or view""" passed2full = getattr( self, # defined in StreamedDataTableWizard_OuterJoined: "_unique_column_passed2full", # defined in StreamedDataTableWizard/StreamedDataTableWizard_Single: self._column_passed2full, ) for dc in getattr(context, "data_comparisons", []): match = search(r'(`)([^`]*)(`)', dc) if not match: msg = "Not a valid column in data comparison" raise GeneFabFileException(msg, comparison=dc) else: sanitized_name = "/".join(passed2full(match.group(2))) yield sub(r'(`)([^`]*)(`)', f"`{sanitized_name}`", dc, count=1)
def combined_data(descriptors, n_descriptors, context, mongo_collections, sqlite_dbs, adapter):
    """Patch through to cached data for each file and combine them"""
    # memoized accessor: for each descriptor, walk `keys` into nested dicts
    # and collect the leaf values (coerced to None when falsy) as a set
    def _collect(*keys):
        return set(
            reduce(lambda node, key: node.get(key, {}), keys, entry) or None
            for entry in descriptors
        )
    getset = lru_cache(maxsize=None)(_collect)
    if n_descriptors > 1:
        fail_if_files_not_joinable(getset)
    if getset("file", "cacheable") != {True}:
        raise GeneFabFileException(
            "Data marked as non-cacheable, cannot be returned in this format",
            suggestion="Use 'format=raw'", format=context.format,
        )
    file_types = getset("file", "type")
    if file_types == {"table"}:
        # table files are cached into the tables SQLite database
        sqlite_db = sqlite_dbs.tables["db"]
        CachedFile, identifier_prefix = CachedTableFile, "TABLE"
        _kws = dict(
            maxdbsize=sqlite_dbs.tables["maxsize"],
            index_col=0, INPLACE_process=True,
        )
    elif len(file_types) == 1:
        # any other single type is treated as an opaque blob
        sqlite_db = sqlite_dbs.blobs["db"]
        CachedFile, identifier_prefix, _kws = CachedBinaryFile, "BLOB", {}
    else:
        raise NotImplementedError(f"Joining data of types {file_types}")
    # stable, natural ordering by accession then assay name
    def _sort_key(entry):
        return (entry.get("accession"), entry.get("assay name"))
    objects = [
        get_formatted_data(
            descriptor, mongo_collections, sqlite_db, CachedFile, adapter,
            identifier_prefix, _kws,
        )
        for descriptor in natsorted(descriptors, key=_sort_key)
    ]
    data = combine_objects(context=context, objects=objects)
    if data is None:
        raise GeneFabDatabaseException("No data found in database")
    data.datatypes = getset("file", "datatype")
    data.gct_validity_set = getset("file", "gct_valid")
    return data
def get(*, mongo_collections, locale, context, sqlite_dbs, adapter):
    """Return data corresponding to search parameters; merged if multiple
    underlying files are same type and joinable"""
    descriptors = PhoenixIterator(aggregate_file_descriptors_by_context(
        mongo_collections.metadata, locale=locale, context=context,
    ))
    # validate every descriptor while counting them
    count = 0
    for count, entry in enumerate(descriptors, 1):
        if ("file" in entry) and (not isinstance(entry["file"], dict)):
            raise GeneFabDatabaseException(
                "Query did not result in an unambiguous target file",
                debug_info=entry,
            )
        if ("file" not in entry) or ("filename" not in entry["file"]):
            raise GeneFabDatabaseException(
                "File information missing for entry", entry=entry,
            )
    if count == 0:
        raise HTTPError(
            quote(context.full_path), 404,
            "No file found matching specified constraints",
            hdrs=None, fp=None,
        )
    if context.format != "raw":
        return combined_data(
            descriptors, count, context, mongo_collections, sqlite_dbs,
            adapter,
        )
    if count == 1:
        return file_redirect(next(descriptors))
    raise GeneFabFileException(
        ("Multiple files match query; " +
         "with format 'raw', only one file can be requested"),
        format="raw",
        files={entry["file"]["filename"] for entry in descriptors},
    )