def get_files_by_accession(self, accession):
    """Get dictionary of files for dataset available through genelab.nasa.gov/genelabAPIs"""
    try:
        url = self.constants.COLD_GLDS_MASK.format(accession)
        glds_json = read_json(url)
        assert len(glds_json) == 1
        _id = glds_json[0]["_id"]
    except (AssertionError, IndexError, KeyError, TypeError):
        raise GeneFabDataManagerException(
            "Malformed GLDS JSON", accession=accession,
            url=url, object_type=type(glds_json).__name__,
            length=getattr(glds_json, "__len__", lambda: None)(),
            target="[0]['_id']",
        )
    try:
        url = self.constants.COLD_FILELISTINGS_MASK.format(_id)
        filelisting_json = read_json(url)
        assert isinstance(filelisting_json, list)
    except AssertionError:
        raise GeneFabDataManagerException(
            "Malformed 'filelistings' JSON", accession=accession,
            _id=_id, url=url,
            object_type=type(filelisting_json).__name__,
            expected_type="list",
        )
    else:
        files = json_normalize(filelisting_json)
        with catch_warnings():
            filterwarnings("ignore", category=UnknownTimezoneWarning)
            files["date_created"] = as_timestamp(files, "date_created")
            files["date_modified"] = as_timestamp(files, "date_modified")
        files["timestamp"] = files[["date_created", "date_modified"]].max(axis=1)
        return {
            row["file_name"]: self._format_file_entry(row)
            for _, row in files.sort_values(by="timestamp").iterrows()
        }
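# A minimal sketch, not from the source: the timestamp bookkeeping above in
# isolation. json_normalize() flattens the file-listing records into a frame,
# and each row's "timestamp" is the later of date_created/date_modified, so
# sort_values(by="timestamp") puts the most recently touched file last. The
# records below are hypothetical.
from pandas import json_normalize

_records = [
    {"file_name": "a.txt", "date_created": 1600000000, "date_modified": 1600000500},
    {"file_name": "b.txt", "date_created": 1600001000, "date_modified": 1600000000},
]
_frame = json_normalize(_records)
_frame["timestamp"] = _frame[["date_created", "date_modified"]].max(axis=1)
assert list(_frame.sort_values(by="timestamp")["file_name"]) == ["a.txt", "b.txt"]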
def read_json(url):
    """Get parsed JSON from URL"""
    try:
        with request_get(url) as response:
            return response.json()
    except (URLError, OSError):
        raise GeneFabDataManagerException("Not found", url=url)
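# A usage sketch, not from the source; the URL is hypothetical. request_get
# is assumed to be requests.get bound at import time; requests' own
# RequestException subclasses OSError, so the except clause above also
# catches connection errors raised by requests itself:
#
#   from requests import get as request_get
#   payload = read_json("https://example.com/datasets.json")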
def get_accessions(self):
    """Return set of dataset accessions available through genelab.nasa.gov/genelabAPIs"""
    try:
        n_datasets_url = self.constants.COLD_SEARCH_MASK.format(0)
        n_datasets = read_json(n_datasets_url)["hits"]["total"]
        datasets_url = self.constants.COLD_SEARCH_MASK.format(n_datasets)
        return {e["_id"] for e in read_json(datasets_url)["hits"]["hits"]}
    except (KeyError, TypeError):
        raise GeneFabDataManagerException("Malformed GeneLab search JSON")
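# A minimal sketch, not from the source: the two-step pagination used above.
# COLD_SEARCH_MASK is assumed to be a search URL whose single format slot is
# the page size; requesting size 0 yields only hits.total, which is then used
# to request every hit in one page. The stub mimics the expected JSON shape.
_page = {"hits": {"total": 2, "hits": [{"_id": "GLDS-1"}, {"_id": "GLDS-2"}]}}
assert {e["_id"] for e in _page["hits"]["hits"]} == {"GLDS-1", "GLDS-2"}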
def __download_as_blob(self):
    """Download data from URL as-is"""
    for url in self.urls:
        GeneFabLogger.info(f"{self.name}; trying URL:\n {url}")
        try:
            with request_get(url) as response:
                data = response.content
        except (URLError, OSError) as e:
            msg = f"{self.name}; tried URL and failed:\n {url}"
            GeneFabLogger.warning(msg, exc_info=e)
        else:
            msg = f"{self.name}; successfully fetched blob:\n {url}"
            GeneFabLogger.info(msg)
            self.url = url
            return data
    else:
        msg = "None of the URLs are reachable for file"
        _kw = dict(name=self.name, urls=self.urls)
        raise GeneFabDataManagerException(msg, **_kw)
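# A toy sketch, not from the source: the for/else shape of the loop above.
# A for loop's else clause runs when the loop finishes without break; since
# the success path returns from the function, the else body is reached only
# after every URL has failed.
def _first_reachable(urls, fetch):
    for url in urls:
        try:
            return fetch(url)
        except OSError:
            continue
    else:
        raise LookupError("none of the URLs are reachable")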
def __init__(self, accession, files, sqlite_dbs, best_sample_name_matches=None, status_kwargs=None):
    self.accession, self.files = accession, files
    self.sqlite_db = sqlite_dbs.blobs["db"]
    self.maxdbsize = sqlite_dbs.blobs["maxsize"]
    self.best_sample_name_matches = (
        best_sample_name_matches or
        (lambda n, N: Adapter.best_sample_name_matches(None, n, N))
    )
    isa_files = {
        filename: descriptor for filename, descriptor in files.items()
        if descriptor.get("datatype") == "isa"
    }
    if len(isa_files) != 1:
        msg = "File entries for Dataset must contain exactly one ISA file"
        _kw = dict(accession=accession, filenames=set(isa_files))
        raise GeneFabDataManagerException(msg, **_kw)
    else:
        isa_name, isa_desc = next(iter(isa_files.items()))
        urls = isa_desc.get("urls", ())
        isa_file = CachedBinaryFile(
            name=isa_name, identifier=f"BLOB:{accession}/ISA/{isa_name}",
            sqlite_db=self.sqlite_db, maxdbsize=self.maxdbsize,
            urls=urls, timestamp=isa_desc.get("timestamp", -1),
        )
        self.isa = IsaFromZip(
            data=isa_file.data,
            status_kwargs={
                **(status_kwargs or {}), "accession": accession,
                "filename": isa_file.name, "url": isa_file.url,
            },
        )
        self.isa.changed = isa_file.changed
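# A minimal sketch, not from the source: the shape of `files` this
# constructor expects. Keys are filenames; exactly one descriptor must carry
# datatype == "isa", and its optional "urls" and "timestamp" fields feed the
# CachedBinaryFile above. All values here are hypothetical.
_file_entries = {
    "GLDS-1_metadata_ISA.zip": {
        "datatype": "isa",
        "urls": ["https://example.com/GLDS-1_metadata_ISA.zip"],
        "timestamp": 1600000000,
    },
    "GLDS-1_counts.csv": {"datatype": "unknown"},
}
_isa_files = {k: v for k, v in _file_entries.items() if v.get("datatype") == "isa"}
assert len(_isa_files) == 1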
def __copyfileobj(self, tempfile):
    """Try all URLs and push data into temporary file"""
    for url in self.urls:
        with open(tempfile, mode="wb") as handle:
            GeneFabLogger.info(f"{self.name}; trying URL:\n {url}")
            try:
                with request_get(url, stream=True) as response:
                    response.raw.decode_content = True
                    msg = f"{self.name}:\n streaming to {tempfile}"
                    GeneFabLogger.debug(msg)
                    copyfileobj(response.raw, handle)
            except (URLError, OSError) as e:
                msg = f"{self.name}; tried URL and failed:\n {url}"
                GeneFabLogger.warning(msg, exc_info=e)
            else:
                msg = f"{self.name}; successfully fetched data:\n {url}"
                GeneFabLogger.info(msg)
                return url
    else:
        msg = "None of the URLs are reachable for file"
        _kw = dict(name=self.name, urls=self.urls)
        raise GeneFabDataManagerException(msg, **_kw)
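# A standalone sketch, not from the source: the streaming pattern used above,
# assuming request_get is requests.get. Setting response.raw.decode_content
# makes the underlying urllib3 stream yield decoded bytes (e.g. gunzipped)
# rather than the raw transfer encoding, which is what copyfileobj should see.
from shutil import copyfileobj
from requests import get as request_get

def _stream_to_file(url, path):
    with open(path, mode="wb") as handle:
        with request_get(url, stream=True) as response:
            response.raw.decode_content = True
            copyfileobj(response.raw, handle)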
def harmonize_columns(columns, descriptor, sample_names, best_sample_name_matches):
    """Match sample names to columns, infer correct order of original columns based on order of sample_names"""
    harmonized_column_order, harmonized_positions = [], []
    include_nomatch = (descriptor["file"].get("column_subset") != "sample name")
    for i, c in enumerate(columns):
        hcs, ps = best_sample_name_matches(
            c, sample_names, return_positions=True,
        )
        if len(hcs) == 0:
            if include_nomatch:
                harmonized_column_order.append(c)
            else:
                harmonized_column_order.append(None)
        elif len(hcs) == 1:
            harmonized_column_order.append(hcs[0])
            harmonized_positions.append((ps[0], i))
        else:
            msg = "Column name matches multiple sample names"
            filename = descriptor["file"].get("filename")
            _kws = dict(filename=filename, column=c, sample_names=hcs)
            raise GeneFabDataManagerException(msg, **_kws)
    # Reorder matched slots so the output follows the order of sample_names.
    # `harmonized_positions` holds (position_in_sample_names, column_index)
    # pairs; sorting by the former yields, for each matched slot in file
    # order, the index of the column that should occupy it:
    harmonized_unordered = harmonized_column_order[:]
    original_unordered = list(columns)
    column_order = original_unordered[:]
    current_positions = [i for p, i in harmonized_positions]
    target_positions = [i for p, i in sorted(harmonized_positions)]
    for cp, tp in zip(current_positions, target_positions):
        # the k-th matched slot (cp) receives the column with the k-th
        # smallest sample-name position (tp); the reverse assignment would
        # produce the inverse permutation for cycles of length > 2
        harmonized_column_order[cp] = harmonized_unordered[tp]
        column_order[cp] = original_unordered[tp]
    return (
        [c for c in column_order if c is not None],
        [c for c in harmonized_column_order if c is not None],
    )
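# A worked example, not from the source: an exact-match stand-in for the
# adapter-provided best_sample_name_matches (which may match fuzzily).
# Matched columns are reordered to follow sample_names, while the unmatched
# "gene_id" column is kept because the descriptor requests no column subset.
def _exact_matches(column, sample_names, return_positions=False):
    hits = [(p, n) for p, n in enumerate(sample_names) if n == column]
    names, positions = [n for p, n in hits], [p for p, n in hits]
    return (names, positions) if return_positions else names

original, harmonized = harmonize_columns(
    columns=["gene_id", "Sample B", "Sample C", "Sample A"],
    descriptor={"file": {"filename": "counts.csv"}},
    sample_names=["Sample A", "Sample B", "Sample C"],
    best_sample_name_matches=_exact_matches,
)
assert original == harmonized == ["gene_id", "Sample A", "Sample B", "Sample C"]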