コード例 #1
0
 def get_files_by_accession(self, accession):
     """Get dictionary of files for dataset available through genelab.nasa.gov/genelabAPIs

     Resolves the dataset's internal `_id` from the GLDS JSON, fetches the
     file listing for that `_id`, and returns a dict mapping each
     "file_name" to the result of `self._format_file_entry(row)`. Rows are
     visited in ascending order of "timestamp", which is the later of
     "date_created" and "date_modified"; a repeated file name is therefore
     overwritten by the row with the later timestamp.

     Raises GeneFabDataManagerException if the GLDS JSON does not consist
     of exactly one entry with an "_id", or if the file listing JSON is
     not a list.
     """
     # Fetch outside of the `try` so that the handler below can always
     # describe `glds_json` (it would be unbound if the fetch failed there)
     url = self.constants.COLD_GLDS_MASK.format(accession)
     glds_json = read_json(url)
     try:
         # Explicit check instead of `assert`: assertions are removed when
         # Python runs with -O, which would silently disable validation
         if len(glds_json) != 1:
             raise ValueError("GLDS JSON must contain exactly one entry")
         _id = glds_json[0]["_id"]
     except (ValueError, IndexError, KeyError, TypeError):
         raise GeneFabDataManagerException(
             "Malformed GLDS JSON", accession=accession,
             url=url, object_type=type(glds_json).__name__,
             length=getattr(glds_json, "__len__", lambda: None)(),
             target="[0]['_id']",
         )
     url = self.constants.COLD_FILELISTINGS_MASK.format(_id)
     filelisting_json = read_json(url)
     if not isinstance(filelisting_json, list):
         raise GeneFabDataManagerException(
             "Malformed 'filelistings' JSON", accession=accession, _id=_id,
             url=url, object_type=type(filelisting_json).__name__,
             expected_type="list",
         )
     files = json_normalize(filelisting_json)
     with catch_warnings():
         # Timezone warnings raised while converting dates are not actionable here
         filterwarnings("ignore", category=UnknownTimezoneWarning)
         files["date_created"] = as_timestamp(files, "date_created")
         files["date_modified"] = as_timestamp(files, "date_modified")
     files["timestamp"] = files[["date_created", "date_modified"]].max(axis=1)
     return {
         row["file_name"]: self._format_file_entry(row)
         for _, row in files.sort_values(by="timestamp").iterrows()
     }
コード例 #2
0
def read_json(url):
    """Fetch `url` and return the response body parsed as JSON.

    Raises GeneFabDataManagerException("Not found") if the request fails
    with URLError or OSError.
    """
    try:
        with request_get(url) as response:
            parsed = response.json()
    except (URLError, OSError):
        raise GeneFabDataManagerException("Not found", url=url)
    else:
        return parsed
コード例 #3
0
 def get_accessions(self):
     """Return set of dataset accessions available through genelab.nasa.gov/genelabAPIs

     Queries the search endpoint twice: once to read the total number of
     hits, then again with that total to collect the "_id" of every hit.

     Raises GeneFabDataManagerException if either response lacks the
     expected "hits" structure.
     """
     try:
         search_mask = self.constants.COLD_SEARCH_MASK
         # First query only serves to learn how many datasets there are
         # (presumably the mask argument is the number of results requested)
         total = read_json(search_mask.format(0))["hits"]["total"]
         hits = read_json(search_mask.format(total))["hits"]["hits"]
         accessions = set()
         for entry in hits:
             accessions.add(entry["_id"])
         return accessions
     except (KeyError, TypeError):
         raise GeneFabDataManagerException("Malformed GeneLab search JSON")
コード例 #4
0
 def __download_as_blob(self):
     """Return the raw content of the first URL in `self.urls` that can be fetched.

     URLs are tried in order; a URLError or OSError is logged as a warning
     and the next URL is tried. On success the working URL is stored in
     `self.url`. Raises GeneFabDataManagerException if every URL fails.
     """
     for candidate in self.urls:
         GeneFabLogger.info(f"{self.name}; trying URL:\n  {candidate}")
         try:
             with request_get(candidate) as response:
                 blob = response.content
         except (URLError, OSError) as exc:
             GeneFabLogger.warning(
                 f"{self.name}; tried URL and failed:\n  {candidate}",
                 exc_info=exc,
             )
             continue
         GeneFabLogger.info(
             f"{self.name}; successfully fetched blob:\n  {candidate}",
         )
         self.url = candidate
         return blob
     # Only reached when no URL returned successfully
     raise GeneFabDataManagerException(
         "None of the URLs are reachable for file",
         name=self.name, urls=self.urls,
     )
コード例 #5
0
ファイル: types.py プロジェクト: LankyCyril/genefab3
 def __init__(self,
              accession,
              files,
              sqlite_dbs,
              best_sample_name_matches=None,
              status_kwargs=None):
     """Set up the dataset and load its ISA archive.

     `files` maps file names to descriptor dicts; exactly one descriptor
     must have "datatype" equal to "isa", otherwise
     GeneFabDataManagerException is raised. That file is wrapped in a
     CachedBinaryFile and parsed with IsaFromZip, and the result is stored
     in `self.isa`. If `best_sample_name_matches` is falsy, a wrapper
     around `Adapter.best_sample_name_matches` is used instead.
     """
     self.accession = accession
     self.files = files
     self.sqlite_db = sqlite_dbs.blobs["db"]
     self.maxdbsize = sqlite_dbs.blobs["maxsize"]
     if best_sample_name_matches:
         self.best_sample_name_matches = best_sample_name_matches
     else:
         self.best_sample_name_matches = (
             lambda n, N: Adapter.best_sample_name_matches(None, n, N))
     isa_files = {}
     for filename, descriptor in files.items():
         if descriptor.get("datatype") == "isa":
             isa_files[filename] = descriptor
     if len(isa_files) != 1:
         raise GeneFabDataManagerException(
             "File entries for Dataset must contain exactly one ISA file",
             accession=accession, filenames=set(isa_files),
         )
     ((isa_name, isa_desc),) = isa_files.items()
     isa_file = CachedBinaryFile(
         name=isa_name,
         identifier=f"BLOB:{accession}/ISA/{isa_name}",
         sqlite_db=self.sqlite_db,
         maxdbsize=self.maxdbsize,
         urls=isa_desc.get("urls", ()),
         timestamp=isa_desc.get("timestamp", -1),
     )
     # Read the data before the other attributes, same as the order in
     # which the IsaFromZip arguments are evaluated
     isa_data = isa_file.data
     isa_status_kwargs = {
         **(status_kwargs or {}),
         "accession": accession,
         "filename": isa_file.name,
         "url": isa_file.url,
     }
     self.isa = IsaFromZip(data=isa_data, status_kwargs=isa_status_kwargs)
     self.isa.changed = isa_file.changed
コード例 #6
0
 def __copyfileobj(self, tempfile):
     """Stream the first reachable URL in `self.urls` into `tempfile`; return that URL.

     `tempfile` is reopened in "wb" mode for every attempt, so a failed
     attempt leaves nothing behind for the next one. A URLError or OSError
     is logged as a warning and the next URL is tried. Raises
     GeneFabDataManagerException if every URL fails.
     """
     for candidate in self.urls:
         with open(tempfile, mode="wb") as sink:
             GeneFabLogger.info(f"{self.name}; trying URL:\n  {candidate}")
             try:
                 with request_get(candidate, stream=True) as response:
                     response.raw.decode_content = True
                     GeneFabLogger.debug(
                         f"{self.name}:\n  streaming to {tempfile}",
                     )
                     copyfileobj(response.raw, sink)
             except (URLError, OSError) as exc:
                 GeneFabLogger.warning(
                     f"{self.name}; tried URL and failed:\n  {candidate}",
                     exc_info=exc,
                 )
                 continue
             GeneFabLogger.info(
                 f"{self.name}; successfully fetched data:\n  {candidate}",
             )
             return candidate
     # Only reached when no URL could be streamed successfully
     raise GeneFabDataManagerException(
         "None of the URLs are reachable for file",
         name=self.name, urls=self.urls,
     )
コード例 #7
0
ファイル: data.py プロジェクト: LankyCyril/genefab3
def harmonize_columns(columns, descriptor, sample_names,
                      best_sample_name_matches):
    """Match sample names to columns, infer correct order of original columns based on order of sample_names

    Each column is passed to `best_sample_name_matches(column, sample_names,
    return_positions=True)`, which must return the matching sample names
    and their positions in `sample_names`.

    Columns that match exactly one sample keep occupying the same set of
    slots, but are rearranged within those slots so that they follow the
    order of `sample_names`. Columns that match nothing stay where they
    are; in the harmonized list they are kept under their original name,
    unless `descriptor["file"]["column_subset"]` is "sample name", in which
    case they are dropped from the harmonized list.

    Returns a tuple `(column_order, harmonized_column_order)`: the original
    column names and the harmonized names, both in the inferred order.

    Raises GeneFabDataManagerException if a column matches more than one
    sample name.
    """
    include_nomatch = (
        descriptor["file"].get("column_subset") != "sample name")
    # Materialize once so that one-shot iterables can be indexed below
    original_unordered = list(columns)
    harmonized_unordered = []
    matched = []  # pairs of (position in sample_names, column index)
    for i, c in enumerate(original_unordered):
        hcs, ps = best_sample_name_matches(
            c, sample_names, return_positions=True,
        )
        if len(hcs) == 0:
            harmonized_unordered.append(c if include_nomatch else None)
        elif len(hcs) == 1:
            harmonized_unordered.append(hcs[0])
            matched.append((ps[0], i))
        else:
            msg = "Column name matches multiple sample names"
            filename = descriptor["file"].get("filename")
            _kws = dict(filename=filename, column=c, sample_names=hcs)
            raise GeneFabDataManagerException(msg, **_kws)
    column_order = original_unordered[:]
    harmonized_column_order = harmonized_unordered[:]
    # Slots available to matched columns, in the order they appear
    slots = [i for _, i in matched]
    # Indices of the matched columns, in the order of their sample names
    sources = [i for _, i in sorted(matched)]
    # Fill the k-th slot with the column matched to the k-th sample; writing
    # the other way around would apply the inverse permutation, which is
    # only correct when the columns are already in order or merely swapped
    for slot, source in zip(slots, sources):
        harmonized_column_order[slot] = harmonized_unordered[source]
        column_order[slot] = original_unordered[source]
    # NOTE(review): unmatched columns are never None in `column_order`, so
    # with column_subset == "sample name" the two returned lists differ in
    # length -- confirm that callers expect this
    return (
        [c for c in column_order if c is not None],
        [c for c in harmonized_column_order if c is not None],
    )