コード例 #1
0
ファイル: _dataset.py プロジェクト: Tubbz-alt/genefab
 def __init__(self, accession, verbose=False, storage_prefix=STORAGE_PREFIX, index_by="Sample Name", name_delim=DELIM_DEFAULT):
     """Fetch the study JSON for `accession` and unpack it into attributes.

     The response of the study endpoint must contain exactly one section.
     From that section the internal id, the metadata id and the ISA
     "additionalInformation" payload are stored; its "description",
     "samples", "ontologies" and "organisms" entries become attributes of
     the same names. Finally an AssayDispatcher is built from the "assays"
     entry together with the file URL and file date lookups.

     Raises:
         GeneLabJSONException: if the response has zero or several sections,
             or if an expected key is missing from the section.
         NotImplementedError: if "foreignFields" does not hold exactly one
             entry.
     """
     self.accession = accession
     self.verbose = verbose
     self.storage = join(storage_prefix, accession)
     study_url = "{}/data/study/data/{}/".format(API_ROOT, accession)
     data_json = get_json(study_url, self.verbose)
     section_count = len(data_json)
     if section_count == 0:
         raise GeneLabJSONException("Invalid JSON (GLDS does not exist?)")
     if section_count > 1:
         raise GeneLabJSONException("Invalid JSON, too many sections")
     self._json = data_json[0]
     try:
         self.internal_id = self._json["_id"]
         self.metadata_id = self._json["metadata_id"]
         foreign_fields = self._json["foreignFields"]
         if len(foreign_fields) != 1:
             raise NotImplementedError("Multiple foreignFields")
         self._isa2json = foreign_fields[0]["isa2json"]
         self._info = self._isa2json["additionalInformation"]
         for field in ("description", "samples", "ontologies", "organisms"):
             setattr(self, field, self._info[field])
     except KeyError:
         raise GeneLabJSONException(
             "Malformed JSON ({})".format(self.accession)
         )
     # Resolved outside the try block on purpose: a missing "assays" key
     # surfaces as a plain KeyError, and URLs are requested before dates.
     assays_json = self._info["assays"]
     file_urls = self.get_files_info("urls")
     file_dates = self.get_files_info("dates")
     self.assays = AssayDispatcher(
         parent=self,
         json=assays_json,
         storage_prefix=self.storage,
         name_delim=name_delim,
         glds_file_urls=file_urls,
         index_by=index_by,
         glds_file_dates=file_dates,
     )
コード例 #2
0
 def __init__(self, parent, name, json, glds_file_urls, glds_file_dates, storage_prefix, index_by, name_delim):
     """Build assay metadata from one assay section of the dataset JSON.

     Keeps the "raw" records and the "header" of the section, groups field
     ids by their title, assembles the records into `raw_metadata` (one row
     per record) indexed by the field that corresponds to `index_by`, and,
     unless `name_delim` is DELIM_AS_IS, replaces every '.', '_' and '-' in
     the index values with `name_delim`.

     Raises:
         GeneLabJSONException: if two header entries share a field id.
         IndexError: if `index_by` does not match exactly one field title.
     """
     self.parent = parent
     self.name = name
     self._json = json
     self.glds_file_urls = glds_file_urls
     self.glds_file_dates = glds_file_dates
     self.storage = join(storage_prefix, name)
     raw_records, header = self._json["raw"], self._json["header"]
     self._raw = raw_records
     self._header = header
     # field id -> title; a size mismatch means some field id occurs twice:
     field2title = {}
     for entry in self._header:
         field, title = entry["field"], entry["title"]
         field2title[field] = title
     self._field2title = field2title
     if len(self._field2title) != len(self._header):
         raise GeneLabJSONException("Conflicting IDs of data fields")
     # title -> set of field ids carrying that title:
     title2fields = {}
     for field, title in self._field2title.items():
         title2fields.setdefault(title, set()).add(field)
     self._fields = title2fields
     # each record becomes a column, the transpose turns records into rows:
     record_columns = [Series(record) for record in self._raw]
     self.raw_metadata = concat(record_columns, axis=1).T
     self._field_indexed_by = self._get_unique_field_from_title(index_by)
     title_matches = self._match_field_titles(index_by, method=fullmatch)
     if len(title_matches) != 1:
         raise IndexError(
             "Nonexistent or ambiguous index_by value: '{}'".format(index_by)
         )
     self._indexed_by = title_matches.pop()
     self.raw_metadata = self.raw_metadata.set_index(self._field_indexed_by)
     self._name_delim = name_delim
     if name_delim != DELIM_AS_IS:
         def unify_delimiters(label):
             return sub(r'[._-]', name_delim, label)
         self.raw_metadata.index = self.raw_metadata.index.map(
             unify_delimiters
         )
     # the title used for indexing is no longer listed as a regular field:
     del self._fields[self._indexed_by]
     # initialize indexing functions:
     self.metadata = AssayMetadata(self)
コード例 #3
0
ファイル: _dataset.py プロジェクト: Tubbz-alt/genefab
 def get_files_info(self, kind="urls"):
     """Return a mapping of file names to URLs or to date stamps.

     Args:
         kind: "urls" maps each file name to GENELAB_ROOT plus its
             "remote_url"; "dates" maps each file name to the result of
             date2stamp() for its file listing entry.

     Returns:
         dict keyed by file name.

     Raises:
         ValueError: if the instance has no accession, if the accession
             does not end in digits (the "urls" query is built from that
             trailing number), or if `kind` is not recognized.
         GeneLabJSONException: if the "urls" response lacks the
             studies -> accession -> study_files structure.
     """
     if self.accession is None:
         raise ValueError("Uninitialized GLDS instance")
     if kind == "urls":
         acc_match = search(r'\d+$', self.accession)
         if acc_match is None:
             # without this guard a malformed accession surfaces as an
             # opaque AttributeError on None
             raise ValueError(
                 "Malformed accession: '{}'".format(self.accession)
             )
         files_json = get_json(
             "{}/data/glds/files/{}".format(API_ROOT, acc_match.group()),
             self.verbose
         )
         try:
             filedata = files_json["studies"][self.accession]["study_files"]
         except KeyError as e:
             raise GeneLabJSONException("Malformed JSON") from e
         return {
             fd["file_name"]: GENELAB_ROOT+fd["remote_url"]
             for fd in filedata
         }
     if kind == "dates":
         filedata = get_json(
             "{}/data/study/filelistings/{}".format(
                 API_ROOT, self.internal_id
             ),
             self.verbose
         )
         return {fd["file_name"]: date2stamp(fd) for fd in filedata}
     raise ValueError("Unrecognized parameter: '{}'".format(kind))
コード例 #4
0
ファイル: _dataset.py プロジェクト: Tubbz-alt/genefab
def get_datasets(maxcount="25", storage=STORAGE_PREFIX, verbose=False, onerror="warn", **ffield_kwargs):
    """Search for datasets and return a GeneLabDataSet for each hit.

    The search URL is assembled from fixed lead components, `maxcount`, and
    one "ffield=...&fvalue=..." component per (field, value) pair produced
    by get_ffield_matches() for `ffield_kwargs`.

    Args:
        maxcount: value of the "size" query parameter.
        storage: passed to each GeneLabDataSet as `storage_prefix`.
        verbose: passed through to the JSON getter and to each dataset.
        onerror: what to do when a hit cannot be turned into a dataset:
            "ignore" skips it silently, "warn" prints a message to stderr
            and skips it, any other value re-raises the error.
        **ffield_kwargs: forwarded to get_ffield_matches().

    Returns:
        list of GeneLabDataSet objects that were created successfully.

    Raises:
        GeneLabJSONException: if the search response cannot be fetched or
            does not have the hits -> hits structure.
    """
    url_components = [
        API_ROOT+"/data/search/?term=GLDS", "type=cgene", "size="+str(maxcount)
    ]
    url_components.extend(
        "ffield={}&fvalue={}".format(ffield, quote_plus(ffvalue))
        for ffield, ffvalue
        in get_ffield_matches(verbose=verbose, **ffield_kwargs)
    )
    url = "&".join(url_components)
    try:
        hits = get_json(url, verbose=verbose)["hits"]["hits"]
    except Exception as e:
        # `Exception` rather than a bare except: KeyboardInterrupt and
        # SystemExit must not be reported as a JSON problem
        raise GeneLabJSONException("Unrecognized JSON structure") from e
    datasets = []
    for hit in hits:
        # looked up inside the try so that a hit without "_id" follows the
        # `onerror` policy instead of failing again inside the handler
        accession = None
        try:
            accession = hit["_id"]
            datasets.append(
                GeneLabDataSet(
                    accession, storage_prefix=storage, verbose=verbose
                )
            )
        except Exception as e:
            if onerror == "ignore":
                continue
            if onerror == "warn":
                msgmask = "Warning: Could not process {} due to error:"
                print(msgmask.format(accession), e, file=stderr)
            else:
                raise
    return datasets
コード例 #5
0
 def _get_file_url(self, filemask):
     """Get URL of file defined by file mask (such as *SRR1781971_*)"""
     regex_filemask = filemask.split("/")[0].replace("*", ".*")
     matching_names = {
         filename for filename in self.glds_file_urls.keys()
         if search(regex_filemask, filename)
     }
     if len(matching_names) == 0:
         return None
     elif len(matching_names) > 1:
         raise GeneLabJSONException("Multiple file URLs match name")
     else:
         return self.glds_file_urls[matching_names.pop()]
コード例 #6
0
 def __init__(self, parent, json, glds_file_urls, glds_file_dates, storage_prefix, index_by, name_delim):
     """Populate dictionary of assay_name -> Assay()

     Every (assay_name, assay_json) item of `json` is turned into an Assay
     built with the remaining arguments and stored under `assay_name`.

     Raises:
         GeneLabJSONException: if a KeyError occurs while the assays are
             being built.
     """
     try:
         for assay_name, assay_json in json.items():
             super().__setitem__(
                 assay_name,
                 Assay(
                     parent, assay_name, assay_json, index_by=index_by,
                     name_delim=name_delim, storage_prefix=storage_prefix,
                     glds_file_urls=glds_file_urls,
                     glds_file_dates=glds_file_dates
                 )
             )
     except KeyError as e:
         # This method never sets `accession` on the dispatcher itself, so
         # reading it unconditionally could raise AttributeError and hide
         # the intended error; fall back to the accession of `parent`.
         accession = getattr(
             self, "accession", getattr(parent, "accession", None)
         )
         raise GeneLabJSONException(
             "Malformed assay JSON ({})".format(accession)
         ) from e
コード例 #7
0
 def annotation(self, differential_annotation=True, named_only=True, index_by="Sample Name", cls=None, continuous="infer"):
     """Build the sample annotation table from the parent's samples JSON.

     The samples section is the only one available, or otherwise the one
     whose key equals this assay's name with a leading "a" replaced by "s".

     Args:
         differential_annotation: keep only entries whose values are not
             all identical across samples.
         named_only: keep only fields that are listed in the section header.
         index_by: title of the entry whose values label the samples.
         cls: if truthy, the table is passed to to_cls() with this target.
         continuous: forwarded to to_cls().

     Returns:
         The table with one row per sample, or the result of to_cls() on
         that table when `cls` is given.

     Raises:
         GeneLabJSONException: if the chosen samples key does not exist.
     """
     all_samples = self.parent.samples
     candidate_keys = set(all_samples.keys())
     if len(candidate_keys) != 1:
         samples_key = sub(r'^a', "s", self.name)
     else:
         samples_key = candidate_keys.pop()
     if samples_key not in all_samples:
         raise GeneLabJSONException(
             "Could not find an unambiguous samples key"
         )
     section = all_samples[samples_key]
     # one column per sample record, one row per field:
     table = concat(list(map(Series, section["raw"])), axis=1)
     field2title = {}
     for entry in section["header"]:
         field, title = entry["field"], entry["title"]
         field2title[field] = title

     def title_of(field):
         return field2title.get(field, field)

     def varies_across_samples(row):
         return len(set(row.values)) > 1

     def unify_delimiters(label):
         return sub(r'[._-]', self._name_delim, label)

     if named_only:
         named_fields = [
             field for field in table.index if field in field2title
         ]
         table = table.loc[named_fields]
     table.index = table.index.map(title_of)
     if differential_annotation:
         table = table[table.apply(varies_across_samples, axis=1)]
     # relabel the sample columns with the values of the `index_by` row:
     table = table.T.set_index(index_by).T
     if self._name_delim != DELIM_AS_IS:
         table.columns = table.columns.map(unify_delimiters)
     table.columns.name = index_by
     per_sample = table.T
     if not cls:
         return per_sample
     return to_cls(per_sample, target=cls, continuous=continuous)