def __init__(self, accession, verbose=False, storage_prefix=STORAGE_PREFIX,
             index_by="Sample Name", name_delim=DELIM_DEFAULT):
    """Request JSON representation of ISA metadata and store fields.

    Arguments:
        accession: GLDS accession string (e.g. "GLDS-42")
        verbose: if True, downstream helpers report progress
        storage_prefix: root directory under which this dataset is stored
        index_by: metadata field title used to index assay samples
        name_delim: delimiter that sample-name punctuation is normalized to

    Raises:
        GeneLabJSONException: if the JSON is missing, ambiguous, or malformed
        NotImplementedError: if more than one foreignFields entry is present
    """
    self.accession = accession
    self.verbose = verbose
    self.storage = join(storage_prefix, accession)
    data_json = get_json(
        "{}/data/study/data/{}/".format(API_ROOT, accession), self.verbose
    )
    # exactly one JSON section is expected per GLDS accession:
    if not data_json:
        raise GeneLabJSONException("Invalid JSON (GLDS does not exist?)")
    if len(data_json) > 1:
        raise GeneLabJSONException("Invalid JSON, too many sections")
    self._json = data_json[0]
    try:
        self.internal_id = self._json["_id"]
        self.metadata_id = self._json["metadata_id"]
        if len(self._json["foreignFields"]) != 1:
            raise NotImplementedError("Multiple foreignFields")
        self._isa2json = self._json["foreignFields"][0]["isa2json"]
        self._info = self._isa2json["additionalInformation"]
        for field in "description", "samples", "ontologies", "organisms":
            setattr(self, field, self._info[field])
    except KeyError as e:
        # chain the original KeyError so the missing key stays visible:
        error_message = "Malformed JSON ({})".format(self.accession)
        raise GeneLabJSONException(error_message) from e
    self.assays = AssayDispatcher(
        parent=self, json=self._info["assays"], storage_prefix=self.storage,
        name_delim=name_delim, glds_file_urls=self.get_files_info("urls"),
        index_by=index_by, glds_file_dates=self.get_files_info("dates")
    )
def __init__(self, parent, name, json, glds_file_urls, glds_file_dates,
             storage_prefix, index_by, name_delim):
    """Parse JSON into assay metadata.

    Arguments:
        parent: owning GeneLabDataSet instance
        name: assay name (used as the storage subdirectory)
        json: assay JSON with "raw" records and a "header" field mapping
        glds_file_urls: mapping of filename -> download URL
        glds_file_dates: mapping of filename -> timestamp
        storage_prefix: parent storage directory
        index_by: field title to index the raw metadata by
        name_delim: delimiter that sample-name punctuation is normalized to

    Raises:
        GeneLabJSONException: if header field IDs conflict
        IndexError: if index_by matches zero or multiple field titles
    """
    self.parent, self.name, self._json = parent, name, json
    self.glds_file_urls = glds_file_urls
    self.glds_file_dates = glds_file_dates
    self.storage = join(storage_prefix, name)
    self._raw, self._header = self._json["raw"], self._json["header"]
    # populate and freeze self._fields (this can be refactored...):
    self._field2title = {e["field"]: e["title"] for e in self._header}
    if len(self._field2title) != len(self._header):
        raise GeneLabJSONException("Conflicting IDs of data fields")
    self._fields = defaultdict(set)
    for field, title in self._field2title.items():
        self._fields[title].add(field)
    self._fields = dict(self._fields)
    # populate metadata and index with `index_by`:
    self.raw_metadata = concat(map(Series, self._raw), axis=1).T
    # validate index_by BEFORE resolving it to a field, so a nonexistent or
    # ambiguous value raises the intended IndexError instead of whatever
    # _get_unique_field_from_title raises first:
    maybe_indexed_by = self._match_field_titles(index_by, method=fullmatch)
    if len(maybe_indexed_by) != 1:
        raise IndexError(
            "Nonexistent or ambiguous index_by value: '{}'".format(index_by)
        )
    self._indexed_by = maybe_indexed_by.pop()
    self._field_indexed_by = self._get_unique_field_from_title(index_by)
    self.raw_metadata = self.raw_metadata.set_index(self._field_indexed_by)
    self._name_delim = name_delim
    if name_delim != DELIM_AS_IS:
        # normalize punctuation in sample names to the requested delimiter:
        self.raw_metadata.index = self.raw_metadata.index.map(
            lambda f: sub(r'[._-]', name_delim, f)
        )
    # the index column no longer counts as a queryable field:
    del self._fields[self._indexed_by]
    # initialize indexing functions:
    self.metadata = AssayMetadata(self)
def get_files_info(self, kind="urls"):
    """Get filenames and associated URLs or date stamps.

    Arguments:
        kind: "urls" for filename -> download URL,
              "dates" for filename -> timestamp

    Raises:
        ValueError: if the instance is uninitialized, the accession carries
            no numeric suffix, or `kind` is unrecognized
        GeneLabJSONException: if the files JSON is malformed
    """
    if self.accession is None:
        raise ValueError("Uninitialized GLDS instance")
    elif kind == "urls":
        getter_url = "{}/data/glds/files/{}"
        # the files endpoint is keyed by the numeric part of the accession;
        # guard explicitly instead of letting .group() raise AttributeError:
        acc_match = search(r'\d+$', self.accession)
        if acc_match is None:
            raise ValueError(
                "Malformed accession (no numeric suffix): '{}'".format(
                    self.accession
                )
            )
        files_json = get_json(
            getter_url.format(API_ROOT, acc_match.group()), self.verbose
        )
        try:
            filedata = files_json["studies"][self.accession]["study_files"]
        except KeyError as e:
            # chain the cause so the missing key stays visible:
            raise GeneLabJSONException("Malformed JSON") from e
        return {
            fd["file_name"]: GENELAB_ROOT + fd["remote_url"]
            for fd in filedata
        }
    elif kind == "dates":
        getter_url = "{}/data/study/filelistings/{}"
        filedata = get_json(
            getter_url.format(API_ROOT, self.internal_id), self.verbose
        )
        return {fd["file_name"]: date2stamp(fd) for fd in filedata}
    else:
        raise ValueError("Unrecognized parameter: '{}'".format(kind))
def get_datasets(maxcount="25", storage=STORAGE_PREFIX, verbose=False,
                 onerror="warn", **ffield_kwargs):
    """Match passed regexes and combine into search URL, get JSON and parse for accessions.

    Arguments:
        maxcount: maximum number of search hits to request
        storage: local storage prefix passed to each GeneLabDataSet
        verbose: if True, helpers report progress
        onerror: "ignore" to skip failing datasets silently, "warn" to print
            a warning to stderr, anything else to re-raise
        **ffield_kwargs: regexes forwarded to get_ffield_matches

    Raises:
        GeneLabJSONException: if the search JSON lacks the expected structure
    """
    url_lead_components = [
        API_ROOT + "/data/search/?term=GLDS", "type=cgene",
        "size=" + str(maxcount)
    ]
    url_ffield_components = [
        "ffield={}&fvalue={}".format(ffield, quote_plus(ffvalue))
        for ffield, ffvalue
        in get_ffield_matches(verbose=verbose, **ffield_kwargs)
    ]
    url = "&".join(url_lead_components + url_ffield_components)
    try:
        json = get_json(url, verbose=verbose)["hits"]["hits"]
    except Exception as e:
        # narrow from a bare `except:` (which also swallowed SystemExit and
        # KeyboardInterrupt) and chain the cause for debuggability:
        raise GeneLabJSONException("Unrecognized JSON structure") from e
    datasets = []
    for hit in json:
        try:
            datasets.append(
                GeneLabDataSet(
                    hit["_id"], storage_prefix=storage, verbose=verbose
                )
            )
        except Exception as e:
            if onerror == "ignore":
                pass
            elif onerror == "warn":
                msgmask = "Warning: Could not process {} due to error:"
                print(msgmask.format(hit["_id"]), e, file=stderr)
            else:
                raise
    return datasets
def _get_file_url(self, filemask): """Get URL of file defined by file mask (such as *SRR1781971_*)""" regex_filemask = filemask.split("/")[0].replace("*", ".*") matching_names = { filename for filename in self.glds_file_urls.keys() if search(regex_filemask, filename) } if len(matching_names) == 0: return None elif len(matching_names) > 1: raise GeneLabJSONException("Multiple file URLs match name") else: return self.glds_file_urls[matching_names.pop()]
def __init__(self, parent, json, glds_file_urls, glds_file_dates,
             storage_prefix, index_by, name_delim):
    """Populate dictionary of assay_name -> Assay().

    Arguments:
        parent: owning GeneLabDataSet instance
        json: mapping of assay name -> assay JSON
        glds_file_urls: mapping of filename -> download URL
        glds_file_dates: mapping of filename -> timestamp
        storage_prefix: parent storage directory
        index_by: field title to index each assay's metadata by
        name_delim: delimiter that sample-name punctuation is normalized to

    Raises:
        GeneLabJSONException: if any assay JSON is malformed
    """
    try:
        for assay_name, assay_json in json.items():
            super().__setitem__(
                assay_name,
                Assay(
                    parent, assay_name, assay_json, index_by=index_by,
                    name_delim=name_delim, storage_prefix=storage_prefix,
                    glds_file_urls=glds_file_urls,
                    glds_file_dates=glds_file_dates
                )
            )
    except KeyError as e:
        # the accession belongs to the parent dataset; this dispatcher never
        # sets self.accession, so the original self.accession reference would
        # itself raise AttributeError inside the handler and mask the error:
        raise GeneLabJSONException(
            "Malformed assay JSON ({})".format(parent.accession)
        ) from e
def annotation(self, differential_annotation=True, named_only=True,
               index_by="Sample Name", cls=None, continuous="infer"):
    """Get annotation of samples: entries that differ (default) or all entries

    Arguments:
        differential_annotation: if True, keep only annotation rows whose
            values differ between at least two samples
        named_only: if True, keep only fields that have a title in the
            samples header
        index_by: field title whose values become the sample (column) index
        cls: if truthy, export via to_cls() with this target instead of
            returning a DataFrame
        continuous: forwarded to to_cls() when cls is given
    """
    # resolve which samples section corresponds to this assay; if there is
    # exactly one, use it, otherwise derive the key from the assay name by
    # replacing a leading "a" with "s" (assay -> samples naming convention
    # — NOTE(review): assumed from the substitution; confirm against data):
    samples_keys = set(self.parent.samples.keys())
    if len(samples_keys) == 1:
        samples_key = samples_keys.pop()
    else:
        samples_key = sub(r'^a', "s", self.name)
        if samples_key not in self.parent.samples:
            error_message = "Could not find an unambiguous samples key"
            raise GeneLabJSONException(error_message)
    # one column per raw sample record; rows are raw field IDs:
    annotation_dataframe = concat([
        Series(raw_sample_annotation) for raw_sample_annotation
        in self.parent.samples[samples_key]["raw"]
    ], axis=1)
    # map raw field IDs to human-readable titles via the samples header:
    samples_field2title = {
        entry["field"]: entry["title"]
        for entry in self.parent.samples[samples_key]["header"]
    }
    if named_only:
        # drop rows whose field ID has no title in the header:
        index_subset = [
            field for field in annotation_dataframe.index
            if field in samples_field2title
        ]
        annotation_dataframe = annotation_dataframe.loc[index_subset]
    # rename remaining rows to their titles (untitled rows keep field IDs):
    annotation_dataframe.index = annotation_dataframe.index.map(
        lambda field: samples_field2title.get(field, field)
    )
    if differential_annotation:
        # keep only rows with more than one distinct value across samples:
        differential_rows = annotation_dataframe.apply(
            lambda r: len(set(r.values)) > 1, axis=1
        )
        annotation_dataframe = annotation_dataframe[differential_rows]
    # promote the index_by row's values to column labels:
    annotation_dataframe = annotation_dataframe.T.set_index(index_by).T
    if self._name_delim != DELIM_AS_IS:
        # normalize punctuation in sample names to the configured delimiter:
        annotation_dataframe.columns = annotation_dataframe.columns.map(
            lambda f: sub(r'[._-]', self._name_delim, f)
        )
    annotation_dataframe.columns.name = index_by
    if cls:
        # export samples-by-fields table in CLS form:
        return to_cls(
            annotation_dataframe.T, target=cls, continuous=continuous
        )
    else:
        # transpose so rows are samples and columns are annotation fields:
        return annotation_dataframe.T