def _ingest_raw_isa(self, data, status_kwargs):
    """Unpack a ZIP archive held in memory and delegate to top-level parsers.

    Archive members whose base name looks like `i_*.txt`, `s_*.txt` or
    `a_*.txt` are handed to `self._read_investigation` (for `i`) or
    `self._read_tab` (for `s` and `a`); every other member is ignored.

    Returns a SimpleNamespace with attributes `investigation`, `studies`
    and `assays`; the latter two are dicts keyed by the part of the file
    name between the `s_` / `a_` prefix and the `.txt` suffix.

    Raises GeneFabISAException if any of the three attributes ends up falsy.
    """
    raw = SimpleNamespace(investigation=None, studies={}, assays={})
    with ZipFile(BytesIO(data)) as archive:
        for member in archive.namelist():
            # Only the base name decides the kind; directories are ignored
            basename = path.split(member)[1]
            match = search(r'^([isa])_(.+)\.txt$', basename)
            if not match:
                continue
            kind, name = match.groups()
            with archive.open(member) as handle:
                if kind == "i":
                    raw.investigation = self._read_investigation(handle)
                elif kind == "s":
                    raw.studies[name] = self._read_tab(handle, status_kwargs)
                else:  # the pattern only admits "i", "s", "a"
                    raw.assays[name] = self._read_tab(handle, status_kwargs)
    for tab, value in vars(raw).items():
        if not value:
            raise GeneFabISAException(
                "Missing ISA tab", tab=tab,
                **copy_except(status_kwargs, "collection"),
            )
    return raw
def __init__(self, raw_tabs, status_kwargs):
    """Convert tables to nested JSONs, one appended entry per usable row.

    Every row of every table in `raw_tabs` must carry a 'Sample Name'.
    Rows whose sample name is null are skipped with a warning sent to
    `update_status`; all other rows are converted with `self._row_to_json`
    and appended to this container. When `self._self_identifier` is
    "Study", entries are additionally indexed in `self._by_sample_name`.

    Raises GeneFabISAException when a row has no 'Sample Name', when it has
    several differing 'Sample Name' values, or (Study only) when a sample
    name is encountered more than once.
    """
    if self._self_identifier == "Study":
        self._by_sample_name = {}
    else:
        # lookup in classes like AssayEntries would be ambiguous
        self._by_sample_name = defaultdict(self._abort_lookup)
    for tab_name, table in raw_tabs.items():
        for _, record in table.iterrows():
            if "Sample Name" not in record:
                raise GeneFabISAException(
                    f"{self._self_identifier} entry missing 'Sample Name'",
                    **copy_except(status_kwargs, "collection"),
                )
            sample_name = record["Sample Name"]
            if isinstance(sample_name, Series):
                # Indexing returned a Series rather than a scalar: several
                # values share the 'Sample Name' label and must all agree
                if len(set(sample_name)) > 1:
                    _m = "entry has multiple 'Sample Name' values"
                    raise GeneFabISAException(
                        f"{self._self_identifier} {_m}",
                        **copy_except(status_kwargs, "collection"),
                    )
                sample_name = sample_name.iloc[0]
            if isnull(sample_name):
                update_status(
                    **status_kwargs, status="warning",
                    warning="Null 'Sample Name'", tab=self._self_identifier,
                )
                continue
            entry = self._row_to_json(
                record, tab_name,
                {**status_kwargs, "sample_name": sample_name},
            )
            super().append(entry)
            if self._self_identifier != "Study":
                continue
            if sample_name in self._by_sample_name:
                context = copy_except(status_kwargs, "collection")
                raise GeneFabISAException(
                    "Duplicate 'Sample Name' in Study tab",
                    **dict(sample_name=sample_name, **context),
                )
            self._by_sample_name[sample_name] = entry
def _INPLACE_extend_with_dataset_files(self):
    """Populate with File annotation for files that match records for the sample.

    A file from `self.dataset.files` is included when it is flagged
    "internal" or its name is among the terminal leaf elements of `self`,
    and, if its descriptor has a "condition" callable, that callable
    returns a truthy value for `(self, filename)`.

    The result is stored under `self["File"]` as a list of copies of the
    file descriptors (without the "condition" key) extended with a
    "filename" key. Entries follow the iteration order of
    `self.dataset.files`, so the output is deterministic between runs.
    """
    isa_elements = set(iterate_terminal_leaf_elements(self))
    dataset_files = self.dataset.files

    def _unconditional(*_):
        # Stand-in for descriptors that carry no "condition" callable
        return True

    # The mapping's keys are already unique, so no intermediate set is
    # needed; filtering items() directly also avoids arbitrary set ordering.
    self["File"] = [
        {**copy_except(filedata, "condition"), "filename": filename}
        for filename, filedata in dataset_files.items()
        if (filedata.get("internal") or (filename in isa_elements))
        and filedata.get("condition", _unconditional)(self, filename)
    ]
def _row_to_json(self, row, name, status_kwargs):
    """Convert single row of table to nested JSON.

    Columns are processed in order. A "Protocol REF" column updates the
    protocol reference passed along to subsequently added fields; a
    non-qualifier column adds a new top-level or `field[subfield]` entry
    and becomes the target for subsequent qualifier columns; a qualifier
    column is attached to the most recently added entry.

    Columns for which `self._parse_field` yields no field are skipped
    with a warning sent to `update_status`.

    Returns the assembled dict, which always contains an "Id" entry
    holding the tab name.

    Raises GeneFabISAException if a qualifier column appears before any
    qualifiable field has been added.
    """
    json = {"Id": {f"{self._self_identifier} Name": name}}
    protocol_ref, qualifiable = nan, None
    for column, value in row.items():
        field, subfield, _ = self._parse_field(column)
        if field is None:
            update_status(
                **status_kwargs, status="warning",
                tab=self._self_identifier, field=repr(column),
                warning="ISA field is not a string",
            )
        elif field == "Protocol REF":
            protocol_ref = value
        elif self._is_not_qualifier(field):  # top-level field
            if not subfield:  # e.g. "Source Name"
                qualifiable = self._INPLACE_add_toplevel_field(
                    json, field, value, protocol_ref,
                )
            else:  # e.g. "Characteristics[Age]"
                qualifiable = self._INPLACE_add_metadatalike(
                    json, field, subfield, value, protocol_ref,
                    status_kwargs,
                )
        elif qualifiable is None:
            # Report the offending field name, not the cell value
            msg = "Qualifier before main field"
            _kw = copy_except(status_kwargs, "collection")
            raise GeneFabISAException(msg, field=field, **_kw)
        else:  # qualify entry at pointer with second-level field
            self._INPLACE_qualify(
                qualifiable, field, subfield, value,
                status_kwargs={**status_kwargs, "name": name},
            )
    return json
def __init__(self, raw_investigation, status_kwargs):
    """Convert dataframes to JSONs and store them under their real names.

    Each `(real_name, isatools_name, target, pattern)` entry of
    `self._key_dispatcher` whose `isatools_name` is present in
    `raw_investigation` is converted with `self._jsonify` and stored in
    one of three ways:

    * `target` and `pattern` both integers: the converted value must have
      length `pattern`, and its element `target` is stored;
    * `target` and `pattern` both truthy otherwise: entries are stored as
      a dict keyed by the first group of `pattern` matched against
      `entry[target]`;
    * anything else: the converted value is stored as is.

    Raises GeneFabISAException when the converted value does not have the
    structure expected by the first two cases.
    """
    for real_name, isatools_name, target, pattern in self._key_dispatcher:
        if isatools_name not in raw_investigation:
            continue
        content = raw_investigation[isatools_name]
        jsonify_kwargs = dict(coerce_comments=True, status_kwargs=status_kwargs)
        if isinstance(content, list):
            json = [self._jsonify(df, **jsonify_kwargs) for df in content]
        else:
            json = self._jsonify(content, **jsonify_kwargs)
        # Unwrap a list whose only element is itself a list
        is_singleton = isinstance(json, list) and (len(json) == 1)
        if is_singleton and isinstance(json[0], list):
            json = json[0]
        if isinstance(target, int) and isinstance(pattern, int):
            try:
                if len(json) != pattern:
                    # Funnel a length mismatch into the handler below
                    raise IndexError
                super().__setitem__(real_name, json[target])
            except (TypeError, IndexError, KeyError):
                raise GeneFabISAException(
                    "Unexpected structure of field", field=real_name,
                    **copy_except(status_kwargs, "collection"),
                )
        elif target and pattern:
            try:
                by_name = {}
                for entry in json:
                    key = search(pattern, entry[target]).group(1)
                    by_name[key] = entry
                super().__setitem__(real_name, by_name)
            except (TypeError, AttributeError, IndexError, KeyError):
                raise GeneFabISAException(
                    "Could not break up field by name", field=real_name,
                    **copy_except(status_kwargs, "collection"),
                )
        else:
            super().__setitem__(real_name, json)
def _INPLACE_add_metadatalike(self, json, field, subfield, value, protocol_ref, status_kwargs):
    """Add metadatalike to json (e.g. 'Characteristics' -> 'Age'),
    qualify with 'Protocol REF', point to resulting field.

    Mutates `json` in place: creates `json[field]` if it is absent and
    stores `{"": value}` under `json[field][subfield]`. For the field
    "Parameter Value" the new entry also records `protocol_ref` under
    "Protocol REF".

    Returns the newly created entry (the same dict object that was stored
    in `json`), so the caller can attach qualifiers to it.

    Raises GeneFabISAException if `subfield` is already present under
    `json[field]`.
    """
    container = json.setdefault(field, {})
    if subfield in container:
        raise GeneFabISAException(
            "Duplicate field[subfield]", field=field, subfield=subfield,
            **copy_except(status_kwargs, "collection"),
        )
    # make {"Characteristics": {"Age": {"": "36"}}}
    qualifiable = {"": value}
    container[subfield] = qualifiable
    if field == "Parameter Value":
        qualifiable["Protocol REF"] = protocol_ref
    return qualifiable