def export_identifier_metadata(self, dump_path, dataset_blobs):
    """Export DOI metadata for *dataset_blobs* to ``identifier-metadata.json``.

    Reuses the latest cached export when ``self.latest`` is set and the
    cached file exists; otherwise collects every DOI referenced by the
    datasets, fetches their metadata concurrently, and records failures
    under ``errors``.  The resulting blob is always dumped to
    ``dump_path / 'identifier-metadata.json'`` and returned.
    """
    if self.latest and self.latest_id_met_path.exists():
        blob_id_met = self.latest_id_met
    else:
        def fetch(id):  # FIXME error proof version ...
            # best effort: network failures are logged, not raised,
            # so one bad DOI does not abort the whole export
            try:
                metadata = id.metadata()
                metadata['id'] = id.identifier  # FIXME normalization ...
                return metadata
            except requests.exceptions.HTTPError as e:
                logd.error(e)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.SSLError) as e:
                log.error(e)

        # retrieve doi metadata and materialize it in the dataset
        _dois = {
            idlib.Auto(id) if not isinstance(id, idlib.Stream) else id
            for blob in dataset_blobs
            for id in chain(
                adops.get(blob, ['meta', 'protocol_url_or_doi'], on_failure=[]),
                adops.get(blob, ['meta', 'originating_article_doi'], on_failure=[]),
                # TODO data["links"]?
                # guard: not every dataset has a doi; unguarded access
                # raised KeyError (the newer variant of this method guards too)
                [blob['meta']['doi']] if 'doi' in blob['meta'] else [])
            if id is not None}

        dois = [d for d in _dois if isinstance(d, idlib.Doi)]
        metadatas = Async(rate=10)(deferred(fetch)(d) for d in dois)
        bads = [{'id': d, 'reason': 'no metadata'}  # TODO more granular reporting e.g. 404
                for d, m in zip(dois, metadatas) if m is None]
        metadatas = [m for m in metadatas if m is not None]
        blob_id_met = {
            'id': 'identifier-metadata',  # TODO is this ok ?
            'identifier_metadata': metadatas,
            'errors': bads,
            'meta': {'count': len(metadatas)},
            'prov': {'timestamp_export_start': self.timestamp,
                     'export_system_identifier': Path.sysid,
                     'export_hostname': gethostname(),
                     'export_project_path': self.export_source_path.cache.anchor,},
        }

    with open(dump_path / 'identifier-metadata.json', 'wt') as f:
        json.dump(blob_id_met, f, sort_keys=True, indent=2, cls=JEncode)

    return blob_id_met
def _completeness(self, data):
    """Return a completeness summary tuple for a single dataset blob.

    Tuple: (submission_index, curation_index, error_index, folder_name,
    id, award_number, organ), where organ is normalized to OntTerm,
    a list of OntTerm, the literal 'othertargets', or None when absent.
    """
    accessor = JT(data)  # can go direct if elements are always present
    #organ = accessor.query('meta', 'organ')
    try:
        organ = adops.get(data, ['meta', 'organ'])
    except exc.NoSourcePathError:
        # was a bare ``except:`` which swallowed everything (including
        # KeyboardInterrupt); narrowed to the miss adops.get raises
        organ = None

    if isinstance(organ, (list, tuple)):
        if len(organ) == 1:
            organ, = organ
            organ = OntTerm(organ)
        else:
            organ = [OntTerm(o) for o in organ]
    elif organ == 'othertargets':
        pass  # sentinel string passes through untouched
    elif organ:
        organ = OntTerm(organ)

    return (
        accessor.status.submission_index,
        accessor.status.curation_index,
        accessor.status.error_index,
        #accessor.submission_completeness_index,
        #dataset.name,  # from filename (do we not have that in meta!?)
        accessor.query('meta', 'folder_name'),
        accessor.id,  #if 'id' in dowe else None,
        accessor.query('meta', 'award_number'),
        organ,
    )
def protocol_uris(self, outer_self=self):  # FIXME this needs to be pipelined
    """Yield each protocol url or doi from the outer pipeline's data.

    Yields nothing when the ['meta', 'protocol_url_or_doi'] path is absent.
    """
    try:
        for uri in adops.get(outer_self.data(), ['meta', 'protocol_url_or_doi']):
            yield uri
    except exc.NoSourcePathError:
        pass
def update_from_ir(self, ir):
    """Refresh per-dataset metadata rows from an export internal representation.

    Temporarily swaps OntTerm.query._instrumented to the plain OntTerm
    class for the duration of the update, restoring it afterwards.
    """
    oi = OntTerm.query._instrumented
    if oi is not OntTerm:
        OntTerm.query._instrumented = OntTerm

    try:
        dataset_blobs = ir['datasets']
        self._wat = self.values[8]
        for blob in dataset_blobs:
            # NOTE: an unused local ``meta = blob['meta']`` was removed;
            # adops.get handles the missing-path case via on_failure
            self._update_dataset_metadata(
                id=blob['id'],
                name=adops.get(blob, ['meta', 'folder_name'], on_failure=''),
                award=adops.get(blob, ['meta', 'award_number'], on_failure=''),
            )
    finally:
        # FIXME this is so dumb :/
        OntTerm.query._instrumented = oi
def allOf(obj):
    """Yield the resolved type for every entry of ``obj['allOf']``.

    Known ``$ref`` targets come from ``types``; unknown ones are looked
    up in ``schema`` and wrapped via ``top``.  Entries without a
    ``$ref`` are only logged.
    """
    for entry in obj['allOf']:
        if '$ref' not in entry:
            log.debug(f'{obj}')
            continue

        ref = entry['$ref']
        if ref in types:
            yield types[ref]
        else:
            jpath = ref_to_list(ref)
            resolved = adops.get(schema, jpath)
            yield top(jpath[-1], resolved)
def update_from_ir(self, ir):
    """Refresh per-dataset metadata rows (including species) from an export ir.

    Temporarily swaps OntTerm.query._instrumented to the plain OntTerm
    class, restores it in ``finally``, then logs uncommitted changes and
    commits.
    """
    oi = OntTerm.query._instrumented
    if oi is not OntTerm:
        OntTerm.query._instrumented = OntTerm

    def cformat(cell):
        # render OntTerm values in their cell form; everything else as-is
        if isinstance(cell, OntTerm):
            cell = cell.asCell()
        return cell

    try:
        dataset_blobs = ir['datasets']
        self._wat = self.values[8]
        for blob in dataset_blobs:
            #species = adops.get(blob, ['subjects', int, 'species'], on_failure='')  # TODO not implemented
            if 'subjects' in blob:
                # deduplicated, sorted, newline-joined species column
                species = '\n'.join(sorted(
                    {cformat(s['species'])
                     for s in blob['subjects'] if 'species' in s}))
            else:
                species = ''

            # NOTE: an unused local ``meta = blob['meta']`` was removed;
            # adops.get handles the missing-path case via on_failure
            self._update_dataset_metadata(
                id=blob['id'],
                name=adops.get(blob, ['meta', 'folder_name'], on_failure=''),
                award=adops.get(blob, ['meta', 'award_number'], on_failure=''),
                species=species,
            )
    finally:
        # FIXME this is so dumb :/
        OntTerm.query._instrumented = oi

    log.debug(self.uncommitted())
    self.commit()
def f(self, path=path, terminal=term, term_list=term_list):
    """Accessor bound to *path* at definition time.

    Missing path: returns [] when term_list is set, ``terminal({})``
    when a terminal is given, else [].  Present path: maps terminal
    over the value (list-wise when term_list), or returns it raw.
    """
    try:
        end = adops.get(self._blob, path)  # FIXME / and * would need to be implemented on top of this
    except exc.NoSourcePathError:
        if term_list:
            return []
        if terminal is not None:
            return terminal({})
        # FIXME was return None but that breaks missing keys that should contain lists
        return []

    # FIXME type check
    if terminal and term_list:
        return [terminal(o) for o in end]
    if terminal:
        return terminal(end)
    if term_list:
        raise NotImplementedError('hrm')
    return end
def keywords(self):
    """Yield each keyword from this pipeline's meta, or nothing when absent."""
    try:
        for keyword in adops.get(self.data(), ['meta', 'keywords']):
            yield keyword
    except exc.NoSourcePathError:
        pass
def timestamp_export_start(self):
    """Return the export-start timestamp recorded in this blob's prov section.

    NOTE(review): ``self.data`` is accessed without calling, unlike the
    ``self.data()`` usage in sibling methods -- presumably a property on
    this class; confirm before changing.
    """
    return adops.get(self.data, ['prov', 'timestamp_export_start'])
def deref(ref):
    # resolve a json-schema ``$ref`` string against the enclosing
    # ``schema`` by converting it to a key path and walking that path
    return adops.get(schema, ref_to_list(ref))
def export_identifier_metadata(self, dump_path, latest_path, dataset_blobs):
    """Export identifier (DOI) metadata for *dataset_blobs*.

    When ``self.latest`` is set and a previous export exists under
    *latest_path*, that file is loaded and reused instead of refetching.
    Otherwise every identifier referenced by the datasets is normalized,
    DOIs are fetched concurrently, and failures are recorded under
    ``errors``.  The blob is dumped to ``dump_path / self.id_metadata``
    and returned.
    """
    latest_id_met_path = latest_path / self.id_metadata
    if (self.latest and latest_id_met_path.exists()):
        # reuse the cached previous export verbatim
        with open(latest_id_met_path, 'rt') as f:
            blob_id_met = json.load(f)

    else:
        import requests

        def fetch(id):  # FIXME error proof version ...
            # best effort: remote failures are logged, not raised, so a
            # single bad identifier does not abort the whole export
            try:
                metadata = id.metadata()
                metadata['id'] = id
                return metadata
            except (requests.exceptions.HTTPError, idlib.exc.RemoteError) as e:
                logd.error(e)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.SSLError,
                    idlib.exc.ResolutionError) as e:
                log.error(e)

        def autoid_report_error(id, blob):
            # returns None (and logs) for ids idlib cannot parse
            try:
                return idlib.Auto(id)
            except idlib.exc.MalformedIdentifierError as e:
                msg = f'{blob["id"]} bad id: {id}'
                logd.error(msg)
                return None

        # retrieve doi metadata and materialize it in the dataset
        # normalize to idlib streams: pass streams through, revive dicts
        # via fromJson, parse bare strings via idlib.Auto
        _dois = set([id if isinstance(id, idlib.Stream) else
                     (fromJson(id) if isinstance(id, dict) else
                      autoid_report_error(id, blob))
                     for blob in dataset_blobs
                     for id in chain(
                         adops.get(blob, ['meta', 'protocol_url_or_doi'], on_failure=[]),
                         adops.get(blob, ['meta', 'originating_article_doi'], on_failure=[]),
                         # TODO data["links"]?
                         [blob['meta']['doi']] if 'doi' in blob['meta'] else [])
                     if id is not None])

        # only DOIs are fetched; other identifier kinds are dropped here
        dois = [d for d in _dois if isinstance(d, idlib.Doi)]
        metadatas = Async(rate=10)(deferred(fetch)(d) for d in dois)
        # fetch returns None on failure; report those and keep the rest
        bads = [{'id': d, 'reason': 'no metadata'}  # TODO more granular reporting e.g. 404
                for d, m in zip(dois, metadatas) if m is None]
        metadatas = [m for m in metadatas if m is not None]
        blob_id_met = {'id': 'identifier-metadata',  # TODO is this ok ?
                       'identifier_metadata': metadatas,
                       'errors': bads,
                       'meta': {'count': len(metadatas)},
                       'prov': {'timestamp_export_start': self.timestamp,
                                'export_system_identifier': Path.sysid,
                                'export_hostname': gethostname(),
                                'export_project_path': self.export_source_path.cache.anchor,},
                       }

    with open(dump_path / self.id_metadata, 'wt') as f:
        json.dump(blob_id_met, f, sort_keys=True, indent=2, cls=JEncode)

    return blob_id_met