def export_single_dataset(self):
    """Export a single dataset to json, jsonld, and ttl under a
    timestamped dump directory."""
    intr = cur.Integrator(
        self.export_source_path)  # FIXME implicit state set by cli
    dump_path = (
        self.export_base / 'datasets' / intr.id / self.folder_timestamp)
    latest_path = self.export_base / 'datasets' / intr.id / 'LATEST'
    latest_partial_path = (
        self.export_base / 'datasets' / intr.id / 'LATEST_PARTIAL')
    if not dump_path.exists():
        dump_path.mkdir(parents=True)

    functions = []
    suffixes = []
    modes = []

    blob_data = intr.data_for_export(
        self.timestamp)  # build and cache the data
    epipe = pipes.IrToExportJsonPipeline(blob_data)
    blob_export = epipe.data
    blob_jsonld = self._dataset_export_jsonld(blob_export)

    # always dump the json
    j = lambda f: json.dump(
        blob_export, f, sort_keys=True, indent=2, cls=JEncode)
    functions.append(j)
    suffixes.append('.json')
    modes.append('wt')

    # always dump the jsonld
    j = lambda f: json.dump(
        blob_jsonld, f, sort_keys=True, indent=2, cls=JEncode)
    functions.append(j)
    suffixes.append('.jsonld')
    modes.append('wt')

    # always dump the ttl (for single datasets this is probably ok)
    t = lambda f: f.write(ex.TriplesExportDataset(blob_data).ttl)
    functions.append(t)
    suffixes.append('.ttl')
    modes.append('wb')

    filename = 'curation-export'
    filepath = dump_path / filename

    for function, suffix, mode in zip(functions, suffixes, modes):
        out = filepath.with_suffix(suffix)
        with open(out, mode) as f:
            function(f)

        if suffix == '.json':
            symlink_latest(dump_path, latest_partial_path)
        elif suffix == '.jsonld':
            loge.info(f'dataset graph exported to {out}')
        elif suffix == '.ttl':
            loge.info(f'dataset graph exported to {out}')
            if self.open_when_done:
                out.xopen(self.open_when_done)

    symlink_latest(dump_path, latest_path)
    return blob_data, intr
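# The `cls=JEncode` arguments above rely on a json.JSONEncoder subclass whose
# `default` hook falls back to `str` for values json cannot serialize natively
# (paths, identifiers, timestamps). JEncode itself is defined elsewhere in
# sparcur; the class below is only a hypothetical sketch of that shape, using
# the `json` module this file already imports.
class _JEncodeSketch(json.JSONEncoder):
    def default(self, obj):
        # invoked only for objects the base encoder cannot serialize
        return str(obj)  # e.g. a pathlib.Path or a custom identifier class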
def export_protcur(self, dump_path, *hypothesis_groups, no_network=False):
    # FIXME no_network passed in here is dumb
    #if (self.latest and  # FIXME NOTE this only points to the latest integrated release
        #self.latest_protcur_path.exists()):
        #blob_protcur = self.latest_protocols
    #else:
    pipeline = pipes.ProtcurPipeline(*hypothesis_groups,
                                     no_network=no_network)
    # FIXME NOTE this does not do the identifier expansion pass
    protcur = pipeline.data
    context = {
        **sc.base_context,
        **sc.protcur_context,
    }
    for f in ('meta', 'subjects', 'samples', 'contributors'):
        context.pop(f)  # FIXME HACK meta @graph for datasets

    ontology_header = {  # FIXME should probably not be added here since it is obscure ...
        '@id': 'https://cassava.ucsd.edu/sparc/ontologies/protcur.ttl',
        '@type': 'owl:Ontology',
    }
    protcur.append(ontology_header)

    blob_protcur = {
        # FIXME this should not be defined here
        # so confusing that it is not with the pipeline ...
        '@context': context,
        'meta': {'count': len(protcur)},  # FIXME adjust to structure
        'prov': {
            'timestamp_export_start': self.timestamp,
            'export_system_identifier': Path.sysid,
            'export_hostname': gethostname(),
        },
        '@graph': protcur,  # FIXME regularize elements ?
    }

    dump_path.mkdir(parents=True, exist_ok=True)
    # FIXME TODO make these latest paths accessible
    # probably by splitting protcur export out into its own class
    latest_path = dump_path.parent / 'LATEST'
    latest_partial_path = dump_path.parent / 'LATEST_PARTIAL'
    fn = dump_path / 'protcur.json'
    with open(fn, 'wt') as f:
        json.dump(blob_protcur, f, sort_keys=True, indent=2, cls=JEncode)

    symlink_latest(dump_path, latest_partial_path)
    g = populateFromJsonLd(OntGraph(), fn).write(fn.with_suffix('.ttl'))
    symlink_latest(dump_path, latest_path)
    return blob_protcur
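# symlink_latest above maintains the convention that a stable name (LATEST,
# LATEST_PARTIAL, LATEST_RUN) always points at the newest timestamped dump
# directory. Its real definition lives elsewhere in sparcur; the sketch below
# only illustrates the usual create-then-rename idiom for swapping a symlink
# atomically, with hypothetical names.
def _symlink_latest_sketch(dump_path, link_path):
    tmp = link_path.with_name(link_path.name + '.tmp')
    if tmp.is_symlink():
        tmp.unlink()                # clear a leftover from a crashed run
    tmp.symlink_to(dump_path)       # create the new link under a temp name
    tmp.replace(link_path)          # atomic rename over the old link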
def export_datasets(self):
    """Export all datasets plus identifier metadata, protocol, rdf,
    xml, and disco artifacts."""
    # start time not end time ...
    # obviously not transactional ...
    filename = 'curation-export'
    dump_path = self.export_base / self.folder_timestamp
    if not dump_path.exists():
        dump_path.mkdir(parents=True)

    symlink_latest(dump_path, self.LATEST_RUN)

    filepath = dump_path / filename

    # data
    summary = cur.Summary(
        self.export_source_path)  # FIXME implicit state set by cli
    blob_data = (self.latest_export if self.latest
                 else summary.data_for_export(self.timestamp))

    # FIXME we still create a new export folder every time
    # even if the json didn't change ...
    with open(filepath.with_suffix('.json'), 'wt') as f:
        json.dump(blob_data, f, sort_keys=True, indent=2, cls=JEncode)

    previous_latest = self.latest_datasets_path.resolve()
    symlink_latest(dump_path, self.LATEST_PARTIAL)

    dataset_blobs = blob_data['datasets']

    # identifier metadata
    blob_id_met = self.export_identifier_metadata(dump_path, dataset_blobs)
    teim = self.export_identifier_rdf(dump_path, blob_id_met)

    # protocol
    blob_protocol = self.export_protocols(dump_path, dataset_blobs, summary)

    # rdf
    teds = self.export_rdf(dump_path, previous_latest, dataset_blobs)
    tes = ex.TriplesExportSummary(blob_data, teds=teds + [teim])
    with open(filepath.with_suffix('.ttl'), 'wb') as f:
        f.write(tes.ttl)

    # xml
    self.export_xml(filepath, dataset_blobs)

    # disco
    self.export_disco(filepath, dataset_blobs, teds)

    symlink_latest(dump_path, self.LATEST)
    return summary
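# Illustration of how a reader of these exports might resolve the freshest
# usable dump given the three checkpoints above: LATEST moves only after every
# artifact (json, ttl, xml, disco) is written, LATEST_PARTIAL as soon as the
# json lands, and LATEST_RUN as soon as the run starts. Everything below other
# than those three names is hypothetical, not part of the actual API.
def _freshest_export_sketch(export_base):
    for name in ('LATEST', 'LATEST_PARTIAL'):
        candidate = export_base / name
        if candidate.exists():          # exists() follows the symlink
            return candidate.resolve()  # the timestamped dump directory
    return None  # no complete or partial export yet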
def export(self, *args, **kwargs):
    dump_path = self.dump_path
    # make the dump directory
    self.make_dump_path(dump_path)
    symlink_latest(dump_path, self.LATEST_RUN)

    filepath_json = self.filepath_json

    # build or load the export of the internal representation
    blob_ir, *rest_ir = self.make_ir(**kwargs)
    blob_export_json = self.make_export_json(blob_ir)
    self.write_json(filepath_json, blob_export_json)
    symlink_latest(dump_path, self.LATEST_PARTIAL)

    # build or load derived exports
    self.export_other_formats(dump_path, filepath_json, blob_ir,
                              blob_export_json, *rest_ir)
    symlink_latest(dump_path, self.LATEST)

    return (blob_ir, *rest_ir)  # FIXME :/
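# The generic export above is a template method: subclasses supply make_ir,
# make_export_json, and export_other_formats while the base class owns the
# dump-path and LATEST symlink bookkeeping. A self-contained toy sketch of
# that shape (all names hypothetical, not the real sparcur classes):
class _TemplateExportSketch:
    def export(self, **kwargs):
        blob_ir, *rest_ir = self.make_ir(**kwargs)
        blob_export_json = self.make_export_json(blob_ir)
        self.export_other_formats(blob_ir, blob_export_json, *rest_ir)
        return (blob_ir, *rest_ir)

    # hook points a concrete exporter overrides
    def make_ir(self, **kwargs):
        raise NotImplementedError  # must return an iterable, ir first

    def make_export_json(self, blob_ir):
        raise NotImplementedError  # shape the ir for json serialization

    def export_other_formats(self, blob_ir, blob_export_json, *rest_ir):
        pass  # optional derived formats, e.g. ttl or xml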
def export_protcur(self, dump_path,
                   *hypothesis_groups,
                   rerun_protcur_export=False,
                   # FIXME no_network passed in here is dumb
                   no_network=False,
                   # FIXME direct= is a hack
                   direct=False):
    """Export protcur annotations to json and ttl, reusing the previous
    export when no new annotations have arrived."""
    if not direct and self.export_base != self.export_protcur_base:
        # workaround to set the correct export base path
        nargs = {**self._args}
        nargs['export_base'] = self.export_protcur_base
        export = ExportProtcur(**nargs)
        return export.export_protcur(export.dump_path, *hypothesis_groups,
                                     no_network=no_network), export

    pipeline = pipes.ProtcurPipeline(*hypothesis_groups,
                                     no_network=no_network)
    annos = pipeline.load()
    if not annos:
        msg = ('No annos. Did you remember to run\n'
               'python -m sparcur.simple.fetch_annotations')
        raise ValueError(msg)

    if self.latest_export_path.exists():
        # FIXME this only points to the latest integrated release
        # which is not what we want, we need the latest protcur
        # to be independent
        #self.latest and
        blob_protcur = self.latest_export
        t_lex = blob_protcur['prov']['timestamp_export_start']
        t_lup = max(a.updated for a in annos).replace('+00:00', 'Z')
        new_annos_here = t_lex < t_lup  # <= is pretty much impossible
        if not (new_annos_here or rerun_protcur_export):
            return blob_protcur

    # FIXME NOTE this does not do the identifier expansion pass
    protcur = pipeline._make_blob(annos=annos)
    context = {
        **sc.base_context,
        **sc.protcur_context,
    }
    for f in ('meta', 'subjects', 'samples', 'contributors'):
        # subjects, samples, and contributors are no longer
        # included in the context directly
        if f in context:
            context.pop(f)  # FIXME HACK meta @graph for datasets

    ontology_header = {  # FIXME should probably not be added here since it is obscure ...
        '@id': 'https://cassava.ucsd.edu/sparc/ontologies/protcur.ttl',
        '@type': 'owl:Ontology',
    }
    protcur.append(ontology_header)

    blob_protcur = {
        # FIXME this should not be defined here
        # so confusing that it is not with the pipeline ...
        '@context': context,
        'meta': {'count': len(protcur)},  # FIXME adjust to structure
        'prov': {
            'timestamp_export_start': self.timestamp,
            'export_system_identifier': Path.sysid,
            'export_hostname': gethostname(),
        },
        '@graph': protcur,  # FIXME regularize elements ?
    }

    dump_path.mkdir(parents=True, exist_ok=True)
    # FIXME TODO make these latest paths accessible
    # probably by splitting protcur export out into its own class
    latest_path = dump_path.parent / 'LATEST'
    latest_partial_path = dump_path.parent / 'LATEST_PARTIAL'
    fn = dump_path / 'protcur.json'
    with open(fn, 'wt') as f:
        json.dump(blob_protcur, f, sort_keys=True, indent=2, cls=JEncode)

    symlink_latest(dump_path, latest_partial_path)
    g = populateFromJsonLd(OntGraph(), fn).write(fn.with_suffix('.ttl'))
    symlink_latest(dump_path, latest_path)
    return blob_protcur
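# The t_lex < t_lup comparison above is plain string comparison; it is sound
# because both sides are ISO 8601 UTC timestamps normalized to the same fixed
# 'Z'-suffixed format, where lexicographic order coincides with temporal
# order. A quick self-contained demonstration with made-up values:
def _timestamp_order_demo():
    t_lex = '2021-03-01T10:00:00Z'
    t_lup = '2021-03-02T09:59:59+00:00'.replace('+00:00', 'Z')
    assert t_lex < t_lup  # the later timestamp also sorts later as a string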
def export_single_dataset(self):
    """Export a single dataset: path metadata plus json, jsonld, and
    ttl under a timestamped dump directory."""
    intr = cur.Integrator(
        self.export_source_path)  # FIXME implicit state set by cli
    id = intr.path.cache.identifier.uuid  # NOTE shadows the id builtin
    dump_path = self.export_path / 'datasets' / id / self.folder_timestamp
    latest_path = self.export_path / 'datasets' / id / 'LATEST'
    latest_partial_path = (
        self.export_path / 'datasets' / id / 'LATEST_PARTIAL')
    if not dump_path.exists():
        dump_path.mkdir(parents=True)

    def jdump(blob, f):
        json.dump(blob, f, sort_keys=True, indent=2, cls=JEncode)

    # path metadata
    blob_path_transitive_metadata = pipes.PathTransitiveMetadataPipeline(
        self.export_source_path, None, None).data  # FIXME timestamp etc.
    # FIXME need top level object not just an array
    with open(dump_path / 'path-metadata.json', 'wt') as f:  # TODO mongo
        jdump(blob_path_transitive_metadata, f)
    # TODO a converter that doesn't care about higher level structure
    #blob_ptm_jsonld = pipes.IrToExportJsonPipeline(blob_path_transitive_metadata).data
    #breakpoint()
    # TODO ttl conversion

    blob_data = intr.data_for_export(
        self.timestamp)  # build and cache the data
    epipe = pipes.IrToExportJsonPipeline(blob_data)
    blob_export = epipe.data
    blob_jsonld = self._dataset_export_jsonld(blob_export)

    functions = []
    suffixes = []
    modes = []

    # always dump the json
    j = lambda f: jdump(blob_export, f)
    functions.append(j)
    suffixes.append('.json')
    modes.append('wt')

    # always dump the jsonld
    j = lambda f: jdump(blob_jsonld, f)
    functions.append(j)
    suffixes.append('.jsonld')
    modes.append('wt')

    # always dump the ttl (for single datasets this is probably ok)
    t = lambda f: f.write(ex.TriplesExportDataset(blob_data).ttl)
    functions.append(t)
    suffixes.append('.ttl')
    modes.append('wb')

    filename = 'curation-export'
    filepath = dump_path / filename

    for function, suffix, mode in zip(functions, suffixes, modes):
        out = filepath.with_suffix(suffix)
        with open(out, mode) as f:
            function(f)

        if suffix == '.json':
            symlink_latest(dump_path, latest_partial_path)
        elif suffix == '.jsonld':
            loge.info(f'dataset graph exported to {out}')
        elif suffix == '.ttl':
            loge.info(f'dataset graph exported to {out}')
            if self.open_when_done:
                out.xopen(self.open_when_done)

    symlink_latest(dump_path, latest_path)
    return blob_data, intr, dump_path, latest_path
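# The functions/suffixes/modes triple above is three parallel lists zipped
# back together; the same pattern reads more directly as one list of
# (suffix, mode, writer) tuples. A hypothetical restatement for illustration,
# not a replacement for the method above:
def _write_exports_sketch(filepath, writers):
    """writers: iterable of (suffix, mode, callable_taking_open_file)."""
    for suffix, mode, write in writers:
        out = filepath.with_suffix(suffix)
        with open(out, mode) as f:
            write(f)
        yield out  # caller decides on symlinks and logging per artifact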