def export_single_dataset(self):
    intr = cur.Integrator(self.export_source_path)  # FIXME implicit state set by cli
    dump_path = self.export_base / 'datasets' / intr.id / self.folder_timestamp
    latest_path = self.export_base / 'datasets' / intr.id / 'LATEST'
    latest_partial_path = self.export_base / 'datasets' / intr.id / 'LATEST_PARTIAL'
    if not dump_path.exists():
        dump_path.mkdir(parents=True)

    functions = []
    suffixes = []
    modes = []

    blob_data = intr.data_for_export(self.timestamp)  # build and cache the data
    epipe = pipes.IrToExportJsonPipeline(blob_data)
    blob_export = epipe.data
    blob_jsonld = self._dataset_export_jsonld(blob_export)

    # always dump the json
    j = lambda f: json.dump(blob_export, f, sort_keys=True, indent=2, cls=JEncode)
    functions.append(j)
    suffixes.append('.json')
    modes.append('wt')

    # always dump the jsonld
    j = lambda f: json.dump(blob_jsonld, f, sort_keys=True, indent=2, cls=JEncode)
    functions.append(j)
    suffixes.append('.jsonld')
    modes.append('wt')

    # always dump the ttl (for single datasets this is probably ok)
    t = lambda f: f.write(ex.TriplesExportDataset(blob_data).ttl)
    functions.append(t)
    suffixes.append('.ttl')
    modes.append('wb')

    filename = 'curation-export'
    filepath = dump_path / filename

    for function, suffix, mode in zip(functions, suffixes, modes):
        out = filepath.with_suffix(suffix)
        with open(out, mode) as f:
            function(f)

        if suffix == '.json':
            symlink_latest(dump_path, latest_partial_path)
        elif suffix == '.jsonld':
            loge.info(f'dataset graph exported to {out}')
        elif suffix == '.ttl':
            loge.info(f'dataset graph exported to {out}')
            if self.open_when_done:
                out.xopen(self.open_when_done)

    symlink_latest(dump_path, latest_path)
    return blob_data, intr
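# `symlink_latest` is called above but defined elsewhere in the project. Below is a
# minimal sketch of the behavior it is assumed to have (repoint a LATEST-style
# symlink at the newest timestamped dump directory); the name, signature, and error
# handling here are illustrative, not the project's actual implementation.
from pathlib import Path


def symlink_latest_sketch(dump_path: Path, link_path: Path):
    # drop a stale symlink, but refuse to clobber a real file or directory
    if link_path.is_symlink():
        link_path.unlink()
    elif link_path.exists():
        raise FileExistsError(f'{link_path} exists and is not a symlink')

    # point LATEST / LATEST_PARTIAL at the timestamped dump directory
    link_path.symlink_to(dump_path, target_is_directory=True)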
def export_rdf(self, dump_path, latest_path, dataset_blobs):
    dataset_dump_path = dump_path / 'datasets'
    dataset_dump_path.mkdir()
    suffix = '.ttl'
    mode = 'wb'
    teds = []
    for dataset_blob in dataset_blobs:
        filename = dataset_blob['id']
        filepath = dataset_dump_path / filename
        filepsuf = filepath.with_suffix(suffix)
        lfilepath = latest_path / filename
        lfilepsuf = lfilepath.with_suffix(suffix)
        ted = ex.TriplesExportDataset(dataset_blob)
        teds.append(ted)

        if self.latest and lfilepsuf.exists():
            # reuse the previously exported graph instead of re-serializing
            filepsuf.copy_from(lfilepsuf)
            graph = OntGraph(path=lfilepsuf).parse()
            ted._graph = graph
        else:
            ted.graph.write(filepsuf)  # yay OntGraph defaults

        loge.info(f'dataset graph exported to {filepsuf}')

    return teds
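# The `self.latest` branch above reuses an existing LATEST export rather than
# re-serializing the graph. A standalone sketch of that copy-or-write decision using
# only the standard library; `Path.copy_from` and `OntGraph` are project specific,
# so `shutil.copyfile` and a generic `serialize` callable stand in for them here.
import shutil
from pathlib import Path


def copy_latest_or_write_sketch(out_path: Path, latest_path: Path, serialize, reuse_latest: bool):
    if reuse_latest and latest_path.exists():
        # reuse the previously exported file verbatim
        shutil.copyfile(latest_path, out_path)
    else:
        # otherwise serialize fresh content to the dump location
        out_path.write_bytes(serialize())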
def export_rdf(self, dump_path, latest_path, dataset_blobs):
    dataset_dump_path = dump_path / 'datasets'
    dataset_dump_path.mkdir()
    suffix = '.ttl'
    mode = 'wb'

    # detect dataset blobs that share an id so they can be skipped below
    wat = [b['id'] for b in dataset_blobs]
    counts = Counter(wat)
    bads = set(id for id, c in counts.most_common() if c > 1)
    key = lambda d: d['id']
    dupes = sorted([b for b in dataset_blobs if b['id'] in bads], key=key)  # retained for debugging
    if bads:
        loge.critical(bads)  # TODO
        #breakpoint()
        #raise BaseException('NOPE')

    teds = []
    for dataset_blob in dataset_blobs:
        filename = dataset_blob['id']
        if filename in bads:
            loge.critical(filename)
            continue

        filepath = dataset_dump_path / filename
        filepsuf = filepath.with_suffix(suffix)
        lfilepath = latest_path / filename
        lfilepsuf = lfilepath.with_suffix(suffix)
        ted = ex.TriplesExportDataset(dataset_blob)
        teds.append(ted)

        if self.latest and lfilepsuf.exists():
            # reuse the previously exported graph instead of re-serializing
            filepsuf.copy_from(lfilepsuf)
            graph = OntGraph(path=lfilepsuf).parse()
            ted._graph = graph
        else:
            ted.graph.write(filepsuf)  # yay OntGraph defaults

        loge.info(f'dataset graph exported to {filepsuf}')

    return teds
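# The duplicate-id check above (wat / counts / bads / dupes) can be exercised in
# isolation; a small self-contained example of the same Counter-based pattern with
# made-up ids, not real project data.
from collections import Counter

blobs = [{'id': 'N:dataset:a'}, {'id': 'N:dataset:b'}, {'id': 'N:dataset:a'}]
counts = Counter(b['id'] for b in blobs)
bads = {i for i, c in counts.most_common() if c > 1}
dupes = sorted((b for b in blobs if b['id'] in bads), key=lambda b: b['id'])
assert bads == {'N:dataset:a'} and len(dupes) == 2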
def export_single_dataset(self):
    intr = cur.Integrator(self.export_source_path)  # FIXME implicit state set by cli
    id = intr.path.cache.identifier.uuid
    dump_path = self.export_path / 'datasets' / id / self.folder_timestamp
    latest_path = self.export_path / 'datasets' / id / 'LATEST'
    latest_partial_path = self.export_path / 'datasets' / id / 'LATEST_PARTIAL'
    if not dump_path.exists():
        dump_path.mkdir(parents=True)

    def jdump(blob, f):
        json.dump(blob, f, sort_keys=True, indent=2, cls=JEncode)

    # path metadata
    blob_path_transitive_metadata = pipes.PathTransitiveMetadataPipeline(
        self.export_source_path, None, None).data  # FIXME timestamp etc.
    # FIXME need top level object not just an array
    with open(dump_path / 'path-metadata.json', 'wt') as f:  # TODO mongo
        jdump(blob_path_transitive_metadata, f)

    # TODO a converter that doesn't care about higher level structure
    #blob_ptm_jsonld = pipes.IrToExportJsonPipeline(blob_path_transitive_metadata).data
    #breakpoint()
    # TODO ttl conversion

    blob_data = intr.data_for_export(self.timestamp)  # build and cache the data
    epipe = pipes.IrToExportJsonPipeline(blob_data)
    blob_export = epipe.data
    blob_jsonld = self._dataset_export_jsonld(blob_export)

    functions = []
    suffixes = []
    modes = []

    # always dump the json
    j = lambda f: jdump(blob_export, f)
    functions.append(j)
    suffixes.append('.json')
    modes.append('wt')

    # always dump the jsonld
    j = lambda f: jdump(blob_jsonld, f)
    functions.append(j)
    suffixes.append('.jsonld')
    modes.append('wt')

    # always dump the ttl (for single datasets this is probably ok)
    t = lambda f: f.write(ex.TriplesExportDataset(blob_data).ttl)
    functions.append(t)
    suffixes.append('.ttl')
    modes.append('wb')

    filename = 'curation-export'
    filepath = dump_path / filename

    for function, suffix, mode in zip(functions, suffixes, modes):
        out = filepath.with_suffix(suffix)
        with open(out, mode) as f:
            function(f)

        if suffix == '.json':
            symlink_latest(dump_path, latest_partial_path)
        elif suffix == '.jsonld':
            loge.info(f'dataset graph exported to {out}')
        elif suffix == '.ttl':
            loge.info(f'dataset graph exported to {out}')
            if self.open_when_done:
                out.xopen(self.open_when_done)

    symlink_latest(dump_path, latest_path)
    return blob_data, intr, dump_path, latest_path
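# The functions/suffixes/modes lists above form a small writer-dispatch table that
# serializes the same blob several ways. A minimal standalone sketch of that pattern
# under stated assumptions: the tuple layout, file names, and blob content here are
# placeholders for illustration, not the project's actual exports.
import json
from pathlib import Path


def write_exports_sketch(blob, dump_path: Path, filename='curation-export'):
    # (suffix, open mode, writer) triples, analogous to zip(functions, suffixes, modes)
    writers = [
        ('.json', 'wt', lambda f: json.dump(blob, f, sort_keys=True, indent=2)),
        ('.txt', 'wt', lambda f: f.write(repr(blob))),
    ]
    filepath = dump_path / filename
    for suffix, mode, write in writers:
        out = filepath.with_suffix(suffix)
        with open(out, mode) as f:
            write(f)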