Example #1
    def export_single_dataset(self):
        intr = cur.Integrator(
            self.export_source_path)  # FIXME implicit state set by cli
        base = self.export_base / 'datasets' / intr.id
        dump_path = base / self.folder_timestamp
        latest_path = base / 'LATEST'
        latest_partial_path = base / 'LATEST_PARTIAL'
        dump_path.mkdir(parents=True, exist_ok=True)

        functions = []
        suffixes = []
        modes = []
        blob_data = intr.data_for_export(
            self.timestamp)  # build and cache the data
        epipe = pipes.IrToExportJsonPipeline(blob_data)
        blob_export = epipe.data
        blob_jsonld = self._dataset_export_jsonld(blob_export)

        # always dump the json
        j = lambda f: json.dump(
            blob_export, f, sort_keys=True, indent=2, cls=JEncode)
        functions.append(j)
        suffixes.append('.json')
        modes.append('wt')

        # always dump the jsonld
        j = lambda f: json.dump(
            blob_jsonld, f, sort_keys=True, indent=2, cls=JEncode)
        functions.append(j)
        suffixes.append('.jsonld')
        modes.append('wt')

        # always dump the ttl (for single datasets this is probably ok)
        t = lambda f: f.write(ex.TriplesExportDataset(blob_data).ttl)
        functions.append(t)
        suffixes.append('.ttl')
        modes.append('wb')

        filename = 'curation-export'
        filepath = dump_path / filename

        for function, suffix, mode in zip(functions, suffixes, modes):
            out = filepath.with_suffix(suffix)
            with open(out, mode) as f:
                function(f)

            if suffix == '.json':
                # once the json lands the dump is usable even if the
                # remaining serializations fail, hence LATEST_PARTIAL
                symlink_latest(dump_path, latest_partial_path)

            elif suffix in ('.jsonld', '.ttl'):
                loge.info(f'dataset graph exported to {out}')

            if self.open_when_done:
                out.xopen(self.open_when_done)

        symlink_latest(dump_path, latest_path)
        return blob_data, intr
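symlink_latest itself is not shown in these examples; a minimal sketch of the pattern its call sites imply (repointing the LATEST link at the newest dump directory without leaving a window where the link is missing) could look like the following, assuming the helper takes the same two arguments used above:

    import os

    def symlink_latest(dump_path, latest_path):
        # build the replacement link under a temporary name, then rename it
        # over the old one so readers never observe a missing LATEST
        tmp = latest_path.with_name(latest_path.name + '.tmp')
        if tmp.is_symlink():
            tmp.unlink()
        tmp.symlink_to(dump_path, target_is_directory=True)
        os.replace(tmp, latest_path)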
Example #2
    def export_rdf(self, dump_path, latest_path, dataset_blobs):
        dataset_dump_path = dump_path / 'datasets'
        dataset_dump_path.mkdir()
        suffix = '.ttl'

        teds = []
        for dataset_blob in dataset_blobs:
            filename = dataset_blob['id']
            filepath = dataset_dump_path / filename
            filepsuf = filepath.with_suffix(suffix)
            lfilepath = latest_path / filename
            lfilepsuf = lfilepath.with_suffix(suffix)

            ted = ex.TriplesExportDataset(dataset_blob)
            teds.append(ted)

            if self.latest and lfilepsuf.exists():
                # reuse the serialization from the latest export instead of
                # regenerating it, and load the graph back for the return value
                filepsuf.copy_from(lfilepsuf)
                graph = OntGraph(path=lfilepsuf).parse()
                ted._graph = graph
            else:
                ted.graph.write(filepsuf)  # yay OntGraph defaults

            loge.info(f'dataset graph exported to {filepsuf}')

        return teds
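Path.copy_from is not part of the classic pathlib API, so it presumably comes from the path library these classes build on; a rough standard-library equivalent of that one call, assuming it simply copies file contents and metadata, would be:

    import shutil

    # copy the previously exported serialization into the new dump
    shutil.copy2(lfilepsuf, filepsuf)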
Example #3
    def export_rdf(self, dump_path, latest_path, dataset_blobs):
        dataset_dump_path = dump_path / 'datasets'
        dataset_dump_path.mkdir()
        suffix = '.ttl'

        # ids that appear more than once would overwrite each other on export
        counts = Counter(b['id'] for b in dataset_blobs)
        bads = set(i for i, c in counts.most_common() if c > 1)
        if bads:
            dupes = sorted([b for b in dataset_blobs if b['id'] in bads],
                           key=lambda b: b['id'])  # kept for inspecting the collisions
            loge.critical(bads)
            # TODO decide how duplicate ids should be handled
            #breakpoint()
            #raise BaseException('NOPE')

        teds = []
        for dataset_blob in dataset_blobs:
            filename = dataset_blob['id']
            if filename in bads:
                loge.critical(filename)
                continue  # skip datasets whose ids collide
            filepath = dataset_dump_path / filename
            filepsuf = filepath.with_suffix(suffix)
            lfilepath = latest_path / filename
            lfilepsuf = lfilepath.with_suffix(suffix)

            ted = ex.TriplesExportDataset(dataset_blob)
            teds.append(ted)

            if self.latest and lfilepsuf.exists():
                # reuse the serialization from the latest export instead of
                # regenerating it, and load the graph back for the return value
                filepsuf.copy_from(lfilepsuf)
                graph = OntGraph(path=lfilepsuf).parse()
                ted._graph = graph
            else:
                ted.graph.write(filepsuf)  # yay OntGraph defaults

            loge.info(f'dataset graph exported to {filepsuf}')

        return teds
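The duplicate check at the top of Example #3 stands on its own; a condensed sketch of the same logic, assuming blobs are dicts carrying an 'id' key:

    from collections import Counter

    def duplicate_ids(blobs):
        # ids seen more than once cannot be exported without clobbering files
        counts = Counter(b['id'] for b in blobs)
        return {i for i, c in counts.items() if c > 1}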
Example #4
    def export_single_dataset(self):
        intr = cur.Integrator(
            self.export_source_path)  # FIXME implicit state set by cli
        dataset_id = intr.path.cache.identifier.uuid  # avoid shadowing builtin id
        base = self.export_path / 'datasets' / dataset_id
        dump_path = base / self.folder_timestamp
        latest_path = base / 'LATEST'
        latest_partial_path = base / 'LATEST_PARTIAL'
        dump_path.mkdir(parents=True, exist_ok=True)

        def jdump(blob, f):
            json.dump(blob, f, sort_keys=True, indent=2, cls=JEncode)

        # path metadata
        blob_path_transitive_metadata = pipes.PathTransitiveMetadataPipeline(
            self.export_source_path, None, None).data  # FIXME timestamp etc.
        # FIXME need top level object not just an array
        with open(dump_path / 'path-metadata.json', 'wt') as f:
            # TODO mongo
            jdump(blob_path_transitive_metadata, f)

        # TODO a converter that doesn't care about higher level structure
        #blob_ptm_jsonld = pipes.IrToExportJsonPipeline(blob_path_transitive_metadata).data
        #breakpoint()

        # TODO ttl conversion

        blob_data = intr.data_for_export(
            self.timestamp)  # build and cache the data
        epipe = pipes.IrToExportJsonPipeline(blob_data)
        blob_export = epipe.data
        blob_jsonld = self._dataset_export_jsonld(blob_export)

        functions = []
        suffixes = []
        modes = []

        # always dump the json
        j = lambda f: jdump(blob_export, f)
        functions.append(j)
        suffixes.append('.json')
        modes.append('wt')

        # always dump the jsonld
        j = lambda f: jdump(blob_jsonld, f)
        functions.append(j)
        suffixes.append('.jsonld')
        modes.append('wt')

        # always dump the ttl (for single datasets this is probably ok)
        t = lambda f: f.write(ex.TriplesExportDataset(blob_data).ttl)
        functions.append(t)
        suffixes.append('.ttl')
        modes.append('wb')

        filename = 'curation-export'
        filepath = dump_path / filename

        for function, suffix, mode in zip(functions, suffixes, modes):
            out = filepath.with_suffix(suffix)
            with open(out, mode) as f:
                function(f)

            if suffix == '.json':
                # once the json lands the dump is usable even if the
                # remaining serializations fail, hence LATEST_PARTIAL
                symlink_latest(dump_path, latest_partial_path)

            elif suffix in ('.jsonld', '.ttl'):
                loge.info(f'dataset graph exported to {out}')

            if self.open_when_done:
                out.xopen(self.open_when_done)

        symlink_latest(dump_path, latest_path)
        return blob_data, intr, dump_path, latest_path
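The three parallel lists in Examples #1 and #4 can drift out of sync if a format is added to one list but not the others; a sketch of the same dispatch as a single list of (writer, suffix, mode) tuples, reusing the names defined in Example #4:

    outputs = [
        (lambda f: jdump(blob_export, f), '.json', 'wt'),
        (lambda f: jdump(blob_jsonld, f), '.jsonld', 'wt'),
        (lambda f: f.write(ex.TriplesExportDataset(blob_data).ttl), '.ttl', 'wb'),
    ]
    for function, suffix, mode in outputs:
        out = filepath.with_suffix(suffix)
        with open(out, mode) as f:
            function(f)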