Example #1
    def export_single_dataset(self):
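        """Export one dataset to JSON, JSON-LD, and Turtle under a
        timestamped dump directory and update its LATEST and
        LATEST_PARTIAL symlinks."""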
        intr = cur.Integrator(
            self.export_source_path)  # FIXME implicit state set by cli
        dump_path = self.export_base / 'datasets' / intr.id / self.folder_timestamp
        latest_path = self.export_base / 'datasets' / intr.id / 'LATEST'
        latest_partial_path = self.export_base / 'datasets' / intr.id / 'LATEST_PARTIAL'
        if not dump_path.exists():
            dump_path.mkdir(parents=True)

        functions = []
        suffixes = []
        modes = []
        blob_data = intr.data_for_export(
            self.timestamp)  # build and cache the data
        epipe = pipes.IrToExportJsonPipeline(blob_data)
        blob_export = epipe.data
        blob_jsonld = self._dataset_export_jsonld(blob_export)

        # always dump the json
        j = lambda f: json.dump(
            blob_export, f, sort_keys=True, indent=2, cls=JEncode)
        functions.append(j)
        suffixes.append('.json')
        modes.append('wt')

        # always dump the jsonld
        j = lambda f: json.dump(
            blob_jsonld, f, sort_keys=True, indent=2, cls=JEncode)
        functions.append(j)
        suffixes.append('.jsonld')
        modes.append('wt')

        # always dump the ttl (for single datasets this is probably ok)
        t = lambda f: f.write(ex.TriplesExportDataset(blob_data).ttl)
        functions.append(t)
        suffixes.append('.ttl')
        modes.append('wb')

        filename = 'curation-export'
        filepath = dump_path / filename

        for function, suffix, mode in zip(functions, suffixes, modes):
            out = filepath.with_suffix(suffix)
            with open(out, mode) as f:
                function(f)

            if suffix == '.json':
                symlink_latest(dump_path, latest_partial_path)

            elif suffix in ('.jsonld', '.ttl'):
                loge.info(f'dataset graph exported to {out}')

            if self.open_when_done:
                out.xopen(self.open_when_done)

        symlink_latest(dump_path, latest_path)
        return blob_data, intr
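
The symlink_latest helper used throughout these excerpts is not shown. A minimal sketch of what it is assumed to do (repoint a LATEST-style symlink at the newest dump directory); the names and details here are assumptions, not the sparcur implementation:

# Hypothetical sketch of symlink_latest; the real helper may also handle
# relative links or atomic swaps.
from pathlib import Path

def symlink_latest(dump_path: Path, link_path: Path) -> None:
    """Point link_path at dump_path, replacing any existing symlink."""
    if link_path.is_symlink() or link_path.exists():
        link_path.unlink()
    link_path.symlink_to(dump_path)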
Example #2
    def export_protcur(self, dump_path, *hypothesis_groups, no_network=False):
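        """Export protcur annotations from the given hypothes.is groups
        to protcur.json and protcur.ttl under dump_path, updating the
        LATEST and LATEST_PARTIAL symlinks."""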
        # FIXME no_network passed in here is dumb
        #if (self.latest and  # FIXME NOTE this only points to the latest integrated release
        #self.latest_protcur_path.exists()):
        #blob_protcur = self.latest_protocols
        #else:

        pipeline = pipes.ProtcurPipeline(*hypothesis_groups,
                                         no_network=no_network)
        # FIXME NOTE this does not do the identifier expansion pass
        protcur = pipeline.data
        context = {
            **sc.base_context,
            **sc.protcur_context,
        }
        for f in ('meta', 'subjects', 'samples', 'contributors'):
            context.pop(f)  # FIXME HACK meta @graph for datasets

        ontology_header = {  # FIXME should probably not be added here since it is obscure ...
            '@id': 'https://cassava.ucsd.edu/sparc/ontologies/protcur.ttl',
            '@type': 'owl:Ontology',
        }

        protcur.append(ontology_header)

        blob_protcur = {  # FIXME this should not be defined here so confusing that it is not with the pipeline ...
            '@context': context,
            'meta': {
                'count': len(protcur)
            },  # FIXME adjust to structure
            'prov': {
                'timestamp_export_start': self.timestamp,
                'export_system_identifier': Path.sysid,
                'export_hostname': gethostname(),
            },
            '@graph': protcur,  # FIXME regularize elements ?
        }

        dump_path.mkdir(parents=True, exist_ok=True)
        # FIXME TODO make these latest paths accessible
        # probably by splitting protcur export out into
        # its own class
        latest_path = dump_path.parent / 'LATEST'
        latest_partial_path = dump_path.parent / 'LATEST_PARTIAL'
        fn = dump_path / 'protcur.json'
        with open(fn, 'wt') as f:
            json.dump(blob_protcur, f, sort_keys=True, indent=2, cls=JEncode)

        symlink_latest(dump_path, latest_partial_path)

        g = populateFromJsonLd(OntGraph(), fn).write(fn.with_suffix('.ttl'))

        symlink_latest(dump_path, latest_path)

        return blob_protcur
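
The JSON-LD to Turtle step above goes through pyontutils (populateFromJsonLd and OntGraph). A rough plain-rdflib equivalent, shown only to illustrate the conversion (assumes rdflib >= 6, which parses JSON-LD natively):

# Sketch only: load a JSON-LD file into an rdflib graph and write Turtle.
from rdflib import Graph

def jsonld_to_ttl(jsonld_path):
    g = Graph()
    g.parse(str(jsonld_path), format='json-ld')
    g.serialize(destination=str(jsonld_path.with_suffix('.ttl')),
                format='turtle')
    return g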
Example #3
    def export_datasets(self):
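        """Export all datasets: write curation-export.json and .ttl plus
        identifier metadata, protocol, rdf, xml, and disco exports into a
        timestamped dump directory, updating the LATEST symlinks."""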
        # start time not end time ...
        # obviously not transactional ...
        filename = 'curation-export'
        dump_path = self.export_base / self.folder_timestamp
        if not dump_path.exists():
            dump_path.mkdir(parents=True)

        symlink_latest(dump_path, self.LATEST_RUN)

        filepath = dump_path / filename

        # data
        summary = cur.Summary(
            self.export_source_path)  # FIXME implicit state set by cli
        blob_data = (self.latest_export if self.latest else
                     summary.data_for_export(self.timestamp))

        # FIXME we still create a new export folder every time even if the json didn't change ...
        with open(filepath.with_suffix('.json'), 'wt') as f:
            json.dump(blob_data, f, sort_keys=True, indent=2, cls=JEncode)

        previous_latest = self.latest_datasets_path.resolve()
        symlink_latest(dump_path, self.LATEST_PARTIAL)

        dataset_blobs = blob_data['datasets']

        # identifier metadata
        blob_id_met = self.export_identifier_metadata(dump_path, dataset_blobs)
        teim = self.export_identifier_rdf(dump_path, blob_id_met)

        # protocol
        blob_protocol = self.export_protocols(dump_path, dataset_blobs,
                                              summary)

        # rdf
        teds = self.export_rdf(dump_path, previous_latest, dataset_blobs)
        tes = ex.TriplesExportSummary(blob_data, teds=teds + [teim])

        with open(filepath.with_suffix('.ttl'), 'wb') as f:
            f.write(tes.ttl)

        # xml
        self.export_xml(filepath, dataset_blobs)

        # disco
        self.export_disco(filepath, dataset_blobs, teds)

        symlink_latest(dump_path, self.LATEST)
        return summary
Example #4
    def export(self, *args, **kwargs):
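        """Generic export driver: dump the internal representation as
        JSON, then derived formats, updating the LATEST_RUN,
        LATEST_PARTIAL, and LATEST symlinks along the way."""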
        dump_path = self.dump_path
        # make the dump directory
        self.make_dump_path(dump_path)
        symlink_latest(dump_path, self.LATEST_RUN)

        filepath_json = self.filepath_json
        # build or load the export of the internal representation
        blob_ir, *rest_ir = self.make_ir(**kwargs)
        blob_export_json = self.make_export_json(blob_ir)
        self.write_json(filepath_json, blob_export_json)
        symlink_latest(dump_path, self.LATEST_PARTIAL)

        # build or load derived exports
        self.export_other_formats(dump_path, filepath_json, blob_ir,
                                  blob_export_json, *rest_ir)
        symlink_latest(dump_path, self.LATEST)

        return (blob_ir, *rest_ir)  # FIXME :/
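
Example #4 factors the per-format logic of Examples #1-#3 into overridable hooks (make_dump_path, make_ir, make_export_json, write_json, export_other_formats). A minimal sketch of a class that satisfies those hooks; the hook names come from the excerpt above, everything else here is assumed:

# Hypothetical minimal implementation of the hooks called by export().
# The real classes presumably also provide dump_path, filepath_json,
# and the LATEST* attributes used by export().
import json

class MinimalExport:
    def make_dump_path(self, dump_path):
        dump_path.mkdir(parents=True, exist_ok=True)

    def make_ir(self, **kwargs):
        # build the internal representation; return a tuple so that the
        # blob_ir, *rest_ir unpacking in export() works
        return ({'datasets': []},)

    def make_export_json(self, blob_ir):
        return blob_ir  # identity transform in this sketch

    def write_json(self, filepath_json, blob_export_json):
        with open(filepath_json, 'wt') as f:
            json.dump(blob_export_json, f, sort_keys=True, indent=2)

    def export_other_formats(self, dump_path, filepath_json, blob_ir,
                             blob_export_json, *rest_ir):
        pass  # ttl/xml/disco style exports would hook in here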
Example #5
    def export_protcur(
            self,
            dump_path,
            *hypothesis_groups,
            rerun_protcur_export=False,
            # FIXME no_network passed in here is dumb
            no_network=False,
            # FIXME direct= is a hack
            direct=False):
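        """Export protcur annotations, reusing the latest export when no
        annotations have been updated since it was produced (unless
        rerun_protcur_export is set)."""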
        if not direct and self.export_base != self.export_protcur_base:
            # workaround to set the correct export base path
            nargs = {**self._args}
            nargs['export_base'] = self.export_protcur_base
            export = ExportProtcur(**nargs)
            return export.export_protcur(export.dump_path,
                                         *hypothesis_groups,
                                         no_network=no_network), export

        pipeline = pipes.ProtcurPipeline(*hypothesis_groups,
                                         no_network=no_network)
        annos = pipeline.load()
        if not annos:
            msg = ('No annos. Did you remember to run\n'
                   'python -m sparcur.simple.fetch_annotations')
            raise ValueError(msg)

        if self.latest_export_path.exists():
            # FIXME this only points to the latest integrated release
            # which is not what we want, we need the latest protcur to be independent
            #self.latest and
            blob_protcur = self.latest_export
            t_lex = blob_protcur['prov']['timestamp_export_start']
            t_lup = max(a.updated for a in annos).replace('+00:00', 'Z')
            new_annos_here = t_lex < t_lup  # <= is pretty much impossible
            if not (new_annos_here or rerun_protcur_export):
                return blob_protcur

        # FIXME NOTE this does not do the identifier expansion pass
        protcur = pipeline._make_blob(annos=annos)
        context = {
            **sc.base_context,
            **sc.protcur_context,
        }
        for f in ('meta', 'subjects', 'samples', 'contributors'):
            # subjects samples and contributors no longer included in context directly
            if f in context:
                context.pop(f)  # FIXME HACK meta @graph for datasets

        ontology_header = {  # FIXME should probably not be added here since it is obscure ...
            '@id': 'https://cassava.ucsd.edu/sparc/ontologies/protcur.ttl',
            '@type': 'owl:Ontology',
        }

        protcur.append(ontology_header)

        blob_protcur = {  # FIXME this should not be defined here so confusing that it is not with the pipeline ...
            '@context': context,
            'meta': {
                'count': len(protcur)
            },  # FIXME adjust to structure
            'prov': {
                'timestamp_export_start': self.timestamp,
                'export_system_identifier': Path.sysid,
                'export_hostname': gethostname(),
            },
            '@graph': protcur,  # FIXME regularize elements ?
        }

        dump_path.mkdir(parents=True, exist_ok=True)
        # FIXME TODO make these latest paths accessible
        # probably by splitting protcur export out into
        # its own class
        latest_path = dump_path.parent / 'LATEST'
        latest_partial_path = dump_path.parent / 'LATEST_PARTIAL'
        fn = dump_path / 'protcur.json'
        with open(fn, 'wt') as f:
            json.dump(blob_protcur, f, sort_keys=True, indent=2, cls=JEncode)

        symlink_latest(dump_path, latest_partial_path)

        g = populateFromJsonLd(OntGraph(), fn).write(fn.with_suffix('.ttl'))

        symlink_latest(dump_path, latest_path)

        return blob_protcur
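
The freshness gate in Example #5 compares ISO 8601 UTC timestamps as strings, which works once both use the same layout and 'Z' suffix. A standalone illustration with made-up values:

# Illustration only: fixed-format UTC timestamps compare correctly as strings.
t_last_export = '2023-01-05T10:00:00Z'
t_last_update = '2023-01-06T09:30:00+00:00'.replace('+00:00', 'Z')
new_annos_here = t_last_export < t_last_update  # True, so re-export is needed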
Example #6
    def export_single_dataset(self):
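        """Export one dataset and its transitive path metadata to JSON,
        JSON-LD, and Turtle under a timestamped dump directory and update
        its LATEST and LATEST_PARTIAL symlinks."""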
        intr = cur.Integrator(
            self.export_source_path)  # FIXME implicit state set by cli
        id = intr.path.cache.identifier.uuid
        dump_path = self.export_path / 'datasets' / id / self.folder_timestamp
        latest_path = self.export_path / 'datasets' / id / 'LATEST'
        latest_partial_path = self.export_path / 'datasets' / id / 'LATEST_PARTIAL'
        if not dump_path.exists():
            dump_path.mkdir(parents=True)

        def jdump(blob, f):
            json.dump(blob, f, sort_keys=True, indent=2, cls=JEncode)

        # path metadata
        blob_path_transitive_metadata = pipes.PathTransitiveMetadataPipeline(
            self.export_source_path, None, None).data  # FIXME timestamp etc.
        # FIXME need top level object not just an array
        with open(dump_path / 'path-metadata.json', 'wt') as f:
            # TODO mongo
            jdump(blob_path_transitive_metadata, f)

        # TODO a converter that doesn't care about higher level structure
        #blob_ptm_jsonld = pipes.IrToExportJsonPipeline(blob_path_transitive_metadata).data
        #breakpoint()

        # TODO ttl conversion

        blob_data = intr.data_for_export(
            self.timestamp)  # build and cache the data
        epipe = pipes.IrToExportJsonPipeline(blob_data)
        blob_export = epipe.data
        blob_jsonld = self._dataset_export_jsonld(blob_export)

        functions = []
        suffixes = []
        modes = []

        # always dump the json
        j = lambda f: jdump(blob_export, f)
        functions.append(j)
        suffixes.append('.json')
        modes.append('wt')

        # always dump the jsonld
        j = lambda f: jdump(blob_jsonld, f)
        functions.append(j)
        suffixes.append('.jsonld')
        modes.append('wt')

        # always dump the ttl (for single datasets this is probably ok)
        t = lambda f: f.write(ex.TriplesExportDataset(blob_data).ttl)
        functions.append(t)
        suffixes.append('.ttl')
        modes.append('wb')

        filename = 'curation-export'
        filepath = dump_path / filename

        for function, suffix, mode in zip(functions, suffixes, modes):
            out = filepath.with_suffix(suffix)
            with open(out, mode) as f:
                function(f)

            if suffix == '.json':
                symlink_latest(dump_path, latest_partial_path)

            elif suffix in ('.jsonld', '.ttl'):
                loge.info(f'dataset graph exported to {out}')

            if self.open_when_done:
                out.xopen(self.open_when_done)

        symlink_latest(dump_path, latest_path)
        return blob_data, intr, dump_path, latest_path
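
Examples #1 and #6 drive serialization from three parallel lists (functions, suffixes, modes). The same dispatch can be expressed as a single table of (suffix, mode, writer) tuples; a standalone sketch with dummy writers, not the sparcur code:

# Sketch of the dispatch-table pattern used above, with placeholder writers.
import json

def write_all(filepath, blob):
    writers = [
        ('.json', 'wt', lambda f: json.dump(blob, f, sort_keys=True, indent=2)),
        ('.txt', 'wt', lambda f: f.write(repr(blob))),
    ]
    for suffix, mode, write in writers:
        out = filepath.with_suffix(suffix)
        with open(out, mode) as f:
            write(f)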