Example #1
    def _createProvenance(self, result):
        # IProvenanceData is the project's provenance adapter (a module-level import)
        provdata = IProvenanceData(result)
        from rdflib import URIRef, Literal, Namespace, Graph
        from rdflib.namespace import RDF, RDFS, FOAF, DCTERMS, XSD
        from rdflib.resource import Resource
        from datetime import datetime
        from plone import api
        PROV = Namespace(u"http://www.w3.org/ns/prov#")
        BCCVL = Namespace(u"http://ns.bccvl.org.au/")
        LOCAL = Namespace(u"urn:bccvl:")
        graph = Graph()
        # the user is our agent

        member = api.user.get_current()
        username = member.getProperty('fullname') or member.getId()
        user = Resource(graph, LOCAL['user'])
        user.add(RDF['type'], PROV['Agent'])
        user.add(RDF['type'], FOAF['Person'])
        user.add(FOAF['name'], Literal(username))
        user.add(FOAF['mbox'],
                 URIRef('mailto:{}'.format(member.getProperty('email'))))
        # add software as agent
        software = Resource(graph, LOCAL['software'])
        software.add(RDF['type'], PROV['Agent'])
        software.add(RDF['type'], PROV['SoftwareAgent'])
        software.add(FOAF['name'], Literal('BCCVL ALA Importer'))
        # script content is stored somewhere on result and will be exported with zip?
        #   ... or store along with pstats.json ? hidden from user

        # -> execenvironment after import -> log output?
        # -> source code ... maybe some link expression? stored on result ? separate entity?
        activity = Resource(graph, LOCAL['activity'])
        activity.add(RDF['type'], PROV['Activity'])
        # TODO: this is really the queued/created time for this activity; the real
        #       start time could be captured on the 'running' status update (or at transfer start)
        now = datetime.now().replace(microsecond=0)
        activity.add(PROV['startedAtTime'],
                     Literal(now.isoformat(), datatype=XSD['dateTime']))
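        # note: standard PROV-O spells this property prov:wasAssociatedWith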
        activity.add(PROV['hasAssociationWith'], user)
        activity.add(PROV['hasAssociationWith'], software)
        # add job parameters to activity

        provdata.data = graph.serialize(format="turtle")
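
The method stores its output as Turtle via the IProvenanceData adapter. To see what actually lands in the graph, the serialization can be parsed back and queried with rdflib. A minimal standalone sketch (the name and timestamp are invented):

from rdflib import Graph, Namespace
from rdflib.namespace import FOAF

PROV = Namespace(u"http://www.w3.org/ns/prov#")
LOCAL = Namespace(u"urn:bccvl:")

# stand-in for provdata.data
turtle = u"""
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<urn:bccvl:user> a prov:Agent, foaf:Person ;
    foaf:name "Jane Example" .
<urn:bccvl:activity> a prov:Activity ;
    prov:startedAtTime "2015-06-01T10:00:00"^^xsd:dateTime .
"""

graph = Graph()
graph.parse(data=turtle, format='turtle')
print(graph.value(LOCAL['activity'], PROV['startedAtTime']))  # 2015-06-01T10:00:00
print(graph.value(LOCAL['user'], FOAF['name']))               # Jane Example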
Example #4
    def __iter__(self):
        """missing docstring."""
        for item in self.previous:
            # check if we have a dataset

            if item['_type'] not in ('org.bccvl.content.dataset',
                                     'org.bccvl.content.remotedataset'):
                # not a dataset
                yield item
                continue

            pathkey = self.pathkey(*item.keys())[0]
            # no path .. can't do anything
            if not pathkey:
                yield item
                continue

            path = item[pathkey]
            # Skip the Plone site object itself
            if not path:
                yield item
                continue

            obj = self.context.unrestrictedTraverse(
                path.encode().lstrip('/'), None)

            # FIXME: this is really not a great way to check where to find provenance data
            # check if we are inside an experiment (which means we are importing a result)
            if IExperiment.providedBy(self.context.__parent__):
                # result import
                context = self.context
            else:
                # dataset import?
                context = obj

            # TODO: do some sanity checks
            provdata = IProvenanceData(context)
            PROV = Namespace(u"http://www.w3.org/ns/prov#")
            BCCVL = Namespace(u"http://ns.bccvl.org.au/")
            LOCAL = Namespace(u"urn:bccvl:")
            graph = Graph()
            graph.parse(data=provdata.data or '', format='turtle')
            activity = Resource(graph, LOCAL['activity'])
            # FIXME: shouldn't I use uuid instead of id?
            entity = Resource(graph, LOCAL[obj.id])
            # create this dataset as new entity -> output of activity
            entity.add(RDF['type'], PROV['Entity'])
            # generated by
            entity.add(PROV['wasGeneratedBy'], activity)
            # PROV['prov:wasAttributedTo'] to user and software?
            # File metadata
            entity.add(DCTERMS['creator'], Literal(obj.Creator()))
            entity.add(DCTERMS['title'], Literal(obj.title))
            entity.add(DCTERMS['description'], Literal(obj.description))
            entity.add(DCTERMS['rights'], Literal(obj.rights))
            if obj.portal_type == 'org.bccvl.content.dataset':
                entity.add(DCTERMS['format'], Literal(obj.file.contentType))
            else:
                # FIXME: this doesn't seem to do the right thing
                entity.add(DCTERMS['format'], Literal(obj.format))
            # TODO: add metadata about file?
            #    genre, layers, emsc, gcm, year

            # set the activity's end time
            #   first one wins
            if activity.value(PROV['endedAtTime']) is None:
                activity.add(
                    PROV['endedAtTime'],
                    Literal(datetime.now().replace(microsecond=0).isoformat(),
                            datatype=XSD['dateTime']))

            # TODO: extend activity metadata with execution environment data
            #       (logfile import?, pstats import) .. and script + params.json file
            # ALA import url
            pd = item.get('_ala_provenance', {})
            if pd:
                entity.add(BCCVL['download_url'], Literal(pd['url']))

            # store prov data
            provdata.data = graph.serialize(format="turtle")

            yield item
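
The endedAtTime guard works because rdflib's Resource.value returns None when no matching triple exists, so only the first dataset to reach this point stamps the end time; later datasets leave it untouched. A standalone sketch of this first-one-wins pattern, using the same namespaces:

from datetime import datetime
from rdflib import Graph, Literal, Namespace
from rdflib.namespace import XSD
from rdflib.resource import Resource

PROV = Namespace(u"http://www.w3.org/ns/prov#")
LOCAL = Namespace(u"urn:bccvl:")

graph = Graph()
activity = Resource(graph, LOCAL['activity'])

def stamp_end_time(activity):
    # first one wins: later calls find an existing value and do nothing
    if activity.value(PROV['endedAtTime']) is None:
        activity.add(PROV['endedAtTime'],
                     Literal(datetime.now().replace(microsecond=0).isoformat(),
                             datatype=XSD['dateTime']))

stamp_end_time(activity)
stamp_end_time(activity)  # no effect
assert len(list(activity.objects(PROV['endedAtTime']))) == 1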
Example #5
    def _createProvenance(self, result):
        provdata = IProvenanceData(result)
        from rdflib import URIRef, Literal, Namespace, Graph
        from rdflib.namespace import RDF, RDFS, FOAF, DCTERMS, XSD
        from rdflib.resource import Resource
        from datetime import datetime
        from plone import api
        # uuidToCatalogBrain (plone.app.uuid.utils), IBCCVLMetadata and
        # IProvenanceData are assumed to be module-level imports
        PROV = Namespace(u"http://www.w3.org/ns/prov#")
        BCCVL = Namespace(u"http://ns.bccvl.org.au/")
        LOCAL = Namespace(u"urn:bccvl:")
        graph = Graph()
        # the user is our agent

        member = api.user.get_current()
        username = member.getProperty('fullname') or member.getId()
        user = Resource(graph, LOCAL['user'])
        user.add(RDF['type'], PROV['Agent'])
        user.add(RDF['type'], FOAF['Person'])
        user.add(FOAF['name'], Literal(username))
        user.add(FOAF['mbox'],
                 URIRef('mailto:{}'.format(member.getProperty('email'))))
        # add software as agent
        software = Resource(graph, LOCAL['software'])
        software.add(RDF['type'], PROV['Agent'])
        software.add(RDF['type'], PROV['SoftwareAgent'])
        software.add(FOAF['name'], Literal('BCCVL Job Script'))
        # script content is stored somewhere on result and will be exported with zip?
        #   ... or store along with pstats.json ? hidden from user

        # -> execenvironment after import -> log output?
        # -> source code ... maybe some link expression? stored on result ? separate entity?
        activity = Resource(graph, LOCAL['activity'])
        activity.add(RDF['type'], PROV['Activity'])
        # TODO: this is really the queued/created time for this activity; the real
        #       start time could be captured on the 'running' status update (or at transfer start)
        now = datetime.now().replace(microsecond=0)
        activity.add(PROV['startedAtTime'],
                     Literal(now.isoformat(), datatype=XSD['dateTime']))
        activity.add(PROV['hasAssociationWith'], user)
        activity.add(PROV['hasAssociationWith'], software)
        # add job parameters to activity
        for idx, (key, value) in enumerate(result.job_params.items()):
            param = Resource(graph, LOCAL[u'param_{}'.format(idx)])
            activity.add(BCCVL['algoparam'], param)
            param.add(BCCVL['name'], Literal(key))
            # 'data_table' holds a dataset reference (uuid) rather than a plain value
            if key in ('data_table', ):
                # link the dataset by its uuid, which is the parameter value
                # (the original read LOCAL[dsuuid], an undefined name)
                param.add(BCCVL['value'], LOCAL[value])
            else:
                param.add(BCCVL['value'], Literal(value))
        # iterate over all input datasets and add them as entities
        for key in ('data_table', ):
            dsbrain = uuidToCatalogBrain(result.job_params[key])
            if not dsbrain:
                continue
            ds = dsbrain.getObject()
            dsprov = Resource(graph, LOCAL[result.job_params[key]])
            dsprov.add(RDF['type'], PROV['Entity'])
            #dsprov.add(PROV['..'], Literal(''))
            dsprov.add(DCTERMS['creator'], Literal(ds.Creator()))
            dsprov.add(DCTERMS['title'], Literal(ds.title))
            dsprov.add(DCTERMS['description'], Literal(ds.description))
            dsprov.add(DCTERMS['rights'],
                       Literal(ds.rights))  # ds.rightsstatement
            dsprov.add(DCTERMS['format'], Literal(ds.file.contentType))
            # location / source
            # graph.add(uri, DCTERMS['source'], Literal(''))
            # TODO: genre ...
            # TODO: resolution
            # species metadata
            md = IBCCVLMetadata(ds)
            # dsprov.add(BCCVL['scientificName'], Literal(md['species']['scientificName']))
            # dsprov.add(BCCVL['taxonID'], URIRef(md['species']['taxonID']))

            # ... species data, ... species id
            for layer in md.get('layers_used', ()):
                dsprov.add(BCCVL['layer'], LOCAL[layer])

            # link with activity
            activity.add(PROV['used'], dsprov)

        provdata.data = graph.serialize(format="turtle")
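
One way to verify the algoparam structure this loop builds is to reconstruct a small graph and list the name/value pairs with a SPARQL query. A minimal sketch with invented parameter names standing in for result.job_params:

from rdflib import Graph, Literal, Namespace
from rdflib.resource import Resource

BCCVL = Namespace(u"http://ns.bccvl.org.au/")
LOCAL = Namespace(u"urn:bccvl:")

graph = Graph()
activity = Resource(graph, LOCAL['activity'])
# invented parameters, standing in for result.job_params
for idx, (key, value) in enumerate([('ntree', 500), ('mtry', 2)]):
    param = Resource(graph, LOCAL[u'param_{}'.format(idx)])
    activity.add(BCCVL['algoparam'], param)
    param.add(BCCVL['name'], Literal(key))
    param.add(BCCVL['value'], Literal(value))

query = u"""
    PREFIX bccvl: <http://ns.bccvl.org.au/>
    SELECT ?name ?value WHERE {
        <urn:bccvl:activity> bccvl:algoparam ?p .
        ?p bccvl:name ?name ;
           bccvl:value ?value .
    }
"""
for name, value in graph.query(query):
    print('{} = {}'.format(name, value))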