def __call__(self):
    # 1. find all IBlobDataset / IRemoteDataset / IDataset objects
    #    within context
    pc = getToolByName(self.context, 'portal_catalog')
    brains = pc.searchResults(path='/'.join(self.context.getPhysicalPath()),
                              object_provides=IBlobDataset.__identifier__)
    fname = None
    try:
        # create tmp file
        fd, fname = tempfile.mkstemp()
        fo = os.fdopen(fd, 'wb')  # binary mode; we write a zip stream
        zfile = zipfile.ZipFile(fo, 'w')

        metadata = {}

        # the file/folder name for the zip
        zfilename = self.context.title
        # iterate over files and add to zip
        for brain in brains:
            content = brain.getObject()
            # content.file should be a NamedFile ... we need a filesystem
            # name for it, so detach the blob
            blobfile = content.file.openDetached()
            arcname = '/'.join((zfilename, 'data', content.file.filename))
            zfile.write(blobfile.name, arcname)
            blobfile.close()

            metadata[arcname] = getdsmetadata(content)
        # all files are in ....
        # TODO: add experiment result metadata

        # put metadata into zip
        # provenance data stored on result container
        provdata = IProvenanceData(self.context)
        if provdata.data is not None:
            zfile.writestr('/'.join((zfilename, 'prov.ttl')),
                           provdata.data)

        # add mets.xml
        metsview = getMultiAdapter((self.context, self.request),
                                   name="mets.xml")
        zfile.writestr('/'.join((zfilename, 'mets.xml')),
                       metsview.render())

        # finish zip file
        zfile.close()
        fo.close()

        # create response
        self.request.response.setHeader('Content-Type', 'application/zip')
        self.request.response.setHeader(
            'Content-Disposition',
            'attachment; filename="{}.zip"'.format(zfilename))
        self.request.response.setHeader(
            'Content-Length', '{}'.format(os.path.getsize(fname)))
        return tmpfile_stream_iterator(fname)
    except Exception:
        # something went wrong ... clean up and re-raise with the
        # original traceback intact
        if fname and os.path.exists(fname):
            os.remove(fname)
        raise
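# `tmpfile_stream_iterator` is used above but not defined in this section.
# A minimal sketch of what such a helper could look like, assuming its job
# is to stream the finished zip in chunks and remove the temp file once the
# response has been sent. The name is taken from the call above; the chunk
# size and the generator approach are illustrative assumptions, and the
# sketch relies on the module-level `import os` already used above.
def tmpfile_stream_iterator(fname, chunksize=64 * 1024):
    try:
        with open(fname, 'rb') as f:
            while True:
                chunk = f.read(chunksize)
                if not chunk:
                    break
                yield chunk
    finally:
        # clean up the temp file whether or not streaming completed
        if os.path.exists(fname):
            os.remove(fname)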
def _createProvenance(self, result):
    provdata = IProvenanceData(result)
    from rdflib import URIRef, Literal, Namespace, Graph
    from rdflib.namespace import RDF, FOAF, XSD
    from rdflib.resource import Resource
    PROV = Namespace(u"http://www.w3.org/ns/prov#")
    BCCVL = Namespace(u"http://ns.bccvl.org.au/")
    LOCAL = Namespace(u"urn:bccvl:")
    graph = Graph()
    # the user is our agent
    member = api.user.get_current()
    username = member.getProperty('fullname') or member.getId()
    user = Resource(graph, LOCAL['user'])
    user.add(RDF['type'], PROV['Agent'])
    user.add(RDF['type'], FOAF['Person'])
    user.add(FOAF['name'], Literal(username))
    user.add(FOAF['mbox'],
             URIRef('mailto:{}'.format(member.getProperty('email'))))
    # add software as agent
    software = Resource(graph, LOCAL['software'])
    software.add(RDF['type'], PROV['Agent'])
    software.add(RDF['type'], PROV['SoftwareAgent'])
    software.add(FOAF['name'], Literal('BCCVL ALA Importer'))
    # script content is stored somewhere on result and will be exported
    # with the zip?
    # ... or store along with pstats.json? hidden from user
    # -> execenvironment after import -> log output?
    # -> source code ... maybe some link expression? stored on result?
    #    separate entity?
    activity = Resource(graph, LOCAL['activity'])
    activity.add(RDF['type'], PROV['Activity'])
    # TODO: this is rather the queued/created time for this activity ...
    #       the real start time could be captured on the 'running' status
    #       update (or on start of transfer)
    now = datetime.now().replace(microsecond=0)
    activity.add(PROV['startedAtTime'],
                 Literal(now.isoformat(), datatype=XSD['dateTime']))
    # NB: standard PROV-O spells this prov:wasAssociatedWith
    activity.add(PROV['hasAssociationWith'], user)
    activity.add(PROV['hasAssociationWith'], software)
    provdata.data = graph.serialize(format="turtle")
def _download_results(self, context, zfile):
    # FIXME: This is a rather lengthy process and should probably be
    #        turned into a background task (maybe as part of a
    #        datamanager service?)
    # 1. find all IBlobDataset / IRemoteDataset / IDataset objects
    #    within context
    pc = getToolByName(context, 'portal_catalog')
    brains = pc.searchResults(path='/'.join(context.getPhysicalPath()),
                              object_provides=[
                                  IBlobDataset.__identifier__,
                                  IRemoteDataset.__identifier__])
    metadata = {}
    # the file/folder name for the zip
    zfilename = context.title
    # iterate over files and add to zip
    for brain in brains:
        content = brain.getObject()
        if IBlobDataset.providedBy(content):
            # data is stored locally as a blob
            arcname = '/'.join((zfilename, 'data', content.file.filename))
            # content.file should be a NamedFile ... we need a filesystem
            # name for it, so detach the blob
            blobfile = content.file.openDetached()
            zfile.write(blobfile.name, arcname)
            blobfile.close()
        elif IRemoteDataset.providedBy(content):
            # TODO: duplicate code from
            remoteUrl = getattr(content, 'remoteUrl', None)
            if remoteUrl is None:
                raise NotFound(self, 'remoteUrl', self.request)
            # get arcname from remoteUrl
            arcname = '/'.join(
                (zfilename, 'data', os.path.basename(remoteUrl)))
            # FIXME: should check the dataset's downloadable flag here,
            #        but the assumption is that this function can only be
            #        called on an experiment result folder
            # TODO: duplicate code in
            #       browser/dataset.py:RemoteDatasetDownload.__call__
            # TODO: may not work in general ... it always uses swift as
            #       the remote url
            tool = getUtility(ISwiftUtility)
            try:
                url = tool.generate_temp_url(url=remoteUrl)
            except Exception:
                url = remoteUrl
            # url is now the location from which we can fetch the file
            temp_file, _ = urlretrieve(url)
            zfile.write(temp_file, arcname)
            os.remove(temp_file)
        else:
            # unknown type of dataset ... just skip it
            # TODO: log warning or debug?
            continue
        metadata[arcname] = getdsmetadata(content)
    # all files are in ....
    # TODO: add experiment result metadata

    # put metadata into zip
    # provenance data stored on result container
    provdata = IProvenanceData(context)
    if provdata.data is not None:
        zfile.writestr('/'.join((zfilename, 'prov.ttl')),
                       provdata.data.encode('utf-8'))
    # add experiment metadata
    expmetadata = IExperimentMetadata(context)
    if expmetadata.data is not None:
        zfile.writestr('/'.join((zfilename, 'expmetadata.txt')),
                       expmetadata.data.encode('utf-8'))
    # add mets.xml
    metsview = getMultiAdapter((context, self.request), name="mets.xml")
    zfile.writestr('/'.join((zfilename, 'mets.xml')),
                   metsview.render().encode('utf-8'))
    # add experiment parameters
    params = IExperimentParameter(context)
    if params.data is not None:
        zfile.writestr('/'.join((zfilename, 'params.json')),
                       params.data.encode('utf-8'))
def __call__(self):
    self.request.response.setHeader("Content-Type",
                                    "text/turtle; charset=utf-8")
    provdata = IProvenanceData(self.context)
    return provdata.data
def __iter__(self):
    """Add PROV-O provenance metadata to imported dataset items."""
    for item in self.previous:
        # check if we have a dataset
        if item['_type'] not in ('org.bccvl.content.dataset',
                                 'org.bccvl.content.remotedataset'):
            # not a dataset
            yield item
            continue
        pathkey = self.pathkey(*item.keys())[0]
        # no path ... can't do anything
        if not pathkey:
            yield item
            continue
        path = item[pathkey]
        # skip the Plone site object itself
        if not path:
            yield item
            continue
        obj = self.context.unrestrictedTraverse(
            path.encode().lstrip('/'), None)
        if obj is None:
            # traversal failed ... nothing to annotate
            yield item
            continue
        # FIXME: this is really not a great way to check where to find
        #        the provenance data
        # check if we are inside an experiment (means we import a result)
        if IExperiment.providedBy(self.context.__parent__):
            # result import
            context = self.context
        else:
            # dataset import?
            context = obj
        # TODO: do some sanity checks
        provdata = IProvenanceData(context)
        PROV = Namespace(u"http://www.w3.org/ns/prov#")
        BCCVL = Namespace(u"http://ns.bccvl.org.au/")
        LOCAL = Namespace(u"urn:bccvl:")
        graph = Graph()
        graph.parse(data=provdata.data or '', format='turtle')
        activity = Resource(graph, LOCAL['activity'])
        # FIXME: shouldn't I use uuid instead of id?
        entity = Resource(graph, LOCAL[obj.id])
        # create this dataset as a new entity -> output of activity
        entity.add(RDF['type'], PROV['Entity'])
        # generated by
        entity.add(PROV['wasGeneratedBy'], activity)
        # PROV['wasAttributedTo'] to user and software?
        # file metadata
        entity.add(DCTERMS['creator'], Literal(obj.Creator()))
        entity.add(DCTERMS['title'], Literal(obj.title))
        entity.add(DCTERMS['description'], Literal(obj.description))
        entity.add(DCTERMS['rights'], Literal(obj.rights))
        if obj.portal_type == 'org.bccvl.content.dataset':
            entity.add(DCTERMS['format'], Literal(obj.file.contentType))
        else:
            # FIXME: this doesn't seem to do the right thing
            entity.add(DCTERMS['format'], Literal(obj.format))
        # TODO: add metadata about file?
        #       genre, layers, emsc, gcm, year
        # set the activity's end time ... the first one wins
        if activity.value(PROV['endedAtTime']) is None:
            activity.add(
                PROV['endedAtTime'],
                Literal(datetime.now().replace(microsecond=0).isoformat(),
                        datatype=XSD['dateTime']))
        # TODO: extend activity metadata with execution environment data
        #       (logfile import?, pstats import) ... and script +
        #       params.json file
        # ALA import url
        pd = item.get('_ala_provenance', {})
        if pd:
            entity.add(BCCVL['download_url'], Literal(pd['url']))
        # store prov data
        provdata.data = graph.serialize(format="turtle")
        yield item
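# For orientation: the __iter__ above is the pipeline step of a
# transmogrifier blueprint section. A minimal sketch of the standard
# collective.transmogrifier boilerplate around it, in the Plone/Python 2
# style of this codebase; the class name and the 'path-key' default are
# assumptions, only __iter__ is shown in the original:
from collective.transmogrifier.interfaces import ISection, ISectionBlueprint
from collective.transmogrifier.utils import Matcher
from zope.interface import classProvides, implements


class ProvenanceSection(object):
    classProvides(ISectionBlueprint)
    implements(ISection)

    def __init__(self, transmogrifier, name, options, previous):
        self.previous = previous
        # the import context (e.g. a result folder or the portal)
        self.context = transmogrifier.context
        # matcher used as self.pathkey(*item.keys()) in __iter__
        self.pathkey = Matcher(options.get('path-key', '_path'))

    # __iter__ as defined above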
def _createProvenance(self, result):
    provdata = IProvenanceData(result)
    from rdflib import URIRef, Literal, Namespace, Graph
    from rdflib.namespace import RDF, FOAF, DCTERMS, XSD
    from rdflib.resource import Resource
    PROV = Namespace(u"http://www.w3.org/ns/prov#")
    BCCVL = Namespace(u"http://ns.bccvl.org.au/")
    LOCAL = Namespace(u"urn:bccvl:")
    graph = Graph()
    # the user is our agent
    member = api.user.get_current()
    username = member.getProperty('fullname') or member.getId()
    user = Resource(graph, LOCAL['user'])
    user.add(RDF['type'], PROV['Agent'])
    user.add(RDF['type'], FOAF['Person'])
    user.add(FOAF['name'], Literal(username))
    user.add(FOAF['mbox'],
             URIRef('mailto:{}'.format(member.getProperty('email'))))
    # add software as agent
    software = Resource(graph, LOCAL['software'])
    software.add(RDF['type'], PROV['Agent'])
    software.add(RDF['type'], PROV['SoftwareAgent'])
    software.add(FOAF['name'], Literal('BCCVL Job Script'))
    # script content is stored somewhere on result and will be exported
    # with the zip?
    # ... or store along with pstats.json? hidden from user
    # -> execenvironment after import -> log output?
    # -> source code ... maybe some link expression? stored on result?
    #    separate entity?
    activity = Resource(graph, LOCAL['activity'])
    activity.add(RDF['type'], PROV['Activity'])
    # TODO: this is rather the queued/created time for this activity ...
    #       the real start time could be captured on the 'running' status
    #       update (or on start of transfer)
    now = datetime.now().replace(microsecond=0)
    activity.add(PROV['startedAtTime'],
                 Literal(now.isoformat(), datatype=XSD['dateTime']))
    # NB: standard PROV-O spells this prov:wasAssociatedWith
    activity.add(PROV['hasAssociationWith'], user)
    activity.add(PROV['hasAssociationWith'], software)
    # add job parameters to activity
    for idx, (key, value) in enumerate(result.job_params.items()):
        param = Resource(graph, LOCAL[u'param_{}'.format(idx)])
        activity.add(BCCVL['algoparam'], param)
        param.add(BCCVL['name'], Literal(key))
        # we have only dataset references as parameters
        if key in ('data_table',):
            # the value is a dataset uuid; reference it as a local entity
            param.add(BCCVL['value'], LOCAL[value])
        else:
            param.add(BCCVL['value'], Literal(value))
    # iterate over all input datasets and add them as entities
    for key in ('data_table',):
        dsbrain = uuidToCatalogBrain(result.job_params[key])
        if not dsbrain:
            continue
        ds = dsbrain.getObject()
        dsprov = Resource(graph, LOCAL[result.job_params[key]])
        dsprov.add(RDF['type'], PROV['Entity'])
        # dsprov.add(PROV['..'], Literal(''))
        dsprov.add(DCTERMS['creator'], Literal(ds.Creator()))
        dsprov.add(DCTERMS['title'], Literal(ds.title))
        dsprov.add(DCTERMS['description'], Literal(ds.description))
        dsprov.add(DCTERMS['rights'], Literal(ds.rights))  # ds.rightsstatement
        dsprov.add(DCTERMS['format'], Literal(ds.file.contentType))
        # location / source
        # graph.add(uri, DCTERMS['source'], Literal(''))
        # TODO: genre ...
        # TODO: resolution
        # species metadata
        md = IBCCVLMetadata(ds)
        # dsprov.add(BCCVL['scientificName'],
        #            Literal(md['species']['scientificName']))
        # dsprov.add(BCCVL['taxonID'], URIRef(md['species']['taxonID']))
        # ... species data, ... species id
        for layer in md.get('layers_used', ()):
            dsprov.add(BCCVL['layer'], LOCAL[layer])
        # link with activity
        activity.add(PROV['used'], dsprov)

    provdata.data = graph.serialize(format="turtle")
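# A self-contained sketch of the rdflib Resource/Namespace pattern that
# _createProvenance relies on, runnable outside Plone. The agent name is
# made up for illustration, and the exact Turtle formatting of the output
# varies with the rdflib version:
from rdflib import Graph, Literal, Namespace
from rdflib.namespace import RDF, FOAF
from rdflib.resource import Resource

PROV = Namespace(u"http://www.w3.org/ns/prov#")
LOCAL = Namespace(u"urn:bccvl:")

graph = Graph()
user = Resource(graph, LOCAL['user'])
user.add(RDF['type'], PROV['Agent'])
user.add(FOAF['name'], Literal(u'Jane Example'))

# serializes the whole graph as Turtle, roughly:
#   <urn:bccvl:user> a <http://www.w3.org/ns/prov#Agent> ;
#       <http://xmlns.com/foaf/0.1/name> "Jane Example" .
print(graph.serialize(format='turtle'))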