def __call__(self):
    # 1. find all IBlobDataset / IRemoteDataset / IDataset objects
    #    within context
    pc = getToolByName(self.context, 'portal_catalog')
    brains = pc.searchResults(path='/'.join(self.context.getPhysicalPath()),
                              object_provides=IBlobDataset.__identifier__)
    fname = None
    try:
        # create tmp file
        fd, fname = tempfile.mkstemp()
        fo = os.fdopen(fd, 'wb')  # binary mode; we write a zip stream
        zfile = zipfile.ZipFile(fo, 'w')

        metadata = {}

        # the file/folder name for the zip
        zfilename = self.context.title
        # iterate over files and add to zip
        for brain in brains:
            content = brain.getObject()
            # content.file should be a NamedFile ... we need a filesystem
            # name for it, so detach the blob
            blobfile = content.file.openDetached()
            arcname = '/'.join((zfilename, 'data', content.file.filename))
            zfile.write(blobfile.name, arcname)
            blobfile.close()

            metadata[arcname] = getdsmetadata(content)
        # all files are in ....
        # TODO: add experiment result metadata

        # put metadata into zip
        # provenance data stored on result container
        provdata = IProvenanceData(self.context)
        if provdata.data is not None:
            zfile.writestr('/'.join((zfilename, 'prov.ttl')),
                           provdata.data)

        # add mets.xml
        metsview = getMultiAdapter((self.context, self.request),
                                   name="mets.xml")
        zfile.writestr('/'.join((zfilename, 'mets.xml')),
                       metsview.render())

        # finish zip file
        zfile.close()
        fo.close()

        # create response
        self.request.response.setHeader('Content-Type', 'application/zip')
        self.request.response.setHeader(
            'Content-Disposition',
            'attachment; filename="{}.zip"'.format(zfilename))
        self.request.response.setHeader(
            'Content-Length', '{}'.format(os.path.getsize(fname)))
        return tmpfile_stream_iterator(fname)
    except Exception:
        # something went wrong ... clean up and re-raise with the
        # original traceback intact
        if fname and os.path.exists(fname):
            os.remove(fname)
        raise
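# `tmpfile_stream_iterator` is used above but not defined in this section.
# A minimal sketch of what such a helper could look like, assuming its job
# is to stream the finished zip in chunks and remove the temp file once the
# response has been sent. The name is taken from the call above; the chunk
# size and the generator approach are illustrative assumptions, and the
# sketch relies on the module-level `import os` already used above.
def tmpfile_stream_iterator(fname, chunksize=64 * 1024):
    try:
        with open(fname, 'rb') as f:
            while True:
                chunk = f.read(chunksize)
                if not chunk:
                    break
                yield chunk
    finally:
        # clean up the temp file whether or not streaming completed
        if os.path.exists(fname):
            os.remove(fname)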
def _createProvenance(self, result):
    provdata = IProvenanceData(result)
    from rdflib import URIRef, Literal, Namespace, Graph
    from rdflib.namespace import RDF, FOAF, XSD
    from rdflib.resource import Resource
    PROV = Namespace(u"http://www.w3.org/ns/prov#")
    BCCVL = Namespace(u"http://ns.bccvl.org.au/")
    LOCAL = Namespace(u"urn:bccvl:")
    graph = Graph()
    # the user is our agent
    member = api.user.get_current()
    username = member.getProperty('fullname') or member.getId()
    user = Resource(graph, LOCAL['user'])
    user.add(RDF['type'], PROV['Agent'])
    user.add(RDF['type'], FOAF['Person'])
    user.add(FOAF['name'], Literal(username))
    user.add(FOAF['mbox'],
             URIRef('mailto:{}'.format(member.getProperty('email'))))
    # add software as agent
    software = Resource(graph, LOCAL['software'])
    software.add(RDF['type'], PROV['Agent'])
    software.add(RDF['type'], PROV['SoftwareAgent'])
    software.add(FOAF['name'], Literal('BCCVL ALA Importer'))
    # script content is stored somewhere on result and will be exported
    # with the zip?
    # ... or store along with pstats.json? hidden from user
    # -> execenvironment after import -> log output?
    # -> source code ... maybe some link expression? stored on result?
    #    separate entity?
    activity = Resource(graph, LOCAL['activity'])
    activity.add(RDF['type'], PROV['Activity'])
    # TODO: this is rather the queued/created time for this activity ...
    #       the real start time could be captured on the 'running' status
    #       update (or on start of transfer)
    now = datetime.now().replace(microsecond=0)
    activity.add(PROV['startedAtTime'],
                 Literal(now.isoformat(), datatype=XSD['dateTime']))
    # NB: standard PROV-O spells this prov:wasAssociatedWith
    activity.add(PROV['hasAssociationWith'], user)
    activity.add(PROV['hasAssociationWith'], software)
    provdata.data = graph.serialize(format="turtle")
def _download_results(self, context, zfile):
    # FIXME: This is a rather lengthy process and should probably be
    #        turned into a background task (maybe as part of a
    #        datamanager service?)
    # 1. find all IBlobDataset / IRemoteDataset / IDataset objects
    #    within context
    pc = getToolByName(context, 'portal_catalog')
    brains = pc.searchResults(path='/'.join(context.getPhysicalPath()),
                              object_provides=[
                                  IBlobDataset.__identifier__,
                                  IRemoteDataset.__identifier__])
    metadata = {}
    # the file/folder name for the zip
    zfilename = context.title
    # iterate over files and add to zip
    for brain in brains:
        content = brain.getObject()
        if IBlobDataset.providedBy(content):
            # data is stored locally as a blob
            arcname = '/'.join((zfilename, 'data', content.file.filename))
            # content.file should be a NamedFile ... we need a filesystem
            # name for it, so detach the blob
            blobfile = content.file.openDetached()
            zfile.write(blobfile.name, arcname)
            blobfile.close()
        elif IRemoteDataset.providedBy(content):
            # TODO: duplicate code from
            remoteUrl = getattr(content, 'remoteUrl', None)
            if remoteUrl is None:
                raise NotFound(self, 'remoteUrl', self.request)
            # get arcname from remoteUrl
            arcname = '/'.join(
                (zfilename, 'data', os.path.basename(remoteUrl)))
            # FIXME: should check the dataset's downloadable flag here,
            #        but the assumption is that this function can only be
            #        called on an experiment result folder
            # TODO: duplicate code in
            #       browser/dataset.py:RemoteDatasetDownload.__call__
            # TODO: may not work in general ... it always uses swift as
            #       the remote url
            tool = getUtility(ISwiftUtility)
            try:
                url = tool.generate_temp_url(url=remoteUrl)
            except Exception:
                url = remoteUrl
            # url is now the location from which we can fetch the file
            temp_file, _ = urlretrieve(url)
            zfile.write(temp_file, arcname)
            os.remove(temp_file)
        else:
            # unknown type of dataset ... just skip it
            # TODO: log warning or debug?
            continue
        metadata[arcname] = getdsmetadata(content)
    # all files are in ....
    # TODO: add experiment result metadata

    # put metadata into zip
    # provenance data stored on result container
    provdata = IProvenanceData(context)
    if provdata.data is not None:
        zfile.writestr('/'.join((zfilename, 'prov.ttl')),
                       provdata.data.encode('utf-8'))
    # add experiment metadata
    expmetadata = IExperimentMetadata(context)
    if expmetadata.data is not None:
        zfile.writestr('/'.join((zfilename, 'expmetadata.txt')),
                       expmetadata.data.encode('utf-8'))
    # add mets.xml
    metsview = getMultiAdapter((context, self.request), name="mets.xml")
    zfile.writestr('/'.join((zfilename, 'mets.xml')),
                   metsview.render().encode('utf-8'))
    # add experiment parameters
    params = IExperimentParameter(context)
    if params.data is not None:
        zfile.writestr('/'.join((zfilename, 'params.json')),
                       params.data.encode('utf-8'))
def __call__(self):
    self.request.response.setHeader("Content-Type",
                                    "text/turtle; charset=utf-8")
    provdata = IProvenanceData(self.context)
    return provdata.data
def __iter__(self):
    """Add PROV-O provenance metadata to imported dataset items."""
    for item in self.previous:
        # check if we have a dataset
        if item['_type'] not in ('org.bccvl.content.dataset',
                                 'org.bccvl.content.remotedataset'):
            # not a dataset
            yield item
            continue
        pathkey = self.pathkey(*item.keys())[0]
        # no path ... can't do anything
        if not pathkey:
            yield item
            continue
        path = item[pathkey]
        # skip the Plone site object itself
        if not path:
            yield item
            continue
        obj = self.context.unrestrictedTraverse(
            path.encode().lstrip('/'), None)
        if obj is None:
            # traversal failed ... nothing to annotate
            yield item
            continue
        # FIXME: this is really not a great way to check where to find
        #        the provenance data
        # check if we are inside an experiment (means we import a result)
        if IExperiment.providedBy(self.context.__parent__):
            # result import
            context = self.context
        else:
            # dataset import?
            context = obj
        # TODO: do some sanity checks
        provdata = IProvenanceData(context)
        PROV = Namespace(u"http://www.w3.org/ns/prov#")
        BCCVL = Namespace(u"http://ns.bccvl.org.au/")
        LOCAL = Namespace(u"urn:bccvl:")
        graph = Graph()
        graph.parse(data=provdata.data or '', format='turtle')
        activity = Resource(graph, LOCAL['activity'])
        # FIXME: shouldn't I use uuid instead of id?
        entity = Resource(graph, LOCAL[obj.id])
        # create this dataset as a new entity -> output of activity
        entity.add(RDF['type'], PROV['Entity'])
        # generated by
        entity.add(PROV['wasGeneratedBy'], activity)
        # PROV['wasAttributedTo'] to user and software?
        # file metadata
        entity.add(DCTERMS['creator'], Literal(obj.Creator()))
        entity.add(DCTERMS['title'], Literal(obj.title))
        entity.add(DCTERMS['description'], Literal(obj.description))
        entity.add(DCTERMS['rights'], Literal(obj.rights))
        if obj.portal_type == 'org.bccvl.content.dataset':
            entity.add(DCTERMS['format'], Literal(obj.file.contentType))
        else:
            # FIXME: this doesn't seem to do the right thing
            entity.add(DCTERMS['format'], Literal(obj.format))
        # TODO: add metadata about file?
        #       genre, layers, emsc, gcm, year
        # set the activity's end time ... the first one wins
        if activity.value(PROV['endedAtTime']) is None:
            activity.add(
                PROV['endedAtTime'],
                Literal(datetime.now().replace(microsecond=0).isoformat(),
                        datatype=XSD['dateTime']))
        # TODO: extend activity metadata with execution environment data
        #       (logfile import?, pstats import) ... and script +
        #       params.json file
        # ALA import url
        pd = item.get('_ala_provenance', {})
        if pd:
            entity.add(BCCVL['download_url'], Literal(pd['url']))
        # store prov data
        provdata.data = graph.serialize(format="turtle")
        yield item
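# For orientation: the __iter__ above is the pipeline step of a
# transmogrifier blueprint section. A minimal sketch of the standard
# collective.transmogrifier boilerplate around it, in the Plone/Python 2
# style of this codebase; the class name and the 'path-key' default are
# assumptions, only __iter__ is shown in the original:
from collective.transmogrifier.interfaces import ISection, ISectionBlueprint
from collective.transmogrifier.utils import Matcher
from zope.interface import classProvides, implements


class ProvenanceSection(object):
    classProvides(ISectionBlueprint)
    implements(ISection)

    def __init__(self, transmogrifier, name, options, previous):
        self.previous = previous
        # the import context (e.g. a result folder or the portal)
        self.context = transmogrifier.context
        # matcher used as self.pathkey(*item.keys()) in __iter__
        self.pathkey = Matcher(options.get('path-key', '_path'))

    # __iter__ as defined above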
def _createProvenance(self, result):
    provdata = IProvenanceData(result)
    from rdflib import URIRef, Literal, Namespace, Graph
    from rdflib.namespace import RDF, FOAF, DCTERMS, XSD
    from rdflib.resource import Resource
    PROV = Namespace(u"http://www.w3.org/ns/prov#")
    BCCVL = Namespace(u"http://ns.bccvl.org.au/")
    LOCAL = Namespace(u"urn:bccvl:")
    graph = Graph()
    # the user is our agent
    member = api.user.get_current()
    username = member.getProperty('fullname') or member.getId()
    user = Resource(graph, LOCAL['user'])
    user.add(RDF['type'], PROV['Agent'])
    user.add(RDF['type'], FOAF['Person'])
    user.add(FOAF['name'], Literal(username))
    user.add(FOAF['mbox'],
             URIRef('mailto:{}'.format(member.getProperty('email'))))
    # add software as agent
    software = Resource(graph, LOCAL['software'])
    software.add(RDF['type'], PROV['Agent'])
    software.add(RDF['type'], PROV['SoftwareAgent'])
    software.add(FOAF['name'], Literal('BCCVL Job Script'))
    # script content is stored somewhere on result and will be exported
    # with the zip?
    # ... or store along with pstats.json? hidden from user
    # -> execenvironment after import -> log output?
    # -> source code ... maybe some link expression? stored on result?
    #    separate entity?
    activity = Resource(graph, LOCAL['activity'])
    activity.add(RDF['type'], PROV['Activity'])
    # TODO: this is rather the queued/created time for this activity ...
    #       the real start time could be captured on the 'running' status
    #       update (or on start of transfer)
    now = datetime.now().replace(microsecond=0)
    activity.add(PROV['startedAtTime'],
                 Literal(now.isoformat(), datatype=XSD['dateTime']))
    # NB: standard PROV-O spells this prov:wasAssociatedWith
    activity.add(PROV['hasAssociationWith'], user)
    activity.add(PROV['hasAssociationWith'], software)
    # add job parameters to activity
    for idx, (key, value) in enumerate(result.job_params.items()):
        param = Resource(graph, LOCAL[u'param_{}'.format(idx)])
        activity.add(BCCVL['algoparam'], param)
        param.add(BCCVL['name'], Literal(key))
        # we have only dataset references as parameters
        if key in ('data_table',):
            # the value is a dataset uuid; reference it as a local entity
            param.add(BCCVL['value'], LOCAL[value])
        else:
            param.add(BCCVL['value'], Literal(value))
    # iterate over all input datasets and add them as entities
    for key in ('data_table',):
        dsbrain = uuidToCatalogBrain(result.job_params[key])
        if not dsbrain:
            continue
        ds = dsbrain.getObject()
        dsprov = Resource(graph, LOCAL[result.job_params[key]])
        dsprov.add(RDF['type'], PROV['Entity'])
        # dsprov.add(PROV['..'], Literal(''))
        dsprov.add(DCTERMS['creator'], Literal(ds.Creator()))
        dsprov.add(DCTERMS['title'], Literal(ds.title))
        dsprov.add(DCTERMS['description'], Literal(ds.description))
        dsprov.add(DCTERMS['rights'], Literal(ds.rights))  # ds.rightsstatement
        dsprov.add(DCTERMS['format'], Literal(ds.file.contentType))
        # location / source
        # graph.add(uri, DCTERMS['source'], Literal(''))
        # TODO: genre ...
        # TODO: resolution
        # species metadata
        md = IBCCVLMetadata(ds)
        # dsprov.add(BCCVL['scientificName'],
        #            Literal(md['species']['scientificName']))
        # dsprov.add(BCCVL['taxonID'], URIRef(md['species']['taxonID']))
        # ... species data, ... species id
        for layer in md.get('layers_used', ()):
            dsprov.add(BCCVL['layer'], LOCAL[layer])
        # link with activity
        activity.add(PROV['used'], dsprov)

    provdata.data = graph.serialize(format="turtle")
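# A self-contained sketch of the rdflib Resource/Namespace pattern that
# _createProvenance relies on, runnable outside Plone. The agent name is
# made up for illustration, and the exact Turtle formatting of the output
# varies with the rdflib version:
from rdflib import Graph, Literal, Namespace
from rdflib.namespace import RDF, FOAF
from rdflib.resource import Resource

PROV = Namespace(u"http://www.w3.org/ns/prov#")
LOCAL = Namespace(u"urn:bccvl:")

graph = Graph()
user = Resource(graph, LOCAL['user'])
user.add(RDF['type'], PROV['Agent'])
user.add(FOAF['name'], Literal(u'Jane Example'))

# serializes the whole graph as Turtle, roughly:
#   <urn:bccvl:user> a <http://www.w3.org/ns/prov#Agent> ;
#       <http://xmlns.com/foaf/0.1/name> "Jane Example" .
print(graph.serialize(format='turtle'))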