def parseMets(filename, createdBy, expId=None, sync_root=None):
    """Parse a METS document in two SAX passes and ingest it.

    Pass 1 (MetsExperimentStructCreator) builds the experiment /
    dataset / datafile structure; pass 2 (MetsMetadataInfoHandler)
    ties the metadata entries to the objects created in pass 1.

    Arguments:
    filename -- path of the document to parse (METS or notMETS)
    createdBy -- a User instance
    expId -- the experiment ID to use, if any
    sync_root -- destination directory; derived via get_sync_root()
                 when not supplied

    Returns a (experiment database id, sync_root) tuple.
    """
    import time

    started = time.time()
    logger.debug('parse experiment id: ' + str(expId))

    sax_parser = make_parser(["drv_libxml2"])
    sax_parser.setFeature(feature_namespaces, 1)
    holder = MetsDataHolder()

    # First pass: only create the experiment's structure.
    sax_parser.setContentHandler(MetsExperimentStructCreator(holder))
    sax_parser.parse(filename)

    # Derive the destination directory when the caller didn't give one.
    if sync_root is None:
        root_kwargs = {"prefix": "%d-" % expId} if expId else {}
        sync_root = get_sync_root(**root_kwargs)

    # Second pass: attach the metadata info to the
    # experiment/dataset/datafile objects.
    sax_parser.setContentHandler(
        MetsMetadataInfoHandler(holder, expId, createdBy, sync_root,
                                get_sync_location()))
    sax_parser.parse(filename)

    elapsed = time.time() - started
    logger.debug('time difference in seconds: %s' % (elapsed,))
    return (holder.experimentDatabaseId, sync_root)
def hydrate(self, bundle):
    """Attach replica information to a datafile bundle before saving.

    Three cases:
    * a file was POSTed ('attached_file'): write it into the dataset
      now and describe it as a 'local' replica;
    * no replicas supplied: allocate a staging path for a later
      upload and describe it as a 'staging' replica;
    * replicas already present: leave the bundle untouched.
    """
    dataset = DatasetResource.get_via_uri(DatasetResource(),
                                          bundle.data['dataset'],
                                          bundle.request)
    if 'attached_file' in bundle.data:
        # have POSTed file: store it in the dataset straight away
        uploaded = bundle.data['attached_file'][0]
        file_path = write_uploaded_file_to_dataset(dataset, uploaded)
        location_name = 'local'
        del bundle.data['attached_file']
    elif 'replicas' not in bundle.data:
        # no replica specified: return upload path and create replica for
        # new path
        location_name = 'staging'
        stage_root = get_sync_root(
            prefix="%d-" % dataset.get_first_experiment().id)
        # TODO make sure filename isn't duplicate
        staged = os.path.join(stage_root, bundle.data['filename'])
        self.temp_url = staged
        file_path = "file://" + staged
    else:
        # caller already supplied replicas; nothing to do
        return bundle

    bundle.data['replicas'] = [{
        'url': file_path,
        'protocol': 'file',
        'location': location_name,
    }]
    return bundle
def endElementNS(self, name, qname):
    """SAX callback fired when a namespaced element closes.

    Dispatches on the local element name to finalise whatever the
    matching start-element handler began: saving Experiment / Dataset
    model rows at the end of a 'dmdSec', creating Dataset_File +
    Replica rows for metadata-less files at the end of 'amdSec', and
    writing parameter sets when a metadata ('xmlData' child) element
    closes.  Most other branches simply clear the "grab*" / "in*"
    state flags that startElementNS set.
    """
    # just get the element name without the namespace
    elName = name[1]
    if elName == 'dmdSec':
        self.inDmdSec = False
        # if we currently processing an experiment structure, let's
        # save the institution value before we finalise the experiment
        if self.processExperimentStruct:
            self.metsObject.institution = self.institution
            # let's save the experiment in the DB
            if self.tardisExpId:
                # reuse the existing experiment row when an ID was given
                self.modelExperiment = models.Experiment.objects.get(
                    pk=self.tardisExpId)
            else:
                self.modelExperiment = models.Experiment()
            # NOTE(review): when tardisExpId is falsy this sets id=None
            # and lets the DB assign a fresh primary key on save()
            self.modelExperiment.id = self.tardisExpId
            self.modelExperiment.url = self.metsObject.url
            self.modelExperiment.approved = True
            self.modelExperiment.title = self.metsObject.title
            self.modelExperiment.institution_name = \
                self.metsObject.institution
            self.modelExperiment.description = self.metsObject.description
            self.modelExperiment.start_time = self.metsObject.start_time
            self.modelExperiment.end_time = self.metsObject.end_time
            self.modelExperiment.created_by = self.createdBy
            self.modelExperiment.save()
            self.holder.experimentDatabaseId = self.modelExperiment.id
            # attach the authors in document order
            x = 0
            for author in self.metsObject.authors:
                author_experiment = models.Author_Experiment(
                    experiment=self.modelExperiment,
                    author=author,
                    order=x)
                author_experiment.save()
                x = x + 1
        elif self.processDatasetStruct:
            # let's save the dataset in the DB
            self.modelDataset = models.Dataset(
                description=self.metsObject.title,
                immutable=settings.IMMUTABLE_METS_DATASETS)
            self.modelDataset.save()
            self.modelDataset.experiments.add(self.modelExperiment)
            self.modelDataset.save()
            # let's also save the modelDataset in a dictionary so that we
            # can look it up easily later on when we start processing
            # the datafiles.
            self.datasetLookupDict[self.metsObject.id] = self.modelDataset
        self.metsObject = None
        self.processExperimentStruct = False
        self.processDatasetStruct = False
    elif elName == 'title' and self.inDmdSec:
        self.grabTitle = False
    elif elName == 'startTime' and self.processExperimentStruct:
        self.grabStartTime = False
    elif elName == 'endTime' and self.processExperimentStruct:
        self.grabEndTime = False
    elif elName == 'url' and self.processExperimentStruct:
        self.grabExperimentUrl = False
    elif elName == 'abstract' and self.processExperimentStruct:
        self.grabAbstract = False
    elif elName == 'name' and self.processExperimentStruct:
        self.inName = False
    elif elName == 'namePart' and self.inName:
        self.grabMightBeAuthor = False
    elif elName == 'roleTerm' and self.inName:
        self.grabRoleTerm = False
        self.mightBeAuthor = None
    elif elName == 'name' and self.inInstitution:
        self.grabInstitution = False
    elif elName == 'agent':
        self.inInstitution = False
    elif elName == 'amdSec':
        # we're done processing the metadata entries
        self.inAmdSec = False
        # let's reset the cached experiment model object
        self.modelExperiment = None
        logger.info(self.holder.metadataMap)
        # files keyed 'NOMD-' have no metadata section of their own;
        # create their Dataset_File/Replica rows here
        for mdId in self.holder.metadataMap:
            if mdId.startswith('NOMD-'):
                df = self.holder.metadataMap[mdId][0]
                if df.dataset.id in self.datasetLookupDict:
                    # look up the dataset this file belongs to
                    thisFilesDataset = self.datasetLookupDict[
                        df.dataset.id]
                    size = df.size
                    if not df.size:
                        size = 0

                    def checksum(obj, type_):
                        # Check if the checksum is of type
                        if obj.checksumType != type_:
                            return ''
                        checksum = obj.checksum.lower()
                        # Ensure the checksum is hexdecimal
                        if not re.match('[0-9a-f]+$', checksum):
                            return ''
                        # Get algorithm
                        try:
                            name = type_.replace('-','').lower()
                            alg = getattr(hashlib, name)
                        # NOTE(review): bare except maps any failure to
                        # '' (no checksum) — best-effort, but it also
                        # hides genuine errors; verify this is intended
                        except:
                            return ''
                        # Check checksum is the correct length
                        hex_length = alg('').digest_size * 2
                        if hex_length != len(checksum):
                            return ''
                        # Should be valid checksum of given type
                        return checksum
                    sync_url, proto = get_sync_url_and_protocol(
                        get_sync_root(),
                        df.url)
                    self.modelDatafile = models.Dataset_File(
                        dataset=thisFilesDataset,
                        filename=df.name,
                        size=size,
                        md5sum=checksum(df, 'MD5'),
                        sha512sum=checksum(df, 'SHA-512'))
                    logger.info('=== saving datafile: %s' % df.name)
                    self.modelDatafile.save()
                    replica = models.Replica(
                        datafile=self.modelDatafile,
                        url=sync_url,
                        protocol=proto,
                        location=self.syncLocation)
                    replica.save()
    elif elName == 'techMD' and self.inAmdSec:
        # leaving a techMD block: clear all per-metadata state
        self.inTechMd = False
        self.metadataId = None
        self.metsObject = None
        self.processMetadata = False
    elif elName == 'xmlData' and self.inTechMd:
        self.inXmlData = False
    elif elName != self.xmlDataChildElement and \
            self.customHandler is not None:
        # inside a custom-handled metadata block: forward the event
        self.customHandler.endElement(elName)
    elif elName == self.xmlDataChildElement and self.inXmlData:
        # a whole metadata record just closed: persist its parameters
        if self.customHandler is not None:
            self.tempMetadataHolder = self.customHandler.metadataDict
        try:
            schema = models.Schema.objects.get(
                namespace__exact=self.elementNamespace)
            # get the associated parameter names for the given schema
            parameterNames = \
                models.ParameterName.objects.filter(
                    schema__namespace__exact=schema.namespace).order_by('id')
            # let's create a trigger holder which we can use to check
            # if we still need to create another parameterset entry in the
            # DB
            createParamSetFlag = {'experiment': True,
                                  'dataset': True,
                                  'datafile': True}
            datasetParameterSet = None
            datafileParameterSet = None
            if self.metadataId in self.holder.metadataMap:
                for metsObject in self.holder.metadataMap[self.metadataId]:
                    self.metsObject = metsObject
                    metsObjectClassName = self.metsObject.__class__.__name__
                    if metsObjectClassName == 'Experiment':
                        if createParamSetFlag['experiment']:
                            # create a new parameter set for the metadata
                            parameterSet = \
                                models.ExperimentParameterSet(
                                    schema=schema,
                                    experiment=self.modelExperiment)
                            parameterSet.save()
                            # now let's process the experiment parameters
                            for parameterName in parameterNames:
                                if parameterName.name in \
                                        self.tempMetadataHolder:
                                    parameterValues = self.tempMetadataHolder[
                                        parameterName.name]
                                    self._saveParameters(
                                        'ExperimentParameter',
                                        parameterName,
                                        parameterValues,
                                        parameterSet)
                            createParamSetFlag['experiment'] = False
                        else:
                            # this is not even allowed as there's only going
                            # to be one experiment per METS file
                            raise Exception('forbidden state!')
                    elif metsObjectClassName == 'Dataset':
                        if createParamSetFlag['dataset']:
                            dataset = self.datasetLookupDict[
                                self.metsObject.id]
                            # create a new parameter set for the
                            # dataset metadata
                            datasetParameterSet = \
                                models.DatasetParameterSet(schema=schema,
                                                           dataset=dataset)
                            datasetParameterSet.save()
                            # now let's process the dataset parameters
                            for parameterName in parameterNames:
                                if parameterName.name in \
                                        self.tempMetadataHolder:
                                    parameterValues = self.tempMetadataHolder[
                                        parameterName.name]
                                    self._saveParameters(
                                        'DatasetParameter',
                                        parameterName,
                                        parameterValues,
                                        datasetParameterSet)
                            # disable creation for the next visit
                            createParamSetFlag['dataset'] = False
                    elif metsObjectClassName == 'Datafile':
                        # this will be a good time to save the
                        # "hard" metadata of this datafile so that
                        # when we start adding "soft" metadata
                        # parameters to it, we already have an
                        # entry for it in the DB
                        logger.info('=== found datafile: %s' %
                                    self.metsObject.name)
                        # look up the dataset this file belongs to
                        thisFilesDataset = self.datasetLookupDict[
                            self.metsObject.dataset.id]
                        # also check if the file already exists
                        datafile = thisFilesDataset.dataset_file_set.filter(
                            filename=self.metsObject.name,
                            size=self.metsObject.size)
                        if datafile.count() == 0:
                            size = self.metsObject.size
                            if not self.metsObject.size:
                                size = 0

                            def checksum(obj, type_):
                                # Check if the checksum is of type
                                if obj.checksumType != type_:
                                    return ''
                                checksum = obj.checksum.lower()
                                # Ensure the checksum is hexdecimal
                                if not re.match('[0-9a-f]+$', checksum):
                                    return ''
                                # Get algorithm
                                try:
                                    name = type_.replace('-','').lower()
                                    alg = getattr(hashlib, name)
                                # NOTE(review): bare except, as in the
                                # 'amdSec' branch above — verify the
                                # silent '' fallback is intended
                                except:
                                    return ''
                                # Check checksum is the correct length
                                hex_length = alg('').digest_size * 2
                                if hex_length != len(checksum):
                                    return ''
                                # Should be valid checksum of given type
                                return checksum
                            sync_url, proto = get_sync_url_and_protocol(
                                self.syncRootDir,
                                self.metsObject.url)
                            self.modelDatafile = models.Dataset_File(
                                dataset=thisFilesDataset,
                                filename=self.metsObject.name,
                                size=size,
                                md5sum=checksum(self.metsObject, 'MD5'),
                                sha512sum=checksum(self.metsObject,
                                                   'SHA-512'))
                            logger.info('=== saving datafile: %s' %
                                        self.metsObject.name)
                            self.modelDatafile.save()
                            replica = models.Replica(
                                datafile=self.modelDatafile,
                                url=sync_url,
                                protocol=proto,
                                location=self.syncLocation)
                            replica.save()
                        else:
                            # file already ingested: reuse the existing row
                            self.modelDatafile = \
                                thisFilesDataset.dataset_file_set.get(
                                    filename=self.metsObject.name,
                                    size=self.metsObject.size)
                        # TODO: we need to note here that we are
                        # only creating a datafile entry in the DB
                        # for files that have corresponding
                        # metadata. if we are to create a file
                        # entry for files with no metadata, we'll
                        # need to get the unaccessed datafiles
                        # from datasetLookupDict.
                        if createParamSetFlag['datafile']:
                            # create a new parameter set for the metadata
                            datafileParameterSet = \
                                models.DatafileParameterSet(
                                    schema=schema,
                                    dataset_file=self.modelDatafile)
                            datafileParameterSet.save()
                            # now let's process the datafile parameters
                            for parameterName in parameterNames:
                                if parameterName.name in \
                                        self.tempMetadataHolder:
                                    parameterValues = self.tempMetadataHolder[
                                        parameterName.name]
                                    self._saveParameters(
                                        'DatafileParameter',
                                        parameterName,
                                        parameterValues,
                                        datafileParameterSet)
                            createParamSetFlag['datafile'] = False
        except models.Schema.DoesNotExist:
            logger.warning('unsupported schema being ingested ' +
                           self.elementNamespace)
        # reset the current xmlData child element so that if a new
        # parameter set is read, we can process it again
        self.xmlDataChildElement = None
        self.customHandler = None
    elif elName == self.parameterName and \
            self.xmlDataChildElement is not None:
        # reset self.parameterName to None so the next parameter can be
        # processed
        self.parameterName = None
def process_simple(self, filename, created_by, eid):
    """Ingest a line-oriented "simple" (notMETS) document.

    Reads *filename* line by line, accumulating experiment / dataset
    attributes in dicts and committing each entity to the DB when the
    next entity's opening tag ('<dataset>', '<file>') is seen.

    Arguments:
    filename -- path of the simple-format document
    created_by -- a User instance recorded as the experiment creator
    eid -- existing experiment ID to update, or falsy to create one

    NOTE(review): Python 2 code ('except Schema.DoesNotExist, e'
    syntax).  The opened file handle is never closed in this span,
    and 'e' is reused both as the experiment counter and as an
    exception binding.
    """
    sync_root = get_sync_root()
    f = open(filename)
    # e/ds/df count experiments, datasets and files seen so far
    e = 0
    ds = 0
    df = 0
    current = None
    current_df_id = 0
    mdelist = []
    for line in f:
        line = line.strip()
        # logger.debug("LINE: %s, CURRENT: %s" % (line, current))
        if line.startswith('<experiment>'):
            current = 'experiment'
            e += 1
            ds = 0
            df = 0
            # initialize with empty strings to avoid key errors
            exp = {}
            exp['abstract'] = ''
            exp['organization'] = ''
            exp['title'] = ''
            exp['url'] = ''
            exp['starttime'] = None
            exp['endtime'] = None
            authors = list()
        elif line.startswith('<dataset>'):
            # commit any experiment if current = experiment
            if current == 'experiment':
                if eid:
                    experiment = Experiment.objects.get(pk=eid)
                else:
                    experiment = Experiment()
                experiment.url = exp['url']
                experiment.title = exp['title']
                experiment.institution_name = exp['organization']
                experiment.description = exp['abstract']
                experiment.created_by = created_by
                experiment.start_time = exp['starttime']
                experiment.end_time = exp['endtime']
                experiment.save()
                # replace any previously stored author links
                author_experiments = \
                    Author_Experiment.objects.all()
                author_experiments = \
                    author_experiments.filter(
                        experiment=experiment).delete()
                x = 0
                for authorName in authors:
                    author_experiment = \
                        Author_Experiment(experiment=experiment,
                                          author=authorName,
                                          order=x)
                    author_experiment.save()
                    x = x + 1
                experiment.datasets.all().delete()
                if 'metadata' in exp:
                    for md in exp['metadata']:
                        xmlns = getXmlnsFromTechXMLRaw(md)
                        logger.debug('schema %s' % xmlns)
                        schema = None
                        try:
                            schema = Schema.objects.get(
                                namespace__exact=xmlns)
                        # NOTE(review): '... ' + e concatenates a str
                        # with an exception object — raises TypeError;
                        # probably meant str(e)
                        except Schema.DoesNotExist, e:
                            logger.debug('schema not found: ' + e)
                        if schema:
                            parameternames = \
                                ParameterName.objects.filter(
                                    schema__namespace__exact=schema.namespace)
                            parameternames = \
                                parameternames.order_by('id')
                            tech_xml = getTechXMLFromRaw(md)
                            parameterset = \
                                ExperimentParameterSet(
                                    schema=schema,
                                    experiment=experiment)
                            parameterset.save()
                            for pn in parameternames:
                                # logger.debug("finding parameter %s in metadata" % pn.name)
                                try:
                                    if pn.data_type == ParameterName.NUMERIC:
                                        value = \
                                            getParameterFromTechXML(
                                                tech_xml, pn.name)
                                        if value != None:
                                            ep = \
                                                ExperimentParameter(
                                                    parameterset=parameterset,
                                                    name=pn,
                                                    string_value=None,
                                                    numerical_value=float(value))
                                            ep.save()
                                    else:
                                        ep = \
                                            ExperimentParameter(
                                                parameterset=parameterset,
                                                name=pn,
                                                string_value=getParameterFromTechXML(
                                                    tech_xml, pn.name),
                                                numerical_value=None)
                                        ep.save()
                                # NOTE(review): 'except e:' treats the
                                # int counter e as an exception class —
                                # this never matches; likely meant
                                # 'except Exception, e' — confirm
                                except e:
                                    logger.debug(
                                        'error saving experiment ' +
                                        'parameter: ' + e)
            current = 'dataset'
            ds = ds + 1
            mdflist = []
            mdslist = []
            df = 0
            dataset = dict()
        elif line.startswith('<file>'):
            # commit the pending dataset before starting the file
            if current == 'dataset':
                d = Dataset(description=dataset['description'])
                d.save()
                d.experiments.add(experiment)
                d.save()
                if 'metadata' in dataset:
                    for md in dataset['metadata']:
                        # NOTE(review): redundant re-check of
                        # 'metadata' in dataset inside the loop
                        if 'metadata' in dataset:
                            xmlns = getXmlnsFromTechXMLRaw(md)
                            logger.debug(
                                'trying to find parameters with ' +
                                'an xmlns of ' + xmlns)
                            schema = None
                            try:
                                schema = \
                                    Schema.objects.get(
                                        namespace__exact=xmlns)
                            except Schema.DoesNotExist, e:
                                logger.debug('schema not found: ' + e)
                            if schema:
                                parameternames = \
                                    ParameterName.objects.filter(
                                        schema__namespace__exact=schema.namespace)
                                parameternames = \
                                    parameternames.order_by('id')
                                tech_xml = \
                                    getTechXMLFromRaw(md)
                                parameterset = \
                                    DatasetParameterSet(
                                        schema=schema,
                                        dataset=d)
                                parameterset.save()
                                for pn in parameternames:
                                    logger.debug(
                                        "finding parameter " +
                                        pn.name + " in metadata")
                                    try:
                                        if pn.data_type == ParameterName.NUMERIC:
                                            value = \
                                                getParameterFromTechXML(
                                                    tech_xml, pn.name)
                                            if value != None:
                                                dp = \
                                                    DatasetParameter(
                                                        parameterset=parameterset,
                                                        name=pn,
                                                        string_value=None,
                                                        numerical_value=float(value))
                                                dp.save()
                                        else:
                                            dp = \
                                                DatasetParameter(
                                                    parameterset=parameterset,
                                                    name=pn,
                                                    string_value=getParameterFromTechXML(
                                                        tech_xml, pn.name),
                                                    numerical_value=None)
                                            dp.save()
                                    # NOTE(review): same 'except e:'
                                    # problem as above
                                    except e:
                                        logger.debug(
                                            'error saving ' +
                                            'experiment parameter: ' + e)
def process_simple(self, filename, created_by, eid):
    """Ingest a line-oriented "simple" (notMETS) document.

    Reads *filename* line by line, accumulating experiment / dataset
    attributes in dicts and committing each entity to the DB when the
    next entity's opening tag ('<dataset>', '<file>') is seen.

    Arguments:
    filename -- path of the simple-format document
    created_by -- a User instance recorded as the experiment creator
    eid -- existing experiment ID to update, or falsy to create one

    NOTE(review): this definition appears to be a duplicate of an
    earlier, token-identical process_simple in this file — only one
    can win at class-definition time; confirm which is intended.
    Python 2 code ('except Schema.DoesNotExist, e' syntax); the file
    handle is never closed in this span.
    """
    sync_root = get_sync_root()
    f = open(filename)
    # e/ds/df count experiments, datasets and files seen so far
    e = 0
    ds = 0
    df = 0
    current = None
    current_df_id = 0
    mdelist = []
    for line in f:
        line = line.strip()
        # logger.debug("LINE: %s, CURRENT: %s" % (line, current))
        if line.startswith('<experiment>'):
            current = 'experiment'
            e += 1
            ds = 0
            df = 0
            # initialize with empty strings to avoid key errors
            exp = {}
            exp['abstract'] = ''
            exp['organization'] = ''
            exp['title'] = ''
            exp['url'] = ''
            exp['starttime'] = None
            exp['endtime'] = None
            authors = list()
        elif line.startswith('<dataset>'):
            # commit any experiment if current = experiment
            if current == 'experiment':
                if eid:
                    experiment = Experiment.objects.get(pk=eid)
                else:
                    experiment = Experiment()
                experiment.url = exp['url']
                experiment.title = exp['title']
                experiment.institution_name = exp['organization']
                experiment.description = exp['abstract']
                experiment.created_by = created_by
                experiment.start_time = exp['starttime']
                experiment.end_time = exp['endtime']
                experiment.save()
                # replace any previously stored author links
                author_experiments = \
                    Author_Experiment.objects.all()
                author_experiments = \
                    author_experiments.filter(
                        experiment=experiment).delete()
                x = 0
                for authorName in authors:
                    author_experiment = \
                        Author_Experiment(experiment=experiment,
                                          author=authorName,
                                          order=x)
                    author_experiment.save()
                    x = x + 1
                experiment.datasets.all().delete()
                if 'metadata' in exp:
                    for md in exp['metadata']:
                        xmlns = getXmlnsFromTechXMLRaw(md)
                        logger.debug('schema %s' % xmlns)
                        schema = None
                        try:
                            schema = Schema.objects.get(
                                namespace__exact=xmlns)
                        # NOTE(review): '... ' + e concatenates a str
                        # with an exception object — raises TypeError;
                        # probably meant str(e)
                        except Schema.DoesNotExist, e:
                            logger.debug('schema not found: ' + e)
                        if schema:
                            parameternames = \
                                ParameterName.objects.filter(
                                    schema__namespace__exact=schema.namespace)
                            parameternames = \
                                parameternames.order_by('id')
                            tech_xml = getTechXMLFromRaw(md)
                            parameterset = \
                                ExperimentParameterSet(
                                    schema=schema,
                                    experiment=experiment)
                            parameterset.save()
                            for pn in parameternames:
                                # logger.debug("finding parameter %s in metadata" % pn.name)
                                try:
                                    if pn.data_type == ParameterName.NUMERIC:
                                        value = \
                                            getParameterFromTechXML(
                                                tech_xml, pn.name)
                                        if value != None:
                                            ep = \
                                                ExperimentParameter(
                                                    parameterset=parameterset,
                                                    name=pn,
                                                    string_value=None,
                                                    numerical_value=float(value))
                                            ep.save()
                                    else:
                                        ep = \
                                            ExperimentParameter(
                                                parameterset=parameterset,
                                                name=pn,
                                                string_value=getParameterFromTechXML(
                                                    tech_xml, pn.name),
                                                numerical_value=None)
                                        ep.save()
                                # NOTE(review): 'except e:' treats the
                                # int counter e as an exception class —
                                # this never matches; likely meant
                                # 'except Exception, e' — confirm
                                except e:
                                    logger.debug(
                                        'error saving experiment ' +
                                        'parameter: ' + e)
            current = 'dataset'
            ds = ds + 1
            mdflist = []
            mdslist = []
            df = 0
            dataset = dict()
        elif line.startswith('<file>'):
            # commit the pending dataset before starting the file
            if current == 'dataset':
                d = Dataset(description=dataset['description'])
                d.save()
                d.experiments.add(experiment)
                d.save()
                if 'metadata' in dataset:
                    for md in dataset['metadata']:
                        # NOTE(review): redundant re-check of
                        # 'metadata' in dataset inside the loop
                        if 'metadata' in dataset:
                            xmlns = getXmlnsFromTechXMLRaw(md)
                            logger.debug(
                                'trying to find parameters with ' +
                                'an xmlns of ' + xmlns)
                            schema = None
                            try:
                                schema = \
                                    Schema.objects.get(
                                        namespace__exact=xmlns)
                            except Schema.DoesNotExist, e:
                                logger.debug('schema not found: ' + e)
                            if schema:
                                parameternames = \
                                    ParameterName.objects.filter(
                                        schema__namespace__exact=schema.namespace)
                                parameternames = \
                                    parameternames.order_by('id')
                                tech_xml = \
                                    getTechXMLFromRaw(md)
                                parameterset = \
                                    DatasetParameterSet(
                                        schema=schema,
                                        dataset=d)
                                parameterset.save()
                                for pn in parameternames:
                                    logger.debug("finding parameter " +
                                                 pn.name + " in metadata")
                                    try:
                                        if pn.data_type == ParameterName.NUMERIC:
                                            value = \
                                                getParameterFromTechXML(
                                                    tech_xml, pn.name)
                                            if value != None:
                                                dp = \
                                                    DatasetParameter(
                                                        parameterset=parameterset,
                                                        name=pn,
                                                        string_value=None,
                                                        numerical_value=float(value))
                                                dp.save()
                                        else:
                                            dp = \
                                                DatasetParameter(
                                                    parameterset=parameterset,
                                                    name=pn,
                                                    string_value=getParameterFromTechXML(
                                                        tech_xml, pn.name),
                                                    numerical_value=None)
                                            dp.save()
                                    # NOTE(review): same 'except e:'
                                    # problem as above
                                    except e:
                                        logger.debug(
                                            'error saving ' +
                                            'experiment parameter: ' + e)