def test_get_published(self):
    self.config.job.publish(publish_set_id='test publish id')
    publish_records = PublishedRecords().records
    # For some reason this accumulates records every time I run it
    # TODO: what the heck?
    print(publish_records.count())
    self.assertGreater(publish_records.count(), 0)
    published_page = self.client.get('/combine/published')
    self.assertIn('test publish id', str(published_page.content, 'utf-8'))
    self.assertIn(b'Published Records', published_page.content)
def published(request, subset=None):
    """
    Published records
    """

    # get instance of Published model
    pub_records = PublishedRecords(subset=subset)

    # get field counts
    if pub_records.records.count() > 0:
        # get count of fields for all published job indices
        field_counts = pub_records.count_indexed_fields()
    else:
        field_counts = {}

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # get published subsets with PublishedRecords static method
    subsets = PublishedRecords.get_subsets()

    # loop through subsets and enrich
    for _ in subsets:

        # add counts
        counts = mc_handle.combine.misc.find_one(
            {'_id': 'published_field_counts_%s' % _['name']})

        # if counts not yet calculated, do now
        if counts is None:
            counts = PublishedRecords(
                subset=_['name']).count_indexed_fields()
        _['counts'] = counts

    # generate hierarchy_dict
    job_hierarchy = _stateio_prepare_job_hierarchy()

    return render(request, 'core/published.html', {
        'published': pub_records,
        'field_mappers': field_mappers,
        'xml2kvp_handle': xml2kvp.XML2kvp(),
        'field_counts': field_counts,
        'es_index_str': pub_records.esi.es_index_str,
        'subsets': subsets,
        'job_hierarchy_json': json.dumps(job_hierarchy),
        'job_hierarchy_json_subset': json.dumps(
            getattr(pub_records, 'ps_doc', {}).get('hierarchy', [])
        ),
        'breadcrumbs': breadcrumb_parser(request)
    })
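# A minimal sketch (not from the original source) of the field-count caching
# pattern used in the subset loop above, and again in job_details below: look
# for a cached "published_field_counts_<subset>" document in combine.misc, and
# fall back to recomputing via PublishedRecords.count_indexed_fields() when it
# is missing. The import paths below are assumptions about Combine's layout.
from core.models import PublishedRecords, mc_handle  # import paths assumed


def get_subset_field_counts(subset_name):
    """Return cached field counts for a published subset, computing if absent."""
    counts = mc_handle.combine.misc.find_one(
        {'_id': 'published_field_counts_%s' % subset_name})
    if counts is None:
        # no cached document yet, compute directly from the indexed records
        counts = PublishedRecords(subset=subset_name).count_indexed_fields()
    return counts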
def published_subset_create(request):
    """
    Create subset of published records
        - output should be a Mongo document in combine.misc
        called "published_subset_[SUBSET]"

    Subset Form/Doc
        - slug/id for subset: lowercase, no spaces, sanitize
        - human name
        - description
        - publish sets to include
        - also include "loose" records?
    """

    if request.method == 'GET':

        # get all published sets
        pub_records = PublishedRecords()

        # generate hierarchy_dict
        job_hierarchy = _stateio_prepare_job_hierarchy()

        return render(request, 'core/published_subset_create.html', {
            'published': pub_records,
            'job_hierarchy_json': json.dumps(job_hierarchy),
            'breadcrumbs': breadcrumb_parser(request)
        })

    if request.method == 'POST':

        LOGGER.debug('creating new published subset')

        # sanitize name
        name = request.POST.get('name')
        name = ''.join(c for c in name if c.isalnum())
        name = name.lower()

        # confirm sets are present
        sets = request.POST.getlist('sets')

        # handle non set records
        include_non_set_records = request.POST.get('include_non_set_records', False)

        # handle org / rg hierarchy
        # default to a JSON string so json.loads does not fail when absent
        hierarchy = json.loads(request.POST.get('hierarchy', '[]'))

        # create new published subset
        mc_handle.combine.misc.insert_one(
            {
                'name': name,
                'description': request.POST.get('description', None),
                'type': 'published_subset',
                'publish_set_ids': sets,
                'hierarchy': hierarchy,
                'include_non_set_records': include_non_set_records
            })

        return redirect('published_subset', subset=name)
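# Illustrative only (not from the original source): the shape of the
# "published_subset" document the POST branch above writes to combine.misc.
# The field values here are made-up examples; only the keys come from the code.
example_published_subset_doc = {
    'name': 'statewide',                    # sanitized, lowercased slug
    'description': 'All statewide publish sets',
    'type': 'published_subset',
    'publish_set_ids': ['set_a', 'set_b'],  # from the "sets" form field
    'hierarchy': [],                        # org / record group hierarchy JSON
    'include_non_set_records': False        # whether "loose" records are included
}
# PublishedRecords(subset='statewide') would then scope its records to this doc.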
def get_initial_queryset(self):

    # return queryset used as base for further sorting/filtering

    # get PublishedRecords instance
    pub_records = PublishedRecords(subset=self.kwargs.get('subset', None))

    # return queryset
    return pub_records.records
def record_group(request, org_id, record_group_id):
    """
    View information about a single record group, including any and all jobs run

    Args:
        record_group_id (str/int): PK for RecordGroup table
    """

    LOGGER.debug('retrieving record group ID: %s', record_group_id)

    # retrieve record group
    rec_group = RecordGroup.objects.get(pk=int(record_group_id))

    # get all jobs associated with record group
    jobs = Job.objects.filter(record_group=record_group_id)

    # get all currently applied publish set ids
    publish_set_ids = PublishedRecords.get_publish_set_ids()

    # loop through jobs
    for job in jobs:
        # update status
        job.update_status()

    # get record group job lineage
    job_lineage = rec_group.get_jobs_lineage()

    # get all record groups for this organization
    record_groups = RecordGroup.objects.filter(organization=org_id).exclude(
        id=record_group_id).exclude(for_analysis=True)

    # render page
    return render(request, 'core/record_group.html', {
        'record_group': rec_group,
        'jobs': jobs,
        'job_lineage_json': json.dumps(job_lineage),
        'publish_set_ids': publish_set_ids,
        'record_groups': record_groups,
        'breadcrumbs': breadcrumb_parser(request)
    })
def job_analysis(request):
    """
    Run new analysis job
    """

    # if GET, prepare form
    if request.method == 'GET':

        # retrieve jobs (limiting if needed)
        input_jobs = Job.objects.all()

        # limit if analysis_type set
        analysis_type = request.GET.get('type', None)
        subset = request.GET.get('subset', None)
        if analysis_type == 'published':

            # load PublishedRecords
            published = PublishedRecords(subset=subset)

            # define input_jobs
            input_jobs = published.published_jobs

        else:
            published = None

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()

        # get record identifier transformation scenarios
        rits = RecordIdentifierTransformation.objects.all()

        # get job lineage for all jobs (filtered to input jobs scope)
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # render page
        return render(
            request, 'core/job_analysis.html', {
                'job_select_type': 'multiple',
                'input_jobs': input_jobs,
                'published': published,
                'validation_scenarios': validation_scenarios,
                'rits': rits,
                'field_mappers': field_mappers,
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'analysis_type': analysis_type,
                'bulk_downloads': bulk_downloads,
                'job_lineage_json': json.dumps(job_lineage)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            # TODO: record_group=record_group,
            job_type_class=AnalysisJob,
            job_params=request.POST)

        # start job and update status
        job_status = cjob.start_job()

        # if job_status is absent, report job status as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('analysis')
def export_tabular_data(request, export_source=None, job_id=None, subset=None):

    # get records per file
    records_per_file = request.POST.get('records_per_file', False)
    if records_per_file in ['', False]:
        records_per_file = 500

    # get mapped fields export type
    tabular_data_export_type = request.POST.get('tabular_data_export_type')

    # get archive type
    archive_type = request.POST.get('archive_type')

    # get fm config json
    fm_export_config_json = request.POST.get('fm_export_config_json')

    # export for single job
    if export_source == 'job':

        LOGGER.debug('exporting tabular data from Job')

        # retrieve job
        cjob = CombineJob.get_combine_job(int(job_id))

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name='Export Tabular Data for Job: %s' % cjob.job.name,
            task_type='export_tabular_data',
            task_params_json=json.dumps({
                'job_id': cjob.job.id,
                'records_per_file': int(records_per_file),
                'tabular_data_export_type': tabular_data_export_type,
                'archive_type': archive_type,
                'fm_export_config_json': fm_export_config_json
            }))
        combine_task.save()

        # handle export output configurations
        combine_task = _handle_export_output(request, export_source, combine_task)

        # run celery task
        background_task = tasks.export_tabular_data.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # set gm
        gmc = GlobalMessageClient(request.session)
        target = "Job:</strong><br>%s" % cjob.job.name
        gmc.add_gm({
            'html': '<p><strong>Exporting Tabular Data for %s</p><p><a href="%s"><button type="button" '
                    'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                        target, reverse('bg_tasks')),
            'class': 'success'
        })

        return redirect('job_details',
                        org_id=cjob.job.record_group.organization.id,
                        record_group_id=cjob.job.record_group.id,
                        job_id=cjob.job.id)

    # export for published
    if export_source == 'published':

        LOGGER.debug('exporting tabular data from published records')

        # get instance of Published model
        # TODO: not used
        PublishedRecords()

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name='Export Tabular Data for Published Records',
            task_type='export_tabular_data',
            task_params_json=json.dumps({
                'published': True,
                'subset': subset,
                'records_per_file': int(records_per_file),
                'tabular_data_export_type': tabular_data_export_type,
                'archive_type': archive_type,
                'fm_export_config_json': fm_export_config_json
            }))
        combine_task.save()

        # handle export output configurations
        combine_task = _handle_export_output(request, export_source, combine_task)

        # run celery task
        background_task = tasks.export_tabular_data.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # set gm
        gmc = GlobalMessageClient(request.session)
        target = ":</strong><br>Published Records"
        gmc.add_gm({
            'html': '<p><strong>Exporting Tabular Data for %s</p><p><a href="%s"><button type="button" '
                    'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                        target, reverse('bg_tasks')),
            'class': 'success'
        })

        return redirect('published')
def job_details(request, org_id, record_group_id, job_id):
    LOGGER.debug('details for job id: %s', job_id)

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # update status
    cjob.job.update_status()

    # detailed record count
    record_count_details = cjob.job.get_detailed_job_record_count()

    # get job lineage
    job_lineage = cjob.job.get_lineage()

    # get dpla_bulk_data_match
    dpla_bulk_data_matches = cjob.job.get_dpla_bulk_data_matches()

    # check if limiting to one, pre-existing record
    get_q = request.GET.get('q', None)

    # job details and job type specific augment
    job_detail = cjob.job.job_details_dict

    # mapped field analysis, generate if not part of job_details
    if 'mapped_field_analysis' in job_detail.keys():
        field_counts = job_detail['mapped_field_analysis']
    else:
        if cjob.job.finished:
            field_counts = cjob.count_indexed_fields()
            cjob.job.update_job_details(
                {'mapped_field_analysis': field_counts}, save=True)
        else:
            LOGGER.debug('job not finished, not setting')
            field_counts = {}

    # TODO: What is this accomplishing?
    # OAI Harvest
    if isinstance(cjob, HarvestOAIJob):
        pass

    # Static Harvest
    elif isinstance(cjob, HarvestStaticXMLJob):
        pass

    # Transform
    elif isinstance(cjob, TransformJob):
        pass

    # Merge/Duplicate
    elif isinstance(cjob, MergeJob):
        pass

    # Analysis
    elif isinstance(cjob, AnalysisJob):
        pass

    # get published records, primarily for published sets
    pub_records = PublishedRecords()

    oai_sets = Record.objects(job_id=cjob.job.id).item_frequencies(field='oai_set')

    # get published subsets with PublishedRecords static method
    published_subsets = PublishedRecords.get_subsets()

    # loop through subsets and enrich
    for _ in published_subsets:

        # add counts
        counts = mc_handle.combine.misc.find_one(
            {'_id': 'published_field_counts_%s' % _['name']})

        # if counts not yet calculated, do now
        if counts is None:
            counts = PublishedRecords(
                subset=_['name']).count_indexed_fields()
        _['counts'] = counts

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # return
    return render(request, 'core/job_details.html', {
        'cjob': cjob,
        'record_group': cjob.job.record_group,
        'record_count_details': record_count_details,
        'field_counts': field_counts,
        'field_mappers': field_mappers,
        'xml2kvp_handle': xml2kvp.XML2kvp(),
        'job_lineage_json': json.dumps(job_lineage),
        'dpla_bulk_data_matches': dpla_bulk_data_matches,
        'q': get_q,
        'job_details': job_detail,
        'pr': pub_records,
        'published_subsets': published_subsets,
        'es_index_str': cjob.esi.es_index_str,
        'breadcrumbs': breadcrumb_parser(request),
        'oai_sets': dict(oai_sets)
    })
def spark_function(spark, **kwargs):
    '''
    Publish records in Combine, prepares for OAI server output

    Args:
        spark (pyspark.sql.session.SparkSession): provided by pyspark context
        kwargs:
            job_id (int): Job ID
            input_job_id (int): input Job ID to publish from
            job_input (str): location of avro files on disk

    Returns:
        None
        - creates symlinks from input job to new avro file symlinks on disk
        - copies records in DB from input job to new published job
        - copies documents in ES from input to new published job index
    '''

    # refresh Django DB Connection
    refresh_django_db_connection()

    # get job
    job = Job.objects.get(pk=int(kwargs['job_id']))

    # start job_track instance, marking job start
    job_track = JobTrack(job_id=job.id)
    job_track.save()

    # read output from input job, filtering by job_id, grabbing Combine Record schema fields
    input_job = Job.objects.get(pk=int(kwargs['input_job_id']))
    bounds = get_job_db_bounds(input_job)
    sqldf = spark.read.jdbc(settings.COMBINE_DATABASE['jdbc_url'],
                            'core_record',
                            properties=settings.COMBINE_DATABASE,
                            column='id',
                            lowerBound=bounds['lowerBound'],
                            upperBound=bounds['upperBound'],
                            numPartitions=settings.JDBC_NUMPARTITIONS)
    records = sqldf.filter(sqldf.job_id == int(kwargs['input_job_id']))

    # repartition
    records = records.repartition(settings.SPARK_REPARTITION)

    # get rows with document content
    records = records[records['document'] != '']

    # update job column, overwriting job_id from input jobs in merge
    job_id = job.id
    job_id_udf = udf(lambda record_id: job_id, IntegerType())
    records = records.withColumn('job_id', job_id_udf(records.record_id))

    # write job output to avro
    records.select(CombineRecordSchema().field_names).write.format(
        "com.databricks.spark.avro").save(job.job_output)

    # confirm directory exists
    published_dir = '%s/published' % (
        settings.BINARY_STORAGE.split('file://')[-1].rstrip('/'))
    if not os.path.exists(published_dir):
        os.mkdir(published_dir)

    # get avro files
    job_output_dir = job.job_output.split('file://')[-1]
    avros = [f for f in os.listdir(job_output_dir) if f.endswith('.avro')]
    for avro in avros:
        os.symlink(os.path.join(job_output_dir, avro),
                   os.path.join(published_dir, avro))

    # index records to DB and index to ElasticSearch
    db_records = save_records(spark=spark,
                              kwargs=kwargs,
                              job=job,
                              records_df=records,
                              write_avro=False,
                              index_records=False)

    # copy index from input job to new Publish job
    index_to_job_index = ESIndex.copy_es_index(source_index='j%s' % input_job.id,
                                               target_index='j%s' % job.id,
                                               wait_for_completion=False)

    # copy index from new Publish Job to /published index
    # NOTE: because back to back reindexes, and problems with timeouts on requests,
    # wait on task from previous reindex
    es_handle_temp = Elasticsearch(hosts=[settings.ES_HOST])
    retry = 1
    while retry <= 100:

        # get task
        task = es_handle_temp.tasks.get(index_to_job_index['task'])

        # if task complete, index job index to published index
        if task['completed']:
            index_to_published_index = ESIndex.copy_es_index(
                source_index='j%s' % job.id,
                target_index='published',
                wait_for_completion=False,
                add_copied_from=job_id  # do not use Job instance here, only pass string
            )
            break  # break from retry loop

        else:
            print("indexing to /published, waiting on task node %s, retry: %s/100" %
                  (task['task']['node'], retry))

            # bump retries, sleep, and continue
            retry += 1
            time.sleep(3)
            continue

    # get PublishedRecords handle
    pr = PublishedRecords()

    # set records from job as published
    pr.set_published_field(job_id)

    # update uniqueness of all published records
    pr.update_published_uniqueness()

    # finally, update finish_timestamp of job_track instance
    job_track.finish_timestamp = datetime.datetime.now()
    job_track.save()
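# A minimal sketch (not part of the original source) of the "wait on the
# previous reindex task" pattern used above, pulled out as a reusable helper.
# It assumes elasticsearch-py's tasks.get API and a {'completed': bool} task
# response; the retry count and sleep interval mirror the loop above.
import time

from elasticsearch import Elasticsearch


def wait_for_es_task(es_host, task_id, retries=100, sleep_seconds=3):
    """Poll an Elasticsearch task until it reports completed, or give up."""
    es = Elasticsearch(hosts=[es_host])
    for attempt in range(1, retries + 1):
        task = es.tasks.get(task_id=task_id)
        if task.get('completed'):
            return True
        print('waiting on ES task %s, retry: %s/%s' % (task_id, attempt, retries))
        time.sleep(sleep_seconds)
    return False

# e.g. wait_for_es_task(settings.ES_HOST, index_to_job_index['task'])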
def published_subset_edit(request, subset):
    """
    Edit Published Subset
    """

    if request.method == 'GET':

        # get subset published records
        pub_records = PublishedRecords()
        published_subset = PublishedRecords(subset=subset)
        published_subset.ps_doc['id'] = str(published_subset.ps_doc['_id'])

        # generate hierarchy_dict
        job_hierarchy = _stateio_prepare_job_hierarchy()

        return render(request, 'core/published_subset_edit.html', {
            'published': pub_records,
            'published_subset': published_subset,
            'job_hierarchy_json': json.dumps(job_hierarchy),
            'job_hierarchy_json_subset': json.dumps(published_subset.ps_doc.get('hierarchy', [])),
            'breadcrumbs': breadcrumb_parser(request)
        })

    if request.method == 'POST':

        LOGGER.debug('updating published subset')

        # confirm sets are present
        sets = request.POST.getlist('sets')

        # handle non set records
        include_non_set_records = request.POST.get('include_non_set_records', False)

        # handle org / rg hierarchy
        # default to a JSON string so json.loads does not fail when absent
        hierarchy = json.loads(request.POST.get('hierarchy', '[]'))

        # update published subset
        pub_records = PublishedRecords(subset=subset)
        pub_records.update_subset({
            'description': request.POST.get('description', None),
            'type': 'published_subset',
            'publish_set_ids': sets,
            'hierarchy': hierarchy,
            'include_non_set_records': include_non_set_records
        })
        pub_records.remove_subset_precounts()

        return redirect('published_subset', subset=subset)
class OAIProvider():
    """
    Class for scaffolding and building responses to OAI queries

    NOTE: Because the OAI-PMH protocol shares verbs with reserved words in Python (e.g. "set", or "from"),
    easier to keep the HTTP request args to work with as a dictionary, and maintain the original OAI-PMH vocab.
    """

    def __init__(self, args, subset=None):

        # set subset
        self.subset = subset

        # read args, route verb to verb handler
        self.verb_routes = {
            'GetRecord': self._GetRecord,
            'Identify': self._Identify,
            'ListIdentifiers': self._ListIdentifiers,
            'ListMetadataFormats': self._ListMetadataFormats,
            'ListRecords': self._ListRecords,
            'ListSets': self._ListSets
        }

        self.args = args.copy()
        self.request_timestamp = datetime.datetime.now()
        self.request_timestamp_string = self.request_timestamp.strftime(
            '%Y-%m-%dT%H:%M:%SZ')
        self.record_nodes = []

        # published dataframe slice parameters
        self.start = 0
        self.chunk_size = settings.OAI_RESPONSE_SIZE
        if 'set' in self.args.keys() and self.args['set'] != '':
            self.publish_set_id = self.args['set']
        else:
            self.publish_set_id = None

        # get instance of Published model
        self.published = PublishedRecords(subset=self.subset)

        # begin scaffolding
        self.scaffold()

    # generate XML root node with OAI-PMH scaffolding
    def scaffold(self):
        """
        Scaffold XML, OAI response

        Args:
            None

        Returns:
            None
                - sets multiple attributes for response building
        """

        # build root node, nsmap, and attributes
        NSMAP = {None: 'http://www.openarchives.org/OAI/2.0/'}
        self.root_node = etree.Element('OAI-PMH', nsmap=NSMAP)
        self.root_node.set(
            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation',
            'http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd'
        )

        # set responseDate node
        self.responseDate_node = etree.Element('responseDate')
        self.responseDate_node.text = self.request_timestamp.strftime(
            '%Y-%m-%dT%H:%M:%SZ')
        self.root_node.append(self.responseDate_node)

        # set request node
        self.request_node = etree.Element('request')

        # set verb
        try:
            self.request_node.attrib['verb'] = self.args['verb']
        except:
            self.args['verb'] = 'NULL'
            self.request_node.attrib['verb'] = 'NULL'

        # capture set if present
        if 'set' in self.args.keys():
            self.request_node.attrib['set'] = self.args['set']

        # metadataPrefix
        if 'metadataPrefix' in self.args.keys():
            self.request_node.attrib['metadataPrefix'] = self.args[
                'metadataPrefix']

        self.request_node.text = 'http://%s%s' % (settings.APP_HOST, reverse('oai'))
        self.root_node.append(self.request_node)

        # set verb node
        self.verb_node = etree.Element(self.args['verb'])
        self.root_node.append(self.verb_node)

    def retrieve_records(self, include_metadata=False):
        """
        Retrieve record(s) from DB for response

        Args:
            include_metadata (bool): If False, return only identifiers, if True, include record document as well

        Returns:
            None
                - adds record(s) to self.record_nodes
        """

        stime = time.time()
        logger.debug("retrieving records for verb %s", self.args['verb'])

        # get records
        records = self.published.records

        # if set present, filter by this set
        if self.publish_set_id:
            logger.debug('applying publish_set_id filter: %s', self.publish_set_id)
            records = records.filter(publish_set_id=self.publish_set_id)

        # loop through rows, limited by current OAI transaction start / chunk

        # count records before slice
        records_count = records.count()

        # get slice for iteration
        records = records[self.start:(self.start + self.chunk_size)]

        for record in records:

            record = OAIRecord(args=self.args,
                               record_id=record.record_id,
                               publish_set_id=record.publish_set_id,
                               document=record.document,
                               timestamp=self.request_timestamp_string)

            # include full metadata in record
            if include_metadata:
                record.include_metadata()

            # append to record_nodes
            self.record_nodes.append(record.oai_record_node)

        # add to verb node
        for oai_record_node in self.record_nodes:
            self.verb_node.append(oai_record_node)

        # finally, set resumption token
        self.set_resumption_token(records, completeListSize=records_count)

        # report
        record_nodes_num = len(self.record_nodes)
        logger.debug("%s record(s) returned in %s", record_nodes_num,
                     (float(time.time()) - float(stime)))

    def set_resumption_token(self, records, completeListSize=None):
        """
        Set resumption tokens in DB under OAITransaction model

        Args:
            completeListSize (int): total number of records based on passed parameters

        Returns:
            None
                - sets attributes related to resumption tokens
        """

        # set resumption token
        if self.start + self.chunk_size < completeListSize:

            # set token and slice parameters to DB
            token = str(uuid.uuid4())
            logger.debug('setting resumption token: %s', token)
            oai_trans = OAITransaction(verb=self.args['verb'],
                                       start=self.start + self.chunk_size,
                                       chunk_size=self.chunk_size,
                                       publish_set_id=self.publish_set_id,
                                       token=token,
                                       args=json.dumps(self.args))
            oai_trans.save()

            # set resumption token node and attributes
            self.resumptionToken_node = etree.Element('resumptionToken')
            self.resumptionToken_node.attrib['expirationDate'] = (
                self.request_timestamp + datetime.timedelta(0, 3600)
            ).strftime('%Y-%m-%dT%H:%M:%SZ')
            self.resumptionToken_node.attrib['completeListSize'] = str(
                completeListSize)
            self.resumptionToken_node.attrib['cursor'] = str(self.start)
            self.resumptionToken_node.text = token
            self.verb_node.append(self.resumptionToken_node)

    # convenience function to run all internal methods
    def generate_response(self):
        """
        Returns OAI response as XML

        Args:
            None

        Returns:
            (str): XML response
        """

        # check verb
        if self.args['verb'] not in self.verb_routes.keys():
            return self.raise_error(
                'badVerb', 'The verb %s is not allowed, must be from: %s' %
                (self.args['verb'], str(self.verb_routes.keys())))

        # check for resumption token
        if 'resumptionToken' in self.args.keys():

            # retrieve token params and alter args and search_params
            ot_query = OAITransaction.objects.filter(
                token=self.args['resumptionToken'])
            if ot_query.count() == 1:
                ot = ot_query.first()

                # set args and start and chunk_size
                self.start = ot.start
                self.chunk_size = ot.chunk_size
                self.publish_set_id = ot.publish_set_id
                logger.debug(
                    'following resumption token, altering dataframe slice params:')
                logger.debug([self.start, self.chunk_size, self.publish_set_id])

            # raise error
            else:
                return self.raise_error(
                    'badResumptionToken', 'The resumptionToken %s is not found' %
                    self.args['resumptionToken'])

        # fire verb response building
        self.verb_routes[self.args['verb']]()

        return self.serialize()

    def raise_error(self, error_code, error_msg):
        """
        Returns error as XML, OAI response

        Args:
            error_code (str): OAI-PMH error codes (e.g. badVerb, generic, etc.)
            error_msg (str): details about error

        Returns:
            (str): XML response
        """

        # remove verb node
        try:
            self.root_node.remove(self.verb_node)
        except:
            logger.debug('verb_node not found')

        # create error node and append
        error_node = etree.SubElement(self.root_node, 'error')
        error_node.attrib['code'] = error_code
        error_node.text = error_msg

        # serialize and return
        return self.serialize()

    # serialize record nodes as XML response
    def serialize(self):
        """
        Serialize all nodes as XML for returning

        Args:
            None

        Returns:
            (str): XML response
        """

        return etree.tostring(self.root_node)

    # GetRecord
    def _GetRecord(self):
        """
        OAI-PMH verb: GetRecord
        Retrieve a single record based on record id, return

        Args:
            None

        Returns:
            None
                sets single record node to self.record_nodes
        """

        stime = time.time()
        logger.debug("retrieving record: %s", self.args['identifier'])

        # get single row
        single_record = self.published.get_record(self.args['identifier'])

        # if single record found
        if single_record:

            # open as OAIRecord
            record = OAIRecord(args=self.args,
                               record_id=single_record.record_id,
                               document=single_record.document,
                               timestamp=self.request_timestamp_string)

            # include metadata
            record.include_metadata()

            # append to record_nodes
            self.record_nodes.append(record.oai_record_node)

            # add to verb node
            for oai_record_node in self.record_nodes:
                self.verb_node.append(oai_record_node)

        else:
            logger.debug('record not found for id: %s, not appending node',
                         self.args['identifier'])

        # report
        etime = time.time()
        logger.debug("%s record(s) returned in %sms", len(self.record_nodes),
                     (float(etime) - float(stime)) * 1000)

    # Identify
    def _Identify(self):
        """
        OAI-PMH verb: Identify
        Provide information about Repository / OAI Server

        Args:
            None

        Returns:
            None
                sets description node text
        """

        # init OAIRecord
        logger.debug('generating identify node')

        # write Identify node
        description_node = etree.Element('description')
        desc_text = 'Combine, integrated OAI-PMH.'
        if self.subset is not None:
            desc_text += ' Note: You are receiving a published subset of this Combine instance named: %s.' % self.subset
        description_node.text = desc_text
        self.verb_node.append(description_node)

    # ListIdentifiers
    def _ListIdentifiers(self):
        """
        OAI-PMH verb: ListIdentifiers
        Lists identifiers

        Args:
            None

        Returns:
            None
                sets multiple record nodes to self.record.nodes
        """

        self.retrieve_records()

    # ListMetadataFormats
    def _ListMetadataFormats(self):
        """
        OAI-PMH verb: ListMetadataFormats
        List all metadata formats, or optionally, available metadata formats for
        one item based on published metadata formats

        NOTE: Currently, Combine does not support Metadata Formats for the outgoing
        OAI-PMH server. All published Records are undoubtedly of a metadata format,
        but this is opaque to Combine. This may change in the future, but for now,
        a shim is in place to return valid OAI-PMH responses for the verb
        ListMetadataFormats.
        """

        # generic metadata prefix shim
        generic_metadata_hash = {
            'prefix': 'generic',
            'schema': 'http://generic.org/schema',
            'namespace': 'gnc'
        }

        # identifier provided
        if 'identifier' in self.args.keys():

            try:
                logging.debug(
                    "identifier provided for ListMetadataFormats, confirming that identifier exists...")
                single_record = self.published.get_record(
                    self.args['identifier'])
                if single_record:
                    mf_node = etree.Element('metadataFormat')

                    # write metadataPrefix node
                    prefix = etree.SubElement(mf_node, 'metadataPrefix')
                    prefix.text = generic_metadata_hash['prefix']

                    # write schema node
                    schema = etree.SubElement(mf_node, 'schema')
                    schema.text = generic_metadata_hash['schema']

                    # write metadataNamespace node
                    namespace = etree.SubElement(mf_node, 'metadataNamespace')
                    namespace.text = generic_metadata_hash['namespace']

                    # append to verb_node and return
                    self.verb_node.append(mf_node)
                else:
                    raise Exception('record could not be located')

            except:
                return self.raise_error(
                    'idDoesNotExist',
                    'The identifier %s is not found.' % self.args['identifier'])

        # no identifier, return all available metadataPrefixes
        else:
            mf_node = etree.Element('metadataFormat')

            # write metadataPrefix node
            prefix = etree.SubElement(mf_node, 'metadataPrefix')
            prefix.text = generic_metadata_hash['prefix']

            # write schema node
            schema = etree.SubElement(mf_node, 'schema')
            schema.text = generic_metadata_hash['schema']

            # write metadataNamespace node
            namespace = etree.SubElement(mf_node, 'metadataNamespace')
            namespace.text = generic_metadata_hash['namespace']

            # append to verb_node and return
            self.verb_node.append(mf_node)

    # ListRecords
    def _ListRecords(self):
        """
        OAI-PMH verb: ListRecords
        Lists records; similar to ListIdentifiers, but includes metadata from record.document

        Args:
            None

        Returns:
            None
                sets multiple record nodes to self.record.nodes
        """

        self.retrieve_records(include_metadata=True)

    # ListSets
    def _ListSets(self):
        """
        OAI-PMH verb: ListSets
        Lists available sets. Sets are derived from the publish_set_id from a published Job

        Args:
            None

        Returns:
            None
                sets multiple set nodes
        """

        # generate response
        for publish_set_id in self.published.sets:
            set_node = etree.Element('set')
            setSpec = etree.SubElement(set_node, 'setSpec')
            setSpec.text = publish_set_id
            setName = etree.SubElement(set_node, 'setName')
            setName.text = publish_set_id
            self.verb_node.append(set_node)
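# A minimal usage sketch (not from the original source) of how a Django view
# might drive OAIProvider: copy the querystring args into a plain dict, let
# generate_response() route the verb and serialize the XML, and return it.
# The view name and content type are assumptions, not Combine's actual wiring.
from django.http import HttpResponse


def oai_endpoint_sketch(request, subset=None):
    """Hypothetical OAI-PMH endpoint returning a serialized OAIProvider response."""
    op = OAIProvider(request.GET.dict(), subset=subset)
    return HttpResponse(op.generate_response(), content_type='text/xml')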