Example 1: OAIProvider.__init__ (the full class appears in Example 12)
    def __init__(self, args, subset=None):

        # set subset
        self.subset = subset

        # read args, route verb to verb handler
        self.verb_routes = {
            'GetRecord': self._GetRecord,
            'Identify': self._Identify,
            'ListIdentifiers': self._ListIdentifiers,
            'ListMetadataFormats': self._ListMetadataFormats,
            'ListRecords': self._ListRecords,
            'ListSets': self._ListSets
        }

        self.args = args.copy()
        self.request_timestamp = datetime.datetime.now()
        self.request_timestamp_string = self.request_timestamp.strftime(
            '%Y-%m-%dT%H:%M:%SZ')
        self.record_nodes = []

        # published dataframe slice parameters
        self.start = 0
        self.chunk_size = settings.OAI_RESPONSE_SIZE
        if 'set' in self.args.keys() and self.args['set'] != '':
            self.publish_set_id = self.args['set']
        else:
            self.publish_set_id = None

        # get instance of Published model
        self.published = PublishedRecords(subset=self.subset)

        # begin scaffolding
        self.scaffold()
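A minimal sketch of calling this constructor directly; the import path and the example args below are assumptions, not part of the example above, and the generate_response() call it relies on is defined in the full class shown in Example 12.

# Hypothetical direct invocation (import path and arg values are assumptions)
from core.oai import OAIProvider  # assumed module location

args = {'verb': 'ListRecords', 'set': 'some_publish_set_id'}  # mirrors OAI-PMH query params
provider = OAIProvider(args, subset=None)
xml_bytes = provider.generate_response()  # Example 12 routes 'ListRecords' to _ListRecords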
Example 2: test_get_published
    def test_get_published(self):
        self.config.job.publish(publish_set_id='test publish id')
        publish_records = PublishedRecords().records
        # For some reason this accumulates records every time I run it
        # TODO: what the heck?
        print(publish_records.count())
        self.assertGreater(publish_records.count(), 0)
        published_page = self.client.get('/combine/published')
        self.assertIn('test publish id', str(published_page.content, 'utf-8'))
        self.assertIn(b'Published Records', published_page.content)
Example 3: published view
def published(request, subset=None):
    """
        Published records
        """

    # get instance of Published model
    pub_records = PublishedRecords(subset=subset)

    # get field counts
    if pub_records.records.count() > 0:
        # get count of fields for all published job indices
        field_counts = pub_records.count_indexed_fields()
    else:
        field_counts = {}

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # get published subsets with PublishedRecords static method
    subsets = PublishedRecords.get_subsets()

    # loop through subsets and enrich
    for _ in subsets:

        # add counts
        counts = mc_handle.combine.misc.find_one(
            {'_id': 'published_field_counts_%s' % _['name']})

        # if counts not yet calculated, do now
        if counts is None:
            counts = PublishedRecords(
                subset=_['name']).count_indexed_fields()
        _['counts'] = counts

    # generate hierarchy_dict
    job_hierarchy = _stateio_prepare_job_hierarchy()

    return render(request, 'core/published.html', {
        'published': pub_records,
        'field_mappers': field_mappers,
        'xml2kvp_handle': xml2kvp.XML2kvp(),
        'field_counts': field_counts,
        'es_index_str': pub_records.esi.es_index_str,
        'subsets': subsets,
        'job_hierarchy_json': json.dumps(job_hierarchy),
        'job_hierarchy_json_subset': json.dumps(
            getattr(pub_records, 'ps_doc', {}).get('hierarchy', [])
        ),
        'breadcrumbs': breadcrumb_parser(request)
    })
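This view is looked up by the URL names 'published' and 'published_subset' elsewhere in these examples (the redirects in Examples 4 and 8), and the test in Example 2 requests '/combine/published'. One plausible urls.py wiring, assumed rather than taken from the examples, is sketched below.

# Hypothetical wiring (assumption): included under a 'combine/' prefix in the project urls.py
from django.urls import path
from core import views  # assumed module

urlpatterns = [
    path('published', views.published, name='published'),
    path('published/subset/<str:subset>', views.published, name='published_subset'),
]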
Example 4: published_subset_create view
def published_subset_create(request):
    """
        Create subset of published records
                - output should be a Mongo document in combine.misc
                called "published_subset_[SUBSET]"

        Subset Form/Doc
                - slug/id for subset: lowercase, no spaces, sanitize
                - human name
                - description
                - publish sets to include
                        - also include "loose" records?
        """

    if request.method == 'GET':

        # get all published sets
        pub_records = PublishedRecords()

        # generate hierarchy_dict
        job_hierarchy = _stateio_prepare_job_hierarchy()

        return render(request, 'core/published_subset_create.html', {
            'published': pub_records,
            'job_hierarchy_json': json.dumps(job_hierarchy),
            'breadcrumbs': breadcrumb_parser(request)
        })

    if request.method == 'POST':

        LOGGER.debug('creating new published subset')

        # sanitize name
        name = request.POST.get('name')
        name = ''.join(c for c in name if c.isalnum())
        name = name.lower()

        # confirm sets are present
        sets = request.POST.getlist('sets')

        # handle non set records
        include_non_set_records = request.POST.get('include_non_set_records', False)

        # handle org / rg hierarchy
        hierarchy = json.loads(request.POST.get('hierarchy', '[]'))  # default to empty JSON list

        # create new published subset
        mc_handle.combine.misc.insert_one(
            {
                'name': name,
                'description': request.POST.get('description', None),
                'type': 'published_subset',
                'publish_set_ids': sets,
                'hierarchy': hierarchy,
                'include_non_set_records': include_non_set_records
            })

        return redirect('published_subset',
                        subset=name)
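For reference, the POST branch above leaves a document in combine.misc shaped like the insert_one() payload. A sketch of reading those subset documents back follows; querying by the 'type' field is an assumption about how PublishedRecords.get_subsets() might work, not something the example confirms.

# Hypothetical read-back of published subset docs (query shape is an assumption)
subset_docs = list(mc_handle.combine.misc.find({'type': 'published_subset'}))
for doc in subset_docs:
    print(doc['name'], doc['publish_set_ids'], doc.get('include_non_set_records', False))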
Example 5: get_initial_queryset
    def get_initial_queryset(self):

        # return queryset used as base for further sorting/filtering

        # get PublishedRecords instance
        pub_records = PublishedRecords(subset=self.kwargs.get('subset', None))

        # return queryset
        return pub_records.records
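The get_initial_queryset() override above looks like the hook exposed by django-datatables-view's BaseDatatableView. Assuming that library, the surrounding class could look roughly like the sketch below; the class name, columns, and URL kwarg are illustrative assumptions.

# Hypothetical containing class (names and columns are assumptions)
from django_datatables_view.base_datatable_view import BaseDatatableView

class PublishedRecordsDatatable(BaseDatatableView):
    columns = ['record_id', 'publish_set_id']        # illustrative columns
    order_columns = ['record_id', 'publish_set_id']  # illustrative ordering

    def get_initial_queryset(self):
        # 'subset' arrives via the URL kwargs, e.g. a /published/<subset> style route
        pub_records = PublishedRecords(subset=self.kwargs.get('subset', None))
        return pub_records.records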
Example 6: record_group view
def record_group(request, org_id, record_group_id):
    """
        View information about a single record group, including any and all jobs run

        Args:
                record_group_id (str/int): PK for RecordGroup table
        """

    LOGGER.debug('retrieving record group ID: %s', record_group_id)

    # retrieve record group
    rec_group = RecordGroup.objects.get(pk=int(record_group_id))

    # get all jobs associated with record group
    jobs = Job.objects.filter(record_group=record_group_id)

    # get all currently applied publish set ids
    publish_set_ids = PublishedRecords.get_publish_set_ids()

    # loop through jobs
    for job in jobs:
        # update status
        job.update_status()

    # get record group job lineage
    job_lineage = rec_group.get_jobs_lineage()

    # get all record groups for this organization
    record_groups = RecordGroup.objects.filter(organization=org_id).exclude(id=record_group_id).exclude(
        for_analysis=True)

    # render page
    return render(request, 'core/record_group.html', {
        'record_group': rec_group,
        'jobs': jobs,
        'job_lineage_json': json.dumps(job_lineage),
        'publish_set_ids': publish_set_ids,
        'record_groups': record_groups,
        'breadcrumbs': breadcrumb_parser(request)
    })
Example 7: job_analysis view
def job_analysis(request):
    """
    Run new analysis job
    """

    # if GET, prepare form
    if request.method == 'GET':

        # retrieve jobs (limiting if needed)
        input_jobs = Job.objects.all()

        # limit if analysis_type set
        analysis_type = request.GET.get('type', None)
        subset = request.GET.get('subset', None)
        if analysis_type == 'published':

            # load PublishedRecords
            published = PublishedRecords(subset=subset)

            # define input_jobs
            input_jobs = published.published_jobs

        else:
            published = None

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()

        # get record identifier transformation scenarios
        rits = RecordIdentifierTransformation.objects.all()

        # get job lineage for all jobs (filtered to input jobs scope)
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # render page
        return render(
            request, 'core/job_analysis.html', {
                'job_select_type': 'multiple',
                'input_jobs': input_jobs,
                'published': published,
                'validation_scenarios': validation_scenarios,
                'rits': rits,
                'field_mappers': field_mappers,
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'analysis_type': analysis_type,
                'bulk_downloads': bulk_downloads,
                'job_lineage_json': json.dumps(job_lineage)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            # TODO: record_group=record_group,
            job_type_class=AnalysisJob,
            job_params=request.POST)

        # start job and update status
        job_status = cjob.start_job()

        # if job_status is absent, report job status as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('analysis')
Example 8: export_tabular_data view
def export_tabular_data(request, export_source=None, job_id=None, subset=None):
    # get records per file
    records_per_file = request.POST.get('records_per_file', False)
    if records_per_file in ['', False]:
        records_per_file = 500

    # get mapped fields export type
    tabular_data_export_type = request.POST.get('tabular_data_export_type')

    # get archive type
    archive_type = request.POST.get('archive_type')

    # get fm config json
    fm_export_config_json = request.POST.get('fm_export_config_json')

    # export for single job
    if export_source == 'job':
        LOGGER.debug('exporting tabular data from Job')

        # retrieve job
        cjob = CombineJob.get_combine_job(int(job_id))

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name='Export Tabular Data for Job: %s' % cjob.job.name,
            task_type='export_tabular_data',
            task_params_json=json.dumps({
                'job_id': cjob.job.id,
                'records_per_file': int(records_per_file),
                'tabular_data_export_type': tabular_data_export_type,
                'archive_type': archive_type,
                'fm_export_config_json': fm_export_config_json
            }))
        combine_task.save()

        # handle export output configurations
        combine_task = _handle_export_output(request, export_source,
                                             combine_task)

        # run celery task
        background_task = tasks.export_tabular_data.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # set gm
        gmc = GlobalMessageClient(request.session)
        target = "Job:</strong><br>%s" % cjob.job.name
        gmc.add_gm({
            'html': '<p><strong>Exporting Tabular Data for %s</p><p><a href="%s"><button type="button" '
                    'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>'
                    % (target, reverse('bg_tasks')),
            'class': 'success'
        })

        return redirect('job_details',
                        org_id=cjob.job.record_group.organization.id,
                        record_group_id=cjob.job.record_group.id,
                        job_id=cjob.job.id)

    # export for published
    if export_source == 'published':
        LOGGER.debug('exporting tabular data from published records')

        # get instance of Published model
        # TODO: not used
        PublishedRecords()

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name='Export Tabular Data for Published Records',
            task_type='export_tabular_data',
            task_params_json=json.dumps({
                'published': True,
                'subset': subset,
                'records_per_file': int(records_per_file),
                'tabular_data_export_type': tabular_data_export_type,
                'archive_type': archive_type,
                'fm_export_config_json': fm_export_config_json
            }))
        combine_task.save()

        # handle export output configurations
        combine_task = _handle_export_output(request, export_source,
                                             combine_task)

        # run celery task
        background_task = tasks.export_tabular_data.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # set gm
        gmc = GlobalMessageClient(request.session)
        target = ":</strong><br>Published Records"
        gmc.add_gm({
            'html': '<p><strong>Exporting Tabular Data for %s</p><p><a href="%s"><button type="button" '
                    'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>'
                    % (target, reverse('bg_tasks')),
            'class': 'success'
        })

        return redirect('published')
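Both branches above hand the Celery worker only combine_task.id, so the worker has to re-read the JSON-encoded parameters it was given. A minimal sketch of that consumer side follows; the model lookup and field access on CombineBackgroundTask are assumptions based on how the task is constructed above.

# Hypothetical consumer side of the background task (model access is an assumption)
import json

def run_export_tabular_data(combine_task_id):
    combine_task = CombineBackgroundTask.objects.get(pk=combine_task_id)
    params = json.loads(combine_task.task_params_json)
    records_per_file = params['records_per_file']      # int, defaulted to 500 above
    export_type = params['tabular_data_export_type']
    if params.get('published'):
        # published export: pull records for the (optional) subset
        records = PublishedRecords(subset=params.get('subset')).records
    # ... job export and chunked writing elided ...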
Example 9: job_details view
def job_details(request, org_id, record_group_id, job_id):
    LOGGER.debug('details for job id: %s', job_id)

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # update status
    cjob.job.update_status()

    # detailed record count
    record_count_details = cjob.job.get_detailed_job_record_count()

    # get job lineage
    job_lineage = cjob.job.get_lineage()

    # get dpla_bulk_data_match
    dpla_bulk_data_matches = cjob.job.get_dpla_bulk_data_matches()

    # check if limiting to one, pre-existing record
    get_q = request.GET.get('q', None)

    # job details and job type specific augment
    job_detail = cjob.job.job_details_dict

    # mapped field analysis, generate if not part of job_details
    if 'mapped_field_analysis' in job_detail.keys():
        field_counts = job_detail['mapped_field_analysis']
    else:
        if cjob.job.finished:
            field_counts = cjob.count_indexed_fields()
            cjob.job.update_job_details(
                {'mapped_field_analysis': field_counts}, save=True)
        else:
            LOGGER.debug('job not finished, not setting')
            field_counts = {}

    # TODO: What is this accomplishing?
    # OAI Harvest
    if isinstance(cjob, HarvestOAIJob):
        pass

    # Static Harvest
    elif isinstance(cjob, HarvestStaticXMLJob):
        pass

    # Transform
    elif isinstance(cjob, TransformJob):
        pass

    # Merge/Duplicate
    elif isinstance(cjob, MergeJob):
        pass

    # Analysis
    elif isinstance(cjob, AnalysisJob):
        pass

    # get published records, primarily for published sets
    pub_records = PublishedRecords()

    oai_sets = Record.objects(job_id=cjob.job.id).item_frequencies(field='oai_set')

    # get published subsets with PublishedRecords static method
    published_subsets = PublishedRecords.get_subsets()

    # loop through subsets and enrich
    for _ in published_subsets:

        # add counts
        counts = mc_handle.combine.misc.find_one(
            {'_id': 'published_field_counts_%s' % _['name']})

        # if counts not yet calculated, do now
        if counts is None:
            counts = PublishedRecords(
                subset=_['name']).count_indexed_fields()
        _['counts'] = counts

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # return
    return render(request, 'core/job_details.html', {
        'cjob': cjob,
        'record_group': cjob.job.record_group,
        'record_count_details': record_count_details,
        'field_counts': field_counts,
        'field_mappers': field_mappers,
        'xml2kvp_handle': xml2kvp.XML2kvp(),
        'job_lineage_json': json.dumps(job_lineage),
        'dpla_bulk_data_matches': dpla_bulk_data_matches,
        'q': get_q,
        'job_details': job_detail,
        'pr': pub_records,
        'published_subsets': published_subsets,
        'es_index_str': cjob.esi.es_index_str,
        'breadcrumbs': breadcrumb_parser(request),
        'oai_sets': dict(oai_sets)
    })
Example 10: spark_function (publish records via Spark)
    def spark_function(spark, **kwargs):
        '''
        Publish records in Combine, preparing them for OAI server output

        Args:
            spark (pyspark.sql.session.SparkSession): provided by pyspark context
            kwargs:
                job_id (int): Job ID
                job_input (str): location of avro files on disk

        Returns:
            None
            - creates symlinks from input job to new avro file symlinks on disk
            - copies records in DB from input job to new published job
            - copies documents in ES from input to new published job index
        '''

        # refresh Django DB Connection
        refresh_django_db_connection()

        # get job
        job = Job.objects.get(pk=int(kwargs['job_id']))

        # start job_track instance, marking job start
        job_track = JobTrack(job_id=job.id)
        job_track.save()

        # read output from input job, filtering by job_id, grabbing Combine Record schema fields
        input_job = Job.objects.get(pk=int(kwargs['input_job_id']))
        bounds = get_job_db_bounds(input_job)
        sqldf = spark.read.jdbc(settings.COMBINE_DATABASE['jdbc_url'],
                                'core_record',
                                properties=settings.COMBINE_DATABASE,
                                column='id',
                                lowerBound=bounds['lowerBound'],
                                upperBound=bounds['upperBound'],
                                numPartitions=settings.JDBC_NUMPARTITIONS)
        records = sqldf.filter(sqldf.job_id == int(kwargs['input_job_id']))

        # repartition
        records = records.repartition(settings.SPARK_REPARTITION)

        # get rows with document content
        records = records[records['document'] != '']

        # update job column, overwriting job_id from input jobs in merge
        job_id = job.id
        job_id_udf = udf(lambda record_id: job_id, IntegerType())
        records = records.withColumn('job_id', job_id_udf(records.record_id))

        # write job output to avro
        records.select(CombineRecordSchema().field_names).write.format(
            "com.databricks.spark.avro").save(job.job_output)

        # confirm directory exists
        published_dir = '%s/published' % (
            settings.BINARY_STORAGE.split('file://')[-1].rstrip('/'))
        if not os.path.exists(published_dir):
            os.mkdir(published_dir)

        # get avro files
        job_output_dir = job.job_output.split('file://')[-1]
        avros = [f for f in os.listdir(job_output_dir) if f.endswith('.avro')]
        for avro in avros:
            os.symlink(os.path.join(job_output_dir, avro),
                       os.path.join(published_dir, avro))

        # index records to DB and index to ElasticSearch
        db_records = save_records(spark=spark,
                                  kwargs=kwargs,
                                  job=job,
                                  records_df=records,
                                  write_avro=False,
                                  index_records=False)

        # copy index from input job to new Publish job
        index_to_job_index = ESIndex.copy_es_index(
            source_index='j%s' % input_job.id,
            target_index='j%s' % job.id,
            wait_for_completion=False)

        # copy index from new Publish Job to /published index
        # NOTE: because back to back reindexes, and problems with timeouts on requests,
        # wait on task from previous reindex
        es_handle_temp = Elasticsearch(hosts=[settings.ES_HOST])
        retry = 1
        while retry <= 100:

            # get task
            task = es_handle_temp.tasks.get(index_to_job_index['task'])

            # if task complete, index job index to published index
            if task['completed']:
                index_to_published_index = ESIndex.copy_es_index(
                    source_index='j%s' % job.id,
                    target_index='published',
                    wait_for_completion=False,
                    add_copied_from=
                    job_id  # do not use Job instance here, only pass string
                )
                break  # break from retry loop

            else:
                print(
                    "indexing to /published, waiting on task node %s, retry: %s/100"
                    % (task['task']['node'], retry))

                # bump retries, sleep, and continue
                retry += 1
                time.sleep(3)
                continue

        # get PublishedRecords handle
        pr = PublishedRecords()

        # set records from job as published
        pr.set_published_field(job_id)

        # update uniqueness of all published records
        pr.update_published_uniqueness()

        # finally, update finish_timestamp of job_track instance
        job_track.finish_timestamp = datetime.datetime.now()
        job_track.save()
Example 11: published_subset_edit view
def published_subset_edit(request, subset):
    """
        Edit Published Subset
        """

    if request.method == 'GET':

        # get subset published records
        pub_records = PublishedRecords()
        published_subset = PublishedRecords(subset=subset)
        published_subset.ps_doc['id'] = str(published_subset.ps_doc['_id'])

        # generate hierarchy_dict
        job_hierarchy = _stateio_prepare_job_hierarchy()

        return render(request, 'core/published_subset_edit.html', {
            'published': pub_records,
            'published_subset': published_subset,
            'job_hierarchy_json': json.dumps(job_hierarchy),
            'job_hierarchy_json_subset': json.dumps(published_subset.ps_doc.get('hierarchy', [])),
            'breadcrumbs': breadcrumb_parser(request)
        })

    if request.method == 'POST':

        LOGGER.debug('updating published subset')

        # confirm sets are present
        sets = request.POST.getlist('sets')

        # handle non set records
        include_non_set_records = request.POST.get('include_non_set_records', False)

        # handle org / rg hierarchy
        hierarchy = json.loads(request.POST.get('hierarchy', '[]'))  # default to empty JSON list

        # update published subset
        pub_records = PublishedRecords(subset=subset)
        pub_records.update_subset({
            'description': request.POST.get('description', None),
            'type': 'published_subset',
            'publish_set_ids': sets,
            'hierarchy': hierarchy,
            'include_non_set_records': include_non_set_records
        })
        pub_records.remove_subset_precounts()

        return redirect('published_subset',
                        subset=subset)
Example 12: OAIProvider class
class OAIProvider():
    """
    Class for scaffolding and building responses to OAI queries

    NOTE: Because the OAI-PMH protocol shares verbs with reserved words in Python (e.g. "set", or "from"),
    easier to keep the HTTP request args to work with as a dictionary, and maintain the original OAI-PMH vocab.
    """
    def __init__(self, args, subset=None):

        # set subset
        self.subset = subset

        # read args, route verb to verb handler
        self.verb_routes = {
            'GetRecord': self._GetRecord,
            'Identify': self._Identify,
            'ListIdentifiers': self._ListIdentifiers,
            'ListMetadataFormats': self._ListMetadataFormats,
            'ListRecords': self._ListRecords,
            'ListSets': self._ListSets
        }

        self.args = args.copy()
        self.request_timestamp = datetime.datetime.now()
        self.request_timestamp_string = self.request_timestamp.strftime(
            '%Y-%m-%dT%H:%M:%SZ')
        self.record_nodes = []

        # published dataframe slice parameters
        self.start = 0
        self.chunk_size = settings.OAI_RESPONSE_SIZE
        if 'set' in self.args.keys() and self.args['set'] != '':
            self.publish_set_id = self.args['set']
        else:
            self.publish_set_id = None

        # get instance of Published model
        self.published = PublishedRecords(subset=self.subset)

        # begin scaffolding
        self.scaffold()

    # generate XML root node with OAI-PMH scaffolding
    def scaffold(self):
        """
        Scaffold XML, OAI response

        Args:
                None

        Returns:
                None
                        - sets multiple attributes for response building
        """

        # build root node, nsmap, and attributes
        NSMAP = {None: 'http://www.openarchives.org/OAI/2.0/'}
        self.root_node = etree.Element('OAI-PMH', nsmap=NSMAP)
        self.root_node.set(
            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation',
            'http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd'
        )

        # set responseDate node
        self.responseDate_node = etree.Element('responseDate')
        self.responseDate_node.text = self.request_timestamp.strftime(
            '%Y-%m-%dT%H:%M:%SZ')
        self.root_node.append(self.responseDate_node)

        # set request node
        self.request_node = etree.Element('request')

        # set verb
        try:
            self.request_node.attrib['verb'] = self.args['verb']
        except KeyError:
            self.args['verb'] = 'NULL'
            self.request_node.attrib['verb'] = 'NULL'

        # capture set if present
        if 'set' in self.args.keys():
            self.request_node.attrib['set'] = self.args['set']

        # metadataPrefix
        if 'metadataPrefix' in self.args.keys():
            self.request_node.attrib['metadataPrefix'] = self.args[
                'metadataPrefix']

        self.request_node.text = 'http://%s%s' % (settings.APP_HOST,
                                                  reverse('oai'))
        self.root_node.append(self.request_node)

        # set verb node
        self.verb_node = etree.Element(self.args['verb'])
        self.root_node.append(self.verb_node)

    def retrieve_records(self, include_metadata=False):
        """
        Retrieve record(s) from DB for response

        Args:
                include_metadata (bool): If False, return only identifiers, if True, include record document as well

        Returns:
                None
                        - adds record(s) to self.record_nodes
        """

        stime = time.time()
        logger.debug("retrieving records for verb %s", self.args['verb'])

        # get records
        records = self.published.records

        # if set present, filter by this set
        if self.publish_set_id:
            logger.debug('applying publish_set_id filter: %s',
                         self.publish_set_id)
            records = records.filter(publish_set_id=self.publish_set_id)

        # loop through rows, limited by current OAI transaction start / chunk

        # count records before slice
        records_count = records.count()

        # get slice for iteration
        records = records[self.start:(self.start + self.chunk_size)]
        for record in records:

            record = OAIRecord(args=self.args,
                               record_id=record.record_id,
                               publish_set_id=record.publish_set_id,
                               document=record.document,
                               timestamp=self.request_timestamp_string)

            # include full metadata in record
            if include_metadata:
                record.include_metadata()

            # append to record_nodes
            self.record_nodes.append(record.oai_record_node)

        # add to verb node
        for oai_record_node in self.record_nodes:
            self.verb_node.append(oai_record_node)

        # finally, set resumption token
        self.set_resumption_token(records, completeListSize=records_count)

        # report
        record_nodes_num = len(self.record_nodes)
        logger.debug("%s record(s) returned in %s", record_nodes_num,
                     (float(time.time()) - float(stime)))

    def set_resumption_token(self, records, completeListSize=None):
        """
        Set resumption tokens in DB under OAITransaction model

        Args:
                completeListSize (int): total number of records based on passed parameters

        Returns:
                None
                        - sets attributes related to resumption tokens
        """

        # set resumption token
        if self.start + self.chunk_size < completeListSize:
            # set token and slice parameters to DB
            token = str(uuid.uuid4())
            logger.debug('setting resumption token: %s', token)
            oai_trans = OAITransaction(verb=self.args['verb'],
                                       start=self.start + self.chunk_size,
                                       chunk_size=self.chunk_size,
                                       publish_set_id=self.publish_set_id,
                                       token=token,
                                       args=json.dumps(self.args))
            oai_trans.save()

            # set resumption token node and attributes
            self.resumptionToken_node = etree.Element('resumptionToken')
            self.resumptionToken_node.attrib['expirationDate'] = (self.request_timestamp + datetime.timedelta(0, 3600))\
                .strftime('%Y-%m-%dT%H:%M:%SZ')
            self.resumptionToken_node.attrib['completeListSize'] = str(
                completeListSize)
            self.resumptionToken_node.attrib['cursor'] = str(self.start)
            self.resumptionToken_node.text = token
            self.verb_node.append(self.resumptionToken_node)

    # convenience function to run all internal methods
    def generate_response(self):
        """
        Returns OAI response as XML

        Args:
                None

        Returns:
                (str): XML response
        """

        # check verb
        if self.args['verb'] not in self.verb_routes.keys():
            return self.raise_error(
                'badVerb', 'The verb %s is not allowed, must be from: %s' %
                (self.args['verb'], str(self.verb_routes.keys())))

        # check for resumption token
        if 'resumptionToken' in self.args.keys():

            # retrieve token params and alter args and search_params
            ot_query = OAITransaction.objects.filter(
                token=self.args['resumptionToken'])
            if ot_query.count() == 1:
                ot = ot_query.first()

                # set args and start and chunk_size
                self.start = ot.start
                self.chunk_size = ot.chunk_size
                self.publish_set_id = ot.publish_set_id

                logger.debug(
                    'following resumption token, altering dataframe slice params:'
                )
                logger.debug(
                    [self.start, self.chunk_size, self.publish_set_id])

            # raise error
            else:
                return self.raise_error(
                    'badResumptionToken',
                    'The resumptionToken %s is not found' %
                    self.args['resumptionToken'])

        # fire verb response building
        self.verb_routes[self.args['verb']]()
        return self.serialize()

    def raise_error(self, error_code, error_msg):
        """
        Returns error as XML, OAI response

        Args:
                error_code (str): OAI-PMH error codes (e.g. badVerb, generic, etc.)
                error_msg (str): details about error

        Returns:
                (str): XML response
        """

        # remove verb node
        try:
            self.root_node.remove(self.verb_node)
        except Exception:
            logger.debug('verb_node not found')

        # create error node and append
        error_node = etree.SubElement(self.root_node, 'error')
        error_node.attrib['code'] = error_code
        error_node.text = error_msg

        # serialize and return
        return self.serialize()

    # serialize record nodes as XML response
    def serialize(self):
        """
        Serialize all nodes as XML for returning

        Args:
                None

        Returns:
                (str): XML response
        """

        return etree.tostring(self.root_node)

    # GetRecord
    def _GetRecord(self):
        """
        OAI-PMH verb: GetRecord
        Retrieve a single record based on record id, return

        Args:
                None

        Returns:
                None
                        sets single record node to self.record_nodes
        """

        stime = time.time()
        logger.debug("retrieving record: %s", self.args['identifier'])

        # get single row
        single_record = self.published.get_record(self.args['identifier'])

        # if single record found
        if single_record:

            # open as OAIRecord
            record = OAIRecord(args=self.args,
                               record_id=single_record.record_id,
                               document=single_record.document,
                               timestamp=self.request_timestamp_string)

            # include metadata
            record.include_metadata()

            # append to record_nodes
            self.record_nodes.append(record.oai_record_node)

            # add to verb node
            for oai_record_node in self.record_nodes:
                self.verb_node.append(oai_record_node)

        else:
            logger.debug('record not found for id: %s, not appending node',
                         self.args['identifier'])

        # report
        etime = time.time()
        logger.debug("%s record(s) returned in %sms", len(self.record_nodes),
                     (float(etime) - float(stime)) * 1000)

    # Identify
    def _Identify(self):
        """
        OAI-PMH verb: Identify
        Provide information about Repository / OAI Server

        Args:
                None

        Returns:
                None
                        sets description node text
        """

        # init OAIRecord
        logger.debug('generating identify node')

        # write Identify node
        description_node = etree.Element('description')
        desc_text = 'Combine, integrated OAI-PMH.'
        if self.subset is not None:
            desc_text += ' Note: You are receiving a published subset of this Combine instance named: %s.' % self.subset
        description_node.text = desc_text
        self.verb_node.append(description_node)

    # ListIdentifiers
    def _ListIdentifiers(self):
        """
        OAI-PMH verb: ListIdentifiers
        Lists identifiers

        Args:
                None

        Returns:
                None
                        sets multiple record nodes to self.record_nodes
        """

        self.retrieve_records()

    # ListMetadataFormats
    def _ListMetadataFormats(self):
        """
        # OAI-PMH verb: ListMetadataFormats
        # List all metadataformats, or optionally, available metadataformats for
        # one item based on published metadata formats

                NOTE: Currently, Combine does not support Metadata Formats for the outgoing OAI-PMH server.
                All published Records are undoubtedly of a metadata format, but this is opaque to Combine.  This
                may change in the future, but for now, a shim is in place to return valid OAI-PMH responses for
                the verb ListMetadataForamts
        """

        # generic metadata prefix shim
        generic_metadata_hash = {
            'prefix': 'generic',
            'schema': 'http://generic.org/schema',
            'namespace': 'gnc'
        }

        # identifier provided
        if 'identifier' in self.args.keys():

            try:
                logger.debug(
                    "identifier provided for ListMetadataFormats, confirming that identifier exists..."
                )
                single_record = self.published.get_record(
                    self.args['identifier'])

                if single_record:

                    mf_node = etree.Element('metadataFormat')

                    # write metadataPrefix node
                    prefix = etree.SubElement(mf_node, 'metadataPrefix')
                    prefix.text = generic_metadata_hash['prefix']

                    # write schema node
                    schema = etree.SubElement(mf_node, 'schema')
                    schema.text = generic_metadata_hash['schema']

                    # write metadataNamespace node
                    namespace = etree.SubElement(mf_node, 'metadataNamespace')
                    namespace.text = generic_metadata_hash['namespace']

                    # append to verb_node and return
                    self.verb_node.append(mf_node)

                else:
                    raise Exception('record could not be located')
            except:
                return self.raise_error(
                    'idDoesNotExist', 'The identifier %s is not found.' %
                    self.args['identifier'])

        # no identifier, return all available metadataPrefixes
        else:

            mf_node = etree.Element('metadataFormat')

            # write metadataPrefix node
            prefix = etree.SubElement(mf_node, 'metadataPrefix')
            prefix.text = generic_metadata_hash['prefix']

            # write schema node
            schema = etree.SubElement(mf_node, 'schema')
            schema.text = generic_metadata_hash['schema']

            # write metadataNamespace node
            namespace = etree.SubElement(mf_node, 'metadataNamespace')
            namespace.text = generic_metadata_hash['namespace']

            # append to verb_node and return
            self.verb_node.append(mf_node)

    # ListRecords
    def _ListRecords(self):
        """
        OAI-PMH verb: ListRecords
        Lists records; similar to ListIdentifiers, but includes metadata from record.document

        Args:
                None

        Returns:
                None
                        sets multiple record nodes to self.record_nodes
        """

        self.retrieve_records(include_metadata=True)

    # ListSets
    def _ListSets(self):
        """
        OAI-PMH verb: ListSets
        Lists available sets.  Sets are derived from the publish_set_id from a published Job

        Args:
                None

        Returns:
                None
                        sets multiple set nodes
        """

        # generate response
        for publish_set_id in self.published.sets:
            set_node = etree.Element('set')
            setSpec = etree.SubElement(set_node, 'setSpec')
            setSpec.text = publish_set_id
            setName = etree.SubElement(set_node, 'setName')
            setName.text = publish_set_id
            self.verb_node.append(set_node)
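Since the request node built in scaffold() points back to reverse('oai'), a Django view presumably sits behind that URL name. A minimal sketch of such an endpoint follows; everything about the view body other than the OAIProvider call chain is an assumption.

# Hypothetical OAI endpoint view (an assumption; only the OAIProvider API is from the example)
from django.http import HttpResponse

def oai(request, subset=None):
    provider = OAIProvider(request.GET.dict(), subset=subset)
    return HttpResponse(provider.generate_response(), content_type='text/xml')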