Beispiel #1
0
def job_harvest_tabular_data(request,
                             org_id,
                             record_group_id,
                             hash_payload_filename=False):
    """
    Create a new tabular data Harvest Job.

    GET renders the job configuration form. POST initializes a
    HarvestTabularDataJob from the submitted form data and uploaded files,
    starts it, and redirects to the Record Group page.

    Args:
        request: incoming HTTP request
        org_id: Organization id, used only to build the POST redirect
        record_group_id: id of the RecordGroup the new Job is created in
        hash_payload_filename: passed through unchanged to
            CombineJob.init_combine_job

    Returns:
        The rendered form for GET, a redirect to 'record_group' for POST.
        Any other method falls through and returns None.
    """

    # retrieve record group
    # NOTE(review): .first() yields None when no RecordGroup matches, and the
    # POST branch dereferences record_group.id after the job has been started
    # -- confirm callers always pass a valid id.
    record_group = RecordGroup.objects.filter(id=record_group_id).first()

    # if GET, prepare form
    if request.method == 'GET':
        # the option lists below are only needed to populate the form, so
        # they are built here rather than on every request
        return render(
            request, 'core/job_harvest_tabular_data.html', {
                'record_group': record_group,
                'validation_scenarios': ValidationScenario.objects.all(),
                'rits': RecordIdentifierTransformation.objects.all(),
                'field_mappers': FieldMapper.objects.all(),
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'bulk_downloads': DPLABulkDataDownload.objects.all(),
                'breadcrumbs': breadcrumb_parser(request)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=HarvestTabularDataJob,
            job_params=request.POST,
            files=request.FILES,
            hash_payload_filename=hash_payload_filename)

        # start job and update status
        job_status = cjob.start_job()

        # an explicit False from start_job marks the job as failed; identity
        # comparison keeps this in line with the other job views
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('record_group',
                        org_id=org_id,
                        record_group_id=record_group.id)
Beispiel #2
0
def published(request, subset=None):
    """
    Render the published records page.

    Args:
        request: incoming HTTP request
        subset: optional published subset name, handed to PublishedRecords

    Returns:
        Rendered 'core/published.html' response.
    """

    # published records handle, optionally scoped to a subset
    pub_records = PublishedRecords(subset=subset)

    # field counts are only computed when at least one record is published
    if pub_records.records.count() > 0:
        field_counts = pub_records.count_indexed_fields()
    else:
        field_counts = {}

    field_mappers = FieldMapper.objects.all()

    # enrich every known subset with its field counts
    subsets = PublishedRecords.get_subsets()
    for published_subset in subsets:
        subset_name = published_subset['name']

        # look for previously stored counts first
        subset_counts = mc_handle.combine.misc.find_one(
            {'_id': 'published_field_counts_%s' % subset_name})

        # nothing stored yet: calculate now
        if subset_counts is None:
            subset_counts = PublishedRecords(
                subset=subset_name).count_indexed_fields()

        published_subset['counts'] = subset_counts

    # job hierarchy for the whole instance
    job_hierarchy = _stateio_prepare_job_hierarchy()

    context = {
        'published': pub_records,
        'field_mappers': field_mappers,
        'xml2kvp_handle': xml2kvp.XML2kvp(),
        'field_counts': field_counts,
        'es_index_str': pub_records.esi.es_index_str,
        'subsets': subsets,
        'job_hierarchy_json': json.dumps(job_hierarchy),
        # falls back to an empty hierarchy when pub_records has no ps_doc
        'job_hierarchy_json_subset': json.dumps(
            getattr(pub_records, 'ps_doc', {}).get('hierarchy', [])
        ),
        'breadcrumbs': breadcrumb_parser(request)
    }
    return render(request, 'core/published.html', context)
Beispiel #3
0
def job_analysis(request):
    """
    Run new analysis job

    POST initializes and starts an AnalysisJob from the submitted form and
    redirects to 'analysis'. GET renders the configuration form, limiting
    the selectable input jobs to published jobs when ?type=published is
    given (optionally narrowed by ?subset=).
    """

    # POST: submit the job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            # TODO: record_group=record_group,
            job_type_class=AnalysisJob,
            job_params=request.POST)

        # start job; an explicit False means it could not be started
        if cjob.start_job() is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('analysis')

    # GET: prepare the form
    if request.method == 'GET':

        analysis_type = request.GET.get('type')
        subset = request.GET.get('subset')

        if analysis_type == 'published':
            # restrict input jobs to those behind the published records
            published = PublishedRecords(subset=subset)
            input_jobs = published.published_jobs
        else:
            # no restriction: every job may serve as input
            published = None
            input_jobs = Job.objects.all()

        # lineage is computed for the selected input jobs only
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        context = {
            'job_select_type': 'multiple',
            'input_jobs': input_jobs,
            'published': published,
            'validation_scenarios': ValidationScenario.objects.all(),
            'rits': RecordIdentifierTransformation.objects.all(),
            'field_mappers': FieldMapper.objects.all(),
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'analysis_type': analysis_type,
            'bulk_downloads': DPLABulkDataDownload.objects.all(),
            'job_lineage_json': json.dumps(job_lineage)
        }
        return render(request, 'core/job_analysis.html', context)
Beispiel #4
0
def job_merge(request, org_id, record_group_id):
    """
    Merge multiple jobs into a single job.

    GET renders the merge form; the selectable input jobs are either all
    non-Analysis jobs (?scope=all_jobs) or the jobs of this Record Group.
    POST initializes and starts a MergeJob from the submitted form and
    redirects to the Record Group page.

    Args:
        request: incoming HTTP request
        org_id: Organization id, used only to build the POST redirect
        record_group_id: primary key of the RecordGroup the Job is created in

    Returns:
        The rendered form for GET, a redirect to 'record_group' for POST.
        Any other method falls through and returns None.
    """

    # retrieve record group
    record_group = RecordGroup.objects.get(pk=record_group_id)

    # if GET, prepare form
    if request.method == 'GET':

        # get scope of input jobs and retrieve
        input_job_scope = request.GET.get('scope', None)

        if input_job_scope == 'all_jobs':
            # every job except Analysis jobs
            input_jobs = Job.objects.exclude(
                job_type='AnalysisJob').all()
        else:
            # limit to this RecordGroup
            input_jobs = record_group.job_set.all()

        # get job lineage for all jobs (filtered to input jobs scope)
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        # render page
        return render(request, 'core/job_merge.html', {
            'job_select_type': 'multiple',
            'record_group': record_group,
            'input_jobs': input_jobs,
            'input_job_scope': input_job_scope,
            'validation_scenarios': ValidationScenario.objects.all(),
            'rits': RecordIdentifierTransformation.objects.all(),
            'field_mappers': FieldMapper.objects.all(),
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'job_lineage_json': json.dumps(job_lineage),
            'bulk_downloads': DPLABulkDataDownload.objects.all(),
            'breadcrumbs': breadcrumb_parser(request)
        })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=MergeJob,
            job_params=request.POST)

        # start job and update status
        job_status = cjob.start_job()

        # an explicit False from start_job marks the job as failed; identity
        # comparison keeps this in line with the other job views
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('record_group', org_id=org_id, record_group_id=record_group.id)
Beispiel #5
0
def job_details(request, org_id, record_group_id, job_id):
    """
    Render the details page for a single Job.

    Args:
        request: incoming HTTP request; an optional ?q= value is passed
            straight through to the template
        org_id: Organization id from the URL (not used in the body)
        record_group_id: Record Group id from the URL (not used in the body)
        job_id: id of the Job to display

    Returns:
        Rendered 'core/job_details.html' response.
    """
    LOGGER.debug('details for job id: %s', job_id)

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # update status
    cjob.job.update_status()

    # detailed record count
    record_count_details = cjob.job.get_detailed_job_record_count()

    # get job lineage
    job_lineage = cjob.job.get_lineage()

    # get dpla_bulk_data_match
    dpla_bulk_data_matches = cjob.job.get_dpla_bulk_data_matches()

    # check if limiting to one, pre-existing record
    get_q = request.GET.get('q', None)

    # job details and job type specific augment
    job_detail = cjob.job.job_details_dict

    # mapped field analysis: reuse what job_details already holds, otherwise
    # generate it once the job has finished
    if 'mapped_field_analysis' in job_detail:
        field_counts = job_detail['mapped_field_analysis']
    elif cjob.job.finished:
        field_counts = cjob.count_indexed_fields()
        # hand the fresh counts back to the job (save=True) so that they are
        # presumably found in job_details_dict on the next request
        cjob.job.update_job_details(
            {'mapped_field_analysis': field_counts}, save=True)
    else:
        LOGGER.debug('job not finished, not setting')
        field_counts = {}

    # get published records, primarily for published sets
    pub_records = PublishedRecords()

    oai_sets = Record.objects(job_id=cjob.job.id).item_frequencies(field='oai_set')

    # get published subsets with PublishedRecords static method
    published_subsets = PublishedRecords.get_subsets()

    # loop through subsets and enrich
    for published_subset in published_subsets:
        subset_name = published_subset['name']

        # look for previously stored counts first
        counts = mc_handle.combine.misc.find_one(
            {'_id': 'published_field_counts_%s' % subset_name})

        # if counts not yet calculated, do now
        if counts is None:
            counts = PublishedRecords(
                subset=subset_name).count_indexed_fields()
        published_subset['counts'] = counts

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # return
    return render(request, 'core/job_details.html', {
        'cjob': cjob,
        'record_group': cjob.job.record_group,
        'record_count_details': record_count_details,
        'field_counts': field_counts,
        'field_mappers': field_mappers,
        'xml2kvp_handle': xml2kvp.XML2kvp(),
        'job_lineage_json': json.dumps(job_lineage),
        'dpla_bulk_data_matches': dpla_bulk_data_matches,
        'q': get_q,
        'job_details': job_detail,
        'pr': pub_records,
        'published_subsets': published_subsets,
        'es_index_str': cjob.esi.es_index_str,
        'breadcrumbs': breadcrumb_parser(request),
        'oai_sets': dict(oai_sets)
    })
Beispiel #6
0
def _redirect_to_job_details(cjob):
    """
    Redirect to the details page of the Job wrapped by cjob, taking the
    Organization and Record Group ids from the Job itself.
    """
    return redirect('job_details',
                    org_id=cjob.job.record_group.organization.id,
                    record_group_id=cjob.job.record_group.id,
                    job_id=cjob.job.id)


def job_update(request, org_id, record_group_id, job_id):
    """
    Update Job in one of several ways:
        - re-map and index
        - run new / different validations
        - remove a validation
        - run a DPLA Bulk Data comparison
        - set the publish set id

    GET renders the update form. POST dispatches on the 'update_type' form
    field ('reindex', 'validations', 'remove_validation', 'dbdm',
    'publish_set') and redirects to the Job details page.

    Args:
        request: incoming HTTP request
        org_id: Organization id, used for the 'publish_set' redirect
        record_group_id: Record Group id, used for the 'publish_set' redirect
        job_id: id of the Job to update

    Returns:
        The rendered form for GET, a redirect for a recognized POST
        update_type. Anything else falls through and returns None.
    """

    # retrieve job (shared by the GET and POST branches)
    cjob = CombineJob.get_combine_job(int(job_id))

    # if GET, prepare form
    if request.method == 'GET':
        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()
        orig_fm_config_json = cjob.job.get_fm_config_json()

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # get update type from GET params
        update_type = request.GET.get('update_type', None)

        # render page
        return render(request, 'core/job_update.html', {
            'cjob': cjob,
            'update_type': update_type,
            'validation_scenarios': validation_scenarios,
            'field_mappers': field_mappers,
            'bulk_downloads': bulk_downloads,
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'orig_fm_config_json': orig_fm_config_json,
            'breadcrumbs': breadcrumb_parser(request)
        })

    # if POST, submit job
    if request.method == 'POST':

        LOGGER.debug('updating job')
        LOGGER.debug(request.POST)

        # get update type
        update_type = request.POST.get('update_type', None)
        LOGGER.debug('running job update: %s', update_type)

        # NOTE(review): the messages below interpolate job / scenario names
        # into HTML without escaping -- confirm whether GlobalMessageClient or
        # the rendering template escapes them.

        # handle re-index
        if update_type == 'reindex':
            # get preferred metadata index mapper
            fm_config_json = request.POST.get('fm_config_json')

            # init re-index
            cjob.reindex_bg_task(fm_config_json=fm_config_json)

            # set gms
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Re-Indexing Job:</strong><br>%s</p>'
                        '<p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, reverse('bg_tasks')),
                'class': 'success'
            })

            return _redirect_to_job_details(cjob)

        # handle new validations
        if update_type == 'validations':
            # get requested validation scenarios
            validation_scenarios = request.POST.getlist(
                'validation_scenario', [])

            # get validations
            validations = ValidationScenario.objects.filter(
                id__in=[int(vs_id) for vs_id in validation_scenarios])

            # init bg task
            cjob.new_validations_bg_task([vs.id for vs in validations])

            # set gms
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Running New Validations for Job:</strong><br>%s<br>'
                        '<br><strong>Validation Scenarios:</strong><br>%s</p>'
                        '<p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, '<br>'.join([vs.name for vs in validations]), reverse('bg_tasks')),
                'class': 'success'
            })

            return _redirect_to_job_details(cjob)

        # handle validation removal
        if update_type == 'remove_validation':
            # get id of the JobValidation to remove
            jv_id = request.POST.get('jv_id', False)

            # resolve the scenario BEFORE dispatching the task: a missing or
            # invalid id then fails without queuing anything, and the row is
            # read before the task (presumably asynchronous) can remove it
            validation_scenario = JobValidation.objects.get(
                pk=int(jv_id)).validation_scenario

            # initiate Combine BG Task
            cjob.remove_validation_bg_task(jv_id)

            # set gms
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Removing Validation for Job:</strong><br>%s<br><br>'
                        '<strong>Validation Scenario:</strong><br>%s</p><p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, validation_scenario.name, reverse('bg_tasks')),
                'class': 'success'
            })

            return _redirect_to_job_details(cjob)

        # handle DPLA Bulk Data comparison
        if update_type == 'dbdm':
            # get id of the DPLA Bulk Data Download to compare against
            dbdd_id = request.POST.get('dbdd', False)

            # resolve the download BEFORE dispatching the task so a missing
            # or invalid id fails without queuing anything
            dbdd = DPLABulkDataDownload.objects.get(pk=int(dbdd_id))

            # initiate Combine BG Task
            cjob.dbdm_bg_task(dbdd_id)

            # set gms
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Running DPLA Bulk Data comparison for Job:</strong><br>%s<br><br>'
                        '<strong>Bulk Data S3 key:</strong><br>%s</p><p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, dbdd.s3_key, reverse('bg_tasks')),
                'class': 'success'
            })

            return _redirect_to_job_details(cjob)

        # handle publish set id change
        if update_type == 'publish_set':
            update_body = request.POST
            # a non-empty 'existing_publish_set_id' is applied last and so
            # wins over 'publish_set_id' when both are submitted
            if update_body.get('publish_set_id', None):
                cjob.job.publish_set_id = update_body['publish_set_id']
            if update_body.get('existing_publish_set_id', None):
                cjob.job.publish_set_id = update_body['existing_publish_set_id']
            redirect_anchor = update_body.get('redirect_anchor', '')
            cjob.job.save()
            return redirect(reverse('job_details', args=[org_id, record_group_id, job_id]) + redirect_anchor)