Example #1
0
def test_static_harvest_reindex(VO):
    '''
    Reindex the static harvest Job with a field mapper configuration that
    adds the literal foo:bar, then assert that field analysis for `foo`
    reports 250 document instances.
    '''

    # reload the CombineJob wrapper for the static harvest job
    harvest_job_id = VO.static_harvest_cjob.job.id
    VO.static_harvest_cjob = CombineJob.get_combine_job(harvest_job_id)

    # field mapper configuration; note add_literals carries {"foo":"bar"}
    fm_config_json = '{"concat_values_on_all_fields": false, "capture_attribute_values": [], "remove_ns_prefix": true, "skip_attribute_ns_declarations": true, "remove_copied_key": true, "node_delim": "_", "copy_to": {}, "copy_value_to_regex": {}, "copy_to_regex": {}, "split_values_on_all_fields": false, "add_literals": {"foo":"bar"}, "exclude_attributes": [], "ns_prefix_delim": "|", "self_describing": false, "split_values_on_fields": {}, "include_attributes": [], "include_sibling_id": false, "multivalue_delim": "|", "skip_repeating_values": true, "repeating_element_suffix_count": false, "exclude_elements": [], "concat_values_on_fields": {}, "remove_copied_value": false, "error_on_delims_collision": false, "include_all_attributes": false, "skip_root": false}'

    # kick off reindexing as a background task
    bg_task = VO.static_harvest_cjob.reindex_bg_task(
        fm_config_json=fm_config_json)

    # poll once per second, for at most 480 iterations
    finished_states = ('SUCCESS', 'FAILURE')
    for elapsed in range(480):
        time.sleep(1)
        LOGGER.debug('polling for reindexing %s seconds...' % elapsed)

        # refresh the task, and stop polling once celery reports a final state
        bg_task.update()
        if bg_task.celery_status in finished_states:
            break

    # successful reindexing is indicated by 250 records having the foo field
    results = VO.static_harvest_cjob.field_analysis('foo')
    assert results['metrics']['doc_instances'] == 250
Example #2
0
def job_publish(request, org_id, record_group_id, job_id):
    """
    Publish a Job as a background task, then redirect to its Record Group.

    The publish set id is taken from the POSTed `publish_set_id`, unless
    `existing_publish_set_id` is also present in the POST, in which case
    that value is used instead.

    Args:
        request: Django request; POST may carry `publish_set_id`,
            `existing_publish_set_id` and a list of `published_subsets`
        org_id: organization id from the URL (not used here; the redirect
            is built from the Job's own Record Group)
        record_group_id: record group id from the URL (not used here)
        job_id: id of the Job to publish

    Returns:
        Redirect to the `record_group` view for the Job's Record Group
    """
    # function-scope import so this view brings its own dependency
    from html import escape

    LOGGER.debug(request.POST)

    # capture entered publish set id
    publish_set_id = request.POST.get('publish_set_id', None)

    # override with the pre-existing publish set id when one was submitted
    if request.POST.get('existing_publish_set_id', None) is not None:
        publish_set_id = request.POST.get('existing_publish_set_id')

    # get published subsets to include the Job in
    published_subsets = request.POST.getlist('published_subsets', [])

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # init publish
    cjob.publish_bg_task(
        publish_set_id=publish_set_id,
        in_published_subsets=published_subsets)

    # set gms; the job name and publish set id are user-supplied text that is
    # placed into an HTML message, so escape them before interpolation
    # (str() mirrors what '%s' did, so None still renders as 'None')
    gmc = GlobalMessageClient(request.session)
    gmc.add_gm({
        'html': '<p><strong>Publishing Job:</strong><br>%s<br><br><strong>Publish Set ID:</strong><br>%s</p><p><a href="%s"><button type="button" class="btn btn-outline-primary btn-sm">View Published Records</button></a></p>' % (
            escape(str(cjob.job.name)),
            escape(str(publish_set_id)),
            reverse('published')),
        'class': 'success'
    })

    return redirect('record_group',
                    org_id=cjob.job.record_group.organization.id,
                    record_group_id=cjob.job.record_group.id)
Example #3
0
def job_harvest_tabular_data(request,
                             org_id,
                             record_group_id,
                             hash_payload_filename=False):
    """
    Create a new tabular data Harvest Job

    GET renders the `core/job_harvest_tabular_data.html` form; POST
    initializes a HarvestTabularDataJob from the submitted form and files,
    starts it, and redirects to the Record Group page. Any other HTTP
    method falls through and returns None.

    Args:
        request: Django request
        org_id: organization id, used for the redirect after POST
        record_group_id: id of the Record Group the Job is created in
        hash_payload_filename: passed through unchanged to
            CombineJob.init_combine_job

    Returns:
        Rendered form for GET, redirect to `record_group` for POST
    """

    # retrieve record group (None when no Record Group has this id)
    record_group = RecordGroup.objects.filter(id=record_group_id).first()

    # get validation scenarios
    validation_scenarios = ValidationScenario.objects.all()

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # get record identifier transformation scenarios
    rits = RecordIdentifierTransformation.objects.all()

    # get all bulk downloads
    bulk_downloads = DPLABulkDataDownload.objects.all()

    # if GET, prepare form
    if request.method == 'GET':
        # render page
        return render(
            request, 'core/job_harvest_tabular_data.html', {
                'record_group': record_group,
                'validation_scenarios': validation_scenarios,
                'rits': rits,
                'field_mappers': field_mappers,
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'bulk_downloads': bulk_downloads,
                'breadcrumbs': breadcrumb_parser(request)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=HarvestTabularDataJob,
            job_params=request.POST,
            files=request.FILES,
            hash_payload_filename=hash_payload_filename)

        # start job and update status
        job_status = cjob.start_job()

        # if start_job explicitly returned False, record the Job as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('record_group',
                        org_id=org_id,
                        record_group_id=record_group.id)
Example #4
0
def clone_jobs(request):
    """
    Queue a background task that clones the Jobs listed in the POST.

    Reads `job_ids[]`, `downstream_clone_toggle` and `rerun_on_clone` from
    the POST, saves a CombineBackgroundTask of type `clone_jobs`, fires the
    celery task, and leaves a global message for the user.

    Returns:
        JsonResponse `{'results': True}`; the page is requested via Ajax
    """
    LOGGER.debug('cloning jobs')

    job_ids = request.POST.getlist('job_ids[]')

    # the front-end sends the flags as the strings 'true' / 'false'; any
    # other value (including the False default) is passed through untouched
    flag_values = {'true': True, 'false': False}

    downstream_toggle = request.POST.get('downstream_clone_toggle', False)
    downstream_toggle = flag_values.get(downstream_toggle, downstream_toggle)

    rerun_on_clone = request.POST.get('rerun_on_clone', False)
    rerun_on_clone = flag_values.get(rerun_on_clone, rerun_on_clone)

    # collect the distinct Jobs requested, then order them by id
    jobs_to_clone = {
        CombineJob.get_combine_job(job_id).job for job_id in job_ids
    }
    ordered_jobs = sorted(jobs_to_clone, key=lambda job: job.id)

    # persist the Combine background task with its parameters
    task_params = {
        'ordered_job_clone_set': [job.id for job in ordered_jobs],
        'downstream_toggle': downstream_toggle,
        'rerun_on_clone': rerun_on_clone
    }
    combine_task = CombineBackgroundTask(
        name="Clone Jobs",
        task_type='clone_jobs',
        task_params_json=json.dumps(task_params))
    combine_task.save()

    # fire the celery task and remember its id on the Combine task
    bg_task = tasks.clone_jobs.delay(combine_task.id)
    LOGGER.debug('firing bg task: %s', bg_task)
    combine_task.celery_task_id = bg_task.task_id
    combine_task.save()

    # global message listing the Jobs being cloned
    gmc = GlobalMessageClient(request.session)
    job_names_html = '<br>'.join(str(job.name) for job in ordered_jobs)
    message_html = '<strong>Cloning Job(s):</strong><br>%s<br><br>Including downstream? <strong>%s</strong><br><br>Refresh this page to update status of Jobs cloning. <button class="btn-sm btn-outline-primary" onclick="location.reload();">Refresh</button>' % (
        job_names_html, downstream_toggle)
    gmc.add_gm({'html': message_html, 'class': 'success'})

    # requested via Ajax, which will reload the page
    return JsonResponse({'results': True})
Example #5
0
def job_indexing_failures(request, org_id, record_group_id, job_id):
    """
    Render the indexing failures page for a single Job.

    `org_id` and `record_group_id` come from the URL and are not used here.
    """
    # template context: the CombineJob wrapper plus breadcrumbs
    context = {
        'cjob': CombineJob.get_combine_job(job_id),
        'breadcrumbs': breadcrumb_parser(request)
    }
    return render(request, 'core/job_indexing_failures.html', context)
Example #6
0
def stop_jobs(request):
    """
    Stop the Jobs listed in the POST, optionally with their downstream Jobs.

    Reads `job_ids[]` and `downstream_stop_toggle` from the POST, calls
    `stop_job()` on each selected Job in ascending id order, and leaves a
    global message naming the stopped Jobs.

    Returns:
        JsonResponse `{'results': True}`
    """
    LOGGER.debug('stopping jobs')

    job_ids = request.POST.getlist('job_ids[]')
    LOGGER.debug(job_ids)

    # get downstream toggle; sent as the strings 'true' / 'false'
    downstream_toggle = request.POST.get('downstream_stop_toggle', False)
    if downstream_toggle == 'true':
        downstream_toggle = True
    elif downstream_toggle == 'false':
        downstream_toggle = False

    # set of jobs to stop
    job_stop_set = set()

    # loop through job_ids
    for job_id in job_ids:

        # get CombineJob
        cjob = CombineJob.get_combine_job(job_id)

        if downstream_toggle:
            # add the downstream lineage for this job to the set
            # NOTE(review): presumably includes the job itself by default —
            # confirm against Job.get_downstream_jobs
            job_stop_set.update(cjob.job.get_downstream_jobs())
        else:
            # just this job
            job_stop_set.add(cjob.job)

    # order by id before stopping
    ordered_job_stop_set = sorted(job_stop_set, key=lambda j: j.id)

    # loop through and stop each Job
    for job in ordered_job_stop_set:
        LOGGER.debug('stopping Job: %s', job)
        job.stop_job()

    # set gms; a Job name may be None, so coerce with str() before joining
    # rather than failing after the Jobs have already been stopped
    gmc = GlobalMessageClient(request.session)
    gmc.add_gm({
        'html':
        '<p><strong>Stopped Job(s):</strong><br>%s</p>' %
        ('<br>'.join([str(j.name) for j in ordered_job_stop_set])),
        'class':
        'danger'
    })

    # return
    return JsonResponse({'results': True})
Example #7
0
def job_validation_scenario_failures(request, org_id, record_group_id, job_id, job_validation_id):
    """
    Render the failures page for one validation scenario run against a Job.

    `org_id` and `record_group_id` come from the URL and are not used here.
    """
    # the Job wrapper, and the JobValidation row identified by its pk
    cjob = CombineJob.get_combine_job(job_id)
    job_validation = JobValidation.objects.get(pk=int(job_validation_id))

    context = {
        'cjob': cjob,
        'jv': job_validation,
        'breadcrumbs': breadcrumb_parser(request)
    }
    return render(request, 'core/job_validation_scenario_failures.html',
                  context)
Example #8
0
def rerun_jobs(request):
    """
    Rerun the Jobs listed in the POST, optionally widening the selection to
    their downstream and/or upstream Jobs.

    Returns:
        JsonResponse `{'results': True}`; the page is requested via Ajax
    """
    LOGGER.debug('re-running jobs')

    # selected job ids
    job_ids = request.POST.getlist('job_ids[]')

    # lineage toggles, converted from their string form
    downstream_toggle = bool_for_string(
        request.POST.get('downstream_rerun_toggle', False))
    upstream_toggle = bool_for_string(
        request.POST.get('upstream_rerun_toggle', False))

    # gather the distinct Jobs to rerun
    job_rerun_set = set()
    for job_id in job_ids:
        job = CombineJob.get_combine_job(job_id).job

        # related Jobs are added without the Job itself ...
        if downstream_toggle:
            job_rerun_set.update(job.get_downstream_jobs(include_self=False))
        if upstream_toggle:
            job_rerun_set.update(job.get_upstream_jobs(include_self=False))

        # ... because the selected Job is always added here
        job_rerun_set.add(job)

    # order by id and hand off to the rerun task
    ordered_jobs = sorted(job_rerun_set, key=lambda job: job.id)
    tasks.rerun_jobs(ordered_jobs)

    # global message listing the Jobs queued for rerun
    gmc = GlobalMessageClient(request.session)
    job_names_html = '<br>'.join(str(job.name) for job in ordered_jobs)
    message_html = (
        '<strong>Preparing to Rerun Job(s):</strong><br>%s<br><br>Refresh this page to update status of Jobs rerunning. <button class="btn-sm btn-outline-primary" onclick="location.reload();">Refresh</button>'
        % job_names_html)
    gmc.add_gm({'html': message_html, 'class': 'success'})

    # requested via Ajax, which will reload the page
    return JsonResponse({'results': True})
Example #9
0
def job_errors(request, org_id, record_group_id, job_id):
    """
    Render the errors page for a single Job.

    `org_id` and `record_group_id` come from the URL and are not used here.
    """
    LOGGER.debug('retrieving errors for job id: %s', job_id)

    # Job wrapper and the errors it reports
    cjob = CombineJob.get_combine_job(job_id)
    context = {
        'cjob': cjob,
        'job_errors': cjob.get_job_errors(),
        'breadcrumbs': breadcrumb_parser(request)
    }
    return render(request, 'core/job_errors.html', context)
Example #10
0
def move_jobs(request):
    """
    Move the Jobs listed in the POST to another Record Group, optionally
    with their downstream Jobs.

    Reads `job_ids[]`, `record_group_id` and `downstream_move_toggle` from
    the POST and re-parents each selected Job in ascending id order.

    Returns:
        JsonResponse `{'results': True}`

    Raises:
        IndexError: when `record_group_id` is absent from the POST
        RecordGroup.DoesNotExist: when there are Jobs to move and the target
            Record Group does not exist
    """
    LOGGER.debug('moving jobs')

    job_ids = request.POST.getlist('job_ids[]')
    record_group_id = request.POST.getlist('record_group_id')[0]

    # get downstream toggle; sent as the strings 'true' / 'false'
    downstream_toggle = request.POST.get('downstream_move_toggle', False)
    if downstream_toggle == 'true':
        downstream_toggle = True
    elif downstream_toggle == 'false':
        downstream_toggle = False

    # set of jobs to move
    job_move_set = set()

    # loop through job_ids
    for job_id in job_ids:

        # get CombineJob
        cjob = CombineJob.get_combine_job(job_id)

        if downstream_toggle:
            # add the downstream lineage for this job to the set
            job_move_set.update(cjob.job.get_downstream_jobs())
        else:
            # just this job
            job_move_set.add(cjob.job)

    # order by id before moving
    ordered_job_move_set = sorted(job_move_set, key=lambda j: j.id)

    # nothing selected: respond without touching the database
    if not ordered_job_move_set:
        return JsonResponse({'results': True})

    # the target Record Group is the same for every Job, so fetch it once
    # instead of once per loop iteration
    new_record_group = RecordGroup.objects.get(pk=record_group_id)

    # loop through jobs
    for job in ordered_job_move_set:
        LOGGER.debug('moving Job: %s', job)

        job.record_group = new_record_group
        job.save()

        LOGGER.debug('Job %s has been moved', job)

    # report success to the Ajax caller
    return JsonResponse({'results': True})
def bg_task(request, task_id):
    """
    Render the detail page for a single Combine background task.

    When the task parameters mention a `job_id`, the matching CombineJob is
    added to the template context; otherwise `cjob` is None.
    """
    # look the task up by primary key
    combine_task = CombineBackgroundTask.objects.get(pk=int(task_id))
    LOGGER.debug('retrieving task: %s', combine_task)

    # include the Job only if the task params reference one
    cjob = (CombineJob.get_combine_job(combine_task.task_params['job_id'])
            if 'job_id' in combine_task.task_params else None)

    context = {
        'ct': combine_task,
        'cjob': cjob,
        'breadcrumbs': breadcrumb_parser(request)
    }
    return render(request, 'core/bg_task.html', context)
Example #12
0
def job_update_name(request, org_id, record_group_id, job_id):
    """
    Update a Job's name from the POSTed `job_name`, then redirect back to
    the referring page.

    An empty `job_name` is stored as None. Only POST is handled; any other
    method returns None.
    """
    if request.method != 'POST':
        return None

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # submitted name, with the empty string normalized to None
    submitted_name = request.POST.get('job_name')
    cjob.job.name = None if submitted_name == '' else submitted_name
    cjob.job.save()

    # send the user back to where they came from
    return redirect(request.META.get('HTTP_REFERER'))
Example #13
0
def job_unpublish(request, org_id, record_group_id, job_id):
    """
    Unpublish a Job as a background task, then redirect to its Record Group.

    `org_id` and `record_group_id` come from the URL; the redirect is built
    from the Job's own Record Group instead.
    """
    # fetch the Job wrapper and start unpublishing
    cjob = CombineJob.get_combine_job(job_id)
    cjob.unpublish_bg_task()

    # global message with a link to the published records page
    gmc = GlobalMessageClient(request.session)
    message_html = '<p><strong>Unpublishing Job:</strong><br>%s</p><p><a href="%s"><button type="button" class="btn btn-outline-primary btn-sm">View Published Records</button></a></p>' % (
        cjob.job.name, reverse('published'))
    gmc.add_gm({'html': message_html, 'class': 'success'})

    record_group = cjob.job.record_group
    return redirect('record_group',
                    org_id=record_group.organization.id,
                    record_group_id=record_group.id)
Example #14
0
def job_parameters(request, org_id, record_group_id, job_id):
    """
    Read or update a Job's parameters.

    GET returns the Job's `job_details_dict` as JSON. POST stores the
    submitted `job_details_json` (when present) on the Job and returns a
    confirmation message. Any other method returns None.
    """
    cjob = CombineJob.get_combine_job(job_id)
    method = request.method

    # GET: hand the current details back as JSON
    if method == 'GET':
        return JsonResponse(cjob.job.job_details_dict)

    if method != 'POST':
        return None

    # POST: overwrite the stored details only when a value was submitted
    job_details_json = request.POST.get('job_details_json', None)
    if job_details_json is not None:
        cjob.job.job_details = job_details_json
        cjob.job.save()

    return JsonResponse({"msg": "Job Parameters updated!"})
Example #15
0
def job_analysis(request):
    """
    Run new analysis job

    GET renders the analysis form; with `?type=published` the input Jobs
    are limited to the published Jobs (optionally for `?subset=`). POST
    initializes an AnalysisJob from the form, starts it, and redirects to
    the `analysis` view.
    """

    # GET: prepare and render the form
    if request.method == 'GET':

        analysis_type = request.GET.get('type', None)
        subset = request.GET.get('subset', None)

        # choose the pool of input jobs
        if analysis_type == 'published':
            published = PublishedRecords(subset=subset)
            input_jobs = published.published_jobs
        else:
            published = None
            input_jobs = Job.objects.all()

        # supporting objects for the form
        validation_scenarios = ValidationScenario.objects.all()
        field_mappers = FieldMapper.objects.all()
        rits = RecordIdentifierTransformation.objects.all()

        # lineage restricted to the input jobs in scope
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        bulk_downloads = DPLABulkDataDownload.objects.all()

        context = {
            'job_select_type': 'multiple',
            'input_jobs': input_jobs,
            'published': published,
            'validation_scenarios': validation_scenarios,
            'rits': rits,
            'field_mappers': field_mappers,
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'analysis_type': analysis_type,
            'bulk_downloads': bulk_downloads,
            'job_lineage_json': json.dumps(job_lineage)
        }
        return render(request, 'core/job_analysis.html', context)

    # POST: create and start the job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            # TODO: record_group=record_group,
            job_type_class=AnalysisJob,
            job_params=request.POST)

        # an explicit False from start_job marks the job as failed
        if cjob.start_job() is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('analysis')
Example #16
0
def export_tabular_data(request, export_source=None, job_id=None, subset=None):
    """
    Queue a background export of tabular data for a Job or for published
    records.

    Args:
        request: Django request; POST may carry `records_per_file`,
            `tabular_data_export_type`, `archive_type` and
            `fm_export_config_json`
        export_source: 'job' or 'published'; any other value does nothing
            and the view returns None
        job_id: id of the Job to export (used when export_source is 'job')
        subset: published subset (used when export_source is 'published')

    Returns:
        Redirect to `job_details` for a Job export, or to `published` for a
        published records export
    """
    post = request.POST

    # records per file defaults to 500 when missing or blank
    records_per_file = post.get('records_per_file', False)
    if records_per_file in ('', False):
        records_per_file = 500

    # remaining export options, passed through to the task as submitted
    tabular_data_export_type = post.get('tabular_data_export_type')
    archive_type = post.get('archive_type')
    fm_export_config_json = post.get('fm_export_config_json')

    # message template shared by both export sources
    gm_template = (
        '<p><strong>Exporting Tabular Data for %s</p><p><a href="%s"><button type="button" '
        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>'
    )

    # export for single job
    if export_source == 'job':
        LOGGER.debug('exporting tabular data from Job')

        cjob = CombineJob.get_combine_job(int(job_id))

        # save the Combine background task
        task_name = 'Export Tabular Data for Job: %s' % cjob.job.name
        task_params = {
            'job_id': cjob.job.id,
            'records_per_file': int(records_per_file),
            'tabular_data_export_type': tabular_data_export_type,
            'archive_type': archive_type,
            'fm_export_config_json': fm_export_config_json
        }
        combine_task = CombineBackgroundTask(
            name=task_name,
            task_type='export_tabular_data',
            task_params_json=json.dumps(task_params))
        combine_task.save()

        # apply export output configuration
        combine_task = _handle_export_output(request, export_source,
                                             combine_task)

        # fire the celery task and record its id
        background_task = tasks.export_tabular_data.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # global message
        gmc = GlobalMessageClient(request.session)
        target = "Job:</strong><br>%s" % cjob.job.name
        gmc.add_gm({
            'html': gm_template % (target, reverse('bg_tasks')),
            'class': 'success'
        })

        record_group = cjob.job.record_group
        return redirect('job_details',
                        org_id=record_group.organization.id,
                        record_group_id=record_group.id,
                        job_id=cjob.job.id)

    # export for published
    if export_source == 'published':
        LOGGER.debug('exporting tabular data from published records')

        # TODO: the instance is not used; the call is kept as in the original
        PublishedRecords()

        # save the Combine background task
        task_params = {
            'published': True,
            'subset': subset,
            'records_per_file': int(records_per_file),
            'tabular_data_export_type': tabular_data_export_type,
            'archive_type': archive_type,
            'fm_export_config_json': fm_export_config_json
        }
        combine_task = CombineBackgroundTask(
            name='Export Tabular Data for Published Records',
            task_type='export_tabular_data',
            task_params_json=json.dumps(task_params))
        combine_task.save()

        # apply export output configuration
        combine_task = _handle_export_output(request, export_source,
                                             combine_task)

        # fire the celery task and record its id
        background_task = tasks.export_tabular_data.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # global message
        gmc = GlobalMessageClient(request.session)
        target = ":</strong><br>Published Records"
        gmc.add_gm({
            'html': gm_template % (target, reverse('bg_tasks')),
            'class': 'success'
        })

        return redirect('published')
Example #17
0
def export_mapped_fields(request,
                         export_source=None,
                         job_id=None,
                         subset=None):
    """
    Queue a background export of mapped fields for a Job or for published
    records.

    Args:
        request: Django request; POST may carry `mapped_fields_export_type`,
            `kibana_style`, `archive_type` and a list of
            `mapped_field_include`
        export_source: 'job' or 'published'; any other value does nothing
            and the view returns None
        job_id: id of the Job to export (used when export_source is 'job')
        subset: published subset (used when export_source is 'published')

    Returns:
        Redirect to `job_details` for a Job export, or to `published` for a
        published records export
    """
    post = request.POST

    mapped_fields_export_type = post.get('mapped_fields_export_type')

    # any truthy submitted value becomes True; a falsy value is kept as-is
    submitted_kibana_style = post.get('kibana_style', False)
    kibana_style = True if submitted_kibana_style else submitted_kibana_style

    archive_type = post.get('archive_type')

    # selected fields, if present
    mapped_field_include = post.getlist('mapped_field_include', False)

    # task parameters common to both export sources, in serialization order
    shared_params = {
        'mapped_fields_export_type': mapped_fields_export_type,
        'kibana_style': kibana_style,
        'archive_type': archive_type,
        'mapped_field_include': mapped_field_include
    }

    # message template shared by both export sources
    gm_template = (
        '<p><strong>Exporting Mapped Fields for %s</p><p><a href="%s"><button type="button" '
        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>'
    )

    # export for single job
    if export_source == 'job':
        LOGGER.debug('exporting mapped fields from Job')

        cjob = CombineJob.get_combine_job(int(job_id))

        # save the Combine background task
        task_name = 'Export Mapped Fields for Job: %s' % cjob.job.name
        task_params = {'job_id': cjob.job.id}
        task_params.update(shared_params)
        combine_task = CombineBackgroundTask(
            name=task_name,
            task_type='export_mapped_fields',
            task_params_json=json.dumps(task_params))
        combine_task.save()

        # apply export output configuration
        combine_task = _handle_export_output(request, export_source,
                                             combine_task)

        # fire the celery task and record its id
        background_task = tasks.export_mapped_fields.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # global message
        gmc = GlobalMessageClient(request.session)
        target = "Job:</strong><br>%s" % cjob.job.name
        gmc.add_gm({
            'html': gm_template % (target, reverse('bg_tasks')),
            'class': 'success'
        })

        record_group = cjob.job.record_group
        return redirect('job_details',
                        org_id=record_group.organization.id,
                        record_group_id=record_group.id,
                        job_id=cjob.job.id)

    # export for published
    if export_source == 'published':
        LOGGER.debug('exporting mapped fields from published records')

        # save the Combine background task
        task_params = {'published': True, 'subset': subset}
        task_params.update(shared_params)
        combine_task = CombineBackgroundTask(
            name='Export Mapped Fields for Published Records',
            task_type='export_mapped_fields',
            task_params_json=json.dumps(task_params))
        combine_task.save()

        # apply export output configuration
        combine_task = _handle_export_output(request, export_source,
                                             combine_task)

        # fire the celery task and record its id
        background_task = tasks.export_mapped_fields.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # global message
        gmc = GlobalMessageClient(request.session)
        target = ":</strong><br>Published Records"
        gmc.add_gm({
            'html': gm_template % (target, reverse('bg_tasks')),
            'class': 'success'
        })

        return redirect('published')
Example #18
0
def test_static_harvest(VO):
    '''
    Test static harvest of XML records from disk

    Copies the static harvest fixtures to a temporary directory, runs a
    HarvestStaticXMLJob against it, polls until the Job is `available` (at
    most 480 one-second polls), and stores the resulting CombineJob on the
    Variable Object (VO) as `static_harvest_cjob`.
    '''

    # copy test data to /tmp
    payload_dir = '/tmp/%s' % uuid.uuid4().hex
    shutil.copytree('/opt/combine/tests/data/static_harvest_data', payload_dir)

    # the copied payload is removed in the finally clause, so it does not
    # linger in /tmp when job creation, start or polling raises
    try:

        # emulate request.POST
        request_dict = {
            'dbdd': '',
            'job_note': '',
            'xpath_record_id': '',
            'static_filepath': payload_dir,
            'fm_config_json':
            '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}',
            'static_payload': '',
            'job_name': '',
            'field_mapper': 'default',
            'rits': '',
            'additional_namespace_decs': 'xmlns:mods="http://www.loc.gov/mods/v3"',
            'document_element_root': 'mods:mods'
        }
        query_dict = QueryDict('', mutable=True)
        query_dict.update(request_dict)

        # init job, using Variable Object (VO)
        cjob = CombineJob.init_combine_job(user=VO.user,
                                           record_group=VO.rg,
                                           job_type_class=HarvestStaticXMLJob,
                                           job_params=query_dict,
                                           files={},
                                           hash_payload_filename=False)

        # start job and update status
        job_status = cjob.start_job()

        # if start_job explicitly returned False, record the Job as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        # poll until complete
        for _ in range(480):

            # pause
            time.sleep(1)

            # refresh status, and stop polling once the Job is available
            cjob.job.update_status()
            if cjob.job.status == 'available':
                break

    finally:
        # remove payload_dir
        shutil.rmtree(payload_dir)

    # save static harvest job to VO
    VO.static_harvest_cjob = cjob

    # assert job is done and available via livy
    assert VO.static_harvest_cjob.job.status == 'available'

    # assert record count is 250
    assert VO.static_harvest_cjob.job.record_count == 250

    # assert no indexing failures
    assert len(VO.static_harvest_cjob.get_indexing_failures()) == 0
Example #19
0
def test_merge_duplicate(VO):
    '''
    Merge the static harvest and static transform Jobs with
    `filter_dupe_record_ids` set to 'true', applying the schematron and
    python validation scenarios held on the Variable Object (VO)
    '''

    # emulate request.POST
    fm_config_json = '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}'
    request_dict = {
        'dbdd': '',
        'field_mapper': 'default',
        'filter_dupe_record_ids': 'true',
        'fm_config_json': fm_config_json,
        'input_es_query_valve': '',
        'input_numerical_valve': '',
        'input_validity_valve': 'all',
        'job_name': '',
        'job_note': '',
        'rits': ''
    }
    query_dict = QueryDict('', mutable=True)
    query_dict.update(request_dict)

    # multi-valued fields go in through QueryDict.setlist: both input jobs ...
    input_job_ids = [
        VO.static_harvest_cjob.job.id,
        VO.static_transform_cjob.job.id
    ]
    query_dict.setlist('input_job_id', input_job_ids)

    # ... and both validation scenarios
    validation_scenario_ids = [
        VO.schematron_validation_scenario.id,
        VO.python_validation_scenario.id
    ]
    query_dict.setlist('validation_scenario', validation_scenario_ids)

    # init job
    cjob = CombineJob.init_combine_job(user=VO.user,
                                       record_group=VO.rg,
                                       job_type_class=MergeJob,
                                       job_params=query_dict)

    # start job; a False result marks the job as failed
    job_status = cjob.start_job()
    if job_status == False:
        cjob.job.status = 'failed'
        cjob.job.save()

    # poll once per second, for at most 480 iterations
    for _ in range(480):
        time.sleep(1)

        # refresh status, and stop polling once the Job is available
        cjob.job.update_status()
        if cjob.job.status == 'available':
            break

    # save merge job to VO
    VO.merge_cjob = cjob

    # assert job is done and available via livy
    assert VO.merge_cjob.job.status == 'available'

    # assert record count is 250
    assert VO.merge_cjob.job.record_count == 250

    # assert both validation scenarios were applied
    job_validation_scenarios = VO.merge_cjob.job.jobvalidation_set.all()
    assert job_validation_scenarios.count() == 2

    # each validation scenario is expected to report 232 failures
    for job_validation in job_validation_scenarios:
        assert job_validation.get_record_validation_failures().count() == 232

    # assert no indexing failures
    assert len(VO.merge_cjob.get_indexing_failures()) == 0
Example #20
0
def test_static_transform(VO):
    '''
    Test transformation of the records produced by the static harvest job

    Creates a temporary transformation scenario, runs a TransformJob that
    takes the static harvest job as input, polls until the job reports
    'available', checks the results, and finally deletes the scenario.
    '''

    # temporary transformation scenario, kept on VO so it can be removed below
    VO.transformation_scenario = prepare_transform()

    # field mapper configuration, submitted as a JSON string
    fm_config_json = '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}'

    # emulate request.POST
    job_params = QueryDict('', mutable=True)
    job_params.update({
        'dbdd': '',
        'field_mapper': 'default',
        'filter_dupe_record_ids': 'true',
        'fm_config_json': fm_config_json,
        'input_es_query_valve': '',
        'input_job_id': VO.static_harvest_cjob.job.id,
        'input_numerical_valve': '',
        'input_validity_valve': 'all',
        'job_name': '',
        'job_note': '',
        'rits': '',
        'sel_trans_json': '[{"index":0,"trans_id":%s}]' % VO.transformation_scenario.id
    })

    # init the transform job
    transform_cjob = CombineJob.init_combine_job(
        user=VO.user,
        record_group=VO.rg,
        job_type_class=TransformJob,
        job_params=job_params)

    # start the job; a False result is recorded on the job as 'failed'
    started = transform_cjob.start_job()
    if started == False:
        transform_cjob.job.status = 'failed'
        transform_cjob.job.save()

    # poll once per second, 480 times at most, stopping once 'available'
    for _ in range(480):
        time.sleep(1)
        transform_cjob.job.update_status()
        if transform_cjob.job.status == 'available':
            break

    # save transform job to VO for use by later tests
    VO.static_transform_cjob = transform_cjob

    # job must be done and available
    assert VO.static_transform_cjob.job.status == 'available'

    # all 250 records must have come through the transformation
    assert VO.static_transform_cjob.job.record_count == 250

    # no indexing failures expected
    assert len(VO.static_transform_cjob.get_indexing_failures()) == 0

    # remove the temporary transformation scenario
    assert VO.transformation_scenario.delete()[0] > 0
Example #21
0
def job_reports_create_validation(request, org_id, record_group_id, job_id):
    """
    Generate job report based on validation results

    On GET, renders the report form together with the job's mapped field
    analysis. On POST, stores the submitted options as a Combine background
    task, fires the celery task that builds the report, and redirects to the
    Background Tasks page. Any other HTTP method falls through and returns
    None.

    Args:
        request: Django request
        org_id: organization id from the URL (not used by this view)
        record_group_id: record group id from the URL (not used by this view)
        job_id: id of the Job to report on, cast to int for the lookup
    """

    # retrieve job
    cjob = CombineJob.get_combine_job(int(job_id))

    # if GET, prepare form
    if request.method == 'GET':

        # mapped field analysis, generate if not part of job_details
        job_details = cjob.job.job_details_dict
        if 'mapped_field_analysis' in job_details:
            field_counts = job_details['mapped_field_analysis']
        elif cjob.job.finished:
            # computed once, then stored on the job for later requests
            field_counts = cjob.count_indexed_fields()
            cjob.job.update_job_details(
                {'mapped_field_analysis': field_counts}, save=True)
        else:
            LOGGER.debug('job not finished, not setting')
            field_counts = {}

        # render page
        return render(request, 'core/job_reports_create_validation.html', {
            'cjob': cjob,
            'field_counts': field_counts,
            'breadcrumbs': breadcrumb_parser(request)
        })

    # if POST, generate report
    if request.method == 'POST':

        # get job name for Combine Task; a missing field is treated like an
        # empty one so the default name is used instead of the literal "None"
        report_name = request.POST.get('report_name')
        if not report_name:
            report_name = 'j_%s_validation_report' % cjob.job.id
            combine_task_name = "Validation Report: %s" % cjob.job.name
        else:
            combine_task_name = "Validation Report: %s" % report_name

        # select, reserved fields that may not be requested as mapped fields
        reserved_fields = ['record_id', 'db_id', 'oid', '_id']

        # handle POST params and save as Combine task params
        task_params = {
            'job_id': cjob.job.id,
            'report_name': report_name,
            'report_format': request.POST.get('report_format'),
            'compression_type': request.POST.get('compression_type'),
            # validation scenario ids arrive as strings, cast to int
            'validation_scenarios': [
                int(vs_id)
                for vs_id in request.POST.getlist('validation_scenario', [])],
            'mapped_field_include': [
                field
                for field in request.POST.getlist('mapped_field_include', [])
                if field not in reserved_fields]
        }

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name=combine_task_name,
            task_type='validation_report',
            task_params_json=json.dumps(task_params)
        )
        combine_task.save()

        # run celery task, then record its id on the Combine task
        background_task = tasks.create_validation_report.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # redirect to Background Tasks
        return redirect('bg_tasks')
Example #22
0
def job_merge(request, org_id, record_group_id):
    """
    Merge multiple jobs into a single job

    GET renders the merge form. POST initializes a MergeJob from the
    submitted parameters, starts it, and redirects to the record group page.
    """

    # record group the merge job is created under
    record_group = RecordGroup.objects.get(pk=record_group_id)

    # GET: assemble everything the form needs
    if request.method == 'GET':

        # candidate input jobs: every non-analysis job when scope is
        # 'all_jobs', otherwise only the jobs of this record group
        input_job_scope = request.GET.get('scope', None)
        input_jobs = (
            Job.objects.exclude(job_type='AnalysisJob').all()
            if input_job_scope == 'all_jobs'
            else record_group.job_set.all())

        # lineage of all jobs, filtered to the input jobs scope
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        context = {
            'job_select_type': 'multiple',
            'record_group': record_group,
            'input_jobs': input_jobs,
            'input_job_scope': input_job_scope,
            'validation_scenarios': ValidationScenario.objects.all(),
            'rits': RecordIdentifierTransformation.objects.all(),
            'field_mappers': FieldMapper.objects.all(),
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'job_lineage_json': json.dumps(job_lineage),
            'bulk_downloads': DPLABulkDataDownload.objects.all(),
            'breadcrumbs': breadcrumb_parser(request)
        }

        # render page
        return render(request, 'core/job_merge.html', context)

    # POST: create and start the merge job
    if request.method == 'POST':

        merge_cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=MergeJob,
            job_params=request.POST)

        # start job; a False result is recorded on the job as 'failed'
        started = merge_cjob.start_job()
        if started == False:
            merge_cjob.job.status = 'failed'
            merge_cjob.job.save()

        return redirect('record_group', org_id=org_id, record_group_id=record_group.id)
Example #23
0
def job_update(request, org_id, record_group_id, job_id):
    """
    Update Job in one of several ways:
        - re-map and index ('reindex')
        - run new / different validations ('validations')
        - remove a validation ('remove_validation')
        - run a DPLA Bulk Data comparison ('dbdm')
        - set the publish set id ('publish_set')

    GET renders the update form. POST dispatches on the `update_type`
    parameter and redirects back to the job details page; an unrecognized
    `update_type` is logged and also redirected there. Other HTTP methods
    fall through and return None.

    Args:
        request: Django request
        org_id: organization id from the URL, used for the publish set redirect
        record_group_id: record group id from the URL, used for the publish
            set redirect
        job_id: id of the Job to update, cast to int for the lookup
    """

    # retrieve job
    cjob = CombineJob.get_combine_job(int(job_id))

    def _job_details_redirect():
        # return to the details page of the job being updated
        return redirect('job_details',
                        org_id=cjob.job.record_group.organization.id,
                        record_group_id=cjob.job.record_group.id,
                        job_id=cjob.job.id)

    def _add_bg_task_message(summary_html):
        # queue a success message made of the given summary followed by a
        # button linking to the Background Tasks page
        gmc = GlobalMessageClient(request.session)
        gmc.add_gm({
            'html': '<p>%s</p>'
                    '<p><a href="%s"><button type="button" '
                    'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                        summary_html, reverse('bg_tasks')),
            'class': 'success'
        })

    # if GET, prepare form
    if request.method == 'GET':
        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get field mappers, and the field mapper config the job ran with
        field_mappers = FieldMapper.objects.all()
        orig_fm_config_json = cjob.job.get_fm_config_json()

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # get update type from GET params
        update_type = request.GET.get('update_type', None)

        # render page
        return render(request, 'core/job_update.html', {
            'cjob': cjob,
            'update_type': update_type,
            'validation_scenarios': validation_scenarios,
            'field_mappers': field_mappers,
            'bulk_downloads': bulk_downloads,
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'orig_fm_config_json': orig_fm_config_json,
            'breadcrumbs': breadcrumb_parser(request)
        })

    # if POST, submit job
    if request.method == 'POST':

        LOGGER.debug('updating job')
        LOGGER.debug(request.POST)

        # get update type
        update_type = request.POST.get('update_type', None)
        LOGGER.debug('running job update: %s', update_type)

        # handle re-index
        if update_type == 'reindex':
            # get preferred metadata index mapper
            fm_config_json = request.POST.get('fm_config_json')

            # init re-index
            cjob.reindex_bg_task(fm_config_json=fm_config_json)

            # set gms
            _add_bg_task_message(
                '<strong>Re-Indexing Job:</strong><br>%s' % cjob.job.name)

            return _job_details_redirect()

        # handle new validations
        if update_type == 'validations':
            # get requested validation scenarios
            validation_scenarios = request.POST.getlist(
                'validation_scenario', [])

            # get validations
            validations = ValidationScenario.objects.filter(
                id__in=[int(vs_id) for vs_id in validation_scenarios])

            # init bg task
            cjob.new_validations_bg_task([vs.id for vs in validations])

            # set gms
            _add_bg_task_message(
                '<strong>Running New Validations for Job:</strong><br>%s<br>'
                '<br><strong>Validation Scenarios:</strong><br>%s' % (
                    cjob.job.name, '<br>'.join([vs.name for vs in validations])))

            return _job_details_redirect()

        # handle validation removal
        if update_type == 'remove_validation':
            # get job validation to remove
            jv_id = request.POST.get('jv_id', False)

            # resolve the scenario before queuing the task: a bad jv_id then
            # fails before any work is started, and the lookup cannot race
            # with the removal task (which presumably deletes this
            # JobValidation row -- confirm against the task)
            validation_scenario = JobValidation.objects.get(
                pk=int(jv_id)).validation_scenario

            # initiate Combine BG Task
            cjob.remove_validation_bg_task(jv_id)

            # set gms
            _add_bg_task_message(
                '<strong>Removing Validation for Job:</strong><br>%s<br><br>'
                '<strong>Validation Scenario:</strong><br>%s' % (
                    cjob.job.name, validation_scenario.name))

            return _job_details_redirect()

        # handle DPLA Bulk Data comparison
        if update_type == 'dbdm':
            # get DPLA bulk data download to compare against
            dbdd_id = request.POST.get('dbdd', False)

            # resolve the download first so a bad id fails before a task is queued
            dbdd = DPLABulkDataDownload.objects.get(pk=int(dbdd_id))

            # initiate Combine BG Task
            cjob.dbdm_bg_task(dbdd_id)

            # set gms
            _add_bg_task_message(
                '<strong>Running DPLA Bulk Data comparison for Job:</strong><br>%s<br><br>'
                '<strong>Bulk Data S3 key:</strong><br>%s' % (
                    cjob.job.name, dbdd.s3_key))

            return _job_details_redirect()

        # handle publish set id change
        if update_type == 'publish_set':
            update_body = request.POST
            if update_body.get('publish_set_id', None):
                cjob.job.publish_set_id = update_body['publish_set_id']
            # a selected pre-existing publish set id wins over an entered one
            if update_body.get('existing_publish_set_id', None):
                cjob.job.publish_set_id = update_body['existing_publish_set_id']
            redirect_anchor = update_body.get('redirect_anchor', '')
            cjob.job.save()
            return redirect(reverse('job_details', args=[org_id, record_group_id, job_id]) + redirect_anchor)

        # unrecognized update type: nothing to run, but still answer the
        # request instead of returning None from the view
        LOGGER.warning('unrecognized job update type: %s', update_type)
        return _job_details_redirect()
Example #24
0
def delete_jobs(request):
    """
    Flag the posted Jobs as deleting and queue a background task per Job

    Reads the `job_ids[]` list and the `downstream_delete_toggle` flag from
    request.POST. Each selected Job (plus its downstream Jobs when the toggle
    is on) is renamed, marked deleted / 'deleting', and handed to the
    `delete_model_instance` celery task. A global message listing the Jobs
    is added to the session.

    Args:
        request: Django request

    Returns:
        JsonResponse: `{'results': True}`
    """
    LOGGER.debug('deleting jobs')

    job_ids = request.POST.getlist('job_ids[]')
    LOGGER.debug(job_ids)

    # get downstream toggle; deleting downstream Jobs is the more destructive
    # option, so only an explicit "true" enables it. Values such as 'False'
    # or '0' must not count as truthy strings.
    downstream_toggle = str(
        request.POST.get('downstream_delete_toggle', False)).lower() == 'true'

    # set of jobs to delete
    job_delete_set = set()

    # loop through job_ids
    for job_id in job_ids:

        # get CombineJob
        cjob = CombineJob.get_combine_job(job_id)

        if downstream_toggle:
            # add delete lineage for this job to set
            job_delete_set.update(cjob.job.get_downstream_jobs())
        else:
            # just the job itself
            job_delete_set.add(cjob.job)

    # process in ascending job id order
    ordered_job_delete_set = sorted(job_delete_set, key=lambda j: j.id)

    # loop through and update visible elements of Job for front-end
    for job in ordered_job_delete_set:
        LOGGER.debug('deleting Job: %s', job)

        # set job status to deleting
        job.name = "%s (DELETING)" % job.name
        job.deleted = True
        job.status = 'deleting'
        job.save()

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name='Delete Job: #%s' % job.name,
            task_type='delete_model_instance',
            task_params_json=json.dumps({
                'model': 'Job',
                'job_id': job.id
            })
        )
        combine_task.save()

        # run celery task, then record its id on the Combine task
        bg_task = tasks.delete_model_instance.delay('Job', job.id)
        LOGGER.debug('firing bg task: %s', bg_task)
        combine_task.celery_task_id = bg_task.task_id
        combine_task.save()

    # set gms
    gmc = GlobalMessageClient(request.session)
    gmc.add_gm({
        'html': '<p><strong>Deleting Job(s):</strong><br>%s</p><p>Refresh this page to update status of removing Jobs. <button class="btn-sm btn-outline-primary" onclick="location.reload();">Refresh</button></p>' % (
            '<br>'.join([j.name for j in ordered_job_delete_set])),
        'class': 'danger'
    })

    # return
    return JsonResponse({'results': True})
Example #25
0
def job_details(request, org_id, record_group_id, job_id):
    """
    Render the details page for a single Job

    Refreshes the job status, gathers record counts, lineage, DPLA bulk data
    matches, mapped field analysis, OAI set frequencies and published subset
    counts, and renders them with 'core/job_details.html'.

    Args:
        request: Django request; the optional GET param `q` is passed
            through to the template
        org_id: organization id from the URL (not used by this view)
        record_group_id: record group id from the URL (not used by this view)
        job_id: id of the Job to show
    """
    LOGGER.debug('details for job id: %s', job_id)

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # update status
    cjob.job.update_status()

    # detailed record count
    record_count_details = cjob.job.get_detailed_job_record_count()

    # get job lineage
    job_lineage = cjob.job.get_lineage()

    # get dpla_bulk_data_match
    dpla_bulk_data_matches = cjob.job.get_dpla_bulk_data_matches()

    # check if limiting to one, pre-existing record
    get_q = request.GET.get('q', None)

    # job details; no job-type specific augmentation is currently applied
    job_detail = cjob.job.job_details_dict

    # mapped field analysis, generate if not part of job_details
    if 'mapped_field_analysis' in job_detail:
        field_counts = job_detail['mapped_field_analysis']
    elif cjob.job.finished:
        # computed once, then stored on the job for later requests
        field_counts = cjob.count_indexed_fields()
        cjob.job.update_job_details(
            {'mapped_field_analysis': field_counts}, save=True)
    else:
        LOGGER.debug('job not finished, not setting')
        field_counts = {}

    # get published records, primarily for published sets
    pub_records = PublishedRecords()

    # frequency of each oai_set value among this job's records
    oai_sets = Record.objects(job_id=cjob.job.id).item_frequencies(field='oai_set')

    # get published subsets with PublishedRecords static method
    published_subsets = PublishedRecords.get_subsets()

    # loop through subsets and enrich each with its field counts
    for subset in published_subsets:

        # look for previously stored counts
        counts = mc_handle.combine.misc.find_one(
            {'_id': 'published_field_counts_%s' % subset['name']})

        # if counts not yet calculated, do now
        if counts is None:
            counts = PublishedRecords(
                subset=subset['name']).count_indexed_fields()
        subset['counts'] = counts

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # return
    return render(request, 'core/job_details.html', {
        'cjob': cjob,
        'record_group': cjob.job.record_group,
        'record_count_details': record_count_details,
        'field_counts': field_counts,
        'field_mappers': field_mappers,
        'xml2kvp_handle': xml2kvp.XML2kvp(),
        'job_lineage_json': json.dumps(job_lineage),
        'dpla_bulk_data_matches': dpla_bulk_data_matches,
        'q': get_q,
        'job_details': job_detail,
        'pr': pub_records,
        'published_subsets': published_subsets,
        'es_index_str': cjob.esi.es_index_str,
        'breadcrumbs': breadcrumb_parser(request),
        'oai_sets': dict(oai_sets)
    })