def test_static_harvest_reindex(VO):

    # refresh job
    VO.static_harvest_cjob = CombineJob.get_combine_job(
        VO.static_harvest_cjob.job.id)

    # fm config json, adding literal foo:bar
    fm_config_json = '{"concat_values_on_all_fields": false, "capture_attribute_values": [], "remove_ns_prefix": true, "skip_attribute_ns_declarations": true, "remove_copied_key": true, "node_delim": "_", "copy_to": {}, "copy_value_to_regex": {}, "copy_to_regex": {}, "split_values_on_all_fields": false, "add_literals": {"foo":"bar"}, "exclude_attributes": [], "ns_prefix_delim": "|", "self_describing": false, "split_values_on_fields": {}, "include_attributes": [], "include_sibling_id": false, "multivalue_delim": "|", "skip_repeating_values": true, "repeating_element_suffix_count": false, "exclude_elements": [], "concat_values_on_fields": {}, "remove_copied_value": false, "error_on_delims_collision": false, "include_all_attributes": false, "skip_root": false}'

    # reindex static harvest
    bg_task = VO.static_harvest_cjob.reindex_bg_task(
        fm_config_json=fm_config_json)

    # poll until complete
    for x in range(0, 480):

        # pause
        time.sleep(1)
        LOGGER.debug('polling for reindexing %s seconds...' % (x))

        # refresh session
        bg_task.update()

        # check status
        if bg_task.celery_status not in ['SUCCESS', 'FAILURE']:
            continue
        else:
            break

    # assert 250 records have foo:bar, indicating successful reindexing
    results = VO.static_harvest_cjob.field_analysis('foo')
    assert results['metrics']['doc_instances'] == 250
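# A minimal sketch of the polling pattern used above (and again in the
# harvest, transform, and merge tests below), pulled into a reusable helper.
# The name `poll_until` is hypothetical and not part of the Combine codebase;
# it assumes the `time` module already imported by this test module.
def poll_until(check, timeout=480, interval=1):
    """Call `check()` every `interval` seconds until it returns True or `timeout` seconds pass."""
    for _ in range(0, timeout, interval):
        time.sleep(interval)
        if check():
            return True
    return False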
def job_publish(request, org_id, record_group_id, job_id):

    LOGGER.debug(request.POST)

    # capture entered publish set id
    publish_set_id = request.POST.get('publish_set_id', None)

    # override with pre-existing publish set id if selected
    if request.POST.get('existing_publish_set_id', None) is not None:
        publish_set_id = request.POST.get('existing_publish_set_id')

    # get published subsets to include in
    published_subsets = request.POST.getlist('published_subsets', [])

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # init publish
    cjob.publish_bg_task(
        publish_set_id=publish_set_id,
        in_published_subsets=published_subsets)

    # set gms
    gmc = GlobalMessageClient(request.session)
    gmc.add_gm({
        'html': '<p><strong>Publishing Job:</strong><br>%s<br><br><strong>Publish Set ID:</strong><br>%s</p><p><a href="%s"><button type="button" class="btn btn-outline-primary btn-sm">View Published Records</button></a></p>' % (
            cjob.job.name, publish_set_id, reverse('published')),
        'class': 'success'
    })

    return redirect('record_group',
                    org_id=cjob.job.record_group.organization.id,
                    record_group_id=cjob.job.record_group.id)
def job_harvest_tabular_data(request, org_id, record_group_id, hash_payload_filename=False):
    """
    Create a new tabular data Harvest Job
    """

    # retrieve record group
    record_group = RecordGroup.objects.filter(id=record_group_id).first()

    # get validation scenarios
    validation_scenarios = ValidationScenario.objects.all()

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # get record identifier transformation scenarios
    rits = RecordIdentifierTransformation.objects.all()

    # get all bulk downloads
    bulk_downloads = DPLABulkDataDownload.objects.all()

    # if GET, prepare form
    if request.method == 'GET':

        # render page
        return render(
            request,
            'core/job_harvest_tabular_data.html',
            {
                'record_group': record_group,
                'validation_scenarios': validation_scenarios,
                'rits': rits,
                'field_mappers': field_mappers,
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'bulk_downloads': bulk_downloads,
                'breadcrumbs': breadcrumb_parser(request)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=HarvestTabularDataJob,
            job_params=request.POST,
            files=request.FILES,
            hash_payload_filename=hash_payload_filename)

        # start job and update status
        job_status = cjob.start_job()

        # if job_status is False, report job as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('record_group', org_id=org_id, record_group_id=record_group.id)
def clone_jobs(request):

    LOGGER.debug('cloning jobs')

    job_ids = request.POST.getlist('job_ids[]')

    # get downstream toggle
    downstream_toggle = request.POST.get('downstream_clone_toggle', False)
    if downstream_toggle == 'true':
        downstream_toggle = True
    elif downstream_toggle == 'false':
        downstream_toggle = False

    # get rerun toggle
    rerun_on_clone = request.POST.get('rerun_on_clone', False)
    if rerun_on_clone == 'true':
        rerun_on_clone = True
    elif rerun_on_clone == 'false':
        rerun_on_clone = False

    # set of jobs to clone
    job_clone_set = set()

    # loop through job_ids and add
    for job_id in job_ids:
        cjob = CombineJob.get_combine_job(job_id)
        job_clone_set.add(cjob.job)

    # sort and run
    ordered_job_clone_set = sorted(list(job_clone_set), key=lambda j: j.id)

    # initiate Combine BG Task
    combine_task = CombineBackgroundTask(
        name="Clone Jobs",
        task_type='clone_jobs',
        task_params_json=json.dumps({
            'ordered_job_clone_set': [j.id for j in ordered_job_clone_set],
            'downstream_toggle': downstream_toggle,
            'rerun_on_clone': rerun_on_clone
        })
    )
    combine_task.save()

    # run celery task
    bg_task = tasks.clone_jobs.delay(combine_task.id)
    LOGGER.debug('firing bg task: %s', bg_task)
    combine_task.celery_task_id = bg_task.task_id
    combine_task.save()

    # set gms
    gmc = GlobalMessageClient(request.session)
    gmc.add_gm({
        'html': '<strong>Cloning Job(s):</strong><br>%s<br><br>Including downstream? <strong>%s</strong><br><br>Refresh this page to update status of Jobs cloning. <button class="btn-sm btn-outline-primary" onclick="location.reload();">Refresh</button>' % (
            '<br>'.join([str(j.name) for j in ordered_job_clone_set]), downstream_toggle),
        'class': 'success'
    })

    # return, as requested via Ajax which will reload page
    return JsonResponse({'results': True})
def job_indexing_failures(request, org_id, record_group_id, job_id):

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # return
    return render(request, 'core/job_indexing_failures.html', {
        'cjob': cjob,
        'breadcrumbs': breadcrumb_parser(request)
    })
def stop_jobs(request):

    LOGGER.debug('stopping jobs')

    job_ids = request.POST.getlist('job_ids[]')
    LOGGER.debug(job_ids)

    # get downstream toggle
    downstream_toggle = request.POST.get('downstream_stop_toggle', False)
    if downstream_toggle == 'true':
        downstream_toggle = True
    elif downstream_toggle == 'false':
        downstream_toggle = False

    # set of jobs to stop
    job_stop_set = set()

    # loop through job_ids
    for job_id in job_ids:

        # get CombineJob
        cjob = CombineJob.get_combine_job(job_id)

        # if including downstream
        if downstream_toggle:

            # add stop lineage for this job to set
            job_stop_set.update(cjob.job.get_downstream_jobs())

        # else, just job
        else:
            job_stop_set.add(cjob.job)

    # sort and run
    ordered_job_stop_set = sorted(list(job_stop_set), key=lambda j: j.id)

    # loop through and stop Jobs
    for job in ordered_job_stop_set:

        LOGGER.debug('stopping Job: %s', job)

        # stop job
        job.stop_job()

    # set gms
    gmc = GlobalMessageClient(request.session)
    gmc.add_gm({
        'html': '<p><strong>Stopped Job(s):</strong><br>%s</p>' % (
            '<br>'.join([j.name for j in ordered_job_stop_set])),
        'class': 'danger'
    })

    # return
    return JsonResponse({'results': True})
def job_validation_scenario_failures(request, org_id, record_group_id, job_id, job_validation_id):

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # get job validation instance
    job_validation = JobValidation.objects.get(pk=int(job_validation_id))

    # return
    return render(request, 'core/job_validation_scenario_failures.html', {
        'cjob': cjob,
        'jv': job_validation,
        'breadcrumbs': breadcrumb_parser(request)
    })
def rerun_jobs(request):

    LOGGER.debug('re-running jobs')

    # get job ids
    job_ids = request.POST.getlist('job_ids[]')

    # get downstream and upstream toggles
    downstream_toggle = bool_for_string(
        request.POST.get('downstream_rerun_toggle', False))
    upstream_toggle = bool_for_string(
        request.POST.get('upstream_rerun_toggle', False))

    # set of jobs to rerun
    job_rerun_set = set()

    # loop through job_ids
    for job_id in job_ids:

        # get CombineJob
        cjob = CombineJob.get_combine_job(job_id)

        # if including downstream, add downstream lineage for this job to set
        if downstream_toggle:
            job_rerun_set.update(
                cjob.job.get_downstream_jobs(include_self=False))

        # if including upstream, add upstream lineage as well
        if upstream_toggle:
            job_rerun_set.update(
                cjob.job.get_upstream_jobs(include_self=False))

        # always include the job itself
        job_rerun_set.add(cjob.job)

    # sort and run
    ordered_job_rerun_set = sorted(list(job_rerun_set), key=lambda j: j.id)
    tasks.rerun_jobs(ordered_job_rerun_set)

    # set gms
    gmc = GlobalMessageClient(request.session)
    gmc.add_gm({
        'html': '<strong>Preparing to Rerun Job(s):</strong><br>%s<br><br>Refresh this page to update status of Jobs rerunning. <button class="btn-sm btn-outline-primary" onclick="location.reload();">Refresh</button>' % '<br>'.join(
            [str(j.name) for j in ordered_job_rerun_set]),
        'class': 'success'
    })

    # return, as requested via Ajax which will reload page
    return JsonResponse({'results': True})
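# `bool_for_string` used above is a helper defined elsewhere in the Combine
# codebase and not shown in this excerpt. A minimal sketch of what it
# presumably does, mirroring the inline 'true'/'false' handling in
# clone_jobs, stop_jobs, move_jobs, and delete_jobs; treat this as an
# assumption, not the actual implementation.
def bool_for_string(value):
    """Map the strings 'true' and 'false' to booleans, passing other values through unchanged."""
    if value == 'true':
        return True
    if value == 'false':
        return False
    return value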
def job_errors(request, org_id, record_group_id, job_id):

    LOGGER.debug('retrieving errors for job id: %s', job_id)

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    job_error_list = cjob.get_job_errors()

    # return
    return render(request, 'core/job_errors.html', {
        'cjob': cjob,
        'job_errors': job_error_list,
        'breadcrumbs': breadcrumb_parser(request)
    })
def move_jobs(request):

    LOGGER.debug('moving jobs')

    job_ids = request.POST.getlist('job_ids[]')
    record_group_id = request.POST.getlist('record_group_id')[0]

    # get downstream toggle
    downstream_toggle = request.POST.get('downstream_move_toggle', False)
    if downstream_toggle == 'true':
        downstream_toggle = True
    elif downstream_toggle == 'false':
        downstream_toggle = False

    # set of jobs to move
    job_move_set = set()

    # loop through job_ids
    for job_id in job_ids:

        # get CombineJob
        cjob = CombineJob.get_combine_job(job_id)

        # if including downstream
        if downstream_toggle:

            # add move lineage for this job to set
            job_move_set.update(cjob.job.get_downstream_jobs())

        # else, just job
        else:
            job_move_set.add(cjob.job)

    # sort and run
    ordered_job_move_set = sorted(list(job_move_set), key=lambda j: j.id)

    # loop through jobs
    for job in ordered_job_move_set:

        LOGGER.debug('moving Job: %s', job)
        new_record_group = RecordGroup.objects.get(pk=record_group_id)
        job.record_group = new_record_group
        job.save()
        LOGGER.debug('Job %s has been moved', job)

    # return, as requested via Ajax which will reload page
    return JsonResponse({'results': True})
def bg_task(request, task_id):

    # get task
    combine_task = CombineBackgroundTask.objects.get(pk=int(task_id))
    LOGGER.debug('retrieving task: %s', combine_task)

    # include job if mentioned in task params
    if 'job_id' in combine_task.task_params:
        cjob = CombineJob.get_combine_job(combine_task.task_params['job_id'])
    else:
        cjob = None

    return render(
        request,
        'core/bg_task.html',
        {
            'ct': combine_task,
            'cjob': cjob,
            'breadcrumbs': breadcrumb_parser(request)
        })
def job_update_name(request, org_id, record_group_id, job_id):

    if request.method == 'POST':

        # get CombineJob
        cjob = CombineJob.get_combine_job(job_id)

        # get job name
        job_name = request.POST.get('job_name')
        if job_name == '':
            job_name = None

        # update job name
        cjob.job.name = job_name
        cjob.job.save()

        # redirect
        return redirect(request.META.get('HTTP_REFERER'))
def job_unpublish(request, org_id, record_group_id, job_id):

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # init unpublish
    cjob.unpublish_bg_task()

    # set gms
    gmc = GlobalMessageClient(request.session)
    gmc.add_gm({
        'html': '<p><strong>Unpublishing Job:</strong><br>%s</p><p><a href="%s"><button type="button" class="btn btn-outline-primary btn-sm">View Published Records</button></a></p>' % (
            cjob.job.name, reverse('published')),
        'class': 'success'
    })

    return redirect('record_group',
                    org_id=cjob.job.record_group.organization.id,
                    record_group_id=cjob.job.record_group.id)
def job_parameters(request, org_id, record_group_id, job_id):

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # if GET, return JSON
    if request.method == 'GET':

        # return
        return JsonResponse(cjob.job.job_details_dict)

    # if POST, update
    if request.method == 'POST':

        # get job_details as JSON
        job_details_json = request.POST.get('job_details_json', None)

        if job_details_json is not None:
            cjob.job.job_details = job_details_json
            cjob.job.save()

        return JsonResponse({"msg": "Job Parameters updated!"})
def job_analysis(request):
    """
    Run new analysis job
    """

    # if GET, prepare form
    if request.method == 'GET':

        # retrieve jobs (limiting if needed)
        input_jobs = Job.objects.all()

        # limit if analysis_type set
        analysis_type = request.GET.get('type', None)
        subset = request.GET.get('subset', None)
        if analysis_type == 'published':

            # load PublishedRecords
            published = PublishedRecords(subset=subset)

            # define input_jobs
            input_jobs = published.published_jobs

        else:
            published = None

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()

        # get record identifier transformation scenarios
        rits = RecordIdentifierTransformation.objects.all()

        # get job lineage for all jobs (filtered to input jobs scope)
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # render page
        return render(
            request,
            'core/job_analysis.html',
            {
                'job_select_type': 'multiple',
                'input_jobs': input_jobs,
                'published': published,
                'validation_scenarios': validation_scenarios,
                'rits': rits,
                'field_mappers': field_mappers,
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'analysis_type': analysis_type,
                'bulk_downloads': bulk_downloads,
                'job_lineage_json': json.dumps(job_lineage)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            # TODO: record_group=record_group,
            job_type_class=AnalysisJob,
            job_params=request.POST)

        # start job and update status
        job_status = cjob.start_job()

        # if job_status is False, report job as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('analysis')
def export_tabular_data(request, export_source=None, job_id=None, subset=None):

    # get records per file
    records_per_file = request.POST.get('records_per_file', False)
    if records_per_file in ['', False]:
        records_per_file = 500

    # get tabular data export type
    tabular_data_export_type = request.POST.get('tabular_data_export_type')

    # get archive type
    archive_type = request.POST.get('archive_type')

    # get fm config json
    fm_export_config_json = request.POST.get('fm_export_config_json')

    # export for single job
    if export_source == 'job':

        LOGGER.debug('exporting tabular data from Job')

        # retrieve job
        cjob = CombineJob.get_combine_job(int(job_id))

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name='Export Tabular Data for Job: %s' % cjob.job.name,
            task_type='export_tabular_data',
            task_params_json=json.dumps({
                'job_id': cjob.job.id,
                'records_per_file': int(records_per_file),
                'tabular_data_export_type': tabular_data_export_type,
                'archive_type': archive_type,
                'fm_export_config_json': fm_export_config_json
            }))
        combine_task.save()

        # handle export output configurations
        combine_task = _handle_export_output(request, export_source, combine_task)

        # run celery task
        background_task = tasks.export_tabular_data.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # set gm
        gmc = GlobalMessageClient(request.session)
        target = "Job:</strong><br>%s" % cjob.job.name
        gmc.add_gm({
            'html': '<p><strong>Exporting Tabular Data for %s</p><p><a href="%s"><button type="button" '
                    'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                        target, reverse('bg_tasks')),
            'class': 'success'
        })

        return redirect('job_details',
                        org_id=cjob.job.record_group.organization.id,
                        record_group_id=cjob.job.record_group.id,
                        job_id=cjob.job.id)

    # export for published records
    if export_source == 'published':

        LOGGER.debug('exporting tabular data from published records')

        # get instance of Published model
        # TODO: not used
        PublishedRecords()

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name='Export Tabular Data for Published Records',
            task_type='export_tabular_data',
            task_params_json=json.dumps({
                'published': True,
                'subset': subset,
                'records_per_file': int(records_per_file),
                'tabular_data_export_type': tabular_data_export_type,
                'archive_type': archive_type,
                'fm_export_config_json': fm_export_config_json
            }))
        combine_task.save()

        # handle export output configurations
        combine_task = _handle_export_output(request, export_source, combine_task)

        # run celery task
        background_task = tasks.export_tabular_data.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # set gm
        gmc = GlobalMessageClient(request.session)
        target = ":</strong><br>Published Records"
        gmc.add_gm({
            'html': '<p><strong>Exporting Tabular Data for %s</p><p><a href="%s"><button type="button" '
                    'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                        target, reverse('bg_tasks')),
            'class': 'success'
        })

        return redirect('published')
def export_mapped_fields(request, export_source=None, job_id=None, subset=None):

    # get mapped fields export type
    mapped_fields_export_type = request.POST.get('mapped_fields_export_type')

    # check for Kibana style flag
    kibana_style = request.POST.get('kibana_style', False)
    if kibana_style:
        kibana_style = True

    # get archive type
    archive_type = request.POST.get('archive_type')

    # get selected fields if present
    mapped_field_include = request.POST.getlist('mapped_field_include', False)

    # export for single job
    if export_source == 'job':

        LOGGER.debug('exporting mapped fields from Job')

        # retrieve job
        cjob = CombineJob.get_combine_job(int(job_id))

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name='Export Mapped Fields for Job: %s' % cjob.job.name,
            task_type='export_mapped_fields',
            task_params_json=json.dumps({
                'job_id': cjob.job.id,
                'mapped_fields_export_type': mapped_fields_export_type,
                'kibana_style': kibana_style,
                'archive_type': archive_type,
                'mapped_field_include': mapped_field_include
            }))
        combine_task.save()

        # handle export output configurations
        combine_task = _handle_export_output(request, export_source, combine_task)

        # run celery task
        background_task = tasks.export_mapped_fields.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # set gm
        gmc = GlobalMessageClient(request.session)
        target = "Job:</strong><br>%s" % cjob.job.name
        gmc.add_gm({
            'html': '<p><strong>Exporting Mapped Fields for %s</p><p><a href="%s"><button type="button" '
                    'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                        target, reverse('bg_tasks')),
            'class': 'success'
        })

        return redirect('job_details',
                        org_id=cjob.job.record_group.organization.id,
                        record_group_id=cjob.job.record_group.id,
                        job_id=cjob.job.id)

    # export for published records
    if export_source == 'published':

        LOGGER.debug('exporting mapped fields from published records')

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name='Export Mapped Fields for Published Records',
            task_type='export_mapped_fields',
            task_params_json=json.dumps({
                'published': True,
                'subset': subset,
                'mapped_fields_export_type': mapped_fields_export_type,
                'kibana_style': kibana_style,
                'archive_type': archive_type,
                'mapped_field_include': mapped_field_include
            }))
        combine_task.save()

        # handle export output configurations
        combine_task = _handle_export_output(request, export_source, combine_task)

        # run celery task
        background_task = tasks.export_mapped_fields.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # set gm
        gmc = GlobalMessageClient(request.session)
        target = ":</strong><br>Published Records"
        gmc.add_gm({
            'html': '<p><strong>Exporting Mapped Fields for %s</p><p><a href="%s"><button type="button" '
                    'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                        target, reverse('bg_tasks')),
            'class': 'success'
        })

        return redirect('published')
def test_static_harvest(VO):
    '''
    Test static harvest of XML records from disk
    '''

    # copy test data to /tmp
    payload_dir = '/tmp/%s' % uuid.uuid4().hex
    shutil.copytree('/opt/combine/tests/data/static_harvest_data', payload_dir)

    # emulate request.POST
    request_dict = {
        'dbdd': '',
        'job_note': '',
        'xpath_record_id': '',
        'static_filepath': payload_dir,
        'fm_config_json': '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}',
        'static_payload': '',
        'job_name': '',
        'field_mapper': 'default',
        'rits': '',
        'additional_namespace_decs': 'xmlns:mods="http://www.loc.gov/mods/v3"',
        'document_element_root': 'mods:mods'
    }
    query_dict = QueryDict('', mutable=True)
    query_dict.update(request_dict)

    # init job, using Variable Object (VO)
    cjob = CombineJob.init_combine_job(
        user=VO.user,
        record_group=VO.rg,
        job_type_class=HarvestStaticXMLJob,
        job_params=query_dict,
        files={},
        hash_payload_filename=False)

    # start job and update status
    job_status = cjob.start_job()

    # if job_status is False, report job as failed
    if job_status is False:
        cjob.job.status = 'failed'
        cjob.job.save()

    # poll until complete
    for x in range(0, 480):

        # pause
        time.sleep(1)

        # refresh session
        cjob.job.update_status()

        # check status
        if cjob.job.status != 'available':
            continue
        else:
            break

    # save static harvest job to VO
    VO.static_harvest_cjob = cjob

    # remove payload_dir
    shutil.rmtree(payload_dir)

    # assert job is done and available via livy
    assert VO.static_harvest_cjob.job.status == 'available'

    # assert record count is 250
    assert VO.static_harvest_cjob.job.record_count == 250

    # assert no indexing failures
    assert len(VO.static_harvest_cjob.get_indexing_failures()) == 0
def test_merge_duplicate(VO):
    '''
    Duplicate Transform job, applying newly created validation scenarios
    '''

    # emulate request.POST
    request_dict = {
        'dbdd': '',
        'field_mapper': 'default',
        'filter_dupe_record_ids': 'true',
        'fm_config_json': '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}',
        'input_es_query_valve': '',
        'input_numerical_valve': '',
        'input_validity_valve': 'all',
        'job_name': '',
        'job_note': '',
        'rits': ''
    }
    query_dict = QueryDict('', mutable=True)
    query_dict.update(request_dict)

    # set input jobs with QueryDict.setlist
    query_dict.setlist(
        'input_job_id',
        [VO.static_harvest_cjob.job.id, VO.static_transform_cjob.job.id])

    # set validation scenarios with QueryDict.setlist
    query_dict.setlist('validation_scenario', [
        VO.schematron_validation_scenario.id,
        VO.python_validation_scenario.id
    ])

    # init job
    cjob = CombineJob.init_combine_job(
        user=VO.user,
        record_group=VO.rg,
        job_type_class=MergeJob,
        job_params=query_dict)

    # start job and update status
    job_status = cjob.start_job()

    # if job_status is False, report job as failed
    if job_status is False:
        cjob.job.status = 'failed'
        cjob.job.save()

    # poll until complete
    for x in range(0, 480):

        # pause
        time.sleep(1)

        # refresh session
        cjob.job.update_status()

        # check status
        if cjob.job.status != 'available':
            continue
        else:
            break

    # save merge job to VO
    VO.merge_cjob = cjob

    # assert job is done and available via livy
    assert VO.merge_cjob.job.status == 'available'

    # assert record count is 250
    assert VO.merge_cjob.job.record_count == 250

    # assert validation scenarios applied
    job_validation_scenarios = VO.merge_cjob.job.jobvalidation_set.all()
    assert job_validation_scenarios.count() == 2

    # loop through validation scenarios and confirm that both show 232 failures
    for jv in job_validation_scenarios:
        assert jv.get_record_validation_failures().count() == 232

    # assert no indexing failures
    assert len(VO.merge_cjob.get_indexing_failures()) == 0
def test_static_transform(VO):
    '''
    Test Transform job against the static harvest Records
    '''

    # prepare and capture temporary transformation scenario
    VO.transformation_scenario = prepare_transform()

    # emulate request.POST
    request_dict = {
        'dbdd': '',
        'field_mapper': 'default',
        'filter_dupe_record_ids': 'true',
        'fm_config_json': '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}',
        'input_es_query_valve': '',
        'input_job_id': VO.static_harvest_cjob.job.id,
        'input_numerical_valve': '',
        'input_validity_valve': 'all',
        'job_name': '',
        'job_note': '',
        'rits': '',
        'sel_trans_json': '[{"index":0,"trans_id":%s}]' % VO.transformation_scenario.id
    }
    query_dict = QueryDict('', mutable=True)
    query_dict.update(request_dict)

    # init job
    cjob = CombineJob.init_combine_job(
        user=VO.user,
        record_group=VO.rg,
        job_type_class=TransformJob,
        job_params=query_dict)

    # start job and update status
    job_status = cjob.start_job()

    # if job_status is False, report job as failed
    if job_status is False:
        cjob.job.status = 'failed'
        cjob.job.save()

    # poll until complete
    for x in range(0, 480):

        # pause
        time.sleep(1)

        # refresh session
        cjob.job.update_status()

        # check status
        if cjob.job.status != 'available':
            continue
        else:
            break

    # save static transform job to VO
    VO.static_transform_cjob = cjob

    # assert job is done and available via livy
    assert VO.static_transform_cjob.job.status == 'available'

    # assert record count is 250
    assert VO.static_transform_cjob.job.record_count == 250

    # assert no indexing failures
    assert len(VO.static_transform_cjob.get_indexing_failures()) == 0

    # remove transformation scenario
    assert VO.transformation_scenario.delete()[0] > 0
def job_reports_create_validation(request, org_id, record_group_id, job_id):
    """
    Generate job report based on validation results
    """

    # retrieve job
    cjob = CombineJob.get_combine_job(int(job_id))

    # if GET, prepare form
    if request.method == 'GET':

        # mapped field analysis, generate if not part of job_details
        if 'mapped_field_analysis' in cjob.job.job_details_dict.keys():
            field_counts = cjob.job.job_details_dict['mapped_field_analysis']
        else:
            if cjob.job.finished:
                field_counts = cjob.count_indexed_fields()
                cjob.job.update_job_details(
                    {'mapped_field_analysis': field_counts}, save=True)
            else:
                LOGGER.debug('job not finished, not setting')
                field_counts = {}

        # render page
        return render(request, 'core/job_reports_create_validation.html', {
            'cjob': cjob,
            'field_counts': field_counts,
            'breadcrumbs': breadcrumb_parser(request)
        })

    # if POST, generate report
    if request.method == 'POST':

        # get report name for Combine Task
        report_name = request.POST.get('report_name')
        if report_name == '':
            report_name = 'j_%s_validation_report' % cjob.job.id
            combine_task_name = "Validation Report: %s" % cjob.job.name
        else:
            combine_task_name = "Validation Report: %s" % report_name

        # handle POST params and save as Combine task params
        task_params = {
            'job_id': cjob.job.id,
            'report_name': report_name,
            'report_format': request.POST.get('report_format'),
            'compression_type': request.POST.get('compression_type'),
            'validation_scenarios': request.POST.getlist('validation_scenario', []),
            'mapped_field_include': request.POST.getlist('mapped_field_include', [])
        }

        # cast validation scenario ids to int
        task_params['validation_scenarios'] = [
            int(vs_id) for vs_id in task_params['validation_scenarios']]

        # remove select, reserved fields if in mapped field request
        task_params['mapped_field_include'] = [
            f for f in task_params['mapped_field_include']
            if f not in ['record_id', 'db_id', 'oid', '_id']]

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name=combine_task_name,
            task_type='validation_report',
            task_params_json=json.dumps(task_params)
        )
        combine_task.save()

        # run celery task
        background_task = tasks.create_validation_report.delay(combine_task.id)
        LOGGER.debug('firing bg task: %s', background_task)
        combine_task.celery_task_id = background_task.task_id
        combine_task.save()

        # redirect to Background Tasks
        return redirect('bg_tasks')
def job_merge(request, org_id, record_group_id):
    """
    Merge multiple jobs into a single job
    """

    # retrieve record group
    record_group = RecordGroup.objects.get(pk=record_group_id)

    # if GET, prepare form
    if request.method == 'GET':

        # get scope of input jobs and retrieve
        input_job_scope = request.GET.get('scope', None)

        # if all jobs, retrieve all jobs
        if input_job_scope == 'all_jobs':
            input_jobs = Job.objects.exclude(
                job_type='AnalysisJob').all()

        # else, limit to RecordGroup
        else:
            input_jobs = record_group.job_set.all()

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get record identifier transformation scenarios
        rits = RecordIdentifierTransformation.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()

        # get job lineage for all jobs (filtered to input jobs scope)
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # render page
        return render(request, 'core/job_merge.html', {
            'job_select_type': 'multiple',
            'record_group': record_group,
            'input_jobs': input_jobs,
            'input_job_scope': input_job_scope,
            'validation_scenarios': validation_scenarios,
            'rits': rits,
            'field_mappers': field_mappers,
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'job_lineage_json': json.dumps(job_lineage),
            'bulk_downloads': bulk_downloads,
            'breadcrumbs': breadcrumb_parser(request)
        })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=MergeJob,
            job_params=request.POST)

        # start job and update status
        job_status = cjob.start_job()

        # if job_status is False, report job as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('record_group', org_id=org_id, record_group_id=record_group.id)
def job_update(request, org_id, record_group_id, job_id):
    """
    Update Job in one of several ways:
        - re-map and index
        - run new / different validations
    """

    # retrieve job
    cjob = CombineJob.get_combine_job(int(job_id))

    # if GET, prepare form
    if request.method == 'GET':

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()
        orig_fm_config_json = cjob.job.get_fm_config_json()

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # get update type from GET params
        update_type = request.GET.get('update_type', None)

        # render page
        return render(request, 'core/job_update.html', {
            'cjob': cjob,
            'update_type': update_type,
            'validation_scenarios': validation_scenarios,
            'field_mappers': field_mappers,
            'bulk_downloads': bulk_downloads,
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'orig_fm_config_json': orig_fm_config_json,
            'breadcrumbs': breadcrumb_parser(request)
        })

    # if POST, submit job
    if request.method == 'POST':

        LOGGER.debug('updating job')
        LOGGER.debug(request.POST)

        # retrieve job
        cjob = CombineJob.get_combine_job(int(job_id))

        # get update type
        update_type = request.POST.get('update_type', None)
        LOGGER.debug('running job update: %s', update_type)

        # handle re-index
        if update_type == 'reindex':

            # get preferred metadata index mapper
            fm_config_json = request.POST.get('fm_config_json')

            # init re-index
            cjob.reindex_bg_task(fm_config_json=fm_config_json)

            # set gms
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Re-Indexing Job:</strong><br>%s</p>'
                        '<p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, reverse('bg_tasks')),
                'class': 'success'
            })

            return redirect('job_details',
                            org_id=cjob.job.record_group.organization.id,
                            record_group_id=cjob.job.record_group.id,
                            job_id=cjob.job.id)

        # handle new validations
        if update_type == 'validations':

            # get requested validation scenarios
            validation_scenarios = request.POST.getlist(
                'validation_scenario', [])

            # get validations
            validations = ValidationScenario.objects.filter(
                id__in=[int(vs_id) for vs_id in validation_scenarios])

            # init bg task
            cjob.new_validations_bg_task([vs.id for vs in validations])

            # set gms
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Running New Validations for Job:</strong><br>%s<br>'
                        '<br><strong>Validation Scenarios:</strong><br>%s</p>'
                        '<p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, '<br>'.join([vs.name for vs in validations]), reverse('bg_tasks')),
                'class': 'success'
            })

            return redirect('job_details',
                            org_id=cjob.job.record_group.organization.id,
                            record_group_id=cjob.job.record_group.id,
                            job_id=cjob.job.id)

        # handle validation removal
        if update_type == 'remove_validation':

            # get validation scenario to remove
            jv_id = request.POST.get('jv_id', False)

            # initiate Combine BG Task
            cjob.remove_validation_bg_task(jv_id)

            # set gms
            validation_scenario = JobValidation.objects.get(
                pk=int(jv_id)).validation_scenario
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Removing Validation for Job:</strong><br>%s<br><br>'
                        '<strong>Validation Scenario:</strong><br>%s</p><p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, validation_scenario.name, reverse('bg_tasks')),
                'class': 'success'
            })

            return redirect('job_details',
                            org_id=cjob.job.record_group.organization.id,
                            record_group_id=cjob.job.record_group.id,
                            job_id=cjob.job.id)

        # handle DPLA Bulk Data comparison
        if update_type == 'dbdm':

            # get DPLA bulk data download to compare against
            dbdd_id = request.POST.get('dbdd', False)

            # initiate Combine BG Task
            cjob.dbdm_bg_task(dbdd_id)

            # set gms
            dbdd = DPLABulkDataDownload.objects.get(pk=int(dbdd_id))
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Running DPLA Bulk Data comparison for Job:</strong><br>%s<br><br>'
                        '<strong>Bulk Data S3 key:</strong><br>%s</p><p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, dbdd.s3_key, reverse('bg_tasks')),
                'class': 'success'
            })

            return redirect('job_details',
                            org_id=cjob.job.record_group.organization.id,
                            record_group_id=cjob.job.record_group.id,
                            job_id=cjob.job.id)

        # handle publish set update
        if update_type == 'publish_set':
            update_body = request.POST
            if update_body.get('publish_set_id', None):
                cjob.job.publish_set_id = update_body['publish_set_id']
            if update_body.get('existing_publish_set_id', None):
                cjob.job.publish_set_id = update_body['existing_publish_set_id']
            redirect_anchor = update_body.get('redirect_anchor', '')
            cjob.job.save()
            return redirect(reverse('job_details', args=[org_id, record_group_id, job_id]) + redirect_anchor)
def delete_jobs(request):

    LOGGER.debug('deleting jobs')

    job_ids = request.POST.getlist('job_ids[]')
    LOGGER.debug(job_ids)

    # get downstream toggle
    downstream_toggle = request.POST.get('downstream_delete_toggle', False)
    if downstream_toggle == 'true':
        downstream_toggle = True
    elif downstream_toggle == 'false':
        downstream_toggle = False

    # set of jobs to delete
    job_delete_set = set()

    # loop through job_ids
    for job_id in job_ids:

        # get CombineJob
        cjob = CombineJob.get_combine_job(job_id)

        # if including downstream
        if downstream_toggle:

            # add delete lineage for this job to set
            job_delete_set.update(cjob.job.get_downstream_jobs())

        # else, just job
        else:
            job_delete_set.add(cjob.job)

    # sort and run
    ordered_job_delete_set = sorted(list(job_delete_set), key=lambda j: j.id)

    # loop through and update visible elements of Job for front-end
    for job in ordered_job_delete_set:

        LOGGER.debug('deleting Job: %s', job)

        # set job status to deleting
        job.name = "%s (DELETING)" % job.name
        job.deleted = True
        job.status = 'deleting'
        job.save()

        # initiate Combine BG Task
        combine_task = CombineBackgroundTask(
            name='Delete Job: #%s' % job.name,
            task_type='delete_model_instance',
            task_params_json=json.dumps({
                'model': 'Job',
                'job_id': job.id
            })
        )
        combine_task.save()

        # run celery task
        bg_task = tasks.delete_model_instance.delay('Job', job.id)
        LOGGER.debug('firing bg task: %s', bg_task)
        combine_task.celery_task_id = bg_task.task_id
        combine_task.save()

    # set gms
    gmc = GlobalMessageClient(request.session)
    gmc.add_gm({
        'html': '<p><strong>Deleting Job(s):</strong><br>%s</p><p>Refresh this page to update status of removing Jobs. <button class="btn-sm btn-outline-primary" onclick="location.reload();">Refresh</button></p>' % (
            '<br>'.join([j.name for j in ordered_job_delete_set])),
        'class': 'danger'
    })

    # return
    return JsonResponse({'results': True})
def job_details(request, org_id, record_group_id, job_id):

    LOGGER.debug('details for job id: %s', job_id)

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # update status
    cjob.job.update_status()

    # detailed record count
    record_count_details = cjob.job.get_detailed_job_record_count()

    # get job lineage
    job_lineage = cjob.job.get_lineage()

    # get dpla_bulk_data_match
    dpla_bulk_data_matches = cjob.job.get_dpla_bulk_data_matches()

    # check if limiting to one, pre-existing record
    get_q = request.GET.get('q', None)

    # job details and job type specific augment
    job_detail = cjob.job.job_details_dict

    # mapped field analysis, generate if not part of job_details
    if 'mapped_field_analysis' in job_detail.keys():
        field_counts = job_detail['mapped_field_analysis']
    else:
        if cjob.job.finished:
            field_counts = cjob.count_indexed_fields()
            cjob.job.update_job_details(
                {'mapped_field_analysis': field_counts}, save=True)
        else:
            LOGGER.debug('job not finished, not setting')
            field_counts = {}

    # TODO: What is this accomplishing?
    # OAI Harvest
    if isinstance(cjob, HarvestOAIJob):
        pass

    # Static Harvest
    elif isinstance(cjob, HarvestStaticXMLJob):
        pass

    # Transform
    elif isinstance(cjob, TransformJob):
        pass

    # Merge/Duplicate
    elif isinstance(cjob, MergeJob):
        pass

    # Analysis
    elif isinstance(cjob, AnalysisJob):
        pass

    # get published records, primarily for published sets
    pub_records = PublishedRecords()

    oai_sets = Record.objects(job_id=cjob.job.id).item_frequencies(field='oai_set')

    # get published subsets with PublishedRecords static method
    published_subsets = PublishedRecords.get_subsets()

    # loop through subsets and enrich
    for _ in published_subsets:

        # add counts
        counts = mc_handle.combine.misc.find_one(
            {'_id': 'published_field_counts_%s' % _['name']})

        # if counts not yet calculated, do now
        if counts is None:
            counts = PublishedRecords(
                subset=_['name']).count_indexed_fields()
        _['counts'] = counts

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # return
    return render(request, 'core/job_details.html', {
        'cjob': cjob,
        'record_group': cjob.job.record_group,
        'record_count_details': record_count_details,
        'field_counts': field_counts,
        'field_mappers': field_mappers,
        'xml2kvp_handle': xml2kvp.XML2kvp(),
        'job_lineage_json': json.dumps(job_lineage),
        'dpla_bulk_data_matches': dpla_bulk_data_matches,
        'q': get_q,
        'job_details': job_detail,
        'pr': pub_records,
        'published_subsets': published_subsets,
        'es_index_str': cjob.esi.es_index_str,
        'breadcrumbs': breadcrumb_parser(request),
        'oai_sets': dict(oai_sets)
    })