def job_harvest_tabular_data(request, org_id, record_group_id, hash_payload_filename=False):

    """
    Create a new tabular data Harvest Job
    """

    # retrieve record group
    record_group = RecordGroup.objects.filter(id=record_group_id).first()

    # get validation scenarios
    validation_scenarios = ValidationScenario.objects.all()

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # get record identifier transformation scenarios
    rits = RecordIdentifierTransformation.objects.all()

    # get all bulk downloads
    bulk_downloads = DPLABulkDataDownload.objects.all()

    # if GET, prepare form
    if request.method == 'GET':

        # render page
        return render(
            request,
            'core/job_harvest_tabular_data.html',
            {
                'record_group': record_group,
                'validation_scenarios': validation_scenarios,
                'rits': rits,
                'field_mappers': field_mappers,
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'bulk_downloads': bulk_downloads,
                'breadcrumbs': breadcrumb_parser(request)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=HarvestTabularDataJob,
            job_params=request.POST,
            files=request.FILES,
            hash_payload_filename=hash_payload_filename)

        # start job and update status
        job_status = cjob.start_job()

        # if job failed to start, mark job status as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('record_group', org_id=org_id, record_group_id=record_group.id)
def job_analysis(request):

    """
    Run new analysis job
    """

    # if GET, prepare form
    if request.method == 'GET':

        # retrieve jobs (limiting if needed)
        input_jobs = Job.objects.all()

        # limit if analysis_type set
        analysis_type = request.GET.get('type', None)
        subset = request.GET.get('subset', None)
        if analysis_type == 'published':

            # load PublishedRecords
            published = PublishedRecords(subset=subset)

            # define input_jobs
            input_jobs = published.published_jobs

        else:
            published = None

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()

        # get record identifier transformation scenarios
        rits = RecordIdentifierTransformation.objects.all()

        # get job lineage for all jobs (filtered to input jobs scope)
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # render page
        return render(
            request,
            'core/job_analysis.html',
            {
                'job_select_type': 'multiple',
                'input_jobs': input_jobs,
                'published': published,
                'validation_scenarios': validation_scenarios,
                'rits': rits,
                'field_mappers': field_mappers,
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'analysis_type': analysis_type,
                'bulk_downloads': bulk_downloads,
                'job_lineage_json': json.dumps(job_lineage)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            # TODO: record_group=record_group,
            job_type_class=AnalysisJob,
            job_params=request.POST)

        # start job and update status
        job_status = cjob.start_job()

        # if job failed to start, mark job status as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('analysis')
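# Both views above share the same submit-and-check flow after building the
# CombineJob. A minimal sketch of factoring that pattern out; the helper name
# is hypothetical, not part of the Combine API, and it relies only on the
# .start_job() call and .job attributes already used above:
def start_job_or_flag_failure(cjob):
    """Start a CombineJob; if start_job() returns False, mark the
    underlying Job as failed, mirroring the inline handling above."""
    job_status = cjob.start_job()
    if job_status is False:
        cjob.job.status = 'failed'
        cjob.job.save()
    return job_status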
def test_static_harvest(VO):

    '''
    Test static harvest of XML records from disk
    '''

    # copy test data to /tmp
    payload_dir = '/tmp/%s' % uuid.uuid4().hex
    shutil.copytree('/opt/combine/tests/data/static_harvest_data', payload_dir)

    # emulate request.POST
    request_dict = {
        'dbdd': '',
        'job_note': '',
        'xpath_record_id': '',
        'static_filepath': payload_dir,
        'fm_config_json': '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}',
        'static_payload': '',
        'job_name': '',
        'field_mapper': 'default',
        'rits': '',
        'additional_namespace_decs': 'xmlns:mods="http://www.loc.gov/mods/v3"',
        'document_element_root': 'mods:mods'
    }
    query_dict = QueryDict('', mutable=True)
    query_dict.update(request_dict)

    # init job, using Variable Object (VO)
    cjob = CombineJob.init_combine_job(
        user=VO.user,
        record_group=VO.rg,
        job_type_class=HarvestStaticXMLJob,
        job_params=query_dict,
        files={},
        hash_payload_filename=False)

    # start job and update status
    job_status = cjob.start_job()

    # if job failed to start, mark job status as failed
    if job_status is False:
        cjob.job.status = 'failed'
        cjob.job.save()

    # poll until complete
    for _ in range(480):

        # pause
        time.sleep(1)

        # refresh job status
        cjob.job.update_status()

        # break once the job reports available
        if cjob.job.status == 'available':
            break

    # save static harvest job to VO
    VO.static_harvest_cjob = cjob

    # remove payload_dir
    shutil.rmtree(payload_dir)

    # assert job is done and available via livy
    assert VO.static_harvest_cjob.job.status == 'available'

    # assert record count is 250
    assert VO.static_harvest_cjob.job.record_count == 250

    # assert no indexing failures
    assert len(VO.static_harvest_cjob.get_indexing_failures()) == 0
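# The tests in this module repeat the same poll-until-available loop. A sketch
# of factoring that wait into a helper; the name and keyword arguments are
# hypothetical, not part of the existing test suite:
def wait_for_available(cjob, timeout=480, interval=1):
    """Poll cjob.job.update_status() up to `timeout` times, sleeping `interval`
    seconds between checks; return True once the job reports 'available'."""
    for _ in range(timeout):
        time.sleep(interval)
        cjob.job.update_status()
        if cjob.job.status == 'available':
            return True
    return False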
def test_merge_duplicate(VO):

    '''
    Merge the static harvest and transform jobs, filtering duplicate record ids
    and applying the newly created validation scenarios
    '''

    # emulate request.POST
    request_dict = {
        'dbdd': '',
        'field_mapper': 'default',
        'filter_dupe_record_ids': 'true',
        'fm_config_json': '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}',
        'input_es_query_valve': '',
        'input_numerical_valve': '',
        'input_validity_valve': 'all',
        'job_name': '',
        'job_note': '',
        'rits': ''
    }
    query_dict = QueryDict('', mutable=True)
    query_dict.update(request_dict)

    # set input jobs with QueryDict.setlist
    query_dict.setlist(
        'input_job_id',
        [VO.static_harvest_cjob.job.id, VO.static_transform_cjob.job.id])

    # set validation scenarios with QueryDict.setlist
    query_dict.setlist('validation_scenario', [
        VO.schematron_validation_scenario.id,
        VO.python_validation_scenario.id
    ])

    # init job
    cjob = CombineJob.init_combine_job(
        user=VO.user,
        record_group=VO.rg,
        job_type_class=MergeJob,
        job_params=query_dict)

    # start job and update status
    job_status = cjob.start_job()

    # if job failed to start, mark job status as failed
    if job_status is False:
        cjob.job.status = 'failed'
        cjob.job.save()

    # poll until complete
    for _ in range(480):

        # pause
        time.sleep(1)

        # refresh job status
        cjob.job.update_status()

        # break once the job reports available
        if cjob.job.status == 'available':
            break

    # save merge job to VO
    VO.merge_cjob = cjob

    # assert job is done and available via livy
    assert VO.merge_cjob.job.status == 'available'

    # assert record count is 250 (two 250-record inputs deduplicated on
    # record id, rather than 500)
    assert VO.merge_cjob.job.record_count == 250

    # assert validation scenarios applied
    job_validation_scenarios = VO.merge_cjob.job.jobvalidation_set.all()
    assert job_validation_scenarios.count() == 2

    # loop through validation scenarios and confirm that each shows 232 failures
    for jv in job_validation_scenarios:
        assert jv.get_record_validation_failures().count() == 232

    # assert no indexing failures
    assert len(VO.merge_cjob.get_indexing_failures()) == 0
def test_static_transform(VO):

    '''
    Test static transform of the harvested XML records
    '''

    # prepare and capture temporary transformation scenario
    VO.transformation_scenario = prepare_transform()

    # emulate request.POST
    request_dict = {
        'dbdd': '',
        'field_mapper': 'default',
        'filter_dupe_record_ids': 'true',
        'fm_config_json': '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}',
        'input_es_query_valve': '',
        'input_job_id': VO.static_harvest_cjob.job.id,
        'input_numerical_valve': '',
        'input_validity_valve': 'all',
        'job_name': '',
        'job_note': '',
        'rits': '',
        'sel_trans_json': '[{"index":0,"trans_id":%s}]' % VO.transformation_scenario.id
    }
    query_dict = QueryDict('', mutable=True)
    query_dict.update(request_dict)

    # init job
    cjob = CombineJob.init_combine_job(
        user=VO.user,
        record_group=VO.rg,
        job_type_class=TransformJob,
        job_params=query_dict)

    # start job and update status
    job_status = cjob.start_job()

    # if job failed to start, mark job status as failed
    if job_status is False:
        cjob.job.status = 'failed'
        cjob.job.save()

    # poll until complete
    for _ in range(480):

        # pause
        time.sleep(1)

        # refresh job status
        cjob.job.update_status()

        # break once the job reports available
        if cjob.job.status == 'available':
            break

    # save static transform job to VO
    VO.static_transform_cjob = cjob

    # assert job is done and available via livy
    assert VO.static_transform_cjob.job.status == 'available'

    # assert record count is 250
    assert VO.static_transform_cjob.job.record_count == 250

    # assert no indexing failures
    assert len(VO.static_transform_cjob.get_indexing_failures()) == 0

    # remove transformation
    assert VO.transformation_scenario.delete()[0] > 0
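# sel_trans_json above is a JSON array ordering the transformation scenarios to
# apply. A sketch of building that payload with json.dumps rather than string
# formatting, which keeps the quoting correct as more scenarios are chained;
# the helper name is illustrative, not part of the test suite:
import json

def build_sel_trans_json(scenario_ids):
    """Return a sel_trans_json payload for an ordered list of scenario ids."""
    return json.dumps(
        [{'index': i, 'trans_id': sid} for i, sid in enumerate(scenario_ids)])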
def job_merge(request, org_id, record_group_id):

    """
    Merge multiple jobs into a single job
    """

    # retrieve record group
    record_group = RecordGroup.objects.get(pk=record_group_id)

    # if GET, prepare form
    if request.method == 'GET':

        # get scope of input jobs and retrieve
        input_job_scope = request.GET.get('scope', None)

        # if all jobs, retrieve all jobs
        if input_job_scope == 'all_jobs':
            input_jobs = Job.objects.exclude(
                job_type='AnalysisJob').all()

        # else, limit to RecordGroup
        else:
            input_jobs = record_group.job_set.all()

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get record identifier transformation scenarios
        rits = RecordIdentifierTransformation.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()

        # get job lineage for all jobs (filtered to input jobs scope)
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # render page
        return render(request, 'core/job_merge.html', {
            'job_select_type': 'multiple',
            'record_group': record_group,
            'input_jobs': input_jobs,
            'input_job_scope': input_job_scope,
            'validation_scenarios': validation_scenarios,
            'rits': rits,
            'field_mappers': field_mappers,
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'job_lineage_json': json.dumps(job_lineage),
            'bulk_downloads': bulk_downloads,
            'breadcrumbs': breadcrumb_parser(request)
        })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=MergeJob,
            job_params=request.POST)

        # start job and update status
        job_status = cjob.start_job()

        # if job failed to start, mark job status as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('record_group', org_id=org_id, record_group_id=record_group.id)
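# A minimal sketch of exercising job_merge end-to-end with Django's test
# client. The URL path is an assumption for illustration only; the real route
# should be taken from the project's urls.py. Assumes an existing user and two
# existing input job ids.
from django.test import Client

def sketch_post_merge(user, org_id, record_group_id, input_job_ids):
    client = Client()
    client.force_login(user)
    response = client.post(
        '/organization/%s/record_group/%s/job/merge' % (org_id, record_group_id),
        data={'input_job_id': input_job_ids, 'job_name': '', 'job_note': ''})
    # on success, the view redirects back to the record group page
    assert response.status_code == 302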