Code example #1
File: tasks.py  Project: mlibrary/combine
def create_validation_report(ct_id):

	'''
	Function to generate a Validation Report for a Job as a bg task

	Args:
		ct_id (int): primary key of the CombineBackgroundTask (ct) whose task_params carry the report parameters

	Returns:
		None: the report's location on disk is written to the task's task_output_json
	'''

	# get CombineTask (ct)
	ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))

	# get CombineJob
	cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

	logger.info(ct.task_params)

	try:

		# check for livy session
		_check_livy_session()

		# set output path
		output_path = '/tmp/%s' % uuid.uuid4().hex

		# generate spark code
		spark_code = "from console import *\ngenerate_validation_report(spark, '%(output_path)s', %(task_params)s)" % {
			'output_path':output_path,
			'task_params':ct.task_params
		}
		logger.info(spark_code)

		# submit to livy
		logger.info('submitting code to Spark')
		submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code})

		# poll until complete
		logger.info('polling for Spark job to complete...')
		results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True)
		logger.info(results)

		# set archive filename of loose XML files
		archive_filename_root = '/tmp/%s.%s' % (ct.task_params['report_name'],ct.task_params['report_format'])

		# loop through partitioned parts, coalesce and write to single file
		logger.info('coalescing output parts')

		# glob parts
		export_parts = glob.glob('%s/part*' % output_path)
		logger.info('found %s documents to group' % len(export_parts))

		# if output not found, exit
		if len(export_parts) == 0:
			ct.task_output_json = json.dumps({
				'error':'no output found',
				'spark_output':results
			})
			ct.save()

		# else, continue
		else:

			# set report_format
			report_format = ct.task_params['report_format']

			# open new file for writing and loop through files
			with open(archive_filename_root, 'w') as fout, fileinput.input(export_parts) as fin:

				# if CSV or TSV, write first line of headers
				if report_format == 'csv':
					header_string = 'db_id,record_id,validation_scenario_id,validation_scenario_name,results_payload,fail_count'
					if len(ct.task_params['mapped_field_include']) > 0:
						header_string += ',' + ','.join(ct.task_params['mapped_field_include'])
					fout.write('%s\n' % header_string)

				if report_format == 'tsv':
					header_string = 'db_id\trecord_id\tvalidation_scenario_id\tvalidation_scenario_name\tresults_payload\tfail_count'
					if len(ct.task_params['mapped_field_include']) > 0:
						header_string += '\t' + '\t'.join(ct.task_params['mapped_field_include'])
					fout.write('%s\n' % header_string)

				# loop through output and write
				for line in fin:
					fout.write(line)

			# removing partitioned output
			logger.info('removing dir: %s' % output_path)
			shutil.rmtree(output_path)

			# optionally, compress file
			if ct.task_params['compression_type'] == 'none':
				logger.info('no compression requested, continuing')
				output_filename = archive_filename_root

			elif ct.task_params['compression_type'] == 'zip':

				logger.info('creating compressed zip archive')
				report_format = 'zip'

				# establish output archive file
				output_filename = '%s.zip' % (archive_filename_root)

				with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zip_file:
					zip_file.write(archive_filename_root, archive_filename_root.split('/')[-1])

			# tar.gz
			elif ct.task_params['compression_type'] == 'targz':

				logger.info('creating compressed tar archive')
				report_format = 'targz'

				# establish output archive file
				output_filename = '%s.tar.gz' % (archive_filename_root)

				with tarfile.open(output_filename, 'w:gz') as tar:
					tar.add(archive_filename_root, arcname=archive_filename_root.split('/')[-1])

			# save validation report output to Combine Task output
			ct.task_output_json = json.dumps({
				'report_format':report_format,
				'mapped_field_include':ct.task_params['mapped_field_include'],
				'output_dir':output_path,
				'output_filename':output_filename,
				'results':results
			})
			ct.save()

	except Exception as e:

		logger.info(str(e))

		# attempt to capture error and return for task
		ct.task_output_json = json.dumps({
			'error':str(e)
		})
		ct.save()
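
A note on the polling call above: polling.poll is handed a check_success callback named spark_job_done, which is defined elsewhere in tasks.py and not shown in these excerpts. A minimal sketch of what such a callback could look like, assuming the dict it receives is the JSON body of a Livy statement (whose lifecycle is reported in a 'state' field, per the Livy REST API):

def spark_job_done(response):
	# hypothetical sketch only, not the project's actual helper:
	# a Livy statement reports 'waiting', 'running', then 'available' when finished
	return response['state'] == 'available'
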
Code example #2
File: tasks.py  Project: mlibrary/combine
def job_new_validations(ct_id):

	'''
	Background task to run additional Validation Scenarios against an existing Job

	- submit Livy job and poll until complete
		- reuse the Livy session attached to cjob (works, but an awkward way to get it)
	'''

	# get CombineTask (ct)
	try:

		# check for livy session
		_check_livy_session()

		ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
		logger.info('using %s' % ct)

		# get CombineJob
		cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

		# generate spark code
		spark_code = 'from jobs import RunNewValidationsSpark\nRunNewValidationsSpark(spark, job_id="%(job_id)s", validation_scenarios="%(validation_scenarios)s").spark_function()' % {
			'job_id':cjob.job.id,
			'validation_scenarios':str([ int(vs_id) for vs_id in ct.task_params['validation_scenarios'] ]),
		}
		logger.info(spark_code)

		# submit to livy
		logger.info('submitting code to Spark')
		submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code})

		# poll until complete
		logger.info('polling for Spark job to complete...')
		results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True)
		logger.info(results)

		# loop through existing validation jobs and remove from the DB any that share a validation scenario
		cjob.job.remove_validation_jobs(validation_scenarios=[ int(vs_id) for vs_id in ct.task_params['validation_scenarios'] ])

		# update job_details
		cjob.job.refresh_from_db()
		# remove validation results
		cjob.job.job_details = json.dumps({ k:v for k,v in cjob.job.job_details_dict.items() if k != 'validation_results' })
		cjob.job.save()
		# update scenarios
		validation_scenarios = cjob.job.job_details_dict['validation_scenarios']
		validation_scenarios.extend(ct.task_params['validation_scenarios'])
		cjob.job.update_job_details({
			'validation_scenarios':validation_scenarios
			}, save=True)

		# write validation links
		logger.info('writing validations job links')
		for vs_id in ct.task_params['validation_scenarios']:
			val_job = models.JobValidation(
				job=cjob.job,
				validation_scenario=models.ValidationScenario.objects.get(pk=vs_id)
			)
			val_job.save()

		# update failure counts
		logger.info('updating failure counts for new validation jobs')
		for jv in cjob.job.jobvalidation_set.filter(failure_count=None):
			jv.validation_failure_count(force_recount=True)

		# save export output to Combine Task output
		ct.refresh_from_db()
		ct.task_output_json = json.dumps({
			'run_new_validations':results
		})
		ct.save()

	except Exception as e:

		logger.info(str(e))

		# attempt to capture error and return for task
		ct.task_output_json = json.dumps({
			'error':str(e)
		})
		ct.save()
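
The spark_code template above is plain %-interpolation, so the snippet actually submitted to Livy can be inspected by rendering it with sample values. A standalone illustration (example IDs only, outside of any task):

# illustration of how the spark_code template above renders (example values only)
task_params = {'validation_scenarios': ['3', '7']}
spark_code = 'from jobs import RunNewValidationsSpark\nRunNewValidationsSpark(spark, job_id="%(job_id)s", validation_scenarios="%(validation_scenarios)s").spark_function()' % {
	'job_id': 42,
	'validation_scenarios': str([int(vs_id) for vs_id in task_params['validation_scenarios']]),
}
print(spark_code)
# from jobs import RunNewValidationsSpark
# RunNewValidationsSpark(spark, job_id="42", validation_scenarios="[3, 7]").spark_function()
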
Code example #3
File: tasks.py  Project: mlibrary/combine
def job_reindex(ct_id):

	'''
	Background task to re-index a Job with an updated field mapper configuration

	- submit Livy job and poll until complete
		- reuse the Livy session attached to cjob (works, but an awkward way to get it)
	'''

	# get CombineTask (ct)
	try:

		# check for livy session
		_check_livy_session()

		ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
		logger.info('using %s' % ct)

		# get CombineJob
		cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

		# drop Job's ES index
		cjob.job.drop_es_index(clear_mapped_field_analysis=False)

		# drop previous index mapping failures
		cjob.job.remove_mapping_failures_from_db()

		# generate spark code
		spark_code = 'from jobs import ReindexSparkPatch\nReindexSparkPatch(spark, job_id="%(job_id)s", fm_config_json=\'\'\'%(fm_config_json)s\'\'\').spark_function()' % {
			'job_id':cjob.job.id,
			'fm_config_json':ct.task_params['fm_config_json']
		}

		# submit to livy
		logger.info('submitting code to Spark')
		submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code})

		# poll until complete
		logger.info('polling for Spark job to complete...')
		results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True)
		logger.info(results)

		# get new mapping
		mapped_field_analysis = cjob.count_indexed_fields()
		cjob.job.update_job_details({
			'field_mapper_config':json.loads(ct.task_params['fm_config_json']),
			'mapped_field_analysis':mapped_field_analysis
			}, save=True)

		# save export output to Combine Task output
		ct.refresh_from_db()
		ct.task_output_json = json.dumps({
			'reindex_results':results
		})
		ct.save()

	except Exception as e:

		logger.info(str(e))

		# attempt to capture error and return for task
		ct.task_output_json = json.dumps({
			'error':str(e)
		})
		ct.save()
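
models.LivyClient is Combine's wrapper around the Apache Livy REST API and its implementation is not shown here. For orientation, the submit-then-poll pattern used throughout these tasks corresponds roughly to the following raw HTTP calls against Livy's statements endpoint (a sketch using requests, with an assumed Livy URL and session id; not the project's actual client code):

import time
import requests

livy_base = 'http://localhost:8998'   # assumed Livy endpoint
session_id = 0                        # assumed active Livy session
spark_code = 'spark.range(10).count()'

# POST the code as a statement; Livy responds 201 with a Location header
resp = requests.post('%s/sessions/%s/statements' % (livy_base, session_id), json={'code': spark_code})
statement_url = livy_base + resp.headers['Location']

# poll the statement until it reaches a terminal state
while True:
	status = requests.get(statement_url).json()
	if status['state'] in ('available', 'error', 'cancelled'):
		break
	time.sleep(5)
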
Code example #4
File: tasks.py  Project: mlibrary/combine
def export_documents(ct_id):

	'''
	Background task to export a Job's (or the published set's) Records as XML documents

	- submit Livy job and poll until complete
		- reuse the Livy session attached to cjob (works, but an awkward way to get it)
	- add wrapper element to file parts
	- rename file parts
	- tar/zip together
	'''

	# get CombineBackgroundTask
	ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
	logger.info('using %s' % ct)

	# generate spark code
	output_path = '/tmp/%s' % str(uuid.uuid4())

	# handle single Job
	if 'job_id' in ct.task_params.keys():

		# get CombineJob
		cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

		# set archive filename of loose XML files
		archive_filename_root = 'j_%s_documents' % cjob.job.id

		# build job_dictionary
		job_dict = {'j%s' % cjob.job.id: [cjob.job.id]}
		logger.info(job_dict)

	# handle published records
	if 'published' in ct.task_params.keys():

		# set archive filename of loose XML files
		archive_filename_root = 'published_documents'

		# get anonymous CombineJob
		cjob = models.CombineJob()

		# get published records to determine sets
		pr = models.PublishedRecords(subset=ct.task_params['subset'])

		# init job dictionary
		job_dict = {}

		# handle published jobs with publish set ids
		for publish_id, jobs in pr.sets.items():
			job_dict[publish_id] = [ job.id for job in jobs ]

		# handle "loose" Jobs
		job_dict['no_publish_set_id'] = [job.id for job in pr.published_jobs.filter(publish_set_id='')]

		# debug
		logger.info(job_dict)

	# update task params
	ct.refresh_from_db()
	ct.update_task_params({
		'output_path':output_path,
		'archive_filename_root':archive_filename_root,
		'job_dict':job_dict
	})

	# prepare spark code
	spark_code = "import math,uuid\nfrom console import *\nexport_records_as_xml(spark, %d)" % (int(ct_id))
	logger.info(spark_code)

	try:

		# check for livy session
		_check_livy_session()

		# submit to livy
		logger.info('submitting code to Spark')
		submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code})

		# poll until complete
		logger.info('polling for Spark job to complete...')
		results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True)
		logger.info(results)

		# handle s3 bucket
		if ct.task_params.get('s3_export', False):

			if ct.task_params.get('s3_export_type') == 'archive':

				logger.debug('writing archive file to S3')

				# create single archive file
				ct = _create_export_documents_archive(ct)

				# upload to s3
				s3 = boto3.resource('s3',
					aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
					aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
				# upload via a context manager so the archive file handle is closed after the PUT
				with open(ct.task_params['export_output_archive'], 'rb') as archive_file:
					s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key'])\
						.put(Body=archive_file)

				# delete all traces from local output
				shutil.rmtree(ct.task_params['output_path'])

			elif ct.task_params.get('s3_export_type') == 'spark_df':
				logger.debug('s3 export type was spark_df, nothing to cleanup or do')

			# save export output to Combine Task output
			ct.refresh_from_db()
			ct.task_output_json = json.dumps({
				's3_export_type':ct.task_params['s3_export_type'],
				'export_output':'s3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')),
			})
			ct.save()
			logger.info(ct.task_output_json)

		# handle local filesystem
		else:

			# create single archive file
			ct = _create_export_documents_archive(ct)

			# save export output to Combine Task output
			ct.refresh_from_db()
			ct.task_output_json = json.dumps({
				'export_output':ct.task_params['export_output_archive'],
				'name':ct.task_params['export_output_archive'].split('/')[-1],
				'content_type':ct.task_params['content_type'],
				'export_dir':"/".join(ct.task_params['export_output_archive'].split('/')[:-1])
			})
			ct.save()
			logger.info(ct.task_output_json)

	except Exception as e:

		logger.info(str(e))

		# attempt to capture error and return for task
		ct.task_output_json = json.dumps({
			'error':str(e)
		})
		ct.save()
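
The S3 branch above uploads the finished archive with a single put() of the file's contents. For large exports, boto3's managed transfer layer can stream the file from disk and switch to multipart uploads automatically; a sketch of the equivalent upload with the same bucket/key parameters (assuming the same Django settings are available):

import boto3
from django.conf import settings

# managed upload: streams from disk and uses multipart transfer for large archives
s3 = boto3.resource('s3',
	aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
	aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key'])\
	.upload_file(ct.task_params['export_output_archive'])
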
Code example #5
File: tasks.py  Project: mlibrary/combine
def job_dbdm(ct_id):

	'''
	Background task to run DPLA Bulk Data Match (DBDM) against a Job's Records
	'''

	# get CombineTask (ct)
	try:

		# check for livy session
		_check_livy_session()

		ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
		logger.info('using %s' % ct)

		# get CombineJob
		cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

		# set dbdm as False for all Records in Job
		clear_result = models.mc_handle.combine.record.update_many({'job_id':cjob.job.id},{'$set':{'dbdm':False}}, upsert=False)

		# generate spark code
		spark_code = 'from jobs import RunDBDM\nRunDBDM(spark, job_id="%(job_id)s", dbdd_id=%(dbdd_id)s).spark_function()' % {
			'job_id':cjob.job.id,
			'dbdd_id':int(ct.task_params['dbdd_id'])
		}
		logger.info(spark_code)

		# submit to livy
		logger.info('submitting code to Spark')
		submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code})

		# poll until complete
		logger.info('polling for Spark job to complete...')
		results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True)
		logger.info(results)

		# update job_details
		cjob.job.refresh_from_db()

		# get dbdd
		dbdd = models.DPLABulkDataDownload.objects.get(pk=int(ct.task_params['dbdd_id']))
		cjob.job.update_job_details({
			'dbdm':{
				'dbdd':int(ct.task_params['dbdd_id']),
				'dbdd_s3_key':dbdd.s3_key,
				'matches':None,
				'misses':None
			}
		})

		# save export output to Combine Task output
		ct.refresh_from_db()
		ct.task_output_json = json.dumps({
			'job_id':ct.task_params['job_id'],
			'dbdd_id':ct.task_params['dbdd_id'],
			'dbdd_results':results
		})
		ct.save()
		logger.info(ct.task_output_json)

	except Exception as e:

		logger.info(str(e))

		# attempt to capture error and return for task
		ct.task_output_json = json.dumps({
			'error':str(e)
		})
		ct.save()
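
The update_many call above resets the dbdm flag through pymongo and returns an UpdateResult, which the task currently ignores. If confirmation is wanted, the matched/modified counts on that result can be logged; a small sketch, assuming models.mc_handle is a pymongo client as the call signature suggests:

# update_many returns a pymongo UpdateResult; its counts confirm how many
# Records had their dbdm flag reset before the re-match runs
clear_result = models.mc_handle.combine.record.update_many(
	{'job_id': cjob.job.id},
	{'$set': {'dbdm': False}},
	upsert=False)
logger.info('dbdm flags reset: matched %s, modified %s' % (
	clear_result.matched_count, clear_result.modified_count))
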
Code example #6
File: tasks.py  Project: mlibrary/combine
def job_remove_validation(ct_id):

	'''
	Task to remove a validation, and all failures, from a Job
	'''

	# get CombineTask (ct)
	try:

		# check for livy session
		_check_livy_session()

		ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
		logger.info('using %s' % ct)

		# get CombineJob
		cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

		# get Job Validation and delete
		jv = models.JobValidation.objects.get(pk=int(ct.task_params['jv_id']))

		# delete validation failures associated with Validation Scenario and Job
		delete_results = jv.delete_record_validation_failures()

		# update valid field in Records via Spark
		# generate spark code
		spark_code = 'from jobs import RemoveValidationsSpark\nRemoveValidationsSpark(spark, job_id="%(job_id)s", validation_scenarios="%(validation_scenarios)s").spark_function()' % {
			'job_id':cjob.job.id,
			'validation_scenarios':str([ jv.validation_scenario.id ]),
		}
		logger.info(spark_code)

		# submit to livy
		logger.info('submitting code to Spark')
		submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code})

		# poll until complete
		logger.info('polling for Spark job to complete...')
		results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True)
		logger.info(results)

		# remove Job Validation from job_details
		cjob.job.refresh_from_db()
		# remove validation results
		cjob.job.job_details = json.dumps({ k:v for k,v in cjob.job.job_details_dict.items() if k != 'validation_results' })
		cjob.job.save()
		validation_scenarios = cjob.job.job_details_dict['validation_scenarios']
		if jv.validation_scenario.id in validation_scenarios:
			validation_scenarios.remove(jv.validation_scenario.id)
		cjob.job.update_job_details({
			'validation_scenarios':validation_scenarios
			}, save=True)

		# save export output to Combine Task output
		ct.refresh_from_db()
		ct.task_output_json = json.dumps({
			'delete_job_validation':str(jv),
			'validation_failures_removed_':delete_results
		})
		ct.save()

		# remove job validation link
		jv.delete()

	except Exception as e:

		logger.info(str(e))

		# attempt to capture error and return for task
		ct.task_output_json = json.dumps({
			'error':str(e)
		})
		ct.save()
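
All six tasks end with the same error-capture pattern: catch any exception, serialize it into task_output_json, and save the CombineBackgroundTask. One way to factor that out is a decorator; the sketch below is illustrative only and not part of the Combine codebase (it assumes the module-level logger and models used throughout tasks.py):

import json
from functools import wraps

def capture_task_errors(task_func):
	'''Hypothetical decorator: record any exception on the task's output JSON'''
	@wraps(task_func)
	def wrapper(ct_id):
		try:
			return task_func(ct_id)
		except Exception as e:
			logger.info(str(e))
			ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
			ct.task_output_json = json.dumps({'error': str(e)})
			ct.save()
	return wrapper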