def run_manual_code(study_id):
    """
    Create an AWS Batch job for the Study specified
    :param study_id: Primary key of a Study
    """
    # We assume that the cluster is configured in only one region.
    pipeline_region = get_current_region()

    # Get the object ID of the study, used in the pipeline
    query = Study.objects.filter(pk=study_id)
    if not query.exists():
        return abort(404)
    object_id = query.get().object_id

    error_sentry = make_error_sentry("data", tags={"pipeline_frequency": "manually"})
    # Get new data access credentials for the manual user, submit a manual job, display message.
    # Report all errors to Sentry, including DataPipelineNotConfigured errors.
    with error_sentry:
        ssm_client = get_boto_client('ssm', pipeline_region)
        refresh_data_access_credentials('manually', ssm_client=ssm_client, webserver=True)
        batch_client = get_boto_client('batch', pipeline_region)
        create_one_job('manually', object_id, batch_client, webserver=True)
        flash('Data pipeline code successfully initiated!', 'success')

    if error_sentry.errors:
        flash('An unknown error occurred when trying to run this task.', category='danger')
        print(error_sentry)

    return redirect('/data-pipeline/{:s}'.format(study_id))
def terminate_pipeline(study_id):
    """
    Terminate an AWS Batch job for the Study specified
    :param study_id: Primary key of a Study
    """
    username = session["admin_username"]
    pipeline_id = request.values['pipeline_id']
    flash('terminating pipeline {0}'.format(pipeline_id))

    # We assume that the cluster is configured in only one region.
    pipeline_region = get_current_region()

    error_sentry = make_error_sentry(
        "data", tags={"pipeline_frequency": "terminate_job manually"})
    # Terminate the batch job and display a message.
    # Report all errors to Sentry, including DataPipelineNotConfigured errors.
    with error_sentry:
        batch_client = get_boto_client('batch', pipeline_region)
        terminate_job(pipeline_id, username, batch_client)

    if error_sentry.errors:
        flash(
            'An error occurred when trying to terminate the pipeline {0}: {1}'.format(
                pipeline_id, error_sentry),
            category='danger')
        print(error_sentry)
    else:
        flash('Pipeline {0} terminated.'.format(pipeline_id), 'success')

    return redirect('/data-pipeline/{:s}'.format(study_id))
def refresh_data_access_credentials(freq, ssm_client=None, webserver=False):
    """
    Refresh the data access credentials for a particular BATCH USER and upload them
    (encrypted) to the AWS Parameter Store. This enables AWS Batch jobs to get the
    credentials and thereby access the data access API (DAA).
    :param freq: string, one of 'hourly' | 'daily' | 'weekly' | 'monthly' | 'manually'.
        This is used to determine what to call the data access credentials on AWS.
    :param ssm_client: a credentialed boto3 SSM client, or None to create one
    :param webserver: if True, read the AWS parameters from the Elastic Beanstalk config
    """
    # Get or create a Researcher with no password. This means that nobody can log in as
    # this Researcher in the web interface.
    researcher_name = 'BATCH USER {}'.format(freq)
    mock_researchers = Researcher.objects.filter(username=researcher_name)
    if not mock_researchers.exists():
        mock_researcher = Researcher.create_without_password(researcher_name)
    else:
        mock_researcher = mock_researchers.get()
    mock_researcher.save()

    # Ensure that the Researcher is attached to all Studies. This allows them to access
    # all data via the DAA.
    for study in Study.objects.all():
        StudyRelation.objects.get_or_create(
            study=study,
            researcher=mock_researcher,
            relationship=ResearcherRole.researcher,
            is_batch_user=True,
        )

    # Reset the credentials. This ensures that they aren't stale.
    access_key, secret_key = mock_researcher.reset_access_credentials()

    if not webserver:
        generic_config = get_generic_config()
    else:
        generic_config = get_eb_config()

    # Append the frequency to the SSM (AWS Systems Manager) names. This ensures that the
    # different frequency jobs' keys do not overwrite each other.
    access_key_ssm_name = '{}-{}'.format(generic_config['access_key_ssm_name'], freq)
    secret_key_ssm_name = '{}-{}'.format(generic_config['secret_key_ssm_name'], freq)

    # Put the credentials (encrypted) into the AWS Parameter Store
    if not ssm_client:
        ssm_client = get_boto_client('ssm')
    ssm_client.put_parameter(
        Name=access_key_ssm_name,
        Value=access_key,
        Type='SecureString',
        Overwrite=True,
    )
    ssm_client.put_parameter(
        Name=secret_key_ssm_name,
        Value=secret_key,
        Type='SecureString',
        Overwrite=True,
    )
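# Sketch of the consumer side: how a batch job can read these credentials back out of the
# Parameter Store. This mirrors the put_parameter calls above; WithDecryption=True is needed
# because the values are stored as SecureString. The region and parameter name below are
# assumptions for illustration; real names follow the '{base}-{freq}' convention, with base
# names taken from the generic/EB config.
import boto3

ssm = boto3.client('ssm', region_name='us-east-1')  # region is an assumption
access_key = ssm.get_parameter(
    Name='access-key-ssm-name-manually',  # hypothetical '{base}-{freq}' name
    WithDecryption=True,  # decrypt the SecureString value
)['Parameter']['Value']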
def create_one_job(freq, study, patient_id, client=None, webserver=False):
    """
    Create an AWS Batch job.
    The client parameter is optional. It is provided in case this function is run as part
    of a loop, to avoid an unnecessarily large number of file operations or API calls.
    :param freq: string, e.g. 'daily', 'manually'
    :param study: a Study database object
    :param patient_id: string, a patient id
    :param client: a credentialed boto3 client, or None
    :param webserver: if True, read the AWS parameters from the Elastic Beanstalk config
    The config needs the following keys: job_name, job_defn_name, queue_name.
    """
    # Get the AWS parameters and client if not provided
    if not webserver:
        aws_object_names = get_generic_config()
    else:
        aws_object_names = get_eb_config()

    # requires region_name to be defined
    if client is None:
        client = get_boto_client('batch')

    client.submit_job(
        jobName=aws_object_names['job_name'].format(freq=freq),
        jobDefinition=aws_object_names['job_defn_name'],
        jobQueue=aws_object_names['queue_name'],
        containerOverrides={
            'environment': [
                {
                    'name': 'study_object_id',
                    'value': str(study.object_id),
                },
                {
                    'name': 'study_name',
                    'value': study.name,
                },
                {
                    'name': 'FREQ',
                    'value': freq,
                },
                {
                    'name': 'patient_id',
                    'value': patient_id,
                },
                {
                    'name': 'server_url',
                    'value': DOMAIN_NAME,
                },
            ],
        },
    )
def create_one_job(freq, object_id, aws_object_names=None, client=None):
    """
    Create an AWS Batch job.
    The aws_object_names and client parameters are optional. They are provided in case
    this function is run as part of a loop, to avoid an unnecessarily large number of
    file operations or API calls.
    :param freq: string, e.g. 'daily', 'manually'
    :param object_id: string representing the Study object_id, e.g. '56325d8297013e33a2e57736'
    :param aws_object_names: dict containing various parameters for the batch job, or None
    :param client: a credentialed boto3 client, or None
    """
    # Get the AWS parameters and client if not provided
    if aws_object_names is None:
        aws_object_names = get_aws_object_names()
    if client is None:
        client = get_boto_client('batch')

    client.submit_job(
        jobName=aws_object_names['job_name'].format(freq=freq),
        jobDefinition=aws_object_names['job_defn_name'],
        jobQueue=aws_object_names['queue_name'],
        containerOverrides={
            'environment': [
                {
                    'name': 'study_object_id',
                    'value': str(object_id),
                },
                {
                    'name': 'study_name',
                    'value': Study.objects.get(object_id=object_id).name,
                },
                {
                    'name': 'FREQ',
                    'value': freq,
                },
            ],
        },
    )
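# Usage sketch for the loop scenario the docstring describes: fetch the config dict and the
# boto3 client once, then reuse them for every Study. The helpers are the same ones used
# above; the 'hourly' frequency is just an example value.
aws_object_names = get_aws_object_names()
batch_client = get_boto_client('batch')
for study in Study.objects.all():
    create_one_job('hourly', study.object_id, aws_object_names, batch_client)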
def terminate_job(pipeline_id, user_id, client=None):
    """
    Terminate the AWS Batch job backing the given pipeline and record the termination.
    :param pipeline_id: primary key of a PipelineExecutionTracking row
    :param user_id: the user requesting the termination
    :param client: a credentialed boto3 client, or None
    """
    # Get the AWS parameters and client if not provided
    aws_object_names = get_generic_config()

    # requires region_name to be defined
    if client is None:
        client = get_boto_client('batch')

    pipeline = PipelineExecutionTracking.objects.get(id=pipeline_id)
    if not pipeline.batch_job_id:
        raise ValueError(
            'Error terminating pipeline {0}, batch job id not found'.format(pipeline_id))

    client.terminate_job(
        jobId=pipeline.batch_job_id,
        reason='Terminated by user {0}'.format(user_id))
    pipeline.terminate_job(
        pipeline_id, datetime.datetime.now(),
        reason='Terminated by user {0}'.format(user_id))
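# Sketch: terminate_job is asynchronous on the AWS side, so a caller that wants to confirm
# the termination can poll the standard boto3 Batch describe_jobs call; the job passes
# through intermediate states before reaching FAILED. This helper is illustrative and is not
# part of the original codebase.
def get_batch_job_status(batch_job_id, client=None):
    if client is None:
        client = get_boto_client('batch')
    response = client.describe_jobs(jobs=[batch_job_id])
    # one of SUBMITTED | PENDING | RUNNABLE | STARTING | RUNNING | SUCCEEDED | FAILED
    return response['jobs'][0]['status']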
def run_manual_code(study_id):
    """
    Create an AWS Batch job for the Study specified
    :param study_id: Primary key of a Study
    """
    username = session["admin_username"]

    destination_email_addresses_string = ''
    if 'destination_email_addresses' in request.values:
        destination_email_addresses_string = request.values['destination_email_addresses']
    destination_email_addresses = [
        d.strip() for d in
        filter(None, re.split(r"[, \?:;]+", destination_email_addresses_string))
    ]
    for email_address in destination_email_addresses:
        if not validate_email(email_address):
            flash(
                'Email address {0} in ({1}) does not appear to be a valid email address.'
                .format(email_address, destination_email_addresses_string),
                category='danger')
            return redirect('/data-pipeline/{:s}'.format(study_id))
    destination_email_addresses_string = ','.join(destination_email_addresses)

    participants_string = ''
    if 'participants' in request.values:
        participants_string = ','.join(request.form.getlist('participants'))

    data_start_time = ''
    if 'time_start' in request.values:
        data_start_time = request.values['time_start']

    data_end_time = ''
    if 'time_end' in request.values:
        data_end_time = request.values['time_end']

    # Get the object ID of the study, used in the pipeline
    query = Study.objects.filter(pk=study_id)
    if not query.exists():
        flash('Could not find study corresponding to study id {0}'.format(study_id),
              category='danger')
        return redirect('/data-pipeline/{:s}'.format(study_id))
    object_id = query.get().object_id

    pipeline_region = os.getenv("pipeline_region", None)
    if not pipeline_region:
        pipeline_region = 'us-east-1'
        flash('Pipeline region not configured, choosing default ({})'.format(pipeline_region),
              category='info')

    error_sentry = make_error_sentry("data", tags={"pipeline_frequency": "manually"})
    # Get new data access credentials for the manual user, submit a manual job, display message.
    # Report all errors to Sentry, including DataPipelineNotConfigured errors.
    with error_sentry:
        ssm_client = get_boto_client('ssm', pipeline_region)
        refresh_data_access_credentials('manually', ssm_client=ssm_client)
        batch_client = get_boto_client('batch', pipeline_region)
        create_one_job('manually', object_id, username, destination_email_addresses_string,
                       data_start_time, data_end_time, participants_string, batch_client)

        if data_start_time and data_end_time:
            flash(
                'Data pipeline successfully initiated on data collected between {0} and {1}! '
                'Email(s) will be sent to {2} on completion.'
                .format(data_start_time, data_end_time, destination_email_addresses), 'success')
        elif data_start_time:
            flash(
                'Data pipeline successfully initiated on data collected after {0}! '
                'Email(s) will be sent to {1} on completion.'
                .format(data_start_time, destination_email_addresses), 'success')
        elif data_end_time:
            flash(
                'Data pipeline successfully initiated on data collected before {0}! '
                'Email(s) will be sent to {1} on completion.'
                .format(data_end_time, destination_email_addresses), 'success')
        else:
            flash(
                'Data pipeline successfully initiated! '
                'Email(s) will be sent to {0} on completion.'
                .format(destination_email_addresses), 'success')

    if error_sentry.errors:
        flash('An error occurred when trying to execute the pipeline: {0}'.format(error_sentry),
              category='danger')
        print(error_sentry)

    return redirect('/data-pipeline/{:s}'.format(study_id))
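# Quick illustration of the address parsing above: the regex treats commas, spaces, question
# marks, colons, and semicolons as separators, and filter(None, ...) drops empty fragments.
# The addresses are example values.
import re

raw = 'alice@example.com, bob@example.com;carol@example.com'
parsed = [d.strip() for d in filter(None, re.split(r"[, \?:;]+", raw))]
# parsed == ['alice@example.com', 'bob@example.com', 'carol@example.com']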
# Make the repository root importable when this script is run directly.
from os.path import abspath as _abspath
from sys import path as _path
_one_folder_up = _abspath(__file__).rsplit('/', 2)[0]
_path.insert(1, _one_folder_up)

from datetime import timedelta

from django.utils import timezone

from database.data_access_models import ChunkRegistry
from database.study_models import Study
from libs.sentry import make_error_sentry
from pipeline.boto_helpers import get_boto_client
from pipeline.configuration_getters import get_current_region
from pipeline.index import create_one_job, refresh_data_access_credentials

pipeline_region = get_current_region()
ssm_client = get_boto_client('ssm', pipeline_region)
error_sentry = make_error_sentry("data", tags={"pipeline_frequency": "manually"})
batch_client = get_boto_client('batch', pipeline_region)
yesterday = timezone.now() - timedelta(days=1)

refresh_data_access_credentials('manually', ssm_client=ssm_client, webserver=False)

################################################################################################
# if you are running this on an ubuntu machine you have to sudo apt-get -y install cloud-utils #
################################################################################################

for study in Study.objects.all():
    with error_sentry:
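        # A plausible body for this loop, reconstructed from the imports above: skip studies
        # with no data chunks updated since `yesterday`, then submit a manual job. Both the
        # ChunkRegistry.last_updated filter field and the exact create_one_job signature are
        # assumptions, not confirmed by this excerpt.
        if not ChunkRegistry.objects.filter(study=study, last_updated__gte=yesterday).exists():
            continue
        create_one_job('manually', study.object_id, client=batch_client)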
def create_one_job(freq, object_id, owner_id, destination_email_addresses='',
                   data_start_datetime='', data_end_datetime='', participants='',
                   client=None):
    """
    Create an AWS Batch job and record it in PipelineExecutionTracking.
    The client parameter is optional. It is provided in case this function is run as part
    of a loop, to avoid an unnecessarily large number of file operations or API calls.
    :param freq: string, e.g. 'daily', 'manually'
    :param object_id: string representing the Study object_id, e.g. '56325d8297013e33a2e57736'
    :param owner_id: the user scheduling the pipeline run
    :param destination_email_addresses: list or comma-separated string of email addresses
    :param data_start_datetime: earliest collection time of the data to process
    :param data_end_datetime: latest collection time of the data to process
    :param participants: list or comma-separated string of patient ids
    :param client: a credentialed boto3 client, or None
    The config needs the following keys: job_name, job_defn_name, queue_name.
    """
    # Get the AWS parameters and client if not provided
    aws_object_names = get_generic_config()

    # requires region_name to be defined
    if client is None:
        client = get_boto_client('batch')

    # Clean up the list of participants into a space-separated string
    if isinstance(participants, list):
        participants = " ".join(participants)
    elif ',' in participants:
        participants = " ".join(participants.split(','))
    print('participants [{0}]'.format(participants))

    # Clean up the list of destination email addresses into a space-separated string
    if isinstance(destination_email_addresses, list):
        destination_email_addresses = " ".join(destination_email_addresses)
    elif ',' in destination_email_addresses:
        destination_email_addresses = " ".join(destination_email_addresses.split(','))

    study = Study.objects.get(object_id=object_id)
    print("scheduling pipeline for study {0}".format(study.id))
    pipeline_id = PipelineExecutionTracking.pipeline_scheduled(
        owner_id, study.id, datetime.datetime.now(), destination_email_addresses,
        data_start_datetime, data_end_datetime, participants)

    response = None
    try:
        response = client.submit_job(
            jobName=aws_object_names['job_name'].format(freq=freq),
            jobDefinition=aws_object_names['job_defn_name'],
            jobQueue=aws_object_names['queue_name'],
            containerOverrides={
                'environment': [
                    {
                        'name': 'pipeline_id',
                        'value': str(pipeline_id),
                    },
                    {
                        'name': 'study_object_id',
                        'value': str(object_id),
                    },
                    {
                        'name': 'study_name',
                        'value': study.name,
                    },
                    {
                        'name': 'FREQ',
                        'value': freq,
                    },
                    {
                        'name': 'destination_email_address',
                        'value': destination_email_addresses,
                    },
                    {
                        'name': 'data_start_datetime',
                        'value': data_start_datetime,
                    },
                    {
                        'name': 'data_end_datetime',
                        'value': data_end_datetime,
                    },
                    {
                        'name': 'participants',
                        'value': participants,
                    },
                ],
            },
        )
    except Exception as e:
        PipelineExecutionTracking.pipeline_crashed(pipeline_id, datetime.datetime.now(), str(e))
        raise

    if response and 'jobId' in response:
        PipelineExecutionTracking.pipeline_set_batch_job_id(pipeline_id, response['jobId'])
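# Sketch of the container side: the batch job's entry point reads the overrides set above
# from its environment. The variable names match the containerOverrides block exactly; the
# entry point script itself is not part of this excerpt, so this is illustrative only.
import os

pipeline_id = os.environ.get('pipeline_id')
study_object_id = os.environ.get('study_object_id')
freq = os.environ.get('FREQ')
# the addresses and participants were joined with spaces above, so split() recovers the lists
destination_email_addresses = os.environ.get('destination_email_address', '').split()
participants = os.environ.get('participants', '').split()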