Example #1
def run_manual_code(study_id):
    """
    Create an AWS Batch job for the Study specified
    :param study_id: Primary key of a Study
    """
    # we assume that the cluster is configured only in one region.
    pipeline_region = get_current_region()

    # Get the object ID of the study, used in the pipeline
    query = Study.objects.filter(pk=study_id)
    if not query.exists():
        return abort(404)
    object_id = query.get().object_id

    error_sentry = make_error_sentry("data",
                                     tags={"pipeline_frequency": "manually"})
    # Get new data access credentials for the manual user, submit a manual job, display message
    # Report all errors to sentry including DataPipelineNotConfigured errors.
    with error_sentry:
        ssm_client = get_boto_client('ssm', pipeline_region)
        refresh_data_access_credentials('manually',
                                        ssm_client=ssm_client,
                                        webserver=True)
        batch_client = get_boto_client('batch', pipeline_region)
        create_one_job('manually', object_id, batch_client, webserver=True)
        flash('Data pipeline code successfully initiated!', 'success')

    if error_sentry.errors:
        flash('An unknown error occurred when trying to run this task.',
              category='danger')
        print(error_sentry)

    return redirect('/data-pipeline/{:s}'.format(study_id))
Example #2
def terminate_pipeline(study_id):
    """
    Terminate an AWS Batch job for the Study specified
    :param study_id: Primary key of a Study
    """

    username = session["admin_username"]

    # As in Example #1, assume the cluster is configured in a single region.
    pipeline_region = get_current_region()

    pipeline_id = request.values['pipeline_id']
    flash('terminating pipeline {0}'.format(pipeline_id))

    error_sentry = make_error_sentry(
        "data", tags={"pipeline_frequency": "terminate_job manually"})
    # Terminate the batch job for this pipeline and display a message.
    # Report all errors to sentry including DataPipelineNotConfigured errors.
    with error_sentry:
        batch_client = get_boto_client('batch', pipeline_region)
        terminate_job(pipeline_id, username, batch_client)

    if error_sentry.errors:
        flash(
            'An error occurred when trying to terminate the pipeline {0}: {1}'.
            format(pipeline_id, error_sentry),
            category='danger')
        print(error_sentry)
    else:
        flash('Pipeline {0} terminated.'.format(pipeline_id), 'success')

    return redirect('/data-pipeline/{:s}'.format(study_id))
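Both views above rely on Flask's session, request, flash, and redirect helpers and finish by redirecting back to the study's data-pipeline page. A minimal sketch of how they might be registered as routes; the app object, URL rules, and HTTP methods are assumptions, not taken from the project:

from flask import Flask

app = Flask(__name__)

# Hypothetical URL rules; the real project may wire these views up differently.
app.add_url_rule('/data-pipeline/<string:study_id>/run',
                 view_func=run_manual_code, methods=['POST'])
app.add_url_rule('/data-pipeline/<string:study_id>/terminate',
                 view_func=terminate_pipeline, methods=['POST'])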
Example #3
def refresh_data_access_credentials(freq, ssm_client=None, webserver=False):
    """
    Refresh the data access credentials for a particular BATCH USER Researcher and upload them
    (encrypted) to the AWS Parameter Store. This enables AWS Batch jobs to get the
    credentials and thereby access the data access API (DAA).
    :param freq: string, one of 'hourly' | 'daily' | 'weekly' | 'monthly' | 'manually'.
        This determines what the data access credentials are named on AWS.
    :param ssm_client: a credentialled boto3 SSM client, or None to create one
    :param webserver: if True, read the config with get_eb_config() instead of get_generic_config()
    """

    # Get or create Researcher with no password. This means that nobody can log in as this
    # Researcher in the web interface.
    researcher_name = 'BATCH USER {}'.format(freq)
    mock_researchers = Researcher.objects.filter(username=researcher_name)
    if not mock_researchers.exists():
        mock_researcher = Researcher.create_without_password(researcher_name)
    else:
        mock_researcher = mock_researchers.get()
        mock_researcher.save()

    # Ensure that the Researcher is attached to all Studies. This allows them to access all
    # data via the DAA.
    for study in Study.objects.all():
        StudyRelation.objects.get_or_create(
            study=study,
            researcher=mock_researcher,
            relationship=ResearcherRole.researcher,
            is_batch_user=True,
        )

    # Reset the credentials. This ensures that they aren't stale.
    access_key, secret_key = mock_researcher.reset_access_credentials()

    if not webserver:
        generic_config = get_generic_config()
    else:
        generic_config = get_eb_config()

    # Append the frequency to the SSM (AWS Systems Manager) names. This ensures that the
    # different frequency jobs' keys do not overwrite each other.
    access_key_ssm_name = '{}-{}'.format(generic_config['access_key_ssm_name'],
                                         freq)
    secret_key_ssm_name = '{}-{}'.format(generic_config['secret_key_ssm_name'],
                                         freq)

    # Put the credentials (encrypted) into AWS Parameter Store
    if not ssm_client:
        ssm_client = get_boto_client('ssm')
    ssm_client.put_parameter(
        Name=access_key_ssm_name,
        Value=access_key,
        Type='SecureString',
        Overwrite=True,
    )
    ssm_client.put_parameter(
        Name=secret_key_ssm_name,
        Value=secret_key,
        Type='SecureString',
        Overwrite=True,
    )
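A minimal usage sketch, mirroring how the manual views and the standalone script below call this helper; the 'daily' frequency here is illustrative:

pipeline_region = get_current_region()
ssm_client = get_boto_client('ssm', pipeline_region)
refresh_data_access_credentials('daily', ssm_client=ssm_client, webserver=False)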
Example #4
def create_one_job(freq, study, patient_id, client=None, webserver=False):
    """
    Create an AWS batch job
    The client parameter is optional. It is provided in case this function is run as part of
    a loop, to avoid an unnecessarily large number of API calls.
    :param freq: string e.g. 'daily', 'manually'
    :param study: a Study database object
    :param patient_id: a string patient id
    :param client: a credentialled boto3 client or None
    :param webserver: if True, use get_eb_config() instead of get_generic_config()
    
    config needs are the following: job_name, job_defn_name, queue_name
    """
    
    # Get the AWS parameters and client if not provided
    if not webserver:
        aws_object_names = get_generic_config()
    else:
        aws_object_names = get_eb_config()

    # requires region_name be defined.
    if client is None:
        client = get_boto_client('batch')

    client.submit_job(
        jobName=aws_object_names['job_name'].format(freq=freq),
        jobDefinition=aws_object_names['job_defn_name'],
        jobQueue=aws_object_names['queue_name'],
        containerOverrides={
            'environment': [
                {
                    'name': 'study_object_id',
                    'value': str(study.object_id),
                }, {
                    'name': 'study_name',
                    'value': study.name,
                }, {
                    'name': 'FREQ',
                    'value': freq,
                }, {
                    'name': 'patient_id',
                    'value': patient_id,
                }, {
                    'name': 'server_url',
                    'value': DOMAIN_NAME,
                }
            ],
        },
    )
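The containerOverrides above surface inside the Batch container as plain environment variables. A sketch of how the pipeline entry point might read them; only the variable names come from the overrides above, the reading code itself is an assumption:

import os

study_object_id = os.getenv('study_object_id')
study_name = os.getenv('study_name')
freq = os.getenv('FREQ', 'manually')
patient_id = os.getenv('patient_id')
server_url = os.getenv('server_url')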
Example #5
def create_one_job(freq, object_id, aws_object_names=None, client=None):
    """
    Create an AWS batch job
    The aws_object_names and client parameters are optional. They are provided in case
    this function is run as part of a loop, to avoid an unnecessarily large number of
    file operations or API calls.
    :param freq: string e.g. 'daily', 'manually'
    :param object_id: string representing the Study object_id e.g. '56325d8297013e33a2e57736'
    :param aws_object_names: dict containing various parameters for the batch job or None
    :param client: a credentialled boto3 client or None
    """

    # Get the AWS parameters and client if not provided
    if aws_object_names is None:
        aws_object_names = get_aws_object_names()
    if client is None:
        client = get_boto_client('batch')

    client.submit_job(
        jobName=aws_object_names['job_name'].format(freq=freq),
        jobDefinition=aws_object_names['job_defn_name'],
        jobQueue=aws_object_names['queue_name'],
        containerOverrides={
            'environment': [
                {
                    'name': 'study_object_id',
                    'value': str(object_id),
                },
                {
                    'name': 'study_name',
                    'value': Study.objects.get(object_id=object_id).name,
                },
                {
                    'name': 'FREQ',
                    'value': freq,
                },
            ],
        },
    )
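The aws_object_names dict only needs the three keys read above. A sketch of its shape, with illustrative values rather than the project's real resource names:

aws_object_names = {
    'job_name': 'data-pipeline-{freq}',          # .format(freq=freq) is applied above
    'job_defn_name': 'data-pipeline-job-definition',
    'queue_name': 'data-pipeline-queue',
}
create_one_job('daily', '56325d8297013e33a2e57736', aws_object_names=aws_object_names)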
Example #6
def terminate_job(pipeline_id, user_id, client=None):
    """
    Terminate the AWS Batch job associated with a pipeline execution
    :param pipeline_id: primary key of a PipelineExecutionTracking object
    :param user_id: identifier of the user requesting termination (recorded in the reason)
    :param client: a credentialled boto3 client or None
    """

    # Get the AWS parameters and client if not provided
    aws_object_names = get_generic_config()

    # requires region_name be defined.
    if client is None:
        client = get_boto_client('batch')

    pipeline = PipelineExecutionTracking.objects.get(id=pipeline_id)

    if not pipeline.batch_job_id:
        raise ValueError(
            'Error terminating pipeline {0}, batch job id not found'.format(
                pipeline_id))

    client.terminate_job(jobId=pipeline.batch_job_id,
                         reason='Terminated by user {0}'.format(user_id))

    pipeline.terminate_job(pipeline_id,
                           datetime.datetime.now(),
                           reason='Terminated by user {0}'.format(user_id))

    return
Example #7
def run_manual_code(study_id):
    """
    Create an AWS Batch job for the Study specified
    :param study_id: Primary key of a Study
    """

    username = session["admin_username"]

    destination_email_addresses = []
    destination_email_addresses_string = ''
    if 'destination_email_addresses' in request.values:
        destination_email_addresses_string = request.values[
            'destination_email_addresses']
        destination_email_addresses = [
            d.strip() for d in filter(
                None, re.split("[, \?:;]+",
                               destination_email_addresses_string))
        ]
        for email_address in destination_email_addresses:
            if not validate_email(email_address):
                flash(
                    'Email address {0} in ({1}) does not appear to be a valid email address.'
                    .format(email_address, destination_email_addresses_string),
                    category='danger')
                return redirect('/data-pipeline/{:s}'.format(study_id))
        destination_email_addresses_string = ','.join(
            destination_email_addresses)

    participants_string = ''
    if 'participants' in request.values:
        participants_string = request.form.getlist('participants')
        participants_string = ','.join(participants_string)

    data_start_time = ''
    if 'time_start' in request.values:
        data_start_time = request.values['time_start']

    data_end_time = ''
    if 'time_end' in request.values:
        data_end_time = request.values['time_end']

    # Get the object ID of the study, used in the pipeline
    query = Study.objects.filter(pk=study_id)
    if not query.exists():
        flash('Could not find study corresponding to study id {0}'.format(
            study_id),
              category='danger')
        return redirect('/data-pipeline/{:s}'.format(study_id))
        #return abort(404)
    object_id = query.get().object_id

    pipeline_region = os.getenv("pipeline_region", None)
    if not pipeline_region:
        pipeline_region = 'us-east-1'
        flash('Pipeline region not configured, choosing default ({})'.format(
            pipeline_region),
              category='info')
        # return redirect('/data-pipeline/{:s}'.format(study_id))

    error_sentry = make_error_sentry("data",
                                     tags={"pipeline_frequency": "manually"})
    # Get new data access credentials for the manual user, submit a manual job, display message
    # Report all errors to sentry including DataPipelineNotConfigured errors.
    with error_sentry:
        ssm_client = get_boto_client('ssm', pipeline_region)
        refresh_data_access_credentials('manually', ssm_client=ssm_client)
        batch_client = get_boto_client('batch', pipeline_region)
        create_one_job('manually', object_id, username,
                       destination_email_addresses_string, data_start_time,
                       data_end_time, participants_string, batch_client)

        if data_start_time and data_end_time:
            flash(
                'Data pipeline successfully initiated on data collected between {0} and {1}! Email(s) will be sent to {2} on completion.'
                .format(data_start_time, data_end_time,
                        destination_email_addresses), 'success')
        elif data_start_time:
            flash(
                'Data pipeline successfully initiated on data collected after {0}! Email(s) will be sent to {1} on completion.'
                .format(data_start_time,
                        destination_email_addresses), 'success')
        elif data_end_time:
            flash(
                'Data pipeline successfully initiated on data collected before {0}! Email(s) will be sent to {1} on completion.'
                .format(data_end_time,
                        destination_email_addresses), 'success')
        else:
            flash(
                'Data pipeline successfully initiated! Email(s) will be sent to {0} on completion.'
                .format(destination_email_addresses), 'success')

    if error_sentry.errors:
        flash('An error occurred when trying to execute the pipeline: {0}'.
              format(error_sentry),
              category='danger')
        print(error_sentry)

    return redirect('/data-pipeline/{:s}'.format(study_id))
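The destination_email_addresses field is split on commas, spaces, question marks, colons, and semicolons before each address is validated. A standalone illustration of that split, using a made-up input:

import re

raw = 'alice@example.com, bob@example.com; carol@example.com'
addresses = [d.strip() for d in filter(None, re.split(r"[, \?:;]+", raw))]
print(addresses)  # ['alice@example.com', 'bob@example.com', 'carol@example.com']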
Example #8
from os.path import abspath as _abspath
from sys import path as _path

_one_folder_up = _abspath(__file__).rsplit('/', 2)[0]
_path.insert(1, _one_folder_up)

from datetime import timedelta

from django.utils import timezone

from database.data_access_models import ChunkRegistry
from database.study_models import Study
from libs.sentry import make_error_sentry
from pipeline.boto_helpers import get_boto_client
from pipeline.configuration_getters import get_current_region
from pipeline.index import create_one_job, refresh_data_access_credentials

pipeline_region = get_current_region()
ssm_client = get_boto_client('ssm', pipeline_region)
error_sentry = make_error_sentry("data",
                                 tags={"pipeline_frequency": "manually"})
batch_client = get_boto_client('batch', pipeline_region)
yesterday = timezone.now() - timedelta(days=1)

refresh_data_access_credentials('manually',
                                ssm_client=ssm_client,
                                webserver=False)

################################################################################################
# if you are running this on an ubuntu machine you have to sudo apt-get -y install cloud-utils #
################################################################################################

for study in Study.objects.all():
    with error_sentry:
        # Assumed loop body, mirroring the manual-job call in Example #1:
        # submit a manual pipeline job for each study.
        create_one_job('manually', study.object_id, batch_client, webserver=False)
Example #9
def create_one_job(freq,
                   object_id,
                   owner_id,
                   destination_email_addresses='',
                   data_start_datetime='',
                   data_end_datetime='',
                   participants='',
                   client=None):
    """
    Create an AWS batch job
    The client parameter is optional. It is provided in case this function is run as part of
    a loop, to avoid an unnecessarily large number of API calls.
    :param freq: string e.g. 'daily', 'manually'
    :param object_id: string representing the Study object_id e.g. '56325d8297013e33a2e57736'
    :param owner_id: identifier of the user who scheduled the pipeline run
    :param destination_email_addresses: list or comma-separated string of recipient addresses
    :param data_start_datetime: optional start of the data window to process
    :param data_end_datetime: optional end of the data window to process
    :param participants: list or comma-separated string of patient ids to include
    :param client: a credentialled boto3 client or None
    
    config needs are the following: job_name, job_defn_name, queue_name
    """
    # Get the AWS parameters and client if not provided
    aws_object_names = get_generic_config()
    # requires region_name be defined.
    if client is None:
        client = get_boto_client('batch')

    # clean up list of participants
    if isinstance(participants, list):
        participants = " ".join(participants)
    elif ',' in participants:
        participants = " ".join(participants.split(','))
    print('participants [{0}]'.format(participants))

    # clean up list of destination email addresses
    if isinstance(destination_email_addresses, list):
        destination_email_addresses = " ".join(destination_email_addresses)
    elif ',' in destination_email_addresses:
        destination_email_addresses = " ".join(
            destination_email_addresses.split(','))

    print("scheduling pipeline for study {0}".format(
        Study.objects.get(object_id=object_id).id))
    pipeline_id = PipelineExecutionTracking.pipeline_scheduled(
        owner_id,
        Study.objects.get(object_id=object_id).id, datetime.datetime.now(),
        destination_email_addresses, data_start_datetime, data_end_datetime,
        participants)

    response = None
    try:
        response = client.submit_job(
            jobName=aws_object_names['job_name'].format(freq=freq),
            jobDefinition=aws_object_names['job_defn_name'],
            jobQueue=aws_object_names['queue_name'],
            containerOverrides={
                'environment': [{
                    'name': 'pipeline_id',
                    'value': str(pipeline_id),
                }, {
                    'name': 'study_object_id',
                    'value': str(object_id),
                }, {
                    'name': 'study_name',
                    'value': Study.objects.get(object_id=object_id).name,
                }, {
                    'name': 'FREQ',
                    'value': freq,
                }, {
                    'name': 'destination_email_address',
                    'value': destination_email_addresses,
                }, {
                    'name': 'data_start_datetime',
                    'value': data_start_datetime,
                }, {
                    'name': 'data_end_datetime',
                    'value': data_end_datetime,
                }, {
                    'name': 'participants',
                    'value': participants,
                }],
            },
        )

    except Exception as e:
        PipelineExecutionTracking.pipeline_crashed(pipeline_id,
                                                   datetime.datetime.now(),
                                                   str(e))
        raise

    if response and 'jobId' in response:
        PipelineExecutionTracking.pipeline_set_batch_job_id(
            pipeline_id, response['jobId'])

    return
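Unlike Examples #4 and #5, this variant records every submission with PipelineExecutionTracking: the run is logged as scheduled before submit_job, a failure is captured by pipeline_crashed, and the returned Batch job id is stored so the job can later be terminated (see Example #6). A call sketch with purely illustrative argument values:

create_one_job(
    'manually',
    '56325d8297013e33a2e57736',        # Study object_id (example value from the docstring)
    'some_admin',                      # owner_id
    ['researcher@example.com'],        # destination email addresses
    '2021-01-01 00:00',                # data_start_datetime (format assumed)
    '2021-02-01 00:00',                # data_end_datetime (format assumed)
    ['patient_a', 'patient_b'],        # participants
    get_boto_client('batch'),
)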