Example #1
def _run_aws_batch(arglist, fq_repository_name, job_name, pipeline_image_name,
                   aws_session_token_duration, vcpus, memory, no_submit,
                   job_role_arn):
    """
    Run job on AWS Batch.   Sends to queue configured in disdat.cfg.
    This assumes that you have already created a cluster that will run the jobs
    that have been assigned to that queue.

    Args:
        arglist:
        fq_repository_name (str): The fully qualified docker repository name
        job_name:
        pipeline_image_name:
        aws_session_token_duration:
        vcpus:
        memory:
        no_submit (bool): default False
        job_role_arn (str): Can be None

    Returns:

    """
    def check_role_arn(job_dict, jra):
        """ Check whether the job definition dict contains the same job_role_arn (jra) """
        container_props = job_dict['containerProperties']
        if jra is None:
            return 'jobRoleArn' not in container_props
        return 'jobRoleArn' in container_props and container_props['jobRoleArn'] == jra

    disdat_config = DisdatConfig.instance()

    # Get the parameter values required to kick off an AWS Batch job.
    # Every batch job must:
    # 1. Have a name
    # 2. Have a job definition that declares which ECR-hosted Docker
    #    image to use.
    # 3. Have a queue that feeds jobs into a compute cluster.
    # 4. Specify the command to execute inside the Docker image; the
    #    command args are more-or-less the same as the ones used to
    #    execute locally using 'dsdt run'

    # Create a Job Definition and upload it.
    # We create per-user job definitions so multiple users do not clobber each other.
    # In addition, we never re-use a job definition, since the user may update
    # the vcpu or memory requirements and those are stuck in the job definition

    job_definition_name = aws.batch_get_job_definition_name(
        pipeline_image_name)

    if disdat_config.parser.has_option(_MODULE_NAME,
                                       'aws_batch_job_definition'):
        job_definition_name = disdat_config.parser.get(
            _MODULE_NAME, 'aws_batch_job_definition')

    # TODO: Look through all of history to find one that matches?
    # TODO: Delete old jobs here or let user do it?
    job_definition_obj = aws.batch_get_latest_job_definition(
        job_definition_name)

    if (job_definition_obj is not None
            and job_definition_obj['containerProperties']['image']
            == fq_repository_name
            and job_definition_obj['containerProperties']['vcpus'] == vcpus
            and job_definition_obj['containerProperties']['memory'] == memory
            and check_role_arn(job_definition_obj, job_role_arn)):

        job_definition_fqn = aws.batch_extract_job_definition_fqn(
            job_definition_obj)

        _logger.info("Re-using prior AWS Batch run job definition : {}".format(
            job_definition_obj))

    else:
        """ Whether None or doesn't match, make a new one """

        job_definition_obj = aws.batch_register_job_definition(
            job_definition_name,
            fq_repository_name,
            vcpus=vcpus,
            memory=memory,
            job_role_arn=job_role_arn)

        job_definition_fqn = aws.batch_get_job_definition(job_definition_name)

        _logger.info(
            "New AWS Batch run job definition {}".format(job_definition_fqn))

    if no_submit:
        # Return the job definition object without submitting the job
        return job_definition_obj

    job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

    container_overrides = {'command': arglist}

    # Through the magic of boto3_session_cache, the client in our script
    # here can get at AWS profiles and temporary AWS tokens created in
    # part from MFA tokens generated through the user's shells; we don't
    # have to write special code of our own to deal with authenticating
    # with AWS.
    client = b3.client('batch', region_name=aws.profile_get_region())
    # A bigger problem might be that the IAM role executing the job on
    # a batch EC2 instance might not have access to the S3 remote. To
    # get around this, allow the user to create some temporary AWS
    # credentials.

    if aws_session_token_duration > 0 and job_role_arn is None:
        sts_client = b3.client('sts')
        try:
            token = sts_client.get_session_token(
                DurationSeconds=aws_session_token_duration)
            credentials = token['Credentials']
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID',
                 'value': credentials['AccessKeyId']},
                {'name': 'AWS_SECRET_ACCESS_KEY',
                 'value': credentials['SecretAccessKey']},
                {'name': 'AWS_SESSION_TOKEN',
                 'value': credentials['SessionToken']}
            ]
        except Exception:
            _logger.debug(
                "Unable to generate an STS token; falling back to the user's "
                "default credentials...")
            credentials = b3.session.Session().get_credentials()
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID',
                 'value': credentials.access_key},
                {'name': 'AWS_SECRET_ACCESS_KEY',
                 'value': credentials.secret_key},
                {'name': 'AWS_SESSION_TOKEN',
                 'value': credentials.token}
            ]

    # 'environment' only exists in container_overrides if credentials were
    # injected above, so create the list if necessary before appending.
    container_overrides.setdefault('environment', []).append({
        'name': 'DISDAT_CPU_COUNT',
        'value': str(vcpus)
    })

    job = client.submit_job(jobName=job_name,
                            jobDefinition=job_definition_fqn,
                            jobQueue=job_queue,
                            containerOverrides=container_overrides)

    status = job['ResponseMetadata']['HTTPStatusCode']
    if status == 200:
        _logger.info(
            'Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'
            .format(job['jobName'], job['jobId'], job_definition_fqn,
                    job_queue))
        return job
    else:
        _logger.error('Job submission failed: HTTP Status {}'.format(status))
        return None
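
For reference, here is a minimal, self-contained sketch of the register-then-submit flow that _run_aws_batch wraps. The job definition name, queue, image, and token duration are placeholders rather than values read from disdat.cfg; the boto3 calls themselves (register_job_definition, get_session_token, submit_job) are the standard Batch/STS APIs.

import boto3

def submit_batch_job_sketch(image, queue, command, vcpus=1, memory=2048):
    """Hypothetical helper: register a one-off job definition and submit a job."""
    batch = boto3.client('batch')

    # Register a new revision of a job definition pointing at the image.
    jd = batch.register_job_definition(
        jobDefinitionName='sketch-job-def',  # placeholder name
        type='container',
        containerProperties={
            'image': image,
            'vcpus': vcpus,    # same legacy top-level form used above
            'memory': memory,  # MiB
            'command': [],     # overridden per-job below
        })

    # Hand short-lived STS credentials to the container via its environment,
    # mirroring the container_overrides logic in _run_aws_batch.
    creds = boto3.client('sts').get_session_token(
        DurationSeconds=3600)['Credentials']
    env = [
        {'name': 'AWS_ACCESS_KEY_ID', 'value': creds['AccessKeyId']},
        {'name': 'AWS_SECRET_ACCESS_KEY', 'value': creds['SecretAccessKey']},
        {'name': 'AWS_SESSION_TOKEN', 'value': creds['SessionToken']},
    ]

    return batch.submit_job(
        jobName='sketch-job',  # placeholder name
        jobQueue=queue,
        jobDefinition=jd['jobDefinitionArn'],
        containerOverrides={'command': command, 'environment': env})
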
Example #2
def _run_aws_sagemaker(arglist, fq_repository_name, job_name):
    """
    Runs a training job on AWS SageMaker.  This uses the default machine type
    in the disdat.cfg file.

    Args:
        arglist (list): Command-line arguments, converted into SageMaker
            hyperparameters
        fq_repository_name (str): fully qualified repository name
        job_name (str): instance job name

    Returns:
        TrainingJobArn (str)
    """

    disdat_config = DisdatConfig.instance()

    # SageMaker requires job names matching ^[a-zA-Z0-9](-*[a-zA-Z0-9])*,
    # so replace underscores with hyphens.
    job_name = job_name.replace('_', '-')

    hyperparameter_dict = _sagemaker_hyperparameters_from_arglist(arglist)

    algorithm_specification = {
        'TrainingImage': fq_repository_name,
        'TrainingInputMode': 'File'
    }

    role_arn = disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_role_arn')

    input_channel_config = [
        {
            'ChannelName': 'disdat_sagemaker_input_blackhole',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3Uri': disdat_config.parser.get(
                        _MODULE_NAME, 'aws_sagemaker_s3_input_uri'),
                    'S3DataDistributionType': 'FullyReplicated'
                }
            },
            'ContentType': 'application/javascript',
            'CompressionType': 'None',  # or 'Gzip'
            'RecordWrapperType': 'None'  # or 'RecordIO'
        },
    ]

    output_data_config = {
        'S3OutputPath': os.path.join(
            disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_s3_output_uri'),
            job_name)
    }

    resource_config = {
        'InstanceType': disdat_config.parser.get(
            _MODULE_NAME, 'aws_sagemaker_instance_type'),
        'InstanceCount': int(disdat_config.parser.get(
            _MODULE_NAME, 'aws_sagemaker_instance_count')),
        'VolumeSizeInGB': int(disdat_config.parser.get(
            _MODULE_NAME, 'aws_sagemaker_volume_sizeGB'))
        # 'VolumeKmsKeyId': 'string'
    }

    vpc_config = None  # e.g., {'SecurityGroupIds': [], 'Subnets': []}

    stopping_condition = {
        'MaxRuntimeInSeconds': int(disdat_config.parser.get(
            _MODULE_NAME, 'aws_sagemaker_max_runtime_sec'))
    }

    tags = [
        {'Key': 'user', 'Value': 'disdat'},
        {'Key': 'job', 'Value': job_name}
    ]

    _logger.debug("Disdat SageMaker configs")
    _logger.debug("job name: {}".format(job_name))
    _logger.debug("hparams: {}".format(hyperparameter_dict))
    _logger.debug("algorithm: {}".format(algorithm_specification))
    _logger.debug("Role ARN: {}".format(role_arn))
    _logger.debug("Input data conf: {}".format(input_channel_config))
    _logger.debug("Output data conf: {}".format(output_data_config))
    _logger.debug("Resource conf: {}".format(resource_config))
    _logger.debug("VPC conf: {}".format(vpc_config))
    _logger.debug("Stopping condition: {}".format(stopping_condition))
    _logger.debug("Tags: {}".format(tags))

    client = b3.client('sagemaker', region_name=aws.profile_get_region())

    response = client.create_training_job(
        TrainingJobName=job_name,
        HyperParameters=hyperparameter_dict,
        AlgorithmSpecification=algorithm_specification,
        RoleArn=role_arn,
        InputDataConfig=input_channel_config,
        OutputDataConfig=output_data_config,
        ResourceConfig=resource_config,
        StoppingCondition=stopping_condition,
        Tags=tags)

    _logger.info(
        "Disdat SageMaker create_training_job response {}".format(response))
    return response['TrainingJobArn']
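
Note that create_training_job only starts the job; it runs asynchronously. Below is a minimal sketch of waiting for completion using boto3's built-in SageMaker waiter; the training job name argument is a placeholder (e.g., the name behind the ARN returned by _run_aws_sagemaker).

import boto3

def wait_for_training_job_sketch(training_job_name):
    """Hypothetical helper: block until a SageMaker training job finishes."""
    sm = boto3.client('sagemaker')

    # Built-in waiter that polls DescribeTrainingJob until the job reaches
    # Completed, Failed, or Stopped.
    waiter = sm.get_waiter('training_job_completed_or_stopped')
    waiter.wait(TrainingJobName=training_job_name)

    desc = sm.describe_training_job(TrainingJobName=training_job_name)
    return desc['TrainingJobStatus']  # e.g. 'Completed' or 'Failed'
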
Example #3
def _run(input_bundle,
         output_bundle,
         pipeline_params,
         pipeline_class_name,
         backend=Backend.Local,
         force=False,
         push_input_bundle=True,
         input_tags=None,
         output_tags=None):
    """Run the dockerized version of a pipeline.

    Args:
        input_bundle: The human name of the input bundle
        output_bundle: The human name of the output bundle
        pipeline_params: Optional arguments to pass to the pipeline class
        pipeline_class_name: Name of the pipeline class to run
        backend: The batch execution back-end to use (default
            `Backend.Local`)
        force: If `True` force recomputation of all upstream pipe
            requirements (default `False`)
        push_input_bundle: If `True` push the latest committed input bundle
            to the remote before running (default `True`)
        input_tags: Find bundle with these tags
        output_tags: Push result bundle with these tags

    Returns:
        `None`
    """

    # print("_run args are {}".format(pipeline_params))

    pfs = fs.DisdatFS()
    disdat_config = common.DisdatConfig.instance()

    pipeline_image_name = common.make_pipeline_image_name(pipeline_class_name)

    try:
        output_bundle_uuid, remote, branch_name = common.get_run_command_parameters(
            pfs)
    except ValueError:
        _logger.error(
            "'run' requires a remote set with `dsdt remote <s3 url>`")
        return

    if backend == Backend.AWSBatch:
        # Get the parameter values required to kick off an AWS Batch job.
        # Every batch job must:
        # 1. Have a name
        # 2. Have a job definition that declares which ECR-hosted Docker
        #    image to use.
        # 3. Have a queue that feeds jobs into a compute cluster.
        # 4. Specify the command to execute inside the Docker image; the
        #    command args are more-or-less the same as the ones used to
        #    execute locally using 'dsdt run'
        job_name = '{}-{}'.format(pipeline_image_name, int(time.time()))
        job_definition_name = aws.batch_get_job_definition_name(
            pipeline_class_name)
        if disdat_config.parser.has_option(_MODULE_NAME,
                                           'aws_batch_job_definition'):
            job_definition_name = disdat_config.parser.get(
                _MODULE_NAME, 'aws_batch_job_definition')

        # If the job definition does not exist, create it.
        job_definition = aws.batch_get_job_definition(job_definition_name)
        if job_definition is None:
            repository_prefix = disdat_config.parser.get(
                'docker', 'repository_prefix')
            repository_name = common.make_pipeline_repository_name(
                repository_prefix, pipeline_class_name)
            # Figure out the fully-qualified repository name, i.e., the name
            # including the registry.
            registry_name = disdat_config.parser.get('docker',
                                                     'registry').strip('/')
            if registry_name == '*ECR*':
                fq_repository_name = aws.ecr_get_fq_respository_name(
                    repository_name)
            else:
                fq_repository_name = '{}/{}'.format(registry_name,
                                                    repository_name)
            aws.batch_register_job_definition(job_definition_name,
                                              fq_repository_name)
            job_definition = aws.batch_get_job_definition(job_definition_name)
        job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

        # Assemble the command...
        job_command = common.make_run_command(input_bundle, output_bundle,
                                              output_bundle_uuid, remote,
                                              branch_name, input_tags,
                                              output_tags, pipeline_params)
        container_overrides = {'command': job_command}

        # Through the magic boto3_session_cache, we get clients to interact
        # with AWS services and (if necessary) temporary tokens if using
        # AWS profiles/MFA tokens.
        client = b3.client('batch', region_name=aws.profile_get_region())
        job = client.submit_job(jobName=job_name,
                                jobDefinition=job_definition,
                                jobQueue=job_queue,
                                containerOverrides=container_overrides)
        status = job['ResponseMetadata']['HTTPStatusCode']
        if status == 200:
            print('Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'.format(
                job['jobName'], job['jobId'], job_definition, job_queue))
        else:
            _logger.error('Job submission failed: HTTP Status {}'.format(status))
    elif backend == Backend.Local:

        client = docker.from_env()
        # Configure the container environment and mounted file systems.
        environment = {}
        if 'AWS_PROFILE' in os.environ:
            environment['AWS_PROFILE'] = os.environ['AWS_PROFILE']
        volumes = {}
        aws_config_dir = os.getenv('AWS_CONFIG_DIR',
                                   os.path.join(os.environ['HOME'], '.aws'))
        if os.path.exists(aws_config_dir):
            volumes[aws_config_dir] = {'bind': '/root/.aws', 'mode': 'rw'}
        # Make sure latest committed is sent to remote
        if push_input_bundle:
            result = pfs.push(human_name=input_bundle)
            if result is None:
                _logger.error(
                    "'run' failed trying to push input bundle {} to remote.".
                    format(input_bundle))
                return
        # Now try to run the container
        try:
            args = ' '.join(
                common.make_run_command(input_bundle, output_bundle,
                                        output_bundle_uuid, remote,
                                        branch_name, input_tags, output_tags,
                                        pipeline_params))

            print "run.py ARGS {}".format(args)

            _logger.debug('Running image {} with arguments {}'.format(
                pipeline_image_name, args))
            stdout = client.containers.run(pipeline_image_name,
                                           args,
                                           detach=False,
                                           environment=environment,
                                           init=True,
                                           stderr=True,
                                           volumes=volumes)
            print(stdout)
        except docker.errors.ImageNotFound:
            _logger.error("Unable to find the docker image {}".format(
                pipeline_image_name))
            return
        # Now that this is finished, we need to pull this from the remote.
        pfs.pull(output_bundle, output_bundle_uuid)
    else:
        raise ValueError(
            'Got unrecognized job backend \'{}\': Expected {}'.format(
                backend, Backend.options()))
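
The Backend.Local branch drives the Docker SDK for Python (docker-py). Below is a minimal, self-contained sketch of that pattern, using a placeholder image and command rather than the disdat pipeline image.

import os
import docker

def run_container_sketch(image='alpine:latest', command='echo hello'):
    """Hypothetical helper: run a container locally as the Local backend does."""
    client = docker.from_env()

    # Forward the active AWS profile and mount ~/.aws so the container can
    # authenticate, mirroring the environment/volumes setup in _run.
    environment = {}
    if 'AWS_PROFILE' in os.environ:
        environment['AWS_PROFILE'] = os.environ['AWS_PROFILE']
    volumes = {
        os.path.join(os.environ['HOME'], '.aws'): {
            'bind': '/root/.aws', 'mode': 'rw'}
    }

    try:
        # detach=False blocks until the container exits and returns its output.
        output = client.containers.run(image, command,
                                       detach=False,
                                       environment=environment,
                                       stderr=True,
                                       volumes=volumes)
        return output.decode('utf-8')
    except docker.errors.ImageNotFound:
        return None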