Example #1
def batch_get_latest_job_definition(job_definition_name):
    """Get the most recent active revision number for a AWS Batch job
    definition

    Args:
        job_definition_name: The name of the job definition
        remote_pipeline_image_name:
        vcpus:
        memory:

    Return:
        The latest job definition dictionary or `None` if the job definition does not exist
    """
    region = profile_get_region()
    client = b3.client('batch', region_name=region)
    response = client.describe_job_definitions(
        jobDefinitionName=job_definition_name, status='ACTIVE')
    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
        raise RuntimeError(
            'Failed to get job definition revisions for {}: HTTP Status {}'.
            format(job_definition_name,
                   response['ResponseMetadata']['HTTPStatusCode']))
    job_definitions = response['jobDefinitions']
    revision = 0
    job_def = None
    for j in job_definitions:
        if j['jobDefinitionName'] != job_definition_name:
            continue
        if j['revision'] > revision:
            revision = j['revision']
            job_def = j

    return job_def
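A minimal usage sketch (not part of the original source) for the helper above; the job definition name is an illustrative placeholder:

# Hypothetical usage of batch_get_latest_job_definition.
latest = batch_get_latest_job_definition('my-pipeline-job-def')
if latest is None:
    print('No active revisions found')
else:
    # Each entry mirrors what describe_job_definitions returns.
    print('Latest revision: {}'.format(latest['revision']))
    print('Image: {}'.format(latest['containerProperties']['image']))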
Example #2
def ecr_create_fq_respository_name(repository_name,
                                   policy_resource_package=None,
                                   policy_resource_name=None):
    """Create an ECR repository (or fetch it if it already exists) and return
    its fully-qualified repository URI.

    Args:
        repository_name: The name of the ECR repository
        policy_resource_package: Optional package containing a repository
            policy document as a package resource
        policy_resource_name: Optional name of the policy resource file

    Returns:
        The repository URI (str)
    """
    ecr_client = b3.client('ecr', region_name=profile_get_region())
    # Create or fetch the repository in AWS (to store the image)
    try:
        response = ecr_client.create_repository(repositoryName=repository_name)
        repository_metadata = response['repository']
        # Set the policy on the repository
        if policy_resource_package is not None and policy_resource_name is not None:
            policy = pkg_resources.resource_string(
                policy_resource_package.__name__, policy_resource_name)
            _ = ecr_client.set_repository_policy(
                registryId=repository_metadata['registryId'],
                repositoryName=repository_name,
                policyText=policy,
                force=True)
    except ClientError as e:
        if e.response['Error']['Code'] == 'RepositoryAlreadyExistsException':
            response = ecr_client.describe_repositories(
                repositoryNames=[repository_name])
            repository_metadata = response['repositories'][0]
        else:
            raise e
    return repository_metadata['repositoryUri']
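A sketch (not from the original source) of pushing a locally built image to the repository URI returned above, using the docker SDK for Python together with the ecr_get_auth_config helper shown in Example #5 below. The repository and local image names are illustrative placeholders:

import docker

# Hypothetical names; adjust to your repository and locally built image.
repo_uri = ecr_create_fq_respository_name('my-pipeline')
docker_client = docker.from_env()
image = docker_client.images.get('my-pipeline:latest')
image.tag(repository=repo_uri, tag='latest')
docker_client.images.push(repo_uri, tag='latest',
                          auth_config=ecr_get_auth_config())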
Example #3
def batch_register_job_definition(job_definition_name,
                                  remote_pipeline_image_name,
                                  vcpus=1,
                                  memory=2000):
    """Register a new AWS Batch job definition.

    Args:
        job_definition_name: The name of the job definition
        remote_pipeline_image_name: The ECR Docker image to load to run jobs
            using this definition
        vcpus: The number of vCPUs to use to run jobs using this definition
        memory: The amount of memory in MiB to use to run jobs using this
            definition
    """
    region = profile_get_region()
    client = b3.client('batch', region_name=region)
    response = client.register_job_definition(
        jobDefinitionName=job_definition_name,
        type='container',
        containerProperties={
            'image': remote_pipeline_image_name,
            'vcpus': vcpus,
            'memory': memory,
        })
    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
        raise RuntimeError(
            'Failed to create job definition {}: HTTP Status {}'.format(
                job_definition_name,
                response['ResponseMetadata']['HTTPStatusCode']))
Example #4
def batch_get_job_definition(job_definition_name):
    """Get the most recent active revision number for a AWS Batch job
    definition

    Args:
        job_definition_name: The name of the job definition

    Return:
        The fully-qualified job definition name with revision number, or
            `None` if the job definition does not exist
    """
    region = profile_get_region()
    client = b3.client('batch', region_name=region)
    response = client.describe_job_definitions(
        jobDefinitionName=job_definition_name, status='ACTIVE')
    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
        raise RuntimeError(
            'Failed to get job definition revisions for {}: HTTP Status {}'.
            format(job_definition_name,
                   response['ResponseMetadata']['HTTPStatusCode']))
    job_definitions = response['jobDefinitions']
    revision = 0
    for j in job_definitions:
        if j['jobDefinitionName'] != job_definition_name:
            continue
        if j['revision'] > revision:
            revision = j['revision']
    if revision == 0:
        return None
    else:
        return '{}:{}'.format(job_definition_name, revision)
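A hedged sketch of how this helper and batch_register_job_definition (Example #3) might be combined before submitting a job; the names, image URI, and queue below are illustrative placeholders:

job_def = batch_get_job_definition('my-pipeline-job-def')
if job_def is None:
    # Register a definition pointing at a (placeholder) ECR image, then re-read it.
    batch_register_job_definition(
        'my-pipeline-job-def',
        '123456789012.dkr.ecr.us-east-1.amazonaws.com/my-pipeline')
    job_def = batch_get_job_definition('my-pipeline-job-def')

client = b3.client('batch', region_name=profile_get_region())
job = client.submit_job(jobName='my-pipeline-run',
                        jobDefinition=job_def,
                        jobQueue='my-batch-queue')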
Example #5
def ecr_get_auth_config():
    ecr_client = b3.client('ecr', region_name=profile_get_region())
    # Authorize docker to push to ECR
    response = ecr_client.get_authorization_token()
    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
        raise RuntimeError(
            'Failed to get AWS ECR authorization token: HTTP Status {}'.format(
                response['ResponseMetadata']['HTTPStatusCode']))
    token = response['authorizationData'][0]['authorizationToken']
    username, password = base64.b64decode(token).split(':')
    return {'username': username, 'password': password}
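A sketch (an assumption, not from the original source) of logging the docker SDK client in to ECR with the credentials returned above; a real call would take the registry endpoint from the same get_authorization_token response ('proxyEndpoint'), so the URL below is only a placeholder:

import docker

auth = ecr_get_auth_config()
docker_client = docker.from_env()
# The registry URL is a placeholder for the account's ECR endpoint.
docker_client.login(username=auth['username'],
                    password=auth['password'],
                    registry='https://123456789012.dkr.ecr.us-east-1.amazonaws.com')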
Example #6
def get_aws_instances():
    try:
        return boto3_session_cache.client('ec2').describe_instances()
    except NoRegionError:
        print(NO_REGION_ERROR)
        exit(1)
    except (PartialCredentialsError, NoCredentialsError):
        print(NO_CREDENTIALS_ERROR)
        exit(1)
    except ClientError:
        print(WRONG_CREDENTIALS_ERROR)
        exit(1)
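describe_instances groups instances by reservation; a short usage sketch walking the returned structure:

response = get_aws_instances()
for reservation in response.get('Reservations', []):
    for instance in reservation.get('Instances', []):
        print('{} {} {}'.format(instance['InstanceId'],
                                instance['InstanceType'],
                                instance['State']['Name']))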
Example #7
def ls_s3_url_objects(s3_url):
    """
    Return boto3 ObjectSummary objects under an S3 URL.

    Note: There is no current way in boto3 to do globs -- you filter on the client side.

    Returns:
        list: list of boto3 ObjectSummary objects under this path
    """
    result = []

    if s3_url[-1] != '/':
        s3_url += '/'

    bucket, s3_path = split_s3_url(s3_url)

    #if not s3_bucket_exists(bucket):
    #    return result

    if False:
        client = b3.client('s3')
        paginator = client.get_paginator('list_objects_v2')
        # use delimiter to groupby, which means, list things only at this level.
        #page_iterator = paginator.paginate(Bucket=bucket, Delimiter='/', Prefix=s3_path)
        page_iterator = paginator.paginate(Bucket=bucket, Prefix=s3_path)
        for page in page_iterator:
            result += [obj['Key'] for obj in page['Contents']]
    else:
        s3 = b3.resource('s3')
        try:
            s3_b = s3.Bucket(bucket)
            for i in s3_b.objects.filter(Prefix=s3_path, MaxKeys=1024):
                result.append(i)
            if len(result) == 1024:
                _logger.warn(
                    "ls_s3_url_objects: hit MaxKeys 1024 limit in result set.")
        except Exception as e:
            _logger.error(
                "ls_s3_url_objects: failed with exception {}".format(e))
            raise

    return result
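A usage sketch; the bucket and prefix are illustrative placeholders, and each returned entry is a boto3 ObjectSummary with key, size, and last_modified attributes:

for obj in ls_s3_url_objects('s3://my-bucket/some/prefix'):
    print('{} ({} bytes)'.format(obj.key, obj.size))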
Example #8
def ls_s3_url(s3_url):
    """

    Args:
        s3_url:

    Returns:
        list(dict)
    """

    bucket, s3_path = split_s3_url(s3_url)
    result = []
    client = b3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    # use delimiter to groupby, which means, list things only at this level.
    #page_iterator = paginator.paginate(Bucket=bucket, Delimiter='/', Prefix=s3_path)
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=s3_path)
    for page in page_iterator:
        # A page has no 'Contents' key when nothing matches the prefix.
        result += page.get('Contents', [])

    return result
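Unlike Example #7, this helper returns plain dicts from the list_objects_v2 'Contents' entries; a usage sketch with a placeholder URL:

for entry in ls_s3_url('s3://my-bucket/some/prefix'):
    print('{} {} {}'.format(entry['Key'], entry['Size'], entry['LastModified']))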
Example #9
    def __init__(self, **kwargs):
        valid_output_options = ('color_enabled', 'output_stream_enabled',
                                'output_group_enabled',
                                'output_timestamp_enabled',
                                'output_ingestion_time_enabled', 'query')

        self.output_options = {
            k: v
            for k, v in kwargs.iteritems() if k in valid_output_options
        }
        self.aws_region = kwargs.get('aws_region')
        self.aws_access_key_id = kwargs.get('aws_access_key_id')
        self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
        self.aws_session_token = kwargs.get('aws_session_token')
        self.log_group_name = kwargs.get('log_group_name')
        self.log_stream_prefix = kwargs.get('log_stream_prefix')
        self.filter_pattern = kwargs.get('filter_pattern')
        self.watch = kwargs.get('watch')
        if self.watch:
            sys.stderr.write(
                colored(
                    "Watch flag is currently broken! "
                    "You'll see new logs displayed to the console, "
                    "but they will be stale.\n", "yellow"))
        self.start = self.parse_datetime(kwargs.get('start'))
        self.end = self.parse_datetime(kwargs.get('end'))
        self.query = kwargs.get('query')
        self.query_template_file = kwargs.get('query_template_file')
        self.query_template_args = kwargs.get('args')

        self.log_group_prefix = kwargs.get('log_group_prefix')
        self.client = boto3_session_cache.client(
            'logs',
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            aws_session_token=self.aws_session_token,
            region_name=self.aws_region)
Example #10
File: run.py  Project: wontonswaggie/disdat
def _run(input_bundle,
         output_bundle,
         pipeline_params,
         pipeline_class_name,
         backend=Backend.Local,
         force=False,
         push_input_bundle=True,
         input_tags=None,
         output_tags=None):
    """Run the dockerized version of a pipeline.

    Args:
        input_bundle: The human name of the input bundle
        output_bundle: The human name of the output bundle
        pipeline_class_name: Name of the pipeline class to run
        pipeline_params: Optional arguments to pass to the pipeline class
        backend: The batch execution back-end to use (default
            `Backend.Local`)
        force: If `True` force recomputation of all upstream pipe
            requirements (default `False`)
        push_input_bundle: If `True`, push the latest committed input bundle
            to the remote before running (default `True`)
        input_tags: Find bundle with these tags
        output_tags: Push result bundle with these tags

    Returns:
        `None`
    """

    #print "_run args are {}".format(pipeline_params)

    pfs = fs.DisdatFS()
    disdat_config = common.DisdatConfig.instance()

    pipeline_image_name = common.make_pipeline_image_name(pipeline_class_name)

    try:
        output_bundle_uuid, remote, branch_name = common.get_run_command_parameters(
            pfs)
    except ValueError:
        _logger.error(
            "'run' requires a remote set with `dsdt remote <s3 url>`")
        return

    if backend == Backend.AWSBatch:
        # Get the parameter values required to kick off an AWS Batch job.
        # Every batch job must:
        # 1. Have a name
        # 2. Have a job definition that declares which ECR-hosted Docker
        #    image to use.
        # 3. Have a queue that feeds jobs into a compute cluster.
        # 4. Specify the command to execute inside the Docker image; the
        #    command args are more-or-less the same as the ones used to
        #    execute locally using 'dsdt run'
        job_name = '{}-{}'.format(pipeline_image_name, int(time.time()))
        job_definition_name = aws.batch_get_job_definition_name(
            pipeline_class_name)
        if disdat_config.parser.has_option(_MODULE_NAME,
                                           'aws_batch_job_definition'):
            job_definition_name = disdat_config.parser.get(
                _MODULE_NAME, 'aws_batch_job_definition')

        # If the job definition does not exist, create it.
        job_definition = aws.batch_get_job_definition(job_definition_name)
        if job_definition is None:
            repository_prefix = disdat_config.parser.get(
                'docker', 'repository_prefix')
            repository_name = common.make_pipeline_repository_name(
                repository_prefix, pipeline_class_name)
            # Figure out the fully-qualified repository name, i.e., the name
            # including the registry.
            registry_name = disdat_config.parser.get('docker',
                                                     'registry').strip('/')
            if registry_name == '*ECR*':
                fq_repository_name = aws.ecr_get_fq_respository_name(
                    repository_name)
            else:
                fq_repository_name = '{}/{}'.format(registry_name,
                                                    repository_name)
            aws.batch_register_job_definition(job_definition_name,
                                              fq_repository_name)
            job_definition = aws.batch_get_job_definition(job_definition_name)
        job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

        # Assemble the command...
        job_command = common.make_run_command(input_bundle, output_bundle,
                                              output_bundle_uuid, remote,
                                              branch_name, input_tags,
                                              output_tags, pipeline_params)
        container_overrides = {'command': job_command}

        # Through the magic boto3_session_cache, we get clients to interact
        # with AWS services and (if necessary) temporary tokens if using
        # AWS profiles/MFA tokens.
        client = b3.client('batch', region_name=aws.profile_get_region())
        job = client.submit_job(jobName=job_name,
                                jobDefinition=job_definition,
                                jobQueue=job_queue,
                                containerOverrides=container_overrides)
        status = job['ResponseMetadata']['HTTPStatusCode']
        if status == 200:
            print 'Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'.format(
                job['jobName'], job['jobId'], job_definition, job_queue)
        else:
            _logger.error('Job submission failed: HTTP Status {}'.format(status))
    elif backend == Backend.Local:

        client = docker.from_env()
        # Configure the container environment and mounted file systems.
        environment = {}
        if 'AWS_PROFILE' in os.environ:
            environment['AWS_PROFILE'] = os.environ['AWS_PROFILE']
        volumes = {}
        aws_config_dir = os.getenv('AWS_CONFIG_DIR',
                                   os.path.join(os.environ['HOME'], '.aws'))
        if aws_config_dir is not None and os.path.exists(aws_config_dir):
            volumes[aws_config_dir] = {'bind': '/root/.aws', 'mode': 'rw'}
        # Make sure latest committed is sent to remote
        if push_input_bundle:
            result = pfs.push(human_name=input_bundle)
            if result is None:
                _logger.error(
                    "'run' failed trying to push input bundle {} to remote.".
                    format(input_bundle))
                return
        # Now try to run the container
        try:
            args = ' '.join(
                common.make_run_command(input_bundle, output_bundle,
                                        output_bundle_uuid, remote,
                                        branch_name, input_tags, output_tags,
                                        pipeline_params))

            print "run.py ARGS {}".format(args)

            _logger.debug('Running image {} with arguments {}'.format(
                pipeline_image_name, args))
            stdout = client.containers.run(pipeline_image_name,
                                           args,
                                           detach=False,
                                           environment=environment,
                                           init=True,
                                           stderr=True,
                                           volumes=volumes)
            print stdout
        except docker.errors.ImageNotFound:
            _logger.error("Unable to find the docker image {}".format(
                pipeline_image_name))
            return
        # Now that this is finished, we need to pull this from the remote.
        pfs.pull(output_bundle, output_bundle_uuid)
    else:
        raise ValueError(
            'Got unrecognized job backend \'{}\': Expected {}'.format(
                backend, Backend.options()))
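A hedged sketch of how _run might be invoked; the bundle names, pipeline parameters, and pipeline class below are illustrative placeholders, not part of the original project:

# Hypothetical invocation of _run against the AWS Batch backend.
_run('examples.input', 'examples.output',
     ['--epochs', '10'], 'pipelines.my_pipeline.MyPipeline',
     backend=Backend.AWSBatch)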
Example #11
def _run_aws_sagemaker(arglist, job_name, pipeline_class_name):
    """
    Runs a training job on AWS SageMaker. This uses the default machine type
    configured in the disdat.cfg file.

    Args:
        arglist: Pipeline arguments, converted into SageMaker hyperparameters
        job_name (str): The name to give the SageMaker training job
        pipeline_class_name (str): Name of the pipeline class to run

    Returns:
        TrainingJobArn (str)
    """

    disdat_config = DisdatConfig.instance()

    job_name = job_name.replace(
        '_',
        '-')  # b/c SageMaker complains it must be ^[a-zA-Z0-9](-*[a-zA-Z0-9])*

    hyperparameter_dict = _sagemaker_hyperparameters_from_arglist(arglist)

    fq_repository_name = get_fq_docker_repo_name(True, pipeline_class_name)

    algorithm_specification = {
        'TrainingImage': fq_repository_name,
        'TrainingInputMode': 'File'
    }

    role_arn = disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_role_arn')

    input_channel_config = [
        {
            'ChannelName': 'disdat_sagemaker_input_blackhole',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3Uri': disdat_config.parser.get(
                        _MODULE_NAME, 'aws_sagemaker_s3_input_uri'),
                    'S3DataDistributionType': 'FullyReplicated'
                }
            },
            'ContentType': 'application/javascript',
            'CompressionType': 'None',  # | 'Gzip'
            'RecordWrapperType': 'None'  # | 'RecordIO'
        },
    ]

    output_data_config = {
        'S3OutputPath': os.path.join(
            disdat_config.parser.get(_MODULE_NAME,
                                     'aws_sagemaker_s3_output_uri'),
            job_name)
    }

    resource_config = {
        'InstanceType': disdat_config.parser.get(
            _MODULE_NAME, 'aws_sagemaker_instance_type'),
        'InstanceCount': int(disdat_config.parser.get(
            _MODULE_NAME, 'aws_sagemaker_instance_count')),
        'VolumeSizeInGB': int(disdat_config.parser.get(
            _MODULE_NAME, 'aws_sagemaker_volume_sizeGB'))
        # 'VolumeKmsKeyId': 'string'
    }

    vpc_config = None  # {'SecurityGroupIds': [], 'Subnets': []}

    stopping_condition = {
        'MaxRuntimeInSeconds': int(disdat_config.parser.get(
            _MODULE_NAME, 'aws_sagemaker_max_runtime_sec'))
    }

    tags = [{
        'Key': 'user',
        'Value': 'disdat'
    }, {
        'Key': 'job',
        'Value': job_name
    }]

    if False:
        print "Disdat SageMaker configs"
        print "job name: {}".format(job_name)
        print "hparams: {}".format(hyperparameter_dict)
        print "algorithm: {}".format(algorithm_specification)
        print "Role ARN: {}".format(role_arn)
        print "Input data conf: {}".format(input_channel_config)
        print "Output data conf: {}".format(output_data_config)
        print "Resource conf: {}".format(resource_config)
        print "VPC conf: {}".format(vpc_config)
        print "Stopping condition seconds: {}".format(stopping_condition)
        print "Tags: {}".format(tags)

    client = b3.client('sagemaker', region_name=aws.profile_get_region())

    response = client.create_training_job(
        TrainingJobName=job_name,
        HyperParameters=hyperparameter_dict,
        AlgorithmSpecification=algorithm_specification,
        RoleArn=role_arn,
        InputDataConfig=input_channel_config,
        OutputDataConfig=output_data_config,
        ResourceConfig=resource_config,
        #VpcConfig=vpc_config,
        StoppingCondition=stopping_condition,
        Tags=tags)

    _logger.info(
        "Disdat SageMaker create_training_job response {}".format(response))
    return response['TrainingJobArn']
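A sketch (not from the original source) of polling the training job created above until it leaves the 'InProgress' state; note that describe_training_job takes the job name passed to create_training_job, not the returned ARN:

import time

sm_client = b3.client('sagemaker', region_name=aws.profile_get_region())
job_name = 'my-training-job'  # placeholder: the TrainingJobName used above
while True:
    status = sm_client.describe_training_job(
        TrainingJobName=job_name)['TrainingJobStatus']
    if status != 'InProgress':
        break
    time.sleep(30)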
Example #12
def _run_aws_batch(arglist, job_name, pipeline_class_name,
                   aws_session_token_duration, vcpus, memory, no_submit,
                   job_role_arn):
    """
    Run a job on AWS Batch. Sends it to the queue configured in disdat.cfg.
    This assumes that you have already created a cluster that will run the jobs
    that have been assigned to that queue.

    Args:
        arglist (list): The command arguments to pass to the container
        job_name (str): The name to give the AWS Batch job
        pipeline_class_name (str): Name of the pipeline class to run
        aws_session_token_duration (int): Duration in seconds of the temporary
            STS credentials injected into the job environment (skipped if 0 or
            if `job_role_arn` is set)
        vcpus (int): The number of vCPUs requested in the job definition
        memory (int): The amount of memory in MiB requested in the job definition
        no_submit (bool): If `True`, only create the job definition and do not
            submit the job (default False)
        job_role_arn (str): The IAM role ARN for the job container; can be None

    Returns:
        The job definition object if `no_submit` is set, the submitted job
        dictionary on success, or `None` on submission failure
    """
    def check_role_arn(job_dict, jra):
        """ Check to see if the job desc dictionary contains the same job_role_arn (jra)
        """

        if jra is None:
            if 'jobRoleArn' not in job_dict['containerProperties']:
                return True
        else:
            if 'jobRoleArn' in job_dict['containerProperties']:
                if job_dict['containerProperties']['jobRoleArn'] == jra:
                    return True
        return False

    disdat_config = DisdatConfig.instance()

    # Get the parameter values required to kick off an AWS Batch job.
    # Every batch job must:
    # 1. Have a name
    # 2. Have a job definition that declares which ECR-hosted Docker
    #    image to use.
    # 3. Have a queue that feeds jobs into a compute cluster.
    # 4. Specify the command to execute inside the Docker image; the
    #    command args are more-or-less the same as the ones used to
    #    execute locally using 'dsdt run'

    # Create a Job Definition and upload it.
    # We create per-user job definitions so multiple users do not clobber each other.
    # In addition, we never re-use a job definition, since the user may update
    # the vcpu or memory requirements and those are stuck in the job definition

    fq_repository_name = get_fq_docker_repo_name(False, pipeline_class_name)

    job_definition_name = aws.batch_get_job_definition_name(
        pipeline_class_name)

    if disdat_config.parser.has_option(_MODULE_NAME,
                                       'aws_batch_job_definition'):
        job_definition_name = disdat_config.parser.get(
            _MODULE_NAME, 'aws_batch_job_definition')

    # TODO: Look through all of history to find one that matches?
    # TODO: Delete old jobs here or let user do it?
    job_definition_obj = aws.batch_get_latest_job_definition(
        job_definition_name)

    if (job_definition_obj is not None
            and job_definition_obj['containerProperties']['image']
            == fq_repository_name
            and job_definition_obj['containerProperties']['vcpus'] == vcpus
            and job_definition_obj['containerProperties']['memory'] == memory
            and check_role_arn(job_definition_obj, job_role_arn)):

        job_definition_fqn = aws.batch_extract_job_definition_fqn(
            job_definition_obj)

        _logger.info("Re-using prior AWS Batch run job definition : {}".format(
            job_definition_obj))

    else:
        """ Whether None or doesn't match, make a new one """

        job_definition_obj = aws.batch_register_job_definition(
            job_definition_name,
            fq_repository_name,
            vcpus=vcpus,
            memory=memory,
            job_role_arn=job_role_arn)

        job_definition_fqn = aws.batch_get_job_definition(job_definition_name)

        _logger.info(
            "New AWS Batch run job definition {}".format(job_definition_fqn))

    if no_submit:
        # Return the job description object
        return job_definition_obj

    job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

    container_overrides = {'command': arglist}

    # Through the magic of boto3_session_cache, the client in our script
    # here can get at AWS profiles and temporary AWS tokens created in
    # part from MFA tokens generated through the user's shells; we don't
    # have to write special code of our own to deal with authenticating
    # with AWS.
    client = b3.client('batch', region_name=aws.profile_get_region())
    # A bigger problem might be that the IAM role executing the job on
    # a batch EC2 instance might not have access to the S3 remote. To
    # get around this, allow the user to create some temporary AWS
    # credentials.
    if aws_session_token_duration > 0 and job_role_arn is None:
        sts_client = b3.client('sts')
        token = sts_client.get_session_token(
            DurationSeconds=aws_session_token_duration)
        credentials = token['Credentials']
        container_overrides['environment'] = [
            {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials['AccessKeyId']},
            {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials['SecretAccessKey']},
            {'name': 'AWS_SESSION_TOKEN', 'value': credentials['SessionToken']},
        ]
    job = client.submit_job(jobName=job_name,
                            jobDefinition=job_definition_fqn,
                            jobQueue=job_queue,
                            containerOverrides=container_overrides)
    status = job['ResponseMetadata']['HTTPStatusCode']
    if status == 200:
        _logger.info(
            'Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'
            .format(job['jobName'], job['jobId'], job_definition_fqn,
                    job_queue))
        return job
    else:
        _logger.error('Job submission failed: HTTP Status {}'.format(status))
        return None
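A sketch (not from the original source) of polling the submitted Batch job until it reaches a terminal state, where `job` is the dictionary returned by the function above:

import time

batch_client = b3.client('batch', region_name=aws.profile_get_region())
job_id = job['jobId']  # from the job dict returned by submit_job above
while True:
    desc = batch_client.describe_jobs(jobs=[job_id])['jobs'][0]
    if desc['status'] in ('SUCCEEDED', 'FAILED'):
        break
    time.sleep(30)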