def batch_get_latest_job_definition(job_definition_name):
    """Get the most recent active revision of an AWS Batch job definition.

    Args:
        job_definition_name: The name of the job definition

    Return:
        The latest job definition dictionary or `None` if the job definition
        does not exist
    """
    region = profile_get_region()
    client = b3.client('batch', region_name=region)
    response = client.describe_job_definitions(
        jobDefinitionName=job_definition_name, status='ACTIVE')
    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
        raise RuntimeError(
            'Failed to get job definition revisions for {}: HTTP Status {}'.format(
                job_definition_name,
                response['ResponseMetadata']['HTTPStatusCode']))
    job_definitions = response['jobDefinitions']
    revision = 0
    job_def = None
    for j in job_definitions:
        if j['jobDefinitionName'] != job_definition_name:
            continue
        if j['revision'] > revision:
            revision = j['revision']
            job_def = j
    return job_def
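# Illustrative sketch (assumption, not part of the original module): the helper
# above reads a single describe_job_definitions response, while the Batch API
# can page its results. A paginator-based variant would look roughly like this;
# `_example_latest_active_revision` is a hypothetical name.
def _example_latest_active_revision(job_definition_name, region_name):
    import boto3
    client = boto3.client('batch', region_name=region_name)
    latest = None
    paginator = client.get_paginator('describe_job_definitions')
    for page in paginator.paginate(jobDefinitionName=job_definition_name,
                                   status='ACTIVE'):
        for jd in page['jobDefinitions']:
            if latest is None or jd['revision'] > latest['revision']:
                latest = jd
    return latest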
def ecr_create_fq_respository_name(repository_name,
                                   policy_resource_package=None,
                                   policy_resource_name=None):
    """Create (or fetch) an ECR repository and return its fully-qualified URI.

    Optionally sets a repository policy loaded from a package resource.
    """
    ecr_client = b3.client('ecr', region_name=profile_get_region())
    # Create or fetch the repository in AWS (to store the image)
    try:
        response = ecr_client.create_repository(repositoryName=repository_name)
        repository_metadata = response['repository']
        # Set the policy on the repository
        if policy_resource_package is not None and policy_resource_name is not None:
            policy = pkg_resources.resource_string(
                policy_resource_package.__name__, policy_resource_name)
            _ = ecr_client.set_repository_policy(
                registryId=repository_metadata['registryId'],
                repositoryName=repository_name,
                policyText=policy,
                force=True)
    except ClientError as e:
        if e.response['Error']['Code'] == 'RepositoryAlreadyExistsException':
            response = ecr_client.describe_repositories(
                repositoryNames=[repository_name])
            repository_metadata = response['repositories'][0]
        else:
            raise e
    return repository_metadata['repositoryUri']
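# Usage sketch (assumption: 'disdat/my-pipeline' and 'my-pipeline:latest' are
# hypothetical names, and the docker SDK is installed): tag a locally built
# image with the fully-qualified URI and push it, using the ECR auth helper
# defined further down in this module.
def _example_push_image_to_ecr(local_image_tag='my-pipeline:latest',
                               repository_name='disdat/my-pipeline'):
    import docker
    repo_uri = ecr_create_fq_respository_name(repository_name)
    client = docker.from_env()
    image = client.images.get(local_image_tag)
    image.tag(repo_uri, tag='latest')
    client.images.push(repo_uri, tag='latest', auth_config=ecr_get_auth_config())
    return repo_uri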
def batch_register_job_definition(job_definition_name,
                                  remote_pipeline_image_name,
                                  vcpus=1,
                                  memory=2000):
    """Register a new AWS Batch job definition.

    Args:
        job_definition_name: The name of the job definition
        remote_pipeline_image_name: The ECR Docker image to load to run jobs
            using this definition
        vcpus: The number of vCPUs to use to run jobs using this definition
        memory: The amount of memory in MiB to use to run jobs using this
            definition
    """
    region = profile_get_region()
    client = b3.client('batch', region_name=region)
    response = client.register_job_definition(
        jobDefinitionName=job_definition_name,
        type='container',
        containerProperties={
            'image': remote_pipeline_image_name,
            'vcpus': vcpus,
            'memory': memory,
        })
    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
        raise RuntimeError(
            'Failed to create job definition {}: HTTP Status {}'.format(
                job_definition_name,
                response['ResponseMetadata']['HTTPStatusCode']))
def batch_get_job_definition(job_definition_name):
    """Get the fully-qualified name of the most recent active revision of an
    AWS Batch job definition.

    Args:
        job_definition_name: The name of the job definition

    Return:
        The fully-qualified job definition name with revision number, or
        `None` if the job definition does not exist
    """
    region = profile_get_region()
    client = b3.client('batch', region_name=region)
    response = client.describe_job_definitions(
        jobDefinitionName=job_definition_name, status='ACTIVE')
    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
        raise RuntimeError(
            'Failed to get job definition revisions for {}: HTTP Status {}'.format(
                job_definition_name,
                response['ResponseMetadata']['HTTPStatusCode']))
    job_definitions = response['jobDefinitions']
    revision = 0
    for j in job_definitions:
        if j['jobDefinitionName'] != job_definition_name:
            continue
        if j['revision'] > revision:
            revision = j['revision']
    if revision == 0:
        return None
    else:
        return '{}:{}'.format(job_definition_name, revision)
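# Usage sketch (hypothetical job definition and image names): register a job
# definition if one does not already exist, then resolve its revision-qualified
# name for submit_job.
def _example_ensure_job_definition(name='my-pipeline-job-def',
                                   image='my-registry/my-pipeline:latest'):
    fqn = batch_get_job_definition(name)
    if fqn is None:
        batch_register_job_definition(name, image)
        fqn = batch_get_job_definition(name)
    return fqn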
def ecr_get_auth_config():
    """Get a docker auth config (username/password) for pushing to ECR."""
    ecr_client = b3.client('ecr', region_name=profile_get_region())
    # Authorize docker to push to ECR
    response = ecr_client.get_authorization_token()
    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
        raise RuntimeError(
            'Failed to get AWS ECR authorization token: HTTP Status {}'.format(
                response['ResponseMetadata']['HTTPStatusCode']))
    token = response['authorizationData'][0]['authorizationToken']
    username, password = base64.decodestring(token).split(':')
    return {'username': username, 'password': password}
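# Usage sketch (assumption: docker SDK installed, default profile region): log
# the docker client in to ECR with the decoded credentials. The registry URL
# comes from the same get_authorization_token response and is re-fetched here
# purely for illustration.
def _example_docker_login_to_ecr():
    import docker
    auth = ecr_get_auth_config()
    registry = b3.client('ecr', region_name=profile_get_region()) \
        .get_authorization_token()['authorizationData'][0]['proxyEndpoint']
    client = docker.from_env()
    client.login(username=auth['username'], password=auth['password'],
                 registry=registry)
    return client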
def get_aws_instances():
    try:
        return boto3_session_cache.client('ec2').describe_instances()
    except NoRegionError:
        print(NO_REGION_ERROR)
        exit(1)
    except (PartialCredentialsError, NoCredentialsError):
        print(NO_CREDENTIALS_ERROR)
        exit(1)
    except ClientError:
        print(WRONG_CREDENTIALS_ERROR)
        exit(1)
def ls_s3_url_objects(s3_url):
    """
    Return aws boto3 ObjectSummary objects under an S3 URL.

    Note: There is no current way in boto3 to do globs -- you filter on the
    client side.

    Returns:
        list(ObjectSummary): the ObjectSummary objects under this path
    """
    result = []

    if s3_url[-1] != '/':
        s3_url += '/'

    bucket, s3_path = split_s3_url(s3_url)

    #if not s3_bucket_exists(bucket):
    #    return result

    if False:
        # Disabled alternative: page through list_objects_v2 with a paginator.
        client = b3.client('s3')
        paginator = client.get_paginator('list_objects_v2')
        # use delimiter to groupby, which means, list things only at this level.
        #page_iterator = paginator.paginate(Bucket=bucket, Delimiter='/', Prefix=s3_path)
        page_iterator = paginator.paginate(Bucket=bucket, Prefix=s3_path)
        for page in page_iterator:
            result += [obj['Key'] for obj in page['Contents']]
    else:
        s3 = b3.resource('s3')
        try:
            s3_b = s3.Bucket(bucket)
            for i in s3_b.objects.filter(Prefix=s3_path, MaxKeys=1024):
                result.append(i)
            if len(result) == 1024:
                _logger.warn(
                    "ls_s3_url_objects: hit MaxKeys 1024 limit in result set.")
        except Exception as e:
            _logger.error(
                "ls_s3_url_objects: failed with exception {}".format(e))
            raise

    return result
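# Usage sketch (hypothetical bucket and prefix): each ObjectSummary exposes
# `.key` and `.size` attributes, so callers can collect keys directly.
def _example_list_keys(s3_url='s3://my-bucket/my/prefix/'):
    return [obj.key for obj in ls_s3_url_objects(s3_url)]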
def ls_s3_url(s3_url):
    """
    List the objects under an S3 URL.

    Args:
        s3_url: The s3 URL (s3://bucket/prefix) to list

    Returns:
        list(dict): one list_objects_v2 'Contents' entry per object
    """
    bucket, s3_path = split_s3_url(s3_url)
    result = []
    client = b3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    # use delimiter to groupby, which means, list things only at this level.
    #page_iterator = paginator.paginate(Bucket=bucket, Delimiter='/', Prefix=s3_path)
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=s3_path)
    for page in page_iterator:
        # 'Contents' is absent when a page (or the whole prefix) is empty.
        result += page.get('Contents', [])

    return result
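# Usage sketch (hypothetical URL): each returned dict carries the standard
# list_objects_v2 fields such as 'Key', 'Size', and 'LastModified'.
def _example_total_bytes(s3_url='s3://my-bucket/my/prefix/'):
    return sum(obj['Size'] for obj in ls_s3_url(s3_url))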
def __init__(self, **kwargs):
    valid_output_options = ('color_enabled', 'output_stream_enabled',
                            'output_group_enabled', 'output_timestamp_enabled',
                            'output_ingestion_time_enabled', 'query')

    self.output_options = {
        k: v
        for k, v in kwargs.iteritems() if k in valid_output_options
    }

    self.aws_region = kwargs.get('aws_region')
    self.aws_access_key_id = kwargs.get('aws_access_key_id')
    self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
    self.aws_session_token = kwargs.get('aws_session_token')

    self.log_group_name = kwargs.get('log_group_name')
    self.log_stream_prefix = kwargs.get('log_stream_prefix')
    self.filter_pattern = kwargs.get('filter_pattern')
    self.watch = kwargs.get('watch')
    if self.watch:
        sys.stderr.write(
            colored(
                "Watch flag is currently broken! "
                "You'll see new logs displayed to the console, "
                "but they will be stale.\n", "yellow"))
    self.start = self.parse_datetime(kwargs.get('start'))
    self.end = self.parse_datetime(kwargs.get('end'))
    self.query = kwargs.get('query')
    self.query_template_file = kwargs.get('query_template_file')
    self.query_template_args = kwargs.get('args')
    self.log_group_prefix = kwargs.get('log_group_prefix')

    self.client = boto3_session_cache.client(
        'logs',
        aws_access_key_id=self.aws_access_key_id,
        aws_secret_access_key=self.aws_secret_access_key,
        aws_session_token=self.aws_session_token,
        region_name=self.aws_region)
def _run(input_bundle,
         output_bundle,
         pipeline_params,
         pipeline_class_name,
         backend=Backend.Local,
         force=False,
         push_input_bundle=True,
         input_tags=None,
         output_tags=None):
    """Run the dockerized version of a pipeline.

    Args:
        input_bundle: The human name of the input bundle
        output_bundle: The human name of the output bundle
        pipeline_params: Optional arguments to pass to the pipeline class
        pipeline_class_name: Name of the pipeline class to run
        backend: The batch execution back-end to use (default `Backend.Local`)
        force: If `True` force recomputation of all upstream pipe requirements
            (default `False`)
        push_input_bundle: If `True` push the latest committed input bundle to
            the remote before running locally (default `True`)
        input_tags: Find bundle with these tags
        output_tags: Push result bundle with these tags

    Returns:
        `None`
    """
    #print "_run args are {}".format(pipeline_params)

    pfs = fs.DisdatFS()
    disdat_config = common.DisdatConfig.instance()

    pipeline_image_name = common.make_pipeline_image_name(pipeline_class_name)

    try:
        output_bundle_uuid, remote, branch_name = common.get_run_command_parameters(pfs)
    except ValueError:
        _logger.error(
            "'run' requires a remote set with `dsdt remote <s3 url>`")
        return

    if backend == Backend.AWSBatch:
        # Get the parameter values required to kick off an AWS Batch job.
        # Every batch job must:
        # 1. Have a name
        # 2. Have a job definition that declares which ECR-hosted Docker
        #    image to use.
        # 3. Have a queue that feeds jobs into a compute cluster.
        # 4. The command to execute inside the Docker image; the command
        #    args are more-or-less the same as the ones used to execute
        #    locally using 'dsdt run'
        job_name = '{}-{}'.format(pipeline_image_name, int(time.time()))
        job_definition_name = aws.batch_get_job_definition_name(
            pipeline_class_name)
        if disdat_config.parser.has_option(_MODULE_NAME,
                                           'aws_batch_job_definition'):
            job_definition_name = disdat_config.parser.get(
                _MODULE_NAME, 'aws_batch_job_definition')

        # If the job definition does not exist, create it.
        job_definition = aws.batch_get_job_definition(job_definition_name)
        if job_definition is None:
            repository_prefix = disdat_config.parser.get(
                'docker', 'repository_prefix')
            repository_name = common.make_pipeline_repository_name(
                repository_prefix, pipeline_class_name)

            # Figure out the fully-qualified repository name, i.e., the name
            # including the registry.
            registry_name = disdat_config.parser.get('docker',
                                                     'registry').strip('/')
            if registry_name == '*ECR*':
                fq_repository_name = aws.ecr_get_fq_respository_name(
                    repository_name)
            else:
                fq_repository_name = '{}/{}'.format(registry_name,
                                                    repository_name)

            aws.batch_register_job_definition(job_definition_name,
                                              fq_repository_name)
            job_definition = aws.batch_get_job_definition(job_definition_name)

        job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

        # Assemble the command...
        job_command = common.make_run_command(input_bundle, output_bundle,
                                              output_bundle_uuid, remote,
                                              branch_name, input_tags,
                                              output_tags, pipeline_params)
        container_overrides = {'command': job_command}

        # Through the magic of boto3_session_cache, we get clients to interact
        # with AWS services and (if necessary) temporary tokens if using
        # AWS profiles/MFA tokens.
        client = b3.client('batch', region_name=aws.profile_get_region())
        job = client.submit_job(jobName=job_name,
                                jobDefinition=job_definition,
                                jobQueue=job_queue,
                                containerOverrides=container_overrides)
        status = job['ResponseMetadata']['HTTPStatusCode']
        if status == 200:
            print 'Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'.format(
                job['jobName'], job['jobId'], job_definition, job_queue)
        else:
            _logger.error('Job submission failed: HTTP Status {}'.format(status))
    elif backend == Backend.Local:
        client = docker.from_env()

        # Configure the container environment and mounted file systems.
        environment = {}
        if 'AWS_PROFILE' in os.environ:
            environment['AWS_PROFILE'] = os.environ['AWS_PROFILE']
        volumes = {}
        aws_config_dir = os.getenv('AWS_CONFIG_DIR',
                                   os.path.join(os.environ['HOME'], '.aws'))
        if aws_config_dir is not None and os.path.exists(aws_config_dir):
            volumes[aws_config_dir] = {'bind': '/root/.aws', 'mode': 'rw'}

        # Make sure latest committed is sent to remote
        if push_input_bundle:
            result = pfs.push(human_name=input_bundle)
            if result is None:
                _logger.error(
                    "'run' failed trying to push input bundle {} to remote.".format(
                        input_bundle))
                return

        # Now try to run the container
        try:
            args = ' '.join(
                common.make_run_command(input_bundle, output_bundle,
                                        output_bundle_uuid, remote,
                                        branch_name, input_tags, output_tags,
                                        pipeline_params))
            print "run.py ARGS {}".format(args)
            _logger.debug('Running image {} with arguments {}'.format(
                pipeline_image_name, args))
            stdout = client.containers.run(pipeline_image_name,
                                           args,
                                           detach=False,
                                           environment=environment,
                                           init=True,
                                           stderr=True,
                                           volumes=volumes)
            print stdout
        except docker.errors.ImageNotFound:
            _logger.error("Unable to find the docker image {}".format(
                pipeline_image_name))
            return

        # Now that this is finished, we need to pull this from the remote.
        pfs.pull(output_bundle, output_bundle_uuid)
    else:
        raise ValueError(
            'Got unrecognized job backend \'{}\': Expected {}'.format(
                backend, Backend.options()))
def _run_aws_sagemaker(arglist, job_name, pipeline_class_name):
    """
    Runs a training job on AWS SageMaker.  This uses the default machine type
    in the disdat.cfg file.

    Args:
        arglist: The command-line arguments, converted to SageMaker hyperparameters
        job_name (str): The name to give the training job
        pipeline_class_name (str): Name of the pipeline class to run

    Returns:
        TrainingJobArn (str)
    """

    disdat_config = DisdatConfig.instance()

    job_name = job_name.replace(
        '_', '-')  # b/c SageMaker complains it must be ^[a-zA-Z0-9](-*[a-zA-Z0-9])*

    hyperparameter_dict = _sagemaker_hyperparameters_from_arglist(arglist)

    fq_repository_name = get_fq_docker_repo_name(True, pipeline_class_name)

    algorithm_specification = {
        'TrainingImage': fq_repository_name,
        'TrainingInputMode': 'File'
    }

    role_arn = disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_role_arn')

    input_channel_config = [
        {
            'ChannelName': 'disdat_sagemaker_input_blackhole',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3Uri': disdat_config.parser.get(
                        _MODULE_NAME, 'aws_sagemaker_s3_input_uri'),
                    'S3DataDistributionType': 'FullyReplicated'
                }
            },
            'ContentType': 'application/javascript',
            'CompressionType': 'None',  # | 'Gzip',
            'RecordWrapperType': 'None'  # | 'RecordIO'
        },
    ]

    output_data_config = {
        'S3OutputPath': os.path.join(
            disdat_config.parser.get(_MODULE_NAME,
                                     'aws_sagemaker_s3_output_uri'), job_name)
    }

    resource_config = {
        'InstanceType': disdat_config.parser.get(_MODULE_NAME,
                                                 'aws_sagemaker_instance_type'),
        'InstanceCount': int(
            disdat_config.parser.get(_MODULE_NAME,
                                     'aws_sagemaker_instance_count')),
        'VolumeSizeInGB': int(
            disdat_config.parser.get(_MODULE_NAME,
                                     'aws_sagemaker_volume_sizeGB'))
        # 'VolumeKmsKeyId': 'string'
    }

    vpc_config = None  # {'SecurityGroupIds': [], 'Subnets': []}

    stopping_condition = {
        'MaxRuntimeInSeconds': int(
            disdat_config.parser.get(_MODULE_NAME,
                                     'aws_sagemaker_max_runtime_sec'))
    }

    tags = [{
        'Key': 'user',
        'Value': 'disdat'
    }, {
        'Key': 'job',
        'Value': job_name
    }]

    if False:
        # Disabled debug output of the assembled SageMaker configuration.
        print "Disdat SageMaker configs"
        print "job name: {}".format(job_name)
        print "hparams: {}".format(hyperparameter_dict)
        print "algorithm: {}".format(algorithm_specification)
        print "Role ARN: {}".format(role_arn)
        print "Input data conf: {}".format(input_channel_config)
        print "Output data conf: {}".format(output_data_config)
        print "Resource conf: {}".format(resource_config)
        print "VPC conf: {}".format(vpc_config)
        print "Stopping condition seconds: {}".format(stopping_condition)
        print "Tags: {}".format(tags)

    client = b3.client('sagemaker', region_name=aws.profile_get_region())

    response = client.create_training_job(
        TrainingJobName=job_name,
        HyperParameters=hyperparameter_dict,
        AlgorithmSpecification=algorithm_specification,
        RoleArn=role_arn,
        InputDataConfig=input_channel_config,
        OutputDataConfig=output_data_config,
        ResourceConfig=resource_config,
        #VpcConfig=vpc_config,
        StoppingCondition=stopping_condition,
        Tags=tags)

    _logger.info(
        "Disdat SageMaker create_training_job response {}".format(response))

    return response['TrainingJobArn']
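# Follow-up sketch (assumption: a caller may want to block until training
# finishes): poll describe_training_job for the submitted job name until it
# reaches a terminal status. `_example_wait_for_training_job` is a hypothetical
# helper, not part of the original module.
def _example_wait_for_training_job(job_name, region_name, poll_seconds=60):
    import time
    import boto3
    client = boto3.client('sagemaker', region_name=region_name)
    while True:
        desc = client.describe_training_job(TrainingJobName=job_name)
        if desc['TrainingJobStatus'] in ('Completed', 'Failed', 'Stopped'):
            return desc
        time.sleep(poll_seconds)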
def _run_aws_batch(arglist, job_name, pipeline_class_name,
                   aws_session_token_duration, vcpus, memory, no_submit,
                   job_role_arn):
    """
    Run job on AWS Batch.  Sends to queue configured in disdat.cfg.
    This assumes that you have already created a cluster that will run the jobs
    that have been assigned to that queue.

    Args:
        arglist: The command to execute inside the container
        job_name (str): The name to give the submitted Batch job
        pipeline_class_name (str): Name of the pipeline class to run
        aws_session_token_duration (int): Lifetime in seconds of the temporary
            credentials passed to the job (0 disables them)
        vcpus (int): The number of vCPUs for the job definition
        memory (int): The amount of memory in MiB for the job definition
        no_submit (bool): If True only create the job definition; default False
        job_role_arn (str): Can be None

    Returns:
        The job description dict if submitted (or the job definition object
        when `no_submit` is True), else None
    """

    def check_role_arn(job_dict, jra):
        """ Check to see if the job desc dictionary contains the same job_role_arn (jra) """

        if jra is None:
            if 'jobRoleArn' not in job_dict['containerProperties']:
                return True
        else:
            if 'jobRoleArn' in job_dict['containerProperties']:
                if job_dict['containerProperties']['jobRoleArn'] == jra:
                    return True
        return False

    disdat_config = DisdatConfig.instance()

    # Get the parameter values required to kick off an AWS Batch job.
    # Every batch job must:
    # 1. Have a name
    # 2. Have a job definition that declares which ECR-hosted Docker
    #    image to use.
    # 3. Have a queue that feeds jobs into a compute cluster.
    # 4. The command to execute inside the Docker image; the command
    #    args are more-or-less the same as the ones used to execute
    #    locally using 'dsdt run'

    # Create a Job Definition and upload it.
    # We create per-user job definitions so multiple users do not clobber each other.
    # In addition, we never re-use a job definition, since the user may update
    # the vcpu or memory requirements and those are stuck in the job definition

    fq_repository_name = get_fq_docker_repo_name(False, pipeline_class_name)

    job_definition_name = aws.batch_get_job_definition_name(
        pipeline_class_name)

    if disdat_config.parser.has_option(_MODULE_NAME,
                                       'aws_batch_job_definition'):
        job_definition_name = disdat_config.parser.get(
            _MODULE_NAME, 'aws_batch_job_definition')

    # TODO: Look through all of history to find one that matches?
    # TODO: Delete old jobs here or let user do it?

    job_definition_obj = aws.batch_get_latest_job_definition(
        job_definition_name)

    if (job_definition_obj is not None
            and job_definition_obj['containerProperties']['image'] == fq_repository_name
            and job_definition_obj['containerProperties']['vcpus'] == vcpus
            and job_definition_obj['containerProperties']['memory'] == memory
            and check_role_arn(job_definition_obj, job_role_arn)):

        job_definition_fqn = aws.batch_extract_job_definition_fqn(
            job_definition_obj)

        _logger.info("Re-using prior AWS Batch run job definition : {}".format(
            job_definition_obj))

    else:
        # Whether None or doesn't match, make a new one
        job_definition_obj = aws.batch_register_job_definition(
            job_definition_name,
            fq_repository_name,
            vcpus=vcpus,
            memory=memory,
            job_role_arn=job_role_arn)

        job_definition_fqn = aws.batch_get_job_definition(job_definition_name)

        _logger.info(
            "New AWS Batch run job definition {}".format(job_definition_fqn))

    if no_submit:
        # Return the job description object
        return job_definition_obj

    job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

    container_overrides = {'command': arglist}

    # Through the magic of boto3_session_cache, the client in our script
    # here can get at AWS profiles and temporary AWS tokens created in
    # part from MFA tokens generated through the user's shells; we don't
    # have to write special code of our own to deal with authenticating
    # with AWS.
    client = b3.client('batch', region_name=aws.profile_get_region())

    # A bigger problem might be that the IAM role executing the job on
    # a batch EC2 instance might not have access to the S3 remote.  To
    # get around this, allow the user to create some temporary AWS
    # credentials.
    if aws_session_token_duration > 0 and job_role_arn is None:
        sts_client = b3.client('sts')
        token = sts_client.get_session_token(
            DurationSeconds=aws_session_token_duration)
        credentials = token['Credentials']
        container_overrides['environment'] = [{
            'name': 'AWS_ACCESS_KEY_ID',
            'value': credentials['AccessKeyId']
        }, {
            'name': 'AWS_SECRET_ACCESS_KEY',
            'value': credentials['SecretAccessKey']
        }, {
            'name': 'AWS_SESSION_TOKEN',
            'value': credentials['SessionToken']
        }]

    job = client.submit_job(jobName=job_name,
                            jobDefinition=job_definition_fqn,
                            jobQueue=job_queue,
                            containerOverrides=container_overrides)

    status = job['ResponseMetadata']['HTTPStatusCode']
    if status == 200:
        _logger.info(
            'Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'
            .format(job['jobName'], job['jobId'], job_definition_fqn,
                    job_queue))
        return job
    else:
        _logger.error('Job submission failed: HTTP Status {}'.format(status))
        return None
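# Follow-up sketch (assumption: a caller may want to block on the submitted
# job): poll describe_jobs with the returned job ID until it reaches a terminal
# state. `_example_wait_for_batch_job` is a hypothetical helper, not part of
# the original module.
def _example_wait_for_batch_job(job_id, region_name, poll_seconds=30):
    import time
    import boto3
    client = boto3.client('batch', region_name=region_name)
    while True:
        jobs = client.describe_jobs(jobs=[job_id])['jobs']
        if jobs and jobs[0]['status'] in ('SUCCEEDED', 'FAILED'):
            return jobs[0]
        time.sleep(poll_seconds)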