def _run_aws_batch(arglist, fq_repository_name, job_name, pipeline_image_name,
                   aws_session_token_duration, vcpus, memory, no_submit, job_role_arn):
    """ Run job on AWS Batch.  Sends to the queue configured in disdat.cfg.
    This assumes that you have already created a cluster that will run the jobs
    that have been assigned to that queue.

    Args:
        arglist: Command arguments to pass to the container
        fq_repository_name (str): The fully qualified docker repository name
        job_name: The AWS Batch job name
        pipeline_image_name: The Docker image name for the pipeline
        aws_session_token_duration: Lifetime (seconds) of temporary STS credentials; 0 disables them
        vcpus: Number of vCPUs for the job definition
        memory: Memory (MiB) for the job definition
        no_submit (bool): If True, only create the job definition and do not submit. Default False
        job_role_arn (str): IAM role ARN for the job container. Can be None

    Returns:
        The submitted job (dict), the job definition (dict) if no_submit, or None on failure
    """

    def check_role_arn(job_dict, jra):
        """ Check to see if the job desc dictionary contains the same job_role_arn (jra) """
        if jra is None:
            if 'jobRoleArn' not in job_dict['containerProperties']:
                return True
        else:
            if 'jobRoleArn' in job_dict['containerProperties']:
                if job_dict['containerProperties']['jobRoleArn'] == jra:
                    return True
        return False

    disdat_config = DisdatConfig.instance()

    # Get the parameter values required to kick off an AWS Batch job.
    # Every batch job must:
    # 1. Have a name
    # 2. Have a job definition that declares which ECR-hosted Docker
    #    image to use.
    # 3. Have a queue that feeds jobs into a compute cluster.
    # 4. The command to execute inside the Docker image; the command
    #    args are more-or-less the same as the ones used to execute
    #    locally using 'dsdt run'

    # Create a job definition and upload it.
    # We create per-user job definitions so multiple users do not clobber each other.
    # In addition, we never re-use a job definition, since the user may update
    # the vcpu or memory requirements and those are stuck in the job definition.
    job_definition_name = aws.batch_get_job_definition_name(pipeline_image_name)

    if disdat_config.parser.has_option(_MODULE_NAME, 'aws_batch_job_definition'):
        job_definition_name = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_job_definition')

    # TODO: Look through all of history to find one that matches?
    # TODO: Delete old jobs here or let user do it?
    job_definition_obj = aws.batch_get_latest_job_definition(job_definition_name)

    if (job_definition_obj is not None and
            job_definition_obj['containerProperties']['image'] == fq_repository_name and
            job_definition_obj['containerProperties']['vcpus'] == vcpus and
            job_definition_obj['containerProperties']['memory'] == memory and
            check_role_arn(job_definition_obj, job_role_arn)):
        job_definition_fqn = aws.batch_extract_job_definition_fqn(job_definition_obj)
        _logger.info("Re-using prior AWS Batch run job definition: {}".format(job_definition_obj))
    else:
        # Whether the existing definition is None or doesn't match, make a new one.
        job_definition_obj = aws.batch_register_job_definition(job_definition_name, fq_repository_name,
                                                               vcpus=vcpus, memory=memory,
                                                               job_role_arn=job_role_arn)
        job_definition_fqn = aws.batch_get_job_definition(job_definition_name)
        _logger.info("New AWS Batch run job definition {}".format(job_definition_fqn))

    if no_submit:
        # Return the job definition object without submitting a job.
        return job_definition_obj

    job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

    container_overrides = {'command': arglist}

    # Through the magic of boto3_session_cache, the client in our script
    # here can get at AWS profiles and temporary AWS tokens created in
    # part from MFA tokens generated through the user's shell; we don't
    # have to write special code of our own to deal with authenticating
    # with AWS.
    client = b3.client('batch', region_name=aws.profile_get_region())

    # A bigger problem might be that the IAM role executing the job on
    # a batch EC2 instance might not have access to the S3 remote. To
    # get around this, allow the user to create some temporary AWS
    # credentials.
    if aws_session_token_duration > 0 and job_role_arn is None:
        sts_client = b3.client('sts')
        try:
            token = sts_client.get_session_token(DurationSeconds=aws_session_token_duration)
            credentials = token['Credentials']
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials['AccessKeyId']},
                {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials['SecretAccessKey']},
                {'name': 'AWS_SESSION_TOKEN', 'value': credentials['SessionToken']}
            ]
        except Exception:
            _logger.debug("Unable to generate an STS token, instead trying user's default credentials...")
            credentials = b3.session.Session().get_credentials()
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials.access_key},
                {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials.secret_key},
                {'name': 'AWS_SESSION_TOKEN', 'value': credentials.token}
            ]

    # Tell the container how many vCPUs it was given; create the environment
    # list here if no temporary credentials were injected above.
    container_overrides.setdefault('environment', []).append(
        {'name': 'DISDAT_CPU_COUNT', 'value': str(vcpus)})

    job = client.submit_job(jobName=job_name, jobDefinition=job_definition_fqn, jobQueue=job_queue,
                            containerOverrides=container_overrides)

    status = job['ResponseMetadata']['HTTPStatusCode']
    if status == 200:
        _logger.info('Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'.format(
            job['jobName'], job['jobId'], job_definition_fqn, job_queue))
        return job
    else:
        _logger.error('Job submission failed: HTTP Status {}'.format(status))
        return None
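
# A hedged usage sketch of _run_aws_batch (illustrative only, not part of disdat):
# the argument values below are hypothetical and depend on your ECR registry,
# disdat.cfg queue settings, and IAM setup.
#
#   job = _run_aws_batch(
#       arglist=['dsdt', 'apply', 'in.bundle', 'out.bundle', 'module.PipelineClass'],
#       fq_repository_name='123456789012.dkr.ecr.us-east-1.amazonaws.com/my-pipeline',
#       job_name='my-pipeline-1514764800',
#       pipeline_image_name='my-pipeline',
#       aws_session_token_duration=43200,   # 12 hours of temporary credentials
#       vcpus=2,
#       memory=4000,                        # MiB
#       no_submit=False,                    # True registers the job definition only
#       job_role_arn=None)                  # or an IAM role ARN with access to the S3 remote
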

def _run_aws_sagemaker(arglist, fq_repository_name, job_name):
    """ Runs a training job on AWS SageMaker.  This uses the default machine type
    in the disdat.cfg file.

    Args:
        arglist:
        fq_repository_name (str): fully qualified repository name
        job_name: instance job name

    Returns:
        TrainingJobArn (str)
    """

    disdat_config = DisdatConfig.instance()

    job_name = job_name.replace('_', '-')  # b/c SageMaker complains it must be ^[a-zA-Z0-9](-*[a-zA-Z0-9])*

    hyperparameter_dict = _sagemaker_hyperparameters_from_arglist(arglist)

    algorithm_specification = {'TrainingImage': fq_repository_name,
                               'TrainingInputMode': 'File'}

    role_arn = disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_role_arn')

    input_channel_config = [
        {
            'ChannelName': 'disdat_sagemaker_input_blackhole',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3Uri': disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_s3_input_uri'),
                    'S3DataDistributionType': 'FullyReplicated'
                }
            },
            'ContentType': 'application/javascript',
            'CompressionType': 'None',   # | 'Gzip'
            'RecordWrapperType': 'None'  # | 'RecordIO'
        },
    ]

    output_data_config = {
        'S3OutputPath': os.path.join(
            disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_s3_output_uri'), job_name)
    }

    resource_config = {
        'InstanceType': disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_instance_type'),
        'InstanceCount': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_instance_count')),
        'VolumeSizeInGB': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_volume_sizeGB'))
        # 'VolumeKmsKeyId': 'string'
    }

    vpc_config = None  # {'SecurityGroupIds': [], 'Subnets': []}

    stopping_condition = {
        'MaxRuntimeInSeconds': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_max_runtime_sec'))
    }

    tags = [{'Key': 'user', 'Value': 'disdat'},
            {'Key': 'job', 'Value': job_name}]

    if False:
        print("Disdat SageMaker configs")
        print("job name: {}".format(job_name))
        print("hparams: {}".format(hyperparameter_dict))
        print("algorithm: {}".format(algorithm_specification))
        print("Role ARN: {}".format(role_arn))
        print("Input data conf: {}".format(input_channel_config))
        print("Output data conf: {}".format(output_data_config))
        print("Resource conf: {}".format(resource_config))
        print("VPC conf: {}".format(vpc_config))
        print("Stopping condition seconds: {}".format(stopping_condition))
        print("Tags: {}".format(tags))

    client = b3.client('sagemaker', region_name=aws.profile_get_region())

    response = client.create_training_job(
        TrainingJobName=job_name,
        HyperParameters=hyperparameter_dict,
        AlgorithmSpecification=algorithm_specification,
        RoleArn=role_arn,
        InputDataConfig=input_channel_config,
        OutputDataConfig=output_data_config,
        ResourceConfig=resource_config,
        StoppingCondition=stopping_condition,
        Tags=tags)

    _logger.info("Disdat SageMaker create_training_job response {}".format(response))

    return response['TrainingJobArn']
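
# For reference, a hedged sketch of the disdat.cfg options read above (assuming
# _MODULE_NAME maps to the [run] section; every value here is hypothetical):
#
#   [run]
#   aws_sagemaker_role_arn = arn:aws:iam::123456789012:role/SageMakerExecutionRole
#   aws_sagemaker_s3_input_uri = s3://my-bucket/sagemaker/input
#   aws_sagemaker_s3_output_uri = s3://my-bucket/sagemaker/output
#   aws_sagemaker_instance_type = ml.m5.xlarge
#   aws_sagemaker_instance_count = 1
#   aws_sagemaker_volume_sizeGB = 128
#   aws_sagemaker_max_runtime_sec = 86400
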

def _run(input_bundle, output_bundle, pipeline_params, pipeline_class_name,
         backend=Backend.Local, force=False, push_input_bundle=True,
         input_tags=None, output_tags=None):
    """Run the dockerized version of a pipeline.

    Args:
        input_bundle: The human name of the input bundle
        output_bundle: The human name of the output bundle
        pipeline_params: Optional arguments to pass to the pipeline class
        pipeline_class_name: Name of the pipeline class to run
        backend: The batch execution back-end to use (default `Backend.Local`)
        force: If `True` force recomputation of all upstream pipe requirements (default `False`)
        push_input_bundle: If `True` push the latest committed input bundle to the remote first (default `True`)
        input_tags: Find bundle with these tags
        output_tags: Push result bundle with these tags

    Returns:
        `None`
    """

    # print("_run args are {}".format(pipeline_params))

    pfs = fs.DisdatFS()
    disdat_config = common.DisdatConfig.instance()

    pipeline_image_name = common.make_pipeline_image_name(pipeline_class_name)

    try:
        output_bundle_uuid, remote, branch_name = common.get_run_command_parameters(pfs)
    except ValueError:
        _logger.error("'run' requires a remote set with `dsdt remote <s3 url>`")
        return

    if backend == Backend.AWSBatch:
        # Get the parameter values required to kick off an AWS Batch job.
        # Every batch job must:
        # 1. Have a name
        # 2. Have a job definition that declares which ECR-hosted Docker
        #    image to use.
        # 3. Have a queue that feeds jobs into a compute cluster.
        # 4. The command to execute inside the Docker image; the command
        #    args are more-or-less the same as the ones used to execute
        #    locally using 'dsdt run'
        job_name = '{}-{}'.format(pipeline_image_name, int(time.time()))

        job_definition_name = aws.batch_get_job_definition_name(pipeline_class_name)

        if disdat_config.parser.has_option(_MODULE_NAME, 'aws_batch_job_definition'):
            job_definition_name = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_job_definition')

        # If the job definition does not exist, create it.
        job_definition = aws.batch_get_job_definition(job_definition_name)

        if job_definition is None:
            repository_prefix = disdat_config.parser.get('docker', 'repository_prefix')
            repository_name = common.make_pipeline_repository_name(repository_prefix, pipeline_class_name)

            # Figure out the fully-qualified repository name, i.e., the name
            # including the registry.
            registry_name = disdat_config.parser.get('docker', 'registry').strip('/')
            if registry_name == '*ECR*':
                fq_repository_name = aws.ecr_get_fq_respository_name(repository_name)
            else:
                fq_repository_name = '{}/{}'.format(registry_name, repository_name)

            aws.batch_register_job_definition(job_definition_name, fq_repository_name)
            job_definition = aws.batch_get_job_definition(job_definition_name)

        job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

        # Assemble the command...
        job_command = common.make_run_command(input_bundle, output_bundle, output_bundle_uuid, remote,
                                              branch_name, input_tags, output_tags, pipeline_params)
        container_overrides = {'command': job_command}

        # Through the magic of boto3_session_cache, we get clients to interact
        # with AWS services and (if necessary) temporary tokens if using
        # AWS profiles/MFA tokens.
        client = b3.client('batch', region_name=aws.profile_get_region())

        job = client.submit_job(jobName=job_name, jobDefinition=job_definition, jobQueue=job_queue,
                                containerOverrides=container_overrides)

        status = job['ResponseMetadata']['HTTPStatusCode']
        if status == 200:
            print('Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'.format(
                job['jobName'], job['jobId'], job_definition, job_queue))
        else:
            _logger.error('Job submission failed: HTTP Status {}'.format(status))

    elif backend == Backend.Local:
        client = docker.from_env()

        # Configure the container environment and mounted file systems.
        environment = {}
        if 'AWS_PROFILE' in os.environ:
            environment['AWS_PROFILE'] = os.environ['AWS_PROFILE']
        volumes = {}
        aws_config_dir = os.getenv('AWS_CONFIG_DIR', os.path.join(os.environ['HOME'], '.aws'))
        if aws_config_dir is not None and os.path.exists(aws_config_dir):
            volumes[aws_config_dir] = {'bind': '/root/.aws', 'mode': 'rw'}

        # Make sure the latest committed input bundle is sent to the remote.
        if push_input_bundle:
            result = pfs.push(human_name=input_bundle)
            if result is None:
                _logger.error("'run' failed trying to push input bundle {} to remote.".format(input_bundle))
                return

        # Now try to run the container.
        try:
            args = ' '.join(common.make_run_command(input_bundle, output_bundle, output_bundle_uuid, remote,
                                                    branch_name, input_tags, output_tags, pipeline_params))
            print("run.py ARGS {}".format(args))
            _logger.debug('Running image {} with arguments {}'.format(pipeline_image_name, args))
            stdout = client.containers.run(pipeline_image_name, args, detach=False, environment=environment,
                                           init=True, stderr=True, volumes=volumes)
            print(stdout)
        except docker.errors.ImageNotFound:
            _logger.error("Unable to find the docker image {}".format(pipeline_image_name))
            return

        # Now that this is finished, pull the output bundle from the remote.
        pfs.pull(output_bundle, output_bundle_uuid)

    else:
        raise ValueError("Got unrecognized job backend '{}': Expected {}".format(backend, Backend.options()))
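
# A hedged example of how _run might be invoked for a local, dockerized execution
# (illustrative only; the bundle names, class path, and parameters are hypothetical
# and are normally assembled by the CLI entry point):
#
#   _run(input_bundle='demo.input',
#        output_bundle='demo.output',
#        pipeline_params=['--my-param', '1'],
#        pipeline_class_name='pipelines.demo.DemoPipeline',
#        backend=Backend.Local)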