def s3_bucket_exists(bucket):
    """
    Check whether an S3 bucket exists, based on the Amazon docs recipe.

    Issues a HEAD request for the bucket through the client attached to the
    resource returned by get_s3_resource().

    Args:
        bucket: name of the bucket to probe

    Returns:
        bool: True if the HEAD request succeeds.  False if S3 answers with a
        404 (not found) or a 403 (forbidden) error code.

    Raises:
        botocore.exceptions.ClientError: for every other error code,
        including codes that are not numeric.
    """
    import botocore

    s3 = get_s3_resource()
    try:
        s3.meta.client.head_bucket(Bucket=bucket)
    except botocore.exceptions.ClientError as e:
        # The code is not guaranteed to be numeric (e.g. 'AccessDenied').
        # Calling int() on it blindly would raise ValueError and hide the
        # real ClientError, so only convert when it is all digits.
        raw_code = str(e.response['Error']['Code']).strip()
        error_code = int(raw_code) if raw_code.isdigit() else None

        if error_code == 404:
            return False

        if error_code == 403:
            # for buckets you can get a forbidden instead of resource not found
            # if you have the s3:ListBucket permission on the bucket, Amazon S3 will return a
            # HTTP status code 404 ("no such key") error.  If you don't have the s3:ListBucket permission,
            # Amazon S3 will return a HTTP status code 403 ("access denied") error.
            _logger.info(
                "aws_s3: bucket {} raised a 403 (access forbidden), do you have ListBucket permission?"
                .format(bucket))
            return False

        # Anything else is unexpected: let the original ClientError propagate.
        raise
    return True
def s3_path_exists(s3_url):
    """
    Given an entire path, does the key exist?

    If you're checking for partial key, make sure to end with '/'

    This is how you make "folders" in s3, you use a key ending with '/'
    e.g., s3://mybucket/onelevel/anotherdir/
    bucket = mybucket
    key = onelevel/anotherdir/ -- it's a zero size object.

    If checking for full path, you can end with thing itself.

    Args:
        s3_url: the full s3 url to test; it is split into bucket and key by
            split_s3_url()

    Returns:
        bool: True if the object's metadata can be loaded, False if S3 answers
        with a 404.  When split_s3_url() yields no key, the result of
        s3_bucket_exists() for the bucket is returned instead.

    Raises:
        botocore.exceptions.ClientError: for any error code other than 404,
        including codes that are not numeric.
    """
    import botocore

    bucket, key = split_s3_url(s3_url)

    if key is None:
        # Bucket-only URL: s3_bucket_exists() obtains its own resource, so
        # there is no need to create one here first.
        return s3_bucket_exists(bucket)

    s3 = get_s3_resource()
    try:
        s3.Object(bucket, key).load()
    except botocore.exceptions.ClientError as e:
        # Keep the code as text: int() on a non-numeric code would raise
        # ValueError and mask the ClientError we actually want to surface.
        raw_code = str(e.response['Error']['Code']).strip()
        _logger.info("Error code {}".format(raw_code))
        if raw_code.isdigit() and int(raw_code) == 404:
            return False
        raise
    return True
def resolve_bundle(pfs, pipe, is_left_edge_task, data_context):
    """
    Decide whether this pipe re-uses an existing output bundle or gets a new
    one, and record that decision with the pipe fs.

    Every "regenerate" outcome calls pfs.new_output_hframe(); the single
    "re-use" outcome calls pfs.reuse_hframe().

    Args:
        pfs: pipe fs object
        pipe: the pipe to investigate
        is_left_edge_task: True if this task starts the DAG.
        data_context: the data context object from which we should resolve bundles.

    Returns:
        bool: True if bundle found (not re-running).  False if bundle not found or being regenerated
    """
    # Hand-edited switches; none of them is driven by the arguments.
    verbose = False
    luigi_rerun = False  # True would re-use blindly, ignoring upstream state
    use_bundle = True
    regen_bundle = False

    def _regenerate():
        # Shared exit for every "cannot re-use" case.
        pfs.new_output_hframe(pipe, is_left_edge_task, data_context=data_context)
        return regen_bundle

    # 1.) Look for an output bundle for pipe_id (the specific pipeline/transform/param hash).
    if verbose:
        print("resolve_bundle: looking up bundle {}".format(pipe.pipe_id()))

    if pipe._mark_force and not worker._is_external(pipe):
        # Recompute was requested by an annotation in pipe.pipe_requires();
        # external pipes are never recomputed this way.
        _logger.debug("resolve_bundle: pipe.mark_force forcing a new output bundle.")
        if verbose:
            print("resolve_bundle: pipe.mark_force forcing a new output bundle.\n")
        return _regenerate()

    if pipe.force:
        # Recompute was requested with the --force directive.
        _logger.debug("resolve_bundle: --force forcing a new output bundle.")
        if verbose:
            print("resolve_bundle: --force forcing a new output bundle.\n")
        return _regenerate()

    bundles = pfs.get_hframe_by_proc(pipe.pipe_id(), getall=True, data_context=data_context)
    if bundles is None or len(bundles) == 0:
        # Nothing was ever produced under this pipe_id.
        if verbose:
            print("resolve_bundle: No bundle with proc_name {}, getting new output bundle.\n".format(pipe.pipe_id()))
        return _regenerate()

    # Best guess: the first entry is the most recent bundle with this pipe_id().
    bundle = bundles[0]

    # 2.) A bundle exists -- its lineage object names the input bundles.
    lineage = bundle.get_lineage()
    if lineage is None:
        if verbose:
            print("resolve_bundle: No lineage present, getting new output bundle.\n")
        return _regenerate()

    # 3.) Lineage exists -- re-run if the pipeline code has changed.
    pipeline_path = os.path.dirname(sys.modules[pipe.__module__].__file__)
    current_version = fs.DisdatFS().get_pipe_version(pipeline_path)
    if different_code_versions(current_version, lineage):
        if verbose:
            print("resolve_bundle: New code version, getting new output bundle.\n")
        return _regenerate()

    # 3.5.) Has the human bundle name changed?  If so, re-run.  All versions
    # with this processing id are scanned because, at the moment, a rename
    # makes a new bundle even though the data should be the same.
    # NOTE(review): the loop rebinds `bundle`, so the bundle re-used in step 5
    # is the one whose name matched, while `lineage` still comes from the
    # first entry -- confirm this is intended.
    current_human_name = pipe.pipeline_id()
    for bundle in bundles:
        if current_human_name == bundle.get_human_name():
            break
    else:
        # No version carries the current human name.
        if verbose:
            print("resolve_bundle: New human name {} (prior {}), getting new output bundle.\n".format(
                current_human_name, bundle.get_human_name()))
        return _regenerate()

    # 4.) Check the inputs -- assumes upstream tasks were already processed.
    #
    # Only the *current* upstream tasks are inspected, not the ones from the
    # prior run.  If an upstream UUID differs from the lineage, we must
    # re-run.  Typically an upstream re-runs because its code changed without
    # changing its parameters, or because someone re-ran a sql query and the
    # table contents changed, or because an output was removed on purpose to
    # force one stage to re-run.
    #
    # If one wanted to ignore code versions and data changes, the bundle could
    # be re-used regardless of upstream state (re-use blindly, like Luigi);
    # that is what luigi_rerun would enable.  Another option would be to look
    # for a missing input bundle remotely as well as locally.
    for upstream in pipe.requires():
        cache_entry = pfs.get_path_cache(upstream)

        if luigi_rerun:
            # Ignore upstream re-runs and ignore newer inputs.
            continue

        if cache_entry is None:
            # Can happen with bundles created by other pipelines: still
            # surface it in the log, but no longer raise an exception.
            _logger.info(
                "Resolve bundles: input bundle {} with no path cache entry. Likely produced by other pipesline".format(
                    upstream.task_id))
        elif cache_entry.rerun:
            if verbose:
                print("Resolve_bundle: an upstream task is in the pce and is being re-run, so we need to reun. getting new output bundle.\n")
            return _regenerate()

        local_bundle = pfs.get_hframe_by_proc(upstream.task_id, data_context=data_context)
        assert local_bundle is not None

        # Re-run if an input recorded in our lineage now has a different uuid.
        # POLICY
        # 1.) if the date is more recent, it is "new" data.
        # 2.) if it is older, we should require force (but currently do not and re-run).
        # XXX TODO: Add date to the depends_on pb data structure to enforce 2 XXX
        for dep in lineage.pb.depends_on:
            if dep.hframe_name == local_bundle.pb.processing_name:
                if dep.hframe_uuid != local_bundle.pb.uuid:
                    if verbose:
                        print("Resolve_bundle: prior input bundle {} {} has new uuid {}\n".format(
                            upstream.task_id, dep.hframe_uuid, local_bundle.pb.uuid))
                    return _regenerate()

    # 5.) Woot!  Nothing forced a re-run: re-use the found bundle.
    if verbose:
        print("resolve_bundle: reusing bundle\n")
    pfs.reuse_hframe(pipe, bundle, is_left_edge_task, data_context=data_context)
    return use_bundle
def _run_local(cli, pipeline_setup_file, arglist, backend):
    """
    Run container locally or run sagemaker container locally

    The container gets the AWS config directory (when it exists) and the
    local disdat meta directory mounted read-write.  For the local SageMaker
    backend the hyperparameters are written to a temporary directory that is
    mounted at /opt/ml/input/config/ and removed again once the run finishes.

    Args:
        cli (bool): Whether we were called from the CLI or API
        pipeline_setup_file (str): The FQ path to the setup.py used to dockerize the pipeline.
        arglist: arguments for the containerized pipeline; for the local
            SageMaker backend they are converted to hyperparameters and the
            container command becomes just 'train'
        backend: compared against Backend.LocalSageMaker to select the
            SageMaker flavour; any other value runs the plain project image

    Returns:
        output (str): the container output on success.  If the container
        fails (docker ContainerError) the text of that error is returned.
        Returns None if the image cannot be found.
    """
    import shutil

    # NOTE(review): assumes `platform` is sys.platform -- confirm against the file's imports.
    on_macos = platform == "darwin"

    client = docker.from_env()

    environment = {}
    if 'AWS_PROFILE' in os.environ:
        environment['AWS_PROFILE'] = os.environ['AWS_PROFILE']
    environment[common.LOCAL_EXECUTION] = 'True'

    # Todo: Local runs do not yet set resource limits, but when they do, we'll have to set this
    # environment['DISDAT_CPU_COUNT'] = vcpus

    volumes = {}

    # Only fall back to ~/.aws when AWS_CONFIG_DIR is not set.  expanduser()
    # avoids the KeyError that os.environ['HOME'] raises when HOME is unset.
    aws_config_dir = os.getenv('AWS_CONFIG_DIR')
    if aws_config_dir is None:
        aws_config_dir = os.path.join(os.path.expanduser('~'), '.aws')
    if os.path.exists(aws_config_dir):
        volumes[aws_config_dir] = {'bind': '/root/.aws', 'mode': 'rw'}

    local_disdat_meta_dir = DisdatConfig.instance().get_meta_dir()
    volumes[local_disdat_meta_dir] = {'bind': '/root/.disdat', 'mode': 'rw'}

    tempdir = None  # only created for the local SageMaker backend

    try:
        if backend == Backend.LocalSageMaker:
            pipeline_image_name = common.make_sagemaker_project_image_name(pipeline_setup_file)
            tempdir = tempfile.mkdtemp()
            with open(os.path.join(tempdir, 'hyperparameters.json'), 'w') as of:
                json.dump(_sagemaker_hyperparameters_from_arglist(arglist), of)
            args = ['train']  # rewrite to just 'train'
            # On mac OS, tempdir returns /var, but is actually /private/var
            # Add /private since it that dir is shared (and not /var) with Docker.
            if on_macos:
                localdir = os.path.join('/private', tempdir[1:])
            else:
                localdir = tempdir
            volumes[localdir] = {'bind': '/opt/ml/input/config/', 'mode': 'rw'}
            _logger.info("VOLUMES: {}".format(volumes))
        else:
            # Add the actual command to the arglist (for non-sagemaker runs)
            arglist = [ENTRYPOINT_BIN] + arglist
            pipeline_image_name = common.make_project_image_name(pipeline_setup_file)

        _logger.debug('Running image {} with arguments {}'.format(pipeline_image_name, arglist))

        # detach=False blocks until the container exits, so the temporary
        # config directory can safely be removed in the finally clause.
        stdout = client.containers.run(pipeline_image_name, arglist, detach=False,
                                       environment=environment, init=True, stderr=True,
                                       volumes=volumes)
        stdout = six.ensure_str(stdout)
        if cli:
            print(stdout)
        return stdout
    except docker.errors.ContainerError as ce:
        _logger.error("Internal error running image {}".format(pipeline_image_name))
        container_stderr = ce.stderr
        if container_stderr is not None:
            container_stderr = six.ensure_str(container_stderr)
        _logger.error("Error: {}".format(container_stderr))
        # six.ensure_str() only accepts text or bytes and raises TypeError for
        # an exception object, so render the error with str() instead.
        return str(ce)
    except docker.errors.ImageNotFound:
        _logger.error("Unable to find the docker image {}".format(pipeline_image_name))
        return None
    finally:
        if tempdir is not None:
            # Do not leave one hyperparameter directory behind per run.
            shutil.rmtree(tempdir, ignore_errors=True)
def _run_aws_sagemaker(arglist, fq_repository_name, job_name):
    """
    Create a training job on AWS SageMaker.

    Role, S3 input/output locations, instance type/count, volume size and the
    maximum runtime are all read from this module's section of the disdat
    configuration.

    Args:
        arglist: pipeline arguments, turned into SageMaker hyperparameters by
            _sagemaker_hyperparameters_from_arglist()
        fq_repository_name (str): fully qualified repository name, used as the training image
        job_name: instance job name; underscores are replaced with dashes

    Returns:
        TrainingJobArn (str)
    """
    disdat_config = DisdatConfig.instance()

    def setting(option):
        # All SageMaker options are read from this module's config section.
        return disdat_config.parser.get(_MODULE_NAME, option)

    # b/c SageMaker complains it must be ^[a-zA-Z0-9](-*[a-zA-Z0-9])*
    job_name = job_name.replace('_', '-')

    hyperparameter_dict = _sagemaker_hyperparameters_from_arglist(arglist)

    algorithm_specification = {
        'TrainingImage': fq_repository_name,
        'TrainingInputMode': 'File',
    }

    role_arn = setting('aws_sagemaker_role_arn')

    s3_data_source = {
        'S3DataType': 'S3Prefix',
        'S3Uri': setting('aws_sagemaker_s3_input_uri'),
        'S3DataDistributionType': 'FullyReplicated',
    }
    input_channel = {
        'ChannelName': 'disdat_sagemaker_input_blackhole',
        'DataSource': {'S3DataSource': s3_data_source},
        'ContentType': 'application/javascript',
        'CompressionType': 'None',  # | 'Gzip',
        'RecordWrapperType': 'None',  # | 'RecordIO'
    }
    input_channel_config = [input_channel]

    # Each job writes below its own prefix of the configured output uri.
    output_root = setting('aws_sagemaker_s3_output_uri')
    output_data_config = {'S3OutputPath': os.path.join(output_root, job_name)}

    instance_type = setting('aws_sagemaker_instance_type')
    instance_count = int(setting('aws_sagemaker_instance_count'))
    volume_size_gb = int(setting('aws_sagemaker_volume_sizeGB'))
    resource_config = {
        'InstanceType': instance_type,
        'InstanceCount': instance_count,
        'VolumeSizeInGB': volume_size_gb,
        # 'VolumeKmsKeyId': 'string'
    }

    vpc_config = None  # 'SecurityGroupIds': [], 'Subnets': []}

    max_runtime_sec = int(setting('aws_sagemaker_max_runtime_sec'))
    stopping_condition = {'MaxRuntimeInSeconds': max_runtime_sec}

    tags = [
        {'Key': 'user', 'Value': 'disdat'},
        {'Key': 'job', 'Value': job_name},
    ]

    # Hand-edited debugging aid: flip to True to dump the request pieces.
    dump_request = False
    if dump_request:
        print("Disdat SageMaker configs")
        print("job name: {}".format(job_name))
        print("hparams: {}".format(hyperparameter_dict))
        print("algorithm: {}".format(algorithm_specification))
        print("Role ARN: {}".format(role_arn))
        print("Input data conf: {}".format(input_channel_config))
        print("Output data conf: {}".format(output_data_config))
        print("Resource conf: {}".format(resource_config))
        print("VPC conf: {}".format(vpc_config))
        print("Stopping condition seconds: {}".format(stopping_condition))
        print("Tags: {}".format(tags))

    client = b3.client('sagemaker', region_name=aws.profile_get_region())

    request = dict(
        TrainingJobName=job_name,
        HyperParameters=hyperparameter_dict,
        AlgorithmSpecification=algorithm_specification,
        RoleArn=role_arn,
        InputDataConfig=input_channel_config,
        OutputDataConfig=output_data_config,
        ResourceConfig=resource_config,
        StoppingCondition=stopping_condition,
        Tags=tags,
    )
    response = client.create_training_job(**request)

    _logger.info("Disdat SageMaker create_training_job response {}".format(response))
    return response['TrainingJobArn']
def _run_aws_batch(arglist, fq_repository_name, job_name, pipeline_image_name,
                   aws_session_token_duration, vcpus, memory, no_submit, job_role_arn):
    """
    Run job on AWS Batch.   Sends to queue configured in disdat.cfg.
    This assumes that you have already created a cluster that will run the jobs
    that have been assigned to that queue.

    The latest job definition is re-used when its image, vcpus, memory and job
    role all match the request; otherwise a new one is registered.

    Args:
        arglist: the command to run inside the container
        fq_repository_name (str): The fully qualified docker repository name
        job_name: name given to the submitted job
        pipeline_image_name: used to derive the default job definition name
        aws_session_token_duration: if > 0 and no job role is given, temporary
            AWS credentials of this duration (seconds) are passed to the
            container through its environment
        vcpus: vcpus for the job definition; also exported to the container as
            DISDAT_CPU_COUNT
        memory: memory for the job definition
        no_submit (bool): default False
        job_role_arn (str): Can be None

    Returns:
        The job definition object if no_submit is set.  Otherwise the
        submit_job response when its HTTP status is 200, else None.
    """

    def check_role_arn(job_dict, jra):
        """ Check to see if the job desc dictionary contains the same job_role_arn (jra)
        """
        properties = job_dict['containerProperties']
        if jra is None:
            # No role requested: only a definition without any role matches.
            return 'jobRoleArn' not in properties
        return properties.get('jobRoleArn') == jra

    disdat_config = DisdatConfig.instance()

    # Get the parameter values required to kick off an AWS Batch job.
    # Every batch job must:
    # 1. Have a name
    # 2. Have a job definition that declares which ECR-hosted Docker
    #    image to use.
    # 3. Have a queue that feeds jobs into a compute cluster.
    # 4. The command to execute inside the Docker image; the command
    #    args are more-or-less the same as the ones used to execute
    #    locally using 'dsdt run'

    # Create a Job Definition and upload it.
    # We create per-user job definitions so multiple users do not clobber each other.
    # In addition, we never re-use a job definition, since the user may update
    # the vcpu or memory requirements and those are stuck in the job definition

    job_definition_name = aws.batch_get_job_definition_name(pipeline_image_name)

    if disdat_config.parser.has_option(_MODULE_NAME, 'aws_batch_job_definition'):
        job_definition_name = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_job_definition')

    # TODO: Look through all of history to find one that matches?
    # TODO: Delete old jobs here or let user do it?
    job_definition_obj = aws.batch_get_latest_job_definition(job_definition_name)

    if (job_definition_obj is not None and
            job_definition_obj['containerProperties']['image'] == fq_repository_name and
            job_definition_obj['containerProperties']['vcpus'] == vcpus and
            job_definition_obj['containerProperties']['memory'] == memory and
            check_role_arn(job_definition_obj, job_role_arn)):

        job_definition_fqn = aws.batch_extract_job_definition_fqn(job_definition_obj)

        _logger.info("Re-using prior AWS Batch run job definition : {}".format(job_definition_obj))

    else:
        # Whether None or doesn't match, make a new one
        job_definition_obj = aws.batch_register_job_definition(job_definition_name, fq_repository_name,
                                                               vcpus=vcpus, memory=memory,
                                                               job_role_arn=job_role_arn)

        job_definition_fqn = aws.batch_get_job_definition(job_definition_name)

        _logger.info("New AWS Batch run job definition {}".format(job_definition_fqn))

    if no_submit:
        # Return the job description object
        return job_definition_obj

    job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

    # Start with an empty environment so DISDAT_CPU_COUNT can always be
    # appended below, even when no credentials are injected.
    container_overrides = {'command': arglist, 'environment': []}

    # Through the magic of boto3_session_cache, the client in our script
    # here can get at AWS profiles and temporary AWS tokens created in
    # part from MFA tokens generated through the user's shells; we don't
    # have to write special code of our own to deal with authenticating
    # with AWS.
    client = b3.client('batch', region_name=aws.profile_get_region())

    # A bigger problem might be that the IAM role executing the job on
    # a batch EC2 instance might not have access to the S3 remote. To
    # get around this, allow the user to create some temporary AWS
    # credentials.
    if aws_session_token_duration > 0 and job_role_arn is None:
        sts_client = b3.client('sts')
        try:
            token = sts_client.get_session_token(DurationSeconds=aws_session_token_duration)
            credentials = token['Credentials']
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials['AccessKeyId']},
                {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials['SecretAccessKey']},
                {'name': 'AWS_SESSION_TOKEN', 'value': credentials['SessionToken']}
            ]
        except Exception:
            # Deliberately best-effort: fall back to the user's own credentials.
            _logger.debug(
                "Unable to generate an STS token, instead trying users default credentials...",
                exc_info=True)
            credentials = b3.session.Session().get_credentials()
            if credentials is None:
                _logger.warning("No default AWS credentials found; submitting job without credentials.")
            else:
                fallback_env = [
                    {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials.access_key},
                    {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials.secret_key}
                ]
                # Long-term keys have no session token; do not send a None value.
                if credentials.token is not None:
                    fallback_env.append({'name': 'AWS_SESSION_TOKEN', 'value': credentials.token})
                container_overrides['environment'] = fallback_env

    container_overrides['environment'].append({'name': 'DISDAT_CPU_COUNT', 'value': str(vcpus)})

    job = client.submit_job(jobName=job_name, jobDefinition=job_definition_fqn, jobQueue=job_queue,
                            containerOverrides=container_overrides)

    status = job['ResponseMetadata']['HTTPStatusCode']
    if status == 200:
        _logger.info('Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'.format(
            job['jobName'], job['jobId'], job_definition_fqn, job_queue))
        return job
    else:
        # The original format() call had no argument and raised IndexError here.
        _logger.error('Job submission failed: HTTP Status {}'.format(status))
        return None