def s3_list_objects_at_prefix_v2(bucket, prefix):
    """ List out the objects at this prefix.

    Returns a list of the keys found in this bucket under `prefix`.  Plain
    key strings are returned (rather than boto objects) because boto objects
    aren't serializable under multiprocessing.

    Note: Use v2 for multi-processing, since the prefix is handed to the
    `list_objects_v2` paginator rather than filtered through a boto collection.

    Args:
        bucket(str): The s3 bucket
        prefix(str): The s3 key prefix you wish to search

    Returns:
        (list): List of item keys

    Raises:
        Exception: any error raised while paginating is logged and re-raised.
    """
    result = []
    client = get_s3_client()
    try:
        paginator = client.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            # A page without a 'Contents' entry contributes no keys.
            result.extend(obj['Key'] for obj in page.get('Contents', ()))
    except Exception as e:
        _logger.error(
            "s3_list_objects_at_prefix_v2: failed with exception {}".format(e))
        raise
    return result
def rm_bundle_dir(output_path, uuid, db_targets):
    """ Remove the managed directory created for a bundle, then its DB targets.

    The bundle's files were copied into `output_path`, so removing that tree
    removes them.  Every entry of `db_targets` then has `rm()` called on it.
    A failure is logged, not raised; targets not yet reached when the failure
    occurs are left untouched.

    TODO: Integrate with data_context bundle remove.  That deals with
    information already stored in the local DB.

    ASSUMES: That we haven't actually updated the local DB with information
    on this bundle.

    Args:
        output_path (str): directory tree to delete
        uuid (str): bundle uuid, used only in the error message
        db_targets (list(DBTarget)): objects whose `rm()` is called

    Returns:
        None
    """
    failure_fmt = ("Removal of hyperframe directory {} failed with error {}. "
                   "Continuing removal...")
    try:
        shutil.rmtree(output_path)
        # If people create s3 files / s3 file targets inside of an s3
        # context, those have to be cleaned up as well.
        for target in db_targets:
            target.rm()
    except IOError as why:
        _logger.error(failure_fmt.format(uuid, why))
def s3_list_objects_at_prefix(bucket, prefix):
    """ List out the objects at this prefix.

    Collects every item yielded by the bucket's `objects.filter(Prefix=...)`
    collection.

    NOTE(review): earlier documentation promised a list of *keys*, but the
    code has always appended the iterated items themselves (presumably boto
    ObjectSummary objects -- confirm against callers).  That behavior is kept.

    Note: Do *not* use with multi-processing.  This version uses boto
    Collections; see `s3_list_objects_at_prefix_v2` for the paginator-based
    variant that returns plain key strings.

    Args:
        bucket(str): The s3 bucket
        prefix(str): The s3 key prefix you wish to search

    Returns:
        (list): The items yielded by the boto collection under `prefix`

    Raises:
        Exception: any error raised while listing is logged and re-raised.
    """
    s3 = get_s3_resource()
    result = []
    try:
        s3_b = s3.Bucket(bucket)
        result.extend(s3_b.objects.filter(Prefix=prefix, MaxKeys=1024))
    except Exception as e:
        _logger.error(
            "s3_list_objects_at_prefix: failed with exception {}".format(e))
        raise
    return result
def _get_context(context_name):
    """ Look up a data context by name, reloading once if it is not cached.

    Args:
        context_name(str): <remote context>/<local context> or <local context>

    Returns:
        (`disdat.data_context.DataContext`)

    Raises:
        RuntimeError: neither the lookup nor the single reload found it.
    """
    fs = _get_fs()

    found = fs.get_context(context_name)
    if found is not None:
        return found

    # Try once to see if the context needs to be loaded.
    found = fs.reload_context(context_name)
    if found is not None:
        return found

    error_msg = "Unable to perform operation: could not find context {}".format(
        context_name)
    _logger.error(error_msg)
    raise RuntimeError(error_msg)
def convert_str_params(cls, params_str):
    """ Deserialize string-valued parameters using the class's own parameters.

    Similar to Luigi.Task.from_str_params(cls, params_str), except that no
    task instance is created and the outer loop runs over the supplied
    dictionary (not over the class's parameters).

    NOTE: This is somewhat dangerous and could break if Luigi changes around
    this code (it calls the parameter's private `_parse_list`).  The
    alternative is Luigi.load_task(), which requires all inputs to be strings.

    Parameters:
        params_str (dict): param name -> serialized value.  A list value is
            handed to the parameter's `_parse_list`, anything else to `parse`.

    Returns:
        dict: param name -> deserialized value.

    Raises:
        ValueError: a supplied name is not a parameter of `cls`.
    """
    # get_params() returns [ (name, param), ... ]
    known_params = dict(cls.get_params())
    converted = {}

    for name, raw_value in params_str.items():
        if name not in known_params:
            _logger.error("Parameter {} is not defined in class {}.".format(
                name, cls.__name__))
            raise ValueError("Parameter {} is not defined in class {}.".format(
                name, cls.__name__))

        param = known_params[name]
        if isinstance(raw_value, list):
            converted[name] = param._parse_list(raw_value)
        else:
            converted[name] = param.parse(raw_value)

    return converted
def __init__(self, local_context, name, owner=''):
    """ Given name and a local context, create a handle that users can work with to:
    a.) Create files / directories / dbtables (i.e., Bundle Links)
    b.) Add constants and files into bundle

    Create the bundle ahead of time, and add items to it.  Or use a temp dir
    and copy things into the bundle when you're done.  In the second case it
    is easy to use a bundle object and write it to multiple contexts, and the
    bundle should be closed or destroyed afterwards.

    NOTE: if the context lookup raises, the error is logged and the
    constructor returns early; the base-class initializer is not run and none
    of the attributes below are set.

    Args:
        local_context (str): Where this bundle will be output or where it was sourced from.
        name (str): Human name for this bundle
        owner (str): Owner handed to the base-class initializer. Default ''
    """
    self.fs = _get_fs()

    try:
        self.data_context = self.fs.get_context(local_context)
    except Exception as e:
        # Both the context name and the underlying error are reported.
        _logger.error(
            "Unable to allocate bundle in context: {} error: {}".format(
                local_context, e))
        return

    super(Bundle, self).__init__(human_name=name, owner=owner)

    self.local_dir = None
    self.remote_dir = None
    self.open = False
    self.closed = False
    self.depends_on = []  # list of tuples (processing_name, uuid) of bundles on which this bundle depends
    self.data = None      # The df, array, dictionary the user wants to store
def create_output_dir(self, dirname):
    """ Disdat Pipe API Function

    Given basename directory name, return a fully qualified path whose prefix
    is the local output directory for this bundle in the current context.
    This call creates the output directory as well; a directory that already
    exists is not an error.

    Any other failure to create the directory is logged (including the
    underlying error) and the path is still returned.

    Args:
        dirname (str): The name of the output directory, i.e., "models"

    Returns:
        output_dir (str): Fully qualified path of a directory whose prefix is
            the bundle's local output directory.
    """
    prefix_dir = self.get_output_dir()
    fqp = os.path.join(prefix_dir, dirname)

    try:
        # exist_ok: repeated calls for the same directory are fine.
        os.makedirs(fqp, exist_ok=True)
    except OSError as why:
        _logger.error(
            "Creating directory in bundle directory failed: {}".format(why))

    return fqp
def open(self, force_uuid=None):
    """ Management operations to open bundle for writing.

    Asks the data context for a managed path and records the resulting local
    directory, uuid, and remote directory on this bundle.

    Args:
        force_uuid (str): DEPRECATING - do not use.  Force to open a bundle
            with a specific uuid.

    Returns:
        Bundle: this object, so the call can be chained.

    Raises:
        AssertionError: the bundle has already been closed.
    """
    if self._closed:
        message = "Bundle is closed -- unable to re-open."
        _logger.error(message)
        # Raised explicitly: a bare `assert False` disappears under `python -O`
        # and would let a closed bundle be re-opened.
        raise AssertionError(message)

    self._local_dir, self.pb.uuid, self._remote_dir = \
        self.data_context.make_managed_path(uuid=force_uuid)

    return self
def ls_s3_url_objects(s3_url):
    """ Return aws boto3 ObjectSummary's

    A trailing '/' is appended to `s3_url` when missing, the url is split
    into bucket and key prefix, and every item of the bucket's
    `objects.filter(Prefix=...)` collection is collected.

    Note: There is no current way in boto3 to do globs -- you filter on the
    client side.

    Args:
        s3_url (str): s3 url of the "directory" to list

    Returns:
        list: the ObjectSummary's under this path

    Raises:
        Exception: any error raised while listing is logged and re-raised.
    """
    result = []

    # Equality test on the last character; `is not '/'` compared identity.
    if not s3_url.endswith('/'):
        s3_url += '/'

    bucket, s3_path = split_s3_url(s3_url)

    s3 = b3.resource('s3')
    try:
        s3_b = s3.Bucket(bucket)
        for i in s3_b.objects.filter(Prefix=s3_path, MaxKeys=1024):
            result.append(i)
        if len(result) == 1024:
            _logger.warning(
                "ls_s3_url_objects: hit MaxKeys 1024 limit in result set.")
    except Exception as e:
        _logger.error(
            "ls_s3_url_objects: failed with exception {}".format(e))
        raise

    return result
def make_directory(self, dir_name):
    """ Returns path `<disdat-managed-directory>/<dir_name>`.

    This is used if you need to hand a process an output directory and you do
    not have control of what it writes in that directory.  Add this path as
    you would add file paths to your output bundle.  Disdat will incorporate
    all the data found in this directory into the bundle.

    See Pipe.create_output_dir()

    Arguments:
        dir_name (str): Either a FQP (prefix is the bundle path) or a basedir
            of a directory to appear in the bundle.  Neither should end in /

    Returns:
        str: A directory path managed by disdat

    Raises:
        AssertionError: the bundle is not open, or is already closed.
        OSError: the directory could not be created (other than "already exists").
    """
    # Explicit raise so the check survives `python -O`.
    if not (self.open and not self.closed):
        raise AssertionError("Bundle must be open and not closed to make a directory.")

    # Remove the bundle path only when it is a *prefix*; str.replace would also
    # strip occurrences in the middle of the name.
    if dir_name.startswith(self.local_dir):
        dst_base_path = dir_name[len(self.local_dir):]
    else:
        dst_base_path = dir_name

    # if the user erroneously passes in the directory of the bundle, return same
    if dst_base_path == '':
        return self.local_dir

    fqp = os.path.join(self.local_dir, dst_base_path.lstrip('/'))

    try:
        os.makedirs(fqp)
    except OSError as why:
        # An existing directory is OK: fall through and return it.
        if why.errno != errno.EEXIST:
            _logger.error(
                "Creating directory in bundle directory failed errno {}".
                format(why.strerror))
            raise

    return fqp
def get_directory(self, dir_name):
    """ Returns path `<disdat-managed-directory>/<dir_name>`.

    This gives the user a local output directory into which to write files.
    This is useful when a user needs to give an external tool, such as Spark
    or Tensorflow, a directory to place output files.  After this call, the
    directory will exist in the local context.

    It is the user's responsibility to add individual file links to the
    bundle.  It is an error to add a directory as a file link.

    Arguments:
        dir_name (str): A basedir of a directory to appear in the local
            bundle.  One trailing '/' is dropped.  An empty string yields the
            bundle directory itself.

    Returns:
        str: A directory path managed by disdat

    Raises:
        OSError: the directory could not be created (other than "already exists").
    """
    self._check_open()

    # Guard the empty string before indexing its last character.
    if not dir_name:
        return self._local_dir

    if dir_name[-1] == '/':
        dir_name = dir_name[:-1]

    # if the user erroneously passes in the directory of the bundle, return same
    if dir_name == self._local_dir:
        return self._local_dir

    fqp = os.path.join(self._local_dir, dir_name.lstrip('/'))

    try:
        os.makedirs(fqp)
    except OSError as why:
        # An existing directory is OK: fall through and return it.
        if why.errno != errno.EEXIST:
            _logger.error(
                "Creating directory in bundle directory failed errno {}".
                format(why.strerror))
            raise

    return fqp
def abandon(self):
    """ Remove on-disk state of the bundle if it is abandoned before it is closed.

    The bundle's local directory tree is deleted.  A failure to delete is
    logged, not raised.

    NOTE: the user has the responsibility to make sure the bundle is not
    shared across threads or processes and that they don't remove a directory
    out from under another thread of control.  E.g., you cannot place this
    code in __del__ and then _check_closed() b/c a forked child process might
    have closed their copy while the parent deletes theirs.
    """
    self._check_open()

    _logger.debug(
        f"Disdat api abandon bundle obj [{id(self)}] process[{os.getpid()}] uuid[{self.uuid}]"
    )

    try:
        shutil.rmtree(self._local_dir, ignore_errors=True)
        # rmtree swallowed any errors, so only retry (and report) when the
        # directory is still there.  An unconditional rmdir would raise
        # FileNotFoundError after every successful removal.
        if os.path.exists(self._local_dir):
            os.rmdir(self._local_dir)
        # TODO: if people create s3 files, s3 file targets, inside of an s3 context,
        # TODO: then we will have to clean those up as well.
    except OSError as why:
        _logger.error(
            "Removal of bundle directory {} failed with error {}. Continuing removal..."
            .format(self._local_dir, why))
def add_bundle_meta_files(pipe_task):
    """ Build the Luigi output target for a task's bundle hyperframe file.

    The task's path cache entry is looked up through `DisdatFS.get_path_cache`;
    its path and uuid determine where the hyperframe file lives.  The
    individual frame records have to be written out beforehand.

    Args:
        pipe_task: The pipe task (or driver task) that will use these outputs

    Returns:
        dict: `{PipeBase.HFRAME: luigi.LocalTarget(<hyperframe file path>)}`

    Raises:
        Exception: no path cache entry exists for the task.  This can happen
            when the pipe has been created with non-deterministic parameters.
    """
    pce = DisdatFS.get_path_cache(pipe_task)

    if pce is None:
        _logger.error(
            "add_bundle_meta_files: could not find pce for task {}".format(
                pipe_task.pipe_id()))
        _logger.error(
            "It is possible one of your tasks is parameterized in a non-deterministic fashion."
        )
        raise Exception(
            "add_bundle_meta_files: Unable to find pce for task {}".format(
                pipe_task.pipe_id()))

    hframe_file = HyperFrameRecord.make_filename(pce.uuid)
    hframe_target = luigi.LocalTarget(os.path.join(pce.path, hframe_file))
    return {PipeBase.HFRAME: hframe_target}
def put_path_cache(pipe_instance, bundle, uuid, path, rerun, overwrite=False):
    """ Bind a pipe instance to a path cache entry.

    The path cache associates a pipe instance (keyed by its processing id)
    with its output path and with the decision whether to re-run the pipe.
    If rerun is True, then there should be no output at this path, AND it
    should eventually be added as a new version of this bundle.

    Args:
        pipe_instance: instance of a pipe
        bundle (disdat.api.Bundle): The bundle to hold the output data and metadata
        uuid: specific uuid of the output path
        path: where to write the bundle
        rerun: whether or not we are re-running or re-using
        overwrite: overwrite existing entry (if exists)

    Returns:
        pce: the newly built entry (also when an equal entry was already bound)

    Raises:
        KeyError: a different entry is already bound and `overwrite` is False.
    """
    pipe_name = pipe_instance.processing_id()
    pce = PathCacheEntry(pipe_instance, bundle, uuid, path, rerun)
    cache = PathCache.task_path_cache

    # First binding for this pipe.
    if pipe_name not in cache:
        cache[pipe_name] = pce
        return pce

    if pce == cache[pipe_name]:
        # The tuples are identical
        _logger.error(
            "path_cache dup key: pipe {} already bound to same PCE {} "
            .format(pipe_name, pce))
    elif overwrite:
        cache[pipe_name] = pce
    else:
        raise KeyError(
            "path_cache dup key: pipe {} bound to pce {} but trying to re-assign to {}"
            .format(pipe_name, cache[pipe_name], pce))

    return pce
def _run_aws_batch(arglist, fq_repository_name, job_name, pipeline_image_name,
                   aws_session_token_duration, vcpus, memory, no_submit,
                   job_role_arn):
    """ Run job on AWS Batch.  Sends to queue configured in disdat.cfg.

    This assumes that you have already created a cluster that will run the
    jobs that have been assigned to that queue.

    The latest registered job definition is re-used when its image, vcpus,
    memory and job role all match; otherwise a new one is registered.

    Args:
        arglist: command executed inside the container (container override)
        fq_repository_name (str): The fully qualified docker repository name
        job_name: name given to the submitted job
        pipeline_image_name: used to derive the default job definition name
        aws_session_token_duration: if > 0 (and no job role is given),
            credentials are injected into the container environment
        vcpus: vcpus recorded in the job definition; also exported to the
            container as DISDAT_CPU_COUNT
        memory: memory recorded in the job definition
        no_submit (bool): default False.  Return the job definition object
            instead of submitting
        job_role_arn (str): Can be None

    Returns:
        The job definition object when `no_submit` is set; otherwise the
        `submit_job` response on HTTP 200, or None on any other status.
    """

    def check_role_arn(job_dict, jra):
        """ Check to see if the job desc dictionary contains the same job_role_arn (jra) """
        container_props = job_dict['containerProperties']
        if jra is None:
            return 'jobRoleArn' not in container_props
        return ('jobRoleArn' in container_props
                and container_props['jobRoleArn'] == jra)

    disdat_config = DisdatConfig.instance()

    # Get the parameter values required to kick off an AWS Batch job.
    # Every batch job must:
    # 1. Have a name
    # 2. Have a job definition that declares which ECR-hosted Docker
    #    image to use.
    # 3. Have a queue that feeds jobs into a compute cluster.
    # 4. The command to execute inside the Docker image; the command
    #    args are more-or-less the same as the ones used to execute
    #    locally using 'dsdt run'

    # Create a Job Definition and upload it.
    # We create per-user job definitions so multiple users do not clobber each other.
    # In addition, we never re-use a job definition, since the user may update
    # the vcpu or memory requirements and those are stuck in the job definition

    job_definition_name = aws.batch_get_job_definition_name(
        pipeline_image_name)

    if disdat_config.parser.has_option(_MODULE_NAME,
                                       'aws_batch_job_definition'):
        job_definition_name = disdat_config.parser.get(
            _MODULE_NAME, 'aws_batch_job_definition')

    # TODO: Look through all of history to find one that matches?
    # TODO: Delete old jobs here or let user do it?
    job_definition_obj = aws.batch_get_latest_job_definition(
        job_definition_name)

    if (job_definition_obj is not None
            and job_definition_obj['containerProperties']['image'] == fq_repository_name
            and job_definition_obj['containerProperties']['vcpus'] == vcpus
            and job_definition_obj['containerProperties']['memory'] == memory
            and check_role_arn(job_definition_obj, job_role_arn)):
        job_definition_fqn = aws.batch_extract_job_definition_fqn(
            job_definition_obj)
        _logger.info("Re-using prior AWS Batch run job definition : {}".format(
            job_definition_obj))
    else:
        # Whether None or doesn't match, make a new one
        job_definition_obj = aws.batch_register_job_definition(
            job_definition_name,
            fq_repository_name,
            vcpus=vcpus,
            memory=memory,
            job_role_arn=job_role_arn)
        job_definition_fqn = aws.batch_get_job_definition(job_definition_name)
        _logger.info(
            "New AWS Batch run job definition {}".format(job_definition_fqn))

    if no_submit:
        # Return the job description object
        return job_definition_obj

    job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

    container_overrides = {'command': arglist}

    # Through the magic of boto3_session_cache, the client in our script
    # here can get at AWS profiles and temporary AWS tokens created in
    # part from MFA tokens generated through the user's shells; we don't
    # have to write special code of our own to deal with authenticating
    # with AWS.
    client = b3.client('batch', region_name=aws.profile_get_region())

    # A bigger problem might be that the IAM role executing the job on
    # a batch EC2 instance might not have access to the S3 remote.  To
    # get around this, allow the user to create some temporary AWS
    # credentials.
    if aws_session_token_duration > 0 and job_role_arn is None:
        sts_client = b3.client('sts')
        try:
            token = sts_client.get_session_token(
                DurationSeconds=aws_session_token_duration)
            credentials = token['Credentials']
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID',
                 'value': credentials['AccessKeyId']},
                {'name': 'AWS_SECRET_ACCESS_KEY',
                 'value': credentials['SecretAccessKey']},
                {'name': 'AWS_SESSION_TOKEN',
                 'value': credentials['SessionToken']},
            ]
        except Exception:
            _logger.debug(
                "Unable to generate an STS token, instead trying users default credentials..."
            )
            credentials = b3.session.Session().get_credentials()
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID',
                 'value': credentials.access_key},
                {'name': 'AWS_SECRET_ACCESS_KEY',
                 'value': credentials.secret_key},
                {'name': 'AWS_SESSION_TOKEN',
                 'value': credentials.token},
            ]

    # No 'environment' entry exists yet when no credentials were injected;
    # setdefault avoids a KeyError in that case.
    container_overrides.setdefault('environment', []).append({
        'name': 'DISDAT_CPU_COUNT',
        'value': str(vcpus)
    })

    job = client.submit_job(jobName=job_name,
                            jobDefinition=job_definition_fqn,
                            jobQueue=job_queue,
                            containerOverrides=container_overrides)

    status = job['ResponseMetadata']['HTTPStatusCode']
    if status == 200:
        _logger.info(
            'Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'
            .format(job['jobName'], job['jobId'], job_definition_fqn,
                    job_queue))
        return job
    else:
        # The placeholder now receives the status; a bare .format() raised IndexError.
        _logger.error(
            'Job submission failed: HTTP Status {}'.format(status))
        return None
def run(self):
    """ Call users run function.

    1.) prepare the arguments
    2.) run and gather user result
    3.) interpret and wrap in a HyperFrame

    On any failure -- in the user's `pipe_run` or while recording its result --
    the bundle directory is removed and the exception is re-raised.

    Returns:
        (`hyperframe.HyperFrame`):
    """
    kwargs = self.prepare_pipe_kwargs(for_run=True)

    pce = self.pfs.get_path_cache(self)
    assert (pce is not None)

    try:
        start = time.time()  # P3 datetime.now().timestamp()
        user_rtn_val = self.pipe_run(**kwargs)
        stop = time.time()  # P3 datetime.now().timestamp()
    except Exception as run_error:
        # If user's pipe fails for any reason, remove bundle dir and raise
        try:
            _logger.error(
                "User pipe_run encountered exception: {}".format(run_error))
            PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
        except OSError as ose:
            _logger.error(
                "User pipe_run encountered error, and error on remove bundle: {}"
                .format(ose))
        raise

    try:
        presentation, frames = PipeBase.parse_return_val(
            pce.uuid, user_rtn_val, self.data_context)

        hfr = PipeBase.make_hframe(frames,
                                   pce.uuid,
                                   self.bundle_inputs(),
                                   self.pipeline_id(),
                                   self.pipe_id(),
                                   self,
                                   start_ts=start,
                                   stop_ts=stop,
                                   tags={"presentable": "True"},
                                   presentation=presentation)

        # Add Luigi Task parameters -- Only add the class parameters.
        # These are Disdat special params.
        self.user_tags.update(self._get_subcls_params(self))

        if self.output_tags:
            self.user_tags.update(self.output_tags)

        if isinstance(self.calling_task, DriverTask):
            self.user_tags.update({'root_task': 'True'})

        if self.user_tags:
            hfr.replace_tags(self.user_tags)

        self.data_context.write_hframe(hfr)

        # A bundle carrying the transient tag is never pushed incrementally.
        transient = hfr.get_tag(BUNDLE_TAG_TRANSIENT) is not None

        if self.incremental_push and not transient:
            self.pfs.commit(None,
                            None,
                            uuid=pce.uuid,
                            data_context=self.data_context)
            self.pfs.push(uuid=pce.uuid, data_context=self.data_context)

    except Exception:
        # If we fail for any reason, remove bundle dir and raise
        PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
        raise

    return hfr
def error(msg, *args, **kwargs):
    """ Log `msg` at error level, then exit the process with status 1.

    Args:
        msg: message (or format string) handed to `_logger.error`
        *args: positional arguments forwarded to `_logger.error`
        **kwargs: keyword arguments forwarded to `_logger.error`

    Raises:
        SystemExit: always, via `sys.exit(1)`.
    """
    exit_status = 1
    _logger.error(msg, *args, **kwargs)
    sys.exit(exit_status)
def __init__(self,
             local_context,
             name=None,
             data=None,
             processing_name=None,
             owner=None,
             tags=None,
             params=None,
             dependencies=None,
             code_method=None,
             vc_info=None,
             start_time=0,
             stop_time=0):
    """ Create a bundle in a local context.

    There are three ways to create bundles:

    1.) Create a bundle with a single call.  Must include a data field!
        b = api.Bundle('examples', name='propensity_model',owner='fred',data='/Users/fred/model.tgz')

    2.) Create a bundle using a context manager.  The initial call requires only a context.
        with api.Bundle('examples') as b:
            b.add_data(file_list)
            b.add_code_ref('mymodule.mymethod')
            b.add_params({'path': path})
            b.add_tags(tags)

        Users can query the bundle object to create output files directly in
        the referred-to context.  They may also add tags, parameters, code and
        git info, and start/stop times.  Once the bundle is "closed" via the
        context manager, it will be written to disk and immutable.  Note that
        one may change anything about an "open" bundle except the context
        information.

    3.) Open and close manually.
        b = api.Bundle('examples').open()
        b.add_data(file_list)
        b.add_code_ref('mymodule.mymethod')
        b.add_params({'path': path})
        b.add_tags(tags)
        b.close()

    Default name: If you don't provide a name, Disdat tries to use the basename in `code_ref`.
    Default processing_name: If you don't provide a processing name, Disdat
    will use a default that takes into consideration your bundles upstream
    inputs, parameters, and code reference.

    NOTE: if the context cannot be resolved, the error is logged and the
    constructor returns early; the base-class initializer is not run and none
    of the remaining attributes are set.

    Args:
        local_context (Union[str, `disdat.data_context.DataContext`): The local context name or context object
        name (str): Human name for this bundle.
        data (union(pandas.DataFrame, tuple, None, list, dict)): The data this bundle contains.
        processing_name (str): A name that indicates a bundle was made in an identical fashion.
        owner (str): The owner of the bundle.  Default getpass.getuser()
        tags (dict): (str,str) dictionary of arbitrary user tags.
        params (dict(str:str)): Dictionary of parameters that <code_method> used to produce this output.
        dependencies (dict(str:bundle)): Dictionary of argname: bundle, Bundles used to produce this output.
        code_method (str): A reference to code that created this bundle. Default None
        vc_info (tuple): Version control information triple: e.g. tuple(git_repo , git_commit, branch)
        start_time (float): Start time of the process that produced the bundle. Passed to add_timing.
        stop_time (float): Stop time of the process that produced the bundle. Passed to add_timing.
    """
    self._fs = _get_fs()

    try:
        if isinstance(local_context, DataContext):
            self.data_context = local_context
        elif isinstance(local_context, str):
            self.data_context = self._fs.get_context(local_context)
            if self.data_context is None:
                raise Exception(
                    "Unable to create Bundle: no context found with name[{}]"
                    .format(local_context))
        else:
            raise Exception(
                "Unable to create Bundle: local_context is not str or DataContext"
            )
    except Exception as e:
        # Both the context and the underlying error are reported.
        _logger.error(
            "Unable to allocate bundle in context: {} error: {}".format(
                local_context, e))
        return

    self._local_dir = None
    self._remote_dir = None
    self._closed = False  # Bundle is closed and immutable
    self._data = None     # The df, array, dictionary the user wants to store

    super(Bundle, self).__init__(
        human_name=name,
        owner=getpass.getuser() if owner is None else owner,
        processing_name=processing_name,
    )

    # Add the fields they have passed in.
    if tags is not None:
        self.add_tags(tags)
    if params is not None:
        self.add_params(params)
    if dependencies is not None:
        self.add_dependencies(dependencies.values(), dependencies.keys())
    if code_method is not None:
        self.add_code_ref(code_method)
    if vc_info is not None:
        self.add_git_info(vc_info)
    self.add_timing(start_time, stop_time)

    # Only close and make immutable if the user also adds the data field
    if data is not None:
        self.open()
        self.add_data(data)
        self.close()
def _run(output_bundle='-',
         pipeline_root='',
         pipeline_args='',
         pipe_cls=None,
         backend=None,
         input_tags={},
         output_tags={},
         force=False,
         context=None,
         remote=None,
         no_pull=False,
         no_push=False,
         no_push_int=False,
         vcpus=1,
         memory=2000,
         workers=1,
         no_submit=False,
         job_role_arn=None,
         aws_session_token_duration=0,
         cli=False):
    """Run the dockerized version of a pipeline.

    Note these are named parameters so we avoid bugs related to argument order.

    Args:
        output_bundle (str): The human name of the output bundle
        pipeline_root (str): The path to the setup.py used to create the container
        pipeline_args: Optional arguments to pass to the pipeline class
        pipe_cls: Name of the pipeline class to run
        backend: The batch execution back-end to use
        input_tags (list(str)): Find bundle with these tags ['key:value',...]
        output_tags (list(str)): Push result bundle with these tags ['key:value',...]
        force (bool): If `True` force recomputation of all upstream pipe requirements (default `False`)
        context (str): <remote context>/<local context> context string
        remote (str): The remote S3 URL.
        no_pull (bool): Do not pull before executing (start in empty local context)
        no_push (bool): Do not push any new bundles to remote (useful for testing locally)
        no_push_int (bool): Do not push new intermediate bundles to remote
        vcpus (int): Number of AWS vCPUs the container requests
        memory (int): Amount of memory container requests in MB
        workers (int): Number of Luigi workers to run tasks in DAG
        no_submit (bool): Produce the AWS job config (for AWS Batch), but do not submit the job
        job_role_arn (str): The AWS role under which the job should execute
        aws_session_token_duration (int): the number of seconds our temporary credentials should last.
        cli (bool): Whether we called run from the API (buffer output) or the CLI

    Returns:
        Whatever the selected back-end runner returns.  None when the
        pipeline's setup.py is missing, or when pushing/pulling was requested
        but no remote is available.

    Raises:
        ValueError: `backend` is not one of the recognized back-ends.
    """
    pfs = fs.DisdatFS()
    pipeline_setup_file = os.path.join(pipeline_root, 'setup.py')

    if not common.setup_exists(pipeline_setup_file):
        return None

    output_bundle_uuid = pfs.disdat_uuid()

    # Fall back to the current settings when either piece is missing.
    if remote is None or context is None:
        remote, context = common.get_run_command_parameters(pfs)

    # if pulling or pushing, need a remote
    needs_remote = (not no_push) or (not no_pull)
    if remote is None and needs_remote:
        _logger.error(
            "Pushing or pulling bundles with 'run' requires a remote set with `dsdt remote <s3 url>`"
        )
        return

    arglist = common.make_run_command(output_bundle, output_bundle_uuid,
                                      pipe_cls, remote, context, input_tags,
                                      output_tags, force, no_pull, no_push,
                                      no_push_int, workers, pipeline_args)

    if backend in (Backend.AWSBatch, Backend.SageMaker):
        pipeline_image_name = common.make_project_image_name(
            pipeline_setup_file)

        job_name = '{}-{}'.format(pipeline_image_name, int(time.time()))

        fq_repository_name = get_fq_docker_repo_name(False,
                                                     pipeline_setup_file)

        if backend == Backend.AWSBatch:
            return _run_aws_batch(arglist, fq_repository_name, job_name,
                                  pipeline_image_name,
                                  aws_session_token_duration, vcpus, memory,
                                  no_submit, job_role_arn)

        # SageMaker resolves the repository name again, from the pipeline root.
        fq_repository_name = get_fq_docker_repo_name(True, pipeline_root)
        return _run_aws_sagemaker(arglist, fq_repository_name, job_name)

    if backend in (Backend.Local, Backend.LocalSageMaker):
        return _run_local(cli, pipeline_setup_file, arglist, backend)

    raise ValueError(
        'Got unrecognized job backend \'{}\': Expected {}'.format(
            backend, Backend.options()))
def run(self):
    """ Call users run function.

    1.) prepare the arguments
    2.) run and gather user result
    3.) interpret and wrap in a HyperFrame

    On any failure -- in the user's `pipe_run` or while filling and closing
    the bundle -- the bundle is abandoned and the exception is re-raised.

    Returns:
        None
    """
    kwargs = self.prepare_pipe_kwargs(for_run=True)

    pce = PathCache.get_path_cache(self)
    assert (pce is not None)

    # NOTE: If a user changes a task param in run(), and that param
    # parameterizes a dependency in requires(), then running requires() post
    # run() will give different tasks.  To be safe we record the inputs
    # before run()
    cached_bundle_inputs = self.bundle_inputs()

    try:
        start = time.time()  # P3 datetime.now().timestamp()
        user_rtn_val = self.pipe_run(**kwargs)
        stop = time.time()  # P3 datetime.now().timestamp()
    except Exception as run_error:
        # If user's pipe fails for any reason, remove bundle dir and raise
        try:
            _logger.error(
                "User pipe_run encountered exception: {}".format(run_error))
            pce.bundle.abandon()
        except OSError as ose:
            _logger.error(
                "User pipe_run encountered error, and error on remove bundle: {}"
                .format(ose))
        raise

    try:
        # Add any output tags to the user tag dict
        if self.output_tags:
            self.user_tags.update(self.output_tags)

        # If this is the root_task, identify it as so in the tag dict
        if isinstance(self.calling_task, DriverTask):
            self.user_tags.update({'root_task': 'True'})

        # We have a pce, so we have a new bundle to add info to and close.
        pce.bundle.add_data(user_rtn_val)
        pce.bundle.add_timing(start, stop)
        pce.bundle.add_dependencies(cached_bundle_inputs.values(),
                                    cached_bundle_inputs.keys())
        pce.bundle.name = self.human_id()
        pce.bundle.processing_name = self.processing_id()
        pce.bundle.add_params(self._get_subcls_params())
        pce.bundle.add_tags(self.user_tags)

        task_cls = self.__class__
        pce.bundle.add_code_ref('{}.{}'.format(task_cls.__module__,
                                               task_cls.__name__))

        pipeline_path = os.path.dirname(
            sys.modules[self.__class__.__module__].__file__)
        cv = DisdatFS.get_pipe_version(pipeline_path)
        pce.bundle.add_git_info(cv.url, cv.hash, cv.branch)

        pce.bundle.close()  # Write out the bundle

        # Incrementally push the completed bundle
        if self.incremental_push and (BUNDLE_TAG_TRANSIENT
                                      not in pce.bundle.tags):
            self.pfs.commit(None,
                            None,
                            uuid=pce.bundle.uuid,
                            data_context=self.data_context)
            self.pfs.push(uuid=pce.uuid, data_context=self.data_context)

    except Exception:
        # If we fail for any reason, remove bundle dir and raise
        pce.bundle.abandon()
        raise

    return None
def add_external_dependency(self, param_name, task_class, params, human_name=None, uuid=None):
    """
    Disdat Pipe API Function

    Add an external task and its parameters to our requirements.  What this
    means is that there is no run function and, in that case, Luigi will
    ignore the results of task.deps() (which calls flatten(self.requires())).
    And what that means is that this requirement can only be satisfied by the
    bundle actually existing.

    Create ersatz ExternalDepTask parameterized by uuid and processing_name.
    Note: it is possible to use class/params when searching by class, params,
    but this makes all external dependencies look the same in the code.  Win.

    NOTE: if you add an external dependency by name, it is possible that
    someone adds a bundle during execution and that your requires function is
    no longer deterministic.  You must add caching to your requires function
    to handle this scenario.

    Example with class variable bundle_uuid:
    ``
    if self.bundle_uuid is None:
        bundle = self.add_external_dependency('_', MyTaskClass, {}, human_name='some_result')
        self.bundle_uuid = bundle.uuid
    else:
        bundle = self.add_external_dependency('_', MyTaskClass, {}, uuid=self.bundle_uuid)
    ``

    TODO: Consider pushing caching into this layer.

    Args:
        param_name (str): The parameter name this bundle assumes when passed to Pipe.run
        task_class (object): Class name of upstream task if looking for external bundle by processing_id.
        params (dict): Dictionary of parameters if looking for external bundle by processing_id.
            The dictionary is copied; the caller's object is not modified.
        human_name (str): Resolve dependency by human_name, return the latest bundle with that human_name.
            Trumps task_class and params.
        uuid (str): Resolve dependency by explicit UUID, trumps task_class, params and human_name.

    Returns:
        `api.Bundle` or None

    Raises:
        Exception: If ``task_class`` is given but ``params`` is not a dict.
        AssertionError: If ``param_name`` has already been registered in ``self.add_deps``.
    """
    import disdat.api as api

    if task_class is not None and not isinstance(params, dict):
        error = "add_external_dependency requires parameter dictionary"
        raise Exception(error)

    # Raised explicitly (not via ``assert``) so the guard survives ``python -O``.
    if param_name in self.add_deps:
        raise AssertionError(
            "add_external_dependency: parameter name [{}] is already registered".format(param_name))

    # Bind up front so the ``finally`` clause can always read it, even when a
    # non-``Exception`` error (e.g. KeyboardInterrupt) escapes the lookup.
    bundle = None

    try:
        if uuid is not None:
            hfr = self.pfs.get_hframe_by_uuid(uuid, data_context=self.data_context)
        elif human_name is not None:
            hfr = self.pfs.get_latest_hframe(human_name, data_context=self.data_context)
        else:
            # we propagate the same inputs and the same output dir for every upstream task!
            # Work on a copy so the caller's dictionary is left untouched.
            task_params = dict(params)
            task_params.update({
                'user_arg_name': param_name,
                'data_context': self.data_context
            })
            p = task_class(**task_params)
            hfr = self.pfs.get_hframe_by_proc(p.processing_id(), data_context=self.data_context)

        if hfr is None:
            error_str = "Disdat can't resolve external bundle from class[{}] params[{}] name[{}] uuid[{}]".format(
                task_class, params, human_name, uuid)
            raise ExtDepError(error_str)

        bundle = api.Bundle(self.data_context.get_local_name()).fill_from_hfr(hfr)

    except ExtDepError as error:
        # Swallow and allow Luigi to determine task is not available.
        # Log the exception itself: the local message string only exists when
        # the error was raised by the check above, not by a callee.
        _logger.error(error)
        bundle = None

    except Exception as error:
        # Best effort: any other lookup failure is logged and treated as
        # "bundle not available".
        _logger.error(error)
        bundle = None

    finally:
        # Always register the dependency; an unresolved bundle is recorded
        # with the literal string 'None' for both fields.
        if bundle is None:
            dep_params = {'uuid': 'None', 'processing_name': 'None'}
        else:
            dep_params = {'uuid': bundle.uuid, 'processing_name': bundle.processing_name}
        self.add_deps[param_name] = (luigi.task.externalize(ExternalDepTask), dep_params)

    return bundle
def _run_local(cli, pipeline_setup_file, arglist, backend):
    """
    Run container locally or run sagemaker container locally

    Args:
        cli (bool): Whether we were called from the CLI or API
        pipeline_setup_file (str): The FQ path to the setup.py used to dockerize the pipeline.
        arglist (list): Arguments for the container.  For non-SageMaker runs the
            entrypoint binary is prepended; for SageMaker runs they are also
            written to a ``hyperparameters.json`` file mounted into the container.
        backend: ``Backend.LocalSageMaker`` selects the SageMaker image; any
            other value runs the regular project image.

    Returns:
        (str): The container's output on success, or the text of the
        ``ContainerError`` if the container run fails.  Returns None if the
        docker image cannot be found.
    """
    import shutil  # local import: only used to clean up the SageMaker temp dir

    on_macos = platform == "darwin"

    client = docker.from_env()

    environment = {}
    if 'AWS_PROFILE' in os.environ:
        environment['AWS_PROFILE'] = os.environ['AWS_PROFILE']

    environment[common.LOCAL_EXECUTION] = 'True'

    # Todo: Local runs do not yet set resource limits, but when they do, we'll have to set this
    #environment['DISDAT_CPU_COUNT'] = vcpus

    volumes = {}

    # Only fall back to the home directory when AWS_CONFIG_DIR is not set, and
    # resolve it with expanduser so a missing HOME variable is not a KeyError.
    aws_config_dir = os.getenv('AWS_CONFIG_DIR')
    if aws_config_dir is None:
        aws_config_dir = os.path.join(os.path.expanduser('~'), '.aws')
    if os.path.exists(aws_config_dir):
        volumes[aws_config_dir] = {'bind': '/root/.aws', 'mode': 'rw'}

    local_disdat_meta_dir = DisdatConfig.instance().get_meta_dir()
    volumes[local_disdat_meta_dir] = {'bind': '/root/.disdat', 'mode': 'rw'}

    # Set only for SageMaker runs; removed in the ``finally`` clause below.
    tempdir = None

    try:
        if backend == Backend.LocalSageMaker:
            pipeline_image_name = common.make_sagemaker_project_image_name(pipeline_setup_file)
            tempdir = tempfile.mkdtemp()
            with open(os.path.join(tempdir, 'hyperparameters.json'), 'w') as of:
                json.dump(_sagemaker_hyperparameters_from_arglist(arglist), of)
            # NOTE(review): ``args`` is never used -- the container below is
            # still started with ``arglist``.  The comment suggests the intent
            # was to pass just 'train'; confirm against the SageMaker image
            # entrypoint before changing what is sent to the container.
            args = ['train']  # rewrite to just 'train'
            # On mac OS, tempdir returns /var, but is actually /private/var
            # Add /private since that dir is shared (and not /var) with Docker.
            if on_macos:
                localdir = os.path.join('/private', tempdir[1:])
            else:
                localdir = tempdir
            volumes[localdir] = {'bind': '/opt/ml/input/config/', 'mode': 'rw'}
            _logger.info("VOLUMES: {}".format(volumes))
        else:
            # Add the actual command to the arglist (for non-sagemaker runs)
            arglist = [ENTRYPOINT_BIN] + arglist
            pipeline_image_name = common.make_project_image_name(pipeline_setup_file)

        _logger.debug('Running image {} with arguments {}'.format(pipeline_image_name, arglist))

        # detach=False: the call blocks until the container has exited.
        stdout = client.containers.run(pipeline_image_name,
                                       arglist,
                                       detach=False,
                                       environment=environment,
                                       init=True,
                                       stderr=True,
                                       volumes=volumes)
        stdout = six.ensure_str(stdout)
        if cli:
            print(stdout)
        return stdout

    except docker.errors.ContainerError as ce:
        _logger.error("Internal error running image {}".format(pipeline_image_name))
        _logger.error("Error: {}".format(six.ensure_str(ce.stderr)))
        # six.ensure_str only accepts text/bytes and raises TypeError for an
        # exception object, so render the error with str() instead.
        return str(ce)

    except docker.errors.ImageNotFound:
        _logger.error("Unable to find the docker image {}".format(pipeline_image_name))
        return None

    finally:
        # The container has finished (or failed) by now; drop the temporary
        # hyperparameters directory so repeated runs do not accumulate them.
        if tempdir is not None:
            shutil.rmtree(tempdir, ignore_errors=True)
def assert_or_log(cli, msg):
    """
    Report a failure: log it for CLI callers, raise for API callers.

    Args:
        cli (bool): Whether we were called from the CLI or API
        msg (str): Description of the failure

    Returns:
        None

    Raises:
        AssertionError: If ``cli`` is falsy.  Raised explicitly rather than
            through an ``assert`` statement so the failure is still reported
            when Python runs with optimizations (``-O``) enabled.
    """
    if cli:
        _logger.error(msg)
        return
    raise AssertionError(msg)
def run(self):
    """
    Run the user's ``pipe_run`` and store the result as a hyperframe.

    Steps:
        1.) prepare the keyword arguments for the user's function
        2.) call ``pipe_run`` and time the call
        3.) parse the return value, build a hyperframe from it, replace its
            tags with the complete user tag set and write it to the data context
        4.) when incremental push is enabled and the hyperframe is not tagged
            transient, commit and push it

    On any failure the bundle directory is removed and the exception is
    re-raised.

    Returns:
        (`hyperframe.HyperFrame`): The hyperframe that was written
    """
    run_kwargs = self.prepare_pipe_kwargs(for_run=True)

    path_entry = self.pfs.get_path_cache(self)
    assert path_entry is not None

    # NOTE: If a user changes a task param in run(), and that param
    # parameterizes a dependency in requires(), then running requires() after
    # run() will give different tasks.  To be safe we record the inputs
    # before calling the user's code.
    inputs_snapshot = self.bundle_inputs()

    try:
        start = time.time()  # P3 datetime.now().timestamp()
        user_rtn_val = self.pipe_run(**run_kwargs)
        stop = time.time()  # P3 datetime.now().timestamp()
    except Exception as run_error:
        # The user's pipe failed: remove the bundle dir, then re-raise.
        try:
            user_failure_msg = "User pipe_run encountered exception: {}".format(run_error)
            _logger.error(user_failure_msg)
            PipeBase.rm_bundle_dir(path_entry.path, path_entry.uuid, self.db_targets)
        except OSError as cleanup_error:
            cleanup_failure_msg = "User pipe_run encountered error, and error on remove bundle: {}".format(cleanup_error)
            _logger.error(cleanup_failure_msg)
        raise

    try:
        presentation, frames = PipeBase.parse_return_val(
            path_entry.uuid, user_rtn_val, self.data_context)

        hfr = PipeBase.make_hframe(
            frames,
            path_entry.uuid,
            inputs_snapshot,
            self.pipeline_id(),
            self.pipe_id(),
            self,
            start_ts=start,
            stop_ts=stop,
            tags={"presentable": "True"},
            presentation=presentation,
        )

        # Fold any output tags into the user tag dict.
        if self.output_tags:
            self.user_tags.update(self.output_tags)

        # Mark the root task as such in the tag dict.
        if isinstance(self.calling_task, DriverTask):
            self.user_tags.update({'root_task': 'True'})

        # Lastly add any parameters associated with this class as tags.
        # They are differentiated by a special prefix in the key.
        self.user_tags.update(self._get_subcls_params())

        # Overwrite the hyperframe tags with the complete set of tags.
        hfr.replace_tags(self.user_tags)

        self.data_context.write_hframe(hfr)

        # The tag lookup runs unconditionally, before the push flag is checked.
        is_transient = hfr.get_tag(BUNDLE_TAG_TRANSIENT) is not None

        if self.incremental_push and not is_transient:
            self.pfs.commit(None, None, uuid=path_entry.uuid, data_context=self.data_context)
            self.pfs.push(uuid=path_entry.uuid, data_context=self.data_context)

    except Exception:
        # Any failure while recording the result: remove the bundle dir and re-raise.
        PipeBase.rm_bundle_dir(path_entry.path, path_entry.uuid, self.db_targets)
        raise

    return hfr