def __init__(self, attributes=None, statically_defined=False):
    super(BatchDecorator, self).__init__(attributes, statically_defined)

    # If no docker image is explicitly specified, impute a default image.
    if not self.attributes["image"]:
        # If metaflow-config specifies a docker image, just use that.
        if BATCH_CONTAINER_IMAGE:
            self.attributes["image"] = BATCH_CONTAINER_IMAGE
        # If metaflow-config doesn't specify a docker image, assign a
        # default docker image.
        else:
            # Metaflow-R has its own default docker image (rocker family)
            if R.use_r():
                self.attributes["image"] = R.container_image()
            # Default to vanilla Python image corresponding to major.minor
            # version of the Python interpreter launching the flow.
            else:
                self.attributes["image"] = "python:%s.%s" % (
                    platform.python_version_tuple()[0],
                    platform.python_version_tuple()[1],
                )
    # Assign docker registry URL for the image.
    if not get_docker_registry(self.attributes["image"]):
        if BATCH_CONTAINER_REGISTRY:
            self.attributes["image"] = "%s/%s" % (
                BATCH_CONTAINER_REGISTRY.rstrip("/"),
                self.attributes["image"],
            )
def __init__(self, attributes=None, statically_defined=False):
    super(BatchDecorator, self).__init__(attributes, statically_defined)
    if not self.attributes['image']:
        if BATCH_CONTAINER_IMAGE:
            self.attributes['image'] = BATCH_CONTAINER_IMAGE
        else:
            if R.use_r():
                self.attributes['image'] = R.container_image()
            else:
                self.attributes['image'] = 'python:%s.%s' % (
                    platform.python_version_tuple()[0],
                    platform.python_version_tuple()[1],
                )
    if not BatchDecorator._get_registry(self.attributes['image']):
        if BATCH_CONTAINER_REGISTRY:
            self.attributes['image'] = '%s/%s' % (
                BATCH_CONTAINER_REGISTRY.rstrip('/'),
                self.attributes['image'],
            )
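For illustration, a minimal standalone sketch of the same image-resolution fallback, assuming no decorator context; `resolve_default_image`, its parameters, and the registry heuristic are hypothetical and not part of Metaflow's API.

import platform

def resolve_default_image(configured_image=None, configured_registry=None, use_r=False):
    # Hypothetical helper mirroring the fallback order above:
    # explicit config image -> R default image -> python:<major>.<minor>.
    if configured_image:
        image = configured_image
    elif use_r:
        image = "rocker/verse"  # placeholder; the real default comes from R.container_image()
    else:
        major, minor = platform.python_version_tuple()[:2]
        image = "python:%s.%s" % (major, minor)
    # Rough heuristic for "already carries a registry": a '/' before the first ':'.
    if configured_registry and "/" not in image.split(":")[0]:
        image = "%s/%s" % (configured_registry.rstrip("/"), image)
    return image

# e.g. on CPython 3.9 with no overrides this returns "python:3.9";
# with configured_registry="myregistry.example.com" it returns
# "myregistry.example.com/python:3.9".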
def runtime_step_cli(self, cli_args, retry_count, max_user_code_retries):
    if retry_count <= max_user_code_retries:
        # After all attempts to run the user code have failed, we don't need
        # Batch anymore. We can execute possible fallback code locally.
        cli_args.commands = ['batch', 'step']
        cli_args.command_args.append(self.package_sha)
        cli_args.command_args.append(self.package_url)
        cli_args.command_options.update(self.attributes)
        cli_args.command_options['run-time-limit'] = self.run_time_limit
        if not R.use_r():
            cli_args.entrypoint[0] = sys.executable
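A hedged sketch of the rewrite that runtime_step_cli performs, using a made-up container class; `CLIArgsSketch` and the option values are illustrative, not Metaflow's real CLIArgs object or defaults.

class CLIArgsSketch:
    # Minimal stand-in with the same attributes the decorator mutates.
    def __init__(self):
        self.entrypoint = ["/usr/bin/python3", "flow.py"]
        self.commands = ["step"]
        self.command_args = ["train"]
        self.command_options = {}

args = CLIArgsSketch()
# While user-code retries remain, the step is re-launched through the
# `batch step` CLI instead of running locally:
args.commands = ["batch", "step"]
args.command_args += ["<package_sha>", "<package_url>"]
args.command_options.update({"image": "python:3.9", "cpu": "1", "memory": "4096"})
args.command_options["run-time-limit"] = 432000  # assumed example value, in seconds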
def step(ctx, step_name, code_package_sha, code_package_url, executable=None,
         image=None, iam_role=None, execution_role=None, cpu=None, gpu=None,
         memory=None, queue=None, run_time_limit=None, shared_memory=None,
         max_swap=None, swappiness=None, **kwargs):
    def echo(batch_id, msg, stream=sys.stdout):
        ctx.obj.echo_always("[%s] %s" % (batch_id, msg))

    if ctx.obj.datastore.datastore_root is None:
        ctx.obj.datastore.datastore_root = \
            ctx.obj.datastore.get_datastore_root_from_config(echo)

    if R.use_r():
        entrypoint = R.entrypoint()
    else:
        if executable is None:
            executable = ctx.obj.environment.executable(step_name)
        entrypoint = '%s -u %s' % (executable, os.path.basename(sys.argv[0]))

    top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))

    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i:i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())

    step_args = " ".join(util.dict_to_cli_options(kwargs))
    step_cli = u"{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint=entrypoint,
        top_args=top_args,
        step=step_name,
        step_args=step_args)
    node = ctx.obj.graph[step_name]

    # Get retry information
    retry_count = kwargs.get("retry_count", 0)
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(
            retry_deco[0].attributes.get("minutes_between_retries", 1))

    # Set batch attributes
    attrs = {
        "metaflow.user": util.get_username(),
        "metaflow.flow_name": ctx.obj.flow.name,
        "metaflow.step_name": step_name,
        "metaflow.run_id": kwargs["run_id"],
        "metaflow.task_id": kwargs["task_id"],
        "metaflow.retry_count": str(retry_count),
        "metaflow.version": ctx.obj.environment.get_environment_info()[
            "metaflow_version"],
    }

    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        env = env_deco[0].attributes["vars"]
    else:
        env = {}

    datastore_root = os.path.join(
        ctx.obj.datastore.make_path(
            ctx.obj.flow.name, kwargs['run_id'], step_name, kwargs['task_id']))

    # Add the environment variables related to the input-paths argument
    if split_vars:
        env.update(split_vars)

    if retry_count:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next AWS Batch retry"
            % minutes_between_retries)
        time.sleep(minutes_between_retries * 60)

    batch = Batch(ctx.obj.metadata, ctx.obj.environment)
    try:
        with ctx.obj.monitor.measure("metaflow.batch.launch"):
            batch.launch_job(
                step_name,
                step_cli,
                code_package_sha,
                code_package_url,
                ctx.obj.datastore.TYPE,
                image=image,
                queue=queue,
                iam_role=iam_role,
                execution_role=execution_role,
                cpu=cpu,
                gpu=gpu,
                memory=memory,
                run_time_limit=run_time_limit,
                shared_memory=shared_memory,
                max_swap=max_swap,
                swappiness=swappiness,
                env=env,
                attrs=attrs)
    except Exception as e:
        print(e)
        _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    try:
        batch.wait(echo=echo)
    except BatchKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
def step(
    ctx,
    step_name,
    code_package_sha,
    code_package_url,
    executable=None,
    image=None,
    iam_role=None,
    execution_role=None,
    cpu=None,
    gpu=None,
    memory=None,
    queue=None,
    run_time_limit=None,
    shared_memory=None,
    max_swap=None,
    swappiness=None,
    host_volumes=None,
    **kwargs
):
    def echo(msg, stream="stderr", batch_id=None):
        msg = util.to_unicode(msg)
        if batch_id:
            msg = "[%s] %s" % (batch_id, msg)
        ctx.obj.echo_always(msg, err=(stream == sys.stderr))

    if R.use_r():
        entrypoint = R.entrypoint()
    else:
        if executable is None:
            executable = ctx.obj.environment.executable(step_name)
        entrypoint = "%s -u %s" % (executable, os.path.basename(sys.argv[0]))

    top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))

    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())

    step_args = " ".join(util.dict_to_cli_options(kwargs))
    step_cli = u"{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint=entrypoint,
        top_args=top_args,
        step=step_name,
        step_args=step_args,
    )
    node = ctx.obj.graph[step_name]

    # Get retry information
    retry_count = kwargs.get("retry_count", 0)
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(
            retry_deco[0].attributes.get("minutes_between_retries", 1)
        )

    # Set batch attributes
    task_spec = {
        "flow_name": ctx.obj.flow.name,
        "step_name": step_name,
        "run_id": kwargs["run_id"],
        "task_id": kwargs["task_id"],
        "retry_count": str(retry_count),
    }
    attrs = {"metaflow.%s" % k: v for k, v in task_spec.items()}
    attrs["metaflow.user"] = util.get_username()
    attrs["metaflow.version"] = ctx.obj.environment.get_environment_info()[
        "metaflow_version"
    ]

    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        env = env_deco[0].attributes["vars"]
    else:
        env = {}

    # Add the environment variables related to the input-paths argument
    if split_vars:
        env.update(split_vars)

    if retry_count:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next AWS Batch retry"
            % minutes_between_retries
        )
        time.sleep(minutes_between_retries * 60)

    # This information is needed for log tailing
    ds = ctx.obj.flow_datastore.get_task_datastore(
        mode="w",
        run_id=kwargs["run_id"],
        step_name=step_name,
        task_id=kwargs["task_id"],
        attempt=int(retry_count),
    )
    stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
    stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")

    def _sync_metadata():
        if ctx.obj.metadata.TYPE == "local":
            sync_local_metadata_from_datastore(
                DATASTORE_LOCAL_DIR,
                ctx.obj.flow_datastore.get_task_datastore(
                    kwargs["run_id"], step_name, kwargs["task_id"]
                ),
            )

    batch = Batch(ctx.obj.metadata, ctx.obj.environment)
    try:
        with ctx.obj.monitor.measure("metaflow.aws.batch.launch_job"):
            batch.launch_job(
                step_name,
                step_cli,
                task_spec,
                code_package_sha,
                code_package_url,
                ctx.obj.flow_datastore.TYPE,
                image=image,
                queue=queue,
                iam_role=iam_role,
                execution_role=execution_role,
                cpu=cpu,
                gpu=gpu,
                memory=memory,
                run_time_limit=run_time_limit,
                shared_memory=shared_memory,
                max_swap=max_swap,
                swappiness=swappiness,
                env=env,
                attrs=attrs,
                host_volumes=host_volumes,
            )
    except Exception as e:
        traceback.print_exc()
        _sync_metadata()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    try:
        batch.wait(stdout_location, stderr_location, echo=echo)
    except BatchKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    finally:
        _sync_metadata()
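The METAFLOW_INPUT_PATHS_* handling above works around environment-variable size limits by splitting one long `input_paths` string into 30 KB pieces that the container's shell re-assembles. A minimal, standalone sketch of that scheme, assuming nothing about the rest of the CLI; `split_input_paths` is a hypothetical name.

def split_input_paths(input_paths, max_size=30 * 1024):
    # Break one long string into <= max_size pieces, each carried by its own
    # env var, and return a shell expression that re-assembles them in order.
    chunks = {
        "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
        for i in range(0, len(input_paths), max_size)
    }
    placeholder = "".join("${%s}" % name for name in chunks)
    return chunks, placeholder

# e.g. a 70 KB path list yields three env vars and the placeholder
# "${METAFLOW_INPUT_PATHS_0}${METAFLOW_INPUT_PATHS_1}${METAFLOW_INPUT_PATHS_2}",
# which the shell expands back into the original string inside the container.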
def _step_cli(self, node, paths, code_package_url, user_code_retries):
    cmds = []
    script_name = os.path.basename(sys.argv[0])
    executable = self.environment.executable(node.name)
    if R.use_r():
        entrypoint = [R.entrypoint()]
    else:
        entrypoint = [executable, script_name]

    # Use AWS Batch job identifier as the globally unique task identifier.
    task_id = '${AWS_BATCH_JOB_ID}'
    if node.name == 'start':
        # We need a separate unique ID for the special _parameters task
        task_id_params = '%s-params' % task_id
        # Export user-defined parameters into runtime environment
        param_file = ''.join(
            random.choice(string.ascii_lowercase) for _ in range(10))
        export_params = \
            'python -m ' \
            'metaflow.plugins.aws.step_functions.set_batch_environment ' \
            'parameters %s && . `pwd`/%s' % (param_file, param_file)
        params = entrypoint + \
            ['--quiet',
             '--metadata=%s' % self.metadata.TYPE,
             '--environment=%s' % self.environment.TYPE,
             '--datastore=s3',
             '--event-logger=%s' % self.event_logger.logger_type,
             '--monitor=%s' % self.monitor.monitor_type,
             '--no-pylint',
             'init',
             '--run-id sfn-$METAFLOW_RUN_ID',
             '--task-id %s' % task_id_params]
        # If the start step gets retried, we must be careful not to
        # regenerate multiple parameters tasks. Hence we check first if
        # _parameters exists already.
        exists = entrypoint + \
            ['dump',
             '--max-value-size=0',
             'sfn-${METAFLOW_RUN_ID}/_parameters/%s' % (task_id_params)]
        cmd = 'if ! %s >/dev/null 2>/dev/null; then %s && %s; fi' \
            % (' '.join(exists), export_params, ' '.join(params))
        cmds.append(cmd)
        paths = 'sfn-${METAFLOW_RUN_ID}/_parameters/%s' % (task_id_params)

    if node.type == 'join' and \
            self.graph[node.split_parents[-1]].type == 'foreach':
        parent_tasks_file = ''.join(
            random.choice(string.ascii_lowercase) for _ in range(10))
        export_parent_tasks = \
            'python -m ' \
            'metaflow.plugins.aws.step_functions.set_batch_environment ' \
            'parent_tasks %s && . `pwd`/%s' \
            % (parent_tasks_file, parent_tasks_file)
        cmds.append(export_parent_tasks)

    top_level = [
        '--quiet',
        '--metadata=%s' % self.metadata.TYPE,
        '--environment=%s' % self.environment.TYPE,
        '--datastore=%s' % self.datastore.TYPE,
        '--datastore-root=%s' % self.datastore.datastore_root,
        '--event-logger=%s' % self.event_logger.logger_type,
        '--monitor=%s' % self.monitor.monitor_type,
        '--no-pylint',
        '--with=step_functions_internal'
    ]

    step = [
        'step',
        node.name,
        '--run-id sfn-$METAFLOW_RUN_ID',
        '--task-id %s' % task_id,
        # Since retries are handled by AWS Batch, we can rely on
        # AWS_BATCH_JOB_ATTEMPT as the job counter.
        '--retry-count $((AWS_BATCH_JOB_ATTEMPT-1))',
        '--max-user-code-retries %d' % user_code_retries,
        '--input-paths %s' % paths,
        # Set decorator to batch to execute `task_*` hooks for batch
        # decorator.
        '--with=batch'
    ]
    if any(self.graph[n].type == 'foreach' for n in node.in_funcs):
        # We set the `METAFLOW_SPLIT_INDEX` through JSONPath-foo
        # to pass the state from the parent DynamoDb state for for-each.
        step.append('--split-index $METAFLOW_SPLIT_INDEX')
    if self.tags:
        step.extend('--tag %s' % tag for tag in self.tags)
    if self.namespace:
        step.append('--namespace %s' % self.namespace)
    cmds.append(' '.join(entrypoint + top_level + step))
    return ' && '.join(cmds)
def _step_cli(self, node, paths, code_package_url, user_code_retries):
    cmds = []
    script_name = os.path.basename(sys.argv[0])
    executable = self.environment.executable(node.name)
    if R.use_r():
        entrypoint = [R.entrypoint()]
    else:
        entrypoint = [executable, script_name]

    # Use AWS Batch job identifier as the globally unique task identifier.
    task_id = "${AWS_BATCH_JOB_ID}"

    # FlowDecorators can define their own top-level options. They are
    # responsible for adding their own top-level options and values through
    # the get_top_level_options() hook. See similar logic in runtime.py.
    top_opts_dict = {}
    for deco in flow_decorators():
        top_opts_dict.update(deco.get_top_level_options())
    top_opts = list(dict_to_cli_options(top_opts_dict))

    if node.name == "start":
        # We need a separate unique ID for the special _parameters task
        task_id_params = "%s-params" % task_id
        # Export user-defined parameters into runtime environment
        param_file = "".join(
            random.choice(string.ascii_lowercase) for _ in range(10))
        export_params = (
            "python -m "
            "metaflow.plugins.aws.step_functions.set_batch_environment "
            "parameters %s && . `pwd`/%s" % (param_file, param_file))
        params = (entrypoint + top_opts + [
            "--quiet",
            "--metadata=%s" % self.metadata.TYPE,
            "--environment=%s" % self.environment.TYPE,
            "--datastore=s3",
            "--event-logger=%s" % self.event_logger.logger_type,
            "--monitor=%s" % self.monitor.monitor_type,
            "--no-pylint",
            "init",
            "--run-id sfn-$METAFLOW_RUN_ID",
            "--task-id %s" % task_id_params,
        ])
        # Assign tags to run objects.
        if self.tags:
            params.extend("--tag %s" % tag for tag in self.tags)

        # If the start step gets retried, we must be careful not to
        # regenerate multiple parameters tasks. Hence we check first if
        # _parameters exists already.
        exists = entrypoint + [
            "dump",
            "--max-value-size=0",
            "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % (task_id_params),
        ]
        cmd = "if ! %s >/dev/null 2>/dev/null; then %s && %s; fi" % (
            " ".join(exists),
            export_params,
            " ".join(params),
        )
        cmds.append(cmd)
        paths = "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % (task_id_params)

    if node.type == "join" and \
            self.graph[node.split_parents[-1]].type == "foreach":
        parent_tasks_file = "".join(
            random.choice(string.ascii_lowercase) for _ in range(10))
        export_parent_tasks = (
            "python -m "
            "metaflow.plugins.aws.step_functions.set_batch_environment "
            "parent_tasks %s && . `pwd`/%s"
            % (parent_tasks_file, parent_tasks_file))
        cmds.append(export_parent_tasks)

    top_level = top_opts + [
        "--quiet",
        "--metadata=%s" % self.metadata.TYPE,
        "--environment=%s" % self.environment.TYPE,
        "--datastore=%s" % self.flow_datastore.TYPE,
        "--datastore-root=%s" % self.flow_datastore.datastore_root,
        "--event-logger=%s" % self.event_logger.logger_type,
        "--monitor=%s" % self.monitor.monitor_type,
        "--no-pylint",
        "--with=step_functions_internal",
    ]

    step = [
        "step",
        node.name,
        "--run-id sfn-$METAFLOW_RUN_ID",
        "--task-id %s" % task_id,
        # Since retries are handled by AWS Batch, we can rely on
        # AWS_BATCH_JOB_ATTEMPT as the job counter.
        "--retry-count $((AWS_BATCH_JOB_ATTEMPT-1))",
        "--max-user-code-retries %d" % user_code_retries,
        "--input-paths %s" % paths,
        # Set decorator to batch to execute `task_*` hooks for batch
        # decorator.
        "--with=batch",
    ]
    if any(self.graph[n].type == "foreach" for n in node.in_funcs):
        # We set the `METAFLOW_SPLIT_INDEX` through JSONPath-foo
        # to pass the state from the parent DynamoDb state for for-each.
        step.append("--split-index $METAFLOW_SPLIT_INDEX")
    if self.tags:
        step.extend("--tag %s" % tag for tag in self.tags)
    if self.namespace is not None:
        step.append("--namespace=%s" % self.namespace)
    cmds.append(" ".join(entrypoint + top_level + step))
    return " && ".join(cmds)