def step(ctx, step_name, code_package_sha, code_package_url,
         executable=None, image=None, iam_role=None, execution_role=None,
         cpu=None, gpu=None, memory=None, queue=None, run_time_limit=None,
         shared_memory=None, max_swap=None, swappiness=None, **kwargs):
    """
    Launch `step_name` as an AWS Batch job and block until it finishes.

    The function assembles the command line to run for the step, collects
    environment variables and job attributes, submits the job through
    `Batch.launch_job` and then waits on it with `Batch.wait`.

    Parameters
    ----------
    ctx
        Invocation context. `ctx.obj` supplies the datastore, graph, flow,
        environment, metadata and monitor objects; `ctx.parent.parent.params`
        supplies the top-level CLI options that are forwarded to the job.
    step_name
        Name of the step to execute; used to look up the node in
        `ctx.obj.graph`.
    code_package_sha, code_package_url
        Passed through unchanged to `Batch.launch_job`.
    executable
        Interpreter used in the entrypoint. When None, it is resolved via
        `ctx.obj.environment.executable(step_name)` (not used for R flows).
    image, iam_role, execution_role, cpu, gpu, memory, queue,
    run_time_limit, shared_memory, max_swap, swappiness
        Passed through unchanged to `Batch.launch_job`.
    **kwargs
        Step-level CLI options. `run_id` and `task_id` are required;
        `input_paths` and `retry_count` are read if present. All of them are
        forwarded on the step command line.

    Notes
    -----
    The process exits with `METAFLOW_EXIT_DISALLOW_RETRY` when launching the
    job fails or when waiting raises `BatchKilledException`.
    """
    def echo(batch_id, msg, stream=sys.stdout):
        ctx.obj.echo_always("[%s] %s" % (batch_id, msg))

    if ctx.obj.datastore.datastore_root is None:
        ctx.obj.datastore.datastore_root = \
            ctx.obj.datastore.get_datastore_root_from_config(echo)

    if R.use_r():
        entrypoint = R.entrypoint()
    else:
        if executable is None:
            executable = ctx.obj.environment.executable(step_name)
        entrypoint = '%s -u %s' % (executable,
                                   os.path.basename(sys.argv[0]))

    top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))

    # The input paths are moved into environment variables in chunks of at
    # most `max_size` characters; the command line then only references
    # those variables.
    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size):
                input_paths[i:i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join(
            "${%s}" % s for s in split_vars.keys())

    step_args = " ".join(util.dict_to_cli_options(kwargs))
    step_cli = u"{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint=entrypoint, top_args=top_args, step=step_name,
        step_args=step_args)
    node = ctx.obj.graph[step_name]

    # Get retry information
    retry_count = kwargs.get("retry_count", 0)
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(retry_deco[0].attributes.get(
            "minutes_between_retries", 1))

    # Set batch attributes
    attrs = {
        "metaflow.user": util.get_username(),
        "metaflow.flow_name": ctx.obj.flow.name,
        "metaflow.step_name": step_name,
        "metaflow.run_id": kwargs["run_id"],
        "metaflow.task_id": kwargs["task_id"],
        "metaflow.retry_count": str(retry_count),
        "metaflow.version":
            ctx.obj.environment.get_environment_info()["metaflow_version"],
    }

    env_deco = [deco for deco in node.decorators
                if deco.name == "environment"]
    if env_deco:
        # Work on a copy: the update below must not leak the input-path
        # variables into the decorator's own attribute dict.
        env = dict(env_deco[0].attributes["vars"] or {})
    else:
        env = {}

    datastore_root = ctx.obj.datastore.make_path(
        ctx.obj.flow.name, kwargs['run_id'], step_name, kwargs['task_id'])

    # Add the environment variables related to the input-paths argument
    if split_vars:
        env.update(split_vars)

    # `minutes_between_retries` is only set when the step carries a retry
    # decorator; without one there is no delay to honour, so do not sleep
    # (formatting None with %d would raise a TypeError).
    if retry_count and minutes_between_retries is not None:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next AWS Batch retry"
            % minutes_between_retries)
        time.sleep(minutes_between_retries * 60)

    batch = Batch(ctx.obj.metadata, ctx.obj.environment)
    try:
        with ctx.obj.monitor.measure("metaflow.batch.launch"):
            batch.launch_job(
                step_name,
                step_cli,
                code_package_sha,
                code_package_url,
                ctx.obj.datastore.TYPE,
                image=image,
                queue=queue,
                iam_role=iam_role,
                execution_role=execution_role,
                cpu=cpu,
                gpu=gpu,
                memory=memory,
                run_time_limit=run_time_limit,
                shared_memory=shared_memory,
                max_swap=max_swap,
                swappiness=swappiness,
                env=env,
                attrs=attrs)
    except Exception:
        # Report the full traceback, as the killed-task handler below does,
        # instead of only the exception message.
        traceback.print_exc()
        _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)

    try:
        batch.wait(echo=echo)
    except BatchKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)

    _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
def step(
    ctx,
    step_name,
    code_package_sha,
    code_package_url,
    executable=None,
    image=None,
    iam_role=None,
    execution_role=None,
    cpu=None,
    gpu=None,
    memory=None,
    queue=None,
    run_time_limit=None,
    shared_memory=None,
    max_swap=None,
    swappiness=None,
    host_volumes=None,
    **kwargs
):
    """
    Launch `step_name` as an AWS Batch job and block until it finishes.

    The function assembles the command line to run for the step, collects
    environment variables and job attributes, submits the job through
    `Batch.launch_job` and then waits on it with `Batch.wait`, handing it
    the stdout/stderr log locations of the task datastore.

    Parameters
    ----------
    ctx
        Invocation context. `ctx.obj` supplies the flow datastore, graph,
        flow, environment, metadata and monitor objects;
        `ctx.parent.parent.params` supplies the top-level CLI options that
        are forwarded to the job.
    step_name
        Name of the step to execute; used to look up the node in
        `ctx.obj.graph`.
    code_package_sha, code_package_url
        Passed through unchanged to `Batch.launch_job`.
    executable
        Interpreter used in the entrypoint. When None, it is resolved via
        `ctx.obj.environment.executable(step_name)` (not used for R flows).
    image, iam_role, execution_role, cpu, gpu, memory, queue,
    run_time_limit, shared_memory, max_swap, swappiness, host_volumes
        Passed through unchanged to `Batch.launch_job`.
    **kwargs
        Step-level CLI options. `run_id` and `task_id` are required;
        `input_paths` and `retry_count` are read if present. All of them are
        forwarded on the step command line.

    Notes
    -----
    The process exits with `METAFLOW_EXIT_DISALLOW_RETRY` when launching the
    job fails or when waiting raises `BatchKilledException`. Local metadata
    is synced from the datastore in both cases and after a normal wait.
    """

    def echo(msg, stream="stderr", batch_id=None):
        msg = util.to_unicode(msg)
        if batch_id:
            msg = "[%s] %s" % (batch_id, msg)
        # NOTE(review): the default `stream` is the string "stderr" while the
        # comparison is against the `sys.stderr` object, so `err` is only
        # True when a caller passes `sys.stderr` itself - confirm which form
        # callers use.
        ctx.obj.echo_always(msg, err=(stream == sys.stderr))

    if R.use_r():
        entrypoint = R.entrypoint()
    else:
        if executable is None:
            executable = ctx.obj.environment.executable(step_name)
        entrypoint = "%s -u %s" % (executable, os.path.basename(sys.argv[0]))

    top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))

    # The input paths are moved into environment variables in chunks of at
    # most `max_size` characters; the command line then only references
    # those variables.
    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())

    step_args = " ".join(util.dict_to_cli_options(kwargs))
    step_cli = u"{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint=entrypoint,
        top_args=top_args,
        step=step_name,
        step_args=step_args,
    )
    node = ctx.obj.graph[step_name]

    # Get retry information
    retry_count = kwargs.get("retry_count", 0)
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(
            retry_deco[0].attributes.get("minutes_between_retries", 1)
        )

    # Set batch attributes
    task_spec = {
        "flow_name": ctx.obj.flow.name,
        "step_name": step_name,
        "run_id": kwargs["run_id"],
        "task_id": kwargs["task_id"],
        "retry_count": str(retry_count),
    }
    attrs = {"metaflow.%s" % k: v for k, v in task_spec.items()}
    attrs["metaflow.user"] = util.get_username()
    attrs["metaflow.version"] = ctx.obj.environment.get_environment_info()[
        "metaflow_version"
    ]

    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        # Work on a copy: the update below must not leak the input-path
        # variables into the decorator's own attribute dict.
        env = dict(env_deco[0].attributes["vars"] or {})
    else:
        env = {}

    # Add the environment variables related to the input-paths argument
    if split_vars:
        env.update(split_vars)

    # `minutes_between_retries` is only set when the step carries a retry
    # decorator; without one there is no delay to honour, so do not sleep
    # (formatting None with %d would raise a TypeError).
    if retry_count and minutes_between_retries is not None:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next AWS Batch retry"
            % minutes_between_retries
        )
        time.sleep(minutes_between_retries * 60)

    # this information is needed for log tailing
    ds = ctx.obj.flow_datastore.get_task_datastore(
        mode="w",
        run_id=kwargs["run_id"],
        step_name=step_name,
        task_id=kwargs["task_id"],
        attempt=int(retry_count),
    )
    stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
    stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")

    def _sync_metadata():
        # Only the "local" metadata type is synced back from the datastore.
        if ctx.obj.metadata.TYPE == "local":
            sync_local_metadata_from_datastore(
                DATASTORE_LOCAL_DIR,
                ctx.obj.flow_datastore.get_task_datastore(
                    kwargs["run_id"], step_name, kwargs["task_id"]
                ),
            )

    batch = Batch(ctx.obj.metadata, ctx.obj.environment)
    try:
        with ctx.obj.monitor.measure("metaflow.aws.batch.launch_job"):
            batch.launch_job(
                step_name,
                step_cli,
                task_spec,
                code_package_sha,
                code_package_url,
                ctx.obj.flow_datastore.TYPE,
                image=image,
                queue=queue,
                iam_role=iam_role,
                execution_role=execution_role,
                cpu=cpu,
                gpu=gpu,
                memory=memory,
                run_time_limit=run_time_limit,
                shared_memory=shared_memory,
                max_swap=max_swap,
                swappiness=swappiness,
                env=env,
                attrs=attrs,
                host_volumes=host_volumes,
            )
    except Exception:
        traceback.print_exc()
        _sync_metadata()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)

    try:
        batch.wait(stdout_location, stderr_location, echo=echo)
    except BatchKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    finally:
        # Runs on success, on any exception, and before the exit above.
        _sync_metadata()
def _step_cli(self, node, paths, code_package_url, user_code_retries):
    """
    Build the shell command string that executes `node` inside AWS Batch.

    The result is one or more commands joined with " && ":

    * for the `start` node, a guarded `init` command that exports the
      user-defined parameters and creates the `_parameters` task unless a
      `dump` of that task already succeeds;
    * for a join whose last split parent is a foreach, a command that
      exports the parent tasks;
    * always, the `step` command itself.

    `paths` is used as the `--input-paths` value, except for the `start`
    node where it is replaced by the `_parameters` task path.
    `code_package_url` is accepted but not used here.
    """

    def _random_name():
        # Ten random lowercase letters, used as a scratch file name.
        return "".join(
            [random.choice(string.ascii_lowercase) for _ in range(10)]
        )

    set_env_module = "metaflow.plugins.aws.step_functions.set_batch_environment "

    commands = []

    script = os.path.basename(sys.argv[0])
    interpreter = self.environment.executable(node.name)
    entrypoint = [R.entrypoint()] if R.use_r() else [interpreter, script]

    # Use AWS Batch job identifier as the globally unique task identifier.
    task_id = "${AWS_BATCH_JOB_ID}"

    # Decorators that were attached dynamically are re-attached via --with.
    dynamic_specs = []
    for decorator in node.decorators:
        if not decorator.statically_defined:
            dynamic_specs.append(decorator.make_decorator_spec())
    top_opts_dict = {"with": dynamic_specs}

    # FlowDecorators can define their own top-level options. They are
    # responsible for adding their own top-level options and values through
    # the get_top_level_options() hook. See similar logic in runtime.py.
    for flow_deco in flow_decorators():
        top_opts_dict.update(flow_deco.get_top_level_options())

    top_level = list(dict_to_cli_options(top_opts_dict))
    top_level += [
        "--quiet",
        "--metadata=%s" % self.metadata.TYPE,
        "--environment=%s" % self.environment.TYPE,
        "--datastore=%s" % self.flow_datastore.TYPE,
        "--datastore-root=%s" % self.flow_datastore.datastore_root,
        "--event-logger=%s" % self.event_logger.TYPE,
        "--monitor=%s" % self.monitor.TYPE,
        "--no-pylint",
        "--with=step_functions_internal",
    ]

    if node.name == "start":
        # We need a separate unique ID for the special _parameters task
        task_id_params = "%s-params" % task_id
        parameters_path = "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % task_id_params

        # Export user-defined parameters into runtime environment
        param_file = _random_name()
        export_params = (
            "python -m " + set_env_module + "parameters %s && . `pwd`/%s"
        ) % (param_file, param_file)

        init_cmd = list(entrypoint)
        init_cmd.extend(top_level)
        init_cmd.extend(
            [
                "init",
                "--run-id sfn-$METAFLOW_RUN_ID",
                "--task-id %s" % task_id_params,
            ]
        )
        # Assign tags to run objects.
        if self.tags:
            for tag in self.tags:
                init_cmd.append("--tag %s" % tag)

        # If the start step gets retried, we must be careful not to
        # regenerate multiple parameters tasks. Hence we check first if
        # _parameters exists already.
        exists_cmd = entrypoint + ["dump", "--max-value-size=0", parameters_path]
        commands.append(
            "if ! %s >/dev/null 2>/dev/null; then %s && %s; fi"
            % (" ".join(exists_cmd), export_params, " ".join(init_cmd))
        )
        paths = parameters_path

    if node.type == "join":
        if self.graph[node.split_parents[-1]].type == "foreach":
            parent_tasks_file = _random_name()
            commands.append(
                ("python -m " + set_env_module + "parent_tasks %s && . `pwd`/%s")
                % (parent_tasks_file, parent_tasks_file)
            )

    step_cmd = [
        "step",
        node.name,
        "--run-id sfn-$METAFLOW_RUN_ID",
        "--task-id %s" % task_id,
        # Since retries are handled by AWS Batch, we can rely on
        # AWS_BATCH_JOB_ATTEMPT as the job counter.
        "--retry-count $((AWS_BATCH_JOB_ATTEMPT-1))",
        "--max-user-code-retries %d" % user_code_retries,
        "--input-paths %s" % paths,
    ]
    follows_foreach = any(
        self.graph[parent].type == "foreach" for parent in node.in_funcs
    )
    if follows_foreach:
        # We set the `METAFLOW_SPLIT_INDEX` through JSONPath-foo
        # to pass the state from the parent DynamoDb state for for-each.
        step_cmd.append("--split-index $METAFLOW_SPLIT_INDEX")
    if self.tags:
        for tag in self.tags:
            step_cmd.append("--tag %s" % tag)
    if self.namespace is not None:
        step_cmd.append("--namespace=%s" % self.namespace)

    commands.append(" ".join(entrypoint + top_level + step_cmd))
    return " && ".join(commands)
def _step_cli(self, node, paths, code_package_url, user_code_retries):
    """
    Build the shell command string that executes `node` inside AWS Batch.

    The result is one or more commands joined with ' && ':

    * for the `start` node, a guarded `init` command that exports the
      user-defined parameters and creates the `_parameters` task unless a
      `dump` of that task already succeeds;
    * for a join whose last split parent is a foreach, a command that
      exports the parent tasks;
    * always, the `step` command itself.

    `paths` is used as the `--input-paths` value, except for the `start`
    node where it is replaced by the `_parameters` task path.
    `code_package_url` is accepted but not used here.
    """

    def _random_name():
        # Ten random lowercase letters, used as a scratch file name.
        return ''.join(
            [random.choice(string.ascii_lowercase) for _ in range(10)])

    set_env_module = \
        'metaflow.plugins.aws.step_functions.set_batch_environment '

    commands = []

    script = os.path.basename(sys.argv[0])
    interpreter = self.environment.executable(node.name)
    entrypoint = [R.entrypoint()] if R.use_r() else [interpreter, script]

    # Use AWS Batch job identifier as the globally unique task identifier.
    task_id = '${AWS_BATCH_JOB_ID}'

    # FlowDecorators can define their own top-level options. They are
    # responsible for adding their own top-level options and values through
    # the get_top_level_options() hook. See similar logic in runtime.py.
    top_opts_dict = {}
    for flow_deco in flow_decorators():
        top_opts_dict.update(flow_deco.get_top_level_options())
    top_opts = list(dict_to_cli_options(top_opts_dict))

    if node.name == 'start':
        # We need a separate unique ID for the special _parameters task
        task_id_params = '%s-params' % task_id
        parameters_path = \
            'sfn-${METAFLOW_RUN_ID}/_parameters/%s' % task_id_params

        # Export user-defined parameters into runtime environment
        param_file = _random_name()
        export_params = \
            ('python -m ' + set_env_module + 'parameters %s && . `pwd`/%s') \
            % (param_file, param_file)

        # The init command carries its own option list (note the fixed
        # s3 datastore), separate from the step command's options below.
        init_cmd = list(entrypoint)
        init_cmd.extend(top_opts)
        init_cmd.extend([
            '--quiet',
            '--metadata=%s' % self.metadata.TYPE,
            '--environment=%s' % self.environment.TYPE,
            '--datastore=s3',
            '--event-logger=%s' % self.event_logger.logger_type,
            '--monitor=%s' % self.monitor.monitor_type,
            '--no-pylint',
            'init',
            '--run-id sfn-$METAFLOW_RUN_ID',
            '--task-id %s' % task_id_params,
        ])
        # Assign tags to run objects.
        if self.tags:
            for tag in self.tags:
                init_cmd.append('--tag %s' % tag)

        # If the start step gets retried, we must be careful not to
        # regenerate multiple parameters tasks. Hence we check first if
        # _parameters exists already.
        exists_cmd = entrypoint + \
            ['dump', '--max-value-size=0', parameters_path]
        commands.append(
            'if ! %s >/dev/null 2>/dev/null; then %s && %s; fi'
            % (' '.join(exists_cmd), export_params, ' '.join(init_cmd)))
        paths = parameters_path

    if node.type == 'join':
        if self.graph[node.split_parents[-1]].type == 'foreach':
            parent_tasks_file = _random_name()
            commands.append(
                ('python -m ' + set_env_module
                 + 'parent_tasks %s && . `pwd`/%s')
                % (parent_tasks_file, parent_tasks_file))

    top_level = list(top_opts)
    top_level += [
        '--quiet',
        '--metadata=%s' % self.metadata.TYPE,
        '--environment=%s' % self.environment.TYPE,
        '--datastore=%s' % self.datastore.TYPE,
        '--datastore-root=%s' % self.datastore.datastore_root,
        '--event-logger=%s' % self.event_logger.logger_type,
        '--monitor=%s' % self.monitor.monitor_type,
        '--no-pylint',
        '--with=step_functions_internal',
    ]

    step_cmd = [
        'step',
        node.name,
        '--run-id sfn-$METAFLOW_RUN_ID',
        '--task-id %s' % task_id,
        # Since retries are handled by AWS Batch, we can rely on
        # AWS_BATCH_JOB_ATTEMPT as the job counter.
        '--retry-count $((AWS_BATCH_JOB_ATTEMPT-1))',
        '--max-user-code-retries %d' % user_code_retries,
        '--input-paths %s' % paths,
        # Set decorator to batch to execute `task_*` hooks for batch
        # decorator.
        '--with=batch',
    ]
    follows_foreach = any(
        self.graph[parent].type == 'foreach' for parent in node.in_funcs)
    if follows_foreach:
        # We set the `METAFLOW_SPLIT_INDEX` through JSONPath-foo
        # to pass the state from the parent DynamoDb state for for-each.
        step_cmd.append('--split-index $METAFLOW_SPLIT_INDEX')
    if self.tags:
        for tag in self.tags:
            step_cmd.append('--tag %s' % tag)
    if self.namespace is not None:
        step_cmd.append('--namespace=%s' % self.namespace)

    commands.append(' '.join(entrypoint + top_level + step_cmd))
    return ' && '.join(commands)
def step(
    ctx,
    step_name,
    code_package_sha,
    code_package_url,
    executable=None,
    image=None,
    service_account=None,
    secrets=None,
    node_selector=None,
    k8s_namespace=None,
    cpu=None,
    disk=None,
    memory=None,
    gpu=None,
    gpu_vendor=None,
    run_time_limit=None,
    **kwargs
):
    """
    Launch `step_name` as a Kubernetes job and block until it finishes.

    The function collects environment variables, assembles the command line
    to run for the step, submits the job through `Kubernetes.launch_job` and
    then waits on it with `Kubernetes.wait`, handing it the stdout/stderr
    log locations of the task datastore.

    Parameters
    ----------
    ctx
        Invocation context. `ctx.obj` supplies the flow datastore, graph,
        flow, environment, metadata and monitor objects;
        `ctx.parent.parent.params` supplies the top-level CLI options that
        are forwarded to the job.
    step_name
        Name of the step to execute; used to look up the node in
        `ctx.obj.graph`.
    code_package_sha, code_package_url
        Passed through unchanged to `Kubernetes.launch_job`.
    executable
        Interpreter used in the entrypoint. When None, it is resolved via
        `ctx.obj.environment.executable(step_name)`.
    image
        Passed to `Kubernetes.launch_job` as `docker_image`.
    k8s_namespace
        Passed to `Kubernetes.launch_job` as `namespace`.
    service_account, secrets, node_selector, cpu, disk, memory, gpu,
    gpu_vendor, run_time_limit
        Passed through unchanged to `Kubernetes.launch_job`.
    **kwargs
        Step-level CLI options. `run_id` and `task_id` are required;
        `input_paths` and `retry_count` are read if present. All of them are
        forwarded on the step command line.

    Notes
    -----
    The process exits with `METAFLOW_EXIT_DISALLOW_RETRY` when creating or
    launching the job fails or when waiting raises
    `KubernetesKilledException`. Local metadata is synced from the datastore
    in both cases and after a normal wait.
    """

    def echo(msg, stream="stderr", job_id=None):
        msg = util.to_unicode(msg)
        if job_id:
            msg = "[%s] %s" % (job_id, msg)
        # NOTE(review): the default `stream` is the string "stderr" while the
        # comparison is against the `sys.stderr` object, so `err` is only
        # True when a caller passes `sys.stderr` itself - confirm which form
        # callers use.
        ctx.obj.echo_always(msg, err=(stream == sys.stderr))

    node = ctx.obj.graph[step_name]

    # Construct entrypoint CLI
    if executable is None:
        executable = ctx.obj.environment.executable(step_name)

    # Set environment
    env = {}
    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        # Copy into our own dict: the input-path variables added below must
        # not leak into the decorator's own attribute dict.
        env.update(env_deco[0].attributes["vars"] or {})

    # Set input paths.
    # The input paths are moved into environment variables in chunks of at
    # most `max_size` characters; the command line then only references
    # those variables.
    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())
        env.update(split_vars)

    # Set retry policy.
    retry_count = int(kwargs.get("retry_count", 0))
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(
            retry_deco[0].attributes.get("minutes_between_retries", 2)
        )
    # `minutes_between_retries` is only set when the step carries a retry
    # decorator; without one there is no delay to honour, so do not sleep
    # (formatting None with %d would raise a TypeError).
    if retry_count and minutes_between_retries is not None:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next retry" % minutes_between_retries
        )
        time.sleep(minutes_between_retries * 60)

    step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint="%s -u %s" % (executable, os.path.basename(sys.argv[0])),
        top_args=" ".join(util.dict_to_cli_options(ctx.parent.parent.params)),
        step=step_name,
        step_args=" ".join(util.dict_to_cli_options(kwargs)),
    )

    # Set log tailing.
    ds = ctx.obj.flow_datastore.get_task_datastore(
        mode="w",
        run_id=kwargs["run_id"],
        step_name=step_name,
        task_id=kwargs["task_id"],
        attempt=retry_count,
    )
    stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
    stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")

    def _sync_metadata():
        # Only the "local" metadata type is synced back from the datastore.
        if ctx.obj.metadata.TYPE == "local":
            sync_local_metadata_from_datastore(
                DATASTORE_LOCAL_DIR,
                ctx.obj.flow_datastore.get_task_datastore(
                    kwargs["run_id"], step_name, kwargs["task_id"]
                ),
            )

    try:
        kubernetes = Kubernetes(
            datastore=ctx.obj.flow_datastore,
            metadata=ctx.obj.metadata,
            environment=ctx.obj.environment,
        )
        # Configure and launch Kubernetes job.
        with ctx.obj.monitor.measure("metaflow.kubernetes.launch_job"):
            kubernetes.launch_job(
                flow_name=ctx.obj.flow.name,
                run_id=kwargs["run_id"],
                step_name=step_name,
                task_id=kwargs["task_id"],
                attempt=str(retry_count),
                user=util.get_username(),
                code_package_sha=code_package_sha,
                code_package_url=code_package_url,
                code_package_ds=ctx.obj.flow_datastore.TYPE,
                step_cli=step_cli,
                docker_image=image,
                service_account=service_account,
                secrets=secrets,
                node_selector=node_selector,
                namespace=k8s_namespace,
                cpu=cpu,
                disk=disk,
                memory=memory,
                gpu=gpu,
                gpu_vendor=gpu_vendor,
                run_time_limit=run_time_limit,
                env=env,
            )
    except Exception:
        traceback.print_exc(chain=False)
        _sync_metadata()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)

    try:
        kubernetes.wait(stdout_location, stderr_location, echo=echo)
    except KubernetesKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    finally:
        # Runs on success, on any exception, and before the exit above.
        _sync_metadata()