def launch_job(
    self,
    step_name,
    step_cli,
    task_spec,
    code_package_sha,
    code_package_url,
    code_package_ds,
    image,
    queue,
    iam_role=None,
    execution_role=None,  # for FARGATE compatibility
    cpu=None,
    gpu=None,
    memory=None,
    run_time_limit=None,
    shared_memory=None,
    max_swap=None,
    swappiness=None,
    host_volumes=None,
    num_parallel=1,
    env={},
    attrs={},
):
    if queue is None:
        queue = next(self._client.active_job_queues(), None)
        if queue is None:
            raise BatchException(
                "Unable to launch AWS Batch job. No job queue "
                "specified and no valid & enabled queue found."
            )
    job = self.create_job(
        step_name,
        capture_output_to_mflog(step_cli),
        task_spec,
        code_package_sha,
        code_package_url,
        code_package_ds,
        image,
        queue,
        iam_role,
        execution_role,
        cpu,
        gpu,
        memory,
        run_time_limit,
        shared_memory,
        max_swap,
        swappiness,
        env=env,
        attrs=attrs,
        host_volumes=host_volumes,
        num_parallel=num_parallel,
    )
    self.num_parallel = num_parallel
    self.job = job.execute()
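# Illustrative sketch (not part of the source) of the `next(..., None)` fallback
# used above for queue selection: it yields the first active job queue if any
# exists, and returns None instead of raising StopIteration when there is none.
# The helper name and queue names below are placeholders.
def first_active_queue(queues):
    return next(iter(queues), None)

assert first_active_queue(["metaflow-queue-a", "metaflow-queue-b"]) == "metaflow-queue-a"
assert first_active_queue([]) is None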
def _command(
    self,
    code_package_url,
    step_cmds,
):
    mflog_expr = export_mflog_env_vars(
        flow_name=self._flow_name,
        run_id=self._run_id,
        step_name=self._step_name,
        task_id=self._task_id,
        retry_count=self._attempt,
        datastore_type=self._datastore.TYPE,
        stdout_path=STDOUT_PATH,
        stderr_path=STDERR_PATH,
    )
    init_cmds = self._environment.get_package_commands(code_package_url)
    init_expr = " && ".join(init_cmds)
    step_expr = " && ".join(
        [
            capture_output_to_mflog(a)
            for a in (
                self._environment.bootstrap_commands(self._step_name) + step_cmds
            )
        ]
    )

    # Construct an entry point that
    # 1) initializes the mflog environment (mflog_expr)
    # 2) bootstraps a metaflow environment (init_expr)
    # 3) executes a task (step_expr)
    #
    # The `true` command is to make sure that the generated command
    # plays well with docker containers which have entrypoint set as
    # eval $@
    cmd_str = "true && mkdir -p %s && %s && %s && %s; " % (
        LOGS_DIR,
        mflog_expr,
        init_expr,
        step_expr,
    )
    # After the task has finished, we save its exit code (fail/success)
    # and persist the final logs. The whole entrypoint should exit
    # with the exit code (c) of the task.
    #
    # Note that if step_expr OOMs, this tail expression is never executed.
    # We lose the last logs in this scenario.
    #
    # TODO: Find a way to capture hard exit logs in Kubernetes.
    cmd_str += "c=$?; %s; exit $c" % BASH_SAVE_LOGS
    return shlex.split('bash -c "%s"' % cmd_str)
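# A minimal sketch of the entrypoint shape that _command returns, with
# placeholder values standing in for LOGS_DIR, the mflog/init/step expressions
# and BASH_SAVE_LOGS (the real values come from Metaflow's mflog helpers).
import shlex

LOGS_DIR = "/logs"                                     # placeholder
mflog_expr = "export MFLOG_STDOUT_PATH=/logs/stdout"   # placeholder
init_expr = "echo bootstrap"                           # placeholder
step_expr = "echo step"                                # placeholder
BASH_SAVE_LOGS = "echo save-logs"                      # placeholder

cmd_str = "true && mkdir -p %s && %s && %s && %s; " % (
    LOGS_DIR,
    mflog_expr,
    init_expr,
    step_expr,
)
cmd_str += "c=$?; %s; exit $c" % BASH_SAVE_LOGS
# shlex.split keeps the double-quoted payload as a single `bash -c` argument:
# ['bash', '-c', 'true && mkdir -p /logs && ...; c=$?; echo save-logs; exit $c']
print(shlex.split('bash -c "%s"' % cmd_str))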
def _commands(self, node, retry_count, user_code_retries):
    mflog_expr = export_mflog_env_vars(
        datastore_type="s3",
        stdout_path="/tmp/mflog_stdout",
        stderr_path="/tmp/mflog_stderr",
        flow_name=self.flow.name,
        run_id="{{workflow.name}}",
        step_name=node.name,
        task_id="{{pod.name}}",
        retry_count=retry_count,
    )
    init_cmds = []
    if self.code_package_url:
        init_cmds.extend(
            self.environment.get_package_commands(self.code_package_url)
        )
    init_cmds.extend(self.environment.bootstrap_commands(node.name))
    init_expr = " && ".join(init_cmds)
    step_expr = " && ".join(
        [
            capture_output_to_mflog(a)
            for a in self._step_commands(node, retry_count, user_code_retries)
        ]
    )
    cmd = ["true", mflog_expr, init_expr, step_expr]
    cmd_str = "%s; c=$?; %s; exit $c" % (
        " && ".join(c for c in cmd if c),
        BASH_SAVE_LOGS,
    )
    return shlex.split('bash -c "%s"' % cmd_str)
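# A small sketch of why this variant filters empty segments with `if c`: when
# there is no code package and no bootstrap commands, init_expr is an empty
# string, and joining it unconditionally would leave a dangling '&&' in the
# generated command. The segment values below are placeholders.
segments = ["true", "export PLACEHOLDER=1", "", "echo step"]
print(" && ".join(c for c in segments if c))  # true && export PLACEHOLDER=1 && echo step
print(" && ".join(segments))                  # true && export PLACEHOLDER=1 &&  && echo step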
def _command(self, environment, code_package_url, step_name, step_cmds, task_spec):
    mflog_expr = export_mflog_env_vars(
        datastore_type="s3",
        stdout_path=STDOUT_PATH,
        stderr_path=STDERR_PATH,
        **task_spec
    )
    init_cmds = environment.get_package_commands(code_package_url)
    init_expr = " && ".join(init_cmds)
    step_expr = " && ".join(
        [
            capture_output_to_mflog(a)
            for a in (environment.bootstrap_commands(step_name) + step_cmds)
        ]
    )

    # construct an entry point that
    # 1) initializes the mflog environment (mflog_expr)
    # 2) bootstraps a metaflow environment (init_expr)
    # 3) executes a task (step_expr)
    #
    # the `true` command is to make sure that the generated command
    # plays well with docker containers which have entrypoint set as
    # eval $@
    cmd_str = "true && mkdir -p %s && %s && %s && %s; " % (
        LOGS_DIR,
        mflog_expr,
        init_expr,
        step_expr,
    )
    # after the task has finished, we save its exit code (fail/success)
    # and persist the final logs. The whole entrypoint should exit
    # with the exit code (c) of the task.
    #
    # Note that if step_expr OOMs, this tail expression is never executed.
    # We lose the last logs in this scenario (although they are still
    # visible through the AWS CloudWatch console).
    cmd_str += "c=$?; %s; exit $c" % BASH_SAVE_LOGS
    return shlex.split('bash -c "%s"' % cmd_str)
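# Sketch of the **task_spec expansion used above: task_spec presumably carries
# the remaining keyword arguments of export_mflog_env_vars (flow_name, run_id,
# step_name, task_id, retry_count; compare the explicit call in the Kubernetes
# variant). The stub below is NOT the real Metaflow helper; it only shows the
# call shape, and every value is a placeholder.
def export_mflog_env_vars_stub(datastore_type, stdout_path, stderr_path, **task_spec):
    return " && ".join(
        "export %s=%s" % (k.upper(), v) for k, v in sorted(task_spec.items())
    )

task_spec = {
    "flow_name": "HelloFlow",
    "run_id": "sfn-123",
    "step_name": "start",
    "task_id": "456",
    "retry_count": 0,
}
print(export_mflog_env_vars_stub("s3", "/tmp/out", "/tmp/err", **task_spec))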
def _step_cli(self, node, paths, code_package_url, user_code_retries):
    cmds = []

    script_name = os.path.basename(sys.argv[0])
    executable = self.environment.executable(node.name)

    if R.use_r():
        entrypoint = [R.entrypoint()]
    else:
        entrypoint = [executable, script_name]

    # Use AWS Batch job identifier as the globally unique task identifier.
    task_id = "${AWS_BATCH_JOB_ID}"

    # FlowDecorators can define their own top-level options. They are
    # responsible for adding their own top-level options and values through
    # the get_top_level_options() hook. See similar logic in runtime.py.
    top_opts_dict = {}
    for deco in flow_decorators():
        top_opts_dict.update(deco.get_top_level_options())
    top_opts = list(dict_to_cli_options(top_opts_dict))

    if node.name == "start":
        # We need a separate unique ID for the special _parameters task.
        task_id_params = "%s-params" % task_id
        # Export user-defined parameters into the runtime environment.
        param_file = "".join(
            random.choice(string.ascii_lowercase) for _ in range(10)
        )
        export_params = " && ".join(
            [
                capture_output_to_mflog(
                    "python -m "
                    "metaflow.plugins.aws.step_functions.set_batch_environment "
                    "parameters %s" % param_file
                ),
                ". `pwd`/%s" % param_file,
            ]
        )
        params = (
            entrypoint
            + top_opts
            + [
                "--quiet",
                "--metadata=%s" % self.metadata.TYPE,
                "--environment=%s" % self.environment.TYPE,
                "--datastore=s3",
                "--event-logger=%s" % self.event_logger.logger_type,
                "--monitor=%s" % self.monitor.monitor_type,
                "--no-pylint",
                "init",
                "--run-id sfn-$METAFLOW_RUN_ID",
                "--task-id %s" % task_id_params,
            ]
        )

        # Assign tags to run objects.
        if self.tags:
            params.extend("--tag %s" % tag for tag in self.tags)

        # If the start step gets retried, we must be careful not to
        # regenerate multiple parameters tasks. Hence we check first if
        # _parameters exists already.
        exists = entrypoint + [
            "dump",
            "--max-value-size=0",
            "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % (task_id_params),
        ]
        cmd = "if ! %s >/dev/null 2>/dev/null; then %s && %s; fi" % (
            " ".join(exists),
            export_params,
            capture_output_to_mflog(" ".join(params)),
        )
        cmds.append(cmd)
        paths = "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % (task_id_params)

    if node.type == "join" and self.graph[node.split_parents[-1]].type == "foreach":
        parent_tasks_file = "".join(
            random.choice(string.ascii_lowercase) for _ in range(10)
        )
        export_parent_tasks = capture_output_to_mflog(
            "python -m "
            "metaflow.plugins.aws.step_functions.set_batch_environment "
            "parent_tasks %s && . `pwd`/%s"
            % (parent_tasks_file, parent_tasks_file)
        )
        cmds.append(export_parent_tasks)

    top_level = top_opts + [
        "--quiet",
        "--metadata=%s" % self.metadata.TYPE,
        "--environment=%s" % self.environment.TYPE,
        "--datastore=%s" % self.flow_datastore.TYPE,
        "--datastore-root=%s" % self.flow_datastore.datastore_root,
        "--event-logger=%s" % self.event_logger.logger_type,
        "--monitor=%s" % self.monitor.monitor_type,
        "--no-pylint",
        "--with=step_functions_internal",
    ]

    step = [
        "step",
        node.name,
        "--run-id sfn-$METAFLOW_RUN_ID",
        "--task-id %s" % task_id,
        # Since retries are handled by AWS Batch, we can rely on
        # AWS_BATCH_JOB_ATTEMPT as the job counter.
        "--retry-count $((AWS_BATCH_JOB_ATTEMPT-1))",
        "--max-user-code-retries %d" % user_code_retries,
        "--input-paths %s" % paths,
        # Pass --with=batch so that the batch decorator's `task_*` hooks
        # are executed.
        "--with=batch",
    ]
    if any(self.graph[n].type == "foreach" for n in node.in_funcs):
        # We set METAFLOW_SPLIT_INDEX through JSONPath-foo to pass the state
        # from the parent DynamoDB state for the foreach split.
        step.append("--split-index $METAFLOW_SPLIT_INDEX")
    if self.tags:
        step.extend("--tag %s" % tag for tag in self.tags)
    if self.namespace is not None:
        step.append("--namespace=%s" % self.namespace)
    cmds.append(
        capture_output_to_mflog(" ".join(entrypoint + top_level + step))
    )
    return " && ".join(cmds)
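# Illustrative sketch (all values are placeholders, and the final
# capture_output_to_mflog wrapping is omitted) of the command _step_cli
# assembles for a regular, non-start step: entrypoint, top-level options and
# the step subcommand are joined into one shell command, and the entries
# collected in cmds are chained with ' && '.
entrypoint = ["python", "helloflow.py"]
top_level = [
    "--quiet",
    "--datastore=s3",
    "--no-pylint",
    "--with=step_functions_internal",
]  # abbreviated
step = [
    "step",
    "train",
    "--run-id sfn-$METAFLOW_RUN_ID",
    "--task-id ${AWS_BATCH_JOB_ID}",
    "--retry-count $((AWS_BATCH_JOB_ATTEMPT-1))",
    "--max-user-code-retries 3",
    "--input-paths sfn-${METAFLOW_RUN_ID}/start/${AWS_BATCH_JOB_ID}",
    "--with=batch",
]
print(" ".join(entrypoint + top_level + step))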