Code Example #1
File: batch.py  Project: savingoyal/metaflow
    def launch_job(
        self,
        step_name,
        step_cli,
        task_spec,
        code_package_sha,
        code_package_url,
        code_package_ds,
        image,
        queue,
        iam_role=None,
        execution_role=None,  # for FARGATE compatibility
        cpu=None,
        gpu=None,
        memory=None,
        run_time_limit=None,
        shared_memory=None,
        max_swap=None,
        swappiness=None,
        host_volumes=None,
        num_parallel=1,
        env={},
        attrs={},
    ):
        if queue is None:
            queue = next(self._client.active_job_queues(), None)
            if queue is None:
                raise BatchException(
                    "Unable to launch AWS Batch job. No job queue "
                    "specified and no valid & enabled queue found.")
        job = self.create_job(
            step_name,
            capture_output_to_mflog(step_cli),
            task_spec,
            code_package_sha,
            code_package_url,
            code_package_ds,
            image,
            queue,
            iam_role,
            execution_role,
            cpu,
            gpu,
            memory,
            run_time_limit,
            shared_memory,
            max_swap,
            swappiness,
            env=env,
            attrs=attrs,
            host_volumes=host_volumes,
            num_parallel=num_parallel,
        )
        self.num_parallel = num_parallel
        self.job = job.execute()
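As a usage illustration, a call to launch_job might look like the sketch below. Only the keyword names come from the signature above; the batch object, the step CLI string, and all values are hypothetical placeholders.

# Sketch of a hypothetical call; every value below is a placeholder chosen
# for illustration, and `batch` is assumed to be an existing Batch helper.
batch.launch_job(
    step_name="start",
    step_cli="python flow.py step start",        # placeholder step command
    task_spec=task_spec,                         # dict of flow/run/step/task identifiers
    code_package_sha="abc123",                   # placeholder digest
    code_package_url="s3://bucket/package.tgz",  # placeholder location
    code_package_ds="s3",
    image="python:3.8",                          # any image with a Python interpreter
    queue=None,                                  # None falls back to the first active job queue
    cpu="2",
    memory="4096",
    run_time_limit=3600,
)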
Code Example #2
File: kubernetes.py  Project: sappier/metaflow
    def _command(
        self,
        code_package_url,
        step_cmds,
    ):
        mflog_expr = export_mflog_env_vars(
            flow_name=self._flow_name,
            run_id=self._run_id,
            step_name=self._step_name,
            task_id=self._task_id,
            retry_count=self._attempt,
            datastore_type=self._datastore.TYPE,
            stdout_path=STDOUT_PATH,
            stderr_path=STDERR_PATH,
        )
        init_cmds = self._environment.get_package_commands(code_package_url)
        init_expr = " && ".join(init_cmds)
        step_expr = " && ".join([
            capture_output_to_mflog(a)
            for a in (self._environment.bootstrap_commands(self._step_name) +
                      step_cmds)
        ])

        # Construct an entry point that
        # 1) initializes the mflog environment (mflog_expr)
        # 2) bootstraps a metaflow environment (init_expr)
        # 3) executes a task (step_expr)

        # The `true` command is to make sure that the generated command
        # plays well with docker containers which have entrypoint set as
        # eval $@
        cmd_str = "true && mkdir -p %s && %s && %s && %s; " % (
            LOGS_DIR,
            mflog_expr,
            init_expr,
            step_expr,
        )
        # After the task has finished, we save its exit code (fail/success)
        # and persist the final logs. The whole entrypoint should exit
        # with the exit code (c) of the task.
        #
        # Note that if step_expr OOMs, this tail expression is never executed.
        # We lose the last logs in this scenario.
        #
        # TODO: Find a way to capture hard exit logs in Kubernetes.
        cmd_str += "c=$?; %s; exit $c" % BASH_SAVE_LOGS
        return shlex.split('bash -c "%s"' % cmd_str)
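To make the shape of the generated entrypoint concrete, the following sketch substitutes placeholder strings for the mflog, init, and step expressions; in the real method those come from export_mflog_env_vars, get_package_commands / bootstrap_commands, and capture_output_to_mflog, and LOGS_DIR / BASH_SAVE_LOGS are Metaflow constants.

import shlex

# Placeholders standing in for the real sub-expressions and constants.
LOGS_DIR = "/logs"
BASH_SAVE_LOGS = "SAVE_LOGS"
mflog_expr, init_expr, step_expr = "MFLOG_EXPR", "INIT_EXPR", "STEP_EXPR"

cmd_str = "true && mkdir -p %s && %s && %s && %s; " % (
    LOGS_DIR, mflog_expr, init_expr, step_expr)
cmd_str += "c=$?; %s; exit $c" % BASH_SAVE_LOGS
print(shlex.split('bash -c "%s"' % cmd_str))
# ['bash', '-c', 'true && mkdir -p /logs && MFLOG_EXPR && INIT_EXPR && STEP_EXPR; c=$?; SAVE_LOGS; exit $c']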
Code Example #3
File: argo_workflow.py  Project: sappier/metaflow
    def _commands(self, node, retry_count, user_code_retries):
        mflog_expr = export_mflog_env_vars(
            datastore_type='s3',
            stdout_path='/tmp/mflog_stdout',
            stderr_path='/tmp/mflog_stderr',
            flow_name=self.flow.name,
            run_id='{{workflow.name}}',
            step_name=node.name,
            task_id='{{pod.name}}',
            retry_count=retry_count)
        init_cmds = []
        if self.code_package_url:
            init_cmds.extend(
                self.environment.get_package_commands(self.code_package_url))
        init_cmds.extend(self.environment.bootstrap_commands(node.name))
        init_expr = ' && '.join(init_cmds)
        step_expr = " && ".join([
            capture_output_to_mflog(a)
            for a in self._step_commands(node, retry_count, user_code_retries)
        ])
        cmd = ['true', mflog_expr, init_expr, step_expr]
        cmd_str = '%s; c=$?; %s; exit $c' % (
            ' && '.join(c for c in cmd if c), BASH_SAVE_LOGS)
        return shlex.split('bash -c "%s"' % cmd_str)
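One difference from the Kubernetes variant above: the pieces are joined through a filter that drops empty strings, so a missing code package (an empty init_expr) does not leave a dangling " && ". A minimal sketch with placeholder strings:

# Placeholder expressions; init_expr is empty to mimic the case where no
# code package URL and no bootstrap commands are present.
mflog_expr, init_expr, step_expr = "MFLOG_EXPR", "", "STEP_EXPR"
cmd = ['true', mflog_expr, init_expr, step_expr]
print(' && '.join(c for c in cmd if c))
# true && MFLOG_EXPR && STEP_EXPR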
Code Example #4
File: batch.py  Project: sappier/metaflow
    def _command(self, environment, code_package_url, step_name, step_cmds,
                 task_spec):
        mflog_expr = export_mflog_env_vars(datastore_type="s3",
                                           stdout_path=STDOUT_PATH,
                                           stderr_path=STDERR_PATH,
                                           **task_spec)
        init_cmds = environment.get_package_commands(code_package_url)
        init_expr = " && ".join(init_cmds)
        step_expr = " && ".join([
            capture_output_to_mflog(a)
            for a in (environment.bootstrap_commands(step_name) + step_cmds)
        ])

        # construct an entry point that
        # 1) initializes the mflog environment (mflog_expr)
        # 2) bootstraps a metaflow environment (init_expr)
        # 3) executes a task (step_expr)

        # the `true` command is to make sure that the generated command
        # plays well with docker containers which have entrypoint set as
        # eval $@
        cmd_str = "true && mkdir -p %s && %s && %s && %s; " % (
            LOGS_DIR,
            mflog_expr,
            init_expr,
            step_expr,
        )
        # after the task has finished, we save its exit code (fail/success)
        # and persist the final logs. The whole entrypoint should exit
        # with the exit code (c) of the task.
        #
        # Note that if step_expr OOMs, this tail expression is never executed.
        # We lose the last logs in this scenario (although they are visible
        # still through AWS CloudWatch console).
        cmd_str += "c=$?; %s; exit $c" % BASH_SAVE_LOGS
        return shlex.split('bash -c "%s"' % cmd_str)
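Here task_spec is unpacked into export_mflog_env_vars as keyword arguments, so it presumably carries the same identifiers that the Kubernetes and Argo examples pass explicitly. A hypothetical example of such a dict (the key names mirror examples #2 and #3; the values are placeholders):

# Hypothetical task_spec; the keys mirror the keyword arguments passed
# explicitly to export_mflog_env_vars in code examples #2 and #3.
task_spec = {
    "flow_name": "MyFlow",
    "run_id": "1234",
    "step_name": "start",
    "task_id": "5678",
    "retry_count": 0,
}
# export_mflog_env_vars(datastore_type="s3", stdout_path=STDOUT_PATH,
#                       stderr_path=STDERR_PATH, **task_spec)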
Code Example #5
    def _step_cli(self, node, paths, code_package_url, user_code_retries):
        cmds = []

        script_name = os.path.basename(sys.argv[0])
        executable = self.environment.executable(node.name)

        if R.use_r():
            entrypoint = [R.entrypoint()]
        else:
            entrypoint = [executable, script_name]

        # Use AWS Batch job identifier as the globally unique task identifier.
        task_id = "${AWS_BATCH_JOB_ID}"

        # FlowDecorators can define their own top-level options; they add
        # those options and values through the get_top_level_options() hook.
        # See similar logic in runtime.py.
        top_opts_dict = {}
        for deco in flow_decorators():
            top_opts_dict.update(deco.get_top_level_options())
        top_opts = list(dict_to_cli_options(top_opts_dict))

        if node.name == "start":
            # We need a separate unique ID for the special _parameters task
            task_id_params = "%s-params" % task_id
            # Export user-defined parameters into runtime environment
            param_file = "".join(
                random.choice(string.ascii_lowercase) for _ in range(10))
            export_params = " && ".join([
                capture_output_to_mflog(
                    "python -m metaflow.plugins.aws.step_functions.set_batch_environment parameters %s"
                    % param_file),
                ". `pwd`/%s" % param_file,
            ])

            params = (entrypoint + top_opts + [
                "--quiet",
                "--metadata=%s" % self.metadata.TYPE,
                "--environment=%s" % self.environment.TYPE,
                "--datastore=s3",
                "--event-logger=%s" % self.event_logger.logger_type,
                "--monitor=%s" % self.monitor.monitor_type,
                "--no-pylint",
                "init",
                "--run-id sfn-$METAFLOW_RUN_ID",
                "--task-id %s" % task_id_params,
            ])
            # Assign tags to run objects.
            if self.tags:
                params.extend("--tag %s" % tag for tag in self.tags)

            # If the start step gets retried, we must be careful not to
            # regenerate multiple parameters tasks. Hence we check first if
            # _parameters exists already.
            exists = entrypoint + [
                "dump",
                "--max-value-size=0",
                "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % (task_id_params),
            ]
            cmd = "if ! %s >/dev/null 2>/dev/null; then %s && %s; fi" % (
                " ".join(exists),
                export_params,
                capture_output_to_mflog(" ".join(params)),
            )
            cmds.append(cmd)
            paths = "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % (task_id_params)

        if node.type == "join" and self.graph[
                node.split_parents[-1]].type == "foreach":
            parent_tasks_file = "".join(
                random.choice(string.ascii_lowercase) for _ in range(10))
            export_parent_tasks = capture_output_to_mflog(
                "python -m "
                "metaflow.plugins.aws.step_functions.set_batch_environment "
                "parent_tasks %s && . `pwd`/%s" %
                (parent_tasks_file, parent_tasks_file))
            cmds.append(export_parent_tasks)

        top_level = top_opts + [
            "--quiet",
            "--metadata=%s" % self.metadata.TYPE,
            "--environment=%s" % self.environment.TYPE,
            "--datastore=%s" % self.flow_datastore.TYPE,
            "--datastore-root=%s" % self.flow_datastore.datastore_root,
            "--event-logger=%s" % self.event_logger.logger_type,
            "--monitor=%s" % self.monitor.monitor_type,
            "--no-pylint",
            "--with=step_functions_internal",
        ]

        step = [
            "step",
            node.name,
            "--run-id sfn-$METAFLOW_RUN_ID",
            "--task-id %s" % task_id,
            # Since retries are handled by AWS Batch, we can rely on
            # AWS_BATCH_JOB_ATTEMPT as the job counter.
            "--retry-count $((AWS_BATCH_JOB_ATTEMPT-1))",
            "--max-user-code-retries %d" % user_code_retries,
            "--input-paths %s" % paths,
            # Pass the batch decorator so that its `task_*` hooks are
            # executed for this step.
            "--with=batch",
        ]
        if any(self.graph[n].type == "foreach" for n in node.in_funcs):
            # We set `METAFLOW_SPLIT_INDEX` through JSONPath-foo to pass the
            # split index from the parent foreach's DynamoDB state.
            step.append("--split-index $METAFLOW_SPLIT_INDEX")
        if self.tags:
            step.extend("--tag %s" % tag for tag in self.tags)
        if self.namespace is not None:
            step.append("--namespace=%s" % self.namespace)
        cmds.append(
            capture_output_to_mflog(" ".join(entrypoint + top_level + step)))
        return " && ".join(cmds)
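The guard built for the start step above is worth calling out: the parameters task is only created when a probe of the _parameters path fails, which keeps retries of the start step from producing duplicate parameters tasks. A sketch of the resulting shell fragment, with placeholder command strings standing in for the real "dump" probe, parameter-export expression, and wrapped "init" command:

# Placeholder command strings chosen only to show the shape of the guard.
exists_cmd = "EXISTS_CMD"
export_params = "EXPORT_PARAMS"
init_cmd = "INIT_CMD"
guard = "if ! %s >/dev/null 2>/dev/null; then %s && %s; fi" % (
    exists_cmd, export_params, init_cmd)
print(guard)
# if ! EXISTS_CMD >/dev/null 2>/dev/null; then EXPORT_PARAMS && INIT_CMD; fi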