Code Example #1
File: step_functions.py  Project: sappier/metaflow
    def _step_cli(self, node, paths, code_package_url, user_code_retries):
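        """Assemble the shell command that runs this step on AWS Batch.

        The pieces are joined with `&&`: an optional one-time parameters
        export for the start step, an optional parent-task export for
        foreach joins, and the final `step` invocation.
        """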
        cmds = []

        script_name = os.path.basename(sys.argv[0])
        executable = self.environment.executable(node.name)

        if R.use_r():
            entrypoint = [R.entrypoint()]
        else:
            entrypoint = [executable, script_name]

        # Use AWS Batch job identifier as the globally unique task identifier.
        task_id = "${AWS_BATCH_JOB_ID}"

        # FlowDecorators can define their own top-level options. They are
        # responsible for adding their own top-level options and values through
        # the get_top_level_options() hook. See similar logic in runtime.py.
        top_opts_dict = {}
        for deco in flow_decorators():
            top_opts_dict.update(deco.get_top_level_options())
        top_opts = list(dict_to_cli_options(top_opts_dict))

        if node.name == "start":
            # We need a separate unique ID for the special _parameters task
            task_id_params = "%s-params" % task_id
            # Export user-defined parameters into runtime environment
            param_file = "".join(
                random.choice(string.ascii_lowercase) for _ in range(10))
            export_params = (
                "python -m "
                "metaflow.plugins.aws.step_functions.set_batch_environment "
                "parameters %s && . `pwd`/%s" % (param_file, param_file))
            params = (entrypoint + top_opts + [
                "--quiet",
                "--metadata=%s" % self.metadata.TYPE,
                "--environment=%s" % self.environment.TYPE,
                "--datastore=s3",
                "--event-logger=%s" % self.event_logger.logger_type,
                "--monitor=%s" % self.monitor.monitor_type,
                "--no-pylint",
                "init",
                "--run-id sfn-$METAFLOW_RUN_ID",
                "--task-id %s" % task_id_params,
            ])
            # Assign tags to run objects.
            if self.tags:
                params.extend("--tag %s" % tag for tag in self.tags)

            # If the start step gets retried, we must be careful not to
            # regenerate multiple parameters tasks. Hence we check first if
            # _parameters exists already.
            exists = entrypoint + [
                "dump",
                "--max-value-size=0",
                "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % (task_id_params),
            ]
            cmd = "if ! %s >/dev/null 2>/dev/null; then %s && %s; fi" % (
                " ".join(exists),
                export_params,
                " ".join(params),
            )
            cmds.append(cmd)
            paths = "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % (task_id_params)

        if (node.type == "join"
                and self.graph[node.split_parents[-1]].type == "foreach"):
            parent_tasks_file = "".join(
                random.choice(string.ascii_lowercase) for _ in range(10))
            export_parent_tasks = (
                "python -m "
                "metaflow.plugins.aws.step_functions.set_batch_environment "
                "parent_tasks %s && . `pwd`/%s" %
                (parent_tasks_file, parent_tasks_file))
            cmds.append(export_parent_tasks)

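        # Top-level CLI options shared by the final step invocation; compared
        # to the _parameters command above, these also pass the configured
        # datastore root and enable the step_functions_internal decorator.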
        top_level = top_opts + [
            "--quiet",
            "--metadata=%s" % self.metadata.TYPE,
            "--environment=%s" % self.environment.TYPE,
            "--datastore=%s" % self.flow_datastore.TYPE,
            "--datastore-root=%s" % self.flow_datastore.datastore_root,
            "--event-logger=%s" % self.event_logger.logger_type,
            "--monitor=%s" % self.monitor.monitor_type,
            "--no-pylint",
            "--with=step_functions_internal",
        ]

        step = [
            "step",
            node.name,
            "--run-id sfn-$METAFLOW_RUN_ID",
            "--task-id %s" % task_id,
            # Since retries are handled by AWS Batch, we can rely on
            # AWS_BATCH_JOB_ATTEMPT as the job counter.
            "--retry-count $((AWS_BATCH_JOB_ATTEMPT-1))",
            "--max-user-code-retries %d" % user_code_retries,
            "--input-paths %s" % paths,
            # Attach the batch decorator so that its `task_*` hooks are
            # executed.
            "--with=batch",
        ]
        if any(self.graph[n].type == "foreach" for n in node.in_funcs):
            # We set `METAFLOW_SPLIT_INDEX` through JSONPath-foo to pass the
            # state from the parent DynamoDB state for foreach splits.
            step.append("--split-index $METAFLOW_SPLIT_INDEX")
        if self.tags:
            step.extend("--tag %s" % tag for tag in self.tags)
        if self.namespace is not None:
            step.append("--namespace=%s" % self.namespace)
        cmds.append(" ".join(entrypoint + top_level + step))
        return " && ".join(cmds)
Code Example #2
    def _set_constants(self, graph, kwargs):
        # Imported here to prevent a circular dependency.
        from metaflow.decorators import flow_decorators

        # Persist values for parameters and other constants (class-level
        # variables) only once. This method runs before persist_constants,
        # which persists all values set via setattr.
        seen = set()
        for var, param in self._get_parameters():
            norm = param.name.lower()
            if norm in seen:
                raise MetaflowException("Parameter *%s* is specified twice. "
                                        "Note that parameter names are "
                                        "case-insensitive." % param.name)
            seen.add(norm)
        seen.clear()
        self._success = True

        parameters_info = []
        for var, param in self._get_parameters():
            seen.add(var)
            val = kwargs[param.name.replace("-", "_").lower()]
            # Support for delayed evaluation of parameters. This is used for
            # includefile in particular
            if callable(val):
                val = val()
            if val and param.separator:
                val = val.split(param.separator)
            setattr(self, var, val)
            parameters_info.append({
                "name": var,
                "type": param.__class__.__name__
            })

        # Do the same for class variables which will be forced constant as modifications
        # to them don't propagate well since we create a new process for each step and
        # re-read the flow file
        constants_info = []
        for var in dir(self.__class__):
            if var[0] == "_" or var in self._NON_PARAMETERS or var in seen:
                continue
            val = getattr(self.__class__, var)
            if isinstance(val, (MethodType, FunctionType, property, type)):
                continue
            constants_info.append({"name": var, "type": type(val).__name__})
            setattr(self, var, val)

        # We store the DAG information as an artifact called _graph_info
        steps_info, graph_structure = graph.output_steps()

        graph_info = {
            "file": os.path.basename(os.path.abspath(sys.argv[0])),
            "parameters": parameters_info,
            "constants": constants_info,
            "steps": steps_info,
            "graph_structure": graph_structure,
            "doc": graph.doc,
            "decorators": [
                {
                    "name": deco.name,
                    "attributes": deco.attributes,
                    "statically_defined": deco.statically_defined,
                }
                for deco in flow_decorators()
                if not deco.name.startswith("_")
            ],
        }
        self._graph_info = graph_info
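To see what _set_constants ends up persisting, consider a minimal flow.
This sketch is illustrative and not part of the excerpt: `alpha` is declared
as a Parameter and is recorded in parameters_info, while `SEED` is a plain
class variable that is captured as a constant, and the DAG metadata lands in
the `_graph_info` artifact.

from metaflow import FlowSpec, Parameter, step

class ConstantsFlow(FlowSpec):
    # Declared Parameter: captured in parameters_info.
    alpha = Parameter("alpha", default=0.5)
    # Plain class variable: captured in constants_info and forced constant,
    # since per-step processes re-read the flow file.
    SEED = 42

    @step
    def start(self):
        print("alpha=%s seed=%s" % (self.alpha, self.SEED))
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == "__main__":
    ConstantsFlow()

After a run completes, the persisted metadata can be read back through the
Metaflow client, for example Step("ConstantsFlow/<run-id>/start").task
["_graph_info"].data, though the exact access path may vary across Metaflow
versions.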