Example 1
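The `step` command handler that launches a Metaflow step on AWS Batch: it builds the step CLI string, splits long `input_paths` across `METAFLOW_INPUT_PATHS_*` environment variables, sleeps between retries, launches the job with `Batch.launch_job`, waits on it, and syncs metadata when the job fails or is killed.
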
def step(ctx,
         step_name,
         code_package_sha,
         code_package_url,
         executable=None,
         image=None,
         iam_role=None,
         execution_role=None,
         cpu=None,
         gpu=None,
         memory=None,
         queue=None,
         run_time_limit=None,
         shared_memory=None,
         max_swap=None,
         swappiness=None,
         **kwargs):
    def echo(batch_id, msg, stream=sys.stdout):
        ctx.obj.echo_always("[%s] %s" % (batch_id, msg))

    if ctx.obj.datastore.datastore_root is None:
        ctx.obj.datastore.datastore_root = ctx.obj.datastore.get_datastore_root_from_config(
            echo)

    if R.use_r():
        entrypoint = R.entrypoint()
    else:
        if executable is None:
            executable = ctx.obj.environment.executable(step_name)
        entrypoint = '%s -u %s' % (executable, os.path.basename(sys.argv[0]))

    top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))

    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size):
            input_paths[i:i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())

    step_args = " ".join(util.dict_to_cli_options(kwargs))
    step_cli = u"{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint=entrypoint,
        top_args=top_args,
        step=step_name,
        step_args=step_args)
    node = ctx.obj.graph[step_name]

    # Get retry information
    retry_count = kwargs.get("retry_count", 0)
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(retry_deco[0].attributes.get(
            "minutes_between_retries", 1))

    # Set batch attributes
    attrs = {
        "metaflow.user":
        util.get_username(),
        "metaflow.flow_name":
        ctx.obj.flow.name,
        "metaflow.step_name":
        step_name,
        "metaflow.run_id":
        kwargs["run_id"],
        "metaflow.task_id":
        kwargs["task_id"],
        "metaflow.retry_count":
        str(retry_count),
        "metaflow.version":
        ctx.obj.environment.get_environment_info()["metaflow_version"],
    }

    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        env = env_deco[0].attributes["vars"]
    else:
        env = {}

    datastore_root = os.path.join(
        ctx.obj.datastore.make_path(ctx.obj.flow.name, kwargs['run_id'],
                                    step_name, kwargs['task_id']))
    # Add the environment variables related to the input-paths argument
    if split_vars:
        env.update(split_vars)

    if retry_count:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next AWS Batch retry" %
            minutes_between_retries)
        time.sleep(minutes_between_retries * 60)
    batch = Batch(ctx.obj.metadata, ctx.obj.environment)
    try:
        with ctx.obj.monitor.measure("metaflow.batch.launch"):
            batch.launch_job(step_name,
                             step_cli,
                             code_package_sha,
                             code_package_url,
                             ctx.obj.datastore.TYPE,
                             image=image,
                             queue=queue,
                             iam_role=iam_role,
                             execution_role=execution_role,
                             cpu=cpu,
                             gpu=gpu,
                             memory=memory,
                             run_time_limit=run_time_limit,
                             shared_memory=shared_memory,
                             max_swap=max_swap,
                             swappiness=swappiness,
                             env=env,
                             attrs=attrs)
    except Exception as e:
        print(e)
        _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    try:
        batch.wait(echo=echo)
    except BatchKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
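
Both Batch variants (and the Kubernetes handler in Example 5) split an oversized `input_paths` value into 30 KB chunks exposed as `METAFLOW_INPUT_PATHS_<n>` environment variables, handing the step only a reference string that the container shell expands back. A standalone sketch of that pattern, using `os.path.expandvars` purely to simulate the shell-side expansion:

import os

def split_into_env_chunks(input_paths, max_size=30 * 1024):
    # Same chunking as in the handlers above: one env var per 30 KB slice.
    chunks = {
        "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
        for i in range(0, len(input_paths), max_size)
    }
    # The CLI only receives a reference string; the shell expands it back.
    reference = "".join("${%s}" % name for name in chunks)
    return chunks, reference

if __name__ == "__main__":
    paths = ",".join("flow/run/step/%d" % i for i in range(5000))
    chunks, reference = split_into_env_chunks(paths)
    os.environ.update(chunks)                      # what env= passes to the job
    assert os.path.expandvars(reference) == paths  # simulates the in-container expansion
    print("%d chunks, %d-byte reference" % (len(chunks), len(reference)))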
Example 2
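A later variant of the same AWS Batch `step` handler: it adds `host_volumes`, derives the `metaflow.*` job attributes from a `task_spec` dict, resolves stdout/stderr log locations from the task datastore for log tailing, and syncs local metadata in a `finally` block.
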
def step(
    ctx,
    step_name,
    code_package_sha,
    code_package_url,
    executable=None,
    image=None,
    iam_role=None,
    execution_role=None,
    cpu=None,
    gpu=None,
    memory=None,
    queue=None,
    run_time_limit=None,
    shared_memory=None,
    max_swap=None,
    swappiness=None,
    host_volumes=None,
    **kwargs
):
    def echo(msg, stream="stderr", batch_id=None):
        msg = util.to_unicode(msg)
        if batch_id:
            msg = "[%s] %s" % (batch_id, msg)
        ctx.obj.echo_always(msg, err=(stream == sys.stderr))

    if R.use_r():
        entrypoint = R.entrypoint()
    else:
        if executable is None:
            executable = ctx.obj.environment.executable(step_name)
        entrypoint = "%s -u %s" % (executable, os.path.basename(sys.argv[0]))

    top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))

    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())

    step_args = " ".join(util.dict_to_cli_options(kwargs))
    step_cli = u"{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint=entrypoint,
        top_args=top_args,
        step=step_name,
        step_args=step_args,
    )
    node = ctx.obj.graph[step_name]

    # Get retry information
    retry_count = kwargs.get("retry_count", 0)
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(
            retry_deco[0].attributes.get("minutes_between_retries", 1)
        )

    # Set batch attributes
    task_spec = {
        "flow_name": ctx.obj.flow.name,
        "step_name": step_name,
        "run_id": kwargs["run_id"],
        "task_id": kwargs["task_id"],
        "retry_count": str(retry_count),
    }
    attrs = {"metaflow.%s" % k: v for k, v in task_spec.items()}
    attrs["metaflow.user"] = util.get_username()
    attrs["metaflow.version"] = ctx.obj.environment.get_environment_info()[
        "metaflow_version"
    ]

    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        env = env_deco[0].attributes["vars"]
    else:
        env = {}

    # Add the environment variables related to the input-paths argument
    if split_vars:
        env.update(split_vars)

    if retry_count:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next AWS Batch retry"
            % minutes_between_retries
        )
        time.sleep(minutes_between_retries * 60)

    # this information is needed for log tailing
    ds = ctx.obj.flow_datastore.get_task_datastore(
        mode="w",
        run_id=kwargs["run_id"],
        step_name=step_name,
        task_id=kwargs["task_id"],
        attempt=int(retry_count),
    )
    stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
    stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")

    def _sync_metadata():
        if ctx.obj.metadata.TYPE == "local":
            sync_local_metadata_from_datastore(
                DATASTORE_LOCAL_DIR,
                ctx.obj.flow_datastore.get_task_datastore(
                    kwargs["run_id"], step_name, kwargs["task_id"]
                ),
            )

    batch = Batch(ctx.obj.metadata, ctx.obj.environment)
    try:
        with ctx.obj.monitor.measure("metaflow.aws.batch.launch_job"):
            batch.launch_job(
                step_name,
                step_cli,
                task_spec,
                code_package_sha,
                code_package_url,
                ctx.obj.flow_datastore.TYPE,
                image=image,
                queue=queue,
                iam_role=iam_role,
                execution_role=execution_role,
                cpu=cpu,
                gpu=gpu,
                memory=memory,
                run_time_limit=run_time_limit,
                shared_memory=shared_memory,
                max_swap=max_swap,
                swappiness=swappiness,
                env=env,
                attrs=attrs,
                host_volumes=host_volumes,
            )
    except Exception:
        traceback.print_exc()
        _sync_metadata()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    try:
        batch.wait(stdout_location, stderr_location, echo=echo)
    except BatchKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    finally:
        _sync_metadata()
Example 3
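`_step_cli` from the AWS Step Functions plugin: it composes the shell command for one step, guarding the special `_parameters` task on `start` so a retried start step does not regenerate it, and exporting foreach parent-task state before join steps.
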
    def _step_cli(self, node, paths, code_package_url, user_code_retries):
        cmds = []

        script_name = os.path.basename(sys.argv[0])
        executable = self.environment.executable(node.name)

        if R.use_r():
            entrypoint = [R.entrypoint()]
        else:
            entrypoint = [executable, script_name]

        # Use AWS Batch job identifier as the globally unique task identifier.
        task_id = "${AWS_BATCH_JOB_ID}"
        top_opts_dict = {
            "with": [
                decorator.make_decorator_spec()
                for decorator in node.decorators
                if not decorator.statically_defined
            ]
        }
        # FlowDecorators can define their own top-level options. They are
        # responsible for adding their own top-level options and values through
        # the get_top_level_options() hook. See similar logic in runtime.py.
        for deco in flow_decorators():
            top_opts_dict.update(deco.get_top_level_options())

        top_opts = list(dict_to_cli_options(top_opts_dict))

        top_level = top_opts + [
            "--quiet",
            "--metadata=%s" % self.metadata.TYPE,
            "--environment=%s" % self.environment.TYPE,
            "--datastore=%s" % self.flow_datastore.TYPE,
            "--datastore-root=%s" % self.flow_datastore.datastore_root,
            "--event-logger=%s" % self.event_logger.TYPE,
            "--monitor=%s" % self.monitor.TYPE,
            "--no-pylint",
            "--with=step_functions_internal",
        ]

        if node.name == "start":
            # We need a separate unique ID for the special _parameters task
            task_id_params = "%s-params" % task_id
            # Export user-defined parameters into runtime environment
            param_file = "".join(
                random.choice(string.ascii_lowercase) for _ in range(10)
            )
            export_params = (
                "python -m "
                "metaflow.plugins.aws.step_functions.set_batch_environment "
                "parameters %s && . `pwd`/%s" % (param_file, param_file)
            )
            params = (
                entrypoint
                + top_level
                + [
                    "init",
                    "--run-id sfn-$METAFLOW_RUN_ID",
                    "--task-id %s" % task_id_params,
                ]
            )
            # Assign tags to run objects.
            if self.tags:
                params.extend("--tag %s" % tag for tag in self.tags)

            # If the start step gets retried, we must be careful not to
            # regenerate multiple parameters tasks. Hence we check first if
            # _parameters exists already.
            exists = entrypoint + [
                "dump",
                "--max-value-size=0",
                "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % (task_id_params),
            ]
            cmd = "if ! %s >/dev/null 2>/dev/null; then %s && %s; fi" % (
                " ".join(exists),
                export_params,
                " ".join(params),
            )
            cmds.append(cmd)
            paths = "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % (task_id_params)

        if node.type == "join" and self.graph[node.split_parents[-1]].type == "foreach":
            parent_tasks_file = "".join(
                random.choice(string.ascii_lowercase) for _ in range(10)
            )
            export_parent_tasks = (
                "python -m "
                "metaflow.plugins.aws.step_functions.set_batch_environment "
                "parent_tasks %s && . `pwd`/%s" % (parent_tasks_file, parent_tasks_file)
            )
            cmds.append(export_parent_tasks)

        step = [
            "step",
            node.name,
            "--run-id sfn-$METAFLOW_RUN_ID",
            "--task-id %s" % task_id,
            # Since retries are handled by AWS Batch, we can rely on
            # AWS_BATCH_JOB_ATTEMPT as the job counter.
            "--retry-count $((AWS_BATCH_JOB_ATTEMPT-1))",
            "--max-user-code-retries %d" % user_code_retries,
            "--input-paths %s" % paths,
        ]
        if any(self.graph[n].type == "foreach" for n in node.in_funcs):
            # We set `METAFLOW_SPLIT_INDEX` through JSONPath to carry the
            # split index over from the parent foreach's DynamoDB state.
            step.append("--split-index $METAFLOW_SPLIT_INDEX")
        if self.tags:
            step.extend("--tag %s" % tag for tag in self.tags)
        if self.namespace is not None:
            step.append("--namespace=%s" % self.namespace)
        cmds.append(" ".join(entrypoint + top_level + step))
        return " && ".join(cmds)
Example 4
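An earlier variant of `_step_cli`: the same structure, but it hard-codes `--datastore=s3` for the parameters task, reads `event_logger.logger_type` and `monitor.monitor_type`, and adds `--with=batch` to the step options.
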
    def _step_cli(self, node, paths, code_package_url, user_code_retries):
        cmds = []

        script_name = os.path.basename(sys.argv[0])
        executable = self.environment.executable(node.name)

        if R.use_r():
            entrypoint = [R.entrypoint()]
        else:
            entrypoint = [executable, script_name]

        # Use AWS Batch job identifier as the globally unique task identifier.
        task_id = '${AWS_BATCH_JOB_ID}'

        # FlowDecorators can define their own top-level options. They are
        # responsible for adding their own top-level options and values through
        # the get_top_level_options() hook. See similar logic in runtime.py.
        top_opts_dict = {}
        for deco in flow_decorators():
            top_opts_dict.update(deco.get_top_level_options())
        top_opts = list(dict_to_cli_options(top_opts_dict))

        if node.name == 'start':
            # We need a separate unique ID for the special _parameters task
            task_id_params = '%s-params' % task_id
            # Export user-defined parameters into runtime environment
            param_file = ''.join(
                random.choice(string.ascii_lowercase) for _ in range(10))
            export_params = \
                'python -m ' \
                'metaflow.plugins.aws.step_functions.set_batch_environment ' \
                'parameters %s && . `pwd`/%s' % (param_file, param_file)
            params = entrypoint + top_opts +\
                ['--quiet',
                 '--metadata=%s' % self.metadata.TYPE,
                 '--environment=%s' % self.environment.TYPE,
                 '--datastore=s3',
                 '--event-logger=%s' % self.event_logger.logger_type,
                 '--monitor=%s' % self.monitor.monitor_type,
                 '--no-pylint',
                 'init',
                 '--run-id sfn-$METAFLOW_RUN_ID',
                 '--task-id %s' % task_id_params]
            # Assign tags to run objects.
            if self.tags:
                params.extend('--tag %s' % tag for tag in self.tags)

            # If the start step gets retried, we must be careful not to
            # regenerate multiple parameters tasks. Hence we check first if
            # _parameters exists already.
            exists = entrypoint +\
                ['dump',
                 '--max-value-size=0',
                 'sfn-${METAFLOW_RUN_ID}/_parameters/%s' % (task_id_params)]
            cmd = 'if ! %s >/dev/null 2>/dev/null; then %s && %s; fi'\
                  % (' '.join(exists), export_params, ' '.join(params))
            cmds.append(cmd)
            paths = 'sfn-${METAFLOW_RUN_ID}/_parameters/%s' % (task_id_params)

        if node.type == 'join' and\
            self.graph[node.split_parents[-1]].type == 'foreach':
            parent_tasks_file = ''.join(
                random.choice(string.ascii_lowercase) for _ in range(10))
            export_parent_tasks = \
                'python -m ' \
                'metaflow.plugins.aws.step_functions.set_batch_environment ' \
                'parent_tasks %s && . `pwd`/%s' \
                    % (parent_tasks_file, parent_tasks_file)
            cmds.append(export_parent_tasks)

        top_level = top_opts + [
            '--quiet',
            '--metadata=%s' % self.metadata.TYPE,
            '--environment=%s' % self.environment.TYPE,
            '--datastore=%s' % self.datastore.TYPE,
            '--datastore-root=%s' % self.datastore.datastore_root,
            '--event-logger=%s' % self.event_logger.logger_type,
            '--monitor=%s' % self.monitor.monitor_type, '--no-pylint',
            '--with=step_functions_internal'
        ]

        step = [
            'step',
            node.name,
            '--run-id sfn-$METAFLOW_RUN_ID',
            '--task-id %s' % task_id,
            # Since retries are handled by AWS Batch, we can rely on
            # AWS_BATCH_JOB_ATTEMPT as the job counter.
            '--retry-count $((AWS_BATCH_JOB_ATTEMPT-1))',
            '--max-user-code-retries %d' % user_code_retries,
            '--input-paths %s' % paths,
            # Set decorator to batch to execute `task_*` hooks for batch
            # decorator.
            '--with=batch'
        ]
        if any(self.graph[n].type == 'foreach' for n in node.in_funcs):
            # We set `METAFLOW_SPLIT_INDEX` through JSONPath to carry the
            # split index over from the parent foreach's DynamoDB state.
            step.append('--split-index $METAFLOW_SPLIT_INDEX')
        if self.tags:
            step.extend('--tag %s' % tag for tag in self.tags)
        if self.namespace is not None:
            step.append('--namespace=%s' % self.namespace)
        cmds.append(' '.join(entrypoint + top_level + step))
        return ' && '.join(cmds)
Example 5
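The Kubernetes `step` command handler: it mirrors the AWS Batch variant (input-path splitting, retry backoff, log-location lookup) but launches the task through `Kubernetes.launch_job` with service account, secrets, node selector, and namespace options.
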
def step(
    ctx,
    step_name,
    code_package_sha,
    code_package_url,
    executable=None,
    image=None,
    service_account=None,
    secrets=None,
    node_selector=None,
    k8s_namespace=None,
    cpu=None,
    disk=None,
    memory=None,
    gpu=None,
    gpu_vendor=None,
    run_time_limit=None,
    **kwargs
):
    def echo(msg, stream="stderr", job_id=None):
        msg = util.to_unicode(msg)
        if job_id:
            msg = "[%s] %s" % (job_id, msg)
        ctx.obj.echo_always(msg, err=(stream == sys.stderr))

    node = ctx.obj.graph[step_name]

    # Construct entrypoint CLI
    if executable is None:
        executable = ctx.obj.environment.executable(step_name)

    # Set environment
    env = {}
    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        env = env_deco[0].attributes["vars"]

    # Set input paths.
    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())
        env.update(split_vars)

    # Set retry policy.
    retry_count = int(kwargs.get("retry_count", 0))
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(
            retry_deco[0].attributes.get("minutes_between_retries", 2)
        )
    if retry_count:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next retry" % minutes_between_retries
        )
        time.sleep(minutes_between_retries * 60)

    step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint="%s -u %s" % (executable, os.path.basename(sys.argv[0])),
        top_args=" ".join(util.dict_to_cli_options(ctx.parent.parent.params)),
        step=step_name,
        step_args=" ".join(util.dict_to_cli_options(kwargs)),
    )

    # Set log tailing.
    ds = ctx.obj.flow_datastore.get_task_datastore(
        mode="w",
        run_id=kwargs["run_id"],
        step_name=step_name,
        task_id=kwargs["task_id"],
        attempt=int(retry_count),
    )
    stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
    stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")

    def _sync_metadata():
        if ctx.obj.metadata.TYPE == "local":
            sync_local_metadata_from_datastore(
                DATASTORE_LOCAL_DIR,
                ctx.obj.flow_datastore.get_task_datastore(
                    kwargs["run_id"], step_name, kwargs["task_id"]
                ),
            )

    try:
        kubernetes = Kubernetes(
            datastore=ctx.obj.flow_datastore,
            metadata=ctx.obj.metadata,
            environment=ctx.obj.environment,
        )
        # Configure and launch Kubernetes job.
        with ctx.obj.monitor.measure("metaflow.kubernetes.launch_job"):
            kubernetes.launch_job(
                flow_name=ctx.obj.flow.name,
                run_id=kwargs["run_id"],
                step_name=step_name,
                task_id=kwargs["task_id"],
                attempt=str(retry_count),
                user=util.get_username(),
                code_package_sha=code_package_sha,
                code_package_url=code_package_url,
                code_package_ds=ctx.obj.flow_datastore.TYPE,
                step_cli=step_cli,
                docker_image=image,
                service_account=service_account,
                secrets=secrets,
                node_selector=node_selector,
                namespace=k8s_namespace,
                cpu=cpu,
                disk=disk,
                memory=memory,
                gpu=gpu,
                gpu_vendor=gpu_vendor,
                run_time_limit=run_time_limit,
                env=env,
            )
    except Exception:
        traceback.print_exc(chain=False)
        _sync_metadata()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    try:
        kubernetes.wait(stdout_location, stderr_location, echo=echo)
    except KubernetesKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    finally:
        _sync_metadata()
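
Examples 2 and 5 share the same launch/wait/cleanup control flow. A stripped-down skeleton of that pattern, with hypothetical `launch`, `wait` and `sync_metadata` callables standing in for the plugin-specific objects above:

import sys
import traceback

METAFLOW_EXIT_DISALLOW_RETRY = 202  # placeholder value; the real constant is imported by the modules above

def run_remote_task(launch, wait, sync_metadata, killed_exception):
    # Launch failures and explicit kills exit with a code that tells the local
    # runtime not to retry; metadata is synced on every exit path after launch.
    try:
        launch()
    except Exception:
        traceback.print_exc()
        sync_metadata()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    try:
        wait()
    except killed_exception:
        # don't retry killed tasks
        traceback.print_exc()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    finally:
        sync_metadata()

# e.g. run_remote_task(lambda: batch.launch_job(...),
#                      lambda: batch.wait(stdout_location, stderr_location, echo=echo),
#                      _sync_metadata, BatchKilledException)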