Example No. 1
def run_canned_step(name,
                    args,
                    inputs=None,
                    outputs=None,
                    step_name=None,
                    cache=None):
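    """Run a canned (pre-defined) step from the step zoo.

    Returns the step's proto representation when a proto backend
    (proto_repr) is available, and None otherwise.
    """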
    func_name, caller_line = utils.invocation_location()
    func_name = (utils.argo_safe_name(step_name)
                 if step_name is not None else func_name)
    step_name = step_update_utils.update_step(func_name, args, step_name,
                                              caller_line)
    tmpl_args = []
    if states._outputs_tmp is not None:
        tmpl_args.extend(states._outputs_tmp)
    pb_step = None
    if proto_repr:
        pb_step = proto_repr.step_repr(  # noqa: F841
            input=inputs,
            output=outputs,
            canned_step_name=name,
            canned_step_args=args,
            step_name=step_name,
            tmpl_name=step_name + "-tmpl",
            args=tmpl_args,
            cache=cache,
        )
        proto_repr.add_deps_to_step(step_name)
    return pb_step
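
A hedged usage sketch: the canned step name and args below are placeholders for illustration, not steps guaranteed to exist in any real step zoo.

# Hypothetical invocation of a canned step registered in the step zoo;
# "sleep" and its args are illustrative placeholders.
run_canned_step(
    name="sleep",
    args={"seconds": "10"},
    step_name="wait-10s",
)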
Example No. 2
def run_script(
    image,
    command=None,
    source=None,
    env=None,
    resources=None,
    secret=None,
    timeout=None,
    retry=None,
    step_name=None,
    image_pull_policy=None,
    pool=None,
    daemon=False,
):
    """Generate an Argo script template.  For example,
    https://github.com/argoproj/argo/tree/master/examples#scripts--results.
    Step_name is only used for annotating step while developing step zoo.
    """
    func_name, caller_line = utils.invocation_location()
    func_name = (utils.argo_safe_name(step_name)
                 if step_name is not None else func_name)

    if states.workflow.get_template(func_name) is None:
        if source is None:
            raise ValueError("Input script can not be null")

        template = Script(
            name=func_name,
            image=image,
            command=command,
            source=source,
            env=env,
            secret=states.get_secret(secret),
            resources=resources,
            timeout=timeout,
            retry=retry,
            image_pull_policy=image_pull_policy,
            pool=pool,
            daemon=daemon,
        )
        states.workflow.add_template(template)

    step_name = step_update_utils.update_step(func_name,
                                              args=None,
                                              step_name=step_name,
                                              caller_line=caller_line)
    rets = _script_output(step_name, func_name)
    states._steps_outputs[step_name] = rets

    # TODO(typhoonzero): return pb_step when using a couler server.
    pb_step = proto_repr.step_repr(  # noqa: F841
        step_name=step_name,
        tmpl_name=func_name,
        image=image,
        command=command,
        source=source,
        script_output=rets,
    )

    return rets
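
A minimal usage sketch of run_script, modeled on couler's coin-flip example; the image tag and script body are illustrative, assuming couler is imported as couler.argo.

import couler.argo as couler

def say_hello():
    print("hello from couler")

# The function body becomes the script source executed in the container.
couler.run_script(image="python:alpine3.6", command="python",
                  source=say_hello)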
Example No. 3
def __init__(self, path, type=None, is_global=False):
    # TODO (terrytangyuan): This seems hacky.
    #   If line number changes, we need to update tests as well.
    _, caller_line = utils.invocation_location()
    self.id = "output-id-%s" % caller_line
    self.path = path
    # TODO (terrytangyuan): this is not used for now and we currently
    #   only support "valueFrom".
    self.type = type
    self.is_global = is_global
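
A minimal construction sketch, assuming this is the Output class defined above; the path is illustrative.

# The id encodes the caller's line number, so Outputs constructed on
# different lines receive distinct ids.
out = Output(path="/mnt/data/result.txt")  # illustrative path
assert out.id.startswith("output-id-")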
Example No. 4
def concurrent(function_list, subtasks=False):
    """
    Start different jobs at the same time
    subtasks: each function F of function_list contains multiple steps.
    Then, for each F, we create a sub-steps template.
    """
    if not isinstance(function_list, list):
        raise SyntaxError("require input functions as list")

    _, con_caller_line = utils.invocation_location()

    states._concurrent_func_line = con_caller_line
    states._run_concurrent_lock = True

    function_rets = []
    for function in function_list:
        # In case different parallel steps use the same function name
        states._concurrent_func_id = states._concurrent_func_id + 1
        if callable(function):
            if subtasks is True:
                # 1. generate the sub-steps template
                # 2. for each step in F, update the sub_steps template
                # 3. append the steps into the template
                # 4. for F itself, update the main control flow step
                states._sub_steps = OrderedDict()
                tmp_concurrent_func_id = states._concurrent_func_id
                states._run_concurrent_lock = False
                ret = function()
                states._concurrent_func_id = tmp_concurrent_func_id
                func_name = "concurrent-task-%s" % states._concurrent_func_id
                template = Steps(
                    name=func_name, steps=list(states._sub_steps.values())
                )
                states.workflow.add_template(template)
                states._sub_steps = None
                # TODO: add the args for the sub task
                states._run_concurrent_lock = True
                _update_steps(
                    "concurrent_func_name",
                    con_caller_line,
                    args=None,
                    template_name=func_name,
                )
            else:
                ret = function()

            function_rets.append(ret)
        else:
            raise TypeError("require loop over a function to run")

    states._run_concurrent_lock = False
    states._concurrent_func_id = 0

    return function_rets
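
A usage sketch of concurrent, assuming couler is imported as couler.argo; the images and messages are illustrative.

import couler.argo as couler

def job_a():
    return couler.run_container(
        image="docker/whalesay", command=["cowsay"], args=["job-a"])

def job_b():
    return couler.run_container(
        image="docker/whalesay", command=["cowsay"], args=["job-b"])

# Both steps run in a single parallel block of the workflow.
couler.concurrent([job_a, job_b])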
Example No. 5
def dag(dependency_graph):
    """
    Generate a DAG of Argo YAML
    Note: couler.set_dependencies() is more preferable.
    https://github.com/argoproj/argo/blob/master/examples/dag-coinflip.yaml
    """
    if not isinstance(dependency_graph, list):
        raise SyntaxError("require input as list")

    states.workflow.enable_dag_mode()

    _, call_line = utils.invocation_location()

    states._dag_caller_line = call_line

    for edges in dependency_graph:
        states._upstream_dag_task = None
        if isinstance(edges, list):
            for node in edges:
                if isinstance(node, types.FunctionType):
                    node()
                else:
                    raise TypeError("require loop over a function to run")
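
A usage sketch of dag, assuming step functions job_a through job_d like those in the concurrent sketch above; each inner list is walked in order, so a two-element list [upstream, downstream] declares one dependency edge.

# A diamond-shaped dependency graph.
couler.dag([
    [job_a],          # job_a has no upstream dependency
    [job_a, job_b],   # job_b runs after job_a
    [job_a, job_c],   # job_c runs after job_a
    [job_b, job_d],   # job_d runs after job_b
    [job_c, job_d],   # ... and after job_c
])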
Example No. 6
def train(
    image=None,
    command="",
    secret=None,
    no_chief=True,
    chief_image=None,
    chief_resources=None,
    chief_restart_policy="Never",
    chief_command=None,
    num_ps=0,
    ps_image=None,
    ps_resources=None,
    ps_restart_policy="Never",
    ps_command=None,
    num_workers=0,
    worker_image=None,
    worker_resources=None,
    worker_restart_policy="Never",
    worker_command=None,
    clean_pod_policy="Running",
    timeout=None,
):
    name = "tf-train-%s" % str(uuid.uuid4())
    success_condition = ("status.replicaStatuses.Worker.succeeded == %s" %
                         num_workers)
    failure_condition = "status.replicaStatuses.Worker.failed > 0"

    manifest = copy.deepcopy(manifest_template)
    manifest["metadata"].update({"name": name})
    manifest["spec"].update({"cleanPodPolicy": clean_pod_policy})

    if not no_chief:
        chief_image = chief_image if chief_image else image
        chief_command = chief_command if chief_command else command

        chief_pod = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="Chief",
            image=chief_image,
            replicas=1,
            secret=secret,
            command=chief_command,
            resources=chief_resources,
            restart_policy=chief_restart_policy,
        )

        manifest["spec"]["tfReplicaSpecs"].update({"Chief": chief_pod})

    if num_ps > 0:
        ps_image = ps_image if ps_image else image
        ps_command = ps_command if ps_command else command

        ps_pod = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="PS",
            image=ps_image,
            replicas=num_ps,
            secret=secret,
            command=ps_command,
            resources=ps_resources,
            restart_policy=ps_restart_policy,
        )

        manifest["spec"]["tfReplicaSpecs"].update({"PS": ps_pod})

    if num_workers > 0:
        worker_image = worker_image if worker_image else image
        worker_command = worker_command if worker_command else command

        worker_pod = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="Worker",
            image=worker_image,
            replicas=num_workers,
            secret=secret,
            command=worker_command,
            resources=worker_resources,
            restart_policy=worker_restart_policy,
        )

        manifest["spec"]["tfReplicaSpecs"].update({"Worker": worker_pod})

    step_name, _ = utils.invocation_location()

    couler.run_job(
        manifest=pyaml.dump(manifest),
        success_condition=success_condition,
        failure_condition=failure_condition,
        step_name=step_name,
        timeout=timeout,
    )
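
A hedged usage sketch; the module path couler.steps.tensorflow, image, and command are assumptions for illustration.

import couler.steps.tensorflow as tf_step  # assumed module path

# Submit a TFJob with one PS and two workers; the chief is skipped
# because no_chief defaults to True.
tf_step.train(
    image="tensorflow/tensorflow:1.13.1",  # illustrative image
    command="python /app/mnist.py",        # illustrative command
    num_ps=1,
    num_workers=2,
)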
Example No. 7
def inner_func():
    func_name, _ = utils.invocation_location()
    self.assertEqual("test-invocation-location", func_name)
Example No. 8
def run_container(
    image,
    command=None,
    args=None,
    output=None,
    input=None,
    env=None,
    secret=None,
    resources=None,
    timeout=None,
    retry=None,
    step_name=None,
    image_pull_policy=None,
    pool=None,
    enable_ulogfs=True,
    daemon=False,
    volume_mounts=None,
    working_dir=None,
    node_selector=None,
):
    """
    Generate an Argo container template.  For example, the template whalesay
    in https://github.com/argoproj/argo/tree/master/examples#hello-world.
    :param image:
    :param command:
    :param args:
    :param output: output artifact for container output
    :param input: input artifact for container input
    :param env: environment variables
    :param secret:
    :param resources: CPU or memory resource config dict
    :param timeout: in seconds
    :param retry: retry policy
    :param step_name: used for annotating the step
    :param image_pull_policy:
    :param pool:
    :param enable_ulogfs:
    :param daemon:
    :return: outputs of the container step
    """
    func_name, caller_line = utils.invocation_location()
    func_name = (utils.argo_safe_name(step_name)
                 if step_name is not None else func_name)

    if states.workflow.get_template(func_name) is None:
        # Generate the inputs parameter for the template
        if input is None:
            input = []

        if args is None and states._outputs_tmp is not None:
            args = []

        if args is not None:
            if not isinstance(args, list):
                args = [args]

            # Handle the case where args is a list of lists,
            # e.g., [[Output, ]]
            if (isinstance(args, list) and len(args) > 0
                    and isinstance(args[0], list) and len(args[0]) > 0
                    and isinstance(args[0][0], Output)):
                args = args[0]

            if states._outputs_tmp is not None:
                args.extend(states._outputs_tmp)

            # In case the args include output artifacts,
            # place them into the inputs as well
            for arg in args:
                if isinstance(arg, (OutputArtifact, OutputJob)):
                    input.append(arg)

        # Automatically append emptyDir volume and volume mount to work with
        # Argo k8sapi executor.
        # More info: https://argoproj.github.io/argo/empty-dir/
        if output is not None:
            if not isinstance(output, list):
                output = [output]
            if volume_mounts is None:
                volume_mounts = []
            mounted_path = []
            for i, out in enumerate(output):
                if "/tmp" in out.path:
                    raise ValueError("Mounting to /tmp is not supported")
                path_to_mount = os.path.dirname(out.path)
                # Avoid duplicate mount paths
                if path_to_mount not in mounted_path:
                    volume_mounts.append(
                        VolumeMount("couler-out-dir-%s" % i, path_to_mount))
                    mounted_path.append(path_to_mount)

        # Generate container and template
        template = Container(
            name=func_name,
            image=image,
            command=command,
            args=args,
            env=env,
            secret=states.get_secret(secret),
            resources=resources,
            image_pull_policy=image_pull_policy,
            retry=retry,
            timeout=timeout,
            output=output,
            input=input,
            pool=pool,
            enable_ulogfs=enable_ulogfs,
            daemon=daemon,
            volume_mounts=volume_mounts,
            working_dir=working_dir,
            node_selector=node_selector,
        )
        states.workflow.add_template(template)

    step_name = step_update_utils.update_step(func_name, args, step_name,
                                              caller_line)

    # TODO: need to switch to use field `output` directly
    step_templ = states.workflow.get_template(func_name)
    _output = step_templ.to_dict().get("outputs", None)
    _input = step_templ.to_dict().get("inputs", None)

    rets = _container_output(step_name, func_name, _output)
    states._steps_outputs[step_name] = rets

    pb_step = proto_repr.step_repr(  # noqa: F841
        step_name=step_name,
        tmpl_name=func_name,
        image=image,
        command=command,
        source=None,
        script_output=None,
        args=args,
        input=_input,
        output=_output,
    )

    return rets
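
A minimal usage sketch mirroring Argo's whalesay hello world, assuming couler is imported as couler.argo.

import couler.argo as couler

couler.run_container(
    image="docker/whalesay:latest",
    command=["cowsay"],
    args=["hello world"],
)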
Example No. 9
def run_job(
    manifest,
    success_condition,
    failure_condition,
    timeout=None,
    retry=None,
    step_name=None,
    pool=None,
    env=None,
    set_owner_reference=True,
):
    """
    Create a k8s job. For example, the pi-tmpl template in
    https://github.com/argoproj/argo/blob/master/examples/k8s-jobs.yaml
    :param manifest: YAML specification of the job to be created.
    :param success_condition: expression for verifying job success.
    :param failure_condition: expression for verifying job failure.
    :param timeout: To limit the elapsed time for a workflow in seconds.
    :param step_name: is only used while developing functions of step zoo.
    :param env: environmental parameter with a dict types, e.g., {"OS_ENV_1": "OS_ENV_value"}  # noqa: E501
    :param set_owner_reference: Whether to set the workflow as the job's owner reference.
        If `True`, the job will be deleted once the workflow is deleted.
    :return: output
    """
    if manifest is None:
        raise ValueError("Input manifest can not be null")

    func_name, caller_line = utils.invocation_location()
    func_name = (utils.argo_safe_name(step_name)
                 if step_name is not None else func_name)

    args = []
    if states.workflow.get_template(func_name) is None:
        if states._outputs_tmp is not None and env is not None:
            env["inferred_outputs"] = states._outputs_tmp

        # Generate the inputs for the manifest template
        envs, parameters, args = utils.generate_parameters_run_job(env)

        # update the env
        if env is not None:
            manifest_dict = yaml.safe_load(manifest)
            manifest_dict["spec"]["env"] = envs

            # TODO: this is only here to make the test cases pass;
            # it should be fixed in a better way
            if ("labels" in manifest_dict["metadata"] and "argo.step.owner"
                    in manifest_dict["metadata"]["labels"]):
                manifest_dict["metadata"]["labels"][
                    "argo.step.owner"] = "'{{pod.name}}'"

            manifest = pyaml.dump(manifest_dict)

        template = Job(
            name=func_name,
            args=args,
            action="create",
            manifest=manifest,
            set_owner_reference=set_owner_reference,
            success_condition=success_condition,
            failure_condition=failure_condition,
            timeout=timeout,
            retry=retry,
            pool=pool,
        )
        states.workflow.add_template(template)

    step_name = step_update_utils.update_step(func_name, args, step_name,
                                              caller_line)

    # return job name and job uid for reference
    rets = _job_output(step_name, func_name)
    states._steps_outputs[step_name] = rets

    pb_step = proto_repr.step_repr(  # noqa: F841
        step_name=step_name,
        tmpl_name=func_name,
        image=None,
        source=None,
        script_output=None,
        input=None,
        output=rets,
        manifest=manifest,
        success_cond=success_condition,
        failure_cond=failure_condition,
    )

    return rets
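
A usage sketch of run_job with a plain Kubernetes Job manifest; the manifest contents and conditions are illustrative.

import couler.argo as couler

job_manifest = """
apiVersion: batch/v1
kind: Job
metadata:
  generateName: pi-job-
spec:
  template:
    spec:
      containers:
      - name: pi
        image: perl
        command: ["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"]
      restartPolicy: Never
  backoffLimit: 4
"""

couler.run_job(
    manifest=job_manifest,
    success_condition="status.succeeded > 0",
    failure_condition="status.failed > 3",
)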
Example No. 10
def train(
    image=None,
    command="",
    secret=None,
    master_image=None,
    master_resources=None,
    master_restart_policy="Never",
    master_command=None,
    num_workers=0,
    worker_image=None,
    worker_resources=None,
    worker_restart_policy="Never",
    worker_command=None,
    clean_pod_policy="Running",
    timeout=None,
):
    name = "pytorch-train-%s" % str(uuid.uuid4())
    success_condition = "status.pytorchReplicaStatuses.Worker.succeeded > 0"
    failure_condition = "status.pytorchReplicaStatuses.Worker.failed > 0"

    manifest = copy.deepcopy(manifest_template)
    manifest["metadata"].update({"name": name})
    manifest["spec"].update({"cleanPodPolicy": clean_pod_policy})

    master_image = master_image if master_image else image
    master_command = master_command if master_command else command

    chief_pod = _generate_pod_spec(
        pod_template,
        container_template,
        allowed_pod_types=pod_types,
        pod_type="Master",
        image=master_image,
        replicas=1,
        secret=secret,
        command=master_command,
        resources=master_resources,
        restart_policy=master_restart_policy,
    )

    manifest["spec"]["pytorchReplicaSpecs"].update({"Master": chief_pod})

    if num_workers > 0:
        worker_image = worker_image if worker_image else image
        worker_command = worker_command if worker_command else command

        worker_pod = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="Worker",
            image=worker_image,
            replicas=num_workers,
            secret=secret,
            command=worker_command,
            resources=worker_resources,
            restart_policy=worker_restart_policy,
        )

        manifest["spec"]["pytorchReplicaSpecs"].update({"Worker": worker_pod})

    step_name, _ = utils.invocation_location()

    couler.run_job(
        manifest=pyaml.dump(manifest),
        success_condition=success_condition,
        failure_condition=failure_condition,
        step_name=step_name,
        timeout=timeout,
    )
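
A hedged usage sketch; the module path couler.steps.pytorch, image, and command are assumptions for illustration.

import couler.steps.pytorch as pytorch_step  # assumed module path

# Submit a PyTorchJob with one master (always created above) and
# two workers; image and command are illustrative.
pytorch_step.train(
    image="pytorch/pytorch:latest",
    command="python /app/train.py",
    num_workers=2,
)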
Example No. 11
def run_container(
    image,
    command=None,
    args=None,
    output=None,
    input=None,
    env=None,
    secret=None,
    resources=None,
    timeout=None,
    retry=None,
    step_name=None,
    image_pull_policy=None,
    pool=None,
    enable_ulogfs=True,
    daemon=False,
    volume_mounts=None,
    working_dir=None,
    node_selector=None,
):
    """
    Generate an Argo container template.  For example, the template whalesay
    in https://github.com/argoproj/argo/tree/master/examples#hello-world.
    :param image:
    :param command:
    :param args:
    :param output: output artifact for container output
    :param input: input artifact for container input
    :param env: environment variables
    :param secret:
    :param resources: CPU or memory resource config dict
    :param timeout: in seconds
    :param retry: retry policy
    :param step_name: used for annotating the step
    :param image_pull_policy:
    :param pool:
    :param enable_ulogfs:
    :param daemon:
    :return: outputs of the container step
    """
    func_name, caller_line = utils.invocation_location()
    func_name = (utils.argo_safe_name(step_name)
                 if step_name is not None else func_name)

    if states.workflow.get_template(func_name) is None:
        # Generate the inputs parameter for the template
        if input is None:
            input = []

        if args is None and states._outputs_tmp is not None:
            args = []

        if args is not None:
            if not isinstance(args, list):
                args = [args]

            # Handle the case where args is a list of lists,
            # e.g., [[Output, ]]
            if (isinstance(args, list) and len(args) > 0
                    and isinstance(args[0], list) and len(args[0]) > 0
                    and isinstance(args[0][0], Output)):
                args = args[0]

            if states._outputs_tmp is not None:
                args.extend(states._outputs_tmp)

            # In case the args include output artifacts,
            # place them into the inputs as well
            for arg in args:
                if isinstance(arg, (OutputArtifact, OutputJob)):
                    input.append(arg)

        # Generate container and template
        template = Container(
            name=func_name,
            image=image,
            command=command,
            args=args,
            env=env,
            secret=states.get_secret(secret),
            resources=resources,
            image_pull_policy=image_pull_policy,
            retry=retry,
            timeout=timeout,
            output=output,
            input=input,
            pool=pool,
            enable_ulogfs=enable_ulogfs,
            daemon=daemon,
            volume_mounts=volume_mounts,
            working_dir=working_dir,
            node_selector=node_selector,
        )
        states.workflow.add_template(template)

    step_name = step_update_utils.update_step(func_name, args, step_name,
                                              caller_line)

    # TODO: need to switch to use field `output` directly
    _output = (states.workflow.get_template(func_name).to_dict().get(
        "outputs", None))

    rets = _container_output(step_name, func_name, _output)
    states._steps_outputs[step_name] = rets
    return rets