def run_canned_step(
    name, args, inputs=None, outputs=None, step_name=None, cache=None
):
    """Record a step that is backed by a canned (pre-defined) step.

    The step is registered through ``step_update_utils.update_step`` and,
    when ``proto_repr`` is truthy, mirrored into the protobuf representation.

    :param name: forwarded to ``proto_repr.step_repr`` as
        ``canned_step_name``.
    :param args: forwarded to ``update_step`` and, as ``canned_step_args``,
        to ``proto_repr.step_repr``.
    :param inputs: forwarded to ``proto_repr.step_repr`` as ``input``.
    :param outputs: forwarded to ``proto_repr.step_repr`` as ``output``.
    :param step_name: optional explicit name; when given, its
        ``utils.argo_safe_name`` form replaces the name derived from the
        call site.
    :param cache: forwarded unchanged to ``proto_repr.step_repr``.
    :return: whatever ``proto_repr.step_repr`` returns, or ``None`` when
        ``proto_repr`` is falsy.
    """
    # NOTE(review): invocation_location() appears to describe the call site,
    # so it is kept directly in this function body - confirm before moving it.
    inferred_name, caller_line = utils.invocation_location()
    if step_name is not None:
        func_name = utils.argo_safe_name(step_name)
    else:
        func_name = inferred_name
    step_name = step_update_utils.update_step(
        func_name, args, step_name, caller_line
    )

    # Outputs collected in states._outputs_tmp become the template arguments.
    pending_outputs = states._outputs_tmp
    tmpl_args = [] if pending_outputs is None else list(pending_outputs)

    if not proto_repr:
        return None

    pb_step = proto_repr.step_repr(
        input=inputs,
        output=outputs,
        canned_step_name=name,
        canned_step_args=args,
        step_name=step_name,
        tmpl_name=step_name + "-tmpl",
        args=tmpl_args,
        cache=cache,
    )
    proto_repr.add_deps_to_step(step_name)
    return pb_step
def run_script(
    image,
    command=None,
    source=None,
    env=None,
    resources=None,
    secret=None,
    timeout=None,
    retry=None,
    step_name=None,
    image_pull_policy=None,
    pool=None,
    daemon=False,
):
    """Generate an Argo script template.

    For example,
    https://github.com/argoproj/argo/tree/master/examples#scripts--results.
    Step_name is only used for annotating step while developing step zoo.

    A ``Script`` template is created and added to the workflow only when no
    template with the resolved name exists yet; the step itself is recorded
    on every call.

    :param image: forwarded to ``Script`` and ``proto_repr.step_repr``.
    :param command: forwarded to ``Script`` and ``proto_repr.step_repr``.
    :param source: script source; must not be ``None`` when the template
        has to be created.
    :param env: forwarded to ``Script``.
    :param resources: forwarded to ``Script``.
    :param secret: resolved through ``states.get_secret`` before being
        handed to ``Script``.
    :param timeout: forwarded to ``Script``.
    :param retry: forwarded to ``Script``.
    :param step_name: optional explicit name; its ``utils.argo_safe_name``
        form is used as the template name when given.
    :param image_pull_policy: forwarded to ``Script``.
    :param pool: forwarded to ``Script``.
    :param daemon: forwarded to ``Script``.
    :return: the value produced by ``_script_output`` for this step.
    :raises ValueError: if the template must be created and ``source`` is
        ``None``.
    """
    # NOTE(review): invocation_location() appears to describe the call site,
    # so it is kept directly in this function body - confirm before moving it.
    inferred_name, caller_line = utils.invocation_location()
    if step_name is None:
        func_name = inferred_name
    else:
        func_name = utils.argo_safe_name(step_name)

    template_missing = states.workflow.get_template(func_name) is None
    if template_missing:
        if source is None:
            raise ValueError("Input script can not be null")
        states.workflow.add_template(
            Script(
                name=func_name,
                image=image,
                command=command,
                source=source,
                env=env,
                secret=states.get_secret(secret),
                resources=resources,
                timeout=timeout,
                retry=retry,
                image_pull_policy=image_pull_policy,
                pool=pool,
                daemon=daemon,
            )
        )

    step_name = step_update_utils.update_step(
        func_name, args=None, step_name=step_name, caller_line=caller_line
    )
    rets = _script_output(step_name, func_name)
    states._steps_outputs[step_name] = rets

    # TODO(typhoonzero): return pb_step when using a couler server.
    # The call is kept even though its result is currently discarded.
    proto_repr.step_repr(
        step_name=step_name,
        tmpl_name=func_name,
        image=image,
        command=command,
        source=source,
        script_output=rets,
    )
    return rets
def __init__(self, path, type=None, is_global=False):
    """Store the output's path and flags and derive its id.

    :param path: stored as ``self.path``.
    :param type: stored as ``self.type``; not used for now, only
        "valueFrom" is currently supported.
    :param is_global: stored as ``self.is_global``.
    """
    # TODO (terrytangyuan): This seems hacky.
    # If line number changes, we need to update tests as well.
    _caller_name, line_number = utils.invocation_location()
    # The id is built from the line number reported for the caller.
    self.id = "output-id-%s" % line_number
    self.path = path
    # TODO (terrytangyuan): this is not used for now and we currently
    # only support "valueFrom".
    self.type = type
    self.is_global = is_global
def concurrent(function_list, subtasks=False):
    """
    Start different jobs at the same time

    subtasks: each function F of function_list contains multiple steps.
    Then, for each F, we create a sub-steps template.

    The bookkeeping flags on ``states`` (``_run_concurrent_lock``,
    ``_concurrent_func_id`` and, for sub-tasks, ``_sub_steps``) are restored
    even when one of the functions raises or an entry is not callable, so a
    failed call does not leave later steps in "concurrent" mode.

    :param function_list: list of zero-argument callables to run.
    :param subtasks: when exactly ``True``, the steps recorded while each
        callable runs are collected into a ``Steps`` template named
        ``concurrent-task-<id>``.
    :return: list with the return value of each callable, in order.
    :raises SyntaxError: if ``function_list`` is not a list.
    :raises TypeError: if an entry is not callable; entries before it have
        already been executed at that point.
    """
    if not isinstance(function_list, list):
        raise SyntaxError("require input functions as list")

    # NOTE(review): invocation_location() and the user callables appear to
    # depend on the call stack, so they are invoked directly in this body
    # (no helper, no comprehension) - confirm before refactoring.
    _, con_caller_line = utils.invocation_location()
    states._concurrent_func_line = con_caller_line
    states._run_concurrent_lock = True

    function_rets = []
    try:
        for function in function_list:
            # In case different parallel steps use the same function name
            states._concurrent_func_id = states._concurrent_func_id + 1
            if not callable(function):
                raise TypeError("require loop over a function to run")

            if subtasks is True:
                # 1. generate the sub-steps template
                # 2. for each step in F, update the sub_steps template
                # 3. append the steps into the template
                # 4. for F itself, update the main control flow step
                states._sub_steps = OrderedDict()
                tmp_concurrent_func_id = states._concurrent_func_id
                states._run_concurrent_lock = False
                try:
                    ret = function()
                    # function() may itself change the id, so restore it.
                    states._concurrent_func_id = tmp_concurrent_func_id
                    func_name = (
                        "concurrent-task-%s" % states._concurrent_func_id
                    )
                    template = Steps(
                        name=func_name,
                        steps=list(states._sub_steps.values()),
                    )
                    states.workflow.add_template(template)
                finally:
                    # Always stop collecting sub-steps, even on failure.
                    states._sub_steps = None
                # TODO: add the args for the sub task
                states._run_concurrent_lock = True
                _update_steps(
                    "concurrent_func_name",
                    con_caller_line,
                    args=None,
                    template_name=func_name,
                )
            else:
                ret = function()

            function_rets.append(ret)
    finally:
        # Leave concurrent mode whether or not the loop finished cleanly.
        states._run_concurrent_lock = False
        states._concurrent_func_id = 0

    return function_rets
def dag(dependency_graph):
    """
    Generate a DAG of Argo YAML
    Note: couler.set_dependencies() is more preferable.
    https://github.com/argoproj/argo/blob/master/examples/dag-coinflip.yaml

    :param dependency_graph: list of edge lists; every element of an edge
        list must be a plain function, and each one is called in order.
        Entries of ``dependency_graph`` that are not lists are skipped.
    :raises SyntaxError: if ``dependency_graph`` is not a list.
    :raises TypeError: if an element of an edge list is not a function
        (``types.FunctionType``).
    """
    if not isinstance(dependency_graph, list):
        raise SyntaxError("require input as list")

    states.workflow.enable_dag_mode()

    # NOTE(review): invocation_location() and the node functions appear to
    # depend on the call stack, so they are invoked directly in this body.
    _, call_line = utils.invocation_location()
    states._dag_caller_line = call_line

    for edges in dependency_graph:
        # Each edge list starts without an upstream task.
        states._upstream_dag_task = None
        if not isinstance(edges, list):
            continue
        for node in edges:
            if not isinstance(node, types.FunctionType):
                raise TypeError("require loop over a function to run")
            node()
def train(
    image=None,
    command="",
    secret=None,
    no_chief=True,
    chief_image=None,
    chief_resources=None,
    chief_restart_policy="Never",
    chief_command=None,
    num_ps=0,
    ps_image=None,
    ps_resources=None,
    ps_restart_policy="Never",
    ps_command=None,
    num_workers=0,
    worker_image=None,
    worker_resources=None,
    worker_restart_policy="Never",
    worker_command=None,
    clean_pod_policy="Running",
    timeout=None,
):
    """Fill in a ``tfReplicaSpecs`` manifest and submit it as a job step.

    A copy of ``manifest_template`` receives a unique ``tf-train-<uuid>``
    name plus one pod spec per enabled role, and is then passed (dumped
    with ``pyaml``) to ``couler.run_job``.

    :param image: fallback image for any role without its own image.
    :param command: fallback command for any role without its own command.
    :param secret: forwarded to every generated pod spec.
    :param no_chief: when falsy, a single-replica "Chief" role is added.
    :param chief_image: image for the chief; falls back to ``image``.
    :param chief_resources: resources for the chief pod spec.
    :param chief_restart_policy: restart policy for the chief pod spec.
    :param chief_command: command for the chief; falls back to ``command``.
    :param num_ps: number of "PS" replicas; the role is added when > 0.
    :param ps_image: image for PS; falls back to ``image``.
    :param ps_resources: resources for the PS pod spec.
    :param ps_restart_policy: restart policy for the PS pod spec.
    :param ps_command: command for PS; falls back to ``command``.
    :param num_workers: number of "Worker" replicas; the role is added
        when > 0. Also used in the success condition.
    :param worker_image: image for workers; falls back to ``image``.
    :param worker_resources: resources for the worker pod spec.
    :param worker_restart_policy: restart policy for the worker pod spec.
    :param worker_command: command for workers; falls back to ``command``.
    :param clean_pod_policy: stored as ``spec.cleanPodPolicy``.
    :param timeout: forwarded to ``couler.run_job``.
    """
    job_name = "tf-train-" + str(uuid.uuid4())
    # The job succeeds once every requested worker has succeeded.
    success_condition = (
        "status.replicaStatuses.Worker.succeeded == %s" % num_workers
    )
    failure_condition = "status.replicaStatuses.Worker.failed > 0"

    manifest = copy.deepcopy(manifest_template)
    manifest["metadata"]["name"] = job_name
    manifest["spec"]["cleanPodPolicy"] = clean_pod_policy

    if not no_chief:
        manifest["spec"]["tfReplicaSpecs"]["Chief"] = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="Chief",
            image=chief_image or image,
            replicas=1,
            secret=secret,
            command=chief_command or command,
            resources=chief_resources,
            restart_policy=chief_restart_policy,
        )

    if num_ps > 0:
        manifest["spec"]["tfReplicaSpecs"]["PS"] = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="PS",
            image=ps_image or image,
            replicas=num_ps,
            secret=secret,
            command=ps_command or command,
            resources=ps_resources,
            restart_policy=ps_restart_policy,
        )

    if num_workers > 0:
        manifest["spec"]["tfReplicaSpecs"]["Worker"] = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="Worker",
            image=worker_image or image,
            replicas=num_workers,
            secret=secret,
            command=worker_command or command,
            resources=worker_resources,
            restart_policy=worker_restart_policy,
        )

    # NOTE(review): invocation_location() and run_job() appear to depend on
    # the call stack, so both are called directly from this body.
    step_name, _ = utils.invocation_location()
    couler.run_job(
        manifest=pyaml.dump(manifest),
        success_condition=success_condition,
        failure_condition=failure_condition,
        step_name=step_name,
        timeout=timeout,
    )
def inner_func():
    """Assert that the reported caller name is "test-invocation-location"."""
    reported_name, _line = utils.invocation_location()
    self.assertEqual("test-invocation-location", reported_name)
def run_container(
    image,
    command=None,
    args=None,
    output=None,
    input=None,
    env=None,
    secret=None,
    resources=None,
    timeout=None,
    retry=None,
    step_name=None,
    image_pull_policy=None,
    pool=None,
    enable_ulogfs=True,
    daemon=False,
    volume_mounts=None,
    working_dir=None,
    node_selector=None,
):
    """
    Generate an Argo container template.  For example, the template whalesay
    in https://github.com/argoproj/argo/tree/master/examples#hello-world.

    The template is built and added to the workflow only when no template
    with the resolved name exists yet; the step is recorded on every call.
    Caller-supplied ``args``, ``input`` and ``volume_mounts`` lists are
    copied before anything is appended to them, so they are never modified.

    :param image: container image.
    :param command: container command.
    :param args: a single argument, a list of arguments, or a list wrapping
        a list of ``Output`` objects (``[[Output, ...]]``), which is
        unwrapped.  Entries of ``states._outputs_tmp`` are appended.
    :param output: output artifact for container output (one or a list)
    :param input: input artifact for container input
    :param env: environmental variable
    :param secret: resolved through ``states.get_secret``.
    :param resources: CPU or memory resource config dict
    :param timeout: in seconds
    :param retry: retry policy
    :param step_name: used for annotating step .
    :param image_pull_policy: forwarded to ``Container``.
    :param pool: forwarded to ``Container``.
    :param enable_ulogfs: forwarded to ``Container``.
    :param daemon: forwarded to ``Container``.
    :param volume_mounts: extra volume mounts; one ``VolumeMount`` per
        distinct output directory is added when ``output`` is given.
    :param working_dir: forwarded to ``Container``.
    :param node_selector: forwarded to ``Container``.
    :return: the value produced by ``_container_output`` for this step.
    :raises ValueError: if an output path contains "/tmp".
    """
    # NOTE(review): invocation_location() appears to describe the call site,
    # so it is kept directly in this function body - confirm before moving it.
    func_name, caller_line = utils.invocation_location()
    if step_name is not None:
        func_name = utils.argo_safe_name(step_name)

    if states.workflow.get_template(func_name) is None:
        # Generate the inputs parameter for the template.
        # Copy list arguments so the caller's objects are left untouched.
        if input is None:
            input = []
        elif isinstance(input, list):
            input = list(input)

        if args is None and states._outputs_tmp is not None:
            args = []

        if args is not None:
            if not isinstance(args, list):
                args = [args]

            # Handle case where args is a list of list type
            # For example, [[Output, ]]
            if (
                len(args) > 0
                and isinstance(args[0], list)
                and len(args[0]) > 0
                and isinstance(args[0][0], Output)
            ):
                args = args[0]

            # Private copy: the list is extended below.
            args = list(args)
            if states._outputs_tmp is not None:
                args.extend(states._outputs_tmp)

            # In case, the args include output artifact
            # Place output artifact into the input
            for arg in args:
                if isinstance(arg, (OutputArtifact, OutputJob)):
                    input.append(arg)

        # Automatically append emptyDir volume and volume mount to work with
        # Argo k8sapi executor.
        # More info: https://argoproj.github.io/argo/empty-dir/
        if output is not None:
            if not isinstance(output, list):
                output = [output]

            if volume_mounts is None:
                volume_mounts = []
            elif isinstance(volume_mounts, list):
                volume_mounts = list(volume_mounts)

            mounted_paths = set()
            for i, out in enumerate(output):
                if "/tmp" in out.path:
                    raise ValueError("Mounting to /tmp is not supported")
                path_to_mount = os.path.dirname(out.path)
                # Avoid duplicate mount paths
                if path_to_mount not in mounted_paths:
                    volume_mounts.append(
                        VolumeMount("couler-out-dir-%s" % i, path_to_mount)
                    )
                    mounted_paths.add(path_to_mount)

        # Generate container and template
        template = Container(
            name=func_name,
            image=image,
            command=command,
            args=args,
            env=env,
            secret=states.get_secret(secret),
            resources=resources,
            image_pull_policy=image_pull_policy,
            retry=retry,
            timeout=timeout,
            output=output,
            input=input,
            pool=pool,
            enable_ulogfs=enable_ulogfs,
            daemon=daemon,
            volume_mounts=volume_mounts,
            working_dir=working_dir,
            node_selector=node_selector,
        )
        states.workflow.add_template(template)

    step_name = step_update_utils.update_step(
        func_name, args, step_name, caller_line
    )

    # TODO: need to switch to use field `output` directly
    # Serialize the template once and read both sections from that dict
    # (presumably to_dict() has no side effects - verify).
    template_dict = states.workflow.get_template(func_name).to_dict()
    _output = template_dict.get("outputs", None)
    _input = template_dict.get("inputs", None)

    rets = _container_output(step_name, func_name, _output)
    states._steps_outputs[step_name] = rets

    # The result is not used yet; the call is kept as in the original.
    proto_repr.step_repr(
        step_name=step_name,
        tmpl_name=func_name,
        image=image,
        command=command,
        source=None,
        script_output=None,
        args=args,
        input=_input,
        output=_output,
    )
    return rets
def run_job(
    manifest,
    success_condition,
    failure_condition,
    timeout=None,
    retry=None,
    step_name=None,
    pool=None,
    env=None,
    set_owner_reference=True,
):
    """
    Create a k8s job. For example, the pi-tmpl template in
    https://github.com/argoproj/argo/blob/master/examples/k8s-jobs.yaml

    The ``Job`` template is created only when no template with the resolved
    name exists yet; the step is recorded on every call.  The caller's
    ``env`` dict is copied before ``inferred_outputs`` is added to it.

    :param manifest: YAML specification of the job to be created.
    :param success_condition: expression for verifying job success.
    :param failure_condition: expression for verifying job failure.
    :param timeout: To limit the elapsed time for a workflow in seconds.
    :param retry: forwarded to ``Job``.
    :param step_name: is only used while developing functions of step zoo.
    :param pool: forwarded to ``Job``.
    :param env: environmental parameter with a dict types, e.g., {"OS_ENV_1": "OS_ENV_value"}  # noqa: E501
    :param set_owner_reference: Whether to set the workflow as
        the job's owner reference. If `True`, the job will be deleted
        once the workflow is deleted.
    :return: output
    :raises ValueError: if ``manifest`` is ``None``.
    """
    if manifest is None:
        raise ValueError("Input manifest can not be null")

    # NOTE(review): invocation_location() appears to describe the call site,
    # so it is kept directly in this function body - confirm before moving it.
    func_name, caller_line = utils.invocation_location()
    if step_name is not None:
        func_name = utils.argo_safe_name(step_name)

    args = []
    if states.workflow.get_template(func_name) is None:
        if states._outputs_tmp is not None and env is not None:
            # Copy first so the caller's dict does not gain an extra key.
            env = env.copy()
            env["inferred_outputs"] = states._outputs_tmp

        # Generate the inputs for the manifest template
        # (the second returned value is not needed here).
        envs, _, args = utils.generate_parameters_run_job(env)

        # update the env
        if env is not None:
            manifest_dict = yaml.safe_load(manifest)
            manifest_dict["spec"]["env"] = envs

            # TODO this is used to pass the test cases,
            # should be fixed in a better way
            # `labels: null` in the manifest is treated like a missing key.
            labels = manifest_dict["metadata"].get("labels")
            if labels and "argo.step.owner" in labels:
                labels["argo.step.owner"] = "'{{pod.name}}'"

            manifest = pyaml.dump(manifest_dict)

        template = Job(
            name=func_name,
            args=args,
            action="create",
            manifest=manifest,
            set_owner_reference=set_owner_reference,
            success_condition=success_condition,
            failure_condition=failure_condition,
            timeout=timeout,
            retry=retry,
            pool=pool,
        )
        states.workflow.add_template(template)

    step_name = step_update_utils.update_step(
        func_name, args, step_name, caller_line
    )

    # return job name and job uid for reference
    rets = _job_output(step_name, func_name)
    states._steps_outputs[step_name] = rets

    # The result is not used yet; the call is kept as in the original.
    proto_repr.step_repr(
        step_name=step_name,
        tmpl_name=func_name,
        image=None,
        source=None,
        script_output=None,
        input=None,
        output=rets,
        manifest=manifest,
        success_cond=success_condition,
        failure_cond=failure_condition,
    )
    return rets
def train(
    image=None,
    command="",
    secret=None,
    master_image=None,
    master_resources=None,
    master_restart_policy="Never",
    master_command=None,
    num_workers=0,
    worker_image=None,
    worker_resources=None,
    worker_restart_policy="Never",
    worker_command=None,
    clean_pod_policy="Running",
    timeout=None,
):
    """Fill in a ``pytorchReplicaSpecs`` manifest and submit it as a job step.

    A copy of ``manifest_template`` receives a unique
    ``pytorch-train-<uuid>`` name, a single-replica "Master" pod spec and,
    when requested, a "Worker" pod spec; it is then passed (dumped with
    ``pyaml``) to ``couler.run_job``.

    :param image: fallback image for any role without its own image.
    :param command: fallback command for any role without its own command.
    :param secret: forwarded to every generated pod spec.
    :param master_image: image for the master; falls back to ``image``.
    :param master_resources: resources for the master pod spec.
    :param master_restart_policy: restart policy for the master pod spec.
    :param master_command: command for the master; falls back to
        ``command``.
    :param num_workers: number of "Worker" replicas; the role is added
        when > 0.
    :param worker_image: image for workers; falls back to ``image``.
    :param worker_resources: resources for the worker pod spec.
    :param worker_restart_policy: restart policy for the worker pod spec.
    :param worker_command: command for workers; falls back to ``command``.
    :param clean_pod_policy: stored as ``spec.cleanPodPolicy``.
    :param timeout: forwarded to ``couler.run_job``.
    """
    job_name = "pytorch-train-" + str(uuid.uuid4())
    # NOTE(review): both conditions look at the Worker role even when
    # num_workers is 0 and no Worker spec is added - confirm this is intended.
    success_condition = "status.pytorchReplicaStatuses.Worker.succeeded > 0"
    failure_condition = "status.pytorchReplicaStatuses.Worker.failed > 0"

    manifest = copy.deepcopy(manifest_template)
    manifest["metadata"]["name"] = job_name
    manifest["spec"]["cleanPodPolicy"] = clean_pod_policy

    # The master role is always present, with exactly one replica.
    manifest["spec"]["pytorchReplicaSpecs"]["Master"] = _generate_pod_spec(
        pod_template,
        container_template,
        allowed_pod_types=pod_types,
        pod_type="Master",
        image=master_image or image,
        replicas=1,
        secret=secret,
        command=master_command or command,
        resources=master_resources,
        restart_policy=master_restart_policy,
    )

    if num_workers > 0:
        manifest["spec"]["pytorchReplicaSpecs"]["Worker"] = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="Worker",
            image=worker_image or image,
            replicas=num_workers,
            secret=secret,
            command=worker_command or command,
            resources=worker_resources,
            restart_policy=worker_restart_policy,
        )

    # NOTE(review): invocation_location() and run_job() appear to depend on
    # the call stack, so both are called directly from this body.
    step_name, _ = utils.invocation_location()
    couler.run_job(
        manifest=pyaml.dump(manifest),
        success_condition=success_condition,
        failure_condition=failure_condition,
        step_name=step_name,
        timeout=timeout,
    )
def run_container(
    image,
    command=None,
    args=None,
    output=None,
    input=None,
    env=None,
    secret=None,
    resources=None,
    timeout=None,
    retry=None,
    step_name=None,
    image_pull_policy=None,
    pool=None,
    enable_ulogfs=True,
    daemon=False,
    volume_mounts=None,
    working_dir=None,
    node_selector=None,
):
    """
    Generate an Argo container template.  For example, the template whalesay
    in https://github.com/argoproj/argo/tree/master/examples#hello-world.

    The template is built and added to the workflow only when no template
    with the resolved name exists yet; the step is recorded on every call.
    Caller-supplied ``args`` and ``input`` lists are copied before anything
    is appended to them, so they are never modified.

    :param image: container image.
    :param command: container command.
    :param args: a single argument, a list of arguments, or a list wrapping
        a list of ``Output`` objects (``[[Output, ...]]``), which is
        unwrapped.  Entries of ``states._outputs_tmp`` are appended.
    :param output: output artifact for container output
    :param input: input artifact for container input
    :param env: environmental variable
    :param secret: resolved through ``states.get_secret``.
    :param resources: CPU or memory resource config dict
    :param timeout: in seconds
    :param retry: retry policy
    :param step_name: used for annotating step .
    :param image_pull_policy: forwarded to ``Container``.
    :param pool: forwarded to ``Container``.
    :param enable_ulogfs: forwarded to ``Container``.
    :param daemon: forwarded to ``Container``.
    :param volume_mounts: forwarded to ``Container``.
    :param working_dir: forwarded to ``Container``.
    :param node_selector: forwarded to ``Container``.
    :return: the value produced by ``_container_output`` for this step.
    """
    # NOTE(review): invocation_location() appears to describe the call site,
    # so it is kept directly in this function body - confirm before moving it.
    func_name, caller_line = utils.invocation_location()
    if step_name is not None:
        func_name = utils.argo_safe_name(step_name)

    if states.workflow.get_template(func_name) is None:
        # Generate the inputs parameter for the template.
        # Copy list arguments so the caller's objects are left untouched.
        if input is None:
            input = []
        elif isinstance(input, list):
            input = list(input)

        if args is None and states._outputs_tmp is not None:
            args = []

        if args is not None:
            if not isinstance(args, list):
                args = [args]

            # Handle case where args is a list of list type
            # For example, [[Output, ]]
            if (
                len(args) > 0
                and isinstance(args[0], list)
                and len(args[0]) > 0
                and isinstance(args[0][0], Output)
            ):
                args = args[0]

            # Private copy: the list is extended below.
            args = list(args)
            if states._outputs_tmp is not None:
                args.extend(states._outputs_tmp)

            # In case, the args include output artifact
            # Place output artifact into the input
            for arg in args:
                if isinstance(arg, (OutputArtifact, OutputJob)):
                    input.append(arg)

        # Generate container and template
        template = Container(
            name=func_name,
            image=image,
            command=command,
            args=args,
            env=env,
            secret=states.get_secret(secret),
            resources=resources,
            image_pull_policy=image_pull_policy,
            retry=retry,
            timeout=timeout,
            output=output,
            input=input,
            pool=pool,
            enable_ulogfs=enable_ulogfs,
            daemon=daemon,
            volume_mounts=volume_mounts,
            working_dir=working_dir,
            node_selector=node_selector,
        )
        states.workflow.add_template(template)

    step_name = step_update_utils.update_step(
        func_name, args, step_name, caller_line
    )

    # TODO: need to switch to use field `output` directly
    template_dict = states.workflow.get_template(func_name).to_dict()
    _output = template_dict.get("outputs", None)

    rets = _container_output(step_name, func_name, _output)
    states._steps_outputs[step_name] = rets
    return rets