def producer(step_name):
    """Run the shared job manifest as a workflow step.

    Uses ``manifest``, ``success_condition`` and ``failure_condition``
    from the enclosing scope; ``step_name`` names the resulting step.
    """
    couler.run_job(
        manifest=manifest,
        step_name=step_name,
        success_condition=success_condition,
        failure_condition=failure_condition,
    )
def test_run_job(self):
    """run_job should record manifest and conditions in the proto workflow."""
    success_condition = "status.succeeded > 0"
    failure_condition = "status.failed > 3"
    manifest = """apiVersion: batch/v1
kind: Job
metadata:
  generateName: rand-num-
spec:
  template:
    spec:
      containers:
      - name: rand
        image: python:3.6
        command: ["python random_num.py"]
"""
    couler.run_job(
        manifest=manifest,
        success_condition=success_condition,
        failure_condition=failure_condition,
        step_name="test_run_job",
    )
    proto_wf = get_default_proto_workflow()
    step = proto_wf.steps[0].steps[0]
    tmpl = proto_wf.templates[step.tmpl_name]
    # The resource spec of the generated step mirrors the inputs verbatim.
    self.assertEqual(step.resource_spec.manifest, manifest)
    self.assertEqual(step.resource_spec.success_condition, success_condition)
    self.assertEqual(step.resource_spec.failure_condition, failure_condition)
    # Resource templates expose three outputs; the first is the job name.
    self.assertEqual(len(tmpl.outputs), 3)
    self.assertEqual(tmpl.outputs[0].parameter.name, "job-name")
def consumer(step_name):
    """Run the shared job manifest as a workflow step with an extra env var.

    Uses ``manifest``, ``success_condition`` and ``failure_condition``
    from the enclosing scope; ``step_name`` names the resulting step.
    """
    couler.run_job(
        manifest=manifest,
        step_name=step_name,
        success_condition=success_condition,
        failure_condition=failure_condition,
        env={"k1": "v1"},
    )
def test_create_job(self):
    """run_job should reject a missing manifest and render a create-resource template."""
    success_condition = "status.succeeded > 0"
    failure_condition = "status.failed > 3"
    # A null manifest is invalid and must raise.
    with self.assertRaises(ValueError):
        couler.run_job(
            manifest=None,
            success_condition=success_condition,
            failure_condition=failure_condition,
        )
    # With a real manifest, the generated template carries it through.
    manifest = """
apiVersion: batch/v1
kind: Job
metadata:
  generateName: rand-num-
spec:
  template:
    spec:
      containers:
      - name: rand
        image: python:3.6
        command: ["python random_num.py"]
"""
    for set_owner in (True, False):
        couler.run_job(
            manifest=manifest,
            success_condition=success_condition,
            failure_condition=failure_condition,
            set_owner_reference=set_owner,
        )
        self.assertEqual(len(couler.workflow.templates), 1)
        tmpl = couler.workflow.get_template("test-create-job").to_dict()
        resource = tmpl["resource"]
        self.assertEqual(tmpl["name"], "test-create-job")
        self.assertEqual(resource["action"], "create")
        # setOwnerReference is serialized as the strings "true"/"false".
        self.assertEqual(
            resource["setOwnerReference"],
            "true" if set_owner else "false",
        )
        self.assertEqual(resource["successCondition"], success_condition)
        self.assertEqual(resource["failureCondition"], failure_condition)
        self.assertEqual(resource["manifest"], manifest)
        # Reset module state so each loop iteration starts clean.
        couler._cleanup()
def train(
    image=None,
    command="",
    secret=None,
    no_chief=True,
    chief_image=None,
    chief_resources=None,
    chief_restart_policy="Never",
    chief_command=None,
    num_ps=0,
    ps_image=None,
    ps_resources=None,
    ps_restart_policy="Never",
    ps_command=None,
    num_workers=0,
    worker_image=None,
    worker_resources=None,
    worker_restart_policy="Never",
    worker_command=None,
    clean_pod_policy="Running",
    timeout=None,
):
    """Launch a distributed TensorFlow training job (TFJob) as a workflow step.

    Per-role image/command fall back to the top-level ``image``/``command``
    when not given. The job succeeds once all workers succeed and fails as
    soon as any worker fails.
    """
    job_name = f"tf-train-{uuid.uuid4()}"
    success_condition = (
        f"status.replicaStatuses.Worker.succeeded == {num_workers}"
    )
    failure_condition = "status.replicaStatuses.Worker.failed > 0"

    manifest = copy.deepcopy(manifest_template)
    manifest["metadata"]["name"] = job_name
    manifest["spec"]["cleanPodPolicy"] = clean_pod_policy
    replica_specs = manifest["spec"]["tfReplicaSpecs"]

    if not no_chief:
        replica_specs["Chief"] = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="Chief",
            image=chief_image or image,
            replicas=1,
            secret=secret,
            command=chief_command or command,
            resources=chief_resources,
            restart_policy=chief_restart_policy,
        )

    if num_ps > 0:
        replica_specs["PS"] = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="PS",
            image=ps_image or image,
            replicas=num_ps,
            secret=secret,
            command=ps_command or command,
            resources=ps_resources,
            restart_policy=ps_restart_policy,
        )

    if num_workers > 0:
        replica_specs["Worker"] = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="Worker",
            image=worker_image or image,
            replicas=num_workers,
            secret=secret,
            command=worker_command or command,
            resources=worker_resources,
            restart_policy=worker_restart_policy,
        )

    # Name the step after the calling location, like other couler steps.
    step_name, _ = utils.invocation_location()
    couler.run_job(
        manifest=pyaml.dump(manifest),
        success_condition=success_condition,
        failure_condition=failure_condition,
        step_name=step_name,
        timeout=timeout,
    )
def run(
    raw_template,
    tuning_params,
    objective,
    success_condition=None,
    failure_condition=None,
    algorithm="random",
    parallel_trial_count=3,
    max_trial_count=12,
    max_failed_trial_count=3,
):
    """Launch a Katib hyperparameter-tuning experiment as a workflow step.

    Args:
        raw_template: The YAML string containing the Katib trial manifest.
        tuning_params: A list of hyperparameter dicts to be tuned; each has
            "name", "type" and a two-element integer "range".
        objective: The dictionary of objective for model training, with
            "type", "goal" and "metric_name" keys.
        success_condition: The condition to indicate when a Katib
            experiment succeeds.
        failure_condition: The condition to indicate when a Katib
            experiment fails.
        algorithm: The algorithm used in model training.
        parallel_trial_count: The number of parallel Katib trials.
        max_trial_count: The total number of Katib trials.
        max_failed_trial_count: The total number of failed Katib trials.
    """
    _validate_objective(objective)
    _validate_tuning_params(tuning_params)

    manifest = katib_manifest_template.format(
        parallel_trial_count=parallel_trial_count,
        max_trial_count=max_trial_count,
        max_failed_trial_count=max_failed_trial_count,
        obj_type=objective["type"],
        obj_goal=objective["goal"],
        obj_metric_name=objective["metric_name"],
        algorithm=algorithm,
        raw_template=raw_template,
    )
    wf_yaml = yaml.load(StringIO(manifest), Loader=yaml.FullLoader)

    # Build the Katib parameter list directly from the tuning params
    # (fix: replaced the non-idiomatic range(len(...)) index loop).
    # The range bounds are serialized as quoted integers, as Katib expects.
    wf_yaml["spec"]["parameters"] = [
        {
            "name": "--%s" % param["name"],
            "parameterType": param["type"],
            "feasibleSpace": {
                "min": '"%d"' % param["range"][0],
                "max": '"%d"' % param["range"][1],
            },
        }
        for param in tuning_params
    ]

    manifest = yaml.dump(wf_yaml, default_flow_style=False)
    couler.run_job(
        manifest=manifest,
        success_condition=success_condition,
        failure_condition=failure_condition,
    )
def train(
    image=None,
    command="",
    secret=None,
    master_image=None,
    master_resources=None,
    master_restart_policy="Never",
    master_command=None,
    num_workers=0,
    worker_image=None,
    worker_resources=None,
    worker_restart_policy="Never",
    worker_command=None,
    clean_pod_policy="Running",
    timeout=None,
):
    """Launch a distributed PyTorch training job (PyTorchJob) as a workflow step.

    Master/worker image and command fall back to the top-level
    ``image``/``command`` when not given. The job succeeds once any worker
    succeeds and fails as soon as any worker fails.
    """
    job_name = f"pytorch-train-{uuid.uuid4()}"
    success_condition = "status.pytorchReplicaStatuses.Worker.succeeded > 0"
    failure_condition = "status.pytorchReplicaStatuses.Worker.failed > 0"

    manifest = copy.deepcopy(manifest_template)
    manifest["metadata"]["name"] = job_name
    manifest["spec"]["cleanPodPolicy"] = clean_pod_policy
    replica_specs = manifest["spec"]["pytorchReplicaSpecs"]

    # Exactly one master replica is always created.
    replica_specs["Master"] = _generate_pod_spec(
        pod_template,
        container_template,
        allowed_pod_types=pod_types,
        pod_type="Master",
        image=master_image or image,
        replicas=1,
        secret=secret,
        command=master_command or command,
        resources=master_resources,
        restart_policy=master_restart_policy,
    )

    if num_workers > 0:
        replica_specs["Worker"] = _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type="Worker",
            image=worker_image or image,
            replicas=num_workers,
            secret=secret,
            command=worker_command or command,
            resources=worker_resources,
            restart_policy=worker_restart_policy,
        )

    # Name the step after the calling location, like other couler steps.
    step_name, _ = utils.invocation_location()
    couler.run_job(
        manifest=pyaml.dump(manifest),
        success_condition=success_condition,
        failure_condition=failure_condition,
        step_name=step_name,
        timeout=timeout,
    )