コード例 #1
0
ファイル: argo_test.py プロジェクト: peiniliu/couler
 def producer(step_name):
     """Submit the job manifest from the enclosing scope as ``step_name``.

     The manifest and both conditions are not parameters; they are read from
     the surrounding scope and handed to ``couler.run_job`` unchanged.
     """
     job_kwargs = {
         "manifest": manifest,
         "success_condition": success_condition,
         "failure_condition": failure_condition,
         "step_name": step_name,
     }
     couler.run_job(**job_kwargs)
コード例 #2
0
ファイル: proto_repr_test.py プロジェクト: ywskycn/couler-1
    def test_run_job(self):
        success_condition = "status.succeeded > 0"
        failure_condition = "status.failed > 3"
        manifest = """apiVersion: batch/v1
kind: Job
metadata:
  generateName: rand-num-
spec:
    template:
      spec:
        containers:
        - name: rand
          image: python:3.6
          command: ["python random_num.py"]
"""
        couler.run_job(
            manifest=manifest,
            success_condition=success_condition,
            failure_condition=failure_condition,
            step_name="test_run_job",
        )
        proto_wf = get_default_proto_workflow()
        s = proto_wf.steps[0].steps[0]
        t = proto_wf.templates[s.tmpl_name]
        self.assertEqual(s.resource_spec.manifest, manifest)
        self.assertEqual(s.resource_spec.success_condition, success_condition)
        self.assertEqual(s.resource_spec.failure_condition, failure_condition)
        self.assertEqual(len(t.outputs), 3)
        self.assertEqual(t.outputs[0].parameter.name, "job-name")
コード例 #3
0
ファイル: argo_test.py プロジェクト: peiniliu/couler
 def consumer(step_name):
     """Submit the enclosing scope's job manifest as ``step_name`` with env.

     Identical inputs to ``couler.run_job`` as a plain submission, plus a
     fixed ``env`` mapping of ``{"k1": "v1"}``.
     """
     environment = {"k1": "v1"}
     job_kwargs = {
         "manifest": manifest,
         "success_condition": success_condition,
         "failure_condition": failure_condition,
         "step_name": step_name,
         "env": environment,
     }
     couler.run_job(**job_kwargs)
コード例 #4
0
ファイル: argo_test.py プロジェクト: Vafilor/couler
 def test_create_job(self):
     # Covers two things: a None manifest must raise ValueError, and a real
     # manifest must yield exactly one "create" resource template for each
     # value of set_owner_reference.
     #
     # NOTE: couler.run_job is called directly from this method body on
     # purpose; no step name is passed, and the asserted template name
     # "test-create-job" presumably comes from the calling function's name.
     conditions = {
         "success_condition": "status.succeeded > 0",
         "failure_condition": "status.failed > 3",
     }

     # Null manifest
     with self.assertRaises(ValueError):
         couler.run_job(manifest=None, **conditions)

     # Have a manifest
     job_yaml = """
     apiVersion: batch/v1
     kind: Job
     metadata:
       generateName: rand-num-
     spec:
       template:
         spec:
           containers:
           - name: rand
             image: python:3.6
             command: ["python random_num.py"]
     """
     for set_owner in (True, False):
         couler.run_job(
             manifest=job_yaml,
             set_owner_reference=set_owner,
             **conditions
         )
         self.assertEqual(len(couler.workflow.templates), 1)

         rendered = couler.workflow.get_template("test-create-job").to_dict()
         resource = rendered["resource"]
         # The flag is serialized as a lowercase string, not a bool.
         owner_flag = str(set_owner).lower()

         self.assertEqual(rendered["name"], "test-create-job")
         self.assertEqual(resource["action"], "create")
         self.assertEqual(resource["setOwnerReference"], owner_flag)
         self.assertEqual(
             resource["successCondition"], conditions["success_condition"]
         )
         self.assertEqual(
             resource["failureCondition"], conditions["failure_condition"]
         )
         self.assertEqual(resource["manifest"], job_yaml)

         # Reset couler's state so the next iteration starts clean.
         couler._cleanup()
コード例 #5
0
def train(
    image=None,
    command="",
    secret=None,
    no_chief=True,
    chief_image=None,
    chief_resources=None,
    chief_restart_policy="Never",
    chief_command=None,
    num_ps=0,
    ps_image=None,
    ps_resources=None,
    ps_restart_policy="Never",
    ps_command=None,
    num_workers=0,
    worker_image=None,
    worker_resources=None,
    worker_restart_policy="Never",
    worker_command=None,
    clean_pod_policy="Running",
    timeout=None,
):
    """Build a manifest with ``tfReplicaSpecs`` and submit it as a job.

    A deep copy of ``manifest_template`` is given a unique ``tf-train-<uuid>``
    name and the requested clean-pod policy. Chief, PS and Worker replica
    specs are then added as requested, and the result is dumped with
    ``pyaml`` and passed to ``couler.run_job``.

    Args:
        image: Fallback image for any replica type whose own image is falsy.
        command: Fallback command for any replica type whose own command is
            falsy.
        secret: Passed through to every generated pod spec.
        no_chief: When false, a single-replica "Chief" spec is added.
        chief_image, chief_resources, chief_restart_policy, chief_command:
            Settings for the Chief replica.
        num_ps: Number of "PS" replicas; the spec is added only when > 0.
        ps_image, ps_resources, ps_restart_policy, ps_command: Settings for
            the PS replicas.
        num_workers: Number of "Worker" replicas; the spec is added only when
            > 0. Also used in the success condition.
        worker_image, worker_resources, worker_restart_policy,
        worker_command: Settings for the Worker replicas.
        clean_pod_policy: Value written to ``spec.cleanPodPolicy``.
        timeout: Forwarded to ``couler.run_job``.
    """

    def _replica_spec(
        pod_type, replicas, pod_image, pod_command, resources, restart_policy
    ):
        # Per-replica image/command win; otherwise fall back to the shared
        # ``image`` / ``command`` arguments.
        return _generate_pod_spec(
            pod_template,
            container_template,
            allowed_pod_types=pod_types,
            pod_type=pod_type,
            image=pod_image or image,
            replicas=replicas,
            secret=secret,
            command=pod_command or command,
            resources=resources,
            restart_policy=restart_policy,
        )

    job_name = "tf-train-%s" % uuid.uuid4()
    # Success means every requested worker has succeeded; any failed worker
    # fails the job.
    on_success = "status.replicaStatuses.Worker.succeeded == %s" % num_workers
    on_failure = "status.replicaStatuses.Worker.failed > 0"

    manifest = copy.deepcopy(manifest_template)
    manifest["metadata"]["name"] = job_name
    manifest["spec"]["cleanPodPolicy"] = clean_pod_policy

    if not no_chief:
        manifest["spec"]["tfReplicaSpecs"]["Chief"] = _replica_spec(
            "Chief",
            1,
            chief_image,
            chief_command,
            chief_resources,
            chief_restart_policy,
        )

    if num_ps > 0:
        manifest["spec"]["tfReplicaSpecs"]["PS"] = _replica_spec(
            "PS",
            num_ps,
            ps_image,
            ps_command,
            ps_resources,
            ps_restart_policy,
        )

    if num_workers > 0:
        manifest["spec"]["tfReplicaSpecs"]["Worker"] = _replica_spec(
            "Worker",
            num_workers,
            worker_image,
            worker_command,
            worker_resources,
            worker_restart_policy,
        )

    # Must be called from this frame: the step name is derived from the
    # invocation location.
    step_name, _ = utils.invocation_location()

    couler.run_job(
        manifest=pyaml.dump(manifest),
        success_condition=on_success,
        failure_condition=on_failure,
        step_name=step_name,
        timeout=timeout,
    )
コード例 #6
0
ファイル: katib.py プロジェクト: ywskycn/couler-1
def _quote_bound(value):
    """Render one feasible-space bound as a double-quoted string.

    Integral values keep the original ``%d`` rendering. Non-integral floats
    are rendered with ``%s`` because ``%d`` would silently truncate them
    (for example ``0.01`` would become ``"0"``).
    """
    if isinstance(value, float) and not value.is_integer():
        return '"%s"' % value
    return '"%d"' % value


def run(
    raw_template,
    tuning_params,
    objective,
    success_condition=None,
    failure_condition=None,
    algorithm="random",
    parallel_trial_count=3,
    max_trial_count=12,
    max_failed_trial_count=3,
):
    """Render a Katib experiment manifest and submit it via couler.

    Args:
        raw_template: The YAML string for containing Katib trial manifest.
        tuning_params: A sequence of hyperparameter specs to be tuned. Each
            item is a mapping with the keys ``"name"``, ``"type"`` and
            ``"range"``, where ``"range"`` is a ``(min, max)`` pair of
            numbers.
        objective: The dictionary of objective for model training; the keys
            ``"type"``, ``"goal"`` and ``"metric_name"`` are read.
        success_condition: The condition to indicate when a Katib
            experiment succeeds.
        failure_condition: The condition to indicate when a Katib
            experiment fails.
        algorithm: The algorithm used in model training.
        parallel_trial_count: The number of parallel Katib trials.
        max_trial_count: The total number of Katib trials.
        max_failed_trial_count: The total number of failed Katib trials.
    """
    _validate_objective(objective)
    _validate_tuning_params(tuning_params)

    manifest = katib_manifest_template.format(
        parallel_trial_count=parallel_trial_count,
        max_trial_count=max_trial_count,
        max_failed_trial_count=max_failed_trial_count,
        obj_type=objective["type"],
        obj_goal=objective["goal"],
        obj_metric_name=objective["metric_name"],
        algorithm=algorithm,
        raw_template=raw_template,
    )

    # NOTE(review): FullLoader is applied to text that embeds the
    # caller-supplied ``raw_template``. If that template can come from an
    # untrusted source, confirm whether yaml.safe_load is sufficient here.
    wf_yaml = yaml.load(StringIO(manifest), Loader=yaml.FullLoader)

    wf_yaml["spec"]["parameters"] = [
        {
            "name": "--%s" % spec["name"],
            "parameterType": spec["type"],
            "feasibleSpace": {
                "min": _quote_bound(spec["range"][0]),
                "max": _quote_bound(spec["range"][1]),
            },
        }
        for spec in tuning_params
    ]

    manifest = yaml.dump(wf_yaml, default_flow_style=False)
    # Called directly from this frame with no explicit step name, exactly as
    # before.
    couler.run_job(
        manifest=manifest,
        success_condition=success_condition,
        failure_condition=failure_condition,
    )
コード例 #7
0
def train(
    image=None,
    command="",
    secret=None,
    master_image=None,
    master_resources=None,
    master_restart_policy="Never",
    master_command=None,
    num_workers=0,
    worker_image=None,
    worker_resources=None,
    worker_restart_policy="Never",
    worker_command=None,
    clean_pod_policy="Running",
    timeout=None,
):
    """Build a manifest with ``pytorchReplicaSpecs`` and submit it as a job.

    A deep copy of ``manifest_template`` is given a unique
    ``pytorch-train-<uuid>`` name and the requested clean-pod policy. A
    single-replica "Master" spec is always added, and a "Worker" spec is
    added when ``num_workers`` is positive. The result is dumped with
    ``pyaml`` and passed to ``couler.run_job``.

    Args:
        image: Fallback image when a replica-specific image is falsy.
        command: Fallback command when a replica-specific command is falsy.
        secret: Passed through to every generated pod spec.
        master_image, master_resources, master_restart_policy,
        master_command: Settings for the Master replica.
        num_workers: Number of Worker replicas; the spec is added only when
            > 0.
        worker_image, worker_resources, worker_restart_policy,
        worker_command: Settings for the Worker replicas.
        clean_pod_policy: Value written to ``spec.cleanPodPolicy``.
        timeout: Forwarded to ``couler.run_job``.
    """
    job_name = "pytorch-train-%s" % uuid.uuid4()
    on_success = "status.pytorchReplicaStatuses.Worker.succeeded > 0"
    on_failure = "status.pytorchReplicaStatuses.Worker.failed > 0"

    manifest = copy.deepcopy(manifest_template)
    manifest["metadata"]["name"] = job_name
    manifest["spec"]["cleanPodPolicy"] = clean_pod_policy

    # Keyword arguments common to every replica type.
    shared = {"allowed_pod_types": pod_types, "secret": secret}

    master_pod = _generate_pod_spec(
        pod_template,
        container_template,
        pod_type="Master",
        replicas=1,
        image=master_image or image,
        command=master_command or command,
        resources=master_resources,
        restart_policy=master_restart_policy,
        **shared
    )
    manifest["spec"]["pytorchReplicaSpecs"]["Master"] = master_pod

    if num_workers > 0:
        worker_pod = _generate_pod_spec(
            pod_template,
            container_template,
            pod_type="Worker",
            replicas=num_workers,
            image=worker_image or image,
            command=worker_command or command,
            resources=worker_resources,
            restart_policy=worker_restart_policy,
            **shared
        )
        manifest["spec"]["pytorchReplicaSpecs"]["Worker"] = worker_pod

    # Must be called from this frame: the step name is derived from the
    # invocation location.
    step_name, _ = utils.invocation_location()

    couler.run_job(
        manifest=pyaml.dump(manifest),
        success_condition=on_success,
        failure_condition=on_failure,
        step_name=step_name,
        timeout=timeout,
    )