Python V1beta1TrialTemplate Examples

Programming Language: Python

Namespace/Package Name: kubeflow.katib

Examples at hotexamples.com: 2

Python V1beta1TrialTemplate - 2 examples found. These are the top-rated real-world Python examples of kubeflow.katib.V1beta1TrialTemplate, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

V1beta1TrialTemplate(2)

Frequently Used Methods

V1beta1TrialTemplate (2)

Example #1

Show file

File: mpi-job-horovod.py Project: ydataai/pipelines

def horovod_mnist_hpo(
    experiment_name: str = "mpi-horovod-mnist",
    experiment_namespace: str = "anonymous",
):
    """Assemble the pipeline steps that tune a Horovod MNIST MPIJob with Katib.

    A Katib Experiment specification is built (objective, search algorithm,
    search space and an MPIJob-based Trial template), serialized and handed
    to the Katib launcher component. A final container step echoes the
    launcher's output.

    Args:
        experiment_name: Experiment name forwarded to the Katib launcher.
        experiment_namespace: Namespace forwarded to the Katib launcher.
    """
    training_image = "docker.io/kubeflow/mpi-horovod-mnist"

    def _pod_template(container):
        # Wrap a single container into a pod template that carries the
        # "sidecar.istio.io/inject": "false" annotation.
        return {
            "metadata": {
                "annotations": {
                    "sidecar.istio.io/inject": "false"
                }
            },
            "spec": {
                "containers": [container]
            },
        }

    # Objective: minimize the "loss" metric, with 0.01 as the goal.
    objective_spec = V1beta1ObjectiveSpec(
        type="minimize",
        goal=0.01,
        objective_metric_name="loss",
    )

    # Search algorithm: Bayesian optimization with a fixed random state.
    random_state = V1beta1AlgorithmSetting(name="random_state", value="10")
    algorithm_spec = V1beta1AlgorithmSpec(
        algorithm_name="bayesianoptimization",
        algorithm_settings=[random_state],
    )

    # Search space: the learning rate and the number of training steps.
    lr_param = V1beta1ParameterSpec(
        name="lr",
        parameter_type="double",
        feasible_space=V1beta1FeasibleSpace(min="0.001", max="0.003"),
    )
    steps_param = V1beta1ParameterSpec(
        name="num-steps",
        parameter_type="int",
        feasible_space=V1beta1FeasibleSpace(min="50", max="150", step="10"),
    )
    search_space = [lr_param, steps_param]

    # Arguments for the launcher's ``mpirun`` command. The two
    # ${trialParameters.*} placeholders match the Trial parameter names
    # declared in the Trial template below.
    mpirun_args = [
        "-np", "2",
        "--allow-run-as-root",
        "-bind-to", "none",
        "-map-by", "slot",
        "-x", "LD_LIBRARY_PATH",
        "-x", "PATH",
        "-mca", "pml", "ob1",
        "-mca", "btl", "^openib",
        "python", "/examples/tensorflow_mnist.py",
        "--lr", "${trialParameters.learningRate}",
        "--num-steps", "${trialParameters.numberSteps}",
    ]

    launcher_container = {
        "image": training_image,
        "name": "mpi-launcher",
        "command": ["mpirun"],
        "args": mpirun_args,
        "resources": {
            "limits": {
                "cpu": "500m",
                "memory": "2Gi"
            }
        },
    }
    worker_container = {
        "image": training_image,
        "name": "mpi-worker",
        "resources": {
            "limits": {
                "cpu": "500m",
                "memory": "4Gi"
            }
        },
    }

    # JSON template for the Trial's worker: a Kubeflow MPIJob with one
    # launcher replica and two worker replicas.
    mpi_job = {
        "apiVersion": "kubeflow.org/v1",
        "kind": "MPIJob",
        "spec": {
            "slotsPerWorker": 1,
            "cleanPodPolicy": "Running",
            "mpiReplicaSpecs": {
                "Launcher": {
                    "replicas": 1,
                    "template": _pod_template(launcher_container),
                },
                "Worker": {
                    "replicas": 2,
                    "template": _pod_template(worker_container),
                },
            },
        },
    }

    # Trial template: maps the search-space parameters ("lr", "num-steps")
    # onto the placeholders used in the MPIJob and designates the launcher
    # pod/container as the primary one.
    trial_parameters = [
        V1beta1TrialParameterSpec(
            name="learningRate",
            description="Learning rate for the training model",
            reference="lr",
        ),
        V1beta1TrialParameterSpec(
            name="numberSteps",
            description="Number of training steps",
            reference="num-steps",
        ),
    ]
    template = V1beta1TrialTemplate(
        primary_pod_labels={"mpi-job-role": "launcher"},
        primary_container_name="mpi-launcher",
        success_condition=(
            'status.conditions.#(type=="Succeeded")#|#(status=="True")#'),
        failure_condition=(
            'status.conditions.#(type=="Failed")#|#(status=="True")#'),
        trial_parameters=trial_parameters,
        trial_spec=mpi_job,
    )

    # Experiment specification: at most 6 Trials, 2 running in parallel,
    # and up to 3 failed Trials.
    spec = V1beta1ExperimentSpec(
        max_trial_count=6,
        max_failed_trial_count=3,
        parallel_trial_count=2,
        objective=objective_spec,
        algorithm=algorithm_spec,
        parameters=search_space,
        trial_template=template,
    )

    # Load the Katib launcher component from its published URL.
    # (A local copy could be loaded with
    # ``components.load_component_from_file`` instead.)
    launcher_url = (
        "https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml"
    )
    launcher_factory = components.load_component_from_url(launcher_url)

    # The Experiment spec must be serialized to a plain Kubernetes object
    # before it is passed to the launcher.
    # NOTE(review): the original comment states the Experiment is deleted
    # after the pipeline finishes; that relies on the launcher's default
    # behavior, which is not visible here - confirm.
    serialized_spec = ApiClient().sanitize_for_serialization(spec)
    launcher_task = katib_task = launcher_factory(
        experiment_name=experiment_name,
        experiment_namespace=experiment_namespace,
        experiment_spec=serialized_spec,
        experiment_timeout_minutes=60,
    )

    # Final step: print the launcher's output (the best hyperparameters).
    best_hp = katib_task.output
    dsl.ContainerOp(
        name="best-hp",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo Best HyperParameters: %s" % (best_hp,)],
    )

Example #2

Show file

File: e2e-mnist.py Project: rpatil524/kfp-tekton

def create_katib_experiment_task(experiment_name, experiment_namespace,
                                 training_steps):
    """Create the Katib launcher task that tunes the MNIST TFJob.

    Builds a Katib Experiment specification (random search over learning
    rate and batch size, minimizing "loss"), serializes it and passes it to
    the Katib launcher component loaded from its published URL.

    Args:
        experiment_name: Experiment name forwarded to the Katib launcher.
        experiment_namespace: Namespace forwarded to the Katib launcher.
        training_steps: Value rendered with ``str()`` into the
            ``--tf-train-steps=`` flag of the training command.

    Returns:
        The object returned by calling the loaded Katib launcher component.
    """

    def _tf_replica():
        # Chief and Worker use the same replica definition; build a fresh
        # copy on every call so the two entries stay independent objects.
        # TODO (andreyvelich): Use community image for the mnist example.
        container = {
            "name": "tensorflow",
            "image": "docker.io/liuhougangxa/tf-estimator-mnist",
            "command": [
                "python",
                "/opt/model.py",
                "--tf-train-steps=" + str(training_steps),
                "--tf-learning-rate=${trialParameters.learningRate}",
                "--tf-batch-size=${trialParameters.batchSize}",
            ],
        }
        return {
            "replicas": 1,
            "restartPolicy": "OnFailure",
            "template": {
                "metadata": {
                    "annotations": {
                        "sidecar.istio.io/inject": "false"
                    }
                },
                "spec": {
                    "containers": [container]
                },
            },
        }

    # Objective: minimize the "loss" metric, with 0.001 as the goal.
    objective_spec = V1beta1ObjectiveSpec(
        type="minimize",
        goal=0.001,
        objective_metric_name="loss",
    )

    # Search algorithm: random search.
    algorithm_spec = V1beta1AlgorithmSpec(algorithm_name="random")

    # Search space: the learning rate and the batch size.
    learning_rate_param = V1beta1ParameterSpec(
        name="learning_rate",
        parameter_type="double",
        feasible_space=V1beta1FeasibleSpace(min="0.01", max="0.05"),
    )
    batch_size_param = V1beta1ParameterSpec(
        name="batch_size",
        parameter_type="int",
        feasible_space=V1beta1FeasibleSpace(min="80", max="100"),
    )
    search_space = [learning_rate_param, batch_size_param]

    # Trial worker: a Kubeflow TFJob with one Chief and one Worker replica.
    tf_job = {
        "apiVersion": "kubeflow.org/v1",
        "kind": "TFJob",
        "spec": {
            "tfReplicaSpecs": {
                "Chief": _tf_replica(),
                "Worker": _tf_replica(),
            }
        },
    }

    # Trial template: maps the search-space parameters onto the
    # ${trialParameters.*} placeholders used in the training command.
    trial_parameters = [
        V1beta1TrialParameterSpec(
            name="learningRate",
            description="Learning rate for the training model",
            reference="learning_rate",
        ),
        V1beta1TrialParameterSpec(
            name="batchSize",
            description="Batch size for the model",
            reference="batch_size",
        ),
    ]
    template = V1beta1TrialTemplate(
        primary_container_name="tensorflow",
        trial_parameters=trial_parameters,
        trial_spec=tf_job,
    )

    # Experiment specification: at most 5 Trials, 2 running in parallel,
    # and up to 3 failed Trials.
    spec = V1beta1ExperimentSpec(
        max_trial_count=5,
        max_failed_trial_count=3,
        parallel_trial_count=2,
        objective=objective_spec,
        algorithm=algorithm_spec,
        parameters=search_space,
        trial_template=template,
    )

    # Create the KFP task for the Katib Experiment. The Experiment spec
    # must be serialized to a plain Kubernetes object first.
    launcher_url = (
        "https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml"
    )
    launcher_factory = components.load_component_from_url(launcher_url)
    serialized_spec = ApiClient().sanitize_for_serialization(spec)

    return launcher_factory(
        experiment_name=experiment_name,
        experiment_namespace=experiment_namespace,
        experiment_spec=serialized_spec,
        experiment_timeout_minutes=60,
        delete_finished_experiment=False,
    )