def test_sdk_e2e():
    """Drive an XGBoostJob through the SDK client from creation to deletion.

    The job has one "Master" and one "Worker" replica that both run the same
    training container. After creating the job, the test calls
    ``wait_for_job``, checks ``is_job_succeeded``, fetches the logs and then
    deletes the job.

    Raises:
        RuntimeError: If ``XGBOOST_CLIENT.is_job_succeeded`` returns a falsy
            value for the job.
    """
    job_name = "xgboostjob-iris-ci-test"

    trainer = V1Container(
        name="xgboost",
        image="docker.io/merlintang/xgboost-dist-iris:1.1",
        args=[
            "--job_type=Train",
            "--xgboost_parameter=objective:multi:softprob,num_class:3",
            "--n_estimators=10",
            "--learning_rate=0.1",
            "--model_path=/tmp/xgboost-model",
            "--model_storage_type=local",
        ],
    )

    def build_replica():
        # Every role gets its own spec/template objects; only the container
        # definition is shared between them.
        pod_spec = V1PodSpec(containers=[trainer])
        return V1ReplicaSpec(
            replicas=1,
            restart_policy="Never",
            template=V1PodTemplateSpec(spec=pod_spec),
        )

    replica_specs = {
        "Master": build_replica(),
        "Worker": build_replica(),
    }

    job_metadata = V1ObjectMeta(name=job_name, namespace=SDK_TEST_NAMESPACE)
    job_spec = KubeflowOrgV1XGBoostJobSpec(
        run_policy=V1RunPolicy(clean_pod_policy="None"),
        xgb_replica_specs=replica_specs,
    )
    job = KubeflowOrgV1XGBoostJob(
        api_version="kubeflow.org/v1",
        kind="XGBoostJob",
        metadata=job_metadata,
        spec=job_spec,
    )

    XGBOOST_CLIENT.create(job)
    XGBOOST_CLIENT.wait_for_job(job_name, namespace=SDK_TEST_NAMESPACE)

    succeeded = XGBOOST_CLIENT.is_job_succeeded(
        job_name, namespace=SDK_TEST_NAMESPACE
    )
    if not succeeded:
        raise RuntimeError("The XGBoostJob is not succeeded.")

    XGBOOST_CLIENT.get_logs(job_name, namespace=SDK_TEST_NAMESPACE)
    XGBOOST_CLIENT.delete(job_name, namespace=SDK_TEST_NAMESPACE)
def test_sdk_e2e():
    """Drive a single-worker TFJob through the SDK client end to end.

    The test creates the job in ``SDK_TEST_NAMESPACE``, calls
    ``wait_for_job``, checks ``is_job_succeeded``, fetches the logs with
    ``master=False`` and finally deletes the job.

    Raises:
        RuntimeError: If ``TFJOB_CLIENT.is_job_succeeded`` returns a falsy
            value for the job.
    """
    job_name = "mnist-ci-test"

    mnist_command = [
        "python",
        "/var/tf_mnist/mnist_with_summaries.py",
        "--log_dir=/train/logs",
        "--learning_rate=0.01",
        "--batch_size=150",
    ]
    trainer = V1Container(
        name="tensorflow",
        image="gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0",
        command=mnist_command,
    )

    pod_template = V1PodTemplateSpec(spec=V1PodSpec(containers=[trainer]))
    worker_replica = V1ReplicaSpec(
        replicas=1,
        restart_policy="Never",
        template=pod_template,
    )

    job_metadata = V1ObjectMeta(name=job_name, namespace=SDK_TEST_NAMESPACE)
    job_spec = V1TFJobSpec(
        run_policy=V1RunPolicy(clean_pod_policy="None"),
        tf_replica_specs={"Worker": worker_replica},
    )
    job = V1TFJob(
        api_version="kubeflow.org/v1",
        kind="TFJob",
        metadata=job_metadata,
        spec=job_spec,
    )

    TFJOB_CLIENT.create(job, namespace=SDK_TEST_NAMESPACE)
    TFJOB_CLIENT.wait_for_job(job_name, namespace=SDK_TEST_NAMESPACE)

    succeeded = TFJOB_CLIENT.is_job_succeeded(
        job_name, namespace=SDK_TEST_NAMESPACE
    )
    if not succeeded:
        raise RuntimeError("The TFJob is not succeeded.")

    # The job defines only a "Worker" replica, so logs are requested with
    # master=False.
    TFJOB_CLIENT.get_logs(
        job_name, master=False, namespace=SDK_TEST_NAMESPACE
    )
    TFJOB_CLIENT.delete(job_name, namespace=SDK_TEST_NAMESPACE)
def test_sdk_e2e():
    """Drive a PyTorchJob through the SDK client from creation to deletion.

    The job has one "Master" and one "Worker" replica, both running the same
    container with the ``gloo`` backend arguments. After creating the job,
    the test calls ``wait_for_job``, checks ``is_job_succeeded``, fetches the
    logs and then deletes the job.

    Raises:
        RuntimeError: If ``PYTORCH_CLIENT.is_job_succeeded`` returns a falsy
            value for the job.
    """
    job_name = "pytorchjob-mnist-ci-test"

    trainer = V1Container(
        name="pytorch",
        image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0",
        args=["--backend", "gloo"],
    )

    def build_replica():
        # Every role gets its own spec/template objects; only the container
        # definition is shared between them.
        pod_spec = V1PodSpec(containers=[trainer])
        return V1ReplicaSpec(
            replicas=1,
            restart_policy="OnFailure",
            template=V1PodTemplateSpec(spec=pod_spec),
        )

    replica_specs = {
        "Master": build_replica(),
        "Worker": build_replica(),
    }

    job_metadata = V1ObjectMeta(name=job_name, namespace=SDK_TEST_NAMESPACE)
    job_spec = KubeflowOrgV1PyTorchJobSpec(
        run_policy=V1RunPolicy(clean_pod_policy="None"),
        pytorch_replica_specs=replica_specs,
    )
    job = KubeflowOrgV1PyTorchJob(
        api_version="kubeflow.org/v1",
        kind="PyTorchJob",
        metadata=job_metadata,
        spec=job_spec,
    )

    PYTORCH_CLIENT.create(job)
    PYTORCH_CLIENT.wait_for_job(job_name, namespace=SDK_TEST_NAMESPACE)

    succeeded = PYTORCH_CLIENT.is_job_succeeded(
        job_name, namespace=SDK_TEST_NAMESPACE
    )
    if not succeeded:
        raise RuntimeError("The PyTorchJob is not succeeded.")

    PYTORCH_CLIENT.get_logs(job_name, namespace=SDK_TEST_NAMESPACE)
    PYTORCH_CLIENT.delete(job_name, namespace=SDK_TEST_NAMESPACE)
def test_sdk_e2e():
    """Drive an MPIJob through the SDK client from creation to deletion.

    The "Launcher" replica runs ``mpirun`` with a single process that starts
    ``/examples/pytorch_mnist.py`` for one epoch. The "Worker" replica uses
    the same image with no command override. After creating the job, the test
    calls ``wait_for_job``, checks ``is_job_succeeded``, fetches the logs and
    then deletes the job.

    Raises:
        RuntimeError: If ``MPI_CLIENT.is_job_succeeded`` returns a falsy
            value for the job.
    """
    job_name = "mpijob-mxnet-ci-test"
    horovod_image = (
        "horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu"
    )

    # NOTE: an earlier variant pointed at /examples/tensorflow2_mnist.py; the
    # PyTorch example below is the one that is actually launched.
    mpirun_args = [
        "-np", "1",
        "--allow-run-as-root",
        "-bind-to", "none",
        "-map-by", "slot",
        "-x", "LD_LIBRARY_PATH",
        "-x", "PATH",
        "-mca", "pml", "ob1",
        "-mca", "btl", "^openib",
        "python", "/examples/pytorch_mnist.py",
        "--epochs", "1",
    ]
    launcher_container = V1Container(
        name="mpi",
        image=horovod_image,
        command=["mpirun"],
        args=mpirun_args,
    )
    worker_container = V1Container(
        name="mpi",
        image=horovod_image,
    )

    def build_replica(container):
        # Wrap a single container in a one-replica, never-restarted spec.
        pod_spec = V1PodSpec(containers=[container])
        return V1ReplicaSpec(
            replicas=1,
            restart_policy="Never",
            template=V1PodTemplateSpec(spec=pod_spec),
        )

    launcher_replica = build_replica(launcher_container)
    worker_replica = build_replica(worker_container)

    job_metadata = V1ObjectMeta(name=job_name, namespace=SDK_TEST_NAMESPACE)
    job_spec = KubeflowOrgV1MPIJobSpec(
        slots_per_worker=1,
        run_policy=V1RunPolicy(clean_pod_policy="None"),
        mpi_replica_specs={
            "Launcher": launcher_replica,
            "Worker": worker_replica,
        },
    )
    job = KubeflowOrgV1MPIJob(
        api_version="kubeflow.org/v1",
        kind="MPIJob",
        metadata=job_metadata,
        spec=job_spec,
    )

    MPI_CLIENT.create(job)
    MPI_CLIENT.wait_for_job(job_name, namespace=SDK_TEST_NAMESPACE)

    succeeded = MPI_CLIENT.is_job_succeeded(
        job_name, namespace=SDK_TEST_NAMESPACE
    )
    if not succeeded:
        raise RuntimeError("The MPIJob is not succeeded.")

    MPI_CLIENT.get_logs(job_name, namespace=SDK_TEST_NAMESPACE)
    MPI_CLIENT.delete(job_name, namespace=SDK_TEST_NAMESPACE)
def test_sdk_e2e():
    """Drive an MXJob through the SDK client from creation to deletion.

    The job runs in ``MXTrain`` mode with one "Scheduler", one "Server" and
    one "Worker" replica. Only the worker container overrides the command, to
    run the MNIST training script; all three containers expose port 9991 as
    ``mxjob-port``. After creating the job, the test calls ``wait_for_job``,
    checks ``is_job_succeeded``, fetches the logs with ``master=False`` and
    then deletes the job.

    Raises:
        RuntimeError: If ``MX_CLIENT.is_job_succeeded`` returns a falsy
            value for the job.
    """
    job_name = "mxjob-mnist-ci-test"
    mxnet_image = "docker.io/johnugeorge/mxnet:1.9.1_cpu_py3"

    def job_ports():
        # A fresh port object per container, as each container declares its
        # own port list.
        return [V1ContainerPort(container_port=9991, name="mxjob-port")]

    def build_replica(container):
        # Wrap a single container in a one-replica, never-restarted spec.
        pod_spec = V1PodSpec(containers=[container])
        return V1ReplicaSpec(
            replicas=1,
            restart_policy="Never",
            template=V1PodTemplateSpec(spec=pod_spec),
        )

    worker_container = V1Container(
        name="mxnet",
        image=mxnet_image,
        command=["/usr/local/bin/python3"],
        args=[
            "incubator-mxnet/example/image-classification/train_mnist.py",
            "--num-epochs", "5",
            "--num-examples", "1000",
            "--kv-store", "dist_sync",
        ],
        ports=job_ports(),
    )
    server_container = V1Container(
        name="mxnet",
        image=mxnet_image,
        ports=job_ports(),
    )
    scheduler_container = V1Container(
        name="mxnet",
        image=mxnet_image,
        ports=job_ports(),
    )

    worker_replica = build_replica(worker_container)
    server_replica = build_replica(server_container)
    scheduler_replica = build_replica(scheduler_container)

    job_metadata = V1ObjectMeta(name=job_name, namespace=SDK_TEST_NAMESPACE)
    job_spec = KubeflowOrgV1MXJobSpec(
        job_mode="MXTrain",
        run_policy=V1RunPolicy(clean_pod_policy="None"),
        mx_replica_specs={
            "Scheduler": scheduler_replica,
            "Server": server_replica,
            "Worker": worker_replica,
        },
    )
    job = KubeflowOrgV1MXJob(
        api_version="kubeflow.org/v1",
        kind="MXJob",
        metadata=job_metadata,
        spec=job_spec,
    )

    MX_CLIENT.create(job)
    MX_CLIENT.wait_for_job(job_name, namespace=SDK_TEST_NAMESPACE)

    succeeded = MX_CLIENT.is_job_succeeded(
        job_name, namespace=SDK_TEST_NAMESPACE
    )
    if not succeeded:
        raise RuntimeError("The MXJob is not succeeded.")

    MX_CLIENT.get_logs(job_name, namespace=SDK_TEST_NAMESPACE, master=False)
    MX_CLIENT.delete(job_name, namespace=SDK_TEST_NAMESPACE)