Esempio n. 1
0
def _scale_job_and_verify(args, job_id, deployment_name, replicas, direction):
    """Request a new replica count for a job and verify the deployment spec.

    Args:
        args: Test arguments; ``rest``, ``email`` and ``config`` are read.
        job_id: Id of the job passed to ``utils.scale_job``.
        deployment_name: Deployment looked up in the "default" namespace.
        replicas: Replica count to request and then expect in the spec.
        direction: Word placed in the log line (e.g. "up" or "down").

    Raises:
        AssertionError: If the scale request does not return "Success" or
            the deployment's ``spec.replicas`` differs from ``replicas``.
    """
    logger.info("scale %s job %s to %d", direction, job_id, replicas)
    resp = utils.scale_job(args.rest, args.email, job_id, replicas)
    assert "Success" == resp, "scale_job returned %s" % (resp,)

    # Fixed wait before reading the deployment back; presumably gives the
    # cluster time to apply the request -- TODO confirm 30s is sufficient.
    time.sleep(30)
    deployment = utils.kube_get_deployment(args.config, "default",
                                           deployment_name)
    assert replicas == deployment.spec.replicas, \
        "expected %s replicas, deployment %s has %s" % (
            replicas, deployment_name, deployment.spec.replicas)


def test_inference_job_scale(args):
    """Scale a running inference job up to 2 replicas and back down to 1.

    The test returns without doing anything when the configured launcher is
    "controller". Otherwise it submits an inference job running
    ``sleep 600``, waits for it to reach "running", checks that its
    ``<job_id>-deployment`` deployment starts at 1 replica, then scales up
    and down, verifying the deployment spec after each request.

    Args:
        args: Test arguments; ``config``, ``rest``, ``email``, ``uid`` and
            ``vc`` are read.
    """
    # NOTE(review): presumably the controller launcher does not support
    # scaling inference jobs -- confirm against the launcher implementation.
    if utils.get_launcher(args.config) == "controller":
        return

    job_spec = utils.gen_default_job_description("inference", args.email,
                                                 args.uid, args.vc,
                                                 cmd="sleep 600")

    with utils.run_job(args.rest, job_spec) as job:
        job_id = job.jid
        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running", "job %s is in state %s" % (job_id, state)

        deployment_name = job_id + "-deployment"
        deployment = utils.kube_get_deployment(args.config, "default",
                                               deployment_name)
        assert 1 == deployment.spec.replicas, \
            "expected 1 initial replica, deployment %s has %s" % (
                deployment_name, deployment.spec.replicas)

        _scale_job_and_verify(args, job_id, deployment_name, 2, "up")
        _scale_job_and_verify(args, job_id, deployment_name, 1, "down")
Esempio n. 2
0
def _log_ssh_endpoint(ssh_endpoint):
    """Log an ssh endpoint and return its ``(host, port)`` pair.

    Args:
        ssh_endpoint: Mapping with "nodeName", "domain" and "port" keys, as
            returned by ``utils.wait_endpoint_state``.

    Returns:
        Tuple of the host, built as ``<nodeName>.<domain>``, and the port
        value taken unchanged from the endpoint.
    """
    ssh_host = "%s.%s" % (ssh_endpoint["nodeName"], ssh_endpoint["domain"])
    ssh_port = ssh_endpoint["port"]
    logger.info("current ssh endpoint is %s:%s", ssh_host, ssh_port)
    return ssh_host, ssh_port


def test_fault_tolerance(args):
    """Delete a running job's pod and check its ssh endpoint works again.

    The test submits a regular job with an ssh endpoint, waits for the job
    to run and the endpoint to reach the default awaited state, deletes the
    job's pod, waits for the endpoint to go "pending" and then to the
    default awaited state again, and finally runs ``echo dummy`` over ssh
    from inside the jobmanager pod.

    Args:
        args: Test arguments; ``config``, ``rest``, ``email``, ``uid`` and
            ``vc`` are read.
    """
    # Job is only retried when launcher is controller.
    if utils.get_launcher(args.config) == "python":
        return

    job_spec = utils.gen_default_job_description("regular", args.email,
                                                 args.uid, args.vc)

    with utils.run_job(args.rest, job_spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())
        assert len(endpoints_ids) == 1
        endpoint_id = endpoints_ids[0]

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running", "job %s is in state %s" % (job.jid, state)

        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)
        # Logged for comparison with the endpoint seen after the pod restart.
        _log_ssh_endpoint(ssh_endpoint)

        job_pods = utils.kube_get_pods(args.config, "default",
                                       "jobId=%s" % (job.jid))
        # Fail with a readable message instead of an IndexError below.
        assert job_pods, "no pod found for job %s" % (job.jid)
        utils.kube_delete_pod(args.config, "default",
                              job_pods[0].metadata.name)

        # Wait until the endpoint is observed as "pending" after the pod
        # deletion; the returned value is not needed.
        utils.wait_endpoint_state(args.rest,
                                  args.email,
                                  job.jid,
                                  endpoint_id,
                                  state="pending")

        # Then wait for the default target state again and re-read the
        # endpoint, whose host or port may differ from before.
        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)
        ssh_host, ssh_port = _log_ssh_endpoint(ssh_endpoint)

        # exec into jobmanager to execute ssh to avoid firewall
        job_manager_pods = utils.kube_get_pods(args.config, "default",
                                               "app=jobmanager")
        assert job_manager_pods, "no jobmanager pod found"
        job_manager_pod_name = job_manager_pods[0].metadata.name

        # The part of the email before "@" is used both as the ssh user and
        # as the directory name holding the private key.
        alias = args.email.split("@")[0]

        cmd = [
            "ssh", "-i",
            "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p", ssh_port, "-o",
            "StrictHostKeyChecking=no", "-o", "LogLevel=ERROR",
            "%s@%s" % (alias, ssh_host), "--", "echo", "dummy"
        ]
        code, output = utils.kube_pod_exec(args.config, "default",
                                           job_manager_pod_name, "jobmanager",
                                           cmd)
        assert code == 0, "code is %s, output is %s" % (code, output)
        assert output == "dummy\n", "output is %s" % (output)