Example #1
def test_batch_api(
    client: cx.Client,
    api: str,
    test_s3_path: str,
    deploy_timeout: Optional[int] = None,
    job_timeout: Optional[int] = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex.yaml",
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1

    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)

    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        with open(str(api_dir / "sample.json")) as f:
            payload = json.load(f)

        response = None
        for _ in range(retry_attempts + 1):
            response = request_batch_prediction(
                client,
                api_name,
                item_list=payload,
                batch_size=2,
                config={"dest_s3_dir": test_s3_path},
            )
            if response.status_code == HTTPStatus.OK:
                break

            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()

        # monitor job progress
        assert job_done(
            client=client,
            api_name=job_spec["api_name"],
            job_id=job_spec["job_id"],
            timeout=job_timeout,
        ), f"job did not succeed (api_name: {api_name}, job_id: {job_spec['job_id']})"

    finally:
        delete_apis(client, [api_name])
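
The job_done helper used above is not shown; a minimal sketch of the polling it presumably performs, assuming client.get_job returns a dict whose job_status carries a status string (the field names and terminal status values here are assumptions):

import time
from typing import Optional

def job_done(client, api_name: str, job_id: str, timeout: Optional[int] = None) -> bool:
    # poll the job until it reaches a terminal state or the timeout expires
    deadline = time.time() + timeout if timeout is not None else None
    while deadline is None or time.time() < deadline:
        job_status = client.get_job(api_name, job_id)["job_status"]
        if job_status["status"] == "succeeded":  # assumed success marker
            return True
        if job_status["status"] in ("failed", "stopped"):  # assumed failure markers
            return False
        time.sleep(1)
    return False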
Example #2
def test_task_api(
    client: cx.Client,
    api: str,
    deploy_timeout: Optional[int] = None,
    job_timeout: Optional[int] = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex.yaml",
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1

    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)

    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        response = None
        for _ in range(retry_attempts + 1):
            response = request_task(
                client,
                api_name,
            )
            if response.status_code == HTTPStatus.OK:
                break

            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()

        assert job_done(
            client=client,
            api_name=api_name,
            job_id=job_spec["job_id"],
            timeout=job_timeout,
        ), f"task job did not succeed (api_name: {api_name}, job_id: {job_spec['job_id']})"

    finally:
        delete_apis(client, [api_name])
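
Since these tests are plain functions, they can also be driven directly; a hypothetical invocation, assuming a configured Cortex environment named aws and an API directory under TEST_APIS_DIR (all values illustrative):

import cortex as cx

client = cx.client("aws")  # assumes an environment named "aws" is configured
test_task_api(
    client=client,
    api="task/iris-classifier-trainer",  # hypothetical API directory
    deploy_timeout=120,
    job_timeout=300,
    retry_attempts=3,
)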
Example #3
def test_load_task(
    printer: Callable,
    client: cx.Client,
    api: str,
    load_config: Dict[str, Union[int, float]],
    deploy_timeout: Optional[int] = None,
    retry_attempts: int = 0,
    poll_sleep_seconds: int = 1,
    api_config_name: str = "cortex.yaml",
):

    jobs = load_config["jobs"]
    concurrency = load_config["concurrency"]
    submit_timeout = load_config["submit_timeout"]
    workload_timeout = load_config["workload_timeout"]

    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)
    assert len(api_specs) == 1

    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)

    request_stopper = td.Event()
    map_stopper = td.Event()
    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        # give the operator time to start
        time.sleep(1 * retry_attempts)

        # submit jobs
        printer(f"submitting {jobs} jobs concurrently")
        job_specs = []
        threads_futures = request_tasks_concurrently(
            client, api_name, request_stopper, concurrency, jobs, job_specs
        )

        assert wait_on_event(
            request_stopper, submit_timeout
        ), f"{jobs} jobs couldn't be submitted in {submit_timeout}s"
        check_futures_healthy(threads_futures)
        wait_on_futures(threads_futures)

        printer("waiting on the jobs")
        job_ids = [job_spec.json()["job_id"] for job_spec in job_specs]
        retrieve_results_concurrently(
            client,
            api_name,
            concurrency,
            map_stopper,
            job_ids,
            poll_sleep_seconds=poll_sleep_seconds,
            timeout=workload_timeout,
        )

    except:
        # best effort
        try:
            api_info = client.get_api(api_name)

            # only get the last 10 job statuses
            if "task_job_statuses" in api_info and len(
                    api_info["task_job_statuses"]) > 10:
                api_info["task_job_statuses"] = api_info["task_job_statuses"][
                    -10:]

            printer(json.dumps(api_info, indent=2))
        except:
            pass
        raise

    finally:
        map_stopper.set()
        delete_apis(client, [api_name])
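
wait_on_event is presumably a thin wrapper around threading.Event.wait, which already returns whether the event was set within the timeout; a sketch under that assumption:

import threading as td
from typing import Optional

def wait_on_event(event: td.Event, timeout: Optional[float] = None) -> bool:
    # Event.wait returns True if the flag was set before the timeout elapsed
    return event.wait(timeout)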
Example #4
def test_load_batch(
    printer: Callable,
    client: cx.Client,
    api: str,
    test_s3_path: str,
    load_config: Dict[str, Union[int, float]],
    deploy_timeout: Optional[int] = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex.yaml",
):

    jobs = load_config["jobs"]
    workers_per_job = load_config["workers_per_job"]
    items_per_job = load_config["items_per_job"]
    batch_size = load_config["batch_size"]
    workload_timeout = load_config["workload_timeout"]

    bucket, key = re.match("s3://(.+?)/(.+)", test_s3_path).groups()
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)
    assert len(api_specs) == 1

    sample_generator_path = api_dir / "sample_generator.py"
    assert (
        sample_generator_path.exists()
    ), "sample_generator.py must be present for the batch load test"
    sample_generator = load_generator(sample_generator_path)

    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)
    api_endpoint = client.get_api(api_name)["endpoint"]

    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        # submit jobs
        printer(f"submitting {jobs} jobs")
        job_specs = []
        for _ in range(jobs):
            for _ in range(retry_attempts + 1):
                response = request_batch_prediction(
                    client,
                    api_name,
                    item_list=[
                        sample_generator() for _ in range(items_per_job)
                    ],
                    batch_size=batch_size,
                    workers=workers_per_job,
                    config={"dest_s3_dir": test_s3_path},
                )
                if response.status_code == HTTPStatus.OK:
                    break
                time.sleep(1)
            # retries are only needed for the first job, while the operator warms up
            retry_attempts = 0

            assert (
                response.status_code == HTTPStatus.OK
            ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"
            job_specs.append(response.json())

        # wait for the jobs to finish
        printer("waiting on the jobs")
        assert jobs_done(
            client, api_name, [job_spec["job_id"] for job_spec in job_specs], workload_timeout
        ), f"not all jobs succeeded in {workload_timeout}s"

        # assert jobs
        printer("checking the jobs' responses")
        for job_spec in job_specs:
            job_id: str = job_spec["job_id"]
            job_status = requests.get(
                f"{api_endpoint}?jobID={job_id}").json()["job_status"]

            assert (
                job_status["batches_in_queue"] == 0
            ), f"there are still batches in queue ({job_status['batches_in_queue']}) for job ID {job_id}"
            assert job_status["batch_metrics"]["succeeded"] == math.ceil(
                items_per_job / batch_size)

            num_objects = 0
            # pages with no matching objects omit the "Contents" key
            for page in paginator.paginate(Bucket=bucket, Prefix=os.path.join(key, job_id)):
                num_objects += len(page.get("Contents", []))
            assert num_objects == 1

    except:
        # best effort
        try:
            api_info = client.get_api(api_name)

            # only get the last 10 job statuses
            if "batch_job_statuses" in api_info and len(
                    api_info["batch_job_statuses"]) > 10:
                api_info["batch_job_statuses"] = api_info[
                    "batch_job_statuses"][-10:]

            printer(json.dumps(api_info, indent=2))
        except:
            pass
        raise

    finally:
        delete_apis(client, [api_name])
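
The sample_generator.py module required by this test is API-specific; a hypothetical generator for an API that consumes lists of numbers (load_generator is assumed to import the module and return this function):

import random

def sample_generator():
    # produce one batch item; the payload shape here is illustrative only
    return {"numbers": [random.randint(0, 100) for _ in range(5)]}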
Example #5
def test_task_api(
    printer: Callable,
    client: cx.Client,
    api: str,
    deploy_timeout: Optional[int] = None,
    job_timeout: Optional[int] = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex.yaml",
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1

    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)

    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        response = None
        for _ in range(retry_attempts + 1):
            response = request_task(
                client,
                api_name,
            )
            if response.status_code == HTTPStatus.OK:
                break

            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()

        assert job_done(
            client=client,
            api_name=api_name,
            job_id=job_spec["job_id"],
            timeout=job_timeout,
        ), f"task job did not succeed (api_name: {api_name}, job_id: {job_spec['job_id']})"

    except:
        # best effort
        try:
            api_info = client.get_api(api_name)
            printer(json.dumps(api_info, indent=2))

            job_status = client.get_job(api_name, job_spec["job_id"])
            printer(json.dumps(job_status, indent=2))
            td.Thread(
                target=lambda: client.stream_job_logs(api_name, job_spec["job_id"]),
                daemon=True,
            ).start()
            time.sleep(5)
        except:
            pass
        raise

    finally:
        delete_apis(client, [api_name])
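
endpoint_ready is another assumed helper; a plausible sketch that polls until the API's endpoint stops returning 404, where the readiness criterion and the use of a bodyless POST are guesses:

import time
from http import HTTPStatus
from typing import Optional

import requests

def endpoint_ready(client, api_name: str, timeout: Optional[int] = None,
                   endpoint_override: Optional[str] = None) -> bool:
    deadline = time.time() + timeout if timeout is not None else None
    while deadline is None or time.time() < deadline:
        endpoint = endpoint_override or client.get_api(api_name).get("endpoint")
        if endpoint:
            try:
                # anything other than 404 means the route is being served
                if requests.post(endpoint).status_code != HTTPStatus.NOT_FOUND:
                    return True
            except requests.RequestException:
                pass
        time.sleep(1)
    return False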
Example #6
def test_batch_api(
    printer: Callable,
    client: cx.Client,
    api: str,
    test_s3_path: str,
    deploy_timeout: Optional[int] = None,
    job_timeout: Optional[int] = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex.yaml",
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1

    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)

    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        with open(str(api_dir / "sample.json")) as f:
            payload = json.load(f)

        response = None
        for _ in range(retry_attempts + 1):
            response = request_batch_prediction(
                client,
                api_name,
                item_list=payload,
                batch_size=2,
                config={"dest_s3_dir": test_s3_path},
            )
            if response.status_code == HTTPStatus.OK:
                break

            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()

        # monitor job progress
        assert job_done(
            client=client,
            api_name=job_spec["api_name"],
            job_id=job_spec["job_id"],
            timeout=job_timeout,
        ), f"job did not succeed (api_name: {api_name}, job_id: {job_spec['job_id']})"

    except:
        # best effort
        try:
            api_info = client.get_api(api_name)
            printer(json.dumps(api_info, indent=2))

            job_status = client.get_job(api_name, job_spec["job_id"])
            printer(json.dumps(job_status, indent=2))
            td.Thread(
                target=lambda: client.stream_job_logs(api_name, job_spec["job_id"]),
                daemon=True,
            ).start()
            time.sleep(5)
        except:
            pass
        raise

    finally:
        delete_apis(client, [api_name])
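
delete_apis in the finally blocks is the cleanup counterpart to create_api/deploy; a minimal sketch, assuming the client exposes a per-API delete call (the real helper may also wait for deletion to complete):

from typing import List

import cortex as cx

def delete_apis(client: cx.Client, api_names: List[str]):
    # best-effort cleanup so a failed test does not leak deployed APIs
    for api_name in api_names:
        client.delete_api(api_name)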
Example #7
def test_task_api(
    printer: Callable,
    client: cx.Client,
    api: str,
    deploy_timeout: Optional[int] = None,
    job_timeout: Optional[int] = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex_cpu.yaml",
    node_groups: Optional[List[str]] = None,
    local_operator: bool = False,
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1

    if node_groups:
        api_specs[0]["node_groups"] = node_groups

    api_name = api_specs[0]["name"]
    client.deploy(api_spec=api_specs[0])

    try:
        endpoint_override = f"http://localhost:8888/tasks/{api_name}" if local_operator else None
        assert endpoint_ready(
            client=client,
            api_name=api_name,
            timeout=deploy_timeout,
            endpoint_override=endpoint_override,
        ), f"api {api_name} not ready"

        response = None
        for _ in range(retry_attempts + 1):
            response = request_task(client, api_name, local_operator=local_operator)
            if response.status_code == HTTPStatus.OK:
                break

            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()

        job_id = job_spec["job_id"]
        endpoint_override = (
            f"http://localhost:8888/tasks/{api_name}?jobID={job_id}"
            if local_operator else None)
        assert job_done(
            client=client,
            api_name=api_name,
            job_id=job_spec["job_id"],
            timeout=job_timeout,
            endpoint_override=endpoint_override,
        ), f"task job did not succeed (api_name: {api_name}, job_id: {job_spec['job_id']})"

    except:
        # best effort
        try:
            api_info = client.get_api(api_name)
            printer(json.dumps(api_info, indent=2))

            job_status = client.get_job(api_name, job_spec["job_id"])
            printer(json.dumps(job_status, indent=2))
            td.Thread(
                target=lambda: stream_job_logs(client, api_name, job_spec["job_id"]),
                daemon=True,
            ).start()
            time.sleep(5)
        except:
            pass
        raise

    finally:
        delete_apis(client, [api_name])
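
The local_operator flag implies request_task can bypass the cluster endpoint and talk to a port-forwarded operator; a sketch inferred from the override URLs above, where the empty JSON body is an assumption:

import requests

import cortex as cx

def request_task(client: cx.Client, api_name: str, local_operator: bool = False):
    if local_operator:
        # hit the operator directly, matching the endpoint_override shape above
        endpoint = f"http://localhost:8888/tasks/{api_name}"
    else:
        endpoint = client.get_api(api_name)["endpoint"]
    return requests.post(endpoint, json={})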
Example #8
def test_batch_api(
    printer: Callable,
    client: cx.Client,
    api: str,
    test_s3_path: str,
    deploy_timeout: Optional[int] = None,
    job_timeout: Optional[int] = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex_cpu.yaml",
    node_groups: Optional[List[str]] = None,
    local_operator: bool = False,
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)
    assert len(api_specs) == 1

    if node_groups:
        api_specs[0]["node_groups"] = node_groups

    api_name = api_specs[0]["name"]
    client.deploy(api_spec=api_specs[0])

    try:
        endpoint_override = f"http://localhost:8888/batch/{api_name}" if local_operator else None
        assert endpoint_ready(
            client=client,
            api_name=api_name,
            timeout=deploy_timeout,
            endpoint_override=endpoint_override,
        ), f"api {api_name} not ready"

        with open(str(api_dir / "sample.json")) as f:
            payload = json.load(f)

        response = None
        for _ in range(retry_attempts + 1):
            response = request_batch_prediction(
                client,
                api_name,
                item_list=payload,
                batch_size=2,
                config={"dest_s3_dir": test_s3_path},
                local_operator=local_operator,
            )
            if response.status_code == HTTPStatus.OK:
                break

            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()

        # monitor job progress
        job_id = job_spec["job_id"]
        endpoint_override = (
            f"http://localhost:8888/batch/{api_name}?jobID={job_id}"
            if local_operator else None)
        assert job_done(
            client=client,
            api_name=api_name,
            job_id=job_id,
            timeout=job_timeout,
            endpoint_override=endpoint_override,
        ), f"job did not succeed (api_name: {api_name}, job_id: {job_spec['job_id']})"

    except:
        # best effort
        try:
            api_info = client.get_api(api_name)
            printer(json.dumps(api_info, indent=2))

            job_status = client.get_job(api_name, job_spec["job_id"])
            printer(json.dumps(job_status, indent=2))

            td.Thread(
                target=lambda: stream_job_logs(client, api_name, job_spec["job_id"]),
                daemon=True,
            ).start()
            time.sleep(5)
        except:
            pass
        raise
    finally:
        delete_apis(client, [api_name])
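
A hypothetical invocation of this last variant against a port-forwarded operator; the environment name, API directory, S3 path, node group, and timeouts are all illustrative:

import cortex as cx

test_batch_api(
    printer=print,
    client=cx.client("aws"),
    api="batch/sum",
    test_s3_path="s3://my-test-bucket/test-results",
    deploy_timeout=120,
    job_timeout=300,
    retry_attempts=3,
    node_groups=["ng-cpu"],
    local_operator=True,
)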