Example 1
0
    def setUp(self):
        """Persist one ModelRun and seed 100 KubeMetric rows per metric name.

        Timestamps are spread backwards from "now" so that each metric
        series is strictly ordered in time.
        """
        run_kwargs = dict(
            name="TestRun",
            num_workers=3,
            cpu_limit="1000m",
            image="Testimage",
            command="Testcommand",
            backend="mpi",
            run_on_all_nodes=False,
            gpu_enabled=False,
            light_target=True,
        )
        self.run = ModelRun(**run_kwargs)
        self.run.save()

        for step in range(100):
            for position, metric_name in enumerate(self.names):
                # Older iterations get larger offsets into the past.
                seconds_ago = (100 - step) * 10 + (10 - position)
                KubeMetric(
                    name=metric_name,
                    date=timezone.now() - dt.timedelta(seconds=seconds_ago),
                    value=random.random(),
                    metadata={},
                    cumulative=False,
                    model_run=self.run,
                ).save()
Example 2
0
    def create(self, request):
        """Create and start a new Model run

        Arguments:
            request {[Django request]} -- The request object

        Returns:
            Json -- Returns posted values
        """
        data = request.data

        image = data["image_name"]
        backend = data["backend"].lower()
        gpu_enabled = False

        # Reject invalid run names before touching anything else.
        # NOTE(review): 304 Not Modified is an unusual status for a
        # validation failure; 400 Bad Request would be conventional.
        # Confirm no client depends on 304 before changing it.
        if not is_valid_run_name(data["name"]):
            error_body = {
                "status": "ERROR",
                "message": "Invalid run name {}".format(data["name"]),
            }
            return Response(error_body, status=status.HTTP_304_NOT_MODIFIED)

        if image == "custom_image":
            # Fully user-supplied image, command and GPU flag.
            image = data["custom_image_name"]
            command = data["custom_image_command"]
            gpu_enabled = data["gpu_enabled"] == "true"
        else:
            # Known image: command and GPU capability come from settings.
            entry = settings.MLBENCH_IMAGES[image]
            command = entry[1]
            if entry[2]:
                gpu_enabled = data["gpu_enabled"] == "true"

        if backend == "custom_backend":
            backend = data["custom_backend"]
            run_all = data["run_all_nodes"] == "true"
        else:
            # Non-custom backends run on all nodes except mpi.
            run_all = backend != "mpi"

        # CPU request expressed in Kubernetes milli-CPU units.
        cpu_limit = "{}m".format(float(data["num_cpus"]) * 1000)

        run = ModelRun(
            name=data["name"],
            num_workers=data["num_workers"],
            cpu_limit=cpu_limit,
            image=image,
            command=command,
            backend=backend,
            run_on_all_nodes=run_all,
            gpu_enabled=gpu_enabled,
            light_target=data["light_target"] == "true",
        )

        run.start(run_model_job=run_model_job)

        serializer = ModelRunSerializer(run, many=False)
        return Response(serializer.data, status=status.HTTP_201_CREATED)
Example 3
0
    def test_check_available_nodes(self):
        """Tests check available nodes"""
        max_workers = int(os.environ.get("MLBENCH_MAX_WORKERS", "1"))

        def build_run(index, workers):
            # Factory for the identical ModelRun settings shared by all runs.
            return ModelRun(
                name=RUN_NAME.format(index),
                num_workers=workers,
                cpu_limit=0.1,
                image=TEST_IMAGE,
                command="sleep",
                backend="gloo",
                run_on_all_nodes=True,
                gpu_enabled=False,
                light_target=False,
            )

        # First run is marked STARTED so it occupies workers.
        run_1 = build_run(1, 4)
        run_1.state = ModelRun.STARTED
        run_1.save()

        run_2 = build_run(2, 4)
        run_2.save()
        self.assertEqual(
            check_nodes_available_for_execution(run_2),
            max_workers - run_1.num_workers >= run_2.num_workers,
        )

        # run_3 is checked without being saved first.
        run_3 = build_run(3, 1)
        self.assertEqual(
            check_nodes_available_for_execution(run_3),
            max_workers - run_1.num_workers - run_2.num_workers
            >= run_3.num_workers,
        )
Example 4
0
    def test_destroy_modelrun(self):
        """DELETE /api/runs/<pk>/ removes the run from the database."""
        # Stub out all Kubernetes-facing calls so no cluster is needed.
        with patch("api.views.delete_service", autospec=True), \
                patch("api.views.delete_statefulset", autospec=True), \
                patch("api.models.modelrun._remove_run_job", autospec=True):
            run = ModelRun(name="Run1")
            run.start()

            response = self.client.delete("/api/runs/{}/".format(run.pk))

            assert response.status_code == 204
            # Check if object was deleted in DB
            remaining = ModelRun.objects.all()
            self.assertEqual(len(remaining), 0)
Example 5
0
    def create(self, request):
        """ Create and start a new Model run

        Arguments:
            request {[Django request]} -- The request object

        Returns:
            Json -- Returns posted values
        """
        # TODO: lock table, otherwise there might be concurrency conflicts
        data = request.data

        # Only one run may be active at a time.
        if ModelRun.objects.filter(state=ModelRun.STARTED).count() > 0:
            conflict_body = {
                'status': 'Conflict',
                'message': 'There is already an active run',
            }
            return Response(conflict_body, status=status.HTTP_409_CONFLICT)

        # CPU request expressed in Kubernetes milli-CPU units.
        cpu_limit = "{}m".format(float(data['num_cpus']) * 1000)

        run = ModelRun(
            name=data['name'],
            num_workers=data['num_workers'],
            cpu_limit=cpu_limit,
            network_bandwidth_limit=data['max_bandwidth'],
        )
        run.start()

        serializer = ModelRunSerializer(run, many=False)
        return Response(serializer.data, status=status.HTTP_201_CREATED)
Example 6
0
    def _test_create_statefulset(self):
        """Tests the creation of a stateful set"""
        release_name = os.getenv("MLBENCH_KUBE_RELEASENAME")
        namespace = os.environ.get("MLBENCH_NAMESPACE")

        run = ModelRun(
            name=RUN_NAME.format(1),
            num_workers=1,
            cpu_limit=0.1,
            image=TEST_IMAGE,
            command="sleep",
            backend="gloo",
            run_on_all_nodes=True,
            gpu_enabled=False,
            light_target=False,
        )

        # Create stateful set, then wait for the cluster to act on it.
        create_statefulset(run, release_name, namespace)
        sleep(10)

        expected_name = "{1}-mlbench-worker-{0}".format(
            release_name, run.name).lower()

        stateful_sets = client.AppsV1Api().list_namespaced_stateful_set(
            namespace).items
        self.assertEqual(len(stateful_sets), 1)

        stateful_set = stateful_sets[0]
        self.assertEqual(stateful_set.metadata.name, expected_name)
        # All replica counters must report exactly one worker.
        for replica_count in (
            stateful_set.status.current_replicas,
            stateful_set.status.ready_replicas,
            stateful_set.status.replicas,
        ):
            self.assertEqual(replica_count, 1)

        containers = stateful_set.spec.template.spec.containers
        self.assertEqual(len(containers), 1)
        self.assertEqual(containers[0].image, TEST_IMAGE)

        pods = client.CoreV1Api().list_namespaced_pod(namespace)
        self.assertEqual(len(pods.items), 1)
Example 7
0
class KubeMetricTests(APITestCase):
    """API tests for retrieving the KubeMetric series of a ModelRun."""

    # Metric names seeded for every iteration in setUp.
    names = [
        "start",
        "batch_load",
        "init",
        "fwd_pass",
        "comp_loss",
        "backprop",
        "agg",
        "opt_step",
        "comp_metrics",
        "end",
    ]

    def setUp(self):
        """Persist one ModelRun and seed 100 KubeMetric rows per metric name."""
        self.run = ModelRun(
            name="TestRun",
            num_workers=3,
            cpu_limit="1000m",
            image="Testimage",
            command="Testcommand",
            backend="mpi",
            run_on_all_nodes=False,
            gpu_enabled=False,
            light_target=True,
        )
        self.run.save()

        for step in range(100):
            for position, metric_name in enumerate(self.names):
                # Older iterations get larger offsets into the past.
                seconds_ago = (100 - step) * 10 + (10 - position)
                KubeMetric(
                    name=metric_name,
                    date=timezone.now() - dt.timedelta(seconds=seconds_ago),
                    value=random.random(),
                    metadata={},
                    cumulative=False,
                    model_run=self.run,
                ).save()

    def test_get_metric(self):
        """
        Ensure we can get metrics
        """
        # (url, expected series length) for plain, summarized and last-n views.
        expectations = [
            ("/api/metrics/{}/?metric_type=run".format(self.run.id), 100),
            ("/api/metrics/{}/?metric_type=run&summarize=10".format(
                self.run.id), 10),
            ("/api/metrics/{}/?metric_type=run&last_n=5".format(
                self.run.id), 5),
        ]

        for url, expected_length in expectations:
            response = self.client.get(url, format="json")
            self.assertEqual(response.status_code, status.HTTP_200_OK)
            payload = response.json()

            for metric_name in self.names:
                assert len(payload[metric_name]) == expected_length