def create(self, request): """Create and start a new Model run Arguments: request {[Django request]} -- The request object Returns: Json -- Returns posted values """ d = request.data image = d["image_name"] backend = d["backend"].lower() gpu = False if not is_valid_run_name(d["name"]): return Response( { "status": "ERROR", "message": "Invalid run name {}".format(d["name"]) }, status=status.HTTP_304_NOT_MODIFIED, ) if image == "custom_image": image = d["custom_image_name"] command = d["custom_image_command"] gpu = d["gpu_enabled"] == "true" else: entry = settings.MLBENCH_IMAGES[image] command = entry[1] if entry[2]: gpu = d["gpu_enabled"] == "true" if backend == "custom_backend": backend = d["custom_backend"] run_all = d["run_all_nodes"] == "true" else: run_all = backend != "mpi" cpu = "{}m".format(float(d["num_cpus"]) * 1000) run = ModelRun( name=d["name"], num_workers=d["num_workers"], cpu_limit=cpu, image=image, command=command, backend=backend, run_on_all_nodes=run_all, gpu_enabled=gpu, light_target=d["light_target"] == "true", ) run.start(run_model_job=run_model_job) serializer = ModelRunSerializer(run, many=False) return Response(serializer.data, status=status.HTTP_201_CREATED)
def test_check_available_nodes(self):
    """Tests check available nodes"""
    total_workers = int(os.environ.get("MLBENCH_MAX_WORKERS", "1"))

    run_1 = ModelRun(
        name=RUN_NAME.format(1),
        num_workers=4,
        cpu_limit=0.1,
        image=TEST_IMAGE,
        command="sleep",
        backend="gloo",
        run_on_all_nodes=True,
        gpu_enabled=False,
        light_target=False,
    )
    run_1.state = ModelRun.STARTED
    run_1.save()

    run_2 = ModelRun(
        name=RUN_NAME.format(2),
        num_workers=4,
        cpu_limit=0.1,
        image=TEST_IMAGE,
        command="sleep",
        backend="gloo",
        run_on_all_nodes=True,
        gpu_enabled=False,
        light_target=False,
    )
    run_2.save()

    available = check_nodes_available_for_execution(run_2)
    self.assertEqual(
        available, total_workers - run_1.num_workers >= run_2.num_workers
    )

    run_3 = ModelRun(
        name=RUN_NAME.format(3),
        num_workers=1,
        cpu_limit=0.1,
        image=TEST_IMAGE,
        command="sleep",
        backend="gloo",
        run_on_all_nodes=True,
        gpu_enabled=False,
        light_target=False,
    )

    available = check_nodes_available_for_execution(run_3)
    self.assertEqual(
        available,
        total_workers - run_1.num_workers - run_2.num_workers
        >= run_3.num_workers,
    )
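check_nodes_available_for_execution is exercised above but not defined in this excerpt. A hedged sketch consistent with the test's arithmetic follows: the candidate run fits if the cluster-wide worker budget, minus the workers claimed by every other persisted unfinished run, covers its num_workers. The FINISHED state name is an assumption.

import os

# Hypothetical sketch -- the real check_nodes_available_for_execution is not
# part of this excerpt. Matches the assertions above: workers claimed by
# other persisted, unfinished runs count against MLBENCH_MAX_WORKERS.
def check_nodes_available_for_execution(run):
    max_workers = int(os.environ.get("MLBENCH_MAX_WORKERS", "1"))

    others = ModelRun.objects.exclude(state=ModelRun.FINISHED)
    if run.pk is not None:
        # Don't count the candidate run against itself once it is saved
        others = others.exclude(pk=run.pk)

    claimed = sum(r.num_workers for r in others)
    return max_workers - claimed >= run.num_workers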
def test_destroy_modelrun(self):
    with patch("api.views.delete_service", autospec=True), patch(
        "api.views.delete_statefulset", autospec=True
    ), patch("api.models.modelrun._remove_run_job", autospec=True):
        run = ModelRun(name="Run1")
        run.start()

        response = self.client.delete("/api/runs/{}/".format(run.pk))
        assert response.status_code == 204

        # Check if object was deleted in DB
        runs = ModelRun.objects.all()
        self.assertEqual(len(runs), 0)
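The test patches out the Kubernetes teardown helpers, so for context here is a hedged sketch of what the destroy path in api.views might look like, assuming a DRF ModelViewSet where overriding perform_destroy yields the 204 the test asserts. The helper signatures (delete_service, delete_statefulset) and the environment-variable names are assumptions.

import os

# Hypothetical sketch of the viewset's destroy path; the real handler is not
# shown in this excerpt. Tears down the run's Kubernetes resources, then
# removes the database row (which the test asserts is gone afterwards).
def perform_destroy(self, instance):
    release = os.environ.get("MLBENCH_KUBE_RELEASENAME")
    namespace = os.environ.get("MLBENCH_NAMESPACE")

    # Signatures assumed; these are the helpers patched out in the test
    delete_service(instance, release, namespace)
    delete_statefulset(instance, release, namespace)

    instance.delete()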
def create(self, request): """ Create and start a new Model run Arguments: request {[Django request]} -- The request object Returns: Json -- Returns posted values """ # TODO: lock table, otherwise there might be concurrency conflicts d = request.data active_runs = ModelRun.objects.filter(state=ModelRun.STARTED) if active_runs.count() > 0: return Response({ 'status': 'Conflict', 'message': 'There is already an active run' }, status=status.HTTP_409_CONFLICT) cpu = "{}m".format(float(d['num_cpus']) * 1000) run = ModelRun( name=d['name'], num_workers=d['num_workers'], cpu_limit=cpu, network_bandwidth_limit=d['max_bandwidth'] ) run.start() serializer = ModelRunSerializer(run, many=False) return Response( serializer.data, status=status.HTTP_201_CREATED )
def _test_create_statefulset(self):
    """Tests the creation of a stateful set"""
    run = ModelRun(
        name=RUN_NAME.format(1),
        num_workers=1,
        cpu_limit=0.1,
        image=TEST_IMAGE,
        command="sleep",
        backend="gloo",
        run_on_all_nodes=True,
        gpu_enabled=False,
        light_target=False,
    )

    # Create stateful set
    create_statefulset(
        run,
        os.getenv("MLBENCH_KUBE_RELEASENAME"),
        os.environ.get("MLBENCH_NAMESPACE"),
    )

    # Wait for creation
    sleep(10)

    # Check creation
    stateful_set_name = "{1}-mlbench-worker-{0}".format(
        os.getenv("MLBENCH_KUBE_RELEASENAME"), run.name
    ).lower()

    kube_api = client.AppsV1Api()

    stateful_sets = kube_api.list_namespaced_stateful_set(
        os.environ.get("MLBENCH_NAMESPACE")
    )
    items = stateful_sets.items
    self.assertEqual(len(items), 1)

    stateful_set = items[0]
    self.assertEqual(stateful_set.metadata.name, stateful_set_name)
    self.assertEqual(stateful_set.status.current_replicas, 1)
    self.assertEqual(stateful_set.status.ready_replicas, 1)
    self.assertEqual(stateful_set.status.replicas, 1)

    containers = stateful_set.spec.template.spec.containers
    self.assertEqual(len(containers), 1)

    container = containers[0]
    self.assertEqual(container.image, TEST_IMAGE)

    core = client.CoreV1Api()
    pods = core.list_namespaced_pod(os.environ.get("MLBENCH_NAMESPACE"))
    self.assertEqual(len(pods.items), 1)
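create_statefulset itself is not part of this excerpt. A minimal sketch of what it might do with the official kubernetes Python client follows, assuming the set is named <run.name>-mlbench-worker-<release> (matching the name the test expects) and that each replica runs the configured image and command; the label keys and resource mapping are illustrative guesses.

from kubernetes import client

# Hypothetical sketch -- the real create_statefulset is not in this excerpt.
# Creates one StatefulSet with run.num_workers replicas of the worker image.
def create_statefulset(run, release_name, namespace):
    name = "{}-mlbench-worker-{}".format(run.name, release_name).lower()
    labels = {"app": "mlbench", "release": release_name, "run": run.name}

    container = client.V1Container(
        name="worker",
        image=run.image,
        command=run.command.split(),
        resources=client.V1ResourceRequirements(
            limits={"cpu": str(run.cpu_limit)}
        ),
    )

    spec = client.V1StatefulSetSpec(
        service_name=name,
        replicas=run.num_workers,
        selector=client.V1LabelSelector(match_labels=labels),
        template=client.V1PodTemplateSpec(
            metadata=client.V1ObjectMeta(labels=labels),
            spec=client.V1PodSpec(containers=[container]),
        ),
    )

    body = client.V1StatefulSet(
        metadata=client.V1ObjectMeta(name=name, labels=labels), spec=spec
    )
    client.AppsV1Api().create_namespaced_stateful_set(namespace, body)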
class KubeMetricTests(APITestCase):
    names = [
        "start",
        "batch_load",
        "init",
        "fwd_pass",
        "comp_loss",
        "backprop",
        "agg",
        "opt_step",
        "comp_metrics",
        "end",
    ]

    def setUp(self):
        self.run = ModelRun(
            name="TestRun",
            num_workers=3,
            cpu_limit="1000m",
            image="Testimage",
            command="Testcommand",
            backend="mpi",
            run_on_all_nodes=False,
            gpu_enabled=False,
            light_target=True,
        )
        self.run.save()

        for i in range(100):
            for j, name in enumerate(self.names):
                metric = KubeMetric(
                    name=name,
                    date=(
                        timezone.now()
                        - dt.timedelta(seconds=(100 - i) * 10 + (10 - j))
                    ),
                    value=random.random(),
                    metadata={},
                    cumulative=False,
                    model_run=self.run,
                )
                metric.save()

    def test_get_metric(self):
        """Ensure we can get metrics"""
        response = self.client.get(
            "/api/metrics/{}/?metric_type=run".format(self.run.id),
            format="json",
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        res = response.json()
        for name in self.names:
            assert len(res[name]) == 100

        response = self.client.get(
            "/api/metrics/{}/?metric_type=run&summarize=10".format(self.run.id),
            format="json",
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        res = response.json()
        for name in self.names:
            assert len(res[name]) == 10

        response = self.client.get(
            "/api/metrics/{}/?metric_type=run&last_n=5".format(self.run.id),
            format="json",
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        res = response.json()
        for name in self.names:
            assert len(res[name]) == 5
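The summarize and last_n query parameters asserted on above are handled by the metrics view, which is not shown here. A hedged sketch of the two reductions, assuming summarize means "average each metric's series down to N evenly sized buckets" and last_n means "keep the N most recent points"; the helper names are invented for illustration.

# Hypothetical sketch of the reductions the metrics view would perform on
# each metric's time-ordered list of values; the real view is not shown.
def summarize_series(values, buckets):
    """Average a list of floats down to `buckets` evenly sized chunks."""
    size = len(values) / buckets
    return [
        sum(chunk) / len(chunk)
        for chunk in (
            values[round(i * size):round((i + 1) * size)] for i in range(buckets)
        )
        if chunk
    ]


def last_n_series(values, n):
    """Keep only the n most recent entries."""
    return values[-n:]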