def test_healthy_vm_list_client_cluster(self):
    """Client workers resolved from an explicit `vms` list match the fakes.

    Four RUNNING fake instances are registered with the mocked compute
    service, their names are passed to `ClusterResolver` through `vms`, and
    `get_client_workers()` must yield one matching `ClientWorker` for each
    of them (order is not significant).
    """
    # Arrange
    fixtures = [
        ('fake-ig-{}'.format(letter), '10.0.0.{}'.format(octet))
        for letter, octet in zip('abcd', '0123')
    ]
    list_instances_map = {}
    instance_resp_map = {}
    for hostname, address in fixtures:
        instance_resp_map[hostname] = gen_fake_instances_get_entry(
            hostname, 'n1-standard-16', address, 'RUNNING')
    compute_service = build_mock_compute_service(instance_resp_map,
                                                 list_instances_map)
    noop_tpu_service = build_mock_tpu_service({})
    self.mock_discovery.side_effect = build_mock_services_fn(
        compute_service, noop_tpu_service)

    # Act
    vms = [hostname for hostname, _ in fixtures]
    resolver = ClusterResolver(['fake-tpu'], vms=vms)
    vm_cluster = resolver.get_client_workers()

    # Assert
    expected = []
    for hostname, address in fixtures:
        expected.append(
            ClientWorker(
                internal_ip=address,
                machine_type='n1-standard-16',
                zone='fake-zone',
                hostname=hostname))
    self.assertCountEqual(expected, vm_cluster)
def test_healthy_sea_service_cluster(self):
    """256 READY/HEALTHY single-IP v3-8 TPUs yield 256 service workers.

    Each fake TPU `fake-tpu-<i>` exposes the single address `10.0.0.<i>`;
    `get_service_workers()` must return one matching `ServiceWorker` per
    TPU (order is not significant).
    """
    # Arrange
    tpu_resp_map = {}
    for idx in range(256):
        tpu_name = 'fake-tpu-{}'.format(idx)
        tpu_resp_map[tpu_name] = gen_fake_tpu_entry(
            'v3-8', ['10.0.0.{}'.format(idx)],
            tpu_name,
            'READY',
            'pytorch-nightly',
            health='HEALTHY')
    noop_compute_service = build_mock_compute_service({}, {})
    tpu_service = build_mock_tpu_service(tpu_resp_map)
    self.mock_discovery.side_effect = build_mock_services_fn(
        noop_compute_service, tpu_service)

    # Act
    tpus = list(tpu_resp_map)
    resolver = ClusterResolver(tpus)
    service_workers = resolver.get_service_workers()

    # Assert
    expected = []
    for idx in range(256):
        expected.append(
            ServiceWorker(
                internal_ip='10.0.0.{}'.format(idx),
                port='8470',
                machine_type='v3-8',
                zone='fake-zone',
                sw_version='pytorch-nightly'))
    self.assertCountEqual(expected, service_workers)
def test_healthy_cluster(self):
    """A 4-instance group plus a 4-IP v3-32 pod resolve to a full Cluster.

    The mocked compute service lists four RUNNING instances in `fake-ig`
    and the mocked TPU service reports one READY/HEALTHY pod with four
    addresses; `get_cluster()` must equal the `Cluster` built from the
    corresponding client and service workers.
    """
    # Arrange: compute side.
    hostnames = ['fake-ig-' + letter for letter in 'abcd']
    group_items = []
    for hostname in hostnames:
        group_items.append(
            gen_fake_ig_list_instances_entry(hostname, 'RUNNING'))
    list_instances_map = {
        'fake-ig': {
            'kind': 'compute#instanceGroupsListInstances',
            'items': group_items,
        },
    }
    instance_resp_map = {}
    for idx, hostname in enumerate(hostnames):
        instance_resp_map[hostname] = gen_fake_instances_get_entry(
            hostname, 'n1-standard-16', '10.0.0.{}'.format(idx), 'RUNNING')
    compute_service = build_mock_compute_service(instance_resp_map,
                                                 list_instances_map)

    # Arrange: TPU side.
    pod_ips = ['10.0.0.{}'.format(idx) for idx in range(4)]
    tpu_resp_map = {
        'fake-pod':
            gen_fake_tpu_entry(
                'v3-32',
                pod_ips,
                'fake-pod',
                'READY',
                'pytorch-nightly',
                health='HEALTHY'),
    }
    tpu_service = build_mock_tpu_service(tpu_resp_map)
    self.mock_discovery.side_effect = build_mock_services_fn(
        compute_service, tpu_service)

    # Act
    tpus = list(tpu_resp_map)
    resolver = ClusterResolver(tpus)
    cluster = resolver.get_cluster()

    # Assert
    expected_client_workers = []
    for idx, hostname in enumerate(hostnames):
        expected_client_workers.append(
            ClientWorker(
                internal_ip='10.0.0.{}'.format(idx),
                machine_type='n1-standard-16',
                zone='fake-zone',
                hostname=hostname))
    expected_service_workers = []
    for idx in range(4):
        expected_service_workers.append(
            ServiceWorker(
                internal_ip='10.0.0.{}'.format(idx),
                port='8470',
                machine_type='v3-32',
                zone='fake-zone',
                sw_version='pytorch-nightly'))
    expected = Cluster(expected_client_workers, expected_service_workers)
    self.assertEqual(expected, cluster)
def test_empty_instance_group_client_cluster(self):
    """An instance group listing no instances makes resolution fail.

    `fake-ig` is mocked with an empty `items` list, and
    `_get_client_workers()` is expected to raise a `RuntimeError` whose
    message matches 'vms is empty in instance group'.
    """
    # Arrange
    empty_group = {
        'kind': 'compute#instanceGroupsListInstances',
        'items': [],
    }
    list_instances_map = {'fake-ig': empty_group}
    instance_resp_map = {}
    instance_resp_map['fake-ig-a'] = gen_fake_instances_get_entry(
        'fake-ig-a', 'n1-standard-16', '10.0.0.0', 'RUNNING')
    compute_service = build_mock_compute_service(instance_resp_map,
                                                 list_instances_map)
    noop_tpu_service = build_mock_tpu_service({})
    self.mock_discovery.side_effect = build_mock_services_fn(
        compute_service, noop_tpu_service)

    # Act
    resolver = ClusterResolver(['fake-tpu'])

    # Assert
    with self.assertRaisesRegex(RuntimeError,
                                '.*vms is empty in instance group.*'):
        resolver._get_client_workers()
def test_non_ready_sea_service_cluster(self):
    """One TPU still in CREATING state makes service resolution fail.

    Three of the four fake v3-8 TPUs are READY/HEALTHY; `fake-tpu-2` is in
    state CREATING and is generated without a `health` argument.
    `get_service_workers()` is expected to raise a `RuntimeError` matching
    'TPU fake-tpu-2 is not READY yet'.
    """
    # Arrange
    # (index, state, extra keyword arguments for gen_fake_tpu_entry)
    tpu_specs = (
        (0, 'READY', {'health': 'HEALTHY'}),
        (1, 'READY', {'health': 'HEALTHY'}),
        (2, 'CREATING', {}),  # deliberately no `health` argument
        (3, 'READY', {'health': 'HEALTHY'}),
    )
    tpu_resp_map = {}
    for idx, state, extra in tpu_specs:
        tpu_name = 'fake-tpu-{}'.format(idx)
        tpu_resp_map[tpu_name] = gen_fake_tpu_entry(
            'v3-8', ['10.0.0.{}'.format(idx)], tpu_name, state,
            'pytorch-nightly', **extra)
    noop_compute_service = build_mock_compute_service({}, {})
    tpu_service = build_mock_tpu_service(tpu_resp_map)
    self.mock_discovery.side_effect = build_mock_services_fn(
        noop_compute_service, tpu_service)

    # Act
    tpus = list(tpu_resp_map)
    resolver = ClusterResolver(tpus)

    # Assert
    with self.assertRaisesRegex(RuntimeError,
                                'TPU fake-tpu-2 is not READY yet.*'):
        resolver.get_service_workers()
def get_tpu_fn(*args, **kwargs):
    """Return a mock request whose `execute()` yields the canned TPU node.

    The node name is parsed out of the `name` keyword argument with
    `ClusterResolver._parse_resource_url(..., 'nodes')` and used as the key
    into `get_tpu_resp`, a name taken from the enclosing scope. Positional
    arguments are accepted but ignored.
    """
    canned_response = get_tpu_resp[ClusterResolver._parse_resource_url(
        kwargs['name'], 'nodes')]
    # Dotted keyword configures the child mock's return value at creation.
    return mock.MagicMock(**{'execute.return_value': canned_response})
def test_unknown_health_pod_service_cluster(self):
    """A READY pod generated without a `health` value is rejected.

    `get_service_workers()` is expected to raise a `RuntimeError` matching
    'TPU fake-pod is not HEALTHY yet'.
    """
    # Arrange
    pod_ips = []
    for idx in range(4):
        pod_ips.append('10.0.0.{}'.format(idx))
    # No `health` keyword is passed for this entry.
    tpu_resp_map = {}
    tpu_resp_map['fake-pod'] = gen_fake_tpu_entry(
        'v3-32', pod_ips, 'fake-pod', 'READY', 'pytorch-nightly')
    noop_compute_service = build_mock_compute_service({}, {})
    tpu_service = build_mock_tpu_service(tpu_resp_map)
    self.mock_discovery.side_effect = build_mock_services_fn(
        noop_compute_service, tpu_service)

    # Act
    tpus = list(tpu_resp_map)
    resolver = ClusterResolver(tpus)

    # Assert
    with self.assertRaisesRegex(RuntimeError,
                                'TPU fake-pod is not HEALTHY yet.*'):
        resolver.get_service_workers()
def test_unhealthy_client_cluster(self):
    """One instance still PROVISIONING makes client resolution fail.

    `fake-ig-b` is reported as PROVISIONING by both the instance-group
    listing and the instance GET response, while the other three instances
    are RUNNING. `_get_client_workers()` is expected to raise a
    `RuntimeError` matching 'Instance fake-ig-b is not running yet'.
    """
    # Arrange
    # (hostname, internal ip, status)
    instance_specs = (
        ('fake-ig-a', '10.0.0.0', 'RUNNING'),
        ('fake-ig-b', '10.0.0.1', 'PROVISIONING'),
        ('fake-ig-c', '10.0.0.2', 'RUNNING'),
        ('fake-ig-d', '10.0.0.3', 'RUNNING'),
    )
    group_items = []
    for hostname, _, status in instance_specs:
        group_items.append(gen_fake_ig_list_instances_entry(hostname, status))
    list_instances_map = {
        'fake-ig': {
            'kind': 'compute#instanceGroupsListInstances',
            'items': group_items,
        },
    }
    instance_resp_map = {}
    for hostname, address, status in instance_specs:
        instance_resp_map[hostname] = gen_fake_instances_get_entry(
            hostname, 'n1-standard-16', address, status)
    compute_service = build_mock_compute_service(instance_resp_map,
                                                 list_instances_map)
    noop_tpu_service = build_mock_tpu_service({})
    self.mock_discovery.side_effect = build_mock_services_fn(
        compute_service, noop_tpu_service)

    # Act
    resolver = ClusterResolver(['fake-tpu'])

    # Assert
    with self.assertRaisesRegex(RuntimeError,
                                'Instance fake-ig-b is not running yet.*'):
        resolver._get_client_workers()
def test_bad_cluster(self):
    """Three listed client instances against a four-IP pod are rejected.

    The instance group lists only instances a-c (GET responses exist for
    a-d), while the fake pod exposes four addresses. `get_cluster()` is
    expected to raise a `RuntimeError` matching 'The client_workers and
    service_workers must have a 1:1 mapping'.
    """
    # Arrange: compute side, with only three instances listed in the group.
    group_items = []
    for letter in 'abc':
        group_items.append(
            gen_fake_ig_list_instances_entry('fake-ig-' + letter, 'RUNNING'))
    list_instances_map = {
        'fake-ig': {
            'kind': 'compute#instanceGroupsListInstances',
            'items': group_items,
        },
    }
    instance_resp_map = {}
    for idx, letter in enumerate('abcd'):
        hostname = 'fake-ig-' + letter
        instance_resp_map[hostname] = gen_fake_instances_get_entry(
            hostname, 'n1-standard-16', '10.0.0.{}'.format(idx), 'RUNNING')
    compute_service = build_mock_compute_service(instance_resp_map,
                                                 list_instances_map)

    # Arrange: TPU side, a pod with four addresses.
    pod_ips = ['10.0.0.{}'.format(idx) for idx in range(4)]
    tpu_resp_map = {
        'fake-pod':
            gen_fake_tpu_entry(
                'v3-32',
                pod_ips,
                'fake-pod',
                'READY',
                'pytorch-nightly',
                health='HEALTHY'),
    }
    tpu_service = build_mock_tpu_service(tpu_resp_map)
    self.mock_discovery.side_effect = build_mock_services_fn(
        compute_service, tpu_service)

    # Act
    tpus = list(tpu_resp_map)
    resolver = ClusterResolver(tpus)

    # Assert
    with self.assertRaisesRegex(
            RuntimeError,
            'The client_workers and service_workers must have a 1:1 mapping'):
        resolver.get_cluster()