def test11_image1(self) -> None:
    """Push image alpine:latest to the cluster.

    After the push, the master details are polled until "alpine_latest" is listed
    in their "image_files" entry.

    Returns:
        None.
    """
    # Push the image through the CLI.
    push_command = f"maro grass image push {self.cluster_name} --debug --image-name alpine:latest"
    Subprocess.interactive_run(command=push_command)
    self._gracefully_wait()

    # Poll for at most 120s, sleeping 10s after every failed check.
    deadline = time.time() + 120
    image_present = False
    while not image_present and deadline >= time.time():
        try:
            details = self._get_master_details()
            self.assertIn("alpine_latest", details["image_files"])
        except AssertionError:
            time.sleep(10)
        else:
            image_present = True
    self.assertTrue(image_present)
def test20_train_env_provision(self):
    """Build the maro_runtime_cpu:test image locally and push it to the cluster.

    After the push, the node details are polled until every listed node has
    "maro_runtime_cpu_test" in its "image_files" entry.
    """
    # Build the docker image from the MARO package sources.
    build_command = (
        f"docker build -f {self.maro_pkg_path}/docker_files/cpu.runtime.source.df -t maro_runtime_cpu:test "
        f"{self.maro_pkg_path}"
    )
    Subprocess.run(command=build_command)

    # Push the freshly built image through the CLI.
    push_command = f"maro grass image push {self.cluster_name} --debug --image-name maro_runtime_cpu:test"
    Subprocess.interactive_run(command=push_command)

    # Poll for at most 1000s, sleeping 10s after every failed check.
    deadline = time.time() + 1000
    loaded_everywhere = False
    while not loaded_everywhere and deadline >= time.time():
        try:
            for details in self._list_nodes_details():
                self.assertIn("maro_runtime_cpu_test", details["image_files"])
        except AssertionError:
            time.sleep(10)
        else:
            loaded_everywhere = True
    self.assertTrue(loaded_everywhere)
def test16_start(self) -> None:
    """Start one Standard_D2s_v3.

    Two nodes should be listed, both in running state, and both should have
    loaded the images alpine:latest and ubuntu:latest.

    Returns:
        None.
    """
    # Start the node through the CLI.
    start_command = f"maro grass node start {self.cluster_name} --debug Standard_D2s_v3 1"
    Subprocess.interactive_run(command=start_command)
    self._gracefully_wait()

    # Poll for at most 120s, sleeping 10s after every failed check.
    deadline = time.time() + 120
    state_reached = False
    while not state_reached and deadline >= time.time():
        try:
            all_nodes = self._list_nodes_details()
            self.assertEqual(len(all_nodes), 2)
            for details in all_nodes:
                self.assertEqual(details["state"]["status"], NodeStatus.RUNNING)
                self.assertIn("alpine_latest", details["image_files"])
                self.assertIn("ubuntu_latest", details["image_files"])
        except AssertionError:
            time.sleep(10)
        else:
            state_reached = True
    self.assertTrue(state_reached)
def remote_join_cluster(
    node_username: str,
    node_hostname: str,
    node_ssh_port: int,
    master_private_ip_address: str,
    master_api_server_port: int,
    deployment_path: str
) -> None:
    """Remote join cluster.

    Install required runtime env first,
    then download the /lib/scripts/node/join_cluster.py from master_api_server, and execute remotely.

    Args:
        node_username (str): username of the MARO Node VM.
        node_hostname (str): hostname of the MARO Node VM.
        node_ssh_port (int): ssh port of the MARO Node VM.
        master_private_ip_address (str): private ip address of the MARO Master VM,
            (master and nodes must in the same virtual network).
        master_api_server_port (int): port of the master_api_server.
        deployment_path (str): path of the join_cluster_deployment.

    Returns:
        None.
    """
    join_script_url = f"http://{master_private_ip_address}:{master_api_server_port}/v1/joinClusterScript"
    # curl issues a GET by default. A bare "GET" word on the command line is not a
    # method selector: curl treats it as one more URL and tries to fetch it before
    # the real one, so it must not be passed.
    command = (
        f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname} "
        "'export DEBIAN_FRONTEND=noninteractive; "
        "sudo -E apt update; "
        "sudo -E apt install -y python3-pip; "
        "pip3 install deepdiff redis pyyaml; "
        f"curl -s {join_script_url} | "
        f"python3 - {deployment_path}'"
    )
    Subprocess.interactive_run(command=command)
def test14_stop(self) -> None:
    """Stop one Standard_D2s_v3.

    Two nodes should be listed: exactly one in running state and exactly one in
    stopped state.

    Returns:
        None.
    """
    # Stop the node through the CLI.
    stop_command = f"maro grass node stop {self.cluster_name} --debug Standard_D2s_v3 1"
    Subprocess.interactive_run(command=stop_command)
    self._gracefully_wait()

    # Poll for at most 120s, sleeping 10s after every failed check.
    deadline = time.time() + 120
    state_reached = False
    while not state_reached and deadline >= time.time():
        try:
            all_nodes = self._list_nodes_details()
            self.assertEqual(len(all_nodes), 2)
            statuses = [details["state"]["status"] for details in all_nodes]
            running_total = sum(1 for status in statuses if status == NodeStatus.RUNNING)
            stopped_total = sum(1 for status in statuses if status == NodeStatus.STOPPED)
            self.assertEqual(running_total, 1)
            self.assertEqual(stopped_total, 1)
        except AssertionError:
            time.sleep(10)
        else:
            state_reached = True
    self.assertTrue(state_reached)
def test11_node1(self) -> None:
    """Scale node spec Standard_D2s_v3 to 1.

    Exactly one node should be listed and it should be in running state.

    Returns:
        None.
    """
    # Scale the node spec through the CLI.
    scale_command = f"maro grass node scale {self.cluster_name} --debug Standard_D2s_v3 1"
    Subprocess.interactive_run(command=scale_command)
    self._gracefully_wait()

    # Poll for at most 120s, sleeping 10s after every failed check.
    deadline = time.time() + 120
    state_reached = False
    while not state_reached and deadline >= time.time():
        try:
            all_nodes = self._list_nodes_details()
            self.assertEqual(len(all_nodes), 1)
            for details in all_nodes:
                self.assertEqual(details["state"]["status"], NodeStatus.RUNNING)
        except AssertionError:
            time.sleep(10)
        else:
            state_reached = True
    self.assertTrue(state_reached)
def test12_join_cluster(self) -> None:
    """Join a node to cluster.

    Exactly one node should be listed, in running state, with "alpine_latest"
    among its image files.

    Returns:
        None.
    """
    # Join the node through the CLI.
    join_command = f"maro grass node join --debug {self.join_cluster_deployment_path}"
    Subprocess.interactive_run(command=join_command)
    self._gracefully_wait()

    # Poll for at most 120s, sleeping 10s after every failed check.
    deadline = time.time() + 120
    state_reached = False
    while not state_reached and deadline >= time.time():
        try:
            all_nodes = self._list_nodes_details()
            self.assertEqual(len(all_nodes), 1)
            for details in all_nodes:
                self.assertEqual(details["state"]["status"], NodeStatus.RUNNING)
                self.assertIn("alpine_latest", details["image_files"])
        except AssertionError:
            time.sleep(10)
        else:
            state_reached = True
    self.assertTrue(state_reached)
def test13_image2(self) -> None:
    """Push image ubuntu:latest to the cluster.

    Exactly one node should be listed; every node in running state should have
    loaded both alpine:latest and ubuntu:latest.

    Returns:
        None.
    """
    # Push the image through the CLI.
    push_command = f"maro grass image push {self.cluster_name} --debug --image-name ubuntu:latest"
    Subprocess.interactive_run(command=push_command)
    self._gracefully_wait()

    # Poll for at most 120s, sleeping 10s after every failed check.
    deadline = time.time() + 120
    state_reached = False
    while not state_reached and deadline >= time.time():
        try:
            all_nodes = self._list_nodes_details()
            self.assertEqual(len(all_nodes), 1)
            # Only running nodes are expected to hold the images.
            running_nodes = [
                details for details in all_nodes
                if details["state"]["status"] == NodeStatus.RUNNING
            ]
            for details in running_nodes:
                self.assertIn("alpine_latest", details["image_files"])
                self.assertIn("ubuntu_latest", details["image_files"])
        except AssertionError:
            time.sleep(10)
        else:
            state_reached = True
    self.assertTrue(state_reached)
def test10_create(self) -> None:
    """Create the k8s cluster and check the initial node details.

    The node details must contain "Standard_D2s_v3" with the value 1.
    """
    # Create the cluster through the CLI.
    Subprocess.interactive_run(command=f"maro k8s create --debug {self.create_deployment_path}")

    # Validate what the cluster reports.
    details = self._get_node_details()
    self.assertIn("Standard_D2s_v3", details)
    self.assertEqual(details["Standard_D2s_v3"], 1)
def test_1_azcopy_small_files_to_remote(self) -> None:
    """Copy the local small-files folder to the cluster file share with azcopy."""
    sas = self.executor._check_and_get_account_sas()
    # Destination inside the file share, authenticated by the SAS query string.
    share_root = f"https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs"
    destination = f"{share_root}/{self.test_id}/test_1_azcopy_small_files_to_remote/?{sas}"
    copy_command = f"azcopy copy '{self.local_small_files_path}' '{destination}' --recursive=True"
    Subprocess.interactive_run(command=copy_command)
def test12_image(self) -> None:
    """Push alpine:latest to the k8s cluster and check it appears in the image list."""
    # Push the image through the CLI.
    push_command = f"maro k8s image push {self.cluster_name} --debug --image-name alpine:latest"
    Subprocess.interactive_run(command=push_command)

    # The list command's output is parsed as a Python literal.
    listing = Subprocess.run(command=f"maro k8s image list {self.cluster_name}")
    self.assertIn("alpine", ast.literal_eval(listing))
def test_1_rsync_small_files_to_remote(self) -> None:
    """Create the remote target folder over ssh, then rsync the small files into it."""
    remote_host = f"{self.admin_username}@{self.master_public_ip_address}"
    remote_dir = f"~/test/{self.test_id}/test_1_rsync_small_files_to_remote"

    # Make sure the target folder exists on the remote side.
    mkdir_command = f"ssh -o StrictHostKeyChecking=no {remote_host} 'mkdir -p {remote_dir}'"
    _ = Subprocess.run(command=mkdir_command)

    # Transfer the folder.
    rsync_command = (
        "rsync -e 'ssh -o StrictHostKeyChecking=no' -az -r "
        f"{self.local_small_files_path} {remote_host}:{remote_dir}"
    )
    Subprocess.interactive_run(command=rsync_command)
def test20_train_env_provision(self):
    """Build the maro_runtime_cpu image locally and push it to the k8s cluster."""
    # Build the docker image from the MARO package sources.
    build_command = (
        f"docker build -f {self.maro_pkg_path}/docker_files/cpu.runtime.source.df -t maro_runtime_cpu "
        f"{self.maro_pkg_path}"
    )
    Subprocess.run(command=build_command)

    # Push the freshly built image through the CLI.
    push_command = f"maro k8s image push {self.cluster_name} --debug --image-name maro_runtime_cpu"
    Subprocess.interactive_run(command=push_command)
def local_leave_cluster() -> None:
    """Local leave cluster.

    Runs ~/.maro-local/scripts/activate_leave_cluster.py with python3 on this machine.

    Returns:
        None.
    """
    Subprocess.interactive_run(command="python3 ~/.maro-local/scripts/activate_leave_cluster.py")
def tearDownClass(cls) -> None:
    """Delete the k8s cluster, print the collected timings and remove the tmp test folder."""
    # Delete cluster
    Subprocess.interactive_run(command=f"maro k8s delete --debug {cls.cluster_name}")

    # Print result
    timing_report = json.dumps(TEST_TO_TIME, indent=4, sort_keys=True)
    print(timing_report)

    # Delete tmp test folder
    test_folder = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}")
    shutil.rmtree(test_folder)
def test_2_rsync_small_files_to_local(self) -> None:
    """Rsync the previously uploaded folder back and check README.md arrived locally."""
    local_dir = f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_rsync_small_files_to_local"
    remote_source = (
        f"{self.admin_username}@{self.master_public_ip_address}:"
        f"~/test/{self.test_id}/test_1_rsync_small_files_to_remote"
    )

    # Make sure the local target folder exists.
    _ = Subprocess.run(command=f"mkdir -p {local_dir}")

    # Transfer the folder.
    rsync_command = f"rsync -e 'ssh -o StrictHostKeyChecking=no' -az -r {remote_source} {local_dir}"
    Subprocess.interactive_run(command=rsync_command)

    # A known file from the uploaded folder must now exist locally.
    readme_path = os.path.expanduser(
        f"{local_dir}/test_1_rsync_small_files_to_remote/small_files/README.md"
    )
    self.assertTrue(os.path.exists(readme_path))
def test11_node(self) -> None:
    """Scale Standard_D4s_v3 to 1 and check the node details.

    The node details must contain both "Standard_D2s_v3" and "Standard_D4s_v3",
    each with the value 1.
    """
    # Scale the node spec through the CLI.
    scale_command = f"maro k8s node scale {self.cluster_name} --debug Standard_D4s_v3 1"
    Subprocess.interactive_run(command=scale_command)

    # Validate what the cluster reports: presence first, then the values.
    details = self._get_node_details()
    expected_sizes = ("Standard_D2s_v3", "Standard_D4s_v3")
    for node_size in expected_sizes:
        self.assertIn(node_size, details)
    for node_size in expected_sizes:
        self.assertEqual(details[node_size], 1)
def test_2_tar_ssh_small_files_to_local(self) -> None:
    """Stream the uploaded folder back through tar over ssh and check README.md arrived."""
    local_dir = f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_tar_ssh_small_files_to_local"

    # Make sure the local target folder exists.
    _ = Subprocess.run(command=f"mkdir -p {local_dir}")

    # Split the remote path so tar can be run from the parent folder.
    remote_parent, remote_name = os.path.split(
        f"~/test/{self.test_id}/test_1_tar_ssh_small_files_to_remote"
    )
    remote_host = f"{self.admin_username}@{self.master_public_ip_address}"
    transfer_command = (
        f"ssh {remote_host} 'tar cf - -C {remote_parent} {remote_name}' | "
        f"tar xf - -C {local_dir}"
    )
    Subprocess.interactive_run(command=transfer_command)

    # A known file from the uploaded folder must now exist locally.
    readme_path = os.path.expanduser(
        f"{local_dir}/test_1_tar_ssh_small_files_to_remote/small_files/README.md"
    )
    self.assertTrue(os.path.exists(readme_path))
def test_1_tar_ssh_small_files_to_remote(self) -> None:
    """Create the remote target folder, then stream the small files into it via tar over ssh."""
    remote_host = f"{self.admin_username}@{self.master_public_ip_address}"
    remote_dir = f"~/test/{self.test_id}/test_1_tar_ssh_small_files_to_remote"

    # Make sure the target folder exists on the remote side.
    mkdir_command = f"ssh -o StrictHostKeyChecking=no {remote_host} 'mkdir -p {remote_dir}'"
    _ = Subprocess.run(command=mkdir_command)

    # Split the local path so tar can be run from the parent folder.
    local_parent, local_name = os.path.split(self.local_small_files_path)
    transfer_command = (
        f"tar cf - -C {local_parent} {local_name} | "
        f"ssh {remote_host} 'tar xf - -C {remote_dir}'"
    )
    Subprocess.interactive_run(command=transfer_command)
def test_1_azcopy_tar_small_files_to_remote(self) -> None:
    """Tar the small files locally, upload the archive with azcopy and untar it in the pod."""
    kubectl_prefix = f"kubectl exec -i {self.pod_name} -- "
    remote_dir = f"/mnt/maro/{self.test_id}/test_1_azcopy_tar_small_files_to_remote"

    # Create the remote folder.
    Subprocess.interactive_run(command=f"{kubectl_prefix}mkdir -p {remote_dir}")

    # Pack the local folder into an archive with a random 8-character name.
    local_parent, local_name = os.path.split(self.local_small_files_path)
    tar_file_name = uuid.uuid4().hex[:8]
    archive_path = f"{GlobalPaths.MARO_TEST}/{self.test_id}/tar/{tar_file_name}"
    Subprocess.interactive_run(command=f"tar cf {archive_path} -C {local_parent} {local_name}")

    # Upload the archive with azcopy.
    sas = self.executor._check_and_get_account_sas()
    local_path = os.path.expanduser(archive_path)
    destination = (
        f"https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs"
        f"/tar/{tar_file_name}?{sas}"
    )
    Subprocess.interactive_run(command=f"azcopy copy '{local_path}' '{destination}' --recursive=True")

    # Unpack the archive inside the pod.
    untar_command = f"{kubectl_prefix}tar xf /mnt/maro/tar/{tar_file_name} -C {remote_dir}"
    Subprocess.interactive_run(command=untar_command)
def test_1_kubectl_exec_small_files_to_remote(self) -> None:
    """Create the folder in the pod, then stream the small files into it via tar and kubectl exec."""
    kubectl_prefix = f"kubectl exec -i {self.pod_name} -- "
    remote_dir = f"/mnt/maro/{self.test_id}/test_1_kubectl_exec_small_files_to_remote"

    # Create the remote folder.
    Subprocess.interactive_run(command=f"{kubectl_prefix}mkdir -p {remote_dir}")

    # Split the local path so tar can be run from the parent folder.
    local_parent, local_name = os.path.split(self.local_small_files_path)
    transfer_command = (
        f"tar cf - -C {local_parent} {local_name} | "
        f"{kubectl_prefix}tar xf - -C {remote_dir}"
    )
    Subprocess.interactive_run(command=transfer_command)
def get_static_info() -> dict:
    """Get static resource information about the local environment.

    Returns:
        dict: resource summary with the keys
            "cpu": value of psutil.cpu_count().
            "total_memory": psutil's total memory divided by 1024**2, rounded to 2 decimals.
            "memory": psutil's free memory divided by 1024**2, rounded to 2 decimals.
            "gpu": number of GPUs listed by nvidia-smi, or 0 when the query or its parsing fails.
            "gpu_name" / "gpu_memory": per-GPU lists of strings, present only when the
                nvidia-smi query succeeded.
    """
    static_info = {}
    static_info["cpu"] = psutil.cpu_count()

    memory = psutil.virtual_memory()
    static_info["total_memory"] = round(float(memory.total) / (1024**2), 2)
    static_info["memory"] = round(float(memory.free) / (1024**2), 2)

    gpu_static_command = "nvidia-smi --query-gpu=name,memory.total --format=csv,noheader,nounits"
    try:
        return_str = Subprocess.run(command=gpu_static_command)
        gpu_names = []
        gpu_memories = []
        for line in return_str.splitlines():
            # Skip blank lines (e.g. from a trailing newline) so they neither break
            # the unpacking below nor distort the GPU count.
            if not line.strip():
                continue
            # Split on the last separator only: the memory column is always last.
            name, total_memory = line.rsplit(", ", 1)
            gpu_names.append(name)
            gpu_memories.append(total_memory)
    except Exception:
        # Best effort: without a usable nvidia-smi the machine is reported as GPU-less.
        static_info["gpu"] = 0
    else:
        # Publish the GPU fields only once every line parsed, so the count always
        # matches the two lists.
        static_info["gpu"] = len(gpu_names)  # (int) logical number
        static_info["gpu_name"] = gpu_names
        static_info["gpu_memory"] = gpu_memories

    return static_info
def pull_data(self, local_dir: str, remote_path: str) -> None:
    """Pull remote AFS service data to local folder via azcopy.

    The local folder is created if it does not exist yet.

    Args:
        local_dir (str): path of the local folder.
        remote_path (str): path of the remote data, must start with '/'.

    Returns:
        None.

    Raises:
        FileOperationError: if the normalized remote path does not start with '/'.
    """
    # Get sas
    sas = self._check_and_get_account_sas()

    # Normalize both ends of the transfer.
    expanded_local_dir = os.path.expanduser(local_dir)
    remote_source = PathConvertor.build_path_without_trailing_slash(remote_path)
    target_dir = PathConvertor.build_path_with_trailing_slash(expanded_local_dir)
    os.makedirs(target_dir, exist_ok=True)

    if not remote_source.startswith("/"):
        raise FileOperationError(f"Invalid remote path: {remote_source}\nShould be started with '/'")

    # Pull data
    source_url = (
        f"https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs{remote_source}?{sas}"
    )
    copy_command = f"azcopy copy '{source_url}' '{target_dir}' --recursive=True"
    _ = Subprocess.run(command=copy_command)
def get_resource_group(resource_group: str) -> dict:
    """Return the parsed output of `az group show` for the given resource group.

    Args:
        resource_group (str): name passed to `az group show --name`.

    Returns:
        dict: the JSON-decoded command output, or an empty dict when the command
            raises CommandExecutionError.
    """
    try:
        raw_output = Subprocess.run(command=f"az group show --name {resource_group}")
        group_details = json.loads(raw_output)
    except CommandExecutionError:
        group_details = {}
    return group_details
def tearDownClass(cls) -> None:
    """Print the collected timings, then delete the resource group, tmp folder and test image."""
    # Print result.
    timing_report = json.dumps(cls.test_func_to_time, indent=4, sort_keys=True)
    print(timing_report)

    # Delete resource group.
    AzureController.delete_resource_group(resource_group=cls.resource_group)

    # Delete tmp test folder.
    test_folder = f"{GlobalPaths.ABS_MARO_TEST}/{cls.test_id}"
    shutil.rmtree(test_folder)

    # Delete docker image; a failing removal command is deliberately ignored.
    try:
        Subprocess.run(command="docker rmi maro_runtime_cpu:test")
    except CommandExecutionError:
        pass
def scale_nodepool(resource_group: str, aks_name: str, nodepool_name: str, node_count: int) -> None:
    """Run `az aks nodepool scale` for the given node pool.

    Args:
        resource_group (str): value for `-g`.
        aks_name (str): value for `--cluster-name`.
        nodepool_name (str): value for `--name`.
        node_count (int): value for `--node-count`.

    Returns:
        None.
    """
    command_parts = [
        "az aks nodepool scale",
        f"-g {resource_group}",
        f"--cluster-name {aks_name}",
        f"--name {nodepool_name}",
        f"--node-count {node_count}",
    ]
    _ = Subprocess.run(command=" ".join(command_parts))
def add_nodepool(resource_group: str, aks_name: str, nodepool_name: str, node_count: int, node_size: str) -> None:
    """Run `az aks nodepool add` for a new node pool.

    Args:
        resource_group (str): value for `-g`.
        aks_name (str): value for `--cluster-name`.
        nodepool_name (str): value for `--name`.
        node_count (int): value for `--node-count`.
        node_size (str): value for `--node-vm-size`.

    Returns:
        None.
    """
    command_parts = [
        "az aks nodepool add",
        f"-g {resource_group}",
        f"--cluster-name {aks_name}",
        f"--name {nodepool_name}",
        f"--node-count {node_count}",
        f"--node-vm-size {node_size}",
    ]
    _ = Subprocess.run(command=" ".join(command_parts))
def remote_leave_cluster(node_username: str, node_hostname: str, node_ssh_port: int) -> None:
    """Remote leave cluster.

    Runs ~/.maro-local/scripts/activate_leave_cluster.py with python3 on the node over ssh.

    Args:
        node_username (str): username of the MARO Node VM.
        node_hostname (str): hostname of the MARO Node VM.
        node_ssh_port (int): ssh port of the MARO Node VM.

    Returns:
        None.
    """
    ssh_prefix = f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname}"
    remote_script = "python3 ~/.maro-local/scripts/activate_leave_cluster.py"
    Subprocess.interactive_run(command=f"{ssh_prefix} '{remote_script}'")
def remote_delete_master(master_username: str, master_hostname: str, master_ssh_port: int) -> None:
    """Remote delete MARO Master.

    Runs {GlobalPaths.MARO_LOCAL}/scripts/delete_master.py with python3 on the master over ssh.

    Args:
        master_username (str): username of the MARO Master VM.
        master_hostname (str): hostname of the MARO Master VM.
        master_ssh_port (int): ssh port of the MARO Master VM.

    Returns:
        None.
    """
    ssh_prefix = f"ssh -o StrictHostKeyChecking=no -p {master_ssh_port} {master_username}@{master_hostname}"
    remote_script = f"python3 {GlobalPaths.MARO_LOCAL}/scripts/delete_master.py"
    Subprocess.interactive_run(command=f"{ssh_prefix} '{remote_script}'")
def remote_init_build_node_image_vm(node_username: str, node_hostname: str, node_ssh_port: int) -> None:
    """Remote init Build Node Image VM.

    Runs ~/init_build_node_image_vm.py with python3 on the VM over ssh.

    Args:
        node_username (str): username of the vm.
        node_hostname (str): hostname of the vm.
        node_ssh_port (int): ssh port of the vm.

    Returns:
        None.
    """
    ssh_prefix = f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname}"
    remote_script = "python3 ~/init_build_node_image_vm.py"
    Subprocess.interactive_run(command=f"{ssh_prefix} '{remote_script}'")