def test20_train_env_provision(self):
    """Build the MARO runtime image, push it to the grass cluster, and wait until every node loads it."""
    # Build the CPU runtime docker image from the local package sources.
    build_command = (
        f"docker build -f {self.maro_pkg_path}/docker_files/cpu.runtime.source.df -t maro_runtime_cpu:test "
        f"{self.maro_pkg_path}")
    Subprocess.run(command=build_command)

    # Push the freshly built image into the cluster.
    push_command = f"maro grass image push {self.cluster_name} --debug --image-name maro_runtime_cpu:test"
    Subprocess.interactive_run(command=push_command)

    # Poll node details until the image appears on every node, or 1000s elapse.
    deadline = time.time() + 1000
    is_loaded = False
    while not is_loaded and time.time() <= deadline:
        try:
            is_loaded = True
            for node_details in self._list_nodes_details():
                self.assertIn("maro_runtime_cpu_test", node_details["image_files"])
        except AssertionError:
            # Not yet present on some node: back off briefly before re-polling.
            is_loaded = False
            time.sleep(10)
    self.assertTrue(is_loaded)
def setUpClass(cls, file_path: str = os.path.abspath(__file__)) -> None:
    """One-time test-class setup: create test folders, render the k8s-azure
    deployment file from the template + local config, generate test fixture
    files, and create the k8s cluster.

    NOTE(review): the ``file_path`` parameter is never read — ``cls.file_path``
    is re-assigned from ``__file__`` below; the parameter looks vestigial.
    """
    # Get and set params
    GlobalParams.LOG_LEVEL = logging.DEBUG
    # Unique id so parallel/repeated runs do not collide on folders or cluster names.
    cls.test_id = uuid.uuid4().hex[:8]
    os.makedirs(os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}"), exist_ok=True)
    os.makedirs(os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}/tar"), exist_ok=True)
    cls.file_path = os.path.abspath(__file__)
    cls.dir_path = os.path.dirname(cls.file_path)
    cls.deployment_template_path = os.path.normpath(
        os.path.join(cls.dir_path, "../templates/test_k8s_azure_create.yml"))
    cls.deployment_path = os.path.expanduser(
        f"{GlobalPaths.MARO_TEST}/{cls.test_id}/test_k8s_azure_create.yml")
    cls.config_path = os.path.normpath(os.path.join(cls.dir_path, "../config.yml"))

    # Load config and save deployment: inject subscription and public key from
    # the local config into the deployment template, then persist it.
    with open(cls.deployment_template_path) as fr:
        deployment_details = yaml.safe_load(fr)
    with open(cls.config_path) as fr:
        config_details = yaml.safe_load(fr)
    if config_details["cloud/subscription"] and config_details["user/admin_public_key"]:
        deployment_details["cloud"]["subscription"] = config_details["cloud/subscription"]
        deployment_details["user"]["admin_public_key"] = config_details["user/admin_public_key"]
    else:
        raise Exception("Invalid config")
    with open(cls.deployment_path, "w") as fw:
        yaml.safe_dump(deployment_details, fw)

    # Get params from deployments
    cls.cluster_name = deployment_details["name"]

    # Init test files: a 1G sparse file (dd seek, no data written) and a cloned
    # repo used as a tree of small files.
    cls.local_big_file_path = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}/big_file")
    cls.local_small_files_path = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}/small_files")
    command = f"dd if=/dev/zero of={cls.local_big_file_path} bs=1 count=0 seek=1G"
    Subprocess.run(command=command)
    command = f"git clone [email protected]:microsoft/maro.git {cls.local_small_files_path}"
    Subprocess.run(command=command)

    # Create cluster, then cache its details and a redis pod name for the tests.
    command = f"maro k8s create --debug {cls.deployment_path}"
    Subprocess.interactive_run(command=command)
    cls.cluster_details = DetailsReader.load_cluster_details(cluster_name=cls.cluster_name)
    cls.cluster_id = cls.cluster_details["id"]
    cls.executor = K8sAksExecutor(cluster_name=cls.cluster_name)
    # Brief pause so cluster pods have time to come up before we query them.
    time.sleep(15)
    cls.pod_name = cls._get_redis_pod_name()
def test20_train_env_provision(self):
    """Build the MARO CPU runtime image and push it into the k8s cluster."""
    # Build docker image from the local package sources.
    image_build_command = (
        f"docker build -f {self.maro_pkg_path}/docker_files/cpu.runtime.source.df -t maro_runtime_cpu "
        f"{self.maro_pkg_path}")
    Subprocess.run(command=image_build_command)

    # Push the built image to the cluster registry.
    image_push_command = f"maro k8s image push {self.cluster_name} --debug --image-name maro_runtime_cpu"
    Subprocess.interactive_run(command=image_push_command)
def get_resource_group(resource_group: str) -> dict:
    """Fetch an Azure resource group's details via the az CLI.

    Returns an empty dict when the group cannot be shown (e.g. it does not exist).
    """
    show_command = f"az group show --name {resource_group}"
    try:
        output = Subprocess.run(command=show_command)
    except CommandExecutionError:
        return {}
    return json.loads(output)
def pull_data(self, local_dir: str, remote_path: str) -> None:
    """Pull remote AFS service data to local folder via azcopy.

    Args:
        local_dir (str): path of the local folder.
        remote_path (str): path of the remote data.

    Raises:
        FileOperationError: if the remote path does not start with '/'.

    Returns:
        None.
    """
    # Validate the remote path before touching the local filesystem, so an
    # invalid call does not leave an empty target folder behind.
    source_path = PathConvertor.build_path_without_trailing_slash(remote_path)
    if not source_path.startswith("/"):
        raise FileOperationError(f"Invalid remote path: {source_path}\nShould be started with '/'")

    # Get sas token for the cluster storage account.
    sas = self._check_and_get_account_sas()

    # Pull data: download the remote tree into the (created-if-missing) local folder.
    abs_local_dir = os.path.expanduser(local_dir)
    abs_target_dir = PathConvertor.build_path_with_trailing_slash(abs_local_dir)
    os.makedirs(abs_target_dir, exist_ok=True)
    copy_command = (
        "azcopy copy "
        f"'https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs{source_path}?{sas}' "
        f"'{abs_target_dir}' "
        "--recursive=True")
    _ = Subprocess.run(command=copy_command)
def get_static_info() -> dict:
    """Get static resource information about the local environment.

    Returns:
        dict: keys "cpu" (logical core count), "total_memory" / "memory"
        (MiB, total / free), "gpu" (GPU count, 0 when nvidia-smi is absent),
        and — when GPUs are found — "gpu_name" and "gpu_memory" lists.
    """
    static_info = {}
    static_info["cpu"] = psutil.cpu_count()

    memory = psutil.virtual_memory()
    static_info["total_memory"] = round(float(memory.total) / (1024 ** 2), 2)  # MiB
    static_info["memory"] = round(float(memory.free) / (1024 ** 2), 2)  # free MiB

    gpu_static_command = "nvidia-smi --query-gpu=name,memory.total --format=csv,noheader,nounits"
    try:
        return_str = Subprocess.run(command=gpu_static_command)
        # Filter blank lines (e.g. the trailing newline) instead of assuming
        # exactly one: the original iterated the raw split, so "".split(", ")
        # raised on the empty tail and the except clause wiped all GPU info.
        gpus_info = [line for line in return_str.split(os.linesep) if line.strip()]
        static_info["gpu"] = len(gpus_info)  # (int) logical number
        static_info["gpu_name"] = []
        static_info["gpu_memory"] = []
        for info in gpus_info:
            name, total_memory = info.split(", ")
            static_info["gpu_name"].append(name)
            static_info["gpu_memory"].append(total_memory)
    except Exception:
        # nvidia-smi missing or output unparsable: report no GPUs.
        static_info["gpu"] = 0
    return static_info
def scale_nodepool(resource_group: str, aks_name: str, nodepool_name: str, node_count: int) -> None:
    """Scale an AKS nodepool to the requested node count via the az CLI."""
    scale_command = " ".join([
        "az aks nodepool scale",
        f"-g {resource_group}",
        f"--cluster-name {aks_name}",
        f"--name {nodepool_name}",
        f"--node-count {node_count}",
    ])
    _ = Subprocess.run(command=scale_command)
def tearDownClass(cls) -> None:
    """Report per-test timings, then tear down Azure resources, the tmp folder, and the test image."""
    # Print result.
    print(json.dumps(cls.test_func_to_time, indent=4, sort_keys=True))

    # Delete resource group.
    AzureController.delete_resource_group(resource_group=cls.resource_group)

    # Delete tmp test folder.
    shutil.rmtree(f"{GlobalPaths.ABS_MARO_TEST}/{cls.test_id}")

    # Best-effort removal of the docker image built for this test run.
    try:
        Subprocess.run(command="docker rmi maro_runtime_cpu:test")
    except CommandExecutionError:
        pass
def add_nodepool(resource_group: str, aks_name: str, nodepool_name: str, node_count: int, node_size: str) -> None:
    """Add a new nodepool to an AKS cluster via the az CLI."""
    add_command = " ".join([
        "az aks nodepool add",
        f"-g {resource_group}",
        f"--cluster-name {aks_name}",
        f"--name {nodepool_name}",
        f"--node-count {node_count}",
        f"--node-vm-size {node_size}",
    ])
    _ = Subprocess.run(command=add_command)
def test_1_rsync_small_files_to_remote(self) -> None:
    """Create the remote target folder, then rsync the small-files tree up to the master."""
    remote_target = f"~/test/{self.test_id}/test_1_rsync_small_files_to_remote"
    mkdir_command = (
        f"ssh -o StrictHostKeyChecking=no "
        f"{self.admin_username}@{self.master_public_ip_address} "
        f"'mkdir -p {remote_target}'")
    _ = Subprocess.run(command=mkdir_command)

    rsync_command = (
        f"rsync -e 'ssh -o StrictHostKeyChecking=no' -az -r "
        f"{self.local_small_files_path} "
        f"{self.admin_username}@{self.master_public_ip_address}:{remote_target}")
    Subprocess.interactive_run(command=rsync_command)
def test12_image(self) -> None:
    """Push an image into the cluster, then verify it appears in the cluster image list."""
    # Run command.
    Subprocess.interactive_run(
        command=f"maro k8s image push {self.cluster_name} --debug --image-name alpine:latest")

    # Check validity: the CLI prints a python-literal list of image names.
    list_output = Subprocess.run(command=f"maro k8s image list {self.cluster_name}")
    images = ast.literal_eval(list_output)
    self.assertIn("alpine", images)
def _get_redis_pod_name(cls) -> str:
    """Return the name of the pod labelled app=maro-redis (implicitly None if absent)."""
    # Query all pods as JSON via kubectl.
    return_str = Subprocess.run(command="kubectl get pods -o json")
    pods_details = json.loads(return_str)["items"]

    # Scan for the MARO redis pod by its app label.
    for pod_details in pods_details:
        if pod_details["metadata"]["labels"]["app"] == "maro-redis":
            return pod_details["metadata"]["name"]
def test_2_tar_ssh_small_files_to_local(self) -> None:
    """Stream the previously-uploaded small files back through an ssh tar pipe and verify a known file."""
    local_target = f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_tar_ssh_small_files_to_local"
    _ = Subprocess.run(command=f"mkdir -p {local_target}")

    remote_source = f"~/test/{self.test_id}/test_1_tar_ssh_small_files_to_remote"
    dirname, basename = os.path.split(remote_source)
    pipe_command = (
        f"ssh {self.admin_username}@{self.master_public_ip_address} 'tar cf - -C {dirname} {basename}' | "
        f"tar xf - -C {local_target}")
    Subprocess.interactive_run(command=pipe_command)

    expected_file = os.path.expanduser(
        f"{GlobalPaths.MARO_TEST}/{self.test_id}/"
        f"test_2_tar_ssh_small_files_to_local/test_1_tar_ssh_small_files_to_remote/small_files/README.md")
    self.assertTrue(os.path.exists(expected_file))
def test_2_rsync_small_files_to_local(self) -> None:
    """Rsync the previously-uploaded small files back from the master and verify a known file."""
    local_target = f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_rsync_small_files_to_local"
    _ = Subprocess.run(command=f"mkdir -p {local_target}")

    rsync_command = (
        f"rsync -e 'ssh -o StrictHostKeyChecking=no' -az -r "
        f"{self.admin_username}@{self.master_public_ip_address}:"
        f"~/test/{self.test_id}/test_1_rsync_small_files_to_remote "
        f"{local_target}")
    Subprocess.interactive_run(command=rsync_command)

    expected_file = os.path.expanduser(
        f"{GlobalPaths.MARO_TEST}/{self.test_id}/"
        f"test_2_rsync_small_files_to_local/test_1_rsync_small_files_to_remote/small_files/README.md")
    self.assertTrue(os.path.exists(expected_file))
def push_image(self, image_name: str) -> None:
    """Push local image to the MARO Cluster.

    Args:
        image_name (str): name of the local image that loaded in the docker.

    Returns:
        None.
    """
    acr_name = f"{self.cluster_id}acr"
    remote_image_name = f"{acr_name}.azurecr.io/{image_name}"

    # Authenticate against the cluster's container registry.
    AzureController.login_acr(acr_name=acr_name)

    # Tag the local image with its registry-qualified name, then push it to ACR.
    _ = Subprocess.run(command=f"docker tag {image_name} {remote_image_name}")
    _ = Subprocess.run(command=f"docker push {remote_image_name}")
def test_1_tar_ssh_small_files_to_remote(self) -> None:
    """Create the remote folder, then stream the local small files up through an ssh tar pipe."""
    remote_dir = f"~/test/{self.test_id}/test_1_tar_ssh_small_files_to_remote"
    mkdir_command = (
        f"ssh -o StrictHostKeyChecking=no "
        f"{self.admin_username}@{self.master_public_ip_address} "
        f"'mkdir -p {remote_dir}'")
    _ = Subprocess.run(command=mkdir_command)

    local_dirname, local_basename = os.path.split(self.local_small_files_path)
    pipe_command = (
        f"tar cf - -C {local_dirname} {local_basename} | "
        f"ssh {self.admin_username}@{self.master_public_ip_address} "
        f"'tar xf - -C {remote_dir}'")
    Subprocess.interactive_run(command=pipe_command)
def __init__(self):
    """Connect to the local resource redis (starting a server if absent) and launch the resource agent."""
    self._redis_connection = redis.Redis(host="localhost", port=LocalParams.RESOURCE_REDIS_PORT)
    try:
        self._redis_connection.ping()
    except Exception:
        # Redis is not reachable: daemonize a local server on the resource port.
        _ = Subprocess.run(f"redis-server --port {str(LocalParams.RESOURCE_REDIS_PORT)} --daemonize yes")

    # Start Resource Agents (always, regardless of whether redis was already up).
    _ = subprocess.Popen(f"python {LocalPaths.MARO_RESOURCE_AGENT}", shell=True)
def template(export_path: str) -> None:
    """Export deployment template of grass mode.

    Args:
        export_path (str): location to export the templates.

    Returns:
        None.
    """
    # Copy the bundled grass deployment templates to the requested location.
    copy_command = f"cp {GrassPaths.MARO_GRASS_LIB}/deployments/external/* {export_path}"
    _ = Subprocess.run(command=copy_command)
def get_connection_string(storage_account_name: str) -> str:
    """Get the connection string for a storage account.

    Args:
        storage_account_name: The storage account name.

    Returns:
        str: Connection string.
    """
    show_command = f"az storage account show-connection-string --name {storage_account_name}"
    account_info = json.loads(Subprocess.run(command=show_command))
    return account_info["connectionString"]
def start_deployment(resource_group: str, deployment_name: str, template_file_path: str, parameters_file_path: str) -> None:
    """Start an ARM group deployment via the az CLI.

    Args:
        resource_group (str): target resource group.
        deployment_name (str): name of the deployment.
        template_file_path (str): path of the ARM template file.
        parameters_file_path (str): path of the parameters file.

    Raises:
        DeploymentError: if the deployment fails; carries the error message
            parsed from the CLI's JSON error output.
    """
    command = (
        f"az deployment group create -g {resource_group} --name {deployment_name} "
        f"--template-file {template_file_path} --parameters {parameters_file_path}")
    try:
        _ = Subprocess.run(command=command)
    except CommandExecutionError as e:
        error = json.loads(AzureController._get_valid_json(e.get_message()))["error"]
        # Chain the original CommandExecutionError so the full CLI failure
        # context is preserved in tracebacks instead of being discarded.
        raise DeploymentError(error["message"]) from e
def copy_files_to_node(local_path: str, remote_dir: str, node_username: str, node_hostname: str, node_ssh_port: int) -> None:
    """Copy local files to node, automatically create folder if not exist.

    On Linux/Darwin the copy is a single tar-over-ssh pipe; on other platforms
    (e.g. Windows, where the pipe idiom is unavailable) the tree is tarred to a
    tmp file, scp'd over, untarred remotely, and both tmp files are cleaned up.

    Args:
        local_path (str): path of the local file.
        remote_dir (str): dir for remote files.
        node_username (str): username of the vm.
        node_hostname (str): hostname of the vm.
        node_ssh_port (int): port of the ssh connection.
    """
    source_path = PathConvertor.build_path_without_trailing_slash(local_path)
    basename = os.path.basename(source_path)
    folder_name = os.path.expanduser(os.path.dirname(source_path))
    target_dir = PathConvertor.build_path_with_trailing_slash(remote_dir)
    # Ensure the remote target folder exists before any copy.
    mkdir_script = (
        f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname} "
        f"'mkdir -p {target_dir}'")
    _ = Subprocess.run(command=mkdir_script)
    if platform.system() in ["Linux", "Darwin"]:
        # Copy with pipe: tar locally, stream over ssh, untar remotely.
        copy_script = (
            f"tar czf - -C {folder_name} {basename} | "
            f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname} "
            f"'tar xzf - -C {target_dir}'")
        _ = Subprocess.run(command=copy_script)
    else:
        # Copy with tmp file: random name avoids collisions between concurrent copies.
        tmp_file_name = uuid.uuid4()
        maro_local_tmp_abs_path = os.path.expanduser(GlobalPaths.MARO_LOCAL_TMP)
        tar_script = f"tar czf {maro_local_tmp_abs_path}/{tmp_file_name} -C {folder_name} {basename}"
        _ = Subprocess.run(command=tar_script)
        # NOTE(review): the remote side uses the unexpanded GlobalPaths.MARO_LOCAL_TMP
        # path and assumes that folder already exists on the node — confirm.
        copy_script = (
            f"scp {maro_local_tmp_abs_path}/{tmp_file_name} "
            f"{node_username}@{node_hostname}:{GlobalPaths.MARO_LOCAL_TMP}")
        _ = Subprocess.run(command=copy_script)
        untar_script = (
            f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname} "
            f"'tar xzf {GlobalPaths.MARO_LOCAL_TMP}/{tmp_file_name} -C {target_dir}'")
        _ = Subprocess.run(untar_script)
        # Clean up the local and remote tmp archives.
        remove_script = f"rm {maro_local_tmp_abs_path}/{tmp_file_name}"
        _ = Subprocess.run(remove_script)
        remote_remove_script = (
            f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname} "
            f"'rm {GlobalPaths.MARO_LOCAL_TMP}/{tmp_file_name}'")
        _ = Subprocess.run(command=remote_remove_script)
def get_storage_account_sas(
    account_name: str,
    services: str = "bqtf",
    resource_types: str = "sco",
    permissions: str = "rwdlacup",
    expiry: str = None
) -> str:
    """Generate a SAS token for a storage account via the az CLI.

    Args:
        account_name (str): name of the storage account.
        services (str): storage services the SAS covers (blob/queue/table/file).
        resource_types (str): resource types the SAS applies to.
        permissions (str): permissions granted by the SAS.
        expiry (str): UTC expiry timestamp ("%Y-%m-%dT%H:%M:%SZ"); when omitted,
            defaults to one year from now, computed at call time.

    Returns:
        str: the generated SAS token (quotes and trailing newline stripped).
    """
    if expiry is None:
        # Compute the default per call: the previous def-time default was
        # stamped once at import, so long-lived processes issued ever-staler
        # (eventually past) expiry timestamps.
        expiry = (datetime.datetime.utcnow() + datetime.timedelta(days=365)).strftime("%Y-%m-%dT%H:%M:%S") + "Z"
    command = (
        f"az storage account generate-sas --account-name {account_name} --services {services} "
        f"--resource-types {resource_types} --permissions {permissions} --expiry {expiry}")
    sas_str = Subprocess.run(command=command).strip("\n").replace('"', "")
    logger.debug(sas_str)
    return sas_str
def setUpClass(cls) -> None:
    """Prepare the k8s test run: folders, rendered create-deployment file, and baseline images."""
    # Set Env.
    GlobalParams.LOG_LEVEL = logging.DEBUG

    # Init folders (exist_ok=False: fail loudly if a previous run left it behind).
    os.makedirs(f"{GlobalPaths.ABS_MARO_TEST}/{cls.test_id}", exist_ok=False)

    # Load config and save deployment.
    with open(file=cls.create_deployment_template_path, mode="r") as fr:
        create_deployment = yaml.safe_load(fr)
    with open(file=cls.test_config_path, mode="r") as fr:
        test_config = yaml.safe_load(fr)
    if not (test_config["cloud/subscription"] and test_config["cloud/default_public_key"]):
        raise Exception("Invalid config")
    create_deployment["name"] = f"test_maro_k8s_{cls.test_id}"
    create_deployment["cloud"]["subscription"] = test_config["cloud/subscription"]
    create_deployment["cloud"]["resource_group"] = f"test_maro_k8s_{cls.test_id}"
    create_deployment["cloud"]["default_public_key"] = test_config["cloud/default_public_key"]
    with open(file=cls.create_deployment_path, mode="w") as fw:
        yaml.safe_dump(create_deployment, fw)

    # Get params from deployments.
    cls.resource_group = create_deployment["cloud"]["resource_group"]
    cls.cluster_name = create_deployment["name"]

    # Pull testing images.
    for image in ("alpine:latest", "ubuntu:latest"):
        Subprocess.run(command=f"docker pull {image}")
def test21_train_dqn(self) -> None:
    """End-to-end DQN training test: copy the cim/dqn example into the test
    folder, rewrite its configs for a short run against the cluster redis,
    push the folder to the cluster, start the job, and poll until it succeeds.
    """
    # Copy dqn examples to test folder.
    dqn_source_dir = os.path.normpath(os.path.join(self.maro_pkg_path, "./examples/cim/dqn"))
    dqn_target_dir = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{self.test_id}/train/dqn")
    os.makedirs(os.path.dirname(f"{GlobalPaths.ABS_MARO_TEST}/{self.test_id}/train/dqn"), exist_ok=True)
    command = f"cp -r {dqn_source_dir} {GlobalPaths.ABS_MARO_TEST}/{self.test_id}/train/dqn"
    Subprocess.run(command=command)

    # Get cluster details and rebuild config: shorten the run and point the
    # distributed config at the cluster's redis.
    cluster_details = self._get_cluster_details()
    with open(f"{dqn_target_dir}/config.yml", 'r') as fr:
        config = yaml.safe_load(fr)
    with open(f"{dqn_target_dir}/distributed_config.yml", "r") as fr:
        distributed_config = yaml.safe_load(fr)
    with open(f"{dqn_target_dir}/config.yml", "w") as fw:
        config["main_loop"]["max_episode"] = 25
        config["main_loop"]["exploration"]["split_ep"] = 20
        yaml.safe_dump(config, fw)
    with open(f"{dqn_target_dir}/distributed_config.yml", 'w') as fw:
        distributed_config["redis"]["hostname"] = cluster_details["redis"]["private_ip_address"]
        yaml.safe_dump(distributed_config, fw)

    # Push dqn folder to cluster
    command = (
        f"maro k8s data push {self.cluster_name} --debug "
        f"'{GlobalPaths.MARO_TEST}/{self.test_id}/train/dqn' '/train'")
    Subprocess.run(command=command)

    # Start job.
    start_job_dqn_template_path = os.path.normpath(
        os.path.join(self.test_dir_path, "./modes/aks/k8s_aks_start_job_dqn.yml"))
    command = f"maro k8s job start {self.cluster_name} {start_job_dqn_template_path}"
    Subprocess.run(command=command)
    self._gracefully_wait(60)

    # Check job status: poll up to ~101 times with 10s sleeps (~1000s) for the
    # job's "succeeded" counter to reach 1.
    remain_idx = 0
    is_finished = False
    while remain_idx <= 100:
        name_to_job_details = self._get_name_to_job_details()
        job_details = name_to_job_details[self.job_name]
        if "succeeded" in job_details["status"] and job_details["status"]["succeeded"] == 1:
            is_finished = True
            break
        time.sleep(10)
        remain_idx += 1
    self.assertTrue(is_finished)
def test_2_kubectl_exec_big_file_to_local(self) -> None:
    """Stream the previously-uploaded big file out of the pod via a kubectl-exec tar pipe."""
    local_target = f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_kubectl_exec_big_file_to_local"
    _ = Subprocess.run(command=f"mkdir -p {local_target}")

    remote_source = f"/mnt/maro/{self.test_id}/test_1_kubectl_exec_big_file_to_remote"
    dirname, basename = os.path.split(remote_source)
    pipe_command = (
        f"kubectl exec -i {self.pod_name} -- tar cf - -C {dirname} {basename} | "
        f"tar xf - -C {local_target}")
    Subprocess.interactive_run(command=pipe_command)

    expected_file = os.path.expanduser(
        f"{GlobalPaths.MARO_TEST}/{self.test_id}/"
        f"test_2_kubectl_exec_big_file_to_local/test_1_kubectl_exec_big_file_to_remote/big_file")
    self.assertTrue(os.path.exists(expected_file))
def remove_data(self, remote_path: str) -> None: """Remote data at the remote AFS service. Args: remote_path (str): path of the remote data. Returns: None. """ # FIXME: Remove failed, The specified resource may be in use by an SMB client # Get sas sas = self._check_and_get_account_sas() # Remove data copy_command = ( "azcopy remove " f"'https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs{remote_path}?{sas}' " "--recursive=True") _ = Subprocess.run(command=copy_command)
def test_ssh_default_port_connection(node_username: str, node_hostname: str, node_ssh_port: int) -> None:
    """Test ssh connection.

    Args:
        node_username (str): username of the MARO Node VM.
        node_hostname (str): hostname of the MARO Node VM.
        node_ssh_port (int): ssh port of the MARO Node VM.

    Raises:
        CliError / TimeoutExpired: if the connection is failed.

    Returns:
        None.
    """
    # Echo a fixed marker over ssh; a failure or the 5s timeout surfaces as an exception.
    command = (
        f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname} "
        "echo 'Connection established'")
    _ = Subprocess.run(command=command, timeout=5)
def test_2_azcopy_small_files_to_local(self) -> None:
    """Download the previously-uploaded small files from AFS via azcopy and verify they arrived."""
    sas = self.executor._check_and_get_account_sas()
    _ = Subprocess.run(
        command=f"mkdir -p {GlobalPaths.MARO_TEST}/{self.test_id}/test_2_azcopy_small_files_to_local")
    local_path = os.path.expanduser(
        f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_azcopy_small_files_to_local")

    remote_url = (
        f"https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs"
        f"/{self.test_id}/test_1_azcopy_small_files_to_remote?{sas}")
    Subprocess.interactive_run(command=f"azcopy copy '{remote_url}' '{local_path}' --recursive=True")

    expected_path = os.path.expanduser(
        f"{GlobalPaths.MARO_TEST}/{self.test_id}/"
        f"test_2_azcopy_small_files_to_local/test_1_azcopy_small_files_to_remote/small_files")
    self.assertTrue(os.path.exists(expected_path))
def remote_create_user(master_username: str, master_hostname: str, master_ssh_port: int, user_id: str, user_role: str) -> dict:
    """Remote create MARO User.

    Exec /lib/scripts/master/create_user.py remotely.

    Args:
        master_username (str): username of the MARO Master VM.
        master_hostname (str): hostname of the MARO Master VM.
        master_ssh_port (int): ssh port of the MARO Master VM.
        user_id (str): id of the MARO User.
        user_role (str): role of the MARO User, currently we only have 'admin' at this time.

    Returns:
        dict: details of the created MARO User.
    """
    ssh_command = (
        f"ssh -o StrictHostKeyChecking=no -p {master_ssh_port} {master_username}@{master_hostname} "
        f"'cd {GlobalPaths.MARO_SHARED}/lib/grass; python3 -m scripts.master.create_user "
        f"{user_id} {user_role}'")
    return json.loads(Subprocess.run(command=ssh_command))
def get_dynamic_info(interval: int = None) -> dict:
    """Get dynamic resource information about the local environment.

    Args:
        interval (int): sampling interval in seconds passed to psutil.cpu_percent;
            None yields an instantaneous (non-blocking) reading.

    Returns:
        dict: keys "cpu_usage_per_core" (per-core percent list), "memory_usage"
        (fraction 0-1), and "gpu_memory_usage" (per-GPU utilization floats,
        empty when nvidia-smi is unavailable).
    """
    dynamic_info = {}
    dynamic_info["cpu_usage_per_core"] = psutil.cpu_percent(interval=interval, percpu=True)

    memory = psutil.virtual_memory()
    dynamic_info["memory_usage"] = memory.percent / 100

    gpu_dynamic_command = "nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits"
    dynamic_info["gpu_memory_usage"] = []
    try:
        return_str = Subprocess.run(command=gpu_dynamic_command)
        # Skip blank lines (e.g. a trailing newline): previously float("")
        # raised and the broad except silently dropped every reading.
        for single_usage in return_str.split("\n"):
            if single_usage.strip():
                dynamic_info["gpu_memory_usage"].append(float(single_usage))
    except Exception:
        # nvidia-smi missing or unparsable: leave the list empty.
        pass
    return dynamic_info