Esempio n. 1
0
    def test11_image1(self) -> None:
        """Push alpine:latest and wait until the master lists its image file.

        The check passes once "alpine_latest" appears under "image_files" in the
        master details; polling stops after 120 seconds.

        Returns:
            None.
        """
        # Run command.
        Subprocess.interactive_run(
            command=f"maro grass image push {self.cluster_name} --debug --image-name alpine:latest"
        )
        self._gracefully_wait()

        # Poll every 10s; fail if the desired state is not reached within 120s.
        is_valid = False
        deadline = time.time() + 120
        while not is_valid and deadline >= time.time():
            try:
                details = self._get_master_details()
                self.assertIn("alpine_latest", details["image_files"])
            except AssertionError:
                # Not ready yet: back off before the next attempt.
                time.sleep(10)
            else:
                is_valid = True
        self.assertTrue(is_valid)
Esempio n. 2
0
    def test20_train_env_provision(self):
        """Build maro_runtime_cpu:test, push it, and wait until every node lists it.

        Polling stops after 1000 seconds.
        """
        # Build docker image and load docker image.
        pkg_path = self.maro_pkg_path
        build_command = (
            f"docker build -f {pkg_path}/docker_files/cpu.runtime.source.df "
            f"-t maro_runtime_cpu:test {pkg_path}"
        )
        Subprocess.run(command=build_command)

        # Run command.
        Subprocess.interactive_run(
            command=f"maro grass image push {self.cluster_name} --debug --image-name maro_runtime_cpu:test"
        )

        # Poll every 10s; fail if the image is not loaded on all nodes within 1000s.
        is_loaded = False
        deadline = time.time() + 1000
        while not is_loaded and deadline >= time.time():
            try:
                for node in self._list_nodes_details():
                    self.assertIn("maro_runtime_cpu_test", node["image_files"])
            except AssertionError:
                time.sleep(10)
            else:
                is_loaded = True
        self.assertTrue(is_loaded)
Esempio n. 3
0
    def test16_start(self) -> None:
        """Start one Standard_D2s_v3.

        Expects two nodes, each in RUNNING status and each listing both
        "alpine_latest" and "ubuntu_latest" under "image_files".

        Returns:
            None.
        """
        # Run command.
        Subprocess.interactive_run(
            command=f"maro grass node start {self.cluster_name} --debug Standard_D2s_v3 1"
        )
        self._gracefully_wait()

        # Poll every 10s; fail if the desired state is not reached within 120s.
        is_valid = False
        deadline = time.time() + 120
        while not is_valid and deadline >= time.time():
            try:
                nodes = self._list_nodes_details()
                self.assertEqual(len(nodes), 2)
                for node in nodes:
                    self.assertEqual(node["state"]["status"], NodeStatus.RUNNING)
                    for image_file in ("alpine_latest", "ubuntu_latest"):
                        self.assertIn(image_file, node["image_files"])
            except AssertionError:
                time.sleep(10)
            else:
                is_valid = True
        self.assertTrue(is_valid)
Esempio n. 4
0
    def remote_join_cluster(node_username: str, node_hostname: str,
                            node_ssh_port: int, master_private_ip_address: str,
                            master_api_server_port: int,
                            deployment_path: str) -> None:
        """Remote join cluster.

        Over ssh, installs the required runtime env (python3-pip plus the deepdiff,
        redis and pyyaml packages), then downloads the join-cluster script from the
        master_api_server (/v1/joinClusterScript) and pipes it to python3 with
        deployment_path as its argument.

        Args:
            node_username (str): username of the MARO Node VM.
            node_hostname (str): hostname of the MARO Node VM.
            node_ssh_port (int): ssh port of the MARO Node VM.
            master_private_ip_address (str): private ip address of the MARO Master VM,
                (master and nodes must in the same virtual network).
            master_api_server_port (int): port of the master_api_server.
            deployment_path (str): path of the join_cluster_deployment.

        Returns:
            None.
        """
        join_script_url = f"http://{master_private_ip_address}:{master_api_server_port}/v1/joinClusterScript"
        command = (
            f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname} "
            "'export DEBIAN_FRONTEND=noninteractive; "
            "sudo -E apt update; "
            "sudo -E apt install -y python3-pip; "
            "pip3 install deepdiff redis pyyaml; "
            # curl performs a GET by default. A bare "GET" word would be treated as an
            # additional URL, causing a useless (and failing) lookup of host "GET".
            f"curl -s {join_script_url} | "
            f"python3 - {deployment_path}'")
        Subprocess.interactive_run(command=command)
Esempio n. 5
0
    def test14_stop(self) -> None:
        """Stop one Standard_D2s_v3.

        Expects two nodes in total: exactly one in RUNNING status and exactly one
        in STOPPED status.

        Returns:
            None.
        """
        # Run command.
        Subprocess.interactive_run(
            command=f"maro grass node stop {self.cluster_name} --debug Standard_D2s_v3 1"
        )
        self._gracefully_wait()

        # Poll every 10s; fail if the desired state is not reached within 120s.
        is_valid = False
        deadline = time.time() + 120
        while not is_valid and deadline >= time.time():
            try:
                nodes = self._list_nodes_details()
                self.assertEqual(len(nodes), 2)
                statuses = [node["state"]["status"] for node in nodes]
                running_count = sum(1 for status in statuses if status == NodeStatus.RUNNING)
                stopped_count = sum(1 for status in statuses if status == NodeStatus.STOPPED)
                self.assertEqual(running_count, 1)
                self.assertEqual(stopped_count, 1)
            except AssertionError:
                time.sleep(10)
            else:
                is_valid = True
        self.assertTrue(is_valid)
Esempio n. 6
0
    def test11_node1(self) -> None:
        """Scale node spec Standard_D2s_v3 to 1.

        Expects exactly one node, and that node to be in RUNNING status.

        Returns:
            None.
        """
        # Run command.
        Subprocess.interactive_run(
            command=f"maro grass node scale {self.cluster_name} --debug Standard_D2s_v3 1"
        )
        self._gracefully_wait()

        # Poll every 10s; fail if the desired state is not reached within 120s.
        is_valid = False
        deadline = time.time() + 120
        while not is_valid and deadline >= time.time():
            try:
                nodes = self._list_nodes_details()
                self.assertEqual(len(nodes), 1)
                for node in nodes:
                    self.assertEqual(node["state"]["status"], NodeStatus.RUNNING)
            except AssertionError:
                time.sleep(10)
            else:
                is_valid = True
        self.assertTrue(is_valid)
Esempio n. 7
0
    def test12_join_cluster(self) -> None:
        """Join a node to the cluster.

        Expects exactly one node, in RUNNING status and listing "alpine_latest"
        under "image_files".

        Returns:
            None.
        """
        # Run command.
        Subprocess.interactive_run(
            command=f"maro grass node join --debug {self.join_cluster_deployment_path}"
        )
        self._gracefully_wait()

        # Poll every 10s; fail if the desired state is not reached within 120s.
        is_valid = False
        deadline = time.time() + 120
        while not is_valid and deadline >= time.time():
            try:
                nodes = self._list_nodes_details()
                self.assertEqual(len(nodes), 1)
                for node in nodes:
                    self.assertEqual(node["state"]["status"], NodeStatus.RUNNING)
                    self.assertIn("alpine_latest", node["image_files"])
            except AssertionError:
                time.sleep(10)
            else:
                is_valid = True
        self.assertTrue(is_valid)
Esempio n. 8
0
    def test13_image2(self) -> None:
        """Push image ubuntu:latest to the cluster.

        Expects exactly one node; if it is in RUNNING status it must list both
        "alpine_latest" and "ubuntu_latest" under "image_files".

        Returns:
            None.
        """
        # Run command.
        Subprocess.interactive_run(
            command=f"maro grass image push {self.cluster_name} --debug --image-name ubuntu:latest"
        )
        self._gracefully_wait()

        # Poll every 10s; fail if the desired state is not reached within 120s.
        is_valid = False
        deadline = time.time() + 120
        while not is_valid and deadline >= time.time():
            try:
                nodes = self._list_nodes_details()
                self.assertEqual(len(nodes), 1)
                for node in nodes:
                    # Only running nodes are required to have the images loaded.
                    if node["state"]["status"] != NodeStatus.RUNNING:
                        continue
                    self.assertIn("alpine_latest", node["image_files"])
                    self.assertIn("ubuntu_latest", node["image_files"])
            except AssertionError:
                time.sleep(10)
            else:
                is_valid = True
        self.assertTrue(is_valid)
Esempio n. 9
0
    def test10_create(self) -> None:
        """Create the k8s cluster and check the node details.

        Expects the node details to contain "Standard_D2s_v3" with value 1.

        Returns:
            None.
        """
        # Run command.
        Subprocess.interactive_run(
            command=f"maro k8s create --debug {self.create_deployment_path}"
        )

        # Check validity.
        details = self._get_node_details()
        self.assertIn("Standard_D2s_v3", details)
        self.assertEqual(details["Standard_D2s_v3"], 1)
Esempio n. 10
0
 def test_1_azcopy_small_files_to_remote(self) -> None:
     """Copy the local small-files path to the cluster's Azure file share URL via azcopy."""
     sas = self.executor._check_and_get_account_sas()
     # Destination folder inside the "<cluster_id>-fs" share, authenticated by the SAS query string.
     target_url = (
         f"https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs"
         f"/{self.test_id}/test_1_azcopy_small_files_to_remote/?{sas}"
     )
     Subprocess.interactive_run(
         command=f"azcopy copy '{self.local_small_files_path}' '{target_url}' --recursive=True"
     )
Esempio n. 11
0
    def test12_image(self) -> None:
        """Push alpine:latest to the k8s cluster and check "alpine" is in the image list.

        Returns:
            None.
        """
        # Run command.
        Subprocess.interactive_run(
            command=f"maro k8s image push {self.cluster_name} --debug --image-name alpine:latest"
        )

        # Check validity: the list command's output is parsed as a Python literal.
        listing = Subprocess.run(command=f"maro k8s image list {self.cluster_name}")
        self.assertIn("alpine", ast.literal_eval(listing))
Esempio n. 12
0
 def test_1_rsync_small_files_to_remote(self) -> None:
     """Create the remote test folder over ssh, then rsync the local small files into it."""
     remote_host = f"{self.admin_username}@{self.master_public_ip_address}"
     remote_dir = f"~/test/{self.test_id}/test_1_rsync_small_files_to_remote"

     # Make sure the destination folder exists on the master.
     Subprocess.run(
         command=f"ssh -o StrictHostKeyChecking=no {remote_host} 'mkdir -p {remote_dir}'"
     )

     # Transfer the files.
     Subprocess.interactive_run(
         command=(
             f"rsync -e 'ssh -o StrictHostKeyChecking=no' -az -r "
             f"{self.local_small_files_path} {remote_host}:{remote_dir}"
         )
     )
Esempio n. 13
0
    def test20_train_env_provision(self):
        """Build the maro_runtime_cpu docker image and push it to the k8s cluster."""
        # Build docker image and load docker image.
        pkg_path = self.maro_pkg_path
        build_command = (
            f"docker build -f {pkg_path}/docker_files/cpu.runtime.source.df "
            f"-t maro_runtime_cpu {pkg_path}"
        )
        Subprocess.run(command=build_command)

        # Run command.
        Subprocess.interactive_run(
            command=f"maro k8s image push {self.cluster_name} --debug --image-name maro_runtime_cpu"
        )
Esempio n. 14
0
    def local_leave_cluster() -> None:
        """Local leave cluster.

        Runs ~/.maro-local/scripts/activate_leave_cluster.py with python3 on this machine.

        Returns:
            None.
        """
        Subprocess.interactive_run(
            command="python3 ~/.maro-local/scripts/activate_leave_cluster.py"
        )
Esempio n. 15
0
    def tearDownClass(cls) -> None:
        """Delete the k8s cluster, print TEST_TO_TIME as JSON, and remove the tmp test folder."""
        # Delete cluster
        Subprocess.interactive_run(command=f"maro k8s delete --debug {cls.cluster_name}")

        # Print result
        report = json.dumps(TEST_TO_TIME, indent=4, sort_keys=True)
        print(report)

        # Delete tmp test folder
        test_folder = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}")
        shutil.rmtree(test_folder)
Esempio n. 16
0
 def test_2_rsync_small_files_to_local(self) -> None:
     """Rsync the remote test folder back to a local folder and check README.md arrived."""
     local_dir = f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_rsync_small_files_to_local"
     Subprocess.run(command=f"mkdir -p {local_dir}")

     # Source is the folder populated by test_1_rsync_small_files_to_remote.
     remote_source = (
         f"{self.admin_username}@{self.master_public_ip_address}:"
         f"~/test/{self.test_id}/test_1_rsync_small_files_to_remote"
     )
     Subprocess.interactive_run(
         command=f"rsync -e 'ssh -o StrictHostKeyChecking=no' -az -r {remote_source} {local_dir}"
     )

     expected_file = os.path.expanduser(
         f"{local_dir}/test_1_rsync_small_files_to_remote/small_files/README.md"
     )
     self.assertTrue(os.path.exists(expected_file))
Esempio n. 17
0
    def test11_node(self) -> None:
        """Scale Standard_D4s_v3 to 1 and check the node details.

        Expects both "Standard_D2s_v3" and "Standard_D4s_v3" to be present in the
        node details, each with value 1.

        Returns:
            None.
        """
        # Run command.
        Subprocess.interactive_run(
            command=f"maro k8s node scale {self.cluster_name} --debug Standard_D4s_v3 1"
        )

        # Check validity: presence of both sizes first, then their values.
        details = self._get_node_details()
        expected_sizes = ("Standard_D2s_v3", "Standard_D4s_v3")
        for node_size in expected_sizes:
            self.assertIn(node_size, details)
        for node_size in expected_sizes:
            self.assertEqual(details[node_size], 1)
Esempio n. 18
0
 def test_2_tar_ssh_small_files_to_local(self) -> None:
     """Stream the remote test folder through tar over ssh into a local folder and check README.md."""
     local_dir = f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_tar_ssh_small_files_to_local"
     Subprocess.run(command=f"mkdir -p {local_dir}")

     # Split the remote path so tar can archive the folder relative to its parent.
     remote_parent, remote_name = os.path.split(
         f"~/test/{self.test_id}/test_1_tar_ssh_small_files_to_remote"
     )
     pack_remote = (
         f"ssh {self.admin_username}@{self.master_public_ip_address} "
         f"'tar cf - -C {remote_parent} {remote_name}'"
     )
     unpack_local = f"tar xf - -C {local_dir}"
     Subprocess.interactive_run(command=f"{pack_remote} | {unpack_local}")

     expected_file = os.path.expanduser(
         f"{local_dir}/test_1_tar_ssh_small_files_to_remote/small_files/README.md"
     )
     self.assertTrue(os.path.exists(expected_file))
Esempio n. 19
0
    def test_1_tar_ssh_small_files_to_remote(self) -> None:
        """Create the remote test folder, then stream the local small files into it via tar over ssh."""
        remote_host = f"{self.admin_username}@{self.master_public_ip_address}"
        remote_dir = f"~/test/{self.test_id}/test_1_tar_ssh_small_files_to_remote"

        # Make sure the destination folder exists on the master.
        Subprocess.run(
            command=f"ssh -o StrictHostKeyChecking=no {remote_host} 'mkdir -p {remote_dir}'"
        )

        # Archive the local folder relative to its parent and unpack it remotely.
        local_parent, local_name = os.path.split(self.local_small_files_path)
        pack_local = f"tar cf - -C {local_parent} {local_name}"
        unpack_remote = f"ssh {remote_host} 'tar xf - -C {remote_dir}'"
        Subprocess.interactive_run(command=f"{pack_local} | {unpack_remote}")
Esempio n. 20
0
    def test_1_azcopy_tar_small_files_to_remote(self) -> None:
        """Tar the local small files, upload the archive with azcopy, and untar it inside the pod."""
        kubectl_prefix = f"kubectl exec -i {self.pod_name} --"
        remote_dir = f"/mnt/maro/{self.test_id}/test_1_azcopy_tar_small_files_to_remote"

        # create remote folder
        Subprocess.interactive_run(command=f"{kubectl_prefix} mkdir -p {remote_dir}")

        # local tar zip (the archive gets a random 8-hex-character name)
        source_parent, source_name = os.path.split(self.local_small_files_path)
        tar_file_name = uuid.uuid4().hex[:8]
        tar_path = f"{GlobalPaths.MARO_TEST}/{self.test_id}/tar/{tar_file_name}"
        Subprocess.interactive_run(
            command=f"tar cf {tar_path} -C {source_parent} {source_name}"
        )

        # azcopy
        sas = self.executor._check_and_get_account_sas()
        local_path = os.path.expanduser(tar_path)
        target_url = (
            f"https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs"
            f"/tar/{tar_file_name}?{sas}"
        )
        Subprocess.interactive_run(
            command=f"azcopy copy '{local_path}' '{target_url}' --recursive=True"
        )

        # remote tar unzip
        Subprocess.interactive_run(
            command=f"{kubectl_prefix} tar xf /mnt/maro/tar/{tar_file_name} -C {remote_dir}"
        )
Esempio n. 21
0
    def test_1_kubectl_exec_small_files_to_remote(self) -> None:
        """Create the folder inside the pod, then stream the local small files into it via tar."""
        kubectl_prefix = f"kubectl exec -i {self.pod_name} --"
        remote_dir = f"/mnt/maro/{self.test_id}/test_1_kubectl_exec_small_files_to_remote"

        # Make sure the destination folder exists in the pod.
        Subprocess.interactive_run(command=f"{kubectl_prefix} mkdir -p {remote_dir}")

        # Archive the local folder relative to its parent and unpack it in the pod.
        source_parent, source_name = os.path.split(self.local_small_files_path)
        pack_local = f"tar cf - -C {source_parent} {source_name}"
        unpack_remote = f"{kubectl_prefix} tar xf - -C {remote_dir}"
        Subprocess.interactive_run(command=f"{pack_local} | {unpack_remote}")
Esempio n. 22
0
    def get_static_info() -> dict:
        """Get static resource information about local environment.

        Returns:
            dict: Always contains:
                "cpu": result of psutil.cpu_count().
                "total_memory": total virtual memory divided by 1024**2, rounded to 2 decimals.
                "memory": free virtual memory divided by 1024**2, rounded to 2 decimals.
                "gpu": number of GPUs reported by nvidia-smi, or 0 if the query or parsing fails.
              When the nvidia-smi query succeeds it additionally contains:
                "gpu_name": list of GPU names (str), one per GPU.
                "gpu_memory": list of total GPU memory values as reported by nvidia-smi (str).
        """
        static_info = {}
        static_info["cpu"] = psutil.cpu_count()

        memory = psutil.virtual_memory()
        static_info["total_memory"] = round(float(memory.total) / (1024**2), 2)
        static_info["memory"] = round(float(memory.free) / (1024**2), 2)

        gpu_static_command = "nvidia-smi --query-gpu=name,memory.total --format=csv,noheader,nounits"
        try:
            return_str = Subprocess.run(command=gpu_static_command)
            gpu_names = []
            gpu_memories = []
            for info in return_str.splitlines():
                # Skip blank lines (e.g. the one produced by a trailing newline) instead of
                # letting them break the unpacking below and zero out the GPU count.
                if not info.strip():
                    continue
                # memory.total is the last field, so split from the right.
                name, total_memory = info.rsplit(", ", 1)
                gpu_names.append(name)
                gpu_memories.append(total_memory)
        except Exception:
            # Best effort: nvidia-smi may be missing or return unexpected output.
            static_info["gpu"] = 0
        else:
            static_info["gpu"] = len(gpu_names)  # (int) logical number
            static_info["gpu_name"] = gpu_names
            static_info["gpu_memory"] = gpu_memories

        return static_info
Esempio n. 23
0
    def pull_data(self, local_dir: str, remote_path: str) -> None:
        """Pull remote AFS service data to local folder via azcopy.

        Args:
            local_dir (str): path of the local folder; "~" is expanded and the folder
                is created if it does not exist.
            remote_path (str): path of the remote data, must start with "/".

        Returns:
            None.

        Raises:
            FileOperationError: if the normalized remote path does not start with "/".
        """
        # Validate the remote path first, so an invalid request neither fetches a SAS
        # nor creates the local target folder.
        source_path = PathConvertor.build_path_without_trailing_slash(
            remote_path)
        if not source_path.startswith("/"):
            raise FileOperationError(
                f"Invalid remote path: {source_path}\nShould be started with '/'"
            )

        # Get sas
        sas = self._check_and_get_account_sas()

        # Pull data
        abs_local_dir = os.path.expanduser(local_dir)
        abs_target_dir = PathConvertor.build_path_with_trailing_slash(
            abs_local_dir)
        os.makedirs(abs_target_dir, exist_ok=True)
        copy_command = (
            "azcopy copy "
            f"'https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs{source_path}?{sas}' "
            f"'{abs_target_dir}' "
            "--recursive=True")
        _ = Subprocess.run(command=copy_command)
Esempio n. 24
0
 def get_resource_group(resource_group: str) -> dict:
     """Return the JSON-decoded output of `az group show`, or {} if the command fails.

     Args:
         resource_group (str): name passed to `az group show --name`.

     Returns:
         dict: decoded command output; empty when CommandExecutionError is raised.
     """
     try:
         raw_output = Subprocess.run(command=f"az group show --name {resource_group}")
     except CommandExecutionError:
         return {}
     return json.loads(raw_output)
Esempio n. 25
0
    def tearDownClass(cls) -> None:
        """Print the timing table, then delete the resource group, tmp test folder and docker image."""
        # Print result.
        report = json.dumps(cls.test_func_to_time, indent=4, sort_keys=True)
        print(report)

        # Delete resource group.
        AzureController.delete_resource_group(resource_group=cls.resource_group)

        # Delete tmp test folder.
        test_folder = f"{GlobalPaths.ABS_MARO_TEST}/{cls.test_id}"
        shutil.rmtree(test_folder)

        # Delete docker image; a failing removal command is deliberately ignored.
        try:
            Subprocess.run(command="docker rmi maro_runtime_cpu:test")
        except CommandExecutionError:
            pass
Esempio n. 26
0
 def scale_nodepool(resource_group: str, aks_name: str, nodepool_name: str,
                    node_count: int) -> None:
     """Run `az aks nodepool scale` for the given node pool.

     Args:
         resource_group (str): value for the -g option.
         aks_name (str): value for the --cluster-name option.
         nodepool_name (str): value for the --name option.
         node_count (int): value for the --node-count option.

     Returns:
         None.
     """
     command_parts = [
         "az aks nodepool scale",
         f"-g {resource_group}",
         f"--cluster-name {aks_name}",
         f"--name {nodepool_name}",
         f"--node-count {node_count}",
     ]
     Subprocess.run(command=" ".join(command_parts))
Esempio n. 27
0
 def add_nodepool(resource_group: str, aks_name: str, nodepool_name: str,
                  node_count: int, node_size: str) -> None:
     """Run `az aks nodepool add` to create a node pool.

     Args:
         resource_group (str): value for the -g option.
         aks_name (str): value for the --cluster-name option.
         nodepool_name (str): value for the --name option.
         node_count (int): value for the --node-count option.
         node_size (str): value for the --node-vm-size option.

     Returns:
         None.
     """
     command_parts = [
         "az aks nodepool add",
         f"-g {resource_group}",
         f"--cluster-name {aks_name}",
         f"--name {nodepool_name}",
         f"--node-count {node_count}",
         f"--node-vm-size {node_size}",
     ]
     Subprocess.run(command=" ".join(command_parts))
Esempio n. 28
0
    def remote_leave_cluster(node_username: str, node_hostname: str,
                             node_ssh_port: int) -> None:
        """Remote leave cluster.

        Runs ~/.maro-local/scripts/activate_leave_cluster.py with python3 on the node over ssh.

        Args:
            node_username (str): username of the MARO Node VM.
            node_hostname (str): hostname of the MARO Node VM.
            node_ssh_port (int): ssh port of the MARO Node VM.

        Returns:
            None.
        """
        ssh_prefix = f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname}"
        remote_command = "python3 ~/.maro-local/scripts/activate_leave_cluster.py"
        Subprocess.interactive_run(command=f"{ssh_prefix} '{remote_command}'")
Esempio n. 29
0
    def remote_delete_master(master_username: str, master_hostname: str,
                             master_ssh_port: int) -> None:
        """Remote delete MARO Master.

        Runs scripts/delete_master.py (under GlobalPaths.MARO_LOCAL) with python3 on the
        master over ssh.

        Args:
            master_username (str): username of the MARO Master VM.
            master_hostname (str): hostname of the MARO Master VM.
            master_ssh_port (int): ssh port of the MARO Master VM.

        Returns:
            None.
        """
        ssh_prefix = f"ssh -o StrictHostKeyChecking=no -p {master_ssh_port} {master_username}@{master_hostname}"
        remote_command = f"python3 {GlobalPaths.MARO_LOCAL}/scripts/delete_master.py"
        Subprocess.interactive_run(command=f"{ssh_prefix} '{remote_command}'")
Esempio n. 30
0
    def remote_init_build_node_image_vm(node_username: str, node_hostname: str,
                                        node_ssh_port: int) -> None:
        """Remote init Build Node Image VM.

        Runs ~/init_build_node_image_vm.py with python3 on the vm over ssh.

        Args:
            node_username (str): username of the vm.
            node_hostname (str): hostname of the vm.
            node_ssh_port (int): ssh port of the vm.

        Returns:
            None.
        """
        ssh_prefix = f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname}"
        remote_command = "python3 ~/init_build_node_image_vm.py"
        Subprocess.interactive_run(command=f"{ssh_prefix} '{remote_command}'")