Example #1
def create(deployment_path: str, **kwargs):
    # Late import.
    import yaml

    from maro.cli.grass.executors.grass_azure_executor import GrassAzureExecutor
    from maro.cli.grass.executors.grass_local_executor import GrassLocalExecutor
    from maro.cli.grass.executors.grass_on_premises_executor import GrassOnPremisesExecutor
    from maro.utils.exception.cli_exception import BadRequestError, FileOperationError, InvalidDeploymentTemplateError

    try:
        with open(deployment_path, "r") as fr:
            create_deployment = yaml.safe_load(fr)
        if create_deployment["mode"] == "grass/azure":
            GrassAzureExecutor.create(create_deployment=create_deployment)
        elif create_deployment["mode"] == "grass/on-premises":
            GrassOnPremisesExecutor.create(create_deployment=create_deployment)
        elif create_deployment["mode"] == "grass/local":
            executor = GrassLocalExecutor(
                cluster_name=create_deployment["name"],
                cluster_details=create_deployment)
            executor.create()
        else:
            raise BadRequestError(
                f"Unsupported operation in mode '{create_deployment['mode']}'."
            )
    except KeyError as e:
        raise InvalidDeploymentTemplateError(f"Missing key '{e.args[0]}'.")
    except FileNotFoundError:
        raise FileOperationError("Invalid template file path.")
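
A note on the deployment file consumed by create(): the dispatch above only inspects the top-level "mode" key (plus "name" for the grass/local branch) before handing the whole document to the chosen executor. The sketch below is a hypothetical minimal deployment for the grass/local branch; apart from "mode" and "name", any further keys are assumptions that would be defined by the executor, not by this function.

import yaml

# Hypothetical minimal deployment; only "mode" and "name" are read by the
# dispatch logic in create() above. Real deployments carry more fields.
local_deployment = {
    "mode": "grass/local",
    "name": "my_local_cluster",
}

with open("grass_local_create.yml", "w") as fw:
    yaml.safe_dump(local_deployment, fw)

# create("grass_local_create.yml") would then route to GrassLocalExecutor.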
Example #2
    def stop_node(self, replicas: int, node_size: str):
        """Stop MARO Node VMs in parallel.

        Args:
            replicas (int): number of MARO Nodes of the specified node_size to stop.
            node_size (str): size of the MARO Node VM,
                see https://docs.microsoft.com/en-us/azure/virtual-machines/sizes for reference.

        Returns:
            None.
        """
        # Get nodes details
        nodes_details = self.master_api_client.list_nodes()

        # Get stoppable nodes
        stoppable_nodes_details = []
        for node_details in nodes_details:
            if (node_details["node_size"] == node_size
                    and node_details["state"]["status"] == NodeStatus.RUNNING
                    and self._count_running_containers(node_details) == 0):
                stoppable_nodes_details.append(node_details)

        # Check replicas
        if len(stoppable_nodes_details) < replicas:
            raise BadRequestError(
                f"No more '{node_size}' nodes can be stopped, only {len(stoppable_nodes_details)} are stoppable"
            )

        # Parallel stop
        params = [[node_details]
                  for node_details in stoppable_nodes_details[:replicas]]
        with ThreadPool(GlobalParams.PARALLELS) as pool:
            pool.starmap(self._stop_node, params)
Example #3
    def start_schedule(self, deployment_path: str):
        # Load start_schedule_deployment
        with open(deployment_path, "r") as fr:
            start_schedule_deployment = yaml.safe_load(fr)

        schedule_name = start_schedule_deployment["name"]
        start_schedule_deployment = self._completed_local_job_deployment(start_schedule_deployment)

        # Check resource
        is_satisfied, _ = resource_op(
            self.cluster_details["master"]["resource"],
            start_schedule_deployment["total_request_resource"],
            ResourceOperation.ALLOCATION
        )
        if not is_satisfied:
            raise BadRequestError(f"Not enough resources to start schedule {schedule_name} in {self.cluster_name}.")

        # push schedule details to Redis
        self._redis_connection.hset(
            f"{self.cluster_name}:job_details",
            schedule_name,
            json.dumps(start_schedule_deployment)
        )

        job_list = start_schedule_deployment["job_names"]
        # switch schedule details into job details
        job_detail = copy.deepcopy(start_schedule_deployment)
        del job_detail["job_names"]

        for job_name in job_list:
            job_detail["name"] = job_name

            self._push_pending_job(job_detail)
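
The final loop of start_schedule reuses a single dictionary: it deep-copies the schedule, drops "job_names", and then overwrites "name" once per job before pushing. A standalone sketch of that expansion, with hypothetical schedule and job names, is shown below.

import copy

# Hypothetical schedule details; only the "name" and "job_names" keys mirror
# what start_schedule() above actually reads.
schedule = {"name": "nightly_schedule", "job_names": ["job_1", "job_2"]}

job_detail = copy.deepcopy(schedule)
del job_detail["job_names"]

for job_name in schedule["job_names"]:
    job_detail["name"] = job_name
    print(job_detail)  # stands in for self._push_pending_job(job_detail)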
Example #4
    def start_node(self, replicas: int, node_size: str):
        """Start MARO Node VMs in parallel.

        Args:
            replicas (int): number of MARO Nodes of the specified node_size to start.
            node_size (str): size of the MARO Node VM, see https://docs.microsoft.com/en-us/azure/virtual-machines/sizes
                for reference.

        Returns:
            None.
        """
        # Get nodes details
        nodes_details = self.master_api_client.list_nodes()

        # Get startable nodes
        startable_nodes = []
        for node_details in nodes_details:
            if (node_details["node_size"] == node_size
                    and node_details["state"]["status"] == NodeStatus.STOPPED):
                startable_nodes.append(node_details["name"])

        # Check replicas
        if len(startable_nodes) < replicas:
            raise BadRequestError(
                f"No enough '{node_size}' nodes can be started, only {len(startable_nodes)} is able to start"
            )

        # Parallel start
        params = [[startable_node]
                  for startable_node in startable_nodes[:replicas]]
        with ThreadPool(GlobalParams.PARALLELS) as pool:
            pool.starmap(self._start_node, params)
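
stop_node (Example #2) and start_node above share one calling convention: pick a node_size and state how many VMs to act on. A hypothetical direct call on an existing GrassAzureExecutor is sketched below; the cluster name and VM size are placeholder values, and in practice these methods are usually reached through the CLI wrappers shown later on this page.

from maro.cli.grass.executors.grass_azure_executor import GrassAzureExecutor

# Hypothetical usage; assumes a grass/azure cluster named "my_grass_cluster" already exists.
executor = GrassAzureExecutor(cluster_name="my_grass_cluster")
executor.start_node(replicas=2, node_size="Standard_D2s_v3")  # boot two stopped VMs of this size
executor.stop_node(replicas=1, node_size="Standard_D2s_v3")   # stop one idle VM of this size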
Example #5
    def scale_node(self, replicas: int, node_size: str):
        """Scale up/down MARO Node using predefined Node Image.

        Args:
            replicas (int): desired number of MARO Nodes of the specified node_size.
            node_size (str): size of the MARO Node VM, see https://docs.microsoft.com/en-us/azure/virtual-machines/sizes
                for reference.

        Returns:
            None.
        """
        # Load details
        nodes_details = self.master_api_client.list_nodes()

        # Init node_size_to_count
        node_size_to_count = collections.defaultdict(lambda: 0)
        for node_details in nodes_details:
            node_size_to_count[node_details["node_size"]] += 1

        # Get node_size_to_spec
        node_size_to_spec = self._get_node_size_to_spec()
        if node_size not in node_size_to_spec:
            raise BadRequestError(f"Invalid node_size '{node_size}'")

        # Scale nodes
        if node_size_to_count[node_size] > replicas:
            self._delete_nodes(num=node_size_to_count[node_size] - replicas,
                               node_size=node_size)
        elif node_size_to_count[node_size] < replicas:
            self._create_nodes(num=replicas - node_size_to_count[node_size],
                               node_size=node_size)
        else:
            logger.warning_yellow("Replica count already matches, nothing to create or delete")
Example #6
    def scale_node(self, replicas: int, node_size: str) -> None:
        """Scale up/down MARO Node.

        Args:
            replicas (int): desired number of MARO Nodes of the specified node_size.
            node_size (str): size of the MARO Node VM, see https://docs.microsoft.com/en-us/azure/virtual-machines/sizes
                for reference.

        Returns:
            None.
        """
        # Get node_size_to_info
        node_size_to_info = self._get_node_size_to_info()

        # Get node_size_to_spec, and check if node_size is valid
        node_size_to_spec = self._get_node_size_to_spec()
        if node_size not in node_size_to_spec:
            raise BadRequestError(f"Invalid node_size '{node_size}'")

        # Scale node
        if node_size not in node_size_to_info:
            self._build_node_pool(replicas=replicas, node_size=node_size)
        elif node_size_to_info[node_size]["count"] != replicas:
            self._scale_node_pool(replicas=replicas,
                                  node_size=node_size,
                                  node_size_to_info=node_size_to_info)
        else:
            logger.warning_yellow("Replica count already matches, nothing to create or delete")
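
Both scale_node variants (grass/azure in Example #5 and k8s/aks here) are declarative: the caller states the desired replica count for a node_size, and the executor decides whether to create nodes, delete nodes, or do nothing. A hypothetical call with placeholder cluster name and VM size:

from maro.cli.k8s.executors.k8s_aks_executor import K8sAksExecutor

# Hypothetical usage; assumes an existing k8s/aks cluster named "my_aks_cluster".
executor = K8sAksExecutor(cluster_name="my_aks_cluster")
executor.scale_node(replicas=3, node_size="Standard_D2s_v3")  # grow or shrink the pool to 3 nodes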
Example #7
def node_leave(cluster_name: str, node_name: str, **kwargs):
    cluster_details = load_cluster_details(cluster_name)
    if cluster_details["mode"] != "grass/on-premises":
        raise BadRequestError("Node leave cluster interrupted: Invalid mode.")

    executor = GrassOnPremisesExecutor(cluster_name)
    executor.node_leave_cluster(node_name)
Example #8
def delete(cluster_name: str, **kwargs):
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] == "k8s/aks":
        executor = K8sAksExecutor(cluster_name=cluster_name)
        executor.delete()
    else:
        raise BadRequestError(f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #9
def stop_node(cluster_name: str, replicas: int, node_size: str, **kwargs):
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] == "grass/azure":
        executor = GrassAzureExecutor(cluster_name=cluster_name)
        executor.stop_node(replicas=replicas, node_size=node_size)
    else:
        raise BadRequestError(
            f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #10
def remove_data(cluster_name: str, remote_path: str, **kwargs):
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] == "k8s/aks":
        executor = K8sAksExecutor(cluster_name=cluster_name)
        executor.remove_data(remote_path=remote_path)
    else:
        raise BadRequestError(
            f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #11
def status(cluster_name: str, resource_name: str, **kwargs):
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] in ["grass/azure", "grass/on-premises"]:
        executor = GrassAzureExecutor(cluster_name=cluster_name)
        executor.status(resource_name=resource_name)
    else:
        raise BadRequestError(
            f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #12
def get_job_logs(cluster_name: str, job_name: str, **kwargs):
    # Load details
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] in ["grass/azure", "grass/on-premises"]:
        executor = GrassAzureExecutor(cluster_name=cluster_name)
        executor.get_job_logs(job_name=job_name)
    else:
        raise BadRequestError(f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #13
def pull_data(cluster_name: str, local_path: str, remote_path: str, **kwargs):
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] in ["grass/azure", "grass/on-premises"]:
        executor = GrassAzureExecutor(cluster_name=cluster_name)
        executor.pull_data(local_path=local_path, remote_path=remote_path)
    else:
        raise BadRequestError(
            f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #14
def start_job(cluster_name: str, deployment_path: str, **kwargs):
    # Load details
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] in ["grass/azure", "grass/on-premises"]:
        executor = GrassAzureExecutor(cluster_name=cluster_name)
        executor.start_job(deployment_path=deployment_path)
    else:
        raise BadRequestError(f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #15
    def push_image(self, image_name: str, image_path: str,
                   remote_context_path: str, remote_image_name: str) -> None:
        """Push docker image from local to the MARO Cluster.

        Args:
            image_name (str): name of the image.
            image_path (str): path of the image file.
            remote_context_path (str): path of the remote context (for remote build).
            remote_image_name (str): name of the image (for remote build).

        Returns:
            None.
        """
        # Push image TODO: design a new paradigm for remote build
        if image_name or image_path:
            if image_name:
                # Push image from local docker client.
                new_file_name = NameCreator.get_valid_file_name(image_name)
                abs_image_path = f"{GlobalPaths.ABS_MARO_CLUSTERS}/{self.cluster_name}/image_files/{new_file_name}"
                DockerController.save_image(image_name=image_name,
                                            abs_export_path=abs_image_path)
            else:
                # Push image from local image file.
                file_name = os.path.basename(image_path)
                new_file_name = NameCreator.get_valid_file_name(file_name)
                abs_image_path = f"{GlobalPaths.ABS_MARO_CLUSTERS}/{self.cluster_name}/image_files/{new_file_name}"
                FileSynchronizer.copy_and_rename(
                    source_path=image_path,
                    target_dir=f"{GlobalPaths.ABS_MARO_CLUSTERS}/{self.cluster_name}/image_files",
                    new_name=new_file_name
                )
            # Use md5_checksum to skip an image file that already exists remotely.
            remote_image_file_details = self.master_api_client.get_image_file(image_file_name=new_file_name)
            local_md5_checksum = self._get_md5_checksum(path=abs_image_path)
            if ("md5_checksum" in remote_image_file_details
                    and remote_image_file_details["md5_checksum"] == local_md5_checksum):
                logger.info_green(f"The image file '{new_file_name}' already exists")
                return
            FileSynchronizer.copy_files_to_node(
                local_path=abs_image_path,
                remote_dir=f"{GlobalPaths.MARO_SHARED}/clusters/{self.cluster_name}/image_files",
                node_username=self.master_username,
                node_hostname=self.master_public_ip_address,
                node_ssh_port=self.master_ssh_port
            )
            self.master_api_client.create_image_file(
                image_file_details={
                    "name": new_file_name,
                    "md5_checksum": local_md5_checksum
                })
            logger.info_green(f"Image {image_name} is loaded")
        else:
            raise BadRequestError("Invalid arguments")
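
push_image accepts either an image name (exported through the local Docker client) or a pre-saved image file; the md5 checksum comparison is what lets a re-push of an unchanged image return early. A hypothetical call using the image-name path, with all values as placeholders:

from maro.cli.grass.executors.grass_azure_executor import GrassAzureExecutor

# Hypothetical usage; exports "my_job_image:latest" from the local Docker client
# and copies it to the cluster's shared image_files directory.
executor = GrassAzureExecutor(cluster_name="my_grass_cluster")
executor.push_image(
    image_name="my_job_image:latest",
    image_path=None,
    remote_context_path=None,
    remote_image_name=None
)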
Example #16
def get_job_logs(cluster_name: str, job_name: str, **kwargs):
    # Load details
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] == "k8s/aks":
        executor = K8sAksExecutor(cluster_name=cluster_name)
        executor.get_job_logs(job_name=job_name)
    else:
        raise BadRequestError(
            f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #17
def start_job(cluster_name: str, deployment_path: str, **kwargs):
    # Load details
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] == "k8s/aks":
        executor = K8sAksExecutor(cluster_name=cluster_name)
        executor.start_job(deployment_path=deployment_path)
    else:
        raise BadRequestError(
            f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #18
def stop_schedule(cluster_name: str, schedule_name: str, **kwargs):
    # Load details
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] == "k8s/aks":
        executor = K8sAksExecutor(cluster_name=cluster_name)
        executor.stop_schedule(schedule_name=schedule_name)
    else:
        raise BadRequestError(
            f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #19
def clean(cluster_name: str, **kwargs):
    # Load details
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] == "grass/azure":
        executor = GrassAzureExecutor(cluster_name=cluster_name)
        executor.clean()
    else:
        raise BadRequestError(
            f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #20
def stop_schedule(cluster_name: str, schedule_name: str, **kwargs):
    # Load details
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] in ["grass/azure", "grass/on-premises"]:
        executor = GrassAzureExecutor(cluster_name=cluster_name)
        executor.stop_schedule(schedule_name=schedule_name)
    else:
        raise BadRequestError(
            f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #21
def scale_node(cluster_name: str, replicas: int, node_size: str, **kwargs):
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] == "k8s/aks":
        executor = K8sAksExecutor(cluster_name=cluster_name)
        executor.scale_node(
            replicas=replicas,
            node_size=node_size
        )
    else:
        raise BadRequestError(f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #22
    def create(self):
        logger.info("Creating cluster")

        # Get cluster name and save cluster details.
        if os.path.isdir(f"{GlobalPaths.ABS_MARO_CLUSTERS}/{self.cluster_name}"):
            raise BadRequestError(f"Cluster '{self.cluster_name}' already exists.")

        # Build connection with Resource Redis
        self._resource_redis.add_cluster()

        # Allocation
        cluster_resource = self.cluster_details["master"]["resource"]
        available_resource = self._resource_redis.get_available_resource()

        # Update resource
        is_satisfied, updated_resource = resource_op(
            available_resource, cluster_resource, ResourceOperation.ALLOCATION
        )
        if not is_satisfied:
            self._resource_redis.sub_cluster()
            raise BadRequestError("Not enough resources for this cluster.")

        self._resource_redis.set_available_resource(updated_resource)

        # Start agents.
        self._agents_start()

        # Set available resource for cluster
        self._redis_connection.hset(
            f"{self.cluster_name}:runtime_detail",
            "available_resource",
            json.dumps(cluster_resource)
        )

        # Save cluster config locally.
        DetailsWriter.save_cluster_details(
            cluster_name=self.cluster_name,
            cluster_details=self.cluster_details
        )

        logger.info(f"{self.cluster_name} is created.")
Example #23
def delete(cluster_name: str, **kwargs):
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] == "grass/azure":
        executor = GrassAzureExecutor(cluster_name=cluster_name)
        executor.delete()
    elif cluster_details["mode"] == "grass/on-premises":
        executor = GrassOnPremisesExecutor(cluster_name=cluster_name)
        executor.delete()
    else:
        raise BadRequestError(
            f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #24
def push_image(cluster_name: str, image_name: str, image_path: str,
               remote_context_path: str, remote_image_name: str, **kwargs):
    cluster_details = load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] in ["grass/azure", "grass/on-premises"]:
        executor = GrassAzureExecutor(cluster_name=cluster_name)
        executor.push_image(image_name=image_name,
                            image_path=image_path,
                            remote_context_path=remote_context_path,
                            remote_image_name=remote_image_name)
    else:
        raise BadRequestError(
            f"Unsupported command in mode '{cluster_details['mode']}'.")
Example #25
def delete(cluster_name: str, **kwargs):
    # Late import.
    from maro.cli.k8s.executors.k8s_aks_executor import K8sAksExecutor
    from maro.cli.utils.details_reader import DetailsReader
    from maro.utils.exception.cli_exception import BadRequestError

    cluster_details = DetailsReader.load_cluster_details(cluster_name=cluster_name)

    if cluster_details["mode"] == "k8s/aks":
        executor = K8sAksExecutor(cluster_name=cluster_name)
        executor.delete()
    else:
        raise BadRequestError(f"Unsupported operation in mode '{cluster_details['mode']}'.")
Example #26
    def create(create_deployment: dict):
        """Create MARO Cluster with create_deployment.

        Args:
            create_deployment (dict): create_deployment of grass/on-premises.
                See lib/deployments/internal for reference.

        Returns:
            None.
        """
        logger.info("Creating cluster")

        # Get standardized cluster_details
        cluster_details = GrassOnPremisesExecutor._standardize_cluster_details(
            create_deployment=create_deployment)
        cluster_name = cluster_details["name"]
        if os.path.isdir(f"{GlobalPaths.ABS_MARO_CLUSTERS}/{cluster_name}"):
            raise BadRequestError(f"Cluster '{cluster_name}' already exists")

        # Start creating
        try:
            GrassOnPremisesExecutor._init_master(cluster_details=cluster_details)
            GrassOnPremisesExecutor._create_user(cluster_details=cluster_details)

            # Remotely create the master and the cluster after initialization
            master_api_client = MasterApiClientV1(
                master_hostname=cluster_details["master"]["public_ip_address"],
                master_api_server_port=cluster_details["master"]["api_server"]["port"],
                user_id=cluster_details["user"]["id"],
                master_to_dev_encryption_private_key=cluster_details["user"]["master_to_dev_encryption_private_key"],
                dev_to_master_encryption_public_key=cluster_details["user"]["dev_to_master_encryption_public_key"],
                dev_to_master_signing_private_key=cluster_details["user"]["dev_to_master_signing_private_key"]
            )
            master_api_client.create_master(master_details=cluster_details["master"])
            master_api_client.create_cluster(cluster_details=cluster_details)
        except Exception as e:
            # If failed, remove details folder, then raise
            shutil.rmtree(
                path=f"{GlobalPaths.ABS_MARO_CLUSTERS}/{cluster_name}")
            logger.error_red(f"Failed to create cluster '{cluster_name}'")
            raise e

        logger.info_green(f"Cluster {cluster_name} has been created.")
Example #27
def clean(cluster_name: str, **kwargs):
    # Late import.
    from maro.cli.grass.executors.grass_azure_executor import GrassAzureExecutor
    from maro.cli.utils.details_reader import DetailsReader
    from maro.utils.exception.cli_exception import BadRequestError

    cluster_details = DetailsReader.load_cluster_details(
        cluster_name=cluster_name)

    if cluster_details["mode"] == "grass/azure":
        executor = GrassAzureExecutor(cluster_name=cluster_name)
        executor.clean()
    else:
        raise BadRequestError(
            f"Unsupported operation in mode '{cluster_details['mode']}'.")
Example #28
def node_join(node_join_path: str, **kwargs):
    try:
        # The with-statement closes the file, so no explicit close is needed.
        with open(node_join_path, "r") as fr:
            node_join_info = yaml.safe_load(fr)

        if node_join_info["mode"] != "grass/on-premises":
            raise BadRequestError(
                f"Node join cluster interrupted: Invalid mode: {node_join_info['mode']}"
            )

        executor = GrassOnPremisesExecutor(node_join_info["cluster"])
        executor.node_join_cluster(node_join_info)
    except FileNotFoundError:
        raise FileOperationError("Invalid template file path.")
Example #29
def create(deployment_path: str, **kwargs):
    try:
        with open(deployment_path, 'r') as fr:
            create_deployment = yaml.safe_load(fr)
        if create_deployment["mode"] == "k8s/aks":
            K8sAksExecutor.build_cluster_details(
                create_deployment=create_deployment)
            executor = K8sAksExecutor(cluster_name=create_deployment["name"])
            executor.create()
        else:
            raise BadRequestError(
                f"Unsupported command in mode '{create_deployment['mode']}'.")
    except KeyError as e:
        raise InvalidDeploymentTemplateError(f"Missing key '{e.args[0]}'.")
    except FileNotFoundError:
        raise FileOperationError("Invalid template file path.")
Example #30
def start_schedule(cluster_name: str, deployment_path: str, **kwargs):
    # Late import.
    from maro.cli.k8s.executors.k8s_aks_executor import K8sAksExecutor
    from maro.cli.utils.details_reader import DetailsReader
    from maro.utils.exception.cli_exception import BadRequestError

    # Load details
    cluster_details = DetailsReader.load_cluster_details(
        cluster_name=cluster_name)

    if cluster_details["mode"] == "k8s/aks":
        executor = K8sAksExecutor(cluster_name=cluster_name)
        executor.start_schedule(deployment_path=deployment_path)
    else:
        raise BadRequestError(
            f"Unsupported operation in mode '{cluster_details['mode']}'.")