Ejemplo n.º 1
0
    def _update_node_details(self) -> None:
        """Refresh this node's details record and write it back.

        The current record is fetched with ``get_node_details``, handed to the
        container and system-resource update helpers, marked with state
        ``'Running'``, stamped with a ``check_time`` taken from the first
        element of ``self._redis.time()`` (presumably the server time in
        seconds -- confirm against the Redis client in use), and finally
        persisted with ``set_node_details``.
        """
        details = get_node_details(
            redis=self._redis,
            cluster_name=self._cluster_name,
            node_name=self._node_name,
        )

        # Main update: both helpers receive the same details object.
        self._update_container_details(node_details=details)
        self._update_system_resources_details(node_details=details)

        # Bookkeeping fields are written after the helpers have run.
        details['state'] = 'Running'
        details['check_time'] = self._redis.time()[0]

        set_node_details(
            redis=self._redis,
            cluster_name=self._cluster_name,
            node_name=self._node_name,
            node_details=details,
        )
Ejemplo n.º 2
0
    parser.add_argument('node_name')
    parser.add_argument('parallels', type=int)
    args = parser.parse_args()

    # Load details
    cluster_details = load_cluster_details(cluster_name=args.cluster_name)
    master_hostname = cluster_details['master']['hostname']
    redis_port = cluster_details['master']['redis']['port']

    # Load node details
    redis = Redis(host=master_hostname,
                  port=redis_port,
                  charset="utf-8",
                  decode_responses=True)
    node_details = get_node_details(redis=redis,
                                    cluster_name=args.cluster_name,
                                    node_name=args.node_name)
    master_details = get_master_details(redis=redis,
                                        cluster_name=args.cluster_name)
    master_image_files_details = master_details['image_files']
    node_image_files_details = node_details['image_files']

    # Get unloaded images
    unloaded_images = []
    for image_file, image_file_details in master_image_files_details.items():
        if image_file not in node_image_files_details:
            unloaded_images.append(image_file)
        elif image_file_details['modify_time'] != node_image_files_details[image_file]['modify_time'] or \
                image_file_details['size'] != node_image_files_details[image_file]['size']:
            unloaded_images.append(image_file)
    sys.stdout.write(f"Unloaded_images: {unloaded_images}\n")
Ejemplo n.º 3
0
    def start_container(self, container_name: str, node_name: str,
                        job_details: dict):
        """Start a job component's container by running a shell command.

        The component id and index are taken from the last two ``-``-separated
        fields of ``container_name``. The command template is chosen by
        whether the component requests any GPU, filled in with cluster, job
        and node values, logged, and executed through the shell.

        Args:
            container_name: Name of the container; its last two ``-``-separated
                fields are used as component id and component index.
            node_name: Name of the node whose details (hostname) are looked up.
            job_details: Job description; read for ``name``, ``id`` and the
                per-component ``resources``, ``mount``, ``command`` and
                ``image`` entries.

        Raises:
            AllocationFailed: If the command exits with a non-zero return
                code; the exception carries the captured stderr.
        """
        cluster_name = self._cluster_name
        type_by_component_id = self._get_component_id_to_component_type(
            job_details=job_details)

        # Split once and read the two trailing fields.
        name_fields = container_name.split('-')
        component_id = name_fields[-2]
        component_index = name_fields[-1]
        component_type = type_by_component_id[component_id]

        job_name = job_details['name']
        job_id = job_details['id']
        cluster_id = self._cluster_id
        admin_username = self._admin_username
        target_node = get_node_details(redis=self._redis,
                                       cluster_name=self._cluster_name,
                                       node_name=node_name)
        node_hostname = target_node['hostname']
        master_hostname = self._master_hostname

        # Environment flags handed to the container, joined by single spaces.
        env_flags = " ".join((
            f"-e COMPONENT_TYPE={component_type}",
            f"-e COMPONENT_ID={component_id}",
            f"-e COMPONENT_INDEX={component_index}",
            f"-e JOB_NAME={job_name}",
            f"-e JOB_ID={job_id}",
            f"-e CLUSTER_NAME={cluster_name}",
            f"-e CLUSTER_ID={cluster_id}",
            "-e PYTHONUNBUFFERED=0",
        ))

        # Per-component settings are read repeatedly below; look them up once.
        component_spec = job_details['components'][component_type]
        resources = component_spec['resources']

        # A non-zero GPU request selects the GPU-enabled command template.
        template = (START_CONTAINER_WITH_GPU_COMMAND
                    if resources['gpu'] != 0
                    else START_CONTAINER_COMMAND)
        shell_command = template.format(
            # cluster related
            admin_username=admin_username,
            master_hostname=master_hostname,
            node_hostname=node_hostname,
            fluentd_port=self._fluentd_port,

            # job related (user)
            cpu=resources['cpu'],
            memory=resources['memory'],
            gpu=resources['gpu'],
            mount_target=component_spec['mount']['target'],
            command=component_spec['command'],
            image_name=component_spec['image'],

            # job related (system)
            container_name=container_name,
            job_id=job_id,
            mount_source=f"~/.maro/clusters/{cluster_name}/data/",
            environment_parameters=env_flags)

        # NOTE(review): the command is executed with shell=True and embeds
        # values taken from job_details; confirm those values are trusted or
        # quoted by the command templates.
        logger.info(shell_command)
        result = subprocess.run(shell_command,
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                encoding='utf8')
        if result.returncode != 0:
            raise AllocationFailed(result.stderr)