def remove_docker_machine(self, machine_id_on_site):
     """Tear down the container registered for ``machine_id_on_site``.

     The container id is resolved through
     ``search_container_by_machine_id_on_site``. When one is found it is
     stopped and then removed with ``nvidia-docker``. Whether or not a
     container was found, ``remove_machine_meta_info`` and
     ``hdd_disk_manager.remove_vol`` are called for the machine afterwards.
     """
     print("machine_id_on_site=", machine_id_on_site)
     target = self.search_container_by_machine_id_on_site(machine_id_on_site)
     print("remove_docker_machine:", target)
     if target is not None:
         ## stop first, then remove
         for template in ("nvidia-docker stop %s", "nvidia-docker rm %s"):
             exe_cmd_on_local(template % target)
     self.remove_machine_meta_info(machine_id_on_site)
     self.hdd_disk_manager.remove_vol(machine_id_on_site)
 def get_all_container_id_status(self):
     """Parse ``nvidia-docker ps -a`` into a dict keyed by container id.

     The output is treated as a fixed-width table: the offset of every
     column title in the first line marks where that column starts, and a
     column ends where the next one starts (the last column runs to the end
     of the line).

     Returns:
         dict mapping the first whitespace-separated token of each row to a
         dict ``{column title: stripped cell text}``.

     Raises:
         ValueError: if the command exits non-zero, or (from ``str.index``)
             if an expected column title is missing from the first line.
     """
     ret_code, msg = exe_cmd_on_local("nvidia-docker ps -a", ret_msg=True)
     if ret_code != 0:
         raise ValueError("cannot get container status")
     columns = (
         "CONTAINER ID", "IMAGE", "COMMAND", "CREATED", "STATUS", "PORTS",
         "NAMES",
     )
     rows = msg.split("\n")
     title_row = rows[0]
     starts = [title_row.index(title) for title in columns]
     ## a ``None`` end makes the last slice run to the end of the line
     ends = starts[1:] + [None]
     statuses = {}
     for row in rows[1:]:
         ## rows with fewer than 5 non-blank characters are ignored
         if len(row.strip()) >= 5:
             statuses[row.split()[0]] = {
                 title: row[begin:end].strip()
                 for title, begin, end in zip(columns, starts, ends)
             }
     return statuses
    def remove_bridge(self, local_service_port):
        """Remove every recorded bridge that serves ``local_service_port``.

        For each matching entry of ``self.data["bridges"]`` the local
        container is stopped and removed, then the remote one (through
        ``exe_cmd_on_remote``), and both remote ports are released. The
        results of the docker commands are not inspected. Matching entries
        are dropped from the bridge table only after the whole scan, and the
        table is then written back with ``self.save()``.
        """
        self.load()
        stop_template = "docker stop %(container_id)s"
        rm_template = "docker rm %(container_id)s"
        bridges = self.data.get("bridges", {})
        matched_keys = []
        for key, bridge in bridges.items():
            if bridge["local_service_port"] != local_service_port:
                continue
            ## read every field up front, before any command is issued
            local_id = bridge["local_container_id"]
            remote_id = bridge["remote_container_id"]
            forwarding_port = bridge["forwarding_port"]
            sshd_port = bridge["your_sshd_port"]

            for template in (stop_template, rm_template):
                exe_cmd_on_local(template % {"container_id": local_id},
                                 ret_msg=True)

            for template in (stop_template, rm_template):
                exe_cmd_on_remote(self.remote_login,
                                  self.remote_host,
                                  template % {"container_id": remote_id},
                                  ret_msg=True)

            self.remote_port_manager.release_port(forwarding_port)
            self.remote_port_manager.release_port(sshd_port)

            matched_keys.append(key)
        ## the table is only mutated once iteration over it has finished
        for key in matched_keys:
            bridges.pop(key)
        self.data["bridges"] = bridges
        self.save()
    def build_bridge(self, local_service_port):
        '''
            Create (or restart) a reverse-ssh-tunnel bridge for
            ``local_service_port``.

            see https://github.com/JinpengLI/docker-image-reverse-ssh-tunnel

            If ``start_bridge_if_exist`` reports an existing bridge nothing
            else is done. Otherwise two remote ports are allocated, a tunnel
            container is started on the remote host (``your_sshd_port`` -> 22,
            ``forwarding_port`` -> 1080) and a second one locally with
            ``--net=host``; the pair is then recorded under
            ``self.data["bridges"]`` and saved.

            When either ``docker run`` fails, everything this call acquired
            is given back (the remote ports, and the remote container if it
            was already started) so a failed attempt leaves nothing behind.

            Returns None on every path (success, already existing, failure);
            the allocated forwarding port is only available from the
            ``forwarding_port`` field of the stored bridge entry.
        '''
        if self.start_bridge_if_exist(local_service_port):
            return
        self.load()
        ## open service so it is called forwarding port
        forwarding_port = self.remote_port_manager.allocate_port()
        ## ssh server listening
        your_sshd_port = self.remote_port_manager.allocate_port()
        data = {
            "remote_login": self.remote_login,
            "remote_host": self.remote_host,
            "your_sshd_port": your_sshd_port,
            "forwarding_port": forwarding_port,
            "bridge_password": self.bridge_password,
            "local_service_port": local_service_port,
        }

        def release_ports():
            ## undo both allocations made above
            self.remote_port_manager.release_port(forwarding_port)
            self.remote_port_manager.release_port(your_sshd_port)

        cmd = "docker run -d -e ROOT_PASS=%(bridge_password)s -p %(your_sshd_port)d:22 -p %(forwarding_port)d:1080 jinpengli/docker-image-reverse-ssh-tunnel"
        cmd = cmd % data
        ret, msg = exe_cmd_on_remote(self.remote_login,
                                     self.remote_host,
                                     cmd,
                                     ret_msg=True)
        if ret != 0:
            ## NOTE(review): the printed command contains ROOT_PASS in clear
            print("fail cmd:", cmd)
            release_ports()
            return None
        remote_container_id = msg.strip()

        cmd = "docker run -d -e PUBLIC_HOST_ADDR=%(remote_host)s -e PUBLIC_HOST_PORT=%(your_sshd_port)d -e ROOT_PASS=%(bridge_password)s -e PROXY_PORT=%(local_service_port)d --net=host jinpengli/docker-image-reverse-ssh-tunnel"
        cmd = cmd % data
        ret, msg = exe_cmd_on_local(cmd, ret_msg=True)
        if ret != 0:
            print("fail cmd:", cmd)
            ## best-effort rollback of the remote half, same commands as
            ## remove_bridge uses; results are deliberately not checked
            if remote_container_id:
                for rollback_fmt in ("docker stop %s", "docker rm %s"):
                    exe_cmd_on_remote(self.remote_login,
                                      self.remote_host,
                                      rollback_fmt % remote_container_id,
                                      ret_msg=True)
            release_ports()
            return None
        local_container_id = msg.strip()

        bridges = self.data.get("bridges", {})
        bridge_key = str(local_container_id) + '_' + str(remote_container_id)
        bridges[bridge_key] = {
            "created_time": datetime.now().isoformat(),
            "your_sshd_port": your_sshd_port,
            "forwarding_port": forwarding_port,
            "local_service_port": local_service_port,
            "local_container_id": local_container_id,
            "remote_container_id": remote_container_id,
        }
        self.data["bridges"] = bridges
        self.save()
 def get_all_processes(self, container_id):
     """Return the PIDs reported by ``nvidia-docker top`` for a container.

     The first output line (the column titles) is skipped and the second
     whitespace-separated field of every remaining row is collected; in the
     default ``docker top`` layout that field is the PID column.

     Args:
         container_id: id or name handed to ``nvidia-docker top``.

     Returns:
         list of PID strings. Empty when the command exits non-zero or
         produces no output, so an error message is never parsed as if it
         were a process table.
     """
     cmd = "nvidia-docker top %(container_id)s" % {
         "container_id": container_id
     }
     ret_code, msg = exe_cmd_on_local(cmd, ret_msg=True)
     if ret_code != 0 or not msg:
         return []
     ret_pids = []
     for line in msg.split("\n")[1:]:
         fields = line.split()
         ## blank or truncated rows have no PID field to read
         if len(fields) >= 2:
             ret_pids.append(fields[1])
     return ret_pids
 def modify_machine_if_modified(self, machine_id_on_site, cpu_cores, memory,
                                gpu_memory, hdd_disk_size):
     """Bring an existing machine in line with the requested resources.

     Three independent checks are made against the metadata stored under
     ``self.data["machines"][container_id]``:

     * ``gpu_memory`` differs: only the stored value is updated.
     * ``cpu_cores`` or ``memory`` differs: the stored values are updated
       and ``nvidia-docker update`` is run on the container.
     * a stored ``hdd_disk_size`` is smaller than the requested one: the
       container is stopped, ``hdd_disk_manager.increase_vol`` is called,
       and the container is started again. A smaller request is ignored.

     Every branch that changes something calls ``self.save()``. The machine
     is reported as "ready" through ``self.client`` at the end in all cases.
     """
     container_id = self.search_container_by_machine_id_on_site(
         machine_id_on_site)

     def record():
         ## looked up afresh on each use rather than cached across save()
         return self.data["machines"][container_id]

     def mark_ready():
         self.client.set_virtual_machine_connection_info(
             machine_id_on_site, "ready")

     if record()["gpu_memory"] != gpu_memory:
         record()["gpu_memory"] = gpu_memory
         self.save()

     if record()["cpu_cores"] != cpu_cores or record()["memory"] != memory:
         record()["cpu_cores"] = cpu_cores
         record()["memory"] = memory
         update_cmd = "nvidia-docker update --cpus %(cpu_cores)d --memory %(memory)dm %(container_id)s" % {
             "container_id": container_id,
             "cpu_cores": cpu_cores,
             "memory": memory,
         }
         exe_cmd_on_local(update_cmd, ret_msg=True)
         mark_ready()
         self.save()

     if "hdd_disk_size" in record() and \
             record()["hdd_disk_size"] < hdd_disk_size:
         record()["hdd_disk_size"] = hdd_disk_size
         ## the container is stopped while its volume is being grown
         os.system("nvidia-docker stop %(container_id)s" %
                   {"container_id": container_id})
         self.hdd_disk_manager.increase_vol(machine_id_on_site,
                                            hdd_disk_size)
         os.system("nvidia-docker start %(container_id)s" %
                   {"container_id": container_id})
         mark_ready()
         self.save()

     mark_ready()
 def start_bridge_if_exist(self, local_service_port):
     """Restart the bridge recorded for ``local_service_port``, if any.

     Only the first entry of ``self.data["bridges"]`` whose
     ``local_service_port`` matches is considered. Its remote container is
     started first, then its local one; a failing ``docker start`` is only
     printed.

     Returns:
         True when a matching bridge entry exists (even if starting it
         failed), False otherwise.
     """
     self.load()
     known_bridges = self.data.get("bridges", {})
     found = next((entry for entry in known_bridges.values()
                   if entry["local_service_port"] == local_service_port),
                  None)
     if found is None:
         return False

     local_container_id = found["local_container_id"]
     remote_container_id = found["remote_container_id"]

     cmd_remote = "docker start %s" % remote_container_id
     status, _ = exe_cmd_on_remote(self.remote_login,
                                   self.remote_host,
                                   cmd_remote,
                                   ret_msg=True)
     if status != 0:
         print("fail to execute ", cmd_remote)

     cmd_local = "docker start %s" % local_container_id
     status, _ = exe_cmd_on_local(cmd_local, ret_msg=True)
     if status != 0:
         print("fail to execute ", cmd_local)
     return True
Exemple #8
0
    def create_vol(self, vol_name, size):
        """Create, format, mount and register a loop-mounted ext4 volume.

        The image ``<base_dir>/<vol_name>.img`` is seeded as a 100 x 1M file
        of zeros, formatted as ext4, checked, and then resized with
        ``resize2fs`` to ``size * 1024`` M (so ``size`` is presumably in GB
        -- confirm with callers). It is mounted on ``<base_dir>/<vol_name>``,
        appended to ``/etc/fstab``, and read permission for "others" is
        removed from both the mount point and the image.

        Args:
            vol_name: base name for the image file and the mount directory.
            size: numeric volume size; multiplied by 1024 and formatted
                with ``%d`` for ``resize2fs``.

        Returns:
            ``True`` when every checked step succeeded. On the first failing
            step, the tuple ``(ret_code, msg)`` of that step. NOTE: that
            failure tuple is non-empty and therefore truthy, so callers must
            not rely on ``if not result`` to detect a failure. Steps that
            already ran are not rolled back.
        """
        data = {
            "image_path": os.path.join(self.base_dir, vol_name + ".img"),
            "size": size * 1024,
            "vol_dir_path": os.path.join(self.base_dir, vol_name),
        }

        def run_all(templates):
            ## runs the commands in order; returns the (ret_code, msg) of the
            ## first failing one, or None when all of them succeeded.
            ## Each template is interpolated exactly once.
            for template in templates:
                cmd = template % data
                ret_code, msg = exe_cmd_on_local(cmd, ret_msg=True)
                if ret_code != 0:
                    print("fail ", cmd)
                    return ret_code, msg
            return None

        failure = run_all((
            "sudo dd if=/dev/zero of=%(image_path)s bs=1M count=100",
            "sudo mkfs.ext4 %(image_path)s",
            "sudo e2fsck -y -f %(image_path)s",
            "sudo resize2fs %(image_path)s %(size)dM",
            "sudo mkdir %(vol_dir_path)s",
            "sudo mount -o loop %(image_path)s %(vol_dir_path)s",
        ))
        if failure is not None:
            return failure

        ## the fstab registration is not checked: its exit status is ignored
        cmd = "echo '%(image_path)s %(vol_dir_path)s  ext4   loop    0    2' | sudo tee -a /etc/fstab" % data
        os.system(cmd)

        failure = run_all((
            "sudo chmod -R o-r %(vol_dir_path)s",
            "sudo chmod -R o-r %(image_path)s",
        ))
        if failure is not None:
            return failure

        return True
    def generate_docker_machine(self,
                                machine_id_on_site,
                                cpu_cores=1,
                                memory=1000,
                                disk_size=10,
                                hdd_disk_size=10,
                                gpu_memory=1000,
                                container_password="******"):
        """Create the container for ``machine_id_on_site``, or update it.

        If ``get_status_by_machine_id_on_site`` already knows the machine,
        ``modify_machine_if_modified`` is applied and the existing container
        id is returned. Otherwise a data volume is created through
        ``hdd_disk_manager``, a local ssh port is allocated, the container is
        started with ``nvidia-docker run`` (the volume is mounted at
        ``/mnt/data``), its metadata is recorded, and the root password is
        set inside it with ``chpasswd``.

        Args:
            machine_id_on_site: identifier of the machine on the site side.
            cpu_cores, memory, disk_size: converted with ``int()``; passed
                to ``--cpus``, ``-m <memory>m`` and
                ``--storage-opt size=<disk_size>G`` respectively.
            hdd_disk_size: size handed to ``hdd_disk_manager.create_vol``.
            gpu_memory: stored in the machine metadata only.
            container_password: root password set inside the new container.

        Returns:
            The container id, or None when the volume or the container could
            not be created.
        """
        memory = int(memory)
        disk_size = int(disk_size)
        cpu_cores = int(cpu_cores)

        container_config = {}
        container_config["memory"] = memory
        container_config["disk_size"] = disk_size
        container_config["cpu_cores"] = cpu_cores
        ## need improve with size and cpu
        ## check if machine_id_on_site if exist
        container_status = self.get_status_by_machine_id_on_site(
            machine_id_on_site)
        if container_status is not None:
            ## check if it is modified
            self.modify_machine_if_modified(machine_id_on_site, cpu_cores,
                                            memory, gpu_memory, hdd_disk_size)
            return self.search_container_by_machine_id_on_site(
                machine_id_on_site)

        ## build hdd disk size
        ret = self.hdd_disk_manager.create_vol(machine_id_on_site,
                                               hdd_disk_size)
        ## create_vol reports a failure as a (ret_code, msg) tuple, which is
        ## truthy; a plain ``not ret`` would let every failure through
        if isinstance(ret, tuple) or not ret:
            return None

        ## return public server and public server ssh port
        ## build a machine locally
        local_ssh_port = self.port_manager.allocate_port()
        container_config["local_ssh_port"] = local_ssh_port
        container_config["uuid"] = str(uuid.uuid4())[:8]
        container_config["vol_dir_path"] = self.hdd_disk_manager.get_vol_path(
            machine_id_on_site)
        cmd_fmt = "nvidia-docker run -m %(memory)dm -v %(vol_dir_path)s:/mnt/data --cpus %(cpu_cores)d -d -t -p %(local_ssh_port)d:22 --storage-opt size=%(disk_size)dG --name sshd_cuda_machine_%(uuid)s jinpengli/sshd_cuda"
        cmd = cmd_fmt % container_config
        print("! cmd = ", cmd)

        ## build a port mapping
        ret_code, msg = exe_cmd_on_local(cmd, ret_msg=True)
        if ret_code != 0:
            ## NOTE(review): the volume and the ssh port acquired above are
            ## not given back on this path
            print("fail to create the machine...")
            print(msg)
            return None

        ## the container id is taken from the last line of the output
        container_id = msg.strip().split("\n")[-1].strip()
        print("success create machine %s:%d" % (container_id, local_ssh_port))
        self.add_machine_meta_info(container_id, local_ssh_port,
                                   machine_id_on_site, cpu_cores, memory,
                                   disk_size, hdd_disk_size, gpu_memory)
        ## NOTE(review): the password is embedded in a single-quoted shell
        ## string; a password containing a single quote breaks this command
        cmd_prefix = [
            "nvidia-docker", "exec", "-d", container_id, "bash", "-c"
        ]
        cmd = cmd_prefix + ["echo 'root:%s' | chpasswd" % container_password]
        ## log the command with the password masked, never in clear text
        print("! cmd ", cmd_prefix + ["echo 'root:******' | chpasswd"])
        ret_code, msg = exe_cmd_on_local(cmd, ret_msg=True)
        if ret_code != 0:
            ## the machine exists, so its id is still returned
            print("fail to set the container password")
            print(msg)
        return container_id