def remove_docker_machine(self, machine_id_on_site):
    """Tear down the container that backs a site machine.

    The container registered for ``machine_id_on_site`` is looked up; when
    one exists it is stopped and removed with ``nvidia-docker``, after which
    the machine's metadata entry and its HDD volume are removed too.
    Nothing is torn down when no container is registered.
    """
    print("machine_id_on_site=", machine_id_on_site)
    container_id = self.search_container_by_machine_id_on_site(
        machine_id_on_site)
    print("remove_docker_machine:", container_id)
    if container_id is None:
        return

    # Stop first, then delete; the exit status of either command is ignored.
    for cmd_fmt in ("nvidia-docker stop %s", "nvidia-docker rm %s"):
        exe_cmd_on_local(cmd_fmt % container_id)

    # NOTE(review): assumes the metadata/volume cleanup only runs when a
    # container was found -- confirm against the original indentation.
    self.remove_machine_meta_info(machine_id_on_site)
    self.hdd_disk_manager.remove_vol(machine_id_on_site)
def get_all_container_id_status(self):
    """Parse ``nvidia-docker ps -a`` into a dict keyed by container id.

    The first output line is treated as the header row: the character
    offset of each expected column title defines where that column starts,
    and every following row is sliced at those same offsets.

    Returns:
        dict mapping the first whitespace-separated token of each row (the
        container id) to a dict ``{column title: stripped cell text}``.

    Raises:
        ValueError: if the command exits non-zero, or if an expected
            column title is missing from the header row.
    """
    exit_code, output = exe_cmd_on_local("nvidia-docker ps -a", ret_msg=True)
    if exit_code != 0:
        raise ValueError("cannot get container status")

    headers = [
        "CONTAINER ID", "IMAGE", "COMMAND", "CREATED", "STATUS", "PORTS",
        "NAMES"
    ]
    rows = output.split("\n")
    starts = [rows[0].index(title) for title in headers]
    # A column ends where the next one begins; the last one runs to the end
    # of the line (a ``None`` slice bound).
    stops = starts[1:] + [None]

    statuses = {}
    for row in rows[1:]:
        # Skip blank or near-empty lines such as the trailing newline.
        if len(row.strip()) < 5:
            continue
        container_id = row.split()[0]
        cells = {}
        for title, start, stop in zip(headers, starts, stops):
            cells[title] = row[start:stop].strip()
        statuses[container_id] = cells
    return statuses
def remove_bridge(self, local_service_port):
    """Destroy every recorded bridge that serves ``local_service_port``.

    For each matching entry in ``self.data["bridges"]`` the local container
    and then the remote container are stopped and removed, both remote
    ports are handed back to ``self.remote_port_manager``, and the entry is
    dropped. The resulting bridge table is stored back on ``self.data`` and
    persisted with ``self.save()``, even when nothing matched.
    """
    self.load()
    cmd_templates = ("docker stop %(container_id)s",
                     "docker rm %(container_id)s")
    bridges = self.data.get("bridges", {})

    # Collect keys first: the dict must not shrink while it is iterated.
    matched_keys = []
    for bridge_key, bridge in bridges.items():
        if bridge["local_service_port"] != local_service_port:
            continue
        local_id = bridge["local_container_id"]
        remote_id = bridge["remote_container_id"]
        forwarding_port = bridge["forwarding_port"]
        your_sshd_port = bridge["your_sshd_port"]

        # Exit codes are deliberately not checked: removal is best-effort.
        for template in cmd_templates:
            exe_cmd_on_local(template % {"container_id": local_id},
                             ret_msg=True)
        for template in cmd_templates:
            exe_cmd_on_remote(self.remote_login,
                              self.remote_host,
                              template % {"container_id": remote_id},
                              ret_msg=True)

        self.remote_port_manager.release_port(forwarding_port)
        self.remote_port_manager.release_port(your_sshd_port)
        matched_keys.append(bridge_key)

    for bridge_key in matched_keys:
        bridges.pop(bridge_key)
    self.data["bridges"] = bridges
    self.save()
def build_bridge(self, local_service_port):
    """Expose a local service on the remote host through a reverse SSH tunnel.

    See https://github.com/JinpengLI/docker-image-reverse-ssh-tunnel

    If a bridge is already recorded for ``local_service_port`` its
    containers are restarted (``start_bridge_if_exist``) and nothing new is
    created. Otherwise two ports are allocated from
    ``self.remote_port_manager``, a tunnel container is started on the
    remote host (sshd published on ``your_sshd_port``, container port 1080
    published on ``forwarding_port``), and a matching container is started
    locally with ``--net=host``. The pair is recorded under
    ``self.data["bridges"]`` and persisted with ``self.save()``.

    When either container fails to start, everything acquired so far is
    given back: the allocated ports are released and, if the remote
    container was already running, it is stopped and removed.

    Returns:
        None in every case. Failures are only reported through a printed
        ``fail cmd:`` message.
    """
    if self.start_bridge_if_exist(local_service_port):
        return
    self.load()

    # Port on which the forwarded service becomes reachable remotely.
    forwarding_port = self.remote_port_manager.allocate_port()
    # Port on which the remote tunnel container's ssh server listens.
    your_sshd_port = self.remote_port_manager.allocate_port()

    data = {
        "remote_login": self.remote_login,
        "remote_host": self.remote_host,
        "your_sshd_port": your_sshd_port,
        "forwarding_port": forwarding_port,
        "bridge_password": self.bridge_password,
        "local_service_port": local_service_port,
    }

    cmd = "docker run -d -e ROOT_PASS=%(bridge_password)s -p %(your_sshd_port)d:22 -p %(forwarding_port)d:1080 jinpengli/docker-image-reverse-ssh-tunnel"
    cmd = cmd % data
    ret, msg = exe_cmd_on_remote(self.remote_login,
                                 self.remote_host,
                                 cmd,
                                 ret_msg=True)
    if ret != 0:
        print("fail cmd:", cmd)
        # Do not leak the two ports reserved for a bridge that never started.
        self.remote_port_manager.release_port(forwarding_port)
        self.remote_port_manager.release_port(your_sshd_port)
        return None
    remote_container_id = msg.strip()

    cmd = "docker run -d -e PUBLIC_HOST_ADDR=%(remote_host)s -e PUBLIC_HOST_PORT=%(your_sshd_port)d -e ROOT_PASS=%(bridge_password)s -e PROXY_PORT=%(local_service_port)d --net=host jinpengli/docker-image-reverse-ssh-tunnel"
    cmd = cmd % data
    ret, msg = exe_cmd_on_local(cmd, ret_msg=True)
    if ret != 0:
        print("fail cmd:", cmd)
        # The remote half is already running and is not recorded anywhere
        # yet, so tear it down here (best-effort, same commands as
        # remove_bridge) before giving the ports back.
        for cleanup_fmt in ("docker stop %(container_id)s",
                            "docker rm %(container_id)s"):
            exe_cmd_on_remote(
                self.remote_login,
                self.remote_host,
                cleanup_fmt % {"container_id": remote_container_id},
                ret_msg=True)
        self.remote_port_manager.release_port(forwarding_port)
        self.remote_port_manager.release_port(your_sshd_port)
        return None
    local_container_id = msg.strip()

    bridges = self.data.get("bridges", {})
    bridge_key = str(local_container_id) + '_' + str(remote_container_id)
    bridges[bridge_key] = {
        "created_time": datetime.now().isoformat(),
        "your_sshd_port": your_sshd_port,
        "forwarding_port": forwarding_port,
        "local_service_port": local_service_port,
        "local_container_id": local_container_id,
        "remote_container_id": remote_container_id,
    }
    self.data["bridges"] = bridges
    self.save()
def get_all_processes(self, container_id):
    """List the process ids reported by ``nvidia-docker top`` for a container.

    The first output line is discarded as the header; from every remaining
    non-blank line the second whitespace-separated field is collected
    (presumably the PID column -- verify against the docker version in
    use). The command's exit status is not inspected.

    Returns:
        list of str, one entry per non-blank output row.
    """
    cmd = "nvidia-docker top %(container_id)s" % {
        "container_id": container_id
    }
    _exit_code, output = exe_cmd_on_local(cmd, ret_msg=True)
    rows = output.split("\n")[1:]
    return [row.split()[1] for row in rows if row.strip()]
def modify_machine_if_modified(self, machine_id_on_site, cpu_cores, memory,
                               gpu_memory, hdd_disk_size):
    """Bring an existing machine in line with the requested resources.

    The stored record of the machine's container is compared with the
    requested values in three independent stages:

    * ``gpu_memory`` -- only the stored value is updated and saved.
    * ``cpu_cores`` / ``memory`` -- the record is updated and the limits
      are applied with ``nvidia-docker update``.
    * ``hdd_disk_size`` -- only ever grown: the container is stopped, the
      volume enlarged through ``self.hdd_disk_manager.increase_vol`` and
      the container started again. Records without an ``hdd_disk_size``
      key are left alone.

    The site is told the machine is "ready" after each applied change and
    once more unconditionally at the end.

    NOTE(review): the nesting of the notify/save calls inside the second
    and third stage is reconstructed from flattened source -- confirm.
    """
    container_id = self.search_container_by_machine_id_on_site(
        machine_id_on_site)

    # Stage 1: GPU memory is bookkeeping only, no docker command involved.
    record = self.data["machines"][container_id]
    if record["gpu_memory"] != gpu_memory:
        record["gpu_memory"] = gpu_memory
        self.save()

    # Stage 2: CPU / RAM limits can be changed on a live container.
    record = self.data["machines"][container_id]
    if not (record["cpu_cores"] == cpu_cores and record["memory"] == memory):
        record["cpu_cores"] = cpu_cores
        record["memory"] = memory
        cmd = "nvidia-docker update --cpus %(cpu_cores)d --memory %(memory)dm %(container_id)s" % {
            "container_id": container_id,
            "cpu_cores": cpu_cores,
            "memory": memory,
        }
        ret_code, msg = exe_cmd_on_local(cmd, ret_msg=True)
        self.client.set_virtual_machine_connection_info(
            machine_id_on_site, "ready")
        self.save()

    # Stage 3: growing the data volume needs the container to be stopped.
    record = self.data["machines"][container_id]
    if "hdd_disk_size" in record and record["hdd_disk_size"] < hdd_disk_size:
        record["hdd_disk_size"] = hdd_disk_size
        target = {"container_id": container_id}
        os.system("nvidia-docker stop %(container_id)s" % target)
        self.hdd_disk_manager.increase_vol(machine_id_on_site, hdd_disk_size)
        os.system("nvidia-docker start %(container_id)s" % target)
        self.client.set_virtual_machine_connection_info(
            machine_id_on_site, "ready")
        self.save()

    self.client.set_virtual_machine_connection_info(machine_id_on_site,
                                                    "ready")
def start_bridge_if_exist(self, local_service_port):
    """Restart the bridge recorded for ``local_service_port``, if there is one.

    The first entry of ``self.data["bridges"]`` whose ``local_service_port``
    matches has its remote container and then its local container started
    with ``docker start``. A failing start is only printed; it does not
    change the result.

    Returns:
        True when a matching bridge record was found, False otherwise.
    """
    self.load()
    for bridge in self.data.get("bridges", {}).values():
        if bridge["local_service_port"] != local_service_port:
            continue
        local_container_id = bridge["local_container_id"]
        remote_container_id = bridge["remote_container_id"]

        cmd_remote = "docker start %s" % remote_container_id
        status, _output = exe_cmd_on_remote(self.remote_login,
                                            self.remote_host,
                                            cmd_remote,
                                            ret_msg=True)
        if status != 0:
            print("fail to execute ", cmd_remote)

        cmd_local = "docker start %s" % local_container_id
        status, _output = exe_cmd_on_local(cmd_local, ret_msg=True)
        if status != 0:
            print("fail to execute ", cmd_local)
        return True
    return False
def create_vol(self, vol_name, size):
    """Create, format, mount and register a loop-back ext4 volume.

    An image file ``<base_dir>/<vol_name>.img`` is created with ``dd``
    (100 blocks of 1M), formatted as ext4, checked, resized to
    ``size * 1024`` megabytes and loop-mounted on ``<base_dir>/<vol_name>``.
    The mount is appended to ``/etc/fstab`` and read permission for
    "others" is removed from both the mount point and the image.

    Args:
        vol_name: name used for both the image file and the mount directory.
        size: volume size; multiplied by 1024 and passed to ``resize2fs``
            with an ``M`` suffix (presumably gigabytes -- confirm with
            callers).

    Returns:
        ``True`` on success. On the first failing command the tuple
        ``(ret_code, msg)`` of that command is returned instead and the
        remaining steps are skipped. Note that this failure value is a
        non-empty tuple and therefore truthy, so callers must not rely on
        a plain truthiness test to detect failure.
    """
    vol_dir_path = os.path.join(self.base_dir, vol_name)
    image_path = os.path.join(self.base_dir, vol_name + ".img")
    data = {
        "image_path": image_path,
        "size": size * 1024,
        "vol_dir_path": vol_dir_path,
    }

    # Each template is interpolated exactly once; interpolating an already
    # formatted command again would choke on a literal '%' in a path.
    build_steps = (
        "sudo dd if=/dev/zero of=%(image_path)s bs=1M count=100",
        "sudo mkfs.ext4 %(image_path)s",
        "sudo e2fsck -y -f %(image_path)s",
        "sudo resize2fs %(image_path)s %(size)dM",
        "sudo mkdir %(vol_dir_path)s",
        "sudo mount -o loop %(image_path)s %(vol_dir_path)s",
    )
    permission_steps = (
        "sudo chmod -R o-r %(vol_dir_path)s",
        "sudo chmod -R o-r %(image_path)s",
    )

    for cmd_fmt in build_steps:
        cmd = cmd_fmt % data
        ret_code, msg = exe_cmd_on_local(cmd, ret_msg=True)
        if ret_code != 0:
            print("fail ", cmd)
            return ret_code, msg

    # Needs a real shell because of the pipe; its exit status is ignored,
    # as before.
    cmd = "echo '%(image_path)s %(vol_dir_path)s ext4 loop 0 2' | sudo tee -a /etc/fstab" % data
    os.system(cmd)

    for cmd_fmt in permission_steps:
        cmd = cmd_fmt % data
        ret_code, msg = exe_cmd_on_local(cmd, ret_msg=True)
        if ret_code != 0:
            print("fail ", cmd)
            return ret_code, msg
    return True
def generate_docker_machine(self,
                            machine_id_on_site,
                            cpu_cores=1,
                            memory=1000,
                            disk_size=10,
                            hdd_disk_size=10,
                            gpu_memory=1000,
                            container_password="******"):
    """Create the container for a site machine, or update it if it exists.

    If a container is already known for ``machine_id_on_site`` the
    requested resources are reconciled through
    ``modify_machine_if_modified`` and the existing container id is
    returned. Otherwise an HDD volume is created, a local ssh port is
    allocated, an ``sshd_cuda`` container is started with the requested
    limits, its metadata is recorded and the root password is set inside it.

    Args:
        machine_id_on_site: identifier of the machine on the site.
        cpu_cores, memory, disk_size: coerced with ``int()``; passed to
            ``--cpus``, ``-m <memory>m`` and ``--storage-opt size=<n>G``.
        hdd_disk_size: size handed to ``hdd_disk_manager.create_vol``.
        gpu_memory: stored with the machine metadata.
        container_password: root password to set inside the container.

    Returns:
        The container id on success, or None when the volume or the
        container could not be created.
    """
    memory = int(memory)
    disk_size = int(disk_size)
    cpu_cores = int(cpu_cores)

    # Existing machine: only reconcile its resources.
    container_status = self.get_status_by_machine_id_on_site(
        machine_id_on_site)
    if container_status is not None:
        self.modify_machine_if_modified(machine_id_on_site, cpu_cores,
                                        memory, gpu_memory, hdd_disk_size)
        return self.search_container_by_machine_id_on_site(
            machine_id_on_site)

    # create_vol reports failure as a (ret_code, msg) tuple, which is
    # truthy; a bare ``if not ret`` would therefore never notice it.
    ret = self.hdd_disk_manager.create_vol(machine_id_on_site, hdd_disk_size)
    if not ret or isinstance(ret, tuple):
        return None

    local_ssh_port = self.port_manager.allocate_port()
    container_config = {
        "memory": memory,
        "disk_size": disk_size,
        "cpu_cores": cpu_cores,
        "local_ssh_port": local_ssh_port,
        "uuid": str(uuid.uuid4())[:8],
        "vol_dir_path": self.hdd_disk_manager.get_vol_path(
            machine_id_on_site),
    }
    cmd_fmt = "nvidia-docker run -m %(memory)dm -v %(vol_dir_path)s:/mnt/data --cpus %(cpu_cores)d -d -t -p %(local_ssh_port)d:22 --storage-opt size=%(disk_size)dG --name sshd_cuda_machine_%(uuid)s jinpengli/sshd_cuda"
    cmd = cmd_fmt % container_config
    print("! cmd = ", cmd)

    ret_code, msg = exe_cmd_on_local(cmd, ret_msg=True)
    if ret_code != 0:
        # NOTE(review): the volume and the ssh port acquired above are not
        # given back on this path -- consider cleaning them up.
        print("fail to create the machine...")
        print(msg)
        return None

    # The container id is taken from the last line of the command output.
    container_id = msg.strip().split("\n")[-1].strip()
    print("success create machine %s:%d" % (container_id, local_ssh_port))
    self.add_machine_meta_info(container_id, local_ssh_port,
                               machine_id_on_site, cpu_cores, memory,
                               disk_size, hdd_disk_size, gpu_memory)

    # The password ends up inside a single-quoted shell word; close the
    # quote around every embedded "'" so it cannot terminate the string.
    shell_safe_password = ("%s" % container_password).replace("'", "'\\''")
    cmd = [
        "nvidia-docker", "exec", "-d", container_id, "bash", "-c",
        "echo 'root:%s' | chpasswd" % shell_safe_password
    ]
    # Log the command without the credential-bearing last argument.
    print("! cmd ", cmd[:-1] + ["<redacted>"])
    ret_code, msg = exe_cmd_on_local(cmd, ret_msg=True)
    return container_id