def local_cluster_engine(args):
    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["train.", "runner."])
    trainer_class = run_extras.get(
        "runner." + _envs["mode"] + ".runner_class", None)

    if trainer_class:
        trainer = trainer_class
    else:
        trainer = "GeneralTrainer"

    executor_mode = "train"
    distributed_strategy = run_extras.get(
        "runner." + _envs["mode"] + ".distribute_strategy", "async")
    worker_num = run_extras.get("runner." + _envs["mode"] + ".worker_num", 1)
    server_num = run_extras.get("runner." + _envs["mode"] + ".server_num", 1)
    selected_gpus = run_extras.get(
        "runner." + _envs["mode"] + ".selected_gpus", "0")

    fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode", "")
    if fleet_mode == "":
        device = run_extras.get("runner." + _envs["mode"] + ".device", "cpu")
        if len(selected_gpus.split(",")) > 1 and device.upper() == "GPU":
            fleet_mode = "COLLECTIVE"
        else:
            fleet_mode = "PS"

    cluster_envs = {}
    cluster_envs["server_num"] = server_num
    cluster_envs["worker_num"] = worker_num
    cluster_envs["selected_gpus"] = selected_gpus
    cluster_envs["start_port"] = envs.find_free_port()
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    cluster_envs["CPU_NUM"] = "2"

    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    launch = LocalClusterEngine(cluster_envs, args.model)
    return launch
def local_cluster_engine(args):
    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    trainer = get_trainer_prefix(args) + "ClusterTrainer"
    cluster_envs = {}
    cluster_envs["server_num"] = 1
    cluster_envs["worker_num"] = 1
    cluster_envs["start_port"] = envs.find_free_port()
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.strategy"] = "async"
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    cluster_envs["CPU_NUM"] = "2"

    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    launch = LocalClusterEngine(cluster_envs, args.model)
    return launch
def start_procs(self):
    fleet_mode = self.envs["fleet_mode"]
    worker_num = self.envs["worker_num"]
    server_num = self.envs["server_num"]
    ports = [self.envs["start_port"]]
    logs_dir = self.envs["log_dir"]
    selected_gpus = self.envs["selected_gpus"].split(",")

    default_env = os.environ.copy()
    current_env = copy.copy(default_env)
    current_env["CLUSTER_INSTANCE"] = "1"
    current_env.pop("http_proxy", None)
    current_env.pop("https_proxy", None)
    procs = []
    log_fns = []

    if fleet_mode.upper() == "PS":
        # collect one free port per additional server
        for i in range(server_num - 1):
            while True:
                new_port = envs.find_free_port()
                if new_port not in ports:
                    ports.append(new_port)
                    break
        user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
        user_endpoints_ips = [
            x.split(":")[0] for x in user_endpoints.split(",")
        ]
        user_endpoints_port = [
            x.split(":")[1] for x in user_endpoints.split(",")
        ]

        factory = "paddlerec.core.factory"
        cmd = [sys.executable, "-u", "-m", factory, self.trainer]

        for i in range(server_num):
            current_env.update({
                "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
                "PADDLE_PORT": user_endpoints_port[i],
                "TRAINING_ROLE": "PSERVER",
                "PADDLE_TRAINERS_NUM": str(worker_num),
                "POD_IP": user_endpoints_ips[i]
            })
            os.system("mkdir -p {}".format(logs_dir))
            fn = open("%s/server.%d" % (logs_dir, i), "w")
            log_fns.append(fn)
            proc = subprocess.Popen(
                cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
            procs.append(proc)

        for i in range(worker_num):
            current_env.update({
                "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
                "PADDLE_TRAINERS_NUM": str(worker_num),
                "TRAINING_ROLE": "TRAINER",
                "PADDLE_TRAINER_ID": str(i)
            })
            os.system("mkdir -p {}".format(logs_dir))
            fn = open("%s/worker.%d" % (logs_dir, i), "w")
            log_fns.append(fn)
            proc = subprocess.Popen(
                cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
            procs.append(proc)

        # only wait for workers to finish here
        for i, proc in enumerate(procs):
            if i < server_num:
                continue
            procs[i].wait()
            if len(log_fns) > 0:
                log_fns[i].close()

        for i in range(server_num):
            if len(log_fns) > 0:
                log_fns[i].close()
            procs[i].terminate()
        print(
            "all workers already completed, you can view logs under the `{}` directory"
            .format(logs_dir),
            file=sys.stderr)

    elif fleet_mode.upper() == "COLLECTIVE":
        selected_gpus_num = len(selected_gpus)
        # collect one free port per additional trainer
        for i in range(selected_gpus_num - 1):
            while True:
                new_port = envs.find_free_port()
                if new_port not in ports:
                    ports.append(new_port)
                    break
        user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])

        factory = "paddlerec.core.factory"
        cmd = [sys.executable, "-u", "-m", factory, self.trainer]

        for i in range(selected_gpus_num):
            current_env.update({
                "PADDLE_TRAINER_ENDPOINTS": user_endpoints,
                # this trainer's own endpoint, picked from the joined list
                "PADDLE_CURRENT_ENDPOINTS": user_endpoints.split(",")[i],
                "PADDLE_TRAINERS_NUM": str(worker_num),
                "TRAINING_ROLE": "TRAINER",
                "PADDLE_TRAINER_ID": str(i),
                "FLAGS_selected_gpus": str(selected_gpus[i])
            })
            os.system("mkdir -p {}".format(logs_dir))
            fn = open("%s/worker.%d" % (logs_dir, i), "w")
            log_fns.append(fn)
            proc = subprocess.Popen(
                cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
            procs.append(proc)
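For reference, a minimal, self-contained sketch (not part of the engine) of how the endpoint strings above expand for a two-server run; the port numbers are invented for illustration.

ports = [36001, 36002]  # start_port plus one extra free port per additional server
user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
print(user_endpoints)  # 127.0.0.1:36001,127.0.0.1:36002
print([x.split(":")[0] for x in user_endpoints.split(",")])  # ['127.0.0.1', '127.0.0.1']
print([x.split(":")[1] for x in user_endpoints.split(",")])  # ['36001', '36002']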
def local_cluster_engine(args):
    def get_worker_num(run_extras, workers):
        _envs = envs.load_yaml(args.model)
        mode = envs.get_runtime_environ("mode")
        workspace = envs.get_runtime_environ("workspace")
        phases_class = ".".join(["runner", mode, "phases"])
        phase_names = run_extras.get(phases_class)
        phases = []
        all_phases = _envs.get("phase")
        if phase_names is None:
            phases = all_phases
        else:
            for phase in all_phases:
                if phase["name"] in phase_names:
                    phases.append(phase)

        dataset_names = []
        for phase in phases:
            dataset_names.append(phase["dataset_name"])

        datapaths = []
        for dataset in _envs.get("dataset"):
            if dataset["name"] in dataset_names:
                datapaths.append(dataset["data_path"])

        if not datapaths:
            raise ValueError("data path must exist for training/inference")

        datapaths = [
            envs.workspace_adapter_by_specific(path, workspace)
            for path in datapaths
        ]
        all_workers = [len(os.listdir(path)) for path in datapaths]
        all_workers.append(workers)
        max_worker_num = min(all_workers)
        if max_worker_num >= workers:
            return workers

        print(
            "phases do not have enough data for training, set worker/gpu card num from {} to {}"
            .format(workers, max_worker_num))
        return max_worker_num

    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    run_extras = get_all_inters_from_yaml(args.model, ["runner."])
    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
    strategy_class = ".".join(["runner", mode, "distribute_strategy"])
    worker_class = ".".join(["runner", mode, "worker_num"])
    server_class = ".".join(["runner", mode, "server_num"])

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    distributed_strategy = run_extras.get(strategy_class, "async")
    executor_mode = "train"
    worker_num = run_extras.get(worker_class, 1)
    server_num = run_extras.get(server_class, 1)

    device = device.upper()
    fleet_mode = fleet_mode.upper()

    cluster_envs = {}

    # TODO: delete the following hard code when paddle supports ps-gpu.
    if device == "CPU":
        fleet_mode = "PS"
    elif device == "GPU":
        fleet_mode = "COLLECTIVE"
    if fleet_mode == "PS" and device != "CPU":
        raise ValueError("PS can not be used with GPU")
    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    if fleet_mode == "PS":
        worker_num = get_worker_num(run_extras, worker_num)

    if fleet_mode == "COLLECTIVE":
        cluster_envs["selected_gpus"] = selected_gpus
        gpus = selected_gpus.split(",")
        worker_num = get_worker_num(run_extras, len(gpus))
        cluster_envs["selected_gpus"] = ','.join(gpus[:worker_num])

    cluster_envs["server_num"] = server_num
    cluster_envs["worker_num"] = worker_num
    cluster_envs["start_port"] = envs.find_free_port()
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["CPU_NUM"] = cluster_envs["train.trainer.threads"]
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()

    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    launch = LocalClusterEngine(cluster_envs, args.model)
    return launch
def start_procs(self):
    fleet_mode = self.envs["fleet_mode"]
    worker_num = self.envs["worker_num"]
    server_num = self.envs["server_num"]
    ports = [self.envs["start_port"]]
    logs_dir = self.envs["log_dir"]

    default_env = os.environ.copy()
    current_env = copy.copy(default_env)
    current_env["CLUSTER_INSTANCE"] = "1"
    current_env.pop("http_proxy", None)
    current_env.pop("https_proxy", None)
    procs = []
    log_fns = []

    if fleet_mode.upper() == "PS":
        # collect one free port per additional server
        for i in range(server_num - 1):
            while True:
                new_port = envs.find_free_port()
                if new_port not in ports:
                    ports.append(new_port)
                    break
        user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
        user_endpoints_ips = [
            x.split(":")[0] for x in user_endpoints.split(",")
        ]
        user_endpoints_port = [
            x.split(":")[1] for x in user_endpoints.split(",")
        ]

        factory = "paddlerec.core.factory"
        cmd = [sys.executable, "-u", "-m", factory, self.trainer]

        for i in range(server_num):
            current_env.update({
                "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
                "PADDLE_PORT": user_endpoints_port[i],
                "TRAINING_ROLE": "PSERVER",
                "PADDLE_TRAINERS_NUM": str(worker_num),
                "POD_IP": user_endpoints_ips[i]
            })
            os.system("mkdir -p {}".format(logs_dir))
            fn = open("%s/server.%d" % (logs_dir, i), "w")
            log_fns.append(fn)
            proc = subprocess.Popen(
                cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
            procs.append(proc)

        for i in range(worker_num):
            current_env.update({
                "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
                "PADDLE_TRAINERS_NUM": str(worker_num),
                "TRAINING_ROLE": "TRAINER",
                "PADDLE_TRAINER_ID": str(i)
            })
            os.system("mkdir -p {}".format(logs_dir))
            fn = open("%s/worker.%d" % (logs_dir, i), "w")
            log_fns.append(fn)
            proc = subprocess.Popen(
                cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
            procs.append(proc)

    elif fleet_mode.upper() == "COLLECTIVE":
        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
        if cuda_visible_devices is None or cuda_visible_devices == "":
            selected_gpus = [
                x.strip() for x in self.envs["selected_gpus"].split(",")
            ]
        else:
            # change selected_gpus into relative values
            # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
            # therefore selected_gpus=0,1,2,3
            cuda_visible_devices_list = cuda_visible_devices.split(',')
            for x in self.envs["selected_gpus"].split(","):
                assert x in cuda_visible_devices_list, "Can't find "\
                    "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
                    % (x, cuda_visible_devices)
            selected_gpus = [
                cuda_visible_devices_list.index(x.strip())
                for x in self.envs["selected_gpus"].split(",")
            ]
        selected_gpus_num = len(selected_gpus)

        factory = "paddlerec.core.factory"
        cmd = [sys.executable, "-u", "-m", factory, self.trainer]

        print("use_paddlecloud_flag:{}".format(
            cluster_utils.use_paddlecloud()))
        if cluster_utils.use_paddlecloud():
            cluster, pod = cluster_utils.get_cloud_cluster(selected_gpus)
            logger.info("get cluster from cloud:{}".format(cluster))
            procs = cluster_utils.start_local_trainers(
                cluster, pod, cmd, log_dir=logs_dir)
        else:
            # trainers_num = 1 or not use paddlecloud ips="a,b"
            # collect one free port per additional trainer
            for i in range(selected_gpus_num - 1):
                while True:
                    new_port = envs.find_free_port()
                    if new_port not in ports:
                        ports.append(new_port)
                        break
            user_endpoints = ",".join(
                ["127.0.0.1:" + str(x) for x in ports])

            for i in range(selected_gpus_num):
                current_env.update({
                    "PADDLE_TRAINER_ENDPOINTS": user_endpoints,
                    # this trainer's own endpoint, picked from the joined list
                    "PADDLE_CURRENT_ENDPOINTS": user_endpoints.split(",")[i],
                    "PADDLE_TRAINERS_NUM": str(worker_num),
                    "TRAINING_ROLE": "TRAINER",
                    "PADDLE_TRAINER_ID": str(i),
                    "FLAGS_selected_gpus": str(selected_gpus[i]),
                    "PADDLEREC_GPU_NUMS": str(selected_gpus_num)
                })
                os.system("mkdir -p {}".format(logs_dir))
                fn = open("%s/worker.%d" % (logs_dir, i), "w")
                log_fns.append(fn)
                proc = subprocess.Popen(
                    cmd, env=current_env, stdout=fn, stderr=fn,
                    cwd=os.getcwd())
                procs.append(proc)

    # only wait for workers to finish here
    for i, proc in enumerate(procs):
        if i < server_num:
            continue
        procs[i].wait()
        if len(log_fns) > 0:
            log_fns[i].close()

    for i in range(server_num):
        if len(log_fns) > 0:
            log_fns[i].close()
        procs[i].terminate()
    print(
        "all workers already completed, you can view logs under the `{}` directory"
        .format(logs_dir),
        file=sys.stderr)