def get_all_inters_from_yaml(file, filters):
    _envs = envs.load_yaml(file)
    all_flattens = {}

    def fatten_env_namespace(namespace_nests, local_envs):
        for k, v in local_envs.items():
            if isinstance(v, dict):
                nests = copy.deepcopy(namespace_nests)
                nests.append(k)
                fatten_env_namespace(nests, v)
            elif (k == "dataset" or k == "phase" or
                  k == "runner") and isinstance(v, list):
                for i in v:
                    if i.get("name") is None:
                        raise ValueError("name must be in dataset list. ", v)
                    nests = copy.deepcopy(namespace_nests)
                    nests.append(k)
                    nests.append(i["name"])
                    fatten_env_namespace(nests, i)
            else:
                global_k = ".".join(namespace_nests + [k])
                all_flattens[global_k] = v

    fatten_env_namespace([], _envs)

    ret = {}
    for k, v in all_flattens.items():
        for f in filters:
            if k.startswith(f):
                ret[k] = v
    return ret
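
# Usage sketch (illustrative, not part of the original source): assuming the
# PaddleRec `envs` helpers are importable and the yaml path points at a valid
# model config (the path and mode below are hypothetical), the helper above
# returns dotted keys such as "runner.<mode>.device" that the engine builders
# in this module look up.
def _example_read_runner_device(config_path="models/rank/dnn/config.yaml",
                                mode="single_cpu_train"):
    run_extras = get_all_inters_from_yaml(config_path, ["runner."])
    # fall back to "cpu" when the runner section does not set a device
    return run_extras.get("runner.{}.device".format(mode), "cpu")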
def master():
    from paddlerec.core.engine.cluster.cluster import ClusterEngine

    # Get fleet_mode & device
    run_extras = get_all_inters_from_yaml(args.model, ["runner."])

    mode = envs.get_runtime_environ("mode")
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    device = device.upper()
    fleet_mode = fleet_mode.upper()

    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    # Get thread nums
    model_envs = envs.load_yaml(args.model)
    phases_class = ".".join(["runner", mode, "phases"])
    phase_names = run_extras.get(phases_class)
    phases = []
    all_phases = model_envs.get("phase")
    if phase_names is None:
        phases = all_phases
    else:
        for phase in all_phases:
            if phase["name"] in phase_names:
                phases.append(phase)

    thread_num = []
    for phase in phases:
        thread_num.append(int(phase["thread_num"]))
    max_thread_num = max(thread_num)

    backend_envs = envs.load_yaml(args.backend)
    flattens = envs.flatten_environs(backend_envs, "_")
    flattens["engine_role"] = "MASTER"
    flattens["engine_mode"] = envs.get_runtime_environ("mode")
    flattens["engine_run_config"] = args.model
    flattens["max_thread_num"] = max_thread_num
    flattens["fleet_mode"] = fleet_mode
    flattens["device"] = device
    flattens["backend_yaml"] = args.backend

    envs.set_runtime_environs(flattens)

    launch = ClusterEngine(None, args.model)
    return launch
def __init__(self, config=None):
    self._status_processor = {}
    self._place = fluid.CPUPlace()
    self._exe = fluid.Executor(self._place)
    self._exector_context = {}
    self._context = {'status': 'uninit', 'is_exit': False}
    self._config_yaml = config
    self._config = envs.load_yaml(config)
def get_inters_from_yaml(file, filters):
    _envs = envs.load_yaml(file)
    flattens = envs.flatten_environs(_envs)
    inters = {}
    for k, v in flattens.items():
        for f in filters:
            if k.startswith(f):
                inters[k] = v
    return inters
def user_define_engine(engine_yaml):
    _config = envs.load_yaml(engine_yaml)
    envs.set_runtime_environs(_config)
    train_location = envs.get_global_env("engine.file")
    train_dirname = os.path.dirname(train_location)
    base_name = os.path.splitext(os.path.basename(train_location))[0]
    sys.path.append(train_dirname)
    trainer_class = envs.lazy_instance_by_fliename(base_name,
                                                   "UserDefineTraining")
    return trainer_class
def master():
    role = "MASTER"
    from paddlerec.core.engine.cluster.cluster import ClusterEngine

    _envs = envs.load_yaml(args.backend)
    flattens = envs.flatten_environs(_envs, "_")
    flattens["engine_role"] = role
    flattens["engine_run_config"] = args.model
    flattens["engine_temp_path"] = tempfile.mkdtemp()
    envs.set_runtime_environs(flattens)
    print(envs.pretty_print_envs(flattens, ("Submit Envs", "Value")))

    launch = ClusterEngine(None, args.model)
    return launch
def local_cluster_engine(args):
    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["train.", "runner."])
    trainer_class = run_extras.get(
        "runner." + _envs["mode"] + ".runner_class", None)

    if trainer_class:
        trainer = trainer_class
    else:
        trainer = "GeneralTrainer"

    executor_mode = "train"
    distributed_strategy = run_extras.get(
        "runner." + _envs["mode"] + ".distribute_strategy", "async")
    worker_num = run_extras.get("runner." + _envs["mode"] + ".worker_num", 1)
    server_num = run_extras.get("runner." + _envs["mode"] + ".server_num", 1)
    selected_gpus = run_extras.get(
        "runner." + _envs["mode"] + ".selected_gpus", "0")
    fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode", "")
    if fleet_mode == "":
        device = run_extras.get("runner." + _envs["mode"] + ".device", "cpu")
        if len(selected_gpus.split(",")) > 1 and device.upper() == "GPU":
            fleet_mode = "COLLECTIVE"
        else:
            fleet_mode = "PS"

    cluster_envs = {}
    cluster_envs["server_num"] = server_num
    cluster_envs["worker_num"] = worker_num
    cluster_envs["selected_gpus"] = selected_gpus
    cluster_envs["start_port"] = envs.find_free_port()
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    cluster_envs["CPU_NUM"] = "2"

    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    launch = LocalClusterEngine(cluster_envs, args.model)
    return launch
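
# Usage sketch (illustrative, not part of the original source): the engine
# builders expect an argparse-style object with a `.model` attribute (and, for
# the cluster paths, `.backend`). The config path below is hypothetical.
def _example_build_local_cluster(config_path="models/rank/dnn/config.yaml"):
    from argparse import Namespace
    args = Namespace(model=config_path, backend=None)
    return local_cluster_engine(args)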
def get_worker_num(run_extras, workers):
    _envs = envs.load_yaml(args.model)
    mode = envs.get_runtime_environ("mode")
    workspace = envs.get_runtime_environ("workspace")
    phases_class = ".".join(["runner", mode, "phases"])
    phase_names = run_extras.get(phases_class)
    phases = []
    all_phases = _envs.get("phase")
    if phase_names is None:
        phases = all_phases
    else:
        for phase in all_phases:
            if phase["name"] in phase_names:
                phases.append(phase)

    dataset_names = []
    for phase in phases:
        dataset_names.append(phase["dataset_name"])

    datapaths = []
    for dataset in _envs.get("dataset"):
        if dataset["name"] in dataset_names:
            datapaths.append(dataset["data_path"])

    if not datapaths:
        raise ValueError("data path must exist for training/inference")

    datapaths = [
        envs.workspace_adapter_by_specific(path, workspace)
        for path in datapaths
    ]
    all_workers = [len(os.listdir(path)) for path in datapaths]
    all_workers.append(workers)
    max_worker_num = min(all_workers)

    if max_worker_num >= workers:
        return workers

    print("phases do not have enough data for training, "
          "set worker/gpu cards num from {} to {}".format(workers,
                                                          max_worker_num))
    return max_worker_num
def local_mpi_engine(args):
    print("launch cluster engine with cluster to run model: {}".format(
        args.model))
    from paddlerec.core.engine.local_mpi import LocalMPIEngine

    print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(
        args.model))

    mpi = util.run_which("mpirun")
    if not mpi:
        raise RuntimeError("can not find mpirun, please check environment")

    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["train.", "runner."])
    trainer_class = run_extras.get(
        "runner." + _envs["mode"] + ".runner_class", None)
    executor_mode = "train"
    distributed_strategy = run_extras.get(
        "runner." + _envs["mode"] + ".distribute_strategy", "async")
    fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode",
                                "ps")

    if trainer_class:
        trainer = trainer_class
    else:
        trainer = "GeneralTrainer"

    cluster_envs = {}
    cluster_envs["mpirun"] = mpi
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["train.trainer.platform"] = envs.get_platform()

    set_runtime_envs(cluster_envs, args.model)
    launch = LocalMPIEngine(cluster_envs, args.model)
    return launch
def worker():
    role = "WORKER"

    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["train.", "runner."])
    trainer_class = run_extras.get(
        "runner." + _envs["mode"] + ".trainer_class", None)

    if trainer_class:
        trainer = trainer_class
    else:
        trainer = "GeneralTrainer"

    executor_mode = "train"
    distributed_strategy = run_extras.get(
        "runner." + _envs["mode"] + ".distribute_strategy", "async")
    selected_gpus = run_extras.get(
        "runner." + _envs["mode"] + ".selected_gpus", "0")
    fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode",
                                "ps")

    cluster_envs = {}
    cluster_envs["selected_gpus"] = selected_gpus
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.engine"] = "cluster"
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
        "CPU_NUM")
    cluster_envs["train.trainer.platform"] = envs.get_platform()

    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
def single_infer_engine(args):
    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["runner."])

    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    executor_mode = "infer"

    single_envs = {}

    if device.upper() == "GPU":
        selected_gpus_num = len(selected_gpus.split(","))
        if selected_gpus_num != 1:
            raise ValueError(
                "Single Mode Only Support One GPU, Set Local Cluster Mode to use Multi-GPUS"
            )

        single_envs["selected_gpus"] = selected_gpus
        single_envs["FLAGS_selected_gpus"] = selected_gpus

    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.executor_mode"] = executor_mode
    single_envs["fleet_mode"] = fleet_mode
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.platform"] = envs.get_platform()
    single_envs["train.trainer.engine"] = "single"

    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
def __init__(self, config=None):
    self._status_processor = {}
    self.model = None
    self.inference_models = []
    self.increment_models = []
    self._exector_context = {}
    self._context = {'status': 'uninit', 'is_exit': False}
    self._context["config_yaml"] = config

    self._model = {}
    self._dataset = {}

    self._runner_name = envs.get_runtime_environ("mode")
    self._context["runner_name"] = self._runner_name

    phase_names = envs.get_global_env(
        "runner." + self._runner_name + ".phases", None)
    _config = envs.load_yaml(config)

    self._context["env"] = _config
    self._context["dataset"] = _config.get("dataset")
    phases = []
    if phase_names is None:
        phases = _config.get("phase")
    else:
        for phase in _config.get("phase"):
            if phase["name"] in phase_names:
                phases.append(phase)
    self._context["phases"] = phases

    print("PaddleRec: Runner {} Begin".format(self._runner_name))

    self.which_engine()
    self.which_device()
    self.which_fleet_mode()
    self.which_executor_mode()
    self.legality_check()
def yaml_validation(config):
    all_checkers, require_checkers = register()

    _config = envs.load_yaml(config)
    flattens = envs.flatten_environs(_config)

    for required in require_checkers:
        if required not in flattens.keys():
            print("\ncan not find {} in yaml, which is required\n".format(
                required))
            return False

    for name, flatten in flattens.items():
        checker = all_checkers.get(name, None)
        if not checker:
            continue
        ret = checker.is_valid(name, flattens)
        if not ret:
            return False

    return True
def yaml_validation(config):
    all_checkers = register()

    require_checkers = []
    for name, checker in all_checkers.items():
        if checker.required:
            require_checkers.append(name)

    _config = envs.load_yaml(config)

    for required in require_checkers:
        if required not in _config.keys():
            print("\ncan not find {} in yaml, which is required\n".format(
                required))
            return False

    for name, value in _config.items():
        checker = all_checkers.get(name, None)
        if checker:
            ret = checker.is_valid(name, value)
            if not ret:
                return False

    return True
def create(config):
    _config = envs.load_yaml(config)
    envs.set_global_envs(_config)
    envs.update_workspace()
    trainer = TrainerFactory._build_trainer(config)
    return trainer
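
# Usage sketch (illustrative, not part of the original source): an entry point
# would normally validate the yaml before building the trainer, so a bad
# config fails fast. The config path below is hypothetical;
# TrainerFactory.create is the same call used by the worker/infer engines
# above, and yaml_validation is the checker shown earlier.
def _example_validate_and_build(config_path="models/rank/dnn/config.yaml"):
    if not yaml_validation(config_path):
        raise SystemExit("config {} failed validation".format(config_path))
    return TrainerFactory.create(config_path)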
def __init__(self):
    # get backend env
    backend_yaml = envs.get_runtime_environ("backend_yaml")
    _env = envs.load_yaml(backend_yaml)
    self.backend_env = envs.flatten_environs(_env, ".")
    self.cluster_env = {}
def __init__(self, config):
    dg.MultiSlotDataGenerator.__init__(self)
    _config = envs.load_yaml(config)
def __init__(self, config):
    dg.MultiSlotDataGenerator.__init__(self)
    _config = envs.load_yaml(config)
    envs.set_global_envs(_config)
    envs.update_workspace()