def create_dataset(self, dataset_name, context):
    name = "dataset." + dataset_name + "."
    type_name = envs.get_global_env(name + "type")
    if envs.get_platform() != "LINUX":
        print("platform ", envs.get_platform(), " change reader to DataLoader")
        type_name = "DataLoader"
    if type_name == "DataLoader":
        return None
    else:
        return self._get_dataset(dataset_name, context)

def _create_dataset(self, dataset_name):
    name = "dataset." + dataset_name + "."
    # Slot layout and reader settings resolved from the dataset config.
    sparse_slots = envs.get_global_env(name + "sparse_slots")
    dense_slots = envs.get_global_env(name + "dense_slots")
    thread_num = envs.get_global_env(name + "thread_num")
    batch_size = envs.get_global_env(name + "batch_size")
    type_name = envs.get_global_env(name + "type")
    if envs.get_platform() != "LINUX":
        print("platform ", envs.get_platform(), " change reader to DataLoader")
        type_name = "DataLoader"
    if type_name == "DataLoader":
        return None
    else:
        return self._get_dataset(dataset_name)

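# Illustrative config sketch (hypothetical names, not a shipped YAML): the
# two dataset helpers above resolve keys of the form "dataset.<name>.<field>",
# so a matching config entry would look roughly like:
#
#   dataset:
#     - name: sample_train          # referenced as dataset.sample_train.*
#       type: QueueDataset          # anything but "DataLoader" takes the
#                                   # _get_dataset path
#       batch_size: 32
#       thread_num: 2
#       sparse_slots: "label feat1 feat2"
#       dense_slots: "dense_var:13"
#
# On non-LINUX platforms both helpers force the type to "DataLoader" and
# return None, so the Dataset path is effectively Linux-only.
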
def single_infer_engine(args):
    trainer = get_trainer_prefix(args) + "SingleInfer"
    single_envs = {}
    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.engine"] = "single_infer"
    single_envs["train.trainer.platform"] = envs.get_platform()
    print("use {} engine to run model: {}".format(trainer, args.model))
    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer

def processor_register(self):
    self.regist_context_processor('uninit', self.instance)
    self.regist_context_processor('init_pass', self.init)
    self.regist_context_processor('startup_pass', self.startup)
    if envs.get_platform() == "LINUX" and envs.get_global_env(
            "dataset_class", None, "train.reader") != "DataLoader":
        self.regist_context_processor('train_pass', self.dataset_train)
    else:
        self.regist_context_processor('train_pass', self.dataloader_train)
    self.regist_context_processor('infer_pass', self.infer)
    self.regist_context_processor('terminal_pass', self.terminal)

def online_learning(args):
    trainer = "OnlineLearningTrainer"
    single_envs = {}
    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.engine"] = "online_learning"
    single_envs["train.trainer.platform"] = envs.get_platform()
    print("use {} engine to run model: {}".format(trainer, args.model))
    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer

def cluster_mpi_engine(args):
    print("launch cluster mpi engine to run model: {}".format(args.model))
    cluster_envs = {}
    cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer"
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    set_runtime_envs(cluster_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer

def __init__(self, config):
    """Initialize the members shared by the train and infer passes."""
    self._cost = None
    self._metrics = {}
    self._data_var = []
    self._infer_data_var = []
    self._infer_results = {}
    self._data_loader = None
    self._infer_data_loader = None
    self._fetch_interval = 20
    self._namespace = "train.model"
    self._platform = envs.get_platform()

def local_cluster_engine(args):
    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["train.", "runner."])
    trainer_class = run_extras.get(
        "runner." + _envs["mode"] + ".runner_class", None)
    trainer = trainer_class or "GeneralTrainer"
    executor_mode = "train"
    distributed_strategy = run_extras.get(
        "runner." + _envs["mode"] + ".distribute_strategy", "async")
    worker_num = run_extras.get("runner." + _envs["mode"] + ".worker_num", 1)
    server_num = run_extras.get("runner." + _envs["mode"] + ".server_num", 1)
    selected_gpus = run_extras.get(
        "runner." + _envs["mode"] + ".selected_gpus", "0")
    fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode", "")
    if fleet_mode == "":
        device = run_extras.get("runner." + _envs["mode"] + ".device", "cpu")
        if len(selected_gpus.split(",")) > 1 and device.upper() == "GPU":
            fleet_mode = "COLLECTIVE"
        else:
            fleet_mode = "PS"

    cluster_envs = {}
    cluster_envs["server_num"] = server_num
    cluster_envs["worker_num"] = worker_num
    cluster_envs["selected_gpus"] = selected_gpus
    cluster_envs["start_port"] = envs.find_free_port()
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    cluster_envs["CPU_NUM"] = "2"
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))
    set_runtime_envs(cluster_envs, args.model)
    launch = LocalClusterEngine(cluster_envs, args.model)
    return launch

def worker(mode):
    if not mode:
        raise ValueError("mode: {} can not be recognized".format(mode))
    run_extras = get_all_inters_from_yaml(args.model, ["runner."])
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
    strategy_class = ".".join(["runner", mode, "distribute_strategy"])
    worker_class = ".".join(["runner", mode, "worker_num"])
    server_class = ".".join(["runner", mode, "server_num"])

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    distributed_strategy = run_extras.get(strategy_class, "async")
    worker_num = run_extras.get(worker_class, 1)
    server_num = run_extras.get(server_class, 1)
    executor_mode = "train"

    device = device.upper()
    fleet_mode = fleet_mode.upper()
    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    cluster_envs = {}
    if device == "GPU":
        cluster_envs["selected_gpus"] = selected_gpus
    cluster_envs["server_num"] = server_num
    cluster_envs["worker_num"] = worker_num
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.engine"] = "cluster"
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
        "CPU_NUM")
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))
    set_runtime_envs(cluster_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer

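# A minimal sketch (hypothetical values) of the "runner" section that the
# cluster/worker helpers above read via get_all_inters_from_yaml: each
# option is flattened to a dotted key "runner.<mode>.<field>" and fetched
# with a default when absent, e.g.
#
#   runner:
#     - name: train_runner           # the <mode> selected at runtime
#       trainer_class: GeneralTrainer
#       device: gpu                  # upper-cased before the checks above
#       fleet_mode: collective       # COLLECTIVE requires device == GPU
#       selected_gpus: "0,1"
#       distribute_strategy: async
#       worker_num: 2
#       server_num: 1
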
def __init__(self, config):
    """Initialize the members shared by the train and infer passes."""
    self._cost = None
    self._metrics = {}
    self._data_var = []
    self._infer_data_var = []
    self._infer_results = {}
    self._data_loader = None
    self._infer_data_loader = None
    self._fetch_interval = 20
    self._platform = envs.get_platform()
    # Subclass hook; note it runs before self._env is assigned below.
    self._init_hyper_parameters()
    self._env = config
    self._slot_inited = False

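# A minimal subclass sketch (hypothetical model, assuming the enclosing
# class is the model base class): concrete models fill in their hyper
# parameters inside _init_hyper_parameters(). Since the constructor above
# calls it before self._env is assigned, the hook should read configuration
# through envs rather than self._env, e.g.
#
# class MyModel(ModelBase):
#     def _init_hyper_parameters(self):
#         self.learning_rate = envs.get_global_env(
#             "hyper_parameters.optimizer.learning_rate")
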
def worker():
    role = "WORKER"
    trainer = get_trainer_prefix(args) + "ClusterTrainer"
    cluster_envs = {}
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.engine"] = "cluster"
    cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
        "CPU_NUM")
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))
    set_runtime_envs(cluster_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer

def single_infer_engine(args):
    run_extras = get_all_inters_from_yaml(args.model, ["runner."])
    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
    epochs_class = ".".join(["runner", mode, "epochs"])

    epochs = run_extras.get(epochs_class, 1)
    if epochs > 1:
        warnings.warn(
            "It makes no sense to run inference on the same model for multiple epochs",
            category=UserWarning,
            stacklevel=2)

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    executor_mode = "infer"

    single_envs = {}
    if device.upper() == "GPU":
        selected_gpus_num = len(selected_gpus.split(","))
        if selected_gpus_num != 1:
            raise ValueError(
                "single mode supports only one GPU; use local cluster mode for multiple GPUs"
            )
        single_envs["selected_gpus"] = selected_gpus
        single_envs["FLAGS_selected_gpus"] = selected_gpus
    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.executor_mode"] = executor_mode
    single_envs["fleet_mode"] = fleet_mode
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.platform"] = envs.get_platform()
    single_envs["train.trainer.engine"] = "single"
    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer

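# Note (assumption based on Paddle's GFlags convention, not stated here):
# FLAGS_selected_gpus is the process-level flag PaddlePaddle reads to pick
# the visible GPU, so setting it in single_envs pins single-process infer to
# one card. The same effect can be had from the shell, e.g.
#
#   FLAGS_selected_gpus=0 python -m paddlerec.run -m config.yaml
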
def local_mpi_engine(args):
    from paddlerec.core.engine.local_mpi import LocalMPIEngine
    print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(
        args.model))
    mpi = util.run_which("mpirun")
    if not mpi:
        raise RuntimeError("can not find mpirun, please check environment")

    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["train.", "runner."])
    trainer_class = run_extras.get(
        "runner." + _envs["mode"] + ".runner_class", None)
    trainer = trainer_class or "GeneralTrainer"
    executor_mode = "train"
    distributed_strategy = run_extras.get(
        "runner." + _envs["mode"] + ".distribute_strategy", "async")
    fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode",
                                "ps")

    cluster_envs = {}
    cluster_envs["mpirun"] = mpi
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    set_runtime_envs(cluster_envs, args.model)
    launch = LocalMPIEngine(cluster_envs, args.model)
    return launch

def worker(mode):
    if not mode:
        raise ValueError("mode: {} can not be recognized".format(mode))
    from paddlerec.core.engine.cluster.cluster import ClusterEngine

    run_extras = get_all_inters_from_yaml(args.model, ["runner."])
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    strategy_class = ".".join(["runner", mode, "distribute_strategy"])

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    distributed_strategy = run_extras.get(strategy_class, "async")
    executor_mode = "train"

    device = device.upper()
    fleet_mode = fleet_mode.upper()
    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    cluster_envs = {}
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["engine_role"] = "WORKER"
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.engine"] = "cluster"
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
        "CPU_NUM")
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))
    set_runtime_envs(cluster_envs, args.model)
    launch = ClusterEngine(None, args.model)
    return launch

def worker():
    role = "WORKER"
    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["train.", "runner."])
    trainer_class = run_extras.get(
        "runner." + _envs["mode"] + ".trainer_class", None)
    trainer = trainer_class or "GeneralTrainer"
    executor_mode = "train"
    distributed_strategy = run_extras.get(
        "runner." + _envs["mode"] + ".distribute_strategy", "async")
    selected_gpus = run_extras.get(
        "runner." + _envs["mode"] + ".selected_gpus", "0")
    fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode",
                                "ps")

    cluster_envs = {}
    cluster_envs["selected_gpus"] = selected_gpus
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.engine"] = "cluster"
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
        "CPU_NUM")
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))
    set_runtime_envs(cluster_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer

def local_mpi_engine(args):
    from paddlerec.core.engine.local_mpi import LocalMPIEngine
    print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(
        args.model))
    mpi = util.run_which("mpirun")
    if not mpi:
        raise RuntimeError("can not find mpirun, please check environment")

    cluster_envs = {}
    cluster_envs["mpirun"] = mpi
    cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer"
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    set_runtime_envs(cluster_envs, args.model)
    launch = LocalMPIEngine(cluster_envs, args.model)
    return launch

def local_cluster_engine(args):
    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    trainer = get_trainer_prefix(args) + "ClusterTrainer"
    cluster_envs = {}
    cluster_envs["server_num"] = 1
    cluster_envs["worker_num"] = 1
    cluster_envs["start_port"] = envs.find_free_port()
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.strategy"] = "async"
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    cluster_envs["CPU_NUM"] = "2"
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))
    set_runtime_envs(cluster_envs, args.model)
    launch = LocalClusterEngine(cluster_envs, args.model)
    return launch

def local_cluster_engine(args):
    def get_worker_num(run_extras, workers):
        _envs = envs.load_yaml(args.model)
        mode = envs.get_runtime_environ("mode")
        workspace = envs.get_runtime_environ("workspace")
        phases_class = ".".join(["runner", mode, "phases"])
        phase_names = run_extras.get(phases_class)
        phases = []
        all_phases = _envs.get("phase")
        if phase_names is None:
            phases = all_phases
        else:
            for phase in all_phases:
                if phase["name"] in phase_names:
                    phases.append(phase)

        dataset_names = []
        for phase in phases:
            dataset_names.append(phase["dataset_name"])

        datapaths = []
        for dataset in _envs.get("dataset"):
            if dataset["name"] in dataset_names:
                datapaths.append(dataset["data_path"])

        if not datapaths:
            raise ValueError("data path must exist for training/inference")

        datapaths = [
            envs.workspace_adapter_by_specific(path, workspace)
            for path in datapaths
        ]
        all_workers = [len(os.listdir(path)) for path in datapaths]
        all_workers.append(workers)
        max_worker_num = min(all_workers)
        if max_worker_num >= workers:
            return workers

        print(
            "phases do not have enough data files for training, set worker/gpu cards num from {} to {}"
            .format(workers, max_worker_num))
        return max_worker_num

    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    run_extras = get_all_inters_from_yaml(args.model, ["runner."])
    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
    strategy_class = ".".join(["runner", mode, "distribute_strategy"])
    worker_class = ".".join(["runner", mode, "worker_num"])
    server_class = ".".join(["runner", mode, "server_num"])

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    distributed_strategy = run_extras.get(strategy_class, "async")
    executor_mode = "train"
    worker_num = run_extras.get(worker_class, 1)
    server_num = run_extras.get(server_class, 1)
    device = device.upper()
    fleet_mode = fleet_mode.upper()

    cluster_envs = {}

    # TODO: delete the following hard code when paddle supports ps-gpu.
    if device == "CPU":
        fleet_mode = "PS"
    elif device == "GPU":
        fleet_mode = "COLLECTIVE"
    if fleet_mode == "PS" and device != "CPU":
        raise ValueError("PS can not be used with GPU")
    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    if fleet_mode == "PS":
        worker_num = get_worker_num(run_extras, worker_num)

    if fleet_mode == "COLLECTIVE":
        cluster_envs["selected_gpus"] = selected_gpus
        gpus = selected_gpus.split(",")
        worker_num = get_worker_num(run_extras, len(gpus))
        cluster_envs["selected_gpus"] = ','.join(gpus[:worker_num])

    cluster_envs["server_num"] = server_num
    cluster_envs["worker_num"] = worker_num
    cluster_envs["start_port"] = envs.find_free_port()
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["CPU_NUM"] = cluster_envs["train.trainer.threads"]
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()

    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))
    set_runtime_envs(cluster_envs, args.model)
    launch = LocalClusterEngine(cluster_envs, args.model)
    return launch