Example #1
    def start_worker_procs(self):
        if envs.get_runtime_environ("fleet_mode") == "COLLECTIVE":
            #trainer_ports = os.getenv("TRAINER_PORTS", None).split(",")
            cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
            if cuda_visible_devices is None or cuda_visible_devices == "":
                selected_gpus = range(int(os.getenv("TRAINER_GPU_CARD_COUNT")))
            else:
                # change selected_gpus into relative values
                # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
                # therefore selected_gpus=0,1,2,3
                cuda_visible_devices_list = cuda_visible_devices.split(',')
                for x in range(int(os.getenv("TRAINER_GPU_CARD_COUNT"))):
                    assert str(x) in cuda_visible_devices_list, "Can't find "\
                        "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
                        % (x, cuda_visible_devices)
                selected_gpus = [
                    cuda_visible_devices_list.index(str(x))
                    for x in range(int(os.getenv("TRAINER_GPU_CARD_COUNT")))
                ]
            print("selected_gpus:{}".format(selected_gpus))

            factory = "paddlerec.core.factory"
            cmd = [sys.executable, "-u", "-m", factory, self.trainer]
            logs_dir = envs.get_runtime_environ("log_dir")
            print("use_paddlecloud_flag:{}".format(
                cluster_utils.use_paddlecloud()))
            if cluster_utils.use_paddlecloud():
                cluster, pod = cluster_utils.get_cloud_cluster(selected_gpus)
                logger.info("get cluster from cloud:{}".format(cluster))
                procs = cluster_utils.start_local_trainers(cluster,
                                                           pod,
                                                           cmd,
                                                           log_dir=logs_dir)
                print("cluster:{}".format(cluster))
                print("pod:{}".format(pod))
        else:
            trainer = TrainerFactory.create(self.trainer)
            trainer.run()
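The remapping in the COLLECTIVE branch is the subtle part: CUDA renumbers the visible devices from zero, so physical GPU ids have to be translated into relative ones before launch. A standalone sketch of just that translation, with hypothetical inputs matching the comment above:

def to_relative_gpus(selected_gpus, cuda_visible_devices):
    # CUDA_VISIBLE_DEVICES="4,5,6,7" exposes physical GPU 4 as device 0,
    # GPU 5 as device 1, and so on; remap the selected physical ids.
    visible = cuda_visible_devices.split(',')
    for gpu in selected_gpus:
        assert gpu in visible, "Can't find gpu %s in CUDA_VISIBLE_DEVICES[%s]" % (
            gpu, cuda_visible_devices)
    return [visible.index(gpu) for gpu in selected_gpus]

print(to_relative_gpus(["4", "5", "6", "7"], "4,5,6,7"))  # [0, 1, 2, 3]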
Example #2
    def _get_dataset(self, state="TRAIN"):
        if state == "TRAIN":
            inputs = self.model.get_inputs()
            namespace = "train.reader"
            train_data_path = envs.get_global_env("train_data_path", None,
                                                  namespace)
        else:
            inputs = self.model.get_infer_inputs()
            namespace = "evaluate.reader"
            train_data_path = envs.get_global_env("test_data_path", None,
                                                  namespace)

        sparse_slots = envs.get_global_env("sparse_slots", None, namespace)
        dense_slots = envs.get_global_env("dense_slots", None, namespace)

        threads = int(envs.get_runtime_environ("train.trainer.threads"))
        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')

        if sparse_slots is None and dense_slots is None:
            pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
                                                   self._config_yaml)
        else:
            padding = envs.get_global_env("padding", 0, namespace)
            pipe_cmd = "python {} {} {} {} {} {} {} {}".format(
                reader, "slot", "slot", self._config_yaml, namespace, \
                sparse_slots.replace(" ", "#"), dense_slots.replace(" ", "#"), str(padding))

        if train_data_path.startswith("paddlerec::"):
            package_base = envs.get_runtime_environ("PACKAGE_BASE")
            assert package_base is not None
            train_data_path = os.path.join(package_base,
                                           train_data_path.split("::")[1])

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
        dataset.set_pipe_command(pipe_cmd)
        dataset.set_batch_size(batch_size)
        dataset.set_thread(threads)
        file_list = [
            os.path.join(train_data_path, x)
            for x in os.listdir(train_data_path)
        ]
        self.files = file_list
        dataset.set_filelist(self.files)

        debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
        if debug_mode:
            print("--- Dataset Debug Mode Begin , show pre 10 data of {}---".
                  format(file_list[0]))
            os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd))
            print("--- Dataset Debug Mode End , show pre 10 data of {}---".
                  format(file_list[0]))
            exit(0)

        return dataset
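The debug branch relies on the fact that a pipe-command dataset simply streams each raw data file through a shell command line by line, so the transformed records can be previewed outside of fluid. A minimal sketch of that preview (file name and command here are hypothetical):

import subprocess

def preview_pipe(data_file, pipe_cmd, n=10):
    # Equivalent to: cat {data_file} | {pipe_cmd} | head -{n}
    with open(data_file, "rb") as f:
        proc = subprocess.Popen(pipe_cmd, shell=True, stdin=f,
                                stdout=subprocess.PIPE)
        for _ in range(n):
            line = proc.stdout.readline()
            if not line:
                break
            print(line.decode().rstrip())
        proc.kill()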
Example #3
def dataloader_by_name(readerclass, dataset_name, yaml_file):
    reader_class = lazy_instance_by_fliename(readerclass, "TrainReader")
    name = "dataset." + dataset_name + "."
    data_path = get_global_env(name + "data_path")

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
    reader = reader_class(yaml_file)
    reader.init()

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    it = reader.generate_sample(line)
                    for parsed_line in it():
                        if parsed_line is None:
                            continue
                        values = [parsed[1] for parsed in parsed_line]
                        yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader
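gen_reader assumes a specific reader contract: generate_sample(line) returns a nullary callable that yields lists of (name, value) pairs, of which only the values are kept. A toy reader satisfying that contract (the class and field names here are illustrative, not the real paddlerec base class):

class ToyReader(object):
    def init(self):
        pass

    def generate_sample(self, line):
        def gen():
            tokens = line.split()
            yield [("label", [int(tokens[0])]),
                   ("feat_ids", [int(t) for t in tokens[1:]])]
        return gen

reader = ToyReader()
reader.init()
sample = reader.generate_sample("1 101 102 103")
for parsed_line in sample():
    print([pair[1] for pair in parsed_line])  # [[1], [101, 102, 103]]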
Example #4
    def __init_impl__(self):
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        backend = envs.get_runtime_environ("engine_backend")
        if backend == "PaddleCloud":
            self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
        else:
            raise ValueError("{} is not supported now".format(backend))
Example #5
    def master():
        from paddlerec.core.engine.cluster.cluster import ClusterEngine

        # Get fleet_mode & device
        run_extras = get_all_inters_from_yaml(args.model, ["runner."])
        mode = envs.get_runtime_environ("mode")
        fleet_class = ".".join(["runner", mode, "fleet_mode"])
        device_class = ".".join(["runner", mode, "device"])
        fleet_mode = run_extras.get(fleet_class, "ps")
        device = run_extras.get(device_class, "cpu")
        device = device.upper()
        fleet_mode = fleet_mode.upper()

        if fleet_mode == "COLLECTIVE" and device != "GPU":
            raise ValueError("COLLECTIVE can not be used without GPU")

        # Get Thread nums
        model_envs = envs.load_yaml(args.model)
        phases_class = ".".join(["runner", mode, "phases"])
        phase_names = run_extras.get(phases_class)
        phases = []
        all_phases = model_envs.get("phase")
        if phase_names is None:
            phases = all_phases
        else:
            for phase in all_phases:
                if phase["name"] in phase_names:
                    phases.append(phase)

        thread_num = []
        for phase in phases:
            thread_num.append(int(phase["thread_num"]))
        max_thread_num = max(thread_num)

        backend_envs = envs.load_yaml(args.backend)
        flattens = envs.flatten_environs(backend_envs, "_")
        flattens["engine_role"] = "MASTER"
        flattens["engine_mode"] = envs.get_runtime_environ("mode")
        flattens["engine_run_config"] = args.model
        flattens["max_thread_num"] = max_thread_num
        flattens["fleet_mode"] = fleet_mode
        flattens["device"] = device
        flattens["backend_yaml"] = args.backend
        envs.set_runtime_environs(flattens)

        launch = ClusterEngine(None, args.model)
        return launch
Example #6
    def get_worker_num(run_extras, workers):
        _envs = envs.load_yaml(args.model)
        mode = envs.get_runtime_environ("mode")
        workspace = envs.get_runtime_environ("workspace")
        phases_class = ".".join(["runner", mode, "phases"])
        phase_names = run_extras.get(phases_class)
        phases = []
        all_phases = _envs.get("phase")
        if phase_names is None:
            phases = all_phases
        else:
            for phase in all_phases:
                if phase["name"] in phase_names:
                    phases.append(phase)

        dataset_names = []
        for phase in phases:
            dataset_names.append(phase["dataset_name"])

        datapaths = []
        for dataset in _envs.get("dataset"):
            if dataset["name"] in dataset_names:
                datapaths.append(dataset["data_path"])

        if not datapaths:
            raise ValueError("data path must exist for training/inference")

        datapaths = [
            envs.workspace_adapter_by_specific(path, workspace)
            for path in datapaths
        ]

        all_workers = [len(os.listdir(path)) for path in datapaths]
        all_workers.append(workers)
        max_worker_num = min(all_workers)

        if max_worker_num >= workers:
            return workers

        print(
            "phases do not have enough data for training, set worker/gpu cards num from {} to {}"
            .format(workers, max_worker_num))

        return max_worker_num
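The capping rule above reduces to taking the minimum over the per-dataset file counts and the requested worker count, since a worker without at least one file would sit idle. Isolated as a sketch with made-up numbers:

def cap_workers(requested, file_counts):
    # Effective workers: never more than requested, never more than the
    # smallest file count among the datasets in use.
    return min(file_counts + [requested])

print(cap_workers(4, [10, 3, 8]))  # 3: one dataset has only 3 files
print(cap_workers(4, [10, 9, 8]))  # 4: enough files everywhere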
Example #7
    def __init_impl__(self):
        self.role = envs.get_runtime_environ("engine_role")
        if self.role == "WORKER":
            return

        abs_dir = os.path.dirname(os.path.abspath(__file__))
        os.environ["abs_dir"] = str(abs_dir)

        self.backend = envs.get_runtime_environ("backend")
        if not self.backend:
            self.backend = ""
        self.backend = self.backend.upper()
        if self.backend == "PADDLECLOUD":
            self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
        elif self.backend == "KUBERNETES":
            self.submit_script = os.path.join(abs_dir, "k8s/cluster.sh")
        else:
            raise ValueError("{} can not be supported now".format(
                self.backend))
Example #8
    def _get_dataset(self, state="TRAIN", hour=None):
        if state == "TRAIN":
            inputs = self.model.get_inputs()
            namespace = "train.reader"
            train_data_path = envs.get_global_env("train_data_path", None,
                                                  namespace)
        else:
            inputs = self.model.get_infer_inputs()
            namespace = "evaluate.reader"
            train_data_path = envs.get_global_env("test_data_path", None,
                                                  namespace)

        threads = int(envs.get_runtime_environ("train.trainer.threads"))
        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
        pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
                                               self._config_yaml)

        if train_data_path.startswith("paddlerec::"):
            package_base = envs.get_runtime_environ("PACKAGE_BASE")
            assert package_base is not None
            train_data_path = os.path.join(package_base,
                                           train_data_path.split("::")[1])

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
        dataset.set_pipe_command(pipe_cmd)
        dataset.set_batch_size(batch_size)
        dataset.set_thread(threads)

        if hour is not None:
            train_data_path = os.path.join(train_data_path, hour)

        file_list = [
            os.path.join(train_data_path, x)
            for x in os.listdir(train_data_path)
        ]

        self.files = file_list
        dataset.set_filelist(self.files)
        return dataset
Example #9
    def which_executor_mode(self):
        executor_mode = envs.get_runtime_environ("train.trainer.executor_mode")
        if executor_mode.upper() not in ["TRAIN", "INFER"]:
            raise ValueError(
                "Unsupported Executor Mode {}".format(executor_mode))
        self.is_infer = (executor_mode.upper() == "INFER")
        print("Executor Mode: {}".format(executor_mode))
        self._context["is_infer"] = self.is_infer
Example #10
    def run(self):
        role = envs.get_runtime_environ("engine_role")

        if role == "MASTER":
            self.start_master_procs()

        elif role == "WORKER":
            self.start_worker_procs()

        else:
            raise ValueError("role {} error, must in MASTER/WORKER".format(role))
Example #11
def slotdataloader(readerclass, train, yaml_file, context):
    if train == "TRAIN":
        reader_name = "SlotReader"
        namespace = "train.reader"
        data_path = get_global_env("train_data_path", None, namespace)
    else:
        reader_name = "SlotReader"
        namespace = "evaluate.reader"
        data_path = get_global_env("test_data_path", None, namespace)

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
    if context["engine"] == EngineMode.LOCAL_CLUSTER:
        files = split_files(files, context["fleet"].worker_index(),
                            context["fleet"].worker_num())
        print("file_list: {}".format(files))

    sparse = get_global_env("sparse_slots", "#", namespace)
    if sparse == "":
        sparse = "#"
    dense = get_global_env("dense_slots", "#", namespace)
    if dense == "":
        dense = "#"
    padding = get_global_env("padding", 0, namespace)
    reader = SlotReader(yaml_file)
    reader.init(sparse, dense, int(padding))

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    it = reader.generate_sample(line)
                    for parsed_line in it():
                        if parsed_line is None:
                            continue
                        values = [parsed[1] for parsed in parsed_line]
                        yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader
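split_files itself is not shown in these examples; what matters is that each worker ends up with a disjoint shard of the file list. A plausible round-robin sketch of that behavior (the real paddlerec implementation may slice differently):

def split_files_sketch(files, worker_index, worker_num):
    # Round-robin assignment: worker i takes files i, i+n, i+2n, ...
    return files[worker_index::worker_num]

files = ["part-0", "part-1", "part-2", "part-3", "part-4"]
print(split_files_sketch(files, 0, 2))  # ['part-0', 'part-2', 'part-4']
print(split_files_sketch(files, 1, 2))  # ['part-1', 'part-3']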
Example #12
    def which_fleet_mode(self):
        fleet_mode = envs.get_runtime_environ("fleet_mode")
        if fleet_mode.upper() == "PS":
            self.fleet_mode = FleetMode.PS
        elif fleet_mode.upper() == "COLLECTIVE":
            self.fleet_mode = FleetMode.COLLECTIVE
        elif fleet_mode.upper() == "PSLIB":
            self.fleet_mode = FleetMode.PSLIB
        else:
            raise ValueError("Not Support Fleet Mode {}".format(fleet_mode))

        self._context["is_pslib"] = (fleet_mode.upper() == "PSLIB")
        self._context["fleet_mode"] = fleet_mode
Example #13
    def __init_impl__(self):
        abs_dir = os.path.dirname(os.path.abspath(__file__))

        backend = envs.get_runtime_environ("engine_backend")
        if not backend:
            backend = ""
        backend = backend.upper()
        if backend == "PADDLECLOUD":
            self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
        elif backend == "KUBERNETES":
            self.submit_script = os.path.join(abs_dir, "k8s/cluster.sh")
        else:
            raise ValueError("{} can not be supported now".format(backend))
Example #14
    def worker(mode):
        if not mode:
            raise ValueError("mode: {} can not be recognized")

        run_extras = get_all_inters_from_yaml(args.model, ["runner."])

        trainer_class = ".".join(["runner", mode, "trainer_class"])
        fleet_class = ".".join(["runner", mode, "fleet_mode"])
        device_class = ".".join(["runner", mode, "device"])
        selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
        strategy_class = ".".join(["runner", mode, "distribute_strategy"])
        worker_class = ".".join(["runner", mode, "worker_num"])
        server_class = ".".join(["runner", mode, "server_num"])

        trainer = run_extras.get(trainer_class, "GeneralTrainer")
        fleet_mode = run_extras.get(fleet_class, "ps")
        device = run_extras.get(device_class, "cpu")
        selected_gpus = run_extras.get(selected_gpus_class, "0")
        distributed_strategy = run_extras.get(strategy_class, "async")
        worker_num = run_extras.get(worker_class, 1)
        server_num = run_extras.get(server_class, 1)
        executor_mode = "train"

        device = device.upper()
        fleet_mode = fleet_mode.upper()

        if fleet_mode == "COLLECTIVE" and device != "GPU":
            raise ValueError("COLLECTIVE can not be used with GPU")

        cluster_envs = {}

        if device == "GPU":
            cluster_envs["selected_gpus"] = selected_gpus

        cluster_envs["server_num"] = server_num
        cluster_envs["worker_num"] = worker_num
        cluster_envs["fleet_mode"] = fleet_mode
        cluster_envs["train.trainer.trainer"] = trainer
        cluster_envs["train.trainer.engine"] = "cluster"
        cluster_envs["train.trainer.executor_mode"] = executor_mode
        cluster_envs["train.trainer.strategy"] = distributed_strategy
        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
            "CPU_NUM")
        cluster_envs["train.trainer.platform"] = envs.get_platform()
        print("launch {} engine with cluster to with model: {}".format(
            trainer, args.model))
        set_runtime_envs(cluster_envs, args.model)

        trainer = TrainerFactory.create(args.model)
        return trainer
Example #15
    def master():
        from paddlerec.core.engine.cluster.cluster import ClusterEngine
        _envs = envs.load_yaml(args.backend)
        flattens = envs.flatten_environs(_envs, "_")
        flattens["engine_role"] = "MASTER"
        flattens["engine_mode"] = envs.get_runtime_environ("mode")
        flattens["engine_run_config"] = args.model
        flattens["engine_temp_path"] = tempfile.mkdtemp()
        envs.set_runtime_environs(flattens)
        ClusterEngine.workspace_replace()
        print(envs.pretty_print_envs(flattens, ("Submit Envs", "Value")))

        launch = ClusterEngine(None, args.model)
        return launch
Example #16
    def paddlecloud_env_check(self):
        # get fleet mode
        fleet_mode = envs.get_runtime_environ("fleet_mode")
        # get device
        device = envs.get_runtime_environ("device")
        # get cluster type
        cluster_type = envs.get_runtime_environ("cluster_type")

        cluster_env_check_tool = None
        if cluster_type.upper() == "MPI":
            if device == "CPU" and fleet_mode == "PS":
                cluster_env_check_tool = PaddleCloudMpiEnv()
            else:
                raise ValueError(
                    "PaddleCloud with MPI does not support GPU training, check your config.yaml & backend.yaml"
                )
        elif cluster_type.upper() == "K8S":
            if fleet_mode == "PS":
                if device == "CPU":
                    cluster_env_check_tool = CloudPsCpuEnv()
                elif device == "GPU":
                    raise ValueError(
                        "PS-GPU on paddlecloud is not supported at this time, coming soon"
                    )
            if fleet_mode == "COLLECTIVE":
                if device == "GPU":
                    cluster_env_check_tool = CloudCollectiveEnv()
                elif device == "CPU":
                    raise ValueError(
                        "Unexpected config -> device: CPU with fleet_mode: Collective, check your config.yaml"
                    )
        else:
            raise ValueError(
                "cluster_type {} error, must in MPI/K8S".format(cluster_type))

        cluster_env_check_tool.env_check()
        cluster_env_check_tool.env_set()
Example #17
    def which_engine(self):
        engine = envs.get_runtime_environ("train.trainer.engine")
        if engine.upper() == "SINGLE":
            self.engine = EngineMode.SINGLE
            self.is_fleet = False
        elif engine.upper() == "LOCAL_CLUSTER":
            self.engine = EngineMode.LOCAL_CLUSTER
            self.is_fleet = True
        elif engine.upper() == "CLUSTER":
            self.engine = EngineMode.CLUSTER
            self.is_fleet = True
        else:
            raise ValueError("Unsupported Engine {}".format(engine))
        self._context["is_fleet"] = self.is_fleet
        self._context["engine"] = self.engine
Example #18
    def worker():
        role = "WORKER"
        trainer = get_trainer_prefix(args) + "ClusterTrainer"
        cluster_envs = {}
        cluster_envs["train.trainer.trainer"] = trainer
        cluster_envs["train.trainer.engine"] = "cluster"
        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
            "CPU_NUM")
        cluster_envs["train.trainer.platform"] = envs.get_platform()
        print("launch {} engine with cluster to with model: {}".format(
            trainer, args.model))
        set_runtime_envs(cluster_envs, args.model)

        trainer = TrainerFactory.create(args.model)
        return trainer
Example #19
def dataloader_by_name(readerclass,
                       dataset_name,
                       yaml_file,
                       context,
                       reader_class_name="Reader"):

    reader_class = lazy_instance_by_fliename(readerclass, reader_class_name)

    name = "dataset." + dataset_name + "."
    data_path = get_global_env(name + "data_path")

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
    if context["engine"] == EngineMode.LOCAL_CLUSTER:
        files = split_files(files, context["fleet"].worker_index(),
                            context["fleet"].worker_num())
    print("file_list : {}".format(files))

    reader = reader_class(yaml_file)
    reader.init()

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    it = reader.generate_sample(line)
                    for parsed_line in it():
                        if parsed_line is None:
                            continue
                        values = [parsed[1] for parsed in parsed_line]
                        yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader
Example #20
def single_infer_engine(args):
    run_extras = get_all_inters_from_yaml(args.model, ["runner."])

    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])

    epochs_class = ".".join(["runner", mode, "epochs"])
    epochs = run_extras.get(epochs_class, 1)
    if epochs > 1:
        warnings.warn(
            "It makes no sense to predict the same model for multiple epochs",
            category=UserWarning,
            stacklevel=2)

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    executor_mode = "infer"

    single_envs = {}

    if device.upper() == "GPU":
        selected_gpus_num = len(selected_gpus.split(","))
        if selected_gpus_num != 1:
            raise ValueError(
                "Single Mode Only Supports One GPU, Set Local Cluster Mode to use Multi-GPUs"
            )

        single_envs["selsected_gpus"] = selected_gpus
        single_envs["FLAGS_selected_gpus"] = selected_gpus

    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.executor_mode"] = executor_mode
    single_envs["fleet_mode"] = fleet_mode
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.platform"] = envs.get_platform()
    single_envs["train.trainer.engine"] = "single"

    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
Example #21
    def _build_strategy(self, context):
        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
        mode = envs.get_runtime_environ("train.trainer.strategy")
        assert mode in ["async", "geo", "sync"]

        strategy = None

        if context['fleet_mode'] == "PS":
            strategy = paddle.distributed.fleet.DistributedStrategy()
            if mode == 'async':
                strategy.a_sync = True
            elif mode == 'sync':
                strategy.a_sync = False
            elif mode == 'geo':
                strategy.a_sync = True
                strategy.a_sync_configs = {"k_steps": 400}
        elif context['fleet_mode'] == "COLLECTIVE":
            strategy = paddle.distributed.fleet.DistributedStrategy()
            strategy.sync_nccl_allreduce = True
            strategy.nccl_comm_num = 2
            strategy.fuse_all_reduce_ops = True

            # build strategy
            build_strategy = fluid.BuildStrategy()
            build_strategy.enable_sequential_execution = True
            build_strategy.fuse_elewise_add_act_ops = True
            build_strategy.fuse_bn_act_ops = True
            build_strategy.enable_auto_fusion = True
            build_strategy.fuse_all_optimizer_ops = True
            strategy.build_strategy = build_strategy

            # execute strategy
            execution_strategy = paddle.static.ExecutionStrategy()
            execution_strategy.num_threads = int(os.getenv('CPU_NUM', 2))
            execution_strategy.num_iteration_per_drop_scope = 100
            execution_strategy.num_iteration_per_run = 1
            strategy.execution_strategy = execution_strategy

        assert strategy is not None

        context["strategy"] = strategy
        return strategy
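The PS branch above encodes three training modes in a single flag pair: sync (a_sync=False), async (a_sync=True), and geo (async plus k_steps local updates batched into one push). A condensed sketch of just that selection, assuming the Paddle 2.x fleet API used in the example:

import paddle.distributed.fleet as fleet

def ps_strategy(mode, k_steps=400):
    strategy = fleet.DistributedStrategy()
    strategy.a_sync = mode in ("async", "geo")  # sync -> False
    if mode == "geo":
        # geo: accumulate k_steps local updates before each parameter push
        strategy.a_sync_configs = {"k_steps": k_steps}
    return strategy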
Example #22
def slotdataloader_by_name(readerclass, dataset_name, yaml_file):
    name = "dataset." + dataset_name + "."
    reader_name = "SlotReader"
    data_path = get_global_env(name + "data_path")

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
    sparse = get_global_env(name + "sparse_slots", "#")
    if sparse == "":
        sparse = "#"
    dense = get_global_env(name + "dense_slots", "#")
    if dense == "":
        dense = "#"
    padding = get_global_env(name + "padding", 0)
    reader = SlotReader(yaml_file)
    reader.init(sparse, dense, int(padding))

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    it = reader.generate_sample(line)
                    for parsed_line in it():
                        if parsed_line is None:
                            continue
                        values = [parsed[1] for parsed in parsed_line]
                        yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader
Example #23
    def build_strategy(self):
        mode = envs.get_runtime_environ("train.trainer.strategy")
        assert mode in ["async", "geo", "sync", "half_async"]

        strategy = None

        if mode == "async":
            strategy = StrategyFactory.create_async_strategy()
        elif mode == "geo":
            push_num = envs.get_global_env("train.strategy.mode.push_num", 100)
            strategy = StrategyFactory.create_geo_strategy(push_num)
        elif mode == "sync":
            strategy = StrategyFactory.create_sync_strategy()
        elif mode == "half_async":
            strategy = StrategyFactory.create_half_async_strategy()

        assert strategy is not None

        self.strategy = strategy
        return strategy
Example #24
    def _build_strategy(self, context):
        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
        mode = envs.get_runtime_environ("train.trainer.strategy")
        assert mode in ["async", "geo", "sync", "half_async"]

        strategy = None

        if mode == "async":
            strategy = StrategyFactory.create_async_strategy()
        elif mode == "geo":
            push_num = envs.get_global_env("train.strategy.mode.push_num", 100)
            strategy = StrategyFactory.create_geo_strategy(push_num)
        elif mode == "sync":
            strategy = StrategyFactory.create_sync_strategy()
        elif mode == "half_async":
            strategy = StrategyFactory.create_half_async_strategy()

        assert strategy is not None

        context["strategy"] = strategy
        return strategy
Example #25
    def worker(mode):
        if not mode:
            raise ValueError("mode: {} can not be recognized")
        from paddlerec.core.engine.cluster.cluster import ClusterEngine

        run_extras = get_all_inters_from_yaml(args.model, ["runner."])

        trainer_class = ".".join(["runner", mode, "trainer_class"])
        fleet_class = ".".join(["runner", mode, "fleet_mode"])
        device_class = ".".join(["runner", mode, "device"])
        strategy_class = ".".join(["runner", mode, "distribute_strategy"])
        trainer = run_extras.get(trainer_class, "GeneralTrainer")
        fleet_mode = run_extras.get(fleet_class, "ps")
        device = run_extras.get(device_class, "cpu")
        distributed_strategy = run_extras.get(strategy_class, "async")
        executor_mode = "train"

        device = device.upper()
        fleet_mode = fleet_mode.upper()
        if fleet_mode == "COLLECTIVE" and device != "GPU":
            raise ValueError("COLLECTIVE can not be used without GPU")

        cluster_envs = {}

        cluster_envs["fleet_mode"] = fleet_mode
        cluster_envs["engine_role"] = "WORKER"
        cluster_envs["log_dir"] = "logs"
        cluster_envs["train.trainer.trainer"] = trainer
        cluster_envs["train.trainer.engine"] = "cluster"
        cluster_envs["train.trainer.executor_mode"] = executor_mode
        cluster_envs["train.trainer.strategy"] = distributed_strategy
        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
            "CPU_NUM")
        cluster_envs["train.trainer.platform"] = envs.get_platform()
        print("launch {} engine with cluster to with model: {}".format(
            trainer, args.model))

        set_runtime_envs(cluster_envs, args.model)
        launch = ClusterEngine(None, args.model)
        return launch
Example #26
    def worker():
        role = "WORKER"

        _envs = envs.load_yaml(args.model)
        run_extras = get_all_inters_from_yaml(args.model,
                                              ["train.", "runner."])
        trainer_class = run_extras.get(
            "runner." + _envs["mode"] + ".trainer_class", None)

        if trainer_class:
            trainer = trainer_class
        else:
            trainer = "GeneralTrainer"

        executor_mode = "train"

        distributed_strategy = run_extras.get(
            "runner." + _envs["mode"] + ".distribute_strategy", "async")
        selected_gpus = run_extras.get(
            "runner." + _envs["mode"] + ".selected_gpus", "0")
        fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode",
                                    "ps")

        cluster_envs = {}
        cluster_envs["selected_gpus"] = selected_gpus
        cluster_envs["fleet_mode"] = fleet_mode
        cluster_envs["train.trainer.trainer"] = trainer
        cluster_envs["train.trainer.executor_mode"] = executor_mode
        cluster_envs["train.trainer.engine"] = "cluster"
        cluster_envs["train.trainer.strategy"] = distributed_strategy
        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
            "CPU_NUM")
        cluster_envs["train.trainer.platform"] = envs.get_platform()
        print("launch {} engine with cluster to with model: {}".format(
            trainer, args.model))
        set_runtime_envs(cluster_envs, args.model)

        trainer = TrainerFactory.create(args.model)
        return trainer
Example #27
    def __init__(self, config=None):
        self._status_processor = {}
        self.model = None
        self.inference_models = []
        self.increment_models = []
        self._exector_context = {}
        self._context = {'status': 'uninit', 'is_exit': False}
        self._context["config_yaml"] = config

        self._model = {}
        self._dataset = {}

        self._runner_name = envs.get_runtime_environ("mode")
        self._context["runner_name"] = self._runner_name

        phase_names = envs.get_global_env(
            "runner." + self._runner_name + ".phases", None)

        _config = envs.load_yaml(config)

        self._context["env"] = _config
        self._context["dataset"] = _config.get("dataset")

        phases = []
        if phase_names is None:
            phases = _config.get("phase")
        else:
            for phase in _config.get("phase"):
                if phase["name"] in phase_names:
                    phases.append(phase)

        self._context["phases"] = phases
        print("PaddleRec: Runner {} Begin".format(self._runner_name))
        self.which_engine()
        self.which_device()
        self.which_fleet_mode()
        self.which_executor_mode()
        self.legality_check()
Example #28
def local_mpi_engine(args):
    print("launch cluster engine with cluster to run model: {}".format(
        args.model))
    from paddlerec.core.engine.local_mpi import LocalMPIEngine

    print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(
        args.model))

    mpi = util.run_which("mpirun")
    if not mpi:
        raise RuntimeError("can not find mpirun, please check environment")

    run_extras = get_all_inters_from_yaml(args.model, ["runner."])

    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    distributed_strategy = "async"
    executor_mode = "train"

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")

    cluster_envs = {}
    cluster_envs["mpirun"] = mpi
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["train.trainer.platform"] = envs.get_platform()

    set_runtime_envs(cluster_envs, args.model)
    launch = LocalMPIEngine(cluster_envs, args.model)
    return launch
Example #29
def dataloader_by_name(readerclass,
                       dataset_name,
                       yaml_file,
                       context,
                       reader_class_name="Reader"):

    reader_class = lazy_instance_by_fliename(readerclass, reader_class_name)

    name = "dataset." + dataset_name + "."
    data_path = get_global_env(name + "data_path")

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    hidden_file_list, files = check_filelist(hidden_file_list=[],
                                             data_file_list=[],
                                             train_data_path=data_path)
    if hidden_file_list:
        print(
            "Warning: please make sure there are no hidden files in the dataset folder; found these hidden files: {}"
            .format(hidden_file_list))

    files.sort()

    # for local cluster: discard some files if files cannot be divided equally between GPUs
    if (context["device"] == "GPU") and "PADDLEREC_GPU_NUMS" in os.environ:
        selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS"))
        discard_file_nums = len(files) % selected_gpu_nums
        if discard_file_nums != 0:
            warnings.warn(
                "Because files cannot be divided equally between GPUs, discarding these files: {}"
                .format(files[-discard_file_nums:]))
            files = files[:len(files) - discard_file_nums]

    need_split_files = False
    if context["engine"] == EngineMode.LOCAL_CLUSTER:
        # for local cluster: split files for multi process
        need_split_files = True
    elif context["engine"] == EngineMode.CLUSTER and context[
            "cluster_type"] == "K8S":
        # for k8s mount mode, split files for every node
        need_split_files = True
    print("need_split_files: {}".format(need_split_files))
    if need_split_files:
        files = split_files(files, context["fleet"].worker_index(),
                            context["fleet"].worker_num())
    context["file_list"] = files
    reader = reader_class(yaml_file)
    reader.init()

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    it = reader.generate_sample(line)
                    for parsed_line in it():
                        if parsed_line is None:
                            continue
                        values = [parsed[1] for parsed in parsed_line]
                        yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()

    if hasattr(reader, "batch_tensor_creator"):
        return reader.batch_tensor_creator(gen_reader)

    return gen_reader
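The GPU discard rule above is just a modulo trim so that the file list divides evenly across cards. Isolated as a sketch with made-up file names:

def trim_for_gpus(files, gpu_num):
    # Drop len(files) % gpu_num trailing files so every GPU gets the same count.
    discard = len(files) % gpu_num
    return files[:len(files) - discard] if discard else files

print(trim_for_gpus(["part-0", "part-1", "part-2", "part-3", "part-4"], 2))
# ['part-0', 'part-1', 'part-2', 'part-3']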
Example #30
def local_cluster_engine(args):
    def get_worker_num(run_extras, workers):
        _envs = envs.load_yaml(args.model)
        mode = envs.get_runtime_environ("mode")
        workspace = envs.get_runtime_environ("workspace")
        phases_class = ".".join(["runner", mode, "phases"])
        phase_names = run_extras.get(phases_class)
        phases = []
        all_phases = _envs.get("phase")
        if phase_names is None:
            phases = all_phases
        else:
            for phase in all_phases:
                if phase["name"] in phase_names:
                    phases.append(phase)

        dataset_names = []
        for phase in phases:
            dataset_names.append(phase["dataset_name"])

        datapaths = []
        for dataset in _envs.get("dataset"):
            if dataset["name"] in dataset_names:
                datapaths.append(dataset["data_path"])

        if not datapaths:
            raise ValueError("data path must exist for training/inference")

        datapaths = [
            envs.workspace_adapter_by_specific(path, workspace)
            for path in datapaths
        ]

        all_workers = [len(os.listdir(path)) for path in datapaths]
        all_workers.append(workers)
        max_worker_num = min(all_workers)

        if max_worker_num >= workers:
            return workers

        print(
            "phases do not have enough data for training, set worker/gpu cards num from {} to {}"
            .format(workers, max_worker_num))

        return max_worker_num

    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    run_extras = get_all_inters_from_yaml(args.model, ["runner."])
    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
    strategy_class = ".".join(["runner", mode, "distribute_strategy"])
    worker_class = ".".join(["runner", mode, "worker_num"])
    server_class = ".".join(["runner", mode, "server_num"])

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    distributed_strategy = run_extras.get(strategy_class, "async")
    executor_mode = "train"

    worker_num = run_extras.get(worker_class, 1)
    server_num = run_extras.get(server_class, 1)

    device = device.upper()
    fleet_mode = fleet_mode.upper()

    cluster_envs = {}

    # TODO: remove the following hard code when Paddle supports ps-gpu.
    if device == "CPU":
        fleet_mode = "PS"
    elif device == "GPU":
        fleet_mode = "COLLECTIVE"
    if fleet_mode == "PS" and device != "CPU":
        raise ValueError("PS can not be used with GPU")

    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    if fleet_mode == "PS":
        worker_num = get_worker_num(run_extras, worker_num)

    if fleet_mode == "COLLECTIVE":
        cluster_envs["selected_gpus"] = selected_gpus
        gpus = selected_gpus.split(",")
        worker_num = get_worker_num(run_extras, len(gpus))
        cluster_envs["selected_gpus"] = ','.join(gpus[:worker_num])

    cluster_envs["server_num"] = server_num
    cluster_envs["worker_num"] = worker_num
    cluster_envs["start_port"] = envs.find_free_port()
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["CPU_NUM"] = cluster_envs["train.trainer.threads"]
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()

    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    launch = LocalClusterEngine(cluster_envs, args.model)
    return launch