Exemple #1
0
    def start_worker_procs(self):
        if (envs.get_runtime_environ("fleet_mode") == "COLLECTIVE"):
            #trainer_ports = os.getenv("TRAINER_PORTS", None).split(",")
            cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
            if cuda_visible_devices is None or cuda_visible_devices == "":
                selected_gpus = range(int(os.getenv("TRAINER_GPU_CARD_COUNT")))
            else:
                # change selected_gpus into relative values
                # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
                # therefore selected_gpus=0,1,2,3
                cuda_visible_devices_list = cuda_visible_devices.split(',')
                for x in range(int(os.getenv("TRAINER_GPU_CARD_COUNT"))):
                    assert x in cuda_visible_devices_list, "Can't find "\
                        "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
                        % (x, cuda_visible_devices)
                selected_gpus = [cuda_visible_devices_list.index(x)]
            print("selected_gpus:{}".format(selected_gpus))

            factory = "paddlerec.core.factory"
            cmd = [sys.executable, "-u", "-m", factory, self.trainer]
            logs_dir = envs.get_runtime_environ("log_dir")
            print("use_paddlecloud_flag:{}".format(
                cluster_utils.use_paddlecloud()))
            if cluster_utils.use_paddlecloud():
                cluster, pod = cluster_utils.get_cloud_cluster(selected_gpus)
                logger.info("get cluster from cloud:{}".format(cluster))
                procs = cluster_utils.start_local_trainers(cluster,
                                                           pod,
                                                           cmd,
                                                           log_dir=logs_dir)
                print("cluster:{}".format(cluster))
                print("pod:{}".format(pod))
        else:
            trainer = TrainerFactory.create(self.trainer)
            trainer.run()
Exemple #2
0
def single_infer_engine(args):
    trainer = get_trainer_prefix(args) + "SingleInfer"
    single_envs = {}
    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.engine"] = "single_infer"
    single_envs["train.trainer.platform"] = envs.get_platform()
    print("use {} engine to run model: {}".format(trainer, args.model))
    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
Exemple #3
0
def cluster_mpi_engine(args):
    print("launch cluster engine with cluster to run model: {}".format(
        args.model))

    cluster_envs = {}
    cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer"
    cluster_envs["train.trainer.platform"] = envs.get_platform()

    set_runtime_envs(cluster_envs, args.model)

    trainer = TrainerFactory.create(args.model)
    return trainer
Exemple #4
0
def online_learning(args):
    trainer = "OnlineLearningTrainer"
    single_envs = {}
    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.engine"] = "online_learning"
    single_envs["train.trainer.platform"] = envs.get_platform()
    print("use {} engine to run model: {}".format(trainer, args.model))

    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
Exemple #5
0
    def worker(mode):
        if not mode:
            raise ValueError("mode: {} can not be recognized")

        run_extras = get_all_inters_from_yaml(args.model, ["runner."])

        trainer_class = ".".join(["runner", mode, "trainer_class"])
        fleet_class = ".".join(["runner", mode, "fleet_mode"])
        device_class = ".".join(["runner", mode, "device"])
        selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
        strategy_class = ".".join(["runner", mode, "distribute_strategy"])
        worker_class = ".".join(["runner", mode, "worker_num"])
        server_class = ".".join(["runner", mode, "server_num"])

        trainer = run_extras.get(trainer_class, "GeneralTrainer")
        fleet_mode = run_extras.get(fleet_class, "ps")
        device = run_extras.get(device_class, "cpu")
        selected_gpus = run_extras.get(selected_gpus_class, "0")
        distributed_strategy = run_extras.get(strategy_class, "async")
        worker_num = run_extras.get(worker_class, 1)
        server_num = run_extras.get(server_class, 1)
        executor_mode = "train"

        device = device.upper()
        fleet_mode = fleet_mode.upper()

        if fleet_mode == "COLLECTIVE" and device != "GPU":
            raise ValueError("COLLECTIVE can not be used with GPU")

        cluster_envs = {}

        if device == "GPU":
            cluster_envs["selected_gpus"] = selected_gpus

        cluster_envs["server_num"] = server_num
        cluster_envs["worker_num"] = worker_num
        cluster_envs["fleet_mode"] = fleet_mode
        cluster_envs["train.trainer.trainer"] = trainer
        cluster_envs["train.trainer.engine"] = "cluster"
        cluster_envs["train.trainer.executor_mode"] = executor_mode
        cluster_envs["train.trainer.strategy"] = distributed_strategy
        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
            "CPU_NUM")
        cluster_envs["train.trainer.platform"] = envs.get_platform()
        print("launch {} engine with cluster to with model: {}".format(
            trainer, args.model))
        set_runtime_envs(cluster_envs, args.model)

        trainer = TrainerFactory.create(args.model)
        return trainer
Exemple #6
0
    def worker():
        role = "WORKER"
        trainer = get_trainer_prefix(args) + "ClusterTrainer"
        cluster_envs = {}
        cluster_envs["train.trainer.trainer"] = trainer
        cluster_envs["train.trainer.engine"] = "cluster"
        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
            "CPU_NUM")
        cluster_envs["train.trainer.platform"] = envs.get_platform()
        print("launch {} engine with cluster to with model: {}".format(
            trainer, args.model))
        set_runtime_envs(cluster_envs, args.model)

        trainer = TrainerFactory.create(args.model)
        return trainer
Exemple #7
0
def single_infer_engine(args):
    run_extras = get_all_inters_from_yaml(args.model, ["runner."])

    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])

    epochs_class = ".".join(["runner", mode, "epochs"])
    epochs = run_extras.get(epochs_class, 1)
    if epochs > 1:
        warnings.warn(
            "It makes no sense to predict the same model for multiple epochs",
            category=UserWarning,
            stacklevel=2)

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    executor_mode = "infer"

    single_envs = {}

    if device.upper() == "GPU":
        selected_gpus_num = len(selected_gpus.split(","))
        if selected_gpus_num != 1:
            raise ValueError(
                "Single Mode Only Support One GPU, Set Local Cluster Mode to use Multi-GPUS"
            )

        single_envs["selsected_gpus"] = selected_gpus
        single_envs["FLAGS_selected_gpus"] = selected_gpus

    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.executor_mode"] = executor_mode
    single_envs["fleet_mode"] = fleet_mode
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.platform"] = envs.get_platform()
    single_envs["train.trainer.engine"] = "single"

    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
Exemple #8
0
    def worker():
        role = "WORKER"

        _envs = envs.load_yaml(args.model)
        run_extras = get_all_inters_from_yaml(args.model,
                                              ["train.", "runner."])
        trainer_class = run_extras.get(
            "runner." + _envs["mode"] + ".trainer_class", None)

        if trainer_class:
            trainer = trainer_class
        else:
            trainer = "GeneralTrainer"

        executor_mode = "train"

        distributed_strategy = run_extras.get(
            "runner." + _envs["mode"] + ".distribute_strategy", "async")
        selected_gpus = run_extras.get(
            "runner." + _envs["mode"] + ".selected_gpus", "0")
        fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode",
                                    "ps")

        cluster_envs = {}
        cluster_envs["selected_gpus"] = selected_gpus
        cluster_envs["fleet_mode"] = fleet_mode
        cluster_envs["train.trainer.trainer"] = trainer
        cluster_envs["train.trainer.executor_mode"] = executor_mode
        cluster_envs["train.trainer.engine"] = "cluster"
        cluster_envs["train.trainer.strategy"] = distributed_strategy
        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
            "CPU_NUM")
        cluster_envs["train.trainer.platform"] = envs.get_platform()
        print("launch {} engine with cluster to with model: {}".format(
            trainer, args.model))
        set_runtime_envs(cluster_envs, args.model)

        trainer = TrainerFactory.create(args.model)
        return trainer
Exemple #9
0
 def start_worker_procs(self):
     trainer = TrainerFactory.create(self.trainer)
     trainer.run()