Example #1
0
def local_cluster_engine(args):
    """Create a LocalClusterEngine for the model configured in ``args.model``.

    Runner options are read from the mapping returned by
    ``get_all_inters_from_yaml`` under the key ``runner.<mode>.<option>``,
    where ``<mode>`` is the ``mode`` entry of the loaded yaml. If no
    ``fleet_mode`` is configured, ``COLLECTIVE`` is chosen when more than one
    GPU is selected on a ``gpu`` device and ``PS`` is chosen otherwise.

    The assembled settings are handed to ``set_runtime_envs`` and to the
    engine constructor; the engine instance is returned.
    """
    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["train.", "runner."])

    def runner_option(name, default):
        # Every runner option lives under "runner.<mode>.<name>".
        return run_extras.get("runner." + _envs["mode"] + "." + name, default)

    # An unset or empty runner_class falls back to the general trainer.
    trainer = runner_option("runner_class", None) or "GeneralTrainer"
    distributed_strategy = runner_option("distribute_strategy", "async")
    worker_num = runner_option("worker_num", 1)
    server_num = runner_option("server_num", 1)
    selected_gpus = runner_option("selected_gpus", "0")

    fleet_mode = runner_option("fleet_mode", "")
    if fleet_mode == "":
        # No explicit fleet mode: derive it from the device and GPU count.
        device = runner_option("device", "cpu")
        several_gpus = len(selected_gpus.split(",")) > 1
        if several_gpus and device.upper() == "GPU":
            fleet_mode = "COLLECTIVE"
        else:
            fleet_mode = "PS"

    cluster_envs = {
        "server_num": server_num,
        "worker_num": worker_num,
        "selected_gpus": selected_gpus,
        "start_port": envs.find_free_port(),
        "fleet_mode": fleet_mode,
        "log_dir": "logs",
        "train.trainer.trainer": trainer,
        "train.trainer.executor_mode": "train",
        "train.trainer.strategy": distributed_strategy,
        "train.trainer.threads": "2",
        "train.trainer.engine": "local_cluster",
        "train.trainer.platform": envs.get_platform(),
        "CPU_NUM": "2",
    }

    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    return LocalClusterEngine(cluster_envs, args.model)
Example #2
0
def local_cluster_engine(args):
    """Create a LocalClusterEngine with one server and one worker.

    The trainer name is the prefix returned by ``get_trainer_prefix(args)``
    followed by ``ClusterTrainer``. The fixed cluster settings are passed to
    ``set_runtime_envs`` and to the engine constructor together with
    ``args.model``; the engine instance is returned.
    """
    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    trainer = get_trainer_prefix(args) + "ClusterTrainer"

    cluster_envs = {
        "server_num": 1,
        "worker_num": 1,
        "start_port": envs.find_free_port(),
        "log_dir": "logs",
        "train.trainer.trainer": trainer,
        "train.trainer.strategy": "async",
        "train.trainer.threads": "2",
        "train.trainer.engine": "local_cluster",
        "train.trainer.platform": envs.get_platform(),
        "CPU_NUM": "2",
    }

    message = "launch {} engine with cluster to run model: {}"
    print(message.format(trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    return LocalClusterEngine(cluster_envs, args.model)
Example #3
0
    def start_procs(self):
        """Launch the local cluster processes and wait for the trainers.

        Settings are read from ``self.envs``: ``fleet_mode``, ``worker_num``,
        ``server_num``, ``start_port``, ``log_dir`` and ``selected_gpus``.

        * ``PS``: starts ``server_num`` parameter servers, then ``worker_num``
          trainers; waits for the trainers and finally terminates the
          servers.
        * ``COLLECTIVE``: starts one trainer per entry of ``selected_gpus``
          and waits for all of them.

        Any other fleet mode starts nothing. The stdout/stderr of every child
        is written to ``<log_dir>/server.<i>`` or ``<log_dir>/worker.<i>``.
        """
        fleet_mode = self.envs["fleet_mode"].upper()
        worker_num = self.envs["worker_num"]
        server_num = self.envs["server_num"]
        ports = [self.envs["start_port"]]
        logs_dir = self.envs["log_dir"]
        selected_gpus = self.envs["selected_gpus"].split(",")

        # Children inherit the current environment minus the proxy settings.
        current_env = os.environ.copy()
        current_env["CLUSTER_INSTANCE"] = "1"
        current_env.pop("http_proxy", None)
        current_env.pop("https_proxy", None)

        factory = "paddlerec.core.factory"
        cmd = [sys.executable, "-u", "-m", factory, self.trainer]
        procs = []
        log_fns = []

        def reserve_ports(total):
            # Extend `ports` (seeded with start_port) to `total` distinct ports.
            while len(ports) < total:
                new_port = envs.find_free_port()
                if new_port not in ports:
                    ports.append(new_port)

        def spawn(log_name):
            # Start one child with the current state of `current_env`.
            # The directory is created directly rather than through a
            # `mkdir -p` shell string, so unusual paths cannot break it.
            if not os.path.isdir(logs_dir):
                os.makedirs(logs_dir)
            fn = open("%s/%s" % (logs_dir, log_name), "w")
            log_fns.append(fn)
            procs.append(
                subprocess.Popen(cmd,
                                 env=current_env,
                                 stdout=fn,
                                 stderr=fn,
                                 cwd=os.getcwd()))

        def wait_for_trainers(num_servers):
            # The first `num_servers` entries of procs/log_fns are servers;
            # only the trainers behind them are waited for.
            for proc, fn in zip(procs[num_servers:], log_fns[num_servers:]):
                proc.wait()
                fn.close()
            for proc, fn in zip(procs[:num_servers], log_fns[:num_servers]):
                fn.close()
                proc.terminate()
            print(
                "all workers already completed, you can view logs under the `{}` directory"
                .format(logs_dir),
                file=sys.stderr)

        if fleet_mode == "PS":
            reserve_ports(server_num)
            user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])

            for i in range(server_num):
                current_env.update({
                    "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
                    "PADDLE_PORT": str(ports[i]),
                    "TRAINING_ROLE": "PSERVER",
                    "PADDLE_TRAINERS_NUM": str(worker_num),
                    "POD_IP": "127.0.0.1"
                })
                spawn("server.%d" % i)

            for i in range(worker_num):
                current_env.update({
                    "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
                    "PADDLE_TRAINERS_NUM": str(worker_num),
                    "TRAINING_ROLE": "TRAINER",
                    "PADDLE_TRAINER_ID": str(i)
                })
                spawn("worker.%d" % i)

            wait_for_trainers(server_num)
        elif fleet_mode == "COLLECTIVE":
            reserve_ports(len(selected_gpus))
            endpoints = ["127.0.0.1:" + str(x) for x in ports]
            user_endpoints = ",".join(endpoints)

            for i, gpu in enumerate(selected_gpus):
                current_env.update({
                    "PADDLE_TRAINER_ENDPOINTS": user_endpoints,
                    # Index the endpoint list, not the joined string:
                    # indexing the string yields a single character.
                    # NOTE(review): Paddle launchers commonly use the
                    # singular PADDLE_CURRENT_ENDPOINT - confirm which name
                    # the trainer actually reads.
                    "PADDLE_CURRENT_ENDPOINTS": endpoints[i],
                    "PADDLE_TRAINERS_NUM": str(worker_num),
                    "TRAINING_ROLE": "TRAINER",
                    "PADDLE_TRAINER_ID": str(i),
                    "FLAGS_selected_gpus": str(gpu)
                })
                spawn("worker.%d" % i)

            # No servers in this mode: wait for every trainer and close its
            # log so children are not orphaned and handles are not leaked.
            wait_for_trainers(0)
Example #4
0
def local_cluster_engine(args):
    """Assemble the local-cluster settings for ``args.model`` and build the engine.

    Runner options are read from the mapping returned by
    ``get_all_inters_from_yaml`` under ``runner.<mode>.<option>``, where
    ``<mode>`` is the ``mode`` runtime environ. The fleet mode is currently
    forced from the device (``CPU`` -> ``PS``, ``GPU`` -> ``COLLECTIVE``).

    Raises:
        ValueError: if the fleet mode and the device do not match, or if no
            data path can be found for the selected phases.
    """

    def get_worker_num(run_extras, workers):
        """Cap ``workers`` by the number of entries in each phase data path.

        Returns ``workers`` when every data directory holds at least that
        many entries; otherwise prints a notice and returns the smallest
        entry count.
        """
        _envs = envs.load_yaml(args.model)
        mode = envs.get_runtime_environ("mode")
        workspace = envs.get_runtime_environ("workspace")

        phase_names = run_extras.get(".".join(["runner", mode, "phases"]))
        all_phases = _envs.get("phase")
        if phase_names is None:
            # No phase filter configured: every phase takes part.
            phases = all_phases
        else:
            phases = [p for p in all_phases if p["name"] in phase_names]

        dataset_names = [phase["dataset_name"] for phase in phases]
        datapaths = [
            dataset["data_path"] for dataset in _envs.get("dataset")
            if dataset["name"] in dataset_names
        ]
        if not datapaths:
            raise ValueError("data path must exist for training/inference")

        datapaths = [
            envs.workspace_adapter_by_specific(path, workspace)
            for path in datapaths
        ]

        # `workers` is part of the candidates, so the minimum never exceeds it.
        candidates = [len(os.listdir(path)) for path in datapaths]
        candidates.append(workers)
        usable = min(candidates)

        if usable < workers:
            print(
                "phases do not have enough datas for training, set worker/gpu cards num from {} to {}"
                .format(workers, usable))
            return usable
        return workers

    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    run_extras = get_all_inters_from_yaml(args.model, ["runner."])
    mode = envs.get_runtime_environ("mode")

    def runner_option(name, default):
        # Every runner option lives under "runner.<mode>.<name>".
        return run_extras.get(".".join(["runner", mode, name]), default)

    trainer = runner_option("trainer_class", "GeneralTrainer")
    fleet_mode = runner_option("fleet_mode", "ps")
    device = runner_option("device", "cpu")
    selected_gpus = runner_option("selected_gpus", "0")
    distributed_strategy = runner_option("distribute_strategy", "async")
    worker_num = runner_option("worker_num", 1)
    server_num = runner_option("server_num", 1)

    device = device.upper()
    fleet_mode = fleet_mode.upper()

    # Todo: delete follow hard code when paddle support ps-gpu.
    fleet_mode = {"CPU": "PS", "GPU": "COLLECTIVE"}.get(device, fleet_mode)

    if fleet_mode == "PS" and device != "CPU":
        raise ValueError("PS can not be used with GPU")
    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    cluster_envs = {}
    if fleet_mode == "PS":
        worker_num = get_worker_num(run_extras, worker_num)
    elif fleet_mode == "COLLECTIVE":
        # One worker per GPU; keep only as many GPUs as the data allows.
        gpus = selected_gpus.split(",")
        worker_num = get_worker_num(run_extras, len(gpus))
        cluster_envs["selected_gpus"] = ','.join(gpus[:worker_num])

    threads = "2"
    cluster_envs.update({
        "server_num": server_num,
        "worker_num": worker_num,
        "start_port": envs.find_free_port(),
        "fleet_mode": fleet_mode,
        "log_dir": "logs",
        "train.trainer.trainer": trainer,
        "train.trainer.executor_mode": "train",
        "train.trainer.strategy": distributed_strategy,
        "train.trainer.threads": threads,
        "CPU_NUM": threads,
        "train.trainer.engine": "local_cluster",
        "train.trainer.platform": envs.get_platform(),
    })

    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    return LocalClusterEngine(cluster_envs, args.model)
Example #5
0
    def start_procs(self):
        """Launch the local cluster processes and wait for the trainers.

        Settings are read from ``self.envs``: ``fleet_mode``, ``worker_num``,
        ``server_num``, ``start_port``, ``log_dir`` and, in collective mode,
        ``selected_gpus``.

        * ``PS``: starts ``server_num`` parameter servers, then ``worker_num``
          trainers; waits for the trainers and finally terminates the
          servers.
        * ``COLLECTIVE``: starts one trainer per selected GPU (or delegates
          to ``cluster_utils`` when ``cluster_utils.use_paddlecloud()`` is
          true) and waits for every trainer.

        When ``CUDA_VISIBLE_DEVICES`` is set, the selected GPU ids are
        translated to their positions inside that list.

        Raises:
            AssertionError: if a selected GPU is not listed in
                ``CUDA_VISIBLE_DEVICES``.
        """
        fleet_mode = self.envs["fleet_mode"].upper()
        worker_num = self.envs["worker_num"]
        server_num = self.envs["server_num"]
        ports = [self.envs["start_port"]]
        logs_dir = self.envs["log_dir"]

        # Children inherit the current environment minus the proxy settings.
        current_env = os.environ.copy()
        current_env["CLUSTER_INSTANCE"] = "1"
        current_env.pop("http_proxy", None)
        current_env.pop("https_proxy", None)

        factory = "paddlerec.core.factory"
        cmd = [sys.executable, "-u", "-m", factory, self.trainer]
        procs = []
        log_fns = []

        def reserve_ports(total):
            # Extend `ports` (seeded with start_port) to `total` distinct ports.
            while len(ports) < total:
                new_port = envs.find_free_port()
                if new_port not in ports:
                    ports.append(new_port)

        def spawn(log_name):
            # Start one child with the current state of `current_env`.
            # The directory is created directly rather than through a
            # `mkdir -p` shell string, so unusual paths cannot break it.
            if not os.path.isdir(logs_dir):
                os.makedirs(logs_dir)
            fn = open("%s/%s" % (logs_dir, log_name), "w")
            log_fns.append(fn)
            procs.append(
                subprocess.Popen(cmd,
                                 env=current_env,
                                 stdout=fn,
                                 stderr=fn,
                                 cwd=os.getcwd()))

        if fleet_mode == "PS":
            reserve_ports(server_num)
            user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])

            for i in range(server_num):
                current_env.update({
                    "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
                    "PADDLE_PORT": str(ports[i]),
                    "TRAINING_ROLE": "PSERVER",
                    "PADDLE_TRAINERS_NUM": str(worker_num),
                    "POD_IP": "127.0.0.1"
                })
                spawn("server.%d" % i)

            for i in range(worker_num):
                current_env.update({
                    "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
                    "PADDLE_TRAINERS_NUM": str(worker_num),
                    "TRAINING_ROLE": "TRAINER",
                    "PADDLE_TRAINER_ID": str(i)
                })
                spawn("worker.%d" % i)

        elif fleet_mode == "COLLECTIVE":
            requested_gpus = [
                x.strip() for x in self.envs["selected_gpus"].split(",")
            ]
            cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
            if not cuda_visible_devices:
                selected_gpus = requested_gpus
            else:
                # change selected_gpus into relative values
                # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
                # therefore selected_gpus=0,1,2,3
                visible_gpus = [
                    x.strip() for x in cuda_visible_devices.split(",")
                ]
                # The check uses the same stripped ids as the index lookup,
                # so "4, 5" no longer fails the check on " 5".
                for x in requested_gpus:
                    assert x in visible_gpus, "Can't find "\
                        "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
                        % (x, cuda_visible_devices)
                selected_gpus = [visible_gpus.index(x) for x in requested_gpus]
            selected_gpus_num = len(selected_gpus)

            use_paddlecloud = cluster_utils.use_paddlecloud()
            print("use_paddlecloud_flag:{}".format(use_paddlecloud))
            if use_paddlecloud:
                cluster, pod = cluster_utils.get_cloud_cluster(selected_gpus)
                logger.info("get cluster from cloud:{}".format(cluster))
                procs = cluster_utils.start_local_trainers(cluster,
                                                           pod,
                                                           cmd,
                                                           log_dir=logs_dir)
            else:
                # trainers_num = 1 or not use paddlecloud ips="a,b"
                reserve_ports(selected_gpus_num)
                endpoints = ["127.0.0.1:" + str(x) for x in ports]
                user_endpoints = ",".join(endpoints)
                for i in range(selected_gpus_num):
                    current_env.update({
                        "PADDLE_TRAINER_ENDPOINTS": user_endpoints,
                        # Index the endpoint list, not the joined string:
                        # indexing the string yields a single character.
                        # NOTE(review): Paddle launchers commonly use the
                        # singular PADDLE_CURRENT_ENDPOINT - confirm which
                        # name the trainer actually reads.
                        "PADDLE_CURRENT_ENDPOINTS": endpoints[i],
                        "PADDLE_TRAINERS_NUM": str(worker_num),
                        "TRAINING_ROLE": "TRAINER",
                        "PADDLE_TRAINER_ID": str(i),
                        "FLAGS_selected_gpus": str(selected_gpus[i]),
                        "PADDLEREC_GPU_NUMS": str(selected_gpus_num)
                    })
                    spawn("worker.%d" % i)

        # Parameter servers exist only in PS mode, where they fill the first
        # `server_num` slots of `procs`. Elsewhere every entry is a trainer,
        # so none of them may be skipped or terminated as a "server".
        launched_servers = server_num if fleet_mode == "PS" else 0

        # only wait worker to finish here
        for i in range(launched_servers, len(procs)):
            procs[i].wait()
            if len(log_fns) > 0:
                log_fns[i].close()

        for i in range(launched_servers):
            if len(log_fns) > 0:
                log_fns[i].close()
            procs[i].terminate()
        print(
            "all workers already completed, you can view logs under the `{}` directory"
            .format(logs_dir),
            file=sys.stderr)