Esempio n. 1
0
    def master():
        """Prepare MASTER-role runtime envs from the backend YAML and
        return a ClusterEngine ready to submit the job."""
        from paddlerec.core.engine.cluster.cluster import ClusterEngine

        backend_envs = envs.load_yaml(args.backend)
        submit_envs = envs.flatten_environs(backend_envs, "_")
        submit_envs["engine_role"] = "MASTER"
        submit_envs["engine_mode"] = envs.get_runtime_environ("mode")
        submit_envs["engine_run_config"] = args.model
        # Scratch directory for files staged during submission.
        submit_envs["engine_temp_path"] = tempfile.mkdtemp()

        envs.set_runtime_environs(submit_envs)
        ClusterEngine.workspace_replace()
        print(envs.pretty_print_envs(submit_envs, ("Submit Envs", "Value")))

        return ClusterEngine(None, args.model)
Esempio n. 2
0
    def master():
        """Prepare MASTER-role runtime envs and return a ClusterEngine.

        Validates the fleet_mode/device combination for the active runner
        and records the maximum thread_num across the selected phases.

        Raises:
            ValueError: when COLLECTIVE fleet mode is requested without a
                GPU device, or when no phase is selected for the runner.
        """
        from paddlerec.core.engine.cluster.cluster import ClusterEngine

        # Resolve fleet_mode & device for the active runner, defaulting
        # to the parameter-server / CPU combination.
        run_extras = get_all_inters_from_yaml(args.model, ["runner."])
        mode = envs.get_runtime_environ("mode")
        fleet_class = ".".join(["runner", mode, "fleet_mode"])
        device_class = ".".join(["runner", mode, "device"])
        fleet_mode = run_extras.get(fleet_class, "ps").upper()
        device = run_extras.get(device_class, "cpu").upper()

        if fleet_mode == "COLLECTIVE" and device != "GPU":
            raise ValueError("COLLECTIVE can not be used without GPU")

        # Determine the max thread_num over the phases selected by the
        # runner; when no phase names are given, all phases are used.
        model_envs = envs.load_yaml(args.model)
        phases_class = ".".join(["runner", mode, "phases"])
        phase_names = run_extras.get(phases_class)
        # Guard against a model YAML without a "phase" section (was an
        # unguarded iteration over None in the original).
        all_phases = model_envs.get("phase") or []
        if phase_names is None:
            phases = all_phases
        else:
            phases = [p for p in all_phases if p["name"] in phase_names]

        if not phases:
            # max() on an empty sequence raises a cryptic ValueError;
            # fail with an actionable message instead.
            raise ValueError(
                "no phase selected for runner '{}'; check the runner's "
                "'phases' setting and the model's 'phase' section".format(
                    mode))
        max_thread_num = max(int(p["thread_num"]) for p in phases)

        backend_envs = envs.load_yaml(args.backend)
        flattens = envs.flatten_environs(backend_envs, "_")
        flattens["engine_role"] = "MASTER"
        # Reuse the already-resolved mode instead of re-reading the env.
        flattens["engine_mode"] = mode
        flattens["engine_run_config"] = args.model
        flattens["max_thread_num"] = max_thread_num
        flattens["fleet_mode"] = fleet_mode
        flattens["device"] = device
        flattens["backend_yaml"] = args.backend
        envs.set_runtime_environs(flattens)

        launch = ClusterEngine(None, args.model)
        return launch
Esempio n. 3
0
    def master():
        """Set MASTER-role runtime envs from the backend YAML, rewrite the
        workspace placeholders, and return the cluster launch engine."""
        from paddlerec.core.engine.cluster.cluster import ClusterEngine

        with open(args.backend, 'r') as rb:
            backend_yaml = yaml.load(rb.read(), Loader=yaml.FullLoader)

        runtime_envs = envs.flatten_environs(backend_yaml, "_")
        runtime_envs["engine_role"] = "MASTER"
        runtime_envs["engine_run_config"] = args.model
        # Scratch directory for files staged during submission.
        runtime_envs["engine_temp_path"] = tempfile.mkdtemp()
        update_workspace(runtime_envs)

        envs.set_runtime_environs(runtime_envs)
        print(
            envs.pretty_print_envs(runtime_envs,
                                   ("Submit Runtime Envs", "Value")))

        return ClusterEngine(None, args.model)
Esempio n. 4
0
    def worker(mode):
        """Launch a WORKER-role ClusterEngine for the given runner *mode*.

        Args:
            mode: name of the runner section to read settings from.

        Raises:
            ValueError: when *mode* is falsy, or when COLLECTIVE fleet
                mode is requested without a GPU device.
        """
        if not mode:
            # BUG FIX: the original message never interpolated *mode*,
            # so it printed the literal "{}" placeholder.
            raise ValueError("mode: {} can not be recognized".format(mode))
        from paddlerec.core.engine.cluster.cluster import ClusterEngine

        run_extras = get_all_inters_from_yaml(args.model, ["runner."])

        # Per-runner settings, with GeneralTrainer/ps/cpu/async defaults.
        trainer_class = ".".join(["runner", mode, "trainer_class"])
        fleet_class = ".".join(["runner", mode, "fleet_mode"])
        device_class = ".".join(["runner", mode, "device"])
        strategy_class = ".".join(["runner", mode, "distribute_strategy"])
        trainer = run_extras.get(trainer_class, "GeneralTrainer")
        fleet_mode = run_extras.get(fleet_class, "ps").upper()
        device = run_extras.get(device_class, "cpu").upper()
        distributed_strategy = run_extras.get(strategy_class, "async")
        executor_mode = "train"

        if fleet_mode == "COLLECTIVE" and device != "GPU":
            raise ValueError("COLLECTIVE can not be used without GPU")

        cluster_envs = {
            "fleet_mode": fleet_mode,
            "engine_role": "WORKER",
            "log_dir": "logs",
            "train.trainer.trainer": trainer,
            "train.trainer.engine": "cluster",
            "train.trainer.executor_mode": executor_mode,
            "train.trainer.strategy": distributed_strategy,
            "train.trainer.threads": envs.get_runtime_environ("CPU_NUM"),
            "train.trainer.platform": envs.get_platform(),
        }
        print("launch {} engine with cluster to with model: {}".format(
            trainer, args.model))

        set_runtime_envs(cluster_envs, args.model)
        launch = ClusterEngine(None, args.model)
        return launch