Example #1
    def _build_trainer_programs(self, compiled_config):
        # Clone the user-defined programs, then rewrite them with a series of
        # parameter-server passes to produce the trainer-side programs.
        from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker

        _main = compiled_config.origin_main_program.clone()
        _startup = compiled_config.origin_startup_program.clone()

        if not compiled_config.is_geo_mode():
            from paddle.fluid.incubate.fleet.parameter_server.ir.public import _add_lr_decay_table_pass
            _add_lr_decay_table_pass(
                _main, compiled_config,
                self.user_defined_strategy.a_sync_configs["lr_decay_steps"])

            # for main program
            _main = worker.delete_optimizer_pass(_main, compiled_config)
            _main = worker.distributed_ops_pass(_main, compiled_config)
            _main = worker.append_send_ops_pass(_main, compiled_config)

            # for startup program
            _startup = worker.fake_init_ops_pass(_startup, compiled_config)
            # NOTE: "delet_extra_optimizes_pass" (sic) is the real pass name in
            # this Paddle version; Example #3 below uses the renamed
            # delete_extra_optimizes_pass.
            _startup = worker.delet_extra_optimizes_pass(
                _startup, compiled_config)

            compiled_config.set_origin_ps_main_program(_main)
            compiled_config.set_origin_ps_startup_program(_startup)
            # for heter program
            if self.role_maker._is_heter_parameter_server_mode:
                from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
                if self.role_maker._is_heter_worker():
                    # for heter worker
                    _main = heter_worker.split_heter_worker_ops_pass(
                        _main, compiled_config)
                else:
                    # for default worker
                    _main = heter_worker.split_trainer_ops_pass(
                        _main, compiled_config)
                # for startup change
                _startup = heter_worker.delete_startup_useless_ops_var_pass(
                    _startup, _main, compiled_config)
        else:
            _main = worker.append_send_ops_pass(_main, compiled_config)
            # geo mode: the startup program is used unchanged
            compiled_config.set_origin_ps_main_program(_main)
            compiled_config.set_origin_ps_startup_program(_startup)

        launch_barrier = self.user_defined_strategy.a_sync_configs[
            "launch_barrier"]
        launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
        if launch_barrier and launch_barrier_flag:
            # for trainer wait server ready
            wait_server_ready(self.role_maker._get_pserver_endpoints())

            # for ps-heter mode, wait heter worker ready
            # if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker():
            #     wait_server_ready(self.role_maker._get_heter_worker_endpoints())

        return _main, _startup
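`wait_server_ready` blocks the trainer until every parameter-server endpoint accepts connections, so no worker starts pushing gradients before the servers are up. The helper is Paddle's own; the sketch below only illustrates the polling pattern, assuming plain TCP connect checks and a fixed retry interval (both assumptions, not details taken from Paddle's implementation).

import socket
import time

def wait_endpoints_ready(endpoints, retry_interval_s=3.0, connect_timeout_s=1.0):
    """Block until every "ip:port" endpoint accepts a TCP connection."""
    pending = list(endpoints)
    while pending:
        still_down = []
        for ep in pending:
            host, port = ep.rsplit(":", 1)
            try:
                # Connection succeeded: this server is ready.
                with socket.create_connection((host, int(port)),
                                              timeout=connect_timeout_s):
                    pass
            except OSError:
                still_down.append(ep)
        pending = still_down
        if pending:
            time.sleep(retry_interval_s)

# Usage mirrors the barrier call above, e.g.:
# wait_endpoints_ready(["127.0.0.1:6170", "127.0.0.1:6171"])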
Example #2
    def _build_trainer_programs(self, compiled_config):
        # `fleet` below is the module-level fleet instance of this older API;
        # `worker` was a module-level import in the original file, added here
        # for completeness.
        from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker

        _main = fleet._origin_main_program.clone()
        _startup = fleet._origin_startup_program.clone()

        if not compiled_config.is_geo_mode():
            # for main program
            _main = worker.delete_optimizer_pass(_main, compiled_config)
            _main = worker.distributed_ops_pass(_main, compiled_config)
            _main = worker.append_send_ops_pass(_main, compiled_config)

            # for startup program
            _startup = worker.fake_init_ops_pass(_startup, compiled_config)
            _startup = worker.init_from_server_pass(_startup, compiled_config)
            _startup = worker.delet_extra_optimizes_pass(
                _startup, compiled_config)
        else:
            _main = worker.append_send_ops_pass(_main, compiled_config)
            # geo mode: the startup program is used unchanged

        return _main, _startup
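All of these versions share the same shape: clone the origin programs, then thread them through small single-purpose passes, each taking a program plus the compiled config and returning the rewritten program. The toy below illustrates that pipeline pattern in isolation; `Program` and the pass names here are stand-ins for illustration, not Paddle APIs.

from typing import Callable, Dict, List

class Program:
    """Stand-in for a Paddle Program: records which passes touched it."""
    def __init__(self):
        self.applied: List[str] = []

    def clone(self) -> "Program":
        cloned = Program()
        cloned.applied = list(self.applied)
        return cloned

PassFn = Callable[[Program, Dict], Program]

def make_pass(name: str) -> PassFn:
    def _pass(program: Program, config: Dict) -> Program:
        program.applied.append(name)  # a real pass would rewrite ops/vars here
        return program
    return _pass

delete_optimizer_pass = make_pass("delete_optimizer")
distributed_ops_pass = make_pass("distributed_ops")
append_send_ops_pass = make_pass("append_send_ops")

def build_trainer_main(origin: Program, config: Dict) -> Program:
    main = origin.clone()  # clone first so the origin program stays untouched
    if not config.get("is_geo_mode", False):
        for p in (delete_optimizer_pass, distributed_ops_pass,
                  append_send_ops_pass):
            main = p(main, config)
    else:
        # geo mode only appends send ops, as in the examples above
        main = append_send_ops_pass(main, config)
    return main

print(build_trainer_main(Program(), {}).applied)
# ['delete_optimizer', 'distributed_ops', 'append_send_ops']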
Example #3
    def _build_trainer_programs(self, compiled_config):
        from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker

        _main = compiled_config.origin_main_program.clone()
        _startup = compiled_config.origin_startup_program.clone()

        use_ps_gpu = self.user_defined_strategy.a_sync_configs["use_ps_gpu"]

        if not compiled_config.is_geo_mode():
            from paddle.fluid.incubate.fleet.parameter_server.ir.public import _add_lr_decay_table_pass
            _add_lr_decay_table_pass(
                _main, compiled_config,
                self.user_defined_strategy.a_sync_configs["lr_decay_steps"])

            # for main program
            _main = worker.distributed_ops_pass(_main, compiled_config,
                                                use_ps_gpu)
            if not use_ps_gpu:
                _main = worker.delete_optimizer_pass(_main, compiled_config)
                _main = worker.append_send_ops_pass(_main, compiled_config)
                _startup = worker.delete_extra_optimizes_pass(
                    _startup, compiled_config)

            # for startup program
            _startup = worker.fake_init_ops_pass(_startup, compiled_config)
            if use_ps_gpu:
                _main = worker.ps_gpu_pass(_main)
                from paddle.fluid.transpiler.collective import SingleProcessMultiThread
                t = SingleProcessMultiThread()
                env = self.get_dist_env()
                t.transpile(startup_program=_startup,
                            main_program=_main,
                            rank=env["trainer_id"],
                            endpoints=env["trainer_endpoints"],
                            current_endpoint=env['current_endpoint'],
                            wait_port=False)

            compiled_config.set_origin_ps_main_program(_main)
            compiled_config.set_origin_ps_startup_program(_startup)
            # for heter program
            if self.role_maker._is_heter_parameter_server_mode:
                from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
                if self.role_maker._is_heter_worker():
                    # for heter worker
                    stage_id = self.role_maker._get_stage_id()
                    device = self.role_maker._heter_device_type().lower()
                    _main = heter_worker.split_heter_worker_ops_pass(
                        _main, compiled_config, stage_id, device)
                else:
                    # for default worker
                    _main = heter_worker.split_trainer_ops_pass(
                        _main, compiled_config)
        else:
            _main = worker.append_send_ops_pass(_main, compiled_config)
            # geo mode: the startup program is used unchanged
            compiled_config.set_origin_ps_main_program(_main)
            compiled_config.set_origin_ps_startup_program(_startup)

        launch_barrier = self.user_defined_strategy.a_sync_configs[
            "launch_barrier"]
        launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
        if launch_barrier and launch_barrier_flag:
            # for trainer wait server ready
            wait_server_ready(self.role_maker._get_pserver_endpoints())

            # for ps-heter mode, wait heter worker ready
            # if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker():
            #     wait_server_ready(self.role_maker._get_heter_worker_endpoints())

        return _main, _startup
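In the ps-gpu branch, `SingleProcessMultiThread.transpile` receives its rank and endpoints from `self.get_dist_env()`, which is not shown in this excerpt. Below is a hypothetical reconstruction, assuming it reads the `PADDLE_*` environment variables that fleet launch scripts conventionally export; the exact variable names and fallbacks are assumptions, not taken from the source above.

import os

def get_dist_env():
    """Hypothetical helper: collect trainer rank/endpoints from the
    PADDLE_* environment variables set by the fleet launcher (assumed)."""
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "127.0.0.1:6170")
    trainer_endpoints = endpoints.split(",")
    return {
        "trainer_id": trainer_id,
        "trainer_endpoints": trainer_endpoints,
        "current_endpoint": trainer_endpoints[trainer_id],
    }

# The transpile call above would then receive, e.g.:
# env = get_dist_env()
# rank=env["trainer_id"], endpoints=env["trainer_endpoints"],
# current_endpoint=env["current_endpoint"]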