def _build_trainer_programs(self, compiled_config):
    from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker

    _main = compiled_config.origin_main_program.clone()
    _startup = compiled_config.origin_startup_program.clone()

    if not compiled_config.is_geo_mode():
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _add_lr_decay_table_pass
        _add_lr_decay_table_pass(
            _main, compiled_config,
            self.user_defined_strategy.a_sync_configs["lr_decay_steps"])

        # for main program
        _main = worker.delete_optimizer_pass(_main, compiled_config)
        _main = worker.distributed_ops_pass(_main, compiled_config)
        _main = worker.append_send_ops_pass(_main, compiled_config)

        # for startup program
        _startup = worker.fake_init_ops_pass(_startup, compiled_config)
        _startup = worker.delet_extra_optimizes_pass(_startup, compiled_config)

        compiled_config.set_origin_ps_main_program(_main)
        compiled_config.set_origin_ps_startup_program(_startup)

        # for heter program
        if self.role_maker._is_heter_parameter_server_mode:
            from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
            if self.role_maker._is_heter_worker():
                # for heter worker
                _main = heter_worker.split_heter_worker_ops_pass(
                    _main, compiled_config)
            else:
                # for default worker
                _main = heter_worker.split_trainer_ops_pass(
                    _main, compiled_config)

            # for startup change
            _startup = heter_worker.delete_startup_useless_ops_var_pass(
                _startup, _main, compiled_config)
    else:
        _main = worker.append_send_ops_pass(_main, compiled_config)
        # startup program is left unchanged in geo mode
        compiled_config.set_origin_ps_main_program(_main)
        compiled_config.set_origin_ps_startup_program(_startup)

    launch_barrier = self.user_defined_strategy.a_sync_configs["launch_barrier"]
    launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
    if launch_barrier and launch_barrier_flag:
        # for trainer wait server ready
        wait_server_ready(self.role_maker._get_pserver_endpoints())

        # for ps-heter mode, wait heter worker ready
        # if self.role_maker._is_heter_parameter_server_mode and \
        #         self.role_maker._is_worker():
        #     wait_server_ready(self.role_maker._get_heter_worker_endpoints())

    return _main, _startup
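# The passes chained above share one convention: each takes a Program (plus the
# compile-time config) and returns a rewritten Program, so they compose by
# simple reassignment. The function below is NOT part of Paddle; it is a
# minimal hypothetical sketch of that convention, using only the public
# Program API (global_block(), ops, op.type).
def _example_inspect_pass(program, compiled_config):
    # A real pass would insert, remove, or rewrite ops here; this one only
    # walks the global block and returns the program unchanged.
    for op in program.global_block().ops:
        _ = op.type
    return program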
def _build_trainer_programs(self, compiled_config):
    _main = fleet._origin_main_program.clone()
    _startup = fleet._origin_startup_program.clone()

    if not compiled_config.is_geo_mode():
        # for main program
        _main = worker.delete_optimizer_pass(_main, compiled_config)
        _main = worker.distributed_ops_pass(_main, compiled_config)
        _main = worker.append_send_ops_pass(_main, compiled_config)

        # for startup program
        _startup = worker.fake_init_ops_pass(_startup, compiled_config)
        _startup = worker.init_from_server_pass(_startup, compiled_config)
        _startup = worker.delet_extra_optimizes_pass(_startup, compiled_config)
    else:
        _main = worker.append_send_ops_pass(_main, compiled_config)
        # startup program is left unchanged in geo mode

    return _main, _startup
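# A minimal usage sketch (not part of the source): once _build_trainer_programs
# returns, the trainer side would typically run the startup program once and
# then iterate over the rewritten main program with a static Executor.
# feed_dict and fetch_vars are placeholders here, and fleet / role-maker
# initialization is assumed to have happened already.
import paddle


def _example_run_trainer(_main, _startup, feed_dict, fetch_vars):
    paddle.enable_static()
    exe = paddle.static.Executor(paddle.CPUPlace())
    # _startup initializes (or fake-initializes) parameters on the trainer.
    exe.run(_startup)
    # _main now contains the send ops appended by the passes above.
    return exe.run(_main, feed=feed_dict, fetch_list=fetch_vars)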
def run_single_pass(self):
    self.init_fleet_with_gloo()
    self.model = get_model(config)
    input_data = self.model.create_feeds()
    metrics = self.model.net(input_data)
    loss = self.model._cost
    user_defined_strategy = get_user_defined_strategy(config)
    learning_rate = config.get("hyper_parameters.optimizer.learning_rate")
    sync_mode = self.config.get("runner.sync_mode")

    inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)
    startup_program = paddle.static.default_startup_program()
    inner_optimizer.minimize(loss, startup_program)

    if self.config['debug_new_pass'] == 1:
        print("entering run {} - new".format(
            str(config["applied_pass_name"])))
        from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
        ps_optimizer = ParameterServerOptimizer(inner_optimizer)
        ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer,
                                     user_defined_strategy)
        ps_optimizer._set_origin_programs([loss])
        ps_optimizer._init_ps_pass_context(loss, startup_program)
        _main = ps_optimizer.pass_ctx._attrs['cloned_main']

        append_send_ops_pass = new_pass(config["applied_pass_name"],
                                        ps_optimizer.pass_ctx._attrs)
        append_send_ops_pass.apply([_main], [None], ps_optimizer.pass_ctx)
    else:
        print("entering run {} - old".format(
            str(config["applied_pass_name"])))
        from paddle.fluid.incubate.fleet.parameter_server.ir import public as public
        dist_strategy = get_distributed_strategy(user_defined_strategy)
        compiled_config = public.CompileTimeStrategy(
            loss.block.program, startup_program, dist_strategy,
            self.role_maker)

        _main = compiled_config.origin_main_program.clone()
        _startup = compiled_config.origin_startup_program.clone()

        from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker
        _main = worker.append_send_ops_pass(_main, compiled_config)

    if fleet.is_server():
        _main_file = ps_log_root_dir + sync_mode + "_" + str(
            config["applied_pass_name"]) + '_debug:_' + str(
                self.config['debug_new_pass']) + '_server_main.prototxt'
        debug_program(_main_file, _main)
    elif fleet.is_worker():
        _main_file = ps_log_root_dir + sync_mode + "_" + str(
            config["applied_pass_name"]) + '_debug:_' + str(
                self.config['debug_new_pass']) + '_worker_main.prototxt'
        debug_program(_main_file, _main)
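# debug_program is called above but not defined in this snippet. A plausible
# minimal implementation (an assumption, not the original helper) just dumps
# the Program's textual description to the given path, which is what the
# '*_main.prototxt' file names suggest.
def debug_program(file_path, program):
    with open(file_path, 'w') as f:
        # str(Program) prints every block with its ops and variables.
        f.write(str(program))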
def _build_trainer_programs(self, compiled_config):
    from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker

    _main = compiled_config.origin_main_program.clone()
    _startup = compiled_config.origin_startup_program.clone()

    use_ps_gpu = self.user_defined_strategy.a_sync_configs["use_ps_gpu"]

    if not compiled_config.is_geo_mode():
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _add_lr_decay_table_pass
        _add_lr_decay_table_pass(
            _main, compiled_config,
            self.user_defined_strategy.a_sync_configs["lr_decay_steps"])

        # for main program
        _main = worker.distributed_ops_pass(_main, compiled_config,
                                            use_ps_gpu)
        if not use_ps_gpu:
            _main = worker.delete_optimizer_pass(_main, compiled_config)
            _main = worker.append_send_ops_pass(_main, compiled_config)
            _startup = worker.delete_extra_optimizes_pass(
                _startup, compiled_config)

        # for startup program
        _startup = worker.fake_init_ops_pass(_startup, compiled_config)
        if use_ps_gpu:
            _main = worker.ps_gpu_pass(_main)
            from paddle.fluid.transpiler.collective import SingleProcessMultiThread
            t = SingleProcessMultiThread()
            env = self.get_dist_env()
            t.transpile(
                startup_program=_startup,
                main_program=_main,
                rank=env["trainer_id"],
                endpoints=env["trainer_endpoints"],
                current_endpoint=env['current_endpoint'],
                wait_port=False)

        compiled_config.set_origin_ps_main_program(_main)
        compiled_config.set_origin_ps_startup_program(_startup)

        # for heter program
        if self.role_maker._is_heter_parameter_server_mode:
            from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
            if self.role_maker._is_heter_worker():
                # for heter worker
                stage_id = self.role_maker._get_stage_id()
                device = self.role_maker._heter_device_type().lower()
                _main = heter_worker.split_heter_worker_ops_pass(
                    _main, compiled_config, stage_id, device)
            else:
                # for default worker
                _main = heter_worker.split_trainer_ops_pass(
                    _main, compiled_config)
    else:
        _main = worker.append_send_ops_pass(_main, compiled_config)
        # startup program is left unchanged in geo mode
        compiled_config.set_origin_ps_main_program(_main)
        compiled_config.set_origin_ps_startup_program(_startup)

    launch_barrier = self.user_defined_strategy.a_sync_configs["launch_barrier"]
    launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
    if launch_barrier and launch_barrier_flag:
        # for trainer wait server ready
        wait_server_ready(self.role_maker._get_pserver_endpoints())

        # for ps-heter mode, wait heter worker ready
        # if self.role_maker._is_heter_parameter_server_mode and \
        #         self.role_maker._is_worker():
        #     wait_server_ready(self.role_maker._get_heter_worker_endpoints())

    return _main, _startup
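# self.get_dist_env() is used above for the ps-gpu transpile call but is not
# shown in this snippet. The sketch below is an assumption about what it likely
# returns, based on the dictionary keys consumed by t.transpile and on the
# environment variables conventionally exported by Paddle's distributed
# launcher; the real method may differ.
import os


def get_dist_env():
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",")
    current = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
    return {
        "trainer_id": trainer_id,        # rank passed to t.transpile
        "trainer_endpoints": endpoints,  # all trainer ip:port endpoints
        "current_endpoint": current,     # this trainer's ip:port
    }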