def __init__(self, program_func, optimizer, param_path=None, place=None):
    # 1. we need to generate a framework.Program by calling
    # program_func. Reference: fluid.program_guard in
    # test_word2vec.py
    self.scope = core.Scope()

    self.startup_program = framework.Program()
    self.train_program = framework.Program()

    with framework.program_guard(self.train_program, self.startup_program):
        loss = program_func()
        if not isinstance(optimizer, opt_module.Optimizer):
            raise TypeError(
                "The optimizer should be an instance of Optimizer")
        optimize_ops, params_grads = optimizer.minimize(loss)

    self.place = Trainer._check_and_get_place(place)

    self.dist_transpile_if_necessary(optimize_ops, params_grads)

    # 2. move the default_main_program to self.program and run the
    # default_startup program on an empty core.Scope()
    # Run startup program
    with self._prog_and_scope_guard():
        exe = executor.Executor(place)
        exe.run(self.startup_program)

    if param_path:
        # load params from param_path into scope
        io.load_persistables(exe, dirname=param_path)
def __init__(self, infer_func, param_path, place=None, parallel=False):
    """
    :param infer_func: a function that builds the network and returns the
        predict Variable
    :param param_path: the path where the inference model was saved by
        fluid.io.save_params
    :param place: the place to run the inference on
    :param parallel: whether to use parallel_executor to run the inference;
        if True, multiple CPUs/GPUs will be used.
    """
    self.param_path = param_path
    self.scope = core.Scope()
    self.parallel = parallel
    self.place = check_and_get_place(place)

    self.inference_program = framework.Program()
    with framework.program_guard(self.inference_program):
        with unique_name.guard():
            self.predict_var = infer_func()

    with self._prog_and_scope_guard():
        # load params from param_path into scope
        io.load_params(executor.Executor(self.place), param_path)

    if parallel:
        with self._prog_and_scope_guard():
            self.exe = parallel_executor.ParallelExecutor(
                use_cuda=isinstance(self.place, core.CUDAPlace),
                loss_name=self.predict_var.name)
    else:
        self.exe = executor.Executor(self.place)
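# A minimal usage sketch for the constructor above, assuming it belongs to an
# Inferencer-style class (referred to as `Inferencer` here); the class name,
# the network layers, and the paths are illustrative assumptions, not part of
# the snippet itself.
import paddle.fluid as fluid

def inference_program():
    # build the prediction network and return the predict Variable
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    return y_predict

inferencer = Inferencer(
    infer_func=inference_program,
    param_path="./my_paddle_model",  # directory written by fluid.io.save_params
    place=fluid.CPUPlace(),
    parallel=False)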
def __init__(self, train_func, optimizer, param_path=None, place=None,
             parallel=False):
    self.__stop = False
    self.parallel = parallel

    # 1. we need to generate a framework.Program by calling
    # program_func. Reference: fluid.program_guard in
    # test_word2vec.py
    if not isinstance(optimizer, opt_module.Optimizer):
        raise TypeError("The optimizer should be an instance of Optimizer")

    self.scope = core.Scope()

    self.startup_program = framework.Program()
    self.train_program = framework.Program()

    with framework.program_guard(self.train_program, self.startup_program):
        program_func_outs = train_func()
        self.train_func_outputs = program_func_outs if isinstance(
            program_func_outs, list) else [program_func_outs]
        self.test_program = self.train_program.clone()

        # The first element of train_func_outputs is the loss.
        loss = self.train_func_outputs[0]
        optimize_ops, params_grads = optimizer.minimize(loss)

    self.place = check_and_get_place(place)

    self._dist_transpile_if_necessary(optimize_ops, params_grads)

    # 2. move the default_main_program to self.program and run the
    # default_startup program on an empty core.Scope()
    # Run startup program
    with self._prog_and_scope_guard():
        exe = executor.Executor(place)
        exe.run(self.startup_program)

    if param_path:
        # load params from param_path into scope
        io.load_persistables(exe, dirname=param_path)
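# A hedged usage sketch for the constructor above, assuming it belongs to a
# Trainer-style class (referred to as `Trainer` here). The regression network
# is purely illustrative; only the keyword arguments mirror the snippet.
import paddle.fluid as fluid

def train_program():
    # return the loss first; any extra outputs would follow it in a list
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    loss = fluid.layers.mean(
        fluid.layers.square_error_cost(input=y_predict, label=y))
    return loss

trainer = Trainer(
    train_func=train_program,
    optimizer=fluid.optimizer.SGD(learning_rate=0.001),
    param_path=None,
    place=fluid.CPUPlace(),
    parallel=False)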
def _load_lookup_table_vars(executor, dirname, program, pserver_id,
                            table_name):
    """
    The parameter server will load the lookup table's local file into a
    SelectedRows variable.

    Args:
        executor(Executor): The executor to run for loading persistable
            variables
        dirname(str): The directory path
        program(Program): The program in which to find the variable named
            table_name
        pserver_id(int): the serial number in the pserver_endpoints list
        table_name(str): lookup table name

    Returns:
        None

    Examples:
        .. code-block:: python

            exe = fluid.Executor(fluid.CPUPlace())
            dirname = "./checkpoints/checkpoint_9/"
            prog = fluid.default_main_program()
            pserver_id = 1
            table_name = "share_w"
            _load_lookup_table_vars(executor=exe, dirname=dirname,
                                    program=prog, pserver_id=pserver_id,
                                    table_name=table_name)
    """
    lookup_table_var = None
    for var in program.list_vars():
        if var.name == table_name:
            lookup_table_var = var
            break
    assert lookup_table_var is not None

    lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
    table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id)

    load_prog = framework.Program()
    load_block = load_prog.global_block()

    load_block.append_op(
        type='load',
        inputs={},
        outputs={'Out': [lookup_table_var]},
        attrs={'file_path': os.path.join(lookup_table_dir, table_file)})

    executor.run(load_prog)
def _save_pserver_vars_by_notify(executor, dirname, lookup_table,
                                 ps_endpoint_list):
    """
    This function sends a checkpoint notify message from Trainer 0 to all
    the pservers. The checkpoint notify message contains the lookup table
    name and the absolute path on the pserver where the lookup table should
    be saved.

    Args:
        executor(Executor): The executor to run for sending the checkpoint
            notify.
        dirname(str): The folder where to save checkpoints.
        lookup_table(string): the lookup table name; when using a distributed
            lookup table, we can get the lookup table name from
            DistributeTranspiler.table_name
        ps_endpoint_list(list): the parameter server ip:port list; when using
            a distributed lookup table, we can get ps_endpoint_list from the
            distribute arguments.

    Returns:
        None

    Examples:
        .. code-block:: python

            exe = fluid.Executor(fluid.CPUPlace())
            param_path = "./my_paddle_model"
            prog = fluid.default_main_program()
            table_name = "share_w"
            ps_endpoints = ["127.0.0.1:6000", "127.0.0.1:6001"]

            _save_pserver_vars_by_notify(executor=exe, dirname=param_path,
                                         lookup_table=table_name,
                                         ps_endpoint_list=ps_endpoints)
    """
    cur_dir = _get_lookuptable_dir(dirname)

    checkpoint_notify_program = framework.Program()
    checkpoint_notify_block = checkpoint_notify_program.global_block()

    attrs = {}
    attrs['epmap'] = ps_endpoint_list
    attrs['dir'] = cur_dir
    attrs['lookup_table'] = lookup_table

    checkpoint_notify_block.append_op(
        type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
    executor.run(checkpoint_notify_program)
def __init__(self, network_func, param_path=None, place=None):
    # 1. we need to generate a framework.Program by calling
    # network_func. Reference: fluid.program_guard in test_word2vec.py
    # 2. move the default_main_program to self.program.
    # 3. run the default_startup program.
    # 4. load params from param_path into scope
    self.scope = core.Scope()
    self.place = place
    self.startup_program = framework.Program()
    # TODO: generate the startup_program with network_func

    exe = executor.Executor(place)
    exe.run(self.startup_program, scope=self.scope)

    if param_path:
        # load params from param_path into scope
        io.load_persistables(exe, dirname=param_path)
def __init__(self, infer_func, param_path, place=None, parallel=False):
    self.param_path = param_path
    self.scope = core.Scope()
    self.parallel = parallel
    self.place = check_and_get_place(place)

    self.inference_program = framework.Program()
    with framework.program_guard(self.inference_program):
        with unique_name.guard():
            self.predict_var = infer_func()

    with self._prog_and_scope_guard():
        # load params from param_path into scope
        io.load_params(executor.Executor(self.place), param_path)

    if parallel:
        with self._prog_and_scope_guard():
            self.exe = parallel_executor.ParallelExecutor(
                use_cuda=isinstance(self.place, core.CUDAPlace),
                loss_name=self.predict_var.name)
    else:
        self.exe = executor.Executor(self.place)

    self.inference_program = self.inference_program.clone(for_test=True)
def __init__(self, train_func, optimizer_func, param_path=None, place=None,
             parallel=False, checkpoint_config=None):
    self.__stop = False
    self.parallel = parallel

    # config for checkpoint
    # only the chief worker will save variables
    self.trainer_id = 0
    self.checkpoint_cfg = checkpoint_config
    if self.checkpoint_cfg:
        assert isinstance(self.checkpoint_cfg, CheckpointConfig)
        serial = io.get_latest_checkpoint_serial(
            self.checkpoint_cfg.checkpoint_dir)
        self.checkpoint_cfg.load_serial = serial if serial >= 0 else None

    self.scope = core.Scope()

    # 1. we need to generate a framework.Program by calling
    # program_func. Reference: fluid.program_guard in
    # test_word2vec.py
    self.startup_program = framework.Program()
    self.train_program = framework.Program()

    with framework.program_guard(self.train_program, self.startup_program):
        program_func_outs = train_func()
        self.train_func_outputs = program_func_outs if isinstance(
            program_func_outs, list) else [program_func_outs]
        self.test_program = self.train_program.clone(for_test=True)

        # The first element of program_func_outs is the loss.
        loss = self.train_func_outputs[0]

        optimizer = optimizer_func()
        if not isinstance(optimizer, opt_module.Optimizer):
            raise TypeError(
                "The optimizer should be an instance of Optimizer")
        optimize_ops, params_grads = optimizer.minimize(loss)

    self.place = check_and_get_place(place)

    self._dist_transpile_if_necessary(optimize_ops, params_grads)

    # 2. move the default_main_program to self.program and run the
    # default_startup program on an empty core.Scope()
    # Run startup program
    with self._prog_and_scope_guard():
        exe = executor.Executor(place)
        exe.run(self.startup_program)

    if self.checkpoint_cfg and self.checkpoint_cfg.load_serial is not None:
        with self._prog_and_scope_guard():
            exe = executor.Executor(place)
            io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir,
                               self.checkpoint_cfg.load_serial,
                               self.startup_program)

        if not self.checkpoint_cfg.is_pserver:
            epoch_id, step_id = io.load_trainer_args(
                self.checkpoint_cfg.checkpoint_dir,
                self.checkpoint_cfg.load_serial, self.trainer_id,
                self._get_checkpoint_load_args())
            self.checkpoint_cfg.epoch_id = int(epoch_id)
            self.checkpoint_cfg.step_id = int(step_id)

    if param_path and os.path.isdir(param_path):
        # load params from param_path into scope
        io.load_persist_vars_without_grad(
            exe, dirname=param_path, program=self.startup_program)
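# A hedged usage sketch for the checkpoint-aware constructor above. It assumes
# the enclosing class is a Trainer (called `Trainer` here) and that
# CheckpointConfig accepts a checkpoint_dir keyword; the class name, the
# CheckpointConfig signature, and the network are assumptions, not confirmed
# by this snippet.
import paddle.fluid as fluid

def train_program():
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    return fluid.layers.mean(
        fluid.layers.square_error_cost(input=y_predict, label=y))

def optimizer_program():
    # the constructor calls this to obtain a fresh Optimizer instance
    return fluid.optimizer.SGD(learning_rate=0.001)

trainer = Trainer(
    train_func=train_program,
    optimizer_func=optimizer_program,
    place=fluid.CPUPlace(),
    checkpoint_config=CheckpointConfig(checkpoint_dir="./checkpoints"))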