def run_pserver(pserver_id):
    """Build and run a one-op parameter-server program around a 'table' tensor.

    Calls ``remove_ps_flag`` with the current process id, stores a (5, 8)
    float32 tensor named 'table' in a fresh scope, appends a single
    ``listen_and_serv`` op to a fresh program and runs that program on a
    CPU executor.

    Args:
        pserver_id: numeric id of this server; it offsets the table values
            so that row ``i`` is filled with ``i + pserver_id * 10 + 1``.
    """
    remove_ps_flag(os.getpid())

    server_scope = fluid.core.Scope()
    main_program = Program()
    with fluid.scope_guard(server_scope), program_guard(
            main_program, startup_program=Program()):
        device = fluid.CPUPlace()

        # Every row starts as ones, so the in-place multiply below leaves
        # row i holding the scalar (i + pserver_id * 10 + 1).
        table_values = np.ones((5, 8)).astype("float32")
        for row_idx, row in enumerate(table_values):
            # `row` is a view into `table_values`; the update is in place.
            row *= row * row_idx + pserver_id * 10 + 1

        table_tensor = server_scope.var('table').get_tensor()
        table_tensor.set(table_values, device)

        # Empty child block handed to the op as its only optimize block.
        optimize_block = main_program._create_block(
            main_program.global_block().idx)
        serv_attrs = {
            "optimize_blocks": [optimize_block],
            "endpoint": '127.0.0.1:0',
            "Fanin": 1,
            "distributed_mode": DistributedMode.SYNC,
            "grad_to_block_id": []
        }
        main_program.global_block().append_op(type="listen_and_serv",
                                              inputs={'X': []},
                                              outputs={},
                                              attrs=serv_attrs)

        # NOTE(review): presumably this call blocks while the server is
        # listening -- confirm against the listen_and_serv op.
        fluid.Executor(device).run(main_program)
def run_pserver(pserver_id, use_cuda, sync_mode):
    """Build and run a one-op parameter-server program around a 'table' tensor.

    Stores a (10, 8) float32 tensor named 'table' in a fresh scope, appends
    a single ``listen_and_serv`` op to a fresh program and runs that program
    on an executor bound to the selected device.

    Args:
        pserver_id: numeric id of this server; it offsets the table values
            so that row ``i`` is filled with ``i + pserver_id * 10``.
        use_cuda: when truthy the tensor and executor use ``CUDAPlace(0)``,
            otherwise ``CPUPlace()``.
        sync_mode: accepted but not read by this function; the op's
            "sync_mode" attribute is always set to True.
    """
    server_scope = fluid.core.Scope()
    main_program = Program()
    with fluid.scope_guard(server_scope), program_guard(
            main_program, startup_program=Program()):
        if use_cuda:
            device = fluid.CUDAPlace(0)
        else:
            device = fluid.CPUPlace()

        # Every row starts as ones, so the in-place multiply below leaves
        # row i holding the scalar (i + pserver_id * 10).
        table_values = np.ones((10, 8)).astype("float32")
        for row_idx, row in enumerate(table_values):
            # `row` is a view into `table_values`; the update is in place.
            row *= row * row_idx + pserver_id * 10

        table_tensor = server_scope.var('table').get_tensor()
        table_tensor.set(table_values, device)

        # Empty child block handed to the op as its only optimize block.
        optimize_block = main_program._create_block(
            main_program.global_block().idx)
        serv_attrs = {
            "optimize_blocks": [optimize_block],
            "endpoint": '127.0.0.1:0',
            "Fanin": 1,
            "sync_mode": True,
            "grad_to_block_id": []
        }
        main_program.global_block().append_op(
            type="listen_and_serv",
            inputs={'X': []},
            outputs={},
            attrs=serv_attrs)

        # NOTE(review): presumably this call blocks while the server is
        # listening -- confirm against the listen_and_serv op.
        fluid.Executor(device).run(main_program)
def get_pserver_program(self, endpoint):
    """
    Get parameter server side program.

    Deprecated: every call writes a notice to stderr recommending
    get_pserver_programs() instead.

    The returned program contains, in its global block, the variables
    cloned from ``param_grad_ep_mapping[endpoint]["params"]``, the
    per-trainer receive variables (sync mode with more than one trainer
    only) and one ``fl_listen_and_serv`` op. One child block is created per
    entry of ``self._opti_var_list``; in sync mode with more than one
    trainer each child block sums the per-trainer variables into the target
    parameter and scales the result by ``1 / trainer_num``.

    The built program is also stored on ``self.pserver_program``.

    Args:
        endpoint (str): current parameter server endpoint.

    Returns:
        Program: the program for current parameter server to run.
    """
    # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
    # NOTE: assume blocks of the same variable is not distributed
    # on the same pserver, only change param/grad varnames for
    # trainers to fetch.
    sys.stderr.write(
        "get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.\n"
    )

    # step 1: start from an empty program that inherits the origin
    # program's random seed and distributed parameter info.
    pserver_program = Program()
    pserver_program.random_seed = self.origin_program.random_seed
    pserver_program._copy_dist_param_info_from(self.origin_program)

    # Averaging across trainers is only needed in sync mode with more than
    # one trainer; the same condition guards both loops below.
    merge_across_trainers = self.sync_mode and self.trainer_num > 1

    # step 2: create vars to receive vars at parameter servers.
    recv_inputs = []
    for v in self.param_grad_ep_mapping[endpoint]["params"]:
        self._clone_var(pserver_program.global_block(), v)

    for v in self.param_grad_ep_mapping[endpoint]["opti"]:
        # Create vars for each trainer in global scope, so we don't need to
        # create them when they arrive. The client side var name is mapped
        # back to the origin name by removing the ".opti.trainer_%d" suffix.
        suff_idx = v.name.find(".opti.trainer_")
        # Fall back to the full name when the suffix is absent. Previously
        # `orig_var_name` was left unbound (first iteration) or silently
        # kept the value from the previous iteration in that case.
        orig_var_name = v.name[:suff_idx] if suff_idx >= 0 else v.name

        # NOTE: the single (merged) var must already exist in the global
        # block for the multi-trainer case. The lookup result is not used
        # here; the call is kept because it presumably fails loudly when the
        # var is missing -- TODO confirm against Block.var().
        pserver_program.global_block().var(orig_var_name)

        if merge_across_trainers:
            for trainer_id in range(self.trainer_num):
                var = pserver_program.global_block().create_var(
                    name="%s.opti.trainer_%d" % (orig_var_name, trainer_id),
                    persistable=False,
                    type=v.type,
                    dtype=v.dtype,
                    shape=v.shape)
                recv_inputs.append(var)

    # step 3
    # NOTE(review): the three results below (`ufind`, `opt_op_on_pserver`,
    # `lr_ops`) are not consumed anywhere in this method. The calls are kept
    # because the helpers are defined elsewhere and may have side effects;
    # confirm before removing them.
    # Create a union-find data structure from optimize ops.
    ufind = self._create_ufind(self.optimize_ops)
    # step 3.2
    # Collect the optimize ops which are located on the current pserver.
    opt_op_on_pserver = [
        op for op in self.optimize_ops
        if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
            endpoint, op)
    ]
    # step 3.4
    lr_ops = self._get_lr_ops()

    # step 4: record optimize blocks so they can run in parallel on the
    # pserver. Every block is created with the same parent index.
    opti_blocks = []
    pre_block_idx = pserver_program.num_blocks - 1
    for opti_var in self._opti_var_list:
        per_opt_block = pserver_program._create_block(pre_block_idx)
        opti_blocks.append(per_opt_block)
        optimize_target_param_name = self._opti_to_param[opti_var]
        pserver_block = per_opt_block.program.global_block()

        # Merge the per-trainer copies into the target parameter:
        # sum them, then scale by 1 / trainer_num.
        merged_var = pserver_block.vars[optimize_target_param_name]
        if merge_across_trainers:
            vars2merge = [
                pserver_block.vars["%s.opti.trainer_%d" %
                                   (optimize_target_param_name, i)]
                for i in range(self.trainer_num)
            ]
            per_opt_block.append_op(type="sum",
                                    inputs={"X": vars2merge},
                                    outputs={"Out": merged_var},
                                    attrs={"use_mkldnn": False})
            per_opt_block.append_op(
                type="scale",
                inputs={"X": merged_var},
                outputs={"Out": merged_var},
                attrs={"scale": 1.0 / float(self.trainer_num)})

    # NOTE: when `_opti_var_list` is empty, `opti_blocks` stays empty and is
    # passed to the op as-is; no placeholder block is created here.
    attrs = {
        "optimize_blocks": opti_blocks,
        "endpoint": endpoint,
        "Fanin": self.trainer_num,
        "sync_mode": self.sync_mode,
    }

    # step 5: append the listen_and_serv op
    pserver_program.global_block().append_op(type="fl_listen_and_serv",
                                             inputs={'X': recv_inputs},
                                             outputs={},
                                             attrs=attrs)

    pserver_program._sync_with_cpp()
    # save pserver program to generate pserver side startup relatively.
    self.pserver_program = pserver_program
    return pserver_program