def get_pserver_program(self, endpoint, optimize_ops): pserver_program = Program() for v in self.param_grad_map[endpoint]["params"]: self._clone_param(pserver_program.global_block(), v) optimize_sub_program = Program() grad_var_names = [ var.name for var in self.param_grad_map[endpoint]["grads"] ] for opt_op in optimize_ops: for _, var in opt_op.inputs.iteritems(): # NOTE: append operators to merge gradients from multiple # trainers. If trainers == 1, this is not needed. if self.trainers > 1 and var.name in grad_var_names: vars2merge = self._create_var_for_trainers( optimize_sub_program.global_block(), var, self.trainers) merged_var = optimize_sub_program.global_block( ).create_var(name=var.name, persistable=var.persistable, dtype=var.dtype, shape=var.shape) optimize_sub_program.global_block().append_op( type="sum", inputs={"X": vars2merge}, outputs={"Out": merged_var}) optimize_sub_program.global_block().append_op( type="scale", inputs={"X": merged_var}, outputs={"Out": merged_var}, attrs={"scale": 1.0 / float(self.trainers)}) else: optimize_sub_program.global_block().create_var( name=var.name, persistable=var.persistable, dtype=var.dtype, shape=var.shape) if opt_op.inputs.has_key("Grad"): if opt_op.inputs["Grad"].name in grad_var_names: optimize_sub_program.global_block().append_op( type=opt_op.type, inputs=opt_op.inputs, outputs=opt_op.outputs, attrs=opt_op.attrs) else: optimize_sub_program.global_block().append_op( type=opt_op.type, inputs=opt_op.inputs, outputs=opt_op.outputs, attrs=opt_op.attrs) pserver_program.global_block().append_op( type="recv", inputs={"RX": self.param_grad_map[endpoint]["grads"]}, # grads to recv outputs={}, attrs={ "OptimizeBlock": optimize_sub_program.global_block(), "endpoint": endpoint, "ParamList": [p.name for p in self.param_grad_map[endpoint]["params"]], "GradList": [p.name for p in self.param_grad_map[endpoint]["grads"]], "Trainers": self.trainers }) pserver_program.sync_with_cpp() return pserver_program
def get_pserver_program(self, endpoint): """ Get pserver side program using the endpoint. TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers. NOTE: assume blocks of the same variable is not distributed on the same pserver, only change param/grad varnames for trainers to fetch. """ # step1 pserver_program = Program() # step2: Create vars to receive vars at parameter servers. recv_inputs = [] for v in self.param_grad_ep_mapping[endpoint]["params"]: self._clone_var(pserver_program.global_block(), v) for v in self.param_grad_ep_mapping[endpoint]["grads"]: # create vars for each trainer in global scope, so # we don't need to create them when grad arrives. # change client side var name to origin name by # removing ".trainer_%d" suffix suff_idx = v.name.find(".trainer_") if suff_idx >= 0: orig_var_name = v.name[:suff_idx] else: orig_var_name = v.name # NOTE: single_trainer_var must be created for multi-trainer # case to merge grads from multiple trainers single_trainer_var = \ pserver_program.global_block().create_var( name=orig_var_name, persistable=True, type=v.type, dtype=v.dtype, shape=v.shape) if self.sync_mode and self.trainer_num > 1: for trainer_id in xrange(self.trainer_num): var = pserver_program.global_block().create_var( name="%s.trainer_%d" % (orig_var_name, trainer_id), persistable=False, type=v.type, dtype=v.dtype, shape=v.shape) recv_inputs.append(var) else: recv_inputs.append(single_trainer_var) # step 3 # Create a union-find data structure from optimize ops, # If two ops are connected, we could add these two ops # into one set. ufind = self._create_ufind(self.optimize_ops) # step 3.2 # Iterate through the ops and append optimize op which # located on current pserver opt_op_on_pserver = [] for _, op in enumerate(self.optimize_ops): if self._is_opt_op(op) and self._is_opt_op_on_pserver( endpoint, op): opt_op_on_pserver.append(op) # step 3.3 # Iterate through the ops, and if an op and the optimize ops # which located on current pserver are in one set, then # append it into the sub program. # We try to put optimization program run parallelly, assume # optimization program always looks like: # # prevop -> prevop -> opt op -> following op -> following op; -> # prevop -> prevop -> opt op -> following op -> following op; -> # global op -> global op # # we put operators that can run parallelly to many program blocks. # in above example, we seperate ops by the ";". Global ops must run # after all the optimize ops finished. global_ops = [] # HACK: optimization global ops only used to scale beta1 and beta2 # replace it with dependency engine. for op in self.optimize_ops: if self._is_adam_connected_op(op): global_ops.append(op) def __append_optimize_op__(op, block, grad_to_block_id): if self._is_opt_op(op): self._append_pserver_ops(block, op, endpoint, grad_to_block_id, default_main_program()) else: self._append_pserver_non_opt_ops(block, op) # append lr decay ops to the child block if exists lr_ops = self._get_lr_ops() if len(lr_ops) > 0: lr_decay_block = pserver_program.create_block( pserver_program.num_blocks - 1) for _, op in enumerate(lr_ops): self._append_pserver_non_opt_ops(lr_decay_block, op) # append op to the current block grad_to_block_id = [] pre_block_idx = pserver_program.num_blocks - 1 for idx, opt_op in enumerate(opt_op_on_pserver): per_opt_block = pserver_program.create_block(pre_block_idx) for _, op in enumerate(self.optimize_ops): # optimizer is connected to itself if ufind.is_connected(op, opt_op) and op not in global_ops: __append_optimize_op__(op, per_opt_block, grad_to_block_id) # append global ops if global_ops: opt_state_block = pserver_program.create_block( pserver_program.num_blocks - 1) for glb_op in global_ops: __append_optimize_op__(glb_op, opt_state_block, grad_to_block_id) # NOT USED: single block version: # # for _, op in enumerate(self.optimize_ops): # for _, opt_op in enumerate(opt_op_on_pserver): # if ufind.is_connected(op, opt_op): # __append_optimize_op__(glb_op, optimize_block) # break # process distributed lookup_table prefetch_block = None if self.has_distributed_lookup_table: pserver_index = self.pserver_endpoints.index(endpoint) table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx) prefetch_block = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) # NOTE: if has_distributed_lookup_table is False, then prefetch_block will # not be executed, so it's safe to use optimize_block to hold the place if self.has_distributed_lookup_table: assert prefetch_block is not None else: assert prefetch_block is None prefetch_block = pserver_program.global_block() # step5 append the listen_and_serv op pserver_program.global_block().append_op(type="listen_and_serv", inputs={'X': recv_inputs}, outputs={}, attrs={ "OptimizeBlock": pserver_program.block(1), "endpoint": endpoint, "Fanin": self.trainer_num, "PrefetchBlock": prefetch_block, "sync_mode": self.sync_mode, "grad_to_block_id": grad_to_block_id }) pserver_program.sync_with_cpp() return pserver_program