def backward_value_helper(self, cond_func, use_cuda, use_parallel_exe):
    """
    Helper function that checks whether the computed backward gradient is
    close to the numerically estimated dy/dx.
    """
    main_program = Program()
    main_program.random_seed = 123
    startup_program = Program()
    startup_program.random_seed = 123
    with program_guard(main_program, startup_program):
        img = fluid.data(name='image', shape=[-1, 9], dtype='float32')
        img.stop_gradient = False
        label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
        i = fluid.data(name="i", shape=[1], dtype='int32')
        loss = cond_func(i, img, label)
        append_backward(loss)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_program)

    num_devices = 1
    if use_parallel_exe:
        os.environ['CPU_NUM'] = str(2)
        exe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                     main_program=main_program,
                                     loss_name=loss.name)
        num_devices = exe.device_count

    delta = 0.005
    for feed_i in range(0, 10):
        feed_img = np.random.random(size=[1, 9]).astype(np.float32)
        feed_label = np.random.randint(low=0,
                                       high=10,
                                       size=[1, 1],
                                       dtype=np.int64)
        if use_parallel_exe:
            img_grad, loss_value = exe.run(
                feed={
                    'i': np.full((num_devices), feed_i, np.int32),
                    'image': np.repeat(feed_img, num_devices, axis=0),
                    'label': np.repeat(feed_label, num_devices, axis=0)
                },
                fetch_list=[img.grad_name, loss.name])
        else:
            img_grad, loss_value = exe.run(
                main_program,
                feed={
                    'i': np.full((1), feed_i, np.int32),
                    'image': feed_img,
                    'label': feed_label
                },
                fetch_list=[img.grad_name, loss.name])

        # Estimate dy/dx by finite differences: perturb one input element
        # at a time by `delta` and compare the loss change against the
        # analytic gradient fetched above.
        numerical_grad = np.zeros(shape=[num_devices, 9], dtype=np.float32)
        feed_img_delta = np.copy(feed_img)
        for j in range(9):
            feed_img_delta[0][j] = feed_img[0][j] + delta
            if use_parallel_exe:
                loss_delta = exe.run(
                    feed={
                        'i': np.full((num_devices), feed_i, np.int32),
                        'image': np.repeat(feed_img_delta,
                                           num_devices,
                                           axis=0),
                        'label': np.repeat(feed_label, num_devices, axis=0)
                    },
                    fetch_list=[loss.name])
                multi_device_grad = (loss_delta[0] -
                                     loss_value[0]) / delta / num_devices
                for d in range(num_devices):
                    numerical_grad[d][j] = multi_device_grad[d]
            else:
                loss_delta = exe.run(main_program,
                                     feed={
                                         'i': np.full((1), feed_i,
                                                      np.int32),
                                         'image': feed_img_delta,
                                         'label': feed_label
                                     },
                                     fetch_list=[loss.name])
                numerical_grad[0][j] = (loss_delta[0] -
                                        loss_value[0]) / delta
            feed_img_delta[0][j] = feed_img[0][j]
        self.assertTrue(
            np.isclose(img_grad, numerical_grad, atol=0.05, rtol=0.05).all())
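# A minimal sketch of a cond_func that the helper above accepts. The branch
# functions, the threshold value, and the use of layers.cond here are
# assumptions inferred from the helper's signature, not code taken from the
# original test.
def example_cond_func(i, img, label):
    # Both branches must return a scalar loss that depends on img, so the
    # helper can compare analytic and numerical gradients w.r.t. img.
    pred = layers.less_than(i, layers.fill_constant([1], 'int32', 5))
    return layers.cond(pred,
                       lambda: layers.mean(layers.relu(img)),
                       lambda: layers.mean(img * img))

# It would then be exercised as, for example:
#     self.backward_value_helper(example_cond_func, use_cuda=False,
#                                use_parallel_exe=False)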
def get_pserver_program(self, endpoint):
    """
    Get parameter server side program.

    Args:
        endpoint (str): current parameter server endpoint.

    Returns:
        Program: the program for the current parameter server to run.
    """
    # TODO(panyx0718): Revisit this assumption. What if #blocks > #pservers?
    # NOTE: assume blocks of the same variable are not distributed
    # on the same pserver; only change param/grad varnames for
    # trainers to fetch.
    sys.stderr.write(
        "get_pserver_program() is deprecated, call get_pserver_programs() "
        "to get pserver main and startup in a single call.\n")
    # step 1
    pserver_program = Program()
    pserver_program.random_seed = self.origin_program.random_seed
    pserver_program._copy_dist_param_info_from(self.origin_program)

    # step 2: create vars to receive vars at parameter servers.
    recv_inputs = []
    for v in self.param_grad_ep_mapping[endpoint]["params"]:
        self._clone_var(pserver_program.global_block(), v)
    for v in self.param_grad_ep_mapping[endpoint]["opti"]:
        # Create vars for each trainer in the global scope, so
        # we don't need to create them when the grad arrives.
        # Change the client side var name to the origin name by
        # removing the ".trainer_%d" suffix.
        suff_idx = v.name.find(".opti.trainer_")
        if suff_idx >= 0:
            orig_var_name = v.name[:suff_idx]
        else:
            orig_var_name = v.name
        # NOTE: single_trainer_var must be created for the multi-trainer
        # case to merge grads from multiple trainers.
        single_trainer_var = pserver_program.global_block().var(
            orig_var_name)
        if self.sync_mode and self.trainer_num > 1:
            for trainer_id in range(self.trainer_num):
                var = pserver_program.global_block().create_var(
                    name="%s.opti.trainer_%d" % (orig_var_name, trainer_id),
                    persistable=False,
                    type=v.type,
                    dtype=v.dtype,
                    shape=v.shape)
                recv_inputs.append(var)
        else:
            recv_inputs.append(single_trainer_var)

    # step 3
    # Create a union-find data structure from optimize ops:
    # if two ops are connected, we can add them into one set.
    ufind = self._create_ufind(self.optimize_ops)
    # step 3.2
    # Iterate through the ops and append each optimize op that is
    # located on the current pserver.
    opt_op_on_pserver = []
    for _, op in enumerate(self.optimize_ops):
        if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
                endpoint, op):
            opt_op_on_pserver.append(op)
    # step 3.4
    # Iterate through the ops; if an op is in the same set as the
    # optimize ops located on the current pserver, append it to the
    # sub program.
    global_ops = []
    # sparse grad name to param name
    sparse_grad_to_param = []
    # append lr decay ops to the child block if they exist
    lr_ops = self._get_lr_ops()
    # record optimize blocks so we can run them on the pserver in parallel
    opti_blocks = []
    # append op to the current block
    grad_to_block_id = []
    pre_block_idx = pserver_program.num_blocks - 1
    for idx, opt_op in enumerate(self._opti_var_list):
        per_opt_block = pserver_program._create_block(pre_block_idx)
        opti_blocks.append(per_opt_block)
        optimize_target_param_name = self._opti_to_param[opt_op]
        pserver_block = per_opt_block.program.global_block()
        # Append grad merging ops before clip and weight decay,
        # e.g. merge grad -> L2Decay op -> clip op -> optimize.
        merged_var = pserver_block.vars[optimize_target_param_name]
        if self.sync_mode and self.trainer_num > 1:
            vars2merge = []
            for i in range(self.trainer_num):
                per_trainer_name = "%s.opti.trainer_%d" % \
                    (optimize_target_param_name, i)
                vars2merge.append(pserver_block.vars[per_trainer_name])
            per_opt_block.append_op(type="sum",
                                    inputs={"X": vars2merge},
                                    outputs={"Out": merged_var},
                                    attrs={"use_mkldnn": False})
            per_opt_block.append_op(
                type="scale",
                inputs={"X": merged_var},
                outputs={"Out": merged_var},
                attrs={"scale": 1.0 / float(self.trainer_num)})

    # In some cases a parameter server has no parameter to optimize,
    # so we give it an empty optimize block.
    attrs = {
        "optimize_blocks": opti_blocks,
        "endpoint": endpoint,
        "Fanin": self.trainer_num,
        "sync_mode": self.sync_mode,
    }

    # step 5: append the listen_and_serv op
    pserver_program.global_block().append_op(type="fl_listen_and_serv",
                                             inputs={'X': recv_inputs},
                                             outputs={},
                                             attrs=attrs)

    pserver_program._sync_with_cpp()
    # Save the pserver program so the pserver side startup program can be
    # generated from it later.
    self.pserver_program = pserver_program
    return pserver_program
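# A minimal numpy sketch of what the "sum" and "scale" ops appended above
# compute in sync mode: the per-trainer tensors received under the
# ".opti.trainer_%d" names are summed and then scaled by 1/trainer_num,
# i.e. averaged, before the optimize block runs. The values are made up
# for illustration.
def _merge_trainer_vars_sketch():
    import numpy as np
    trainer_vars = [np.array([0.2, 0.4]), np.array([0.6, 0.8])]  # 2 trainers
    merged = np.sum(trainer_vars, axis=0)                # the "sum" op
    merged = merged * (1.0 / float(len(trainer_vars)))   # the "scale" op
    return merged  # array([0.4, 0.6]): the per-element average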
def get_startup_program(self, endpoint, pserver_program=None,
                        startup_program=None):
    """
    **Deprecated**

    Get the startup program for the current parameter server. Modify
    operator input variables if there are variables that were split into
    several blocks.

    Args:
        endpoint (str): current pserver endpoint.
        pserver_program (Program): deprecated, call get_pserver_program
            first.
        startup_program (Program): deprecated, should pass startup_program
            when initializing.

    Returns:
        Program: parameter server side startup program.
    """
    s_prog = Program()
    orig_s_prog = self.startup_program
    s_prog.random_seed = orig_s_prog.random_seed
    params = self.param_grad_ep_mapping[endpoint]["params"]

    def _get_splited_name_and_shape(varname):
        for idx, splited_param in enumerate(params):
            pname = splited_param.name
            if same_or_split_var(pname, varname) and varname != pname:
                return pname, splited_param.shape
        return "", []

    # 1. clone vars from the pserver program into the startup program
    pserver_vars = pserver_program.global_block().vars
    created_var_map = collections.OrderedDict()
    for _, var in six.iteritems(pserver_vars):
        tmpvar = s_prog.global_block()._clone_variable(var)
        created_var_map[var.name] = tmpvar

    # 2. rename op outputs
    for op in orig_s_prog.global_block().ops:
        new_outputs = collections.OrderedDict()
        # do not append a startup op if its var is not on this pserver
        op_on_pserver = False
        # TODO(gongwb): remove this line.
        if op.type not in ["recv", "fetch_barrier", "concat"]:
            for key in op.output_names:
                newname, _ = _get_splited_name_and_shape(op.output(key)[0])
                if newname:
                    op_on_pserver = True
                    new_outputs[key] = created_var_map[newname]
                elif op.output(key)[0] in pserver_vars:
                    op_on_pserver = True
                    new_outputs[key] = pserver_vars[op.output(key)[0]]

        if op_on_pserver:
            # most startup program ops have no inputs
            new_inputs = self._get_input_map_from_op(pserver_vars, op)
            if op.type in [
                    "gaussian_random", "fill_constant", "uniform_random",
                    "truncated_gaussian_random"
            ]:
                op._set_attr("shape", list(new_outputs["Out"].shape))
            s_prog.global_block().append_op(type=op.type,
                                            inputs=new_inputs,
                                            outputs=new_outputs,
                                            attrs=op.all_attrs())
    return s_prog
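# Hedged usage sketch of the two methods above from a pserver process. The
# transpiler object and the endpoint string are assumptions for
# illustration; only get_pserver_program() and get_startup_program() come
# from the code above.
def run_pserver_sketch(transpiler, current_endpoint):
    pserver_prog = transpiler.get_pserver_program(current_endpoint)
    pserver_startup = transpiler.get_startup_program(
        current_endpoint, pserver_program=pserver_prog)
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(pserver_startup)
    # Blocks here, serving trainers via the fl_listen_and_serv op.
    exe.run(pserver_prog)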
def static(train_data, loss_in_switch=True, use_cuda=False,
           use_parallel_exe=False):
    startup_program = Program()
    main_program = Program()
    startup_program.random_seed = SEED
    main_program.random_seed = SEED

    with program_guard(main_program, startup_program):

        def double_fc_net(image):
            hidden = layers.fc(
                image,
                size=FC_SIZE,
                act='relu',
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(value=0.99)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(value=0.5)),
                name="hidden")

            prediction = layers.fc(
                hidden,
                size=CLASS_NUM,
                act='softmax',
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(value=1.2)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(value=0.8)),
                name="prediction")
            return hidden, prediction

        def fn_1(opt, avg_loss=None, pred=None, label=None):
            if avg_loss is None:
                loss = layers.cross_entropy(input=pred, label=label)
                avg_loss = layers.mean(loss, name='mean_cross_entropy_loss')
            opt.minimize(avg_loss)
            return avg_loss

        def fn_2(opt, avg_loss=None, pred=None, label=None):
            if avg_loss is None:
                loss = layers.softmax_with_cross_entropy(logits=pred,
                                                         label=label)
                avg_loss = layers.mean(loss, name='mean_softmax_loss')
            opt.minimize(avg_loss)
            return avg_loss

        image = fluid.data('image', [BATCH_SIZE, INPUT_SIZE], 'float32')
        label = fluid.data('label', [BATCH_SIZE, 1], 'int64')
        hidden, prediction = double_fc_net(image)

        adam = optimizer.Adam(learning_rate=LR)
        sgd = optimizer.SGD(learning_rate=LR)

        id = fluid.data('id', [1], 'int32')
        two = layers.fill_constant([1], 'int32', 2)
        mod_two = layers.elementwise_mod(id, two) == 0

        if loss_in_switch:
            avg_loss = layers.case(
                [(mod_two, lambda: fn_1(adam, None, prediction, label))],
                lambda: fn_2(sgd, None, prediction, label))
        else:
            loss_1 = layers.cross_entropy(input=prediction, label=label)
            avg_loss_1 = layers.mean(loss_1)
            loss_2 = layers.softmax_with_cross_entropy(logits=prediction,
                                                       label=label)
            avg_loss_2 = layers.mean(loss_2)
            avg_loss = layers.case(
                [(mod_two, lambda: fn_1(adam, avg_loss_1))],
                lambda: fn_2(sgd, avg_loss_2))

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_program)

    for epoch in range(EPOCH_NUM):
        feed_image, feed_label = train_data[epoch]
        fetch_list = [hidden, prediction, avg_loss]
        feed = {
            'image': feed_image,
            'label': feed_label,
            'id': np.array([epoch]).astype('int32')
        }
        out = exe.run(main_program, feed=feed, fetch_list=fetch_list)
        out_hidden, out_pred, loss = out

    return out_hidden, out_pred, loss
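# Hedged usage sketch for static() above. EPOCH_NUM, BATCH_SIZE, INPUT_SIZE,
# and CLASS_NUM are module-level constants that static() already references;
# the random data generated here is an assumption for illustration only.
def _make_train_data_sketch():
    # One (image, label) pair per epoch, matching the feed in static().
    return [(np.random.random((BATCH_SIZE, INPUT_SIZE)).astype('float32'),
             np.random.randint(0, CLASS_NUM,
                               (BATCH_SIZE, 1)).astype('int64'))
            for _ in range(EPOCH_NUM)]

# out_hidden, out_pred, loss = static(_make_train_data_sketch(),
#                                     loss_in_switch=True)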