def test_sink(seed):
    rng = np.random.RandomState(seed)
    v = nn.Variable((2, 3, 4), need_grad=True)
    h0 = F.tanh(v)
    h1 = F.sigmoid(v)
    v.d = rng.randn(*v.shape).astype(np.float32)

    # Create references
    v.grad.zero()
    h0.forward()
    h1.forward()
    h0.backward()
    h1.backward()  # v.grad is accumulated.
    h0d = h0.d.copy()
    h1d = h1.d.copy()
    vg = v.g.copy()

    # Reset values
    h0.data.zero()
    h1.data.zero()
    v.grad.zero()

    # Check if sink works
    dummy = F.sink(h0, h1, one_input_grad=True)
    dummy.forward()
    dummy.backward()
    assert np.all(h0d == h0.d)
    assert np.all(h1d == h1.d)
    assert np.all(vg == v.g)
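
# A minimal, self-contained sketch of the F.sink idiom exercised in test_sink
# above: bundle several terminal variables into one dummy node so a single
# forward()/backward() call covers every branch. Shapes and values here are
# illustrative assumptions, not taken from any particular nnabla test.
import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable((2, 3), need_grad=True)
x.d = np.random.randn(*x.shape)
h0 = F.tanh(x)
h1 = F.sigmoid(x)
# one_input_grad=True (the default) seeds a gradient of 1 into every sink input.
dummy = F.sink(h0, h1, one_input_grad=True)
dummy.forward()
x.grad.zero()
dummy.backward()  # x.g accumulates the gradients from both branches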
def get_gru_grad(xs_np, h0_np, w0_np, w_np, b_np, dy, dh, num_layers=1,
                 dropout=0.0, bidirectional=False, training=True, **kw):
    # Inputs are numpy arrays
    num_directions = 2 if bidirectional else 1
    seq_len = xs_np.shape[0]
    batch_size = xs_np.shape[1]
    hidden_size = h0_np.shape[3]

    xs = nn.Variable.from_numpy_array(xs_np, need_grad=True)
    h0 = nn.Variable.from_numpy_array(h0_np, need_grad=True)
    w0 = nn.Variable.from_numpy_array(w0_np, need_grad=True)
    w = None
    b = None
    with_bias = False
    if num_layers > 1:
        w = nn.Variable.from_numpy_array(w_np, need_grad=True)
    if type(b_np) == np.ndarray:
        b = nn.Variable.from_numpy_array(b_np, need_grad=True)
        with_bias = True

    xs.grad.zero()
    h0.grad.zero()
    w0.grad.zero()
    if num_layers > 1:
        w.grad.zero()
    if with_bias:
        b.grad.zero()

    ys, hn = create_fixed_length_gru(
        xs, h0, w0, w, b, num_layers, num_directions,
        with_bias)  # returns Variables

    dummy = F.sink(ys, hn, one_input_grad=False)
    dummy.forward()
    ys.g = np.reshape(dy, ys.shape)
    hn.g = dh
    dummy.backward()

    if num_layers > 1 and with_bias:
        return np.concatenate((xs.g.flat, h0.g.flat, w0.g.flat, w.g.flat,
                               b.g.flat))
    elif num_layers > 1 and not with_bias:
        return np.concatenate((xs.g.flat, h0.g.flat, w0.g.flat, w.g.flat))
    elif num_layers == 1 and with_bias:
        return np.concatenate((xs.g.flat, h0.g.flat, w0.g.flat, b.g.flat))
    else:
        return np.concatenate((xs.g.flat, h0.g.flat, w0.g.flat))
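
# A small sketch of the one_input_grad=False pattern used in get_gru_grad above:
# with one_input_grad=False the sink does not seed its inputs' gradients, so the
# caller sets .g on each head explicitly before backpropagating through the
# shared graph. Shapes and gradient values below are illustrative only.
import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable((2, 3), need_grad=True)
x.d = np.random.randn(*x.shape)
ya = F.tanh(x)
yb = F.sigmoid(x)
dummy = F.sink(ya, yb, one_input_grad=False)
dummy.forward()
ya.g = np.full(ya.shape, 0.5, dtype=np.float32)  # caller-supplied output grads
yb.g = np.full(yb.shape, 2.0, dtype=np.float32)
x.grad.zero()
dummy.backward()  # x.g accumulates 0.5 * dtanh/dx + 2.0 * dsigmoid/dx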
def test_clear_output_grad_prohibit_clear_input(self):
    x1 = nn.Variable([1], need_grad=True)
    xx1 = F.identity(x1)
    y1 = F.add_scalar(xx1)
    y2 = F.add_scalar(xx1)
    y3 = F.sink(y1, y2)

    answer_grad = []
    answer_grad.append([True])   # y3
    answer_grad.append([False])  # y2
    answer_grad.append([False])  # y1
    answer_grad.append([True])   # xx1

    y3.forward(clear_no_need_grad=True)
    clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
    clear_called_flag_recorder.activate_clear_called_flag_recorder()
    y3.backward(clear_buffer=True)
    self.check_grad_cleared_flags(answer_grad)
def create_ema_op(params, ema_decay=0.9999):
    """
    Define exponential moving average update for trainable params.
    """
    def ema_update(p_ema, p_train):
        return F.assign(p_ema, ema_decay * p_ema + (1. - ema_decay) * p_train)

    ops = []
    with nn.parameter_scope("ema"):
        for name, p_train in params.items():
            p_ema = get_parameter_or_create(name,
                                            shape=p_train.shape,
                                            need_grad=False)
            p_ema.data.copy_from(p_train.data,
                                 use_current_context=False)  # initialize
            ops.append(ema_update(p_ema, p_train))
        ema_params = nn.get_parameters(grad_only=False)
    return F.sink(*ops), ema_params
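
# Hedged usage sketch for create_ema_op above. The tiny affine model, the SGD
# settings and the loop length are illustrative assumptions, not part of the
# original example; the point is that the returned sink is run once per
# iteration so the "ema" parameter scope tracks a decayed copy of the weights.
import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.solvers as S
from nnabla.parameter import get_parameter_or_create

x = nn.Variable((4, 8))
loss = F.mean(PF.affine(x, 2, name='fc') ** 2)
solver = S.Sgd(0.1)
solver.set_parameters(nn.get_parameters())
ema_op, ema_params = create_ema_op(nn.get_parameters(), ema_decay=0.999)
for _ in range(10):
    x.d = np.random.randn(*x.shape)
    loss.forward(clear_no_need_grad=True)
    solver.zero_grad()
    loss.backward(clear_buffer=True)
    solver.update()
    ema_op.forward()  # executes every F.assign update in one call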
def __init__(self, num_actions, num_envs, batch_size, v_coeff, ent_coeff,
             lr_scheduler):
    # inference graph
    self.infer_obs_t = nn.Variable((num_envs, 4, 84, 84))
    self.infer_pi_t,\
        self.infer_value_t = cnn_network(self.infer_obs_t, num_actions,
                                         'network')
    self.infer_t = F.sink(self.infer_pi_t, self.infer_value_t)

    # evaluation graph
    self.eval_obs_t = nn.Variable((1, 4, 84, 84))
    self.eval_pi_t, _ = cnn_network(self.eval_obs_t, num_actions, 'network')

    # training graph
    self.obss_t = nn.Variable((batch_size, 4, 84, 84))
    self.acts_t = nn.Variable((batch_size, 1))
    self.rets_t = nn.Variable((batch_size, 1))
    self.advs_t = nn.Variable((batch_size, 1))

    pi_t, value_t = cnn_network(self.obss_t, num_actions, 'network')

    # value loss
    l2loss = F.squared_error(value_t, self.rets_t)
    self.value_loss = v_coeff * F.mean(l2loss)

    # policy loss
    log_pi_t = F.log(pi_t + 1e-20)
    a_one_hot = F.one_hot(self.acts_t, (num_actions, ))
    log_probs_t = F.sum(log_pi_t * a_one_hot, axis=1, keepdims=True)
    self.pi_loss = F.mean(log_probs_t * self.advs_t)

    # entropy bonus (weighted by ent_coeff)
    entropy = -ent_coeff * F.mean(F.sum(pi_t * log_pi_t, axis=1))

    self.loss = self.value_loss - self.pi_loss - entropy

    self.params = nn.get_parameters()
    self.solver = S.RMSprop(lr_scheduler(0.0), 0.99, 1e-5)
    self.solver.set_parameters(self.params)
    self.lr_scheduler = lr_scheduler
def forward_variable(inputs, outputs, side, feed=None):
    rng = np.random.RandomState(389)
    if feed is None:
        if isinstance(inputs, nn.Variable):
            inputs.d = rng.randn(*inputs.d.shape)
        else:
            for v in inputs:
                v.d = rng.randn(*v.d.shape)
    elif callable(feed):
        feed(inputs, rng)
    if isinstance(outputs, nn.Variable):
        outputs.forward()
        yield outputs.d.copy()
    else:
        y = F.sink(*outputs)
        for v in outputs:
            v.persistent = True
        y.forward()
        for v in outputs:
            yield v.d.copy()
def test_instance_normalization_forward_backward(seed, x_shape, batch_axis,
                                                 channel_axis, output_stat):
    rng = np.random.RandomState(seed)
    input = np.array(rng.randn(*x_shape).astype(np.float32))
    eps = 1e-05

    stat_shape = tuple([x_shape[i] if i in _force_list(batch_axis) + [channel_axis, ]
                        else 1 for i in range(len(x_shape))])
    beta = rng.randn(*stat_shape).astype(np.float32)
    gamma = rng.randn(*stat_shape).astype(np.float32)

    x = nn.Variable.from_numpy_array(input)
    v_beta = nn.Variable.from_numpy_array(beta)
    v_gamma = nn.Variable.from_numpy_array(gamma)

    output = F.instance_normalization(
        x, v_beta, v_gamma, channel_axis, batch_axis, eps, output_stat)
    ref = ref_instance_normalization(
        input, beta, gamma, channel_axis, batch_axis, eps, output_stat)

    if output_stat:
        tmp = F.sink(*output)
        tmp.forward()
        tmp.backward()
        for o, r in zip(output, ref):
            assert o.shape == r.shape
            assert_allclose(o.d, r, atol=1e-2, rtol=1e-5)
    else:
        output.forward()
        output.backward()
        assert output.shape == ref.shape
        assert_allclose(output.d, ref, atol=1e-2, rtol=1e-5)
def execute_fixed_length_rnn(xs_np, h0_np, w0_np, w_np, b_np, num_layers=1,
                             nonlinearity='tanh', dropout=0.0,
                             bidirectional=False, training=True):
    # Inputs are numpy arrays
    num_directions = 2 if bidirectional else 1
    seq_len = xs_np.shape[0]
    batch_size = xs_np.shape[1]
    hidden_size = h0_np.shape[3]

    xs = nn.Variable.from_numpy_array(xs_np)
    h0 = nn.Variable.from_numpy_array(h0_np)
    w0 = nn.Variable.from_numpy_array(w0_np)
    w = None
    b = None
    with_bias = False
    if num_layers > 1:
        w = nn.Variable.from_numpy_array(w_np)
    if type(b_np) is np.ndarray:
        b = nn.Variable.from_numpy_array(b_np)
        with_bias = True

    ys, hn = create_fixed_length_rnn(xs, h0, w0, w, b, num_layers,
                                     nonlinearity, num_directions,
                                     with_bias)  # returns Variables
    dummy = F.sink(ys, hn)
    dummy.forward()

    # returns numpy arrays
    ys = F.reshape(ys, (seq_len, batch_size, num_directions * hidden_size))
    ys.forward()
    return ys.d, hn.d
def test_group_normalization_forward_backward(seed, num_groups, x_shape,
                                              batch_axis, channel_axis,
                                              output_stat):
    rng = np.random.RandomState(seed)
    input = np.array(rng.randn(*x_shape).astype(np.float32))

    stat_shape = [1 for _ in range(len(x_shape))]
    stat_shape[channel_axis] = input.shape[channel_axis]

    beta = rng.randn(*stat_shape).astype(np.float32)
    gamma = rng.randn(*stat_shape).astype(np.float32)
    eps = 1e-05

    x = nn.Variable.from_numpy_array(input)
    v_beta = nn.Variable.from_numpy_array(beta)
    v_gamma = nn.Variable.from_numpy_array(gamma)

    output = F.group_normalization(
        x, v_beta, v_gamma, num_groups, channel_axis, batch_axis, eps,
        output_stat)
    ref = ref_group_normalization(
        input, beta, gamma, num_groups, channel_axis, batch_axis, eps,
        output_stat)

    if output_stat:
        tmp = F.sink(*output)
        tmp.forward()
        tmp.backward()
        for o, r in zip(output, ref):
            assert o.shape == r.shape
            assert_allclose(o.d, r, atol=1e-2, rtol=1e-5)
    else:
        output.forward()
        output.backward()
        assert output.shape == ref.shape
        assert_allclose(output.d, ref, atol=1e-2, rtol=1e-5)
def test_weight_standardization_forward_backward(rng, w_shape, channel_axis,
                                                 output_stat):
    input = np.array(rng.randn(*w_shape).astype(np.float32))
    eps = 1e-05

    x = nn.Variable.from_numpy_array(input)
    output = F.weight_standardization(x, channel_axis, eps, output_stat)
    ref = ref_weight_standardization(input, channel_axis, eps, output_stat)

    if output_stat:
        tmp = F.sink(*output)
        tmp.forward()
        tmp.backward()
        for o, r in zip(output, ref):
            assert o.shape == r.shape
            assert np.allclose(o.d, r, atol=1e-2, rtol=1e-5)
    else:
        output.forward()
        output.backward()
        assert np.allclose(output.d, ref, atol=1e-2, rtol=1e-5)
def test_iterator_through_forward_sequence(module_func):
    func, in_shapes = module_func
    with nn.graph_def.graph() as g:
        inputs = [nn.ProtoVariable(shape) for shape in in_shapes]
        outputs = func(*inputs)

    inputs = [nn.Variable(shape) for shape in in_shapes]
    for i in inputs:
        i.d = np.random.random(i.shape)
    outputs_ref = func(*inputs)
    if not isinstance(outputs_ref, tuple):
        outputs_ref = (outputs_ref, )
    output = F.sink(*outputs_ref)

    forward_sequence = []

    def visit_func(f):
        if f.name != 'Sink':
            forward_sequence.append(f.name)

    output.visit(visit_func)

    for a, b in zip(g.default_graph().forward_sequence(), forward_sequence):
        assert a.type == b
def test_layer_normalization_forward_backward(seed, x_shape, batch_axis,
                                              output_stat):
    rng = np.random.RandomState(seed)
    input = rng.randn(*x_shape).astype(np.float32)

    stat_shape = list(x_shape)
    for baxis in _force_list(batch_axis):
        stat_shape[baxis] = 1

    beta = rng.randn(*stat_shape).astype(np.float32)
    gamma = rng.randn(*stat_shape).astype(np.float32)
    eps = 1e-05

    x = nn.Variable.from_numpy_array(input)
    v_beta = nn.Variable.from_numpy_array(beta)
    v_gamma = nn.Variable.from_numpy_array(gamma)

    output = F.layer_normalization(x, v_beta, v_gamma, batch_axis, eps,
                                   output_stat)
    ref = ref_layer_normalization(input, beta, gamma, batch_axis, eps,
                                  output_stat)

    if output_stat:
        tmp = F.sink(*output)
        tmp.forward()
        tmp.backward()
        for o, r in zip(output, ref):
            assert o.shape == r.shape
            assert_allclose(o.d, r, atol=1e-2, rtol=1e-5)
    else:
        output.forward()
        output.backward()
        assert_allclose(output.d, ref, atol=1e-2, rtol=1e-5)
def _build(self): # inference self.infer_obs_t = nn.Variable((1,) + self.obs_shape) with nn.parameter_scope('trainable'): self.infer_policy_t = policy_network(self.infer_obs_t, self.action_size, 'actor') # training self.obss_t = nn.Variable((self.batch_size,) + self.obs_shape) self.acts_t = nn.Variable((self.batch_size, self.action_size)) self.rews_tp1 = nn.Variable((self.batch_size, 1)) self.obss_tp1 = nn.Variable((self.batch_size,) + self.obs_shape) self.ters_tp1 = nn.Variable((self.batch_size, 1)) # critic loss with nn.parameter_scope('trainable'): # critic functions q1_t = q_network(self.obss_t, self.acts_t, 'critic/1') q2_t = q_network(self.obss_t, self.acts_t, 'critic/2') with nn.parameter_scope('target'): # target functions policy_tp1 = policy_network(self.obss_tp1, self.action_size, 'actor') smoothed_target = _smoothing_target(policy_tp1, self.target_reg_sigma, self.target_reg_clip) q1_tp1 = q_network(self.obss_tp1, smoothed_target, 'critic/1') q2_tp1 = q_network(self.obss_tp1, smoothed_target, 'critic/2') q_tp1 = F.minimum2(q1_tp1, q2_tp1) y = self.rews_tp1 + self.gamma * q_tp1 * (1.0 - self.ters_tp1) td1 = F.mean(F.squared_error(q1_t, y)) td2 = F.mean(F.squared_error(q2_t, y)) self.critic_loss = td1 + td2 # actor loss with nn.parameter_scope('trainable'): policy_t = policy_network(self.obss_t, self.action_size, 'actor') q1_t_with_actor = q_network(self.obss_t, policy_t, 'critic/1') q2_t_with_actor = q_network(self.obss_t, policy_t, 'critic/2') q_t_with_actor = F.minimum2(q1_t_with_actor, q2_t_with_actor) self.actor_loss = -F.mean(q_t_with_actor) # get neural network parameters with nn.parameter_scope('trainable'): with nn.parameter_scope('critic'): critic_params = nn.get_parameters() with nn.parameter_scope('actor'): actor_params = nn.get_parameters() # setup optimizers self.critic_solver = S.Adam(self.critic_lr) self.critic_solver.set_parameters(critic_params) self.actor_solver = S.Adam(self.actor_lr) self.actor_solver.set_parameters(actor_params) with nn.parameter_scope('trainable'): trainable_params = nn.get_parameters() with nn.parameter_scope('target'): target_params = nn.get_parameters() # build target update update_targets = [] sync_targets = [] for key, src in trainable_params.items(): dst = target_params[key] updated_dst = (1.0 - self.tau) * dst + self.tau * src update_targets.append(F.assign(dst, updated_dst)) sync_targets.append(F.assign(dst, src)) self.update_target_expr = F.sink(*update_targets) self.sync_target_expr = F.sink(*sync_targets)
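
# Standalone sketch of the target-network update idiom used in _build above:
# each parameter update is expressed as an F.assign node and all of them are
# grouped under one F.sink, so a single forward() applies the whole soft
# (Polyak) update. The parameter dicts and tau below are illustrative
# assumptions; only the wiring mirrors the original code.
import nnabla.functions as F

def build_soft_update(src_params, dst_params, tau):
    updates = []
    for key, src in src_params.items():
        dst = dst_params[key]
        updates.append(F.assign(dst, (1.0 - tau) * dst + tau * src))
    return F.sink(*updates)

# update_op = build_soft_update(trainable_params, target_params, tau=0.005)
# update_op.forward()  # one call performs every assignment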
def forward_backward_all(*vv):
    y = F.sink(*vv)
    y.forward()
    y.backward()
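
# Example use of the forward_backward_all helper above; the tiny graph is an
# illustrative assumption. Any number of terminal variables can be passed, and
# the sink drives one forward and one backward over the union of their graphs.
import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable((4,), need_grad=True)
x.d = np.random.randn(*x.shape)
x.grad.zero()
loss = F.mean(x ** 2)
aux = F.sum(F.sigmoid(x))
forward_backward_all(loss, aux)  # x.g now holds the summed gradients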
def train(args): """ Multi-Device Training NOTE: the communicator exposes low-level interfaces Steps: * Instantiate a communicator and set parameter variables. * Specify contexts for computation. * Initialize DataIterator. * Construct a computation graph for training and one for validation. * Initialize solver and set parameter variables to that. * Load checkpoint to resume previous training. * Create monitor instances for saving and displaying training stats. * Training loop * Computate error rate for validation data (periodically) * Get a next minibatch. * Execute forwardprop * Set parameter gradients zero * Execute backprop. * AllReduce for gradients * Solver updates parameters by using gradients computed by backprop and all reduce. * Compute training error """ # Create Communicator and Context comm = create_communicator(ignore_error=True) if comm: n_devices = comm.size mpi_rank = comm.rank device_id = comm.local_rank else: n_devices = 1 mpi_rank = 0 device_id = args.device_id if args.context == 'cpu': import nnabla_ext.cpu context = nnabla_ext.cpu.context() else: import nnabla_ext.cudnn context = nnabla_ext.cudnn.context(device_id=device_id) nn.set_default_context(context) n_train_samples = 50000 n_valid_samples = 10000 bs_valid = args.batch_size iter_per_epoch = int(n_train_samples / args.batch_size / n_devices) # Model rng = np.random.RandomState(313) comm_syncbn = comm if args.sync_bn else None if args.net == "cifar10_resnet23": prediction = functools.partial(resnet23_prediction, rng=rng, ncls=10, nmaps=64, act=F.relu, comm=comm_syncbn) data_iterator = data_iterator_cifar10 if args.net == "cifar100_resnet23": prediction = functools.partial(resnet23_prediction, rng=rng, ncls=100, nmaps=384, act=F.elu, comm=comm_syncbn) data_iterator = data_iterator_cifar100 # Create training graphs image_train = nn.Variable((args.batch_size, 3, 32, 32)) label_train = nn.Variable((args.batch_size, 1)) pred_train = prediction(image_train, test=False) pred_train.persistent = True loss_train = (loss_function(pred_train, label_train) / n_devices).apply(persistent=True) error_train = F.mean(F.top_n_error(pred_train, label_train, axis=1)).apply(persistent=True) loss_error_train = F.sink(loss_train, error_train) # Create validation graphs image_valid = nn.Variable((bs_valid, 3, 32, 32)) label_valid = nn.Variable((bs_valid, 1)) pred_valid = prediction(image_valid, test=True) error_valid = F.mean(F.top_n_error(pred_valid, label_valid, axis=1)) # Solvers solver = S.Adam() solver.set_parameters(nn.get_parameters()) base_lr = args.learning_rate warmup_iter = iter_per_epoch * args.warmup_epoch warmup_slope = base_lr * (n_devices - 1) / warmup_iter solver.set_learning_rate(base_lr) # load checkpoint if file exist. start_point = 0 if args.use_latest_checkpoint: files = glob.glob(f'{args.model_save_path}/checkpoint_*.json') if len(files) != 0: index = max([ int(n) for n in [re.sub(r'.*checkpoint_(\d+).json', '\\1', f) for f in files] ]) # load weights and solver state info from specified checkpoint file. start_point = load_checkpoint( f'{args.model_save_path}/checkpoint_{index}.json', solver) print(f'checkpoint is loaded. 
start iteration from {start_point}') # Create monitor monitor = Monitor(args.monitor_path) monitor_loss = MonitorSeries("Training loss", monitor, interval=10) monitor_err = MonitorSeries("Training error", monitor, interval=10) monitor_time = MonitorTimeElapsed("Training time", monitor, interval=10) monitor_verr = MonitorSeries("Validation error", monitor, interval=1) monitor_vtime = MonitorTimeElapsed("Validation time", monitor, interval=1) # Data Iterator # If the data does not exist, it will try to download it from the server # and prepare it. When executing multiple processes on the same host, it is # necessary to execute initial data preparation by the representative # process (rank is 0) on the host. # Download dataset by rank-0 process if single_or_rankzero(): rng = np.random.RandomState(mpi_rank) _, tdata = data_iterator(args.batch_size, True, rng) vsource, vdata = data_iterator(bs_valid, False) # Wait for data to be prepared without watchdog if comm: comm.barrier() # Prepare dataset for remaining process if not single_or_rankzero(): rng = np.random.RandomState(mpi_rank) _, tdata = data_iterator(args.batch_size, True, rng) vsource, vdata = data_iterator(bs_valid, False) # Training-loop ve = nn.Variable() for i in range(start_point // n_devices, args.epochs * iter_per_epoch): # Validation if i % iter_per_epoch == 0: ve_local = 0. k = 0 idx = np.random.permutation(n_valid_samples) val_images = vsource.images[idx] val_labels = vsource.labels[idx] for j in range(int(n_valid_samples / n_devices * mpi_rank), int(n_valid_samples / n_devices * (mpi_rank + 1)), bs_valid): image = val_images[j:j + bs_valid] label = val_labels[j:j + bs_valid] if len(image ) != bs_valid: # note that smaller batch is ignored continue image_valid.d = image label_valid.d = label error_valid.forward(clear_buffer=True) ve_local += error_valid.d.copy() k += 1 ve_local /= k ve.d = ve_local if comm: comm.all_reduce(ve.data, division=True, inplace=True) # Monitoring error and elapsed time if single_or_rankzero(): monitor_verr.add(i * n_devices, ve.d.copy()) monitor_vtime.add(i * n_devices) # Save model if single_or_rankzero(): if i % (args.model_save_interval // n_devices) == 0: iter = i * n_devices nn.save_parameters( os.path.join(args.model_save_path, 'params_%06d.h5' % iter)) if args.use_latest_checkpoint: save_checkpoint(args.model_save_path, iter, solver) # Forward/Zerograd image, label = tdata.next() image_train.d = image label_train.d = label loss_error_train.forward(clear_no_need_grad=True) solver.zero_grad() # Backward/AllReduce backward_and_all_reduce( loss_error_train, comm, with_all_reduce_callback=args.with_all_reduce_callback) # Solvers update solver.update() # Linear Warmup if i <= warmup_iter: lr = base_lr + warmup_slope * i solver.set_learning_rate(lr) # Monitoring loss, error and elapsed time if single_or_rankzero(): monitor_loss.add(i * n_devices, loss_train.d.copy()) monitor_err.add(i * n_devices, error_train.d.copy()) monitor_time.add(i * n_devices) # Save nnp last epoch if single_or_rankzero(): runtime_contents = { 'networks': [{ 'name': 'Validation', 'batch_size': args.batch_size, 'outputs': { 'y': pred_valid }, 'names': { 'x': image_valid } }], 'executors': [{ 'name': 'Runtime', 'network': 'Validation', 'data': ['x'], 'output': ['y'] }] } iter = args.epochs * iter_per_epoch nn.save_parameters( os.path.join(args.model_save_path, 'params_%06d.h5' % iter)) nnabla.utils.save.save( os.path.join(args.model_save_path, f'{args.net}_result.nnp'), runtime_contents) if comm: comm.barrier()
def train(): parser = argparse.ArgumentParser() parser.add_argument("--train-file", type=str) parser.add_argument("--valid-file", type=str) parser.add_argument("--num-training-examples", type=int, default=50) parser.add_argument("--accum-grad", type=int, default=1) parser.add_argument("--valid-interval", type=int, default=200) parser.add_argument("--threshold", type=float, default=0.95) parser.add_argument("--context", type=str, default="cpu") parser.add_argument("--device-id", type=int, default=0) args = parser.parse_args() from nnabla.ext_utils import get_extension_context extension_module = args.context ctx = get_extension_context(extension_module, device_id=args.device_id) nn.set_default_context(ctx) # prepare data iterators tdata = data_iterator( BAbI15DataSource(args.train_file, args.num_training_examples, shuffle=True), 1, False, False, False) vdata = data_iterator( BAbI15DataSource(args.valid_file, 1000, shuffle=True), 1, False, False, False) # prepare monitors monitor = M.Monitor("./bAbI15") tloss = M.MonitorSeries("Training Loss", monitor, interval=10) terror = M.MonitorSeries("Training Error", monitor, interval=10) verror = M.MonitorSeries("Validation Error", monitor, interval=1) # prepare solver solver = S.Adam() solver_initialized = False cnt = 0 while True: l = 0.0 e = 0.0 solver.zero_grad() for _ in range(args.accum_grad): # read next data x = tdata.next() V = x[1][0][0] E = x[2][0][0] ans = x[3][0][0] # construct GGNN output = predict(V, E) output = F.reshape(output, (1, output.shape[0])) # initialize solver if not solver_initialized: solver.set_parameters(nn.get_parameters()) solver_initialized = True solver.zero_grad() # calculate loss/error label = nn.Variable((1, 1)) label.data.data[0, 0] = ans output2 = output.unlinked() loss = F.mean(F.softmax_cross_entropy(output, label)) error = F.mean(F.top_n_error(output2, label)) F.sink(loss, error).forward(clear_no_need_grad=True) loss.backward(clear_buffer=True) l += loss.data.data e += error.data.data # dump log tloss.add(cnt, l / args.accum_grad) terror.add(cnt, e / args.accum_grad) l = 0.0 e = 0.0 solver.update() cnt += 1 if cnt % args.valid_interval == 0: # validation validation_error = 0 correct_example = None wrong_example = None for _ in range(vdata.size): x = vdata.next() id2str = x[0][0][0] V = x[1][0][0] E = x[2][0][0] ans = x[3][0][0] output = predict(V, E) output = F.reshape(output, (1, output.shape[0])) # calculate error label = nn.Variable((1, 1)) label.data.data[0, 0] = ans error = F.top_n_error(output, label) error.forward(clear_no_need_grad=True) if error.data.data > 0.5: if wrong_example is None: wrong_example = (id2str, V, E, ans, output.data.data) else: if correct_example is None: correct_example = (id2str, V, E, ans, output.data.data) validation_error += error.data.data validation_error /= vdata.size verror.add(cnt, validation_error) accuracy = 1 - validation_error if accuracy >= args.threshold: def show(example): for i, j in example[2]["is"]: print("{} is {}.".format(example[0][i], example[0][j])) for i, j in example[2]["has_fear"]: print("{} are afraid of {}.".format( example[0][i], example[0][j])) i = np.argmax(example[1]) print("What is {} afraid of?".format(example[0][i])) i = np.argmax(example[4]) print("Expected: {}, Actual: {}".format( example[0][example[3]], example[0][i])) if correct_example is not None: show(correct_example) if wrong_example is not None: show(wrong_example) break
def train(): """ Naive Multi-Device Training NOTE: the communicator exposes low-level interfaces * Parse command line arguments. * Instantiate a communicator and set parameter variables. * Specify contexts for computation. * Initialize DataIterator. * Construct a computation graph for training and one for validation. * Initialize solver and set parameter variables to that. * Create monitor instances for saving and displaying training stats. * Training loop * Computate error rate for validation data (periodically) * Get a next minibatch. * Execute forwardprop * Set parameter gradients zero * Execute backprop. * AllReduce for gradients * Solver updates parameters by using gradients computed by backprop and all reduce. * Compute training error """ # Parse args args = get_args() n_train_samples = 50000 n_valid_samples = 10000 bs_valid = args.batch_size # Create Communicator and Context extension_module = "cudnn" ctx = get_extension_context(extension_module, type_config=args.type_config) comm = C.MultiProcessDataParalellCommunicator(ctx) comm.init() n_devices = comm.size mpi_rank = comm.rank mpi_local_rank = comm.local_rank device_id = mpi_local_rank ctx.device_id = str(device_id) nn.set_default_context(ctx) # Model rng = np.random.RandomState(313) comm_syncbn = comm if args.sync_bn else None if args.net == "cifar10_resnet23": prediction = functools.partial(resnet23_prediction, rng=rng, ncls=10, nmaps=32, act=F.relu, comm=comm_syncbn) data_iterator = data_iterator_cifar10 if args.net == "cifar100_resnet23": prediction = functools.partial(resnet23_prediction, rng=rng, ncls=100, nmaps=384, act=F.elu, comm=comm_syncbn) data_iterator = data_iterator_cifar100 # Create training graphs image_train = nn.Variable((args.batch_size, 3, 32, 32)) label_train = nn.Variable((args.batch_size, 1)) pred_train = prediction(image_train, test=False) pred_train.persistent = True loss_train = (loss_function(pred_train, label_train) / n_devices).apply(persistent=True) error_train = F.mean(F.top_n_error(pred_train, label_train, axis=1)).apply(persistent=True) loss_error_train = F.sink(loss_train, error_train) input_image_train = {"image": image_train, "label": label_train} # Create validation graph image_valid = nn.Variable((bs_valid, 3, 32, 32)) label_valid = nn.Variable((args.batch_size, 1)) pred_valid = prediction(image_valid, test=True) error_valid = F.mean(F.top_n_error(pred_valid, label_valid, axis=1)) input_image_valid = {"image": image_valid, "label": label_valid} # Solvers solver = S.Adam() solver.set_parameters(nn.get_parameters()) base_lr = args.learning_rate warmup_iter = int( 1. 
* n_train_samples / args.batch_size / n_devices) * args.warmup_epoch warmup_slope = base_lr * (n_devices - 1) / warmup_iter solver.set_learning_rate(base_lr) # Create monitor from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed monitor = Monitor(args.monitor_path) monitor_loss = MonitorSeries("Training loss", monitor, interval=10) monitor_err = MonitorSeries("Training error", monitor, interval=10) monitor_time = MonitorTimeElapsed("Training time", monitor, interval=10) monitor_verr = MonitorSeries("Validation error", monitor, interval=1) monitor_vtime = MonitorTimeElapsed("Validation time", monitor, interval=1) # Data Iterator rng = np.random.RandomState(device_id) _, tdata = data_iterator(args.batch_size, True, rng) vsource, vdata = data_iterator(args.batch_size, False) # loss_error_train.forward() # Training-loop ve = nn.Variable() for i in range(int(args.max_iter / n_devices)): # Validation if i % int(n_train_samples / args.batch_size / n_devices) == 0: ve_local = 0. k = 0 idx = np.random.permutation(n_valid_samples) val_images = vsource.images[idx] val_labels = vsource.labels[idx] for j in range(int(n_valid_samples / n_devices * mpi_rank), int(n_valid_samples / n_devices * (mpi_rank + 1)), bs_valid): image = val_images[j:j + bs_valid] label = val_labels[j:j + bs_valid] if len(image ) != bs_valid: # note that smaller batch is ignored continue input_image_valid["image"].d = image input_image_valid["label"].d = label error_valid.forward(clear_buffer=True) ve_local += error_valid.d.copy() k += 1 ve_local /= k ve.d = ve_local comm.all_reduce(ve.data, division=True, inplace=True) # Save model if device_id == 0: monitor_verr.add(i * n_devices, ve.d.copy()) monitor_vtime.add(i * n_devices) if i % int(args.model_save_interval / n_devices) == 0: nn.save_parameters( os.path.join(args.model_save_path, 'params_%06d.h5' % i)) # Forward/Zerograd image, label = tdata.next() input_image_train["image"].d = image input_image_train["label"].d = label loss_error_train.forward(clear_no_need_grad=True) solver.zero_grad() # Backward/AllReduce backward_and_all_reduce( loss_error_train, comm, with_all_reduce_callback=args.with_all_reduce_callback) # Solvers update solver.update() # Linear Warmup if i <= warmup_iter: lr = base_lr + warmup_slope * i solver.set_learning_rate(lr) if device_id == 0: # loss and error locally, and elapsed time monitor_loss.add(i * n_devices, loss_train.d.copy()) monitor_err.add(i * n_devices, error_train.d.copy()) monitor_time.add(i * n_devices) # exit(0) if device_id == 0: nn.save_parameters( os.path.join(args.model_save_path, 'params_%06d.h5' % (args.max_iter / n_devices)))
def clear_no_need_grad_tester(rng, func, inputs, func_args=[], func_kwargs={}, backward=None, atol_f=1e-6, ctx=None, func_name=None, insert_identity=[], auto_forward=False): if ctx is None: ctx = nn.Context() if backward is None: backward = [True for _ in inputs] if not True in backward: return state_rng = None if rng is not None: state_rng = rng.get_state() else: rng = rng = np.random.RandomState(313) def create_variables(inputs, backward): vinputs = [] for i, b in zip(inputs, backward): if i is None: vinputs += [None] continue vinputs += [nn.Variable(i.shape, need_grad=b)] vinputs[-1].data.cast(i.dtype)[...] = i return vinputs vinputs = create_variables(inputs, backward) vinputs_clear_buffer = create_variables(inputs, backward) vinputs_identity_clear_buffer = [] if not insert_identity: insert_identity = [True] * len(vinputs) with nn.context_scope(ctx), nn.auto_forward(auto_forward): for idx, i in enumerate(vinputs_clear_buffer): if i is None: vinputs_identity_clear_buffer += [None] elif insert_identity[idx]: vinputs_identity_clear_buffer += [F.identity(i)] else: vinputs_identity_clear_buffer += [i] # Checking forward(clear_no_need_grad=True) with nn.context_scope(ctx), nn.auto_forward(auto_forward): o = func(*(vinputs + func_args), **func_kwargs) o = force_tuple(o) F.sink(*o).forward(clear_no_need_grad=False) o_clear_buffer = func(*(vinputs_identity_clear_buffer + func_args), **func_kwargs) o_clear_buffer = force_tuple(o_clear_buffer) o_identity_clear_buffer = list( map(lambda x: F.identity(x) if x is not None else None, o_clear_buffer)) o_identity_clear_buffer = list( filter(lambda x: x is not None, o_identity_clear_buffer)) F.sink(*o_identity_clear_buffer).forward(clear_no_need_grad=True) for i in range(len(o)): if o[i] is None: continue ref = o[i].d res = o_identity_clear_buffer[i].d assert_allclose( ref, res, atol=atol_f, err_msg="{} forward(clear_no_need_grad=True) test fails".format( func_name)) vinputs = list(filter(lambda x: x is not None, vinputs)) vinputs_clear_buffer = list( filter(lambda x: x is not None, vinputs_clear_buffer)) for i in range(len(vinputs)): vinputs[i].grad.zero() vinputs_clear_buffer[i].grad.zero() for i in range(len(o)): if o[i] is None: continue o[i].g = randn(rng, *o[i].shape) o_identity_clear_buffer[i].g = o[i].g F.sink(*o).backward() F.sink(*o_identity_clear_buffer).backward(clear_buffer=True) for i in range(len(vinputs)): ref = vinputs[i].g res = vinputs_clear_buffer[i].g assert_allclose( ref, res, atol=atol_f, err_msg="{} forward(clear_no_need_grad=True) and backward test fails" .format(func_name)) if state_rng: rng.set_state(state_rng)
def _create_optimizer(ctx, o, networks, datasets, renamed): class Optimizer: pass optimizer = Optimizer() optimizer.comm = current_communicator() comm_size = optimizer.comm.size if optimizer.comm else 1 optimizer.start_iter = (o.start_iter - 1) // comm_size + \ 1 if o.start_iter > 0 else 0 optimizer.end_iter = (o.end_iter - 1) // comm_size + \ 1 if o.end_iter > 0 else 0 optimizer.name = o.name optimizer.order = o.order optimizer.update_interval = o.update_interval if o.update_interval > 0 else 1 optimizer.network = networks[o.network_name] optimizer.data_iterators = OrderedDict() for d in o.dataset_name: optimizer.data_iterators[d] = datasets[d].data_iterator optimizer.dataset_assign = OrderedDict() for d in o.data_variable: optimizer.dataset_assign[ optimizer.network.variables[renamed.get(d.variable_name, d.variable_name)]] = d.data_name optimizer.generator_assign = OrderedDict() for g in o.generator_variable: optimizer.generator_assign[optimizer.network.variables[ renamed.get(g.variable_name, g.variable_name)]] = _get_generator(g) # for debugging # optimizer.net_variables = optimizer.network.variables # optimizer.net_variables.update(optimizer.network.parameters) optimizer.loss_variables = [] for l in o.loss_variable: optimizer.loss_variables.append( optimizer.network.variables[renamed.get(l.variable_name, l.variable_name)]) optimizer.parameter_learning_rate_multipliers = OrderedDict() for p in o.parameter_variable: param_variable_names = _get_matching_variable_names( p.variable_name, list(itertools.chain(optimizer.network.parameters.keys(), optimizer.network.variables.keys()))) for v_name in param_variable_names: if v_name in optimizer.network.parameters: optimizer.parameter_learning_rate_multipliers[ optimizer.network.parameters[v_name]] = p.learning_rate_multiplier elif v_name in optimizer.network.variables: optimizer.parameter_learning_rate_multipliers[ optimizer.network.variables[v_name]] = p.learning_rate_multiplier with nn.context_scope(ctx): if o.solver.type == 'Adagrad': optimizer.solver = S.Adagrad( o.solver.adagrad_param.lr, o.solver.adagrad_param.eps) init_lr = o.solver.adagrad_param.lr elif o.solver.type == 'Adadelta': optimizer.solver = S.Adadelta( o.solver.adadelta_param.lr, o.solver.adadelta_param.decay, o.solver.adadelta_param.eps) init_lr = o.solver.adadelta_param.lr elif o.solver.type == 'AdaBelief': optimizer.solver = S.AdaBelief(o.solver.adabelief_param.alpha, o.solver.adabelief_param.beta1, o.solver.adabelief_param.beta2, o.solver.adabelief_param.eps, o.solver.adabelief_param.wd, o.solver.adabelief_param.amsgrad, o.solver.adabelief_param.weight_decouple, o.solver.adabelief_param.fixed_decay, o.solver.adabelief_param.rectify) init_lr = o.solver.adabelief_param.alpha elif o.solver.type == 'Adam': optimizer.solver = S.Adam(o.solver.adam_param.alpha, o.solver.adam_param.beta1, o.solver.adam_param.beta2, o.solver.adam_param.eps) init_lr = o.solver.adam_param.alpha elif o.solver.type == 'AdamW': optimizer.solver = S.AdamW(o.solver.adamw_param.alpha, o.solver.adamw_param.beta1, o.solver.adamw_param.beta2, o.solver.adamw_param.eps, o.solver.adamw_param.wd) init_lr = o.solver.adamw_param.alpha elif o.solver.type == 'Adamax': optimizer.solver = S.Adamax(o.solver.adamax_param.alpha, o.solver.adamax_param.beta1, o.solver.adamax_param.beta2, o.solver.adamax_param.eps) init_lr = o.solver.adamax_param.alpha elif o.solver.type == 'AdaBound': optimizer.solver = S.AdaBound(o.solver.adabound_param.alpha, o.solver.adabound_param.beta1, o.solver.adabound_param.beta2, 
o.solver.adabound_param.eps, o.solver.adabound_param.final_lr, o.solver.adabound_param.gamma) init_lr = o.solver.adabound_param.alpha elif o.solver.type == 'AMSGRAD': optimizer.solver = S.AMSGRAD(o.solver.amsgrad_param.alpha, o.solver.amsgrad_param.beta1, o.solver.amsgrad_param.beta2, o.solver.amsgrad_param.eps) init_lr = o.solver.amsgrad_param.alpha elif o.solver.type == 'AMSBound': optimizer.solver = S.AMSBound(o.solver.amsbound_param.alpha, o.solver.amsbound_param.beta1, o.solver.amsbound_param.beta2, o.solver.amsbound_param.eps, o.solver.amsbound_param.final_lr, o.solver.amsbound_param.gamma) init_lr = o.solver.amsbound_param.alpha elif o.solver.type == 'Eve': p = o.solver.eve_param optimizer.solver = S.Eve( p.alpha, p.beta1, p.beta2, p.beta3, p.k, p.k2, p.eps) init_lr = p.alpha elif o.solver.type == 'Lars': optimizer.solver = S.Lars(o.solver.lars_param.lr, o.solver.lars_param.momentum, o.solver.lars_param.coefficient, o.solver.lars_param.eps) init_lr = o.solver.lars_param.lr elif o.solver.type == 'Momentum': optimizer.solver = S.Momentum( o.solver.momentum_param.lr, o.solver.momentum_param.momentum) init_lr = o.solver.momentum_param.lr elif o.solver.type == 'Nesterov': optimizer.solver = S.Nesterov( o.solver.nesterov_param.lr, o.solver.nesterov_param.momentum) init_lr = o.solver.nesterov_param.lr elif o.solver.type == 'RMSprop': optimizer.solver = S.RMSprop( o.solver.rmsprop_param.lr, o.solver.rmsprop_param.decay, o.solver.rmsprop_param.eps) init_lr = o.solver.rmsprop_param.lr elif o.solver.type == 'RMSpropGraves': optimizer.solver = S.RMSpropGraves( o.solver.rmsprop_graves_param.lr, o.solver.rmsprop_graves_param.decay, o.solver.rmsprop_graves_param.momentum, o.solver.rmsprop_graves_param.eps) init_lr = o.solver.rmsprop_graves_param.lr elif o.solver.type == 'Sgd' or o.solver.type == 'SGD': optimizer.solver = S.Sgd(o.solver.sgd_param.lr) init_lr = o.solver.sgd_param.lr elif o.solver.type == 'SgdW': optimizer.solver = S.SgdW(o.solver.sgdw_param.lr, o.solver.sgdw_param.momentum, o.solver.sgdw_param.wd) init_lr = o.solver.sgdw_param.lr else: raise ValueError('Solver "' + o.solver.type + '" is not supported.') parameters = {v.name: v.variable_instance for v, local_lr in optimizer.parameter_learning_rate_multipliers.items() if local_lr > 0.0} optimizer.solver.set_parameters(parameters) optimizer.parameters = OrderedDict( sorted(parameters.items(), key=lambda x: x[0])) optimizer.weight_decay = o.solver.weight_decay # keep following 2 lines for backward compatibility optimizer.lr_decay = o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0 optimizer.lr_decay_interval = o.solver.lr_decay_interval if o.solver.lr_decay_interval > 0 else 1 optimizer.solver.set_states_from_protobuf(o) optimizer.comm = current_communicator() comm_size = optimizer.comm.size if optimizer.comm else 1 optimizer.scheduler = ExponentialScheduler(init_lr, 1.0, 1) if o.solver.lr_scheduler_type == 'Polynomial': if o.solver.polynomial_scheduler_param.power != 0.0: optimizer.scheduler = PolynomialScheduler( init_lr, o.solver.polynomial_scheduler_param.max_iter // comm_size, o.solver.polynomial_scheduler_param.power) elif o.solver.lr_scheduler_type == 'Cosine': optimizer.scheduler = CosineScheduler( init_lr, o.solver.cosine_scheduler_param.max_iter // comm_size) elif o.solver.lr_scheduler_type == 'Exponential': if o.solver.exponential_scheduler_param.gamma != 1.0: optimizer.scheduler = ExponentialScheduler( init_lr, o.solver.exponential_scheduler_param.gamma, o.solver.exponential_scheduler_param.iter_interval // comm_size if 
o.solver.exponential_scheduler_param.iter_interval > comm_size else 1) elif o.solver.lr_scheduler_type == 'Step': if o.solver.step_scheduler_param.gamma != 1.0 and len(o.solver.step_scheduler_param.iter_steps) > 0: optimizer.scheduler = StepScheduler( init_lr, o.solver.step_scheduler_param.gamma, [step // comm_size for step in o.solver.step_scheduler_param.iter_steps]) elif o.solver.lr_scheduler_type == 'Custom': # ToDo raise NotImplementedError() elif o.solver.lr_scheduler_type == '': if o.solver.lr_decay_interval != 0 or o.solver.lr_decay != 0.0: optimizer.scheduler = ExponentialScheduler( init_lr, o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0, o.solver.lr_decay_interval // comm_size if o.solver.lr_decay_interval > comm_size else 1) else: raise ValueError('Learning Rate Scheduler "' + o.solver.lr_scheduler_type + '" is not supported.') if o.solver.lr_warmup_scheduler_type == 'Linear': if o.solver.linear_warmup_scheduler_param.warmup_iter >= comm_size: optimizer.scheduler = LinearWarmupScheduler( optimizer.scheduler, o.solver.linear_warmup_scheduler_param.warmup_iter // comm_size) for v in optimizer.loss_variables: v.variable_instance.grad.fill(1.0 / v.variable_instance.size) if len(optimizer.loss_variables) == 1: optimizer.target = optimizer.loss_variables[0].variable_instance else: optimizer.target = F.sink( *[v.variable_instance for v in optimizer.loss_variables], one_input_grad=False) return optimizer
def GRU(self, network, func): def register_parameters(h, w, b, with_bias): hidden_size = h.shape[1] w0, w1, w2 = (np.squeeze(wd, 0) for wd in np.split(w, w.shape[0], axis=0)) w0_nn = nn.Variable.from_numpy_array(np.transpose(w0, (1, 0))) w1_nn = nn.Variable.from_numpy_array(np.transpose(w1, (1, 0))) w2_0 = w2[:, :w2.shape[1] - hidden_size] w2_1 = w2[:, w2.shape[1] - hidden_size:] w2_0_nn = nn.Variable.from_numpy_array(np.transpose(w2_0, (1, 0))) w2_1_nn = nn.Variable.from_numpy_array(np.transpose(w2_1, (1, 0))) w_dict = { self._get_unique_name("@{}/gru/w0_nn".format(func_model_name)): w0_nn, self._get_unique_name("@{}/gru/w1_nn".format(func_model_name)): w1_nn, self._get_unique_name("@{}/gru/w2_0_nn".format(func_model_name)): w2_0_nn, self._get_unique_name("@{}/gru/w2_1_nn".format(func_model_name)): w2_1_nn } params.update(w_dict) names.update(w_dict) b0 = b1 = b2 = b3 = None if with_bias: b_dict = { self._get_unique_name("@{}/gru/b{}_nn".format( func_model_name, i)): nn.Variable.from_numpy_array(np.squeeze(b_item, 0)) for i, b_item in enumerate(np.split(b, b.shape[0], axis=0)) } b0, b1, b2, b3 = b_dict.values() names.update(b_dict) params.update(b_dict) parameters_dict = { 'w0_nn': w0_nn, 'w1_nn': w1_nn, 'w2_0_nn': w2_0_nn, 'w2_1_nn': w2_1_nn, 'b0': b0, 'b1': b1, 'b2': b2, 'b3': b3, } return parameters_dict def gru(x, h, parameters_dict): xh = F.concatenate(*(x, h), axis=1) w0_nn = parameters_dict.get('w0_nn', None) w1_nn = parameters_dict.get('w1_nn', None) w2_0_nn = parameters_dict.get('w2_0_nn', None) w2_1_nn = parameters_dict.get('w2_1_nn', None) b0 = parameters_dict.get('b0', None) b1 = parameters_dict.get('b1', None) b2 = parameters_dict.get('b2', None) b3 = parameters_dict.get('b3', None) r_t = F.sigmoid(F.affine(xh, w0_nn, b0)) z_t = F.sigmoid(F.affine(xh, w1_nn, b1)) n_t = F.tanh( F.affine(x, w2_0_nn, b2) + r_t * F.affine(h, w2_1_nn, b3)) h_t = (1 - z_t) * n_t + z_t * h return h_t def create_fixed_length_gru(xs0, h0, w0, w, b, num_layers, num_directions, with_bias): # xs : [T, B, I] # h0 : [L, D, B, H] # c0 : [L, D, B, H] # w0 : [D, 3, H, I+H] # w : [L-1, D, 3, H, D * H + H] # b : [L, D, 3, H] batch_size = xs0.shape[1] hidden_size = h0.shape[3] if xs0.shape[0] == 1: xs = [xs0[0]] else: xs = F.split(xs0, axis=0) hn = [] for i in range(num_layers): wi = w0 if i > 0: wi = w[i - 1] # wi : [D, 3, H, ?] 
# Forward direction hif = h0[i, 0] # [B, H] wif = wi[0] bif = None if with_bias: bif = b[i, 0] p_dict = register_parameters(hif, wif, bif, with_bias) hs = [] for j, x in enumerate(xs): # x : [B, I] hif = gru(x, hif, p_dict) hs.append(hif) hn.append(hif) if num_directions == 1: xs = hs continue # Backward direction hib = h0[i, 1] # [B, H] wib = wi[1] bib = None if with_bias: bib = b[i, 1] p_dict = register_parameters(hib, wib, bib, with_bias) for k, x, in enumerate(reversed(xs)): j = len(xs) - 1 - k # x : [B, I] hib = gru(x, hib, p_dict) hs[j] = F.concatenate(hs[j], hib, axis=1) hn.append(hib) xs = hs ys = xs # list of [B, HD] ys = F.stack(*ys, axis=0) # [T, B, HD] hn = F.reshape(F.stack(*hn, axis=0), (num_layers, num_directions, batch_size, hidden_size)) # LD list of [B, H] --> [L, D, B, H] return ys, hn num_layers = func.gru_param.num_layers drop_out = func.gru_param.dropout # no use bidirectional = func.gru_param.bidirectional training = func.gru_param.training # no use num_directions = 2 if bidirectional else 1 xs_nn = nn.Variable(self._variables[func.input[0]].shape.dim[:]) h0_nn = nn.Variable(self._variables[func.input[1]].shape.dim[:]) w0_np = self._get_parameter(func.input[2]) w_np = None b_np = None with_bias = False if num_layers > 1: w_np = self._get_parameter(func.input[3]) if len(func.input) == 5: b_np = self._get_parameter(func.input[4]) with_bias = True else: if len(func.input) == 4: b_np = self._get_parameter(func.input[3]) with_bias = True nn.graph_def.reset_default_graph() names = {func.input[0]: xs_nn, func.input[1]: h0_nn} params = {} func_model_name = self._get_unique_name("gru") ys, hn = create_fixed_length_gru(xs_nn, h0_nn, w0_np, w_np, b_np, num_layers, num_directions, with_bias) # returns Variables names.update({func.output[0]: ys, func.output[1]: hn}) output = F.sink(ys, hn) pg = ProtoGenerator(func_model_name, names) output.visit(pg) for _, proto_v in pg.variables.items(): self._variables[proto_v.name] = proto_v for pv_name, pv in params.items(): if pv_name in self._variables: self._variables[pv_name].type = "Parameter" parameter = self._nnp.protobuf.parameter.add() parameter.variable_name = pv_name parameter.shape.dim.extend(pv.shape) parameter.data.extend(np.array(pv.d).flatten().tolist()) parameter.need_grad = pv.need_grad self._parameters[pv_name] = parameter for proto_f in pg.functions: self._default_resolver(network, proto_f)
def _build(self): # infer variable self.infer_obs_t = infer_obs_t = nn.Variable((1, 4, 84, 84)) # inference output self.infer_q_t,\ self.infer_probs_t, _ = self.q_function(infer_obs_t, self.num_actions, self.min_v, self.max_v, self.num_bins, 'q_func') self.infer_t = F.sink(self.infer_q_t, self.infer_probs_t) # train variables self.obss_t = nn.Variable((self.batch_size, 4, 84, 84)) self.acts_t = nn.Variable((self.batch_size, 1)) self.rews_tp1 = nn.Variable((self.batch_size, 1)) self.obss_tp1 = nn.Variable((self.batch_size, 4, 84, 84)) self.ters_tp1 = nn.Variable((self.batch_size, 1)) # training output q_t, probs_t, dists = self.q_function(self.obss_t, self.num_actions, self.min_v, self.max_v, self.num_bins, 'q_func') q_tp1, probs_tp1, _ = self.q_function(self.obss_tp1, self.num_actions, self.min_v, self.max_v, self.num_bins, 'target_q_func') expand_last = lambda x: F.reshape(x, x.shape + (1, )) flat = lambda x: F.reshape(x, (-1, 1)) # extract selected dimension a_t_one_hot = expand_last(F.one_hot(self.acts_t, (self.num_actions, ))) probs_t_selected = F.max(probs_t * a_t_one_hot, axis=1) # extract max dimension _, indices = F.max(q_tp1, axis=1, keepdims=True, with_index=True) a_tp1_one_hot = expand_last(F.one_hot(indices, (self.num_actions, ))) probs_tp1_best = F.max(probs_tp1 * a_tp1_one_hot, axis=1) # clipping reward clipped_rews_tp1 = clip_by_value(self.rews_tp1, -1.0, 1.0) disc_q_tp1 = F.reshape(dists, (1, -1)) * (1.0 - self.ters_tp1) t_z = clip_by_value(clipped_rews_tp1 + self.gamma * disc_q_tp1, self.min_v, self.max_v) # update indices b = (t_z - self.min_v) / ((self.max_v - self.min_v) / (self.num_bins - 1)) l = F.floor(b) l_mask = F.reshape(F.one_hot(flat(l), (self.num_bins, )), (-1, self.num_bins, self.num_bins)) u = F.ceil(b) u_mask = F.reshape(F.one_hot(flat(u), (self.num_bins, )), (-1, self.num_bins, self.num_bins)) m_l = expand_last(probs_tp1_best * (1 - (b - l))) m_u = expand_last(probs_tp1_best * (b - l)) m = F.sum(m_l * l_mask + m_u * u_mask, axis=1) m.need_grad = False self.loss = -F.mean(F.sum(m * F.log(probs_t_selected + 1e-10), axis=1)) # optimizer self.solver = S.RMSprop(self.lr, 0.95, 1e-2) # weights and biases with nn.parameter_scope('q_func'): self.params = nn.get_parameters() with nn.parameter_scope('target_q_func'): self.target_params = nn.get_parameters() # set q function parameters to solver self.solver.set_parameters(self.params)
def _get_network_sink(outputs):
    import nnabla.functions as F
    outputs = [o for o in outputs.values()]
    return F.sink(*outputs)
def inner_train_test(inputa, inputb, labela, labelb, data_generator, meta_training, args): lossesa, lossesb, accuraciesa, accuraciesb = [], [], [], [] if meta_training: num_updates = args.num_updates update_lr = args.train_update_lr else: num_updates = args.test_num_updates update_lr = args.update_lr # Training for inp in data_generator.next(): inputa.d, inputb.d, labela.d, labelb.d = inp # Initialize network with nn.parameter_scope('meta'): resulta = net(inputa, labela, True, args) resultb = net(inputb, labelb, True, args) fast_weights = nn.get_parameters() # For saving training accuracies resulta[0].persistent = True resulta[1].persistent = True task_lossa_var = [ resulta[0], ] task_accuracya_var = [ resulta[1], ] # Inner loop for j in range(num_updates): grad_list = nn.grad(resulta[0], fast_weights.values()) for ind, key in enumerate(fast_weights.keys()): if grad_list[ind] is None: continue if args.first_order or not meta_training: grad_list[ind].need_grad = False fast_weights[key] = fast_weights[key] - \ update_lr * grad_list[ind] resulta = net(inputa, labela, True, args, fast_weights) resulta[0].persistent = True resulta[1].persistent = True task_lossa_var.append(resulta[0]) task_accuracya_var.append(resulta[1]) # Loss on queries is calculated only at the end of the inner loop # Following the original implementation, # we always use batch stats for batch normalization even in a test phase resultb = net(inputb, labelb, True, args, fast_weights) # Forward calculation result_all = F.sink(resulta[0], resulta[1], resultb[0], resultb[1]) result_all.forward() if meta_training: # Backward calculation lossb = resultb[0] / data_generator.batch_size lossb.backward( ) # gradients on weights are automatically accumlated task_lossa = [] task_accuracya = [] for j in range(num_updates + 1): task_accuracya_var[j].forward() task_lossa.append(task_lossa_var[j].d) task_accuracya.append(task_accuracya_var[j].d) lossesa.append(task_lossa) lossesb.append(resultb[0].d) accuraciesa.append(task_accuracya) accuraciesb.append(resultb[1].d) return lossesa, lossesb, accuraciesa, accuraciesb
def sample_loop(self, model, shape, sampler, noise=None, dump_interval=-1,
                progress=False, without_auto_forward=False):
    """
    Iteratively sample data from the model from t=T to t=0.
    T is specified as the length of betas given to __init__().

    Args:
        model (callable): A callable that takes x_t and t and predicts noise
            (and sigma-related parameters).
        shape (list-like object): A data shape.
        sampler (callable): A function to sample x_{t-1} given x_{t} and t.
            Typically, self.p_sample or self.ddim_sample.
        noise (callable): A noise generator. If None, F.randn(shape) will be used.
        dump_interval (int): If > 0, all intermediate results at every
            `dump_interval` step will be returned as a list.
            e.g. if dump_interval = 10, the predicted results at
            {10, 20, 30, ...} will be returned.
        progress (bool): If True, tqdm will be used to show the sampling progress.

    Returns:
        - x_0 (nn.Variable): the final sampled result of x_0
        - samples (a list of nn.Variable): the sampled results at every `dump_interval`
        - pred_x_starts (a list of nn.Variable): the predicted x_0 from each x_t
          at every `dump_interval`
    """
    T = self.num_timesteps
    indices = list(range(T))[::-1]

    samples = []
    pred_x_starts = []

    if progress:
        from tqdm.auto import tqdm
        indices = tqdm(indices)

    if without_auto_forward:
        if noise is None:
            noise = np.random.randn(*shape)
        else:
            assert isinstance(noise, np.ndarray)
            assert noise.shape == shape
        x_t = nn.Variable.from_numpy_array(noise)
        t = nn.Variable.from_numpy_array([T - 1 for _ in range(shape[0])])

        # build graph
        y, pred_x_start = sampler(model, x_t, t)
        up_x_t = F.assign(x_t, y)
        up_t = F.assign(t, t - 1)
        update = F.sink(up_x_t, up_t)

        cnt = 0
        for step in indices:
            y.forward(clear_buffer=True)
            update.forward(clear_buffer=True)
            cnt += 1
            if dump_interval > 0 and cnt % dump_interval == 0:
                samples.append((step, y.d.copy()))
                pred_x_starts.append((step, pred_x_start.d.copy()))
    else:
        with nn.auto_forward():
            if noise is None:
                x_t = F.randn(shape=shape)
            else:
                assert isinstance(noise, np.ndarray)
                assert noise.shape == shape
                x_t = nn.Variable.from_numpy_array(noise)
            cnt = 0
            for step in indices:
                t = F.constant(step, shape=(shape[0], ))
                x_t, pred_x_start = sampler(model, x_t, t,
                                            no_noise=step == 0)
                cnt += 1
                if dump_interval > 0 and cnt % dump_interval == 0:
                    samples.append((step, x_t.d.copy()))
                    pred_x_starts.append((step, pred_x_start.d.copy()))

    assert x_t.shape == shape
    return x_t.d.copy(), samples, pred_x_starts
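
# Toy sketch of the static-update idiom from the without_auto_forward branch of
# sample_loop above: state variables are rewritten in place by F.assign nodes,
# and a sink lets one forward() call advance the whole state. The counter below
# is an illustrative assumption, not the diffusion sampler itself.
import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable.from_numpy_array(np.zeros((1,), dtype=np.float32))
t = nn.Variable.from_numpy_array(np.array([10.0], dtype=np.float32))
step = F.sink(F.assign(x, x + 1.0), F.assign(t, t - 1.0))
for _ in range(10):
    step.forward(clear_buffer=True)
# after the loop, x.d has been incremented to 10 and t.d counted down to 0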
def train(): parser = argparse.ArgumentParser() parser.add_argument("--train-file", type=str) parser.add_argument("--valid-file", type=str) parser.add_argument("--num-training-examples", type=int, default=250) parser.add_argument("--accum-grad", type=int, default=1) parser.add_argument("--valid-interval", type=int, default=200) parser.add_argument("--threshold", type=float, default=0.95) parser.add_argument("--context", type=str, default="cpu") parser.add_argument("--device-id", type=int, default=0) args = parser.parse_args() from nnabla.ext_utils import get_extension_context extension_module = args.context ctx = get_extension_context(extension_module, device_id=args.device_id) nn.set_default_context(ctx) # prepare data iterators tdata = data_iterator( BAbI19DataSource(args.train_file, args.num_training_examples, shuffle=True), 1, False, False, False) vdata = data_iterator( BAbI19DataSource(args.valid_file, 1000, shuffle=True), 1, False, False, False) # prepare monitors monitor = M.Monitor("./bAbI19") tloss = M.MonitorSeries("Training Loss", monitor, interval=10) terror = M.MonitorSeries("Training Error", monitor, interval=10) verror = M.MonitorSeries("Validation Error", monitor, interval=1) # prepare solver solver = S.Adam() solver_initialized = False cnt = 0 while True: l = 0.0 e = 0.0 solver.zero_grad() for _ in range(args.accum_grad): # read next data x = tdata.next() V = x[1][0][0] E = x[2][0][0] ans = x[3][0][0] # construct GGNN ## convert to nn.Variable x = nn.Variable(V.shape) x.data.data = V h = nn.Variable((len(V), 6)) h.data.data = utils.h_0(V, 6) outputs = predict(V, E, len(ans)) losses = [] errors = [] for a, output in zip(ans, outputs): label = nn.Variable((1, 1)) label.data.data[0, 0] = a losses.append(F.mean(F.softmax_cross_entropy(output, label))) output2 = output.unlinked() errors.append(F.mean(F.top_n_error(output2, label))) # initialize solver if not solver_initialized: solver.set_parameters(nn.get_parameters()) solver_initialized = True solver.zero_grad() # calculate loss/error loss = F.mean(F.stack(*losses)) error = F.mean(F.stack(*errors)) F.sink(loss, error).forward(clear_no_need_grad=True) loss.backward(clear_buffer=True) l += loss.data.data e += error.data.data # dump log tloss.add(cnt, l / args.accum_grad) terror.add(cnt, e / args.accum_grad) l = 0.0 e = 0.0 solver.update() cnt += 1 if cnt % args.valid_interval == 0: # validation validation_error = 0 correct_example = None wrong_example = None for _ in range(vdata.size): x = vdata.next() id2str = x[0][0][0] V = x[1][0][0] E = x[2][0][0] ans = x[3][0][0] # construct GGNN ## convert to nn.Variable x = nn.Variable(V.shape) x.data.data = V h = nn.Variable((len(V), 6)) h.data.data = utils.h_0(V, 6) outputs = predict(V, E, len(ans)) errors = [] actual = [] for a, output in zip(ans, outputs): label = nn.Variable((1, 1)) label.data.data[0, 0] = a errors.append(F.mean(F.top_n_error(output, label))) actual.append(output.data.data) error = F.mean(F.stack(*errors)) error.forward(clear_no_need_grad=True) x = 0.0 if error.data.data == 0: x = 0 else: x = 1 if x > 0.5: if wrong_example is None: wrong_example = (id2str, V, E, ans, actual) else: if correct_example is None: correct_example = (id2str, V, E, ans, actual) validation_error += x validation_error /= vdata.size verror.add(cnt, validation_error) accuracy = 1 - validation_error if accuracy >= args.threshold: def show(example): if "s" in example[2]: for i, j in example[2]["s"]: print("The {} is south the {}.".format( example[0][i], example[0][j])) if "n" in example[2]: for 
i, j in example[2]["n"]: print("The {} is north the {}.".format( example[0][i], example[0][j])) if "w" in example[2]: for i, j in example[2]["w"]: print("The {} is west the {}.".format( example[0][i], example[0][j])) if "e" in example[2]: for i, j in example[2]["e"]: print("The {} is east the {}.".format( example[0][i], example[0][j])) i = np.argmax(example[1][:, 0]) j = np.argmax(example[1][:, 1]) print("What is the path from {} to {}?".format( example[0][i], example[0][j])) for (expected, actual) in zip(example[3], example[4]): i = np.argmax(actual[0]) print("Expected: {}, Actual: {}".format( id2classes[expected], id2classes[i])) if correct_example is not None: show(correct_example) if wrong_example is not None: show(wrong_example) break
def _executors(info): renamed = info.renamed_variables proto, networks = info.proto, info.networks class Executor: pass executors = OrderedDict() for e in proto.executor: executor = Executor() executor.network = networks[e.network_name] executor.num_evaluations = e.num_evaluations if e.num_evaluations > 0 else 1 executor.repeat_evaluation_type = e.repeat_evaluation_type executor.need_back_propagation = e.need_back_propagation executor.no_image_normalization = e.no_image_normalization executor.dataset_assign = OrderedDict() for d in e.data_variable: executor.dataset_assign[executor.network.variables[ renamed.get(d.variable_name, d.variable_name)]] = d.data_name executor.generator_assign = OrderedDict() for g in e.generator_variable: executor.generator_assign[executor.network.variables[ renamed.get(g.variable_name, g.variable_name)]] = _get_generator(g) executor.output_assign = OrderedDict() for o in e.output_variable: executor.output_assign[executor.network.variables[ renamed.get(o.variable_name, o.variable_name)]] = [o.type, o.data_name] executor.parameters = OrderedDict() for p in e.parameter_variable: param_variable_names = _get_matching_variable_names( p.variable_name, list(itertools.chain(executor.network.parameters.keys(), executor.network.variables.keys()))) for v_name in param_variable_names: if v_name in executor.network.parameters: executor.parameters[ executor.network.parameters[v_name]] = v_name if v_name in executor.network.variables: executor.parameters[ executor.network.variables[v_name]] = v_name executor.forward_target = F.sink(*[v.variable_instance for v in executor.output_assign.keys()]) if executor.need_back_propagation: executor.loss_variables = [] for l in e.loss_variable: executor.loss_variables.append(executor.network.variables[ l.variable_name]) executor.parameter_learning_rate_multipliers = OrderedDict() for p in e.parameter_variable: param_variable_names = _get_matching_variable_names( p.variable_name, list(itertools.chain(executor.network.parameters.keys(), executor.network.variables.keys()))) for v_name in param_variable_names: if v_name in executor.network.parameters: executor.parameter_learning_rate_multipliers[ executor.network.parameters[v_name]] = p.learning_rate_multiplier elif v_name in executor.network.variables: executor.parameter_learning_rate_multipliers[ executor.network.variables[v_name]] = p.learning_rate_multiplier executor.backward_target = F.sink( *[v.variable_instance for v in executor.loss_variables]) executors[e.name] = executor return executors
def _build(self):
    # inference graph
    self.infer_obs_t = nn.Variable((1, ) + self.obs_shape)
    with nn.parameter_scope('trainable'):
        infer_dist = policy_network(self.infer_obs_t, self.action_size,
                                    'actor')
    self.infer_act_t, _ = _squash_action(infer_dist)
    self.deterministic_act_t = infer_dist.mean()

    # training graph
    self.obss_t = nn.Variable((self.batch_size, ) + self.obs_shape)
    self.acts_t = nn.Variable((self.batch_size, self.action_size))
    self.rews_tp1 = nn.Variable((self.batch_size, 1))
    self.obss_tp1 = nn.Variable((self.batch_size, ) + self.obs_shape)
    self.ters_tp1 = nn.Variable((self.batch_size, 1))

    with nn.parameter_scope('trainable'):
        dist = policy_network(self.obss_t, self.action_size, 'actor')
        squashed_act_t, log_prob_t = _squash_action(dist)
        v_t = v_network(self.obss_t, 'value')
        q_t1 = q_network(self.obss_t, self.acts_t, 'critic/1')
        q_t2 = q_network(self.obss_t, self.acts_t, 'critic/2')
        q_t1_with_actor = q_network(self.obss_t, squashed_act_t, 'critic/1')
        q_t2_with_actor = q_network(self.obss_t, squashed_act_t, 'critic/2')

    with nn.parameter_scope('target'):
        v_tp1 = v_network(self.obss_tp1, 'value')

    # value loss
    q_t = F.minimum2(q_t1_with_actor, q_t2_with_actor)
    v_target = q_t - log_prob_t
    v_target.need_grad = False
    self.value_loss = 0.5 * F.mean(F.squared_error(v_t, v_target))

    # q function loss
    scaled_rews_tp1 = self.rews_tp1 * self.reward_scale
    q_target = scaled_rews_tp1 + self.gamma * v_tp1 * (1.0 - self.ters_tp1)
    q_target.need_grad = False
    q1_loss = 0.5 * F.mean(F.squared_error(q_t1, q_target))
    q2_loss = 0.5 * F.mean(F.squared_error(q_t2, q_target))
    self.critic_loss = q1_loss + q2_loss

    # policy function loss
    mean_loss = 0.5 * F.mean(dist.mean()**2)
    logstd_loss = 0.5 * F.mean(F.log(dist.stddev())**2)
    policy_reg_loss = self.policy_reg * (mean_loss + logstd_loss)
    self.objective_loss = F.mean(log_prob_t - q_t)
    self.actor_loss = self.objective_loss + policy_reg_loss

    # trainable parameters
    with nn.parameter_scope('trainable'):
        with nn.parameter_scope('value'):
            value_params = nn.get_parameters()
        with nn.parameter_scope('critic'):
            critic_params = nn.get_parameters()
        with nn.parameter_scope('actor'):
            actor_params = nn.get_parameters()
    # target parameters
    with nn.parameter_scope('target/value'):
        target_params = nn.get_parameters()

    # target update
    update_targets = []
    sync_targets = []
    for key, src in value_params.items():
        dst = target_params[key]
        updated_dst = (1.0 - self.tau) * dst + self.tau * src
        update_targets.append(F.assign(dst, updated_dst))
        sync_targets.append(F.assign(dst, src))
    self.update_target_expr = F.sink(*update_targets)
    self.sync_target_expr = F.sink(*sync_targets)

    # setup solvers
    self.value_solver = S.Adam(self.value_lr)
    self.value_solver.set_parameters(value_params)
    self.critic_solver = S.Adam(self.critic_lr)
    self.critic_solver.set_parameters(critic_params)
    self.actor_solver = S.Adam(self.actor_lr)
    self.actor_solver.set_parameters(actor_params)
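The target-network update above chains one F.assign per parameter and wraps the list in F.sink, so a soft (Polyak) update or a hard sync is a single forward() call on the corresponding expression. A sketch of how these expressions are typically driven from a training loop; the agent argument and the helper names are assumptions for illustration, only update_target_expr and sync_target_expr come from the code above.

def soft_update_targets(agent):
    # Executes every F.assign in the sink once:
    # dst <- (1 - tau) * dst + tau * src for each target parameter.
    agent.update_target_expr.forward()

def hard_sync_targets(agent):
    # Copies the trainable parameters into the targets, typically once at startup.
    agent.sync_target_expr.forward()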
def backward_function_tester(rng, func, ref_func, inputs,
                             func_args=[], func_kwargs={},
                             atol_f=1e-6, atol_b=1e-3, atol_accum=1e-3,
                             dstep=1e-3, backward=None,
                             ctx=None, func_name=None, ref_grad=None,
                             disable_half_test=False, atol_half=1e-1):
    """Backward function tester.

    In the forward test, it compares the results of nn.grad and `func`.backward.
    In the backward test, it compares the analytical gradients with the
    numerical gradients computed with `grad_outputs`.
    """
    # TODO: half
    from scipy.optimize import approx_fprime

    if ctx is None:
        ctx = nn.Context()
    if backward is None:
        backward = [True if i is not None else False for i in inputs]

    # TODO: Remove set_default_context after adding ctx to BackwardFunction.
    nn.set_default_context(ctx)

    # Create Variables
    def create_variables(inputs, backward):
        vinputs = []
        for i, b in zip(inputs, backward):
            if i is None:
                vinputs += [None]
                continue
            vinputs += [nn.Variable(i.shape, need_grad=b)]
            vinputs[-1].data.cast(i.dtype)[...] = i
        return vinputs

    # Create grad_outputs
    def create_grad_outputs(outputs):
        grad_outputs = []
        for o in outputs:
            if o.shape == ():
                go = nn.NdArray.from_numpy_array(np.array(randn(rng)))
                #go = nn.NdArray.from_numpy_array(np.array(1.0))
            else:
                go = nn.NdArray.from_numpy_array(randn(rng, *o.shape))
                #go = nn.NdArray.from_numpy_array(np.ones(o.shape))
            grad_outputs.append(go)
        return grad_outputs

    # Fill grads
    def fill_grads(vinputs, grads):
        for vi, gd in zip(vinputs, grads):
            if vi is None:
                continue
            vi.g = gd

    # Zero grads
    def zero_grads(vinputs):
        for vi in vinputs:
            if vi is None:
                continue
            vi.grad.zero()
        return

    # Gradient penalty on grads
    def gradient_penalty2(grads):
        gp2 = 0.0
        for g in grads:
            gp2 += F.sum(g**2.0)
        return gp2

    # Product sum
    def prod_sum(inputs0, inputs1):
        out = 0.0
        for inp0, inp1 in zip(inputs0, inputs1):
            out += inp0 * nn.Variable(inp1.shape).apply(data=inp1)
        return out

    # Set inputs for the numerical gradients
    def set_inputs(inputs0, vinputs):
        begin = 0
        for i in vinputs:
            end = begin + i.size
            if i.need_grad == True:
                i.d = inputs0[begin:end].reshape(i.shape)
            begin = end

    # Gradient penalty on grads used for computing numerical gradients
    def obj_func(inputs0, gp2, vinputs):
        set_inputs(inputs0, vinputs)
        gp2.forward()
        return gp2.d.copy()

    # # Half test
    # if not disable_half_test:
    #     finputs = create_variables(inputs, backward)
    #     hinputs = create_variables(inputs, backward)
    #     half_test(rng, func, finputs, hinputs, func_args,
    #               func_kwargs, backward, ctx, func_name, atol=atol_half)

    # Create input variables
    vinputs = create_variables(inputs, backward)

    # --- Forward test --- #
    # Zero grads
    zero_grads(vinputs)

    # Forward/Backward on the forward graph
    voutputs = [F.sigmoid(x) for x in
                force_list(func(*(vinputs + func_args), **func_kwargs))]
    agrad_outputs = create_grad_outputs(voutputs)
    o = prod_sum(voutputs, agrad_outputs)
    o.forward()
    o.backward()  # clear_buffer=True)

    # Grads
    voutputs = voutputs
    vinputs = list(filter(lambda vi: vi is not None, vinputs))
    agrad_outputs = agrad_outputs
    grads = nn.grad(voutputs, vinputs, agrad_outputs)
    grads = list(filter(lambda x: x is not None, grads))
    o = F.sink(*grads)
    o.forward()

    # Check forward
    for vi, go in zip(vinputs, grads):
        if vi.need_grad is False:
            continue
        fgrads = vi.g
        bgrads = go.d
        assert_allclose(fgrads, bgrads, atol=atol_f)

    # TODO: 1. Pass function arguments directly to backward functions.
    # TODO: 2. Should be changed to the simpler form by simply testing BackwardFunction.

    # --- Backward (accum = False) test --- #
    # Zero grads
    zero_grads(vinputs)
    # Compute analytical grads
    gp2 = gradient_penalty2(grads)
    gp2.forward()
    gp2.backward(clear_buffer=True)
    analytical_grads = np.concatenate(
        [vi.g.copy().flatten() for vi in vinputs])
    analytical_grads0 = analytical_grads
    # Compute numerical grads
    inputs0 = np.concatenate(
        [inp.flatten() for inp in inputs if inp is not None])
    numerical_grads = approx_fprime(inputs0, obj_func, dstep, gp2, vinputs)
    # Check backward
    assert_allclose(analytical_grads, numerical_grads, atol=atol_b)

    # --- Backward (accum = True) test --- #
    # Random grads
    rand_grads = [randn(rng, *vi.shape) for vi in vinputs]
    fill_grads(vinputs, rand_grads)
    # Compute analytical grads
    gp2.forward()
    gp2.backward(clear_buffer=True)
    analytical_grads = np.concatenate(
        [vi.g.copy().flatten() for vi in vinputs])
    rand_grads = np.concatenate([rg.flatten() if isinstance(rg, np.ndarray)
                                 else np.array(rg).reshape((1, ))
                                 for rg in rand_grads])
    analytical_grads -= rand_grads
    # Check backward
    assert_allclose(analytical_grads, analytical_grads0, atol=atol_accum)
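A hypothetical pytest-style invocation of the tester above; the tested function (F.tanh), the seed, and the tolerances are illustrative and assume backward_function_tester is importable from the surrounding test module.

import pytest
import numpy as np
import nnabla.functions as F

@pytest.mark.parametrize("seed", [313])
def test_tanh_double_backward(seed):
    rng = np.random.RandomState(seed)
    inputs = [rng.randn(2, 3).astype(np.float32)]
    # ref_func is not referenced by the code path shown above, so None is passed.
    backward_function_tester(rng, F.tanh, None, inputs,
                             func_args=[], func_kwargs={},
                             atol_b=1e-2, dstep=1e-3)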
def _build(self):
    # inference graph
    self.infer_obs_t = nn.Variable((1, ) + self.obs_shape)
    with nn.parameter_scope('trainable'):
        infer_dist = policy_network(self.infer_obs_t, self.action_size,
                                    'actor')
    self.infer_act_t, _ = _squash_action(infer_dist)
    self.deterministic_act_t = infer_dist.mean()

    # training graph
    self.obss_t = nn.Variable((self.batch_size, ) + self.obs_shape)
    self.acts_t = nn.Variable((self.batch_size, self.action_size))
    self.rews_tp1 = nn.Variable((self.batch_size, 1))
    self.obss_tp1 = nn.Variable((self.batch_size, ) + self.obs_shape)
    self.ters_tp1 = nn.Variable((self.batch_size, 1))

    with nn.parameter_scope('trainable'):
        self.log_temp = get_parameter_or_create('temp', [1, 1],
                                                ConstantInitializer(0.0))
        dist_t = policy_network(self.obss_t, self.action_size, 'actor')
        dist_tp1 = policy_network(self.obss_tp1, self.action_size, 'actor')
        squashed_act_t, log_prob_t = _squash_action(dist_t)
        squashed_act_tp1, log_prob_tp1 = _squash_action(dist_tp1)
        q1_t = q_network(self.obss_t, self.acts_t, 'critic/1')
        q2_t = q_network(self.obss_t, self.acts_t, 'critic/2')
        q1_t_with_actor = q_network(self.obss_t, squashed_act_t, 'critic/1')
        q2_t_with_actor = q_network(self.obss_t, squashed_act_t, 'critic/2')

    with nn.parameter_scope('target'):
        q1_tp1 = q_network(self.obss_tp1, squashed_act_tp1, 'critic/1')
        q2_tp1 = q_network(self.obss_tp1, squashed_act_tp1, 'critic/2')

    # q function loss
    q_tp1 = F.minimum2(q1_tp1, q2_tp1)
    entropy_tp1 = F.exp(self.log_temp) * log_prob_tp1
    mask = (1.0 - self.ters_tp1)
    q_target = self.rews_tp1 + self.gamma * (q_tp1 - entropy_tp1) * mask
    q_target.need_grad = False
    q1_loss = 0.5 * F.mean(F.squared_error(q1_t, q_target))
    q2_loss = 0.5 * F.mean(F.squared_error(q2_t, q_target))
    self.critic_loss = q1_loss + q2_loss

    # policy function loss
    q_t = F.minimum2(q1_t_with_actor, q2_t_with_actor)
    entropy_t = F.exp(self.log_temp) * log_prob_t
    self.actor_loss = F.mean(entropy_t - q_t)

    # temperature loss
    temp_target = log_prob_t - self.action_size
    temp_target.need_grad = False
    self.temp_loss = -F.mean(F.exp(self.log_temp) * temp_target)

    # trainable parameters
    with nn.parameter_scope('trainable'):
        with nn.parameter_scope('critic'):
            critic_params = nn.get_parameters()
        with nn.parameter_scope('actor'):
            actor_params = nn.get_parameters()
    # target parameters
    with nn.parameter_scope('target/critic'):
        target_params = nn.get_parameters()

    # target update
    update_targets = []
    sync_targets = []
    for key, src in critic_params.items():
        dst = target_params[key]
        updated_dst = (1.0 - self.tau) * dst + self.tau * src
        update_targets.append(F.assign(dst, updated_dst))
        sync_targets.append(F.assign(dst, src))
    self.update_target_expr = F.sink(*update_targets)
    self.sync_target_expr = F.sink(*sync_targets)

    # setup solvers
    self.critic_solver = S.Adam(self.critic_lr)
    self.critic_solver.set_parameters(critic_params)
    self.actor_solver = S.Adam(self.actor_lr)
    self.actor_solver.set_parameters(actor_params)
    self.temp_solver = S.Adam(self.temp_lr)
    self.temp_solver.set_parameters({'temp': self.log_temp})
def backward_function_tester(rng, func, inputs=None,
                             func_args=[], func_kwargs={},
                             atol_f=1e-4, atol_b=1e-3, atol_accum=5e-2,
                             dstep=1e-3, backward=None, backward_b=None,
                             ctx=None, non_accum_check=False,
                             skip_backward_check=False,
                             insert_identity=[], auto_forward=False):
    """Automatic testing of the backward function and the backward pass of
    `func` by comparing the two.

    The backward pass of `func` is the reference; therefore, the backward pass
    of `func` must be tested first!

    Syntax of `ref_func`: inputs, parameters
    """
    if ctx is None:
        ctx = nn.Context()
    if backward is None:
        backward = [True for _ in inputs]

    def create_variables(inputs, backward):
        vinputs = []
        for i, b in zip(inputs, backward):
            if i is None:
                vinputs += [None]
                continue
            vinp = nn.Variable(i.shape, need_grad=b)
            vinp.grad.zero()  # grads are always non-accumulating
            vinputs += [vinp]
            vinputs[-1].data.cast(i.dtype)[...] = i
        return vinputs

    vinputs = create_variables(inputs, backward)
    vinputs_for_clear_buffer = create_variables(inputs, backward)
    vinputs_for_nn_grad = create_variables(inputs, backward)

    vinputs_identity = []
    vinputs_identity_for_clear_buffer = []
    vinputs_identity_for_nn_grad = []
    if not insert_identity:
        insert_identity = [True] * len(vinputs)

    for idx, i in enumerate(zip(vinputs, vinputs_for_clear_buffer,
                                vinputs_for_nn_grad)):
        with nn.auto_forward(auto_forward):
            i0, i1, i2 = i
            if i0 is None:
                vinputs_identity += [None]
                vinputs_identity_for_clear_buffer += [None]
                vinputs_identity_for_nn_grad += [None]
            elif insert_identity[idx]:
                vinputs_identity += [F.identity(i0)]
                vinputs_identity_for_clear_buffer += [F.identity(i1)]
                vinputs_identity_for_nn_grad += [F.identity(i2)]
            else:
                vinputs_identity += [i0]
                vinputs_identity_for_clear_buffer += [i1]
                vinputs_identity_for_nn_grad += [i2]

    # Forward and backward of the forward function with no buffer clear
    with nn.context_scope(ctx), nn.auto_forward(auto_forward):
        outputs0 = func(*(vinputs_identity + func_args), **func_kwargs)
        outputs0 = force_list(outputs0)
        F.sink(*outputs0).forward(clear_no_need_grad=False)
    grad_voutputs = []
    for output in outputs0:
        ograd = rng.randn(*output.shape)
        grad_voutputs.append(nn.Variable.from_numpy_array(
            ograd).apply(need_grad=True))
        output.g = ograd
    F.sink(*outputs0, one_input_grad=False).backward()
    vinputs = list(filter(lambda x: x is not None, vinputs))
    vinputs_identity = list(filter(lambda x: x is not None, vinputs_identity))
    vinputs_for_clear_buffer = list(
        filter(lambda x: x is not None, vinputs_for_clear_buffer))
    grad_inputs0 = [inp.g.copy() for inp in vinputs]

    # Forward and backward of the forward function with clearing redundant buffers
    with nn.context_scope(ctx), nn.auto_forward(auto_forward):
        outputs_for_clear_buffer = func(
            *(vinputs_identity_for_clear_buffer + func_args), **func_kwargs)
        outputs_for_clear_buffer = force_list(outputs_for_clear_buffer)
        outputs_for_clear_buffer = list(map(
            lambda x: F.identity(x) if x is not None else None,
            outputs_for_clear_buffer))
        F.sink(*outputs_for_clear_buffer).forward(clear_no_need_grad=True)

    for o, ref_o in zip(outputs_for_clear_buffer, outputs0):
        o.g = ref_o.g

    # Check backward
    F.sink(*outputs_for_clear_buffer,
           one_input_grad=False).backward(clear_buffer=True)

    grad_inputs_for_clear_buffer = [
        inp.g.copy() for inp in vinputs_for_clear_buffer]
    for grad_ref, grad_res in zip(grad_inputs0, grad_inputs_for_clear_buffer):
        if grad_ref is None or grad_res is None:
            continue
        assert_allclose(
            grad_ref, grad_res, atol=atol_f,
            err_msg="backward(clear_buffer=True) and backward(clear_buffer=False) results differ.")

    # Forward of the backward function
    from nnabla.backward_functions import registry
    func_name = output.parent.info.type_name
    func_backward = registry[func_name]

    grad_vinputs = grad_voutputs + vinputs
    grad_vinputs_identity = grad_voutputs + vinputs_identity
    func_info_args = output.parent.info.args
    with nn.context_scope(ctx), nn.auto_forward(auto_forward):
        ograds0 = func_backward(grad_vinputs_identity, **func_info_args)
        ograds0 = force_list(ograds0)
        ograds0_ = list(filter(lambda o: o is not None, ograds0))
        F.sink(*ograds0_).forward(clear_no_need_grad=True)
    outputs1 = []
    for i, ograd in enumerate(ograds0):
        outputs1.append(ograd.d.copy()) if ograd is not None else \
            outputs1.append(None)

    # Check the number of returned elements
    assert_allclose(
        len(vinputs), len(outputs1),
        err_msg="Length of the outputs ({}) does not match "
        "the length of the inputs ({}) to the backward function".format(
            len(outputs1), len(vinputs)))

    # Check forward
    for i, elm in enumerate(zip(grad_inputs0, outputs1)):
        grad_ref, grad_res = elm
        if grad_ref is None or grad_res is None:
            continue
        assert_allclose(
            grad_ref, grad_res, atol=atol_f,
            err_msg="Forward of the backward function ({}) fails at {}-th output.".format(
                func_backward.__name__, i))

    # Check that backward_function and nn.grad produce the same results
    vinputs = [v for b, v in zip(backward, vinputs) if b]
    vinputs = list(filter(lambda x: x is not None, vinputs))
    with nn.context_scope(ctx), nn.auto_forward(auto_forward):
        outputs0_for_nn_grad = func(
            *(vinputs_identity_for_nn_grad + func_args), **func_kwargs)
        outputs0_for_nn_grad = force_list(outputs0_for_nn_grad)
        vinputs_identity_for_nn_grad = [
            v for b, v in zip(backward, vinputs_identity_for_nn_grad) if b]
        vinputs_identity_for_nn_grad = list(
            filter(lambda x: x is not None, vinputs_identity_for_nn_grad))

        ograds1 = nn.grad(outputs0_for_nn_grad, vinputs_identity_for_nn_grad,
                          grad_outputs=[g.d.copy() for g in grad_voutputs])
        F.sink(*ograds1).forward(clear_no_need_grad=True)
    ograds0 = list(filter(lambda o: o is not None, ograds0))
    ograds1 = list(filter(lambda o: o is not None, ograds1))
    for i in range(len(ograds0)):
        if ograds0[i].parent is None:
            continue
        assert_allclose(
            ograds0[i].d, ograds1[i].d, atol=atol_f,
            err_msg="nn.grad and backward_function results differ.")

    # Check backward
    # Needed since we sometimes use need_grad=False for optimization, e.g., mask.
    def set_inputs(inputs0, vinputs):
        begin = 0
        for i in vinputs:
            end = begin + i.size
            i.d = inputs0[begin:end].reshape(i.shape)
            begin = end

    def obj_func(inputs0, voutput, vinputs):
        set_inputs(inputs0, vinputs)
        voutput.forward()
        y = voutput.d.copy()
        return y

    initial_grads = []
    for grad_vinput in grad_vinputs:
        if grad_vinput is None:
            continue
        g = np.asarray(rng.randn(*grad_vinput.shape))
        initial_grads.append(g)
    grad_inputs1 = np.concatenate(
        [v.d.flatten() for v in grad_vinputs if v is not None])

    for i, ograd in enumerate(ograds0):
        # We can skip this check if the backward is a composite of functions.
        # In that case, the numerical difference can deviate substantially from
        # the analytical one for some functions.
        if skip_backward_check:
            continue

        if ograd is None or not backward[i]:
            continue
        for ig, v in zip(initial_grads, grad_vinputs):
            v.g = ig

        # This must come first since approx_fprime destroys the input values.
        # analytical grad
        rgrad = rng.randn()
        with nn.auto_forward(auto_forward):
            sum_ograd = F.sum(ograd) * rgrad
        sum_ograd.forward(clear_no_need_grad=True)
        sum_ograd.backward()
        analytical_grads = np.concatenate(
            [v.g.flatten() for v in grad_vinputs])
        analytical_grads -= np.concatenate(
            [g.flatten() for g in initial_grads])
        # numerical grad
        from scipy.optimize import approx_fprime
        numerical_grads = approx_fprime(
            grad_inputs1, obj_func, dstep, sum_ograd, grad_vinputs)

        # grad_vinputs: dy_1, ..., dy_n, x_1, ..., x_n
        # grad_voutputs: dy_1, ..., dy_n
        seps = [0] + np.cumsum(
            [int(np.prod(v.shape)) for v in grad_vinputs]).tolist()
        ngrads = len(grad_voutputs)
        ninputs = len(grad_vinputs)
        backward_b = [True] * ninputs if backward_b is None else backward_b
        for k, sep in enumerate(zip(seps[:-1], seps[1:])):
            if k >= ngrads and not backward[k - ngrads] or not backward_b[k]:
                continue
            s0, s1 = sep
            analytical_grad = analytical_grads[s0:s1]
            numerical_grad = numerical_grads[s0:s1]
            assert_allclose(
                analytical_grad, numerical_grad, atol=atol_accum,
                err_msg="Backward (accum) of the backward function ({}) wrt {}-th / {} input fails.".format(
                    func_backward.__name__, k, ninputs))

    # Some backward functions, e.g., AffineDataGrad and AffineFilterGrad, do not
    # check the non-accum case anywhere, so check their non-accum backward method here.
    if non_accum_check:
        # For any output, the parent is the same function.
        parent = outputs0[0].parent
        inputs = parent.inputs
        # Accum
        initial_grads = np.concatenate(
            [inp.g.flatten() for inp, b in zip(inputs, backward) if b])
        accum = [True] * len(inputs)
        parent.backward(inputs, outputs0, accum=accum)
        accum_grads = np.concatenate(
            [inp.g.flatten() for inp, b in zip(inputs, backward) if b])
        non_accum_grads0 = accum_grads - initial_grads
        # Non-accum
        accum = [False] * len(inputs)
        parent.backward(inputs, outputs0, accum=accum)
        non_accum_grads1 = np.concatenate(
            [inp.g.flatten() for inp, b in zip(inputs, backward) if b])
        # Check
        assert_allclose(
            non_accum_grads0, non_accum_grads1, atol=atol_b,
            err_msg="Backward (non-accum) of the backward function ({}) fails.".format(
                func_backward.__name__))
def test_sink(seed):
    rng = np.random.RandomState(seed)
    v = nn.Variable((2, 3, 4), need_grad=True)
    h0 = F.tanh(v)
    h1 = F.sigmoid(v)
    v.d = rng.randn(*v.shape).astype(np.float32)

    # Create references
    v.grad.zero()
    h0.forward()
    h1.forward()
    h0.backward()
    h1.backward()  # v.grad is accumulated.
    h0d = h0.d.copy()
    h1d = h1.d.copy()
    vg = v.g.copy()

    # Reset values
    h0.data.zero()
    h1.data.zero()
    v.grad.zero()

    # Check if sink works
    dummy = F.sink(h0, h1, one_input_grad=True)
    dummy.forward()
    dummy.backward()
    assert np.all(h0d == h0.d)
    assert np.all(h1d == h1.d)
    assert np.all(vg == v.g)

    # Check that clear_buffer still keeps h0 and h1 even though they are not
    # leaf variables.
    # This is ensured by the prohibit_clear_input_buffers function defined in sink.hpp.
    dummy = F.sink(h0, h1, one_input_grad=True)
    dummy.forward(clear_buffer=True)
    assert np.all(h0d == h0.d)
    assert np.all(h1d == h1.d)

    # Also check backward when clearing buffers.
    v.grad.zero()
    dummy = F.sink(h0, h1, one_input_grad=True)
    dummy.forward(clear_no_need_grad=True)
    dummy.backward(clear_buffer=True)
    assert np.all(h0d == h0.d)
    assert np.all(h1d == h1.d)
    assert np.all(vg == v.g)

    # Check if one_input_grad=False works
    dummy = F.sink(h0, h1, one_input_grad=False)
    g0 = rng.randn(*h0.shape).astype(np.float32)
    g1 = rng.randn(*h1.shape).astype(np.float32)
    h0.g = g0
    h1.g = g1
    dummy.forward()

    # Compute reference
    v.grad.zero()
    h0.backward(grad=g0)
    h1.backward(grad=g1)
    gv = v.g.copy()

    # Compute with sink
    v.grad.zero()
    dummy.backward()
    assert_allclose(v.g, gv)
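As the test above exercises, one_input_grad=True makes the sink fill each of its inputs' gradients with ones before propagating, while one_input_grad=False leaves whatever gradients were already set on the inputs untouched and propagates those instead. A compact standalone illustration follows; the shapes and values are arbitrary assumptions.

import numpy as np
import nnabla as nn
import nnabla.functions as F

v = nn.Variable((2, 2), need_grad=True)
y0, y1 = F.tanh(v), F.sigmoid(v)
v.d = np.random.randn(*v.shape).astype(np.float32)

# one_input_grad=True: equivalent to backpropagating ones through y0 and y1.
v.grad.zero()
sink = F.sink(y0, y1, one_input_grad=True)
sink.forward()
sink.backward()

# one_input_grad=False: the gradients already stored on y0 / y1 are used as-is.
sink2 = F.sink(y0, y1, one_input_grad=False)
sink2.forward()
v.grad.zero()
y0.g = 0.5 * np.ones(y0.shape, dtype=np.float32)
y1.g = np.zeros(y1.shape, dtype=np.float32)
sink2.backward()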