def get_onehot_grad(self, xs, ys=None):
    """Compute a gradient-times-embedding score for every input position.

    Args:
        xs: Batch of input sequences handed to ``self.encoder.get_grad``.
        ys: Optional sequence of label arrays that ``F.concat`` can join.
            When omitted, the model's own argmax predictions are used.

    Returns:
        One score vector per example: the loss gradient with respect to
        the embedded input, multiplied by the embedding and summed over
        the embedding axis.
    """
    if ys is None:
        # Predict in test mode so the labels are the model's own outputs.
        with chainer.using_config('train', False):
            predicted = self.predict(xs, argmax=True)
            predicted = F.expand_dims(predicted, axis=1)
            ys = [label for label in predicted]

    encodings, exs = self.encoder.get_grad(xs)
    outputs = self.output(encodings)
    loss = F.softmax_cross_entropy(outputs, F.concat(ys, axis=0))

    if not isinstance(exs, tuple):
        # Single padded embedding tensor: (batch_size, n_dim, max_length, 1)
        (exs_grad,) = chainer.grad([loss], [exs])
        assert exs_grad.shape == exs.shape
        padded_scores = F.squeeze(F.sum(exs_grad * exs, 1), 2)
        # Trim each row back to the length of its input sequence.
        return [row[:len(x)] for row, x in zip(padded_scores, xs)]

    # One embedding matrix per example: score the concatenation, then
    # split it back at the per-example boundaries.
    grads = chainer.grad([loss], exs)
    boundaries = np.cumsum([ex.shape[0] for ex in exs[:-1]])
    stacked_exs = F.concat(exs, axis=0)
    stacked_grads = F.concat(grads, axis=0)
    flat_scores = F.sum(stacked_grads * stacked_exs, axis=1)
    return F.split_axis(flat_scores, boundaries, axis=0)
def get_onehot_grad(self, xs, ys=None):
    """Compute gradient-times-embedding scores for the second input.

    ``xs[0]`` and ``xs[1]`` are encoded separately (the code names them
    premise and hypothesis). Only the embeddings of ``xs[1]`` are scored.

    Args:
        xs: Pair of sequence batches.
        ys: Optional labels for the loss. When omitted, the model's own
            argmax predictions are used.

    Returns:
        One score vector per example of ``xs[1]``.
    """
    if ys is None:
        with chainer.using_config('train', False):
            ys = self.predict(xs, argmax=True)

    u, _exs_prem = self.encoder.get_grad(xs[0])
    v, exs = self.encoder.get_grad(xs[1])
    features = F.concat((u, v, F.absolute(u - v), u * v), axis=1)
    outputs = self.output(self.mlp(features, no_dropout=True))
    loss = F.softmax_cross_entropy(outputs, ys)
    lengths = [len(x) for x in xs[1]]

    if not isinstance(exs, tuple):
        # Single padded embedding tensor: (batch_size, n_dim, max_length, 1)
        (exs_grad,) = chainer.grad([loss], [exs])
        assert exs_grad.shape == exs.shape
        padded_scores = F.squeeze(F.sum(exs_grad * exs, 1), 2)
        return [row[:n] for row, n in zip(padded_scores, lengths)]

    # One embedding matrix per example: score the concatenation, then
    # split it back at the per-example boundaries.
    grads = chainer.grad([loss], exs)
    boundaries = np.cumsum([ex.shape[0] for ex in exs[:-1]])
    stacked_exs = F.concat(exs, axis=0)
    stacked_grads = F.concat(grads, axis=0)
    flat_scores = F.sum(stacked_grads * stacked_exs, axis=1)
    return F.split_axis(flat_scores, boundaries, axis=0)
def test_unconditional_forward():
    """Verify that a loss on image 0 produces no gradient for other images."""
    width, height, z_size, batch_size = 5, 7, 2, 3

    config = ModelConfig(width=width,
                         height=height,
                         n_units_xyrz=3,
                         n_hidden_units=[5, 5],
                         z_size=z_size)
    model = CPPN(config)
    model.zerograds()

    # `inputs` maps a batch index to the (x, z) pair of that example.
    x, z, inputs = gen_input_batch(batch_size, width, height, z_size)

    y = model.forward(x, z)

    # The loss only looks at the first image of the batch.
    t = get_dammy_output(batch_size, width, height)
    loss = F.mean_squared_error(y[0], t[0])

    def summed_gradient(batch_index):
        grads = chainer.grad((loss, ), inputs[batch_index])
        return sum([g.data.sum() for g in grads])

    assert summed_gradient(0) != 0.0
    assert summed_gradient(1) == 0.0
    assert summed_gradient(2) == 0.0
def update_core(self):
    """Run one updater iteration: ``self.n_dis`` critic steps, one generator step.

    The generator is updated only on the first pass of the loop (``i == 0``)
    using ``self.energy_distance``. Every pass updates the discriminator with
    the negated surrogate loss plus a gradient penalty weighted by
    ``self.lam``.
    """
    gen_optimizer = self.get_optimizer('gen')
    dis_optimizer = self.get_optimizer('dis')
    xp = self.gen.xp
    for i in range(self.n_dis):
        batch = self.get_iterator('main').next()
        batchsize = len(batch)
        x_real = Variable(self.converter(batch, self.device))
        h_real = self.dis(x_real)
        # Two independent generator samples are drawn per step.
        z = self.gen.make_hidden(batchsize)
        x_fake = self.gen(z)
        h_fake = self.dis(x_fake)
        z2 = self.gen.make_hidden(batchsize)
        x_fake2 = self.gen(z2)
        h_fake2 = self.dis(x_fake2)
        if i == 0:
            # Generator step, performed once per call.
            loss_gen = self.energy_distance(h_real, h_fake, h_fake2)
            self.gen.cleargrads()
            loss_gen.backward()
            gen_optimizer.update()
            chainer.reporter.report({'gen/loss': loss_gen})
        # Cut the graph at the generated images so the critic losses below
        # do not backpropagate into the generator.
        x_fake.unchain_backward()
        x_fake2.unchain_backward()
        critic_real = self.critic(h_real, h_fake2)
        critic_fake = self.critic(h_fake, h_fake2)
        loss_surrogate = F.mean(critic_real - critic_fake)
        # NOTE(review): this line reads `self.xp` while the rest of the
        # method uses the local `xp` (= self.gen.xp) -- confirm the updater
        # actually defines `xp`.
        eps = self.xp.random.uniform(0, 1, size=batchsize).astype("f")[
            :, None, None, None]
        # Random per-example interpolation between real and fake images.
        x_mid = eps * x_real + (1.0 - eps) * x_fake
        # Gradient of the critic w.r.t. x_mid, computed in two stages:
        # first w.r.t. a detached copy of the discriminator output, then
        # pushed through the discriminator via `grad_outputs`.
        h_mid = chainer.Variable(self.dis(x_mid).data)
        base_grad, = chainer.grad([self.critic(h_mid, h_fake.data)], [
            h_mid], enable_double_backprop=True)
        grad, = chainer.grad([self.dis(x_mid)], [x_mid], grad_outputs=[
            base_grad], enable_double_backprop=True)
        grad = F.sqrt(F.batch_l2_norm_squared(grad))
        # Penalize deviation of the per-example gradient norm from 1.
        loss_gp = self.lam * \
            F.mean_squared_error(grad, xp.ones_like(grad.data))
        self.dis.cleargrads()
        # Both backward passes accumulate into the same gradients.
        (-loss_surrogate).backward()
        loss_gp.backward()
        dis_optimizer.update()
        chainer.reporter.report({'critic/loss': -loss_surrogate + loss_gp})
        chainer.reporter.report({"cramer distance": loss_surrogate})
        chainer.reporter.report({'critic/loss_grad': loss_gp})
        chainer.reporter.report({'g': F.mean(grad)})
def wrapper(self, structure, Rc, *params):
    """Yield the value of ``func`` followed by its first and second derivatives.

    This is a generator with three stages. Each stage enables backprop only
    if a higher derivative order is still requested via ``self._order``.

    NOTE(review): the source this was recovered from had lost its
    indentation; the nesting of the ``with`` blocks and the placement of
    each ``yield`` inside them should be confirmed against the original.
    """
    # Stage 0: the wrapped function itself.
    differentiate_more = self._order > 0
    with chainer.using_config('enable_backprop', differentiate_more):
        G = func(self, structure, Rc, *params)
        yield F.stack([F.stack(g) for g in G])

    n_atom = len(G[0])
    # Collect the neighbor distance vectors and their index ranges.
    r = []
    j_indices = []
    for r_, j_idx in structure.get_neighbor_info(
            Rc, ['distance_vector', 'j_indices']):
        r.append(r_)
        j_indices.append(j_idx)

    # Stage 1: derivative of every entry of G with respect to `r`.
    differentiate_more = self._order > 1
    with chainer.using_config('enable_backprop', differentiate_more):
        dG = []
        for g in G:
            # chainer.grad needs backprop mode even when it is disabled
            # by the surrounding config.
            with chainer.force_backprop_mode():
                grad = chainer.grad(
                    g, r, enable_double_backprop=differentiate_more)
            # Sum the gradient rows within each section given by j_idx[1:].
            dg = [
                F.concat([
                    F.sum(dg_, axis=0) for dg_
                    in F.split_axis(grad_, j_idx[1:], axis=0)
                ], axis=0)
                for grad_, j_idx in zip(grad, j_indices)
            ]
            dG.append(dg)
        yield F.stack([F.stack(dg) for dg in dG])

    # Stage 2: derivative of each of the 3 * n_atom components of dG.
    differentiate_more = self._order > 2
    with chainer.using_config('enable_backprop', differentiate_more):
        d2G = []
        for dg in dG:
            d2g = []
            for i in range(3 * n_atom):
                with chainer.force_backprop_mode():
                    grad = chainer.grad(
                        [dg_[i] for dg_ in dg], r,
                        enable_double_backprop=differentiate_more)
                d2g_ = [
                    F.concat([
                        F.sum(d2g_, axis=0) for d2g_
                        in F.split_axis(
                            grad_, j_idx[1:], axis=0)
                    ], axis=0)
                    for grad_, j_idx in zip(grad, j_indices)
                ]
                d2g.append(d2g_)
            d2G.append(d2g)
        yield F.stack([
            F.stack([F.stack(d2g_) for d2g_ in d2g]) for d2g in d2G
        ]).transpose(0, 2, 1, 3)
def feed(self, x, label):
    """Compute a Grad-CAM++ map for a single image.

    Args:
        x: list or array: Input image. Only one image is accepted.
        label: int: Index of the class label.

    Return:
        L_gcam: Grad-CAM++ result, resized to ``x[0].size``.
    """
    # feed forward
    activations = self.forward(x)

    # label selection
    prob = activations[self.prob_layer][0].data
    target_label = self.select_target(prob, label)

    # target loss
    target_prob = \
        chainer.Variable(target_label) * activations[self.prob_layer]

    target_activation = activations[self.target_layer]
    label_index = target_label.argmax()
    coeff = self.xp.exp(target_prob[0][label_index].data)

    # First, second and third derivatives with respect to the target
    # layer's activation.
    first_grad, = chainer.grad([coeff * target_prob], [target_activation],
                               enable_double_backprop=True)
    second_grad, = chainer.grad([first_grad], [target_activation],
                                enable_double_backprop=True)
    third_grad, = chainer.grad([second_grad], [target_activation],
                               enable_double_backprop=True)

    # Per-channel spatial sum of the activation, shaped (C, 1, 1) so it
    # broadcasts against the (C, H, W) gradients below.
    global_sum = self.xp.sum(target_activation.data, axis=(2, 3))
    global_sum = global_sum.reshape(first_grad.data[0].shape[0], 1, 1)

    alpha_num = second_grad.data[0]
    # Each channel is weighted by its own activation sum. Indexing
    # `global_sum[0]` here would apply channel 0's sum to every channel.
    alpha_denom = \
        2.0 * second_grad.data[0] + global_sum * third_grad.data[0]
    # Replace zero denominators with 1 to avoid division by zero.
    alpha_denom = self.xp.where(alpha_denom != 0.0, alpha_denom,
                                self.xp.ones(alpha_denom.shape))
    alphas = alpha_num / alpha_denom
    alphas /= self.xp.sum(alphas, axis=(1, 2))[:, self.xp.newaxis,
                                               self.xp.newaxis]

    importances = self.xp.sum(
        alphas * self.xp.maximum(first_grad.data[0], 0),
        axis=(1, 2))

    L_gcam = self.xp.tensordot(importances, target_activation.data[0],
                               axes=(0, 0))
    # Keep positive responses only and scale to the 0-255 range.
    L_gcam = (L_gcam > 0.) * L_gcam / L_gcam.max() * 255.

    # resize
    L_gcam = imresize(L_gcam, x[0].size)
    return L_gcam
def test(self):
    """Compare the element-wise Huber quantile loss with a reference.

    The reference scales ``F.huber_loss`` by ``tau`` where ``y`` exceeds
    ``t`` and by ``1 - tau`` otherwise. Both the value and the gradient
    with respect to ``y`` are checked for every (i, j, k) entry.
    """
    batch_size, N, N_prime = self.batch_size, self.N, self.N_prime
    threshold = self.huber_loss_threshold
    full_shape = (batch_size, N, N_prime)

    # Overestimation is penalized proportionally to tau
    # Underestimation is penalized proportionally to (1-tau)
    y = np.random.normal(size=(batch_size, N)).astype('f')
    y_var = chainer.Variable(y)
    t = np.random.normal(size=(batch_size, N_prime)).astype('f')
    tau = np.random.uniform(size=(batch_size, N)).astype('f')

    loss = iqn.compute_eltwise_huber_quantile_loss(
        y_var, t, tau, huber_loss_threshold=threshold)
    y_var_b, t_b = F.broadcast(
        F.reshape(y_var, (batch_size, N, 1)),
        F.reshape(t, (batch_size, 1, N_prime)),
    )
    self.assertEqual(loss.shape, full_shape)
    huber_loss = F.huber_loss(y_var_b, t_b, delta=threshold, reduce='no')
    self.assertEqual(huber_loss.shape, full_shape)

    def grad_wrt_y(scalar, i, j):
        return chainer.grad([scalar], [y_var])[0][i, j]

    for i, j, k in np.ndindex(*full_shape):
        actual_loss = loss[i, j, k]
        actual_grad = grad_wrt_y(actual_loss, i, j)
        # loss is always positive
        self.assertGreater(actual_loss.array, 0)

        # y over-estimates t  -> Huber loss scaled by tau
        # y under-estimates t -> Huber loss scaled by (1 - tau)
        if y[i, j] > t[i, k]:
            weight = tau[i, j]
        else:
            weight = 1 - tau[i, j]
        expected_loss = weight * huber_loss[i, j, k]
        expected_grad = grad_wrt_y(expected_loss, i, j)

        self.assertAlmostEqual(
            actual_loss.array, expected_loss.array, places=5)
        self.assertAlmostEqual(
            actual_grad.array, expected_grad.array, places=5)
def compute_hessian(y, params):
    """Build the Hessian of ``y`` with respect to ``params`` row by row.

    Each row is the flattened gradient of one entry of the flattened
    first-order gradient.
    """
    first_order = chainer.grad([y], params, enable_double_backprop=True)
    flat_first_order = trpo._flatten_and_concat_variables(first_order)

    rows = []
    for index in range(len(flat_first_order)):
        second_order = chainer.grad([flat_first_order[index]], params)
        assert all(g is not None for g in second_order)
        row = trpo._flatten_and_concat_ndarrays(
            [g.data for g in second_order])
        rows.append(row)
    return np.asarray(rows)
def _compute_kl_constrained_step(self, action_distrib, action_distrib_old,
                                 gain):
    """Compute a step of policy parameters with a KL constraint.

    Args:
        action_distrib: Current action distribution.
        action_distrib_old: Old action distribution; its ``kl`` method is
            evaluated against ``action_distrib``.
        gain: Scalar objective whose gradient gives the right-hand side of
            the conjugate-gradient solve.

    Returns:
        The step direction found by conjugate gradient, scaled so that the
        second-order approximation of the KL divergence equals
        ``self.max_kl``.

    Raises:
        RuntimeError: If old-style functions are found in the KL graph.
    """
    none_grad_msg = ("The gradient contains None. "
                     "The policy may have unused parameters.")

    policy_params = _get_ordered_params(self.policy)
    kl = F.mean(action_distrib_old.kl(action_distrib))

    # Check if kl computation fully supports double backprop
    old_style_funcs = _find_old_style_function([kl])
    if old_style_funcs:
        raise RuntimeError("""\
Old-style functions (chainer.Function) are used to compute KL divergence.
Since TRPO requires second-order derivative of KL divergence, its computation
should be done with new-style functions (chainer.FunctionNode) only.

Found old-style functions: {}""".format(old_style_funcs))

    kl_grads = chainer.grad([kl], policy_params,
                            enable_double_backprop=True)
    assert all(g is not None for g in kl_grads), none_grad_msg
    flat_kl_grads = _flatten_and_concat_variables(kl_grads)

    def fisher_vector_product_func(vec):
        fvp = _hessian_vector_product(flat_kl_grads, policy_params, vec)
        return fvp + self.conjugate_gradient_damping * vec

    gain_grads = chainer.grad([gain], policy_params)
    # Validate the gain gradients themselves (this check previously
    # re-tested kl_grads, so a None here went unnoticed).
    assert all(g is not None for g in gain_grads), none_grad_msg
    flat_gain_grads = _flatten_and_concat_ndarrays(gain_grads)
    step_direction = chainerrl.misc.conjugate_gradient(
        fisher_vector_product_func, flat_gain_grads,
        max_iter=self.conjugate_gradient_max_iter,
    )

    # We want a step size that satisfies KL(old|new) < max_kl.
    # Let d = alpha * step_direction be the actual parameter updates.
    # The second-order approximation of KL divergence is:
    #   KL(old|new) = 1/2 d^T I d + O(||d||^3),
    # where I is a Fisher information matrix.
    # Substitute d = alpha * step_direction and solve KL(old|new) = max_kl
    # for alpha to get the step size that tightly satisfies the constraint.
    dId = float(
        step_direction.dot(fisher_vector_product_func(step_direction)))
    # The small constant guards against division by zero.
    scale = (2.0 * self.max_kl / (dId + 1e-8))**0.5
    return scale * step_direction
def test_conditional_backward():
    """Check that gradients do not leak along the batch axis.

    The loss is taken on the first image only, so the inputs of batch
    index 0 must receive a non-zero gradient and those of every other
    batch index must receive exactly zero.
    """
    width = 5
    height = 7
    z_size = 2
    batch_size = 3
    model = ConditionalCPPN(
        ConditionalModelConfig(width=width,
                               height=height,
                               n_units_xyr=3,
                               n_hidden_units=[
                                   10,
                                   10,
                               ],
                               z_size=z_size,
                               in_width=64,
                               in_height=64,
                               in_channel=1,
                               use_batch_norm=False))
    model.zerograds()

    # `inputs` maps a batch index to the (x, z) pair of that example.
    x, z, inputs = gen_input_batch(batch_size, width, height, z_size)
    # init dammy conditional input
    c = chainer.Variable(get_dammy_input(batch_size, 64, 64, 1))

    # forward prop
    y = model.forward(x, z, c)

    # taking loss at only first image
    t = get_dammy_output(batch_size, width, height)
    loss = F.mean_squared_error(y[0], t[0])

    g_x, g_z = chainer.grad((loss, ), inputs[0])
    g_c = chainer.grad((loss, ), (c, ))[0].data

    assert g_c[0].sum() != 0.0, "gradient of c is zero"
    assert g_x.data.sum() != 0.0, "gradient of x is zero"
    assert g_z.data.sum() != 0.0, "gradient of z is zero"

    # Every other example must be untouched by the loss. The messages
    # describe the actual failure: a non-zero, leaked gradient.
    for index in range(1, batch_size):
        g_x, g_z = chainer.grad((loss, ), inputs[index])
        assert g_c[index].sum() == 0.0, \
            f"gradient of c leaked into batch index {index}"
        assert g_x.data.sum() == 0.0, \
            f"gradient of x leaked into batch index {index}"
        assert g_z.data.sum() == 0.0, \
            f"gradient of z leaked into batch index {index}"
def adversarial_attack(cls, x, y=None, steps=1, loss_type='cross_entropy',
                       eps=2.0, clip_x=True, norm_type='L2', alpha=None):
    """Perturb ``x`` along the normalized loss gradient for ``steps`` steps.

    Args:
        cls: Callable classifier returning logits.
        x: Input variable.
        y: Labels. Leave as None to avoid label leaking; the argmax of the
            first logits is then used.
        steps: Number of gradient steps.
        loss_type: Forwarded to ``loss_fun`` as ``type``.
        eps: Radius passed to ``_projection``.
        clip_x: Whether to clip the result to [-1, 1] after each step.
        norm_type: Forwarded to ``_normalize`` and ``_projection``.
        alpha: Step size; defaults to ``eps``.

    Returns:
        The perturbed input.
    """
    xp = cuda.get_array_module(x.array)
    step_size = eps if alpha is None else alpha
    x_org = copy.deepcopy(x)

    for _ in range(steps):
        logit = cls(x)
        if y is None:
            y = F.argmax(logit, 1)
        loss = loss_fun(logit, y, type=loss_type)
        (grad,) = chainer.grad([loss], [x])
        direction = _normalize(grad.array, xp, norm_type=norm_type)
        moved = x + step_size * direction
        # Project back onto the eps-ball around the original input.
        projected = _projection(
            x_org.array, moved.array, eps, xp, norm_type=norm_type)
        x = Variable(projected)
        if clip_x:
            x = F.clip(x, -1., 1.)
    return x
def check_unstride_backward(self, xp):
    """Check the gradient of a flat view over a row-reversed (3, 4) array."""
    dtype = self.dtype
    source = xp.arange(12, dtype=dtype).reshape((3, 4))[::-1]
    source_var = Variable(source)
    flat = as_strided(source_var, (12,), (1,), 0)
    flat.grad = xp.ones((12,), dtype=dtype)
    (source_grad,) = grad((flat,), (source_var,))
    expected = xp.ones(source.shape, dtype=dtype)
    testing.assert_allclose(source_grad.array, expected)
def check_flip_backward(self, xp):
    """Check the gradient of a view built with stride -1 and offset 3."""
    dtype = self.dtype
    source_var = Variable(xp.arange(4, dtype=dtype))
    flipped = as_strided(source_var, (4,), (-1,), 3)
    flipped.grad = xp.ones((4,), dtype=dtype)
    (source_grad,) = grad((flipped,), (source_var,))
    expected = xp.ones((4,), dtype=dtype)
    testing.assert_allclose(source_grad.array, expected)
def gradient_penalty(self, y: chainer.Variable, x: chainer.Variable):
    """Return the gradient penalty ``(L2_norm(dy/dx) - 1)**2``.

    The gradient is seeded with ones shaped like ``y`` and the norm is
    taken over axes 1, 2 and 3 of ``dy/dx``.
    """
    xp = self.xp
    seed = Variable(xp.ones(y.shape, dtype='f'))
    (dydx,) = chainer.grad(outputs=[y],
                           inputs=[x],
                           grad_outputs=[seed],
                           enable_double_backprop=True)
    norms = F.sqrt(F.sum(dydx * dydx, axis=(1, 2, 3)))
    unit = xp.ones_like(norms.array)
    return F.mean_squared_error(norms, unit)
def check_unstride_backward(self, xp):
    """Expect TypeError from grad through a flat view of a reversed array."""
    dtype = self.dtype
    source = xp.arange(12, dtype=dtype).reshape((3, 4))[::-1]
    source_var = Variable(source)
    flat = as_strided(source_var, (12,), (1,), 0)
    flat.grad = xp.ones((12,), dtype=dtype)
    with self.assertRaises(TypeError):
        (_source_grad,) = grad((flat,), (source_var,))
def check_flip_backward(self, xp):
    """Expect TypeError from grad through a stride -1, offset 3 view."""
    dtype = self.dtype
    source_var = Variable(xp.arange(4, dtype=dtype))
    flipped = as_strided(source_var, (4,), (-1,), 3)
    flipped.grad = xp.ones((4,), dtype=dtype)
    with self.assertRaises(TypeError):
        (_source_grad,) = grad((flipped,), (source_var,))
def virtual_adversarial_attack(cls, x, steps=1, loss_type='kl', eps=2.0,
                               xi=1e-6, logit=None, clip_x=True):
    """Perturb ``x`` along the gradient of a divergence at a random probe.

    Args:
        cls: Callable classifier returning logits.
        x: Input variable.
        steps: Number of update steps.
        loss_type: Forwarded to ``loss_fun`` as ``type``.
        eps: Step size and radius passed to ``_projection``.
        xi: Scale of the random probe added before taking the gradient.
        logit: Reference logits; computed from ``x`` when None.
        clip_x: Whether to clip the result to [-1, 1] after each step.

    Returns:
        The perturbed input.
    """
    xp = cuda.get_array_module(x.array)
    if logit is None:
        logit = cls(x)
    x_org = copy.deepcopy(x)

    for _ in range(steps):
        # Apply 1 step virtual adversarial attack and multiple projected
        # gradient descent
        probe = _normalize(xp.random.normal(size=x.shape), xp)
        x_probe = x + xi * probe
        logit_probe = cls(x_probe)
        kl_loss = loss_fun(logit, logit_probe, type=loss_type)
        (grad,) = chainer.grad([kl_loss], [x_probe])
        direction = _normalize(grad.array, xp)
        moved = x + eps * direction
        x = Variable(_projection(x_org.array, moved.array, eps, xp))
        if clip_x:
            x = F.clip(x, -1., 1.)
    return x
def _gradient_penalty(self, discriminator, real_video, fake_video):
    """Return the gradient penalty on real/fake interpolates.

    See the algorithm on page 4 (lines 4-8) and equation (3) of
    https://arxiv.org/abs/1704.00028. A random interpolation of a real and
    a fake video is fed to the discriminator, and the norm of the output's
    gradient with respect to that input is pushed towards 1.
    """
    def l2norm(vec):
        # Euclidean norm over every axis except the first.
        if vec.ndim <= 1:
            return abs(vec)
        # The epsilon keeps the square root's derivative finite near zero.
        squared = F.sum(vec * vec, axis=(1, 2, 3, 4))
        return abs(F.sqrt(squared + 1e-12))

    xp = self.generator.xp

    # One mixing coefficient per example, broadcast over the other axes.
    mix_shape = (self.batch_size, 1, 1, 1, 1)
    epsilon = xp.random.uniform(low=0, high=1, size=mix_shape)
    epsilon = epsilon.astype(xp.float32)
    interpolates = (1. - epsilon) * fake_video + epsilon * real_video

    # Differentiate the discriminator output with respect to its input.
    eval_interpolate = discriminator(interpolates)
    (gradients,) = chainer.grad([eval_interpolate], [interpolates],
                                enable_double_backprop=True)
    slopes = l2norm(gradients)

    # The penalty coefficient is a hyperparameter (eq. 7 of the paper).
    per_example = (self.penalty_coeff * (slopes - 1.) ** 2)[:, xp.newaxis]

    # Expected gradient penalty
    gradient_penalty = F.sum(per_example) / self.batch_size
    chainer.report({'gp' : gradient_penalty})
    return gradient_penalty
def check_broadcast_backward(self, xp):
    """Expect TypeError from grad through a stride-0 broadcast view."""
    dtype = self.dtype
    source = xp.arange(12, dtype=dtype).reshape((3, 4)).copy()
    source_var = Variable(source)
    repeated = as_strided(source_var, (2, 3, 4), (0, 4, 1), 0)
    repeated.grad = xp.ones((2, 3, 4), dtype=dtype)
    with self.assertRaises(TypeError):
        (_source_grad,) = grad((repeated,), (source_var,))
def wrm_attack(cls, x, y=None, steps=5.0, loss_type='cross_entropy',
               c_type='sqaure', gamma=1., alpha=1.0, clip_x=True,
               return_phis=False):
    """Run gradient ascent on ``loss - gamma * cost`` starting from ``x``.

    Args:
        cls: Callable classifier returning logits.
        x: Input variable.
        y: Labels; the argmax of the first logits is used when None.
        steps: Number of ascent steps. The default is a float for
            backward compatibility, so it is truncated to an int before
            use.
        loss_type: Forwarded to ``loss_fun``.
        c_type: Forwarded to ``cost_fun`` as ``type``.
        gamma: Weight of the cost term; also divides the step size.
        alpha: Base step size; step ``t`` uses ``alpha / gamma / (t + 1)``.
        clip_x: Whether to clip to [-1, 1] after each step.
        return_phis: Whether to also return the objective at every step.

    Returns:
        ``x`` after the final step, or ``(x, phis)`` when ``return_phis``
        is set.
    """
    x_org = copy.deepcopy(x)
    _alpha = alpha / gamma
    phis = []
    # range() rejects floats, and the default `steps` is 5.0.
    for t in range(int(steps)):
        logit = cls(x)
        if y is None:
            y = F.argmax(logit, axis=1)
        loss = loss_fun(logit, y, loss_type, reduce='sum')
        cost = cost_fun(x1=x, y1=y, x2=x_org, y2=y, type=c_type,
                        reduce='sum')
        phi = loss - gamma * cost
        if return_phis:
            phis.append(phi.array)
        grad = chainer.grad([phi], [x])[0]
        # Step size decays as 1 / (t + 1).
        lr = _alpha / (t + 1)
        x = x + lr * grad.array
        if clip_x:
            x = F.clip(x, -1., 1.)
    if return_phis:
        return x, phis
    return x
def _hessian_vector_product(flat_grads, params, vec):
    """Return the Hessian-vector product as one flat array.

    The product is obtained by differentiating the scalar
    ``sum(flat_grads * vec)`` with respect to ``params``.
    """
    projection = F.sum(flat_grads * vec)
    second_order = chainer.grad([projection], params)
    assert all(g is not None for g in second_order),\
        "The Hessian-vector product contains None."
    return _flatten_and_concat_ndarrays([g.array for g in second_order])
def check_flip_backward(self, xp):
    """Check the gradient of a view built with stride -1 and offset 3."""
    dtype = self.dtype
    source_var = chainer.Variable(xp.arange(4, dtype=dtype))
    flipped = F.as_strided(source_var, (4, ), (-1, ), 3)
    flipped.grad = xp.ones((4, ), dtype=dtype)
    (source_grad,) = chainer.grad((flipped, ), (source_var, ))
    expected = xp.ones((4, ), dtype=dtype)
    testing.assert_allclose(source_grad.array, expected)
def check_broadcast_backward(self, xp):
    """Expect TypeError from grad through a stride-0 broadcast view."""
    dtype = self.dtype
    source = xp.arange(12, dtype=dtype).reshape((3, 4)).copy()
    source_var = chainer.Variable(source)
    repeated = F.as_strided(source_var, (2, 3, 4), (0, 4, 1), 0)
    repeated.grad = xp.ones((2, 3, 4), dtype=dtype)
    with self.assertRaises(TypeError):
        (_source_grad,) = chainer.grad((repeated, ), (source_var, ))
def check_flip_backward(self, xp):
    """Expect TypeError from grad through a stride -1, offset 3 view."""
    dtype = self.dtype
    source_var = chainer.Variable(xp.arange(4, dtype=dtype))
    flipped = F.as_strided(source_var, (4, ), (-1, ), 3)
    flipped.grad = xp.ones((4, ), dtype=dtype)
    with self.assertRaises(TypeError):
        (_source_grad,) = chainer.grad((flipped, ), (source_var, ))
def check(self, option, grads_before, grads_after):
    """Run ``chainer.grad`` on a chain of eight variables and check results.

    Args:
        option: Extra keyword arguments forwarded to ``chainer.grad``.
        grads_before: Per-variable gradients to preset (None to skip).
        grads_after: Per-variable gradients expected afterwards (None
            means the gradient must be unset).
    """
    # Build the chain: each round records the variable before and after
    # it is added to itself, then passes it through a multiplication by 1.
    chain = []
    current = self._var(0.5)
    for _ in range(4):
        chain.append(current)
        current += current
        chain.append(current)
        current *= 1.

    # Inputs and outputs are the odd-indexed variables of the chain.
    x1, x2, y1, y2 = chain[1::2]
    gx1 = self._var(1000.)
    gx2 = self._var(100.)
    gy1 = self._var(10.)
    gy2 = self._var(1.)

    for var, preset in zip(chain, grads_before):
        if preset is not None:
            var.grad_var = self._var(preset)

    grads = chainer.grad(
        [y1, y2], [x1, x2], [gy1, gy2], [gx1, gx2], **option)
    numpy.testing.assert_allclose(grads[0].array, 1248.)
    numpy.testing.assert_allclose(grads[1].array, 124.)

    for var, expected in zip(chain, grads_after):
        if expected is None:
            self.assertIsNone(var.grad)
        else:
            numpy.testing.assert_allclose(var.grad, expected)
def langevin(batchsize, gen, dis, y_fake, eval=False, given_z=None):
    """Refine a latent code with noisy gradient steps on ``dis``'s output.

    Args:
        batchsize: Number of samples to generate.
        gen: Generator, called as ``gen(batchsize, z=..., y=...)``.
        dis: Discriminator, called as ``dis(x, y=...)``.
        y_fake: Labels passed to both networks.
        eval: Use the ``eval_*`` settings from the global ``args``.
        given_z: Starting latent variable; sampled when None.

    Returns:
        Tuple of the final generated batch and the final latent variable.
    """
    # The evaluation settings share names with the training ones, apart
    # from an "eval_" prefix.
    prefix = 'eval_' if eval else ''
    base_step_lr = getattr(args, prefix + 'step_lr')
    num_steps = getattr(args, prefix + 'num_steps')
    base_noise_scale = getattr(args, prefix + 'noise_scale')

    if given_z is not None:
        z = given_z
    else:
        sampled = sample_continuous(gen.dim_z, batchsize,
                                    distribution=gen.distribution,
                                    xp=gen.xp)
        z = chainer.Variable(sampled)

    x_fake = gen(batchsize, z=z, y=y_fake)
    for step in range(num_steps):
        energy = dis(x_fake, y=y_fake) * args.temperature
        (z_grad,) = chainer.grad(outputs=[energy], inputs=[z])

        if args.anealing:
            # Shrink step size and noise tenfold every fifth of the run.
            step_lr = base_step_lr * 0.1**(step // (num_steps / 5))
            noise_scale = base_noise_scale * 0.1**(step // (num_steps / 5))
        else:
            step_lr = base_step_lr
            noise_scale = base_noise_scale

        noise = cp.random.normal(size=z.shape, loc=0.0, scale=noise_scale)
        z_grad_noise = step_lr/2*z_grad + (step_lr**0.5)*noise
        z = z + z_grad_noise
        # Drop the history so the next step differentiates from a leaf.
        z.unchain_backward()
        x_fake = gen(batchsize, z=z, y=y_fake)
    return x_fake, z
def check(self, option, grads_before, grads_after):
    """Run ``chainer.grad`` on a chain of eight variables and check results.

    Args:
        option: Extra keyword arguments forwarded to ``chainer.grad``.
        grads_before: Per-variable gradients to preset (None to skip).
        grads_after: Per-variable gradients expected afterwards (None
            means the gradient must be unset).
    """
    # Build the chain: each round records the variable before and after
    # it is added to itself, then passes it through a multiplication by 1.
    chain = []
    current = self._var(0.5)
    for _ in range(4):
        chain.append(current)
        current += current
        chain.append(current)
        current *= 1.

    # Inputs and outputs are the odd-indexed variables of the chain.
    x1, x2, y1, y2 = chain[1::2]
    gx1 = self._var(1000.)
    gx2 = self._var(100.)
    gy1 = self._var(10.)
    gy2 = self._var(1.)

    for var, preset in zip(chain, grads_before):
        if preset is not None:
            var.grad_var = self._var(preset)

    grads = chainer.grad([y1, y2], [x1, x2], [gy1, gy2], [gx1, gx2],
                         **option)
    numpy.testing.assert_allclose(grads[0].array, 1248.)
    numpy.testing.assert_allclose(grads[1].array, 124.)

    for var, expected in zip(chain, grads_after):
        if expected is None:
            self.assertIsNone(var.grad)
        else:
            numpy.testing.assert_allclose(var.grad, expected)
def check_grad(self):
    """Compare ``chainer.grad`` (with loss scaling) against expectations.

    The expected gradients are ``self.expected_grad()`` with the initial
    input gradients ``self.gxs`` added in place.
    """
    self.forward()
    outputs = [getattr(self, name) for name in self.y_names]
    if self.extend_graph_y:
        self._ys = [out * 1. for out in outputs]

    # graph_x extension should be done here
    # to avoid chainer/chainerx mixed graph
    if self.extend_graph_x:
        for v in self.xs:
            v *= 1.

    actual = chainer.grad(outputs, self.xs, self.gys, self.gxs,
                          loss_scale=self.loss_scale)

    expected = self.expected_grad()
    for index, initial_grad in enumerate(self.gxs):
        expected[index] += initial_grad

    self.assertEqual(len(actual), len(expected))
    try:
        for got, want in zip(actual, expected):
            got_value = self._get_value(got)
            want_value = self._get_value(want)
            testing.assert_allclose(got_value, want_value)
    except Exception:
        # Dump everything needed to diagnose the failure, then re-raise.
        self._print_inputs()
        self._print_variables('gxs (actual) ', actual)
        self._print_variables('gxs (expected)', expected)
        raise
def path_length(ws, x, mask):
    """Return a per-example path length from gradients of ``x * mask``.

    Args:
        ws: Sequence of two-dimensional variables, one per level.
        x: Variable differentiated with respect to every entry of ``ws``.
        mask: Factor multiplied into ``x`` before differentiating.

    Returns:
        Square root of the squared gradient norm averaged over levels.
    """
    levels = len(ws)
    batch, size = ws[0].shape
    gradients = grad([x * mask], ws, enable_double_backprop=True)
    # (levels, batch, size) -> (batch, levels, size) -> one row per pair.
    per_example = stack(gradients).transpose(1, 0, 2)
    rows = per_example.reshape(batch * levels, size)
    squared_norms = batch_l2_norm_squared(rows).reshape(batch, levels)
    return sqrt(mean(squared_norms, axis=1))
def check_unstride_backward(self, xp):
    """Check the gradient of a flat view over a row-reversed (3, 4) array."""
    dtype = self.dtype
    source = xp.arange(12, dtype=dtype).reshape((3, 4))[::-1]
    source_var = chainer.Variable(source)
    flat = F.as_strided(source_var, (12, ), (1, ), 0)
    flat.grad = xp.ones((12, ), dtype=dtype)
    (source_grad,) = chainer.grad((flat, ), (source_var, ))
    expected = xp.ones(source.shape, dtype=dtype)
    testing.assert_allclose(source_grad.array, expected)
def check_unstride_backward(self, xp):
    """Expect TypeError from grad through a flat view of a reversed array."""
    dtype = self.dtype
    source = xp.arange(12, dtype=dtype).reshape((3, 4))[::-1]
    source_var = chainer.Variable(source)
    flat = F.as_strided(source_var, (12, ), (1, ), 0)
    flat.grad = xp.ones((12, ), dtype=dtype)
    with self.assertRaises(TypeError):
        (_source_grad,) = chainer.grad((flat, ), (source_var, ))
def update_core(self):
    """Run several critic updates with a gradient penalty, then one
    generator update.

    The critic is updated 100 times per call while ``self.iteration`` is
    below 50 and 5 times per call afterwards.
    """
    def _step(optimizer, loss):
        # Clear stale gradients, backpropagate, and apply the update.
        optimizer.target.cleargrads()
        loss.backward()
        optimizer.update()

    xp = self.generator.xp
    n_critic = 100 if self.iteration < 50 else 5

    for _ in range(n_critic):
        # Critic score on real images.
        x_real = self.next_batch(self.x)
        loss_real = -F.sum(self.critic(x_real)) / self.batchsize

        # Critic score on generated images.
        x_fake = self.generator(self.next_batch(self.z))
        loss_fake = F.sum(self.critic(x_fake)) / self.batchsize

        # Detach the generator from the graph used by the critic update.
        x_fake.unchain_backward()

        # Gradient penalty on random real/fake interpolates.
        eps = xp.random.uniform(0, 1, size=self.batchsize)
        eps = eps.astype("f")[:, None, None, None]
        x_mid = eps * x_real + (1.0 - eps) * x_fake
        y_mid = self.critic(x_mid)
        (grad_mid,) = chainer.grad([y_mid], [x_mid],
                                   enable_double_backprop=True)
        grad_norm = F.sqrt(F.batch_l2_norm_squared(grad_mid))
        loss_gp = self.lam * F.mean_squared_error(
            grad_norm, xp.ones_like(grad_norm.data))

        critic_loss = loss_real + loss_fake + loss_gp
        _step(self.optimizer_critic, critic_loss)

        chainer.reporter.report({
            'critic/loss/real': loss_real,
            'critic/loss/fake': loss_fake,
            'critic/loss/gp': loss_gp,
            'critic/loss': critic_loss,
            'wasserstein': -loss_real - loss_fake,
        })

    # Single generator update.
    x_fake = self.generator(self.next_batch(self.z))
    gen_loss = -F.sum(self.critic(x_fake)) / self.batchsize
    _step(self.optimizer_generator, gen_loss)
    chainer.report({'generator/loss': gen_loss})
def c_ctc(yhat, y):
    """Return ``ctc``'s loss value and its gradient with respect to ``yhat``.

    Each column of ``yhat`` becomes one ``(1, n)`` input variable; the
    per-column gradients are stacked back into the layout of ``yhat``.
    """
    columns = [
        chainer.Variable(np.expand_dims(yhat[:, index], 0))
        for index in range(yhat.shape[1])
    ]
    loss = ctc(columns, y, 61)
    column_grads = chainer.grad([loss], columns)
    gradient = np.vstack([g.data for g in column_grads]).T
    return loss.data, gradient
def check_broadcast_backward(self, xp):
    """Check that a stride-0 view repeated twice doubles the gradient."""
    dtype = self.dtype
    source = xp.arange(12, dtype=dtype).reshape((3, 4)).copy()
    source_var = chainer.Variable(source)
    repeated = F.as_strided(source_var, (2, 3, 4), (0, 4, 1), 0)
    repeated.grad = xp.ones((2, 3, 4), dtype=dtype)
    (source_grad,) = chainer.grad((repeated, ), (source_var, ))
    expected = xp.ones(source.shape, dtype=dtype) * 2
    testing.assert_allclose(source_grad.array, expected)
def check_broadcast_backward(self, xp):
    """Check that a stride-0 view repeated twice doubles the gradient."""
    dtype = self.dtype
    source = xp.arange(12, dtype=dtype).reshape((3, 4)).copy()
    source_var = Variable(source)
    repeated = as_strided(source_var, (2, 3, 4), (0, 4, 1), 0)
    repeated.grad = xp.ones((2, 3, 4), dtype=dtype)
    (source_grad,) = grad((repeated,), (source_var,))
    expected = xp.ones(source.shape, dtype=dtype) * 2
    testing.assert_allclose(source_grad.array, expected)
def backward(self, indexes, grad_outputs):
    """Recompute the forward pass and differentiate it.

    The retained inputs are passed through ``self.func`` again with
    backprop forced on, and the gradients of that fresh graph are returned.
    """
    retained = self.get_retained_inputs()
    with function.force_backprop_mode():
        recomputed = _call_func(self.func, retained)
    # Return gradients that are further backproable
    return chainer.grad(recomputed,
                        retained,
                        grad_outputs=grad_outputs,
                        enable_double_backprop=True)
def zero_centered_gradient_penalty_fake(fake, y):
    """Penalize the norm of d(fake)/d(y) towards zero, scaled by 10."""
    (dfake_dy,) = chainer.grad([fake], [y], enable_double_backprop=True)
    norms = F.sqrt(F.batch_l2_norm_squared(dfake_dy))
    target = call_zeros(norms)
    return 10 * F.mean_squared_error(norms, target)
def check_general_stride_backward(self, xp):
    """Expect TypeError from grad through an overlapping strided view."""
    dtype = self.dtype
    source = _stride_array(
        xp.arange(8, dtype=dtype), (3, 3), (-1, 2), 3)
    # [[3., 5., 7.], [2., 4., 6.], [1., 3., 5.]]
    source_var = Variable(source)
    view = as_strided(source_var, (3, 3), (1, 2), 0)
    # [[0., 2., 4.], [1., 3., 5.,], [2., 4., 6.]]
    view.grad = xp.ones(view.shape, dtype=dtype)
    with self.assertRaises(TypeError):
        (_source_grad,) = grad((view,), (source_var,))
def check_double_grad(self):
    """Differentiate the summed first-order gradients and compare.

    The first call to ``chainer.grad`` keeps the graph for double
    backprop; the sum of its results is differentiated again with respect
    to ``self.xs`` and checked against ``self.expected_double_grad()``.
    """
    self.forward()
    outputs = [getattr(self, name) for name in self.y_names]
    first_order = chainer.grad(outputs, self.xs, self.gys, self.gxs,
                               enable_double_backprop=True)
    total = sum(first_order)
    second_order = chainer.grad([total], self.xs)

    expected = self.expected_double_grad()
    self.assertEqual(len(second_order), len(expected))
    try:
        for got, want in zip(second_order, expected):
            got_value = self._get_value(got)
            want_value = self._get_value(want)
            testing.assert_allclose(got_value, want_value)
    except Exception:
        # Dump everything needed to diagnose the failure, then re-raise.
        self._print_inputs()
        self._print_variables('gxs ', first_order)
        self._print_variables('ggxs (actual) ', second_order)
        self._print_variables('ggxs (expected)', expected)
        raise
def test_length_check(self):
    """``chainer.grad`` must reject gradient lists of the wrong length."""
    x = chainer.Variable(numpy.array(3, numpy.float32))
    y = chainer.functions.identity(x)

    # (grad_outputs, grad_inputs) pairs whose lengths do not match the
    # single output / single input.
    mismatched = [
        ([], [None]),
        ([None, None], [None]),
        ([None], []),
        ([None], [None, None]),
    ]
    for grad_outputs, grad_inputs in mismatched:
        with self.assertRaises(ValueError):
            chainer.grad([y], [x], grad_outputs, grad_inputs)
def test_unchain_split(self):
    """Unchaining one output of ``split_axis`` must not affect the other."""
    x = chainer.Variable(numpy.arange(4).astype('f').reshape(2, 2))
    h0, h1 = chainer.functions.split_axis(x, [1], axis=0)
    y = chainer.functions.sum(h0)
    z = chainer.functions.sum(h1)
    w = y + z
    h0.unchain()

    def gradient(output, wrt):
        return chainer.grad([output], [wrt])[0]

    dy_dh0 = numpy.array([[1., 1.]])
    dz_dh1 = numpy.array([[1., 1.]])
    dy_dx = None
    dz_dx = numpy.array([[0., 0.], [1., 1.]])
    dw_dx = numpy.array([[0., 0.], [1., 1.]])

    testing.assert_allclose(gradient(y, h0).array, dy_dh0)
    testing.assert_allclose(gradient(z, h1).array, dz_dh1)
    # y no longer reaches x once h0 has been unchained.
    assert gradient(y, x) is dy_dx
    testing.assert_allclose(gradient(z, x).array, dz_dx)
    testing.assert_allclose(gradient(w, x).array, dw_dx)
def test_forward_no_cast_grad(self):
    """Two casts of the same input must receive separate gradients."""
    # This test would fail if F.cast does not create new function nodes for
    # no-op casts
    x = chainer.Variable(self.x)
    casts = [functions.cast(x, self.dtype), functions.cast(x, self.dtype)]
    z = casts[0] + casts[1]
    grads = chainer.grad([z], casts, [numpy.ones_like(z.data)])
    gy1, gy2 = grads

    for cast_grad in (gy1, gy2):
        assert cast_grad.dtype == self.dtype
    for cast_out, cast_grad in zip(casts, (gy1, gy2)):
        numpy.testing.assert_array_equal(
            cast_grad.data, numpy.ones_like(cast_out.data))
def test_retain_output(self):
    """Check first and second gradients through ``exp_and_expm1``.

    The first output is deleted before differentiating the second one.
    """
    xp = numpy
    x_array = xp.random.randn(3)
    y1_grad = xp.random.randn(3)
    x_grad_grad = xp.random.randn(3)

    x = chainer.Variable(x_array, name='x')
    y0, y1 = exp_and_expm1(x)
    del y0

    # (x: Variable) requires grad
    # (y1_grad: ndarray) does not require grad
    (first,) = chainer.grad(
        [y1], [x], [y1_grad], enable_double_backprop=True)

    # assert gx == exp(x) * y1_grad
    expected_first = xp.exp(x.array) * y1_grad
    xp.testing.assert_allclose(first.array, expected_first)

    (second,) = chainer.grad([first], [x], [x_grad_grad])
    expected_second = first.array * x_grad_grad
    xp.testing.assert_allclose(second.array, expected_second)
def check_general_stride_backward(self, xp):
    """Check the gradient through an overlapping strided view."""
    dtype = self.dtype
    source = _stride_array(
        xp.arange(8, dtype=dtype), (3, 3), (-1, 2), 3)
    # [[3., 5., 7.], [2., 4., 6.], [1., 3., 5.]]
    source_var = Variable(source)
    view = as_strided(source_var, (3, 3), (1, 2), 0)
    # [[0., 2., 4.], [1., 3., 5.,], [2., 4., 6.]]
    view.grad = xp.ones(view.shape, dtype=dtype)
    (source_grad,) = grad((view,), (source_var,))
    expected = xp.array([
        [0.5, 0.5, 0.],
        [2., 2., 1.],
        [1., 0.5, 0.5]
    ], dtype=dtype)
    testing.assert_allclose(source_grad.array, expected)
def _compute_backward(self, x, gamma, beta, y, gy):
    """Return the gradient arrays of ``y`` w.r.t. ``x``, ``gamma``, ``beta``.

    All arguments must be ``chainer.Variable`` instances. ``gy`` seeds the
    backward pass.
    """
    for argument in (x, gamma, beta, y, gy):
        assert isinstance(argument, chainer.Variable)

    if x.xp is chainerx:
        # TODO(niboshi): ChainerX does not support grad yet
        y.grad = gy.array.copy()
        y.backward()
        grads = (x.grad_var, gamma.grad_var, beta.grad_var)
    else:
        grads = chainer.grad([y], [x, gamma, beta], [gy])

    gx, ggamma, gbeta = grads
    return gx.array, ggamma.array, gbeta.array
def check_grad(self):
    """Compare ``chainer.grad`` against the expected gradients.

    The expected gradients are ``self.expected_grad()`` with the initial
    input gradients ``self.gxs`` added in place.
    """
    self.forward()
    outputs = [getattr(self, name) for name in self.y_names]
    actual = chainer.grad(outputs, self.xs, self.gys, self.gxs)

    expected = self.expected_grad()
    for index, initial_grad in enumerate(self.gxs):
        expected[index] += initial_grad

    self.assertEqual(len(actual), len(expected))
    try:
        for got, want in zip(actual, expected):
            got_value = self._get_value(got)
            want_value = self._get_value(want)
            testing.assert_allclose(got_value, want_value)
    except Exception:
        # Dump everything needed to diagnose the failure, then re-raise.
        self._print_inputs()
        self._print_variables('gxs (actual) ', actual)
        self._print_variables('gxs (expected)', expected)
        raise
def _sigmoid_derivative(x):
    """Return d(sigmoid(x))/dx as a variable that can be differentiated again."""
    activated = chainer.functions.sigmoid(x)
    (derivative,) = chainer.grad(
        [activated], [x], enable_double_backprop=True)
    return derivative
def test_all_called_with_grad(self):
    """Hook methods must be called when gradients come from ``chainer.grad``."""
    values = numpy.random.rand(2, 3).astype(numpy.float32)
    x = chainer.Variable(values)
    y = chainer.functions.sum(x * x)

    def run_grad():
        return chainer.grad([y], [x])

    self.check_hook_methods_called(run_grad)
def main():
    """Train a multi-task regression model and plot the training curves.

    Command-line options select the GPU, the number of iterations and the
    weighting mode. In ``grad_norm`` mode an extra loss on the task
    weights is backpropagated every iteration; in ``equal_weight`` mode
    the weight gradient is cleared and no extra loss is added.
    """
    parser = argparse.ArgumentParser(description='GradNorm')
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--n-iter', '-it', type=int, default=5000)
    parser.add_argument('--mode', '-m',
                        choices=('grad_norm', 'equal_weight'),
                        default='grad_norm')
    args = parser.parse_args()

    # Fixed seed so the synthetic data is reproducible.
    np.random.seed(123)
    sigmas = [1, 10]
    n_task = len(sigmas)
    epsilons = np.random.normal(
        scale=3.5, size=(n_task, 100, 250)).astype(np.float32)
    dataset = RegressionDataset(sigmas, epsilons)

    model = RegressionTrainChain(RegressionChain(n_task))
    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    optimizer = chainer.optimizers.Adam(alpha=1e-2)
    optimizer.setup(model)

    train_iter = chainer.iterators.SerialIterator(dataset, 200)

    xp = model.xp
    # Per-iteration history used for the plots at the end.
    weights = []
    task_losses = []
    loss_ratios = []
    final_layer_names = ['task_{}'.format(i) for i in range(n_task)]
    for t in range(args.n_iter):
        batch = train_iter.next()
        x, ts = chainer.dataset.convert.concat_examples(batch, device=args.gpu)

        task_loss = model(x, ts)
        weighted_task_loss = model.weight * task_loss
        if t == 0:
            # Reference losses for the loss ratios below.
            initial_task_loss = task_loss.data
        loss = F.mean(weighted_task_loss)

        model.cleargrads()
        loss.backward()

        # Ignore a gradient to the coefficient vector, which
        # is computed from the standard loss.
        model.weight.cleargrad()

        if args.mode == 'grad_norm':
            # Use |\nabla_W w_i * L_i | = w_i |\nabla_W L_i|
            gygw_norms = []
            for i, layer_name in enumerate(final_layer_names):
                l = getattr(model.model, layer_name)
                gygw = chainer.grad([task_loss[i]], [l.W])[0].data
                gygw_norms.append(xp.linalg.norm(gygw))
            gygw_norms = xp.stack(gygw_norms)
            norms = model.weight * gygw_norms

            alpha = 0.16
            # Targets are built from raw arrays, so only `norms` carries
            # a gradient back to the weights.
            mean_norm = xp.mean(norms.data)
            loss_ratio = task_loss.data / initial_task_loss
            inverse_train_rate = loss_ratio / xp.mean(loss_ratio)
            diff = norms - (inverse_train_rate ** alpha) * mean_norm
            grad_norm_loss = F.mean(F.absolute(diff))
            grad_norm_loss.backward()

            # For debugging purpose only
            # from chainer import computational_graph
            # import os
            # cg = computational_graph.build_computational_graph(
            #     [grad_norm_loss]).dump()
            # with open('grad_weight_loss_cg', 'w') as f:
            #     f.write(cg)

        optimizer.update()

        # Renormalize so that the weights sum to n_task.
        normalize_coeff = n_task / xp.sum(model.weight.data)
        model.weight.data[:] = model.weight.data * normalize_coeff

        # Record
        task_losses.append(chainer.backends.cuda.to_cpu(task_loss.data))
        loss_ratios.append(np.mean(task_losses[-1] / task_losses[0]))
        weights.append(chainer.backends.cuda.to_cpu(model.weight.data))

        if t % 100 == 0:
            print('{}/{}: loss_ratio={}, weights={} task_loss={}'.format(
                t, args.n_iter, loss_ratios[-1],
                model.weight.data, task_loss.data))

    task_losses = np.array(task_losses)
    weights = np.array(weights)

    # Four side-by-side panels: per-task losses, loss ratio, and weights.
    fig = plt.figure()
    ax1 = fig.add_subplot(1, 4, 1)
    ax1.set_title('loss (task 0)')
    ax2 = fig.add_subplot(1, 4, 2)
    ax2.set_title('loss (task 1)')
    ax3 = fig.add_subplot(1, 4, 3)
    ax3.set_title('sum of normalized losses')
    ax4 = fig.add_subplot(1, 4, 4)
    ax4.set_title('change of weights over time')

    ax1.plot(task_losses[:, 0])
    ax2.plot(task_losses[:, 1])
    ax3.plot(loss_ratios)
    ax4.plot(weights[:, 0])
    ax4.plot(weights[:, 1])
    plt.show()
def test_type_check(self):
    """``chainer.grad`` accepts lists and tuples but not bare variables."""
    def random_variable():
        data = numpy.random.uniform(-1, 1, (2, 3)).astype('f')
        return chainer.Variable(data)

    x = random_variable()
    y = x * x
    gx = random_variable()
    gy = random_variable()

    # Lists and tuples are both accepted.
    chainer.grad([y], [x], [gx], [gy])
    chainer.grad((y,), (x,), (gx,), (gy,))

    # Passing any single argument unwrapped must raise TypeError.
    bare_arguments = (y, x, gx, gy)
    for position, bare in enumerate(bare_arguments):
        arguments = [[y], [x], [gx], [gy]]
        arguments[position] = bare
        with self.assertRaises(TypeError):
            chainer.grad(*arguments)