Esempio n. 1
0
    def get_onehot_grad(self, xs, ys=None):
        """Return per-position scores ``sum(d loss / d embedding * embedding)``.

        Args:
            xs: Batch of inputs handed to ``self.encoder.get_grad``.
            ys: Optional labels, concatenated along axis 0 before the loss.
                When omitted, the model's own argmax predictions (computed
                with ``train`` disabled) are used instead.

        Returns:
            One score vector per input: the result of ``F.split_axis`` when
            the encoder returns a tuple of embeddings, otherwise a list of
            rows truncated to each input's length.
        """
        if ys is None:
            with chainer.using_config('train', False):
                predicted = self.predict(xs, argmax=True)
                predicted = F.expand_dims(predicted, axis=1)
                ys = [label for label in predicted]

        encodings, exs = self.encoder.get_grad(xs)
        loss = F.softmax_cross_entropy(
            self.output(encodings), F.concat(ys, axis=0))

        if not isinstance(exs, tuple):
            # Single padded embedding tensor.
            # (batch_size, n_dim, max_length, 1)
            padded_grad = chainer.grad([loss], [exs])[0]
            assert padded_grad.shape == exs.shape
            scores = F.squeeze(F.sum(padded_grad * exs, 1), 2)
            lengths = [len(x) for x in xs]
            return [row[:n] for row, n in zip(scores, lengths)]

        # Tuple of per-example embeddings: concatenate, score, split again.
        per_example_grads = chainer.grad([loss], exs)
        boundaries = np.cumsum([ex.shape[0] for ex in exs[:-1]])
        stacked = F.concat(exs, axis=0)
        stacked_grad = F.concat(per_example_grads, axis=0)
        scores = F.sum(stacked_grad * stacked, axis=1)
        return F.split_axis(scores, boundaries, axis=0)
Esempio n. 2
0
    def get_onehot_grad(self, xs, ys=None):
        """Return per-position scores for the second input (``xs[1]``).

        Both ``xs[0]`` and ``xs[1]`` are encoded and combined into the
        classifier features, but the gradient of the loss is taken only with
        respect to the embeddings of ``xs[1]``.

        Args:
            xs: Pair of batches; index 0 and index 1 are encoded separately.
            ys: Optional labels. When omitted, the model's argmax predictions
                (computed with ``train`` disabled) are used.

        Returns:
            One score vector per element of ``xs[1]``.
        """
        if ys is None:
            with chainer.using_config('train', False):
                ys = self.predict(xs, argmax=True)

        u, exs_prem = self.encoder.get_grad(xs[0])
        v, exs_hypo = self.encoder.get_grad(xs[1])
        features = F.concat((u, v, F.absolute(u - v), u * v), axis=1)
        logits = self.output(self.mlp(features, no_dropout=True))
        loss = F.softmax_cross_entropy(logits, ys)

        exs = exs_hypo
        lengths = [len(x) for x in xs[1]]

        if not isinstance(exs, tuple):
            # Single padded embedding tensor.
            # (batch_size, n_dim, max_length, 1)
            padded_grad = chainer.grad([loss], [exs])[0]
            assert padded_grad.shape == exs.shape
            scores = F.squeeze(F.sum(padded_grad * exs, 1), 2)
            return [row[:n] for row, n in zip(scores, lengths)]

        # Tuple of per-example embeddings: concatenate, score, split again.
        per_example_grads = chainer.grad([loss], exs)
        boundaries = np.cumsum([ex.shape[0] for ex in exs[:-1]])
        stacked = F.concat(exs, axis=0)
        stacked_grad = F.concat(per_example_grads, axis=0)
        scores = F.sum(stacked_grad * stacked, axis=1)
        return F.split_axis(scores, boundaries, axis=0)
Esempio n. 3
0
def test_unconditional_forward():
    """Check that a loss on one sample does not leak gradient along the batch axis."""
    width, height = 5, 7
    z_size = 2
    batch_size = 3

    config = ModelConfig(width=width,
                         height=height,
                         n_units_xyrz=3,
                         n_hidden_units=[5, 5],
                         z_size=z_size)
    model = CPPN(config)
    model.zerograds()

    # `inputs` maps a batch index to the (x, z) pair of that sample.
    x, z, inputs = gen_input_batch(batch_size, width, height, z_size)

    y = model.forward(x, z)

    # The loss only looks at the first image of the batch.
    t = get_dammy_output(batch_size, width, height)
    loss = F.mean_squared_error(y[0], t[0])

    def summed_grad(batch_index):
        # Total of all gradient entries w.r.t. the inputs of one sample.
        return sum([
            g.data.sum()
            for g in chainer.grad((loss, ), inputs[batch_index])
        ])

    # Only sample 0 may receive gradient; the others must stay at zero.
    assert summed_grad(0) != 0.0
    assert summed_grad(1) == 0.0
    assert summed_grad(2) == 0.0
Esempio n. 4
0
    def update_core(self):
        """Run one updater iteration: ``self.n_dis`` critic updates.

        The generator is updated once, during the first critic iteration
        (``i == 0``), using ``self.energy_distance``. Every iteration then
        updates the discriminator with the negated surrogate loss plus a
        gradient penalty weighted by ``self.lam``.

        Reports ``gen/loss``, ``critic/loss``, ``cramer distance``,
        ``critic/loss_grad`` and ``g`` through ``chainer.reporter``.
        """
        gen_optimizer = self.get_optimizer('gen')
        dis_optimizer = self.get_optimizer('dis')
        xp = self.gen.xp

        for i in range(self.n_dis):
            # Real batch and its discriminator features.
            batch = self.get_iterator('main').next()
            batchsize = len(batch)
            x_real = Variable(self.converter(batch, self.device))
            h_real = self.dis(x_real)

            # Two independent generated batches and their features.
            z = self.gen.make_hidden(batchsize)
            x_fake = self.gen(z)
            h_fake = self.dis(x_fake)

            z2 = self.gen.make_hidden(batchsize)
            x_fake2 = self.gen(z2)
            h_fake2 = self.dis(x_fake2)

            # Generator step happens only on the first critic iteration.
            if i == 0:
                loss_gen = self.energy_distance(h_real, h_fake, h_fake2)
                self.gen.cleargrads()
                loss_gen.backward()
                gen_optimizer.update()
                chainer.reporter.report({'gen/loss': loss_gen})
            # Cut the graph at the generated images so the critic update
            # below does not backpropagate into the generator.
            x_fake.unchain_backward()
            x_fake2.unchain_backward()

            critic_real = self.critic(h_real, h_fake2)
            critic_fake = self.critic(h_fake, h_fake2)

            loss_surrogate = F.mean(critic_real - critic_fake)

            # Random per-sample interpolation between real and fake images.
            # NOTE(review): this line uses `self.xp` while the rest of the
            # method uses the local `xp` (self.gen.xp) — confirm the updater
            # actually defines an `xp` attribute.
            eps = self.xp.random.uniform(0, 1, size=batchsize).astype("f")[
                :, None, None, None]
            x_mid = eps * x_real + (1.0 - eps) * x_fake

            # Fresh Variable holding only the data of dis(x_mid), i.e.
            # detached from the graph that produced it.
            h_mid = chainer.Variable(self.dis(x_mid).data)

            # Chain rule in two stages: d critic / d h_mid first, then use it
            # as the upstream gradient for d dis(x_mid) / d x_mid.
            base_grad, = chainer.grad([self.critic(h_mid, h_fake.data)], [
                                      h_mid], enable_double_backprop=True)
            grad, = chainer.grad([self.dis(x_mid)], [x_mid], grad_outputs=[
                                 base_grad], enable_double_backprop=True)
            # Per-sample L2 norm of the gradient, penalised towards 1.
            grad = F.sqrt(F.batch_l2_norm_squared(grad))
            loss_gp = self.lam * \
                F.mean_squared_error(grad, xp.ones_like(grad.data))

            # Two separate backward passes accumulate into the same grads.
            self.dis.cleargrads()
            (-loss_surrogate).backward()
            loss_gp.backward()
            dis_optimizer.update()

            chainer.reporter.report({'critic/loss': -loss_surrogate + loss_gp})
            chainer.reporter.report({"cramer distance": loss_surrogate})
            chainer.reporter.report({'critic/loss_grad': loss_gp})
            chainer.reporter.report({'g': F.mean(grad)})
Esempio n. 5
0
        def wrapper(self, structure, Rc, *params):
            """Generator yielding the values of ``func`` and their derivatives.

            Yields, in order:
              1. the stacked values ``G`` returned by ``func``;
              2. the stacked gradients of each ``g`` in ``G`` with respect to
                 the neighbor distance vectors, summed within each segment
                 given by ``j_indices``;
              3. the stacked second-order gradients, transposed with
                 ``(0, 2, 1, 3)``.

            Backprop is enabled for each stage only when ``self._order`` is
            large enough that a further derivative will be needed.

            Args:
                structure: Object providing ``get_neighbor_info``.
                Rc: Passed to ``func`` and to ``get_neighbor_info``
                    (presumably a cutoff radius — TODO confirm).
                *params: Extra positional arguments forwarded to ``func``.
            """
            # Stage 0: the function values themselves.
            differentiate_more = self._order > 0
            with chainer.using_config('enable_backprop', differentiate_more):
                G = func(self, structure, Rc, *params)
                yield F.stack([F.stack(g) for g in G])

            n_atom = len(G[0])
            # Collect the variables to differentiate against and the segment
            # boundaries used to reduce the gradients.
            r = []
            j_indices = []
            for r_, j_idx in structure.get_neighbor_info(
                    Rc, ['distance_vector', 'j_indices']):
                r.append(r_)
                j_indices.append(j_idx)

            # Stage 1: first derivatives.
            differentiate_more = self._order > 1
            with chainer.using_config('enable_backprop', differentiate_more):
                dG = []
                for g in G:
                    # chainer.grad needs backprop mode even when the
                    # surrounding config has disabled it.
                    with chainer.force_backprop_mode():
                        grad = chainer.grad(
                            g, r, enable_double_backprop=differentiate_more)
                    # Split each gradient at j_idx[1:], sum every piece over
                    # axis 0 and concatenate the sums.
                    dg = [
                        F.concat([
                            F.sum(dg_, axis=0)
                            for dg_ in F.split_axis(grad_, j_idx[1:], axis=0)
                        ],
                                 axis=0)
                        for grad_, j_idx in zip(grad, j_indices)
                    ]
                    dG.append(dg)
                yield F.stack([F.stack(dg) for dg in dG])

            # Stage 2: second derivatives, one pass per component index.
            differentiate_more = self._order > 2
            with chainer.using_config('enable_backprop', differentiate_more):
                d2G = []
                for dg in dG:
                    d2g = []
                    # 3 * n_atom components (presumably x/y/z per atom —
                    # TODO confirm).
                    for i in range(3 * n_atom):
                        with chainer.force_backprop_mode():
                            grad = chainer.grad(
                                [dg_[i] for dg_ in dg],
                                r,
                                enable_double_backprop=differentiate_more)
                        d2g_ = [
                            F.concat([
                                F.sum(d2g_, axis=0) for d2g_ in F.split_axis(
                                    grad_, j_idx[1:], axis=0)
                            ],
                                     axis=0)
                            for grad_, j_idx in zip(grad, j_indices)
                        ]
                        d2g.append(d2g_)
                    d2G.append(d2g)
                yield F.stack([
                    F.stack([F.stack(d2g_) for d2g_ in d2g]) for d2g in d2G
                ]).transpose(0, 2, 1, 3)
    def feed(self, x, label):
        """Compute a Grad-CAM++ map for a single input image.

        Args:
            x: list or array: Input image. Only one image is accepted; the
                result is resized to ``x[0].size``.
            label: int: Class label, passed to ``self.select_target``
                together with the predicted probabilities.
        Return:
            L_gcam: Grad-CAM++ result, with negative values zeroed and the
                remainder scaled by its maximum to the range 0..255, then
                resized with ``imresize``.
        """
        # feed forward
        activations = self.forward(x)

        # label selection
        prob = activations[self.prob_layer][0].data
        target_label = self.select_target(prob, label)

        # target loss: mask the probability output with the selected label
        target_prob = \
            chainer.Variable(target_label) * activations[self.prob_layer]

        # backward
        # self.backward(target_prob, enable_double_backprop=True)
        target_activation = activations[self.target_layer]
        label_index = target_label.argmax()
        coeff = self.xp.exp(target_prob[0][label_index].data)
        # first_grad = coeff * target_activation.grad_var
        # First, second and third order gradients w.r.t. the target layer;
        # double backprop must stay enabled so each can be differentiated.
        first_grad, = chainer.grad([coeff * target_prob], [target_activation],
                                   enable_double_backprop=True)
        second_grad, = chainer.grad([first_grad], [target_activation],
                                    enable_double_backprop=True)
        third_grad, = chainer.grad([second_grad], [target_activation],
                                   enable_double_backprop=True)
        # Sum of the activations over axes 2 and 3, reshaped so it
        # broadcasts against the per-channel gradient maps.
        global_sum = self.xp.sum(target_activation.data, axis=(2, 3))
        global_sum = global_sum.reshape(first_grad.data[0].shape[0], 1, 1)
        alpha_num = second_grad.data[0]
        alpha_denom = \
            2.0 * second_grad.data[0] + global_sum[0] * third_grad.data[0]
        # Replace zero denominators by one to avoid division by zero.
        alpha_denom = self.xp.where(alpha_denom != 0.0, alpha_denom,
                                    self.xp.ones(alpha_denom.shape))
        alphas = alpha_num / alpha_denom
        # Normalise the weights of each channel by their sum over axes 1, 2.
        alphas /= self.xp.sum(alphas, axis=(1, 2))[:, self.xp.newaxis,
                                                   self.xp.newaxis]
        # Channel importances: weighted sum of the positive first gradients.
        importances = self.xp.sum(
            alphas * self.xp.maximum(first_grad.data[0], 0),
            # alphas * first_grad.data[0],
            axis=(1, 2))

        # Importance-weighted combination of the activation channels.
        L_gcam = self.xp.tensordot(importances,
                                   target_activation.data[0],
                                   axes=(0, 0))
        # Zero out negatives, then scale by the maximum to 0..255.
        L_gcam = (L_gcam > 0.) * L_gcam / L_gcam.max() * 255.

        # resize
        L_gcam = imresize(L_gcam, x[0].size)

        return L_gcam
Esempio n. 7
0
    def test(self):
        """Compare the element-wise Huber quantile loss with a reference.

        For every (sample, quantile, target) triple the loss and its gradient
        w.r.t. the prediction must match the Huber loss scaled by ``tau`` when
        the prediction over-estimates the target and by ``1 - tau`` otherwise.
        """
        n_batch, n_tau, n_tau_prime = self.batch_size, self.N, self.N_prime
        delta = self.huber_loss_threshold

        # Overestimation is penalized proportionally to tau
        # Underestimation is penalized proportionally to (1-tau)
        y = np.random.normal(size=(n_batch, n_tau)).astype('f')
        y_var = chainer.Variable(y)
        t = np.random.normal(size=(n_batch, n_tau_prime)).astype('f')
        tau = np.random.uniform(size=(n_batch, n_tau)).astype('f')

        loss = iqn.compute_eltwise_huber_quantile_loss(
            y_var, t, tau, huber_loss_threshold=delta)
        y_var_b, t_b = F.broadcast(
            F.reshape(y_var, (n_batch, n_tau, 1)),
            F.reshape(t, (n_batch, 1, n_tau_prime)),
        )
        full_shape = (n_batch, n_tau, n_tau_prime)
        self.assertEqual(loss.shape, full_shape)
        huber_loss = F.huber_loss(y_var_b, t_b, delta=delta, reduce='no')
        self.assertEqual(huber_loss.shape, full_shape)

        for i in range(n_batch):
            for j in range(n_tau):
                for k in range(n_tau_prime):
                    actual = loss[i, j, k]
                    actual_grad = chainer.grad([actual], [y_var])[0][i, j]
                    # loss is always positive
                    self.assertGreater(actual.array, 0)

                    reference = huber_loss[i, j, k]
                    if y[i, j] > t[i, k]:
                        # over-estimate: Huber loss scaled by tau
                        expected = tau[i, j] * reference
                    else:
                        # under-estimate: Huber loss scaled by (1 - tau)
                        expected = (1 - tau[i, j]) * reference
                    expected_grad = chainer.grad([expected],
                                                 [y_var])[0][i, j]

                    self.assertAlmostEqual(
                        actual.array, expected.array, places=5)
                    self.assertAlmostEqual(
                        actual_grad.array, expected_grad.array, places=5)
Esempio n. 8
0
def compute_hessian(y, params):
    """Build the Hessian of ``y`` w.r.t. ``params`` one row at a time.

    The first-order gradients are flattened into one vector; each of its
    entries is then differentiated again to obtain one Hessian row.

    Returns:
        The rows collected into an array via ``np.asarray``.
    """
    first_order = chainer.grad([y], params, enable_double_backprop=True)
    flat_first_order = trpo._flatten_and_concat_variables(first_order)

    rows = []
    for index in range(len(flat_first_order)):
        second_order = chainer.grad([flat_first_order[index]], params)
        assert all(ggrad is not None for ggrad in second_order)
        rows.append(
            trpo._flatten_and_concat_ndarrays(
                [ggrad.data for ggrad in second_order]))
    return np.asarray(rows)
Esempio n. 9
0
    def _compute_kl_constrained_step(self, action_distrib, action_distrib_old,
                                     gain):
        """Compute a step of policy parameters with a KL constraint."""
        policy_params = _get_ordered_params(self.policy)
        kl = F.mean(action_distrib_old.kl(action_distrib))

        # Check if kl computation fully supports double backprop
        old_style_funcs = _find_old_style_function([kl])
        if old_style_funcs:
            raise RuntimeError("""\
Old-style functions (chainer.Function) are used to compute KL divergence.
Since TRPO requires second-order derivative of KL divergence, its computation
should be done with new-style functions (chainer.FunctionNode) only.

Found old-style functions: {}""".format(old_style_funcs))

        kl_grads = chainer.grad([kl],
                                policy_params,
                                enable_double_backprop=True)
        assert all(g is not None for g in kl_grads), "\
The gradient contains None. The policy may have unused parameters."

        flat_kl_grads = _flatten_and_concat_variables(kl_grads)

        def fisher_vector_product_func(vec):
            fvp = _hessian_vector_product(flat_kl_grads, policy_params, vec)
            return fvp + self.conjugate_gradient_damping * vec

        gain_grads = chainer.grad([gain], policy_params)
        assert all(g is not None for g in kl_grads), "\
The gradient contains None. The policy may have unused parameters."

        flat_gain_grads = _flatten_and_concat_ndarrays(gain_grads)
        step_direction = chainerrl.misc.conjugate_gradient(
            fisher_vector_product_func,
            flat_gain_grads,
            max_iter=self.conjugate_gradient_max_iter,
        )

        # We want a step size that satisfies KL(old|new) < max_kl.
        # Let d = alpha * step_direction be the actual parameter updates.
        # The second-order approximation of KL divergence is:
        #   KL(old|new) = 1/2 d^T I d + O(||d||^3),
        # where I is a Fisher information matrix.
        # Substitute d = alpha * step_direction and solve KL(old|new) = max_kl
        # for alpha to get the step size that tightly satisfies the constraint.

        dId = float(
            step_direction.dot(fisher_vector_product_func(step_direction)))
        scale = (2.0 * self.max_kl / (dId + 1e-8))**0.5
        return scale * step_direction
Esempio n. 10
0
def test_conditional_backward():
    """Check that a loss on sample 0 does not leak gradient along the batch axis.

    Sample 0 must receive non-zero gradient for x, z and the conditional
    input c; samples 1 and 2 must receive exactly zero.
    """
    width = 5
    height = 7
    z_size = 2
    batch_size = 3

    model = ConditionalCPPN(
        ConditionalModelConfig(width=width,
                               height=height,
                               n_units_xyr=3,
                               n_hidden_units=[
                                   10,
                                   10,
                               ],
                               z_size=z_size,
                               in_width=64,
                               in_height=64,
                               in_channel=1,
                               use_batch_norm=False))
    model.zerograds()

    # create inputs: inputs is dict whose key is batch index, and value is tuple of (x, z) for each index
    x, z, inputs = gen_input_batch(batch_size, width, height, z_size)
    c = chainer.Variable(get_dammy_input(batch_size, 64, 64,
                                         1))  # init dammy conditional input

    # forward prop
    y = model.forward(x, z, c)

    # taking loss at only first image
    t = get_dammy_output(batch_size, width, height)
    loss = F.mean_squared_error(y[0], t[0])

    g_x, g_z = chainer.grad((loss, ), inputs[0])
    g_c = chainer.grad((loss, ), (c, ))[0].data

    assert g_c[0].sum() != 0.0, "gradient of c is zero"
    assert g_x.data.sum() != 0.0, "gradient of x is zero"
    assert g_z.data.sum() != 0.0, "gradient of z is zero"

    # The remaining samples must not see any gradient; the messages now
    # describe the actual failure (a leak) instead of claiming "is zero".
    for index in (1, 2):
        g_x, g_z = chainer.grad((loss, ), inputs[index])
        assert g_c[index].sum() == 0.0, \
            f"gradient of c leaked into sample {index}"
        assert g_x.data.sum() == 0.0, \
            f"gradient of x leaked into sample {index}"
        assert g_z.data.sum() == 0.0, \
            f"gradient of z leaked into sample {index}"
Esempio n. 11
0
def adversarial_attack(cls,
                       x,
                       y=None,
                       steps=1,
                       loss_type='cross_entropy',
                       eps=2.0,
                       clip_x=True,
                       norm_type='L2',
                       alpha=None):
    """Run ``steps`` iterations of a projected gradient attack on ``x``.

    Each iteration moves ``x`` along the normalised loss gradient by
    ``alpha`` (defaulting to ``eps``), projects the result back relative to
    the original input with ``_projection`` and optionally clips it to
    [-1, 1].

    Passing ``y=None`` avoids label leaking: the labels are then taken from
    the classifier's own argmax on the first iteration and reused afterwards.
    """
    xp = cuda.get_array_module(x.array)
    step_size = eps if alpha is None else alpha

    x_start = copy.deepcopy(x)
    for _ in range(steps):
        logit = cls(x)
        if y is None:
            y = F.argmax(logit, 1)
        loss = loss_fun(logit, y, type=loss_type)
        loss_grad = chainer.grad([loss], [x])[0]
        direction = _normalize(loss_grad.array, xp, norm_type=norm_type)
        moved = x + step_size * direction
        projected = _projection(
            x_start.array, moved.array, eps, xp, norm_type=norm_type)
        x = Variable(projected)
        if clip_x:
            x = F.clip(x, -1., 1.)
    return x
Esempio n. 12
0
 def check_unstride_backward(self, xp):
     """Gradient through a flat view of a row-reversed array must be all ones."""
     source = xp.arange(12, dtype=self.dtype).reshape((3, 4))[::-1]
     var = Variable(source)
     flat_view = as_strided(var, (12,), (1,), 0)
     flat_view.grad = xp.ones((12,), dtype=self.dtype)
     gx, = grad((flat_view,), (var,))
     expected = xp.ones(source.shape, dtype=self.dtype)
     testing.assert_allclose(gx.array, expected)
Esempio n. 13
0
 def check_flip_backward(self, xp):
     """Gradient through a negatively-strided (flipped) view must be all ones."""
     source = xp.arange(4, dtype=self.dtype)
     var = Variable(source)
     flipped = as_strided(var, (4,), (-1,), 3)
     flipped.grad = xp.ones((4,), dtype=self.dtype)
     gx, = grad((flipped,), (var,))
     expected = xp.ones((4,), dtype=self.dtype)
     testing.assert_allclose(gx.array, expected)
Esempio n. 14
0
 def gradient_penalty(self, y: chainer.Variable, x: chainer.Variable):
     """Compute gradient penalty: (L2_norm(dy/dx) - 1)**2.

     The gradient of ``y`` w.r.t. ``x`` is taken with an all-ones upstream
     gradient and double backprop enabled; its norm over axes (1, 2, 3) is
     compared with one by mean squared error.
     """
     xp = self.xp
     upstream = Variable(xp.ones(y.shape, dtype='f'))
     dydx, = chainer.grad(outputs=[y],
                          inputs=[x],
                          grad_outputs=[upstream],
                          enable_double_backprop=True)
     norm = F.sqrt(F.sum(dydx * dydx, axis=(1, 2, 3)))
     target = xp.ones_like(norm.array)
     return F.mean_squared_error(norm, target)
Esempio n. 15
0
 def check_unstride_backward(self, xp):
     """Differentiating a flat view of a row-reversed array must raise TypeError."""
     source = xp.arange(12, dtype=self.dtype).reshape((3, 4))[::-1]
     var = Variable(source)
     flat_view = as_strided(var, (12,), (1,), 0)
     flat_view.grad = xp.ones((12,), dtype=self.dtype)
     with self.assertRaises(TypeError):
         _, = grad((flat_view,), (var,))
Esempio n. 16
0
 def check_flip_backward(self, xp):
     """Differentiating a negatively-strided view must raise TypeError."""
     source = xp.arange(4, dtype=self.dtype)
     var = Variable(source)
     flipped = as_strided(var, (4,), (-1,), 3)
     flipped.grad = xp.ones((4,), dtype=self.dtype)
     with self.assertRaises(TypeError):
         _, = grad((flipped,), (var,))
Esempio n. 17
0
def virtual_adversarial_attack(cls,
                               x,
                               steps=1,
                               loss_type='kl',
                               eps=2.0,
                               xi=1e-6,
                               logit=None,
                               clip_x=True):
    """Run ``steps`` iterations of a virtual adversarial attack on ``x``.

    Each iteration perturbs ``x`` by a small random direction scaled by
    ``xi``, takes the gradient of the loss between the reference ``logit``
    and the perturbed logits, moves ``x`` by ``eps`` along the normalised
    gradient, projects relative to the original input and optionally clips
    to [-1, 1]. When ``logit`` is omitted it is computed once from the
    initial ``x``.
    """
    xp = cuda.get_array_module(x.array)
    if logit is None:
        logit = cls(x)
    x_start = copy.deepcopy(x)
    for _ in range(steps):
        # Apply 1 step virtual adversarial attack and multiple projected gradient descent
        probe = _normalize(xp.random.normal(size=x.shape), xp)
        x_probe = x + xi * probe
        kl_loss = loss_fun(logit, cls(x_probe), type=loss_type)
        probe_grad = chainer.grad([kl_loss], [x_probe])[0]
        direction = _normalize(probe_grad.array, xp)
        moved = x + eps * direction
        x = Variable(_projection(x_start.array, moved.array, eps, xp))
        if clip_x:
            x = F.clip(x, -1., 1.)
    return x
Esempio n. 18
0
    def _gradient_penalty(self, discriminator, real_video, fake_video):
        """Compute the WGAN-GP gradient penalty on interpolated videos.

        See the algorithm on page 4 (lines 4-8) and equation (3) of
        https://arxiv.org/abs/1704.00028. A random interpolation between a
        real and a fake video is fed to the discriminator, and the norm of
        the critic's gradient with respect to that input is penalised for
        deviating from 1, which enforces the Lipschitz constraint.

        The penalty is also reported under the key ``gp``.
        """

        def l2norm(vec):
            # Euclidean norm over all non-batch axes. The epsilon keeps the
            # derivative of the square root finite close to zero.
            if vec.ndim <= 1:
                return abs(vec)
            return abs(F.sqrt(F.sum(vec * vec, axis=(1, 2, 3, 4)) + 1e-12))

        # One interpolation coefficient per sample, broadcast over the rest.
        xp = self.generator.xp
        mix = xp.random.uniform(
            low=0, high=1,
            size=(self.batch_size, 1, 1, 1, 1)).astype(xp.float32)
        interpolates = (1. - mix) * fake_video + mix * real_video

        # Critic output on the interpolates and its gradient w.r.t. them.
        critic_out = discriminator(interpolates)
        input_grad = chainer.grad(
            [critic_out], [interpolates], enable_double_backprop=True)[0]
        slopes = l2norm(input_grad)

        # Penalty coefficient is a hyperparameter, where 10 was found to be working best (eq. 7)
        per_sample = (self.penalty_coeff * (slopes - 1.) ** 2)[:, xp.newaxis]

        # Expected gradient penalty
        gradient_penalty = F.sum(per_sample) / self.batch_size

        chainer.report({'gp' : gradient_penalty})

        return gradient_penalty
Esempio n. 19
0
 def check_broadcast_backward(self, xp):
     """Differentiating a zero-stride (broadcast) view must raise TypeError."""
     source = xp.arange(12, dtype=self.dtype).reshape((3, 4)).copy()
     var = Variable(source)
     broadcasted = as_strided(var, (2, 3, 4), (0, 4, 1), 0)
     broadcasted.grad = xp.ones((2, 3, 4), dtype=self.dtype)
     with self.assertRaises(TypeError):
         _, = grad((broadcasted,), (var,))
Esempio n. 20
0
def wrm_attack(cls,
               x,
               y=None,
               steps=5.0,
               loss_type='cross_entropy',
               c_type='sqaure',
               gamma=1.,
               alpha=1.0,
               clip_x=True,
               return_phis=False):
    """Run gradient ascent on ``loss - gamma * cost`` starting from ``x``.

    Args:
        cls: Classifier; called on ``x`` and must expose ``xp``.
        x: Input variable to perturb.
        y: Labels. When ``None`` they are taken from the classifier's argmax
            on the first step and reused afterwards.
        steps: Number of ascent steps. The default is a float for backward
            compatibility, so the value is truncated to an int before use.
        loss_type: Forwarded to ``loss_fun``.
        c_type: Forwarded to ``cost_fun`` as ``type``.
            NOTE(review): the default 'sqaure' looks like a typo of
            'square'; it is left unchanged because the values accepted by
            ``cost_fun`` are not visible here.
        gamma: Weight of the transport cost; also divides the step size.
        alpha: Base step size; step ``t`` uses ``alpha / gamma / (t + 1)``.
        clip_x: Clip ``x`` to [-1, 1] after every step.
        return_phis: Also return the objective value of every step.

    Returns:
        The perturbed ``x``, or ``(x, phis)`` when ``return_phis`` is true.
    """
    xp = cls.xp
    x_org = copy.deepcopy(x)
    _alpha = alpha / gamma
    phis = []
    # range() rejects floats, and the default value of `steps` is 5.0.
    for t in range(int(steps)):
        logit = cls(x)
        if y is None:
            y = F.argmax(logit, axis=1)
        loss = loss_fun(logit, y, loss_type, reduce='sum')
        cost = cost_fun(x1=x, y1=y, x2=x_org, y2=y, type=c_type, reduce='sum')
        phi = loss - gamma * cost
        if return_phis:
            phis.append(phi.array)
        grad = chainer.grad([phi], [x])[0]
        # Step size decays as 1 / (t + 1).
        lr = _alpha / (t + 1)
        x = x + lr * grad.array
        if clip_x:
            x = F.clip(x, -1., 1.)
    if return_phis:
        return x, phis
    return x
Esempio n. 21
0
def _hessian_vector_product(flat_grads, params, vec):
    """Compute hessian vector product efficiently by backprop.

    Differentiates the scalar ``sum(flat_grads * vec)`` with respect to
    ``params`` and returns the result as one flat ndarray.
    """
    projection = F.sum(flat_grads * vec)
    hvp_vars = chainer.grad([projection], params)
    assert all(g is not None for g in hvp_vars),\
        "The Hessian-vector product contains None."
    return _flatten_and_concat_ndarrays([g.array for g in hvp_vars])
Esempio n. 22
0
 def check_flip_backward(self, xp):
     """Gradient through a negatively-strided (flipped) view must be all ones."""
     source = xp.arange(4, dtype=self.dtype)
     var = chainer.Variable(source)
     flipped = F.as_strided(var, (4, ), (-1, ), 3)
     flipped.grad = xp.ones((4, ), dtype=self.dtype)
     gx, = chainer.grad((flipped, ), (var, ))
     expected = xp.ones((4, ), dtype=self.dtype)
     testing.assert_allclose(gx.array, expected)
Esempio n. 23
0
 def check_broadcast_backward(self, xp):
     """Differentiating a zero-stride (broadcast) view must raise TypeError."""
     source = xp.arange(12, dtype=self.dtype).reshape((3, 4)).copy()
     var = chainer.Variable(source)
     broadcasted = F.as_strided(var, (2, 3, 4), (0, 4, 1), 0)
     broadcasted.grad = xp.ones((2, 3, 4), dtype=self.dtype)
     with self.assertRaises(TypeError):
         _, = chainer.grad((broadcasted, ), (var, ))
Esempio n. 24
0
 def check_flip_backward(self, xp):
     """Differentiating a negatively-strided view must raise TypeError."""
     source = xp.arange(4, dtype=self.dtype)
     var = chainer.Variable(source)
     flipped = F.as_strided(var, (4, ), (-1, ), 3)
     flipped.grad = xp.ones((4, ), dtype=self.dtype)
     with self.assertRaises(TypeError):
         _, = chainer.grad((flipped, ), (var, ))
Esempio n. 25
0
 def check(self, option, grads_before, grads_after):
     """Run ``chainer.grad`` on a fixed chain and verify results and side effects.

     A chain of eight variables is built from 0.5; the odd-indexed ones
     serve as ``x1, x2, y1, y2``. ``grads_before`` pre-populates ``grad_var``
     on the chain, ``option`` is forwarded to ``chainer.grad`` and
     ``grads_after`` gives the ``grad`` value expected on each variable
     afterwards (``None`` meaning unset).
     """
     chain = []
     v = self._var(0.5)
     for _ in range(4):
         chain.append(v)
         v += v
         chain.append(v)
         v *= 1.
     x1, x2, y1, y2 = chain[1::2]

     gx1, gx2 = self._var(1000.), self._var(100.)
     gy1, gy2 = self._var(10.), self._var(1.)

     for var, initial in zip(chain, grads_before):
         if initial is not None:
             var.grad_var = self._var(initial)

     grads = chainer.grad(
         [y1, y2], [x1, x2], [gy1, gy2], [gx1, gx2], **option)
     numpy.testing.assert_allclose(grads[0].array, 1248.)
     numpy.testing.assert_allclose(grads[1].array, 124.)

     for var, expected in zip(chain, grads_after):
         if expected is not None:
             numpy.testing.assert_allclose(var.grad, expected)
         else:
             self.assertIsNone(var.grad)
def langevin(batchsize, gen, dis, y_fake, eval=False, given_z=None):
    """Refine latent codes with noisy gradient steps on the discriminator output.

    Args:
        batchsize: Number of samples; forwarded to ``gen`` and the sampler.
        gen: Generator; provides ``dim_z``, ``distribution`` and ``xp``.
        dis: Discriminator whose output, times ``args.temperature``, is the
            energy that is differentiated with respect to ``z``.
        y_fake: Labels forwarded to both ``gen`` and ``dis``.
        eval: Use the ``args.eval_*`` hyper-parameters instead of the
            training ones.
        given_z: Optional starting latent variable; sampled when ``None``.

    Returns:
        ``(x_fake, z)``: the final generated batch and its latent variable.
    """
    if eval:
        Step_lr = args.eval_step_lr
        num_steps = args.eval_num_steps
        Noise_scale = args.eval_noise_scale
    else:
        Step_lr = args.step_lr
        num_steps = args.num_steps
        Noise_scale = args.noise_scale
    # Use the generator's array module everywhere so the noise lives on the
    # same backend as z (the original hard-coded cupy for the noise).
    xp = gen.xp
    if given_z is None:
        z = sample_continuous(gen.dim_z,
                              batchsize,
                              distribution=gen.distribution,
                              xp=xp)
        z = chainer.Variable(z)
    else:
        z = given_z
    x_fake = gen(batchsize, z=z, y=y_fake)
    for step in range(num_steps):
        energy = dis(x_fake, y=y_fake) * args.temperature
        z_grad = chainer.grad(outputs=[energy], inputs=[z])[0]
        if args.anealing:
            # Decay step size and noise by 10x every fifth of the schedule.
            decay = 0.1**(step // (num_steps / 5))
            step_lr = Step_lr * decay
            noise_scale = Noise_scale * decay
        else:
            step_lr = Step_lr
            noise_scale = Noise_scale
        z_grad_noise = step_lr/2*z_grad + \
            (step_lr**0.5)*xp.random.normal(size=z.shape, loc=0.0, scale=noise_scale)
        z = z + z_grad_noise
        # Drop the history so the graph does not grow across steps.
        z.unchain_backward()
        x_fake = gen(batchsize, z=z, y=y_fake)
    return x_fake, z
Esempio n. 27
0
 def check(self, option, grads_before, grads_after):
     """Run ``chainer.grad`` on a fixed chain and verify results and side effects.

     A chain of eight variables is built from 0.5; the odd-indexed ones
     serve as ``x1, x2, y1, y2``. ``grads_before`` pre-populates ``grad_var``
     on the chain, ``option`` is forwarded to ``chainer.grad`` and
     ``grads_after`` gives the ``grad`` value expected on each variable
     afterwards (``None`` meaning unset).
     """
     chain = []
     v = self._var(0.5)
     for _ in range(4):
         chain.append(v)
         v += v
         chain.append(v)
         v *= 1.
     x1, x2, y1, y2 = chain[1::2]

     gx1, gx2 = self._var(1000.), self._var(100.)
     gy1, gy2 = self._var(10.), self._var(1.)

     for var, initial in zip(chain, grads_before):
         if initial is not None:
             var.grad_var = self._var(initial)

     grads = chainer.grad([y1, y2], [x1, x2], [gy1, gy2], [gx1, gx2],
                          **option)
     numpy.testing.assert_allclose(grads[0].array, 1248.)
     numpy.testing.assert_allclose(grads[1].array, 124.)

     for var, expected in zip(chain, grads_after):
         if expected is not None:
             numpy.testing.assert_allclose(var.grad, expected)
         else:
             self.assertIsNone(var.grad)
Esempio n. 28
0
    def check_grad(self):
        """Compare ``chainer.grad`` output against ``self.expected_grad()``.

        Runs ``self.forward()``, collects the outputs named in
        ``self.y_names``, calls ``chainer.grad`` with ``self.gys``,
        ``self.gxs`` and ``self.loss_scale``, and asserts element-wise
        closeness with the expected gradients plus ``self.gxs``. On any
        failure the inputs and both gradient lists are printed before the
        exception is re-raised.
        """
        self.forward()
        ys = [getattr(self, name) for name in self.y_names]
        if self.extend_graph_y:
            # Creates extra nodes downstream of the outputs; the list is kept
            # on self while `ys` itself is passed to chainer.grad unchanged.
            self._ys = [v * 1. for v in ys]

        # graph_x extension should be done here
        # to avoid chainer/chainerx mixed graph
        if self.extend_graph_x:
            for v in self.xs:
                v *= 1.

        gxs = chainer.grad(ys,
                           self.xs,
                           self.gys,
                           self.gxs,
                           loss_scale=self.loss_scale)

        # The initial input gradients are added on top of the expected values.
        expected = self.expected_grad()
        for i, gx in enumerate(self.gxs):
            expected[i] += gx

        self.assertEqual(len(gxs), len(expected))
        try:
            for a, e in zip(gxs, expected):
                testing.assert_allclose(self._get_value(a), self._get_value(e))
        except Exception:
            # Dump everything useful for debugging, then re-raise unchanged.
            self._print_inputs()
            self._print_variables('gxs (actual)  ', gxs)
            self._print_variables('gxs (expected)', expected)
            raise
Esempio n. 29
0
 def path_length(ws, x, mask):
     """Return, per sample, the root of the mean squared gradient norm.

     The gradient of ``x * mask`` is taken with respect to every element of
     ``ws`` (double backprop enabled); the squared L2 norm is computed per
     (sample, level) and averaged over the levels before the square root.
     """
     levels = len(ws)
     batch, size = ws[0].shape
     per_level = grad([x * mask], ws, enable_double_backprop=True)
     # (levels, batch, size) -> (batch, levels, size) -> rows of length size
     rows = stack(per_level).transpose(1, 0, 2).reshape(batch * levels, size)
     squared_norms = batch_l2_norm_squared(rows).reshape(batch, levels)
     return sqrt(mean(squared_norms, axis=1))
Esempio n. 30
0
 def check_unstride_backward(self, xp):
     """Gradient through a flat view of a row-reversed array must be all ones."""
     source = xp.arange(12, dtype=self.dtype).reshape((3, 4))[::-1]
     var = chainer.Variable(source)
     flat_view = F.as_strided(var, (12, ), (1, ), 0)
     flat_view.grad = xp.ones((12, ), dtype=self.dtype)
     gx, = chainer.grad((flat_view, ), (var, ))
     expected = xp.ones(source.shape, dtype=self.dtype)
     testing.assert_allclose(gx.array, expected)
Esempio n. 31
0
 def check_unstride_backward(self, xp):
     """Differentiating a flat view of a row-reversed array must raise TypeError."""
     source = xp.arange(12, dtype=self.dtype).reshape((3, 4))[::-1]
     var = chainer.Variable(source)
     flat_view = F.as_strided(var, (12, ), (1, ), 0)
     flat_view.grad = xp.ones((12, ), dtype=self.dtype)
     with self.assertRaises(TypeError):
         _, = chainer.grad((flat_view, ), (var, ))
Esempio n. 32
0
    def update_core(self):
        """Run one WGAN-GP iteration: several critic updates, one generator update.

        The critic is updated 100 times per call during the first 50
        iterations and 5 times afterwards. Each critic loss is the sum of
        the real term, the fake term and a gradient penalty weighted by
        ``self.lam``.
        """
        def _step(optimizer, loss):
            # Clear gradients, backpropagate the loss, apply the update.
            optimizer.target.cleargrads()
            loss.backward()
            optimizer.update()

        xp = self.generator.xp
        n_critic = 100 if self.iteration < 50 else 5

        # update critic n_critic times
        for _ in range(n_critic):
            # real image
            x_real = self.next_batch(self.x)
            loss_real = -F.sum(self.critic(x_real)) / self.batchsize

            # fake image
            z = self.next_batch(self.z)
            x_fake = self.generator(z)
            loss_fake = F.sum(self.critic(x_fake)) / self.batchsize

            # Keep the critic update from reaching the generator.
            x_fake.unchain_backward()

            # gp: penalise the gradient norm at random interpolates
            eps = xp.random.uniform(0, 1, size=self.batchsize)
            eps = eps.astype("f")[:, None, None, None]
            x_mid = eps * x_real + (1.0 - eps) * x_fake
            y_mid = self.critic(x_mid)
            mid_grad, = chainer.grad(
                [y_mid], [x_mid], enable_double_backprop=True)
            grad_norm = F.sqrt(F.batch_l2_norm_squared(mid_grad))
            loss_gp = self.lam * F.mean_squared_error(
                grad_norm, xp.ones_like(grad_norm.data))

            # compute loss
            critic_loss = loss_real + loss_fake + loss_gp

            # update critic
            _step(self.optimizer_critic, critic_loss)

            chainer.reporter.report({
                'critic/loss/real': loss_real,
                'critic/loss/fake': loss_fake,
                'critic/loss/gp': loss_gp,
                'critic/loss': critic_loss,
                'wasserstein': -loss_real - loss_fake,
            })

        # update generator 1 time
        z = self.next_batch(self.z)
        gen_loss = -F.sum(self.critic(self.generator(z))) / self.batchsize
        _step(self.optimizer_generator, gen_loss)
        chainer.report({'generator/loss': gen_loss})
Esempio n. 33
0
def c_ctc(yhat, y):
    """Evaluate ``ctc`` column by column and return the loss with its gradient.

    Every column ``yhat[:, i]`` is given a leading axis of length one and
    wrapped in its own ``chainer.Variable``; the resulting list is passed to
    ``ctc`` together with ``y`` and the constant ``61`` (presumably the blank
    symbol index -- TODO confirm against the definition of ``ctc``).

    Returns:
        tuple: ``(loss.data, g)`` where ``g`` is the vertically stacked
        per-column gradients, transposed.
    """
    column_vars = []
    for col in range(yhat.shape[1]):
        column = np.expand_dims(yhat[:, col], 0)
        column_vars.append(chainer.Variable(column))

    loss = ctc(column_vars, y, 61)
    column_grads = chainer.grad([loss], column_vars)

    # One row per column variable, then transpose.
    stacked = np.vstack([column_grad.data for column_grad in column_grads])
    return loss.data, stacked.T
Esempio n. 34
0
 def check_broadcast_backward(self, xp):
     """Check the gradient of ``F.as_strided`` for a broadcasting view.

     A ``(3, 4)`` source is viewed as ``(2, 3, 4)`` with strides
     ``(0, 4, 1)`` and storage offset ``0``.  With an all-ones upstream
     gradient, the gradient reaching the source is asserted to be ``2``
     everywhere.
     """
     view_shape = (2, 3, 4)
     source_array = xp.arange(12, dtype=self.dtype).reshape((3, 4)).copy()
     source = chainer.Variable(source_array)

     view = F.as_strided(source, view_shape, (0, 4, 1), 0)
     view.grad = xp.ones(view_shape, dtype=self.dtype)

     (gx,) = chainer.grad((view,), (source,))
     expected = xp.ones(source_array.shape, dtype=self.dtype) * 2
     testing.assert_allclose(gx.array, expected)
Esempio n. 35
0
 def check_broadcast_backward(self, xp):
     """Verify ``as_strided`` backward when the view repeats its source.

     The ``(3, 4)`` input is exposed as a ``(2, 3, 4)`` view using strides
     ``(0, 4, 1)``; backpropagating ones through that view must yield a
     gradient of twos with the input's shape.
     """
     data = xp.arange(12, dtype=self.dtype).reshape((3, 4)).copy()
     # Expected result: an array of twos shaped like the input.
     twos = xp.ones(data.shape, dtype=self.dtype) * 2

     var = Variable(data)
     strided = as_strided(var, (2, 3, 4), (0, 4, 1), 0)
     strided.grad = xp.ones((2, 3, 4), dtype=self.dtype)

     (gx,) = grad((strided,), (var,))
     testing.assert_allclose(gx.array, twos)
Esempio n. 36
0
 def backward(self, indexes, grad_outputs):
     """Recompute the forward pass and differentiate it with ``chainer.grad``.

     The retained inputs are fed through ``self.func`` again (via
     ``_call_func``) with backprop mode forced on, and the gradients of the
     recomputed outputs with respect to those inputs are returned.
     ``indexes`` is not consulted: gradients for all retained inputs are
     requested.
     """
     retained = self.get_retained_inputs()
     with function.force_backprop_mode():
         recomputed = _call_func(self.func, retained)
     # Double backprop is enabled so the returned gradients can themselves
     # be differentiated.
     input_grads = chainer.grad(
         recomputed,
         retained,
         grad_outputs=grad_outputs,
         enable_double_backprop=True,
     )
     return input_grads
Esempio n. 37
0
    def zero_centered_gradient_penalty_fake(fake, y):
        """Return a penalty pulling the gradient norm of ``fake`` w.r.t. ``y`` to zero.

        The gradient is taken with double backprop enabled, reduced to a
        per-sample L2 norm, and compared against the output of
        ``call_zeros`` (presumably zeros shaped like the norms -- TODO
        confirm) with mean squared error scaled by 10.
        """
        (raw_grad,) = chainer.grad([fake], [y], enable_double_backprop=True)
        grad_norm = F.sqrt(F.batch_l2_norm_squared(raw_grad))
        target = call_zeros(grad_norm)
        return 10 * F.mean_squared_error(grad_norm, target)
Esempio n. 38
0
 def check_general_stride_backward(self, xp):
     """Expect ``grad`` to raise ``TypeError`` for this strided view.

     The input is built by ``_stride_array`` and then re-viewed with
     ``as_strided``; the example values in the comments below are carried
     over from the original test.
     """
     base = xp.arange(8, dtype=self.dtype)
     strided_input = _stride_array(base, (3, 3), (-1, 2), 3)
     # strided_input: [[3., 5., 7.], [2., 4., 6.], [1., 3., 5.]]
     var = Variable(strided_input)

     view = as_strided(var, (3, 3), (1, 2), 0)
     # view: [[0., 2., 4.], [1., 3., 5.], [2., 4., 6.]]
     view.grad = xp.ones(view.shape, dtype=self.dtype)

     with self.assertRaises(TypeError):
         (gx,) = grad((view,), (var,))
Esempio n. 39
0
    def check_double_grad(self):
        """Compare second-order gradients against ``expected_double_grad``.

        First-order gradients are computed with double backprop enabled,
        summed, and differentiated again with respect to ``self.xs``.  On
        any failure during the element-wise comparison the inputs and the
        involved gradients are printed before the exception is re-raised.
        """
        self.forward()
        outputs = []
        for y_name in self.y_names:
            outputs.append(getattr(self, y_name))

        gxs = chainer.grad(
            outputs, self.xs, self.gys, self.gxs,
            enable_double_backprop=True)
        total = sum(gxs)
        ggxs = chainer.grad([total], self.xs)

        expected = self.expected_double_grad()
        self.assertEqual(len(ggxs), len(expected))
        try:
            for actual_var, expected_var in zip(ggxs, expected):
                actual_value = self._get_value(actual_var)
                expected_value = self._get_value(expected_var)
                testing.assert_allclose(actual_value, expected_value)
        except Exception:
            # Dump everything needed to diagnose the mismatch, then let
            # the original exception propagate.
            self._print_inputs()
            for label, variables in (
                    ('gxs            ', gxs),
                    ('ggxs (actual)  ', ggxs),
                    ('ggxs (expected)', expected)):
                self._print_variables(label, variables)
            raise
Esempio n. 40
0
    def test_length_check(self):
        """``chainer.grad`` must reject gradient lists of the wrong length.

        With one output and one input, each of the third and fourth
        arguments is tried with zero and with two entries; every such call
        is expected to raise ``ValueError``.
        """
        x = chainer.Variable(numpy.array(3, numpy.float32))
        y = chainer.functions.identity(x)

        # (third argument, fourth argument) pairs with a length mismatch.
        mismatched = (
            ([], [None]),
            ([None, None], [None]),
            ([None], []),
            ([None], [None, None]),
        )
        for third, fourth in mismatched:
            with self.assertRaises(ValueError):
                chainer.grad([y], [x], third, fourth)
Esempio n. 41
0
    def test_unchain_split(self):
        """Check gradients after unchaining one output of ``split_axis``.

        ``x`` is split into two rows; the first row is unchained after the
        graph is built.  Gradients with respect to the split outputs stay
        available, the gradient of ``y`` with respect to ``x`` is ``None``,
        and gradients reaching ``x`` only carry the second row's
        contribution.
        """
        def single_grad(output, wrt):
            # Gradient of one output with respect to one variable.
            return chainer.grad([output], [wrt])[0]

        x = chainer.Variable(numpy.arange(4).astype('f').reshape(2, 2))
        first_row, second_row = chainer.functions.split_axis(
            x, [1], axis=0)
        y = chainer.functions.sum(first_row)
        z = chainer.functions.sum(second_row)
        w = y + z
        first_row.unchain()

        expected_dy_dh0 = numpy.array([[1., 1.]])
        expected_dz_dh1 = numpy.array([[1., 1.]])
        expected_dz_dx = numpy.array([[0., 0.], [1., 1.]])
        expected_dw_dx = numpy.array([[0., 0.], [1., 1.]])

        testing.assert_allclose(
            single_grad(y, first_row).array, expected_dy_dh0)
        testing.assert_allclose(
            single_grad(z, second_row).array, expected_dz_dh1)
        assert single_grad(y, x) is None
        testing.assert_allclose(single_grad(z, x).array, expected_dz_dx)
        testing.assert_allclose(single_grad(w, x).array, expected_dw_dx)
Esempio n. 42
0
 def test_forward_no_cast_grad(self):
     """Gradients must reach both results of casting ``x`` to ``self.dtype``.

     Per the original note, this test would fail if ``F.cast`` did not
     create new function nodes for no-op casts.
     """
     x = chainer.Variable(self.x)
     casts = [functions.cast(x, self.dtype) for _ in range(2)]
     total = casts[0] + casts[1]

     upstream = numpy.ones_like(total.data)
     cast_grads = chainer.grad([total], casts, [upstream])
     gy1, gy2 = cast_grads

     # Check dtypes first, then values, in the same order for both casts.
     for cast_grad in (gy1, gy2):
         assert cast_grad.dtype == self.dtype
     for cast_grad, cast_out in zip((gy1, gy2), casts):
         numpy.testing.assert_array_equal(
             cast_grad.data, numpy.ones_like(cast_out.data))
Esempio n. 43
0
    def test_retain_output(self):
        """Check first- and second-order gradients through ``exp_and_expm1``.

        Only the second output is kept; the first is deleted before any
        gradient is requested.  The first-order gradient must equal
        ``exp(x) * y1_grad`` and differentiating it again, seeded with
        ``x_grad_grad``, must equal the first-order gradient times the seed.
        """
        xp = numpy
        # Random draws happen in this fixed order: input, upstream
        # gradient, second-order seed.
        x_array = xp.random.randn(3)
        y1_grad = xp.random.randn(3)
        x_grad_grad = xp.random.randn(3)

        x = chainer.Variable(x_array, name='x')
        y0, y1 = exp_and_expm1(x)
        # Drop the local reference to the first output before
        # differentiating.
        del y0

        # x is a Variable and requires grad; y1_grad is a plain ndarray
        # and does not.
        first_order = chainer.grad(
            [y1], [x], [y1_grad], enable_double_backprop=True)
        (gx,) = first_order

        # gx == exp(x) * y1_grad
        expected_first = xp.exp(x.array) * y1_grad
        xp.testing.assert_allclose(gx.array, expected_first)

        (gx_,) = chainer.grad([gx], [x], [x_grad_grad])
        expected_second = gx.array * x_grad_grad
        xp.testing.assert_allclose(gx_.array, expected_second)
Esempio n. 44
0
 def check_general_stride_backward(self, xp):
     """Check ``as_strided`` backward values for a generally strided input.

     The input comes from ``_stride_array`` and is re-viewed with
     ``as_strided``; an all-ones upstream gradient must produce the
     hard-coded expected gradient below.  The example values in the
     comments are carried over from the original test.
     """
     base = xp.arange(8, dtype=self.dtype)
     strided_input = _stride_array(base, (3, 3), (-1, 2), 3)
     # strided_input: [[3., 5., 7.], [2., 4., 6.], [1., 3., 5.]]
     var = Variable(strided_input)

     view = as_strided(var, (3, 3), (1, 2), 0)
     # view: [[0., 2., 4.], [1., 3., 5.], [2., 4., 6.]]
     view.grad = xp.ones(view.shape, dtype=self.dtype)

     (gx,) = grad((view,), (var,))
     expected = xp.array(
         [
             [0.5, 0.5, 0.],
             [2., 2., 1.],
             [1., 0.5, 0.5],
         ],
         dtype=self.dtype)
     testing.assert_allclose(gx.array, expected)
    def _compute_backward(self, x, gamma, beta, y, gy):
        """Return the gradient arrays of ``y`` w.r.t. ``x``, ``gamma``, ``beta``.

        All five arguments must be ``chainer.Variable`` instances.  When
        ``x`` lives on ChainerX the gradients are obtained through
        ``y.backward()``; otherwise ``chainer.grad`` is used with ``gy`` as
        the upstream gradient.

        Returns:
            tuple: ``(gx.array, ggamma.array, gbeta.array)``.
        """
        for argument in (x, gamma, beta, y, gy):
            assert isinstance(argument, chainer.Variable)

        if x.xp is not chainerx:
            grads = chainer.grad([y], [x, gamma, beta], [gy])
            gx, ggamma, gbeta = grads
        else:
            # TODO(niboshi): ChainerX does not support grad yet
            y.grad = gy.array.copy()
            y.backward()
            gx, ggamma, gbeta = x.grad_var, gamma.grad_var, beta.grad_var
        return gx.array, ggamma.array, gbeta.array
Esempio n. 46
0
    def check_grad(self):
        """Compare ``chainer.grad`` results against ``expected_grad``.

        The gradients passed in as ``self.gxs`` are added to the expected
        values before comparison.  On any failure during the element-wise
        comparison the inputs and both gradient sets are printed before
        the exception is re-raised.
        """
        self.forward()
        outputs = []
        for y_name in self.y_names:
            outputs.append(getattr(self, y_name))
        gxs = chainer.grad(outputs, self.xs, self.gys, self.gxs)

        expected = self.expected_grad()
        # Fold the supplied input gradients into the expectation.
        for index, initial_gx in enumerate(self.gxs):
            expected[index] += initial_gx

        self.assertEqual(len(gxs), len(expected))
        try:
            for actual_var, expected_var in zip(gxs, expected):
                actual_value = self._get_value(actual_var)
                expected_value = self._get_value(expected_var)
                testing.assert_allclose(actual_value, expected_value)
        except Exception:
            # Dump diagnostics, then let the original exception propagate.
            self._print_inputs()
            for label, variables in (
                    ('gxs (actual)  ', gxs),
                    ('gxs (expected)', expected)):
                self._print_variables(label, variables)
            raise
Esempio n. 47
0
def _sigmoid_derivative(x):
    """Return the gradient of ``sigmoid(x)`` with respect to ``x``.

    The gradient is computed by ``chainer.grad`` with double backprop
    enabled, so the result can itself be differentiated.
    """
    activated = chainer.functions.sigmoid(x)
    grads = chainer.grad([activated], [x], enable_double_backprop=True)
    return grads[0]
Esempio n. 48
0
 def test_all_called_with_grad(self):
     """Run ``check_hook_methods_called`` on a ``chainer.grad`` call.

     The differentiated graph is the sum of squares of a random
     ``(2, 3)`` float32 variable.
     """
     values = numpy.random.rand(2, 3).astype(numpy.float32)
     x = chainer.Variable(values)
     y = chainer.functions.sum(x * x)

     def run_grad():
         return chainer.grad([y], [x])

     self.check_hook_methods_called(run_grad)
Esempio n. 49
0
def main():
    """Run the GradNorm toy regression experiment and plot the results.

    Command-line options:
        --gpu / -g: device id passed to Chainer (negative keeps the model
            on CPU).
        --n-iter / -it: number of training iterations.
        --mode / -m: ``grad_norm`` adds an auxiliary loss whose gradient
            drives ``model.weight``; ``equal_weight`` skips it, so the
            weight receives no gradient from this loop.

    After training, four plots are shown: the loss of each of the two
    tasks, the mean loss ratio relative to the first iteration, and the
    task weights over time.
    """
    parser = argparse.ArgumentParser(description='GradNorm')
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--n-iter', '-it', type=int, default=5000)
    parser.add_argument('--mode', '-m', choices=('grad_norm', 'equal_weight'),
                        default='grad_norm')
    args = parser.parse_args()

    np.random.seed(123)
    sigmas = [1, 10]
    n_task = len(sigmas)
    epsilons = np.random.normal(
        scale=3.5, size=(n_task, 100, 250)).astype(np.float32)
    dataset = RegressionDataset(sigmas, epsilons)

    model = RegressionTrainChain(RegressionChain(n_task))

    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    optimizer = chainer.optimizers.Adam(alpha=1e-2)
    optimizer.setup(model)

    train_iter = chainer.iterators.SerialIterator(dataset, 200)

    xp = model.xp
    weights = []
    task_losses = []
    loss_ratios = []
    final_layer_names = ['task_{}'.format(i) for i in range(n_task)]
    for t in range(args.n_iter):
        batch = train_iter.next()
        x, ts = chainer.dataset.convert.concat_examples(batch, device=args.gpu)

        task_loss = model(x, ts)
        weighted_task_loss = model.weight * task_loss
        if t == 0:
            # Reference point for the per-task loss ratios below.
            initial_task_loss = task_loss.data
        loss = F.mean(weighted_task_loss)
        model.cleargrads()
        loss.backward()
        # Ignore a gradient to the coefficient vector, which
        # is computed from the standard loss.
        model.weight.cleargrad()
        if args.mode == 'grad_norm':
            # Use |\nabla_W w_i * L_i | = w_i |\nabla_W L_i|
            gygw_norms = []
            for i, layer_name in enumerate(final_layer_names):
                layer = getattr(model.model, layer_name)
                gygw = chainer.grad([task_loss[i]], [layer.W])[0].data
                gygw_norms.append(xp.linalg.norm(gygw))
            gygw_norms = xp.stack(gygw_norms)
            norms = model.weight * gygw_norms

            alpha = 0.16
            # ``.data`` is used so the target below is a plain array and
            # only ``norms`` carries gradient back to ``model.weight``.
            mean_norm = xp.mean(norms.data)
            loss_ratio = task_loss.data / initial_task_loss
            inverse_train_rate = loss_ratio / xp.mean(loss_ratio)

            diff = norms - (inverse_train_rate ** alpha) * mean_norm
            grad_norm_loss = F.mean(F.absolute(diff))
            grad_norm_loss.backward()

            # For debugging purpose only
            # from chainer import computational_graph
            # import os
            # cg = computational_graph.build_computational_graph(
            #     [grad_norm_loss]).dump()
            # with open('grad_weight_loss_cg', 'w') as f:
            #     f.write(cg)

        optimizer.update()

        # Renormalize so the task weights sum to n_task.
        normalize_coeff = n_task / xp.sum(model.weight.data)
        model.weight.data[:] = model.weight.data * normalize_coeff

        # Record
        task_losses.append(chainer.backends.cuda.to_cpu(task_loss.data))
        loss_ratios.append(np.mean(task_losses[-1] / task_losses[0]))
        # ``to_cpu`` hands back the very same array when the model already
        # lives on CPU, and that buffer is overwritten in place on every
        # iteration.  Store a copy so each entry keeps this step's values.
        weights.append(
            chainer.backends.cuda.to_cpu(model.weight.data).copy())

        if t % 100 == 0:
            print('{}/{}:  loss_ratio={}, weights={} task_loss={}'.format(
                t, args.n_iter, loss_ratios[-1], model.weight.data,
                task_loss.data))
    task_losses = np.array(task_losses)
    weights = np.array(weights)

    fig = plt.figure()
    ax1 = fig.add_subplot(1, 4, 1)
    ax1.set_title('loss (task 0)')
    ax2 = fig.add_subplot(1, 4, 2)
    ax2.set_title('loss (task 1)')
    ax3 = fig.add_subplot(1, 4, 3)
    ax3.set_title('sum of normalized losses')
    ax4 = fig.add_subplot(1, 4, 4)
    ax4.set_title('change of weights over time')
    ax1.plot(task_losses[:, 0])
    ax2.plot(task_losses[:, 1])
    ax3.plot(loss_ratios)
    ax4.plot(weights[:, 0])
    ax4.plot(weights[:, 1])
    plt.show()
Esempio n. 50
0
    def test_type_check(self):
        # Exercises the container types ``chainer.grad`` accepts for its
        # first four positional arguments.
        x = chainer.Variable(numpy.random.uniform(-1, 1, (2, 3)).astype('f'))
        y = x * x
        gx = chainer.Variable(numpy.random.uniform(-1, 1, (2, 3)).astype('f'))
        gy = chainer.Variable(numpy.random.uniform(-1, 1, (2, 3)).astype('f'))

        # Lists and tuples of variables: these two calls are expected to
        # complete without raising.
        chainer.grad([y], [x], [gx], [gy])
        chainer.grad((y,), (x,), (gx,), (gy,))

        # A bare Variable in any one of the four positions is expected to
        # raise TypeError.
        with self.assertRaises(TypeError):
            chainer.grad(y, [x], [gx], [gy])
        with self.assertRaises(TypeError):
            chainer.grad([y], x, [gx], [gy])
        with self.assertRaises(TypeError):
            chainer.grad([y], [x], gx, [gy])
        with self.assertRaises(TypeError):
            chainer.grad([y], [x], [gx], gy)