def check_forward(self, log_pi_data, tau):
        log_pi = chainer.Variable(log_pi_data)
        y = functions.gumbel_softmax(log_pi, tau=tau)

        # Only checks dtype and shape because its result contains noise
        self.assertEqual(y.dtype, numpy.float32)
        self.assertEqual(y.shape, log_pi.shape)
        self.assertEqual(
            backend.get_array_module(y),
            backend.get_array_module(log_pi))
Example #2
    def check_forward(self, e1_data, e2_data, W_data, V1_data, V2_data,
                      b_data):
        e1 = chainer.Variable(e1_data)
        e2 = chainer.Variable(e2_data)
        W = chainer.Variable(W_data)

        e1_data = e1_data.reshape(e1_data.shape[0], -1)
        e2_data = e2_data.reshape(e2_data.shape[0], -1)
        xp = backend.get_array_module(e1)
        y_expect = xp.einsum('ij,ik,jkl->il', e1_data, e2_data, W_data)

        flags = V1_data is None, V2_data is None, b_data is None
        if any(flags):
            if not all(flags):
                raise ValueError(
                    'Test either all or none of the optional parameters.')
            y = functions.bilinear(e1, e2, W)
        else:
            V1 = chainer.Variable(V1_data)
            V2 = chainer.Variable(V2_data)
            b = chainer.Variable(b_data)
            y = functions.bilinear(e1, e2, W, V1, V2, b)

            y_expect = xp.einsum('ij,ik,jkl->il', e1_data, e2_data, W_data)
            y_expect += e1_data.dot(V1_data)
            y_expect += e2_data.dot(V2_data)
            y_expect += b_data

        testing.assert_allclose(y_expect, cuda.to_cpu(y.data))
        assert y.data.dtype == e1_data.dtype
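The expected value built above is the bilinear form over e1, e2 and W; a tiny self-contained NumPy check of the same einsum with made-up shapes:

import numpy

e1 = numpy.random.randn(2, 3).astype(numpy.float32)     # (batch, in1)
e2 = numpy.random.randn(2, 4).astype(numpy.float32)     # (batch, in2)
W = numpy.random.randn(3, 4, 5).astype(numpy.float32)   # (in1, in2, out)
y = numpy.einsum('ij,ik,jkl->il', e1, e2, W)             # (batch, out)
assert y.shape == (2, 5)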
def update_approximate_vectors(
        weight_matrix, u, n_power_iteration, eps):
    """Update the first left and right singular vectors.

    This function updates the first left singular vector `u` and
    the first right singular vector `v`.

    Args:
        weight_matrix (~chainer.Variable): 2D weight.
        u (numpy.ndarray, cupy.ndarray, or None):
            Vector that approximates the first left singular vector and
            has the shape of (out_size,).
        n_power_iteration (int): Number of iterations to approximate
            the first right and left singular vectors.
        eps (float): Epsilon value added to the norms for numerical
            stability in the normalization.

    Returns:
        :class:`numpy.ndarray` or `cupy.ndarray`:
            Approximate first left singular vector.
        :class:`numpy.ndarray` or `cupy.ndarray`:
            Approximate first right singular vector.

    """
    weight_matrix = weight_matrix.array
    xp = backend.get_array_module(weight_matrix)
    for _ in range(n_power_iteration):
        v = l2normalize(xp, xp.dot(u, weight_matrix), eps)
        u = l2normalize(xp, xp.dot(weight_matrix, v), eps)
    return u, v
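As a rough illustration of the power iteration above, the following standalone sketch estimates the first singular value of a random matrix with plain NumPy; l2normalize here is a local stand-in for the project's helper, not the library function itself.

import numpy


def l2normalize(xp, v, eps=1e-12):
    # Stand-in for the l2normalize helper: divide by the L2 norm.
    return v / (xp.linalg.norm(v) + eps)


def estimate_spectral_norm(W, n_power_iteration=5):
    # Power iteration on a plain ndarray, as in update_approximate_vectors.
    u = numpy.random.normal(size=(W.shape[0],)).astype(W.dtype)
    for _ in range(n_power_iteration):
        v = l2normalize(numpy, numpy.dot(u, W))
        u = l2normalize(numpy, numpy.dot(W, v))
    return numpy.dot(u, numpy.dot(W, v)), u, v


W = numpy.random.randn(4, 3).astype(numpy.float32)
sigma, u, v = estimate_spectral_norm(W)
# sigma approximates numpy.linalg.svd(W, compute_uv=False)[0]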
    def check_backward(self, x_data, W_data, b_data, y_grad,
                       use_cudnn='never'):
        if not self.c_contiguous:
            xp = backend.get_array_module(x_data)
            x_data = xp.asfortranarray(x_data)
            W_data = xp.asfortranarray(W_data)
            y_grad = xp.asfortranarray(y_grad)
            self.assertFalse(x_data.flags.c_contiguous)
            self.assertFalse(W_data.flags.c_contiguous)
            self.assertFalse(y_grad.flags.c_contiguous)
            if b_data is not None:
                b = xp.empty((len(b_data) * 2,), dtype=self.b.dtype)
                b[::2] = b_data
                b_data = b[::2]
                self.assertFalse(b_data.flags.c_contiguous)

        args = (x_data, W_data)
        if b_data is not None:
            args += (b_data,)

        def f(*args):
            return F.deconvolution_nd(*args, stride=self.stride, pad=self.pad,
                                      outsize=self.outsize, dilate=self.dilate,
                                      groups=self.groups)

        with chainer.using_config('use_cudnn', use_cudnn):
            with chainer.using_config('autotune', self.autotune):
                gradient_check.check_backward(
                    f, args, y_grad, **self.check_backward_options)
Example #5
def variable_repr(var):
    """Return the string representation of a variable.

    Args:
        var (~chainer.Variable): Input Variable.
    .. seealso:: numpy.array_repr
    """
    xp = backend.get_array_module(var)
    if xp is numpy:
        arr = var.data
    else:
        arr = var.data.get()

    if var.name:
        prefix = 'variable ' + var.name
    else:
        prefix = 'variable'

    if arr is None:
        lst = 'None'
    elif arr.size > 0 or arr.shape == (0,):
        lst = numpy.array2string(arr, None, None, None, ', ', prefix + '(')
    else:  # show zero-length shape unless it is (0,)
        lst = '[], shape=%s' % (repr(arr.shape),)

    return '%s(%s)' % (prefix, lst)
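For reference, a minimal call showing the kind of string this builds (assuming a standard CPU-only Chainer setup):

import numpy
import chainer

v = chainer.Variable(numpy.array([1., 2.], dtype=numpy.float32), name='w')
print(repr(v))   # prints something like: variable w([1., 2.])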
Example #6
    def forward(self, x):
        self.retain_inputs(())

        dims = x[0].shape[2:]
        ndim = self.ndim
        ksize = self.ksize
        stride = self.stride
        pad = self.pad
        if self.outs is None:
            self.outs = tuple(
                conv.get_deconv_outsize(d, k, s, p, cover_all=self.cover_all)
                for (d, k, s, p) in six.moves.zip(dims, ksize, stride, pad))

        xp = backend.get_array_module(*x)

        colon = slice(None)
        # (:, :, None, None, ..., None)
        tile_index = (colon, colon) + (None,) * ndim
        # (1, 1, k_1, k_2, ..., k_n, 1, 1, ..., 1)
        tile_reps = (1, 1) + ksize + (1,) * ndim
        col = xp.tile(x[0][tile_index], tile_reps)

        if xp is numpy:
            col2im_nd = conv_nd.col2im_nd_cpu
        else:
            col2im_nd = conv_nd.col2im_nd_gpu
        y = col2im_nd(col, stride, pad, self.outs)

        return y,
Example #7
    def backward(self, indexes, grad_outputs):
        x, = self.get_retained_inputs()
        xp = backend.get_array_module(x)
        y, = self.get_retained_outputs()
        gy, = grad_outputs
        F = chainer.functions

        axis = self.axis
        if axis is None:
            shape = x.shape
            axis = 0
            x = F.flatten(x)
        else:
            shape = None
            if axis < 0:
                axis += y.ndim

        if y.shape[axis] <= 1:
            gx = gy
        else:
            _, x = F.split_axis(x, (1,), axis)
            gx = _flipcumprodsum(x, gy, axis)
            y, ylast = F.split_axis(y, (-1,), axis)
            gx *= F.concat([xp.ones_like(ylast.array), y], axis=axis)
        if shape is not None:
            gx = F.reshape(gx, shape)
        return gx,
Example #8
    def forward(self, inputs):
        gy, = inputs
        xp = backend.get_array_module(*inputs)
        gx = xp.zeros(self._in_shape, gy.dtype)
        if xp is numpy:
            try:
                numpy.add.at(gx, self.slices, gy)
            except IndexError:
                done = False
                # In numpy<1.13, 0-dim boolean index is not supported in
                # numpy.add.at and it's supported for 0-dim arr in
                # arr.__getitem__.
                if not _numpy_supports_0d_bool_index and len(self.slices) == 1:
                    idx = numpy.asanyarray(self.slices[0])
                    if idx.dtype == numpy.dtype(bool):
                        # Convert the array and the mask to 1-dim.
                        # numpy.add.at with them is supported in older numpy.
                        numpy.add.at(gx[None], idx[None], gy)
                        done = True

                if not done:
                    msg = '''
GetItem does not support backward for these slices. The slices argument is not
supported by numpy.add.at, while it is supported by numpy.ndarray.__getitem__.

Please report this error to the issue tracker with the stack trace,
the information of your environment, and your script:
https://github.com/chainer/chainer/issues/new.
'''
                    raise IndexError(msg)
        else:
            gx.scatter_add(self.slices, inputs[0])
        return gx,
Example #9
    def forward(self, inputs):
        xp = backend.get_array_module(inputs[0])
        self.input_length, label_length, t, xs = inputs

        if self.zero_padding is None:
            if xs.dtype == numpy.float16:
                self.zero_padding = -10000.0
            else:
                self.zero_padding = -10000000000.0

        if chainer.is_debug():
            assert len(xs) >= xp.max(self.input_length)
            assert t.shape[1] >= xp.max(label_length)

        self.path_length = 2 * label_length + 1

        self.yseq = _softmax(xs, xp)
        log_yseq = self.log_matrix(self.yseq, xp)
        self.path = _label_to_path(t, self.blank_symbol, xp)
        self.prob_trans = self.calc_trans(
            log_yseq, self.input_length, t,
            label_length, self.path, self.path_length, xp)

        loss = -_logsumexp(self.prob_trans[0], xp, axis=1)
        if self.reduce == 'mean':
            loss = utils.force_array(xp.mean(loss))
        return loss,
Example #10
    def backward(self, indexes, grad_outputs):
        anchor, positive, negative = self.get_retained_inputs()

        N = anchor.shape[0]
        x_dim = anchor.shape[1]

        xp = backend.get_array_module(anchor)
        tmp = xp.repeat(self.dist_hinge[:, None], x_dim, axis=1)
        mask = xp.array(tmp > 0, dtype=anchor.dtype)

        gy, = grad_outputs
        if self.reduce == 'mean':
            g = gy / N
        else:
            g = gy[:, None]

        tmp = 2 * chainer.functions.broadcast_to(g, mask.shape) * mask

        ret = []
        if 0 in indexes:
            ret.append(tmp * (negative - positive))
        if 1 in indexes:
            ret.append(tmp * (positive - anchor))
        if 2 in indexes:
            ret.append(tmp * (anchor - negative))

        return ret
    def forward(self, inputs):
        self.retain_inputs((0, 1))
        scale = inputs[1].dtype.type(1. / (1 - self.ratio))
        xp = backend.get_array_module(*inputs)

        if self.mask is None:
            if self.use_batchwise_mask:
                mask_shape = (inputs[0].shape[0], inputs[1].shape[0],
                              inputs[1].shape[1])
            else:
                mask_shape = (inputs[1].shape[0], inputs[1].shape[1])
            if xp == numpy:
                self.mask = xp.random.rand(*mask_shape) >= self.ratio
            else:
                self.mask = xp.random.rand(*mask_shape,
                                           dtype=numpy.float32) >= self.ratio
        elif isinstance(self.mask, variable.Variable):
            self.mask = self.mask.data

        x = _as_mat(inputs[0])
        W = inputs[1] * scale * self.mask

        # (i)jk,ik->ij
        y = _matmul(W, x[:, :, None], xp)
        y = y.reshape(y.shape[0], y.shape[1]).astype(x.dtype, copy=False)

        if len(inputs) == 3:
            b = inputs[2]
            y += b
        return y,
    def forward(self, inputs):
        if inputs[0].shape[1] % self.groups != 0:
            raise ValueError('The number of channels {} is not divisible by '
                             '\'groups\' argument {}.'
                             .format(inputs[0].shape[1], self.groups))

        xp = backend.get_array_module(*inputs)
        if xp is cuda.cupy and chainer.should_use_cudnn('>=auto', 5000):
            return self.forward_cudnn(inputs)

        self.retain_inputs((0, 1))
        x, gamma, beta = inputs

        orig_shape = x.shape
        batch_size, channels = orig_shape[:2]
        groups = self.groups
        reduced_shape = (batch_size * groups, -1)
        x = x.reshape(reduced_shape)

        self.mean = x.mean(axis=1)
        x_hat = x - self.mean[:, None]
        var = (x_hat * x_hat).mean(axis=1)
        var += self.eps
        self.inv_std = var
        del var
        xp.sqrt(self.inv_std, out=self.inv_std, dtype=x.dtype)
        xp.reciprocal(self.inv_std, out=self.inv_std)
        x_hat *= self.inv_std[:, None]

        y = x_hat.reshape((batch_size, channels, -1))
        y *= gamma[:, None]
        y += beta[:, None]

        y = y.reshape(orig_shape)
        return y,
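A NumPy-only sketch of the same normalization math with hypothetical shapes, which may make the reshape-and-normalize flow above easier to follow:

import numpy


def group_norm_forward(x, gamma, beta, groups, eps=1e-5):
    # x: (N, C, ...), gamma/beta: (C,). Mirrors the reshape/mean/var steps above.
    batch_size, channels = x.shape[:2]
    orig_shape = x.shape
    xr = x.reshape(batch_size * groups, -1)
    mean = xr.mean(axis=1, keepdims=True)
    var = xr.var(axis=1, keepdims=True)
    x_hat = (xr - mean) / numpy.sqrt(var + eps)
    y = x_hat.reshape(batch_size, channels, -1)
    y = y * gamma[:, None] + beta[:, None]
    return y.reshape(orig_shape)


x = numpy.random.randn(2, 4, 3, 3).astype(numpy.float32)
gamma = numpy.ones(4, dtype=numpy.float32)
beta = numpy.zeros(4, dtype=numpy.float32)
y = group_norm_forward(x, gamma, beta, groups=2)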
Example #13
    def check_backward_consistency_regression(self, backend_config):
        # Regression test to two-dimensional unpooling layer.

        x_data, = self.generate_inputs()
        gy_data = numpy.random.uniform(-1, 1, self.gy_shape).astype(self.dtype)

        ksize = self.ksize
        stride = self.stride
        pad = self.pad
        xp = backend.get_array_module(x_data)

        # Backward computation for N-dimensional unpooling layer.
        x_nd = chainer.Variable(xp.array(x_data))
        y_nd = functions.unpooling_nd(
            x_nd, ksize, stride=stride, pad=pad, cover_all=self.cover_all)
        y_nd.grad = gy_data
        y_nd.backward()

        # Backward computation for two-dimensional unpooling layer.
        x_2d = chainer.Variable(xp.array(x_data))
        y_2d = functions.unpooling_2d(
            x_2d, ksize, stride=stride, pad=pad, cover_all=self.cover_all)
        y_2d.grad = gy_data
        y_2d.backward()

        # Test that the two result gradients are close enough.
        opt = self.check_backward_options
        testing.assert_allclose(
            x_nd.grad, x_2d.grad, atol=opt['atol'], rtol=opt['rtol'])
Example #14
    def backward(self, inputs, grad_outputs):
        expander = self.expander

        x, gamma, gy = inputs
        gx1, ggamma1, _ = self.output_data
        ggx1, gggamma1, ggbeta1 = grad_outputs
        xp = backend.get_array_module(x)

        # auxiliary values
        inv_m = gamma.dtype.type(1. / (x.size // gamma.size))
        r = 0 if ggx1 is None else (gx1 * ggx1).sum(axis=self.axis)
        coeff = gamma * self.inv_std
        coeff_m = coeff * inv_m
        x_hat = _x_hat(x, self.mean[expander], self.inv_std[expander])

        # handle None in output gradients
        ggx1 = _zero_if_none(xp, ggx1, x.shape, x.dtype)
        gggamma1 = _zero_if_none(xp, gggamma1, gamma.shape, gamma.dtype)
        ggbeta1 = _zero_if_none(xp, ggbeta1, gamma.shape, gamma.dtype)

        gggamma2 = gggamma1 - coeff_m * (x_hat * ggx1).sum(axis=self.axis)
        ggbeta2 = ggbeta1 - coeff_m * ggx1.sum(axis=self.axis)

        ggamma2 = r / gamma

        gx_hat2 = (gggamma2[expander] * gy -
                   (coeff_m * ggamma1)[expander] * ggx1)
        gstd2 = -self.inv_std * (r + (x_hat * gx_hat2).sum(axis=self.axis))
        gmean2 = -self.inv_std * gx_hat2.sum(axis=self.axis)
        gx2 = self.inv_std[expander] * gx_hat2 + inv_m * (
            gmean2[expander] + x_hat * gstd2[expander])
        ggy2 = (gggamma2[expander] * x_hat + ggbeta2[expander]
                + coeff[expander] * ggx1)

        return gx2, ggamma2, ggy2
    def check_backward(self, x_data, W_data, b_data, y_grad):
        xp = backend.get_array_module(x_data)
        if not self.c_contiguous:
            x_data = xp.asfortranarray(x_data)
            W_data = xp.asfortranarray(W_data)
            y_grad = xp.asfortranarray(y_grad)
            self.assertFalse(x_data.flags.c_contiguous)
            self.assertFalse(W_data.flags.c_contiguous)
            self.assertFalse(y_grad.flags.c_contiguous)
            if b_data is not None:
                b = xp.empty((len(b_data) * 2,), dtype=self.b.dtype)
                b[::2] = b_data
                b_data = b[::2]
                self.assertFalse(b_data.flags.c_contiguous)

        args = (x_data, W_data)
        if b_data is not None:
            args = args + (b_data,)

        def f(*args):
            return F.dilated_convolution_2d(*args, stride=self.stride,
                                            pad=self.pad, dilate=self.dilate,
                                            cover_all=self.cover_all)

        with chainer.using_config('use_cudnn', self.use_cudnn):
            gradient_check.check_backward(
                f, args, y_grad, dtype=numpy.float64,
                **self.check_backward_options)
Example #16
def _coo_matmul_gradsp(a, b, c_row, c_col, c_shape, transa, transb, transc,
                       dtype):
    if dtype is None:
        dtype = numpy.result_type(a.dtype, b.dtype)

    if transa:
        A = a.swapaxes(-1, -2)
    else:
        A = a
    if transb:
        B = b.swapaxes(-1, -2)
    else:
        B = b
    if transc:
        C_row = c_col
        C_col = c_row
    else:
        C_row = c_row
        C_col = c_col

    xp = backend.get_array_module(A, B)
    if xp is numpy:
        return _coo_matmul_gradsp_cpu(A, B, C_row, C_col, dtype)
    else:
        return _coo_matmul_gradsp_gpu(A, B, C_row, C_col, dtype)
Example #17
    def forward(self, inputs):
        x, = inputs
        xp = backend.get_array_module(x)
        y0 = xp.exp(x)
        y1 = xp.expm1(x)
        self.retain_outputs((0,))
        return y0, y1
Example #18
def sign(x):
    """Elementwise sign function.

    For a given input :math:`x`, this function returns :math:`sgn(x)`
    defined as

    .. math::

        sgn(x) = \\left \\{ \\begin{array}{cc}
        -1 & {\\rm if~x < 0} \\\\
        0 & {\\rm if~x = 0} \\\\
        1 & {\\rm if~x > 0} \\\\
        \\end{array} \\right.

    .. note::

        The gradient of this function is ``None`` everywhere and therefore
        unchains the computational graph.

    Args:
        x (~chainer.Variable): Input variable for which the sign is computed.

    Returns:
        ~chainer.Variable: Output variable.

    """
    if isinstance(x, chainer.variable.Variable):
        x = x.array
    xp = backend.get_array_module(x)
    return chainer.as_variable(utils.force_array(xp.sign(x)))
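A short usage sketch for the public API corresponding to this function (``F.sign`` in recent Chainer releases):

import numpy
import chainer.functions as F

x = numpy.array([-2.0, 0.0, 3.5], dtype=numpy.float32)
y = F.sign(x)
# y.array == [-1., 0., 1.]; as noted above, no gradient flows through it.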
Example #19
    def __init__(self, initializer=None, shape=None, name=None):
        if initializer is None:
            initializer = constant.NaN()
        elif numpy.isscalar(initializer):
            initializer = constant.Constant(initializer)
        if shape is None:
            if isinstance(initializer, (numpy.ndarray, cuda.ndarray)):
                # parameter initialized by the initial array
                super(Parameter, self).__init__(initializer, name=name)
            else:
                # uninitialized parameter
                super(Parameter, self).__init__(name=name)
                dtype = getattr(initializer, 'dtype', None)
                self._grad_initializer = constant.NaN(dtype)
        else:
            # parameter initialized with a given shape
            if isinstance(initializer, (numpy.ndarray, cuda.ndarray)):
                xp = backend.get_array_module(initializer)
                initializer = constant.Constant(initializer)
            else:
                xp = numpy
            data = initializers.generate_array(initializer, shape, xp)
            grad = xp.full_like(data, numpy.nan)
            super(Parameter, self).__init__(data, name=name, grad=grad)

        self.update_rule = None
        self.initializer = initializer
Example #20
    def backward(self, inputs, grads):
        xp = backend.get_array_module(*inputs)
        _, indices, _ = inputs
        g = grads[0]
        gv = g[xp.arange(len(indices)), indices]
        g[xp.arange(len(indices)), indices] = 0
        return g, None, gv
Example #21
    def forward(self, inputs):
        self.retain_inputs((0, 1))
        xp = backend.get_array_module(*inputs)
        x1, x2 = inputs
        difference = x1 - x2
        y = xp.square(difference)
        return utils.force_array(y, dtype=x1.dtype),
    def __call__(self, rule, param):
        grad = param.grad
        if grad is None:
            return
        xp = backend.get_array_module(grad)
        with cuda.get_device_from_array(grad):
            xp.clip(grad, self.lower_bound, self.upper_bound, out=grad)
    def forward(self, inputs):

        # Channels-first is Chainer's tensor format
        # W is 6-dimensional
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None
        stride_row, stride_col = self.sy, self.sx
        output_row, output_col = W.shape[1], W.shape[2]
        feature_dim = W.shape[3] * W.shape[4] * W.shape[5]
        xp = backend.get_array_module(*inputs)
        output = xp.empty((x.shape[0], W.shape[0], output_row, output_col,),
                          dtype=x.dtype)
        for i in moves.range(output_row):
            for j in moves.range(output_col):
                slice_row = slice(i * stride_row,
                                  i * stride_row + W.shape[4])
                slice_col = slice(j * stride_col,
                                  j * stride_col + W.shape[5])
                x_flatten = xp.reshape(x[..., slice_row, slice_col],
                                       (-1, feature_dim))
                W_flatten = xp.reshape(W[:, i, j, ...],
                                       (-1, feature_dim))
                output[..., i, j] = xp.dot(x_flatten, W_flatten.T)

        if b is not None:
            output += b[None, :, :, :]

        self.retain_inputs((0, 1))  # only retain x and W
        return output,
    def backward(self, inputs, grad_outputs):
        x, gamma, _ = inputs
        gy = grad_outputs[0]
        head_ndim = gamma.ndim + 1
        expander = (None, Ellipsis) + (None,) * (x.ndim - head_ndim)
        m = gamma.dtype.type(x.size // gamma.size)
        axis = (0,) + tuple(range(head_ndim, x.ndim))
        xp = backend.get_array_module(x)

        # Note: we must be in train mode.
        assert configuration.config.train
        # NOTE(tommi): cuDNN is not used since it does not support
        # batch renormalization
        gbeta = gy.sum(axis=axis)
        ggamma = (gy * self.x_hat_renorm).sum(axis=axis)
        gsigma_batch = (gy * self.x_hat).sum(axis=axis)
        if xp is numpy:
            scale = (self.r * gamma / self.std)[expander]
            gx = scale * (gy - (self.x_hat * gsigma_batch[expander] +
                                gbeta[expander]) / m)
        else:
            inv_m = numpy.float32(1) / m
            gx = cuda.elementwise(
                'T gy, T x_hat, T gamma, T std, T gsigma_batch, T gbeta, \
                T inv_m, T r',
                'T gx',
                'gx = (r * gamma / std) * (gy - (x_hat * gsigma_batch + gbeta) * \
                inv_m)',
                'bn_bwd')(gy, self.x_hat, gamma[expander],
                          self.std[expander], gsigma_batch[expander],
                          gbeta[expander], inv_m, self.r[expander])
        return gx, ggamma, gbeta
Example #25
    def check_backward_consistency_regression(self, x_data, gy_data):
        # Regression test to two-dimensional unpooling layer.

        ndim = len(self.dims)
        if ndim != 2:
            return

        ksize = self.ksize
        stride = self.stride
        pad = self.pad
        xp = backend.get_array_module(x_data)

        # Backward computation for N-dimensional unpooling layer.
        x_nd = chainer.Variable(xp.array(x_data))
        y_nd = functions.unpooling_nd(
            x_nd, ksize, stride=stride, pad=pad, cover_all=self.cover_all)
        y_nd.grad = gy_data
        y_nd.backward()

        # Backward computation for two-dimensional unpooling layer.
        x_2d = chainer.Variable(xp.array(x_data))
        y_2d = functions.unpooling_2d(
            x_2d, ksize, stride=stride, pad=pad, cover_all=self.cover_all)
        y_2d.grad = gy_data
        y_2d.backward()

        # Test that the two result gradients are close enough.
        opt = self.check_backward_options
        testing.assert_allclose(
            x_nd.grad, x_2d.grad, atol=opt['atol'], rtol=opt['rtol'])
Example #26
    def check_LARS(self):
        w0 = self.target[0].param.data
        g0 = self.target[0].param.grad
        w1 = self.target[1].param.data
        g1 = self.target[1].param.grad
        xp = backend.get_array_module(w0)
        threshold = 1e-2
        weight_decay = 0.2
        eps = 1e-9

        p0_norm = xp.linalg.norm(w0)
        g0_norm = xp.linalg.norm(g0)
        clip_rate = p0_norm / (eps + g0_norm + weight_decay * p0_norm)
        expect0 = w0 - clip_rate * (g0 + weight_decay * w0)
        expect1 = w1 - 1.0 * (g1 + weight_decay * w1)

        opt = optimizers.SGD(lr=1)
        opt.setup(self.target)
        opt.add_hook(optimizer_hooks.GradientLARS(threshold=threshold,
                                                  weight_decay=weight_decay,
                                                  eps=eps))
        opt.update()

        testing.assert_allclose(expect0, w0)
        testing.assert_allclose(expect1, w1)
Example #27
    def backward(self, indexes, grad_outputs):
        x0, x1, y = self.get_retained_inputs()
        gy, = grad_outputs
        xp = backend.get_array_module(gy.data)

        # Recompute intermediate variables as in forward.
        diff = x0 - x1
        dist_sq = chainer.functions.sum(diff ** 2, axis=1)
        dist = chainer.functions.sqrt(dist_sq)
        mdist = self.margin - dist

        y = y.data
        x_dim = x0.shape[1]
        y = xp.repeat(y[:, None], x_dim, axis=1)
        if self.reduce == 'mean':
            alpha = gy / y.shape[0]
        else:
            alpha = gy[:, None]
        alpha = chainer.functions.broadcast_to(alpha, y.shape)
        dist = chainer.functions.repeat(dist[:, None], x_dim, axis=1)
        # avoid division by zero, 1e-7 is sufficiently small value that can be
        # represented even in half precision
        dist = chainer.functions.maximum(
            dist, xp.full(dist.shape, 1e-7, dtype=dist.dtype))
        # similar pair
        gx0 = alpha * y.astype(alpha.dtype) * diff
        # dissimilar pair
        d = chainer.functions.repeat(mdist[:, None], x_dim, axis=1)
        mdist = chainer.functions.maximum(
            d, xp.zeros(shape=d.shape, dtype=d.dtype))
        gx0 += alpha * (1 - y) * mdist * -(diff / dist)
        gx0 = chainer.functions.cast(gx0, x0.dtype)

        return gx0, -gx0, None
    def forward(self, inputs):
        self.retain_inputs((0, 1))

        xp = backend.get_array_module(*inputs)
        x, t = inputs
        self.ignore_mask = (t != self.ignore_label)

        # stable computation of the cross entropy.
        loss = -(
            self.ignore_mask *
            (x * (t - (x >= 0)) - xp.log1p(xp.exp(-xp.abs(x)))))

        if not self.reduce == 'mean':
            return utils.force_array(loss.astype(x.dtype)),

        if self.normalize:
            count = xp.maximum(1, self.ignore_mask.sum())
        else:
            count = max(1, len(x))
        self.count = count

        # TODO(takagi): Fix to perform division in a specific dtype. See
        # cupy/cupy#1534.
        return utils.force_array(
            xp.divide(xp.sum(loss), self.count), dtype=x.dtype),
    def __call__(self, trainer):
        if self.available():
            # Dynamically import pyplot to call matplotlib.use()
            # after importing chainer.training.extensions
            import matplotlib.pyplot as plt
        else:
            return

        xp = backend.get_array_module(self._vars[0].data)
        stats = xp.zeros(self._data_shape, dtype=xp.float32)
        for i, k in enumerate(self._keys):
            xs = []
            for var in self._vars:
                x = getattr(var, k, None)
                if x is not None:
                    xs.append(x.ravel())
            if len(xs) > 0:
                stat_dict = self._statistician(
                    xp.concatenate(xs, axis=0), axis=0, xp=xp)
                stat_list = []
                if self._plot_mean:
                    stat_list.append(xp.atleast_1d(stat_dict['mean']))
                if self._plot_std:
                    stat_list.append(xp.atleast_1d(stat_dict['std']))
                if self._plot_percentile:
                    stat_list.append(xp.atleast_1d(stat_dict['percentile']))
                stats[i] = xp.concatenate(stat_list, axis=0)

        if xp != numpy:
            stats = cuda.to_cpu(stats)
        self._samples.add(stats, idx=trainer.updater.iteration)

        if self._trigger(trainer):
            file_path = os.path.join(trainer.out, self._file_name)
            self.save_plot_using_module(file_path, plt)
Example #30
    def _sample_directions(self):
        # Samples a direction vector (list of arrays with the same shapes as
        # input arrays and parameters)
        x_data = self.x_data
        params = self.params
        no_grads = self.no_grads

        xp = backend.get_array_module(*x_data)
        direction_xs_shapes = [
            x.shape
            for x, no_grad in six.moves.zip(x_data, no_grads)
            if not no_grad]
        direction_param_shapes = [p.shape for p in params]
        direction_shapes = direction_xs_shapes + direction_param_shapes
        directions = [
            xp.random.normal(size=shape) for shape in direction_shapes]
        # The direction vector is normalized in order to keep the scale of
        # differentiation error invariant with respect to the number of input
        # dimensions. Ideally, the scale of the curvature with respect to each
        # input dimension should be taken into account, but we ignore the
        # differences and assume that the curvature is uniform with respect to
        # all the input dimensions.
        norm = math.sqrt(sum([xp.square(d).sum() for d in directions]))
        if norm != 0:
            # norm could be zero if input arrays are 0-sized.
            scale = 1. / norm
            directions = [d * scale for d in directions]

        return directions
Example #31
    def forward(self, inputs):
        x, = inputs
        xp = backend.get_array_module(x)

        B, C, H, W = x.shape

        u_1d = xp.linspace(0, W - 1, num=self.out_W)
        v_1d = xp.linspace(0, H - 1, num=self.out_H)
        grid = xp.meshgrid(u_1d, v_1d)
        # u, v are of shape (out_H * out_W,)
        u = grid[0].ravel()
        v = grid[1].ravel()

        # indices of the 2x2 pixel neighborhood surrounding the coordinates
        u0 = xp.floor(u).astype(numpy.int32)
        u0 = u0.clip(0, W - 2)
        u1 = u0 + 1
        v0 = xp.floor(v).astype(numpy.int32)
        v0 = v0.clip(0, H - 2)
        v1 = v0 + 1

        # weights
        w1 = (u1 - u) * (v1 - v)
        w2 = (u - u0) * (v1 - v)
        w3 = (u1 - u) * (v - v0)
        w4 = (u - u0) * (v - v0)
        w1 = w1.astype(x.dtype, copy=False)
        w2 = w2.astype(x.dtype, copy=False)
        w3 = w3.astype(x.dtype, copy=False)
        w4 = w4.astype(x.dtype, copy=False)

        y = (w1[None, None, :] * x[:, :, v0, u0] +
             w2[None, None, :] * x[:, :, v0, u1] +
             w3[None, None, :] * x[:, :, v1, u0] +
             w4[None, None, :] * x[:, :, v1, u1])
        y = y.reshape(B, C, self.out_H, self.out_W)
        return y,
Example #32
    def forward(self, inputs):
        n_args = len(inputs)
        # TODO(kataoka): Do not retain inputs if n_args == 1
        self.retain_inputs(tuple(range(n_args)))

        xp = backend.get_array_module(inputs[0])
        dtype = xp.result_type(*[x.dtype for x in inputs])

        out_set = set(self.out_sub)

        # '@' is a single char, ',' is excluded.
        io_set = out_set.intersection(set(self.in_subs))

        if len(io_set) == len(self.out_sub):
            y = _einsum(xp, dtype, self.in_subs, self.out_sub, *inputs)
        else:
            direct_sub = []
            inverse_sub = []
            expander = []
            for c in sorted(out_set):
                if c in io_set:
                    direct_sub.append(c)
                    expander.append(slice(None))
                else:
                    expander.append(None)
                inverse_sub.append(c)

            y = xp.zeros(self.out_shape, dtype)
            diag_y = _einsum(xp, dtype, self.out_sub, ''.join(inverse_sub), y)
            if diag_y.base is not y:
                raise ValueError('Update CuPy to close CuPy Issue #1199')
            # Make the view writeable as numpy PR #5410 for numpy<1.10.
            if xp is not cuda.cupy:  # no setflags in cupy
                diag_y.setflags(write=True)
            diag_y[...] = _einsum(xp, dtype, self.in_subs, ''.join(direct_sub),
                                  *inputs)[tuple(expander)]
        return y,
Example #33
    def forward(self, inputs):
        self.retain_inputs((0,))
        xp = backend.get_array_module(*inputs)
        x, gy = inputs
        self._gy_shape = gy.shape
        gW = xp.zeros(self.w_shape, dtype=gy.dtype)

        if xp is numpy:
            # It is equivalent to `numpy.add.at(gW, x, gy)` but ufunc.at is
            # too slow.
            for ix, igy in six.moves.zip(x.ravel(),
                                         gy.reshape(x.size, -1)):
                if ix == self.ignore_label:
                    continue
                gW[ix] += igy
        else:
            utils.nondeterministic('atomicAdd')
            if self.ignore_label is None:
                cuda.elementwise(
                    'T gy, S x, S n_out', 'raw T gW',
                    'ptrdiff_t w_ind[] = {x, i % n_out};'
                    'atomicAdd(&gW[w_ind], gy)',
                    'embed_id_bwd')(
                        gy, xp.expand_dims(x, -1), gW.shape[1], gW)
            else:
                cuda.elementwise(
                    'T gy, S x, S n_out, S ignore', 'raw T gW',
                    '''
                    if (x != ignore) {
                      ptrdiff_t w_ind[] = {x, i % n_out};
                      atomicAdd(&gW[w_ind], gy);
                    }
                    ''',
                    'embed_id_bwd_ignore_label')(
                        gy, xp.expand_dims(x, -1), gW.shape[1],
                        self.ignore_label, gW)
        return gW,
def average_pooling_2d(x, ksize, stride=None, pad=0):
    """Spatial average pooling function.

    This function acts similarly to :func:`~chainer.functions.convolution_2d`,
    but it computes the average of input spatial patch for each channel without
    any parameter instead of computing the inner products.

    Args:
        x (~chainer.Variable): Input variable.
        ksize (int or pair of ints): Size of pooling window. ``ksize=k`` and
            ``ksize=(k, k)`` are equivalent.
        stride (int or pair of ints or None): Stride of pooling applications.
            ``stride=s`` and ``stride=(s, s)`` are equivalent. If ``None`` is
            specified, then it uses same stride as the pooling window size.
        pad (int or pair of ints): Spatial padding width for the input array.
            ``pad=p`` and ``pad=(p, p)`` are equivalent.

    Returns:
        ~chainer.Variable: Output variable.

    .. note::

       This function currently does not support ``cover_all`` mode, unlike
       :func:`max_pooling_2d`. Average pooling runs in non-cover-all mode.

    .. note::

       The values in the padded region are treated as 0, which biases the
       averages towards zero.
       To obtain unbiased averages, use :func:`average_pooling_nd` with
       ``pad_value=None``.

    """
    if backend.get_array_module(x) is chainerx:
        return average_pooling_nd.average_pooling_nd(x, ksize, stride, pad)
    return AveragePooling2D(ksize, stride, pad, False).apply((x, ))[0]
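A minimal usage sketch of the documented function:

import numpy
import chainer.functions as F

x = numpy.arange(16, dtype=numpy.float32).reshape(1, 1, 4, 4)
y = F.average_pooling_2d(x, ksize=2)   # stride defaults to the window size
assert y.shape == (1, 1, 2, 2)         # each output is the mean of a 2x2 patch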
Example #35
    def __call__(self, rule, param):
        p, g = param.data, param.grad
        if p is None or g is None:
            return

        xp = backend.get_array_module(p)

        # weight norm
        p_norm = xp.linalg.norm(p)
        # grad norm
        g_norm = xp.linalg.norm(g)
        local_rate = p_norm / (self.eps + g_norm + self.weight_decay * p_norm)
        rate = xp.where(p_norm > self.threshold, local_rate, 1.0)
        with cuda.get_device_from_array(p) as dev:
            if int(dev) == -1:
                g += self.weight_decay * p
                g *= rate
            else:
                kernel = cuda.elementwise(
                    'T p, T rate, T weight_decay',
                    'T g',
                    'g += weight_decay * p; g *= rate;',
                    'lars')
                kernel(p, rate, self.weight_decay, g)
Example #36
    def forward(self, inputs):
        self.retain_inputs((0, 1, 2, 4))
        x, gamma, mean, var, gy = inputs
        expander = self.expander
        xp = backend.get_array_module(x)

        if self.inv_std is None or self.inv_var is None:
            self.inv_var = xp.reciprocal(var + self.eps)
            self.inv_std = xp.sqrt(self.inv_var, dtype=self.inv_var.dtype)

        self.gamma_over_std = gamma * self.inv_std
        x_hat = _x_hat(x, mean[expander], self.inv_std[expander])

        gx = self.gamma_over_std[expander] * gy
        gbeta = gy.sum(axis=self.axis, dtype=gamma.dtype)
        ggamma = (x_hat * gy).sum(axis=self.axis)
        gmean = -self.gamma_over_std * gbeta
        gvar = - 0.5 * self.inv_var * (
            gamma * ggamma).astype(var.dtype, copy=False)

        gx = gx.astype(dtype=x.dtype)

        self.retain_outputs((0, 1, 2, 3, 4))
        return gx, ggamma, gbeta, gmean, gvar
    def forward(self, inputs):
        self.retain_inputs((0, 1))

        xp = backend.get_array_module(*inputs)
        x, t = inputs
        self.ignore_mask = (t != self.ignore_label)

        # stable computation of the cross entropy.
        loss = -(self.ignore_mask *
                 (x * (t - (x >= 0)) - xp.log1p(xp.exp(-xp.abs(x)))))

        if not self.reduce == 'mean':
            return utils.force_array(loss.astype(x.dtype)),

        if self.normalize:
            count = xp.maximum(1, self.ignore_mask.sum())
        else:
            count = max(1, len(x))
        self.count = count

        # TODO(takagi): Fix to perform division in a specific dtype. See
        # cupy/cupy#1534.
        return utils.force_array(xp.divide(xp.sum(loss), self.count),
                                 dtype=x.dtype),
Example #38
    def forward(self, inputs):
        self.retain_inputs((1,))
        x, t = inputs
        self._in_shape = x.shape
        self._in_dtype = x.dtype
        if chainer.is_debug():
            if not ((0 <= t).all() and
                    (t < x.shape[1]).all()):
                msg = 'Each label `t` needs to satisfy `0 <= t < x.shape[1]`'
                raise ValueError(msg)

        xp = backend.get_array_module(x)
        if xp is numpy:
            # This code is equivalent to `t.choose(x.T)`, but `numpy.choose`
            # does not work when `x.shape[1] > 32`.
            return x[six.moves.range(t.size), t],
        else:
            y = cuda.elementwise(
                'S t, raw T x',
                'T y',
                'int ind[] = {i, t}; y = x[ind];',
                'getitem_fwd'
            )(t, x)
            return y,
Example #39
    def forward(self, inputs):
        xp = backend.get_array_module(*inputs)
        y, t = inputs

        if self.ignore_label is not None:
            mask = (t == self.ignore_label)
            ignore_cnt = mask.sum()

            # will always be true when the true label is ignore_label
            # TODO(henry0312)
            #   If cupy.where returns indexes, we could make the code better.
            #   Also, we would need Advanced Indexing.
            pred = xp.where(mask, self.ignore_label,
                            y.argmax(axis=1).reshape(t.shape))
            count = (pred == t).sum() - ignore_cnt
            total = t.size - ignore_cnt

            if total == 0:
                return xp.asarray(0.0, dtype=y.dtype),
            else:
                return xp.asarray(float(count) / total, dtype=y.dtype),
        else:
            pred = y.argmax(axis=1).reshape(t.shape)
            return xp.asarray((pred == t).mean(dtype=y.dtype)),
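A hedged sketch of the corresponding public API call, with a hand-made batch in which one label is ignored:

import numpy
import chainer.functions as F

y = numpy.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]], dtype=numpy.float32)
t = numpy.array([1, 0, -1], dtype=numpy.int32)
acc = F.accuracy(y, t, ignore_label=-1)   # the third sample is excluded
# acc.array == 1.0 here: both counted predictions are correct.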
Example #40
def lstm_grad_grad(c_prev, a, i, f, o, c, gc, gh, ggc_prev, gga, ggi, ggf, ggo,
                   gc_prev, ga, gi, gf, go, gc_next, ggc, ggh):
    xp = backend.get_array_module(a)
    sig_o = _sigmoid(o, xp)
    gsig_o = _grad_sigmoid(sig_o)
    ggsig_o = _grad_grad_sigmoid(sig_o)
    sig_i = _sigmoid(i, xp)
    gsig_i = _grad_sigmoid(sig_i)
    ggsig_i = _grad_grad_sigmoid(sig_i)
    sig_f = _sigmoid(f, xp)
    gsig_f = _grad_sigmoid(sig_f)
    ggsig_f = _grad_grad_sigmoid(sig_f)
    tanh_a = xp.tanh(a)
    gtanh_a = _grad_tanh(tanh_a)
    ggtanh_a = _grad_grad_tanh(tanh_a, gtanh_a)
    tanh_c = xp.tanh(c)
    gtanh_c = _grad_tanh(tanh_c)
    ggtanh_c = _grad_grad_tanh(tanh_c, gtanh_c)

    gc_bar = gh * sig_o * gtanh_c + gc

    gc_prev[:] = ggf * gc_bar * gsig_f
    ga[:] = (gga * sig_i * ggtanh_a + ggi * gtanh_a * gsig_i) * gc_bar
    gi[:] = (gga * gtanh_a * gsig_i + ggi * tanh_a * ggsig_i) * gc_bar
    gf[:] = (ggc_prev * (gh * sig_o * gtanh_c + gc) * gsig_f +
             ggf * gc_bar * c_prev * ggsig_f)

    ggc[:] = (ggc_prev * sig_f + gga * sig_i * gtanh_a +
              ggi * tanh_a * gsig_i + ggf * c_prev * gsig_f)

    dgc_do = gh * gsig_o * gtanh_c
    go[:] = ggc * dgc_do + ggo * gh * tanh_c * ggsig_o
    dgc_dc = gh * sig_o * ggtanh_c
    gc_next[:] = ggc * dgc_dc + ggo * gh * gtanh_c * gsig_o
    ggh[:] = ggc * sig_o * gtanh_c + ggo * tanh_c * gsig_o
    return gc_prev, ga, gi, gf, go, gc_next, ggc, ggh
Example #41
    def backward(self, inputs, grads):
        gpu = backend.get_array_module(*inputs) is not numpy

        # TODO(unno): We can remove redundant gpu-cpu copy using
        # theano.sandbox.cuda.basic_ops.gpu_from_host
        args = [cuda.to_cpu(x) for x in inputs + grads]

        outputs = self.backward_func(*args)
        assert len(outputs) == len(inputs)

        if gpu:
            # TODO(unno): We can remove redundant gpu-cpu copy using
            # theano.sandbox.cuda.CudaNdarray.gpudata
            device = cuda.get_device_from_array(inputs)
            outputs = [cuda.to_gpu(x, device) for x in outputs]

        results = []
        for o, i in zip(outputs, inputs):
            if i.dtype.kind != 'f':
                o = None
            elif o.dtype != i.dtype:
                o = o.astype(i.dtype)
            results.append(o)
        return tuple(results)
Example #42
def variable_str(var):
    """Return the string representation of a variable.

    Args:
        var (~chainer.Variable): Input Variable.
    .. seealso:: numpy.array_str
    """
    xp = backend.get_array_module(var)
    if xp is numpy:
        arr = var.data
    else:
        arr = var.data.get()

    if var.name:
        prefix = 'variable ' + var.name
    else:
        prefix = 'variable'

    if arr is None:
        lst = 'None'
    else:
        lst = numpy.array2string(arr, None, None, None, ' ', prefix + '(')

    return '%s(%s)' % (prefix, lst)
Example #43
    def _sample_directions(self):
        # Samples a direction vector (list of arrays with the same shapes as
        # input arrays and parameters)
        x_data = self.x_data
        params = self.params
        no_grads = self.no_grads

        xp = backend.get_array_module(*x_data)
        direction_xs_shapes = [
            x.shape for x, no_grad in six.moves.zip(x_data, no_grads)
            if not no_grad
        ]
        direction_param_shapes = [p.shape for p in params]
        direction_shapes = direction_xs_shapes + direction_param_shapes
        if self.is_chainerx:
            directions = [
                xp.random.normal(size=shape, device=x_data[0].device)
                for shape in direction_shapes
            ]
        else:
            directions = [
                xp.random.normal(size=shape) for shape in direction_shapes
            ]
        # The direction vector is normalized in order to keep the scale of
        # differentiation error invariant with respect to the number of input
        # dimensions. Ideally, the scale of the curvature with respect to each
        # input dimension should be taken into account, but we ignore the
        # differences and assume that the curvature is uniform with respect to
        # all the input dimensions.
        norm = math.sqrt(sum([xp.square(d).sum() for d in directions]))
        if norm != 0:
            # norm could be zero if input arrays are 0-sized.
            scale = 1. / norm
            directions = [d * scale for d in directions]

        return directions
Example #44
def warp_perspective(image, mat):
    """Warp images with perspective transformation

    Args:
        image (:class `~chainer.Variable` or :ref:`ndarray`):
            A 4-D array of shape `(B, C, H, W)`.

        mat (:class `~chainer.Variable` or :ref:`ndarray`):
            Perspective transformation matrices.
            A 3-D array of shape `(B, 3, 3)`.

    Returns:
        ~chainer.Variable:
            Warped image.
            A 4-D array of shape `(B, C, H, W)`.
    """
    xp = backend.get_array_module(image)
    B, _, H, W = image.shape

    ps1 = pixel_coords(xp, H, W, mat.dtype).reshape(1, 2, -1)
    num = affine(mat[:, :2, :2], mat[:, :2, 2], ps1)
    denom = affine(
        mat[:, 2, :2].reshape(-1, 1, 2), mat[:, 2, 2].reshape(-1, 1), ps1)
    ps0 = num / denom
    return warp_dense(image, ps0.reshape(1, 2, H, W))
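The num / denom step above is the usual projective division; an independent NumPy sketch of pushing a pixel grid through a 3x3 homography (the project-local helpers pixel_coords, affine, and warp_dense are not reproduced here):

import numpy

H, W = 4, 4
mat = numpy.eye(3, dtype=numpy.float32)     # identity homography
v, u = numpy.meshgrid(numpy.arange(H), numpy.arange(W), indexing='ij')
pts = numpy.stack([u, v, numpy.ones_like(u)])
pts = pts.reshape(3, -1).astype(numpy.float32)
warped = mat @ pts
warped = warped[:2] / warped[2:]            # divide by the homogeneous row
# With the identity matrix, warped reproduces the original (u, v) grid.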
Example #45
def _is_c_order(row, col):
    """Check if a coo matrix with given row and col is c_order"""
    if row.shape != col.shape:
        raise ValueError('shape of row and col must be the same.')
    if row.ndim != 1:
        for i in range(row.shape[0]):
            if not _is_c_order(row[i], col[i]):
                return False
        return True
    xp = backend.get_array_module(row)
    _row = row[col >= 0]
    _col = col[row >= 0]
    if _row[_row < 0].size > 0 or _col[_col < 0].size:
        raise ValueError('invalid index combination of row and col.')
    if _row.shape[0] <= 1:
        return True
    row_diff = xp.zeros(_row.shape, dtype=_row.dtype)
    row_diff[1:] = _row[1:] - _row[:-1]
    if xp.amin(row_diff) < 0:
        return False
    col_diff = xp.zeros(_col.shape, dtype=_col.dtype)
    col_diff[1:] = _col[1:] - _col[:-1]
    col_diff[(row_diff > 0)] = 0
    return xp.amin(col_diff) >= 0
Example #46
    def check_backward(self,
                       x_data,
                       W_data,
                       b_data,
                       y_grad,
                       use_cudnn='never'):
        xp = backend.get_array_module(x_data)
        if not self.c_contiguous:
            x_data = xp.asfortranarray(x_data)
            W_data = xp.asfortranarray(W_data)
            y_grad = xp.asfortranarray(y_grad)
            self.assertTrue(x_data.flags.f_contiguous)
            self.assertTrue(W_data.flags.f_contiguous)
            self.assertTrue(y_grad.flags.f_contiguous)
            if b_data is not None:
                b = xp.empty((len(b_data) * 2, ), dtype=b_data.dtype)
                b[::2] = b_data
                b_data = b[::2]
                self.assertFalse(b_data.flags.c_contiguous)

        args = (x_data, W_data)
        if b_data is not None:
            args += (b_data, )

        def f(*args):
            return F.convolution_nd(*args,
                                    stride=self.stride,
                                    pad=self.pad,
                                    cover_all=self.cover_all,
                                    dilate=self.dilate,
                                    groups=self.groups)

        with chainer.using_config('use_cudnn', use_cudnn):
            with chainer.using_config('autotune', self.autotune):
                gradient_check.check_backward(f, args, y_grad,
                                              **self.check_backward_options)
Example #47
def _crs_matmul(sp_data,
                sp_row,
                sp_col,
                sp_shape,
                dn,
                transa,
                transb,
                transc,
                dtype=None):
    if dtype is None:
        dtype = numpy.result_type(sp_data.dtype, dn.dtype)

    A_data = sp_data
    if transa:
        A_row = sp_col
        A_col = sp_row
        A_shape = (sp_shape[1], sp_shape[0])
        # TODO(denjiry): write something instead of order
    else:
        A_row = sp_row
        A_col = sp_col
        A_shape = sp_shape
    if transb:
        B = dn.swapaxes(-1, -2)
    else:
        B = dn

    xp = backend.get_array_module(A_data, B)
    if xp is numpy:
        C = _crs_matmul_cpu(A_data, A_row, A_col, A_shape, B, dtype)
    else:
        C = _crs_matmul_gpu(A_data, A_row, A_col, A_shape, B, dtype)

    if transc:
        C = C.swapaxes(-1, -2)
    return C
Example #48
    def __call__(self, trainer):
        if self.available():
            # Dynamically import pyplot to call matplotlib.use()
            # after importing chainer.training.extensions
            import matplotlib.pyplot as plt
        else:
            return

        xp = backend.get_array_module(self._vars[0].data)
        stats = xp.zeros(self._data_shape, dtype=xp.float32)
        for i, k in enumerate(self._keys):
            xs = []
            for var in self._vars:
                x = getattr(var, k, None)
                if x is not None:
                    xs.append(x.ravel())
            if len(xs) > 0:
                stat_dict = self._statistician(xp.concatenate(xs, axis=0),
                                               axis=0,
                                               xp=xp)
                stat_list = []
                if self._plot_mean:
                    stat_list.append(xp.atleast_1d(stat_dict['mean']))
                if self._plot_std:
                    stat_list.append(xp.atleast_1d(stat_dict['std']))
                if self._plot_percentile:
                    stat_list.append(xp.atleast_1d(stat_dict['percentile']))
                stats[i] = xp.concatenate(stat_list, axis=0)

        if xp != numpy:
            stats = cuda.to_cpu(stats)
        self._samples.add(stats, idx=trainer.updater.iteration)

        if self._trigger(trainer):
            file_path = os.path.join(trainer.out, self._file_name)
            self.save_plot_using_module(file_path, plt)
Example #49
def position_encode(embed, sentences):
    """Position encoding.

    It is defined as:

    .. math::

       m = \\sum_j l_j A x_j,

    where :math:`A` is an embed matrix, :math:`x_j` is :math:`j`-th word ID and

    .. math::

       l_{kj} = (1 - j / J) - (k / d)(1 - 2j / J).

    :math:`J` is length of a sentence and :math:`d` is the dimension of the
    embedding.

    """

    xp = backend.get_array_module(sentences)
    e = embed(sentences)
    n_words, n_units = e.shape[-2:]

    # To avoid 0/0, we use max(length, 1) here.
    # Note that when the length is zero, its embedding is always zero and
    # is ignored.
    length = xp.maximum(xp.sum((sentences != 0).astype(xp.float32), axis=-1),
                        1)
    length = length.reshape((length.shape + (1, 1)))
    k = xp.arange(1, n_units + 1, dtype=numpy.float32) / n_units
    i = xp.arange(1, n_words + 1, dtype=numpy.float32)[:, None]
    coeff = (1 - i / length) - k * (1 - 2.0 * i / length)
    e = coeff * e
    s = F.sum(e, axis=-2)
    return s
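A NumPy-only sketch of the coefficient matrix :math:`l_{kj}` from the formula above, for a hypothetical sentence length J and embedding size d:

import numpy

J, d = 5, 8   # hypothetical sentence length and embedding dimension
j = numpy.arange(1, J + 1, dtype=numpy.float32)[:, None]
k = numpy.arange(1, d + 1, dtype=numpy.float32) / d
coeff = (1 - j / J) - k * (1 - 2.0 * j / J)   # shape (J, d), mirrors the code above
# The encoded sentence would then be (coeff * word_embeddings).sum(axis=0).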
Example #50
    def backward(self, indexes, grad_outputs):
        F = chainer.functions
        expander = self.expander

        x, gamma, gy = self.get_retained_inputs()
        gx1, ggamma1 = self.get_retained_outputs()
        ggx1, gggamma1, ggbeta1 = grad_outputs
        xp = backend.get_array_module(x)

        # auxiliary values
        inv_m = gamma.dtype.type(1. / (x.size // gamma.size))
        r = 0 if ggx1 is None else F.sum(gx1 * ggx1, axis=self.axis)
        coeff = gamma * self.inv_std
        coeff_m = coeff * inv_m
        x_hat = _x_hat(x, self.mean[expander], self.inv_std[expander])

        # handle None in output gradients
        ggx1 = _zero_if_none(xp, ggx1, x.shape, x.dtype)
        gggamma1 = _zero_if_none(xp, gggamma1, gamma.shape, gamma.dtype)
        ggbeta1 = _zero_if_none(xp, ggbeta1, gamma.shape, gamma.dtype)

        gggamma2 = gggamma1 - coeff_m * F.sum(x_hat * ggx1, axis=self.axis)
        ggbeta2 = ggbeta1 - coeff_m * F.sum(ggx1, axis=self.axis)

        ggamma2 = r / gamma

        gx_hat2 = (gggamma2[expander] * gy -
                   (coeff_m * ggamma1)[expander] * ggx1)
        gstd2 = -self.inv_std * (r + F.sum(x_hat * gx_hat2, axis=self.axis))
        gmean2 = -self.inv_std * F.sum(gx_hat2, axis=self.axis)
        gx2 = self.inv_std[expander] * gx_hat2 + inv_m * (
            gmean2[expander] + x_hat * gstd2[expander])
        ggy2 = (gggamma2[expander] * x_hat + ggbeta2[expander] +
                coeff[expander] * ggx1)

        return gx2, ggamma2, ggy2
Example #51
    def forward(self, inputs):
        x, = inputs
        self._in_shape = x.shape
        xp = backend.get_array_module(x)
        return xp.cumsum(x, axis=self.axis),
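For comparison, a one-line usage sketch of the public API that wraps this forward pass:

import numpy
import chainer.functions as F

x = numpy.array([[1., 2., 3.]], dtype=numpy.float32)
y = F.cumsum(x, axis=1)   # y.array == [[1., 3., 6.]]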
Example #52
    def forward(self, inputs):
        xp = backend.get_array_module(*inputs)
        x, gamma, beta = inputs

        # Note: we must be in train mode.
        assert configuration.config.train

        if not self.update_statistics:
            self._running_mean = xp.array(self._running_mean)
            self._running_var = xp.array(self._running_var)

        head_ndim = gamma.ndim + 1
        expander = (None, Ellipsis) + (None,) * (x.ndim - head_ndim)

        # NOTE(tommi): cuDNN is not used since it does not support
        # batch renormalization
        axis = (0,) + tuple(range(head_ndim, x.ndim))
        mean = x.mean(axis=axis)
        var = x.var(axis=axis) + self.eps
        self.std = xp.sqrt(var, dtype=var.dtype)

        running_sigma = xp.sqrt(self._running_var + self.eps,
                                dtype=self._running_mean.dtype)
        self.r = xp.clip(self.std / running_sigma,
                         1.0 / self.rmax, self.rmax)
        d = xp.clip(
            (mean - self._running_mean) / running_sigma,
            -self.dmax, self.dmax)

        # Update running statistics:
        m = x.size // gamma[expander].size
        self._running_mean *= self.decay
        adjust = m / max(m - 1., 1.)  # unbiased estimation
        temp_ar = xp.array(mean)
        temp_ar *= (1 - self.decay)
        self._running_mean += temp_ar
        del temp_ar
        self._running_var *= self.decay
        temp_ar = xp.array(var)
        temp_ar *= (1 - self.decay) * adjust
        self._running_var += temp_ar
        del temp_ar

        gamma = gamma[expander]
        beta = beta[expander]

        if xp is numpy:
            self.x_hat = _xhat(x, mean, self.std, expander)
            self.x_hat_renorm = self.x_hat * self.r[expander] + d[expander]
            y = gamma * self.x_hat_renorm
            y += beta
        else:
            self.x_hat, self.x_hat_renorm, y = cuda.elementwise(
                'T x, T mean, T std, T gamma, T beta, T r, T d',
                'T x_hat, T x_hat_renorm, T y',
                '''
                x_hat = (x - mean) / std;
                x_hat_renorm = x_hat * r + d;
                y = gamma * x_hat_renorm + beta;
                ''',
                'bn_fwd')(x, mean[expander], self.std[expander], gamma,
                          beta, self.r[expander], d[expander])

        return y,
Example #53
def connectionist_temporal_classification(x,
                                          t,
                                          blank_symbol,
                                          input_length=None,
                                          label_length=None,
                                          reduce='mean'):
    """Connectionist Temporal Classification loss function.

    Connectionist Temporal Classification (CTC) [Graves2006]_ is a loss
    function for sequence labeling where the alignment between the inputs and
    the targets is unknown. See also [Graves2012]_.

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the samplewise
    loss values. If it is ``'mean'``, it takes the mean of loss values.


    Args:
        x (list or tuple of :class:`~chainer.Variable`):
            A list of unnormalized probabilities for labels.
            Each element of ``x``, ``x[i]`` is a :class:`~chainer.Variable`
            object, which has shape ``(B, V)``, where ``B``
            is the batch size and ``V`` is the number of labels.
            The softmax of ``x[i]`` represents the probabilities of the labels
            at time ``i``.
        t (:class:`~chainer.Variable` or :ref:`ndarray`):
            A matrix including expected label sequences.
            Its shape is ``(B, M)``, where ``B`` is the batch size and ``M`` is
            the maximum length of the label sequences.
            All elements in ``t`` must be less than ``V``, the number of
            labels.
        blank_symbol (int): Index of blank_symbol.
            This value must be non-negative.
        input_length (:class:`~chainer.Variable` or :ref:`ndarray`):
            Length of the valid input sequence for each sample in the
            mini-batch ``x`` (optional). Its shape must be ``(B,)``.
            If ``input_length`` is omitted or ``None``, all of ``x`` is
            assumed to be valid input.
        label_length (:class:`~chainer.Variable` or :ref:`ndarray`):
            Length of the valid label sequence for each sample in the
            mini-batch ``t`` (optional). Its shape must be ``(B,)``.
            If ``label_length`` is omitted or ``None``, all of ``t`` is
            assumed to be valid input.
        reduce (str): Reduction option. Its value must be either
            ``'mean'`` or ``'no'``. Otherwise,
            :class:`ValueError` is raised.

    Returns:
       ~chainer.Variable:
           A variable holding the CTC loss.
           If ``reduce`` is ``'no'``, the output variable holds an array
           whose shape is ``(B,)``, where ``B`` is the number of samples.
           If it is ``'mean'``, it holds a scalar.

    .. note::
       You need to pass ``x`` without applying an activation function (e.g.
       the softmax function), because this function applies the softmax
       function to ``x`` internally before computing the CTC loss in order to
       avoid numerical issues. You also need to apply the softmax function to
       the forwarded values before decoding them.

    .. note::
       This function is differentiable only by ``x``.

    .. note::
       This function supports (batch, sequence, 1-dimensional input)-data.

    .. [Graves2006] Alex Graves, Santiago Fernandez,\
    Faustino Gomez, Jurgen Schmidhuber,\
    `Connectionist Temporal Classification: Labelling Unsegmented\
    Sequence Data with Recurrent Neural Networks\
    <ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf>`_

    .. [Graves2012] Alex Graves,\
    `Supervised Sequence Labelling with Recurrent Neural Networks\
    <https://www.cs.toronto.edu/~graves/preprint.pdf>`_

    """
    if not isinstance(x, collections_abc.Sequence):
        raise TypeError('x must be a list of Variables')
    if not isinstance(blank_symbol, int):
        raise TypeError('blank_symbol must be non-negative integer.')
    assert 0 <= blank_symbol < x[0].shape[1]
    # This implementation only supports 1-dimensional data.
    # TODO(jnishi): Support d(>1)-dimensional inputs.
    assert x[0].ndim == 2

    xp = backend.get_array_module(x[0])
    if input_length is None:
        input_length = xp.full(len(x[0]), len(x), dtype=numpy.int32)
    if label_length is None:
        label_length = xp.full(len(t), t.shape[1], dtype=numpy.int32)

    return ConnectionistTemporalClassification(blank_symbol, reduce)(
        input_length, label_length, t, chainer.functions.stack(x))
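
A minimal usage sketch (the shapes, values, and the choice of blank index
below are illustrative assumptions):

# Toy CTC call: T=4 time steps, batch B=2, V=3 label classes,
# with index 2 used as the blank symbol (an assumption for this sketch).
import numpy as np
import chainer
import chainer.functions as F

T, B, V = 4, 2, 3
x = [chainer.Variable(np.random.randn(B, V).astype(np.float32))
     for _ in range(T)]                          # unnormalized scores per step
t = np.array([[0, 1], [1, 0]], dtype=np.int32)   # expected label sequences

loss = F.connectionist_temporal_classification(x, t, blank_symbol=2)
print(loss)  # scalar mean loss over the batch
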
Exemple #54
0
def _matmul(a, b):
    xp = backend.get_array_module(a)
    if not hasattr(xp, 'matmul'):
        # NumPy 1.9 does not support matmul. We use einsum instead.
        return xp.einsum('ijl,ilk->ijk', a, b)
    return xp.matmul(a, b)
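
As a quick sanity check of the fallback above (with arbitrary example shapes),
the einsum expression reproduces batched matrix multiplication over stacks of
matrices:

# Illustrative check that the einsum fallback matches xp.matmul for 3D inputs.
import numpy as np

a = np.random.randn(5, 2, 3)
b = np.random.randn(5, 3, 4)
assert np.allclose(np.einsum('ijl,ilk->ijk', a, b), np.matmul(a, b))
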
Exemple #55
0
    def forward(self, inputs):
        x, = inputs
        xp = backend.get_array_module(x)
        # Scatter the flat input into a zero array at the masked positions.
        y = xp.zeros(self.mask.shape, x.dtype)
        y[self.mask] = x
        return y,
Exemple #56
0
    def forward(self, inputs):
        # may broadcast
        xp = backend.get_array_module(*inputs)
        x, y = inputs
        condition = self.condition
        return xp.where(condition, x, y),
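
For reference, xp.where selects elementwise between the two inputs and
broadcasts shapes where needed; a small NumPy sketch with made-up arrays:

# Illustrative broadcasting behaviour of where; the arrays are arbitrary.
import numpy as np

condition = np.array([[True, False], [False, True]])
x = np.array([[1, 2], [3, 4]])
y = np.zeros((1, 2))                 # broadcast against the (2, 2) inputs
print(np.where(condition, x, y))     # [[1. 0.]
                                     #  [0. 4.]]
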
Exemple #57
0
def n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation,
                    use_bi_direction, **kwargs):
    """n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction)

    Base function for Stack RNN/BiRNN functions.

    This function is used by :func:`chainer.functions.n_step_birnn` and
    :func:`chainer.functions.n_step_rnn`.
    Its behavior depends on the following arguments:
    ``activation`` and ``use_bi_direction``.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers
            and is equal to ``n_layers``, ``B`` is the mini-batch size, and
            ``N`` is the dimension of the hidden units.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing two matrices.
            ``ws[i][j]`` corresponds to ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 1`` has the shape ``(I, N)``,
            as it is multiplied with the input variables. All other matrices
            have the shape ``(N, N)``.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing two vectors.
            ``bs[i][j]`` corresponds to ``b_j`` in the equation.
            The shape of each vector is ``(N,)`` where ``N`` is the dimension
            of the hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds the input
            value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            the mini-batch size for time ``t``, and ``I`` is the size of the
            input units. Note that this function supports variable-length
            sequences. When the sequences have different lengths, sort them
            in descending order of length and transpose the sorted sequences.
            :func:`~chainer.functions.transpose_sequence` transposes a list
            of :class:`~chainer.Variable` objects holding sequences.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        activation (str): Activation function name.
            Please select ``tanh`` or ``relu``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-directional RNN.

    Returns:
        tuple: This function returns a tuple containing two elements,
            ``hy`` and ``ys``.

            - ``hy`` is an updated hidden state whose shape is the same as
              ``hx``.
            - ``ys`` is a list of :class:`~chainer.Variable`. Each element
              ``ys[t]`` holds the hidden states of the last layer
              corresponding to the input ``xs[t]``. Its shape is ``(B_t, N)``
              where ``B_t`` is the mini-batch size for time ``t``, and ``N``
              is the size of the hidden units. Note that ``B_t`` is the same
              as the mini-batch size of ``xs[t]``.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    if kwargs:
        argument.check_unexpected_kwargs(
            kwargs,
            train='train argument is not supported anymore. '
            'Use chainer.using_config',
            use_cudnn='use_cudnn argument is not supported anymore. '
            'Use chainer.using_config')
        argument.assert_kwargs_empty(kwargs)

    activation_list = ['tanh', 'relu']
    if activation not in activation_list:
        candidate = ','.join(activation_list)
        raise ValueError('Invalid activation: "%s". Please select from [%s]' %
                         (activation, candidate))

    xp = backend.get_array_module(hx)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = cuda.get_cudnn_dropout_states()
        states.set_dropout_ratio(dropout_ratio)
        lengths = [len(x) for x in xs]
        xs = chainer.functions.concat(xs, axis=0)

        rnn_mode = 'rnn_%s' % activation
        w = cudnn_rnn_weight_concat(n_layers, states, use_bi_direction,
                                    rnn_mode, ws, bs)

        if use_bi_direction:
            # Bi-directional RNN
            if activation == 'tanh':
                rnn = NStepBiRNNTanh
            elif activation == 'relu':
                rnn = NStepBiRNNReLU
        else:
            # Uni-directional RNN
            if activation == 'tanh':
                rnn = NStepRNNTanh
            elif activation == 'relu':
                rnn = NStepRNNReLU

        hy, ys = rnn(n_layers, states, lengths)(hx, w, xs)
        sections = numpy.cumsum(lengths[:-1])
        ys = chainer.functions.split_axis(ys, sections, 0)
        return hy, ys

    else:

        def f(x, h, c, w, b):
            xw, hw = w
            xb, hb = b
            rnn_in = linear.linear(x, xw, xb) + linear.linear(h, hw, hb)
            if activation == 'tanh':
                return tanh.tanh(rnn_in), None
            elif activation == 'relu':
                return relu.relu(rnn_in), None

        hy, _, ys = n_step_rnn_impl(f, n_layers, dropout_ratio, hx, None, ws,
                                    bs, xs, use_bi_direction)
        return hy, ys
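
For reference, the non-cuDNN branch above applies one vanilla-RNN step per
time step; a minimal NumPy sketch of a single step, with made-up shapes and a
hypothetical helper name, is:

# One vanilla-RNN step as in the fallback branch above; shapes and the helper
# name are assumptions for illustration.
import numpy as np

def rnn_step(x, h, xw, xb, hw, hb, activation=np.tanh):
    # rnn_in = x W_x^T + b_x + h W_h^T + b_h, followed by the activation,
    # mirroring linear(x, xw, xb) + linear(h, hw, hb) above.
    rnn_in = x.dot(xw.T) + xb + h.dot(hw.T) + hb
    return activation(rnn_in)

batch, in_size, hidden = 3, 4, 5
x = np.random.randn(batch, in_size).astype(np.float32)
h = np.zeros((batch, hidden), dtype=np.float32)
xw = np.random.randn(hidden, in_size).astype(np.float32)
hw = np.random.randn(hidden, hidden).astype(np.float32)
xb = np.zeros(hidden, dtype=np.float32)
hb = np.zeros(hidden, dtype=np.float32)

h_next = rnn_step(x, h, xw, xb, hw, hb)
print(h_next.shape)  # (3, 5)
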
Exemple #58
0
    def forward(self, inputs):
        # output a dummy array.
        xp = backend.get_array_module(inputs[0])
        return xp.empty((0, ), dtype=numpy.float32),
Exemple #59
0
def _ones_like(arr):
    device = cuda.get_device_from_array(arr)
    xp = backend.get_array_module(arr)
    with device:
        return xp.ones_like(arr)
Exemple #60
0
    def forward(self, xs):
        xp = backend.get_array_module(*xs)
        return xp.hstack(xs),