Example #1
    def backward_gpu(self, x, gy):
        e1 = array.as_mat(x[0])
        e2 = array.as_mat(x[1])
        gy, = gy

        kern_add = cuda.reduce('T in0, T in1, T in2', 'T out',
                               'in0 * in1 * in2', 'a + b', 'out += a', 0,
                               'bilinear_product_add')
        kern = cuda.reduce('T in0, T in1, T in2', 'T out', 'in0 * in1 * in2',
                           'a + b', 'out = a', 0, 'bilinear_product')

        e1_b = e1[:, :, numpy.newaxis, numpy.newaxis]  # ij
        e2_b = e2[:, numpy.newaxis, :, numpy.newaxis]  # ik
        gy_b = gy[:, numpy.newaxis, numpy.newaxis, :]  # il
        W_b = self.W[numpy.newaxis, :, :, :]  # jkl

        # 'ij,ik,il->jkl'
        kern_add(e1_b, e2_b, gy_b, self.gW, axis=0)

        if not self.nobias:
            self.gV1 += e1.T.dot(gy)
            self.gV2 += e2.T.dot(gy)
            self.gb += gy.sum(axis=0)

        # 'ik,jkl,il->ij'
        ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))
        # 'ij,jkl,il->ik'
        ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))

        if not self.nobias:
            ge1 += gy.dot(self.V1.T)
            ge2 += gy.dot(self.V2.T)
        return (ge1.reshape(x[0].shape), ge2.reshape(x[1].shape))
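The broadcast views plus the 'bilinear_product' reduction implement the einsum contractions named in the comments: the elementwise product of the broadcast arrays has shape (i, j, k, l), and reducing over the listed axes removes the contracted indices. Below is a minimal NumPy-only sketch of the same trick (illustrative names, not part of the original code):

import numpy

rng = numpy.random.RandomState(0)
n_i, n_j, n_k, n_l = 2, 3, 4, 5
e1 = rng.randn(n_i, n_j).astype(numpy.float32)       # ij
e2 = rng.randn(n_i, n_k).astype(numpy.float32)       # ik
gy = rng.randn(n_i, n_l).astype(numpy.float32)       # il
W = rng.randn(n_j, n_k, n_l).astype(numpy.float32)   # jkl

e1_b = e1[:, :, None, None]
e2_b = e2[:, None, :, None]
gy_b = gy[:, None, None, :]
W_b = W[None, :, :, :]

# 'ij,ik,il->jkl': the product has shape (i, j, k, l); summing over axis 0 removes i.
gW = (e1_b * e2_b * gy_b).sum(axis=0)
assert numpy.allclose(gW, numpy.einsum('ij,ik,il->jkl', e1, e2, gy), atol=1e-4)

# 'ik,jkl,il->ij': summing over axes (2, 3) removes k and l.
ge1 = (e2_b * W_b * gy_b).sum(axis=(2, 3))
assert numpy.allclose(ge1, numpy.einsum('ik,jkl,il->ij', e2, W, gy), atol=1e-4)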
Example #2
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs[:2]
        log_y = super(AdaptiveSoftmaxCrossEntropy, self).forward(inputs)[0]
        self.y = cupy.exp(log_y)

        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw T coeff, '
                'S ignore_label', 'T out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = a * -coeff[0]', '0',
                'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                                self._coeff, self.ignore_label)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out', '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''', 'softmax_crossent_no_reduce_fwd')(t, log_y.reduced_view(),
                                                       log_y.shape[-1],
                                                       self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
Example #3
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = softmax_log(x, self.use_cudnn)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if getattr(self, "normalize", True):
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce(
            "S t, raw T log_y, int32 n_channel, raw T coeff",
            "T out",
            "t == -1 ? T(0) : log_y[_j * n_channel + t]",
            "a + b",
            "out = a * -coeff[0]",
            "0",
            "crossent_fwd",
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
        return (ret,)
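The 'crossent_fwd' reduction gathers -log_y at each sample's target class, skips entries whose label is the ignore value (hard-coded as -1 here), and scales the sum by coeff, the reciprocal of the number of counted samples. A rough NumPy sketch of the same computation for 2-D input (illustrative, not part of the original code):

import numpy

rng = numpy.random.RandomState(0)
x = rng.randn(4, 3).astype(numpy.float32)
t = numpy.array([0, 2, -1, 1], dtype=numpy.int32)

x_shifted = x - x.max(axis=1, keepdims=True)
log_y = x_shifted - numpy.log(numpy.exp(x_shifted).sum(axis=1, keepdims=True))  # log-softmax
coeff = 1.0 / max(1, int((t != -1).sum()))                                       # 'normalize=True' branch
picked = numpy.where(t == -1, 0.0,
                     log_y[numpy.arange(len(t)), numpy.maximum(t, 0)])
loss = -coeff * picked.sum()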
Example #4
    def backward(self, inputs, grad_outputs):
        e1 = array.as_mat(inputs[0])
        e2 = array.as_mat(inputs[1])
        W = inputs[2]
        gy = grad_outputs[0]

        xp = cuda.get_array_module(*inputs)
        if xp is numpy:
            gW = numpy.einsum('ij,ik,il->jkl', e1, e2, gy)
            ge1 = numpy.einsum('ik,jkl,il->ij', e2, W, gy)
            ge2 = numpy.einsum('ij,jkl,il->ik', e1, W, gy)
        else:
            kern = cuda.reduce('T in0, T in1, T in2', 'T out',
                               'in0 * in1 * in2', 'a + b', 'out = a', 0,
                               'bilinear_product')

            e1_b = e1[:, :, None, None]  # ij
            e2_b = e2[:, None, :, None]  # ik
            gy_b = gy[:, None, None, :]  # il
            W_b = W[None, :, :, :]  # jkl

            gW = kern(e1_b, e2_b, gy_b, axis=0)  # 'ij,ik,il->jkl'
            ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))  # 'ik,jkl,il->ij'
            ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))  # 'ij,jkl,il->ik'

        ret = ge1.reshape(inputs[0].shape), ge2.reshape(inputs[1].shape), gW
        if len(inputs) == 6:
            V1, V2, b = inputs[3:]
            gV1 = e1.T.dot(gy)
            gV2 = e2.T.dot(gy)
            gb = gy.sum(0)
            ge1 += gy.dot(V1.T)
            ge2 += gy.dot(V2.T)
            ret += gV1, gV2, gb
        return ret
Example #5
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = log_softmax._log_softmax(x, self.use_cudnn)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(
                self.class_weight.reshape(shape), x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
            't == -1 ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
        return ret,
Example #6
def numerical_grad(f, inputs, grad_outputs, eps=1e-3):
    """Computes numerical gradient by finite differences.

    This function is used to implement gradient check. For usage example, see
    unit tests of :mod:`chainer.functions`.

    Args:
        f (function): Python function with no arguments that runs forward
            computation and returns the result.
        inputs (tuple of arrays): Tuple of arrays that should be treated as
            inputs. Each element of them is slightly modified to realize
            numerical gradient by finite differences.
        grad_outputs (tuple of arrays): Tuple of arrays that are treated as
            output gradients.
        eps (float): Epsilon value of finite differences.

    Returns:
        tuple: Numerical gradient arrays corresponding to ``inputs``.

    """
    assert eps > 0
    inputs = tuple(inputs)
    grad_outputs = tuple(grad_outputs)
    gpu = any(isinstance(x, cuda.ndarray) for x in inputs + grad_outputs)
    cpu = any(isinstance(x, numpy.ndarray) for x in inputs + grad_outputs)

    if gpu and cpu:
        raise RuntimeError('Do not mix GPU and CPU arrays in `numerical_grad`')

    if gpu:
        xp = cuda.cupy
        numerical_grad_kernel = cuda.reduce(
            'T y1, T y2, U gy, T eps', 'V gxi',
            '(y1 - y2) * gy', 'a + b', 'gxi += a / (eps * 2)', '0',
            'numerical_grad_kernel'
        )
    else:
        xp = numpy
    grads = [xp.zeros_like(x) for x in inputs]

    with configuration.using_config('type_check', False):
        for x, gx in six.moves.zip(inputs, grads):
            for i in numpy.ndindex(x.shape):
                orig = x[i].copy()  # hold original value
                x[i] = orig + eps
                ys1 = _copy_arrays(f())
                x[i] = orig - eps
                ys2 = _copy_arrays(f())
                x[i] = orig
                for y1, y2, gy in six.moves.zip(ys1, ys2, grad_outputs):
                    if gy is not None:
                        if (gpu and isinstance(y1, cuda.ndarray) and
                                isinstance(y2, cuda.ndarray) and
                                isinstance(gy, cuda.ndarray)):
                            numerical_grad_kernel(y1, y2, gy, eps, gx[i])
                        else:
                            dot = ((y1 - y2) * gy).sum()
                            gx[i] += dot / (2 * eps)

    return grads
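The inner loop perturbs one input element at a time by ±eps and accumulates ((f(x + eps) - f(x - eps)) * gy).sum() / (2 * eps), a central-difference estimate of the directional derivative. A self-contained NumPy sketch of the same rule (the helper name is hypothetical, not part of the original code):

import numpy

def central_diff_grad(f, x, gy, eps=1e-3):
    # gx[i] ~= sum((f(x + eps) - f(x - eps)) * gy) / (2 * eps), perturbing one element at a time
    gx = numpy.zeros_like(x)
    for i in numpy.ndindex(x.shape):
        orig = x[i]
        x[i] = orig + eps
        y1 = f()
        x[i] = orig - eps
        y2 = f()
        x[i] = orig
        gx[i] = ((y1 - y2) * gy).sum() / (2 * eps)
    return gx

x = numpy.array([1.0, 2.0, 3.0])
gy = numpy.ones(3)
g = central_diff_grad(lambda: x ** 2, x, gy)   # analytic gradient of sum(x**2 * gy) is 2 * x
assert numpy.allclose(g, 2 * x, atol=1e-3)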
Example #7
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = softmax_log(x, self.use_cudnn)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if getattr(self, 'normalize', True):
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        t_cpu = cupy.asnumpy(t)
        # per-sample class weights; use a float array so the weights are not truncated to ints
        tw = t_cpu.astype(numpy.float32)
        for i_class, weight in enumerate(self.class_weights):
            tw[t_cpu == i_class] = weight

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce(
            'S t, T tw, raw T log_y, int32 n_channel, raw T coeff', 'T out',
            't == -1 ? T(0) : log_y[_j * n_channel + t] * tw', 'a + b',
            'out = a * -coeff[0]', '0', 'crossent_fwd')(t, cupy.array(tw),
                                                        log_y.reduced_view(),
                                                        log_y.shape[-1],
                                                        self._coeff)
        return ret,
Example #8
def numerical_grad(f, inputs, grad_outputs, eps=1e-3):
    """Computes numerical gradient by finite differences.

    This function is used to implement gradient check. For usage example, see
    unit tests of :mod:`chainer.functions`.

    Args:
        f (function): Python function with no arguments that runs forward
            computation and returns the result.
        inputs (tuple of arrays): Tuple of arrays that should be treated as
            inputs. Each element of them is slightly modified to realize
            numerical gradient by finite differences.
        grad_outputs (tuple of arrays): Tuple of arrays that are treated as
            output gradients.
        eps (float): Epsilon value of finite differences.

    Returns:
        tuple: Numerical gradient arrays corresponding to ``inputs``.

    """
    assert eps > 0
    inputs = tuple(inputs)
    grad_outputs = tuple(grad_outputs)
    gpu = any(isinstance(x, cuda.ndarray) for x in inputs + grad_outputs)
    cpu = any(isinstance(x, numpy.ndarray) for x in inputs + grad_outputs)

    if gpu and cpu:
        raise RuntimeError('Do not mix GPU and CPU arrays in `numerical_grad`')

    if gpu:
        xp = cuda.cupy
        numerical_grad_kernel = cuda.reduce(
            'T y1, T y2, U gy, T eps', 'V gxi',
            '(y1 - y2) * gy', 'a + b', 'gxi += a / (eps * 2)', '0',
            'numerical_grad_kernel'
        )
    else:
        xp = numpy
    grads = [xp.zeros_like(x) for x in inputs]

    with configuration.using_config('type_check', False):
        for x, gx in six.moves.zip(inputs, grads):
            for i in numpy.ndindex(x.shape):
                orig = x[i].copy()  # hold original value
                x[i] = orig + eps
                ys1 = _copy_arrays(f())
                x[i] = orig - eps
                ys2 = _copy_arrays(f())
                x[i] = orig
                for y1, y2, gy in six.moves.zip(ys1, ys2, grad_outputs):
                    if gy is not None:
                        if (gpu and isinstance(y1, cuda.ndarray) and
                                isinstance(y2, cuda.ndarray) and
                                isinstance(gy, cuda.ndarray)):
                            numerical_grad_kernel(y1, y2, gy, eps, gx[i])
                        else:
                            dot = ((y1 - y2) * gy).sum()
                            gx[i] += dot / (2 * eps)

    return grads
Example #9
    def backward(self, inputs, grad_outputs):
        e1 = array.as_mat(inputs[0])
        e2 = array.as_mat(inputs[1])
        W = inputs[2]
        gy = grad_outputs[0]

        xp = cuda.get_array_module(*inputs)
        if xp is numpy:
            gW = numpy.einsum("ij,ik,il->jkl", e1, e2, gy)
            ge1 = numpy.einsum("ik,jkl,il->ij", e2, W, gy)
            ge2 = numpy.einsum("ij,jkl,il->ik", e1, W, gy)
        else:
            kern = cuda.reduce(
                "T in0, T in1, T in2", "T out", "in0 * in1 * in2", "a + b", "out = a", 0, "bilinear_product"
            )

            e1_b = e1[:, :, None, None]  # ij
            e2_b = e2[:, None, :, None]  # ik
            gy_b = gy[:, None, None, :]  # il
            W_b = W[None, :, :, :]  # jkl

            gW = kern(e1_b, e2_b, gy_b, axis=0)  # 'ij,ik,il->jkl'
            ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))  # 'ik,jkl,il->ij'
            ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))  # 'ij,jkl,il->ik'

        ret = ge1.reshape(inputs[0].shape), ge2.reshape(inputs[1].shape), gW
        if len(inputs) == 6:
            V1, V2, b = inputs[3:]
            gV1 = e1.T.dot(gy)
            gV2 = e2.T.dot(gy)
            gb = gy.sum(0)
            ge1 += gy.dot(V1.T)
            ge2 += gy.dot(V2.T)
            ret += gV1, gV2, gb
        return ret
Example #10
 def forward_gpu(self, inputs):
     x0, x1 = inputs
     ret = cuda.reduce('const float* x0, const float* x1',
                       '(x0[i] - x1[i]) * (x0[i] - x1[i])', 'a+b', '0',
                       'mse_fwd', numpy.float32)(x0, x1)
     ret /= x0.size
     return ret,
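The 'mse_fwd' kernel sums the squared differences over all elements, and the division by x0.size afterwards turns that into the mean squared error of the flattened arrays. A hypothetical NumPy equivalent (not part of the original code):

import numpy

x0 = numpy.random.randn(8, 3).astype(numpy.float32)
x1 = numpy.random.randn(8, 3).astype(numpy.float32)
mse = ((x0 - x1) ** 2).sum() / x0.size   # identical to ((x0 - x1) ** 2).mean()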
Example #11
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = log_softmax._log_softmax(x, self.use_cudnn)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(self.class_weight.reshape(shape),
                                       x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce('S t, raw T log_y, int32 n_channel, raw T coeff',
                          'T out',
                          't == -1 ? T(0) : log_y[_j * n_channel + t]',
                          'a + b', 'out = a * -coeff[0]', '0',
                          'crossent_fwd')(t, log_y.reduced_view(),
                                          log_y.shape[-1], self._coeff)
        return ret,
Example #12
 def forward_gpu(self, inputs):
     x, t = inputs
     self.y, = sigmoid.Sigmoid(self.use_cudnn).forward_gpu((x,))
     loss = -cuda.reduce(
         'int* t, float* x',
         'x[i] * (t[i] - (x[i] >= 0)) - log1pf(expf(-fabsf(x[i])))',
         'a+b', '0', 'sigmoid_crossent_fwd', numpy.float32)(t, x)
     return loss / t.shape[0],
Example #13
def _popcount():
    return cuda.reduce(
            "T x", "T y",
            "2*__popc(x)-32",
            "a+b",
            "y = a",
            "0",
            "popcount")
Example #14
 def forward_gpu(self, inputs):
     x0, x1 = inputs
     ret = cuda.reduce(
         'const float* x0, const float* x1',
         '(x0[i] - x1[i]) * (x0[i] - x1[i])',
         'a+b', '0', 'mse_fwd', numpy.float32)(x0, x1)
     ret /= x0.size
     return ret,
Example #15
 def forward_gpu(self, inputs):
     x, t = inputs
     self.y, = Softmax(self.use_cudnn).forward_gpu((x,))
     ret = cuda.reduce(
         'int* t, float* y, int n_channel', '-log(y[i * n_channel + t[i]])',
         'a+b', '0', 'crossent_fwd', numpy.float32)(t, self.y, self.y.shape[1])
     ret /= t.size
     return ret,
Example #16
 def forward_gpu(self, inputs):
     x, t = inputs
     self.y, = sigmoid.Sigmoid(self.use_cudnn).forward_gpu((x, ))
     loss = cuda.reduce('T x, S t, T inv_cnt', 'T out',
                        'x * (t - (x >= 0)) - log1p(exp(-fabs(x)))',
                        'a + b', 'out = a * inv_cnt', 0,
                        'sigmoid_crossent_fwd')(x, t, -1.0 / t.shape[0])
     return loss,
Example #17
 def forward_gpu(self, inputs):
     x, t = inputs
     self.y, = sigmoid.Sigmoid(self.use_cudnn).forward_gpu((x, ))
     loss = -cuda.reduce(
         'int* t, float* x',
         'x[i] * (t[i] - (x[i] >= 0)) - log1pf(expf(-fabsf(x[i])))', 'a+b',
         '0', 'sigmoid_crossent_fwd', numpy.float32)(t, x)
     return loss / t.shape[0],
Example #18
    def backward(self, inputs, grad_outputs):
        #
        # preprocess
        #
        x, gamma = inputs[:2]
        gy, gl = grad_outputs
        head_ndim = gamma.ndim + 1
        expander = (None, Ellipsis) + (None, ) * (x.ndim - head_ndim)
        m = gamma.dtype.type(x.size // gamma.size)
        axis = (0, ) + tuple(range(head_ndim, x.ndim))
        xp = cuda.get_array_module(x)

        if len(inputs) == 6:
            assert not chainer.config.train
            # we do not have to consider Lipschitz constant
            var = inputs[5] + self.eps
            gs = gamma * self.std_inv
            gbeta = gy.sum(axis=axis)
            ggamma = (gy * self.x_hat).sum(axis=axis)
            gmean = -gs * gbeta
            gvar = -0.5 * gamma / var * ggamma
            gx = gs[expander] * gy
            return gx, ggamma, gbeta, None, gmean, gvar

        assert chainer.config.train
        gbeta = gy.sum(axis=axis)
        ggamma = cuda.reduce('T gy, T x_hat', 'T out', 'gy * x_hat', 'a + b',
                             'out = a', '0', 'conv_bn_ggamma')(gy,
                                                               self.x_hat,
                                                               axis=axis,
                                                               keepdims=False)
        if gl is not None:
            assert getattr(chainer.config, 'lmt', False)
            cuda.elementwise(
                'T gl, T u, T u_mid, T std_inv', 'T ggamma', '''
                ggamma += gl * u * u_mid * std_inv;
                ''', 'conv_bn_ggamma2')(gl, self.u.reshape(self.std_inv.shape),
                                        self.u_mid.reshape(self.std_inv.shape),
                                        self.std_inv, ggamma)
        inv_m = numpy.float32(1) / m
        if xp is numpy:
            gx = (gamma * self.std_inv)[expander] * (
                gy - (self.x_hat * ggamma[expander] + gbeta[expander]) / m)
        else:
            # in LMT, ggamma is changed and this automatically corrects gx
            gx = cuda.elementwise(
                'T gy, T x_hat, T gamma, T std_inv, T ggamma, T gbeta, \
                T inv_m', 'T gx',
                'gx = (gamma * std_inv) * (gy - (x_hat * ggamma + gbeta) * \
                inv_m)',
                'conv_bn_bwd')(gy, self.x_hat, gamma[expander],
                               self.std_inv[expander], ggamma[expander],
                               gbeta[expander], inv_m)
        if gl is not None:
            return gx, ggamma, gbeta, (gl * self.u_mid.T * self.v).reshape(
                inputs[3].shape)
        else:
            return gx, ggamma, gbeta, None
Example #19
 def forward_gpu(self, inputs):
     x, t = inputs
     self.y, = sigmoid.Sigmoid(self.use_cudnn).forward_gpu((x,))
     loss = cuda.reduce(
         'T x, S t, T inv_cnt', 'T out',
         'x * (t - (x >= 0)) - log1p(exp(-fabs(x)))',
         'a + b', 'out = a * inv_cnt', 0,
         'sigmoid_crossent_fwd')(x, t, -1.0 / t.shape[0])
     return loss,
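The map expression x * (t - (x >= 0)) - log1p(exp(-fabs(x))) is the negative of the numerically stable per-element sigmoid cross entropy, so multiplying the summed value by inv_cnt = -1.0 / t.shape[0] gives the mean loss. A hypothetical NumPy check of that identity (not part of the original code):

import numpy

rng = numpy.random.RandomState(0)
x = rng.randn(6) * 5.0
t = rng.randint(0, 2, size=6).astype(numpy.float64)

stable = -(x * (t - (x >= 0)) - numpy.log1p(numpy.exp(-numpy.abs(x))))   # stable per-element loss
sig = 1.0 / (1.0 + numpy.exp(-x))
naive = -(t * numpy.log(sig) + (1.0 - t) * numpy.log(1.0 - sig))
assert numpy.allclose(stable, naive)

loss = stable.mean()   # what the kernel returns via 'out = a * inv_cnt' with inv_cnt = -1 / N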
Example #20
 def forward_gpu(self, inputs):
     x, t = inputs
     self.y, = Softmax(self.use_cudnn).forward_gpu((x, ))
     ret = cuda.reduce('int* t, float* y, int n_channel',
                       '-log(y[i * n_channel + t[i]])', 'a+b', '0',
                       'crossent_fwd', numpy.float32)(t, self.y,
                                                      self.y.shape[1])
     ret /= t.size
     return ret,
Example #21
 def forward_gpu(self, inputs):
     x0, x1 = inputs
     ret = cuda.reduce(
         "const float* x0, const float* x1",
         "(x0[i] - x1[i]) * (x0[i] - x1[i])",
         "a+b",
         "0",
         "mse_fwd",
         numpy.float32,
     )(x0, x1)
     ret /= x0.size
     return (ret,)
Example #22
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(
                self.class_weight.reshape(shape), x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw T coeff, '
                'S ignore_label',
                'T out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1],
              self._coeff, self.ignore_label)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
                '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''',
                'softmax_crossent_no_reduce_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
Example #23
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(
                self.class_weight.reshape(shape), x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw T coeff, '
                'S ignore_label',
                'T out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1],
              self._coeff, self.ignore_label)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
                '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''',
                'softmax_crossent_no_reduce_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
Example #24
def normalize(arr, eps):
    """normalize input array and return its norm
    from https://github.com/pfnet-research/sngan_projection/blob/master/source/functions/max_sv.py#L5

    :param arr: numpy ndarray or cupy ndarray
    :param eps: epsilon for numerical stability
    :return: norm of input array
    """

    norm = cuda.reduce('T x', 'T out', 'x * x', 'a + b', 'out = sqrt(a)', 0,
                       'norm_sn')(arr)
    cuda.elementwise('T norm, T eps', 'T x', 'x /= (norm + eps)',
                     'div_sn')(norm, eps, arr)
    return norm
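The 'norm_sn' reduction computes the L2 norm sqrt(sum(x * x)) and the 'div_sn' elementwise kernel then divides the array in place by (norm + eps). A minimal NumPy sketch of the same behaviour (the helper and its default eps are assumptions, not part of the original code):

import numpy

def normalize_numpy(arr, eps=1e-12):
    norm = numpy.sqrt((arr * arr).sum())   # what the 'norm_sn' reduction produces
    arr /= norm + eps                      # what the 'div_sn' kernel does in place
    return norm

v = numpy.random.randn(10).astype(numpy.float32)
n = normalize_numpy(v)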
Example #25
File: bilinear.py  Project: ryuuji5/chainer
    def backward_gpu(self, x, gy):
        e1 = array.as_mat(x[0])
        e2 = array.as_mat(x[1])
        gy, = gy

        kern_add = cuda.reduce(
            'T in0, T in1, T in2', 'T out',
            'in0 * in1 * in2', 'a + b', 'out += a', 0,
            'bilinear_product_add')
        kern = cuda.reduce(
            'T in0, T in1, T in2', 'T out',
            'in0 * in1 * in2', 'a + b', 'out = a', 0,
            'bilinear_product')

        e1_b = e1[:, :, numpy.newaxis, numpy.newaxis]  # ij
        e2_b = e2[:, numpy.newaxis, :, numpy.newaxis]  # ik
        gy_b = gy[:, numpy.newaxis, numpy.newaxis, :]  # il
        W_b = self.W[numpy.newaxis, :, :, :]  # jkl

        # 'ij,ik,il->jkl'
        kern_add(e1_b, e2_b, gy_b, self.gW, axis=0)

        if not self.nobias:
            self.gV1 += e1.T.dot(gy)
            self.gV2 += e2.T.dot(gy)
            self.gb += gy.sum(axis=0)

        # 'ik,jkl,il->ij'
        ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))
        # 'ij,jkl,il->ik'
        ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))

        if not self.nobias:
            ge1 += gy.dot(self.V1.T)
            ge2 += gy.dot(self.V2.T)
        return (ge1.reshape(x[0].shape), ge2.reshape(x[1].shape))
Example #26
 def forward_gpu(self, inputs):
     cupy = cuda.cupy
     x, t = inputs
     self.y, = softmax.Softmax(self.use_cudnn).forward((x, ))
     n_unit = int(numpy.prod(self.y.shape[2:]))
     if getattr(self, 'normalize', True):
         count = t.shape[0] * n_unit
     else:
         count = t.shape[0]
     y = cupy.rollaxis(self.y, 1, self.y.ndim)  # move the class axis to the end
     ret = cuda.reduce('S t, raw T y, int32 n_channel, T inv_count',
                       'T out', 'log(y[_j * n_channel + t])', 'a + b',
                       'out = a * inv_count', '0',
                       'crossent_fwd')(t, y, y.shape[-1], -1.0 / count)
     return ret,
Example #27
 def forward_gpu(self, inputs):
     cupy = cuda.cupy
     x, t = inputs
     self.y, = softmax.Softmax(self.use_cudnn).forward((x, ))
     if getattr(self, 'normalize', True):
         count = x.size // x.shape[1]
     else:
         count = x.shape[0]
     y = cupy.rollaxis(self.y, 1, self.y.ndim)
     ret = cuda.reduce('S t, raw T y, int32 n_channel, T inv_count',
                       'T out', 'log(y[_j * n_channel + t])', 'a + b',
                       'out = a * inv_count', '0',
                       'crossent_fwd')(t, y.reduced_view(), y.shape[-1],
                                       -1.0 / count)
     return ret,
Example #28
    def forward_gpu(self, inputs):
        x, t = inputs

        max_length = cuda.reduce(
            'int* t, int* begins', 'begins[t[i] + 1] - begins[t[i]]',
            'max(a,b)', '0', 'binary_hierarchical_softmax_max_length',
            numpy.int32
        )(t, self.begins)
        max_length = cuda.to_cpu(max_length)[()]

        length = max_length * x.shape[0]
        ls = cuda.empty((length,), dtype=numpy.float32)
        n_in = x.shape[1]
        wxy = cuda.empty((length,), dtype=numpy.float32)
        cuda.elementwise(
            '''float* ls, float* wxy, const float* x, const float* w,
            const int* ts, const int* paths, const float* codes,
            const int* begins, int c, int max_length''',
            '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];

              x = &x[ind * c];

              float wx = 0;
              for (int j = 0; j < c; ++j) {
                wx += w[node * c + j] * x[j];
              }
              wxy[i] = wx * codes[p];
              ls[i] = log(1 + exp(-wxy[i]));
            } else {
              ls[i] = 0;
            }
            ''',
            'binary_hierarchical_softmax_forward'
        )(ls, wxy, x, self.W, t, self.paths, self.codes, self.begins,
          n_in, max_length)
        self.max_length = max_length
        self.wxy = wxy
        return cuda.gpuarray.sum(ls),
Example #29
    def forward_gpu(self, inputs):
        x, t = inputs

        max_length = cuda.reduce(
            'int* t, int* begins', 'begins[t[i] + 1] - begins[t[i]]',
            'max(a,b)', '0', 'binary_hierarchical_softmax_max_length',
            numpy.int32
        )(t, self.begins)
        max_length = cuda.to_cpu(max_length)[()]

        length = max_length * x.shape[0]
        ls = cuda.empty((length,), dtype=numpy.float32)
        n_in = x.shape[1]
        wxy = cuda.empty((length,), dtype=numpy.float32)
        cuda.elementwise(
            '''float* ls, float* wxy, const float* x, const float* w,
            const int* ts, const int* paths, const float* codes,
            const int* begins, int c, int max_length''',
            '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];

              x = &x[ind * c];

              float wx = 0;
              for (int j = 0; j < c; ++j) {
                wx += w[node * c + j] * x[j];
              }
              wxy[i] = wx * codes[p];
              ls[i] = log(1 + exp(-wxy[i]));
            } else {
              ls[i] = 0;
            }
            ''',
            'binary_hierarchical_softmax_forward'
        )(ls, wxy, x, self.W, t, self.paths, self.codes, self.begins,
          n_in, max_length)
        self.max_length = max_length
        self.wxy = wxy
        return cuda.gpuarray.sum(ls),
Example #30
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t, W = inputs
        max_length = cuda.reduce(
            'T t, raw T begins', 'T out', 'begins[t + 1] - begins[t]',
            'max(a, b)', 'out = a', '0',
            'binary_hierarchical_softmax_max_length')(t, self.begins)
        max_length = cuda.to_cpu(max_length)[()]

        length = max_length * x.shape[0]
        ls = cupy.empty((length,), dtype=numpy.float32)
        n_in = x.shape[1]
        wxy = cupy.empty_like(ls)
        cuda.elementwise(
            '''raw T x, raw T w, raw int32 ts, raw int32 paths,
            raw T codes, raw int32 begins, int32 c, int32 max_length''',
            'T ls, T wxy',
            '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];

              T wx = 0;
              for (int j = 0; j < c; ++j) {
                int w_ind[] = {node, j};
                int x_ind[] = {ind, j};
                wx += w[w_ind] * x[x_ind];
              }
              wxy = wx * codes[p];
              ls = log(1 + exp(-wxy));
            } else {
              ls = 0;
            }
            ''',
            'binary_hierarchical_softmax_forward'
        )(x, W, t, self.paths, self.codes, self.begins, n_in, max_length, ls,
          wxy)
        self.max_length = max_length
        self.wxy = wxy
        return ls.sum(),
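For every sample, the elementwise kernel walks the path stored for the target class (paths[begins[t]:begins[t + 1]]), takes the dot product of the input row with that node's weight row, and accumulates log(1 + exp(-code * wx)); the final ls.sum() is the batch loss. A rough NumPy sketch with tiny, hypothetical path/code/begins arrays (not part of the original code):

import numpy

def hsm_loss(x, t, W, paths, codes, begins):
    loss = 0.0
    for n in range(len(t)):
        begin, end = begins[t[n]], begins[t[n] + 1]
        for p in range(begin, end):
            wx = W[paths[p]].dot(x[n])                 # w[node] . x[sample]
            loss += numpy.log1p(numpy.exp(-wx * codes[p]))
    return loss

x = numpy.random.randn(3, 4).astype(numpy.float32)        # (batch, n_in)
W = numpy.random.randn(5, 4).astype(numpy.float32)        # one row per internal tree node
t = numpy.array([0, 1, 0], dtype=numpy.int32)             # target classes
begins = numpy.array([0, 2, 4], dtype=numpy.int32)        # path offsets per class
paths = numpy.array([0, 1, 0, 2], dtype=numpy.int32)      # visited node ids
codes = numpy.array([1.0, -1.0, 1.0, 1.0], dtype=numpy.float32)  # branch signs
total = hsm_loss(x, t, W, paths, codes, begins)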
Example #31
    def forward_gpu(self, inputs):
        x, t, W = inputs
        max_length = cuda.reduce(
            'T t, raw T begins', 'T out', 'begins[t + 1] - begins[t]',
            'max(a, b)', 'out = a', '0',
            'binary_hierarchical_softmax_max_length')(t, self.begins)
        max_length = cuda.to_cpu(max_length)[()]

        length = max_length * x.shape[0]
        ls = cuda.cupy.empty((length,), dtype=numpy.float32)
        n_in = x.shape[1]
        wxy = cuda.cupy.empty_like(ls)
        cuda.elementwise(
            '''raw T x, raw T w, raw int32 ts, raw int32 paths,
            raw T codes, raw int32 begins, int32 c, int32 max_length''',
            'T ls, T wxy',
            '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];

              T wx = 0;
              for (int j = 0; j < c; ++j) {
                int w_ind[] = {node, j};
                int x_ind[] = {ind, j};
                wx += w[w_ind] * x[x_ind];
              }
              wxy = wx * codes[p];
              ls = log(1 + exp(-wxy));
            } else {
              ls = 0;
            }
            ''',
            'binary_hierarchical_softmax_forward'
        )(x, W, t, self.paths, self.codes, self.begins, n_in, max_length, ls,
          wxy)
        self.max_length = max_length
        self.wxy = wxy
        return ls.sum(),
Example #32
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        log_y = softmax_log(x, self.use_cudnn)
        self.y = cupy.exp(log_y)
        if getattr(self, 'normalize', True):
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
            't == -1 ? 0 : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
        return ret,
Example #33
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        log_y = softmax_log(x, self.use_cudnn)
        self.y = cupy.exp(log_y)
        if getattr(self, 'normalize', True):
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce('S t, raw T log_y, int32 n_channel, raw T coeff',
                          'T out', 't == -1 ? 0 : log_y[_j * n_channel + t]',
                          'a + b', 'out = a * -coeff[0]', '0',
                          'crossent_fwd')(t, log_y.reduced_view(),
                                          log_y.shape[-1], self._coeff)
        return ret,
Example #34
    def backward(self, inputs, grad_outputs):
        x, gamma = inputs[:2]
        gy, gl = grad_outputs
        head_ndim = gamma.ndim + 1
        expander = (None, Ellipsis) + (None, ) * (x.ndim - head_ndim)
        m = gamma.dtype.type(x.size // gamma.size)
        axis = (0, ) + tuple(range(head_ndim, x.ndim))
        xp = cuda.get_array_module(x)
        if len(inputs) == 5:
            assert not chainer.config.train
            # we do not have to consider Lipschitz constant
            var = inputs[4] + self.eps
            gs = gamma * self.std_inv
            gbeta = gy.sum(axis=axis)
            ggamma = (gy * self.x_hat).sum(axis=axis)
            gmean = -gs * gbeta
            gvar = -0.5 * gamma / var * ggamma
            gx = gs[expander] * gy
            return gx, ggamma, gbeta, gmean, gvar

        assert configuration.config.train
        gbeta = gy.sum(axis=axis)
        ggamma = cuda.reduce('T gy, T x_hat', 'T out', 'gy * x_hat', 'a + b',
                             'out = a', '0', 'bn_ggamma')(gy,
                                                          self.x_hat,
                                                          axis=axis,
                                                          keepdims=False)
        if gl is not None:
            assert getattr(chainer.config, 'lmt', False)
            ggamma[self.index] += gl.reshape(
                tuple()) * self.std_inv[self.index]
        inv_m = numpy.float32(1) / m
        if xp is numpy:
            gx = (gamma * self.std_inv)[expander] * (
                gy - (self.x_hat * ggamma[expander] + gbeta[expander]) / m)
        else:
            gx = cuda.elementwise(
                'T gy, T x_hat, T gamma, T std_inv, T ggamma, T gbeta, \
                T inv_m', 'T gx',
                'gx = (gamma * std_inv) * (gy - (x_hat * ggamma + gbeta) * \
                inv_m)', 'bn_bwd')(gy, self.x_hat, gamma[expander],
                                   self.std_inv[expander], ggamma[expander],
                                   gbeta[expander], inv_m)
        return gx, ggamma, gbeta
Example #35
 def __init__(self, epsilon=1e-5, stability=1e0):
     """

     Args:
         epsilon: How close the perturbed point is taken for the perturbation calculation.
         stability: Add this term to the denominator to stabilize...
     """
     self.epsilon = epsilon
     self.stability_term = stability
     self.init = True
     self.calc_inner_product_sum = cuda.reduce(
         'T u, T v',
         'T sum_uv',
         'u * v',
         'a + b',
         'sum_uv = a',
         '0',
         'calc_inner_product_sum'
     )
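The 'calc_inner_product_sum' reduction is simply a dot product: map 'u * v', reduce 'a + b'. A hypothetical NumPy equivalent (not part of the original code):

import numpy

u = numpy.random.randn(7).astype(numpy.float32)
v = numpy.random.randn(7).astype(numpy.float32)
sum_uv = (u * v).sum()   # equals u.dot(v)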
Example #36
 def forward_gpu(self, inputs):
     x, t = inputs
     self.y, = softmax.Softmax(self.use_cudnn).forward_gpu((x,))
     n_unit = int(numpy.prod(self.y.shape[2:]))
     # the map_expr is equivalent to the pseudo code -log(y[n, c, m]),
     # where n = i / n_unit, c = t[i], and m = i % n_unit
     ret = cuda.reduce(
         'int* t, float* y, int n_channel, int n_unit',
         '-log(y[n_unit * ((i / n_unit) * n_channel + t[i])'
         '       + (i % n_unit)])',
         'a+b', '0', 'crossent_fwd', numpy.float32
     )(t, self.y, self.y.shape[1], n_unit)
     if getattr(self, 'normalize', True):
         n_unit = int(numpy.prod(self.y.shape[2:]))
         count = t.shape[0] * n_unit
     else:
         count = t.shape[0]
     ret /= count
     return ret,
Example #37
 def forward_gpu(self, inputs):
     cupy = cuda.cupy
     x, t = inputs
     self.y, = softmax.Softmax(self.use_cudnn).forward((x,))
     if getattr(self, "normalize", True):
         count = x.size // x.shape[1]
     else:
         count = x.shape[0]
     y = cupy.rollaxis(self.y, 1, self.y.ndim)
     ret = cuda.reduce(
         "S t, raw T y, int32 n_channel, T inv_count",
         "T out",
         "log(y[_j * n_channel + t])",
         "a + b",
         "out = a * inv_count",
         "0",
         "crossent_fwd",
     )(t, y.reduced_view(), y.shape[-1], -1.0 / count)
     return (ret,)
Example #38
 def forward_gpu(self, inputs):
     x, t = inputs
     self.y, = softmax.Softmax(self.use_cudnn).forward_gpu((x,))
     n_unit = int(numpy.prod(self.y.shape[2:]))
     # the map_expr is equivalent to the pseudo code -log(y[n, c, m]),
     # where n = i / n_unit, c = t[i], and m = i % n_unit
     ret = cuda.reduce(
         'int* t, float* y, int n_channel, int n_unit',
         '-log(y[n_unit * ((i / n_unit) * n_channel + t[i])'
         '       + (i % n_unit)])',
         'a+b', '0', 'crossent_fwd', numpy.float32
     )(t, self.y, self.y.shape[1], n_unit)
     if getattr(self, 'normalize', True):
         n_unit = int(numpy.prod(self.y.shape[2:]))
         count = t.shape[0] * n_unit
     else:
         count = t.shape[0]
     ret /= count
     return ret,
Example #39
    def backward(self, inputs, grad_outputs):
        e1 = array.as_mat(inputs[0])
        e2 = array.as_mat(inputs[1])
        W = inputs[2]

        if not type_check.same_types(*inputs):
            raise ValueError(
                'numpy and cupy must not be used together\n'
                'type(W): {0}, type(e1): {1}, type(e2): {2}'.format(
                    type(W), type(e1), type(e2)))

        gy = grad_outputs[0]

        xp = cuda.get_array_module(*inputs)
        if xp is numpy:
            gW = numpy.einsum('ij,ik,il->jkl', e1, e2, gy)
            ge1 = numpy.einsum('ik,jkl,il->ij', e2, W, gy)
            ge2 = numpy.einsum('ij,jkl,il->ik', e1, W, gy)
        else:
            kern = cuda.reduce('T in0, T in1, T in2', 'T out',
                               'in0 * in1 * in2', 'a + b', 'out = a', 0,
                               'bilinear_product')

            e1_b = e1[:, :, None, None]  # ij
            e2_b = e2[:, None, :, None]  # ik
            gy_b = gy[:, None, None, :]  # il
            W_b = W[None, :, :, :]  # jkl

            gW = kern(e1_b, e2_b, gy_b, axis=0)  # 'ij,ik,il->jkl'
            ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))  # 'ik,jkl,il->ij'
            ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))  # 'ij,jkl,il->ik'

        ret = ge1.reshape(inputs[0].shape), ge2.reshape(inputs[1].shape), gW
        if len(inputs) == 6:
            V1, V2, b = inputs[3:]
            gV1 = e1.T.dot(gy)
            gV2 = e2.T.dot(gy)
            gb = gy.sum(0)
            ge1 += gy.dot(V1.T)
            ge2 += gy.dot(V2.T)
            ret += gV1, gV2, gb
        return ret
Example #40
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        self.y, = softmax.Softmax(self.use_cudnn).forward((x,))
        if getattr(self, 'normalize', True):
            count = float((t != self.ignore_label).sum())
        else:
            count = t.shape[0]
        self.count = count

        if count == 0:
            return cupy.zeros((), dtype=x.dtype),

        y = cupy.rollaxis(self.y, 1, self.y.ndim)
        ret = cuda.reduce(
            'S t, raw T y, int32 n_channel, T inv_count', 'T out',
            't == -1 ? 0 : log(y[_j * n_channel + t])',
            'a + b', 'out = a * inv_count', '0', 'crossent_fwd'
        )(t, y.reduced_view(), y.shape[-1], -1.0 / count)
        return ret,
Example #41
    def backward(self, inputs, grad_outputs):
        e1 = array.as_mat(inputs[0])
        e2 = array.as_mat(inputs[1])
        W = inputs[2]

        if not type_check.same_types(*inputs):
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(e1): {1}, type(e2): {2}'
                             .format(type(W), type(e1), type(e2)))

        gy = grad_outputs[0]

        xp = cuda.get_array_module(*inputs)
        if xp is numpy:
            gW = numpy.einsum('ij,ik,il->jkl', e1, e2, gy)
            ge1 = numpy.einsum('ik,jkl,il->ij', e2, W, gy)
            ge2 = numpy.einsum('ij,jkl,il->ik', e1, W, gy)
        else:
            kern = cuda.reduce('T in0, T in1, T in2', 'T out',
                               'in0 * in1 * in2', 'a + b', 'out = a', 0,
                               'bilinear_product')

            e1_b = e1[:, :, None, None]  # ij
            e2_b = e2[:, None, :, None]  # ik
            gy_b = gy[:, None, None, :]  # il
            W_b = W[None, :, :, :]  # jkl

            gW = kern(e1_b, e2_b, gy_b, axis=0)  # 'ij,ik,il->jkl'
            ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))  # 'ik,jkl,il->ij'
            ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))  # 'ij,jkl,il->ik'

        ret = ge1.reshape(inputs[0].shape), ge2.reshape(inputs[1].shape), gW
        if len(inputs) == 6:
            V1, V2, b = inputs[3:]
            gV1 = e1.T.dot(gy)
            gV2 = e2.T.dot(gy)
            gb = gy.sum(0)
            ge1 += gy.dot(V1.T)
            ge2 += gy.dot(V2.T)
            ret += gV1, gV2, gb
        return ret
Example #42
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = cupy.log(x)
        if self.cache_score:
            self.y = x
        if getattr(self, 'normalize', True):
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
            't == -1 ? 0 : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
        return ret,
Example #43
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        log_y = cupy.log(x + 1e-5)
        self.y = x

        if self.debug:
            ipdb.set_trace()

        if getattr(self, 'normalize', True):
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, raw T weights', 'T out',
            't == -1 ? 0 : log_y[_j * n_channel + t] * weights[t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff, self.weights.reduced_view())
        return ret,
Example #44
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = cupy.log(x)
        if self.cache_score:
            self.y = x
        if getattr(self, 'normalize', True):
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce('S t, raw T log_y, int32 n_channel, raw T coeff',
                          'T out', 't == -1 ? 0 : log_y[_j * n_channel + t]',
                          'a + b', 'out = a * -coeff[0]', '0',
                          'crossent_fwd')(t, log_y.reduced_view(),
                                          log_y.shape[-1], self._coeff)
        return ret,
Example #45
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        log_y = cupy.log(x + 1e-5)
        self.y = x

        if (self.debug):
            ipdb.set_trace()

        if getattr(self, 'normalize', True):
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, raw T weights',
            'T out', 't == -1 ? 0 : log_y[_j * n_channel + t] * weights[t]',
            'a + b', 'out = a * -coeff[0]', '0',
            'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                            self._coeff, self.weights.reduced_view())
        return ret,
Example #46
    def forward(self, inputs):
        cs, ls, alpha_hat, beta_hat, kappa_hat, kappa_prev = inputs

        # cs :  one-hot-encoding vectors whose shape is (W, U)
        #       U: maximal length of character sequences in a batch
        #       W: number of characters used in a data
        #
        # ls : a vector containing lengths of character sequences in a batch
        #
        # alpha, beta, kappa: length K vectors. shape = (batchsize, K)
        #

        batchsize, W, U = cs.shape
        K = alpha_hat.shape[1]

        if isinstance(cs, numpy.ndarray):
            self.alpha = numpy.exp(alpha_hat).reshape((batchsize, K, 1))
            self.beta = numpy.exp(beta_hat).reshape((batchsize, K, 1))
            self.kappa = (kappa_prev + numpy.exp(kappa_hat)).reshape(
                (batchsize, K, 1))
            us = numpy.arange(U).astype(numpy.float32).reshape((1, 1, U))
            self.phai_mat = self.alpha * numpy.exp(
                -self.beta * (self.kappa - us)**2)  # --> (batchsize, K, U)
            ws = numpy.matmul(
                cs,
                self.phai_mat.sum(axis=1).reshape(batchsize, U, 1)
            )  # (batchsize, W, U) x (batchsize, U, 1)--> (batchsize, W, 1)
            if ls.sum() > 0:  #ls is not None:
                max_phai_idx = numpy.sum(self.phai_mat, axis=1).argmax(
                    axis=1
                )  # (batchsize, K, U) --> (batchsize, U) --> (batchsize, 1)
                eow = numpy.where(max_phai_idx > ls, max_phai_idx,
                                  -1)  # (batchsize, 1)
            else:
                eow = numpy.zeros((batchsize, U))  #None
        else:
            self.alpha, self.beta, self.kappa = cuda.elementwise(
                'T a_hat, T b_hat, T ka_hat, T ka_prev', 'T a, T b, T ka', '''
                a = exp(a_hat);
                b = exp(b_hat);
                ka = ka_prev + exp(ka_hat);
            ''', 'softwindow_fwd1')(alpha_hat, beta_hat, kappa_hat, kappa_prev)

            us = cuda.cupy.arange(U).astype(cuda.cupy.float32).reshape(
                (1, 1, U))

            self.phai_mat = cuda.elementwise(
                'T a, T b, T k, T u', 'T ph', '''
                ph = a * exp(- b *(k - u)*(k - u));
            ''', 'softwindow_fwd2'
            )(
                self.alpha.reshape(batchsize, K, 1),
                self.beta.reshape(batchsize, K, 1),
                self.kappa.reshape(batchsize, K, 1),
                us  #cuda.cupy.arange(U).astype(cuda.cupy.float32).reshape((1, 1, U))
            )

            #phais = self.phai_mat.sum(axis=1).reshape(batchsize, U, 1)
            phais = cuda.reduce(
                'T x',
                'T y',
                'x',
                'a+b',
                'y=a',
                '0',
                'softwindow_fwd3',
            )(self.phai_mat, axis=1)
            if ls.sum() > 0:  # ls is not None:
                max_phai_idx = cuda.cupy.argmax(phais, axis=1, keepdims=True)

            phais = phais.reshape(batchsize, U, 1)
            ws = cuda.cupy.empty((batchsize, W, 1)).astype(cuda.cupy.float32)
            _batch_matmul_gpu(cs, phais, out=ws)
            if ls.sum() > 0:  # ls is not None:
                eow = cuda.cupy.where(max_phai_idx > ls, max_phai_idx, -1)
            else:
                eow = cuda.cupy.zeros((batchsize, U))  #None

        return ws.reshape(batchsize, W), self.kappa.reshape(
            (batchsize, K)), eow
    def forward(self, inputs):
        xp = cuda.get_array_module(*inputs)
        xnext, eow, e_hat, pi_hat, mux_hat, muy_hat, sgmx_hat, sgmy_hat, rho_hat = inputs
        batchsize, M = pi_hat.shape
        x1 = xnext[:, 0].reshape((batchsize, 1))
        x2 = xnext[:, 1].reshape((batchsize, 1))
        x3 = xnext[:, 2].reshape((batchsize, 1))
        if isinstance(mux_hat, numpy.ndarray):
            self.x = xnext
            self.eos = 1. / (1. + numpy.exp(e_hat))  #_sigmoid(e_hat)
            self.pi_ = numpy.exp(pi_hat) / numpy.exp(pi_hat).sum(
                axis=1).reshape((batchsize, 1))
            self.mux = mux_hat
            self.muy = muy_hat
            self.sgmx = numpy.exp(sgmx_hat)
            self.sgmy = numpy.exp(sgmy_hat)
            self.rho_ = numpy.tanh(rho_hat)

            if x3.sum() >= 0.0:  #xnext is not None: # training & validation
                #x1 = xnext[:,0].reshape((batchsize, 1))
                #x2 = xnext[:,1].reshape((batchsize, 1))
                #x3 = xnext[:,2].reshape((batchsize, 1))

                dx1 = (x1 - self.mux) / self.sgmx
                dx2 = (x2 - self.muy) / self.sgmy
                self.Zs = dx1 * dx1 + dx2 * dx2 - 2. * self.rho_ * dx1 * dx2
                Ns = numpy.exp(-0.5 * self.Zs / (1. - self.rho_**2)) / (
                    2. * 3.1415927 * self.sgmx * self.sgmy *
                    numpy.sqrt(1. - self.rho_**2) + 1e-10)
                gamma_hats = self.pi_ * Ns
                sum_gamma_hats = gamma_hats.sum(axis=1).reshape(
                    (batchsize, 1)) + 1e-10
                self.gammas = gamma_hats / sum_gamma_hats
                loss_t = -numpy.log(sum_gamma_hats) - x3 * numpy.log(
                    self.eos) - (1. - x3) * numpy.log(1. - self.eos)
                idx = numpy.where(x3 == 2)[0]
                self.update_or_not = numpy.ones_like(x3)
                self.update_or_not[idx, 0] = 0.0
                loss_t = loss_t * self.update_or_not
                self.xnext = xnext

                # Prediction in training
                xnext_h = numpy.copy(xnext)
                with chainer.no_backprop_mode():
                    myux_min_h = mux_hat.min(axis=1).reshape((batchsize, 1))
                    myux_max_h = mux_hat.max(axis=1).reshape((batchsize, 1))
                    myuy_min_h = muy_hat.min(axis=1).reshape((batchsize, 1))
                    myuy_max_h = muy_hat.max(axis=1).reshape((batchsize, 1))
                    protect_mask = numpy.ones((batchsize, 1))
                    while protect_mask.sum() > 0:
                        z1_h = numpy.random.uniform(size=batchsize).reshape(
                            (batchsize, 1))
                        z2_ = numpy.random.uniform(size=batchsize).reshape(
                            (batchsize, 1))
                        x1_h = myux_min_h + (myux_max_h - myux_min_h) * z1_h
                        x2_h = myuy_min_h + (myuy_max_h - myuy_min_h) * z2_

                        dx1_h = (x1_h - self.mux) / self.sgmx
                        dx2_h = (x2_h - self.muy) / self.sgmy
                        self.Zs_h = dx1_h * dx1_h + dx2_h * dx2_h - 2. * self.rho_ * dx1_h * dx2_h
                        Ns = numpy.exp(
                            -0.5 * self.Zs_h / (1. - self.rho_**2)) / (
                                2. * 3.1415927 * self.sgmx * self.sgmy *
                                numpy.sqrt(1. - self.rho_**2) + 1e-10)
                        gamma_hats_h = self.pi_ * Ns
                        sum_gamma_hats = gamma_hats_h.sum(axis=1)  # Pr(x|ys)

                        us_h = numpy.random.uniform(size=batchsize)
                        idx = numpy.where(sum_gamma_hats > us_h)[0]
                        xnext_h[idx, 0] += (x1_h * protect_mask)[idx, 0]
                        xnext_h[idx, 1] += (x2_h * protect_mask)[idx, 0]
                        protect_mask[idx, 0] = 0.0

                    #xnext[:, 2] = self.eos[:, 0]
                    #xnext[:, 2] = numpy.where(eow < 0, xnext[:, 2], 2.)
                    #xnext_h[:, 2] = self.eos[:, 0]
                    #mask = eow < 0
                    #if not mask.all():
                    #    xnext_h[:, 2] = 2.0
                    #xnext[:, 2:] = xp.where(eow < 0, self.eos[:, 0:1], 2.)
                    xnext_h[:, 2] = xp.where(self.eos[:, 0] > 0.10, 1.0, 0.0)

                self.xnext = xnext_h

            else:  # prediction
                xnext = numpy.zeros((batchsize, 3))
                myux_min = mux_hat.min(axis=1).reshape((batchsize, 1))
                myux_max = mux_hat.max(axis=1).reshape((batchsize, 1))
                myuy_min = muy_hat.min(axis=1).reshape((batchsize, 1))
                myuy_max = muy_hat.max(axis=1).reshape((batchsize, 1))
                protect_mask = numpy.ones((batchsize, 1))
                while protect_mask.sum() > 0:
                    z1 = numpy.random.uniform(size=batchsize).reshape(
                        (batchsize, 1))
                    z2 = numpy.random.uniform(size=batchsize).reshape(
                        (batchsize, 1))
                    x1 = myux_min + (myux_max - myux_min) * z1
                    x2 = myuy_min + (myuy_max - myuy_min) * z2

                    dx1 = (x1 - self.mux) / self.sgmx
                    dx2 = (x2 - self.muy) / self.sgmy
                    self.Zs = dx1 * dx1 + dx2 * dx2 - 2. * self.rho_ * dx1 * dx2
                    Ns = numpy.exp(-0.5 * self.Zs / (1. - self.rho_**2)) / (
                        2. * 3.1415927 * self.sgmx * self.sgmy *
                        numpy.sqrt(1. - self.rho_**2) + 1e-10)
                    gamma_hats = self.pi_ * Ns
                    sum_gamma_hats = gamma_hats.sum(axis=1)  # Pr(x|ys)

                    us = numpy.random.uniform(size=batchsize)
                    idx = numpy.where(sum_gamma_hats > us)[0]
                    xnext[idx, 0] += (x1 * protect_mask)[idx, 0]
                    xnext[idx, 1] += (x2 * protect_mask)[idx, 0]
                    protect_mask[idx, 0] = 0.0

                #xnext[:, 2] = self.eos[:, 0]
                #xnext[:, 2] = numpy.where(eow < 0, xnext[:, 2], 2.)
                xnext[:, 2] = self.eos[:, 0]
                mask = eow < 0
                if not mask.all():
                    xnext[:, 2] = 2.0
                #xnext[:, 2:] = xp.where(eow < 0, self.eos[:, 0:1], 2.)

                self.xnext = xnext
                #loss_t = None
                loss_t = xp.zeros((batchsize, 1)).astype(xp.float32)
                self.Zs = None

        else:
            self.mux = mux_hat
            self.muy = muy_hat
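            # Shift the mixture logits by their per-row maximum so that exp() inside the
            # softmax reduction below cannot overflow (log-sum-exp trick).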
            self.pi_hat = pi_hat - pi_hat.max(axis=1).reshape(batchsize, 1)
            sum_exp_pi = cuda.reduce(
                'T x',  # input params
                'T y',  # output params
                'exp(x)',  # map
                'a+b',  # reduce
                'y=a',  # post-reduction map
                '1e-10',  # identity value
                'mdout_sumexp'  # kernel name
            )(self.pi_hat, axis=1)

            # eos = 1 / (1 + exp(e_hat)) = sigmoid(-e_hat): Bernoulli parameter for the
            # binary pen / end-of-stroke bit that x3 supervises in the loss below.
            self.eos = 1. / (1. + cuda.cupy.exp(e_hat))

            if x3.sum() >= 0.0:  #xnext is not None:  # training & validation
                gamma_hats, self.Zs, self.pi_, self.sgmx, self.sgmy, self.rho_ = cuda.elementwise(
                    'T x1, T x2, T pi_hat, T mux_, T muy_, T sgmx_hat, T sgmy_hat, T rho_hat, T sum_exp_pi',  # input
                    'T gammas, T Zs, T pi_, T sgmx_, T sgmy_, T rho_',  # output
                    '''
                    pi_ = exp(pi_hat)/sum_exp_pi;
                    sgmx_ = exp(sgmx_hat) + 1e-10;
                    sgmy_ = exp(sgmy_hat) + 1e-10;
                    rho_ = tanh(rho_hat);
                    T rho2 = 1. - rho_*rho_ + 1e-10;
                    T dx1 = (x1 - mux_)/sgmx_;
                    T dx2 = (x2 - muy_)/sgmy_;
                    Zs = dx1*dx1 + dx2*dx2- 2.*rho_*dx1*dx2;
                    T Ns = exp( -0.5*Zs /rho2)/(2. * 3.1415927 * sgmx_ * sgmy_ * sqrt(rho2));
                    gammas = pi_ * Ns;
                ''',
                    'mdout_fwd1',
                )(x1, x2, self.pi_hat, mux_hat, muy_hat, sgmx_hat, sgmy_hat,
                  rho_hat, sum_exp_pi.reshape((batchsize, 1)))

                sum_gamma_hats = gamma_hats.sum(axis=1).reshape(
                    (batchsize, 1)) + 1e-10
                self.gammas = gamma_hats / sum_gamma_hats
                loss_t = cuda.elementwise(
                    'T sum_, T x3, T eos',
                    'T loss',
                    '''
                    loss = -log(sum_) - x3 * log(eos) - (1. - x3) * log(1.-eos);
                ''',
                    'mdout_fwd2',
                )(sum_gamma_hats, x3, self.eos)
                self.update_or_not = xp.where(x3 == 2., 0.0,
                                              1.0).astype(xp.float32)
                loss_t = loss_t * self.update_or_not
                self.xnext = xnext

                # Prediction in training
                with chainer.no_backprop_mode():
                    self.sgmx_h = xp.where(self.sgmx < 0.0015, 0.0015,
                                           self.sgmx)
                    self.sgmy_h = xp.where(self.sgmy < 0.0015, 0.0015,
                                           self.sgmy)

                    muxs = xp.empty((batchsize, M, M)).astype(xp.float32)
                    muys = xp.empty((batchsize, M, M)).astype(xp.float32)
                    _batch_matmul_gpu(mux_hat.reshape((batchsize, M, 1)),
                                      xp.ones((batchsize, 1,
                                               M)).astype(xp.float32),
                                      out=muxs)
                    _batch_matmul_gpu(muy_hat.reshape((batchsize, M, 1)),
                                      xp.ones((batchsize, 1,
                                               M)).astype(xp.float32),
                                      out=muys)

                    gamma_hats_at_components = cuda.elementwise(
                        'T x1, T x2, T pi_, T mux_, T muy_, T sgmx_, T sgmy_, T rho_',  # input
                        'T gammas',  # output
                        '''
                        T rho2 = 1. - rho_*rho_ + 1e-10;
                        T dx1 = (x1 - mux_)/sgmx_;
                        T dx2 = (x2 - muy_)/sgmy_;
                        T Zs = dx1*dx1 + dx2*dx2- 2.*rho_*dx1*dx2;
                        T Ns = exp( -0.5*Zs /rho2)/(2. * 3.1415927 * sgmx_ * sgmy_ * sqrt(rho2));
                        gammas = pi_ * Ns;
                    ''',
                        'mdout_fwd5',
                    )(muxs, muys, self.pi_.reshape((batchsize, 1, M)),
                      mux_hat.reshape((batchsize, 1, M)),
                      muy_hat.reshape((batchsize, 1, M)),
                      self.sgmx_h.reshape((batchsize, 1, M)),
                      self.sgmy_h.reshape((batchsize, 1, M)),
                      self.rho_.reshape((batchsize, 1, M)))

                    sum_gamma_hats_at_components = gamma_hats_at_components.sum(
                        axis=2)  # (batchsize, M)
                    p_maxs = sum_gamma_hats_at_components.max(axis=1).reshape(
                        (batchsize, 1))  # (batchsize, 1)

                    myux_min_h = mux_hat.min(axis=1).reshape(
                        (batchsize, 1, 1)) - 0.01
                    myux_max_h = mux_hat.max(axis=1).reshape(
                        (batchsize, 1, 1)) + 0.01
                    myuy_min_h = muy_hat.min(axis=1).reshape(
                        (batchsize, 1, 1)) - 0.01
                    myuy_max_h = muy_hat.max(axis=1).reshape(
                        (batchsize, 1, 1)) + 0.01

                    xnext_h = xp.zeros((batchsize, 3)).astype(xp.float32)
                    protect_mask = xp.ones((batchsize, 1)).astype(xp.float32)
                    n_samples = 32768 * 2  #16384 #8192 #4096 #2048 #1024 #512
                    x1_h = xp.copy(x1)
                    x2_h = xp.copy(x2)
                    while protect_mask.sum() > 0:
                        # draw n_samples candidate points in parallel per step
                        z1_h = xp.random.uniform(size=batchsize *
                                                 n_samples).reshape(
                                                     (batchsize, n_samples, 1))
                        z2_h = xp.random.uniform(size=batchsize *
                                                 n_samples).reshape(
                                                     (batchsize, n_samples, 1))
                        x1__h = (myux_min_h +
                                 (myux_max_h - myux_min_h) * z1_h).astype(
                                     xp.float32)  # (batchsize, n_samples, 1)
                        x2__h = (myuy_min_h +
                                 (myuy_max_h - myuy_min_h) * z2_h).astype(
                                     xp.float32)  # (batchsize, n_samples, 1)
                        gamma_hats_h = cuda.elementwise(
                            'T x1, T x2, T pi_, T mux_, T muy_, T sgmx_, T sgmy_, T rho_',  # input
                            'T gammas',  # output
                            '''
                            T rho2 = 1. - rho_*rho_ + 1e-10;
                            T dx1 = (x1 - mux_)/sgmx_;
                            T dx2 = (x2 - muy_)/sgmy_;
                            T Zs = dx1*dx1 + dx2*dx2- 2.*rho_*dx1*dx2;
                            T Ns = exp( -0.5*Zs /rho2)/(2. * 3.1415927 * sgmx_ * sgmy_ * sqrt(rho2));
                            gammas = pi_ * Ns;
                        ''',
                            'mdout_fwd4',
                        )(x1__h, x2__h, self.pi_.reshape((batchsize, 1, M)),
                          mux_hat.reshape((batchsize, 1, M)),
                          muy_hat.reshape((batchsize, 1, M)),
                          self.sgmx_h.reshape((batchsize, 1, M)),
                          self.sgmy_h.reshape((batchsize, 1, M)),
                          self.rho_.reshape((batchsize, 1, M)))
                        sum_gamma_hats_h = gamma_hats_h.sum(axis=2)

                        us_h = xp.random.uniform(
                            size=batchsize * n_samples).reshape(
                                (batchsize, n_samples)) * p_maxs
                        update_mask__h = xp.where(
                            sum_gamma_hats_h > us_h, 1.0,
                            0.0).astype(xp.float32).reshape(
                                (batchsize, n_samples))
                        update_mask_h = update_mask__h.max(axis=1).reshape(
                            (batchsize, 1))
                        sample_idx_h = update_mask__h.argmax(axis=1).reshape(
                            (batchsize, 1))
                        for bb in xrange(batchsize):
                            this_midx = sample_idx_h[bb, 0]
                            x1_h[bb:bb + 1,
                                 0] = x1__h[bb:bb + 1, this_midx:this_midx + 1,
                                            0]
                            x2_h[bb:bb + 1,
                                 0] = x2__h[bb:bb + 1, this_midx:this_midx + 1,
                                            0]
                        xnext_h[:,
                                0] += (x1_h * protect_mask * update_mask_h)[:,
                                                                            0]
                        xnext_h[:,
                                1] += (x2_h * protect_mask * update_mask_h)[:,
                                                                            0]
                        protect_mask -= protect_mask * update_mask_h

                    xnext_h[:, 2:] = xp.where(self.eos[:, 0:1] > 0.10, 1.0,
                                              0.0)
                    #xnext_h[:, 2:] = xp.where(eow < 0, self.eos[:, 0:1], 2.)

                    self.xnext = xnext_h
                    #loss_t = xp.zeros((batchsize, 1)).astype(xp.float32)
                    #self.Zs = None

            else:  # prediction (sampling from probability distribution)
                # pi, sgmx, sgmy, rho  <-- pi_hat, sgmx_hat, sgmy_hat, rho_hat
                self.pi_, self.sgmx, self.sgmy, self.rho_ = cuda.elementwise(
                    'T pi_hat, T sgmx_hat, T sgmy_hat, T rho_hat, T sum_exp_pi',  # input
                    'T pi_, T sgmx_, T sgmy_, T rho_',  # output
                    '''
                    pi_ = exp(pi_hat)/sum_exp_pi;
                    sgmx_ = exp(sgmx_hat) + 1e-10;
                    sgmy_ = exp(sgmy_hat) + 1e-10;
                    rho_ = tanh(rho_hat);
                ''',
                    'mdout_fwd3',
                )(self.pi_hat, sgmx_hat, sgmy_hat, rho_hat,
                  sum_exp_pi.reshape((batchsize, 1)))

                # Because the variances of the Gaussians can be very small, sampling would be
                # virtually impossible, so we set a lower bound on the variances.
                self.sgmx = xp.where(self.sgmx < 0.0015, 0.0015, self.sgmx)
                self.sgmy = xp.where(self.sgmy < 0.0015, 0.0015, self.sgmy)
                #print(self.sgmx.min(), self.sgmy.min())

                # Get the (approximate) maximum density of the M-component Gaussian mixture.
                # We assume the maximum is attained at the center of one of the mixture components:
                # first evaluate the density at every component center, then take the maximum of
                # these values as the upper bound of the mixture density (used for rejection sampling).

                # For each batch element, prepare M x M matrices of x1 and x2 values like
                # [ [mux0, mux0, ...., mux0],
                #   [mux1, mux1, ...., mux1],
                #   ...
                #   [mux(M-1), mux(M-1), ...., mux(M-1)]]

                muxs = xp.empty((batchsize, M, M)).astype(xp.float32)
                muys = xp.empty((batchsize, M, M)).astype(xp.float32)
                _batch_matmul_gpu(mux_hat.reshape((batchsize, M, 1)),
                                  xp.ones(
                                      (batchsize, 1, M)).astype(xp.float32),
                                  out=muxs)
                _batch_matmul_gpu(muy_hat.reshape((batchsize, M, 1)),
                                  xp.ones(
                                      (batchsize, 1, M)).astype(xp.float32),
                                  out=muys)

                # N_i((mux[j], muy[j])) for i = 0, 1, ..., M-1 and j = 0, 1, ..., M-1
                gamma_hats_at_components = cuda.elementwise(
                    'T x1, T x2, T pi_, T mux_, T muy_, T sgmx_, T sgmy_, T rho_',  # input
                    'T gammas',  # output
                    '''
                    T rho2 = 1. - rho_*rho_ + 1e-10;
                    T dx1 = (x1 - mux_)/sgmx_;
                    T dx2 = (x2 - muy_)/sgmy_;
                    T Zs = dx1*dx1 + dx2*dx2- 2.*rho_*dx1*dx2;
                    T Ns = exp( -0.5*Zs /rho2)/(2. * 3.1415927 * sgmx_ * sgmy_ * sqrt(rho2));
                    gammas = pi_ * Ns;
                ''',
                    'mdout_fwd5',
                )(muxs, muys, self.pi_.reshape((batchsize, 1, M)),
                  mux_hat.reshape((batchsize, 1, M)),
                  muy_hat.reshape((batchsize, 1, M)),
                  self.sgmx.reshape((batchsize, 1, M)),
                  self.sgmy.reshape((batchsize, 1, M)),
                  self.rho_.reshape((batchsize, 1, M)))

                # p[j] = sum_i N_i((mux[j], muy[j])), i = 0, 1, ..., M-1
                sum_gamma_hats_at_components = gamma_hats_at_components.sum(
                    axis=2)  # (batchsize, M)
                # max(p[0], p[1], ..., p[M-1]) for each batch element
                p_maxs = sum_gamma_hats_at_components.max(axis=1).reshape(
                    (batchsize, 1))  # (batchsize, 1)
                #print(p_maxs.reshape((1, batchsize)))

                myux_min = mux_hat.min(axis=1).reshape(
                    (batchsize, 1, 1)) - 0.01
                myux_max = mux_hat.max(axis=1).reshape(
                    (batchsize, 1, 1)) + 0.01
                myuy_min = muy_hat.min(axis=1).reshape(
                    (batchsize, 1, 1)) - 0.01
                myuy_max = muy_hat.max(axis=1).reshape(
                    (batchsize, 1, 1)) + 0.01

                xnext = xp.zeros((batchsize, 3)).astype(xp.float32)
                protect_mask = xp.ones((batchsize, 1)).astype(xp.float32)
                n_samples = 32768 * 2  #16384 #8192 #4096 #2048 #1024 #512
                while protect_mask.sum() > 0:
                    # draw n_samples candidate points in parallel per step
                    z1 = xp.random.uniform(size=batchsize * n_samples).reshape(
                        (batchsize, n_samples, 1))
                    z2 = xp.random.uniform(size=batchsize * n_samples).reshape(
                        (batchsize, n_samples, 1))
                    x1_ = (myux_min + (myux_max - myux_min) * z1).astype(
                        xp.float32)  # (batchsize, n_samples, 1)
                    x2_ = (myuy_min + (myuy_max - myuy_min) * z2).astype(
                        xp.float32)  # (batchsize, n_samples, 1)
                    gamma_hats = cuda.elementwise(
                        'T x1, T x2, T pi_, T mux_, T muy_, T sgmx_, T sgmy_, T rho_',  # input
                        'T gammas',  # output
                        '''
                        T rho2 = 1. - rho_*rho_ + 1e-10;
                        T dx1 = (x1 - mux_)/sgmx_;
                        T dx2 = (x2 - muy_)/sgmy_;
                        T Zs = dx1*dx1 + dx2*dx2- 2.*rho_*dx1*dx2;
                        T Ns = exp( -0.5*Zs /rho2)/(2. * 3.1415927 * sgmx_ * sgmy_ * sqrt(rho2));
                        gammas = pi_ * Ns;
                    ''',
                        'mdout_fwd4',
                    )(x1_, x2_, self.pi_.reshape((batchsize, 1, M)),
                      mux_hat.reshape((batchsize, 1, M)),
                      muy_hat.reshape((batchsize, 1, M)),
                      self.sgmx.reshape((batchsize, 1, M)),
                      self.sgmy.reshape((batchsize, 1, M)),
                      self.rho_.reshape((batchsize, 1, M)))
                    sum_gamma_hats_ = gamma_hats.sum(axis=2)
                    us_ = xp.random.uniform(
                        size=batchsize * n_samples).reshape(
                            (batchsize, n_samples)) * p_maxs
                    update_mask_ = xp.where(sum_gamma_hats_ > us_, 1.0,
                                            0.0).astype(xp.float32).reshape(
                                                (batchsize, n_samples))
                    update_mask = update_mask_.max(axis=1).reshape(
                        (batchsize, 1))
                    sample_idx = update_mask_.argmax(axis=1).reshape(
                        (batchsize, 1))
                    for bb in xrange(batchsize):
                        this_midx = sample_idx[bb, 0]
                        x1[bb:bb + 1, 0] = x1_[bb:bb + 1,
                                               this_midx:this_midx + 1, 0]
                        x2[bb:bb + 1, 0] = x2_[bb:bb + 1,
                                               this_midx:this_midx + 1, 0]
                    xnext[:, 0] += (x1 * protect_mask * update_mask)[:, 0]
                    xnext[:, 1] += (x2 * protect_mask * update_mask)[:, 0]
                    protect_mask -= protect_mask * update_mask

                xnext[:, 2:] = self.eos[:, 0:1]
                xnext[:, 2:] = xp.where(eow < 0, self.eos[:, 0:1], 2.)
                self.xnext = xnext
                loss_t = xp.zeros((batchsize, 1)).astype(xp.float32)
                self.Zs = None

        return loss_t, self.xnext, self.eos, self.pi_, self.mux, self.muy, self.sgmx, self.sgmy, self.rho_,
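
The GPU sampling branches above draw the next pen offset by rejection sampling: the mixture density is bounded by its (approximate) maximum evaluated at the component centers, candidates are drawn uniformly from the box spanned by the component means, and a candidate is accepted when the density at that point exceeds a uniform draw scaled by the bound. A minimal single-example NumPy sketch of that scheme follows (an illustration under the assumption that pi, mux, muy, sgmx, sgmy and rho are 1-D arrays of length M, as in the kernels above; it is not part of the original class).

import numpy

def gmm_density(x1, x2, pi, mux, muy, sgmx, sgmy, rho):
    # density of an M-component bivariate Gaussian mixture at the point (x1, x2)
    dx1 = (x1 - mux) / sgmx
    dx2 = (x2 - muy) / sgmy
    z = dx1 * dx1 + dx2 * dx2 - 2. * rho * dx1 * dx2
    rho2 = 1. - rho * rho + 1e-10
    ns = numpy.exp(-0.5 * z / rho2) / (2. * numpy.pi * sgmx * sgmy * numpy.sqrt(rho2))
    return float((pi * ns).sum())

def sample_gmm(pi, mux, muy, sgmx, sgmy, rho, rng=numpy.random):
    # upper bound of the density, approximated by its values at the component centers
    p_max = max(gmm_density(mux[k], muy[k], pi, mux, muy, sgmx, sgmy, rho)
                for k in range(len(pi)))
    x_lo, x_hi = mux.min() - 0.01, mux.max() + 0.01
    y_lo, y_hi = muy.min() - 0.01, muy.max() + 0.01
    while True:  # rejection sampling inside the box spanned by the means
        x1 = x_lo + (x_hi - x_lo) * rng.uniform()
        x2 = y_lo + (y_hi - y_lo) * rng.uniform()
        if gmm_density(x1, x2, pi, mux, muy, sgmx, sgmy, rho) > p_max * rng.uniform():
            return x1, x2

The elementwise kernels above do the same thing batched: they evaluate n_samples candidates per batch element in a single call and keep the first accepted candidate per element.
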
Example #48
0
def _l2normalize(v, eps=1e-12):
    norm = cuda.reduce('T x', 'T out', 'x * x', 'a + b', 'out = sqrt(a)', 0,
                       'norm_sn')
    div = cuda.elementwise('T x, T norm, T eps', 'T out',
                           'out = x / (norm + eps)', 'div_sn')
    return div(v, norm(v), eps)
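
For reference, the two kernels above jointly compute v / (||v||_2 + eps) over the whole (flattened) array. A plain NumPy sketch of the same operation, assuming nothing beyond what the reductions express:

import numpy

def l2normalize_ref(v, eps=1e-12):
    # divide by the Euclidean norm of the flattened array, as norm_sn / div_sn do above
    return v / (numpy.sqrt((v * v).sum()) + eps)
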
Example #49
0
    def backward(self, inputs, grad_outputs):
        xp = cuda.get_array_module(*inputs)
        cs, ls, alpha_hat, beta_hat, kappa_hat, kappa_prev = inputs

        batchsize, W, U = cs.shape
        K = alpha_hat.shape[1]
        gw, gk = grad_outputs[0:2]  # (batchsize, W)

        ga_hat = xp.empty_like(alpha_hat)
        gb_hat = xp.empty_like(beta_hat)
        gk_hat = xp.empty_like(kappa_hat)
        gk_prev = xp.empty_like(kappa_prev)
        gc = xp.empty_like(cs)

        # Handle the case where either gradient may not be given
        if gw is None:
            gw = 0
        if gk is None:
            gk = 0

        if xp is numpy:
            gwc = numpy.matmul(gw.reshape(batchsize, 1, W),
                               cs)  # (batchsize, 1, U)
            emat = self.phai_mat * gwc  # (batchsize, K, U)
            ga_hat[:] = emat.sum(axis=2)
            us = numpy.arange(U).astype(numpy.float32).reshape((1, 1, U))
            diff = us - self.kappa
            b = self.beta.reshape((batchsize, K))
            gb_hat[:] = -b * (emat * diff**2).sum(axis=2)
            gk_prev[:] = gk + 2. * b * (emat * diff).sum(axis=2)
            gk_hat[:] = numpy.exp(kappa_hat) * gk_prev
        else:
            gwc = cuda.cupy.empty((batchsize, 1, U)).astype(cuda.cupy.float32)
            #for i in xrange(batchsize):
            #    gwc[i] = (gw.reshape(batchsize, 1, W))[i].dot(cs[i]) # (1, W).(W, U) --> (1, U)
            _batch_matmul_gpu(gw.reshape(batchsize, 1, W), cs, out=gwc)
            #emat      = self.phai_mat * gwc

            emat = cuda.elementwise(
                'T phai, T gwc',
                'T emat',
                '''
                emat = phai * gwc;
            ''',
                'softwindow_bw1',
            )(self.phai_mat, gwc)

            #ga_hat[:] = emat.sum(axis=2)

            ga_hat[:] = cuda.reduce(
                'T x',
                'T y',
                'x',
                'a+b',
                'y=a',
                '0',
                'softwindow_bw2',
            )(emat, axis=2)

            us = cuda.cupy.arange(U).astype(cuda.cupy.float32).reshape(
                (1, 1, U))
            diff = us - self.kappa.reshape(batchsize, K, 1)
            b = self.beta.reshape(batchsize, K)
            tmp2, tmp1 = cuda.elementwise(
                'T emat, T diff', 'T ed2, T ed1', '''
                ed1 = emat * diff;
                ed2 = ed1  * diff;
            ''', 'softwindow_bw3')(emat, diff)
            sum1 = cuda.reduce(
                'T x',
                'T y',
                'x',
                'a+b',
                'y=a',
                '0',
                'softwindow_bw4',
            )(tmp1, axis=2)
            sum2 = cuda.reduce(
                'T x',
                'T y',
                'x',
                'a+b',
                'y=a',
                '0',
                'softwindow_bw5',
            )(tmp2, axis=2)
            gb_hat[:] = -b * sum2
            gk_prev[:] = gk + 2. * b * sum1
            #gb_hat[:] = - b * (emat * diff**2).sum(axis=2)
            #gk_prev[:]= gk + 2. * b * (emat * diff).sum(axis=2)
            #gk_hat[:] = cuda.cupy.exp(kappa_hat)*gk_prev
            gk_hat = cuda.elementwise(
                'T k_hat, T gk_prev', 'T gk_hat', '''
                gk_hat = exp(k_hat)*gk_prev;
            ''', 'softwindow_bw6')(kappa_hat, gk_prev)

        return None, None, ga_hat, gb_hat, gk_hat, gk_prev,
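
The gradients above differentiate a Graves-style soft attention window, w = sum_u phi(u) * c_u with phi(u) = sum_k alpha_k * exp(-beta_k * (kappa_k - u)^2). Since only the backward pass is shown, the following NumPy sketch reconstructs the matching forward computation from these gradients (a reconstruction under that assumption, not the original forward code; cs is the (batchsize, W, U) character tensor, and the returned kappa, beta and phai_mat correspond to the self.kappa, self.beta and self.phai_mat the backward reads):

import numpy

def soft_window_forward(cs, alpha_hat, beta_hat, kappa_hat, kappa_prev):
    batchsize, W, U = cs.shape
    alpha = numpy.exp(alpha_hat)                  # (batchsize, K)
    beta = numpy.exp(beta_hat)                    # (batchsize, K)
    kappa = kappa_prev + numpy.exp(kappa_hat)     # (batchsize, K)
    us = numpy.arange(U, dtype=numpy.float32).reshape((1, 1, U))
    diff = us - kappa[:, :, numpy.newaxis]        # (batchsize, K, U)
    phai_mat = alpha[:, :, numpy.newaxis] * numpy.exp(
        -beta[:, :, numpy.newaxis] * diff ** 2)   # (batchsize, K, U)
    phai = phai_mat.sum(axis=1)                   # (batchsize, U)
    w = numpy.matmul(cs, phai[:, :, numpy.newaxis])[:, :, 0]  # (batchsize, W)
    return w, kappa, beta, phai_mat
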
Example #50
0
    def forward_gpu(self, inputs):
        from chainer.cuda import cupy

        mean_x, cov_x, t = inputs
        dim = len(mean_x[0])
        self._make_samples(t)

        self._pos_indexes = self.samples[:,0]
        self._neg_indexes = self.samples[:,1]

        self._m_pos = self.M.take(self._pos_indexes, axis=0)
        self._c_pos = self.C.take(self._pos_indexes, axis=0)
        self._m_neg = self.M.take(self._neg_indexes, axis=0)
        self._c_neg = self.C.take(self._neg_indexes, axis=0)

        if self._covariance_type == CovarianceType.diagonal:
            kern_trace = cuda.reduce(
                'T Ci, T Cj', 'T tr', 
                'Cj / Ci', 'a + b', 'tr = a', 0, 
                'trace')
            tr_p = kern_trace(cov_x, self._c_pos, axis=1)
            tr_n = kern_trace(cov_x, self._c_neg, axis=1)

            kern_det = cuda.reduce(
                'T Ci, T Cj', 'T det', 
                '__logf(Cj) - __logf(Ci)', 'a + b', 'det = a', 0, 
                'determinant')
            det_p = kern_det(cov_x, self._c_pos, axis=1)
            det_n = kern_det(cov_x, self._c_neg, axis=1)

            kern_fac = cuda.reduce(
                'T Mi, T Mj, T Ci', 'T out', 
                '__powf(abs(Mi - Mj), 2.0) / Ci', 'a + b', 'out = a', 0, 
                'factor')
            fac_p = kern_fac(mean_x, self._m_pos, cov_x, axis=1)
            fac_n = kern_fac(mean_x, self._m_neg, cov_x, axis=1)

            self._kl_pos, self._kl_neg, loss = cuda.elementwise(
                'T f_p, T f_n, T tr_p, T tr_n, T det_p, T det_n, S ip, S in, \
                 float32 m, int32 dim',
                'T kl_p, T kl_n, T L',
                '''
                if (ip == in) {
                  kl_p = 0.0;
                  kl_n = 0.0;
                  L = m;
                } else {
                  kl_p = -0.5 * (tr_p + f_p - dim - det_p);
                  kl_n = -0.5 * (tr_n + f_n - dim - det_n);
                  L = max(0.0, m - kl_p + kl_n);
                }
                ''', 
                'loss_function_diagonal'
            )(fac_p, fac_n, tr_p, tr_n, det_p, det_n, self._pos_indexes, 
              self._neg_indexes, self._margin, dim)

        elif self._covariance_type == CovarianceType.spherical:
            kern_sq_err_sum = cuda.reduce(
                'T in0, T in1', 'T out', 
                '__powf(abs(in0 - in1), 2.0)', 'a + b', 'out = a', 0, 
                'residual_sum_of_squares')
            
            sq_p = kern_sq_err_sum(mean_x, self._m_pos, axis=1)[cupy.newaxis, :].T
            sq_n = kern_sq_err_sum(mean_x, self._m_neg, axis=1)[cupy.newaxis, :].T

            self._kl_pos, self._kl_neg, loss = cuda.elementwise(
                'T sq_p, T sq_n, T cx, T cp, T cn, S ip, S in, float32 m, int32 dim',
                'T kl_p, T kl_n, T L',
                '''
                if (ip == in) {
                  kl_p = 0.0;
                  kl_n = 0.0;
                  L = m;
                } else {
                  T tr_p = dim * cp / cx;
                  T tr_n = dim * cn / cx;
                  T det_p = dim * __logf(cp / cx);
                  T det_n = dim * __logf(cn / cx);
                  kl_p = -0.5 * (tr_p + sq_p / cx - dim - det_p);
                  kl_n = -0.5 * (tr_n + sq_n / cx - dim - det_n);
                  L = max(0.0, m - kl_p + kl_n);
                }
                ''', 
                'loss_function_spherical'
            )(sq_p, sq_n, cov_x, self._c_pos, self._c_neg,
              self._pos_indexes[cupy.newaxis, :].T, 
              self._neg_indexes[cupy.newaxis, :].T,
              self._margin, dim)
            
        sum_loss = cuda.cupy.sum(loss)
        return sum_loss, 
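
In the diagonal branch above, the three reductions assemble the KL divergence between two diagonal Gaussians and the elementwise kernel turns it into a margin loss. A NumPy transcription of those formulas, for readability only (variable names mirror the kernel arguments; Mi/Ci belong to the query Gaussian, Mj/Cj to the sampled positive or negative one):

import numpy

def neg_kl_diagonal(Mi, Ci, Mj, Cj):
    # -KL( N(Mj, diag(Cj)) || N(Mi, diag(Ci)) ), matching the trace / determinant /
    # factor reductions and the '-0.5 * (tr + f - dim - det)' expression above
    dim = Mi.shape[-1]
    tr = (Cj / Ci).sum(axis=-1)
    fac = ((Mi - Mj) ** 2 / Ci).sum(axis=-1)
    det = (numpy.log(Cj) - numpy.log(Ci)).sum(axis=-1)
    return -0.5 * (tr + fac - dim - det)

def margin_loss(kl_p, kl_n, margin):
    # hinge loss: the positive pair's KL must be at least `margin` smaller than the negative pair's
    return numpy.maximum(0.0, margin - kl_p + kl_n)
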
Example #51
0
    def forward(self, inputs):
        #
        # preprocessing
        #
        xp = cuda.get_array_module(*inputs)
        x, gamma, beta = inputs[:3]
        if configuration.config.train:
            if self.running_mean is None:
                self.running_mean = xp.zeros_like(beta, dtype=xp.float32)
                self.running_var = xp.zeros_like(gamma, dtype=xp.float32)
            else:
                self.running_mean = xp.array(self.running_mean)
                self.running_var = xp.array(self.running_var)
        elif len(inputs) == 6:
            self.fixed_mean = inputs[4]
            self.fixed_var = inputs[5]

        head_ndim = beta.ndim + 1
        expander = (None, Ellipsis) + (None, ) * (x.ndim - head_ndim)

        #
        # start of forward path
        #
        if configuration.config.train:
            axis = (0, ) + tuple(range(head_ndim, x.ndim))
            mean = x.mean(axis=axis)
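            # The reduce kernel below computes the (biased) per-channel variance:
            # alpha = x.shape[1] / x.size = 1 / (number of elements per channel), so the
            # summed squared deviations come out as a mean over `axis`.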
            var = cuda.reduce('S x, T mean, T alpha', 'T out',
                              '(x - mean) * (x - mean)', 'a + b',
                              'out = alpha * a', '0',
                              'conv_bn_var')(x,
                                             mean[expander],
                                             x.shape[1] / x.size,
                                             axis=axis,
                                             keepdims=False)
        else:
            mean = self.fixed_mean
            var = self.fixed_var
        if xp is numpy:
            raise NotImplementedError()
        else:
            self.std_inv = cuda.elementwise(
                'T var, T eps', 'T std_inv', '''
                std_inv = 1 / sqrt(var + eps);
                ''', 'conv_bn_std_inv')(var, self.eps)
            self.x_hat, y = cuda.elementwise(
                'T x, T mean, T std_inv, T gamma, T beta', 'T x_hat, T y', '''
                x_hat = (x - mean) * std_inv;
                y = gamma * x_hat + beta;
                ''', 'conv_bn_fwd')(x, mean[expander], self.std_inv[expander],
                                    gamma[expander], beta[expander])
        #
        # end of forward path
        #

        #
        # calculation of lipschitz constant
        #
        if chainer.config.train and getattr(chainer.config, 'lmt', False):
            #
            # power iteration for a matrix Diag(\gamma_i/\sigma_i)W
            #
            # u <= Diag(\gamma_i/\sigma_i) u
            # v <= W u
            # u_mid <= W^T v
            # u <= Diag(\gamma_i/\sigma_i)^T v
            #
            W = inputs[3].reshape((inputs[3].shape[0], -1))

            tmp_l = gamma * self.std_inv
            self.u *= tmp_l
            self.v = self.u.dot(W)

            # normalize for back propagation
            normalize(self.v, eps=1e-20)

            # do not normalize u_mid
            self.u_mid = self.v.dot(W.T)

            self.u[:] = self.u_mid * tmp_l

            # normalize for back propagation
            nu = normalize(self.u, eps=1e-20)

            # spectral norm is approximated by the norm of a vector u
            l = nu.reshape((1, ))
        else:
            # not used
            l = xp.ones((1, ), dtype=xp.float32)

        #
        # calculate running average of statistics
        #
        if configuration.config.train:
            self.running_mean *= self.decay
            self.running_mean += mean * (1 - self.decay)
            self.running_var *= self.decay
            self.running_var += var * (1 - self.decay)
        return y, l
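
The Lipschitz branch above runs one power-iteration step per forward pass to approximate the spectral norm of A = Diag(gamma_i / sigma_i) W, keeping u (and v) as persistent buffers. A self-contained NumPy sketch of the same update follows; the normalize helper is assumed to normalize its argument in place and return the L2 norm, which is how the snippet uses it.

import numpy

def normalize(v, eps=1e-20):
    # in-place L2 normalization; returns the norm (assumed behaviour of the helper above)
    norm = numpy.sqrt((v * v).sum()) + eps
    v /= norm
    return norm

def spectral_norm_step(u, W, scale):
    # One power-iteration step for A = diag(scale) @ W, written in row-vector form.
    # u: (1, out_channels) persistent buffer, W: (out_channels, fan_in), scale = gamma / sigma.
    u *= scale                 # u <- Diag(gamma / sigma) u
    v = u.dot(W)               # v <- u A
    normalize(v)               # keep the approximate right singular vector at unit length
    u_mid = v.dot(W.T)         # u_mid <- v W^T   (not normalized, as in the snippet)
    u[:] = u_mid * scale       # u <- v A^T
    sigma = normalize(u)       # ||A v^T|| approximates the largest singular value of A
    return sigma, v

In the snippet, the norm returned by the final normalization (nu, reshaped to (1,)) is what gets passed on as the Lipschitz estimate l.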