Example no. 1
    def forward_cpu(self, inputs):
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = numpy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= _broadcast_to(self.class_weight.reshape(shape), x.shape)
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        t_valid = t != self.ignore_label
        t = t * t_valid
        log_p = log_yd[t.ravel(), numpy.arange(t.size)]

        log_p *= t_valid.ravel()
        if self.reduce == 'mean':
            # deal with the case where the SoftmaxCrossEntropy is
            # unpickled from the old version
            if self.normalize:
                count = t_valid.sum()
            else:
                count = len(x)
            self._coeff = 1.0 / max(count, 1)

            y = log_p.sum(keepdims=True) * (-self._coeff)
            return y.reshape(()),
        else:
            return -log_p.reshape(t.shape),
Example no. 2
    def forward_cpu(self, inputs):
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = log_softmax._log_softmax(x, self.use_cudnn)
        if self.cache_score:
            self.y = numpy.exp(log_y)
        if self.class_weight is not None:
            if self.class_weight.shape != x.shape:
                shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
                self.class_weight = numpy.broadcast_to(
                    self.class_weight.reshape(shape), x.shape)
            log_y *= self.class_weight
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]

        # deal with the case where the SoftmaxCrossEntropy is
        # unpickled from the old version
        if self.normalize:
            count = (t != self.ignore_label).sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)

        y = (log_p * (t.ravel() != self.ignore_label)).sum(keepdims=True) \
            * (-self._coeff)
        return y.reshape(()),
Example no. 3
    def forward_cpu(self, inputs):
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = numpy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= _broadcast_to(self.class_weight.reshape(shape), x.shape)
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]

        log_p *= (t.ravel() != self.ignore_label)
        if self.reduce == 'mean':
            # deal with the case where the SoftmaxCrossEntropy is
            # unpickled from the old version
            if self.normalize:
                count = (t != self.ignore_label).sum()
            else:
                count = len(x)
            self._coeff = 1.0 / max(count, 1)

            y = log_p.sum(keepdims=True) * (-self._coeff)
            return y.reshape(()),
        else:
            return -log_p.reshape(t.shape),
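
The three forward_cpu variants above compute, for reduce == 'mean', the average negative log-probability of the target class over the non-ignored entries. Below is a minimal NumPy sketch of that computation; the toy x, t and the ignore label -1 are assumed values for illustration, not taken from the snippets.

import numpy

def naive_softmax_cross_entropy(x, t, ignore_label=-1):
    # x: (batch, n_class) scores, t: (batch,) integer labels
    log_y = x - x.max(axis=1, keepdims=True)
    log_y -= numpy.log(numpy.exp(log_y).sum(axis=1, keepdims=True))
    valid = t != ignore_label
    # gather the target log-probabilities, zeroing ignored rows
    log_p = log_y[numpy.arange(len(t)), numpy.where(valid, t, 0)] * valid
    return -log_p.sum() / max(valid.sum(), 1)

x = numpy.array([[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]])
t = numpy.array([0, -1])  # the second row is ignored
print(naive_softmax_cross_entropy(x, t))
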
Example no. 4
    def backward_cpu(self, inputs, grad_outputs):
        x, t = inputs
        gloss = grad_outputs[0]
        n_unit = t.size // len(t)
        if hasattr(self, 'y'):
            y = self.y.copy()
        else:
            y = log_softmax._log_softmax(x, self.use_cudnn)
            numpy.exp(y, out=y)
        if y.ndim == 2:
            gx = y
            gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1
            gx *= (t != self.ignore_label).reshape((len(t), 1))
        else:
            # in the case where y.ndim is higher than 2,
            # we think that a current implementation is inefficient
            # because it yields two provisional arrays for indexing.
            gx = y.reshape(y.shape[0], y.shape[1], -1)
            fst_index = numpy.arange(t.size) // n_unit
            trd_index = numpy.arange(t.size) % n_unit
            gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
            gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
            gx = gx.reshape(y.shape)

        gx *= gloss * self._coeff
        return gx, None
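
For the 2-D branch of backward_cpu, the gradient reduces to softmax(x) minus a one-hot encoding of t, scaled by the same coefficient as the forward pass. A small NumPy sketch of that identity; the toy x, t and coeff are assumed values for illustration.

import numpy

x = numpy.array([[1.0, 2.0, 0.5], [0.0, -1.0, 3.0]])
t = numpy.array([1, 2])
coeff = 1.0 / len(t)  # matches self._coeff when reduce == 'mean'

y = numpy.exp(x - x.max(axis=1, keepdims=True))
y /= y.sum(axis=1, keepdims=True)         # softmax(x)
gx = y.copy()
gx[numpy.arange(len(t)), t] -= 1          # y - onehot(t)
gx *= coeff                               # scale by the upstream 1/N
print(gx)
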
Example no. 5
    def backward_gpu(self, inputs, grad_outputs):
        cupy = cuda.cupy
        x, t = inputs
        if hasattr(self, 'y'):
            y = self.y
        else:
            y = log_softmax._log_softmax(x, self.use_cudnn)
            cupy.exp(y, out=y)
        gloss = grad_outputs[0]
        n_unit = t.size // len(t)
        coeff = gloss * self._coeff
        if self.class_weight is None:
            gx = cuda.elementwise(
                'T y, S t, raw T coeff, S n_channel, S n_unit, S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = (t == ignore_label) ? 0 : (coeff[0] * (y - (c == t)));
                ''',
                'softmax_crossent_bwd')(
                    y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit,
                    self.ignore_label)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, raw T coeff, S n_channel, S n_unit, '
                'S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff[0] * (y - (c == t)) * w[t];
                ''',
                'softmax_crossent_bwd')(
                    y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                    x.shape[1], n_unit, self.ignore_label)
        return gx, None
Example no. 6
 def backward_gpu(self, inputs, grad_outputs):
     cupy = cuda.cupy
     x, t = inputs
     if hasattr(self, 'y'):
         y = self.y
     else:
         y = log_softmax._log_softmax(x, self.use_cudnn)
         cupy.exp(y, out=y)
     gloss = grad_outputs[0]
     n_unit = t.size // len(t)
     coeff = gloss * self._coeff
     if self.class_weight is None:
         gx = cuda.elementwise(
             'T y, S t, raw T coeff, S n_channel, S n_unit',
             'T gx',
             '''
                 const int c = (i / n_unit % n_channel);
                 gx = (t == -1) ? 0 : (coeff[0] * (y - (c == t)));
             ''',
             'softmax_crossent_bwd')(
                 y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit)
     else:
         gx = cuda.elementwise(
             'T y, raw T w, S t, raw T coeff, S n_channel, S n_unit',
             'T gx',
             '''
                 const int c = (i / n_unit % n_channel);
                 gx = t == -1 ? 0 : coeff[0] * (y - (c == t)) * w[t];
             ''',
             'softmax_crossent_bwd')(
                 y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                 x.shape[1], n_unit)
     return gx, None
Example no. 7
    def backward_cpu(self, inputs, grad_outputs):
        x, t = inputs
        gloss = grad_outputs[0]
        n_unit = t.size // len(t)
        if hasattr(self, 'y'):
            y = self.y.copy()
        else:
            y = log_softmax._log_softmax(x, self.use_cudnn)
            y = numpy.exp(y, out=y)
        if y.ndim == 2:
            gx = y
            gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1
            gx *= (t != self.ignore_label).reshape((len(t), 1))
        else:
            # in the case where y.ndim is higher than 2,
            # we think that a current implementation is inefficient
            # because it yields two provisional arrays for indexing.
            gx = y.reshape(y.shape[0], y.shape[1], -1)
            fst_index = numpy.arange(t.size) // n_unit
            trd_index = numpy.arange(t.size) % n_unit
            gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
            gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
            gx = gx.reshape(y.shape)

        gx *= gloss * self._coeff
        return gx, None
Example no. 8
    def backward_gpu(self, inputs, grad_outputs):
        cupy = cuda.cupy
        x, t = inputs
        if hasattr(self, 'y'):
            y = self.y
        else:
            y = log_softmax._log_softmax(x, self.use_cudnn)
            cupy.exp(y, out=y)
        gloss = grad_outputs[0]
        n_unit = t.size // len(t)
        if self.reduce == 'mean':
            coeff = gloss * self._coeff
        else:
            coeff = gloss[:, None, ...]

        if self.class_weight is None:
            gx = cuda.elementwise(
                'T y, S t, T coeff, S n_channel, S n_unit', 'T gx', '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == -1 ? 0 : coeff * (y - (c == t));
                ''', 'softmax_crossent_bwd')(y, cupy.expand_dims(t, 1), coeff,
                                             x.shape[1], n_unit)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, T coeff, S n_channel, S n_unit', 'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == -1 ? 0 : coeff * (y - (c == t)) * w[t];
                ''', 'softmax_crossent_weight_bwd')(y, self.class_weight,
                                                    cupy.expand_dims(t, 1),
                                                    coeff, x.shape[1], n_unit)

        return gx, None
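
In the GPU backward kernels above, each flat element i of the (batch, n_channel, spatial...) gradient recovers its channel with c = (i / n_unit % n_channel), where n_unit is the product of the spatial dimensions (t.size // len(t) in the snippets). A NumPy sketch of the same index arithmetic on an assumed 4-D shape:

import numpy

shape = (2, 3, 4, 5)                    # (batch, n_channel, H, W), assumed
n_channel = shape[1]
n_unit = shape[2] * shape[3]            # product of the spatial dimensions

i = numpy.arange(numpy.prod(shape))     # flat element index, C order
c = i // n_unit % n_channel             # channel of each flat element

# the same channel index obtained directly from the array layout
c_ref = numpy.indices(shape)[1].ravel()
assert (c == c_ref).all()
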
Example no. 9
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = log_softmax._log_softmax(x, self.use_cudnn)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(
                self.class_weight.reshape(shape), x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
            't == -1 ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
        return ret,
Example no. 10
    def forward_cpu(self, inputs):
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = numpy.exp(log_y)
        if self.class_weight is not None:
            log_y *= self.class_weight
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]

        log_p *= (t.ravel() != self.ignore_label)
        if self.reduce == 'mean':
            # deal with the case where the SoftmaxCrossEntropy is
            # unpickled from the old version
            if self.normalize:
                count = (t != self.ignore_label).sum()
            else:
                count = len(x)
            self._coeff = 1.0 / max(count, 1)

            y = log_p.sum(keepdims=True) * (-self._coeff)
            return y.reshape(()),
        else:
            return -log_p.reshape(t.shape),
Example no. 11
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = log_softmax._log_softmax(x, self.use_cudnn)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(self.class_weight.reshape(shape),
                                       x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce('S t, raw T log_y, int32 n_channel, raw T coeff',
                          'T out',
                          't == -1 ? T(0) : log_y[_j * n_channel + t]',
                          'a + b', 'out = a * -coeff[0]', '0',
                          'crossent_fwd')(t, log_y.reduced_view(),
                                          log_y.shape[-1], self._coeff)
        return ret,
Example no. 12
    def forward_gpu(self, inputs):
        class_weight = backend.from_chx(self.class_weight)

        self.retain_inputs((0, 1))
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(class_weight.reshape(shape), x.shape)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)

        if self.reduce == 'mean':
            # Reduction is performed in a promoted dtype
            reduc_dtype = _reduction_dtype(x.dtype)
            if self.normalize:
                count = (t != self.ignore_label).sum(dtype=reduc_dtype)
                count = cupy.maximum(1, count)
                coeff = 1. / count
            else:
                coeff = cupy.array(1. / max(1, len(t)), dtype=reduc_dtype)
            self._coeff = coeff

            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw U coeff, '
                'S ignore_label', 'U out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = static_cast<U>(a * -coeff[0])', '0',
                'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                                self._coeff, self.ignore_label)
            ret = ret.astype(log_y.dtype, copy=False)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out', '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''', 'softmax_crossent_no_reduce_fwd')(t, log_y.reduced_view(),
                                                       log_y.shape[-1],
                                                       self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
Example no. 13
 def backward_cpu(self, inputs, grad_outputs):
     if len(inputs) == 2:
         (x, t), tt = inputs, None
     else:
         x, t, tt = inputs
     gloss = grad_outputs[0]
     if x.size == 0:
         return numpy.zeros(x.shape, dtype=x.dtype), None
     if self.y is not None:
         y = self.y.copy()
     else:
         y = log_softmax._log_softmax(x)
         numpy.exp(y, out=y)
     if self.train_threshold is not None:
         _yt = y[numpy.arange(len(t)), t]
         # print('# _yt: {}'.format(_yt))
         _scale = (1.0 - _yt / self.train_threshold) / (1.0 - _yt + 1e-5)
         _scale[_scale < 0.0] = 0.0
         # print('# _scale: {}'.format(_scale))
     if y.ndim == 2:
         gx = y
         if tt is None:
             gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1
         else:
             gx -= tt
         if self.class_weight is not None:
             shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
             c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
             c = c[numpy.arange(len(t)), numpy.maximum(t, 0)]
             gx *= _broadcast_to(numpy.expand_dims(c, 1), gx.shape)
         gx *= (t != self.ignore_label).reshape((len(t), 1))
         if self.train_threshold is not None:
             # print('# gx:\n{}'.format(gx))
             gx *= _scale.reshape((len(t), 1))
             # print('# gx:\n{}'.format(gx))
     else:
         # in the case where y.ndim is higher than 2,
         # we think that a current implementation is inefficient
         # because it yields two provisional arrays for indexing.
         n_unit = t.size // len(t)
         gx = y.reshape(y.shape[0], y.shape[1], -1)
         fst_index = numpy.arange(t.size) // n_unit
         trd_index = numpy.arange(t.size) % n_unit
         gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
         if self.class_weight is not None:
             shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
             c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
             c = c.reshape(gx.shape)
             c = c[fst_index, numpy.maximum(t.ravel(), 0), trd_index]
             c = c.reshape(y.shape[0], 1, -1)
             gx *= _broadcast_to(c, gx.shape)
         gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
         gx = gx.reshape(y.shape)
     if self.reduce == 'mean':
         gx *= gloss * self._coeff
     else:
         gx *= gloss[:, None]
     return gx, None
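
The train_threshold branch in this variant rescales each sample's gradient by (1 - y_t / threshold) / (1 - y_t + 1e-5), clipped at zero, so samples already predicted above the threshold stop contributing. A standalone NumPy sketch of just that factor; the probabilities and threshold below are assumed values.

import numpy

y_t = numpy.array([0.2, 0.7, 0.95])      # predicted prob. of the true class
threshold = 0.9                          # assumed train_threshold

scale = (1.0 - y_t / threshold) / (1.0 - y_t + 1e-5)
scale[scale < 0.0] = 0.0                 # samples above the threshold get 0
print(scale)                             # per-sample gradient multipliers
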
Example no. 14
    def forward(self, inputs):
        xp = cuda.get_array_module(*inputs)
        x, t = inputs

        log_y = log_softmax._log_softmax(x)
        self.y = xp.exp(log_y)

        count = (t != self.ignore_label).sum()
        self._coeff = 1.0 / max(count, 1)

        return xp.array([1.], dtype=xp.float32),
Example no. 15
    def forward_gpu(self, inputs_and_grad_outputs):
        class_weight = cuda.to_gpu(self.class_weight)

        cupy = cuda.cupy
        x, t, gloss = inputs_and_grad_outputs
        if x.size == 0:
            return cupy.zeros(x.shape, dtype=x.dtype), None
        if self.y is not None:
            y = self.y
        else:
            y = log_softmax._log_softmax(x)
            cupy.exp(y, out=y)
        n_unit = t.size // len(t)
        if self.coeff is not None:
            coeff = self.coeff
        else:
            gloss = gloss[:, None, ...]
            coeff = cupy.array(1, dtype=gloss.dtype)  # dtype does not matter

        if self.class_weight is None:
            gx = cuda.elementwise(
                'T y, S t, T gloss, U coeff, S n_channel, S n_unit, '
                'S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    if (t == ignore_label) {
                        gx = T(0);
                    } else {
                        gx = static_cast<T>(gloss * coeff * (y - (c == t)));
                    }
                ''',
                'softmax_crossent_bwd')(
                    y, cupy.expand_dims(t, 1), gloss, coeff, x.shape[1],
                    n_unit, self.ignore_label)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, T gloss, U coeff, '
                'S n_channel, S n_unit, S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    if (t == ignore_label) {
                        gx = T(0);
                    } else {
                        gx = static_cast<T>(
                            gloss * coeff * (y - (c == t)) * w[t]);
                    }
                ''',
                'softmax_crossent_weight_bwd')(
                    y, class_weight, cupy.expand_dims(t, 1), gloss, coeff,
                    x.shape[1], n_unit, self.ignore_label)

        return gx,
Example no. 16
    def backward_gpu(self, inputs, grad_outputs):
        cupy = cuda.cupy
        if len(inputs) == 2:
            (x, t), tt = inputs, None
        else:
            x, t, tt = inputs
        if x.size == 0:
            return cupy.zeros(x.shape, dtype=x.dtype), None
        if self.y is not None:
            y = self.y
        else:
            y = log_softmax._log_softmax(x)
            cupy.exp(y, out=y)
        if self.train_threshold is not None:
            _yt = y[cupy.arange(len(t)), t]
            _scale = (1.0 - _yt / self.train_threshold) / (1.0 - _yt + 1e-5)
            _scale[_scale < 0.0] = 0.0
        gloss = grad_outputs[0]
        n_unit = t.size // len(t)
        if self.reduce == 'mean':
            coeff = gloss * self._coeff
        else:
            coeff = gloss[:, None, ...]

        if self.class_weight is None:
            if tt is None:
                gx = cuda.elementwise(
                    'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
                    'T gx', '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t));
                    ''', 'softmax_crossent_bwd')(y, cupy.expand_dims(t, 1),
                                                 coeff, x.shape[1], n_unit,
                                                 self.ignore_label)
            else:
                # print('# tt:{}'.format(tt))
                # print('# tt:{}'.format(tt.sum(axis=1)))
                gx = coeff * (y - tt)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
                'S ignore_label', 'T gx', '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
                ''', 'softmax_crossent_weight_bwd')(y, self.class_weight,
                                                    cupy.expand_dims(t, 1),
                                                    coeff, x.shape[1], n_unit,
                                                    self.ignore_label)

        if self.train_threshold is not None:
            gx *= _scale.reshape((len(t), 1))

        return gx, None
Example no. 17
    def forward_gpu(self, inputs):
        class_weight = backend.from_chainerx(self.class_weight)

        self.retain_inputs((0, 1))
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(class_weight.reshape(shape), x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw T coeff, '
                'S ignore_label',
                'T out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1],
              self._coeff, self.ignore_label)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
                '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''',
                'softmax_crossent_no_reduce_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
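
When reduce == 'no', the gathered log-probabilities are returned element-wise instead of being averaged, with ignored positions set to zero. A NumPy sketch of that branch; the toy x and t are assumed values.

import numpy

x = numpy.array([[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]])
t = numpy.array([0, -1])                 # -1 is the ignore label

log_y = x - x.max(axis=1, keepdims=True)
log_y -= numpy.log(numpy.exp(log_y).sum(axis=1, keepdims=True))
valid = t != -1
loss = -log_y[numpy.arange(len(t)), numpy.where(valid, t, 0)] * valid
print(loss)                              # shape == t.shape, zeros where ignored
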
Example no. 18
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        if len(inputs) == 2:
            (x, t), tt = inputs, None
        else:
            x, t, tt = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(self.class_weight.reshape(shape),
                                       x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw T coeff, '
                'S ignore_label', 'T out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = a * -coeff[0]', '0',
                'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                                self._coeff, self.ignore_label)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out', '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''', 'softmax_crossent_no_reduce_fwd')(t, log_y.reduced_view(),
                                                       log_y.shape[-1],
                                                       self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
Example no. 19
    def forward_gpu(self, inputs_and_grad_outputs):
        class_weight = cuda.to_gpu(self.class_weight)

        cupy = cuda.cupy
        x, t, gloss = inputs_and_grad_outputs
        if x.size == 0:
            return cupy.zeros(x.shape, dtype=x.dtype), None
        if self.y is not None:
            y = self.y
        else:
            y = log_softmax._log_softmax(x)
            cupy.exp(y, out=y)
        n_unit = t.size // len(t)
        if self.coeff is not None:
            coeff = self.coeff
        else:
            gloss = gloss[:, None, ...]
            coeff = cupy.array(1, dtype=gloss.dtype)  # dtype does not matter

        if self.soft_target:
            gx = gloss * coeff * (y - t)
        elif self.class_weight is None:
            gx = cuda.elementwise(
                'T y, S t, T gloss, U coeff, S n_channel, S n_unit, '
                'S ignore_label', 'T gx', '''
                    const int c = (i / n_unit % n_channel);
                    if (t == ignore_label) {
                        gx = T(0);
                    } else {
                        gx = static_cast<T>(gloss * coeff * (y - (c == t)));
                    }
                ''', 'softmax_crossent_bwd')(y, cupy.expand_dims(t, 1), gloss,
                                             coeff, x.shape[1], n_unit,
                                             self.ignore_label)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, T gloss, U coeff, '
                'S n_channel, S n_unit, S ignore_label', 'T gx', '''
                    const int c = (i / n_unit % n_channel);
                    if (t == ignore_label) {
                        gx = T(0);
                    } else {
                        gx = static_cast<T>(
                            gloss * coeff * (y - (c == t)) * w[t]);
                    }
                ''', 'softmax_crossent_weight_bwd')(y, class_weight,
                                                    cupy.expand_dims(t, 1),
                                                    gloss, coeff, x.shape[1],
                                                    n_unit, self.ignore_label)

        return gx,
Example no. 20
    def backward_cpu(self, inputs, grad_outputs):
        x, t = inputs
        gloss = grad_outputs[0]
        if hasattr(self, 'y'):
            y = self.y.copy()
        else:
            y = log_softmax._log_softmax(x, self.use_cudnn)
            numpy.exp(y, out=y)
        if y.ndim == 2:
            gx = y

            # Improve me
            # It is disabled by default
            if mkld.enable_softmax_cross_entropyF(inputs):
                mkldnn_sce_bwd = mkldnn.SoftmaxCrossEntropy_F32_softmax_cross_entropy_create_backward(
                    gx.shape)
                mkldnn_sce_bwd.backward(gx.ravel(), t.ravel(), gx.shape)
            else:
                gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1

            if self.class_weight is not None:
                shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
                c = numpy.broadcast_to(self.class_weight.reshape(shape),
                                       x.shape)
                c = c[numpy.arange(len(t)), numpy.maximum(t, 0)]
                gx *= numpy.broadcast_to(numpy.expand_dims(c, 1), gx.shape)
            gx *= (t != self.ignore_label).reshape((len(t), 1))
        else:
            # in the case where y.ndim is higher than 2,
            # we think that a current implementation is inefficient
            # because it yields two provisional arrays for indexing.
            n_unit = t.size // len(t)
            gx = y.reshape(y.shape[0], y.shape[1], -1)
            fst_index = numpy.arange(t.size) // n_unit
            trd_index = numpy.arange(t.size) % n_unit
            gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
            if self.class_weight is not None:
                shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
                c = numpy.broadcast_to(self.class_weight.reshape(shape),
                                       x.shape)
                c = c.reshape(gx.shape)
                c = c[fst_index, numpy.maximum(t.ravel(), 0), trd_index]
                c = c.reshape(y.shape[0], 1, -1)
                gx *= numpy.broadcast_to(c, gx.shape)
            gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
            gx = gx.reshape(y.shape)
        gx *= gloss * self._coeff
        return gx, None
Example no. 21
 def forward_cpu(self, inputs_and_grad_outputs):
     x, t, gloss = inputs_and_grad_outputs
     if x.size == 0:
         return numpy.zeros(x.shape, dtype=x.dtype), None
     if self.y is not None:
         y = self.y.copy()
     else:
         y = log_softmax._log_softmax(x)
         numpy.exp(y, out=y)
     t_valid = t != self.ignore_label
     t = t * t_valid
     if self.soft_target:
         gx = y - t
     elif y.ndim == 2:
         gx = y
         gx[numpy.arange(len(t)), t] -= 1
         if self.class_weight is not None:
             shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
             c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
             c = c[numpy.arange(len(t)), t]
             gx *= _broadcast_to(numpy.expand_dims(c, 1), gx.shape)
         gx *= t_valid.reshape((len(t), 1))
     else:
         # in the case where y.ndim is higher than 2,
         # we think that a current implementation is inefficient
         # because it yields two provisional arrays for indexing.
         n_unit = t.size // len(t)
         gx = y.reshape(y.shape[0], y.shape[1], -1)
         fst_index = numpy.arange(t.size) // n_unit
         trd_index = numpy.arange(t.size) % n_unit
         gx[fst_index, t.ravel(), trd_index] -= 1
         if self.class_weight is not None:
             shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
             c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
             c = c.reshape(gx.shape)
             c = c[fst_index, t.ravel(), trd_index]
             c = c.reshape(y.shape[0], 1, -1)
             gx *= _broadcast_to(c, gx.shape)
         gx *= t_valid.reshape((len(t), 1, -1))
         gx = gx.reshape(y.shape)
     if self.coeff is not None:
         gx *= gloss * self.coeff
     else:
         gx *= gloss[:, None]
     return gx,
Example no. 22
    def backward_gpu(self, inputs, grad_outputs):
        cupy = cuda.cupy
        x, soft_label = inputs
        if x.size == 0:
            return cupy.zeros(x.shape, dtype=x.dtype), None
        if self.y is not None:
            y = self.y
        else:
            y = log_softmax._log_softmax(x)
            cupy.exp(y, out=y)
        gloss = grad_outputs[0]
        coeff = gloss * self._coeff

        if self.class_weight is None:
            gx = (y - soft_label) * coeff
        else:
            gx = (y - soft_label) * self.class_weight * coeff
        return gx, None
Example no. 23
 def backward_cpu(self, inputs, grad_outputs):
     x, t = inputs
     gloss = grad_outputs[0]
     if x.size == 0:
         return numpy.zeros(x.shape, dtype=x.dtype), None
     if self.y is not None:
         y = self.y.copy()
     else:
         y = log_softmax._log_softmax(x)
         numpy.exp(y, out=y)
     t_valid = t != self.ignore_label
     t = t * t_valid
     if y.ndim == 2:
         gx = y
         gx[numpy.arange(len(t)), t] -= 1
         if self.class_weight is not None:
             shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
             c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
             c = c[numpy.arange(len(t)), t]
             gx *= _broadcast_to(numpy.expand_dims(c, 1), gx.shape)
         gx *= t_valid.reshape((len(t), 1))
     else:
         # in the case where y.ndim is higher than 2,
         # we think that a current implementation is inefficient
         # because it yields two provisional arrays for indexing.
         n_unit = t.size // len(t)
         gx = y.reshape(y.shape[0], y.shape[1], -1)
         fst_index = numpy.arange(t.size) // n_unit
         trd_index = numpy.arange(t.size) % n_unit
         gx[fst_index, t.ravel(), trd_index] -= 1
         if self.class_weight is not None:
             shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
             c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
             c = c.reshape(gx.shape)
             c = c[fst_index, t.ravel(), trd_index]
             c = c.reshape(y.shape[0], 1, -1)
             gx *= _broadcast_to(c, gx.shape)
         gx *= t_valid.reshape((len(t), 1, -1))
         gx = gx.reshape(y.shape)
     if self.reduce == 'mean':
         gx *= gloss * self._coeff
     else:
         gx *= gloss[:, None]
     return gx, None
Example no. 24
    def backward_cpu(self, inputs, grad_outputs):
        x, t = inputs
        gloss = grad_outputs[0]
        if hasattr(self, 'y'):
            y = self.y.copy()
        else:
            y = log_softmax._log_softmax(x, self.use_cudnn)
            numpy.exp(y, out=y)
        if y.ndim == 2:
            gx = y
            gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1
            if self.class_weight is not None:
                shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
                c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
                c = c[numpy.arange(len(t)), numpy.maximum(t, 0)]
                gx *= _broadcast_to(numpy.expand_dims(c, 1), gx.shape)
            gx *= (t != self.ignore_label).reshape((len(t), 1))
        else:
            # in the case where y.ndim is higher than 2,
            # we think that a current implementation is inefficient
            # because it yields two provisional arrays for indexing.
            n_unit = t.size // len(t)
            gx = y.reshape(y.shape[0], y.shape[1], -1)
            fst_index = numpy.arange(t.size) // n_unit
            trd_index = numpy.arange(t.size) % n_unit
            gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
            if self.class_weight is not None:
                shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
                c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
                c = c.reshape(gx.shape)
                c = c[fst_index, numpy.maximum(t.ravel(), 0), trd_index]
                c = c.reshape(y.shape[0], 1, -1)
                gx *= _broadcast_to(c, gx.shape)
            gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
            gx = gx.reshape(y.shape)

        if self.reduce == 'mean':
            gx *= gloss * self._coeff
        else:
            gx *= gloss[:, None]
        # weight
        gx *= self.weight.reshape((len(y), 1))
        return gx, None
Example no. 25
    def forward_gpu(self, inputs_and_grad_outputs):
        class_weight = cuda.to_gpu(self.class_weight)

        cupy = cuda.cupy
        x, t, gloss = inputs_and_grad_outputs
        if x.size == 0:
            return cupy.zeros(x.shape, dtype=x.dtype), None
        if self.y is not None:
            y = self.y
        else:
            y = log_softmax._log_softmax(x)
            cupy.exp(y, out=y)
        n_unit = t.size // len(t)
        if self.reduce == 'mean':
            coeff = gloss * self.coeff
        else:
            coeff = gloss[:, None, ...]

        if self.class_weight is None:
            gx = cuda.elementwise(
                'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t));
                ''',
                'softmax_crossent_bwd')(
                    y, cupy.expand_dims(t, 1), coeff, x.shape[1],
                    n_unit, self.ignore_label)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
                'S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
                ''',
                'softmax_crossent_weight_bwd')(
                    y, class_weight, cupy.expand_dims(t, 1), coeff,
                    x.shape[1], n_unit, self.ignore_label)

        return gx,
Example no. 26
    def backward_cpu(self, inputs, grad_outputs):
        x, soft_label = inputs
        gloss = grad_outputs[0]
        if x.size == 0:
            return numpy.zeros(x.shape, dtype=x.dtype), None
        if self.y is not None:
            y = self.y.copy()
        else:
            y = log_softmax._log_softmax(x)
            numpy.exp(y, out=y)
        gx = y
        gx -= soft_label
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            gx *= _broadcast_to(numpy.expand_dims(c, 1), gx.shape)
        gx *= gloss * self._coeff

        return gx, None
Example no. 27
    def forward_cpu(self, inputs):
        class_weight = backend.from_chx(self.class_weight)

        self.retain_inputs((0, 1))
        x, t = inputs
        if x.ndim == t.ndim and x.shape == t.shape:
            self.soft_target = True
        if chainer.is_debug() and not self.soft_target:
            _check_input_values(x, t, self.ignore_label)

        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = numpy.exp(log_y)

        if self.soft_target:
            return self._soft_target_loss(numpy, x, t, log_y)

        if class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= _broadcast_to(class_weight.reshape(shape), x.shape)
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        t_valid = t != self.ignore_label
        t = t * t_valid
        log_p = log_yd[t.ravel(), numpy.arange(t.size)]

        log_p *= t_valid.ravel()
        if self.reduce == 'mean':
            if self.normalize:
                count = t_valid.sum()
            else:
                count = len(x)
            self._coeff = 1.0 / max(count, 1)

            # Perform reduction in a promoted dtype
            reduc_dtype = _reduction_dtype(x.dtype)
            y = log_p.sum(keepdims=True, dtype=reduc_dtype)
            y = y * (-self._coeff)
            y = y.astype(x.dtype, copy=False)
            return y.reshape(()),
        else:
            return -log_p.reshape(t.shape),
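
This variant performs the mean reduction in a promoted dtype (via the _reduction_dtype helper) and casts the result back to x.dtype afterwards. A small NumPy illustration of controlling the accumulator dtype in the same way; the array contents are arbitrary assumed values.

import numpy

log_p = numpy.full(4096, -0.01, dtype=numpy.float16)  # assumed toy values

s16 = log_p.sum()                        # accumulator stays in float16
s32 = log_p.sum(dtype=numpy.float32)     # promoted accumulator
print(s16, s32.astype(numpy.float16))    # cast back, as the snippet does
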
Example no. 28
    def backward_gpu(self, inputs, grad_outputs):
        cupy = cuda.cupy
        x, t = inputs
        if x.size == 0:
            return cupy.zeros(x.shape, dtype=x.dtype), None
        if self.y is not None:
            y = self.y
        else:
            y = log_softmax._log_softmax(x)
            cupy.exp(y, out=y)
        gloss = grad_outputs[0]
        n_unit = t.size // len(t)
        if self.reduce == 'mean':
            coeff = gloss * self._coeff
        else:
            coeff = gloss[:, None, ...]

        if self.class_weight is None:
            gx = cuda.elementwise(
                'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t));
                ''',
                'softmax_crossent_bwd')(
                    y, cupy.expand_dims(t, 1), coeff, x.shape[1],
                    n_unit, self.ignore_label)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
                'S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
                ''',
                'softmax_crossent_weight_bwd')(
                    y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                    x.shape[1], n_unit, self.ignore_label)

        return gx, None
Example no. 29
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, soft_label = inputs
        if chainer.is_debug():
            _check_input_values(x, soft_label)

        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(self.class_weight.reshape(shape),
                                       x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, soft_label.shape[1])
        else:
            coeff = max(1, len(soft_label))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        ret = -cupy.sum(soft_label * log_y) * self._coeff
        return ret,
Example no. 30
    def forward_cpu(self, inputs):
        x, soft_label = inputs
        if chainer.is_debug():
            _check_input_values(x, soft_label)

        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = numpy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= _broadcast_to(self.class_weight.reshape(shape), x.shape)

        log_p = numpy.array([numpy.sum(log_y * soft_label)])
        if self.normalize:
            count = x.shape[1]
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)

        y = log_p.sum(keepdims=True) * (-self._coeff)
        return y.reshape(()),
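
Examples 29 and 30 handle a soft-label target: soft_label is a full distribution over classes rather than an integer index, and the loss is -sum(soft_label * log_softmax(x)) scaled by the chosen coefficient. A NumPy sketch with assumed toy inputs:

import numpy

x = numpy.array([[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]])
soft_label = numpy.array([[0.7, 0.2, 0.1], [0.1, 0.1, 0.8]])

log_y = x - x.max(axis=1, keepdims=True)
log_y -= numpy.log(numpy.exp(log_y).sum(axis=1, keepdims=True))
coeff = 1.0 / max(len(x), 1)             # the normalize=False branch
loss = -(soft_label * log_y).sum() * coeff
print(loss)
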
Example no. 31
    def backward_gpu(self, inputs, grad_outputs):
        cupy = cuda.cupy
        x, t = inputs
        if hasattr(self, 'y'):
            y = self.y
        else:
            y = log_softmax._log_softmax(x)
            cupy.exp(y, out=y)
        gloss = grad_outputs[0]
        n_unit = t.size // len(t)
        if self.reduce == 'mean':
            coeff = gloss * self._coeff
        else:
            coeff = gloss[:, None, ...]

        if self.class_weight is None:
            gx = cuda.elementwise(
                'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t));
                ''',
                'softmax_crossent_bwd')(
                    y, cupy.expand_dims(t, 1), coeff, x.shape[1],
                    n_unit, self.ignore_label)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
                'S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
                ''',
                'softmax_crossent_weight_bwd')(
                    y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                    x.shape[1], n_unit, self.ignore_label)

        return gx, None
Example no. 32
 def backward_cpu(self, inputs, grad_outputs):
     x, t = inputs
     gloss = grad_outputs[0]
     if hasattr(self, 'y'):
         y = self.y.copy()
     else:
         y = log_softmax._log_softmax(x, self.use_cudnn)
         numpy.exp(y, out=y)
     if y.ndim == 2:
         gx = y
         gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1
         if self.class_weight is not None:
             shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
             c = numpy.broadcast_to(
                 self.class_weight.reshape(shape), x.shape)
             c = c[numpy.arange(len(t)), numpy.maximum(t, 0)]
             gx *= numpy.broadcast_to(numpy.expand_dims(c, 1), gx.shape)
         gx *= (t != self.ignore_label).reshape((len(t), 1))
     else:
         # in the case where y.ndim is higher than 2,
         # we think that a current implementation is inefficient
         # because it yields two provisional arrays for indexing.
         n_unit = t.size // len(t)
         gx = y.reshape(y.shape[0], y.shape[1], -1)
         fst_index = numpy.arange(t.size) // n_unit
         trd_index = numpy.arange(t.size) % n_unit
         gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
         if self.class_weight is not None:
             shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
             c = numpy.broadcast_to(
                 self.class_weight.reshape(shape), x.shape)
             c = c.reshape(gx.shape)
             c = c[fst_index, numpy.maximum(t.ravel(), 0), trd_index]
             c = c.reshape(y.shape[0], 1, -1)
             gx *= numpy.broadcast_to(c, gx.shape)
         gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
         gx = gx.reshape(y.shape)
     gx *= gloss * self._coeff
     return gx, None
Example no. 33
    def forward_cpu(self, inputs):
        class_weight = backend.from_chx(self.class_weight)

        self.retain_inputs((0, 1))
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = numpy.exp(log_y)
        if class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= _broadcast_to(class_weight.reshape(shape), x.shape)
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        t_valid = t != self.ignore_label
        t = t * t_valid
        log_p = log_yd[t.ravel(), numpy.arange(t.size)]

        log_p *= t_valid.ravel()
        if self.reduce == 'mean':
            # deal with the case where the SoftmaxCrossEntropy is
            # unpickled from the old version
            if self.normalize:
                count = t_valid.sum()
            else:
                count = len(x)
            self._coeff = 1.0 / max(count, 1)

            # Perform reduction in a promoted dtype
            reduc_dtype = _reduction_dtype(x.dtype)
            y = log_p.sum(keepdims=True, dtype=reduc_dtype)
            y = y * (-self._coeff)
            y = y.astype(x.dtype, copy=False)
            return y.reshape(()),
        else:
            return -log_p.reshape(t.shape),
Example no. 34
    def forward_cpu(self, inputs):
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = log_softmax._log_softmax(x, self.use_cudnn)
        if self.cache_score:
            self.y = numpy.exp(log_y)
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]

        # deal with the case where the SoftmaxCrossEntropy is
        # unpickled from the old version
        if self.normalize:
            count = (t != self.ignore_label).sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)

        y = (log_p * (t.ravel() != self.ignore_label)).sum(keepdims=True) \
            * (-self._coeff)
        return y.reshape(()),
Example no. 35
    def forward_cpu(self, inputs):
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = log_softmax._log_softmax(x, self.use_cudnn)
        if self.cache_score:
            self.y = numpy.exp(log_y)
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]

        # deal with the case where the SoftmaxCrossEntropy is
        # unpickled from the old version
        if self.normalize:
            count = (t != self.ignore_label).sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)

        y = (log_p * (t.ravel() != self.ignore_label)).sum(keepdims=True) \
            * (-self._coeff)
        return y.reshape(()),
Example no. 36
    def forward_cpu(self, inputs):
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        # Improve me
        # It is disabled by default
        if mkld.enable_softmax_cross_entropyF(inputs):
            y_out = numpy.empty(x.shape, dtype=numpy.float32)
            mkldnn_sce_fwd = mkldnn.SoftmaxCrossEntropy_F32_softmax_cross_entropy_create_forward(
                x.shape)
            mkldnn_sce_fwd.forward(x.ravel(), y_out.ravel(), x.shape)
            log_y = y_out
        else:
            log_y = log_softmax._log_softmax(x, self.use_cudnn)

        if self.cache_score:
            self.y = numpy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= numpy.broadcast_to(self.class_weight.reshape(shape),
                                        x.shape)
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]

        # deal with the case where the SoftmaxCrossEntropy is
        # unpickled from the old version
        if self.normalize:
            count = (t != self.ignore_label).sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)

        y = (log_p * (t.ravel() != self.ignore_label)).sum(keepdims=True) \
            * (-self._coeff)
        return y.reshape(()),
Example no. 37
 def backward_cpu(self, inputs, grad_outputs):
     x, t = inputs
     gloss = grad_outputs[0]
     if hasattr(self, 'y'):
         y = self.y.copy()
     else:
         y = log_softmax._log_softmax(x)
         np.exp(y, out=y)
     if y.ndim == 2:
         gx = y
         gx[np.arange(len(t)), np.maximum(t, 0)] -= 1
         if self.class_weight is not None:
             shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
             c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
             c = c[np.arange(len(t)), np.maximum(t, 0)]
             gx *= _broadcast_to(np.expand_dims(c, 1), gx.shape)
         gx *= (t != self.ignore_label).reshape((len(t), 1))
     else:
         n_unit = t.size // len(t)
         gx = y.reshape(y.shape[0], y.shape[1], -1)
         fst_index = np.arange(t.size) // n_unit
         trd_index = np.arange(t.size) % n_unit
         gx[fst_index, np.maximum(t.ravel(), 0), trd_index] -= 1
         if self.class_weight is not None:
             shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
             c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
             c = c.reshape(gx.shape)
             c = c[fst_index, np.maximum(t.ravel(), 0), trd_index]
             c = c.reshape(y.shape[0], 1, -1)
             gx *= _broadcast_to(c, gx.shape)
         gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
         gx = gx.reshape(y.shape)
     if self.reduce == 'mean':
         gx *= gloss * self._coeff
     else:
         gx *= gloss[:, None]
     return gx, None
Example no. 38
    def forward(self, inputs):
        x, t = inputs[:2]
        rest = len(inputs) - 2
        head_W, Ws = inputs[2], inputs[3:2 + (rest - 1) // 2 + 1]
        Rs = inputs[2 + (rest - 1) // 2 + 1:]
        n_tails = len(Rs)
        # minus_inf = -1024.
        minus_inf = -numpy.inf
        xp = cuda.get_array_module(x)

        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        self.retain_inputs(tuple(six.moves.range(len(inputs))))

        cluster_hots = []
        for i in six.moves.range(1, n_tails + 1):
            lower, upper = self.cutoff[i], self.cutoff[i + 1]
            in_cluster = xp.logical_and(lower <= t, t < upper)
            if self.output_all:
                in_cluster = xp.ones(
                    in_cluster.shape, dtype=in_cluster.dtype)
            cluster_hots.append(in_cluster)
        self.cluster_hots = cluster_hots

        self.head = self.linear(x, head_W)
        self.ls_head = log_softmax._log_softmax(self.head)
        self.reduced_xs = []
        self.tails = []
        self.ls_tails = []
        for i, in_cluster in enumerate(cluster_hots, start=1):
            tail_idx = i - 1
            if xp.any(in_cluster):
                reduced_x = self.linear(x[in_cluster], Rs[tail_idx])
                self.reduced_xs.append(reduced_x)
                out = self.linear(reduced_x, Ws[tail_idx])
                self.tails.append(out)
                ls_out = log_softmax._log_softmax(out)
                self.ls_tails.append(ls_out)
            else:
                self.reduced_xs.append(None)
                self.tails.append(None)
                self.ls_tails.append(None)

        n_head_out = head_W.shape[0] - n_tails
        n_out = n_head_out + sum(W.shape[0] for W in Ws)
        shape = (x.shape[0], n_out)

        log_y = xp.full(shape, minus_inf, dtype=x.dtype)

        log_y[:, :n_head_out] = self.ls_head[:, :n_head_out]
        for i, (in_cluster, tail) in enumerate(
                zip(cluster_hots, self.ls_tails), start=1):
            if tail is None:
                continue
            lower, upper = self.cutoff[i], self.cutoff[i + 1]

            tail_main = self.ls_head[:, n_head_out + i - 1]
            tail_main_in = xp.broadcast_to(
                tail_main[in_cluster][:, None], tail.shape)
            log_y[xp.nonzero(in_cluster)[0], lower:upper] = tail_main_in + tail
            not_in_cluster = xp.logical_not(in_cluster)
            log_y[xp.nonzero(not_in_cluster)[0],
                  lower] = tail_main[not_in_cluster]

        return log_y,
Example no. 39
    def forward(self, inputs):
        x, t = inputs[:2]
        rest = len(inputs) - 2
        head_W, Ws = inputs[2], inputs[3:2 + (rest - 1) // 2 + 1]
        Rs = inputs[2 + (rest - 1) // 2 + 1:]
        n_tails = len(Rs)
        # minus_inf = -1024.
        minus_inf = -numpy.inf
        xp = cuda.get_array_module(x)

        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        self.retain_inputs(tuple(six.moves.range(len(inputs))))

        cluster_hots = []
        for i in six.moves.range(1, n_tails + 1):
            lower, upper = self.cutoff[i], self.cutoff[i + 1]
            in_cluster = xp.logical_and(lower <= t, t < upper)
            if self.output_all:
                in_cluster = xp.ones(
                    in_cluster.shape, dtype=in_cluster.dtype)
            cluster_hots.append(in_cluster)
        self.cluster_hots = cluster_hots

        self.head = self.linear(x, head_W)
        self.ls_head = log_softmax._log_softmax(self.head)
        self.reduced_xs = []
        self.tails = []
        self.ls_tails = []
        for i, in_cluster in enumerate(cluster_hots, start=1):
            tail_idx = i - 1
            if xp.any(in_cluster):
                reduced_x = self.linear(x[in_cluster], Rs[tail_idx])
                self.reduced_xs.append(reduced_x)
                out = self.linear(reduced_x, Ws[tail_idx])
                self.tails.append(out)
                ls_out = log_softmax._log_softmax(out)
                self.ls_tails.append(ls_out)
            else:
                self.reduced_xs.append(None)
                self.tails.append(None)
                self.ls_tails.append(None)

        n_head_out = head_W.shape[0] - n_tails
        n_out = n_head_out + sum(W.shape[0] for W in Ws)
        shape = (x.shape[0], n_out)

        log_y = xp.full(shape, minus_inf, dtype=x.dtype)

        log_y[:, :n_head_out] = self.ls_head[:, :n_head_out]
        for i, (in_cluster, tail) in enumerate(
                zip(cluster_hots, self.ls_tails), start=1):
            if tail is None:
                continue
            lower, upper = self.cutoff[i], self.cutoff[i + 1]

            tail_main = self.ls_head[:, n_head_out + i - 1]
            tail_main_in = xp.broadcast_to(
                tail_main[in_cluster][:, None], tail.shape)
            log_y[xp.nonzero(in_cluster)[0], lower:upper] = tail_main_in + tail
            not_in_cluster = xp.logical_not(in_cluster)
            log_y[xp.nonzero(not_in_cluster)[0],
                  lower] = tail_main[not_in_cluster]

        return log_y,
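
The last two examples build a two-level (adaptive) softmax: targets below cutoff[1] are handled by the head, and each tail cluster i covers the range [cutoff[i], cutoff[i + 1]). A NumPy sketch of just the cluster-membership masks computed in the loop over tails; the cutoff list and target ids below are assumed values.

import numpy

cutoff = [0, 5, 20, 100]                 # head covers ids < 5, two tails
t = numpy.array([1, 7, 42, 3, 19])       # assumed target ids

cluster_hots = []
for i in range(1, len(cutoff) - 1):
    lower, upper = cutoff[i], cutoff[i + 1]
    in_cluster = numpy.logical_and(lower <= t, t < upper)
    cluster_hots.append(in_cluster)

print(cluster_hots[0])   # mask of targets routed to the first tail
print(cluster_hots[1])   # mask of targets routed to the second tail
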