Example #1
    def forward(self, x):
        """Updates the internal state and returns the LSTM outputs.

        Args:
            x (~chainer.Variable): A new batch from the input sequence.

        Returns:
            ~chainer.Variable: Outputs of updated LSTM units.

        """
        lstm_in = self.upward(x)
        if self.h is not None:
            lstm_in += self.lateral(self.h)
        if self.c is None:
            xp = self.xp
            with chainer.using_device(self.device):
                self.c = variable.Variable(
                    xp.zeros((len(x), self.state_size), dtype=x.dtype))
        lstm_in = reshape.reshape(lstm_in,
                                  (len(lstm_in), lstm_in.shape[1] // 4, 4))
        a, i, f, o = split_axis.split_axis(lstm_in, 4, 2)
        a = reshape.reshape(a, a.shape[:2])
        i = reshape.reshape(i, i.shape[:2])
        f = reshape.reshape(f, f.shape[:2])
        o = reshape.reshape(o, o.shape[:2])
        peep_in_i = self.peep_i(self.c)
        peep_in_f = self.peep_f(self.c)
        a = tanh.tanh(a)
        i = sigmoid.sigmoid(i + peep_in_i)
        f = sigmoid.sigmoid(f + peep_in_f)
        self.c = a * i + f * self.c
        peep_in_o = self.peep_o(self.c)
        o = sigmoid.sigmoid(o + peep_in_o)
        self.h = o * tanh.tanh(self.c)
        return self.h
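A minimal usage sketch for this kind of peephole LSTM link, assuming the public chainer.links.StatefulPeepholeLSTM (which provides the peep_i/peep_f/peep_o connections used above); the shapes are placeholders:

import numpy as np
import chainer
import chainer.links as L

rnn = L.StatefulPeepholeLSTM(in_size=10, out_size=20)
x = chainer.Variable(np.zeros((4, 10), dtype=np.float32))  # batch of 4 inputs
h = rnn(x)            # first call initializes c and h internally
print(h.shape)        # (4, 20)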
Example #2
    def __call__(self, x):
        """Updates the internal state and returns the LSTM outputs.

        Args:
            x (~chainer.Variable): A new batch from the input sequence.

        Returns:
            ~chainer.Variable: Outputs of updated LSTM units.

        """
        lstm_in = self.upward(x)
        if self.h is not None:
            lstm_in += self.lateral(self.h)
        if self.c is None:
            xp = self.xp
            self.c = variable.Variable(xp.zeros((x.shape[0], self.state_size), dtype=x.dtype), volatile="auto")
        lstm_in = reshape.reshape(lstm_in, (len(lstm_in.data), lstm_in.shape[1] // 4, 4))
        a, i, f, o = split_axis.split_axis(lstm_in, 4, 2)
        a = reshape.reshape(a, (len(a.data), a.shape[1]))
        i = reshape.reshape(i, (len(i.data), i.shape[1]))
        f = reshape.reshape(f, (len(f.data), f.shape[1]))
        o = reshape.reshape(o, (len(o.data), o.shape[1]))
        peep_in_i = self.peep_i(self.c)
        peep_in_f = self.peep_f(self.c)
        a = tanh.tanh(a)
        i = sigmoid.sigmoid(i + peep_in_i)
        f = sigmoid.sigmoid(f + peep_in_f)
        self.c = a * i + f * self.c
        peep_in_o = self.peep_o(self.c)
        o = sigmoid.sigmoid(o + peep_in_o)
        self.h = o * tanh.tanh(self.c)
        return self.h
Example #3
    def __call__(self, x):
        """Updates the internal state and returns the LSTM outputs.

        Args:
            x (~chainer.Variable): A new batch from the input sequence.

        Returns:
            ~chainer.Variable: Outputs of updated LSTM units.

        """
        lstm_in = self.upward(x)
        if self.h is not None:
            lstm_in += self.lateral(self.h)
        if self.c is None:
            xp = self.xp
            self.c = variable.Variable(xp.zeros((len(x.data), self.state_size),
                                                dtype=x.data.dtype),
                                       volatile='auto')
        lstm_in = reshape.reshape(
            lstm_in, (len(lstm_in.data), lstm_in.data.shape[1] // 4, 4))
        a, i, f, o = split_axis.split_axis(lstm_in, 4, 2)
        a = reshape.reshape(a, (len(a.data), a.data.shape[1]))
        i = reshape.reshape(i, (len(i.data), i.data.shape[1]))
        f = reshape.reshape(f, (len(f.data), f.data.shape[1]))
        o = reshape.reshape(o, (len(o.data), o.data.shape[1]))
        peep_in_i = self.peep_i(self.c)
        peep_in_f = self.peep_f(self.c)
        a = tanh.tanh(a)
        i = sigmoid.sigmoid(i + peep_in_i)
        f = sigmoid.sigmoid(f + peep_in_f)
        self.c = a * i + f * self.c
        peep_in_o = self.peep_o(self.c)
        o = sigmoid.sigmoid(o + peep_in_o)
        self.h = o * tanh.tanh(self.c)
        return self.h
Example #4
    def forward(self, x):
        """Updates the internal state and returns the LSTM outputs.

        Args:
            x (~chainer.Variable): A new batch from the input sequence.

        Returns:
            ~chainer.Variable: Outputs of updated LSTM units.

        """
        lstm_in = self.upward(x)
        if self.h is not None:
            lstm_in += self.lateral(self.h)
        if self.c is None:
            xp = self.xp
            with chainer.using_device(self.device):
                self.c = variable.Variable(
                    xp.zeros((len(x), self.state_size), dtype=x.dtype))
        lstm_in = reshape.reshape(
            lstm_in, (len(lstm_in), lstm_in.shape[1] // 4, 4))
        a, i, f, o = split_axis.split_axis(lstm_in, 4, 2)
        a = reshape.reshape(a, a.shape[:2])
        i = reshape.reshape(i, i.shape[:2])
        f = reshape.reshape(f, f.shape[:2])
        o = reshape.reshape(o, o.shape[:2])
        peep_in_i = self.peep_i(self.c)
        peep_in_f = self.peep_f(self.c)
        a = tanh.tanh(a)
        i = sigmoid.sigmoid(i + peep_in_i)
        f = sigmoid.sigmoid(f + peep_in_f)
        self.c = a * i + f * self.c
        peep_in_o = self.peep_o(self.c)
        o = sigmoid.sigmoid(o + peep_in_o)
        self.h = o * tanh.tanh(self.c)
        return self.h
Example #5
def crf1d(cost, xs, ys):

    """Calculates negative log-likelihood of linear-chain CRF.

    It takes a transition cost matrix, a sequence of costs, and a sequence of
    labels. Let :math:`c_{st}` be a transition cost from a label :math:`s` to
    a label :math:`t`, :math:`x_{it}` be a cost of a label :math:`t` at
    position :math:`i`, and :math:`y_i` be an expected label at position
    :math:`i`. The negative log-likelihood of linear-chain CRF is defined as

    .. math::
        L = -\\left( \\sum_{i=1}^l x_{iy_i} + \\
             \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - {\\log(Z)} \\right) ,

    where :math:`l` is the length of the input sequence and :math:`Z` is the
    normalizing constant called the partition function.

    Args:
        cost (Variable): A :math:`K \\times K` matrix which holds transition
            cost between two labels, where :math:`K` is the number of labels.
        xs (list of Variable): Input feature vector for each label. Each
            :class:`~chainer.Variable` holds a :math:`B \\times K`
            matrix, where :math:`B` is mini-batch size, :math:`K` is the number
            of labels.
        ys (list of Variable): Expected output labels. Each
            :class:`~chainer.Variable` holds a :math:`B` integer vector.

    Returns:
        ~chainer.Variable: A variable holding the average negative
            log-likelihood of the input sequences.

    .. note::

        See detail in the original paper: `Conditional Random Fields:
        Probabilistic Models for Segmenting and Labeling Sequence Data
        <http://repository.upenn.edu/cis_papers/159/>`_.

    """
    assert xs[0].data.shape[1] == cost.data.shape[0]

    n_label = cost.data.shape[0]
    n_batch = xs[0].data.shape[0]

    alpha = xs[0]
    for x in xs[1:]:
        b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost)
        alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x

    logz = logsumexp.logsumexp(alpha, axis=1)

    score = 0
    cost = reshape.reshape(cost, (cost.data.size, 1))
    for y1, y2 in zip(ys[:-1], ys[1:]):
        score += reshape.reshape(
            embed_id.embed_id(y1 * n_label + y2, cost), (n_batch,))
    for x, y in zip(xs, ys):
        score += select_item.select_item(x, y)

    return _sum.sum(logz - score) / n_batch
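A small, hedged usage sketch of the public wrapper chainer.functions.crf1d with toy shapes (K labels, batch size B, sequence length T); the module paths in the code above are internal, so the sketch uses the documented API and placeholder data:

import numpy as np
import chainer
import chainer.functions as F

K, B, T = 3, 2, 4                                   # labels, batch, sequence length
cost = chainer.Variable(np.zeros((K, K), dtype=np.float32))
xs = [chainer.Variable(np.zeros((B, K), dtype=np.float32)) for _ in range(T)]
ys = [chainer.Variable(np.zeros(B, dtype=np.int32)) for _ in range(T)]
loss = F.crf1d(cost, xs, ys)                        # mean negative log-likelihood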
Example #6
def black_out(x, t, W, samples):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is the likelihood of a given label.
    And :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    Args:
        x (~chainer.Variable): Batch of input vectors.
        t (~chainer.Variable): Vector of ground truth labels.
        W (~chainer.Variable): Weight matrix.
        samples (~chainer.Variable): Negative samples.

    Returns:
        ~chainer.Variable: Loss value.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \
         Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """

    batch_size = x.shape[0]

    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.batch_matmul(neg_emb, x)
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])

    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.batch_matmul(pos_emb, x)
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(reshape.reshape(logz, (batch_size, 1)),
                                        neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size, ))
    loss = py - logz + _sum.sum(ny, axis=1)
    return -_sum.sum(loss) / batch_size
Example #7
def black_out(x, t, W, samples):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is the likelihood of a given label.
    And :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    Args:
        x (~chainer.Variable): Batch of input vectors.
        t (~chainer.Variable): Vector of ground truth labels.
        W (~chainer.Variable): Weight matrix.
        samples (~chainer.Variable): Negative samples.

    Returns:
        ~chainer.Variable: Loss value.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \
         Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """

    batch_size = x.shape[0]

    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.batch_matmul(neg_emb, x)
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])

    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.batch_matmul(pos_emb, x)
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(
        reshape.reshape(logz, (batch_size, 1)), neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size,))
    loss = py - logz + _sum.sum(ny, axis=1)
    return -_sum.sum(loss) / batch_size
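A hedged usage sketch of chainer.functions.black_out with toy shapes; the arrays are random placeholders, not a trained model:

import numpy as np
import chainer.functions as F

B, D, V, S = 4, 8, 100, 5                   # batch, dim, vocabulary, negatives
x = np.random.randn(B, D).astype(np.float32)
t = np.random.randint(0, V, size=B).astype(np.int32)
W = np.random.randn(V, D).astype(np.float32)
samples = np.random.randint(0, V, size=(B, S)).astype(np.int32)
loss = F.black_out(x, t, W, samples)        # scalar loss Variable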
Example #8
    def predict(self, images, oversample=True):
        """Computes all the probabilities of given images.

        Args:
            images (iterable of PIL.Image or numpy.ndarray): Input images.
                When you specify a color image as a :class:`numpy.ndarray`,
                make sure that color order is RGB.
            oversample (bool): If ``True``, it averages results across
                center, corners, and mirrors. Otherwise, it uses only the
                center.

        Returns:
            ~chainer.Variable: Output that contains the class probabilities
            of given images.

        """

        x = concat_examples([prepare(img, size=(256, 256)) for img in images])
        if oversample:
            x = imgproc.oversample(x, crop_dims=(224, 224))
        else:
            x = x[:, :, 16:240, 16:240]
        # Use no_backprop_mode to reduce memory consumption
        with function.no_backprop_mode(), chainer.using_config('train', False):
            x = Variable(self.xp.asarray(x))
            y = self(x, layers=['prob'])['prob']
            if oversample:
                n = len(y) // 10
                y_shape = y.shape[1:]
                y = reshape(y, (n, 10) + y_shape)
                y = sum(y, axis=1) / 10
        return y
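A hedged usage sketch: a predict method like the one above appears in Chainer's pretrained vision links, for example chainer.links.VGG16Layers (the image path is hypothetical and pretrained weights are downloaded on first use):

from PIL import Image
import chainer.links as L

model = L.VGG16Layers()                     # pretrained ImageNet classifier
img = Image.open('cat.jpg')                 # hypothetical input image
probs = model.predict([img], oversample=True)
print(probs.shape)                          # (1, 1000) class probabilities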
Example #9
def average(x, axis=None, weights=None, keepdims=False):
    """Calculate weighted average of array elements over a given axis.

    Args:
        x (~chainer.Variable): Elements to sum.
        axis (None or int or tuple of int): Axis along which the average is
            performed. With the default (axis = None) it performs a mean over
            all the dimensions of the input array.
        weights (None or chainer.Variable): An array holding weights to
            calculate the weighted average. If it is ``None``, all weights are
            assumed to be one.
            When ``axis`` is ``None``, ``weights`` must have the same shape
            as ``x``. And when ``axis`` is ``int``, it must be a 1-D array
            satisfying ``weights.shape == (x.shape[axis],)``.
        keepdims (bool): If ``True``, the specified axes remain as axes
            of length one.

    Returns:
        ~chainer.Variable: Output variable.

    """
    if axis is None:
        pass
    elif isinstance(axis, tuple):
        axis = [a + x.ndim if a < 0 else a for a in axis]
        axis.sort()
        for a, b in six.moves.zip(axis, axis[1:]):
            if a == b:
                raise ValueError('duplicate value in \'axis\'')
        axis = tuple(axis)
    else:
        if axis < 0:
            axis += x.ndim
        axis = (axis, )

    if weights is not None:
        if axis is not None and len(axis) > 1:
            raise ValueError(
                'tuple axis is not supported when weights is given')
        divider = sum_mod.sum(weights)
        if axis is not None:
            w_shape = [d if i in axis else 1 for i, d in enumerate(x.shape)]
            weights = broadcast.broadcast_to(reshape.reshape(weights, w_shape),
                                             x.shape)

        x = x * weights
    else:
        if axis is None:
            divider = x.size
        else:
            divider = 1
            for a in axis:
                divider *= x.shape[a]

    x_sum = sum_mod.sum(x, axis, keepdims)
    if weights is not None:
        # We do not need to call broadcast when weights is None because
        # divider here is not a Variable but a scalar
        divider = broadcast.broadcast_to(divider, x_sum.shape)
    return x_sum / divider
Example #10
def average(x, axis=None, weights=None, keepdims=False):
    """Calculate weighted average of array elements over a given axis.

    Args:
        x (~chainer.Variable): Elements to sum.
        axis (None or int or tuple of int): Axis along which the average is
            performed. With the default (axis = None) it performs a mean over
            all the dimensions of the input array.
        weights (None or chainer.Variable): An array holding weights to
            calculate the weighted average. If it is ``None``, all weights are
            assumed to be one.
            When ``axis`` is ``None``, ``weights`` must have the same shape
            as ``x``. And when ``axis`` is ``int``, it must be a 1-D array
            satisfying ``weights.shape == (x.shape[axis],)``.
        keepdims (bool): If ``True``, the specified axes remain as axes
            of length one.

    Returns:
        ~chainer.Variable: Output variable.

    """
    if axis is None:
        pass
    elif isinstance(axis, tuple):
        axis = [a + x.ndim if a < 0 else a for a in axis]
        axis.sort()
        for a, b in six.moves.zip(axis, axis[1:]):
            if a == b:
                raise ValueError('duplicate value in \'axis\'')
        axis = tuple(axis)
    else:
        if axis < 0:
            axis += x.ndim
        axis = (axis,)

    if weights is not None:
        if axis is not None and len(axis) > 1:
            raise ValueError(
                'tuple axis is not supported when weights is given')
        divider = sum_mod.sum(weights)
        if axis is not None:
            w_shape = [d if i in axis else 1 for i, d in enumerate(x.shape)]
            weights = broadcast.broadcast_to(
                reshape.reshape(weights, w_shape), x.shape)

        x = x * weights
    else:
        if axis is None:
            divider = x.size
        else:
            divider = 1
            for a in axis:
                divider *= x.shape[a]

    x_sum = sum_mod.sum(x, axis, keepdims)
    if weights is not None:
        # We do not need to call broadcast when weights is None because
        # divider here is not a Variable but a scalar
        divider = broadcast.broadcast_to(divider, x_sum.shape)
    return x_sum / divider
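A minimal sketch of the public chainer.functions.average showing the weighted case along a single axis:

import numpy as np
import chainer.functions as F

x = np.arange(12, dtype=np.float32).reshape(3, 4)
w = np.array([1., 2., 3.], dtype=np.float32)   # one weight per row
y = F.average(x, axis=0, weights=w)            # weighted mean over axis 0
print(y.shape)                                 # (4,)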
Example #11
    def predict(self, images, oversample=True):
        """Computes all the probabilities of given images.

        Args:
            images (iterable of PIL.Image or numpy.ndarray): Input images.
            oversample (bool): If ``True``, it averages results across
                center, corners, and mirrors. Otherwise, it uses only the
                center.

        Returns:
            ~chainer.Variable: Output that contains the class probabilities
            of given images.

        """

        x = concat_examples([prepare(img, size=(256, 256)) for img in images])
        if oversample:
            x = imgproc.oversample(x, crop_dims=(224, 224))
        else:
            x = x[:, :, 16:240, 16:240]
        # Set volatile option to ON to reduce memory consumption
        x = Variable(self.xp.asarray(x), volatile=flag.ON)
        y = self(x, layers=['prob'])['prob']
        if oversample:
            n = y.data.shape[0] // 10
            y_shape = y.data.shape[1:]
            y = reshape(y, (n, 10) + y_shape)
            y = sum(y, axis=1) / 10
        return y
Example #12
    def predict(self, images, oversample=True):
        """Computes all the probabilities of given images.

        Args:
            images (iterable of PIL.Image or numpy.ndarray): Input images.
            oversample (bool): If ``True``, it averages results across
                center, corners, and mirrors. Otherwise, it uses only the
                center.

        Returns:
            ~chainer.Variable: Output that contains the class probabilities
            of given images.

        """

        x = concat_examples([prepare(img, size=(256, 256)) for img in images])
        if oversample:
            x = imgproc.oversample(x, crop_dims=(224, 224))
        else:
            x = x[:, :, 16:240, 16:240]
        # Use no_backprop_mode to reduce memory consumption
        with function.no_backprop_mode():
            x = Variable(self.xp.asarray(x))
            y = self(x, layers=['prob'])['prob']
            if oversample:
                n = y.data.shape[0] // 10
                y_shape = y.data.shape[1:]
                y = reshape(y, (n, 10) + y_shape)
                y = sum(y, axis=1) / 10
        return y
Example #13
    def __call__(self, x):
        """Updates the internal state and returns the LSTM outputs.

        Args:
            x (~chainer.Variable): A new batch from the input sequence.

        Returns:
            ~chainer.Variable: Outputs of updated LSTM units.

        """
        lstm_in = self.upward(x)
        if self.h is not None:
            lstm_in += self.lateral(self.h)
        else:
            xp = self.xp
            with cuda.get_device(self._device_id):
                self.h = variable.Variable(
                    xp.zeros((len(x.data), self.state_size),
                             dtype=x.data.dtype),
                    volatile='auto')
        if self.c is None:
            xp = self.xp
            with cuda.get_device(self._device_id):
                self.c = variable.Variable(
                    xp.zeros((len(x.data), self.state_size),
                             dtype=x.data.dtype),
                    volatile='auto')

        lstm_in = reshape.reshape(lstm_in, (len(lstm_in.data),
                                            lstm_in.data.shape[1] // 4,
                                            4))

        a, i, f, o = split_axis.split_axis(lstm_in, 4, 2)
        a = reshape.reshape(a, (len(a.data), self.state_size))
        i = reshape.reshape(i, (len(i.data), self.state_size))
        f = reshape.reshape(f, (len(f.data), self.state_size))
        o = reshape.reshape(o, (len(o.data), self.state_size))

        c_tmp = tanh.tanh(a) * sigmoid.sigmoid(i) + sigmoid.sigmoid(f) * self.c
        self.c = zoneout.zoneout(self.c, c_tmp, self.c_ratio, self.train)
        self.h = zoneout.zoneout(self.h,
                                 sigmoid.sigmoid(o) * tanh.tanh(c_tmp),
                                 self.h_ratio, self.train)
        return self.h
Example #14
    def covariance(self):
        """ The covariance of the independent distribution.

        By definition, the covariance of the new
        distribution becomes a block diagonal matrix. Let
        :math:`\\Sigma_{\\mathbf{x}}` be the covariance matrix of the original
        random variable :math:`\\mathbf{x} \\in \\mathbb{R}^d`, and
        :math:`\\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, \\cdots \\mathbf{x}^{(m)}`
        be the :math:`m` i.i.d. random variables, the new covariance matrix
        :math:`\\Sigma_{\\mathbf{y}}` of :math:`\\mathbf{y} =
        [\\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, \\cdots, \\mathbf{x}^{(m)}] \\in
        \\mathbb{R}^{md}` can be written as

        .. math::
            \\left[\\begin{array}{ccc}
                    \\Sigma_{\\mathbf{x}^{1}} & & 0 \\\\
                    & \\ddots & \\\\
                    0 & & \\Sigma_{\\mathbf{x}^{m}}
            \\end{array} \\right].

        Note that this relationship holds only if the covariance matrix of the
        original distribution is given analytically.

        Returns:
            ~chainer.Variable: The covariance of the distribution.
        """
        num_repeat = array.size_of_shape(
            self.distribution.batch_shape[-self.reinterpreted_batch_ndims:])
        dim = array.size_of_shape(self.distribution.event_shape)
        cov = repeat.repeat(
            reshape.reshape(
                self.distribution.covariance,
                ((self.batch_shape) + (1, num_repeat, dim, dim))),
            num_repeat, axis=-4)
        cov = reshape.reshape(
            transpose.transpose(
                cov, axes=(
                    tuple(range(len(self.batch_shape))) + (-4, -2, -3, -1))),
            self.batch_shape + (num_repeat * dim, num_repeat * dim))
        block_indicator = self.xp.reshape(
            self._block_indicator,
            tuple([1] * len(self.batch_shape)) + self._block_indicator.shape)
        return cov * block_indicator
Example #15
    def covariance(self):
        """ The covariance of the independent distribution.

        By definition, the covariance of the new
        distribution becomes a block diagonal matrix. Let
        :math:`\\Sigma_{\\mathbf{x}}` be the covariance matrix of the original
        random variable :math:`\\mathbf{x} \\in \\mathbb{R}^d`, and
        :math:`\\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, \\cdots \\mathbf{x}^{(m)}`
        be the :math:`m` i.i.d. random variables, the new covariance matrix
        :math:`\\Sigma_{\\mathbf{y}}` of :math:`\\mathbf{y} =
        [\\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, \\cdots, \\mathbf{x}^{(m)}] \\in
        \\mathbb{R}^{md}` can be written as

        .. math::
            \\left[\\begin{array}{ccc}
                    \\Sigma_{\\mathbf{x}^{1}} & & 0 \\\\
                    & \\ddots & \\\\
                    0 & & \\Sigma_{\\mathbf{x}^{m}}
            \\end{array} \\right].

        Note that this relationship holds only if the covariance matrix of the
        original distribution is given analytically.

        Returns:
            ~chainer.Variable: The covariance of the distribution.
        """
        num_repeat = array.size_of_shape(
            self.distribution.batch_shape[-self.reinterpreted_batch_ndims:])
        dim = array.size_of_shape(self.distribution.event_shape)
        cov = repeat.repeat(reshape.reshape(self.distribution.covariance,
                                            ((self.batch_shape) +
                                             (1, num_repeat, dim, dim))),
                            num_repeat,
                            axis=-4)
        cov = reshape.reshape(
            transpose.transpose(cov,
                                axes=(tuple(range(len(self.batch_shape))) +
                                      (-4, -2, -3, -1))),
            self.batch_shape + (num_repeat * dim, num_repeat * dim))
        block_indicator = self.xp.reshape(
            self._block_indicator,
            tuple([1] * len(self.batch_shape)) + self._block_indicator.shape)
        return cov * block_indicator
Example #16
    def forward_one_step(self, hps, x_data, sensor_data, y_data, train=True):
        x_r = Variable(x_data[:, 0:3, :, :], volatile=not train)
        x_s = Variable(sensor_data, volatile=not train)
        t = Variable(y_data, volatile=not train)

        cn1_r = F.max_pooling_2d(self.prelu1_r(
            self.bn1_r(self.conv1_r(x_r), test=not train)),
                                 ksize=2,
                                 stride=2,
                                 pad=0)
        cn2_r = F.max_pooling_2d(self.prelu2_r(
            self.bn2_r(self.conv2_r(cn1_r), test=not train)),
                                 ksize=2,
                                 stride=2,
                                 pad=0)
        cn3_r = F.max_pooling_2d(self.prelu3_r(
            self.bn3_r(self.conv3_r(cn2_r), test=not train)),
                                 ksize=2,
                                 stride=2,
                                 pad=0)
        cn5_rm = F.max_pooling_2d(self.prelu5(
            self.bn5(self.conv5(cn3_r), test=not train)),
                                  ksize=2,
                                  stride=2,
                                  pad=0)
        sen6 = F.dropout(self.prelu6(self.bn6(self.fc6(x_s), test=not train)),
                         ratio=hps.dropout,
                         train=train)
        cs7 = self.cccp7(
            concat.concat(
                (reshape.reshape(cn5_rm,
                                 (cn5_rm.shape[0], 1, cn5_rm.shape[1] *
                                  cn5_rm.shape[2] * cn5_rm.shape[3], 1)),
                 reshape.reshape(sen6, (sen6.shape[0], 1, sen6.shape[1], 1))),
                axis=1))
        cs8 = F.dropout(self.prelu8(self.bn8(self.fc8(cs7), test=not train)),
                        ratio=hps.dropout,
                        train=train)

        y = self.fc9(cs8)

        return y, F.mean_squared_error(y, t)
Example #17
    def __call__(self, x, train=True):
        x = reshape.reshape(x, (len(x.data), 1) + x.data.shape[1:])
        x = self.convolution(x, train)
        xs = split_axis.split_axis(x, x.data.shape[2], 2)
        for x in xs:
            x.data = self.xp.ascontiguousarray(x.data)
        for r in self.recurrent:
            r.reset_state()
        xs = self.recurrent(xs, train)
        xs = self._linear(xs, train)
        return xs
Example #18
def _sum_rightmost(value, dim):
    """Sum out `dim` many rightmost dimensions of a given tensor.

    Args:
        value (Tensor): A tensor of ``.dim()`` at least ``dim``.
        dim (int): The number of rightmost dims to sum out.
    """
    if dim == 0:
        return value
    required_shape = value.shape[:-dim] + (-1, )
    return sum_mod.sum(reshape.reshape(value, required_shape), axis=-1)
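The same reshape-then-sum pattern expressed with the public F.reshape / F.sum API (a small sketch, assuming a NumPy input):

import numpy as np
import chainer.functions as F

value = np.ones((2, 3, 4), dtype=np.float32)
dim = 2                                         # sum out the two rightmost axes
flat = F.reshape(value, value.shape[:-dim] + (-1,))
out = F.sum(flat, axis=-1)                      # shape (2,), each element 12.0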
Example #19
    def __call__(self, x):
        x = self.embed(x)
        xs = split_axis.split_axis(x, x.data.shape[1], 1)
        ret = []
        for x in xs:
            x = self.rnn1(x)
            x = self.rnn2(x)
            x = self.linear(x)
            x = reshape.reshape(x, x.data.shape + (-1, ))
            ret.append(x)
        ret = concat.concat(ret, axis=2)
        return ret
Example #20
    def forward(self, x, y):
        """Updates the internal state and returns the LSTM outputs.

        Args:
            x (~chainer.Variable): A new batch from the input sequence.

        Returns:
            ~chainer.Variable: Outputs of updated LSTM units.

        """
        if self.upward.has_uninitialized_params:
            in_size = x.size // x.shape[0]
            self.upward._initialize_params(in_size)
            self._initialize_params()

        batch = x.shape[0]
        lstm_in = self.upward(x)

        if self.h is not None:
            h_size = self.h.shape[0]
            if batch == 0:
                h_rest = self.h
            elif h_size < batch:
                msg = ('The batch size of x must be equal to or less than the '
                       'size of the previous state h.')
                raise TypeError(msg)
            elif h_size > batch:
                h_update, h_rest = split_axis.split_axis(
                    self.h, [batch], axis=0)
                lstm_in += self.lateral(h_update)
            else:
                lstm_in += self.lateral(self.h)
        if self.c is None:
            xp = self.xp
            self.c = variable.Variable(
                xp.zeros((batch, self.state_size), dtype=x.dtype),
                volatile='auto')

        r = reshape.reshape(
            lstm_in,
            (len(lstm_in.data), lstm_in.data.shape[1] // 4, 4)
            + lstm_in.data.shape[2:])
        a, i, f, o = [r[:, :, i] for i in range(4)]

        # self.c, y = lstm.lstm(self.c,lstm_in)

        a = tanh.tanh(a)  # tanh.tanh(a)
        i = sigmoid.sigmoid(i)
        f = sigmoid.sigmoid(f)
        o = sigmoid.sigmoid(o)

        self.c = a * i + f * self.c + tanh.tanh(self.w_y(y))
        self.h = o * tanh.tanh(self.c)

        return self.h
Example #21
    def get_weights(self, hps, x_data, sensor_data, train=False):
        x_r = Variable(x_data[:, 0:3, :, :], volatile=not train)
        x_s = Variable(sensor_data, volatile=not train)

        cn1_r = F.max_pooling_2d(self.prelu1_r(
            self.bn1_r(self.conv1_r(x_r), test=not train)),
                                 ksize=2,
                                 stride=2,
                                 pad=0)
        cn2_r = F.max_pooling_2d(self.prelu2_r(
            self.bn2_r(self.conv2_r(cn1_r), test=not train)),
                                 ksize=2,
                                 stride=2,
                                 pad=0)
        cn3_r = F.max_pooling_2d(self.prelu3_r(
            self.bn3_r(self.conv3_r(cn2_r), test=not train)),
                                 ksize=2,
                                 stride=2,
                                 pad=0)
        cn5_rm = F.max_pooling_2d(self.prelu5(
            self.bn5(self.conv5(cn3_r), test=not train)),
                                  ksize=2,
                                  stride=2,
                                  pad=0)
        sen6 = F.dropout(self.prelu6(self.bn6(self.fc6(x_s), test=not train)),
                         ratio=hps.dropout,
                         train=train)
        cs7 = self.cccp7(
            concat.concat(
                (reshape.reshape(cn5_rm,
                                 (cn5_rm.shape[0], 1, cn5_rm.shape[1] *
                                  cn5_rm.shape[2] * cn5_rm.shape[3], 1)),
                 reshape.reshape(sen6, (sen6.shape[0], 1, sen6.shape[1], 1))),
                axis=1))
        cs8 = F.dropout(self.prelu8(self.bn8(self.fc8(cs7), test=not train)),
                        ratio=hps.dropout,
                        train=train)

        return cuda.to_cpu(cs8.data)
Example #22
    def __call__(self, batchsize):
        """....
        
        Args:
            eps (~chainer.Variable): 
                a wsize-length vector whose elements are drawn from 
                normal distribution (mean = 0, std = 1).
            batchsize  (~chainer.Variable): 
                (batch size)  *  (number of truncated backward gradient calculation for a training dataset)

        Returns:
            ~chainer.Variable: Output of the linear layer.

        """
        """
        self.m_hat  = reshape.reshape(sum.sum(self.M)/self.M.data.shape[0], (1,1))
        M, m_hat = broadcast.broadcast(self.M, self.m_hat)
        self.s2_hat = sum.sum(self.S2 + (M - m_hat)*(M - m_hat))/self.M.data.shape[0]
        
        print('m_hat.data {}'.format(self.m_hat.data))
        print('self.s2_hat.data {}'.format(self.s2_hat.data))
        print('self.S2.data {}'.format(self.S2.data))
        print('self.M.data {}'.format(self.M.data))
        print('------------------')
        """

        self.fWb, loss = adaptive_weight_noise(batchsize, self.M, self.logS2,
                                               self.use_weight_noise)

        if self.nobias:
            return reshape.reshape(self.fWb,
                                   (self.out_size, self.in_size)), loss
        else:
            self.fW, self.fb = split_axis.split_axis(
                self.fWb,
                numpy.asarray([(self.in_size - 1) * self.out_size]),
                axis=0)
            return reshape.reshape(
                self.fW, (self.out_size, self.in_size - 1)), self.fb, loss
Example #23
def separate(x, axis=0):
    """Separates an array along a given axis.

    This function separates an array along a given axis. For example, if the
    shape of an array is ``(2, 3, 4)``, separating it with ``axis=1`` returns
    three ``(2, 4)`` arrays.

    This function is an inverse of :func:`chainer.functions.stack`.

    Args:
        x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`):
            Variable to be separated.
            A :math:`(s_1, s_2, ..., s_N)` -shaped float array.
        axis (int): Axis along which variables are separated.

    Returns:
        tuple of chainer.Variable: Output variables.

    .. seealso:: :func:`chainer.functions.stack`

    .. admonition:: Example

        >>> x = np.arange(6).reshape((2, 3)).astype('f')
        >>> x
        array([[ 0.,  1.,  2.],
               [ 3.,  4.,  5.]], dtype=float32)
        >>> x.shape
        (2, 3)
        >>> y = F.separate(x) # split along axis=0
        >>> type(y)
        <class 'tuple'>
        >>> len(y)
        2
        >>> y[0].shape
        (3,)
        >>> y[0].data
        array([ 0.,  1.,  2.], dtype=float32)
        >>> y = F.separate(x, axis=1)
        >>> len(y)
        3
        >>> y[0].shape
        (2,)
        >>> y[0].data
        array([ 0.,  3.], dtype=float32)

    """
    shape = list(x.shape)
    del shape[axis]
    ys = split_axis.split_axis(x, x.shape[axis], axis, force_tuple=True)
    return tuple(reshape.reshape(y, shape) for y in ys)
Example #24
def separate(x, axis=0):
    """Separates an array along a given axis.

    This function separates an array along a given axis. For example, if the
    shape of an array is ``(2, 3, 4)``, separating it with ``axis=1`` returns
    three ``(2, 4)`` arrays.

    This function is an inverse of :func:`chainer.functions.stack`.

    Args:
        x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`):
            Variable to be separated.
            A :math:`(s_1, s_2, ..., s_N)` -shaped float array.
        axis (int): Axis along which variables are separated.

    Returns:
        tuple of chainer.Variable: Output variables.

    .. seealso:: :func:`chainer.functions.stack`

    .. admonition:: Example

        >>> x = np.arange(6).reshape((2, 3)).astype('f')
        >>> x
        array([[ 0.,  1.,  2.],
               [ 3.,  4.,  5.]], dtype=float32)
        >>> x.shape
        (2, 3)
        >>> y = F.separate(x) # split along axis=0
        >>> isinstance(y, tuple)
        True
        >>> len(y)
        2
        >>> y[0].shape
        (3,)
        >>> y[0].data
        array([ 0.,  1.,  2.], dtype=float32)
        >>> y = F.separate(x, axis=1)
        >>> len(y)
        3
        >>> y[0].shape
        (2,)
        >>> y[0].data
        array([ 0.,  3.], dtype=float32)

    """
    shape = list(x.shape)
    del shape[axis]
    ys = split_axis.split_axis(x, x.shape[axis], axis, force_tuple=True)
    return tuple(reshape.reshape(y, shape) for y in ys)
Example #25
    def __call__(self, x):
        x = self.embed(x)
        xs = split_axis.split_axis(x, x.data.shape[1], 1)
        ret = []
        for x in xs:
            for l in self.rnns:
                x = l(x)
                x = dropout.dropout(x, 0.25, self.train)
            for l in self.linears:
                x = l(x)
            x = reshape.reshape(x, x.data.shape + (-1, ))
            ret.append(x)
        ret = concat.concat(ret, axis=2)
        return ret
Example #26
def maxout(x, pool_size, axis=1):
    """Maxout activation function.

    It accepts an input tensor ``x``, reshapes the ``axis`` dimension
    (say its size is ``M * pool_size``) into two dimensions
    ``(M, pool_size)``, and takes the maximum along the ``axis`` dimension.
    The output of this function is the same as ``x`` except that the ``axis``
    dimension is transformed from ``M * pool_size`` to ``M``.

    Typically, ``x`` is the output of a linear layer or a convolution layer.
    The following is an example where we use :func:`maxout` in combination
    with a Linear link.

    >>> import numpy, chainer, chainer.links as L
    >>> in_size, out_size, pool_size = 100, 100, 100
    >>> l = L.Linear(in_size, out_size * pool_size)
    >>> x = chainer.Variable(numpy.zeros((1, in_size), 'f'))  # prepare data
    >>> x = l(x)
    >>> y = maxout(x, pool_size)

    Args:
       x (~chainer.Variable): Input variable. Its first dimension is assumed
            to be the *minibatch dimension*. The other dimensions are treated
            as one concatenated dimension.
    Returns:
        ~chainer.Variable: Output variable.

    .. seealso:: :class:`~chainer.links.Maxout`
    """

    if pool_size <= 0:
        raise ValueError('pool_size must be a positive integer.')

    x_shape = x.data.shape
    if x_shape[axis] % pool_size != 0:
        expect = 'x.data.shape[axis] % pool_size == 0'
        actual = 'x.data.shape[axis]={}, pool_size={}'.format(
            x_shape[axis], pool_size)
        msg = 'axis dimension must be divided by pool_size'
        raise type_check.InvalidType(expect, actual, msg)

    shape = (x_shape[:axis] +
             (x_shape[axis] // pool_size, pool_size) +
             x_shape[axis + 1:])
    x = reshape.reshape(x, shape)
    return minmax.max(x, axis=axis + 1)
Example #27
def maxout(x, pool_size, axis=1):
    """Maxout activation function.

    It accepts an input tensor ``x``, reshapes the ``axis`` dimension
    (say its size is ``M * pool_size``) into two dimensions
    ``(M, pool_size)``, and takes the maximum along the ``axis`` dimension.
    The output of this function is the same as ``x`` except that the ``axis``
    dimension is transformed from ``M * pool_size`` to ``M``.

    Typically, ``x`` is the output of a linear layer or a convolution layer.
    The following is an example where we use :func:`maxout` in combination
    with a Linear link.

    >>> import numpy, chainer, chainer.links as L
    >>> in_size, out_size, pool_size = 100, 100, 100
    >>> l = L.Linear(in_size, out_size * pool_size)
    >>> x = chainer.Variable(numpy.zeros((1, in_size), 'f'))  # prepare data
    >>> x = l(x)
    >>> y = maxout(x, pool_size)

    Args:
       x (~chainer.Variable): Input variable. Its first dimension is assumed
            to be the *minibatch dimension*. The other dimensions are treated
            as one concatenated dimension.
    Returns:
        ~chainer.Variable: Output variable.

    .. seealso:: :class:`~chainer.links.Maxout`
    """

    if pool_size <= 0:
        raise ValueError('pool_size must be a positive integer.')

    x_shape = x.data.shape
    if x_shape[axis] % pool_size != 0:
        expect = 'x.data.shape[axis] % pool_size == 0'
        actual = 'x.data.shape[axis]={}, pool_size={}'.format(
            x_shape[axis], pool_size)
        msg = 'axis dimension must be divided by pool_size'
        raise type_check.InvalidType(expect, actual, msg)

    shape = (x_shape[:axis] + (x_shape[axis] // pool_size, pool_size) +
             x_shape[axis + 1:])
    x = reshape.reshape(x, shape)
    return minmax.max(x, axis=axis + 1)
Example #28
    def functions(self):
        return collections.OrderedDict([
            ('conv1', [self.conv1, relu]),
            ('pool1', [lambda x: max_pooling_2d(x, 3, stride=2)]),
            ('fire2', [self.fire2]),
            ('fire3', [self.fire3]),
            ('pool2', [lambda x: max_pooling_2d(x, 3, stride=2)]),
            ('fire4', [self.fire4]),
            ('fire5', [self.fire5]),
            ('pool3', [lambda x: max_pooling_2d(x, 3, stride=2)]),
            ('fire6', [self.fire6]),
            ('fire7', [self.fire7]),
            ('fire8', [self.fire8]),
            ('fire9', [self.fire9, dropout]),
            ('conv10', [self.conv10, relu]),
            ('pool4', [lambda x: average_pooling_2d(x, 13)]),
            ('prob', [lambda x: reshape(x, (-1, 1000))]),
        ])
Example #29
def bias(x, y, axis=1):
    """Elementwise summation with broadcasting.

    Computes an elementwise summation of two input variables, with the shape of
    the latter variable broadcasted to match the shape of the former. ``axis``
    is the first axis of the first variable along which the second variable is
    applied.

    The term "broadcasting" here comes from Caffe's bias layer so the
    "broadcasting" with the following arguments::

           x : 100 x 3 x 40 x 5 x 6
           y : 3 x 40
        axis : 1

    is equivalent to the following numpy broadcasting::

        x : 100 x  3 x 40 x 5 x 6
        y :  (1 x) 3 x 40 x 1 x 1

    Note that the axis of ``x`` to which we apply ``y`` is specified by the
    argument ``axis``, whose meaning is different from numpy's ``axis``.

    Args:
        x (:class:`~chainer.Variable` or :ref:`ndarray`):
            Input variable to be summed.
        y (:class:`~chainer.Variable` or :ref:`ndarray`):
            Input variable to sum, broadcasted.
        axis (int): The first axis of ``x`` along which ``y`` is applied.

    Returns:
        ~chainer.Variable: Output variable.

    """
    x_shape = x.shape
    y_shape = y.shape
    if chainer.is_debug():
        assert x_shape[axis:axis + len(y_shape)] == y_shape
    y1_shape = tuple([1] * axis + list(y_shape) + [1] *
                     (len(x_shape) - axis - len(y_shape)))
    y1 = reshape.reshape(y, y1_shape)
    y2 = broadcast.broadcast_to(y1, x_shape)
    return x + y2
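A minimal sketch of the public chainer.functions.bias matching the shapes from the docstring above:

import numpy as np
import chainer.functions as F

x = np.zeros((100, 3, 40, 5, 6), dtype=np.float32)
y = np.ones((3, 40), dtype=np.float32)
z = F.bias(x, y, axis=1)        # y is broadcast over the trailing axes of x
print(z.shape)                  # (100, 3, 40, 5, 6)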
Example #30
def average(x, axis=None, weights=None, keepdims=False):
    """Calculate weighted average of array elements over a given axis.

    Args:
        x (~chainer.Variable): Elements to sum.
        axis (None or int): Axis along which the average is performed.
            With the default (axis = None) it performs a mean over all the
            dimensions of the input array.
        weights (None or chainer.Variable): An array holding weights to
            calculate the weighted average. If it is ``None``, all weights are
            assumed to be one.
            When ``axis`` is ``None``, ``weights`` must have the same shape
            as ``x``. And when ``axis`` is ``int``, it must be a 1-D array
            satisfying ``weights.shape == (x.shape[axis],)``.
        keepdims (bool): If ``True``, the specified axes remain as axes
            of length one.

    Returns:
        ~chainer.Variable: Output variable.

    """
    if weights is not None:
        divider = sum_mod.sum(weights)
        if axis is not None:
            if axis < 0:
                axis += x.ndim
            w_shape = [d if i == axis else 1 for i, d in enumerate(x.shape)]
            weights = broadcast.broadcast_to(reshape.reshape(weights, w_shape),
                                             x.shape)

        x = x * weights
    else:
        if axis is None:
            divider = x.size
        else:
            divider = x.shape[axis]

    x_sum = sum_mod.sum(x, axis, keepdims)
    if weights is not None:
        # We do not need to call broadcast when weights is None because
        # divider here is not a Variable but a scalar
        divider = broadcast.broadcast_to(divider, x_sum.shape)
    return x_sum / divider
Example #31
def bias(x, y, axis=1):
    """Elementwise summation with broadcasting.

    Computes an elementwise summation of two input variables, with the shape of
    the latter variable broadcasted to match the shape of the former. ``axis``
    is the first axis of the first variable along which the second variable is
    applied.

    The term "broadcasting" here comes from Caffe's bias layer so the
    "broadcasting" with the following arguments::

           x : 100 x 3 x 40 x 5 x 6
           y : 3 x 40
        axis : 1

    is equivalent to the following numpy broadcasting::

        x : 100 x  3 x 40 x 5 x 6
        y :  (1 x) 3 x 40 x 1 x 1

    Note that the axis of ``x`` to which we apply ``y`` is specified by the
    argument ``axis``, whose meaning is different from numpy's ``axis``.

    Args:
        x (:class:`~chainer.Variable` or :ref:`ndarray`):
            Input variable to be summed.
        y (:class:`~chainer.Variable` or :ref:`ndarray`):
            Input variable to sum, broadcasted.
        axis (int): The first axis of ``x`` along which ``y`` is applied.

    Returns:
        ~chainer.Variable: Output variable.

    """
    x_shape = x.shape
    y_shape = y.shape
    if chainer.is_debug():
        assert x_shape[axis:axis + len(y_shape)] == y_shape
    y1_shape = tuple([1] * axis + list(y_shape) +
                     [1] * (len(x_shape) - axis - len(y_shape)))
    y1 = reshape.reshape(y, y1_shape)
    y2 = broadcast.broadcast_to(y1, x_shape)
    return x + y2
Example #32
def average(x, axis=None, weights=None, keepdims=False):
    """Calculate weighted average of array elements over a given axis.

    Args:
        x (~chainer.Variable): Elements to sum.
        axis (None or int): Axis along which the average is performed.
            With the default (axis = None) it performs a mean over all the
            dimensions of the input array.
        weights (None or chainer.Variable): An array holding weights to
            calculate the weighted average. If it is ``None``, all weights are
            assumed to be one.
            When ``axis`` is ``None``, ``weights`` must have the same shape
            as ``x``. And when ``axis`` is ``int``, it must be a 1-D array
            satisfying ``weights.shape == (x.shape[axis],)``.
        keepdims (bool): If ``True``, the specified axes remain as axes
            of length one.

    Returns:
        ~chainer.Variable: Output variable.

    """
    if weights is not None:
        divider = sum_mod.sum(weights)
        if axis is not None:
            if axis < 0:
                axis += x.ndim
            w_shape = [d if i == axis else 1 for i, d in enumerate(x.shape)]
            weights = broadcast.broadcast_to(
                reshape.reshape(weights, w_shape), x.shape)

        x = x * weights
    else:
        if axis is None:
            divider = x.size
        else:
            divider = x.shape[axis]

    x_sum = sum_mod.sum(x, axis, keepdims)
    if weights is not None:
        # We do not need to call broadcast when weights is None because
        # divider here is not a Variable but a scalar
        divider = broadcast.broadcast_to(divider, x_sum.shape)
    return x_sum / divider
Example #33
def scale(x, y, axis=1):
    """Elementwise product with broadcasting.

    Computes an elementwise product of two input variables, with the shape of
    the latter variable broadcasted to match the shape of the former. ``axis``
    is the first axis of the first variable along which the second variable is
    applied.

    The term "broadcasting" here comes from Caffe's scale layer so the
    "broadcasting" with the following arguments::

           x : 100 x 3 x 40 x 60
           y : 3 x 40
        axis : 1

    is equivalent to the following numpy broadcasting::

        x : 100 x 3 x 40 x 60
        y :   1 x 3 x 40 x 1

    Note how ``axis`` indicates the axis of ``x`` to which ``y`` is applied.

    Args:
        x (~chainer.Variable): Input variable to be scaled.
        y (~chainer.Variable): Input variable to scale, broadcasted.
        axis (int): The first axis of ``x`` along which ``y`` is applied.

    Returns:
        ~chainer.Variable: Output variable.

    """
    x_shape = x.shape
    y_shape = y.shape
    if chainer.is_debug():
        assert x_shape[axis:axis + len(y_shape)] == y_shape
    y1_shape = tuple([1] * axis + list(y_shape) +
                     [1] * (len(x_shape) - axis - len(y_shape)))
    y1 = reshape.reshape(y, y1_shape)
    y2 = broadcast.broadcast_to(y1, x_shape)
    return x * y2
Example #34
def scale(x, y, axis=1):
    """Elementwise product with broadcasting.

    Computes an elementwise product of two input variables, with the shape of
    the latter variable broadcasted to match the shape of the former. ``axis``
    is the first axis of the first variable along which the second variable is
    applied.

    The term "broadcasting" here comes from Caffe's scale layer so the
    "broadcasting" with the following arguments::

           x : 100 x 3 x 40 x 60
           y : 3 x 40
        axis : 1

    is equivalent to the following numpy broadcasting::

        x : 100 x 3 x 40 x 60
        y :   1 x 3 x 40 x 1

    Note how ``axis`` indicates the axis of ``x`` to which ``y`` is applied.

    Args:
        x (~chainer.Variable): Input variable to be scaled.
        y (~chainer.Variable): Input variable to scale, broadcasted.
        axis (int): The first axis of ``x`` along which ``y`` is applied.

    Returns:
        ~chainer.Variable: Output variable.

    """
    x_shape = x.shape
    y_shape = y.shape
    if chainer.is_debug():
        assert x_shape[axis:axis + len(y_shape)] == y_shape
    y1_shape = tuple([1] * axis + list(y_shape) + [1] *
                     (len(x_shape) - axis - len(y_shape)))
    y1 = reshape.reshape(y, y1_shape)
    y2 = broadcast.broadcast_to(y1, x_shape)
    return x * y2
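A minimal sketch of the public chainer.functions.scale with the shapes from the docstring above:

import numpy as np
import chainer.functions as F

x = np.ones((100, 3, 40, 60), dtype=np.float32)
y = np.full((3, 40), 2.0, dtype=np.float32)
z = F.scale(x, y, axis=1)       # every element of x is multiplied by 2
print(z.shape)                  # (100, 3, 40, 60)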
Example #35
def separate(x, axis=0):
    """Separates an array along a given axis.

    This function separates an array along a given axis. For example, if the
    shape of an array is ``(2, 3, 4)``, separating it with ``axis=1`` returns
    three ``(2, 4)`` arrays.

    This function is an inverse of :func:`chainer.functions.stack`.

    Args:
        x (chainer.Variable): Variable to be separated.
        axis (int): Axis along which variables are separated.

    Returns:
        tuple of chainer.Variable: Output variables.

    .. seealso:: :func:`chainer.functions.stack`

    """
    shape = list(x.shape)
    del shape[axis]
    ys = split_axis.split_axis(x, x.shape[axis], axis, force_tuple=True)
    return tuple(reshape.reshape(y, shape) for y in ys)
Example #36
def _stack_weight(ws):
    # TODO(unno): Input of the current LSTM implementation is shuffled
    w = stack.stack(ws, axis=1)
    shape = w.shape
    return reshape.reshape(w, (shape[0] * shape[1],) + shape[2:])
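What the helper does, shown with the public stack/reshape functions: the gate matrices are stacked along a new axis and flattened row-wise into a single matrix (a small sketch with placeholder shapes):

import numpy as np
from chainer.functions import stack, reshape

ws = [np.zeros((5, 7), dtype=np.float32) for _ in range(4)]  # four (N, I) gate weights
w = stack(ws, axis=1)                     # shape (5, 4, 7)
w = reshape(w, (5 * 4, 7))                # shape (20, 7): gates stacked row-wise
print(w.shape)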
Example #37
def n_step_lstm_base(
        n_layers, dropout_ratio, hx, cx, ws, bs, xs, use_bi_direction,
        **kwargs):
    """Base function for Stack LSTM/BiLSTM functions.

    This function is used at :func:`chainer.functions.n_step_lstm` and
    :func:`chainer.functions.n_step_bilstm`.
    This function's behavior depends on the following arguments,
    ``activation`` and ``use_bi_direction``.

    Args:
        n_layers(int): The number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (~chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers and
            is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N``
            is the dimension of the hidden units.
        cx (~chainer.Variable): Variable holding stacked cell states.
            It has the same shape as ``hx``.
        ws (list of list of :class:`~chainer.Variable`): Weight matrices.
            ``ws[i]`` represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing eight matrices.
            ``ws[i][j]`` corresponds to :math:`W_j` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 4`` are ``(I, N)``-shape as they
            are multiplied with input variables, where ``I`` is the size of
            the input and ``N`` is the dimension of the hidden units. All
            other matrices are ``(N, N)``-shaped.
        bs (list of list of :class:`~chainer.Variable`): Bias vectors.
            ``bs[i]`` represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing eight vectors.
            ``bs[i][j]`` corresponds to :math:`b_j` in the equation.
            The shape of each matrix is ``(N,)``.
        xs (list of :class:`~chainer.Variable`):
            A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the
            mini-batch size for time ``t``. The sequences must be transposed.
            :func:`~chainer.functions.transpose_sequence` can be used to
            transpose a list of :class:`~chainer.Variable`\\ s each
            representing a sequence.
            When the sequences have different lengths, they must be
            sorted in descending order of their lengths before transposing.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        use_bi_direction (bool): If ``True``, this function uses Bi-directional
            LSTM.

    Returns:
        tuple: This function returns a tuple containing three elements,
        ``hy``, ``cy`` and ``ys``.

            - ``hy`` is an updated hidden states whose shape is the same as
              ``hx``.
            - ``cy`` is an updated cell states whose shape is the same as
              ``cx``.
            - ``ys`` is a list of :class:`~chainer.Variable` . Each element
              ``ys[t]`` holds hidden states of the last layer corresponding
              to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is
              the mini-batch size for time ``t``. Note that ``B_t`` is the same
              as the mini-batch size of ``xs[t]``.

    .. seealso::

       :func:`chainer.functions.n_step_lstm`
       :func:`chainer.functions.n_step_bilstm`

    """

    argument.check_unexpected_kwargs(
        kwargs, train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        lengths = [len(x) for x in xs]
        xs = chainer.functions.concat(xs, axis=0)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, cx),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            (xs,)))
        if use_bi_direction:
            rnn = NStepBiLSTM
        else:
            rnn = NStepLSTM

        hy, cy, ys = rnn(n_layers, states, lengths)(*inputs)
        sections = numpy.cumsum(lengths[:-1])
        ys = chainer.functions.split_axis(ys, sections, 0)
        return hy, cy, ys

    else:
        direction = 2 if use_bi_direction else 1
        split_size = n_layers * direction
        hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True)
        cx = [reshape.reshape(c, c.shape[1:]) for c in cx]

        xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
        hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
        xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
        hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]

        xs_next = xs
        hy = []
        cy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward LSTM
                # di=1, backward LSTM
                h_list = []
                c_list = []
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                c = cx[layer_idx]
                if di == 0:
                    xs_list = xs_next
                else:
                    xs_list = reversed(xs_next)
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                        c, c_rest = split_axis.split_axis(c, [batch], axis=0)
                    else:
                        h_rest = None
                        c_rest = None

                    if layer != 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)
                    lstm_in = linear.linear(x, xws[layer_idx],
                                            xbs[layer_idx]) + \
                        linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    c_bar, h_bar = lstm.lstm(c, lstm_in)
                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                        c = concat.concat([c_bar, c_rest], axis=0)
                    else:
                        h = h_bar
                        c = c_bar
                    h_list.append(h_bar)
                    c_list.append(c_bar)
                return h, c, h_list, c_list

            h, c, h_forward, c_forward = _one_directional_loop(di=0)
            hy.append(h)
            cy.append(c)

            if use_bi_direction:
                # BiLSTM
                h, c, h_backward, c_backward = _one_directional_loop(di=1)
                hy.append(h)
                cy.append(c)

                h_backward.reverse()
                # concat
                xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in
                           zip(h_forward, h_backward)]
            else:
                # Uni-directional RNN
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        cy = stack.stack(cy)
        return hy, cy, tuple(ys)
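
The docstring above requires ``xs`` to be a transposed, variable-length batch with non-increasing mini-batch sizes. The following NumPy-only sketch (sequence lengths and sizes are made up for illustration) shows what that layout looks like; it does not call Chainer.

import numpy as np

I = 3
# three sequences, already sorted by length in descending order
seqs = [np.zeros((4, I), np.float32),   # length 4
        np.zeros((3, I), np.float32),   # length 3
        np.zeros((2, I), np.float32)]   # length 2

max_len = max(len(s) for s in seqs)
# xs[t] stacks the t-th step of every sequence that is still alive at time t
xs = [np.stack([s[t] for s in seqs if len(s) > t]) for t in range(max_len)]
print([x.shape for x in xs])   # [(3, 3), (3, 3), (2, 3), (1, 3)]
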
Example #38
0
def _stack_weight(ws):
    # TODO(unno): Input of the current LSTM implementation is shuffled
    w = stack.stack(ws, axis=1)
    shape = w.shape
    return reshape.reshape(w, (shape[0] * shape[1],) + shape[2:])
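
A minimal NumPy sketch of what ``_stack_weight`` does, with made-up small shapes: stacking the four gate matrices along ``axis=1`` and flattening interleaves them per hidden unit, which is the layout the fused LSTM input is later split back into.

import numpy as np

N, I = 2, 3                                        # hidden units, input size
w_a, w_i, w_f, w_o = (np.full((N, I), k, np.float32) for k in range(4))

stacked = np.stack([w_a, w_i, w_f, w_o], axis=1)   # shape (N, 4, I)
flat = stacked.reshape(N * 4, I)                   # shape (4N, I)
# Rows now alternate gate-by-gate for each hidden unit:
print(flat[:, 0])                                  # [0. 1. 2. 3. 0. 1. 2. 3.]
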
Example #39
0
def _global_average_pooling_2d(x):
    n, channel, rows, cols = x.data.shape
    h = average_pooling_2d(x, (rows, cols), stride=1)
    h = reshape(h, (n, channel))
    return h
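
For reference, a NumPy sketch (shapes are hypothetical) of the same computation: average pooling over the full spatial extent is just the mean over the row and column axes.

import numpy as np

x = np.random.rand(4, 8, 5, 7).astype(np.float32)   # (batch, channel, rows, cols)
gap = x.mean(axis=(2, 3))                           # shape (4, 8)
assert gap.shape == (4, 8)
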
Example #40
0
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs,
                    use_bi_direction, **kwargs):
    """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction)

    Base function for stacked GRU/BiGRU functions.

    This function is used by :func:`chainer.functions.n_step_bigru` and
    :func:`chainer.functions.n_step_gru`.
    This function's behavior depends on the argument ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2.
       Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers and
            is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N``
            is the dimension of hidden units. When ``use_bi_direction`` is
            ``True``, the first dimension length is ``2S``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing six matrices.
            ``ws[i][j]`` corresponds to ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 3`` has ``(I, N)`` shape as
            these matrices are multiplied with input variables. All other
            matrices have ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing six vectors.
            ``bs[i][j]`` corresponds to ``b_j`` in the equation.
            The shape of each vector is ``(N,)`` where ``N`` is the dimension
            of hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds the input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the
            mini-batch size for time ``t``, and ``I`` is the size of input
            units. Note that this function supports variable length sequences.
            When the sequences have different lengths, sort them in descending
            order by length and transpose the sorted sequences.
            :func:`~chainer.functions.transpose_sequence` transposes a list
            of :class:`~chainer.Variable`\\ s each holding a sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-directional GRU.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    argument.check_unexpected_kwargs(
        kwargs, train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, ),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            xs))
        if use_bi_direction:
            rnn = NStepBiGRU(n_layers, states)
        else:
            rnn = NStepGRU(n_layers, states)

        ret = rnn(*inputs)
        hy, = ret[:1]
        ys = ret[1:]
        return hy, ys

    else:
        direction = 2 if use_bi_direction else 1
        hx = split_axis.split_axis(hx, n_layers * direction, axis=0,
                                   force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]

        xws = [concat.concat([w[0], w[1], w[2]], axis=0) for w in ws]
        hws = [concat.concat([w[3], w[4], w[5]], axis=0) for w in ws]
        xbs = [concat.concat([b[0], b[1], b[2]], axis=0) for b in bs]
        hbs = [concat.concat([b[3], b[4], b[5]], axis=0) for b in bs]

        xs_next = xs
        hy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward GRU
                # di=1, backward GRU
                xs_list = xs_next if di == 0 else reversed(xs_next)
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                h_list = []
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    else:
                        h_rest = None

                    if layer > 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)

                    gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx])
                    gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1)
                    U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1)

                    r = sigmoid.sigmoid(W_r_x + U_r_h)
                    z = sigmoid.sigmoid(W_z_x + U_z_h)
                    h_bar = tanh.tanh(W_x + r * U_x)
                    h_bar = (1 - z) * h_bar + z * h
                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                    else:
                        h = h_bar
                    h_list.append(h_bar)
                return h, h_list

            # Forward GRU
            h, h_forward = _one_directional_loop(di=0)
            hy.append(h)

            if use_bi_direction:
                # Backward GRU
                h, h_backward = _one_directional_loop(di=1)
                h_backward.reverse()
                # Concat
                xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in
                           six.moves.zip(h_forward, h_backward)]
                hy.append(h)
            else:
                # Uni-directional GRU
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        return hy, tuple(ys)
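
A minimal NumPy sketch, not the Chainer API, of the single GRU step the fallback path above performs (weights, shapes and names here are hypothetical).

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

B, I, N = 3, 4, 5
rng = np.random.RandomState(0)
x = rng.randn(B, I).astype(np.float32)       # input at one time step
h = rng.randn(B, N).astype(np.float32)       # previous hidden state
W_r, W_z, W = (rng.randn(N, I).astype(np.float32) for _ in range(3))
U_r, U_z, U = (rng.randn(N, N).astype(np.float32) for _ in range(3))

r = sigmoid(x @ W_r.T + h @ U_r.T)           # reset gate
z = sigmoid(x @ W_z.T + h @ U_z.T)           # update gate
h_bar = np.tanh(x @ W.T + r * (h @ U.T))     # candidate state
h_new = (1 - z) * h_bar + z * h              # mirrors `(1 - z) * h_bar + z * h`
print(h_new.shape)                           # (3, 5)
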
Example #41
0
def n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs,
                    activation, use_bi_direction, **kwargs):
    """n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction)

    Base function for stacked RNN/BiRNN functions.

    This function is used by :func:`chainer.functions.n_step_birnn` and
    :func:`chainer.functions.n_step_rnn`.
    This function's behavior depends on the following arguments:
    ``activation`` and ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2.
       Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers and
            is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N``
            is the dimension of hidden units.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing two matrices.
            ``ws[i][j]`` corresponds to ``W_j`` in the equation.
            Only ``ws[0][0]`` has ``(I, N)`` shape as it is multiplied with
            input variables. All other matrices have ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing two vectors.
            ``bs[i][j]`` corresponds to ``b_j`` in the equation.
            The shape of each vector is ``(N,)`` where ``N`` is the dimension
            of hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds the input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the
            mini-batch size for time ``t``, and ``I`` is the size of input
            units. Note that this function supports variable length sequences.
            When the sequences have different lengths, sort them in descending
            order by length and transpose the sorted sequences.
            :func:`~chainer.functions.transpose_sequence` transposes a list
            of :class:`~chainer.Variable`\\ s each holding a sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        activation (str): Activation function name.
            Please select ``tanh`` or ``relu``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-directional RNN.

    Returns:
        tuple: This function returns a tuple containing two elements,
            ``hy`` and ``ys``.

            - ``hy`` is the updated hidden states whose shape is the same
              as ``hx``.
            - ``ys`` is a list of :class:`~chainer.Variable` . Each element
              ``ys[t]`` holds the hidden states of the last layer
              corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)``
              where ``B_t`` is the mini-batch size for time ``t``, and ``N``
              is the size of hidden units. Note that ``B_t`` is the same
              value as the mini-batch size of ``xs[t]``.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA

    argument.check_unexpected_kwargs(
        kwargs, train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    activation_list = ['tanh', 'relu']
    if activation not in activation_list:
        candidate = ','.join(activation_list)
        raise ValueError('Invalid activation: "%s". Please select from [%s]'
                         % (activation, candidate))

    xp = cuda.get_array_module(hx)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, ),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            xs))
        if use_bi_direction:
            # Bi-directional RNN
            if activation == 'tanh':
                rnn = NStepBiRNNTanh(n_layers, states)
            elif activation == 'relu':
                rnn = NStepBiRNNReLU(n_layers, states)
        else:
            # Uni-directional RNN
            if activation == 'tanh':
                rnn = NStepRNNTanh(n_layers, states)
            elif activation == 'relu':
                rnn = NStepRNNReLU(n_layers, states)

        ret = rnn(*inputs)
        hy, = ret[:1]
        ys = ret[1:]
        return hy, ys

    else:

        direction = 2 if use_bi_direction else 1
        hx = split_axis.split_axis(hx, n_layers * direction, axis=0,
                                   force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]

        xws = [_stack_weight([w[0]]) for w in ws]
        hws = [_stack_weight([w[1]]) for w in ws]
        xbs = [_stack_weight([b[0]]) for b in bs]
        hbs = [_stack_weight([b[1]]) for b in bs]

        xs_next = xs
        hy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward RNN
                # di=1, backward RNN
                xs_list = xs_next if di == 0 else reversed(xs_next)
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                h_list = []
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    else:
                        h_rest = None

                    if layer > 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)

                    rnn_in = (linear.linear(x, xws[layer_idx],
                                            xbs[layer_idx]) +
                              linear.linear(h, hws[layer_idx], hbs[layer_idx]))
                    if activation == 'tanh':
                        h_bar = tanh.tanh(rnn_in)
                    elif activation == 'relu':
                        h_bar = relu.relu(rnn_in)

                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                    else:
                        h = h_bar
                    h_list.append(h_bar)
                return h, h_list

            # Forward RNN
            h, h_forward = _one_directional_loop(di=0)
            hy.append(h)

            if use_bi_direction:
                # Backward RNN
                h, h_backward = _one_directional_loop(di=1)
                h_backward.reverse()
                # Concat
                xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in
                           six.moves.zip(h_forward, h_backward)]
                hy.append(h)
            else:
                # Uni-directional RNN
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        return hy, tuple(ys)
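
A tiny NumPy sketch (hypothetical shapes) of the per-step update in the fallback path above: one affine transform of the input and the previous hidden state, followed by the chosen activation.

import numpy as np

B, I, N = 2, 3, 4
rng = np.random.RandomState(0)
x = rng.randn(B, I).astype(np.float32)
h = rng.randn(B, N).astype(np.float32)
W = rng.randn(N, I).astype(np.float32)       # input-to-hidden weight
U = rng.randn(N, N).astype(np.float32)       # hidden-to-hidden weight

rnn_in = x @ W.T + h @ U.T                   # biases omitted for brevity
h_tanh = np.tanh(rnn_in)                     # activation == 'tanh'
h_relu = np.maximum(rnn_in, 0)               # activation == 'relu'
print(h_tanh.shape, h_relu.shape)            # (2, 4) (2, 4)
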
Example #42
0
def crf1d(cost, xs, ys, reduce='mean'):
    """Calculates negative log-likelihood of linear-chain CRF.

    It takes a transition cost matrix, a sequence of costs, and a sequence of
    labels. Let :math:`c_{st}` be a transition cost from a label :math:`s` to
    a label :math:`t`, :math:`x_{it}` be a cost of a label :math:`t` at
    position :math:`i`, and :math:`y_i` be an expected label at position
    :math:`i`. The negative log-likelihood of linear-chain CRF is defined as

    .. math::
        L = -\\left( \\sum_{i=1}^l x_{iy_i} + \\
             \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - {\\log(Z)} \\right) ,

    where :math:`l` is the length of the input sequence and :math:`Z` is the
    normalizing constant called partition function.

    .. note::

       When you want to calculate the negative log-likelihood of sequences
       which have different lengths, sort the sequences in descending order of
       lengths and transpose the sequences.
       For example, you have three input sequences:

       >>> a1 = a2 = a3 = a4 = np.random.uniform(-1, 1, 3).astype(np.float32)
       >>> b1 = b2 = b3 = np.random.uniform(-1, 1, 3).astype(np.float32)
       >>> c1 = c2 = np.random.uniform(-1, 1, 3).astype(np.float32)

       >>> a = [a1, a2, a3, a4]
       >>> b = [b1, b2, b3]
       >>> c = [c1, c2]

       where ``a1`` and all other variables are arrays with ``(K,)`` shape.
       Make a transpose of the sequences:

       >>> x1 = np.stack([a1, b1, c1])
       >>> x2 = np.stack([a2, b2, c2])
       >>> x3 = np.stack([a3, b3])
       >>> x4 = np.stack([a4])

       and make a list of the arrays:

       >>> xs = [x1, x2, x3, x4]

       You need to make label sequences in the same fashion.
       And then, call the function:

       >>> cost = chainer.Variable(
       ...     np.random.uniform(-1, 1, (3, 3)).astype(np.float32))
       >>> ys = [np.zeros(x.shape[0:1], dtype=np.int32) for x in xs]
       >>> loss = F.crf1d(cost, xs, ys)

       It calculates mean of the negative log-likelihood of the three
       sequences.

       The output is a variable whose value depends on the value of
       the option ``reduce``. If it is ``'no'``, it holds the elementwise
       loss values. If it is ``'mean'``, it holds mean of the loss values.


    Args:
        cost (Variable): A :math:`K \\times K` matrix which holds transition
            cost between two labels, where :math:`K` is the number of labels.
        xs (list of Variable): Input vector for each label.
            ``len(xs)`` denotes the length of the sequence,
            and each :class:`~chainer.Variable` holds a :math:`B \\times K`
            matrix, where :math:`B` is mini-batch size, :math:`K` is the number
            of labels.
            Note that :math:`B`\\ s in all the variables are not necessarily
            the same, i.e., it accepts the input sequences with different
            lengths.
        ys (list of Variable): Expected output labels. It needs to have the
            same length as ``xs``. Each :class:`~chainer.Variable` holds a
            :math:`B` integer vector.
            When ``x`` in ``xs`` has a different :math:`B`, the corresponding
            ``y`` has the same :math:`B`. In other words, ``ys`` must satisfy
            ``ys[i].shape == xs[i].shape[0:1]`` for all ``i``.
        reduce (str): Reduction option. Its value must be either
            ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable: A variable holding the average negative
        log-likelihood of the input sequences.

    .. note::

        See detail in the original paper: `Conditional Random Fields:
        Probabilistic Models for Segmenting and Labeling Sequence Data
        <https://repository.upenn.edu/cis_papers/159/>`_.

    """
    if reduce not in ('mean', 'no'):
        raise ValueError(
            "only 'mean' and 'no' are valid for 'reduce', but '%s' is "
            'given' % reduce)

    assert xs[0].shape[1] == cost.shape[0]

    n_label = cost.shape[0]
    n_batch = xs[0].shape[0]

    alpha = xs[0]
    alphas = []
    for x in xs[1:]:
        batch = x.shape[0]
        if alpha.shape[0] > batch:
            alpha, alpha_rest = split_axis.split_axis(alpha, [batch], axis=0)
            alphas.append(alpha_rest)
        b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost)
        alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x

    if len(alphas) > 0:
        alphas.append(alpha)
        alpha = concat.concat(alphas[::-1], axis=0)

    logz = logsumexp.logsumexp(alpha, axis=1)

    cost = reshape.reshape(cost, (cost.size, 1))
    score = select_item.select_item(xs[0], ys[0])
    scores = []
    for x, y, y_prev in zip(xs[1:], ys[1:], ys[:-1]):
        batch = x.shape[0]
        if score.shape[0] > batch:
            y_prev, _ = split_axis.split_axis(y_prev, [batch], axis=0)
            score, score_rest = split_axis.split_axis(score, [batch], axis=0)
            scores.append(score_rest)
        score += (select_item.select_item(x, y) + reshape.reshape(
            embed_id.embed_id(y_prev * n_label + y, cost), (batch,)))

    if len(scores) > 0:
        scores.append(score)
        score = concat.concat(scores[::-1], axis=0)

    loss = logz - score
    if reduce == 'mean':
        return _sum.sum(loss) / n_batch
    else:
        return loss
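
A NumPy sketch of the forward (alpha) recursion used above to compute the log partition function log(Z) for a single sequence; the shapes are hypothetical and no batching or variable-length handling is shown.

import numpy as np

def logsumexp(a, axis):
    m = a.max(axis=axis, keepdims=True)
    return (m + np.log(np.exp(a - m).sum(axis=axis, keepdims=True))).squeeze(axis)

K, L = 3, 4                                  # number of labels, sequence length
rng = np.random.RandomState(0)
cost = rng.randn(K, K).astype(np.float32)    # cost[s, t]: transition from s to t
unary = rng.randn(L, K).astype(np.float32)   # per-position label scores

alpha = unary[0]
for x in unary[1:]:
    # alpha_new[t] = logsumexp_s(alpha[s] + cost[s, t]) + x[t]
    alpha = logsumexp(alpha[:, None] + cost, axis=0) + x
log_z = logsumexp(alpha, axis=0)
print(float(log_z))
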
Example #43
0
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction,
                    **kwargs):
    """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction)

    Base function for stacked GRU/BiGRU functions.

    This function is used by :func:`chainer.functions.n_step_bigru` and
    :func:`chainer.functions.n_step_gru`.
    This function's behavior depends on the argument ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2.
       Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers and
            is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N``
            is the dimension of hidden units. When ``use_bi_direction`` is
            ``True``, the first dimension length is ``2S``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing six matrices.
            ``ws[i][j]`` corresponds to ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 3`` has ``(I, N)`` shape as
            these matrices are multiplied with input variables. All other
            matrices have ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing six vectors.
            ``bs[i][j]`` corresponds to ``b_j`` in the equation.
            The shape of each vector is ``(N,)`` where ``N`` is the dimension
            of hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds the input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the
            mini-batch size for time ``t``, and ``I`` is the size of input
            units. Note that this function supports variable length sequences.
            When the sequences have different lengths, sort them in descending
            order by length and transpose the sorted sequences.
            :func:`~chainer.functions.transpose_sequence` transposes a list
            of :class:`~chainer.Variable`\\ s each holding a sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-directional GRU.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    argument.check_unexpected_kwargs(
        kwargs,
        train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(
            itertools.chain((hx, ), itertools.chain.from_iterable(ws),
                            itertools.chain.from_iterable(bs), xs))
        if use_bi_direction:
            rnn = NStepBiGRU(n_layers, states)
        else:
            rnn = NStepGRU(n_layers, states)

        ret = rnn(*inputs)
        hy, = ret[:1]
        ys = ret[1:]
        return hy, ys

    else:
        direction = 2 if use_bi_direction else 1
        hx = split_axis.split_axis(hx,
                                   n_layers * direction,
                                   axis=0,
                                   force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]

        xws = [concat.concat([w[0], w[1], w[2]], axis=0) for w in ws]
        hws = [concat.concat([w[3], w[4], w[5]], axis=0) for w in ws]
        xbs = [concat.concat([b[0], b[1], b[2]], axis=0) for b in bs]
        hbs = [concat.concat([b[3], b[4], b[5]], axis=0) for b in bs]

        xs_next = xs
        hy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward GRU
                # di=1, backward GRU
                xs_list = xs_next if di == 0 else reversed(xs_next)
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                h_list = []
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    else:
                        h_rest = None

                    if layer > 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)

                    gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx])
                    gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1)
                    U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1)

                    r = sigmoid.sigmoid(W_r_x + U_r_h)
                    z = sigmoid.sigmoid(W_z_x + U_z_h)
                    h_bar = tanh.tanh(W_x + r * U_x)
                    h_bar = (1 - z) * h_bar + z * h
                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                    else:
                        h = h_bar
                    h_list.append(h_bar)
                return h, h_list

            # Forward GRU
            h, h_forward = _one_directional_loop(di=0)
            hy.append(h)

            if use_bi_direction:
                # Backward GRU
                h, h_backward = _one_directional_loop(di=1)
                h_backward.reverse()
                # Concat
                xs_next = [
                    concat.concat([hfi, hbi], axis=1)
                    for (hfi, hbi) in six.moves.zip(h_forward, h_backward)
                ]
                hy.append(h)
            else:
                # Uni-directional GRU
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        return hy, tuple(ys)
Example #44
0
def maxout(x, pool_size, axis=1):
    """Maxout activation function.

    It accepts an input tensor ``x``, reshapes the ``axis`` dimension
    (say its size is ``M * pool_size``) into two dimensions
    ``(M, pool_size)``, and takes the maximum along the new ``pool_size``
    dimension.

    Args:
        x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`):
            Input variable. A :math:`n`-dimensional (:math:`n \\ge` ``axis``)
            float array. In general, its first dimension is assumed to be the
            *minibatch dimension*. The other dimensions are treated as one
            concatenated dimension.
        pool_size (int):
            The size used for downsampling of pooling layer.
        axis (int):
            The ``axis`` dimension to be reshaped. The size of ``axis``
            dimension should be ``M * pool_size``.

    Returns:
        ~chainer.Variable:
            Output variable. The shape of the output is the same as ``x``
            except that the ``axis`` dimension is transformed from
            ``M * pool_size`` to ``M``.

    .. seealso:: :class:`~chainer.links.Maxout`

    .. admonition:: Example

        Typically, ``x`` is the output of a linear layer or a convolution
        layer. The following is the example where we use :func:`maxout` in
        combination with a Linear link.

        >>> in_size, out_size, pool_size = 10, 10, 10
        >>> bias = np.arange(out_size * pool_size).astype('f')
        >>> l = L.Linear(in_size, out_size * pool_size, initial_bias=bias)
        >>> x = np.zeros((1, in_size), 'f')  # prepare data
        >>> x = l(x)
        >>> y = F.maxout(x, pool_size)
        >>> x.shape
        (1, 100)
        >>> y.shape
        (1, 10)
        >>> x.reshape((out_size, pool_size)).data
        array([[  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.],
               [ 10.,  11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.],
               [ 20.,  21.,  22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.],
               [ 30.,  31.,  32.,  33.,  34.,  35.,  36.,  37.,  38.,  39.],
               [ 40.,  41.,  42.,  43.,  44.,  45.,  46.,  47.,  48.,  49.],
               [ 50.,  51.,  52.,  53.,  54.,  55.,  56.,  57.,  58.,  59.],
               [ 60.,  61.,  62.,  63.,  64.,  65.,  66.,  67.,  68.,  69.],
               [ 70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,  78.,  79.],
               [ 80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.],
               [ 90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.]], \
dtype=float32)
        >>> y.data
        array([[  9.,  19.,  29.,  39.,  49.,  59.,  69.,  79.,  89.,  99.]], \
dtype=float32)

    """

    if pool_size <= 0:
        raise ValueError('pool_size must be a positive integer.')

    x_shape = x.shape
    if x_shape[axis] % pool_size != 0:
        expect = 'x.shape[axis] % pool_size == 0'
        actual = 'x.shape[axis]={}, pool_size={}'.format(
            x_shape[axis], pool_size)
        msg = 'axis dimension must be divisible by pool_size'
        raise type_check.InvalidType(expect, actual, msg)

    shape = (x_shape[:axis] +
             (x_shape[axis] // pool_size, pool_size) +
             x_shape[axis + 1:])
    x = reshape.reshape(x, shape)
    return minmax.max(x, axis=axis + 1)
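
A NumPy sketch of the same reshape-and-max trick outside of Chainer, using hypothetical sizes.

import numpy as np

pool_size, axis = 5, 1
x = np.arange(2 * 20).reshape(2, 20).astype(np.float32)   # axis-1 size is M * pool_size
M = x.shape[axis] // pool_size
shape = x.shape[:axis] + (M, pool_size) + x.shape[axis + 1:]
y = x.reshape(shape).max(axis=axis + 1)
print(y.shape)   # (2, 4)
print(y[0])      # [ 4.  9. 14. 19.], the maximum of each group of 5
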
Example #45
0
def group_normalization(x, groups, gamma, beta, eps=1e-5):
    """Group normalization function.

    This function implements "group normalization",
    which divides the channels into groups, computes the mean and variance
    within each group, normalizes by these statistics, and then scales and
    shifts the result.


    Args:
        x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`): Batch tensors.
            First dimension of this value must be the size of minibatch and
            second dimension must be the number of channels.
            Moreover, this value must have one or more following dimensions,
            such as height and width.
        groups (int):
            The number of channel groups.
            This value must be a divisor of the number of channels.
        gamma (~chainer.Variable): Scaling parameter.
        beta (~chainer.Variable): Shifting parameter.
        eps (float): Epsilon value for numerical stability of normalization.


    Returns:
        ~chainer.Variable: The output variable which has the same shape
        as :math:`x`.

    See: `Group Normalization <https://arxiv.org/abs/1803.08494>`_
    """
    if x.ndim <= 2:
        raise ValueError('Input dimension must be greater than 2, '
                         'including batch size dimension '
                         '(first dimension).')

    if not isinstance(groups, int):
        raise TypeError('Argument: \'groups\' type must be (int).')

    xp = backend.get_array_module(x)

    batch_size, channels = x.shape[:2]
    original_shape = x.shape

    if channels % groups != 0:
        raise ValueError('Argument: \'groups\' must be a divisor '
                         'of the number of channels.')

    # By doing this reshaping, calling batch_normalization function becomes
    # equivalent to Group Normalization.
    # And redundant dimension is added in order to utilize ideep64/cuDNN.
    x = reshape.reshape(x, (1, batch_size * groups, -1, 1))

    with cuda.get_device_from_array(x.array):
        dummy_gamma = xp.ones(batch_size * groups).astype(xp.float32)
        dummy_beta = xp.zeros(batch_size * groups).astype(xp.float32)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = batch_normalization.batch_normalization(
            x, dummy_gamma, dummy_beta, eps=eps)

    x = reshape.reshape(x, original_shape)

    target_shape = [1, channels] + [1] * (x.ndim - 2)
    gamma_broadcast = broadcast.broadcast_to(
        reshape.reshape(gamma, target_shape), x.shape)
    beta_broadcast = broadcast.broadcast_to(
        reshape.reshape(beta, target_shape), x.shape)

    return x * gamma_broadcast + beta_broadcast
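
A NumPy-only sketch of what the reshape and batch-normalization trick above computes: per-sample, per-group mean/variance normalization followed by the channel-wise affine transform. The shapes and the group count are hypothetical; ``eps`` matches the default in the signature above.

import numpy as np

eps = 1e-5
B, C, H, W, groups = 2, 6, 4, 4, 3
rng = np.random.RandomState(0)
x = rng.randn(B, C, H, W).astype(np.float32)
gamma = np.ones(C, np.float32)
beta = np.zeros(C, np.float32)

g = x.reshape(B, groups, -1)                 # (B, groups, (C // groups) * H * W)
mean = g.mean(axis=2, keepdims=True)
var = g.var(axis=2, keepdims=True)
g = (g - mean) / np.sqrt(var + eps)
y = g.reshape(B, C, H, W) * gamma[None, :, None, None] + beta[None, :, None, None]
print(y.shape)                               # (2, 6, 4, 4)
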
Example #46
0
def _global_average_pooling_2d(x):
    n, channel, rows, cols = x.data.shape
    h = average_pooling_2d(x, (rows, cols), stride=1)
    h = reshape(h, (n, channel))
    return h
Example #47
0
def n_step_lstm(
        n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True,
        use_cudnn=True):
    """Stacked Long Short-Term Memory function for sequence inputs.

    This function calculates stacked LSTM with sequences. This function gets
    an initial hidden state :math:`h_0`, an initial cell state :math:`c_0`,
    an input sequence :math:`x`, weight matrices :math:`W`, and bias vectors
    :math:`b`.
    This function calculates hidden states :math:`h_t` and :math:`c_t` for each
    time :math:`t` from input :math:`x_t`.

    .. math::

       i_t &= \\sigma(W_0 x_t + W_4 h_{t-1} + b_0 + b_4) \\\\
       f_t &= \\sigma(W_1 x_t + W_5 h_{t-1} + b_1 + b_5) \\\\
       o_t &= \\sigma(W_2 x_t + W_6 h_{t-1} + b_2 + b_6) \\\\
       a_t &= \\tanh(W_3 x_t + W_7 h_{t-1} + b_3 + b_7) \\\\
       c_t &= f_t \\cdot c_{t-1} + i_t \\cdot a_t \\\\
       h_t &= o_t \\cdot \\tanh(c_t)

    As the function accepts a sequence, it calculates :math:`h_t` for all
    :math:`t` with one call. Eight weight matrices and eight bias vectors are
    required for each layer. So, when :math:`S` layers exist, you need to
    prepare :math:`8S` weight matrices and :math:`8S` bias vectors.

    If the number of layers ``n_layers`` is greater than :math:`1`, the input
    of the ``k``-th layer is the hidden state ``h_t`` of the ``(k-1)``-th
    layer. Note that the input variables of all layers except the first may
    have a different shape from those of the first layer.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers and
            is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N``
            is the dimension of hidden units.
        cx (chainer.Variable): Variable holding stacked cell states.
            It has the same shape as ``hx``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing eight matrices.
            ``ws[i][j]`` corresponds to ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 4`` has ``(I, N)`` shape as
            these matrices are multiplied with input variables. All other
            matrices have ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing eight vectors.
            ``bs[i][j]`` corresponds to ``b_j`` in the equation.
            The shape of each vector is ``(N,)`` where ``N`` is the dimension
            of hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds the input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the
            mini-batch size for time ``t``, and ``I`` is the size of input
            units. Note that this function supports variable length sequences.
            When the sequences have different lengths, sort them in descending
            order by length and transpose the sorted sequences.
            :func:`~chainer.functions.transpose_sequence` transposes a list
            of :class:`~chainer.Variable`\\ s each holding a sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        train (bool): If ``True``, this function executes dropout.
        use_cudnn (bool): If ``True``, this function uses cuDNN if available.

    Returns:
        tuple: This function returns a tuple containing three elements,
            ``hy``, ``cy`` and ``ys``.

            - ``hy`` is the updated hidden states whose shape is the same as
              ``hx``.
            - ``cy`` is the updated cell states whose shape is the same as
              ``cx``.
            - ``ys`` is a list of :class:`~chainer.Variable` . Each element
              ``ys[t]`` holds the hidden states of the last layer
              corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)``
              where ``B_t`` is the mini-batch size for time ``t``, and ``N``
              is the size of hidden units. Note that ``B_t`` is the same
              value as the mini-batch size of ``xs[t]``.

    .. seealso::

       :func:`chainer.functions.lstm`

    """

    xp = cuda.get_array_module(hx, hx.data)

    if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \
       _cudnn_version >= 5000:
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, cx),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            xs))
        rnn = NStepLSTM(n_layers, states, train=train)
        ret = rnn(*inputs)
        hy, cy = ret[:2]
        ys = ret[2:]
        return hy, cy, ys

    else:
        hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True)
        cx = [reshape.reshape(c, c.shape[1:]) for c in cx]

        xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
        hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
        xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
        hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]

        ys = []
        for x in xs:
            batch = x.shape[0]
            h_next = []
            c_next = []
            for layer in six.moves.range(n_layers):
                h = hx[layer]
                c = cx[layer]
                if h.shape[0] > batch:
                    h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    c, c_rest = split_axis.split_axis(c, [batch], axis=0)
                else:
                    h_rest = None

                x = dropout.dropout(x, ratio=dropout_ratio, train=train)
                h = dropout.dropout(h, ratio=dropout_ratio, train=train)
                lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \
                    linear.linear(h, hws[layer], hbs[layer])

                c_bar, h_bar = lstm.lstm(c, lstm_in)
                if h_rest is not None:
                    h = concat.concat([h_bar, h_rest], axis=0)
                    c = concat.concat([c_bar, c_rest], axis=0)
                else:
                    h = h_bar
                    c = c_bar
                h_next.append(h)
                c_next.append(c)
                x = h_bar
            hx = h_next
            cx = c_next
            ys.append(x)

        hy = stack.stack(hx)
        cy = stack.stack(cx)
        return hy, cy, tuple(ys)
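
A NumPy sketch of the per-step cell update that ``lstm.lstm(c, lstm_in)`` performs above, written with explicit gates instead of the fused, interleaved weight layout; all names and shapes are hypothetical.

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

B, N = 3, 5
rng = np.random.RandomState(0)
c = rng.randn(B, N).astype(np.float32)                  # previous cell state
a_in, i_in, f_in, o_in = (rng.randn(B, N).astype(np.float32) for _ in range(4))

a = np.tanh(a_in)                    # candidate
i = sigmoid(i_in)                    # input gate
f = sigmoid(f_in)                    # forget gate
o = sigmoid(o_in)                    # output gate
c_next = a * i + f * c               # c_t = i_t * a_t + f_t * c_{t-1}
h_next = o * np.tanh(c_next)         # h_t = o_t * tanh(c_t)
print(c_next.shape, h_next.shape)    # (3, 5) (3, 5)
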
Example #48
0
def n_step_lstm_base(
        n_layers, dropout_ratio, hx, cx, ws, bs, xs, train, use_cudnn,
        use_bi_direction):
    """Base function for Stack LSTM/BiLSTM functions.

    This function is used at :func:`chainer.functions.n_step_lstm` and
    :func:`chainer.functions.n_step_bilstm`.
    This function's behavior depends on following arguments,
    ``activation`` and ``use_bi_direction``.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers and
            is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N``
            is the dimension of hidden units.
        cx (chainer.Variable): Variable holding stacked cell states.
            It has the same shape as ``hx``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing eight matrices.
            ``ws[i][j]`` corresponds to ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 4`` has ``(I, N)`` shape as
            these matrices are multiplied with input variables. All other
            matrices have ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing eight vectors.
            ``bs[i][j]`` corresponds to ``b_j`` in the equation.
            The shape of each vector is ``(N,)`` where ``N`` is the dimension
            of hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds the input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the
            mini-batch size for time ``t``, and ``I`` is the size of input
            units. Note that this function supports variable length sequences.
            When the sequences have different lengths, sort them in descending
            order by length and transpose the sorted sequences.
            :func:`~chainer.functions.transpose_sequence` transposes a list
            of :class:`~chainer.Variable`\\ s each holding a sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        train (bool): If ``True``, this function executes dropout.
        use_cudnn (bool): If ``True``, this function uses cuDNN if available.
        use_bi_direction (bool): If ``True``, this function uses Bi-directional
            LSTM.

    Returns:
        tuple: This function returns a tuple containing three elements,
            ``hy``, ``cy`` and ``ys``.

            - ``hy`` is the updated hidden states whose shape is the same as
              ``hx``.
            - ``cy`` is the updated cell states whose shape is the same as
              ``cx``.
            - ``ys`` is a list of :class:`~chainer.Variable` . Each element
              ``ys[t]`` holds the hidden states of the last layer
              corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)``
              where ``B_t`` is the mini-batch size for time ``t``, and ``N``
              is the size of hidden units. Note that ``B_t`` is the same
              value as the mini-batch size of ``xs[t]``.

    .. seealso::

       :func:`chainer.functions.n_step_lstm`
       :func:`chainer.functions.n_step_bilstm`

    """

    xp = cuda.get_array_module(hx, hx.data)

    if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \
       _cudnn_version >= 5000:
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, cx),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            xs))
        if use_bi_direction:
            rnn = NStepBiLSTM(n_layers, states, train=train)
        else:
            rnn = NStepLSTM(n_layers, states, train=train)

        ret = rnn(*inputs)
        hy, cy = ret[:2]
        ys = ret[2:]
        return hy, cy, ys

    else:
        direction = 2 if use_bi_direction else 1
        split_size = n_layers * direction
        hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True)
        cx = [reshape.reshape(c, c.shape[1:]) for c in cx]

        xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
        hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
        xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
        hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]

        xs_next = xs
        hy = []
        cy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward LSTM
                # di=1, backward LSTM
                h_list = []
                c_list = []
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                c = cx[layer_idx]
                if di == 0:
                    xs_list = xs_next
                else:
                    xs_list = reversed(xs_next)
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                        c, c_rest = split_axis.split_axis(c, [batch], axis=0)
                    else:
                        h_rest = None
                        c_rest = None

                    if layer != 0:
                        x = dropout.dropout(x, ratio=dropout_ratio,
                                            train=train)
                    lstm_in = linear.linear(x, xws[layer_idx],
                                            xbs[layer_idx]) + \
                        linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    c_bar, h_bar = lstm.lstm(c, lstm_in)
                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                        c = concat.concat([c_bar, c_rest], axis=0)
                    else:
                        h = h_bar
                        c = c_bar
                    h_list.append(h_bar)
                    c_list.append(c_bar)
                return h, c, h_list, c_list

            h, c, h_forward, c_forward = _one_directional_loop(di=0)
            hy.append(h)
            cy.append(c)

            if use_bi_direction:
                # BiLSTM
                h, c, h_backward, c_backward = _one_directional_loop(di=1)
                hy.append(h)
                cy.append(c)

                h_backward.reverse()
                # concat
                xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in
                           zip(h_forward, h_backward)]
            else:
                # Uni-directional RNN
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        cy = stack.stack(cy)
        return hy, cy, tuple(ys)
Example #49
0
def n_step_lstm(n_layers,
                dropout_ratio,
                hx,
                cx,
                ws,
                bs,
                xs,
                train=True,
                use_cudnn=True):
    """Stacked Long Short-Term Memory function for sequence inputs.

    This function calculates stacked LSTM with sequences. This function gets
    an initial hidden state :math:`h_0`, an initial cell state :math:`c_0`,
    an input sequence :math:`x`, weight matrices :math:`W`, and bias vectors
    :math:`b`.
    This function calculates hidden states :math:`h_t` and :math:`c_t` for each
    time :math:`t` from input :math:`x_t`.

    .. math::

       i_t &= \\sigma(W_0 x_t + W_4 h_{t-1} + b_0 + b_4) \\\\
       f_t &= \\sigma(W_1 x_t + W_5 h_{t-1} + b_1 + b_5) \\\\
       o_t &= \\sigma(W_2 x_t + W_6 h_{t-1} + b_2 + b_6) \\\\
       a_t &= \\tanh(W_3 x_t + W_7 h_{t-1} + b_3 + b_7) \\\\
       c_t &= f_t \\cdot c_{t-1} + i_t \\cdot a_t \\\\
       h_t &= o_t \\cdot \\tanh(c_t)

    As the function accepts a sequence, it calculates :math:`h_t` for all
    :math:`t` with one call. Eight weight matrices and eight bias vectors are
    required for each layer. So, when :math:`S` layers exist, you need to
    prepare :math:`8S` weight matrices and :math:`8S` bias vectors.

    If the number of layers ``n_layers`` is greater than :math:`1`, the input
    of the ``k``-th layer is the hidden state ``h_t`` of the ``(k-1)``-th
    layer. Note that the input variables of all layers except the first may
    have a different shape from those of the first layer.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers and
            is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N``
            is the dimension of hidden units.
        cx (chainer.Variable): Variable holding stacked cell states.
            It has the same shape as ``hx``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing eight matrices.
            ``ws[i][j]`` corresponds to ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 4`` has ``(I, N)`` shape as
            these matrices are multiplied with input variables. All other
            matrices have ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing eight vectors.
            ``bs[i][j]`` corresponds to ``b_j`` in the equation.
            The shape of each vector is ``(N,)`` where ``N`` is the dimension
            of hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds the input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the
            mini-batch size for time ``t``, and ``I`` is the size of input
            units. Note that this function supports variable length sequences.
            When the sequences have different lengths, sort them in descending
            order by length and transpose the sorted sequences.
            :func:`~chainer.functions.transpose_sequence` transposes a list
            of :class:`~chainer.Variable`\\ s each holding a sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        train (bool): If ``True``, this function executes dropout.
        use_cudnn (bool): If ``True``, this function uses cuDNN if available.

    Returns:
        tuple: This function returns a tuple containing three elements,
            ``hy``, ``cy`` and ``ys``.

            - ``hy`` is the updated hidden states whose shape is the same as
              ``hx``.
            - ``cy`` is the updated cell states whose shape is the same as
              ``cx``.
            - ``ys`` is a list of :class:`~chainer.Variable`. Each element
              ``ys[t]`` holds the hidden states of the last layer
              corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)``
              where ``B_t`` is the mini-batch size for time ``t``, and ``N``
              is the size of hidden units. Note that ``B_t`` is the same
              value as the mini-batch size of ``xs[t]``.

    .. seealso::

       :func:`chainer.functions.lstm`

    """

    xp = cuda.get_array_module(hx, hx.data)

    if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \
       _cudnn_version >= 5000:
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(
            itertools.chain((hx, cx), itertools.chain.from_iterable(ws),
                            itertools.chain.from_iterable(bs), xs))
        rnn = NStepLSTM(n_layers, states, train=train)
        ret = rnn(*inputs)
        hy, cy = ret[:2]
        ys = ret[2:]
        return hy, cy, ys

    else:
        hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True)
        cx = [reshape.reshape(c, c.shape[1:]) for c in cx]

        xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
        hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
        xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
        hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]

        ys = []
        for x in xs:
            batch = x.shape[0]
            h_next = []
            c_next = []
            for layer in six.moves.range(n_layers):
                h = hx[layer]
                c = cx[layer]
                if h.shape[0] > batch:
                    h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    c, c_rest = split_axis.split_axis(c, [batch], axis=0)
                else:
                    h_rest = None

                x = dropout.dropout(x, ratio=dropout_ratio, train=train)
                h = dropout.dropout(h, ratio=dropout_ratio, train=train)
                lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \
                    linear.linear(h, hws[layer], hbs[layer])

                c_bar, h_bar = lstm.lstm(c, lstm_in)
                if h_rest is not None:
                    h = concat.concat([h_bar, h_rest], axis=0)
                    c = concat.concat([c_bar, c_rest], axis=0)
                else:
                    h = h_bar
                    c = c_bar
                h_next.append(h)
                c_next.append(c)
                x = h_bar
            hx = h_next
            cx = c_next
            ys.append(x)

        hy = stack.stack(hx)
        cy = stack.stack(cx)
        return hy, cy, tuple(ys)
Example #50
0
def black_out(x, t, W, samples, reduce='mean'):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is likelihood of a given label.
    And, :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the elementwise
    loss values. If it is ``'mean'``, this function takes
    the mean of the loss values.

    Args:
        x (~chainer.Variable): Batch of input vectors.
            Its shape should be :math:`(N, D)`.
        t (~chainer.Variable): Vector of ground truth labels.
            Its shape should be :math:`(N,)`. Each element :math:`v`
            should satisfy :math:`0 \\leq v < V` or be :math:`-1`,
            where :math:`V` is the number of label types.
        W (~chainer.Variable): Weight matrix.
            Its shape should be :math:`(V, D)`.
        samples (~chainer.Variable): Negative samples.
            Its shape should be :math:`(N, S)` where :math:`S` is
            the number of negative samples.
        reduce (str): Reduction option. Its value must be either
            ``'no'`` or ``'mean'``. Otherwise,
            :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable:
            A variable object holding loss value(s).
            If ``reduce`` is ``'no'``, the output variable holds an
            array whose shape is :math:`(N,)` .
            If it is ``'mean'``, it holds a scalar.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \
         Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """

    batch_size = x.shape[0]

    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.matmul(neg_emb, x[:, :, None])
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])

    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.matmul(pos_emb, x[:, :, None])
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

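    # logz is the log-partition over the correct label and the sampled
    # negatives; py - logz is log p(t), and ny holds log(1 - p(s)) for each
    # negative sample, matching the BlackOut objective in the docstring.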
    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(
        reshape.reshape(logz, (batch_size, 1)), neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size,))
    loss = -(py - logz + _sum.sum(ny, axis=1))
    if reduce == 'mean':
        loss = average.average(loss)
    return loss
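
# A small NumPy cross-check (not part of the original code) of the BlackOut
# formula above; the batch size, vocabulary size and sample indices below are
# made up for illustration.
import numpy as np

rng = np.random.RandomState(0)
N, D, V, S = 2, 4, 10, 3                        # batch, dims, vocab, negatives
x = rng.randn(N, D)
W = rng.randn(V, D)
t = np.array([1, 7])                            # ground-truth labels
samples = np.array([[2, 3, 4], [0, 5, 9]])      # negative samples per row

pos_y = np.einsum('nd,nd->n', W[t], x)          # W_t^T x
neg_y = np.einsum('nsd,nd->ns', W[samples], x)  # W_s^T x
logz = np.log(np.exp(pos_y) + np.exp(neg_y).sum(axis=1))
log_p_t = pos_y - logz
log_1m_p_s = np.log(1 - np.exp(neg_y - logz[:, None]))
loss = -(log_p_t + log_1m_p_s.sum(axis=1))      # shape (N,), reduce='no'
print(loss.mean())                              # reduce='mean'
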
def group_normalization(x, groups, gamma, beta, eps=1e-5):
    """Group normalization function.

    This function implements a "group normalization"
    which divides the channels into groups and computes within each group
    the mean and variance, then normalize by these statistics,
    scales and shifts them.


    Args:
        x (:class:`~chainer.Variable` or :ref:`ndarray`): Batch tensors.
            First dimension of this value must be the size of minibatch and
            second dimension must be the number of channels.
            Moreover, this value must have one or more following dimensions,
            such as height and width.
        groups (int):
            The number of channel groups.
            This value must be a divisor of the number of channels.
        gamma (:class:`~chainer.Variable` or :ref:`ndarray`):
            Scaling parameter.
        beta (:class:`~chainer.Variable` or :ref:`ndarray`):
            Shifting parameter.
        eps (float): Epsilon value for numerical stability of normalization.


    Returns:
        ~chainer.Variable: The output variable which has the same shape
        as :math:`x`.

    See: `Group Normalization <https://arxiv.org/abs/1803.08494>`_
    """
    if x.ndim <= 2:
        raise ValueError('Input dimension must be greater than 2, '
                         'including the batch size dimension '
                         '(first dimension).')

    if not isinstance(groups, int):
        raise TypeError('Argument \'groups\' must be of type int.')

    xp = backend.get_array_module(x)

    batch_size, channels = x.shape[:2]
    original_shape = x.shape

    if channels % groups != 0:
        raise ValueError('Argument \'groups\' must be a divisor '
                         'of the number of channels.')

    # With this reshaping, calling the batch_normalization function becomes
    # equivalent to group normalization.
    # A redundant dimension is also added in order to utilize ideep64/cuDNN.
    x = reshape.reshape(x, (1, batch_size * groups, -1, 1))

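    # batch_normalization is called with dummy (all-one / all-zero) scale and
    # shift parameters; the real per-channel gamma and beta are applied after
    # reshaping back to the original shape.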
    with cuda.get_device_from_array(x.array):
        dummy_gamma = xp.ones(batch_size * groups).astype(xp.float32)
        dummy_beta = xp.zeros(batch_size * groups).astype(xp.float32)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        x = batch_normalization.batch_normalization(x,
                                                    dummy_gamma,
                                                    dummy_beta,
                                                    eps=eps)

    x = reshape.reshape(x, original_shape)

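    # Broadcast gamma and beta to the input shape and apply the per-channel
    # scale and shift.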
    target_shape = [1, channels] + [1] * (x.ndim - 2)
    gamma_broadcast = broadcast.broadcast_to(
        reshape.reshape(gamma, target_shape), x.shape)
    beta_broadcast = broadcast.broadcast_to(
        reshape.reshape(beta, target_shape), x.shape)

    return x * gamma_broadcast + beta_broadcast
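
# A NumPy reference (not part of the original code) for the group
# normalization above, computing the per-group statistics directly instead of
# going through batch_normalization; the shapes below are made up.
import numpy as np

def group_norm_ref(x, groups, gamma, beta, eps=1e-5):
    batch_size, channels = x.shape[:2]
    original_shape = x.shape
    x = x.reshape(batch_size, groups, -1)      # (N, G, C/G * spatial)
    mean = x.mean(axis=2, keepdims=True)
    var = x.var(axis=2, keepdims=True)
    x = (x - mean) / np.sqrt(var + eps)
    x = x.reshape(original_shape)
    target_shape = (1, channels) + (1,) * (x.ndim - 2)
    return x * gamma.reshape(target_shape) + beta.reshape(target_shape)

x = np.random.randn(2, 6, 4, 4).astype(np.float32)
gamma = np.ones(6, np.float32)
beta = np.zeros(6, np.float32)
y = group_norm_ref(x, groups=3, gamma=gamma, beta=beta)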