Example #1
def var(x, axis=None, ddof=0, keepdims=False, constant=False):
    """
    Compute the variance along the specified axis.

    Returns the variance of the array elements, a measure of the spread of a
    distribution.  The variance is computed for the flattened array by
    default, otherwise over the specified axis.

    Parameters
    ----------
    x : array_like
        Array containing numbers whose variance is desired.

    axis : Optional[int, Tuple[int, ...]]
        Axis or axes along which the variance is computed.  The default is to
        compute the variance of the flattened array.

    ddof : int, optional (default=0)
        "Delta Degrees of Freedom": the divisor used in the calculation is
        ``N - ddof``, where ``N`` represents the number of elements. By
        default `ddof` is zero.

    keepdims : bool, optional (default=False)
        If this is set to True, the axes which are reduced are left
        in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the input array.

    constant : bool, optional (default=False)
        If ``True``, the returned tensor is a constant (it
        does not back-propagate a gradient)

    Returns
    -------
    variance : mygrad.Tensor

    Notes
    -----
    The variance is the average of the squared deviations from the mean,
    i.e.,  ``var = mean(abs(x - x.mean())**2)``.

    The mean is normally calculated as ``x.sum() / N``, where ``N = len(x)``.
    If, however, `ddof` is specified, the divisor ``N - ddof`` is used
    instead.  In standard statistical practice, ``ddof=1`` provides an
    unbiased estimator of the variance of a hypothetical infinite population.
    ``ddof=0`` provides a maximum likelihood estimate of the variance for
    normally distributed variables.

    Examples
    --------
    >>> import mygrad as mg
    >>> import numpy as np
    >>> a = mg.Tensor([[1, 2],
    ...                [3, 4]])
    >>> mg.var(a)
    Tensor(1.25)
    >>> mg.var(a, axis=0)
    Tensor([ 1.,  1.])
    >>> mg.var(a, axis=1)
    Tensor([ 0.25,  0.25])

    In single precision, ``var()`` can be inaccurate:

    >>> a = mg.zeros((2, 512*512), dtype=np.float32)
    >>> a[0, :] = 1.0
    >>> a[1, :] = 0.1
    >>> mg.var(a)
    Tensor(0.20250003)

    Computing the variance in double precision is more accurate. Since ``var``
    does not accept a ``dtype`` argument, cast the underlying data first:

    >>> mg.var(mg.Tensor(a.data.astype(np.float64)))
    Tensor(0.20249999932944759)
    >>> ((1-0.55)**2 + (0.1-0.55)**2)/2
    0.2025
    """
    return Tensor._op(
        Variance,
        x,
        op_kwargs=dict(axis=axis, keepdims=keepdims, ddof=ddof),
        constant=constant,
    )
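
As a complement to the docstring above, here is a minimal sketch of back-propagating through ``var`` (assuming the standard MyGrad workflow, where calling ``backward()`` on a 0-d Tensor populates ``x.grad``); for ``ddof=0`` the gradient is ``2 * (x - x.mean()) / N``:

import mygrad as mg

x = mg.Tensor([1.0, 2.0, 3.0, 4.0])
v = mg.var(x)    # 0-d Tensor: variance of the flattened data
v.backward()     # back-propagate d(var)/dx through the graph
print(x.grad)    # expected: [-0.75, -0.25, 0.25, 0.75], i.e. 2*(x - 2.5)/4
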
Example #2
def sum(x, axis=None, keepdims=False, constant=False):
    """
    Sum of tensor elements over a given axis.

    Parameters
    ----------
    x : array_like
        The data whose elements are to be summed.

    axis : Optional[int, Tuple[int, ...]]
        Axis or axes along which a sum is performed.  The default,
        axis=None, will sum all of the elements of the input tensor.  If
        axis is negative it counts from the last to the first axis.
        If axis is a tuple of ints, a sum is performed on all of the axes
        specified in the tuple instead of a single axis or all the axes as
        before.

    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left
        in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the input tensor.

    constant : bool, optional (default=False)
        If ``True``, the returned tensor is a constant (it
        does not back-propagate a gradient)

    Returns
    -------
    sum_along_axis : mygrad.Tensor
        A Tensor with the same shape as ``x``, but with the specified
        axis/axes removed. If ``x`` is a 0-d tensor, or if ``axis`` is None,
        a 0-dim Tensor is returned.

    See Also
    --------
    mygrad.Tensor.sum : Equivalent method.

    cumsum : Cumulative sum of array elements.

    mean, average

    Notes
    -----
    Arithmetic is modular when using integer types, and no error is
    raised on overflow.

    The sum of an empty tensor is the neutral element 0:

    >>> mygrad.sum([])
    Tensor(0.0)

    Examples
    --------
    >>> import mygrad as mg
    >>> import numpy as np
    >>> mg.sum([0.5, 1.5])
    Tensor(2.0)
    >>> mg.sum([[0, 1], [0, 5]])
    Tensor(6)
    >>> mg.sum([[0, 1], [0, 5]], axis=0)
    Tensor([0, 6])
    >>> mg.sum([[0, 1], [0, 5]], axis=1)
    Tensor([1, 5])

    Unlike ``numpy.sum``, this function does not accept ``dtype`` or ``initial``
    arguments. The ``keepdims`` option retains each summed axis as a size-1
    dimension, so that the result broadcasts against the original tensor:

    >>> mg.sum([[0, 1], [0, 5]], axis=1, keepdims=True)
    Tensor([[1],
            [5]])
    """
    return Tensor._op(Sum, x, op_args=(axis, keepdims), constant=constant)
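
A quick sketch of back-propagation through ``sum`` (under the same assumed MyGrad workflow as above): since each element contributes linearly to the sum, the resulting gradient is an array of ones.

import mygrad as mg

x = mg.Tensor([[0., 1.],
               [0., 5.]])
mg.sum(x).backward()   # d(sum)/dx_ij = 1 for every element
print(x.grad)          # expected: a (2, 2) array of ones
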
Example #3
def gru(
    X,
    Uz,
    Wz,
    bz,
    Ur,
    Wr,
    br,
    Uh,
    Wh,
    bh,
    s0=None,
    bp_lim=None,
    dropout=0.0,
    constant=False,
):
    r""" Performs a forward pass of sequential data through a Gated Recurrent Unit layer, returning
    the 'hidden-descriptors' arrived at by utilizing the trainable parameters as follows::

                Z_{t} = sigmoid(X_{t} Uz + S_{t-1} Wz + bz)
                R_{t} = sigmoid(X_{t} Ur + S_{t-1} Wr + br)
                H_{t} =    tanh(X_{t} Uh + (R{t} * S_{t-1}) Wh + bh)
                S_{t} = (1 - Z{t}) * H{t} + Z{t} * S_{t-1}

    Parameters
    ----------
    X : array_like, shape=(T, N, C)
       The sequential data to be passed forward.

    Uz : array_like, shape=(C, D)
       The weights used to map sequential data to its hidden-descriptor representation

    Wz : array_like, shape=(D, D)
        The weights used to map a hidden-descriptor to a hidden-descriptor.

    bz : array_like, shape=(D,)
       The biases used to scale a hidden-descriptor.

    Ur : array_like, shape=(C, D)
       The weights used to map sequential data to its hidden-descriptor representation

    Wr : array_like, shape=(D, D)
        The weights used to map a hidden-descriptor to a hidden-descriptor.

    br : array_like, shape=(D,)
       The biases used to scale a hidden-descriptor.

    Uh : array_like, shape=(C, D)
       The weights used to map sequential data to its hidden-descriptor representation

    Wh : array_like, shape=(D, D)
        The weights used to map a hidden-descriptor to a hidden-descriptor.

    bh : array_like, shape=(D,)
       The biases used to scale a hidden-descriptor.

    s0 : Optional[array_like], shape=(N, D)
        The 'seed' hidden descriptors to feed into the RNN. If None, a Tensor
        of zeros of shape (N, D) is created.

    bp_lim : Optional[int]
        *This feature is experimental and is currently untested*.
        The (non-zero) limit on the depth of back-propagation through time to be
        performed. If `None`, back-propagation is carried through the entire sequence.

        E.g. `bp_lim=3` will propagate gradients only up to 3 steps backward through the
        recursive sequence.

    dropout : float (default=0.), 0 <= dropout < 1
        If non-zero, the dropout scheme described in [1]_ is applied. See Notes
        for more details.

    constant : bool, optional (default=False)
        If True, the resulting Tensor is a constant.

    Returns
    -------
    mygrad.Tensor, shape=(T+1, N, D)
        The sequence of 'hidden-descriptors' produced by the forward pass of the RNN.

    Notes
    -----
    - :math:`T` : Sequence length
    - :math:`N` : Batch size
    - :math:`C` : Length of single datum
    - :math:`D` : Length of 'hidden' descriptor

    The GRU system of equations is given by:

    .. math::

                Z_{t} = \sigma (X_{t} U_z + S_{t-1} W_z + b_z)

                R_{t} = \sigma (X_{t} U_r + S_{t-1} W_r + b_r)

                H_{t} = \tanh (X_{t} U_h + (R_{t} * S_{t-1}) W_h + b_h)

                S_{t} = (1 - Z_{t}) * H_{t} + Z_{t} * S_{t-1}

    Following the dropout scheme specified in [1]_, the hidden-hidden weights (Wz/Wr/Wh)
    randomly have their weights dropped prior to forward/back-prop. The input connections
    (via Uz/Ur/Uh) have variational dropout ([2]_) applied to them with a common dropout
    mask across all t. That is, three static dropout masks, each of shape-(N, D), are
    applied to

    .. math::
                                          X_{t} U_z

                                          X_{t} U_r

                                          X_{t} U_h

    respectively, for all :math:`t`.

    References
    ----------
    .. [1] S. Merity, et. al. "Regularizing and Optimizing LSTM Language Models",
           arXiv:1708.02182v1, 2017.

    .. [2] Y. Gal, Z. Ghahramani "A Theoretically Grounded Application of Dropout
           in Recurrent Neural Networks" arXiv:1512.05287v5, 2016. """
    if s0 is not None:
        if not isinstance(s0, np.ndarray) and not (isinstance(s0, Tensor) and
                                                   (constant or s0.constant)):
            raise ValueError(
                "GRU does not support non-constant tensors for the initial hidden"
                "state value, `s0`")
    s = Tensor._op(
        GRUnit,
        X,
        Uz,
        Wz,
        bz,
        Ur,
        Wr,
        br,
        Uh,
        Wh,
        bh,
        op_kwargs=dict(s0=s0, bp_lim=bp_lim, dropout=dropout),
        constant=constant,
    )
    s.creator._hidden_seq = s
    return s
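
To make the recurrence in the Notes concrete, here is a NumPy-only reference sketch of the forward pass (an illustration of the equations only, not MyGrad's ``GRUnit`` implementation; it ignores ``bp_lim`` and ``dropout``, and the name ``gru_forward_reference`` is ours):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gru_forward_reference(X, Uz, Wz, bz, Ur, Wr, br, Uh, Wh, bh, s0=None):
    # X: shape-(T, N, C); returns the shape-(T+1, N, D) sequence of hidden descriptors
    T, N, _ = X.shape
    D = Wz.shape[0]
    S = np.zeros((T + 1, N, D))
    if s0 is not None:
        S[0] = s0
    for t in range(T):
        z = sigmoid(X[t] @ Uz + S[t] @ Wz + bz)          # update gate
        r = sigmoid(X[t] @ Ur + S[t] @ Wr + br)          # reset gate
        h = np.tanh(X[t] @ Uh + (r * S[t]) @ Wh + bh)    # candidate state
        S[t + 1] = (1 - z) * h + z * S[t]
    return S
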
Example #4
def conv_nd(x, filter_bank, *, stride, padding=0, dilation=1, constant=False):
    """ Use `filter_bank` to perform strided N-dimensional neural network-style
        convolutions (see Notes) over `x`.::

                f(x, w) -> x ⋆ w

                shapes:
                (N, C, X0, ...) ⋆ (F, C, W0, ...) -> (N, F, G0, ...)

        ``x`` represents a batch of data over which the filters
        are convolved. Specifically, it must be a tensor of shape
        :math:`(N, C, X_0, ...)`, where :math:`N` is the number of samples in the batch,
        C is the channel-depth of each datum, and :math:`(X_0, ...)` are the
        dimensions over which the filters are convolved. Accordingly,
        each filter must have a channel depth of :math:`C`.

        Thus convolving :math:`F` filters, each with a shape :math:`(C, W_0, ...)`,
        over the data batch will produce a tensor of shape
        :math:`(N, F, G_0, ...)`, where :math:`(G_0, ...)` is the shape of the grid
        commensurate with the filter placements.

        Parameters
        ----------
        x : Union[Tensor, array_like], shape=(N, C, Xo, ...)
            The data batch to be convolved over.

        filter_bank : Union[Tensor, array_like], shape=(F, C, Wo, ...)
            The filters used to perform the convolutions.

        stride : Union[int, Tuple[int, ...]]
            (keyword-only argument) The step-size with which each
            filter is placed along each convolved axis during the
            convolution. The tuple indicates (stride-0, ...). If a
            single integer is provided, this stride is used for all
            convolved dimensions

        padding : Union[int, Tuple[int, ...]]
            (keyword-only argument) The number of zeros to be padded 
            to both ends of each convolved dimension, respectively. 
            If a single integer is provided, this padding is used for 
            all of the convolved axes

        dilation : Union[int, Tuple[int, ...]], optional (default=1)
            (keyword-only argument) The spacing used when placing kernel
            elements along the data. E.g. for a 1D convolution, the i-th
            placement of the kernel is multiplied against the dilated window:
            ``x[:, :, i*s:(i*s + w*d):d]``, where ``s`` is
            the stride, ``w`` is the kernel-size, and ``d`` is the dilation factor.

            If a single integer is provided, that dilation value is used for all
            of the convolved axes

        constant : bool, optional (default=False)
            If True, the resulting Tensor is a constant.

        Returns
        -------
        Tensor, shape=(N, F, G0, ...)
            The result of each filter being convolved over each datum in
            the batch.

        Notes
        -----
         - The filters are *not* flipped by this operation, meaning that
           an auto-correlation is being performed rather than a true convolution.

         - Only 'valid' filter placements are permitted - where the filters overlap
           completely with the (padded) data.

         - This is a "scalar-only" operation, meaning that back propagation through
           this layer assumes that a scalar (i.e. a 0-dimensional tensor) will invoke
           ``tensor.backward()`` for the computational graph. This is standard for a
           neural network, which terminates in a scalar loss.

        Examples
        --------
        Here we perform a 1D convolution of a constant-valued kernel, ``k``, with a
        'square-wave' signal, ``x``, using stride-1. Note that because we are constrained
        to doing deep learning-style convolutions, we prepend the dimensions
        :math:`(N=1, C=1)` to ``x``, and :math:`(F=1, C=1)` to ``k``. That is,
        we are performing a convolution on one, single-channeled signal using
        one kernel.

        See that this convolution produces the expected triangle-shaped
        response. The shape of the resulting tensor is :math:`(N=1, F=1, G_0=12)`.
        That is, the length-5 kernel can be placed in 12 valid positions, using a
        stride of 1.

        >>> import mygrad as mg
        >>> from mygrad.nnet import conv_nd
        >>> x = mg.zeros((1, 1, 16))  # a square-wave signal
        >>> x[..., 5:11] = 1
        >>> k = mg.ones((1, 1, 5))    # a constant-valued kernel
        >>> conv_nd(x, k, stride=1)   # performing a stride-1, 1D convolution
        Tensor([[[0., 1., 2., 3., 4., 5., 5., 4., 3., 2., 1., 0.]]], dtype=float32)

        Back-propagating through the (summed) convolution:

        >>> conv_nd(x, k, stride=1).sum().backward()  # sum to a scalar to perform back-prop
        >>> x.grad  # d(summed_conv)/dx
        array([[[1., 2., 3., 4., 5., 5., 5., 5., 5., 5., 5., 5., 4., 3., 2., 1.]]],
              dtype=float32)
        >>> k.grad  # d(summed_conv)/dk
        array([[[6., 6., 6., 6., 6.]]])

        Now, let's demonstrate a more typical usage for ``conv_nd`` in the context of
        neural networks. ``x`` will represent 10, 32x32 RGB images, and we will use
        5 distinct 2x2 kernels to convolve over each of these images. Note that
        each kernel must possess 3 channels - one for each RGB channel.

        That is, we will be performing NxF channel-wise 2D convolutions. Supposing
        that we don't want the kernel placements to overlap, we can use a stride of 2. In
        total, this will produce a shape-:math:`(N=10, F=5, G_0=16, G_1=16)` tensor as a
        result.

        >>> import numpy as np
        >>> x = mg.Tensor(np.random.rand(10, 3, 32, 32))  # creating 10 random 32x32 RGB images
        >>> k = mg.Tensor(np.random.rand(5, 3, 2, 2))     # creating 5 random 3-channel 2x2 kernels

        Given the shapes of ``x`` and ``k``, ``conv_nd`` automatically executes a 2D convolution:

        >>> conv_nd(x, k, stride=2).shape
        (10, 5, 16, 16)

        Extrapolating further, ``conv_nd`` is capable of performing ND convolutions!
        """
    return Tensor._op(
        ConvND,
        x,
        filter_bank,
        op_kwargs=dict(stride=stride, padding=padding, dilation=dilation),
        constant=constant,
    )
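
The "valid placement" rule above determines the output-grid shape. Here is a small illustrative helper (the name ``conv_out_size`` is ours, not part of MyGrad) that reproduces the shapes seen in the docstring examples:

def conv_out_size(x_len, w_len, *, stride=1, padding=0, dilation=1):
    # number of valid placements of a dilated length-`w_len` kernel along one axis
    span = dilation * (w_len - 1) + 1   # extent spanned by the dilated kernel
    return (x_len + 2 * padding - span) // stride + 1

print(conv_out_size(16, 5, stride=1))   # -> 12, the 1D square-wave example
print(conv_out_size(32, 2, stride=2))   # -> 16, the 32x32 RGB-image example
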
Example #5
def softmax_crossentropy(x, y_true, constant=False):
    r""" Given the classification scores of C classes for N pieces of data,

    computes the NxC softmax classification probabilities. The
    cross entropy is then computed by using the true classification labels.

    log-softmax is used for improved numerical stability.

    Parameters
    ----------
    x : array_like, shape=(N, C)
        The C class scores for each of the N pieces of data.

    y_true : array_like, shape=(N,)
        The correct class-indices, in [0, C), for each datum.

    constant : bool, optional(default=False)
        If ``True``, the returned tensor is a constant (it
        does not back-propagate a gradient)

    Returns
    -------
    mygrad.Tensor
        The average softmax cross-entropy loss.

    Raises
    ------
    ValueError
        Bad dimensionalities for ``x`` or ``y_true``

    Notes
    -----
    - :math:`N` is the number of samples in the batch.
    - :math:`C` is the number of possible classes for which scores are provided.

    Given the shape-:math:`(N, C)` tensor of scores, ``x``, the softmax classification
    probabilities are computed. That is, the score for class-:math:`k` of a given datum
    (:math:`s_{k}`) is normalized using the 'softmax' transformation:

    .. math::

        p_{k} = \frac{e^{s_k}}{\sum_{i=1}^{C}{e^{s_i}}}

    This produces the "prediction probability distribution", :math:`p`, for each datum.
    The cross-entropy loss for that datum is then computed according to the true class-index
    for that datum, as reported in ``y_true``. That is, the "true probability distribution",
    :math:`t`, for the datum is :math:`1` for the correct class-index and :math:`0` elsewhere.

    The cross-entropy loss for that datum is thus:

    .. math::
       l = - \sum_{k=1}^{C}{t_{k} \log{p_{k}}}

    Having computed each per-datum cross entropy loss, this function then returns the loss
    averaged over all :math:`N` pieces of data:

    .. math::

       L = \frac{1}{N}\sum_{i=1}^{N}{l_{i}}

    Examples
    --------
    >>> import mygrad as mg
    >>> from mygrad.nnet import softmax_crossentropy

    Let's take a simple case where N=1, and C=3. We'll thus make up classification
    scores for a single datum. Suppose the scores are identical for the three classes
    and that the true class is class-0:

    >>> x = mg.Tensor([[2., 2., 2.]])  # a shape-(1, 3) tensor of scores
    >>> y_true = mg.Tensor([0])  # the correct class for this datum is class-0

    Because the scores are identical for all three classes, the softmax normalization
    will simply produce :math:`p = [\frac{1}{3}, \frac{1}{3}, \frac{1}{3}]`. Because
    class-0 is the "true" class, :math:`t = [1., 0., 0.]`. Thus our softmax cross-entropy
    loss should be:

    .. math::
      -(1 \times \log{\frac{1}{3}} + 0 \times \log{\frac{1}{3}} + 0 \times \log{\frac{1}{3}})
      = \log(3) \approx 1.099

    Let's see that this is what ``softmax_crossentropy`` returns:

    >>> softmax_crossentropy(x, y_true)
    Tensor(1.09861229)

    Similarly, suppose a datum's scores are :math:`[0, 0, 10^6]`, then the softmax normalization
    will return :math:`p \approx [0., 0., 1.]`. If the true class for this datum is class-2, then
    the loss should be nearly 0, since :math:`p` and :math:`t` are essentially identical:

    .. math::
      -(0 \times \log{0} + 0 \times \log{0} + 1 \times \log{1})
      = -\log(1) = 0

    Now, let's construct ``x`` and ``y_true`` so that they incorporate the scores/labels for
    both of the data that we have considered:

    >>> x = mg.Tensor([[2., 2.,  2.],  # a shape-(2, 3) tensor of scores
    ...                [0., 0., 1E6]])
    >>> y_true = mg.Tensor([0, 2])     # the class IDs for the two data

    ``softmax_crossentropy(x, y_true)`` will return the average loss of these two data,
    :math:`\frac{1}{2}(1.099 + 0) \approx 0.55`:

    >>> softmax_crossentropy(x, y_true)
    Tensor(0.54930614)
    """
    return Tensor._op(SoftmaxCrossEntropy,
                      x,
                      op_args=(y_true, ),
                      constant=constant)
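
For reference, a NumPy-only sketch of the computation described in the Notes, using a numerically stable log-softmax (an illustration only, not MyGrad's ``SoftmaxCrossEntropy`` op; the name ``softmax_crossentropy_reference`` is ours):

import numpy as np

def softmax_crossentropy_reference(x, y_true):
    # x: shape-(N, C) scores; y_true: shape-(N,) integer class indices
    x = np.asarray(x, dtype=float)
    shifted = x - x.max(axis=1, keepdims=True)   # subtract the row-max for stability
    log_softmax = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_softmax[np.arange(len(y_true)), y_true].mean()

print(softmax_crossentropy_reference([[2., 2., 2.], [0., 0., 1e6]], [0, 2]))  # ~0.5493
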
Example #6
def dummy(a, b, constant=False):
    """ Applies the ``Dummy`` operation to ``a`` and ``b``, returning the resulting Tensor. """
    return Tensor._op(Dummy, a, b, constant=constant)
Example #7
def max_pool(x, pool, stride, constant=False):
    """ Perform max-pooling over the last N dimensions of a data batch.

    Extended Summary
    ----------------
    The data consists of N trailing axes to be pooled over, denoted by ``C0, ...``. These
    can be preceded, optionally, by un-pooled axes, denoted by ``(N0, ...)``. The dimensions
    of the window over which pooling is performed is denoted by ``P0, ...``. The window
    is placed with stride values ``S0, ...``.

    Ultimately the pooled channels have a shape ``G0, ...``.

    Parameters
    ----------
    x : mygrad.Tensor, shape=([...], C0, ...)
        The data batch; to be pooled along the trailing axes denoted by ``C0, ...``.

    pool : Tuple[Integral, ...], (P0, ...)
        The extent of the pooling window along the ``(C0, ...)`` axes, respectively. The
        length of `pool` determines ``N`` - the number of trailing dimensions to pool over.

    stride : Union[Integral, Tuple[Integral, ...]], (S0, ...)
        The spacing used to place the pooling window along the ``(C0, ...)`` axes, respectively.
        If a single value is provided, it is used for all ``N`` pooling axes.

    Returns
    -------
    mygrad.Tensor, shape=([...], G0, ...)
        The pooled data batch.

    Notes
    -----
    Only "valid" placements of the pooling window are permitted - the pooling
    window cannot extend past the "boundaries" of the data
    dimensions.

    Examples
    --------
    Simple 2D pooling on a 2D tensor. Tiling a 2x2 max-pool window with
    stride-1 over a shape-(3, 3) tensor ``x``:

    >>> import mygrad as mg
    >>> from mygrad.nnet import max_pool
    >>> x = mg.Tensor([[0., 10.,  8.],
    ...                [2.,  7.,  3.],
    ...                [5.,  7., 20.]])
    >>> out = max_pool(x, pool=(2, 2), stride=1)
    >>> out
    Tensor([[ 10., 10.],
            [  7., 20.]])
    >>> out.sum().backward()  # sum to reduce to scalar for back-prop
    >>> x.grad  # dout/dx
    array([[0., 2., 0.],
           [0., 1., 0.],
           [0., 0., 1.]])

    Let's perform 1D pooling on a 2D tensor. Each row of the tensor
    will be pooled over independently. Let's apply a size-2 max-pool
    window to each row of ``x``, using a stride of 1:

    >>> x = mg.Tensor([[0., 10., 8.],
    ...                [9., 7.,  3.],
    ...                [5., 0., 20.]])
    >>> max_pool(x, pool=(2,), stride=1)
    Tensor([[10., 10.],
            [ 9.,  7.],
            [ 5., 20.]])

    Here we perform pooling over the trailing two dimensions of a
    4D tensor, ``x``. By specifying ``pool = (2, 2)``, we instruct
    ``max_pool`` to tile a 2x2 pooling window along these last two
    axes. Let's place the window every two rows and at every column;
    i.e. we specify ``stride = (2, 1)``:

    >>> import numpy as np
    >>> x = mg.Tensor(np.random.rand(10, 3, 12, 12))
    >>> pool = (2, 2)   # 2x2 pooling over the last axes
    >>> stride = (2, 1) # stride-2 along the rows, stride-1 along the columns
    >>> out = max_pool(x, pool, stride)  # max-pooled Tensor
    >>> out.shape
    (10, 3, 6, 11)

    Had we specified, say, ``pool = (3, 2, 2)``, then a 3x2x2
    pooling window would have been tiled along the last *three* axes
    of ``x``.
    """
    return Tensor._op(MaxPoolND, x, op_args=(pool, stride), constant=constant)
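
As with ``conv_nd``, the number of valid window placements determines the pooled shape. A small illustrative helper (``pooled_size`` is ours, not part of MyGrad) that matches the shape-(10, 3, 6, 11) result above:

def pooled_size(c_len, p_len, stride):
    # number of valid placements of a length-`p_len` window along one pooled axis
    return (c_len - p_len) // stride + 1

print(pooled_size(12, 2, 2), pooled_size(12, 2, 1))   # -> 6 11
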