Code example #1
File: schedulers.py  Project: Tommliu/mx-ml
def gaussian_cdf(x, mean, var):
    """
    Compute the probability that a random draw from a 1D Gaussian with mean
    `mean` and variance `var` is less than or equal to `x`.
    """
    eps = np.finfo(float).eps
    x_scaled = (x - mean) / np.sqrt(var + eps)
    return (1 + erf(x_scaled / np.sqrt(2))) / 2
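A quick way to sanity-check this helper (a minimal sketch, assuming NumPy and SciPy are available; `scipy.stats.norm.cdf` is used here only as a reference and is not part of the snippet above):

import numpy as np
from scipy.special import erf
from scipy.stats import norm

# the closed-form CDF above should agree with scipy's reference implementation
x, mean, var = 1.3, -0.5, 2.0
ours = gaussian_cdf(x, mean, var)
ref = norm.cdf(x, loc=mean, scale=np.sqrt(var))
assert np.isclose(ours, ref, atol=1e-6)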
Code example #2
File: dsp.py  Project: Tommliu/mx-ml
def DCT(frame, orthonormal=True):
    """
    A naive :math:`O(N^2)` implementation of the 1D discrete cosine transform-II
    (DCT-II).

    Notes
    -----
    For a signal :math:`\mathbf{x} = [x_0, \ldots, x_{N-1}]` consisting of `N`
    samples, the `k` th DCT coefficient, :math:`c_k`, is

    .. math::

        c_k = 2 \sum_{n=0}^{N-1} x_n \cos(\pi k (2 n + 1) / (2 N))

    where `k` ranges from :math:`0, \ldots, N-1`.

    The DCT is highly similar to the DFT -- whereas in a DFT the basis
    functions are sinusoids, in a DCT they are restricted solely to cosines. A
    signal's DCT representation tends to have more of its energy concentrated
    in a smaller number of coefficients when compared to the DFT, and is thus
    commonly used for signal compression. [1]

    .. [1] Smoother signals can be accurately approximated using fewer DFT / DCT
       coefficients, resulting in a higher compression ratio. Because the DCT
       uses even (cosine) basis functions, it implicitly extends the signal
       evenly at its boundaries, yielding a continuous extension. This extension
       is smoother than the periodic extension implied by the DFT and therefore
       compresses better.

    Parameters
    ----------
    frame : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A signal frame consisting of N samples
    orthonormal : bool
        Scale to ensure the coefficient vector is orthonormal. Default is True.

    Returns
    -------
    dct : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The discrete cosine transform of the samples in `frame`.
    """
    N = len(frame)
    out = np.zeros_like(frame)
    for k in range(N):
        for (n, xn) in enumerate(frame):
            out[k] += xn * np.cos(np.pi * k * (2 * n + 1) / (2 * N))
        scale = np.sqrt(1 / (4 * N)) if k == 0 else np.sqrt(1 / (2 * N))
        out[k] *= 2 * scale if orthonormal else 2
    return out
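As a rough correctness check (a sketch, assuming SciPy is installed), the ``orthonormal=True`` output should match SciPy's type-II DCT with ``norm='ortho'``:

import numpy as np
from scipy.fft import dct as scipy_dct

frame = np.random.randn(16)
ours = DCT(frame, orthonormal=True)
ref = scipy_dct(frame, type=2, norm="ortho")
assert np.allclose(ours, ref)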
Code example #3
def glorot_normal(weight_shape, gain=1.0):
    """
    Initialize network weights `W` using the Glorot normal initialization strategy.

    Notes
    -----
    The Glorot normal initialization strategy initializes weights with draws from
    TruncatedNormal(0, b) where the variance `b` is

    .. math::

        b = \\frac{2 \\text{gain}^2}{\\text{fan_in} + \\text{fan_out}}

    The motivation for Glorot normal initialization is to choose weights that
    ensure the variance of a layer's outputs is approximately equal to the
    variance of its inputs.

    This initialization strategy was primarily developed for deep networks with
    :class:`~numpy_ml.neural_nets.activations.Tanh` and
    :class:`~numpy_ml.neural_nets.activations.Sigmoid` nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.
    gain : float
        A multiplicative scaling factor applied to the weight standard
        deviation. Default is 1.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    fan_in, fan_out = calc_fan(weight_shape)
    std = gain * np.sqrt(2 / (fan_in + fan_out))
    return truncated_normal(0, std, weight_shape)
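The snippet relies on two helpers that are not shown here, `calc_fan` and `truncated_normal`. Below is a minimal sketch of plausible implementations (assumptions for illustration, not the project's actual code): `calc_fan` reads fan-in/fan-out from the weight shape, and `truncated_normal` rejection-samples draws within two standard deviations of the mean.

import numpy as np

def calc_fan(weight_shape):
    # 2D weight matrix: (fan_in, fan_out) is just the shape.
    # Convolutional volume (rows, cols, in_ch, out_ch): scale the channel
    # counts by the size of the receptive field.
    if len(weight_shape) == 2:
        fan_in, fan_out = weight_shape
    else:
        receptive_field = int(np.prod(weight_shape[:-2]))
        fan_in = weight_shape[-2] * receptive_field
        fan_out = weight_shape[-1] * receptive_field
    return fan_in, fan_out

def truncated_normal(mean, std, out_shape):
    # Rejection-sample a Gaussian truncated at two standard deviations.
    samples = np.random.normal(loc=mean, scale=std, size=out_shape)
    reject = np.abs(samples - mean) > 2 * std
    while reject.any():
        samples[reject] = np.random.normal(loc=mean, scale=std, size=reject.sum())
        reject = np.abs(samples - mean) > 2 * std
    return samples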
Code example #4
def glorot_uniform(weight_shape, gain=1.0):
    """
    Initialize network weights `W` using the Glorot uniform initialization
    strategy.

    Notes
    -----
    The Glorot uniform initialization strategy initializes weights using draws
    from ``Uniform(-b, b)`` where:

    .. math::

        b = \\text{gain} \sqrt{\\frac{6}{\\text{fan_in} + \\text{fan_out}}}

    The motivation for Glorot uniform initialization is to choose weights that
    ensure the variance of a layer's outputs is approximately equal to the
    variance of its inputs.

    This initialization strategy was primarily developed for deep networks with
    tanh and logistic sigmoid nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.
    gain : float
        A multiplicative scaling factor applied to the bound `b` of the uniform
        distribution. Default is 1.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    fan_in, fan_out = calc_fan(weight_shape)
    b = gain * np.sqrt(6 / (fan_in + fan_out))
    return np.random.uniform(-b, b, size=weight_shape)
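A quick empirical check (a sketch, relying on the `calc_fan` sketch above): draws from ``Uniform(-b, b)`` have variance ``b**2 / 3 = 2 * gain**2 / (fan_in + fan_out)``, i.e. the same variance targeted by `glorot_normal`.

import numpy as np

np.random.seed(0)
fan_in, fan_out = 500, 300
W = glorot_uniform((fan_in, fan_out), gain=1.0)
target_var = 2 / (fan_in + fan_out)   # 0.0025
print(W.var(), target_var)            # empirical variance is close to the target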
Code example #5
def he_normal(weight_shape):
    """
    Initialize network weights `W` using the He normal initialization strategy.

    Notes
    -----
    The He normal initialization strategy initializes the weights in `W` using
    draws from TruncatedNormal(0, b) where the variance `b` is

    .. math::

        b = \\frac{2}{\\text{fan_in}}

    He normal initialization was originally developed for deep networks with
    :class:`~numpy_ml.neural_nets.activations.ReLU` nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    fan_in, fan_out = calc_fan(weight_shape)
    std = np.sqrt(2 / fan_in)
    return truncated_normal(0, std, weight_shape)
Code example #6
def he_uniform(weight_shape):
    """
    Initialize network weights `W` using the He uniform initialization
    strategy.

    Notes
    -----
    The He uniform initialization strategy initializes the weights in `W` using
    draws from Uniform(-b, b) where

    .. math::

        b = \sqrt{\\frac{6}{\\text{fan_in}}}

    This initialization strategy was developed for deep networks with ReLU
    nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    fan_in, fan_out = calc_fan(weight_shape)
    b = np.sqrt(6 / fan_in)
    return np.random.uniform(-b, b, size=weight_shape)
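To see why the `2 / fan_in` scaling is paired with ReLU, here is a rough demonstration (a sketch using the functions above, not part of the original code): with He-style weights, the mean squared activation stays roughly constant through a deep stack of ReLU layers instead of exploding or vanishing.

import numpy as np

np.random.seed(0)
x = np.random.randn(512)
for _ in range(20):
    W = he_uniform((x.shape[0], 512))  # weight_shape = (fan_in, fan_out)
    x = np.maximum(0.0, x @ W)         # ReLU(x W)
print(np.mean(x ** 2))                 # stays O(1) across 20 layers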
Code example #7
    def update(self, param, param_grad, param_name, cur_loss=None):
        """
        Compute the Adam update for a given parameter.

        Parameters
        ----------
        param : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The value of the parameter to be updated.
        param_grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The gradient of the loss function with respect to `param_name`.
        param_name : str
            The name of the parameter.
        cur_loss : float
            The training or validation loss for the current minibatch. Used for
            learning rate scheduling e.g., by
            :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`. Default is
            None.

        Returns
        -------
        updated_params : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The value of `param` after applying the Adam update.
        """
        C = self.cache
        H = self.hyperparameters
        d1, d2 = H["decay1"], H["decay2"]
        eps, clip_norm = H["eps"], H["clip_norm"]
        lr = self.lr_scheduler(self.cur_step, cur_loss)

        if param_name not in C:
            C[param_name] = {
                "t": 0,
                "mean": np.zeros_like(param_grad),
                "var": np.zeros_like(param_grad),
            }

        # scale gradient to avoid explosion
        t = np.inf if clip_norm is None else clip_norm
        if norm(param_grad) > t:
            param_grad = param_grad * t / norm(param_grad)

        t = C[param_name]["t"] + 1
        var = C[param_name]["var"]
        mean = C[param_name]["mean"]

        # update cache
        C[param_name]["t"] = t
        C[param_name]["var"] = d2 * var + (1 - d2) * param_grad**2
        C[param_name]["mean"] = d1 * mean + (1 - d1) * param_grad
        self.cache = C

        # calc unbiased moment estimates and Adam update
        v_hat = C[param_name]["var"] / (1 - d2**t)
        m_hat = C[param_name]["mean"] / (1 - d1**t)
        update = lr * m_hat / (np.sqrt(v_hat) + eps)
        return param - update
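The moving pieces above (moment caches, bias correction, the scaled step) are easiest to see in isolation. The ``adam_step`` function below is a hypothetical standalone helper written for illustration, not part of the optimizer class; it mirrors the same update applied to a single scalar parameter.

import numpy as np

def adam_step(param, grad, state, lr=0.01, d1=0.9, d2=0.999, eps=1e-7):
    # Update the biased first/second moment estimates, correct the bias,
    # then take a step scaled by the (bias-corrected) RMS of the gradient.
    state["t"] += 1
    state["mean"] = d1 * state["mean"] + (1 - d1) * grad
    state["var"] = d2 * state["var"] + (1 - d2) * grad ** 2
    m_hat = state["mean"] / (1 - d1 ** state["t"])
    v_hat = state["var"] / (1 - d2 ** state["t"])
    return param - lr * m_hat / (np.sqrt(v_hat) + eps), state

# minimize f(w) = (w - 3)^2 starting from w = 0
w, state = 0.0, {"t": 0, "mean": 0.0, "var": 0.0}
for _ in range(2000):
    w, state = adam_step(w, 2 * (w - 3), state)
print(round(w, 2))  # close to 3.0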
Code example #8
    def update(self, param, param_grad, param_name, cur_loss=None):
        """
        Compute the AdaGrad update for a given parameter.

        Notes
        -----
        Adjusts the learning rate of each weight based on the magnitudes of its
        gradients (big gradient -> small lr, small gradient -> big lr).

        Parameters
        ----------
        param : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The value of the parameter to be updated
        param_grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The gradient of the loss function with respect to `param_name`
        param_name : str
            The name of the parameter
        cur_loss : float or None
            The training or validation loss for the current minibatch. Used for
            learning rate scheduling e.g., by
            :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`.
            Default is None.

        Returns
        -------
        updated_params : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            The value of `param` after applying the AdaGrad update
        """
        C = self.cache
        H = self.hyperparameters
        eps, clip_norm = H["eps"], H["clip_norm"]
        lr = self.lr_scheduler(self.cur_step, cur_loss)

        if param_name not in C:
            C[param_name] = np.zeros_like(param_grad)

        # scale gradient to avoid explosion
        t = np.inf if clip_norm is None else clip_norm
        if norm(param_grad) > t:
            param_grad = param_grad * t / norm(param_grad)

        C[param_name] += param_grad**2
        update = lr * param_grad / (np.sqrt(C[param_name]) + eps)
        self.cache = C
        return param - update
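The effect of the accumulated cache is easiest to see in isolation (a small sketch, not part of the class): the effective per-coordinate step is ``lr / (sqrt(G) + eps)``, where `G` is the running sum of squared gradients, so coordinates with a history of large gradients take small steps and vice versa.

import numpy as np

lr, eps = 0.1, 1e-7
G = np.array([100.0, 1.0, 0.01])      # accumulated squared gradients per coordinate
grad = np.ones(3)                     # identical current gradient for every coordinate
step = lr * grad / (np.sqrt(G) + eps)
print(step)                           # ~[0.01, 0.1, 1.0]: larger history -> smaller step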
Code example #9
def euclidean(x, y):
    """
    Compute the Euclidean (`L2`) distance between two real vectors

    Notes
    -----
    The Euclidean distance between two vectors **x** and **y** is

    .. math::

        d(\mathbf{x}, \mathbf{y}) = \sqrt{ \sum_i (x_i - y_i)^2  }

    Parameters
    ----------
    x,y : :py:class:`ndarray <numpy.ndarray>` s of shape `(N,)`
        The two vectors to compute the distance between

    Returns
    -------
    d : float
        The L2 distance between **x** and **y**.
    """
    return np.sqrt(np.sum((x - y)**2))
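For example, the distance between the points ``(0, 0)`` and ``(3, 4)`` is 5:

import numpy as np

print(euclidean(np.array([0.0, 0.0]), np.array([3.0, 4.0])))  # 5.0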
Code example #10
File: lda.py  Project: Tommliu/mx-ml
    def _maximize_alpha(self, max_iters=1000, tol=0.1):
        """
        Optimize alpha using Blei's O(n) Newton-Raphson modification
        for a Hessian with special structure
        """
        D = self.D
        T = self.T

        alpha = self.alpha
        gamma = self.gamma

        for _ in range(max_iters):
            alpha_old = alpha

            #  Calculate gradient
            g = D * (digamma(np.sum(alpha)) - digamma(alpha)) + np.sum(
                digamma(gamma) - np.tile(digamma(np.sum(gamma, axis=1)), (T, 1)).T,
                axis=0,
            )

            #  Calculate Hessian diagonal component
            h = -D * polygamma(1, alpha)

            #  Calculate Hessian constant component
            z = D * polygamma(1, np.sum(alpha))

            #  Calculate constant
            c = np.sum(g / h) / (z ** (-1.0) + np.sum(h ** (-1.0)))

            #  Update alpha
            alpha = alpha - (g - c) / h

            #  Check convergence
            if np.sqrt(np.mean(np.square(alpha - alpha_old))) < tol:
                break

        return alpha
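The step ``alpha - (g - c) / h`` in the loop above is a full Newton-Raphson update with the Hessian inverted analytically. The Hessian with respect to :math:`\alpha` is a diagonal matrix plus a constant, :math:`H = \text{diag}(h) + z\,\mathbf{1}\mathbf{1}^\top`, so by the Sherman-Morrison formula the Newton direction can be computed coordinate-wise in :math:`O(T)` time:

.. math::

    (H^{-1} g)_k = \frac{g_k - c}{h_k},
    \qquad
    c = \frac{\sum_{j=1}^{T} g_j / h_j}{z^{-1} + \sum_{j=1}^{T} h_j^{-1}}

This is exactly the constant `c` computed in the loop; it is the "Hessian with special structure" trick Blei et al. describe for LDA's :math:`\alpha` update.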
Code example #11
    def predict(self, X, conf_interval=0.95, return_cov=False):
        """
        Return the MAP estimate for :math:`y^*`, corresponding to the mean/mode
        of the posterior predictive distribution, :math:`p(y^* \mid x^*, X, y)`.

        Notes
        -----
        Under the GP regression model, the posterior predictive distribution is

        .. math::

            y^* \mid x^*, X, y \sim \mathcal{N}(\mu^*, \\text{cov}^*)

        where

        .. math::

            \mu^*  &=  K^{*'} (K + \\alpha I)^{-1} y \\\\
            \\text{cov}^*  &=  K^{**} - K^{*'} (K + \\alpha I)^{-1} K^*

        and

        .. math::

            K  &=  \\text{kernel}(X, X) \\\\
            K^*  &=  \\text{kernel}(X, X^*) \\\\
            K^{**}  &=  \\text{kernel}(X^*, X^*)

        NB. This implementation uses the inefficient but general purpose
        `np.linalg.inv` routine to invert :math:`(K + \\alpha I)`. A more
        efficient way is to rely on the fact that `K` (and hence also :math:`K
        + \\alpha I`) is symmetric positive (semi-)definite and take the inner
        product of the inverse of its (lower) Cholesky decomposition:

        .. math::

            Q^{-1} = \\text{cholesky}(Q)^{-1 \\top} \\text{cholesky}(Q)^{-1}

        For more details on a production-grade implementation, see Algorithm
        2.1 in Rasmussen & Williams (2006).

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape (N, M)
            The collection of datapoints to generate predictions on
        conf_interval : float in (0, 1)
            The percentage confidence bound to return for each prediction. If
            the scipy package is not available, this value is always set to
            0.95. Default is 0.95.
        return_cov : bool
            If True, also return the covariance (`cov*`) of the posterior
            predictive distribution for the points in `X`. Default is False.

        Returns
        -------
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(N, O)`
            The predicted values for each point in `X`, each with
            dimensionality `O`.
        conf : :py:class:`ndarray <numpy.ndarray>` of shape `(N, O)`
            The % conf_interval confidence bound for each `y_pred`. The conf %
            confidence interval for the `i`'th prediction is ``[y[i] - conf[i],
            y[i] + conf[i]]``.
        cov : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)`
            The covariance (`cov*`) of the posterior predictive distribution for
            `X`. Only returned if `return_cov` is True.
        """
        if conf_interval != 0.95 and not _SCIPY:
            fstr = "Cannot compute {:.0%} confidence bound without scipy.stats"
            warnings.warn(fstr.format(conf_interval))

        X_star = X
        X = self.parameters["X"]
        y = self.parameters["y"]
        K = self.parameters["GP_cov"]
        alpha = self.hyperparameters["alpha"]

        K_star = self.kernel(X_star, X)
        K_star_star = self.kernel(X_star, X_star)

        sig = np.eye(K.shape[0]) * alpha
        K_y_inv = inv(K + sig)

        pp_mean = np.dot(np.dot(K_star, K_y_inv), y)

        pp_cov = K_star_star - np.dot(np.dot(K_star, K_y_inv), K_star.T)

        # if we can't use scipy, ignore the passed value for `conf_interval`
        # and return the 95% confidence bound.
        # (norm.ppf == inverse CDF for standard normal)
        # use the two-sided quantile so that conf_interval=0.95 recovers 1.96
        percentile = 1.96 if not _SCIPY else norm.ppf(1 - (1 - conf_interval) / 2)
        conf = percentile * np.sqrt(np.diag(pp_cov))
        return (pp_mean, conf) if not return_cov else (pp_mean, conf, pp_cov)
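Following the note above, here is a minimal sketch of the Cholesky-based alternative (assuming SciPy; ``gp_posterior_cholesky`` is a hypothetical helper written for illustration, with `K_star` defined as ``kernel(X_star, X)`` exactly as in the method):

import numpy as np
from scipy.linalg import cho_factor, cho_solve

def gp_posterior_cholesky(K, K_star, K_star_star, y, alpha):
    # Factor (K + alpha * I) once, then solve the two linear systems instead of
    # forming an explicit inverse (cf. Algorithm 2.1, Rasmussen & Williams 2006).
    L = cho_factor(K + alpha * np.eye(K.shape[0]), lower=True)
    mean = K_star @ cho_solve(L, y)
    cov = K_star_star - K_star @ cho_solve(L, K_star.T)
    return mean, cov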