Example 1
    def __init__(self, x, y=None):
        self.x = _create_array(x)[0]
        if y is not None:
            self.y = _create_array(y)[0]
            self.x = np.column_stack([self.x, self.y])

        self.n, self.m = self.x.shape
        self.method = 'two_pass_covariance'
        self.cov = np.empty([self.m, self.m])
Example 2
def isskewsymmetric(x):
    r"""
    Tests if a matrix is skew symmetric.

    Parameters
    ----------
    x : array_like
        Accepts a numpy array, nested list, dictionary, or
        pandas DataFrame. The private function _create_array
        is called to create a copy of x as a numpy array.

    Returns
    -------
    Boolean
        Returns True if matrix is antisymmetric

    Notes
    -----
    Skew symmetric matrices are also known as antisymmetric. A skew symmetric matrix is equal to its
    negative transpose.

    .. math::

        A = -A^T

    :math:`a_{ij}` must equal :math:`-a_{ji}`; thus the diagonal of a skew symmetric matrix must be 0,
    as :math:`a_{jj} = -a_{jj}`.

    .. math::

        \begin{bmatrix}
            0 & a_{12} & a_{13} \\
            -a_{12} & 0 & a_{23} \\
            -a_{13} & -a_{23} & 0
        \end{bmatrix}

    Examples
    --------
    >>> m = pd.DataFrame({0: [0,-2,1], 1: [2,0,4], 2: [-1,-4,0]})
    >>> isskewsymmetric(m)
    True
    >>> m2 = pd.DataFrame({0: [2,-1,0], 1: [-1,2,-1], 2: [0,-1,2]})
    >>> isskewsymmetric(m2)
    False

    References
    ----------
    Rowland, Todd and Weisstein, Eric W. "Antisymmetric Matrix." From MathWorld--A Wolfram Web Resource.
        http://mathworld.wolfram.com/AntisymmetricMatrix.html

    """
    x = _create_array(x)[0]

    if x.shape[0] != x.shape[1]:
        return False

    if not np.allclose(x, -np.transpose(x)):
        return False

    return True
Example 3
def var_cond(x):
    r"""
    Calculates the condition number, denoted :math:`\kappa`, which
    measures the sensitivity of the variance :math:`S` of a sample
    vector :math:`x`, as defined by Chan and Lewis (as cited in Chan,
    Golub, & LeVeque, 1983). Given a machine accuracy value
    :math:`u`, the value :math:`\kappa u` can be used as a measure for
    judging the accuracy of the different variance computation algorithms.

    Parameters
    ----------
    x : array_like
        Accepts a numpy array, nested list, dictionary, or
        pandas DataFrame. The private function _create_array
        is called to create a copy of x as a numpy array.

    Returns
    -------
    kap_cond : float or numpy ndarray
        The condition number of the input vector when the input is 1D, or a 1D
        array of the column-wise condition numbers when the input is 2D.

    Notes
    -----
    The 2-norm is defined as usual:

    .. math::

        ||x||_2 = \sqrt{\sum^N_{i=1} x^2_i}

    Then the condition number :math:`\kappa` is defined as:

    .. math::

        \kappa = \frac{||x||_2}{\sqrt{S}} = \sqrt{1 + \bar{x}^2 N / S}

    References
    ----------
    Chan, T., Golub, G., & LeVeque, R. (1983). Algorithms for Computing the Sample Variance:
        Analysis and Recommendations. The American Statistician, 37(3), 242-247.
        http://dx.doi.org/10.1080/00031305.1983.10483115

    """
    x = _create_array(x)[0]

    if x.ndim == 1:
        kap_cond = norm(x) / np.sqrt(var(x))

    elif x.ndim == 2:
        kap_cond = np.empty(x.shape[1])
        # compute the condition number column by column
        for j, col in enumerate(x.T):
            kap_cond[j] = norm(col) / np.sqrt(var(col))

    else:
        raise ValueError('array must be 1D or 2D')

    return kap_cond
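
The identity in the Notes is straightforward to verify with plain numpy. Below is a minimal standalone sketch (an assumption here: :math:`S` is taken as the centered sum of squares, as in Chan, Golub & LeVeque, so the values differ from var_cond above, which divides by the sample variance); it also shows why data far from zero is ill-conditioned for variance computations:

import numpy as np

def kappa(x):
    # kappa = ||x||_2 / sqrt(S), with S the centered sum of squares
    x = np.asarray(x, dtype=float)
    s = np.sum((x - x.mean()) ** 2)
    return np.linalg.norm(x) / np.sqrt(s)

rng = np.random.default_rng(7)
x = rng.standard_normal(1000)

print(kappa(x))        # near 1 for mean-centered data
print(kappa(x + 1e6))  # huge: shifted data is badly conditioned

# equivalent closed form: sqrt(1 + mean(x)^2 * N / S)
s = np.sum((x - x.mean()) ** 2)
print(np.sqrt(1 + x.mean() ** 2 * len(x) / s))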
Example 4
    def __init__(self, x):
        self.x = _create_array(x)[0]
        self.m = self.x.shape[0]
        self.n = self.x.shape[1]
        self.r = np.zeros((self.n, self.n))
        self.q = np.zeros((self.m, self.n))
        self.method = 'householder'
Example 5
    def __init__(self, x, factors=None, rotate=None, covar=False):
        r"""
        Initializes the FactorAnalysis class.

        Parameters
        ----------
        x : array-like
            Numpy ndarray, pandas DataFrame, list of lists or dictionary (keys are column
            names and corresponding values are the column values) representing observation
            vectors
        factors : int, default None
            Number of underlying hypothetical factors
        rotate : str, default None
            Rotation to use when performing the factor analysis. Currently not used.
        covar : boolean, default False
            If False (default), the factor analysis is computed with the correlation
            matrix. If True, the covariance matrix is used instead. Using the
            correlation matrix is highly recommended in the vast majority of cases,
            as variables with comparatively large variances can otherwise dominate
            the diagonal of the covariance matrix and the factors.

        """
        self.x = _create_array(x)[0]

        if factors is None:
            # assumption: default to one factor per observed variable when none is given
            factors = self.x.shape[1]
        self.factors = int(factors)

        if self.factors > self.x.shape[1]:
            raise ValueError('number of factors cannot exceed number of variables')

        self.rotate = rotate
        self.covar = covar
        self.method = 'principal_component'
Example 6
def issymmetric(x):
    r"""
    Tests if a matrix is symmetric.

    Parameters
    ----------
    x : array_like
        Accepts a numpy array, nested list, dictionary, or
        pandas DataFrame. The private function _create_array
        is called to create a copy of x as a numpy array.

    Returns
    -------
    Boolean
        Returns True if matrix is symmetric

    Notes
    -----
    A symmetric matrix is defined as a square matrix that is equal to its transpose.

    .. math::

        A \in \mathbb{R}^{n \times n} \qquad A^T = A

    A symmetric matrix has the following form:

    .. math::

        \begin{bmatrix}
            a_{11} & a_{12} & \cdots & a_{1n} \\
            a_{12} & a_{22} & \cdots & a_{2n} \\
            \vdots & \vdots & \ddots & \vdots \\
            a_{1n} & a_{2n} & \cdots & a_{nn}
        \end{bmatrix}

    Examples
    --------
    >>> m = pd.DataFrame({0: [2,-1,0], 1: [-1,2,-1], 2: [0,-1,2]})
    >>> issymmetric(m)
    True

    References
    ----------
    Golub, G., & Van Loan, C. (2013). Matrix computations (3rd ed.). Baltimore (MD): Johns Hopkins U.P.

    Weisstein, Eric W. "Symmetric Matrix." From MathWorld--A Wolfram Web Resource.
        http://mathworld.wolfram.com/SymmetricMatrix.html

    """
    x = _create_array(x)[0]

    if x.shape[0] != x.shape[1]:
        return False

    if not np.allclose(np.transpose(x), x):
        return False

    return True
Example 7
    def __init__(self, x):
        self.type = x.__class__.__name__
        self.x, self.cols = _create_array(x)

        if self.x.ndim > 2:
            raise ValueError('array must be 1D or 2D')

        self.dim = self.x.ndim
        self.n = self.x.shape[0]
        self.method = 'corrected_two_pass'
Example 8
def isorthogonal(x):
    r"""
    Tests if a matrix is orthogonal.

    Parameters
    ----------
    x : array_like
        Accepts a numpy array, nested list, dictionary, or
        pandas DataFrame. The private function _create_array
        is called to create a copy of x as a numpy array.

    Returns
    -------
    Boolean
        Returns True if matrix is orthogonal

    Notes
    -----
    A square matrix :math:`A` is said to be orthogonal if:

    .. math::

        AA^T = I

    Where :math:`A^T` is the transpose of :math:`A` and :math:`I` is the identity matrix.

    The following matrix is orthogonal:

    .. math::

        A = \begin{bmatrix}\frac{1}{3} & -\frac{2}{3} & \frac{2}{3} \\
        \frac{2}{3} & -\frac{1}{3} & -\frac{2}{3} \\
        \frac{2}{3} & \frac{2}{3} & \frac{1}{3} \end{bmatrix}

    Examples
    --------
    >>> a = pd.DataFrame({0: [1/3, 2/3, 2/3], 1: [-2/3, -1/3, 2/3], 2: [2/3,-2/3,1/3]})
    >>> isorthogonal(a)
    True

    References
    ----------
    Rowland, Todd. "Orthogonal Matrix." From MathWorld--A Wolfram Web Resource, created by Eric W. Weisstein.
        http://mathworld.wolfram.com/OrthogonalMatrix.html

    """
    x = _create_array(x)[0]

    if x.shape[0] != x.shape[1]:
        return False

    return np.allclose(np.dot(x, x.T), np.eye(x.shape[0]))
Example 9

def add_noise(cor, epsilon=None, M=None):
    x = _create_array(cor)[0]
    n = x.shape[1]

    if epsilon is None:
        epsilon = 0.05
    if M is None:
        M = 2

    # operate on the array copy rather than mutating the caller's input
    np.fill_diagonal(x, 1 - epsilon)

    x = _CorMatrixSim._generate_noise(x, n, M, epsilon)

    return x
Example 10

def lagrange_interpolate(x, y):
    r"""
    Interpolates a polynomial given a set of equal-length x and y values using
    Lagrangian interpolation.

    Parameters
    ----------
    x
        One-dimensional array of x values. Can be a pandas DataFrame or Series,
        list, dictionary (first key-value pair is used if there are more than one),
        or numpy array. Must be same length as y.
    y
        One-dimensional array of y values. Can be a pandas DataFrame or Series,
        list, dictionary (first key-value pair is used if there are more than one),
        or numpy array. Must be same length as x.

    Returns
    -------
    Symbolic representation of interpolating polynomial.

    Notes
    -----
    The Lagrangian method of polynomial interpolation uses Lagrangian polynomials
    to fit a polynomial to a given set of data points. The Lagrange interpolating
    polynomial is given by the following theorem:

    For a set of data points :math:`(x_0, y_0), (x_1, y_1), \cdots, (x_n, y_n)`
    with no duplicate :math:`x` values, if there exists a function :math:`f` that
    evaluates to these points, then there is a unique polynomial :math:`P(x)` of
    degree :math:`\leq n` that interpolates them. The polynomial is given by:

    .. math::

        P(x) = f(x_o)L_{n,0}(x) + \cdots + f(x_n)L_{n,n}(x) = \sum^n_{k=0} f(x_k) L_{n,k}(x)

    Where each :math:`k` in :math:`k = 0, 1, \cdots, n` is:

    .. math::

        L_{n,k} = \frac{(x - x_0)(x - x_1) \cdots (x - x_{k-1})(x - x_{k+1})
        \cdots (x - x_n)}{(x_k - x_0)(x_k - x_1) \cdots (x_k - x_{k-1})(x_k - x_{k+1})
        \cdots (x_k - x_n)} = \underset{i \neq k}{\prod^n_{i=0}} \frac{(x - x_i)}{(x_k - x_i)}

    Examples
    --------
    >>> x, y = [0, 2, 3, 4], [7, 11, 28, 63]
    >>> lagrange_interpolate(x, y)
    x**3 - 2*x + 7

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.

    Cheney, E. W., & Kincaid, D. (2013). Numerical mathematics and computing (6th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.

    """
    x, y = _create_array(x)[0], _create_array(y)[0]

    if len(x) != len(y):
        raise ValueError('x and y must be the same length to evaluate polynomial')

    terms = []

    for i in np.arange(len(x)):
        num = 1
        denom = 1

        p = np.delete(x, i)

        for j in p:
            num = str(num) + '*' + '(' + 'x' + ' - ' + str(j) + ')'
            denom = str(denom) + '*' + '(' + str(x[i]) + ' - ' + str(j) + ')'

        terms.append('(' + num + ')' + '/' + '(' + denom + ')')

    poly = 0

    for i in np.arange(len(terms)):
        poly = str(poly) + '+' + str(y[i]) + '*' + str(terms[i])

    return simplify(poly)
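
For the example above, the symbolic result can be cross-checked against SciPy's Lagrange interpolation (a sketch assuming scipy is installed; scipy.interpolate.lagrange returns a numpy.poly1d whose coefficients should match the simplified polynomial):

import numpy as np
from scipy.interpolate import lagrange

x, y = [0, 2, 3, 4], [7, 11, 28, 63]
poly = lagrange(x, y)

# coefficients in decreasing powers of x; expected approximately [1, 0, -2, 7]
print(np.round(poly.coef, 8))
print(poly(2))  # reproduces the data point y = 11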
Example 11
def pearson(x, y=None):
    r"""
    Computes the Pearson product-moment correlation coefficients of the given variables.

    Parameters
    ----------
    x : array-like
        Can be Pandas DataFrame, Pandas Series, numpy ndarray, list, list of lists, or dictionary
        representing a 1D or 2D array containing the variables and their respective observation
        vectors.
        The input is concatenated with the parameter y if given.
    y : array-like
        Can be Pandas DataFrame, Pandas Series, numpy ndarray, list, list of lists, or dictionary
        representing a 1D or 2D array containing the variables and their respective observation
        vectors.

    Returns
    -------
    numpy ndarray
        The correlation coefficient matrix of the given variables.

    Notes
    -----
    Pearson's product-moment correlation coefficient is the covariance of two random variables
    divided by the product of their standard deviations and is typically represented by
    :math:`\rho`:

    .. math::

        \rho_{x, y} = \frac{cov(X, Y)}{\sigma_X \sigma_Y}

    The correlation matrix :math:`R` and the covariance matrix :math:`C` have the following
    relationship.

    .. math::

        R_{ij} = \frac{C_{ij}}{\sqrt{C_{ii} * C_{jj}}}

    Examples
    --------
    >>> h = np.array([[16,4,8,4], [4,10,8,4], [8,8,12,10], [4,4,10,12]])
    >>> pearson(h)
    array([[ 1.        , -0.47140452, -0.24618298, -0.45732956],
       [-0.47140452,  1.        ,  0.05802589, -0.29643243],
       [-0.24618298,  0.05802589,  1.        ,  0.80218063],
       [-0.45732956, -0.29643243,  0.80218063,  1.        ]])
    >>> pearson(h[:, 0:1], h[:, 1:])
    array([[ 1.        , -0.47140452, -0.24618298, -0.45732956],
       [-0.47140452,  1.        ,  0.05802589, -0.29643243],
       [-0.24618298,  0.05802589,  1.        ,  0.80218063],
       [-0.45732956, -0.29643243,  0.80218063,  1.        ]])
    >>> pearson(h[:, 1], h[:, 2])
    array([[ 1.        ,  0.05802589],
       [ 0.05802589,  1.        ]])

    References
    ----------
    Pearson correlation coefficient. (2017, July 12). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Pearson_correlation_coefficient&oldid=790217169

    Rencher, A. (n.d.). Methods of Multivariate Analysis (2nd ed.).
        Brigham Young University: John Wiley & Sons, Inc.

    """
    x = _create_array(x)[0]

    if y is not None:
        y = _create_array(y)[0]
        x = np.column_stack([x, y])

    cormat = np.empty((x.shape[1], x.shape[1]))

    covmat = covar(x)

    for i in np.arange(covmat.shape[0]):
        for j in np.arange(covmat.shape[0]):
            cormat[i, j] = covmat[i, j] / np.sqrt(covmat[i, i] * covmat[j, j])

    return cormat
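
Because the variables sit in columns here, the result should agree with numpy's built-in routine once told to treat columns as variables (np.corrcoef defaults to variables in rows):

import numpy as np

h = np.array([[16, 4, 8, 4], [4, 10, 8, 4], [8, 8, 12, 10], [4, 4, 10, 12]])

# rowvar=False treats each column as a variable, matching pearson(h)
print(np.corrcoef(h, rowvar=False))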
Example 12
def ttest(y1, y2=None, mu=None, var_equal=False):
    r"""
    Performs one and two-sample t-tests.

    Parameters
    ----------
    y1
        First sample to test
    y2
        Second sample. Optional
    mu
        Optional, sets the mean for comparison in the one sample t-test. Default 0.
    var_equal
        Optional, default False. If False, Welch's t-test for unequal variance and
        sample sizes is used. If True, equal variance between samples is assumed
        and Student's t-test is used.

    Returns
    -------
    namedtuple
        Namedtuple containing following values:
        t-value
        degrees of freedom
        p-value
        confidence intervals
        sample means

    Notes
    -----
    Welch's t-test is an adaptation of Student's t-test and is more reliable when the
    two samples have unequal variances and unequal sample sizes. The test still depends
    on the assumption that the underlying population distributions are normally distributed.

    Welch's t test is defined as:

    .. math::

        t = \frac{\bar{X_1} - \bar{X_2}}{\sqrt{\frac{s_{1}^{2}}{N_1} + \frac{s_{2}^{2}}{N_2}}}

    where:

    :math:`\bar{X}_i` is the sample mean, :math:`s^2_i` the sample variance, and :math:`N_i` the sample size of sample :math:`i`

    If the :code:`var_equal` argument is True, Student's t-test is used, which assumes the two samples
    have equal variance. The t statistic is computed as:

    .. math::

        t = \frac{\bar{X}_1 - \bar{X}_2}{s_p \sqrt{\frac{1}{n_1} + \frac{1}{n_2}}}

    where:

    .. math::

        s_p = \sqrt{\frac{(n_1 - 1)s^2_{X_1} + (n_2 - 1)s^2_{X_2}}{n_1 + n_2 - 2}}

    References
    ----------
    Rencher, A. C., & Christensen, W. F. (2012). Methods of multivariate analysis (3rd Edition).

    Student's t-test. (2017, June 20). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Student%27s_t-test&oldid=786562367

    """
    y1 = _create_array(y1)[0]

    n1 = len(y1)
    s1 = var(y1)
    ybar1 = np.mean(y1)

    if y2 is not None:
        y2 = _create_array(y2)[0]
        n2 = len(y2)
        s2 = var(y2)
        ybar2 = np.mean(y2)

        if var_equal is False:
            tval = float((ybar1 - ybar2) / np.sqrt(s1 / n1 + s2 / n2))
        else:
            sp = np.sqrt(((n1 - 1.) * s1 + (n2 - 1.) * s2) / (n1 + n2 - 2.))
            tval = float((ybar1 - ybar2) / (sp * np.sqrt(1. / n1 + 1. / n2)))

    else:
        ybar2, n2, s2 = 0.0, 1.0, 0.0
        if mu is None:
            mu = 0.0

        tval = float((ybar1 - mu) / np.sqrt(s1 / n1))

    dof = degrees_of_freedom(y1, y2, var_equal)
    pvalue = _student_t_pvalue(np.absolute(tval), dof)
    intervals = _t_conf_int((ybar1, n1, s1), dof=dof, y=(ybar2, n2, s2))

    if y2 is not None:
        tTestResult = namedtuple(
            'tTestResult',
            ['tvalue', 'dof', 'pvalue', 'confint', 'x_mean', 'y_mean'])

        tt = tTestResult(tvalue=tval,
                         dof=dof,
                         pvalue=pvalue,
                         confint=intervals,
                         x_mean=ybar1,
                         y_mean=ybar2)

    else:
        tTestResult = namedtuple(
            'tTestResult', ['tvalue', 'dof', 'pvalue', 'confint', 'x_mean'])
        tt = tTestResult(tvalue=tval,
                         dof=dof,
                         pvalue=pvalue,
                         confint=intervals,
                         x_mean=ybar1)

    return tt
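
A quick sanity check against scipy.stats (a sketch with made-up samples; ttest_ind with equal_var=False performs the same Welch test as var_equal=False above, and equal_var=True the pooled Student's test):

import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
a = rng.normal(loc=0.0, scale=1.0, size=30)
b = rng.normal(loc=0.5, scale=2.0, size=50)

# Welch's t-test for unequal variances and sample sizes
print(stats.ttest_ind(a, b, equal_var=False))
# pooled-variance Student's t-test
print(stats.ttest_ind(a, b, equal_var=True))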
Example 13
def degrees_of_freedom(y1, y2=None, var_equal=False):
    r"""
    Computes the degrees of freedom of one or two samples.

    Parameters
    ----------
    y1
        First sample to test
    y2
        Second sample. Optional.
    var_equal
        Optional, default False. If False, Welch's t-test for unequal variance and
        sample sizes is used. If True, equal variance between samples is assumed
        and Student's t-test is used.

    Returns
    -------
    float
        the degrees of freedom

    Notes
    -----
    When Welch's t test is used, the Welch-Satterthwaite equation for approximating the degrees
    of freedom should be used and is defined as:

    .. math::

        \large v \approx \frac{\left(\frac{s_{1}^2}{N_1} +
        \frac{s_{2}^2}{N_2}\right)^2}{\frac{\left(\frac{s_1^2}{N_1}\right)^2}{v_1} +
        \frac{\left(\frac{s_2^2}{N_2}\right)^2}{v_2}}

    If the two samples are assumed to have equal variance, the degrees of freedom are simply:

    .. math::

        v = n_1 + n_2 - 2

    In the case of one sample, the degrees of freedom are:

    .. math::

        v = n - 1

    References
    ----------
    Rencher, A. C., & Christensen, W. F. (2012). Methods of multivariate analysis (3rd Edition).

    Welch's t-test. (2017, June 16). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Welch%27s_t-test&oldid=785961228

    """
    y1 = _create_array(y1)[0]
    n1 = len(y1)
    s1 = var(y1)
    v1 = n1 - 1

    if y2 is not None:
        y2 = _create_array(y2)[0]
        n2 = len(y2)
        s2 = var(y2)
        v2 = n2 - 1

        if var_equal is False:
            v = np.power((s1 / n1 + s2 / n2), 2) / (np.power(
                (s1 / n1), 2) / v1 + np.power((s2 / n2), 2) / v2)
        else:
            v = n1 + n2 - 2

    else:
        v = v1

    return float(v)
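
The Welch-Satterthwaite approximation is easy to reproduce by hand (a self-contained sketch of the same arithmetic as the function body, using toy data and sample variances with ddof=1):

import numpy as np

def welch_dof(a, b):
    s1, s2 = np.var(a, ddof=1), np.var(b, ddof=1)
    n1, n2 = len(a), len(b)
    num = (s1 / n1 + s2 / n2) ** 2
    den = (s1 / n1) ** 2 / (n1 - 1) + (s2 / n2) ** 2 / (n2 - 1)
    return num / den

a = np.array([27.5, 21.0, 19.0, 23.6, 17.0, 17.9, 16.9, 20.1])
b = np.array([27.1, 22.0, 20.8, 23.4, 23.4, 23.5, 25.8, 22.0])

# the approximation always lands between min(n1, n2) - 1 and n1 + n2 - 2
print(welch_dof(a, b))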
Example 14
def anova_oneway(group, x, *args):
    r"""
    Performs one-way analysis of variance (ANOVA) of one measurement and a grouping variable

    Parameters
    ----------
    group
        One-dimensional array (Numpy ndarray, Pandas Series, list) that defines the group
        membership of the dependent variable(s). Must be the same length as the x parameter.
    x
        One or two-dimensional array (Numpy ndarray, Pandas DataFrame, list of lists) that
        defines the observation vectors of the dependent variables. Must be the same length
        as the group parameter.
    *args
        Optional additional dependent variable arrays that are column-stacked with x
        before the procedure is performed.

    Returns
    -------
    namedtuple
        Namedtuple with the following entries representing an ANOVA table:
        residual Df: Residuals Degrees of Freedom
        Group Df: Group Vector Degrees of Freedom
        F-Value: Computed F-Value of ANOVA procedure
        p-value: Resulting p-value
        Group Sum of Squares: SST
        Group Mean Squares: MST
        Residual Sum of Squares: SSE
        Residual Mean Squares: MSE

    Notes
    -----
    One-way ANOVA can be considered an extension of the t-test when more than two groups
    are being tested. The factor, or categorical variable, is often referred to as the
    'treatment' in the ANOVA setting. ANOVA involves partitioning the data's total
    variation into variation between and within groups. This procedure is thus known as
    Analysis of Variance as sources of variation are examined separately.

    The data is assumed to be normally distributed with mean :math:`\mu_i` and
    variance :math:`\sigma^2_i`. Stating the hypothesis is also similar to previous
    examples when there were only two samples of interest. The hypothesis can be defined
    formally as:

    :math:`H_0: \mu_1 = \mu_2 = \cdots = \mu_k`
    :math:`H_A:` Not all population means are equal

    The one-way ANOVA splits the data's variation into two sources which are in turn used
    to calculate the F-statistic. The F-statistic is determined by the F-test, which is
    done by dividing the variance between groups by the variance within groups. The sum of
    squares for treatments is denoted :math:`SST`, the sum of squares for error :math:`SSE`,
    and the total sum of squares :math:`TotalSS`.

    Each sum of squares can be defined as:

    .. math::

        SST = \sum_{i=1}^k n_i(\bar{y_{i}} - \bar{y})^2

    .. math::

        SSE = \sum_{i=1}^k (n_i - 1)s_i^2

    .. math::

        TotalSS = \sum_{i=1}^k \sum_{j=1}^{n_i} (y_{ij} - \bar{y})^2

    The mean squares are the sum of squares divided by the degrees of freedom.

    .. math::

        MST = \frac{SST}{k - 1}

    .. math::

        MSE = \frac{SSE}{n - k}

    The F-statistic is defined as:

    .. math::

        f = \frac{MST}{MSE}

    References
    ----------
    Rencher, A. (n.d.). Methods of Multivariate Analysis (2nd ed.).
        Brigham Young University: John Wiley & Sons, Inc.

    """
    if args:
        c = args[0]
        for i in np.arange(1, len(args)):
            c = np.column_stack((c, args[i]))
        x = np.column_stack((x, c))

    x = _create_array(x)[0]

    if x.ndim > 1:
        x = np.sum(x, axis=1)

    grouparr, groupname = _create_array(group)
    groupnames = np.unique(grouparr)

    data = np.column_stack([grouparr, x])

    xmeans = npi.group_by(data[:, 0], data[:, 1], np.mean)
    xn = npi.group_by(data[:, 0], data[:, 1], len)
    xvars = npi.group_by(data[:, 0], data[:, 1], var)

    sst, sse = _sst(xn, xmeans, np.mean(data[:, 1])), _sse(xn, xvars)

    k = len(groupnames)
    res_dof = len(x) - k

    mst = sst / (k - 1)
    mse = sse / res_dof

    fval = mst / mse

    pval = _f_p_value(fval, k - 1, res_dof)

    AnovaResult = namedtuple('AnovaResult', [
        'residualdf', 'groupdf', 'fvalue', 'pvalue', 'groupSumSq',
        'groupMeanSq', 'resSumSq', 'resMeanSq'
    ])

    aov = AnovaResult(residualdf=res_dof,
                      groupdf=k - 1,  # between-groups degrees of freedom
                      fvalue=fval,
                      pvalue=pval,
                      groupSumSq=sst,
                      groupMeanSq=mst,
                      resSumSq=sse,
                      resMeanSq=mse)

    return aov
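
The F and p values can be compared with scipy's one-way ANOVA, which takes one array per group rather than a group-membership vector (a sketch with toy data; the layout differs but the statistics should agree):

from scipy import stats

y1 = [14.0, 15.1, 13.9, 15.6]
y2 = [16.2, 17.0, 16.8, 17.4]
y3 = [14.8, 15.0, 16.1, 15.2]

# each group's observations are passed as a separate argument
print(stats.f_oneway(y1, y2, y3))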
Example 15
def _sym_eig(x):
    x = _create_array(x)[0]
    eigs = np.linalg.eigvals(x)

    return eigs
Example 16
    def __init__(self, x):
        self.x = _create_array(x)[0]
        self.order = 'norm2'
Example 17

def central_difference(x, y):
    r"""
    Approximates the derivative of an unknown function given a set of x
    and y = f(x) data points using the central-difference approximation method.
    The x-values should be equally spaced for the central-difference method to
    return accurate results; otherwise, the forward or backward difference
    methods should be employed (or a more accurate method altogether).

    Parameters
    ----------
    x : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of x values
    y : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of values of the
        function at x.

    Returns
    -------
    dict
        length :math:`n` where :math:`n` is the length of the data vector x, containing the
        approximated values of the derivative function at the corresponding values of x.

    Notes
    -----
    The derivative of a function :math:`f` at a value :math:`x_0` is defined by:

    .. math::

        f^\prime(x_0) = \lim_{h \rightarrow 0} \frac{f(x_0 + h) - f(x_0)}{h}

    The central-difference method is another approach to approximating the derivative of a function,
    whether it be a known function or a set of points and the function evaluated at those points. The
    central-difference formula is defined as:

    .. math::

        f^\prime (x_i) = \frac{f(x_i + h) - f(x_i - h)}{2h}

    The central-difference method is often more accurate than the backward or forward methods as it is
    essentially an average of the latter two approaches.

    Examples
    --------
    >>> x, y = [0.0, 0.2, 0.4], [0.00000, 0.74140, 1.3718]
    >>> central_difference(x, y)  # values rounded for display
    {'f(0.0)': 3.707,
     'f(0.2)': 3.4295,
     'f(0.4)': 3.152}
    >>> central_difference([0.5,0.6,0.7], [0.4794,0.5646,0.6442])
    {'f(0.5)': 0.852,
     'f(0.6)': 0.824,
     'f(0.7)': 0.796}

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.

    Finite difference. (2017, June 9). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Finite_difference&oldid=784585490

    """
    x, y = _create_array(x)[0], _create_array(y)[0]

    if len(x) != len(y):
        raise ValueError('x and y must be the same length')

    n = len(x)

    fdx = {}

    # one-sided difference at the left boundary
    fdx['f(' + str(x[0]) + ')'] = (y[1] - y[0]) / (x[1] - x[0])

    # central differences over the neighboring points for interior values
    for i in np.arange(1, n - 1):
        fdx['f(' + str(x[i]) + ')'] = (y[i + 1] - y[i - 1]) / (x[i + 1] - x[i - 1])

    # one-sided difference at the right boundary
    fdx['f(' + str(x[n - 1]) + ')'] = (y[n - 1] - y[n - 2]) / (x[n - 1] - x[n - 2])

    return fdx
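
numpy ships the same scheme: np.gradient uses central differences in the interior and one-sided differences at the boundaries, so it serves as a cross-check for the function above:

import numpy as np

x = np.array([0.0, 0.2, 0.4])
y = np.array([0.00000, 0.74140, 1.3718])

# central differences inside, one-sided differences at the two ends
print(np.gradient(y, x))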
Example 18

def forward_difference(x, y):
    r"""
    Approximates the derivative of an unknown function given a set of x
    and y = f(x) data points using the forward-difference approximation method.

    Parameters
    ----------
    x : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of x values
    y : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of values of the
        function at x.

    Returns
    -------
    dict
        length :math:`n` where :math:`n` is the length of the data vector x, containing the
        approximated values of the derivative function at the corresponding values of x.

    Notes
    -----
    The derivative of a function :math:`f` at a value :math:`x_0` is defined by:

    .. math::

        f^\prime(x_0) = \lim_{h \rightarrow 0} \frac{f(x_0 + h) - f(x_0)}{h}

    However, if the function is unknown, the derivative of the function can still be approximated
    at a value of :math:`x_0` given a set of points :math:`(x_1, y_1), (x_2, y_2), \cdots, (x_n, y_n)`.
    The forward difference method is one approach to approximating the derivative. Given a set of data
    points, the forward difference approximation of a derivative can be defined as:

    .. math::

        f^\prime (x_i) = y^\prime_i \approx \frac{y_{i+1} - y_i}{x_{i+1} - x_i}

    Examples
    --------
    >>> x, y = [0.0, 0.2, 0.4], [0.00000, 0.74140, 1.3718]
    >>> forward_difference(x, y)
    {'f(0.0)': 3.7069999999999994,
     'f(0.2)': 3.1519999999999997,
     'f(0.4)': 3.1519999999999997}
    >>> forward_difference([0.5,0.6,0.7], [0.4794,0.5646,0.6442])
    {'f(0.5)': 0.8520000000000002,
     'f(0.6)': 0.79600000000000026,
     'f(0.7)': 0.79600000000000026}

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.

    Finite difference. (2017, June 9). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Finite_difference&oldid=784585490

    """
    x, y = _create_array(x)[0], _create_array(y)[0]

    if len(x) != len(y):
        raise ValueError('x and y must be the same length')

    n = len(x)

    fdx = {}

    # forward differences at every point with a right-hand neighbor
    for i in np.arange(n - 1):
        fdx['f(' + str(x[i]) + ')'] = (y[i + 1] - y[i]) / (x[i + 1] - x[i])

    # no forward neighbor at the last point; reuse the final difference
    fdx['f(' + str(x[n - 1]) + ')'] = fdx['f(' + str(x[n - 2]) + ')']

    return fdx
Example 19
def lu(a):
    r"""
    Computes the LU decomposition of a square matrix :math:`A`.

    Parameters
    ----------
    a : array_like
        Accepts a list, nested list, dictionary, pandas DataFrame or
        pandas Series. The private function _create_array is called
        to create a copy of x as a numpy array.

    Returns
    -------
    l_u : tuple
        Returns a tuple containing the lower triangular matrix
        :math:`L` and the upper triangular matrix  :math:`U`.

    Notes
    -----
    LU Decomposition factors a square matrix (:math:`n \times n`) into the product of a
    'lower' and 'upper' triangular matrix (hence the name 'LU'). More formally:

    .. math::

        A = LU

    The :math:`L` and :math:`U` matrices are lower and upper triangular, respectively.
    For example, the LU decomposition of a :math:`3 \times 3` matrix would be similar to:

    .. math::

        \begin{bmatrix}
            a_{11} & a_{12} & a_{13} \\
            a_{21} & a_{22} & a_{23} \\
            a_{31} & a_{32} & a_{33}
        \end{bmatrix} =
        \begin{bmatrix}
            l_{11} & 0 & 0 \\
            l_{21} & l_{22} & 0 \\
            l_{31} & l_{32} & l_{33}
        \end{bmatrix}
        \begin{bmatrix}
            u_{11} & u_{12} & u_{13} \\
            0 & u_{22} & u_{23} \\
            0 & 0 & u_{33}
        \end{bmatrix}

    Examples
    --------
    >>> a = pd.DataFrame({0: [16, 4, 8, 4], 1: [4, 10, 8, 4], 2: [8, 8, 12, 10], 3: [4, 4, 10, 12]})
    >>> l, u = lu(a)
    >>> print(l, u)
    [[ 1.          0.          0.          0.        ]
     [ 0.25        1.          0.          0.        ]
     [ 0.5         0.66666667  1.          0.        ]
     [ 0.25        0.33333333  1.5         1.        ]] [[ 16.   4.   8.   4.]
     [  0.   9.   6.   3.]
     [  0.   0.   4.   6.]
     [  0.   0.   0.   1.]]
    >>> np.dot(l, u)
    array([[ 16.,   4.,   8.,   4.],
       [  4.,  10.,   8.,   4.],
       [  8.,   8.,  12.,  10.],
       [  4.,   4.,  10.,  12.]])

    References
    ----------
    Cormen, T., Leiserson, C., Rivest, R., & Stein, C. (2009). Introduction to algorithms (3rd ed., pp. 819-822).
        Cambridge (Inglaterra): Mit Press.

    """
    x = _create_array(a)[0].copy()
    n, m = x.shape

    if n != m:
        raise ValueError('Matrix must be square to perform LU decomposition')

    l, u = np.eye(n), np.zeros((n, n))

    for k in np.arange(n):
        u[k, k] = x[k, k]
        for i in np.arange(k + 1, n):
            l[i, k] = x[i, k] / u[k, k]
            u[k, i] = x[k, i]
        for i in np.arange(k + 1, n):
            for j in np.arange(k + 1, n):
                x[i, j] = x[i, j] - l[i, k] * u[k, j]

    l_u = (l, u)

    return l_u
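
This is Doolittle's method without pivoting, so a zero pivot (for example, the permutation matrix [[0, 1], [1, 0]]) causes a division by zero; pivoted routines such as scipy.linalg.lu avoid that. A quick check, assuming the lu function above is in scope, that the factors have the right structure and reproduce the input:

import numpy as np

a = np.array([[16., 4., 8., 4.],
              [4., 10., 8., 4.],
              [8., 8., 12., 10.],
              [4., 4., 10., 12.]])

l, u = lu(a)

# L is unit lower triangular, U is upper triangular, and LU reproduces A
print(np.allclose(np.tril(l), l), np.allclose(np.triu(u), u))
print(np.allclose(np.dot(l, u), a))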
Example 20
def cholesky(a):
    r"""
    Function for computing the Cholesky decomposition of a symmetric, positive definite matrix.

    Parameters
    ----------
    a : array_like
        Accepts a list, nested list, dictionary, pandas DataFrame or
        pandas Series. The private function _create_array is called
        to create a copy of x as a numpy array.

    Returns
    -------
    llt : tuple
        The cholesky function returns a tuple containing the lower-triangular matrix
        :math:`L` and its transpose, the upper-triangular matrix :math:`L^T`.

    Notes
    -----
    Cholesky decomposition is a special case of :math:`LU` decomposition for symmetric,
    positive definite matrices (Hermitian in the complex case). Cholesky decomposition is
    preferred when applicable as it is more efficient than LU decomposition. The Cholesky
    decomposition factors a matrix :math:`A` into the product of a lower triangular matrix
    :math:`L` and its transpose :math:`L^T` (or :math:`L^*` which denotes the conjugate
    transpose in the Hermitian case). More formally, for a symmetric, positive definite
    matrix :math:`A`, the Cholesky decomposition is defined as:

    .. math::

        A = LL^T

    In component notation:

    .. math::

        L_{ii} = \sqrt{a_{ii} - \sum^{i-1}_{k=0} L^2_{ik}}

    .. math::

        L_{ji} = \frac{1}{L_{ii}} (a_{ij} - \sum^{i-1}_{k=0} L_{ik} L_{jk}) \qquad j = i + 1, i + 2, \cdots, N - 1

    Examples
    --------
    >>> h = pd.DataFrame({0: [16, 4, 8, 4], 1: [4, 10, 8, 4], 2: [8, 8, 12, 10], 3: [4, 4, 10, 12]})
    >>> l, lt = cholesky(h)
    >>> l
    array([[ 4.,  0.,  0.,  0.],
       [ 1.,  3.,  0.,  0.],
       [ 2.,  2.,  2.,  0.],
       [ 1.,  1.,  3.,  1.]])
    >>> lt
    array([[ 4.,  1.,  2.,  1.],
       [ 0.,  3.,  2.,  1.],
       [ 0.,  0.,  2.,  3.],
       [ 0.,  0.,  0.,  1.]])

    References
    ----------
    Press, W. (2007). Numerical Recipes 3rd Edition: The Art of Scientific Computing (3rd ed.).
        New York: Cambridge University Press.

    Watkins, D. (2010). Fundamentals of Matrix Computations, 3rd Edition. John Wiley & Sons.

    """
    # cast to float so the square roots below are not truncated by integer storage
    x = _create_array(a)[0].astype(np.float64)
    n, m = x.shape

    if ispositivedefinite(x) is False:
        raise ValueError('Matrix is not positive definite')

    for j in np.arange(n):

        x[j, j] = np.sqrt(x[j, j] - np.dot(x[j, 0:j], x[j, 0:j]))
        for i in np.arange(j + 1, n):
            x[i, j] = (x[i, j] - np.dot(x[i, 0:j], x[j, 0:j])) / x[j, j]

    for j in np.arange(1, n):
        x[0:j, j] = 0.0

    llt = (x, x.T)

    return llt
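
The result can be cross-checked against numpy's implementation, which also returns the lower-triangular factor (assuming the cholesky function above is in scope):

import numpy as np

h = np.array([[16., 4., 8., 4.],
              [4., 10., 8., 4.],
              [8., 8., 12., 10.],
              [4., 4., 10., 12.]])

l, lt = cholesky(h)

print(np.allclose(l, np.linalg.cholesky(h)))  # same lower-triangular factor
print(np.allclose(np.dot(l, lt), h))          # L L^T reproduces the input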
Example 21

def neville(x, y, x0):
    r"""
    Evaluates an interpolated polynomial at a particular :math:`x` value given a set of
    :math:`x` and corresponding :math:`y` values.

    Parameters
    ----------
    x
        One-dimensional array of x values. Can be a pandas DataFrame or Series,
        list, dictionary (first key-value pair is used if there are more than one),
        or numpy array. Must be same length as y.
    y
        One-dimensional array of y values. Can be a pandas DataFrame or Series,
        list, dictionary (first key-value pair is used if there are more than one),
        or numpy array. Must be same length as x.
    x0
        Desired value at which to interpolate and approximate the polynomial.

    Returns
    -------
    tuple
        Contains the approximated value of the interpolated polynomial evaluated at the point :math:`x` as float
        and a numpy array representing the iterated Neville table with intermediate values generated recursively.

    Notes
    -----
    Neville's method evaluates a polynomial that passes through a given set of :math:`x` and :math:`y` points
    for a particular :math:`x` value using the Newton polynomial form. Neville's method is similar to a
    now defunct procedure named Aitken's algorithm and is based on the divided differences recursion
    relation.

    It was stated before in a previous post on Lagrangian polynomial interpolation that there exists
    a Lagrange polynomial that passes through points :math:`y_1, y_2, \cdots, y_k` where each is a
    distinct integer and :math:`0 \leq y_i \leq n` at corresponding x values :math:`x_0, x_1, x_2, \cdots, x_n`.
    The :math:`k` points :math:`y_1, y_2, \cdots, y_k` are denoted :math:`P_{y_1, y_2, \cdots, y_k}(x)`.
    Neville's method can be stated as follows:

    Let a function :math:`f` be defined at points :math:`x_0, x_1, \cdots, x_k` where :math:`x_j` and
    :math:`x_i` are two distinct members. For each :math:`k`, there exists a Lagrange polynomial :math:`P`
    that interpolates the function :math:`f` at the :math:`k + 1` points :math:`x_0, x_1, \cdots, x_k`.
    The :math:`k`th Lagrange polynomial is defined as:

    .. math::

        P(x) = \frac{(x - x_j) P_{0,1,\cdots,j-1,j+1,\cdots,k}(x) - (x - x_i)
        P_{0,1,\cdots,i-1,i+1,\cdots,k}(x)}{(x_i - x_j)}

    The :math:`P_{0,1,\cdots,j-1,j+1,\cdots,k}` and :math:`P_{0,1,\cdots,i-1,i+1,\cdots,k}` are often
    denoted :math:`\hat{Q}` and :math:`Q`, respectively, for ease of notation.

    .. math::

        P(x) = \frac{(x - x_j) \hat{Q}(x) - (x - x_i) Q(x)}{(x_i - x_j)}

    Examples
    --------
    >>> x, y = [8.1, 8.3, 8.6, 8.7], [16.9446, 17.56492, 18.50515, 18.82091]
    >>> val, table = neville(x, y, 8.4)
    >>> val
    17.8770925

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.

    Cheney, E. W., & Kincaid, D. (2013). Numerical mathematics and computing (6th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.

    Neville's algorithm. (2016, January 2). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Neville%27s_algorithm&oldid=697870140

    """
    x, y = _create_array(x)[0], _create_array(y)[0]

    if len(x) != len(y):
        raise ValueError('x and y must be the same length to evaluate polynomial')

    n = len(x)

    q = np.zeros((n, n))
    q[:, 0] = y

    for i in np.arange(1, n):
        for j in np.arange(i, n):
            q[j, i] = ((x0 - x[j - i]) * q[j, i - 1] - (x0 - x[j]) * q[j - 1, i - 1]) / (x[j] - x[j - i])

    return float(q[n - 1, n - 1]), q
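
Since the interpolating polynomial through n distinct points is unique, Neville's value can be checked by fitting a degree n - 1 polynomial and evaluating it at the same point (a sketch; np.polyfit may warn about conditioning on larger problems):

import numpy as np

x = [8.1, 8.3, 8.6, 8.7]
y = [16.9446, 17.56492, 18.50515, 18.82091]

coef = np.polyfit(x, y, deg=len(x) - 1)
print(np.polyval(coef, 8.4))  # approximately 17.8770925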
Example 22
    def __init__(self, x):
        self.x = _create_array(x)[0]
        self.order = 'frobenius'
        self.n, self.m = self.x.shape
Example 23

def divided_differences(x, y, x0=None):
    r"""
    Constructs an interpolating polynomial that passes through given x and y points using
    the divided differences method.

    Parameters
    ----------
    x
        One-dimensional array of x values. Can be a pandas DataFrame or Series,
        list, dictionary (first key-value pair is used if there are more than one),
        or numpy array. Must be same length as y.
    y
        One-dimensional array of y values. Can be a pandas DataFrame or Series,
        list, dictionary (first key-value pair is used if there are more than one),
        or numpy array. Must be same length as x.
    x0
        Optional. Value at which to evaluate the interpolated polynomial for an
        approximation.

    Returns
    -------
    dict
        dict object containing the following entries:
        Approximated value of the interpolated polynomial at x0 (None when x0 is not given)
        The interpolated polynomial
        Divided Differences Table

    Notes
    -----
    The divided differences method is a numerical procedure for interpolating a polynomial
    given a set of points. Unlike Neville's method, which is used to approximate the value
    of an interpolating polynomial at a given point, the divided differences method
    constructs the interpolating polynomial in Newton form.

    Assume that :math:`P_n(x)` is the :math:`n`th Lagrangian polynomial that corresponds with
    the function :math:`f` at a set of :math:`x` data points. The polynomial :math:`P_n(x)` can be expressed
    using the divided differences of the function :math:`f` with respect to the :math:`x`-values.

    .. math::

        P_n(x) = a_0 + a_1(x - x_0) + a_2(x - x_0)(x - x_1) + \cdots + a_n(x - x_0) \cdots (x - x_{n-1})

    Therefore the constants :math:`a_0, a_1, \cdots, a_n` must be found to construct the polynomial. To
    find these constants, the divided differences are recursively generated until :math:`n` iterations
    have been completed. We start with the zeroth divided difference of the function :math:`f` with
    respect to :math:`x_i`, which is the value of :math:`f` at that point. Bracket notation is introduced
    to distinguish the divided differences.

    .. math::

        f[x_i] = f(x_i)

    The first divided difference is then the function :math:`f` with respect to the values :math:`x_i`
    and :math:`x_{i+1}`.

    .. math::

        f[x_i, x_{i+1}] = \frac{f[x_{i+1}] - f[x_i]}{x_{i+1} - x_i}

    The second divided difference follows:

    .. math::

        f[x_i, x_{i+1}, x_{i+2}] = \frac{f[x_{i+1},x_{i+2}] - f[x_i, x_{i+1}]}{x_{i+2} - x_i}

    This iteration continues until the :math:`n`th divided difference:

    .. math::

        f[x_0, x_1, \cdots, x_n] = \frac{f[x_1, x_2, \cdots, x_n] - f[x_0, x_1, \cdots, x_{n-1}]}{x_n - x_0}

    Thus the interpolating polynomial resulting from the divided differences method takes the form:

    .. math::

        P_n(x) = f[x_0] + f[x_0, x_1](x - x_0) + f[x_0, x_1, x_2](x - x_0)(x - x_1) + \cdots +
        f[x_0, x_1, x_2, \cdots, x_n](x - x_0)(x - x_1) \cdots (x - x_{n-1})

    Examples
    --------
    >>> x, y = [8.1, 8.3, 8.6, 8.7], [16.9446, 17.56492, 18.50515, 18.82091]
    >>> divided_differences(x, y, 8.4)
    {'Approximated Value of Interpolated Polynomial': 17.8770925200000,
     'Divided Differences Table': array([[  1.69446000e+01,   0.00000000e+00,   0.00000000e+00,
               0.00000000e+00],
            [  1.75649200e+01,   3.10160000e+00,   0.00000000e+00,
               0.00000000e+00],
            [  1.85051500e+01,   3.13410000e+00,   6.50000000e-02,
               0.00000000e+00],
            [  1.88209100e+01,   3.15760000e+00,   5.87500000e-02,
              -1.04166667e-02]]),
     'Interpolated Function': '16.9446 + 3.1016*(x - 8.1) + 0.065*(x - 8.1)*(x - 8.3) + -0.01042*(x - 8.1)*(x - 8.3)*(x - 8.6)'}

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.

    """
    x, y = _create_array(x)[0], _create_array(y)[0]

    if len(x) != len(y):
        raise ValueError('x and y must be the same length to evaluate polynomial')

    n = len(x)

    q = np.zeros((n, n))
    q[:, 0] = y

    f = str(np.round(q[0, 0], 5))
    fi = ''

    for i in np.arange(1, n):
        for j in np.arange(i, n):
            q[j, i] = (q[j, i-1] - q[j-1, i-1]) / (x[j] - x[j-i])

        fi = fi + '*(x - ' + str(x[i-1]) + ')'
        f = f + ' + ' + str(np.round(q[i,i], 5)) + fi

    if x0 is not None:
        # Symbol equality is by name, so the parsed 'x' substitutes correctly
        approx = parse_expr(f).evalf(subs={Symbol('x'): x0})
    else:
        # x0 is optional; skip the numeric evaluation when it is not supplied
        approx = None

    res = {'Approximated Value of Interpolated Polynomial': approx,
           'Interpolated Function': f,
           'Divided Differences Table': q}

    return res
Example 24
def spearman(x, y=None):
    r"""
    Computes the Spearman correlation coefficients of the given variables.

    Parameters
    ----------
    x : array-like
        Can be Pandas DataFrame, Pandas Series, numpy ndarray, list, list of lists, or dictionary
        representing a 1D or 2D array containing the variables and their respective observation
        vectors.
        The input is concatenated with the parameter y if given.
    y : array-like
        Can be Pandas DataFrame, Pandas Series, numpy ndarray, list, list of lists, or dictionary
        representing a 1D or 2D array containing the variables and their respective observation
        vectors.

    Returns
    -------
    numpy ndarray
        The correlation coefficient matrix of the given variables.

    Notes
    -----
    Spearman's :math:`\rho`, often denoted :math:`r_s`, is a nonparametric measure of correlation.
    While Pearson's product-moment correlation coefficient represents the linear relationship between
    two variables, Spearman's correlation measures the monotonicity of two variables. Put more simply,
    Spearman's correlation is Pearson's correlation performed on ranked variables.

    Two random variables :math:`X` and :math:`Y` and their respective observation vectors
    :math:`x_1, x_2, \cdots, x_n` and :math:`y_1, y_2, \cdots, y_n` are converted to ranked variables
    (identical values are averaged), often denoted :math:`rg_X` and :math:`rg_Y`, and the correlation
    :math:`r_s` is computed as:

    .. math::

        r_s = \rho_{rg_X, rg_Y} = \frac{cov(rg_X, rg_Y)}{\sigma_{rg_X} \sigma_{rg_Y}}

    Where :math:`\rho` is the Pearson correlation coefficient applied to the ranked variables,
    :math:`cov(rg_X, rg_Y)` is the covariance of the ranked variables and :math:`\sigma_{rg_X}` and
    :math:`\sigma_{rg_Y}` are the standard deviations of the ranked variables.

    Examples
    --------
    >>> h = np.array([[16,4,8,4], [4,10,8,4], [8,8,12,10], [4,4,10,12]])
    >>> spearman(h)
    array([[ 1.        , -0.33333333, -0.03703704, -0.33333333],
       [-0.33333333,  1.        , -0.03703704, -0.33333333],
       [-0.03703704, -0.03703704,  1.        ,  0.85185185],
       [-0.33333333, -0.33333333,  0.85185185,  1.        ]])
    >>> spearman(h[:, 0:1], h[:, 1:])
    array([[ 1.        , -0.33333333, -0.03703704, -0.33333333],
       [-0.33333333,  1.        , -0.03703704, -0.33333333],
       [-0.03703704, -0.03703704,  1.        ,  0.85185185],
       [-0.33333333, -0.33333333,  0.85185185,  1.        ]])
    >>> spearman(h[:, 0], h[:, 1])
    array([[ 1.        , -0.33333333],
       [-0.33333333,  1.        ]])

    References
    ----------
    Spearman's rank correlation coefficient. (2017, June 24). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Spearman%27s_rank_correlation_coefficient&oldid=787350680

    """
    x = _create_array(x)[0]

    if y is not None:
        y = _create_array(y)[0]
        x = np.column_stack([x, y])

    # rank each column; a float array ensures averaged (tied) ranks are not truncated
    ranked = np.empty(x.shape)
    for i in np.arange(ranked.shape[1]):
        ranked[:, i] = rankdata(x[:, i], 'average')

    return pearson(ranked)
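
The matrix agrees with scipy's implementation when columns are treated as variables (a sketch; scipy.stats.spearmanr returns the coefficient matrix and the p-value matrix for 2D input):

import numpy as np
from scipy import stats

h = np.array([[16, 4, 8, 4], [4, 10, 8, 4], [8, 8, 12, 10], [4, 4, 10, 12]])

# axis=0 treats each column as a variable, matching spearman(h)
rho, pval = stats.spearmanr(h, axis=0)
print(rho)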
Example 25

def backward_difference(x, y):
    r"""
    Approximates the derivative of an unknown function given a set of x
    and y = f(x) data points using the backward-difference approximation method.

    Parameters
    ----------
    x : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of x values
    y : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of values of the
        function at x.

    Returns
    -------
    dict
        length :math:`n` where :math:`n` is the length of the data vector x, containing the
        approximated values of the derivative function at the corresponding values of x.

    Notes
    -----
    The derivative of a function :math:`f` at a value :math:`x_0` is defined by:

    .. math::

        f^\prime(x_0) = \lim_{h \rightarrow 0} \frac{f(x_0 + h) - f(x_0)}{h}

    The backward difference method is one approach to approximating the derivative of a function,
    whether known or unknown. Given a set of data points, the backward difference approximation
    of a derivative can be defined as:

    .. math::

        f^\prime (x_i) = y^\prime_i \approx \frac{y_i - y_{i-1}}{x_i - x_{i-1}}

    Examples
    --------
    >>> x, y = [0.0, 0.2, 0.4], [0.00000, 0.74140, 1.3718]
    >>> backward_difference(x, y)
    {'f(0.0)': 3.7069999999999994,
     'f(0.2)': 3.7069999999999994,
     'f(0.4)': 3.1519999999999997}
    >>> backward_difference([0.5,0.6,0.7], [0.4794,0.5646,0.6442])
    {'f(0.5)': 0.8520000000000002,
     'f(0.6)': 0.8520000000000002,
     'f(0.7)': 0.79600000000000026}

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.

    Finite difference. (2017, June 9). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Finite_difference&oldid=784585490

    """
    x, y = _create_array(x)[0], _create_array(y)[0]

    if len(x) != len(y):
        raise ValueError('x and y must be the same length')

    n = len(x)

    fdx = {}

    # no backward neighbor at the first point; use the forward difference there
    fdx['f(' + str(x[0]) + ')'] = (y[1] - y[0]) / (x[1] - x[0])

    # backward differences at every point with a left-hand neighbor
    for i in np.arange(1, n):
        fdx['f(' + str(x[i]) + ')'] = (y[i] - y[i - 1]) / (x[i] - x[i - 1])

    return fdx
Example 26
def manova_oneway(group, x, *args):
    r"""
    Performs Multiple Analysis of Variance (MANOVA) of one grouping variable and n dependent variables

    Parameters
    ----------
    group
        One-dimensional array (Numpy ndarray, Pandas Series, list) that defines the group
        membership of the dependent variable(s). Must be the same length as the x parameter.
    x
        One or two-dimensional array (Numpy ndarray, Pandas DataFrame, list of lists) that
        defines the observation vectors of the dependent variables. Must be the same length
        as the group parameter.
    *args
        Optional additional dependent variable arrays that are column-stacked with x
        before the procedure is performed.

    Returns
    -------
    namedtuple
        Namedtuple with the following entries representing a MANOVA table:
        Group Df: Group Vector Degrees of Freedom
        residual Df: Residuals Degrees of Freedom
        Num Df: Numerator Degrees of Freedom
        Den Df: Denominator Degrees of Freedom
        Pillai Statistic: Pillai Test Statistic
        Wilk's Lambda: Wilk's Lambda
        Lawley-Hotelling T^2: T^2 statistic, also known as Lawley-Hotelling statistic
        Roy's Test: Reported value from Roy's Test
        Pillai F-Value: Approximated F-Value of Pillai statistic
        Wilk's Lambda F-Value: Approximated F-Value of Wilk's Lambda
        Lawley-Hotelling T^2 F-Value: Approximated F-Value of T^2
        Roy's Test F-Value: Approximated F-Value of Roy's Test statistic
        Pillai p-value: p-value of approximated Pillai F-Value with Num Df and Den Df
        Wilk's Lambda p-value: p-value of approximated Wilk's Lambda F-Value with Num Df and Den Df
        Lawley-Hotelling T^2 p-value: p-value of approximated Lawley-Hotelling F-Value with Num Df and Den Df
        Roy's Test p-value: p-value of approximated Roy's Test F-Value with Num Df and Den Df

    Notes
    -----
    MANOVA, or Multiple Analysis of Variance, is an extension of Analysis of
    Variance (ANOVA) to several dependent variables. The approach to MANOVA
    is similar to ANOVA in many regards and requires the same assumptions
    (normally distributed dependent variables with equal covariance matrices).

    In the MANOVA setting, each observation vector can have a model denoted as:

    .. math::

        y_{ij} = \mu_i + \epsilon_{ij} \qquad i = 1, 2, \cdots, k; \qquad j = 1, 2, \cdots, n

    An 'observation vector' is a set of observations measured over several variables.
    With :math:`p` variables, :math:`y_{ij}` becomes:

    .. math::

        \begin{bmatrix} y_{ij1} \\ y_{ij2} \\ \vdots \\ y_{ijp} \end{bmatrix} = \begin{bmatrix}
        \mu_{i1} \\ \mu_{i2} \\ \vdots \\ \mu_{ip} \end{bmatrix} + \begin{bmatrix} \epsilon_{ij1}
        \\ \epsilon_{ij2} \\ \vdots \\ \epsilon_{ijp} \end{bmatrix}

    As before in ANOVA, the goal is to compare the groups to see if there are any significant
    differences. However, instead of a single variable, the comparisons will be made with the
    mean vectors of the samples. The null hypothesis :math:`H_0` can be formalized the same
    way in MANOVA:

    .. math::

        H_0: \mu_1 = \mu_2 = \dots = \mu_k

    With an alternative hypothesis :math:`H_a` that at least two :math:`\mu` are unequal.
    There are :math:`p(k - 1)`, where :math:`k` is the number of groups in the data,
    equalities that must be true for :math:`H_0` to be accepted.

    Similar to ANOVA, we are interested in partitioning the data's total variation into
    variation between and within groups. In the case of ANOVA, this partitioning is done
    by calculating :math:`SSH` and :math:`SSE`; however, in the multivariate case, we must
    extend this to encompass the variation in all the :math:`p` variables. Therefore, we
    must compute the between and within sum of squares for each possible comparison. This
    procedure results in the :math:`H` "hypothesis matrix" and :math:`E` "error matrix."

    The :math:`H` matrix is a square :math:`p \times p` with the form:

    .. math::

        H = \begin{bmatrix} SSH_{11} & SPH_{12} & \cdots & SPH_{1p} \\
        SPH_{12} & SSH_{22} & \cdots & SPH_{2p} \\ \vdots & \vdots & & \vdots \\
        SPH_{1p} & SPH_{2p} & \cdots & SSH_{pp} \end{bmatrix}

    The error matrix :math:`E` is also :math:`p \times p`

    .. math::

        E = \begin{bmatrix} SSE_{11} & SPE_{12} & \cdots & SPE_{1p} \\
        SPE_{12} & SSE_{22} & \cdots & SPE_{2p} \\ \vdots & \vdots & & \vdots \\
        SPE_{1p} & SPE_{2p} & \cdots & SSE_{pp} \end{bmatrix}

    Once the :math:`H` and :math:`E` matrices are constructed, the mean vectors can be
    compared to determine if significant differences exist. There are several test
    statistics, of which the most common are Wilk's lambda, Roy's test, Pillai, and
    Lawley-Hotelling, that can be employed to test for significant differences. Each test
    statistic has specific properties and power.

    References
    ----------
    Rencher, A. (n.d.). Methods of Multivariate Analysis (2nd ed.).
        Brigham Young University: John Wiley & Sons, Inc.

    """
    if args:
        c = args[0]
        for i in np.arange(1, len(args)):
            c = np.column_stack((c, args[i]))
        x = np.column_stack((x, c))

    x = _create_array(x)[0]
    grouparr, groupname = _create_array(group)

    groupnames = np.unique(grouparr)
    kn = len(groupnames)

    data = np.column_stack((grouparr, x))

    xmeans = data[:, 1:].mean(axis=0)
    xn = len(xmeans)

    groupmeans = npi.group_by(data[:, 0]).mean(data)[1][:, 1:]
    groupn = npi.group_by(data[:, 0], data, len)

    groups = npi.group_by(data[:, 0], data[:, 1:])[1]

    n = [i for _, i in groupn]

    h, e = np.zeros((xn, xn)), np.zeros((xn, xn))

    for i in np.arange(xn):
        for j in np.arange(i + 1):

            h[i, j] = n[i] * np.sum((groupmeans[:, i] - xmeans[i]) *
                                    (groupmeans[:, j] - xmeans[j]))
            h[j, i] = n[i] * np.sum((groupmeans[:, j] - xmeans[j]) *
                                    (groupmeans[:, i] - xmeans[i]))

            b = []

            for k in groups:
                a = np.sum((k[:, i] - np.mean(k[:, i])) *
                           (k[:, j] - np.mean(k[:, j])))
                b.append(a)

            e[i, j], e[j, i] = np.sum(b), np.sum(b)

    vh, ve, pillai, pillai_f, wilks_lambda, wilks_lambda_f, t2, t2_f, roy, roy_f = _manova_statistics(
        h, e, kn, len(x))

    num_df, denom_df = vh * xn, ve * xn

    pillai_pval, wilks_pval, t2_pval, roy_pval = _f_p_value(pillai_f, num_df, denom_df), \
                                                 _f_p_value(wilks_lambda_f, num_df, denom_df), \
                                                 _f_p_value(t2_f, num_df, denom_df), \
                                                 _f_p_value(roy_f, num_df, denom_df)

    ManovaResult = namedtuple('ManovaResult', [
        'groupdf', 'residualdf', 'numdf', 'denomdf', 'pillai', 'wilks', 't2',
        'roy', 'pillai_f', 'wilks_f', 't2_f', 'roy_f', 'pillai_p', 'wilks_p',
        't2_p', 'roy_p'
    ])

    maov = ManovaResult(groupdf=vh,
                        residualdf=ve,
                        numdf=num_df,
                        denomdf=denom_df,
                        pillai=pillai,
                        wilks=wilks_lambda,
                        t2=t2,
                        roy=roy,
                        pillai_f=pillai_f,
                        wilks_f=wilks_lambda_f,
                        t2_f=t2_f,
                        roy_f=roy_f,
                        pillai_p=pillai_pval,
                        wilks_p=wilks_pval,
                        t2_p=t2_pval,
                        roy_p=roy_pval)

    return maov
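
A useful property for testing the construction of H and E above: the between-groups matrix H and the within-groups matrix E must sum to the total corrected sums-of-squares-and-cross-products matrix T. A self-contained sketch of that identity on toy data, using plain numpy instead of the grouping helpers above:

import numpy as np

group = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3])
x = np.array([[3.1, 5.0], [2.9, 4.4], [3.4, 5.1],
              [4.0, 6.2], [4.2, 6.0], [3.8, 5.9],
              [5.1, 7.0], [4.9, 7.3], [5.3, 6.8]])

grand = x.mean(axis=0)
h = np.zeros((2, 2))
e = np.zeros((2, 2))

for g in np.unique(group):
    xg = x[group == g]
    d = (xg.mean(axis=0) - grand).reshape(-1, 1)
    h += len(xg) * np.dot(d, d.T)      # between-groups SSCP
    c = xg - xg.mean(axis=0)
    e += np.dot(c.T, c)                # within-groups SSCP

t = np.dot((x - grand).T, (x - grand))  # total corrected SSCP
print(np.allclose(h + e, t))            # True: T = H + E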