Beispiel #1
0
def test_osrt(a, val_col: str = None, group_col: str = None, sort: bool = False):

    '''Hayter's one-sided studentised range test (OSRT) against an ordered
    alternative for normal data with equal variances [1]_.

    Parameters
    ----------
    a : array_like or pandas DataFrame object
        An array, any object exposing the array interface or a pandas
        DataFrame.

    val_col : str, optional
        Name of a DataFrame column that contains dependent variable values (test
        or response variable). Values should have a non-nominal scale. Must be
        specified if `a` is a pandas DataFrame object.

    group_col : str, optional
        Name of a DataFrame column that contains independent variable values
        (grouping or predictor variable). Values should have a nominal scale
        (categorical). Must be specified if `a` is a pandas DataFrame object.

    sort : bool, optional
        If True, groups are ordered by sorting the group column values.
        If False (default), groups keep their order of first appearance.

    Returns
    -------
    pval
        P value as returned by ``psturng(stat, k, df)``.
    stat : float
        Largest studentised pairwise difference between group means.

    Notes
    -----
    P values are computed from the Tukey distribution.

    References
    ----------
    .. [1] Hayter, A.J.(1990) A One-Sided Studentised Range Test for Testing
        Against a Simple Ordered Alternative, Journal of the American
        Statistical Association, 85, 778-785.

    Examples
    --------
    >>> import scikit_posthocs as sp
    >>> import pandas as pd
    >>> x = pd.DataFrame({"a": [1,2,3,5,1], "b": [12,31,54,62,12], "c": [10,12,6,74,11]})
    >>> x = x.melt(var_name='groups', value_name='values')
    >>> sp.test_osrt(x, val_col='values', group_col='groups')

    '''

    x, _val_col, _group_col = __convert_to_df(a, val_col, group_col)

    if not sort:
        # An ordered categorical keeps groups in order of first appearance.
        x[_group_col] = Categorical(x[_group_col], categories=x[_group_col].unique(), ordered=True)

    x.sort_values(by=[_group_col], ascending=True, inplace=True)
    groups = np.unique(x[_group_col])
    x_grouped = x.groupby(_group_col)[_val_col]

    xi = x_grouped.mean()
    ni = x_grouped.count()
    k = groups.size
    n = len(x.index)
    df = n - k

    # Pooled within-group variance. Each observation is paired with the mean
    # of its own group via ``transform``, so nothing depends on row order or
    # on whether an integer is read as a label or as a position in ``xi``.
    residuals = np.asarray(x[_val_col]) - np.asarray(x_grouped.transform('mean'))
    sigma = np.sqrt(np.sum(residuals ** 2.) / df)

    def compare(i, j):
        # Studentised absolute difference between the means of groups i and j.
        gi, gj = groups[i], groups[j]
        dif = xi.loc[gj] - xi.loc[gi]
        A = sigma / np.sqrt(2.) * np.sqrt(1. / ni.loc[gj] + 1. / ni.loc[gi])
        return np.abs(dif) / A

    # Upper triangle holds the pairwise statistics; the rest stays at zero.
    vs = np.zeros((k, k), dtype=float)
    for i, j in it.combinations(range(k), 2):
        vs[i, j] = compare(i, j)

    stat = np.max(vs)
    pval = psturng(stat, k, df)
    return pval, stat
Beispiel #2
0
def test_mackwolfe(a: Union[List, np.ndarray, DataFrame], val_col: str = None,
                   group_col: str = None, p: int = None, n_perm: int = 100,
                   sort: bool = False) -> Tuple[float, float]:

    '''Mack-Wolfe Test for Umbrella Alternatives.

    In dose-finding studies one may assume an increasing treatment effect with
    increasing dose level. However, the test subject may actually succumb to
    toxic effects at high doses, which leads to decreasing treatment
    effects [1]_, [2]_.

    The scope of the Mack-Wolfe Test is to test for umbrella alternatives for
    either a known or unknown point P (i.e. dose-level), where the peak
    (umbrella point) is present.

    Parameters
    ----------
    a : array_like or pandas DataFrame object
        An array, any object exposing the array interface or a pandas
        DataFrame.

    val_col : str, optional
        Name of a DataFrame column that contains dependent variable values
        (test or response variable). Values should have a non-nominal scale.
        Must be specified if `a` is a pandas DataFrame object.

    group_col : str, optional
        Name of a DataFrame column that contains independent variable values
        (grouping or predictor variable). Values should have a nominal scale
        (categorical). Must be specified if `a` is a pandas DataFrame object.

    p : int, optional
        The a-priori known peak as a zero-based ordinal number of the
        treatment group including the zero dose level, i.e.
        p = {0, ..., k-1}. If None (default), the peak is treated as unknown
        and the P value is estimated by permutation.

    n_perm : int, optional
        Number of random permutations used to estimate the P value when
        `p` is None. Defaults to 100.

    sort : bool, optional
        If True, groups are ordered by sorting the group column values.
        If False (default), groups keep their order of first appearance.

    Returns
    -------
    p : float
        P value.
    stat : float
        Statistic.

    If `p` lies outside {0, ..., k-1}, a message is printed and False is
    returned instead of the tuple (kept for backward compatibility).

    References
    ----------
    .. [1] Chen, I.Y. (1991) Notes on the Mack-Wolfe and Chen-Wolfe Tests for
        Umbrella Alternatives. Biom. J., 33, 281-290.
    .. [2] Mack, G.A., Wolfe, D. A. (1981) K-sample rank tests for umbrella
        alternatives. J. Amer. Statist. Assoc., 76, 175-181.

    Examples
    --------
    >>> x = [[22, 23, 35], [60, 59, 54], [98, 78, 50], [60, 82, 59], [22, 44, 33], [23, 21, 25]]
    >>> sp.test_mackwolfe(x)

    '''

    x, _val_col, _group_col = __convert_to_df(a, val_col, group_col)

    if not sort:
        # An ordered categorical keeps groups in order of first appearance.
        x[_group_col] = Categorical(x[_group_col], categories=x[_group_col].unique(), ordered=True)
    x.sort_values(by=[_group_col], ascending=True, inplace=True)

    # Group sizes in dose order. The same index supplies the group levels so
    # that the U matrix and the sizes always refer to the same ordering.
    n = x.groupby(_group_col, observed=True)[_val_col].count()
    levels = list(n.index)
    k = len(levels)
    sizes = np.asarray(n, dtype=float)

    if p is not None:
        # ``p`` is used as a zero-based index into the groups below.
        if p >= k:
            print("Selected 'p' >= number of groups:", str(p), " >= ", str(k))
            return False
        if p < 0:
            print("Selected 'p' < 0: ", str(p))
            return False

    # Work positionally with plain arrays: permuted ranks must be matched to
    # rows by position, never by index label.
    ranks = np.asarray(x[_val_col].rank(), dtype=float)
    group_masks = [np.asarray(x[_group_col] == level) for level in levels]

    def _count_greater(ri, rj):
        # Number of pairs (a in ri, b in rj) with b > a.
        return np.sum(rj[np.newaxis, :] > ri[:, np.newaxis])

    def _ustat(r):
        # Pairwise Mann-Whitney style counts between all groups.
        by_group = [r[mask] for mask in group_masks]
        U = np.zeros((k, k), dtype=float)
        for i in range(k):
            for j in range(i):
                U[i, j] = _count_greater(by_group[i], by_group[j])
                U[j, i] = _count_greater(by_group[j], by_group[i])
        return U

    def _ap(peak, U):
        # Upward trend up to the peak plus downward trend after it.
        up = sum(U[i, j] for i in range(peak) for j in range(i + 1, peak + 1))
        down = sum(U[j, i] for i in range(peak, k) for j in range(i + 1, k))
        return up + down

    def _mean_at(peak):
        N1 = np.sum(sizes[:peak + 1])
        N2 = np.sum(sizes[peak:k])
        return (N1**2. + N2**2. - np.sum(sizes**2.) - sizes[peak]**2.) / 4.

    def _var_at(peak):
        N1 = np.sum(sizes[:peak + 1])
        N2 = np.sum(sizes[peak:k])
        N = np.sum(sizes)
        n_p = sizes[peak]

        return (2. * (N1**3 + N2**3) + 3. * (N1**2 + N2**2) -
                np.sum(sizes**2 * (2. * sizes + 3.)) - n_p**2. * (2. * n_p + 3.) +
                12. * n_p * N1 * N2 - 12. * n_p**2. * N) / 72.

    U = _ustat(ranks)

    if p is not None:
        # Known peak: standardised statistic, normal approximation.
        stat = (_ap(p, U) - _mean_at(p)) / np.sqrt(_var_at(p))
        p_value = ss.norm.sf(stat)
    else:
        # Unknown peak: maximise the standardised statistic over all candidate
        # peaks and calibrate it against random permutations of the ranks.
        peaks = range(k)
        mean = np.array([_mean_at(i) for i in peaks])
        sd = np.sqrt(np.array([_var_at(i) for i in peaks]))

        def _max_standardised(u):
            ap = np.array([_ap(i, u) for i in peaks])
            return np.max((ap - mean) / sd)

        stat = _max_standardised(U)
        mt = np.array([_max_standardised(_ustat(np.random.permutation(ranks)))
                       for _ in range(n_perm)])
        # Proportion of permuted statistics exceeding the observed one.
        p_value = np.sum(mt > stat) / n_perm

    return p_value, stat