Ejemplo n.º 1
0
def test_linconb():
    """
    Regression test for `linconb`.

    Seeds NumPy's global RNG, runs `linconb` on the example data with the
    contrast matrix from `con1way(3)`, and compares the result against the
    pickled reference stored in ``test_data/linconb.pkl`` (path is relative
    to the current working directory).
    """

    np.random.seed(42)
    df = create_example_data(3)
    results = linconb(df, con1way(3))

    # A context manager guarantees the fixture's file handle is closed even
    # if unpickling fails. NOTE: pickle must only be used on trusted files.
    with open("test_data/linconb.pkl", "rb") as fixture:
        expected = pickle.load(fixture)

    expected_truth = build_truth_list(expected)
    actual_truth = check_dict_items_equality(expected, results)

    assert actual_truth == expected_truth
Ejemplo n.º 2
0
def pkl_linconb():
    """
    Regenerate the pickled reference result for `linconb`.

    Seeds NumPy's global RNG, runs `linconb` on the example data with the
    contrast matrix from `con1way(3)`, and writes the result to
    ``hypothesize/tests/test_data/linconb.pkl`` (path is relative to the
    current working directory).
    """

    np.random.seed(42)
    df = create_example_data(3)
    results = linconb(df, con1way(3))

    # Closing the handle via `with` ensures the pickle is fully flushed to
    # disk before the function returns.
    with open("hypothesize/tests/test_data/linconb.pkl", "wb") as fixture:
        pickle.dump(results, fixture)
def rmmcppbd(x,
             est,
             *args,
             alpha=.05,
             con=None,
             nboot=None,
             hoch=True,
             seed=False):
    """
    Use a percentile bootstrap method to compare dependent groups
    based on difference scores.

    By default, compute a .95 confidence interval for all linear contrasts
    specified by con, a J by C matrix, where C is the number of
    contrasts to be tested, and the columns of con are the
    contrast coefficients.
    If con is not specified, the matrix returned by `con1way(J)` is used.

    A sequentially rejective method is used to control alpha.
    If n>=80, Hochberg's method is always used.

    Note that arguments up to and including `args` are positional arguments

    :param x: 2-D NumPy array
    Rows are observations and columns are groups. Rows containing
    any NaN are removed before the analysis.

    :param est: function
    Measure of location. It is called as `est(scores, *args)` on the 2-D
    array of contrast scores (`x @ con`) and must return one value per column.

    :param args: list/value
    Extra positional parameter(s) forwarded to `est`

    :param alpha: float
    Alpha level (default is .05)

    :param con: array
    J by C contrast matrix (default is `None`, see above)

    :param nboot: int
    Number of bootstrap samples. When not given it is chosen from the
    number of contrasts C: 1000 (C<=4), 2000 (C<=6), 3000 (C<=10), else 5000.

    :param hoch: bool
    When `True` (default), or whenever n>=80, the critical p-values are
    alpha/k for k=1..C. Otherwise tabled critical values are used for
    alpha of .05 and .01.

    :param seed: int or `False`
    When truthy it is passed to `np.random.seed`. Any falsy value
    (including 0) leaves the global random state untouched.

    :return:
    Dictionary of results

    output: array
    C by 6 array whose columns are the contrast index, the estimate,
    the p-value, the critical p-value, and the lower and upper CI bounds

    con: array
    Contrast matrix

    num_sig: int
    Number of contrasts whose p-value is <= its critical p-value
    """

    x = x[~np.isnan(x).any(axis=1)]
    n = x.shape[0]
    J = x.shape[1]
    if n >= 80:
        hoch = True

    if con is None:
        con = con1way(J)

    ncon = con.shape[1]

    if not nboot:
        # Thresholds must be tested from the smallest upwards; testing
        # `<= 10` first would make the smaller tiers unreachable.
        if ncon <= 4:
            nboot = 1000
        elif ncon <= 6:
            nboot = 2000
        elif ncon <= 10:
            nboot = 3000
        else:
            nboot = 5000

    # one column of scores per contrast
    xx = x @ con

    if seed:
        np.random.seed(seed)

    # each row of `data` holds the resampled row indices of one bootstrap draw
    psihat = np.zeros([ncon, nboot])
    data = np.random.randint(n, size=(nboot, n))

    for ib in range(nboot):
        psihat[:, ib] = est(xx[data[ib, :], :], *args)

    # generalized p-value: proportion of bootstrap estimates above zero
    # (ties count one half), folded to the smaller tail and doubled
    prop_pos = ((psihat > 0).sum(axis=1) +
                .5 * (psihat == 0).sum(axis=1)) / nboot
    test = 2 * np.minimum(prop_pos, 1 - prop_pos)

    # percentile confidence limits taken from the sorted bootstrap estimates
    # NOTE(review): the floor division and the `- 2` offset differ from
    # rmmcppb, which uses `round(alpha * nboot / 2)` and `- 1`; kept as is.
    icl = round(alpha * nboot // 2)
    icu = nboot - icl - 2
    psihat_sorted = np.sort(psihat, axis=1)
    cimat = np.column_stack([psihat_sorted[:, icl], psihat_sorted[:, icu]])

    # Critical values are kept as arrays so the doubling below is an
    # element-wise multiplication (a plain list would merely be repeated).
    if alpha == .05:
        dvec = np.array([
            .025, .025, .0169, .0127, .0102, .00851, .0073, .00639, .00568,
            .00511
        ])

        if ncon > 10:
            dvec = np.append(dvec, .05 / np.arange(11, ncon + 1))

    elif alpha == .01:
        dvec = np.array([
            .005, .005, .00334, .00251, .00201, .00167, .00143, .00126,
            .00112, .00101
        ])

        if ncon > 10:
            dvec = np.append(dvec, .01 / np.arange(11, ncon + 1))

    else:
        dvec = alpha / np.arange(1, ncon + 1)
        # with a single contrast there is no second entry to overwrite
        if ncon > 1:
            dvec[1] = alpha / 2

    if hoch:
        dvec = alpha / (2 * np.arange(1, ncon + 1))

    dvec = 2 * dvec

    # the largest p-value is paired with the first (largest) critical value
    order = (-test).argsort()
    zvec = dvec[:ncon]

    output = np.zeros([ncon, 6])
    output[order, 3] = zvec
    output[:, 0] = np.arange(ncon)
    output[:, 1] = est(xx, *args)
    output[:, 2] = test
    output[:, 4:6] = cimat

    num_sig = np.sum(output[:, 2] <= output[:, 3])

    return {"output": output, "con": con, "num_sig": num_sig}
def rmmcppb(x,
            est,
            *args,
            alpha=.05,
            con=None,
            dif=True,
            nboot=None,
            BA=False,
            hoch=False,
            SR=False,
            seed=False):
    """
    Use a percentile bootstrap method to compare dependent groups.
    By default, compute a .95 confidence interval for all linear contrasts
    specified by con, a J-by-C matrix, where C is the number of
    contrasts to be tested, and the columns of `con` are the
    contrast coefficients. If con is not specified,
    the matrix returned by `con1way(J)` is used.

    Method SR is intended for the `onestep` and `mom` estimators, which are
    not implemented yet, so requesting it (with `hoch=False`) raises.
    Critical p-values are therefore always alpha/k for k=1..C.

    If `dif` is `False` and `BA` is `True`, the bias adjusted
    estimate of the generalized p-value is used.
    Using `BA`=`True` (when `dif`=`False`)
    is recommended when comparing groups
    with M-estimators and MOM, but it is not necessary when
    comparing 20% trimmed means (Wilcox & Keselman, 2002).

    Note that arguments up to and including `args` are positional arguments

    :param x: Pandas DataFrame or 2-D NumPy array
    Each column represents a group of data. For a DataFrame, rows with
    missing values are dropped and `output` is returned as a DataFrame.

    :param est: function
    Measure of location (currently only `trim_mean` is supported)

    :param args: list/value
    Parameter(s) for measure of location (e.g., .2)

    :param alpha: float
    Alpha level (default is .05)

    :param con: array
    `con` is a J (number of columns) by d (number of contrasts)
    matrix containing the contrast coefficients of interest.
    All linear contrasts can be created automatically by using the function [con1way](J)
    (the result of which can be used for `con`). The default is `None` and in this
    case all linear contrasts are created automatically.

    :param dif: bool
    When `True`, use difference scores (the work is delegated to `rmmcppbd`),
    otherwise use marginal distributions

    :param nboot: int
    Number of bootstrap samples. Default is `None`
    in which case `nboot` will be chosen for you
    based on the number of contrasts
    (with `dif=False`: 1000 for up to 4 contrasts, otherwise 5000).

    :param BA: bool
    When `True` (and `dif` is `False`), the bias adjusted estimate of the
    generalized p-value is used

    :param hoch: bool
    When `True`, `SR` is ignored. It has no other effect at present.

    :param SR: bool
    Request the modified "sequentially rejective" method.
    Not implemented: raises unless `hoch` is `True`.

    :param seed: int or `False`
    Random seed for reproducible results (default is `False`).
    Any falsy value (including 0) leaves the random state untouched.

    :return:
    Dictionary of results

    con: array
    Contrast matrix

    num_sig: int
    Number of statistically significant results

    output: DataFrame (array when `x` was passed as an array)
    Contrast index, estimate, p-value, critical value, and CI for each contrast
    """

    col_names = [
        'con_num', 'psihat', 'p_value', 'p_crit', 'ci_lower', 'ci_upper'
    ]

    # DataFrame input means a user-facing call: return a labelled DataFrame
    called_directly = isinstance(x, pd.DataFrame)
    if called_directly:
        x = x.dropna().values

    if hoch:
        SR = False

    if SR:
        raise Exception(
            "onestep and mom estimators are not yet implemented "
            "and only these can be used with SR method. Please set SR to False for now."
        )

    if dif:
        print("analysis is being done on difference scores",
              "each confidence interval has probability coverage of 1-alpha.")

        # `seed` must be forwarded, otherwise it has no effect on this path
        temp = rmmcppbd(x,
                        est,
                        *args,
                        alpha=alpha,
                        con=con,
                        nboot=nboot,
                        hoch=True,
                        seed=seed)

        output = temp['output']
        if called_directly:
            output = pd.DataFrame(output, columns=col_names)

        return {
            'output': output,
            'con': temp['con'],
            "num_sig": temp['num_sig']
        }

    print("dif=False so using marginal distributions")

    if not BA:
        print("If and when MOM and/or onestep estimators are implemeted, "
              "it is suggested to use BA=True and hoch=T")

    n = x.shape[0]
    J = x.shape[1]

    # centre every group on its own estimate; used for the bias term
    xcen = np.full([n, J], np.nan)
    for j in range(J):
        xcen[:, j] = x[:, j] - est(x[:, j], *args)

    if con is None:
        con = con1way(J)

    ncon = con.shape[1]

    if nboot is None:
        # `<=` so that exactly four contrasts also receive a default
        nboot = 1000 if ncon <= 4 else 5000

    if seed:
        np.random.seed(seed)

    # bootstrap the estimates of the raw and the centred data using the
    # same resampled rows for both
    bvec = np.full([nboot, J], np.nan)
    bveccen = np.full([nboot, J], np.nan)
    data = np.random.randint(n, size=(nboot, n))
    for ib in range(nboot):
        bvec[ib, :] = est(x[data[ib, :], :], *args)
        bveccen[ib, :] = est(xcen[data[ib, :], :], *args)

    psihat = np.zeros([ncon, nboot])
    psihatcen = np.zeros([ncon, nboot])
    test = np.full(ncon, np.nan)

    for ic in range(ncon):
        psihat[ic, :] = [bptdpsi(row, con[:, ic]) for row in bvec]
        psihatcen[ic, :] = [bptdpsi(row, con[:, ic]) for row in bveccen]
        bias = np.sum((psihatcen[ic, :] > 0)) / nboot - .5
        ptemp = (np.sum(psihat[ic, :] > 0) +
                 .5 * np.sum(psihat[ic, :] == 0)) / nboot

        p_tail = ptemp - .1 * bias if BA else ptemp
        p_tail = np.min([p_tail, 1 - p_tail])
        # the bias corrected value might be less than zero
        test[ic] = np.max([p_tail, 0])

    test = 2 * test

    # SR is always False here (see the guard above), so the critical
    # p-values are alpha/k irrespective of `hoch` and `BA`.
    zvec = alpha / np.arange(1, ncon + 1)

    output = np.zeros([ncon, 6])
    # the largest p-value is paired with the largest critical value
    output[(-test).argsort(), 3] = zvec

    tmeans = est(x, *args)
    icl = round(alpha * nboot / 2)
    icu = nboot - icl - 1

    for ic in range(ncon):
        output[ic, 0] = ic
        output[ic, 1] = np.sum(con[:, ic] * tmeans)
        output[ic, 2] = test[ic]
        boot_sorted = np.sort(psihat[ic, :])
        output[ic, 4] = boot_sorted[icl]
        output[ic, 5] = boot_sorted[icu]

    # step through the p-values from largest to smallest; everything from
    # the first one that meets its critical value onwards is significant
    num_sig = ncon
    for idx in (-output[:, 2]).argsort():
        if output[idx, 2] <= output[idx, 3]:
            break
        num_sig = num_sig - 1

    if called_directly:
        output = pd.DataFrame(output, columns=col_names)

    return {"output": output, "con": con, "num_sig": num_sig}
def tmcppb(x,
           est,
           *args,
           con=None,
           bhop=False,
           alpha=.05,
           nboot=None,
           seed=False):
    """
    Multiple comparisons for J independent groups using trimmed means and
    the percentile bootstrap method. Rom’s method is used to control the
    probability of one or more type I errors. For C > 10 hypotheses,
    or when the goal is to test at some level other than .05 and .01,
    Hochberg’s method is used. Setting the argument `bhop` to `True` uses the
    Benjamini–Hochberg method instead.

    Note that arguments up to and including `args` are positional arguments

    :param x: Pandas DataFrame
    Each column represents a group of data

    :param est: function
    Measure of location (currently only `trim_mean` is supported)

    :param args: list/value
    Parameter(s) for measure of location (e.g., .2)

    :param con: array
    `con` is a J (number of columns) by d (number of contrasts)
    matrix containing the contrast coefficients of interest.
    All linear contrasts can be created automatically by using the function [con1way](J)
    (the result of which can be used for `con`). The default is `None` and in this
    case all linear contrasts are created automatically.

    :param bhop: bool
    If `True`, the Benjamini–Hochberg method is used to control FWE

    :param alpha: float
    Alpha level. Default is .05.

    :param nboot: int
    Number of bootstrap samples. When not given it is chosen from the
    number of groups J: 2000 (J<=3), 4000 (J<=8), otherwise 5000.

    :param seed: int or `False`
    Random seed for reproducible results. Default is `False`.
    Any falsy value (including 0) leaves the random state untouched.

    :return:
    Dictionary of results

    con: array
    Contrast matrix

    num_sig: int
    Number of statistically significant results

    output: DataFrame
    Contrast index, estimate, p-value, critical value, and CI for each contrast
    """

    x = pandas_to_arrays(x)
    x = remove_nans_based_on_design(x, len(x), 'independent_groups')
    J = len(x)

    mvec = [est(group, *args) for group in x]

    if con is None:
        con = con1way(J)

    ncon = con.shape[1]

    if not nboot:
        # Thresholds must be tested from the smallest upwards; testing
        # `<= 8` first would make the `<= 3` tier unreachable.
        if J <= 3:
            nboot = 2000
        elif J <= 8:
            nboot = 4000
        else:
            nboot = 5000

    if bhop:
        dvec = (ncon - np.arange(1, ncon + 1) + 1) * alpha / ncon

    elif alpha == .05:
        dvec = np.array([
            .05, .025, .0169, .0127, .0102, .00851, .0073, .00639, .00568,
            .00511
        ])

        if ncon > 10:
            # extend the table as one flat vector (not a nested list)
            dvec = np.append(dvec, .05 / np.arange(11, ncon + 1))

    elif alpha == .01:
        dvec = np.array([
            .01, .005, .00334, .00251, .00201, .00167, .00143, .00126,
            .00112, .00101
        ])

        if ncon > 10:
            dvec = np.append(dvec, .01 / np.arange(11, ncon + 1))

    else:
        dvec = alpha / np.arange(1, ncon + 1)

    if seed:
        np.random.seed(seed)

    # bootstrap each group independently: one row of estimates per group
    bvec = np.full([J, nboot], np.nan)
    for i, group in enumerate(x):
        data = np.random.choice(group, size=(nboot, len(group)))
        bvec[i, :] = [est(row, *args) for row in data]

    bcon = con.T @ bvec
    tvec = con.T @ mvec

    # generalized p-value: proportion of bootstrap contrasts above zero
    # (ties count one half), folded to the smaller tail and doubled
    ties = (bcon == 0).sum(axis=1) / nboot
    prop_pos = (bcon > 0).sum(axis=1) / nboot + .5 * ties
    test = 2 * np.where(prop_pos > .5, 1 - prop_pos, prop_pos)

    output = np.full([ncon, 6], np.nan)
    # the largest p-value is paired with the first (largest) critical value
    output[(-test).argsort(), 3] = dvec[:ncon]

    # The interval level comes from the smallest critical value actually in
    # use, i.e. the entry for `ncon` contrasts. With the 10-entry tables,
    # `dvec[-1]` would always pick the entry for ten contrasts instead.
    icl = int(np.round(dvec[ncon - 1] * nboot / 2))
    icu = nboot - icl - 3

    bcon_sorted = np.sort(bcon, axis=1)
    output[:, 0] = np.arange(ncon)
    output[:, 1] = tvec
    output[:, 2] = test
    output[:, 4] = bcon_sorted[:, icl]
    output[:, 5] = bcon_sorted[:, icu]

    num_sig = np.sum(output[:, 2] <= output[:, 3])
    cols = ["con_num", "psihat", "p_value", "p_crit", "ci_lower", "ci_upper"]
    output = pd.DataFrame(output, columns=cols)

    results = {'output': output, 'con': con, 'num_sig': num_sig}

    return results