import numpy
import tables
from scipy.stats import find_repeats


def remove_duplicates(hdf5file):
    """Removes the duplicates from each table of hdf5file."""
    # Build a list of all tables.......
    tableslist = [n for n in hdf5file.walkNodes()
                  if isinstance(n, tables.table.Table) and 'id' in n.colnames]
    for tbl in tableslist:
        idcol = tbl.col('id')
        nullrow = tuple([-9999] * len(tbl[-1]))
        # Find the duplicates..........
        for dup in find_repeats(idcol)[0]:
            duprow = (idcol == dup).nonzero()[0]
            baserow = tbl[duprow[0]]
            # Set the duplicates to the null row....
            for r in [d for d in duprow[1:]
                      if (tbl[d] == baserow) or
                         (tbl[d].tostring() == baserow.tostring())]:
                tbl.modifyRows(r, rows=[nullrow, ])
        # Save the results .........................
        tbl.flush()        
        # Get the list of the flagged rows .........
        nullist = numpy.array([r.nrow for r in tbl if r['id'] == -9999])
        # Remove the flagged rows iteratively ......
        while len(nullist) > 0:
            first = nullist[0]
            last = nullist[(numpy.diff(nullist) != 1).nonzero()]
            if not last.size:
                last = nullist[-1]
            else:
                last = last[0]
            if first == last:
                last += 1
            tbl.removeRows(first, last)
            tbl.flush()
            nullist = numpy.array([r.nrow for r in tbl if r['id'] == -9999])
    return hdf5file
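The PyTables plumbing aside, the duplicate-detection core of the function above can be exercised on a bare array; a minimal sketch (the ids values are invented for illustration):
import numpy as np
from scipy.stats import find_repeats

ids = np.array([3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5], dtype=float)
for dup in find_repeats(ids)[0]:          # each value occurring more than once
    duprow = (ids == dup).nonzero()[0]    # row indices holding that value
    print(dup, "->", duprow)              # keep duprow[0], flag the rest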
Example #2
    def _xr2_test(self):

        ranks = []
        for i in range(self.n):
            ranks.append(rankdata(self.design_matrix[i]))

        ranks = np.vstack(ranks)

        ties = []

        for i in range(self.n):
            repeat_count = list(find_repeats(self.design_matrix[i])[1])
            if repeat_count:
                # extend, not append: keep a flat list of tie-group sizes
                ties.extend(repeat_count)

        ties = np.array(ties)
        correction = 1 - np.sum(ties ** 3 - ties) / (self.n *
                                                     (self.k ** 3 - self.k))

        xr2 = (12. / (self.n * self.k * (self.k + 1.))) * np.sum(np.sum(ranks, axis=0) ** 2.) \
              - (3. * self.n * (self.k + 1.))

        xr2 /= correction

        return xr2
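The same Friedman statistic can be reproduced outside the class and cross-checked against scipy.stats.friedmanchisquare; a standalone sketch with an invented 8x3 design matrix (rows are blocks, columns are treatments):
import numpy as np
from scipy.stats import rankdata, find_repeats, friedmanchisquare

rng = np.random.default_rng(0)
design_matrix = rng.integers(1, 5, size=(8, 3)).astype(float)
n, k = design_matrix.shape

ranks = np.vstack([rankdata(row) for row in design_matrix])
counts = [find_repeats(row)[1] for row in design_matrix]  # tie-group sizes per row
ties = sum((c ** 3 - c).sum() for c in counts)
correction = 1 - ties / (n * (k ** 3 - k))

xr2 = ((12. / (n * k * (k + 1.))) * np.sum(ranks.sum(axis=0) ** 2)
       - 3. * n * (k + 1.))
xr2 /= correction
print(xr2, friedmanchisquare(*design_matrix.T))  # statistics should agree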
Example #3
import math

import numpy as np
from scipy import stats


def wilcoxon(x, y=None):

    if y is None:
        d = np.asarray(x)
    else:
        x, y = map(np.asarray, (x, y))
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon.  Aborting.')
        d = x - y

    d = np.compress(np.not_equal(d, 0), d, axis=-1)

    count = len(d)
    if count < 10:
        print("Warning: sample size too small for normal approximation.")

    r = stats.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r, axis=0)
    r_minus = np.sum((d < 0) * r, axis=0)

    T = min(r_plus, r_minus)
    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    _, repnum = stats.find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

    se = math.sqrt(se / 24)
    Z = (T - mn) / se
    R = abs(Z) / math.sqrt(len(x))
    p = 2. * stats.norm.sf(abs(Z))

    return T, Z, R, p
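A hypothetical paired call (measurements invented; with ten non-zero differences the small-sample warning stays quiet, and the tied |d| values exercise the find_repeats correction):
before = np.array([125., 115., 130., 140., 140., 115., 140., 125., 140., 135.])
after_ = np.array([110., 122., 125., 120., 141., 124., 123., 137., 135., 145.])
T, Z, R, p = wilcoxon(before, after_)
print(T, Z, R, p)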
Example #4
import numpy as np
import scipy.stats as ss


def mannwhitney_permute(x1, x2, n_boot=10000):
    """ Two-sided Mann-Whitney U test by permutation
    """
    # calc U statistic
    n1 = len(x1)
    n2 = len(x2)
    mid = 0.5 * n1 * (n1 + 1)
    x = list(x1)
    x.extend(x2)
    x = np.array(x)
    if len(ss.find_repeats(x).values) > 0:
        raise ValueError("cannot compute exact p-value with ties")
    ranks = ss.rankdata(x)
    r1 = np.sum(ranks[:n1])
    u1 = r1 - mid
    u = np.min((u1, n1 * n2 - u1))

    # permute
    u_dist = np.empty(n_boot)
    for i in range(n_boot):
        np.random.shuffle(x)
        ranks = ss.rankdata(x)
        r1 = np.sum(ranks[:n1])
        u1 = r1 - mid
        u_dist[i] = np.min((u1, n1 * n2 - u1))
    pval = np.sum(u_dist <= u) / float(n_boot)
    return u, pval
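A hypothetical run on continuous draws (ties are almost surely absent, as the function requires):
rng = np.random.default_rng(1)
x1 = rng.normal(0.0, 1.0, 30)
x2 = rng.normal(0.5, 1.0, 30)
u, pval = mannwhitney_permute(x1, x2, n_boot=2000)
print(u, pval)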
Example #5
    def test_find_repeats(self):
        x = np.asarray([1,1,2,2,3,3,3,4,4,4,4]).astype('float')
        tmp = np.asarray([1,1,2,2,3,3,3,4,4,4,4,5,5,5,5]).astype('float')
        xm = np.ma.array(tmp,mask=tmp == 5.)

        r = stats.find_repeats(x)
        rm = stats.mstats.find_repeats(xm)

        assert_equal(r,rm)
Example #6
import numpy as np
from scipy import stats


def faster_mode1D(a):
    arr = np.asarray(a)  # would be _chk_array
    v, c = stats.find_repeats(arr)
    if len(c) == 0:
        arr.sort()  # mimic first value behavior
        return arr[0], 1.
    else:
        pos = c.argmax()
        return v[pos], c[pos]
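Quick checks (note that np.asarray does not copy an existing ndarray, so the no-repeat branch sorts the caller's array in place):
print(faster_mode1D([1, 2, 2, 3, 3, 3]))  # (3.0, 3): agrees with stats.mode
print(faster_mode1D([7, 5, 6]))           # no repeats: smallest value, count 1.0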
Example #7
    def test_find_repeats(self):
        x = np.asarray([1,1,2,2,3,3,3,4,4,4,4]).astype('float')
        tmp = np.asarray([1,1,2,2,3,3,3,4,4,4,4,5,5,5,5]).astype('float')
        mask = (tmp == 5.)
        xm = np.ma.array(tmp, mask=mask)

        r = stats.find_repeats(x)
        rm = stats.mstats.find_repeats(xm)

        assert_equal(r,rm)
Example #8
def pedersen_distribution():
    """Returns tests for the distribution of public values
    for Pedersen commitments when the message is zero or one
    """ 
    x = 1000000
    gen = pedersen.Pedersen(256)
    c0 = []
    c1 = []
    for i in range(x):
        c0.append(gen.commit(0).c / 1.0)
        c1.append(gen.commit(1).c / 1.0)
        if (i % (x / 100) == 0):
            print(100 * i / x)
    print(stats.ks_2samp(c0, c1))
    print("ks 0:", stats.kstest(c0, "uniform"))
    print("ks 1:", stats.kstest(c1, "uniform"))
    print("0", stats.describe(c0))
    print("1", stats.describe(c1))
    print("0 repeats:", stats.find_repeats(c0))
    print("1 repeats:", stats.find_repeats(c1))
    print("0 entropy:", stats.entropy(c0))
    print("1 entropy:", stats.entropy(c1))
    input()
Example #9
def bitproof_test(z, title, same_graph=True, x=100000):
    """Prints a histogram for the selected value of a bitproof commitment,
    along with statistical tests on the values.
    z takes the form of lambda x: x.(variable here)
    same_graph is whether the histograms are on the same page
    x is the number of trials
    """
    gen = pedersen.Pedersen(64)
    c0 = gen.commit(0)
    c1 = gen.commit(1)
    b0 = []
    b1 = []
    for i in range(x):
        b0.append(z(bitproof.bitproof(0, c0, gen.state)) / 1.0)
        b1.append(z(bitproof.bitproof(1, c1, gen.state)) / 1.0)
        if (i % (x/100) == 0):
            print(100 * i / x)
    print(title)
    print(stats.ks_2samp(b0, b1))
    print("ks 0:", stats.kstest(b0, "uniform"))
    print("ks 1:", stats.kstest(b1, "uniform"))
    print("0", stats.describe(b0))
    print("1", stats.describe(b1))
    print("0 repeats:", stats.find_repeats(b0))
    print("1 repeats:", stats.find_repeats(b1))
    print("0 entropy:", stats.entropy(b0))
    print("1 entropy:", stats.entropy(b1))
    plt.title(f"Histogram of {title} Values")
    plt.hist(b0, bins = "auto", range = (0, gen.state.p / 1.0)) 
    if not same_graph:
        plt.ylabel("Occurences")
        plt.xlabel("Variable value")
        plt.show()
    plt.hist(b1, bins = "auto", range = (0, gen.state.p / 1.0))
    plt.ylabel("Occurences")
    plt.xlabel("Variable value")
    plt.show()
Example #10
def my_wilcoxon(condition2, condition1, x, y, global_mean, global_std, motif_name, correction = False):

	x, y = map(np.asarray, (x, y)) #apply np.asarray for both input arrays

	if len(x) != len(y):
		raise ValueError("The length of both arrays in Wilcoxon test should be the same. Aborting")

	d = x - y #find the difference

	#keep all non-zero differences
	d = np.compress(np.not_equal(d, 0), d) # scipy passes axis=-1; it does not matter here, as the array is flattened

	#correct the differences according to the global mean and std
	d_normalized = (d - global_mean) / global_std

	count = len(d_normalized)

	if count < 10:
		logger.info("The sampe size is too small for normal approximation")

	r = stats.rankdata(abs(d_normalized)) #assign ranks to data, dealing with ties appropriately

	r_plus = np.sum((d_normalized > 0) * r, axis = 0)
	r_minus = np.sum((d_normalized < 0) * r, axis = 0)

	T = min(r_plus, r_minus)
	mn = count * (count + 1.) * 0.25
	se = count * (count + 1.) * (2. * count + 1.)

	replist, repnum = stats.find_repeats(r)

	if repnum.size != 0:
		#correction for repeated elements
		se -= 0.5 * (repnum * (repnum * repnum -1)).sum()

	se = np.sqrt(se / 24)
	correction = 0.5 * int(bool(correction)) * np.sign(T - mn)
	z = (T - mn - correction) / se
	prob = 2. * stats.norm.sf(abs(z), scale = 1) #do not scale

	motif_std = np.std(d_normalized, ddof = 1)
	motif_mu = np.mean(d_normalized)

	direction = get_name_from_path(condition2)
	if motif_mu < 0:
		direction = get_name_from_path(condition1)

	return prob, direction, d, d_normalized, motif_std
Example #11
def _check_friedman(n_strategies, n_datasets, ranked_data, alpha):
    """
    Check whether Friedman test is significant.

    Larger parts of code copied from scipy.

    Arguments
    ---------
    n_strategies : int
      number of strategies to evaluate
    n_datasets : int
      number of datasets classified per strategy
    ranked_data : np.array (shape: n_datasets x n_strategies)
      rank of strategy on dataset

    Returns
    -------
    is_significant : bool
      Indicates whether strategies differ significantly in terms of performance
      (according to Friedman test).
    """
    if n_strategies < 3:
        raise ValueError(
            "At least 3 sets of measurements must be given for Friedmann test, got{}.".format(
                n_strategies
            )
        )

    # calculate c to correct chisq for ties:
    ties = 0
    for i in range(n_datasets):
        replist, repnum = find_repeats(ranked_data[i])
        for t in repnum:
            ties += t * (t * t - 1)
    c = 1 - ties / (n_strategies * (n_strategies * n_strategies - 1) * n_datasets)

    ssbn = np.sum(ranked_data.sum(axis=0) ** 2)
    chisq = (
        12.0 / (n_strategies * n_datasets * (n_strategies + 1)) * ssbn
        - 3 * n_datasets * (n_strategies + 1)
    ) / c
    p = distributions.chi2.sf(chisq, n_strategies - 1)
    if p < alpha:
        is_significant = True
    else:
        is_significant = False
    return is_significant
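A hypothetical driver, pulling in the names the snippet itself assumes (find_repeats, distributions, np) and ranking invented scores per dataset:
import numpy as np
from scipy.stats import find_repeats, distributions, rankdata

rng = np.random.default_rng(2)
scores = rng.random((10, 4))                    # 10 datasets x 4 strategies
ranked = np.vstack([rankdata(row) for row in scores])
print(_check_friedman(4, 10, ranked, 0.05))     # True if p < alpha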
Example #12
import numpy as np
import scipy.stats as st


def find2ds(a, b):
    """
    Find the positions of a given element in a 2D array.
    parameter:
    ----------
    a : the array in which to search for the element
    b : the element to find
    return:
    ---------
    data_ip : the (row, column) indices of all occurrences of the element
    PS : find_repeats only reports repeated values, and this cannot be
    used to find bool-type elements
    """
    try:
        da = np.array(a)
        goal = b
        f = st.find_repeats(da)
        ix = f[0] == goal
        iy = ix.tolist()
        x = iy.index(True)
        y = f[1][x]
        data_ip = np.zeros((y, 2), dtype=np.intp)  # np.int0 was removed in NumPy 2.0
        A = da.flatten()
        B = da.shape
        for k in range(y):
            ia = A == goal
            ib = ia.tolist()
            ip = ib.index(True)
            N = np.floor((ip + 1) / B[1])
            if (ip + 1) % B[1] == 0:
                X = N - 1
            else:
                X = N
            Y = ip % B[1]
            data_ip[k, 0] = X
            data_ip[k, 1] = Y
            if goal == 0:
                A[ip] = np.intp(True)
            else:
                A[ip] = np.intp(False)
    except ValueError:
        print('can not find element!')
        data_ip = 'Error!'
    return data_ip


#find2ds(a = True,b = True)
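A hypothetical call (the target must actually repeat in the array, since find_repeats only reports duplicated values):
grid = np.array([[1, 2],
                 [2, 3]])
print(find2ds(grid, 2))   # [[0 1], [1 0]]: the two positions of 2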
Example #13
    def test_find_repeats(self):
        x = np.asarray([1,1,2,2,3,3,3,4,4,4,4]).astype('float')
        tmp = np.asarray([1,1,2,2,3,3,3,4,4,4,4,5,5,5,5]).astype('float')
        mask = (tmp == 5.)
        xm = np.ma.array(tmp, mask=mask)
        x_orig, xm_orig = x.copy(), xm.copy()

        r = stats.find_repeats(x)
        rm = stats.mstats.find_repeats(xm)

        assert_equal(r, rm)
        assert_equal(x, x_orig)
        assert_equal(xm, xm_orig)

        # This crazy behavior is expected by count_tied_groups, but is not
        # in the docstring...
        _, counts = stats.mstats.find_repeats([])
        assert_equal(counts, np.array(0, dtype=np.intp))
Example #15
def wilcoxon_test(x, y, alternative):
    """
    One-sided Wilcoxon signed-rank test derived from Scipy's two-sided test
    e.g. for alternative == constants.LESS, rejecting the null means that median difference x - y < 0
    Returns p-value
    """
    # TODO: add unit tests to verify results identical to R's Wilcoxon test for a host of input values
    # pylint: disable = invalid-name, too-many-locals
    x, y = map(asarray, (x, y))
    d = x - y

    d = compress(np.not_equal(d, 0), d, axis=-1)

    count = len(d)

    r = rankdata(abs(d))
    T = np.sum((d > 0) * r, axis=0)

    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    if se < 1e-20:
        return 1.  # Degenerate case

    _, repnum = find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

    se = sqrt(se / 24)
    if alternative == constants.LESS:
        correction = -0.5
    elif alternative == constants.GREATER:
        correction = 0.5
    else:
        correction = 0.5 * np.sign(T - mn)  # two-sided

    z = (T - mn - correction) / se

    if alternative == constants.LESS:
        return norm.cdf(z)
    if alternative == constants.GREATER:
        return norm.sf(z)
    return 2 * min(norm.cdf(z), norm.sf(z))  # two-sided
Example #16
    def _runs_test(self):
        n1, n2 = find_repeats(pd.factorize(self.x)[0]).counts

        r_range = np.arange(2, self.r + 1)
        evens = r_range[r_range % 2 == 0]
        odds = r_range[r_range % 2 != 0]

        p_even = 1 / comb(n1 + n2, n1) * np.sum(
            2 * comb(n1 - 1, evens / 2 - 1) * comb(n2 - 1, evens / 2 - 1))

        p_odd = 1 / comb(n1 + n2, n1) * np.sum(
            comb(n1 - 1, odds - 1) * comb(n2 - 1, odds - 2) +
            comb(n1 - 1, odds - 2) * comb(n2 - 1, odds - 1))

        p = p_even + p_odd

        if all(np.array([n1, n2]) < 20):
            r_crit_1, r_crit_2 = r_critical_value(n1, n2)

            test_summary = {
                'probability': p,
                'p-value': p,
                'r critical value 1': r_crit_1,
                'r critical value 2': r_crit_2,
                'r': self.r
            }
            return test_summary

        else:
            mean = (2 * n1 * n2) / (n1 + n2) + 1
            sd = np.sqrt((2 * n1 * n2 * (2 * n1 * n2 - n1 - n2)) /
                         ((n1 + n2)**2 * (n1 + n2 - 1)))
            z = (np.absolute(self.r - mean) - (0.5 * self.continuity)) / sd
            p_val = norm.sf(z) * 2

            test_summary = {
                'probability': p,
                'mean of runs': mean,
                'standard deviation of runs': sd,
                'z-value': z,
                'p-value': p_val
            }

            return test_summary
Example #17
import numpy as np
import scipy.stats as st


def find1ds(a, b):
    """
    Find the indices of all elements equal to the given value.
    parameter:
    ----------
    a : the array in which to search
    b : the value to find (it may occur more than once)
    return:
    -------
    data_ip : the indices of the element in the given array, as ints
    (indices must be of int type)
    PS : equality comparison cannot match NaN, and this cannot be
    used to find bool-type elements
    """
    try:
        da = np.array(a)
        goal = b
        f = st.find_repeats(da)
        ix = f[0] == goal
        iy = ix.tolist()
        x = iy.index(True)
        y = f[1][x]
        data_ip = np.zeros(y, dtype=np.intp)  # np.int0 was removed in NumPy 2.0
        for k in range(y):
            ia = da == goal
            ib = ia.tolist()
            ic = ib.index(True)
            data_ip[k] = ic
            if goal == 0:
                da[ic] = np.intp(True)
            else:
                da[ic] = np.intp(False)
    except ValueError:
        print('can not find element!')
        data_ip = 'Error!'
    return data_ip
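A hypothetical call; as with find2ds, the value must occur more than once for find_repeats to report it:
print(find1ds([1, 2, 2, 3, 2], 2))   # [1 2 4]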
Example #18
import numpy as np
from scipy import stats


def wsr_test(X, H0):
    '''
    Wilcoxon Signed Rank Test.
    H0: M(X) ≤, ≥, = 0.
    In slides 427.
    REQUIRE: H0 can take three values: "equal", "less", "greater".
    RETURN: (E[w], Var[w]), (-w_minus, w_plus), (Z, p-value).
    '''
    d = np.asarray([k for k in X if k != 0])
    n = len(d)
    if n < 10:
        print("Sample size too small for normal approximation.")
    r = stats.rankdata(np.abs(d))
    print(d)
    print(r)
    w_plus = np.sum((d > 0) * r, axis=0)
    w_minus = np.sum((d < 0) * r, axis=0)
    E_w = n*(n+1)/4
    Var_w = n*(n+1)*(2*n+1)/24

    replist, repnum = stats.find_repeats(r)
    print(repnum)
    if repnum.size != 0:
        # Correction for repeated elements.
        Var_w -= (repnum * (repnum * repnum - 1)).sum()/48
    
    if H0 == "less":
        Z = (abs(w_minus) - E_w) / Var_w**0.5
        p_value = stats.norm.cdf(Z)
    elif H0 == "greater":
        Z = (w_plus - E_w) / Var_w**0.5
        p_value = stats.norm.cdf(Z)
    elif H0 == "equal":
        Z = (min(abs(w_minus), w_plus) - E_w) / Var_w**0.5
        p_value = 2 * stats.norm.cdf(Z)
    return (E_w, Var_w), (-w_minus, w_plus), (Z, p_value)
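A hypothetical call using the height-difference data that also appears in Example #21's docstring (the function's debug prints will fire):
d = np.array([6., 8., 14., 16., 23., 24., 28., 29., 41., -48., 49., 56., 60., -67., 75.])
(E_w, Var_w), (w_m, w_p), (Z, p_value) = wsr_test(d, "equal")
print(Z, p_value)   # p is roughly 0.0409, matching Example #21's two-sided result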
Example #19
#        if addedLine == False:
#            noline(kalman ,kCount, y_k, im, frameNum, SL_file, videoOut)
#            kCount += 1
#            if kCount > kThres:
#                kalman.statePost = np.array( [im.shape[0]*0.50, 0] ).reshape((2,1))
#                kCount = 0
#            print("Not enough non-zero-angle lines found")
#            videoOut.write(im);
#            cv2.imshow("Lane lines on image", im)
#            if cv2.waitKey(fRate) >= 0:
#                break
#            continue

        try:
            angleMode_t = stats.find_repeats(horizonAngles)
            angleMode = float(angleMode_t[0][0])
        except Exception:
            angleMode = float(horizonAngles[0])
        print(angleMode)
        print("hello")
        #-----------------GET LINE ANGLE AVERAGES-----------------------
        angleGoodLines = []
        Sum = 0
        for horizonLine in horizonLines:
            if abs(horizonLine[5] - angleMode) < 0.3:
                angleGoodLines.append(horizonLine)
                Sum += horizonLine[5]
        print(len(angleGoodLines))
        angleMean = Sum / len(angleGoodLines)
        print(angleMean)
Example #20
import find
import h5py
import matplotlib.gridspec as grid
# select model
import handy.scatter as hsc
from astropy.wcs import *
import astropy.wcs as awc
from astropy.coordinates import SkyCoord
goal_data = aft.getdata(
    '/mnt/ddnfs/data_users/cxkttwl/ICL/data/redmapper/redmapper_dr8_public_v6.3_catalog.fits'
)
sub_data = aft.getdata(
    '/mnt/ddnfs/data_users/cxkttwl/ICL/data/redmapper/redmapper_dr8_public_v6.3_members.fits'
)
# find the members of each BCG cluster by finding the repeated IDs
repeat = sts.find_repeats(sub_data.ID)
rept_ID = np.int0(repeat)
ID_array = np.int0(sub_data.ID)
sub_redshift = np.array(
    sub_data.Z_SPEC)  # used to figure out how large the satellites are
center_distance = sub_data.R  # select the distance of satellite galaxies
member_pos = np.array([sub_data.RA,
                       sub_data.DEC])  # record the position of satellite
# read the center galaxy position
RA = np.array(goal_data.RA)
DEC = np.array(goal_data.DEC)
redshift = np.array(goal_data.Z_SPEC)
richness = np.array(goal_data.LAMBDA)
host_ID = np.array(goal_data.ID)
# except the part with no spectra redshift
z_eff = redshift[redshift != -1]
Example #21
from collections import namedtuple
import warnings

import numpy as np
from numpy import asarray, compress
from scipy import stats
from scipy.stats import distributions


def wilcoxon(x,
             y=None,
             zero_method="wilcox",
             correction=False,
             alternative="two-sided"):
    """
    Calculate the Wilcoxon signed-rank test.
    The Wilcoxon signed-rank test tests the null hypothesis that two
    related paired samples come from the same distribution. In particular,
    it tests whether the distribution of the differences x - y is symmetric
    about zero. It is a non-parametric version of the paired T-test.
    Parameters
    ----------
    x : array_like
        The first set of measurements.
    y : array_like, optional
        The second set of measurements.  If `y` is not given, then the `x`
        array is considered to be the differences between the two sets of
        measurements.
    zero_method : {"pratt", "wilcox", "zsplit"}, optional. Default is "wilcox".
        "pratt":
            includes zero-differences in the ranking process,
            but drops the ranks of the zeros, see [4]_, (more conservative)
        "wilcox":
            discards all zero-differences, the default
        "zsplit":
            includes zero-differences in the ranking process and split the
            zero rank between positive and negative ones
    correction : bool, optional
        If True, apply continuity correction by adjusting the Wilcoxon rank
        statistic by 0.5 towards the mean value when computing the
        z-statistic.  Default is False.
    alternative : {"two-sided", "greater", "less"}, optional
        The alternative hypothesis to be tested, see Notes. Default is
        "two-sided".
    Returns
    -------
    statistic : float
        If `alternative` is "two-sided", the sum of the ranks of the
        differences above or below zero, whichever is smaller.
        Otherwise the sum of the ranks of the differences above zero.
    pvalue : float
        The p-value for the test depending on `alternative`.
    See Also
    --------
    kruskal, mannwhitneyu
    Notes
    -----
    The test has been introduced in [4]_. Given n independent samples
    (xi, yi) from a bivariate distribution (i.e. paired samples),
    it computes the differences di = xi - yi. One assumption of the test
    is that the differences are symmetric, see [2]_.
    The two-sided test has the null hypothesis that the median of the
    differences is zero against the alternative that it is different from
    zero. The one-sided test has the null that the median is positive against
    the alternative that it is negative (``alternative == 'less'``),
    or vice versa (``alternative == 'greater'``).
    The test uses a normal approximation to derive the p-value. A typical rule
    is to require that n > 20. For smaller n, exact tables can be used to find
    critical values.
    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
    .. [2] Conover, W.J., Practical Nonparametric Statistics, 1971.
    .. [3] Pratt, J.W., Remarks on Zeros and Ties in the Wilcoxon Signed
       Rank Procedures, Journal of the American Statistical Association,
       Vol. 54, 1959, pp. 655-667. :doi:`10.1080/01621459.1959.10501526`
    .. [4] Wilcoxon, F., Individual Comparisons by Ranking Methods,
       Biometrics Bulletin, Vol. 1, 1945, pp. 80-83. :doi:`10.2307/3001968`
    Examples
    --------
    In [4]_, the differences in height between cross- and self-fertilized
    corn plants is given as follows:
    >>> d = [6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75]
    Cross-fertilized plants appear to be higher. To test the null
    hypothesis that there is no height difference, we can apply the
    two-sided test:
    >>> from scipy.stats import wilcoxon
    >>> w, p = wilcoxon(d)
    >>> w, p
    (24.0, 0.04088813291185591)
    Hence, we would reject the null hypothesis at a confidence level of 5%,
    concluding that there is a difference in height between the groups.
    To confirm that the median of the differences can be assumed to be
    positive, we use:
    >>> w, p = wilcoxon(d, alternative='greater')
    >>> w, p
    (96.0, 0.020444066455927955)
    This shows that the null hypothesis that the median is negative can be
    rejected at a confidence level of 5% in favor of the alternative that
    the median is greater than zero. The p-value based on the approximation
    is within the range of 0.019 and 0.054 given in [2]_.
    Note that the statistic changed to 96 in the one-sided case (the sum
    of ranks of positive differences) whereas it is 24 in the two-sided
    case (the minimum of sum of ranks above and below zero).
    """

    WilcoxonResult = namedtuple('WilcoxonResult',
                                ('w_statistic', 'z_statistic', 'pvalue'))

    if zero_method not in ["wilcox", "pratt", "zsplit"]:
        raise ValueError("Zero method should be either 'wilcox' "
                         "or 'pratt' or 'zsplit'")

    if alternative not in ["two-sided", "less", "greater"]:
        raise ValueError("Alternative must be either 'two-sided', "
                         "'greater' or 'less'")

    if y is None:
        d = asarray(x)
    else:
        x, y = map(asarray, (x, y))
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon. Aborting.')
        d = x - y

    if zero_method == "wilcox":
        # Keep all non-zero differences
        d = compress(np.not_equal(d, 0), d, axis=-1)

    count = len(d)
    if count < 10:
        warnings.warn("Sample size too small for normal approximation.")

    r = stats.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r, axis=0)
    r_minus = np.sum((d < 0) * r, axis=0)

    if zero_method == "zsplit":
        r_zero = np.sum((d == 0) * r, axis=0)
        r_plus += r_zero / 2.
        r_minus += r_zero / 2.

    # return min for two-sided test, but r_plus for one-sided test
    # the literature is not consistent here
    # r_plus is more informative since r_plus + r_minus = count*(count+1)/2,
    # i.e. the sum of the ranks, so r_minus and the min can be inferred
    # (If alternative='pratt', r_plus + r_minus = count*(count+1)/2 - r_zero.)
    # [3] uses the r_plus for the one-sided test, keep min for two-sided test
    # to keep backwards compatibility
    if alternative == "two-sided":
        T = min(r_plus, r_minus)
    elif alternative == "greater":
        T = r_plus
    else:
        T = r_minus
    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    if zero_method == "pratt":
        r = r[d != 0]

    replist, repnum = stats.find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

    se = np.sqrt(se / 24)

    # apply continuity correction if applicable
    d = 0
    if correction:
        if alternative == "two-sided":
            d = 0.5 * np.sign(T - mn)
        elif alternative == "less":
            d = -0.5
        else:
            d = 0.5

    # compute statistic and p-value using normal approximation
    z = (T - mn - d) / se
    if alternative == "two-sided":
        prob = 2. * distributions.norm.sf(abs(z))
    elif alternative == "greater":
        # large T = r_plus indicates x is greater than y; i.e.
        # accept alternative in that case and return small p-value (sf)
        prob = distributions.norm.sf(z)
    else:
        prob = distributions.norm.cdf(z)

    return WilcoxonResult(T, z, prob)


# from scipy import stats
# import numpy as np
# from collections import namedtuple
#
# def wilcoxon(x, y=None, zero_method="wilcox", correction=False, alternative='two-sided'):
#     WilcoxonResult = namedtuple('WilcoxonResult', ('statistic', 'pvalue'))
#
#     if y is None:
#         d = np.asarray(x)
#     else:
#         x, y = map(np.asarray, (x, y))
#         if len(x) != len(y):
#             raise ValueError('Unequal N in wilcoxon.  Aborting.')
#         d = x - y
#
#     if zero_method == "wilcox": # Keep all non-zero differences
#         d = np.compress(np.not_equal(d, 0), d, axis=-1)
#
#     count = len(d)
#     r = stats.rankdata(abs(d))
#     r_plus = np.sum((d > 0) * r, axis=0)
#     r_minus = np.sum((d < 0) * r, axis=0)
#
#     if zero_method == "zsplit":
#         r_zero = np.sum((d == 0) * r, axis=0)
#         r_plus += r_zero / 2.
#         r_minus += r_zero / 2.
#
#     T = min(r_plus, r_minus)
#     mn = count * (count + 1.) * 0.25
#     se = count * (count + 1.) * (2. * count + 1.)
#
#     if zero_method == "pratt":
#         r = r[d != 0]
#
#     replist, repnum = stats.find_repeats(r)
#     if repnum.size != 0: # Correction for repeated elements.
#         se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()
#
#     se = np.sqrt(se / 24)
#     correction = 0.5 * int(bool(correction)) * np.sign(T - mn)
#     z = (T - mn - correction) / se
#     prob = 2. * stats.distributions.norm.sf(abs(z))
#
#     if alternative == "two-sided":
#         return WilcoxonResult(T, prob)
#     elif alternative == "greater":
#         return WilcoxonResult(T, prob/2) if z > 0 else WilcoxonResult(T, 1 - prob/2)
#     elif alternative == "less":
#         return WilcoxonResult(T, prob/2) if z < 0 else WilcoxonResult(T, 1 - prob/2)
#     else:
#         raise ValueError("Alternative should be either 'two-sided' "
#                          "or 'less' or 'greater'")
Example #22
 def test_basic(self):
     a = [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 5]
     res, nums = stats.find_repeats(a)
     assert_array_equal(res, [1, 2, 3, 4])
     assert_array_equal(nums, [3, 3, 2, 2])
Example #23
import warnings

import numpy as np
from scipy import stats


def wilcoxon(x, y=None, zero_method="wilcox", correction=False):
    """
    Calculate the Wilcoxon signed-rank test.

    The Wilcoxon signed-rank test tests the null hypothesis that two
    related paired samples come from the same distribution. In particular,
    it tests whether the distribution of the differences x - y is symmetric
    about zero. It is a non-parametric version of the paired T-test.

    Parameters
    ----------
    x : array_like
        The first set of measurements.
    y : array_like, optional
        The second set of measurements.  If `y` is not given, then the `x`
        array is considered to be the differences between the two sets of
        measurements.
    zero_method : string, {"pratt", "wilcox", "zsplit"}, optional
        "pratt":
            Pratt treatment: includes zero-differences in the ranking process
            (more conservative)
        "wilcox":
            Wilcox treatment: discards all zero-differences
        "zsplit":
            Zero rank split: just like Pratt, but splitting the zero rank
            between positive and negative ones
    correction : bool, optional
        If True, apply continuity correction by adjusting the Wilcoxon rank
        statistic by 0.5 towards the mean value when computing the
        z-statistic.  Default is False.

    Returns
    -------
    T : float
        The sum of the ranks of the differences above or below zero, whichever
        is smaller.
    p-value : float
        The two-sided p-value for the test.
    outcome : integer
        The direction of the effect (if any): +1 means a positive difference (x > y)
        -1 means a negative difference (x < y)
        0 means no difference
        This is computed by comparing W_plus and W_minus.

    Notes
    -----
    Because the normal approximation is used for the calculations, the
    samples used should be large.  A typical rule is to require that
    n > 20.

    References
    ----------
    .. [1] http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test

    """

    if zero_method not in ["wilcox", "pratt", "zsplit"]:
        raise ValueError("Zero method should be either 'wilcox' "
                         "or 'pratt' or 'zsplit'")

    if y is None:
        d = x
    else:
        x, y = map(np.asarray, (x, y))
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon.  Aborting.')
        d = x - y

    if zero_method == "wilcox":
        d = np.compress(np.not_equal(d, 0), d,
                        axis=-1)  # Keep all non-zero differences

    count = len(d)
    if count == 0:
        return None, 1.0, 0.

    if (count < 10):
        warnings.warn(
            "Warning: sample size too small for normal approximation.")
    r = stats.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r, axis=0)
    r_minus = np.sum((d < 0) * r, axis=0)

    if zero_method == "zsplit":
        r_zero = np.sum((d == 0) * r, axis=0)
        r_plus += r_zero / 2.
        r_minus += r_zero / 2.

    T = min(r_plus, r_minus)
    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    if zero_method == "pratt":
        r = r[d != 0]

    replist, repnum = stats.find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

    se = np.sqrt(se / 24)
    correction = 0.5 * int(bool(correction)) * np.sign(T - mn)
    z = (T - mn - correction) / se
    prob = 2. * stats.distributions.norm.sf(abs(z))

    # Added by GR: compute direction of effect (if present)
    outcome = 2 * (r_plus > r_minus) - 1 if r_plus != r_minus else 0

    return T, prob, outcome
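A hypothetical paired call of this variant, which also reports the direction of the effect (data invented; ten non-zero differences keep the warning quiet):
x = np.array([1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30, 1.50])
y = np.array([0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29, 1.10])
T, p, outcome = wilcoxon(x, y)
print(T, p, outcome)   # outcome is +1 here: the differences are mostly positive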
Example #24
 def test_empty_result(self):
     # Check that empty arrays are returned when there are no repeats.
     a = [10, 20, 50, 30, 40]
     repeated, counts = stats.find_repeats(a)
     assert_array_equal(repeated, [])
     assert_array_equal(counts, [])
Example #25
 def test_basic(self):
     a = [1,2,3,4,1,2,3,4,1,2,5]
     res,nums = stats.find_repeats(a)
     assert_array_equal(res,[1,2,3,4])
     assert_array_equal(nums,[3,3,2,2])
Example #27
def wilcoxon(x,
             y=None,
             alpha=0.05,
             alternative='two-sided',
             mode='auto',
             zero_method='wilcox',
             return_tuple=False):
    """Wilcoxon signed-rank test.

    :param x: First sample to compare. If `y` is not provided, will correspond to the difference :math:`x - y`.
    :type x: :obj:`numpy.array`
    :param y: Second sample to compare, defaults to None.
    :type y: :obj:`numpy.array`, optional
    :param alpha: Confidence level, defaults to 0.05.
    :type alpha: :obj:`float`, optional
    :param alternative: Perform a one or two-sided test. Values can be `two-sided`, `greater`, `less`, defaults to 'two-sided'.
    :type alternative: :obj:`str`, optional
    :param mode: Method to calculate the p-value. Computes the exact distribution if the sample size is at most 25, otherwise uses the normal approximation. Values can be `auto`, `approx` or `exact`, defaults to 'auto'.
    :type mode: :obj:`str`, optional
    :param zero_method: Method to handle the zero differences., defaults to 'wilcox'
    :type zero_method: :obj:`str`, optional
    :param return_tuple: Return a tuple with t statistic, critical value and p-value, defaults to False.
    :type return_tuple: :obj:`bool`, optional

    :example:

    >>> from statinf import stats
    >>> import numpy as np
    >>> x = np.random.poisson(2, size=100)
    >>> y = x_dist + np.random.normal(loc=0, scale=1, size=100)
    >>> stats.wilcoxon(x, y)
    ... +------------------------------------------------------------+
    ... |                       Wilcoxon test                        |
    ... +------------+----------------+------------+---------+-------+
    ... |     df     | Critical value | Stat value | p-value |   H0  |
    ... +------------+----------------+------------+---------+-------+
    ... |        100 |   1.9599639845 |  -1.316878 | 0.18788 | True  |
    ... +------------+----------------+------------+---------+-------+
    ...  * We cannot reject H0: x - y ~ symmetric distribution centered in 0
    ...  * The T-value is: 2142.0

    :reference: * Wilcoxon, F., Individual Comparisons by Ranking Methods, Biometrics Bulletin, Vol. 1, 1945, pp. 80-83.
        * Cureton, E.E., The Normal Approximation to the Signed-Rank Sampling Distribution When Zero Differences are Present, Journal of the American Statistical Association, Vol. 62, 1967, pp. 1068-1069.

    :return: Summary for the test or tuple statistic, critical value, p-value.
    :rtype: :obj:`tuple`
    """

    # Code mostly inspired from: https://github.com/scipy/scipy/blob/v1.7.0/scipy/stats/morestats.py#L2984-L3233

    # Define test degrees of freedom
    if alternative == 'two-sided':
        quant_order = 1 - (alpha / 2)
        h0 = 'x - y ~ symmetric distribution centered in 0'
        h1 = 'x - y is not a symmetric distribution centered in 0'
    else:
        quant_order = 1 - alpha
        h0 = 'x - y ~ symmetric distribution centered in 0'
        h1 = 'x - y is not a symmetric distribution centered in 0'

    if y is None:
        # If y is not provided, we consider x already corresponds to x - y
        d = format_object(x, to_type='array', name='x')
    else:
        x = format_object(x, to_type='array', name='x')
        y = format_object(y, to_type='array', name='y')
        d = x - y

    if mode == "auto":
        if len(d) <= 25:
            mode = "exact"
        else:
            mode = "approx"

    n_zero = np.sum(d == 0)
    if n_zero > 0:
        mode = "approx"
        warnings.warn("Found some ties, switching mode to 'approx.'")

    if mode == "approx":
        if zero_method in ["wilcox", "pratt"]:
            if n_zero == len(d):
                raise ValueError("zero_method 'wilcox' and 'pratt' do not "
                                 "work if x - y is zero for all elements.")
        if zero_method == "wilcox":
            # Keep all non-zero differences
            # d = compress(np.not_equal(d, 0), d)
            d = np.array([_d for _d in d if _d != 0])

    count = len(d)
    if count < 10 and mode == "approx":
        ValueError(
            f"Sample size is too small for normal approximation, got n={count}."
        )

    r = scp.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r)
    r_minus = np.sum((d < 0) * r)

    if alternative == "two-sided":
        T = min(r_plus, r_minus)
    else:
        T = r_plus

    # Estimation with the normal approximation (sample size > 25)
    z = np.nan  # z is only set by the approximate branch below
    if mode == "approx":
        mn = count * (count + 1.) * 0.25
        se = count * (count + 1.) * (2. * count + 1.)

        if zero_method == "pratt":
            r = r[d != 0]
            # normal approximation needs to be adjusted, see Cureton (1967)
            mn -= n_zero * (n_zero + 1.) * 0.25
            se -= n_zero * (n_zero + 1.) * (2. * n_zero + 1.)

        _, repnum = scp.find_repeats(r)
        if repnum.size != 0:
            # Correction for repeated elements.
            se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

        se = math.sqrt(se / 24)

        # apply continuity correction if applicable
        d = 0

        # compute statistic and p-value using normal approximation
        z = (T - mn - d) / se
        if alternative == "two-sided":
            p = 2. * scp.norm.sf(abs(z))
        elif alternative == "greater":
            # large T = r_plus indicates x is greater than y; i.e.
            # accept alternative in that case and return small p-value (sf)
            p = scp.norm.sf(z)
        else:
            p = scp.norm.cdf(z)
    # Exact estimation
    elif mode == "exact":
        # Get frequencies cnt of the possible positive ranksums r_plus
        cnt = scp._hypotests._get_wilcoxon_distr(count)
        # Note: r_plus is int (ties not allowed), need int for slices below
        r_plus = int(r_plus)
        if alternative == "two-sided":
            if r_plus == (len(cnt) - 1) // 2:
                # r_plus is the center of the distribution.
                p = 1.0
            else:
                p_less = np.sum(cnt[:r_plus + 1]) / 2**count
                p_greater = np.sum(cnt[r_plus:]) / 2**count
                p = 2 * min(p_greater, p_less)
        elif alternative == "greater":
            p = np.sum(cnt[r_plus:]) / 2**count
        else:
            p = np.sum(cnt[:r_plus + 1]) / 2**count

    cv = scp.norm.ppf(quant_order)

    _summ = test_summary(df=count,
                         critical_value=cv,
                         t_value=z,
                         p_value=p,
                         alpha=alpha,
                         title='Wilcoxon test',
                         h0=h0,
                         h1=h1,
                         extra=f' * The T-value is: {round(T, 5)}')

    if return_tuple:
        return z, cv, p
    else:
        print(_summ)
Example #28
0
def friedman(dv=None,
             within=None,
             subject=None,
             data=None,
             export_filename=None):
    """Friedman test for repeated measurements.

    Parameters
    ----------
    dv : string
        Name of column containing the dependent variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.
    data : pandas DataFrame
        DataFrame
    export_filename : string
        Filename (without extension) for the output file.
        If None, do not export the table.
        By default, the file will be created in the current python console
        directory. To change that, specify the filename with full path.

    Returns
    -------
    stats : DataFrame
        Test summary ::

        'Q' : The Friedman Q statistic, corrected for ties
        'p-unc' : Uncorrected p-value
        'dof' : degrees of freedom

    Notes
    -----
    The Friedman test is used for one-way repeated measures ANOVA by ranks.

    Data are expected to be in long-format.

    Note that if the dataset contains one or more other within subject
    factors, an automatic collapsing to the mean is applied on the dependent
    variable (same behavior as the ezANOVA R package). As such, results can
    differ from those of JASP. If you can, always double-check the results.

    Due to the assumption that the test statistic has a chi squared
    distribution, the p-value is only reliable for n > 10 and more than 6
    repeated measurements.

    NaN values are automatically removed.

    Examples
    --------
    Compute the Friedman test for repeated measurements.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject', data=df)
                      Source  ddof1      Q     p-unc
    Friedman  Disgustingness      1  9.228  0.002384
    """
    from scipy.stats import rankdata, chi2, find_repeats

    # Check data
    _check_dataframe(dv=dv,
                     within=within,
                     data=data,
                     subject=subject,
                     effects='within')

    # Collapse to the mean
    data = data.groupby([subject, within]).mean().reset_index()

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv,
                            within=within,
                            subject=subject,
                            data=data[[subject, within, dv]])

    # Extract number of groups and total sample size
    grp = data.groupby(within)[dv]
    rm = list(data[within].unique())
    k = len(rm)
    X = np.array([grp.get_group(r).values for r in rm]).T
    n = X.shape[0]

    # Rank per subject
    ranked = np.zeros(X.shape)
    for i in range(n):
        ranked[i] = rankdata(X[i, :])

    ssbn = (ranked.sum(axis=0)**2).sum()

    # Compute the test statistic
    Q = (12 / (n * k * (k + 1))) * ssbn - 3 * n * (k + 1)

    # Correct for ties
    ties = 0
    for i in range(n):
        replist, repnum = find_repeats(X[i])
        for t in repnum:
            ties += t * (t * t - 1)

    c = 1 - ties / float(k * (k * k - 1) * n)
    Q /= c

    # Approximate the p-value
    ddof1 = k - 1
    p_unc = chi2.sf(Q, ddof1)

    # Create output dataframe
    stats = pd.DataFrame(
        {
            'Source': within,
            'ddof1': ddof1,
            'Q': np.round(Q, 3),
            'p-unc': p_unc,
        },
        index=['Friedman'])

    col_order = ['Source', 'ddof1', 'Q', 'p-unc']

    stats = stats.reindex(columns=col_order)
    stats.dropna(how='all', axis=1, inplace=True)

    # Export to .csv
    if export_filename is not None:
        _export_table(stats, export_filename)
    return stats
Example #29
0
             'o',
             markerfacecolor=tuple(col),
             markeredgecolor='k',
             markersize=5)
    xy = raw_data[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0],
             xy[:, 1],
             'o',
             markerfacecolor=tuple(col),
             markeredgecolor='k',
             markersize=1)
plt.title('Estimated number of clusters ({}) and noise points ({})'.format(
    n_clusters_, n_noise_))
plt.axis("off")
plt.show()
stats.find_repeats(labels)

# Second sample

raw_data = pd.DataFrame.to_numpy(outline_data[["x", "y", "z"]],
                                 dtype="float64")

db = DBSCAN(eps=0.04, min_samples=30).fit(raw_data)

core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
unique_labels = set(labels)
colors = [
Example #30
from collections import namedtuple

import numpy as np
from scipy import stats
from scipy.stats import distributions, find_repeats, ttest_1samp

# Result container assumed from usage below (two fields: statistic, pvalue).
WilcoxonResult = namedtuple('WilcoxonResult', ('statistic', 'pvalue'))


def wilcoxon_greater(x, y, zero_method="wilcox", correction=False):
    """
    Test if x is larger than y, single-sided.
    """

    if np.allclose(x, y, equal_nan=True):
        return WilcoxonResult(np.nan, np.nan)
    """
	shamelessly stolen from scipy
	"""
    if len(x) < 10 and not (np.allclose(x, x[0]) and np.allclose(y, y[0])):
        #sample size too small, using the ttest
        t_statistic, t_pvalue = ttest_1samp(x - y, popmean=0)
        if np.mean(x - y) > 0:
            t_pvalue /= 2.0
        else:
            t_pvalue = 1 - t_pvalue / 2.0
        return WilcoxonResult(t_statistic, t_pvalue)

    if zero_method not in ["wilcox", "pratt", "zsplit"]:
        raise ValueError("Zero method should be either 'wilcox' "
                         "or 'pratt' or 'zsplit'")
    if y is None:
        d = np.asarray(x)
    else:
        x, y = map(np.asarray, (x, y))
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon.  Aborting.')
        d = x - y
        d[(d == 0) & (x + y != 0)] = -1  #penalty for equal value

    if zero_method == "wilcox":
        # Keep all non-zero differences
        d = np.compress(np.not_equal(d, 0), d, axis=-1)

    count = len(d)
    # if count < 10:
    # 	warnings.warn("Warning: sample size too small for normal approximation.")

    r = stats.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r, axis=0)
    r_minus = np.sum((d < 0) * r, axis=0)

    if zero_method == "zsplit":
        r_zero = np.sum((d == 0) * r, axis=0)
        r_plus += r_zero / 2.
        r_minus += r_zero / 2.

    T = min(r_plus, r_minus)
    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    if zero_method == "pratt":
        r = r[d != 0]

    replist, repnum = find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

    se = np.sqrt(se / 24)
    correction = 0.5 * int(bool(correction)) * np.sign(T - mn)
    z = (T - mn - correction) / se
    if r_plus > r_minus:
        prob = distributions.norm.sf(abs(z))
    else:
        prob = 1 - distributions.norm.sf(abs(z))

    return WilcoxonResult(T, prob)
Example #31
import numpy as np
from scipy.stats import describe, find_repeats, entropy, iqr, pearsonr

data = np.random.random(100) * 100
data = np.round(data)
print(describe(data))
print(find_repeats(data))
print('entropy', entropy(data))
print('iqr', iqr(data))

data = np.round(np.random.random((2, 2)) * 100).astype(int)  # np.int was removed from NumPy

x, y = data[0], data[1]
print('x', x)
print('y', y)

print('pearsonr', pearsonr(x, y))
Example #32
from scipy import stats


def find_repeats(array):
    return stats.find_repeats(array)
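For reference, scipy's find_repeats returns a RepeatedResults namedtuple, so the wrapper's result supports both tuple unpacking and attribute access:
res = stats.find_repeats([1, 2, 2, 3, 3, 3])
print(res.values)   # [2. 3.]
print(res.counts)   # [2 3]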
Example #33
import warnings

import numpy as np
from scipy import stats


def my_wilcoxon_test(x, y=None, alternative='less', correction=False):
    """
    Calculate the paired Wilcoxon signed-rank test.

    ** Modified scipy implementation to mimic R implementation with support for one-sided tests **

    https://github.com/scipy/scipy/blob/v1.0.0/scipy/stats/morestats.py#L2316-L2413
    https://github.com/SurajGupta/r-source/blob/master/src/library/stats/R/wilcox.test.R
    
    Parameters
    ----------
    x : array_like
        The first set of measurements.
    y : array_like, optional
        The second set of measurements.  If `y` is not given, then the `x`
        array is considered to be the differences between the two sets of
        measurements.
    alternative : string, {"two.sided", "less", "greater"}, optional
    correction : bool, optional
        If True, apply continuity correction by adjusting the Wilcoxon rank
        statistic by 0.5 towards the mean value when computing the
        z-statistic.  Default is False.
    Returns
    -------
    float: The single-sided p-value for the test.
    
    Notes
    -----
    Because the normal approximation is used for the calculations, the
    samples used should be large.  A typical rule is to require that
    n > 20.
    References
    ----------
    .. [1] http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
    """

    if alternative not in ['two.sided', 'less', 'greater']:
        raise ValueError("Alternative hypothesis should be either 'two.sided', 'less' or 'greater'")

    if y is None:
        d = np.asarray(x)
    else:
        x, y = map(np.asarray, (x, y))
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon.  Aborting.')
        d = x - y

    # Keep all non-zero differences (zero_method == "wilcox")
    d = np.compress(np.not_equal(d, 0), d, axis=-1)

    count = len(d)
    if count < 20:
        warnings.warn("Warning: sample size too small for normal approximation.")

    r = stats.rankdata(abs(d))
    T = np.sum((d > 0) * r, axis=0)

    mn = count * (count + 1.) / 4.
    se = count * (count + 1.) * (2. * count + 1.) / 24.

    replist, repnum = stats.find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= (repnum ** 3 - repnum).sum() / 48.

    se = np.sqrt(se)

    correct = 0.
    if correction:
        if alternative == "two.sided":
            correct = 0.5 * np.sign(T - mn)
        elif alternative == "greater":
            correct = 0.5
        elif alternative == "less":
            correct = -0.5

    z = (T - mn - correct) / se

    prob = None
    if alternative == "two.sided":
        prob = 2. * min(stats.distributions.norm.cdf(z), stats.distributions.norm.sf(z))
    elif alternative == "greater":
        prob = stats.distributions.norm.sf(z)
    elif alternative == "less":
        prob = stats.distributions.norm.cdf(z)

    return prob
Example #34
def _mean_repeats(dist):
    """Mean of the repeat counts in dist."""
    return np.mean(st.find_repeats(dist)[1])
Example #35
 def F_test(self, alpha = 0.05):
     data = self.wdf
     
     stat = pd.DataFrame(index=['Friedman approximate',
                                'Friedman exact',
                                'Iman-Davenport'])
     
     X = data.values
     conds = list(data.columns)
     n,k = data.shape
     
     # Compute the Friedman chi-squared test
     
     rank_mat = np.zeros(X.shape)
             
     for i in range(n):
         rank_mat[i] = stats.rankdata(X[i, :])
         
     self.rank_mat = rank_mat
     
     # Approximate method
     ssb = (rank_mat.sum(axis=0)**2).sum()
     F_approx = (12 / (n * k * (k + 1))) * ssb - 3 * n * (k + 1)
     
     # Exact method
     rj = rank_mat.mean(axis = 0)
     rm = rank_mat.mean()
     SST = n*((rj - rm)**2).sum()
     SSE = ((rank_mat-rm)**2).sum()/(n*(k-1))
     
     F_exact = SST/SSE
     
     # Correction for tied ranks
     ties = 0
     for i in range(n):
         replist, repnum = stats.find_repeats(X[i])
         for t in repnum:
             ties += t * (t * t - 1)
     c = 1 - ties / float(k * (k * k - 1) * n)
     
     F_approx /= c
     F_exact /= c
     
     dof = (k - 1)
     
     p1 = stats.chi2.sf(F_approx, dof)
     p2 = stats.chi2.sf(F_exact, dof)
     
     # Iman-Davenport F test
     
     Fc = ((n - 1)*F_approx)/(n*(k-1) - F_approx)
     
     dof_1 = k-1
     dof_2 = (k-1)*(n-1)
     
     p3 = f_test.sf(Fc, dof_1, dof_2)
     
     stat['F'] = [F_approx, F_exact, Fc]
     stat['Degrees of freedom'] = [str(dof), str(dof), str(f'({dof_1},{dof_2})')]
     stat['p-value'] = [p1, p2, p3]

     stat['Reject H0'] = ['Yes' if i else 'No' for i in stat['p-value'] < alpha]
     
     return stat