def remove_duplicates(hdf5file): """Removes the duplicates from each table of hdf5file.""" # Build a list of all tables....... tableslist = [n for n in hdf5file.walkNodes() if isinstance(n, tables.table.Table) and 'id' in n.colnames] for tbl in tableslist: idcol = tbl.col('id') nullrow = tuple([-9999] * len(tbl[-1])) # Find the duplicates.......... for dup in find_repeats(idcol)[0]: duprow = (idcol == dup).nonzero()[0] baserow = tbl[duprow[0]] # Set the duplicates to the null row.... for r in [d for d in duprow[1:] if (tbl[d] == baserow) or \ (tbl[d].tostring() == baserow.tostring())]: tbl.modifyRows(r, rows=[nullrow, ]) # Save the results ......................... tbl.flush() # Get the list of the flagged rows ......... nullist = numpy.array([r.nrow for r in tbl if r['id'] == -9999]) # Remove the flagged rows iteratively ...... while len(nullist) > 0: first = nullist[0] last = nullist[(numpy.diff(nullist) != 1).nonzero()] if not last.size: last = nullist[-1] else: last = last[0] if first == last: last += 1 tbl.removeRows(first, last) tbl.flush() nullist = numpy.array([r.nrow for r in tbl if r['id'] == -9999]) return hdf5file
def _xr2_test(self):
    """Return the tie-corrected chi-square rank statistic.

    Reads ``self.n`` (number of rows), ``self.k`` (number of columns) and
    ``self.design_matrix`` (indexable by row).  Each row is ranked, the
    column rank sums are squared and combined as
    ``12 / (n k (k + 1)) * sum(R_j ** 2) - 3 n (k + 1)``, and the result is
    divided by the tie correction
    ``1 - sum(t ** 3 - t) / (n (k ** 3 - k))`` where ``t`` runs over the
    sizes of all tie groups found in the rows.

    Returns
    -------
    float
        The corrected statistic.
    """
    ranks = np.vstack(
        [rankdata(self.design_matrix[i]) for i in range(self.n)]
    )

    # Keep the tie-group sizes in ONE flat list.  Appending one list per row
    # produced a ragged nested list whenever rows had different numbers of
    # tie groups, which cannot be cubed as an array.
    ties = []
    for i in range(self.n):
        ties.extend(find_repeats(self.design_matrix[i])[1])
    ties = np.asarray(ties, dtype=float)

    correction = 1 - np.sum(ties ** 3 - ties) / (self.n * (self.k ** 3 - self.k))

    xr2 = (12. / (self.n * self.k * (self.k + 1.))) * np.sum(np.sum(ranks, axis=0) ** 2.) \
        - (3. * self.n * (self.k + 1.))
    xr2 /= correction
    return xr2
def wilcoxon(x, y=None):
    """Wilcoxon signed-rank test using the normal approximation.

    Parameters
    ----------
    x : array_like
        First sample, or the paired differences when `y` is None.
    y : array_like, optional
        Second sample; must have the same length as `x`.

    Returns
    -------
    T : float
        Smaller of the positive and negative rank sums.
    Z : float
        Normal-approximation z statistic (tie-corrected variance).
    R : float
        ``abs(Z) / sqrt(len(x))``.
    p : float
        Two-sided p-value.

    Raises
    ------
    ValueError
        If `x` and `y` have different lengths.
    """
    if y is not None:
        x, y = np.asarray(x), np.asarray(y)
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon. Aborting.')
        diffs = x - y
    else:
        diffs = np.asarray(x)

    # Zero differences carry no sign information and are dropped.
    diffs = np.compress(np.not_equal(diffs, 0), diffs, axis=-1)
    n_nonzero = len(diffs)
    if n_nonzero < 10:
        print("Warning: sample size too small for normal approximation.")

    ranks = stats.rankdata(abs(diffs))
    rank_sum_pos = np.sum((diffs > 0) * ranks, axis=0)
    rank_sum_neg = np.sum((diffs < 0) * ranks, axis=0)
    T = min(rank_sum_pos, rank_sum_neg)

    expected = n_nonzero * (n_nonzero + 1.) * 0.25
    # 24 times the variance; the division happens after the tie correction.
    variance_x24 = n_nonzero * (n_nonzero + 1.) * (2. * n_nonzero + 1.)
    tie_sizes = stats.find_repeats(ranks)[1]
    if tie_sizes.size != 0:
        # Correction for repeated elements.
        variance_x24 -= 0.5 * (tie_sizes * (tie_sizes * tie_sizes - 1)).sum()
    std_err = math.sqrt(variance_x24 / 24)

    Z = (T - expected) / std_err
    R = abs(Z) / math.sqrt(len(x))
    p = 2. * stats.norm.sf(abs(Z))
    return T, Z, R, p
def mannwhitney_permute(x1, x2, n_boot=10000):
    """Two-sided Mann-Whitney U test by permutation.

    The observed statistic is the smaller of U1 and ``n1 * n2 - U1``.  The
    pooled sample is then shuffled `n_boot` times with ``np.random.shuffle``
    and the statistic recomputed, treating the first ``len(x1)`` entries as
    group one.

    Returns
    -------
    u : float
        Observed U statistic.
    pval : float
        Fraction of permutations whose statistic is <= `u`.

    Raises
    ------
    ValueError
        If the pooled sample contains tied values.
    """
    n1, n2 = len(x1), len(x2)
    rank_offset = 0.5 * n1 * (n1 + 1)
    pooled = np.array(list(x1) + list(x2))
    if len(ss.find_repeats(pooled).values) > 0:
        raise ValueError("cannot compute exact p-value with ties")

    def _u_statistic(sample):
        # Rank sum of the first n1 entries, converted to min(U1, U2).
        rank_sum = np.sum(ss.rankdata(sample)[:n1])
        u_first = rank_sum - rank_offset
        return np.min((u_first, n1 * n2 - u_first))

    # calc U statistic
    u = _u_statistic(pooled)

    # permute
    u_dist = np.empty(n_boot)
    for i in range(n_boot):
        np.random.shuffle(pooled)
        u_dist[i] = _u_statistic(pooled)

    pval = np.sum(u_dist <= u) / float(n_boot)
    return u, pval
def test_find_repeats(self): x = np.asarray([1,1,2,2,3,3,3,4,4,4,4]).astype('float') tmp = np.asarray([1,1,2,2,3,3,3,4,4,4,4,5,5,5,5]).astype('float') xm = np.ma.array(tmp,mask=tmp == 5.) r = stats.find_repeats(x) rm = stats.mstats.find_repeats(xm) assert_equal(r,rm)
def faster_mode1D(a):
    """Return the most frequent value of `a` and its count.

    Parameters
    ----------
    a : array_like
        Input data; it is never modified.

    Returns
    -------
    value, count
        The repeated value with the highest count (the first one reported by
        ``stats.find_repeats`` on a tie).  When no value repeats, the first
        element of the sorted data is returned with a count of ``1.``.
    """
    arr = np.asarray(a)  # would be _chk_array
    v, c = stats.find_repeats(arr)
    if len(c) == 0:
        # mimic first value behavior.  Sort a COPY: ``arr`` aliases the
        # caller's data when an ndarray is passed, and an in-place sort
        # would silently reorder it.
        return np.sort(arr)[0], 1.
    pos = c.argmax()
    return v[pos], c[pos]
def test_find_repeats(self): x = np.asarray([1,1,2,2,3,3,3,4,4,4,4]).astype('float') tmp = np.asarray([1,1,2,2,3,3,3,4,4,4,4,5,5,5,5]).astype('float') mask = (tmp == 5.) xm = np.ma.array(tmp, mask=mask) r = stats.find_repeats(x) rm = stats.mstats.find_repeats(xm) assert_equal(r,rm)
def pedersen_distribution():
    """Print tests for the distribution of public values of Pedersen
    commitments when the message is zero or one.

    Draws 1,000,000 commitments to 0 and to 1 from ``pedersen.Pedersen(256)``,
    printing a progress percentage along the way, then prints a two-sample KS
    test, KS tests against the uniform distribution, descriptive statistics,
    repeated values and entropy for both samples.  Finally waits on
    ``input()`` so the output stays visible.
    """
    x = 1000000
    progress_step = x / 100
    gen = pedersen.Pedersen(256)
    zero_values, one_values = [], []
    for i in range(x):
        zero_values.append(gen.commit(0).c / 1.0)
        one_values.append(gen.commit(1).c / 1.0)
        if i % progress_step == 0:
            print(100 * i / x)

    print(stats.ks_2samp(zero_values, one_values))
    print("ks 0:", stats.kstest(zero_values, "uniform"))
    print("ks 1:", stats.kstest(one_values, "uniform"))
    print("0", stats.describe(zero_values))
    print("1", stats.describe(one_values))
    print("0 repeats:", stats.find_repeats(zero_values))
    print("1 repeats:", stats.find_repeats(one_values))
    print("0 entropy:", stats.entropy(zero_values))
    print("1 entropy:", stats.entropy(one_values))
    input()
def bitproof_test(z, title, same_graph = True, x = 100000):
    """Print statistical tests and plot histograms for one value of a
    bitproof commitment.

    z is a callable that extracts the value of interest from a bitproof,
      e.g. ``lambda p: p.<attribute>``
    title is printed before the statistics and used in the plot title
    same_graph is whether both histograms are drawn on the same figure
    x is the number of trials
    """
    gen = pedersen.Pedersen(64)
    commit_zero = gen.commit(0)
    commit_one = gen.commit(1)
    progress_step = x / 100
    zero_values, one_values = [], []
    for i in range(x):
        zero_values.append(z(bitproof.bitproof(0, commit_zero, gen.state)) / 1.0)
        one_values.append(z(bitproof.bitproof(1, commit_one, gen.state)) / 1.0)
        if i % progress_step == 0:
            print(100 * i / x)

    print(title)
    print(stats.ks_2samp(zero_values, one_values))
    print("ks 0:", stats.kstest(zero_values, "uniform"))
    print("ks 1:", stats.kstest(one_values, "uniform"))
    print("0", stats.describe(zero_values))
    print("1", stats.describe(one_values))
    print("0 repeats:", stats.find_repeats(zero_values))
    print("1 repeats:", stats.find_repeats(one_values))
    print("0 entropy:", stats.entropy(zero_values))
    print("1 entropy:", stats.entropy(one_values))

    plt.title(f"Histogram of {title} Values")
    plt.hist(zero_values, bins = "auto", range = (0, gen.state.p / 1.0))
    if not same_graph:
        # Finish and show the first figure before drawing the second one.
        plt.ylabel("Occurences")
        plt.xlabel("Variable value")
        plt.show()
    plt.hist(one_values, bins = "auto", range = (0, gen.state.p / 1.0))
    plt.ylabel("Occurences")
    plt.xlabel("Variable value")
    plt.show()
def my_wilcoxon(condition2, condition1, x, y, global_mean, global_std, motif_name, correction = False):
    """Wilcoxon signed-rank test on globally normalised paired differences.

    The non-zero differences ``x - y`` are shifted by `global_mean` and
    scaled by `global_std` before ranking; the p-value comes from the
    normal approximation with tie-corrected variance.

    Returns
    -------
    prob : float
        Two-sided p-value.
    direction
        ``get_name_from_path(condition2)``, or
        ``get_name_from_path(condition1)`` when the mean normalised
        difference is negative.
    d : ndarray
        Non-zero raw differences.
    d_normalized : ndarray
        Normalised differences.
    motif_std : float
        Sample standard deviation (ddof=1) of `d_normalized`.

    Raises
    ------
    ValueError
        If `x` and `y` differ in length.

    `motif_name` is accepted but not used in the body.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if len(x) != len(y):
        raise ValueError("The length of both arrays in Wilcoxon test should be the same. Aborting")

    # keep all non-zero differences (flattened, no axis argument)
    raw_diff = x - y
    d = np.compress(np.not_equal(raw_diff, 0), raw_diff)

    # correct the differences according to the global mean and std
    d_normalized = (d - global_mean) / global_std
    n_used = len(d_normalized)
    if n_used < 10:
        logger.info("The sampe size is too small for normal approximation")

    # assign ranks to data, dealing with ties appropriately
    ranks = stats.rankdata(abs(d_normalized))
    rank_sum_pos = np.sum((d_normalized > 0) * ranks, axis = 0)
    rank_sum_neg = np.sum((d_normalized < 0) * ranks, axis = 0)
    T = min(rank_sum_pos, rank_sum_neg)

    mn = n_used * (n_used + 1.) * 0.25
    se = n_used * (n_used + 1.) * (2. * n_used + 1.)
    tie_sizes = stats.find_repeats(ranks)[1]
    if tie_sizes.size != 0:
        # correction for repeated elements
        se -= 0.5 * (tie_sizes * (tie_sizes * tie_sizes - 1)).sum()
    se = np.sqrt(se / 24)

    # optional continuity correction of 0.5 towards the mean
    shift = 0.5 * int(bool(correction)) * np.sign(T - mn)
    z = (T - mn - shift) / se
    prob = 2. * stats.norm.sf(abs(z), scale = 1)  # do not scale

    motif_std = np.std(d_normalized, ddof = 1)
    motif_mu = np.mean(d_normalized)

    direction = get_name_from_path(condition2)
    if motif_mu < 0:
        direction = get_name_from_path(condition1)

    return prob, direction, d, d_normalized, motif_std
def _check_friedman(n_strategies, n_datasets, ranked_data, alpha):
    """
    Check whether Friedman test is significant.

    Larger parts of code copied from scipy.

    Arguments
    ---------
    n_strategies : int
        number of strategies to evaluate
    n_datasets : int
        number of datasets classified per strategy
    ranked_data : np.array
        rank of strategy on dataset; the code reads one row per dataset
        (``ranked_data[i]`` for ``i < n_datasets``) and sums the ranks over
        axis 0, i.e. one column per strategy
    alpha : float
        significance level the p-value is compared against

    Returns
    -------
    is_significant : bool
        Indicates whether strategies differ significantly in terms of
        performance (according to Friedman test).

    Raises
    ------
    ValueError
        If fewer than 3 strategies are given.
    """
    if n_strategies < 3:
        raise ValueError(
            "At least 3 sets of measurements must be given for Friedmann test, got{}.".format(
                n_strategies
            )
        )

    k = n_strategies

    # calculate c to correct chisq for ties:
    tie_term = 0
    for row in range(n_datasets):
        group_sizes = find_repeats(ranked_data[row])[1]
        tie_term += sum(t * (t * t - 1) for t in group_sizes)
    c = 1 - tie_term / (k * (k * k - 1) * n_datasets)

    rank_sum_sq = np.sum(ranked_data.sum(axis=0) ** 2)
    chisq = (
        12.0 / (k * n_datasets * (k + 1)) * rank_sum_sq
        - 3 * n_datasets * (k + 1)
    ) / c
    p = distributions.chi2.sf(chisq, k - 1)
    return bool(p < alpha)
def find2ds(a, b):
    """
    Find every position of a value in a 2D array.

    parameter:
    ----------
    a : the 2D array to search
    b : the value to look for

    return:
    ---------
    data_ip : integer array of shape (count, 2) holding the (row, column)
        index of every element equal to ``b``, in row-major order.  When
        ``b`` does not occur, 'can not find element!' is printed and the
        string 'Error!' is returned.

    PS : matching uses ``==``, so a value occurring only once and boolean
    values are found as well; NaN never matches.
    """
    da = np.array(a)
    goal = b
    # argwhere lists the matches directly.  The previous implementation went
    # through find_repeats, which only reports values occurring at least
    # twice, and relied on np.int0, which NumPy 2.0 removed.
    data_ip = np.argwhere(da == goal).astype(np.intp, copy=False)
    if data_ip.shape[0] == 0:
        print('can not find element!')
        return 'Error!'
    return data_ip
def test_find_repeats(self): x = np.asarray([1,1,2,2,3,3,3,4,4,4,4]).astype('float') tmp = np.asarray([1,1,2,2,3,3,3,4,4,4,4,5,5,5,5]).astype('float') mask = (tmp == 5.) xm = np.ma.array(tmp, mask=mask) x_orig, xm_orig = x.copy(), xm.copy() r = stats.find_repeats(x) rm = stats.mstats.find_repeats(xm) assert_equal(r, rm) assert_equal(x, x_orig) assert_equal(xm, xm_orig) # This crazy behavior is expected by count_tied_groups, but is not # in the docstring... _, counts = stats.mstats.find_repeats([]) assert_equal(counts, np.array(0, dtype=np.intp))
def wilcoxon_test(x, y, alternative):
    """
    One-sided Wilcoxon signed-rank test derived from Scipy's two-sided test.

    e.g. for alternative == constants.LESS, rejecting the null means that
    median difference x - y < 0.  Any value other than constants.LESS or
    constants.GREATER selects the two-sided test.  A continuity correction
    of 0.5 is always applied.

    Returns p-value (1.0 when no non-zero differences remain).
    """
    # TODO: add unit tests to verify results identical to R's Wilcoxon test for a host of input values
    # pylint: disable = invalid-name, too-many-locals
    x = asarray(x)
    y = asarray(y)
    diff = x - y
    diff = compress(np.not_equal(diff, 0), diff, axis=-1)
    n_used = len(diff)

    ranks = rankdata(abs(diff))
    T = np.sum((diff > 0) * ranks, axis=0)
    mn = n_used * (n_used + 1.) * 0.25
    se = n_used * (n_used + 1.) * (2. * n_used + 1.)
    if se < 1e-20:
        return 1.  # Degenerate case

    tie_sizes = find_repeats(ranks)[1]
    if tie_sizes.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (tie_sizes * (tie_sizes * tie_sizes - 1)).sum()
    se = sqrt(se / 24)

    if alternative == constants.LESS:
        z = (T - mn - (-0.5)) / se
        return norm.cdf(z)
    if alternative == constants.GREATER:
        z = (T - mn - 0.5) / se
        return norm.sf(z)
    # two-sided
    z = (T - mn - 0.5 * np.sign(T - mn)) / se
    return 2 * min(norm.cdf(z), norm.sf(z))
def _runs_test(self):
    """Runs test on the two-category sequence ``self.x``.

    Reads ``self.x`` (the sequence), ``self.r`` (observed number of runs)
    and ``self.continuity`` (scales the 0.5 continuity correction).

    Returns
    -------
    dict
        Always contains ``'probability'``, the probability of observing at
        most ``self.r`` runs, and ``'p-value'``.  When both group sizes are
        below 20 the critical values from ``r_critical_value`` and ``'r'``
        are included; otherwise the mean, standard deviation and z-value of
        the normal approximation are included and ``'p-value'`` is the
        two-sided normal p-value.

    Raises
    ------
    ValueError
        If ``self.x`` does not contain exactly two distinct categories
        (raised by the tuple unpacking).
    """
    # Count every category, including one that occurs only once;
    # find_repeats dropped such a category and broke the unpacking below.
    codes = pd.factorize(self.x)[0]
    n1, n2 = np.unique(codes, return_counts=True)[1]

    arrangements = comb(n1 + n2, n1)
    r_range = np.arange(2, self.r + 1)
    evens = r_range[r_range % 2 == 0]
    odds = r_range[r_range % 2 != 0]

    # r = 2k runs: 2 * C(n1 - 1, k - 1) * C(n2 - 1, k - 1)
    k_even = evens // 2
    p_even = 1 / arrangements * np.sum(
        2 * comb(n1 - 1, k_even - 1) * comb(n2 - 1, k_even - 1))

    # r = 2k + 1 runs:
    #   C(n1 - 1, k) * C(n2 - 1, k - 1) + C(n1 - 1, k - 1) * C(n2 - 1, k)
    # The binomial arguments are k and k - 1, not r - 1 and r - 2.
    k_odd = (odds - 1) // 2
    p_odd = 1 / arrangements * np.sum(
        comb(n1 - 1, k_odd) * comb(n2 - 1, k_odd - 1) +
        comb(n1 - 1, k_odd - 1) * comb(n2 - 1, k_odd))

    p = p_even + p_odd

    if all(np.array([n1, n2]) < 20):
        r_crit_1, r_crit_2 = r_critical_value(n1, n2)
        test_summary = {
            'probability': p,
            'p-value': p,
            'r critical value 1': r_crit_1,
            'r critical value 2': r_crit_2,
            'r': self.r
        }
        return test_summary

    mean = (2 * n1 * n2) / (n1 + n2) + 1
    sd = np.sqrt((2 * n1 * n2 * (2 * n1 * n2 - n1 - n2)) /
                 ((n1 + n2)**2 * (n1 + n2 - 1)))
    z = (np.absolute(self.r - mean) - (0.5 * self.continuity)) / sd
    p_val = norm.sf(z) * 2
    test_summary = {
        'probability': p,
        'mean of runs': mean,
        'standard deviation of runs': sd,
        'z-value': z,
        'p-value': p_val
    }
    return test_summary
def find1ds(a, b):
    """
    Find the positions of all elements equal to a given value.

    parameter:
    ----------
    a : the array to search
    b : the value to look for (it may occur any number of times)

    return:
    -------
    data_ip : integer array with the index of every element equal to ``b``,
        in ascending order.  When ``b`` does not occur,
        'can not find element!' is printed and the string 'Error!' is
        returned.

    PS: inf is matched like any other value, and a NaN goal matches the NaN
    entries of a floating-point array.
    """
    da = np.array(a)
    goal = b
    mask = da == goal
    # NaN never compares equal to itself, so it needs an explicit test.
    if (isinstance(goal, (float, np.floating)) and np.isnan(goal)
            and np.issubdtype(da.dtype, np.floating)):
        mask = np.isnan(da)
    # flatnonzero lists the matches directly.  The previous implementation
    # went through find_repeats, which only reports values occurring at
    # least twice, and relied on np.int0, which NumPy 2.0 removed.
    data_ip = np.flatnonzero(mask).astype(np.intp, copy=False)
    if data_ip.size == 0:
        print('can not find element!')
        return 'Error!'
    return data_ip
def wsr_test(X, H0):
    '''
    Wilcoxon Signed Rank Test. H0: M(X) ≤, ≥, = 0. In slides 427.

    REQUIRE:
        X: iterable of differences; zeros are discarded.
        H0: one of "equal", "less", "greater".

    RETURN:
        (E[w], Var[w]), (-w_minus, w_plus), (Z, p-value)

        Var[w] is corrected for tied ranks and the p-value comes from the
        normal approximation.

    RAISE:
        ValueError if H0 is not one of the three accepted values.

    The differences, their ranks and the tie counts are printed as
    diagnostic output.
    '''
    # Validate up front: an unknown H0 used to fall through every branch and
    # fail with UnboundLocalError on the return line.
    if H0 not in ("equal", "less", "greater"):
        raise ValueError(
            'H0 must be "equal", "less" or "greater", got %r' % (H0,))

    d = np.asarray([k for k in X if k != 0])
    n = len(d)
    if n < 10:
        print("Sample size too small for normal approximation.")

    r = stats.rankdata(np.abs(d))
    print(d)
    print(r)
    w_plus = np.sum((d > 0) * r, axis=0)
    w_minus = np.sum((d < 0) * r, axis=0)

    E_w = n*(n+1)/4
    Var_w = n*(n+1)*(2*n+1)/24
    replist, repnum = stats.find_repeats(r)
    print(repnum)
    if repnum.size != 0:
        # Correction for repeated elements.
        Var_w -= (repnum * (repnum * repnum - 1)).sum()/48

    if H0 == "less":
        Z = (abs(w_minus) - E_w) / Var_w**0.5
        p_value = stats.norm.cdf(Z)
    elif H0 == "greater":
        Z = (w_plus - E_w) / Var_w**0.5
        p_value = stats.norm.cdf(Z)
    else:  # "equal"
        Z = (min(abs(w_minus), w_plus) - E_w) / Var_w**0.5
        p_value = 2 * stats.norm.cdf(Z)

    return (E_w, Var_w), (-w_minus, w_plus), (Z, p_value)
# if addedLine == False:
#     noline(kalman ,kCount, y_k, im, frameNum, SL_file, videoOut)
#     kCount += 1
#     if kCount > kThres:
#         kalman.statePost = np.array( [im.shape[0]*0.50, 0] ).reshape((2,1))
#         kCount = 0
#         print("Not enough non-zero-angle lines found")
#     videoOut.write(im);
#     cv2.imshow("Lane lines on image", im)
#     if cv2.waitKey(fRate) >= 0:
#         break
#     continue

# Reference angle: the first repeated value among the horizon angles.  When
# nothing repeats, indexing the empty result raises and the first angle is
# used instead.
# NOTE(review): find_repeats returns repeated values in sorted order, so
# [0][0] is the smallest repeated angle, not necessarily the most frequent
# one -- confirm this is the intended "mode".
try:
    angleMode_t = stats.find_repeats(horizonAngles)
    angleMode = float(angleMode_t[0][0])
except Exception:
    # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
    # SystemExit; the fallback itself is unchanged.
    angleMode = float(horizonAngles[0])
print(angleMode)
print("hello")

#-----------------GET LINE ANGLE AVERAGES-----------------------
# Keep the lines whose angle (element 5) lies within 0.3 of the reference
# angle and average those angles.
angleGoodLines = []
Sum = 0
for horizonLine in horizonLines:
    if abs(horizonLine[5] - angleMode) < 0.3:
        angleGoodLines.append(horizonLine)
        Sum += horizonLine[5]
print(len(angleGoodLines))
# NOTE(review): raises ZeroDivisionError when no line passes the filter.
angleMean = Sum / len(angleGoodLines)
print(angleMean)
import find
import h5py
import matplotlib.gridspec as grid
# select model
import handy.scatter as hsc
from astropy.wcs import *
import astropy.wcs as awc
from astropy.coordinates import SkyCoord

# Cluster catalogue and cluster-member catalogue.
goal_data = aft.getdata(
    '/mnt/ddnfs/data_users/cxkttwl/ICL/data/redmapper/redmapper_dr8_public_v6.3_catalog.fits'
)
sub_data = aft.getdata(
    '/mnt/ddnfs/data_users/cxkttwl/ICL/data/redmapper/redmapper_dr8_public_v6.3_members.fits'
)

# find the members of each BCG cluster by finding the repeated IDs
repeat = sts.find_repeats(sub_data.ID)
# np.intp is the type np.int0 used to alias; np.int0 was removed in NumPy 2.0.
rept_ID = np.intp(repeat)
ID_array = np.intp(sub_data.ID)
sub_redshift = np.array(sub_data.Z_SPEC)  # used to figure out how big the satellite is
center_distance = sub_data.R  # select the distance of satellite galaxies
member_pos = np.array([sub_data.RA, sub_data.DEC])  # record the position of satellite

# read the center galaxy position
RA = np.array(goal_data.RA)
DEC = np.array(goal_data.DEC)
redshift = np.array(goal_data.Z_SPEC)
richness = np.array(goal_data.LAMBDA)
host_ID = np.array(goal_data.ID)

# exclude the entries with no spectroscopic redshift (flagged as -1)
z_eff = redshift[redshift != -1]
def wilcoxon(x, y=None, zero_method="wilcox", correction=False,
             alternative="two-sided"):
    """
    Calculate the Wilcoxon signed-rank test.

    The Wilcoxon signed-rank test tests the null hypothesis that two
    related paired samples come from the same distribution. In particular,
    it tests whether the distribution of the differences x - y is symmetric
    about zero. It is a non-parametric version of the paired T-test.

    Parameters
    ----------
    x : array_like
        The first set of measurements.
    y : array_like, optional
        The second set of measurements. If `y` is not given, then the `x`
        array is considered to be the differences between the two sets of
        measurements.
    zero_method : {"pratt", "wilcox", "zsplit"}, optional. Default is "wilcox".
        "pratt":
            includes zero-differences in the ranking process,
            but drops the ranks of the zeros, see [4]_, (more conservative)
        "wilcox":
            discards all zero-differences, the default
        "zsplit":
            includes zero-differences in the ranking process and split the
            zero rank between positive and negative ones
    correction : bool, optional
        If True, apply continuity correction by adjusting the Wilcoxon rank
        statistic by 0.5 towards the mean value when computing the
        z-statistic. Default is False.
    alternative : {"two-sided", "greater", "less"}, optional
        The alternative hypothesis to be tested, see Notes. Default is
        "two-sided".

    Returns
    -------
    w_statistic : float
        If `alternative` is "two-sided", the sum of the ranks of the
        differences above or below zero, whichever is smaller.
        Otherwise the sum of the ranks of the differences above zero.
    z_statistic : float
        The z-statistic of the normal approximation.
    pvalue : float
        The p-value for the test depending on `alternative`.

    Raises
    ------
    ValueError
        If `zero_method` or `alternative` is not one of the accepted values,
        or if `x` and `y` have different lengths.

    See Also
    --------
    kruskal, mannwhitneyu

    Notes
    -----
    The test has been introduced in [4]_. Given n independent samples
    (xi, yi) from a bivariate distribution (i.e. paired samples),
    it computes the differences di = xi - yi. One assumption of the test
    is that the differences are symmetric, see [2]_.
    The two-sided test has the null hypothesis that the median of the
    differences is zero against the alternative that it is different from
    zero. The one-sided test has the null that the median is positive
    against the alternative that it is negative
    (``alternative == 'less'``), or vice versa
    (``alternative == 'greater'``).

    The test uses a normal approximation to derive the p-value. A typical
    rule is to require that n > 20. For smaller n, exact tables can be used
    to find critical values.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
    .. [2] Conover, W.J., Practical Nonparametric Statistics, 1971.
    .. [3] Pratt, J.W., Remarks on Zeros and Ties in the Wilcoxon Signed
       Rank Procedures, Journal of the American Statistical Association,
       Vol. 54, 1959, pp. 655-667. :doi:`10.1080/01621459.1959.10501526`
    .. [4] Wilcoxon, F., Individual Comparisons by Ranking Methods,
       Biometrics Bulletin, Vol. 1, 1945, pp. 80-83. :doi:`10.2307/3001968`

    Examples
    --------
    In [4]_, the differences in height between cross- and self-fertilized
    corn plants is given as follows:

    >>> d = [6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75]

    Cross-fertilized plants appear to be higher. To test the null
    hypothesis that there is no height difference, we can apply the
    two-sided test:

    >>> res = wilcoxon(d)
    >>> res.w_statistic, res.pvalue
    (24.0, 0.04088813291185591)

    Hence, we would reject the null hypothesis at a confidence level of 5%,
    concluding that there is a difference in height between the groups.
    To confirm that the median of the differences can be assumed to be
    positive, we use:

    >>> res = wilcoxon(d, alternative='greater')
    >>> res.w_statistic, res.pvalue
    (96.0, 0.020444066455927955)

    This shows that the null hypothesis that the median is negative can be
    rejected at a confidence level of 5% in favor of the alternative that
    the median is greater than zero. The p-value based on the approximation
    is within the range of 0.019 and 0.054 given in [2]_.
    Note that the statistic changed to 96 in the one-sided case (the sum
    of ranks of positive differences) whereas it is 24 in the two-sided
    case (the minimum of sum of ranks above and below zero).
    """
    WilcoxonResult = namedtuple('WilcoxonResult',
                                ('w_statistic', 'z_statistic', 'pvalue'))

    if zero_method not in ["wilcox", "pratt", "zsplit"]:
        raise ValueError("Zero method should be either 'wilcox' "
                         "or 'pratt' or 'zsplit'")

    if alternative not in ["two-sided", "less", "greater"]:
        raise ValueError("Alternative must be either 'two-sided', "
                         "'greater' or 'less'")

    if y is None:
        d = asarray(x)
    else:
        x, y = map(asarray, (x, y))
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon. Aborting.')
        d = x - y

    if zero_method == "wilcox":
        # Keep all non-zero differences
        d = compress(np.not_equal(d, 0), d, axis=-1)

    count = len(d)
    if count < 10:
        warnings.warn("Sample size too small for normal approximation.")

    r = stats.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r, axis=0)
    r_minus = np.sum((d < 0) * r, axis=0)

    if zero_method == "zsplit":
        r_zero = np.sum((d == 0) * r, axis=0)
        r_plus += r_zero / 2.
        r_minus += r_zero / 2.

    # return min for two-sided test, but r_plus for one-sided test
    # the literature is not consistent here
    # r_plus is more informative since r_plus + r_minus = count*(count+1)/2,
    # i.e. the sum of the ranks, so r_minus and the min can be inferred
    # (If zero_method='pratt', r_plus + r_minus = count*(count+1)/2 - r_zero.)
    # [3] uses the r_plus for the one-sided test, keep min for two-sided test
    # to keep backwards compatibility
    if alternative == "two-sided":
        T = min(r_plus, r_minus)
    else:
        # Both one-sided tests are based on r_plus.  Using r_minus for
        # "less" together with the lower tail below inverted the p-value:
        # strong evidence for x < y produced a p-value close to 1.
        T = r_plus

    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    if zero_method == "pratt":
        r = r[d != 0]

    replist, repnum = stats.find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

    se = np.sqrt(se / 24)

    # apply continuity correction if applicable
    d = 0
    if correction:
        if alternative == "two-sided":
            d = 0.5 * np.sign(T - mn)
        elif alternative == "less":
            d = -0.5
        else:
            d = 0.5

    # compute statistic and p-value using normal approximation
    z = (T - mn - d) / se
    if alternative == "two-sided":
        prob = 2. * distributions.norm.sf(abs(z))
    elif alternative == "greater":
        # large T = r_plus indicates x is greater than y; i.e.
        # accept alternative in that case and return small p-value (sf)
        prob = distributions.norm.sf(z)
    else:
        # small T = r_plus indicates x is less than y (cdf)
        prob = distributions.norm.cdf(z)

    return WilcoxonResult(T, z, prob)
def test_basic(self): a = [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 5] res, nums = stats.find_repeats(a) assert_array_equal(res, [1, 2, 3, 4]) assert_array_equal(nums, [3, 3, 2, 2])
def wilcoxon(x, y=None, zero_method="wilcox", correction=False):
    """
    Calculate the Wilcoxon signed-rank test.

    The Wilcoxon signed-rank test tests the null hypothesis that two
    related paired samples come from the same distribution. In particular,
    it tests whether the distribution of the differences x - y is symmetric
    about zero. It is a non-parametric version of the paired T-test.

    Parameters
    ----------
    x : array_like
        The first set of measurements.
    y : array_like, optional
        The second set of measurements. If `y` is not given, then the `x`
        array is considered to be the differences between the two sets of
        measurements.
    zero_method : string, {"pratt", "wilcox", "zsplit"}, optional
        "pratt":
            Pratt treatment: includes zero-differences in the ranking
            process (more conservative)
        "wilcox":
            Wilcox treatment: discards all zero-differences
        "zsplit":
            Zero rank split: just like Pratt, but splitting the zero rank
            between positive and negative ones
    correction : bool, optional
        If True, apply continuity correction by adjusting the Wilcoxon rank
        statistic by 0.5 towards the mean value when computing the
        z-statistic. Default is False.

    Returns
    -------
    T : float
        The sum of the ranks of the differences above or below zero,
        whichever is smaller. ``None`` when no differences remain.
    p-value : float
        The two-sided p-value for the test (1.0 when no differences remain).
    outcome : integer
        The direction of the effect (if any):
            +1 means a positive difference (x > y)
            -1 means a negative difference (x < y)
             0 means no difference
        This is computed by comparing W_plus and W_minus.

    Raises
    ------
    ValueError
        If `zero_method` is not an accepted value or `x` and `y` differ in
        length.

    Notes
    -----
    Because the normal approximation is used for the calculations, the
    samples used should be large. A typical rule is to require that
    n > 20.

    References
    ----------
    .. [1] http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
    """
    if zero_method not in ["wilcox", "pratt", "zsplit"]:
        # Adjacent literals instead of a backslash inside the string, which
        # embedded the source indentation in the message.
        raise ValueError("Zero method should be either 'wilcox' "
                         "or 'pratt' or 'zsplit'")

    if y is None:
        # Convert to an array: a plain list would break abs(d) and the
        # boolean comparisons below for "pratt" and "zsplit".
        d = np.asarray(x)
    else:
        x, y = map(np.asarray, (x, y))
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon. Aborting.')
        d = x - y

    if zero_method == "wilcox":
        # Keep all non-zero differences
        d = np.compress(np.not_equal(d, 0), d, axis=-1)

    count = len(d)
    if count == 0:
        return None, 1.0, 0.
    if count < 10:
        warnings.warn(
            "Warning: sample size too small for normal approximation.")

    r = stats.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r, axis=0)
    r_minus = np.sum((d < 0) * r, axis=0)

    if zero_method == "zsplit":
        r_zero = np.sum((d == 0) * r, axis=0)
        r_plus += r_zero / 2.
        r_minus += r_zero / 2.

    T = min(r_plus, r_minus)
    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    if zero_method == "pratt":
        r = r[d != 0]

    replist, repnum = stats.find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

    se = np.sqrt(se / 24)
    correction = 0.5 * int(bool(correction)) * np.sign(T - mn)
    z = (T - mn - correction) / se
    prob = 2. * stats.distributions.norm.sf(abs(z))

    # Added by GR: compute direction of effect (if present)
    outcome = 2 * (r_plus > r_minus) - 1 if r_plus != r_minus else 0

    return T, prob, outcome
def test_empty_result(self): # Check that empty arrays are returned when there are no repeats. a = [10, 20, 50, 30, 40] repeated, counts = stats.find_repeats(a) assert_array_equal(repeated, []) assert_array_equal(counts, [])
def test_basic(self): a = [1,2,3,4,1,2,3,4,1,2,5] res,nums = stats.find_repeats(a) assert_array_equal(res,[1,2,3,4]) assert_array_equal(nums,[3,3,2,2])
def wilcoxon(x, y=None, alpha=0.05, alternative='two-sided', mode='auto', zero_method='wilcox', return_tuple=False):
    """Wilcoxon signed-rank test.

    :param x: First sample to compare. If `y` is not provided, will correspond to the difference :math:`x - y`.
    :type x: :obj:`numpy.array`
    :param y: Second sample to compare, defaults to None.
    :type y: :obj:`numpy.array`, optional
    :param alpha: Significance level used for the critical value, defaults to 0.05.
    :type alpha: :obj:`float`, optional
    :param alternative: Perform a one or two-sided test. Values can be `two-sided`, `greater`, `less`, defaults to 'two-sided'.
    :type alternative: :obj:`str`, optional
    :param mode: Method to calculate the p-value. With `auto`, the exact distribution is used when the sample size is at most 25 and the normal approximation otherwise. Zero differences always force the normal approximation. Values can be `auto`, `approx` or `exact`, defaults to 'auto'.
    :type mode: :obj:`str`, optional
    :param zero_method: Method to handle the zero differences, defaults to 'wilcox'
    :type zero_method: :obj:`str`, optional
    :param return_tuple: Return a tuple with the z statistic, critical value and p-value instead of printing the summary, defaults to False.
    :type return_tuple: :obj:`bool`, optional

    :example:

    >>> from statinf import stats
    >>> import numpy as np
    >>> x = np.random.poisson(2, size=100)
    >>> y = x + np.random.normal(loc=0, scale=1, size=100)
    >>> stats.wilcoxon(x, y)
    ... +------------------------------------------------------------+
    ... |                       Wilcoxon test                        |
    ... +------------+----------------+------------+---------+-------+
    ... |     df     | Critical value | Stat value | p-value |   H0  |
    ... +------------+----------------+------------+---------+-------+
    ... |        100 |   1.9599639845 |  -1.316878 | 0.18788 | True  |
    ... +------------+----------------+------------+---------+-------+
    ...  * We cannot reject H0: x - y ~ symmetric distribution centered in 0
    ...  * The T-value is: 2142.0

    :reference: * Wilcoxon, F., Individual Comparisons by Ranking Methods, Biometrics Bulletin, Vol. 1, 1945, pp. 80-83.
        * Cureton, E.E., The Normal Approximation to the Signed-Rank Sampling Distribution When Zero Differences are Present, Journal of the American Statistical Association, Vol. 62, 1967, pp. 1068-1069.

    :raises ValueError: If the normal approximation is used with `zero_method` 'wilcox' or 'pratt' and every difference is zero.

    :return: Prints the summary of the test, or returns the tuple (z statistic, critical value, p-value) when `return_tuple` is True.
    :rtype: :obj:`tuple`
    """

    # Code mostly inspired from: https://github.com/scipy/scipy/blob/v1.7.0/scipy/stats/morestats.py#L2984-L3233

    # Quantile order for the critical value
    if alternative == 'two-sided':
        quant_order = 1 - (alpha / 2)
    else:
        quant_order = 1 - alpha
    h0 = 'x - y ~ symmetric distribution centered in 0'
    h1 = 'x - y is not a symmetric distribution centered in 0'

    if y is None:
        # If y is not provided, we consider x already corresponds to x - y
        d = format_object(x, to_type='array', name='x')
    else:
        x = format_object(x, to_type='array', name='x')
        y = format_object(y, to_type='array', name='y')
        d = x - y

    if mode == "auto":
        if len(d) <= 25:
            mode = "exact"
        else:
            mode = "approx"

    n_zero = np.sum(d == 0)
    if n_zero > 0:
        mode = "approx"
        warnings.warn("Found some ties, switching mode to 'approx.'")

    if mode == "approx":
        if zero_method in ["wilcox", "pratt"]:
            if n_zero == len(d):
                raise ValueError("zero_method 'wilcox' and 'pratt' do not "
                                 "work if x - y is zero for all elements.")
        if zero_method == "wilcox":
            # Keep all non-zero differences
            d = np.array([_d for _d in d if _d != 0])

    count = len(d)
    if count < 10 and mode == "approx":
        # The exception used to be constructed without being raised, so the
        # message was silently lost.  Emit it as a warning: raising would
        # break small samples that were forced into 'approx' by zeros.
        warnings.warn(
            f"Sample size is too small for normal approximation, got n={count}."
        )

    r = scp.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r)
    r_minus = np.sum((d < 0) * r)

    if alternative == "two-sided":
        T = min(r_plus, r_minus)
    else:
        T = r_plus

    # Normal-approximation z statistic.  It is computed for every mode
    # because the summary and the returned tuple both report it; it used to
    # be defined only in 'approx' mode, so every 'exact' run failed with an
    # unbound variable.  In 'exact' mode there are no zero differences, so
    # the Pratt adjustment below is a no-op there.
    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    if zero_method == "pratt":
        r = r[d != 0]
        # normal approximation needs to be adjusted, see Cureton (1967)
        mn -= n_zero * (n_zero + 1.) * 0.25
        se -= n_zero * (n_zero + 1.) * (2. * n_zero + 1.)

    _, repnum = scp.find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

    se = math.sqrt(se / 24)
    # No continuity correction is applied.
    z = (T - mn) / se

    # Estimation with the normal approximation
    if mode == "approx":
        if alternative == "two-sided":
            p = 2. * scp.norm.sf(abs(z))
        elif alternative == "greater":
            # large T = r_plus indicates x is greater than y; i.e.
            # accept alternative in that case and return small p-value (sf)
            p = scp.norm.sf(z)
        else:
            p = scp.norm.cdf(z)
    # Exact estimation
    elif mode == "exact":
        # Get frequencies cnt of the possible positive ranksums r_plus
        cnt = scp._hypotests._get_wilcoxon_distr(count)
        # Note: r_plus is int (ties not allowed), need int for slices below
        r_plus = int(r_plus)
        if alternative == "two-sided":
            if r_plus == (len(cnt) - 1) // 2:
                # r_plus is the center of the distribution.
                p = 1.0
            else:
                p_less = np.sum(cnt[:r_plus + 1]) / 2**count
                p_greater = np.sum(cnt[r_plus:]) / 2**count
                p = 2 * min(p_greater, p_less)
        elif alternative == "greater":
            p = np.sum(cnt[r_plus:]) / 2**count
        else:
            p = np.sum(cnt[:r_plus + 1]) / 2**count

    cv = scp.norm.ppf(quant_order)

    _summ = test_summary(df=count, critical_value=cv, t_value=z,
                         p_value=p, alpha=alpha,
                         title='Wilcoxon test',
                         h0=h0, h1=h1,
                         extra=f' * The T-value is: {round(T, 5)}')

    if return_tuple:
        return z, cv, p
    else:
        print(_summ)
def friedman(dv=None, within=None, subject=None, data=None,
             export_filename=None):
    """Friedman test for repeated measurements.

    Parameters
    ----------
    dv : string
        Name of column containing the dependent variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.
    data : pandas DataFrame
        DataFrame
    export_filename : string
        Filename (without extension) for the output file.
        If None, do not export the table.
        By default, the file will be created in the current python console
        directory. To change that, specify the filename with full path.

    Returns
    -------
    stats : DataFrame
        Test summary ::

        'Source' : Name of the within-subject factor
        'ddof1' : degrees of freedom (number of conditions - 1)
        'Q' : The Friedman Q statistic, corrected for ties
        'p-unc' : Uncorrected p-value

    Notes
    -----
    The Friedman test is used for one-way repeated measures ANOVA by ranks.

    Data are expected to be in long-format.

    Note that if the dataset contains one or more other within subject
    factors, an automatic collapsing to the mean is applied on the dependent
    variable (same behavior as the ezANOVA R package). As such, results can
    differ from those of JASP. If you can, always double-check the results.

    Due to the assumption that the test statistic has a chi squared
    distribution, the p-value is only reliable for n > 10 and more than 6
    repeated measurements.

    NaN values are automatically removed.

    Examples
    --------
    Compute the Friedman test for repeated measurements.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject', data=df)
                      Source  ddof1      Q     p-unc
    Friedman  Disgustingness      1  9.228  0.002384
    """
    from scipy.stats import rankdata, chi2, find_repeats

    # Check data
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Collapse to the mean. Only the dependent variable is aggregated:
    # averaging the whole frame raises on non-numeric columns with
    # pandas >= 2.0, and no other column is used below.
    data = data.groupby([subject, within])[dv].mean().reset_index()

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject,
                            data=data[[subject, within, dv]])

    # Extract number of groups and total sample size
    grp = data.groupby(within)[dv]
    rm = list(data[within].unique())
    k = len(rm)
    # One row per subject, one column per level of the within factor
    X = np.array([grp.get_group(r).values for r in rm]).T
    n = X.shape[0]

    # Rank per subject
    ranked = np.zeros(X.shape)
    for i, row in enumerate(X):
        ranked[i] = rankdata(row)

    ssbn = (ranked.sum(axis=0)**2).sum()

    # Compute the test statistic
    Q = (12 / (n * k * (k + 1))) * ssbn - 3 * n * (k + 1)

    # Correct for ties: accumulate t * (t^2 - 1) over every group of tied
    # values found within a subject's row.
    ties = 0
    for row in X:
        _, repnum = find_repeats(row)
        for t in repnum:
            ties += t * (t * t - 1)

    c = 1 - ties / float(k * (k * k - 1) * n)
    Q /= c

    # Approximate the p-value
    ddof1 = k - 1
    p_unc = chi2.sf(Q, ddof1)

    # Create output dataframe
    stats = pd.DataFrame(
        {
            'Source': within,
            'ddof1': ddof1,
            'Q': np.round(Q, 3),
            'p-unc': p_unc,
        },
        index=['Friedman'])

    col_order = ['Source', 'ddof1', 'Q', 'p-unc']

    stats = stats.reindex(columns=col_order)
    stats.dropna(how='all', axis=1, inplace=True)

    # Export to .csv
    if export_filename is not None:
        _export_table(stats, export_filename)
    return stats
'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=5) xy = raw_data[class_member_mask & ~core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=1) plt.title('Estimated number of clusters ({}) and noise points ({})'.format( n_clusters_, n_noise_)) plt.axis("off") plt.show() stats.find_repeats(labels) # Second sample raw_data = pd.DataFrame.to_numpy(outline_data[["x", "y", "z"]], dtype="float64") db = DBSCAN(eps=0.04, min_samples=30).fit(raw_data) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) unique_labels = set(labels) colors = [
def wilcoxon_greater(x, y, zero_method="wilcox", correction=False):
    """One-sided paired test of whether ``x`` is larger than ``y``.

    Adapted from scipy's Wilcoxon signed-rank implementation.

    Behaviour, in order:

    * If ``x`` and ``y`` are element-wise close (NaNs compared as equal),
      ``WilcoxonResult(nan, nan)`` is returned.
    * If there are fewer than 10 samples and the inputs are not both
      constant, a one-sample t-test on ``x - y`` is used instead and its
      two-sided p-value is converted to a one-sided one.
    * Otherwise the signed-rank statistic is computed with a normal
      approximation. Pairs with ``x == y`` but ``x + y != 0`` are given a
      difference of ``-1`` as a penalty for equal values.

    Parameters
    ----------
    x, y : array_like
        Paired measurements of equal length.
    zero_method : {"wilcox", "pratt", "zsplit"}
        How zero differences are treated.
    correction : bool
        If truthy, apply a 0.5 continuity correction towards the mean.

    Returns
    -------
    WilcoxonResult
        ``(statistic, p-value)``.

    Raises
    ------
    ValueError
        If ``zero_method`` is not recognised or ``x`` and ``y`` differ in
        length (both checked on the signed-rank path only).
    """
    # Convert up front so that list/tuple inputs support the element-wise
    # arithmetic (``x - y``) used by the small-sample branch below.
    x = np.asarray(x)
    if y is not None:
        y = np.asarray(y)

    if np.allclose(x, y, equal_nan=True):
        return WilcoxonResult(np.nan, np.nan)

    if len(x) < 10 and not (np.allclose(x, x[0]) and np.allclose(y, y[0])):
        # Sample size too small for the normal approximation: use the t-test
        # and halve the two-sided p-value in the direction of the mean.
        t_statistic, t_pvalue = ttest_1samp(x - y, popmean=0)
        if np.mean(x - y) > 0:
            t_pvalue /= 2.0
        else:
            t_pvalue = 1 - t_pvalue / 2.0
        return WilcoxonResult(t_statistic, t_pvalue)

    if zero_method not in ["wilcox", "pratt", "zsplit"]:
        raise ValueError("Zero method should be either 'wilcox' "
                         "or 'pratt' or 'zsplit'")

    if y is None:
        # NOTE(review): looks unreachable, since the np.allclose call above
        # is expected to fail first when y is None -- confirm before removing.
        d = x
    else:
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon. Aborting.')
        d = x - y
        d[(d == 0) & (x + y != 0)] = -1  # penalty for equal value

    if zero_method == "wilcox":
        # Keep all non-zero differences
        d = np.compress(np.not_equal(d, 0), d, axis=-1)

    count = len(d)

    r = stats.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r, axis=0)
    r_minus = np.sum((d < 0) * r, axis=0)

    if zero_method == "zsplit":
        # Split the ranks of zero differences evenly between both sides
        r_zero = np.sum((d == 0) * r, axis=0)
        r_plus += r_zero / 2.
        r_minus += r_zero / 2.

    T = min(r_plus, r_minus)
    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    if zero_method == "pratt":
        r = r[d != 0]

    _, repnum = find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()

    se = np.sqrt(se / 24)
    correction = 0.5 * int(bool(correction)) * np.sign(T - mn)
    z = (T - mn - correction) / se
    # One-sided p-value: small when the positive ranks dominate
    if r_plus > r_minus:
        prob = distributions.norm.sf(abs(z))
    else:
        prob = 1 - distributions.norm.sf(abs(z))

    return WilcoxonResult(T, prob)
import numpy as np
from scipy.stats import describe, find_repeats, entropy, iqr, pearsonr

# 100 random values scaled to [0, 100) and rounded, so repeats are likely.
data = np.random.random((100)) * 100
data = np.round(data)
print(describe(data))
print(find_repeats(data))
print('entropy', entropy(data))
print('iqr', iqr(data))

# Two random integer pairs for the correlation demo.
# ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is equivalent.
data = np.round(np.random.random((2, 2)) * 100).astype(int)
x, y = data[0], data[1]
print('x', x)
print('y', y)
print('pearsonr', pearsonr(x, y))
def find_repeats(array):
    """Delegate to ``stats.find_repeats`` and return its result unchanged.

    :param array: Values forwarded as-is to ``stats.find_repeats``.
    :return: Whatever ``stats.find_repeats`` returns for ``array``.
    """
    repeats = stats.find_repeats(array)
    return repeats
def my_wilcoxon_test(x, y=None, alternative='less', correction=False):
    """
    Calculate the paired Wilcoxon signed-rank test.

    ** Modified scipy implementation to mimic R implementation with support
    for one-sided tests **
    https://github.com/scipy/scipy/blob/v1.0.0/scipy/stats/morestats.py#L2316-L2413
    https://github.com/SurajGupta/r-source/blob/master/src/library/stats/R/wilcox.test.R

    Parameters
    ----------
    x : array_like
        The first set of measurements.
    y : array_like, optional
        The second set of measurements.  If `y` is not given, then the `x`
        array is considered to be the differences between the two sets of
        measurements.
    alternative : string, {"two.sided", "less", "greater"}, optional
        Alternative hypothesis. Default is "less".
    correction : bool, optional
        If True, apply continuity correction by adjusting the Wilcoxon rank
        statistic by 0.5 towards the mean value when computing the
        z-statistic.  Default is False.

    Returns
    -------
    float: The p-value of the test for the requested alternative.

    Raises
    ------
    ValueError
        If `alternative` is not recognised, or if `x` and `y` have
        different lengths.

    Notes
    -----
    Because the normal approximation is used for the calculations, the
    samples used should be large.  A typical rule is to require that
    n > 20. A warning is issued when fewer than 20 non-zero differences
    remain.

    References
    ----------
    .. [1] http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
    """
    if alternative not in ['two.sided', 'less', 'greater']:
        raise ValueError("Alternative hypothesis should be either 'two.sided', 'less' or 'greater'")

    if y is None:
        # Use the same qualified call as the paired branch; the bare
        # ``asarray`` name is not something this block brings into scope.
        d = np.asarray(x)
    else:
        x, y = map(np.asarray, (x, y))
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon. Aborting.')
        d = x - y

    # Keep all non-zero differences (zero_method == "wilcox")
    d = np.compress(np.not_equal(d, 0), d, axis=-1)

    count = len(d)
    if count < 20:
        warnings.warn("Warning: sample size too small for normal approximation.")

    # T is the sum of the ranks of the positive differences
    r = stats.rankdata(abs(d))
    T = np.sum((d > 0) * r, axis=0)

    # Mean and variance of T under the null hypothesis
    mn = count * (count + 1.) / 4.
    se = count * (count + 1.) * (2. * count + 1.) / 24.

    _, repnum = stats.find_repeats(r)
    if repnum.size != 0:
        # Correction for repeated elements.
        se -= (repnum ** 3 - repnum).sum() / 48.

    se = np.sqrt(se)

    # Continuity correction moves T by 0.5 towards the mean
    correct = 0.
    if correction:
        if alternative == "two.sided":
            correct = 0.5 * np.sign(T - mn)
        elif alternative == "greater":
            correct = 0.5
        elif alternative == "less":
            correct = -0.5

    z = (T - mn - correct) / se

    prob = None
    if alternative == "two.sided":
        prob = 2. * min(stats.distributions.norm.cdf(z),
                        stats.distributions.norm.sf(z))
    elif alternative == "greater":
        prob = stats.distributions.norm.sf(z)
    elif alternative == "less":
        prob = stats.distributions.norm.cdf(z)

    return prob
def _mean_repeats(dist):
    """Return the mean of the repeat counts ``st.find_repeats`` reports for ``dist``."""
    _, repeat_counts = st.find_repeats(dist)
    return np.mean(repeat_counts)
def F_test(self, alpha=0.05):
    """Run Friedman-type rank tests on ``self.wdf``.

    Each row of ``self.wdf`` is ranked across its columns; the rank matrix
    is stored on ``self.rank_mat``. Three statistics are reported:

    * the classical Friedman chi-square approximation, divided by the
      tie-correction factor;
    * the same statistic computed as a ratio of sums of squares of the
      ranks (``SST / SSE``);
    * the Iman-Davenport F statistic derived from the first one.

    :param alpha: Significance level compared against each p-value.
    :return: DataFrame with one row per test holding the statistic, its
        degrees of freedom, the p-value and whether H0 can be rejected.
    """
    data = self.wdf

    stat = pd.DataFrame(index=['Friedman xấp xỉ',
                               'Friedman chính xác',
                               'Iman-Davenport'])

    X = data.values
    n, k = data.shape

    # Friedman chi-squared test: rank every row across its columns
    rank_mat = np.zeros(X.shape)
    for i in range(n):
        rank_mat[i] = stats.rankdata(X[i, :])

    self.rank_mat = rank_mat

    # Approximate method (classical formula, not yet corrected for ties)
    ssb = (rank_mat.sum(axis=0)**2).sum()
    F_approx = (12 / (n * k * (k + 1))) * ssb - 3 * n * (k + 1)

    # Exact method: ratio of between-column to within-row rank variation
    rj = rank_mat.mean(axis=0)
    rm = rank_mat.mean()

    SST = n * ((rj - rm)**2).sum()
    SSE = ((rank_mat - rm)**2).sum() / (n * (k - 1))
    F_exact = SST / SSE

    # Tie correction factor
    ties = 0
    for i in range(n):
        _, repnum = stats.find_repeats(X[i])
        for t in repnum:
            ties += t * (t * t - 1)

    c = 1 - ties / float(k * (k * k - 1) * n)

    # Only the classical formula needs the correction. SSE is computed from
    # the actual (possibly tied) ranks, so SST / SSE already equals the
    # tie-corrected statistic; dividing it by c again would correct twice.
    F_approx /= c

    dof = (k - 1)

    p1 = stats.chi2.sf(F_approx, dof)
    p2 = stats.chi2.sf(F_exact, dof)

    # Iman-Davenport F test
    Fc = ((n - 1) * F_approx) / (n * (k - 1) - F_approx)
    dof_1 = k - 1
    dof_2 = (k - 1) * (n - 1)

    p3 = f_test.sf(Fc, dof_1, dof_2)

    stat['F'] = [F_approx, F_exact, Fc]
    stat['Độ tự do'] = [str(dof), str(dof), f'({dof_1},{dof_2})']
    stat['Giá trị p'] = [p1, p2, p3]
    stat['Phủ định H0'] = ['Có thể' if i else 'Không thể'
                           for i in stat['Giá trị p'] < alpha]

    return stat