def preliminary_stats(selected_column_widths, selected_column_heights):
    # 10%-trimmed statistics for the column widths
    trimmed_selected_column_widths = stats.trimboth(selected_column_widths, 0.1)
    mean_trimmed_selected_column_widths = np.mean(trimmed_selected_column_widths)
    std_trimmed_selected_column_widths = np.std(trimmed_selected_column_widths)
    # 10%-trimmed statistics for the column heights
    trimmed_selected_column_heights = stats.trimboth(selected_column_heights, 0.1)
    mean_trimmed_selected_column_heights = np.mean(trimmed_selected_column_heights)
    std_trimmed_selected_column_heights = np.std(trimmed_selected_column_heights)

    print("\n")
    print(mean_trimmed_selected_column_widths)
    print(std_trimmed_selected_column_widths)
    print(mean_trimmed_selected_column_heights)
    print(std_trimmed_selected_column_heights)

    return mean_trimmed_selected_column_widths, std_trimmed_selected_column_widths, \
        mean_trimmed_selected_column_heights, std_trimmed_selected_column_heights
def trimmed_mean(arr, axis=-1, ratio=2., use_sem=True):
    """Robust mean/spread: trim 10% from each end, then reject points more
    than `ratio` trimmed standard deviations from the trimmed mean."""
    arr = np.sort(arr, axis=axis).astype(float)  # float so the NaN masking below works
    std = np.std(st.trimboth(arr, 0.1, axis), keepdims=True)
    mean = np.mean(st.trimboth(arr, 0.1, axis), keepdims=True)
    # mask everything further than ratio * std from the trimmed mean
    idx = np.abs(arr - mean) > ratio * std
    n = np.sqrt(np.sum(~idx, axis))
    if not use_sem:
        n = 1
    arr[idx] = np.nan
    mean = np.nanmean(arr, axis)
    std = np.nanstd(arr, axis, ddof=1) / n
    return mean, std
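A minimal usage sketch for trimmed_mean, assuming numpy as np and scipy.stats as st are imported as in the function; the synthetic data and the injected outliers are invented for illustration only.

import numpy as np
from scipy import stats as st

rng = np.random.default_rng(0)
x = rng.normal(loc=10.0, scale=0.5, size=200)  # made-up noisy measurements
x[:6] += 25.0                                  # a handful of outliers

robust_mean, robust_spread = trimmed_mean(x)
print(robust_mean, robust_spread)              # barely affected by the spikes
print(np.mean(x), np.std(x, ddof=1))           # plain estimates for comparison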
def calibrate_flat(self, sub):
    ''' Perform calibrations on the flat: subtract the bias if available and
    rescale so the mean intensity is .5 (outlier-rejection methods used to
    combine flat subs work best with normalised frames because of changing
    light levels; the value of .5 lets us use the B & W controls; we rescale
    to a mean of 1 when saving, since that is what a good flat needs for
    dividing)
    '''
    im = sub.get_image()

    # subtract bias if available
    bias = self.get_bias(sub)
    if bias is not None:
        # print('subtracting bias')
        im = im - self.get_master(bias)

    # normalise by a robust mean of the central third of the image
    perc = 75  # trimboth cuts (100 - perc) / 100 = 25% of pixel values from each end
    w, h = im.shape
    w1, w2 = int(w / 3), int(2 * w / 3)
    h1, h2 = int(h / 3), int(2 * h / 3)
    imr = im[h1:h2, w1:w2]
    robust_mean = np.mean(
        trimboth(np.sort(imr.ravel(), axis=0), (100 - perc) / 100, axis=0),
        axis=0)
    sub.image = .5 * im / robust_mean
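The normalisation step can be exercised on its own. A sketch with a made-up flat frame, keeping the same trimboth call as the method above; the vignetting model, array size and hot-pixel pattern are invented for illustration.

import numpy as np
from scipy.stats import trimboth

# synthetic flat frame: smooth vignetting plus a few hot pixels (illustrative)
h, w = 300, 400
yy, xx = np.mgrid[0:h, 0:w]
flat = 1000.0 * (1 - 0.2 * (((yy - h / 2) / h) ** 2 + ((xx - w / 2) / w) ** 2))
flat[::97, ::89] += 5000.0  # hot pixels

perc = 75  # same behaviour as the method: cut 25% of pixel values from each end
imr = flat[h // 3:2 * h // 3, w // 3:2 * w // 3]
robust_mean = np.mean(trimboth(np.sort(imr.ravel()), (100 - perc) / 100))
print(robust_mean, (.5 * flat / robust_mean).mean())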
def line_stats(service_rates):
    if not service_rates:
        return [0, 0]
    trimmed_rates = stats.trimboth(service_rates, 0.1)
    mean = stat.mean(trimmed_rates)
    stddev = stat.stdev(trimmed_rates, mean)
    return [mean, stddev]
def lfstdv(y_in, x_in=None):
    y = y_in.copy()
    if x_in is not None:
        y = y[np.argsort(x_in)]
    # local second differences: y[i] - 0.5 * (y[i-1] + y[i+1])
    delta = np.sort((y[1:-1] - y[2:]) - 0.5 * (y[:-2] - y[2:]))
    # scaled to match the standard deviation of gaussian noise
    # on a constant signal; also trim outliers
    return 0.8166137 * np.std(trimboth(delta, 0.05))
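A quick sanity check of the scaling: the constant 0.8166 is approximately 1/sqrt(1.5), which undoes the variance inflation of the second difference y[i] - 0.5*(y[i-1] + y[i+1]) under independent Gaussian noise (its variance is 1.5 * sigma^2); the 5% trimming then makes the estimate slightly conservative. A rough sketch, assuming trimboth is imported from scipy.stats:

import numpy as np
from scipy.stats import trimboth

rng = np.random.default_rng(1)
sigma = 2.0
y = 5.0 + rng.normal(scale=sigma, size=10000)  # constant signal + noise
print(lfstdv(y), sigma)  # robust noise estimate vs. the true noise level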
def _make_plot(self, dc, filename):
    fig = plt.figure(figsize=(10, 6))
    ax1 = fig.add_subplot(111)
    plt.subplots_adjust(left=0.125, right=0.9, top=0.9, bottom=0.135)
    col_index = 0
    for name, data in dc.items():
        # 25%-trimmed mean of each column (one value per round)
        mean = np.array([np.mean(trimboth(np.sort(x), 0.25)) for x in data.T])
        plt.plot(mean, color=colors[col_index], label=name, linewidth=2)
        col_index += 1
    ax1.set_xlabel("Round")
    ax1.set_ylabel("Cumulative regret")
    plt.legend(loc=0)  # 0=best, 4=lower right
    fig.savefig(filename + "_plot.pdf", dpi=400, bbox_inches="tight")
def create_histogram(a, trim_p, bin_size, p_title, p_ylabel, p_xlabel,
                     file_prefix, dec_prec=0, trim_type="both"):
    """Interacts with pylab to draw and save a histogram plot.

    Arguments:
    a -- array_like list of values to plot
    trim_p -- Proportion (range 0 to 1) of values to trim (float)
    bin_size -- Size of histogram's value bins (float)
    p_title -- Title of plot
    p_ylabel -- ylabel of plot
    p_xlabel -- xlabel of plot
    file_prefix -- Filename prefix
    dec_prec -- (Optional) Decimal precision of bins. Defaults to 0. (int)
    trim_type -- (Optional) Controls which tail of the distribution the
                 proportion trim_p is cut from. Values are "both", "left"
                 and "right". Defaults to "both".
    """
    a.sort()
    sample_size = len(a)
    print("a length pre-trim_p", len(a))
    if trim_type == "left" or trim_type == "right":
        a = stats.trim1(a, trim_p, trim_type)
    else:
        a = stats.trimboth(a, trim_p)
    print("a length post-trim_p", len(a))
    bin_min = math.floor(min(a))  # TODO Round down to dec_prec instead
    bin_max = round(max(a), dec_prec)
    print("bin size=" + str(bin_size) + ", bin min=" + str(bin_min) +
          ", bin max=" + str(bin_max))
    # Create histogram of values
    n, bins, patches = pylab.hist(a, bins=pylab.frange(bin_min, bin_max, bin_size),
                                  normed=False, histtype="stepfilled")
    pylab.setp(patches, "facecolor", "g", "alpha", 0.75)
    pylab.title(p_title)
    pylab.xlabel(p_xlabel)
    pylab.ylabel(p_ylabel)
    if trim_p > 0:
        pylab.savefig(file_prefix + "_trimmed.png")
    else:
        pylab.savefig(file_prefix + ".png")
    pylab.show()
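A possible invocation with invented values; note that pylab.frange and the normed= keyword come from older matplotlib releases, so running this assumes a matching environment.

import numpy as np

durations = list(np.random.gamma(shape=2.0, scale=3.0, size=500))  # fake data
create_histogram(durations, trim_p=0.05, bin_size=1.0,
                 p_title="Task durations", p_ylabel="Count",
                 p_xlabel="Seconds", file_prefix="durations")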
def _mev(bss_data):
    """
    Maximum Epoch Variance. Captures the temporal dynamics of horizontal
    eye movements.

    Parameters
    ----------
    bss_data : array
        Array with dimensions CxTxE, where C is the number of components,
        T the number of time instants and E the number of events.

    Returns
    -------
    res : array
        Vector of length C with the computed values for each component.
    """
    # As _t_dim < _ev_dim, the events dimension will be shifted down one
    # position after computing the variance (the time dimension disappears)
    ev_dim = _ev_dim - 1
    # Compute time variance
    try:
        var_data = bss_data.var(axis=_t_dim)
    except AttributeError:
        raise _chk_parameters(bss_data=bss_data)
    # Sort across events before trimming (also validates the axis)
    try:
        np.sort(var_data, axis=ev_dim)
    except ValueError:
        raise _chk_parameters(bss_data=bss_data)
    # Trimmed vector of time variances
    var = sp_stats.trimboth(np.sort(var_data, axis=ev_dim), 0.01, axis=ev_dim)
    # Final results
    res = var.max(axis=ev_dim) / var.mean(axis=ev_dim)
    # bss_data dimensionality has to be checked explicitly, as an ND array
    # with N > 3 does not raise an exception
    if bss_data.ndim > 3:
        raise _chk_parameters(bss_data=bss_data)
    # Done
    return res
def get_features(spaces):
    # List of features:
    # duration; mean; variance; ...
    fmat = []
    for s in spaces:
        st = trimboth(s, 0.15)
        fvec = []
        fvec.append(len(s))               # duration
        fvec.append(s.max() / st.mean())  # max over trimmed mean
        fvec.append(st.var())             # variance
        fvec.append(st.mean())            # mean
        fvec.append(skew(s))              # skewness
        # difference between the 0.65 and 0.95 quantiles
        q = mquantiles(s, prob=[0.65, 0.95])
        fvec.append(q[1] - q[0])
        fmat.append(fvec)
    fmat = np.array(fmat)
    return fmat
def check_correlations(df1, df2, plot=False):
    """
    Checks for correlations between two dataframes; both must have a
    'daily_pct_chg' column. This can be used to check that a 2x or 3x ETF is
    properly correlated with the underlying asset. Also gets the
    multiplication factor and its standard deviation.
    """
    # check correlation to make sure it's high
    both = pd.concat([df1['daily_pct_chg'], df2['daily_pct_chg']], axis=1).dropna()
    both.columns = ['regular', 'leveraged']
    if plot:
        corr = both.corr()  # usually around 99.9 or 99.8
        both.plot.scatter(x='regular', y='leveraged')
        plt.title('correlation: ' + str(round(corr.iloc[0, 1], 4)))
        plt.show()

    # look at distribution of TQQQ return multiples
    t = (both['leveraged'] / both['regular']).fillna(0).to_frame()
    t[np.isinf(t[0])] = 0
    # exclude outliers, which throw off the mean:
    # trimboth removes the lowest and highest 5% of the multiples
    new_t = trimboth(t[0], 0.05)
    # t = t[t[0] < t[0].quantile(0.9)]  # some large outliers
    # t[(t < 6) & (t > -6)].hist(bins=50)
    if plot:
        plt.hist(new_t, bins=50)
        plt.show()
    print('mean and std for multiples:')
    avg, std = new_t.mean(), new_t.std()
    print(avg)
    print(std)
    return avg
def measure_error(results, one_dim=False):
    metrics = {}
    for result in results:
        # option = result[0].split('_', 1)[1]
        option = result[0].rsplit('_', 1)[0]
        if option in metrics:
            metrics[option].append(result[1])
        else:
            metrics[option] = [result[1]]
    errors = {}
    for option, result in metrics.items():  # .items() for Python 3
        print(option, len(result))
        if not one_dim:
            result = np.atleast_2d(result)
        else:
            result = np.array(result)
        if result.size == 0:
            continue
        result = np.sort(result, 0)
        # trim the extreme 5% along axis 0 before averaging
        result = trimboth(result, 0.05, 0)
        errors[option] = np.mean(result, 0), np.sqrt(np.var(result, 0))
    return errors
def bootstrap_conf(estimates, alpha=0.05):
    # percentile interval: cut alpha/2 of the bootstrap estimates from each
    # tail and take the extremes of what remains
    trim_pct = alpha / 2
    new_arr = ss.trimboth(a=estimates, proportiontocut=trim_pct)
    return [new_arr[0], new_arr[-1]]
def bootstrap_conf_pivot(estimates, samp_est, alpha=0.05):
    # pivot (basic) bootstrap interval, also usable as bootstrap_conf_delta:
    # trim the tails of the bootstrap differences, then reflect them around
    # the sample estimate
    diffs = np.array(estimates) - samp_est
    new_arr = ss.trimboth(a=diffs, proportiontocut=alpha / 2)
    return [samp_est - new_arr[-1], samp_est - new_arr[0]]
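A hedged usage sketch for the two interval helpers above; the exponential sample and the resampling loop are invented for illustration, and ss/np refer to scipy.stats and numpy as in the functions.

import numpy as np
from scipy import stats as ss

rng = np.random.default_rng(42)
sample = rng.exponential(scale=3.0, size=200)  # made-up data
samp_est = sample.mean()

# bootstrap distribution of the mean
estimates = [rng.choice(sample, size=sample.size, replace=True).mean()
             for _ in range(2000)]

print(bootstrap_conf(estimates, alpha=0.05))            # percentile interval
print(bootstrap_conf_pivot(estimates, samp_est, 0.05))  # pivot/basic interval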
def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--infile", required=True, help="Tabular file.") parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.") parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi") parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi") parser.add_argument( "--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;", ) parser.add_argument("--test_id", help="statistical test method") parser.add_argument( "--mwu_use_continuity", action="store_true", default=False, help= "Whether a continuity correction (1/2.) should be taken into account.", ) parser.add_argument( "--equal_var", action="store_true", default=False, help= "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.", ) parser.add_argument( "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values.", ) parser.add_argument( "--fisher", action="store_true", default=False, help="if true then Fisher definition is used", ) parser.add_argument( "--bias", action="store_true", default=False, help= "if false,then the calculations are corrected for statistical bias", ) parser.add_argument( "--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored", ) parser.add_argument( "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored", ) parser.add_argument( "--inclusive", action="store_true", default=False, help="if false,limit will be ignored", ) parser.add_argument( "--printextras", action="store_true", default=False, help= "If True, if there are extra points a warning is raised saying how many of those points there are", ) parser.add_argument( "--initial_lexsort", action="store_true", default="False", help= "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.", ) parser.add_argument( "--correction", action="store_true", default=False, help="continuity correction ", ) parser.add_argument( "--axis", type=int, default=0, help= "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)", ) parser.add_argument( "--n", type=int, default=0, help= "the number of trials. This is ignored if x gives both the number of successes and failures", ) parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram") parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.") parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction") parser.add_argument( "--score", type=int, default=0, help="Score that is compared to the elements in a.", ) parser.add_argument("--m", type=float, default=0.0, help="limits") parser.add_argument("--mf", type=float, default=2.0, help="lower limit") parser.add_argument("--nf", type=float, default=99.9, help="higher_limit") parser.add_argument( "--p", type=float, default=0.5, help= "The hypothesized probability of success. 0 <= p <= 1. 
The default value is p = 0.5", ) parser.add_argument("--alpha", type=float, default=0.9, help="probability") parser.add_argument( "--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds", ) parser.add_argument( "--proportiontocut", type=float, default=0.0, help="Proportion (in range 0-1) of total data set to trim of each end.", ) parser.add_argument( "--lambda_", type=float, default=1.0, help= "lambda_ gives the power in the Cressie-Read power divergence statistic", ) parser.add_argument( "--imbda", type=float, default=0, help= "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.", ) parser.add_argument( "--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e", ) parser.add_argument("--dtype", help="dtype") parser.add_argument("--med", help="med") parser.add_argument("--cdf", help="cdf") parser.add_argument("--zero_method", help="zero_method options") parser.add_argument("--dist", help="dist options") parser.add_argument("--ties", help="ties options") parser.add_argument("--alternative", help="alternative options") parser.add_argument("--mode", help="mode options") parser.add_argument("--method", help="method options") parser.add_argument("--md", help="md options") parser.add_argument("--center", help="center options") parser.add_argument("--kind", help="kind options") parser.add_argument("--tail", help="tail options") parser.add_argument("--interpolation", help="interpolation options") parser.add_argument("--statistic", help="statistic options") args = parser.parse_args() infile = args.infile outfile = open(args.outfile, "w+") test_id = args.test_id nf = args.nf mf = args.mf imbda = args.imbda inclusive1 = args.inclusive1 inclusive2 = args.inclusive2 sample0 = 0 sample1 = 0 sample2 = 0 if args.sample_cols is not None: sample0 = 1 barlett_samples = [] for sample in args.sample_cols.split(";"): barlett_samples.append(map(int, sample.split(","))) if args.sample_one_cols is not None: sample1 = 1 sample_one_cols = args.sample_one_cols.split(",") if args.sample_two_cols is not None: sample_two_cols = args.sample_two_cols.split(",") sample2 = 1 for line in open(infile): sample_one = [] sample_two = [] cols = line.strip().split("\t") if sample0 == 1: b_samples = columns_to_values(barlett_samples, line) if sample1 == 1: for index in sample_one_cols: sample_one.append(cols[int(index) - 1]) if sample2 == 1: for index in sample_two_cols: sample_two.append(cols[int(index) - 1]) if test_id.strip() == "describe": size, min_max, mean, uv, bs, bk = stats.describe( map(float, sample_one)) cols.append(size) cols.append(min_max) cols.append(mean) cols.append(uv) cols.append(bs) cols.append(bk) elif test_id.strip() == "mode": vals, counts = stats.mode(map(float, sample_one)) cols.append(vals) cols.append(counts) elif test_id.strip() == "nanmean": m = stats.nanmean(map(float, sample_one)) cols.append(m) elif test_id.strip() == "nanmedian": m = stats.nanmedian(map(float, sample_one)) cols.append(m) elif test_id.strip() == "kurtosistest": z_value, p_value = stats.kurtosistest(map(float, sample_one)) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "variation": ra = stats.variation(map(float, sample_one)) cols.append(ra) elif test_id.strip() == "itemfreq": freq = stats.itemfreq(map(float, sample_one)) for list in freq: elements = ",".join(map(str, list)) cols.append(elements) elif test_id.strip() == 
"nanmedian": m = stats.nanmedian(map(float, sample_one)) cols.append(m) elif test_id.strip() == "variation": ra = stats.variation(map(float, sample_one)) cols.append(ra) elif test_id.strip() == "boxcox_llf": IIf = stats.boxcox_llf(imbda, map(float, sample_one)) cols.append(IIf) elif test_id.strip() == "tiecorrect": fa = stats.tiecorrect(map(float, sample_one)) cols.append(fa) elif test_id.strip() == "rankdata": r = stats.rankdata(map(float, sample_one), method=args.md) cols.append(r) elif test_id.strip() == "nanstd": s = stats.nanstd(map(float, sample_one), bias=args.bias) cols.append(s) elif test_id.strip() == "anderson": A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist) cols.append(A2) for list in critical: cols.append(list) cols.append(",") for list in sig: cols.append(list) elif test_id.strip() == "binom_test": p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p) cols.append(p_value) elif test_id.strip() == "gmean": gm = stats.gmean(map(float, sample_one), dtype=args.dtype) cols.append(gm) elif test_id.strip() == "hmean": hm = stats.hmean(map(float, sample_one), dtype=args.dtype) cols.append(hm) elif test_id.strip() == "kurtosis": k = stats.kurtosis( map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias, ) cols.append(k) elif test_id.strip() == "moment": n_moment = stats.moment(map(float, sample_one), n=args.n) cols.append(n_moment) elif test_id.strip() == "normaltest": k2, p_value = stats.normaltest(map(float, sample_one)) cols.append(k2) cols.append(p_value) elif test_id.strip() == "skew": skewness = stats.skew(map(float, sample_one), bias=args.bias) cols.append(skewness) elif test_id.strip() == "skewtest": z_value, p_value = stats.skewtest(map(float, sample_one)) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "sem": s = stats.sem(map(float, sample_one), ddof=args.ddof) cols.append(s) elif test_id.strip() == "zscore": z = stats.zscore(map(float, sample_one), ddof=args.ddof) for list in z: cols.append(list) elif test_id.strip() == "signaltonoise": s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof) cols.append(s2n) elif test_id.strip() == "percentileofscore": p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind) cols.append(p) elif test_id.strip() == "bayes_mvs": c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha) cols.append(c_mean) cols.append(c_var) cols.append(c_std) elif test_id.strip() == "sigmaclip": c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n) cols.append(c) cols.append(c_low) cols.append(c_up) elif test_id.strip() == "kstest": d, p_value = stats.kstest( map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode, ) cols.append(d) cols.append(p_value) elif test_id.strip() == "chi2_contingency": chi2, p, dof, ex = stats.chi2_contingency( map(float, sample_one), correction=args.correction, lambda_=args.lambda_) cols.append(chi2) cols.append(p) cols.append(dof) cols.append(ex) elif test_id.strip() == "tmean": if nf == 0 and mf == 0: mean = stats.tmean(map(float, sample_one)) else: mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(mean) elif test_id.strip() == "tmin": if mf == 0: min = stats.tmin(map(float, sample_one)) else: min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive) cols.append(min) elif test_id.strip() == "tmax": if nf == 0: max = stats.tmax(map(float, sample_one)) else: max 
= stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive) cols.append(max) elif test_id.strip() == "tvar": if nf == 0 and mf == 0: var = stats.tvar(map(float, sample_one)) else: var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(var) elif test_id.strip() == "tstd": if nf == 0 and mf == 0: std = stats.tstd(map(float, sample_one)) else: std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(std) elif test_id.strip() == "tsem": if nf == 0 and mf == 0: s = stats.tsem(map(float, sample_one)) else: s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(s) elif test_id.strip() == "scoreatpercentile": if nf == 0 and mf == 0: s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation, ) else: s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation, ) for list in s: cols.append(list) elif test_id.strip() == "relfreq": if nf == 0 and mf == 0: rel, low_range, binsize, ex = stats.relfreq( map(float, sample_one), args.b) else: rel, low_range, binsize, ex = stats.relfreq( map(float, sample_one), args.b, (mf, nf)) for list in rel: cols.append(list) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "binned_statistic": if nf == 0 and mf == 0: st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b, ) else: st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b, range=(mf, nf), ) cols.append(st) cols.append(b_edge) cols.append(b_n) elif test_id.strip() == "threshold": if nf == 0 and mf == 0: o = stats.threshold(map(float, sample_one), newval=args.new) else: o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new) for list in o: cols.append(list) elif test_id.strip() == "trimboth": o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut) for list in o: cols.append(list) elif test_id.strip() == "trim1": t1 = stats.trim1( map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail, ) for list in t1: cols.append(list) elif test_id.strip() == "histogram": if nf == 0 and mf == 0: hi, low_range, binsize, ex = stats.histogram( map(float, sample_one), args.b) else: hi, low_range, binsize, ex = stats.histogram( map(float, sample_one), args.b, (mf, nf)) cols.append(hi) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "cumfreq": if nf == 0 and mf == 0: cum, low_range, binsize, ex = stats.cumfreq( map(float, sample_one), args.b) else: cum, low_range, binsize, ex = stats.cumfreq( map(float, sample_one), args.b, (mf, nf)) cols.append(cum) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "boxcox_normmax": if nf == 0 and mf == 0: ma = stats.boxcox_normmax(map(float, sample_one)) else: ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method) cols.append(ma) elif test_id.strip() == "boxcox": if imbda == 0: box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha) cols.append(box) cols.append(ma) cols.append(ci) else: box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha) cols.append(box) elif test_id.strip() == "histogram2": h2 = stats.histogram2(map(float, sample_one), map(float, sample_two)) for list in h2: cols.append(list) elif 
test_id.strip() == "ranksums": z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two)) cols.append(z_statistic) cols.append(p_value) elif test_id.strip() == "ttest_1samp": t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two)) for list in t: cols.append(list) for list in prob: cols.append(list) elif test_id.strip() == "ansari": AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two)) cols.append(AB) cols.append(p_value) elif test_id.strip() == "linregress": slope, intercept, r_value, p_value, stderr = stats.linregress( map(float, sample_one), map(float, sample_two)) cols.append(slope) cols.append(intercept) cols.append(r_value) cols.append(p_value) cols.append(stderr) elif test_id.strip() == "pearsonr": cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two)) cols.append(cor) cols.append(p_value) elif test_id.strip() == "pointbiserialr": r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two)) cols.append(r) cols.append(p_value) elif test_id.strip() == "ks_2samp": d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two)) cols.append(d) cols.append(p_value) elif test_id.strip() == "mannwhitneyu": mw_stats_u, p_value = stats.mannwhitneyu( map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity, ) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "zmap": z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof) for list in z: cols.append(list) elif test_id.strip() == "ttest_ind": mw_stats_u, p_value = stats.ttest_ind(map(float, sample_one), map(float, sample_two), equal_var=args.equal_var) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "ttest_rel": t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(t) cols.append(prob) elif test_id.strip() == "mood": z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(z) cols.append(p_value) elif test_id.strip() == "shapiro": W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta) cols.append(W) cols.append(p_value) for list in a: cols.append(list) elif test_id.strip() == "kendalltau": k, p_value = stats.kendalltau( map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort, ) cols.append(k) cols.append(p_value) elif test_id.strip() == "entropy": s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base) cols.append(s) elif test_id.strip() == "spearmanr": if sample2 == 1: rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two)) else: rho, p_value = stats.spearmanr(map(float, sample_one)) cols.append(rho) cols.append(p_value) elif test_id.strip() == "wilcoxon": if sample2 == 1: T, p_value = stats.wilcoxon( map(float, sample_one), map(float, sample_two), zero_method=args.zero_method, correction=args.correction, ) else: T, p_value = stats.wilcoxon( map(float, sample_one), zero_method=args.zero_method, correction=args.correction, ) cols.append(T) cols.append(p_value) elif test_id.strip() == "chisquare": if sample2 == 1: rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof) else: rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof) cols.append(rho) cols.append(p_value) elif test_id.strip() == "power_divergence": if sample2 == 1: stat, p_value = stats.power_divergence( map(float, 
sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_, ) else: stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_) cols.append(stat) cols.append(p_value) elif test_id.strip() == "theilslopes": if sample2 == 1: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha) else: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha) cols.append(mpe) cols.append(met) cols.append(lo) cols.append(up) elif test_id.strip() == "combine_pvalues": if sample2 == 1: stat, p_value = stats.combine_pvalues( map(float, sample_one), method=args.med, weights=map(float, sample_two), ) else: stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med) cols.append(stat) cols.append(p_value) elif test_id.strip() == "obrientransform": ob = stats.obrientransform(*b_samples) for list in ob: elements = ",".join(map(str, list)) cols.append(elements) elif test_id.strip() == "f_oneway": f_value, p_value = stats.f_oneway(*b_samples) cols.append(f_value) cols.append(p_value) elif test_id.strip() == "kruskal": h, p_value = stats.kruskal(*b_samples) cols.append(h) cols.append(p_value) elif test_id.strip() == "friedmanchisquare": fr, p_value = stats.friedmanchisquare(*b_samples) cols.append(fr) cols.append(p_value) elif test_id.strip() == "fligner": xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(xsq) cols.append(p_value) elif test_id.strip() == "bartlett": T, p_value = stats.bartlett(*b_samples) cols.append(T) cols.append(p_value) elif test_id.strip() == "levene": w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(w) cols.append(p_value) elif test_id.strip() == "median_test": stat, p_value, m, table = stats.median_test( ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples) cols.append(stat) cols.append(p_value) cols.append(m) cols.append(table) for list in table: elements = ",".join(map(str, list)) cols.append(elements) outfile.write("%s\n" % "\t".join(map(str, cols))) outfile.close()
from pandas import read_csv
import matplotlib.pyplot as plt
from scipy import stats

from reader import read

results = read()
for dataset in ['BBBC004', 'BBBC039']:
    fig, axs = plt.subplots(2, 2)  # name + ", " + trans[m])
    fig.suptitle(dataset)
    for m in ['jaccard', 'dice', 'adj_rand', 'warping_error']:
        fcm_vals = results[dataset]['fcm'][m]
        unet_vals = results[dataset]['unet'][m]
        dognet_vals = results[dataset]['dognet'][m]
        trim = 0.0  # proportion to cut from each end (0.0 = no trimming)
        trimmed_fcm = stats.trimboth(fcm_vals, trim)
        trimmed_unet = stats.trimboth(unet_vals, trim)
        trimmed_dognet = stats.trimboth(dognet_vals, trim)
        trans = dict(
            zip(['jaccard', 'dice', 'adj_rand', 'warping_error'],
                ['Jaccard Index', 'Dice Coefficient', 'Adj. Rand index',
                 'Warping error']))
        if m == 'jaccard':
            p = axs[0][0]
            p.set_xlim([0.7, 1.0])
        elif m == 'dice':
            p = axs[0][1]
            p.set_xlim([0.7, 1.0])
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

dataset = load_boston()
samples, label, feature_names = dataset.data, dataset.target, dataset.feature_names

# trimboth trims each array independently (10% from each end), so the trimmed
# samples and labels are no longer paired row-for-row
samples_trim = stats.trimboth(samples, 0.1)
label_trim = stats.trimboth(label, 0.1)
print(samples.shape, label.shape)
print(samples_trim.shape, label_trim.shape)

samples_train, samples_test, label_train, label_test = train_test_split(
    samples_trim, label_trim, test_size=0.2, random_state=0)
print(samples_train.shape, label_train.shape)
print(samples_test.shape, label_test.shape)

regressor = LinearRegression()
regressor.fit(samples_train, label_train)
label_pred = regressor.predict(samples_test)

plt.scatter(label_test, label_pred)
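Worth noting: because trimboth operates on each array separately, the rows of samples_trim no longer correspond to label_trim. If the intent is to drop extreme targets while keeping features and labels aligned, one alternative is a boolean mask on the label quantiles; a sketch under that assumption (not what the snippet above does), reusing the samples and label arrays from it.

import numpy as np

lo, hi = np.quantile(label, [0.1, 0.9])      # keep the central 80% of targets
keep = (label >= lo) & (label <= hi)
samples_kept, label_kept = samples[keep], label[keep]
print(samples_kept.shape, label_kept.shape)  # rows stay aligned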
def main(data, output, header, trim):
    if header:
        output.write(
            f" title signal lag_min lag_delta rise_mean rise_stddev\n"
        )
    args = [InputArg(x) for x in data]
    for arg in args:
        if arg.path.exists() and arg.path.is_file():
            continue
        sys.stderr.write(f"{arg.path}: file does not exist\n")
        exit(1)
    for arg in args:
        with open(arg.path, "r") as f:
            samples = read_measurements(f)
        changetimes = []
        risetimes = []
        deltas = []
        for sample in samples:
            times = np.array(sample.times)
            values = np.array(sample.values)
            (times, values) = linear_interp(times, values)
            values = moving_average(values)
            # The moving average discards the ends of the values to reduce error
            times = times[2:-2]
            # Check if there's a significant difference between the initial and
            # final light level
            rise = values[-1] - values[0]
            if rise < 10:
                raise Exception("No significant difference in light level")
            deltas.append(rise)
            rise_lim = rise * .1
            begin = np.argmax(values > (values[1] + rise_lim))
            end = np.argmin(values < (values[-1] - rise_lim))
            risetime = times[end] - times[begin]
            risetimes.append(risetime)
            midpoint = np.argmax(values > (values[1] + (rise / 2)))
            changetime = times[midpoint]
            changetimes.append(changetime)
        signal_delta = sum(deltas) / len(deltas)
        changetimes = np.array(changetimes)
        if trim != 0:
            changetimes = stats.trimboth(changetimes, trim)
        (loc, scale) = stats.uniform.fit(changetimes)
        lag_min = loc
        (rise_mu, rise_std) = stats.norm.fit(risetimes)
        output.write(
            f"{arg.title:>20} {signal_delta:10.3f} {lag_min:10.3f} "
            f"{scale:10.3f} {rise_mu:10.3f} {rise_std:11.3f}\n"
        )
minimum = np.round(np.amin(CRIM), decimals=1)
maximum = np.round(np.amax(CRIM), decimals=1)
variance = np.round(np.var(CRIM), decimals=1)
mean = np.round(np.mean(CRIM), decimals=1)
Befor_trim = np.vstack((minimum, maximum, variance, mean))

# trimmed statistics restricted to the limits (1, 40)
minimum_trim = stats.tmin(CRIM, 1)
maximum_trim = stats.tmax(CRIM, 40)
variance_trim = stats.tvar(CRIM, (1, 40))
mean_trim = stats.tmean(CRIM, (1, 40))
After_trim = np.round(np.vstack((minimum_trim, maximum_trim, variance_trim, mean_trim)),
                      decimals=1)

stat_labels1 = ['minm', 'maxm', 'vari', 'mean']
Basic_stastice1 = np.hstack((Befor_trim, After_trim))
print("      Before  After")
for stat_label, row1 in zip(stat_labels1, Basic_stastice1):
    print('%s [%s]' % (stat_label, ''.join('%07s' % a for a in row1)))

CRIM_TRIMED = stats.trimboth(CRIM, 0.2)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 2))
axs = [ax1, ax2]
df = [CRIM, CRIM_TRIMED]
list_methods = ['Before trim', 'After trim']
for n in range(0, len(axs)):
    axs[n].hist(df[n], bins='auto')
    axs[n].set_title('{}'.format(list_methods[n]))

# Correlation
@author: mariaguadalupe
"""
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as stats

data = pd.read_csv(
    "C:/Users/mariaguadalupe/Downloads/MinDa/Code/salaries_nyrb.csv")
col = 'Salary'
print('Statistics for ', col)
dataCol = np.array(data[col].values)
print(dataCol)
print('Minimum = ', np.min(dataCol))
print('Maximum = ', np.max(dataCol))
print('First quartile = ', np.percentile(dataCol, 25))
print('Median = ', np.median(dataCol))
print('Mean = ', np.mean(dataCol))
print('Third quartile = ', np.percentile(dataCol, 75))
iqr = np.percentile(dataCol, 75) - np.percentile(dataCol, 25)
print('IQR = ', iqr)
plt.boxplot(dataCol)
plt.figure()

per = 0.1
trim_data = stats.trimboth(data[col], per)
print('Statistics for ', col, 'trimmed at ', per)
print('Minimum = ', np.min(trim_data))
def _percentile_clip(a, perc=80):
    # mean of the values left after cutting (100 - perc) / 100 from each end
    return np.mean(trimboth(np.sort(a, axis=0), (100 - perc) / 100, axis=0),
                   axis=0)
def trimmed_std(a, p):
    # standard deviation after trimming p / 2 of the data from each end
    temp = stats.trimboth(a, p / 2)
    return np.std(temp)
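A small illustration of how the trimmed estimate reacts to outliers; the numbers are made up.

import numpy as np
from scipy import stats

values = np.array([9.8, 10.1, 10.0, 9.9, 10.2, 55.0])  # one wild point
print(np.std(values))            # inflated by the outlier
print(trimmed_std(values, 0.4))  # trims 20% from each end first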
stdout = sys.stdout
PATH = "present/"
fe = "Thelen_2020_SPW37_spectrum.txt"  # "bonus/2013_00227_X1942_spectrum_f.txt
files = [f"{PATH}{f}" for f in os.listdir(PATH) if f == fe]
# files += [f"data/{f}" for f in os.listdir("data/")]

for f in files:
    os.system(f"notify-send 'started!' {f}")
    data = autospec.read(autospec.SpectralFile(f))
    data = data._asdict()
    # build the frequency mask once so intensity stays aligned with frequency
    mask = np.where((data['frequency'] > 255.8) & (data['frequency'] < 256.08))
    data['intensity'] = data['intensity'][mask]
    data['frequency'] = data['frequency'][mask]
    data2 = autospec.SpectralData(**data)
    # identify spikes using a 10%-trimmed standard deviation
    spikes = autospec.identify_spikes(
        data2, std_method=lambda x: ss.trimboth(x, 0.1).std())
    autospec.plot_spikes(spikes)
    # import pdb; pdb.set_trace()
    save_path = f.split("/")[-1].replace(".", "_").replace(
        "_txt", "") + "working_example_zoom2"
    molecules = autospec.get_molecules_from_spikes(spikes, f"{save_path}.obj")
    cover = autospec.SetCovering(spikes, molecules, threshold=0.1)
    cover.save_results(save_path)

os.system("notify-send 'done!'")
def test_trimboth(self):
    a = np.arange(20)
    b = stats.trimboth(a, 0.1)
    bm = stats.mstats.trimboth(a, 0.1)
    assert_allclose(np.sort(b), bm.data[~bm.mask])
def trimmed_std(a, p=0.05):
    from scipy import stats
    temp = stats.trimboth(a, p / 2)
    return np.std(temp)
def std(x):
    return ss.trimboth(x, 0.1).std()
def test_trimboth(self):
    a = np.arange(20)
    b = stats.trimboth(a, 0.1)
    bm = stats.mstats.trimboth(a, 0.1)
    assert(np.all(b == bm.data[~bm.mask]))
    if insert == 0 or insert > MAXREADLEN:
        continue
    nread += 1
    if not nread % interval:  # screen trace
        print('.', end=' ', flush=True)
    lendata.append(insert)
    hist[int(insert)] += 1
    if nread > MAXREADS:
        break
print()

tdata = stat.trimboth(lendata, TRIMFRAC)
descrip = stat.describe(tdata)
lenmean = descrip.mean
lensd = np.sqrt(descrip.variance)
print('\n{} reads read from {}'.format(nread, filename))
print('\n{} fraction of values trimmed from each end'.format(TRIMFRAC))
print('trimmed mean: {:.3f}\ttrimmed standard deviation: {:.3f}'.format(
    lenmean, lensd))

fig = plt.figure()
ax = fig.add_subplot(111)
# insert size histogram
bins = [i for i in range(int(descrip.minmax[1])) if not i % BINSIZE]
bins.append(int(descrip.minmax[1]))
def test_trimboth(self):
    a = np.arange(20)
    b = stats.trimboth(a, 0.1)
    bm = stats.mstats.trimboth(a, 0.1)
    assert_allclose(b, bm.data[~bm.mask])
Suppose we need to identify the number of gems in a box. We ask numerous people, record their guesses, and note them down. By taking a trimmed mean and comparing the mean and the median, we get a result close to the true count. This is crowd computing / social computing; sites such as Quora and Wikipedia use the same idea to surface an answer to a question.

from scipy import stats
import matplotlib.pyplot as plt
import statistics

Estimates = []  # list of guesses (numbers or data)
Estimates.sort()  # sort the data
# tv = int(0.1 * len(Estimates))               # trimming 10% of the data
# Estimates = Estimates[tv:]                   # deleting the smallest 10% of values
# Estimates = Estimates[:len(Estimates) - tv]  # deleting the largest 10% of values
for i in range(len(Estimates)):
    print(Estimates[i])

a = stats.trimboth(Estimates, proportiontocut=0.1)
print(a)
m = stats.trim_mean(Estimates, 0.1)  # proportiontocut=0.1 cuts 10% of the data from each end
print(m)

# We just need to show the list, so plot it on the x axis with y held constant
y = []
for i in range(len(a)):
    y.append(5)
plt.plot(a, y, 'ro')
plt.plot([statistics.mean(a)], [5], 'bo')
plt.plot([statistics.median(a)], [5], 'go')
# plt.plot([actual value], [5], 'ro')  # plot the actual count for comparison
# Comparing the results, we get a number close to the actual number.