Example #1
def preliminary_stats(selected_column_widths, selected_column_heights):

    # trim 10% from each tail of the widths, then take the mean and std
    trimmed_selected_column_widths = stats.trimboth(selected_column_widths,
                                                    0.1)
    mean_trimmed_selected_column_widths = np.mean(
        trimmed_selected_column_widths)
    std_trimmed_selected_column_widths = np.std(trimmed_selected_column_widths)

    # repeat the same trimmed summary for the heights
    trimmed_selected_column_heights = stats.trimboth(selected_column_heights,
                                                     0.1)
    mean_trimmed_selected_column_heights = np.mean(
        trimmed_selected_column_heights)
    std_trimmed_selected_column_heights = np.std(
        trimmed_selected_column_heights)

    print("\n")
    print(mean_trimmed_selected_column_widths)
    print(std_trimmed_selected_column_widths)
    print(mean_trimmed_selected_column_heights)
    print(std_trimmed_selected_column_heights)

    # return the four trimmed summary statistics
    return mean_trimmed_selected_column_widths, std_trimmed_selected_column_widths, \
           mean_trimmed_selected_column_heights, std_trimmed_selected_column_heights,
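A minimal invocation sketch for the function above; the imports and the sample width/height lists are assumptions made here for illustration (the snippet itself relies on numpy as np and scipy.stats as stats).

# Hypothetical usage sketch (assumes the imports the snippet relies on).
import numpy as np
from scipy import stats

widths = [10, 11, 12, 11, 10, 95, 11, 12, 10, 11]    # one obvious outlier (95)
heights = [20, 21, 19, 20, 22, 21, 20, 19, 21, 20]
# trimboth(..., 0.1) drops the lowest and highest 10% of values before the
# mean/std are taken, so the 95 does not distort the width statistics.
w_mean, w_std, h_mean, h_std = preliminary_stats(widths, heights)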
Example #2
def trimmed_mean(arr, axis=-1, ratio=2., use_sem=True):
    arr = np.sort(arr, axis=axis)
    # robust location/scale estimates from the 10% two-sided trimmed sample
    std = np.std(st.trimboth(arr, 0.1, axis), keepdims=1)
    mean = np.mean(st.trimboth(arr, 0.1, axis), keepdims=1)
    # mask values further than `ratio` trimmed standard deviations from the trimmed mean
    idx = np.abs(arr - mean) > ratio * std
    n = np.sqrt(np.sum(~idx, axis))
    if not use_sem:
        n = 1
    arr[idx] = np.nan

    mean = np.nanmean(arr, axis)
    std = np.nanstd(arr, axis, ddof=1)/n
    return mean, std
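A small usage sketch for trimmed_mean above, on synthetic data; it assumes numpy as np and scipy.stats as st (the aliases the function body uses), and the planted outliers are made up for illustration.

import numpy as np
from scipy import stats as st   # the function body refers to scipy.stats as `st`

rng = np.random.default_rng(0)
x = rng.normal(0.0, 1.0, size=(5, 200))   # 5 rows of noisy measurements
x[:, 0] = 50.0                            # plant one gross outlier per row

m_sem, spread_sem = trimmed_mean(x, axis=-1)                  # spread reported as SEM
m_std, spread_std = trimmed_mean(x, axis=-1, use_sem=False)   # spread as plain nan-std
# the planted 50.0 values are masked before averaging, so m_sem stays near 0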
Example #4
    def calibrate_flat(self, sub):
        ''' Perform calibrations on the flat, which include subtracting the bias if
        available, and rescaling so the mean intensity is .5 (outlier-rejection
        methods used to combine flat subs work best with normalised frames because
        of changing light levels; the value of .5 lets us use the B & W controls;
        we rescale to a mean of 1 when saving, since that is what a good flat
        needs for dividing).
        '''

        im = sub.get_image()

        # subtract bias if available
        bias = self.get_bias(sub)
        if bias is not None:
            #print('subtracting bias')
            im = im - self.get_master(bias)

        # normalise by mean of image in central 3rd zone
        perc = 75  # trimboth below cuts (100 - perc)/100 = 25% from each tail of the sorted pixel values
        w, h = im.shape
        w1, w2 = int(w / 3), int(2 * w / 3)
        h1, h2 = int(h / 3), int(2 * h / 3)
        imr = im[h1:h2, w1:w2]
        robust_mean = np.mean(trimboth(np.sort(imr.ravel(), axis=0),
                                       (100 - perc) / 100,
                                       axis=0),
                              axis=0)

        sub.image = .5 * im / robust_mean
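The docstring above boils down to: divide the frame by a trimmed mean of its central third so the result has a mean intensity near .5. A standalone sketch of just that normalisation step on a synthetic frame (no Sub/bias objects), assuming numpy and scipy.stats.trimboth are available.

# Standalone sketch of the robust-mean normalisation (synthetic frame, no Sub/bias objects).
import numpy as np
from scipy.stats import trimboth

im = np.random.default_rng(1).normal(1000.0, 50.0, size=(300, 300))
w, h = im.shape
w1, w2 = int(w / 3), int(2 * w / 3)
h1, h2 = int(h / 3), int(2 * h / 3)
central = im[h1:h2, w1:w2].ravel()
robust_mean = np.mean(trimboth(np.sort(central), 0.25))   # 25% cut from each tail
flat = 0.5 * im / robust_mean                             # mean intensity now close to 0.5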
Example #5
def line_stats(service_rates):
    if not service_rates:
        return [0, 0]

    trimmed_rates = stats.trimboth(service_rates, 0.1)
    mean = stat.mean(trimmed_rates)
    stddev = stat.stdev(trimmed_rates, mean)

    return [mean, stddev]
Example #6
def lfstdv(y_in, x_in=None):
    y = y_in.copy()
    if x_in is not None:
        y = y[np.argsort(x_in)]
    delta = np.sort((y[1:-1] - y[2:]) - 0.5 * (y[:-2] - y[2:]))

    ###    scaled to match standard deviation of
    ###    gaussian noise on constant signal
    ###    also, trim outliers.
    return 0.8166137 * np.std(trimboth(delta, 0.05))
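A quick check of the comment above on synthetic data: for Gaussian noise around a constant signal the statistic tracks the noise sigma (the 5% two-sided trim biases it slightly low). The imports and the sigma value are assumptions made here.

# Sanity check on pure Gaussian noise around a constant signal.
import numpy as np
from scipy.stats import trimboth

rng = np.random.default_rng(2)
y = 10.0 + rng.normal(0.0, 0.3, size=5000)   # constant signal, sigma = 0.3 noise
print(lfstdv(y), np.std(y))                  # local noise estimate vs. global std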
Example #7
def _make_plot(self, dc, filename):
    fig = plt.figure(figsize=(10,6))
    ax1 = fig.add_subplot(111)
    plt.subplots_adjust(left=0.125, right=0.9, top=0.9, bottom=0.135)
    col_index = 0
    for name, data in dc.items():
        mean = np.array([np.mean(trimboth(np.sort(x), 0.25)) for x in data.T])
        plt.plot(mean, color=colors[col_index], label=name, linewidth=2)
        col_index += 1
    ax1.set_xlabel("Runde")
    ax1.set_ylabel("Kumulativer Regret")
    plt.legend(loc=0) # 0=best, 4=lower right
    null = fig.savefig(filename + "_plot.pdf", dpi=400,
                       bbox_inches="tight")
Example #8
def create_histogram(a, trim_p, bin_size, p_title, p_ylabel, p_xlabel, 
                     file_prefix, dec_prec=0, trim_type="both"):
    """Interacts with pylab to draw and save histogram plot.
    
    Arguments:
    a -- array_like list of values to plot
    trim_p -- Percentile (range 0 to 1) of values to trim (float)
    bin_size -- Size of histogram's value bins (float)
    p_title -- Title of plot
    p_ylabel -- ylabel of plot
    p_xlabel -- xlabel of plot
    file_prefix -- Filename prefix
    dec_prec -- (Optional) Decimal precision of bins. Defaults to 0. (int)
    trim_type -- (Optional) Controls the tail of distribution that percentile 
                 trim_p is applied. Values include "both", "left", and "right".
                 Defaults to "both".
    """
    a.sort()
    sample_size = len(a)
    print("a length pre-trim_p", len(a))
    if trim_type == "left" or trim_type == "right":
        a = stats.trim1(a, trim_p, trim_type)
    else:
        a = stats.trimboth(a, trim_p)
    print("a length post-trim_p", len(a))
    bin_min = math.floor(min(a)) # TODO Round down to dec_prec instead
    bin_max = round(max(a), dec_prec)
    print("bin size=" + str(bin_size) + 
          ", bin min=" + str(bin_min) + 
          ", bin max=" + str(bin_max))
    # Create histogram of values
    n, bins, patches = pylab.hist(a, 
                                  bins=pylab.frange(bin_min, 
                                                    bin_max, 
                                                    bin_size), 
                                  normed=False, 
                                  histtype="stepfilled")
    pylab.setp(patches, "facecolor", "g", "alpha", 0.75)
    pylab.title(p_title)
    pylab.xlabel(p_xlabel)
    pylab.ylabel(p_ylabel)
    
    if trim_p > 0:
        pylab.savefig(file_prefix + "_trimmed.png")
    else:
        pylab.savefig(file_prefix + ".png")
    pylab.show()
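The trim_type branch above switches between stats.trim1 (one tail) and stats.trimboth (both tails). A small illustration of the difference, assuming scipy.stats is imported as stats as in the snippet.

# Illustration of the trim_type branch (assumes `from scipy import stats`).
from scipy import stats

a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
print(stats.trimboth(a, 0.2))             # cuts 2 values from each end, keeping 3..8
print(stats.trim1(a, 0.2, tail='right'))  # cuts only the 2 largest values
print(stats.trim1(a, 0.2, tail='left'))   # cuts only the 2 smallest values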
Example #9
def _mev(bss_data):
    """
    Maximum Epoch Variance.

    Captures the temporal dynamics of horizontal eye movements.

    Parameters
    ----------
    bss_data : array
        Array with dimensions CxTxE, where C is the number of components, T the
        number of time instants and E the number of events

    Returns
    -------
    res : array
        Vector of length C with the computed values for each component
    """
    # As _t_dim < _ev_dim, the events dimension will be shifted down one
    # position after computing the kurtosis (as time dimension will have
    # disappeared)
    ev_dim = _ev_dim - 1

    # Compute time variance
    try:
        var_data = bss_data.var(axis=_t_dim)
    except AttributeError:
        raise _chk_parameters(bss_data=bss_data)
    # Attempt a sort across events only to validate the events axis (result discarded)
    try:
        np.sort(var_data, axis=ev_dim)
    except ValueError:
        raise _chk_parameters(bss_data=bss_data)
    # Trimmed vector of time variances
    var = sp_stats.trimboth(np.sort(var_data, axis=ev_dim), 0.01, axis=ev_dim)
    # Final results
    res = var.max(axis=ev_dim) / var.mean(axis=ev_dim)
    # bss_data dimensionality has to be checked explicitly, as a ND array with
    # N > 3 does not raise an exception
    if bss_data.ndim > 3:
        raise _chk_parameters(bss_data=bss_data)
    # Done
    return res
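A self-contained sketch of the core computation the docstring describes (per-event time variances, a 1% two-sided trim across events, then the max-to-mean ratio), with the module-level _t_dim/_ev_dim/_chk_parameters helpers assumed away; here time is taken as axis 1 and events as axis 2, and the random data are made up.

# Self-contained MEV sketch: time on axis 1, events on axis 2 (assumed here).
import numpy as np
from scipy import stats as sp_stats

rng = np.random.default_rng(3)
bss_data = rng.normal(size=(4, 100, 200))   # C=4 components, T=100 samples, E=200 events
var_data = bss_data.var(axis=1)             # per-event time variance, shape (4, 200)
var = sp_stats.trimboth(np.sort(var_data, axis=1), 0.01, axis=1)
mev = var.max(axis=1) / var.mean(axis=1)    # one value per component
print(mev)                                  # larger values flag components dominated by a few events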
Example #10
def get_features(spaces):
	# List of features per space:
	#	duration; max/mean; variance; mean; skew; (q0.95 - q0.65)

	fmat = []
	for s in spaces:
		st = trimboth(s,0.15)
		fvec = []
		fvec.append(len(s))			# duration	
		fvec.append(s.max()/st.mean())	# max over mean
		fvec.append(st.var())		# variance
		fvec.append(st.mean())		# mean
		fvec.append(skew(s))
		# difference between the 0.65 and 0.95 quantiles
		q = mquantiles(s,prob=[0.65,0.95])
		fvec.append(q[1]-q[0])

		fmat.append(fvec)

	fmat = np.array(fmat)
	return fmat
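A toy call of get_features with made-up gap sequences; it assumes the imports the snippet relies on (trimboth and skew from scipy.stats, mquantiles from scipy.stats.mstats, numpy as np).

# Toy invocation with made-up gap sequences.
import numpy as np
from scipy.stats import trimboth, skew
from scipy.stats.mstats import mquantiles

rng = np.random.default_rng(4)
spaces = [rng.exponential(0.2, size=n) for n in (40, 60, 55)]
fmat = get_features(spaces)
print(fmat.shape)   # (3, 6): duration, max/mean, variance, mean, skew, q0.95 - q0.65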
Example #11
def check_correlations(df1, df2, plot=False):
    """
    Checks the correlation between two dataframes; both must have a
    'daily_pct_chg' column. This can be used to verify that a 2x or 3x ETF is
    properly correlated with the underlying asset.

    Also estimates the leverage multiple and its standard deviation.
    """
    # check correlation to make sure it's high
    both = pd.concat([df1['daily_pct_chg'], df2['daily_pct_chg']],
                     axis=1).dropna()
    both.columns = ['regular', 'leveraged']
    if plot:
        corr = both.corr()  # usually around 99.9 or 99.8
        both.plot.scatter(x='regular', y='leveraged')
        plt.title('correlation: ' + str(round(corr.iloc[0, 1], 4)))
        plt.show()

    # look at distribution of TQQQ return multiples
    t = (both['leveraged'] / both['regular']).fillna(0).to_frame()
    t[np.isinf(t[0])] = 0
    # exclude outliers, which throw off the mean
    # trimboth drops the lowest and highest 5% of the ratio values
    new_t = trimboth(t[0], 0.05)
    # t = t[t[0] < t[0].quantile(0.9)]
    # some large outliers
    # t[(t < 6) & (t > -6)].hist(bins=50)

    if plot:
        plt.hist(new_t, bins=50)
        plt.show()

    print('mean and std for multiples:')
    avg, std = new_t.mean(), new_t.std()
    print(avg)
    print(std)
    return avg
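A synthetic sketch of the docstring's use case (a made-up underlying series and a made-up "3x" leveraged series); it assumes the function and its pandas/numpy/scipy imports above are in scope.

# Synthetic sketch: made-up daily returns for an underlying and a "3x" ETF.
import numpy as np
import pandas as pd

rng = np.random.default_rng(5)
base = rng.normal(0.0005, 0.01, size=500)
df1 = pd.DataFrame({'daily_pct_chg': base})
df2 = pd.DataFrame({'daily_pct_chg': 3 * base + rng.normal(0, 0.0005, size=500)})
print(check_correlations(df1, df2))   # trimmed multiple should land near 3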
Example #12
def measure_error(results, one_dim=False):
    metrics = {}
    for result in results:
        # option = result[0].split('_', 1)[1]
        option = result[0].rsplit('_', 1)[0]
        if option in metrics:
            metrics[option].append(result[1])
        else:
            metrics[option] = [result[1]]

    errors = {}
    for option, result in metrics.items():
        print(option, len(result))
        if not one_dim:
            result = np.atleast_2d(result)
        else:
            result = np.array(result)
        if result.size == 0:
            continue
        result = np.sort(result, 0)
        result = trimboth(result, 0.05, 0)
        errors[option] = np.mean(result, 0), np.sqrt(np.var(result, 0))

    return errors
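A toy call for the aggregator above: results are (name, value) pairs whose trailing "_<run>" suffix is stripped so repeated runs of the same option are pooled; numpy and trimboth are assumed in scope as in the snippet.

# Toy call: repeated runs of "optA" and "optB", one gross outlier in optA.
results = [("optA_%d" % i, v) for i, v in enumerate([0.1] * 19 + [0.9])] + \
          [("optB_%d" % i, v) for i, v in enumerate([0.5] * 20)]
print(measure_error(results, one_dim=True))   # the single 0.9 outlier is trimmed away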
Example #13
def bootstrap_conf(estimates, alpha=0.05):
    trim_pct = alpha / 2
    new_arr = ss.trimboth(a=estimates, proportiontocut=trim_pct)
    return [new_arr[0], new_arr[-1]]
Example #14
def bootstrap_conf_pivot(estimates,
                         samp_est,
                         alpha=0.05):  #also bootstrap_conf_delta
    diffs = np.array(estimates) - samp_est
    new_arr = ss.trimboth(a=diffs, proportiontocut=alpha / 2)
    return [samp_est - new_arr[-1], samp_est - new_arr[0]]
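A sketch exercising both confidence-interval helpers above on bootstrap replicates of a sample mean; the data, the resampling loop, and the 2000 replicates are assumptions for illustration (scipy.stats is assumed imported as ss, as in the snippets).

# Percentile and pivot bootstrap CIs for a sample mean (assumes `import scipy.stats as ss`).
import numpy as np
import scipy.stats as ss

rng = np.random.default_rng(6)
data = rng.normal(5.0, 2.0, size=200)
samp_est = data.mean()
boot_means = [rng.choice(data, size=data.size, replace=True).mean() for _ in range(2000)]

print(bootstrap_conf(boot_means, alpha=0.05))                  # percentile interval
print(bootstrap_conf_pivot(boot_means, samp_est, alpha=0.05))  # pivot/basic interval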
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o",
                        "--outfile",
                        required=True,
                        help="Path to the output file.")
    parser.add_argument("--sample_one_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument(
        "--sample_cols",
        help="Input format, like smi, sdf, inchi,separate arrays using ;",
    )
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help=
        "Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help=
        "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta",
        action="store_true",
        default=False,
        help="Whether or not to return the internally computed a values.",
    )
    parser.add_argument(
        "--fisher",
        action="store_true",
        default=False,
        help="if true then Fisher definition is used",
    )
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help=
        "if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument(
        "--inclusive1",
        action="store_true",
        default=False,
        help="if false,lower_limit will be ignored",
    )
    parser.add_argument(
        "--inclusive2",
        action="store_true",
        default=False,
        help="if false,higher_limit will be ignored",
    )
    parser.add_argument(
        "--inclusive",
        action="store_true",
        default=False,
        help="if false,limit will be ignored",
    )
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help=
        "If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help=
        "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument(
        "--correction",
        action="store_true",
        default=False,
        help="continuity correction ",
    )
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help=
        "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help=
        "the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b",
                        type=int,
                        default=0,
                        help="The number of bins to use for the histogram")
    parser.add_argument("--N",
                        type=int,
                        default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof",
                        type=int,
                        default=0,
                        help="Degrees of freedom correction")
    parser.add_argument(
        "--score",
        type=int,
        default=0,
        help="Score that is compared to the elements in a.",
    )
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help=
        "The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument(
        "--new",
        type=float,
        default=0.0,
        help="Value to put in place of values in a outside of bounds",
    )
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help=
        "lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help=
        "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument(
        "--base",
        type=float,
        default=1.6,
        help="The logarithmic base to use, defaults to e",
    )
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(
                map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one),
                                               dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one),
                                       n=args.n,
                                       p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(
                map(float, sample_one),
                axis=args.axis,
                fisher=args.fisher,
                bias=args.bias,
            )
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one),
                                        score=args.score,
                                        kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one),
                                                   alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one),
                                             low=args.m,
                                             high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one),
                cdf=args.cdf,
                N=args.N,
                alternative=args.alternative,
                mode=args.mode,
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one),
                correction=args.correction,
                lambda_=args.lambda_)
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf),
                                   (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one),
                                 lowerlimit=mf,
                                 inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one),
                                 upperlimit=nf,
                                 inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf),
                               (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two),
                    interpolation_method=args.interpolation,
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two),
                    (mf, nf),
                    interpolation_method=args.interpolation,
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one),
                                    mf,
                                    nf,
                                    newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one),
                               proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(
                map(float, sample_one),
                proportiontocut=args.proportiontocut,
                tail=args.tail,
            )
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf),
                                          method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one),
                                           alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one),
                                   imbda,
                                   alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one),
                                  map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one),
                                                  map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one),
                                        map(float, sample_two))
            for value in t:
                cols.append(value)
            for value in prob:
                cols.append(value)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one),
                                       map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two))
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one),
                                          map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one),
                                              map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one),
                                        map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one),
                map(float, sample_two),
                use_continuity=args.mwu_use_continuity,
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one),
                           map(float, sample_two),
                           ddof=args.ddof)
            for value in z:
                cols.append(value)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(map(float, sample_one),
                                                  map(float, sample_two),
                                                  equal_var=args.equal_var)
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one),
                                      map(float, sample_two),
                                      axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one),
                                    map(float, sample_two),
                                    axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one),
                                          map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for value in a:
                cols.append(value)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one),
                map(float, sample_two),
                initial_lexsort=args.initial_lexsort,
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one),
                              map(float, sample_two),
                              base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one),
                                               map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               map(float, sample_two),
                                               ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one),
                    map(float, sample_two),
                    ddof=args.ddof,
                    lambda_=args.lambda_,
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one),
                                                       ddof=args.ddof,
                                                       lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     map(float, sample_two),
                                                     alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one),
                    method=args.med,
                    weights=map(float, sample_two),
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one),
                                                      method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for row in ob:
                elements = ",".join(map(str, row))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center,
                                         proportiontocut=args.proportiontocut,
                                         *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center,
                                      proportiontocut=args.proportiontocut,
                                      *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties,
                correction=args.correction,
                lambda_=args.lambda_,
                *b_samples)
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for row in table:
                elements = ",".join(map(str, row))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
from pandas import read_csv
import matplotlib.pyplot as plt
from scipy import stats
from reader import read

results = read()

for dataset in ['BBBC004', 'BBBC039']:
    fig, axs = plt.subplots(2, 2)
    fig.suptitle(dataset)
    for m in ['jaccard', 'dice', 'adj_rand', 'warping_error']:
        fcm_vals = results[dataset]['fcm'][m]
        unet_vals = results[dataset]['unet'][m]
        dognet_vals = results[dataset]['dognet'][m]

        trim = 0.0
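        # fraction cut from each tail before comparing methods; 0.0 keeps every value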
        trimmed_fcm = stats.trimboth(fcm_vals, trim)
        trimmed_unet = stats.trimboth(unet_vals, trim)
        trimmed_dognet = stats.trimboth(dognet_vals, trim)

        trans = dict(
            zip(['jaccard', 'dice', 'adj_rand', 'warping_error'], [
                'Jaccard Index', 'Dice Coefficient', 'Adj. Rand index',
                'Warping error'
            ]))

        if m == 'jaccard':
            p = axs[0][0]
            p.set_xlim([0.7, 1.0])
        elif m == 'dice':
            p = axs[0][1]
            p.set_xlim([0.7, 1.0])
Ejemplo n.º 18
0
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression

from sklearn.datasets import load_boston
dataset = load_boston()

samples, label, feature_names = dataset.data, dataset.target, dataset.feature_names

samples_trim = stats.trimboth(samples, 0.1)
label_trim = stats.trimboth(label, 0.1)
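# Note: trimboth partially sorts each array along the trim axis before slicing off
# the tails, so trimming samples and label separately does not keep rows and their
# targets paired.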

print(samples.shape, label.shape)
print(samples_trim.shape, label_trim.shape)

from sklearn.model_selection import train_test_split
samples_train, samples_test, label_train, label_test = train_test_split(
    samples_trim, label_trim, test_size=0.2, random_state=0)

print(samples_train.shape, label_train.shape)
print(samples_test.shape, label_test.shape)

regressor = LinearRegression()
regressor.fit(samples_train, label_train)

label_pred = regressor.predict(samples_test)

import matplotlib.pyplot as plt

plt.scatter(label_test, label_pred)
Ejemplo n.º 19
0
def main(data, output, header, trim):
    if header:
        output.write(
            f"               title     signal    lag_min  lag_delta  rise_mean rise_stddev\n"
        )

    args = [InputArg(x) for x in data]

    for arg in args:
        if arg.path.exists() and arg.path.is_file():
            continue

        sys.stderr.write(f"{arg.path}: file does not exist\n")
        exit(1)

    for arg in args:
        with open(arg.path, "r") as f:
            samples = read_measurements(f)

        changetimes = []
        risetimes = []
        deltas = []
        for sample in samples:

            times = np.array(sample.times)
            values = np.array(sample.values)

            (times, values) = linear_interp(times, values)
            values = moving_average(values)
            # The moving average discards the ends of the values to reduce error
            times = times[2:-2]

            # Check if there's a significant difference between the initial and
            # final light level
            rise = values[-1] - values[0]
            if rise < 10:
                raise Exception("No significant difference in light level")

            deltas.append(rise)

            rise_lim = rise * .1
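            # The rise time is measured between the first sample more than 10% of
            # the total rise above the starting level and the first sample within
            # 10% of the final level (roughly a 10-90% rise time).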

            begin = np.argmax(values > (values[1] + rise_lim))
            end = np.argmin(values < (values[-1] - rise_lim))

            risetime = (times[end] - times[begin])
            risetimes.append(risetime)

            midpoint = np.argmax(values > (values[1] + (rise / 2)))

            changetime = times[midpoint]
            changetimes.append(changetime)

        signal_delta = sum(deltas) / len(deltas)

        changetimes = np.array(changetimes)
        if trim != 0:
            changetimes = stats.trimboth(changetimes, trim)
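        # For a uniform fit, loc is the earliest change time and scale the spread
        # (latest minus earliest), so loc serves as the minimum lag.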
        (loc, scale) = stats.uniform.fit(changetimes)
        lag_min = loc

        (rise_mu, rise_std) = stats.norm.fit(risetimes)

        output.write(
            f"{arg.title:>20} {signal_delta:10.3f} {lag_min:10.3f} {scale:10.3f} {rise_mu:10.3f} {rise_std:11.3f}\n"
        )
Ejemplo n.º 20
0
minimum = np.round(np.amin(CRIM), decimals=1)
maximum = np.round(np.amax(CRIM), decimals=1)
variance = np.round(np.var(CRIM), decimals=1)
mean = np.round(np.mean(CRIM), decimals=1)
Before_trim = np.vstack((minimum, maximum, variance, mean))
# Truncated statistics: only values inside the given limits contribute
minimum_trim = stats.tmin(CRIM, 1)
maximum_trim = stats.tmax(CRIM, 40)
variance_trim = stats.tvar(CRIM, (1, 40))
mean_trim = stats.tmean(CRIM, (1, 40))
After_trim = np.round(np.vstack((minimum_trim, maximum_trim, variance_trim,
                                 mean_trim)),
                      decimals=1)

stat_labels1 = ['minm', 'maxm', 'vari', 'mean']
Basic_statistics1 = np.hstack((Before_trim, After_trim))

print("        Before  After")
for label1, row1 in zip(stat_labels1, Basic_statistics1):
    print('%s [%s]' % (label1, ''.join('%07s' % a for a in row1)))

CRIM_TRIMED = stats.trimboth(CRIM, 0.2)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 2))
axs = [ax1, ax2]
df = [CRIM, CRIM_TRIMED]
list_methods = ['Before trim', 'After trim']
for n in range(len(axs)):
    axs[n].hist(df[n], bins='auto')
    axs[n].set_title('{}'.format(list_methods[n]))

#Correlation
Ejemplo n.º 21
0
@author: mariaguadalupe
"""

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as stats

data = pd.read_csv(
    "C:/Users/mariaguadalupe/Downloads/MinDa/Code/salaries_nyrb.csv")
col = 'Salary'

print('Statistics for ', col)
dataCol = np.array(data[col].values)
print(dataCol)
print('Minimum = ', np.min(dataCol))
print('Maximum = ', np.max(dataCol))
print('First quartile = ', np.percentile(dataCol, 25))
print('Median = ', np.median(dataCol))
print('Mean = ', np.mean(dataCol))
print('Third quartile = ', np.percentile(dataCol, 75))
iqr = np.percentile(dataCol, 75) - np.percentile(dataCol, 25)
print('IQR = ', iqr)
plt.boxplot(dataCol)

plt.figure()
per = 0.1
trim_data = stats.trimboth(data[col], per)
print('Statistics for ', col, 'trimmed at ', per)
print('Minimum = ', np.min(trim_data))
Ejemplo n.º 22
0
def _percentile_clip(a, perc=80):
    return np.mean(trimboth(np.sort(a, axis=0), (100 - perc) / 100, axis=0),
                   axis=0)
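Assuming the bare trimboth here is scipy.stats.trimboth, a quick made-up check of the behaviour: _percentile_clip(np.array([1., 2., 3., 4., 100.])) keeps the central 80% of the sorted values ([2., 3., 4.]) and returns 3.0, whereas the plain mean of the same data is 22.0.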
Ejemplo n.º 23
0
def trimmed_std(a, p):
    temp = stats.trimboth(a, p / 2)
    return np.std(temp)
Ejemplo n.º 24
0
stdout = sys.stdout

PATH = "present/"
fe = "Thelen_2020_SPW37_spectrum.txt"
#"bonus/2013_00227_X1942_spectrum_f.txt
files = [f"{PATH}{f}" for f in os.listdir(PATH) if f == fe]
#files += [f"data/{f}" for f in os.listdir("data/") ]

for f in files:
    os.system(f"notify-send 'started!' {f}")
    data = autospec.read(autospec.SpectralFile(f))
    data = data._asdict()
    data['frequency'] = data['frequency'][np.where(
        (data['frequency'] > 255.8) & (data['frequency'] < 256.08))]
    data['intensity'] = data['intensity'][np.where(
        (data['frequency'] > 255.8) & (data['frequency'] < 256.08))]
    data2 = autospec.SpectralData(**data)
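    # A 10% two-sided trimmed standard deviation serves as the noise estimate,
    # presumably so strong spikes do not inflate the spike-detection threshold.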
    spikes = autospec.identify_spikes(
        data2, std_method=lambda x: ss.trimboth(x, 0.1).std())
    autospec.plot_spikes(spikes)
    #    import pdb; pdb.set_trace()
    save_path = f.split("/")[-1].replace(".", "_").replace(
        "_txt", "") + "working_example_zoom2"
    molecules = autospec.get_molecules_from_spikes(spikes, f"{save_path}.obj")
    cover = autospec.SetCovering(spikes, molecules, threshold=0.1)
    cover.save_results(save_path)
    os.system("notify-send 'done!'")

#
Ejemplo n.º 25
0
    def test_trimboth(self):
        a = np.arange(20)
        b = stats.trimboth(a, 0.1)
        bm = stats.mstats.trimboth(a, 0.1)
        assert_allclose(np.sort(b), bm.data[~bm.mask])
Ejemplo n.º 26
0
def trimmed_std(a,p=0.05):
    from scipy import stats
    temp = stats.trimboth(a,p/2)
    return np.std(temp)
Ejemplo n.º 27
0
def std(x):
    return ss.trimboth(x, 0.1).std()
Ejemplo n.º 28
0
    def test_trimboth(self):
        a = np.arange(20)
        b = stats.trimboth(a,0.1)
        bm = stats.mstats.trimboth(a,0.1)

        assert(np.all(b == bm.data[~bm.mask]))
Ejemplo n.º 29
0
        if insert == 0 or insert > MAXREADLEN:
            continue

        nread += 1
        if not nread % interval:
            # screen trace
            print('.', end=' ', flush=True)

        lendata.append(insert)
        hist[int(insert)] += 1

        if nread > MAXREADS:
            break
    print()

    tdata = stat.trimboth(lendata, TRIMFRAC)
    descrip = stat.describe(tdata)
    lenmean = descrip.mean
    lensd = np.sqrt(descrip.variance)

    print('\n{} reads read from {}'.format(nread, filename))
    print('\n{} fraction of values trimmed from each end'.format(TRIMFRAC))
    print('trimmed mean: {:.3f}\ttrimmed standard deviation: {:.3f}'.format(
        lenmean, lensd))

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # insert size histogram
    bins = [i for i in range(int(descrip.minmax[1])) if not i % BINSIZE]
    bins.append(int(descrip.minmax[1]))
Ejemplo n.º 30
0
    def test_trimboth(self):
        a = np.arange(20)
        b = stats.trimboth(a, 0.1)
        bm = stats.mstats.trimboth(a, 0.1)
        assert_allclose(b, bm.data[~bm.mask])
Suppose we need to estimate the number of gems in a box. We ask many people for their guesses and record them. By taking a trimmed mean of the guesses and comparing it with the plain mean and the median, we get a result close to the actual count. This is crowd computing (also called social computing).
Sites such as Quora and Wikipedia rely on the same idea of aggregating many individual answers to a question.

from scipy import stats
import matplotlib.pyplot as plt
import statistics
Estimates = []  # fill with the collected guesses (a list of numbers)
Estimates.sort()  # sort the data

#tv=int(0.1*len(Estimates))    # trimming 10% of the data
#Estimates=Estimates[tv:]      # deleting smallest 10% values
#Estimates=Estimates[:len(Estimates)-tv]     # deleting largest 10% values 

for estimate in Estimates:
    print(estimate)
a = stats.trimboth(Estimates, proportiontocut=0.1)
print(a)
m = stats.trim_mean(Estimates, 0.1)  # proportiontocut: cuts 10% of the data from each end
print(m)

# To show the list we plot it along the x axis, with y held at a constant value
y = [5] * len(a)

plt.plot(a, y, 'ro')
plt.plot([statistics.mean(a)], [5], 'bo')
plt.plot([statistics.median(a)], [5], 'go')
# plt.plot([actual_value], [5], 'ro')  # uncomment and set actual_value to the true count

# Comparing the results, the trimmed mean and the median land close to the actual number.
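
As a minimal, self-contained sketch of the idea above (the guesses and the assumed true count of 150 are made up purely for illustration): trimming one value from each tail removes the wild guesses, so the trimmed mean and the median land near the true count while the plain mean does not.

from scipy import stats
import statistics

guesses = [30, 120, 135, 140, 145, 150, 152, 155, 160, 170, 600]  # invented data, two wild outliers

print(statistics.mean(guesses))       # ~177.9, dragged upward by the outliers
print(stats.trim_mean(guesses, 0.1))  # ~147.4, close to the assumed true count of 150
print(statistics.median(guesses))     # 150
print(stats.trimboth(guesses, 0.1))   # the nine guesses that remain after cutting one from each end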