def FitToDelayData(DelayValues, timerange=1000, GenerateImages=True, verbose=0):
    '''
    Histogram the delay values, fit them with ``normfit`` and bootstrap errors.

    The values are histogrammed over (-timerange, timerange), empty bins are
    dropped, and ``normfit`` is run on the bin centres with sqrt(count)
    uncertainties.  The location and scale errors are derived from the upper
    bootstrap confidence bound of the mean / standard deviation of the values
    with |value| < 500, divided by 1.96.  The third error is the third
    diagonal element of the error matrix returned by ``normfit``.

    Parameters
    ----------
    DelayValues : sequence of numbers
        Raw delay values.
    timerange : number
        Half-width of the histogram range.
    GenerateImages : bool
        Accepted for compatibility; not used in this function.
    verbose : int
        Forwarded to ``normfit``.

    Returns
    -------
    tuple
        ``(param, (locerr, scaleerr, p3err))`` where ``param`` is the fit
        parameter triple returned by ``normfit``.
    '''
    # The bin count must be an integer.  Under Python 3 the original
    # ``2 * timerange / 25 + 1`` is a float, which histogram rejects
    # (assuming ``histogram`` is numpy's -- TODO confirm).
    nbins = int(2 * timerange // 25 + 1)
    freq, binedges = histogram(
        DelayValues, bins=nbins, range=(-timerange, timerange))

    # Work with bin centres and keep only populated bins (sqrt(0) errors
    # would otherwise enter the fit).
    centres = 0.5 * (binedges[1:] + binedges[:-1])
    populated = freq > 0
    centres = centres[populated]
    freq = freq[populated]

    (param, err), chival = normfit(
        centres, freq, yerr=sqrt(freq), ScaleGuess=100,
        verbose=verbose)  # fit to CTR peak

    DelayValues = array(DelayValues)
    fRawData = DelayValues[abs(DelayValues) < 500]

    # Only the upper confidence bound is used for each error estimate.
    _, CIUpper = btp.ci(fRawData, std)
    scaleerr = (CIUpper - std(fRawData)) / 1.96
    _, CIUpper = btp.ci(fRawData, mean)
    locerr = (CIUpper - mean(fRawData)) / 1.96

    _, _, p3err = err.diagonal()
    return param, (locerr, scaleerr, p3err)
def ScikitsBootstrap(fdf, loc=0, scale=100, leftsigma=5, rightsigma=5,
                     minsamples=100, verbose=1):
    '''
    Bootstrap the errors on the location and scale of ``fdf.Ampl``.

    The data are first clipped to the open window
    (loc - leftsigma * scale, loc + rightsigma * scale), using the parameters
    of a previous Gaussian fit.  The errors are then taken from the upper
    bootstrap confidence bound of the mean (location) and of the standard
    deviation (scale), divided by 1.96.

    --> This only makes sense if the data really are Gaussian.

    Returns ``(locerr, scaleerr, amperr)``; ``amperr`` is always 0.  If fewer
    than ``minsamples`` values survive the clipping, the sentinel
    ``(1e12, 1e12, 1e12)`` is returned instead.
    '''
    lower_cut = loc - leftsigma * scale
    upper_cut = loc + rightsigma * scale
    clipped = fdf.Ampl[(fdf.Ampl > lower_cut) & (fdf.Ampl < upper_cut)]
    n_kept = len(clipped)

    if verbose > 0:
        print("number of samples", n_kept)

    # Guard: too few points for a meaningful bootstrap.
    if n_kept < minsamples:
        if verbose > 0:
            print("insufficient data")
        return (1e12, 1e12, 1e12)

    _, scale_upper = btp.ci(clipped, std)
    scaleerr = (scale_upper - std(clipped)) / 1.96

    _, loc_upper = btp.ci(clipped, mean)
    locerr = (loc_upper - mean(clipped)) / 1.96

    amperr = 0  # currently ignored
    return (locerr, scaleerr, amperr)
def bootstrap_error(data, n_samples=None):
    """Return a confidence interval for the mean of ``data``.

    The interval is normally computed with ``bootstrap.ci``.  If that raises
    ``ValueError``, a fallback is used: the data are resampled with
    replacement 1000 times, and a 95% normal interval is built around the
    mean of the resampled means using their standard error.

    Parameters
    ----------
    data : array-like
        Sample values.
    n_samples : int or None
        Number of bootstrap resamples passed to ``bootstrap.ci``; when falsy
        the library default is used.

    Returns
    -------
    The interval returned by ``bootstrap.ci``, or the ``(low, high)`` tuple
    from ``stats.norm.interval`` in the fallback case.
    """
    x = np.array(data)
    try:
        # np.mean replaces scipy.mean, which was only a deprecated alias of
        # the NumPy function.
        if n_samples:
            return bootstrap.ci(data, np.mean, n_samples=n_samples)
        return bootstrap.ci(data, np.mean)
    except ValueError:
        n = len(x)
        # ``range`` instead of the Python-2-only ``xrange``.
        resampled_means = [
            np.mean(x[np.random.randint(n, size=n)]) for _ in range(1000)
        ]
        # Conf 0.95, loc = mean of resampled means, scale = their standard
        # error (stats.sem).
        return stats.norm.interval(
            0.95, loc=np.mean(resampled_means),
            scale=stats.sem(resampled_means))
def _test_bootci(n_samples=10000, method='bca'):
    """Print results and timings of ``bootci_nb`` next to scikits.bootstrap.

    A seeded 1000 x 5 normal sample is generated.  ``bootci_nb`` is run once
    with a jitted statistic returning the mean of column 0, the median of
    column 1 and the max of column 2; ``scikits.bootstrap.ci`` is then run
    separately for each of those three statistics.  Both results and both
    elapsed times are printed.
    """
    import scikits.bootstrap as boot
    import time

    np.random.seed(110820)
    dat = np.random.randn(1000, 5)

    @jit(nopython=True)
    def func(d):
        return np.array([np.mean(d[:, 0]),
                         np.median(d[:, 1]),
                         np.max(d[:, 2])])

    st = time.time()
    res = bootci_nb(dat, func, alpha=0.05, n_samples=n_samples, method=method)
    et = time.time() - st
    print(res)
    print('Time: %1.2f sec' % et)

    st = time.time()
    a = boot.ci(dat[:, 0], statfunction=np.mean,
                n_samples=n_samples, method=method)
    b = boot.ci(dat[:, 1], statfunction=np.median,
                n_samples=n_samples, method=method)
    c = boot.ci(dat[:, 2], statfunction=np.max,
                n_samples=n_samples, method=method)
    et = time.time() - st

    print('Mean_0', a)
    print('Median_1', b)
    # The third statistic is np.max; it was previously mislabelled as
    # 'Median_2'.
    print('Max_2', c)
    print('Time: %1.2f sec' % et)
def _test_bootci_pd(n_samples=10000, method='bca'):
    """Print results and timings of ``bootci_pd`` next to scikits.bootstrap.

    A random 100 x 5 DataFrame is generated.  ``bootci_pd`` is run with a
    statistic returning the mean of column 0 ('MeanA') and the median of
    column 1 ('MedianB'); ``scikits.bootstrap.ci`` is then run separately for
    the same two statistics.  Both results and both elapsed times are
    printed.  The random state is not seeded here.
    """
    import scikits.bootstrap as boot
    import time

    df = pd.DataFrame(np.random.randn(100, 5))

    def func(d):
        return {'MeanA': d[0].mean(), 'MedianB': np.median(d[1])}

    # An unused second helper (``func2``) was removed: it was never called.

    st = time.time()
    res = bootci_pd(df, func, alpha=0.05, n_samples=n_samples, method=method)
    et = time.time() - st
    print(res)
    print('Time: %1.2f sec' % et)

    st = time.time()
    a = boot.ci(df[0].values, statfunction=np.mean,
                n_samples=n_samples, method=method)
    b = boot.ci(df[1].values, statfunction=np.median,
                n_samples=n_samples, method=method)
    et = time.time() - st

    print('MeanA', a)
    print('MedianB', b)
    print('Time: %1.2f sec' % et)
def fit_learning_curve(data, length=10, user_length=None, context_answer_limit=100, reverse=False, bootstrap_samples=100):
    """Fit a power-law learning curve and bootstrap a band around it.

    The reference series obtained from ``reference_series`` are resampled by
    ``bootstrap.ci``.  For every resample the per-attempt means are fitted
    with ``a / (attempt + 1) ** k``, and the fitted value at each attempt is
    recorded.  The recorded values are then summarised per attempt.

    Returns
    -------
    list of dict
        One dict per attempt with keys ``'value'`` (median of the fitted
        values), ``'confidence_interval_min'`` (2nd percentile) and
        ``'confidence_interval_max'`` (98th percentile).  An empty list is
        returned if the bootstrap or the summary fails.
    """
    confidence_vals = [[] for _ in range(length)]

    def _learn_fun(attempt, a, k):
        return a * (1.0 / (attempt + 1) ** k)

    def _fit_learning_curve(series):
        # Lists instead of ``map`` objects: the code below needs len() and
        # indexing, which lazy Python 3 maps do not support.
        references_by_attempt = [
            [r for r in references if r is not None]
            for references in zip(*series)
        ]
        means = numpy.array([numpy.mean(xs) for xs in references_by_attempt])
        # Attempts backed by more answers get a smaller sigma, i.e. more
        # weight in the fit.
        sigmas = numpy.array(
            [1.0 / numpy.sqrt(len(xs) + 1) for xs in references_by_attempt])
        n_attempts = len(references_by_attempt)
        opt, _ = curve_fit(
            _learn_fun, numpy.arange(n_attempts), means, sigma=sigmas)
        fit = [_learn_fun(attempt, opt[0], opt[1])
               for attempt in range(n_attempts)]
        # Side channel: collect the whole fitted curve; bootstrap.ci itself
        # only sees the last point.
        for i, r in enumerate(fit):
            confidence_vals[i].append(r)
        return fit[-1]

    def _aggr(rs):
        return {
            'value': numpy.median(rs),
            'confidence_interval_min': numpy.percentile(rs, 2),
            'confidence_interval_max': numpy.percentile(rs, 98),
        }

    series = reference_series(
        data, length=length, user_length=user_length,
        context_answer_limit=context_answer_limit, reverse=reverse)
    try:
        bootstrap.ci(series, _fit_learning_curve, method='pi',
                     n_samples=bootstrap_samples)
        return [_aggr(rs) for rs in confidence_vals]
    except Exception:
        # Best effort is kept (no curve on failure), but KeyboardInterrupt
        # and SystemExit are no longer swallowed as with a bare ``except:``.
        return []
def test_bca_errorbar_output_simple(self):
    """'errorbar' output must equal |average - bound| of the default output."""
    seed = 1234567890

    np.random.seed(seed)
    lowhigh = boot.ci(self.data)

    # Re-seed so that both calls draw the same resamples.
    np.random.seed(seed)
    errorbars = boot.ci(self.data, output='errorbar')

    expected = abs(np.average(self.data) - lowhigh)[np.newaxis]
    np.testing.assert_array_almost_equal(errorbars.T, expected)
def ScikitsBootstrap(fdf):
    """Bootstrap the errors on the mean and standard deviation of ``fdf.counts``.

    Each error is the distance from the sample statistic to the upper
    bootstrap confidence bound, divided by 1.96.

    Returns ``(locerr, scaleerr, amperr)``; ``amperr`` is always 0.
    """
    _, std_upper = btp.ci(fdf.counts, std)
    scaleerr = (std_upper - std(fdf.counts)) / 1.96

    _, mean_upper = btp.ci(fdf.counts, mean)
    locerr = (mean_upper - mean(fdf.counts)) / 1.96

    amperr = 0  # currently ignored
    return (locerr, scaleerr, amperr)
def test_pi_multi_2dout_multialpha(self):
    """Tuple input with a multi-output statistic must match per-output runs.

    ``stats.linregress`` is bootstrapped on ``(x, y)`` directly; columns 0
    and 1 of that result are compared with separate runs on the stacked data
    that return only element 0 or element 1 of the regression result.
    """
    seed = 1234567890
    options = dict(alpha=(0.1, 0.2, 0.8, 0.9), n_samples=2000, method='pi')

    np.random.seed(seed)
    combined = boot.ci((self.x, self.y), stats.linregress, **options)

    np.random.seed(seed)
    first_output = boot.ci(np.vstack((self.x, self.y)).T,
                           lambda a: stats.linregress(a)[0], **options)

    np.random.seed(seed)
    second_output = boot.ci(np.vstack((self.x, self.y)).T,
                            lambda a: stats.linregress(a)[1], **options)

    np.testing.assert_array_almost_equal(combined[:, 0], first_output)
    np.testing.assert_array_almost_equal(combined[:, 1], second_output)
def print_switches_per_model(self, models=('early', 'enes', 'esen'), l2_epoch=25, print_total=True, n_sample=10000, print_per_lang=True, switch_type=('alternational', 'insertional', 'ambiguous')):
    """Print mean code-switch percentages with bootstrap CIs for each model.

    For every model the performance CSV is read and restricted to one epoch:
    ``l2_epoch`` for the 'early' model, otherwise the largest epoch in the
    file.  With ``print_total`` the summed percentage and each entry of
    ``switch_type`` are reported; with ``print_per_lang`` the same is done
    for every language in ``self.languages`` using the per-language CSV.
    Each report line shows the mean and the ``boot.ci`` bounds, all rounded
    to one decimal.
    """
    def load_selected_epoch(model, csv_name):
        # Read one results file and keep only the rows of the epoch of
        # interest for this model.
        frame = pd.read_csv(
            f'{self.results_dir}/{model}{self.fname_suffix}/{csv_name}',
            index_col=None, header=0, skipinitialspace=True,
            dtype={'epoch': int})
        wanted = l2_epoch if model == 'early' else frame.epoch.max()
        return frame[frame.epoch == wanted]

    def summed_percentages(frame):
        # The total always adds these three columns, independently of
        # ``switch_type``.
        return (frame['alternational_percentage']
                + frame['insertional_percentage']
                + frame['ambiguous_percentage'])

    def report(label, values):
        low, high = boot.ci(values, n_samples=n_sample)
        print(label, round(values.mean(), 1), 'CI:',
              round(low, 1), round(high, 1))

    for m in models:
        print(m)

        if print_total:
            df = load_selected_epoch(m, 'performance.csv')
            print(df.epoch.max())
            report('TOTAL', summed_percentages(df))
            for stype in switch_type:
                report(stype, df[f'{stype}_percentage'])

        if print_per_lang:
            df = load_selected_epoch(m, 'performance_per_lang.csv')
            for lang in self.languages:
                print('per lang:', lang)
                df_lang = df[df.switch_from == lang]
                report('TOTAL per lang', summed_percentages(df_lang))
                for stype in switch_type:
                    report(stype, df_lang[f'{stype}_percentage'])
def test_pi_multi_2dout_multialpha(self):
    """Tuple input with a two-output statistic must match per-output runs.

    A degree-1 ``np.polyfit`` is bootstrapped on ``(x, y)``; columns 0 and 1
    of that result are compared with separate runs on the stacked data that
    return only coefficient 0 or coefficient 1.
    """
    seed = 1234567890
    options = dict(alpha=(0.1, 0.2, 0.8, 0.9), n_samples=2000, method='pi')

    np.random.seed(seed)
    combined = boot.ci((self.x, self.y),
                       lambda a, b: np.polyfit(a, b, 1), **options)

    np.random.seed(seed)
    coef0 = boot.ci(np.vstack((self.x, self.y)).T,
                    lambda a: np.polyfit(a[:, 0], a[:, 1], 1)[0], **options)

    np.random.seed(seed)
    coef1 = boot.ci(np.vstack((self.x, self.y)).T,
                    lambda a: np.polyfit(a[:, 0], a[:, 1], 1)[1], **options)

    np.testing.assert_array_almost_equal(combined[:, 0], coef0)
    np.testing.assert_array_almost_equal(combined[:, 1], coef1)
def test_bca_multi_multialpha(self):
    """Tuple input and stacked input must give the same intervals.

    The statistic is element 1 of ``stats.linregress``; both calls use the
    same seed and the default bootstrap method.
    """
    seed = 1234567890
    options = dict(alpha=(0.1, 0.2, 0.8, 0.9), n_samples=1000)

    np.random.seed(seed)
    from_tuple = boot.ci((self.x, self.y),
                         lambda a, b: stats.linregress(a, b)[1], **options)

    np.random.seed(seed)
    from_stacked = boot.ci(np.vstack((self.x, self.y)).T,
                           lambda a: stats.linregress(a)[1], **options)

    np.testing.assert_array_almost_equal(from_tuple, from_stacked)
def test_bca_multi_multialpha(self):
    """Tuple input and stacked input must give the same intervals.

    The statistic is a degree-1 ``np.polyfit``; both calls use the same seed
    and the default bootstrap method.
    """
    seed = 1234567890
    options = dict(alpha=(0.1, 0.2, 0.8, 0.9), n_samples=1000)

    np.random.seed(seed)
    from_tuple = boot.ci((self.x, self.y),
                         lambda a, b: np.polyfit(a, b, 1), **options)

    np.random.seed(seed)
    from_stacked = boot.ci(np.vstack((self.x, self.y)).T,
                           lambda a: np.polyfit(a[:, 0], a[:, 1], 1),
                           **options)

    np.testing.assert_array_almost_equal(from_tuple, from_stacked)
def esci_indep_cohens_d(data1, data2, n_boot=5000, has_preds=False):
    '''Compute Cohen's d and a bootstrap confidence interval for it.

    Parameters
    ----------
    data1 : np.ndarray
        Values for the "high" group (for example diagnosed participants).
    data2 : np.ndarray
        Values for the "low" group (for example healthy controls).
    n_boot : int
        Number of bootstrap resamples.
    has_preds : bool
        Whether predictors are included in the data.  If so, the first
        column of ``data1`` / ``data2`` holds the dependent values and the
        remaining columns are regression predictors; the last predictor is
        the one of interest (group membership), the others are treated as
        confounds.

    Returns
    -------
    stats : dict
        * ``stats['es']`` - effect size.
        * ``stats['ci']`` - confidence interval for the effect size.
        * ``stats['bootstraps']`` - bootstrap effect size values.
    '''
    if has_preds:
        from borsar.stats import compute_regression_t
        import scikits.bootstrap as boot

        def regression_Cohens_d(data1, data2):
            # Stack both groups, regress column 0 on the remaining columns
            # and convert the t value of the last predictor into d.
            stacked = np.concatenate([data1, data2], axis=0)
            preds = stacked[:, 1:]
            tvals = compute_regression_t(stacked[:, [0]], preds)
            return d_from_t_categorical(tvals[-1, 0], preds)

        cohen_d = regression_Cohens_d(data1, data2)
        cohen_d_ci, bootstraps = boot.ci(
            (data1, data2), regression_Cohens_d, multi='independent',
            n_samples=n_boot, return_dist=True)
    else:
        assert data2 is not None
        import dabest

        frame = utils.psd_to_df(data1, data2)
        loaded = dabest.load(frame, idx=("controls", "diagnosed"),
                             x="group", y="FAA", resamples=n_boot)
        results = loaded.cohens_d.results
        cohen_d = results.difference.values[0]
        cohen_d_ci = (results.bca_low.values[0], results.bca_high.values[0])
        bootstraps = results.bootstraps[0]

    return dict(es=cohen_d, ci=cohen_d_ci, bootstraps=bootstraps)
def rci_boot(x, y, alpha=0.95, verbose=True, n_samples=10000, method='bca'):
    """
    Bootstrap a confidence interval for the correlation between 'x' and 'y'.

    The statistic is ``rcoeff`` evaluated on resampled (x, y) pairs.  It is
    helpful to compare the bootstrapped confidence intervals with those from
    the more standard Fisher transformation method, as suggested by
    Cox (2008).

    Parameters
    ----------
    x, y : array-like
        The two series.
    alpha : float
        Confidence level; ``1 - alpha`` is what gets passed to
        ``bootstrap.ci`` as its ``alpha``.
    verbose : bool
        Print the interval when true.
    n_samples : int
        Number of bootstrap resamples.
    method : str
        Bootstrap method name forwarded to ``bootstrap.ci``.

    Returns
    -------
    (xl, xu) : lower and upper bound of the interval.

    References
    ----------
    Cox (2008): Speaking Stata: Correlation with confidence, or Fisher's z
    revisited. The Stata Journal (2008) 8, Number 3, pp. 413-439.
    """
    x, y = map(np.asanyarray, (x, y))

    ## Bootstrapped confidence intervals.
    xl, xu = bootstrap.ci((x, y), statfunction=rcoeff, alpha=(1 - alpha),
                          n_samples=n_samples, method=method, multi=True)

    if verbose:
        # print() calls replace the Python 2 print statements; the emitted
        # text is unchanged.
        print("")
        print("Bootstrapped CI (xl,xu): (%.3f,%.3f)" % (xl, xu))
        print("")

    return (xl, xu)
def forcedChoicePlot(listenerAccuracies, listenerScores, mturkAccuracies, mturkScores, outFile, title, errorBars=False):
    """Plot listener accuracy against training iteration, one line per level.

    listenerAccuracies is an array of accuracy arrays, one per problem level.
    listenerScores holds, per level, one score collection per iteration; it
    is only read when ``errorBars`` is true.
    mturkAccuracies / mturkScores are accepted but not used in this function.
    The figure is saved to ``outFile`` as PDF and then shown.
    """
    matplotlib.rcParams.update({'font.size' : 20})
    lw = 4
    # plt.hold(True) was dropped: holding is the default behaviour and the
    # function no longer exists in Matplotlib 3.
    nListeners = len(listenerAccuracies)
    nIterations = len(listenerAccuracies[0]) - 1
    plt.axis([0, nIterations, 0, 1])
    plt.ylabel('Listener Accuracy')
    plt.xlabel('Training Iterations')

    for levelAccuracies, levelScores, lineColor in zip(listenerAccuracies, listenerScores, colors):
        if errorBars:
            yerrs = []
            for scores in levelScores:
                if np.array(scores).all():
                    # Every score is non-zero: no error bar is drawn and the
                    # bootstrap is skipped.
                    yerrs.append(0)
                else:
                    interval = boot.ci(np.array(scores), np.average)
                    # Half the interval width is used as a symmetric error.
                    yerrs.append((interval[1] - interval[0]) / 2.0)
            plt.errorbar(range(len(levelAccuracies)), levelAccuracies,
                         yerr=yerrs, linewidth=lw, color=lineColor)
            # print() calls replace the Python 2 print statements.
            print(lineColor)
            print(levelAccuracies)
        else:
            plt.plot(levelAccuracies, linewidth=lw, marker='o',
                     color=lineColor)

    listenerTitles = ['Level %d' % level for level in range(nListeners)]
    plt.legend(listenerTitles, loc='lower right')
    plt.title(title)
    plt.savefig(outFile, format='pdf')
    plt.show()
def stats_per_group(x):
    """Summarise the per-'sid' means of the ``value`` column.

    ``x`` is grouped by 'sid' and averaged; the ``value`` column of the
    result is then summarised.

    Returns
    -------
    pd.Series with entries, in this order:
        'median', 'qtile' (upper quartile, lower quartile), 'mean',
        'whisk' (lower, upper whisker using the 1.5 * IQR rule),
        'err' (absolute distance of each whisker from the median) and
        'ci' (``bootstrap.ci`` with ``BOOTSTRAP_NUM`` resamples).
    """
    # print() calls replace the Python 2 print statements.
    print('stats-per-group')
    x = x.groupby(['sid']).mean()
    x = x.value
    print(len(x))

    medians = np.median(x)
    average = np.average(x)
    lower_quartile, upper_quartile = np.percentile(x, [25, 75])

    iqr = upper_quartile - lower_quartile
    upper_whisker = x[x <= upper_quartile + 1.5 * iqr].max()
    lower_whisker = x[x >= lower_quartile - 1.5 * iqr].min()

    # Key order is kept as before because it becomes the index order of the
    # returned Series.
    res = {
        'median': medians,
        # NOTE(review): stored as (upper, lower), the reverse of 'whisk';
        # kept as-is because callers may rely on it -- confirm.
        'qtile': (upper_quartile, lower_quartile),
        'mean': average,
        'whisk': (lower_whisker, upper_whisker),
        'err': (np.abs(lower_whisker - medians),
                np.abs(upper_whisker - medians)),
        'ci': bootstrap.ci(x, n_samples=BOOTSTRAP_NUM),
    }
    return pd.Series(res)
def totalNspks(self):
    """
    Compare total nosepokes between the no-inhibition and inhibition sessions.

    Reads ``self.datadict['totalNspksControl']['NoInhib']`` and
    ``self.datadict['totalNspksInhibited']['Inhibited']``.

    Return dictionary with, for each session, the mean, the SEM and a
    bootstrapped confidence interval of the mean, plus the result of a
    paired t-test between the two sessions under key 'p'.
    """
    # numpy.mean replaces scipy.mean, which was only a deprecated alias of
    # the NumPy function.
    import numpy as np

    control = self.datadict['totalNspksControl']['NoInhib']
    inhibited = self.datadict['totalNspksInhibited']['Inhibited']

    totalNspks = {}
    totalNspks['controlMean'] = control.mean()
    totalNspks['controlSEM'] = control.sem()
    totalNspks['controlCI'] = bootstrap.ci(data=control, statfunction=np.mean)
    totalNspks['inhibMean'] = inhibited.mean()
    totalNspks['inhibSEM'] = inhibited.sem()
    totalNspks['inhibCI'] = bootstrap.ci(data=inhibited, statfunction=np.mean)
    # The full ttest_rel result (not only the p-value) is stored under 'p'.
    totalNspks['p'] = scipy.stats.ttest_rel(control, inhibited)
    return totalNspks
def meanNspksInhib(self):
    """
    Compare mean nosepokes in laser versus simlaser in the inhibition session.

    Reads the 'simLaser' and 'Laser' entries of
    ``self.datadict['meanNspksInhibited']``.

    Return dictionary with, for each condition, the mean, the SEM and a
    bootstrapped confidence interval of the mean, plus the result of a
    paired t-test between the two conditions under key 'p'.
    """
    # numpy.mean replaces scipy.mean, which was only a deprecated alias of
    # the NumPy function.
    import numpy as np

    sim = self.datadict['meanNspksInhibited']['simLaser']
    laser = self.datadict['meanNspksInhibited']['Laser']

    meanNspksInhib = {}
    meanNspksInhib['simMean'] = sim.mean()
    meanNspksInhib['simSEM'] = sim.sem()
    meanNspksInhib['simCI'] = bootstrap.ci(data=sim, statfunction=np.mean)
    meanNspksInhib['laserMean'] = laser.mean()
    meanNspksInhib['laserSEM'] = laser.sem()
    meanNspksInhib['laserCI'] = bootstrap.ci(data=laser, statfunction=np.mean)
    # The full ttest_rel result (not only the p-value) is stored under 'p'.
    meanNspksInhib['p'] = scipy.stats.ttest_rel(sim, laser)
    return meanNspksInhib
def run_for_all(self, bound_response, sfs, fff, blank, n_samples=500):
    """Bootstrap ``BootFit.stat_for_all`` over ``bound_response``.

    A ``BootFit`` helper is configured with ``sfs`` and with the ``mean``
    attribute of ``fff`` / ``blank`` (``None`` when they are falsy).  On
    success the interval from ``bootstrap.ci`` is stored in
    ``self.interval``.  Whether or not the bootstrap succeeds,
    ``self.mean`` and ``self.std`` are set from the first ``n_samples``
    values of ``bf.rvs``; a bootstrap error is written to stderr instead of
    being raised.

    Returns ``self``.
    """
    bf = BootFit()
    bf.sfs = sfs
    bf.fff = fff.mean if fff else None
    bf.blank = blank.mean if blank else None
    self.n_samples = n_samples

    # print() calls replace the Python 2 print statements.
    print('Performing {} samples... '.format(self.n_samples))
    try:
        self.interval = bootstrap.ci(
            data=bound_response,
            statfunction=bf.stat_for_all,
            n_samples=self.n_samples
        )
    except Exception as e:
        # Best effort: report the failure and still summarise whatever
        # values were collected.
        sys.stderr.write(str(e))
        sys.stderr.flush()

    # The summary was previously duplicated in both the try and the except
    # branch; it now runs once for either outcome.
    stats = bf.rvs[:self.n_samples]
    self.mean, self.std = np.nanmean(stats), np.nanstd(stats)

    print('{} unique preferred SF were made.'.format(len(set(stats))))
    return self
def confidence_intervals(system_scores, baseline_scores, gold_scores):
    """
    Bootstrap a BCa confidence interval for the gain of a system over a
    baseline, measured as the difference of their Pearson correlations with
    the gold scores.

    :param system_scores: list of system's scores
    :param baseline_scores: list of baseline method's scores
    :param gold_scores: list of gold scores
    :return: dict with keys 'system' and 'baseline' (Pearson correlations),
        'delta' (system minus baseline) and 'conf_int' (interval as a list)
    """
    def correlation_gain(rows):
        # rows: column 0 = gold, column 1 = system, column 2 = baseline
        gold = rows[:, 0]
        r_system = pearsonr(gold, rows[:, 1])[0]
        r_baseline = pearsonr(gold, rows[:, 2])[0]
        return r_system - r_baseline

    system_prs = pearsonr(gold_scores, system_scores)[0]
    baseline_prs = pearsonr(gold_scores, baseline_scores)[0]

    # One (gold, system, baseline) triple per item, so that items are
    # resampled together.
    triples = list(zip(gold_scores, system_scores, baseline_scores))
    interval = bootstrap.ci(triples, statfunction=correlation_gain,
                            method='bca')

    result = {}
    result['system'] = system_prs
    result['baseline'] = baseline_prs
    result['delta'] = system_prs - baseline_prs
    result['conf_int'] = list(interval)
    return result
def _metrics_stats_fn(preds_and_labels):
    """Return the mean of the metric values and a bootstrap CI of that mean.

    ``metric_fn`` and ``ci`` are taken from the surrounding scope; ``ci`` is
    read as a percentage (the bootstrap alpha is ``1 - ci / 100``).
    """
    # Metric values as produced by metric_fn on the predictions and labels.
    values = np.asarray(metric_fn(preds_and_labels))
    average = np.mean(values)
    alpha = 1.0 - ci / 100.0
    interval = boot.ci(values, np.mean, alpha=alpha)
    return average, interval
def flag_outlier(in_vec, thresh_percentage=95):
    """
    Flag the most extreme element if it lies outside a bootstrap CI.

    The candidate is the element with the largest squared distance from the
    mean of all other elements.  It is flagged when it falls outside the
    bootstrap confidence interval of the mean of the whole vector.

    :param in_vec: sequence of numbers
    :param thresh_percentage: percent confidence level of the interval
    :return: index of the outlier, or None if the candidate is inside the
        interval
    """
    in_vec = np.array(in_vec)

    # find largest outlier (leave-one-out residual)
    outlier_ind = 0
    largest_resid = 0
    mask = np.ones(len(in_vec), dtype=bool)
    # ``range`` instead of the Python-2-only ``xrange``.
    for i in range(in_vec.shape[0]):
        mask[i] = False
        resid = (in_vec[i] - np.mean(in_vec[mask])) ** 2
        if resid > largest_resid:
            outlier_ind = i
            largest_resid = resid
        mask[i] = True

    # check whether the candidate is outside the bootstrapped interval
    a_lvl = 1 - (thresh_percentage / 100.)
    CIs = bootstrap.ci(data=in_vec, statfunction=mean, alpha=a_lvl)
    candidate = in_vec[outlier_ind]
    if candidate < CIs[0] or candidate > CIs[1]:
        return outlier_ind
    return None
def rci_boot(x, y, alpha=0.95, verbose=True, n_samples=10000, method='bca'):
    """
    Bootstrap a confidence interval for the correlation between 'x' and 'y'.

    The statistic is ``rcoeff`` evaluated on resampled (x, y) pairs;
    ``alpha`` is the confidence level, so ``1 - alpha`` is what gets passed
    to ``bootstrap.ci``.  Comparing the result with the interval from the
    more standard Fisher transformation is useful, as suggested by
    Cox (2008).

    Returns
    -------
    (xl, xu) : lower and upper bound of the interval.

    References
    ----------
    Cox (2008): Speaking Stata: Correlation with confidence, or Fisher's z
    revisited. The Stata Journal (2008) 8, Number 3, pp. 413-439.
    """
    x = np.asanyarray(x)
    y = np.asanyarray(y)

    ## Bootstrapped confidence intervals.
    interval = bootstrap.ci(
        (x, y),
        statfunction=rcoeff,
        alpha=(1 - alpha),
        n_samples=n_samples,
        method=method,
        multi=True,
    )
    xl, xu = interval

    if verbose:
        print("")
        print("Bootstrapped CI (xl,xu): (%.3f,%.3f)" % (xl, xu))
        print("")

    return (xl, xu)
def plot_serial(all_s, color, label=None, xk=None, nan=False):
    """Plot the column-wise mean of ``all_s`` with a bootstrap band.

    The band is a percentile bootstrap interval (alpha = 1 - 0.68) of the
    mean of each column of ``all_s``.  Curve and band are converted with
    ``degrees`` before plotting.  ``xk`` supplies the x values; when it is
    None the module-level ``xxx2`` is used.  With ``nan`` true, ``nanmean``
    replaces ``np.mean``.  The x-axis label depends on the module-level flag
    ``type_ori``.
    """
    mean = nanmean if nan else np.mean
    xx = xxx2 if xk is None else xk

    stderr = array([
        ci(column, statfunction=mean, alpha=1 - 0.68, method="pi")
        for column in (all_s).T
    ])

    # The label keyword is only passed when a (truthy) label was given.
    band_options = dict(color=color, alpha=0.2)
    if label:
        band_options['label'] = label
    fill_between(xx, degrees(stderr[:, 0]), degrees(stderr[:, 1]),
                 **band_options)

    plot(xx, degrees(mean(all_s, 0)), color=color)
    plot(xx, zeros(len(xx)), "k--", alpha=0.5)  # zero reference line

    if type_ori:
        xlabel(r"relative orientation of previous trial ($^\circ$)")
    else:
        xlabel(r"relative color of previous trial ($^\circ$)")
    ylabel(r"error on current trial ($^\circ$)")

    sns.despine()
    ylim(-2, 3)
def bootstrapCI(data, statFunc=None, alpha=0.05, nPerms=10000, output='lowhigh', method='pi'):
    """Wrapper around a function in the scikits_bootstrap module:
    https://pypi.python.org/pypi/scikits.bootstrap

    Parameters
    ----------
    data : np.ndarray
        Data for computing the confidence interval.
    statFunc : function
        Should take data and operate along axis=0.
        Defaults to ``np.nanmean`` along axis 0.
    alpha : float
        Returns the [alpha/2, 1-alpha/2] percentile confidence intervals.
    nPerms : int
        Number of bootstrap resamples.
    output : str
        Use 'lowhigh' or 'errorbar', for matplotlib errorbars
    method : str
        Bootstrap method name forwarded to ``ci``.

    Returns
    -------
    np.ndarray
        First axis of length 2.  For 'lowhigh' it holds the lower and upper
        bounds; for 'errorbar' it holds the non-negative distances
        ``stat - low`` and ``high - stat``.  If ``ci`` raises IndexError the
        array is filled with NaN and shaped like ``data`` with its first
        axis replaced by 2.
    """
    if statFunc is None:
        statFunc = partial(np.nanmean, axis=0)

    try:
        # Always request low/high bounds; the error-bar form is derived
        # below.
        out = ci(data=data, statfunction=statFunc, alpha=alpha,
                 n_samples=nPerms, output='lowhigh', method=method)
    except IndexError:
        shp = list(data.shape)
        shp[0] = 2
        out = np.nan * np.ones(shp)

    if output == 'errorbar':
        mu = statFunc(data)
        # Matplotlib expects non-negative lower/upper error lengths.  The
        # previous code produced ``low - mu`` and ``mu - high``, both of
        # which are negative.  Indexing with ``out[0]`` rather than
        # ``out[0, :]`` also works when the statistic is a scalar and
        # ``out`` is 1-D.
        out[0] = mu - out[0]
        out[1] = out[1] - mu
    return out
def plot(data_arr, data_shuf_arr, data_err_arr, file_desc='sample'):
    """Plot mean classification accuracy with bootstrap bands and save a PDF.

    For each of the three input arrays the mean over axis 1 is drawn as a
    line (red: data, black: shuffled, blue: error) and the bootstrap CI of
    the mean of rows 0-4 is drawn as a shaded band.  The figure is written
    to ``4su_stp_decoding_<file_desc>.pdf``.

    Returns the figure handle.
    """
    font_settings = (
        ('pdf.fonttype', 42),
        ('ps.fonttype', 42),
        ('font.family', 'sans-serif'),
        ('font.sans-serif', ['Arial']),
    )
    for key, value in font_settings:
        rcParams[key] = value

    series = (
        (data_arr, 'r'),
        (data_shuf_arr, 'k'),
        (data_err_arr, 'b'),
    )

    means = [np.mean(arr, axis=1) for arr, _ in series]
    # All bootstrap intervals are computed up front, array by array, before
    # any drawing happens.
    bands = [
        [boot.ci(arr[b, :], np.mean, n_samples=1000) for b in range(5)]
        for arr, _ in series
    ]

    (fh, ax) = plt.subplots(1, 1, figsize=(5 / 2.54, 5 / 2.54), dpi=300)
    x_pos = np.arange(1, 6) + 0.5

    for (_, colour), band in zip(series, bands):
        ax.fill_between(x_pos,
                        [bounds[0] for bounds in band],
                        [bounds[1] for bounds in band],
                        color=colour, alpha=0.2)
    for (_, colour), curve in zip(series, means):
        ax.plot(x_pos, curve, '-' + colour)

    ax.set_xlim((1, 6))
    ax.set_ylim((0.3, 0.8))
    ax.set_yticks((0.4, 0.6, 0.8))
    ax.set_xticks((1, 6))
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Classification accuracy')
    fh.savefig('4su_stp_decoding_{}.pdf'.format(file_desc),
               bbox_inches='tight')
    return fh
def test_bca_n_samples(self):
    """Seeded default-method run with 500 resamples matches stored values."""
    expected = np.array([0.40027628, 0.5063184, 0.94082515, 1.05653929])
    np.random.seed(1234567890)
    observed = boot.ci(self.data, np.average,
                       alpha=(0.1, 0.2, 0.8, 0.9), n_samples=500)
    np.testing.assert_array_almost_equal(observed, expected)
def test_abc_multialpha_unified(self):
    """'abc' method with a weighted-average statistic matches stored values."""
    def weighted_average(x, weights):
        return np.average(x, weights=weights)

    expected = np.array([0.39472915, 0.51161304, 0.93789723, 1.04407254])
    observed = boot.ci(self.data, weighted_average,
                       alpha=(0.1, 0.2, 0.8, 0.9), method='abc')
    np.testing.assert_array_almost_equal(observed, expected)
def calc_bootstrap(data):
    """Bootstrap a confidence interval for the mean of ``data``.

    The interval is printed and returned as produced by ``bootstrap.ci``
    with its default settings.
    """
    # numpy.mean replaces sp.mean, which was only a deprecated alias of the
    # NumPy function in the SciPy namespace.
    import numpy as np

    # Calculate the bootstrap
    CIs = bootstrap.ci(data=data, statfunction=np.mean)

    # Print the data: the "*" unpacks the array CIs into format arguments
    print('The conficence intervals for the mean are: {0} - {1}'.format(*CIs))
    return CIs
def get_ci(data, ci):
    """Return a bootstrap interval for ``print_class`` over ``data``.

    ``ci`` is passed to ``bootstrap.ci`` as its ``alpha``; only 10 resamples
    are drawn.  If the bootstrap fails for any reason, the placeholder
    interval ``[-1.0, 1.0]`` is returned.
    """
    try:
        return bootstrap.ci(data=data, alpha=ci, statfunction=print_class,
                            n_samples=10)
    except Exception:
        # The best-effort fallback is kept, but KeyboardInterrupt and
        # SystemExit now propagate instead of being swallowed by a bare
        # ``except:``.
        return [-1.0, 1.0]
def summSC(scfile):
    """Summarise and plot the per-label absolute error of predicted cover.

    Builds a table of observed vs. predicted labels per image, converts the
    counts to percentages per sample unit and method, maps the labels to
    ``tier3_name`` via ``lsmap``, and plots the mean absolute difference per
    label as a bar chart.

    NOTE(review): ``scfile`` is not used in this body, and ``imlist``,
    ``gtlist``, ``estlist``, ``labelset``, ``sunits`` and ``lsmap`` are not
    defined here -- presumably module-level or lost when this snippet was
    extracted; confirm.
    """
    # Create merged and summarised dataframe.
    # ``pd.DataFrame.from(...)`` was a syntax error (``from`` is a keyword);
    # the plain constructor builds the same three-row frame, which is then
    # transposed.  ``list(map(...))`` keeps this working with lazy maps.
    df = pd.DataFrame([
        list(map(osp.basename, imlist)),
        [labelset[i] for i in gtlist],
        [labelset[i] for i in estlist],
    ]).T
    df.columns = ['image', 'observed', 'predicted']
    df = df.merge(sunits, left_on='image', right_on='image', how='left')
    df = pd.melt(df, id_vars=['sampleunit', 'image'],
                 value_vars=['observed', 'predicted'],
                 var_name='method', value_name='label')
    df = df.groupby(['sampleunit', 'method', 'label']).size().reset_index(name='count')
    df = df.groupby(['sampleunit', 'method', 'label']).agg({'count': 'sum'})
    # Counts -> percentage within each (sampleunit, method).
    df = df.groupby(level=['sampleunit', 'method']).apply(
        lambda x: 100 * x / float(x.sum())).reset_index()
    df = df.merge(lsmap, on='label', how='left')
    # Plain sum instead of the dict-renaming ``agg({'count': np.sum})``,
    # which pandas >= 1.0 rejects on a column selection.
    df = df.groupby(['sampleunit', 'method', 'tier3_name'])['count'].sum().reset_index()
    df = df.rename(index=str, columns={"tier3_name": "label"})
    df = df.pivot_table(index=['sampleunit', 'label'], columns='method',
                        values='count').reset_index().fillna(value=0)
    df['error'] = abs(df['observed'] - df['predicted'])
    # (name, function) pairs replace the dict-renaming form; np.mean
    # replaces the deprecated scipy.mean alias.
    df = df.groupby('label')['error'].agg([
        ('mean', 'mean'),
        ('std', 'std'),
        ('cilow', lambda x: bootstrap.ci(x, statfunction=np.mean)[0]),
        ('cimax', lambda x: bootstrap.ci(x, statfunction=np.mean)[1]),
    ]).reset_index()

    # Plot Mean Absolute Error as the absolute difference between machine
    # predictions and manual observations from test images.
    # NOTE(review): ``yerr`` receives the absolute CI bounds rather than
    # distances from the mean, and 'cilow' / 'cimax' come from two separate
    # bootstrap runs -- verify this is intended.
    cierror = [df['cilow'], df['cimax']]
    plot = df.plot(kind='bar', y='mean', x='label', yerr=cierror,
                   color='DarkGreen', edgecolor='black', grid=False,
                   figsize=(8, 2), position=0.45,
                   error_kw=dict(ecolor='black', elinewidth=0.5),
                   width=0.8, legend=False, rot=90, fontsize=9)
    plot.set_xlabel('Labels', fontsize=12)
    plot.set_ylabel('Mean Absolute Error (%)', fontsize=12)
    # NOTE(review): 'labelcenter' is passed as the ``which`` argument here;
    # left unchanged -- confirm what was intended.
    plot.xaxis.set_tick_params('labelcenter')
def test_pi_multialpha(self):
    """Seeded percentile-interval run with four alphas matches stored values."""
    expected = np.array([0.40351601, 0.51723236, 0.94547054, 1.05749207])
    np.random.seed(1234567890)
    observed = boot.ci(self.data, np.average, method='pi',
                       alpha=(0.1, 0.2, 0.8, 0.9))
    np.testing.assert_array_almost_equal(observed, expected)
def quick_ci(g, x, fun=None, alpha=0.05, n=200):
    """Return a one-row DataFrame with the mean of ``g`` and a bootstrap CI.

    Parameters
    ----------
    g : group object with ``.name`` and ``.mean()``
        Values of one group (presumably a pandas Series from a groupby --
        TODO confirm).
    x : hashable
        Column name under which the group name is stored.
    fun : callable, optional
        Statistic to bootstrap.  Defaults to ``numpy.mean``.
    alpha : float
        Forwarded to ``bootstrap.ci``.
    n : int
        Number of bootstrap resamples.

    Returns
    -------
    pandas.DataFrame indexed by ``g.name`` with columns ``x``, 'mean', 'low'
    and 'high'.
    """
    import warnings

    if fun is None:
        # The former default ``scipy.mean`` was evaluated when the function
        # was defined and was only a deprecated alias of numpy.mean.
        import numpy
        fun = numpy.mean

    # NOTE: this installs a process-wide filter for InstabilityWarning.
    warnings.simplefilter("ignore", bootstrap.InstabilityWarning)
    low, high = bootstrap.ci(data=g, statfunction=fun, alpha=alpha,
                             n_samples=n)
    return pandas.DataFrame({
        x: g.name,
        'mean': g.mean(),
        'low': low,
        'high': high
    }, index=[g.name])
def esci_regression_r(x, y, n_boot=5000):
    '''Compute Pearson's r effect size and a bootstrap confidence interval.

    Parameters
    ----------
    x : np.ndarray
        Predictors - one or two-dimensional.  For two-dimensional input the
        last column is the predictor of interest and the others are treated
        as confounds.
    y : np.ndarray
        Dependent variable, one-dimensional.
    n_boot : int
        Number of bootstrap resamples.

    Returns
    -------
    stats : dict
        * ``stats['es']`` - effect size.
        * ``stats['ci']`` - confidence interval for the effect size.
        * ``stats['bootstraps']`` - bootstrap effect size values.
    '''
    from scipy.stats import pearsonr
    import scikits.bootstrap as boot

    if x.ndim != 1:
        from borsar.stats import compute_regression_t

        # With confounds: take the regression t value of the last predictor
        # and convert it to r.
        def corr(x, y):
            tvals = compute_regression_t(y[:, np.newaxis], x)
            return r_from_t(tvals[-1, 0], x)
    else:
        # Plain correlation.
        def corr(x, y):
            return pearsonr(x, y)[0]

    effect_size = corr(x, y)

    # ``return_dist`` is, per the original author's note, only available on
    # their branch of scikits-bootstrap.
    interval, bootstraps = boot.ci((x, y), corr, multi=True,
                                   n_samples=n_boot, return_dist=True)

    stats = dict()
    stats.update(bootstraps=bootstraps)
    stats.update(es=effect_size, ci=interval)
    return stats
def bootstrap(self):
    """
    Bootstrap the f1 measure over the algorithm results and print the
    interval.

    The (key, value) pairs of ``self.algorithm_results`` are resampled
    10000 times with ``self.f1_bootstrap`` as the statistic.  A narrow
    interval is more indicative of a sufficient sample size.

    :return: None; the algorithm name and the interval bounds are printed.
    """
    # ``bootstrap`` below refers to the module-level name, not this method
    # (assumes this def lives inside a class -- TODO confirm).
    samples = list(self.algorithm_results.items())
    interval = bootstrap.ci(data=samples, statfunction=self.f1_bootstrap,
                            n_samples=10000)
    low, high = interval[0], interval[1]
    print(self.algorithm_name)
    print("Bootstrapped 95% confidence intervals for f1 \nLow:", low,
          "\nHigh:", high)
def calculate(self, questionsNAnswers):
    """Scale a bootstrap CI of the fraction of truthy answers to the corpus.

    Each question contributes 1.0 if ``answer()`` is truthy and 0.0
    otherwise.  The confidence interval of the mean of that sample is
    multiplied by the number of words in the corpora.

    Returns
    -------
    list of two numbers; ``[0., 0.]`` when no answer is truthy (including an
    empty input), because the bootstrap fails on an all-zero sample.

    Raises
    ------
    ValueError
        If any question's answer is None.
    """
    # numpy.mean replaces scipy.mean, which was only a deprecated alias of
    # the NumPy function.
    import numpy as np

    answers = [q.answer() for q in questionsNAnswers]
    if None in answers:
        raise ValueError()

    sample = [1. if a else 0. for a in answers]
    if not any(sample):
        # bootstrap fails if we pass all zeroes
        return [0., 0.]

    print(sample)
    percentageCI = bootstrap.ci(data=sample, statfunction=np.mean)
    n_words = len(self.__corpora.words())
    return [b * n_words for b in percentageCI]
def ci_eval(samples):
    """Return a single symmetric error for the mean absolute value of samples.

    ``alpha = 1 - 0.6827`` makes the interval a 1-sigma interval.  The
    bootstrap returns a lower and an upper error bar; they are assumed to be
    almost equal, so their plain average is returned.
    """
    def mean_abs(x):
        return np.average(np.abs(x))

    errorbars = boot.ci(
        samples,
        statfunction=mean_abs,
        alpha=(1. - 0.6827),
        n_samples=5000,
        method='bca',
        output='errorbar',
    )
    return np.average(errorbars)
def calc_bootstrap(data):
    ''' Find the confidence interval for the mean of the given data set with
    bootstrapping.

    The interval is printed and returned as produced by ``bootstrap.ci``
    with its default settings.
    '''
    # numpy.mean replaces sp.mean, which was only a deprecated alias of the
    # NumPy function in the SciPy namespace.
    import numpy as np

    # --- >>> START stats <<< ---
    # Calculate the bootstrap
    CIs = bootstrap.ci(data=data, statfunction=np.mean)
    # --- >>> STOP stats <<< ---

    # Print the data: the "*" unpacks the array "CIs" into format arguments
    print('The conficence intervals for the mean are: {0} - {1}'.format(*CIs))
    return CIs
def calc_bootstrap(data):
    """ Find the confidence interval for the mean of the given data set with
    bootstrapping.

    The interval is printed and returned as produced by ``bootstrap.ci``
    with its default settings.
    """
    # numpy.mean replaces sp.mean, which was only a deprecated alias of the
    # NumPy function in the SciPy namespace.
    import numpy as np

    # --- >>> START stats <<< ---
    # Calculate the bootstrap
    CIs = bootstrap.ci(data=data, statfunction=np.mean)
    # --- >>> STOP stats <<< ---

    # Print the lower and upper bound
    print(f'The conficence intervals for the mean are: {CIs[0]} - {CIs[1]}')
    return CIs
def boot_bin_stack(data_bin, n_samples=3000):
    """Return the average, a bootstrap CI and the size of one data bin.

    Parameters
    ----------
    data_bin : np.ndarray
        Values in the bin; its first axis length is the count.
    n_samples : int or None
        Number of bootstrap resamples.  With None the bootstrap is skipped
        and the interval is NaN.

    Returns
    -------
    (mu, cci, count)
        ``mu`` is ``np.average(data_bin)`` and ``cci`` the interval from
        ``ci``.  For bins with at most one element both ``mu`` and ``cci``
        are NaN.
    """
    count = data_bin.shape[0]
    if count <= 1:
        return np.nan, np.array([np.nan, np.nan]), count

    # Warnings are silenced only while this bin is processed.  The previous
    # ``warnings.filterwarnings("ignore")`` switched off every warning for
    # the rest of the process.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        if n_samples is not None:
            cci = ci(data_bin, n_samples=n_samples)
        else:
            cci = np.array([np.nan, np.nan])
        mu = np.average(data_bin)
    return mu, cci, count
def calc_bootstrap(data):
    ''' Find the confidence interval for the mean of the given data set with
    bootstrapping.

    The interval is printed and returned as produced by ``bootstrap.ci``
    with its default settings.
    '''
    # numpy.mean replaces sp.mean, which was only a deprecated alias of the
    # NumPy function in the SciPy namespace.
    import numpy as np

    # --- >>> START stats <<< ---
    # Calculate the bootstrap
    CIs = bootstrap.ci(data=data, statfunction=np.mean)
    # --- >>> STOP stats <<< ---

    # Print the data: the "*" unpacks the array "CIs" into format arguments
    print(
        ('The conficence intervals for the mean are: {0} - {1}'.format(*CIs)))
    return CIs
def write_data(fn, data):
    """Performs descriptive stats and writes stats to output file.

    The file ``fn`` is overwritten with the MUE and MSE (each with 1.96
    times its standard error), R^2 and Kendall tau, followed by bootstrapped
    bounds for MUE, MSE, Pearson's R, R^2 and tau obtained from ``boot.ci``.
    """
    # ``with`` guarantees the file is closed even if one of the statistics
    # or bootstrap calls raises.
    with open(fn, 'w') as f:
        mue, muese = MUE(data)
        f.write("Errors are 95% CIs\n")
        f.write("MUE = %5.3f +/- %5.3f\n" % (mue, muese * 1.96))
        mse, msese = MSE(data)
        f.write("MSE = %5.3f +/- %5.3f\n" % (mse, msese * 1.96))
        correldict = correls(data)
        f.write("R^2 = %3.2f\n" % correldict['r_value']**2)
        f.write("K-Tau = %3.2f\n\n" % correldict['tau'])

        f.write("BOOTSTRAPPED RESULTS (10k resamples, 95% CIs)\n")
        # CIs[0] holds the lower bounds and CIs[1] the upper bounds, one
        # entry per value returned by the statistic.
        CIs = boot.ci(data, MUE)
        f.write("MUE = %5.3f < %5.3f < %5.3f\n" % (CIs[0][0], mue, CIs[1][0]))
        CIs = boot.ci(data, MSE)
        f.write("MSE = %5.3f < %5.3f < %5.3f\n" % (CIs[0][0], mse, CIs[1][0]))
        CIs = boot.ci(data, correls_for_bootstrap)
        # Indices 2, 3 and 6 pick R, R^2 and tau out of the values returned
        # by correls_for_bootstrap (layout defined by that helper).
        f.write("Pearson's R = %3.2f < %3.2f < %3.2f\n" % (CIs[0][2], correldict['r_value'], CIs[1][2]))
        f.write("R^2 = %3.2f < %3.2f < %3.2f\n" % (CIs[0][3], correldict['r_value']**2, CIs[1][3]))
        f.write("K-Tau = %3.2f < %3.2f < %3.2f\n\n" % (CIs[0][6], correldict['tau'], CIs[1][6]))
def scalesHiddenPlot(name='scales'):
    """Plot listener accuracy against hidden-layer size for a scales experiment.

    For every problem level of the chosen experiment, each stored agent is
    evaluated on that level's forced-choice data set.  The accuracy averaged
    over the agents of one hidden-layer size is plotted against that size,
    with error bars equal to half the width of a bootstrap interval of the
    per-problem scores.  The figure is saved to ``hidden<name>.pdf`` and shown.

    Parameters
    ----------
    name : str
        ``'scales'`` (simple condition, 2 levels) or ``'scalesPlus'``
        (complex condition, 3 levels).  Any other value prints a message and
        returns without plotting.
    """
    matplotlib.rcParams.update({'font.size' : 20})
    lw = 3
    # NOTE(review): plt.hold is presumably unavailable in newer matplotlib
    # releases -- confirm the version this script targets.
    plt.hold(True)

    if name == 'scalesPlus':
        experimentName = 'Complex'
        nLevels = 3
        leveledFcData = turk.readScalesProblems('../../data/scale_plus_6stimuli_3levels_no_fam_24_january_SCAL.csv', name)
    elif name == 'scales':
        experimentName = 'Simple'
        nLevels = 2
        leveledFcData = turk.readScalesProblems('../../data/scales_6stimuli_3levels_no_fam_25_january_OSCA.csv', name)
    else:
        # Without data there is nothing to plot; previously execution fell
        # through and failed later with a NameError.
        print('[forcedChoiceExperiments] Unknown experiment name:  %s' % (name,))
        return

    sizes = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]
    nModels = 10  # numbered 1 to 10

    # agents[k] holds, for hidden-layer size sizes[k], one listener list per
    # independently trained model.
    agents = []
    for size in sizes:
        sizeAgents = []
        for agentNum in range(1, nModels + 1):
            listeners, _ = loadAllAgents('../../data/cogsci/agents-%d-%d.pickle' % (size, agentNum))
            sizeAgents.append(listeners)
        agents.append(sizeAgents)

    for levelProblems, lineColor in zip(leveledFcData, colors):
        dataset = forcedChoiceProblemsToDataset(levelProblems)
        hiddenLayerAccuracies = []
        yerrs = []
        for allListeners in agents:  # one entry per hidden-layer size
            # accuracies for each independent trial for this # of hidden
            # nodes and this level of problem. will be averaged.
            sizeAccuracies = []
            sizeScores = []
            for listeners in allListeners:
                # NOTE(review): index 3 is presumably the final training
                # iteration -- confirm against loadAllAgents.
                lastListener = listeners[3]
                correct, _, scores = evalListenerOnClassificationDataset(lastListener, dataset)
                sizeAccuracies.append(float(correct) / len(scores))
                sizeScores.append(scores)
            hiddenLayerAccuracies.append(np.array(sizeAccuracies).mean())
            # Error bar: half the width of the bootstrap interval.
            interval = boot.ci(np.array(sizeScores), np.average)
            yerrs.append((interval[1] - interval[0]) / 2.0)
        plt.errorbar(sizes, hiddenLayerAccuracies, yerr=yerrs, linewidth=lw, color=lineColor)

    plt.axis([0, sizes[-1], 0, 1])
    plt.title('ANN Accuracy on the %s Condition' % experimentName)
    plt.xlabel('Number of Hidden Nodes')
    plt.ylabel('Average Accuracy')
    plt.legend(['Level %d' % i for i in range(nLevels)], loc='lower right')
    plt.savefig('hidden%s.pdf' % name, format='pdf')
    plt.show()
def test_bootstrap():
    """Print two bootstrap error estimates for a random normal sample.

    Draws 1000 values from a normal distribution (loc=1, scale=1), prints
    the sample standard deviation, then prints the error obtained from
    ``bootstrap`` + ``calc_bootstrap_error`` and the interval returned by
    ``scikits.bootstrap.ci`` for the median.  Nothing is asserted.
    """
    import numpy as np
    from scikits.bootstrap import ci

    sample = np.random.normal(loc=1, scale=1, size=1000)
    print('std = %.2f' % sample.std())

    # Estimate from this module's own resampling helpers (100 resamples).
    resamples = bootstrap(sample, 100)
    error_from_resamples = calc_bootstrap_error(resamples, 0.32)

    # Reference interval from scikits.bootstrap; 0.32 is passed as ci's
    # third positional argument.
    error_from_ci = ci(sample, np.median, 0.32)

    print('bootstrap error', error_from_resamples)
    print('bootstrap error ci', error_from_ci)
def compute_ci(scores, exclude=()):
    """Bootstrap a confidence interval for every pair of entries per dataset.

    Parameters
    ----------
    scores : dict
        Maps a dataset name to a dict of ``name -> (human_scores,
        sim_scores)`` pairs.
    exclude : container
        Dataset names to leave out of the result.

    Returns
    -------
    dict
        ``{dataset: {(name1, name2): interval}}`` where each interval is the
        BCa result of ``bstrap.ci`` applied to the zipped triples
        ``(human, scores of name1, scores of name2)`` with ``statistic``.
    """
    cfs = {}
    for ds, items in scores.items():
        if ds in exclude:
            continue
        pair_intervals = {}
        cfs[ds] = pair_intervals
        for sim1, sim2 in combinations(items, 2):
            print(f'Computing CI for {ds} - {sim1} : {sim2}')
            # The human scores are taken from the first entry of the pair.
            human_scores, sim_scores1 = items[sim1]
            _, sim_scores2 = items[sim2]
            triples = list(zip(human_scores, sim_scores1, sim_scores2))
            pair_intervals[(sim1, sim2)] = bstrap.ci(
                triples, statfunction=statistic, method='bca')
    return cfs
def diffusion_tensor_ci(positions, orientations, lagtime=1, fps=1., ndim=3, **kwargs):
    """Calculate the diffusion tensor confidence interval using bootstrap.

    Displacements are obtained from ``_compute_displ``.  The bootstrapped
    statistic is the mean outer product of the displacement vectors, halved
    and divided by the time step returned by ``_compute_displ``.

    Parameters
    ----------
    positions, orientations, lagtime, fps
        Forwarded unchanged to ``_compute_displ``.
    ndim : int
        With ``2`` only displacement columns 0, 1 and 5 (x, y translation
        and z rotation) are kept; any other value keeps all columns.
    **kwargs
        Extra keyword arguments for ``scikits.bootstrap.ci``.

    Returns
    -------
    array
        The interval reshaped to ``(2, 3, 3)`` for ``ndim == 2`` and to
        ``(2, 6, 6)`` otherwise.
    """
    from scikits import bootstrap

    delta_tjn, all_xjn = _compute_displ(positions, orientations, lagtime, fps)

    planar = ndim == 2
    if planar:
        all_xjn = all_xjn[:, [0, 1, 5]]  # only x, y transl and z rot

    def half_mean_outer(x):
        # Per-row outer product, averaged over rows and flattened.
        outer = x[:, :, np.newaxis] * x[:, np.newaxis, :]
        return outer.mean(0).ravel() * 0.5 / delta_tjn

    result = bootstrap.ci(all_xjn, half_mean_outer, **kwargs)

    side = 3 if planar else 6
    return result.reshape((2, side, side))
def syntheticHiddenPlot():
    """Evaluate agents with a variety of hidden-layer sizes and plot accuracy.

    For each of the three instance levels, every stored agent is evaluated
    on that level's data set.  The accuracy averaged over the agents of one
    hidden-layer size is plotted against that size, with error bars equal
    to half the width of a bootstrap interval of the scores.  The figure is
    written to ``hiddenSynthetic.pdf`` and then shown.
    """
    matplotlib.rcParams.update({'font.size' : 20})
    lw = 3
    plt.hold(True)

    levelInstances = [loadFacesInstances('../../data/facesInstances-%d.csv' % level)
                      for level in [0,1,2]]
    sizes = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]
    nModels = 10  # numbered 1 to 10

    # One list per hidden-layer size, holding the listeners of each model.
    agents = []
    for hiddenSize in sizes:
        modelsOfSize = []
        for modelNum in range(1, nModels + 1):
            agentPath = '../../data/cogsci/agents-%d-%d.pickle' % (hiddenSize, modelNum)
            (listeners, _speakers) = loadAllAgents(agentPath)
            modelsOfSize.append(listeners)
        agents.append(modelsOfSize)

    # loop over levels, then over model sizes, then over agents
    for (instances, lineColor) in zip(levelInstances, colors):
        dataset = goldListenerTrainingExamplesFromInstances(instances)
        meanAccuracies = []  # average accuracy for each hidden-layer size
        halfWidths = []      # error bar for each hidden-layer size
        for modelsOfSize in agents:
            accuracies = []
            scoresPerModel = []
            for listeners in modelsOfSize:
                (correct, _activations, scores) = evalListenerOnClassificationDataset(listeners[3], dataset)
                accuracies.append(float(correct) / len(scores))
                scoresPerModel.append(scores)
            meanAccuracies.append(np.array(accuracies).mean())
            interval = boot.ci(np.array(scoresPerModel), np.average)
            halfWidths.append((interval[1] - interval[0])/2.0)
        plt.errorbar(sizes, meanAccuracies, yerr=halfWidths, linewidth=lw, color=lineColor)

    plt.title('ANN Accuracy by Size of Hidden Layer')
    plt.axis([0, sizes[-1], 0, 1])
    plt.xlabel('Number of Hidden Nodes')
    plt.ylabel('Listener Accuracy')
    plt.legend(['Level 0', 'Level 1', 'Level 2'], loc='lower right')
    plt.savefig('hiddenSynthetic.pdf', format='pdf')
    plt.show()
def bootstrap_pce_regression(pts_filename, vals_filename, rv_trans, alpha=0.05, n_samples=3000):
    """Bootstrap the output of ``get_tsi`` for a least-squares PCE fit.

    Points and values are read from two comma-separated text files.  For
    every bootstrap resample a polynomial chaos expansion is fitted by
    solving the normal equations, and ``get_tsi(pce, qoi=0)`` of that fit
    is the bootstrapped statistic.

    Parameters
    ----------
    pts_filename : str
        File holding the points as a (num_dims x num_pts) matrix.
    vals_filename : str
        File holding the values; reshaped to a (num_pts x 1) column.
    rv_trans
        Random variable transformation given to the expansion.
    alpha, n_samples
        Forwarded to ``bootstrap.ci``.

    Returns
    -------
    The result of ``bootstrap.ci`` for the statistic described above.
    """
    # Must be a ( num_dims x num_pts ) matrix
    pts = numpy.loadtxt(pts_filename, delimiter=',')
    # must be a ( num_pts x 1 ) vector
    vals = numpy.loadtxt(vals_filename, delimiter=',')
    vals = vals.reshape(vals.shape[0], 1)

    def bootstrappable_pce_regression(sample_pts, sample_vals):
        # bootstrap hands over arrays of shape (N, ...) while the PCE
        # expects points of shape (..., N), hence the transpose.
        sample_pts = sample_pts.transpose()
        num_dims, num_pts = sample_pts.shape

        # Largest degree (starting the search at 2) whose number of basis
        # terms does not exceed the number of points.
        degree = 2
        while nchoosek(degree + num_dims, num_dims) <= num_pts:
            degree += 1
        degree -= 1

        # define the parameters of the PCE
        pce = PolynomialChaosExpansion()
        pce.set_random_variable_transformation(rv_trans)
        pce.define_isotropic_expansion(degree, 1.)

        # form matrices needed for normal equations
        V, build_vals = pce.build_linear_system(sample_pts, sample_vals, False)
        assert V.shape[1] <= V.shape[0]

        # Solve least squares to find PCE coefficients
        gram = numpy.dot(V.T, V)
        rhs = numpy.dot(V.T, build_vals)
        coeff = numpy.linalg.solve(gram, rhs)
        pce.set_coefficients(coeff.reshape(coeff.shape[0], 1))
        return get_tsi(pce, qoi=0)

    # multi=True: the tuple members are resampled together and passed to
    # the statistic as separate arguments.
    return bootstrap.ci((pts.transpose(), vals), bootstrappable_pce_regression,
                        alpha=alpha, n_samples=n_samples, multi=True)
def scores_table(row_preds, row_names, y_true, score_func=matthews_corrcoef, alpha=0.05):
    """Print one table row of scores with bootstrap intervals per model.

    For every entry of `row_preds` the best setting is chosen with
    ``best_setting`` according to `score_func`.  Accuracy, F1 and Matthews
    correlation of that setting are then printed as ``score (low-high)``
    cells, where the bounds come from a BCa bootstrap with 5000 resamples.
    A score of exactly zero is reported with the bounds ``0`` and ``0``
    without bootstrapping.

    Parameters
    ----------
    row_preds : sequence
        Per row, a sequence of ``(settings, y_pred)`` pairs.
    row_names : sequence
        Label printed at the start of each row.
    y_true
        Reference labels.
    score_func : callable
        Criterion handed to ``best_setting``.
    alpha : float
        Level passed to ``ci``.
    """
    report_funcs = (accuracy_score, f1_score, matthews_corrcoef)
    for preds, name in zip(row_preds, row_names):
        _settings, y_pred = preds[best_setting(preds, y_true, score_func)]
        cells = ["|| {} ||".format(name)]
        for report_score_func in report_funcs:
            score = report_score_func(y_true, y_pred)
            if score == 0:
                score_low, score_hi = 0, 0
            else:
                score_low, score_hi = ci((y_true, y_pred), report_score_func,
                                         alpha=alpha, n_samples=5000, method='bca')
            cells.append("{:.2f} ({:.2f}-{:.2f}) ||".format(score, score_low, score_hi))
        print("".join(cells))
def syntheticPlot(allAccuracies, allScores, outFile, title, errorBars=False, overall=False):
    """Generate a figure of accuracy across listeners and datasets.

    `allAccuracies` is a sequence of arrays, one per problem level; each
    array holds the accuracy of every model on that level, where the 0th
    model is the literal one.  `allScores` has the matching per-model
    scores and is only read when `errorBars` is true.

    With `errorBars`, each point gets an error bar of half the width of a
    bootstrap interval of its scores; when all scores are truthy the error
    is 0 and no bootstrap is run.

    If `overall` is true the last entry is labeled 'Overall'; otherwise
    every entry is labeled by its level number.  The figure is saved to
    `outFile` as PDF and then shown.
    """
    matplotlib.rcParams.update({'font.size' : 20})
    lw = 3
    plt.hold(True)

    for levelAccuracies, levelScores, lineColor in zip(allAccuracies, allScores, colors):
        if not errorBars:
            plt.plot(levelAccuracies, linewidth=lw, marker='o', color=lineColor)
            continue

        yerrs = []
        for scores in levelScores:  # one entry per model
            scoreArray = np.array(scores)
            if scoreArray.all():
                yerrs.append(0)
                continue
            interval = boot.ci(scoreArray, np.average)
            yerrs.append((interval[1] - interval[0])/2.0)
        plt.errorbar(range(len(levelAccuracies)), levelAccuracies, yerr=yerrs,
                     linewidth=lw, color=lineColor)

    nListeners = len(allAccuracies[0])  # number of models
    nLevels = len(allAccuracies)  # types of problems
    plt.axis([0, nListeners - 1, 0, 1])
    plt.ylabel('Listener Accuracy')
    plt.xlabel('Training Iterations')

    if overall:
        legendTitles = ['Level %d' % level for level in range(nLevels - 1)] + ['Overall']
    else:
        legendTitles = ['Level %d' % level for level in range(nLevels)]
    plt.legend(legendTitles, loc='lower right')
    plt.title(title)
    plt.savefig(outFile, format='pdf')
    plt.show()
# X_LOSO = rest_data[subject_subset,:] # expVar(beh_keysfn, X_LOSO, Y_LOSO, penalty) # LOSO_loadings = SCCA_r(X_LOSO, Y_LOSO, n_components, penalty) # SCCA_Output_Sheet('SCCA_LOSO', region_labels_fn, beh_keysfn, subject_subset, X, Y, LOSO_loadings) # np.save('SCCAloading_LOSO_long',LOSO_loadings) #################LOSO COMPLETE################# import scikits.bootstrap as boot data = (X, Y) def SCCA_boot(X,Y): loadings = SCCA_r(X,Y, 6, (0.3,0.5)) return True ci_test = boot.ci(data, statfunction=SCCA_boot) boot_loadings = np.load(expanduser('bootstrap_all_comp_long.npy')) SCCA_Output_Sheet('SCCA_Bootstrap_long', region_labels_fn, beh_keysfn, subject_subset, X, Y, boot_loadings) from numpy import genfromtxt data_by_task = genfromtxt('Behavioural\\mwq_byTask.csv', delimiter=',',skip_header=1) data_CRT = data_by_task[:,1:14] data_WM = data_by_task[:,14:] subject_subset = data_by_task[:, 0].astype('i4') loadings = boot_loadings[1] comp = np.zeros((len(data_CRT), loadings.shape[1]*2)) for i in range(loadings.shape[1]):
# Plot bootstrap error bars around a fixed true probability as a function of
# the number of samples.  `stride`, `getRho`, `skbootstrap` and `plt` are
# expected to be defined earlier in the script.
high = 1000
fig = plt.figure(1, facecolor='white', figsize=(7,5.6))
trueProb = 0.5
xs = np.arange(stride, high+stride, stride)
ys = np.ones_like(xs)*trueProb
# The builtin float is the same dtype the removed `np.float` alias named.
yerr_low = np.zeros_like(xs, dtype=float)
yerr_high = np.zeros_like(xs, dtype=float)
i = 0
for x in xs:
    crossPos = x*trueProb
    crossNeg = x-crossPos
    # Array sizes must be integers; the products above are floats.
    transitions = np.concatenate([np.ones(int(crossPos)),
                                  np.ones(int(crossNeg))*-1], axis=0)
    CI = skbootstrap.ci(data=transitions, statfunction=getRho, output='errorbar',
                        n_samples=10000, method='pi')
    # Single-argument form prints the same text on Python 2 and Python 3.
    print('%s %s' % (x, CI))
    # Row 0 is used as the lower error-bar length, row 1 as the upper one.
    yerr_low[i] = CI[0,0]
    yerr_high[i] = CI[1,0]
    i+=1
ax1 = fig.add_subplot(111)
ax1.margins(0,0.05)
ax1.errorbar(xs, ys, yerr=[yerr_low, yerr_high], ecolor='r', color='k', fmt='o',
             elinewidth=2, capthick=2)
ax1.set_ylabel(r'Probability')
ax1.set_xlabel(r'Number of Samples')
ax1.set_xlim([0,high])
ax1.set_ylim([0,1])
continue if topic == 'all': break if measure not in sc: sc[measure] = {} sc[measure][topic] = float(score) for measure in measures: values = np.fromiter(sc[measure].values(), np.float) mean = values.mean() lo, hi = ci.t_ci_mean(values) shap = ci.shape(mean, lo, hi) cover = ci.coverage(values, ci.t_ci_mean) print('{} t {:.4f} [{:.4f},{:.4f}] {:.2f} {:.3f}'.format(measure, values.mean(), lo, hi, shap, cover)) lo, hi = ci.bootstrap_t_ci_mean(values) shap = ci.shape(mean, lo, hi) cover = ci.coverage(values, ci.bootstrap_t_ci_mean) print('{} bootstrap_t {:.4f} [{:.4f},{:.4f}] {:.2f} {:.3f}'.format(measure, values.mean(), lo, hi, shap, cover)) lo, hi = ci.bootstrap_pct_ci_mean(values) shap = ci.shape(mean, lo, hi) cover = ci.coverage(values, ci.bootstrap_pct_ci_mean) print('{} bootstrap_pct {:.4f} [{:.4f},{:.4f}] {:.2f} {:.3f}'.format(measure, values.mean(), lo, hi, shap, cover)) lo, hi = bootstrap.ci(values, n_samples=2000) shap = ci.shape(mean, lo, hi) cover = ci.coverage(values, lambda x: bootstrap.ci(x, n_samples=2000)) print('{} sk.bs-bca {:.4f} [{:.4f},{:.4f}] {:.2f} {:.3f}'.format(measure, values.mean(), lo, hi, shap, cover))
def mean_confidence_interval(data, confidence=0.95):
    """Return the mean of `data` and its Student-t confidence bounds.

    Parameters
    ----------
    data : array-like
        Sample values.
    confidence : float
        Two-sided confidence level.

    Returns
    -------
    (m, lower, upper)
        The sample mean and ``m -/+ h``, where ``h`` is the standard error
        of the mean times the t quantile at ``(1 + confidence) / 2`` with
        ``n - 1`` degrees of freedom.
    """
    a = 1.0*np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    # Public quantile function instead of the private `_ppf`.
    h = se * sp.stats.t.ppf((1+confidence)/2., n-1)
    return m, m-h, m+h


# Read the first line of `infile`, parse it into a
# (queries, conditions, samples) array and save a bootstrap interval for
# every (query, condition) cell.
rnm = infile
with open(rnm) as rg:
    line = rg.readline()

# Queries are separated by ')) ', conditions by ') (' and samples by single
# spaces; the last four characters of the line are dropped first.
line1 = line[:-4].split(')) ')
res = []
n_samples = len(line1[0].split(') (')[0].split(' '))
fnl = np.zeros((queries,conditions,n_samples))
for a,i in enumerate(line1):
    for b,j in enumerate(i.split(') (')):  # conditions
        # str.replace strips the parentheses on Python 2 and Python 3 alike.
        fnl[a,b,:] = np.array(j.replace('(', '').split(' '))  # samples

fnlc = []
# organize as 1-6 Strength, 7-12 Luck
for a in range(queries):
    for b in range(conditions):
        fnlc.append(bootstrap.ci(fnl[a,b]))
np.savetxt((rnm[:-8]+'_95ci.csv'), fnlc, delimiter=',')
def sliding_median_iqr(neighbors, random=None, compute_random=0, window=1000, p0=None):
    """
    Compute the sliding median of spearmanr and size, the interquartile
    range of spearmanr, and optionally the confidence interval of the
    median spearmanr of randomly paired genes.

    Parameters
    ----------
    neighbors: neighboring gene pairs dataframe, or path of a csv file
    random: random gene pairs dataframe, or path of a csv file; only
        used when compute_random is true
    compute_random: when true, add the bootstrap interval of the median
        spearmanr of the random pairs
    window: size of window for sliding median
    p0: initial guess forwarded to curve_fit

    Returns
    -------
    rolling_median_s: dataframe with the sliding median of spearmanr and
        size, the distances from the median to the first and third
        quartile ('q1', 'q3'), the fitted exp_decay parameters
        ('popt1'..'popt3') and, if requested, the random pair interval
        ('random_lci', 'random_hci')
    """
    # load dataframe if not provided yet
    if isinstance(neighbors, basestring):
        neighbors = pd.read_csv(neighbors)
    if compute_random and isinstance(random, basestring):
        random = pd.read_csv(random)

    # sort by size to do sliding window with increasing intergenic distance
    # nans cause error in sliding median
    neighbors = neighbors.sort('size').dropna()
    spearman = neighbors.spearmanr

    print('computing sliding median...')
    median_spearmanr = pd.rolling_median(spearman, window)

    print('computing IQR...')
    # distances from the median down to the 25% and up to the 75% quantile
    below_median = - pd.rolling_quantile(spearman, window, 0.25) + median_spearmanr
    above_median = pd.rolling_quantile(spearman, window, 0.75) - median_spearmanr
    median_size = pd.rolling_median(neighbors['size'], window)/1000

    # put it all together
    rolling_median_s = pd.DataFrame({'spearmanr': median_spearmanr,
                                     'size': median_size,
                                     'q1': below_median,
                                     'q3': above_median})
    # drop all nans from sliding median (leading rows, because of window)
    rolling_median_s = rolling_median_s.dropna()
    # reindex is necessary
    rolling_median_s.index = np.arange(len(rolling_median_s))

    if compute_random:
        print('computing random pairs median CI')
        # confidence interval of the median in random pairs
        ci_median = bs.ci(random.spearmanr.dropna().loc[:20000], np.median)
        rolling_median_s['random_lci'] = ci_median[0]
        rolling_median_s['random_hci'] = ci_median[1]

    print('fitting to exp decay...')
    popt_s, pcov_s = curve_fit(exp_decay, rolling_median_s['size'],
                               rolling_median_s.spearmanr, p0=p0)
    for k in range(3):
        rolling_median_s['popt%d' % (k + 1)] = popt_s[k]

    print('done')
    return rolling_median_s