def test_basic2(self):
    a1 = [3, 4, 5, 10, -3, -5, 6]
    a2 = [3, -6, -2, 8, 7, 4, 2, 1]
    a3 = [3., 4, 5, 10, -3, -5, -6, 7.0]
    assert_equal(stats.median(a1), 4)
    assert_equal(stats.median(a2), 2.5)
    assert_equal(stats.median(a3), 3.5)
def descStats(data):
    """Compute descriptive statistics of data."""
    dataList = list(data)
    logDataList = list(N.log10(dataList))
    desc = dict()
    if len(dataList) == 0:
        desc['mean'] = 0
        desc['median'] = 0
        desc['logMean'] = 0
        desc['logMedian'] = 0
    elif len(dataList) < 2:
        desc['mean'] = dataList[0]
        desc['median'] = dataList[0]
        desc['logMean'] = logDataList[0]
        desc['logMedian'] = logDataList[0]
    else:
        desc['mean'] = mean(dataList)
        desc['median'] = median(dataList)
        desc['logMean'] = mean(logDataList)
        desc['logMedian'] = median(logDataList)
    if len(dataList) < 3:
        desc['stdev'] = 0
        desc['sterr'] = 0
        desc['logStdev'] = 0
        desc['logSterr'] = 0
    else:
        desc['stdev'] = std(dataList)
        desc['sterr'] = stderr(dataList)
        desc['logStdev'] = std(logDataList)
        desc['logSterr'] = stderr(logDataList)
    return desc
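# A minimal usage sketch for descStats above. It assumes N is numpy and that
# mean/median/std/stderr are module-level helpers (old scipy.stats exposed
# names like these; modern code would use numpy directly). The stand-ins
# below are hypothetical, for illustration only.
import numpy as N

mean, median, std = N.mean, N.median, N.std

def stderr(xs):
    # standard error of the mean: sample std / sqrt(n)
    xs = N.asarray(xs, dtype=float)
    return xs.std(ddof=1) / N.sqrt(len(xs))

print(descStats([10.0, 100.0, 1000.0]))
# -> {'mean': 370.0, 'median': 100.0, 'logMean': 2.0, 'logMedian': 2.0,
#     'stdev': ..., 'sterr': ..., ...}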
def bin2m(a, factor):
    '''Median instead of mean for bin2.'''
    oldshape = a.shape
    # integer division so tmpshape holds ints (plain / yields floats on Python 3)
    newshape = numpy.asarray(oldshape) // factor
    tmpshape = (newshape[0], factor, newshape[1], factor)
    f = factor * factor
    binned = stats.median(stats.median(numpy.reshape(a, tmpshape), 1), 2)
    return binned
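# A self-contained sketch of the reshape trick bin2m relies on, using numpy's
# own median (scipy.stats.median was removed long ago). Assumes the array
# shape is an exact multiple of the binning factor; the names here are
# illustrative only.
import numpy as np

a = np.arange(16.0).reshape(4, 4)
factor = 2
tmpshape = (4 // factor, factor, 4 // factor, factor)
# Axes 1 and (after the first reduction) 2 index positions inside each
# factor x factor block, so taking the median over both collapses every
# block to its median.
binned = np.median(np.median(a.reshape(tmpshape), axis=1), axis=2)
print(binned)  # [[ 2.5  4.5] [10.5 12.5]]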
def calcola_ad(AD):
    alt_c = []
    ref_c = []
    for ad in AD:
        if ad != '.':
            # AD fields are "ref_count,alt_count"
            ref_c += [int(ad.split(',')[0])]
            alt_c += [int(ad.split(',')[1])]
    try:
        ref = stats.median(ref_c)
        alt = stats.median(alt_c)
    except:
        ref = '.'
        alt = '.'
    return ','.join([str(ref), str(alt)])
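# A usage sketch for calcola_ad, assuming `stats` is Python's statistics
# module (statistics.median raises StatisticsError on empty input, which the
# bare except above turns into the '.' missing marker).
import statistics as stats

print(calcola_ad(['10,5', '8,3', '6,1']))  # -> '8,3'
print(calcola_ad(['.']))                   # -> '.,.'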
def gStats(self, missingValue=0.0):
    """dict of {geneID: (min, max, mean, median, std, stderr,
    Shapiro-Wilk (w, p), normaltest_chisq (D'Agostino and Pearson), ...)}
    """
    import scipy as S
    import scipy.stats as SS
    rv = {}
    for k, v in self.items():
        # print k,v
        va = S.array(self.gValues(k, missingValue))
        try:
            normaltest = SS.normaltest(va)
        except:
            normaltest = None
        try:
            shapiro = SS.shapiro(va)
        except:
            shapiro = None
        try:
            rv[k] = (va.min(), va.max(), va.mean(), SS.median(va),
                     SS.std(va), SS.stderr(va), normaltest, shapiro)
        except:
            print k, va
            raise
    return rv
def time_diff_analysis(all_items, test_type, item_type, log_scale=False):
    # For each strategy, split its series into 24-sample chunks, transpose the
    # chunks with zip_longest, and plot the per-position median.
    for label in ['CR', 'CR+', 'RTW', 'SR', 'TI']:
        series = all_items[item_type][test_type][label]
        n = len(all_items['Ratio']['r'][label])
        chunks = [series[i:i + 24] for i in range(0, n, 24)]
        medians = [
            st.median([float(p) for p in item if p is not None])
            for item in itertools.zip_longest(*chunks)
        ]
        plt.plot(medians, label=label)
    plt.legend()
    plt.title(f"{test_type}-{item_type}")
    if log_scale:
        plt.yscale("log")
    plt.show()
def nanmedian(x):
    """Find the median over the given axis ignoring nans.

    fixme: should be fixed to work along an axis.
    """
    x = _asarray1d(x).copy()
    y = compress(isfinite(x), x)
    return median(y)
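# A sketch of the axis-aware behaviour the fixme above asks for; the function
# name is hypothetical. It assumes numpy is available: np.apply_along_axis
# runs the 1-D reduction down each slice. (Modern numpy ships np.nanmedian,
# which does this natively.)
import numpy as np

def _finite_median(v):
    # median of the finite entries of a 1-D array
    return np.median(v[np.isfinite(v)])

def nanmedian_axis(x, axis=None):
    x = np.asarray(x, dtype=float)
    if axis is None:
        return _finite_median(x.ravel())
    return np.apply_along_axis(_finite_median, axis, x)

print(nanmedian_axis([[1.0, np.nan], [3.0, 4.0]], axis=0))  # [2. 4.]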
def get_statistics_from_diffs(diffs):
    the_mean = st.mean(diffs)
    return {
        'min': min(diffs),
        'max': max(diffs),
        'mean': the_mean,
        'median': st.median(diffs),
        'stdev': st.stdev(diffs, the_mean),
        'q1': np.percentile(diffs, 25),
        'q3': np.percentile(diffs, 75),
    }
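# A usage sketch, assuming `st` is Python's statistics module (whose stdev
# accepts a precomputed mean as its second argument, saving one pass) and
# `np` is numpy.
import statistics as st
import numpy as np

print(get_statistics_from_diffs([1.0, 2.0, 2.0, 3.0, 6.0]))
# -> {'min': 1.0, 'max': 6.0, 'mean': 2.8, 'median': 2.0, 'stdev': ...,
#     'q1': 2.0, 'q3': 3.0}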
def _calc_basic_statistics(self):
    """This function determines the mean and the standard deviation
    of the data sample. Furthermore, several other simple properties
    are determined.
    """
    self.mean = stats.mean(self._data_samples)
    self.geom_mean = stats.geomean(self._data_samples)
    self.median = stats.median(self._data_samples)
    self.std_dev = stats.stddev(self._data_samples)
    self.min = min(self._data_samples)
    self.max = max(self._data_samples)
def calcola_dp(DP):
    try:
        DP.remove('.')
    except:
        print(DP)
        return '.'
    try:
        return stats.median(DP)
    except:
        print(DP)
        return '.'
def __init__(self, samples):
    self.samples = numpy.asarray(samples)
    self.N = len(samples)
    self.median = stats.median(samples)
    self.min = numpy.amin(samples)
    self.max = numpy.amax(samples)
    self.mean = stats.mean(samples)
    self.std = stats.std(samples)
    self.var = self.std**2.
    self.skew = stats.skew(samples)
    self.kurtosis = stats.kurtosis(samples)
    self.range = self.max - self.min
def test_median_simple(self):
    self.assertEqual(2.5, stats.median([1, 2, 3, 4]))
    self.assertAlmostEqual(2.5, stats.median([1.0, 2.0, 3.0, 4.0]))
    self.assertAlmostEqual(25, stats.median(self._integers))
    self.assertAlmostEqual(25, stats.median(self._floats))
    self.assertAlmostEqual(25 + 2.31, stats.median(self._floats2))
    self.assertAlmostEqual(27.5, stats.median(self._mixed))
def HLdistance(X1, X2):
    """The Hodges–Lehmann estimator is a statistical method for robust
    estimation. The principal form of this estimator is used to give an
    estimate of the difference between the values in two sets of data.
    If the two sets of data contain m and n data points respectively,
    m × n pairs of points (one from each set) can be formed and each
    pair gives a difference of values. The Hodges–Lehmann estimator for
    the difference is defined as the median of the m × n differences.
    """
    diffList = list()
    for x1 in X1:
        for x2 in X2:
            diffList.append(x1 - x2)
    return median(diffList)
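# A worked example of the estimator described in the docstring above, using
# Python's statistics.median as a stand-in for the module-level median.
from statistics import median

X1, X2 = [1, 5, 7], [2, 4]
# m x n = 6 pairwise differences
diffs = sorted(x1 - x2 for x1 in X1 for x2 in X2)  # [-3, -1, 1, 3, 3, 5]
print(median(diffs))  # 2.0, the Hodges-Lehmann estimate of the shift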
def __init__(self, samples, name=None):
    samples = np.asarray(samples)
    self.samples = samples
    self.name = name
    self.npts = len(samples)
    self.median = stats.median(samples)
    self.min = samples.min()
    self.max = samples.max()
    self.mean = samples.mean()
    self.std = samples.std()
    self.var = samples.var()
    self.skew = stats.skew(samples)
    self.kurtosis = stats.kurtosis(samples)
    self.range = self.max - self.min
def test_descriptive(self):
    from econpy.pytrix.stat import Dstat1
    x = numpy.array(self.data)
    d = Dstat1(x)
    self.assertEqual(d.nobs, x.size)
    self.assertAlmostEqual(d.sum, x.sum())
    self.assertEqual(d.min, x.min())
    self.assertEqual(d.max, x.max())
    self.assertAlmostEqual(d.mean, x.mean())
    # var: measure of the spread of the data set about the mean (unbiased)
    self.assertAlmostEqual(d.m2, numpy.var(x))
    self.assertAlmostEqual(d.std, numpy.std(x))
    # assertEqual(d.zscores, Sstats.zs(x))
    self.assertAlmostEqual(d.median, Sstats.median(x))
def test_median_vs_numpy(self):
    self.assertEqual(numpy.median([1, 2, 3, 4]),
                     stats.median([1, 2, 3, 4]))
    self.assertAlmostEqual(numpy.median([1.0, 2.0, 3.0, 4.0]),
                           stats.median([1.0, 2.0, 3.0, 4.0]))
    self.assertAlmostEqual(numpy.median(self._integers),
                           stats.median(self._integers))
    self.assertAlmostEqual(numpy.median(self._floats),
                           stats.median(self._floats))
    self.assertAlmostEqual(numpy.median(self._floats2),
                           stats.median(self._floats2))
    self.assertAlmostEqual(numpy.median(self._mixed),
                           stats.median(self._mixed))
def flux_err(self, filt, zp=None):
    """assumes unit = day for now

    not a correct calculation for large errors
    """
    if not zp:
        zp = 3000e3  ## just choose something to get us close to mJy
    if isinstance(filt, str):
        if filt not in self.filts:
            return array([])
        else:
            ret = array([])
            for c in self.data['ts'][filt]:
                #if c.has_key("name"):
                #    if c['name'] == "f_err":
                #        ret = c['val']
                #        break
                #    if c['name'] == "m_err":
                #        ret = c["val"]*self.flux(filt,zp)
                #        break
                if c.has_key("ucd"):
                    if c['ucd'].find("flux") != -1 and c['ucd'].find("err") != -1:
                        ret = c["val"]
                        break
                    if c['ucd'].find("phot.mag") and c['ucd'].find("err") != -1:
                        ret = c["val"] * self.flux(filt, zp)
                        break
            if (ret == 0).sum() == ret.shape[0]:
                ## all zeros! figure out the scatter
                tmp = self.flux(filt, zp)
                for i in range(5):
                    med = median(tmp)
                    sigma = sqrt(((tmp - med)**2).sum() / ret.shape[0])
                    tmpi = (abs(tmp - med) < 2.5 * sigma).nonzero()[0]
                    tmp = tmp[tmpi]
                #print med, sigma
                ret = sigma * ones(ret.shape[0])
            return ret
    return array([])
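# A self-contained sketch of the iterative clipping used above to estimate
# scatter when no error column is found: recompute the median, measure the
# rms about it, and drop points beyond 2.5 sigma, a few times over. The
# function name and parameters are illustrative; unlike the original, this
# version normalizes by the current (clipped) sample size.
import numpy as np

def clipped_scatter(values, n_iter=5, cut=2.5):
    tmp = np.asarray(values, dtype=float)
    sigma = tmp.std()
    for _ in range(n_iter):
        med = np.median(tmp)
        sigma = np.sqrt(((tmp - med) ** 2).mean())
        tmp = tmp[np.abs(tmp - med) < cut * sigma]
    return sigma

print(clipped_scatter([1.0, 1.1, 0.9, 1.05, 0.95,
                       1.02, 0.98, 1.08, 0.92, 50.0]))  # outlier clipped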
    fin.close()
    return data


class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg


if __name__ == "__main__":
    src_file = None
    try:
        try:
            opts, args = getopt.getopt(sys.argv[1:], "hs:", ["help"])
            for o, a in opts:
                if o in ("-h", "--help"):
                    print __doc__
                    raise Usage("-s <input file>")
                elif o == "-s":
                    src_file = a
        except getopt.error, msg:
            raise Usage(msg)
        # more code, unchanged
    except Usage, err:
        print >>sys.stderr, err.msg
        print >>sys.stderr, "for help use --help"
    data = read_data(src_file)
    mean = stats.mean(data)
    median = stats.median(data)
def test_nanmedian_none(self):
    """Check nanmedian when no values are nan."""
    m = stats.nanmedian(self.X)
    assert_approx_equal(m, stats.median(self.X))
def signif(self, files, bucktype='none'):
    """Compute signification of 3 input sets:
    test, system_output1, system_output2
    """
    if len(files) != 3:
        raise ValueError("You must supply 3 input files for `signif` command")
    if bucktype not in ['none', 'dialog']:
        raise ValueError("Unknown `bucktype`: %r" % bucktype)

    self.logger.debug("Importing scipy")
    from scipy.stats import median, mean, tvar, tstd
    from scipy.stats.morestats import wilcoxon
    from scipy.stats.distributions import norm, t as t
    from scipy import sqrt

    forest1, forest2, forest3 = self.loadForestFiles(files)

    self.logger.info("Processing forests 1 and 2")
    diff1 = {}
    for fn, tree1, tree2, dist, script in self.forestProcessor(forest1, forest2):
        H, D, I, S = script.HDIS
        n_errors = D + I + S
        fn = self.filenameKey(fn, bucktype)
        diff1.setdefault(fn, 0.)
        diff1[fn] += n_errors

    self.logger.info("Processing forests 1 and 3")
    diff2 = {}
    for fn, tree1, tree2, dist, script in self.forestProcessor(forest1, forest3):
        H, D, I, S = script.HDIS
        n_errors = D + I + S
        fn = self.filenameKey(fn, bucktype)
        diff2.setdefault(fn, 0.)
        diff2[fn] += n_errors

    def mapsswe(x, y):
        xm = mean(x)
        ym = mean(y)
        s = 0.
        n = 0.
        for xi, yi in izip(w1, w2):
            s += ((xi - yi) - (xm - ym))**2
            n += 1
        t_stat = sqrt(n) * abs(xm - ym) / sqrt(s / (n - 1.))
        p_value = t.sf(t_stat, n - 1) * 2
        return t_stat, p_value

    Z_values = []
    w1 = []
    w2 = []
    for key in sorted(diff1.keys()):
        if key not in diff2:
            self.logger.error("Unmatched utterance: %r", key)
            continue
        Na = diff1.pop(key)
        Nb = diff2.pop(key)
        w1.append(Na)
        w2.append(Nb)
        Z_values.append(Na - Nb)

    Z_mean = mean(Z_values)
    Z_median = median(Z_values)
    Z_tvar = tvar(Z_values)
    Z_tstd = tstd(Z_values)
    wilcoxon_t_stat, wilcoxon_p_value = wilcoxon(w1, w2)
    mapsswe_w_stat, mapsswe_p_value = mapsswe(w1, w2)

    fw = sys.stdout
    fw.write("Z stats:\n")
    fw.write("========\n")
    fw.write(" - mean: %9.3f\n" % Z_mean)
    fw.write(" - median: %9.3f\n" % Z_median)
    fw.write(" - tvar: %9.3f\n" % Z_tvar)
    fw.write(" - tstd: %9.3f\n\n" % Z_tstd)
    fw.write("Wilcoxon test:\n")
    fw.write("==============\n")
    fw.write(" - p-value: %9.3f (two-tailed) [significant if <= 0.05]\n" % wilcoxon_p_value)
    fw.write(" - t-stat: %9.3f\n\n" % wilcoxon_t_stat)
    fw.write("MAPSSWE test:\n")
    fw.write("=============\n")
    fw.write(" - p-value: %9.3f (two-tailed) [significant if <= 0.05]\n" % mapsswe_p_value)
    fw.write(" - t-stat: %9.3f\n\n" % mapsswe_w_stat)
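# A toy sketch of the paired comparison above with modern scipy: per-utterance
# error counts for two systems against the same reference, compared with a
# Wilcoxon signed-rank test and a matched-pairs t-test (the same statistic the
# mapsswe() helper computes by hand). The data values are made up.
from scipy.stats import wilcoxon, ttest_rel

w1 = [3, 5, 2, 8, 4, 6, 1, 7]  # errors of system 1 per utterance
w2 = [2, 4, 1, 5, 3, 4, 0, 5]  # errors of system 2 per utterance
print(wilcoxon(w1, w2))   # signed-rank stat and two-tailed p-value
print(ttest_rel(w1, w2))  # paired t-test on the same matched pairs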
def _filt_run(self, dat, filt, do_sim=False, vplot=True, nrange=1):
    if self.doplot and vplot:
        errorbar(dat[0], dat[1], dat[2], fmt="o")
    new = True
    if new:
        mymodel = Model(self.fitfunc_small_te, extra_args=[dat[1], dat[2], False])
    else:
        mymodel = Model(self.fitfunc_te)  # ,extra_args=[dat[1],dat[2],False]
    # get some good guesses
    try:
        scale = trim_mean(dat[1], 0.3)
    except:
        scale = mean(dat[1])
    offset = 1.0  # trim_mean(dat[1],0.3)
    t0 = median(dat[0])
    umin = 1.0
    b = 0.0  ## trending slope
    mydata = RealData(dat[0], dat[1], sx=1.0 / (60 * 24), sy=dat[2])
    trange = list(linspace(min(dat[0]), max(dat[0]), nrange))
    maxi = (dat[1] == max(dat[1])).nonzero()[0]
    trange.extend(list(dat[0][maxi]))
    trange.extend([t0, max(dat[0]) + 10, max(dat[0]) + 100])
    final_output = None
    for t0i in trange:
        for te in 10**linspace(log10(2), log10(200), nrange):
            if new:
                pinit = [te, umin, t0i]  # ,scale,offset,b
            else:
                pinit = [te, umin, t0i, scale, offset, b]
            myodr = ODR(mydata, mymodel, beta0=pinit)
            myoutput = myodr.run()
            if final_output is None:
                final_output = myoutput
                old_sd_beta = final_output.sd_beta
                continue
            if trim_mean(log10(myoutput.sd_beta / final_output.sd_beta), 0.0) < 0.0 and \
               myoutput.res_var <= final_output.res_var and \
               (myoutput.sd_beta == 0.0).sum() <= (final_output.sd_beta == 0.0).sum():
                final_output = myoutput
    if 1:
        t = linspace(min(dat[0]),
                     max([max(dat[0]), final_output.beta[2] + 6 * final_output.beta[0]]),
                     1500)
        if new:
            tmp = self.fitfunc_small_te(final_output.beta, dat[0], dat[1], dat[2], True)
            #print tmp, "***"
            p = list(final_output.beta)
            p.extend([tmp[0], tmp[1], tmp[2]])
            y = array(self.modelfunc_small_te(p, t))
        else:
            p = final_output.beta
            y = self.fitfunc_te(final_output.beta, t)
        #print final_output.beta
        if self.doplot:
            plot(t, y)
            xlabel('Time [days]')
            ylabel('Relative Flux Density')
        if do_sim:
            for i in range(10):
                tmp = r.multivariate_normal(myoutput.beta, myoutput.cov_beta)
                if self.doplot:
                    plot(t, self.a_te(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], t), "-")
    return (final_output, p, new)
            results[strat][lookahead][type_cov].keys()):
        covs = map(str, results[strat][lookahead][type_cov][effort_frac])
        o.write('%f %s\n' % (effort_frac, ' '.join(covs)))
    o.close()
    ofile = '%s%s.%s.%s-post-stats-%d-%s-%s' % (folder, outname, type, coverage,
                                                lookahead, strat, type_cov)
    o = open(ofile, 'w')
    o.write('# effort_fract mean stdev median min max quartile1 quartile2 quartile3\n')
    for effort_frac in sorted(results[strat][lookahead][type_cov].keys()):
        covs = results[strat][lookahead][type_cov][effort_frac]
        mean, stdev, median = numpy.mean(covs), numpy.std(covs), stats.median(covs)
        #stats.stdev(covs), numpy.median(covs)
        cov_min, cov_max = min(covs), max(covs)
        #quartile1, quartile2, quartile3 = stats.lscoreatpercentile(covs,.25),stats.lscoreatpercentile(covs,.50),stats.lscoreatpercentile(covs,.75)
        quartile1, quartile2, quartile3 = (stats.scoreatpercentile(covs, 25),
                                           stats.scoreatpercentile(covs, 50),
                                           stats.scoreatpercentile(covs, 75))
        o.write('%f %f %f %f %f %f %f %f %f\n' %
                (effort_frac, mean, stdev, median, cov_min, cov_max,
                 quartile1, quartile2, quartile3))
    o.close()


if __name__ == '__main__':
    pass
def test_axis(self):
    """Regression test for #760."""
    a1 = np.array([[3, 4, 5], [10, -3, -5]])
    assert_equal(stats.median(a1), np.array([6.5, 0.5, 0.]))
    assert_equal(stats.median(a1, axis=-1), np.array([4., -3]))
def test_nanmedian_some(self):
    """Check nanmedian when some values only are nan."""
    m = stats.nanmedian(self.Xsome)
    assert_approx_equal(m, stats.median(self.Xsomet))
def test_median(self):
    assert_equal(stats.median(self.a1), 4)
    assert_equal(stats.median(self.a2), 2.5)
    assert_equal(stats.median(self.a3), 3.5)
def reindex(self):
    values = self.series[-self.periods:]
    m = median(values).toscalar()
    self.append(m)
def BaseTreeAgeStatisticTest(Data, filename):
    """Statistical test of randomness for a nonhomogeneous tree
    distribution, taking tree ages into account. Saves the resulting
    statistics and boundaries in a txt file named <filename>.

    Old version; deprecated, has to be changed.
    Last revision: 20.11.2011
    """
    ############################ Base statistical test ######################
    import numpy as np
    RadiusI = np.linspace(30, 200, 100)
    MinAge = np.nanmin(Data[:, 2])
    MaxAge = np.nanmax(Data[:, 2])
    M = 1000
    N = np.shape(Data)[0]
    MaxMonte = 400
    alpha = 0.05
    indbound = np.round(MaxMonte * alpha)
    Z_mean = np.array([])
    Z_max = np.array([])
    Z_ang = np.array([])
    Z_min = np.array([])
    Z_kur = np.array([])
    Z_sk = np.array([])
    Z_med = np.array([])
    Iang = []
    Imax = []
    Imin = []
    Imean = []
    Ikur = []
    Isk = []
    Imed = []
    for r in RadiusI:
        ########## call Monte Carlo function ##########
        cnts, angs, corrs, maxs, mins, meds, kurs, sks = EvaluateFunct(
            MaxIteration=MaxMonte,
            LowIteration=M,
            RightBound=RightBound,
            TopBound=TopBound,
            Density=N,
            Radius=r,
            MinAge=MinAge,
            MaxAge=MaxAge,
        )
        angss = np.sort(angs)
        maxss = np.sort(maxs)
        minss = np.sort(mins)
        corrss = np.sort(corrs)
        kurss = np.sort(kurs)
        skss = np.sort(sks)
        medss = np.sort(meds)
        Iang.append([angss[-indbound], angss[indbound]])
        Imax.append([maxss[-indbound], maxss[indbound]])
        Imin.append([minss[-indbound], minss[indbound]])
        Imean.append([corrss[-indbound], corrss[indbound]])
        Ikur.append([kurss[-indbound], kurss[indbound]])
        Isk.append([skss[-indbound], skss[indbound]])
        Imed.append([medss[-indbound], medss[indbound]])
        corr = []
        phi = []
        indf = []
        index_false = 1
        for k in xrange(M):
            area = {
                "XCenter": r + np.random.rand() * (RightBound - r),
                "YCenter": r + np.random.rand() * (TopBound - r),
                "Radius": r,
            }
            [phi0, flag0, opt_val] = GetOptimalDirection(area, Data, allowed=4)
            if flag0 == True:
                corr.append(opt_val)
                phi.append(phi0)
            else:
                index_false = index_false + 1.0
            indf.append(index_false)
        Z_mean = np.append(Z_mean, np.mean(corr))
        Z_ang = np.append(Z_ang, np.mean(phi))
        Z_max = np.append(Z_max, np.nanmax(corr))
        Z_min = np.append(Z_min, np.nanmin(corr))
        Z_kur = np.append(Z_kur, st.kurtosis(corr))
        Z_sk = np.append(Z_sk, st.skew(corr))
        Z_med = np.append(Z_med, st.median(corr))
        print "Current radius:", r
        print "Mean angle:", Iang[-1], "value", Z_ang[-1]
        print "Maximum correlation:", Imax[-1], "value", Z_max[-1]
        print "Minimum correlation:", Imin[-1], "value", Z_min[-1]
        print "Mean correlation:", Imean[-1], "value", Z_mean[-1]
        print "Kurtosis:", Ikur[-1], "value", Z_kur[-1]
        print "Skewness:", Isk[-1], "value", Z_sk[-1]
        print "Median:", Imed[-1], "value", Z_med[-1]
    np.save(
        filename,
        np.array(
            [Z_mean, Z_ang, Z_max, Z_min, Z_kur, Z_sk, Z_med,
             Iang, Imax, Imin, Imean, Ikur, Isk, Imed],
            dtype=np.object,
        ),
    )
def test_basic(self):
    data1 = [1, 3, 5, 2, 3, 1, 19, -10, 2, 4.0]
    data2 = [3, 5, 1, 10, 23, -10, 3, -2, 6, 8, 15]
    assert_almost_equal(stats.median(data1), 2.5)
    assert_almost_equal(stats.median(data2), 5)
def clust_main():
    parent = "/home/ethan/hiv/papers/jidletter/"
    outmi = open(parent + 'sumary.mi', 'w')
    outmi.write('freq,cut,p.clu,mean.clu,med.clu,std.clu,act.pri\n')
    out3 = open(parent + 'sumary.30y', 'w')
    out3.write('freq,cut,p.clu,mean.clu,med.clu,std.clu,act.pri\n')
    inf_mi, inf_3 = {}, {}
    clu_mi, clu_3 = {}, {}
    cuts = [6]
    cuts.extend([(x + 1) * 12 for x in range(19)])
    for freq in np.linspace(0.05, 1.0, 20):
        inf_mi[freq], inf_3[freq] = [], []
    infile = open(parent + "pkl/full/" + "lin0.pkl.full", 'r')
    data = cPickle.load(infile)
    infile.close()
    c_mi = data['clu_mi']
    c_3 = data['clu_30y']
    for inst in c_mi:
        freq = inst[0]
        cut = inst[1]
        if not clu_mi.has_key(freq):
            clu_mi[freq] = {}
        if not clu_mi[freq].has_key(cut):
            clu_mi[freq][cut] = []
    for inst in c_3:
        freq = inst[0]
        cut = inst[1]
        if not clu_3.has_key(freq):
            clu_3[freq] = {}
        if not clu_3[freq].has_key(cut):
            clu_3[freq][cut] = []
    for file in os.listdir(parent + "pkl/full/"):
        print file
        infile = open(parent + "pkl/full/" + file, 'r')
        data = cPickle.load(infile)
        infile.close()
        history = data['history']
        smp_mi = data['samples_maxinc']
        smp_3 = data['samples_30y']
        c_mi = data['clu_mi']
        c_3 = data['clu_30y']
        for freq in np.linspace(0.05, 1.0, 20):
            for mi in smp_mi[freq]:
                inf_mi[freq].append(infectors_stage(history, mi))
            for th in smp_3[freq]:
                inf_3[freq].append(infectors_stage(history, th))
        for inst in c_mi:
            freq = inst[0]
            cut = inst[1]
            for i, tok in enumerate(inst):
                if i > 1:
                    clu_mi[freq][cut].append(int(tok))
        for inst in c_3:
            freq = inst[0]
            cut = inst[1]
            for i, tok in enumerate(inst):
                if i > 1:
                    clu_3[freq][cut].append(int(tok))
    for k, v in inf_mi.items():
        pcount = 0
        for tok in v:
            if tok == 'p':
                pcount += 1
        sk = str(k)
        sk = sk + '00000000000000'
        inf_mi[sk[0:8]] = float(pcount) / len(v)
    for k, v in inf_3.items():
        pcount = 0
        for tok in v:
            if tok == 'p':
                pcount += 1
        sk = str(k)
        sk = sk + '00000000000000'
        inf_3[sk[0:8]] = float(pcount) / len(v)
    for k, v in clu_mi.items():
        for k2, v2 in v.items():
            prclust = pr_clustering(v2)
            mean = stats.mean(v2)
            median = stats.median(v2)
            std = stats.tstd(v2)
            outmi.write('%s,%s,%f,%f,%f,%f,%f\n' %
                        (k, k2, prclust, mean, median, std, inf_mi[k]))
    for k, v in clu_3.items():
        for k2, v2 in v.items():
            prclust = pr_clustering(v2)
            mean = stats.mean(v2)
            median = stats.median(v2)
            std = stats.tstd(v2)
            out3.write('%s,%s,%f,%f,%f,%f,%f\n' %
                       (k, k2, prclust, mean, median, std, inf_3[k]))
def clean_data(data):
    data.taxdelinquencyflag = data.taxdelinquencyflag.fillna('N').replace(
        ['Y', 'N'], [1, 0])
    # Drop all rows where parcelvalue is null
    data = data[data['parcelvalue'].notnull()]
    # Replace nulls in "is" columns (ones that should be either 1 or 0)
    data[['fireplace', 'tubflag']] = data[['fireplace', 'tubflag']].fillna(0)
    # Drop columns that have a very high number of null values
    data = high_null_count(data, 0.9)
    # Aircon
    # Assume all na are non aircon and change to 0/1. Drop the ordinal column
    data['is_aircond'] = [
        0 if (x == 5 or math.isnan(x)) else 1 for x in data['aircond']
    ]
    data = data.drop(columns='aircond', axis=1)
    # Heating
    # Assume all na are non heating and change to 0/1. Drop the ordinal column
    data['is_heating'] = [
        0 if (x == 13 or math.isnan(x)) else 1 for x in data['heatingtype']
    ]
    data = data.drop(columns='heatingtype', axis=1)
    # Set numbath, numfullbath and num34bath to 0 where null
    data[['numfullbath', 'num34bath', 'numbath']] = data[
        ['numfullbath', 'num34bath', 'numbath']].fillna(0)
    data = data.drop(columns=['num34bath', 'numfullbath'], axis=1)
    # Number of stories/pools/garages: if null then 0
    data['numstories'] = data['numstories'].fillna(1)
    data[['poolnum', 'garagenum']] = data[['poolnum', 'garagenum']].fillna(0)
    # Drop rows that have more than 75% of the data missing
    data = data.dropna(axis=0, thresh=len(data.columns) * 0.75)
    # Convert county code to dummies, dropping county code 2.
    # data.countycode = data.countycode.replace([6037, 6059, 6111], ['A', 'B', 'C'])
    dummy_country = pd.get_dummies(data['countycode'], drop_first=True,
                                   prefix='countycode')
    data = data.merge(dummy_country, left_index=True, right_index=True)
    # Drop the original column and countycode2, as it is very similar
    data = data.drop(columns=['countycode', 'countycode2'], axis=1)
    data = data[data['regioncode'].notnull()]
    data = data[data['citycode'].notnull()]
    # Drop columns where we can't extrapolate data
    data = data.drop(columns='neighborhoodcode', axis=1)
    # If unitnum is null, assume only 1 building
    data.unitnum = data.unitnum.fillna(1)
    # Set garage area to 0 if null
    data.garagearea = data.garagearea.fillna(0)
    # No way to get the year of the building, so drop rows with null year
    data = data[data['year'].notnull()]
    # Fill lot area and finished area with the median
    data['lotarea'] = data.lotarea.fillna(stats.median(data['lotarea']))
    data['finishedarea'] = data.finishedarea.fillna(
        stats.median(data['finishedarea']))
    # Fill qualitybuild based on tax year (highest correlation)
    # print(data.corr().loc['taxyear', 'qualitybuild'])
    # data.boxplot('qualitybuild', 'taxyear')
    # plt.show()
    mask_1 = data['taxyear'] == 2016.00000
    mask_2 = data['taxyear'] == 2015.00000
    data.loc[mask_1, 'qualitybuild'] = data.loc[mask_1, 'qualitybuild'].fillna(7)
    data.loc[mask_2, 'qualitybuild'] = data.loc[mask_2, 'qualitybuild'].fillna(5)
    # Drop string columns
    data_clean = data.drop(columns=['transactiondate', 'tubflag', 'fireplace'],
                           axis=1)
    return data_clean