def print_stats(datums): print 'Mean:', stats.tmean(datums) print 'Median:', stats.cmedian(datums) print 'Std Dev:', stats.tstd(datums) print 'Variation:', stats.variation(datums) print 'Kurtosis:', stats.kurtosis(datums, fisher=False) print 'Skewness:', stats.skew(datums)
def plot_histogram(histogram, html_writer, title='', max_pathway_length=8,
                   xmin=None, xlim=20, error_bars=True, min_to_show=20,
                   legend_loc='upper left'):
    """Plot one cumulative-distribution curve per histogram key and return the figure.

    histogram maps a key (e.g. pathway position 1..5 or a regulation label)
    to a list of values; only keys with at least ``min_to_show`` values are
    drawn.  When ``error_bars`` is set, a bootstrap (1000 resamples with
    replacement) estimates the std of the median for each curve.  Also prints
    rank-sum tests comparing selected key pairs.

    NOTE(review): ``html_writer`` and ``max_pathway_length`` are accepted but
    never used in this body — presumably kept for interface compatibility
    with callers; confirm before removing.
    """
    fig = pylab.figure()
    pylab.hold(True)
    reps = 1000          # number of bootstrap resamples per curve
    y_offset = 0         # vertical stagger so overlapping curves stay visible
    offset_step = 0.007
    # Fixed colour/style per known key; anything else falls back to 'grey'.
    colors = {1:'r', 2:'orange', 3:'green', 4:'cyan', 5:'blue',
              'Rest':'violet', 'Not first':'k--',
              'No known regulation':'grey', 'Activated':'green',
              'Inhibited':'r', 'Mixed regulation':'blue'}
    for key, value in histogram.iteritems():
        if len(value) >= min_to_show:
            m = stats.cmedian(value)
            sample_std = None
            if error_bars:
                # Bootstrap: resample len(value) points with replacement,
                # take the median of each resample, and use the std of those
                # medians as the error estimate.
                sample_vals = []
                i = 0
                while i < reps:
                    samples = []
                    while len(samples) < len(value):
                        samples.append(random.choice(value))
                    sample_vals.append(pylab.median(samples))
                    i += 1
                sample_std = pylab.std(sample_vals)
            plotting.cdf(value,
                         label='%s (med=%.1f, N=%d)' % (key, m, len(value)),
                         style=colors.get(key, 'grey'),
                         std=sample_std, y_offset=y_offset)
            y_offset += offset_step
    # Default the lower x-bound to the mirror of the upper bound.
    xmin = -1 * xlim if xmin == None else xmin
    pylab.xlim(xmin, xlim)
    pylab.xlabel('Irreversability')
    #pylab.xlabel('deltaG')
    pylab.ylabel('Cumulative distribution')
    legendfont = matplotlib.font_manager.FontProperties(size=11)
    pylab.legend(loc=legend_loc, prop=legendfont)
    pylab.title(title)
    pylab.hold(False)
    # Wilcoxon rank-sum tests between selected groups, printed as (stat, p).
    if 'Not first' in histogram:
        print '%s, first vs. non-first ranksum test: ' % title + '(%f, %f)' % stats.ranksums(histogram[1], histogram['Not first'])
    if 'Inhibited' in histogram:
        print '%s, inhibited vs. non-regulated ranksum test: ' % title + '(%f, %f)' % stats.ranksums(histogram['Inhibited'], histogram['No known regulation'])
    #for k1, h1 in histogram.iteritems():
    #    for k2, h2 in histogram.iteritems():
    #        print k1, k2, stats.ranksums(h1, h2)
    return fig
def freq_correlate(self): xs = [] # human score ys = [] # left freq zs = [] # right freq for key in self.collocdict.keys(): xs.append(self.collocdict[key]) ys.append(self.freqdict.get(key, 0)) parts = key.split(":") rel = parts[1] invrel = self.parameters["inversefeatures"][rel] inverted = parts[2] + ":" + invrel + ":" + parts[0] zs.append(self.freqdict.get(inverted, 0)) print xs print ys print zs xarray = np.array(xs) yarray = np.array(ys) zarray = np.array(zs) leftcorr = stats.spearmanr(xarray, yarray) rightcorr = stats.spearmanr(xarray, zarray) print "Correlation with left frequency ", leftcorr print "Correlation with right frequency ", rightcorr if self.parameters["wn_wiki"]: self.freqthresh = stats.cmedian(yarray) print "Median left frequency ", self.freqthresh else: self.freqthresh = stats.cmedian(zarray) print "Median right frequency ", self.freqthresh if self.parameters["diff"]: self.chunkthresh = [ stats.cmedian(xarray), np.max(xarray), ] # for chunking the input into len(self.chunkthresh) chunks - needs work for generalisation to more than 2 chunks else: self.chunkthresh = [np.max(xarray)]
def fit(self, X, y):
    """Learn a per-business "winner bias": median/mean of useful-vote counts.

    X and y are accepted for estimator-interface compatibility but are not
    read; the statistics come from self.data.training_reviews.  Populates
    self.business_winner_bias and returns self (fluent style).
    """
    self.business_winner_bias = {}

    # Group the 'useful' vote counts of every training review by business.
    votes_per_business = defaultdict(list)
    for review in self.data.training_reviews.values():
        business = review['business_id']
        votes_per_business[business].append(review['votes']['useful'])

    for business_id, votes in votes_per_business.iteritems():
        med = cmedian(votes)
        avg = tmean(votes)
        # A zero (or undefined) mean gets a neutral bias of 1.
        if len(votes) > 0 and avg != 0:
            self.business_winner_bias[business_id] = med / avg
        else:
            self.business_winner_bias[business_id] = 1
    return self
def test_basic(self):
    """cmedian on a fixed sample, with explicit bin counts and the default."""
    data = [1, 2, 3, 1, 5, 3, 6, 4, 3, 2, 4, 3, 5, 2.0]
    # (numbins, expected grouped median)
    binned_cases = [
        (5, 3.2916666666666665),
        (3, 3.083333333333333),
    ]
    for numbins, expected in binned_cases:
        assert_almost_equal(stats.cmedian(data, numbins), expected)
    # Default bin count.
    assert_almost_equal(stats.cmedian(data), 3.0020020020020022)
#!/usr/bin/env python
"""Print a CSV summary (mean, stddev, median) of paired benchmark timings.

For every ``nokcache.<size>`` file under DIR there is a matching
``kcache.<size>`` sibling; one CSV row is emitted per pair, laid out as in
the commented header below.
"""
import os
from numpy import mean, std
from scipy.stats import cmedian

DIR = "./times"

#print "Chunk Size, LFS Mean, LFS StdDev, LFS Median, ext4 Mean, ext4 StdDev, ext4 Median, LFS (kcache) Mean, LFS (kcache) StdDev, LFS (kcache) Median"


def _load_rows(path):
    # Each line holds space-separated float samples; the trailing empty
    # string from the final newline is dropped.
    lines = open(path).read().split("\n")[:-1]
    return [[float(tok) for tok in line.split(" ")] for line in lines]


def _chunk_label(raw):
    # Sizes of 1024 and above are displayed in units of K.
    size = int(raw)
    return raw if size < 1024 else str(size // 1024) + "K"


# Sort numerically by the size embedded in the name, keep nokcache files only.
files = sorted(os.listdir(DIR), key=lambda x: int(x.split(".")[1]))
files = [name for name in files if name.startswith("nokcache")]
for filename in files:
    a = _load_rows(os.path.join(DIR, filename))
    b = _load_rows(os.path.join(DIR, filename[2:]))  # drop "no" -> kcache file
    row = [_chunk_label(filename[9:])]               # text after "nokcache."
    for series in (a[0], a[1], b[0]):
        row.extend([mean(series), std(series), cmedian(series)])
    print(','.join(map(str, row)))
def execute(cls, choices, galaxyFn=None, username=""):
    """Is called when execute-button is pushed by web-user. Should print
    output as HTML to standard out, which will be directed to a results
    page in Galaxy history. If getOutputFormat is anything else than HTML,
    the output should be written to the file with path galaxyFn. If needed,
    StaticFile can be used to get a path where additional files can be put
    (e.g. generated image files). choices is a list of selections made by
    web-user in each options box.

    Computes a significance limit from the FET values (median + qnorm *
    upper-percentile stddev), keeps the window addresses passing it, and
    writes merged windows as a GTrack segments file to galaxyFn.
    """
    print "Executing..."
    # Unpack the web-form selections (positional — order fixed by the tool UI).
    genome = choices[0]
    infile = choices[1]
    windowSize = int(choices[2])
    normquantile = float(choices[3])
    percentile = float(choices[4])
    inFn = ExternalTrackManager.extractFnFromGalaxyTN(infile.split(":"))
    data = open(inFn, "r").read()
    # Column 2: FET values, column 3: stddevs; both return (values, addresses).
    fetVals, addr = cls.preProcessPvalues(data, 2)
    stddevs, addr = cls.preProcessPvalues(data, 3)
    output = open(galaxyFn, "w")
    # Tuva changed sorted elms to FALSE
    output.write(
        "##gtrack version: 1.0\n" +
        "##track type: segments\n" +
        "##uninterrupted data lines: true\n" +
        "##sorted elements: false\n" +
        "##no overlapping elements: true\n" +
        "###seqid\tstart\tend\n"
    )
    # Calculate limit for FET:
    m = stats.cmedian(fetVals)
    upperquant = stats.scoreatpercentile(stddevs, percentile)
    qnorm = stats.norm.ppf(normquantile)
    limit = m + qnorm * upperquant
    print "Windows found", sum(fetVals >= limit)
    print "percentile", percentile, "normquantile", normquantile
    print "mean", m, "upperquant", upperquant, "qnorm", qnorm
    print "Limit", limit
    addrs = numpy.array(addr)
    filteredaddrs = addrs[fetVals >= limit]
    print GenomeInfo.getChrList(genome)
    # Merge consecutive passing windows (closer than windowSize on the same
    # chromosome) into single segments.  Addresses look like "chrom\tstart"
    # — presumably sorted by position within chromosome; confirm upstream.
    curchrom = ""
    start = ""
    end = sys.maxint
    prevAddr = -1000000.0   # sentinel so the first address always starts a segment
    for addr in filteredaddrs:
        addrList = addr.split("\t")
        if addrList[0] != curchrom or int(addrList[1]) - windowSize > prevAddr:
            # New chromosome or a gap: flush the previous segment (if any).
            if curchrom != "":
                # Clip the segment end to the chromosome boundary.
                newend = prevAddr + windowSize if prevAddr + windowSize < end else end
                output.write(start + "\t" + str(newend) + "\n")
            start = addr
            curchrom = addrList[0]
            end = int(GenomeInfo.getChrLen(genome, curchrom)) - 1
        prevAddr = int(addr.split("\t")[1])
    # Flush the final open segment.
    newend = prevAddr + windowSize if prevAddr + windowSize < end else end
    output.write(start + "\t" + str(newend) + "\n")
    output.close()
def test_basic(self):
    """Check cmedian's grouped median against known values for one sample."""
    samples = [1, 2, 3, 1, 5, 3, 6, 4, 3, 2, 4, 3, 5, 2.0]
    # Expected grouped medians keyed by bin count.
    expected_by_bins = {5: 3.2916666666666665, 3: 3.083333333333333}
    for numbins, expected in expected_by_bins.items():
        assert_almost_equal(stats.cmedian(samples, numbins), expected)
    # No explicit bin count: exercises the default.
    assert_almost_equal(stats.cmedian(samples), 3.0020020020020022)