def __init__(
        self,
        path,
        lastcol=None,
        lastrow=None,
        colmeta=None,
        rowmeta=None,
        eps_noise=False,
):
    self.tbl = table(path)
    self.bak = self.tbl.copy()
    # strip / save metadata
    if lastrow is not None:
        self.tbl.head(lastrow, invert=True)
    if lastcol is not None:
        self.tbl.head(lastcol, invert=True, transposed=True)
    self.row = self.tbl.rowheads[:]
    self.col = self.tbl.colheads[:]
    if eps_noise:
        self.tbl.float()
        self.tbl.apply_entries(lambda x: x + c_eps * random.random())
    self.dat = self.tbl.table2array()
    # colmetas from file / table
    if colmeta is None:
        self.colmeta = None
        self.colmetaname = None
    else:
        self.colmeta = []
        self.colmetaname = []
        for x in colmeta:
            if os.path.exists(x):
                warn("Loading col metadata from file:", x)
                temp = col2dict(x, value=1)
                self.colmeta.append([temp.get(k, c_str_none) for k in self.col])
                self.colmetaname.append(path2name(x))
            else:
                temp = self.bak.rowdict(x)
                self.colmeta.append([temp.get(k, c_str_none) for k in self.col])
                self.colmetaname.append(x)
    # rowmetas from file / table
    if rowmeta is None:
        self.rowmeta = None
        self.rowmetaname = None
    else:
        self.rowmeta = []
        self.rowmetaname = []
        for x in rowmeta:
            if os.path.exists(x):
                warn("Loading row metadata from file:", x)
                temp = col2dict(x, value=1)
                self.rowmeta.append([temp.get(k, c_str_none) for k in self.row])
                self.rowmetaname.append(path2name(x))
            else:
                temp = self.bak.coldict(x)
                self.rowmeta.append([temp.get(k, c_str_none) for k in self.row])
                self.rowmetaname.append(x)
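# A hypothetical construction of the object above (the class name and the
# file/field names here are illustrative, not from the source). colmeta/rowmeta
# entries that exist as files on disk are read with col2dict(); anything else is
# looked up as a metadata row/column of the backup table.
#
#   obj = DatasetWrapper(
#       "features.pcl",
#       lastrow="STSite",                  # last metadata row to strip/save
#       colmeta=["STSite", "meta.txt"],    # one table field, one external file
#       eps_noise=True,                    # jitter entries by c_eps * U(0,1)
#   )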
def main( ):
    args = get_args( )
    tbl = table( args.table )
    data = tbl.table2array( args.last_metadata ).transpose( )
    dist = distance_matrix( data, args.dissimilarity )
    if args.method == "manual":
        embedding, varexp, goodness = ordinate_cmdscale( dist )
    else:
        embedding, varexp, goodness = ordinate_sklearn( dist, method=args.method )
    fig = plt.figure()
    fig.set_size_inches( 10, 6 )
    ax = plt.subplot( 111 )
    xcoords = embedding[:, 0]
    ycoords = embedding[:, 1]
    shapes = ["o" for x in xcoords]
    if args.shapeby is not None:
        field, path = args.shapeby
        shapes = shapeize( tbl.row( field ), path, ax )
    colors = ["black" for x in xcoords]
    if args.colorby is not None:
        field, path = args.colorby
        colors = colorize( tbl.row( field ), path, ax )
    for x, y, c, s in zip( xcoords, ycoords, colors, shapes ):
        ax.scatter( x, y, color=c, marker=s )
    ax.set_xlim( min( xcoords ), max( xcoords ) )
    ax.set_ylim( min( ycoords ), max( ycoords ) )
    mu.funcMargin( ax )
    title = path2name( args.table ) if args.title is None else args.title
    ax.set_title( "%s | Goodness of fit = %.3f" % ( title, goodness ) )
    ax.set_xlabel( "Dimension 1 (%.1f%%)" % ( 100 * varexp[0] ) )
    ax.set_ylabel( "Dimension 2 (%.1f%%)" % ( 100 * varexp[1] ) )
    mu.funcSetTickParams( ax )
    for m in args.level_biplot:
        level_biplot( ax, embedding, tbl.row( m ) )
    for m in args.quant_biplot:
        quant_biplot( ax, embedding, tbl.row( m ), m )
    # shrink current axis by 30% to make room for the legend
    box = ax.get_position( )
    ax.set_position( [box.x0, box.y0, box.width * 0.7, box.height] )
    ax.legend(
        scatterpoints=1,
        fontsize=8,
        loc='center left',
        bbox_to_anchor=(1, 0.5),
    )
    #plt.tight_layout( )
    plt.savefig( args.outfile )
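# The "manual" branch above calls ordinate_cmdscale(), whose source is not shown
# here. For reference, a minimal classical MDS (principal coordinates analysis)
# on a distance matrix can be sketched with numpy alone; this is an illustrative
# stand-in under that assumption, not the author's implementation, and the
# "goodness" definition (variance captured by the first two axes) is a guess.

import numpy as np

def cmdscale_sketch( dist ):
    """Classical MDS: embed a symmetric distance matrix in Euclidean space."""
    n = dist.shape[0]
    # double-center the squared distances
    J = np.eye( n ) - np.ones( (n, n) ) / n
    B = -0.5 * J.dot( dist**2 ).dot( J )
    # eigendecompose; keep positive eigenvalues, sorted in descending order
    evals, evecs = np.linalg.eigh( B )
    order = np.argsort( evals )[::-1]
    evals, evecs = evals[order], evecs[:, order]
    keep = evals > 0
    embedding = evecs[:, keep] * np.sqrt( evals[keep] )
    varexp = evals[keep] / evals[keep].sum( )
    goodness = varexp[:2].sum( )
    return embedding, varexp, goodness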
def main():
    args = get_args()
    T = table(args.table)
    if args.select is not None:
        T.select(args.select[0], args.select[1], transposed=True)
    groups = T.row(args.groups)
    if args.last_metadata is not None:
        m, T = T.metasplit(args.last_metadata)
    T.float()
    adonis(T, groups)
def main( ):
    args = get_args( )
    t = table( args.table, verbose=False )
    if args.stratify_by is None:
        dt = {}
        dt["ALL_SAMPLES"] = t
    else:
        dt = t.stratify( args.stratify_by )
    reports = {}
    for level, t in dt.items( ):
        if args.last_metadata is not None:
            m, t = t.metasplit( args.last_metadata )
        reports[level] = check( t )
    for level, [summary, failed] in reports.items( ):
        print "Considering table <{}> at level <{}>:".format( args.table, level )
        print " Summary:", summary
        print " FAILED: ", len( failed )
        for s, msg in failed.items( ):
            print "\t".join( [" FAILED:", s, msg] )
    return None
#!/usr/bin/env python

import sys

from zopy.table2 import table

t = table(sys.argv[1])
t.float()
t.groupby(lambda x: x[0].split("|")[0], sum)
t.dump()
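# Hypothetical invocation (script and file names are illustrative): collapse a
# stratified "feature|stratum" table to per-feature totals by summing over the
# strata, assuming dump() with no argument writes the table to stdout.
#
#   python collapse_strata.py stratified_table.pcl > collapsed_table.pcl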
)
parser.add_argument(
    "-l", "--legacy",
    action="store_true",
    help="iteratively merge tables (better maintains feature order)",
)
args = parser.parse_args()

# ---------------------------------------------------------------
# load and process data
# ---------------------------------------------------------------

p = args.tables[0]
if len( args.tables ) == 1:
    t = table( p )
elif args.legacy:
    t = table( p )
    for p2 in args.tables[1:]:
        t2 = table( p2 )
        t.merge( t2 )
else:
    data = {}
    for p in args.tables:
        d = table( p ).table2nesteddict( )
        for r in d:
            inner = data.setdefault( r, {} )
            for c in d[r]:
                if c in inner and inner[c] != d[r][c]:
                    warn( p, "overwrites", r, c, inner[c], "with", d[r][c] )
                inner[c] = d[r][c]
  f: focus on non-header field (first arg)
  i: invert
  m: protect/reattach metadata
  x: read choices from file (one per line)
===============================================
Author: Eric Franzosa ([email protected])
"""

import os, sys

from zopy.table2 import table

args = sys.argv[1:]

# first arg is a command cluster; read stdin
if args[0][0] == "-":
    t = table()
# interpret first arg as a file name
else:
    t = table(args[0])
    args = args[1:]

# determine operations
ops = []
i = 0
while i < len(args):
    if args[i][0] == "-":
        op = [args[i], []]
        i += 1
        while i < len(args) and args[i][0] != "-":
            op[1].append(args[i])
            i += 1
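# Illustrative parse by the loop above (script and file names are hypothetical,
# and the excerpt ends before each completed cluster is presumably appended to
# ops): the command line
#
#   table_tool.py input.pcl -x choices.txt -i
#
# reads the table from input.pcl and splits the remaining arguments into
# operation clusters of the form [flag, [values...]]:
#
#   ops == [["-x", ["choices.txt"]], ["-i", []]]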
strInputPath = args.input
strMode = args.mode
strOutputPath = args.output if args.output is not None else ".".join( [strInputPath, strMode] )
fReadLength = args.read_length
strSampleReadsPath = args.sample_reads

# ---------------------------------------------------------------
# manipulate data
# ---------------------------------------------------------------

if strMode == "rpkm":
    if strSampleReadsPath is None:
        sys.exit( "to compute rpkm you must include a file mapping sample IDs to #s of reads" )
    dictMillions = col2dict( strSampleReadsPath, key=0, value=1, func=lambda x: float( x ) / 1e6 )
tableCladeRPK = table( strInputPath )
tableCladeRPK.grep( "headers", "s__" )
tableCladeRPK.float()
tableCladeRPK.groupby( lambda x: x.split( "|" )[0], median )
if strMode != "rpk":
    for bug, sample, value in tableCladeRPK.iter_entries():
        if strMode == "coverage":
            tableCladeRPK.set( bug, sample, value * fReadLength / 1e3 )
        elif strMode == "rpkm":
            tableCladeRPK.set( bug, sample, value / dictMillions[sample] if dictMillions[sample] > 0 else 0 )
tableCladeRPK.unfloat()
tableCladeRPK.colsort()
tableCladeRPK.rowsort()
tableCladeRPK.dump( strOutputPath )
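# Worked example of the conversions above (numbers are illustrative): starting
# from a clade-level RPK (reads per kilobase) value of 250 --
#   coverage mode: value * read_length / 1e3 -> with 100 nt reads, 250 * 100 / 1000 = 25x
#   rpkm mode:     value / (sample reads / 1e6) -> with 20 M reads, 250 / 20 = 12.5 RPKM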
def trunc_normal(m, sd, zmax):
    outlier = True
    while outlier:
        sim = normal(m, sd)
        if abs(m - sim) / sd < zmax:
            outlier = False
    return sim

#-------------------------------------------------------------------------------
# munge hmp data
#-------------------------------------------------------------------------------

zu.say(args.basename, "->", "parsing HMP data")
T = table(args.hmp)
T.select("STSite", args.site, transposed=True)
T.select("VISNO", "1", transposed=True)
T.head("SRS", invert=True)
T.apply_rowheads(lambda x: x.split("|")[-1])
T.grep("headers", "s__")
T.grep("headers", "_unclassified", invert=True)
T.dump("subset.tmp")
T.float()
T.unrarify(1e-20, 1)
bugs = []
for bug, row in T.iter_rows():
    stats = []
    nonzero = [log(k) / log(10) for k in row if k > 0]
    stats.append(len(nonzero) / float(len(row)))
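# trunc_normal() above is a rejection sampler: it redraws from N(m, sd) until
# the draw falls within zmax standard deviations of the mean. A minimal usage
# sketch, assuming normal() is numpy.random.normal as the call signature
# suggests:
#
#   from numpy.random import normal
#   draws = [trunc_normal(0.0, 1.0, 2.0) for _ in range(1000)]
#   assert all(abs(d) < 2.0 for d in draws)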
    default=0, type=int, help="position post splitting to isolate")
args = parser.parse_args()

# process map
idmap, isnewid = {}, {}
with open(args.map) as fh:
    for line in fh:
        oldid, newid = line.strip().split("\t")
        if newid in isnewid:
            sys.exit("ERROR: %s listed 2+ times in map" % (newid))
        else:
            idmap[oldid] = newid
            isnewid[newid] = 1

# execute
def applier(colhead):
    oldid = colhead.split(args.split)[args.pos]
    if oldid in idmap:
        return idmap[oldid]
    else:
        print >> sys.stderr, "can't convert", colhead, "->", oldid
        return oldid

t = table(args.table)
t.apply_colheads(applier)
t.dump()
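# Illustrative behavior of applier() (IDs are hypothetical): with args.split="."
# and args.pos=0, a column head like "SRS011084.profile" yields the key
# "SRS011084"; if that key appears in the map it is replaced by its new ID,
# otherwise the original column head is kept and a warning goes to stderr.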
import sys

from zopy.table2 import table
from time import time

start = time()

def interval():
    global start
    stop = time()
    print stop - start, "seconds"
    start = stop

print "loading"
t1 = table(sys.argv[1])
t2 = table(sys.argv[2])
interval()

print "testing merge"
t1.merge(t2)
interval()

print "loading"
t3 = table(sys.argv[1])
t4 = table(sys.argv[2])
interval()

print "testing merge_old"
t3.merge_old(t4)
interval()

for i in range(len(t1.data)):
    default=None,
    help="value to insert in place of missing values",
)
parser.add_argument(
    "-m", "--metamerge",
    action="store_true",
    help="specify if second table is a metadata table",
)
args = parser.parse_args()

# ---------------------------------------------------------------
# load and process data
# ---------------------------------------------------------------

table1 = table(args.input1)
table2 = table(args.input2)
if args.metamerge:
    table1.metamerge(table2)
else:
    table1.merge(table2)
if args.fill_empty is not None:
    table1.apply_entries(lambda x: x if x != c_strNA else args.fill_empty)

# ---------------------------------------------------------------
# dump table
# ---------------------------------------------------------------

# not ideal
if args.output is not None:
    table1.dump(args.output)
def main():
    args = get_args()
    t = table(args.table)
    t.float()
    # apply scaling
    gmax = max([max(row) for header, row in t.iter_rows()])
    t.apply_entries(lambda x: (x / gmax)**(1 / float(args.scale)))
    # format labels
    xlab = sorted(t.colheads)
    ylab = sorted(t.rowheads, key=lambda x: argmax(t.rowdict(x)), reverse=True)
    # make plot
    colors = mu.ncolors(len(xlab), args.colormap)
    fig = plt.figure()
    axes = mu.funcPlotMatrix([1], [len(t.rowheads), 2])
    ax = axes[0][0]
    legend = axes[1][0]
    ax.xaxis.tick_top()
    ax.yaxis.tick_right()
    ax.set_xticks(range(len(xlab)))
    ax.set_yticks(range(len(ylab)))
    ax.set_xticklabels(xlab, rotation=45, rotation_mode="anchor", ha="left")
    ax.set_yticklabels(ylab)
    ax.set_xlim(-0.5, len(xlab) - 0.5)
    ax.set_ylim(-0.5, len(ylab) - 0.5)
    # add tiles
    for i, x in enumerate(xlab):
        for j, y in enumerate(ylab):
            value = t.entry(y, x)
            if value == 0:
                continue
            tileh = 1.0 if args.mode == "width" else value
            tilew = 1.0 if args.mode == "height" else value
            alpha = 1.0
            if args.emphasis == "fade" and argmax(t.rowdict(y)) != x:
                alpha = c_fadeout
            ax.add_patch(
                patches.Rectangle(
                    (i - tilew / 2.0, j - tileh / 2.0),
                    tilew,
                    tileh,
                    edgecolor="white",
                    facecolor=colors[i],
                    alpha=alpha,
                ))
            if args.emphasis == "dot" and argmax(t.rowdict(y)) == x:
                ax.scatter([i], [j], s=10, color="white", edgecolor="none", zorder=2)
    # add grid
    if args.grid == "box":
        mu.funcGrid2(
            ax,
            h=[k - 0.5 for k in range(1, len(ylab))],
            v=[k - 0.5 for k in range(1, len(xlab))],
            color="0.9",
            zorder=2,
            border=True,
        )
    elif args.grid == "cross":
        mu.funcGrid2(
            ax,
            h=[k for k in range(len(ylab))],
            v=[k for k in range(len(xlab))],
            color="0.9",
            zorder=0,
            border=True,
        )
    mu.funcHideBorder(ax)
    # cleanup
    mu.funcHideTicks(ax)
    # draw the legend
    mu.funcSetTickParams(legend)
    #mu.funcHideBorder( legend )
    legend.yaxis.tick_right()
    legend.set_yticks([0])
    legend.set_yticklabels(["Scale"])
    legend.set_xlim(-0.5, len(xlab) - 0.5)
    legend.set_xticks(range(len(xlab)))
    legend.set_ylim(-0.5, 0.5)
    start = nearest_power(gmax, 10)
    values = [(10**start) * 0.1**i for i in range(len(xlab))]
    # note: the gmax-based scale below overrides the nearest-power scale above
    values = [(gmax) * 0.1**i for i in range(len(xlab))]
    values.reverse()
    legend.set_xticklabels(["%.2g" % k for k in values])
    j = 0
    for i, value in enumerate(values):
        value = (value / gmax)**(1 / float(args.scale))
        tileh = 1.0 if args.mode == "width" else value
        tilew = 1.0 if args.mode == "height" else value
        legend.add_patch(
            patches.Rectangle(
                (i - tilew / 2.0, j - tileh / 2.0),
                tilew,
                tileh,
                edgecolor="none",
                facecolor="0.5",
            ))
    # finalize
    fig.set_size_inches(
        c_colw * len(xlab) + c_rowlabw,
        c_rowh * (1 + len(ylab)) + c_collabh,
    )
    plt.tight_layout()
    plt.savefig(args.outfile)
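# The scaling step above maps each entry onto [0, 1] relative to the global
# maximum and then applies a root to compress the dynamic range. Worked example
# with illustrative numbers: with gmax = 100 and args.scale = 2, an entry of 25
# becomes (25 / 100) ** (1 / 2.0) = 0.5, i.e. a tile half the size of the
# largest tile.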
#! /usr/bin/env python

import os, sys, re, glob, argparse

from zopy.table2 import table
from zopy.dictation import col2dict
from zopy.utils import path2name

dictMap = col2dict( sys.argv[1], key=0, value=1 )
tableData = table( sys.argv[2] )
tableData.apply_colheads( lambda x: path2name( x ) )
tableData.apply_colheads( lambda x: x.split( "." )[0] )
tableData.apply_colheads( lambda x: dictMap[x] )
tableData.dump( sys.argv[3] )
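# The first argument is a two-column, tab-delimited map read by col2dict(key=0,
# value=1): old ID, then new ID. Since column heads are first reduced to the
# file basename before the first ".", the map keys should match that reduced
# form. Hypothetical map file and invocation (names are illustrative):
#
#   G78901    Subject01_Stool
#   G78902    Subject02_Stool
#
#   python relabel_samples.py idmap.txt abundance.pcl relabeled.pcl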
c_strColor2 = "0.60"
c_strBackgroundColor = "0.95"

# constants from args
c_pathPCL = args.input
c_astrSamples = funcArgPathCheck(args.samples)
c_astrClades = funcArgPathCheck(args.clades)
c_strScaling = args.scaling
c_afBins = funcArgPathCheck(args.bins)
if c_afBins is not None:
    c_afBins = [float(k) for k in c_afBins]
    c_afBins.sort()
c_astrHighlight = funcArgPathCheck(args.highlight)

# load and manipulate table
tableData = table(c_pathPCL)
tableData.float()
if c_astrClades is not None:
    tableData.grep("headers", c_astrClades)
if c_astrSamples is not None:
    tableData.select("headers", c_astrSamples, transposed=True)
dictCladeTables = tableData.groupify(
    lambda strRowhead: strRowhead.split("|")[0])

# override these lists based on what was in the table
c_astrClades = dictCladeTables.keys()
c_astrSamples = tableData.colheads[:]

# derived constants
c_iClades = len(c_astrClades)
c_iSamples = len(c_astrSamples)
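# groupify() above partitions the table by the leading clade of each stratified
# rowhead. With hypothetical rowheads such as "g__Bacteroides|s__Bacteroides_dorei"
# and "g__Bacteroides|s__Bacteroides_fragilis", both rows land in the sub-table
# keyed by "g__Bacteroides" in dictCladeTables.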
# argparse
# ---------------------------------------------------------------

parser = argparse.ArgumentParser()
parser.add_argument('--input', help='')
parser.add_argument('--output', help='')
parser.add_argument('--logmin', type=float, help='')
parser.add_argument('--logmax', type=float, help='')
args = parser.parse_args()

# ---------------------------------------------------------------
# main
# ---------------------------------------------------------------

# get and manipulate data
t = table(args.input)
t.apply_entries(lambda x: None if x == "" else x)
t.apply_entries(lambda x: float(x) if x is not None else None)

# derived features
aColors = [
    plt.cm.Dark2(i / float(len(t.colheads)))
    for i, colhead in enumerate(t.colheads)
]
aaData = [row for rowhead, row in t.iter_rows()]
logmin, logmax = args.logmin, args.logmax

# make plot
fig = plt.figure()
axes = plt.subplot(111)
stepplot(axes, aaData, colors=aColors)