def __init__(
    self,
    path,
    lastcol=None,
    lastrow=None,
    colmeta=None,
    rowmeta=None,
    eps_noise=False,
):
    """Load a table and separate its data from its row/column metadata.

    path      -- passed to ``table`` to load the data.
    lastrow   -- if given, passed to ``tbl.head(..., invert=True)``;
                 presumably the last metadata row to strip (TODO confirm).
    lastcol   -- same as ``lastrow`` but applied with ``transposed=True``.
    colmeta   -- iterable of column-metadata sources, or None. A source
                 that exists on disk is read with ``col2dict``; otherwise
                 it is looked up in the unstripped table via ``rowdict``.
    rowmeta   -- same as ``colmeta`` for rows (``coldict`` lookup).
    eps_noise -- if true, entries are converted to float and a random
                 offset scaled by ``c_eps`` is added to each.
    """
    self.tbl = table(path)
    # pristine copy: metadata named in colmeta / rowmeta is read from here
    self.bak = self.tbl.copy()

    # strip / save metadata
    if lastrow is not None:
        self.tbl.head(lastrow, invert=True)
    if lastcol is not None:
        self.tbl.head(lastcol, invert=True, transposed=True)
    self.row = self.tbl.rowheads[:]
    self.col = self.tbl.colheads[:]

    if eps_noise:
        self.tbl.float()
        self.tbl.apply_entries(
            lambda value: value + c_eps * random.random()
        )
    self.dat = self.tbl.table2array()

    # colmetas from file / table
    if colmeta is None:
        self.colmeta = None
        self.colmetaname = None
    else:
        self.colmeta, self.colmetaname = [], []
        for source in colmeta:
            from_file = os.path.exists(source)
            if from_file:
                warn("Loading col metadata from file:", source)
                lookup = col2dict(source, value=1)
            else:
                lookup = self.bak.rowdict(source)
            # headers without an entry get the "none" placeholder
            self.colmeta.append(
                [lookup.get(head, c_str_none) for head in self.col])
            self.colmetaname.append(
                path2name(source) if from_file else source)

    # rowmetas from file / table
    if rowmeta is None:
        self.rowmeta = None
        self.rowmetaname = None
    else:
        self.rowmeta, self.rowmetaname = [], []
        for source in rowmeta:
            from_file = os.path.exists(source)
            if from_file:
                warn("Loading row metadata from file:", source)
                lookup = col2dict(source, value=1)
            else:
                lookup = self.bak.coldict(source)
            # headers without an entry get the "none" placeholder
            self.rowmeta.append(
                [lookup.get(head, c_str_none) for head in self.row])
            self.rowmetaname.append(
                path2name(source) if from_file else source)
def main():
    """Run the Fisher enrichment workflow from the command line.

    Loads the gene list, the optional background and the gene sets named
    by the parsed arguments, calls ``fisher_enrich``, and writes one
    tab-delimited row per result to ``args.outfile`` (or to stdout when
    no outfile is given).

    Returns None.
    """
    args = get_args()

    # load genes (accounting for linkage)
    genes = col2dict(
        args.genes,
        value=(1 if args.linked else None),
        headers=args.skip_headers,
    )
    # entries whose loaded value is None map to themselves
    genes = {g: (g if k is None else k) for g, k in genes.items()}

    # load background (accounting for linkage)
    background = None
    if args.background is not None:
        background = col2dict(
            args.background,
            value=(1 if args.linked else None),
            headers=args.skip_headers,
        )
        background = {
            g: (g if k is None else k) for g, k in background.items()
        }

    # load gene sets
    gene_sets = polymap(
        args.gene_sets,
        reverse=args.reversed_mapping,
    )

    # run analysis
    results = fisher_enrich(
        genes,
        gene_sets,
        depletions=not args.exclude_depletions,
        background=background,
        intersect_background=args.intersect_background,
        intersect_annotated=args.intersect_annotated,
        fdr=args.fdr,
        min_expected_overlap=args.min_expected_overlap,
        verbose=False,
    )

    # write results
    to_file = args.outfile is not None
    fh = open(args.outfile, "w") if to_file else sys.stdout
    try:
        writer = csv.writer(fh, dialect="excel-tab")
        writer.writerow(c_fisher_fields)
        for R in results:
            writer.writerow(R.row())
        # wrapup
        if len(results) == 0:
            say("# NO SIGNIFICANT ENRICHMENTS")
    finally:
        # only close a handle we opened ourselves; sys.stdout must stay
        # usable for whoever called main()
        if to_file:
            fh.close()
        else:
            fh.flush()
    return None
def main():
    """Simulate reads drawn from a protein FASTA file.

    Proteins are chosen with probability proportional to their effective
    length (``len(seq) - frag_len + 1``), optionally multiplied by
    per-protein weights from ``args.weights``; ``rand_read`` is called
    once per simulated read.
    """
    args = get_args()

    # derived args
    fasta_path = args.faa
    n_reads = args.reads
    pident = args.pident
    if pident > 1:
        # a value above 1 cannot be a fraction, so treat it as a percent
        print >> sys.stderr, "interpret", pident, "as percent;",
        pident /= 100.0
        print >> sys.stderr, "fractional value is", pident
    mut_rate = 1 - pident
    read_len = args.readlen
    # fragment length is a third of the read length, rounded up
    frag_len = int(math.ceil(read_len / 3.0))

    # load proteins
    faa = load_faa(fasta_path, frag_len)

    # initialize weights to effective protein lengths
    weights = {}
    for name, seq in faa.items():
        weights[name] = len(seq) - frag_len + 1

    # augment weights? (proteins missing from the file get weight 0)
    if args.weights:
        extra = col2dict(args.weights, value=1, func=float)
        weights = {
            name: base * extra.get(name, 0)
            for name, base in weights.items()
        }

    # make the chooser
    chooser = WeightedChooser(weights)

    # make the reads (weighting by protein length); reads are numbered from 1
    for read, prot_name in enumerate(chooser.iter_choice(n_reads), 1):
        rand_read(
            faa[prot_name], prot_name, read, read_len, frag_len, mut_rate
        )
def metacolors(values, cmap):
    """Return a dict mapping each distinct entry of ``values`` to a color.

    ``cmap`` is either the path of an existing file, loaded with
    ``col2dict(cmap, value=1)``, or a specification handed to
    ``mu.ncolors``. The special labels ``c_str_other`` and ``c_str_none``
    never take a color from ``cmap``; when present in ``values`` they are
    assigned ``c_other_color`` and ``c_none_color`` respectively.
    """
    labels = set(values)

    # set the two special labels aside; they are colored at the end
    found_other = c_str_other in labels
    labels.discard(c_str_other)
    found_none = c_str_none in labels
    labels.discard(c_str_none)

    if os.path.exists(cmap):
        loaded = col2dict(cmap, value=1)
        # labels absent from the file fall back to the "none" color
        for label in labels:
            loaded.setdefault(label, c_none_color)
        # drop file entries that do not occur in values
        cmap = {
            label: color
            for label, color in loaded.items()
            if label in labels
        }
    else:
        palette = mu.ncolors(len(labels), cmap)
        cmap = dict(zip(sorted(labels), palette))

    if found_other:
        cmap[c_str_other] = c_other_color
    if found_none:
        cmap[c_str_none] = c_none_color
    return cmap
def rowlabel(self, df, des, ax_label, scaled=False, path=None):
    """Draw the row labels and the row-axis title on ``self.rownames``.

    df       -- object whose ``row`` attribute lists the row names.
    des      -- layout object; its ``wmargin`` values for "heatmap" and
                "rownames" decide which side the labels are anchored on.
    ax_label -- axis title; the row count is appended as "(n=...)".
    scaled   -- if true, labels are drawn even when there are more than
                ``c_max_lab`` rows.
    path     -- optional file loaded with ``col2dict(path, value=1)`` to
                override the displayed names; rows missing from it are
                shown blank.
    """
    n_rows = len(df.row)
    ax_label = "{} (n={:,})".format(ax_label, n_rows)

    # override row names from the table?
    use_file = path is not None
    mapping = col2dict(path, value=1) if use_file else {}
    for name in df.row:
        if name not in mapping:
            mapping[name] = "" if use_file else name

    axes = self.rownames

    if n_rows > c_max_lab and not scaled:
        # too crowded: show only the axis title, centered in the panel
        warn("Too many row labels.")
        cx = sum(axes.get_xlim()) / 2.0
        cy = sum(axes.get_ylim()) / 2.0
        axes.text(
            cx, cy, ax_label,
            size=c_font3,
            ha="center",
            va="center",
            rotation=90,
            rotation_mode="anchor",
        )
        return

    ha = "left"
    xi, xf = axes.get_xlim()
    if des.wmargin("heatmap") > des.wmargin("rownames"):
        # anchor the labels at the opposite edge of the panel
        ha = "right"
        xi, xf = xf, xi
    axes.set_ylim(0, n_rows)
    for index, name in enumerate(df.row):
        axes.text(
            xi,
            index + 0.5,
            mapping[name],
            ha=ha,
            va="center",
            size=c_font1,
            clip_on=False,
        )

    # axis title goes on the edge opposite the label anchor
    title_va = "bottom" if ha == "left" else "top"
    cx = xf
    cy = sum(axes.get_ylim()) / 2.0
    axes.text(
        cx, cy, ax_label,
        size=c_font3,
        ha="center",
        va=title_va,
        rotation=90,
        rotation_mode="anchor",
    )
def collabel(self, df, des, ax_label, scaled=False, path=None):
    """Draw the column labels and the column-axis title on ``self.colnames``.

    df       -- object whose ``col`` attribute lists the column names.
    des      -- layout object; its ``hmargin`` values for "heatmap" and
                "colnames" decide which side the labels are anchored on.
    ax_label -- axis title; the column count is appended as "(n=...)".
    scaled   -- if true, labels are drawn even when there are more than
                ``c_max_lab`` columns.
    path     -- optional file loaded with ``col2dict(path, value=1)`` to
                override the displayed names; columns missing from it are
                shown blank.
    """
    n_cols = len(df.col)
    ax_label = "{} (n={:,})".format(ax_label, n_cols)

    # override col names from the table?
    use_file = path is not None
    mapping = col2dict(path, value=1) if use_file else {}
    for name in df.col:
        if name not in mapping:
            mapping[name] = "" if use_file else name

    axes = self.colnames

    if n_cols > c_max_lab and not scaled:
        # too crowded: show only the axis title, centered in the panel
        warn("Too many column labels.")
        cx = sum(axes.get_xlim()) / 2.0
        cy = sum(axes.get_ylim()) / 2.0
        axes.text(cx, cy, ax_label, size=c_font3, ha="center", va="center")
        return

    ha = "left"
    yi, yf = axes.get_ylim()
    if des.hmargin("heatmap") < des.hmargin("colnames"):
        # anchor the labels at the opposite edge of the panel
        yi, yf = yf, yi
        ha = "right"
    axes.set_xlim(0, n_cols)
    for index, name in enumerate(df.col):
        axes.text(
            index + 0.5,
            yi,
            mapping[name],
            rotation=45,
            rotation_mode="anchor",
            va="center",
            ha=ha,
            size=c_font1,
            clip_on=False,
        )

    # axis title sits at x=0 on the edge opposite the label anchor
    title_va = "top" if ha == "left" else "bottom"
    cx = 0
    cy = yf
    axes.text(
        cx,
        cy,
        ax_label,
        size=c_font3,
        ha="left",
        va=title_va,
    )
def rowlabel(self, df, ax_label, scaled=False, path=None):
    """Draw the row labels and the row-axis title on ``self.rownames``.

    df       -- object whose ``row`` attribute lists the row names.
    ax_label -- axis title; the row count is appended as "(n=...)".
    scaled   -- if true, labels are drawn even when there are more than
                ``c_max_lab`` rows.
    path     -- optional file loaded with ``col2dict(path, value=1)`` to
                override the displayed names; rows missing from it are
                shown blank.
    """
    n_rows = len(df.row)
    ax_label = "{} (n={:,})".format(ax_label, n_rows)

    # override row names from the table?
    use_file = path is not None
    mapping = col2dict(path, value=1) if use_file else {}
    for name in df.row:
        if name not in mapping:
            mapping[name] = "" if use_file else name

    axes = self.rownames

    if n_rows > c_max_lab and not scaled:
        # too crowded: show only the axis title, centered in the panel
        warn("Too many row labels.")
        cx = sum(axes.get_xlim()) / 2.0
        cy = sum(axes.get_ylim()) / 2.0
        axes.text(
            cx, cy, ax_label,
            size=c_font3,
            ha="center",
            va="center",
            rotation=90,
            rotation_mode="anchor",
        )
        return

    # one label per row, centered on the row (index + 0.5) at x = 0
    axes.set_ylim(0, n_rows)
    for index, name in enumerate(df.row):
        axes.text(
            0,
            index + 0.5,
            mapping[name],
            va="center",
            size=c_font1,
            clip_on=True,
        )

    # axis title at the upper x-limit, vertically centered
    cx = axes.get_xlim()[1]
    cy = sum(axes.get_ylim()) / 2.0
    axes.text(
        cx, cy, ax_label,
        size=c_font3,
        ha="center",
        va="bottom",
        rotation=90,
        rotation_mode="anchor",
    )
def main():
    """Run the rank-based enrichment workflow from the command line.

    Loads the keyed values and the key sets named by the parsed
    arguments, calls ``rank_enrich``, and writes one tab-delimited row
    per result to ``args.outfile`` (or to stdout when no outfile is
    given).

    Returns None.
    """
    args = get_args()

    # load key values
    def make_link(row):
        """Build a Link from a row: key from column 1 (linked) or 0, value from the last column."""
        key = row[1] if args.linked else row[0]
        return Link(key, float(row[-1]))

    values = col2dict(
        args.values,
        func=make_link,
        headers=args.skip_headers,
    )

    # load key sets
    gene_sets = polymap(
        args.gene_sets,
        reverse=args.reversed_mapping,
    )

    # perform analysis
    results = rank_enrich(
        values,
        gene_sets,
        depletions=not args.exclude_depletions,
        intersect_annotated=args.intersect_annotated,
        fdr=args.fdr,
        min_overlap=args.min_overlap,
        verbose=True,
    )

    # write results
    to_file = args.outfile is not None
    fh = open(args.outfile, "w") if to_file else sys.stdout
    try:
        writer = csv.writer(fh, dialect="excel-tab")
        writer.writerow(c_rank_fields)
        for R in results:
            writer.writerow(R.row())
        # wrapup
        if len(results) == 0:
            say("# NO SIGNIFICANT ENRICHMENTS")
    finally:
        # only close a handle we opened ourselves; sys.stdout must stay
        # usable for whoever called main()
        if to_file:
            fh.close()
        else:
            fh.flush()
    return None
def main():
    """Filter an ontology and write the surviving terms as a polymap.

    Loads the ontology from ``args.obo``, optionally attaches genes, then
    marks terms unacceptable according to whichever cuts were requested
    (informative, prune, depth, grep, namespace). Each acceptable term is
    written as one tab-delimited line to ``args.outfile`` (or stdout):
    the term itself, followed by its genes unless ``args.terms_only``.
    """
    args = get_args()

    # load obo / report rel type
    obo = Ontology(args.obo)
    warn("Summary of relationship types:")
    for k in sorted(parentage_types):
        warn(k, parentage_types[k])

    # attach genes
    if args.mapping is not None:
        mapping = polymap(args.mapping, reverse=args.flip)
        if args.allowed_genes is not None:
            allowed = col2dict(args.allowed_genes)
            mapping = {k: v for k, v in mapping.items() if k in allowed}
        obo.attach_genes(mapping)
        warn("# of attached genes:", len(obo.attached_genes))

    # informative cut
    if args.informative is not None:
        threshold = float(args.informative)
        if threshold < 1:
            # a cutoff below 1 is a fraction of the attached genes
            warn(
                "Intepretting informative cutoff as fraction of annotated genes"
            )
            threshold *= len(obo.attached_genes)
        threshold = int(threshold)
        obo.set_informative(threshold)
        for term in obo.iter_terms():
            if not term.is_informative:
                term.is_acceptable = False

    # pruning cut
    if args.prune is not None:
        obo.prune(args.prune)
        for term in obo.iter_terms():
            if not term.is_pruned:
                term.is_acceptable = False

    # depth cut
    if args.depth is not None:
        for term in obo.iter_terms():
            if term.depth != args.depth:
                term.is_acceptable = False

    # grep cut
    if args.grep is not None:
        for term in obo.iter_terms():
            if not re.search(args.grep, term.name):
                term.is_acceptable = False

    # namespace cut
    if args.namespace is not None:
        for term in obo.iter_terms():
            if term.namespace_short not in args.namespace:
                term.is_acceptable = False

    # output the new polymap
    to_file = args.outfile is not None
    fh = open(args.outfile, "w") if to_file else sys.stdout
    try:
        for term in obo.iter_terms():
            if not term.is_acceptable:
                continue
            outline = [str(term)]
            if not args.terms_only:
                genes = (
                    term.genes
                    if args.ignore_progeny
                    else term.get_progeny_genes()
                )
                outline += list(genes)
            # explicit write instead of the Python-2-only "print >> fh"
            fh.write("\t".join(outline) + "\n")
    finally:
        # only close a handle we opened ourselves; sys.stdout must stay
        # usable for whoever called main()
        if to_file:
            fh.close()
        else:
            fh.flush()
# ---------------------------------------------------------------

strInputPath = args.input
strMode = args.mode
# default output name is "<input>.<mode>"; str.join takes ONE iterable,
# so the two pieces must be wrapped in a list
strOutputPath = args.output if args.output is not None else ".".join( [strInputPath, strMode] )
fReadLength = args.read_length
strSampleReadsPath = args.sample_reads

# ---------------------------------------------------------------
# manipulate data
# ---------------------------------------------------------------

if strMode == "rpkm":
    if strSampleReadsPath is None:
        sys.exit( "to compute rpkm you must include a file mapping sample IDs to #s of reads" )
    # sample ID -> number of reads, expressed in millions
    dictMillions = col2dict( strSampleReadsPath, key=0, value=1, func=lambda x: float( x ) / 1e6 )

tableCladeRPK = table( strInputPath )
# keep only rows whose header contains "s__"
tableCladeRPK.grep( "headers", "s__" )
tableCladeRPK.float()
# collapse rows sharing the text before the first "|" using the median
tableCladeRPK.groupby( lambda x: x.split( "|" )[0], median )

# "rpk" mode leaves the values untouched
if strMode != "rpk":
    for bug, sample, value in tableCladeRPK.iter_entries():
        if strMode == "coverage":
            tableCladeRPK.set( bug, sample, value * fReadLength / 1e3 )
        elif strMode == "rpkm":
            # samples with a non-positive read count get 0 instead of a division error
            tableCladeRPK.set( bug, sample, value / dictMillions[sample] if dictMillions[sample] > 0 else 0 )

tableCladeRPK.unfloat()
tableCladeRPK.colsort()
def print2(d): for key, d2 in d.items(): print key for key2, val in d2.items(): print " ", key2, "-->", val print # --------------------------------------------------------------- # col2dict tests # --------------------------------------------------------------- print "col2dict tests" os.system("head %s" % (file)) print1(d.col2dict(file)) print1(d.col2dict(file, headers=True)) print1(d.col2dict(file, key=1, headers=True)) print1(d.col2dict(file, key=1, value=2, headers=True)) print1(d.col2dict(file, key=1, func=lambda row: float(row[2]), headers=True)) # --------------------------------------------------------------- # col2dict2 tests # --------------------------------------------------------------- print "col2dict2 tests" os.system("head %s" % (file)) print2(d.col2dict2(file)) print2(d.col2dict2(file, headers=True, mirror=True)) print2( d.col2dict2(file,
#! /usr/bin/env python

import os
import sys
import re
import glob
import argparse

from zopy.table2 import table
from zopy.dictation import col2dict
from zopy.utils import path2name

# Rename the column headers of a table.
# argv[1]: mapping file (column 0 -> column 1)
# argv[2]: input table
# argv[3]: output table

dictMap = col2dict( sys.argv[1], key=0, value=1 )
tableData = table( sys.argv[2] )

# Three successive passes over the column headers:
#   1. path2name
#   2. keep the text before the first "."
#   3. look the result up in the mapping (a missing key raises KeyError)
for funcRename in (
    path2name,
    lambda strHead: strHead.split( "." )[0],
    lambda strHead: dictMap[strHead],
):
    tableData.apply_colheads( funcRename )

tableData.dump( sys.argv[3] )