def dodict(action, alias=None, name=None, always=False, clean=False):
    to_clean = []
    cstrings = {}
    # long actions can be supplied as list
    if type(action) is list:
        action = " ".join(action)
    cstrings["action_original"] = action
    # replace aliased items via python formatting
    if alias is None:
        alias = {}
    for old, new in c_default_alias.items():
        if old in alias and alias[old] != new:
            print("warning, default alias overwrite:", old, alias[old], "<--", new,
                  file=sys.stderr)
        alias[old] = new
    action = action.format(**alias)
    # process file deps
    cstrings["action_formatted"] = action
    file_dep = []
    for match in re.finditer(r"([Dd]):(.*?)(\s|$)", action):
        flag, item, other = match.groups()
        if flag == "d":
            file_dep.append(item)
        elif flag == "D":
            cstrings[item] = status(item)
    action = re.sub("[Dd]:", "", action)
    # process targets
    targets = []
    for match in re.finditer(r"([Tt]):(.*?)(\s|$)", action):
        flag, item, other = match.groups()
        if flag == "t":
            targets.append(item)
        elif flag == "T":
            if not clean:
                sys.exit("Lethal Error: Folder target 'T:' used without invoking clean=True")
            targets.append(item)
            to_clean.append(item)
    action = re.sub("[Tt]:", "", action)
    if len(targets) == 0:
        say("Action has no targets?\n\t{}".format(cstrings["action_original"]))
    # remove commented items
    action = re.sub(" +#.*", "", action)
    cstrings["action_uncommented"] = action
    # expected task dictionary (augmented below)
    doitdict = {
        "targets": targets,
        "file_dep": file_dep,
        "actions": [(clean_targets, to_clean), (mkdirs, targets), action],
        "uptodate": [not always, config_changed(cstrings)],
        "verbosity": 2,
    }
    if name is not None:
        if type(name) is not str:
            name = ":".join([str(k) for k in name])
        doitdict["name"] = name
    # return task dictionary
    return doitdict
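# A hedged usage sketch (hypothetical command and file names, and assuming
# "python" is not also a key in c_default_alias): "d:" marks a file
# dependency, "t:" a target, and "{...}" slots are filled from the alias
# dict before the flags are parsed out of the command string.
#
#   task = dodict(
#       "{python} munge.py d:input.txt > t:output.txt",
#       alias={"python": "python3"},
#       name="munge",
#   )
#   # task["file_dep"] == ["input.txt"]; task["targets"] == ["output.txt"]
#   # task["actions"][-1] == "python3 munge.py input.txt > output.txt"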
def fdr(pvalues, **kwargs):
    if type(pvalues) is list:
        return pvalues2values(pvalues, **kwargs)
    elif type(pvalues) is dict:
        return pdict2qdict(pvalues, **kwargs)
    else:
        say("Can't FDR non-list, non-dict")
        return None
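# Hedged usage sketch: both branches are assumed to return multiple-testing-
# corrected q-values shaped like the input (pvalues2values and pdict2qdict
# are defined elsewhere in this module).
#
#   fdr([0.001, 0.04, 0.8])               # -> list of q-values
#   fdr({"geneA": 0.001, "geneB": 0.04})  # -> dict of key -> q-value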
def deduplicate(items):
    seen = set()
    deduplicated = False
    for i, k in enumerate(items):
        while k in seen:
            deduplicated = True
            k += "-dup"
        items[i] = k
        seen.add(k)
    if deduplicated:
        zu.say("Some fields were deduplicated.")
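# Illustration (in-place mutation; repeated names gain "-dup" suffixes until
# unique):
#
#   fields = ["a", "b", "a", "a"]
#   deduplicate(fields)
#   # fields == ["a", "b", "a-dup", "a-dup-dup"]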
def main():
    args = get_args()
    # load genes (accounting for linkage)
    genes = col2dict(
        args.genes,
        value=(1 if args.linked else None),
        headers=args.skip_headers,
    )
    genes = {g: (g if k is None else k) for g, k in genes.items()}
    # load background (accounting for linkage)
    background = None
    if args.background is not None:
        background = col2dict(
            args.background,
            value=(1 if args.linked else None),
            headers=args.skip_headers,
        )
        background = {g: (g if k is None else k) for g, k in background.items()}
    # load gene sets
    gene_sets = polymap(
        args.gene_sets,
        reverse=args.reversed_mapping,
    )
    # run analysis
    results = fisher_enrich(
        genes,
        gene_sets,
        depletions=not args.exclude_depletions,
        background=background,
        intersect_background=args.intersect_background,
        intersect_annotated=args.intersect_annotated,
        fdr=args.fdr,
        min_expected_overlap=args.min_expected_overlap,
        verbose=False,
    )
    # write results
    fh = open(args.outfile, "w") if args.outfile is not None else sys.stdout
    writer = csv.writer(fh, dialect="excel-tab")
    writer.writerow(c_fisher_fields)
    for R in results:
        writer.writerow(R.row())
    # wrapup
    if len(results) == 0:
        say("# NO SIGNIFICANT ENRICHMENTS")
    # avoid closing sys.stdout when no outfile was given
    if fh is not sys.stdout:
        fh.close()
    return None
def index(xx, yy):
    """
    provides the index positions of ( 0,y ), ( x,0 ), and nonzero points;
    ( 0,0 ) points are counted and discarded
    """
    xzeroes = []
    yzeroes = []
    nonzero = []
    discard = 0
    for i, (x, y) in enumerate(zip(xx, yy)):
        if x > 0 and y > 0:
            nonzero.append(i)
        elif x > 0 and y == 0:
            yzeroes.append(i)
        elif y > 0 and x == 0:
            xzeroes.append(i)
        else:
            discard += 1
    if discard > 0:
        say("zillplot ignoring", discard, "( 0,0 ) points")
    return xzeroes, yzeroes, nonzero
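# Worked example (made-up coordinates): points landing on an axis are split
# out so the caller can render them in the plot margins.
#
#   xx = [1.0, 0.0, 2.0, 0.0]
#   yy = [3.0, 4.0, 0.0, 0.0]
#   index(xx, yy)  # -> ([1], [2], [0]); the ( 0,0 ) point at index 3 is dropped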
def annotation_report(message, linking, annotations):
    is_linked = not all([k == v for k, v in linking.items()])
    say(message)
    is_annotated = generate_background(annotations)
    n_keys = len(linking)
    n_keys_annotated = len(
        {key for key, link in linking.items() if link in is_annotated})
    is_link = set(linking.values())
    n_links = len(is_link)
    n_links_annotated = len(is_link & is_annotated)
    # outer key results
    say(" Total keys: {:,}".format(n_keys))
    say(" Annotated keys: {:,} ({:.1f}%)".format(
        n_keys_annotated, 100 * n_keys_annotated / (c_eps + n_keys)))
    # inner key (link) results
    if is_linked:
        say(" Total links: {:,}".format(n_links))
        say(" Annotated links: {:,} ({:.1f}%)".format(
            n_links_annotated, 100 * n_links_annotated / (c_eps + n_links)))
    return None
def main():
    args = get_args()
    # load key values
    def make_link(row):
        key = row[1] if args.linked else row[0]
        return Link(key, float(row[-1]))
    values = col2dict(
        args.values,
        func=make_link,
        headers=args.skip_headers,
    )
    # load key sets
    gene_sets = polymap(
        args.gene_sets,
        reverse=args.reversed_mapping,
    )
    # perform analysis
    results = rank_enrich(
        values,
        gene_sets,
        depletions=not args.exclude_depletions,
        intersect_annotated=args.intersect_annotated,
        fdr=args.fdr,
        min_overlap=args.min_overlap,
        verbose=True,
    )
    # write results
    fh = open(args.outfile, "w") if args.outfile is not None else sys.stdout
    writer = csv.writer(fh, dialect="excel-tab")
    writer.writerow(c_rank_fields)
    for R in results:
        writer.writerow(R.row())
    # wrapup
    if len(results) == 0:
        say("# NO SIGNIFICANT ENRICHMENTS")
    # avoid closing sys.stdout when no outfile was given
    if fh is not sys.stdout:
        fh.close()
    return None
def __init__(self, gff_row, counter):
    # unique tag for locus based on position in GFF
    self.index = counter
    # gff fields
    if len(gff_row) != len(c_gff_fields):
        zu.die("Bad GFF row:", gff_row)
    for [fname, ftype], value in zip(c_gff_fields, gff_row):
        setattr(self, fname, ftype(value) if value != "." else value)
    # attributes
    temp = {}
    for item in self.attributes.split(";"):
        if "=" not in item:
            continue
        item = item.strip()
        # maxsplit tolerates "=" inside attribute values
        system, value = item.split("=", 1)
        if system in temp:
            zu.say("Warning: Multiple definitions for system", system)
        temp[system] = value
    self.attributes = temp
    # no name by default
    self.name = self.attributes.get("ID", None)
    self.code = ":".join([str(self.start), str(self.end), self.strand])
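# Sketch of the attribute parsing above on an example GFF3 attribute string:
#
#   "ID=gene0001;Name=dnaA;product=replication initiator"
#   # -> self.attributes == {"ID": "gene0001", "Name": "dnaA",
#   #                        "product": "replication initiator"}
#   # -> self.name == "gene0001"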
def preprocess_annotations(annotations, min_size):
    ni = nf = len(annotations)
    if min_size is not None:
        annotations = {
            k: v for k, v in annotations.items() if len(v) >= min_size
        }
        nf = len(annotations)
    say("Annotations:")
    say(" Loaded: {:,}".format(ni))
    if ni != nf:
        say(" After filtering: {:,} ({:.1f}%)".format(nf, 100.0 * nf / ni))
    return annotations
def progress(counter, annotations):
    say("Testing annotation {: >5d} of {: >5d}".format(counter, len(annotations)))
#!/usr/bin/env python

from __future__ import print_function

import os
import sys
import re
from collections import OrderedDict

from zopy.utils import try_open, say, die

try:
    from Bio import SeqIO
except ImportError:
    say("zopy.bio imported without biopython")

# ---------------------------------------------------------------
# fasta
# ---------------------------------------------------------------

def read_fasta(path, full_headers=False):
    fdict = OrderedDict()
    with try_open(path) as fh:
        for line in fh:
            line = line.strip()
            if line[0] == ">":
                header = line[1:]
                if not full_headers:
                    header = header.split()[0].rstrip("|")
            else:
"-m", "--mode", choices=["piped", "piped_humann"], help="special sorting options", ) args = parser.parse_args() # --------------------------------------------------------------- # load all data # --------------------------------------------------------------- dictTableData = {} # modified for faster looking up 4/2/2015 dictFeatureIndex = {} say("Will load:", len(args.inputs), "gathered from command line") if args.file is not None: before = len(args.inputs) with open(args.file) as fh: for line in fh: args.inputs.append(line.strip()) after = len(args.inputs) say("Will load:", after - before, "additional files gathered from:", args.file) for iDex, strPath in enumerate(args.inputs): say(sys.stderr, "Loading", iDex + 1, "of", len(args.inputs), ":", strPath) aastrData = [] strColhead = path2name( strPath) if not args.use_full_names else os.path.split(strPath)[1]
def report(self, *args, **kwargs):
    items = [self.sourcename, "::", " ".join([str(k) for k in args])]
    if kwargs.get("die", False):
        zu.die(*items)
    elif self.verbose:
        zu.say(*items)
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("fasta1")
    parser.add_argument("fasta2")
    parser.add_argument("-t", "--top", default=1, type=int)
    parser.add_argument("-l", "--local", action="store_true")
    parser.add_argument("-k", "--k-size", default=3, type=int)
    parser.add_argument("-c", "--compress", default=None, type=float)
    return parser.parse_args()

if __name__ == "__main__":
    args = get_args()
    zu.say("Loading fasta1")
    fasta1 = read_fasta(args.fasta1)
    zu.say("Loading fasta2")
    fasta2 = read_fasta(args.fasta2)
    zu.say("Indexing fasta2")
    I = Index(k=args.k_size)
    I.update_from_dict(fasta2)
    if args.compress:
        zu.say("Compressing index")
        I.compress(args.compress)
    zu.say("Searching")
    for i, name1 in enumerate(fasta1):
        seq = fasta1[name1]
        hits = I.score(seq, top=args.top, local=args.local)
        for hit in hits:
            zu.tprint(
def trunc_normal(m, sd, zmax):
    # rejection sampling: redraw until the value is within zmax sds of the mean
    outlier = True
    while outlier:
        sim = normal(m, sd)
        if abs(m - sim) / sd < zmax:
            outlier = False
    return sim

#-------------------------------------------------------------------------------
# munge hmp data
#-------------------------------------------------------------------------------

zu.say(args.basename, "->", "parsing HMP data")
T = table(args.hmp)
T.select("STSite", args.site, transposed=True)
T.select("VISNO", "1", transposed=True)
T.head("SRS", invert=True)
T.apply_rowheads(lambda x: x.split("|")[-1])
T.grep("headers", "s__")
T.grep("headers", "_unclassified", invert=True)
T.dump("subset.tmp")
T.float()
T.unrarify(1e-20, 1)
bugs = []
for bug, row in T.iter_rows():
    stats = []
args = parser.parse_args()

#-------------------------------------------------------------------------------
# constants
#-------------------------------------------------------------------------------

cafa_codes = {"EXP", "TAS", "IC"}

#-------------------------------------------------------------------------------
# load gene set
#-------------------------------------------------------------------------------

genes = set()
for row in iter_rows(args.gene_list):
    genes.add(row[0])
say("Loaded", len(genes), "genes")

#-------------------------------------------------------------------------------
# process goa file
#-------------------------------------------------------------------------------

"""
The GOA (.gaf) file is tab-delimited. Comment lines start with "!".
Col2 is the uniprot id (a superset of uniref50).
Col5 is the Gene Ontology annotation.
Col4 is a logical modifier of the uniprot->go mapping;
must exclude the cases where this is "NOT".
Col7 is a short evidence code.
"""

# term->gene mapping
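# A hedged sketch of the mapping loop described above (column indices follow
# the 1-based docstring: row[1]=uniprot id, row[3]=qualifier, row[4]=GO term,
# row[6]=evidence code; iter_rows is assumed to yield split fields, and
# args.goa is a stand-in for the parsed GAF path):
#
#   mapping = {}
#   for row in iter_rows(args.goa):
#       if row[0].startswith("!"):
#           continue
#       uniprot, qualifier, term, evidence = row[1], row[3], row[4], row[6]
#       if qualifier == "NOT" or evidence not in cafa_codes:
#           continue
#       if uniprot in genes:
#           mapping.setdefault(term, set()).add(uniprot)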