def layout_arg_leaves(arg):
    """Compute a left-to-right ordering for the leaves of an ARG.

    A helper tree ("base tree") is grown while visiting the ARG nodes in
    order of increasing ``age``.  Each leaf gets its own base-tree node;
    each internal node joins the current base-tree roots of its children
    under a new base-tree node, unless all children already share one
    root, in which case that root is reused.

    Returns a dict mapping ``arg[name]`` to the position of ``name`` in
    ``basetree.leaf_names()``.
    """
    base = treelib.Tree()
    by_age = sorted(arg.postorder(), key=lambda n: n.age)
    base_of = {}

    for arg_node in by_age:
        if arg_node.is_leaf():
            base_of[arg_node] = base.new_node(arg_node.name)
            continue

        # climb from each child's base node up to its current root
        roots = []
        for kid in arg_node.children:
            top = base_of[kid]
            while top.parent:
                top = top.parent
            roots.append(top)
        roots = util.unique(roots)

        if len(roots) <= 1:
            # every child already hangs under the same base subtree
            base_of[arg_node] = roots[0]
            continue

        # several distinct subtrees: join them under a fresh base node
        joined = base.new_node(arg_node.name)
        base_of[arg_node] = joined
        for top in roots:
            base.add_child(joined, top)

    # the last node in age order becomes the base tree's root
    base.root = base_of[by_age[-1]]

    # leaf positions follow the base tree's leaf order
    return {arg[leaf_name]: pos
            for pos, leaf_name in enumerate(base.leaf_names())}
def guess_seq(seq):
    """Guess whether a sequence is 'dna' or 'pep'.

    The sequence is upper-cased and every character is checked against
    the alphabet ``ACTG-N``.

    Args:
        seq: the sequence as a string.

    Returns:
        "dna" if every character is in the DNA alphabet (an empty
        sequence therefore yields "dna"), otherwise "pep".
    """
    dna = frozenset("ACTG-N")
    # a builtin set test replaces the manual unique-then-loop scan
    return "dna" if dna.issuperset(seq.upper()) else "pep"
def makeFamilyGeneNames(self):
    """Tries to name and describe a family using its genes.

    Selects (famid, common_name, description) for every row of Genes and
    groups the rows by famid.  For each family, the non-empty common
    names are stripped of digits and "-" characters, de-duplicated and
    sorted; the descriptions are passed to ``self.getFamDescription``.

    Returns:
        dict mapping famid to a tuple
        (comma-joined sorted names, family description).
    """
    self.cur.execute("""SELECT g.famid, g.common_name, g.description FROM Genes g """)

    # group the result rows by their first column (famid)
    fams = util.groupby(lambda x: x[0], self.cur)

    familyGeneNames = {}
    # .items() instead of .iteritems(): the latter does not exist on
    # Python 3 dicts, while .items() works on both Python 2 and 3
    for famid, rows in fams.items():
        # presumably util.cget(rows, k) extracts column k of each row
        names = util.unique([
            "".join([i for i in x if not i.isdigit() and i != "-"])
            for x in util.cget(rows, 1)
            if x != ""
        ])
        names.sort()

        description = self.getFamDescription(util.cget(rows, 2))
        familyGeneNames[famid] = (",".join(names), description)

    return familyGeneNames
def makeFamilyGeneNames(self):
    """Tries to name and describe a family using its genes.

    Selects (famid, common_name, description) for every row of Genes and
    groups the rows by famid.  For each family, the non-empty common
    names are stripped of digits and "-" characters, de-duplicated and
    sorted; the descriptions are passed to ``self.getFamDescription``.

    Returns:
        dict mapping famid to a tuple
        (comma-joined sorted names, family description).
    """
    self.cur.execute("""SELECT g.famid, g.common_name, g.description FROM Genes g """)

    # group the result rows by their first column (famid)
    fams = util.groupby(lambda x: x[0], self.cur)

    familyGeneNames = {}
    # .items() instead of .iteritems(): the latter does not exist on
    # Python 3 dicts, while .items() works on both Python 2 and 3
    for famid, rows in fams.items():
        # presumably util.cget(rows, k) extracts column k of each row
        names = util.unique([
            "".join([i for i in x if not i.isdigit() and i != "-"])
            for x in util.cget(rows, 1)
            if x != ""
        ])
        names.sort()

        description = self.getFamDescription(util.cget(rows, 2))
        familyGeneNames[famid] = (",".join(names), description)

    return familyGeneNames
# make degenerate counts # # example: # # CGT => "R" # CGC => "R" # CGA => "R" # CGG => "R" # # CODON_DEGEN["R"] = [1, 1, 4] # CODON_DEGEN["CGT"] = [1, 1, 4] # CODON_DEGEN = {} AA_DEGEN = {} for aa, lst in REV_CODON_TABLE.items(): folds = map(lambda x: len(util.unique(x)), zip(* lst)) for codon in lst: AA_DEGEN[aa] = folds CODON_DEGEN[codon] = folds # substitution types SUB_NONE = 0 # none SUB_TSIT = 1 # tranSition SUB_TVER = 2 # transVersion SUB_INS = 3 # insert SUB_DEL = 4 # del SUBSTITUTION_TYPES = { "AA": SUB_NONE, "AC": SUB_TVER, "AG": SUB_TSIT, "AT": SUB_TVER, "CA": SUB_TVER, "CC": SUB_NONE, "CG": SUB_TVER, "CT": SUB_TSIT, "GA": SUB_TSIT, "GC": SUB_TVER, "GG": SUB_NONE, "GT": SUB_TVER,
def _buh_passing_hits(conf, genes, blastfile, order):
    """Yield (gene1, gene2, score) for each hit in blastfile passing the cutoffs.

    ``order`` selects which side of the hit is gene1: the query when
    true, the subject otherwise.
    """
    for hit in blast.BlastReader(blastfile):
        if order:
            gene1 = blast.query(hit)
            gene2 = blast.subject(hit)
            alnlen1 = blast.queryLength(hit)
            alnlen2 = blast.subjectLength(hit)
        else:
            gene2 = blast.query(hit)
            gene1 = blast.subject(hit)
            alnlen2 = blast.queryLength(hit)
            alnlen1 = blast.subjectLength(hit)
        score = blast.bitscore(hit)

        len1 = genes[gene1]["length"]
        len2 = genes[gene2]["length"]
        coverage = max(alnlen1 / float(len1),
                       alnlen2 / float(len2))

        # discard a hit that does not pass basic cutoffs
        if (score / float(blast.alignLength(hit)) < conf["bitspersite"]
                or coverage < conf["coverage"]
                or blast.evalue(hit) > conf["signif"]):
            continue

        yield gene1, gene2, score


def mergeBuh(conf, genes, parts1, parts2, blastfiles):
    """Merge by Best Unidirectional Hits

    NOTE: this function is deliberately disabled by an ``assert False``
    at the top; the code below has not been double checked.

    Args:
        conf: mapping providing the cutoffs "bitspersite", "coverage",
            "signif" and "relcutoff".
        genes: mapping gene name -> record with a "length" entry.
        parts1, parts2: the two partitions (lists of gene lists) to
            merge.  Genes seen in hits but absent from a partition are
            appended to it as new singleton parts (the lists are
            mutated).
        blastfiles: iterable of (blastfile, order) pairs; ``order``
            tells whether the query belongs to side 1.

    Returns:
        A list of merged parts, each a list of genes.
    """
    # don't use this code without double checking it
    assert False

    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # best[(side, index)] = (best score seen, partner part key)
    best = util.Dict(dim=1, default=(0, None))

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for gene1, gene2, score in _buh_passing_hits(conf, genes,
                                                     blastfile, order):
            # create a key for a partition: (side, index); unseen genes
            # become new singleton parts
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            if score > best[part1][0]:
                best[part1] = (score, part2)
            if score > best[part2][0]:
                best[part2] = (score, part1)
        util.toc()
    util.toc()

    util.tic("determine clusters")
    # NOTE: this dict must not be named `sets`; doing so would shadow
    # the `sets` module and break the sets.UnionFind lookup below
    clusters = {}
    for part in best:
        clusters[part] = sets.UnionFind([part])

    for blastfile, order in blastfiles:
        util.tic("read hits '%s'" % os.path.basename(blastfile))
        for gene1, gene2, score in _buh_passing_hits(conf, genes,
                                                     blastfile, order):
            part1 = (0, lookup1[gene1])
            part2 = (1, lookup2[gene2])

            # join parts whose hit is within relcutoff of either best hit
            if score >= best[part1][0] * conf["relcutoff"]:
                clusters[part1].union(clusters[part2])
            if score >= best[part2][0] * conf["relcutoff"]:
                clusters[part2].union(clusters[part1])
        util.toc()

    roots = util.unique([x.root() for x in clusters.values()])

    # create partition of genes
    parts = []
    joining = (parts1, parts2)
    for cluster in roots:
        merged = []
        for side, row in cluster.members():
            merged.extend(joining[side][row])
        parts.append(merged)
    util.toc()

    return parts
def mergeAvg(conf, genes, parts1, parts2, blastfiles, outblastfiles):
    """Merge two partitions by top average hit score between parts.

    For every pair of parts (one from each side) the bitscores of the
    hits passing the cutoffs are averaged.  Each part is then joined
    with the partner of highest average score, provided that average
    exceeds the average outgroup score of both parts.

    Args:
        conf: mapping providing the cutoffs "bitspersite",
            "coveragesmall", "coveragebig" and "signif"; an optional
            "accept" entry restricts hits to genes contained in it.
        genes: mapping gene name -> record with a "length" entry.
        parts1, parts2: the two partitions (lists of gene lists) to
            merge.  Genes seen in hits but absent from a partition are
            appended to it as new singleton parts (the lists are
            mutated).
        blastfiles: iterable of (blastfile, order) pairs for hits
            between the two sides; ``order`` tells whether the query
            belongs to side 1.
        outblastfiles: iterable of (blastfile, order) pairs for hits
            against the outgroup; ``order`` tells whether the query is
            the ingroup gene.

    Returns:
        A list of merged parts, each a list of genes.
    """
    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # value is [sum, total]
    hits = util.Dict(dim=2, default=[0, 0])

    # a falsy `accept` disables the gene whitelist
    accept = conf["accept"] if "accept" in conf else False

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            cov1 = alnlen1 / float(len1)
            cov2 = alnlen2 / float(len2)
            coveragesmall = min(cov1, cov2)
            coveragebig = max(cov1, cov2)

            # discard a hit that does not pass basic cutoffs
            if (score / float(blast.alignLength(hit)) < conf["bitspersite"]
                    or coveragesmall < conf["coveragesmall"]
                    or coveragebig < conf["coveragebig"]
                    or blast.evalue(hit) > conf["signif"]):
                continue

            if accept and (gene1 not in accept or gene2 not in accept):
                continue

            # create a key for a partition: (side, index); unseen genes
            # become new singleton parts
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            # the same [sum, total] list is shared by both directions
            val = hits[part1][part2]
            val[0] += score
            val[1] += 1
            hits[part2][part1] = val
        util.toc()
    util.toc()

    util.tic("read outgroup hits")
    # outbest[(side, index)] = [sum of outgroup scores, number of hits]
    outbest = util.Dict(default=[0, 0])
    for blastfile, order in outblastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            # only the ingroup gene is needed to pick the partition
            genein = blast.query(hit) if order else blast.subject(hit)
            score = blast.bitscore(hit)

            # create a key for a partition: (side, index)
            if genein in lookup1:
                partin = (0, lookup1[genein])
            elif genein in lookup2:
                # must test genein here, not gene1 left over from the
                # previous loop
                partin = (1, lookup2[genein])
            else:
                continue

            val = outbest[partin]
            val[0] += score
            val[1] += 1
        util.toc()
    util.toc()

    assert len(parts1) == len(unionPart(parts1))
    assert len(parts2) == len(unionPart(parts2))

    util.tic("determine clusters")
    # NOTE: this dict must not be named `sets`; doing so would shadow
    # the `sets` module and break the sets.UnionFind lookup below
    clusters = {}
    for i in range(len(parts1)):
        clusters[(0, i)] = sets.UnionFind([(0, i)])
    for i in range(len(parts2)):
        clusters[(1, i)] = sets.UnionFind([(1, i)])

    # merge top avg hits
    for part1 in hits:
        o1 = outbest[part1]
        outavg1 = float(o1[0]) / max(o1[1], 1)

        top = 0
        toppart = None

        for part2, (tot, num) in hits[part1].items():
            avg = float(tot) / num
            o2 = outbest[part2]
            outavg2 = float(o2[0]) / max(o2[1], 1)

            # the pair must beat both outgroup averages and the best so far
            if avg > outavg1 and avg > outavg2 and avg > top:
                top = avg
                toppart = part2

        if toppart is not None:
            clusters[part1].union(clusters[toppart])

    roots = util.unique([x.root() for x in clusters.values()])

    # create partition of genes
    parts = []
    joining = (parts1, parts2)
    for cluster in roots:
        merged = []
        for side, row in cluster:
            merged.extend(joining[side][row])
        parts.append(merged)
    util.toc()

    assert len(parts) == len(unionPart(parts))

    return parts
# make degenerate counts # # example: # # CGT => "R" # CGC => "R" # CGA => "R" # CGG => "R" # # CODON_DEGEN["R"] = [1, 1, 4] # CODON_DEGEN["CGT"] = [1, 1, 4] # CODON_DEGEN = {} AA_DEGEN = {} for aa, lst in REV_CODON_TABLE.items(): folds = map(lambda x: len(util.unique(x)), zip(*lst)) for codon in lst: AA_DEGEN[aa] = folds CODON_DEGEN[codon] = folds # substitution types SUB_NONE = 0 # none SUB_TSIT = 1 # tranSition SUB_TVER = 2 # transVersion SUB_INS = 3 # insert SUB_DEL = 4 # del SUBSTITUTION_TYPES = { "AA": SUB_NONE, "AC": SUB_TVER, "AG": SUB_TSIT, "AT": SUB_TVER,
def keys(self):
    """Return the keys of ``self.db`` after ``util.unique`` and ``util.sort``."""
    distinct = util.unique(self.db.keys())
    return util.sort(distinct)
def mergeAvg(conf, genes, parts1, parts2, blastfiles, outblastfiles):
    """Merge two partitions by top average hit score between parts.

    For every pair of parts (one from each side) the bitscores of the
    hits passing the cutoffs are averaged.  Each part is then joined
    with the partner of highest average score, provided that average
    exceeds the average outgroup score of both parts.

    Args:
        conf: mapping providing the cutoffs "bitspersite",
            "coveragesmall", "coveragebig" and "signif"; an optional
            "accept" entry restricts hits to genes contained in it.
        genes: mapping gene name -> record with a "length" entry.
        parts1, parts2: the two partitions (lists of gene lists) to
            merge.  Genes seen in hits but absent from a partition are
            appended to it as new singleton parts (the lists are
            mutated).
        blastfiles: iterable of (blastfile, order) pairs for hits
            between the two sides; ``order`` tells whether the query
            belongs to side 1.
        outblastfiles: iterable of (blastfile, order) pairs for hits
            against the outgroup; ``order`` tells whether the query is
            the ingroup gene.

    Returns:
        A list of merged parts, each a list of genes.
    """
    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # value is [sum, total]
    hits = util.Dict(dim=2, default=[0, 0])

    # a falsy `accept` disables the gene whitelist
    accept = conf["accept"] if "accept" in conf else False

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            cov1 = alnlen1 / float(len1)
            cov2 = alnlen2 / float(len2)
            coveragesmall = min(cov1, cov2)
            coveragebig = max(cov1, cov2)

            # discard a hit that does not pass basic cutoffs
            if (score / float(blast.alignLength(hit)) < conf["bitspersite"]
                    or coveragesmall < conf["coveragesmall"]
                    or coveragebig < conf["coveragebig"]
                    or blast.evalue(hit) > conf["signif"]):
                continue

            if accept and (gene1 not in accept or gene2 not in accept):
                continue

            # create a key for a partition: (side, index); unseen genes
            # become new singleton parts
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            # the same [sum, total] list is shared by both directions
            val = hits[part1][part2]
            val[0] += score
            val[1] += 1
            hits[part2][part1] = val
        util.toc()
    util.toc()

    util.tic("read outgroup hits")
    # outbest[(side, index)] = [sum of outgroup scores, number of hits]
    outbest = util.Dict(default=[0, 0])
    for blastfile, order in outblastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            # only the ingroup gene is needed to pick the partition
            genein = blast.query(hit) if order else blast.subject(hit)
            score = blast.bitscore(hit)

            # create a key for a partition: (side, index)
            if genein in lookup1:
                partin = (0, lookup1[genein])
            elif genein in lookup2:
                # must test genein here, not gene1 left over from the
                # previous loop
                partin = (1, lookup2[genein])
            else:
                continue

            val = outbest[partin]
            val[0] += score
            val[1] += 1
        util.toc()
    util.toc()

    assert len(parts1) == len(unionPart(parts1))
    assert len(parts2) == len(unionPart(parts2))

    util.tic("determine clusters")
    # NOTE: this dict must not be named `sets`; doing so would shadow
    # the `sets` module and break the sets.UnionFind lookup below
    clusters = {}
    for i in range(len(parts1)):
        clusters[(0, i)] = sets.UnionFind([(0, i)])
    for i in range(len(parts2)):
        clusters[(1, i)] = sets.UnionFind([(1, i)])

    # merge top avg hits
    for part1 in hits:
        o1 = outbest[part1]
        outavg1 = float(o1[0]) / max(o1[1], 1)

        top = 0
        toppart = None

        for part2, (tot, num) in hits[part1].items():
            avg = float(tot) / num
            o2 = outbest[part2]
            outavg2 = float(o2[0]) / max(o2[1], 1)

            # the pair must beat both outgroup averages and the best so far
            if avg > outavg1 and avg > outavg2 and avg > top:
                top = avg
                toppart = part2

        if toppart is not None:
            clusters[part1].union(clusters[toppart])

    roots = util.unique([x.root() for x in clusters.values()])

    # create partition of genes
    parts = []
    joining = (parts1, parts2)
    for cluster in roots:
        merged = []
        for side, row in cluster:
            merged.extend(joining[side][row])
        parts.append(merged)
    util.toc()

    assert len(parts) == len(unionPart(parts))

    return parts
# make degenerate counts # # example: # # CGT => "R" # CGC => "R" # CGA => "R" # CGG => "R" # # CODON_DEGEN["R"] = [1, 1, 4] # CODON_DEGEN["CGT"] = [1, 1, 4] # CODON_DEGEN = {} AA_DEGEN = {} for aa, lst in list(REV_CODON_TABLE.items()): folds = [len(util.unique(x)) for x in zip(*lst)] for codon in lst: AA_DEGEN[aa] = folds CODON_DEGEN[codon] = folds # substitution types SUB_NONE = 0 # none SUB_TSIT = 1 # tranSition SUB_TVER = 2 # transVersion SUB_INS = 3 # insert SUB_DEL = 4 # del SUBSTITUTION_TYPES = { "AA": SUB_NONE, "AC": SUB_TVER, "AG": SUB_TSIT, "AT": SUB_TVER,