def addFamilies(self, eventsfile, discard=None):
    """Fill the Families table from a per-family events table.

    The Families table is created first when it does not exist yet.  One row
    is then inserted for every row of the events table whose "partid" is not
    listed in `discard`.

    eventsfile -- passed to tablelib.read_table(); the rows read from it must
                  provide "partid", "famrate", "dup", "loss" and "genes"
    discard    -- optional iterable of family ids to skip (default: skip none)
    """
    if not tableExists(self.cur, "Families"):
        self.makeFamiliesTable()

    util.tic("add families")

    events_tab = tablelib.read_table(eventsfile)
    # NOTE(review): this lookup is never read below.  The call is kept in case
    # lookup() also validates the table (e.g. rejects duplicate ids) -- confirm
    # before removing it.
    events_lookup = events_tab.lookup("partid")
    familyGeneNames = self.makeFamilyGeneNames()

    # A None default avoids sharing one mutable list between calls.
    discard = set(discard) if discard is not None else set()

    for row in events_tab:
        famid = row["partid"]
        if famid in discard:
            util.logger("discarding '%s'" % famid)
            continue

        # total branch length of the family tree
        tree = treelib.read_tree(self.getTreeFile(famid))
        treelen = sum(x.dist for x in tree)

        # median sequence length; a real list is built so that the median
        # also works where map() returns a one-shot iterator
        seqs = fasta.read_fasta(self.getFastaFile(famid))
        seqlen = stats.median([len(seq) for seq in seqs.values()])

        # look the gene names up once instead of once per column
        names = familyGeneNames.get(famid, ("", ""))

        # NOTE(review): values are interpolated straight into the SQL text, so
        # a quote character in an id or gene name breaks the statement.  Switch
        # to placeholders once the cursor's DB-API paramstyle is confirmed.
        self.cur.execute(
            """INSERT INTO Families VALUES ("%s", "%s", %f, %f, %f, %d, %d, %d, "%s");"""
            % (famid,
               names[0],
               row["famrate"],
               treelen,
               seqlen * 3,
               row["dup"],
               row["loss"],
               row["genes"],
               names[1]))

    util.toc()
def revtranslate_align(aaseqs, dnaseqs, check=False, trim=False):
    """Reverse translates aminoacid alignment into DNA alignment

    Must supply original ungapped DNA.

    aaseqs  -- aligned peptide sequences keyed by name ("-" marks a gap)
    dnaseqs -- ungapped DNA for every name in aaseqs
    check   -- forwarded to seqlib.revtranslate()
    trim    -- when the DNA length is not three times the residue count,
               trim DNA and/or peptide until they agree; when false only a
               trailing "X" residue is repaired

    Returns the alignment created by new_align(aaseqs), filled with the
    reverse-translated sequences.
    """
    align = new_align(aaseqs)

    for name, pep in aaseqs.iteritems():
        dna = dnaseqs[name].upper()
        orig_dnalen = len(dna)
        nres = sum(1 for res in pep if res != "-")

        if len(dna) != nres * 3:
            if trim:
                # drop a partial trailing codon, if any
                ncodons = len(dna) // 3
                dna = dna[:ncodons * 3]

                if ncodons > nres:
                    # more codons than residues: cut the DNA
                    dna = dna[:nres * 3]
                else:
                    # more residues than codons: gap out the overflow
                    seen = 0
                    for pos, res in enumerate(pep):
                        if res == "-":
                            continue
                        seen += 1
                        if seen > ncodons:
                            pep = pep[:pos] + "-" * (len(pep) - pos)
                            break

                nres_trimmed = sum(1 for res in pep if res != "-")
                assert len(dna) == nres_trimmed * 3, (
                    len(dna), nres_trimmed * 3)

                util.logger("trim dna (%d) and pep (%d)" %
                            (orig_dnalen - len(dna), nres - nres_trimmed))
            else:
                # inspect only the last non-gap residue
                last = len(pep) - 1
                while last >= 0 and pep[last] == "-":
                    last -= 1
                if last >= 0 and pep[last] == "X":
                    # repair: gap out the X and drop the final codon
                    pep = pep[:last] + "-" * (len(pep) - last)
                    dna = dna[:-3]

        # a seqlib.TranslateError raised here propagates to the caller
        align[name] = seqlib.revtranslate(pep, dna, check=check)

    return align
def add(self, key, value, errors=False):
    """Store `value` under `key`, keeping the longest value on duplicates.

    A new key is appended to self.names and stored.  For a key that is
    already present the stored value is replaced only when the new value is
    at least as long; with errors=True the duplicate is also logged.
    """
    if key not in self:
        self.names.append(key)
        dict.__setitem__(self, key, value)
        return

    if errors:
        util.logger("duplicate key", key)

    # keep the longest value, by default (ties go to the new value)
    if len(self[key]) <= len(value):
        dict.__setitem__(self, key, value)
def revtranslate_align(aaseqs, dnaseqs, check=False, trim=False):
    """Reverse translates aminoacid alignment into DNA alignment

    Must supply original ungapped DNA.

    aaseqs  -- aligned peptide sequences keyed by name ("-" marks a gap)
    dnaseqs -- ungapped DNA for every name in aaseqs
    check   -- forwarded to revtranslate()
    trim    -- when the DNA length is not three times the residue count,
               trim DNA and/or peptide until they agree; when false only a
               trailing "X" residue is repaired

    Returns the alignment created by new_align(aaseqs), filled with the
    reverse-translated sequences.
    """
    def count_residues(pep):
        # number of non-gap columns
        return sum(1 for res in pep if res != "-")

    align = new_align(aaseqs)

    for name, pep in aaseqs.items():
        dna = dnaseqs[name].upper()
        full_dnalen = len(dna)
        nres = count_residues(pep)
        lengths_agree = len(dna) == nres * 3

        if not lengths_agree and trim:
            # make dna a multiple of three
            ncodons = len(dna) // 3
            dna = dna[:ncodons * 3]

            if ncodons > nres:
                # trim dna
                dna = dna[:nres * 3]
            else:
                # trim peptide to match nucleotide
                kept = 0
                for pos, res in enumerate(pep):
                    if res == "-":
                        continue
                    kept += 1
                    if kept > ncodons:
                        pep = pep[:pos] + "-" * (len(pep) - pos)
                        break

            nres_after = count_residues(pep)
            assert len(dna) == nres_after * 3, (len(dna), nres_after * 3)

            util.logger("trim dna (%d) and pep (%d)" %
                        (full_dnalen - len(dna), nres - nres_after))

        elif not lengths_agree:
            # is last residue X?  Only the last non-gap column is inspected.
            for pos in reversed(range(len(pep))):
                if pep[pos] == "-":
                    continue
                if pep[pos] == "X":
                    # repair
                    pep = pep[:pos] + "-" * (len(pep) - pos)
                    dna = dna[:-3]
                break

        # a TranslateError raised here propagates to the caller
        align[name] = revtranslate(pep, dna, check=check)

    return align
def exec_phylip(cmd, args, verbose=False):
    """Execute a phylip-like program that expects arguments from stdin

    cmd     -- shell command line of the program to run
    args    -- text fed to the program on standard input (via a here-document)
    verbose -- when false, the program's stdout and stderr go to /dev/null;
               when true, the command and arguments are logged a second time

    Raises AssertionError when the command exits with a non-zero status.
    """
    util.logger("exec: %s" % cmd)
    util.logger("args: %s" % args)

    # The here-document body (args) must start on the line after the command.
    if verbose:
        util.logger("exec: %s" % cmd)
        util.logger("args: %s" % args)
        shell_cmd = "cat <<EOF | %s\n%s" % (cmd, args)
    else:
        shell_cmd = "cat <<EOF | %s >/dev/null 2>&1\n%s" % (cmd, args)

    # Run the command outside of `assert`: assert statements are removed under
    # `python -O`, which would skip running the program altogether.
    status = os.system(shell_cmd)
    if status != 0:
        raise AssertionError(
            "command failed with status %d: %s" % (status, cmd))
def walk(node):
    """Merge partitions bottom-up over the tree rooted at `node`.

    Children are processed first.  For every internal node the parts of its
    first two children are merged (mergeAvg when conf["merge"] == "avg",
    mergeBuh otherwise) and stored in node.parts; the result is written to
    conf["output"] + name + ".part" when an output prefix is configured and
    the result is non-empty.  Leaves are left untouched.
    """
    def lookup_pairs(group_a, group_b):
        # blast files registered for any (a, b) pair across the two groups
        return [blastFileLookup[a][b]
                for a in group_a
                for b in group_b
                if a in blastFileLookup and b in blastFileLookup[a]]

    for child in node.children:
        walk(child)

    if node.is_leaf():
        return

    first, second = node.children[0], node.children[1]
    leaves1 = first.leaf_names()
    leaves2 = second.leaf_names()

    # determine sibling blast files
    blastfiles = lookup_pairs(leaves1, leaves2)

    # determine outgroup blast files (all other files, potentially):
    # go up one level, take the parent's leaves and subtract the leaves
    # of this subtree
    outblastfiles = []
    if node.parent:
        inleaves = leaves1 + leaves2
        outleaves = set(node.parent.leaf_names()) - set(inleaves)
        outblastfiles = lookup_pairs(inleaves, outleaves)

    util.tic("merging")
    util.logger("leaves1: ", leaves1)
    util.logger("leaves2: ", leaves2)

    use_avg = "merge" in conf and conf["merge"] == "avg"
    if use_avg:
        node.parts = mergeAvg(conf, genes, first.parts, second.parts,
                              blastfiles, outblastfiles)
    else:
        node.parts = mergeBuh(conf, genes, first.parts, second.parts,
                              blastfiles)

    nparts = len(node.parts)

    if "output" in conf and nparts > 0:
        util.write_delim(conf["output"] + str(node.name) + ".part",
                         node.parts)

    util.logger("number of parts: ", nparts)
    if nparts > 0:
        util.logger("largest part:", max(len(part) for part in node.parts))

    util.toc()
def removeFamily(self, *famids):
    """Remove families from the table and archive their directories.

    Rows whose "famid" is one of `famids` are dropped from self.famtab, the
    table file is archived and rewritten, and the directory of every removed
    family is moved into self.olddatadir.  Ids that were not in the table are
    only logged.
    """
    famids = set(famids)

    # Record which requested ids exist BEFORE filtering the table.  Checked
    # afterwards, the removed ids can never be found, so no directory would
    # ever be archived and every id would be reported as missing.
    existing_famids = set(self.famtab.cget("famid"))

    self.famtab[:] = [fam for fam in self.famtab
                      if fam["famid"] not in famids]

    # update disk
    self.archive()
    self.saveTable()

    for famid in famids:
        if famid in existing_famids:
            famdir = self.familyDir(famid)
            shutil.move(famdir, os.path.join(self.olddatadir, famid))
            util.logger("famdb: archived '%s'" % famdir)
        else:
            util.logger("famdb: family '%s' does not exist" % famid)
def addFamily(self, parts, famids=None):
    """Add one family per entry of `parts` and create its directory.

    parts  -- sequence of gene-name collections, one per new family
    famids -- optional ids for the new families; when omitted, consecutive
              integer ids following the current maximum are generated

    The table file is archived and rewritten before the family directories
    are created.
    """
    if famids is None:
        # determine highest current famid (assume famids are ints)
        # NOTE(review): max() raises ValueError when the table has no rows.
        maxid = max(map(int, self.famtab.cget("famid")))
        famids = [str(i) for i in range(maxid + 1, maxid + 1 + len(parts))]
    else:
        # famids is walked twice below; a one-shot iterator would be used up
        # by the first loop and no directory would be created.
        famids = list(famids)

    for famid, part in zip(famids, parts):
        self.famtab.add(famid=famid, genes=",".join(part))
        util.logger("famdb: added family '%s'" % famid)

    # update disk
    self.archive()
    self.saveTable()

    for famid in famids:
        os.mkdir(self.familyDir(famid))
def exec_phylip(cmd, args, verbose=False):
    """Execute a phylip-like program that expects arguments from stdin

    cmd     -- shell command line of the program to run
    args    -- text piped to the program's standard input
    verbose -- when true the program's output is left visible and the
               command is logged a second time; otherwise stdout and stderr
               are redirected to /dev/null

    Raises AssertionError when the shell reports a non-zero exit status.
    """
    util.logger("exec: %s" % cmd)
    util.logger("args: %s" % args)

    if verbose:
        util.logger("exec: %s" % cmd)
        util.logger("args: %s" % args)
        redirect = ""
    else:
        redirect = " >/dev/null 2>&1"

    # args forms the body of the here-document, so it starts on its own line
    status = os.system("cat <<EOF | %s%s\n%s" % (cmd, redirect, args))

    # The exit status is checked explicitly instead of wrapping os.system()
    # in `assert`: with `python -O` asserts are stripped and the program
    # would never be run.
    if status != 0:
        raise AssertionError(
            "'%s' exited with status %d" % (cmd, status))
def resample_arg_regions(arg, seqs, niters, width=1000, ntimes=20, rho=1.5e-8, mu=2.5e-8, popsize=1e4, times=None, carg=False, verbose=False):
    """Resample, `niters` times, the region selected from the recombinations.

    Each iteration scans the windows produced by
    stats.iter_window_index(recomb_pos, width), keeps the one with the
    largest j - i + 1, pads it by 10 positions on each side (clamped to
    [10, seqlen - 10]) and hands that region to
    argweaver.resample_arg_region().  Returns the final ARG.

    `ntimes` and `popsize` are accepted but not used in this function.
    """
    # NOTE(review): indexing values() directly assumes it returns a list
    # (Python 2 dict semantics).
    seqlen = len(seqs.values()[0])

    # collect the positions of all recombination events
    if is_carg(arg):
        trees, names = arg
        arg2 = ctrees2arg(trees, names, times, verbose=verbose, delete_arg=False)
        recomb_pos = list(x.pos for x in arg2 if x.event == "recomb")
    else:
        recomb_pos = list(x.pos for x in arg if x.event == "recomb")

    for it in range(niters):
        # pick the window with the largest index span j - i + 1
        maxr = 0
        for i, j, a, b in stats.iter_window_index(recomb_pos, width):
            r = j - i + 1
            if r > maxr:
                maxr = r
                region = [max(recomb_pos[i]-10, 10), min(recomb_pos[j]+10, seqlen - 10)]

        # NOTE(review): `region` is only assigned inside the loop above.  If
        # no window is yielded it is unbound on the first iteration
        # (NameError) and stale on later ones -- confirm intended handling.
        if verbose:
            util.tic("sample ARG region %s" % region)
        # NOTE(review): unconditional debug print (Python 2 print statement).
        print arg
        # NOTE(review): verbose=True is hard-coded here; the `verbose`
        # argument is not forwarded.
        arg = argweaver.resample_arg_region(arg, seqs, region[0], region[1], rho=rho, mu=mu, times=times, carg=carg, verbose=True)
        # recombination positions are refreshed only when carg is false, so
        # with carg=True every iteration sees the same positions
        if not carg:
            recomb_pos = list(x.pos for x in arg if x.event == "recomb")
            if verbose:
                util.logger("%d: # recombs %d" % (it, len(recomb_pos)))
        if verbose:
            util.toc()

    return arg
def archive(self):
    """Copy the table file into the old-data directory.

    The copy is stored in self.olddatadir under the name
    "<filename>-<timestamp>", and the destination path is logged.
    """
    backup_dir = self.olddatadir
    backup_name = self.filename + "-" + self.timestamp()
    destination = os.path.join(backup_dir, backup_name)

    shutil.copy(self.filename, destination)
    util.logger("famdb: archived '%s'" % destination)
def saveTable(self):
    """Write the family table to self.filename and log the update."""
    path = self.filename
    self.famtab.write(path)
    util.logger("famdb: updated '%s'" % path)