def ali_in_tree(self, aliname = 'group2.stk', rank = 'genus', **kwargs):
    """Mask the per-leaf and per-alignment taxa at ``rank`` down to Bacteria.

    For every tree leaf and every alignment sequence the taxon at ``rank``
    is looked up; taxa whose 'superkingdom' is not the one named
    'Bacteria' are replaced by None.

    Parameters:
      aliname -- alignment name handed to the ``ali`` helpers.
      rank    -- taxonomic rank used for the per-leaf / per-sequence taxa.
      kwargs  -- forwarded through ``mem.sr`` to the memoized helpers; an
                 optional ``reset`` entry is also read here (default 0).

    Returns:
      (leaf_bacteria, ali_bacteria, leafnodes, alinodes) where the first
      two lists parallel the leaf / alignment taxa and hold None for
      anything outside Bacteria.

    Raises:
      IndexError -- when no leaf taxon has a superkingdom whose scientific
                    name is 'Bacteria'.
    """
    # The original body read ``reset`` without it being a parameter or a
    # local, which raises NameError unless a module-level ``reset`` happens
    # to exist.  Take it from kwargs instead.
    reset = kwargs.get('reset', 0)

    alinodes = ali.get_taxnodes(aliname)
    leafnodes = self.leafNodes(reset = mod(reset, 2))

    ali_families = ali.get_taxon_forall(rank = rank, aliname = aliname,
                                        **mem.sr(kwargs))
    leaf_families = self.getTaxon(rank = rank, **mem.sr(kwargs))

    # Resolve the superkingdom once per distinct taxon rather than once
    # per leaf / sequence.
    a_domains = [(node, ncbi.get_taxon(node, 'superkingdom'))
                 for node in set(ali_families)]
    l_domains = [(node, ncbi.get_taxon(node, 'superkingdom'))
                 for node in set(leaf_families)]

    # The Bacteria superkingdom is identified from the leaf side only.
    bac_domain = [x[1] for x in l_domains
                  if ncbi.sciname(x[1]) == 'Bacteria'][0]

    l_bacs = set(l[0] for l in l_domains if l[1] == bac_domain)
    a_bacs = set(a[0] for a in a_domains if a[1] == bac_domain)

    leaf_bacteria = [leaf if leaf in l_bacs else None
                     for leaf in leaf_families]
    ali_bacteria = [a if a in a_bacs else None for a in ali_families]
    return leaf_bacteria, ali_bacteria, leafnodes, alinodes
def make_anc_batches(aliname, rank_name = 'phylum', do_bsub = False, run = False, nrun = 0, **kwargs):
    """Prepare one 'run_anc' command per taxon present in both tree and alignment.

    The taxa at ``rank_name`` are fetched for the tree (``getBTOL``) and for
    the alignment ``aliname``; for every taxon found in both, an input dict
    is saved with ``bsub.save_inp`` and a command is built with ``bsub.cmd``
    pointing back at this source file.

    Parameters:
      aliname   -- alignment name.
      rank_name -- taxonomic rank; also used as the run-id prefix.
      do_bsub   -- forwarded to ``bsub.cmd``.
      run       -- when true, each command is executed through the shell
                   and its return code printed.
      nrun      -- not used by this function.
      kwargs    -- forwarded through ``mem.sr`` to ``getBTOL``.

    Returns:
      The list of commands, in the order they were built.
    """
    btol = getBTOL(**mem.sr(kwargs))
    tree_taxa = btol.getTaxon(rank_name)
    ali_taxa = ali.get_taxon_forall(aliname, rank_name)

    # Despite the original local being called "union", this is the set of
    # taxa common to both sources; falsy entries (e.g. None) are dropped.
    shared = set(tree_taxa) & set(ali_taxa)
    shared_ids = [taxon.id for taxon in shared if taxon]

    cmds = []
    for idx, taxid in enumerate(shared_ids):
        params = {'taxid': taxid, 'rank_name': rank_name, 'aliname': aliname}
        run_id = bsub.get_run_id(idx, prefix = rank_name)
        bsub.save_inp(params, run_id)
        # Path of the file this function lives in.
        this_file = os.path.abspath(inspect.stack()[0][1])
        cmds.append(bsub.cmd(this_file, 'run_anc', run_id,
                             do_bsub = do_bsub, run_id = run_id))

    if run:
        for command in cmds:
            out = subprocess.call(command, shell = True)
            print(out)
    return cmds
def set_taxnodes(**kwargs):
    """Return one ncbi node (or None) per sequence returned by ``get_seqs``.

    A sequence whose ``source_taxon`` is falsy yields None; otherwise the
    node is looked up with ``ncbi.get_node``.

    NOTE(review): ``dbname`` is neither a parameter nor assigned in this
    function -- it has to come from module scope.  Confirm it exists.
    """
    sequences = get_seqs(dbname, **mem.sr(kwargs))

    # First pass: normalise every falsy taxon id to None.
    taxa = []
    for seq in sequences:
        taxa.append(seq.source_taxon if seq.source_taxon else None)

    # Second pass: resolve the remaining ids to nodes.
    nodes = []
    for taxon in taxa:
        nodes.append(ncbi.get_node(taxon) if taxon != None else None)
    return nodes
def setBTOL(**kwargs):
    """Build a ``BTOL`` instance, initialising and saving its tree if needed.

    Keyword arguments are passed through ``mem.sr`` to the ``BTOL``
    constructor.  When ``treeInitialized()`` reports false, the tree is
    initialised with ``initTree()`` and stored with ``saveTree()``, with
    progress printed along the way.  The instance is returned either way.
    """
    btol = BTOL(**mem.sr(kwargs))
    if btol.treeInitialized():
        return btol

    print('Underlying tree structure apparently uninitialized: initializing\n...')
    btol.initTree()
    print('...\nDone\nSaving\n...')
    btol.saveTree()
    print('...\nDone')
    return btol
def check_bdtnp(**kwargs):
    """Report, for every id in ``id_map``, which data sources contain it.

    Returns a dict mapping each id to a dict that holds ``True`` under
      'stf' -- when the id is a key of the first table from ``nio.getNet()``
      'btg' -- when the id is a key of ``nio.getBDTNP()``
      'stg' -- when the id is a key of the second table from ``nio.getNet()``
    Ids found in none of them map to an empty dict.
    """
    ids = id_map(**mem.sr(kwargs))
    btgs = nio.getBDTNP()
    stfs, stgs = nio.getNet()

    # ``key in table`` replaces the deprecated ``table.has_key(key)``;
    # this assumes the three tables are dict-like, as has_key implied.
    sources = (('stf', stfs), ('btg', btgs), ('stg', stgs))

    sxs = {}
    for gid in ids:
        sxs[gid] = dict((tag, True) for tag, table in sources if gid in table)
    return sxs
def getTaxon(self, rank = rank, **kwargs):
    """Return the taxon at ``rank`` for every leaf, via ``mem.getOrSet``.

    The computation itself lives in the nested ``setTaxon``; it is handed
    to ``mem.getOrSet`` together with ``on_fail = 'compute'``,
    ``hardcopy = False`` and ``register = rank``.

    NOTE(review): the default ``rank = rank`` is evaluated when the ``def``
    statement runs, so a name ``rank`` must already exist in the enclosing
    scope at that point -- confirm this is intended.
    """
    def setTaxon(BTInstance = None, rank = None, **kwargs):
        # Both arguments are required despite their None defaults.
        assert rank
        assert BTInstance
        taxa = []
        for node in BTInstance.leafNodes(**mem.sr(kwargs)):
            # Leaves without a node keep a None placeholder.
            taxa.append(ncbi.get_taxon(node, rank=rank) if node else None)
        return taxa

    memo_args = mem.sr(kwargs,
                       rank = rank,
                       BTInstance = self,
                       on_fail = 'compute',
                       hardcopy = False,
                       register = rank)
    return mem.getOrSet(setTaxon, **memo_args)
def run(rfid, run_id, inp_run_id, reset = True, draw_alis = draw_all_easy):
    """Evaluate every suitably sized sequence group of ``rfid``.

    Groups are fetched with ``get_seq_groups``.  Per group:
      * fewer than 4 members  -> ``None`` is appended to the result;
      * more than 40 members  -> a message is printed, nothing is appended;
      * otherwise             -> the result of ``eval_seq_group`` is
                                 appended, using ``'<run_id>_<size>'`` as
                                 its run id.

    NOTE(review): because oversized groups add nothing while undersized
    ones add None, the returned list is not index-aligned with the groups.
    Confirm callers do not rely on alignment.
    """
    groups = get_seq_groups(rfid = rfid, **mem.sr({}, reset = False))

    results = []
    for group in groups:
        size = len(group)
        if size < 4:
            results.append(None)
        elif size > 40:
            print('skipping cuz it takes too long!')
        else:
            print('Not Skipping')
            group_run_id = '{0}_{1}'.format(run_id, size)
            results.append(eval_seq_group(group, rfid, group_run_id,
                                          inp_run_id,
                                          reset = reset,
                                          draw_alis = draw_alis))
    return results
def run(**kwargs):
    """Run the phylum -> alignment -> tree -> ancestral reconstruction chain.

    Steps, in order: ``getBTOL``, ``investigatePhylum``, ``seq_recs``,
    ``align_seqnodes``, ``phyml.tree``, ``paml.run_paml`` and
    ``paml.rst_parser``.  One ``SeqRecord`` is then built per non-terminal
    of the parsed tree, taking its sequence from ``m['seq']`` and storing
    ``m['probs']`` under the 'scores' annotation.

    Returns:
      ((tree, anc_tree), (align, anc_alignment))
    """
    btol = getBTOL(**mem.sr(kwargs))
    # Note: investigatePhylum receives the raw kwargs, not mem.sr(kwargs).
    seqnodes = btol.investigatePhylum(**kwargs)

    # Only the records are used; the other two outputs are ignored.
    recs, _seqelts, _seqtuples = seq_recs(seqnodes)
    align = align_seqnodes(recs)
    tree = phyml.tree(align)

    rstfile = paml.run_paml(tree, align)
    anc_tree = paml.rst_parser(rstfile)

    anc_alignment = []
    for inner in anc_tree.get_nonterminals():
        anc_alignment.append(SeqRecord(inner.m['seq'],
                                       id = None,
                                       name = inner.name,
                                       annotations = {'scores': inner.m['probs']}))
    return (tree, anc_tree), (align, anc_alignment)
def set_datafiles(**kwargs):
    """Load the numeric rows of every data file listed in ``id_map``.

    For each ``key -> entry`` of ``id_map``, the file at ``entry['file']``
    is read; every line whose first character is a digit is stripped, split
    on whitespace and converted to floats.  The rows are wrapped in
    ``array``.

    Returns:
      dict mapping each id_map key to the array built from its file.

    Raises:
      IOError/OSError -- when a file cannot be opened.
      ValueError      -- when a field on a digit-led line is not a float.
    """
    # Compiled once instead of once per line; raw string so the backslash
    # is not subject to string-escape processing.
    whitespace = re.compile(r'\s+')

    out = {}
    idmap = id_map(**mem.sr(kwargs))
    for key, entry in idmap.items():
        # The original never closed the handle; the context manager does.
        with open(entry['file']) as handle:
            rows = [[float(field) for field in whitespace.split(line.strip())]
                    for line in handle
                    if line[0] in '0123456789']
        out[key] = array(rows)
    return out
def __init__(self, **kwargs):
    """Store the object returned by ``sqt.init`` on ``self.t``.

    Keyword arguments are passed through ``mem.sr`` before being handed
    to ``sqt.init``.
    """
    init_args = mem.sr(kwargs)
    self.t = sqt.init(**init_args)
def investigatePhylum(self, aliname = 'group2.stk', p_node = None, **kwargs):
    """Build ``SeqNode`` records for one phylum from the tree and an alignment.

    Tree leaves and alignment sequences whose phylum equals ``p_node`` are
    selected.  For each one, the 16s sequences registered for its taxon are
    fetched; when there are none, the taxonomy is climbed parent by parent
    until some are found.

    Parameters:
      aliname -- alignment name handed to the ``ali`` helpers.
      p_node  -- phylum to select; when falsy, the phylum named
                 'Thermotogae' is looked up and used.
      kwargs  -- forwarded through ``mem.sr`` to the memoized helpers.

    Returns:
      A list with the leaf SeqNodes first, followed by the alignment ones.
    """
    if not p_node:
        p_node = ncbi.taxon_with_name('phylum', 'Thermotogae')

    # --- alignment side: sequences, their nodes, and the matching indices
    ali_seqs = ali.get_seqs(aliname, **mem.sr(kwargs))
    ali_nodes = array(ali.get_taxnodes(aliname, **mem.sr(kwargs)))
    ali_phyla = array(ali.get_taxon_forall(aliname,
                                           **mem.sr(kwargs, rank = 'phylum')))
    ali_inds = nonzero(equal(ali_phyla, p_node))[0]

    # --- tree side: terminals, their nodes, and the matching indices
    leaf_terminals = self.t.get_terminals()
    leaf_nodes = array(self.leafNodes(**mem.sr(kwargs)))
    leaf_phyla = array(self.getTaxon('phylum', **mem.sr(kwargs)))
    leaf_inds = nonzero(equal(leaf_phyla, p_node))[0]

    # Phylum sub-selections; computed but not used further below.
    ap_sub = ali_phyla[ali_inds]
    lp_sub = leaf_phyla[leaf_inds]

    # Genus and species for the selected entries on both sides.
    ag_sub = array(ali.get_taxon_forsome(ali_nodes[ali_inds], 'genus', 'thermo',
                                         **mem.sr(kwargs)))
    lg_sub = array(self.getTaxon('genus', **mem.sr(kwargs)))[leaf_inds]
    # NOTE(review): unlike the genus lookup, this call does not forward
    # mem.sr(kwargs) -- confirm whether that is deliberate.
    as_sub = array(ali.get_taxon_forsome(ali_nodes[ali_inds], 'species', 'thermo'))
    ls_sub = array(self.getTaxon('species', **mem.sr(kwargs)))[leaf_inds]

    db16 = cbdb.getName('16s')

    def sequences_for(taxid):
        # All 16s Sequence rows whose source taxon is ``taxid``.
        return db16.S.q(db16.Sequence).filter_by(source_taxon = taxid).all()

    def backfill(hits, nodes, inds):
        # Fill any empty entries (nodes lacking 16s rRNA) by walking up the
        # taxonomy until an ancestor with sequences is found.
        # NOTE(review): there is no stop condition at the top of the
        # taxonomy -- confirm every lineage reaches a taxon with sequences.
        for pos, found in enumerate(hits):
            node = nodes[inds[pos]]
            while not found:
                node = node.parent
                found.extend(sequences_for(node.id))

    def describe(found):
        # (sequence, gb_id, source_taxon, rank of source taxon) per row.
        return [(row.sequence, row.gb_id, row.source_taxon,
                 ncbi.get_node(row.source_taxon).rank)
                for row in found]

    a_16s = [sequences_for(node.id) for node in ali_nodes[ali_inds]]
    l_16s = [sequences_for(node.id) for node in leaf_nodes[leaf_inds]]

    backfill(a_16s, ali_nodes, ali_inds)
    backfill(l_16s, leaf_nodes, leaf_inds)

    # Hit counts per entry; computed but not used further below.
    all_lens = {'a_16s': [len(list(found)) for found in a_16s],
                'l_16s': [len(list(found)) for found in l_16s]}

    leaf_sns = []
    for i, idx in enumerate(leaf_inds):
        terminal = leaf_terminals[idx]
        leaf_sns.append(SeqNode(lg_sub[i], ls_sub[i], leaf_nodes[idx],
                                describe(l_16s[i]),
                                src = terminal,
                                node_id = 'btol:default:{0}'.format(terminal.m['id'])))

    ali_sns = []
    for i, idx in enumerate(ali_inds):
        seq = ali_seqs[idx]
        ali_sns.append(SeqNode(ag_sub[i], as_sub[i], ali_nodes[idx],
                               describe(a_16s[i]),
                               src = seq,
                               node_id = 'ali:{0}:{1}'.format(aliname, seq.id)))

    return leaf_sns + ali_sns
def setTaxon(BTInstance = None, rank = None, **kwargs):
    """Return the taxon at ``rank`` for every leaf node of ``BTInstance``.

    Both ``rank`` and ``BTInstance`` must be truthy (checked with assert).
    Leaves whose node is falsy yield None instead of a taxon.
    """
    assert rank
    assert BTInstance

    taxa = []
    for node in BTInstance.leafNodes(**mem.sr(kwargs)):
        if node:
            taxa.append(ncbi.get_taxon(node, rank=rank))
        else:
            taxa.append(None)
    return taxa
def setTaxon(aliname = None, rank = None, **kwargs):
    """Return the taxon at ``rank`` for every taxonomy node of ``aliname``.

    Both ``aliname`` and ``rank`` are required (checked with assert).
    Entries whose node is falsy yield None instead of a taxon.
    """
    assert aliname != None
    assert rank != None

    taxa = []
    for node in get_taxnodes(aliname, **mem.sr(kwargs)):
        if node:
            taxa.append(ncbi.get_taxon(node, rank=rank))
        else:
            taxa.append(None)
    return taxa
def setC2(**kwargs):
    """Apply ``c2`` twice: once on its own, then again on that first result.

    Both calls receive the keyword arguments passed through ``mem.sr``.
    """
    first_pass = c2(**mem.sr(kwargs))
    return c2(first_pass, **mem.sr(kwargs))