def ali_in_tree(self, aliname='group2.stk', rank='genus', **kwargs):
    """Flag which alignment sequences and tree leaves fall under Bacteria.

    The taxon at ``rank`` is looked up for every sequence of the alignment
    ``aliname`` (``ali.get_taxon_forall``) and for every leaf of the tree
    (``self.getTaxon``).  Each distinct taxon is then resolved to its
    superkingdom with ``ncbi.get_taxon``; entries whose superkingdom is not
    the one whose scientific name is ``'Bacteria'`` are replaced by ``None``.

    Parameters:
        aliname: name of the alignment handed to the ``ali`` helpers.
        rank: taxonomic rank used for the per-sequence / per-leaf taxa.
        **kwargs: forwarded through ``mem.sr`` to the underlying lookups.

    Returns:
        ``(leaf_bacteria, ali_bacteria, leafnodes, alinodes)`` where the
        first two lists parallel the leaf / alignment taxa (``None`` where
        the taxon is not bacterial) and the last two are the results of
        ``self.leafNodes`` and ``ali.get_taxnodes``.

    Raises:
        IndexError: if no leaf taxon resolves to the Bacteria superkingdom
            (the reference superkingdom node is taken from the leaves).
    """
    alinodes = ali.get_taxnodes(aliname)
    # Forward the caller's kwargs the same way every other lookup here does.
    # The earlier ``reset=mod(reset, 2)`` read a name this method never
    # defines.
    leafnodes = self.leafNodes(**mem.sr(kwargs))

    ali_taxa = ali.get_taxon_forall(rank=rank, aliname=aliname,
                                    **mem.sr(kwargs))
    leaf_taxa = self.getTaxon(rank=rank, **mem.sr(kwargs))
    ali_unique = set(ali_taxa)
    leaf_unique = set(leaf_taxa)

    # Resolve the superkingdom once per distinct taxon; taxa present in both
    # the alignment and the tree would otherwise be looked up twice.
    domain_of = dict((taxon, ncbi.get_taxon(taxon, 'superkingdom'))
                     for taxon in ali_unique | leaf_unique)

    # The reference Bacteria node is whichever leaf superkingdom carries
    # that scientific name.
    for taxon in leaf_unique:
        bac_domain = domain_of[taxon]
        if ncbi.sciname(bac_domain) == 'Bacteria':
            break
    else:
        raise IndexError(
            "no leaf taxon resolves to the superkingdom 'Bacteria'")

    leaf_bacs = set(t for t in leaf_unique if domain_of[t] == bac_domain)
    ali_bacs = set(t for t in ali_unique if domain_of[t] == bac_domain)

    leaf_bacteria = [t if t in leaf_bacs else None for t in leaf_taxa]
    ali_bacteria = [t if t in ali_bacs else None for t in ali_taxa]
    return leaf_bacteria, ali_bacteria, leafnodes, alinodes
def investigatePhylum(self, aliname='group2.stk', p_node=None, **kwargs):
    """Collect SeqNode records for every leaf and alignment row in a phylum.

    Alignment sequences (from ``ali``) and tree leaves (from ``self``) whose
    phylum equals ``p_node`` are selected.  For each one, the 16S sequence
    rows stored for its taxon are fetched from the ``'16s'`` database; when
    a taxon has none, its ancestors are tried in turn until one has rows.

    Parameters:
        aliname: name of the alignment handed to the ``ali`` helpers.
        p_node: phylum taxon to investigate; when falsy, the phylum named
            ``'Thermotogae'`` is looked up via ``ncbi.taxon_with_name``.
        **kwargs: forwarded through ``mem.sr`` to the underlying lookups.

    Returns:
        A list of ``SeqNode`` objects, leaf-derived nodes first and then
        alignment-derived nodes.  Each is built from the genus, the species,
        the taxon node and a list of
        ``(sequence, gb_id, source_taxon, rank of source_taxon)`` tuples.
    """
    if not p_node:
        p_node = ncbi.taxon_with_name('phylum', 'Thermotogae')

    # --- alignment side -------------------------------------------------
    ali_seqs = ali.get_seqs(aliname, **mem.sr(kwargs))
    ali_nodes = array(ali.get_taxnodes(aliname, **mem.sr(kwargs)))
    ali_phyla = array(ali.get_taxon_forall(
        aliname, **mem.sr(kwargs, rank='phylum')))
    ali_inds = nonzero(equal(ali_phyla, p_node))[0]

    # --- tree side ------------------------------------------------------
    leaf_terminals = self.t.get_terminals()
    leaf_nodes = array(self.leafNodes(**mem.sr(kwargs)))
    leaf_phyla = array(self.getTaxon('phylum', **mem.sr(kwargs)))
    leaf_inds = nonzero(equal(leaf_phyla, p_node))[0]

    # NOTE(review): the literal 'thermo' is passed regardless of p_node; if
    # it is a cache key inside ali.get_taxon_forsome, a non-default phylum
    # may collide with it -- confirm against that helper.
    ag_sub = array(ali.get_taxon_forsome(
        ali_nodes[ali_inds], 'genus', 'thermo', **mem.sr(kwargs)))
    lg_sub = array(self.getTaxon('genus', **mem.sr(kwargs)))[leaf_inds]
    # The species lookup now forwards kwargs exactly like the genus lookup.
    as_sub = array(ali.get_taxon_forsome(
        ali_nodes[ali_inds], 'species', 'thermo', **mem.sr(kwargs)))
    ls_sub = array(self.getTaxon('species', **mem.sr(kwargs)))[leaf_inds]

    db16 = cbdb.getName('16s')

    def rows_for(taxon):
        """Return all 16S Sequence rows whose source taxon is *taxon*."""
        return db16.S.q(db16.Sequence).filter_by(
            source_taxon=taxon.id).all()

    def rows_for_lineage(taxon):
        """Return 16S rows for *taxon* or its nearest ancestor with any."""
        rows = rows_for(taxon)
        while not rows:
            parent = taxon.parent
            # Stop at the top of the taxonomy (no parent, or a root that is
            # its own parent) instead of failing or looping forever.
            if parent is None or parent.id == taxon.id:
                break
            taxon = parent
            rows = rows_for(taxon)
        return rows

    def describe(rows):
        """Flatten Sequence rows into the tuples stored on a SeqNode."""
        return [(row.sequence, row.gb_id, row.source_taxon,
                 ncbi.get_node(row.source_taxon).rank)
                for row in rows]

    leaf_sns = []
    for i, idx in enumerate(leaf_inds):
        taxon = leaf_nodes[idx]
        terminal = leaf_terminals[idx]
        leaf_sns.append(SeqNode(
            lg_sub[i], ls_sub[i], taxon,
            describe(rows_for_lineage(taxon)),
            src=terminal,
            node_id='btol:default:{0}'.format(terminal.m['id'])))

    ali_sns = []
    for i, idx in enumerate(ali_inds):
        taxon = ali_nodes[idx]
        seq = ali_seqs[idx]
        ali_sns.append(SeqNode(
            ag_sub[i], as_sub[i], taxon,
            describe(rows_for_lineage(taxon)),
            src=seq,
            node_id='ali:{0}:{1}'.format(aliname, seq.id)))

    return leaf_sns + ali_sns