def run_anc(input_dict,run_id = None):
    """Build a tree for one taxon and reconstruct its ancestral sequences.

    The taxon is looked up from input_dict['taxid'], its sequences are
    collected and aligned, a tree is built with phyml.tree, and paml is run
    on the tree/alignment pair. One SeqRecord is created per non-terminal
    node of the parsed paml tree.

    Parameters:
        input_dict: mapping that must contain the keys 'rank_name', 'taxid'
            and 'aliname'. Only 'taxid' is used by the computation.
        run_id: identifier forwarded to phyml.tree and paml.run_paml. Must
            be truthy.

    Returns:
        dict with keys 'anc_tree', 'anc_align', 'term_tree' and 'term_align'.

    Raises:
        AssertionError: if run_id is falsy.
        KeyError: if any of the three required keys is missing.
    """
    assert run_id

    # 'rank_name' and 'aliname' are not used further down; they are still
    # read so that an incomplete input_dict fails with KeyError up front.
    _rank_name = input_dict['rank_name']
    taxon_id = input_dict['taxid']
    _aliname = input_dict['aliname']

    btol = getBTOL()
    phylum_node = ncbi.get_node(taxon_id)
    seqnodes = btol.investigatePhylum(p_node = phylum_node)
    recs, _seqelts, _seqtuples = seq_recs(seqnodes)

    term_align = align_seqnodes(recs)
    term_tree = phyml.tree(term_align, run_id = run_id)
    rst_path = paml.run_paml(term_tree, term_align, run_id = run_id)
    anc_tree = paml.rst_parser(rst_path)

    # One record per internal node, carrying the node's sequence and its
    # per-position 'probs' as the 'scores' annotation.
    anc_alignment = []
    for clade in anc_tree.get_nonterminals():
        anc_alignment.append(
            SeqRecord(clade.m['seq'],
                      id = None,
                      name = clade.name,
                      annotations = {'scores': clade.m['probs']}))

    return {
        'anc_tree': anc_tree,
        'anc_align': anc_alignment,
        'term_tree': term_tree,
        'term_align': term_align,
    }
def run(**kwargs):
    """Run the tree-building and ancestral-reconstruction pipeline from kwargs.

    kwargs is passed through mem.sr to getBTOL and then, unmodified by this
    function, to investigatePhylum. The resulting sequences are aligned, a
    tree is built with phyml.tree, and paml is run on the pair. One SeqRecord
    is created per non-terminal node of the parsed paml tree.

    Returns:
        A pair of pairs: (tree, anc_tree), (align, anc_alignment).
    """
    btol = getBTOL(**mem.sr(kwargs))
    seqnodes = btol.investigatePhylum(**kwargs)
    recs, _seqelts, _seqtuples = seq_recs(seqnodes)

    term_align = align_seqnodes(recs)
    term_tree = phyml.tree(term_align)
    rst_path = paml.run_paml(term_tree, term_align)
    anc_tree = paml.rst_parser(rst_path)

    # One record per internal node, carrying the node's sequence and its
    # per-position 'probs' as the 'scores' annotation.
    anc_alignment = []
    for clade in anc_tree.get_nonterminals():
        anc_alignment.append(
            SeqRecord(clade.m['seq'],
                      id = None,
                      name = clade.name,
                      annotations = {'scores': clade.m['probs']}))

    trees = (term_tree, anc_tree)
    alignments = (term_align, anc_alignment)
    return trees, alignments
def seq_dists(ali,run_id, tree = True):
    """Return an n-by-n matrix of pairwise distances between records of ali.

    Parameters:
        ali: indexable, sized collection of sequence records (each with a
            .seq attribute; .id is overwritten in tree mode).
        run_id: identifier forwarded to phyml.tree; only used in tree mode.
        tree: if true, distances are path lengths between terminals of a
            tree built by phyml.tree(..., bionj = True). Otherwise they are
            Levenshtein edit distances between the sequence strings.

    Returns:
        A square array (from zeros) with one row and column per record.
    """
    n = len(ali)
    dists = zeros((n,n))

    if tree:
        ali_named = align.MultipleSeqAlignment(ali)
        # Give every record a fixed-width synthetic id so that tree terminal
        # names can be mapped back to row indices.
        # NOTE(review): this overwrites .id on the records held by ali_named;
        # if those are the same objects as in ali, the caller's ids change too
        # -- confirm this is intended.
        maps = {}
        for idx, rec in enumerate(ali_named):
            rec.id = 'S{0:05}'.format(idx)
            maps[rec.id] = idx

        # Use a separate name so the boolean `tree` flag is not rebound.
        dist_tree = phyml.tree(ali_named, run_id = run_id, bionj = True)
        terminals = dist_tree.get_terminals()
        for n1 in terminals:
            for n2 in terminals:
                dists[maps[n1.name], maps[n2.name]] = \
                    dist_tree.distance(n1, n2)
    else:
        # Imported only here so the tree-based mode does not require the
        # optional Levenshtein package to be installed.
        import Levenshtein
        for i in range(n):
            seq_i = str(ali[i].seq)
            for j in range(i):
                d = Levenshtein.distance(seq_i, str(ali[j].seq))
                # Fill both triangles; the diagonal stays zero.
                dists[i,j] = d
                dists[j,i] = d

    return dists
def eval_seq_group(gap_seqs, rfid, run_id, inp_run_id, reset = True,
                   draw_alis = draw_all_easy,
                   clade_alignment_method = clade_alignment_method,
                   max_structs = 5):
    """Evaluate a group of sequences against the top structures of a run.

    Loads the 'output' data of inp_run_id, keeps up to max_structs
    structures, aligns the ungapped query sequences to each of them, builds
    a tree and an ancestor tree per structure, and counts structural
    changes with tree_conservation.count_struct.

    Parameters:
        gap_seqs: gapped query sequences; stored unchanged under 'seqs'.
        rfid: identifier used for the cached profiles and alignments.
        run_id: identifier used to derive per-structure run ids.
        inp_run_id: run whose 'output' data supplies 'structs', 'energies'
            and 'seq'.
        reset: forwarded to the cached profile/alignment computations.
        draw_alis: if true, also call draw_cm_muscle_congruencies.
        clade_alignment_method: only 'cm' is implemented.
        max_structs: maximum number of structures to evaluate.

    Returns:
        dict with 'seqs' (gap_seqs) and 'structs', a list with one dict per
        structure holding 'ref', 'pairs', 'ali', 'muts', 'times', 'gaps'
        and 'irresolvables'.

    Raises:
        Exception: if clade_alignment_method is not 'cm'.
    """
    data = butils.load_data(inp_run_id, 'output')
    structs = data['structs']
    energies = data['energies']
    # Indices of the max_structs largest 'energies' entries, largest first
    # (assuming argsort sorts ascending, as numpy's does).
    s_inds = argsort(energies)[::-1][:max_structs]
    structs = [structs[i] for i in s_inds]
    refseq = data['seq']

    nq = len(gap_seqs)
    # rfid is passed to format() but the template only uses the index.
    names = ['N{1:04}'.format(rfid, idx) for idx in range(nq)]
    seqs = [utils.ungapped_seq(gap_seqs[i], names[i]) for i in range(nq)]

    profiles = mem.getOrSet(setProfiles,
                            **mem.rc({},
                                     seq = refseq,
                                     structs = structs,
                                     run_id = rfid,
                                     reset = reset,
                                     on_fail = 'compute',
                                     register = 'tuprof_{0}'.format(rfid)))

    if draw_alis:
        draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = reset)

    if clade_alignment_method == 'cm':
        alis, refs, all_pairs = \
            mem.getOrSet(setAlignments,
                         **mem.rc({},
                                  seqs = seqs,
                                  profiles = profiles,
                                  run_id = rfid,
                                  ali_type = 'struct',
                                  reset = reset,
                                  on_fail = 'compute',
                                  register = 'tuali_struct_{0}'.format(rfid)))
    else:
        raise Exception('No methods besides cm are yet implemented')

    seq_group_data = {}
    seq_group_data['seqs'] = gap_seqs
    seq_group_data['structs'] = []

    for struct_idx in range(len(structs)):
        struct_data = {}
        ali = alis[struct_idx]
        ref = refs[struct_idx]
        pairs = all_pairs[struct_idx]

        # Note: due to an awkward syntax decision, each alignment element is
        # allowed to carry its own pair set. All routines so far use a
        # single pair set, so pairs[0] is used exclusively.
        struct_data.update(ref = ref[0], pairs = pairs[0], ali = ali)

        rid = '{0}_{1}'.format(run_id, struct_idx)
        if clade_tree_method == 'bionj':
            tree = phyml.tree(ali, run_id = rid, bionj = True)
        else:
            # NOTE(review): this branch passes run_id rather than the
            # per-structure rid -- confirm whether that is intended.
            tree = get_phase_tree(ali, pairs[0], run_id)

        # Attach each terminal's own aligned record with certainty 1 at
        # every position. The loop variable must not reuse the structure
        # index: it is needed for the run ids below.
        for ct in tree.get_terminals():
            seq = [rec for rec in ali if rec.id == ct.name][0]
            ct.m = {'seq': seq, 'probs': array([1] * len(seq))}

        if clade_ancestor_method == 'independent':
            ml_tree = get_ml_ancestor_tree(
                tree, ali, '{0}_paml{1}'.format(run_id, struct_idx))
        else:
            ml_tree = get_structure_ancestor_tree(
                tree, ali, '{0}_stree{1}'.format(run_id, struct_idx))

        muts, times, gaps, irresolvables = \
            tree_conservation.count_struct(ml_tree, pairs[0])
        struct_data.update(muts = muts,
                           times = times,
                           gaps = gaps,
                           irresolvables = irresolvables)
        seq_group_data['structs'].append(struct_data)

    return seq_group_data
def _terminal_distance_matrix(tree, index_of):
    """Return a square matrix of tree distances between all terminal pairs.

    index_of maps a terminal's name to its row/column index; the matrix has
    len(index_of) rows and columns.
    """
    size = len(index_of)
    dists = zeros((size, size))
    terminals = tree.get_terminals()
    for n1 in terminals:
        for n2 in terminals:
            dists[index_of[n1.name], index_of[n2.name]] = \
                tree.distance(n1, n2)
    return dists


def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    """Compare muscle and cmalign alignments of seqs via their trees.

    For every profile, the sequences are aligned once with ali_type
    'muscle' and once with ali_type 'struct'. A tree is built from each
    alignment, the pairwise terminal distances of both trees are passed to
    tree_similarity, and both trees are drawn into one two-panel figure
    that is saved as a PostScript file.

    Parameters:
        seqs: sequences to align.
        profiles: profiles forwarded to setAlignments.
        run_id: identifier used for cache registers, tree runs and output
            file names.
        reset: forwarded to the cached alignment computations.

    Returns:
        None. Output is written to files under cfg.dataPath.
    """
    print('computing alignments...')
    print(' ...using muscle')
    malis, _mrefs, _mpairs = \
        mem.getOrSet(setAlignments,
                     **mem.rc({},
                              seqs = seqs,
                              profiles = profiles,
                              run_id = run_id,
                              ali_type = 'muscle',
                              reset = reset,
                              on_fail = 'compute',
                              register = 'tuali_musc_{0}'.format(run_id)))
    print(' ...using cmalign.')
    salis, _srefs, _spairs = \
        mem.getOrSet(setAlignments,
                     **mem.rc({},
                              seqs = seqs,
                              profiles = profiles,
                              run_id = run_id,
                              ali_type = 'struct',
                              reset = reset,
                              on_fail = 'compute',
                              register = 'tuali__struct_{0}'.format(run_id)))

    print(' ...making trees.')
    for idx, (m, s) in enumerate(zip(malis, salis)):
        mtree = phyml.tree(m, run_id, bionj = True)
        stree = phyml.tree(s, run_id, bionj = True)

        # Row/column index of each record, keyed by its id in the muscle
        # alignment; terminal names of both trees are looked up here.
        maps = dict((elt.id, i) for i, elt in enumerate(m))
        mdists = _terminal_distance_matrix(mtree, maps)
        sdists = _terminal_distance_matrix(stree, maps)

        sim_name = '{0}_struct_{1}'.format(run_id, idx)
        # len(sdists) - 1 is the number of *other* sequences. The earlier
        # form len(sdists - 1) subtracted 1 from the array instead of from
        # its length and therefore passed len(sdists).
        tree_similarity(sdists, mdists, sim_name, k = len(sdists) - 1)
        tree_similarity(sdists, mdists, sim_name, k = 6)

        f = myplots.fignum(4, (8,10))
        ct = mycolors.getct(len(mtree.get_terminals()))

        # Imported lazily, only once there is something to draw.
        import networkx
        panels = zip([mtree, stree], [211, 212], ['sequence', 'structural'])
        for t, sp, ttype in panels:
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            # Lay out an integer-labelled copy, then map the positions back
            # to the original node objects.
            Gi = networkx.convert_node_labels_to_integers(
                G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)

            nodes = G.nodes()
            # Only nodes whose name is a known record id are drawn with a
            # visible size; every other node gets size 0.
            networkx.draw(G, posn,
                          labels = dict((n, '') for n in nodes),
                          node_size = [100 if n.name in maps else 0
                                       for n in nodes],
                          width = 1,
                          edge_color = 'black',
                          ax = a,
                          node_color = [ct[maps.get(n.name, -1)]
                                        for n in nodes])

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                       [0,1],
                       xycoords = 'axes fraction',
                       va = 'top',
                       xytext = [10,0],
                       textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(
                           t.total_branch_length()),
                       [1,0],
                       xycoords = 'axes fraction',
                       ha = 'right',
                       xytext = [-10,10],
                       textcoords = 'offset pixels')

        datafile = cfg.dataPath(
            'figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(
                run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')