Exemple #1
0
def run_anc(input_dict, run_id = None):
    """Run the ancestral-reconstruction pipeline for one taxon.

    Steps, in order: fetch the node for ``input_dict['taxid']`` with
    ``ncbi.get_node``, collect sequence nodes below it with
    ``BT.investigatePhylum``, align their records, build a tree with
    ``phyml.tree``, run ``paml.run_paml`` on tree + alignment and parse the
    resulting file with ``paml.rst_parser``.

    Args:
        input_dict: mapping; only the key ``'taxid'`` is read.
        run_id: required, truthy identifier forwarded to ``phyml.tree`` and
            ``paml.run_paml``.

    Returns:
        dict with keys ``'anc_tree'`` (parsed tree), ``'anc_align'`` (one
        ``SeqRecord`` per nonterminal of that tree, with the per-position
        ``probs`` stored under ``annotations['scores']``), ``'term_tree'``
        (the phyml tree) and ``'term_align'`` (the input alignment).

    Raises:
        AssertionError: if ``run_id`` is falsy.
        KeyError: if ``input_dict`` has no ``'taxid'`` entry.
    """
    if not run_id:
        # Raised explicitly instead of via `assert` so the check is not
        # stripped under `python -O`; the exception type is kept so callers
        # that already catch AssertionError keep working.
        raise AssertionError('run_anc requires a truthy run_id')

    taxon_id = input_dict['taxid']

    BT = getBTOL()
    p_node = ncbi.get_node(taxon_id)
    seqnodes = BT.investigatePhylum(p_node = p_node)
    # seq_recs returns a 3-tuple; only the records are needed here.
    recs, _seqelts, _seqtuples = seq_recs(seqnodes)

    align = align_seqnodes(recs)
    tree = phyml.tree(align, run_id = run_id)
    rstfile = paml.run_paml(tree, align, run_id = run_id)
    anc_tree = paml.rst_parser(rstfile)

    # One record per internal node of the parsed tree.
    anc_alignment = [SeqRecord(elt.m['seq'],
                               id = None,
                               name = elt.name,
                               annotations = {'scores': elt.m['probs']})
                     for elt in anc_tree.get_nonterminals()]
    return dict(anc_tree = anc_tree,
                anc_align = anc_alignment,
                term_tree = tree,
                term_align = align)
Exemple #2
0
def run(**kwargs):
    """Build terminal and ancestral trees and alignments from keyword options.

    ``kwargs`` is passed through ``mem.sr`` to construct the BTOL object and
    is also forwarded unchanged to its ``investigatePhylum`` method.
    NOTE(review): what ``mem.sr`` does to the keywords is not visible here.

    Returns:
        A pair of pairs: ``(terminal_tree, ancestral_tree)`` followed by
        ``(terminal_alignment, ancestral_alignment)``, where the ancestral
        alignment is a list with one ``SeqRecord`` per nonterminal of the
        parsed ancestral tree.
    """
    btol = getBTOL(**mem.sr(kwargs))
    nodes = btol.investigatePhylum(**kwargs)
    records, _elts, _tuples = seq_recs(nodes)

    terminal_alignment = align_seqnodes(records)
    terminal_tree = phyml.tree(terminal_alignment)
    rst_path = paml.run_paml(terminal_tree, terminal_alignment)
    ancestral_tree = paml.rst_parser(rst_path)

    # Wrap every internal node's reconstructed sequence in a SeqRecord,
    # carrying the per-position probabilities along as the 'scores' annotation.
    ancestral_alignment = []
    for clade in ancestral_tree.get_nonterminals():
        record = SeqRecord(clade.m['seq'],
                           id = None,
                           name = clade.name,
                           annotations = {'scores': clade.m['probs']})
        ancestral_alignment.append(record)

    trees = (terminal_tree, ancestral_tree)
    alignments = (terminal_alignment, ancestral_alignment)
    return trees, alignments
Exemple #3
0
def seq_dists(ali, run_id, tree = True):
    """Return an n x n matrix of pairwise distances between the records of ``ali``.

    Args:
        ali: sized collection of sequence records. Tree mode assigns to each
            record's ``id``; edit-distance mode reads each record's ``seq``.
        run_id: forwarded to ``phyml.tree``; not used in edit-distance mode.
        tree: when truthy, distances are ``distance(n1, n2)`` values between
            terminals of the tree returned by ``phyml.tree(..., bionj=True)``.
            Otherwise they are Levenshtein edit distances between the
            stringified sequences.

    Returns:
        The ``zeros((n, n))`` array filled with the distances. In
        edit-distance mode the diagonal stays 0 and the matrix is symmetric.
    """
    n = len(ali)
    dists = zeros((n,n))

    if tree:
        ali_named = align.MultipleSeqAlignment(ali)
        maps = {}
        for idx, a in enumerate(ali_named):
            # Give every record a fixed-width synthetic id and remember which
            # row/column it maps to.
            # NOTE(review): this overwrites the records' ids; presumably the
            # records are shared with the caller's `ali` - confirm.
            a.id = 'S{0:05}'.format(idx)
            maps[a.id] = idx
        # Separate name so the boolean `tree` flag is not rebound.
        nj_tree = phyml.tree(ali_named, run_id = run_id, bionj = True)
        terminals = nj_tree.get_terminals()
        for n1 in terminals:
            row = maps[n1.name]
            for n2 in terminals:
                dists[row, maps[n2.name]] = nj_tree.distance(n1, n2)
    elif n > 1:
        # Imported only when there is at least one pair to compare, so tree
        # mode does not depend on the optional Levenshtein package.
        import Levenshtein
        # Stringify each sequence once rather than once per pair.
        texts = [str(rec.seq) for rec in ali]
        for i in range(n):
            for j in range(i):
                dists[i,j] = Levenshtein.distance(texts[i], texts[j])
                dists[j,i] = dists[i,j]
    return dists
Exemple #4
0
def eval_seq_group(gap_seqs, rfid, run_id, inp_run_id, reset = True,
                   draw_alis = draw_all_easy,
                   clade_alignment_method = clade_alignment_method,
                   max_structs = 5):
    """Collect per-structure conservation counts for a group of sequences.

    Loads ``structs``, ``energies`` and ``seq`` from
    ``butils.load_data(inp_run_id, 'output')``, keeps at most ``max_structs``
    structures, aligns the ungapped sequences against each kept structure,
    builds a tree per alignment, reconstructs an ancestor tree and passes it
    to ``tree_conservation.count_struct``.

    Besides its parameters the function reads the names ``clade_tree_method``
    and ``clade_ancestor_method`` from the enclosing module scope.

    Args:
        gap_seqs: gapped sequences; each is passed to ``utils.ungapped_seq``
            and the original list is returned under ``'seqs'``.
        rfid: used as ``run_id`` and in the ``register`` names of the cached
            profile and alignment computations.
        run_id: prefix of the per-structure run ids for the tree and ancestor
            steps; also passed to the optional drawing routine.
        inp_run_id: run id whose ``'output'`` data is loaded.
        reset: forwarded to the ``mem.getOrSet`` computations and to the
            drawing routine.
        draw_alis: when truthy, ``draw_cm_muscle_congruencies`` is called.
        clade_alignment_method: only ``'cm'`` is implemented.
        max_structs: maximum number of structures to evaluate.

    Returns:
        dict with ``'seqs'`` (``gap_seqs``) and ``'structs'``, a list holding
        one dict per evaluated structure with keys ``ref``, ``pairs``,
        ``ali``, ``muts``, ``times``, ``gaps`` and ``irresolvables``.

    Raises:
        Exception: if ``clade_alignment_method`` is not ``'cm'``.
    """
    data = butils.load_data(inp_run_id, 'output')
    structs = data['structs']
    energies = data['energies']
    # Keep the max_structs entries with the largest energies, largest first.
    esrt = argsort(energies)[::-1]
    s_inds = esrt[:max_structs]
    structs, energies = [structs[i] for i in s_inds], [energies[i] for i in s_inds]

    refseq = data['seq']

    nq = len(gap_seqs)
    ns = len(structs)

    # The format string renders only the index (argument 1); rfid is passed
    # but does not appear in the name.
    names = ['N{1:04}'.format(rfid, idx) for idx in range(nq)]
    seqs = [utils.ungapped_seq(gap_seq, name)
            for gap_seq, name in zip(gap_seqs, names)]

    profiles = mem.getOrSet(setProfiles,
                            **mem.rc({},
                                     seq = refseq, structs = structs, run_id = rfid,
                                     reset = reset,
                                     on_fail = 'compute',
                                     register = 'tuprof_{0}'.format(rfid)))

    if draw_alis:
        draw_cm_muscle_congruencies(seqs, profiles,
                                    run_id, reset = reset)

    if clade_alignment_method == 'cm':
        alis, refs, all_pairs  =\
            mem.getOrSet(setAlignments,
                         **mem.rc({},
                                  seqs = seqs, profiles = profiles,
                                  run_id = rfid, ali_type = 'struct',
                                  reset = reset,
                                  on_fail = 'compute',
                                  register = 'tuali_struct_{0}'.format(rfid)))
    else:
        raise Exception('No methods besides cm are yet implemented')

    seq_group_data = {}
    seq_group_data['seqs'] = gap_seqs
    seq_group_data['structs'] = []
    for i in range(ns):
        ali = alis[i]
        ref = refs[i]
        pairs = all_pairs[i]

        # Note: the data layout allows each alignment element to carry its
        # own pair set. Every routine so far uses a single pair set, so only
        # pairs[0] (and ref[0]) is used here.
        struct_data = {}
        struct_data.update(ref = ref[0],
                           pairs = pairs[0],
                           ali = ali)

        rid = '{0}_{1}'.format(run_id, i)

        if clade_tree_method ==  'bionj':
            tree = phyml.tree(ali, run_id = rid, bionj = True)
        else:
            # NOTE(review): this branch passes run_id rather than the
            # per-structure rid - confirm that is intended.
            tree = get_phase_tree(ali, pairs[0], run_id)

        # Attach to every leaf its aligned record and a probability of 1 per
        # position. The loop deliberately does not reuse `i`: the structure
        # index is still needed for the run ids below.
        for ct in tree.get_terminals():
            # A list comprehension instead of filter(...)[0]: subscripting the
            # result of filter() only works on Python 2.
            seq = [rec for rec in ali if rec.id == ct.name][0]
            ct.m = {'seq':seq,
                    'probs':array([1] * len(seq))}

        if clade_ancestor_method == 'independent':
            ml_tree = get_ml_ancestor_tree(tree, ali,
                                           '{0}_paml{1}'.format(run_id, i))
        else:
            ml_tree = get_structure_ancestor_tree(
                tree, ali, '{0}_stree{1}'.format(run_id, i))

        muts, times, gaps, irresolvables = tree_conservation.count_struct(ml_tree, pairs[0])

        struct_data.update(muts = muts, times = times,
                           gaps = gaps, irresolvables = irresolvables)
        seq_group_data['structs'].append(struct_data)

    return seq_group_data
Exemple #5
0
def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    """Compare trees from 'muscle' and 'struct' alignments and plot them.

    Two alignment sets are obtained through ``mem.getOrSet(setAlignments, ...)``,
    one with ``ali_type='muscle'`` and one with ``ali_type='struct'``. For
    each (muscle, struct) alignment pair the function:

    * builds a tree from each alignment with ``phyml.tree(..., bionj=True)``;
    * fills the matrices of pairwise terminal distances for both trees;
    * calls ``tree_similarity`` twice, with ``k = n - 1`` and with ``k = 6``;
    * draws both trees into one two-panel figure and saves it as PostScript
      under ``cfg.dataPath('figs/gpm2/...')``.

    Args:
        seqs: sequences forwarded to ``setAlignments``.
        profiles: profiles forwarded to ``setAlignments``.
        run_id: used for the alignment cache registers, the phyml runs, the
            ``tree_similarity`` labels and the output file name.
        reset: forwarded to both ``mem.getOrSet`` calls.

    Returns:
        None. Results are the saved figures and whatever ``tree_similarity``
        produces.
    """
    # Imported up front so a missing dependency fails before the alignments
    # and trees are computed.
    import networkx

    # Call-form print emits the same text on Python 2 and also parses on
    # Python 3.
    print('computing alignments...')
    print('  ...using muscle')
    malis, mrefs, mpairs =\
            mem.getOrSet(setAlignments,
                         **mem.rc({},
                                  seqs = seqs, profiles = profiles,
                                  run_id = run_id, ali_type = 'muscle',
                                  reset = reset,
                                  on_fail = 'compute',
                                  register = 'tuali_musc_{0}'.format(run_id)))
    print('  ...using cmalign.')
    # NOTE(review): this register has a double underscore ('tuali__struct_')
    # unlike the 'tuali_musc_' one above - confirm whether that is intended.
    salis, srefs, spairs  =\
        mem.getOrSet(setAlignments,
                     **mem.rc({},
                              seqs = seqs, profiles = profiles,
                              run_id = run_id, ali_type = 'struct',
                              reset = reset,
                              on_fail = 'compute',
                              register = 'tuali__struct_{0}'.format(run_id)))

    print('  ...making trees.')

    for idx, (m, s) in enumerate(zip(malis, salis)):
        mtree  = phyml.tree(m,run_id, bionj = True)
        stree  = phyml.tree(s,run_id, bionj = True)

        # Row/column index of every sequence id, taken from the muscle
        # alignment; the same mapping indexes the struct tree's terminals.
        maps = {elt.id: i for i, elt in enumerate(m)}
        n = len(maps)
        mdists = zeros((n, n))
        sdists = zeros((n, n))

        mterms = mtree.get_terminals()
        for n1 in mterms:
            for n2 in mterms:
                mdists[maps[n1.name],maps[n2.name]] = \
                    mtree.distance(n1,n2)

        sterms = stree.get_terminals()
        for n1 in sterms:
            for n2 in sterms:
                sdists[maps[n1.name],maps[n2.name]] = \
                    stree.distance(n1,n2)

        label = '{0}_struct_{1}'.format(run_id,idx)
        # k is one less than the number of sequences. The earlier
        # `len(sdists - 1)` subtracted 1 from the matrix elements and so
        # evaluated to len(sdists).
        # NOTE(review): both calls use the same label - confirm that
        # tree_similarity keeps their outputs apart.
        tree_similarity(sdists, mdists, label, k = len(sdists) - 1)
        tree_similarity(sdists, mdists, label, k = 6)

        f = myplots.fignum(4, (8,10))
        ct = mycolors.getct(len(mterms))

        for t, sp, ttype in zip([mtree, stree], [211,212], ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            # The layout is computed on an integer-labelled copy of the
            # graph, then the positions are mapped back onto the original
            # nodes.
            Gi = networkx.convert_node_labels_to_integers(G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)

            nodes = G.nodes()
            # Only nodes whose name is a sequence id get a visible marker;
            # names not in `maps` fall back to the colour at index -1.
            networkx.draw(G, posn, labels = dict([(n, '') for n in nodes]),
                      node_size = [100 if n.name in maps else 0 for n in nodes],
                      width = 1, edge_color = 'black',
                      ax = a,
                      node_color = [ct[maps.get(n.name, -1)] for n in nodes] )

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,0],textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(t.total_branch_length()),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')

        datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')