def calculate_region(arg):
    # Unpack the single work-item tuple (one region per call).
    prot, start, win, nseqs, trop_dict = arg

    fname = 'phyliptrees/%s-%i-%i.tree' % (prot, start, win)

    if os.path.exists(fname):
        # Re-use previously inferred trees instead of re-running PHYLIP.
        contree = dendropy.Tree.get_from_path(fname, 'nexus')
        treeset = dendropy.TreeList.get_from_path(fname + 'set', 'nexus')
    else:
        # LTR windows are nucleotide sequences; everything else is protein.
        alphabet = generic_protein if prot != 'LTR' else generic_dna
        contree = TreeingTools.phylip_tree(nseqs, alphabet=alphabet)
        treeset = dendropy.TreeList([contree])
        contree.write_to_path(fname, 'nexus')
        treeset.write_to_path(fname + 'set', 'nexus')

    try:
        bats_res = TreeingTools.run_bats(treeset, trop_dict, nreps=1000)
    except Exception:
        bats_res = None

    try:
        dmat = TreeingTools.get_pairwise_distances(contree)
        benj_res = TreeingTools.check_distance_pvals(dmat, trop_dict, nreps=50)
    except Exception:
        benj_res = None

    return prot, win, start, bats_res, benj_res
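
# A minimal usage sketch for calculate_region above. The argument shapes are
# assumptions inferred from how the values are used: nseqs as (name, sequence)
# pairs and trop_dict mapping the same names to a group label. The sequences
# and labels below are invented for illustration only.
def _example_calculate_region():
    nseqs = [('patA', 'MRVKEKYQHL'),
             ('patB', 'MRVKGKYQHL'),
             ('patC', 'MRVKEKYRHL')]
    trop_dict = {'patA': 'R5', 'patB': 'X4', 'patC': 'R5'}
    arg = ('gp120', 0, 50, nseqs, trop_dict)  # (prot, start, win, nseqs, trop_dict)
    return calculate_region(arg)
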
def calculate_region(arg):
    # Later variant of calculate_region: also keyed by genome name and subtype,
    # and caches both the tree and the pairwise distance matrix on disk.
    gname, sub, prot, start, win, nseqs, trop_dict = arg

    treename = 'quicktrees/%s-%s-%s-%i-%i.tree' % (gname, sub, prot, start, win)
    matfname = 'quicktrees/%s-%s-%s-%i-%i.pkl' % (gname, sub, prot, start, win)

    if os.path.exists(treename):
        #benj_res = 'Already Processed'
        #return gname, sub, prot, win, start, benj_res

        with open(matfname) as handle:
            dmat = pickle.load(handle)

        with open(treename) as handle:
            tree = dendropy.Tree.get_from_stream(handle, 'newick')

    else:
        # LTR windows are nucleotide sequences; everything else is protein.
        is_aa = prot != 'LTR'
        alphabet = generic_protein if is_aa else generic_dna

        try:
            tree, dmat = TreeingTools.phylip_tree_collapse_unique(nseqs, alphabet=alphabet)
        except ValueError:
            benj_res = 'Too few unique sequences to process'
            return gname, sub, prot, win, start, benj_res
        except Exception:
            benj_res = 'uncaught exception in dist-mat'
            return gname, sub, prot, win, start, benj_res
        print 'writing'
        with open(matfname, 'w') as handle:
            pickle.dump(dmat, handle)
        with open(treename, 'w') as handle:
            tree.write_to_stream(handle, 'newick')

    try:
        benj_res = TreeingTools.check_distance_pvals(dmat, trop_dict, nreps=50)
    except AssertionError:
        benj_res = 'too few groups'
        return gname, sub, prot, win, start, benj_res
    except Exception:
        benj_res = 'uncaught exception'
        return gname, sub, prot, win, start, benj_res

    # Tack the association-index results onto the distance-based results.
    try:
        out = TreeingTools.evaluate_association_index(tree, trop_dict)
        benj_res['AI'], benj_res['AI-pval'], benj_res['AI-null'] = out
    except Exception:
        benj_res['AI'], benj_res['AI-pval'], benj_res['AI-null'] = ('error', 'error', 'error')

    return gname, sub, prot, win, start, benj_res
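
# Because each work item is packed into a single tuple, calculate_region drops
# straight into a multiprocessing pool. This is only a dispatch sketch: the
# window_args iterable of (gname, sub, prot, start, win, nseqs, trop_dict)
# tuples is a hypothetical stand-in for however the windows are built.
def _dispatch_regions(window_args, nworkers=4):
    from multiprocessing import Pool
    pool = Pool(nworkers)
    results = []
    try:
        for res in pool.imap_unordered(calculate_region, window_args):
            # res is (gname, sub, prot, win, start, benj_res)
            results.append(res)
    finally:
        pool.close()
        pool.join()
    return results
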
def test_make_mrbayes_trees():

    seqs = tree_seqs()
    con_tree, all_trees = TreeingTools.make_mrbayes_trees(seqs, is_aa=False)
    # The per-tree assertions are currently disabled; re-enable the yield below
    # to turn this back into a generative nose test.
    for tst in check_tree(con_tree):
        # yield tst
        pass
def rolling_tree_apply(tup):

    group_series, seq_series, kwargs = tup

    fname = '/home/will/SubCData/Trees/Tree-%(sub)s-%(Prot)s-%(Start)i-%(WinSize)i.newick' % kwargs
    if os.path.exists(fname):
        # The tree for this window already exists on disk; nothing to do.
        return True

    # LTR windows are nucleotide sequences; everything else is protein.
    alpha = generic_dna if kwargs['Prot'] == 'LTR' else generic_protein

    # Drop rows with too few aligned columns, then keep only sequences that
    # also have a group (tropism) assignment.
    seq_series = seq_series.dropna(thresh=5)
    vseq, vgroup = seq_series.align(group_series.dropna(), join='inner', axis=0)

    nseq_ser = vseq.apply(append_seq, axis=1)
    nseqs = sorted(nseq_ser.to_dict().items())

    trop_dict = vgroup.to_dict()
    #print nseqs
    #try:
    #    tree, dmat = TreeingTools.phylip_tree_collapse_unique(nseqs, alphabet=alpha, use_fast=True)
    #except:
    #    return False
    #print 'treeing', fname
    tree = TreeingTools.run_FastTree(nseqs, alphabet=alpha, uniq_seqs=True)

    with open(fname, 'w') as handle:
        tree.write(handle, schema='newick')
    return True

    # NOTE: everything below is unreachable -- the function was short-circuited
    # above to only build and save the FastTree trees. Kept for reference.
    try:
        tree, dmat = TreeingTools.phylip_tree_collapse_unique(nseqs, alphabet=alpha, use_fast=True)
        benj_res = TreeingTools.check_distance_pvals(dmat, trop_dict, nreps=50)
    except Exception:
        return kwargs

    benj_res.update(kwargs)
    try:
        out = TreeingTools.evaluate_association_index(tree, trop_dict)
        benj_res['AI'], benj_res['AI-pval'], benj_res['AI-null'] = out
    except Exception:
        benj_res['AI'], benj_res['AI-pval'], benj_res['AI-null'] = (None, None, None)

    return benj_res
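
# Sketch of how rolling_tree_apply is meant to be driven: one
# (group_series, window_frame, kwargs) tuple per sliding window, where kwargs
# carries the fields used in the output filename. The window generator below
# and the names in the commented usage lines are hypothetical; the real
# notebook builds these from the full alignment DataFrame elsewhere.
def _make_window_jobs(group_series, align_df, sub, prot, win_size=50, step=25):
    jobs = []
    for start in range(0, align_df.shape[1] - win_size + 1, step):
        kwargs = {'sub': sub, 'Prot': prot, 'Start': start, 'WinSize': win_size}
        jobs.append((group_series, align_df.iloc[:, start:start + win_size], kwargs))
    return jobs

# jobs = _make_window_jobs(trop_series, nef_align_df, 'C', 'Nef')
# results = map(rolling_tree_apply, jobs)
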
def test_generate_mrbayes_nexus():

    cmd = TreeingTools.generate_mrbayes_nexus("/path/to/alignment", "/path/to/output")
    checks = [
        "begin mrbayes;",
        "set autoclose=yes nowarn=yes;",
        "execute /path/to/alignment;",
        "prset aamodelpr = mixed;",
        "sump;",
        "sumt;",
    ]
    for check in checks:
        yield ok_, check in cmd, 'Missing: "%s"' % check
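
# For reference, the command block the test above expects should look roughly
# like the following MrBayes batch file (reconstructed from the checked
# substrings; the mcmc line is a guess and is not asserted by the test):
#
#   begin mrbayes;
#       set autoclose=yes nowarn=yes;
#       execute /path/to/alignment;
#       prset aamodelpr = mixed;
#       mcmc ...;
#       sump;
#       sumt;
#   end;
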
def test_bats_format_nexus():

    taxons = dendropy.TaxonSet(["test_check%i" % i for i in range(10)])
    trop_dict = dict([("test_check%i" % i, (i % 2) == 0) for i in range(10)])
    tlist = [dendropy.treesim.uniform_pure_birth(taxons) for _ in range(20)]
    treelist = dendropy.TreeList(tlist)

    outhandle = StringIO()
    TreeingTools.bats_format_nexus(treelist, outhandle, trop_dict)

    outhandle.seek(0)
    good_lines = ifilter(lambda x: len(x.strip()) > 0, outhandle)
    eq_(good_lines.next().strip(), "#NEXUS", 'First line must be "#NEXUS"')
    eq_(good_lines.next().strip(), "begin states;", 'Second line line must be "begin states;"')

    num = 0
    for num, line in enumerate(good_lines, 1):
        if line.strip() == "End;":
            break

        tnum, state = line.strip().split()
        eq_(num, int(tnum))
        if ((num - 1) % 2) == 0:
            eq_(state, "True")
        else:
            eq_(state, "False")
    eq_(num, len(trop_dict) + 1, "Some leaves were missing!")

    num = 0
    eq_(good_lines.next().strip(), "begin trees;")
    for num, line in enumerate(good_lines, 1):
        if line.strip() == "end;":
            break
        ok_(line.startswith("tree tree_%i" % num))
        ok_("test_check" not in line, "Taxon names are in the tree!")
    eq_(num, len(tlist) + 1, "Some trees were missing!")
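
# The BaTS input layout that test_bats_format_nexus enforces, shown as a small
# example (taxon numbers replace taxon names inside the trees; the newick
# strings here are invented placeholders):
#
#   #NEXUS
#   begin states;
#       1 True
#       2 False
#       ...
#   End;
#   begin trees;
#       tree tree_1 = ((1,2),(3,4));
#       tree tree_2 = ((1,3),(2,4));
#       ...
#   end;
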
def test_fast_tree():

    seqs = tree_seqs()
    tree = TreeingTools.run_FastTree(seqs, alphabet=generic_dna)
    for tst in check_tree(tree):
        yield tst
def test_phylip_tree():

    seqs = tree_seqs()
    tree, _ = TreeingTools.phylip_tree(seqs, alphabet=generic_dna)
    for tst in check_tree(tree):
        yield tst
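
# tree_seqs() and check_tree() are helpers defined elsewhere in the test
# module. A minimal sketch of what they are assumed to provide (names and
# sequences invented): tree_seqs() returns (name, sequence) pairs to build a
# small test tree from, and check_tree() yields nose-style checks over the
# resulting dendropy tree.
def _example_tree_seqs():
    bases = 'ACGT'
    return [('seq%i' % i, bases[i % 4] * 5 + 'ACGTACGTAC') for i in range(5)]

def _example_check_tree(tree):
    taxa = set(leaf.taxon.label for leaf in tree.leaf_iter())
    yield ok_, len(taxa) > 0, 'Tree has no leaves'
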
# <codecell>


# <codecell>

#with open('allgp120.fasta', 'w') as handle:
tres = []
for key, row in wanted_data[['gp120-seq-align', 'Tropism']].dropna().iterrows():
    oname = key+'-'+row['Tropism']
    tres.append((oname, ''.join(row['gp120-seq-align'])))
    
    

# <codecell>

tree, dmat = TreeingTools.phylip_tree_collapse_unique(tres, alphabet=generic_protein)

# <codecell>

with open('gp120tree.nexus', 'w') as handle:
    tree.write_to_stream(handle, 'nexus')

# <codecell>

import networkx
with open('gp120tree.dot') as handle:
    new_tree = networkx.read_dot(handle)
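
# <codecell>

# The gp120tree.dot file read above is assumed to have been exported in an
# earlier session. An alternative sketch that skips the DOT round-trip and
# builds a networkx graph straight from the dendropy tree (internal node ids
# are invented labels; leaf nodes keep their taxon names):
def tree_to_graph(dtree):
    graph = networkx.Graph()
    labels = {}
    for i, node in enumerate(dtree.preorder_node_iter()):
        labels[id(node)] = node.taxon.label if node.taxon else 'internal_%i' % i
    for node in dtree.preorder_node_iter():
        if node.parent_node is not None:
            graph.add_edge(labels[id(node.parent_node)], labels[id(node)],
                           weight=node.edge.length or 0.0)
    return graph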

# <codecell>

# gp120_features (built in an earlier cell) is expected to hold
# (name, start, stop) tuples in alignment coordinates.
pos = networkx.spring_layout(new_tree, dim=100)
for name, start, stop in gp120_features:
    rect = Rectangle([start, 0], stop - start, 25, facecolor="r", alpha=0.2)
    plt.gca().add_patch(rect)
    # plt.text((start+stop)/2, 330, name)
    # plt.vlines([start, stop], 0, 300)

plt.legend(loc="upper left")
plt.ylim([0, 25])
plt.xlim([0, 460])
plt.hold(False)
plt.savefig("gp120-multi-smoothed.png")

# <codecell>

import pickle

# <codecell>

with open("wanted_data.pkl") as handle:
    wanted_data = pickle.load(handle)

# <codecell>

import TreeingTools

seq_data = wanted_data["Nef-seq-align"].dropna().map(lambda x: "".join(x[:30])).to_dict().items()
with open("test_nef_seq.phylip", "w") as handle:
    TreeingTools.write_phylip_seqs(seq_data, handle)

# <codecell>
wanted_pat = pat_data[cols.keys()].dropna()
wanted_scores = ltr_df_cp[score_cols+seq_cols].dropna()
wanted_scores['TFJoin'] = wanted_scores[seq_cols].apply(lambda x: ''.join(x), axis=1)
#wanted_scores = wanted_scores.drop(seq_cols, axis=1)

check_data = pd.concat(wanted_pat.align(wanted_scores, axis=0, join='inner'), axis=1).rename(columns = cols)
check_data = check_data.fillna(check_data[score_cols].min())

ncols = dict((col, col.replace('-', '_').replace('/', '_')) for col in check_data.columns)
check_data = check_data.rename(columns = ncols)
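
# <codecell>

# A toy illustration of the align/concat pattern used above: align two frames
# on their shared index (patients), then join their columns side by side. The
# frame contents are made up; only the pattern matches the cell above.
def _align_concat_demo():
    import pandas as pd
    pat = pd.DataFrame({'Age': [30, 41, 25]}, index=['A1', 'A2', 'A3'])
    scores = pd.DataFrame({'Score': [0.1, 0.9]}, index=['A2', 'A3'])
    # Inner-join on the index, then stack the columns of the two aligned frames.
    return pd.concat(pat.align(scores, axis=0, join='inner'), axis=1)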

# <codecell>

import TreeingTools

tree = TreeingTools.run_FastTree(check_data['TFJoin'].to_dict().items(),
                                 alphabet=TreeingTools.generic_dna)

# <codecell>

import networkx as nx
from itertools import combinations
import csv
with open('ltr_tree.nwk', 'w') as handle:
    tree.write_to_stream(handle, schema='newick')
