def calculate_region(arg):
    """Build (or reload) a phylip consensus tree for one protein window and
    run the BaTS and distance-pvalue association tests against trop_dict.

    arg : tuple of (prot, start, win, nseqs, trop_dict) -- packed for use
        with multiprocessing-style map calls.

    Returns (prot, win, start, bats_res, benj_res); either result is None
    when its test raised.
    """
    prot, start, win, nseqs, trop_dict = arg
    fname = 'phyliptrees/%s-%i-%i.tree' % (prot, start, win)
    if os.path.exists(fname):
        # Cached run: reload the consensus tree and the tree set from disk.
        contree = dendropy.Tree.get_from_path(fname, 'nexus')
        treeset = dendropy.TreeList.get_from_path(fname + 'set', 'nexus')
    else:
        # LTR is a nucleotide region; everything else is protein.
        alphabet = generic_protein if prot != 'LTR' else generic_dna
        # NOTE(review): elsewhere in this file phylip_tree returns a
        # (tree, dmat) tuple -- confirm this single-value unpack is correct.
        contree = TreeingTools.phylip_tree(nseqs, alphabet=alphabet)
        treeset = dendropy.TreeList([contree])
        contree.write_to_path(fname, 'nexus')
        treeset.write_to_path(fname + 'set', 'nexus')

    # Best-effort statistics: a failure in either test must not abort the
    # whole batch, but do not mask SystemExit/KeyboardInterrupt (a bare
    # except would).
    try:
        bats_res = TreeingTools.run_bats(treeset, trop_dict, nreps=1000)
    except Exception:
        bats_res = None

    try:
        dmat = TreeingTools.get_pairwise_distances(contree)
        benj_res = TreeingTools.check_distance_pvals(dmat, trop_dict, nreps=50)
    except Exception:
        benj_res = None

    return prot, win, start, bats_res, benj_res
def calculate_region(arg): gname, sub, prot, start, win, nseqs, trop_dict = arg treename = 'quicktrees/%s-%s-%s-%i-%i.tree' % (gname, sub, prot, start, win) matfname = 'quicktrees/%s-%s-%s-%i-%i.pkl' % (gname, sub, prot, start, win) if os.path.exists(treename): #benj_res = 'Already Processed' #return gname, sub, prot, win, start, benj_res with open(matfname) as handle: dmat = pickle.load(handle) with open(treename) as handle: tree = dendropy.Tree.get_from_stream(handle, 'newick') else: is_aa = prot != 'LTR' alphabet = generic_protein if is_aa else generic_dna try: tree, dmat = TreeingTools.phylip_tree_collapse_unique(nseqs, alphabet=alphabet) except ValueError: benj_res = 'Too few unique sequences to process' return gname, sub, prot, win, start, benj_res except: benj_res = 'uncaught exception in dist-mat' return gname, sub, prot, win, start, benj_res print 'writing' with open(matfname, 'w') as handle: pickle.dump(dmat, handle) with open(treename, 'w') as handle: tree.write_to_stream(handle, 'newick') try: benj_res = TreeingTools.check_distance_pvals(dmat, trop_dict, nreps = 50) except AssertionError: benj_res = 'too few groups' return gname, sub, prot, win, start, benj_res except: benj_res = 'uncaught exception' return gname, sub, prot, win, start, benj_res try: out = TreeingTools.evaluate_association_index(tree, trop_dict) benj_res['AI'], benj_res['AI-pval'], benj_res['AI-null'] = out except: benj_res['AI'], benj_res['AI-pval'], benj_res['AI-null'] = ('error', 'error', 'error') return gname, sub, prot, win, start, benj_res
def test_make_mrbayes_trees():
    """Exercise make_mrbayes_trees on the fixture sequences.

    The per-tree checks are currently disabled; the loop only drains the
    check generator without yielding assertions.
    """
    sequences = tree_seqs()
    consensus, all_trees = TreeingTools.make_mrbayes_trees(sequences, is_aa=False)
    for tst in check_tree(consensus):
        # yield tst
        pass
def rolling_tree_apply(tup):
    """Build and cache a FastTree newick tree for one rolling window.

    tup : (group_series, seq_series, kwargs) where kwargs supplies the
        'sub', 'Prot', 'Start' and 'WinSize' keys used in the cache path.

    Returns True once the tree file exists (either pre-cached or newly
    written).

    NOTE(review): the original body carried a large unreachable region after
    the unconditional ``return True`` (a phylip/check_distance_pvals/AI
    computation, including the only use of trop_dict); that dead code has
    been removed here.
    """
    group_series, seq_series, kwargs = tup
    fname = '/home/will/SubCData/Trees/Tree-%(sub)s-%(Prot)s-%(Start)i-%(WinSize)i.newick' % kwargs
    if os.path.exists(fname):
        # Already computed on a previous run.
        return True

    # LTR is a nucleotide region; everything else is protein.
    alpha = generic_dna if kwargs['Prot'] == 'LTR' else generic_protein

    # Drop sparse rows, then keep only patients present in both series.
    seq_series = seq_series.dropna(thresh=5)
    vseq, vgroup = seq_series.align(group_series.dropna(), join='inner', axis=0)
    nseq_ser = vseq.apply(append_seq, axis=1)
    nseqs = sorted(nseq_ser.to_dict().items())

    tree = TreeingTools.run_FastTree(nseqs, alphabet=alpha, uniq_seqs=True)
    with open(fname, 'w') as handle:
        tree.write(handle, schema='newick')
    return True
def test_generate_mrbayes_nexus():
    """Generator test: the MrBayes command block must contain each of the
    expected NEXUS directives."""
    cmd = TreeingTools.generate_mrbayes_nexus("/path/to/alignment", "/path/to/output")
    expected_fragments = (
        "begin mrbayes;",
        "set autoclose=yes nowarn=yes;",
        "execute /path/to/alignment;",
        "prset aamodelpr = mixed;",
        "sump;",
        "sumt;",
    )
    for fragment in expected_fragments:
        yield ok_, fragment in cmd, 'Missing: "%s"' % fragment
def test_bats_format_nexus():
    """Round-trip check of bats_format_nexus output structure.

    Writes 20 random birth-process trees over 10 alternating True/False
    taxa, then walks the emitted NEXUS text: header, states block with one
    numbered state line per taxon, and an anonymised trees block.
    """
    taxa = dendropy.TaxonSet(["test_check%i" % i for i in range(10)])
    trop_dict = dict([("test_check%i" % i, (i % 2) == 0) for i in range(10)])
    trees = [dendropy.treesim.uniform_pure_birth(taxa) for _ in range(20)]
    treelist = dendropy.TreeList(trees)

    buf = StringIO()
    TreeingTools.bats_format_nexus(treelist, buf, trop_dict)
    buf.seek(0)

    # Iterate only the non-blank lines of the output.
    good_lines = ifilter(lambda x: len(x.strip()) > 0, buf)
    eq_(good_lines.next().strip(), "#NEXUS", 'First line must be "#NEXUS"')
    eq_(good_lines.next().strip(), "begin states;", 'Second line line must be "begin states;"')

    count = 0
    for count, raw in enumerate(good_lines, 1):
        stripped = raw.strip()
        if stripped == "End;":
            break
        tnum, state = stripped.split()
        eq_(count, int(tnum))
        # Even-indexed taxa were marked True, odd-indexed False.
        eq_(state, "True" if ((count - 1) % 2) == 0 else "False")
    eq_(count, len(trop_dict) + 1, "Some leafs were missing!")

    count = 0
    eq_(good_lines.next().strip(), "begin trees;")
    for count, raw in enumerate(good_lines, 1):
        if raw.strip() == "end;":
            break
        ok_(raw.startswith("tree tree_%i" % count))
        ok_("test_check" not in raw, "Taxon names are in the tree!")
    eq_(count, len(trees) + 1, "Some trees were missing!")
def test_fast_tree():
    """Generator test: a FastTree run over the fixture sequences must pass
    every tree sanity check."""
    tree = TreeingTools.run_FastTree(tree_seqs(), alphabet=generic_dna)
    for check in check_tree(tree):
        yield check
def test_phylip_tree():
    """Generator test: a phylip run over the fixture sequences must pass
    every tree sanity check (the distance matrix is ignored)."""
    tree, _ = TreeingTools.phylip_tree(tree_seqs(), alphabet=generic_dna)
    for check in check_tree(tree):
        yield check
# <codecell> # <codecell> #with open('allgp120.fasta', 'w') as handle: tres = [] for key, row in wanted_data[['gp120-seq-align', 'Tropism']].dropna().iterrows(): oname = key+'-'+row['Tropism'] tres.append((oname, ''.join(row['gp120-seq-align']))) # <codecell> tree, dmat = TreeingTools.phylip_tree_collapse_unique(tres, alphabet=generic_protein) # <codecell> with open('gp120tree.nexus', 'w') as handle: tree.write_to_stream(handle, 'nexus') # <codecell> import networkx with open('gp120tree.dot') as handle: new_tree = networkx.read_dot(handle) # <codecell> pos = networkx.spring_layout(new_tree, dim=100)
# Overlay one translucent red rectangle per annotated gp120 feature on the
# current axes, then finalize and save the figure.
for name, start, stop in gp120_features:
    rect = Rectangle([start, 0], stop - start, 25, facecolor="r", alpha=0.2)
    plt.gca().add_patch(rect)
    # plt.text((start+stop)/2, 330, name)
    # plt.vlines([start, stop], 0, 300)
plt.legend(loc="upper left")
plt.ylim([0, 25])
plt.xlim([0, 460])
plt.hold(False)
plt.savefig("gp120-multi-smoothed.png")

# <codecell>

import pickle

# <codecell>

# Reload the previously pickled dataset.
# (presumably a pandas DataFrame, given the column access below -- verify)
with open("wanted_data.pkl") as handle:
    wanted_data = pickle.load(handle)

# <codecell>

import TreeingTools

# Take the first 30 residues of each Nef alignment and write them out in
# phylip format for an external run.
seq_data = wanted_data["Nef-seq-align"].dropna().map(lambda x: "".join(x[:30])).to_dict().items()
with open("test_nef_seq.phylip", "w") as handle:
    TreeingTools.write_phylip_seqs(seq_data, handle)

# <codecell>
# Join patient data with LTR TF-binding scores/sequences on their shared
# index, concatenate the per-site sequences into one string per patient,
# and sanitize column names for downstream tools.
wanted_pat = pat_data[cols.keys()].dropna()
wanted_scores = ltr_df_cp[score_cols+seq_cols].dropna()
wanted_scores['TFJoin'] = wanted_scores[seq_cols].apply(lambda x: ''.join(x), axis=1)
#wanted_scores = wanted_scores.drop(seq_cols, axis=1)
check_data = pd.concat(wanted_pat.align(wanted_scores, axis=0, join='inner'), axis=1).rename(columns = cols)
# Fill gaps with each score column's minimum (a floor value, not zero).
check_data = check_data.fillna(check_data[score_cols].min())
# Replace '-' and '/' so every column is a valid identifier.
ncols = dict((col, col.replace('-', '_').replace('/', '_')) for col in check_data.columns)
check_data = check_data.rename(columns = ncols)

# <codecell>

import TreeingTools

# Build a FastTree from the joined TF sequences.
tree = TreeingTools.run_FastTree(check_data['TFJoin'].to_dict().items(), alphabet=TreeingTools.generic_dna)

# <codecell>

import networkx as nx
from itertools import combinations
import csv

# Persist the tree; exclude_chars drops the character data from the output.
with open('ltr_tree.nwk', 'w') as handle:
    tree.write_to_stream(handle, schema = 'phylip', exclude_chars=True)

# <codecell>

# <codecell>