def test_backfill_names_gap(self): """correctly backfill names""" consensus_tree = DndParser("(((s1,s2)g1,(s3,s4)g2,(s5,s6)g3)f1)o1;") rank_lookup = {'s':6,'g':5,'f':4,'o':3,'c':2,'p':1,'k':0} for n in consensus_tree.traverse(include_self=True): n.Rank = rank_lookup[n.Name[0]] input = "((((1)s1,(2)s2),((3)s3,(4)s5)))o1;" lookup = dict([(n.Name, n) for n in consensus_tree.traverse(include_self=True)]) #exp = "((((1)s1,(2)s2)g1,((3)'g2; s3',(4)'g3; s5')))'o1; f1'" t = DndParser(input) t.Rank = 3 t.Children[0].Rank = None t.Children[0].Children[0].Rank = None t.Children[0].Children[1].Rank = None t.Children[0].Children[0].Children[0].Rank = 6 t.Children[0].Children[0].Children[1].Rank = 6 t.Children[0].Children[1].Children[0].Rank = 6 t.Children[0].Children[1].Children[1].Rank = 6 backfill_names_gap(t, lookup) self.assertEqual(t.BackFillNames, ['o1']) self.assertEqual(t.Children[0].BackFillNames, []) self.assertEqual(t.Children[0].Children[0].BackFillNames, []) self.assertEqual(t.Children[0].Children[1].BackFillNames, []) self.assertEqual(t.Children[0].Children[0].Children[0].BackFillNames, ['f1','g1','s1']) self.assertEqual(t.Children[0].Children[0].Children[1].BackFillNames, ['f1','g1','s2']) self.assertEqual(t.Children[0].Children[1].Children[0].BackFillNames, ['f1','g2','s3']) self.assertEqual(t.Children[0].Children[1].Children[1].BackFillNames, ['f1','g3','s5'])
def test_score_tree(self): """Determine's the tree's fmeasure score""" # set RankNames and RankNameScores # if name in RankNames, check score, look at tips, etc t_str = "(((a,b),(c,d))e,(f,g),h)i;" t = DndParser(t_str) t.RankNames = ['i',None,None,None] # 1.0 * 6 t.RankNameScores = [1.0,None,None,None] t.Children[0].RankNames = [None,'e','foo',None] # 0.5 * 3, 0.6 * 3 t.Children[0].RankNameScores = [None, 0.5, 0.6, None] t.Children[0].Children[0].RankNames = [None] * 7 t.Children[0].Children[1].RankNames = [None] * 7 t.Children[1].RankNames = [None] * 7 t.Children[1].RankNameScores = [None] * 7 tips = t.tips() tips[0].Consensus = [None] * 7 tips[1].Consensus = [1,3,None,None] tips[2].Consensus = [2,4,5,None] tips[3].Consensus = [None,1,None,None] tips[4].Consensus = [None,1,None,None] tips[5].Consensus = [2,None,3,None] tips[6].Consensus = [None,4,None,None] decorate_ntips(t) exp = ((1.0 * 6) + (0.5 * 3) + (0.6 * 3)) / (6 + 3 + 3) obs = score_tree(t) self.assertEqual(obs, exp)
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): """Returns a tree from Alignment object aln. aln: an cogent.core.alignment.Alignment object, or data that can be used to build one. moltype: cogent.core.moltype.MolType object best_tree: if True (default:False), uses a slower but more accurate algorithm to build the tree. params: dict of parameters to pass in to the Clustal app controller. The result will be an cogent.core.tree.PhyloNode object, or None if tree fails. """ # Create instance of app controller, enable tree, disable alignment app = Clustalw(InputHandler="_input_as_multiline_string", params=params, WorkingDir="/tmp") app.Parameters["-align"].off() # Set params to empty dict if None. if params is None: params = {} if moltype == DNA or moltype == RNA: params["-type"] = "d" elif moltype == PROTEIN: params["-type"] = "p" else: raise ValueError, "moltype must be DNA, RNA, or PROTEIN" # best_tree -> bootstrap if best_tree: if "-bootstrap" not in params: app.Parameters["-bootstrap"].on(1000) if "-seed" not in params: app.Parameters["-seed"].on(randint(0, 1000)) if "-bootlabels" not in params: app.Parameters["-bootlabels"].on("nodes") else: app.Parameters["-tree"].on() # Setup mapping. Clustalw clips identifiers. We will need to remap them. seq_collection = SequenceCollection(aln) int_map, int_keys = seq_collection.getIntMap() int_map = SequenceCollection(int_map) # Collect result result = app(int_map.toFasta()) # Build tree tree = DndParser(result["Tree"].read(), constructor=PhyloNode) for node in tree.tips(): node.Name = int_keys[node.Name] # Clean up result.cleanUp() del (seq_collection, app, result, int_map, int_keys) return tree
def get_support_file(group, tree_file, support_file): def test_group(s): try: return group[s] except KeyError: return None color_map = {} for ind, group_name in enumerate(list(set(group.itervalues()))): if len(list(set(group.itervalues())))>20: color_map[group_name] = "#000000" else: color_map[group_name] = COLS_BREWER[ind] color_dict = {} t = DndParser(open(tree_file, 'U'), constructor=PhyloNode, unescape_name=True) nodes = t.getNodesDict() for node, value in nodes.iteritems(): sub_nodes = value.getNodeNames() sub_node_groups = set(map(test_group, sub_nodes)) try: sub_node_groups.remove(None) except KeyError: pass sub_node_groups = list(sub_node_groups) if (len(sub_node_groups)) > 1: color_dict[node] = 'grey' else: color_dict[node] = color_map[sub_node_groups[0]] with open(support_file, 'w') as out: for node, color in color_dict.iteritems(): out.write('%s\t%s\n' % (node, color))
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): """Returns a tree from alignment Will check MolType of aln object """ if params is None: params = {} if moltype == DNA or moltype == RNA: params['-nt'] = True elif moltype == PROTEIN: params['-nt'] = False else: raise ValueError, \ "FastTree does not support moltype: %s" % moltype.label if best_tree: params['-slow'] = True #Create mapping between abbreviated IDs and full IDs int_map, int_keys = aln.getIntMap() #Create SequenceCollection from int_map. int_map = SequenceCollection(int_map, MolType=moltype) app = FastTree(params=params) result = app(int_map.toFasta()) tree = DndParser(result['Tree'].read(), constructor=PhyloNode) #remap tip names for tip in tree.tips(): tip.Name = int_keys[tip.Name] return tree
def sort_order(records): """returns the sort order by id""" tree = DndParser("(((nosp,sp)named,notnamed)inpref,\ ((nosp,sp)named,notnamed)outpref);") for n in tree.tips(): n.LengthsAndIds = [] lookup = {} lookup[('named_isolate',True,True)] = \ tree.Children[0].Children[0].Children[0] lookup[('named_isolate',True,False)] = \ tree.Children[0].Children[0].Children[1] lookup[('clone',True,False)] = \ tree.Children[0].Children[1] lookup[('named_isolate',False,True)] = \ tree.Children[1].Children[0].Children[0] lookup[('named_isolate',False,False)] = \ tree.Children[1].Children[0].Children[1] lookup[('clone',False,False)] = \ tree.Children[1].Children[1] for k,v in records.items(): to_lookup = tuple(v[1:]) lookup[to_lookup].LengthsAndIds.append((v[0],k)) order = [] # tips go left->right for n in tree.tips(): order.extend([i for l,i in sorted(n.LengthsAndIds)[::-1]]) return order
def test_DndParser(self): """DndParser tests""" t_str = "(A_a,(B:1.0,C),'D_e':0.5)E;" tree_unesc = DndParser(t_str, PhyloNode, unescape_name=True) tree_esc = DndParser(t_str, PhyloNode, unescape_name=False) self.assertEqual(tree_unesc.Name, 'E') self.assertEqual(tree_unesc.Children[0].Name, 'A a') self.assertEqual(tree_unesc.Children[1].Children[0].Name, 'B') self.assertEqual(tree_unesc.Children[1].Children[0].Length, 1.0) self.assertEqual(tree_unesc.Children[1].Children[1].Name, 'C') self.assertEqual(tree_unesc.Children[2].Name, 'D_e') self.assertEqual(tree_unesc.Children[2].Length, 0.5) self.assertEqual(tree_esc.Name, 'E') self.assertEqual(tree_esc.Children[0].Name, 'A_a') self.assertEqual(tree_esc.Children[1].Children[0].Name, 'B') self.assertEqual(tree_esc.Children[1].Children[0].Length, 1.0) self.assertEqual(tree_esc.Children[1].Children[1].Name, 'C') self.assertEqual(tree_esc.Children[2].Name, "'D_e'") self.assertEqual(tree_esc.Children[2].Length, 0.5) reload_test = tree_esc.getNewick(with_distances=True, \ escape_name=False) obs = DndParser(reload_test, unescape_name=False) self.assertEqual(obs.getNewick(with_distances=True), \ tree_esc.getNewick(with_distances=True)) reload_test = tree_unesc.getNewick(with_distances=True, \ escape_name=False) obs = DndParser(reload_test, unescape_name=False) self.assertEqual(obs.getNewick(with_distances=True), \ tree_unesc.getNewick(with_distances=True))
def check_tree_subset(fasta_labels, tree_fp): """ Returns a list of all fasta labels that are not a subset of the tree fasta_labels: list of fasta labels tree_fp: tree filepath """ # Need to get modified fasta labels with underscore stripped raw_fasta_labels = set([label.split('_')[0] for label in fasta_labels]) tree_f = open(tree_fp, "U") tree = DndParser(tree_f) # Get a set of tree tip names tree_tips = set(tree.getTipNames()) labels_not_in_tips = [] for curr_label in raw_fasta_labels: if curr_label not in tree_tips: labels_not_in_tips.append(curr_label) # Return True if all found in tree tips if len(labels_not_in_tips) == 0: labels_not_in_tips = True return labels_not_in_tips
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): """Returns a tree from alignment Will check MolType of aln object """ if params is None: params = {} if moltype == DNA or moltype == RNA: params["-nt"] = True elif moltype == PROTEIN: params["-nt"] = False else: raise ValueError, "FastTree does not support moltype: %s" % moltype.label if best_tree: params["-slow"] = True # Create mapping between abbreviated IDs and full IDs int_map, int_keys = aln.getIntMap() # Create SequenceCollection from int_map. int_map = SequenceCollection(int_map, MolType=moltype) app = FastTree(params=params) result = app(int_map.toFasta()) tree = DndParser(result["Tree"].read(), constructor=PhyloNode) # remap tip names for tip in tree.tips(): tip.Name = int_keys[tip.Name] return tree
def test_make_distance_based_exclusion_fn(self): """make_distance_based_exclusion_fn should return a working function""" exclude_similar_strains = make_distance_based_exclusion_fn(0.03) # Test that new function is documented exp_doc = "Exclude neighbors of tip within 0.030000 branch length units" self.assertEqual(exp_doc, exclude_similar_strains.__doc__) # Test that the function works test_tree = self.SimpleTree.deepcopy() # print test_tree.getNewick(with_distances=True) tip = test_tree.getNodeMatchingName("C") obs = exclude_similar_strains(tip, test_tree).getNewick(with_distances=True) exp = "(A:0.02,B:0.01)root;" self.assertEqual(obs, exp) # Test on a tree where a single node will remain test_tree = DndParser("((A:0.02,B:0.01)E:0.05,(C:0.06,D:0.01)F:0.05)root;") # print test_tree.getNewick(with_distances=True) tip = test_tree.getNodeMatchingName("D") obs = exclude_similar_strains(tip, test_tree).getNewick(with_distances=True) exp = "((A:0.02,B:0.01)E:0.05,C:0.11)root;" self.assertEqual(obs, exp) # Test that we raise if distance is too large test_tree = self.SimpleTree.deepcopy() test_fn = make_distance_based_exclusion_fn(300.0) tip = test_tree.getNodeMatchingName("C") self.assertRaises(ValueError, test_fn, tip, test_tree)
def test_bifurcating(self): """Coerces nodes to have <= 2 children""" t_str = "((a:1,b:2,c:3)d:4,(e:5,f:6,g:7)h:8,(i:9,j:10,k:11)l:12)m:14;" t = DndParser(t_str) # can't break up easily... sorry 80char exp_str = "((a:1.0,(b:2.0,c:3.0):0.0)d:4.0,((e:5.0,(f:6.0,g:7.0):0.0)h:8.0,(i:9.0,(j:10.0,k:11.0):0.0)l:12.0):0.0)m:14.0;" obs = t.bifurcating()
def build_tree_from_alignment(aln, moltype, best_tree=False, params={}): """Returns a tree from Alignment object aln. aln: an xxx.Alignment object, or data that can be used to build one. moltype: cogent.core.moltype.MolType object best_tree: best_tree suppport is currently not implemented params: dict of parameters to pass in to the RAxML app controller. The result will be an xxx.Alignment object, or None if tree fails. """ if best_tree: raise NotImplementedError if '-m' not in params: if moltype == DNA or moltype == RNA: #params["-m"] = 'GTRMIX' # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html) params["-m"] = 'GTRGAMMA' elif moltype == PROTEIN: params["-m"] = 'PROTGAMMAmatrixName' else: raise ValueError("Moltype must be either DNA, RNA, or PROTEIN") if not hasattr(aln, 'toPhylip'): aln = Alignment(aln) seqs, align_map = aln.toPhylip() # generate temp filename for output params["-w"] = "/tmp/" params["-n"] = get_tmp_filename().split("/")[-1] params["-k"] = True params["-p"] = randint(1, 100000) params["-x"] = randint(1, 100000) ih = '_input_as_multiline_string' raxml_app = Raxml(params=params, InputHandler=ih, WorkingDir=None, SuppressStderr=True, SuppressStdout=True) raxml_result = raxml_app(seqs) tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode) for node in tree.tips(): node.Name = align_map[node.Name] raxml_result.cleanUp() return tree
def build_tree_from_alignment(aln, moltype, best_tree=False, params={}): """Returns a tree from Alignment object aln. aln: an xxx.Alignment object, or data that can be used to build one. moltype: cogent.core.moltype.MolType object best_tree: best_tree suppport is currently not implemented params: dict of parameters to pass in to the RAxML app controller. The result will be an xxx.Alignment object, or None if tree fails. """ if best_tree: raise NotImplementedError if '-m' not in params: if moltype == DNA or moltype == RNA: #params["-m"] = 'GTRMIX' # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html) params["-m"] = 'GTRGAMMA' elif moltype == PROTEIN: params["-m"] = 'PROTGAMMAmatrixName' else: raise ValueError("Moltype must be either DNA, RNA, or PROTEIN") if not hasattr(aln, 'toPhylip'): aln = Alignment(aln) seqs, align_map = aln.toPhylip() # generate temp filename for output params["-w"] = "/tmp/" params["-n"] = get_tmp_filename().split("/")[-1] params["-k"] = True params["-p"] = randint(1,100000) params["-x"] = randint(1,100000) ih = '_input_as_multiline_string' raxml_app = Raxml(params=params, InputHandler=ih, WorkingDir=None, SuppressStderr=True, SuppressStdout=True) raxml_result = raxml_app(seqs) tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode) for node in tree.tips(): node.Name = align_map[node.Name] raxml_result.cleanUp() return tree
def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None): """Returns a tree from Alignment object aln with bootstrap support values. aln: an cogent.core.alignment.Alignment object, or data that can be used to build one. seed: an interger, seed value to use num_trees: an integer, number of trees to bootstrap against params: dict of parameters to pass in to the Clustal app controller. The result will be an cogent.core.tree.PhyloNode object, or None if tree fails. If seed is not specifed in params, a random integer between 0-1000 is used. """ # Create instance of controllor, enable bootstrap, disable alignment,tree app = Clustalw(InputHandler='_input_as_multiline_string', params=params, \ WorkingDir='/tmp') app.Parameters['-align'].off() app.Parameters['-tree'].off() if app.Parameters['-bootstrap'].isOff(): if num_trees is None: num_trees = 1000 app.Parameters['-bootstrap'].on(num_trees) if app.Parameters['-seed'].isOff(): if seed is None: seed = randint(0,1000) app.Parameters['-seed'].on(seed) if app.Parameters['-bootlabels'].isOff(): app.Parameters['-bootlabels'].on("node") # Setup mapping. Clustalw clips identifiers. We will need to remap them. seq_collection = SequenceCollection(aln) int_map, int_keys = seq_collection.getIntMap() int_map = SequenceCollection(int_map) # Collect result result = app(int_map.toFasta()) # Build tree tree = DndParser(result['Tree'].read(), constructor=PhyloNode) for node in tree.tips(): node.Name = int_keys[node.Name] # Clean up result.cleanUp() del(seq_collection, app, result, int_map, int_keys) return tree
def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None): """Returns a tree from Alignment object aln with bootstrap support values. aln: an cogent.core.alignment.Alignment object, or data that can be used to build one. seed: an interger, seed value to use num_trees: an integer, number of trees to bootstrap against params: dict of parameters to pass in to the Clustal app controller. The result will be an cogent.core.tree.PhyloNode object, or None if tree fails. If seed is not specifed in params, a random integer between 0-1000 is used. """ # Create instance of controllor, enable bootstrap, disable alignment,tree app = Clustalw(InputHandler='_input_as_multiline_string', params=params, \ WorkingDir='/tmp') app.Parameters['-align'].off() app.Parameters['-tree'].off() if app.Parameters['-bootstrap'].isOff(): if num_trees is None: num_trees = 1000 app.Parameters['-bootstrap'].on(num_trees) if app.Parameters['-seed'].isOff(): if seed is None: seed = randint(0, 1000) app.Parameters['-seed'].on(seed) if app.Parameters['-bootlabels'].isOff(): app.Parameters['-bootlabels'].on("node") # Setup mapping. Clustalw clips identifiers. We will need to remap them. seq_collection = SequenceCollection(aln) int_map, int_keys = seq_collection.getIntMap() int_map = SequenceCollection(int_map) # Collect result result = app(int_map.toFasta()) # Build tree tree = DndParser(result['Tree'].read(), constructor=PhyloNode) for node in tree.tips(): node.Name = int_keys[node.Name] # Clean up result.cleanUp() del (seq_collection, app, result, int_map, int_keys) return tree
def test_get_nearest_named_ancestor(self): """correctly get the nearest named ancestor""" t = DndParser("(((s1,s2)g1,s3))root;") t2 = DndParser("(((s1,s2)g1,s3));") exp_t = t exp_t2 = None obs_t = get_nearest_named_ancestor(t.getNodeMatchingName('s3')) obs_t2 = get_nearest_named_ancestor(t2.getNodeMatchingName('s3')) self.assertEqual(obs_t, exp_t) self.assertEqual(obs_t2, exp_t2)
def test_reroot(self): """Should correctly reroot a tree""" t = DndParser("(((a,b)c,(d,e)f)g,(h,i)j);") tips = ['a','b'] for n in t.traverse(): n.Length = 1.0 # note, g is lost because it has a single descendent and gets pruned off exp = "((a:1.0,b:1.0)c:0.5,((d:1.0,e:1.0)f:1.0,(h:1.0,i:1.0)j:2.0):0.5);" obs = reroot(t, tips) self.assertEqual(obs.getNewick(with_distances=True), exp)
def raxml_alignment(align_obj, raxml_model="GTRCAT", params={}, SuppressStderr=True, SuppressStdout=True): """Run raxml on alignment object align_obj: Alignment object params: you can set any params except -w and -n returns: tuple (phylonode, parsimonyphylonode, log likelihood, total exec time) """ # generate temp filename for output params["-w"] = "/tmp/" params["-n"] = get_tmp_filename().split("/")[-1] params["-m"] = raxml_model params["-p"] = randint(1,100000) ih = '_input_as_multiline_string' seqs, align_map = align_obj.toPhylip() #print params["-n"] # set up command raxml_app = Raxml( params=params, InputHandler=ih, WorkingDir=None, SuppressStderr=SuppressStderr, SuppressStdout=SuppressStdout) # run raxml ra = raxml_app(seqs) # generate tree tree_node = DndParser(ra["Result"]) # generate parsimony tree parsimony_tree_node = DndParser(ra["ParsimonyTree"]) # extract log likelihood from log file log_file = ra["Log"] total_exec_time = exec_time = log_likelihood = 0.0 for line in log_file: exec_time, log_likelihood = map(float, line.split()) total_exec_time += exec_time # remove output files ra.cleanUp() return tree_node, parsimony_tree_node, log_likelihood, total_exec_time
def test_build_tree_from_alignment(self): tree = build_tree_from_alignment(self.seqs, DNA) # test expected output for fasttree 1.1 and 2.0.1 try: for o,e in zip(tree.traverse(), DndParser(exp_tree).traverse()): self.assertEqual(o.Name,e.Name) self.assertFloatEqual(o.Length,e.Length) except AssertionError: for o,e in zip(tree.traverse(), DndParser(exp_tree_201).traverse()): self.assertEqual(o.Name,e.Name) self.assertFloatEqual(o.Length,e.Length)
def assign_tax_labels_to_tree(tree, std): """Puts new tip labels onto tree tree : newick string std : output from shorten_taxonomy_strings """ tree_nodes = DndParser(tree, PhyloNode) for node in tree_nodes.tips(): label = node.Name.strip('\'') #incase there are actual quotes tax = std[label] new_label = str(label) + '_' + tax node.Name = new_label return tree_nodes
def test_data(self): """DndParser should work as expected on real data""" t = DndParser(sample) self.assertEqual( str(t), '((xyz:0.28124,(def:0.24498,mno:0.03627):0.1771):0.0487,abc:0.05925,(ghi:0.06914,jkl:0.13776):0.09853);' ) tdata = DndParser(node_data_sample, unescape_name=True) self.assertEqual( str(tdata), "((xyz:0.28124,(def:0.24498,mno:0.03627)A:0.1771)B:0.0487,abc:0.05925,(ghi:0.06914,jkl:0.13776)C:0.09853);" )
def assign_tax_labels_to_tree(tree,std): """Puts new tip labels onto tree tree : newick string std : output from shorten_taxonomy_strings """ tree_nodes = DndParser(tree, PhyloNode) for node in tree_nodes.tips(): label = node.Name.strip('\'') #incase there are actual quotes tax = std[label] new_label = str(label) + '_' + tax node.Name = new_label return tree_nodes
def test_getsubtree(self): """testing getting a subtree """ otu_names = ['NineBande', 'Mouse', 'HowlerMon', 'DogFaced'] newick = '(((Human,HowlerMon),Mouse),NineBande,DogFaced);' newick_reduced = '((Mouse,HowlerMon),NineBande,DogFaced);' tree = DndParser(newick, constructor = PicrustNode) subtree = tree.getSubTree(otu_names) new_tree = DndParser(newick_reduced, constructor = PicrustNode) # check we get the same names self.assertEqual(*[len(t.Children) for t in (subtree,new_tree)]) self.assertEqual(subtree.getNewick(), new_tree.getNewick())
def remove_taxonomy(tree, regex_string): """Puts new tip labels onto tree tree : LoadTree object regex_string : """ tree_nodes = DndParser(tree, PhyloNode) for node in tree_nodes.tips(): label = node.Name.strip('\'') # incase there are actual quotes p = re.compile(regex_string) new_label = p.sub('', label) #print new_label node.Name = new_label return tree_nodes
def test_gnodedata(self): """DndParser should assign Name to internal nodes correctly""" t = DndParser(nodedata) self.assertEqual(len(t), 2) self.assertEqual(len(t[0]), 0) #first child is terminal self.assertEqual(len(t[1]), 2) #second child has two children self.assertEqual(str(t), '(abc:3.0,(def:4.0,ghi:5.0)jkl:6.0);') info_dict = {} for node in t.traverse(): info_dict[node.Name] = node.Length self.assertEqual(info_dict['abc'], 3.0) self.assertEqual(info_dict['def'], 4.0) self.assertEqual(info_dict['ghi'], 5.0) self.assertEqual(info_dict['jkl'], 6.0)
def test_ascii(self): self.tree.asciiArt() # unlabeled internal node tr = DndParser("(B:0.2,(C:0.3,D:0.4):0.6)F;") tr.asciiArt(show_internal=True, compact=False) tr.asciiArt(show_internal=True, compact=True) tr.asciiArt(show_internal=False, compact=False)
def test_join_nodes(self): """join them nodes! (((99 + 97) + 94) + 91) + ...""" parsed = [make_nodes(self.clst_99, 0.01, 99), make_nodes(self.clst_97, 0.02, 97), make_nodes(self.clst_94, 0.03, 94)] exp = """((((3:.005)99_2_3:.01,(8:.005,7:.005)99_3_8:.01)97_0_3:.015)94_0_3, (((1:.005,6:.005)99_1_1:.01)97_1_1:.015, ((10:.005,20:.005,30:.005)99_0_10:.01)97_2_10:.015)94_1_1);""" expt = DndParser(exp) obs = join_nodes(parsed) self.assertEqual(obs.getNewick(with_distances=True), expt.getNewick(with_distances=True))
def build_tree_from_distance_matrix(matrix, best_tree=False, params={}, working_dir="/tmp"): """Returns a tree from a distance matrix. matrix: a square Dict2D object (cogent.util.dict2d) best_tree: if True (default:False), uses a slower but more accurate algorithm to build the tree. params: dict of parameters to pass in to the Clearcut app controller. The result will be an cogent.core.tree.PhyloNode object, or None if tree fails. """ params["--out"] = get_tmp_filename(working_dir) # Create instance of app controller, enable tree, disable alignment app = Clearcut( InputHandler="_input_as_multiline_string", params=params, WorkingDir=working_dir, SuppressStdout=True, SuppressStderr=True, ) # Turn off input as alignment app.Parameters["-a"].off() # Input is a distance matrix app.Parameters["-d"].on() if best_tree: app.Parameters["-N"].on() # Turn the dict2d object into the expected input format matrix_input, int_keys = _matrix_input_from_dict2d(matrix) # Collect result result = app(matrix_input) # Build tree tree = DndParser(result["Tree"].read(), constructor=PhyloNode) # reassign to original names for node in tree.tips(): node.Name = int_keys[node.Name] # Clean up result.cleanUp() del (app, result, params) return tree
def test_str(self): """RangeNode should round-trip Newick string corrrectly.""" r = RangeNode() self.assertEqual(str(r), '()') #should work for tree with branch lengths set t = DndParser(self.sample_tree_string, RangeNode) expected = self.sample_tree_string.replace('\n', '') expected = expected.replace(' ', '') self.assertEqual(str(t), expected) #self.assertEqual(t.getNewick(with_distances=True), expected) #should also work for tree w/o branch lengths t2 = DndParser(self.sample_string_2, RangeNode) self.assertEqual(str(t2), self.sample_string_2)
def test_shuffle_tipnames(self): """shuffle_tipnames should return copy of tree w/ labels permuted""" #Note: this should never fail but is technically still stochastic #5! is 120 so repeating 5 times should fail about 1 in 10^10. for i in range(5): try: t = DndParser(self.t_str) result = shuffle_tipnames(t) orig_names = [n.Name for n in t.tips()] new_names = [n.Name for n in result.tips()] self.assertIsPermutation(orig_names, new_names) return except AssertionError: continue raise AssertionError("Produced same permutation in 5 tries: broken?")
def test_shuffle_tipnames(self): """shuffle_tipnames should return copy of tree w/ labels permuted""" #Note: this should never fail but is technically still stochastic #5! is 120 so repeating 5 times should fail about 1 in 10^10. for i in range(5): try: t = DndParser(self.t_str) result = shuffle_tipnames(t) orig_names = [n.Name for n in t.tips()] new_names = [n.Name for n in result.tips()] self.assertIsPermutation(orig_names, new_names) return except AssertionError: continue raise AssertionError, "Produced same permutation in 5 tries: broken?"
def load_tree(input, tipname_map, verbose=False): """Returns a PhyloNode tree decorated with helper attrs Helper attrs include Consensus, TipStart and TipStop. Nontips and tips that do not have consensus information will have [None] * len(RANK_ORDER) set as Consensus """ if verbose: print "loading tree..." if isinstance(input, TreeNode): tree = input else: tree = DndParser(input) tips = tree.tips() n_ranks = len(RANK_ORDER) for idx, tip in enumerate(tips): tip.TipStart = idx tip.TipStop = idx tip.Consensus = tipname_map.get(tip.Name, [None] * 7) if verbose and tip.Consensus is None: print "No consensus for %s" % tip.Name for node in tree.postorder(include_self=True): if node.istip(): continue node.TipStart = node.Children[0].TipStart node.TipStop = node.Children[-1].TipStop node.Consensus = [None] * n_ranks if node.Name is None: node.Bootstrap = None else: try: node.Bootstrap = float(node.Name) node.Name = None except: if verbose: print "Could not save bootstrap %s, node is root: %s" % \ (node.Name, str(node.Parent == None)) node.Bootstrap = None for tip in tree.tips(): if tip.Name: tip.Name = tip.Name.replace("'","") return tree
def test_fitch_descendants_missing_data(self): """fitch_descendants should work with missing data""" #tree and envs for testing missing values t_str = '(((a:1,b:2):4,(c:3,d:1):2):1,(e:2,f:1):3);' env_str = """a A b B c D d C e C f D""" t = DndParser(t_str, UniFracTreeNode) node_index, nodes = index_tree(t) env_counts = count_envs(env_str.split('\n')) count_array, unique_envs, env_to_index, node_to_index = \ index_envs(env_counts, node_index) branch_lengths = get_branch_lengths(node_index) #test just the AB pair ab_counts = count_array[:, 0:2] bindings = bind_to_array(nodes, ab_counts) changes = fitch_descendants(bindings, counter=FitchCounter) self.assertEqual(changes, 1) orig_result = ab_counts.copy() #check that the original Fitch counter gives the expected #incorrect parsimony result changes = fitch_descendants(bindings, counter=FitchCounterDense) self.assertEqual(changes, 5) new_result = ab_counts.copy() #check that the two versions fill the array with the same values self.assertEqual(orig_result, new_result)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) # get options tree_fp = opts.input_tree tips_to_keep = opts.tips_to_keep.split(',') scoring_method = opts.scoring_method # load tree tree = DndParser(open(tree_fp, 'U'), constructor=PhyloNode) # decorate measurements onto tree (either by depth or by number of children) if scoring_method == 'depth': tree2 = decorate_depth(tree) elif scoring_method == 'numtips': tree2 = decorate_numtips(tree) # get the nodes for the inserted sequences nodes_dict = get_insert_dict(tree2, set(tips_to_keep)) # remove nodes accordingly final_tree = drop_duplicate_nodes(tree2, nodes_dict) #final_tree.nameUnnamedNodes() # write out the resulting tree open_outpath = open(opts.output_fp, 'w') open_outpath.write(final_tree.getNewick(with_distances=True)) open_outpath.close()
def timing(tree_size, num_trees, num_samples): FastUnifrac_times = list() EMDUnifrac_times = list() EMDUnifrac_flow_times = list() for tree_it in range(num_trees): t = Tree() t.populate(tree_size, random_branches = True) tree_str = t.write(format=1) tr = DndParser(tree_str, UniFracTreeNode) (T,l,nodes_in_order) = EMDU.parse_tree(tree_str) for it in range(num_samples): envs = EMDU.simulate_data(t.get_leaf_names()) # FastUnifrac can only take weight on leaf nodes (envs_prob_dict, samples) = EMDU.parse_envs(envs, nodes_in_order) P = envs_prob_dict[samples[0]] Q = envs_prob_dict[samples[1]] #EMDUnifrac with flow t0 = timeit.default_timer() (Z, Flow, diffab) = EMDU.EMDUnifrac_weighted_flow(T, l, nodes_in_order, P, Q) t1 = timeit.default_timer() EMDUnifrac_flow_times.append(t1-t0) #EMDUnifrac no flow t0 = timeit.default_timer() (Z,diffab) = EMDU.EMDUnifrac_weighted(T, l, nodes_in_order, P, Q) t1 = timeit.default_timer() EMDUnifrac_times.append(t1-t0) #FastUnifrac weighted t0 = timeit.default_timer() res = fast_unifrac(tr, envs, weighted=True, modes=set(['distance_matrix'])) t1 = timeit.default_timer() FastUnifrac_times.append(t1-t0) return (np.array(EMDUnifrac_times).mean(), np.array(EMDUnifrac_flow_times).mean(), np.array(FastUnifrac_times).mean())
def convert_tree_tips(align_map,tree_fp): """ rename the starting tree to correspond to the new phylip names, which are assigned to each sequence """ # flip key value pairs tree_tip_to_seq_name={} for i in align_map: tree_tip_to_seq_name[align_map[i]] = i # change the tip labels to phylip labels open_tree=open(tree_fp) tree=DndParser(open_tree, constructor=PhyloNode) for node in tree.tips(): node.Name = tree_tip_to_seq_name[node.Name] return tree
def test_gonenest(self): """DndParser should work correctly with nested data""" t = DndParser(onenest) self.assertEqual(len(t), 2) self.assertEqual(len(t[0]), 0) #first child is terminal self.assertEqual(len(t[1]), 2) #second child has two children self.assertEqual(str(t), '(abc:3.0,(def:4.0,ghi:5.0):6.0);')
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) output_dir = opts.output_dir create_dir(output_dir) otu_table_fp = opts.otu_table otu_table = load_table(otu_table_fp) tree_fh = open(opts.tree_file, 'U') tree = DndParser(tree_fh) tree_fh.close() mapping_fp = opts.mapping_fp if mapping_fp: mapping_f = open(mapping_fp, 'U') input_map_basename = splitext(split(mapping_fp)[1])[0] else: mapping_f = None input_map_basename = None input_table_basename = splitext(split(otu_table_fp)[1])[0] simsam_range_to_files(otu_table, tree, simulated_sample_sizes=map(int, opts.num.split(',')), dissimilarities=map(float, opts.dissim.split(',')), output_dir=output_dir, mapping_f=mapping_f, output_table_basename=input_table_basename, output_map_basename=input_map_basename)
def test_join_nodes(self): """join them nodes! (((99 + 97) + 94) + 91) + ...""" parsed = [ make_nodes(self.clst_99, 0.01, 99), make_nodes(self.clst_97, 0.02, 97), make_nodes(self.clst_94, 0.03, 94) ] exp = """((((3:.005)99_2_3:.01,(8:.005,7:.005)99_3_8:.01)97_0_3:.015)94_0_3, (((1:.005,6:.005)99_1_1:.01)97_1_1:.015, ((10:.005,20:.005,30:.005)99_0_10:.01)97_2_10:.015)94_1_1);""" expt = DndParser(exp) obs = join_nodes(parsed) self.assertEqual(obs.getNewick(with_distances=True), expt.getNewick(with_distances=True))
def wagner_for_picrust(tree_path, trait_table_path, gain=None, max_paralogs=None, HALT_EXEC=False): '''Runs count application controller given path of tree and trait table and returns a Table''' #initialize Count app controller count = Count(HALT_EXEC=HALT_EXEC) #set the parameters if gain: count.Parameters['-gain'].on(gain) if max_paralogs: count.Parameters['-max_paralogs'].on(max_paralogs) ###Have to manipulate the trait table some. Need to transpose it and strip ids surrounded in quotes. table = LoadTable(filename=trait_table_path, header=True, sep='\t') #get the first column (containing row ids) genome_ids = table.getRawData(table.Header[0]) #remove single quotes from the id if they exist genome_ids = [str(id).strip('\'') for id in genome_ids] #transpose the matrix table = table.transposed(new_column_name=table.Header[0]) #Change the headers table = table.withNewHeader(table.Header[1:], genome_ids) #write the modified table to a tmp file tmp_table_path = get_tmp_filename() table.writeToFile(tmp_table_path, sep='\t') #Run Count here result = count(data=(tree_path, tmp_table_path)) #Remove tmp file remove(tmp_table_path) #tree=LoadTree(tree_path) tree = DndParser(open(tree_path)) #parse the results into a Cogent Table asr_table = parse_wagner_parsimony_output(result["StdOut"].readlines(), remove_num_tips=len(tree.tips())) #transpose the table asr_table = asr_table.transposed(new_column_name='nodes') return asr_table
def test_cache_tipnames(self): """caches tipnames""" t = DndParser("((a,b)c,(d,e)f)g;") cache_tipnames(t) self.assertEqual(t.TipNames, ['a', 'b', 'd', 'e']) self.assertEqual(t.Children[0].TipNames, ['a', 'b']) self.assertEqual(t.Children[1].TipNames, ['d', 'e'])
def build_tree_from_distance_matrix(matrix, best_tree=False, params={},\ working_dir='/tmp'): """Returns a tree from a distance matrix. matrix: a square Dict2D object (cogent.util.dict2d) best_tree: if True (default:False), uses a slower but more accurate algorithm to build the tree. params: dict of parameters to pass in to the Clearcut app controller. The result will be an cogent.core.tree.PhyloNode object, or None if tree fails. """ params['--out'] = get_tmp_filename(working_dir) # Create instance of app controller, enable tree, disable alignment app = Clearcut(InputHandler='_input_as_multiline_string', params=params, \ WorkingDir=working_dir, SuppressStdout=True,\ SuppressStderr=True) #Turn off input as alignment app.Parameters['-a'].off() #Input is a distance matrix app.Parameters['-d'].on() if best_tree: app.Parameters['-N'].on() # Turn the dict2d object into the expected input format matrix_input, int_keys = _matrix_input_from_dict2d(matrix) # Collect result result = app(matrix_input) # Build tree tree = DndParser(result['Tree'].read(), constructor=PhyloNode) # reassign to original names for node in tree.tips(): node.Name = int_keys[node.Name] # Clean up result.cleanUp() del (app, result, params) return tree
def test_gsingle(self): """DndParser should produce a single-child PhyloNode on minimal data""" t = DndParser(single) self.assertEqual(len(t), 1) child = t[0] self.assertEqual(child.Name, 'abc') self.assertEqual(child.Length, 3) self.assertEqual(str(t), '(abc:3.0);')
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): """Returns a tree from Alignment object aln. aln: a cogent.core.alignment.Alignment object, or data that can be used to build one. moltype: cogent.core.moltype.MolType object best_tree: unsupported params: dict of parameters to pass in to the Muscle app controller. The result will be an cogent.core.tree.PhyloNode object, or None if tree fails. """ # Create instance of app controller, enable tree, disable alignment app = Muscle(InputHandler='_input_as_multiline_string', params=params, \ WorkingDir='/tmp') app.Parameters['-cluster'].on() app.Parameters['-tree1'].on(get_tmp_filename(app.WorkingDir)) app.Parameters['-seqtype'].on(moltype.label) seq_collection = SequenceCollection(aln, MolType=moltype) #Create mapping between abbreviated IDs and full IDs int_map, int_keys = seq_collection.getIntMap() #Create SequenceCollection from int_map. int_map = SequenceCollection(int_map,MolType=moltype) # Collect result result = app(int_map.toFasta()) # Build tree tree = DndParser(result['Tree1Out'].read(), constructor=PhyloNode) for tip in tree.tips(): tip.Name = int_keys[tip.Name] # Clean up result.cleanUp() del(seq_collection, app, result) return tree
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): """Returns a tree from Alignment object aln. aln: a cogent.core.alignment.Alignment object, or data that can be used to build one. moltype: cogent.core.moltype.MolType object best_tree: unsupported params: dict of parameters to pass in to the Muscle app controller. The result will be an cogent.core.tree.PhyloNode object, or None if tree fails. """ # Create instance of app controller, enable tree, disable alignment app = Muscle(InputHandler='_input_as_multiline_string', params=params, \ WorkingDir='/tmp') app.Parameters['-clusteronly'].on() app.Parameters['-tree1'].on(get_tmp_filename(app.WorkingDir)) app.Parameters['-seqtype'].on(moltype.label) seq_collection = SequenceCollection(aln, MolType=moltype) #Create mapping between abbreviated IDs and full IDs int_map, int_keys = seq_collection.getIntMap() #Create SequenceCollection from int_map. int_map = SequenceCollection(int_map,MolType=moltype) # Collect result result = app(int_map.toFasta()) # Build tree tree = DndParser(result['Tree1Out'].read(), constructor=PhyloNode) for tip in tree.tips(): tip.Name = int_keys[tip.Name] # Clean up result.cleanUp() del(seq_collection, app, result) return tree
def test_decorate_ntips(self): """correctly decorate the tree with the NumTips param""" input = "(((a,b)c,(d,e,f)g)h,(i,j)k)l;" tree = DndParser(input) tips = dict([(tip.Name, tip) for tip in tree.tips()]) tips['a'].Consensus = [1,2,3,4,5,6,7] tips['b'].Consensus = [None,None,None,5,None,None,None] tips['d'].Consensus = [1,2,3,4,5,6,8] tips['e'].Consensus = [None, None,None,None,None,None,None] tips['f'].Consensus = [1,2,3,4,5,6,8] tips['i'].Consensus = [1,2,3,4,5,6,8] tips['j'].Consensus = [1,2,3,4,5,6,8] decorate_ntips(tree) self.assertEqual(tree.NumTips, 6) self.assertEqual(tree.Children[0].NumTips, 4) self.assertEqual(tree.Children[1].NumTips, 2) self.assertEqual(tree.Children[0].Children[0].NumTips, 2) self.assertEqual(tree.Children[0].Children[1].NumTips, 2)
def setUp(self): self.SimpleTree = \ DndParser("((A:0.02,B:0.01)E:0.05,(C:0.01,D:0.01)F:0.05)root;") self.SimpleTreeWithSpaces = \ DndParser("((E coli:0.02,S typhimurium :0.01)Gamma proteobacteria:0.05,(C\t:0.01,D:0.01)F:0.05)root;") self.SimplePolytomyTree = \ DndParser("((A:0.02,B:0.01,B_prime:0.03)E:0.05,(C:0.01,D:0.01)F:0.05)root;") self.SimpleUnlabelledTree = \ DndParser("((A:0.02,B:0.01):0.05,(C:0.01,D:0.01):0.05)root;") #First number is GG id, the second is IMG self.GreengenesToIMG = \ [('469810','645058788'),\ ('457471','645058789'),\ ('266998','641736109')]
def cluster_seqs(seqs, neighbor_join=False, params={}, add_seq_names=True, WorkingDir=None, SuppressStderr=None, SuppressStdout=None, max_chars=1000000, max_hours=1.0, constructor=PhyloNode, clean_up=True ): """Muscle cluster list of sequences. seqs: either file name or list of sequence objects or list of strings or single multiline string containing sequences. Addl docs coming soon """ num_seqs = len(seqs) if num_seqs < 2: raise ValueError("Muscle requres 2 or more sequences to cluster.") num_chars = sum(map(len, seqs)) if num_chars > max_chars: params["-maxiters"] = 2 params["-diags1"] = True params["-sv"] = True #params["-distance1"] = "kmer6_6" #params["-distance1"] = "kmer20_3" #params["-distance1"] = "kbit20_3" print("lots of chars, using fast align", num_chars) params["-maxhours"] = max_hours #params["-maxiters"] = 10 #cluster_type = "upgmb" #if neighbor_join: # cluster_type = "neighborjoining" params["-cluster"] = True params["-tree1"] = get_tmp_filename(WorkingDir) muscle_res = muscle_seqs(seqs, params=params, add_seq_names=add_seq_names, WorkingDir=WorkingDir, SuppressStderr=SuppressStderr, SuppressStdout=SuppressStdout) tree = DndParser(muscle_res["Tree1Out"], constructor=constructor) if clean_up: muscle_res.cleanUp() return tree
def check_tree_exact_match(fasta_labels, tree_fp): """Checks fasta labels to exact match to tree tips Returns a list of two lists, the fasta labels not in tips, and tips not in fasta labels. fasta_labels: list of fasta labels tree_fp: tree filepath """ # Need to get modified fasta labels with underscore stripped raw_fasta_labels = set([label.split('_')[0] for label in fasta_labels]) tree_f = open(tree_fp, "U") tree = DndParser(tree_f) # Get a set of tree tip names tree_tips = set(tree.getTipNames()) labels_not_in_tips = [] for curr_label in raw_fasta_labels: if curr_label not in tree_tips: labels_not_in_tips.append(curr_label) # Return True if all found in tree tips if len(labels_not_in_tips) == 0: labels_not_in_tips = True tips_not_in_labels = [] for curr_tip in tree_tips: if curr_tip not in raw_fasta_labels: tips_not_in_labels.append(curr_tip) if len(tips_not_in_labels) == 0: tips_not_in_labels = True return [labels_not_in_tips, tips_not_in_labels]
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) input_tree_fp = opts.input_tree_fp tips_fp = opts.tips_fp fasta_fp = opts.fasta_fp output_tree_fp = opts.output_tree_fp if tips_fp != None: tips_to_keep = get_seqs_to_keep_lookup_from_seq_id_file(open(tips_fp,'U')) elif fasta_fp != None: tips_to_keep = get_seqs_to_keep_lookup_from_fasta_file(open(fasta_fp,'U')) else: option_parser.error("Must provide either -t or -f.") tree = DndParser(open(input_tree_fp,'U')) if opts.negate: tips_to_keep = negate_tips_to_keep(tips_to_keep, tree) tree_out = tree.getSubTree(tips_to_keep) tree_out.writeToFile(output_tree_fp)
def setUp(self): """Make some standard objects to test.""" #Notes on sample string: # #1. trailing zeros are stripped in conversion to/from float, so result # is only exactly the same without them. # #2. trailing chars (e.g. semicolon) are not recaptured in the output, # so were deleted from original Newick-format string. # #3. whitespace is stripped, but is handy for formatting, so is stripped # from original string before comparisons. self.sample_tree_string = """ ( ( xyz:0.28124, ( def:0.24498, mno:0.03627) A:0.1771) B:0.0487, abc:0.05925, ( ghi:0.06914, jkl:0.13776) C:0.09853) """ self.t = DndParser(self.sample_tree_string, RangeNode) self.i = self.t.indexByAttr('Name') self.sample_string_2 = '((((a,b),c),(d,e)),((f,g),h))' self.t2 = DndParser(self.sample_string_2, RangeNode) self.i2 = self.t2.indexByAttr('Name') self.sample_string_3 = '(((a,b),c),(d,e))' self.t3 = DndParser(self.sample_string_3, RangeNode)
def setUp(self): """Define a couple of standard trees""" self.t1 = DndParser("(((a,b),c),(d,e))", UniFracTreeNode) self.t2 = DndParser("(((a,b),(c,d)),(e,f))", UniFracTreeNode) self.t3 = DndParser("(((a,b,c),(d)),(e,f))", UniFracTreeNode) self.t4 = DndParser("((c)b,((f,g,h)e,i)d)", UniFracTreeNode) self.t4.Name = "a" self.t_str = "((a:1,b:2):4,(c:3,(d:1,e:1):2):3)" self.t = DndParser(self.t_str, UniFracTreeNode) self.env_str = """ a A 1 a C 2 b A 1 b B 1 c B 1 d B 3 e C 1""" self.env_counts = count_envs(self.env_str.splitlines()) self.node_index, self.nodes = index_tree(self.t) self.count_array, self.unique_envs, self.env_to_index, self.node_to_index = index_envs( self.env_counts, self.node_index ) self.branch_lengths = get_branch_lengths(self.node_index) self.old_t_str = "((org1:0.11,org2:0.22,(org3:0.12,org4:0.23)g:0.33)b:0.2,(org5:0.44,org6:0.55)c:0.3,org7:0.4)" self.old_t = DndParser(self.old_t_str, UniFracTreeNode) self.old_env_str = """ org1 env1 1 org1 env2 1 org2 env2 1 org3 env2 1 org4 env3 1 org5 env1 1 org6 env1 1 org7 env3 1 """ self.old_env_counts = count_envs(self.old_env_str.splitlines()) self.old_node_index, self.old_nodes = index_tree(self.old_t) self.old_count_array, self.old_unique_envs, self.old_env_to_index, self.old_node_to_index = index_envs( self.old_env_counts, self.old_node_index ) self.old_branch_lengths = get_branch_lengths(self.old_node_index)