def sort_order(records):
    """Returns the sort order by id"""
    tree = DndParser("(((nosp,sp)named,notnamed)inpref,"
                     "((nosp,sp)named,notnamed)outpref);")
    for n in tree.tips():
        n.LengthsAndIds = []

    lookup = {}
    lookup[('named_isolate', True, True)] = \
        tree.Children[0].Children[0].Children[0]
    lookup[('named_isolate', True, False)] = \
        tree.Children[0].Children[0].Children[1]
    lookup[('clone', True, False)] = \
        tree.Children[0].Children[1]
    lookup[('named_isolate', False, True)] = \
        tree.Children[1].Children[0].Children[0]
    lookup[('named_isolate', False, False)] = \
        tree.Children[1].Children[0].Children[1]
    lookup[('clone', False, False)] = \
        tree.Children[1].Children[1]

    for k, v in records.items():
        to_lookup = tuple(v[1:])
        lookup[to_lookup].LengthsAndIds.append((v[0], k))

    order = []
    # tips go left->right
    for n in tree.tips():
        order.extend([i for l, i in sorted(n.LengthsAndIds)[::-1]])

    return order
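# A minimal usage sketch for sort_order (not part of the original source). The
# record layout (length, type, in_preference_set, has_species_name) is inferred
# from the lookup keys above and should be treated as an assumption; the ids
# and lengths below are invented.
def _example_sort_order():
    records = {
        'seqA': (1400, 'named_isolate', True, True),
        'seqB': (900, 'named_isolate', True, False),
        'seqC': (1250, 'clone', True, False),
        'seqD': (1500, 'named_isolate', False, True),
    }
    # Longer records sort first within each category; categories follow the
    # left-to-right tip order of the preference tree built in sort_order.
    return sort_order(records)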
def load_tree(input, tipname_map, verbose=False):
    """Returns a PhyloNode tree decorated with helper attrs

    Helper attrs include Consensus, TipStart and TipStop. Nontips and tips
    that do not have consensus information will have [None] * len(RANK_ORDER)
    set as Consensus
    """
    if verbose:
        print "loading tree..."

    if isinstance(input, TreeNode):
        tree = input
    else:
        tree = DndParser(input)

    tips = tree.tips()
    n_ranks = len(RANK_ORDER)

    for idx, tip in enumerate(tips):
        tip.TipStart = idx
        tip.TipStop = idx
        tip.Consensus = tipname_map.get(tip.Name, [None] * n_ranks)

        if verbose and tip.Name not in tipname_map:
            print "No consensus for %s" % tip.Name

    for node in tree.postorder(include_self=True):
        if node.istip():
            continue
        node.TipStart = node.Children[0].TipStart
        node.TipStop = node.Children[-1].TipStop
        node.Consensus = [None] * n_ranks

        if node.Name is None:
            node.Bootstrap = None
        else:
            try:
                node.Bootstrap = float(node.Name)
                node.Name = None
            except ValueError:
                if verbose:
                    print "Could not save bootstrap %s, node is root: %s" % \
                        (node.Name, str(node.Parent is None))
                node.Bootstrap = None

    for tip in tree.tips():
        if tip.Name:
            tip.Name = tip.Name.replace("'", "")

    return tree
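# A small usage sketch for load_tree (not part of the original source). It
# assumes RANK_ORDER is the module-level list of rank names used above (seven
# ranks here) and that tipname_map values are per-rank consensus lists; the
# tip names and lineages are invented.
def _example_load_tree():
    tipname_map = {
        'tipA': ['k__Bacteria', 'p__Firmicutes', None, None, None, None, None],
        'tipB': ['k__Bacteria', 'p__Proteobacteria', None, None, None, None, None],
    }
    tree = load_tree('((tipA:0.1,tipB:0.2)0.9:0.3,tipC:0.4);', tipname_map)
    internal = tree.Children[0]
    # The internal label 0.9 is stored as Bootstrap; tipC, absent from
    # tipname_map, gets one None per rank as its Consensus.
    return internal.Bootstrap, tree.getNodeMatchingName('tipC').Consensus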
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: if True (default: False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clustal app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if tree
    fails.
    """
    # Set params to empty dict if None, and fill in the sequence type before
    # the app controller is created so the setting actually takes effect.
    if params is None:
        params = {}

    if moltype == DNA or moltype == RNA:
        params["-type"] = "d"
    elif moltype == PROTEIN:
        params["-type"] = "p"
    else:
        raise ValueError("moltype must be DNA, RNA, or PROTEIN")

    # Create instance of app controller, enable tree, disable alignment
    app = Clustalw(InputHandler="_input_as_multiline_string", params=params,
                   WorkingDir="/tmp")
    app.Parameters["-align"].off()

    # best_tree -> bootstrap
    if best_tree:
        if "-bootstrap" not in params:
            app.Parameters["-bootstrap"].on(1000)
        if "-seed" not in params:
            app.Parameters["-seed"].on(randint(0, 1000))
        if "-bootlabels" not in params:
            app.Parameters["-bootlabels"].on("nodes")
    else:
        app.Parameters["-tree"].on()

    # Setup mapping. Clustalw clips identifiers. We will need to remap them.
    seq_collection = SequenceCollection(aln)
    int_map, int_keys = seq_collection.getIntMap()
    int_map = SequenceCollection(int_map)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result["Tree"].read(), constructor=PhyloNode)
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del (seq_collection, app, result, int_map, int_keys)

    return tree
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from alignment

    Will check MolType of aln object
    """
    if params is None:
        params = {}

    if moltype == DNA or moltype == RNA:
        params['-nt'] = True
    elif moltype == PROTEIN:
        params['-nt'] = False
    else:
        raise ValueError(
            "FastTree does not support moltype: %s" % moltype.label)

    if best_tree:
        params['-slow'] = True

    # Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = aln.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    app = FastTree(params=params)
    result = app(int_map.toFasta())
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)

    # remap tip names
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    return tree
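# A usage sketch for the FastTree-based build_tree_from_alignment above (not
# part of the original source). It assumes the fasttree binary is on the PATH
# and that PyCogent's Alignment and DNA MolType are importable; the aligned
# sequences are invented.
def _example_fasttree_tree():
    from cogent.core.moltype import DNA
    from cogent.core.alignment import Alignment

    aln = Alignment({'seq1': 'ACGGTGGAACGTACGCA',
                     'seq2': 'ACGGTGGAACGTACGCT',
                     'seq3': 'ACGGTCGAACGTACGCA'}, MolType=DNA)
    tree = build_tree_from_alignment(aln, DNA)
    return tree.getNewick(with_distances=True)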
def test_score_tree(self):
    """Determines the tree's fmeasure score"""
    # set RankNames and RankNameScores
    # if name in RankNames, check score, look at tips, etc
    t_str = "(((a,b),(c,d))e,(f,g),h)i;"
    t = DndParser(t_str)
    t.RankNames = ['i', None, None, None]  # 1.0 * 6
    t.RankNameScores = [1.0, None, None, None]
    t.Children[0].RankNames = [None, 'e', 'foo', None]  # 0.5 * 3, 0.6 * 3
    t.Children[0].RankNameScores = [None, 0.5, 0.6, None]
    t.Children[0].Children[0].RankNames = [None] * 7
    t.Children[0].Children[1].RankNames = [None] * 7
    t.Children[1].RankNames = [None] * 7
    t.Children[1].RankNameScores = [None] * 7
    tips = t.tips()
    tips[0].Consensus = [None] * 7
    tips[1].Consensus = [1, 3, None, None]
    tips[2].Consensus = [2, 4, 5, None]
    tips[3].Consensus = [None, 1, None, None]
    tips[4].Consensus = [None, 1, None, None]
    tips[5].Consensus = [2, None, 3, None]
    tips[6].Consensus = [None, 4, None, None]
    decorate_ntips(t)
    exp = ((1.0 * 6) + (0.5 * 3) + (0.6 * 3)) / (6 + 3 + 3)
    obs = score_tree(t)
    self.assertEqual(obs, exp)
def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None):
    """Returns a tree from Alignment object aln with bootstrap support values.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    seed: an integer, seed value to use

    num_trees: an integer, number of trees to bootstrap against

    params: dict of parameters to pass in to the Clustal app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if tree
    fails.

    If seed is not specified in params, a random integer between 0-1000 is
    used.
    """
    # Create instance of app controller, enable bootstrap, disable alignment
    # and tree
    app = Clustalw(InputHandler='_input_as_multiline_string', params=params,
                   WorkingDir='/tmp')
    app.Parameters['-align'].off()
    app.Parameters['-tree'].off()

    if app.Parameters['-bootstrap'].isOff():
        if num_trees is None:
            num_trees = 1000
        app.Parameters['-bootstrap'].on(num_trees)

    if app.Parameters['-seed'].isOff():
        if seed is None:
            seed = randint(0, 1000)
        app.Parameters['-seed'].on(seed)

    if app.Parameters['-bootlabels'].isOff():
        app.Parameters['-bootlabels'].on("node")

    # Setup mapping. Clustalw clips identifiers. We will need to remap them.
    seq_collection = SequenceCollection(aln)
    int_map, int_keys = seq_collection.getIntMap()
    int_map = SequenceCollection(int_map)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del (seq_collection, app, result, int_map, int_keys)

    return tree
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: best_tree support is currently not implemented

    params: dict of parameters to pass in to the RAxML app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if tree
    fails.
    """
    if best_tree:
        raise NotImplementedError

    if params is None:
        params = {}

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            #params["-m"] = 'GTRMIX'
            # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT
            # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html)
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError("Moltype must be either DNA, RNA, or PROTEIN")

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    params["-p"] = randint(1, 100000)
    params["-x"] = randint(1, 100000)

    ih = '_input_as_multiline_string'

    raxml_app = Raxml(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)

    raxml_result = raxml_app(seqs)

    tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

    for node in tree.tips():
        node.Name = align_map[node.Name]

    raxml_result.cleanUp()

    return tree
def assign_tax_labels_to_tree(tree, std):
    """Puts new tip labels onto tree

    tree : newick string
    std : output from shorten_taxonomy_strings
    """
    tree_nodes = DndParser(tree, PhyloNode)

    for node in tree_nodes.tips():
        label = node.Name.strip('\'')  # in case there are actual quotes
        tax = std[label]
        new_label = str(label) + '_' + tax
        node.Name = new_label

    return tree_nodes
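# A usage sketch for assign_tax_labels_to_tree (not part of the original
# source). std is assumed to map each tip name to its already-shortened
# taxonomy string; the tree and taxonomy values are invented.
def _example_assign_tax_labels():
    std = {'tipA': 'Firmicutes', 'tipB': 'Proteobacteria'}
    relabeled = assign_tax_labels_to_tree('(tipA:0.1,tipB:0.2);', std)
    # tip names become e.g. 'tipA_Firmicutes'
    return relabeled.getNewick(with_distances=True)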
def remove_taxonomy(tree, regex_string):
    """Removes the taxonomy portion of each tip label

    tree : newick string
    regex_string : regular expression matching the portion of each tip
        label to remove
    """
    tree_nodes = DndParser(tree, PhyloNode)
    p = re.compile(regex_string)

    for node in tree_nodes.tips():
        label = node.Name.strip('\'')  # in case there are actual quotes
        new_label = p.sub('', label)
        node.Name = new_label

    return tree_nodes
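# A usage sketch for remove_taxonomy (not part of the original source): strips
# a hypothetical '_<taxon>' suffix from each tip label; the tree string and
# regex are invented.
def _example_remove_taxonomy():
    tree_str = '(OTU1_Firmicutes:0.1,OTU2_Proteobacteria:0.2);'
    cleaned = remove_taxonomy(tree_str, '_[A-Za-z]+$')
    return [tip.Name for tip in cleaned.tips()]  # ['OTU1', 'OTU2']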
def build_tree_from_distance_matrix(matrix, best_tree=False, params=None,
                                    working_dir="/tmp"):
    """Returns a tree from a distance matrix.

    matrix: a square Dict2D object (cogent.util.dict2d)

    best_tree: if True (default: False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if tree
    fails.
    """
    if params is None:
        params = {}
    params["--out"] = get_tmp_filename(working_dir)

    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(InputHandler="_input_as_multiline_string",
                   params=params,
                   WorkingDir=working_dir,
                   SuppressStdout=True,
                   SuppressStderr=True)
    # Turn off input as alignment
    app.Parameters["-a"].off()
    # Input is a distance matrix
    app.Parameters["-d"].on()

    if best_tree:
        app.Parameters["-N"].on()

    # Turn the dict2d object into the expected input format
    matrix_input, int_keys = _matrix_input_from_dict2d(matrix)

    # Collect result
    result = app(matrix_input)

    # Build tree
    tree = DndParser(result["Tree"].read(), constructor=PhyloNode)

    # reassign to original names
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del (app, result, params)

    return tree
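# A usage sketch for build_tree_from_distance_matrix (not part of the original
# source). It assumes the clearcut binary is installed and that Dict2D accepts
# RowOrder/ColOrder keyword arguments as shown; the distances are invented.
def _example_clearcut_from_distances():
    from cogent.util.dict2d import Dict2D

    dists = {'a': {'a': 0.0, 'b': 0.3, 'c': 0.6},
             'b': {'a': 0.3, 'b': 0.0, 'c': 0.5},
             'c': {'a': 0.6, 'b': 0.5, 'c': 0.0}}
    matrix = Dict2D(dists, RowOrder=['a', 'b', 'c'], ColOrder=['a', 'b', 'c'])
    tree = build_tree_from_distance_matrix(matrix)
    return tree.getNewick(with_distances=True)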
def test_shuffle_tipnames(self):
    """shuffle_tipnames should return copy of tree w/ labels permuted"""
    # Note: this should never fail but is technically still stochastic
    # 5! is 120 so repeating 5 times should fail about 1 in 10^10.
    for i in range(5):
        try:
            t = DndParser(self.t_str)
            result = shuffle_tipnames(t)
            orig_names = [n.Name for n in t.tips()]
            new_names = [n.Name for n in result.tips()]
            self.assertIsPermutation(orig_names, new_names)
            return
        except AssertionError:
            continue
    raise AssertionError("Produced same permutation in 5 tries: broken?")
def convert_tree_tips(align_map, tree_fp):
    """Rename the starting tree tips to correspond to the new phylip names,
    which are assigned to each sequence
    """
    # flip key/value pairs so we can look up the phylip name by sequence name
    tree_tip_to_seq_name = {}
    for i in align_map:
        tree_tip_to_seq_name[align_map[i]] = i

    # change the tip labels to phylip labels
    open_tree = open(tree_fp)
    tree = DndParser(open_tree, constructor=PhyloNode)
    open_tree.close()
    for node in tree.tips():
        node.Name = tree_tip_to_seq_name[node.Name]

    return tree
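# A usage sketch for convert_tree_tips (not part of the original source).
# align_map is assumed to map phylip-safe names to original names, as returned
# by toPhylip(); the file path and names below are invented.
def _example_convert_tree_tips():
    align_map = {'seq0000001': 'OTU_1', 'seq0000002': 'OTU_2'}
    tree_fp = '/tmp/example_start_tree.txt'
    with open(tree_fp, 'w') as f:
        f.write('(OTU_1:0.1,OTU_2:0.2);')
    renamed = convert_tree_tips(align_map, tree_fp)
    return renamed.getNewick(with_distances=True)  # tips carry phylip names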
def wagner_for_picrust(tree_path, trait_table_path, gain=None,
                       max_paralogs=None, HALT_EXEC=False):
    '''Runs count application controller given path of tree and trait table
    and returns a Table'''
    # initialize Count app controller
    count = Count(HALT_EXEC=HALT_EXEC)

    # set the parameters
    if gain:
        count.Parameters['-gain'].on(gain)
    if max_paralogs:
        count.Parameters['-max_paralogs'].on(max_paralogs)

    # Have to manipulate the trait table some. Need to transpose it and strip
    # ids surrounded in quotes.
    table = LoadTable(filename=trait_table_path, header=True, sep='\t')
    # get the first column (containing row ids)
    genome_ids = table.getRawData(table.Header[0])
    # remove single quotes from the id if they exist
    genome_ids = [str(id).strip('\'') for id in genome_ids]
    # transpose the matrix
    table = table.transposed(new_column_name=table.Header[0])
    # Change the headers
    table = table.withNewHeader(table.Header[1:], genome_ids)
    # write the modified table to a tmp file
    tmp_table_path = get_tmp_filename()
    table.writeToFile(tmp_table_path, sep='\t')

    # Run Count here
    result = count(data=(tree_path, tmp_table_path))

    # Remove tmp file
    remove(tmp_table_path)

    #tree = LoadTree(tree_path)
    tree = DndParser(open(tree_path))

    # parse the results into a Cogent Table
    asr_table = parse_wagner_parsimony_output(result["StdOut"].readlines(),
                                              remove_num_tips=len(tree.tips()))

    # transpose the table
    asr_table = asr_table.transposed(new_column_name='nodes')

    return asr_table
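# A usage sketch for wagner_for_picrust (not part of the original source). It
# assumes the Count package is installed and that the tree and trait table
# files already exist; the paths and the gain value are placeholders.
def _example_wagner_asr():
    asr_table = wagner_for_picrust('reference.newick', 'trait_table.tab',
                                   gain=1)
    # asr_table is a cogent Table of Wagner parsimony ancestral state counts
    return asr_table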
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: unsupported

    params: dict of parameters to pass in to the Muscle app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if tree
    fails.
    """
    # Create instance of app controller, enable tree, disable alignment
    app = Muscle(InputHandler='_input_as_multiline_string', params=params,
                 WorkingDir='/tmp')

    app.Parameters['-cluster'].on()
    app.Parameters['-tree1'].on(get_tmp_filename(app.WorkingDir))
    app.Parameters['-seqtype'].on(moltype.label)

    seq_collection = SequenceCollection(aln, MolType=moltype)

    # Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree1Out'].read(), constructor=PhyloNode)
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    # Clean up
    result.cleanUp()
    del (seq_collection, app, result)

    return tree
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: unsupported

    params: dict of parameters to pass in to the Muscle app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if tree
    fails.
    """
    # Create instance of app controller, enable tree, disable alignment
    app = Muscle(InputHandler='_input_as_multiline_string', params=params,
                 WorkingDir='/tmp')

    app.Parameters['-clusteronly'].on()
    app.Parameters['-tree1'].on(get_tmp_filename(app.WorkingDir))
    app.Parameters['-seqtype'].on(moltype.label)

    seq_collection = SequenceCollection(aln, MolType=moltype)

    # Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree1Out'].read(), constructor=PhyloNode)
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    # Clean up
    result.cleanUp()
    del (seq_collection, app, result)

    return tree
def test_decorate_ntips(self):
    """correctly decorate the tree with the NumTips param"""
    input = "(((a,b)c,(d,e,f)g)h,(i,j)k)l;"
    tree = DndParser(input)
    tips = dict([(tip.Name, tip) for tip in tree.tips()])
    tips['a'].Consensus = [1, 2, 3, 4, 5, 6, 7]
    tips['b'].Consensus = [None, None, None, 5, None, None, None]
    tips['d'].Consensus = [1, 2, 3, 4, 5, 6, 8]
    tips['e'].Consensus = [None, None, None, None, None, None, None]
    tips['f'].Consensus = [1, 2, 3, 4, 5, 6, 8]
    tips['i'].Consensus = [1, 2, 3, 4, 5, 6, 8]
    tips['j'].Consensus = [1, 2, 3, 4, 5, 6, 8]
    decorate_ntips(tree)
    self.assertEqual(tree.NumTips, 6)
    self.assertEqual(tree.Children[0].NumTips, 4)
    self.assertEqual(tree.Children[1].NumTips, 2)
    self.assertEqual(tree.Children[0].Children[0].NumTips, 2)
    self.assertEqual(tree.Children[0].Children[1].NumTips, 2)
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None,
                              working_dir='/tmp'):
    """Returns a tree from Alignment object aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one.
        - Clearcut only accepts aligned sequences.  Alignment object used to
        handle unaligned sequences.

    moltype: a cogent.core.moltype object.
        - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8
        gives incorrect results if RNA is passed in.  'U' is treated as an
        incorrect character and is excluded from distance calculations.

    best_tree: if True (default: False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if tree
    fails.
    """
    if params is None:
        params = {}
    params['--out'] = get_tmp_filename(working_dir)

    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(InputHandler='_input_as_multiline_string', params=params,
                   WorkingDir=working_dir, SuppressStdout=True,
                   SuppressStderr=True)
    # Input is an alignment
    app.Parameters['-a'].on()
    # Turn off input as distance matrix
    app.Parameters['-d'].off()

    # If moltype = RNA, we must convert to DNA.
    if moltype == RNA:
        moltype = DNA

    if best_tree:
        app.Parameters['-N'].on()

    # Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    # Setup mapping. Clearcut clips identifiers. We will need to remap them.
    # Clearcut only accepts aligned sequences. Let Alignment object handle
    # unaligned sequences.
    seq_aln = Alignment(aln, MolType=moltype)
    # get int mapping
    int_map, int_keys = seq_aln.getIntMap()
    # create new Alignment object with int_map
    int_map = Alignment(int_map)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del (seq_aln, app, result, int_map, int_keys, params)

    return tree
def test_unifrac_make_subtree(self): """unifrac result should not depend on make_subtree environment M contains only tips not in tree, tip j, k is in no envs one clade is missing entirely values were calculated by hand we also test that we still have a valid tree at the end """ t1 = DndParser('((a:1,b:2):4,((c:3, (j:1,k:2)mt:17),(d:1,e:1):2):3)',\ UniFracTreeNode) # note c,j is len 0 node # /-------- /-a # ---------| \-b # | /-------- /-c # \--------| \mt------ /-j # | \-k # \-------- /-d # \-e # env_str = """ a A 1 a C 2 b A 1 b B 1 c B 1 d B 3 e C 1 m M 88""" env_counts = count_envs(env_str.splitlines()) self.assertFloatEqual(fast_unifrac(t1,env_counts,make_subtree=False)['distance_matrix'], \ (array( [[0,10/16, 8/13], [10/16,0,8/17], [8/13,8/17,0]]),['A','B','C'])) self.assertFloatEqual(fast_unifrac(t1,env_counts,make_subtree=True)['distance_matrix'], \ (array( [[0,10/16, 8/13], [10/16,0,8/17], [8/13,8/17,0]]),['A','B','C'])) # changing tree topology relative to c,j tips shouldn't change anything t2 = DndParser('((a:1,b:2):4,((c:2, (j:1,k:2)mt:17):1,(d:1,e:1):2):3)', \ UniFracTreeNode) self.assertFloatEqual(fast_unifrac(t2,env_counts,make_subtree=False)['distance_matrix'], \ (array( [[0,10/16, 8/13], [10/16,0,8/17], [8/13,8/17,0]]),['A','B','C'])) self.assertFloatEqual(fast_unifrac(t2,env_counts,make_subtree=True)['distance_matrix'], \ (array( [[0,10/16, 8/13], [10/16,0,8/17], [8/13,8/17,0]]),['A','B','C'])) # ensure we haven't meaningfully changed the tree # by passing it to unifrac t3 = DndParser('((a:1,b:2):4,((c:3, (j:1,k:2)mt:17),(d:1,e:1):2):3)',\ UniFracTreeNode) # note c,j is len 0 node t1_tips = [tip.Name for tip in t1.tips()] t1_tips.sort() t3_tips = [tip.Name for tip in t3.tips()] t3_tips.sort() self.assertEqual(t1_tips, t3_tips) tipj3 = t3.getNodeMatchingName('j') tipb3 = t3.getNodeMatchingName('b') tipj1 = t1.getNodeMatchingName('j') tipb1 = t1.getNodeMatchingName('b') self.assertFloatEqual(tipj1.distance(tipb1), tipj3.distance(tipb3))
def main(): usage = "%prog [options] tree_to_midpoint_reroot" opt_parser = OptionParser(usage=usage) (options, args) = opt_parser.parse_args() if len(args) != 1: opt_parser.error('Incorrect number of arguments') if not os.path.exists(args[0]): opt_parser.error('Tree file %s not found' % args[0]) f = open(args[0]) tree_string = f.read() f.close() unrooted_tree = DndParser(tree_string, PhyloNode) breadth_first_visit_order, visit_order_of_node, branch_length_of, \ child_visit_orders_of, num_nodes \ = get_breadth_first_visit_order(unrooted_tree) # We will refer to the node objects by their index in the visit order tip_node_objects = unrooted_tree.tips() num_tips = len(tip_node_objects) # We will refer to the tip objects by their index in the tip_node_objects # list distance_from_node_to_tip = numpy.zeros((num_nodes, num_tips)) stepping_stone_from_node_to_tip = numpy.zeros((num_nodes, num_tips)) tips_connected_to_node = {} for node in xrange(num_nodes): tips_connected_to_node[node] = set() for tip in xrange(num_tips): distance_from_node_to_tip[node,tip] = -1.0 stepping_stone_from_node_to_tip[node,tip] = -1 for tip in xrange(num_tips): tip_as_node = visit_order_of_node[tip_node_objects[tip]] distance_from_node_to_tip[tip_as_node,tip] = 0.0 stepping_stone_from_node_to_tip[tip_as_node,tip] = tip_as_node tips_connected_to_node[tip_as_node].add(tip) for parent in reversed(xrange(num_nodes)): for child in child_visit_orders_of[parent]: child_to_parent_distance = branch_length_of[child] for tip in tips_connected_to_node[child]: tip_distance_to_child = distance_from_node_to_tip[child, tip] tip_to_parent_distance_through_child \ = tip_distance_to_child + child_to_parent_distance if tip in tips_connected_to_node[parent]: tip_distance_to_parent = distance_from_node_to_tip[parent, tip] if tip_to_parent_distance_through_child < tip_distance_to_parent: distance_from_node_to_tip[parent, tip] \ = tip_to_parent_distance_through_child stepping_stone_from_node_to_tip[parent, tip] = child else: distance_from_node_to_tip[parent, tip] \ = tip_to_parent_distance_through_child stepping_stone_from_node_to_tip[parent, tip] = child tips_connected_to_node[parent].add(tip) for parent in xrange(num_nodes): for child in child_visit_orders_of[parent]: child_to_parent_distance = branch_length_of[child] for tip in tips_connected_to_node[parent]: tip_distance_to_parent = distance_from_node_to_tip[parent, tip] tip_to_child_distance_through_parent \ = tip_distance_to_parent + child_to_parent_distance if tip in tips_connected_to_node[child]: tip_distance_to_child = distance_from_node_to_tip[child, tip] if tip_to_child_distance_through_parent < tip_distance_to_child: distance_from_node_to_tip[child, tip] \ = tip_to_child_distance_through_parent stepping_stone_from_node_to_tip[child, tip] = parent else: distance_from_node_to_tip[child, tip] \ = tip_to_child_distance_through_parent stepping_stone_from_node_to_tip[child, tip] = parent tips_connected_to_node[child].add(tip) max_distance = 0.0 max_tip0 = None max_tip1 = None for tip0 in xrange(num_tips): tip0_as_node = visit_order_of_node[tip_node_objects[tip0]] for tip1 in xrange(num_tips): tip0_tip1_distance = distance_from_node_to_tip[tip0_as_node,tip1] if tip0_tip1_distance > max_distance: max_distance = tip0_tip1_distance max_tip0 = tip0 max_tip1 = tip1 midpoint_distance = max_distance / 2 tip0 = max_tip0 tip1 = max_tip1 node_closer_to_tip0 = visit_order_of_node[tip_node_objects[tip0]] node_closer_to_tip1 = node_closer_to_tip0 distance_to_tip1 = 
distance_from_node_to_tip[node_closer_to_tip0, tip1] node_even_closer_to_tip1 \ = stepping_stone_from_node_to_tip[node_closer_to_tip0, tip1] previous_distance_to_tip1 = distance_to_tip1 while distance_to_tip1 > midpoint_distance: node_closer_to_tip0 = node_closer_to_tip1 node_closer_to_tip1 = node_even_closer_to_tip1 previous_distance_to_tip1 = distance_to_tip1 distance_to_tip1 = distance_from_node_to_tip[node_closer_to_tip1, tip1] node_even_closer_to_tip1 \ = stepping_stone_from_node_to_tip[node_closer_to_tip1, tip1] node_object_closer_to_tip0 = breadth_first_visit_order[node_closer_to_tip0] node_object_closer_to_tip1 = breadth_first_visit_order[node_closer_to_tip1] if node_object_closer_to_tip1 == node_object_closer_to_tip0._parent: theParent = node_object_closer_to_tip1 theChild = node_object_closer_to_tip0 distance_from_new_root_to_parent \ = midpoint_distance - distance_to_tip1 distance_from_new_root_to_child \ = previous_distance_to_tip1 - midpoint_distance elif node_object_closer_to_tip0 == node_object_closer_to_tip1._parent: theParent = node_object_closer_to_tip0 theChild = node_object_closer_to_tip1 distance_from_new_root_to_parent \ = previous_distance_to_tip1 - midpoint_distance distance_from_new_root_to_child \ = midpoint_distance - distance_to_tip1 else: # Should never get here raise AssertionError('Adjacent nodes on maximum span not parent-child') sys.stdout.write('(') # omit the branch length from theChild to its parent, since this is the # branch being broken in two sys.stdout.write(':'.join( theChild.getNewick(with_distances=True,semicolon=False).split(':')[:-1])) sys.stdout.write(":%g" % distance_from_new_root_to_child) sys.stdout.write(',') def print_rotated_node(child, parent, distance_from_parent_to_new_parent): sys.stdout.write('(') sys.stdout.write(','.join([other_child.getNewick(with_distances=True, semicolon=False) for other_child in parent.Children if other_child != child])) if parent._parent: sys.stdout.write(',') print_rotated_node(parent, parent._parent, parent.params['length']) sys.stdout.write(')') if parent.Name: sys.stdout.write(parent.Name) sys.stdout.write(":%g" % distance_from_parent_to_new_parent) print_rotated_node(theChild, theParent, distance_from_new_root_to_parent) sys.stdout.write(');\n')
class fast_tree_tests(TestCase): """Tests of top-level functions""" def setUp(self): """Define a couple of standard trees""" self.t1 = DndParser('(((a,b),c),(d,e))', UniFracTreeNode) self.t2 = DndParser('(((a,b),(c,d)),(e,f))', UniFracTreeNode) self.t3 = DndParser('(((a,b,c),(d)),(e,f))', UniFracTreeNode) self.t4 = DndParser('((c)b,((f,g,h)e,i)d)', UniFracTreeNode) self.t4.Name = 'a' self.t_str = '((a:1,b:2):4,(c:3,(d:1,e:1):2):3)' self.t = DndParser(self.t_str, UniFracTreeNode) self.env_str = """ a A 1 a C 2 b A 1 b B 1 c B 1 d B 3 e C 1""" self.env_counts = count_envs(self.env_str.splitlines()) self.node_index, self.nodes = index_tree(self.t) self.count_array, self.unique_envs, self.env_to_index, \ self.node_to_index = index_envs(self.env_counts, self.node_index) self.branch_lengths = get_branch_lengths(self.node_index) self.old_t_str = '((org1:0.11,org2:0.22,(org3:0.12,org4:0.23)g:0.33)b:0.2,(org5:0.44,org6:0.55)c:0.3,org7:0.4)' self.old_t = DndParser(self.old_t_str, UniFracTreeNode) self.old_env_str = """ org1 env1 1 org1 env2 1 org2 env2 1 org3 env2 1 org4 env3 1 org5 env1 1 org6 env1 1 org7 env3 1 """ self.old_env_counts = count_envs(self.old_env_str.splitlines()) self.old_node_index, self.old_nodes = index_tree(self.old_t) self.old_count_array, self.old_unique_envs, self.old_env_to_index, \ self.old_node_to_index = index_envs(self.old_env_counts, self.old_node_index) self.old_branch_lengths = get_branch_lengths(self.old_node_index) def test_traverse(self): """traverse should work iterative or recursive""" stti = self.t4.traverse stt = self.t4.traverse_recursive obs = [i.Name for i in stt(self_before=False, self_after=False)] exp = [i.Name for i in stti(self_before=False, self_after=False)] self.assertEqual(obs, exp) obs = [i.Name for i in stt(self_before=True, self_after=False)] exp = [i.Name for i in stti(self_before=True, self_after=False)] self.assertEqual(obs, exp) obs = [i.Name for i in stt(self_before=False, self_after=True)] exp = [i.Name for i in stti(self_before=False, self_after=True)] self.assertEqual(obs, exp) obs = [i.Name for i in stt(self_before=True, self_after=True)] exp = [i.Name for i in stti(self_before=True, self_after=True)] self.assertEqual(obs, exp) def test_count_envs(self): """count_envs should return correct counts from lines""" envs = """ a A 3 some other junk a B a C 1 b A 2 skip c B d b A 99 """ result = count_envs(envs.splitlines()) self.assertEqual(result, \ {'a':{'A':3,'B':1,'C':1},'b':{'A':99},'c':{'B':1}}) def test_sum_env_dict(self): """sum_env_dict should return correct counts from env_dict""" envs = """ a A 3 some other junk a B a C 1 b A 2 skip c B d b A 99 """ result = count_envs(envs.splitlines()) sum_ = sum_env_dict(result) self.assertEqual(sum_, 105) def test_index_envs(self): """index_envs should map envs and taxa onto indices""" self.assertEqual(self.unique_envs, ['A', 'B', 'C']) self.assertEqual(self.env_to_index, {'A': 0, 'B': 1, 'C': 2}) self.assertEqual(self.node_to_index, { 'a': 0, 'b': 1, 'c': 4, 'd': 2, 'e': 3 }) self.assertEqual(self.count_array, \ array([[1,0,2],[1,1,0],[0,3,0],[0,0,1], \ [0,1,0],[0,0,0],[0,0,0],[0,0,0],[0,0,0]])) def test_get_branch_lengths(self): """get_branch_lengths should make array of branch lengths from index""" result = get_branch_lengths(self.node_index) self.assertEqual(result, array([1, 2, 1, 1, 3, 2, 4, 3, 0])) def test_env_unique_fraction(self): """should report unique fraction of bl in each env """ # testing old unique fraction cur_count_array = self.count_array.copy() bound_indices = 
bind_to_array(self.nodes, cur_count_array) total_bl = sum(self.branch_lengths) bool_descendants(bound_indices) env_bl_sums, env_bl_ufracs = env_unique_fraction( self.branch_lengths, cur_count_array) # env A has 0 unique bl, B has 4, C has 1 self.assertEqual(env_bl_sums, [0, 4, 1]) self.assertEqual(env_bl_ufracs, [0, 4 / 17.0, 1 / 17.0]) cur_count_array = self.old_count_array.copy() bound_indices = bind_to_array(self.old_nodes, cur_count_array) total_bl = sum(self.old_branch_lengths) bool_descendants(bound_indices) env_bl_sums, env_bl_ufracs = env_unique_fraction( self.old_branch_lengths, cur_count_array) # env A has 0 unique bl, B has 4, C has 1 self.assertEqual(env_bl_sums, env_bl_sums) self.assertEqual(env_bl_sums, [1.29, 0.33999999999999997, 0.63]) self.assertEqual(env_bl_ufracs, [1.29 / 2.9, 0.33999999999999997 / 2.9, 0.63 / 2.9]) def test_index_tree(self): """index_tree should produce correct index and node map""" #test for first tree: contains singleton outgroup t1 = self.t1 id_1, child_1 = index_tree(t1) nodes_1 = [n._leaf_index for n in t1.traverse(self_before=False, \ self_after=True)] self.assertEqual(nodes_1, [0, 1, 2, 3, 6, 4, 5, 7, 8]) self.assertEqual(child_1, [(2, 0, 1), (6, 2, 3), (7, 4, 5), (8, 6, 7)]) #test for second tree: strictly bifurcating t2 = self.t2 id_2, child_2 = index_tree(t2) nodes_2 = [n._leaf_index for n in t2.traverse(self_before=False, \ self_after=True)] self.assertEqual(nodes_2, [0, 1, 4, 2, 3, 5, 8, 6, 7, 9, 10]) self.assertEqual(child_2, [(4, 0, 1), (5, 2, 3), (8, 4, 5), (9, 6, 7), (10, 8, 9)]) #test for third tree: contains trifurcation and single-child parent t3 = self.t3 id_3, child_3 = index_tree(t3) nodes_3 = [n._leaf_index for n in t3.traverse(self_before=False, \ self_after=True)] self.assertEqual(nodes_3, [0, 1, 2, 4, 3, 5, 8, 6, 7, 9, 10]) self.assertEqual(child_3, [(4, 0, 2), (5, 3, 3), (8, 4, 5), (9, 6, 7), (10, 8, 9)]) def test_bind_to_array(self): """bind_to_array should return correct array ranges""" a = reshape(arange(33), (11, 3)) id_, child = index_tree(self.t3) bindings = bind_to_array(child, a) self.assertEqual(len(bindings), 5) self.assertEqual(bindings[0][0], a[4]) self.assertEqual(bindings[0][1], a[0:3]) self.assertEqual(bindings[0][1].shape, (3, 3)) self.assertEqual(bindings[1][0], a[5]) self.assertEqual(bindings[1][1], a[3:4]) self.assertEqual(bindings[1][1].shape, (1, 3)) self.assertEqual(bindings[2][0], a[8]) self.assertEqual(bindings[2][1], a[4:6]) self.assertEqual(bindings[2][1].shape, (2, 3)) self.assertEqual(bindings[3][0], a[9]) self.assertEqual(bindings[3][1], a[6:8]) self.assertEqual(bindings[3][1].shape, (2, 3)) self.assertEqual(bindings[4][0], a[10]) self.assertEqual(bindings[4][1], a[8:10]) self.assertEqual(bindings[4][1].shape, (2, 3)) def test_bind_to_parent_array(self): """bind_to_parent_array should bind tree to array correctly""" a = reshape(arange(33), (11, 3)) index_tree(self.t3) bindings = bind_to_parent_array(self.t3, a) self.assertEqual(len(bindings), 10) self.assertEqual(bindings[0][0], a[8]) self.assertEqual(bindings[0][1], a[10]) self.assertEqual(bindings[1][0], a[4]) self.assertEqual(bindings[1][1], a[8]) self.assertEqual(bindings[2][0], a[0]) self.assertEqual(bindings[2][1], a[4]) self.assertEqual(bindings[3][0], a[1]) self.assertEqual(bindings[3][1], a[4]) self.assertEqual(bindings[4][0], a[2]) self.assertEqual(bindings[4][1], a[4]) self.assertEqual(bindings[5][0], a[5]) self.assertEqual(bindings[5][1], a[8]) self.assertEqual(bindings[6][0], a[3]) self.assertEqual(bindings[6][1], a[5]) 
self.assertEqual(bindings[7][0], a[9]) self.assertEqual(bindings[7][1], a[10]) self.assertEqual(bindings[8][0], a[6]) self.assertEqual(bindings[8][1], a[9]) self.assertEqual(bindings[9][0], a[7]) self.assertEqual(bindings[9][1], a[9]) def test_delete_empty_parents(self): """delete_empty_parents should remove empty parents from bound indices""" id_to_node, node_first_last = index_tree(self.t) bound_indices = bind_to_array(node_first_last, self.count_array[:, 0:1]) bool_descendants(bound_indices) self.assertEqual(len(bound_indices), 4) deleted = delete_empty_parents(bound_indices) self.assertEqual(len(deleted), 2) for d in deleted: self.assertEqual(d[0][0], 1) def test_traverse_reduce(self): """traverse_reduce should reduce array in traversal order.""" id_, child = index_tree(self.t3) a = zeros((11, 3)) + 99 #fill with junk bindings = bind_to_array(child, a) #load in leaf envs a[0] = a[1] = a[2] = a[7] = [0, 1, 0] a[3] = [1, 0, 0] a[6] = [0, 0, 1] f = logical_or.reduce traverse_reduce(bindings, f) self.assertEqual(a,\ array([[0,1,0],[0,1,0],[0,1,0],[1,0,0],[0,1,0],[1,0,0],\ [0,0,1],[0,1,0],[1,1,0],[0,1,1],[1,1,1]]) ) f = sum traverse_reduce(bindings, f) self.assertEqual( a, \ array([[0,1,0],[0,1,0],[0,1,0],[1,0,0],[0,3,0],[1,0,0],\ [0,0,1],[0,1,0],[1,3,0],[0,1,1],[1,4,1]]) ) def test_bool_descendants(self): """bool_descendants should be true if any descendant true""" #self.t3 = DndParser('(((a,b,c),(d)),(e,f))', UniFracTreeNode) id_, child = index_tree(self.t3) a = zeros((11, 3)) + 99 #fill with junk bindings = bind_to_array(child, a) #load in leaf envs a[0] = a[1] = a[2] = a[7] = [0, 1, 0] a[3] = [1, 0, 0] a[6] = [0, 0, 1] bool_descendants(bindings) self.assertEqual(a, \ array([[0,1,0],[0,1,0],[0,1,0],[1,0,0],[0,1,0],[1,0,0],\ [0,0,1],[0,1,0],[1,1,0],[0,1,1],[1,1,1]]) ) def test_sum_descendants(self): """sum_descendants should sum total descendants w/ each state""" id_, child = index_tree(self.t3) a = zeros((11, 3)) + 99 #fill with junk bindings = bind_to_array(child, a) #load in leaf envs a[0] = a[1] = a[2] = a[7] = [0, 1, 0] a[3] = [1, 0, 0] a[6] = [0, 0, 1] sum_descendants(bindings) self.assertEqual(a, \ array([[0,1,0],[0,1,0],[0,1,0],[1,0,0],[0,3,0],[1,0,0],\ [0,0,1],[0,1,0],[1,3,0],[0,1,1],[1,4,1]]) ) def test_fitch_descendants(self): """fitch_descendants should assign states by fitch parsimony, ret. 
#""" id_, child = index_tree(self.t3) a = zeros((11, 3)) + 99 #fill with junk bindings = bind_to_array(child, a) #load in leaf envs a[0] = a[1] = a[2] = a[7] = [0, 1, 0] a[3] = [1, 0, 0] a[6] = [0, 0, 1] changes = fitch_descendants(bindings) self.assertEqual(changes, 2) self.assertEqual(a, \ array([[0,1,0],[0,1,0],[0,1,0],[1,0,0],[0,1,0],[1,0,0],\ [0,0,1],[0,1,0],[1,1,0],[0,1,1],[0,1,0]]) ) def test_fitch_descendants_missing_data(self): """fitch_descendants should work with missing data""" #tree and envs for testing missing values t_str = '(((a:1,b:2):4,(c:3,d:1):2):1,(e:2,f:1):3);' env_str = """a A b B c D d C e C f D""" t = DndParser(t_str, UniFracTreeNode) node_index, nodes = index_tree(t) env_counts = count_envs(env_str.split('\n')) count_array, unique_envs, env_to_index, node_to_index = \ index_envs(env_counts, node_index) branch_lengths = get_branch_lengths(node_index) #test just the AB pair ab_counts = count_array[:, 0:2] bindings = bind_to_array(nodes, ab_counts) changes = fitch_descendants(bindings, counter=FitchCounter) self.assertEqual(changes, 1) orig_result = ab_counts.copy() #check that the original Fitch counter gives the expected #incorrect parsimony result changes = fitch_descendants(bindings, counter=FitchCounterDense) self.assertEqual(changes, 5) new_result = ab_counts.copy() #check that the two versions fill the array with the same values self.assertEqual(orig_result, new_result) def test_tip_distances(self): """tip_distances should set tips to correct distances.""" t = self.t bl = self.branch_lengths.copy()[:, newaxis] bindings = bind_to_parent_array(t, bl) tips = [] for n in t.traverse(self_before=False, self_after=True): if not n.Children: tips.append(n._leaf_index) tip_distances(bl, bindings, tips) self.assertEqual(bl, array([5, 6, 6, 6, 6, 0, 0, 0, 0])[:, newaxis]) def test_permute_selected_rows(self): """permute_selected_rows should switch just the selected rows in a""" orig = reshape(arange(8), (4, 2)) new = orig.copy() fake_permutation = lambda a: range(a)[::-1] #reverse order permute_selected_rows([0, 2], orig, new, fake_permutation) self.assertEqual(new, array([[4, 5], [2, 3], [0, 1], [6, 7]])) #make sure we didn't change orig self.assertEqual(orig, reshape(arange(8), (4, 2))) def test_prep_items_for_jackknife(self): """prep_items_for_jackknife should expand indices of repeated counts""" a = array([0, 1, 0, 1, 2, 0, 3]) # 0 1 2 3 4 5 6 result = prep_items_for_jackknife(a) exp = array([1, 3, 4, 4, 6, 6, 6]) self.assertEqual(result, exp) def test_jackknife_bool(self): """jackknife_bool should make a vector with right number of nonzeros""" fake_permutation = lambda a: range(a)[::-1] #reverse order orig_vec = array([0, 0, 1, 0, 1, 1, 0, 1, 1]) orig_items = flatnonzero(orig_vec) length = len(orig_vec) result = jackknife_bool(orig_items, 3, len(orig_vec), fake_permutation) self.assertEqual(result, array([0, 0, 0, 0, 0, 1, 0, 1, 1])) #returns the original if trying to take too many self.assertEqual(jackknife_bool(orig_items, 20, len(orig_vec)), \ orig_vec) def test_jackknife_int(self): """jackknife_int should make a vector with right counts""" orig_vec = array([0, 2, 1, 0, 3, 1]) orig_items = array([1, 1, 2, 4, 4, 4, 5]) # 0 1 2 3 4 5 6 fake_permutation = lambda a: a == 7 and array([4, 6, 3, 1, 2, 6, 5]) result = jackknife_int(orig_items, 4, len(orig_vec), fake_permutation) self.assertEqual(result, array([0, 1, 0, 0, 2, 1])) #returns the original if trying to take too many self.assertEqual(jackknife_int(orig_items, 20, len(orig_vec)), \ orig_vec) def 
test_jackknife_array(self): """jackknife_array should make a new array with right counts""" orig_vec1 = array([0, 2, 2, 3, 1]) orig_vec2 = array([2, 2, 1, 2, 2]) test_array = array([orig_vec1, orig_vec2]) # implement this, just doing by eye now #perm_fn = fake_permutation perm_fn = permutation #print "need to test with fake permutation!!" new_mat1 = jackknife_array(test_array, 1, axis=1, jackknife_f=jackknife_int, permutation_f=permutation) self.assertEqual(new_mat1.sum(axis=0), [1, 1, 1, 1, 1]) new_mat2 = jackknife_array(test_array, 2, axis=1, jackknife_f=jackknife_int, permutation_f=permutation) self.assertEqual(new_mat2.sum(axis=0), [2, 2, 2, 2, 2]) new_mat3 = jackknife_array(test_array, 2, axis=0, jackknife_f=jackknife_int, permutation_f=permutation) self.assertEqual(new_mat3.sum(axis=1), [2, 2]) # test that you get orig mat back if too many self.assertEqual(jackknife_array(test_array, 20, axis=1), test_array) def test_unifrac(self): """unifrac should return correct results for model tree""" m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\ [0,1,1],[1,1,1]]) bl = self.branch_lengths self.assertEqual(unifrac(bl, m[:, 0], m[:, 1]), 10 / 16.0) self.assertEqual(unifrac(bl, m[:, 0], m[:, 2]), 8 / 13.0) self.assertEqual(unifrac(bl, m[:, 1], m[:, 2]), 8 / 17.0) def test_unnormalized_unifrac(self): """unnormalized unifrac should return correct results for model tree""" m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\ [0,1,1],[1,1,1]]) bl = self.branch_lengths self.assertEqual(unnormalized_unifrac(bl, m[:, 0], m[:, 1]), 10 / 17.) self.assertEqual(unnormalized_unifrac(bl, m[:, 0], m[:, 2]), 8 / 17.) self.assertEqual(unnormalized_unifrac(bl, m[:, 1], m[:, 2]), 8 / 17.) def test_PD(self): """PD should return correct results for model tree""" m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\ [0,1,1],[1,1,1]]) bl = self.branch_lengths self.assertEqual(PD(bl, m[:, 0]), 7) self.assertEqual(PD(bl, m[:, 1]), 15) self.assertEqual(PD(bl, m[:, 2]), 11) def test_G(self): """G should return correct results for model tree""" m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\ [0,1,1],[1,1,1]]) bl = self.branch_lengths self.assertEqual(G(bl, m[:, 0], m[:, 0]), 0) self.assertEqual(G(bl, m[:, 0], m[:, 1]), 1 / 16.0) self.assertEqual(G(bl, m[:, 1], m[:, 0]), 9 / 16.0) def test_unnormalized_G(self): """unnormalized_G should return correct results for model tree""" m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\ [0,1,1],[1,1,1]]) bl = self.branch_lengths self.assertEqual(unnormalized_G(bl, m[:, 0], m[:, 0]), 0 / 17.) self.assertEqual(unnormalized_G(bl, m[:, 0], m[:, 1]), 1 / 17.) self.assertEqual(unnormalized_G(bl, m[:, 1], m[:, 0]), 9 / 17.) 
def test_unifrac_matrix(self): """unifrac_matrix should return correct results for model tree""" m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\ [0,1,1],[1,1,1]]) bl = self.branch_lengths result = unifrac_matrix(bl, m) self.assertEqual(result, array([[0, 10/16.,8/13.],[10/16.,0,8/17.],\ [8/13.,8/17.,0]])) #should work if we tell it the measure is asymmetric result = unifrac_matrix(bl, m, is_symmetric=False) self.assertEqual(result, array([[0, 10/16.,8/13.],[10/16.,0,8/17.],\ [8/13.,8/17.,0]])) #should work if the measure really is asymmetric result = unifrac_matrix(bl, m, metric=unnormalized_G, is_symmetric=False) self.assertEqual(result, array([[0, 1/17.,2/17.],[9/17.,0,6/17.],\ [6/17.,2/17.,0]])) #should also match web site calculations envs = self.count_array bound_indices = bind_to_array(self.nodes, envs) bool_descendants(bound_indices) result = unifrac_matrix(bl, envs) exp = array([[0, 0.6250, 0.6154], [0.6250, 0, \ 0.4706], [0.6154, 0.4707, 0]]) assert (abs(result - exp)).max() < 0.001 def test_unifrac_vector(self): """unifrac_vector should return correct results for model tree""" m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\ [0,1,1],[1,1,1]]) bl = self.branch_lengths result = unifrac_vector(bl, m) self.assertFloatEqual(result, array([10. / 17, 6. / 17, 7. / 17])) def test_PD_vector(self): """PD_vector should return correct results for model tree""" m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\ [0,1,1],[1,1,1]]) bl = self.branch_lengths result = PD_vector(bl, m) self.assertFloatEqual(result, array([7, 15, 11])) def test_weighted_unifrac_matrix(self): """weighted unifrac matrix should return correct results for model tree""" #should match web site calculations envs = self.count_array bound_indices = bind_to_array(self.nodes, envs) sum_descendants(bound_indices) bl = self.branch_lengths tip_indices = [n._leaf_index for n in self.t.tips()] result = weighted_unifrac_matrix(bl, envs, tip_indices) exp = array([[0, 9.1, 4.5], [9.1, 0, \ 6.4], [4.5, 6.4, 0]]) assert (abs(result - exp)).max() < 0.001 #should work with branch length corrections td = bl.copy()[:, newaxis] tip_bindings = bind_to_parent_array(self.t, td) tips = [n._leaf_index for n in self.t.tips()] tip_distances(td, tip_bindings, tips) result = weighted_unifrac_matrix(bl, envs, tip_indices, bl_correct=True, tip_distances=td) exp = array([[0, 9.1/11.5, 4.5/(10.5+1./3)], [9.1/11.5, 0, \ 6.4/(11+1./3)], [4.5/(10.5+1./3), 6.4/(11+1./3), 0]]) assert (abs(result - exp)).max() < 0.001 def test_weighted_unifrac_vector(self): """weighted_unifrac_vector should return correct results for model tree""" envs = self.count_array bound_indices = bind_to_array(self.nodes, envs) sum_descendants(bound_indices) bl = self.branch_lengths tip_indices = [n._leaf_index for n in self.t.tips()] result = weighted_unifrac_vector(bl, envs, tip_indices) self.assertFloatEqual( result[0], sum([ abs(1. / 2 - 2. / 8) * 1, abs(1. / 2 - 1. / 8) * 2, abs(0 - 1. / 8) * 3, abs(0 - 3. / 8) * 1, abs(0 - 1. / 8) * 1, abs(0 - 4. / 8) * 2, abs(2. / 2 - 3. / 8) * 4, abs(0. - 5. / 8) * 3. ])) self.assertFloatEqual( result[1], sum([ abs(0 - .6) * 1, abs(.2 - .2) * 2, abs(.2 - 0) * 3, abs(.6 - 0) * 1, abs(0 - .2) * 1, abs(.6 - .2) * 2, abs(.2 - .8) * 4, abs(.8 - .2) * 3 ])) self.assertFloatEqual( result[2], sum([ abs(2. / 3 - 1. / 7) * 1, abs(0 - 2. / 7) * 2, abs(0 - 1. / 7) * 3, abs(0 - 3. / 7) * 1, abs(1. / 3 - 0) * 1, abs(1. / 3 - 3. / 7) * 2, abs(2. / 3 - 3. / 7) * 4, abs(1. / 3 - 4. / 7) * 3 ]))
class fast_tree_tests(TestCase): """Tests of top-level functions""" def setUp(self): """Define a couple of standard trees""" self.t1 = DndParser('(((a,b),c),(d,e))', UniFracTreeNode) self.t2 = DndParser('(((a,b),(c,d)),(e,f))', UniFracTreeNode) self.t3 = DndParser('(((a,b,c),(d)),(e,f))', UniFracTreeNode) self.t4 = DndParser('((c)b,((f,g,h)e,i)d)', UniFracTreeNode) self.t4.Name = 'a' self.t_str = '((a:1,b:2):4,(c:3,(d:1,e:1):2):3)' self.t = DndParser(self.t_str, UniFracTreeNode) self.env_str = """ a A 1 a C 2 b A 1 b B 1 c B 1 d B 3 e C 1""" self.env_counts = count_envs(self.env_str.splitlines()) self.node_index, self.nodes = index_tree(self.t) self.count_array, self.unique_envs, self.env_to_index, \ self.node_to_index = index_envs(self.env_counts, self.node_index) self.branch_lengths = get_branch_lengths(self.node_index) self.old_t_str = '((org1:0.11,org2:0.22,(org3:0.12,org4:0.23)g:0.33)b:0.2,(org5:0.44,org6:0.55)c:0.3,org7:0.4)' self.old_t = DndParser(self.old_t_str, UniFracTreeNode) self.old_env_str = """ org1 env1 1 org1 env2 1 org2 env2 1 org3 env2 1 org4 env3 1 org5 env1 1 org6 env1 1 org7 env3 1 """ self.old_env_counts = count_envs(self.old_env_str.splitlines()) self.old_node_index, self.old_nodes = index_tree(self.old_t) self.old_count_array, self.old_unique_envs, self.old_env_to_index, \ self.old_node_to_index = index_envs(self.old_env_counts, self.old_node_index) self.old_branch_lengths = get_branch_lengths(self.old_node_index) def test_traverse(self): """traverse should work iterative or recursive""" stti = self.t4.traverse stt = self.t4.traverse_recursive obs = [i.Name for i in stt(self_before=False, self_after=False)] exp = [i.Name for i in stti(self_before=False, self_after=False)] self.assertEqual(obs, exp) obs = [i.Name for i in stt(self_before=True, self_after=False)] exp = [i.Name for i in stti(self_before=True, self_after=False)] self.assertEqual(obs, exp) obs = [i.Name for i in stt(self_before=False, self_after=True)] exp = [i.Name for i in stti(self_before=False, self_after=True)] self.assertEqual(obs, exp) obs = [i.Name for i in stt(self_before=True, self_after=True)] exp = [i.Name for i in stti(self_before=True, self_after=True)] self.assertEqual(obs, exp) def test_count_envs(self): """count_envs should return correct counts from lines""" envs = """ a A 3 some other junk a B a C 1 b A 2 skip c B d b A 99 """ result = count_envs(envs.splitlines()) self.assertEqual(result, \ {'a':{'A':3,'B':1,'C':1},'b':{'A':99},'c':{'B':1}}) def test_sum_env_dict(self): """sum_env_dict should return correct counts from env_dict""" envs = """ a A 3 some other junk a B a C 1 b A 2 skip c B d b A 99 """ result = count_envs(envs.splitlines()) sum_ = sum_env_dict(result) self.assertEqual(sum_, 105) def test_index_envs(self): """index_envs should map envs and taxa onto indices""" self.assertEqual(self.unique_envs, ['A','B','C']) self.assertEqual(self.env_to_index, {'A':0, 'B':1, 'C':2}) self.assertEqual(self.node_to_index,{'a':0, 'b':1, 'c':4, 'd':2, 'e':3}) self.assertEqual(self.count_array, \ array([[1,0,2],[1,1,0],[0,3,0],[0,0,1], \ [0,1,0],[0,0,0],[0,0,0],[0,0,0],[0,0,0]])) def test_get_branch_lengths(self): """get_branch_lengths should make array of branch lengths from index""" result = get_branch_lengths(self.node_index) self.assertEqual(result, array([1,2,1,1,3,2,4,3,0])) def test_env_unique_fraction(self): """should report unique fraction of bl in each env """ # testing old unique fraction cur_count_array = self.count_array.copy() bound_indices = bind_to_array(self.nodes, 
        total_bl = sum(self.branch_lengths)
        bool_descendants(bound_indices)
        env_bl_sums, env_bl_ufracs = env_unique_fraction(self.branch_lengths,
                                                         cur_count_array)
        # env A has 0 unique bl, B has 4, C has 1
        self.assertEqual(env_bl_sums, [0,4,1])
        self.assertEqual(env_bl_ufracs, [0,4/17.0,1/17.0])

        cur_count_array = self.old_count_array.copy()
        bound_indices = bind_to_array(self.old_nodes, cur_count_array)
        total_bl = sum(self.old_branch_lengths)
        bool_descendants(bound_indices)
        env_bl_sums, env_bl_ufracs = env_unique_fraction(
            self.old_branch_lengths, cur_count_array)
        # unique branch length for each env in the old tree
        self.assertEqual(env_bl_sums, [1.29, 0.33999999999999997, 0.63])
        self.assertEqual(env_bl_ufracs,
                         [1.29/2.9, 0.33999999999999997/2.9, 0.63/2.9])

    def test_index_tree(self):
        """index_tree should produce correct index and node map"""
        # test for first tree: contains singleton outgroup
        t1 = self.t1
        id_1, child_1 = index_tree(t1)
        nodes_1 = [n._leaf_index for n in t1.traverse(self_before=False, \
            self_after=True)]
        self.assertEqual(nodes_1, [0,1,2,3,6,4,5,7,8])
        self.assertEqual(child_1, [(2,0,1),(6,2,3),(7,4,5),(8,6,7)])
        # test for second tree: strictly bifurcating
        t2 = self.t2
        id_2, child_2 = index_tree(t2)
        nodes_2 = [n._leaf_index for n in t2.traverse(self_before=False, \
            self_after=True)]
        self.assertEqual(nodes_2, [0,1,4,2,3,5,8,6,7,9,10])
        self.assertEqual(child_2, [(4,0,1),(5,2,3),(8,4,5),(9,6,7),(10,8,9)])
        # test for third tree: contains trifurcation and single-child parent
        t3 = self.t3
        id_3, child_3 = index_tree(t3)
        nodes_3 = [n._leaf_index for n in t3.traverse(self_before=False, \
            self_after=True)]
        self.assertEqual(nodes_3, [0,1,2,4,3,5,8,6,7,9,10])
        self.assertEqual(child_3, [(4,0,2),(5,3,3),(8,4,5),(9,6,7),(10,8,9)])

    def test_bind_to_array(self):
        """bind_to_array should return correct array ranges"""
        a = reshape(arange(33), (11,3))
        id_, child = index_tree(self.t3)
        bindings = bind_to_array(child, a)
        self.assertEqual(len(bindings), 5)
        self.assertEqual(bindings[0][0], a[4])
        self.assertEqual(bindings[0][1], a[0:3])
        self.assertEqual(bindings[0][1].shape, (3,3))
        self.assertEqual(bindings[1][0], a[5])
        self.assertEqual(bindings[1][1], a[3:4])
        self.assertEqual(bindings[1][1].shape, (1,3))
        self.assertEqual(bindings[2][0], a[8])
        self.assertEqual(bindings[2][1], a[4:6])
        self.assertEqual(bindings[2][1].shape, (2,3))
        self.assertEqual(bindings[3][0], a[9])
        self.assertEqual(bindings[3][1], a[6:8])
        self.assertEqual(bindings[3][1].shape, (2,3))
        self.assertEqual(bindings[4][0], a[10])
        self.assertEqual(bindings[4][1], a[8:10])
        self.assertEqual(bindings[4][1].shape, (2,3))

    def test_bind_to_parent_array(self):
        """bind_to_parent_array should bind tree to array correctly"""
        a = reshape(arange(33), (11,3))
        index_tree(self.t3)
        bindings = bind_to_parent_array(self.t3, a)
        self.assertEqual(len(bindings), 10)
        self.assertEqual(bindings[0][0], a[8])
        self.assertEqual(bindings[0][1], a[10])
        self.assertEqual(bindings[1][0], a[4])
        self.assertEqual(bindings[1][1], a[8])
        self.assertEqual(bindings[2][0], a[0])
        self.assertEqual(bindings[2][1], a[4])
        self.assertEqual(bindings[3][0], a[1])
        self.assertEqual(bindings[3][1], a[4])
        self.assertEqual(bindings[4][0], a[2])
        self.assertEqual(bindings[4][1], a[4])
        self.assertEqual(bindings[5][0], a[5])
        self.assertEqual(bindings[5][1], a[8])
        self.assertEqual(bindings[6][0], a[3])
        self.assertEqual(bindings[6][1], a[5])
        self.assertEqual(bindings[7][0], a[9])
        self.assertEqual(bindings[7][1], a[10])
        self.assertEqual(bindings[8][0], a[6])
        self.assertEqual(bindings[8][1], a[9])
        self.assertEqual(bindings[9][0], a[7])
        self.assertEqual(bindings[9][1], a[9])

    def test_delete_empty_parents(self):
        """delete_empty_parents should remove empty parents from bound indices"""
        id_to_node, node_first_last = index_tree(self.t)
        bound_indices = bind_to_array(node_first_last, self.count_array[:,0:1])
        bool_descendants(bound_indices)
        self.assertEqual(len(bound_indices), 4)
        deleted = delete_empty_parents(bound_indices)
        self.assertEqual(len(deleted), 2)
        for d in deleted:
            self.assertEqual(d[0][0], 1)

    def test_traverse_reduce(self):
        """traverse_reduce should reduce array in traversal order."""
        id_, child = index_tree(self.t3)
        a = zeros((11,3)) + 99  # fill with junk
        bindings = bind_to_array(child, a)
        # load in leaf envs
        a[0] = a[1] = a[2] = a[7] = [0,1,0]
        a[3] = [1,0,0]
        a[6] = [0,0,1]
        f = logical_or.reduce
        traverse_reduce(bindings, f)
        self.assertEqual(a, \
            array([[0,1,0],[0,1,0],[0,1,0],[1,0,0],[0,1,0],[1,0,0],\
                   [0,0,1],[0,1,0],[1,1,0],[0,1,1],[1,1,1]]))
        f = sum
        traverse_reduce(bindings, f)
        self.assertEqual(a, \
            array([[0,1,0],[0,1,0],[0,1,0],[1,0,0],[0,3,0],[1,0,0],\
                   [0,0,1],[0,1,0],[1,3,0],[0,1,1],[1,4,1]]))

    def test_bool_descendants(self):
        """bool_descendants should be true if any descendant true"""
        # self.t3 = DndParser('(((a,b,c),(d)),(e,f))', UniFracTreeNode)
        id_, child = index_tree(self.t3)
        a = zeros((11,3)) + 99  # fill with junk
        bindings = bind_to_array(child, a)
        # load in leaf envs
        a[0] = a[1] = a[2] = a[7] = [0,1,0]
        a[3] = [1,0,0]
        a[6] = [0,0,1]
        bool_descendants(bindings)
        self.assertEqual(a, \
            array([[0,1,0],[0,1,0],[0,1,0],[1,0,0],[0,1,0],[1,0,0],\
                   [0,0,1],[0,1,0],[1,1,0],[0,1,1],[1,1,1]]))

    def test_sum_descendants(self):
        """sum_descendants should sum total descendants w/ each state"""
        id_, child = index_tree(self.t3)
        a = zeros((11,3)) + 99  # fill with junk
        bindings = bind_to_array(child, a)
        # load in leaf envs
        a[0] = a[1] = a[2] = a[7] = [0,1,0]
        a[3] = [1,0,0]
        a[6] = [0,0,1]
        sum_descendants(bindings)
        self.assertEqual(a, \
            array([[0,1,0],[0,1,0],[0,1,0],[1,0,0],[0,3,0],[1,0,0],\
                   [0,0,1],[0,1,0],[1,3,0],[0,1,1],[1,4,1]]))

    def test_fitch_descendants(self):
        """fitch_descendants should assign states by fitch parsimony, ret. # of changes"""
        id_, child = index_tree(self.t3)
        a = zeros((11,3)) + 99  # fill with junk
        bindings = bind_to_array(child, a)
        # load in leaf envs
        a[0] = a[1] = a[2] = a[7] = [0,1,0]
        a[3] = [1,0,0]
        a[6] = [0,0,1]
        changes = fitch_descendants(bindings)
        self.assertEqual(changes, 2)
        self.assertEqual(a, \
            array([[0,1,0],[0,1,0],[0,1,0],[1,0,0],[0,1,0],[1,0,0],\
                   [0,0,1],[0,1,0],[1,1,0],[0,1,1],[0,1,0]]))

    def test_fitch_descendants_missing_data(self):
        """fitch_descendants should work with missing data"""
        # tree and envs for testing missing values
        t_str = '(((a:1,b:2):4,(c:3,d:1):2):1,(e:2,f:1):3);'
        env_str = """a   A
b   B
c   D
d   C
e   C
f   D"""
        t = DndParser(t_str, UniFracTreeNode)
        node_index, nodes = index_tree(t)
        env_counts = count_envs(env_str.split('\n'))
        count_array, unique_envs, env_to_index, node_to_index = \
            index_envs(env_counts, node_index)
        branch_lengths = get_branch_lengths(node_index)
        # test just the AB pair
        ab_counts = count_array[:, 0:2]
        bindings = bind_to_array(nodes, ab_counts)
        changes = fitch_descendants(bindings, counter=FitchCounter)
        self.assertEqual(changes, 1)
        orig_result = ab_counts.copy()
        # check that the original Fitch counter gives the expected
        # incorrect parsimony result
        changes = fitch_descendants(bindings, counter=FitchCounterDense)
        self.assertEqual(changes, 5)
        new_result = ab_counts.copy()
        # check that the two versions fill the array with the same values
        self.assertEqual(orig_result, new_result)

    def test_tip_distances(self):
        """tip_distances should set tips to correct distances."""
        t = self.t
        bl = self.branch_lengths.copy()[:,newaxis]
        bindings = bind_to_parent_array(t, bl)
        tips = []
        for n in t.traverse(self_before=False, self_after=True):
            if not n.Children:
                tips.append(n._leaf_index)
        tip_distances(bl, bindings, tips)
        self.assertEqual(bl, array([5,6,6,6,6,0,0,0,0])[:,newaxis])

    def test_permute_selected_rows(self):
        """permute_selected_rows should switch just the selected rows in a"""
        orig = reshape(arange(8),(4,2))
        new = orig.copy()
        fake_permutation = lambda a: range(a)[::-1]  # reverse order
        permute_selected_rows([0,2], orig, new, fake_permutation)
        self.assertEqual(new, array([[4,5],[2,3],[0,1],[6,7]]))
        # make sure we didn't change orig
        self.assertEqual(orig, reshape(arange(8), (4,2)))

    def test_prep_items_for_jackknife(self):
        """prep_items_for_jackknife should expand indices of repeated counts"""
        a = array([0,1,0,1,2,0,3])  # 0 1 2 3 4 5 6
        result = prep_items_for_jackknife(a)
        exp = array([1,3,4,4,6,6,6])
        self.assertEqual(result, exp)

    def test_jackknife_bool(self):
        """jackknife_bool should make a vector with right number of nonzeros"""
        fake_permutation = lambda a: range(a)[::-1]  # reverse order
        orig_vec = array([0,0,1,0,1,1,0,1,1])
        orig_items = flatnonzero(orig_vec)
        length = len(orig_vec)
        result = jackknife_bool(orig_items, 3, len(orig_vec), fake_permutation)
        self.assertEqual(result, array([0,0,0,0,0,1,0,1,1]))
        # returns the original if trying to take too many
        self.assertEqual(jackknife_bool(orig_items, 20, len(orig_vec)), \
            orig_vec)

    def test_jackknife_int(self):
        """jackknife_int should make a vector with right counts"""
        orig_vec = array([0,2,1,0,3,1])
        orig_items = array([1,1,2,4,4,4,5])  # 0 1 2 3 4 5 6
        fake_permutation = lambda a: a == 7 and array([4,6,3,1,2,6,5])
        result = jackknife_int(orig_items, 4, len(orig_vec), fake_permutation)
        self.assertEqual(result, array([0,1,0,0,2,1]))
        # returns the original if trying to take too many
        self.assertEqual(jackknife_int(orig_items, 20, len(orig_vec)), \
            orig_vec)

    def test_jackknife_array(self):
        """jackknife_array should make a new array with right counts"""
        orig_vec1 = array([0,2,2,3,1])
        orig_vec2 = array([2,2,1,2,2])
        test_array = array([orig_vec1, orig_vec2])
        # implement this, just doing by eye now
        #perm_fn = fake_permutation
        perm_fn = permutation
        #print "need to test with fake permutation!!"
        new_mat1 = jackknife_array(test_array, 1, axis=1,
                                   jackknife_f=jackknife_int,
                                   permutation_f=permutation)
        self.assertEqual(new_mat1.sum(axis=0), [1,1,1,1,1])
        new_mat2 = jackknife_array(test_array, 2, axis=1,
                                   jackknife_f=jackknife_int,
                                   permutation_f=permutation)
        self.assertEqual(new_mat2.sum(axis=0), [2,2,2,2,2])
        new_mat3 = jackknife_array(test_array, 2, axis=0,
                                   jackknife_f=jackknife_int,
                                   permutation_f=permutation)
        self.assertEqual(new_mat3.sum(axis=1), [2,2])
        # test that you get orig mat back if too many
        self.assertEqual(jackknife_array(test_array, 20, axis=1), test_array)

    def test_unifrac(self):
        """unifrac should return correct results for model tree"""
        m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\
                   [0,1,1],[1,1,1]])
        bl = self.branch_lengths
        self.assertEqual(unifrac(bl, m[:,0], m[:,1]), 10/16.0)
        self.assertEqual(unifrac(bl, m[:,0], m[:,2]), 8/13.0)
        self.assertEqual(unifrac(bl, m[:,1], m[:,2]), 8/17.0)

    def test_unnormalized_unifrac(self):
        """unnormalized unifrac should return correct results for model tree"""
        m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\
                   [0,1,1],[1,1,1]])
        bl = self.branch_lengths
        self.assertEqual(unnormalized_unifrac(bl, m[:,0], m[:,1]), 10/17.)
        self.assertEqual(unnormalized_unifrac(bl, m[:,0], m[:,2]), 8/17.)
        self.assertEqual(unnormalized_unifrac(bl, m[:,1], m[:,2]), 8/17.)

    def test_PD(self):
        """PD should return correct results for model tree"""
        m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\
                   [0,1,1],[1,1,1]])
        bl = self.branch_lengths
        self.assertEqual(PD(bl, m[:,0]), 7)
        self.assertEqual(PD(bl, m[:,1]), 15)
        self.assertEqual(PD(bl, m[:,2]), 11)

    def test_G(self):
        """G should return correct results for model tree"""
        m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\
                   [0,1,1],[1,1,1]])
        bl = self.branch_lengths
        self.assertEqual(G(bl, m[:,0], m[:,0]), 0)
        self.assertEqual(G(bl, m[:,0], m[:,1]), 1/16.0)
        self.assertEqual(G(bl, m[:,1], m[:,0]), 9/16.0)

    def test_unnormalized_G(self):
        """unnormalized_G should return correct results for model tree"""
        m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\
                   [0,1,1],[1,1,1]])
        bl = self.branch_lengths
        self.assertEqual(unnormalized_G(bl, m[:,0], m[:,0]), 0/17.)
        self.assertEqual(unnormalized_G(bl, m[:,0], m[:,1]), 1/17.)
        self.assertEqual(unnormalized_G(bl, m[:,1], m[:,0]), 9/17.)
    def test_unifrac_matrix(self):
        """unifrac_matrix should return correct results for model tree"""
        m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\
                   [0,1,1],[1,1,1]])
        bl = self.branch_lengths
        result = unifrac_matrix(bl, m)
        self.assertEqual(result, array([[0, 10/16., 8/13.], [10/16., 0, 8/17.],\
            [8/13., 8/17., 0]]))
        # should work if we tell it the measure is asymmetric
        result = unifrac_matrix(bl, m, is_symmetric=False)
        self.assertEqual(result, array([[0, 10/16., 8/13.], [10/16., 0, 8/17.],\
            [8/13., 8/17., 0]]))
        # should work if the measure really is asymmetric
        result = unifrac_matrix(bl, m, metric=unnormalized_G, is_symmetric=False)
        self.assertEqual(result, array([[0, 1/17., 2/17.], [9/17., 0, 6/17.],\
            [6/17., 2/17., 0]]))
        # should also match web site calculations
        envs = self.count_array
        bound_indices = bind_to_array(self.nodes, envs)
        bool_descendants(bound_indices)
        result = unifrac_matrix(bl, envs)
        exp = array([[0, 0.6250, 0.6154],
                     [0.6250, 0, 0.4706],
                     [0.6154, 0.4706, 0]])
        assert (abs(result - exp)).max() < 0.001

    def test_unifrac_vector(self):
        """unifrac_vector should return correct results for model tree"""
        m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\
                   [0,1,1],[1,1,1]])
        bl = self.branch_lengths
        result = unifrac_vector(bl, m)
        self.assertFloatEqual(result, array([10./17, 6./17, 7./17]))

    def test_PD_vector(self):
        """PD_vector should return correct results for model tree"""
        m = array([[1,0,1],[1,1,0],[0,1,0],[0,0,1],[0,1,0],[0,1,1],[1,1,1],\
                   [0,1,1],[1,1,1]])
        bl = self.branch_lengths
        result = PD_vector(bl, m)
        self.assertFloatEqual(result, array([7,15,11]))

    def test_weighted_unifrac_matrix(self):
        """weighted unifrac matrix should return correct results for model tree"""
        # should match web site calculations
        envs = self.count_array
        bound_indices = bind_to_array(self.nodes, envs)
        sum_descendants(bound_indices)
        bl = self.branch_lengths
        tip_indices = [n._leaf_index for n in self.t.tips()]
        result = weighted_unifrac_matrix(bl, envs, tip_indices)
        exp = array([[0, 9.1, 4.5],
                     [9.1, 0, 6.4],
                     [4.5, 6.4, 0]])
        assert (abs(result - exp)).max() < 0.001
        # should work with branch length corrections
        td = bl.copy()[:,newaxis]
        tip_bindings = bind_to_parent_array(self.t, td)
        tips = [n._leaf_index for n in self.t.tips()]
        tip_distances(td, tip_bindings, tips)
        result = weighted_unifrac_matrix(bl, envs, tip_indices,
                                         bl_correct=True, tip_distances=td)
        exp = array([[0, 9.1/11.5, 4.5/(10.5+1./3)],
                     [9.1/11.5, 0, 6.4/(11+1./3)],
                     [4.5/(10.5+1./3), 6.4/(11+1./3), 0]])
        assert (abs(result - exp)).max() < 0.001

    def test_weighted_unifrac_vector(self):
        """weighted_unifrac_vector should return correct results for model tree"""
        envs = self.count_array
        bound_indices = bind_to_array(self.nodes, envs)
        sum_descendants(bound_indices)
        bl = self.branch_lengths
        tip_indices = [n._leaf_index for n in self.t.tips()]
        result = weighted_unifrac_vector(bl, envs, tip_indices)
        self.assertFloatEqual(result[0], sum([
            abs(1./2 - 2./8)*1,
            abs(1./2 - 1./8)*2,
            abs(0 - 1./8)*3,
            abs(0 - 3./8)*1,
            abs(0 - 1./8)*1,
            abs(0 - 4./8)*2,
            abs(2./2 - 3./8)*4,
            abs(0. - 5./8)*3.]))
        self.assertFloatEqual(result[1], sum([
            abs(0 - .6)*1,
            abs(.2 - .2)*2,
            abs(.2 - 0)*3,
            abs(.6 - 0)*1,
            abs(0 - .2)*1,
            abs(.6 - .2)*2,
            abs(.2 - .8)*4,
            abs(.8 - .2)*3]))
        self.assertFloatEqual(result[2], sum([
            abs(2./3 - 1./7)*1,
            abs(0 - 2./7)*2,
            abs(0 - 1./7)*3,
            abs(0 - 3./7)*1,
            abs(1./3 - 0)*1,
            abs(1./3 - 3./7)*2,
            abs(2./3 - 3./7)*4,
            abs(1./3 - 4./7)*3]))
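# Standard unittest entry point for the test module above (an assumed addition,
# not present in the original text). `main` is the test runner referenced in
# the import sketch earlier, as in PyCogent test modules; with it in place the
# file can be executed directly from the command line.
if __name__ == '__main__':
    main()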
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None,
                              working_dir='/tmp'):
    """Returns a tree from Alignment object aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one.
        - Clearcut only accepts aligned sequences. The Alignment object is
          used to handle unaligned sequences.

    moltype: a cogent.core.moltype object.
        - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8
          gives incorrect results if RNA is passed in. 'U' is treated as an
          incorrect character and is excluded from distance calculations.

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if tree
    fails.
    """
    # Use a fresh dict rather than a mutable default argument so that repeated
    # calls do not share (and keep mutating) the same params object.
    if params is None:
        params = {}
    params['--out'] = get_tmp_filename(working_dir)

    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(InputHandler='_input_as_multiline_string', params=params, \
                   WorkingDir=working_dir, SuppressStdout=True, \
                   SuppressStderr=True)
    # Input is an alignment
    app.Parameters['-a'].on()
    # Turn off input as distance matrix
    app.Parameters['-d'].off()

    # If moltype = RNA, we must convert to DNA.
    if moltype == RNA:
        moltype = DNA

    if best_tree:
        app.Parameters['-N'].on()

    # Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    # Setup mapping. Clearcut clips identifiers. We will need to remap them.
    # Clearcut only accepts aligned sequences. Let the Alignment object handle
    # unaligned sequences.
    seq_aln = Alignment(aln, MolType=moltype)
    # get int mapping
    int_map, int_keys = seq_aln.getIntMap()
    # create new Alignment object with int_map
    int_map = Alignment(int_map)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del (seq_aln, app, result, int_map, int_keys, params)

    return tree
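# Example usage (a minimal sketch, not part of the original module). The
# sequence data and names below are made up for illustration only; DNA is the
# cogent.core.moltype MolType this module already imports, and the example
# assumes Clearcut is installed and on the PATH.
#
#     aln = {'seq_1': 'ACGTACGTACGT',
#            'seq_2': 'ACGTACCTACGT',
#            'seq_3': 'ACGAACGTACCT'}
#     tree = build_tree_from_alignment(aln, DNA, best_tree=False)
#     print tree.getNewick(with_distances=True)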