def ml4(aln, true_tree):
    """Check whether maximum likelihood recovers the true 4-taxon tree.

    A JC69 likelihood function is fitted to ``aln`` (local optimiser) on
    each of the three candidate trees over the tips a, b, c and d. The
    candidate with the highest log-likelihood is then compared to
    ``true_tree``.

    Returns the result of ``sameTopology`` between the best candidate and
    ``true_tree``. If several candidates tie, the first one is used.
    """
    # No branch lengths are given in these strings; the original author
    # describes the resulting trees as having unit branch lengths.
    candidate_newicks = ('((a,b),(c,d))', '((a,c),(b,d))', '((a,d),(b,c))')
    candidates = [LoadTree(treestring=newick) for newick in candidate_newicks]

    # Fit the same model on every candidate and record its log-likelihood.
    model = JC69()
    log_likelihoods = []
    for candidate in candidates:
        fitted = model.makeLikelihoodFunction(candidate)
        fitted.setAlignment(aln)
        fitted.optimise(local=True)
        log_likelihoods.append(fitted.getLogLikelihood())

    best_score = max(log_likelihoods)
    winner = candidates[log_likelihoods.index(best_score)]
    return winner.sameTopology(true_tree)
def test_balanced(self):
    """balanced() on an unrooted tree prints as the expected tree"""
    source_newick = '((a,b),((c1,(c2,(c3,(c4,(c5,(c6,c7)))))),(d,e)),f)'
    expected_newick = '(c1,(c2,(c3,(c4,(c5,(c6,c7))))),((d,e),((a,b),f)))'

    tree = LoadTree(treestring=source_newick)
    expected = LoadTree(treestring=expected_newick)

    # The comparison is made on the string forms of the two trees.
    self.assertEqual(str(tree.balanced()), str(expected))
def test_sameShape(self):
    """sameTopology accepts the first two trees and rejects the third"""
    first = LoadTree(treestring="(((s1,s5),s3),s2,s4);")
    second = LoadTree(treestring="((s1,s5),(s2,s4),s3);")
    third = LoadTree(treestring="((s1,s4),(s2,s5),s3);")

    assert first.sameTopology(second), (first, second)
    # The third tree must be reported as different from both of the others.
    for other in (first, second):
        assert not other.sameTopology(third), (other, third)
def inflate_likelihood_function(data, model=None):
    """Rebuild a likelihood function from a flattened description.

    data: mapping read for the keys 'with_rate', 'mprobs', 'params' and
        either 'tree' (a tree string) or 'tip_names'. 'name' is read only
        when ``model`` is None, and 'dependencies' is optional.
    model: optional zero-argument callable returning the substitution
        model. When given it replaces the lookup by ``data['name']``.

    Returns the likelihood function with motif probabilities, parameter
    initial values and dependency rules applied.

    Raises NotImplementedError when ``data['name']`` is not a supported
    model name.
    """
    supported_subs_models = ('GeneralStationary', 'General',
                             'DiscreteSubstitutionModel', 'General_with_gaps')

    if model is not None:
        model = model()
    else:
        name = data['name']
        if name == 'GTR':
            gtr_kwargs = dict(optimise_motif_probs=True)
            if data['with_rate']:
                gtr_kwargs.update(with_rate=True, distribution='gamma')
            model = GTR(**gtr_kwargs)
        elif name == 'General_with_gaps':
            assert not data['with_rate'], name + ' plus Gamma not supported'
            model = General(DNA.Alphabet, optimise_motif_probs=True,
                            model_gaps=True, recode_gaps=False,
                            name='General_with_gaps')
        elif name in supported_subs_models:
            assert not data['with_rate'], name + ' plus Gamma not supported'
            # eval only ever sees a name from the whitelist checked above.
            model_class = eval(name)
            model = model_class(DNA.Alphabet, optimise_motif_probs=True,
                                model_gaps=False, recode_gaps=True, name=name)
        else:
            raise NotImplementedError(
                'inflate_likelihood_function: unsupported model ' + name)

    if 'tree' in data:
        tree = LoadTree(treestring=data['tree'].encode('utf-8'))
    else:
        encoded_tips = [tip.encode('utf-8') for tip in data['tip_names']]
        tree = LoadTree(tip_names=encoded_tips)

    if data['with_rate']:
        lf = model.makeLikelihoodFunction(tree, bins=4)
    else:
        lf = model.makeLikelihoodFunction(tree)

    # NOTE(review): the extent of this with-block was reconstructed from a
    # whitespace-collapsed source; confirm that the dependency rules are
    # meant to be applied while updates are postponed.
    with lf.updatesPostponed():
        lf.setMotifProbs(data['mprobs'])
        params = data['params']
        for param in params:
            dimensions = lf.defn_for[param].valid_dimensions
            if len(dimensions) == 0:
                # Parameter has no scope: one initial value for it.
                lf.setParamRule(param, init=params[param])
                continue

            per_edge = 'edge' in dimensions
            per_bin = 'bin' in dimensions
            if per_edge and per_bin:
                for edge, by_bin in params[param].items():
                    for bin_name, init in by_bin.items():
                        lf.setParamRule(param, edge=edge, bin=bin_name,
                                        init=init)
            elif per_edge:
                for edge, init in params[param].items():
                    lf.setParamRule(param, edge=edge, init=init)
            elif per_bin:
                for bin_name, init in params[param].items():
                    lf.setParamRule(param, bin=bin_name, init=init)

        if 'dependencies' in data:
            for param, scopes in data['dependencies'].items():
                for scope in scopes:
                    lf.setParamRule(param, is_independent=False, **scope)

    return lf
def test_limited_wls(self):
    """testing (well, exercising at least), wls with constrained start"""

    def start_tree(newick):
        return LoadTree(treestring=newick)

    # A single start tree.
    init = start_tree('((a,c),b,d)')
    reconstructed = wls(self.dists, start=init)
    self.assertEqual(len(reconstructed.getTipNames()), 5)

    # Two start trees.
    init2 = start_tree('((a,d),b,c)')
    reconstructed = wls(self.dists, start=[init, init2])
    self.assertEqual(len(reconstructed.getTipNames()), 5)

    # This pair of start trees is expected to be rejected.
    init3 = start_tree('((a,d),b,e)')
    self.assertRaises(Exception, wls, self.dists, start=[init, init3])

    # if start tree has all seq names, should raise an error
    complete = start_tree('((a,c),b,(d,e))')
    self.assertRaises(Exception, wls, self.dists, start=[complete])
def test_getEdgeNamesUseOutgroup(self):
    """getEdgeNames with outgroup F gives the same edges for both trees"""
    # Both trees are queried for tips A and E with F as the outgroup.
    newicks = ("((A,B)ab,(F,(C,D)cd)cdf,E)root;",
               "((E,(A,B)ab)abe,F,(C,D)cd)root;")
    trees = [LoadTree(treestring=newick) for newick in newicks]
    expected = ['A', 'B', 'E', 'ab']

    for tree in trees:
        edges = tree.getEdgeNames('A', 'E', getstem=False, getclade=True,
                                  outgroup_name="F")
        edges.sort()
        self.assertEqual(expected, edges)
def test_trees(self):
    """Draw one tree with several dendrogram classes and a colour callback"""
    treestring = "((A:.1,B:.22)ab:.3,((C:.4,D:.5)cd:.55,E:.6)cde:.7,F:.2)"
    # Lengthen every tip name, e.g. 'A' becomes 'Aaaaaaaaaaa'.
    for tip in 'ABCDEF':
        treestring = treestring.replace(tip, tip + tip.lower() * 10)
    t = LoadTree(treestring=treestring)

    # StraightDendrogram and ContemporaneousStraightDendrogram were
    # already disabled in the original list.
    dendrogram_classes = (
        UnrootedDendrogram,
        SquareDendrogram,
        ContemporaneousDendrogram,
        ShelvedDendrogram,
    )
    for klass in dendrogram_classes:
        dendro = klass(t)
        connecting = dendro.getConnectingNode('Ccccccccccc', 'Eeeeeeeeeee')
        connecting.setCollapsed(color="green", label="C, D and E")
        do(klass.__name__, dendro, shade_param="length",
           show_params=["length"])

    def callback(edge):
        # Edges whose name starts with "A" are red, all others blue.
        if edge.Name.startswith("A"):
            return "red"
        return "blue"

    do("Highlight edge A", UnrootedDendrogram(t),
       edge_color_callback=callback)
def test_making_from_list(self):
    """a tree built from a list of tip names reports the same names"""
    # The names include an underscore, a space and a quote character.
    tipnames_with_spaces = sorted(['a_b', 'a b', "T'lk"])
    t = LoadTree(tip_names=tipnames_with_spaces)

    result = t.getTipNames()
    result.sort()
    assert result == tipnames_with_spaces
def test_getsetParamValue(self):
    """test getting, setting of param values"""
    t = LoadTree(treestring='((((a:.2,b:.3)ab:.1,c:.3)abc:.4),d:.6)')

    # The original passed a stray third positional argument (2 and 4) to
    # assertEqual. That slot is the failure message, not a precision, so
    # it never relaxed the comparison. The comparisons stay exact here.
    self.assertEqual(t.getParamValue('length', 'ab'), 0.1)

    t.setParamValue('zz', 'ab', 4.321)
    node = t.getNodeMatchingName('ab')
    self.assertEqual(4.321, node.params['zz'])
def setUp(self):
    """Create the alignment, tree and nucleotide model used by the tests."""
    # Original author's note: all edges have length 1 except c, which is 2;
    # b & d are transitions, all others transversions.
    sequences = {
        'a': 'tata',
        'b': 'tgtc',
        'c': 'gcga',
        'd': 'gaac',
        'e': 'gagc',
    }
    self.al = LoadSeqs(data=sequences)
    self.tree = LoadTree(treestring='((a,b),(c,d),e);')
    self.model = cogent.evolve.substitution_model.Nucleotide(
        do_scaling=True,
        equal_motif_probs=True,
        model_gaps=True,
    )
def get_tree(filename):
    """Load a tree file and return its name and string form.

    Every '.' in an edge name is replaced by '_', and each edge is marked
    with ``NameLoaded = True``, before the tree is converted to a string.

    Returns a dict with 'treename' (the file's base name up to its last
    '.') and 'treestring' (``str`` of the modified tree).
    """
    tree = LoadTree(filename)

    base_name = os.path.basename(filename)
    treename = base_name.rsplit('.', 1)[0]

    for edge in tree.getEdgeVector():
        edge.NameLoaded = True
        edge.Name = edge.Name.replace('.', '_')

    return dict(treename=treename, treestring=str(tree))
def MakeCachedObjects(model, tree, seq_length, opt_args):
    """simulates an alignment under F81, all models should be the same

    An alignment of ``seq_length`` is simulated from ``model`` on ``tree``
    with fixed motif probabilities. The returned function takes
    ``(self, obj_name)`` and returns the cached object called ``obj_name``,
    fitting it on first request. Valid names are 'aln', 'general',
    'gen_stat', 'discrete' and 'constructed_gen'.
    """
    lf = model.makeLikelihoodFunction(tree)
    lf.setMotifProbs(dict(A=0.1, C=0.2, G=0.3, T=0.4))
    aln = lf.simulateAlignment(seq_length)

    # Shared cache: the alignment now, fitted likelihood functions later.
    results = dict(aln=aln)
    discrete_tree = LoadTree(tip_names=aln.Names)

    def fit_general():
        if 'general' not in results:
            gen = General(DNA.Alphabet)
            gen_lf = _make_likelihood(gen, tree, results)
            gen_lf.optimise(**opt_args)
            results['general'] = gen_lf

    def fit_gen_stat():
        if 'gen_stat' not in results:
            gen_stat = GeneralStationary(DNA.Alphabet)
            gen_stat_lf = _make_likelihood(gen_stat, tree, results)
            gen_stat_lf.optimise(**opt_args)
            results['gen_stat'] = gen_stat_lf

    def fit_constructed_gen():
        if 'constructed_gen' not in results:
            # Directed nucleotide changes used as the model's predicates.
            directed_pairs = [['A', 'C'], ['A', 'G'], ['A', 'T'],
                              ['C', 'A'], ['C', 'G'], ['C', 'T'],
                              ['G', 'C'], ['G', 'T'],
                              ['T', 'A'], ['T', 'C'], ['T', 'G']]
            preds = [MotifChange(a, b, forward_only=True)
                     for a, b in directed_pairs]
            nuc = Nucleotide(predicates=preds)
            nuc_lf = _make_likelihood(nuc, tree, results)
            nuc_lf.optimise(**opt_args)
            results['constructed_gen'] = nuc_lf

    def fit_discrete():
        if 'discrete' not in results:
            dis_lf = _make_likelihood(
                DiscreteSubstitutionModel(DNA.Alphabet), discrete_tree,
                results, is_discrete=True)
            dis_lf.optimise(**opt_args)
            results['discrete'] = dis_lf

    funcs = {
        'general': fit_general,
        'gen_stat': fit_gen_stat,
        'discrete': fit_discrete,
        'constructed_gen': fit_constructed_gen,
    }

    def call(self, obj_name):
        # Fit lazily; anything already cached is returned directly.
        if obj_name not in results:
            funcs[obj_name]()
        return results[obj_name]

    return call
def rooted(doc, rooted_edges=None, gc=None, **kw):
    """Fit MG94GTR, then GNC seeded from it, tying the rooted edges.

    doc: mapping with 'aln' and 'tree' text entries.
    rooted_edges: edges passed to ``setParamRule(..., edges=...)`` with
        ``is_independent=False``.
    gc: genetic code identifier passed to ``get_genetic_code``.

    Returns a dict with the deflated GNC likelihood function ('lf', which
    also carries a 'hard_up' entry), the genetic code name ('gc') and
    'rooted_edges'.
    """
    aln = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))

    code = get_genetic_code(gc)
    aln = aln.withoutTerminalStopCodons(code)

    def only_dna(x):
        return set(''.join(x)) <= set(DNA)

    aln = aln.filtered(only_dna, motif_length=3)

    sp_kw = dict(upper=20., lower=0.05, is_independent=False)

    # Stage 1: MG94GTR fit.
    init_model = MG94GTR(optimise_motif_probs=True)
    init_lf = init_model.makeLikelihoodFunction(tree)
    init_lf.setAlignment(aln)
    # NOTE(review): the extent of this with-block was reconstructed from a
    # whitespace-collapsed source; confirm it wraps only the loop.
    with init_lf.updatesPostponed():
        for param in init_lf.getParamNames():
            if '/' in param:
                init_lf.setParamRule(param, **sp_kw)
    init_lf.setParamRule('length', edges=rooted_edges, is_independent=False)
    init_lf.optimise(local=True, show_progress=False, limit_action='raise')
    init_lf = nest.deflate_likelihood_function(init_lf, save_jsd=False)

    # Stage 2: GNC fit, populated from the stage 1 result.
    final_model = GNC(optimise_motif_probs=True)
    lf = final_model.makeLikelihoodFunction(tree)
    lf.setAlignment(aln)
    _populate_parameters(lf, init_lf, **sp_kw)
    for param in lf.getParamNames():
        if '>' in param or param == 'omega':
            lf.setParamRule(param, edges=rooted_edges, is_independent=False)
    lf.optimise(local=True, show_progress=False, limit_action='raise')

    flat_lf = nest.deflate_likelihood_function(lf)
    flat_lf['hard_up'] = _is_hard_up(lf)

    return dict(lf=flat_lf, gc=code.Name, rooted_edges=rooted_edges)
def test_gapped_CNFGTR():
    """Fit gapped CNFGTR, re-inflate it and check its matrices"""
    aln_path = os.path.join(get_data_dir(), 'ENSG00000100393.fasta.gz')
    aln = get_aln(aln_path, codon_position=-1, filter_gaps=False)
    tree = LoadTree(treestring='(Human,Mouse,Opossum);')
    doc = {'aln': str(aln), 'tree': str(tree)}

    cnfgtr_result = gapped.ml(doc, model='CNFGTR', model_gaps=True,
                              omega_indep=False, indel_indep=False)

    def model():
        return gapped.CNFGTR(optimise_motif_probs=True, model_gaps=True)

    cnfgtr = gapped.inflate_likelihood_function(cnfgtr_result['lf'], model)

    # The root motif probabilities are unchanged by the Human edge's P.
    pi = cnfgtr.getMotifProbsByNode()['root'].asarray()
    P = cnfgtr.getPsubForEdge('Human')
    assert_almost_equal(pi.dot(P), pi)

    # The value is not used below; the lookup is kept from the original.
    omega = cnfgtr.getParamValue('omega')

    pi = cnfgtr.getMotifProbs()
    Q = cnfgtr.getRateMatrixForEdge('Human')

    def conditional(codon):
        # Probability of codon relative to all four 'CC?' codons.
        return pi[codon] / sum(pi['CC' + c] for c in 'ACGT')

    ref_cell = Q['CCT']['CCG'] / conditional('CCG')

    assert_almost_equal(Q['CCA']['CCC'] / conditional('CCC') / ref_cell,
                        cnfgtr.getParamValue('A/C'))
    assert_almost_equal(Q['---']['CCC'] / pi['CCC'] / ref_cell,
                        cnfgtr.getParamValue('indel'))

    # Q divided by pi should be symmetric.
    R = Q.asarray() / pi.asarray()
    assert_almost_equal(R.T, R)
def ml(doc, model='NG', gc=None, omega_indep=True, model_gaps=False,
       indel_indep=True, **kw):
    """Fit ``model`` to the alignment and tree held in ``doc``.

    doc: mapping with 'aln' and 'tree' text entries.
    model: model name passed to ``_fit``; terminal stop codons are
        trimmed for every model except 'NG'.
    gc: genetic code identifier passed to ``get_genetic_code``.
    model_gaps: when true, '-' is allowed in the retained columns.

    Returns a dict with the fitted 'lf', the 'time' reported by ``_fit``
    and the settings used.
    """
    aln = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))

    code = get_genetic_code(gc)
    if model != 'NG':
        # Trim terminal stop codons
        aln = aln.withoutTerminalStopCodons(code)

    # NOTE(review): the nesting was reconstructed from a
    # whitespace-collapsed source; confirm that filtering is meant to run
    # for every model, including 'NG'.
    def filt(x):
        if model_gaps:
            allowed = set(DNA).union({'-'})
        else:
            allowed = set(DNA)
        return set(''.join(x)) <= allowed

    aln = aln.filtered(filt, motif_length=3)

    flat_lf, time = _fit(aln, tree, model, code, omega_indep, model_gaps,
                         indel_indep)

    return dict(lf=flat_lf, time=time, model=model, gc=code.Name,
                omega_indep=omega_indep, model_gaps=model_gaps,
                indel_indep=indel_indep)
def __init__(self, TreePath, NeedsToBeCogentModded):
    """Load the tree at ``TreePath`` and prepare it for FastML.

    TreePath: path of the tree file.
    NeedsToBeCogentModded: when true, the file is first passed through
        ``fixUpFileForCogent`` and the fixed-up file is loaded instead.

    No exception is raised on failure. ``self.Parsed`` is set to False
    and the exception is kept on ``self.ParseError`` (None on success).
    """
    # used to determine if the full analysis can be conducted
    self.Parsed = True
    # Keeps the cause of a failed parse, which was previously discarded.
    self.ParseError = None
    try:
        self.TreePath = TreePath
        self.NeedsToBeCogentModded = NeedsToBeCogentModded
        self.CogentTree = None

        # if the internal nodes need to be renamed, then it is done
        # according to the "FixUpFileForCogent" method
        if self.NeedsToBeCogentModded:
            cogentFixUp = fixUpFileForCogent(self.TreePath)
            self.CogentTreeFile = cogentFixUp[0]
            self.CogentInputTreeString = cogentFixUp[1]
            self.CogentTree = LoadTree(self.CogentTreeFile.name)
        else:
            self.CogentTree = LoadTree(self.TreePath)

        # prepares an input string for FastML
        self.FastMLInputTreeString = self.FixUpFileForFastML(
            self.CogentTree)

        # executes method to fully parse tree, then sets all returned
        # variables as class variables
        CogentNodesLeavesBranches = completeNodesLeavesBranches(
            self.CogentTree)
        self.NodeKey_L = CogentNodesLeavesBranches['NodeKey_L']
        self.LeafKey_L = CogentNodesLeavesBranches['LeafKey_L']
        self.UpperKey_L = CogentNodesLeavesBranches['UpperKey_L']
        self.TopKey = CogentNodesLeavesBranches['TopKey']
        self.BranchKey_L = CogentNodesLeavesBranches['BranchKey_L']
        self.Nodes_D = CogentNodesLeavesBranches['Nodes_D']

        # executes quick run of FastML to get FastML's naming convention
        # of internal nodes
        self.FastMLOutputTreeString = executeFastML(
            self.getTempFASTAFile(), self.FastMLInputTreeString, True)

        # prepares the FastMLToOriginalMatchedNodes_D
        self.MatchNodes()
    except Exception as e:
        # Deliberate best-effort: callers check Parsed instead of
        # catching. The exception is stored so the cause can be inspected.
        self.Parsed = False
        self.ParseError = e
def setUp(self):
    """Small-tree fixture: name prefix, tip names, newick strings, tree."""
    self.name = 'small tree - '
    self.otu_names = sorted(['NineBande', 'Mouse', 'HowlerMon', 'DogFaced'])

    # The full tree, the same tree in sorted order, and the tree reduced
    # to the tips listed in otu_names.
    self.newick = '(((Human,HowlerMon),Mouse),NineBande,DogFaced);'
    self.newick_sorted = '(DogFaced,((HowlerMon,Human),Mouse),NineBande);'
    self.newick_reduced = '((HowlerMon,Mouse),NineBande,DogFaced);'

    self.tree = LoadTree(treestring=self.newick)
def test_setConstantLengths(self):
    """constant 'length' rule keeps the lengths given in the tree string"""
    t = LoadTree(treestring='((a:1,b:2):3,(c:4,d:5):6,e:7);')
    lf = self.model.makeLikelihoodFunction(t)
    lf.setParamRule('length', is_const=True)
    lf.setAlignment(self.al)

    # Spot-check two tips against the lengths in the tree string.
    for edge_name, expected_length in (('b', 2), ('d', 5)):
        self.assertEqual(lf.getParamValue('length', edge_name),
                         expected_length)
def build_tree(tree_string, bl1, bl2, r):
    """build a PyCogent tree object from a string and branch lengths

    ``tree_string`` is a %-format template with six slots, filled in
    order with bl1, bl2, r/2, bl1, bl2, r/2.
    """
    # we use r/2.0 because PyCogent defaults to adding a branch of
    # length 1 if you don't explicitly specify it
    # having 2 branches of r/2.0 keeps our internal branch at r
    half_r = r / 2.0
    filled = tree_string % (bl1, bl2, half_r, bl1, bl2, half_r)
    return LoadTree(treestring=filled)
def test_simulateAlignment2(self):
    """Simulate alignment with dinucleotide model"""
    sequences = {'a': 'ggaatt', 'c': 'cctaat'}
    al = LoadSeqs(data=sequences)
    t = LoadTree(treestring="(a,c);")

    sm = substitution_model.Dinucleotide(mprob_model='tuple')
    lf = sm.makeParamController(t)
    lf.setAlignment(al)

    # The simulated alignment has the same length as the input sequences.
    simalign = lf.simulateAlignment()
    self.assertEqual(len(simalign), 6)
def test_getsubtree(self):
    """testing getting a subtree"""
    subtree = self.tree.unrooted().getSubTree(self.otu_names)
    new_tree = LoadTree(treestring=self.newick_reduced).unrooted()

    # Both trees must have the same number of children at the top node
    # and the same string form.
    self.assertEqual(len(subtree.Children), len(new_tree.Children))
    self.assertEqual(str(subtree), str(new_tree))
def different_tree_simulate_alignment(tree_information_list, all_trees):
    """Simulate and join alignments whose sites follow different trees.

    tree_information_list: list of six-item entries, for example
        [[p1, q1, r1, s1, t1, tree1], [p1, q1, r1, s1, t1, tree2]].
        The last item is an index into ``all_trees``:
        (a,b),(c,d) --> 0, (a,c),(b,d) --> 1, (a,d),(b,c) --> 2.
    all_trees: tree string templates, also passed through to
        ``simulate_alignment_treefixed``.

    Returns ``(aln, true_tree)``. ``aln`` is the per-entry alignments
    added together in order. ``true_tree`` is built from the entry with
    the largest item at position 3 (the original author describes this as
    the longest alignment); the first such entry wins ties.
    """
    # One simulated alignment per entry.
    alnlist = []
    for info in tree_information_list:
        alnlist.append(
            simulate_alignment_treefixed(all_trees, info[0], info[1],
                                         info[2], info[3], info[4],
                                         info[5]))

    # Join them in order.
    aln = alnlist[0]
    for extra in alnlist[1:]:
        aln = aln + extra

    # Find the entry that defines the true tree.
    index = 0
    for position, info in enumerate(tree_information_list):
        if info[3] > tree_information_list[index][3]:
            index = position
    chosen = tree_information_list[index]

    # Build the true tree from that entry's template and branch lengths.
    tree_string = all_trees[chosen[5]]
    true_tree_bl = tree_string % (chosen[0], chosen[1], chosen[2] / 2.0,
                                  chosen[0], chosen[1], chosen[2] / 2.0)
    true_tree = LoadTree(treestring=true_tree_bl)
    return (aln, true_tree)
def test_distribution():
    """distribution should return empirical distribution for DNA sequence"""
    al = get_aln('General', 1031).takeSeqs(('Mouse', ))
    distribution = jsd.distribution(al.getSeq('Mouse'))

    # Reference values: motif probabilities a GTR likelihood function
    # takes from the same single-sequence alignment.
    lf = GTR().makeLikelihoodFunction(LoadTree(tip_names=('Mouse', )))
    lf.setMotifProbsFromData(al)
    probs = lf.getMotifProbs()

    assert_array_almost_equal(array(probs), array(distribution))
def use_root_seq(root_sequence):
    """Simulate with a given root sequence and check it is reproduced.

    ``self`` is not a parameter here; it is taken from the enclosing scope.
    """
    sequences = {'a': 'ggaatt', 'c': 'cctaat'}
    al = LoadSeqs(data=sequences)
    t = LoadTree(treestring="(a,c);")

    sm = substitution_model.Dinucleotide(mprob_model='tuple')
    lf = sm.makeParamController(t)
    lf.setAlignment(al)

    # Internal nodes are kept so that the root sequence can be read back.
    simalign = lf.simulateAlignment(exclude_internal=False,
                                    root_sequence=root_sequence)
    root = simalign.NamedSeqs['root']
    self.assertEqual(str(root), str(root_sequence))
def setUp(self):
    """Load the brca1_5 alignment and tree and build the model."""
    self.submodel = Nucleotide(
        do_scaling=True,
        model_gaps=False,
        equal_motif_probs=True,
        predicates={'beta': 'transition'},
    )

    # Both data files are found under data_path.
    alignment_path = os.path.join(data_path, 'brca1_5.paml')
    tree_path = os.path.join(data_path, 'brca1_5.tree')

    self.data = LoadSeqs(filename=alignment_path,
                         moltype=self.submodel.MolType)
    self.tree = LoadTree(filename=tree_path)
def setUp(self):
    """Big-tree fixture: name prefix, tip names, newick strings, tree."""
    self.name = 'big tree - '
    self.otu_names = sorted([
        'Horse', 'TombBat', 'Rhino', 'Pig', 'AsianElep', 'SpermWhal',
        'Cat', 'Gorilla', 'Orangutan', 'bandicoot', 'Hedgehog', 'Sloth',
        'HairyArma', 'Manatee', 'GoldenMol', 'Pangolin',
    ])

    # The full tree, and the tree reduced to the tips listed in otu_names.
    self.newick = '((((((((FlyingFox,DogFaced),((FreeTaile,LittleBro),(TombBat,RoundEare))),(FalseVamp,LeafNose)),(((Horse,Rhino),(Pangolin,(Cat,Dog))),(Llama,(Pig,(Cow,(Hippo,(SpermWhal,HumpbackW))))))),(Mole,Hedgehog)),(TreeShrew,(FlyingLem,((Jackrabbit,(FlyingSqu,(OldWorld,(Mouse,Rat)))),(Galago,(HowlerMon,(Rhesus,(Orangutan,(Gorilla,(Human,Chimpanzee)))))))))),(((NineBande,HairyArma),(Anteater,Sloth)),(((Dugong,Manatee),((AfricanEl,AsianElep),(RockHyrax,TreeHyrax))),(Aardvark,((GoldenMol,(Madagascar,Tenrec)),(LesserEle,GiantElep)))))),(caenolest,(phascogale,(wombat,bandicoot))));'
    self.newick_reduced = '(((((TombBat,(((Horse,Rhino),(Pangolin,Cat)),(Pig,SpermWhal))),Hedgehog),(Orangutan,Gorilla)),((HairyArma,Sloth),((Manatee,AsianElep),GoldenMol))),bandicoot);'

    self.tree = LoadTree(treestring=self.newick)
def ml(doc, model='GNC', gc=None, outgroup=None, neutral=None, **kw):
    """Fit ``model`` to the alignment and tree held in ``doc``.

    doc: mapping with 'aln' and 'tree' text entries.
    gc: genetic code identifier passed to ``get_genetic_code``.
    outgroup, neutral: passed through to ``_fit`` unchanged.

    Returns a dict with the fitted 'lf', the 'time' reported by ``_fit``,
    the 'model' name and the genetic code name ('gc').
    """
    aln = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))

    code = get_genetic_code(gc)

    # Trim terminal stop codons
    aln = aln.withoutTerminalStopCodons(code)

    def only_dna(x):
        return set(''.join(x)) <= set(DNA)

    aln = aln.filtered(only_dna, motif_length=3)

    flat_lf, time = _fit(aln, tree, model, code, outgroup, neutral)
    return dict(lf=flat_lf, time=time, model=model, gc=code.Name)
def test_distribution():
    """distribution should return empirical distribution for DNA sequence"""
    fasta_path = os.path.join(get_data_dir(), 'General_1031.fasta.gz')
    with GzipFile(fasta_path) as ff:
        data = ff.read()

    al = Alignment(data=data).takeSeqs(('Mouse', ))
    distribution = jsd.distribution(al.getSeq('Mouse'))

    # Reference values: motif probabilities a GTR likelihood function
    # takes from the same single-sequence alignment.
    lf = GTR().makeLikelihoodFunction(LoadTree(tip_names=('Mouse', )))
    lf.setMotifProbsFromData(al)
    probs = lf.getMotifProbs()

    assert_array_almost_equal(array(probs), array(distribution))
def test_pairwise_clock(self):
    """a local clock on a and b gives the two edges equal lengths"""
    al = LoadSeqs(data={'a': 'agct', 'b': 'ggct'})
    tree = LoadTree(treestring='(a,b);')
    model = cogent.evolve.substitution_model.Dinucleotide(
        do_scaling=True,
        equal_motif_probs=True,
        model_gaps=True,
        mprob_model='tuple',
    )

    lf = model.makeLikelihoodFunction(tree)
    lf.setLocalClock('a', 'b')
    lf.setAlignment(al)
    lf.optimise(local=True)

    lengths = lf.getParamValueDict(['edge'], params=['length'])['length']
    self.assertAlmostEqual(lf.getLogLikelihood(), -10.1774488956)
    self.assertEqual(lengths['a'], lengths['b'])
def main():
    """Run a BayesTraits reconstruction for the tree given on the command line.

    Reads ``opts.input_tree`` and ``opts.input_trait_data``, writes the
    generated commands to ./bayestraits_commands.txt, runs BayesTraits,
    and writes the parsed reconstruction to
    ./bayestrait_reconstruction.trait_table. Progress and timing are
    printed to standard output.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    start_time = datetime.now()

    t = LoadTree(opts.input_tree)

    # Map every tip name to its position in the tip iteration.
    translation_dict = {}
    for i, tip in enumerate(t.iterTips()):
        translation_dict[tip.Name] = i

    single_rate = False

    # Generate commands telling BayesTraits which nodes to reconstruct
    bayestraits_commands = make_bayestraits_script(
        t, translation_dict, comments=False, single_rate=single_rate)

    # TODO: make this dynamic
    # Temporarily assuming there is a nexus file available
    nexus_fp = opts.input_tree.rsplit(".", 1)[0] + ".nexus"
    command_fp = "./bayestraits_commands.txt"
    outfile = "./bayestrait_reconstruction.trait_table"

    # The original reopened this file for reading and never used or
    # closed that handle. BayesTraits is given the path, so the reopen
    # has been dropped.
    with open(command_fp, "w") as command_file:
        command_file.writelines(bayestraits_commands)

    bayestraits = BayesTraits()
    bayestraits_result = bayestraits(
        data=(nexus_fp, opts.input_trait_data, command_fp))

    try:
        # Single-argument print calls give the same output under Python 2
        # and Python 3.
        print("StdErr: %s" % (bayestraits_result["StdErr"].read(),))
        print("Return code: %s" % (bayestraits_result["ExitStatus"],))

        results = parse_reconstruction_output(
            bayestraits_result['StdOut'].readlines())

        # Reconstruction results
        with open(outfile, "w") as f:
            f.writelines(results)

        end_time = datetime.now()
        print("Start time: %s" % (start_time,))
        print("End time: %s" % (end_time,))
        print("Time to reconstruct: %s" % (end_time - start_time,))
    finally:
        # Clean up even if parsing or writing the results fails.
        bayestraits_result.cleanUp()