Example #1
0
class TestTree(unittest.TestCase):
    """Checks run against one tree fixture built in setUp."""

    def setUp(self):
        """Store the small tree, its newick variants and the sorted names."""
        self.name = 'small tree - '
        self.otu_names = sorted(['NineBande', 'Mouse', 'HowlerMon',
                                 'DogFaced'])
        self.newick = '(((Human,HowlerMon),Mouse),NineBande,DogFaced);'
        self.newick_sorted = '(DogFaced,((HowlerMon,Human),Mouse),NineBande);'
        self.newick_reduced = '((HowlerMon,Mouse),NineBande,DogFaced);'
        self.tree = LoadTree(treestring=self.newick)

    def test_sorttree(self):
        """sorted() is always called; its newick is compared only when the
        fixture defines newick_sorted"""
        ordered = self.tree.sorted()
        if not hasattr(self, 'newick_sorted'):
            return
        observed = ordered.getNewick(with_distances=0)
        self.assertEqual(self.newick_sorted, observed)

    def test_getsubtree(self):
        """the subtree on otu_names equals the tree loaded from
        newick_reduced (both unrooted)"""
        observed = self.tree.unrooted().getSubTree(self.otu_names)
        expected = LoadTree(treestring=self.newick_reduced).unrooted()

        # same number of children at the root, then same string form
        self.assertEqual(len(observed.Children), len(expected.Children))
        self.assertEqual(str(observed), str(expected))

    def test_ascii(self):
        """asciiArt runs on the fixture and on a parsed tree for several
        show_internal / compact combinations"""
        self.tree.asciiArt()
        # unlabeled internal node
        parsed = DndParser("(B:0.2,(C:0.3,D:0.4):0.6)F;")
        for show_internal, compact in ((True, False),
                                       (True, True),
                                       (False, False)):
            parsed.asciiArt(show_internal=show_internal, compact=compact)
Example #2
0
def get_tree(filename):
    """Load a tree file and return its name together with its string form.

    The tree name is the file's base name with the last extension removed.
    Every edge returned by getEdgeVector() is flagged NameLoaded and has
    each '.' in its name replaced by '_' before the tree is stringified.

    Returns a dict with the keys 'treename' and 'treestring'.
    """
    loaded = LoadTree(filename)
    base = os.path.basename(filename)
    stem = base.rsplit('.', 1)[0]
    for node in loaded.getEdgeVector():
        node.NameLoaded = True
        node.Name = node.Name.replace('.', '_')
    return dict(treename=stem, treestring=str(loaded))
Example #3
0
class BigTreeSingleTests(TestTree):
    """Run the single-tree tests inherited from TestTree on the big tree."""

    def setUp(self):
        """Install the big-tree fixture (no newick_sorted is defined here)."""
        self.name = 'big tree - '
        self.otu_names = ['Horse', 'TombBat', 'Rhino', 'Pig', 'AsianElep',
                          'SpermWhal', 'Cat', 'Gorilla', 'Orangutan',
                          'bandicoot', 'Hedgehog', 'Sloth', 'HairyArma',
                          'Manatee', 'GoldenMol', 'Pangolin']
        self.otu_names.sort()
        self.newick = '((((((((FlyingFox,DogFaced),((FreeTaile,LittleBro),(TombBat,RoundEare))),(FalseVamp,LeafNose)),(((Horse,Rhino),(Pangolin,(Cat,Dog))),(Llama,(Pig,(Cow,(Hippo,(SpermWhal,HumpbackW))))))),(Mole,Hedgehog)),(TreeShrew,(FlyingLem,((Jackrabbit,(FlyingSqu,(OldWorld,(Mouse,Rat)))),(Galago,(HowlerMon,(Rhesus,(Orangutan,(Gorilla,(Human,Chimpanzee)))))))))),(((NineBande,HairyArma),(Anteater,Sloth)),(((Dugong,Manatee),((AfricanEl,AsianElep),(RockHyrax,TreeHyrax))),(Aardvark,((GoldenMol,(Madagascar,Tenrec)),(LesserEle,GiantElep)))))),(caenolest,(phascogale,(wombat,bandicoot))));'
        self.newick_reduced = '(((((TombBat,(((Horse,Rhino),(Pangolin,Cat)),(Pig,SpermWhal))),Hedgehog),(Orangutan,Gorilla)),((HairyArma,Sloth),((Manatee,AsianElep),GoldenMol))),bandicoot);'
        self.tree = LoadTree(treestring=self.newick)

    def test_getEdgeNames(self):
        """testing (well, exercising at least), getedgenames"""
        # Fell over on small tree because "stem descended from root
        # joiner was a tip"
        a, b = self.otu_names[:2]
        # The call is only exercised; its result is not inspected, so it is
        # not bound to a name.
        self.tree.getEdgeNames(a, b, True, False)

    def test_getTipNames(self):
        """getTipNames returns all 55 tips of the big tree"""
        tips = self.tree.getTipNames()
        self.assertEqual(len(tips), 55)
Example #4
0
 def test_getsetParamValue(self):
     """test getting, setting of param values"""
     t = LoadTree(treestring='((((a:.2,b:.3)ab:.1,c:.3)abc:.4),d:.6)')
     # The third positional argument of assertEqual is the failure message,
     # so the trailing 2 and 4 were never applied as a precision.
     # assertAlmostEqual takes the number of decimal places in that slot.
     self.assertAlmostEqual(t.getParamValue('length', 'ab'), 0.1, 2)
     t.setParamValue('zz', 'ab', 4.321)
     node = t.getNodeMatchingName('ab')
     self.assertAlmostEqual(4.321, node.params['zz'], 4)
Example #5
0
 def test_making_from_list(self):
     """a tree built from tip_names reports the same names back"""
     expected = sorted(['a_b', 'a b', "T'lk"])
     tree = LoadTree(tip_names=expected)
     observed = sorted(tree.getTipNames())
     assert observed == expected
Example #6
0
 def test_balanced(self):
     """balanced() of the first tree has the same string form as the second"""
     unbalanced_newick = '((a,b),((c1,(c2,(c3,(c4,(c5,(c6,c7)))))),(d,e)),f)'
     balanced_newick = '(c1,(c2,(c3,(c4,(c5,(c6,c7))))),((d,e),((a,b),f)))'
     source = LoadTree(treestring=unbalanced_newick)
     expected = LoadTree(treestring=balanced_newick)
     self.assertEqual(str(source.balanced()), str(expected))
Example #7
0
 def test_making_from_list(self):
     """tip names with spaces, underscores and quotes survive LoadTree"""
     names = ['a_b', 'a b', "T'lk"]
     names.sort()
     tips = sorted(LoadTree(tip_names=names).getTipNames())
     assert tips == names
Example #8
0
 def MatchNodes(self):
     """Fill self.FastMLToOriginalMatchedNodes_D.

     Nodes are matched through a "termini string": the names of all tips
     under a node, sorted and joined with '-'. A FastML node with no tips
     under it is mapped to its own name.
     """
     #performs the correction on the output string if necessary
     self.correctForFastMLNameChanges()

     #termini string -> original node key, for every key in UpperKey_L
     OriginalByTermini_D = dict(
         ('-'.join(sorted(self.Nodes_D[Key]['terminal'])), Key)
         for Key in self.UpperKey_L)

     #prepares a cogent tree object for the fastML output
     FH = getInputTempFile(self.FastMLOutputTreeString)
     FastMLCogentTree = LoadTree(FH.name)

     Matched_D = self.FastMLToOriginalMatchedNodes_D = {}

     for FastMLName in FastMLCogentTree.getNodeNames():
         FastMLNode = FastMLCogentTree.getNodeMatchingName(FastMLName)
         Termini_L = sorted(tip.Name for tip in FastMLNode.iterTips())

         if not Termini_L:
             #no termini underneath: the node is itself a terminus and keeps
             #the same name in FastML and Cogent
             Matched_D[FastMLName] = FastMLName
             continue

         #same termini string as the equivalent original node
         Matched_D[FastMLName] = OriginalByTermini_D['-'.join(Termini_L)]
Example #9
0
 def _test_tree(self, method, treestring):
     """Rebuild a tree from its distances and compare the distances.

     ``method`` is called with the distances of the tree loaded from
     ``treestring``; every original distance must almost-equal the distance
     under the same key in the tree that ``method`` returns.
     """
     expected = LoadTree(treestring=treestring).getDistances()
     rebuilt = method(expected)
     observed = rebuilt.getDistances()
     for pair in expected:
         self.assertAlmostEqual(expected[pair], observed[pair])
Example #10
0
class BigTreeSingleTests(TestTree):
    """using the big-tree for single-tree tests"""

    def setUp(self):
        """Store the big tree, its reduced newick and the sorted names."""
        self.name = 'big tree - '
        self.otu_names = [
            'Horse', 'TombBat', 'Rhino', 'Pig', 'AsianElep', 'SpermWhal',
            'Cat', 'Gorilla', 'Orangutan', 'bandicoot', 'Hedgehog', 'Sloth',
            'HairyArma', 'Manatee', 'GoldenMol', 'Pangolin'
        ]
        self.otu_names.sort()
        self.newick = '((((((((FlyingFox,DogFaced),((FreeTaile,LittleBro),(TombBat,RoundEare))),(FalseVamp,LeafNose)),(((Horse,Rhino),(Pangolin,(Cat,Dog))),(Llama,(Pig,(Cow,(Hippo,(SpermWhal,HumpbackW))))))),(Mole,Hedgehog)),(TreeShrew,(FlyingLem,((Jackrabbit,(FlyingSqu,(OldWorld,(Mouse,Rat)))),(Galago,(HowlerMon,(Rhesus,(Orangutan,(Gorilla,(Human,Chimpanzee)))))))))),(((NineBande,HairyArma),(Anteater,Sloth)),(((Dugong,Manatee),((AfricanEl,AsianElep),(RockHyrax,TreeHyrax))),(Aardvark,((GoldenMol,(Madagascar,Tenrec)),(LesserEle,GiantElep)))))),(caenolest,(phascogale,(wombat,bandicoot))));'
        self.newick_reduced = '(((((TombBat,(((Horse,Rhino),(Pangolin,Cat)),(Pig,SpermWhal))),Hedgehog),(Orangutan,Gorilla)),((HairyArma,Sloth),((Manatee,AsianElep),GoldenMol))),bandicoot);'
        self.tree = LoadTree(treestring=self.newick)

    def test_getEdgeNames(self):
        """testing (well, exercising at least), getedgenames"""
        # Fell over on small tree because "stem descended from root
        # joiner was a tip"
        a, b = self.otu_names[:2]
        # Result deliberately discarded: the test only checks the call runs.
        self.tree.getEdgeNames(a, b, True, False)

    def test_getTipNames(self):
        """the big tree reports 55 tip names"""
        # The first two OTU names were unpacked here but never used.
        tips = self.tree.getTipNames()
        self.assertEqual(len(tips), 55)
Example #11
0
def ml4(aln, true_tree):
    """Report whether the best-scoring 4-taxon topology matches true_tree.

    Each of the three unrooted topologies on the taxa a, b, c, d is given a
    JC69 likelihood function, set to ``aln`` and optimised with
    ``local=True``. The topology with the highest log-likelihood (the first
    one listed, on a tie) is compared to ``true_tree`` with sameTopology,
    and that result is returned.
    """
    # every possible unrooted arrangement of the four taxa
    topologies = ('((a,b),(c,d))', '((a,c),(b,d))', '((a,d),(b,c))')
    candidates = [LoadTree(treestring=newick) for newick in topologies]

    model = JC69()

    def fitted_log_likelihood(tree):
        lf = model.makeLikelihoodFunction(tree)
        lf.setAlignment(aln)
        lf.optimise(local=True)
        return lf.getLogLikelihood()

    scores = [fitted_log_likelihood(tree) for tree in candidates]

    # pick the best-scoring tree and compare it to the true one
    best = candidates[scores.index(max(scores))]
    return best.sameTopology(true_tree)
Example #12
0
 def setUp(self):
     """Store the small tree, its newick variants and the sorted names."""
     self.name = 'small tree - '
     self.otu_names = sorted(['NineBande', 'Mouse', 'HowlerMon', 'DogFaced'])
     self.newick = '(((Human,HowlerMon),Mouse),NineBande,DogFaced);'
     self.newick_sorted = '(DogFaced,((HowlerMon,Human),Mouse),NineBande);'
     self.newick_reduced = '((HowlerMon,Mouse),NineBande,DogFaced);'
     self.tree = LoadTree(treestring=self.newick)
Example #13
0
 def test_sameShape(self):
     """sameTopology holds for the first two trees and fails against the
     third"""
     newicks = ("(((s1,s5),s3),s2,s4);",
                "((s1,s5),(s2,s4),s3);",
                "((s1,s4),(s2,s5),s3);")
     first, second, third = [LoadTree(treestring=nwk) for nwk in newicks]
     assert first.sameTopology(second), (first, second)
     assert not first.sameTopology(third), (first, third)
     assert not second.sameTopology(third), (second, third)
Example #14
0
def inflate_likelihood_function(data, model=None):
    """Rebuild a likelihood function from its flattened dict form.

    ``data`` is read for the keys 'name', 'with_rate', 'mprobs', 'params'
    and either 'tree' or 'tip_names'; 'dependencies' is optional. When
    ``model`` is given it is called with no arguments to produce the
    substitution model; otherwise the model is chosen from data['name'].

    Raises NotImplementedError for an unrecognised model name.
    """
    supported_subs_models = ('GeneralStationary', 'General',
        'DiscreteSubstitutionModel', 'General_with_gaps')
    gamma_msg = ' plus Gamma not supported'

    if model is not None:
        model = model()
    elif data['name'] == 'GTR':
        gtr_kwargs = dict(optimise_motif_probs=True)
        if data['with_rate']:
            gtr_kwargs.update(with_rate=True, distribution='gamma')
        model = GTR(**gtr_kwargs)
    elif data['name'] == 'General_with_gaps':
        assert not data['with_rate'], data['name'] + gamma_msg
        model = General(DNA.Alphabet, optimise_motif_probs=True,
                model_gaps=True, recode_gaps=False, name='General_with_gaps')
    elif data['name'] in supported_subs_models:
        assert not data['with_rate'], data['name'] + gamma_msg
        # NOTE: eval on a name taken from ``data``; it only runs after the
        # membership check against supported_subs_models above.
        model_class = eval(data['name'])
        model = model_class(DNA.Alphabet, optimise_motif_probs=True,
                model_gaps=False, recode_gaps=True, name=data['name'])
    else:
        st = 'inflate_likelihood_function: unsupported model ' + data['name']
        raise NotImplementedError(st)

    if 'tree' in data:
        tree = LoadTree(treestring=data['tree'].encode('utf-8'))
    else:
        encoded_names = [name.encode('utf-8') for name in data['tip_names']]
        tree = LoadTree(tip_names=encoded_names)

    # a rate distribution gets four bins
    lf_kwargs = {'bins': 4} if data['with_rate'] else {}
    lf = model.makeLikelihoodFunction(tree, **lf_kwargs)

    with lf.updatesPostponed():
        lf.setMotifProbs(data['mprobs'])
        params = data['params']
        for param in params:
            dims = lf.defn_for[param].valid_dimensions
            per_edge = 'edge' in dims
            per_bin = 'bin' in dims
            value = params[param]
            if len(dims) == 0:
                lf.setParamRule(param, init=value)
            elif per_edge and per_bin:
                for edge, by_bin in value.items():
                    for bin_name, init in by_bin.items():
                        lf.setParamRule(param, edge=edge, bin=bin_name,
                                init=init)
            elif per_edge:
                for edge, init in value.items():
                    lf.setParamRule(param, edge=edge, init=init)
            elif per_bin:
                for bin_name, init in value.items():
                    lf.setParamRule(param, bin=bin_name, init=init)

        if 'dependencies' in data:
            for param, scopes in data['dependencies'].items():
                for scope in scopes:
                    lf.setParamRule(param, is_independent=False, **scope)

    return lf
Example #15
0
 def test_params_merge(self):
     """getSubTree merges edge params and rejects unknown tip names unless
     ignore_missing is set"""
     t = LoadTree(treestring='((((a,b)ab,c)abc),d)')
     assigned = (('a', 1, 20), ('b', 3, 2.0), ('ab', 4, 5.0))
     for label, length, beta in assigned:
         node = t.getNodeMatchingName(label)
         node.params = {'length': length, 'beta': beta}

     t = t.getSubTree(['b', 'c', 'd'])
     expected_b = {'length': 7, 'beta': float(2 * 3 + 4 * 5) / (3 + 4)}
     self.assertEqual(t.getNodeMatchingName('b').params, expected_b)

     with_unknown = ['b', 'c', 'xxx']
     self.assertRaises(ValueError, t.getSubTree, with_unknown)
     tolerant = t.getSubTree(with_unknown, ignore_missing=True)
     self.assertEqual(str(tolerant), '(b:7,c)root;')
Example #16
0
    def setUp(self):
        """Create the substitution model, then load the brca1_5 alignment
        (with the model's MolType) and the brca1_5 tree from data_path."""
        model_kwargs = dict(do_scaling=True,
                            model_gaps=False,
                            equal_motif_probs=True,
                            predicates={'beta': 'transition'})
        self.submodel = Nucleotide(**model_kwargs)

        aln_path = os.path.join(data_path, 'brca1_5.paml')
        self.data = LoadSeqs(filename=aln_path,
                             moltype=self.submodel.MolType)

        tree_path = os.path.join(data_path, 'brca1_5.tree')
        self.tree = LoadTree(filename=tree_path)
Example #17
0
 def setUp(self):
     """Store the big tree, its reduced newick and the sorted names."""
     self.name = 'big tree - '
     self.otu_names = sorted([
         'Horse', 'TombBat', 'Rhino', 'Pig', 'AsianElep', 'SpermWhal',
         'Cat', 'Gorilla', 'Orangutan', 'bandicoot', 'Hedgehog', 'Sloth',
         'HairyArma', 'Manatee', 'GoldenMol', 'Pangolin'
     ])
     self.newick = '((((((((FlyingFox,DogFaced),((FreeTaile,LittleBro),(TombBat,RoundEare))),(FalseVamp,LeafNose)),(((Horse,Rhino),(Pangolin,(Cat,Dog))),(Llama,(Pig,(Cow,(Hippo,(SpermWhal,HumpbackW))))))),(Mole,Hedgehog)),(TreeShrew,(FlyingLem,((Jackrabbit,(FlyingSqu,(OldWorld,(Mouse,Rat)))),(Galago,(HowlerMon,(Rhesus,(Orangutan,(Gorilla,(Human,Chimpanzee)))))))))),(((NineBande,HairyArma),(Anteater,Sloth)),(((Dugong,Manatee),((AfricanEl,AsianElep),(RockHyrax,TreeHyrax))),(Aardvark,((GoldenMol,(Madagascar,Tenrec)),(LesserEle,GiantElep)))))),(caenolest,(phascogale,(wombat,bandicoot))));'
     self.newick_reduced = '(((((TombBat,(((Horse,Rhino),(Pangolin,Cat)),(Pig,SpermWhal))),Hedgehog),(Orangutan,Gorilla)),((HairyArma,Sloth),((Manatee,AsianElep),GoldenMol))),bandicoot);'
     self.tree = LoadTree(treestring=self.newick)
Example #18
0
 def test_limited_wls(self):
     """wls with one or several start trees returns five tips; start trees
     with mismatched or complete name sets raise"""
     start_ac = LoadTree(treestring='((a,c),b,d)')
     result = wls(self.dists, start=start_ac)
     self.assertEqual(len(result.getTipNames()), 5)

     start_ad = LoadTree(treestring='((a,d),b,c)')
     result = wls(self.dists, start=[start_ac, start_ad])
     self.assertEqual(len(result.getTipNames()), 5)

     start_with_e = LoadTree(treestring='((a,d),b,e)')
     mixed_starts = [start_ac, start_with_e]
     self.assertRaises(Exception, wls, self.dists, start=mixed_starts)

     # if start tree has all seq names, should raise an error
     complete_start = [LoadTree(treestring='((a,c),b,(d,e))')]
     self.assertRaises(Exception, wls, self.dists, start=complete_start)
Example #19
0
 def test_getEdgeNamesUseOutgroup(self):
     """the clade joining A and E, with F as outgroup, has the same edge
     names in both trees"""
     newicks = ("((A,B)ab,(F,(C,D)cd)cdf,E)root;",
                # a, e, ogroup f
                "((E,(A,B)ab)abe,F,(C,D)cd)root;")
     expected = ['A', 'B', 'E', 'ab']
     query = dict(getstem=False, getclade=True, outgroup_name="F")
     for tree in [LoadTree(treestring=nwk) for nwk in newicks]:
         observed = sorted(tree.getEdgeNames('A', 'E', **query))
         self.assertEqual(expected, observed)
Example #20
0
 def __init__(self, TreePath, NeedsToBeCogentModded):
     """Load the tree, derive the FastML input and match node names.

     Any exception raised along the way is swallowed and recorded by
     setting self.Parsed to False; attributes assigned before the failure
     stay set.
     """
     #used to determine if the full analysis can be conducted
     self.Parsed = True

     try:
         self.TreePath = TreePath
         self.NeedsToBeCogentModded = NeedsToBeCogentModded
         self.CogentTree = None

         #if the internal nodes need to be renamed, the tree is loaded from
         #the file produced by fixUpFileForCogent instead of TreePath
         if self.NeedsToBeCogentModded:
             FixedUp = fixUpFileForCogent(self.TreePath)
             self.CogentTreeFile = FixedUp[0]
             self.CogentInputTreeString = FixedUp[1]
             TreeSource = self.CogentTreeFile.name
         else:
             TreeSource = self.TreePath
         self.CogentTree = LoadTree(TreeSource)

         #prepares an input string for FastML
         self.FastMLInputTreeString = self.FixUpFileForFastML(self.CogentTree)

         #fully parses the tree, then copies each returned entry onto self
         Parsed_D = completeNodesLeavesBranches(self.CogentTree)
         for Key in ('NodeKey_L', 'LeafKey_L', 'UpperKey_L', 'TopKey',
                     'BranchKey_L', 'Nodes_D'):
             setattr(self, Key, Parsed_D[Key])

         #executes quick run of FastML to get FastML's naming convention of
         #internal nodes
         TempFASTA = self.getTempFASTAFile()
         self.FastMLOutputTreeString = executeFastML(
             TempFASTA, self.FastMLInputTreeString, True)

         #prepares the FastMLToOriginalMatchedNodes_D
         self.MatchNodes()

     except Exception:
         self.Parsed = False
Example #21
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    start_time = datetime.now()

    t = LoadTree(opts.input_tree)
    translation_dict = {}
    for i, tip in enumerate(t.iterTips()):
        translation_dict[tip.Name] = i

    single_rate = False

    #Generate commands telling BayesTraits which nodes to reconstruct
    bayestraits_commands = make_bayestraits_script(t,
                                                   translation_dict,
                                                   comments=False,
                                                   single_rate=single_rate)

    #TODO: make this dynamic
    #Temporarily assuming there is a nexus file available
    nexus_fp = opts.input_tree.rsplit(".", 1)[0] + ".nexus"
    command_fp = "./bayestraits_commands.txt"
    path_to_bayestraits = "../"
    outfile = "./bayestrait_reconstruction.trait_table"
    command_file = open(command_fp, "w+")
    command_file.writelines(bayestraits_commands)
    command_file.close()

    command_file = open(command_fp, "U")

    bayestraits = BayesTraits()
    bayestraits_result = bayestraits(data=(nexus_fp, opts.input_trait_data,
                                           command_fp))
    #print "StdOut:",result["StdOut"].read()
    print "StdErr:", bayestraits_result["StdErr"].read()
    print "Return code:", bayestraits_result["ExitStatus"]

    results = parse_reconstruction_output(
        bayestraits_result['StdOut'].readlines())
    #print "Reconstructions:",results

    #Reconstruction results
    f = open(outfile, "w+")
    f.writelines(results)
    f.close()

    end_time = datetime.now()
    print "Start time:", start_time
    print "End time:", end_time
    print "Time to reconstruct:", end_time - start_time
    bayestraits_result.cleanUp()
Example #22
0
 def test_sameShape(self):
     """test topology assessment"""
     def load(newick):
         return LoadTree(treestring=newick)

     same_a = load("(((s1,s5),s3),s2,s4);")
     same_b = load("((s1,s5),(s2,s4),s3);")
     other = load("((s1,s4),(s2,s5),s3);")
     assert same_a.sameTopology(same_b), (same_a, same_b)
     assert not same_a.sameTopology(other), (same_a, other)
     assert not same_b.sameTopology(other), (same_b, other)
Example #23
0
def ml(doc,
       model='NG',
       gc=None,
       omega_indep=True,
       model_gaps=False,
       indel_indep=True,
       **kw):
    """Fit ``model`` to the alignment and tree held in ``doc``.

    doc['aln'] and doc['tree'] are UTF-8 encoded and loaded as a DNA
    alignment and a tree. For every model except 'NG' the alignment is
    first stripped of terminal stop codons and filtered, in motifs of
    length 3, to columns made only of DNA characters (plus '-' when
    ``model_gaps`` is true).

    Returns a dict with the fitted result under 'lf', the second value
    returned by _fit under 'time', and the settings used.
    """
    aln = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))

    code = get_genetic_code(gc)
    if model != 'NG':
        # Trim terminal stop codons
        aln = aln.withoutTerminalStopCodons(code)

        def filt(column):
            allowed = set(DNA)
            if model_gaps:
                allowed = allowed.union({'-'})
            return set(''.join(column)) <= allowed

        aln = aln.filtered(filt, motif_length=3)

    flat_lf, elapsed = _fit(aln, tree, model, code, omega_indep, model_gaps,
                            indel_indep)
    return dict(lf=flat_lf,
                time=elapsed,
                model=model,
                gc=code.Name,
                omega_indep=omega_indep,
                model_gaps=model_gaps,
                indel_indep=indel_indep)
Example #24
0
    def test_trees(self):
        """draw the same tree with each dendrogram class, collapsing the
        node that connects C and E, then once more with edge A highlighted"""
        treestring = "((A:.1,B:.22)ab:.3,((C:.4,D:.5)cd:.55,E:.6)cde:.7,F:.2)"
        # lengthen each tip name: 'A' becomes 'A' followed by ten 'a's
        for tip in 'ABCDEF':
            long_name = tip + tip.lower() * 10
            treestring = treestring.replace(tip, long_name)
        t = LoadTree(treestring=treestring)

        dendrogram_classes = [
            UnrootedDendrogram,
            SquareDendrogram,
            ContemporaneousDendrogram,
            ShelvedDendrogram,
            #        StraightDendrogram,
            #        ContemporaneousStraightDendrogram
        ]
        for klass in dendrogram_classes:
            dendro = klass(t)
            joint = dendro.getConnectingNode('Ccccccccccc', 'Eeeeeeeeeee')
            joint.setCollapsed(color="green", label="C, D and E")
            do(klass.__name__,
               dendro,
               shade_param="length",
               show_params=["length"])

        def callback(edge):
            if edge.Name.startswith("A"):
                return "red"
            return "blue"

        do("Highlight edge A",
           UnrootedDendrogram(t),
           edge_color_callback=callback)
Example #25
0
File: ml.py Project: HuttleyLab/gnc
def rooted(doc, rooted_edges=None, gc=None, **kw):
    """Fit MG94GTR, then GNC seeded from it, with rules tied on rooted_edges.

    The alignment and tree come from doc['aln'] and doc['tree']. The
    alignment is stripped of terminal stop codons and filtered, in motifs
    of length 3, to DNA-only columns. After the MG94GTR fit is deflated,
    a GNC likelihood function is populated from it, optimised and deflated.

    Returns a dict with the deflated GNC result (including a 'hard_up'
    entry), the genetic code name and ``rooted_edges``.
    """
    aln = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=DNA)
    tree = LoadTree(treestring=doc['tree'].encode('utf-8'))

    code = get_genetic_code(gc)
    aln = aln.withoutTerminalStopCodons(code)

    def dna_only(column):
        return set(''.join(column)) <= set(DNA)

    aln = aln.filtered(dna_only, motif_length=3)

    sp_kw = {'upper': 20., 'lower': 0.05, 'is_independent': False}
    tied = {'edges': rooted_edges, 'is_independent': False}
    opt_kw = {'local': True, 'show_progress': False, 'limit_action': 'raise'}

    start_model = MG94GTR(optimise_motif_probs=True)
    init_lf = start_model.makeLikelihoodFunction(tree)
    init_lf.setAlignment(aln)
    with init_lf.updatesPostponed():
        for param in init_lf.getParamNames():
            if '/' not in param:
                continue
            init_lf.setParamRule(param, **sp_kw)
    init_lf.setParamRule('length', **tied)
    init_lf.optimise(**opt_kw)
    init_lf = nest.deflate_likelihood_function(init_lf, save_jsd=False)

    final_model = GNC(optimise_motif_probs=True)
    lf = final_model.makeLikelihoodFunction(tree)
    lf.setAlignment(aln)
    _populate_parameters(lf, init_lf, **sp_kw)
    for param in lf.getParamNames():
        if param == 'omega' or '>' in param:
            lf.setParamRule(param, **tied)
    lf.optimise(**opt_kw)
    flat_lf = nest.deflate_likelihood_function(lf)
    flat_lf['hard_up'] = _is_hard_up(lf)

    return dict(lf=flat_lf, gc=code.Name, rooted_edges=rooted_edges)
Example #26
0
def MakeCachedObjects(model, tree, seq_length, opt_args):
    """Simulate an alignment and return a lazy, caching fitter.

    An alignment of ``seq_length`` is simulated from ``model`` on ``tree``
    with fixed motif probabilities and stored under 'aln'. The returned
    function ``call(self, obj_name)`` fits the object named ``obj_name``
    ('general', 'gen_stat', 'discrete' or 'constructed_gen') the first time
    it is requested and hands back the stored object afterwards.
    """
    sim_lf = model.makeLikelihoodFunction(tree)
    sim_lf.setMotifProbs(dict(A=0.1, C=0.2, G=0.3, T=0.4))
    aln = sim_lf.simulateAlignment(seq_length)
    results = dict(aln=aln)
    discrete_tree = LoadTree(tip_names=aln.Names)

    def fit_general(results=results):
        if 'general' not in results:
            fitted = _make_likelihood(General(DNA.Alphabet), tree, results)
            fitted.optimise(**opt_args)
            results['general'] = fitted

    def fit_gen_stat(results=results):
        if 'gen_stat' not in results:
            fitted = _make_likelihood(GeneralStationary(DNA.Alphabet), tree,
                                      results)
            fitted.optimise(**opt_args)
            results['gen_stat'] = fitted

    def fit_constructed_gen(results=results):
        if 'constructed_gen' not in results:
            directed_pairs = [['A', 'C'], ['A', 'G'], ['A', 'T'],
                              ['C', 'A'], ['C', 'G'], ['C', 'T'],
                              ['G', 'C'], ['G', 'T'],
                              ['T', 'A'], ['T', 'C'], ['T', 'G']]
            preds = [MotifChange(src, dst, forward_only=True)
                     for src, dst in directed_pairs]
            fitted = _make_likelihood(Nucleotide(predicates=preds), tree,
                                      results)
            fitted.optimise(**opt_args)
            results['constructed_gen'] = fitted

    def fit_discrete(results=results):
        if 'discrete' not in results:
            fitted = _make_likelihood(
                DiscreteSubstitutionModel(DNA.Alphabet),
                discrete_tree,
                results,
                is_discrete=True)
            fitted.optimise(**opt_args)
            results['discrete'] = fitted

    funcs = {'general': fit_general,
             'gen_stat': fit_gen_stat,
             'discrete': fit_discrete,
             'constructed_gen': fit_constructed_gen}

    def call(self, obj_name):
        """Return the cached object, fitting it first when it is missing."""
        if obj_name not in results:
            funcs[obj_name]()
        return results[obj_name]

    return call
 def setUp(self):
     """Create a five-sequence alignment, a matching tree and a scaled
     nucleotide model with equal motif probabilities and modelled gaps."""
     #length all edges 1 except c=2.  b&d transitions all other transverions
     sequences = {'a': 'tata', 'b': 'tgtc', 'c': 'gcga', 'd': 'gaac',
                  'e': 'gagc'}
     self.al = LoadSeqs(data=sequences)
     self.tree = LoadTree(treestring='((a,b),(c,d),e);')
     model_kwargs = dict(do_scaling=True, equal_motif_probs=True,
                         model_gaps=True)
     self.model = cogent.evolve.substitution_model.Nucleotide(
         **model_kwargs)
Example #28
0
def test_gapped_CNFGTR():
    """Fit gapped CNFGTR, inflate the result and check its rate matrix.

    Checked: the root motif probabilities are unchanged by the Human edge's
    Psub; two rate-matrix ratios equal the 'A/C' and 'indel' parameters;
    and the rate matrix divided by the motif probabilities is symmetric.
    """
    aln_path = os.path.join(get_data_dir(), 'ENSG00000100393.fasta.gz')
    aln = get_aln(aln_path, codon_position=-1, filter_gaps=False)
    tree = LoadTree(treestring='(Human,Mouse,Opossum);')
    doc = dict(aln=str(aln), tree=str(tree))
    fitted = gapped.ml(doc,
                       model='CNFGTR',
                       model_gaps=True,
                       omega_indep=False,
                       indel_indep=False)

    def make_model():
        return gapped.CNFGTR(optimise_motif_probs=True, model_gaps=True)

    cnfgtr = gapped.inflate_likelihood_function(fitted['lf'], make_model)

    root_pi = cnfgtr.getMotifProbsByNode()['root'].asarray()
    psub = cnfgtr.getPsubForEdge('Human')
    assert_almost_equal(root_pi.dot(psub), root_pi)

    # value is not used below; the lookup itself is kept
    cnfgtr.getParamValue('omega')
    pi = cnfgtr.getMotifProbs()
    Q = cnfgtr.getRateMatrixForEdge('Human')

    def conditional(codon):
        # probability of the codon among the four codons sharing its
        # first two positions
        prefix = codon[:2]
        return pi[codon] / sum(pi[prefix + c] for c in 'ACGT')

    ref_cell = Q['CCT']['CCG'] / conditional('CCG')
    assert_almost_equal(Q['CCA']['CCC'] / conditional('CCC') / ref_cell,
                        cnfgtr.getParamValue('A/C'))
    assert_almost_equal(Q['---']['CCC'] / pi['CCC'] / ref_cell,
                        cnfgtr.getParamValue('indel'))
    R = Q.asarray() / pi.asarray()
    assert_almost_equal(R.T, R)
Example #29
0
    def __init__(self, TreePath, NeedsToBeCogentModded):
        """Parse the tree at TreePath and prepare the FastML node matching.

        self.Parsed starts as True and is set to False when any step
        raises; the exception itself is not kept or re-raised.
        """
        #used to determine if the full analysis can be conducted
        self.Parsed = True

        try:
            self.TreePath = TreePath
            self.NeedsToBeCogentModded = NeedsToBeCogentModded
            self.CogentTree = None

            if not self.NeedsToBeCogentModded:
                self.CogentTree = LoadTree(self.TreePath)
            else:
                #internal nodes need renaming: load the file produced by
                #fixUpFileForCogent instead of the original
                cogentFixUp = fixUpFileForCogent(self.TreePath)
                self.CogentTreeFile = cogentFixUp[0]
                self.CogentInputTreeString = cogentFixUp[1]
                self.CogentTree = LoadTree(self.CogentTreeFile.name)

            #prepares an input string for FastML
            FastMLInput = self.FixUpFileForFastML(self.CogentTree)
            self.FastMLInputTreeString = FastMLInput

            #fully parses the tree and stores each returned entry on self
            Parts_D = completeNodesLeavesBranches(self.CogentTree)
            self.NodeKey_L = Parts_D['NodeKey_L']
            self.LeafKey_L = Parts_D['LeafKey_L']
            self.UpperKey_L = Parts_D['UpperKey_L']
            self.TopKey = Parts_D['TopKey']
            self.BranchKey_L = Parts_D['BranchKey_L']
            self.Nodes_D = Parts_D['Nodes_D']

            #quick FastML run to learn FastML's naming convention for
            #internal nodes
            FASTAFile = self.getTempFASTAFile()
            self.FastMLOutputTreeString = executeFastML(
                FASTAFile, self.FastMLInputTreeString, True)

            #prepares the FastMLToOriginalMatchedNodes_D
            self.MatchNodes()

        except Exception:
            self.Parsed = False
Example #30
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    start_time = datetime.now()

    t = LoadTree(opts.input_tree)
    translation_dict = {}
    for i,tip in enumerate(t.iterTips()):
        translation_dict[tip.Name] = i

    single_rate = False

    #Generate commands telling BayesTraits which nodes to reconstruct
    bayestraits_commands = make_bayestraits_script(t,translation_dict,comments=False,single_rate=single_rate)


    #TODO: make this dynamic
    #Temporarily assuming there is a nexus file available
    nexus_fp = opts.input_tree.rsplit(".",1)[0] +".nexus"
    command_fp = "./bayestraits_commands.txt"
    path_to_bayestraits = "../"
    outfile = "./bayestrait_reconstruction.trait_table"
    command_file = open(command_fp,"w+")
    command_file.writelines(bayestraits_commands)
    command_file.close()

    command_file = open(command_fp,"U")

    bayestraits=BayesTraits()
    bayestraits_result = bayestraits(data=(nexus_fp,opts.input_trait_data,command_fp))
    #print "StdOut:",result["StdOut"].read()
    print "StdErr:",bayestraits_result["StdErr"].read()
    print "Return code:",bayestraits_result["ExitStatus"]

    results = parse_reconstruction_output(bayestraits_result['StdOut'].readlines())
    #print "Reconstructions:",results

    #Reconstruction results
    f = open(outfile,"w+")
    f.writelines(results)
    f.close()

    end_time = datetime.now()
    print "Start time:", start_time
    print "End time:",end_time
    print "Time to reconstruct:", end_time - start_time
    bayestraits_result.cleanUp()
Example #31
0
def build_tree(tree_string, bl1, bl2, r):
    """Return a PyCogent tree built from a Newick template and branch lengths.

    ``tree_string`` is a %-format template with six slots, which are filled
    in order with ``bl1, bl2, r/2.0, bl1, bl2, r/2.0``.
    """
    # Original author's note: PyCogent adds a branch of length 1 when none is
    # given, so the internal branch is written as two halves of r / 2.0 that
    # together keep the internal branch at r.
    half_internal = r / 2.0
    lengths = (bl1, bl2, half_internal, bl1, bl2, half_internal)
    return LoadTree(treestring=tree_string % lengths)
Example #32
0
 def setUp(self):
     """Create the small example tree and the Newick strings tests compare to."""
     self.name = 'small tree - '
     # kept in sorted order
     self.otu_names = sorted(
         ['NineBande', 'Mouse', 'HowlerMon', 'DogFaced'])
     self.newick = '(((Human,HowlerMon),Mouse),NineBande,DogFaced);'
     self.newick_sorted = '(DogFaced,((HowlerMon,Human),Mouse),NineBande);'
     self.newick_reduced = '((HowlerMon,Mouse),NineBande,DogFaced);'
     self.tree = LoadTree(treestring=self.newick)
 def test_setConstantLengths(self):
     """lengths declared constant should equal the lengths in the tree string"""
     tree = LoadTree(treestring='((a:1,b:2):3,(c:4,d:5):6,e:7);')
     like_func = self.model.makeLikelihoodFunction(tree)
     like_func.setParamRule('length', is_const=True)
     like_func.setAlignment(self.al)
     # spot-check two tips against the lengths written in the tree string
     for edge_name, expected in (('b', 2), ('d', 5)):
         self.assertEqual(
             like_func.getParamValue('length', edge_name), expected)
Example #34
0
def main():
    """Furcate the input tree by mapping-file categories and write it out."""
    args = parser.parse_args()
    categories, map_fp, tree_fp, output_fp, length = (
        args.categories, args.map_fp, args.tree_fp, args.output_fp,
        args.length)

    # only the first element of the parsed mapping file is used
    map_dict = parse_mapping_file_to_dict(map_fp)[0]
    fields = categories.split(',')
    tree = LoadTree(tree_fp)

    # NOTE(review): the value returned by furcate_tree is not used; the
    # object written below is `tree`. Presumably furcate_tree modifies the
    # tree in place -- confirm against its implementation.
    furcate_tree(tree, map_dict, fields, length=length)

    tree.writeToFile(output_fp)
Example #35
0
    def test_getsubtree(self):
        """getSubTree of the unrooted tree should match the reduced newick"""
        observed = self.tree.unrooted().getSubTree(self.otu_names)
        expected = LoadTree(treestring=self.newick_reduced).unrooted()

        # same number of children at the top node, and same string form
        self.assertEqual(len(observed.Children), len(expected.Children))
        self.assertEqual(str(observed), str(expected))
Example #36
0
def different_tree_simulate_alignment(tree_information_list, all_trees):
    '''
    Simulate one alignment per tree-information entry and concatenate them.

    input:

        tree_information_list: list of six-element entries, e.g.
        [[p1,q1,r1,s1,t1,tree1],[p1,q1,r1,s1,t1,tree2]]; the first five
        elements are passed straight to simulate_alignment_treefixed and
        the last one is an index into all_trees:
            (a,b),(c,d)-->0
            (a,c),(b,d)-->1
            (a,d),(b,c)-->2

        all_trees: tree string templates, indexed by that last element

    output:

        (aln, true_tree): the concatenated PyCogent alignment, and the
        tree built from the entry with the largest element 3 (the first
        such entry wins ties)
    '''
    # simulate one alignment per entry, in input order
    alnlist = []
    for entry in tree_information_list:
        positional = [entry[k] for k in range(6)]
        alnlist.append(simulate_alignment_treefixed(all_trees, *positional))

    # join the alignments left to right
    aln = alnlist[0]
    for extra in alnlist[1:]:
        aln = aln + extra

    # the "true" tree is the one behind the entry with the largest
    # element 3 (described as the alignment length by the original author)
    index = 0
    for i, entry in enumerate(tree_information_list):
        if entry[3] > tree_information_list[index][3]:
            index = i

    # fill the chosen template; element 2 is halved across the two
    # internal-branch slots
    chosen = tree_information_list[index]
    tree_string = all_trees[chosen[5]]
    true_tree_bl = tree_string % (
        chosen[0], chosen[1], chosen[2] / 2.0,
        chosen[0], chosen[1], chosen[2] / 2.0)
    true_tree = LoadTree(treestring=true_tree_bl)

    return (aln, true_tree)
Example #37
0
def main():
    """Furcate the input tree by mapping-file categories and write it out."""
    args = parser.parse_args()
    categories, map_fp, tree_fp, output_fp, length = (
        args.categories, args.map_fp, args.tree_fp, args.output_fp,
        args.length)

    # only the first element of the parsed mapping file is used
    map_dict = parse_mapping_file_to_dict(map_fp)[0]
    fields = categories.split(',')
    tree = LoadTree(tree_fp)

    # NOTE(review): the value returned by furcate_tree is not used; the
    # object written below is `tree`. Presumably furcate_tree modifies the
    # tree in place -- confirm against its implementation.
    furcate_tree(tree, map_dict, fields, length=length)

    tree.writeToFile(output_fp)
Example #38
0
 def test_simulateAlignment2(self):
     """an alignment simulated under a dinucleotide model has length 6"""
     seqs = LoadSeqs(data={'a': 'ggaatt', 'c': 'cctaat'})
     tree = LoadTree(treestring="(a,c);")
     model = substitution_model.Dinucleotide(mprob_model='tuple')
     controller = model.makeParamController(tree)
     controller.setAlignment(seqs)
     simulated = controller.simulateAlignment()
     # both input sequences are six characters long
     self.assertEqual(len(simulated), 6)
Example #39
0
def test_distribution():
    """jsd.distribution should match the motif probs estimated from the data"""
    mouse_aln = get_aln('General', 1031).takeSeqs(('Mouse', ))
    observed = jsd.distribution(mouse_aln.getSeq('Mouse'))

    # reference values: motif probabilities a GTR likelihood function
    # derives from the same single-sequence alignment
    single_tip_tree = LoadTree(tip_names=('Mouse', ))
    like_func = GTR().makeLikelihoodFunction(single_tip_tree)
    like_func.setMotifProbsFromData(mouse_aln)
    expected = like_func.getMotifProbs()

    assert_array_almost_equal(array(expected), array(observed))
Example #40
0
 def use_root_seq(root_sequence):
     """Simulate with the given root sequence and check it is kept at the root.

     ``self`` is not a parameter; it is taken from the enclosing scope
     (presumably a test method -- the enclosing definition is not shown).
     """
     seqs = LoadSeqs(data={'a': 'ggaatt', 'c': 'cctaat'})
     tree = LoadTree(treestring="(a,c);")
     model = substitution_model.Dinucleotide(mprob_model='tuple')
     controller = model.makeParamController(tree)
     controller.setAlignment(seqs)
     # internal nodes are kept so the simulated 'root' sequence can be read
     simulated = controller.simulateAlignment(
         exclude_internal=False, root_sequence=root_sequence)
     simulated_root = simulated.NamedSeqs['root']
     self.assertEqual(str(simulated_root), str(root_sequence))
Example #41
0
 def setUp(self):
     """Create the big example tree and the reduced Newick string for it."""
     self.name = 'big tree - '
     # kept in sorted order
     self.otu_names = sorted([
         'Horse', 'TombBat', 'Rhino', 'Pig', 'AsianElep',
         'SpermWhal', 'Cat', 'Gorilla', 'Orangutan',
         'bandicoot', 'Hedgehog', 'Sloth', 'HairyArma',
         'Manatee', 'GoldenMol', 'Pangolin'])
     self.newick = '((((((((FlyingFox,DogFaced),((FreeTaile,LittleBro),(TombBat,RoundEare))),(FalseVamp,LeafNose)),(((Horse,Rhino),(Pangolin,(Cat,Dog))),(Llama,(Pig,(Cow,(Hippo,(SpermWhal,HumpbackW))))))),(Mole,Hedgehog)),(TreeShrew,(FlyingLem,((Jackrabbit,(FlyingSqu,(OldWorld,(Mouse,Rat)))),(Galago,(HowlerMon,(Rhesus,(Orangutan,(Gorilla,(Human,Chimpanzee)))))))))),(((NineBande,HairyArma),(Anteater,Sloth)),(((Dugong,Manatee),((AfricanEl,AsianElep),(RockHyrax,TreeHyrax))),(Aardvark,((GoldenMol,(Madagascar,Tenrec)),(LesserEle,GiantElep)))))),(caenolest,(phascogale,(wombat,bandicoot))));'
     self.newick_reduced = '(((((TombBat,(((Horse,Rhino),(Pangolin,Cat)),(Pig,SpermWhal))),Hedgehog),(Orangutan,Gorilla)),((HairyArma,Sloth),((Manatee,AsianElep),GoldenMol))),bandicoot);'
     self.tree = LoadTree(treestring=self.newick)
 def setUp(self):
     """Create the nucleotide model and load the brca1_5 sequences and tree."""
     self.submodel = Nucleotide(
         do_scaling=True,
         model_gaps=False,
         equal_motif_probs=True,
         predicates={'beta': 'transition'})

     # sequences are loaded with the model's own molecular type
     seqs_path = os.path.join(data_path, 'brca1_5.paml')
     self.data = LoadSeqs(filename=seqs_path,
                          moltype=self.submodel.MolType)

     tree_path = os.path.join(data_path, 'brca1_5.tree')
     self.tree = LoadTree(filename=tree_path)
Example #43
0
def test_distribution():
    """jsd.distribution should match the motif probs estimated from the data"""
    fasta_path = os.path.join(get_data_dir(), 'General_1031.fasta.gz')
    with GzipFile(fasta_path) as ff:
        data = ff.read()
    mouse_aln = Alignment(data=data).takeSeqs(('Mouse', ))
    observed = jsd.distribution(mouse_aln.getSeq('Mouse'))

    # reference values: motif probabilities a GTR likelihood function
    # derives from the same single-sequence alignment
    single_tip_tree = LoadTree(tip_names=('Mouse', ))
    like_func = GTR().makeLikelihoodFunction(single_tip_tree)
    like_func.setMotifProbsFromData(mouse_aln)
    expected = like_func.getMotifProbs()

    assert_array_almost_equal(array(expected), array(observed))
Example #44
0
class TestTree(unittest.TestCase):
    """tests for a single tree-type"""

    def setUp(self):
        """Create the small example tree and its expected Newick strings."""
        self.name = 'small tree - '
        # kept in sorted order
        self.otu_names = sorted(
            ['NineBande', 'Mouse', 'HowlerMon', 'DogFaced'])
        self.newick = '(((Human,HowlerMon),Mouse),NineBande,DogFaced);'
        self.newick_sorted = '(DogFaced,((HowlerMon,Human),Mouse),NineBande);'
        self.newick_reduced = '((HowlerMon,Mouse),NineBande,DogFaced);'
        self.tree = LoadTree(treestring=self.newick)

    def test_sorttree(self):
        """testing (well, exercising at least) treesort"""
        sorted_tree = self.tree.sorted()
        # the comparison only runs when setUp defined newick_sorted
        if not hasattr(self, 'newick_sorted'):
            return
        observed = sorted_tree.getNewick(with_distances=0)
        self.assertEqual(self.newick_sorted, observed)

    def test_getsubtree(self):
        """getSubTree of the unrooted tree should match the reduced newick"""
        observed = self.tree.unrooted().getSubTree(self.otu_names)
        expected = LoadTree(treestring=self.newick_reduced).unrooted()

        # same number of children at the top node, and same string form
        self.assertEqual(len(observed.Children), len(expected.Children))
        self.assertEqual(str(observed), str(expected))

    def test_ascii(self):
        """asciiArt should run for each show_internal/compact combination"""
        self.tree.asciiArt()
        # unlabeled internal node
        tr = DndParser("(B:0.2,(C:0.3,D:0.4):0.6)F;")
        for show_internal, compact in (
                (True, False), (True, True), (False, False)):
            tr.asciiArt(show_internal=show_internal, compact=compact)
def load_de_numericized_newick_tree(tree_in,before="'",after="'",root=False):
    """Return a Newick string of ``tree_in`` with every tip name wrapped.

    tree_in: path to a tree file, or a Newick string; it is treated as a
        path only when a file with that name exists
    before, after: text placed before / after each tip name (both default
        to a single quote)
    root: when true, the tree is re-rooted with ``rootAtMidpoint`` before
        it is converted to a string

    The returned string includes branch lengths (``with_distances=True``).
    """
    from cogent import LoadTree
    import os.path

    if os.path.isfile(tree_in):
        tree = LoadTree(tree_in)
    else:
        tree = LoadTree(treestring=tree_in)

    # old tip name -> wrapped tip name
    rename_dict = dict(
        (tip, before + str(tip) + after) for tip in tree.getTipNames())
    tree.reassignNames(rename_dict)

    if root:
        tree = tree.rootAtMidpoint()

    return tree.getNewick(with_distances=True)
Example #46
0
 def test_run_pick_de_novo_otus_muscle(self):
     """run_pick_de_novo_otus w muscle generates expected results
     """
     self.params['assign_taxonomy'] = {
         'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
         'reference_seqs_fp': self.test_data['refseqs'][0]}
     self.params['align_seqs'] = {'alignment_method': 'muscle'}
     self.params['filter_alignment'] = {
         'suppress_lane_mask_filter': None,
         'entropy_threshold': '0.10'}

     run_pick_de_novo_otus(
         self.test_data['seqs'][0],
         self.test_out,
         call_commands_serially,
         self.params,
         self.qiime_config,
         parallel=False,
         status_update_callback=no_status_updates)

     # paths of the workflow outputs inspected below
     input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
     otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                       '%s_otus.txt' % input_file_basename)
     alignment_fp = join(self.test_out, 'muscle_aligned_seqs',
                         '%s_rep_set_aligned.fasta' % input_file_basename)
     taxonomy_assignments_fp = join(
         self.test_out, 'uclust_assigned_taxonomy',
         '%s_rep_set_tax_assignments.txt' % input_file_basename)
     otu_table_fp = join(self.test_out, 'otu_table.biom')
     tree_fp = join(self.test_out, 'rep_set.tre')

     input_seqs = LoadSeqs(self.test_data['seqs'][0],
                           format='fasta',
                           aligned=False)

     # Number of OTUs falls within a range that was manually
     # confirmed
     with open(otu_map_fp) as otu_map_f:
         otu_map_lines = list(otu_map_f)
     num_otus = len(otu_map_lines)
     otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
     self.assertEqual(num_otus, 14)

     # all otus get taxonomy assignments
     with open(taxonomy_assignments_fp) as taxonomy_f:
         taxonomy_assignment_lines = list(taxonomy_f)
     self.assertEqual(len(taxonomy_assignment_lines), num_otus)

     # all OTUs align
     # (assertEqual, not assertTrue: assertTrue would treat num_otus as the
     # failure message and never compare the two counts)
     aln = LoadSeqs(alignment_fp)
     self.assertEqual(aln.getNumSeqs(), num_otus)

     # all OTUs in tree
     tree = LoadTree(tree_fp)
     self.assertEqual(len(tree.tips()), num_otus)

     # check that the two final output files have non-zero size
     self.assertTrue(getsize(tree_fp) > 0)
     self.assertTrue(getsize(otu_table_fp) > 0)

     # Check that the log file is created and has size > 0
     log_fp = glob(join(self.test_out, 'log*.txt'))[0]
     self.assertTrue(getsize(log_fp) > 0)

     # parse the otu table
     with open(otu_table_fp, 'U') as otu_table_f:
         otu_table = parse_biom_table(otu_table_f)
     expected_sample_ids = ['f1','f2','f3','f4','p1','p2','t1','t2','not16S.1']
     # sample IDs are as expected
     self.assertEqualItems(otu_table.SampleIds, expected_sample_ids)
     # expected OTUs
     self.assertEqualItems(otu_table.ObservationIds, otu_map_otu_ids)
     # number of sequences in the full otu table equals the number of
     # input sequences
     number_seqs_in_otu_table = sum(
         [v.sum() for v in otu_table.iterSampleData()])
     self.assertEqual(number_seqs_in_otu_table, input_seqs.getNumSeqs())
class ExplorePrediction:

    "CONSTRUCTOR"

    def __init__(self, Directory, DerivedoI, PDBoI):
        """
        Class attributes:
        Figures_L (List): list of all the figure types that will be created
        FiguresSVG_D (Dict): key is the type of figure, value is the SVG syntax that will draw the figure
        DerivedoInterest (String): Derived node of interest that the figure will be based on
        PDBoInterest (String): PDB structure that the derived node sequence aligned to and achieved a significant hit on
        """

        #initial setup of what figures will be created
        self.Figures_L = [
            "TreeAndStates", "Alignment", "Structurecartoon",
            "Structuresurface"
        ]
        self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

        self.Directory = Directory
        if self.Directory.endswith("/"):
            pass
        else:
            self.Directory = self.Directory + "/"

        self.DerivedoInterest = DerivedoI
        self.PDBoInterest = PDBoI

        print self.Directory
        print self.DerivedoInterest
        print self.PDBoInterest

        #output directory where files will be written
        self.OutputDirectory = "%sFigures/%s-%s/" % (
            self.Directory, self.DerivedoInterest, self.PDBoInterest)

        if os.path.exists(self.OutputDirectory):
            pass
        else:
            os.system("mkdir " + self.OutputDirectory)

        #paths to relevant input files
        self.ReportPATH = self.Directory + "Report.xml"
        self.TreePATH = self.Directory + "ModdedTree.nwk"
        self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

        #parses the report file for sequences and branch relationships
        self.NodeToSeq_D = {
            re.compile("<H>(.+?)</H>").search(Seq).group(1):
            re.compile("<S>(.+?)</S>").search(Seq).group(1)
            for Seq in re.findall("<Seq>.+?</Seq>",
                                  open(self.ReportPATH, "r").read())
        }
        self.BranchToAlgorithm_D = {
            re.compile("<Branch_name>(.+?)</Branch_name>").search(Branch).
            group(1): ScopeAlgorithm(Branch)
            for Branch in re.findall("<Branch>.+?</Branch>",
                                     open(self.ReportPATH, "r").read(), re.
                                     DOTALL)
        }
        self.RectCount = 0

        #dimensions
        self.TreeFigWIDTH = 750
        self.TreeFigHEIGHT = 500
        self.TreeFigXOffset = 25
        self.TreeFigYOffset = 50

        #loads and parses tree, gets evolutionary distances for proper branch lengths
        self.CogentTree = LoadTree(self.TreePATH)
        self.FastMLTree = FastMLTree(self.TreePATH, False)
        self.FastMLTree.setBranchLengths()
        self.LongestDistance = self.getLongestEvoDistance()
        self.EvoDistance_D = {
            Key: self.getEvoDistance(Key)
            for Key in self.NodeToSeq_D.keys() if Key != self.FastMLTree.TopKey
        }
        self.EvoDistance_D[self.FastMLTree.TopKey] = 0.0
        self.ModdedEvoDistance_D = self.modEvoDistance()
        self.TreeCoords_D = self.setTreeCoords()

        FurthestPosition = 0.0
        FurthestClade = ""

        #gets the furthest evolutionary distance
        for Key in self.FastMLTree.LeafKey_L:
            Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
            if Val > FurthestPosition:
                FurthestPosition = Val
                FurthestClade = Key

        self.BranchoInterest = ""

        for Key in self.FastMLTree.BranchKey_L:
            if Key.split(">>")[1] == self.DerivedoInterest:
                self.BranchoInterest = Key

        #gets all relevant information for the states portion of the figure
        self.StateIndices_L = [
            int(X) - 1 for X in self.BranchToAlgorithm_D[self.BranchoInterest].
            getAllMutationXMLKeysAccordingToAccession(self.PDBoInterest)
        ]
        self.LeafStates_D = {
            Key:
            [self.NodeToSeq_D[Key][state] for state in self.StateIndices_L]
            for Key in self.FastMLTree.LeafKey_L
        }
        self.StateColour_D = self.getStateToHex()

        self.StateInc = 25.0

        self.StateFigHEIGHT = 500
        self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
        self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
            12 * len(FurthestClade)) + 25
        self.StateFigYOffset = 50
        #creates the states and tree figure
        self.FigureSVG_D["TreeAndStates"].append(
            self.getSVGHeader(
                self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
                self.StateFigXOffset + self.StateFigWIDTH +
                self.TreeFigXOffset))
        self.makeTreeFig()
        self.makeStatesFig()
        self.FigureSVG_D["TreeAndStates"].append("</svg>")

        self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
        TreeStateFOut = open(self.TreeAndStatesFOutPATH, "w")
        cairosvg.svg2png(
            bytestring="\n".join(self.FigureSVG_D["TreeAndStates"]),
            write_to=TreeStateFOut)
        TreeStateFOut.close()

        LongestCladeName = ""
        for Key in self.FastMLTree.LeafKey_L:
            if len(Key) > len(LongestCladeName):
                LongestCladeName = Key

        #gets all relevant information for the alignment cartoon portion of the figure
        self.MatrixInfo = self.parseScoringMatrix()

        self.AlnInc = 11.0

        self.AlignmentFigWIDTH = self.AlnInc * len(
            self.MatrixInfo["Sseq"]) + self.AlnInc + (
                8 * len(LongestCladeName))

        self.AlignmentFigHEIGHT = self.AlnInc * (
            len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
        self.AlignmentFigXOffset = self.AlnInc
        self.AlignmentFigYOffset = self.AlnInc

        self.FigureSVG_D["Alignment"].append(
            self.getSVGHeader(self.AlignmentFigHEIGHT, self.AlignmentFigWIDTH))
        self.makeAlignmentFig()
        self.FigureSVG_D["Alignment"].append("</svg>")

        self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
        AlignmentFOut = open(self.AlignmentFOutPATH, "w")
        cairosvg.svg2png(
            bytestring="\n".join(self.FigureSVG_D["Alignment"]),
            write_to=AlignmentFOut)
        AlignmentFOut.close()

        #relevant information for the structure file in PDB format
        self.ColouredStructureFile = self.getColoredStructureFile()
        self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
        open(self.StructureFOutPATH,
             "w").write(self.ColouredStructureFile.read())

        self.TotalFigWIDTH = 1000
        self.TotalFigHEIGHT = 600

        self.TotalElement_L = [
            self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
        ]
        self.TotalElement_L.append(
            '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
            % (self.TreeAndStatesFOutPATH))
        self.TotalElement_L.append(
            '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
            % (self.AlignmentFOutPATH))
        self.TotalElement_L.append("</svg>")

    "gets the header for any SVG format file"

    def getSVGHeader(self, FrameHEIGHT, FrameWIDTH):
        return """<?xml version="1.0" standalone="no"?>

<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">

<svg xmlns:xlink="http://www.w3.org/1999/xlink" xmlns='http://www.w3.org/2000/svg' version='1.1'
    width='%s' height='%s'>
""" % (str(FrameWIDTH), str(FrameHEIGHT))

    "Dictionary where the key is the amino acid character and the value is the background colour"

    def getStateToHex(self):
        return {"A":"80B3E6","C":"E68080","D":"CC4DCC","E":"CC4DCC","F":"80B3E6",\
                "G":"E6994D","H":"1AB3B3","I":"80B3E6","K":"E6331A","L":"80B3E6",\
                "M":"80B3E6","N":"1ACC1A","P":"CCCC00","Q":"1ACC1A","R":"E6331A",\
                "S":"1ACC1A","T":"1ACC1A","V":"80B3E6","W":"80B3E6","Y":"1AB3B3",\
                "-":"FFFFFF","X":"FFFFFF"}

    "returns the total evolutionary distance from the origin to the node of interest"

    def getEvoDistance(self, startingToNodeKey):
        distance = 0.0
        rootNodeHasNotBeenReached = True
        ToNodeKey = startingToNodeKey

        while rootNodeHasNotBeenReached:

            distance += self.FastMLTree.BranchLength_D[ToNodeKey]

            branchUpHasNotBeenFound = True

            for BranchKey in self.FastMLTree.BranchKey_L:
                if branchUpHasNotBeenFound:
                    if re.compile(">>" + ToNodeKey + "$").search(BranchKey):

                        branchUpHasNotBeenFound = False
                        ToNodeKey = BranchKey.split(">>")[0]

            if ToNodeKey == self.FastMLTree.TopKey:
                rootNodeHasNotBeenReached = False

        return distance

    "gets the node with the longest evolutionary distance from the origin"

    def getLongestEvoDistance(self):
        longestDistance = 0.0

        for LeafKey in self.FastMLTree.LeafKey_L:

            distance = self.getEvoDistance(LeafKey)

            if distance > longestDistance:
                longestDistance = distance

        return longestDistance

    "modifies evolutionary distance into a different format"

    def modEvoDistance(self):
        Ret = {}

        for Key in self.EvoDistance_D.keys():
            if Key == self.FastMLTree.TopKey:
                Ret[Key] = self.EvoDistance_D[Key]

            else:
                if self.EvoDistance_D[Key] == 0:
                    Ret[Key] = self.EvoDistance_D[Key]
                else:
                    Ret[Key] = self.EvoDistance_D[Key]
        return Ret

    "sets tree node coordinates (horizontal and vertical) for the SVG image"

    def setTreeCoords(self):

        Lines_L = self.CogentTree.asciiArt().split("\n")
        MaxVert = 0
        VertCoord_D = {}

        for i in range(0, len(Lines_L)):

            if re.compile("[a-zA-Z0-9_\.@]+").search(Lines_L[i]):
                Leaves = re.findall("([a-zA-Z0-9_\.@]+)", Lines_L[i])

                for Leaf in Leaves:

                    VertCoord_D[Leaf] = i
                    MaxVert = i

        TreeCoords_D = {
            Key: [(self.ModdedEvoDistance_D[Key] / self.LongestDistance) *
                  self.TreeFigWIDTH + self.TreeFigXOffset,
                  float(float(VertCoord_D[Key]) / float(MaxVert)) *
                  self.TreeFigHEIGHT + self.TreeFigYOffset]
            for Key in self.NodeToSeq_D.keys()
        }
        return TreeCoords_D

    "adds node names at each node vertex"

    def addNodeNamesAtNodePoints(self):
        for Key in self.FastMLTree.LeafKey_L:

            xy = self.TreeCoords_D[Key]
            xStart = str(xy[0])
            yStart = str(xy[1])
            self.FigureSVG_D["TreeAndStates"].append(
                '''\t<text x='%s' y='%s' text-anchor='left' font-size='20' font-family='Courier' style="fill: #000000;"  >%s</text>'''
                % (xStart, yStart, Key))

    "adds the vertical lines of the tree image"

    def addVerticalLines(self):

        for branchKey in self.FastMLTree.BranchKey_L:
            fro = branchKey.split(">>")[0]
            to = branchKey.split(">>")[1]

            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]

            if branchKey == self.BranchoInterest:
                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<line class='axis' x1='%s' y1='%s' x2='%s' y2='%s' style="stroke:rgb(255,0,0);stroke-width:1 " />'''
                    % (str(froXY[0]), str(froXY[1]), str(froXY[0]), str(
                        toXY[1])))
            else:
                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<line class='axis' x1='%s' y1='%s' x2='%s' y2='%s' style="stroke:rgb(0,0,0);stroke-width:1 " />'''
                    % (str(froXY[0]), str(froXY[1]), str(froXY[0]), str(
                        toXY[1])))

    "adds the horizontal lines of the tree image"

    def addHorizontalLines(self):

        for branchKey in self.FastMLTree.BranchKey_L:

            fro = branchKey.split(">>")[0]
            to = branchKey.split(">>")[1]

            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]

            if branchKey == self.BranchoInterest:
                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<line class='axis' x1='%s' y1='%s' x2='%s' y2='%s' style="stroke:rgb(255,0,0);stroke-width:1 " />'''
                    % (str(froXY[0]), str(toXY[1]), str(toXY[0]), str(
                        toXY[1])))
            else:
                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<line class='axis' x1='%s' y1='%s' x2='%s' y2='%s' style="stroke:rgb(0,0,0);stroke-width:1 " />'''
                    % (str(froXY[0]), str(toXY[1]), str(toXY[0]), str(
                        toXY[1])))

    "does all methods necessary to make the tree image"

    def makeTreeFig(self):
        self.addNodeNamesAtNodePoints()
        self.addVerticalLines()
        self.addHorizontalLines()

    "adds the rows for the mutated states in each sequence"

    def addStateRows(self):
        inc = self.StateInc
        vertInc = float(self.StateFigHEIGHT / float(len(self.LeafStates_D)))

        lowestY = float("inf")

        for Key in self.TreeCoords_D.keys():
            if self.TreeCoords_D[Key][1] < lowestY:
                lowestY = self.TreeCoords_D[Key][1]

        stateY = lowestY - (1.5 * vertInc)

        stateX = 0.0 + self.StateFigXOffset
        for i in self.StateIndices_L:

            self.FigureSVG_D["TreeAndStates"].append(
                '''\t<text x='%s' y='%s' text-anchor='middle' font-size='16' font-family='Courier' transform="rotate(90, %s, %s)" style="fill: #000000;"  >%s</text>'''
                % (str(stateX), str(stateY), str(stateX), str(stateY),
                   str(i + 1)))

            stateX += inc

        for Key in self.LeafStates_D.keys():
            X = 0.0 + self.StateFigXOffset

            for State in self.LeafStates_D[Key]:
                Y = self.TreeCoords_D[Key][1]

                RectX = X - (float(inc / 2.0))
                RectY = Y - (float(vertInc / 2.0)) - 5.0

                self.FigureSVG_D["TreeAndStates"].append('''\t<rect class='r%s' x='%s' y='%s' width='%s' height='%s' style="fill:#%s" />''' % (str(self.RectCount),\
                                                                                                                                 str(RectX),str(RectY),\
                                                                                                                                 str(inc),str(vertInc),\
                                                                                                                                 self.StateColour_D[State]))
                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<text x='%s' y='%s' font-size='20' font-family='Courier' text-anchor='middle' style="fill: #000000;"  >%s</text>'''
                    % (str(X), str(Y), State))

                X += inc

    "executes the method to make the states figure"

    def makeStatesFig(self):
        self.addStateRows()

    "parses the scoring matrix for alignment to the PDB sequence information"

    def parseScoringMatrix(self):
        allAlignments_L = re.findall("<PDB_alignment>.+?</PDB_alignment>",
                                     open(self.MatrixPATH, "r").read(),
                                     re.DOTALL)
        KeyAln = ""
        NotFound = True

        for Alignment in allAlignments_L:
            if NotFound:
                PDBID = re.compile("<PDB_id>(.+?)</PDB_id>").search(
                    Alignment).group(1).split("|")[0]
                if self.PDBoInterest.upper() == PDBID:
                    NotFound = False
                    KeyAln = Alignment
                    self.ChainoInterest = re.compile(
                        "<PDB_id>(.+?)</PDB_id>").search(Alignment).group(
                            1).split("|")[1].lower()

        return {"Qstart" : int(re.compile("<Alignment_start_query>(.+?)</Alignment_start_query>").search(KeyAln).group(1))-1,\
                "Qend" : int(re.compile("<Alignment_end_query>(.+?)</Alignment_end_query>").search(KeyAln).group(1))-1,\
                "Sstart" : int(re.compile("<Alignment_start_subject>(.+?)</Alignment_start_subject>").search(KeyAln).group(1))-1,\
                "Send" : int(re.compile("<Alignment_end_subject>(.+?)</Alignment_end_subject>").search(KeyAln).group(1))-1,\
                "Sseq" : re.compile("<Aligned_subject_sequence>(.+?)</Aligned_subject_sequence>").search(KeyAln).group(1)}

    "makes the cartoon of all aligned sequences in the protein family"

    def makeAlignmentFig(self):
        """
        Appends the SVG elements of the alignment cartoon to FigureSVG_D["Alignment"].

        The first row is the aligned subject sequence (MatrixInfo["Sseq"]),
        labelled with PDBoInterest. Each following row is the sequence of one
        leaf in FastMLTree.LeafKey_L, cut to the window that starts at
        MatrixInfo["Qstart"] and is as long as the subject sequence.

        For every state in a row one coloured <rect> and one <text> element are
        appended; after the last state the row label is appended as a <text>
        element. Nothing is returned.
        """
        FigureLines_L = self.FigureSVG_D["Alignment"]

        SubjectSeq = self.MatrixInfo["Sseq"]
        WindowStart = self.MatrixInfo["Qstart"]
        WindowEnd = WindowStart + len(SubjectSeq)

        #one sequence and one label per row, in the same order
        AllSeqs_L = [SubjectSeq] + [
            self.NodeToSeq_D[Key][WindowStart:WindowEnd]
            for Key in self.FastMLTree.LeafKey_L
        ]
        AllHeaders_L = [self.PDBoInterest] + self.FastMLTree.LeafKey_L

        xinc = self.AlnInc
        yinc = self.AlnInc

        Y = self.AlignmentFigYOffset

        for Header, Seq in zip(AllHeaders_L, AllSeqs_L):

            #every row starts again at the left offset
            X = 0.0 + self.AlignmentFigXOffset

            for State in Seq:

                #the rectangle is centred on X; the extra 5.0 shifts it up so it sits behind the text
                RectX = X - (float(xinc / 2.0))
                RectY = Y - (float(yinc / 2.0)) - 5.0

                FigureLines_L.append(
                    '''\t<rect class='r%s' x='%s' y='%s' width='%s' height='%s' style="fill:#%s" />'''
                    % (self.RectCount, RectX, RectY, xinc, yinc,
                       self.StateColour_D[State]))
                FigureLines_L.append(
                    '''\t<text x='%s' y='%s' text-anchor='middle' font-size='10' font-family='Courier' style="fill: #000000;"  >%s</text>'''
                    % (X, Y, State))

                X += xinc

            #row label, one increment to the right of the last state
            FigureLines_L.append(
                '''\t<text x='%s' y='%s' text-anchor='left' font-size='10' font-family='Courier' style="fill: #000000;"  >%s</text>'''
                % (X + self.AlnInc, Y, Header))

            Y += yinc

    "gets a PDB format file with the temperature factors coloured to reflect mutated sites"

    def getColoredStructureFile(self):
        """
        Writes the coloured PDB file for the branch leading to DerivedoInterest.

        The branch is the key in FastMLTree.BranchKey_L whose part after ">>"
        equals DerivedoInterest. The algorithm object stored for that branch in
        BranchToAlgorithm_D is given the PDB and PDB-XML contents returned by
        getAllPDBFileDicts and asked to write the coloured file into a
        temporary file.

        Returns:
        the handle returned by getOutputTempFile, after createPDBColoredFile
        has been called with its name

        Raises:
        KeyError: no branch ends at DerivedoInterest, or the branch has no
        entry in BranchToAlgorithm_D
        """
        DesiredBranchKey = None
        for BranchKey in self.FastMLTree.BranchKey_L:
            if BranchKey.split(">>")[1] == self.DerivedoInterest:
                #no break on purpose: when several branches match, the last one is used
                DesiredBranchKey = BranchKey

        #fail before any PDB data is fetched, with a message that names the node
        if DesiredBranchKey is None:
            raise KeyError("no branch ends at derived node %r" %
                           (self.DerivedoInterest, ))

        SA = self.BranchToAlgorithm_D[DesiredBranchKey]

        PDBAndPDBXMLContents = getAllPDBFileDicts([self.PDBoInterest])
        SA.PDBContents_D = PDBAndPDBXMLContents[0]
        SA.PDBXMLContents_D = PDBAndPDBXMLContents[1]

        FH = getOutputTempFile()
        SA.createPDBColoredFile(self.PDBoInterest, FH.name)

        return FH
Example #48
0
 def test_get_tree_get_splits(self):
     """a tree rebuilt by getTree from its getSplits output keeps its topology"""
     tree_path = os.path.join(data_path, "murphy.tree")
     original = LoadTree(filename=tree_path)
     rebuilt = getTree(getSplits(original))
     self.assertTrue(original.sameTopology(rebuilt))
    def __init__(self, Directory, DerivedoI, PDBoI):
        """
        Builds all figure files for one derived node / PDB structure pair.

        Reads Report.xml, ModdedTree.nwk and ScoringMatrix.xml from Directory,
        then writes TreeAndStates.png, Alignment.png and Structure.pdb into
        <Directory>/Figures/<DerivedoI>-<PDBoI>/ (created if missing) and
        prepares TotalElement_L, an SVG document that embeds both PNG files.

        Arguments:
        Directory (String): directory holding the input files, with or without a trailing "/"
        DerivedoI (String): name of the derived node of interest
        PDBoI (String): identifier of the PDB structure of interest

        Class attributes:
        Figures_L (List): list of all the figure types that will be created
        FiguresSVG_D (Dict): key is the type of figure, value is the SVG syntax that will draw the figure
        DerivedoInterest (String): Derived node of interest that the figure will be based on
        PDBoInterest (String): PDB structure that the derived node sequence aligned to and achieved a significant hit on
        """

        #initial setup of what figures will be created
        self.Figures_L = [
            "TreeAndStates", "Alignment", "Structurecartoon",
            "Structuresurface"
        ]
        self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

        #the directory is stored with a trailing slash so that paths can be built by concatenation
        self.Directory = Directory
        if not self.Directory.endswith("/"):
            self.Directory = self.Directory + "/"

        self.DerivedoInterest = DerivedoI
        self.PDBoInterest = PDBoI

        print(self.Directory)
        print(self.DerivedoInterest)
        print(self.PDBoInterest)

        #output directory where files will be written
        self.OutputDirectory = "%sFigures/%s-%s/" % (
            self.Directory, self.DerivedoInterest, self.PDBoInterest)

        #makedirs also creates the intermediate "Figures" directory and needs no shell
        if not os.path.exists(self.OutputDirectory):
            os.makedirs(self.OutputDirectory)

        #paths to relevant input files
        self.ReportPATH = self.Directory + "Report.xml"
        self.TreePATH = self.Directory + "ModdedTree.nwk"
        self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

        #parses the report file for sequences and branch relationships
        #the report is read once and the handle is closed straight away
        with open(self.ReportPATH, "r") as ReportFile:
            ReportContents = ReportFile.read()

        HeaderRE = re.compile("<H>(.+?)</H>")
        SequenceRE = re.compile("<S>(.+?)</S>")
        BranchNameRE = re.compile("<Branch_name>(.+?)</Branch_name>")

        self.NodeToSeq_D = {
            HeaderRE.search(Seq).group(1): SequenceRE.search(Seq).group(1)
            for Seq in re.findall("<Seq>.+?</Seq>", ReportContents)
        }
        self.BranchToAlgorithm_D = {
            BranchNameRE.search(Branch).group(1): ScopeAlgorithm(Branch)
            for Branch in re.findall("<Branch>.+?</Branch>", ReportContents,
                                     re.DOTALL)
        }
        self.RectCount = 0

        #dimensions
        self.TreeFigWIDTH = 750
        self.TreeFigHEIGHT = 500
        self.TreeFigXOffset = 25
        self.TreeFigYOffset = 50

        #loads and parses tree, gets evolutionary distances for proper branch lengths
        self.CogentTree = LoadTree(self.TreePATH)
        self.FastMLTree = FastMLTree(self.TreePATH, False)
        self.FastMLTree.setBranchLengths()
        self.LongestDistance = self.getLongestEvoDistance()
        self.EvoDistance_D = {
            Key: self.getEvoDistance(Key)
            for Key in self.NodeToSeq_D.keys() if Key != self.FastMLTree.TopKey
        }
        self.EvoDistance_D[self.FastMLTree.TopKey] = 0.0
        self.ModdedEvoDistance_D = self.modEvoDistance()
        self.TreeCoords_D = self.setTreeCoords()

        FurthestPosition = 0.0
        FurthestClade = ""

        #finds the leaf whose label reaches furthest to the right (12 units are allowed per character)
        for Key in self.FastMLTree.LeafKey_L:
            Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
            if Val > FurthestPosition:
                FurthestPosition = Val
                FurthestClade = Key

        self.BranchoInterest = ""

        #branch whose derived end is the node of interest; the last match is kept
        for Key in self.FastMLTree.BranchKey_L:
            if Key.split(">>")[1] == self.DerivedoInterest:
                self.BranchoInterest = Key

        #gets all relevant information for the states portion of the figure
        #the mutation keys are converted to 0-based indices into the sequences
        self.StateIndices_L = [
            int(X) - 1 for X in self.BranchToAlgorithm_D[self.BranchoInterest].
            getAllMutationXMLKeysAccordingToAccession(self.PDBoInterest)
        ]
        self.LeafStates_D = {
            Key:
            [self.NodeToSeq_D[Key][state] for state in self.StateIndices_L]
            for Key in self.FastMLTree.LeafKey_L
        }
        self.StateColour_D = self.getStateToHex()

        self.StateInc = 25.0

        self.StateFigHEIGHT = 500
        self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
        self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
            12 * len(FurthestClade)) + 25
        self.StateFigYOffset = 50
        #creates the states and tree figure
        self.FigureSVG_D["TreeAndStates"].append(
            self.getSVGHeader(
                self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
                self.StateFigXOffset + self.StateFigWIDTH +
                self.TreeFigXOffset))
        self.makeTreeFig()
        self.makeStatesFig()
        self.FigureSVG_D["TreeAndStates"].append("</svg>")

        #PNG data is binary, so the output file is opened in binary mode
        self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
        with open(self.TreeAndStatesFOutPATH, "wb") as TreeStateFOut:
            cairosvg.svg2png(
                bytestring="\n".join(self.FigureSVG_D["TreeAndStates"]),
                write_to=TreeStateFOut)

        #first leaf name of maximal length, used to reserve room for the row labels
        LongestCladeName = ""
        for Key in self.FastMLTree.LeafKey_L:
            if len(Key) > len(LongestCladeName):
                LongestCladeName = Key

        #gets all relevant information for the alignment cartoon portion of the figure
        self.MatrixInfo = self.parseScoringMatrix()

        self.AlnInc = 11.0

        self.AlignmentFigWIDTH = self.AlnInc * len(
            self.MatrixInfo["Sseq"]) + self.AlnInc + (
                8 * len(LongestCladeName))

        self.AlignmentFigHEIGHT = self.AlnInc * (
            len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
        self.AlignmentFigXOffset = self.AlnInc
        self.AlignmentFigYOffset = self.AlnInc

        self.FigureSVG_D["Alignment"].append(
            self.getSVGHeader(self.AlignmentFigHEIGHT, self.AlignmentFigWIDTH))
        self.makeAlignmentFig()
        self.FigureSVG_D["Alignment"].append("</svg>")

        self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
        with open(self.AlignmentFOutPATH, "wb") as AlignmentFOut:
            cairosvg.svg2png(
                bytestring="\n".join(self.FigureSVG_D["Alignment"]),
                write_to=AlignmentFOut)

        #relevant information for the structure file in PDB format
        self.ColouredStructureFile = self.getColoredStructureFile()
        self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
        with open(self.StructureFOutPATH, "w") as StructureFOut:
            StructureFOut.write(self.ColouredStructureFile.read())

        self.TotalFigWIDTH = 1000
        self.TotalFigHEIGHT = 600

        #combined figure: the tree/states image on top, the alignment image below it
        self.TotalElement_L = [
            self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
        ]
        self.TotalElement_L.append(
            '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
            % (self.TreeAndStatesFOutPATH))
        self.TotalElement_L.append(
            '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
            % (self.AlignmentFOutPATH))
        self.TotalElement_L.append("</svg>")
class LikelihoodFunctionTests(TestCase):
    """tests for a tree analysis class. Various tests to create a tree analysis class,
    set parameters, and test various functions.
    """
    def setUp(self):
        """create the substitution model, alignment and tree used by the tests"""
        self.submodel = Nucleotide(
            do_scaling=True, model_gaps=False, equal_motif_probs=True,
            predicates = {'beta': 'transition'})
        
        self.data = LoadSeqs(
                filename = os.path.join(data_path, 'brca1_5.paml'),
                moltype = self.submodel.MolType)
        
        self.tree = LoadTree(
                filename = os.path.join(data_path, 'brca1_5.tree'))
    
    def _makeLikelihoodFunction(self, **kw):
        """return a likelihood function for self.tree with an independent
        beta rule and self.data as alignment; kw is passed on to
        makeLikelihoodFunction"""
        lf = self.submodel.makeLikelihoodFunction(self.tree, **kw)
        lf.setParamRule('beta', is_independent=True)
        lf.setAlignment(self.data)
        return lf
    
    def _setLengthsAndBetas(self, likelihood_function):
        """set every edge length and beta to fixed constants"""
        for (species, length) in [
                ("DogFaced", 0.1),
                ("NineBande",  0.2),
                ("Human", 0.3),
                ("HowlerMon", 0.4),
                ("Mouse",  0.5)]:
            likelihood_function.setParamRule("length", value=length, 
                    edge=species, is_constant=True)
        # internal edges are addressed through the node connecting two tips
        for (species1, species2, length) in [
                ("Human", "HowlerMon", 0.7),
                ("Human", "Mouse", 0.6)]:
            LCA = self.tree.getConnectingNode(species1, species2).Name
            likelihood_function.setParamRule("length", value=length, 
                    edge=LCA, is_constant=True)
        
        likelihood_function.setParamRule("beta", value=4.0, is_constant=True)
    
    def test_information_criteria(self):
        """test get information criteria from a model."""
        lf = self._makeLikelihoodFunction()
        nfp = lf.getNumFreeParams()
        lnL = lf.getLogLikelihood()
        l = len(self.data)
        self.assertFloatEqual(lf.getAic(), aic(lnL, nfp))
        self.assertFloatEqual(lf.getAic(second_order=True),
            aic(lnL, nfp, l))
        
        self.assertFloatEqual(lf.getBic(), bic(lnL, nfp, l))
    
    def test_result_str(self):
        """str() of a likelihood function gives the expected tables"""
        # actualy more a test of self._setLengthsAndBetas()
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        self.assertEqual(str(likelihood_function), \
"""Likelihood Function Table\n\
======
  beta
------
4.0000
------
=============================
     edge    parent    length
-----------------------------
    Human    edge.0    0.3000
HowlerMon    edge.0    0.4000
   edge.0    edge.1    0.7000
    Mouse    edge.1    0.5000
   edge.1      root    0.6000
NineBande      root    0.2000
 DogFaced      root    0.1000
-----------------------------
===============
motif    mprobs
---------------
    T    0.2500
    C    0.2500
    A    0.2500
    G    0.2500
---------------""")
    
        likelihood_function = self._makeLikelihoodFunction(digits=2,space=2)
        self.assertEqual(str(likelihood_function), \
"""Likelihood Function Table\n\
===============================
     edge  parent  length  beta
-------------------------------
    Human  edge.0    1.00  1.00
HowlerMon  edge.0    1.00  1.00
   edge.0  edge.1    1.00  1.00
    Mouse  edge.1    1.00  1.00
   edge.1    root    1.00  1.00
NineBande    root    1.00  1.00
 DogFaced    root    1.00  1.00
-------------------------------
=============
motif  mprobs
-------------
    T    0.25
    C    0.25
    A    0.25
    G    0.25
-------------""")
    
    def test_calclikelihood(self):
        """log likelihood with fixed lengths and beta equals the reference value"""
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        self.assertAlmostEqual(-250.686745262,
            likelihood_function.getLogLikelihood(),places=9)
    
    def test_g_statistic(self):
        """G statistic with fixed lengths and beta equals the reference value"""
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        self.assertAlmostEqual(230.77670557,
            likelihood_function.getGStatistic(),places=6)
    
    def test_ancestralsequences(self):
        """check one reconstructed ancestral probability, then exercise
        reconstruction with a binned model"""
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        result = likelihood_function.reconstructAncestralSeqs()['edge.0']
        a_column_with_mostly_Ts = -1
        motif_G = 2
        self.assertAlmostEqual(2.28460181711e-05,
                result[a_column_with_mostly_Ts][motif_G], places=8)
        lf = self.submodel.makeLikelihoodFunction(self.tree, bins=['low', 'high'])
        lf.setParamRule('beta', bin='low', value=0.1)
        lf.setParamRule('beta', bin='high', value=10.0)
        lf.setAlignment(self.data)
        # only checks that the call completes
        lf.reconstructAncestralSeqs()
    
    def test_likely_ancestral(self):
        """excercising the most likely ancestral sequences"""
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        # only checks that the call completes
        likelihood_function.likelyAncestralSeqs()
    
    def test_simulateAlignment(self):
        "Simulate DNA alignment"
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        simulated_alignment = likelihood_function.simulateAlignment(20, exclude_internal = False)
        self.assertEqual(len(simulated_alignment), 20)
        self.assertEqual(len(simulated_alignment.getSeqNames()), 8)
    
    def test_simulateHetergeneousAlignment(self):
        "Simulate substitution-heterogeneous DNA alignment"
        lf = self.submodel.makeLikelihoodFunction(self.tree, bins=['low', 'high'])
        lf.setParamRule('beta', bin='low', value=0.1)
        lf.setParamRule('beta', bin='high', value=10.0)
        # only checks that the call completes
        lf.simulateAlignment(100)
    
    def test_simulatePatchyHetergeneousAlignment(self):
        "Simulate patchy substitution-heterogeneous DNA alignment"
        lf = self.submodel.makeLikelihoodFunction(self.tree, bins=['low', 'high'], sites_independent=False)
        lf.setParamRule('beta', bin='low', value=0.1)
        lf.setParamRule('beta', bin='high', value=10.0)
        # only checks that the call completes
        lf.simulateAlignment(100)
    
    def test_simulateAlignment2(self):
        "Simulate alignment with dinucleotide model"
        al = LoadSeqs(data={'a':'ggaatt','c':'cctaat'})
        t = LoadTree(treestring="(a,c);")
        sm = substitution_model.Dinucleotide(mprob_model='tuple')
        lf = sm.makeParamController(t)
        lf.setAlignment(al)
        simalign = lf.simulateAlignment()
        self.assertEqual(len(simalign), 6)
    
    def test_simulateAlignment3(self):
        """Simulated alignment with gap-induced ambiguous positions
        preserved"""
        t = LoadTree(treestring='(a:0.4,b:0.3,(c:0.15,d:0.2)edge.0:0.1)root;')
        al = LoadSeqs(data={
            'a':'g--cactat?',
            'b':'---c-ctcct',
            'c':'-a-c-ctat-',
            'd':'-a-c-ctat-'})
        sm = Nucleotide(recode_gaps=True)
        lf = sm.makeParamController(t)
        #pc.setConstantLengths()
        lf.setAlignment(al)
        #print lf.simulateAlignment(sequence_length=10)
        simulated = lf.simulateAlignment()
        self.assertEqual(len(simulated.getSeqNames()), 4)
        import re
        # every unambiguous base is masked with 'x' so only the pattern of
        # ambiguous positions is compared
        self.assertEqual(
            re.sub('[ATCG]', 'x', simulated.todict()['a']),
            'x??xxxxxx?')
        
    
    def test_simulateAlignment_root_sequence(self):
        """provide a root sequence for simulating an alignment"""
        def use_root_seq(root_sequence):
            al = LoadSeqs(data={'a':'ggaatt','c':'cctaat'})
            t = LoadTree(treestring="(a,c);")
            sm = substitution_model.Dinucleotide(mprob_model='tuple')
            lf = sm.makeParamController(t)
            lf.setAlignment(al)
            simalign = lf.simulateAlignment(exclude_internal=False,
                                            root_sequence=root_sequence)
            root = simalign.NamedSeqs['root']
            self.assertEqual(str(root), str(root_sequence))
        
        root_sequence = DNA.makeSequence('GTAATT')
        use_root_seq(root_sequence) # as a sequence instance
        use_root_seq('GTAATC') # as a string
    
    def test_pc_initial_parameters(self):
        """Default parameter values from original annotated tree"""
        likelihood_function = self._makeLikelihoodFunction()
        self._setLengthsAndBetas(likelihood_function)
        tree = likelihood_function.getAnnotatedTree()
        lf = self.submodel.makeParamController(tree)
        lf.setAlignment(self.data)
        self.assertEqual(lf.getParamValue("length", "Human"), 0.3)
        self.assertEqual(lf.getParamValue("beta", "Human"), 4.0)
    
    def test_set_par_all(self):
        """constant length and beta rules without an edge show up on every edge"""
        likelihood_function = self._makeLikelihoodFunction()
        likelihood_function.setParamRule("length", value=4.0, is_constant=True)
        likelihood_function.setParamRule("beta", value=6.0, is_constant=True)
        expected = \
"""Likelihood Function Table
======
  beta
------
6.0000
------
=============================
     edge    parent    length
-----------------------------
    Human    edge.0    4.0000
HowlerMon    edge.0    4.0000
   edge.0    edge.1    4.0000
    Mouse    edge.1    4.0000
   edge.1      root    4.0000
NineBande      root    4.0000
 DogFaced      root    4.0000
-----------------------------
===============
motif    mprobs
---------------
    T    0.2500
    C    0.2500
    A    0.2500
    G    0.2500
---------------"""
        self.assertEqual(str(likelihood_function), expected)
        
        #self.submodel.setScaleRule("ts",['beta'])
        #self.submodel.setScaleRule("tv",['beta'], exclude_pars = True)
        # with the scale rules above disabled the table must be unchanged
        self.assertEqual(str(likelihood_function), expected)
    
    def test_getMotifProbs(self):
        """motif probs are keyed by exactly the motifs of the model"""
        likelihood_function = self._makeLikelihoodFunction()
        mprobs = likelihood_function.getMotifProbs()
        self.assertTrue(hasattr(mprobs, 'keys'), mprobs)
        # sorted() works for a list as well as for a keys view
        keys = sorted(mprobs.keys())
        obs = sorted(self.submodel.getMotifs())
        self.assertEqual(obs, keys)
    
    def test_getAnnotatedTree(self):
        """a constant length set on an edge appears on the annotated tree"""
        likelihood_function = self._makeLikelihoodFunction()
        likelihood_function.setParamRule("length", value=4.0, edge="Human", is_constant=True)
        result = likelihood_function.getAnnotatedTree()
        self.assertEqual(result.getNodeMatchingName('Human').params['length'], 4.0)
        self.assertEqual(result.getNodeMatchingName('Human').Length, 4.0)
    
    def test_getparamsasdict(self):
        """check the named table and the per-edge parameter dict"""
        likelihood_function = self._makeLikelihoodFunction()
        likelihood_function.setName("TEST")
        self.assertEqual(str(likelihood_function),\
"""TEST
=======================================
     edge    parent    length      beta
---------------------------------------
    Human    edge.0    1.0000    1.0000
HowlerMon    edge.0    1.0000    1.0000
   edge.0    edge.1    1.0000    1.0000
    Mouse    edge.1    1.0000    1.0000
   edge.1      root    1.0000    1.0000
NineBande      root    1.0000    1.0000
 DogFaced      root    1.0000    1.0000
---------------------------------------
===============
motif    mprobs
---------------
    T    0.2500
    C    0.2500
    A    0.2500
    G    0.2500
---------------""")
        self.assertEqual(likelihood_function.getParamValueDict(['edge']), {
 'beta': {'NineBande': 1.0, 'edge.1': 1.0,'DogFaced': 1.0, 'Human': 1.0,
      'edge.0': 1.0, 'Mouse': 1.0, 'HowlerMon': 1.0},
 'length': {'NineBande': 1.0,'edge.1': 1.0, 'DogFaced': 1.0, 'Human': 1.0,
        'edge.0': 1.0, 'Mouse': 1.0,'HowlerMon': 1.0}})
    
    def test_get_statistics_from_empirical_model(self):
        """should return valid dict from an empirical substitution model"""
        submod = JTT92()
        aln = self.data.getTranslation()
        
        lf = submod.makeLikelihoodFunction(self.tree)
        lf.setAlignment(aln)
        # only checks that the call completes
        lf.getParamValueDict(['edge'], params=['length'])
    
    def test_constant_to_free(self):
        """excercise setting a constant param rule, then freeing it"""
        # checks by just trying to make the calculator
        lf = self.submodel.makeLikelihoodFunction(self.tree)
        lf.setAlignment(self.data)
        lf.setParamRule('beta', is_constant=True, value=2.0, 
                        edges=['NineBande', 'DogFaced'], is_clade=True)
        lf.setParamRule('beta', init=2.0, is_constant=False,
                        edges=['NineBande', 'DogFaced'], is_clade=True)
    
    def test_get_psub_rate_matrix(self):
        """lf should return consistent rate matrix and psub"""
        lf = self.submodel.makeLikelihoodFunction(self.tree)
        lf.setAlignment(self.data)
        Q = lf.getRateMatrixForEdge('NineBande')
        P = lf.getPsubForEdge('NineBande')
        self.assertFloatEqual(expm(Q.array)(1.0), P.array)
        
        # should fail for a discrete Markov model
        dm = substitution_model.DiscreteSubstitutionModel(DNA.Alphabet)
        lf = dm.makeLikelihoodFunction(self.tree)
        lf.setAlignment(self.data)
        self.assertRaises(Exception, lf.getRateMatrixForEdge, 'NineBande')
    
    def test_make_discrete_markov(self):
        """lf ignores tree lengths if a discrete Markov model"""
        t = LoadTree(treestring='(a:0.4,b:0.3,(c:0.15,d:0.2)edge.0:0.1)root;')
        dm = substitution_model.DiscreteSubstitutionModel(DNA.Alphabet)
        lf = dm.makeLikelihoodFunction(t)
Example #51
0
class FastMLTree:
    
    """
    Class attributes:
    Parsed (Bool): an indication of whether or not the user-defined tree was successfully parsed, if it was not, then the rest of the analysis is not performed
    ParseError (Exception or None): the exception that made parsing fail, None when Parsed is True
    TreePath (String): absolute path to tree file
    NeedsToBeCogentModded (Bool): whether or not placeholder names for the internal nodes need to be created
    CogentTree (Object LoadTree): pyCogent Class object containing parsed newick syntax tree
    FastMLInputTreeString (String): representation of tree in newick with internal node names removed
    FastMLOutputTreeString (String): representation of tree in newick with internal nodes named according to FastML naming convention
    FastMLToOriginalMatchedNodes_D (Dict): Key is the node name in the FastML convention, value is the node name in the cogent convention
    
    NodeKey_L (List): List of all node name keys
    LeafKey_L (List): List of all terminal node name keys
    UpperKey_L (List): List of all internal (non-terminal) node name keys
    TopKey (String): root node name key
    BranchKey_L (List): List of all paths (from ancestral to immediate derived) along the tree
    Nodes_D (Dict): Key is the node name, value is a sub-dict containing immediate derived nodes and terminal nodes under the node
    """
    
    def __init__(self, TreePath , NeedsToBeCogentModded):
        """
        CONSTRUCTOR
        
        Loads the tree at TreePath, parses its nodes, runs FastML once and
        matches the FastML node names to the original ones.
        
        Any exception raised along the way is caught: Parsed is set to False
        and the exception is kept in ParseError so the cause is not lost.
        """
        self.Parsed = True #used to determine if the full analysis can be conducted
        self.ParseError = None
        
        try:
            self.TreePath = TreePath
            self.NeedsToBeCogentModded = NeedsToBeCogentModded
            
            self.CogentTree = None
            
            #if the internal nodes need to be renamed, then it is done according to the "FixUpFileForCogent" method
            if self.NeedsToBeCogentModded:
                cogentFixUp = fixUpFileForCogent(self.TreePath)
                self.CogentTreeFile = cogentFixUp[0]
                self.CogentInputTreeString = cogentFixUp[1]
                
                self.CogentTree = LoadTree(self.CogentTreeFile.name)
                
            else:
                
                self.CogentTree = LoadTree(self.TreePath)
            
            #prepares an input string for FastML
            self.FastMLInputTreeString = self.FixUpFileForFastML(self.CogentTree)
            
            #executes method to fully parse tree, then sets all returned variables as class variables
            CogentNodesLeavesBranches = completeNodesLeavesBranches(self.CogentTree)
            self.NodeKey_L = CogentNodesLeavesBranches['NodeKey_L']
            self.LeafKey_L = CogentNodesLeavesBranches['LeafKey_L']
            self.UpperKey_L = CogentNodesLeavesBranches['UpperKey_L']
            self.TopKey = CogentNodesLeavesBranches['TopKey']
            self.BranchKey_L = CogentNodesLeavesBranches['BranchKey_L']
            self.Nodes_D = CogentNodesLeavesBranches['Nodes_D']
            
            #executes quick run of FastML to get FastML's naming convention of internal nodes
            self.FastMLOutputTreeString = executeFastML(self.getTempFASTAFile() , self.FastMLInputTreeString , True)
            
            #prepares the FastMLToOriginalMatchedNodes_D
            self.MatchNodes()
            
        except Exception as e:
            #deliberate best effort: callers check Parsed instead of catching exceptions
            self.Parsed = False
            self.ParseError = e
        
    
    def FixUpFileForFastML(self, CogentTree):
        """
        Removes internal node names so that FastML adds its own naming convention
        
        Takes the newick string of CogentTree (with distances), drops all
        single quotes and deletes the label that directly follows each ")".
        A label ends at the next "(", ")", ":", "," or ";", so branch
        lengths and sibling nodes are never touched, also when an internal
        node has no name or no branch length.
        
        Returns the modified newick string.
        """
        #gets the tree string for the cogent object
        TreeString = CogentTree.getNewick(with_distances=True).replace("'","")
        
        #a close bracket signifies the end of an internal node, whatever label follows it is removed
        return re.sub(r"\)[^():,;]+", ")", TreeString)
    
    def getTempFASTAFile(self):
        """
        Prepares simple FastaFile to be given to FastML
        
        Returns FASTA formatted text with one record per key in LeafKey_L.
        """
        retString_L = []
        
        #FastaFile will have the sequence "GREAT" for each terminal sequence
        for LeafKey in self.LeafKey_L:
            retString_L.append(">"+LeafKey)
            retString_L.append("GREAT")
        
        return '\n'.join(retString_L)
    
    def correctForFastMLNameChanges(self):
        """
        Corrects for instances where FastML anomalously renames terminal nodes
        
        The terminal names of FastMLInputTreeString and FastMLOutputTreeString
        are compared position by position. Where they differ, the name in
        FastMLOutputTreeString is replaced by the input name.
        
        Raises IndexError when the output string holds fewer terminal names
        than the input string.
        """
        TaxonPattern = "[A-Za-z0-9_./]+:[.0-9]+"
        NameRE = re.compile("^(.+?):")
        FastMLInternalNameRE = re.compile("^N[0-9]+$")
        
        #gets lists of terminal names in the FastML input and output strings (in the same order)
        FastMLInputNames = [NameRE.search(TaxString).group(1) for TaxString in re.findall(TaxonPattern, self.FastMLInputTreeString)]
        FastMLOutputNames = [NameRE.search(TaxString).group(1) for TaxString in re.findall(TaxonPattern, self.FastMLOutputTreeString)]
        #internal nodes named by FastML (N1, N2, ...) are not terminal names
        FastMLOutputNames = [Name for Name in FastMLOutputNames if FastMLInternalNameRE.search(Name) is None]
        
        #when equivalent node names are not the same, then the output string node name is renamed according to the input string node name
        for i, InputName in enumerate(FastMLInputNames):
            OutputName = FastMLOutputNames[i]
            if InputName != OutputName:
                #the name is escaped so that "." and "/" only match themselves
                #a function builds the replacement so that the input name is inserted literally
                self.FastMLOutputTreeString = re.sub(
                    r"([,()])%s:" % (re.escape(OutputName)),
                    lambda Match, NewName=InputName: Match.group(1) + NewName + ":",
                    self.FastMLOutputTreeString)
    
    def MatchNodes(self):
        """
        Matches original (cogent) node names with how the nodes are named in FastML
        
        Two nodes are treated as the same node when the sorted names of all
        tips under them are equal. The result is stored in
        FastMLToOriginalMatchedNodes_D.
        
        Raises KeyError when a FastML node has a set of tips that no node in
        UpperKey_L has.
        """
        self.correctForFastMLNameChanges() #performs the correction on the output string if necessary
        
        #a termini string is prepared for each internal node, that is, all termini under the internal node sorted and placed into a single string
        TerminiStringToNodeName_D = {}
        for NodeKey in self.UpperKey_L:
            TerminiStringToNodeName_D['-'.join(sorted(self.Nodes_D[NodeKey]['terminal']))] = NodeKey
        
        #prepares a cogent tree object for the fastML output
        FH = getInputTempFile(self.FastMLOutputTreeString)
        
        FastMLCogentTree = LoadTree(FH.name)
        
        self.FastMLToOriginalMatchedNodes_D = {}
        
        #for each cogent node in the FastML cogent tree
        for FastMLCogentNodeKey in FastMLCogentTree.getNodeNames():
            
            #a termini string is prepared for the fastML node
            FastMLCogentNode = FastMLCogentTree.getNodeMatchingName(FastMLCogentNodeKey)
            FastMLTermini_L = [tip.Name for tip in FastMLCogentNode.iterTips()]
            
            #if it has more than 0 termini under the node
            if FastMLTermini_L:
                #A fastML termini string is prepared, and this termini string will be the same termini string as the equivalent cogent node
                FastMLTerminiString = '-'.join(sorted(FastMLTermini_L))
                self.FastMLToOriginalMatchedNodes_D[FastMLCogentNodeKey] = TerminiStringToNodeName_D[FastMLTerminiString]
                
            #if it has no termini under it, then the node itself is a terminus and has the same name in FastML and Cogent
            else:
                self.FastMLToOriginalMatchedNodes_D[FastMLCogentNodeKey] = FastMLCogentNodeKey
    
    def setBranchLengths(self):
        """
        Sets branch lengths of each node
        
        Fills BranchLength_D: key is the name of a node listed under
        'immediate' in Nodes_D, value is the distance between that node and
        the node it is listed under.
        """
        self.BranchLength_D = {}
        #gets the distance between a node and its immediate ancestor
        for NodeNameKey in self.NodeKey_L:
            HigherNode = self.CogentTree.getNodeMatchingName(NodeNameKey)
            
            for ImmediateNeighbourNodeNameKey in self.Nodes_D[NodeNameKey]['immediate']:
                LowerNode = self.CogentTree.getNodeMatchingName(ImmediateNeighbourNodeNameKey)
                
                self.BranchLength_D[ImmediateNeighbourNodeNameKey] = HigherNode.distance(LowerNode)
Example #52
0
        header = line.split()[0]
        fileout.write(''.join([">", header, "\n", seqs[header], "\n"]))
        rawseqs.append((header, seqs[header]))
        tips.append(header)
    fileout.close()

    print "Aligning seqs using muscle with -diags"
    seqs = LoadSeqs(data=rawseqs, moltype=RNA, aligned=False)
    aln = align_unaligned_seqs(seqs, RNA, {"-diags": True})
    fileout = open(folderout + "/" + basenames + "-seqsaligned.fasta", 'w')
    fileout.write(str(aln))
    fileout.close()

    print "Folding sequences"
    #get subtree of the clade being folded to pass to PPfold
    tr = LoadTree(argv[3])
    sub_tree = tr.getSubTree(tips, keep_root=True)
    filesubtree = open(folderout + "/" + basenames + "-subtreeDistances.nwk", 'w')
    filesubtree.write(sub_tree.getNewick(with_distances=True))
    filesubtree.close()
    filesubtree = open(folderout + "/" + basenames + "-subtree.nwk", 'w')
    filesubtree.write(sub_tree.getNewick(with_distances=False))
    #call PPfold with aligned sequences and subtree
    args = ["java", "-jar", PPFOLDDIR + "PPfold.jar", folderout + "/" + basenames + "-seqsaligned.fasta", "--outputd", folderout]
    check_call(args)
    
    print "Converting sequences to vienna"
    check_call(["ct2b.pl", folderout + basenames + "-seqsaligned.ct", ">",folderout + basenames + "-vienna.txt"]) 

    print "DONE"
Example #53
0
def TreeAlign(model, seqs, tree=None, indel_rate=0.01, indel_length=0.01,
    ui = None, ests_from_pairwise=True, param_vals=None):
    """Returns a multiple alignment and tree.
    
    Uses the provided substitution model and a tree for determining the
    progressive order. If a tree is not provided and there are exactly two
    sequences, a tree is built directly from the two names. Otherwise a
    Neighbour Joining tree is constructed from pairwise distances estimated
    from pairwise aligning the sequences.
    
    NOTE(review): earlier documentation for this function said that when
    running in parallel only the master CPU returns the alignment and tree
    (others return None, None). Nothing in this function body implements
    that; confirm against whatever wraps/decorates this function.
    
    Arguments:
        - model: a substitution model
        - seqs: a sequence collection, or a dict keyed by sequence name
        - tree: optional tree; its tip names must match the sequence names
        - indel_rate, indel_length: parameters for the progressive pair-HMM
        - ui: optional progress object exposing display(msg); when None no
          progress message is emitted
        - ests_from_pairwise: if no tree provided and True, the median value
          of the substitution model parameters are used
        - param_vals: named key, value pairs for model parameters. These
          override ests_from_pairwise.
    
    Returns the tuple (alignment, tree). The alignment's Info attribute
    records the parameter values used under the key "AlignParams".
    
    Raises AssertionError if a tree is given whose tip names differ from
    the sequence names.
    """
    _exclude_params = ['mprobs', 'rate', 'bin_switch']
    # always work on a private copy so the caller's mapping is never mutated
    param_vals = dict(param_vals) if param_vals else {}
    
    if isinstance(seqs, dict):
        seq_names = list(seqs.keys())
    else:
        seq_names = seqs.getSeqNames()
    
    two_seqs = len(seq_names) == 2
    
    if tree:
        # compare sorted copies: sorting in place could reorder a name list
        # that belongs to the caller's sequence collection
        tip_names = sorted(tree.getTipNames())
        sorted_seq_names = sorted(seq_names)
        assert tip_names == sorted_seq_names, \
            "names don't match between seqs and tree: tree=%s; seqs=%s" % \
            (tip_names, sorted_seq_names)
        ests_from_pairwise = False
    elif two_seqs:
        # use the names gathered above so dict input works here as well
        # (a dict has no getSeqNames method)
        tree = LoadTree(tip_names=list(seq_names))
        ests_from_pairwise = False
    else:
        if ests_from_pairwise:
            est_params = [param for param in model.getParamList()
                          if param not in _exclude_params]
        else:
            est_params = None
        
        dcalc = EstimateDistances(seqs, model, do_pair_align=True,
                                    est_params=est_params)
        dcalc.run()
        dists = dcalc.getPairwiseDistances()
        tree = NJ.nj(dists)
    
    LF = model.makeLikelihoodFunction(tree.bifurcating(name_unnamed=True), aligned=False)
    # est_params/dcalc only exist when the NJ branch ran with
    # ests_from_pairwise still True, which is exactly when this is entered
    if ests_from_pairwise and not param_vals:
        # we use the Median to avoid the influence of outlier pairs
        param_vals = {}
        for param in est_params:
            numbers = dcalc.getParamValues(param)
            print("Param Estimate Summary Stats: %s" % param)
            print(numbers.summarize())
            param_vals[param] = numbers.Median
    
    # ui defaults to None; only report progress when a ui object is supplied
    if ui is not None:
        ui.display("Doing %s alignment" % ["progressive", "pairwise"][two_seqs])
    with LF.updatesPostponed():
        for param, val in param_vals.items():
            LF.setParamRule(param, value=val, is_constant=True)
        LF.setParamRule('indel_rate', value=indel_rate, is_constant=True)
        LF.setParamRule('indel_length', value=indel_length, is_constant=True)
        LF.setSequences(seqs)
    edge = LF.getLogLikelihood().edge
    align = edge.getViterbiPath().getAlignment()
    info = Info()
    info["AlignParams"] = param_vals
    info["AlignParams"].update(dict(indel_length=indel_length, indel_rate=indel_rate))
    align.Info = info
    return align, tree
Example #54
0
 def test_reroot(self):
     """rootedWithTip('b') on ((a,b),(c,d),e) gives the expected newick"""
     unrooted = LoadTree(treestring="((a,b),(c,d),e)")
     rerooted = unrooted.rootedWithTip('b')
     observed = rerooted.getNewick()
     self.assertEqual(observed, "(a,b,((c,d),e));")
Example #55
0
 def test_run_pick_de_novo_otus_parallel(self):
     """run_pick_de_novo_otus generates expected results in parallel

     Runs the workflow with parallel=True, then checks the returned
     paths, the OTU map, the taxonomy assignments, the aligned/failed
     sequence counts, the tree, the OTU table and the log file.
     """
     self.params['assign_taxonomy'] = \
      {'id_to_taxonomy_fp':self.test_data['refseqs_tax'][0],
       'reference_seqs_fp':self.test_data['refseqs'][0]}
     self.params['align_seqs'] = \
      {'template_fp':self.test_data['refseqs_aligned'][0]}
     self.params['filter_alignment'] = \
      {'lane_mask_fp':self.test_data['refseqs_aligned_lanemask'][0]}
     actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
      self.test_data['seqs'][0], 
      self.test_out, 
      call_commands_serially,
      self.params, 
      self.qiime_config, 
      parallel=True,
      status_update_callback=no_status_updates)
      
     # expected output locations, derived from the input file's basename
     input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
     otu_map_fp = join(self.test_out,'uclust_picked_otus',
      '%s_otus.txt' % input_file_basename)
     alignment_fp = join(self.test_out,
      'pynast_aligned_seqs','%s_rep_set_aligned.fasta' % 
       input_file_basename)
     failures_fp = join(self.test_out,
      'pynast_aligned_seqs','%s_rep_set_failures.fasta' % 
       input_file_basename)
     taxonomy_assignments_fp = join(self.test_out,
      'uclust_assigned_taxonomy','%s_rep_set_tax_assignments.txt' %
      input_file_basename)
     otu_table_fp = join(self.test_out,'otu_table.biom')
     tree_fp = join(self.test_out,'rep_set.tre')
     
     self.assertEqual(actual_tree_fp,tree_fp)
     self.assertEqual(actual_otu_table_fp,otu_table_fp)
     
     input_seqs = LoadSeqs(self.test_data['seqs'][0],
                           format='fasta',
                           aligned=False)
     
     # Number of OTUs falls within a range that was manually 
     # confirmed
     with open(otu_map_fp) as otu_map_f:
         otu_map_lines = list(otu_map_f)
     num_otus = len(otu_map_lines)
     otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
     self.assertEqual(num_otus,14)
     
     # all otus get taxonomy assignments
     with open(taxonomy_assignments_fp) as taxonomy_f:
         taxonomy_assignment_lines = list(taxonomy_f)
     self.assertEqual(len(taxonomy_assignment_lines),num_otus)
     
     # number of seqs which aligned + num of seqs which failed to
     # align sum to the number of OTUs
     # (assertEqual, not assertTrue: assertTrue would treat num_otus as
     # the failure message and pass for any non-zero sum)
     aln = LoadSeqs(alignment_fp)
     failures = LoadSeqs(failures_fp,aligned=False)
     self.assertEqual(aln.getNumSeqs() + failures.getNumSeqs(),num_otus)
      
     # number of tips in the tree equals the number of sequences that
     # aligned
     tree = LoadTree(tree_fp)
     self.assertEqual(len(tree.tips()),aln.getNumSeqs())
     
     # parse the otu table
     with open(otu_table_fp,'U') as otu_table_f:
         otu_table = parse_biom_table(otu_table_f)
     expected_sample_ids = ['f1','f2','f3','f4','p1','p2','t1','t2','not16S.1']
     # sample IDs are as expected
     self.assertEqualItems(otu_table.SampleIds,expected_sample_ids)
     # otu ids are as expected
     self.assertEqualItems(otu_table.ObservationIds,otu_map_otu_ids)
     # number of sequences in the full otu table equals the number of
     # input sequences
     number_seqs_in_otu_table = sum([v.sum() for v in otu_table.iterSampleData()])
     self.assertEqual(number_seqs_in_otu_table,input_seqs.getNumSeqs())
     
     # Check that the log file is created and has size > 0
     # (fail with a clear message rather than an IndexError if it is absent)
     log_fps = glob(join(self.test_out,'log*.txt'))
     self.assertTrue(len(log_fps) > 0, "no log file was created")
     self.assertTrue(getsize(log_fps[0]) > 0)
#!/usr/bin/env python

from cogent import LoadTree
# Load the tree stored in test.nw and print it re-rooted with tip "X".
tr = LoadTree('test.nw')
rerooted = tr.rootedWithTip("X")
print(rerooted)


Example #57
0
def processTree(fstr):
    """Load a tree from a tree string and compute its converted distances.

    fstr is handed to LoadTree as ``treestring``. Returns the tuple
    (host_tree, host_dist), where host_dist is the tree's getDistances()
    result passed through cogent_dist_to_qiime_dist.
    """
    host_tree = LoadTree(treestring=fstr)
    cogent_dists = host_tree.getDistances()
    return host_tree, cogent_dist_to_qiime_dist(cogent_dists)
Example #58
0
    def gettree(self):
        """Return the murphy.tree tree reduced to the names in seqnames."""
        tree_path = os.path.join(data_path, "murphy.tree")
        full_tree = LoadTree(filename=tree_path)
        return full_tree.getSubTree(seqnames)
Example #59
0
 def setUp(self):
     """Build the five-tip reference tree and store its getDistances() result."""
     newick = '((a:3,b:4):2,(c:6,d:7):30,e:5)'
     self.tree = LoadTree(treestring=newick)
     self.dists = self.tree.getDistances()
Example #60
0
class TreeReconstructionTests(unittest.TestCase):
    def setUp(self):
        """Create the reference tree (tips a-e) and its getDistances() result."""
        reference_newick = '((a:3,b:4):2,(c:6,d:7):30,e:5)'
        self.tree = LoadTree(treestring=reference_newick)
        self.dists = self.tree.getDistances()
        
    def assertTreeDistancesEqual(self, t1, t2):
        """Assert that two trees report matching getDistances() results.

        Both results must have the same length, and every key of t2's
        result must map to an almost-equal value in t1's result.
        """
        first_dists = t1.getDistances()
        second_dists = t2.getDistances()
        self.assertEqual(len(first_dists), len(second_dists))
        for pair in second_dists:
            observed = first_dists[pair]
            expected = second_dists[pair]
            self.assertAlmostEqual(observed, expected)

    def test_nj(self):
        """nj applied to the reference distances reproduces those distances"""
        nj_tree = nj(self.dists)
        self.assertTreeDistancesEqual(self.tree, nj_tree)
        
    def test_gnj(self):
        """gnj: first result matches the reference tree; tied scores check"""
        # The first entry of the results must reproduce the reference
        # distances, whether one or ten results are kept.
        for keep_count in (1, 10):
            results = gnj(self.dists, keep=keep_count)
            _length, best_tree = results[0]
            self.assertTreeDistancesEqual(self.tree, best_tree)

        # Results should be a TreeCollection
        # (exercised on the keep=10 results: both calls must simply succeed)
        len(results)
        results.getConsensusTree()

        # From GNJ paper. Pearson, Robins, Zhang 1999.
        tied_dists = {
            ('a', 'b'): 3, ('a', 'c'): 3, ('a', 'd'): 4, ('a', 'e'): 3,
            ('b', 'c'): 3, ('b', 'd'): 3, ('b', 'e'): 4,
            ('c', 'd'): 3, ('c', 'e'): 3,
            ('d', 'e'): 3,
        }
        tied_results = gnj(tied_dists, keep=3)
        scores = [score for (score, tree) in tied_results]
        # the first two scores are tied at 7.75, the third differs
        self.assertEqual(scores[:2], [7.75, 7.75])
        self.assertNotEqual(scores[2], 7.75)

    def test_wls(self):
        """wls applied to the reference distances reproduces those distances"""
        wls_tree = wls(self.dists)
        self.assertTreeDistancesEqual(self.tree, wls_tree)

    def test_truncated_wls(self):
        """wls with the order option yields exactly the tips named in order"""
        order = ['e', 'b', 'c', 'd']
        truncated_tree = wls(self.dists, order=order)
        observed_tips = set(truncated_tree.getTipNames())
        self.assertEqual(observed_tips, set(order))

    def test_limited_wls(self):
        """testing (well, exercising at least), wls with constrained start

        Calls wls with a single start tree and with a list of start trees,
        checking that the result has 5 tip names, then checks that two
        particular start arguments make wls raise.
        """
        # single start tree naming a, b, c and d; result must have 5 tips
        init = LoadTree(treestring='((a,c),b,d)')
        reconstructed = wls(self.dists, start=init)
        self.assertEqual(len(reconstructed.getTipNames()), 5)
        # list of two start trees, both naming a, b, c and d
        init2 = LoadTree(treestring='((a,d),b,c)')
        reconstructed = wls(self.dists, start=[init, init2])
        self.assertEqual(len(reconstructed.getTipNames()), 5)
        # init3 names a, b, d, e, unlike init (a, b, c, d); wls must raise
        # NOTE(review): presumably rejected because the start trees' tip
        # sets differ -- confirm against wls
        init3 = LoadTree(treestring='((a,d),b,e)')
        self.assertRaises(Exception, wls, self.dists, start=[init, init3])
        # if start tree has all seq names, should raise an error
        self.assertRaises(Exception, wls, self.dists,
                start=[LoadTree(treestring='((a,c),b,(d,e))')])