Ejemplo n.º 1
0
    def testParseBiomolecule(self):
        """
        Parse biomolecule 2 from '3p1u.pdb' and check that it holds exactly
        one chain whose ID is 'B1'. Requesting biomolecule 3 from the same
        parser must raise KeyError.
        """
        parser = LegacyStructureParser(self.config.getTestFile('3p1u.pdb'))
        biomolecule = parser.parse_biomolecule(2)

        chain_count = len(biomolecule.chains)
        self.assertEqual(chain_count, 1)
        self.assertEqual(biomolecule.first_chain.id, 'B1')

        # Biomolecule number 3 is expected to be unknown to the parser
        self.assertRaises(KeyError, parser.parse_biomolecule, 3)
Ejemplo n.º 2
0
    def testCommaSplitting(self):
        """
        Parse biomolecule 1 from '3shm_ca.pdb' and check that the result
        has 60 chains, the first of which has ID 'A'.

        @see: [CSB 0000067]
        """
        parser = LegacyStructureParser(self.config.getTestFile('3shm_ca.pdb'))
        biomolecule = parser.parse_biomolecule(1, True)

        chain_count = len(biomolecule.chains)
        self.assertEqual(chain_count, 60)
        self.assertEqual(biomolecule.first_chain.id, 'A')
Ejemplo n.º 3
0
    def setUp(self):
        """
        Build the fixture objects used by the tests of this case.

        Creates lists, numpy arrays, strings, a graph of 250 mutually
        connected Node objects and the first model parsed from
        'ake-xray-ensemble-ca.pdb'. The lists, arrays, strings and the
        protein are then collected in C{self.objs}.
        """
        super(TestDumpLoad, self).setUp()

        self.lists = [[],
                      list(range(1000)),
                      list("Although that way may not be" +
                           "obvious at first" + "unless you're Dutch.")]
        self.arrays = [
            numpy.array([]),
            numpy.random.random(1000),
            numpy.arange(1000),
        ]

        # The last entry is a raw literal: '\w' is not a valid escape sequence
        # in a regular string. The resulting string value is unchanged.
        self.strings = ["",
                        "Although that way may not be" +
                        "obvious at first" +
                        "unless you're Dutch.",
                        r"([0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])"]

        # Completely connected graph: each node references every other node
        self.big_graph = [Node() for _i in range(250)]
        for n in self.big_graph:
            n.connections = set(self.big_graph)
            n.connections.remove(n)

        # Protein
        pdbfile = self.config.getTestFile('ake-xray-ensemble-ca.pdb')
        self.protein = LegacyStructureParser(pdbfile).parse_models()[0]

        self.objs = [self.lists, self.arrays, self.strings, self.protein]
Ejemplo n.º 4
0
    def main(self):
        """
        Fit all models of a multi-model PDB file and write them as an ensemble.

        Reads C{self.args.pdb}, takes the CA coordinates of chain
        C{self.args.chain} from every model and computes one rotation and
        one translation per model against the result of
        C{average_structure}. The fits are refined for C{self.args.niter}
        iterations using the scales of a ScaleMixture with a GammaPrior.
        Each model is then transformed and the ensemble is written to
        C{self.args.outfile}.

        Exits with C{ExitCodes.IO_ERROR} if reading the file raises IOError
        and with C{ExitCodes.USAGE_ERROR} if fewer than two models are found.
        """
        try:
            parser = LegacyStructureParser(self.args.pdb)
            models = parser.models()

        except IOError as e:
            # IOError has no 'value' attribute; str(e) gives the message
            self.exit('PDB file parsing failed\n' + str(e), ExitCodes.IO_ERROR)

        if len(models) < 2:
            self.exit('PDB file contains only one model', ExitCodes.USAGE_ERROR)

        ensemble = parser.parse_models(models)
        X = numpy.array([model[self.args.chain].get_coordinates(['CA'], True) for model in ensemble])
        x_mu = average_structure(X)
        m = X.shape[0]

        # One 3x3 rotation and one translation vector per model; the initial
        # values are overwritten by the first fit below
        R = numpy.zeros((m, 3, 3))
        t = numpy.ones((m, 3))

        prior = GammaPrior()
        mixture = ScaleMixture(scales=X.shape[1],
                               prior=prior, d=3)

        for i in range(m):
            R[i, :, :], t[i, :] = fit(x_mu, X[i])

        # gibbs sampling cycle
        for _ in range(self.args.niter):
            # apply rotation
            data = numpy.array([numpy.sum((x_mu - numpy.dot(X[i], numpy.transpose(R[i])) - t[i]) ** 2, -1) ** 0.5
                                for i in range(m)]).T
            # sample scales
            mixture.estimate(data)
            # sample rotations
            for i in range(m):
                R[i, :, :], t[i, :] = wfit(x_mu, X[i], mixture.scales)

        out_ensemble = csb.bio.structure.Ensemble()

        for i, model in enumerate(ensemble):
            model.transform(R[i], t[i])
            out_ensemble.models.append(model)

        out_ensemble.to_pdb(self.args.outfile)
Ejemplo n.º 5
0
    def main(self):
        """
        Parse a multi-model PDB file and dispatch on the requested type.

        Reads C{self.args.infile}, collects the CA coordinates of every
        model and passes the ensemble and the coordinate array to
        C{main_segments} or C{main_conformers}, depending on
        C{self.args.type}.

        Exits with C{ExitCodes.IO_ERROR} if parsing fails and with
        C{ExitCodes.USAGE_ERROR} if fewer than two models are found.

        @raise ValueError: if C{self.args.type} is neither 'segments' nor
                           'conformers'
        """
        try:
            parser = LegacyStructureParser(self.args.infile)
            models = parser.models()
        except Exception:
            # Not a bare 'except:', so SystemExit and KeyboardInterrupt
            # are no longer reported as parsing failures
            self.exit('PDB file parsing failed', ExitCodes.IO_ERROR)

        if len(models) < 2:
            self.exit('PDB file contains only one model', ExitCodes.USAGE_ERROR)

        ensemble = parser.parse_models(models)
        X = numpy.array([model.get_coordinates(['CA'], True) for model in ensemble])

        if self.args.type == 'segments':
            self.main_segments(ensemble, X)
        elif self.args.type == 'conformers':
            self.main_conformers(ensemble, X)
        else:
            raise ValueError('type must be "segments" or "conformers"')
Ejemplo n.º 6
0
 def testParseHetMolecules(self):
     """
     Write a single HETATM record to a temporary stream and check that
     parse_structure() raises HeaderFormatError for that file.
     """
     record = 'HETATM    1  NA  BLM A   1     -14.575  27.241   3.310  1.00  0.00           N '

     with self.config.getTempStream() as tmp:
         tmp.write(record)
         # Flush so that the record is on disk before the parser reads it
         tmp.flush()

         het_parser = LegacyStructureParser(tmp.name)
         self.assertRaises(HeaderFormatError, het_parser.parse_structure)
         # Drop the parser before the temporary stream is closed
         del het_parser
Ejemplo n.º 7
0
    def _ake_ensemble_coords(self):
        """
        Return the CA coordinates of all models in
        'ake-xray-ensemble-ca.pdb' as an array of shape (16, 211, 3).

        The first call parses the file and then replaces this method on the
        instance with a function returning the same array, so that later
        calls do not parse the file again.
        """
        path = self.config.getTestFile('ake-xray-ensemble-ca.pdb')
        models = LegacyStructureParser(path).parse_models()

        coords = array([m.get_coordinates(['CA'], True) for m in models])
        self.assertEqual(coords.shape, (16, 211, 3))

        def cached():
            return coords

        # Shadow the method on this instance with the cached accessor
        self._ake_ensemble_coords = cached

        return coords
Ejemplo n.º 8
0
    def main(self):
        """
        Parse a multi-model PDB file and dispatch on the requested type.

        Reads C{self.args.infile}, collects the CA coordinates of every
        model and passes the ensemble and the coordinate array to
        C{main_segments} or C{main_conformers}, depending on
        C{self.args.type}.

        Exits with C{ExitCodes.IO_ERROR} if parsing fails and with
        C{ExitCodes.USAGE_ERROR} if fewer than two models are found.

        @raise ValueError: if C{self.args.type} is neither 'segments' nor
                           'conformers'
        """
        try:
            parser = LegacyStructureParser(self.args.infile)
            models = parser.models()
        except Exception:
            # Not a bare 'except:', so SystemExit and KeyboardInterrupt
            # are no longer reported as parsing failures
            self.exit('PDB file parsing failed', ExitCodes.IO_ERROR)

        if len(models) < 2:
            self.exit('PDB file contains only one model',
                      ExitCodes.USAGE_ERROR)

        ensemble = parser.parse_models(models)
        X = numpy.array(
            [model.get_coordinates(['CA'], True) for model in ensemble])

        if self.args.type == 'segments':
            self.main_segments(ensemble, X)
        elif self.args.type == 'conformers':
            self.main_conformers(ensemble, X)
        else:
            raise ValueError('type must be "segments" or "conformers"')
Ejemplo n.º 9
0
    def main(self):
        """
        Fit the structure in C{self.args.pdb2} onto the one in
        C{self.args.pdb1} and write the transformed structure.

        The CA coordinates of chain C{self.args.chain1} (first file) and
        C{self.args.chain2} (second file) are passed to
        C{csb.bio.utils.bfit}. If C{self.args.alignment} is given, the
        alignment file is parsed and only the columns without gaps in its
        first two rows are used to pair up the coordinates. The resulting
        rotation and translation are applied to the second structure, which
        is written to C{self.args.outfile}.

        Exits with C{ExitCodes.IO_ERROR} if reading a PDB file raises
        IOError and with C{ExitCodes.INPUT_ERROR} if the two coordinate
        sets differ in length.
        """
        try:
            parser = LegacyStructureParser(self.args.pdb1)
            r = parser.parse()

            parser = LegacyStructureParser(self.args.pdb2)
            m = parser.parse()
        except IOError as e:
            # IOError has no 'value' attribute; str(e) gives the message
            self.exit('PDB file parsing failed\n' + str(e), ExitCodes.IO_ERROR)

        X = numpy.array(r[self.args.chain1].get_coordinates(['CA'], True))
        Y = numpy.array(m[self.args.chain2].get_coordinates(['CA'], True))

        if self.args.alignment is not None:
            # open() instead of the Python 2-only file() builtin; the
            # context manager also closes the handle
            with open(self.args.alignment) as stream:
                align = SequenceAlignment.parse(stream.read())
            align = align[:2, :]

            matches = []
            # Alignment columns are addressed starting from 1
            for i in range(1, align.length + 1):
                if not align.gap_at(i):
                    matches.append([align.columns[i][0].rank - 1,
                                    align.columns[i][1].rank - 1])
            matches = numpy.array(matches)
            X = X[matches[:, 0], :]
            Y = Y[matches[:, 1], :]

        if len(X) != len(Y):
            self.exit('Structures are of different lengths,' +
                      ' please specify an alignment',
                      ExitCodes.INPUT_ERROR)

        R, t = csb.bio.utils.bfit(X, Y, self.args.niter,
                self.args.scalemixture, self.args.em)

        m.transform(R, t)
        m.to_pdb(self.args.outfile)
Ejemplo n.º 10
0
    def testEnsemble(self):
        """
        Fit every model of 'ake-xray-ensemble-ca.pdb' to the average
        structure, using a ScaleMixture with a GammaPrior.

        After 200 rounds of alternating scale estimation and weighted
        fitting, the mixture must hold 211 scales. Every translation
        component must be within 2.0 of zero and the absolute value of
        every rotation component within 0.15 of the identity matrix.
        """
        from csb.bio.utils import fit, wfit

        pdbfile = self.config.getTestFile('ake-xray-ensemble-ca.pdb')
        models = LegacyStructureParser(pdbfile).parse_models()

        X = numpy.array([each.get_coordinates(['CA'], True) for each in models])
        x_mu = average_structure(X)
        m, n = X.shape[0], X.shape[1]

        # One rotation matrix and one translation vector per model
        R = numpy.zeros((m, 3, 3))
        t = numpy.ones((m, 3))

        mixture = ScaleMixture(scales=n, prior=GammaPrior(), d=3)

        def distances(index):
            # Per-row Euclidean norm of the residual between the average
            # structure and the transformed model at the given index
            residual = x_mu - numpy.dot(X[index], numpy.transpose(R[index])) - t[index]
            return numpy.sum(residual ** 2, -1) ** 0.5

        for index in range(m):
            R[index, :, :], t[index, :] = fit(x_mu, X[index])

        # gibbs sampling cycle
        for _cycle in range(200):
            # apply rotation
            data = numpy.array([distances(index) for index in range(m)]).T
            # sample scales
            mixture.estimate(data)
            # sample rotations
            for index in range(m):
                R[index, :, :], t[index, :] = wfit(x_mu, X[index], mixture.scales)

        self.assertEqual(mixture.scales.shape, (211,))

        expected_R = numpy.eye(3)
        expected_t = numpy.zeros((3,))

        for model_no in range(m):
            for row in range(3):
                self.assertAlmostEqual(t[model_no, row], expected_t[row], delta=2.)
                for col in range(3):
                    self.assertAlmostEqual(abs(R[model_no, row, col]),
                                           expected_R[row, col], delta=0.15)
Ejemplo n.º 11
0
    def testInvGammaMAP(self):
        """
        Fit model 13 onto model 0 of 'ake-xray-ensemble-ca.pdb', using a
        ScaleMixture with an InvGammaPrior and an InvGammaPosteriorMAP
        estimator.

        After 200 rounds of alternating scale estimation and weighted
        fitting, the mixture must hold 211 scales. Every translation
        component must be within 2.0 of zero and every rotation component
        within 0.1 of the identity matrix.
        """
        from csb.bio.utils import fit, wfit

        pdbfile = self.config.getTestFile('ake-xray-ensemble-ca.pdb')
        models = LegacyStructureParser(pdbfile).parse_models()

        X = numpy.array(models[0].get_coordinates(['CA'], True))
        Y = numpy.array(models[13].get_coordinates(['CA'], True))

        prior = InvGammaPrior()
        prior.estimator = InvGammaPosteriorMAP()
        mixture = ScaleMixture(scales=X.shape[0], prior=prior, d=3)

        R, t = fit(X, Y)

        # gibbs sampling cycle
        for _cycle in range(200):
            # apply rotation
            residual = X - numpy.dot(Y, numpy.transpose(R)) - t
            data = numpy.sum(residual ** 2, axis=-1) ** (1. / 2)
            # sample scales
            mixture.estimate(data)
            # sample rotations
            R, t = wfit(X, Y, mixture.scales)

        self.assertEqual(mixture.scales.shape, (211,))

        expected_R = numpy.eye(3)
        expected_t = numpy.zeros((3,))

        for row in range(3):
            self.assertAlmostEqual(t[row], expected_t[row], delta=2.)
            for col in range(3):
                self.assertAlmostEqual(expected_R[row, col], R[row, col], delta=1e-1)
Ejemplo n.º 12
0
    def main(self):
        """
        Fit the structure in C{self.args.pdb2} onto the one in
        C{self.args.pdb1} and write the transformed structure.

        The CA coordinates of chain C{self.args.chain1} (first file) and
        C{self.args.chain2} (second file) are passed to
        C{csb.bio.utils.bfit}. If C{self.args.alignment} is given, the
        alignment file is parsed and only the columns without gaps in its
        first two rows are used to pair up the coordinates. The resulting
        rotation and translation are applied to the second structure, which
        is written to C{self.args.outfile}.

        Exits with C{ExitCodes.IO_ERROR} if reading a PDB file raises
        IOError and with C{ExitCodes.INPUT_ERROR} if the two coordinate
        sets differ in length.
        """
        try:
            parser = LegacyStructureParser(self.args.pdb1)
            r = parser.parse()

            parser = LegacyStructureParser(self.args.pdb2)
            m = parser.parse()
        except IOError as e:
            # IOError has no 'value' attribute; str(e) gives the message
            self.exit('PDB file parsing failed\n' + str(e),
                      ExitCodes.IO_ERROR)

        X = numpy.array(r[self.args.chain1].get_coordinates(['CA'], True))
        Y = numpy.array(m[self.args.chain2].get_coordinates(['CA'], True))

        if self.args.alignment is not None:
            # open() instead of the Python 2-only file() builtin; the
            # context manager also closes the handle
            with open(self.args.alignment) as stream:
                align = SequenceAlignment.parse(stream.read())
            align = align[:2, :]

            matches = []
            # Alignment columns are addressed starting from 1
            for i in range(1, align.length + 1):
                if not align.gap_at(i):
                    matches.append([
                        align.columns[i][0].rank - 1,
                        align.columns[i][1].rank - 1
                    ])
            matches = numpy.array(matches)
            X = X[matches[:, 0], :]
            Y = Y[matches[:, 1], :]

        if len(X) != len(Y):
            self.exit(
                'Structures are of different lengths,' +
                ' please specify an alignment', ExitCodes.INPUT_ERROR)

        R, t = csb.bio.utils.bfit(X, Y, self.args.niter,
                                  self.args.scalemixture, self.args.em)

        m.transform(R, t)
        m.to_pdb(self.args.outfile)
Ejemplo n.º 13
0
 def setUp(self):
     """
     Locate the test file '1d3z.legacy.pdb' and create a
     LegacyStructureParser for it.
     """
     super(TestLegacyStructureParser, self).setUp()

     legacy_file = self.config.getTestFile('1d3z.legacy.pdb')

     self.pdb = legacy_file
     self.parser = LegacyStructureParser(legacy_file)
Ejemplo n.º 14
0
class TestLegacyStructureParser(test.Case):
    """
    Tests for LegacyStructureParser. Unless a test creates its own parser,
    it works on the parser built for '1d3z.legacy.pdb' in setUp().
    """

    def setUp(self):
        """
        Locate the test file '1d3z.legacy.pdb' and create a parser for it.
        """
        super(TestLegacyStructureParser, self).setUp()

        legacy_file = self.config.getTestFile('1d3z.legacy.pdb')

        self.pdb = legacy_file
        self.parser = LegacyStructureParser(legacy_file)

    def testParseModels(self):
        """
        parse_models() must return 10 models, the first having model ID 1.
        """
        parsed = self.parser.parse_models()

        self.assertEqual(parsed.models.length, 10)
        # Both access paths below are expected to yield the model with ID 1
        self.assertEqual(parsed[0].model_id, 1)
        self.assertEqual(parsed.models[1].model_id, 1)

    def testParseStructure(self):
        """
        Check model 1 at structure, chain, residue and atom level.
        """
        first_model = self.parser.parse(model=1)
        default_model = self.parser.parse_structure()

        self.assertEqual(default_model.model_id, 1)

        self.assertEqual(first_model.accession, '1d3z')
        self.assertEqual(first_model.model_id, 1)

        # Chain level
        chains = first_model.chains
        self.assertEqual(chains.length, 1)
        self.assertEqual(len(chains), 1)
        self.assertEqual(first_model.first_chain.molecule_id, '1')
        self.assertEqual(chains['A'].sequence, 'MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG')

        self.assertEqual(len(chains['A']), 76)
        self.assertEqual(len(first_model['A']), 76)

        # Residue level
        chain_a = first_model['A']
        self.assertEqual(len(chain_a[1:10]), 9)

        first_residue = chain_a[0]
        second_residue = chain_a[1]
        self.assertEqual(first_residue.type, SequenceAlphabets.Protein.MET)
        self.assertEqual(first_residue.label, 'MSE')
        self.assertEqual(second_residue.label, 'GLN')
        self.assertTrue(first_residue.is_modified)
        self.assertFalse(second_residue.is_modified)

        # Atom level
        self.assertEqual(second_residue.atoms['CA'].element, None)

        third_ca_element = chain_a[2].atoms['CA'].element
        self.assertNotEqual(third_ca_element, None)
        self.assertEqual(third_ca_element, ChemElements.C)

        expected_vector = [51.653, -89.304, 8.833]
        self.assertEqual(first_residue['CA'].vector.tolist(), expected_vector)

    def testParseResidue(self):
        """
        parse_residue('AGM') must return the name of ARG, with or without an
        explicit protein type, and must raise UnknownPDBResidueError when a
        nucleic acid type is requested.
        """
        arginine = SequenceAlphabets.Protein.ARG.name    #@UndefinedVariable
        parse = self.parser.parse_residue

        self.assertEqual(parse('AGM'), arginine)
        self.assertEqual(parse('AGM', as_type=SequenceTypes.Protein), arginine)
        self.assertRaises(UnknownPDBResidueError, parse, 'AGM', as_type=SequenceTypes.NucleicAcid)

    def testParseResidueSafe(self):
        """
        parse_residue_safe() must return the expected residue name for each
        combination of residue label and sequence type listed below.
        """
        cases = (
            ('AGM', None, SequenceAlphabets.Protein.ARG.name),                        #@UndefinedVariable
            ('AGM', SequenceTypes.Protein, SequenceAlphabets.Protein.ARG.name),       #@UndefinedVariable
            ('AGM', SequenceTypes.NucleicAcid, SequenceAlphabets.Nucleic.Any.name),   #@UndefinedVariable
            ('junk', SequenceTypes.Protein, SequenceAlphabets.Unknown.UNK.name),      #@UndefinedVariable
        )

        for label, sequence_type, expected in cases:
            self.assertEqual(self.parser.parse_residue_safe(label, as_type=sequence_type), expected)

    def testGuessSequenceType(self):
        """
        guess_sequence_type() must map 'AGM' to Protein and 'DOC' to
        NucleicAcid, and raise UnknownPDBResidueError for 'junk'.
        """
        guess = self.parser.guess_sequence_type

        self.assertEqual(guess('AGM'), SequenceTypes.Protein)
        self.assertEqual(guess('DOC'), SequenceTypes.NucleicAcid)
        self.assertRaises(UnknownPDBResidueError, guess, 'junk')

    def testFileName(self):
        """
        The parser must report the file name it was created with.
        """
        self.assertEqual(self.parser.filename, self.pdb)

    def testModels(self):
        """
        models() must return the model IDs 1 through 10.
        """
        expected_ids = list(range(1, 11))
        self.assertEqual(self.parser.models(), expected_ids)

    def testParseBiomolecule(self):
        """
        Biomolecule 2 of '3p1u.pdb' must hold exactly one chain with ID
        'B1'; requesting biomolecule 3 must raise KeyError.
        """
        parser = LegacyStructureParser(self.config.getTestFile('3p1u.pdb'))
        biomolecule = parser.parse_biomolecule(2)

        chain_count = len(biomolecule.chains)
        self.assertEqual(chain_count, 1)
        self.assertEqual(biomolecule.first_chain.id, 'B1')
        self.assertRaises(KeyError, parser.parse_biomolecule, 3)

    def testParseHetMolecules(self):
        """
        Write a single HETATM record to a temporary stream and check that
        parse_structure() raises HeaderFormatError for that file.
        """
        record = 'HETATM    1  NA  BLM A   1     -14.575  27.241   3.310  1.00  0.00           N '

        with self.config.getTempStream() as tmp:
            tmp.write(record)
            # Flush so that the record is on disk before the parser reads it
            tmp.flush()

            het_parser = LegacyStructureParser(tmp.name)
            self.assertRaises(HeaderFormatError, het_parser.parse_structure)
            # Drop the parser before the temporary stream is closed
            del het_parser