Example #1
0
 def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
     """Set up the RF-Score model and descriptors for the chosen variant.

     Parameters
     ----------
     protein : object, optional
         Receptor handed to the descriptor generators.

     n_jobs : int (default=-1)
         Forwarded to the random forest model as ``n_jobs``.

     version : int (default=1)
         Variant selector; 1, 2 and 3 are supported.

     spr : int (default=0)
         Stored as ``self.spr``; not used any further in this method.

     **kwargs
         Extra keyword arguments forwarded to ``randomforest``.

     Raises
     ------
     ValueError
         If ``version`` is not 1, 2 or 3.
     """
     self.protein = protein
     self.n_jobs = n_jobs
     self.version = version
     self.spr = spr
     model = randomforest(n_estimators=500,
                          oob_score=True,
                          n_jobs=n_jobs,
                          **kwargs)
     if version == 1:
         # Close contacts with a single distance cutoff.
         descriptors = close_contacts(protein,
                                      cutoff=12,
                                      protein_types=protein_atomic_nums,
                                      ligand_types=ligand_atomic_nums)
     elif version == 2:
         # Close contacts with an array of cutoffs instead of a scalar.
         descriptors = close_contacts(protein,
                                      cutoff=np.array([0, 2, 4, 6, 8, 10, 12]),
                                      protein_types=protein_atomic_nums,
                                      ligand_types=ligand_atomic_nums)
     elif version == 3:
         # Same contacts as v1, combined with the Vina descriptor.
         cc = close_contacts(protein,
                             cutoff=12,
                             protein_types=protein_atomic_nums,
                             ligand_types=ligand_atomic_nums)
         vina = autodock_vina_descriptor(protein)
         descriptors = ensemble_descriptor((vina, cc))
     else:
         # Without this branch ``descriptors`` stays unbound and the call
         # below dies with an unhelpful UnboundLocalError.
         raise ValueError('rfscore version must be 1, 2 or 3, got %r'
                          % (version,))
     super(rfscore, self).__init__(model,
                                   descriptors,
                                   score_title='rfscore_v%i' % self.version)
Example #2
0
 def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
     """Set up the RF-Score model and descriptors for the chosen variant.

     Parameters
     ----------
     protein : object, optional
         Receptor handed to the descriptor generators.

     n_jobs : int (default=-1)
         Forwarded to the random forest model as ``n_jobs``.

     version : int (default=1)
         Variant selector; 1, 2 and 3 are supported.

     spr : int (default=0)
         Stored as ``self.spr``; not used any further in this method.

     **kwargs
         Extra keyword arguments forwarded to ``randomforest``.

     Raises
     ------
     ValueError
         If ``version`` is not 1, 2 or 3.
     """
     self.protein = protein
     self.n_jobs = n_jobs
     self.version = version
     self.spr = spr
     model = randomforest(n_estimators=500,
                          oob_score=True,
                          n_jobs=n_jobs,
                          **kwargs)
     if version == 1:
         # Close contacts with a single distance cutoff.
         descriptors = close_contacts(protein,
                                      cutoff=12,
                                      protein_types=protein_atomic_nums,
                                      ligand_types=ligand_atomic_nums)
     elif version == 2:
         # Close contacts with an array of cutoffs instead of a scalar.
         descriptors = close_contacts(protein,
                                      cutoff=np.array([0, 2, 4, 6, 8, 10, 12]),
                                      protein_types=protein_atomic_nums,
                                      ligand_types=ligand_atomic_nums)
     elif version == 3:
         # Same contacts as v1, combined with the Vina descriptor.
         cc = close_contacts(protein,
                             cutoff=12,
                             protein_types=protein_atomic_nums,
                             ligand_types=ligand_atomic_nums)
         vina = autodock_vina_descriptor(protein)
         descriptors = ensemble_descriptor((vina, cc))
     else:
         # Without this branch ``descriptors`` stays unbound and the call
         # below dies with an unhelpful UnboundLocalError.
         raise ValueError('rfscore version must be 1, 2 or 3, got %r'
                          % (version,))
     # The title is the same for every variant in this revision.
     super(rfscore, self).__init__(model, descriptors, score_title='rfscore')
Example #3
0
 def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
     """Set up the RF-Score model and descriptors for the chosen variant.

     Parameters
     ----------
     protein : object, optional
         Receptor handed to the descriptor generators.

     n_jobs : int (default=-1)
         Forwarded to the random forest model as ``n_jobs``.

     version : int (default=1)
         Variant selector; 1, 2 and 3 are supported.

     spr : int (default=0)
         Stored as ``self.spr``; not used any further in this method.

     **kwargs
         Extra keyword arguments forwarded to ``randomforest``.

     Raises
     ------
     ValueError
         If ``version`` is not 1, 2 or 3.
     """
     self.protein = protein
     self.n_jobs = n_jobs
     self.version = version
     self.spr = spr
     # ``mtry`` is handed to the forest as ``max_features`` further down.
     if version == 1:
         mtry = 6
         descriptors = close_contacts_descriptor(
             protein,
             cutoff=12,
             protein_types=protein_atomic_nums,
             ligand_types=ligand_atomic_nums)
     elif version == 2:
         mtry = 14
         # An array of cutoffs instead of a single scalar.
         descriptors = close_contacts_descriptor(
             protein,
             cutoff=np.array([0, 2, 4, 6, 8, 10, 12]),
             protein_types=protein_atomic_nums,
             ligand_types=ligand_atomic_nums)
     elif version == 3:
         mtry = 6
         cc = close_contacts_descriptor(
             protein,
             cutoff=12,
             protein_types=protein_atomic_nums,
             ligand_types=ligand_atomic_nums)
         vina_scores = [
             'vina_gauss1', 'vina_gauss2', 'vina_repulsion',
             'vina_hydrophobic', 'vina_hydrogen', 'vina_num_rotors'
         ]
         vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
         descriptors = ensemble_descriptor((vina, cc))
     else:
         # Without this branch ``mtry`` and ``descriptors`` stay unbound and
         # the model construction dies with an UnboundLocalError.
         raise ValueError('rfscore version must be 1, 2 or 3, got %r'
                          % (version,))
     model = randomforest(n_estimators=500,
                          oob_score=True,
                          n_jobs=n_jobs,
                          max_features=mtry,
                          bootstrap=True,
                          min_samples_split=6,
                          **kwargs)
     super(rfscore, self).__init__(model,
                                   descriptors,
                                   score_title='rfscore_v%i' % self.version)
Example #4
0
def test_ensemble_descriptor():
    """Check an ensemble descriptor against its two members used alone."""
    # First ten molecules of the actives file, hydrogens added in place.
    ligands = list(oddt.toolkit.readfile('sdf', actives_sdf))[:10]
    for ligand in ligands:
        ligand.addh()

    receptor = next(oddt.toolkit.readfile('pdb', receptor_pdb))
    receptor.protein = True
    receptor.addh()

    rf_desc = rfscore(version=1).descriptor_generator
    vina_desc = oddt_vina_descriptor()
    combined = ensemble_descriptor((rf_desc, vina_desc))

    combined.set_protein(receptor)
    # The ensemble length is the sum of its members' lengths.
    assert len(combined) == len(rf_desc) + len(vina_desc)

    # Setting the protein on the ensemble must reach both members.
    assert rf_desc.protein == receptor
    assert vina_desc.protein == receptor

    # Build the ensemble first, then each member, and compare the ensemble
    # output with the members' outputs stacked column-wise.
    combined_scores = combined.build(ligands)
    rf_scores = rf_desc.build(ligands)
    vina_scores = vina_desc.build(ligands)
    expected = np.hstack((rf_scores, vina_scores))
    assert_array_almost_equal(combined_scores, expected)
Example #5
0
    def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
        """Scoring function implementing RF-Score variants. It predicts the
        binding affinity (pKi/d) of ligand in a complex utilizing simple
        descriptors (close contacts of atoms <12A) with sophisticated
        machine-learning model (random forest). The third variant supplements
        those contacts with Vina partial scores. For further details see
        RF-Score publications v1[1]_, v2[2]_, v3[3]_.


        Parameters
        ----------
        protein : oddt.toolkit.Molecule object
            Receptor for the scored ligands

        n_jobs: int (default=-1)
            Number of cores to use for scoring and training. By default (-1)
            all cores are allocated.

        version: int (default=1)
            Scoring function variant. The default is the simplest one (v1).
            Supported values are 1, 2 and 3.

        spr: int (default=0)
            The minimum number of contacts in each pair of atom types in
            the training set for the column to be included in training.
            This is a way of removal of not frequent and empty contacts.

        **kwargs
            Extra keyword arguments forwarded to ``randomforest``.

        Raises
        ------
        ValueError
            If ``version`` is not 1, 2 or 3.

        References
        ----------
        .. [1] Ballester PJ, Mitchell JBO. A machine learning approach to
            predicting protein-ligand binding affinity with applications to
            molecular docking. Bioinformatics. 2010;26: 1169-1175.
            doi:10.1093/bioinformatics/btq112

        .. [2] Ballester PJ, Schreyer A, Blundell TL. Does a more precise
            chemical description of protein-ligand complexes lead to more
            accurate prediction of binding affinity? J Chem Inf Model. 2014;54:
            944-955. doi:10.1021/ci500091r

        .. [3] Li H, Leung K-S, Wong M-H, Ballester PJ. Improving AutoDock Vina
            Using Random Forest: The Growing Accuracy of Binding Affinity
            Prediction by the Effective Exploitation of Larger Data Sets.
            Mol Inform. WILEY-VCH Verlag; 2015;34: 115-126.
            doi:10.1002/minf.201400132

        """
        self.protein = protein
        self.n_jobs = n_jobs
        self.version = version
        self.spr = spr
        # ``mtry`` is handed to the forest as ``max_features`` further down.
        if version == 1:
            mtry = 6
            descriptors = close_contacts_descriptor(
                protein,
                cutoff=12,
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
        elif version == 2:
            mtry = 14
            # An array of cutoffs instead of a single scalar.
            descriptors = close_contacts_descriptor(
                protein,
                cutoff=np.array([0, 2, 4, 6, 8, 10, 12]),
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
        elif version == 3:
            mtry = 6
            cc = close_contacts_descriptor(
                protein,
                cutoff=12,
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
            vina_scores = ['vina_gauss1',
                           'vina_gauss2',
                           'vina_repulsion',
                           'vina_hydrophobic',
                           'vina_hydrogen',
                           'vina_num_rotors']
            vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
            descriptors = ensemble_descriptor((vina, cc))
        else:
            # Without this branch ``mtry`` and ``descriptors`` stay unbound
            # and the model construction dies with an UnboundLocalError.
            raise ValueError('rfscore version must be 1, 2 or 3, got %r'
                             % (version,))
        model = randomforest(n_estimators=500,
                             oob_score=True,
                             n_jobs=n_jobs,
                             max_features=mtry,
                             bootstrap=True,
                             min_samples_split=6,
                             **kwargs)
        super(rfscore, self).__init__(model, descriptors,
                                      score_title='rfscore_v%i' % self.version)
Example #6
0
    def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
        """Scoring function implementing RF-Score variants. It predicts the
        binding affinity (pKi/d) of ligand in a complex utilizing simple
        descriptors (close contacts of atoms <12A) with sophisticated
        machine-learning model (random forest). The third variant supplements
        those contacts with Vina partial scores. For further details see
        RF-Score publications v1[1]_, v2[2]_, v3[3]_.


        Parameters
        ----------
        protein : oddt.toolkit.Molecule object
            Receptor for the scored ligands

        n_jobs: int (default=-1)
            Number of cores to use for scoring and training. By default (-1)
            all cores are allocated.

        version: int (default=1)
            Scoring function variant. The default is the simplest one (v1).
            Supported values are 1, 2 and 3.

        spr: int (default=0)
            The minimum number of contacts in each pair of atom types in
            the training set for the column to be included in training.
            This is a way of removal of not frequent and empty contacts.

        **kwargs
            Extra keyword arguments forwarded to ``randomforest``.

        Raises
        ------
        ValueError
            If ``version`` is not 1, 2 or 3.

        References
        ----------
        .. [1] Ballester PJ, Mitchell JBO. A machine learning approach to
            predicting protein-ligand binding affinity with applications to
            molecular docking. Bioinformatics. 2010;26: 1169-1175.
            doi:10.1093/bioinformatics/btq112

        .. [2] Ballester PJ, Schreyer A, Blundell TL. Does a more precise
            chemical description of protein-ligand complexes lead to more
            accurate prediction of binding affinity? J Chem Inf Model. 2014;54:
            944-955. doi:10.1021/ci500091r

        .. [3] Li H, Leung K-S, Wong M-H, Ballester PJ. Improving AutoDock Vina
            Using Random Forest: The Growing Accuracy of Binding Affinity
            Prediction by the Effective Exploitation of Larger Data Sets.
            Mol Inform. WILEY-VCH Verlag; 2015;34: 115-126.
            doi:10.1002/minf.201400132

        """
        self.protein = protein
        self.n_jobs = n_jobs
        self.version = version
        self.spr = spr
        # ``mtry`` is handed to the forest as ``max_features`` further down.
        if version == 1:
            mtry = 6
            descriptors = close_contacts_descriptor(
                protein,
                cutoff=12,
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
        elif version == 2:
            mtry = 14
            # An array of cutoffs instead of a single scalar.
            descriptors = close_contacts_descriptor(
                protein,
                cutoff=np.array([0, 2, 4, 6, 8, 10, 12]),
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
        elif version == 3:
            mtry = 6
            cc = close_contacts_descriptor(protein,
                                           cutoff=12,
                                           protein_types=protein_atomic_nums,
                                           ligand_types=ligand_atomic_nums)
            vina_scores = [
                'vina_gauss1', 'vina_gauss2', 'vina_repulsion',
                'vina_hydrophobic', 'vina_hydrogen', 'vina_num_rotors'
            ]
            vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
            descriptors = ensemble_descriptor((vina, cc))
        # Disabled variant, kept for reference; its settings match the
        # version 2 branch above.
        # elif version == 5:
        #     cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
        #     mtry = 14
        #     descriptors = close_contacts_descriptor(
        #         protein,
        #         cutoff=cutoff,
        #         protein_types=protein_atomic_nums,
        #         ligand_types=ligand_atomic_nums)
        else:
            # Without this branch ``mtry`` and ``descriptors`` stay unbound
            # and the model construction dies with an UnboundLocalError.
            raise ValueError('rfscore version must be 1, 2 or 3, got %r'
                             % (version,))
        model = randomforest(n_estimators=500,
                             oob_score=True,
                             n_jobs=n_jobs,
                             max_features=mtry,
                             bootstrap=True,
                             min_samples_split=6,
                             **kwargs)
        super(rfscore, self).__init__(model,
                                      descriptors,
                                      score_title='rfscore_v%i' % self.version)