Esempio n. 1
0
def test_internal_vina():
    """Compare internal vs original Vina partial scores.

    Builds the ODDT Vina partial-score descriptors for the docked XIAP
    actives and checks them, to 4 decimal places, against the values stored
    in ``data/results/xiap/autodock_vina_scores.csv``.
    """
    mols = list(
        oddt.toolkit.readfile(
            'sdf',
            os.path.join(test_data_dir, 'data/dude/xiap/actives_docked.sdf')))
    # addh() is called purely for its side effect, so use a plain loop
    # instead of building a throwaway list with map().
    for mol in mols:
        mol.addh()

    rec = next(
        oddt.toolkit.readfile(
            'pdb',
            os.path.join(test_data_dir, 'data/dude/xiap/receptor_rdkit.pdb')))
    rec.protein = True
    rec.addh()

    # Delete molecule which has differences in Acceptor-Donor def in RDK and OB
    del mols[65]

    vina_scores = [
        'vina_gauss1', 'vina_gauss2', 'vina_repulsion', 'vina_hydrophobic',
        'vina_hydrogen'
    ]
    autodock_vina_results = np.loadtxt(
        os.path.join(test_data_dir,
                     'data/results/xiap/autodock_vina_scores.csv'),
        delimiter=',',
        dtype=np.float64)
    oddt_vina_results = oddt_vina_descriptor(
        protein=rec, vina_scores=vina_scores).build(mols)
    assert_array_almost_equal(oddt_vina_results,
                              autodock_vina_results,
                              decimal=4)
Esempio n. 2
0
    def __init__(self, protein=None):
        """Descriptor built from the binana script (as used in NNScore 2.0).

        Creates three sub-generators stored on the instance: Vina scores
        (``self.vina``) and two close-contact generators with cutoffs 4
        (``self.cc_4``) and 2.5 (``self.cc_25``).

        Parameters
        ----------
            protein: oddt.toolkit.Molecule object (default=None)
                Protein object to be used while generating descriptors.
        """
        self.protein = protein
        self.vina = oddt_vina_descriptor(
            protein,
            vina_scores=['vina_affinity', 'vina_gauss1', 'vina_gauss2',
                         'vina_repulsion', 'vina_hydrophobic',
                         'vina_hydrogen'])

        # Close contacts descriptor generators.
        # Each pair is split below: the first element of every pair goes to
        # ``protein_types`` and the second to ``ligand_types``.
        cc_4_types = (
            ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
            ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
            ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'), ('BR', 'C'), ('BR', 'HD'),
            ('BR', 'OA'), ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'),
            ('C', 'MG'), ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'),
            ('C', 'SA'), ('C', 'ZN'), ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'),
            ('CL', 'N'), ('CL', 'OA'), ('CL', 'ZN'), ('F', 'HD'), ('F', 'N'),
            ('F', 'OA'), ('F', 'SA'), ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'),
            ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'),
            ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'),
            ('HD', 'ZN'), ('MG', 'NA'), ('MG', 'OA'), ('MN', 'N'), ('MN', 'OA'),
            ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'),
            ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'), ('OA', 'OA'), ('OA', 'SA'),
            ('OA', 'ZN'), ('S', 'ZN'), ('SA', 'ZN'), ('A', 'BR'), ('A', 'I'),
            ('A', 'P'), ('A', 'S'), ('BR', 'N'), ('BR', 'SA'), ('C', 'FE'),
            ('C', 'I'), ('C', 'P'), ('C', 'S'), ('CL', 'MN'), ('CL', 'NA'),
            ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'), ('CU', 'HD'), ('CU', 'N'),
            ('FE', 'NA'), ('FE', 'SA'), ('I', 'N'), ('I', 'OA'), ('MG', 'N'),
            ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'), ('MN', 'NA'), ('MN', 'P'),
            ('MN', 'S'), ('MN', 'SA'), ('N', 'P'), ('N', 'S'), ('NA', 'P'),
            ('NA', 'S'), ('OA', 'P'), ('OA', 'S'), ('P', 'S'), ('P', 'SA'),
            ('P', 'ZN'), ('S', 'SA'), ('SA', 'SA'), ('A', 'CU'), ('C', 'CD'),
        )
        cc_4_rec_types, cc_4_lig_types = zip(*cc_4_types)
        self.cc_4 = close_contacts(protein,
                                   cutoff=4,
                                   protein_types=cc_4_rec_types,
                                   ligand_types=cc_4_lig_types,
                                   mode='atom_types_ad4',
                                   aligned_pairs=True)

        cc_25_types = [
            ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
            ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
            ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'), ('BR', 'C'), ('BR', 'HD'),
            ('BR', 'OA'), ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'),
            ('C', 'MG'), ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'),
            ('C', 'SA'), ('C', 'ZN'), ('CD', 'OA'), ('CL', 'FE'), ('CL', 'HD'),
            ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'), ('CL', 'ZN'), ('F', 'HD'),
            ('F', 'N'), ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'), ('FE', 'HD'),
            ('FE', 'N'), ('FE', 'OA'), ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'),
            ('HD', 'MN'), ('HD', 'N'), ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'),
            ('HD', 'S'), ('HD', 'SA'), ('HD', 'ZN'), ('MG', 'NA'), ('MG', 'OA'),
            ('MN', 'N'), ('MN', 'OA'), ('N', 'N'), ('N', 'NA'), ('N', 'OA'),
            ('N', 'SA'), ('N', 'ZN'), ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'),
            ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'), ('S', 'ZN'), ('SA', 'ZN'),
        ]
        cc_25_rec_types, cc_25_lig_types = zip(*cc_25_types)
        self.cc_25 = close_contacts(protein,
                                    cutoff=2.5,
                                    protein_types=cc_25_rec_types,
                                    ligand_types=cc_25_lig_types,
                                    mode='atom_types_ad4',
                                    aligned_pairs=True)
Esempio n. 3
0
 def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
     """Set up an RF-Score scoring function for the requested variant.

     Parameters
     ----------
     protein : object (default=None)
         Receptor handed to the descriptor generators.

     n_jobs : int (default=-1)
         Forwarded to the random forest model as ``n_jobs``.

     version : int (default=1)
         Variant to build. Versions 1 and 2 use close-contact descriptors
         only (a single cutoff of 12 vs. cutoffs 0, 2, ..., 12); version 3
         combines Vina partial scores with the version-1 close contacts.

     spr : int (default=0)
         Stored on the instance; not used further by this constructor.

     **kwargs
         Extra keyword arguments forwarded to the random forest model.

     Raises
     ------
     ValueError
         If ``version`` is not 1, 2 or 3.
     """
     self.protein = protein
     self.n_jobs = n_jobs
     self.version = version
     self.spr = spr
     if version == 1:
         cutoff = 12
         mtry = 6
         descriptors = close_contacts_descriptor(
             protein,
             cutoff=cutoff,
             protein_types=protein_atomic_nums,
             ligand_types=ligand_atomic_nums)
     elif version == 2:
         cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
         mtry = 14
         descriptors = close_contacts_descriptor(
             protein,
             cutoff=cutoff,
             protein_types=protein_atomic_nums,
             ligand_types=ligand_atomic_nums)
     elif version == 3:
         cutoff = 12
         mtry = 6
         cc = close_contacts_descriptor(protein,
                                        cutoff=cutoff,
                                        protein_types=protein_atomic_nums,
                                        ligand_types=ligand_atomic_nums)
         vina_scores = [
             'vina_gauss1', 'vina_gauss2', 'vina_repulsion',
             'vina_hydrophobic', 'vina_hydrogen', 'vina_num_rotors'
         ]
         vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
         descriptors = ensemble_descriptor((vina, cc))
     else:
         # Without this branch ``mtry`` and ``descriptors`` stay unbound and
         # the constructor fails below with an opaque UnboundLocalError.
         raise ValueError('Unsupported RF-Score version: %r '
                          '(expected 1, 2 or 3)' % (version,))
     model = randomforest(n_estimators=500,
                          oob_score=True,
                          n_jobs=n_jobs,
                          max_features=mtry,
                          bootstrap=True,
                          min_samples_split=6,
                          **kwargs)
     super(rfscore, self).__init__(model,
                                   descriptors,
                                   score_title='rfscore_v%i' % self.version)
Esempio n. 4
0
def test_ensemble_descriptor():
    """Check that an ensemble descriptor concatenates its members.

    Combines the RF-Score v1 descriptor generator with the Vina descriptor
    and verifies the ensemble's length, that ``set_protein`` reaches both
    members, and that the built matrix equals the members' outputs stacked
    horizontally.
    """
    mols = list(oddt.toolkit.readfile('sdf', actives_sdf))[:10]
    # addh() is called purely for its side effect, so use a plain loop
    # instead of building a throwaway list with map().
    for mol in mols:
        mol.addh()

    rec = next(oddt.toolkit.readfile('pdb', receptor_pdb))
    rec.protein = True
    rec.addh()

    desc1 = rfscore(version=1).descriptor_generator
    desc2 = oddt_vina_descriptor()
    ensemble = ensemble_descriptor((desc1, desc2))

    ensemble.set_protein(rec)
    assert len(ensemble) == len(desc1) + len(desc2)

    # The protein set on the ensemble must be visible on each member
    assert desc1.protein == rec
    assert desc2.protein == rec

    ensemble_scores = ensemble.build(mols)
    scores1 = desc1.build(mols)
    scores2 = desc2.build(mols)
    assert_array_almost_equal(ensemble_scores, np.hstack((scores1, scores2)))
Esempio n. 5
0
    def __init__(self, protein=None):
        """Set up the binana-style descriptor (as used in NNScore 2.0).

        Creates the Vina and close-contact sub-generators and fills
        ``self.titles`` with one label per descriptor column.

        Parameters
        ----------
            protein: oddt.toolkit.Molecule object (default=None)
                Protein object to be used while generating descriptors.
        """
        self.protein = protein
        self.titles = []

        # Vina partial scores open the title list.
        self.vina = oddt_vina_descriptor(
            protein,
            vina_scores=[
                'vina_gauss1',
                'vina_gauss2',
                'vina_repulsion',
                'vina_hydrophobic',
                'vina_hydrogen',
            ])
        self.titles.extend(self.vina.titles)

        # Close contacts, cutoff=4. The first element of every pair goes to
        # ``protein_types`` and the second to ``ligand_types``.
        pairs_4a = (
            ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
            ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
            ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'), ('BR', 'C'), ('BR', 'HD'),
            ('BR', 'OA'), ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'),
            ('C', 'MG'), ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'),
            ('C', 'SA'), ('C', 'ZN'), ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'),
            ('CL', 'N'), ('CL', 'OA'), ('CL', 'ZN'), ('F', 'HD'), ('F', 'N'),
            ('F', 'OA'), ('F', 'SA'), ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'),
            ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'),
            ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'),
            ('HD', 'ZN'), ('MG', 'NA'), ('MG', 'OA'), ('MN', 'N'), ('MN', 'OA'),
            ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'),
            ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'), ('OA', 'OA'), ('OA', 'SA'),
            ('OA', 'ZN'), ('S', 'ZN'), ('SA', 'ZN'), ('A', 'BR'), ('A', 'I'),
            ('A', 'P'), ('A', 'S'), ('BR', 'N'), ('BR', 'SA'), ('C', 'FE'),
            ('C', 'I'), ('C', 'P'), ('C', 'S'), ('CL', 'MN'), ('CL', 'NA'),
            ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'), ('CU', 'HD'), ('CU', 'N'),
            ('FE', 'NA'), ('FE', 'SA'), ('I', 'N'), ('I', 'OA'), ('MG', 'N'),
            ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'), ('MN', 'NA'), ('MN', 'P'),
            ('MN', 'S'), ('MN', 'SA'), ('N', 'P'), ('N', 'S'), ('NA', 'P'),
            ('NA', 'S'), ('OA', 'P'), ('OA', 'S'), ('P', 'S'), ('P', 'SA'),
            ('P', 'ZN'), ('S', 'SA'), ('SA', 'SA'), ('A', 'CU'), ('C', 'CD'),
        )
        rec_types_4a, lig_types_4a = zip(*pairs_4a)
        self.titles.extend('cc_%s.%s_4' % pair for pair in pairs_4a)
        self.cc_4 = close_contacts_descriptor(
            protein,
            cutoff=4,
            protein_types=rec_types_4a,
            ligand_types=lig_types_4a,
            mode='atom_types_ad4',
            aligned_pairs=True)

        # Atom-type pairs labelled with the 'ele_' prefix.
        self.ele_types = (
            ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
            ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
            ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'), ('BR', 'C'), ('BR', 'HD'),
            ('BR', 'OA'), ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'),
            ('C', 'MG'), ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'),
            ('C', 'SA'), ('C', 'ZN'), ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'),
            ('CL', 'N'), ('CL', 'OA'), ('CL', 'ZN'), ('F', 'HD'), ('F', 'N'),
            ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'), ('FE', 'HD'), ('FE', 'N'),
            ('FE', 'OA'), ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'),
            ('HD', 'N'), ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'),
            ('HD', 'SA'), ('HD', 'ZN'), ('MG', 'NA'), ('MG', 'OA'), ('MN', 'N'),
            ('MN', 'OA'), ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'),
            ('N', 'ZN'), ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'), ('OA', 'OA'),
            ('OA', 'SA'), ('OA', 'ZN'), ('S', 'ZN'), ('SA', 'ZN'), ('A', 'BR'),
            ('A', 'I'), ('A', 'P'), ('A', 'S'), ('BR', 'N'), ('BR', 'SA'),
            ('C', 'FE'), ('C', 'I'), ('C', 'P'), ('C', 'S'), ('CL', 'MN'),
            ('CL', 'NA'), ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'), ('CU', 'HD'),
            ('CU', 'N'), ('FE', 'NA'), ('FE', 'SA'), ('I', 'N'), ('I', 'OA'),
            ('MG', 'N'), ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'), ('MN', 'NA'),
            ('MN', 'P'), ('MN', 'S'), ('MN', 'SA'), ('N', 'P'), ('N', 'S'),
            ('NA', 'P'), ('NA', 'S'), ('OA', 'P'), ('OA', 'S'), ('P', 'S'),
            ('P', 'SA'), ('P', 'ZN'), ('S', 'SA'), ('SA', 'SA'),
        )
        self.titles.extend('ele_%s.%s_4' % pair for pair in self.ele_types)

        # Ligand atom types, one 'lig_' title each.
        self.ligand_atom_types = [
            'A', 'BR', 'C', 'CL', 'F', 'HD', 'I',
            'N', 'NA', 'OA', 'P', 'S', 'SA',
        ]
        self.titles.extend(
            'lig_%s' % atom_type for atom_type in self.ligand_atom_types)

        # Close contacts, cutoff=2.5.
        pairs_25a = [
            ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
            ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
            ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'), ('BR', 'C'), ('BR', 'HD'),
            ('BR', 'OA'), ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'),
            ('C', 'MG'), ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'),
            ('C', 'SA'), ('C', 'ZN'), ('CD', 'OA'), ('CL', 'FE'), ('CL', 'HD'),
            ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'), ('CL', 'ZN'), ('F', 'HD'),
            ('F', 'N'), ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'), ('FE', 'HD'),
            ('FE', 'N'), ('FE', 'OA'), ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'),
            ('HD', 'MN'), ('HD', 'N'), ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'),
            ('HD', 'S'), ('HD', 'SA'), ('HD', 'ZN'), ('MG', 'NA'), ('MG', 'OA'),
            ('MN', 'N'), ('MN', 'OA'), ('N', 'N'), ('N', 'NA'), ('N', 'OA'),
            ('N', 'SA'), ('N', 'ZN'), ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'),
            ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'), ('S', 'ZN'), ('SA', 'ZN'),
        ]
        rec_types_25a, lig_types_25a = zip(*pairs_25a)
        self.cc_25 = close_contacts_descriptor(
            protein,
            cutoff=2.5,
            protein_types=rec_types_25a,
            ligand_types=lig_types_25a,
            mode='atom_types_ad4',
            aligned_pairs=True)
        self.titles.extend('cc_%s.%s_2.5' % pair for pair in pairs_25a)

        # Titles for the remaining interaction descriptors, appended in order.
        self.titles.extend([
            # H-Bonds (<4A)
            'hb_4_mol_backbone_alpha',
            'hb_4_mol_backbone_beta',
            'hb_4_mol_backbone_other',
            'hb_4_mol_sidechain_alpha',
            'hb_4_mol_sidechain_beta',
            'hb_4_mol_sidechain_other',
            'hb_4_rec_backbone_alpha',
            'hb_4_rec_backbone_beta',
            'hb_4_rec_backbone_other',
            'hb_4_rec_sidechain_alpha',
            'hb_4_rec_sidechain_beta',
            'hb_4_rec_sidechain_other',
            # Hydrophobic Contact <4A
            'hyd_4_backbone_alpha',
            'hyd_4_backbone_beta',
            'hyd_4_backbone_other',
            'hyd_4_sidechain_alpha',
            'hyd_4_sidechain_beta',
            'hyd_4_sidechain_other',
            'hyd_4_all',
            # Pi-stacking (<7.5A)
            'pi_stack_7.5_alpha',
            'pi_stack_7.5_beta',
            'pi_stack_7.5_other',
            # T-shaped Pi-Pi interaction
            'pi_t_7.5_alpha',
            'pi_t_7.5_beta',
            'pi_t_7.5_other',
            # Pi-cation (<6A)
            'pi_cat_mol_6_alpha',
            'pi_cat_mol_6_beta',
            'pi_cat_mol_6_other',
            'pi_cat_rec_6_alpha',
            'pi_cat_rec_6_beta',
            'pi_cat_rec_6_other',
            # Active site flexibility (<4A)
            'as_flex_backbone_alpha',
            'as_flex_backbone_beta',
            'as_flex_backbone_other',
            'as_flex_sidechain_alpha',
            'as_flex_sidechain_beta',
            'as_flex_sidechain_other',
            'as_flex_all',
            # Salt bridges (<5.5)
            'salt_bridge_5.5_alpha',
            'salt_bridge_5.5_beta',
            'salt_bridge_5.5_other',
            'salt_bridge_5.5_all',
            # Rotatable bonds
            'num_rotors',
        ])

        # The number of titles must match the length reported by the object.
        assert len(self.titles) == len(self)
Esempio n. 6
0
    def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
        """Scoring function implementing RF-Score variants.

        It predicts the binding affinity (pKi/d) of a ligand in a complex
        utilizing simple descriptors (close contacts of atoms <12A) with a
        sophisticated machine-learning model (random forest). The third
        variant supplements those contacts with Vina partial scores. For
        further details see RF-Score publications v1 [1]_, v2 [2]_, v3 [3]_.

        Parameters
        ----------
        protein : oddt.toolkit.Molecule object
            Receptor for the scored ligands

        n_jobs: int (default=-1)
            Number of cores to use for scoring and training. By default (-1)
            all cores are allocated.

        version: int (default=1)
            Scoring function variant (1, 2 or 3). The default is the
            simplest one (v1).

        spr: int (default=0)
            The minimum number of contacts in each pair of atom types in
            the training set for the column to be included in training.
            This is a way of removing infrequent and empty contacts.

        **kwargs
            Extra keyword arguments forwarded to the random forest model.

        Raises
        ------
        ValueError
            If ``version`` is not 1, 2 or 3.

        References
        ----------
        .. [1] Ballester PJ, Mitchell JBO. A machine learning approach to
            predicting protein-ligand binding affinity with applications to
            molecular docking. Bioinformatics. 2010;26: 1169-1175.
            doi:10.1093/bioinformatics/btq112

        .. [2] Ballester PJ, Schreyer A, Blundell TL. Does a more precise
            chemical description of protein-ligand complexes lead to more
            accurate prediction of binding affinity? J Chem Inf Model. 2014;54:
            944-955. doi:10.1021/ci500091r

        .. [3] Li H, Leung K-S, Wong M-H, Ballester PJ. Improving AutoDock Vina
            Using Random Forest: The Growing Accuracy of Binding Affinity
            Prediction by the Effective Exploitation of Larger Data Sets.
            Mol Inform. WILEY-VCH Verlag; 2015;34: 115-126.
            doi:10.1002/minf.201400132

        """
        self.protein = protein
        self.n_jobs = n_jobs
        self.version = version
        self.spr = spr
        if version == 1:
            cutoff = 12
            mtry = 6
            descriptors = close_contacts_descriptor(
                protein,
                cutoff=cutoff,
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
        elif version == 2:
            cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
            mtry = 14
            descriptors = close_contacts_descriptor(
                protein,
                cutoff=cutoff,
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
        elif version == 3:
            cutoff = 12
            mtry = 6
            cc = close_contacts_descriptor(
                protein,
                cutoff=cutoff,
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
            vina_scores = ['vina_gauss1',
                           'vina_gauss2',
                           'vina_repulsion',
                           'vina_hydrophobic',
                           'vina_hydrogen',
                           'vina_num_rotors']
            vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
            descriptors = ensemble_descriptor((vina, cc))
        else:
            # Without this branch ``mtry`` and ``descriptors`` stay unbound
            # and the constructor fails below with an UnboundLocalError.
            raise ValueError('Unsupported RF-Score version: %r '
                             '(expected 1, 2 or 3)' % (version,))
        model = randomforest(n_estimators=500,
                             oob_score=True,
                             n_jobs=n_jobs,
                             max_features=mtry,
                             bootstrap=True,
                             min_samples_split=6,
                             **kwargs)
        super(rfscore, self).__init__(model, descriptors,
                                      score_title='rfscore_v%i' % self.version)
Esempio n. 7
0
    def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
        """Scoring function implementing RF-Score variants.

        It predicts the binding affinity (pKi/d) of a ligand in a complex
        utilizing simple descriptors (close contacts of atoms <12A) with a
        sophisticated machine-learning model (random forest). The third
        variant supplements those contacts with Vina partial scores. For
        further details see RF-Score publications v1 [1]_, v2 [2]_, v3 [3]_.

        Parameters
        ----------
        protein : oddt.toolkit.Molecule object
            Receptor for the scored ligands

        n_jobs: int (default=-1)
            Number of cores to use for scoring and training. By default (-1)
            all cores are allocated.

        version: int (default=1)
            Scoring function variant (1, 2 or 3). The default is the
            simplest one (v1).

        spr: int (default=0)
            The minimum number of contacts in each pair of atom types in
            the training set for the column to be included in training.
            This is a way of removing infrequent and empty contacts.

        **kwargs
            Extra keyword arguments forwarded to the random forest model.

        Raises
        ------
        ValueError
            If ``version`` is not 1, 2 or 3.

        References
        ----------
        .. [1] Ballester PJ, Mitchell JBO. A machine learning approach to
            predicting protein-ligand binding affinity with applications to
            molecular docking. Bioinformatics. 2010;26: 1169-1175.
            doi:10.1093/bioinformatics/btq112

        .. [2] Ballester PJ, Schreyer A, Blundell TL. Does a more precise
            chemical description of protein-ligand complexes lead to more
            accurate prediction of binding affinity? J Chem Inf Model. 2014;54:
            944-955. doi:10.1021/ci500091r

        .. [3] Li H, Leung K-S, Wong M-H, Ballester PJ. Improving AutoDock Vina
            Using Random Forest: The Growing Accuracy of Binding Affinity
            Prediction by the Effective Exploitation of Larger Data Sets.
            Mol Inform. WILEY-VCH Verlag; 2015;34: 115-126.
            doi:10.1002/minf.201400132

        """
        self.protein = protein
        self.n_jobs = n_jobs
        self.version = version
        self.spr = spr
        if version == 1:
            cutoff = 12
            mtry = 6
            descriptors = close_contacts_descriptor(
                protein,
                cutoff=cutoff,
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
        elif version == 2:
            cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
            mtry = 14
            descriptors = close_contacts_descriptor(
                protein,
                cutoff=cutoff,
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
        elif version == 3:
            cutoff = 12
            mtry = 6
            cc = close_contacts_descriptor(protein,
                                           cutoff=cutoff,
                                           protein_types=protein_atomic_nums,
                                           ligand_types=ligand_atomic_nums)
            vina_scores = [
                'vina_gauss1', 'vina_gauss2', 'vina_repulsion',
                'vina_hydrophobic', 'vina_hydrogen', 'vina_num_rotors'
            ]
            vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
            descriptors = ensemble_descriptor((vina, cc))
        else:
            # Without this branch ``mtry`` and ``descriptors`` stay unbound
            # and the constructor fails below with an UnboundLocalError.
            raise ValueError('Unsupported RF-Score version: %r '
                             '(expected 1, 2 or 3)' % (version,))
        model = randomforest(n_estimators=500,
                             oob_score=True,
                             n_jobs=n_jobs,
                             max_features=mtry,
                             bootstrap=True,
                             min_samples_split=6,
                             **kwargs)
        super(rfscore, self).__init__(model,
                                      descriptors,
                                      score_title='rfscore_v%i' % self.version)
Esempio n. 8
0
    def __init__(self, protein=None):
        """Set up the binana-style descriptor (as used in NNScore 2.0).

        Creates the Vina and close-contact sub-generators and fills
        ``self.titles`` with one label per descriptor column.

        Parameters
        ----------
        protein: oddt.toolkit.Molecule object (default=None)
            Protein object to be used while generating descriptors.
        """
        self.protein = protein
        self.titles = []

        # Vina partial scores open the title list.
        vina_terms = [
            'vina_gauss1',
            'vina_gauss2',
            'vina_repulsion',
            'vina_hydrophobic',
            'vina_hydrogen',
        ]
        self.vina = oddt_vina_descriptor(protein, vina_scores=vina_terms)
        self.titles.extend(self.vina.titles)

        # Close contacts, cutoff=4. The first element of every pair goes to
        # ``protein_types`` and the second to ``ligand_types``.
        contact_pairs_4 = (
            ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
            ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
            ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'), ('BR', 'C'), ('BR', 'HD'),
            ('BR', 'OA'), ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'),
            ('C', 'MG'), ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'),
            ('C', 'SA'), ('C', 'ZN'), ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'),
            ('CL', 'N'), ('CL', 'OA'), ('CL', 'ZN'), ('F', 'HD'), ('F', 'N'),
            ('F', 'OA'), ('F', 'SA'), ('FE', 'HD'), ('FE', 'N'), ('FE', 'OA'),
            ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'), ('HD', 'N'),
            ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'), ('HD', 'SA'),
            ('HD', 'ZN'), ('MG', 'NA'), ('MG', 'OA'), ('MN', 'N'), ('MN', 'OA'),
            ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'), ('N', 'ZN'),
            ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'), ('OA', 'OA'), ('OA', 'SA'),
            ('OA', 'ZN'), ('S', 'ZN'), ('SA', 'ZN'), ('A', 'BR'), ('A', 'I'),
            ('A', 'P'), ('A', 'S'), ('BR', 'N'), ('BR', 'SA'), ('C', 'FE'),
            ('C', 'I'), ('C', 'P'), ('C', 'S'), ('CL', 'MN'), ('CL', 'NA'),
            ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'), ('CU', 'HD'), ('CU', 'N'),
            ('FE', 'NA'), ('FE', 'SA'), ('I', 'N'), ('I', 'OA'), ('MG', 'N'),
            ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'), ('MN', 'NA'), ('MN', 'P'),
            ('MN', 'S'), ('MN', 'SA'), ('N', 'P'), ('N', 'S'), ('NA', 'P'),
            ('NA', 'S'), ('OA', 'P'), ('OA', 'S'), ('P', 'S'), ('P', 'SA'),
            ('P', 'ZN'), ('S', 'SA'), ('SA', 'SA'), ('A', 'CU'), ('C', 'CD'),
        )
        receptor_types_4, ligand_types_4 = zip(*contact_pairs_4)
        self.titles.extend(
            'cc_%s.%s_4' % (rec_t, lig_t) for rec_t, lig_t in contact_pairs_4)
        self.cc_4 = close_contacts_descriptor(
            protein,
            cutoff=4,
            protein_types=receptor_types_4,
            ligand_types=ligand_types_4,
            mode='atom_types_ad4',
            aligned_pairs=True)

        # Atom-type pairs labelled with the 'ele_' prefix.
        self.ele_types = (
            ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
            ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
            ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'), ('BR', 'C'), ('BR', 'HD'),
            ('BR', 'OA'), ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'),
            ('C', 'MG'), ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'),
            ('C', 'SA'), ('C', 'ZN'), ('CL', 'FE'), ('CL', 'HD'), ('CL', 'MG'),
            ('CL', 'N'), ('CL', 'OA'), ('CL', 'ZN'), ('F', 'HD'), ('F', 'N'),
            ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'), ('FE', 'HD'), ('FE', 'N'),
            ('FE', 'OA'), ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'), ('HD', 'MN'),
            ('HD', 'N'), ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'), ('HD', 'S'),
            ('HD', 'SA'), ('HD', 'ZN'), ('MG', 'NA'), ('MG', 'OA'), ('MN', 'N'),
            ('MN', 'OA'), ('N', 'N'), ('N', 'NA'), ('N', 'OA'), ('N', 'SA'),
            ('N', 'ZN'), ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'), ('OA', 'OA'),
            ('OA', 'SA'), ('OA', 'ZN'), ('S', 'ZN'), ('SA', 'ZN'), ('A', 'BR'),
            ('A', 'I'), ('A', 'P'), ('A', 'S'), ('BR', 'N'), ('BR', 'SA'),
            ('C', 'FE'), ('C', 'I'), ('C', 'P'), ('C', 'S'), ('CL', 'MN'),
            ('CL', 'NA'), ('CL', 'P'), ('CL', 'S'), ('CL', 'SA'), ('CU', 'HD'),
            ('CU', 'N'), ('FE', 'NA'), ('FE', 'SA'), ('I', 'N'), ('I', 'OA'),
            ('MG', 'N'), ('MG', 'P'), ('MG', 'S'), ('MG', 'SA'), ('MN', 'NA'),
            ('MN', 'P'), ('MN', 'S'), ('MN', 'SA'), ('N', 'P'), ('N', 'S'),
            ('NA', 'P'), ('NA', 'S'), ('OA', 'P'), ('OA', 'S'), ('P', 'S'),
            ('P', 'SA'), ('P', 'ZN'), ('S', 'SA'), ('SA', 'SA'),
        )
        self.titles.extend(
            'ele_%s.%s_4' % (rec_t, lig_t) for rec_t, lig_t in self.ele_types)

        # Ligand atom types, one 'lig_' title each.
        self.ligand_atom_types = [
            'A', 'BR', 'C', 'CL', 'F', 'HD', 'I',
            'N', 'NA', 'OA', 'P', 'S', 'SA',
        ]
        self.titles.extend(
            'lig_%s' % atom_type for atom_type in self.ligand_atom_types)

        # Close contacts, cutoff=2.5.
        contact_pairs_25 = [
            ('A', 'A'), ('A', 'C'), ('A', 'CL'), ('A', 'F'), ('A', 'FE'),
            ('A', 'HD'), ('A', 'MG'), ('A', 'MN'), ('A', 'N'), ('A', 'NA'),
            ('A', 'OA'), ('A', 'SA'), ('A', 'ZN'), ('BR', 'C'), ('BR', 'HD'),
            ('BR', 'OA'), ('C', 'C'), ('C', 'CL'), ('C', 'F'), ('C', 'HD'),
            ('C', 'MG'), ('C', 'MN'), ('C', 'N'), ('C', 'NA'), ('C', 'OA'),
            ('C', 'SA'), ('C', 'ZN'), ('CD', 'OA'), ('CL', 'FE'), ('CL', 'HD'),
            ('CL', 'MG'), ('CL', 'N'), ('CL', 'OA'), ('CL', 'ZN'), ('F', 'HD'),
            ('F', 'N'), ('F', 'OA'), ('F', 'SA'), ('F', 'ZN'), ('FE', 'HD'),
            ('FE', 'N'), ('FE', 'OA'), ('HD', 'HD'), ('HD', 'I'), ('HD', 'MG'),
            ('HD', 'MN'), ('HD', 'N'), ('HD', 'NA'), ('HD', 'OA'), ('HD', 'P'),
            ('HD', 'S'), ('HD', 'SA'), ('HD', 'ZN'), ('MG', 'NA'), ('MG', 'OA'),
            ('MN', 'N'), ('MN', 'OA'), ('N', 'N'), ('N', 'NA'), ('N', 'OA'),
            ('N', 'SA'), ('N', 'ZN'), ('NA', 'OA'), ('NA', 'SA'), ('NA', 'ZN'),
            ('OA', 'OA'), ('OA', 'SA'), ('OA', 'ZN'), ('S', 'ZN'), ('SA', 'ZN'),
        ]
        receptor_types_25, ligand_types_25 = zip(*contact_pairs_25)
        self.cc_25 = close_contacts_descriptor(
            protein,
            cutoff=2.5,
            protein_types=receptor_types_25,
            ligand_types=ligand_types_25,
            mode='atom_types_ad4',
            aligned_pairs=True)
        self.titles.extend(
            'cc_%s.%s_2.5' % (rec_t, lig_t)
            for rec_t, lig_t in contact_pairs_25)

        # Titles for the remaining interaction descriptors, appended in order.
        self.titles.extend([
            # H-Bonds (<4A)
            'hb_4_mol_backbone_alpha', 'hb_4_mol_backbone_beta',
            'hb_4_mol_backbone_other',
            'hb_4_mol_sidechain_alpha', 'hb_4_mol_sidechain_beta',
            'hb_4_mol_sidechain_other',
            'hb_4_rec_backbone_alpha', 'hb_4_rec_backbone_beta',
            'hb_4_rec_backbone_other',
            'hb_4_rec_sidechain_alpha', 'hb_4_rec_sidechain_beta',
            'hb_4_rec_sidechain_other',
            # Hydrophobic Contact <4A
            'hyd_4_backbone_alpha', 'hyd_4_backbone_beta',
            'hyd_4_backbone_other',
            'hyd_4_sidechain_alpha', 'hyd_4_sidechain_beta',
            'hyd_4_sidechain_other',
            'hyd_4_all',
            # Pi-stacking (<7.5A)
            'pi_stack_7.5_alpha', 'pi_stack_7.5_beta', 'pi_stack_7.5_other',
            # T-shaped Pi-Pi interaction
            'pi_t_7.5_alpha', 'pi_t_7.5_beta', 'pi_t_7.5_other',
            # Pi-cation (<6A)
            'pi_cat_mol_6_alpha', 'pi_cat_mol_6_beta', 'pi_cat_mol_6_other',
            'pi_cat_rec_6_alpha', 'pi_cat_rec_6_beta', 'pi_cat_rec_6_other',
            # Active site flexibility (<4A)
            'as_flex_backbone_alpha', 'as_flex_backbone_beta',
            'as_flex_backbone_other',
            'as_flex_sidechain_alpha', 'as_flex_sidechain_beta',
            'as_flex_sidechain_other',
            'as_flex_all',
            # Salt bridges (<5.5)
            'salt_bridge_5.5_alpha', 'salt_bridge_5.5_beta',
            'salt_bridge_5.5_other', 'salt_bridge_5.5_all',
            # Rotatable bonds
            'num_rotors',
        ])

        # The number of titles must match the length reported by the object.
        assert len(self.titles) == len(self)