# Example 1 (score: 0)
    def gen_training_data(self,
                          pdbbind_dir,
                          pdbbind_versions=(2007, 2012, 2013, 2014, 2015, 2016),
                          home_dir=None):
        """Generate and save RFScore descriptors for PDBBind complexes.

        Builds a metadata table of core/refined/general set membership
        across PDBBind versions, then computes descriptors for every
        core + refined entry of the newest version and writes them to
        ``<home_dir>/rfscore_descs_v<version>.csv``.

        Parameters
        ----------
        pdbbind_dir : str
            Directory containing PDBBind versions as ``v<year>`` subdirs.
        pdbbind_versions : tuple of int
            PDBBind release years to scan for set membership.
        home_dir : str or None
            Output directory; defaults to ``<module dir>/RFScore``.
        """
        pdbbind_versions = sorted(pdbbind_versions)

        # generate metadata: outer-join per-version set membership on pdbid
        df = None
        for pdbbind_version in pdbbind_versions:
            p = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_version),
                        version=pdbbind_version,
                        opt={'b': None})
            # v2007 has no protein-ligand split, hence 'general' not 'general_PL'
            general_name = 'general_PL' if pdbbind_version > 2007 else 'general'
            for set_name, set_key in (('core', 'core'),
                                      ('refined', 'refined'),
                                      (general_name, 'general')):
                tmp_df = pd.DataFrame(
                    {'pdbid': list(p.sets[set_name].keys()),
                     '%i_%s' % (pdbbind_version, set_key):
                         list(p.sets[set_name].values())})
                df = (pd.merge(tmp_df, df, how='outer', on='pdbid')
                      if df is not None else tmp_df)

        df.sort_values('pdbid', inplace=True)
        # keep the newest general-set activities before casting to booleans
        tmp_act = df['%i_general' % pdbbind_versions[-1]].values
        df = df.set_index('pdbid').notnull()
        df['act'] = tmp_act
        # take non-empty entries belonging to any core or refined set;
        # the original pattern '.*_[refined,core]' was a character class
        # by mistake — alternation is what was intended
        df = df[df['act'].notnull() &
                df.filter(regex='_(refined|core)$').any(axis=1)]

        # build descriptors with the newest PDBBind version
        pdbbind_db = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_versions[-1]),
                             version=pdbbind_versions[-1])
        if not home_dir:
            home_dir = dirname(__file__) + '/RFScore'

        result = Parallel(n_jobs=self.n_jobs, verbose=1)(
            delayed(_parallel_helper)(self.descriptor_generator,
                                      'build',
                                      [pdbbind_db[pid].ligand],
                                      protein=pdbbind_db[pid].pocket)
            for pid in df.index.values if pdbbind_db[pid].pocket is not None)
        descs = np.vstack(result)
        # NOTE(review): descs only covers pids with a pocket, while df keeps
        # all filtered rows — lengths may mismatch if any pocket is missing;
        # verify against the dataset used.
        for i in range(len(self.descriptor_generator)):
            df[str(i)] = descs[:, i]
        df.to_csv(home_dir + '/rfscore_descs_v%i.csv' % self.version,
                  float_format='%.5g')
# Example 2 (score: 0)
    def gen_training_data(self,
                          pdbbind_dir,
                          pdbbind_version='2007',
                          home_dir=None,
                          sf_pickle=''):
        """Build RFScore train/test descriptor arrays from PDBBind.

        The core set is the test set; the general set minus the core set
        forms the training set. Descriptors and targets are stored on
        ``self`` and written as CSV files under ``home_dir``.

        Parameters
        ----------
        pdbbind_dir : str
            Root directory of the PDBBind database.
        pdbbind_version : str or int
            PDBBind release year (coerced with ``int``).
        home_dir : str or None
            Output directory; defaults to ``<module dir>/RFScore``.
        sf_pickle : str
            Unused here; kept for interface compatibility.
        """
        # -1 lets joblib use all available CPUs
        cpus = self.n_jobs if self.n_jobs > 0 else -1
        pdbbind_db = pdbbind(pdbbind_dir,
                             int(pdbbind_version),
                             opt={'b': None})
        if not home_dir:
            home_dir = dirname(__file__) + '/RFScore'

        # test set: PDBBind core set
        pdbbind_db.default_set = 'core'
        core_set = pdbbind_db.ids
        core_act = np.array(pdbbind_db.activities)
        result = Parallel(n_jobs=cpus)(
            delayed(_parallel_helper)(self.descriptor_generator,
                                      'build', [pid.ligand],
                                      protein=pid.pocket) for pid in pdbbind_db
            if pid.pocket)
        core_desc = np.vstack(result)

        # train set: general set entries that are not in the core set
        pdbbind_db.default_set = 'general'
        refined_set = [pid for pid in pdbbind_db.ids if pid not in core_set]
        refined_act = np.array([
            pdbbind_db.sets[pdbbind_db.default_set][pid] for pid in refined_set
        ])
        # NOTE(review): refined_act covers every non-core id, while the
        # descriptors below skip entries without a pocket — rows can
        # misalign if any pocket is missing; verify against the dataset.
        result = Parallel(n_jobs=cpus)(
            delayed(_parallel_helper)(self.descriptor_generator,
                                      'build', [pid.ligand],
                                      protein=pid.pocket) for pid in pdbbind_db
            if pid.pocket and pid.id not in core_set)
        refined_desc = np.vstack(result)

        self.train_descs = refined_desc
        self.train_target = refined_act
        self.test_descs = core_desc
        self.test_target = core_act

        # save numpy arrays
        np.savetxt(home_dir + '/train_descs_v%i.csv' % (self.version),
                   self.train_descs,
                   fmt='%g',
                   delimiter=',')
        np.savetxt(home_dir + '/train_target.csv',
                   self.train_target,
                   fmt='%.2f',
                   delimiter=',')
        np.savetxt(home_dir + '/test_descs_v%i.csv' % (self.version),
                   self.test_descs,
                   fmt='%g',
                   delimiter=',')
        np.savetxt(home_dir + '/test_target.csv',
                   self.test_target,
                   fmt='%.2f',
                   delimiter=',')
# Example 3 (score: 0)
    def gen_training_data(self, pdbbind_dir, pdbbind_version='2007',
                          home_dir=None, sf_pickle=''):
        """Generate training (general minus core) and test (core) data.

        Descriptors are built in parallel for each complex ligand/pocket
        pair, stored on ``self`` and saved as CSV files under ``home_dir``.

        Parameters
        ----------
        pdbbind_dir : str
            Root directory of the PDBBind database.
        pdbbind_version : str or int
            PDBBind release year (coerced with ``int``).
        home_dir : str or None
            Output directory; defaults to ``<module dir>/RFScore``.
        sf_pickle : str
            Unused here; kept for interface compatibility.
        """
        # -1 means "use all CPUs" for joblib
        cpus = self.n_jobs if self.n_jobs > 0 else -1
        pdbbind_db = pdbbind(pdbbind_dir, int(pdbbind_version),
                             opt={'b': None})
        if not home_dir:
            home_dir = dirname(__file__) + '/RFScore'

        # test set: PDBBind core set
        pdbbind_db.default_set = 'core'
        core_set = pdbbind_db.ids
        core_act = np.array(pdbbind_db.activities)
        result = Parallel(n_jobs=cpus)(
            delayed(_parallel_helper)(self.descriptor_generator, 'build',
                                      [pid.ligand], protein=pid.pocket)
            for pid in pdbbind_db if pid.pocket)
        core_desc = np.vstack(result)

        # train set: general set without core entries
        pdbbind_db.default_set = 'general'
        refined_set = [pid for pid in pdbbind_db.ids if pid not in core_set]
        refined_act = np.array([pdbbind_db.sets[pdbbind_db.default_set][pid]
                                for pid in refined_set])
        # NOTE(review): refined_act covers all non-core ids while the
        # descriptors skip entries without a pocket — rows can misalign
        # if any pocket is missing; verify against the dataset.
        result = Parallel(n_jobs=cpus)(
            delayed(_parallel_helper)(self.descriptor_generator, 'build',
                                      [pid.ligand], protein=pid.pocket)
            for pid in pdbbind_db if pid.pocket and pid.id not in core_set)
        refined_desc = np.vstack(result)

        self.train_descs = refined_desc
        self.train_target = refined_act
        self.test_descs = core_desc
        self.test_target = core_act

        # save numpy arrays
        np.savetxt(home_dir + '/train_descs_v%i.csv' % self.version,
                   self.train_descs, fmt='%g', delimiter=',')
        np.savetxt(home_dir + '/train_target.csv',
                   self.train_target, fmt='%.2f', delimiter=',')
        np.savetxt(home_dir + '/test_descs_v%i.csv' % self.version,
                   self.test_descs, fmt='%g', delimiter=',')
        np.savetxt(home_dir + '/test_target.csv',
                   self.test_target, fmt='%.2f', delimiter=',')
# Example 4 (score: 0)
    def _gen_pdbbind_desc(self,
                          pdbbind_dir,
                          pdbbind_versions=(2007, 2012, 2013, 2014, 2015,
                                            2016),
                          desc_path=None,
                          **kwargs):
        """Generate descriptors for PDBBind complexes and save them as CSV.

        Set membership across all requested PDBBind versions is merged
        into one table; descriptors are then built in parallel for every
        core/refined entry of the newest version that has a pocket.

        Parameters
        ----------
        pdbbind_dir : str
            Directory containing PDBBind versions as ``v<year>`` subdirs.
        pdbbind_versions : tuple of int
            PDBBind release years to scan for set membership.
        desc_path : str or None
            Output CSV path; defaults to ``descs.csv`` next to this module.
        **kwargs
            ``opt`` (dict) is forwarded to the pdbbind reader.
        """
        pdbbind_versions = sorted(pdbbind_versions)
        # pop() with a default replaces the original if/else dance
        opt = kwargs.pop('opt', {})

        # generate metadata: outer-join per-version set membership on pdbid
        df = None
        for pdbbind_version in pdbbind_versions:
            p = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_version),
                        version=pdbbind_version,
                        opt=opt)
            for set_name in p.pdbind_sets:
                # normalize 'general_PL' (post-2007 naming) to 'general'
                if set_name == 'general_PL':
                    dataset_key = '%i_general' % pdbbind_version
                else:
                    dataset_key = '%i_%s' % (pdbbind_version, set_name)

                tmp_df = pd.DataFrame({
                    'pdbid': list(p.sets[set_name].keys()),
                    dataset_key: list(p.sets[set_name].values())
                })
                if df is not None:
                    df = pd.merge(tmp_df, df, how='outer', on='pdbid')
                else:
                    df = tmp_df

        df.sort_values('pdbid', inplace=True)
        # keep the newest general-set activities before casting to booleans
        tmp_act = df['%i_general' % pdbbind_versions[-1]].values
        df = df.set_index('pdbid').notnull()
        df['act'] = tmp_act
        # take non-empty entries belonging to any core or refined set;
        # the original pattern '[refined,core]' was a character class by
        # mistake — alternation is what was intended
        df = df[df['act'].notnull()
                & df.filter(regex='_(refined|core)$').any(axis=1)]

        # build descriptors with the newest PDBBind version
        pdbbind_db = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_versions[-1]),
                             version=pdbbind_versions[-1])
        if not desc_path:
            # BUG FIX: path components must be separate path_join arguments;
            # the original concatenated them without a separator
            desc_path = path_join(dirname(__file__), 'descs.csv')

        # joblib convention: -1 means "use all CPUs"
        n_jobs = -1 if self.n_jobs is None else self.n_jobs
        result = Parallel(n_jobs=n_jobs, verbose=1)(
            delayed(method_caller)(self.descriptor_generator,
                                   'build', [pdbbind_db[pid].ligand],
                                   protein=pdbbind_db[pid].pocket)
            for pid in df.index.values if pdbbind_db[pid].pocket is not None)
        descs = np.vstack(result)
        # NOTE(review): descs only covers pids with a pocket while df keeps
        # all filtered rows — lengths can mismatch if any pocket is missing
        for i in range(len(self.descriptor_generator)):
            df[str(i)] = descs[:, i]
        df.to_csv(desc_path, float_format='%.5g')
# Example 5 (score: 0)
    def gen_training_data(self,
                          pdbbind_dir,
                          pdbbind_version=2007,
                          home_dir=None,
                          sf_pickle=''):
        """Generate train (refined minus core) and test (core) data.

        Descriptors are computed in parallel for each ligand/pocket pair,
        stored on ``self`` and written as CSV files (with a header noting
        the PDBBind version) under ``home_dir``.

        Parameters
        ----------
        pdbbind_dir : str
            Root directory of the PDBBind database.
        pdbbind_version : int
            PDBBind release year.
        home_dir : str or None
            Output directory; defaults to ``<module dir>/RFScore``.
        sf_pickle : str
            Unused here; kept for interface compatibility.
        """
        # build train and test
        pdbbind_db = pdbbind(pdbbind_dir, pdbbind_version, opt={'b': None})
        if not home_dir:
            home_dir = dirname(__file__) + '/RFScore'

        # test set: PDBBind core set
        pdbbind_db.default_set = 'core'
        core_set = pdbbind_db.ids
        core_act = np.array(pdbbind_db.activities)
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_parallel_helper)(self.descriptor_generator,
                                      'build', [pid.ligand],
                                      protein=pid.pocket) for pid in pdbbind_db
            if pid.pocket is not None)
        core_desc = np.vstack(result)

        # train set: refined set without core entries
        pdbbind_db.default_set = 'refined'
        refined_set = [pid for pid in pdbbind_db.ids if pid not in core_set]
        refined_act = np.array([
            pdbbind_db.sets[pdbbind_db.default_set][pid] for pid in refined_set
        ])
        # NOTE(review): refined_act covers every non-core id while the
        # descriptors skip entries lacking a pocket — rows can misalign
        # if any pocket is missing; verify against the dataset.
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_parallel_helper)(self.descriptor_generator,
                                      'build', [pid.ligand],
                                      protein=pid.pocket) for pid in pdbbind_db
            if pid.pocket is not None and pid.id not in core_set)
        refined_desc = np.vstack(result)

        self.train_descs = refined_desc
        self.train_target = refined_act
        self.test_descs = core_desc
        self.test_target = core_act

        # save numpy arrays with a provenance header
        header = 'RFScore data generated using PDBBind v%i' % pdbbind_version
        np.savetxt(home_dir + '/train_descs_v%i_pdbbind%i.csv' %
                   (self.version, pdbbind_version),
                   self.train_descs,
                   fmt='%g',
                   delimiter=',',
                   header=header)
        np.savetxt(home_dir + '/train_target_pdbbind%i.csv' % pdbbind_version,
                   self.train_target,
                   fmt='%.2f',
                   delimiter=',',
                   header=header)
        np.savetxt(home_dir + '/test_descs_v%i_pdbbind%i.csv' %
                   (self.version, pdbbind_version),
                   self.test_descs,
                   fmt='%g',
                   delimiter=',',
                   header=header)
        np.savetxt(home_dir + '/test_target_pdbbind%i.csv' % pdbbind_version,
                   self.test_target,
                   fmt='%.2f',
                   delimiter=',',
                   header=header)
# Example 6 (score: 0)
def test_pdbbind():
    """Smoke-test the pdbbind dataset wrapper across several versions."""
    expected = {
        'core': (['4yef', '10gs'], [5.35, 6.4]),
        'refined': (['1nlp', '1imx', '4yef', '10gs'], [4.96, 3.52, 5.35, 6.4]),
        'general_PL': (['1k9q', '1nlo', '1nlp', '1imx', '4yef', '10gs'],
                       [3.15, 5.47, 4.96, 3.52, 5.35, 6.4]),
    }

    home = os.path.join(test_data_dir, 'data', 'pdbbind')

    # a version argument is mandatory
    assert_raises(ValueError, pdbbind, home=home)

    for year in [2007, 2013, 2016]:
        db = pdbbind(home=home, version=year, default_set='core')

        for set_name, (ids, activities) in expected.items():
            # the 2007 release names its general set plainly 'general'
            if year == 2007 and set_name == 'general_PL':
                set_name = 'general'
            db.default_set = set_name
            assert_equal(db.ids, ids)
            assert_equal(db.activities, activities)

            for entry in db:
                assert_is_instance(entry.pocket, oddt.toolkit.Molecule)
                assert_greater(len(entry.pocket.atoms), 0)
                assert_is_instance(entry.ligand, oddt.toolkit.Molecule)
                assert_greater(len(entry.ligand.atoms), 0)
                if entry.id == '10gs':
                    # 10gs intentionally has no protein structure
                    assert_equal(entry.protein, None)
                else:
                    assert_is_instance(entry.protein, oddt.toolkit.Molecule)
                    assert_greater(len(entry.protein.atoms), 0)

        # reset the pdbbind set
        db.default_set = 'refined'

        # lookup by pdbid and by positional index resolve to the same entry
        assert_equal(db['1imx'].id, '1imx')
        assert_equal(db[-3].id, '1imx')
        assert_equal(db[1].id, '1imx')

        for bad_key in ('xxxx', 123456, -123456):
            assert_raises(KeyError, db.__getitem__, bad_key)

        entry = db['1imx']
        # heavy-atom counts after stripping hydrogens
        for mol, expected_atoms in ((entry.ligand, 60),
                                    (entry.pocket, 234),
                                    (entry.protein, 478)):
            mol.removeh()
            assert_equal(len(mol.atoms), expected_atoms)
# Example 7 (score: 0)
def test_pdbbind():
    """Exercise the pdbbind dataset wrapper for several release years."""
    expected = {
        'core': (['4yef', '10gs'], [5.35, 6.4]),
        'refined': (['1nlp', '1imx', '4yef', '10gs'], [4.96, 3.52, 5.35, 6.4]),
        'general_PL': (['1k9q', '1nlo', '1nlp', '1imx', '4yef', '10gs'],
                       [3.15, 5.47, 4.96, 3.52, 5.35, 6.4]),
    }

    home = os.path.join(test_data_dir, 'data', 'pdbbind')

    # a version argument is required
    with pytest.raises(ValueError):
        pdbbind(home=home)

    for year in [2007, 2013, 2016]:
        db = pdbbind(home=home, version=year, default_set='core')

        for set_name, (ids, activities) in expected.items():
            # in the 2007 release the general set is plainly called 'general'
            if year == 2007 and set_name == 'general_PL':
                set_name = 'general'
            db.default_set = set_name
            assert db.ids == ids
            assert db.activities == activities

            for entry in db:
                assert isinstance(entry.pocket, oddt.toolkit.Molecule)
                assert len(entry.pocket.atoms) > 0
                assert isinstance(entry.ligand, oddt.toolkit.Molecule)
                assert len(entry.ligand.atoms) > 0
                if entry.id == '10gs':
                    # 10gs intentionally lacks a protein structure
                    assert entry.protein is None
                else:
                    assert isinstance(entry.protein, oddt.toolkit.Molecule)
                    assert len(entry.protein.atoms) > 0

        # reset the pdbbind set
        db.default_set = 'refined'

        # lookup by pdbid and by positional index resolve to the same entry
        assert db['1imx'].id == '1imx'
        assert db[-3].id == '1imx'
        assert db[1].id == '1imx'

        for bad_key in ('xxxx', 123456, -123456):
            with pytest.raises(KeyError):
                db[bad_key]

        entry = db['1imx']
        # heavy-atom counts after stripping hydrogens
        for mol, n_atoms in ((entry.ligand, 60),
                             (entry.pocket, 234),
                             (entry.protein, 478)):
            mol.removeh()
            assert len(mol.atoms) == n_atoms
import htmd.smallmol.smallmol as sm
import csv
from tqdm import *
import os
import pickle
import numpy as np
import multiprocessing as mp
from sklearn.model_selection import train_test_split
import h5py
from oddt import toolkit
from oddt import datasets

# Directory paths
# NOTE(review): data_dir is relative to the working directory, not to this
# file — presumably the script is run from its own folder; confirm.
data_dir = "../dataset"
pdbbind_dir = os.path.join(data_dir, "refined-set-2016/")
# PDBBind v2016 wrapper (oddt.datasets.pdbbind), defaulting to the refined set
pdbbind_dataset = datasets.pdbbind(home=pdbbind_dir, default_set='refined', version=2016)


def get_pdb_complex_feature(protein_file, ligand_file):
    """ Returns voxel features for a pdb complex """
    # NOTE(review): this function is truncated in this chunk — the code
    # after the grid-center comment is not visible here.

    def get_prop(mol, left_most_point):
        """ Returns atom occupancies """
        n = [24, 24, 24] # Voxel size
        
        # Get the channels
        # NOTE(review): `vd` is not defined in this chunk — presumably
        # htmd's voxeldescriptors module; confirm the import elsewhere.
        channels = vd._getAtomtypePropertiesPDBQT(mol)
        sigmas = vd._getRadii(mol)
        # scale per-atom channel flags by the atom radii
        channels = sigmas[:, np.newaxis] * channels.astype(float)
        
        # Choose the grid centers
# Example 9 (score: 0)
    def _gen_pdbbind_desc(self,
                          pdbbind_dir,
                          pdbbind_versions=(2007, 2012, 2013, 2014, 2015, 2016),
                          desc_path=None,
                          include_general_set=False,
                          use_proteins=False,
                          **kwargs):
        """Generate PDBBind descriptors and save them as (optionally gzipped) CSV.

        Merges core/refined/general set membership across the requested
        PDBBind versions, then builds descriptors in parallel for the
        selected complexes of the newest version.

        Parameters
        ----------
        pdbbind_dir : str
            Directory containing PDBBind versions as ``v<year>`` subdirs.
        pdbbind_versions : tuple of int
            PDBBind release years to scan for set membership.
        desc_path : str or None
            Output CSV path (a '.gz' suffix enables gzip compression);
            defaults to ``descs.csv`` next to this module.
        include_general_set : bool
            If True, keep general-set-only entries as well, not just
            core/refined ones.
        use_proteins : bool
            Use the full protein instead of the binding pocket.
        **kwargs
            ``opt`` (dict) is forwarded to the pdbbind reader.
        """
        pdbbind_versions = sorted(pdbbind_versions)
        opt = kwargs.get('opt', {})

        # generate metadata: outer-join per-version set membership on pdbid
        df = None
        for pdbbind_version in pdbbind_versions:
            p = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_version),
                        version=pdbbind_version,
                        opt=opt)
            for set_name in p.pdbind_sets:
                # normalize 'general_PL' (post-2007 naming) to 'general'
                if set_name == 'general_PL':
                    dataset_key = '%i_general' % pdbbind_version
                else:
                    dataset_key = '%i_%s' % (pdbbind_version, set_name)

                tmp_df = pd.DataFrame({
                    'pdbid': list(p.sets[set_name].keys()),
                    dataset_key: list(p.sets[set_name].values())
                })
                if df is not None:
                    df = pd.merge(tmp_df, df, how='outer', on='pdbid')
                else:
                    df = tmp_df

        df.sort_values('pdbid', inplace=True)
        # keep the newest general-set activities before casting to booleans
        tmp_act = df['%i_general' % pdbbind_versions[-1]].values
        df = df.set_index('pdbid').notnull()
        df['act'] = tmp_act
        # take non-empty entries in any core/refined set (or everything,
        # when include_general_set); the original pattern '[refined,core]'
        # was a character class by mistake — alternation is intended
        df = df[df['act'].notnull() &
                (df.filter(regex='_(refined|core)$').any(axis=1) |
                 include_general_set)]

        # build descriptors with the newest PDBBind version
        pdbbind_db = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_versions[-1]),
                             version=pdbbind_versions[-1])
        if not desc_path:
            # BUG FIX: path components must be separate path_join arguments;
            # the original concatenated them without a separator
            desc_path = path_join(dirname(__file__), 'descs.csv')

        # joblib convention: -1 means "use all CPUs"
        n_jobs = -1 if self.n_jobs is None else self.n_jobs

        blacklist = []
        if use_proteins:
            # list of protein files that segfault OB 2.4.1
            blacklist = pdbbind_db.protein_blacklist[oddt.toolkit.backend]

        # which structure field to read, hoisted out of the loops below
        structure_attr = 'protein' if use_proteins else 'pocket'

        # check if PDBID exists or is blacklisted
        desc_idx = [pid for pid in df.index.values
                    if (pid not in blacklist and
                        getattr(pdbbind_db[pid], structure_attr) is not None)]

        result = Parallel(n_jobs=n_jobs, verbose=1)(
            delayed(method_caller)(
                self.descriptor_generator,
                'build',
                [pdbbind_db[pid].ligand],
                protein=getattr(pdbbind_db[pid], structure_attr))
            for pid in desc_idx)

        # sparse descs may have different shapes, dense are stored np.array
        sparse = (hasattr(self.descriptor_generator, 'sparse') and
                  self.descriptor_generator.sparse)

        if not sparse:
            result = np.vstack(result)

        # create dataframe with descriptors with pdbids as index
        df_desc = pd.DataFrame(result, index=desc_idx)
        df_desc.index.rename('pdbid', inplace=True)

        # for sparse features leave one column and cast explicitly to list
        if sparse:
            if len(df_desc.columns) > 1:
                raise Exception('There are more than one column in the '
                                'sparse descriptor table.')
            df_desc.columns = ['sparse']
            df_desc['sparse'] = df_desc['sparse'].map(
                lambda x: csr_matrix_to_sparse(x).tolist())

        # endswith is clearer and safe for paths shorter than 3 chars
        compression = 'gzip' if desc_path.endswith('.gz') else None
        # DF are joined by index (pdbid) since some might be missing
        df.join(df_desc, how='inner').to_csv(desc_path,
                                             float_format='%.5g',
                                             compression=compression)
# Example 10 (score: 0)
def test_pdbbind():
    """Check pdbbind dataset access, set switching and item lookup."""
    home = os.path.join(test_data_dir, 'data', 'pdbbind')

    expected_sets = {
        'core': (['4yef', '10gs'],
                 [5.35, 6.4]),
        'refined': (['1nlp', '1imx', '4yef', '10gs'],
                    [4.96, 3.52, 5.35, 6.4]),
        'general_PL': (['1k9q', '1nlo', '1nlp', '1imx', '4yef', '10gs'],
                       [3.15, 5.47, 4.96, 3.52, 5.35, 6.4]),
    }

    # constructing without a version must fail
    with pytest.raises(ValueError):
        pdbbind(home=home)

    for year in [2007, 2013, 2016]:
        pdbbind_db = pdbbind(home=home, version=year, default_set='core')

        for set_name, (expected_ids, expected_acts) in expected_sets.items():
            if set_name == 'general_PL' and year == 2007:
                # the 2007 release names the general set plainly 'general'
                set_name = 'general'
            pdbbind_db.default_set = set_name
            assert pdbbind_db.ids == expected_ids
            assert pdbbind_db.activities == expected_acts

            for record in pdbbind_db:
                assert isinstance(record.pocket, oddt.toolkit.Molecule)
                assert len(record.pocket.atoms) > 0
                assert isinstance(record.ligand, oddt.toolkit.Molecule)
                assert len(record.ligand.atoms) > 0
                if record.id == '10gs':
                    # 10gs intentionally lacks a protein structure
                    assert record.protein is None
                else:
                    assert isinstance(record.protein, oddt.toolkit.Molecule)
                    assert len(record.protein.atoms) > 0

        # reset the pdbbind set
        pdbbind_db.default_set = 'refined'

        # lookup by pdbid and by positional index resolve to the same entry
        assert pdbbind_db['1imx'].id == '1imx'
        assert pdbbind_db[-3].id == '1imx'
        assert pdbbind_db[1].id == '1imx'

        for missing in ('xxxx', 123456, -123456):
            with pytest.raises(KeyError):
                pdbbind_db[missing]

        record = pdbbind_db['1imx']
        # heavy-atom counts after stripping hydrogens
        for mol, n_heavy in ((record.ligand, 60),
                             (record.pocket, 234),
                             (record.protein, 478)):
            mol.removeh()
            assert len(mol.atoms) == n_heavy