def gen_training_data(self, pdbbind_dir,
                      pdbbind_versions=(2007, 2012, 2013, 2014, 2015, 2016),
                      home_dir=None):
    """Generate RFScore descriptors for PDBBind complexes and dump them to CSV.

    Merges the core/refined/general activity tables of every requested
    PDBBind release into one frame keyed by PDB ID, keeps entries present
    in any core or refined set, then builds descriptors against the most
    recent release.

    Parameters
    ----------
    pdbbind_dir : str
        Root directory containing `v<year>/` PDBBind trees.
    pdbbind_versions : tuple of int
        PDBBind release years to merge metadata from.
    home_dir : str or None
        Output directory; defaults to `<this file's dir>/RFScore`.
    """
    pdbbind_versions = sorted(pdbbind_versions)

    # generate metadata: one column per (version, set) merged on pdbid
    df = None
    for pdbbind_version in pdbbind_versions:
        p = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_version),
                    version=pdbbind_version, opt={'b': None})
        # Core set
        tmp_df = pd.DataFrame({
            'pdbid': list(p.sets['core'].keys()),
            '%i_core' % pdbbind_version: list(p.sets['core'].values())})
        df = tmp_df if df is None else pd.merge(tmp_df, df, how='outer', on='pdbid')
        # Refined set
        tmp_df = pd.DataFrame({
            'pdbid': list(p.sets['refined'].keys()),
            '%i_refined' % pdbbind_version: list(p.sets['refined'].values())})
        df = pd.merge(tmp_df, df, how='outer', on='pdbid')
        # General set (renamed 'general_PL' after the 2007 release)
        general_name = 'general_PL' if pdbbind_version > 2007 else 'general'
        tmp_df = pd.DataFrame({
            'pdbid': list(p.sets[general_name].keys()),
            '%i_general' % pdbbind_version: list(p.sets[general_name].values())})
        df = pd.merge(tmp_df, df, how='outer', on='pdbid')

    df.sort_values('pdbid', inplace=True)
    # keep real activities from the newest general set; all other columns
    # become boolean membership flags
    tmp_act = df['%i_general' % pdbbind_versions[-1]].values
    df = df.set_index('pdbid').notnull()
    df['act'] = tmp_act

    # take non-empty entries belonging to any core or refined set
    # FIX: '[refined,core]' was a character class matching one letter;
    # use a proper alternation instead
    df = df[df['act'].notnull() &
            df.filter(regex='.*_(refined|core)$').any(axis=1)]

    # build descriptors against the newest release
    pdbbind_db = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_versions[-1]),
                         version=pdbbind_versions[-1])
    if not home_dir:
        home_dir = dirname(__file__) + '/RFScore'

    # FIX: restrict df to complexes whose pocket is available BEFORE
    # assigning descriptor columns, otherwise `descs` has fewer rows than
    # df whenever a pocket file is missing
    desc_idx = [pid for pid in df.index.values
                if pdbbind_db[pid].pocket is not None]
    df = df.loc[desc_idx]
    result = Parallel(n_jobs=self.n_jobs, verbose=1)(
        delayed(_parallel_helper)(self.descriptor_generator, 'build',
                                  [pdbbind_db[pid].ligand],
                                  protein=pdbbind_db[pid].pocket)
        for pid in desc_idx)
    descs = np.vstack(result)
    for i in range(len(self.descriptor_generator)):
        df[str(i)] = descs[:, i]
    df.to_csv(home_dir + '/rfscore_descs_v%i.csv' % self.version,
              float_format='%.5g')
def gen_training_data(self, pdbbind_dir, pdbbind_version='2007',
                      home_dir=None, sf_pickle=''):
    """Build train (general minus core) and test (core) sets and save CSVs.

    Parameters
    ----------
    pdbbind_dir : str
        Directory of a single PDBBind release.
    pdbbind_version : str or int
        Release year; coerced to int.
    home_dir : str or None
        Output directory; defaults to `<this file's dir>/RFScore`.
    sf_pickle : str
        Unused here; kept for interface compatibility with callers.
    """
    # build train and test sets
    cpus = self.n_jobs if self.n_jobs > 0 else -1
    pdbbind_db = pdbbind(pdbbind_dir, int(pdbbind_version), opt={'b': None})
    if not home_dir:
        home_dir = dirname(__file__) + '/RFScore'

    # test set: the PDBBind core set
    pdbbind_db.default_set = 'core'
    core_set = pdbbind_db.ids
    # FIX: previously activities covered ALL ids while descriptors were
    # filtered by pocket availability, so the two arrays could be
    # row-misaligned; filter both from the same entry list
    core_entries = [pid for pid in pdbbind_db if pid.pocket]
    core_act = np.array([pdbbind_db.sets['core'][pid.id]
                         for pid in core_entries])
    result = Parallel(n_jobs=cpus)(
        delayed(_parallel_helper)(self.descriptor_generator, 'build',
                                  [pid.ligand], protein=pid.pocket)
        for pid in core_entries)
    core_desc = np.vstack(result)

    # train set: general set minus the core set
    pdbbind_db.default_set = 'general'
    refined_entries = [pid for pid in pdbbind_db
                       if pid.pocket and pid.id not in core_set]
    refined_act = np.array([pdbbind_db.sets[pdbbind_db.default_set][pid.id]
                            for pid in refined_entries])
    result = Parallel(n_jobs=cpus)(
        delayed(_parallel_helper)(self.descriptor_generator, 'build',
                                  [pid.ligand], protein=pid.pocket)
        for pid in refined_entries)
    refined_desc = np.vstack(result)

    self.train_descs = refined_desc
    self.train_target = refined_act
    self.test_descs = core_desc
    self.test_target = core_act

    # save numpy arrays
    np.savetxt(home_dir + '/train_descs_v%i.csv' % (self.version),
               self.train_descs, fmt='%g', delimiter=',')
    np.savetxt(home_dir + '/train_target.csv', self.train_target,
               fmt='%.2f', delimiter=',')
    np.savetxt(home_dir + '/test_descs_v%i.csv' % (self.version),
               self.test_descs, fmt='%g', delimiter=',')
    np.savetxt(home_dir + '/test_target.csv', self.test_target,
               fmt='%.2f', delimiter=',')
def gen_training_data(self, pdbbind_dir, pdbbind_version='2007',
                      home_dir=None, sf_pickle=''):
    """Build and persist train/test descriptor and target arrays.

    Test set is the PDBBind core set; train set is the general set with
    core entries removed. Arrays are written as CSVs under ``home_dir``.

    Parameters
    ----------
    pdbbind_dir : str
        Directory of a single PDBBind release.
    pdbbind_version : str or int
        Release year; coerced to int.
    home_dir : str or None
        Output directory; defaults to `<this file's dir>/RFScore`.
    sf_pickle : str
        Unused here; kept for interface compatibility.
    """
    n_jobs = self.n_jobs if self.n_jobs > 0 else -1
    pdbbind_db = pdbbind(pdbbind_dir, int(pdbbind_version), opt={'b': None})
    if not home_dir:
        home_dir = dirname(__file__) + '/RFScore'

    # --- test set (core) ---
    pdbbind_db.default_set = 'core'
    core_set = pdbbind_db.ids
    # FIX: keep targets and descriptors row-aligned by selecting the same
    # pocket-bearing complexes for both (previously activities spanned all
    # ids while descriptors skipped missing pockets)
    usable_core = [pid for pid in pdbbind_db if pid.pocket]
    core_act = np.array([pdbbind_db.sets['core'][pid.id]
                         for pid in usable_core])
    core_desc = np.vstack(Parallel(n_jobs=n_jobs)(
        delayed(_parallel_helper)(self.descriptor_generator, 'build',
                                  [pid.ligand], protein=pid.pocket)
        for pid in usable_core))

    # --- train set (general minus core) ---
    pdbbind_db.default_set = 'general'
    usable_train = [pid for pid in pdbbind_db
                    if pid.pocket and pid.id not in core_set]
    refined_act = np.array([pdbbind_db.sets[pdbbind_db.default_set][pid.id]
                            for pid in usable_train])
    refined_desc = np.vstack(Parallel(n_jobs=n_jobs)(
        delayed(_parallel_helper)(self.descriptor_generator, 'build',
                                  [pid.ligand], protein=pid.pocket)
        for pid in usable_train))

    self.train_descs = refined_desc
    self.train_target = refined_act
    self.test_descs = core_desc
    self.test_target = core_act

    # save numpy arrays
    np.savetxt(home_dir + '/train_descs_v%i.csv' % (self.version),
               self.train_descs, fmt='%g', delimiter=',')
    np.savetxt(home_dir + '/train_target.csv', self.train_target,
               fmt='%.2f', delimiter=',')
    np.savetxt(home_dir + '/test_descs_v%i.csv' % (self.version),
               self.test_descs, fmt='%g', delimiter=',')
    np.savetxt(home_dir + '/test_target.csv', self.test_target,
               fmt='%.2f', delimiter=',')
def _gen_pdbbind_desc(self, pdbbind_dir,
                      pdbbind_versions=(2007, 2012, 2013, 2014, 2015, 2016),
                      desc_path=None, **kwargs):
    """Generate descriptors for PDBBind complexes and write them to CSV.

    Builds one membership column per (release, set) merged on PDB ID,
    filters to non-empty core/refined entries and computes descriptors
    against the newest release.

    Parameters
    ----------
    pdbbind_dir : str
        Root directory containing `v<year>/` PDBBind trees.
    pdbbind_versions : tuple of int
        PDBBind release years to merge metadata from.
    desc_path : str or None
        Output CSV path; defaults to `descs.csv` next to this file.
    **kwargs
        May contain `opt`, forwarded to the pdbbind loader.
    """
    pdbbind_versions = sorted(pdbbind_versions)
    # pop so 'opt' is not forwarded twice if kwargs is reused downstream
    if 'opt' in kwargs:
        opt = kwargs.pop('opt')
    else:
        opt = {}

    # generate metadata
    df = None
    for pdbbind_version in pdbbind_versions:
        p = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_version),
                    version=pdbbind_version, opt=opt)
        for set_name in p.pdbind_sets:
            # 'general_PL' (post-2007 name) is normalized to 'general'
            if set_name == 'general_PL':
                dataset_key = '%i_general' % pdbbind_version
            else:
                dataset_key = '%i_%s' % (pdbbind_version, set_name)
            tmp_df = pd.DataFrame({
                'pdbid': list(p.sets[set_name].keys()),
                dataset_key: list(p.sets[set_name].values())})
            if df is not None:
                df = pd.merge(tmp_df, df, how='outer', on='pdbid')
            else:
                df = tmp_df

    df.sort_values('pdbid', inplace=True)
    # keep real activity values from the newest general set; the rest of
    # the frame becomes boolean membership flags
    tmp_act = df['%i_general' % pdbbind_versions[-1]].values
    df = df.set_index('pdbid').notnull()
    df['act'] = tmp_act

    # take non-empty and core + refined set
    # FIX: '[refined,core]' was a single-character class; use alternation
    df = df[df['act'].notnull() &
            df.filter(regex='.*_(refined|core)$').any(axis=1)]

    # build descriptors
    pdbbind_db = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_versions[-1]),
                         version=pdbbind_versions[-1])
    if not desc_path:
        # FIX: directory and filename must be separate path_join arguments
        # (previously they were concatenated without a separator)
        desc_path = path_join(dirname(__file__), 'descs.csv')

    n_jobs = -1 if self.n_jobs is None else self.n_jobs

    # FIX: restrict df to pocket-bearing complexes so descriptor rows line
    # up with df rows when assigning columns below
    desc_idx = [pid for pid in df.index.values
                if pdbbind_db[pid].pocket is not None]
    df = df.loc[desc_idx]
    result = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(method_caller)(self.descriptor_generator, 'build',
                               [pdbbind_db[pid].ligand],
                               protein=pdbbind_db[pid].pocket)
        for pid in desc_idx)
    descs = np.vstack(result)
    for i in range(len(self.descriptor_generator)):
        df[str(i)] = descs[:, i]
    df.to_csv(desc_path, float_format='%.5g')
def gen_training_data(self, pdbbind_dir, pdbbind_version=2007,
                      home_dir=None, sf_pickle=''):
    """Build train (refined minus core) / test (core) arrays and save CSVs.

    Output filenames embed both the scoring-function version and the
    PDBBind release year; every file carries a descriptive header line.

    Parameters
    ----------
    pdbbind_dir : str
        Directory of a single PDBBind release.
    pdbbind_version : int
        Release year.
    home_dir : str or None
        Output directory; defaults to `<this file's dir>/RFScore`.
    sf_pickle : str
        Unused here; kept for interface compatibility.
    """
    # build train and test sets
    pdbbind_db = pdbbind(pdbbind_dir, pdbbind_version, opt={'b': None})
    if not home_dir:
        home_dir = dirname(__file__) + '/RFScore'

    # test set: the core set
    pdbbind_db.default_set = 'core'
    core_set = pdbbind_db.ids
    # FIX: activities and descriptors are now both restricted to complexes
    # with an available pocket; previously activities spanned all ids and
    # could be row-misaligned with the descriptor matrix
    core_entries = [pid for pid in pdbbind_db if pid.pocket is not None]
    core_act = np.array([pdbbind_db.sets['core'][pid.id]
                         for pid in core_entries])
    result = Parallel(n_jobs=self.n_jobs)(
        delayed(_parallel_helper)(self.descriptor_generator, 'build',
                                  [pid.ligand], protein=pid.pocket)
        for pid in core_entries)
    core_desc = np.vstack(result)

    # train set: refined set minus the core set
    pdbbind_db.default_set = 'refined'
    refined_entries = [pid for pid in pdbbind_db
                       if pid.pocket is not None and pid.id not in core_set]
    refined_act = np.array([pdbbind_db.sets['refined'][pid.id]
                            for pid in refined_entries])
    result = Parallel(n_jobs=self.n_jobs)(
        delayed(_parallel_helper)(self.descriptor_generator, 'build',
                                  [pid.ligand], protein=pid.pocket)
        for pid in refined_entries)
    refined_desc = np.vstack(result)

    self.train_descs = refined_desc
    self.train_target = refined_act
    self.test_descs = core_desc
    self.test_target = core_act

    # save numpy arrays
    header = 'RFScore data generated using PDBBind v%i' % pdbbind_version
    np.savetxt(home_dir + '/train_descs_v%i_pdbbind%i.csv'
               % (self.version, pdbbind_version),
               self.train_descs, fmt='%g', delimiter=',', header=header)
    np.savetxt(home_dir + '/train_target_pdbbind%i.csv' % pdbbind_version,
               self.train_target, fmt='%.2f', delimiter=',', header=header)
    np.savetxt(home_dir + '/test_descs_v%i_pdbbind%i.csv'
               % (self.version, pdbbind_version),
               self.test_descs, fmt='%g', delimiter=',', header=header)
    np.savetxt(home_dir + '/test_target_pdbbind%i.csv' % pdbbind_version,
               self.test_target, fmt='%.2f', delimiter=',', header=header)
def test_pdbbind():
    """Exercise the PDBBind dataset wrapper across supported releases."""
    expected = {
        'core': (['4yef', '10gs'], [5.35, 6.4]),
        'refined': (['1nlp', '1imx', '4yef', '10gs'],
                    [4.96, 3.52, 5.35, 6.4]),
        'general_PL': (['1k9q', '1nlo', '1nlp', '1imx', '4yef', '10gs'],
                       [3.15, 5.47, 4.96, 3.52, 5.35, 6.4]),
    }

    home = os.path.join(test_data_dir, 'data', 'pdbbind')
    # constructing without a version must fail
    assert_raises(ValueError, pdbbind, home=home)

    for year in [2007, 2013, 2016]:
        pdbbind_db = pdbbind(home=home, version=year, default_set='core')
        for set_name, (ids, activities) in expected.items():
            # 2007 release uses 'general' instead of 'general_PL'
            if set_name == 'general_PL' and year == 2007:
                set_name = 'general'
            pdbbind_db.default_set = set_name
            assert_equal(pdbbind_db.ids, ids)
            assert_equal(pdbbind_db.activities, activities)
            for entry in pdbbind_db:
                assert_is_instance(entry.pocket, oddt.toolkit.Molecule)
                assert_greater(len(entry.pocket.atoms), 0)
                assert_is_instance(entry.ligand, oddt.toolkit.Molecule)
                assert_greater(len(entry.ligand.atoms), 0)
                if entry.id == '10gs':
                    assert_equal(entry.protein, None)
                else:
                    assert_is_instance(entry.protein, oddt.toolkit.Molecule)
                    assert_greater(len(entry.protein.atoms), 0)

    # reset the pdbbind set
    pdbbind_db.default_set = 'refined'

    # getting by name
    assert_equal(pdbbind_db['1imx'].id, '1imx')
    # getting by id
    assert_equal(pdbbind_db[-3].id, '1imx')
    assert_equal(pdbbind_db[1].id, '1imx')

    # invalid keys of every flavor raise KeyError
    for bad_key in ('xxxx', 123456, -123456):
        assert_raises(KeyError, pdbbind_db.__getitem__, bad_key)

    entry = pdbbind_db['1imx']

    # get ligand
    ligand = entry.ligand
    ligand.removeh()
    assert_equal(len(ligand.atoms), 60)

    # get pocket
    pocket = entry.pocket
    pocket.removeh()
    assert_equal(len(pocket.atoms), 234)

    # protein do exist
    protein = entry.protein
    protein.removeh()
    assert_equal(len(protein.atoms), 478)
def test_pdbbind():
    """Verify PDBBind set contents, iteration and item access per release."""
    expected_sets = {
        'core': (['4yef', '10gs'], [5.35, 6.4]),
        'refined': (['1nlp', '1imx', '4yef', '10gs'],
                    [4.96, 3.52, 5.35, 6.4]),
        'general_PL': (['1k9q', '1nlo', '1nlp', '1imx', '4yef', '10gs'],
                       [3.15, 5.47, 4.96, 3.52, 5.35, 6.4]),
    }

    home = os.path.join(test_data_dir, 'data', 'pdbbind')
    # a version argument is required
    with pytest.raises(ValueError):
        pdbbind(home=home)

    for year in (2007, 2013, 2016):
        db = pdbbind(home=home, version=year, default_set='core')
        for set_name, (ids, activities) in expected_sets.items():
            # the 2007 release names its general set plainly 'general'
            if set_name == 'general_PL' and year == 2007:
                set_name = 'general'
            db.default_set = set_name
            assert db.ids == ids
            assert db.activities == activities
            for entry in db:
                assert isinstance(entry.pocket, oddt.toolkit.Molecule)
                assert len(entry.pocket.atoms) > 0
                assert isinstance(entry.ligand, oddt.toolkit.Molecule)
                assert len(entry.ligand.atoms) > 0
                if entry.id == '10gs':
                    assert entry.protein is None
                else:
                    assert isinstance(entry.protein, oddt.toolkit.Molecule)
                    assert len(entry.protein.atoms) > 0

    # reset the pdbbind set
    db.default_set = 'refined'

    # lookup by name and by positional index hit the same complex
    assert db['1imx'].id == '1imx'
    assert db[-3].id == '1imx'
    assert db[1].id == '1imx'

    # unknown string / out-of-range integer keys all raise KeyError
    for bad_key in ('xxxx', 123456, -123456):
        with pytest.raises(KeyError):
            db[bad_key]

    complex_1imx = db['1imx']

    ligand = complex_1imx.ligand
    ligand.removeh()
    assert len(ligand.atoms) == 60

    pocket = complex_1imx.pocket
    pocket.removeh()
    assert len(pocket.atoms) == 234

    # protein do exist
    protein = complex_1imx.protein
    protein.removeh()
    assert len(protein.atoms) == 478
import htmd.smallmol.smallmol as sm import csv from tqdm import * import os import pickle import numpy as np import multiprocessing as mp from sklearn.model_selection import train_test_split import h5py from oddt import toolkit from oddt import datasets # Directory paths data_dir = "../dataset" pdbbind_dir = os.path.join(data_dir, "refined-set-2016/") pdbbind_dataset = datasets.pdbbind(home=pdbbind_dir, default_set='refined', version=2016) def get_pdb_complex_feature(protein_file, ligand_file): """ Returns voxel features for a pdb complex """ def get_prop(mol, left_most_point): """ Returns atom occupancies """ n = [24, 24, 24] # Voxel size # Get the channels channels = vd._getAtomtypePropertiesPDBQT(mol) sigmas = vd._getRadii(mol) channels = sigmas[:, np.newaxis] * channels.astype(float) # Choose the grid centers
def _gen_pdbbind_desc(self, pdbbind_dir,
                      pdbbind_versions=(2007, 2012, 2013, 2014, 2015, 2016),
                      desc_path=None, include_general_set=False,
                      use_proteins=False, **kwargs):
    """Generate a descriptor table for PDBBind complexes and write it to CSV.

    Merges membership/activity metadata of all requested releases, builds
    dense or sparse descriptors against the newest release and joins them
    back onto the metadata by PDB ID.

    Parameters
    ----------
    pdbbind_dir : str
        Root directory containing `v<year>/` PDBBind trees.
    pdbbind_versions : tuple of int
        PDBBind release years to merge metadata from.
    desc_path : str or None
        Output CSV path (gzip if it ends with '.gz'); defaults to
        `descs.csv` next to this file.
    include_general_set : bool
        If True, keep general-set-only entries, not just core/refined.
    use_proteins : bool
        If True, use full proteins (with a backend blacklist) instead of
        pockets.
    **kwargs
        May contain `opt`, forwarded to the pdbbind loader.
    """
    pdbbind_versions = sorted(pdbbind_versions)
    opt = kwargs.get('opt', {})

    # generate metadata: one column per (version, set) merged on pdbid
    df = None
    for pdbbind_version in pdbbind_versions:
        p = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_version),
                    version=pdbbind_version, opt=opt)
        for set_name in p.pdbind_sets:
            # normalize post-2007 'general_PL' to a plain 'general' column
            if set_name == 'general_PL':
                dataset_key = '%i_general' % pdbbind_version
            else:
                dataset_key = '%i_%s' % (pdbbind_version, set_name)
            tmp_df = pd.DataFrame({
                'pdbid': list(p.sets[set_name].keys()),
                dataset_key: list(p.sets[set_name].values())})
            if df is not None:
                df = pd.merge(tmp_df, df, how='outer', on='pdbid')
            else:
                df = tmp_df

    df.sort_values('pdbid', inplace=True)
    # keep real activities from the newest general set; other columns
    # become boolean membership flags
    tmp_act = df['%i_general' % pdbbind_versions[-1]].values
    df = df.set_index('pdbid').notnull()
    df['act'] = tmp_act

    # take non-empty and core + refined set (or everything with general set)
    # FIX: '[refined,core]' was a single-character class; use alternation
    df = df[df['act'].notnull() &
            (df.filter(regex='.*_(refined|core)$').any(axis=1) |
             include_general_set)]

    # build descriptors against the newest release
    pdbbind_db = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_versions[-1]),
                         version=pdbbind_versions[-1])
    if not desc_path:
        # FIX: directory and filename must be separate path_join arguments
        # (previously they were concatenated without a separator)
        desc_path = path_join(dirname(__file__), 'descs.csv')

    n_jobs = -1 if self.n_jobs is None else self.n_jobs

    blacklist = []
    if use_proteins:
        # list of protein files that segfault OB 2.4.1
        blacklist = pdbbind_db.protein_blacklist[oddt.toolkit.backend]

    # check if PDBID exists or is blacklisted
    mol_attr = 'protein' if use_proteins else 'pocket'
    desc_idx = [pid for pid in df.index.values
                if (pid not in blacklist and
                    getattr(pdbbind_db[pid], mol_attr) is not None)]
    result = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(method_caller)(self.descriptor_generator, 'build',
                               [pdbbind_db[pid].ligand],
                               protein=getattr(pdbbind_db[pid], mol_attr))
        for pid in desc_idx)

    # sparse descs may have different shapes, dense are stored np.array
    sparse = (hasattr(self.descriptor_generator, 'sparse') and
              self.descriptor_generator.sparse)
    if not sparse:
        result = np.vstack(result)

    # create dataframe with descriptors with pdbids as index
    df_desc = pd.DataFrame(result, index=desc_idx)
    df_desc.index.rename('pdbid', inplace=True)

    # for sparse features leave one column and cast explicitly to list
    if sparse:
        if len(df_desc.columns) > 1:
            raise Exception('There are more than one column in the '
                            'sparse descriptor table.')
        df_desc.columns = ['sparse']
        df_desc['sparse'] = df_desc['sparse'].map(
            lambda x: csr_matrix_to_sparse(x).tolist())

    compression = 'gzip' if desc_path[-3:] == '.gz' else None
    # DF are joined by index (pdbid) since some might be missing
    df.join(df_desc, how='inner').to_csv(desc_path, float_format='%.5g',
                                         compression=compression)