def test_compute_splif_features_in_range(self):
    """Validate the dict returned by compute_splif_features_in_range.

    For the bins (0, 2) and (2, 3) every key must be a pair of integer
    atom indices valid for the protein and the ligand respectively, and
    every entry of the value must be a string made of two comma-separated
    fields whose first field parses as a non-negative integer.
    """
    protein_coords, protein_mol = rgf.load_molecule(self.protein_file)
    ligand_coords, ligand_mol = rgf.load_molecule(self.ligand_file)
    n_protein_atoms = protein_mol.GetNumAtoms()
    n_ligand_atoms = ligand_mol.GetNumAtoms()
    pairwise = rgf.compute_pairwise_distances(
        protein_xyz=protein_coords, ligand_xyz=ligand_coords)

    for contact_bin in ((0, 2), (2, 3)):
        features = rgf.compute_splif_features_in_range(
            protein_mol, ligand_mol, pairwise, contact_bin)
        self.assertIsInstance(features, dict)

        for atom_pair, fragments in features.items():
            protein_atom, ligand_atom = atom_pair

            # both indices have to be integers (plain or numpy)...
            for atom in (protein_atom, ligand_atom):
                self.assertIsInstance(atom, (int, np.int64))

            # ...and point at an existing atom of the matching molecule
            limits = ((protein_atom, n_protein_atoms),
                      (ligand_atom, n_ligand_atoms))
            for atom, atom_count in limits:
                self.assertGreaterEqual(atom, 0)
                self.assertLess(atom, atom_count)

            for entry in fragments:
                # unpacking enforces exactly two comma-separated fields
                index_text, _fragment = entry.split(',')
                self.assertGreaterEqual(int(index_text), 0)
def test_featurize_splif(self):
    """featurize_splif must equal the per-bin results collected in a list.

    The expected value is built by calling
    compute_splif_features_in_range once per contact bin with the same
    molecules, distances and ecfp_degree.
    """
    protein_coords, protein_mol = rgf.load_molecule(self.protein_file)
    ligand_coords, ligand_mol = rgf.load_molecule(self.ligand_file)
    pairwise = rgf.compute_pairwise_distances(
        protein_xyz=protein_coords, ligand_xyz=ligand_coords)
    contact_bins = [(1, 2), (2, 3)]

    actual = rgf.featurize_splif(
        protein_coords,
        protein_mol,
        ligand_coords,
        ligand_mol,
        contact_bins=contact_bins,
        pairwise_distances=pairwise,
        ecfp_degree=2)

    # one reference dict per bin, in the same order as contact_bins
    expected = []
    for contact_bin in contact_bins:
        per_bin = rgf.compute_splif_features_in_range(
            protein_mol, ligand_mol, pairwise, contact_bin, ecfp_degree=2)
        expected.append(per_bin)

    self.assertIsInstance(actual, list)
    self.assertEqual(actual, expected)
def test_compute_splif_features_in_range(self):
    """Sanity-check SPLIF dictionaries computed for two distance bins.

    Keys are expected to be (protein atom index, ligand atom index) pairs
    of integers within the atom counts of the loaded molecules; values are
    expected to be iterables of 'a,b' strings where 'a' is a non-negative
    integer.
    """
    prot_coords, prot_mol = rgf.load_molecule(self.protein_file)
    lig_coords, lig_mol = rgf.load_molecule(self.ligand_file)
    prot_size = prot_mol.GetNumAtoms()
    lig_size = lig_mol.GetNumAtoms()
    dist_matrix = rgf.compute_pairwise_distances(
        protein_xyz=prot_coords, ligand_xyz=lig_coords)

    for dist_bin in ((0, 2), (2, 3)):
        splif = rgf.compute_splif_features_in_range(
            prot_mol, lig_mol, dist_matrix, dist_bin)
        self.assertIsInstance(splif, dict)

        for (p_atom, l_atom), ecfp_strings in splif.items():
            # type check first, for both members of the key
            self.assertIsInstance(p_atom, (int, np.int64))
            self.assertIsInstance(l_atom, (int, np.int64))

            # then the range check, protein index before ligand index
            self.assertGreaterEqual(p_atom, 0)
            self.assertLess(p_atom, prot_size)
            self.assertGreaterEqual(l_atom, 0)
            self.assertLess(l_atom, lig_size)

            for ecfp_string in ecfp_strings:
                # a string with more or fewer than two fields raises here
                first_field, _second_field = ecfp_string.split(',')
                parsed_index = int(first_field)
                self.assertGreaterEqual(parsed_index, 0)
def test_featurize_binding_pocket_ecfp(self):
    """Exercise featurize_binding_pocket_ecfp with several option sets.

    Checks that precomputed distances give the same dicts as the default
    call, that cutoff=2.0 / cutoff=6.0 yield fewer / more protein entries
    than the default call, and that ecfp_degree=3 changes both dicts.
    """
    prot_xyz, prot_rdk = rgf.load_molecule(self.protein_file)
    lig_xyz, lig_rdk = rgf.load_molecule(self.ligand_file)
    distance = rgf.compute_pairwise_distances(
        protein_xyz=prot_xyz, ligand_xyz=lig_xyz)

    # positional arguments shared by every call below
    complex_args = (prot_xyz, prot_rdk, lig_xyz, lig_rdk)

    # default call vs. call with precomputed distances
    base_prot, base_lig = rgf.featurize_binding_pocket_ecfp(*complex_args)
    dist_prot, dist_lig = rgf.featurize_binding_pocket_ecfp(
        *complex_args, pairwise_distances=distance)

    # make sure we really got dicts before comparing them
    self.assertIsInstance(base_prot, dict)
    self.assertIsInstance(base_lig, dict)
    self.assertEqual(base_prot, dist_prot)
    self.assertEqual(base_lig, dist_lig)

    # a smaller cutoff should give fewer features, a bigger one more
    near_prot, near_lig = rgf.featurize_binding_pocket_ecfp(
        *complex_args, cutoff=2.0)
    far_prot, far_lig = rgf.featurize_binding_pocket_ecfp(
        *complex_args, cutoff=6.0)

    self.assertLess(len(near_prot), len(base_prot))
    # ligands are typically small so all atoms might be present
    self.assertLessEqual(len(near_lig), len(base_lig))
    self.assertGreater(len(far_prot), len(base_prot))
    self.assertGreaterEqual(len(far_lig), len(base_lig))

    # a different ecfp_degree has to change the result
    deg3_prot, deg3_lig = rgf.featurize_binding_pocket_ecfp(
        *complex_args, ecfp_degree=3)
    self.assertNotEqual(deg3_prot, base_prot)
    self.assertNotEqual(deg3_lig, base_lig)
def test_featurize_binding_pocket_ecfp(self):
    """Compare featurize_binding_pocket_ecfp results across options.

    The default call is the reference; it is compared against calls with
    precomputed pairwise distances, with cutoff 2.0 and 6.0, and with
    ecfp_degree=3.
    """
    protein_coords, protein_mol = rgf.load_molecule(self.protein_file)
    ligand_coords, ligand_mol = rgf.load_molecule(self.ligand_file)
    pairwise = rgf.compute_pairwise_distances(
        protein_xyz=protein_coords, ligand_xyz=ligand_coords)

    def pocket_ecfp(**options):
        # same complex every time, only the keyword options vary
        return rgf.featurize_binding_pocket_ecfp(
            protein_coords, protein_mol, ligand_coords, ligand_mol,
            **options)

    # results must not depend on who computes the distances
    ref_protein, ref_ligand = pocket_ecfp()
    pre_protein, pre_ligand = pocket_ecfp(pairwise_distances=pairwise)

    # ...but first check if we actually got two dicts
    self.assertIsInstance(ref_protein, dict)
    self.assertIsInstance(ref_ligand, dict)
    self.assertEqual(ref_protein, pre_protein)
    self.assertEqual(ref_ligand, pre_ligand)

    # number of features is expected to grow with the distance cutoff
    tight_protein, tight_ligand = pocket_ecfp(cutoff=2.0)
    wide_protein, wide_ligand = pocket_ecfp(cutoff=6.0)

    ref_protein_count = len(ref_protein)
    ref_ligand_count = len(ref_ligand)
    self.assertLess(len(tight_protein), ref_protein_count)
    # ligands are typically small so all atoms might be present
    self.assertLessEqual(len(tight_ligand), ref_ligand_count)
    self.assertGreater(len(wide_protein), ref_protein_count)
    self.assertGreaterEqual(len(wide_ligand), ref_ligand_count)

    # check if using different ecfp_degree changes anything
    e3_protein, e3_ligand = pocket_ecfp(ecfp_degree=3)
    self.assertNotEqual(e3_protein, ref_protein)
    self.assertNotEqual(e3_ligand, ref_ligand)
def test_compute_charge_dictionary(self):
    """compute_charge_dictionary must map every atom index to a number.

    Run for both the ligand and the protein file after Gasteiger charges
    have been computed on the loaded molecule.
    """
    for path in (self.ligand_file, self.protein_file):
        _, molecule = rgf.load_molecule(path)
        ComputeGasteigerCharges(molecule)
        charges = rgf.compute_charge_dictionary(molecule)

        atom_count = molecule.GetNumAtoms()
        # exactly one entry per atom...
        self.assertEqual(len(charges), atom_count)
        # ...keyed by the atom index, holding a numeric value
        for atom_index in range(atom_count):
            self.assertIn(atom_index, charges)
            self.assertIsInstance(charges[atom_index], (float, int))
def test_voxelize(self):
    """Voxelize ECFP features of protein and ligand and check the tensors.

    Coordinates are shifted by the ligand centroid first. Both tensors
    must have shape (box_w, box_w, box_w, 2**f_power); the protein tensor
    must sum to a positive value below the protein atom count, the ligand
    tensor must sum to exactly the ligand atom count.
    """
    prot_xyz, prot_rdk = rgf.load_molecule(self.protein_file)
    lig_xyz, lig_rdk = rgf.load_molecule(self.ligand_file)

    # move both molecules so that the ligand centroid is the origin
    ligand_center = rgf.compute_centroid(lig_xyz)
    prot_xyz = rgf.subtract_centroid(prot_xyz, ligand_center)
    lig_xyz = rgf.subtract_centroid(lig_xyz, ligand_center)

    prot_ecfp_dict, lig_ecfp_dict = rgf.featurize_binding_pocket_ecfp(
        prot_xyz, prot_rdk, lig_xyz, lig_rdk)

    box_w = 20
    f_power = 5
    expected_shape = (box_w,) * 3 + (2**f_power,)

    featurizer = rgf.RdkitGridFeaturizer(
        box_width=box_w,
        ecfp_power=f_power,
        feature_types=['all_combined'],
        flatten=True,
        sanitize=True)

    protein_grid = featurizer._voxelize(
        rgf.convert_atom_to_voxel,
        rgf.hash_ecfp,
        prot_xyz,
        feature_dict=prot_ecfp_dict,
        channel_power=f_power)
    self.assertEqual(protein_grid.shape, expected_shape)
    protein_total = protein_grid.sum()
    # protein is too big for the box, some features should be missing
    self.assertGreater(protein_total, 0)
    self.assertLess(protein_total, prot_rdk.GetNumAtoms())

    ligand_grid = featurizer._voxelize(
        rgf.convert_atom_to_voxel,
        rgf.hash_ecfp,
        lig_xyz,
        feature_dict=lig_ecfp_dict,
        channel_power=f_power)
    self.assertEqual(ligand_grid.shape, expected_shape)
    ligand_total = ligand_grid.sum()
    # whole ligand should fit in the box
    self.assertEqual(ligand_total, lig_rdk.GetNumAtoms())
def test_load_molecule(self):
    """load_molecule must return an (N, 3) array and a Mol with N atoms.

    Checked on the ligand file for every combination of the
    add_hydrogens and calc_charges flags.
    """
    # adding hydrogens and charges is tested in dc.utils
    switch = (True, False)
    flag_pairs = [(hydrogens, charges)
                  for hydrogens in switch
                  for charges in switch]

    for add_hydrogens, calc_charges in flag_pairs:
        coords, molecule = rgf.load_molecule(
            self.ligand_file, add_hydrogens, calc_charges)
        atom_count = molecule.GetNumAtoms()

        self.assertIsInstance(coords, np.ndarray)
        self.assertIsInstance(molecule, Mol)
        # one xyz row per atom
        self.assertEqual(coords.shape, (atom_count, 3))
def setUp(self):
    """Build the fixture molecules: a 4-membered ring and the 3ws9 pair.

    The protein and ligand are read from files located next to this test
    module, without added hydrogens or charges and with sanitization on.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    protein_path = os.path.join(here, '3ws9_protein_fixer_rdkit.pdb')
    ligand_path = os.path.join(here, '3ws9_ligand.sdf')

    # simple flat ring
    ring = MolFromSmiles('C1CCC1')
    self.cycle4 = ring
    ring.Compute2DCoords()

    # load and sanitize two real molecules (coordinates are not kept)
    _, self.prot = rgf.load_molecule(
        protein_path,
        add_hydrogens=False,
        calc_charges=False,
        sanitize=True)
    _, self.lig = rgf.load_molecule(
        ligand_path,
        add_hydrogens=False,
        calc_charges=False,
        sanitize=True)
def setUp(self):
    """Prepare three molecules used by the tests of this case.

    `cycle4` is built from the SMILES 'C1CCC1' with 2D coordinates;
    `prot` and `lig` are loaded from the 3ws9 files that sit in the same
    directory as this module.
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    # simple flat ring
    self.cycle4 = MolFromSmiles('C1CCC1')
    self.cycle4.Compute2DCoords()

    # identical loading options for both real molecules
    load_options = {
        'add_hydrogens': False,
        'calc_charges': False,
        'sanitize': True,
    }

    # load and sanitize two real molecules; only the Mol objects are stored
    protein_file = os.path.join(test_dir, '3ws9_protein_fixer_rdkit.pdb')
    _, self.prot = rgf.load_molecule(protein_file, **load_options)

    ligand_file = os.path.join(test_dir, '3ws9_ligand.sdf')
    _, self.lig = rgf.load_molecule(ligand_file, **load_options)
def test_featurize_splif(self):
    """Check featurize_splif against compute_splif_features_in_range.

    For the bins (1, 2) and (2, 3) the returned list must equal the list
    of dicts obtained by computing each bin separately with
    ecfp_degree=2.
    """
    prot_coords, prot_mol = rgf.load_molecule(self.protein_file)
    lig_coords, lig_mol = rgf.load_molecule(self.ligand_file)
    dist_matrix = rgf.compute_pairwise_distances(
        protein_xyz=prot_coords, ligand_xyz=lig_coords)

    bins = [(1, 2), (2, 3)]
    splif_options = {
        'contact_bins': bins,
        'pairwise_distances': dist_matrix,
        'ecfp_degree': 2,
    }
    result = rgf.featurize_splif(
        prot_coords, prot_mol, lig_coords, lig_mol, **splif_options)

    # reference: every bin featurized on its own, kept in bin order
    reference = []
    for single_bin in bins:
        reference.append(
            rgf.compute_splif_features_in_range(
                prot_mol, lig_mol, dist_matrix, single_bin, ecfp_degree=2))

    self.assertIsInstance(result, list)
    self.assertEqual(result, reference)
def test_compute_all_ecfp(self):
    """Check compute_all_ecfp for all atoms and for a subset of atoms.

    For degrees 1, 2 and 3 the call without `indices` must return a dict
    keyed by every atom index in order, and the call with `indices` must
    return a dict keyed by exactly the requested atoms.

    The subset is drawn from a locally seeded generator, so a failing run
    can be reproduced and the global NumPy random state is left untouched.
    """
    _, mol = rgf.load_molecule(self.ligand_file)
    num_atoms = mol.GetNumAtoms()
    # fixed seed: same atom subsets on every run
    rng = np.random.RandomState(0)

    for degree in range(1, 4):
        # TODO test if dict contains smiles

        ecfp_all = rgf.compute_all_ecfp(mol, degree=degree)
        self.assertIsInstance(ecfp_all, dict)
        self.assertEqual(len(ecfp_all), num_atoms)
        self.assertEqual(list(ecfp_all.keys()), list(range(num_atoms)))

        # subset size in [1, num_atoms - 1], then that many distinct atoms
        num_ind = rng.randint(1, num_atoms)
        indices = list(rng.choice(num_atoms, num_ind, replace=False))

        ecfp_selected = rgf.compute_all_ecfp(
            mol, indices=indices, degree=degree)
        self.assertIsInstance(ecfp_selected, dict)
        self.assertEqual(len(ecfp_selected), num_ind)
        self.assertEqual(sorted(ecfp_selected.keys()), sorted(indices))