def search_cocrystals(filter_solvents=True):
    """
    Search the whole CSD for structures that contain two different molecules.

    A structure is kept when it passes the search settings below (organic,
    not polymeric, 3D coordinates, no disorder, no errors, no ions, no
    metals) and its SMILES splits into exactly two distinct components.
    Polymorphs are then removed with ``remove_polymorphs``. The identifiers
    and chemical names of the remaining structures, minus any whose name
    contains "solvate" or "clathrate", are written to
    ``new_all_cocrystals.csv`` in the current working directory.

    :param bool filter_solvents: if True, drop every structure that has a
        component whose SMILES is in ``clean_smiles.SOLVENT_SMILES``
    :return: CSD identifiers left after polymorph removal (the name-based
        solvate/clathrate filter only affects the CSV, not this value)
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start_time = time.perf_counter()
    csd = MoleculeReader('CSD')
    entry_reader = EntryReader('CSD')

    settings = search.Search.Settings()
    settings.only_organic = True
    settings.not_polymeric = True
    settings.has_3d_coordinates = True
    settings.no_disorder = True
    settings.no_errors = True
    settings.no_ions = True
    settings.no_metals = True

    pairs = []
    for entry in csd:
        if not settings.test(entry):
            continue
        mol = csd.molecule(entry.identifier)
        mol.normalise_labels()
        smi = mol.smiles
        if smi is None:
            continue
        # We make sure that the structure consists of two different molecules
        if len(Remove(smi.split('.'))) == 2:
            pairs.append(mol.identifier)

    # Clean the list from solvents
    if filter_solvents:
        print('Solvates and hydrates will be removed')
        solvates = set()  # set: O(1) membership test in the filter below
        for identifier in pairs:
            mol = csd.molecule(identifier)
            if any(component.smiles in clean_smiles.SOLVENT_SMILES
                   for component in mol.components):
                solvates.add(mol.identifier)
        final_cocrystals = [x for x in pairs if x not in solvates]
    else:
        final_cocrystals = pairs

    # Clean the list from polymorphs
    cocrystals = remove_polymorphs(final_cocrystals)
    end_time = time.perf_counter()

    # Names are looked up here rather than inside the solvent-filter branch,
    # so the function also works when filter_solvents is False.
    names = [entry_reader.entry(identifier).chemical_name
             for identifier in cocrystals]
    cocrystals_data = pd.concat(
        [pd.DataFrame(cocrystals, columns=['csd_id']),
         pd.DataFrame(names, columns=['name'])],
        axis=1)
    cocrystals_data = cocrystals_data.dropna(axis=0)
    dataset_cocrystals = cocrystals_data[
        ~cocrystals_data['name'].str.contains("solvate")]
    dataset_cocrystals = dataset_cocrystals[
        ~dataset_cocrystals['name'].str.contains("clathrate")]

    print(end_time - start_time)
    dataset_cocrystals.to_csv('new_all_cocrystals.csv', index=False)
    return cocrystals
def get_smiles_from_csd():
    """
    Read each CSD identifier of the co-crystal dataset and collect its SMILES.

    The identifiers come from the ``csd_id`` column of
    ``datasets/train_data/cocrystals2020.csv``. For every identifier the
    molecule SMILES is split on ``.``, passed through ``Remove`` and the
    first two resulting components are stored.

    :return: tuple ``(co_crystals, smiles1, smiles2)``: the dataframe read
        from the CSV and two lists holding the first and second component
        SMILES, in the row order of the dataframe
    """
    co_crystals = pd.read_csv('datasets/train_data/cocrystals2020.csv',
                              encoding='latin1')

    # Open the database once; it used to be re-opened for every identifier.
    csd = MoleculeReader('CSD')

    smiles1 = []
    smiles2 = []
    for identifier in co_crystals.csd_id.values:
        components = Remove(csd.molecule(identifier).smiles.split('.'))
        smiles1.append(components[0])
        smiles2.append(components[1])

    return co_crystals, smiles1, smiles2
def get_ensemble(self, nrotations, charged=False):
    """
    Build the ``ParalleliselRunner`` luigi task for all proteins of the
    ensemble and return the paths where the results are expected.

    :param nrotations: forwarded to ``ParalleliselRunner``; also used in the
        name of the result directory (``fullsize_hotspots_<nrotations>``)
    :param bool charged: forwarded to ``ParalleliselRunner``
    :return: list with one ``.../fullsize_hotspots_<nrotations>/out.zip``
        path per protein path
    """
    ligand_path = self.find_largest_ligand()
    largest_ligand = MoleculeReader(ligand_path)[0]
    ligand_protein = Protein.from_file(
        join(dirname(ligand_path), 'protein.mol2'))
    # NOTE(review): the binding site is built but not used afterwards.
    bs = Protein.BindingSiteFromMolecule(protein=ligand_protein,
                                         molecule=largest_ligand,
                                         distance=6.5)

    prot_paths = self.get_protein_paths()
    print(prot_paths)
    print(self.ensemble_name, len(prot_paths))

    tasks = [
        ParalleliselRunner(prot_paths, nrotations, charged,
                           data_source='KLIFS')
    ]
    luigi.build(tasks, local_scheduler=True, workers=30)

    hot_paths = []
    for in_pdb in prot_paths:
        result_dir = join(dirname(in_pdb),
                          "fullsize_hotspots_{}".format(nrotations))
        hot_paths.append(join(result_dir, "out.zip"))
    return hot_paths
def scaled_score_ligands(self, tolerance):
    """
    Applies linear scaling to scores assigned to atom, depending on distance
    between atom and the scored point.

    Every ligand in ``<hotspot_path>/docking_tmp/docked_ligands.mol2`` is
    rescored; the mean ``partial_charge`` over its heavy atoms (rounded to
    two decimals) is appended to its identifier and the rescored ligands are
    written to ``<hotspot_path>/scored_docks.mol2``.

    :param int tolerance: How many gridpoints away is it acceptable for an
        atom to be from the nearest point of its corresponding map.
    :return: None
    """
    scorer_result = self.get_scorer_result()
    hotspot = scorer_result.get_hotspot()
    docked_ligands = MoleculeReader(
        join(self.hotspot_path, "docking_tmp", "docked_ligands.mol2"))

    rescored = []
    for docked in docked_ligands:
        scored = scorer_result.get_scaled_score(docked, tolerance, hotspot)[0]
        atom_scores = [atom.partial_charge for atom in scored.heavy_atoms]
        mean_score = np.mean(atom_scores)
        scored.identifier += "_{}".format(round(mean_score, 2))
        rescored.append(scored)

    out_path = os.path.join(self.hotspot_path, "scored_docks.mol2")
    with MoleculeWriter(out_path) as writer:
        for scored in rescored:
            writer.write(scored)
def main():
    """
    Generate conformers for every molecule of every SDF file in the input
    directory.

    For each file returned by ``read_sdf_file`` a directory named after the
    part of the file name before the first ``_`` is created below
    ``args.conformers_file_dir``, made the working directory, and
    ``Conformer_generator.generate_conformer`` is mapped over the file's
    molecules on a pool of 8 worker processes. A failure on one file is
    reported and the run carries on with the next file. The elapsed time is
    printed at the end.
    """
    args = parse_arguments()
    # The loop changes the working directory, which would re-root relative
    # paths from the second file on; resolve both directories once up front.
    sdf_dir = os.path.abspath(args.sdf_dir)
    conformers_root = os.path.abspath(args.conformers_file_dir)

    list_of_sdf_files = list(read_sdf_file(sdf_dir))
    print(list_of_sdf_files)

    proc = Conformer_generator(args)
    t = TicToc()
    t.tic()
    # The context manager guarantees the worker processes are released.
    with Pool(8) as p:
        for file in list_of_sdf_files:
            sdf_file_name = file.split('_')[0]
            print(sdf_file_name)
            full_directory_path = os.path.join(conformers_root,
                                               '{}'.format(sdf_file_name))
            # exist_ok: several files can share the same name prefix.
            os.makedirs(full_directory_path, exist_ok=True)
            os.chdir(full_directory_path)
            try:
                list_of_molecules = list(
                    MoleculeReader(os.path.join(sdf_dir, file)))
                p.map(proc.generate_conformer, list_of_molecules)
            except Exception as exc:
                # Best effort: report and continue. Unlike a bare "except:",
                # this lets KeyboardInterrupt / SystemExit through.
                print("can not read sdf file {}".format(file))
                print(exc)
    t.toc()
    print(t.elapsed)
def hot_calc(inputs):
    """
    Run a hotspot calculation for one protein/ligand pair and write the
    result together with its extracted 250-volume hotspot.

    :param inputs: ``(pdb, het, pdir)``; the protein is read from
        ``<pdir>/<pdb>.pdb`` and the ligand from ``<pdir>/<pdb>_<het>.mol2``
    :return: None; results are written into ``pdir`` by ``HotspotWriter``
    """
    pdb, het, pdir = inputs
    protein = Protein.from_file(os.path.join(pdir, f"{pdb}.pdb"))
    ligand = MoleculeReader(os.path.join(pdir, f"{pdb}_{het}.mol2"))[0]

    hotspot = Runner().from_protein(protein, nprocesses=3, cavities=ligand)
    for probe, grid in hotspot.super_grids.items():
        hotspot.super_grids[probe] = grid.max_value_of_neighbours()

    best_volume = Extractor(hotspot).extract_volume(volume=250)

    # smoothing
    for probe, grid in best_volume.super_grids.items():
        best_volume.super_grids[probe] = grid.gaussian(sigma=0.5)

    best_volume.identifier = "bestvol"
    hotspot.identifier = "hotspot"

    with HotspotWriter(pdir) as writer:
        writer.write([hotspot, best_volume])
def find_largest_ligand(self):
    """
    Looks for the largest ligand (highest molecular weight) among the
    ``*.sdf`` files in ``self.lig_dir``.

    :return: base file name of the heaviest ligand, or None (after printing
        a message) when no ligand was found
    """
    # Get the ligands for the proteins returned by SIENA
    print(self.lig_dir)
    sdf_paths = glob(join(self.lig_dir, "*.sdf"))
    print(sdf_paths)
    reader = MoleculeReader(sdf_paths)

    # Map each ligand file name to the molecular weight of its molecule
    weight_by_file = {}
    for m_fname, m in zip(reader.file_name, reader):
        weight_by_file[basename(m_fname)] = m.molecular_weight
    print(weight_by_file)

    if not weight_by_file:
        print("SIENA found no ligands for ensemble {}".format(
            self.ensemble_name))
        return

    # Largest (weight, file name) pair; ties on weight fall back to the name
    heaviest = max(
        (weight, fname) for fname, weight in weight_by_file.items())[1]
    print(heaviest)
    return heaviest
def dock(self):
    """
    Handle a docking run with GOLD.

    The protein is written to ``<temp>/<hs_pdb>.mol2``, the binding site is
    defined as everything within 12.0 of the reference ligand's centre of
    geometry, and ``self.search_ligands`` are docked (3 docks each) with the
    'plp' fitness function. Output goes to ``<temp>/docked_ligands.mol2``.

    :return: a ``MoleculeReader`` on the docked ligands file
    """
    docker = Docker()

    # enables hotspot constraints
    docker.settings = hs_screening.DockerSettings()

    protein_file = os.path.join(self.temp, self.hs_pdb + ".mol2")
    with MoleculeWriter(protein_file) as writer:
        writer.write(self.protein)

    # setup
    docker.settings.add_protein_file(protein_file)
    binding_site = docker.settings.BindingSiteFromPoint(
        protein=docker.settings.proteins[0],
        origin=self.ligand.centre_of_geometry(),
        distance=12.0)
    docker.settings.binding_site = binding_site
    docker.settings.fitness_function = 'plp'
    docker.settings.autoscale = 10.
    docker.settings.output_directory = self.temp
    docker.settings.output_file = "docked_ligands.mol2"
    docker.settings.add_ligand_file(self.search_ligands, ndocks=3)

    # No constraints are applied: the template-similarity constraint, the
    # hotspot hydrogen-bond constraint and the apolar fitting points that
    # used to be set up here are all disabled.

    # dock
    docker.dock()

    docked_path = os.path.join(docker.settings.output_directory,
                               docker.settings.output_file)
    return MoleculeReader(docked_path)
def _read_ligands(self):
    """
    Reads the scored ligands from the result directories for the target,
    i.e. every ``<stem>/*/scored_ligands.mol2`` file.

    :return: a :class: ccdc.io.MoleculeReader instance
    """
    return MoleculeReader(glob(join(self.stem, "*", "scored_ligands.mol2")))
def test_func(self, unp_id):
    """
    Screen the actives and decoys of one target against its largest overlay
    ligand and write the ranked scores.

    The overlay file is located with ``fetch_overlay_mol``, the query is read
    from the path returned by ``select_largest_ligand``, and a ``Screener``
    is built from it. Screened molecules go to
    ``<unp_id>_actives_screened.mol2`` and ``<unp_id>_decoys_screened.mol2``
    and the sorted scores to ``<unp_id>_screening_scores.csv``, all inside
    ``<output_directory>/<unp_id>``. That directory is created here (it must
    not exist yet) and becomes the working directory.

    :param unp_id: target identifier used in all input and output file names
    :return: None
    """
    # Parse arguments
    overlay_dir = self.args.query
    param_dir = self.args.param_dir

    # os.chdir() below would re-root relative paths, so every path that is
    # used after it is made absolute here.
    active_conformers_dir = os.path.abspath(
        os.path.join(self.args.conformers_dir_actives, '{}'.format(unp_id)))
    decoy_conformers_dir = os.path.abspath(
        os.path.join(self.args.conformers_dir_decoys, '{}'.format(unp_id)))
    actives_to_screen = os.path.abspath(
        os.path.join(self.args.actives_dir,
                     '{}_active_3d_rdkit.sdf'.format(unp_id)))
    print(actives_to_screen)
    decoys_to_screen = os.path.abspath(
        os.path.join(self.args.decoys_dir,
                     '{}_decoy_3d_rdkit.sdf'.format(unp_id)))
    print(decoys_to_screen)
    complete_output_dir = os.path.abspath(
        os.path.join(self.args.output_directory, '{}'.format(unp_id)))

    overlay_mol = self.fetch_overlay_mol(file_dir=overlay_dir, id=unp_id)
    print(overlay_mol)
    largest_ligand = self.select_largest_ligand(overlay_mol)
    print(largest_ligand)

    # Read the query mol or overlay of mols
    query = list(MoleculeReader(largest_ligand))
    print(query)

    settings = setup_screener(param_dir)
    os.makedirs(complete_output_dir)
    os.chdir(complete_output_dir)

    # Generate fields around the input query
    screener = Screener(query, settings=settings)

    output_name_actives = os.path.join(
        complete_output_dir, "{}_actives_screened.mol2".format(unp_id))
    print(output_name_actives)
    output_name_decoys = os.path.join(
        complete_output_dir, "{}_decoys_screened.mol2".format(unp_id))
    print(output_name_decoys)

    # Screen set of actives (activity flag 1), then decoys (flag 0)
    actives_scores = screen_molecules(screener, actives_to_screen, 1,
                                      active_conformers_dir,
                                      output_name_actives)
    decoys_scores = screen_molecules(screener, decoys_to_screen, 0,
                                     decoy_conformers_dir,
                                     output_name_decoys)

    print("writing scores")
    screening_scores = sorted(list(actives_scores) + list(decoys_scores))
    output_name_scores = os.path.join(
        complete_output_dir, "{}_screening_scores.csv".format(unp_id))
    write_scores(screening_scores, output_name_scores)
    print("finish:{}".format(unp_id))
def _get_out_maps(self, probe, grid_dict, return_probes=False):
    """
    private method

    organises the sampling of weighted superstar maps by molecular probes

    :param str probe: probe identifier set in the Atomic Hotspot calculation
    :param dict grid_dict: dictionary with key = probe identifier and value = `hotspots.grid_extension.Grid`
    :param bool return_probes: optional, bool indicating if probe molecules should be returned
    :return: the sampled probes when ``return_probes`` is True, otherwise None
    """
    use_charged = self.charged_probes

    donor_grid = _SampleGrid('donor', grid_dict['donor'],
                             _SampleGrid.is_donor)
    acceptor_grid = _SampleGrid('acceptor', grid_dict['acceptor'],
                                _SampleGrid.is_acceptor)
    apolar_grid = _SampleGrid('apolar', grid_dict['apolar'],
                              _SampleGrid.is_apolar)

    # The sampler takes the grids in the order apolar, donor, acceptor
    # and, for charged probes only, negative then positive.
    sample_grids = [apolar_grid, donor_grid, acceptor_grid]
    if use_charged:
        sample_grids.append(_SampleGrid('negative', grid_dict['negative'],
                                        _SampleGrid.is_negative))
        sample_grids.append(_SampleGrid('positive', grid_dict['positive'],
                                        _SampleGrid.is_positive))

    self.sampler = self._Sampler(*sample_grids,
                                 settings=self.sampler_settings)

    # Charged probes are read from the "test" probe file, every other probe
    # from the file for the configured probe size.
    if use_charged and probe in ("negative", "positive"):
        size_tag = "test"
    else:
        size_tag = self.probe_size

    probe_path = pkg_resources.resource_filename('hotspots', 'probes/')
    probe_file = join(probe_path,
                      "rotate-{}_{}_flat.mol2".format(probe, size_tag))
    mol = MoleculeReader(probe_file)[0]

    probes = self.sampler.sample(mol, probe=probe)

    for pg in self.sampler.probe_grids:
        if pg.name.lower() != probe:
            continue
        try:
            self.out_grids[pg.name].append(pg.grid)
        except KeyError:
            self.out_grids[pg.name] = [pg.grid]

    if return_probes:
        return probes
def _molecule(self, filename, no_assign_bond_types):
    """
    Implementation detail - Get the first molecule from a file

    :param filename: file passed to ``MoleculeReader``
    :param bool no_assign_bond_types: when True, bond types are left as read
    :return: the first molecule, with normalised labels
    """
    first = MoleculeReader(filename)[0]
    first.normalise_labels()
    if no_assign_bond_types:
        return first
    first.assign_bond_types()
    return first
def __init__(self, data, out_dir):
    """
    Store the inputs, call ``self._write()`` and load the written files.

    :param data: stored as ``self.data``
    :param out_dir: stored as ``self.out_dir``
    """
    self.data = data
    self.out_dir = out_dir
    # presumably filled in by _write(); verify against its implementation
    self.files = {'protein': [], 'ligands': []}
    self._write()

    protein_file = self.files['protein'][0]
    self._protein = Protein.from_file(protein_file)
    # one reader per ligand file (readers, not molecules)
    self._ligands = [
        MoleculeReader(ligand_file) for ligand_file in self.files['ligands']
    ]
def get_fragment(self):
    """
    Gets the reference fragment, loading it on first use.

    When ``self.reference_fragment`` is not set yet, it is set to the first
    molecule of ``<hotspot_path>/scored_ligands.mol2``.

    :return: the reference fragment (previously nothing was returned, so the
        method could not be used as the getter its name promises)
    """
    if not self.reference_fragment:
        ligs_p = join(self.hotspot_path, "scored_ligands.mol2")
        self.reference_fragment = MoleculeReader(ligs_p)[0]
    return self.reference_fragment
def select_largest_ligand(self, overlay_file):
    """
    Return the SDF path of the overlay molecule with the most atoms.

    :param overlay_file: file with the overlaid molecules
    :return: ``<pdb_ligand_dir>/<identifier>.sdf`` for the molecule with the
        highest atom count (the first one wins on ties)
    """
    overlay = list(MoleculeReader(overlay_file))
    biggest = max(overlay, key=lambda mol: len(mol.atoms))
    file_name = '{}.sdf'.format(biggest.identifier)
    return os.path.join(self.args.pdb_ligand_dir, file_name)
def setUp(self) -> None:
    """Create the Arpeggio wrappers and reference ligands for 2vta and 1xkk."""
    source = "file"
    self.tmpdir = os.path.abspath("testdata/wrapper_arpeggio/tmpdir")
    self.data = f"testdata/wrapper_arpeggio/prepare/{source}"

    def first_molecule(file_name):
        # first molecule of a file in the test data directory
        return MoleculeReader(os.path.join(self.data, file_name))[0]

    # 2vta / LZ1
    self.arpeggio_2vta = Arpeggio(
        pdb_code="2vta",
        hetid="LZ1",
        chain="A",
        tmpdir=os.path.join(self.tmpdir, "lz1"),
    )
    self.testlz1 = first_molecule("LZ1.mol2")
    self.lz1_resid = "A/1301/"

    # 1xkk / FMM
    self.arpeggio_1xkk = Arpeggio(
        pdb_code="1xkk",
        hetid="FMM",
        chain="A",
        tmpdir=os.path.join(self.tmpdir, "fmm"),
    )
    self.testfmm = first_molecule("FMM.mol2")
    self.fmm_resid = "A/91/"
def run():
    """
    Run a GOLD docking with hotspot pharmacophore constraints.

    Command line arguments: ``parent`` (must be an absolute path), ``score``,
    ``autoscale``, ``run_id`` and the CrossMiner file name relative to
    ``parent``. Results end up in ``<parent>/gold_results/<run_id>``.
    """
    # must be abspath
    parent = sys.argv[1]
    score = sys.argv[2]
    autoscale = sys.argv[3]
    run_id = sys.argv[4]
    crossminer_file = os.path.join(parent, sys.argv[5])

    # data
    conf_name = "hs_gold.conf"
    results_root = check_dir(os.path.join(parent, "gold_results"))
    out_path = check_dir(os.path.join(results_root, run_id))
    junk = check_dir(os.path.join(out_path, "all"))

    hotspot = os.path.join(parent, "hotspot_pharmacophore", "out.zip")
    crystal_ligand = os.path.join(parent, "crystal_ligand.mol2")
    actives = os.path.join(parent, "actives_final.mol2")
    decoys = os.path.join(parent, "decoys_final.mol2")
    prot_file = os.path.join(out_path, "protein.mol2")

    # output protein
    with hs_io.HotspotReader(hotspot) as reader:
        best_volumes = [
            h for h in reader.read() if h.identifier == "bestvol"
        ]
        hr = best_volumes[0]

    with MoleculeWriter(prot_file) as prot_writer:
        prot_writer.write(hr.protein)

    pharmacophore = HotspotPharmacophoreModel.from_file(crossminer_file)
    constraint_str = pharmacophore.to_gold_conf(score=score)

    # create template
    gold_conf_str = template(autoscale, crystal_ligand, actives, decoys,
                             junk, prot_file, constraint_str)
    print(gold_conf_str)
    with open(os.path.join(out_path, conf_name), "w") as conf_file:
        conf_file.write(gold_conf_str)

    # linux only
    gold_exe = os.path.join(os.environ["GOLD_DIR"], "bin/gold_auto")

    # run docking
    with PushDir(out_path):
        os.system(f"{gold_exe} {conf_name}")

    # process results
    docked = MoleculeReader(os.path.join(junk, "docked_ligands.mol2"))

    # make it consistent with other names
    docked_out = os.path.join(out_path, "docked_ligand.mol2")
    with MoleculeWriter(docked_out) as mol_writer:
        for pose in docked:
            # drop atoms with an unknown element before writing
            for atm in pose.atoms:
                if atm.atomic_symbol == "Unknown":
                    pose.remove_atom(atm)
            mol_writer.write(pose)

    shutil.copyfile(os.path.join(junk, "bestranking.lst"),
                    os.path.join(out_path, "bestranking.lst"))
def main():
    """
    Score docking poses against hotspot maps and write them out ranked.

    Poses are read from ``data/gold_docking_poses.sdf``, the hotspot result
    from ``data/out.zip``, and the ranked molecules (with their scores
    attached as data) are written to ``data/ranked.sdf``.
    """
    # input files #############################
    mol_file = "data/gold_docking_poses.sdf"
    hotspot_files = "data/out.zip"
    output_file = "data/ranked.sdf"

    # option 1: rank based on apolar score
    # sort_on = ["apolar"]
    # option 2: rank based on donor and acceptor scores
    sort_on = ["donor", "acceptor"]
    # option 3:
    # sort_on = ["simple_score"]
    ###########################################

    # read hotspots and molecules
    # (a list, so molecules can retain new attributes)
    mols = list(MoleculeReader(mol_file))
    hr = HotspotReader(hotspot_files).read()

    for probe, grid in hr.super_grids.items():
        hr.super_grids[probe] = grid.max_value_of_neighbours()

    # create a grid which can contain all docking poses
    pose_coords = {
        atm.coordinates for mol in mols for atm in mol.heavy_atoms
    }
    small_blank = Grid.initalise_grid(coords=pose_coords, padding=2)

    # set the protein to -1 to detect clashing
    protein_grid = hr.super_grids["apolar"].copy_and_clear()
    for atm in hr.protein.atoms:
        protein_grid.set_sphere(point=atm.coordinates,
                                radius=atm.vdw_radius * 0.9,
                                value=-1,
                                scaling='None')
    protein_grid = _shrink(small=small_blank, big=protein_grid)

    # shrink hotspot maps to save time
    sub_grids = {}
    for probe, grid in hr.super_grids.items():
        sub_grids[probe] = _shrink(small=small_blank, big=grid) + protein_grid

    # score the mols
    for mol in mols:
        scores = example_score(sub_grids, mol, small_blank)
        mol.data = scores
        simple = simple_score(hr, mol)
        mol.data.update({"simple_score": simple})

    ranked_mols = ranked_molecules(mols, sort_on)

    # output ranked mols in sdf format with data attached
    _output_sdf(ranked_mols, output_file)
def read(self, path):
    """
    Load hits from ``<path>/hits_mols.mol2`` and ``<path>/hits_attr.csv``.

    For every molecule, the ``rmsd`` and ``activity`` of the first CSV row
    with a matching ``identifier`` are used to build a ``Hit`` that is
    appended to ``self.hits``.

    :param path: directory holding both files
    """
    attributes = pd.read_csv(os.path.join(path, "hits_attr.csv"),
                             index_col=0)
    molecules = MoleculeReader(os.path.join(path, "hits_mols.mol2"))

    for mol in molecules:
        rows = attributes.loc[attributes.identifier == mol.identifier]
        rmsd = rows["rmsd"].values.tolist()[0]
        activity = rows["activity"].values.tolist()[0]
        self.hits.append(Hit(mol, rmsd, mol.identifier, activity))
def __init__(self):
    """
    Parse the command line, download and align both proteins, extract the
    reference ligand and compute the RMSD of every docked result to it.

    Side effects: both PDB files are downloaded into a fresh temporary
    directory and ``reference.mol2`` is written next to this script.
    """
    # NOTE(review): super(self.__class__, self) recurses endlessly if this
    # class is ever subclassed -- confirm it is not, or name the class here.
    super(self.__class__, self).__init__(description=__doc__)

    # handle command line arguments
    positional_arguments = (
        ('protein', 'pdb_code of protein which was used in docking'),
        ('reference', 'pdb_code of reference'),
        ('chemical_id', 'PDB identifier for the docked ligand'),
        ('results', 'path to results files'),
    )
    for arg_name, help_text in positional_arguments:
        self.add_argument(arg_name, help=help_text)
    self.add_argument('-r', '--chain_ref', default='A',
                      help='Chain to used for alignment')
    self.add_argument('-p', '--chain_protein', default='A',
                      help='Chain to used for alignment')
    self.args = self.parse_args()

    self.tmp = tempfile.mkdtemp()

    def fetch_with_hydrogens(pdb_code):
        # download the entry into the temp dir and load it protonated
        PDBResult(pdb_code).download(self.tmp)
        structure = Protein.from_file(
            os.path.join(self.tmp, pdb_code + ".pdb"))
        structure.add_hydrogens()
        return structure

    # download protein
    self.protein = fetch_with_hydrogens(self.args.protein)

    # download reference
    ref = fetch_with_hydrogens(self.args.reference)
    self.ref = self._align(self.protein, ref)

    self.reference_ligand = self._extract_ligands(
        protein=self.ref,
        ligand=self.args.chemical_id,
        chain=self.args.chain_ref)[0]

    script_dir = os.path.dirname(os.path.realpath(__file__))
    with MoleculeWriter(os.path.join(script_dir, "reference.mol2")) as w:
        w.write(self.reference_ligand)

    # results path is resolved relative to this script's directory
    self.results = MoleculeReader(
        os.path.join(script_dir, self.args.results))

    self.rmsd_values = []
    self.rmsd_values.extend(
        self.rmsd(pose, self.reference_ligand) for pose in self.results)
def search_cocrystals():
    """
    Search the whole CSD for structures that contain two different molecules
    with the specific settings.

    A structure is kept when it passes the search settings (organic, not
    polymeric, 3D coordinates, no disorder, no errors, no ions, no metals)
    and its SMILES splits into exactly two distinct components. Structures
    with a component whose SMILES is in ``solvents()`` are dropped and
    polymorphs are removed with ``remove_polymorphs``.

    :return: the result of ``remove_polymorphs`` on the filtered identifiers
    """
    csd = MoleculeReader('CSD')

    settings = search.Search.Settings()
    settings.only_organic = True
    settings.not_polymeric = True
    settings.has_3d_coordinates = True
    settings.no_disorder = True
    settings.no_errors = True
    settings.no_ions = True
    settings.no_metals = True

    identifiers = [entry.identifier for entry in csd if settings.test(entry)]
    csd_reader = MoleculeReader(identifiers)

    fin = []
    for hit in csd_reader:
        mol = csd_reader.molecule(hit.identifier)
        smi = mol.smiles
        if smi is None:
            continue
        # We make sure that the structure consists of two different molecules
        if len(Remove(smi.split('.'))) == 2:
            fin.append(mol.identifier)

    # Clean the list from solvents. The solvent list is fetched once here
    # instead of once per component of every molecule.
    solvent_smiles = solvents()
    solvates = set()  # set: O(1) membership test in the filter below
    for identifier in fin:
        mol = csd_reader.molecule(identifier)
        if any(component.smiles in solvent_smiles
               for component in mol.components):
            solvates.add(mol.identifier)
    final_cocrystals = [x for x in fin if x not in solvates]

    # Clean the list from polymorphs
    cocrystals = remove_polymorphs(final_cocrystals)
    return cocrystals
def screen_molecules(screener, mols_to_screen, activity, conformers_dir,
                     output_name):
    """Run the ligand screener and write out the screened conformations.
    Return sorted list of ranked scores.

    For every molecule of the screening set, each file in ``conformers_dir``
    whose name starts with the molecule identifier is read as one set of
    conformers and screened.

    :param screener: screener whose ``screen`` method is called
    :param mols_to_screen: Screening set
    :param activity: 1 if the molecule is active, 0 if it's a decoy
    :param conformers_dir: directory with the pre-generated conformer files
    :param output_name: File name for the result molecules
    :return: sorted list of ``(score, activity, identifier)`` tuples
    """
    # Read the molecules to screen
    screen_set = list(MoleculeReader(mols_to_screen))
    # The directory listing does not depend on the molecule: read it once.
    # list() keeps it reusable even if the helper returns an iterator.
    list_of_conformers_files = list(read_mol2_file(conformers_dir))

    scores = []
    # The context manager closes the writer even if screening raises.
    with MoleculeWriter(output_name) as molwriter:
        for mol in screen_set:
            mol_id = mol.identifier
            for conformers_file in list_of_conformers_files:
                # NOTE(review): prefix match -- an identifier that is a
                # prefix of another one also picks up that one's files;
                # confirm against the conformer file naming scheme.
                if not conformers_file.startswith(mol_id):
                    continue
                print(conformers_file)
                conformers_file_path = os.path.join(conformers_dir,
                                                    conformers_file)
                print(conformers_file_path)
                # one inner list = the conformers of one molecule
                conformers = [list(MoleculeReader(conformers_file_path))]
                print(type(conformers))
                print("yeah!!!!!! start screening")
                res = screener.screen(conformers)  # Screening step
                scores.extend(
                    [(r.score, activity, r.identifier) for r in res])
                # Write results
                for r in res:
                    molwriter.write(r.molecule)
    return sorted(scores)
def docks_to_ref_rmsd(self):
    """
    Heavy-atom RMSD of every docked pose to the prepared reference ligand.

    Only calculate for complete docking results!

    :return: list of RMSD values, one per ligand in ``self.docking_result``
    """
    poses = [ligand.molecule for ligand in self.docking_result.ligands]
    ref_lig = MoleculeReader(self.prepared_ligand_path)[0]

    rmsds = []
    for pose in poses:
        rmsd = MolecularDescriptors.rmsd(
            ref_lig,
            pose,
            exclude_hydrogens=True,
            atoms=self.match_heavy_atoms(ref_lig, pose))
        rmsds.append(rmsd)
    return rmsds
def test_docking_fitting_pts(self):
    """Call ``_docking_fitting_pts`` on the 2vta result with the LZ1 ligand."""
    with PushDir("testdata/2vta"):
        # read hotspot maps
        with HotspotReader(path="out.zip") as reader:
            self.result = reader.read()

        # keep the molecules whose underscore-separated identifier has "LZ1"
        lz1_mols = []
        for candidate in MoleculeReader("crystal_ligand.sdf"):
            if "LZ1" in candidate.identifier.split("_"):
                lz1_mols.append(candidate)
        mol = lz1_mols[0]
        print(mol.identifier)

        m = self.result._docking_fitting_pts(mol)
def make_substructure_molecule(template_mol_path, query_mol_path):
    """
    Build the maximum common substructure (MCS) of the template and query
    molecules as a mol block.

    :param template_mol_path: path to the prepared template molecule (starting fragment)
    :param query_mol_path: path to the prepared query molecule (suggested followup)
    :return: string representation of the MCS with 3D coordinates; None when
        no MCS SMARTS is found or the side-chain replacement yields nothing
    """
    template_mol = rdkitize_ccdc_mol(MoleculeReader(template_mol_path)[0])
    query_mol = rdkitize_ccdc_mol(MoleculeReader(query_mol_path)[0])
    print(query_mol)

    # find the maximum common substructure
    mcs = rdFMCS.FindMCS([template_mol, query_mol],
                         threshold=0.9,
                         completeRingsOnly=True)
    if not mcs.smartsString:
        return None

    patt = Chem.MolFromSmarts(mcs.smartsString, mergeHs=True)

    # keep only the core of the reference molecule
    ref = AllChem.ReplaceSidechains(template_mol, patt)
    if not ref:
        return None

    core = AllChem.DeleteSubstructs(ref, Chem.MolFromSmiles('*'))
    core.UpdatePropertyCache()
    try:
        return Chem.MolToMolBlock(core)
    except Exception as e:
        # Fallback: embed the MCS pattern itself, pinned to the coordinates
        # of the matching template atoms, and align it onto the template.
        t_match = template_mol.GetSubstructMatch(patt)
        print(e)
        sanitize_ops = (Chem.SanitizeFlags.SANITIZE_ALL
                        ^ Chem.SanitizeFlags.SANITIZE_KEKULIZE)
        Chem.SanitizeMol(patt, sanitizeOps=sanitize_ops)
        cmap = {
            patt_idx: template_mol.GetConformer().GetAtomPosition(tmpl_idx)
            for patt_idx, tmpl_idx in enumerate(t_match)
        }
        AllChem.EmbedMolecule(patt,
                              randomSeed=0xf00d,
                              coordMap=cmap,
                              maxAttempts=1000)
        AllChem.UFFOptimizeMolecule(patt)
        AllChem.AlignMol(patt, template_mol,
                         atomMap=list(enumerate(t_match)))
        return Chem.MolToMolBlock(patt)
def get_largest_binding_site(self):
    """
    Returns the binding site created within 6.5A of the largest ligand.

    The ligand is read from ``self.lig_dir`` and the protein from the file
    of the same name, with a ``.pdb`` extension, in ``self.pdb_dir``.

    :return: a ``Protein.BindingSiteFromMolecule`` instance
    """
    from os.path import splitext

    lig_fname = self.find_largest_ligand()
    lig = MoleculeReader(join(self.lig_dir, lig_fname))[0]

    # Swap only the extension: str.replace("sdf", "pdb") would also rewrite
    # an "sdf" inside the name itself (e.g. "1sdf.sdf" -> "1pdb.pdb").
    pdb_fname = splitext(lig_fname)[0] + ".pdb"
    prot = Protein.from_file(join(self.pdb_dir, pdb_fname))

    bs = Protein.BindingSiteFromMolecule(protein=prot,
                                         molecule=lig,
                                         distance=6.5)
    return bs
def generate_fake(self, buriedness=False, weighted=False, superstar=True):
    """
    create a small set of grids for testing

    The grids are sized around the test ligand ``testdata/6y2g_A/A_mol.mol2``
    and filled with randomly placed spheres, one grid per interaction type
    (apolar, donor, acceptor).

    :param buriedness: if True, include a buriedness grid built from the ligand
    :param weighted: if True, include fake weighted superstar grids
    :param superstar: if True, include fake superstar grids
    :return: a ``Results`` instance holding the fake grids and the protein
    """

    def populate_grid(template, num_spheres, radius=1, value=8,
                      scaling='linear'):
        # Empty copy of the template with random spheres; the range starts
        # at 1, so num_spheres - 1 spheres are placed.
        h = template.copy_and_clear()
        for _ in range(1, num_spheres):
            x, y, z = [
                np.random.randint(low=2, high=ax - 2, size=1)
                for ax in h.nsteps
            ]
            h.set_sphere(point=h.indices_to_point(x, y, z),
                         radius=radius,
                         value=value,
                         scaling=scaling)
        return h

    protein = Protein.from_file("testdata/6y2g_A/binding_site.pdb")
    mol = MoleculeReader("testdata/6y2g_A/A_mol.mol2")[0]
    g = Grid.initalise_grid([a.coordinates for a in mol.atoms])

    buriedness_grid = Grid.from_molecule(mol) if buriedness else None

    interactions = ["apolar", "donor", "acceptor"]

    def fake_grids():
        # one freshly populated grid per interaction type
        return {
            p: populate_grid(template=g, num_spheres=3) for p in interactions
        }

    super_grids = fake_grids()
    superstar_grids = fake_grids() if superstar else None
    weighted_superstar_grids = fake_grids() if weighted else None

    return Results(super_grids=super_grids,
                   protein=protein,
                   buriedness=buriedness_grid,
                   superstar=superstar_grids,
                   weighted_superstar=weighted_superstar_grids)
def main():
    """
    Download the reference ligand for every PDB directory under the base
    directory and write it to ``<base>/<pdb>/<pdb>_ref.mol2``.

    The het id passed to ``ftp_download`` is the identifier of the first
    molecule in ``<base>/<pdb>/<pdb>_ligand.mol2``.
    """
    base = "/local/pcurran/leads_frag"

    # only sub-directories are treated as PDB entries
    pdbs = list(
        filter(lambda name: os.path.isdir(os.path.join(base, name)),
               os.listdir(base)))

    for pdb in pdbs:
        pdb_dir = os.path.join(base, pdb)
        ligand = MoleculeReader(os.path.join(pdb_dir, f"{pdb}_ligand.mol2"))[0]
        hetid = ligand.identifier

        mol = ftp_download(pdb, hetid)

        with MoleculeWriter(os.path.join(pdb_dir, f"{pdb}_ref.mol2")) as w:
            w.write(mol)
def ligand_map_search(base, t, num):
    """
    Search a feature database with one ligand pharmacophore and write the
    hits, the timing report and the enrichment statistics.

    :param base: root directory holding one sub-directory per target
    :param t: target name (sub-directory of ``base``, also the stem of the
        ``.feat`` file)
    :param num: pharmacophore number; the query is
        ``ligand_pharmacophores/<num>.cm`` and the outputs go to
        ``ligand_pharmacophores/<num>/``
    """
    target_dir = os.path.join(base, t)
    timer = Timer()

    with timer(tag="screen"):
        # inputs
        feature_db_file = os.path.join(target_dir, f"structure_db/{t}.feat")
        query_file = os.path.join(target_dir,
                                  f"ligand_pharmacophores/{num}.cm")
        actives = os.path.join(target_dir, "actives_final.mol2")
        decoys = os.path.join(target_dir, "decoys_final.mol2")

        # outputs
        output_dir = check_dir(
            os.path.join(target_dir, f"ligand_pharmacophores/{num}"))
        hit_dir = check_dir(os.path.join(output_dir, "hit_list"))
        # NOTE(review): score_dir is not used again here; check_dir is
        # still called for it -- confirm whether the directory is needed.
        score_dir = check_dir(os.path.join(output_dir, "raw_scores"))
        time_file = os.path.join(output_dir, "time.txt")

        feat_db = Pharmacophore.FeatureDatabase.from_file(feature_db_file)
        query = Pharmacophore.Query.from_file(query_file)

        # number of molecules in each set
        totals = {
            "actives": len(MoleculeReader(actives)),
            "decoys": len(MoleculeReader(decoys)),
        }

        hits = search(feat_db, query)

    with open(time_file, "w") as report_file:
        timer.report(report_file)

    hits.write(hit_dir)

    estats = rank_hits(hits.hits, "rmsd", totals, num, t)
    estats.to_csv(os.path.join(output_dir, "enrichment_stats.csv"))
def test_func(self, unp_id):
    """
    Screen the actives and decoys of one target against its overlay query
    and write the ranked scores.

    The query is read from ``<query>/<unp_id>.sdf``. Screened molecules go to
    ``<unp_id>_actives_screened.mol2`` and ``<unp_id>_decoys_screened.mol2``
    and the sorted scores to ``<unp_id>_screening_scores.csv``, all inside
    ``<output_directory>/<unp_id>``. That directory is created here (it must
    not exist yet) and becomes the working directory.

    :param unp_id: target identifier used in all input and output file names
    :return: None
    """
    # Parse arguments
    overlay_dir = self.args.query
    nc = self.args.nconfs

    # os.chdir() below would re-root relative paths, so every path that is
    # used after it is made absolute here.
    actives_to_screen = os.path.abspath(
        os.path.join(self.args.actives,
                     '{}_active_3d_rdkit.sdf'.format(unp_id)))
    print(actives_to_screen)
    decoys_to_screen = os.path.abspath(
        os.path.join(self.args.decoys,
                     '{}_decoy_3d_rdkit.sdf'.format(unp_id)))
    print(decoys_to_screen)
    complete_output_dir = os.path.abspath(
        os.path.join(self.args.output_directory, '{}'.format(unp_id)))

    overlay_mol = os.path.join(overlay_dir, '{}.sdf'.format(unp_id))
    print(overlay_mol)

    # Read the query mol or overlay of mols
    query = list(MoleculeReader(overlay_mol))

    print("start screening")
    settings = setup_screener()
    os.makedirs(complete_output_dir)
    os.chdir(complete_output_dir)

    # Generate fields around the input query
    screener = Screener(query, settings=settings)

    output_name_actives = os.path.join(
        complete_output_dir, "{}_actives_screened.mol2".format(unp_id))
    print(output_name_actives)
    output_name_decoys = os.path.join(
        complete_output_dir, "{}_decoys_screened.mol2".format(unp_id))
    print(output_name_decoys)

    # Screen set of actives (activity flag 1), then decoys (flag 0)
    actives_scores = screen_molecules(screener, actives_to_screen, 1, nc,
                                      output_name_actives)
    decoys_scores = screen_molecules(screener, decoys_to_screen, 0, nc,
                                     output_name_decoys)

    print("writing scores")
    screening_scores = sorted(list(actives_scores) + list(decoys_scores))
    output_name_scores = os.path.join(
        complete_output_dir, "{}_screening_scores.csv".format(unp_id))
    write_scores(screening_scores, output_name_scores)