def WriteSDFDataSelect(Top_List, Top_sdf, arg_pat, all_top, top_name, dock):
    """Write the top-ranked molecules that match a SMARTS selection.

    Walks ``Top_List`` in order and looks each molecule up in ``Top_sdf``.
    Every molecule found is retitled ``name::rank::score::dock`` and kept if
    it matches at least one of the ``|``-separated SMARTS in ``arg_pat``.
    The scan stops once ``all_top`` molecules have matched or the list is
    exhausted. Matched molecules go to ``<top_name>.smt-selec.sdf`` and their
    name/score pairs to ``<top_name>.smt-selec.txt``.

    Args:
        Top_List: ranked sequence of items whose first two elements are
            (score, name).
        Top_sdf: mapping of molecule name to molecule object.
        arg_pat: one or more SMARTS patterns separated by ``|``.
        all_top: number of matched molecules at which the scan stops.
        top_name: output file prefix.
        dock: label appended as the last field of the molecule title.

    NOTE(review): ``arg_sel``, ``grid``, ``grid_print``, ``Chem`` and ``gc``
    are not defined in this block; presumably module-level names - confirm.
    """
    Others, Matched = [], []
    looked = 0
    m = Chem.SDWriter(top_name + '.smt-selec.sdf')
    SMA = open(top_name + '.smt-selec.txt', 'w')
    try:
        ## Compile every SMARTS once instead of once per molecule
        queries = [Chem.MolFromSmarts(smarts) for smarts in arg_pat.split('|')]

        ## Use the Ranked list to rebuild a consolidated SDF
        for idx, Item in enumerate(Top_List):
            looked = idx + 1
            score, name = Item[0], Item[1]

            ## If mol_name has conformer number appended on it, remove _NUM
            name = name.split('_')[0]

            mol = Top_sdf.get(name)
            if not mol:
                print(' --> Molecule not found: {0} <--'.format(name))
                continue

            ## Rename mol name property to include data (ZINC, Rank, Score, Software)
            mol.SetProp(
                '_Name',
                '{0}::{1}::{2:.1f}::{3}'.format(name, idx + 1, float(score), dock))

            ## A single matching pattern is enough to select the molecule
            if any(mol.HasSubstructMatch(query) for query in queries):
                Matched.append(mol)
                SMA.write('{0}\t{1}\n'.format(name, score))
            else:
                Others.append(mol)

            ## Stop when reached the Max. output number
            if len(Matched) == all_top:
                break

        ## Finalise after the loop so the output is also written when the
        ## last ranked entry is missing or the ranked list is empty
        for mol in Matched:
            m.write(mol)

        print("\n ## Total Molecule Looked Thru: " + str(looked))
        print(' ## Molecule Not Matched: ' + str(len(Others)))
        print(' ## Molecule Matched {0}: {1}'.format(arg_sel, len(Matched)))
    finally:
        ## Release both handles even if a lookup or write fails
        SMA.close()
        m.close()
    gc.collect()

    if grid is True:
        grid_print(top_name, Matched, 'sdf')
def main(list_name, Chemicals, option):
    """Extract the ligands named in a list file from SDF input and write them out.

    The first whitespace-separated token of each line returned by
    ``remove_remark(file_handle(list_name))`` is taken as a ligand ID. The
    molecules from ``rdkit_open(Chemicals)`` are indexed by name; a title of
    the form ``name::rank::score::x`` is split to recover rank and score,
    otherwise rank 0 and score 0.0 are used. Selected molecules are written
    to ``<list_name without '.txt'>.sdf`` and passed to ``grid_print``.

    Args:
        list_name: path of the ligand ID list.
        Chemicals: structure input handed to ``rdkit_open``.
        option: ``'name'``, ``'rank'`` or ``'score'`` to sort the output;
            ``None`` (or any other value) keeps the list order.
    """
    ## Read in the list of selected ligand ID; skip lines with no token
    List = [
        line.split()[0]
        for line in remove_remark(file_handle(list_name))
        if line.split()
    ]
    print(len(List))

    ## Index the supplied SDF by ligand name
    sdf = dict()
    for m in rdkit_open(Chemicals):
        title = m.GetProp('_Name')
        if '::' in title:
            name, rank, score, _tag = title.split('::')
            sdf[name] = [m, name, rank, score]
        else:
            sdf[title] = [m, title, 0, 0.0]

    ## Report IDs that are absent from the SDF instead of dying on KeyError
    Molecules = []
    for chem in List:
        if chem is None:
            continue
        if chem not in sdf:
            print(' --> Molecule not found: {0} <--'.format(chem))
            continue
        Molecules.append(sdf[chem])

    ## Sort data, if needed
    if option is not None:
        if option == 'name':
            Molecules.sort(key=lambda tup: tup[1])
        elif option == 'rank':
            Molecules.sort(key=lambda tup: int(tup[2]))
        elif option == 'score':
            Molecules.sort(key=lambda tup: float(tup[3]))

    Mols = [mol[0] for mol in Molecules]
    prefix = list_name.split('.txt')[0]

    out = Chem.SDWriter(prefix + '.sdf')
    try:
        for molecule in Mols:
            out.write(molecule)
        out.flush()
    finally:
        ## Close the writer even if a write fails
        out.close()

    grid_print(prefix, Mols, 'sdf')
def WriteSDFDataExclude(Top_List, Top_sdf, arg_pat, all_top, top_name, dock):
    """Write the top-ranked molecules that do NOT match a SMARTS filter.

    Walks ``Top_List`` in order and looks each molecule up in ``Top_sdf``.
    Every molecule found is retitled ``name::rank::score::dock``. A molecule
    matching any of the ``|``-separated SMARTS in ``arg_pat`` is set aside;
    the others are selected until ``all_top`` of them are collected or the
    list is exhausted. Selected molecules go to ``<top_name>.smt-clean.sdf``
    and their name/score pairs to ``<top_name>.smt-clean.txt``.

    Args:
        Top_List: ranked sequence of items whose first two elements are
            (score, name).
        Top_sdf: mapping of molecule name to molecule object.
        arg_pat: one or more SMARTS patterns separated by ``|``.
        all_top: number of selected molecules at which the scan stops.
        top_name: output file prefix.
        dock: label appended as the last field of the molecule title.

    NOTE(review): ``arg_exc``, ``grid``, ``grid_print``, ``Chem`` and ``gc``
    are not defined in this block; presumably module-level names - confirm.
    """
    Select, Exclude = [], []
    looked = 0
    w = Chem.SDWriter(top_name + '.smt-clean.sdf')
    OUT = open(top_name + '.smt-clean.txt', 'w')
    try:
        ## Compile every SMARTS once instead of once per molecule
        queries = [Chem.MolFromSmarts(smarts) for smarts in arg_pat.split('|')]

        ## Use the Ranked list to rebuild a consolidated SDF
        for idx, Item in enumerate(Top_List):
            looked = idx + 1
            score, name = Item[0], Item[1]

            ## If mol_name has conformer number appended on it, remove _NUM
            name = name.split('_')[0]

            mol = Top_sdf.get(name)
            if not mol:
                print(' --> Molecule not found: {0} <--'.format(name))
                continue

            ## Rename mol name property to include data (ZINC, Rank, Score, Software)
            mol.SetProp(
                '_Name',
                '{0}::{1}::{2:.1f}::{3}'.format(name, idx + 1, float(score), dock))

            ## Record an excluded molecule once, however many patterns it hits
            if any(mol.HasSubstructMatch(query) for query in queries):
                Exclude.append(mol)
                continue

            OUT.write(name + "\t" + str(score) + "\n")
            Select.append(mol)

            ## Stop when reached the Max. output number
            if len(Select) == all_top:
                break

        ## Finalise after the loop so the SDF is also written when fewer
        ## than all_top molecules survive the filter
        for mol in Select:
            w.write(mol)

        print("\n ## Total Molecule Looked Thru: " + str(looked))
        print(' ## Molecule Selected: ' + str(len(Select)))
        print(' ## Molecule Matched {0}: {1}'.format(arg_exc, len(Exclude)))
        w.flush()
    finally:
        ## Release both handles even if a lookup or write fails
        OUT.close()
        w.close()
    gc.collect()

    if grid is True:
        grid_print(top_name, Select, 'sdf')
def WriteSDFData(Top_List, Top_sdf, all_top, top_name, dock):
    """Write the top-ranked molecules to a consolidated SDF and score list.

    Walks ``Top_List`` in order, looks each name up in ``Top_sdf`` and
    retitles the molecule ``name::rank::score::dock``. Collection stops once
    ``all_top`` molecules are gathered or the list is exhausted. Molecules
    go to ``<top_name>.sdf`` and name/score pairs to ``<top_name>.txt``.

    Args:
        Top_List: ranked sequence of items whose first two elements are
            (score, name).
        Top_sdf: mapping of molecule name to molecule object.
        all_top: number of molecules at which the scan stops.
        top_name: output file prefix.
        dock: label appended as the last field of the molecule title.

    NOTE(review): ``grid``, ``grid_print``, ``Chem`` and ``gc`` are not
    defined in this block; presumably module-level names - confirm.
    """
    Select = []
    looked = 0
    w = Chem.SDWriter(top_name + '.sdf')
    OUT = open(top_name + '.txt', 'w')
    try:
        ## Use the Ranked list to rebuild a consolidated SDF
        for idx, Item in enumerate(Top_List):
            looked = idx + 1
            score, name = Item[0], Item[1]

            ## The name is used as given; no '_NUM' conformer suffix is stripped
            mol = Top_sdf.get(name)
            if not mol:
                print(' --> Molecule not found: {0} <--'.format(name))
                continue

            ## Rename mol name property to include data (ZINC, Rank, Score, Software)
            mol.SetProp(
                '_Name',
                '{0}::{1}::{2:.1f}::{3}'.format(name, idx + 1, float(score), dock))
            OUT.write('{0}\t{1}\n'.format(name, score))
            Select.append(mol)

            ## Stop when reached the Max. output number
            if len(Select) == all_top:
                break

        ## Finalise after the loop so the SDF is also written when the
        ## ranked list runs out before all_top molecules are collected
        for mol in Select:
            w.write(mol)

        print("\n ## Total Molecule Looked Thru: " + str(looked))
        print(' ## Total Molecule Output: ' + str(len(Select)))
        w.flush()
    finally:
        ## Release both handles even if a lookup or write fails
        OUT.close()
        w.close()
    gc.collect()

    if grid is True:
        grid_print(top_name, Select, 'sdf')
def main(list_name, Chemicals, option):
    """Extract the ligands named in a list file from SDF input and write them out.

    The first column of the whitespace-delimited ``list_name`` (lines
    starting with ``#`` are comments) holds the ligand IDs. The molecules
    from ``rdkit_open(Chemicals)`` are indexed by name; a title of the form
    ``name::rank::score::x`` is split to recover rank and score, otherwise
    rank 0 and score 0.0 are used. Selected molecules are written to
    ``<list_name without '.txt'>.sdf`` and passed to ``grid_print``.

    Args:
        list_name: path of the ligand ID list.
        Chemicals: structure input handed to ``rdkit_open``.
        option: ``'name'``, ``'rank'`` or ``'score'`` to sort on the parsed
            title fields; any other non-None value is used as an SDF tag
            whose value is converted to float for sorting; ``None`` keeps
            the list order.
    """
    ## Read in the list of selected ligand ID. Raw-string regex avoids the
    ## invalid '\s' escape; dtype=str keeps numeric-looking IDs as text so
    ## they can match the string keys built from the SDF titles.
    df = pd.read_csv(list_name, delimiter=r'\s+', header=None, comment='#',
                     dtype=str).dropna()
    List = df.loc[:, 0].to_numpy()
    print('\n > Number of items in <{}>: {}\n'.format(list_name, len(List)))

    ## Index the supplied SDF by ligand name
    print(' > List of structure file(s) read: \n', Chemicals)
    sdf = dict()
    for m in rdkit_open(Chemicals):
        title = m.GetProp('_Name')
        if '::' in title:
            name, rank, score, _tag = title.split('::')
            sdf[name] = [m, name, rank, score]
        else:
            sdf[title] = [m, title, 0, 0.0]

    ## Report IDs that are absent from the SDF instead of dying on KeyError
    Molecules = []
    for chem in List:
        if chem is None:
            continue
        if chem not in sdf:
            print(' --> Molecule not found: {0} <--'.format(chem))
            continue
        Molecules.append(sdf[chem])

    ## Sort data, if needed
    if option is not None:
        if option == 'name':
            Molecules.sort(key=lambda tup: tup[1])
        elif option == 'rank':
            Molecules.sort(key=lambda tup: int(tup[2]))
        elif option == 'score':
            Molecules.sort(key=lambda tup: float(tup[3]))
        else:
            print(
                ' ## Using SDF tag to sort ligand order: \033[31m{0}\033[0m\n'.
                format(option))
            Molecules.sort(key=lambda tup: float(tup[0].GetProp(option)))

    Mols = [mol[0] for mol in Molecules]
    prefix = list_name.split('.txt')[0]

    out = Chem.SDWriter(prefix + '.sdf')
    try:
        for molecule in Mols:
            out.write(molecule)
        out.flush()
    finally:
        ## Close the writer even if a write fails
        out.close()

    grid_print(prefix, Mols, 'sdf')
def main(filename):
    """Read one SDF or SMILES file and hand its molecules to ``grid_print``.

    ``filename`` is expanded with ``glob`` and the first match is used.
    A name containing ``.sdf`` is read through ``file_handle`` and
    ``Chem.ForwardSDMolSupplier``. A name containing ``.smi`` is read with
    ``Chem.SmilesMolSupplier`` unless it ends in ``.bz2`` or ``.gz``, in
    which case only a notice is printed. The first line of a SMILES file is
    treated as a title line when it contains "smiles" (case-insensitive).

    Args:
        filename: path or glob pattern of the structure file.

    Raises:
        IndexError: if no file matches ``filename``.
    """
    matches = glob.glob(filename)
    if not matches:
        ## Same exception type as the bare [0] lookup, but with a message
        raise IndexError('No file matches: {0}'.format(filename))
    mol_file = matches[0]
    print('\n > File read: {}\n'.format(mol_file))

    ## Dots are escaped so only a literal '.sdf' / '.smi' extension matches
    if re.search(r'\.sdf', mol_file):
        handle = file_handle(mol_file)
        Mol = [
            x for x in Chem.ForwardSDMolSupplier(handle, removeHs=True)
            if x is not None
        ]
        grid_print(mol_file.split('.sdf')[0], Mol, 'sdf')

    if re.search(r'\.smi', mol_file):
        if re.search(r'\.bz2$|\.gz$', mol_file):
            print(
                '\n ## INFO: RDKit cannot take SMILES in zipped format, only ASCII\n'
            )
        else:
            ## Peek at the first line to decide whether it is a header
            with open(mol_file, 'r') as fi:
                first_line = fi.readline()
            has_title = bool(re.search(r'smiles', first_line, re.IGNORECASE))

            Mol = [
                x for x in Chem.SmilesMolSupplier(
                    mol_file, titleLine=has_title, delimiter=' |\t|,')
                if x is not None
            ]
            grid_print(mol_file.split('.smi')[0], Mol, 'smi')
def GenClustTable(Mol_List, output_name, column=5):
    """Draw every clustered molecule as SVG and emit a formatted grid table.

    For each cluster (1-based index) every molecule is tagged with
    ``Cluster`` and ``SMILES`` properties, drawn in 2D to
    ``_TEMP.<Name>.svg`` and recorded as
    ``[img_link, Name, Rank, Score, Type]``. The per-cluster lists are then
    passed to ``grid_print`` in ``'formatted'`` mode.

    Args:
        Mol_List: sequence of clusters, each a sequence of molecules that
            carry ``Name``, ``Rank``, ``Score`` and ``Type`` properties.
        output_name: output prefix handed to ``grid_print``.
        column: column count forwarded to ``grid_print`` (default 5).
    """
    ## Drawing option is loop-invariant; set it once
    DrawingOptions.atomLabelFontSize = 18

    Img_Data = []
    for idx, Mols in enumerate(Mol_List):
        Img = []
        for mol in Mols:
            # Get molecule info
            m1 = mol.GetProp('Name')
            m2 = mol.GetProp('Rank')
            m3 = mol.GetProp('Score')
            m4 = mol.GetProp('Type')

            # Tag the input molecule with its cluster number and SMILES
            mol.SetProp('Cluster', str(idx + 1))
            mol.SetProp('SMILES', Chem.MolToSmiles(mol, isomericSmiles=True))
            AssignStereochemistryFrom3D(mol)

            # Draw a hydrogen-free 2D depiction instead of the 3D structure
            svg_name = '_TEMP.' + m1 + '.svg'
            drawn = rdMolDraw2D.PrepareMolForDrawing(mol)
            drawn = Chem.RemoveHs(drawn)
            AllChem.Compute2DCoords(drawn)
            Draw.MolToFile(drawn, svg_name, size=(225, 225))

            # Img = (image_link, Name, Rank, Score, Type)
            img_link = '<img src="' + svg_name + '">'
            Img.append([img_link, m1, m2, m3, m4])
        Img_Data.append(Img)

    ## Honour the caller's column count instead of a hard-coded 5
    grid_print(output_name, Img_Data, 'formatted', column=column)
def main():
    """Pick library compounds similar to the query compounds.

    Command-line arguments, in order: comma-separated query files,
    comma-separated library files, output prefix, similarity cutoff (float)
    and fingerprint choice. The selection returned by ``pick_similar_cmpd``
    is sent to ``grid_print``; the saved (name, mol) pairs are de-duplicated
    by name, keeping the last occurrence, and written to
    ``<out_pref>.smi``.
    """
    argv = sys.argv
    query_files = argv[1].split(',')
    library_files = argv[2].split(',')
    out_pref = argv[3]
    cutoff = float(argv[4])
    fp_choice = argv[5]

    ## Load both molecule sets before computing any fingerprint
    query_mols = rdkit_open(query_files)
    library_mols = rdkit_open(library_files)

    query_fp = calculate_FP(query_mols, fp_choice)
    library_fp = calculate_FP(library_mols, fp_choice)

    Selection, Save = pick_similar_cmpd(query_fp, library_fp, cutoff, fp_choice)
    grid_print(out_pref, Selection, 'formatted')

    ## Keep one entry per name (the last one seen) and write it as SMILES
    saved = pd.DataFrame(Save, columns=['name', 'mol'])
    unique = saved.drop_duplicates(subset='name', keep='last')

    fs = Chem.SmilesWriter(out_pref + '.smi')
    for mol in unique['mol']:
        fs.write(mol)
    fs.close()
m = Cluster[0] i = m.GetProp("_Name").split()[0] a = 'TEMP.' + i + '.svg' l = '<img src="' + a + '">' h = Chem.RemoveHs(m) mol = h rdMolDraw2D.PrepareMolForDrawing(mol) AllChem.Compute2DCoords(mol) Draw.MolToFile(m, a, size=(200, 200)) Single_list.append([l, i, "-", "-", '-']) #(Mol, Name, Rank, Score) else: clust_list = [] for m in Cluster: i = m.GetProp("_Name").split()[0] a = 'TEMP.' + i + '.svg' l = '<img src="' + a + '">' h = Chem.RemoveHs(m) mol = h rdMolDraw2D.PrepareMolForDrawing(mol) AllChem.Compute2DCoords(mol) Draw.MolToFile(m, a, size=(200, 200)) ## [mol_data, Name, Rank, Score, Type] clust_list.append([l, i, "-", '-', '-']) Multi_list.append(clust_list) Multi_list.append(Single_list) grid_print(sys.argv[1], Multi_list, 'formatted')
def make_sdf(SDF_Names, All_Data, all_top, dock, prefix):
    """Build a consolidated SDF of the top-ranked molecules.

    Takes up to ``2 * all_top`` leading entries of ``All_Data`` as the
    candidate list, collects the matching molecules from every file in
    ``SDF_Names`` and writes the first ``all_top`` that were found to
    ``<top_name>.sdf`` with name/score pairs in ``<top_name>.txt``.
    ``top_name`` is ``<prefix>.<dock>_top<N>``, with ``N`` given in
    thousands followed by ``k`` when ``all_top >= 1000``. Each written
    molecule is retitled ``name::rank::score::dock`` with any ``_NUM``
    suffix removed from the name.

    Args:
        SDF_Names: iterable of SDF file names, opened via ``file_handle``.
        All_Data: ranked sequence of items whose first two elements are
            (score, name).
        all_top: number of molecules to output.
        dock: label used in the file name and the molecule title.
        prefix: output file prefix.

    NOTE(review): ``grid``, ``grid_print``, ``file_handle``, ``RenameSDF``,
    ``re`` and ``gc`` are not defined in this block; presumably module-level
    names - confirm.
    """
    from rdkit import Chem

    ## Build a Top-Selection list, with a 2x head-room for failed molecules
    print(" ## User-defined output total: " + str(all_top))
    Top_Hash = {}
    Top_List = []  # [(Score, Name), ...]
    for rank, List in enumerate(All_Data):
        Top_Hash[List[1]] = List[0]
        Top_List.append(List)
        if rank == (all_top * 2) - 1:
            break

    ## Build a library of molecules found in the Top-Selection List
    Top_sdf = {}
    for sdf_file in SDF_Names:
        print(" # Reading SDF file: " + sdf_file)
        sdf_handle = file_handle(sdf_file)
        Temp_sdf = [
            x for x in Chem.ForwardSDMolSupplier(sdf_handle, removeHs=False)
            if x is not None
        ]
        print(" # SDF mol read in from > " + sdf_file + " <: " +
              str(len(Temp_sdf)))

        ## Rename ligand name if previously processed with '::' tag;
        ## guard against a file that produced no readable molecule
        if Temp_sdf and re.search(r'::', Temp_sdf[0].GetProp('_Name')):
            print(' # Remove "::" tag from ligand name #')
            Temp_sdf = RenameSDF(Temp_sdf)

        prev_name = ''
        for idx, mol in enumerate(Temp_sdf):
            if idx % 10000 == 0:
                print(" Mol compared {0}".format(idx))

            ## RDKit may not handle the molecules and make a 'NoneType' item
            ## 'Could not sanitize molecule ending'. Ignore this molecule
            try:
                name = mol.GetProp('_Name')
            except AttributeError:
                print("A molecule failed after this molecule ID: " + prev_name)
                continue
            prev_name = name

            ## Membership test, so an entry whose score is 0 is not dropped
            key = name.strip()
            if key in Top_Hash:
                Top_sdf[key] = mol
        del Temp_sdf  # Free memory

    ## Floor division keeps e.g. 'top1k' on both Python 2 and Python 3
    if all_top >= 1000:
        top_name = prefix + '.' + dock + '_top' + str(all_top // 1000) + 'k'
    else:
        top_name = prefix + '.' + dock + '_top' + str(all_top)

    ## Use the Ranked list to rebuild a consolidated SDF
    SDF = []
    w = Chem.SDWriter(top_name + '.sdf')
    OUT = open(top_name + '.txt', 'w')
    try:
        for idx, Item in enumerate(Top_List):
            score = Item[0]
            name = Item[1]

            mol = Top_sdf.get(name)
            if not mol:
                print(" --> Molecule {0} is not found <--".format(name))
                continue

            ## If the FRED mol_name has conformer number appended on it, remove _NUM
            name = name.split('_')[0]

            ## (ZINC, Rank, Score, Software)
            mol.SetProp(
                '_Name', name + '::' + str(idx + 1) + '::' +
                str("%.1f" % float(score)) + '::' + dock)
            w.write(mol)
            OUT.write(name + "\t" + str(score) + "\n")
            SDF.append(mol)

            ## Count written molecules, not list positions, so the head-room
            ## entries actually replace molecules that were not found
            if len(SDF) == all_top:
                break

        print("\n ## Total Molecule Ouptut: " + str(len(SDF)))
        w.flush()
    finally:
        ## Always release the handles, including when the list runs out early
        OUT.close()
        w.close()
    gc.collect()

    ## The output prefix computed in this function is top_name
    if grid is True:
        grid_print(top_name, SDF, 'sdf')