def _structures_frame(smiles, temps, iterations):
    """Assemble the results table with its three named columns.

    Handles the empty case explicitly: transposing three empty lists yields
    a frame with zero columns, which cannot be relabelled with three names.
    """
    columns = ['smiles', 'temperature', 'iteration']
    if not smiles:
        return pd.DataFrame(columns=columns)
    df = pd.DataFrame([smiles, temps, iterations]).T
    df.columns = columns
    return df


def generate_structures(vae, smi, char_to_index, limit=1e4, write=False):
    """
    Sample unique, embeddable structures from a decoder until it stalls.

    Each iteration draws a random temperature in [0, 2), decodes a candidate
    SMILES with ``decode_smiles`` and keeps it if RDKit can parse it, add
    hydrogens, embed it (ETKDG) and run a UFF optimisation, and if its
    canonical SMILES has not been seen before.

    Parameters
    ----------
    vae, smi, char_to_index
        Passed through unchanged to ``decode_smiles``.
    limit : number
        Sampling stops once more than ``limit`` consecutive iterations
        have failed to produce a new unique structure.
    write : str or bool
        When truthy, the final table is written to ``'<write>.csv'``.

    Returns
    -------
    df : pandas.DataFrame
        Columns ``smiles``, ``temperature`` and ``iteration``; empty (but
        with those columns) when no structure was accepted.
    """
    rdkit_mols, temps, iterations = [], [], []
    seen = set()  # O(1) duplicate check; the list keeps discovery order
    iteration = limit_counter = 0
    while True:
        iteration += 1
        limit_counter += 1
        t = random.random() * 2
        candidate = decode_smiles(vae, smi, char_to_index,
                                  temp=t).split(" ")[0]

        canonical = None
        try:
            sampled = Chem.MolFromSmiles(candidate)
            if sampled is not None:
                cation = Chem.AddHs(sampled)
                Chem.EmbedMolecule(cation, Chem.ETKDG())
                Chem.UFFOptimizeMolecule(cation)
                canonical = Chem.MolToSmiles(Chem.RemoveHs(cation))
        except Exception:
            # Best effort: a candidate RDKit cannot process is simply
            # discarded.  ``Exception`` (not a bare ``except``) keeps
            # Ctrl-C and SystemExit able to stop this otherwise endless loop.
            canonical = None

        if canonical is not None and canonical not in seen:
            seen.add(canonical)
            rdkit_mols.append(canonical)
            temps.append(t)
            iterations.append(iteration)
            limit_counter = 0
            # Progress report: show the table accumulated so far.
            print(_structures_frame(rdkit_mols, temps, iterations))

        if limit_counter > limit:
            break

    df = _structures_frame(rdkit_mols, temps, iterations)
    if write:
        df.to_csv(path_or_buf='{}.csv'.format(write), index=False)
    return df
def get_descriptors(smiles):
    """
    Compute a fixed set of RDKit descriptors for one molecule.

    The SMILES string is parsed, explicit hydrogens are added and a 3-D
    conformer is embedded with ETKDG before any descriptor is evaluated.

    Parameters
    ----------
    smiles : str
        The SMILES string of the chemical of interest

    Returns
    -------
    descriptors : dict
        Descriptor name -> value, in the order listed in the table below.

    Notes:
        Developed with RDKit 2019.03.4, although doc pages listed 2019.03.1
        NOTE(review): the results of MolFromSmiles and EmbedMolecule are not
        checked here; presumably a bad SMILES or a failed embedding surfaces
        as an error from a later RDKit call -- confirm against callers.
    """
    molecule = Chem.AddHs(Chem.MolFromSmiles(smiles))
    Chem.EmbedMolecule(molecule, Chem.ETKDG())

    # (module, ((result key, function name), ...)) evaluated top to bottom.
    # The rdMolDescriptors Calc* variants of the 3-D shape descriptors are
    # reported to give the same numbers as the Descriptors3D ones used here.
    plan = (
        # https://www.rdkit.org/docs/source/rdkit.Chem.Descriptors.html
        (Descriptors, (
            ('molwt', 'ExactMolWt'),
            ('max_abs_partial_charge', 'MaxAbsPartialCharge'),
            ('max_partial_charge', 'MaxPartialCharge'),
            ('min_abs_partial_charge', 'MinAbsPartialCharge'),
            ('min_partial_charge', 'MinPartialCharge'),
            ('num_radical_electrons', 'NumRadicalElectrons'),
            ('num_valence_electrons', 'NumValenceElectrons'),
        )),
        # https://www.rdkit.org/docs/source/rdkit.Chem.Descriptors3D.html
        (Descriptors3D, (
            ('asphericity', 'Asphericity'),
            ('eccentricity', 'Eccentricity'),
            ('inertial_shape_factor', 'InertialShapeFactor'),
            ('radius_of_gyration', 'RadiusOfGyration'),
            ('spherocity_index', 'SpherocityIndex'),
        )),
        # https://www.rdkit.org/docs/source/rdkit.Chem.GraphDescriptors.html
        (GraphDescriptors, (
            ('balaban_j', 'BalabanJ'),
            ('bertz_ct', 'BertzCT'),
            ('chi0', 'Chi0'),
            ('chi0n', 'Chi0n'),
            ('chi0v', 'Chi0v'),
            ('chi1', 'Chi1'),
            ('chi1n', 'Chi1n'),
            ('chi1v', 'Chi1v'),
            ('chi2n', 'Chi2n'),
            ('chi2v', 'Chi2v'),
            ('chi3n', 'Chi3n'),
            ('chi3v', 'Chi3v'),
            ('chi4n', 'Chi4n'),
            ('chi4v', 'Chi4v'),
            ('hall_kier_alpha', 'HallKierAlpha'),
            ('kappa1', 'Kappa1'),
            ('kappa2', 'Kappa2'),
            ('kappa3', 'Kappa3'),
        )),
        # Predicted properties from Wildman and Crippen
        (Descriptors, (
            ('log_p', 'MolLogP'),
            ('refractivity', 'MolMR'),
        )),
    )

    results = {}
    for module, entries in plan:
        for key, function_name in entries:
            results[key] = getattr(module, function_name)(molecule)
    return results
def _embed_initial_conformers(mol, initial_confs, args, log, coord_Map,
                              templated):
    """Embed ``initial_confs`` conformers, retrying with random coordinates."""
    # The coordinate map is only forwarded for template-driven embedding.
    extra = {'coordMap': coord_Map} if templated else {}

    if args.etkdg:
        ps = Chem.ETKDG()
        ps.randomSeed = args.seed
        if templated:
            ps.coordMap = coord_Map
        ps.ignoreSmoothingFailures = True
        ps.numThreads = 0
        cids = rdDistGeom.EmbedMultipleConfs(mol, initial_confs, params=ps)
    else:
        cids = rdDistGeom.EmbedMultipleConfs(
            mol, initial_confs, ignoreSmoothingFailures=True,
            randomSeed=args.seed, numThreads=0, **extra)

    # Nothing (or a single conformer when more were requested) came back:
    # retry starting from random coordinates.
    if len(cids) == 0 or (len(cids) == 1 and initial_confs != 1):
        log.write("o conformers initially sampled with random coordinates")
        cids = rdDistGeom.EmbedMultipleConfs(
            mol, initial_confs, randomSeed=args.seed, useRandomCoords=True,
            boxSizeMult=10.0, ignoreSmoothingFailures=True,
            numZeroFail=1000, numThreads=0, **extra)

    if args.verbose:
        log.write("o " + str(len(cids)) + " conformers initially sampled")
    return cids


def _get_force_field(mol, conf, args, log):
    """Build the force field named by ``args.ff`` for conformer ``conf``.

    Logs and exits the interpreter for any name other than MMFF or UFF.
    """
    if args.ff == "MMFF":
        return Chem.MMFFGetMoleculeForceField(
            mol, Chem.MMFFGetMoleculeProperties(mol), confId=conf)
    if args.ff == "UFF":
        return Chem.UFFGetMoleculeForceField(mol, confId=conf)
    log.write(' Force field {} not supported!'.format(args.ff))
    sys.exit()


def _minimize_free(mol, conf, args, log):
    """Minimize one conformer without constraints and return its energy."""
    force_field = _get_force_field(mol, conf, args, log)
    force_field.Initialize()
    force_field.Minimize(maxIts=args.opt_steps_RDKit)
    return force_field.CalcEnergy()


def _minimize_onto_template(mol, conf, args, log, coord_Map, alg_Map,
                            mol_template):
    """Minimize one conformer with template distances held, then align it."""
    matched = mol.GetSubstructMatch(mol_template)
    force_field = _get_force_field(mol, conf, args, log)

    # Constrain every pair of matched atoms to the distance between their
    # mapped coordinates.
    for k, idx_i in enumerate(matched):
        for idx_j in matched[k + 1:]:
            d = coord_Map[idx_i].Distance(coord_Map[idx_j])
            force_field.AddDistanceConstraint(idx_i, idx_j, d, d, 10000)
    force_field.Initialize()

    # One minimization plus up to 10 repeats while Minimize() keeps
    # returning a truthy "needs more" value.
    retries = 10
    more = force_field.Minimize()
    while more and retries:
        more = force_field.Minimize()
        retries -= 1
    energy = force_field.CalcEnergy()

    # Rotate the embedded conformation onto the template.
    rdMolAlign.AlignMol(mol, mol_template, prbCid=conf, atomMap=alg_Map,
                        reflect=True, maxIters=100)
    return energy


def _restore_metal_center(mol, args):
    """Replace iodine atoms carrying 2-6 bonds with the ``args.metal`` element."""
    atomic_number = None
    for atom in mol.GetAtoms():
        if atom.GetSymbol() == 'I' and 2 <= len(atom.GetBonds()) <= 6:
            if atomic_number is None:
                # Resolved lazily, only once a placeholder is actually found.
                for el in elementspt:
                    if el.symbol == args.metal:
                        atomic_number = el.number
                if atomic_number is None:
                    raise ValueError(
                        'Unknown metal symbol: {}'.format(args.metal))
            atom.SetAtomicNum(atomic_number)


def _write_rotated_conformer(rdmols, idx, sdwriter, args):
    """Write conformer ``idx`` of the supplier, with the metal restored."""
    # Indexing the supplier yields a new molecule on every access, so the
    # object that is modified must be the very object that is written.
    conformer = rdmols[idx]
    if args.metal_complex == True:  # noqa: E712 - explicit flag comparison kept
        _restore_metal_center(conformer, args)
    sdwriter.write(conformer)


def _filter_rotated_conformers(name, args, log):
    """Drop duplicates from the torsion-driven SD file.

    Reads ``<name>_rdkit<args.output>`` and writes the unique conformers to
    ``<name>_rdkit_rotated<args.output>``.

    Returns a tuple ``(kept indices, energy duplicates, RMS+energy duplicates)``.
    """
    rdmols = Chem.SDMolSupplier(name + '_' + 'rdkit' + args.output,
                                removeHs=False)
    if rdmols is None:
        log.write("Could not open " + name + args.output)
        sys.exit(-1)

    bar = IncrementalBar(
        'o Filtering based on energy and rms after rotation of dihedrals',
        max=len(rdmols))
    sdwriter = Chem.SDWriter(name + '_' + 'rdkit' + '_' + 'rotated'
                             + args.output)
    kept, kept_energy = [], {}
    # The energy counter starts at -1 because the first conformer is always
    # kept and then matches itself in the comparison loop below.
    dup_energy, dup_rms = -1, 0
    try:
        # Index-based on purpose: the supplier is also indexed inside the
        # loop, which would disturb plain iteration over it.
        for i in range(len(rdmols)):
            energy_i = float(rdmols[i].GetProp('Energy'))

            # The first conformer seeds the list of unique structures.
            if i == 0:
                kept.append(i)
                kept_energy[i] = energy_i
                _write_rotated_conformer(rdmols, i, sdwriter, args)

            excluded = False
            for j in kept:
                gap = abs(energy_i - kept_energy[j])  # kcal/mol
                if gap < args.initial_energy_threshold:
                    excluded = True
                    dup_energy += 1
                    break
                if gap < args.energy_threshold:
                    rms = get_conf_RMS(rdmols[i], rdmols[j], -1, -1,
                                       args.heavyonly, args.max_matches_RMSD,
                                       log)
                    if rms < args.rms_threshold:
                        excluded = True
                        dup_rms += 1
                        break

            if not excluded:
                _write_rotated_conformer(rdmols, i, sdwriter, args)
                if i not in kept:
                    kept.append(i)
                    kept_energy[i] = energy_i
            bar.next()
        bar.finish()
    finally:
        sdwriter.close()
    return kept, dup_energy, dup_rms


def summ_search(mol, name, args, log, dup_data, dup_data_idx, coord_Map=None,
                alg_Map=None, mol_template=None):
    """Embed, minimize, de-duplicate and torsion-drive conformers of ``mol``.

    Stages:
      1. Embed ``args.sample`` conformers (or the number returned by
         ``auto_sampling`` when ``args.sample == 'auto'``).
      2. Minimize each conformer with ``args.ff`` (MMFF or UFF).  When any of
         ``coord_Map`` / ``alg_Map`` / ``mol_template`` is given, the
         minimization is distance-constrained and the result is aligned onto
         the template.
      3. Filter duplicates, first by energy alone, then by energy and RMSD.
      4. Pass every surviving conformer to ``genConformer_r`` together with
         the detected torsions; output goes to ``<name>_rdkit<args.output>``.
      5. If torsions were driven, filter that file again into
         ``<name>_rdkit_rotated<args.output>``.

    Parameters
    ----------
    mol : RDKit molecule; sanitized here, hydrogens are added to a copy.
    name : str, used for conformer names and output file names.
    args : options object; the attributes read are ``output``, ``sample``,
        ``auto_sample``, ``nodihedrals``, ``heavyonly``, ``max_torsions``,
        ``etkdg``, ``seed``, ``verbose``, ``ff``, ``opt_steps_RDKit``,
        ``initial_energy_threshold``, ``energy_threshold``,
        ``rms_threshold``, ``max_matches_RMSD``, ``degree``,
        ``metal_complex`` and ``metal``.  ``args.ff`` is overwritten with
        ``"UFF"`` when the molecule contains an element heavier than Kr.
    log : object with a ``write(str)`` method.
    dup_data, dup_data_idx : table (indexed through ``.at``) and row label
        receiving the per-molecule bookkeeping counters.
    coord_Map, alg_Map, mol_template : optional template-embedding inputs.

    Returns
    -------
    int
        ``1`` on success, ``-1`` when the molecule has more rotatable
        torsions than ``args.max_torsions`` (nothing is generated then and
        the post-rotation filter is skipped).

    Notes
    -----
    An unsupported force field name terminates the interpreter through
    ``sys.exit``; the SD writers are closed on every exit path.
    """
    sdwriter = Chem.SDWriter(name + '_' + 'rdkit' + args.output)
    try:
        Chem.SanitizeMol(mol)
        mol = Chem.AddHs(mol)
        mol.SetProp("_Name", name)

        # Number of conformers to embed: explicit, or estimated automatically.
        if args.sample == 'auto':
            initial_confs = int(auto_sampling(args.auto_sample, mol, log))
        else:
            initial_confs = int(args.sample)

        # NOTE(review): in the collapsed source it is unclear whether the
        # 'Molecule' assignment was live or commented out; it is kept live.
        dup_data.at[dup_data_idx, 'Molecule'] = name
        dup_data.at[dup_data_idx, 'RDKIT-Initial-samples'] = initial_confs

        # Explicit comparison kept so non-bool flag values behave as before.
        use_dihedrals = args.nodihedrals == False  # noqa: E712
        if use_dihedrals:
            rotmatches = getDihedralMatches(mol, args.heavyonly, log)
        else:
            rotmatches = []

        if len(rotmatches) > args.max_torsions:
            log.write("x Too many torsions (%d). Skipping %s" %
                      (len(rotmatches), (name + args.output)))
            return -1

        templated = not (coord_Map is None and alg_Map is None
                         and mol_template is None)
        cids = _embed_initial_conformers(mol, initial_confs, args, log,
                                         coord_Map, templated)

        # MMFF is used up to Kr; anything heavier switches the run to UFF.
        if any(atom.GetAtomicNum() > 36 for atom in mol.GetAtoms()):
            args.ff = "UFF"

        if args.verbose:
            log.write("o Optimizing " + str(len(cids))
                      + " initial conformers with " + args.ff)
            if use_dihedrals:
                log.write("o Found " + str(len(rotmatches))
                          + " rotatable torsions")
            else:
                log.write("o Systematic torsion rotation is set to OFF")

        # outmols holds one PropertyMol snapshot of the parent molecule per
        # conformer, so each conformer can carry its own name and energy
        # property into the SD file.
        cenergy, outmols = [], []
        bar = IncrementalBar('o Minimizing', max=len(cids))
        for conf in cids:
            if templated:
                energy = _minimize_onto_template(
                    mol, conf, args, log, coord_Map, alg_Map, mol_template)
            else:
                energy = _minimize_free(mol, conf, args, log)
            cenergy.append(energy)
            outmols.append(PropertyMol.PropertyMol(mol))
            bar.next()
        bar.finish()

        for i, cid in enumerate(cids):
            outmols[cid].SetProp('_Name', name + ' conformer ' + str(i + 1))
            # SetProp stores strings, and the value is read back later with
            # float(GetProp('Energy')), so convert explicitly.
            outmols[cid].SetProp('Energy', str(cenergy[cid]))

        cids = list(range(len(outmols)))
        sortedcids = sorted(cids, key=lambda cid: cenergy[cid])

        log.write("\n\no Filters after intial embedding of "
                  + str(initial_confs) + " conformers")

        # Both counters start at -1: the first conformer is always kept and
        # then matches itself in the comparison loops below.
        selectedcids, selectedcids_initial = [], []
        eng_dup, eng_rms_dup = -1, -1

        # Stage 1: energy-only pre-filter.
        bar = IncrementalBar('o Filtering based on energy (pre-filter)',
                             max=len(sortedcids))
        for i, conf in enumerate(sortedcids):
            excluded_conf = False
            if i == 0:
                selectedcids_initial.append(conf)
            for seenconf in selectedcids_initial:
                E_diff = abs(cenergy[conf] - cenergy[seenconf])  # kcal/mol
                if E_diff < args.initial_energy_threshold:
                    eng_dup += 1
                    excluded_conf = True
                    break
            if not excluded_conf and conf not in selectedcids_initial:
                selectedcids_initial.append(conf)
            bar.next()
        bar.finish()

        if args.verbose:
            log.write("o " + str(eng_dup)
                      + " Duplicates removed pre-energy filter (E < "
                      + str(args.initial_energy_threshold) + " kcal/mol )")
            log.write("o Removing duplicate conformers ( RMSD < "
                      + str(args.rms_threshold) + " and E difference < "
                      + str(args.energy_threshold) + " kcal/mol)")

        # Stage 2: energy + RMSD filter, with all driven torsions first set
        # to the same value so they do not contribute to the RMSD.
        bar = IncrementalBar('o Filtering based on energy and rms',
                             max=len(selectedcids_initial))
        for i, conf in enumerate(selectedcids_initial):
            for m in rotmatches:
                rdMolTransforms.SetDihedralDeg(
                    outmols[conf].GetConformer(conf), *m, 180.0)
            excluded_conf = False
            if i == 0:
                selectedcids.append(conf)
            for seenconf in selectedcids:
                E_diff = abs(cenergy[conf] - cenergy[seenconf])  # kcal/mol
                if E_diff < args.energy_threshold:
                    rms = get_conf_RMS(outmols[conf], outmols[conf], seenconf,
                                       conf, args.heavyonly,
                                       args.max_matches_RMSD, log)
                    if rms < args.rms_threshold:
                        excluded_conf = True
                        eng_rms_dup += 1
                        break
            if not excluded_conf and conf not in selectedcids:
                selectedcids.append(conf)
            bar.next()
        bar.finish()

        if args.verbose:
            log.write("o " + str(eng_rms_dup) + " Duplicates removed (RMSD < "
                      + str(args.rms_threshold) + " / E < "
                      + str(args.energy_threshold)
                      + " kcal/mol) after rotation")
            log.write("o " + str(len(selectedcids))
                      + " unique (ignoring torsions) starting conformers remain")

        dup_data.at[dup_data_idx, 'RDKit-energy-duplicates'] = eng_dup
        dup_data.at[dup_data_idx,
                    'RDKit-RMS-and-energy-duplicates'] = eng_rms_dup
        dup_data.at[dup_data_idx,
                    'RDKIT-Unique-conformers'] = len(selectedcids)

        # Stage 3: exhaustively drive the torsions of the selected conformers.
        n_confs = int(len(selectedcids)
                      * (360 / args.degree)**len(rotmatches))
        if args.verbose and len(rotmatches) != 0:
            log.write("\n\no Systematic generation of " + str(n_confs)
                      + " confomers")
            bar = IncrementalBar(
                'o Generating conformations based on dihedral rotation',
                max=len(selectedcids))
        else:
            bar = IncrementalBar('o Generating conformations',
                                 max=len(selectedcids))

        total = 0
        for conf in selectedcids:
            total += genConformer_r(outmols[conf], conf, 0, rotmatches,
                                    args.degree, sdwriter, args,
                                    outmols[conf].GetProp('_Name'), log)
            bar.next()
        bar.finish()

        if args.verbose and len(rotmatches) != 0:
            log.write("o %d total conformations generated" % total)
    finally:
        # Flush and release the SD file even when a stage raised or exited.
        sdwriter.close()

    # Stage 4: re-filter the conformers produced by the torsion rotations.
    if len(rotmatches) != 0:
        rd_selectedcids, rd_dup_energy, rd_dup_rms_eng = (
            _filter_rotated_conformers(name, args, log))

        if args.verbose:
            log.write("o " + str(rd_dup_energy)
                      + " Duplicates removed initial energy ( E < "
                      + str(args.initial_energy_threshold) + " kcal/mol )")
            log.write("o " + str(rd_dup_rms_eng)
                      + " Duplicates removed (RMSD < "
                      + str(args.rms_threshold) + " / E < "
                      + str(args.energy_threshold)
                      + " kcal/mol) after rotation")
            log.write("o " + str(len(rd_selectedcids))
                      + " unique (after torsions) conformers remain")

        dup_data.at[dup_data_idx, 'RDKIT-Rotated-conformers'] = total
        dup_data.at[dup_data_idx,
                    'RDKIT-Rotated-Unique-conformers'] = len(rd_selectedcids)

    return 1