def test7Fragment(self):
    """Exercise FragmentRemover and LargestFragmentChooser on small inputs."""
    # Removing fragments from a salted amine leaves only the amine.
    salted_amine = Chem.MolFromSmiles("CN(C)C.Cl.Cl.Br")
    remover = rdMolStandardize.FragmentRemover()
    self.assertEqual(Chem.MolToSmiles(remover.remove(salted_amine)), "CN(C)C")

    # The same two-fragment input is fed to both choosers below.
    ion_pair = Chem.MolFromSmiles("[N+](=O)([O-])[O-].[CH3+]")

    # Default chooser: the nitrate fragment is expected.
    default_chooser = rdMolStandardize.LargestFragmentChooser()
    default_pick = default_chooser.choose(ion_pair)
    self.assertEqual(Chem.MolToSmiles(default_pick), "O=[N+]([O-])[O-]")

    # With preferOrganic=True the carbon-containing fragment is expected.
    organic_chooser = rdMolStandardize.LargestFragmentChooser(preferOrganic=True)
    organic_pick = organic_chooser.choose(ion_pair)
    self.assertEqual(Chem.MolToSmiles(organic_pick), "[CH3+]")
def main():
    """Desalt, pick the main fragment, normalize, and write the molecules.

    Options come from ``UserInput()``: ``infile``, ``id`` (defaults to
    ``'ID'``), ``start``/``end`` (row slice of the loaded table), ``format``
    (``'sdf'`` or ``'smi'``) and ``outpref`` (output path without extension).
    Any other ``format`` value writes nothing.
    """
    args = UserInput()
    if args.id is None:
        args.id = 'ID'
    if args.start is None:
        args.start = 0

    start = int(args.start)
    # A missing end means "through the last row". The old default of -1 was
    # used directly as the slice stop and therefore dropped the final molecule.
    stop = None if args.end is None else int(args.end)

    df = RDkitRead(args.infile, args.id, removeHs=True,
                   add_Hs=False)[start:stop].dropna()

    remover = SaltRemover.SaltRemover()
    normzer = rdMolStandardize.Normalizer()
    chooser = rdMolStandardize.LargestFragmentChooser(preferOrganic=True)

    ## remove salts
    print('\033[34m## Desalting moleucles...\033[0m\n')
    df['mol'] = df.MOL.apply(remover.StripMol)

    ## keep a single fragment per entry
    print('\033[34m## Choosing moleucles...\033[0m\n')
    df['mol2'] = df.mol.apply(chooser.choose)

    ## apply the normalizer transforms
    print('\033[34m## Cleaning moleucles...\033[0m\n')
    df['mol3'] = df.mol2.apply(normzer.normalize)

    ## rewrite SMILES from the fully processed molecule
    print('\033[34m## Converting moleucles...\033[0m\n')
    df['smiles'] = df.mol3.apply(Chem.MolToSmiles)

    if args.format == 'sdf':
        rdpd.WriteSDF(df, args.outpref + '.' + args.format,
                      molColName='mol3', idName=args.id,
                      properties=['smiles'])
    elif args.format == 'smi':
        df.to_csv(args.outpref + '.' + args.format, index=False, sep=' ',
                  columns=['smiles', args.id], header=True)
def graph_corpus(input, output, suffix='sdf'):
    """Standardize molecules, encode them with ``VocGraph`` and save the codes.

    Parameters
    ----------
    input : str
        Path to a gzipped SDF (``suffix == 'sdf'``) or to a tab-separated
        table with ``Smiles`` and ``Molecule ChEMBL ID`` columns.
    output : str
        Path of the tab-separated file the encoded table is written to.
    suffix : str
        ``'sdf'`` selects the SDF reader; anything else selects the table.

    Molecules are skipped when they have fewer than 4 or at least 63 bonds,
    contain no carbon, or contain any of the listed metals. A molecule whose
    encoding does not decode back to its canonical SMILES is reported and
    left out.
    """
    metals = {'Na', 'Zn', 'Li', 'K', 'Ca', 'Mg', 'Ag', 'Cs', 'Ra', 'Rb',
              'Al', 'Sr', 'Ba', 'Bi'}
    voc = utils.VocGraph('data/voc_atom.txt')

    vals = {}
    exps = {}
    codes, ids = [], []

    # The gzip stream is only needed for SDF input and must stay open while
    # the forward supplier is being consumed; it is closed in ``finally``.
    inf = None
    try:
        if suffix == 'sdf':
            inf = gzip.open(input)
            mols = Chem.ForwardSDMolSupplier(inf)
            # NOTE(review): presumably a rough size estimate for the progress
            # bar, since the forward supplier is streamed - confirm.
            total = 2e6
        else:
            table = pd.read_table(input)
            table = table.drop_duplicates(subset=['Smiles'])
            table = table.dropna(subset=['Smiles'])
            total = len(table)
            mols = table.iterrows()

        chooser = rdMolStandardize.LargestFragmentChooser()
        disconnector = rdMolStandardize.MetalDisconnector()
        normalizer = rdMolStandardize.Normalizer()

        for mol in tqdm(mols, total=total):
            if mol is None:
                continue
            if suffix != 'sdf':
                row = mol[1]
                idx = row['Molecule ChEMBL ID']
                mol = Chem.MolFromSmiles(row.Smiles)
                if mol is None:
                    # Unparsable SMILES: report the id and move on instead of
                    # failing later on ``mol.GetAtoms()``.
                    print(idx)
                    continue
            else:
                idx = mol.GetPropsAsDict()['chembl_id']

            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
            except Exception:
                # Best effort, as before: report the id and carry on with
                # whatever standardization stage was reached.
                print(idx)

            symb = [a.GetSymbol() for a in mol.GetAtoms()]
            bonds = mol.GetBonds()
            if len(bonds) < 4 or len(bonds) >= 63:
                continue
            if {'C'}.isdisjoint(symb):
                continue
            if not metals.isdisjoint(symb):
                continue

            smile = Chem.MolToSmiles(mol)
            try:
                s0 = smile.replace('[O]', 'O').replace('[C]', 'C') \
                    .replace('[N]', 'N').replace('[B]', 'B') \
                    .replace('[2H]', '[H]').replace('[3H]', '[H]')
                s0 = Chem.CanonSmiles(s0, 0)
                code = voc.encode([smile])
                s1 = voc.decode(code)[0]
                # Explicit check rather than ``assert`` so the round-trip
                # validation still runs under ``python -O``.
                if s0 != s1:
                    raise ValueError(
                        'round-trip mismatch: %s != %s' % (s0, s1))
                codes.append(code[0].reshape(-1).tolist())
                ids.append(idx)
            except Exception as ex:
                print(ex)
                print('Parse Error:', idx)
    finally:
        if inf is not None:
            inf.close()

    df = pd.DataFrame(codes, index=ids,
                      columns=['C%d' % i for i in range(64 * 4)])
    df.to_csv(output, sep='\t', index=True)
    # Both dicts are never filled in this function; the prints are kept so
    # the console output is unchanged.
    print(vals)
    print(exps)
def corpus(input, output, suffix='sdf'):
    """Build a SMILES corpus and token vocabulary from a molecule file.

    Parameters
    ----------
    input : str
        Path to a gzipped SDF (``suffix == 'sdf'``) or to a tab-separated
        table with a ``Smiles`` column.
    output : str
        Prefix for the two result files: ``<output>_voc.txt`` (sorted token
        vocabulary, one per line) and ``<output>_corpus.txt`` (tab-separated
        ``Smiles`` / ``Token`` table).
    suffix : str
        ``'sdf'`` selects the SDF reader; anything else selects the table.

    Molecules that fail any standardization step are reported with
    ``'Parsing Error:'`` and skipped. SMILES without a carbon token, with a
    ``[Na]``/``[Zn]`` token, or whose token count (including ``EOS``) is not
    in ``(10, 100]`` are left out of the corpus.
    """
    smiles = set()
    inf = None
    try:
        if suffix == 'sdf':
            # The stream must stay open while the supplier is iterated; it is
            # closed in ``finally`` (it used to be leaked).
            inf = gzip.open(input)
            mols = Chem.ForwardSDMolSupplier(inf)
        else:
            df = pd.read_table(input).Smiles.dropna()
            mols = [Chem.MolFromSmiles(s) for s in df]

        voc = Voc('data/voc_smiles.txt')
        charger = rdMolStandardize.Uncharger()
        chooser = rdMolStandardize.LargestFragmentChooser()
        disconnector = rdMolStandardize.MetalDisconnector()
        normalizer = rdMolStandardize.Normalizer()

        for mol in tqdm(mols):
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = charger.uncharge(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                smileR = Chem.MolToSmiles(mol, 0)
                smiles.add(Chem.CanonSmiles(smileR))
            except Exception:
                # Covers ``None`` entries from failed parsing as well as
                # standardization failures; the molecule is dropped.
                print('Parsing Error:')
    finally:
        if inf is not None:
            inf.close()

    words = set()
    canons = []
    tokens = []
    for smile in tqdm(smiles):
        token = voc.split(smile) + ['EOS']
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redudent', smile)
            continue
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))

    with open(output + '_voc.txt', 'w') as voc_file:
        voc_file.write('\n'.join(sorted(words)))

    log = pd.DataFrame()
    log['Smiles'] = canons
    log['Token'] = tokens
    # ``drop_duplicates`` returns a new frame; the result used to be thrown
    # away, which made the call a no-op.
    log = log.drop_duplicates(subset='Smiles')
    log.to_csv(output + '_corpus.txt', sep='\t', index=False)
def choose_largest_fragment(mol):
    """Return the fragment picked by a default ``LargestFragmentChooser``.

    Parameters
    ----------
    mol: rdkit.Chem.Mol
        Molecule that may consist of several fragments.

    Returns
    -------
    mol: rdkit.Chem.Mol
        Result of ``LargestFragmentChooser().choose(mol)``.
    """
    # A fresh chooser with default settings is created on every call.
    fragment_chooser = rdMolStandardize.LargestFragmentChooser()
    largest = fragment_chooser.choose(mol)
    return largest
def salt_remover(self, smiles: str) -> str:
    """
    Strip salts/counterions by keeping the largest fragment.

    The input is returned untouched when it has no ``.`` separator or when
    RDKit cannot parse it.

    :param smiles: smiles string
    :return cleaned_smiles: SMILES of the chosen fragment, or the input
    """
    fragment_chooser = rdMolStandardize.LargestFragmentChooser()

    # Single-fragment input: nothing to strip.
    if "." not in smiles:
        return smiles

    # Unparsable input is passed through unchanged.
    if not Chem.MolFromSmiles(smiles):
        return smiles

    # NOTE(review): the fragment is chosen from ``self.smiles_mol``, not from
    # the molecule parsed from ``smiles`` above - confirm the two always
    # describe the same structure.
    largest = fragment_chooser.choose(self.smiles_mol)
    return Chem.MolToSmiles(largest)
def test7Fragment(self):
    """Exercise FragmentRemover and the LargestFragmentChooser options."""
    # --- FragmentRemover with default settings -------------------------
    salted_amine = Chem.MolFromSmiles("CN(C)C.Cl.Cl.Br")
    remover = rdMolStandardize.FragmentRemover()
    self.assertEqual(Chem.MolToSmiles(remover.remove(salted_amine)), "CN(C)C")

    # --- LargestFragmentChooser via constructor arguments --------------
    ion_pair = Chem.MolFromSmiles("[N+](=O)([O-])[O-].[CH3+]")
    default_pick = rdMolStandardize.LargestFragmentChooser().choose(ion_pair)
    self.assertEqual(Chem.MolToSmiles(default_pick), "O=[N+]([O-])[O-]")

    organic_chooser = rdMolStandardize.LargestFragmentChooser(preferOrganic=True)
    organic_pick = organic_chooser.choose(ion_pair)
    self.assertEqual(Chem.MolToSmiles(organic_pick), "[CH3+]")

    # --- skip_if_all_match: the atom count must be unchanged -----------
    only_salts = Chem.MolFromSmiles("[Na+].Cl.Cl.Br")
    guarded_remover = rdMolStandardize.FragmentRemover(skip_if_all_match=True)
    untouched = guarded_remover.remove(only_salts)
    self.assertEqual(untouched.GetNumAtoms(), only_salts.GetNumAtoms())

    # --- LargestFragmentChooser via CleanupParameters ------------------
    def pick_with_params(smiles, **overrides):
        # Fresh parameters, chooser and molecule for every case; overrides
        # are applied in the order they are given.
        params = rdMolStandardize.CleanupParameters()
        for attr_name, attr_value in overrides.items():
            setattr(params, attr_name, attr_value)
        chooser = rdMolStandardize.LargestFragmentChooser(params)
        chosen = chooser.choose(Chem.MolFromSmiles(smiles))
        return Chem.MolToSmiles(chosen)

    smi3 = "CNC[C@@H]([C@H]([C@@H]([C@@H](CO)O)O)O)O.c1cc2c(cc1C(=O)O)oc(n2)c3cc(cc(c3)Cl)Cl"
    smi4 = "CC.O=[Pb]=O"
    sugar_amine = "CNC[C@H](O)[C@@H](O)[C@H](O)[C@H](O)CO"
    benzoxazole_acid = "O=C(O)c1ccc2nc(-c3cc(Cl)cc(Cl)c3)oc2c1"

    cases = [
        (smi3, {}, sugar_amine),
        (smi3, {"largestFragmentChooserCountHeavyAtomsOnly": True},
         benzoxazole_acid),
        (smi3, {"largestFragmentChooserUseAtomCount": False},
         benzoxazole_acid),
        (smi4, {}, "CC"),
        (smi4, {"largestFragmentChooserCountHeavyAtomsOnly": True},
         "O=[Pb]=O"),
        (smi4, {"largestFragmentChooserUseAtomCount": False}, "O=[Pb]=O"),
        (smi4, {"largestFragmentChooserCountHeavyAtomsOnly": True,
                "preferOrganic": True}, "CC"),
        (smi4, {"largestFragmentChooserUseAtomCount": False,
                "preferOrganic": True}, "CC"),
    ]
    for smiles, overrides, expected in cases:
        self.assertEqual(pick_with_params(smiles, **overrides), expected)
def main(prm_file):
    """Clean an SDF library, shuffle it and split it by molecular weight.

    ``prm_file`` is the path of the input SDF; the text before ``.sdf`` is
    used as the output prefix. Molecules are desalted, reduced to a single
    (organic-preferred) fragment and given a fresh SMILES and a ``CB_``
    prefixed ID. The table is then shuffled, ``MW`` is taken from
    ``QED.properties``, and four MW bins are written, each as
    ``<prefix>.<bin>.csv.bz2`` and ``<prefix>.<bin>.smi``:

    - ``small``: MW <= 150
    - ``frag``:  150 < MW <= 300
    - ``lead``:  300 < MW <= 400
    - ``drug``:  MW > 400
    """
    pref = prm_file.split('.sdf')[0]

    print('## Reading file...')
    prm_df = PandasTools.LoadSDF(prm_file, smilesName='SMILES',
                                 molColName='MOL',
                                 includeFingerprints=False)
    print(prm_df[:10])

    ## remove salts and rename the smiles
    print('## Cleaning moleucles...')
    remover = SaltRemover.SaltRemover()
    chooser = rdMolStandardize.LargestFragmentChooser(preferOrganic=True)
    prm_df['molx'] = prm_df.MOL.apply(remover.StripMol)
    prm_df['mol'] = prm_df.molx.apply(chooser.choose)
    prm_df['smiles'] = prm_df.mol.apply(Chem.MolToSmiles)
    prm_df['ID'] = prm_df.CB_ID.apply(lambda cb_id: 'CB_' + str(cb_id))

    ## shuffle
    print('## Shuffling molecules...')
    # The shuffled frame used to be bound to an unused name, so everything
    # written below was still in input order; rebind it to ``prm_df``.
    prm_df = prm_df.sample(frac=1).reset_index(drop=True)
    print(prm_df[:10])

    ## recalculate molecular properties
    print('## Calculating properties...')
    prm_df['qed'] = prm_df.mol.apply(QED.properties)
    prm_df['MW'] = prm_df.qed.apply(lambda props: props.MW)
    print(prm_df[:10])
    print(' > number of molecules... ', len(prm_df))

    ## print out molecule properties and smiles (shuffled)
    print('## Writing results...')
    # NOTE(review): only ID, MW and smiles are computed in this function; the
    # remaining columns are presumably already present in the loaded SDF
    # table - confirm.
    Cols_csv = [
        'ID', 'MW', 'HA', 'logP', 'LogS', 'HBA', 'HBD', 'PSA', 'ROTB',
        'AROM', 'SaltType', 'smiles'
    ]
    Cols_smi = ['smiles', 'ID']

    mw = prm_df.MW
    # Same bins and write order as before: frag, lead, drug, small.
    mw_bins = [
        ('frag', (mw > 150.) & (mw <= 300.)),
        ('lead', (mw > 300.) & (mw <= 400.)),
        ('drug', mw > 400.),
        ('small', mw <= 150.),
    ]
    for label, mask in mw_bins:
        subset = prm_df.loc[mask]
        subset.to_csv(pref + '.' + label + '.csv.bz2', sep=',',
                      float_format='%.2f', columns=Cols_csv, index=False)
        subset.to_csv(pref + '.' + label + '.smi', sep='\t',
                      columns=Cols_smi, index=False)