def main():
    """Standardize the molecules of an input file and write them out.

    Reads command-line options via ``UserInput()``, loads the molecules with
    ``RDkitRead`` and keeps the row window ``[start:end]`` with rows
    containing missing values dropped. Each molecule is then:

    1. stripped of salts (``SaltRemover.StripMol``),
    2. reduced to its largest fragment (``LargestFragmentChooser`` with
       ``preferOrganic=True``),
    3. normalized (``rdMolStandardize.Normalizer``),
    4. converted to a SMILES string.

    The result is written to ``<outpref>.<format>``: an SDF file when
    ``args.format == 'sdf'``, or a space-separated SMILES/ID table when
    ``args.format == 'smi'``. Any other format value writes nothing.
    """
    args = UserInput()
    if args.id is None:
        args.id = 'ID'

    # Use an open-ended slice when no end is given. The former default of -1
    # produced ``[start:-1]``, which silently dropped the last molecule.
    start = 0 if args.start is None else int(args.start)
    end = None if args.end is None else int(args.end)

    df = RDkitRead(args.infile, args.id, removeHs=True, add_Hs=False)[start:end].dropna()

    remover = SaltRemover.SaltRemover()
    normzer = rdMolStandardize.Normalizer()
    chooser = rdMolStandardize.LargestFragmentChooser(preferOrganic=True)

    # Remove salts.
    print('\033[34m## Desalting moleucles...\033[0m\n')
    df['mol'] = df.MOL.apply(remover.StripMol)

    # Choose the largest fragment (original note: "most Hs").
    print('\033[34m## Choosing moleucles...\033[0m\n')
    df['mol2'] = df.mol.apply(chooser.choose)

    # Clean the molecule (original note: "not really relevant?").
    print('\033[34m## Cleaning moleucles...\033[0m\n')
    df['mol3'] = df.mol2.apply(normzer.normalize)

    # Rewrite SMILES from the fully processed molecule (mol3).
    print('\033[34m## Converting moleucles...\033[0m\n')
    df['smiles'] = df.mol3.apply(Chem.MolToSmiles)

    out_path = args.outpref + '.' + args.format
    if args.format == 'sdf':
        rdpd.WriteSDF(df, out_path, molColName='mol3', idName=args.id,
                      properties=['smiles'])
    elif args.format == 'smi':
        df.to_csv(out_path, index=False, sep=' ',
                  columns=['smiles', args.id], header=True)
def graph_corpus(input, output, suffix='sdf'):
    """Encode a molecule collection with the graph vocabulary and save a TSV.

    Args:
        input: Path of the molecule file. With ``suffix == 'sdf'`` it is
            opened with ``gzip.open`` and read through
            ``Chem.ForwardSDMolSupplier``; otherwise it is read with
            ``pd.read_table`` and must provide the columns ``Smiles`` and
            ``Molecule ChEMBL ID``.
        output: Path of the tab-separated file to write. Rows are indexed by
            molecule id, with columns ``C0`` .. ``C255``.
        suffix: Input flavour; ``'sdf'`` selects the SDF reader, any other
            value selects the table reader.

    Molecules are standardized (metal disconnection, normalization, largest
    fragment, then disconnection and normalization again). A molecule is
    skipped when it has fewer than 4 or at least 63 bonds, contains no ``C``
    atom, or contains one of the listed metals. A molecule whose encoding
    does not decode back to its canonical SMILES is reported and left out.
    """
    metals = {'Na', 'Zn', 'Li', 'K', 'Ca', 'Mg', 'Ag', 'Cs', 'Ra', 'Rb',
              'Al', 'Sr', 'Ba', 'Bi'}
    voc = utils.VocGraph('data/voc_atom.txt')
    chooser = rdMolStandardize.LargestFragmentChooser()
    disconnector = rdMolStandardize.MetalDisconnector()
    normalizer = rdMolStandardize.Normalizer()

    vals = {}
    exps = {}
    codes, ids = [], []

    # Only the SDF path reads through the gzip handle, so open it only then.
    # The supplier reads lazily, so it must stay open for the whole loop and
    # is closed in ``finally``.
    inf = gzip.open(input) if suffix == 'sdf' else None
    try:
        if suffix == 'sdf':
            mols = Chem.ForwardSDMolSupplier(inf)
            total = 2e6  # Fixed progress-bar estimate; the supplier has no length.
        else:
            table = pd.read_table(input).drop_duplicates(subset=['Smiles']).dropna(subset=['Smiles'])
            total = len(table)
            mols = table.iterrows()

        for mol in tqdm(mols, total=total):
            if mol is None:
                continue
            if suffix != 'sdf':
                # ``mol`` is an (index, row) pair from ``iterrows``.
                idx = mol[1]['Molecule ChEMBL ID']
                mol = Chem.MolFromSmiles(mol[1].Smiles)
                if mol is None:
                    # Unparseable SMILES: skip it instead of crashing later
                    # on ``mol.GetAtoms()``.
                    print('Parse Error:', idx)
                    continue
            else:
                idx = mol.GetPropsAsDict()['chembl_id']

            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
            except Exception:
                # Best effort, as before: report the id and continue with the
                # molecule as far as it was standardized.
                print(idx)

            symb = [a.GetSymbol() for a in mol.GetAtoms()]
            bonds = mol.GetBonds()
            # Keep only molecules with 4..62 bonds.
            if len(bonds) < 4 or len(bonds) >= 63:
                continue
            if 'C' not in symb:
                continue
            if not metals.isdisjoint(symb):
                continue

            smile = Chem.MolToSmiles(mol)
            try:
                s0 = smile.replace('[O]', 'O').replace('[C]', 'C') \
                    .replace('[N]', 'N').replace('[B]', 'B') \
                    .replace('[2H]', '[H]').replace('[3H]', '[H]')
                s0 = Chem.CanonSmiles(s0, 0)
                code = voc.encode([smile])
                s1 = voc.decode(code)[0]
                # Explicit check instead of ``assert`` so it still runs
                # under ``python -O``.
                if s0 != s1:
                    raise ValueError('round-trip mismatch: %s != %s' % (s0, s1))
                codes.append(code[0].reshape(-1).tolist())
                ids.append(idx)
            except Exception as ex:
                print(ex)
                print('Parse Error:', idx)
    finally:
        if inf is not None:
            inf.close()

    df = pd.DataFrame(codes, index=ids, columns=['C%d' % i for i in range(64 * 4)])
    df.to_csv(output, sep='\t', index=True)
    # Both dicts are never filled in this function; the prints are kept so
    # the console output is unchanged.
    print(vals)
    print(exps)
def corpus(input, output, suffix='sdf'):
    """Build a SMILES token corpus and its vocabulary from a molecule file.

    Args:
        input: Path of the molecule file. With ``suffix == 'sdf'`` it is
            opened with ``gzip.open`` and read through
            ``Chem.ForwardSDMolSupplier``; otherwise it is read with
            ``pd.read_table`` and must provide a ``Smiles`` column.
        output: Path prefix. The sorted token vocabulary is written to
            ``<output>_voc.txt`` and the corpus table (columns ``Smiles``
            and ``Token``) to ``<output>_corpus.txt``.
        suffix: Input flavour; ``'sdf'`` selects the SDF reader, any other
            value selects the table reader.

    Each molecule is standardized (metal disconnection, normalization,
    largest fragment, uncharging, then disconnection and normalization
    again) and turned into a canonical SMILES. A SMILES is kept only when
    its tokens contain ``C`` or ``c``, contain neither ``[Na]`` nor
    ``[Zn]``, and number more than 10 and at most 100 (``EOS`` included).
    """
    voc = Voc('data/voc_smiles.txt')
    charger = rdMolStandardize.Uncharger()
    chooser = rdMolStandardize.LargestFragmentChooser()
    disconnector = rdMolStandardize.MetalDisconnector()
    normalizer = rdMolStandardize.Normalizer()

    smiles = set()
    # The SDF supplier reads lazily from the gzip handle, so it must stay
    # open during the loop; it is closed in ``finally`` (it used to leak).
    inf = gzip.open(input) if suffix == 'sdf' else None
    try:
        if suffix == 'sdf':
            mols = Chem.ForwardSDMolSupplier(inf)
        else:
            df = pd.read_table(input).Smiles.dropna()
            mols = [Chem.MolFromSmiles(s) for s in df]

        for mol in tqdm(mols):
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = charger.uncharge(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                smileR = Chem.MolToSmiles(mol, 0)
                smiles.add(Chem.CanonSmiles(smileR))
            except Exception:
                # Also covers ``mol is None`` (unparseable record): report
                # it and move on to the next molecule.
                print('Parsing Error:')
    finally:
        if inf is not None:
            inf.close()

    words = set()
    canons = []
    tokens = []
    # Sort so the corpus row order is reproducible; iterating the raw set
    # made it depend on string hash randomization.
    for smile in tqdm(sorted(smiles)):
        token = voc.split(smile) + ['EOS']
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redudent', smile)
            continue
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))

    with open(output + '_voc.txt', 'w') as voc_file:
        voc_file.write('\n'.join(sorted(words)))

    log = pd.DataFrame({'Smiles': canons, 'Token': tokens})
    # ``drop_duplicates`` returns a new frame; its result used to be
    # discarded, which made the call a no-op.
    log = log.drop_duplicates(subset='Smiles')
    log.to_csv(output + '_corpus.txt', sep='\t', index=False)
def test10NormalizeFromData(self):
    """Check that a Normalizer built from inline transform data applies only those transforms.

    The inline table enables the nitro, sulfone and pyridine-oxide
    transforms and leaves the azide transform commented out (``//``). The
    default normalizer is therefore expected to rewrite both the nitro and
    the azide group of the probe molecule, and the data-driven one only the
    nitro group.
    """
    # NOTE(review): the table is presumably parsed as one record per line
    # with a tab between name and SMIRKS -- confirm against the RDKit docs.
    # The separators are written as explicit escapes so they survive
    # whitespace reformatting of this file.
    data = "\n".join([
        "//\tName\tSMIRKS",
        "Nitro to N+(O-)=O\t[N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1]([*-1:2])=[*:3]",
        "Sulfone to S(=O)(=O)\t[S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])",
        "Pyridine oxide to n+O-\t[n:1]=[O:2]>>[n+:1][O-:2]",
        "// Azide to N=N+=N-\t[*,H:1][N:2]=[N:3]#[N:4]>>[*,H:1][N:2]=[N+:3]=[N-:4]",
        "",
    ])
    normalizer1 = rdMolStandardize.Normalizer()
    params = rdMolStandardize.CleanupParameters()
    normalizer2 = rdMolStandardize.NormalizerFromData(data, params)

    # Parsed without sanitization so the hypervalent input forms are kept.
    imol = Chem.MolFromSmiles("O=N(=O)CCN=N#N", sanitize=False)
    mol1 = normalizer1.normalize(imol)
    mol2 = normalizer2.normalize(imol)

    # The input molecule itself must be left untouched by normalize().
    self.assertEqual(Chem.MolToSmiles(imol), "N#N=NCCN(=O)=O")
    self.assertEqual(Chem.MolToSmiles(mol1), "[N-]=[N+]=NCC[N+](=O)[O-]")
    self.assertEqual(Chem.MolToSmiles(mol2), "N#N=NCC[N+](=O)[O-]")
def my_standardizer(mol: Chem.Mol) -> Chem.Mol:
    """Standardize a molecule with the MolVS-style RDKit pipeline.

    The input is deep-copied first, so the caller's molecule is not modified.
    The copy is sanitized, stripped of hydrogens, metal-disconnected,
    normalized and reionized, and finally has its stereochemistry reassigned.

    Args:
        mol (Chem.Mol): non-standardized rdkit mol object

    Returns:
        Chem.Mol: standardized rdkit mol object
    """
    work = copy.deepcopy(mol)
    Chem.SanitizeMol(work)
    work = Chem.RemoveHs(work)

    work = rdMolStandardize.MetalDisconnector().Disconnect(work)
    work = rdMolStandardize.Normalizer().normalize(work)
    work = rdMolStandardize.Reionizer().reionize(work)

    # TODO: Check this removes symmetric stereocenters
    Chem.AssignStereochemistry(work, cleanIt=True, force=True)
    return work
def normalize(mol):
    """Run RDKit's default normalization transforms on a molecule.

    Parameters
    ----------
    mol: rdkit.Chem.Mol
        The molecule to normalize.

    Returns
    -------
    mol: rdkit.Chem.Mol
        The molecule produced by a default ``rdMolStandardize.Normalizer``,
        i.e. with functional groups corrected and charges recombined.

    Notes
    -----
    A new default-configured Normalizer is created on every call. According
    to the original notes for this function, the transforms are the
    NORMALIZATIONS list in rdkit/Chem/MolStandardize/normalize.py and are
    derived from the InChI technical manual.
    """
    normalizer = rdMolStandardize.Normalizer()
    normalized = normalizer.normalize(mol)
    return normalized
def test8Normalize(self):
    """Check that the default Normalizer turns a charge-separated pyridinium oxide into the neutral pyridone."""
    charged = Chem.MolFromSmiles("C[n+]1ccccc1[O-]")
    result = rdMolStandardize.Normalizer().normalize(charged)
    self.assertEqual(Chem.MolToSmiles(result), "Cn1ccccc1=O")