def findDuplicates(sdf, name, out):
    """Report structurally duplicated molecules found in an SD file.

    Every readable record of ``sdf`` is reduced to its InChIKey with the last
    three characters stripped; records sharing that truncated key are written
    pairwise to the tab-separated report ``out``.

    Parameters
    ----------
    sdf : str
        Path of the SD file (read with ``removeHs=False, sanitize=False``).
    name : str
        SD property used as molecule name. Records lacking it are named
        ``mol<counter>``, where the counter is the 1-based record position.
    out : str
        Path of the report file. Columns: ``i, j, namei, namej, smilesi,
        smilesj``; ``i < j`` index the successfully processed records.
    """
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.ERROR)

    suppl = Chem.SDMolSupplier(sdf, removeHs=False, sanitize=False)

    idlist = []
    nmlist = []
    smlist = []
    # members_by_key[key] lists (ascending) the indices sharing a key;
    # rank[i] is the position of record i inside its own group.
    members_by_key = {}
    rank = []

    print('reading SDFile...')
    counter = 0
    for mol in suppl:
        counter += 1
        if mol is None:
            continue
        try:
            inchi = Chem.MolToInchi(mol)
            inkey = Chem.InchiToInchiKey(inchi)
            smile = Chem.MolToSmiles(mol)
        except Exception:
            # Best effort: records RDKit cannot convert are ignored.
            continue
        if not inkey:
            # No key could be derived; slicing it below would fail (None) or
            # make every such record look identical (empty string).
            continue
        try:
            ni = mol.GetProp(name)
        except KeyError:
            ni = 'mol%0.8d' % counter

        key = inkey[:-3]
        group = members_by_key.setdefault(key, [])
        rank.append(len(group))
        group.append(len(idlist))

        idlist.append(key)
        nmlist.append(ni)
        smlist.append(smile)

    print('analizing duplicates...')

    duplicates = 0
    with open(out, 'w') as fo:
        fo.write('i\tj\tnamei\tnamej\tsmilesi\tsmilesj\n')
        # Same pair order as a nested "for i / for j > i" scan, but only the
        # records that actually share a key are visited.
        for i, key in enumerate(idlist):
            for j in members_by_key[key][rank[i] + 1:]:
                fo.write('\t'.join((str(i), str(j), nmlist[i], nmlist[j],
                                    smlist[i], smlist[j])) + '\n')
                duplicates += 1

    print('\n%d duplicate molecules found' % duplicates)
def _get_molecule_database(self, molecule_database_src,
                           molecule_database_src_type):
    """Load molecular database and return it.

    Optionally return features if found in excel / csv file.

    Args:
        molecule_database_src (str):
            Source of molecular information. Can be a folder or a filepath.
            In case a folder is specified, all .pdb files in the folder
            are sequentially read.
            If a text file, each non-blank line is split on whitespace:
            SMILES string (column 1), property (column 2, optional, parsed
            as float).
            If an excel / csv file, columns prefixed with ``feature_`` are
            features (``feature_smiles`` is mandatory, ``feature_name`` is
            optional) and a column prefixed with ``response_`` holds the
            property value.
        molecule_database_src_type (str):
            Type of source (case-insensitive). Can be
            ['folder', 'directory', 'text', 'excel', 'csv'].

    Returns:
        (list(Molecule), np.ndarray or None)
            Returns a tuple. First element of tuple is the molecule_database.
            Second element is array of features of shape
            (len(molecule_database), n_features) or None if None found.
            Rows belonging to molecules that failed to load are left out so
            both elements stay aligned.

    Raises:
        FileNotFoundError: If the source type is not one of the supported
            values.
        ValueError: If an excel / csv source has no ``feature_smiles``
            column.
        UserWarning: If no molecule could be loaded.

    """
    if not self.is_verbose:
        RDLogger.DisableLog('rdApp.*')

    src_type = molecule_database_src_type.lower()
    molecule_database = []
    features = None
    if src_type in ["folder", "directory"]:
        if self.is_verbose:
            print(f"Searching for *.pdb files in {molecule_database_src}")
        for molfile in glob(os.path.join(molecule_database_src, "*.pdb")):
            if self.is_verbose:
                print(f"Loading {molfile}")
            try:
                molecule_database.append(Molecule(mol_src=molfile))
            except LoadingError:
                if self.is_verbose:
                    print(f"{molfile} could not be imported. Skipping")
    elif src_type == "text":
        if self.is_verbose:
            print(f"Reading SMILES strings from {molecule_database_src}")
        with open(molecule_database_src, "r") as fp:
            smiles_data = fp.readlines()
        for count, line in enumerate(smiles_data):
            # Assumes that the first column contains the smiles string
            line_fields = line.split()
            if not line_fields:
                # Blank line (e.g. trailing newline at end of file).
                continue
            smile = line_fields[0]
            mol_property_val = None
            if len(line_fields) > 1:
                mol_property_val = float(line_fields[1])
            if self.is_verbose:
                print(f"Processing {smile} "
                      f"({count + 1}/"
                      f"{len(smiles_data)})")
            mol_text = smile
            try:
                molecule_database.append(
                    Molecule(
                        mol_smiles=smile,
                        mol_text=mol_text,
                        mol_property_val=mol_property_val,
                    ))
            except LoadingError:
                if self.is_verbose:
                    print(f"{smile} could not be imported. Skipping")
    elif src_type in ["excel", "csv"]:
        if self.is_verbose:
            print(f"Reading molecules from {molecule_database_src}")
        database_df = (pd.read_excel(molecule_database_src,
                                     engine="openpyxl")
                       if src_type == "excel"
                       else pd.read_csv(molecule_database_src))
        # expects feature columns to be prefixed with feature_
        # e.g. feature_smiles
        feature_cols = [
            column for column in database_df.columns
            if column.split("_")[0] == "feature"
        ]
        database_feature_df = database_df[feature_cols]
        mol_names, mol_smiles, responses = None, None, None
        if "feature_name" in feature_cols:
            mol_names = database_feature_df["feature_name"].values.flatten()
            database_feature_df = database_feature_df.drop(
                ["feature_name"], axis=1)
        if "feature_smiles" in feature_cols:
            mol_smiles = database_df["feature_smiles"].values.flatten()
            database_feature_df = database_feature_df.drop(
                ["feature_smiles"], axis=1)
        if mol_smiles is None:
            raise ValueError(
                f"{molecule_database_src} has no 'feature_smiles' column; "
                f"molecules cannot be built from it")

        response_col = [
            column for column in database_df.columns
            if column.split("_")[0] == "response"
        ]
        if len(response_col) > 0:
            # currently handles one response
            responses = database_df[response_col].values.flatten()

        # Row indices of the molecules that were actually loaded, used to
        # keep the feature matrix aligned with molecule_database.
        loaded_rows = []
        for mol_id, smile in enumerate(mol_smiles):
            if self.is_verbose:
                print(f"Processing {smile} "
                      f"({mol_id + 1}/"
                      f"{database_df['feature_smiles'].values.size})")
            mol_text = mol_names[mol_id] if mol_names is not None else smile
            mol_property_val = (responses[mol_id]
                                if responses is not None else None)
            try:
                molecule_database.append(
                    Molecule(
                        mol_smiles=smile,
                        mol_text=mol_text,
                        mol_property_val=mol_property_val,
                    ))
            except LoadingError:
                if self.is_verbose:
                    print(f"{smile} could not be imported. Skipping")
            else:
                loaded_rows.append(mol_id)

        if len(database_feature_df.columns) > 0:
            features = database_feature_df.values[loaded_rows]
    else:
        raise FileNotFoundError(
            f"{molecule_database_src} could not be found. "
            f"Please enter valid folder name or path of a "
            f"text/excel/csv")
    if len(molecule_database) == 0:
        raise UserWarning("No molecular files found in the location!")
    return molecule_database, features
def __init__(self,
             moli,
             molj,
             options=argparse.Namespace(time=20, verbose='info')):
    """
    Initialization function

    Computes the maximum common substructure (MCS) of two molecules, stores
    it as ``self.mcs_mol`` and annotates its atoms with the mapping back to
    both input molecules.

    Parameters
    ----------

    moli : RDKit molecule object
        the first molecule used to perform the MCS calculation
    molj : RDKit molecule object
        the second molecule used to perform the MCS calculation
    options : argparse python object
        the list of user options; only ``options.time`` (MCS timeout) and
        ``options.verbose`` are read here. The default object is shared
        between calls and must not be mutated.

    Raises
    ------
    ValueError
        if no MCS is found, if the MCS cannot be sanitized, or if the MCS
        cannot be mapped back onto one of the molecules.
    """

    def map_mcs_mol():
        """
        This function is used to define a map between the generated mcs,
        the molecules and vice versa. The maps are stored as the atom
        properties 'to_moli', 'to_molj' and 'org_idx' of the mcs atoms.
        """

        # mcs indexes mapped back to the first molecule moli
        if self.__moli_noh.HasSubstructMatch(self.mcs_mol):
            moli_sub = self.__moli_noh.GetSubstructMatch(self.mcs_mol)
        else:
            raise ValueError(
                'RDkit MCS Subgraph first molecule search failed')

        # GAC TEST 02/17/17
        # mcsi_sub = self.mcs_mol.GetSubstructMatch(self.mcs_mol)
        if self.mcs_mol.HasSubstructMatch(self.mcs_mol):
            mcsi_sub = self.mcs_mol.GetSubstructMatch(self.mcs_mol)
        else:
            raise ValueError('RDkit MCS Subgraph search failed')

        # mcs to moli. Materialized as a list because the pairs are walked
        # twice (property assignment, then chirality detection); a bare
        # zip() is exhausted after the first pass on Python 3.
        map_mcs_mol_to_moli_sub = list(zip(mcsi_sub, moli_sub))

        # An RDkit atomic property is defined to store the mapping to moli
        for idx in map_mcs_mol_to_moli_sub:
            self.mcs_mol.GetAtomWithIdx(idx[0]).SetProp(
                'to_moli', str(idx[1]))

        # mcs indexes mapped back to the second molecule molj
        if self.__molj_noh.HasSubstructMatch(self.mcs_mol):
            molj_sub = self.__molj_noh.GetSubstructMatch(self.mcs_mol)
        else:
            raise ValueError(
                'RDkit MCS Subgraph second molecule search failed')

        if self.mcs_mol.HasSubstructMatch(self.mcs_mol):
            mcsj_sub = self.mcs_mol.GetSubstructMatch(self.mcs_mol)
        else:
            raise ValueError('RDkit MCS Subgraph search failed')

        # mcs to molj (list for the same reason as above)
        map_mcs_mol_to_molj_sub = list(zip(mcsj_sub, molj_sub))

        # Map between the two molecules, kept as a reusable list
        self.__map_moli_molj = list(zip(moli_sub, molj_sub))

        # An RDkit atomic property is defined to store the mapping to molj
        for idx in map_mcs_mol_to_molj_sub:
            self.mcs_mol.GetAtomWithIdx(idx[0]).SetProp(
                'to_molj', str(idx[1]))

        # Chirality

        # moli chiral atoms
        chiral_at_moli_noh = [
            seq[0] for seq in Chem.FindMolChiralCenters(self.__moli_noh)
        ]
        # molj chiral atoms
        chiral_at_molj_noh = [
            seq[0] for seq in Chem.FindMolChiralCenters(self.__molj_noh)
        ]

        chiral_at_mcs_moli_noh = set([
            seq[0] for seq in map_mcs_mol_to_moli_sub
            if seq[1] in chiral_at_moli_noh
        ])
        chiral_at_mcs_molj_noh = set([
            seq[0] for seq in map_mcs_mol_to_molj_sub
            if seq[1] in chiral_at_molj_noh
        ])

        # mcs chiral atoms: chiral in at least one of the two molecules
        chiral_at_mcs = chiral_at_mcs_moli_noh | chiral_at_mcs_molj_noh

        for i in chiral_at_mcs:
            at = self.mcs_mol.GetAtomWithIdx(i)
            at.SetChiralTag(Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW)

        if chiral_at_mcs and options.verbose == 'pedantic':
            logging.info('Chiral atom detected')

        # For each mcs atom we save its original index in a specified
        # property. This could be very useful in the code development
        # when deletion or atom insertions are performed
        for at in self.mcs_mol.GetAtoms():
            at.SetProp('org_idx', str(at.GetIdx()))

        return

    def set_ring_counter(mol):
        """
        This function is used to attach to each molecule atom a ring
        counter rc. This parameter is used to assess if a ring has been
        broken or not during the MCS mapping

        Parameters
        ----------
        mol : RDKit Molecule obj
            the molecule used to define the atom ring counters
        """

        # set to zero the atom ring counters
        for at in mol.GetAtoms():
            at.SetProp('rc', '0')

        rginfo = mol.GetRingInfo()
        rgs = rginfo.AtomRings()

        # every atom gets rc = number of rings it belongs to
        rgs_set = set([e for l in rgs for e in l])

        for idx in rgs_set:
            for r in rgs:
                if (idx in r):
                    val = int(mol.GetAtomWithIdx(idx).GetProp('rc'))
                    val = val + 1
                    mol.GetAtomWithIdx(idx).SetProp('rc', str(val))
        return

    # Set logging level and format
    logging.basicConfig(format='%(levelname)s:\t%(message)s',
                        level=logging.INFO)

    # Local pointers to the passed molecules
    self.moli = moli
    self.molj = molj

    if not options.verbose == 'pedantic':
        lg = RDLogger.logger()
        lg.setLevel(RDLogger.CRITICAL)

    # Local pointers to the passed molecules without hydrogens
    # These variables are defined as private
    try:
        self.__moli_noh = AllChem.RemoveHs(moli)
        self.__molj_noh = AllChem.RemoveHs(molj)
    except Exception:
        # Fall back to an unsanitized removal and only perceive aromaticity
        self.__moli_noh = AllChem.RemoveHs(moli, sanitize=False)
        self.__molj_noh = AllChem.RemoveHs(molj, sanitize=False)

        Chem.SanitizeMol(
            self.__moli_noh,
            sanitizeOps=Chem.SanitizeFlags.SANITIZE_SETAROMATICITY)
        Chem.SanitizeMol(
            self.__molj_noh,
            sanitizeOps=Chem.SanitizeFlags.SANITIZE_SETAROMATICITY)

    # MCS calculation. In RDKit the MCS is a smart string. Ring atoms are
    # always mapped in ring atoms.
    self.__mcs = rdFMCS.FindMCS([self.__moli_noh, self.__molj_noh],
                                timeout=options.time,
                                atomCompare=rdFMCS.AtomCompare.CompareAny,
                                bondCompare=rdFMCS.BondCompare.CompareAny,
                                matchValences=False,
                                ringMatchesRingOnly=True,
                                completeRingsOnly=False,
                                matchChiralTag=False)

    # Checking
    if self.__mcs.canceled:
        logging.warning(
            'Timeout reached to find the MCS between the molecules')

    if self.__mcs.numAtoms == 0:
        raise ValueError('No MCS was found between the molecules')

    # The found MCS pattern (smart strings) is converted to a RDKit molecule
    self.mcs_mol = Chem.MolFromSmarts(self.__mcs.smartsString)

    try:
        # Try to sanitize the MCS molecule
        Chem.SanitizeMol(self.mcs_mol)
    except Exception:
        # if not, try to recover the atom aromaticity which is
        # important for the ring counter
        sanitFail = Chem.SanitizeMol(
            self.mcs_mol,
            sanitizeOps=Chem.SanitizeFlags.SANITIZE_SETAROMATICITY,
            catchErrors=True)
        if sanitFail:
            # if not, the MCS is skipped
            raise ValueError('Sanitization Failed...')

    # Mapping between the found MCS molecule and moli, molj
    try:
        map_mcs_mol()
    except Exception as e:
        raise ValueError(str(e))

    # Set the ring counters for each molecule
    set_ring_counter(self.__moli_noh)
    set_ring_counter(self.__molj_noh)
    set_ring_counter(self.mcs_mol)

    # Restore a less restrictive RDKit log level once the work is done
    if not options.verbose == 'pedantic':
        lg.setLevel(RDLogger.WARNING)

    return
""" This module detects salts. """ import logging from rdkit import Chem from rdkit.Chem.MolStandardize import rdMolStandardize from rdkit import RDLogger from .remove_salts import remove_salts RDLogger.DisableLog('rdApp.info') __all__ = ["detect_salts"] def _validation_smiles(mol): """Utility function that converts a mol to SMILES for later validation. """ validation_smiles = Chem.MolToSmiles(mol) return validation_smiles def detect_salts(mol, *args, **kwargs): """Detects salts. Generates a SMILES out of the entered mol for validation, performs fragment removal, turns the changed mol into another SMILES and validates it with the first SMILES created. Parameters ----------
# Setting logging low from rdkit import RDLogger lg = RDLogger.logger() lg.setLevel(RDLogger.CRITICAL) from database import db_client from django.conf import settings import makeit.utilities.io.pickle as pickle import os import makeit.global_config as gc # Chiral Retro Transformer import makeit.retrosynthetic.transformer as transformer RetroTransformer = transformer.RetroTransformer(lookup_only=True) RetroTransformer.load(chiral=True, refs=True, rxns=False) RETRO_CHIRAL_FOOTNOTE = 'Using {} chiral retrosynthesis templates (mincount {} if achiral, mincount {} if chiral) from {}/{}'.format( gc.Relevance_Prioritization['output_size'], gc.RETRO_TRANSFORMS_CHIRAL['mincount'], gc.RETRO_TRANSFORMS_CHIRAL['mincount_chiral'], gc.RETRO_TRANSFORMS_CHIRAL['database'], gc.RETRO_TRANSFORMS_CHIRAL['collection']) ### Databases db = db_client[settings.REACTIONS['database']] REACTION_DB = db[settings.REACTIONS['collection']] # RETRO_LIT_FOOTNOTE = 'Searched {} known reactions from literature'.format(REACTION_DB.count()) db = db_client[settings.INSTANCES['database']] INSTANCE_DB = db[settings.INSTANCES['collection']] db = db_client[settings.CHEMICALS['database']] CHEMICAL_DB = db[settings.CHEMICALS['collection']]
from plip.structure.preparation import PDBComplex import pyunitwizard as puw from rdkit import Chem, RDLogger from rdkit.Chem.Draw import rdMolDraw2D # Standard Library from collections import defaultdict import copy from io import StringIO, BytesIO import json import requests import re import tempfile from typing import List, Optional, Tuple import warnings RDLogger.DisableLog('rdApp.*') # Disable rdkit warnings class StructuredBasedPharmacophore(Pharmacophore): """ Class to store and compute structured-based pharmacophores Inherits from pharmacophore Parameters ---------- pharmacophoric_points : list of openpharmacophore.PharmacophoricPoint List of pharmacophoric pharmacophoric_points molecular_system : rdkit.Chem.Mol The protein-ligand complex from which this pharmacophore was extracted.
def test_SmartsRemover(self):
    """Constructing a SmartsRemover with an invalid SMARTS raises ValueError."""
    salts = ['[Cl;H1&X1,-]', '[Na+]', '[O;H2,H1&-,X0&-2]', 'BadSmarts']
    # The bad pattern makes RDKit log an error; silence it for the duration
    # of the check and always switch it back on, even when the assertion
    # fails, so later tests are not affected.
    RDLogger.DisableLog('rdApp.error')
    try:
        self.assertRaises(ValueError, SmartsRemover.SmartsRemover,
                          patterns=salts)
    finally:
        RDLogger.EnableLog('rdApp.error')
def disable_rdkit_log():
    """Switch off every RDKit log channel matching ``rdApp.*``."""
    # Imported lazily so rdkit is only required when this helper is called.
    from rdkit import RDLogger as rdkit_logger

    rdkit_logger.DisableLog('rdApp.*')
import rdkit.rdBase as rkrb import rdkit.RDLogger as rkl from mordred import Calculator, descriptors from rdkit.Chem import AddHs, CanonSmiles, MolFromSmiles from rdkit.Chem.Descriptors import ExactMolWt from rdkit.Chem.inchi import MolToInchiKey from rdkit.Chem.rdmolfiles import MolFromSmiles from sklearn.ensemble import RandomForestRegressor from minedatabase.filters.base_filter import Filter from minedatabase.metabolomics import MetabolomicsDataset, Peak from minedatabase.pickaxe import Pickaxe from minedatabase.utils import neutralise_charges logger = rkl.logger() logger.setLevel(rkl.ERROR) rkrb.DisableLog("rdApp.error") class MetabolomicsFilter(Filter): """Filters out compounds that don't align with a metabolomics dataset. This filter compares the masses (and optionally, predicted retention times) of MINE compounds against peak masses (and retention times) in a metabolomics dataset. Tolerances for mass (in Da) and retention times (in units consistent with dataset) are specified by the user. If a compound's mass (and predicted retention time, if desired) does not match that for any peak in the dataset, it is filtered out. Parameters
def generate_substructures(input_file,
                           output_file="all_test_substructures.txt"):
    """
    Enumerate atom-environment substructures for every structure in a file.

    Each non-blank line of the input holds a SMILES string and an identifier
    separated by a tab. For every structure, all atom environments (every
    radius / root atom combination) are converted to SMILES; those that can
    be re-parsed, sanitized and matched back onto the structure are kept.

    input_file: structure txt file
    output_file: file receiving one line per identifier (sorted), formatted
        as '<sub1>.<sub2>...<TAB><identifier>'; defaults to the historical
        "all_test_substructures.txt"

    Returns the structure info list ([smiles, mol, identifier] per structure)
    and a dictionary mapping each identifier to its list of substructure
    SMILES (['<NA>'] when none was found).

    Raises ValueError when an input SMILES cannot be parsed.
    """
    with open(input_file) as file_object:
        all_lines = file_object.read().split('\n')

    # Create a structure list
    structure_combo_list = []
    for line in all_lines:
        if not line.strip():
            # blank line, typically the one after the final newline
            continue
        fields = line.split('\t')
        structure_id = fields[1]
        structure_mol = Chem.MolFromSmiles(fields[0])
        if structure_mol is None:
            raise ValueError('Invalid SMILES {!r} for structure {!r}'.format(
                fields[0], structure_id))
        structure_smile = Chem.MolToSmiles(structure_mol)
        structure_combo_list.append(
            [structure_smile, structure_mol, structure_id])

    # Candidate fragments are frequently invalid; keep RDKit quiet while
    # they are probed.
    RDLogger.logger().setLevel(RDLogger.CRITICAL)

    official_subs_dict = {}
    for structure_smile, structure_mol, structure_id in structure_combo_list:
        valid_sub_list = []
        seen = set()
        nr_of_atoms = structure_mol.GetNumAtoms()

        # Walk every (radius, root atom) environment of the structure
        for radius in range(nr_of_atoms):
            for atom_idx in range(nr_of_atoms):
                env = Chem.FindAtomEnvironmentOfRadiusN(
                    structure_mol, radius, atom_idx)
                sub_smile = Chem.MolToSmiles(
                    Chem.PathToSubmol(structure_mol, env))
                if (sub_smile == '' or sub_smile == structure_smile
                        or sub_smile in seen):
                    continue
                submol = Chem.MolFromSmiles(sub_smile)
                if submol is None:
                    continue
                try:
                    Chem.SanitizeMol(submol)
                    is_match = structure_mol.HasSubstructMatch(submol)
                except Exception:
                    # best effort: fragments RDKit rejects are ignored
                    continue
                if is_match:
                    seen.add(sub_smile)
                    valid_sub_list.append(sub_smile)

        # Write each substructure per structure in a dictionary
        if valid_sub_list:
            official_subs_dict.setdefault(structure_id,
                                          []).extend(valid_sub_list)
        elif structure_id not in official_subs_dict:
            official_subs_dict[structure_id] = ['<NA>']

    with open(output_file, 'w') as db_file:
        for key in sorted(official_subs_dict):
            db_file.write('.'.join(official_subs_dict[key]) + '\t' + key +
                          '\n')

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('All possible substructures')
    nr_of_subs = sum(len(value) for value in official_subs_dict.values())
    print(nr_of_subs)
    return structure_combo_list, official_subs_dict
def generate_substructures(input_file,
                           output_file="all_perm_test_substructures.txt"):
    """
    Enumerate substructures by combining the characters of each SMILES.

    Each non-blank line of the input holds a SMILES string and an identifier
    separated by a tab. For every structure, all character combinations of
    its canonical SMILES (shorter than the full string) are tried as SMILES;
    those that parse and match the structure are kept in canonical form.
    The number of combinations grows exponentially with the SMILES length.

    input_file: structure txt file
    output_file: file receiving one line per identifier (sorted), formatted
        as '<sub1>.<sub2>...<TAB><identifier>'; defaults to the historical
        "all_perm_test_substructures.txt"

    Returns the structure info list ([smiles, mol, identifier] per structure)
    and a dictionary mapping each identifier to its list of substructure
    SMILES (['<NA>'] when none was found).

    Raises ValueError when an input SMILES cannot be parsed.
    """
    with open(input_file) as file_object:
        all_lines = file_object.read().split('\n')

    # Create a structure list
    structure_combo_list = []
    for line in all_lines:
        if not line.strip():
            # blank line, typically the one after the final newline
            continue
        fields = line.split('\t')
        structure_id = fields[1]
        structure_mol = Chem.MolFromSmiles(fields[0])
        if structure_mol is None:
            raise ValueError('Invalid SMILES {!r} for structure {!r}'.format(
                fields[0], structure_id))
        structure_smile = Chem.MolToSmiles(structure_mol)
        structure_combo_list.append(
            [structure_smile, structure_mol, structure_id])

    # Most candidate strings are not valid SMILES; keep RDKit quiet while
    # they are probed.
    RDLogger.logger().setLevel(RDLogger.CRITICAL)

    official_subs_dict = {}
    for structure_smile, structure_mol, structure_id in structure_combo_list:
        char_list = list(structure_smile)

        # generate all smiles combinations for each length, first occurrence
        # order, without duplicates
        candidates = []
        seen_candidates = set()
        for length in range(len(char_list)):
            for combo in it.combinations(char_list, length):
                # Flatten the tuple through its repr; this also strips
                # quotes, commas and parentheses from the candidate.
                combo = str(combo)
                combo = combo.replace('\'', '').replace(', ', '').replace(
                    '(', '').replace(')', '')
                combo = combo.replace(',', '')
                if combo not in seen_candidates:
                    seen_candidates.add(combo)
                    candidates.append(combo)

        valid_sub_list = []
        seen_valid = set()
        for substructure in candidates:
            if len(substructure) <= 1:
                continue
            mol_sub = Chem.MolFromSmiles(substructure)
            if mol_sub is None:
                continue
            # Check if the substructure can be written back as SMILES
            try:
                canonical_sub = Chem.MolToSmiles(mol_sub)
            except Exception:
                continue
            # Check if the substructure matches the structure
            if (canonical_sub not in seen_valid
                    and structure_mol.HasSubstructMatch(mol_sub)):
                seen_valid.add(canonical_sub)
                valid_sub_list.append(canonical_sub)

        # Write each substructure per structure in a dictionary
        if valid_sub_list:
            official_subs_dict.setdefault(structure_id,
                                          []).extend(valid_sub_list)
        elif structure_id not in official_subs_dict:
            official_subs_dict[structure_id] = ['<NA>']

    with open(output_file, 'w') as db_file:
        for key in sorted(official_subs_dict):
            db_file.write('.'.join(official_subs_dict[key]) + '\t' + key +
                          '\n')

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('All possible substructures')
    nr_of_subs = sum(len(value) for value in official_subs_dict.values())
    print(nr_of_subs)
    return structure_combo_list, official_subs_dict
def fraginc2smi(f, mol, frag_keys, frag_type=None, kekulize=False):
    """Cut the atoms listed in ``f`` out of ``mol`` and describe the fragment.

    Args:
        f: collection of atom indices of ``mol`` to keep; every other atom
            is removed from a working copy.
        mol: RDKit molecule. It is kekulized in place when ``kekulize`` is
            true.
        frag_keys: passed through ``get_frag_fn`` and returned (possibly
            updated by it).
        frag_type: not used in this function body.
        kekulize: kekulize ``mol`` (clearing aromatic flags) before copying.

    Returns:
        Tuple ``(smi, formula_dict, frag_fn, frag_keys)``: the fragment
        SMILES, ``mol2formula`` of the rebuilt fragment, the file stem
        returned by ``get_frag_fn`` and the returned ``frag_keys``.

    Side effects:
        Disables all ``rdApp.*`` RDKit logs and, when ``get_frag_fn`` asks
        for it, writes ``fragment_lookup/<frag_fn>.mol``.

    NOTE(review): the original layout of this function was lost (collapsed
    whitespace); block nesting below is reconstructed and should be checked
    against the upstream file, in particular the spots marked NOTE(review).
    """
    RDLogger.DisableLog('rdApp.*')
    smi = Chem.MolToSmiles(mol)
    #print('{:02d}'.format(f[0]), end=' ')
    if kekulize:
        Chem.Kekulize(mol, clearAromaticFlags=True)
    mw = Chem.RWMol(mol)
    numatoms = mw.GetNumAtoms()
    # Valence of every atom before anything is removed, indexed like mol.
    total_deg = [atom.GetTotalValence() for atom in mw.GetAtoms()]
    # Remove from the highest index down so remaining indices stay valid.
    for i in range(numatoms):
        idx = numatoms-1-i
        if idx not in f:
            mw.RemoveAtom(idx)
    numatoms = mw.GetNumAtoms()
    # NOTE(review): the condition below is commented out in the source; the
    # AddHs line is assumed to be live code -- confirm.
    #if len(Chem.GetSymmSSSR(mw)) < 1:
    mw = Chem.RWMol(Chem.AddHs(mw))
    #print(total_deg)
    #print('a : {}'.format([atom.GetAtomicNum() for atom in mol.GetAtoms()]))
    #print('f : {}'.format(f))
    # Cap kept heavy atoms with hydrogens until they recover the valence
    # they had in the parent molecule.
    for idx, val in enumerate(total_deg):
        if idx in f:
            # position of the original atom idx among the kept atoms
            idx2 = sorted(list(set(f))).index(idx)
            atom = mw.GetAtomWithIdx(idx2)
            if atom.GetAtomicNum() != 1:
                if atom.GetTotalValence() != val:
                    #print('{}({})'.format(idx,atom.GetAtomicNum()))
                    #print('VALENCE DOES NOT MATCH {} -> {}'.format(atom.GetTotalValence(),val))
                    #print('numatoms : {}'.format(mw.GetNumAtoms()))
                    for _ in range(val-atom.GetTotalValence()):
                        idx_h = mw.AddAtom(Chem.Atom(1))
                        #print('added H {}'.format(idx_h))
                        mw.AddBond(idx2,idx_h,Chem.BondType.SINGLE)
                    #print('numatoms : {}'.format(mw.GetNumAtoms()))
                    #sys.exit()
    # Collect the indices of all atoms that are still part of a ring.
    idx_rings = list()
    for r in Chem.GetSymmSSSR(mw):
        for x in r:
            if x not in idx_rings:
                idx_rings.append(x)
    #print(idx_rings)
    if True:
        # Atoms that are no longer in any ring cannot stay aromatic.
        for idx, atom in enumerate(mw.GetAtoms()):
            if idx not in idx_rings:
                atom.SetIsAromatic(False)
    if len(Chem.GetSymmSSSR(mw)) < 1:
        # Ring-free fragment: prefer a kekulized SMILES, fall back to the
        # default writer when kekulization fails.
        try:
            Chem.Kekulize(mw, clearAromaticFlags=True)
            smi = Chem.MolToSmiles(mw,kekuleSmiles=True,canonical=True)
        except:
            print('Cannot kekulize mw')
            smi = Chem.MolToSmiles(mw)
        #smi = Chem.MolToSmiles(mw,kekuleSmiles=True,canonical=True)
    else:
        smi = Chem.MolToSmiles(mw)
    mol = Chem.MolFromSmiles(smi)
    if mol == None:
        # Textual repair of the SMILES, then a second parse attempt.
        if 'n' in smi:
            smi = smi.replace('n','[nH]')
        elif ':O:' in smi:
            smi = smi.replace(':O:','[O]')
        mol = Chem.MolFromSmiles(smi)
    # NOTE(review): this try is assumed to sit outside the `if mol == None`
    # block -- confirm.
    try:
        Chem.Kekulize(mol, clearAromaticFlags=True)
        smi = Chem.MolToSmiles(mol,kekuleSmiles=True)
    except:
        pass
    #smi = Chem.MolToSmiles(mol,kekuleSmiles=True)
    #print(smi, mol)
    #mw = Chem.AddHs(mw)
    mol = Chem.AddHs(mol)
    AllChem.EmbedMolecule(mol)
    #print(smi, mol.GetNumAtoms())
    formula=moldict2hill(mol2formula(mol, incl_H=True))
    smi2=Chem.MolToSmiles(mol,allHsExplicit=True,allBondsExplicit=False)
    mol.SetProp("_Name","{} {} {}".format(formula,smi,smi2))
    if '.' in smi:
        # Disconnected result: register every component separately. The
        # returned frag_fn is the one of the last component, and "_Name" of
        # mol is overwritten on every iteration.
        smi_ls = smi.split('.')
        for s in smi_ls:
            mol_s = Chem.MolFromSmiles(s)
            mol_s = Chem.AddHs(mol_s)
            AllChem.EmbedMolecule(mol_s)
            s2 = Chem.MolToSmiles(mol_s,allHsExplicit=True,allBondsExplicit=False)
            formula=moldict2hill(mol2formula(mol_s, incl_H=True))
            mol.SetProp("_Name","{} {} {}".format(formula,s,s2))
            frag_fn, make_mol, frag_keys=get_frag_fn(formula,s,s2,frag_keys)
            if make_mol:
                with open('fragment_lookup/'+frag_fn+'.mol', "w") as fn:
                    fn.write(Chem.MolToMolBlock(mol_s))
                #print('Written to fragment_lookup/{}.mol'.format(frag_fn))
    else:
        frag_fn, make_mol, frag_keys=get_frag_fn(formula,smi,smi2,frag_keys)
        #print(Chem.MolToMolBlock(mol))
        if make_mol:
            with open('fragment_lookup/'+frag_fn+'.mol', "w") as fn:
                fn.write(Chem.MolToMolBlock(mol))
            #print('Written to fragment_lookup/{}.mol'.format(frag_fn))
    return smi, mol2formula(mol, incl_H=True), frag_fn, frag_keys
"""SyGMa: Systematically Generating potential Metabolites""" from builtins import str import argparse import sygma import sys from rdkit import Chem, RDLogger RDLogger.logger().setLevel(RDLogger.ERROR) import logging logging.basicConfig() logger = logging.getLogger('sygma') def run_sygma(args, file=sys.stdout): logger.setLevel(args.loglevel.upper()) scenario = sygma.Scenario([ [sygma.ruleset['phase1'], args.phase1], [sygma.ruleset['phase2'], args.phase2] ]) parent = Chem.MolFromSmiles(args.parentmol) metabolic_tree = scenario.run(parent) metabolic_tree.calc_scores() if args.outputtype == "sdf": metabolic_tree.write_sdf(file) elif args.outputtype == "smiles": file.write("\n".join([m+" "+str(s) for m,s in metabolic_tree.to_smiles()])+'\n') return None def get_sygma_parser(): ap = argparse.ArgumentParser(description=__doc__) ap.add_argument('--version', action='version', version='%(prog)s ' + sygma.__version__)
def main():
    """
    Parse the command line and run the tether preparation.

    Example usage:
    python -m pipelines.xchem.prepare_tether_2 --smi ../../data/mpro/Mpro-x0387_0.smi --mol ../../data/mpro/Mpro-x0387_0.mol -o TETHERED --max-inputs 500 --chunk-size 100

    Sets the module globals ``chunk_size`` and ``embedding_failures_file``;
    the latter is opened as ``<outfile>_embedding_failures.smi`` and closed
    again before returning, even when the run fails.

    :return:
    """

    global chunk_size
    global embedding_failures_file

    # Suppress basic RDKit logging...
    RDLogger.logger().setLevel(RDLogger.ERROR)
    print('RDKit version:', rdBase.rdkitVersion)

    parser = argparse.ArgumentParser(
        description='Tether prep - prepare candidates for docking')

    parser.add_argument(
        '--smi', help='SMILES containing the expanded candidates for a hit)')
    parser.add_argument('--mol',
                        help='Molfile containing the hit to tether to)')
    parser.add_argument(
        '-o',
        '--outfile',
        default='Tethered',
        help=
        'Base name for results SDF file (will generate something like Tethered_Mpro-x0072_000.sdf)'
    )
    parser.add_argument('--min-ph', type=float, help='The min pH to consider')
    parser.add_argument('--max-ph', type=float, help='The max pH to consider')
    parser.add_argument('-c',
                        '--chunk-size',
                        type=int,
                        default=200,
                        help='Chunk size for files')
    parser.add_argument('--max-inputs',
                        type=int,
                        default=0,
                        help='Max number of molecules to process')
    parser.add_argument('--max-outputs',
                        type=int,
                        default=0,
                        help='Max number of records to output')
    parser.add_argument('--modulus',
                        type=int,
                        default=0,
                        help='Process only mols with this modulus')
    parser.add_argument('--timeout-embed',
                        type=int,
                        default=5,
                        help='Timeout in seconds to apply to limit embedding')

    args = parser.parse_args()
    log("Tether prep args: ", args)

    chunk_size = args.chunk_size
    min_ph = args.min_ph
    max_ph = args.max_ph
    smi = args.smi
    mol = args.mol
    outfile = args.outfile
    max_inputs = args.max_inputs
    max_outputs = args.max_outputs
    modulus = args.modulus
    timout_embed_secs = args.timeout_embed

    embedding_failures_file = open(outfile + '_embedding_failures.smi', 'w')

    # Dimorphite needs to use argparse with its own arguments, not messed up
    # with our arguments, so we store the original args
    orig_sys_argv = sys.argv[:]

    # Remove all the parameters, keeping only the filename (first one) so that
    # dimorphite is unaware of any previous commandline parameters.
    sys.argv = sys.argv[:1]

    try:
        execute(smi,
                mol,
                outfile,
                min_ph=min_ph,
                max_ph=max_ph,
                max_inputs=max_inputs,
                max_outputs=max_outputs,
                modulus=modulus,
                timout_embed_secs=timout_embed_secs)
    finally:
        # Put the real command line back and release the failures file
        # whether or not the run succeeded.
        sys.argv = orig_sys_argv
        embedding_failures_file.close()

    print('Finished')
_splashMessage = """ -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* FeatFinderCLI version %s Copyright (C) 2005 Rational Discovery LLC This software is copyrighted. The software may not be copied, reproduced, translated or reduced to any electronic medium or machine-readable form without the prior written consent of Rational Discovery LLC. -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* """ % _version
from rdkit import Chem
from rdkit.Chem import ChemicalFeatures
from rdkit import RDLogger
logger = RDLogger.logger()
import sys, os
import re
splitExpr = re.compile(r'[ \t,]')


def GetAtomFeatInfo(factory, mol):
    """Collect, per atom, the labels of the features the atom takes part in.

    Returns a list with one slot per atom of ``mol``: ``None`` for atoms
    without any feature, otherwise a list of ``"<family>-<type>"`` strings
    in the order the factory reports the features.
    """
    labels_by_atom = [None] * mol.GetNumAtoms()
    for feature in factory.GetFeaturesForMol(mol):
        for atom_idx in feature.GetAtomIds():
            label = "%s-%s" % (feature.GetFamily(), feature.GetType())
            bucket = labels_by_atom[atom_idx]
            if bucket is None:
                # first feature seen for this atom
                bucket = labels_by_atom[atom_idx] = []
            bucket.append(label)
    return labels_by_atom
from __future__ import print_function
from rdkit import RDLogger
lg = RDLogger.logger()
# NOTE(review): 4 is presumably the numeric value of RDLogger.CRITICAL --
# confirm and prefer the named constant.
lg.setLevel(4)
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem
from rdkit import DataStructs
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import sys
from retrosim.utils.generate_retro_templates import process_an_example
from retrosim.data.get_data import get_data_df, split_data_df
from joblib import Parallel, delayed
import multiprocessing
# Number of CPUs reported by the OS, evaluated once at import time.
num_cores = multiprocessing.cpu_count()
from rdchiral.main import rdchiralRun, rdchiralReaction, rdchiralReactants
import os
# Directory containing this script, and its parent directory.
SCRIPT_ROOT = os.path.dirname(__file__)
PROJ_ROOT = os.path.dirname(SCRIPT_ROOT)

############### DEFINITIONS FOR VALIDATION SEARCH ########################
# Labels of the fingerprint variants and similarity metrics to search over.
all_getfp_labels = ['Morgan2noFeat', 'Morgan3noFeat', 'Morgan2Feat', 'Morgan3Feat']
all_similarity_labels = ['Tanimoto', 'Dice', 'TverskyA', 'TverskyB',]
from chainer.training import extensions, StandardUpdater import chainermn import logging import argparse from distutils.util import strtobool from model import pair_matrix_model import uspto_pre from updater import MyUpdater from evaluator import MyEvaluator from rdkit import RDLogger rdl = RDLogger.logger() rdl.setLevel(RDLogger.CRITICAL) import glob from rdkit import Chem from tqdm import tqdm def read_inference(inference): l = {} with open(inference, 'r') as f: while True: line = f.readline() if not line: break l[int(line.split()[0])] = line.split()[1:]
def test1InchiReadPubChem(self):
    """Round-trip every molecule in ``self.dataset`` through InChI and tally the outcome.

    For each molecule the InChI ``x`` is generated, parsed back, and the
    re-generated InChI ``y`` is compared with ``x``.  Each molecule ends up
    in exactly one of three counters: ``same`` (``x == y``), ``reasonable``
    (a difference matched by one of the tolerated cases below) or ``diff``.
    The final counts are asserted against fixed expected values.
    """
    for f in self.dataset.values():
        same, diff, reasonable = 0, 0, 0
        for m in f:
            if m is None:  # pragma: nocover
                continue
            x = MolToInchi(m)
            y = None
            # Silence the error channel only while parsing the InChI back.
            RDLogger.DisableLog('rdApp.error')
            mol = MolFromInchi(x)
            RDLogger.EnableLog('rdApp.error')
            if mol is not None:
                y = MolToInchi(
                    MolFromSmiles(MolToSmiles(mol, isomericSmiles=True)))
            if y is None:
                # metal involved?
                try:
                    MolToInchi(m, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    _, error = inst.args
                    if 'Metal' in error or \
                            'Charges were rearranged' in error:
                        reasonable += 1
                        continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # RDKit does not like the SMILES? use MolBlock instead
                inchiMol = MolFromInchi(x)
                if inchiMol:
                    rdDepictor.Compute2DCoords(inchiMol)
                    z = MolToInchi(MolFromMolBlock(
                        MolToMolBlock(inchiMol)))
                    if x == z:
                        reasonable += 1
                        continue
                # InChI messed up the radical?
                # Compare the radical-electron count weighted by atomic
                # number between the input and the unsanitized InChI mol.
                unsanitizedInchiMol = MolFromInchi(x, sanitize=False)
                if sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in m.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]) != sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in unsanitizedInchiMol.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]):
                    reasonable += 1
                    continue
                diff += 1
                cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                print(COLOR_GREEN + 'Empty mol for PubChem Compound ' + cid +
                      '\n' + COLOR_RESET)
                continue
            if x != y:
                # if there was warning in the first place, then this is
                # tolerable
                try:
                    MolToInchi(m, treatWarningAsError=True)
                    MolFromInchi(x, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    reasonable += 1
                    continue
                # or if there are big rings
                SanitizeMol(m)
                # NOTE(review): under Python 3 ``filter`` returns an iterator,
                # which is always truthy, so this branch is taken for every
                # molecule that reaches it regardless of ring size.  The
                # expected ``reasonable`` count asserted below may depend on
                # that - confirm before changing it to ``any(...)``.
                if filter(lambda i: i >= 8,
                          [len(r) for r in m.GetRingInfo().AtomRings()]):
                    reasonable += 1
                    continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # or if RDKit loses bond stereo
                s = MolToSmiles(m, True)
                if MolToSmiles(MolFromSmiles(s), True) != s:
                    reasonable += 1
                    continue
                # or if it is RDKit SMILES writer unhappy about the mol
                inchiMol = MolFromInchi(x)
                rdDepictor.Compute2DCoords(inchiMol)
                z = MolToInchi(MolFromMolBlock(MolToMolBlock(inchiMol)))
                if x == z:
                    reasonable += 1
                    continue
                diff += 1
                # NOTE(review): ``cid`` is only assigned in the ``y is None``
                # branch above, which always ends in ``continue``; on this
                # path it is either unbound or left over from an earlier
                # molecule.
                print(COLOR_GREEN + 'Molecule mismatch for PubChem Compound ' +
                      cid + COLOR_RESET)
                print(inchiDiff(x, y))
                print()
            else:
                same += 1
        fmt = "\n{0}InChI read Summary: {1} identical, {2} variance, {3} reasonable variance{4}"
        print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
        self.assertEqual(same, 627)
        self.assertEqual(diff, 0)
        self.assertEqual(reasonable, 554)
""" RDKit interface """ from rdkit import RDLogger import rdkit.Chem as _rd_chem import rdkit.Chem.AllChem as _rd_all_chem import automol.create from automol.convert import _util _LOGGER = RDLogger.logger() _LOGGER.setLevel(RDLogger.ERROR) # geometry def to_geometry(rdm): """ Generate a molecular geometry from an RDKit molecule object. :param rdm: molecule object :type rdm: RDKit molecule object :rtype: automol geometry data structure """ rdm = _rd_chem.AddHs(rdm) atms = rdm.GetAtoms() natms = len(rdm.GetAtoms()) if natms == 1: syms = [str(atms[0].GetSymbol()).title()] xyzs = [(0., 0., 0.)] else: _rd_all_chem.EmbedMolecule(rdm)
def tearDown(self):
    """Turn the RDKit ``rdApp.error`` log channel back on."""
    error_channel = 'rdApp.error'
    RDLogger.EnableLog(error_channel)
def standardize_mols(jobs, mol_counter, num_mols, results, start_time, vendors, max_stereo_isomers, failures,
                     tautomer, verbose):
    """
    This function passes molecules to the standardization functions.

    Jobs are popped from the front of `jobs` until the list is empty. Every molecule of a job is standardized,
    reduced to its largest fragment, optionally tautomer-canonicalized and protonated. Molecules with an exact
    molecular weight below 1200 are stored as `[smiles, '', ..., identifier, ..., '']`, where the identifier sits
    in the column of the job's vendor.

    Parameters
    ----------
    jobs: multiprocessing.manager.list
        A list containing job information as dictionaries. The keys read here are 'vendor', 'sdf_path',
        'mol_start', 'mol_end' and 'identifier_field'.

    mol_counter: multiprocessing.manager.value
        A counter keeping track of processed molecules.

    num_mols: int
        Total number of molecules to be processed.

    results: multiprocessing.manager.list
        A list containing lists describing the processed molecules.

    start_time: float
        Starting time of molecule processing.

    vendors: list
        List of vendors.

    max_stereo_isomers: int
        Maximal number of stereo isomers to generate per molecule. If not positive, the molecule is stored
        without stereo isomer enumeration.

    failures: list
        Collects one space-separated string ('standardize_error', smiles, vendor, identifier) per molecule
        that could not be standardized.

    tautomer: bool
        If tautomers should be canonicalized.

    verbose : bool
        If RDKit warning should be displayed.
    """
    if not verbose:
        RDLogger.DisableLog('rdApp.*')
    job = 'initiate'
    processed_mols = []
    while job is not None:
        # NOTE(review): the IndexError handler below is what ends the loop once `jobs` is empty, but it also
        # covers the whole job body, so an IndexError raised while reading molecules ends this worker too.
        try:
            job = jobs.pop(0)
            vendor_position = vendors.index(job['vendor'])
            supplier = Chem.SDMolSupplier(job['sdf_path'])
            for mol_id in range(job['mol_start'], job['mol_end'] + 1):
                mol = supplier[mol_id]
                if job['identifier_field'] == 'None':
                    identifier = 'unknown'
                else:
                    try:
                        identifier = mol.GetProp(job['identifier_field'])
                    except (AttributeError, KeyError):
                        # AttributeError: the supplier returned None for an unreadable record.
                        # KeyError: the molecule is fine but lacks the requested property.
                        identifier = 'unknown'
                try:
                    # generate smiles for error catching
                    smiles = 'unknown'
                    smiles = Chem.MolToSmiles(mol)
                    # default standardization from molvs
                    mol = Standardizer().standardize(mol)
                    # choose largest fragment
                    mol = LargestFragmentChooser().choose(mol)
                    # canonicalize tautomer
                    if tautomer:
                        mol = TautomerCanonicalizer().canonicalize(mol)
                    # protonate mol
                    mol = protonate_mol(mol)
                    # molecular weight will not change anymore
                    if ExactMolWt(mol) < 1200:
                        # enumerate stereo isomers and append mols
                        if max_stereo_isomers > 0:
                            isomers = enumerate_stereo_isomers(mol, max_stereo_isomers)
                        else:
                            isomers = [mol]
                        for isomer in isomers:
                            mol_as_list = [Chem.MolToSmiles(isomer)] + [''] * len(vendors)
                            mol_as_list[1 + vendor_position] = identifier
                            processed_mols.append(mol_as_list)
                except Exception:
                    # Best effort: record the failure and carry on with the next molecule, but let
                    # KeyboardInterrupt/SystemExit propagate.
                    failures.append(' '.join(['standardize_error', smiles, job['vendor'], identifier]))
                with mol_counter.get_lock():
                    mol_counter.value += 1
                    # Reported while the lock is held so that progress and ETA use the same counter value.
                    update_progress(mol_counter.value / num_mols, 'Progress of standardization',
                                    ((time.time() - start_time) / mol_counter.value) *
                                    (num_mols - mol_counter.value))
        except IndexError:
            job = None
    results += processed_mols
    return
import os.path as osp from tqdm import tqdm import torch import torch.nn.functional as F from torch_scatter import scatter from torch_geometric.data import (InMemoryDataset, download_url, extract_zip, Data) try: import rdkit from rdkit import Chem from rdkit.Chem.rdchem import HybridizationType from rdkit.Chem.rdchem import BondType as BT from rdkit import RDLogger RDLogger.DisableLog('rdApp.*') except ImportError: rdkit = None HAR2EV = 27.2113825435 KCALMOL2EV = 0.04336414 conversion = torch.tensor([ 1., 1., HAR2EV, HAR2EV, HAR2EV, 1., HAR2EV, HAR2EV, HAR2EV, HAR2EV, HAR2EV, 1., KCALMOL2EV, KCALMOL2EV, KCALMOL2EV, KCALMOL2EV, 1., 1., 1. ]) atomrefs = { 6: [0., 0., 0., 0., 0.], 7: [ -13.61312172, -1029.86312267, -1485.30251237, -2042.61123593,
GetTopologicalTorsionFingerprint from rdkit.Chem.Pharm2D import Gobbi_Pharm2D from rdkit.Chem.Pharm2D.Generate import Gen2DFingerprint from rdkit.Chem.rdReducedGraphs import GetErGFingerprint # All available similarities in RDKit from rdkit.DataStructs.cDataStructs import TanimotoSimilarity, \ DiceSimilarity, CosineSimilarity, SokalSimilarity, RusselSimilarity, \ RogotGoldbergSimilarity, AllBitSimilarity, KulczynskiSimilarity, \ McConnaugheySimilarity, AsymmetricSimilarity, BraunBlanquetSimilarity, \ TverskySimilarity from torch_geometric.data import Data # Suppress unnecessary RDkit warnings and errors RDLogger.logger().setLevel(RDLogger.CRITICAL) logger = logging.getLogger(__name__) # Tokenization dictionaries ################################################### # Special tokens for meta token SPECIAL_TOKEN_DICT = { 'SOS': 0, # Start of the sentence 'UNK': 128, # Unknown atoms 'MSK': 129, # Masked tokens/atoms for prediction 'EOS': 254, # End of the sentence 'PAD': 255, # Padding } # High frequency/occurrence atoms from PCBA ATOM_TOKEN_DICT = { 'C': 6,
# @@ All Rights Reserved @@ # This file is part of the RDKit. # The contents are covered by the terms of the BSD license # which is included in the file license.txt, found at the root # of the RDKit source tree. # from rdkit import Chem from rdkit.Chem import AllChem from rdkit.Chem import Lipinski,Descriptors,Crippen from rdkit.Dbase.DbConnection import DbConnect from rdkit.Dbase import DbModule import re #set up the logger: import rdkit.RDLogger as logging logger = logging.logger() logger.setLevel(logging.INFO) def ProcessMol(mol,typeConversions,globalProps,nDone,nameProp='_Name',nameCol='compound_id', redraw=False,keepHs=False, skipProps=False,addComputedProps=False, skipSmiles=False, uniqNames=None,namesSeen=None): if not mol: raise ValueError('no molecule') if keepHs: Chem.SanitizeMol(mol) try: nm = mol.GetProp(nameProp) except KeyError: nm = None
def quiet(self):
    """Keep the RDKit logger on ``self.rdk_lg`` and raise its threshold to CRITICAL."""
    # Silence everything but critical errors.
    rdkit_logger = RDLogger.logger()
    self.rdk_lg = rdkit_logger
    rdkit_logger.setLevel(RDLogger.CRITICAL)
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # import unittest import os, sys, copy import pickle from rdkit import rdBase from rdkit import Chem from rdkit.Chem.rdRGroupDecomposition import RGroupDecompose, RGroupDecomposition, RGroupDecompositionParameters from collections import OrderedDict # the RGD code can generate a lot of warnings. disable them from rdkit import RDLogger RDLogger.DisableLog("rdApp.warning") class TestCase(unittest.TestCase): def test_multicores(self): cores_smi_easy = OrderedDict() cores_smi_hard = OrderedDict() #cores_smi_easy['cephem'] = Chem.MolFromSmiles('O=C1C([1*])[C@@H]2N1C(C(O)=O)=C([3*])CS2') cores_smi_easy['cephem'] = Chem.MolFromSmarts('O=C1C([*:1])C2N1C(C(O)=O)=C([*:3])CS2') cores_smi_hard['cephem'] = Chem.MolFromSmarts('O=C1C([2*])([1*])[C@@H]2N1C(C(O)=O)=C([3*])CS2') #cores_smi_easy['carbacephem'] = Chem.MolFromSmiles('O=C1C([1*])[C@@H]2N1C(C(O)=O)=C([3*])CC2') cores_smi_easy['carbacephem'] = Chem.MolFromSmarts('O=C1C([1*])C2N1C(C(O)=O)=C([3*])CC2') cores_smi_hard['carbacephem'] = Chem.MolFromSmarts(
def create_scaffold_split(df, seed, frac, entity):
    """Split ``df`` into train/valid/test frames so that rows sharing a Murcko scaffold stay together.

    Args:
        df: DataFrame to split; rows are selected by position.
        seed: seed for the shuffling of the scaffold groups.
        frac: three fractions indexed as ``frac[0]`` (train), ``frac[1]``
            (valid) and ``frac[2]`` (test).  When ``frac[2] == 0`` everything
            that does not fit into train goes to valid.
        entity: name of the column in ``df`` holding the SMILES strings.

    Returns:
        dict with keys ``'train'``, ``'valid'`` and ``'test'``, each a
        DataFrame with a reset index.  Rows whose SMILES could not be turned
        into a scaffold are left out of all three.

    Raises:
        ImportError: if rdkit cannot be imported.
    """
    # reference: https://github.com/chemprop/chemprop/blob/master/chemprop/data/scaffold.py
    try:
        from rdkit import Chem
        from rdkit.Chem.Scaffolds import MurckoScaffold
        from rdkit import RDLogger
        RDLogger.DisableLog('rdApp.*')
    except ImportError:
        raise ImportError(
            "Please install rdkit by 'conda install -c conda-forge rdkit'! ")
    from tqdm import tqdm
    from random import Random
    from collections import defaultdict

    rng = Random(seed)

    s = df[entity].values
    scaffolds = defaultdict(set)

    error_smiles = 0
    for i, smiles in tqdm(enumerate(s), total=len(s)):
        try:
            scaffold = MurckoScaffold.MurckoScaffoldSmiles(
                mol=Chem.MolFromSmiles(smiles), includeChirality=False)
            scaffolds[scaffold].add(i)
        except Exception:
            # Unparsable SMILES are skipped on purpose; anything that is not
            # an ordinary Exception (e.g. KeyboardInterrupt) still propagates.
            print_sys(smiles + ' returns RDKit error and is thus omitted...')
            error_smiles += 1

    train, val, test = [], [], []
    usable = len(df) - error_smiles
    train_size = int(usable * frac[0])
    val_size = int(usable * frac[1])
    test_size = usable - train_size - val_size
    train_scaffold_count, val_scaffold_count, test_scaffold_count = 0, 0, 0

    # Scaffold groups larger than half of the valid/test budget are placed
    # first so that they are considered before the small groups.
    big_index_sets = []
    small_index_sets = []
    for index_set in scaffolds.values():
        if len(index_set) > val_size / 2 or len(index_set) > test_size / 2:
            big_index_sets.append(index_set)
        else:
            small_index_sets.append(index_set)
    # ``rng`` has not been used since it was seeded, so no re-seed is needed.
    rng.shuffle(big_index_sets)
    rng.shuffle(small_index_sets)
    index_sets = big_index_sets + small_index_sets

    if frac[2] == 0:
        for index_set in index_sets:
            if len(train) + len(index_set) <= train_size:
                train += index_set
                train_scaffold_count += 1
            else:
                val += index_set
                val_scaffold_count += 1
    else:
        for index_set in index_sets:
            if len(train) + len(index_set) <= train_size:
                train += index_set
                train_scaffold_count += 1
            elif len(val) + len(index_set) <= val_size:
                val += index_set
                val_scaffold_count += 1
            else:
                test += index_set
                test_scaffold_count += 1

    return {
        'train': df.iloc[train].reset_index(drop=True),
        'valid': df.iloc[val].reset_index(drop=True),
        'test': df.iloc[test].reset_index(drop=True)
    }
# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ##### MolObjectHandling.py import __future__ import rdkit from rdkit import Chem # Disable the unnecessary RDKit warnings from rdkit import RDLogger RDLogger.DisableLog("rdApp.*") def check_sanitization(mol): """ Given a rdkit.Chem.rdchem.Mol this script will sanitize the molecule. It will be done using a series of try/except statements so that if it fails it will return a None rather than causing the outer script to fail. Nitrogen Fixing step occurs here to correct for a common RDKit valence error in which Nitrogens with with 4 bonds have the wrong formal charge by setting it to -1. This can be a place to add additional correcting features for any discovered common sanitation failures. Handled here so there are no problems later. Inputs:
def UFFConstrainedOptimize(mol, moving_atoms=None, fixed_atoms=None,
                           cutoff=5., verbose=False):
    """Minimize a molecule using UFF forcefield with a set of moving/fixed
    atoms. If both moving and fixed atoms are provided, fixed_atoms parameter
    will be ignored. The minimization is done in-place (without copying
    molecule).

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        Molecule to be minimized.
    moving_atoms: array-like (default=None)
        Indices of freely moving atoms. If None, fixed atoms are assigned
        based on `fixed_atoms`. These two arguments are mutually exclusive.
    fixed_atoms: array-like (default=None)
        Indices of fixed atoms. If None, fixed atoms are assigned based on
        `moving_atoms`. These two arguments are mutually exclusive.
    cutoff: float (default=5.)
        Distance cutoff for the UFF minimization
    verbose: bool (default=False)
        If False, the RDKit logger is set to CRITICAL for the duration of the
        call and set to INFO afterwards.

    Returns
    -------
    mol: rdkit.Chem.rdchem.Mol
        Molecule with mimimized `moving_atoms`

    Raises
    ------
    ValueError
        If neither `moving_atoms` nor `fixed_atoms` is given.
    """
    # Validate before touching the process-wide RDKit logger, so that a bad
    # call cannot leave it silenced.
    if moving_atoms is None and fixed_atoms is None:
        raise ValueError('You must supply at least one set of moving/fixed '
                         'atoms.')

    logger = RDLogger.logger()
    if not verbose:
        logger.setLevel(RDLogger.CRITICAL)
    try:
        if moving_atoms is None:
            all_atoms = set(range(mol.GetNumAtoms()))
            moving_atoms = list(all_atoms.difference(fixed_atoms))
        # When `moving_atoms` is given, `fixed_atoms` is not needed: every
        # sub-molecule atom outside `moving_atoms` is pinned below.
        moving_set = set(moving_atoms)

        # extract submolecules containing atoms within cutoff
        mol_conf = mol.GetConformer(-1)
        pos = np.array([mol_conf.GetAtomPosition(i)
                        for i in range(mol_conf.GetNumAtoms())])
        mask = (cdist(pos, pos[moving_atoms]) <= cutoff).any(axis=1)
        amap = np.where(mask)[0].tolist()

        # expand to whole residues
        amap_set = set(amap)
        pocket_residues = OrderedDict()
        protein_residues = GetResidues(mol)
        for res_id in protein_residues.keys():
            if any(res_aix in amap_set
                   for res_aix in protein_residues[res_id]):
                pocket_residues[res_id] = protein_residues[res_id]
        amap = list(chain(*pocket_residues.values()))

        # TODO: above certain threshold its making a submolis redundant
        submol = AtomListToSubMol(mol, amap, includeConformer=True)

        # initialize ring info
        Chem.GetSSSR(submol)
        ff = UFFGetMoleculeForceField(submol, vdwThresh=cutoff,
                                      ignoreInterfragInteractions=False)
        for submol_id, atom_id in enumerate(amap):
            if atom_id not in moving_set:
                ff.AddFixedPoint(submol_id)

        ff.Initialize()
        ff.Minimize(energyTol=1e-4, forceTol=1e-3, maxIts=2000)

        # get the positions backbone
        conf = mol.GetConformer(-1)
        submol_conf = submol.GetConformer(-1)
        for submol_idx, mol_idx in enumerate(amap):
            conf.SetAtomPosition(mol_idx,
                                 submol_conf.GetAtomPosition(submol_idx))
    finally:
        # FIXME: there's no getLevel method, so we set to default level
        # Done in `finally` so the level is also reset when an error occurs.
        if not verbose:
            logger.setLevel(RDLogger.INFO)
    return mol
# Output is either s fixed name in an output directory # or a prefixed filename (without an output directory) if args.output_is_prefix: output_filename = '{}.{}.gz'.format(args.output, output_filename) else: # Create the output directory if os.path.exists(args.output): logger.error('Output exists') sys.exit(1) os.mkdir(args.output) os.chmod(args.output, 0o777) output_filename = os.path.join(args.output, '{}.gz'.format(output_filename)) # Suppress basic RDKit logging... RDLogger.logger().setLevel(RDLogger.ERROR) # Report any limiting...? if args.limit: logger.warning('Limiting processing to first {:,} molecules'.format( args.limit)) # Before we open the output file # get a lit of all the input files (the prefix may be the same) # so we don't want our file in the list of files to be processed) data_files = glob.glob('{}/{}*.gz'.format(args.vendor_dir, args.vendor_prefix)) # Open the file we'll write the standardised data set to. # A text, tab-separated file. logger.info('Writing %s...', output_filename)
def split_sdf(file_name, outdir="data/"):
    """Load ligand records from an SDF or CSV file and write per-ligand files.

    The loader is chosen by substring: ``".sdf"`` in ``file_name`` uses
    ``PandasTools.LoadSDF``; ``".csv"`` in ``file_name`` reads the file as
    comma-separated text, skips the first and the last line, and stores every
    remaining line (as a list of fields) in a single ``'PDB ID'`` column.

    Args:
        file_name: path of the input file.
        outdir: directory prefix; ligand files are written to
            ``outdir + "/lig/lig<index>.lig"``.

    Returns:
        ``(uligs, prots_ligs)``: ``uligs`` maps the row index to the selected
        row, ``prots_ligs`` maps every PDB id to the list of row indices that
        mention it.
    """
    if ".sdf" in file_name:
        print("Loading sdf.")
        rdk_lg = RDLogger.logger()
        rdk_lg.setLevel(RDLogger.CRITICAL)
        # Uses the ``file_name`` parameter; the previous code referenced the
        # name ``sdf_file_name``, which is not defined in this function.
        df = PandasTools.LoadSDF(file_name,
                                 smilesName='SMILES',
                                 molColName='Molecule',
                                 includeFingerprints=False)
    if ".csv" in file_name:
        print("Loading CSV.")
        # Parse the CSV file.
        rdk_lg = RDLogger.logger()
        rdk_lg.setLevel(RDLogger.CRITICAL)
        with open(file_name, "r") as csvf:
            pdb_list = [
                list(line.split(",")) for line in csvf.read().split("\n")
            ]
        # First line is the header, last entry is the text after the final
        # newline; everything in between is one record per line.
        records = pdb_list[1:-1]
        for pdb in records:
            print("pdb=", pdb)
        # Build the frame in one step.  Appending a dict row by row with
        # ``ignore_index=False`` is rejected by ``DataFrame.append``, and that
        # method no longer exists in pandas 2.x.
        df = pd.DataFrame({'PDB ID': pd.Series(records, dtype=object)})
    print("Raw cols = ", [str(x) for x in df.columns])

    # Select only the needed columns and merge the two PDB cols.
    # NOTE(review): a frame loaded from SDF is not given a 'PDB ID' column in
    # this function, so this selection presumably fails for .sdf input - confirm.
    df_list = ['PDB ID']
    df_selected = df[df_list].copy()
    print("Selected cols = ", [str(x) for x in df_selected.columns])

    # Drop any rows with missing data.
    df_selected = df_selected.replace('', np.nan)
    df_selected = df_selected.replace(',', np.nan)
    df_selected = df_selected.dropna()
    r_rows = len(df.index)
    s_rows = len(df_selected.index)
    print("Raw rows = ", r_rows)
    print("Sel rows = ", s_rows)
    print("Keep pct = %.2f%s" % (((float(s_rows) / float(r_rows)) * 100.0),
                                 '%'))

    # Build ligand dictionary and a protein dictionary.
    print("Building protein-ligand dictionary.")
    uligs = {}
    prots_ligs = {}
    for lndx, row in enumerate(df_selected.values):
        pdbs = row[0][0].split(',')
        for pdb in pdbs:
            if pdb == '':
                continue
            if pdb not in prots_ligs:
                prots_ligs[pdb] = []
            prots_ligs[pdb] += [lndx]
        uligs[lndx] = row
    print("Unique proteins = ", len(prots_ligs))
    print("Writing per-ligand output files.")

    # Write out .lig files and return the data dictionaries.
    for key in uligs:
        ndx = str(key)
        lig = uligs[key]
        # NOTE(review): rows come from a one-column selection (``df_list``),
        # so ``lig[2]`` is out of range for every row - confirm which column
        # is meant to hold the molecule.
        print("writing ligand indexed by ", lig[2], "ndx=", ndx)
        write_lig_file(lig[2], outdir + "/lig/lig%s.lig" % ndx)
    return uligs, prots_ligs
def process(self):
    """Build the collated dataset and save it to ``self.processed_paths[0]``.

    Without rdkit, ``self.raw_paths[0]`` is loaded with ``torch.load`` as a
    list of dictionaries that are turned into ``Data`` objects directly.
    With rdkit, molecules are read from the SD file at ``self.raw_paths[0]``,
    targets from ``self.raw_paths[1]`` and the indices to skip from
    ``self.raw_paths[2]``.  ``self.pre_filter`` and ``self.pre_transform``
    are applied in both cases when set.
    """
    try:
        import rdkit
        from rdkit import Chem, RDLogger
        from rdkit.Chem.rdchem import BondType as BT
        from rdkit.Chem.rdchem import HybridizationType
        RDLogger.DisableLog('rdApp.*')
    except ImportError:
        rdkit = None

    if rdkit is None:
        print(("Using a pre-processed version of the dataset. Please "
               "install 'rdkit' to alternatively process the raw data."),
              file=sys.stderr)

        data_list = torch.load(self.raw_paths[0])
        data_list = [Data(**data_dict) for data_dict in data_list]

        if self.pre_filter is not None:
            data_list = [d for d in data_list if self.pre_filter(d)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(d) for d in data_list]

        torch.save(self.collate(data_list), self.processed_paths[0])
        return

    types = {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4}
    bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}

    with open(self.raw_paths[1], 'r') as f:
        # Drop the header line and the empty string after the last newline.
        target = f.read().split('\n')[1:-1]
    target = [[float(x) for x in line.split(',')[1:20]] for line in target]
    target = torch.tensor(target, dtype=torch.float)
    # Move the first three target columns to the end before unit conversion.
    target = torch.cat([target[:, 3:], target[:, :3]], dim=-1)
    target = target * conversion.view(1, -1)

    with open(self.raw_paths[2], 'r') as f:
        # A set, because membership is tested once per molecule below.
        # The listed ids are shifted by one to match the supplier index.
        skip = {int(x.split()[0]) - 1 for x in f.read().split('\n')[9:-2]}

    suppl = Chem.SDMolSupplier(self.raw_paths[0], removeHs=False,
                               sanitize=False)

    data_list = []
    for i, mol in enumerate(tqdm(suppl)):
        if i in skip:
            continue

        N = mol.GetNumAtoms()

        # Coordinates are parsed from the record text: N lines starting at
        # line index 4, first three whitespace-separated fields of each.
        pos = suppl.GetItemText(i).split('\n')[4:4 + N]
        pos = [[float(x) for x in line.split()[:3]] for line in pos]
        pos = torch.tensor(pos, dtype=torch.float)

        type_idx = []
        atomic_number = []
        aromatic = []
        sp = []
        sp2 = []
        sp3 = []
        for atom in mol.GetAtoms():
            type_idx.append(types[atom.GetSymbol()])
            atomic_number.append(atom.GetAtomicNum())
            aromatic.append(1 if atom.GetIsAromatic() else 0)
            hybridization = atom.GetHybridization()
            sp.append(1 if hybridization == HybridizationType.SP else 0)
            sp2.append(1 if hybridization == HybridizationType.SP2 else 0)
            sp3.append(1 if hybridization == HybridizationType.SP3 else 0)

        z = torch.tensor(atomic_number, dtype=torch.long)

        # Every bond is stored twice, once per direction.
        row, col, edge_type = [], [], []
        for bond in mol.GetBonds():
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            row += [start, end]
            col += [end, start]
            edge_type += 2 * [bonds[bond.GetBondType()]]

        edge_index = torch.tensor([row, col], dtype=torch.long)
        edge_type = torch.tensor(edge_type, dtype=torch.long)
        edge_attr = F.one_hot(edge_type,
                              num_classes=len(bonds)).to(torch.float)

        # Sort edges by (source, target).
        perm = (edge_index[0] * N + edge_index[1]).argsort()
        edge_index = edge_index[:, perm]
        edge_type = edge_type[perm]
        edge_attr = edge_attr[perm]

        # Number of hydrogen neighbours per atom.
        row, col = edge_index
        hs = (z == 1).to(torch.float)
        num_hs = scatter(hs[row], col, dim_size=N).tolist()

        x1 = F.one_hot(torch.tensor(type_idx), num_classes=len(types))
        x2 = torch.tensor([atomic_number, aromatic, sp, sp2, sp3, num_hs],
                          dtype=torch.float).t().contiguous()
        x = torch.cat([x1.to(torch.float), x2], dim=-1)

        y = target[i].unsqueeze(0)
        name = mol.GetProp('_Name')

        data = Data(x=x, z=z, pos=pos, edge_index=edge_index,
                    edge_attr=edge_attr, y=y, name=name, idx=i)

        if self.pre_filter is not None and not self.pre_filter(data):
            continue
        if self.pre_transform is not None:
            data = self.pre_transform(data)

        data_list.append(data)

    torch.save(self.collate(data_list), self.processed_paths[0])
def split_pdb_with_sdf(pdb_id, sdf_file_name, outdir="data/"):
    """Load the compounds of an SDF file and write one ``.lig`` file per kept ligand.

    Every compound is associated with the single structure ``pdb_id``.
    Ligands are kept when their atom count lies strictly between
    ``MIN_LIG_ATMS`` and ``MAX_LIG_ATMS`` and their cleaned name is not
    whitespace-only.  The module-level ``MOL_TO_NDX`` mapping is updated
    with ``str(row index) -> cleaned ligand name`` for each kept ligand.

    Args:
        pdb_id: PDB identifier assigned to every compound.
        sdf_file_name: path of the SDF file to load.
        outdir: directory prefix; ligand files are written to
            ``outdir + "/lig/lig<name>.lig"``.

    Returns:
        ``(uligs, return_protligs)``: ``uligs`` maps row index to the selected
        row, ``return_protligs`` maps ``pdb_id`` to the list of row indices
        whose ligand file was written.
    """

    def _clean_ligname(raw):
        # Strip characters that are unwanted in a file name and keep only
        # the first line of the value.
        cleaned = str(raw).replace(';', '').replace(' ', '-').replace(
            '%', '').replace('/', '').replace('?', '').split('\n')[0]
        # Raw string: same pattern as before, without invalid escape sequences.
        return re.sub(r'[^A-Za-z0-9]+%\/.\n.', '', cleaned)

    print("Loading sdf from ", sdf_file_name)
    rdk_lg = RDLogger.logger()
    rdk_lg.setLevel(RDLogger.CRITICAL)
    df = PandasTools.LoadSDF(sdf_file_name,
                             smilesName='SMILES',
                             molColName='Molecule',
                             includeFingerprints=False,
                             embedProps=True)
    print("Available SDF cols = ", [str(x) for x in df.columns])
    PandasTools.AddMoleculeColumnToFrame(df,
                                         'SMILES',
                                         'Molecule',
                                         includeFingerprints=False)
    df.insert(column="PDB ID", value=pdb_id, loc=0)

    # Select only the needed columns.
    df_sdf_list = [
        'PDB ID', 'BindingDB Ligand Name', 'ChEMBL ID of Ligand', 'Molecule'
    ]
    df_selected = df[df_sdf_list].copy()
    print("Selected SDF cols = ", [str(x) for x in df_selected.columns])
    print("Loading compounds for test against PDB ID = ", pdb_id)

    for name, mol in zip(df['ChEMBL ID of Ligand'], df_selected['Molecule']):
        # NOTE(review): the middle test is a plain substring check for the
        # literal text 'CHEMBL[0-9]*', not a regex match, so the body is
        # presumably never reached.  ``ligname`` is also not assigned yet at
        # this point - confirm the intended filter before enabling it.
        if ((mol.GetNumAtoms() < MAX_LIG_ATMS) and
            ('CHEMBL[0-9]*' in str(name) and not ligname.isspace())):
            print("pdb=", pdb_id, ",Molecule ID = ", name)
            df_selected = df_selected.append({'PDB ID': pdb_id},
                                             ignore_index=True)

    df_selected = df_selected.drop_duplicates()
    df_selected = df_selected.dropna(inplace=False)
    print("Selected PDB cols = ", [str(x) for x in df_selected.columns])

    # Drop any rows with missing data.
    df_selected = df_selected.replace('', np.nan)
    df_selected = df_selected.replace(',', np.nan)
    df_selected = df_selected.dropna()
    r_rows = len(df.index)
    s_rows = len(df_selected.index)
    print("Raw rows = ", r_rows)
    print("Sel rows = ", s_rows)
    print("Keep pct = %.2f%s" % (((float(s_rows) / float(r_rows)) * 100.0),
                                 '%'))

    # Build ligand dictionary and a protein dictionary.
    print("Building protein-ligand dictionary.")
    uligs = {}
    prots_ligs = {}
    for lndx, row in enumerate(df_selected.values):
        print("prot_lig row=", row)
        ligname = _clean_ligname(row[2])
        pdbs = [pdb_id]
        for pdb in pdbs:
            if pdb == '':
                continue
            if pdb not in prots_ligs:
                prots_ligs[pdb] = []
            if (row[3].GetNumAtoms() < MAX_LIG_ATMS and
                    not (ligname.isspace()) and
                (row[3].GetNumAtoms() > MIN_LIG_ATMS)):
                prots_ligs[pdb] += [lndx]
                MOL_TO_NDX.update({str(lndx): ligname})
                # NOTE(review): recorded only for ligands that pass the
                # filter, so the MOL_TO_NDX lookup below always has a key -
                # confirm this matches the intended nesting.
                uligs[lndx] = row
    print("Unique proteins = ", len(prots_ligs))
    print("Writing per-ligand output files.")

    # Write out .lig files and return the data dictionaries.
    return_uligs = []
    return_protligs = {}
    return_protligs[pdb_id] = []
    for key in uligs:
        ndx = str(key)
        lig = uligs[key]
        ligname = _clean_ligname(lig[2])
        # MOL_TO_NDX is module-level state; only write the ligand when the
        # name stored for this index is the one just computed.
        if (ligname == MOL_TO_NDX[ndx]):
            return_uligs.append(lig)
            return_protligs[pdb_id] += [int(ndx)]
            print("lig name=", ligname)
            print("lig # atoms: ", lig[3].GetNumAtoms())
            print("lig=", lig)
            print("writing ligand for PDB=:", lig[0], ", #atoms:",
                  lig[3].GetNumAtoms(), ", name: ", ligname, "ndx name",
                  MOL_TO_NDX[str(ndx)], ", @ index=", ndx)
            write_lig_file(lig[3], outdir + "/lig/lig%s.lig" % ligname)
    # NOTE(review): ``return_uligs`` is filled but ``uligs`` is what gets
    # returned - confirm which one callers expect.
    return uligs, return_protligs
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # INCHI_AVAILABLE = True import rdinchi import logging from rdkit import RDLogger logger = RDLogger.logger() logLevelToLogFunctionLookup = { logging.INFO : logger.info, logging.DEBUG : logger.debug, logging.WARNING : logger.warning, logging.CRITICAL : logger.critical, logging.ERROR : logger.error } class InchiReadWriteError(Exception): pass def MolFromInchi(inchi, sanitize=True, removeHs=True, logLevel=None, treatWarningAsError=False): """Construct a molecule from a InChI string
from argparse import ArgumentParser
from molgym.agents.moldqn import DQNFinalState
from molgym.agents.preprocessing import MorganFingerprints
from molgym.envs.actions import MoleculeActions
from molgym.envs.rewards import RewardFunction
from molgym.envs.simple import Molecule
from molgym.envs.rewards.mpnn import MPNNReward
from molgym.utils.conversions import convert_nx_to_smiles, convert_smiles_to_nx
from molgym.mpnn.layers import custom_objects
from tensorflow.keras.models import load_model

# Set up the logger
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger('RL-Logger')
logger.setLevel(logging.DEBUG)
rdkit_logger = RDLogger.logger()
rdkit_logger.setLevel(RDLogger.CRITICAL)


def get_platform_info():
    """Collect a description of the machine and interpreter running this process.

    Returns a dict with the keys ``processor``, ``python_version``,
    ``python_compiler``, ``hostname``, ``os``, ``cpu_name`` and ``n_cores``,
    filled from the ``platform`` module and ``os.cpu_count()``.
    """
    info = dict(
        processor=platform.machine(),
        python_version=platform.python_version(),
        python_compiler=platform.python_compiler(),
        hostname=platform.node(),
    )
    info['os'] = platform.platform()
    info['cpu_name'] = platform.processor()
    info['n_cores'] = os.cpu_count()
    return info