Example #1
0
def findDuplicates (sdf, name, out):
    """Report the molecules of an SDFile that share the same truncated InChIKey.

    Every record of `sdf` that RDKit can read is converted to an InChI, an
    InChIKey and a SMILES. Two records are reported as duplicates when their
    InChIKeys, with the last three characters removed, are equal. Records
    for which any of these conversions fails are skipped.

    Parameters
    ----------
    sdf : str
        Path of the SDFile to analyse.
    name : str
        SDFile property holding the molecule name. When a record lacks it,
        the name 'mol' followed by the zero-padded record number is used.
    out : str
        Path of the tab-separated report. It receives a header line and then
        one line per duplicate pair: both list positions, both names and
        both SMILES.
    """

    lg = RDLogger.logger()
    lg.setLevel(RDLogger.ERROR)

    suppl = Chem.SDMolSupplier(sdf,removeHs=False, sanitize=False)

    idlist = []
    nmlist = []
    smlist = []

    print('reading SDFile...')
    # counter numbers every record, including the unreadable ones, so that
    # the fallback names match the position of the record in the file.
    for counter, mol in enumerate(suppl, 1):

        if mol is None:
            continue

        # 'except Exception' (not a bare except) so that Ctrl-C still stops a
        # long run. The key is sliced inside the try block as well: a missing
        # key then skips the record instead of aborting the whole analysis.
        try:
            inchi = Chem.MolToInchi(mol)
            inkey = Chem.InchiToInchiKey(inchi)
            smile = Chem.MolToSmiles(mol)
            key = inkey[:-3]
        except Exception:
            continue

        try:
            ni = mol.GetProp(name)
        except Exception:
            ni = 'mol%0.8d' %counter

        idlist.append(key)
        nmlist.append(ni)
        smlist.append(smile)

    print('analizing duplicates...')

    # Group the list positions by key. rank[i] is the position of record i
    # inside its own group, so the members listed after it are exactly the
    # later records carrying the same key.
    groups = {}
    rank = []
    for idx, key in enumerate(idlist):
        members = groups.setdefault(key, [])
        rank.append(len(members))
        members.append(idx)

    duplicates = 0
    with open(out, 'w+') as fo:
        fo.write('i\tj\tnamei\tnamej\tsmilesi\tsmilesj\n')
        # Pairs are emitted ordered by i and then by j, as an exhaustive
        # comparison of all pairs would produce them.
        for i, key in enumerate(idlist):
            for j in groups[key][rank[i] + 1:]:
                line = '\t'.join((str(i), str(j), nmlist[i], nmlist[j],
                                  smlist[i], smlist[j]))
                fo.write(line+'\n')
                duplicates += 1

    print('\n%d duplicate molecules found' %duplicates)
Example #2
0
    def _get_molecule_database(self, molecule_database_src,
                               molecule_database_src_type):
        """Load molecular database and return it.
        Optionally return features if found in excel / csv file.

        Args:
            molecule_database_src (str):
                Source of molecular information. Can be a folder or a filepath.
                In case a folder is specified, all .pdb files in the folder
                are sequentially read.
                If a text file, each non-blank line is split on whitespace:
                SMILES string (column 1), numeric property (column 2,
                optional).
                If an excel / csv file, columns prefixed with "feature_"
                are read as features ("feature_smiles" holds the SMILES,
                "feature_name" the optional molecule names) and a column
                prefixed with "response_" as the property value.
            molecule_database_src_type (str):
                Type of source. Can be
                ['folder', 'directory', 'text', 'excel', 'csv']
                (case-insensitive).

        Returns:
            (list(Molecule), np.ndarray or None)
                Returns a tuple. First element of tuple is the molecule_database.
                Second element is array of features of shape
                (len(molecule_database), n_features) or None if None found.

        Raises:
            FileNotFoundError: If molecule_database_src_type is not one of
                the supported types.
            UserWarning: If no molecule could be loaded from the source.

        """
        if not self.is_verbose:
            RDLogger.DisableLog('rdApp.*')

        molecule_database = []
        features = None
        src_type = molecule_database_src_type.lower()

        def _append_molecule(label, **molecule_kwargs):
            # Build one Molecule and store it; a molecule that cannot be
            # loaded is skipped (and reported when verbose).
            try:
                molecule_database.append(Molecule(**molecule_kwargs))
            except LoadingError:
                if self.is_verbose:
                    print(f"{label} could not be imported. Skipping")

        if src_type in ["folder", "directory"]:
            if self.is_verbose:
                print(f"Searching for *.pdb files in {molecule_database_src}")
            for molfile in glob(os.path.join(molecule_database_src, "*.pdb")):
                if self.is_verbose:
                    print(f"Loading {molfile}")
                _append_molecule(molfile, mol_src=molfile)

        elif src_type == "text":
            if self.is_verbose:
                print(f"Reading SMILES strings from {molecule_database_src}")
            with open(molecule_database_src, "r") as fp:
                # Blank lines (e.g. trailing newlines) carry no SMILES and
                # would otherwise fail on the column lookup below.
                smiles_data = [line for line in fp if line.strip()]
            for count, line in enumerate(smiles_data):
                # Assumes that the first column contains the smiles string
                line_fields = line.split()
                smile = line_fields[0]
                mol_property_val = None
                if len(line_fields) > 1:
                    mol_property_val = float(line_fields[1])
                if self.is_verbose:
                    print(f"Processing {smile} "
                          f"({count + 1}/"
                          f"{len(smiles_data)})")
                _append_molecule(smile,
                                 mol_smiles=smile,
                                 mol_text=smile,
                                 mol_property_val=mol_property_val)

        elif src_type in ["excel", "csv"]:
            if self.is_verbose:
                print(f"Reading molecules from {molecule_database_src}")
            database_df = (pd.read_excel(molecule_database_src,
                                         engine="openpyxl")
                           if src_type == "excel"
                           else pd.read_csv(molecule_database_src))
            # expects feature columns to be prefixed with feature_
            # e.g. feature_smiles
            feature_cols = [
                column for column in database_df.columns
                if column.split("_")[0] == "feature"
            ]
            database_feature_df = database_df[feature_cols]
            mol_names, mol_smiles, responses = None, None, None
            # Names and SMILES identify the molecule; they are removed from
            # the feature table so that only descriptors remain in it.
            if "feature_name" in feature_cols:
                mol_names = database_feature_df["feature_name"].values.flatten(
                )
                database_feature_df = database_feature_df.drop(
                    ["feature_name"], axis=1)
            if "feature_smiles" in feature_cols:
                mol_smiles = database_df["feature_smiles"].values.flatten()
                database_feature_df = database_feature_df.drop(
                    ["feature_smiles"], axis=1)

            response_col = [
                column for column in database_df.columns
                if column.split("_")[0] == "response"
            ]
            if response_col:
                # currently handles one response
                responses = database_df[response_col].values.flatten()
            # NOTE(review): a file without a "feature_smiles" column leaves
            # mol_smiles as None and the loop below fails with a TypeError;
            # confirm whether a dedicated error is wanted here.
            for mol_id, smile in enumerate(mol_smiles):
                if self.is_verbose:
                    print(f"Processing {smile} "
                          f"({mol_id + 1}/"
                          f"{database_df['feature_smiles'].values.size})")
                mol_text = mol_names[mol_id] if mol_names is not None else smile

                mol_property_val = responses[
                    mol_id] if responses is not None else None

                _append_molecule(smile,
                                 mol_smiles=smile,
                                 mol_text=mol_text,
                                 mol_property_val=mol_property_val)

            if len(database_feature_df.columns) > 0:
                features = database_feature_df.values
        else:
            raise FileNotFoundError(
                f"{molecule_database_src} could not be found. "
                f"Please enter valid folder name or path of a "
                f"text/excel/csv")
        if not molecule_database:
            raise UserWarning("No molecular files found in the location!")
        return molecule_database, features
Example #3
0
    def __init__(self,
                 moli,
                 molj,
                 options=argparse.Namespace(time=20, verbose='info')):
        """
        Initialization function

        Computes the maximum common substructure (MCS) of the two molecules
        after removing their hydrogens, converts it to an RDKit molecule
        stored in self.mcs_mol, maps its atoms back to both molecules and
        attaches a ring counter to every atom of the three molecules.

        Parameters
        ----------

        moli : RDKit molecule object
            the first molecule used to perform the MCS calculation
        molj : RDKit molecule object
            the second molecule used to perform the MCS calculation
        options : argparse python object
            the list of user options; the attributes read here are
            `time` (MCS timeout) and `verbose`

        Raises
        ------
        ValueError
            if no MCS is found, if the MCS molecule cannot be sanitized or
            if the MCS cannot be mapped back onto the two molecules

        """
        def map_mcs_mol():
            """

            This function is used to define a map between the generated mcs,
            the molecules and vice versa

            """

            # mcs indexes mapped back to the first molecule moli

            if self.__moli_noh.HasSubstructMatch(self.mcs_mol):
                moli_sub = self.__moli_noh.GetSubstructMatch(self.mcs_mol)
            else:
                raise ValueError(
                    'RDkit MCS Subgraph first molecule search failed')

            if self.mcs_mol.HasSubstructMatch(self.mcs_mol):
                mcsi_sub = self.mcs_mol.GetSubstructMatch(self.mcs_mol)
            else:
                raise ValueError('RDkit MCS Subgraph search failed')

            # mcs to moli. The pairs are materialized in a list because they
            # are walked twice below (property assignment and chirality
            # detection); a bare zip() is exhausted after the first pass on
            # Python 3.
            map_mcs_mol_to_moli_sub = list(zip(mcsi_sub, moli_sub))

            # An RDkit atomic property is defined to store the mapping to moli
            for mcs_idx, mol_idx in map_mcs_mol_to_moli_sub:
                self.mcs_mol.GetAtomWithIdx(mcs_idx).SetProp(
                    'to_moli', str(mol_idx))

            # mcs indexes mapped back to the second molecule molj

            if self.__molj_noh.HasSubstructMatch(self.mcs_mol):
                molj_sub = self.__molj_noh.GetSubstructMatch(self.mcs_mol)
            else:
                raise ValueError(
                    'RDkit MCS Subgraph second molecule search failed')

            if self.mcs_mol.HasSubstructMatch(self.mcs_mol):
                mcsj_sub = self.mcs_mol.GetSubstructMatch(self.mcs_mol)
            else:
                raise ValueError('RDkit MCS Subgraph search failed')

            # mcs to molj (a list for the same reason as above)
            map_mcs_mol_to_molj_sub = list(zip(mcsj_sub, molj_sub))

            # Map between the two molecules, kept as a list so that it can
            # be read more than once
            self.__map_moli_molj = list(zip(moli_sub, molj_sub))

            # An RDkit atomic property is defined to store the mapping to molj
            for mcs_idx, mol_idx in map_mcs_mol_to_molj_sub:
                self.mcs_mol.GetAtomWithIdx(mcs_idx).SetProp(
                    'to_molj', str(mol_idx))

            # Chirality

            # moli chiral atoms
            chiral_at_moli_noh = [
                seq[0] for seq in Chem.FindMolChiralCenters(self.__moli_noh)
            ]
            # molj chiral atoms
            chiral_at_molj_noh = [
                seq[0] for seq in Chem.FindMolChiralCenters(self.__molj_noh)
            ]

            # mcs atoms mapped onto a chiral atom of either molecule
            chiral_at_mcs_moli_noh = set([
                mcs_idx for mcs_idx, mol_idx in map_mcs_mol_to_moli_sub
                if mol_idx in chiral_at_moli_noh
            ])
            chiral_at_mcs_molj_noh = set([
                mcs_idx for mcs_idx, mol_idx in map_mcs_mol_to_molj_sub
                if mol_idx in chiral_at_molj_noh
            ])

            # mcs chiral atoms
            chiral_at_mcs = chiral_at_mcs_moli_noh | chiral_at_mcs_molj_noh

            for i in chiral_at_mcs:
                at = self.mcs_mol.GetAtomWithIdx(i)
                at.SetChiralTag(Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW)

            if chiral_at_mcs and options.verbose == 'pedantic':
                logging.info('Chiral atom detected')

            # For each mcs atom we save its original index in a specified
            # property. This could be very useful in the code development
            # when deletions or atom insertions are performed
            for at in self.mcs_mol.GetAtoms():
                at.SetProp('org_idx', str(at.GetIdx()))

            return

        def set_ring_counter(mol):
            """

            This function is used to attach to each molecule atom a ring
            counter rc. This parameter is used to assess if a ring has been
            broken or not during the MCS mapping

            Parameters
            ----------
            mol : RDKit Molecule obj
                the molecule used to define the atom ring counters


            """

            # set to zero the atom ring counters
            for at in mol.GetAtoms():
                at.SetProp('rc', '0')

            # every ring adds one to the counter of each of its atoms
            for ring in mol.GetRingInfo().AtomRings():
                for idx in ring:
                    at = mol.GetAtomWithIdx(idx)
                    at.SetProp('rc', str(int(at.GetProp('rc')) + 1))
            return

        # Set logging level and format
        logging.basicConfig(format='%(levelname)s:\t%(message)s',
                            level=logging.INFO)

        # Local pointers to the passed molecules
        self.moli = moli
        self.molj = molj

        # Unless pedantic output was requested, RDKit messages are silenced
        # while the MCS is built
        quiet = options.verbose != 'pedantic'
        if quiet:
            lg = RDLogger.logger()
            lg.setLevel(RDLogger.CRITICAL)

        try:
            # Local pointers to the passed molecules without hydrogens
            # These variables are defined as private
            try:
                self.__moli_noh = AllChem.RemoveHs(moli)
                self.__molj_noh = AllChem.RemoveHs(molj)
            except Exception:
                # Fall back to an unsanitized removal and only perceive the
                # aromaticity
                self.__moli_noh = AllChem.RemoveHs(moli, sanitize=False)
                self.__molj_noh = AllChem.RemoveHs(molj, sanitize=False)

                Chem.SanitizeMol(
                    self.__moli_noh,
                    sanitizeOps=Chem.SanitizeFlags.SANITIZE_SETAROMATICITY)
                Chem.SanitizeMol(
                    self.__molj_noh,
                    sanitizeOps=Chem.SanitizeFlags.SANITIZE_SETAROMATICITY)

            # MCS calculation. In RDKit the MCS is a smart string. Ring atoms
            # are always mapped in ring atoms.
            self.__mcs = rdFMCS.FindMCS(
                [self.__moli_noh, self.__molj_noh],
                timeout=options.time,
                atomCompare=rdFMCS.AtomCompare.CompareAny,
                bondCompare=rdFMCS.BondCompare.CompareAny,
                matchValences=False,
                ringMatchesRingOnly=True,
                completeRingsOnly=False,
                matchChiralTag=False)

            # Checking
            if self.__mcs.canceled:
                logging.warning(
                    'Timeout reached to find the MCS between the molecules')

            if self.__mcs.numAtoms == 0:
                raise ValueError('No MCS was found between the molecules')

            # The found MCS pattern (smart strings) is converted to a RDKit
            # molecule
            self.mcs_mol = Chem.MolFromSmarts(self.__mcs.smartsString)

            try:  # Try to sanitize the MCS molecule
                Chem.SanitizeMol(self.mcs_mol)
            except Exception:  # if not, try to recover the atom aromaticity
                # which is important for the ring counter
                sanitFail = Chem.SanitizeMol(
                    self.mcs_mol,
                    sanitizeOps=Chem.SanitizeFlags.SANITIZE_SETAROMATICITY,
                    catchErrors=True)
                if sanitFail:  # if not, the MCS is skipped
                    raise ValueError('Sanitization Failed...')

            # Mapping between the found MCS molecule and moli,  molj
            try:
                map_mcs_mol()
            except Exception as e:
                raise ValueError(str(e))

            # Set the ring counters for each molecule
            set_ring_counter(self.__moli_noh)
            set_ring_counter(self.__molj_noh)
            set_ring_counter(self.mcs_mol)
        finally:
            # The RDKit log level is restored even when the construction
            # fails, so a rejected pair does not leave RDKit silenced.
            if quiet:
                lg.setLevel(RDLogger.WARNING)

        return
Example #4
0
"""
This module detects salts.
"""
import logging
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import RDLogger

from .remove_salts import remove_salts

# Disable RDKit's 'rdApp.info' log channel as soon as this module is imported.
RDLogger.DisableLog('rdApp.info')

# Names exported by a star-import of this module.
__all__ = ["detect_salts"]


def _validation_smiles(mol):
    """Return the SMILES string RDKit generates for ``mol``.

    The string is meant to be kept and compared against later on, as a
    validation reference.
    """
    return Chem.MolToSmiles(mol)


def detect_salts(mol, *args, **kwargs):
    """Detects salts.

    Generates a SMILES out of the entered mol for validation, performs fragment 
    removal, turns the changed mol into another SMILES and validates it with the
    first SMILES created.  

    Parameters
    ----------
Example #5
0
# Module-level setup executed once at import time: RDKit logging is lowered,
# the chiral retro transformer is loaded and the database collection handles
# are created.

# Setting logging low: only CRITICAL RDKit messages are let through. This is
# done before the remaining imports are executed.
from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)
from database import db_client
from django.conf import settings
import makeit.utilities.io.pickle as pickle
import os
import makeit.global_config as gc

# Chiral Retro Transformer, created in lookup-only mode and loaded with the
# chiral templates and their references.
import makeit.retrosynthetic.transformer as transformer
RetroTransformer = transformer.RetroTransformer(lookup_only=True)
RetroTransformer.load(chiral=True, refs=True, rxns=False)
# Footnote text filled in from the global configuration values.
RETRO_CHIRAL_FOOTNOTE = 'Using {} chiral retrosynthesis templates (mincount {} if achiral, mincount {} if chiral) from {}/{}'.format(
    gc.Relevance_Prioritization['output_size'],
    gc.RETRO_TRANSFORMS_CHIRAL['mincount'],
    gc.RETRO_TRANSFORMS_CHIRAL['mincount_chiral'],
    gc.RETRO_TRANSFORMS_CHIRAL['database'],
    gc.RETRO_TRANSFORMS_CHIRAL['collection'])

### Databases
# `db` is a scratch name: it is rebound for each of the three settings
# entries and only the *_DB collection handles are meant to be kept.
db = db_client[settings.REACTIONS['database']]
REACTION_DB = db[settings.REACTIONS['collection']]
# RETRO_LIT_FOOTNOTE = 'Searched {} known reactions from literature'.format(REACTION_DB.count())

db = db_client[settings.INSTANCES['database']]
INSTANCE_DB = db[settings.INSTANCES['collection']]

db = db_client[settings.CHEMICALS['database']]
CHEMICAL_DB = db[settings.CHEMICALS['collection']]
Example #6
0
from plip.structure.preparation import PDBComplex
import pyunitwizard as puw
from rdkit import Chem, RDLogger
from rdkit.Chem.Draw import rdMolDraw2D
# Standard Library
from collections import defaultdict
import copy
from io import StringIO, BytesIO
import json
import requests
import re
import tempfile
from typing import List, Optional, Tuple
import warnings

# Runs at import time; the 'rdApp.*' pattern covers every RDKit log channel,
# not only the warning one.
RDLogger.DisableLog('rdApp.*')  # Disable rdkit logging


class StructuredBasedPharmacophore(Pharmacophore):
    """ Class to store and compute structured-based pharmacophores

    Inherits from pharmacophore

    Parameters
    ----------

    pharmacophoric_points : list of openpharmacophore.PharmacophoricPoint
        List of pharmacophoric pharmacophoric_points

    molecular_system : rdkit.Chem.Mol
        The protein-ligand complex from which this pharmacophore was extracted.
Example #7
0
 def test_SmartsRemover(self):
     """SmartsRemover must raise ValueError when a pattern is not valid SMARTS.

     RDKit's 'rdApp.error' channel is disabled around the call so that the
     expected parse error does not clutter the test output.
     """
     salts = ['[Cl;H1&X1,-]', '[Na+]', '[O;H2,H1&-,X0&-2]', 'BadSmarts']
     RDLogger.DisableLog('rdApp.error')
     try:
         self.assertRaises(ValueError, SmartsRemover.SmartsRemover, patterns=salts)
     finally:
         # Re-enable the channel even when the assertion fails, otherwise
         # the tests that run afterwards would lose RDKit's error output.
         RDLogger.EnableLog('rdApp.error')
def disable_rdkit_log():
    """Disable every RDKit log channel matching ``rdApp.*``.

    RDKit is imported inside the function, so the import only happens when
    the function is called.
    """
    import rdkit.RDLogger as rdkit_logger
    rdkit_logger.DisableLog('rdApp.*')
import rdkit.rdBase as rkrb
import rdkit.RDLogger as rkl
from mordred import Calculator, descriptors
from rdkit.Chem import AddHs, CanonSmiles, MolFromSmiles
from rdkit.Chem.Descriptors import ExactMolWt
from rdkit.Chem.inchi import MolToInchiKey
from rdkit.Chem.rdmolfiles import MolFromSmiles
from sklearn.ensemble import RandomForestRegressor

from minedatabase.filters.base_filter import Filter
from minedatabase.metabolomics import MetabolomicsDataset, Peak
from minedatabase.pickaxe import Pickaxe
from minedatabase.utils import neutralise_charges


# Executed at import time: the RDKit logger is set to ERROR level...
logger = rkl.logger()
logger.setLevel(rkl.ERROR)
# ...and the 'rdApp.error' channel is additionally disabled through rdBase.
rkrb.DisableLog("rdApp.error")


class MetabolomicsFilter(Filter):
    """Filters out compounds that don't align with a metabolomics dataset.

    This filter compares the masses (and optionally, predicted retention times)
    of MINE compounds against peak masses (and retention times) in a
    metabolomics dataset. Tolerances for mass (in Da) and retention times
    (in units consistent with dataset) are specified by the user. If a
    compound's mass (and predicted retention time, if desired) does not match
    that for any peak in the dataset, it is filtered out.

    Parameters
Example #10
0
def generate_substructures(input_file):
    """Enumerate the atom-environment substructures of each listed structure.

    Each non-blank line of the input file holds a SMILES string and a
    structure identifier separated by a tab. For every structure, the
    environment of every radius around every atom is converted to a SMILES;
    the ones that can be parsed and sanitized, differ from the structure
    itself and match it as a substructure are kept, without repeats, in
    order of first appearance.

    The substructures are also written to "all_test_substructures.txt", one
    line per identifier (sorted): the substructure SMILES joined by ".", a
    tab and the identifier. The total number of substructures is printed.

    input_file: structure txt file

    Returns the structure info list, holding one
    [canonical smiles, mol, identifier] entry per structure, and a dictionary
    mapping each identifier to its list of substructure SMILES (['<NA>']
    when none was found).
    """

    with open(input_file) as file_object:
        text = file_object.read()

    # Create a structure list. Blank lines are skipped, so the last record
    # is read whether or not the file ends with a newline.
    structure_combo_list = []
    for line in text.split('\n'):
        if not line.strip():
            continue
        fields = line.split('\t')
        structure_id = fields[1]
        structure_mol = Chem.MolFromSmiles(fields[0])
        structure_smile = Chem.MolToSmiles(structure_mol)
        structure_combo_list.append(
            [structure_smile, structure_mol, structure_id])

    # Most candidate fragments are not valid molecules; RDKit's messages
    # about them are silenced once, before the enumeration starts.
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.CRITICAL)

    official_subs_dict = {}
    for structure_smile, structure_mol, structure_id in structure_combo_list:
        nr_of_atoms = structure_mol.GetNumAtoms()

        valid_sub_list = []
        # Each distinct fragment SMILES is evaluated only once: the outcome
        # depends on nothing but the SMILES and the current structure.
        seen_smiles = set()
        for radius in range(nr_of_atoms):
            for atom_idx in range(nr_of_atoms):
                env = Chem.FindAtomEnvironmentOfRadiusN(
                    structure_mol, radius, atom_idx)
                sub_smile = Chem.MolToSmiles(
                    Chem.PathToSubmol(structure_mol, env))
                if sub_smile == '' or sub_smile == structure_smile:
                    continue
                if sub_smile in seen_smiles:
                    continue
                seen_smiles.add(sub_smile)

                # A fragment that cannot be parsed or sanitized is dropped.
                # 'except Exception' keeps Ctrl-C working during the
                # enumeration.
                submol = Chem.MolFromSmiles(sub_smile)
                try:
                    Chem.SanitizeMol(submol)
                    is_match = structure_mol.HasSubstructMatch(submol)
                except Exception:
                    continue
                if is_match:
                    valid_sub_list.append(sub_smile)

        # Write each substructure per structure in the dictionary
        if valid_sub_list:
            official_subs_dict.setdefault(structure_id,
                                          []).extend(valid_sub_list)
        elif structure_id not in official_subs_dict:
            official_subs_dict[structure_id] = ['<NA>']

    with open("all_test_substructures.txt", 'w') as db_file:
        for key in sorted(official_subs_dict):
            db_file.write('.'.join(official_subs_dict[key]) + '\t' + key +
                          '\n')

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('All possible substructures')
    print(sum(len(subs) for subs in official_subs_dict.values()))

    return structure_combo_list, official_subs_dict
def generate_substructures(input_file):
    """Enumerate substructures from combinations of SMILES characters.

    Each non-blank line of the input file holds a SMILES string and a
    structure identifier separated by a tab. For every structure, all the
    combinations of the characters of its canonical SMILES (shorter than the
    SMILES itself) are turned into candidate strings. A candidate longer
    than one character that RDKit can parse and that matches the structure
    as a substructure is kept under its canonical SMILES, without repeats,
    in order of first appearance.

    The number of combinations grows exponentially with the length of the
    SMILES, so this is only practical for short strings.

    The substructures are also written to "all_perm_test_substructures.txt",
    one line per identifier (sorted): the substructure SMILES joined by ".",
    a tab and the identifier. The total number of substructures is printed.

    input_file: structure txt file

    Returns the structure info list, holding one
    [canonical smiles, mol, identifier] entry per structure, and a dictionary
    mapping each identifier to its list of substructure SMILES (['<NA>']
    when none was found).
    """

    with open(input_file) as file_object:
        text = file_object.read()

    # Create a structure list. Blank lines are skipped, so the last record
    # is read whether or not the file ends with a newline.
    structure_combo_list = []
    for line in text.split('\n'):
        if not line.strip():
            continue
        fields = line.split('\t')
        structure_id = fields[1]
        structure_mol = Chem.MolFromSmiles(fields[0])
        structure_smile = Chem.MolToSmiles(structure_mol)
        structure_combo_list.append(
            [structure_smile, structure_mol, structure_id])

    # Most candidate strings are not valid SMILES; RDKit's messages about
    # them are silenced once, before any candidate is parsed.
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.CRITICAL)

    official_subs_dict = {}
    for structure_smile, structure_mol, structure_id in structure_combo_list:
        char_list = list(structure_smile)

        valid_sub_list = []
        valid_seen = set()
        # Candidate strings already handled; a set keeps the membership
        # test cheap however many candidates there are.
        seen_candidates = set()

        # generate all smiles combinations for each atom and length
        for length in range(len(char_list)):
            for combo in it.combinations(char_list, length):
                # The tuple is flattened through its printed form, which
                # also strips quotes, commas and parentheses.
                combo = str(combo)
                combo = combo.replace('\'', '').replace(', ', '').replace(
                    '(', '').replace(')', '')
                combo = combo.replace(',', '')
                if combo in seen_candidates:
                    continue
                seen_candidates.add(combo)

                if len(combo) <= 1:
                    continue
                mol_sub = Chem.MolFromSmiles(combo)
                if mol_sub is None:
                    continue
                # Check if the substructure is a valid smiles.
                # 'except Exception' keeps Ctrl-C working during the
                # enumeration.
                try:
                    sub_smile = Chem.MolToSmiles(mol_sub)
                except Exception:
                    continue
                if sub_smile in valid_seen:
                    continue
                # Check if the substructure matches the structure
                if structure_mol.HasSubstructMatch(mol_sub):
                    valid_seen.add(sub_smile)
                    valid_sub_list.append(sub_smile)

        # Write each substructure per structure in the dictionary
        if valid_sub_list:
            official_subs_dict.setdefault(structure_id,
                                          []).extend(valid_sub_list)
        elif structure_id not in official_subs_dict:
            official_subs_dict[structure_id] = ['<NA>']

    with open("all_perm_test_substructures.txt", 'w') as db_file:
        for key in sorted(official_subs_dict):
            db_file.write('.'.join(official_subs_dict[key]) + '\t' + key +
                          '\n')

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('All possible substructures')
    print(sum(len(subs) for subs in official_subs_dict.values()))

    return structure_combo_list, official_subs_dict
Example #12
0
def fraginc2smi(f, mol, frag_keys, frag_type=None, kekulize=False):
  """Turn the atoms `f` of `mol` into a hydrogen-capped fragment.

  The atoms whose index is not in `f` are removed from a copy of `mol`. The
  remaining heavy atoms receive extra hydrogens until their total valence
  equals the one they had in `mol`. The fragment is then written as SMILES,
  re-read, given explicit hydrogens and embedded in 3D.

  f         : collection of the atom indices of `mol` to keep
  mol       : RDKit molecule; kekulized in place when `kekulize` is True
  frag_keys : passed to get_frag_fn and replaced by the value it returns
  frag_type : not used by this function
  kekulize  : kekulize `mol` (clearing its aromatic flags) before cutting

  Returns the tuple (fragment SMILES, mol2formula(...) of the embedded
  fragment, fragment file name given by get_frag_fn, updated frag_keys).

  Side effects: all RDKit log channels matching 'rdApp.*' are disabled, and
  a 'fragment_lookup/<name>.mol' file is written for each fragment that
  get_frag_fn asks to be created.
  """
  RDLogger.DisableLog('rdApp.*')
  # NOTE(review): this value is overwritten on every path below; the call is
  # left in place in case it is relied upon for validating `mol` - confirm.
  smi = Chem.MolToSmiles(mol)
  if kekulize:
    Chem.Kekulize(mol, clearAromaticFlags=True)
  mw = Chem.RWMol(mol)
  numatoms = mw.GetNumAtoms()
  total_deg = [atom.GetTotalValence() for atom in mw.GetAtoms()]

  # Index every kept atom takes once the other atoms are gone: its rank
  # among the kept indices. Computed once instead of once per atom.
  new_index = {old: new for new, old in enumerate(sorted(set(f)))}

  # Atoms are removed from the highest index down, presumably so that the
  # indices still to be visited are not shifted by a removal.
  for idx in reversed(range(numatoms)):
    if idx not in new_index:
      mw.RemoveAtom(idx)
  mw = Chem.RWMol(Chem.AddHs(mw))

  # Cap the cut bonds: add hydrogens to every kept heavy atom whose total
  # valence no longer equals the one it had in the intact molecule.
  for idx, val in enumerate(total_deg):
    if idx not in new_index:
      continue
    idx2 = new_index[idx]
    atom = mw.GetAtomWithIdx(idx2)
    if atom.GetAtomicNum() != 1 and atom.GetTotalValence() != val:
      for _ in range(val-atom.GetTotalValence()):
        idx_h = mw.AddAtom(Chem.Atom(1))
        mw.AddBond(idx2,idx_h,Chem.BondType.SINGLE)

  # Atoms that are not part of a ring any more cannot be aromatic.
  ring_atoms = {x for r in Chem.GetSymmSSSR(mw) for x in r}
  for idx, atom in enumerate(mw.GetAtoms()):
    if idx not in ring_atoms:
      atom.SetIsAromatic(False)

  if len(Chem.GetSymmSSSR(mw)) < 1:
    # Acyclic fragment: write it in kekule form when possible.
    try:
      Chem.Kekulize(mw, clearAromaticFlags=True)
      smi = Chem.MolToSmiles(mw,kekuleSmiles=True,canonical=True)
    except Exception:
      print('Cannot kekulize mw')
      smi = Chem.MolToSmiles(mw)
  else:
    smi = Chem.MolToSmiles(mw)

  mol = Chem.MolFromSmiles(smi)
  if mol is None:
    # Second attempt after patching the SMILES text.
    if 'n' in smi:
      smi = smi.replace('n','[nH]')
    elif ':O:' in smi:
      smi = smi.replace(':O:','[O]')
    mol = Chem.MolFromSmiles(smi)
  # Best effort: the SMILES obtained above is kept when the fragment cannot
  # be kekulized.
  try:
    Chem.Kekulize(mol, clearAromaticFlags=True)
    smi = Chem.MolToSmiles(mol,kekuleSmiles=True)
  except Exception:
    pass

  mol = Chem.AddHs(mol)
  AllChem.EmbedMolecule(mol)
  formula=moldict2hill(mol2formula(mol, incl_H=True))
  smi2=Chem.MolToSmiles(mol,allHsExplicit=True,allBondsExplicit=False)
  mol.SetProp("_Name","{}  {}  {}".format(formula,smi,smi2))
  if '.' in smi:
    # Disconnected fragment: every component is looked up (and written)
    # on its own.
    smi_ls = smi.split('.')
    for s in smi_ls:
      mol_s = Chem.MolFromSmiles(s)
      mol_s = Chem.AddHs(mol_s)
      AllChem.EmbedMolecule(mol_s)
      s2 = Chem.MolToSmiles(mol_s,allHsExplicit=True,allBondsExplicit=False)
      formula=moldict2hill(mol2formula(mol_s, incl_H=True))
      # NOTE(review): the name is set on `mol` while `mol_s` is the molecule
      # written below - confirm whether `mol_s.SetProp` was intended.
      mol.SetProp("_Name","{}  {}  {}".format(formula,s,s2))
      frag_fn, make_mol, frag_keys=get_frag_fn(formula,s,s2,frag_keys)
      if make_mol:
        with open('fragment_lookup/'+frag_fn+'.mol', "w") as fn:
          fn.write(Chem.MolToMolBlock(mol_s))
  else:
    frag_fn, make_mol, frag_keys=get_frag_fn(formula,smi,smi2,frag_keys)
    if make_mol:
      with open('fragment_lookup/'+frag_fn+'.mol', "w") as fn:
        fn.write(Chem.MolToMolBlock(mol))
  return smi, mol2formula(mol, incl_H=True), frag_fn, frag_keys
Example #13
0
"""SyGMa: Systematically Generating potential Metabolites"""

from builtins import str
import argparse
import sygma
import sys
from rdkit import Chem, RDLogger
RDLogger.logger().setLevel(RDLogger.ERROR)
import logging
logging.basicConfig()
logger = logging.getLogger('sygma')

def run_sygma(args, file=sys.stdout):
    """Predict metabolites for a parent molecule and write them to ``file``.

    A scenario is built from the 'phase1' and 'phase2' rule sets, each paired
    with the corresponding ``args.phase1`` / ``args.phase2`` value, and is run
    on the molecule parsed from the SMILES string ``args.parentmol``.

    :param args: parsed arguments providing ``loglevel``, ``phase1``,
        ``phase2``, ``parentmol`` and ``outputtype``
    :param file: writable text stream receiving the output (default: stdout)
    :return: None
    """
    logger.setLevel(args.loglevel.upper())

    phases = [
        [sygma.ruleset['phase1'], args.phase1],
        [sygma.ruleset['phase2'], args.phase2],
    ]
    tree = sygma.Scenario(phases).run(Chem.MolFromSmiles(args.parentmol))
    tree.calc_scores()

    output_type = args.outputtype
    if output_type == "sdf":
        tree.write_sdf(file)
        return None
    if output_type == "smiles":
        # One "<smiles> <score>" record per line, newline-terminated.
        rows = []
        for smiles, score in tree.to_smiles():
            rows.append(smiles + " " + str(score))
        file.write("\n".join(rows) + '\n')
    # Any other output type writes nothing.
    return None

def get_sygma_parser():
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--version', action='version', version='%(prog)s ' + sygma.__version__)
Example #14
0
def main():
    """
    Command-line entry point: parse the arguments and run the tether prep.

    Example usage:
    python -m pipelines.xchem.prepare_tether_2 --smi ../../data/mpro/Mpro-x0387_0.smi --mol ../../data/mpro/Mpro-x0387_0.mol -o TETHERED --max-inputs 500 --chunk-size 100

    Side effects:
      * sets the module globals ``chunk_size`` and ``embedding_failures_file``
      * creates ``<outfile>_embedding_failures.smi`` and closes it when done
      * hides the command-line parameters from ``sys.argv`` while ``execute``
        runs and restores them afterwards

    :return: None
    """

    global chunk_size
    global embedding_failures_file

    # Suppress basic RDKit logging...
    RDLogger.logger().setLevel(RDLogger.ERROR)
    print('RDKit version:', rdBase.rdkitVersion)

    parser = argparse.ArgumentParser(
        description='Tether prep - prepare candidates for docking')

    parser.add_argument(
        '--smi', help='SMILES containing the expanded candidates for a hit)')
    parser.add_argument('--mol',
                        help='Molfile containing the hit to tether to)')
    parser.add_argument(
        '-o',
        '--outfile',
        default='Tethered',
        help=
        'Base name for results SDF file (will generate something like Tethered_Mpro-x0072_000.sdf)'
    )
    parser.add_argument('--min-ph', type=float, help='The min pH to consider')
    parser.add_argument('--max-ph', type=float, help='The max pH to consider')
    parser.add_argument('-c',
                        '--chunk-size',
                        type=int,
                        default=200,
                        help='Chunk size for files')
    parser.add_argument('--max-inputs',
                        type=int,
                        default=0,
                        help='Max number of molecules to process')
    parser.add_argument('--max-outputs',
                        type=int,
                        default=0,
                        help='Max number of records to output')
    parser.add_argument('--modulus',
                        type=int,
                        default=0,
                        help='Process only mols with this modulus')
    parser.add_argument('--timeout-embed',
                        type=int,
                        default=5,
                        help='Timeout in seconds to apply to limit embedding')

    args = parser.parse_args()
    log("Tether prep args: ", args)

    chunk_size = args.chunk_size

    min_ph = args.min_ph
    max_ph = args.max_ph
    smi = args.smi
    mol = args.mol
    outfile = args.outfile
    max_inputs = args.max_inputs
    max_outputs = args.max_outputs
    modulus = args.modulus
    timout_embed_secs = args.timeout_embed

    embedding_failures_file = open(outfile + '_embedding_failures.smi', 'w')

    # Dimorphite needs to use argparse with its own arguments, not messed up with our arguments
    # so we store the original args
    orig_sys_argv = sys.argv[:]

    # Remove all the parameters, keeping only the filename (first one) so that
    # dimorphite is unaware of any previous commandline parameters.
    sys.argv = sys.argv[:1]

    try:
        execute(smi,
                mol,
                outfile,
                min_ph=min_ph,
                max_ph=max_ph,
                max_inputs=max_inputs,
                max_outputs=max_outputs,
                modulus=modulus,
                timout_embed_secs=timout_embed_secs)
    finally:
        # Always put the real command line back and release the failures
        # file, even when execute() raises.
        sys.argv = orig_sys_argv
        embedding_failures_file.close()

    print('Finished')
Example #15
0
_splashMessage = """
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
  FeatFinderCLI version %s

  Copyright (C) 2005 Rational Discovery LLC

  This software is copyrighted.  The software may not be copied,
  reproduced, translated or reduced to any electronic medium or
  machine-readable form without the prior written consent of
  Rational Discovery LLC.
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
""" % _version
from rdkit import Chem
from rdkit.Chem import ChemicalFeatures
from rdkit import RDLogger
logger = RDLogger.logger()
import sys, os
import re
# Matches a single space, tab, or comma (presumably the field separator for
# input lines; the code that uses it is outside this view).
splitExpr = re.compile(r'[ \t,]')


def GetAtomFeatInfo(factory, mol):
    """Collect, per atom, the labels of the features that atom takes part in.

    Returns a list with one slot per atom of ``mol``. A slot is ``None`` when
    the atom belongs to no feature, otherwise a list of "Family-Type" strings
    in the order the features are reported by ``factory``.
    """
    per_atom = [None for _ in range(mol.GetNumAtoms())]
    for feature in factory.GetFeaturesForMol(mol):
        for atom_idx in feature.GetAtomIds():
            labels = per_atom[atom_idx]
            if labels is None:
                # First feature seen for this atom: start its label list.
                labels = per_atom[atom_idx] = []
            labels.append("%s-%s" % (feature.GetFamily(), feature.GetType()))
    return per_atom
Example #16
0
from __future__ import print_function

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(4)

import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem
from rdkit import DataStructs
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import sys

from retrosim.utils.generate_retro_templates import process_an_example
from retrosim.data.get_data import get_data_df, split_data_df

from joblib import Parallel, delayed
import multiprocessing
num_cores = multiprocessing.cpu_count()

from rdchiral.main import rdchiralRun, rdchiralReaction, rdchiralReactants
import os

# Directory part of this file's path.
SCRIPT_ROOT = os.path.dirname(__file__)
# One level above SCRIPT_ROOT (os.path.dirname applied a second time).
PROJ_ROOT = os.path.dirname(SCRIPT_ROOT)

############### DEFINITIONS FOR VALIDATION SEARCH ########################
# Names of the fingerprint variants and similarity metrics; presumably the
# options enumerated by the validation search (the consumer is outside this
# view).
all_getfp_labels = ['Morgan2noFeat', 'Morgan3noFeat', 'Morgan2Feat', 'Morgan3Feat']
all_similarity_labels = ['Tanimoto', 'Dice', 'TverskyA', 'TverskyB',]
Example #17
0
from chainer.training import extensions, StandardUpdater

import chainermn

import logging
import argparse
from distutils.util import strtobool

from model import pair_matrix_model
import uspto_pre
from updater import MyUpdater
from evaluator import MyEvaluator

from rdkit import RDLogger

rdl = RDLogger.logger()
rdl.setLevel(RDLogger.CRITICAL)

import glob
from rdkit import Chem
from tqdm import tqdm


def read_inference(inference):
    l = {}
    with open(inference, 'r') as f:
        while True:
            line = f.readline()
            if not line:
                break
            l[int(line.split()[0])] = line.split()[1:]
Example #18
0
    def test1InchiReadPubChem(self):
        """Round-trip every molecule of each dataset through InChI.

        For each molecule ``m`` the InChI ``x`` is generated, parsed back and
        regenerated as ``y`` (via an isomeric SMILES round trip).  Each
        molecule is then counted as:

        * ``same``       - ``x == y``
        * ``reasonable`` - the round trip differs or fails, but one of the
          known tolerable causes checked below applies
        * ``diff``       - the round trip differs or fails for no known reason

        The totals are compared against fixed expected values at the end.
        """
        for f in self.dataset.values():
            same, diff, reasonable = 0, 0, 0
            for m in f:
                if m is None:  # pragma: nocover
                    continue
                x = MolToInchi(m)
                y = None
                # Parsing errors are expected here; keep them out of the log.
                RDLogger.DisableLog('rdApp.error')
                mol = MolFromInchi(x)
                RDLogger.EnableLog('rdApp.error')
                if mol is not None:
                    y = MolToInchi(
                        MolFromSmiles(MolToSmiles(mol, isomericSmiles=True)))
                if y is None:
                    # metal involved?
                    try:
                        MolToInchi(m, treatWarningAsError=True)
                    except InchiReadWriteError as inst:
                        _, error = inst.args
                        if 'Metal' in error or \
                                'Charges were rearranged' in error:
                            reasonable += 1
                            continue
                    # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                    # RDKit does not like the SMILES? use MolBlock instead
                    inchiMol = MolFromInchi(x)
                    if inchiMol:
                        rdDepictor.Compute2DCoords(inchiMol)
                        z = MolToInchi(MolFromMolBlock(
                            MolToMolBlock(inchiMol)))
                        if x == z:
                            reasonable += 1
                            continue
                    # InChI messed up the radical?
                    unsanitizedInchiMol = MolFromInchi(x, sanitize=False)
                    if sum([
                            a.GetNumRadicalElectrons() * a.GetAtomicNum()
                            for a in m.GetAtoms()
                            if a.GetNumRadicalElectrons() != 0
                    ]) != sum([
                            a.GetNumRadicalElectrons() * a.GetAtomicNum()
                            for a in unsanitizedInchiMol.GetAtoms()
                            if a.GetNumRadicalElectrons() != 0
                    ]):
                        reasonable += 1
                        continue

                    diff += 1
                    cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                    print(COLOR_GREEN + 'Empty mol for PubChem Compound ' +
                          cid + '\n' + COLOR_RESET)
                    continue
                if x != y:
                    # if there was warning in the first place, then this is
                    # tolerable
                    try:
                        MolToInchi(m, treatWarningAsError=True)
                        MolFromInchi(x, treatWarningAsError=True)
                    except InchiReadWriteError as inst:
                        reasonable += 1
                        continue
                    # or if there are big rings
                    SanitizeMol(m)
                    # NOTE(review): on Python 3 filter() returns a lazy
                    # iterator, which is always truthy, so this branch is
                    # taken for every molecule that reaches it.  The counts
                    # asserted at the end were presumably obtained with this
                    # behaviour, so it is left unchanged here - confirm before
                    # replacing it with any(...).
                    if filter(lambda i: i >= 8,
                              [len(r) for r in m.GetRingInfo().AtomRings()]):
                        reasonable += 1
                        continue
                    # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                    # or if RDKit loses bond stereo
                    s = MolToSmiles(m, True)
                    if MolToSmiles(MolFromSmiles(s), True) != s:
                        reasonable += 1
                        continue
                    # or if it is RDKit SMILES writer unhappy about the mol
                    inchiMol = MolFromInchi(x)
                    rdDepictor.Compute2DCoords(inchiMol)
                    z = MolToInchi(MolFromMolBlock(MolToMolBlock(inchiMol)))
                    if x == z:
                        reasonable += 1
                        continue

                    diff += 1
                    # Look the id up here: it was previously only assigned in
                    # the empty-mol branch above, so this message raised a
                    # NameError or reported the id of an earlier molecule.
                    cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                    print(COLOR_GREEN +
                          'Molecule mismatch for PubChem Compound ' + cid +
                          COLOR_RESET)
                    print(inchiDiff(x, y))
                    print()
                else:
                    same += 1
            fmt = "\n{0}InChI read Summary: {1} identical, {2} variance, {3} reasonable variance{4}"
            print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
            self.assertEqual(same, 627)
            self.assertEqual(diff, 0)
            self.assertEqual(reasonable, 554)
Example #19
0
""" RDKit interface
"""

from rdkit import RDLogger
import rdkit.Chem as _rd_chem
import rdkit.Chem.AllChem as _rd_all_chem
import automol.create
from automol.convert import _util


_LOGGER = RDLogger.logger()
_LOGGER.setLevel(RDLogger.ERROR)


# geometry
def to_geometry(rdm):
    """ Generate a molecular geometry from an RDKit molecule object.

        :param rdm: molecule object
        :type rdm: RDKit molecule object
        :rtype: automol geometry data structure
    """

    rdm = _rd_chem.AddHs(rdm)
    atms = rdm.GetAtoms()
    natms = len(rdm.GetAtoms())
    if natms == 1:
        syms = [str(atms[0].GetSymbol()).title()]
        xyzs = [(0., 0., 0.)]
    else:
        _rd_all_chem.EmbedMolecule(rdm)
Example #20
0
 def tearDown(self):
     """Re-enable the RDKit 'rdApp.error' log channel."""
     RDLogger.EnableLog('rdApp.error')
Example #21
0
def standardize_mols(jobs, mol_counter, num_mols, results, start_time, vendors, max_stereo_isomers, failures,
                     tautomer, verbose):
    """
    This function passes molecules to the standardization functions.

    Jobs are popped from the front of `jobs` until the list is empty. Each
    molecule of a job is standardized, reduced to its largest fragment,
    optionally tautomer-canonicalized and protonated. Molecules with an exact
    molecular weight below 1200 are recorded as
    ``[smiles, '', ..., identifier, ..., '']`` with the identifier placed in
    the column of the job's vendor.

    Parameters
    ----------
    jobs: multiprocessing.manager.list
        A list containing job information as dictionaries. The keys read here
        are 'vendor', 'sdf_path', 'mol_start', 'mol_end' (inclusive) and
        'identifier_field'.

    mol_counter: multiprocessing.manager.value
        A counter keeping track of processed molecules.

    num_mols: int
        Total number of molecules to be processed.

    results: multiprocessing.manager.list
        A list containing lists describing the processed molecules. Extended
        once, after all jobs have been consumed.

    start_time: float
        Starting time of molecule processing.

    vendors: list
        List of vendors.

    max_stereo_isomers: int
        Maximal number of stereo isomers to generate per molecule. If it is
        not greater than 0 the molecule itself is recorded without
        enumeration.

    failures: list
        Receives one 'standardize_error <smiles> <vendor> <identifier>' entry
        for every molecule whose processing raised an exception.

    tautomer: bool
        If the tautomer should be canonicalized.

    verbose : bool
        If RDKit warning should be displayed.

    """
    if not verbose:
        RDLogger.DisableLog('rdApp.*')
    processed_mols = []
    while True:
        try:
            # An empty job list raises IndexError, which ends the worker.
            # NOTE(review): an IndexError raised anywhere else in this block
            # (e.g. by supplier[mol_id]) ends the worker the same way; this
            # pre-existing behaviour is kept.
            job = jobs.pop(0)
            vendor_position = vendors.index(job['vendor'])
            supplier = Chem.SDMolSupplier(job['sdf_path'])
            for mol_id in range(job['mol_start'], job['mol_end'] + 1):
                mol = supplier[mol_id]
                if job['identifier_field'] == 'None':
                    identifier = 'unknown'
                else:
                    try:
                        identifier = mol.GetProp(job['identifier_field'])
                    except AttributeError:
                        # mol is None when the supplier could not parse it
                        identifier = 'unknown'
                try:
                    # generate smiles for error catching
                    smiles = 'unknown'
                    smiles = Chem.MolToSmiles(mol)
                    # default standardization from molvs
                    mol = Standardizer().standardize(mol)
                    # choose largest fragment
                    mol = LargestFragmentChooser().choose(mol)
                    # canonicalize tautomer
                    if tautomer:
                        mol = TautomerCanonicalizer().canonicalize(mol)
                    # protonate mol
                    mol = protonate_mol(mol)
                    # molecular weight will not change anymore
                    if ExactMolWt(mol) < 1200:
                        # enumerate stereo isomers and append mols
                        if max_stereo_isomers > 0:
                            output_mols = enumerate_stereo_isomers(mol, max_stereo_isomers)
                        else:
                            output_mols = [mol]
                        for output_mol in output_mols:
                            mol_as_list = [Chem.MolToSmiles(output_mol)] + [''] * len(vendors)
                            mol_as_list[1 + vendor_position] = identifier
                            processed_mols.append(mol_as_list)
                except Exception:
                    # Best effort: record the failure and carry on with the
                    # next molecule. KeyboardInterrupt and SystemExit are not
                    # caught, so the worker can still be stopped.
                    failures.append(' '.join(['standardize_error', smiles, job['vendor'], identifier]))
                with mol_counter.get_lock():
                    mol_counter.value += 1
                update_progress(mol_counter.value / num_mols, 'Progress of standardization',
                                ((time.time() - start_time) / mol_counter.value) * (num_mols - mol_counter.value))
        except IndexError:
            break
    results += processed_mols
    return
Example #22
0
import os.path as osp

from tqdm import tqdm
import torch
import torch.nn.functional as F
from torch_scatter import scatter
from torch_geometric.data import (InMemoryDataset, download_url, extract_zip,
                                  Data)

try:
    import rdkit
    from rdkit import Chem
    from rdkit.Chem.rdchem import HybridizationType
    from rdkit.Chem.rdchem import BondType as BT
    from rdkit import RDLogger
    RDLogger.DisableLog('rdApp.*')
except ImportError:
    rdkit = None

HAR2EV = 27.2113825435
KCALMOL2EV = 0.04336414

conversion = torch.tensor([
    1., 1., HAR2EV, HAR2EV, HAR2EV, 1., HAR2EV, HAR2EV, HAR2EV, HAR2EV, HAR2EV,
    1., KCALMOL2EV, KCALMOL2EV, KCALMOL2EV, KCALMOL2EV, 1., 1., 1.
])

atomrefs = {
    6: [0., 0., 0., 0., 0.],
    7: [
        -13.61312172, -1029.86312267, -1485.30251237, -2042.61123593,
Example #23
0
    GetTopologicalTorsionFingerprint
from rdkit.Chem.Pharm2D import Gobbi_Pharm2D
from rdkit.Chem.Pharm2D.Generate import Gen2DFingerprint
from rdkit.Chem.rdReducedGraphs import GetErGFingerprint

# All available similarities in RDKit
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity, \
    DiceSimilarity, CosineSimilarity, SokalSimilarity, RusselSimilarity, \
    RogotGoldbergSimilarity, AllBitSimilarity, KulczynskiSimilarity, \
    McConnaugheySimilarity, AsymmetricSimilarity, BraunBlanquetSimilarity, \
    TverskySimilarity

from torch_geometric.data import Data

# Suppress unnecessary RDkit warnings and errors
RDLogger.logger().setLevel(RDLogger.CRITICAL)
logger = logging.getLogger(__name__)

# Tokenization dictionaries ###################################################
# Special tokens for meta token
SPECIAL_TOKEN_DICT = {
    'SOS': 0,  # Start of the sentence
    'UNK': 128,  # Unknown atoms
    'MSK': 129,  # Masked tokens/atoms for prediction
    'EOS': 254,  # End of the sentence
    'PAD': 255,  # Padding
}

# High frequency/occurrence atoms from PCBA
ATOM_TOKEN_DICT = {
    'C': 6,
Example #24
0
#   @@ All Rights Reserved @@
#  This file is part of the RDKit.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the RDKit source tree.
#
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Lipinski,Descriptors,Crippen
from rdkit.Dbase.DbConnection import DbConnect
from rdkit.Dbase import DbModule
import re

#set up the logger:
import rdkit.RDLogger as logging
logger = logging.logger()
logger.setLevel(logging.INFO)

def ProcessMol(mol,typeConversions,globalProps,nDone,nameProp='_Name',nameCol='compound_id',
               redraw=False,keepHs=False,
               skipProps=False,addComputedProps=False,
               skipSmiles=False,
               uniqNames=None,namesSeen=None):
  if not mol:
    raise ValueError('no molecule')
  if keepHs:
    Chem.SanitizeMol(mol)
  try:
    nm = mol.GetProp(nameProp)
  except KeyError:
    nm = None
Example #25
0
 def quiet(self):
     """Store the RDKit logger on ``self.rdk_lg`` and set it to CRITICAL."""
     # Silence everything but critical errors.
     self.rdk_lg = RDLogger.logger()
     self.rdk_lg.setLevel(RDLogger.CRITICAL)
Example #26
0
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

import unittest
import os, sys, copy

import pickle

from rdkit import rdBase
from rdkit import Chem
from rdkit.Chem.rdRGroupDecomposition import RGroupDecompose, RGroupDecomposition, RGroupDecompositionParameters
from collections import OrderedDict

# the RGD code can generate a lot of warnings. disable them
from rdkit import RDLogger
RDLogger.DisableLog("rdApp.warning")


class TestCase(unittest.TestCase):

  def test_multicores(self):
    cores_smi_easy = OrderedDict()
    cores_smi_hard = OrderedDict()

    #cores_smi_easy['cephem'] = Chem.MolFromSmiles('O=C1C([1*])[C@@H]2N1C(C(O)=O)=C([3*])CS2')
    cores_smi_easy['cephem'] = Chem.MolFromSmarts('O=C1C([*:1])C2N1C(C(O)=O)=C([*:3])CS2')
    cores_smi_hard['cephem'] = Chem.MolFromSmarts('O=C1C([2*])([1*])[C@@H]2N1C(C(O)=O)=C([3*])CS2')

    #cores_smi_easy['carbacephem'] = Chem.MolFromSmiles('O=C1C([1*])[C@@H]2N1C(C(O)=O)=C([3*])CC2')
    cores_smi_easy['carbacephem'] = Chem.MolFromSmarts('O=C1C([1*])C2N1C(C(O)=O)=C([3*])CC2')
    cores_smi_hard['carbacephem'] = Chem.MolFromSmarts(
Example #27
0
def create_scaffold_split(df, seed, frac, entity):
    """Split ``df`` into train/valid/test so that rows sharing a Murcko
    scaffold end up in the same partition.

    Args:
        df: pandas DataFrame holding one SMILES string per row in column
            ``entity``.
        seed: seed for the shuffling of the scaffold groups.
        frac: sequence of three fractions ``[train, valid, test]``. When
            ``frac[2] == 0`` everything that does not fit into train goes to
            valid.
        entity: name of the column containing the SMILES strings.

    Returns:
        dict with keys 'train', 'valid' and 'test', each a DataFrame with a
        fresh index. Rows whose scaffold could not be computed are reported
        via ``print_sys`` and left out of every partition.

    Raises:
        ImportError: if rdkit is not installed.
    """
    # reference: https://github.com/chemprop/chemprop/blob/master/chemprop/data/scaffold.py
    try:
        from rdkit import Chem
        from rdkit.Chem.Scaffolds import MurckoScaffold
        from rdkit import RDLogger
        RDLogger.DisableLog('rdApp.*')
    except ImportError:
        raise ImportError(
            "Please install rdkit by 'conda install -c conda-forge rdkit'! ")
    from tqdm import tqdm
    from random import Random

    from collections import defaultdict
    random = Random(seed)

    s = df[entity].values
    scaffolds = defaultdict(set)

    error_smiles = 0
    for i, smiles in tqdm(enumerate(s), total=len(s)):
        try:
            scaffold = MurckoScaffold.MurckoScaffoldSmiles(
                mol=Chem.MolFromSmiles(smiles), includeChirality=False)
            scaffolds[scaffold].add(i)
        except Exception:
            # format() instead of '+' so that non-string entries (e.g. NaN)
            # are reported rather than raising a TypeError here.
            print_sys('{} returns RDKit error and is thus omitted...'.format(smiles))
            error_smiles += 1

    usable = len(df) - error_smiles
    train, val, test = [], [], []
    train_size = int(usable * frac[0])
    val_size = int(usable * frac[1])
    test_size = usable - train_size - val_size

    # Scaffold groups larger than half of the valid or test budget are placed
    # first; within the two classes the order is random.
    #index_sets = sorted(list(scaffolds.values()), key=lambda i: len(i), reverse=True)
    index_sets = list(scaffolds.values())
    big_index_sets = []
    small_index_sets = []
    for index_set in index_sets:
        if len(index_set) > val_size / 2 or len(index_set) > test_size / 2:
            big_index_sets.append(index_set)
        else:
            small_index_sets.append(index_set)
    random.seed(seed)
    random.shuffle(big_index_sets)
    random.shuffle(small_index_sets)
    index_sets = big_index_sets + small_index_sets

    # Greedy fill: a whole scaffold group goes to the first partition that
    # still has room for it.
    if frac[2] == 0:
        for index_set in index_sets:
            if len(train) + len(index_set) <= train_size:
                train += index_set
            else:
                val += index_set
    else:
        for index_set in index_sets:
            if len(train) + len(index_set) <= train_size:
                train += index_set
            elif len(val) + len(index_set) <= val_size:
                val += index_set
            else:
                test += index_set

    return {
        'train': df.iloc[train].reset_index(drop=True),
        'valid': df.iloc[val].reset_index(drop=True),
        'test': df.iloc[test].reset_index(drop=True)
    }
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

##### MolObjectHandling.py
import __future__

import rdkit
from rdkit import Chem

# Disable the unnecessary RDKit warnings
from rdkit import RDLogger

RDLogger.DisableLog("rdApp.*")


def check_sanitization(mol):
    """
    Given a rdkit.Chem.rdchem.Mol this script will sanitize the molecule.
    It will be done using a series of try/except statements so that if it fails it will return a None
    rather than causing the outer script to fail.

    Nitrogen Fixing step occurs here to correct for a common RDKit valence error in which Nitrogens with
        with 4 bonds have the wrong formal charge by setting it to -1.
        This can be a place to add additional correcting features for any discovered common sanitation failures.

    Handled here so there are no problems later.

    Inputs:
Example #29
0
File: fixer.py Project: oddt/oddt
def UFFConstrainedOptimize(mol, moving_atoms=None, fixed_atoms=None,
                           cutoff=5., verbose=False):
    """Minimize a molecule using UFF forcefield with a set of moving/fixed
    atoms. If both moving and fixed atoms are provided, fixed_atoms parameter
    will be ignored.  The minimization is done in-place (without copying
    molecule).

    Parameters
    ----------
        mol: rdkit.Chem.rdchem.Mol
            Molecule to be minimized.
        moving_atoms: array-like (default=None)
            Indices of freely moving atoms. If None, fixed atoms are assigned
            based on `fixed_atoms`. These two arguments are mutually exclusive.
        fixed_atoms: array-like (default=None)
            Indices of fixed atoms. If None, fixed atoms are assigned based on
            `moving_atoms`. These two arguments are mutually exclusive.
        cutoff: float (default=5.)
            Distance cutoff for the UFF minimization. Also used to select the
            atoms around `moving_atoms` that form the minimized submolecule.
        verbose: bool (default=False)
            If False, the RDKit log level is set to CRITICAL for the duration
            of the call and set to INFO afterwards.

    Returns
    -------
        mol: rdkit.Chem.rdchem.Mol
            Molecule with mimimized `moving_atoms`

    Raises
    ------
        ValueError
            If neither `moving_atoms` nor `fixed_atoms` is given.
    """
    # Validate before touching the logger so that a bad call cannot leave
    # RDKit logging silenced.
    if moving_atoms is None and fixed_atoms is None:
        raise ValueError('You must supply at least one set of moving/fixed '
                         'atoms.')

    logger = RDLogger.logger()

    if not verbose:
        logger.setLevel(RDLogger.CRITICAL)

    try:
        if moving_atoms is None:
            all_atoms = set(range(mol.GetNumAtoms()))
            moving_atoms = list(all_atoms.difference(fixed_atoms))
        # set for the per-atom membership test further down
        moving_set = set(moving_atoms)

        # extract submolecules containing atoms within cutoff
        mol_conf = mol.GetConformer(-1)
        pos = np.array([mol_conf.GetAtomPosition(i)
                       for i in range(mol_conf.GetNumAtoms())])
        mask = (cdist(pos, pos[moving_atoms]) <= cutoff).any(axis=1)
        amap = np.where(mask)[0].tolist()

        # expand to whole residues
        pocket_residues = OrderedDict()
        protein_residues = GetResidues(mol)
        for res_id in protein_residues.keys():
            if any(1 for res_aix in protein_residues[res_id]
                   if res_aix in amap):
                pocket_residues[res_id] = protein_residues[res_id]
        amap = list(chain(*pocket_residues.values()))

        # TODO: above a certain threshold, making a submol is redundant
        submol = AtomListToSubMol(mol, amap, includeConformer=True)
        # initialize ring info
        Chem.GetSSSR(submol)
        ff = UFFGetMoleculeForceField(submol, vdwThresh=cutoff,
                                      ignoreInterfragInteractions=False)
        # every submolecule atom that is not explicitly moving stays fixed
        for submol_id, atom_id in enumerate(amap):
            if atom_id not in moving_set:
                ff.AddFixedPoint(submol_id)
        ff.Initialize()
        ff.Minimize(energyTol=1e-4, forceTol=1e-3, maxIts=2000)

        # copy the minimized positions back into the parent molecule
        conf = mol.GetConformer(-1)
        submol_conf = submol.GetConformer(-1)
        for submol_idx, mol_idx in enumerate(amap):
            conf.SetAtomPosition(mol_idx,
                                 submol_conf.GetAtomPosition(submol_idx))
    finally:
        # FIXME: there's no getLevel method, so we set to default level
        if not verbose:
            logger.setLevel(RDLogger.INFO)

    return mol
Example #30
0
    # Output is either a fixed name in an output directory
    # or a prefixed filename (without an output directory)
    if args.output_is_prefix:
        output_filename = '{}.{}.gz'.format(args.output, output_filename)
    else:
        # Create the output directory
        if os.path.exists(args.output):
            logger.error('Output exists')
            sys.exit(1)
        os.mkdir(args.output)
        os.chmod(args.output, 0o777)
        output_filename = os.path.join(args.output,
                                       '{}.gz'.format(output_filename))

    # Suppress basic RDKit logging...
    RDLogger.logger().setLevel(RDLogger.ERROR)

    # Report any limiting...?
    if args.limit:
        logger.warning('Limiting processing to first {:,} molecules'.format(
            args.limit))

    # Before we open the output file
    # get a list of all the input files (the prefix may be the same,
    # so we don't want our file in the list of files to be processed)
    data_files = glob.glob('{}/{}*.gz'.format(args.vendor_dir,
                                              args.vendor_prefix))

    # Open the file we'll write the standardised data set to.
    # A text, tab-separated file.
    logger.info('Writing %s...', output_filename)
def split_sdf(file_name, outdir="data/"):
    """Load ligand/PDB records from an SDF or CSV file and write one .lig
    file per record.

    Args:
        file_name: path of the input file. A name containing ".sdf" is read
            with PandasTools.LoadSDF; a name containing ".csv" is read as
            comma-separated text whose first line is a header.
        outdir: directory under which ``lig/lig<index>.lig`` files are
            written.

    Returns:
        (uligs, prots_ligs): ``uligs`` maps the row index to the selected
        row; ``prots_ligs`` maps each PDB id to the list of row indices that
        mention it.

    Raises:
        ValueError: if ``file_name`` contains neither ".sdf" nor ".csv".
    """
    if ".sdf" not in file_name and ".csv" not in file_name:
        raise ValueError("Unsupported input file (expected .sdf or .csv): "
                         + str(file_name))

    if ".sdf" in file_name:
        print("Loading sdf.")

        rdk_lg = RDLogger.logger()
        rdk_lg.setLevel(RDLogger.CRITICAL)
        # The original referenced an undefined name (sdf_file_name) here.
        df = PandasTools.LoadSDF(file_name,
                                 smilesName='SMILES',
                                 molColName='Molecule',
                                 includeFingerprints=False)

    if ".csv" in file_name:
        print("Loading CSV.")
        # Parse the CSV file.
        rdk_lg = RDLogger.logger()
        rdk_lg.setLevel(RDLogger.CRITICAL)
        with open(file_name, "r") as csvf:
            pdb_list = [line.split(",") for line in csvf.read().split("\n")]
        # list.append() returns None, so build the column list explicitly.
        df = pd.DataFrame(columns=pdb_list[0] + ['Molecule'])
        # Skip the header line and the last element of the split.
        for pdb in pdb_list[1:-1]:
            print("pdb=", pdb)
            # Appending a dict requires ignore_index=True.
            # NOTE(review): DataFrame.append was removed in pandas 2.0.
            df = df.append({'PDB ID': pdb}, ignore_index=True)
    print("Raw cols = ", [str(x) for x in df.columns])
    # Select only the needed columns and merge the two PDB cols.
    #df_list=['PDB ID(s) for Ligand-Target Complex','PDB ID(s) of Target Chain','SMILES','IC50 (nM)','Molecule']
    df_list = ['PDB ID']
    df_selected = df[df_list].copy()
    #df_selected["PDB IDs"] = df_selected['PDB ID(s) for Ligand-Target Complex'] + ',' + df_selected['PDB ID(s) of Target Chain']
    print("Selected cols = ", [str(x) for x in df_selected.columns])
    #df_selected = df_selected[ ["PDB IDs"] + df_list[2:] ]
    # Drop any rows with missing data.
    df_selected = df_selected.replace('', np.nan)
    df_selected = df_selected.replace(',', np.nan)
    df_selected = df_selected.dropna()
    r_rows = len(df.index)
    s_rows = len(df_selected.index)
    print("Raw rows = ", r_rows)
    print("Sel rows = ", s_rows)
    print("Keep pct = %.2f%s" %
          (((float(s_rows) / float(r_rows)) * 100.0), '%'))
    # Build ligand dictionary and a protein dictionary.
    print("Building protein-ligand dictionary.")
    uligs = {}
    prots_ligs = {}
    for lndx, row in enumerate(df_selected.values):
        #print("row[0]=",row[0])
        # row[0] is the 'PDB ID' cell; its first element may hold several
        # comma-separated ids.
        pdbs = row[0][0].split(',')
        for pdb in pdbs:
            if pdb == '':
                continue
            if pdb not in prots_ligs:
                prots_ligs[pdb] = []
            prots_ligs[pdb] += [lndx]
        uligs[lndx] = row
    print("Unique proteins = ", len(prots_ligs))
    print("Writing per-ligand output files.")
    # Write out .lig files and return the data dictionaries.
    for key in uligs:
        ndx = str(key)
        lig = uligs[key]
        # NOTE(review): df_selected holds the single column 'PDB ID', so each
        # row has one element and lig[2] is out of range. The index looks like
        # a leftover from the wider column selection commented out above -
        # confirm which field is meant.
        print("writing ligand indexed by ", lig[2], "ndx=", ndx)
        write_lig_file(lig[2], outdir + "/lig/lig%s.lig" % ndx)
    return uligs, prots_ligs
Example #32
0
    def process(self):
        """Build the processed dataset and save it to
        ``self.processed_paths[0]``.

        Without rdkit, ``self.raw_paths[0]`` is loaded as a pre-processed list
        of dicts, turned into ``Data`` objects, filtered/transformed and
        saved.

        With rdkit, the molecules are read from the SD file
        ``self.raw_paths[0]``, the targets from the comma-separated file
        ``self.raw_paths[1]`` and the ids of molecules to leave out from
        ``self.raw_paths[2]``. For every remaining molecule a ``Data`` object
        is built with atom features ``x``, atomic numbers ``z``, positions
        ``pos``, a bidirectional ``edge_index`` with one-hot ``edge_attr``,
        the target row ``y``, the molecule name and its index.
        """
        try:
            import rdkit
            from rdkit import Chem, RDLogger
            from rdkit.Chem.rdchem import BondType as BT
            from rdkit.Chem.rdchem import HybridizationType
            RDLogger.DisableLog('rdApp.*')

        except ImportError:
            rdkit = None

        if rdkit is None:
            print(("Using a pre-processed version of the dataset. Please "
                   "install 'rdkit' to alternatively process the raw data."),
                  file=sys.stderr)

            data_list = torch.load(self.raw_paths[0])
            data_list = [Data(**data_dict) for data_dict in data_list]

            if self.pre_filter is not None:
                data_list = [d for d in data_list if self.pre_filter(d)]

            if self.pre_transform is not None:
                data_list = [self.pre_transform(d) for d in data_list]

            torch.save(self.collate(data_list), self.processed_paths[0])
            return

        types = {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4}
        bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}

        with open(self.raw_paths[1], 'r') as f:
            # Drop the header line and the last element of the split.
            target = f.read().split('\n')[1:-1]
            # Columns 1..19 of every line are the numeric targets.
            target = [[float(x) for x in line.split(',')[1:20]]
                      for line in target]
            target = torch.tensor(target, dtype=torch.float)
            # Move the first three target columns to the end.
            target = torch.cat([target[:, 3:], target[:, :3]], dim=-1)
            target = target * conversion.view(1, -1)

        with open(self.raw_paths[2], 'r') as f:
            # The first token of each kept line is an id; subtracting 1 turns
            # it into the index used by the supplier loop below. A set keeps
            # the per-molecule membership test O(1).
            skip = {int(x.split()[0]) - 1
                    for x in f.read().split('\n')[9:-2]}

        suppl = Chem.SDMolSupplier(self.raw_paths[0],
                                   removeHs=False,
                                   sanitize=False)

        data_list = []
        for i, mol in enumerate(tqdm(suppl)):
            if i in skip:
                continue

            N = mol.GetNumAtoms()

            # Atom coordinates are parsed from the raw record text: N lines
            # starting at line 4, first three fields each.
            pos = suppl.GetItemText(i).split('\n')[4:4 + N]
            pos = [[float(x) for x in line.split()[:3]] for line in pos]
            pos = torch.tensor(pos, dtype=torch.float)

            type_idx = []
            atomic_number = []
            aromatic = []
            sp = []
            sp2 = []
            sp3 = []
            for atom in mol.GetAtoms():
                type_idx.append(types[atom.GetSymbol()])
                atomic_number.append(atom.GetAtomicNum())
                aromatic.append(1 if atom.GetIsAromatic() else 0)
                hybridization = atom.GetHybridization()
                sp.append(1 if hybridization == HybridizationType.SP else 0)
                sp2.append(1 if hybridization == HybridizationType.SP2 else 0)
                sp3.append(1 if hybridization == HybridizationType.SP3 else 0)

            z = torch.tensor(atomic_number, dtype=torch.long)

            # Every bond is stored in both directions.
            row, col, edge_type = [], [], []
            for bond in mol.GetBonds():
                start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
                row += [start, end]
                col += [end, start]
                edge_type += 2 * [bonds[bond.GetBondType()]]

            edge_index = torch.tensor([row, col], dtype=torch.long)
            edge_type = torch.tensor(edge_type, dtype=torch.long)
            edge_attr = F.one_hot(edge_type,
                                  num_classes=len(bonds)).to(torch.float)

            # Sort the edges by (source, target).
            perm = (edge_index[0] * N + edge_index[1]).argsort()
            edge_index = edge_index[:, perm]
            edge_type = edge_type[perm]
            edge_attr = edge_attr[perm]

            # Number of hydrogen neighbours of every atom.
            row, col = edge_index
            hs = (z == 1).to(torch.float)
            num_hs = scatter(hs[row], col, dim_size=N).tolist()

            x1 = F.one_hot(torch.tensor(type_idx), num_classes=len(types))
            x2 = torch.tensor([atomic_number, aromatic, sp, sp2, sp3, num_hs],
                              dtype=torch.float).t().contiguous()
            x = torch.cat([x1.to(torch.float), x2], dim=-1)

            y = target[i].unsqueeze(0)
            name = mol.GetProp('_Name')

            data = Data(x=x,
                        z=z,
                        pos=pos,
                        edge_index=edge_index,
                        edge_attr=edge_attr,
                        y=y,
                        name=name,
                        idx=i)

            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)

            data_list.append(data)

        torch.save(self.collate(data_list), self.processed_paths[0])
def _sanitize_ligand_name(raw_name):
    """Turn a raw ligand identifier into a token usable inside a file name.

    Removes ';', '%', '/' and '?', turns spaces into '-', and keeps only
    the text before the first newline.
    """
    name = str(raw_name)
    for old, new in ((';', ''), (' ', '-'), ('%', ''), ('/', ''), ('?', '')):
        name = name.replace(old, new)
    # NOTE(review): a later regex clean-up step was dropped here because its
    # pattern required a literal '%', which is already stripped above, so it
    # could never match. Confirm whether a broader "strip everything that is
    # not alphanumeric" rule was intended.
    return name.split('\n')[0]


def split_pdb_with_sdf(pdb_id, sdf_file_name, outdir="data/"):
    """Pair every usable ligand of an SDF file with one PDB ID and write it out.

    The SDF is loaded into a DataFrame, reduced to the columns
    'PDB ID', 'BindingDB Ligand Name', 'ChEMBL ID of Ligand' and 'Molecule',
    de-duplicated, and stripped of rows with missing, empty or ',' values.
    Each remaining ligand whose atom count lies strictly between
    MIN_LIG_ATMS and MAX_LIG_ATMS is recorded and handed to
    ``write_lig_file``.

    Side effects: sets the RDKit logger level to CRITICAL, prints progress to
    stdout, updates the module-level ``MOL_TO_NDX`` mapping
    (``str(row position) -> sanitized ligand name``) and calls
    ``write_lig_file`` once per kept ligand.

    Args:
        pdb_id: PDB identifier assigned to every ligand in the SDF file. An
            empty string disables ligand selection.
        sdf_file_name: Path of the SDF file holding the candidate ligands.
        outdir: Prefix of the output location; files are written to
            ``outdir + "/lig/lig<name>.lig"``.

    Returns:
        A tuple ``(uligs, protligs)``. ``uligs`` maps the row position in the
        filtered table to that row's values (PDB ID, ligand name, ChEMBL ID,
        molecule). ``protligs`` maps ``pdb_id`` to the list of kept row
        positions.
    """
    print("Loading sdf from ", sdf_file_name)

    rdk_lg = RDLogger.logger()
    rdk_lg.setLevel(RDLogger.CRITICAL)
    df = PandasTools.LoadSDF(sdf_file_name,
                             smilesName='SMILES',
                             molColName='Molecule',
                             includeFingerprints=False,
                             embedProps=True)
    print("Available SDF cols = ", [str(x) for x in df.columns])
    PandasTools.AddMoleculeColumnToFrame(df,
                                         'SMILES',
                                         'Molecule',
                                         includeFingerprints=False)
    df.insert(column="PDB ID", value=pdb_id, loc=0)

    # Keep only the columns needed downstream; the positional indices used
    # below (row[2] = ChEMBL ID, row[3] = molecule) depend on this order.
    df_sdf_list = [
        'PDB ID', 'BindingDB Ligand Name', 'ChEMBL ID of Ligand', 'Molecule'
    ]
    df_selected = df[df_sdf_list].copy()
    print("Selected SDF cols = ", [str(x) for x in df_selected.columns])

    print("Loading compounds for test against PDB ID = ", pdb_id)
    # Diagnostic listing of the ChEMBL ligands that are small enough.
    # The ID is matched as a regular expression (it used to be compared as a
    # literal substring, which never matched), and no placeholder rows are
    # appended any more: they contained only the PDB ID and were removed by
    # the dropna() below, and DataFrame.append no longer exists in pandas 2.
    chembl_id = re.compile(r'CHEMBL[0-9]+')
    for name, mol in zip(df_selected['ChEMBL ID of Ligand'],
                         df_selected['Molecule']):
        if not hasattr(mol, 'GetNumAtoms'):
            # Missing molecule (e.g. unparsable SMILES); dropped below.
            continue
        if mol.GetNumAtoms() < MAX_LIG_ATMS and chembl_id.search(str(name)):
            print("pdb=", pdb_id, ",Molecule ID = ", name)

    # Drop duplicates and every row with a missing, empty or ',' value.
    df_selected = df_selected.drop_duplicates()
    print("Selected PDB cols = ", [str(x) for x in df_selected.columns])
    df_selected = df_selected.replace('', np.nan)
    df_selected = df_selected.replace(',', np.nan)
    df_selected = df_selected.dropna()

    r_rows = len(df.index)
    s_rows = len(df_selected.index)
    print("Raw rows = ", r_rows)
    print("Sel rows = ", s_rows)
    # An empty SDF must not abort the run with a ZeroDivisionError.
    keep_pct = (float(s_rows) / float(r_rows)) * 100.0 if r_rows else 0.0
    print("Keep pct = %.2f%s" % (keep_pct, '%'))

    # Build ligand dictionary and a protein dictionary.
    print("Building protein-ligand dictionary.")
    uligs = {}
    prots_ligs = {}
    for lndx, row in enumerate(df_selected.values):
        print("prot_lig row=", row)
        if pdb_id == '':
            continue
        prots_ligs.setdefault(pdb_id, [])
        ligname = _sanitize_ligand_name(row[2])
        n_atoms = row[3].GetNumAtoms()
        if (MIN_LIG_ATMS < n_atoms < MAX_LIG_ATMS
                and not ligname.isspace()):
            prots_ligs[pdb_id].append(lndx)
            MOL_TO_NDX.update({str(lndx): ligname})
            uligs[lndx] = row

    print("Unique proteins = ", len(prots_ligs))
    print("Writing per-ligand output files.")
    # Write out .lig files and return the data dictionaries.
    return_protligs = {pdb_id: []}
    for key, lig in uligs.items():
        ndx = str(key)
        ligname = _sanitize_ligand_name(lig[2])
        return_protligs[pdb_id].append(key)
        print("lig name=", ligname)
        print("lig # atoms: ", lig[3].GetNumAtoms())
        print("lig=", lig)
        print("writing ligand for PDB=:", lig[0], ", #atoms:",
              lig[3].GetNumAtoms(), ", name: ", ligname, "ndx name",
              MOL_TO_NDX[ndx], ", @ index=", ndx)
        write_lig_file(lig[3], outdir + "/lig/lig%s.lig" % ligname)

    return uligs, return_protligs
Example #34
0
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

# Module-level flag set unconditionally to True; it is assigned before the
# rdinchi import, so it is only observable when that import succeeds.
# NOTE(review): presumably read by callers to detect InChI support - confirm.
INCHI_AVAILABLE = True

import rdinchi
import logging
from rdkit import RDLogger
# RDKit logger handle; its info/debug/warning/critical/error methods are the
# values of logLevelToLogFunctionLookup defined right below.
logger = RDLogger.logger()

# Dispatch table: stdlib ``logging`` level constant -> bound method of the
# RDKit ``logger`` that reports a message at that severity.
logLevelToLogFunctionLookup = dict(
    (level, getattr(logger, method_name))
    for level, method_name in (
        (logging.INFO, 'info'),
        (logging.DEBUG, 'debug'),
        (logging.WARNING, 'warning'),
        (logging.CRITICAL, 'critical'),
        (logging.ERROR, 'error'),
    ))

class InchiReadWriteError(Exception):
    """Error type of this InChI module; adds nothing to ``Exception``."""

def MolFromInchi(inchi, sanitize=True, removeHs=True, logLevel=None,
        treatWarningAsError=False):
    """Construct a molecule from a InChI string
Example #35
0
from argparse import ArgumentParser
from molgym.agents.moldqn import DQNFinalState
from molgym.agents.preprocessing import MorganFingerprints
from molgym.envs.actions import MoleculeActions
from molgym.envs.rewards import RewardFunction
from molgym.envs.simple import Molecule
from molgym.envs.rewards.mpnn import MPNNReward
from molgym.utils.conversions import convert_nx_to_smiles, convert_smiles_to_nx
from molgym.mpnn.layers import custom_objects
from tensorflow.keras.models import load_model

# Set up the logger
# Root logging configuration: every record is rendered as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
# Application logger, named 'RL-Logger', with its threshold lowered to DEBUG.
logger = logging.getLogger('RL-Logger')
logger.setLevel(logging.DEBUG)
# Raise the RDKit logger threshold to CRITICAL to silence its lower-severity
# output.
rdkit_logger = RDLogger.logger()
rdkit_logger.setLevel(RDLogger.CRITICAL)


def get_platform_info():
    """Describe the machine and interpreter executing this process.

    Returns:
        dict: the keys 'processor', 'python_version', 'python_compiler',
        'hostname', 'os', 'cpu_name' and 'n_cores', each holding the value
        reported by the matching ``platform`` / ``os`` query.
    """
    # (result key, zero-argument query) pairs, evaluated in this order.
    probes = (
        ('processor', platform.machine),
        ('python_version', platform.python_version),
        ('python_compiler', platform.python_compiler),
        ('hostname', platform.node),
        ('os', platform.platform),
        ('cpu_name', platform.processor),
        ('n_cores', os.cpu_count),
    )
    return {key: query() for key, query in probes}