def descriptor_generator(CID):
    """Count functional-group occurrences for the compound identified by CID.

    The compound's SMILES is obtained from ``SMILES(CID)`` (helper defined
    elsewhere), parsed with ``readstring("smi", ...)``, and every SMARTS
    pattern listed below is matched against the resulting molecule.

    Returns a one-row ``pandas.DataFrame`` whose columns follow the order of
    the SMARTS list; each cell holds the number of matches of that pattern.
    """
    # SMARTS definitions of the functional groups, in output-column order.
    functional_group_smarts = [
        "[H]", "[CX4H3]", "[CX4H2]", "[CX4H1]", "[CX4H0]",
        "[CX3H2]", "[CX3H1]", "[CX3H0]", "[CX2H1]", "[CX2H0]",
        "[CX4H2R]", "[CX4H1R]", "[CX4H0R]", "[CX3H1R]", "[CX3H0R]",
        "[cX3H1](:*):*", "[cX3H0](:*)(:*)*",
        "[OX2H1]", "[OX2H1][cX3]:[c]", "[OX2H0]", "[OX2H0R]",
        "[oX2H0](:*):*",
        "[CX3H0]=[O]", "[CX3H0R]=[O]", "[CX3H1]=[O]",
        "[CX3H0](=[O])[OX2H1]", "[CX3H0](=[O])[OX2H0]",
        "[cX3H0](:*)(:*):*",
    ]

    # Resolve the compound and load it as a molecule.
    compound_smiles = SMILES(CID)
    molecule = readstring("smi", compound_smiles)

    # One count per pattern: findall() yields one entry per match.
    match_counts = [
        len(Smarts(pattern).findall(molecule))
        for pattern in functional_group_smarts
    ]

    # Shape (1, n_patterns): a single sample row.
    return pd.DataFrame(np.array(match_counts).reshape(1, -1))
def Group_Filter_Aggregators(self, list_Groups2Filter):
    '''
    Computes the filtering of the "Aggregators" compounds from
    McGovern, "A Common Mechanism Underlying Promiscuous Inhibitors from
    Virtual and High-Throughput Screening, J.Med.Chem,2002,45,1712-1722."
    Uses the dico_Aggregators from SmartsCodes module, and the SMARTS
    pattern methods from pybel.

    list_Groups2Filter is an iterable of (rule, group) pairs; the rule is
    not used by this filter. Groups absent from dico_Aggregators are
    ignored. For every known group the number of matches in self.molecule
    is stored in self.groups_tested[group]; at least one match flags the
    molecule as toxic.

    Returns the tuple
    (self.toxic, self.functionnal_group, self.groups_tested, self.criterion).
    '''
    aggregator_smarts = SmartsCodes.dico_Aggregators
    for _rule, group in list_Groups2Filter:
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if group not in aggregator_smarts:
            continue
        matches = Smarts(aggregator_smarts[group]).findall(self.molecule)
        amount = len(matches)
        self.groups_tested[group] = amount
        if amount:
            self.criterion.append('Aggregator ' + group)
            self.toxic = 'Toxic'
            self.functionnal_group = True
    return (self.toxic, self.functionnal_group, self.groups_tested,
            self.criterion)
def Group_Filter_Warhead(self, list_Groups2Filter):
    '''
    Computes the filtering of the "Warheads" compounds from Rishton,
    "Rishton GM, Reactive compounds and in vitro false positives in HTS,
    DDT Vol.2,No.9 September 1997."
    Uses the dico_Warheads from SmartsCodes module, and the SMARTS pattern
    methods from pybel.

    list_Groups2Filter is an iterable of (rule, group) pairs; the rule is
    not used by this filter. Groups absent from dico_Warheads are ignored.
    For every known group the number of matches in self.molecule is stored
    in self.groups_tested[group]; at least one match flags the molecule as
    toxic.

    Returns the tuple
    (self.toxic, self.functionnal_group, self.groups_tested, self.criterion).
    '''
    warhead_smarts = SmartsCodes.dico_Warheads
    for _rule, group in list_Groups2Filter:
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if group not in warhead_smarts:
            continue
        matches = Smarts(warhead_smarts[group]).findall(self.molecule)
        amount = len(matches)
        self.groups_tested[group] = amount
        if amount:
            self.criterion.append('Warhead ' + group)
            self.toxic = 'Toxic'
            self.functionnal_group = True
    return (self.toxic, self.functionnal_group, self.groups_tested,
            self.criterion)
def Group_Filter_Chemicals_Group(self, list_Groups2Filter):
    '''
    Computes the filtering of the others chemicals groups known to be
    toxic.
    Uses the dico_Chemicals_Groups from SmartsCodes module, and the SMARTS
    pattern methods from pybel.

    list_Groups2Filter is an iterable of (rule, group) pairs, where rule
    is convertible with int() and is the highest number of occurrences
    tolerated for that group. Groups absent from dico_Chemicals_Groups are
    ignored. For every known group the number of matches in self.molecule
    is stored in self.groups_tested[group]; strictly more matches than the
    rule allows flags the molecule as toxic.

    Returns the tuple
    (self.toxic, self.functionnal_group, self.groups_tested, self.criterion).
    Raises whatever int() raises when a rule is not a valid integer.
    '''
    chemical_group_smarts = SmartsCodes.dico_Chemicals_Groups
    for rule, group in list_Groups2Filter:
        # Converted before the lookup (as before), so a malformed rule is
        # reported even when its group is unknown.
        max_allowed = int(rule)
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if group not in chemical_group_smarts:
            continue
        matches = Smarts(chemical_group_smarts[group]).findall(self.molecule)
        amount = len(matches)
        self.groups_tested[group] = amount
        if amount > max_allowed:
            self.criterion.append('Chemical Group ' + group)
            self.toxic = 'Toxic'
            self.functionnal_group = True
    return (self.toxic, self.functionnal_group, self.groups_tested,
            self.criterion)
def Group_Filter_Frequent_Hitters(self, list_Groups2Filter):
    '''
    Computes the filtering of the "Frequents Hitters" compounds from
    Roche, "Roche O., Development of a Virtual Screening Method for
    Identification of Frequent Hitters in Compound Libraries, J.Med.Chem.,
    2002,45,137-142."
    Uses the dico_Frequent_Hitters from SmartsCodes module, and the SMARTS
    pattern methods from pybel.

    list_Groups2Filter is an iterable of (rule, group) pairs; the rule is
    not used by this filter. Groups absent from dico_Frequent_Hitters are
    ignored. For every known group the number of matches in self.molecule
    is stored in self.groups_tested[group]; at least one match flags the
    molecule as toxic.

    Returns the tuple
    (self.toxic, self.functionnal_group, self.groups_tested, self.criterion).
    '''
    frequent_hitter_smarts = SmartsCodes.dico_Frequent_Hitters
    for _rule, group in list_Groups2Filter:
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if group not in frequent_hitter_smarts:
            continue
        matches = Smarts(frequent_hitter_smarts[group]).findall(self.molecule)
        amount = len(matches)
        self.groups_tested[group] = amount
        if amount:
            self.criterion.append('Frequent Hitter ' + group)
            self.toxic = 'Toxic'
            self.functionnal_group = True
    return (self.toxic, self.functionnal_group, self.groups_tested,
            self.criterion)
def __init__(self, smilesFrag):
    """Build a fragment from a SMILES string containing Break markers.

    Break markers are first replaced by Asterisk and stripped (via
    _removedAtom) to obtain the SMILES that is parsed into self.mol; the
    SMARTS used for matching is that SMILES with WildCard stripped as well.
    Break, Asterisk and WildCard are module-level names defined elsewhere.
    """
    # First attempt: break points written as Asterisk.
    asterisk_smiles = smilesFrag.replace(Break, Asterisk)
    self.smiles = asterisk_smiles
    self._molSmiles = self._removedAtom(asterisk_smiles, Asterisk)
    self.mol = readstring('smi', self._molSmiles)

    # Atom count of the parsed molecule minus the 'H' and WildCard
    # occurrences found in the SMILES text.
    wildcard_hits = self._molSmiles.count(WildCard)
    hydrogen_hits = self._molSmiles.count('H')
    self.atoms = len(self.mol.atoms) - wildcard_hits - hydrogen_hits

    self.smartsString = self._removedAtom(self._molSmiles, WildCard)
    self._smarts = Smarts(self.smartsString)

    # The fragment is consistent when its own SMARTS matches its molecule
    # and the number of [nH] matches equals the number of [nH] tokens.
    consistent = (
        self.match(self.mol)
        and len(Fragment._nh.findall(self.mol))
        == self.smartsString.count(Fragment._nhString)
    )
    if not consistent:
        # Fallback: break points written as WildCard. Only smiles,
        # _molSmiles and mol are rebuilt; atoms, smartsString and _smarts
        # keep the values computed above.
        self.smiles = smilesFrag.replace(Break, WildCard)
        self._molSmiles = self._removedAtom(self.smiles, Asterisk)
        self.mol = readstring('smi', self._molSmiles)

    self.cansmiles = Fragment._converter.getSmiles(self.mol)
    self._fingerprint = None
    self.target = None
    self._childs = set()
class WyzIndexer(Fingerprint):
    """Structure-key indexer.

    Turns a SMILES string into a fixed-length list of 23 integer counts
    (atom count, heavy-valence histogram, chain, methyl-branch, ring-branch
    and ring features) obtained from the SMARTS matchers defined below.
    """
    name = 'wyz'

    # chain
    Chain4_Matcher = Smarts('CCCC')
    Chain6_Matcher = Smarts('CCCCCC')

    # methyl
    C3M1_Matcher = Smarts('[CH3][CH]([!H3])[!H3]')
    C3M2_Matcher = Smarts('[CH3][CH]([CH3])[!H3]')
    C4M1_Matcher = Smarts('[CH3]C([!H3])([!H3])[!H3]')
    C4M2_Matcher = Smarts('[!H3]C([CH3])([CH3])[!H3]')
    C4M3_Matcher = Smarts('[CH3]C([CH3])([CH3])[!H3]')

    # ring branches
    RB1_Matcher = Smarts('[RH1]([!H3])')
    RB2_Matcher = Smarts('[RH0]([!H3])([!H3])')
    RM1_Matcher = Smarts('[RH]([CH3])')
    RM2_Matcher = Smarts('[R]([CH3])([CH3])')

    # small rings
    R3_Matcher = Smarts('C1CC1')
    R4_Matcher = Smarts('C1CCC1')
    R8_Matcher = Smarts('C1CCCCCCC1')

    # special rings
    Fuse_Matcher = Smarts('[R2][R2]')
    Bridge_Matcher = Smarts('[R1][R1]([R1])[R1]([R1])[R1]')
    Bridge3_Matcher = Smarts('[R3]')

    def __init__(self, *args):
        # Positional arguments are accepted but not forwarded to the base.
        super().__init__()

    def _index_smiles(self, smiles):
        """Return the list of 23 structure-key counts for one SMILES."""
        molecule = pybel.readstring('smi', smiles)
        heavy_valences = np.array([atom.heavyvalence
                                   for atom in molecule.atoms])

        def non_overlapping(matches):
            # Keep a match only when it shares no atom index with the
            # matches already kept (first come, first served).
            kept = []
            used_atoms = set()
            for match in matches:
                if used_atoms.intersection(match):
                    continue
                kept.append(match)
                used_atoms.update(match)
            return kept

        def count(matcher):
            return len(matcher.findall(molecule))

        chain4 = non_overlapping(WyzIndexer.Chain4_Matcher.findall(molecule))
        chain6 = non_overlapping(WyzIndexer.Chain6_Matcher.findall(molecule))

        # 0: NC, number of atoms in the parsed molecule
        keys = [len(molecule.atoms)]
        # 1-4: C4, C3, C2, C1 -- atoms with heavy valence 4, 3, 2, 1
        keys.extend(len(heavy_valences[heavy_valences == degree])
                    for degree in (4, 3, 2, 1))
        # 5-6: non-overlapping chain4 / chain6 matches
        keys.append(len(chain4))
        keys.append(len(chain6))
        # 7-15: methyl patterns, then ring-branch patterns
        for matcher in (
            WyzIndexer.C3M1_Matcher,   # 7
            WyzIndexer.C3M2_Matcher,   # 8
            WyzIndexer.C4M1_Matcher,   # 9
            WyzIndexer.C4M2_Matcher,   # 10
            WyzIndexer.C4M3_Matcher,   # 11
            WyzIndexer.RB1_Matcher,    # 12
            WyzIndexer.RB2_Matcher,    # 13
            WyzIndexer.RM1_Matcher,    # 14
            WyzIndexer.RM2_Matcher,    # 15
        ):
            keys.append(count(matcher))
        # 16: Ring, size of molecule.sssr
        keys.append(len(molecule.sssr))
        # 17-22: small and special rings
        for matcher in (
            WyzIndexer.R3_Matcher,       # 17 R3
            WyzIndexer.R4_Matcher,       # 18 R4
            WyzIndexer.R8_Matcher,       # 19 R8
            WyzIndexer.Fuse_Matcher,     # 20 Fuse
            WyzIndexer.Bridge_Matcher,   # 21 Bridge
            WyzIndexer.Bridge3_Matcher,  # 22 Bridge3
        ):
            keys.append(count(matcher))
        return keys

    def index(self, smiles):
        """Return the structure keys of one SMILES as a numpy array."""
        keys = self._index_smiles(smiles)
        return np.array(keys)

    def index_list(self, smiles_list):
        """Return a list with one index array per SMILES in smiles_list."""
        indexed = []
        for smiles in smiles_list:
            indexed.append(self.index(smiles))
        return indexed
#! /usr/bin/env python #import pybel from pybel import Smarts ################################################################################### Pattern_HBD = Smarts("[!#6;!H0]") Pattern_HBA = Smarts("[$([$([#8,#16]);" + "X1,X2]),$([#7X2;#7v3;" + "!$([nH]);!$(*(-a)-a)])]") ################################################################################### ################################################################################### list_allowed_atoms = [1,6,7,8,9,15,16,17,35,53] ###################= [H,C,N,O,F,P,S,Cl,Br,I] list_non_allowed_atoms = [2,3,4,5,10,11,12,13,14,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,54,55,56,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,57,58,59,60,61,62,63,64,65,66,67,68,69,70,89,90,91,92,93,94,95,96,97,98,99,100,101,102] ################################################################################### dico_Warheads = {'Sulfonyl_halide':'[SD4](=O)(=O)[F,Br,I,Cl]', 'Acyl_halide':'[F,Cl,Br,I]C(=O)', 'Alkyl_halide':'[Cl,Br,I][CH2]', 'Anhydride':'O=[CD3]([*])[OD2][CD3](=O)[*]', '1_2_dicarbonyl':'O=[CD3](*)[CD3](=O)[*]', 'Perhaloketone':'[F,Br,I,Cl][CX4]([F,Br,I,Cl])([F,Br,I,Cl])[CX3](=O)C', 'Aliphatic_ketone':'O=C(C)C([#1])([#1])[#1]', 'Epoxide':'[OX2r3]1[#6r3][#6r3]1', 'Aziridine':'[#7]1CC1', 'Aliphatic_ester':'*C(=O)[OX2]', 'Aliphatic_thioester':'CC(=O)S', 'Sufonate_ester':'C[SD4](=O)(=O)[OD2]', 'Phosphonate_ester':'C[PD4](O)(=O)[OD2]', 'Imine':'[NX2;$([N][#6]),$([NH]);!$([N][CX3]=[#7,#8,#15,#16])]=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])]', 'Aldehyde':'[*][CD3]([#1])=[OD1]', 'Michael_acceptor':'[OD1]=C(*)C=[CH]([CH3])',
class Fragment:
    """A molecular fragment described by a SMILES/SMARTS pair.

    A fragment can be matched against structures of a dataset and scored
    against a target class (likelihood ratio, precision, recall).
    Break, Asterisk, WildCard, Bonds, Converter and Dataset are names
    defined elsewhere in the module.
    """

    _converter = Converter(can=True)
    _nhString = '[nH]'
    _nh = Smarts(_nhString)
    # Fingerprint bits of biphenyl; see calcfp().
    _biphenylfp = set(readstring('smi', 'c1ccccc1c1ccccc1').calcfp().bits)

    def __init__(self, smilesFrag):
        """Build a fragment from a SMILES string containing Break markers."""
        # First attempt: break points written as Asterisk.
        asterisk_smiles = smilesFrag.replace(Break, Asterisk)
        self.smiles = asterisk_smiles
        self._molSmiles = self._removedAtom(asterisk_smiles, Asterisk)
        self.mol = readstring('smi', self._molSmiles)

        # Atom count of the parsed molecule minus the 'H' and WildCard
        # occurrences found in the SMILES text.
        wildcard_hits = self._molSmiles.count(WildCard)
        hydrogen_hits = self._molSmiles.count('H')
        self.atoms = len(self.mol.atoms) - wildcard_hits - hydrogen_hits

        self.smartsString = self._removedAtom(self._molSmiles, WildCard)
        self._smarts = Smarts(self.smartsString)

        # Consistent when the SMARTS matches its own molecule and the
        # number of [nH] matches equals the number of [nH] tokens.
        consistent = (
            self.match(self.mol)
            and len(Fragment._nh.findall(self.mol))
            == self.smartsString.count(Fragment._nhString)
        )
        if not consistent:
            # Fallback: break points written as WildCard. Only smiles,
            # _molSmiles and mol are rebuilt; atoms, smartsString and
            # _smarts keep the values computed above.
            self.smiles = smilesFrag.replace(Break, WildCard)
            self._molSmiles = self._removedAtom(self.smiles, Asterisk)
            self.mol = readstring('smi', self._molSmiles)

        self.cansmiles = Fragment._converter.getSmiles(self.mol)
        self._fingerprint = None
        self.target = None
        self._childs = set()

    def _removedAtom(self, smiles, atom):
        """Return smiles with `atom` stripped out.

        Removed in order: the atom joined to each bond symbol on either
        side, the bare atom, and empty parentheses. Any empty '[]' left
        behind is finally replaced by WildCard.
        """
        doomed = [token
                  for bond in Bonds
                  for token in (atom + bond, bond + atom)]
        doomed += [atom, '()']
        for token in doomed:
            smiles = smiles.replace(token, '')
        return smiles.replace('[]', WildCard)

    def connect(self, child):
        """Register `child` in this fragment's set of children."""
        self._childs.add(child)

    def calcfp(self):
        """Compute the screening fingerprint, then drop self.mol.

        The fingerprint is stored only when the molecule SMILES holds no
        WildCard and the bits do not include every biphenyl bit.
        """
        if WildCard not in self._molSmiles:
            bits = set(self.mol.calcfp().bits)
            if not bits.issuperset(Fragment._biphenylfp):
                self._fingerprint = bits
        del self.mol

    def matchDataset(self, dataset):
        """Collect into self.hits the dataset structures this fragment matches.

        When children are connected, the search runs over whichever of the
        children's hit sets or `dataset` reports the smallest tot().
        Every hit structure is told about this fragment through
        addSubstructure().
        """
        self.hits = Dataset(dataset.labels)
        if self._childs:
            candidates = [child.hits for child in self._childs]
            candidates.append(dataset)
            searchingset = min(candidates, key=operator.methodcaller('tot'))
        else:
            searchingset = dataset
        for label in dataset.classes:
            found = self._find(searchingset.getStructures(label))
            self.hits.populate(found, label)
        for structure in self.hits.getStructures():
            structure.addSubstructure(self)

    def setTarget(self, label):
        """Store `label` as this fragment's target."""
        self.target = label

    def evaluate(self, dataset, target):
        """Score the fragment's hits against class `target` of `dataset`.

        Sets trueMatches, falseMatches and priority in every case; LR,
        precision and recall stay None when the dataset holds no target
        structure or the fragment has no true match.
        """
        self.LR = self.precision = self.recall = None
        targets = dataset.totClass(target)
        nontargets = dataset.tot() - targets
        true_hits = self.hits.totClass(target)
        self.trueMatches = true_hits
        false_hits = self.hits.tot() - self.trueMatches
        self.falseMatches = false_hits
        self.priority = targets > nontargets

        if not targets or true_hits == 0:
            return

        if false_hits == 0:
            # No false match at all: unbounded ratio, perfect precision.
            self.LR = float('inf')
            self.precision = 1
        else:
            self.LR = true_hits / false_hits * (nontargets / targets)
            self.precision = true_hits / (true_hits + false_hits)
        self.recall = true_hits / targets

    def match(self, mol):
        """Run the fragment SMARTS on `mol` and return the Match() result."""
        return self._smarts.obsmarts.Match(mol.OBMol, True)

    def _find(self, structures):
        """Return the structures, in input order, that the fragment matches.

        When a fingerprint is available, structures whose fingerprint does
        not contain it are rejected without running the SMARTS match.
        """
        return [
            structure
            for structure in structures
            if (not self._fingerprint
                or self._fingerprint.issubset(structure._fingerprint))
            and self.match(structure.mol)
        ]
class CHSKIndexer:
    """Structure-key indexer.

    Turns a molecule into a fixed-length list of 49 integer counts (heavy
    atom count, heavy-valence histogram, chain, methyl-branch, ring,
    alkene, alkyne and benzene features) obtained from the SMARTS matchers
    defined below.
    """

    # chain
    Chain4_Matcher = Smarts('CCCC')
    Chain6_Matcher = Smarts('CCCCCC')

    # methyl
    C3M1_Matcher = Smarts('[CH3][CH]([!H3])[!H3]')
    C3M2_Matcher = Smarts('[CH3][CH]([CH3])[!H3]')
    C4M1_Matcher = Smarts('[CH3]C([!H3])([!H3])[!H3]')
    C4M2_Matcher = Smarts('[!H3]C([CH3])([CH3])[!H3]')
    C4M3_Matcher = Smarts('[CH3]C([CH3])([CH3])[!H3]')

    # ring branches
    RB1_Matcher = Smarts('[R;CH1]([!H3])')
    RB2_Matcher = Smarts('[R;CH0]([!H3])([!H3])')
    RM1_Matcher = Smarts('[R;CH]([CH3])')
    RM2_Matcher = Smarts('[R;C]([CH3])([CH3])')

    # small rings (not part of the index built by _index_molecule)
    R5_Matcher = Smarts('C1CCCC1')

    # special rings
    Fuse_Matcher = Smarts('[C;R2][C;R2]')
    Bridge_Matcher = Smarts('[C;R1][C;R1]([C;R1])[C;R1]([C;R1])[C;R1]')

    # alkenes
    Alkene1_Matcher = Smarts('[CH2]=C')
    Alkene2_Matcher = Smarts('C[CH1]=C')
    Alkene3_Matcher = Smarts('CC(C)=C')
    Alkene4_Matcher = Smarts('c[CH1]=C')
    Alkene5_Matcher = Smarts('cC(C)=C')
    Alkene6_Matcher = Smarts('cC(c)=C')
    rAlkene1_Matcher = Smarts('[CH]1=CCCC1')
    rAlkene2_Matcher = Smarts('[CH0]1=CCCC1')
    cAlkene1_Matcher = Smarts('C=[CH][CH]=C')
    cAlkene2_Matcher = Smarts('C=[CH][CH0]=C')
    cAlkene3_Matcher = Smarts('C=[CH0][CH0]=C')

    # alkynes
    Alkyne1_Matcher = Smarts('[CH]#C')
    Alkyne2_Matcher = Smarts('CC#C')
    Alkyne3_Matcher = Smarts('cC#C')
    Alkyne4_Matcher = Smarts('C=CC#C')

    # benzene rings
    Benzene1_Matcher = Smarts('c1ccccc1')
    Benzene2_Matcher = Smarts('[cH0]1[cH1][cH1][cH1][cH1][cH1]1')
    Benzene3_Matcher = Smarts('[cH0]1[cH0][cH1][cH1][cH1][cH1]1')
    Benzene4_Matcher = Smarts('[cH0]1[cH1][cH0][cH1][cH1][cH1]1')
    Benzene5_Matcher = Smarts('[cH0]1[cH1][cH1][cH0][cH1][cH1]1')
    Benzene6_Matcher = Smarts('[cH0]1[cH0][cH0][cH1][cH1][cH1]1')
    Benzene7_Matcher = Smarts('[cH0]1[cH0][cH1][cH0][cH1][cH1]1')
    Benzene8_Matcher = Smarts('[cH0]1[cH1][cH0][cH1][cH0][cH1]1')
    Benzene09_Matcher = Smarts('[cH0]1[cH0][cH0][cH0][cH1][cH1]1')
    Benzene10_Matcher = Smarts('[cH0]1[cH0][cH0][cH1][cH0][cH1]1')
    Benzene11_Matcher = Smarts('[cH0]1[cH0][cH1][cH0][cH0][cH1]1')
    Benzene12_Matcher = Smarts('[cH0]1[cH0][cH0][cH0][cH0][cH1]1')
    Benzene13_Matcher = Smarts('[cH0]1[cH0][cH0][cH0][cH0][cH0]1')
    BenzeneC5_Matcher = Smarts('c1c(CCC2)c2ccc1')
    BenzeneC6_Matcher = Smarts('c1c(CCCC2)c2ccc1')

    def __init__(self, *args):
        # All positional arguments are forwarded to the next initializer.
        super().__init__(*args)

    def _index_molecule(self, molecule: Molecule):
        """Return the list of 49 structure-key counts for `molecule`."""
        heavy_valences = np.array([atom.heavyvalence
                                   for atom in molecule.atoms])

        def non_overlapping(matches):
            # Keep a match only when it shares no atom index with the
            # matches already kept (first come, first served).
            kept = []
            used_atoms = set()
            for match in matches:
                if used_atoms.intersection(match):
                    continue
                kept.append(match)
                used_atoms.update(match)
            return kept

        def count(matcher):
            return len(matcher.findall(molecule))

        chain4 = non_overlapping(CHSKIndexer.Chain4_Matcher.findall(molecule))
        chain6 = non_overlapping(CHSKIndexer.Chain6_Matcher.findall(molecule))

        # 0: NC, number of heavy atoms
        keys = [molecule.OBMol.NumHvyAtoms()]
        # 1-4: C4, C3, C2, C1 -- atoms with heavy valence 4, 3, 2, 1
        keys.extend(len(heavy_valences[heavy_valences == degree])
                    for degree in (4, 3, 2, 1))
        # 5-6: non-overlapping chain4 / chain6 matches
        keys.append(len(chain4))
        keys.append(len(chain6))
        # 7-15: methyl patterns, then ring-branch patterns
        for matcher in (
            CHSKIndexer.C3M1_Matcher,   # 7
            CHSKIndexer.C3M2_Matcher,   # 8
            CHSKIndexer.C4M1_Matcher,   # 9
            CHSKIndexer.C4M2_Matcher,   # 10
            CHSKIndexer.C4M3_Matcher,   # 11
            CHSKIndexer.RB1_Matcher,    # 12
            CHSKIndexer.RB2_Matcher,    # 13
            CHSKIndexer.RM1_Matcher,    # 14
            CHSKIndexer.RM2_Matcher,    # 15
        ):
            keys.append(count(matcher))
        # 16: Ring, size of molecule.sssr
        keys.append(len(molecule.sssr))
        # 17-48: special rings, alkenes, alkynes, benzenes, ring and
        # conjugated alkenes
        for matcher in (
            CHSKIndexer.Fuse_Matcher,        # 17 Fuse
            CHSKIndexer.Bridge_Matcher,      # 18 Bridge
            CHSKIndexer.Alkene1_Matcher,     # 19 Alkene
            CHSKIndexer.Alkene2_Matcher,     # 20 Alkene
            CHSKIndexer.Alkene3_Matcher,     # 21 Alkene
            CHSKIndexer.Alkene4_Matcher,     # 22 Alkene
            CHSKIndexer.Alkene5_Matcher,     # 23 Alkene
            CHSKIndexer.Alkene6_Matcher,     # 24 Alkene
            CHSKIndexer.Alkyne1_Matcher,     # 25 Alkyne
            CHSKIndexer.Alkyne2_Matcher,     # 26 Alkyne
            CHSKIndexer.Alkyne3_Matcher,     # 27 Alkyne
            CHSKIndexer.Alkyne4_Matcher,     # 28 Alkyne
            CHSKIndexer.Benzene1_Matcher,    # 29 Benzene ring
            CHSKIndexer.Benzene2_Matcher,    # 30 Benzene ring
            CHSKIndexer.Benzene3_Matcher,    # 31 Benzene ring
            CHSKIndexer.Benzene4_Matcher,    # 32 Benzene ring
            CHSKIndexer.Benzene5_Matcher,    # 33 Benzene ring
            CHSKIndexer.Benzene6_Matcher,    # 34 Benzene ring
            CHSKIndexer.Benzene7_Matcher,    # 35 Benzene ring
            CHSKIndexer.Benzene8_Matcher,    # 36 Benzene ring
            CHSKIndexer.Benzene09_Matcher,   # 37 Benzene ring
            CHSKIndexer.Benzene10_Matcher,   # 38 Benzene ring
            CHSKIndexer.Benzene11_Matcher,   # 39 Benzene ring
            CHSKIndexer.Benzene12_Matcher,   # 40 Benzene ring
            CHSKIndexer.Benzene13_Matcher,   # 41 Benzene ring
            CHSKIndexer.BenzeneC5_Matcher,   # 42 Benzene ring
            CHSKIndexer.BenzeneC6_Matcher,   # 43 Benzene ring
            CHSKIndexer.rAlkene1_Matcher,    # 44 Alkene
            CHSKIndexer.rAlkene2_Matcher,    # 45 Alkene
            CHSKIndexer.cAlkene1_Matcher,    # 46 Alkene
            CHSKIndexer.cAlkene2_Matcher,    # 47 Alkene
            CHSKIndexer.cAlkene3_Matcher,    # 48 Alkene
        ):
            keys.append(count(matcher))
        return keys

    def index(self, molecule: Molecule):
        """Return the structure keys of `molecule` as a numpy array."""
        keys = self._index_molecule(molecule)
        return np.array(keys)