def FingerprintMol(mol):
    """ generates the EState fingerprints for the molecule

    Concept from the paper: Hall and Kier JCICS _35_ 1039-1045 (1995)

    Arguments:
      - mol: the molecule to fingerprint; it is passed to EStateIndices()
        and queried with GetSubstructMatches() for every EState pattern.

    two numeric arrays are returned:
      The first (of ints) contains the number of times each possible
        atom type is hit
      The second (of floats) contains the sum of the EState indices
        for atoms of each type.

    Both arrays have one entry per pattern in AtomTypes.esPatterns, in the
    same order.

    """
    # The pattern table is built lazily on first use.
    if AtomTypes.esPatterns is None:
        AtomTypes.BuildPatts()
    esIndices = EStateIndices(mol)

    nPatts = len(AtomTypes.esPatterns)
    # Use the builtin types as dtypes: the numpy.int / numpy.float aliases
    # (which were just these builtins) no longer exist in current NumPy.
    counts = numpy.zeros(nPatts, int)
    sums = numpy.zeros(nPatts, float)

    for i, (_, pattern) in enumerate(AtomTypes.esPatterns):
        matches = mol.GetSubstructMatches(pattern, uniquify=1)
        counts[i] = len(matches)
        for match in matches:
            # Only the first atom of each match contributes its EState index.
            sums[i] += esIndices[match[0]]
    return counts, sums
def _validate(self, vals, tol=1e-2, show=False):
    """ Check the atom typing of a series of SMILES against reference answers

    Arguments:
      - vals: iterable of (smiles, expected) pairs, where expected is the
        sequence of type names anticipated for the atoms of the molecule
      - tol: accepted for signature compatibility; not used here
      - show: when true, print the computed types for each molecule

    Each molecule must yield as many type entries as expected values, no
    entry may hold more than one type, and the first type of every entry
    must equal the corresponding expected value.

    """
    for smiles, expected in vals:
        atomTypes = AtomTypes.TypeAtoms(Chem.MolFromSmiles(smiles))
        if show:  # pragma: nocover
            print(atomTypes)

        self.assertEqual(len(expected), len(atomTypes),
                         'bad type len for smiles: %s' % (smiles))

        widest = max([len(entry) for entry in atomTypes])
        self.assertEqual(widest, 1,
                         'atom matched multiple types for smiles: %s' % (smiles))

        # Extract the leading type of every atom up front, before comparing.
        firstTypes = [entry[0] for entry in atomTypes]
        for want, got in zip(expected, firstTypes):
            self.assertEqual(want, got, 'bad type for SMILES: %s' % (smiles))
def _validate(self, vals, tol=1e-2, show=0):
    """ Check the atom typing of a series of SMILES against reference answers

    Arguments:
      - vals: iterable of (smiles, expected) pairs, where expected is the
        sequence of type names anticipated for the atoms of the molecule
      - tol: accepted for signature compatibility; not used here
      - show: when true, print the computed types for each molecule

    Raises AssertionError when the number of typed atoms differs from the
    number of expected values, when any atom matched more than one type, or
    when a type differs from its expected value.

    """
    for smi, ans in vals:
        mol = Chem.MolFromSmiles(smi)
        types = AtomTypes.TypeAtoms(mol)
        if show:
            # Function-call form: valid on Python 3 and, for a single
            # argument, prints the same thing on Python 2.
            print(types)
        assert len(ans) == len(types), 'bad type len for smiles: %s' % (smi)
        lens = [len(x) for x in types]
        assert max(lens) == 1, 'atom matched multiple types for smiles: %s' % (smi)
        types = [x[0] for x in types]
        for a, b in zip(ans, types):
            assert a == b, 'bad type for SMILES: %s' % (smi)
def finger_print(chunk):
    """
    Build a DataFrame of E-state fingerprints for the molecules in chunk.

    Input:
    chunk; pandas DataFrame with a "name" column (structure name) and a
           "smiles" column (SMILES string), iterated row by row

    Output:
    pandas DataFrame with the columns 'name' and 'smiles' followed by one
    column per E-state atom type (in AtomTypes.esPatterns order) holding the
    summed E-state indices returned by Fingerprinter.FingerprintMol.

    Rows whose SMILES cannot be turned into a molecule, or whose
    fingerprinting raises AttributeError, are reported on stdout (row index
    and SMILES) and left out of the result.
    """
    # The pattern table is built lazily; do it once, not once per row.
    if AtomTypes.esPatterns is None:
        AtomTypes.BuildPatts()
    pattern_names = [patt_name for patt_name, _ in AtomTypes.esPatterns]
    columns = ['name', 'smiles'] + pattern_names

    records = []
    for row_index, row in chunk.iterrows():
        name = row["name"]
        smiles = row["smiles"]
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit returns None for a SMILES it cannot parse.
            print(row_index, smiles)
            continue
        try:
            _, sums = Fingerprinter.FingerprintMol(mol)
        except AttributeError:
            print(row_index, smiles)
            continue
        record = {'name': name, 'smiles': smiles}
        record.update(zip(pattern_names, sums))
        records.append(record)

    # Assemble the frame once; DataFrame.append is gone from pandas 2.x and
    # copied the whole frame on every call.
    return pd.DataFrame(records, columns=columns)
def finger_print(mol, name, e_opt):
    """
    Create a dictionary with the e-state fingerprint for the molecule in mol

    Input:
    mol; rdkit mol object
    name; structure name
    e_opt; energy gap (target)

    Output:
    dict with the keys 'name' and 'E_opt' followed by one key per E-state
    atom type (in AtomTypes.esPatterns order) whose value is the summed
    E-state index returned by Fingerprinter.FingerprintMol.
    """
    # Make sure the pattern table exists before it is read below.
    if AtomTypes.esPatterns is None:
        AtomTypes.BuildPatts()

    # Only the per-type sums are used; the counts are discarded.
    _, sums = Fingerprinter.FingerprintMol(mol)

    data = {'name': name, 'E_opt': e_opt}
    # A distinct loop variable keeps the `name` argument from being shadowed.
    pattern_names = [patt_name for patt_name, _ in AtomTypes.esPatterns]
    data.update(zip(pattern_names, sums))
    return data
def _exampleCode():
    """ Demonstration of the E-state fingerprint calculation

    For a handful of hard-coded SMILES this prints, per molecule: the input
    and canonical SMILES, the E-state type(s) of every atom, the count and
    summed E-state index of each atom type that occurs, and finally the
    E-state index of every atom.

    """
    from rdkit import Chem

    examples = ['CC', 'CCC', 'c1[nH]cnc1CC(N)C(O)=O', 'NCCc1ccc(O)c(O)c1']
    for smiles in examples:
        mol = Chem.MolFromSmiles(smiles)
        print(smiles, Chem.MolToSmiles(mol))

        # Per-atom type assignment, reported with 1-based atom numbers.
        atomTypes = AtomTypes.TypeAtoms(mol)
        for pos, atom in enumerate(mol.GetAtoms()):
            print('%d %4s: %s' % (pos + 1, atom.GetSymbol(), str(atomTypes[pos])))

        indices = EStateIndices(mol)
        counts, sums = FingerprintMol(mol)

        # Only atom types that actually occur in the molecule are listed.
        for (typeName, _), nHits, total in zip(AtomTypes.esPatterns, counts, sums):
            if nHits:
                print('%6s, % 2d, % 5.4f' % (typeName, nHits, total))

        for pos, value in enumerate(indices):
            print('% 2d, % 5.4f' % (pos + 1, value))
        print('--------')
nPatts = len(AtomTypes.esPatterns) counts = numpy.zeros(nPatts,numpy.int) sums = numpy.zeros(nPatts,numpy.float) for i,(name,pattern) in enumerate(AtomTypes.esPatterns): matches = mol.GetSubstructMatches(pattern,uniquify=1) counts[i] = len(matches) for match in matches: sums[i] += esIndices[match[0]] return counts,sums if __name__ == '__main__': from rdkit import Chem smis = ['CC','CCC','c1[nH]cnc1CC(N)C(O)=O','NCCc1ccc(O)c(O)c1'] for smi in smis: m = Chem.MolFromSmiles(smi) print smi,Chem.MolToSmiles(m) types = AtomTypes.TypeAtoms(m) for i in range(m.GetNumAtoms()): print '%d %4s: %s'%(i+1,m.GetAtomWithIdx(i).GetSymbol(),str(types[i])) es = EStateIndices(m) counts,sums = FingerprintMol(m) for i in range(len(AtomTypes.esPatterns)): if counts[i]: name,patt = AtomTypes.esPatterns[i] print '%6s, % 2d, % 5.4f'%(name,counts[i],sums[i]) for i in range(len(es)): print '% 2d, % 5.4f'%(i+1,es[i]) print '--------'
runs the shell command cmd """ if shell: p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE) else: cmd = cmd.split() p = subprocess.Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE) output, err = p.communicate() return output.decode('utf-8') if AtomTypes.esPatterns is None: AtomTypes.BuildPatts() name_list = [name for name, _ in AtomTypes.esPatterns] df2 = pd.DataFrame(columns=['name', 'E_opt'] + name_list) #create the name, target, features df = pd.read_pickle("./egap_subpc.pkl") for row_index, row in df.iterrows(): #if row_index < 10: #if row_index < 1000: Atom = (row["Atom"]) name = (row["name"]) E_opt = (row["E_opt"])