def main():
    """Print Morgan (radius 1) environment identifiers for PubChem molecules.

    Walks the SDF root directory recursively, opens every file found as a
    gzip-compressed SDF and, for each molecule RDKit could parse, prints one
    line with the space-separated keys of the ``bitInfo`` dictionary filled in
    by ``rdMolDescriptors.GetMorganFingerprint``.

    Molecules whose fingerprint cannot be computed are skipped; the failure is
    reported on stderr so that stdout keeps carrying identifiers only.
    """
    import sys  # local import: only needed for the warning stream below

    sdf_root_path = "/media/data/pubchem/SDF"
    for path, _dirs, filenames in os.walk(sdf_root_path):
        for filename in filenames:
            # This SDF file fails to parse with RDKit on Ubuntu 16.04
            if "Compound_102125001_102150000" in filename:
                continue
            # Join with the directory currently being walked: joining with the
            # root would produce a wrong path for files in subdirectories.
            filepath = os.path.join(path, filename)
            with gzip.open(filepath, 'rb') as myfile:
                for mol in Chem.ForwardSDMolSupplier(myfile):
                    if not mol:
                        # The supplier yields a falsy entry for unparsable records.
                        continue
                    info = {}
                    try:
                        rdMolDescriptors.GetMorganFingerprint(
                            mol, 1, bitInfo=info)
                    except Exception as err:
                        # Best effort: one bad molecule must not stop the scan.
                        print("Skipping molecule in %s: %s" % (filepath, err),
                              file=sys.stderr)
                        continue
                    for k in info:
                        print(k, end=' ')
                    print()
def GenerateMorganFeaturesFingerprints(Mols):
    """Generate feature-based Morgan fingerprints for every molecule in Mols.

    The radius and chirality flag are read from
    ``OptionsInfo["FingerprintsParams"]["MorganFeatures"]``. Depending on
    ``OptionsInfo["GenerateBitVectFingerints"]`` the result is a list of
    2048-bit vectors or a list of sparse count fingerprints, in the order of
    ``Mols``.
    """
    MiscUtil.PrintInfo("\nGenerating MorganFeatures fingerprints...")

    # Fingerprint parameters come from the global options.
    Radius = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["Radius"]
    UseChirality = OptionsInfo["FingerprintsParams"]["MorganFeatures"][
        "UseChirality"]
    UseFeatures = True

    if not OptionsInfo["GenerateBitVectFingerints"]:
        # Sparse count fingerprints (no fixed size).
        return [
            rdMolDescriptors.GetMorganFingerprint(
                Mol, Radius, useFeatures=UseFeatures,
                useChirality=UseChirality)
            for Mol in Mols
        ]

    # Explicit bit vectors of a fixed size.
    FPSize = 2048
    return [
        rdMolDescriptors.GetMorganFingerprintAsBitVect(
            Mol, Radius, useFeatures=UseFeatures, useChirality=UseChirality,
            nBits=FPSize)
        for Mol in Mols
    ]
def _compute_sas(mol: Mol, sa_model: Dict[int, float]) -> float:
    """Compute the synthetic accessibility score of ``mol``.

    The score combines a fragment term (count-weighted average of the
    ``sa_model`` contributions of the radius-2 Morgan environments, with -4
    for unknown environments), structural penalties (size, stereo centres,
    spiro atoms, bridgeheads, macrocycles) and a fingerprint-density
    correction, then rescales the sum and clamps it to the range 1..10.

    Raises ``ZeroDivisionError`` when the fingerprint has no environments.
    """
    counts = rdMolDescriptors.GetMorganFingerprint(mol, 2).GetNonzeroElements()

    # Fragment score: count-weighted mean contribution.
    weighted_sum = 0.
    total_count = 0
    for fragment_id, count in counts.items():
        total_count += count
        weighted_sum += sa_model.get(fragment_id, -4) * count
    fragment_score = weighted_sum / total_count

    # Structural features.
    n_atoms = mol.GetNumAtoms()
    n_chiral = len(FindMolChiralCenters(mol, includeUnassigned=True))
    ring_info = mol.GetRingInfo()
    n_spiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
    n_bridgeheads = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
    has_macrocycle = any(len(ring) > 8 for ring in ring_info.AtomRings())

    size_penalty = n_atoms**1.005 - n_atoms
    stereo_penalty = math.log10(n_chiral + 1)
    spiro_penalty = math.log10(n_spiro + 1)
    bridge_penalty = math.log10(n_bridgeheads + 1)
    # This differs from the paper, which defines the penalty as
    # log10(nMacrocycles + 1); a flat log10(2) generates better results when
    # 2 or more macrocycles are present.
    macrocycle_penalty = math.log10(2) if has_macrocycle else 0.

    feature_score = (0. - size_penalty - stereo_penalty - spiro_penalty -
                     bridge_penalty - macrocycle_penalty)

    # Correction for the fingerprint density: not in the original
    # publication, added in version 1.1 to make highly symmetrical molecules
    # easier to synthesise.
    density_score = 0.
    if n_atoms > len(counts):
        density_score = math.log(float(n_atoms) / len(counts)) * .5

    raw = fragment_score + feature_score + density_score

    # Transform the "raw" value into a scale between 1 and 10.
    lower = -4.0
    upper = 2.5
    scaled = 11. - (raw - lower + 1) / (upper - lower) * 9.

    # Smooth the 10-end.
    if scaled > 8.:
        scaled = 8. + math.log(scaled + 1. - 9.)

    if scaled > 10.:
        return 10.0
    if scaled < 1.:
        return 1.0
    return scaled
def scoreMolWConfidence(mol, fscore):
    """Return the NP-likeness score of ``mol`` together with a confidence.

    The confidence is a value in 0..1 that describes how many of the
    molecule's radius-2 Morgan fragments were found in ``fscore``
    (1: all fragments were found).

    Returns a namedtuple ``NPLikeness(nplikeness, confidence)``.
    Raises ``ValueError`` when ``mol`` is None.
    """
    if mol is None:
        raise ValueError('invalid molecule')

    bits = rdMolDescriptors.GetMorganFingerprint(mol, 2).GetNonzeroElements()

    # Only fragments known to the model contribute to the score.
    known_bits = [bit for bit in bits if bit in fscore]
    score = 0.0
    for bit in known_bits:
        score += fscore[bit]
    score /= float(mol.GetNumAtoms())
    confidence = float(len(known_bits) / len(bits))

    # Dampen extreme values so exotic molecules do not make the score explode.
    if score > 4:
        score = 4. + math.log10(score - 4. + 1.)
    elif score < -4:
        score = -4. - math.log10(-4. - score + 1.)

    NPLikeness = namedtuple("NPLikeness", "nplikeness,confidence")
    return NPLikeness(score, confidence)
def _featurize(self, mol):
    """
    Calculate circular fingerprint.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.

    Returns
    -------
    A bit vector when ``self.sparse`` is false. Otherwise a dict keyed by
    fragment id whose values are counts, or ``{'smiles', 'count'}`` dicts
    when ``self.smiles`` is set.
    """
    if not self.sparse:
        return rdMolDescriptors.GetMorganFingerprintAsBitVect(
            mol,
            self.radius,
            nBits=self.size,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features)

    environments = {}
    sparse_fp = rdMolDescriptors.GetMorganFingerprint(
        mol,
        self.radius,
        useChirality=self.chiral,
        useBondTypes=self.bonds,
        useFeatures=self.features,
        bitInfo=environments)
    counts = sparse_fp.GetNonzeroElements()  # plain dict: fragment id -> count

    if not self.smiles:
        return counts

    # Attach a SMILES string for each fragment, taken from the first
    # environment recorded for that fragment id.
    annotated = {}
    for fragment_id, count in counts.items():
        center, reach = environments[fragment_id][0]
        bond_path = Chem.FindAtomEnvironmentOfRadiusN(mol, reach, center)
        fragment = Chem.PathToSubmol(mol, bond_path)
        annotated[fragment_id] = {
            'smiles': Chem.MolToSmiles(fragment),
            'count': count
        }
    return annotated
def GenerateMorganFeaturesFingerprints(Mols):
    """Generate feature-based Morgan fingerprints for every molecule in Mols.

    Radius, chirality flag and bit-vector size are read from
    ``OptionsInfo["FingerprintsParams"]["MorganFeatures"]``. When
    ``OptionsInfo["SpecifiedFingerprintsType"]`` is "BitVect" (matched
    case-insensitively) a list of bit vectors is returned; otherwise a list
    of sparse count fingerprints.
    """
    FingerprintsType = OptionsInfo["SpecifiedFingerprintsType"]
    MiscUtil.PrintInfo("\nGenerating MorganFeatures %s fingerprints..." %
                       FingerprintsType)

    # Fingerprint parameters come from the global options.
    Radius = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["Radius"]
    UseChirality = OptionsInfo["FingerprintsParams"]["MorganFeatures"][
        "UseChirality"]
    FPSize = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["FPSize"]
    UseFeatures = True

    WantBitVect = re.match("^BitVect$",
                           OptionsInfo["SpecifiedFingerprintsType"], re.I)
    if not WantBitVect:
        # Sparse count fingerprints (FPSize is not used here).
        return [
            rdMolDescriptors.GetMorganFingerprint(
                Mol, Radius, useFeatures=UseFeatures,
                useChirality=UseChirality)
            for Mol in Mols
        ]

    # Explicit bit vectors of the configured size.
    MiscUtil.PrintInfo("FPSize: %s" % (FPSize))
    return [
        rdMolDescriptors.GetMorganFingerprintAsBitVect(
            Mol, Radius, useFeatures=UseFeatures, useChirality=UseChirality,
            nBits=FPSize)
        for Mol in Mols
    ]
def testGithub1761(self):
    """Negative or zero size arguments must be rejected with an exception."""
    mol = Chem.MolFromSmiles('CC(F)(Cl)C(F)(Cl)C')

    # A negative radius is rejected.
    with self.assertRaises(OverflowError):
        rdMD.GetMorganFingerprint(mol, -1)

    # A negative third argument to the hashed variant is rejected as well.
    with self.assertRaises(OverflowError):
        rdMD.GetHashedMorganFingerprint(mol, 0, -1)

    # Zero for that argument is a value error rather than an overflow.
    with self.assertRaises(ValueError):
        rdMD.GetHashedMorganFingerprint(mol, 0, 0)
def calculateScore(m):
    """Return the synthetic accessibility score (1..10) of molecule ``m``.

    Fragment contributions are looked up in the module-level ``_fscores``
    table, which is loaded through ``readFragmentScores()`` on first use.
    Raises ``ZeroDivisionError`` when the fingerprint has no environments.
    """
    if _fscores is None:
        readFragmentScores()

    # Fragment score; 2 is the *radius* of the circular fingerprint.
    morgan = rdMolDescriptors.GetMorganFingerprint(m, 2)
    environments = morgan.GetNonzeroElements()
    contribution_sum = 0.
    occurrence_total = 0
    for env_id, occurrences in iteritems(environments):
        occurrence_total += occurrences
        contribution_sum += _fscores.get(env_id, -4) * occurrences
    fragment_term = contribution_sum / occurrence_total

    # Features score.
    atom_count = m.GetNumAtoms()
    chiral_count = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    rings = m.GetRingInfo()
    bridgehead_count, spiro_count = numBridgeheadsAndSpiro(m, rings)
    macrocycle_count = sum(1 for ring in rings.AtomRings() if len(ring) > 8)

    size_term = atom_count**1.005 - atom_count
    stereo_term = math.log10(chiral_count + 1)
    spiro_term = math.log10(spiro_count + 1)
    bridge_term = math.log10(bridgehead_count + 1)
    # ---------------------------------------
    # This differs from the paper, which defines:
    #  macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    macrocycle_term = math.log10(2) if macrocycle_count > 0 else 0.

    feature_term = (0. - size_term - stereo_term - spiro_term - bridge_term -
                    macrocycle_term)

    # Correction for the fingerprint density:
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthesise.
    density_term = 0.
    if atom_count > len(environments):
        density_term = math.log(float(atom_count) / len(environments)) * .5

    combined = fragment_term + feature_term + density_term

    # Transform the "raw" value into a scale between 1 and 10.
    raw_low = -4.0
    raw_high = 2.5
    result = 11. - (combined - raw_low + 1) / (raw_high - raw_low) * 9.

    # Smooth the 10-end.
    if result > 8.:
        result = 8. + math.log(result + 1. - 9.)

    if result > 10.:
        result = 10.0
    elif result < 1.:
        result = 1.0
    return result
def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
    """Calculate circular fingerprint.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit Mol object
    **kwargs
        Only the deprecated ``mol`` keyword is recognised; when given it
        replaces ``datapoint`` and a ``DeprecationWarning`` is issued.

    Returns
    -------
    np.ndarray
        A float numpy array built from the fingerprint bit vector when
        ``self.sparse`` is false. When ``self.sparse`` is true a dict keyed by
        fragment id is returned instead: counts, or ``{'smiles', 'count'}``
        dicts when ``self.smiles`` is set.

    Raises
    ------
    ImportError
        If RDKit is not installed.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import rdMolDescriptors
    except ModuleNotFoundError:
        raise ImportError("This class requires RDKit to be installed.")

    if 'mol' in kwargs:
        import warnings
        datapoint = kwargs.get("mol")
        # Warn instead of raising: raising made the deprecated keyword
        # unusable and left the assignment above without effect.
        warnings.warn(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    if self.sparse:
        info: Dict = {}
        fp = rdMolDescriptors.GetMorganFingerprint(
            datapoint,
            self.radius,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features,
            bitInfo=info)
        fp = fp.GetNonzeroElements()  # convert to a dict

        # generate SMILES for fragments
        if self.smiles:
            fp_smiles = {}
            for fragment_id, count in fp.items():
                # Use the first environment recorded for this fragment id.
                root, radius = info[fragment_id][0]
                env = Chem.FindAtomEnvironmentOfRadiusN(datapoint, radius,
                                                        root)
                frag = Chem.PathToSubmol(datapoint, env)
                smiles = Chem.MolToSmiles(frag)
                fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
            fp = fp_smiles
    else:
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            datapoint,
            self.radius,
            nBits=self.size,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features)
        fp = np.asarray(fp, dtype=float)
    return fp
def compute(self, mol):
    """Score ``mol`` from the per-fragment contributions in ``self.NA_model``.

    Contributions of the distinct radius-2 Morgan fragments are summed
    (unknown fragments count as 0), divided by the number of atoms, and
    values beyond +/-4 are compressed logarithmically.
    """
    fingerprint = rdMolDescriptors.GetMorganFingerprint(mol, 2)
    fragment_ids = fingerprint.GetNonzeroElements()

    contribution = self.NA_model.get
    per_atom = sum(contribution(fragment_id, 0) for fragment_id in fragment_ids)
    per_atom /= float(mol.GetNumAtoms())

    # Prevent score explosion for exotic molecules.
    if per_atom > 4:
        return 4. + math.log10(per_atom - 4. + 1.)
    if per_atom < -4:
        return -4. - math.log10(-4. - per_atom + 1.)
    return per_atom
def GetMorganFingerprint(mol, atomId=-1, radius=2, fpType='bv', nBits=2048, useFeatures=False):
    """
    Calculates the Morgan fingerprint with the counts of atomId removed.

    Parameters:
      mol -- the molecule of interest
      radius -- the maximum radius
      fpType -- the type of Morgan fingerprint: 'count' or 'bv'
      atomId -- the atom to remove the counts for (if -1, no count is removed)
      nBits -- the size of the bit vector (only for fpType = 'bv')
      useFeatures -- if false: ConnectivityMorgan, if true: FeatureMorgan

    Returns the full fingerprint when atomId is negative, otherwise a
    fingerprint without the bits that atomId takes part in.

    Raises ValueError for an unknown fpType, for an atomId beyond the last
    atom, or when the cached ``mol._fpInfo`` is not a 2-tuple.

    NOTE: the fingerprint and the per-atom bit map are cached on
    ``mol._fpInfo`` the first time this is called for a molecule. The cache is
    reused regardless of radius, fpType, nBits and useFeatures, so delete
    ``mol._fpInfo`` before calling again with different settings.
    """
    if fpType not in ['bv', 'count']:
        raise ValueError("Unknown Morgan fingerprint type")

    if not hasattr(mol, '_fpInfo'):
        info = {}
        numAtoms = mol.GetNumAtoms()
        # get the fingerprint and an empty per-atom bit container
        if fpType == 'bv':
            molFp = rdMD.GetMorganFingerprintAsBitVect(
                mol, radius, nBits=nBits, useFeatures=useFeatures,
                bitInfo=info)
            bitmap = [DataStructs.ExplicitBitVect(nBits)
                      for _ in range(numAtoms)]
        else:
            molFp = rdMD.GetMorganFingerprint(
                mol, radius, useFeatures=useFeatures, bitInfo=info)
            bitmap = [[] for _ in range(numAtoms)]

        # construct the bit map: which bits does every atom contribute to?
        # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
        for bit, es in info.items():
            for at1, rad in es:
                if rad == 0:  # for radius 0 only the centre atom is involved
                    involved = [at1]
                else:  # for radii > 0 every atom of the environment is involved
                    env = Chem.FindAtomEnvironmentOfRadiusN(mol, rad, at1)
                    amap = {}
                    # Called only to fill amap with the atoms of the environment.
                    Chem.PathToSubmol(mol, env, atomMap=amap)
                    involved = amap.keys()
                for at2 in involved:
                    if fpType == 'bv':
                        bitmap[at2][bit] = 1
                    else:
                        bitmap[at2].append(bit)
        mol._fpInfo = (molFp, bitmap)

    if atomId < 0:
        return mol._fpInfo[0]

    # remove the bits of atomId
    if atomId >= mol.GetNumAtoms():
        raise ValueError("atom index greater than number of atoms")
    if len(mol._fpInfo) != 2:
        raise ValueError("_fpInfo not set")
    if fpType == 'bv':
        molFp = mol._fpInfo[0] ^ mol._fpInfo[1][atomId]  # xor
    else:  # count
        # Work on a copy so the cached fingerprint stays intact.
        molFp = copy.deepcopy(mol._fpInfo[0])
        # delete the bits with atomId
        for bit in mol._fpInfo[1][atomId]:
            molFp[bit] -= 1
    return molFp
def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Calculate circular fingerprint.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        A float numpy array built from the fingerprint bit vector when
        ``self.sparse`` is false. When ``self.sparse`` is true a dict keyed by
        fragment id is returned instead: counts, or ``{'smiles', 'count'}``
        dicts when ``self.smiles`` is set.
    """
    from rdkit import Chem
    from rdkit.Chem import rdMolDescriptors

    if self.sparse:
        info: Dict = {}
        fp = rdMolDescriptors.GetMorganFingerprint(
            mol,
            self.radius,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features,
            bitInfo=info)
        fp = fp.GetNonzeroElements()  # convert to a dict

        # generate SMILES for fragments
        if self.smiles:
            fp_smiles = {}
            for fragment_id, count in fp.items():
                # Use the first environment recorded for this fragment id.
                root, radius = info[fragment_id][0]
                env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, root)
                frag = Chem.PathToSubmol(mol, env)
                smiles = Chem.MolToSmiles(frag)
                fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
            fp = fp_smiles
    else:
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            mol,
            self.radius,
            nBits=self.size,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features)
        # The ``np.float`` alias was removed from NumPy; the builtin ``float``
        # selects the same float64 dtype.
        fp = np.asarray(fp, dtype=float)
    return fp
def _getAtomInvariantsWithRadius(mol, radius):
    """ Helper function to calculate the atom invariants for each atom
    with a given radius

    Arguments:
      - mol:    the molecule of interest
      - radius: the radius for the Morgan fingerprint

    Return: list of atom invariants
    """
    invariants = []
    for atom_idx in range(mol.GetNumAtoms()):
        environments = {}
        # Called only for its side effect of filling ``environments``.
        rdMolDescriptors.GetMorganFingerprint(
            mol, radius, fromAtoms=[atom_idx], bitInfo=environments)
        # Keep the ids whose first recorded environment has the full radius.
        invariants.extend(
            env_id for env_id, hits in environments.items()
            if hits[0][1] == radius)
    return invariants
def scoreMol(mol, fscore):
    """Return the per-atom fragment score of ``mol``.

    Contributions of the distinct radius-2 Morgan fragments are looked up in
    ``fscore`` (0 when unknown), summed and divided by the atom count; values
    beyond +/-4 are compressed logarithmically.

    Raises ``ValueError`` when ``mol`` is None.
    """
    if mol is None:
        raise ValueError("invalid molecule")

    fragments = rdMolDescriptors.GetMorganFingerprint(
        mol, 2).GetNonzeroElements()

    total = 0.0
    for fragment in fragments:
        total += fscore.get(fragment, 0)
    per_atom = total / float(mol.GetNumAtoms())

    # Prevent score explosion for exotic molecules.
    if per_atom > 4:
        return 4.0 + math.log10(per_atom - 4.0 + 1.0)
    if per_atom < -4:
        return -4.0 - math.log10(-4.0 - per_atom + 1.0)
    return per_atom
def NP_score(mol, fscore=None):
    """Return the natural-product-likeness score of ``mol``.

    ``fscore`` maps Morgan fragment ids to contributions; when omitted it is
    obtained from ``readNPModel()``. Contributions of the distinct radius-2
    fragments are summed (0 when unknown), divided by the atom count, and
    values beyond +/-4 are compressed logarithmically.

    Raises ``ValueError`` when ``mol`` is None.
    """
    # The model is resolved before the molecule is validated.
    model = readNPModel() if fscore is None else fscore
    if mol is None:
        raise ValueError('invalid molecule')

    fingerprint = rdMolDescriptors.GetMorganFingerprint(mol, 2)
    accumulated = 0.
    for fragment_id in fingerprint.GetNonzeroElements():
        accumulated += model.get(fragment_id, 0)
    accumulated /= float(mol.GetNumAtoms())

    # Prevent score explosion for exotic molecules.
    too_high = accumulated > 4
    if too_high:
        accumulated = 4. + math.log10(accumulated - 4. + 1.)
    if accumulated < -4:
        accumulated = -4. - math.log10(-4. - accumulated + 1.)
    return accumulated
def main():
    """Print the summed word vectors of glycine's radius-0 Morgan identifiers.

    Loads the word2vec model from ``vec.txt``, computes the radius-0 Morgan
    environments of glycine and adds up the model vectors looked up by the
    string form of each environment identifier. If anything in that
    computation fails, the error is printed and no embedding is printed.
    """
    model = models.KeyedVectors.load_word2vec_format("vec.txt")
    embeddings = list()

    # Using canonical smiles for glycine, as in original research paper
    mol = Chem.MolFromSmiles("C(C(=O)O)N")
    try:
        info = {}
        rdMolDescriptors.GetMorganFingerprint(mol, 0, bitInfo=info)
        totalvec = np.zeros(200)
        for k in list(info.keys()):
            wordvec = model.wv[str(k)]
            totalvec = np.add(totalvec, wordvec)
        embeddings.append(totalvec)
    except Exception as e:
        print(e)

    # Only print when an embedding was produced; indexing the empty list
    # after a reported failure would raise IndexError.
    if embeddings:
        print(embeddings[0])
def testMorganFingerprints(self):
    """Check element counts and invariant handling of Morgan fingerprints."""

    def n_elements(fingerprint):
        # Number of distinct non-zero entries of a sparse fingerprint.
        return len(fingerprint.GetNonzeroElements())

    def first_value(fingerprint):
        return list(fingerprint.GetNonzeroElements().values())[0]

    mol = Chem.MolFromSmiles('CC(F)(Cl)C(F)(Cl)C')
    self.assertEqual(n_elements(rdMD.GetMorganFingerprint(mol, 0)), 4)

    # Counted versus uncounted fingerprints of ethane.
    mol = Chem.MolFromSmiles('CC')
    fp = rdMD.GetMorganFingerprint(mol, 0)
    self.assertEqual(n_elements(fp), 1)
    self.assertEqual(first_value(fp), 2)
    fp = rdMD.GetMorganFingerprint(mol, 0, useCounts=False)
    self.assertEqual(n_elements(fp), 1)
    self.assertEqual(first_value(fp), 1)

    # Hashed and unhashed variants at growing radius.
    mol = Chem.MolFromSmiles('CC(F)(Cl)C(F)(Cl)C')
    self.assertEqual(n_elements(rdMD.GetHashedMorganFingerprint(mol, 0)), 4)
    self.assertEqual(n_elements(rdMD.GetMorganFingerprint(mol, 1)), 8)
    self.assertEqual(n_elements(rdMD.GetHashedMorganFingerprint(mol, 1)), 8)
    self.assertEqual(n_elements(rdMD.GetMorganFingerprint(mol, 2)), 9)

    # Chirality only matters when it is requested.
    mol = Chem.MolFromSmiles('CC(F)(Cl)[C@](F)(Cl)C')
    for radius, expected in ((0, 4), (1, 8), (2, 9)):
        fp = rdMD.GetMorganFingerprint(mol, radius)
        self.assertEqual(n_elements(fp), expected)
    for radius, expected in ((0, 4), (1, 9), (2, 10)):
        fp = rdMD.GetMorganFingerprint(mol, radius, useChirality=True)
        self.assertEqual(n_elements(fp), expected)

    # Restricting the fingerprint to selected atoms.
    mol = Chem.MolFromSmiles('CCCCC')
    fp = rdMD.GetMorganFingerprint(mol, 0, fromAtoms=(0, ))
    self.assertEqual(n_elements(fp), 1)

    # Connectivity invariants: passing them explicitly matches the default.
    mol = Chem.MolFromSmiles('CC1CC1')
    vs1 = rdMD.GetConnectivityInvariants(mol)
    self.assertEqual(len(vs1), mol.GetNumAtoms())
    fp1 = rdMD.GetMorganFingerprint(mol, 2, invariants=vs1)
    fp2 = rdMD.GetMorganFingerprint(mol, 2)
    self.assertEqual(fp1, fp2)

    vs2 = rdMD.GetConnectivityInvariants(mol, False)
    self.assertEqual(len(vs2), mol.GetNumAtoms())
    self.assertNotEqual(vs1, vs2)
    fp1 = rdMD.GetMorganFingerprint(mol, 2, invariants=vs2)
    self.assertNotEqual(fp1, fp2)

    # Feature invariants of toluene.
    mol = Chem.MolFromSmiles('Cc1ccccc1')
    vs1 = rdMD.GetFeatureInvariants(mol)
    self.assertEqual(len(vs1), mol.GetNumAtoms())
    self.assertEqual(vs1[0], 0)
    self.assertNotEqual(vs1[1], 0)
    for idx in (2, 3, 4):
        self.assertEqual(vs1[1], vs1[idx])

    # Feature invariants of 1-chloro-2-fluoroethane and their use as
    # explicit invariants.
    mol = Chem.MolFromSmiles('FCCCl')
    vs1 = rdMD.GetFeatureInvariants(mol)
    self.assertEqual(len(vs1), mol.GetNumAtoms())
    self.assertEqual(vs1[1], 0)
    self.assertEqual(vs1[2], 0)
    self.assertNotEqual(vs1[0], 0)
    self.assertEqual(vs1[0], vs1[3])

    fp1 = rdMD.GetMorganFingerprint(mol, 0, invariants=vs1)
    fp2 = rdMD.GetMorganFingerprint(mol, 0, useFeatures=True)
    self.assertEqual(fp1, fp2)
def calculate_similarity_vector(smile_pair):
    """
    Calculate fingerprints between two smile terms using different
    fingerprinters, and use different similarity metrics to calculate the
    difference between those fingerprints.

    ``smile_pair`` is a pair of SMILES strings. The result is one flat list
    holding, per fingerprint type and in a fixed order, the values returned
    by ``get_similarity_all`` or ``get_similarity_subset``.
    """
    smile1, smile2 = smile_pair
    mol1 = Chem.MolFromSmiles(smile1)
    mol2 = Chem.MolFromSmiles(smile2)

    molecule_similarity = list()

    def compare(fingerprinter, scorer):
        # Fingerprint both molecules the same way and append the scores.
        first = fingerprinter(mol1)
        second = fingerprinter(mol2)
        molecule_similarity.extend(scorer(first, second))

    # RDK topological fingerprint for a molecule
    compare(Chem.RDKFingerprint, get_similarity_all)

    # Topological fingerprints
    # http://www.rdkit.org/docs/GettingStartedInPython.html#topological-fingerprints
    from rdkit.Chem.Fingerprints import FingerprintMols
    compare(FingerprintMols.FingerprintMol, get_similarity_all)

    # MACCS keys: SMARTS-based implementation of the 166 public MACCS keys.
    # http://www.rdkit.org/docs/GettingStartedInPython.html#maccs-keys
    from rdkit.Chem import MACCSkeys
    compare(MACCSkeys.GenMACCSKeys, get_similarity_all)

    # Atom pairs and topological torsions
    # http://www.rdkit.org/docs/GettingStartedInPython.html#atom-pairs-and-topological-torsions
    from rdkit.Chem.AtomPairs import Pairs
    compare(Pairs.GetAtomPairFingerprintAsBitVect, get_similarity_all)

    from rdkit.Chem.AtomPairs import Torsions
    compare(Torsions.GetTopologicalTorsionFingerprint, get_similarity_subset)

    # Morgan (circular) fingerprints, radius 2, without and with features
    # http://www.rdkit.org/docs/GettingStartedInPython.html#morgan-fingerprints-circular-fingerprints
    from rdkit.Chem import rdMolDescriptors
    compare(lambda mol: rdMolDescriptors.GetMorganFingerprint(mol, 2),
            get_similarity_subset)
    compare(
        lambda mol: rdMolDescriptors.GetMorganFingerprint(
            mol, 2, useFeatures=True),
        get_similarity_subset)

    return molecule_similarity
def calculateMol(self, m, smiles, internalParsing=False):
    """Return the chirality-aware Morgan count fingerprint of ``m`` as a list.

    The fingerprint uses ``self.radius`` and is folded to ``self.nbits``
    entries. ``smiles`` and ``internalParsing`` are accepted for interface
    compatibility and are not used here.
    """
    # GetMorganFingerprint has no nBits argument; the hashed variant is the
    # fixed-length count fingerprint that accepts it.
    return list(
        rd.GetHashedMorganFingerprint(m,
                                      radius=self.radius,
                                      nBits=self.nbits,
                                      useChirality=True))
# Indexed contents of the bundled fragment score table, filled on first use.
_SA_FRAGMENT_SCORE_CACHE = None


def _index_fragment_scores(table):
    """Turn rows of ``[score, id, id, ...]`` into an ``{id: score}`` dict."""
    return {
        fragment_id: float(row[0])
        for row in table
        for fragment_id in row[1:]
    }


def _default_fragment_scores():
    """Load, index and memoise the ``fpscores.pkl.gz`` table next to this file."""
    global _SA_FRAGMENT_SCORE_CACHE
    if _SA_FRAGMENT_SCORE_CACHE is None:
        table_path = os.path.join(os.path.dirname(__file__), 'fpscores.pkl.gz')
        # NOTE: pickle executes code on load. This is acceptable only because
        # the file ships alongside this module; never load untrusted data here.
        with gzip.open(table_path, 'rb') as f:
            _SA_FRAGMENT_SCORE_CACHE = _index_fragment_scores(pickle.load(f))
    return _SA_FRAGMENT_SCORE_CACHE


def synthetic_accessibility(mol, _fscores=None):
    '''
    calculation of synthetic accessibility score as described in:

    'Estimation of Synthetic Accessibility Score of Drug-like Molecules based
    on Molecular Complexity and Fragment Contributions'
    Peter Ertl and Ansgar Schuffenhauer
    Journal of Cheminformatics 1:8 (2009)
    http://www.jcheminf.com/content/1/1/8

    several small modifications to the original paper are included
    particularly slightly different formula for macrocyclic penalty
    and taking into account also molecule symmetry (fingerprint density)

    for a set of 10k diverse molecules the agreement between the original
    method as implemented in PipelinePilot and this implementation is
    r2 = 0.97

    peter ertl & greg landrum, september 2013

    Parameters
    ----------
    mol : Mol
    _fscores : mapping or sequence of rows, optional
        Fragment contributions: either a mapping from fragment id to score,
        or raw rows of ``[score, id, id, ...]``. When omitted, the bundled
        ``fpscores.pkl.gz`` table is loaded once and reused on later calls.

    Returns
    -------
    float : synthetic accessibility score, clamped to the range 1..10

    Raises
    ------
    ZeroDivisionError : when the fingerprint of ``mol`` has no environments
    '''
    if _fscores is None:
        _fscores = _default_fragment_scores()
    elif not hasattr(_fscores, 'get'):
        # Raw table rows were passed in: index them by fragment id.
        _fscores = _index_fragment_scores(_fscores)

    # fragment score
    # 2 is the *radius* of the circular fingerprint
    fingerprint = rdMolDescriptors.GetMorganFingerprint(mol, 2)
    fingerprints = fingerprint.GetNonzeroElements()
    score1 = 0.
    nf = 0
    for bit_id, value in fingerprints.items():
        nf += value
        # Unknown fragments get the most unfavourable contribution (-4).
        score1 += _fscores.get(bit_id, -4) * value
    score1 /= nf

    # features score
    num_atoms = mol.GetNumAtoms()
    num_chiral_centers = len(
        Chem.FindMolChiralCenters(mol, includeUnassigned=True))
    ring_info = mol.GetRingInfo()
    num_spiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
    num_bridgeheads = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
    num_macrocycles = 0
    for each_ring in ring_info.AtomRings():
        if len(each_ring) > 8:
            num_macrocycles += 1

    size_penalty = num_atoms ** 1.005 - num_atoms
    stereo_penalty = math.log10(num_chiral_centers + 1)
    spiro_penalty = math.log10(num_spiro + 1)
    bridge_penalty = math.log10(num_bridgeheads + 1)
    macrocycle_penalty = 0.
    # ---------------------------------------
    # This differs from the paper, which defines:
    #  macrocycle_penalty = math.log10(num_macrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    if num_macrocycles > 0:
        macrocycle_penalty = math.log10(2)

    score2 = (0. - size_penalty - stereo_penalty - spiro_penalty -
              bridge_penalty - macrocycle_penalty)

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthesise
    score3 = 0.
    if num_atoms > len(fingerprints):
        score3 = math.log(float(num_atoms) / len(fingerprints)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    min_score = -4.0
    max_score = 2.5
    sascore = 11. - (sascore - min_score + 1) / (max_score - min_score) * 9.
    # smooth the 10-end
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore
def sa_score(smiles):
    """
    Return the SA Score for the given smiles representation.

    The score lies between 1 and 10. When the fingerprint of the parsed
    molecule cannot be computed, the error is passed to ``debug`` and 0 is
    returned instead.
    """
    molecule = Chem.MolFromSmiles(smiles)

    #
    # Fragment score (radius 2 circular fingerprint)
    #
    try:
        environments = rdMolDescriptors.GetMorganFingerprint(molecule, 2)
        environments = environments.GetNonzeroElements()
    except Exception as error:
        # Will throw a boost error for N+ so we just give a 0 for score
        debug(error)
        return 0

    # Count-weighted mean of the fragment contributions; unknown fragments
    # contribute -4.
    weighted = 0.0
    occurrences = 0
    for env_id, hits in environments.items():
        occurrences += hits
        weighted += MOLDB.get(env_id, -4) * hits
    weighted /= occurrences

    #
    # Features score
    #
    atom_total = molecule.GetNumAtoms()
    chiral_total = len(
        Chem.FindMolChiralCenters(molecule, includeUnassigned=True))
    bridgehead_total, spiro_total, macrocycle_total = ring_analysis(molecule)

    size_cost = (atom_total**1.005) - atom_total
    stereo_cost = math.log10(chiral_total + 1)
    spiro_cost = math.log10(spiro_total + 1)
    bridge_cost = math.log10(bridgehead_total + 1)
    # This differs from the paper, which defines the penalty as
    # log10(num_macrocycles + 1); a flat log10(2) generates better results
    # when 2 or more macrocycles are present.
    macrocycle_cost = math.log10(2) if macrocycle_total > 0 else 0.0

    feature_cost = (0.0 - size_cost - stereo_cost - spiro_cost -
                    bridge_cost - macrocycle_cost)

    #
    # Correction for the fingerprint density.
    # Not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthesise.
    #
    density_bonus = 0.0
    if atom_total > len(environments):
        density_bonus = math.log(
            float(atom_total) / len(environments)) * 0.5

    #
    # Total score, transformed from the "raw" value to a 1..10 scale
    #
    raw_total = weighted + feature_cost + density_bonus
    low_end = -4.0
    high_end = 2.5
    final = 11.0 - (raw_total - low_end + 1) / (high_end - low_end) * 9.0

    # smooth the 10-end
    if final > 8.0:
        final = 8.0 + math.log(final + 1.0 - 9.0)

    if final > 10.0:
        return 10.0
    if final < 1.0:
        return 1.0
    return final
def BuildMorganFP(mol):
    """Return the radius-2 Morgan fingerprint of ``mol``.

    The fingerprint's total value is stored on it as ``_sumCache``.
    """
    from rdkit.Chem import rdMolDescriptors

    morgan = rdMolDescriptors.GetMorganFingerprint(mol, 2)
    total = morgan.GetTotalVal()
    morgan._sumCache = total
    return morgan
def __call__(self, smile):
    """Return ``exp(1 - SA score)`` for a SMILES string.

    The synthetic accessibility score (1..10) is mapped through
    ``exp(1 - score)``, so the result is at most 1 and larger for lower
    scores. Returns 0.0 when the SMILES cannot be parsed or when the score
    computation raises an exception.

    Fragment contributions come from the module-level ``_fscores`` table,
    which is loaded through ``self.readFragmentScores()`` on first use.
    """
    if _fscores is None:
        self.readFragmentScores()

    m = Chem.MolFromSmiles(smile)
    if not m:
        return 0.0

    # ``except Exception`` keeps the best-effort behaviour without swallowing
    # KeyboardInterrupt / SystemExit the way a bare ``except`` does.
    try:
        # fragment score
        fp = rdMolDescriptors.GetMorganFingerprint(
            m, 2)  # <- 2 is the *radius* of the circular fingerprint
        fps = fp.GetNonzeroElements()
        score1 = 0.0
        nf = 0
        for bitId, v in fps.items():
            nf += v
            # Unknown fragments get the most unfavourable contribution (-4).
            score1 += _fscores.get(bitId, -4) * v
        score1 /= nf

        # features score
        nAtoms = m.GetNumAtoms()
        nChiralCenters = len(
            Chem.FindMolChiralCenters(m, includeUnassigned=True))
        ri = m.GetRingInfo()
        nBridgeheads = rdMolDescriptors.CalcNumBridgeheadAtoms(m)
        nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(m)
        nMacrocycles = 0
        for x in ri.AtomRings():
            if len(x) > 8:
                nMacrocycles += 1

        sizePenalty = nAtoms**1.005 - nAtoms
        stereoPenalty = math.log10(nChiralCenters + 1)
        spiroPenalty = math.log10(nSpiro + 1)
        bridgePenalty = math.log10(nBridgeheads + 1)
        macrocyclePenalty = 0.0
        # ---------------------------------------
        # This differs from the paper, which defines:
        #  macrocyclePenalty = math.log10(nMacrocycles+1)
        # This form generates better results when 2 or more macrocycles are present
        if nMacrocycles > 0:
            macrocyclePenalty = math.log10(2)

        score2 = (0.0 - sizePenalty - stereoPenalty - spiroPenalty -
                  bridgePenalty - macrocyclePenalty)

        # correction for the fingerprint density
        # not in the original publication, added in version 1.1
        # to make highly symmetrical molecules easier to synthesise
        score3 = 0.0
        if nAtoms > len(fps):
            score3 = math.log(float(nAtoms) / len(fps)) * 0.5

        sascore = score1 + score2 + score3

        # need to transform "raw" value into scale between 1 and 10
        min_score = -4.0
        max_score = 2.5
        sascore = (11.0 - (sascore - min_score + 1) /
                   (max_score - min_score) * 9.0)
        # smooth the 10-end
        if sascore > 8.0:
            sascore = 8.0 + math.log(sascore + 1.0 - 9.0)
        if sascore > 10.0:
            sascore = 10.0
        elif sascore < 1.0:
            sascore = 1.0

        # minimize the sascore
        return math.exp(1 - sascore)
    except Exception:
        return 0.0
def highlight_np_scores(mol, col_map="rwg", output="svg", png_fn="contribs.png", width=400, height=200):
    """Fragment highlighting for Peter Ertls Natural Product Likeness Score
    (J Chem Inf Model. 2008 Jan;48(1):68-74; DOI: 10.1021/ci700286x),
    as implemented in the RDKit.

    Every radius-2 Morgan fragment found in ``fscore`` adds its score to the
    bonds of its environment and to the atoms at both ends of those bonds;
    atoms and bonds are then coloured by their normalised totals.

    output can be (case-insensitive):
        `svg` (SVG image; default);
        `raw` (raw SVG string);
        `png` (PNG image, written to `png_fn`; requires cairoSVG);
        `png_tag` (HTML img tag containing the encoded PNG image);
        `debug` (text output of parameters).

    Helpful RDKit links (used for creating the code):
    http://www.rdkit.org/docs/GettingStartedInPython.html#explaining-bits-from-morgan-fingerprints
    http://rdkit.blogspot.de/2015/02/new-drawing-code.html"""
    output = output.lower()
    cmap = mpl_col.LinearSegmentedColormap(col_map, CDICT[col_map], 50)

    environments = {}
    rdMolDescriptors.GetMorganFingerprint(mol, 2, bitInfo=environments)

    num_atoms = 1  # float(mol.GetNumAtoms())
    atom_scores = Counter()
    bond_scores = Counter()
    for bit, occurrences in environments.items():
        if bit not in fscore:
            continue
        contribution = fscore[bit] / num_atoms
        for center, reach in occurrences:
            for b_idx in Chem.FindAtomEnvironmentOfRadiusN(mol, reach, center):
                bond = mol.GetBondWithIdx(b_idx)
                bond_scores[b_idx] += contribution
                atom_scores[bond.GetBeginAtomIdx()] += contribution
                atom_scores[bond.GetEndAtomIdx()] += contribution

    def normalised(scores, paint=None):
        # Normalise the scores around zero; optionally map them to colours.
        values = scores.values()
        norm = NormalizeAroundZero(vmin=min(values), vmax=max(values))
        if paint is None:
            return {key: norm(score) for key, score in scores.items()}
        return {key: paint(norm(score)) for key, score in scores.items()}

    if output == "debug":
        atom_cols = normalised(atom_scores)
        bond_cols = normalised(bond_scores)
        print("*** Scores ***:")
        print(atom_scores)
        print(bond_scores)
        print("*** Norm Scores ***:")
        print(atom_cols)
        print(bond_cols)
        return

    atom_cols = normalised(atom_scores, cmap)
    bond_cols = normalised(bond_scores, cmap)

    check_2d_coords(mol)
    drawer = rdMolDraw2D.MolDraw2DSVG(width, height)
    drawer.DrawMolecule(mol,
                        highlightAtoms=atom_cols.keys(),
                        highlightAtomColors=atom_cols,
                        highlightBonds=bond_cols.keys(),
                        highlightBondColors=bond_cols)
    drawer.FinishDrawing()

    # Strip the namespace prefix, the XML declaration and the trailing newline.
    svg = drawer.GetDrawingText()
    for old, new in (('svg:', ''),
                     ("</svg>\n", "</svg>"),
                     ("<?xml version='1.0' encoding='iso-8859-1'?>\n", "")):
        svg = svg.replace(old, new)

    if output == "raw":
        return svg

    if "png" in output:
        if not PNG:
            print(
                "Converting to PNG requires cairoSVG, which could not be found.\n"
                "Try `pip install cairoSVG` to resolve this.")
            return
        if output == "png":
            svg2png(bytestring=svg, write_to=png_fn)
            return
        if output == "png_tag":
            # return a HTML <img> tag containing the PNG img
            return mol_img_tag(svg2png(bytestring=svg))
        print("Unknown output option.")
        return

    if output == "svg":
        return SVG(svg)

    print("Unknown output option:", output)