# Shared helper so the "count the substructure matches" logic is written once.
def _NumMatches(mol, smarts):
    """Return how many uniquified matches ``mol`` has for the query ``smarts``.

    ``smarts`` is handed straight to ``mol.GetSubstructMatches``.
    """
    return len(mol.GetSubstructMatches(smarts, uniquify=1))


# The descriptors below are real ``def``s rather than lambdas bound to names:
# that gives each one a proper ``__name__`` (readable tracebacks, picklable by
# reference) while keeping the same call signature, ``__doc__`` text and
# ``version`` attribute as before.
def NumHDonors(x):
    """Number of Hydrogen Bond Donors"""
    return rdMolDescriptors.CalcNumHBD(x)


NumHDonors.version = "1.0.0"


def _HDonors(x, y=HDonorSmarts):
    """Return the uniquified matches of ``y`` (default: ``HDonorSmarts``) in ``x``."""
    return x.GetSubstructMatches(y, uniquify=1)


def NumHAcceptors(x):
    """Number of Hydrogen Bond Acceptors"""
    return rdMolDescriptors.CalcNumHBA(x)


NumHAcceptors.version = "2.0.0"


def _HAcceptors(x, y=HAcceptorSmarts):
    """Return the uniquified matches of ``y`` (default: ``HAcceptorSmarts``) in ``x``."""
    return x.GetSubstructMatches(y, uniquify=1)


def NumHeteroatoms(x):
    """Number of Heteroatoms"""
    return rdMolDescriptors.CalcNumHeteroatoms(x)


NumHeteroatoms.version = "1.0.0"


def _Heteroatoms(x, y=HeteroatomSmarts):
    """Return the uniquified matches of ``y`` (default: ``HeteroatomSmarts``) in ``x``."""
    return x.GetSubstructMatches(y, uniquify=1)


def NumRotatableBonds(x):
    """Number of Rotatable Bonds"""
    return rdMolDescriptors.CalcNumRotatableBonds(x)


NumRotatableBonds.version = "1.0.0"


def _RotatableBonds(x, y=RotatableBondSmarts):
    """Return the uniquified matches of ``y`` (default: ``RotatableBondSmarts``) in ``x``."""
    return x.GetSubstructMatches(y, uniquify=1)


def NOCount(x):
    """Number of Nitrogens and Oxygens"""
    return rdMolDescriptors.CalcNumLipinskiHBA(x)


NOCount.version = "1.0.0"


def NHOHCount(x):
    """Number of NHs or OHs"""
    return rdMolDescriptors.CalcNumLipinskiHBD(x)


NHOHCount.version = "2.0.0"
def get_global_features(self, mol):
    """Assemble the molecule-level feature vector for ``mol``.

    The vector is laid out as, in order:
      1. atom count, bond count, exact mol. weight, heavy-atom mol. weight,
         number of valence electrons;
      2. how many features of the families Acceptor, Donor, Hydrophobe,
         LumpedHydrophobe, PosIonizable and NegIonizable the feature factory
         built from ``BaseFeatures.fdef`` reports for the molecule;
      3. 35 values from the ``rdm.Calc*`` functions listed below.

    Args:
        mol: molecule object passed to the RDKit descriptor functions.

    Returns:
        1-D ``numpy`` array with the 46 values described above. NaN values
        produced by a descriptor are returned as-is (not replaced by 0).

    ``self`` is not used by this method.
    """
    # Family counts come from the feature definitions in RDConfig.RDDataDir.
    fdef_path = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    feature_factory = ChemicalFeatures.BuildFeatureFactory(fdef_path)
    mol_feats = feature_factory.GetFeaturesForMol(mol)

    # Basic size / weight descriptors.
    # The Max/Min(Abs)PartialCharge descriptors are deliberately left out:
    # they yielded infinity for CSD refcode YOLJUF
    # (CCOP(=O)(Cc1ccc(cc1)NC(=S)NP(OC(C)C)(OC(C)C)[S])OCC).
    basic_values = [
        mol.GetNumAtoms(),
        mol.GetNumBonds(),
        Descriptors.ExactMolWt(mol),
        Descriptors.HeavyAtomMolWt(mol),
        Descriptors.NumValenceElectrons(mol),
    ]

    # Tally the feature families of interest; any other family is ignored.
    tracked_families = (
        'Acceptor',
        'Donor',
        'Hydrophobe',
        'LumpedHydrophobe',
        'PosIonizable',
        'NegIonizable',
    )
    family_totals = {family: 0 for family in tracked_families}
    for feat in mol_feats:
        family = feat.GetFamily()
        if family in family_totals:
            family_totals[family] += 1
    family_values = [family_totals[family] for family in tracked_families]

    # rdMolDescriptors calculators, in output order.
    # NOTE(review): CalcChi3v is not in this list (Chi3n is followed directly
    # by Chi4n) -- confirm whether that omission is intentional.
    calculators = (
        rdm.CalcNumRotatableBonds,
        rdm.CalcChi0n,
        rdm.CalcChi0v,
        rdm.CalcChi1n,
        rdm.CalcChi1v,
        rdm.CalcChi2n,
        rdm.CalcChi2v,
        rdm.CalcChi3n,
        rdm.CalcChi4n,
        rdm.CalcChi4v,
        rdm.CalcFractionCSP3,
        rdm.CalcHallKierAlpha,
        rdm.CalcKappa1,
        rdm.CalcKappa2,
        rdm.CalcLabuteASA,
        rdm.CalcNumAliphaticCarbocycles,
        rdm.CalcNumAliphaticHeterocycles,
        rdm.CalcNumAliphaticRings,
        rdm.CalcNumAmideBonds,
        rdm.CalcNumAromaticCarbocycles,
        rdm.CalcNumAromaticHeterocycles,
        rdm.CalcNumAromaticRings,
        rdm.CalcNumBridgeheadAtoms,
        rdm.CalcNumHBA,
        rdm.CalcNumHBD,
        rdm.CalcNumHeteroatoms,
        rdm.CalcNumHeterocycles,
        rdm.CalcNumLipinskiHBA,
        rdm.CalcNumLipinskiHBD,
        rdm.CalcNumRings,
        rdm.CalcNumSaturatedCarbocycles,
        rdm.CalcNumSaturatedHeterocycles,
        rdm.CalcNumSaturatedRings,
        rdm.CalcNumSpiroAtoms,
        rdm.CalcTPSA,
    )
    descriptor_values = [calc(mol) for calc in calculators]

    # Some descriptors can yield NaN. They are intentionally NOT zeroed here:
    # forcing an informative feature to 0 may create outliers in the
    # training/validation sets; dropping such a feature is the safer remedy.
    # (.T is a no-op on this 1-D array; kept for parity with the old code.)
    return np.array(basic_values + family_values + descriptor_values).T
def get_molecular_features(dataframe, mol_list):
    """Write RDKit descriptors for every molecule into ``dataframe``.

    For the molecule at position ``i`` of ``mol_list`` the values are stored
    in the row labelled ``i`` via ``dataframe.at[i, column]``. Columns are
    written in this order: basic counts and weights, Morgan fingerprint
    densities, feature-family counts from ``BaseFeatures.fdef``, then the
    ``rdMolDescriptors.Calc*`` values.

    Args:
        dataframe: pandas DataFrame to fill; it is modified in place.
        mol_list: sequence of molecule objects accepted by the RDKit
            descriptor functions, indexed by position.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    df = dataframe

    # The feature factory does not depend on the molecule, so build it once
    # instead of re-reading the definition file for every molecule.
    fdef_name = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    factory = ChemicalFeatures.BuildFeatureFactory(fdef_name)

    # Feature families that get their own count column (column == family).
    family_columns = (
        "Acceptor",
        "Donor",
        "Hydrophobe",
        "LumpedHydrophobe",
        "PosIonizable",
        "NegIonizable",
    )

    # Columns whose name equals the rdMolDescriptors function producing them.
    # CalcKappa3 is intentionally not included (it was disabled previously).
    rdmol_columns = (
        "CalcChi0n",
        "CalcChi0v",
        "CalcChi1n",
        "CalcChi1v",
        "CalcChi2n",
        "CalcChi2v",
        "CalcChi3n",
        "CalcChi3v",
        "CalcChi4n",
        "CalcChi4v",
        "CalcFractionCSP3",
        "CalcHallKierAlpha",
        "CalcKappa1",
        "CalcKappa2",
        "CalcLabuteASA",
        "CalcNumAliphaticCarbocycles",
        "CalcNumAliphaticHeterocycles",
        "CalcNumAliphaticRings",
        "CalcNumAmideBonds",
        "CalcNumAromaticCarbocycles",
        "CalcNumAromaticHeterocycles",
        "CalcNumAromaticRings",
        "CalcNumBridgeheadAtoms",
        "CalcNumHBA",
        "CalcNumHBD",
        "CalcNumHeteroatoms",
        "CalcNumHeterocycles",
        "CalcNumLipinskiHBA",
        "CalcNumLipinskiHBD",
        "CalcNumRings",
        "CalcNumSaturatedCarbocycles",
        "CalcNumSaturatedHeterocycles",
        "CalcNumSaturatedRings",
        "CalcNumSpiroAtoms",
        "CalcTPSA",
    )

    for i, mol in enumerate(mol_list):
        print("Getting molecular features for molecule: ", i)

        # Basic size / weight descriptors.
        df.at[i, "NbrAtoms"] = mol.GetNumAtoms()
        df.at[i, "NbrBonds"] = mol.GetNumBonds()
        df.at[i, "mw"] = Descriptors.ExactMolWt(mol)
        df.at[i, 'HeavyAtomMolWt'] = Descriptors.HeavyAtomMolWt(mol)
        df.at[i, 'NumValenceElectrons'] = Descriptors.NumValenceElectrons(mol)

        # The Max/Min(Abs)PartialCharge descriptors are deliberately skipped:
        # they yielded infinity for CSD refcode YOLJUF
        # (CCOP(=O)(Cc1ccc(cc1)NC(=S)NP(OC(C)C)(OC(C)C)[S])OCC).

        df.at[i, 'FpDensityMorgan1'] = Descriptors.FpDensityMorgan1(mol)
        df.at[i, 'FpDensityMorgan2'] = Descriptors.FpDensityMorgan2(mol)
        df.at[i, 'FpDensityMorgan3'] = Descriptors.FpDensityMorgan3(mol)

        # Count features per family; families without a column are ignored.
        family_totals = {family: 0 for family in family_columns}
        for feat in factory.GetFeaturesForMol(mol):
            family = feat.GetFamily()
            if family in family_totals:
                family_totals[family] += 1
        for family in family_columns:
            df.at[i, family] = family_totals[family]

        # This one column does not carry the "Calc" prefix of its function.
        df.at[i, "NumRotatableBonds"] = rdMolDescriptors.CalcNumRotatableBonds(mol)
        for column in rdmol_columns:
            df.at[i, column] = getattr(rdMolDescriptors, column)(mol)

    return df