Example #1
0

# this little trick saves duplicated code
def _NumMatches(mol, smarts):
    return len(mol.GetSubstructMatches(smarts, uniquify=1))


NumHDonors = lambda x: rdMolDescriptors.CalcNumHBD(x)
NumHDonors.__doc__ = "Number of Hydrogen Bond Donors"
NumHDonors.version = "1.0.0"
_HDonors = lambda x, y=HDonorSmarts: x.GetSubstructMatches(y, uniquify=1)
NumHAcceptors = lambda x: rdMolDescriptors.CalcNumHBA(x)
NumHAcceptors.__doc__ = "Number of Hydrogen Bond Acceptors"
NumHAcceptors.version = "2.0.0"
_HAcceptors = lambda x, y=HAcceptorSmarts: x.GetSubstructMatches(y, uniquify=1)
NumHeteroatoms = lambda x: rdMolDescriptors.CalcNumHeteroatoms(x)
NumHeteroatoms.__doc__ = "Number of Heteroatoms"
NumHeteroatoms.version = "1.0.0"
_Heteroatoms = lambda x, y=HeteroatomSmarts: x.GetSubstructMatches(y,
                                                                   uniquify=1)
NumRotatableBonds = lambda x: rdMolDescriptors.CalcNumRotatableBonds(x)
NumRotatableBonds.__doc__ = "Number of Rotatable Bonds"
NumRotatableBonds.version = "1.0.0"
_RotatableBonds = lambda x, y=RotatableBondSmarts: x.GetSubstructMatches(
    y, uniquify=1)
NOCount = lambda x: rdMolDescriptors.CalcNumLipinskiHBA(x)
NOCount.__doc__ = "Number of Nitrogens and Oxygens"
NOCount.version = "1.0.0"
NHOHCount = lambda x: rdMolDescriptors.CalcNumLipinskiHBD(x)
NHOHCount.__doc__ = "Number of NHs or OHs"
NHOHCount.version = "2.0.0"
    def get_global_features(self, mol):
        u = []
        # Now get some specific features
        fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
        factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
        feats = factory.GetFeaturesForMol(mol)

        # First get some basic features
        natoms = mol.GetNumAtoms()
        nbonds = mol.GetNumBonds()
        mw = Descriptors.ExactMolWt(mol)
        HeavyAtomMolWt = Descriptors.HeavyAtomMolWt(mol)
        NumValenceElectrons = Descriptors.NumValenceElectrons(mol)
        ''' # These four descriptors are producing the value of infinity for refcode_csd = YOLJUF (CCOP(=O)(Cc1ccc(cc1)NC(=S)NP(OC(C)C)(OC(C)C)[S])OCC\t\n)
        MaxAbsPartialCharge = Descriptors.MaxAbsPartialCharge(mol)
        MaxPartialCharge = Descriptors.MaxPartialCharge(mol)
        MinAbsPartialCharge = Descriptors.MinAbsPartialCharge(mol)
        MinPartialCharge = Descriptors.MinPartialCharge(mol)
        '''
        #        FpDensityMorgan1 = Descriptors.FpDensityMorgan1(mol)
        #        FpDensityMorgan2 = Descriptors.FpDensityMorgan2(mol)
        #        FpDensityMorgan3 = Descriptors.FpDensityMorgan3(mol)

        # Get some features using chemical feature factory

        nbrAcceptor = 0
        nbrDonor = 0
        nbrHydrophobe = 0
        nbrLumpedHydrophobe = 0
        nbrPosIonizable = 0
        nbrNegIonizable = 0

        for j in range(len(feats)):
            #print(feats[j].GetFamily(), feats[j].GetType())
            if ('Acceptor' == (feats[j].GetFamily())):
                nbrAcceptor = nbrAcceptor + 1
            elif ('Donor' == (feats[j].GetFamily())):
                nbrDonor = nbrDonor + 1
            elif ('Hydrophobe' == (feats[j].GetFamily())):
                nbrHydrophobe = nbrHydrophobe + 1
            elif ('LumpedHydrophobe' == (feats[j].GetFamily())):
                nbrLumpedHydrophobe = nbrLumpedHydrophobe + 1
            elif ('PosIonizable' == (feats[j].GetFamily())):
                nbrPosIonizable = nbrPosIonizable + 1
            elif ('NegIonizable' == (feats[j].GetFamily())):
                nbrNegIonizable = nbrNegIonizable + 1
            else:
                pass
                #print(feats[j].GetFamily())

        # Now get some features using rdMolDescriptors

        moreGlobalFeatures = [rdm.CalcNumRotatableBonds(mol), rdm.CalcChi0n(mol), rdm.CalcChi0v(mol), \
                            rdm.CalcChi1n(mol), rdm.CalcChi1v(mol), rdm.CalcChi2n(mol), rdm.CalcChi2v(mol), \
                            rdm.CalcChi3n(mol), rdm.CalcChi4n(mol), rdm.CalcChi4v(mol), \
                            rdm.CalcFractionCSP3(mol), rdm.CalcHallKierAlpha(mol), rdm.CalcKappa1(mol), \
                            rdm.CalcKappa2(mol), rdm.CalcLabuteASA(mol), \
                            rdm.CalcNumAliphaticCarbocycles(mol), rdm.CalcNumAliphaticHeterocycles(mol), \
                            rdm.CalcNumAliphaticRings(mol), rdm.CalcNumAmideBonds(mol), \
                            rdm.CalcNumAromaticCarbocycles(mol), rdm.CalcNumAromaticHeterocycles(mol), \
                            rdm.CalcNumAromaticRings(mol), rdm.CalcNumBridgeheadAtoms(mol), rdm.CalcNumHBA(mol), \
                            rdm.CalcNumHBD(mol), rdm.CalcNumHeteroatoms(mol), rdm.CalcNumHeterocycles(mol), \
                            rdm.CalcNumLipinskiHBA(mol), rdm.CalcNumLipinskiHBD(mol), rdm.CalcNumRings(mol), \
                            rdm.CalcNumSaturatedCarbocycles(mol), rdm.CalcNumSaturatedHeterocycles(mol), \
                            rdm.CalcNumSaturatedRings(mol), rdm.CalcNumSpiroAtoms(mol), rdm.CalcTPSA(mol)]


        u = [natoms, nbonds, mw, HeavyAtomMolWt, NumValenceElectrons, \
            nbrAcceptor, nbrDonor, nbrHydrophobe, nbrLumpedHydrophobe, \
            nbrPosIonizable, nbrNegIonizable]

        u = u + moreGlobalFeatures
        u = np.array(u).T
        # Some of the descriptors produice NAN. We can convert them to 0
        # If you are getting outliers in the training or validation set this could be
        # Because some important features were set to zero here because it produced NAN
        # Removing those features from the feature set might remove the outliers

        #u[np.isnan(u)] = 0

        #u = torch.tensor(u, dtype=torch.float)
        return (u)
Example #3
0
def get_molecular_features(dataframe, mol_list):
    df = dataframe
    for i in range(len(mol_list)):
        print("Getting molecular features for molecule: ", i)
        mol = mol_list[i]
        natoms = mol.GetNumAtoms()
        nbonds = mol.GetNumBonds()
        mw = Descriptors.ExactMolWt(mol)
        df.at[i,"NbrAtoms"] = natoms
        df.at[i,"NbrBonds"] = nbonds
        df.at[i,"mw"] = mw
        df.at[i,'HeavyAtomMolWt'] = Chem.Descriptors.HeavyAtomMolWt(mol)
        df.at[i,'NumValenceElectrons'] = Chem.Descriptors.NumValenceElectrons(mol)
        ''' # These four descriptors are producing the value of infinity for refcode_csd = YOLJUF (CCOP(=O)(Cc1ccc(cc1)NC(=S)NP(OC(C)C)(OC(C)C)[S])OCC\t\n)
        df.at[i,'MaxAbsPartialCharge'] = Chem.Descriptors.MaxAbsPartialCharge(mol)
        df.at[i,'MaxPartialCharge'] = Chem.Descriptors.MaxPartialCharge(mol)
        df.at[i,'MinAbsPartialCharge'] = Chem.Descriptors.MinAbsPartialCharge(mol)
        df.at[i,'MinPartialCharge'] = Chem.Descriptors.MinPartialCharge(mol)
        '''
        df.at[i,'FpDensityMorgan1'] = Chem.Descriptors.FpDensityMorgan1(mol)
        df.at[i,'FpDensityMorgan2'] = Chem.Descriptors.FpDensityMorgan2(mol)
        df.at[i,'FpDensityMorgan3'] = Chem.Descriptors.FpDensityMorgan3(mol)
        
        #print(natoms, nbonds)
        
        # Now get some specific features
        fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
        factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
        feats = factory.GetFeaturesForMol(mol)
        #df["Acceptor"] = 0
        #df["Aromatic"] = 0
        #df["Hydrophobe"] = 0
        nbrAcceptor = 0
        nbrDonor = 0
        nbrHydrophobe = 0
        nbrLumpedHydrophobe = 0
        nbrPosIonizable = 0
        nbrNegIonizable = 0
        for j in range(len(feats)):
            #print(feats[j].GetFamily(), feats[j].GetType())
            if ('Acceptor' == (feats[j].GetFamily())):
                nbrAcceptor = nbrAcceptor + 1
            elif ('Donor' == (feats[j].GetFamily())):
                nbrDonor = nbrDonor + 1
            elif ('Hydrophobe' == (feats[j].GetFamily())):
                nbrHydrophobe = nbrHydrophobe + 1
            elif ('LumpedHydrophobe' == (feats[j].GetFamily())):
                nbrLumpedHydrophobe = nbrLumpedHydrophobe + 1
            elif ('PosIonizable' == (feats[j].GetFamily())):
                nbrPosIonizable = nbrPosIonizable + 1
            elif ('NegIonizable' == (feats[j].GetFamily())):
                nbrNegIonizable = nbrNegIonizable + 1                
            else:
                pass#print(feats[j].GetFamily())
                        
        df.at[i,"Acceptor"] = nbrAcceptor
        df.at[i,"Donor"] = nbrDonor
        df.at[i,"Hydrophobe"] = nbrHydrophobe
        df.at[i,"LumpedHydrophobe"] = nbrLumpedHydrophobe
        df.at[i,"PosIonizable"] = nbrPosIonizable
        df.at[i,"NegIonizable"] = nbrNegIonizable
        
        # We can also get some more molecular features using rdMolDescriptors
        
        df.at[i,"NumRotatableBonds"] = rdMolDescriptors.CalcNumRotatableBonds(mol)
        df.at[i,"CalcChi0n"] = rdMolDescriptors.CalcChi0n(mol)
        df.at[i,"CalcChi0v"] = rdMolDescriptors.CalcChi0v(mol)
        df.at[i,"CalcChi1n"] = rdMolDescriptors.CalcChi1n(mol)
        df.at[i,"CalcChi1v"] = rdMolDescriptors.CalcChi1v(mol)
        df.at[i,"CalcChi2n"] = rdMolDescriptors.CalcChi2n(mol)
        df.at[i,"CalcChi2v"] = rdMolDescriptors.CalcChi2v(mol)
        df.at[i,"CalcChi3n"] = rdMolDescriptors.CalcChi3n(mol)
        df.at[i,"CalcChi3v"] = rdMolDescriptors.CalcChi3v(mol)
        df.at[i,"CalcChi4n"] = rdMolDescriptors.CalcChi4n(mol)
        df.at[i,"CalcChi4v"] = rdMolDescriptors.CalcChi4v(mol)
        df.at[i,"CalcFractionCSP3"] = rdMolDescriptors.CalcFractionCSP3(mol)
        df.at[i,"CalcHallKierAlpha"] = rdMolDescriptors.CalcHallKierAlpha(mol)
        df.at[i,"CalcKappa1"] = rdMolDescriptors.CalcKappa1(mol)
        df.at[i,"CalcKappa2"] = rdMolDescriptors.CalcKappa2(mol)
        #df.at[i,"CalcKappa3"] = rdMolDescriptors.CalcKappa3(mol)
        df.at[i,"CalcLabuteASA"] = rdMolDescriptors.CalcLabuteASA(mol)
        df.at[i,"CalcNumAliphaticCarbocycles"] = rdMolDescriptors.CalcNumAliphaticCarbocycles(mol)
        df.at[i,"CalcNumAliphaticHeterocycles"] = rdMolDescriptors.CalcNumAliphaticHeterocycles(mol)
        df.at[i,"CalcNumAliphaticRings"] = rdMolDescriptors.CalcNumAliphaticRings(mol)
        df.at[i,"CalcNumAmideBonds"] = rdMolDescriptors.CalcNumAmideBonds(mol)
        df.at[i,"CalcNumAromaticCarbocycles"] = rdMolDescriptors.CalcNumAromaticCarbocycles(mol)
        df.at[i,"CalcNumAromaticHeterocycles"] = rdMolDescriptors.CalcNumAromaticHeterocycles(mol)
        df.at[i,"CalcNumAromaticRings"] = rdMolDescriptors.CalcNumAromaticRings(mol)
        df.at[i,"CalcNumBridgeheadAtoms"] = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
        df.at[i,"CalcNumHBA"] = rdMolDescriptors.CalcNumHBA(mol)
        df.at[i,"CalcNumHBD"] = rdMolDescriptors.CalcNumHBD(mol)
        df.at[i,"CalcNumHeteroatoms"] = rdMolDescriptors.CalcNumHeteroatoms(mol)
        df.at[i,"CalcNumHeterocycles"] = rdMolDescriptors.CalcNumHeterocycles(mol)
        df.at[i,"CalcNumLipinskiHBA"] = rdMolDescriptors.CalcNumLipinskiHBA(mol)
        df.at[i,"CalcNumLipinskiHBD"] = rdMolDescriptors.CalcNumLipinskiHBD(mol)
        df.at[i,"CalcNumRings"] = rdMolDescriptors.CalcNumRings(mol)
        df.at[i,"CalcNumSaturatedCarbocycles"] = rdMolDescriptors.CalcNumSaturatedCarbocycles(mol)
        df.at[i,"CalcNumSaturatedHeterocycles"] = rdMolDescriptors.CalcNumSaturatedHeterocycles(mol)
        df.at[i,"CalcNumSaturatedRings"] = rdMolDescriptors.CalcNumSaturatedRings(mol)
        df.at[i,"CalcNumSpiroAtoms"] = rdMolDescriptors.CalcNumSpiroAtoms(mol)
        df.at[i,"CalcTPSA"] = rdMolDescriptors.CalcTPSA(mol)
    return(df)