def _calculateDescriptors(mol):
    """Compute a one-row table of RDKit descriptors for a single molecule.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        Molecule to describe.

    Returns
    -------
    pandas.DataFrame
        A frame with index ``[0]`` and one column per descriptor, in this
        order: scalar descriptors, then ``slogp_VSA1..12``, ``smr_VSA1..10``,
        ``peoe_VSA1..14`` and ``MQN1..42``.
    """
    # Crippen returns (logP, molar refractivity); compute the pair once.
    slogp, smr = rdMolDescriptors.CalcCrippenDescriptors(mol)

    values = {
        "SlogP": slogp,
        "SMR": smr,
        "LabuteASA": rdMolDescriptors.CalcLabuteASA(mol),
        "TPSA": Descriptors.TPSA(mol),
        "AMW": Descriptors.MolWt(mol),
        "ExactMW": rdMolDescriptors.CalcExactMolWt(mol),
        "NumLipinskiHBA": rdMolDescriptors.CalcNumLipinskiHBA(mol),
        "NumLipinskiHBD": rdMolDescriptors.CalcNumLipinskiHBD(mol),
        "NumRotatableBonds": rdMolDescriptors.CalcNumRotatableBonds(mol),
        "NumHBD": rdMolDescriptors.CalcNumHBD(mol),
        "NumHBA": rdMolDescriptors.CalcNumHBA(mol),
        "NumAmideBonds": rdMolDescriptors.CalcNumAmideBonds(mol),
        "NumHeteroAtoms": rdMolDescriptors.CalcNumHeteroatoms(mol),
        "NumHeavyAtoms": mol.GetNumHeavyAtoms(),
        "NumAtoms": mol.GetNumAtoms(),
        "NumRings": rdMolDescriptors.CalcNumRings(mol),
        "NumAromaticRings": rdMolDescriptors.CalcNumAromaticRings(mol),
        "NumSaturatedRings": rdMolDescriptors.CalcNumSaturatedRings(mol),
        "NumAliphaticRings": rdMolDescriptors.CalcNumAliphaticRings(mol),
        "NumAromaticHeterocycles":
            rdMolDescriptors.CalcNumAromaticHeterocycles(mol),
        "NumSaturatedHeterocycles":
            rdMolDescriptors.CalcNumSaturatedHeterocycles(mol),
        "NumAliphaticHeterocycles":
            rdMolDescriptors.CalcNumAliphaticHeterocycles(mol),
        "NumAromaticCarbocycles":
            rdMolDescriptors.CalcNumAromaticCarbocycles(mol),
        "NumSaturatedCarbocycles":
            rdMolDescriptors.CalcNumSaturatedCarbocycles(mol),
        "NumAliphaticCarbocycles":
            rdMolDescriptors.CalcNumAliphaticCarbocycles(mol),
        "FractionCSP3": rdMolDescriptors.CalcFractionCSP3(mol),
        "Chi0v": rdMolDescriptors.CalcChi0v(mol),
        "Chi1v": rdMolDescriptors.CalcChi1v(mol),
        "Chi2v": rdMolDescriptors.CalcChi2v(mol),
        "Chi3v": rdMolDescriptors.CalcChi3v(mol),
        "Chi4v": rdMolDescriptors.CalcChi4v(mol),
        "Chi1n": rdMolDescriptors.CalcChi1n(mol),
        "Chi2n": rdMolDescriptors.CalcChi2n(mol),
        "Chi3n": rdMolDescriptors.CalcChi3n(mol),
        "Chi4n": rdMolDescriptors.CalcChi4n(mol),
        "HallKierAlpha": rdMolDescriptors.CalcHallKierAlpha(mol),
        "kappa1": rdMolDescriptors.CalcKappa1(mol),
        "kappa2": rdMolDescriptors.CalcKappa2(mol),
        "kappa3": rdMolDescriptors.CalcKappa3(mol),
    }

    # Binned / vector descriptors: (column prefix, number of columns, values).
    # zip() caps each family at its fixed column count.
    vector_families = (
        ("slogp_VSA", 12, rdMolDescriptors.SlogP_VSA_(mol)),
        ("smr_VSA", 10, rdMolDescriptors.SMR_VSA_(mol)),
        ("peoe_VSA", 14, rdMolDescriptors.PEOE_VSA_(mol)),
        ("MQN", 42, rdMolDescriptors.MQNs_(mol)),
    )
    for prefix, width, family_values in vector_families:
        for position, value in zip(range(1, width + 1), family_values):
            values[prefix + str(position)] = value

    # Build the frame in one step; inserting ~116 columns one at a time
    # fragments the frame and triggers pandas' PerformanceWarning.
    return pd.DataFrame(values, index=[0])
def __init__(self, configuration: StatsExtractionConfig):
    """Store the configuration and build the Spark UDFs used by this object.

    Every UDF takes a single string argument. The helper functions below
    close over local variables only (never ``self``).

    :param configuration: stats-extraction settings; its
        ``standardisation_config`` mapping (filter name -> parameters) is
        turned into a list of ``FilterConfiguration`` objects.
    """
    self._filters = FilterTypesEnum
    self._columns = DataframeColumnsEnum
    self._stats = StatsExtractionEnum
    self._purging = PurgingEnum
    self._configuration = configuration

    standardisation_config = [
        FilterConfiguration(name=filter_name, parameters=filter_params)
        for filter_name, filter_params
        in self._configuration.standardisation_config.items()
    ]
    dec_separator = self._stats.DECORATION_SEPARATOR_TOKEN
    attachment_token = self._stats.ATTACHMENT_POINT_TOKEN

    def exact_mol_wt(smi):
        return ExactMolWt(Chem.MolFromSmiles(smi))

    def ring_count(smi):
        return rdMolDescriptors.CalcNumRings(Chem.MolFromSmiles(smi))

    def heavy_atom_count(smi):
        return Chem.MolFromSmiles(smi).GetNumHeavyAtoms()

    def aromatic_ring_count(smi):
        return rdMolDescriptors.CalcNumAromaticRings(Chem.MolFromSmiles(smi))

    def hbond_donor_count(smi):
        return rdMolDescriptors.CalcNumHBD(Chem.MolFromSmiles(smi))

    def hbond_acceptor_count(smi):
        return rdMolDescriptors.CalcNumHBA(Chem.MolFromSmiles(smi))

    def carbon_fraction(smi):
        # NOTE(review): the attribute is called "hetero atom ratio" but the
        # value is (#atoms with atomic number 6) / (#heavy atoms), i.e. the
        # carbon fraction -- confirm which one downstream code expects.
        parsed = Chem.MolFromSmiles(smi)
        carbons = [
            atom for atom in parsed.GetAtoms() if atom.GetAtomicNum() == 6
        ]
        return len(carbons) / parsed.GetNumHeavyAtoms()

    def canonical_smiles(smi):
        return Chem.MolToSmiles(Chem.MolFromSmiles(smi))

    def standardised_smiles(smi):
        # A new standardizer is created for every value.
        standardizer = RDKitStandardizer(standardisation_config, None)
        return standardizer.apply_filter(smi)

    def split_decorations(text):
        return text.split(dec_separator)

    def attachment_point_count(tokens):
        return list(tokens).count(attachment_token)

    string_array = pst.ArrayType(pst.StringType())

    self._mol_wts_udf = psf.udf(exact_mol_wt, pst.FloatType())
    self._num_rings_udf = psf.udf(ring_count, pst.IntegerType())
    self._num_atoms_udf = psf.udf(heavy_atom_count, pst.IntegerType())
    self._num_aromatic_rings_udf = psf.udf(aromatic_ring_count,
                                           pst.IntegerType())
    self._hbond_donors_udf = psf.udf(hbond_donor_count, pst.IntegerType())
    self._hbond_acceptors_udf = psf.udf(hbond_acceptor_count,
                                        pst.IntegerType())
    self._hetero_atom_ratio_udf = psf.udf(carbon_fraction, pst.FloatType())
    self._make_canonical_udf = psf.udf(canonical_smiles, pst.StringType())
    self._standardise_smiles_udf = psf.udf(standardised_smiles,
                                           pst.StringType())

    # Tokeniser: every match of the configured token regex, in order.
    self.regex = re.compile(self._stats.REGEX_TOKENS)
    self._tokeniser_udf = psf.udf(self.regex.findall, string_array)

    self._decoration_split_udf = psf.udf(split_decorations,
                                         pst.ArrayType(pst.StringType()))
    self._count_decorations_udf = psf.udf(attachment_point_count,
                                          pst.IntegerType())
def main(in_file, output):
    """Tabulate basic RDKit descriptors for every molecule in ``in_file``.

    Molecules are read with ``rdkit_open``. One row per molecule, keyed by the
    first whitespace-separated token of its ``_Name`` property, is written to
    ``output + '.csv'`` and ``output + '.xlsx'``.

    Parameters
    ----------
    in_file : str
        Path of the molecule file, passed to ``rdkit_open`` in a list.
    output : str
        Output path prefix (without extension).

    Notes
    -----
    Molecules that share a name overwrite each other; the last one wins.
    """
    molecules = rdkit_open([in_file])
    print('\n # Number of input molecule: {0}'.format(len(molecules)))

    compounds = {}
    for mol in molecules:
        name = mol.GetProp('_Name').split()[0]
        # Crippen returns (logP, molar refractivity); compute the pair once.
        logp, molar_refractivity = rd.CalcCrippenDescriptors(mol)
        compounds[name] = {
            'Name': name,
            'Formula': rd.CalcMolFormula(mol),
            'MW': rd._CalcMolWt(mol),                # Molecular Weight
            'logP': logp,                            # Partition coefficient
            'HDon': rd.CalcNumLipinskiHBD(mol),      # Lipinski Hbond donor
            'HAcc': rd.CalcNumLipinskiHBA(mol),      # Lipinski Hbond acceptor
            'TPSA': rd.CalcTPSA(mol),                # Topological polar surface area
            'Rotat': rd.CalcNumRotatableBonds(mol, strict=True),  # Rotatable bond
            'MolRef': molar_refractivity,            # Molar refractivity
            'AliRing': rd.CalcNumAliphaticRings(mol),  # Aliphatic ring number
            'AroRing': rd.CalcNumAromaticRings(mol),   # Aromatic ring number
            # The SMILES is generated once, with the isomeric settings; the
            # earlier default-settings SMILES was always overwritten.
            'SMILES': Chem.MolToSmiles(mol, isomericSmiles=True,
                                       allHsExplicit=False),
        }

    # Columns of data to print out, in output order.
    columns = [
        'Formula', 'MW', 'logP', 'HDon', 'HAcc', 'TPSA', 'Rotat', 'MolRef',
        'AliRing', 'AroRing', 'SMILES',
    ]

    table = pd.DataFrame.from_dict(compounds, orient='index')
    table.index.name = 'Name'
    # reindex() selects the same columns as table[columns] but also works when
    # no molecule was read (empty frame), producing header-only output files.
    table = table.reindex(columns=columns)

    # Output to CSV
    table.to_csv(
        output + '.csv', sep=',', na_rep='NA', encoding='utf-8',
        float_format='%.5f', header=True,
    )

    # Output to Excel
    table.to_excel(output + '.xlsx', header=True, na_rep='NA')
def assign_ring_order(self, skeleton, substituents):
    """
    Shift the digits in substituent SMILES past the skeleton's ring count.

    Arguments
    ---------
    skeleton : :class:`str`
        SMILES string of the molecular skeleton that the substituents
        will be attached to.

    substituents : :class:`list`
        SMILES strings of the allowed substituents. The first and last
        character of each entry are dropped before it is parsed to count
        its rings. The list is modified in place.

    Returns
    -------
    substituents : :class:`list`
        The same list object. For every entry containing at least one
        aromatic ring, each digit from 0 up to its aromatic-ring count is
        replaced by that digit plus the skeleton's aromatic-ring count.
    """
    offset = rdMolDescriptors.CalcNumAromaticRings(
        rdkit.MolFromSmiles(skeleton))

    for position, smiles in enumerate(substituents):
        ring_count = rdMolDescriptors.CalcNumAromaticRings(
            rdkit.MolFromSmiles(smiles[1:-1]))
        if ring_count <= 0:
            continue
        # Walk labels from highest to lowest so a freshly shifted label is
        # never shifted a second time.
        for label in range(ring_count, -1, -1):
            smiles = smiles.replace(str(label), str(label + offset))
        substituents[position] = smiles

    return substituents
def model_process_fun(model_out, visdom, n):
    """Plot diagnostics for the best-rewarded molecule of a batch to visdom.

    :param model_out: mapping with ``'info'`` (a ``(smiles, valid)`` pair) and
        ``'rewards'`` (a tensor; summed over axis 1 when it has more than one
        dimension).
    :param visdom: object exposing ``append(name, plot_type, **kwargs)``.
    :param n: x-coordinate (step counter) used for every line plot.

    Plots that need a parsed molecule (the SVG drawing, the score components
    and the atom count) are skipped when the best SMILES cannot be parsed;
    the reward and valid-fraction plots are always sent.
    """
    # TODO: rephrase this to return a dict, instead of calling visdom directly
    from rdkit import Chem
    from rdkit.Chem.Draw import MolToFile

    smiles, valid = model_out['info']
    total_rewards = model_out['rewards']
    if len(total_rewards.shape) > 1:
        total_rewards = total_rewards.sum(1)

    best_ind = torch.argmax(total_rewards).data.item()
    this_smile = smiles[best_ind]
    mol = Chem.MolFromSmiles(this_smile)  # None when the SMILES is invalid
    pic_save_path = os.path.realpath(root_location + '/images/' + 'tmp.svg')

    if mol is not None:
        # Drawing is best effort: report the problem and keep plotting.
        try:
            MolToFile(mol, pic_save_path, imageType='svg')
            with open(pic_save_path, 'r') as myfile:
                data = myfile.read()
            data = data.replace('svg:', '')
            visdom.append('best molecule of batch', 'svg', svgstr=data)
        except Exception as e:
            print(e)

        scores, norm_scores = scorer.get_scores([this_smile])
        visdom.append(
            'score component',
            'line',
            X=np.array([n]),
            Y=np.array([list(norm_scores[0]) + [norm_scores[0].sum()] +
                        [scores[0].sum()] +
                        [desc.CalcNumAromaticRings(mol)]]),
            opts={
                'legend': [
                    'logP', 'SA', 'cycle', 'norm_reward', 'reward',
                    'Aromatic rings'
                ]
            })

    visdom.append('reward',
                  'line',
                  X=np.array([n]),
                  Y=np.array([total_rewards[best_ind].item()]))
    visdom.append('fraction valid',
                  'line',
                  X=np.array([n]),
                  Y=np.array([valid.mean().data.item()]))

    if mol is not None:
        # Guarded: an unparsable SMILES has no atoms to count.
        visdom.append('num atoms',
                      'line',
                      X=np.array([n]),
                      Y=np.array([mol.GetNumAtoms()]))
def calculate_properties(self, smiles=None, mol=None, props=None):
    """Calculate the requested basic properties and store them in ``self.data``.

    :param smiles: SMILES string, parsed only when ``mol`` is not given.
    :param mol: already-parsed molecule; takes precedence over ``smiles``.
    :param props: iterable of property keys to compute (``'py_formula'``,
        ``'py_em'``, ``'py_n_Cl_Br'``, ``'py_na'``, ``'py_mw'``, ``'py_fsp3'``,
        ``'py_rb'``, ``'py_tpsa'``, ``'py_clogp'``, ``'py_nar'``,
        ``'py_nhba'``, ``'py_nhbd'``). Unknown keys are ignored.
    :returns: error flag (bool) -- ``True`` when nothing was requested or no
        molecule is available, ``False`` after the properties were stored.
    """
    # ``None`` replaces the former mutable ``[]`` default; both mean
    # "nothing requested", which is reported as an error.
    if props is None or len(props) == 0:
        return True

    if mol is None:
        # Without a SMILES there is nothing to parse; report the error
        # instead of handing None to the parser.
        if smiles is None:
            return True
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return True

    if 'py_formula' in props:
        self.data['py_formula'] = desc.CalcMolFormula(mol)
    if 'py_em' in props:
        self.data['py_em'] = round(desc.CalcExactMolWt(mol), 5)
    if 'py_n_Cl_Br' in props:
        # Combined number of chlorine and bromine atoms.
        self.data['py_n_Cl_Br'] = sum(
            1 for atom in mol.GetAtoms() if atom.GetSymbol() in ('Cl', 'Br'))
    if 'py_na' in props:
        self.data['py_na'] = mol.GetNumAtoms()
    if 'py_mw' in props:
        self.data['py_mw'] = desc._CalcMolWt(mol)
    if 'py_fsp3' in props:
        self.data['py_fsp3'] = desc.CalcFractionCSP3(mol)
    if 'py_rb' in props:
        self.data['py_rb'] = desc.CalcNumRotatableBonds(mol)
    if 'py_tpsa' in props:
        self.data['py_tpsa'] = desc.CalcTPSA(mol)
    if 'py_clogp' in props:
        self.data['py_clogp'] = desc.CalcCrippenDescriptors(mol)[0]
    if 'py_nar' in props:
        self.data['py_nar'] = desc.CalcNumAromaticRings(mol)
    if 'py_nhba' in props:
        self.data['py_nhba'] = desc.CalcNumHBA(mol)
    if 'py_nhbd' in props:
        self.data['py_nhbd'] = desc.CalcNumHBD(mol)
    return False
def feature_fp(smiles):
    """Return a numeric feature vector for a SMILES string.

    The vector is the MQN descriptor list followed by the scalar descriptors
    listed in ``trailing_descriptors`` below, in that order, as a NumPy array.
    """
    mol = Chem.MolFromSmiles(smiles)
    features = rdMolDescriptors.MQNs_(mol)

    trailing_descriptors = (
        rdMolDescriptors.CalcNumRotatableBonds,
        rdMolDescriptors.CalcExactMolWt,
        # NOTE(review): the rotatable-bond count appears a second time here;
        # it is kept so the vector length and layout stay unchanged.
        rdMolDescriptors.CalcNumRotatableBonds,
        rdMolDescriptors.CalcFractionCSP3,
        rdMolDescriptors.CalcNumAliphaticCarbocycles,
        rdMolDescriptors.CalcNumAliphaticHeterocycles,
        rdMolDescriptors.CalcNumAliphaticRings,
        rdMolDescriptors.CalcNumAromaticCarbocycles,
        rdMolDescriptors.CalcNumAromaticHeterocycles,
        rdMolDescriptors.CalcNumAromaticRings,
        rdMolDescriptors.CalcNumBridgeheadAtoms,
        rdMolDescriptors.CalcNumRings,
        rdMolDescriptors.CalcNumAmideBonds,
        rdMolDescriptors.CalcNumHeterocycles,
        rdMolDescriptors.CalcNumSpiroAtoms,
        rdMolDescriptors.CalcTPSA,
    )
    for calculate in trailing_descriptors:
        features.append(calculate(mol))

    return np.array(features)
def model_process_fun(model_out, visdom, n):
    """Plot diagnostics for the best-rewarded molecule of a batch to visdom.

    :param model_out: 5-tuple ``(actions, logits, rewards, terminals, info)``;
        only ``rewards`` (summed over axis 1) and ``info`` (a
        ``(smiles, valid)`` pair) are used.
    :param visdom: object exposing ``append(name, plot_type, **kwargs)``.
    :param n: x-coordinate (step counter) used for every line plot.

    The SVG drawing and the score-component plot need a parsed molecule and
    are skipped when the best SMILES cannot be parsed; the valid-fraction
    plot is always sent.
    """
    from rdkit import Chem
    from rdkit.Chem.Draw import MolToFile

    _, _, rewards, _, info = model_out
    smiles, valid = info
    total_rewards = rewards.sum(1)
    best_ind = torch.argmax(total_rewards).data.item()
    this_smile = smiles[best_ind]
    mol = Chem.MolFromSmiles(this_smile)  # None when the SMILES is invalid
    pic_save_path = root_location + 'images/' + 'test.svg'

    if mol is not None:
        # Drawing stays best effort, but the failure is reported instead of
        # being silently discarded by a bare ``except``.
        try:
            MolToFile(mol, pic_save_path, imageType='svg')
            with open(pic_save_path, 'r') as myfile:
                data = myfile.read()
            data = data.replace('svg:', '')
            visdom.append('best molecule of batch', 'svg', svgstr=data)
        except Exception as e:
            print(e)

        scores, norm_scores = scorer.get_scores([this_smile])
        visdom.append(
            'score component',
            'line',
            X=np.array([n]),
            Y=np.array([list(norm_scores[0]) + [norm_scores[0].sum()] +
                        [scores[0].sum()] +
                        [desc.CalcNumAromaticRings(mol)]]),
            opts={
                'legend': [
                    'logP', 'SA', 'cycle', 'norm_reward', 'reward',
                    'Aromatic rings'
                ]
            })

    visdom.append('fraction valid',
                  'line',
                  X=np.array([n]),
                  Y=np.array([valid.mean().data.item()]))
def numAromatic(x):
    """Return the aromatic-ring count of the molecule written as SMILES in ``x``.

    ``x`` is converted with ``str()`` before it is parsed.
    """
    smiles_text = str(x)
    return rdMolDescriptors.CalcNumAromaticRings(
        Chem.MolFromSmiles(smiles_text))
def get_global_features(self, mol):
    """Return molecule-level features of ``mol`` as a NumPy array.

    Layout: atom count, bond count, exact molecular weight, heavy-atom
    molecular weight, valence-electron count, the number of chemical
    features per family (Acceptor, Donor, Hydrophobe, LumpedHydrophobe,
    PosIonizable, NegIonizable), then the ``rdm`` descriptors listed in
    ``descriptor_functions`` in that order.
    """
    # Basic size / weight features.
    basic_features = [
        mol.GetNumAtoms(),
        mol.GetNumBonds(),
        Descriptors.ExactMolWt(mol),
        Descriptors.HeavyAtomMolWt(mol),
        Descriptors.NumValenceElectrons(mol),
    ]
    # The four partial-charge descriptors (MaxAbsPartialCharge,
    # MaxPartialCharge, MinAbsPartialCharge, MinPartialCharge) are left out:
    # they produced infinity for refcode_csd = YOLJUF
    # (CCOP(=O)(Cc1ccc(cc1)NC(=S)NP(OC(C)C)(OC(C)C)[S])OCC).
    # The FpDensityMorgan1-3 descriptors are not included either.

    # Count chemical features per family using RDKit's base definitions.
    definition_file = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    feature_factory = ChemicalFeatures.BuildFeatureFactory(definition_file)
    families = ('Acceptor', 'Donor', 'Hydrophobe', 'LumpedHydrophobe',
                'PosIonizable', 'NegIonizable')
    tally = dict.fromkeys(families, 0)
    for feature in feature_factory.GetFeaturesForMol(mol):
        family = feature.GetFamily()
        if family in tally:  # other families are ignored
            tally[family] += 1
    family_counts = [tally[family] for family in families]

    # Descriptors taken from rdMolDescriptors, in output order.
    descriptor_functions = (
        rdm.CalcNumRotatableBonds, rdm.CalcChi0n, rdm.CalcChi0v,
        rdm.CalcChi1n, rdm.CalcChi1v, rdm.CalcChi2n, rdm.CalcChi2v,
        rdm.CalcChi3n, rdm.CalcChi4n, rdm.CalcChi4v,
        rdm.CalcFractionCSP3, rdm.CalcHallKierAlpha, rdm.CalcKappa1,
        rdm.CalcKappa2, rdm.CalcLabuteASA,
        rdm.CalcNumAliphaticCarbocycles, rdm.CalcNumAliphaticHeterocycles,
        rdm.CalcNumAliphaticRings, rdm.CalcNumAmideBonds,
        rdm.CalcNumAromaticCarbocycles, rdm.CalcNumAromaticHeterocycles,
        rdm.CalcNumAromaticRings, rdm.CalcNumBridgeheadAtoms, rdm.CalcNumHBA,
        rdm.CalcNumHBD, rdm.CalcNumHeteroatoms, rdm.CalcNumHeterocycles,
        rdm.CalcNumLipinskiHBA, rdm.CalcNumLipinskiHBD, rdm.CalcNumRings,
        rdm.CalcNumSaturatedCarbocycles, rdm.CalcNumSaturatedHeterocycles,
        rdm.CalcNumSaturatedRings, rdm.CalcNumSpiroAtoms, rdm.CalcTPSA,
    )
    descriptor_values = [calculate(mol) for calculate in descriptor_functions]

    # NaN values are passed through unchanged; no NaN-to-zero replacement
    # and no tensor conversion happens here.
    u = np.array(basic_features + family_counts + descriptor_values).T
    return u
#FRAGMENTS = { # "acyl_halide": Chem.MolFromSmarts('[#9,#17,#35,#53]=O'), # C(=O)X # "anhydride": Chem.MolFromSmarts('[#6]-[#6](=O)-[#8]-[#6](-[#6])=O'), # CC(=O)OC(=O)C # "peroxide": Chem.MolFromSmarts('[#8]-[#8]'), # R-O-O-R' # "ab_unsaturated_ketone": Chem.MolFromSmarts('[#6]=[#6]-[#6]=O'), # R=CC=O #} DESCRIPTORS = { # classical molecular descriptors "num_heavy_atoms": lambda x: x.GetNumAtoms(), "molecular_weight": lambda x: round(Desc.ExactMolWt(x), 4), "num_rings": lambda x: rdMolDesc.CalcNumRings(x), "num_rings_arom": lambda x: rdMolDesc.CalcNumAromaticRings(x), "num_rings_ali": lambda x: rdMolDesc.CalcNumAliphaticRings(x), "num_hbd": lambda x: rdMolDesc.CalcNumLipinskiHBD(x), "num_hba": lambda x: rdMolDesc.CalcNumLipinskiHBA(x), "slogp": lambda x: round(Crippen.MolLogP(x), 4), "tpsa": lambda x: round(rdMolDesc.CalcTPSA(x), 4), "num_rotatable_bond": lambda x: rdMolDesc.CalcNumRotatableBonds(x), "num_atoms_oxygen": lambda x: len( [a for a in x.GetAtoms() if a.GetAtomicNum() == 8] ), "num_atoms_nitrogen": lambda x: len( [a for a in x.GetAtoms() if a.GetAtomicNum() == 7] ), "num_atoms_halogen": Fragments.fr_halogen, "num_atoms_bridgehead": rdMolDesc.CalcNumBridgeheadAtoms, # custom molecular descriptors
def num_aromatic_rings(mol: Mol) -> int:
    """Return the number of aromatic rings RDKit reports for ``mol``."""
    ring_total = rdMolDescriptors.CalcNumAromaticRings(mol)
    return ring_total
def calculate_number_aromatic_rings(self):
    '''
    Count the aromatic rings of the molecule stored in ``self.mol``.

    :return: the value of ``rdMolDescriptors.CalcNumAromaticRings`` for
        ``self.mol``
    '''
    molecule = self.mol
    return rdMolDescriptors.CalcNumAromaticRings(molecule)
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as Des

# Load the training table; the file's own header row is skipped and the two
# columns are named explicitly.
df = pd.read_csv('Datasets/COVID/train.csv',
                 names=['smiles', 'aff'],
                 skiprows=1)

# Parse every SMILES and add explicit hydrogens, so the atom and bond counts
# below include hydrogens.
# NOTE(review): MolFromSmiles returns None for an unparsable SMILES, which
# makes AddHs raise -- confirm the input file contains only valid SMILES.
df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)
df['mol'] = df['mol'].apply(Chem.AddHs)

df['num_of_atoms'] = df['mol'].apply(lambda x: x.GetNumAtoms())
df['num_of_bonds'] = df['mol'].apply(lambda x: x.GetNumBonds())
df['num_of_bonds_sq'] = df['mol'].apply(lambda x: x.GetNumBonds()**2)
# df['ringsar'] = df['mol'].apply(lambda x: Des.CalcNumAromaticRings(x))
df['rings_sq'] = df['mol'].apply(lambda x: Des.CalcNumAromaticRings(x)**2)
# df['num_of_heavy_atoms'] = df['mol'].apply(lambda x: x.GetNumHeavyAtoms())
df['n_h_a_sq'] = df['mol'].apply(lambda x: x.GetNumHeavyAtoms()**2)


def number_of_atoms(atom_list, df):
    """Add one ``num_of_<X>_atoms`` column to ``df`` per entry of ``atom_list``.

    Each entry is a SMILES string used as a substructure query; the column
    holds the number of matches in the molecule of the row's ``mol`` column.
    ``df`` is modified in place.
    """
    for i in atom_list:
        # Parse the query once per entry instead of once per row.
        pattern = Chem.MolFromSmiles(i)
        df['num_of_{}_atoms'.format(i)] = df['mol'].apply(
            lambda x, pattern=pattern: len(x.GetSubstructMatches(pattern)))


number_of_atoms(['C', 'O', 'N'], df)

############################################

# Feature table: everything except the raw SMILES, the molecule objects and
# the target column.
train_df = df.drop(columns=['smiles', 'mol', 'aff'])
def get_molecular_features(dataframe, mol_list):
    """Fill ``dataframe`` with molecule-level descriptors, one row per molecule.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to fill; the values for the molecule at position ``i`` are
        written to the row labelled ``i``. Modified in place.
    mol_list : sequence of rdkit.Chem.rdchem.Mol
        Molecules to describe.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with size/weight columns, Morgan
        fingerprint densities, chemical-feature family counts and the
        ``rdMolDescriptors`` columns listed in ``rd_columns``.
    """
    df = dataframe

    # The feature factory depends only on RDKit's data file, so it is built
    # once here rather than once per molecule.
    fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    factory = ChemicalFeatures.BuildFeatureFactory(fdefName)

    # Chemical-feature families that get a count column, in column order.
    families = ("Acceptor", "Donor", "Hydrophobe", "LumpedHydrophobe",
                "PosIonizable", "NegIonizable")

    # (column name, rdMolDescriptors function), in column order.
    # CalcKappa3 is not included.
    rd_columns = (
        ("NumRotatableBonds", rdMolDescriptors.CalcNumRotatableBonds),
        ("CalcChi0n", rdMolDescriptors.CalcChi0n),
        ("CalcChi0v", rdMolDescriptors.CalcChi0v),
        ("CalcChi1n", rdMolDescriptors.CalcChi1n),
        ("CalcChi1v", rdMolDescriptors.CalcChi1v),
        ("CalcChi2n", rdMolDescriptors.CalcChi2n),
        ("CalcChi2v", rdMolDescriptors.CalcChi2v),
        ("CalcChi3n", rdMolDescriptors.CalcChi3n),
        ("CalcChi3v", rdMolDescriptors.CalcChi3v),
        ("CalcChi4n", rdMolDescriptors.CalcChi4n),
        ("CalcChi4v", rdMolDescriptors.CalcChi4v),
        ("CalcFractionCSP3", rdMolDescriptors.CalcFractionCSP3),
        ("CalcHallKierAlpha", rdMolDescriptors.CalcHallKierAlpha),
        ("CalcKappa1", rdMolDescriptors.CalcKappa1),
        ("CalcKappa2", rdMolDescriptors.CalcKappa2),
        ("CalcLabuteASA", rdMolDescriptors.CalcLabuteASA),
        ("CalcNumAliphaticCarbocycles",
         rdMolDescriptors.CalcNumAliphaticCarbocycles),
        ("CalcNumAliphaticHeterocycles",
         rdMolDescriptors.CalcNumAliphaticHeterocycles),
        ("CalcNumAliphaticRings", rdMolDescriptors.CalcNumAliphaticRings),
        ("CalcNumAmideBonds", rdMolDescriptors.CalcNumAmideBonds),
        ("CalcNumAromaticCarbocycles",
         rdMolDescriptors.CalcNumAromaticCarbocycles),
        ("CalcNumAromaticHeterocycles",
         rdMolDescriptors.CalcNumAromaticHeterocycles),
        ("CalcNumAromaticRings", rdMolDescriptors.CalcNumAromaticRings),
        ("CalcNumBridgeheadAtoms", rdMolDescriptors.CalcNumBridgeheadAtoms),
        ("CalcNumHBA", rdMolDescriptors.CalcNumHBA),
        ("CalcNumHBD", rdMolDescriptors.CalcNumHBD),
        ("CalcNumHeteroatoms", rdMolDescriptors.CalcNumHeteroatoms),
        ("CalcNumHeterocycles", rdMolDescriptors.CalcNumHeterocycles),
        ("CalcNumLipinskiHBA", rdMolDescriptors.CalcNumLipinskiHBA),
        ("CalcNumLipinskiHBD", rdMolDescriptors.CalcNumLipinskiHBD),
        ("CalcNumRings", rdMolDescriptors.CalcNumRings),
        ("CalcNumSaturatedCarbocycles",
         rdMolDescriptors.CalcNumSaturatedCarbocycles),
        ("CalcNumSaturatedHeterocycles",
         rdMolDescriptors.CalcNumSaturatedHeterocycles),
        ("CalcNumSaturatedRings", rdMolDescriptors.CalcNumSaturatedRings),
        ("CalcNumSpiroAtoms", rdMolDescriptors.CalcNumSpiroAtoms),
        ("CalcTPSA", rdMolDescriptors.CalcTPSA),
    )

    for i, mol in enumerate(mol_list):
        print("Getting molecular features for molecule: ", i)

        # Basic size / weight features.
        df.at[i, "NbrAtoms"] = mol.GetNumAtoms()
        df.at[i, "NbrBonds"] = mol.GetNumBonds()
        df.at[i, "mw"] = Descriptors.ExactMolWt(mol)
        df.at[i, 'HeavyAtomMolWt'] = Descriptors.HeavyAtomMolWt(mol)
        df.at[i, 'NumValenceElectrons'] = Descriptors.NumValenceElectrons(mol)
        # The four partial-charge descriptors (MaxAbsPartialCharge,
        # MaxPartialCharge, MinAbsPartialCharge, MinPartialCharge) are left
        # out: they produced infinity for refcode_csd = YOLJUF
        # (CCOP(=O)(Cc1ccc(cc1)NC(=S)NP(OC(C)C)(OC(C)C)[S])OCC).
        df.at[i, 'FpDensityMorgan1'] = Descriptors.FpDensityMorgan1(mol)
        df.at[i, 'FpDensityMorgan2'] = Descriptors.FpDensityMorgan2(mol)
        df.at[i, 'FpDensityMorgan3'] = Descriptors.FpDensityMorgan3(mol)

        # Count chemical features per family; other families are ignored.
        tally = dict.fromkeys(families, 0)
        for feat in factory.GetFeaturesForMol(mol):
            family = feat.GetFamily()
            if family in tally:
                tally[family] += 1
        for family in families:
            df.at[i, family] = tally[family]

        # More molecular features from rdMolDescriptors.
        for column, calculate in rd_columns:
            df.at[i, column] = calculate(mol)

    return df
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors

# Read one SMILES per line from rb/smiles.smi and write one CSV row per
# molecule to rb/descriptors.csv:
#   smiles, logP, molecular weight, heavy-atom count, aromatic rings,
#   rotatable bonds
# Both files are closed by the ``with`` block, even if a descriptor call fails.
with open('rb/smiles.smi') as smiles, \
        open('rb/descriptors.csv', 'w') as salida:
    for line in smiles:
        print(line)
        mol = Chem.MolFromSmiles(line)
        if mol is None:
            # MolFromSmiles returns None for text it cannot parse; report the
            # line and continue instead of crashing on the first descriptor.
            print('Skipping unparsable SMILES: ' + line.rstrip('\n'))
            continue
        logp = Descriptors.MolLogP(mol)
        molwt = Descriptors.MolWt(mol)
        hac = Descriptors.HeavyAtomCount(mol)
        ar = rdMolDescriptors.CalcNumAromaticRings(mol)
        rb = rdMolDescriptors.CalcNumRotatableBonds(mol)
        # rstrip('\n') removes only the line terminator, so a final line
        # without a trailing newline keeps its last character.
        lista = [line.rstrip('\n'), logp, molwt, hac, ar, rb]
        salida.write(','.join(str(elem) for elem in lista) + '\n')