def pains(filtered_df):
    """Remove PAINS-matching compounds and flag the remainder by activity.

    Every row's SMILES is parsed with RDKit and screened against the RDKit
    PAINS filter catalog (families A, B and C). Only compounds that hit no
    catalog entry are kept.

    Args:
        filtered_df: DataFrame providing the columns ``molecule_chembl_id``,
            ``smiles`` and ``pIC50``.

    Returns:
        A new DataFrame with the columns ``ChEMBL_ID``, ``smiles`` and
        ``pIC50`` for the PAINS-free compounds, anything ``create_mol`` adds
        to it, and an ``active`` column holding 1.0 where ``pIC50 >= 6.3``
        and 0.0 otherwise.
    """
    # Build a catalog from all PAINS families (A, B and C).
    params = FilterCatalogParams()
    params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)
    catalog = FilterCatalog(params)

    # Collect the surviving rows first and build the frame once; growing a
    # DataFrame one ``.loc`` row at a time re-allocates on every insert.
    clean_rows = []
    for chembl_id, smiles, pic50 in zip(
        filtered_df["molecule_chembl_id"],
        filtered_df["smiles"],
        filtered_df["pIC50"],
    ):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES: the catalog cannot be queried with None, so
            # the compound is skipped instead of aborting the whole run.
            continue
        if catalog.GetFirstMatch(mol) is None:
            clean_rows.append((chembl_id, smiles, pic50))

    df_new = pd.DataFrame(clean_rows, columns=["ChEMBL_ID", "smiles", "pIC50"])

    # Create molecules from SMILES and their fingerprints.
    # NOTE(review): create_mol is defined elsewhere; it is assumed to modify
    # df_new in place, with 2048 being the fingerprint length - confirm.
    create_mol(df_new, 2048)

    # Mark every molecule with a pIC50 of at least 6.3 as active.
    df_new["active"] = (df_new["pIC50"] >= 6.3).astype(float)
    return df_new
def painspredict(thefile, theoutput):
    """Screen a SMILES file against the NIH filter catalog and report Fsp3.

    Each non-blank line of ``thefile`` is expected to start with a SMILES
    string, optionally followed by further whitespace-separated fields. For
    every line whose SMILES parses, one record is written to ``theoutput``:
    a running index, the line split on single spaces, the filter verdict
    (with the matched entry's description on a hit) and the fraction of sp3
    carbons.

    The input file is only read. Lines that do not parse as SMILES (a header
    line, for instance) are skipped, and each record is labelled with the
    text of the line it was parsed from.

    NOTE(review): the verdict is labelled "PAINS" although the catalog that
    is loaded is ``FilterCatalogs.NIH`` - confirm which one is intended.

    Args:
        thefile: Path of the SMILES input file.
        theoutput: Path of the report to write; overwritten if it exists.
    """
    # A stale 'output.txt' from an earlier run is discarded; on a first run
    # there is nothing to delete and that is not an error.
    try:
        os.remove('output.txt')
    except FileNotFoundError:
        pass

    params = FilterCatalogParams()
    params.AddCatalog(FilterCatalogParams.FilterCatalogs.NIH)
    catalog = FilterCatalog(params)

    # Context managers guarantee the report is flushed and both handles are
    # released even if RDKit raises part-way through.
    with open(theoutput, 'w') as report, open(thefile, 'r') as smiles_file:
        count = 0
        for line in smiles_file:
            fields = line.split()
            if not fields:
                continue  # blank line
            mol = Chem.MolFromSmiles(fields[0])
            if mol is None:
                continue  # header or unparsable SMILES
            tokens = line.rstrip().split(" ")
            entry = catalog.GetFirstMatch(mol)
            sphybrid = Chem.rdMolDescriptors.CalcFractionCSP3(mol)
            if entry is not None:
                print(count, tokens, "PAINS", entry.GetDescription(),
                      "Fsp3", sphybrid, file=report)
            else:
                print(count, tokens, "PAINS OK", "Fsp3", sphybrid,
                      file=report)
            count += 1