rel_path = "Data/hits.csv" abs_file_path = os.path.join(script_dir, rel_path) hit_df.to_csv(abs_file_path) # Before we merge, get rid of duplicate entries in hit and miss dfs hit_SMILES = [SMILES for SMILES in hit_df.CompoundSMILES] miss_df = miss_df[~miss_df['CompoundSMILES'].isin(hit_SMILES)] # Let's merge the hit and miss dfs for modelling df = pd.merge(hit_df, miss_df, how='outer') # Get rid of duplicate values again df = df.drop_duplicates(subset='CompoundSMILES', keep="first") # Produce rdkit features from SMILES df, properties = rdkit_utils.get_rdkit_properties(df) # Get X, y and training and test data y = df['Site_No'] X = df.drop(columns=['Site_No']) # Let's try add some feature engineering from feature tools # Make an entityset and add the entity es = ft.EntitySet(id='chem_features') es.entity_from_dataframe(entity_id='data', dataframe=X, make_index=False, index='CompoundSMILES') # Run deep feature synthesis with transformation primitives X, feature_defs = ft.dfs(entityset=es,
@author: warren """ import rdkit_utils import pandas as pd import featuretools as ft import joblib # Use the model to select frags df_test_frags = pd.read_excel("Data/Mpro_cocryst_2020_04_16.xlsx", usecols='D') df_test_frags.rename(columns={'SMILES': 'CompoundSMILES'}, inplace=True) df_SMILES_frags = pd.DataFrame(columns=['CompoundSMILES', 'Pred']) # Generate rdkit descriptors from SMILES X_test, properties = rdkit_utils.get_rdkit_properties(df_test_frags) # Remove SMILES column from X train df_SMILES_frags['CompoundSMILES'] = X_test['CompoundSMILES'] # Let's try add some feature engineering from feature tools # Make an entityset and add the entity es = ft.EntitySet(id='chem_features') es.entity_from_dataframe(entity_id='data', dataframe=X_test, make_index=False, index='CompoundSMILES') # Run deep feature synthesis with transformation primitives X_test, feature_defs = ft.dfs( entityset=es,
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
import joblib

# Read csv of John's docking scores.
# NOTE(review): os.path.abspath("__file__") uses the *string* "__file__",
# which resolves relative to the current working directory rather than the
# script's location — presumably deliberate for interactive use; confirm.
script_dir = os.path.dirname(os.path.abspath("__file__"))
rel_path = "Data/Docking_03042020.csv"
abs_file_path = os.path.join(script_dir, rel_path)
docked_df = pd.read_csv(abs_file_path, usecols=["SMILES", "Hybrid2"])
docked_df.columns = ["CompoundSMILES", "Hybrid2"]

# Produce rdkit features from SMILES
df, properties = rdkit_utils.get_rdkit_properties(docked_df)

# Get X, y and training and test data
y = df['Hybrid2']
X = df.drop(columns=['CompoundSMILES', 'Hybrid2'])

# Normalise data
scaler_data = MinMaxScaler(feature_range=(-1, 1))

# Scale data used for model
X = scaler_data.fit_transform(X)

# Save scaler model for running submission
scaler_filename = "Data/scaler_data.save"
joblib.dump(scaler_data, scaler_filename)
import featuretools as ft import matplotlib.pyplot as plt from math import sqrt # Read csv of John's docking scores file_path = "Data/covid_submissions_all_info-2020-04-06-docked-justscore.csv" docked_df = pd.read_csv(file_path, usecols=["SMILES", "Hybrid2"]) docked_df.columns = ["CompoundSMILES", "Hybrid2"] # Get rid of duplicate values docked_df = docked_df.drop_duplicates(subset='CompoundSMILES', keep="first") docked_df = docked_df.reset_index(drop=True) # Produce rdkit features from SMILES df, properties = rdkit_utils.get_rdkit_properties(docked_df) # Get X, y and training and test data y = df['Hybrid2'] X = df.drop(columns=['Hybrid2']) # Let's try add some feature engineering from feature tools # Make an entityset and add the entity es = ft.EntitySet(id='chem_features') es.entity_from_dataframe(entity_id='data', dataframe=X, make_index=False, index='CompoundSMILES') # Run deep feature synthesis with transformation primitives X, feature_defs = ft.dfs(entityset=es,
# Seed the RNG so the BRICS enumeration order is reproducible
random.seed(0xf00d)
BRICS_func = BRICSBuild(allfrags)

# Get BRICS builds and allocate them to list chunks in generator
# to help memory
all_BRICS_builds = rdkit_utils.get_BRICS_builds(BRICS_func)

# Convert to SMILES and write to csv for having a look at later
rdkit_utils.write_BRICS_csv(all_BRICS_builds, filename="Data/BRICS_hits.csv")

# Use the model to filter the BRICS compounds
df_test_BRICS = pd.read_csv("Data/BRICS_hits.csv", header=0)
df_SMILES_BRICS = pd.DataFrame(columns=['CompoundSMILES', 'Pred'])

# Generate rdkit descriptors from SMILES
X_test_BRICS_pred, properties = rdkit_utils.get_rdkit_properties(df_test_BRICS)

# Filter using Lipinksi filter/NB >300 and <500 MW applied in BRICS.py
X_test_BRICS_pred = X_test_BRICS_pred[X_test_BRICS_pred.NumRotatableBonds <= 5]

# Keep the SMILES for reporting, then drop the column so the feature
# matrix matches what the model was trained on.
df_SMILES_BRICS['CompoundSMILES'] = X_test_BRICS_pred['CompoundSMILES']
X_test_BRICS_pred = X_test_BRICS_pred.drop(columns=['CompoundSMILES'])

# Load saved rf model from classify.py script
rf_model = joblib.load("Models/RF_model.pkl")

# Test model
y_test_BRICS_pred = rf_model.predict(X_test_BRICS_pred)

# Get SMILES of BRICS_predictions