Example no. 1
import os

import pandas as pd
import featuretools as ft

import rdkit_utils

# hit_df and miss_df (the hit/miss compound tables) are assembled earlier in the script
script_dir = os.path.dirname(os.path.abspath(__file__))
rel_path = "Data/hits.csv"
abs_file_path = os.path.join(script_dir, rel_path)
hit_df.to_csv(abs_file_path)

# Before we merge, remove any compounds from the miss df that already appear in the hit df
hit_SMILES = hit_df['CompoundSMILES'].tolist()
miss_df = miss_df[~miss_df['CompoundSMILES'].isin(hit_SMILES)]

# Let's merge the hit and miss dfs for modelling
df = pd.merge(hit_df, miss_df, how='outer')

# Get rid of duplicate values again
df = df.drop_duplicates(subset='CompoundSMILES', keep="first")

# Produce rdkit features from SMILES
df, properties = rdkit_utils.get_rdkit_properties(df)

# Get X, y and training and test data
y = df['Site_No']
X = df.drop(columns=['Site_No'])

# Let's try adding some feature engineering from featuretools
# Make an entityset and add the entity
es = ft.EntitySet(id='chem_features')
es.entity_from_dataframe(entity_id='data',
                         dataframe=X,
                         make_index=False,
                         index='CompoundSMILES')
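# NB: EntitySet.entity_from_dataframe is the pre-1.0 Featuretools API;
# from Featuretools 1.0 onwards the equivalent call is es.add_dataframe(...)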

# Run deep feature synthesis with transformation primitives
# (call truncated in the source listing: target_entity matches the entity_id
#  above; the transformation primitives shown here are illustrative)
X, feature_defs = ft.dfs(entityset=es,
                         target_entity='data',
                         trans_primitives=['add_numeric', 'multiply_numeric'])
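# A minimal sketch of how the classification step could continue from here
# (assumed continuation: estimator and split settings are illustrative, X rows
#  are assumed to stay aligned with y, and the model path mirrors Example no. 5):
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
rf_model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
print("Test accuracy:", rf_model.score(X_test, y_test))
joblib.dump(rf_model, "Models/RF_model.pkl")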
Example no. 2
"""
@author: warren
"""
import rdkit_utils
import pandas as pd
import featuretools as ft
import joblib

# Use the model to select frags
df_test_frags = pd.read_excel("Data/Mpro_cocryst_2020_04_16.xlsx", usecols='D')
df_test_frags.rename(columns={'SMILES': 'CompoundSMILES'}, inplace=True)

df_SMILES_frags = pd.DataFrame(columns=['CompoundSMILES', 'Pred'])

# Generate rdkit descriptors from SMILES
X_test, properties = rdkit_utils.get_rdkit_properties(df_test_frags)

# Keep a copy of the SMILES column for the predictions dataframe
df_SMILES_frags['CompoundSMILES'] = X_test['CompoundSMILES']

# Let's try adding some feature engineering from featuretools
# Make an entityset and add the entity
es = ft.EntitySet(id='chem_features')
es.entity_from_dataframe(entity_id='data',
                         dataframe=X_test,
                         make_index=False,
                         index='CompoundSMILES')

# Run deep feature synthesis with transformation primitives
# (call truncated in the source listing; arguments mirror the training script)
X_test, feature_defs = ft.dfs(
    entityset=es,
    target_entity='data',
    trans_primitives=['add_numeric', 'multiply_numeric'])
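# A minimal sketch of the prediction step this script builds towards
# (assumed continuation, mirroring the load/predict pattern in Example no. 5):
rf_model = joblib.load("Models/RF_model.pkl")
df_SMILES_frags['Pred'] = rf_model.predict(X_test)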
Example no. 3
import os

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
import joblib

import rdkit_utils

# Read csv of John's docking scores
script_dir = os.path.dirname(os.path.abspath(__file__))
rel_path = "Data/Docking_03042020.csv"
abs_file_path = os.path.join(script_dir, rel_path)

docked_df = pd.read_csv(abs_file_path, usecols=["SMILES", "Hybrid2"])
docked_df.columns = ["CompoundSMILES", "Hybrid2"] 

# Produce rdkit features from SMILES
df, properties = rdkit_utils.get_rdkit_properties(docked_df)

# Get X, y and training and test data
y = df['Hybrid2']
X = df.drop(columns=['CompoundSMILES', 'Hybrid2'])

# Normalise data
scaler_data = MinMaxScaler(feature_range=(-1, 1))

# Scale data used for model
X = scaler_data.fit_transform(X)

# Save scaler model for running submission
scaler_filename = "Data/scaler_data.save"
joblib.dump(scaler_data, scaler_filename)
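# A minimal sketch of how the imported regressor and grid search could be
# used from here (assumed continuation; the parameter grid is illustrative):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

param_grid = {'n_estimators': [100, 500], 'max_depth': [None, 10]}
grid = GridSearchCV(RandomForestRegressor(random_state=0), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_, "Test R^2:", grid.score(X_test, y_test))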
    
Example no. 4
import pandas as pd
import featuretools as ft
import matplotlib.pyplot as plt
from math import sqrt

import rdkit_utils

# Read csv of John's docking scores
file_path = "Data/covid_submissions_all_info-2020-04-06-docked-justscore.csv"

docked_df = pd.read_csv(file_path, usecols=["SMILES", "Hybrid2"])
docked_df.columns = ["CompoundSMILES", "Hybrid2"]

# Get rid of duplicate values
docked_df = docked_df.drop_duplicates(subset='CompoundSMILES', keep="first")
docked_df = docked_df.reset_index(drop=True)

# Produce rdkit features from SMILES
df, properties = rdkit_utils.get_rdkit_properties(docked_df)

# Get X, y and training and test data
y = df['Hybrid2']
X = df.drop(columns=['Hybrid2'])

# Let's try adding some feature engineering from featuretools
# Make an entityset and add the entity
es = ft.EntitySet(id='chem_features')
es.entity_from_dataframe(entity_id='data',
                         dataframe=X,
                         make_index=False,
                         index='CompoundSMILES')

# Run deep feature synthesis with transformation primitives
# (call truncated in the source listing: target_entity matches the entity_id
#  above; the transformation primitives shown here are illustrative)
X, feature_defs = ft.dfs(entityset=es,
                         target_entity='data',
                         trans_primitives=['add_numeric', 'multiply_numeric'])
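# A minimal sketch of the evaluation the matplotlib/sqrt imports point to
# (assumed continuation: model and metric choices are illustrative, and
#  X rows are assumed to stay aligned with y after dfs):
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)
print("RMSE:", sqrt(mean_squared_error(y_test, y_pred)))

plt.scatter(y_test, y_pred, s=5)
plt.xlabel("Hybrid2 docking score")
plt.ylabel("Predicted score")
plt.show()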
Example no. 5
import random

import pandas as pd
import joblib
from rdkit.Chem.BRICS import BRICSBuild

import rdkit_utils

# allfrags (the hit fragment molecules) is assembled earlier in the script
random.seed(0xf00d)
BRICS_func = BRICSBuild(allfrags)

# Get BRICS builds and allocate them to list chunks in generator
# to help memory
all_BRICS_builds = rdkit_utils.get_BRICS_builds(BRICS_func)

# Convert to SMILES and write to csv for having a look at later
rdkit_utils.write_BRICS_csv(all_BRICS_builds, filename="Data/BRICS_hits.csv")

# Use the model to filter the BRICS compounds
df_test_BRICS = pd.read_csv("Data/BRICS_hits.csv", header=0)
df_SMILES_BRICS = pd.DataFrame(columns=['CompoundSMILES', 'Pred'])

# Generate rdkit descriptors from SMILES
X_test_BRICS_pred, properties = rdkit_utils.get_rdkit_properties(df_test_BRICS)

# Filter using Lipinski filter (NB: >300 and <500 MW already applied in BRICS.py)
X_test_BRICS_pred = X_test_BRICS_pred[X_test_BRICS_pred.NumRotatableBonds <= 5]

# Keep a copy of the SMILES column, then drop it before prediction
df_SMILES_BRICS['CompoundSMILES'] = X_test_BRICS_pred['CompoundSMILES']
X_test_BRICS_pred = X_test_BRICS_pred.drop(columns=['CompoundSMILES'])

# Load saved rf model from classify.py script
rf_model = joblib.load("Models/RF_model.pkl")

# Run the model over the BRICS compounds
y_test_BRICS_pred = rf_model.predict(X_test_BRICS_pred)

# Get SMILES of BRICS_predictions
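# (assumed continuation of the step named above: pair each prediction
#  with its SMILES string in the predictions dataframe)
df_SMILES_BRICS['Pred'] = y_test_BRICS_pred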