def experiment(dataset_file, method='GraphConv', split='scaffold'):
    featurizer = 'ECFP'
    if method == 'GraphConv':
        featurizer = 'GraphConv'
    tasks, datasets, transformers = load_dataset(dataset_file, featurizer=featurizer, split=split)
    train, val, test = datasets

    model = None
    if method == 'GraphConv':
        model = GraphConvModel(len(tasks), batch_size=BATCH_SIZE, mode="regression")
    elif method == 'RF':
        def model_builder_rf(model_dir):
            sklearn_model = RandomForestRegressor(n_estimators=100)
            return dc.models.SklearnModel(sklearn_model, model_dir)
        model = dc.models.SingletaskToMultitask(tasks, model_builder_rf)
    elif method == 'SVR':
        def model_builder_svr(model_dir):
            sklearn_model = svm.SVR(kernel='linear')
            return dc.models.SklearnModel(sklearn_model, model_dir)
        model = dc.models.SingletaskToMultitask(tasks, model_builder_svr)

    return model, train, val, test, transformers
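# A minimal, hypothetical usage sketch for experiment(). It assumes load_dataset,
# BATCH_SIZE, dc, GraphConvModel, RandomForestRegressor, and svm are imported/defined
# elsewhere in this module, and 'example_dataset.csv' is a placeholder path.
model, train, val, test, transformers = experiment('example_dataset.csv', method='GraphConv')
model.fit(train, nb_epoch=20)
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print(model.evaluate(val, [metric], transformers))
print(model.evaluate(test, [metric], transformers))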
def gc_model_builder(model_params, model_dir):
    gc_model = GraphConvModel(**model_params, model_dir="./models")
    return gc_model
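# Hypothetical usage of gc_model_builder above (not from the original snippet):
# the model_params dict keys mirror GraphConvModel's constructor arguments.
params = {'n_tasks': 1, 'mode': 'regression', 'batch_size': 64}
gc_model = gc_model_builder(params, model_dir='./models')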
import os
import json
from typing import List

import numpy as np
import tensorflow as tf

np.random.seed(123)
tf.set_random_seed(123)

import deepchem as dc
from deepchem.data.datasets import NumpyDataset
from deepchem.models.tensorgraph.models.graph_models import GraphConvModel

model_dir = os.path.join(os.path.dirname(__file__), "..", "model")

# Load the task names saved at training time
with open(os.path.join(model_dir, '..', 'tasks.json'), 'r') as fp:
    tasks = json.load(fp)

# Restore the trained multitask classifier
model = GraphConvModel(12, mode='classification', model_dir=model_dir, batch_size=128)
model.restore()


# Make the inference function
def invoke_model(feats: np.array, smiles: List[str]) -> List[dict]:
    """Invoke the model

    Args:
        feats (np.array): Features for the model
        smiles ([str]): SMILES

    Returns:
        ([dict]) Return the data
    """
    # Turn the features into a Numpy dataset
""" import warnings warnings.filterwarnings('ignore') import deepchem as dc #from deepchem.models.tensorgraph.models.graph_models import MPNNTensorGraph from deepchem.models.tensorgraph.models.graph_models import GraphConvModel #from deepchem.feat import WeaveFeaturizer from deepchem.feat.graph_features import ConvMolFeaturizer from deepchem.feat.graph_features import WeaveFeaturizer from deepchem.data.data_loader import CSVLoader import pandas as pd import numpy as np featurizer = ConvMolFeaturizer() #featurizer = WeaveFeaturizer(graph_distance=True, explicit_H=False) train_loader = CSVLoader(tasks=['LogD7.4'], smiles_field='smiles', featurizer=featurizer) test_loader = CSVLoader(tasks=['LogD7.4'], smiles_field='smiles', featurizer=featurizer) X_train = train_loader.featurize('../demo_data/reg/training_set.csv') X_test = test_loader.featurize('../demo_data/reg/testing_set.csv') model = GraphConvModel(n_tasks=1, mode='regression') model.fit(X_train) print(model.predict(X_test))
# Load Tox21 dataset
tox21_tasks, tox21_datasets, transformers = load_tox21(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = tox21_datasets
print(train_dataset.data_dir)
print(valid_dataset.data_dir)

# Fit models
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

# Batch size of models
batch_size = 50

model = GraphConvModel(len(tox21_tasks), batch_size=batch_size, mode='classification')
model.fit(train_dataset, nb_epoch=10)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)
print("Validation scores")
print(valid_scores)
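# Optional addition (not in the original snippet): the held-out Tox21 test split is
# loaded above but never scored; it can be evaluated with the same metric and transformers.
test_scores = model.evaluate(test_dataset, [metric], transformers)
print("Test scores")
print(test_scores)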
support_generator = dc.data.SupportGenerator(test_dataset, n_pos, n_neg, n_trials)

# Compute accuracies
task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}
for trial_num, (task, support) in enumerate(support_generator):
    print("Starting trial %d" % trial_num)

    # Number of features on conv-mols
    n_feat = 75
    # Batch size of models
    batch_size = 50
    #graph_model = dc.nn.SequentialGraph(n_feat)
    model = GraphConvModel(1, graph_conv_layers=[64, 128, 64], batch_size=batch_size)

    # Fit trained model
    model.fit(support, nb_epoch=10)

    # Test model
    task_dataset = dc.data.get_task_dataset_minus_support(test_dataset, support, task)
    y_pred = model.predict(task_dataset)
    score = metric.compute_metric(task_dataset.y, y_pred, task_dataset.w)
    print("Score on task %s is %s" % (str(task), str(score)))
    task_scores[task].append(score)

# Join information for all tasks.
mean_task_scores = {}
std_task_scores = {}
for task in range(len(test_dataset.get_task_names())):
def graph_conv_training():
    graph_featurizer = dc.feat.graph_features.ConvMolFeaturizer()
    loader = dc.data.data_loader.CSVLoader(tasks=[t_task.get()],
                                           smiles_field=t_smiles.get(),
                                           id_field=t_id.get(),
                                           featurizer=graph_featurizer)
    dataset = loader.featurize(t_csv.get())
    splitter = dc.splits.splitters.RandomSplitter()
    trainset, testset = splitter.train_test_split(dataset)

    hp = dc.molnet.preset_hyper_parameters
    param = hp.hps['graphconvreg']
    print(param)
    batch_size = 48

    from deepchem.models.tensorgraph.models.graph_models import GraphConvModel
    #model = GraphConvModel(n_tasks=1, batch_size=64, uncertainty=False, mode='regression')  # unused; overwritten below
    model = dc.models.GraphConvTensorGraph(1,
                                           batch_size=batch_size,
                                           learning_rate=1e-3,
                                           use_queue=False,
                                           mode='regression',
                                           model_dir=t_savename.get())
    np.random.seed(1)
    random.seed(1)
    model.fit(dataset, nb_epoch=max(1, int(t_epochs.get())))
    #model.fit(trainset, nb_epoch=max(1, int(t_epochs.get())))

    metric = dc.metrics.Metric(dc.metrics.r2_score)
    print('epoch: ', t_epochs.get())
    print("Evaluating model")
    train_score = model.evaluate(trainset, [metric])
    test_score = model.evaluate(testset, [metric])
    model.save()

    pred_train = model.predict(trainset)
    pred_test = model.predict(testset)
    y_train = np.array(trainset.y, dtype=np.float32)
    y_test = np.array(testset.y, dtype=np.float32)

    import matplotlib.pyplot as plt
    plt.figure(figsize=(5, 5))
    plt.scatter(y_train, pred_train, label='Train', c='blue')
    plt.title('Graph Convolution')
    plt.xlabel('Measured value')
    plt.ylabel('Predicted value')
    plt.scatter(y_test, pred_test, c='lightgreen', label='Test', alpha=0.8)
    plt.legend(loc=4)
    #plt.show()
    plt.savefig('score-tmp.png')

    from PIL import Image
    img = Image.open('score-tmp.png')
    img_resize = img.resize((400, 400), Image.LANCZOS)
    img_resize.save('score-tmp.png')

    global image_score
    image_score_open = Image.open('score-tmp.png')
    image_score = ImageTk.PhotoImage(image_score_open, master=frame1)
    canvas.create_image(200, 200, image=image_score)

    # Report R2 scores
    print("Train score")
    print(train_score)
    t_train_r2.set(train_score)
    print("Test scores")
    print(test_score)
    t_test_r2.set(test_score)

    # Calculate RMSE
    train_rmse = 1
    test_rmse = 1
    '''
    print("Train RMSE")
    print(train_rmse)
    t_train_rmse.set(train_rmse)
    print("Test RMSE")
    print(test_rmse)
    t_test_rmse.set(test_rmse)
    '''

    # Flatten predictions/targets so they can be stored as DataFrame columns
    df_save = pd.DataFrame({'pred_train': pred_train.flatten(),
                            'meas_train': y_train.flatten()})
    df_save.to_csv('pred_and_meas_train.csv')
    print('finish!')
tf.set_random_seed(123)

import deepchem as dc
from deepchem.molnet import load_tox21
from deepchem.models.tensorgraph.models.graph_models import GraphConvModel

model_dir = "model"

# Load Tox21 dataset
tox21_tasks, tox21_datasets, transformers = load_tox21(featurizer='GraphConv')
with open('tasks.json', 'w') as fp:
    json.dump(tox21_tasks, fp)
train_dataset, valid_dataset, test_dataset = tox21_datasets

# Fit models
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

# Batch size of models
batch_size = 50

model = GraphConvModel(len(tox21_tasks), batch_size=batch_size,
                       mode='classification', model_dir=model_dir)
model.fit(train_dataset, nb_epoch=50)
model.save()
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
print("Training ROC-AUC Score: %f" % train_scores["mean-roc_auc_score"])
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Validation ROC-AUC Score: %f" % valid_scores["mean-roc_auc_score"])

'''
###########################################################################################
# Load HIV dataset
hiv_tasks, hiv_datasets, transformers = dc.molnet.load_hiv(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = hiv_datasets

model = GraphConvModel(len(hiv_tasks), batch_size=70, mode='classification')
# Set nb_epoch=10 for better results.
model.fit(train_dataset, nb_epoch=1)

metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
print("Training ROC-AUC Score: %f" % train_scores["mean-roc_auc_score"])
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Validation ROC-AUC Score: %f" % valid_scores["mean-roc_auc_score"])
'''

############################################################################################
# Load SAMPL (FreeSolv) dataset
SAMPL_tasks, SAMPL_datasets, transformers = dc.molnet.load_sampl(
valid_dataset = loader.featurize('../data/dw_acidic_unique_valid.csv')
test_dataset = loader.featurize('../data/dw_acidic_unique_test.csv')

transformers = [
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
# transform() returns a new dataset, so capture the transformed splits
datasets = []
for dataset in [train_dataset, valid_dataset, test_dataset]:
    for transformer in transformers:
        dataset = transformer.transform(dataset)
    datasets.append(dataset)
train_dataset, valid_dataset, test_dataset = datasets

# print('shape of the dataset')
# print(train_dataset.X.shape)
# print(train_dataset.X[0].get_atom_features())
# print(train_dataset.X[0].get_atom_features().shape)

model = GraphConvModel.load_from_dir('models')
model.restore()

train_scores = model.evaluate(
    train_dataset,
    [dc.metrics.Metric(dc.metrics.rms_score),
     dc.metrics.Metric(dc.metrics.r2_score),
     dc.metrics.Metric(dc.metrics.mae_score)])
print('train scores')
print(train_scores)

valid_scores = model.evaluate(
    valid_dataset,
    [dc.metrics.Metric(dc.metrics.rms_score),
     dc.metrics.Metric(dc.metrics.r2_score),
test_dataset = loader.featurize('../data/dw_acidic_unique_test.csv')

# splitter = dc.splits.RandomSplitter()
# train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset, seed=42)

transformers = [
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)
]
# transform() returns a new dataset, so capture the transformed splits
datasets = []
for dataset in [train_dataset, valid_dataset, test_dataset]:
    for transformer in transformers:
        dataset = transformer.transform(dataset)
    datasets.append(dataset)
train_dataset, valid_dataset, test_dataset = datasets

model = GraphConvModel(n_tasks=1,
                       mode='regression',
                       tensorboard=True,
                       model_dir='models/',
                       dropout=0.5,
                       graph_conv_layers=[64, 64])

# Hackish loop to track the validation loss while fitting with DeepChem;
# this is how overfitting is monitored here.
valid_loss = 10000000
while valid_loss > 50:
    # Fit the model for one more epoch on the training set
    model.fit(train_dataset, nb_epoch=1)
    # checkpoint_interval=0 keeps this call from saving a checkpoint;
    # fit() returns the average loss, used here as the validation loss.
    valid_loss = model.fit(valid_dataset, checkpoint_interval=0)
    print("valid loss: ", valid_loss)

# This restores the model to the last checkpoint from fitting on the train dataset
model.restore()
model.save()
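# An alternative sketch for tracking validation error without calling fit() on the
# validation set: evaluate an RMS metric after each epoch and stop once it has not
# improved for a few epochs. The patience value is an illustrative assumption, not
# a value from the original script.
rms = dc.metrics.Metric(dc.metrics.rms_score)
best_rms, patience = float('inf'), 0
while patience < 5:
    model.fit(train_dataset, nb_epoch=1)
    valid_rms = list(model.evaluate(valid_dataset, [rms]).values())[0]
    print("valid RMS:", valid_rms)
    if valid_rms < best_rms:
        best_rms, patience = valid_rms, 0
    else:
        patience += 1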
]

# Oversample the minority class, then store the result as a NumpyDataset
from deepchem.data.datasets import NumpyDataset  # import NumpyDataset

# fit_resample must run before the dataset is built from its output
X_oversampled, y_oversampled = ros.fit_resample(
    np.atleast_2d(X_embeddings[0]).T, labels)

dataset = NumpyDataset(np.squeeze(X_oversampled), y_oversampled)
splitter = dc.splits.splitters.RandomSplitter()
trainset, testset = splitter.train_test_split(dataset)

test_classifier = GraphConvModel(1,
                                 graph_conv_layers=[64, 64],
                                 dense_layer_size=128,
                                 dropout=0.5,
                                 model_dir='models',
                                 mode='classification',
                                 number_atom_features=75,
                                 n_classes=2,
                                 uncertainty=False,
                                 use_queue=False,
                                 tensorboard=True)
test_classifier.fit(trainset, nb_epoch=10)
dnn_preds = test_classifier.predict(testset)
break  # exits the enclosing loop this fragment runs inside

# hp = dc.molnet.preset_hyper_parameters
# param = hp.hps['graphconvreg']
# print(param['batch_size'])
# g = tf.Graph()
# graph_model = dc.nn.SequentialGraph(75)
# graph_model.add(dc.nn.GraphConv(int(param['n_filters']), 75, activation='relu'))
# graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
graph_featurizer = dc.feat.graph_features.ConvMolFeaturizer()
loader_train = dc.data.data_loader.CSVLoader(tasks=['ACTIVITY'],
                                             smiles_field="smiles",
                                             featurizer=graph_featurizer)
dataset_train = loader_train.featurize('./train.csv')

# In[3]:

loader_test = dc.data.data_loader.CSVLoader(tasks=['ACTIVITY'],
                                            smiles_field="smiles",
                                            featurizer=graph_featurizer)
dataset_test = loader_test.featurize('./test.csv')

# In[9]:

model = GraphConvModel(n_tasks=1, mode='regression', dropout=0.2)
model.fit(dataset_train, nb_epoch=1000)

# In[10]:

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print(model.evaluate(dataset_train, [metric]))
print(model.evaluate(dataset_test, [metric]))

# In[11]:

test_preds = model.predict(dataset_test)
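# A possible follow-up (an assumption, not part of the original notebook cells):
# write the test-set predictions alongside their compound IDs to a CSV for inspection.
import pandas as pd
pd.DataFrame({'id': dataset_test.ids,
              'y_true': dataset_test.y.flatten(),
              'y_pred': test_preds.flatten()}).to_csv('./test_predictions.csv', index=False)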