def test_graph_conv_model():
    """Train a classification GraphConvModel briefly and require ROC-AUC >= 0.9."""
    tasks, dataset, transformers, metric = get_dataset('classification',
                                                       'GraphConv')
    model = GraphConvModel(
        len(tasks), batch_size=10, batch_normalize=False, mode='classification')
    model.fit(dataset, nb_epoch=20)
    results = model.evaluate(dataset, [metric], transformers)
    assert results['mean-roc_auc_score'] >= 0.9
def test_graph_conv_regression_model():
    """Train a regression GraphConvModel and require MAE < 0.1 on the train set."""
    tasks, dataset, transformers, metric = get_dataset('regression', 'GraphConv')
    model = GraphConvModel(
        len(tasks), batch_size=10, batch_normalize=False, mode='regression')
    model.fit(dataset, nb_epoch=100)
    results = model.evaluate(dataset, [metric], transformers)
    assert results['mean_absolute_error'] < 0.1
def test_graph_conv_error_bars(self):
    """bayesian_predict returns mean/std arrays shaped (n_samples, n_tasks)."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv', num_tasks=1)
    model = GraphConvModel(len(tasks), batch_size=50, mode='regression')
    model.fit(dataset, nb_epoch=1)
    mu, sigma = model.bayesian_predict(
        dataset, transformers, untransform=True, n_passes=24)
    expected_shape = (len(dataset), len(tasks))
    assert mu.shape == expected_shape
    assert sigma.shape == expected_shape
def test_graph_conv_model(self):
    """Classification GraphConvModel should reach ROC-AUC >= 0.9 on the toy set."""
    # The original also imported numpy and TensorGraph here but never used
    # either; only GraphConvModel is actually needed.
    from deepchem.models import GraphConvModel
    tasks, dataset, transformers, metric = self.get_dataset(
        'classification', 'GraphConv')
    batch_size = 50
    model = GraphConvModel(len(tasks), batch_size=batch_size,
                           mode='classification')
    model.fit(dataset, nb_epoch=10)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean-roc_auc_score'] >= 0.9
def define_gc_regression_model(n_tasks,
                               graph_conv_sizes=(128, 128),
                               dense_size=256,
                               batch_size=128,
                               learning_rate=0.001,
                               config=default_config,
                               model_dir='/tmp'):
    """
    Build the multitask regression GCNN.

    :param n_tasks: number of output tasks
    :param graph_conv_sizes: output dimension of every graph-convolution layer
    :param dense_size: size of the dense layer
    :param batch_size: number of examples per minibatch
    :param learning_rate: initial learning rate
    :param config: GPU and memory usage options
    :param model_dir: where the trained model will be stored
    :return: a GraphConvModel object
    """
    # Gather fixed hyper-parameters with the caller-supplied ones, then
    # construct the model in a single call.
    settings = dict(
        n_tasks=n_tasks,
        graph_conv_layers=graph_conv_sizes,
        dense_layer_size=dense_size,
        dropout=0.0,
        mode='regression',
        number_atom_features=75,
        uncertainty=False,
        batch_size=batch_size,
        learning_rate=learning_rate,
        learning_rate_decay_time=1000,
        optimizer_type='adam',
        configproto=config,
        model_dir=model_dir)
    return GraphConvModel(**settings)
def test_neural_fingerprint_retrieval():
    """predict_embedding yields one fingerprint of width 2*dense_layer_size per sample."""
    tasks, dataset, transformers, metric = get_dataset('classification',
                                                       'GraphConv')
    fp_size = 3
    model = GraphConvModel(
        len(tasks), batch_size=50, dense_layer_size=3, mode='classification')
    model.fit(dataset, nb_epoch=1)
    embeddings = np.array(model.predict_embedding(dataset))[:len(dataset)]
    assert (len(dataset), fp_size * 2) == embeddings.shape
def test_change_loss_function(self):
    """A reloaded TensorGraph accepts a new L2 submodel loss and keeps training."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv', num_tasks=1)
    model = GraphConvModel(len(tasks), batch_size=50, mode='regression')
    model.fit(dataset, nb_epoch=1)
    model.save()
    reloaded = TensorGraph.load_from_dir(model.model_dir, restore=False)
    label_layer = reloaded.labels[-1]
    output_layer = reloaded.outputs[-1]
    new_loss = ReduceSum(L2Loss(in_layers=[label_layer, output_layer]))
    submodel = reloaded.create_submodel(loss=new_loss)
    reloaded.restore()
    reloaded.fit(dataset, nb_epoch=1, submodel=submodel)
def test_graph_conv_atom_features(self):
    """Custom per-atom features survive a save/reload cycle.

    Attaches a random property to every atom, featurizes with that property,
    trains briefly, then checks that a model reloaded from disk reproduces
    the original predictions.
    """
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'Raw', num_tasks=1)
    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
        atom_features = []
        for atom in mol.GetAtoms():
            val = np.random.normal()
            mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                        str(val))
            atom_features.append(np.random.normal())
        y.append([np.sum(atom_features)])
    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=batch_size,
        mode='regression')
    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
    model.save()
    model2 = TensorGraph.load_from_dir(model.model_dir)
    y_pred2 = model2.predict(dataset)
    # Use allclose rather than exact equality: float predictions after a
    # save/restore round trip are not guaranteed bit-identical (the sibling
    # version of this test already uses allclose).
    self.assertTrue(np.allclose(y_pred1, y_pred2))
def test_neural_fingerprint_retrieval(self):
    """The neural fingerprint output has shape (n_samples, 2 * dense_size)."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'classification', 'GraphConv')
    fp_size = 3
    model = GraphConvModel(
        len(tasks), batch_size=50, dense_layer_size=3, mode='classification')
    model.fit(dataset, nb_epoch=1)
    raw = model.predict(dataset, outputs=model.neural_fingerprint)
    fingerprints = np.array(raw)[:len(dataset)]
    self.assertEqual((len(dataset), fp_size * 2), fingerprints.shape)
def test_neural_fingerprint_retrieval(self):
    """Fingerprints retrieved via predict(outputs=...) match the expected shape."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'classification', 'GraphConv')
    dense_size = 3
    model = GraphConvModel(len(tasks),
                           batch_size=50,
                           dense_layer_size=dense_size,
                           mode='classification')
    model.fit(dataset, nb_epoch=1)
    embeddings = model.predict(dataset, outputs=model.neural_fingerprint)
    embeddings = np.array(embeddings)[:len(dataset)]
    self.assertEqual((len(dataset), dense_size * 2), embeddings.shape)
def test_graph_conv_regression_model(self):
    """A regression GraphConvModel evaluates both before and after a reload."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv')
    model = GraphConvModel(len(tasks), batch_size=50, mode='regression')
    model.fit(dataset, nb_epoch=1)
    scores = model.evaluate(dataset, [metric], transformers)
    model.save()
    model = TensorGraph.load_from_dir(model.model_dir)
    scores = model.evaluate(dataset, [metric], transformers)
def test_graph_conv_regression_uncertainty(self):
    """Uncertainty estimates are smaller than the signal yet comparable to the error."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv')
    model = GraphConvModel(len(tasks),
                           batch_size=50,
                           mode='regression',
                           dropout=0.1,
                           uncertainty=True)
    model.fit(dataset, nb_epoch=100)
    # Predict values along with their estimated uncertainty.
    predictions, stds = model.predict_uncertainty(dataset)
    avg_error = np.mean(np.abs(dataset.y - predictions))
    avg_signal = np.mean(np.abs(dataset.y))
    avg_std = np.mean(stds)
    assert avg_error < 0.5 * avg_signal
    assert avg_std > 0.5 * avg_error
    assert avg_std < avg_signal
def test_graph_conv_atom_features(self):
    """Custom atom properties flow through featurization, training, and reload."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'Raw', num_tasks=1)
    prop_name = 'feature'
    labels = []
    for mol in dataset.X:
        contributions = []
        for atom in mol.GetAtoms():
            val = np.random.normal()
            mol.SetProp("atom %08d %s" % (atom.GetIdx(), prop_name), str(val))
            contributions.append(np.random.normal())
        labels.append([np.sum(contributions)])
    featurizer = ConvMolFeaturizer(atom_properties=[prop_name])
    conv_mols = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(conv_mols, np.array(labels))
    model = GraphConvModel(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=50,
        mode='regression')
    model.fit(dataset, nb_epoch=1)
    before = model.predict(dataset)
    model.save()
    restored = TensorGraph.load_from_dir(model.model_dir)
    after = restored.predict(dataset)
    self.assertTrue(np.allclose(before, after))
def test_graph_conv_model():
    """End-to-end screening workflow: featurize a CSV, train a classification
    GraphConvModel, plot per-class prediction distributions, and print
    Matthews-correlation and ROC-AUC scores.

    NOTE(review): despite the test_ name, this routine performs disk I/O
    (CSV featurization) and seaborn plotting side effects.
    """
    batch_size = 2000
    model = GraphConvModel(1,
                           batch_size=batch_size,
                           mode="classification",
                           model_dir="/tmp/covid/model_dir")
    dataset_file = "covid_mpro_combined_data_sources.csv"
    tasks = ["isHit"]
    featurizer = dc.feat.ConvMolFeaturizer()
    loader = dc.data.CSVLoader(tasks=tasks,
                               smiles_field="SMILES",
                               featurizer=featurizer)
    dataset = loader.featurize(dataset_file, shard_size=8192)
    metrics = [
        dc.metrics.Metric(dc.metrics.matthews_corrcoef, np.mean,
                          mode="classification")
    ]
    splitter = dc.splits.RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset)
    model.fit(train_dataset)
    # Flatten each per-sample probability pair into (neg, pos) columns.
    pred = [x.flatten() for x in model.predict(valid_dataset)]
    pred_df = pd.DataFrame(pred, columns=["neg", "pos"])
    pred_df["active"] = [int(x) for x in valid_dataset.y]
    pred_df["SMILES"] = valid_dataset.ids
    # Visual sanity check: positive-class probability grouped by true label.
    sns.boxplot(pred_df.active, pred_df.pos)
    print(model.evaluate(train_dataset, metrics))
    print(model.evaluate(test_dataset, metrics))
    # Re-score with ROC-AUC in place of Matthews correlation.
    metrics = [
        dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean,
                          mode="classification")
    ]
    print(model.evaluate(train_dataset, metrics))
    print(model.evaluate(test_dataset, metrics))
def test_graph_conv_regression_uncertainty(self):
    """Sanity-check predict_uncertainty: error, signal, and std are well ordered."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv')
    model = GraphConvModel(
        len(tasks),
        batch_size=50,
        mode='regression',
        dropout=0.1,
        uncertainty=True)
    model.fit(dataset, nb_epoch=100)
    # Obtain predictions plus their per-sample uncertainty estimates.
    values, uncertainties = model.predict_uncertainty(dataset)
    err = np.mean(np.abs(dataset.y - values))
    signal = np.mean(np.abs(dataset.y))
    spread = np.mean(uncertainties)
    assert err < 0.5 * signal
    assert spread > 0.5 * err
    assert spread < signal
def test_graph_conv_regression_model(self):
    """Regression model trains to MAE < 0.1 and reproduces scores after reload."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv')
    model = GraphConvModel(len(tasks), batch_size=50, mode='regression')
    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    for task_mae in scores['mean_absolute_error']:
        assert task_mae < 0.1
    model.save()
    model = TensorGraph.load_from_dir(model.model_dir)
    reloaded_scores = model.evaluate(dataset, [metric], transformers)
    assert np.allclose(scores['mean_absolute_error'],
                       reloaded_scores['mean_absolute_error'])
def test_graph_conv_model(self):
    """Classification model hits ROC-AUC >= 0.9 and reloads with equal scores."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'classification', 'GraphConv')
    model = GraphConvModel(len(tasks), batch_size=50, mode='classification')
    model.fit(dataset, nb_epoch=10)
    before = model.evaluate(dataset, [metric], transformers)
    assert before['mean-roc_auc_score'] >= 0.9
    model.save()
    model = TensorGraph.load_from_dir(model.model_dir)
    after = model.evaluate(dataset, [metric], transformers)
    assert np.allclose(before['mean-roc_auc_score'],
                       after['mean-roc_auc_score'])
def test_graph_conv_model(self):
    """Train, score, save, reload — the restored model must score identically."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'classification', 'GraphConv')
    batch_size = 50
    model = GraphConvModel(len(tasks),
                           batch_size=batch_size,
                           mode='classification')
    model.fit(dataset, nb_epoch=10)
    scores_orig = model.evaluate(dataset, [metric], transformers)
    assert scores_orig['mean-roc_auc_score'] >= 0.9
    model.save()
    model = TensorGraph.load_from_dir(model.model_dir)
    scores_restored = model.evaluate(dataset, [metric], transformers)
    assert np.allclose(scores_orig['mean-roc_auc_score'],
                       scores_restored['mean-roc_auc_score'])
def test_graph_conv_regression_model(self):
    """Per-task MAE stays under 0.1; reload reproduces scores within rtol=1e-4."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv')
    model = GraphConvModel(len(tasks), batch_size=50, mode='regression')
    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    for err in scores['mean_absolute_error']:
        assert err < 0.1
    model.save()
    model = TensorGraph.load_from_dir(model.model_dir)
    scores_after = model.evaluate(dataset, [metric], transformers)
    assert np.allclose(scores['mean_absolute_error'],
                       scores_after['mean_absolute_error'],
                       rtol=1e-4)
def test_graph_conv_model_no_task():
    """A fitted classification model can predict on a dataset with no labels.

    Downloads the BACE csv and featurizes it with an empty task list so the
    resulting dataset carries no y; predict() must still succeed.

    NOTE(review): performs a network download — not an isolated unit test.
    """
    tasks, dataset, _, __ = get_dataset('classification', 'GraphConv')
    batch_size = 10
    model = GraphConvModel(len(tasks),
                           batch_size=batch_size,
                           batch_normalize=False,
                           mode='classification')
    model.fit(dataset, nb_epoch=20)
    # predict datset with no y (ensured by tasks = [])
    bace_url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv"
    dc.utils.data_utils.download_url(url=bace_url, name="bace_tmp.csv")
    loader = dc.data.CSVLoader(tasks=[],
                               smiles_field='mol',
                               featurizer=dc.feat.ConvMolFeaturizer())
    td = loader.featurize(
        os.path.join(dc.utils.data_utils.get_data_dir(), "bace_tmp.csv"))
    model.predict(td)
def test_change_loss_function(self):
    """After save/load, attach an L2 submodel loss and continue training."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv', num_tasks=1)
    batch_size = 50
    original = GraphConvModel(len(tasks), batch_size=batch_size,
                              mode='regression')
    original.fit(dataset, nb_epoch=1)
    original.save()
    restored = TensorGraph.load_from_dir(original.model_dir, restore=False)
    # Wire an explicit sum-of-squares loss between the last label/output pair.
    loss = ReduceSum(
        L2Loss(in_layers=[restored.labels[-1], restored.outputs[-1]]))
    submodel = restored.create_submodel(loss=loss)
    restored.restore()
    restored.fit(dataset, nb_epoch=1, submodel=submodel)
np.mean, mode="classification") ] training_score_list = [] validation_score_list = [] transformers = [] model.fit(dataset) print(model.evaluate(dataset, metrics)) return model #model = generate_graph_conv_model() model = GraphConvModel(1, batch_size=128, mode="classification", model_dir="/tmp/mk01/model_dir") model.restore() #make predictions featurizer = dc.feat.ConvMolFeaturizer() df = pd.read_csv("zinc_100k.txt", sep=" ", delimiter=' ', header=None) df.columns = ["SMILES", "Name"] rows, cols = df.shape df["Val"] = [ 0 ] * rows #just add add a dummy column to keep the featurizer happy infile_name = "zinc_filtered.csv" df.to_csv(infile_name, index=False) loader = dc.data.CSVLoader(tasks=['Val'], smiles_field="SMILES",
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split( dataset_train, frac_train=0.8, frac_valid=0.2, frac_test=0.0, seed=0) ####Transorm them train_dataset = transformers_train.transform(train_dataset) valid_dataset = transformers_train.transform(valid_dataset) test1_dataset = transformers_test1.transform(dataset_test1) test2_dataset = transformers_test2.transform(dataset_test2) ####### model_dir = "./tf_chp_hp" model = GraphConvModel(n_tasks=1, batch_size=32, mode='regression', dropout=0.0, dense_layer_size=256, learning_rate=0.005, model_dir=model_dir, random_seed=0) metric = dc.metrics.Metric(dc.metrics.r2_score, mode='regression') ckpt = tf.train.Checkpoint(step=tf.Variable(1)) manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=20) start_time = time.time() num_epochs = 100 losses_train = [] score_valid = [] score_train = []
""" MODEL BUILDING """ # Fit metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean) # Do setup required for tf/keras models #n_feat = 1000 # Number of features on conv-mols batch_size = 1000 # Batch size of models nb_epoch = 1000 # Number of epochs for convergence model = GraphConvModel( 1, batch_size=batch_size, mode='regression', dropout=0.2, tensorboard=True, model_dir= "/home/rod/Dropbox/Quimica/Analysis/ANalisis/Borradores/GraphConvModel/" ) #To prevent overfitting # Fit trained model model.fit(train_dataset, nb_epoch=nb_epoch) model.save() print("Evaluating model") train_scores = model.evaluate(train_dataset, [metric], transformers) valid_scores = model.evaluate(valid_dataset, [metric], transformers) print("Train scores") print(train_scores)
# Score the current model with ROC-AUC on the train/test splits.
metrics = [
    dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")
]
print(model.evaluate(train_dataset, metrics))
print(model.evaluate(test_dataset, metrics))
#test_graph_conv_model()
#train the model
batch_size = 2000
model = GraphConvModel(1,
                       batch_size=batch_size,
                       mode="classification",
                       model_dir="/tmp/covid/model_dir")
dataset_file = "covid_mpro_combined_data_sources.csv"
tasks = ["isHit"]
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(tasks=tasks,
                           smiles_field="SMILES",
                           featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
model.fit(dataset)
#model = GraphConvModel(1, batch_size=128,mode="classification",model_dir="/tmp/mk01/model_dir")
#model.restore()
#make predictions
featurizer = dc.feat.ConvMolFeaturizer()
from deepchem.molnet import load_delaney # Load Delaney dataset delaney_tasks, delaney_datasets, transformers = load_delaney( featurizer='GraphConv', split='index') train_dataset, valid_dataset, test_dataset = delaney_datasets # Fit models metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean) # Do setup required for tf/keras models # Number of features on conv-mols n_feat = 75 # Batch size of models batch_size = 128 model = GraphConvModel( len(delaney_tasks), batch_size=batch_size, mode='regression') # Fit trained model model.fit(train_dataset, nb_epoch=20) print("Evaluating model") train_scores = model.evaluate(train_dataset, [metric], transformers) valid_scores = model.evaluate(valid_dataset, [metric], transformers) print("Train scores") print(train_scores) print("Validation scores") print(valid_scores)
# Load HOPV dataset hopv_tasks, hopv_datasets, transformers = load_hopv(featurizer='GraphConv') train_dataset, valid_dataset, test_dataset = hopv_datasets # Fit models metric = [ dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean, mode="regression"), dc.metrics.Metric( dc.metrics.mean_absolute_error, np.mean, mode="regression") ] # Number of features on conv-mols n_feat = 75 # Batch size of models batch_size = 50 model = GraphConvModel( len(hopv_tasks), batch_size=batch_size, mode='regression') # Fit trained model model.fit(train_dataset, nb_epoch=25) print("Evaluating model") train_scores = model.evaluate(train_dataset, metric, transformers) valid_scores = model.evaluate(valid_dataset, metric, transformers) print("Train scores") print(train_scores) print("Validation scores") print(valid_scores)
smiles = ['C1CCCCC1', 'O1CCOCC1'] # cyclohexane and dioxane mols = [Chem.MolFromSmiles(smile) for smile in smiles] feat = dc.feat.CircularFingerprint(size=1024) arr = feat.featurize(mols) print(arr) feat = dc.feat.RDKitDescriptors() arr = feat.featurize(mols) print(arr) tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='GraphConv') train_dataset, valid_dataset, test_dataset = datasets model = GraphConvModel(n_tasks=1, mode='regression', dropout=0.2) model.fit(train_dataset, nb_epoch=100) metric = dc.metrics.Metric(dc.metrics.pearson_r2_score) print(model.evaluate(train_dataset, [metric], transformers)) print(model.evaluate(test_dataset, [metric], transformers)) smiles = ['COC(C)(C)CCCC(C)CC=CC(C)=CC(=O)OC(C)C', 'CCOC(=O)CC', 'CSc1nc(NC(C)C)nc(NC(C)C)n1', 'CC(C#C)N(C)C(=O)Nc1ccc(Cl)cc1', 'Cc1cc2ccccc2cc1C'] from rdkit import Chem mols = [Chem.MolFromSmiles(s) for s in smiles] featurizer = dc.feat.ConvMolFeaturizer()
def generate_graph_conv_model():
    """Build a single-task regression GraphConvModel with batch size 128."""
    return GraphConvModel(1, batch_size=128, mode='regression')
# Random split of the featurized dataset into train/valid/test.
splitter = dc.splits.RandomSplitter()
train, valid, test = splitter.train_valid_test_split(dataset)

# In[4]:

# Fit models
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
# Number of features
n_feat = 75
# Batch size of models
batch_size = 128
model = GraphConvModel(len(delaney_tasks),
                       batch_size=batch_size,
                       mode='regression',
                       dropout=0.2)

# In[5]:

# Fit trained model
model.fit(train, nb_epoch=100)
print("Evaluating model")
train_scores = model.evaluate(train, [metric], transformers)
valid_scores = model.evaluate(valid, [metric], transformers)
print("Train scores")
print(train_scores)
print("Validation scores")
from dataset_functions import to_dataframe, from_dataframe # select the featurizer type to be ConvMolFeaturizer graph_featurizer = dc.feat.graph_features.ConvMolFeaturizer() loader = dc.data.data_loader.CSVLoader(tasks=['gap'], smiles_field="smiles", id_field="ID", featurizer=graph_featurizer) metric = [dc.metrics.Metric(dc.metrics.mae_score, np.mean)] model = GraphConvModel( n_tasks=1, graph_conv_layers=[128, 128], dense_layer_size=512, dropout=0.0001, # dropout must be included in every layer for uncertainty mode='regression', uncertainty=True, learning_rate=0.001, batch_size=8) csv_list = ["set1.csv", "set2.csv", "set3.csv", "set4.csv"] seeds = [5, 10, 12, 18] # for generic train test split loads for initial run def load_and_split(csv, seed=None): # load csv for training and test dataset = loader.featurize(csv) # transform data here
# Load ClinTox dataset with a random split.
clintox_tasks, clintox_datasets, transformers = load_clintox(
    featurizer='GraphConv', split='random')
train_dataset, valid_dataset, test_dataset = clintox_datasets

# Fit models
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean,
                           mode="classification")

# Do setup required for tf/keras models
# Number of features on conv-mols
n_feat = 75
# Batch size of models
batch_size = 50
model = GraphConvModel(len(clintox_tasks),
                       batch_size=batch_size,
                       mode='classification')

# Fit trained model
model.fit(train_dataset, nb_epoch=10)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Train scores")
print(train_scores)
print("Validation scores")
print(valid_scores)
def finetune_dest_model(model_dir="models", source_model=None, csv="input_data.csv", include_top=False, num_epochs=100): dest_model = GraphConvModel(n_tasks=1, graph_conv_layers=[128, 128], dense_layer_size=512, dropout=0, mode='regression', learning_rate=0.001, batch_size=8, model_dir=model_dir) dest_model.load_from_pretrained(source_model=source_model, assignment_map=None, value_map=None, include_top=include_top) train_set, valid_set, transformers = load_data(csv) tune_layers_index = [] all_layers = dest_model.model.layers for layer in all_layers: ind = all_layers.index(layer) namelist = layer.name.split("_") if "batch" and "normalization" in namelist: tune_layers_index.append(ind) elif "graph" and "conv" in namelist: tune_layers_index.append(ind) elif "dense" in namelist: tune_layers_index.append(ind) all_model_losses = [] all_model_metrics = [] all_model_final_metrics = [] all_model_plots = [] all_models = [] for freeze_till in tune_layers_index: print("-----------------------------") # iterate through all possible index to freeze until and fit model for each iteration print( f"RUNNING ITERATION {tune_layers_index.index(freeze_till)} / {len(tune_layers_index)}" ) for layer in all_layers: if all_layers.index(layer) < freeze_till: layer.trainable = False else: layer.trainable = True print(f"Froze layers till {dest_model.model.layers[freeze_till - 1]}") print( f"Training layers starting from {dest_model.model.layers[freeze_till]}" ) print( f"Trainable layers: {len([layer for layer in dest_model.model.layers if layer.trainable])} - {[layer for layer in dest_model.model.layers if layer.trainable]}" ) print("-----------------------------") current_losses, final_metrics, current_metrics, plt = fit_with_metrics( dest_model, num_epochs=num_epochs, train_set=train_set, valid_set=valid_set, transformers=transformers) all_model_losses.append(current_losses) all_model_final_metrics.append(final_metrics) all_model_metrics.append(current_metrics) all_model_plots.append(plt) 
all_models.append(dest_model) print("Fitting completed!") print(f"Final metrics for this model: {current_metrics}") return all_models, all_model_losses, all_model_final_metrics, all_model_metrics, all_model_plots
def run_the_inference(input_file,
                      model_dir,
                      output_file=None,
                      tempdir=None,
                      gpu=None):
    """
    Run inference with a saved GraphConvModel and optionally write a csv.

    :param input_file: file in .smi format (smiles, tab separation, molecule_id)
    :param model_dir: path to the saved model directory used for inference
        (the original docstring named a nonexistent ``checkpoint_file`` param)
    :param output_file: where to store the predictions (csv format). If None,
        no output file is written
    :param tempdir: where the temporary directories created by DeepChem will
        be stored
    :param gpu: which GPU to use. If None, only CPU will be used
    :return: (molids, smis, ypred) — predictions are back-transformed
    """
    import os
    if gpu is None:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''  # CPU-only inference
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = '%i' % gpu
    # 1. Load the model
    model = GraphConvModel.load_from_dir(model_dir, restore=True)
    n_tasks = model.n_tasks
    # 2. Prepare input data
    molids, smis, input_dset = input_smi_to_csv(input_file, tempdir, n_tasks)
    # BUG FIX: molid2smi was referenced below but never defined; build the
    # lookup from the parallel molids/smis lists.
    molid2smi = dict(zip([str(mid) for mid in molids], smis))
    # 3. Run the prediction
    print('Running the prediction')
    second_tempdir = op.join(tempfile.mkdtemp(dir=tempdir), 'todel')
    ypred, molids_processed = easy_predict(input_dset,
                                           model,
                                           n_tasks,
                                           second_tempdir,
                                           smiles_field='smiles',
                                           id_field='molid')
    molids_processed = [str(mid) for mid in molids_processed]
    # 4. Post-process the predictions (remove the z-scaling)
    ypred = post_process_predictions(ypred, model_dir)
    # 5. Write down the csv file with the predictions
    if output_file is not None:
        ensure_dir_from_file(output_file)
        print('Writing the predictions on file')
        with open(output_file, 'w') as writer:
            # header
            writer.write(','.join(['CompoundID', 'Canonical Smiles'] +
                                  ['task_%i' % i for i in range(n_tasks)]))
            writer.write('\n')
            # content
            if len(molids_processed) == len(molids):
                for mid, smi, preds in zip(molids, smis, ypred):
                    writer.write(','.join([str(mid), smi]))
                    writer.write(',')
                    writer.write(','.join([str(p) for p in preds]))
                    writer.write('\n')
            else:
                # then we have to use the list of molids that were processed.
                # BUG FIX: the original message literal was broken across a
                # physical line (a syntax error); rejoined here.
                print('Not all compounds could be predicted. '
                      'Problematic CompoundIDs:')
                print(set(molids).difference(set(molids_processed)))
                for mid, preds in zip(molids_processed, ypred):
                    writer.write(','.join([str(mid), molid2smi[str(mid)]]))
                    writer.write(',')
                    writer.write(','.join([str(p) for p in preds]))
                    writer.write('\n')
    # 6. Delete temp files
    print('Deleting temporary files...')
    shutil.rmtree(op.dirname(input_dset))
    shutil.rmtree(op.dirname(second_tempdir))
    return molids, smis, ypred
# splitters = { # 'index': dc.splits.IndexSplitter(), # 'random': dc.splits.RandomSplitter(), # 'scaffold': dc.splits.ScaffoldSplitter() # } splitter = dc.splits.ScaffoldSplitter() train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split( dataset) train_dataset.load_metadata print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX") # Load Delaney dataset # delaney_tasks, delaney_datasets, transformers = load_delaney( # featurizer='GraphConv', split='index') # train_dataset, valid_dataset, test_dataset = dataset # Fit models metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean) # Do setup required for tf/keras models # Number of features on conv-mols n_feat = 250 # Batch size of models batch_size = 128 model = GraphConvModel(len(delaney_tasks), batch_size=batch_size, mode='regression') print("this is the end of the output")
train_dataset, valid_dataset, test_dataset = hopv_datasets # Fit models metric = [ dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean, mode="regression"), dc.metrics.Metric(dc.metrics.mean_absolute_error, np.mean, mode="regression") ] # Number of features on conv-mols n_feat = 75 # Batch size of models batch_size = 50 model = GraphConvModel(len(hopv_tasks), batch_size=batch_size, mode='regression') # Fit trained model model.fit(train_dataset, nb_epoch=25) print("Evaluating model") train_scores = model.evaluate(train_dataset, metric, transformers) valid_scores = model.evaluate(valid_dataset, metric, transformers) print("Train scores") print(train_scores) print("Validation scores") print(valid_scores)