Example #1
def test_graph_conv_model():
    tasks, dataset, transformers, metric = get_dataset('classification',
                                                       'GraphConv')

    batch_size = 10
    model = GraphConvModel(len(tasks),
                           batch_size=batch_size,
                           batch_normalize=False,
                           mode='classification')

    model.fit(dataset, nb_epoch=20)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean-roc_auc_score'] >= 0.9
Example #2
def test_graph_conv_regression_model():
    tasks, dataset, transformers, metric = get_dataset('regression',
                                                       'GraphConv')

    batch_size = 10
    model = GraphConvModel(len(tasks),
                           batch_size=batch_size,
                           batch_normalize=False,
                           mode='regression')

    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean_absolute_error'] < 0.1
Example #3
  def test_graph_conv_error_bars(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv', num_tasks=1)

    batch_size = 50
    model = GraphConvModel(len(tasks), batch_size=batch_size, mode='regression')

    model.fit(dataset, nb_epoch=1)

    mu, sigma = model.bayesian_predict(
        dataset, transformers, untransform=True, n_passes=24)
    assert mu.shape == (len(dataset), len(tasks))
    assert sigma.shape == (len(dataset), len(tasks))
Example #4
    def test_graph_conv_model(self):
        from deepchem.models import GraphConvModel, TensorGraph
        import numpy as np
        tasks, dataset, transformers, metric = self.get_dataset(
            'classification', 'GraphConv')

        batch_size = 50
        model = GraphConvModel(len(tasks),
                               batch_size=batch_size,
                               mode='classification')

        model.fit(dataset, nb_epoch=10)
        scores = model.evaluate(dataset, [metric], transformers)
        assert scores['mean-roc_auc_score'] >= 0.9
Example #5
def define_gc_regression_model(n_tasks,
                               graph_conv_sizes=(128, 128),
                               dense_size=256,
                               batch_size=128,
                               learning_rate=0.001,
                               config=default_config,
                               model_dir='/tmp'):
    """
    Initializes the multitask regression GCNN
    :param n_tasks: number of output tasks
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: size of the dense layer
    :param batch_size: number of examples per minibatch
    :param learning_rate: initial learning rate
    :param config: GPU and memory usage options
    :param model_dir: where the trained model will be stored
    :return: a GraphConvModel object
    """

    return GraphConvModel(n_tasks=n_tasks,
                          graph_conv_layers=graph_conv_sizes,
                          dense_layer_size=dense_size,
                          dropout=0.0,
                          mode='regression',
                          number_atom_features=75,
                          uncertainty=False,
                          batch_size=batch_size,
                          learning_rate=learning_rate,
                          learning_rate_decay_time=1000,
                          optimizer_type='adam',
                          configproto=config,
                          model_dir=model_dir)
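A minimal usage sketch for the helper above, assuming the same TensorGraph-era DeepChem API that accepts configproto/optimizer_type; Delaney is only a stand-in dataset and the paths are placeholders.

import deepchem as dc

# GraphConv-featurized stand-in dataset for illustration.
tasks, (train, valid, test), transformers = dc.molnet.load_delaney(
    featurizer='GraphConv')

model = define_gc_regression_model(n_tasks=len(tasks),
                                   graph_conv_sizes=(128, 128),
                                   dense_size=256,
                                   batch_size=128,
                                   config=None,  # no custom session config
                                   model_dir='/tmp/gc_regression')
model.fit(train, nb_epoch=10)

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print(model.evaluate(valid, [metric], transformers))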
Example #6
def test_neural_fingerprint_retrieval():
    tasks, dataset, transformers, metric = get_dataset('classification',
                                                       'GraphConv')

    fp_size = 3

    batch_size = 50
    model = GraphConvModel(len(tasks),
                           batch_size=batch_size,
                           dense_layer_size=3,
                           mode='classification')

    model.fit(dataset, nb_epoch=1)
    neural_fingerprints = model.predict_embedding(dataset)
    neural_fingerprints = np.array(neural_fingerprints)[:len(dataset)]
    assert (len(dataset), fp_size * 2) == neural_fingerprints.shape
Example #7
  def test_change_loss_function(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv', num_tasks=1)

    batch_size = 50
    model = GraphConvModel(len(tasks), batch_size=batch_size, mode='regression')

    model.fit(dataset, nb_epoch=1)
    model.save()

    model2 = TensorGraph.load_from_dir(model.model_dir, restore=False)
    dummy_label = model2.labels[-1]
    dummy_output = model2.outputs[-1]
    loss = ReduceSum(L2Loss(in_layers=[dummy_label, dummy_output]))
    module = model2.create_submodel(loss=loss)
    model2.restore()
    model2.fit(dataset, nb_epoch=1, submodel=module)
Example #8
  def test_graph_conv_atom_features(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'Raw', num_tasks=1)

    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
      atom_features = []
      for atom in mol.GetAtoms():
        val = np.random.normal()
        mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                    str(val))
        atom_features.append(np.random.normal())
      y.append([np.sum(atom_features)])

    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=batch_size,
        mode='regression')

    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
    model.save()

    model2 = TensorGraph.load_from_dir(model.model_dir)
    y_pred2 = model2.predict(dataset)
    self.assertTrue(np.all(y_pred1 == y_pred2))
Example #9
    def test_neural_fingerprint_retrieval(self):
        tasks, dataset, transformers, metric = self.get_dataset(
            'classification', 'GraphConv')

        fp_size = 3

        batch_size = 50
        model = GraphConvModel(len(tasks),
                               batch_size=batch_size,
                               dense_layer_size=3,
                               mode='classification')

        model.fit(dataset, nb_epoch=1)
        neural_fingerprints = model.predict(dataset,
                                            outputs=model.neural_fingerprint)
        neural_fingerprints = np.array(neural_fingerprints)[:len(dataset)]
        self.assertEqual((len(dataset), fp_size * 2),
                         neural_fingerprints.shape)
Example #10
  def test_neural_fingerprint_retrieval(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'classification', 'GraphConv')

    fp_size = 3

    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        batch_size=batch_size,
        dense_layer_size=3,
        mode='classification')

    model.fit(dataset, nb_epoch=1)
    neural_fingerprints = model.predict(
        dataset, outputs=model.neural_fingerprint)
    neural_fingerprints = np.array(neural_fingerprints)[:len(dataset)]
    self.assertEqual((len(dataset), fp_size * 2), neural_fingerprints.shape)
Example #11
  def test_graph_conv_regression_model(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv')

    batch_size = 50
    model = GraphConvModel(len(tasks), batch_size=batch_size, mode='regression')

    model.fit(dataset, nb_epoch=1)
    scores = model.evaluate(dataset, [metric], transformers)

    model.save()
    model = TensorGraph.load_from_dir(model.model_dir)
    scores = model.evaluate(dataset, [metric], transformers)
Example #12
    def test_graph_conv_regression_uncertainty(self):
        tasks, dataset, transformers, metric = self.get_dataset(
            'regression', 'GraphConv')

        batch_size = 50
        model = GraphConvModel(len(tasks),
                               batch_size=batch_size,
                               mode='regression',
                               dropout=0.1,
                               uncertainty=True)

        model.fit(dataset, nb_epoch=100)

        # Predict the output and uncertainty.
        pred, std = model.predict_uncertainty(dataset)
        mean_error = np.mean(np.abs(dataset.y - pred))
        mean_value = np.mean(np.abs(dataset.y))
        mean_std = np.mean(std)
        assert mean_error < 0.5 * mean_value
        assert mean_std > 0.5 * mean_error
        assert mean_std < mean_value
Example #13
  def test_graph_conv_atom_features(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'Raw', num_tasks=1)

    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
      atom_features = []
      for atom in mol.GetAtoms():
        val = np.random.normal()
        mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                    str(val))
        atom_features.append(np.random.normal())
      y.append([np.sum(atom_features)])

    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=batch_size,
        mode='regression')

    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
    model.save()

    model2 = TensorGraph.load_from_dir(model.model_dir)
    y_pred2 = model2.predict(dataset)
    self.assertTrue(np.allclose(y_pred1, y_pred2))
Example #14
def test_graph_conv_model():
    batch_size = 2000
    model = GraphConvModel(1,
                           batch_size=batch_size,
                           mode="classification",
                           model_dir="/tmp/covid/model_dir")
    dataset_file = "covid_mpro_combined_data_sources.csv"
    tasks = ["isHit"]
    featurizer = dc.feat.ConvMolFeaturizer()
    loader = dc.data.CSVLoader(tasks=tasks,
                               smiles_field="SMILES",
                               featurizer=featurizer)
    dataset = loader.featurize(dataset_file, shard_size=8192)

    metrics = [
        dc.metrics.Metric(dc.metrics.matthews_corrcoef,
                          np.mean,
                          mode="classification")
    ]

    splitter = dc.splits.RandomSplitter()

    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset)

    model.fit(train_dataset)

    pred = [x.flatten() for x in model.predict(valid_dataset)]
    pred_df = pd.DataFrame(pred, columns=["neg", "pos"])
    pred_df["active"] = [int(x) for x in valid_dataset.y]
    pred_df["SMILES"] = valid_dataset.ids

    sns.boxplot(pred_df.active, pred_df.pos)

    print(model.evaluate(train_dataset, metrics))
    print(model.evaluate(test_dataset, metrics))

    metrics = [
        dc.metrics.Metric(dc.metrics.roc_auc_score,
                          np.mean,
                          mode="classification")
    ]
    print(model.evaluate(train_dataset, metrics))
    print(model.evaluate(test_dataset, metrics))
Example #15
  def test_graph_conv_regression_uncertainty(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv')

    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        batch_size=batch_size,
        mode='regression',
        dropout=0.1,
        uncertainty=True)

    model.fit(dataset, nb_epoch=100)

    # Predict the output and uncertainty.
    pred, std = model.predict_uncertainty(dataset)
    mean_error = np.mean(np.abs(dataset.y - pred))
    mean_value = np.mean(np.abs(dataset.y))
    mean_std = np.mean(std)
    assert mean_error < 0.5 * mean_value
    assert mean_std > 0.5 * mean_error
    assert mean_std < mean_value
Example #16
  def test_graph_conv_regression_model(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv')

    batch_size = 50
    model = GraphConvModel(len(tasks), batch_size=batch_size, mode='regression')

    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    assert all(s < 0.1 for s in scores['mean_absolute_error'])

    model.save()
    model = TensorGraph.load_from_dir(model.model_dir)
    scores2 = model.evaluate(dataset, [metric], transformers)
    assert np.allclose(scores['mean_absolute_error'],
                       scores2['mean_absolute_error'])
Example #17
  def test_graph_conv_model(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'classification', 'GraphConv')

    batch_size = 50
    model = GraphConvModel(
        len(tasks), batch_size=batch_size, mode='classification')

    model.fit(dataset, nb_epoch=10)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean-roc_auc_score'] >= 0.9

    model.save()
    model = TensorGraph.load_from_dir(model.model_dir)
    scores2 = model.evaluate(dataset, [metric], transformers)
    assert np.allclose(scores['mean-roc_auc_score'],
                       scores2['mean-roc_auc_score'])
Example #18
  def test_graph_conv_model(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'classification', 'GraphConv')

    batch_size = 50
    model = GraphConvModel(
        len(tasks), batch_size=batch_size, mode='classification')

    model.fit(dataset, nb_epoch=10)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean-roc_auc_score'] >= 0.9

    model.save()
    model = TensorGraph.load_from_dir(model.model_dir)
    scores2 = model.evaluate(dataset, [metric], transformers)
    assert np.allclose(scores['mean-roc_auc_score'],
                       scores2['mean-roc_auc_score'])
Example #19
  def test_graph_conv_regression_model(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv')

    batch_size = 50
    model = GraphConvModel(len(tasks), batch_size=batch_size, mode='regression')

    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    assert all(s < 0.1 for s in scores['mean_absolute_error'])

    model.save()
    model = TensorGraph.load_from_dir(model.model_dir)
    scores2 = model.evaluate(dataset, [metric], transformers)
    assert np.allclose(
        scores['mean_absolute_error'],
        scores2['mean_absolute_error'],
        rtol=1e-4)
Example #20
def test_graph_conv_model_no_task():
    tasks, dataset, _, __ = get_dataset('classification', 'GraphConv')
    batch_size = 10
    model = GraphConvModel(len(tasks),
                           batch_size=batch_size,
                           batch_normalize=False,
                           mode='classification')
    model.fit(dataset, nb_epoch=20)
    # predict a dataset with no y (ensured by tasks=[])
    bace_url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv"
    dc.utils.data_utils.download_url(url=bace_url, name="bace_tmp.csv")
    loader = dc.data.CSVLoader(tasks=[],
                               smiles_field='mol',
                               featurizer=dc.feat.ConvMolFeaturizer())
    td = loader.featurize(
        os.path.join(dc.utils.data_utils.get_data_dir(), "bace_tmp.csv"))
    model.predict(td)
Example #21
  def test_change_loss_function(self):
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'GraphConv', num_tasks=1)

    batch_size = 50
    model = GraphConvModel(len(tasks), batch_size=batch_size, mode='regression')

    model.fit(dataset, nb_epoch=1)
    model.save()

    model2 = TensorGraph.load_from_dir(model.model_dir, restore=False)
    dummy_label = model2.labels[-1]
    dummy_output = model2.outputs[-1]
    loss = ReduceSum(L2Loss(in_layers=[dummy_label, dummy_output]))
    module = model2.create_submodel(loss=loss)
    model2.restore()
    model2.fit(dataset, nb_epoch=1, submodel=module)
Example #22
                          np.mean,
                          mode="classification")
    ]

    training_score_list = []
    validation_score_list = []
    transformers = []

    model.fit(dataset)
    print(model.evaluate(dataset, metrics))
    return model


#model = generate_graph_conv_model()
model = GraphConvModel(1,
                       batch_size=128,
                       mode="classification",
                       model_dir="/tmp/mk01/model_dir")
model.restore()
#make predictions
featurizer = dc.feat.ConvMolFeaturizer()
df = pd.read_csv("zinc_100k.txt", sep=" ", header=None)
df.columns = ["SMILES", "Name"]

rows, cols = df.shape
df["Val"] = [0] * rows  # just add a dummy column to keep the featurizer happy
infile_name = "zinc_filtered.csv"
df.to_csv(infile_name, index=False)
loader = dc.data.CSVLoader(tasks=['Val'],
                           smiles_field="SMILES",
Example #23
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset_train, frac_train=0.8, frac_valid=0.2, frac_test=0.0, seed=0)

#### Transform them
train_dataset = transformers_train.transform(train_dataset)
valid_dataset = transformers_train.transform(valid_dataset)
test1_dataset = transformers_test1.transform(dataset_test1)
test2_dataset = transformers_test2.transform(dataset_test2)

#######

model_dir = "./tf_chp_hp"
model = GraphConvModel(n_tasks=1,
                       batch_size=32,
                       mode='regression',
                       dropout=0.0,
                       dense_layer_size=256,
                       learning_rate=0.005,
                       model_dir=model_dir,
                       random_seed=0)

metric = dc.metrics.Metric(dc.metrics.r2_score, mode='regression')

ckpt = tf.train.Checkpoint(step=tf.Variable(1))
manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=20)

start_time = time.time()

num_epochs = 100
losses_train = []
score_valid = []
score_train = []
Example #24
"""
MODEL BUILDING
"""

# Fit
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

# Do setup required for tf/keras models
#n_feat = 1000           # Number of features on conv-mols
batch_size = 1000  # Batch size of models
nb_epoch = 1000  # Number of epochs for convergence
model = GraphConvModel(
    1,
    batch_size=batch_size,
    mode='regression',
    dropout=0.2,  # dropout to prevent overfitting
    tensorboard=True,
    model_dir="/home/rod/Dropbox/Quimica/Analysis/ANalisis/Borradores/GraphConvModel/")

# Fit trained model
model.fit(train_dataset, nb_epoch=nb_epoch)
model.save()
print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)
Example #25
    metrics = [
        dc.metrics.Metric(dc.metrics.roc_auc_score,
                          np.mean,
                          mode="classification")
    ]
    print(model.evaluate(train_dataset, metrics))
    print(model.evaluate(test_dataset, metrics))


#test_graph_conv_model()

#train the model
batch_size = 2000
model = GraphConvModel(1,
                       batch_size=batch_size,
                       mode="classification",
                       model_dir="/tmp/covid/model_dir")
dataset_file = "covid_mpro_combined_data_sources.csv"
tasks = ["isHit"]
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(tasks=tasks,
                           smiles_field="SMILES",
                           featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)

model.fit(dataset)

#model = GraphConvModel(1, batch_size=128,mode="classification",model_dir="/tmp/mk01/model_dir")
#model.restore()
#make predictions
featurizer = dc.feat.ConvMolFeaturizer()
Example #26
from deepchem.molnet import load_delaney

# Load Delaney dataset
delaney_tasks, delaney_datasets, transformers = load_delaney(
    featurizer='GraphConv', split='index')
train_dataset, valid_dataset, test_dataset = delaney_datasets

# Fit models
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

# Do setup required for tf/keras models
# Number of features on conv-mols
n_feat = 75
# Batch size of models
batch_size = 128
model = GraphConvModel(
    len(delaney_tasks), batch_size=batch_size, mode='regression')

# Fit trained model
model.fit(train_dataset, nb_epoch=20)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
Example #27
# Load HOPV dataset
hopv_tasks, hopv_datasets, transformers = load_hopv(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = hopv_datasets

# Fit models
metric = [
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean, mode="regression"),
    dc.metrics.Metric(
        dc.metrics.mean_absolute_error, np.mean, mode="regression")
]

# Number of features on conv-mols
n_feat = 75
# Batch size of models
batch_size = 50
model = GraphConvModel(
    len(hopv_tasks), batch_size=batch_size, mode='regression')

# Fit trained model
model.fit(train_dataset, nb_epoch=25)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, metric, transformers)
valid_scores = model.evaluate(valid_dataset, metric, transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
Example #28

smiles = ['C1CCCCC1', 'O1CCOCC1'] # cyclohexane and dioxane
mols = [Chem.MolFromSmiles(smile) for smile in smiles]
feat = dc.feat.CircularFingerprint(size=1024)
arr = feat.featurize(mols)
print(arr)

feat = dc.feat.RDKitDescriptors()
arr = feat.featurize(mols)
print(arr)

tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets

model = GraphConvModel(n_tasks=1, mode='regression', dropout=0.2)
model.fit(train_dataset, nb_epoch=100)

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print(model.evaluate(train_dataset, [metric], transformers))
print(model.evaluate(test_dataset, [metric], transformers))

smiles = ['COC(C)(C)CCCC(C)CC=CC(C)=CC(=O)OC(C)C',
          'CCOC(=O)CC',
          'CSc1nc(NC(C)C)nc(NC(C)C)n1',
          'CC(C#C)N(C)C(=O)Nc1ccc(Cl)cc1',
          'Cc1cc2ccccc2cc1C']

from rdkit import Chem
mols = [Chem.MolFromSmiles(s) for s in smiles]
featurizer = dc.feat.ConvMolFeaturizer()
def generate_graph_conv_model():
    batch_size = 128
    model = GraphConvModel(1, batch_size=batch_size, mode='regression')
    return model
    splitter = dc.splits.RandomSplitter()
    train, valid, test = splitter.train_valid_test_split(dataset)

# In[4]:

# Fit models
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

# Number of features
n_feat = 75

# Batch size of models
batch_size = 128

model = GraphConvModel(len(delaney_tasks),
                       batch_size=batch_size,
                       mode='regression',
                       dropout=0.2)

# In[5]:

# Fit trained model
model.fit(train, nb_epoch=100)

print("Evaluating model")
train_scores = model.evaluate(train, [metric], transformers)
valid_scores = model.evaluate(valid, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
Example #31
from dataset_functions import to_dataframe, from_dataframe

# select the featurizer type to be ConvMolFeaturizer
graph_featurizer = dc.feat.graph_features.ConvMolFeaturizer()

loader = dc.data.data_loader.CSVLoader(tasks=['gap'],
                                       smiles_field="smiles",
                                       id_field="ID",
                                       featurizer=graph_featurizer)
metric = [dc.metrics.Metric(dc.metrics.mae_score, np.mean)]

model = GraphConvModel(
    n_tasks=1,
    graph_conv_layers=[128, 128],
    dense_layer_size=512,
    dropout=0.0001,
    # dropout must be included in every layer for uncertainty
    mode='regression',
    uncertainty=True,
    learning_rate=0.001,
    batch_size=8)

csv_list = ["set1.csv", "set2.csv", "set3.csv", "set4.csv"]
seeds = [5, 10, 12, 18]


# for generic train test split loads for initial run
def load_and_split(csv, seed=None):
    # load csv for training and test
    dataset = loader.featurize(csv)

    # transform data here
Example #32
clintox_tasks, clintox_datasets, transformers = load_clintox(
    featurizer='GraphConv', split='random')
train_dataset, valid_dataset, test_dataset = clintox_datasets

# Fit models
metric = dc.metrics.Metric(dc.metrics.roc_auc_score,
                           np.mean,
                           mode="classification")

# Do setup required for tf/keras models
# Number of features on conv-mols
n_feat = 75
# Batch size of models
batch_size = 50
model = GraphConvModel(len(clintox_tasks),
                       batch_size=batch_size,
                       mode='classification')

# Fit trained model
model.fit(train_dataset, nb_epoch=10)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
Example #33
def finetune_dest_model(model_dir="models",
                        source_model=None,
                        csv="input_data.csv",
                        include_top=False,
                        num_epochs=100):

    dest_model = GraphConvModel(n_tasks=1,
                                graph_conv_layers=[128, 128],
                                dense_layer_size=512,
                                dropout=0,
                                mode='regression',
                                learning_rate=0.001,
                                batch_size=8,
                                model_dir=model_dir)

    dest_model.load_from_pretrained(source_model=source_model,
                                    assignment_map=None,
                                    value_map=None,
                                    include_top=include_top)

    train_set, valid_set, transformers = load_data(csv)

    tune_layers_index = []
    all_layers = dest_model.model.layers
    for layer in all_layers:
        ind = all_layers.index(layer)
        namelist = layer.name.split("_")
        if "batch" in namelist and "normalization" in namelist:
            tune_layers_index.append(ind)
        elif "graph" in namelist and "conv" in namelist:
            tune_layers_index.append(ind)
        elif "dense" in namelist:
            tune_layers_index.append(ind)

    all_model_losses = []
    all_model_metrics = []
    all_model_final_metrics = []
    all_model_plots = []
    all_models = []
    for freeze_till in tune_layers_index:
        print("-----------------------------")
        # iterate over every candidate freeze index and fit a model for each choice
        print(
            f"RUNNING ITERATION {tune_layers_index.index(freeze_till)} / {len(tune_layers_index)}"
        )
        for layer in all_layers:
            if all_layers.index(layer) < freeze_till:
                layer.trainable = False
            else:
                layer.trainable = True

        print(f"Froze layers till {dest_model.model.layers[freeze_till - 1]}")
        print(
            f"Training layers starting from {dest_model.model.layers[freeze_till]}"
        )
        print(
            f"Trainable layers: {len([layer for layer in dest_model.model.layers if layer.trainable])} - {[layer for layer in dest_model.model.layers if layer.trainable]}"
        )
        print("-----------------------------")

        current_losses, final_metrics, current_metrics, plt = fit_with_metrics(
            dest_model,
            num_epochs=num_epochs,
            train_set=train_set,
            valid_set=valid_set,
            transformers=transformers)
        all_model_losses.append(current_losses)
        all_model_final_metrics.append(final_metrics)
        all_model_metrics.append(current_metrics)
        all_model_plots.append(plt)
        all_models.append(dest_model)
        print("Fitting completed!")
        print(f"Final metrics for this model: {current_metrics}")

    return all_models, all_model_losses, all_model_final_metrics, all_model_metrics, all_model_plots
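A hedged sketch of driving the fine-tuning helper above; the pretrained checkpoint directory, output directory, and CSV name are placeholders, and the source model's architecture is assumed to match the destination model defined inside the function (load_data and fit_with_metrics are the helpers referenced in its body).

# Hypothetical pretrained source model restored from an earlier run.
source = GraphConvModel(n_tasks=1,
                        graph_conv_layers=[128, 128],
                        dense_layer_size=512,
                        dropout=0,
                        mode='regression',
                        batch_size=8,
                        model_dir="pretrained_model_dir")
source.restore()

models, losses, final_metrics, per_epoch_metrics, plots = finetune_dest_model(
    model_dir="finetuned_models",
    source_model=source,
    csv="input_data.csv",
    include_top=False,
    num_epochs=50)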
Example #34
def run_the_inference(input_file,
                      model_dir,
                      output_file=None,
                      tempdir=None,
                      gpu=None):
    """
    :param input_file: file in .smi format (smiles, tab separation, molecule_id)
    :param model_dir: directory with the saved model we want to use for inference
    :param output_file: where to store the predictions (csv format). If None, no output file is written
    :param tempdir: where the temporary directories created by DeepChem will be stored
    :param gpu: which GPU to use. If None, only CPU will be used
    :return: predictions (back transformed)
    """
    if gpu is None:
        import os
        os.environ[
            'CUDA_VISIBLE_DEVICES'] = ''  # we will use CPU only for inference

    else:
        import os
        os.environ['CUDA_VISIBLE_DEVICES'] = '%i' % gpu

    # 1. Load the model
    model = GraphConvModel.load_from_dir(model_dir, restore=True)
    n_tasks = model.n_tasks

    # 2. Prepare input data
    molids, smis, input_dset = input_smi_to_csv(input_file, tempdir, n_tasks)

    # 3. Run the prediction
    print('Running the prediction')
    second_tempdir = op.join(tempfile.mkdtemp(dir=tempdir), 'todel')
    ypred, molids_processed = easy_predict(input_dset,
                                           model,
                                           n_tasks,
                                           second_tempdir,
                                           smiles_field='smiles',
                                           id_field='molid')
    molids_processed = [str(mid) for mid in molids_processed]

    # 4. Post-process the predictions (remove the z-scaling)
    ypred = post_process_predictions(ypred, model_dir)

    # 5. Write down the csv file with the predictions
    if output_file is not None:
        ensure_dir_from_file(output_file)
        print('Writing the predictions on file')
        with open(output_file, 'w') as writer:
            # header
            writer.write(','.join(['CompoundID', 'Canonical Smiles'] +
                                  ['task_%i' % i for i in range(n_tasks)]))
            writer.write('\n')
            # content
            if len(molids_processed) == len(molids):
                for mid, smi, preds in zip(molids, smis, ypred):
                    writer.write(','.join([str(mid), smi]))
                    writer.write(',')
                    writer.write(','.join([str(p) for p in preds]))
                    writer.write('\n')
            else:  # then we have to use the list of molids that were processed
                print(
                    'Not all compounds could be predicted. Problematic CompoundIDs:'
                )
                print(set(molids).difference(set(molids_processed)))
                for mid, preds in zip(molids_processed, ypred):
                    writer.write(','.join([str(mid), molid2smi[str(mid)]]))
                    writer.write(',')
                    writer.write(','.join([str(p) for p in preds]))
                    writer.write('\n')

    # 6. Delete temp files
    print('Deleting temporary files...')
    shutil.rmtree(op.dirname(input_dset))
    shutil.rmtree(op.dirname(second_tempdir))

    return molids, smis, ypred
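A hedged invocation sketch for the inference helper above; the .smi file, model directory, and output path are placeholders, and the model directory is assumed to hold a GraphConvModel saved by the corresponding training code.

molids, smis, ypred = run_the_inference(
    input_file="compounds.smi",    # SMILES<tab>molecule_id, one per line
    model_dir="trained_gc_model",  # directory with the saved GraphConvModel
    output_file="predictions.csv",
    tempdir="/tmp",
    gpu=None)                      # None = CPU-only inference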
Example #35
# splitters = {
#   'index': dc.splits.IndexSplitter(),
#   'random': dc.splits.RandomSplitter(),
#   'scaffold': dc.splits.ScaffoldSplitter()
# }
splitter = dc.splits.ScaffoldSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset)

train_dataset.load_metadata
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Load Delaney dataset
# delaney_tasks, delaney_datasets, transformers = load_delaney(
#     featurizer='GraphConv', split='index')
# train_dataset, valid_dataset, test_dataset = dataset

# Fit models
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

# Do setup required for tf/keras models
# Number of features on conv-mols
n_feat = 250
# Batch size of models
batch_size = 128
model = GraphConvModel(len(delaney_tasks),
                       batch_size=batch_size,
                       mode='regression')

print("this is the end of the output")
train_dataset, valid_dataset, test_dataset = hopv_datasets

# Fit models
metric = [
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean, mode="regression"),
    dc.metrics.Metric(dc.metrics.mean_absolute_error,
                      np.mean,
                      mode="regression")
]

# Number of features on conv-mols
n_feat = 75
# Batch size of models
batch_size = 50
model = GraphConvModel(len(hopv_tasks),
                       batch_size=batch_size,
                       mode='regression')

# Fit trained model
model.fit(train_dataset, nb_epoch=25)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, metric, transformers)
valid_scores = model.evaluate(valid_dataset, metric, transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)