Example #1
  def scaffold_test_train_test_split(self):
    """Test of singletask RF ECFP regression API."""
    splittype = "scaffold"
    input_transforms = []
    output_transforms = ["normalize"]
    model_params = {}
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizer = CircularFingerprint(size=1024)

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")

    dataset = loader.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/test
    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(test_dataset) == 2
Example #2
    def test_singletask_scaffold_k_fold_split(self):
        """Test singletask ScaffoldSplitter class."""
        solubility_dataset = self.load_solubility_data()
        scaffold_splitter = ScaffoldSplitter()
        ids_set = set(solubility_dataset.ids)

        K = 5
        fold_dirs = [tempfile.mkdtemp() for i in range(K)]
        fold_datasets = scaffold_splitter.k_fold_split(solubility_dataset,
                                                       fold_dirs)

        for fold in range(K):
            fold_dataset = fold_datasets[fold]
            # Verify each fold's length is 10/K == 2
            assert len(fold_dataset) == 2
            # Verify that compounds in this fold are a subset of the original compounds
            fold_ids_set = set(fold_dataset.ids)
            assert fold_ids_set.issubset(ids_set)
            # Verify that no two folds have overlapping compounds.
            for other_fold in range(K):
                if fold == other_fold:
                    continue
                other_fold_dataset = fold_datasets[other_fold]
                other_fold_ids_set = set(other_fold_dataset.ids)
                assert fold_ids_set.isdisjoint(other_fold_ids_set)

        merge_dir = tempfile.mkdtemp()
        merged_dataset = DiskDataset.merge(merge_dir, fold_datasets)
        assert len(merged_dataset) == len(solubility_dataset)
        assert sorted(merged_dataset.ids) == (sorted(solubility_dataset.ids))
Example #3
  def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self):
    """Straightforward test of Tensorflow multitask deepchem classification API."""
    splittype = "scaffold"
    task_type = "classification"

    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    featurizer = CircularFingerprint(size=1024)

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    transformers = []

    metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
    params_dict = {"activation": ["relu"],
                    "momentum": [.9],
                    "batch_size": [50],
                    "init": ["glorot_uniform"],
                    "data_shape": [train_dataset.get_data_shape()],
                    "learning_rate": [1e-3],
                    "decay": [1e-6],
                    "nb_hidden": [1000], 
                    "nb_epoch": [1],
                    "nesterov": [False],
                    "dropouts": [(.5,)],
                    "nb_layers": [1],
                    "batchnorm": [False],
                    "layer_sizes": [(1000,)],
                    "weight_init_stddevs": [(.1,)],
                    "bias_init_consts": [(1.,)],
                    "num_classes": [2],
                    "penalty": [0.], 
                    "optimizer": ["sgd"],
                    "num_classification_tasks": [len(task_types)]
                  }

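    # model_builder constructs a fresh TensorflowModel (wrapping
    # TensorflowMultiTaskClassifier) for each hyperparameter combination that
    # HyperparamOpt evaluates below.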
    def model_builder(tasks, task_types, params_dict, logdir, verbosity=None):
        return TensorflowModel(
            tasks, task_types, params_dict, logdir, 
            tf_class=TensorflowMultiTaskClassifier,
            verbosity=verbosity)
    optimizer = HyperparamOpt(model_builder, tasks, task_types,
                              verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      metric, logdir=None)
Example #4
    def test_singletask_sklearn_rf_ECFP_regression_API(self):
        """Test of singletask RF ECFP regression API."""
        splittype = "scaffold"
        featurizer = CircularFingerprint(size=1024)
        model_params = {}
        tasks = ["log-solubility"]
        task_type = "regression"
        task_types = {task: task_type for task in tasks}
        input_file = os.path.join(self.current_dir, "example.csv")
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        input_transformers = []
        output_transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        transformers = input_transformers + output_transformers
        model_params["data_shape"] = train_dataset.get_data_shape()
        regression_metrics = [
            Metric(metrics.r2_score),
            Metric(metrics.mean_squared_error),
            Metric(metrics.mean_absolute_error)
        ]

        model = SklearnModel(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             mode="regression",
                             model_instance=RandomForestRegressor())

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        evaluator = Evaluator(model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(regression_metrics)

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(regression_metrics)
Example #5
  def test_multitask_keras_mlp_ECFP_classification_API(self):
    """Straightforward test of Keras multitask deepchem classification API."""
    g = tf.Graph()
    sess = tf.Session(graph=g)
    K.set_session(sess)
    with g.as_default():
      task_type = "classification"
      # TODO(rbharath): There should be some automatic check to ensure that all
      # required model_params are specified.
      # TODO(rbharath): Turning off dropout to make tests behave.
      model_params = {"nb_hidden": 10, "activation": "relu",
                      "dropout": .0, "learning_rate": .01,
                      "momentum": .9, "nesterov": False,
                      "decay": 1e-4, "batch_size": 5,
                      "nb_epoch": 2, "init": "glorot_uniform",
                      "nb_layers": 1, "batchnorm": False}

      input_file = os.path.join(self.current_dir, "multitask_example.csv")
      tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
               "task7", "task8", "task9", "task10", "task11", "task12",
               "task13", "task14", "task15", "task16"]
      task_types = {task: task_type for task in tasks}

      featurizer = CircularFingerprint(size=1024)

      loader = DataLoader(tasks=tasks,
                          smiles_field=self.smiles_field,
                          featurizer=featurizer,
                          verbosity="low")
      dataset = loader.featurize(input_file, self.data_dir)
      splitter = ScaffoldSplitter()
      train_dataset, test_dataset = splitter.train_test_split(
          dataset, self.train_dir, self.test_dir)

      transformers = []
      model_params["data_shape"] = train_dataset.get_data_shape()
      classification_metrics = [Metric(metrics.roc_auc_score),
                                Metric(metrics.matthews_corrcoef),
                                Metric(metrics.recall_score),
                                Metric(metrics.accuracy_score)]
      
      model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir)

      # Fit trained model
      model.fit(train_dataset)
      model.save()

      # Eval model on train
      evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(classification_metrics)

      # Eval model on test
      evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(classification_metrics)
Example #6
  def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API."""
    splittype = "scaffold"
    featurizer = RDKitDescriptors()
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    model_params = {}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())
  

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
Example #7
    def test_singletask_scaffold_split(self):
        """Test singletask ScaffoldSplitter class."""
        solubility_dataset = self.load_solubility_data()
        scaffold_splitter = ScaffoldSplitter()
        train_data, valid_data, test_data = \
            scaffold_splitter.train_valid_test_split(
                solubility_dataset,
                self.train_dir, self.valid_dir, self.test_dir,
                frac_train=0.8, frac_valid=0.1, frac_test=0.1)
        assert len(train_data) == 8
        assert len(valid_data) == 1
        assert len(test_data) == 1
Example #8
  def test_multitask_scaffold_split(self):
    """
    Test multitask ScaffoldSplitter class.
    """
    multitask_dataset = self.load_multitask_data()
    scaffold_splitter = ScaffoldSplitter()
    train_data, valid_data, test_data = \
        scaffold_splitter.train_valid_test_split(
            multitask_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1
Example #9
    def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None):
        """Loads or reloads a small version of MUV dataset."""
        # Load MUV dataset
        raw_dataset = load_from_disk(dataset_file)
        print("Number of examples in dataset: %s" % str(raw_dataset.shape[0]))

        print("About to featurize compounds")
        featurizer = CircularFingerprint(size=1024)
        MUV_tasks = [
            'MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644', 'MUV-548',
            'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858',
            'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832'
        ]
        loader = DataLoader(tasks=MUV_tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, self.data_dir)
        assert len(dataset) == len(raw_dataset)

        print("About to split compounds into train/valid/test")
        splitter = ScaffoldSplitter(verbosity=verbosity)
        frac_train, frac_valid, frac_test = .8, .1, .1
        train_dataset, valid_dataset, test_dataset = \
            splitter.train_valid_test_split(
                dataset, self.train_dir, self.valid_dir, self.test_dir,
                log_every_n=1000, frac_train=frac_train,
                frac_test=frac_test, frac_valid=frac_valid)
        # Do an approximate comparison since splits are sometimes slightly off from
        # the exact fraction.
        assert relative_difference(len(train_dataset),
                                   frac_train * len(dataset)) < 1e-3
        assert relative_difference(len(valid_dataset),
                                   frac_valid * len(dataset)) < 1e-3
        assert relative_difference(len(test_dataset),
                                   frac_test * len(dataset)) < 1e-3

        # TODO(rbharath): Transformers don't play nice with reload! Namely,
        # reloading will cause the transform to be reapplied. This is undesirable in
        # almost all cases. Need to understand a method to fix this.
        transformers = [
            BalancingTransformer(transform_w=True, dataset=train_dataset)
        ]
        print("Transforming datasets")
        for dataset in [train_dataset, valid_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(dataset)

        return (len(train_dataset), len(valid_dataset), len(test_dataset))
Example #10
  def test_singletask_sklearn_rf_ECFP_regression_sharded_API(self):
    """Test of singletask RF ECFP regression API: sharded edition."""
    splittype = "scaffold"
    featurizer = CircularFingerprint(size=1024)
    model_params = {}
    tasks = ["label"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(
        self.current_dir, "../../../datasets/pdbbind_core_df.pkl.gz")

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)
    # The original test sets a small shard size when featurizing to force the
    # creation of multiple shards of the data; pdbbind_core has ~200 examples.
    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
Example #11
  def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None):
    """Loads or reloads a small version of MUV dataset."""
    # Load MUV dataset
    raw_dataset = load_from_disk(dataset_file)
    print("Number of examples in dataset: %s" % str(raw_dataset.shape[0]))

    print("About to featurize compounds")
    featurizer = CircularFingerprint(size=1024)
    MUV_tasks = ['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
                 'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
                 'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
                 'MUV-466', 'MUV-832']
    loader = DataLoader(tasks=MUV_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, self.data_dir)
    assert len(dataset) == len(raw_dataset)

    print("About to split compounds into train/valid/test")
    splitter = ScaffoldSplitter(verbosity=verbosity)
    frac_train, frac_valid, frac_test = .8, .1, .1
    train_dataset, valid_dataset, test_dataset = \
        splitter.train_valid_test_split(
            dataset, self.train_dir, self.valid_dir, self.test_dir,
            log_every_n=1000, frac_train=frac_train,
            frac_test=frac_test, frac_valid=frac_valid)
    # Do an approximate comparison since splits are sometimes slightly off from
    # the exact fraction.
    assert relative_difference(
        len(train_dataset), frac_train * len(dataset)) < 1e-3
    assert relative_difference(
        len(valid_dataset), frac_valid * len(dataset)) < 1e-3
    assert relative_difference(
        len(test_dataset), frac_test * len(dataset)) < 1e-3

    # TODO(rbharath): Transformers don't play nice with reload! Namely,
    # reloading will cause the transform to be reapplied. This is undesirable in
    # almost all cases. Need to understand a method to fix this.
    transformers = [
        BalancingTransformer(transform_w=True, dataset=train_dataset)]
    print("Transforming datasets")
    for dataset in [train_dataset, valid_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    return (len(train_dataset), len(valid_dataset), len(test_dataset))
Example #12
  def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API."""
    splittype = "scaffold"
    featurizer = RDKitDescriptors()
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    sklearn_model = RandomForestRegressor()
    model = SklearnModel(sklearn_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
Example #13
    def test_multitask_order(self):
        """Test that order of tasks in multitask datasets is preserved."""
        from deepchem.models.keras_models.fcnet import MultiTaskDNN
        splittype = "scaffold"
        output_transformers = []
        input_transformers = []
        task_type = "classification"
        # TODO(rbharath): There should be some automatic check to ensure that all
        # required model_params are specified.
        model_params = {
            "nb_hidden": 10,
            "activation": "relu",
            "dropout": .5,
            "learning_rate": .01,
            "momentum": .9,
            "nesterov": False,
            "decay": 1e-4,
            "batch_size": 5,
            "nb_epoch": 2,
            "init": "glorot_uniform",
            "nb_layers": 1,
            "batchnorm": False
        }

        input_file = os.path.join(self.current_dir, "multitask_example.csv")
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]
        task_types = {task: task_type for task in tasks}

        featurizer = CircularFingerprint(size=1024)

        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        assert train_dataset.get_task_names() == tasks
        assert test_dataset.get_task_names() == tasks
Example #14
  def test_singletask_tf_mlp_ECFP_classification_API(self):
    """Straightforward test of Tensorflow singletask deepchem classification API."""
    n_features = 1024
    featurizer = CircularFingerprint(size=n_features)

    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    
    transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]

    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]

    tensorflow_model = TensorflowMultiTaskClassifier(
        len(tasks), n_features, self.model_dir)
    model = TensorflowModel(tensorflow_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
Example #15
  def test_multitask_keras_mlp_ECFP_classification_API(self):
    """Straightforward test of Keras multitask deepchem classification API."""
    g = tf.Graph()
    sess = tf.Session(graph=g)
    K.set_session(sess)
    with g.as_default():
      task_type = "classification"
      input_file = os.path.join(self.current_dir, "multitask_example.csv")
      tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
               "task7", "task8", "task9", "task10", "task11", "task12",
               "task13", "task14", "task15", "task16"]

      n_features = 1024
      featurizer = CircularFingerprint(size=n_features)
      loader = DataLoader(tasks=tasks,
                          smiles_field=self.smiles_field,
                          featurizer=featurizer,
                          verbosity="low")
      dataset = loader.featurize(input_file, self.data_dir)
      splitter = ScaffoldSplitter()
      train_dataset, test_dataset = splitter.train_test_split(
          dataset, self.train_dir, self.test_dir)

      transformers = []
      classification_metrics = [Metric(metrics.roc_auc_score),
                                Metric(metrics.matthews_corrcoef),
                                Metric(metrics.recall_score),
                                Metric(metrics.accuracy_score)]
      
      keras_model = MultiTaskDNN(len(tasks), n_features, "classification",
                                 dropout=0.)
      model = KerasModel(keras_model, self.model_dir)

      # Fit trained model
      model.fit(train_dataset)
      model.save()

      # Eval model on train
      evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(classification_metrics)

      # Eval model on test
      evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(classification_metrics)
Example #16
    def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
        """Straightforward test of Keras multitask deepchem classification API."""
        task_type = "classification"
        input_file = os.path.join(self.current_dir, "multitask_example.csv")
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]

        n_features = 1024
        featurizer = CircularFingerprint(size=n_features)
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, self.train_dir, self.valid_dir, self.test_dir)

        transformers = []
        metric = Metric(metrics.matthews_corrcoef,
                        np.mean,
                        mode="classification")
        params_dict = {"n_hidden": [5, 10]}

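        # model_builder returns a fresh KerasModel for each hyperparameter
        # combination tried by the search below.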
        def model_builder(model_params, model_dir):
            keras_model = MultiTaskDNN(len(tasks),
                                       n_features,
                                       task_type,
                                       dropout=0.,
                                       **model_params)
            return KerasModel(keras_model, model_dir)

        optimizer = HyperparamOpt(model_builder, verbosity="low")
        best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
            params_dict,
            train_dataset,
            valid_dataset,
            transformers,
            metric,
            logdir=None)
Example #17
  def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
    """Straightforward test of Keras multitask deepchem classification API."""
    task_type = "classification"
    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    featurizer = CircularFingerprint(size=1024)
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    transformers = []
    metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
    params_dict= {"nb_hidden": [5, 10],
                  "activation": ["relu"],
                  "dropout": [.5],
                  "learning_rate": [.01],
                  "momentum": [.9],
                  "nesterov": [False],
                  "decay": [1e-4],
                  "batch_size": [5],
                  "nb_epoch": [2],
                  "init": ["glorot_uniform"],
                  "nb_layers": [1],
                  "batchnorm": [False],
                  "data_shape": [train_dataset.get_data_shape()]}
    
    optimizer = HyperparamOpt(MultiTaskDNN, tasks, task_types,
                              verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      metric, logdir=None)
Example #18
    def test_multitask_order(self):
        """Test that order of tasks in multitask datasets is preserved."""
        input_file = os.path.join(self.current_dir, "multitask_example.csv")
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]

        featurizer = CircularFingerprint(size=1024)

        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        assert train_dataset.get_task_names() == tasks
        assert test_dataset.get_task_names() == tasks
Example #19
    def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
        """Test of hyperparam_opt with singletask RF ECFP regression API."""
        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        input_file = os.path.join(self.current_dir, "example.csv")
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, self.train_dir, self.valid_dir, self.test_dir)

        transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        for dataset in [train_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(dataset)

        params_dict = {"n_estimators": [10, 100]}
        metric = Metric(metrics.r2_score)

        def rf_model_builder(model_params, model_dir):
            sklearn_model = RandomForestRegressor(**model_params)
            return SklearnModel(sklearn_model, model_dir)

        optimizer = HyperparamOpt(rf_model_builder, verbosity="low")
        best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
            params_dict,
            train_dataset,
            valid_dataset,
            transformers,
            metric,
            logdir=None)
Example #20
def _load_mol_dataset(dataset_file,
                      tasks,
                      split="stratified",
                      test_size=0.1,
                      valid_size=0.1,
                      min_size=0,
                      max_size=None,
                      **kwargs):
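    """Load a molecule CSV, split it, and return graph-transformed datasets.

    Returns the (train, valid, test) MolDataset list along with the inferred
    input feature size and output size.
    """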

    train_size = 1.0 - (test_size + valid_size)
    featurizer = RawFeaturizer()
    loader = CSVLoader(tasks=tasks,
                       smiles_field="smiles",
                       featurizer=featurizer,
                       verbose=False,
                       log_every_n=10000)
    dataset = loader.featurize(dataset_file)

    splitters = {
        'index': IndexSplitter(),
        'random': RandomSplitter(),
        'scaffold': ScaffoldSplitter(),
        'butina': ButinaSplitter(),
        'stratified': RandomStratifiedSplitter()
    }

    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         frac_train=train_size,
                                                         frac_valid=valid_size,
                                                         frac_test=test_size)

    # compute data balance information on train
    balancer = BalancingTransformer(transform_w=True, dataset=train)
    train = balancer.transform(train)
    valid = balancer.transform(valid)
    test = balancer.transform(test)
    transformer = GraphTransformer(mol_size=[min_size, max_size], **kwargs)
    datasets = []
    for dt in (train, valid, test):
        X, ids = transformer(dt.ids, dtype=np.float32, ignore_errors=False)
        y = dt.y[ids, :]
        w = dt.w[ids, :]
        raw_mols = dt.X[ids]
        datasets.append(MolDataset(X, y, raw_mols, w=w, pad_to=max_size))

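    # Infer the model's input feature size and output size from the variables
    # left over from the last loop iteration above.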
    in_size = X[0][-1].shape[-1]
    out_size = 1 if len(y.shape) == 1 else y.shape[-1]
    return datasets, in_size, out_size
Example #21
  def test_multitask_order(self):
    """Test that order of tasks in multitask datasets is preserved."""
    from deepchem.models.keras_models.fcnet import MultiTaskDNN
    splittype = "scaffold"
    output_transformers = []
    input_transformers = []
    task_type = "classification"
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    model_params = {"nb_hidden": 10, "activation": "relu",
                    "dropout": .5, "learning_rate": .01,
                    "momentum": .9, "nesterov": False,
                    "decay": 1e-4, "batch_size": 5,
                    "nb_epoch": 2, "init": "glorot_uniform",
                    "nb_layers": 1, "batchnorm": False}

    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    featurizer = CircularFingerprint(size=1024)

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
  
    assert train_dataset.get_task_names() == tasks
    assert test_dataset.get_task_names() == tasks
Example #22
  def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
    """Test of hyperparam_opt with singletask RF ECFP regression API."""
    splittype = "scaffold"
    featurizer = CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)
    params_dict = {
      "n_estimators": [10, 100],
      "max_features": ["auto"],
      "data_shape": train_dataset.get_data_shape()
    }
    metric = Metric(metrics.r2_score)

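    # rf_model_builder is assumed to be defined at module level in the original
    # test file; it builds a RandomForestRegressor-backed SklearnModel for each
    # hyperparameter setting.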
    optimizer = HyperparamOpt(rf_model_builder, tasks, task_types, verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      metric, logdir=None)
Example #23
def partition_train_val_test(smiles, dataset):
    """
    Split a molecule dataset (SMILES) with deepchem built-ins
    """

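    # MockDataset is assumed to wrap the SMILES list in a minimal dataset object
    # exposing the fields (e.g. ids) that the deepchem splitters expect.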
    ds = MockDataset(smiles)

    if dataset == "BBBP":
        splitter = ScaffoldSplitter()
    elif dataset == "BACE":
        splitter = ScaffoldSplitter()
    elif dataset == "TOX21":
        splitter = RandomSplitter()
    else:
        raise ValueError(f"Unsupported dataset: {dataset}")

    train_inds, val_inds, test_inds = splitter.split(ds)

    return {
        "train_inds": train_inds,
        "val_inds": val_inds,
        "test_inds": test_inds
    }
Example #24
def test_splits():
    clean()

    init_data()

    smiles_col = 'compound_id'
    id_col = 'compound_id'
    output_dir = 'plots'
    frac_train = 0.8
    frac_test = 0.1
    frac_valid = 0.1
    num_super_scaffolds = 40
    num_generations = 20
    dfw = 1  # chemical distance importance weight
    rfw = 1  # split fraction importance weight

    total_df = pd.read_csv('KCNA5_KCNH2_SCN5A_data.csv', dtype={id_col: str})
    response_cols = [
        'target_KCNA5_standard_value', 'target_KCNH2_standard_value',
        'target_SCN5A_activity'
    ]

    # -------------------------------------------------------------------------
    # one generation multitask scaffold split
    mss = MultitaskScaffoldSplitter()
    mss_split_df = split_with(total_df,
                              mss,
                              smiles_col=smiles_col,
                              id_col=id_col,
                              response_cols=response_cols,
                              diff_fitness_weight=dfw,
                              ratio_fitness_weight=rfw,
                              num_generations=1,
                              num_super_scaffolds=num_super_scaffolds,
                              frac_train=frac_train,
                              frac_test=frac_test,
                              frac_valid=frac_valid)
    mss_split_df.to_csv('one_gen_split.csv', index=False)
    assert len(total_df) == len(mss_split_df)

    split_a = pd.read_csv('one_gen_split.csv', dtype={'cmpd_id': str})
    split_a_ss = SplitStats(total_df,
                            split_a,
                            smiles_col=smiles_col,
                            id_col=id_col,
                            response_cols=response_cols)
    split_a_ss.make_all_plots(
        dist_path=os.path.join(output_dir, 'multitask_1gen'))

    # -------------------------------------------------------------------------
    # multiple generation multitask scaffold split
    mss = MultitaskScaffoldSplitter()
    mss_split_df = split_with(total_df,
                              mss,
                              smiles_col=smiles_col,
                              id_col=id_col,
                              response_cols=response_cols,
                              diff_fitness_weight=dfw,
                              ratio_fitness_weight=rfw,
                              num_generations=num_generations,
                              num_super_scaffolds=num_super_scaffolds,
                              frac_train=frac_train,
                              frac_test=frac_test,
                              frac_valid=frac_valid)
    mss_split_df.to_csv('twenty_gen_split.csv', index=False)
    assert len(total_df) == len(mss_split_df)

    split_b = pd.read_csv('twenty_gen_split.csv', dtype={'cmpd_id': str})
    split_b_ss = SplitStats(total_df,
                            split_b,
                            smiles_col=smiles_col,
                            id_col=id_col,
                            response_cols=response_cols)
    split_b_ss.make_all_plots(
        dist_path=os.path.join(output_dir, f'multitask_{num_generations}gen'))

    # -------------------------------------------------------------------------
    # regular scaffold split
    ss = ScaffoldSplitter()
    ss_split_df = split_with(total_df,
                             ss,
                             smiles_col=smiles_col,
                             id_col=id_col,
                             response_cols=response_cols,
                             frac_train=frac_train,
                             frac_test=frac_test,
                             frac_valid=frac_valid)
    ss_split_df.to_csv('ss_split.csv', index=False)
    assert len(total_df) == len(ss_split_df)

    split_c = pd.read_csv('ss_split.csv', dtype={'cmpd_id': str})
    split_c_ss = SplitStats(total_df,
                            split_c,
                            smiles_col=smiles_col,
                            id_col=id_col,
                            response_cols=response_cols)
    split_c_ss.make_all_plots(dist_path=os.path.join(output_dir, 'scaffold_'))

    # median train/test compound distance should have gone up
    assert np.median(split_a_ss.dists) <= np.median(split_b_ss.dists)
    assert np.median(split_c_ss.dists) <= np.median(split_b_ss.dists)

    # no subset should contain 0 samples
    assert np.min(
        np.concatenate([
            split_a_ss.train_fracs, split_a_ss.valid_fracs,
            split_a_ss.test_fracs
        ])) > 0
    assert np.min(
        np.concatenate([
            split_b_ss.train_fracs, split_b_ss.valid_fracs,
            split_b_ss.test_fracs
        ])) > 0

    clean()
Example #25
  def test_singletask_tf_mlp_ECFP_classification_API(self):
    """Straightforward test of Tensorflow singletask deepchem classification API."""
    splittype = "scaffold"
    output_transformers = []
    input_transformers = []
    task_type = "classification"

    featurizer = CircularFingerprint(size=1024)

    tasks = ["outcome"]
    task_type = "classification"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example_classification.csv")

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    
    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers

    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    model_params = {
      "batch_size": 2,
      "num_classification_tasks": 1,
      "num_features": 1024,
      "layer_sizes": [1024],
      "weight_init_stddevs": [1.],
      "bias_init_consts": [0.],
      "dropouts": [.5],
      "num_classes": 2,
      "nb_epoch": 1,
      "penalty": 0.0,
      "optimizer": "adam",
      "learning_rate": .001,
      "data_shape": train_dataset.get_data_shape()
    }
    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]

    model = TensorflowModel(
        tasks, task_types, model_params, self.model_dir,
        tf_class=TensorflowMultiTaskClassifier)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
Example #26
    def test_multitask_keras_mlp_ECFP_classification_API(self):
        """Straightforward test of Keras multitask deepchem classification API."""
        from deepchem.models.keras_models.fcnet import MultiTaskDNN
        task_type = "classification"
        # TODO(rbharath): There should be some automatic check to ensure that all
        # required model_params are specified.
        model_params = {
            "nb_hidden": 10,
            "activation": "relu",
            "dropout": .5,
            "learning_rate": .01,
            "momentum": .9,
            "nesterov": False,
            "decay": 1e-4,
            "batch_size": 5,
            "nb_epoch": 2,
            "init": "glorot_uniform",
            "nb_layers": 1,
            "batchnorm": False
        }

        input_file = os.path.join(self.current_dir, "multitask_example.csv")
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]
        task_types = {task: task_type for task in tasks}

        featurizer = CircularFingerprint(size=1024)

        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)
        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        transformers = []
        model_params["data_shape"] = train_dataset.get_data_shape()
        classification_metrics = [
            Metric(metrics.roc_auc_score),
            Metric(metrics.matthews_corrcoef),
            Metric(metrics.recall_score),
            Metric(metrics.accuracy_score)
        ]

        model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        evaluator = Evaluator(model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
Example #27
    def test_singletask_tf_mlp_ECFP_classification_API(self):
        """Straightforward test of Tensorflow singletask deepchem classification API."""
        splittype = "scaffold"
        output_transformers = []
        input_transformers = []
        task_type = "classification"

        featurizer = CircularFingerprint(size=1024)

        tasks = ["outcome"]
        task_type = "classification"
        task_types = {task: task_type for task in tasks}
        input_file = os.path.join(self.current_dir,
                                  "example_classification.csv")

        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        input_transformers = []
        output_transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        transformers = input_transformers + output_transformers

        for dataset in [train_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(dataset)

        model_params = {
            "batch_size": 2,
            "num_classification_tasks": 1,
            "num_features": 1024,
            "layer_sizes": [1024],
            "weight_init_stddevs": [1.],
            "bias_init_consts": [0.],
            "dropouts": [.5],
            "num_classes": 2,
            "nb_epoch": 1,
            "penalty": 0.0,
            "optimizer": "adam",
            "learning_rate": .001,
            "data_shape": train_dataset.get_data_shape()
        }
        classification_metrics = [
            Metric(metrics.roc_auc_score),
            Metric(metrics.matthews_corrcoef),
            Metric(metrics.recall_score),
            Metric(metrics.accuracy_score)
        ]

        model = TensorflowModel(tasks,
                                task_types,
                                model_params,
                                self.model_dir,
                                tf_class=TensorflowMultiTaskClassifier)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        evaluator = Evaluator(model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)