Example #1
0
  def _hyperparam_opt(self, model_builder, params_dict, train_dataset,
                      valid_dataset, output_transformers, task_types, metric,
                      logdir=None):
    """Run a hyperparameter grid search, discarding the results.

    Shared helper: wraps ``model_builder`` in a low-verbosity
    ``HyperparamOpt`` and searches ``params_dict`` against the given
    train/valid datasets, scoring candidates with ``metric``.
    """
    hyperparam_optimizer = HyperparamOpt(
        model_builder, task_types, verbosity="low")
    # Return values are intentionally unused: this helper only verifies
    # that the search runs to completion without raising.
    _model, _params, _results = hyperparam_optimizer.hyperparam_search(
        params_dict, train_dataset, valid_dataset, output_transformers,
        metric, logdir=logdir)
  def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self):
    """Straightforward test of Tensorflow multitask deepchem classification API.

    Featurizes a 17-task CSV with 1024-bit circular fingerprints,
    scaffold-splits it, then runs a hyperparameter search over a
    TensorFlow multitask MLP classifier.
    """
    splittype = "scaffold"
    task_type = "classification"

    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    featurizer = CircularFingerprint(size=1024)

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    # Classification labels need no output transformation.
    transformers = []

    metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
    # Every hyperparameter has exactly one candidate value, so the "search"
    # builds and evaluates a single model -- this is a smoke test.
    params_dict = {"activation": ["relu"],
                    "momentum": [.9],
                    "batch_size": [50],
                    "init": ["glorot_uniform"],
                    "data_shape": [train_dataset.get_data_shape()],
                    "learning_rate": [1e-3],
                    "decay": [1e-6],
                    "nb_hidden": [1000], 
                    "nb_epoch": [1],
                    "nesterov": [False],
                    "dropouts": [(.5,)],
                    "nb_layers": [1],
                    "batchnorm": [False],
                    "layer_sizes": [(1000,)],
                    "weight_init_stddevs": [(.1,)],
                    "bias_init_consts": [(1.,)],
                    "num_classes": [2],
                    "penalty": [0.], 
                    "optimizer": ["sgd"],
                    "num_classification_tasks": [len(task_types)]
                  }

    def model_builder(tasks, task_types, params_dict, logdir, verbosity=None):
        # Adapter: wraps the TF multitask classifier in the generic
        # TensorflowModel interface expected by HyperparamOpt.
        return TensorflowModel(
            tasks, task_types, params_dict, logdir, 
            tf_class=TensorflowMultiTaskClassifier,
            verbosity=verbosity)
    optimizer = HyperparamOpt(model_builder, tasks, task_types,
                              verbosity="low")
    # Results are not asserted on; success == no exception raised.
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      metric, logdir=None)
    def test_singletask_to_multitask_sklearn_hyperparam_opt(self):
        """Test of hyperparam_opt with singletask_to_multitask.

        Builds synthetic random train/valid datasets (17 binary tasks) and
        searches over the random forest's n_estimators, wrapping one
        single-task sklearn model per task.
        """
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]
        input_file = "multitask_example.csv"

        n_features = 10
        n_tasks = len(tasks)
        # Define train dataset
        n_train = 100
        X_train = np.random.rand(n_train, n_features)
        y_train = np.random.randint(2, size=(n_train, n_tasks))
        w_train = np.ones_like(y_train)
        # Placeholder identifiers -- every sample shares the same SMILES.
        ids_train = ["C"] * n_train

        train_dataset = DiskDataset.from_numpy(self.train_dir, X_train,
                                               y_train, w_train, ids_train,
                                               tasks)

        # Define validation dataset
        n_valid = 10
        X_valid = np.random.rand(n_valid, n_features)
        y_valid = np.random.randint(2, size=(n_valid, n_tasks))
        w_valid = np.ones_like(y_valid)
        ids_valid = ["C"] * n_valid
        valid_dataset = DiskDataset.from_numpy(self.valid_dir, X_valid,
                                               y_valid, w_valid, ids_valid,
                                               tasks)

        transformers = []
        classification_metric = Metric(metrics.matthews_corrcoef,
                                       np.mean,
                                       mode="classification")
        # Two candidate forest sizes -> two models built and compared.
        params_dict = {"n_estimators": [1, 10]}

        def multitask_model_builder(model_params, model_dir):
            # Inner builder constructs one sklearn RF per task; the outer
            # wrapper stitches them into a single multitask model.
            def model_builder(model_dir):
                sklearn_model = RandomForestClassifier(**model_params)
                return SklearnModel(sklearn_model, model_dir)

            return SingletaskToMultitask(tasks, model_builder, model_dir)

        optimizer = HyperparamOpt(multitask_model_builder, verbosity="low")
        best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
            params_dict,
            train_dataset,
            valid_dataset,
            transformers,
            classification_metric,
            logdir=None)
  def test_singletask_to_multitask_sklearn_hyperparam_opt(self):
    """Test of hyperparam_opt with singletask_to_multitask.

    Random synthetic data, 17 binary classification tasks; each task gets
    its own LogisticRegression model via SingletaskToMultitask.
    """
    splittype = "scaffold"
    output_transformers = []
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: "classification" for task in tasks}
    input_file = "multitask_example.csv"
      
    n_features = 10
    n_tasks = len(tasks)
    # Define train dataset
    n_train = 100
    X_train = np.random.rand(n_train, n_features)
    y_train = np.random.randint(2, size=(n_train, n_tasks))
    w_train = np.ones_like(y_train)
    ids_train = ["C"] * n_train
    # NOTE(review): other examples in this file call DiskDataset.from_numpy;
    # confirm that Dataset exposes an equivalent from_numpy here.
    train_dataset = Dataset.from_numpy(self.train_dir,
                                       X_train, y_train, w_train, ids_train,
                                       tasks)

    # Define validation dataset
    n_valid = 10
    X_valid = np.random.rand(n_valid, n_features)
    y_valid = np.random.randint(2, size=(n_valid, n_tasks))
    w_valid = np.ones_like(y_valid)
    ids_valid = ["C"] * n_valid
    valid_dataset = Dataset.from_numpy(self.valid_dir,
                                       X_valid, y_valid, w_valid, ids_valid,
                                       tasks)
    # Single-point grid: one configuration, so this is a smoke test.
    params_dict = {
        "batch_size": [32],
        "data_shape": [train_dataset.get_data_shape()],
    }
    classification_metric = Metric(metrics.matthews_corrcoef, np.mean,
                                   mode="classification")
    def model_builder(tasks, task_types, model_params, task_model_dir,
                      verbosity=None):
      # Per-task builder: one logistic regression per task directory.
      return SklearnModel(tasks, task_types, model_params, task_model_dir,
                          model_instance=LogisticRegression())
    def multitask_model_builder(tasks, task_types, params_dict, logdir=None,
                                verbosity=None):
      # Combines the per-task models into one multitask wrapper.
      return SingletaskToMultitask(tasks, task_types, params_dict,
                                   self.model_dir, model_builder)

    optimizer = HyperparamOpt(multitask_model_builder, tasks, task_types,
                              verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      classification_metric, logdir=None)
    def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
        """Straightforward test of Keras multitask deepchem classification API.

        Featurizes a 17-task CSV with circular fingerprints, scaffold-splits,
        then searches over the hidden-layer width of a Keras multitask DNN.
        """
        task_type = "classification"
        input_file = os.path.join(self.current_dir, "multitask_example.csv")
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]

        n_features = 1024
        featurizer = CircularFingerprint(size=n_features)
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, self.train_dir, self.valid_dir, self.test_dir)

        # Classification labels need no output transformation.
        transformers = []
        metric = Metric(metrics.matthews_corrcoef,
                        np.mean,
                        mode="classification")
        # Two candidate hidden sizes -> two models built and compared.
        params_dict = {"n_hidden": [5, 10]}

        def model_builder(model_params, model_dir):
            # Dropout disabled so the two runs differ only in n_hidden.
            keras_model = MultiTaskDNN(len(tasks),
                                       n_features,
                                       task_type,
                                       dropout=0.,
                                       **model_params)
            return KerasModel(keras_model, model_dir)

        optimizer = HyperparamOpt(model_builder, verbosity="low")
        best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
            params_dict,
            train_dataset,
            valid_dataset,
            transformers,
            metric,
            logdir=None)
  def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
    """Straightforward test of Keras multitask deepchem classification API.

    Like the other Keras variant, but passes the MultiTaskDNN class itself
    (rather than a builder closure) to HyperparamOpt and searches nb_hidden.
    """
    task_type = "classification"
    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    featurizer = CircularFingerprint(size=1024)
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    transformers = []
    metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
    # Only nb_hidden has two candidates; every other key is fixed, so the
    # grid contains exactly two configurations.
    params_dict= {"nb_hidden": [5, 10],
                  "activation": ["relu"],
                  "dropout": [.5],
                  "learning_rate": [.01],
                  "momentum": [.9],
                  "nesterov": [False],
                  "decay": [1e-4],
                  "batch_size": [5],
                  "nb_epoch": [2],
                  "init": ["glorot_uniform"],
                  "nb_layers": [1],
                  "batchnorm": [False],
                  "data_shape": [train_dataset.get_data_shape()]}

    # MultiTaskDNN is used directly as the model constructor here.
    optimizer = HyperparamOpt(MultiTaskDNN, tasks, task_types,
                              verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      metric, logdir=None)
    def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
        """Test of hyperparam_opt with singletask RF ECFP regression API.

        Featurizes a solubility CSV, scaffold-splits, normalizes the labels,
        then searches over the random forest's n_estimators.
        """
        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        input_file = os.path.join(self.current_dir, "example.csv")
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, self.train_dir, self.valid_dir, self.test_dir)

        # Regression labels are normalized against train-set statistics.
        transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        # NOTE(review): only train and test are transformed here --
        # valid_dataset is left untouched; confirm this is intentional.
        for dataset in [train_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(dataset)

        # Two candidate forest sizes -> two models built and compared.
        params_dict = {"n_estimators": [10, 100]}
        metric = Metric(metrics.r2_score)

        def rf_model_builder(model_params, model_dir):
            sklearn_model = RandomForestRegressor(**model_params)
            return SklearnModel(sklearn_model, model_dir)

        optimizer = HyperparamOpt(rf_model_builder, verbosity="low")
        best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
            params_dict,
            train_dataset,
            valid_dataset,
            transformers,
            metric,
            logdir=None)
  def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
    """Test of hyperparam_opt with singletask RF ECFP regression API.

    Featurizes a solubility CSV, scaffold-splits, normalizes the labels,
    then searches over random-forest hyperparameters.
    """
    splittype = "scaffold"
    featurizer = CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    input_transformers = []
    # Regression labels are normalized against train-set statistics.
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)
    params_dict = {
      "n_estimators": [10, 100],
      "max_features": ["auto"],
      "data_shape": train_dataset.get_data_shape()
    }
    metric = Metric(metrics.r2_score)

    # BUG FIX: rf_model_builder was referenced below but never defined,
    # which raised a NameError before the search could start.  Define it
    # here, mirroring the builder signature used by the other sklearn
    # examples in this file.
    def rf_model_builder(tasks, task_types, model_params, model_dir,
                         verbosity=None):
      # Wrap a single-task RF regressor in the SklearnModel interface.
      return SklearnModel(
          tasks, task_types, model_params, model_dir,
          model_instance=RandomForestRegressor(
              n_estimators=model_params["n_estimators"],
              max_features=model_params["max_features"]))

    optimizer = HyperparamOpt(rf_model_builder, tasks, task_types, verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      metric, logdir=None)
Example #9
0
def bace_rf_model(mode="classification", verbosity="high", split="20-80"):
    """Train random forests on BACE dataset.

    Parameters
    ----------
    mode: str
        "regression" or "classification"; selects both the sklearn model
        class and the set of evaluation metrics.
    verbosity: str
        Verbosity level forwarded to metrics and evaluators.
    split: str
        Train/test split identifier forwarded to load_bace.
    """
    (bace_tasks, train_dataset, valid_dataset, test_dataset, crystal_dataset,
     transformers) = load_bace(mode=mode, transform=False, split=split)

    if mode == "regression":
        r2_metric = Metric(metrics.r2_score, verbosity=verbosity)
        rms_metric = Metric(metrics.rms_score, verbosity=verbosity)
        mae_metric = Metric(metrics.mae_score, verbosity=verbosity)
        all_metrics = [r2_metric, rms_metric, mae_metric]
        # R^2 drives model selection; the rest are reported only.
        metric = r2_metric
        model_class = RandomForestRegressor
    elif mode == "classification":
        roc_auc_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
        accuracy_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
        mcc_metric = Metric(metrics.matthews_corrcoef, verbosity=verbosity)
        # Note sensitivity = recall
        recall_metric = Metric(metrics.recall_score, verbosity=verbosity)
        model_class = RandomForestClassifier
        all_metrics = [
            accuracy_metric, mcc_metric, recall_metric, roc_auc_metric
        ]
        # ROC-AUC drives model selection.
        metric = roc_auc_metric
    else:
        raise ValueError("Invalid mode %s" % mode)

    def model_builder(tasks,
                      task_types,
                      params_dict,
                      model_dir,
                      verbosity=verbosity):
        # Adapter: HyperparamOpt calls this with one sampled params_dict
        # and expects a model wrapper back.
        n_estimators = params_dict["n_estimators"]
        max_features = params_dict["max_features"]
        return SklearnModel(tasks,
                            task_types,
                            params_dict,
                            model_dir,
                            model_instance=model_class(
                                n_estimators=n_estimators,
                                max_features=max_features))

    # Grid: 2 forest sizes x 4 max_features settings = 8 configurations.
    params_dict = {
        "n_estimators": [10, 100],
        "batch_size": [None],
        "data_shape": [train_dataset.get_data_shape()],
        "max_features": ["auto", "sqrt", "log2", None],
    }
    optimizer = HyperparamOpt(model_builder, bace_tasks,
                              {task: mode
                               for task in bace_tasks})
    best_rf, best_rf_hyperparams, all_rf_results = optimizer.hyperparam_search(
        params_dict, train_dataset, valid_dataset, transformers, metric=metric)

    # Evaluate the best model on every non-empty subset, writing per-example
    # predictions (csv_out) and aggregate stats (stats_out) to disk.
    if len(train_dataset) > 0:
        rf_train_evaluator = Evaluator(best_rf,
                                       train_dataset,
                                       transformers,
                                       verbosity=verbosity)
        csv_out = "rf_%s_%s_train.csv" % (mode, split)
        stats_out = "rf_%s_%s_train_stats.txt" % (mode, split)
        rf_train_score = rf_train_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("RF Train set scores: %s" % (str(rf_train_score)))

    if len(valid_dataset) > 0:
        rf_valid_evaluator = Evaluator(best_rf,
                                       valid_dataset,
                                       transformers,
                                       verbosity=verbosity)
        csv_out = "rf_%s_%s_valid.csv" % (mode, split)
        stats_out = "rf_%s_%s_valid_stats.txt" % (mode, split)
        rf_valid_score = rf_valid_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("RF Valid set scores: %s" % (str(rf_valid_score)))

    if len(test_dataset) > 0:
        rf_test_evaluator = Evaluator(best_rf,
                                      test_dataset,
                                      transformers,
                                      verbosity=verbosity)
        csv_out = "rf_%s_%s_test.csv" % (mode, split)
        stats_out = "rf_%s_%s_test_stats.txt" % (mode, split)
        rf_test_score = rf_test_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("RF Test set: %s" % (str(rf_test_score)))

    if len(crystal_dataset) > 0:
        rf_crystal_evaluator = Evaluator(best_rf, crystal_dataset,
                                         transformers, verbosity)
        csv_out = "rf_%s_%s_crystal.csv" % (mode, split)
        stats_out = "rf_%s_%s_crystal_stats.txt" % (mode, split)
        rf_crystal_score = rf_crystal_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("RF Crystal set: %s" % (str(rf_crystal_score)))
Example #10
0
def bace_dnn_model(mode="classification", verbosity="high", split="20-80"):
  """Train fully-connected DNNs on BACE dataset.

  Parameters
  ----------
  mode: str
      "regression" or "classification"; selects the evaluation metrics.
  verbosity: str
      Verbosity level forwarded to metrics.
  split: str
      Train/test split identifier forwarded to load_bace.
  """
  (bace_tasks, train_dataset, valid_dataset, test_dataset, crystal_dataset,
   transformers) = load_bace(mode=mode, transform=True, split=split)

  if mode == "regression":
    r2_metric = Metric(metrics.r2_score, verbosity=verbosity)
    rms_metric = Metric(metrics.rms_score, verbosity=verbosity)
    mae_metric = Metric(metrics.mae_score, verbosity=verbosity)
    all_metrics = [r2_metric, rms_metric, mae_metric]
    # R^2 drives model selection; the rest are reported only.
    metric = r2_metric
  elif mode == "classification":
    roc_auc_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    accuracy_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
    mcc_metric = Metric(metrics.matthews_corrcoef, verbosity=verbosity)
    # Note sensitivity = recall
    recall_metric = Metric(metrics.recall_score, verbosity=verbosity)
    all_metrics = [accuracy_metric, mcc_metric, recall_metric, roc_auc_metric]
    # ROC-AUC drives model selection.
    metric = roc_auc_metric 
  else:
    raise ValueError("Invalid mode %s" % mode)

  # Random search over learning_rate/decay (5 log-uniform samples each);
  # all other hyperparameters are fixed to a single value.
  params_dict = {"activation": ["relu"],
                  "momentum": [.9],
                  "batch_size": [50],
                  "init": ["glorot_uniform"],
                  "data_shape": [train_dataset.get_data_shape()],
                  "learning_rate": np.power(10., np.random.uniform(-5, -3, size=5)),
                  "decay": np.power(10, np.random.uniform(-6, -4, size=5)),
                  "nb_hidden": [1000],
                  "nb_epoch": [40],
                  "nesterov": [False],
                  "dropout": [.5],
                  "nb_layers": [1],
                  "batchnorm": [False],
                }

  # SingleTaskDNN is used directly as the model constructor.
  optimizer = HyperparamOpt(SingleTaskDNN, bace_tasks,
                            {task: mode for task in bace_tasks},
                            verbosity=verbosity)
  best_dnn, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      metric=metric)

  # Evaluate the best model on every non-empty subset, writing per-example
  # predictions (csv_out) and aggregate stats (stats_out) to disk.
  if len(train_dataset) > 0:
    dnn_train_evaluator = Evaluator(best_dnn, train_dataset, transformers)
    csv_out = "dnn_%s_%s_train.csv" % (mode, split)
    stats_out = "dnn_%s_%s_train_stats.txt" % (mode, split)
    dnn_train_score = dnn_train_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Train set %s: %s" % (metric.name, str(dnn_train_score)))

  if len(valid_dataset) > 0:
    dnn_valid_evaluator = Evaluator(best_dnn, valid_dataset, transformers)
    csv_out = "dnn_%s_%s_valid.csv" % (mode, split)
    stats_out = "dnn_%s_%s_valid_stats.txt" % (mode, split)
    dnn_valid_score = dnn_valid_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Valid set %s: %s" % (metric.name, str(dnn_valid_score)))

  if len(test_dataset) > 0:
    dnn_test_evaluator = Evaluator(best_dnn, test_dataset, transformers)
    csv_out = "dnn_%s_%s_test.csv" % (mode, split)
    stats_out = "dnn_%s_%s_test_stats.txt" % (mode, split)
    dnn_test_score = dnn_test_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Test set %s: %s" % (metric.name, str(dnn_test_score)))

  if len(crystal_dataset) > 0:
    dnn_crystal_evaluator = Evaluator(best_dnn, crystal_dataset, transformers)
    csv_out = "dnn_%s_%s_crystal.csv" % (mode, split)
    stats_out = "dnn_%s_%s_crystal_stats.txt" % (mode, split)
    dnn_crystal_score = dnn_crystal_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Crystal set %s: %s" % (metric.name, str(dnn_crystal_score)))
Example #11
0
def bace_dnn_model(mode="classification", verbosity="high", split="20-80"):
    """Train fully-connected DNNs on BACE dataset.

    Parameters
    ----------
    mode: str
        "regression" or "classification"; selects the evaluation metrics.
    verbosity: str
        Verbosity level forwarded to metrics.
    split: str
        Train/test split identifier forwarded to load_bace.
    """
    (bace_tasks, train_dataset, valid_dataset, test_dataset, crystal_dataset,
     transformers) = load_bace(mode=mode, transform=True, split=split)

    if mode == "regression":
        r2_metric = Metric(metrics.r2_score, verbosity=verbosity)
        rms_metric = Metric(metrics.rms_score, verbosity=verbosity)
        mae_metric = Metric(metrics.mae_score, verbosity=verbosity)
        all_metrics = [r2_metric, rms_metric, mae_metric]
        # R^2 drives model selection; the rest are reported only.
        metric = r2_metric
    elif mode == "classification":
        roc_auc_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
        accuracy_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
        mcc_metric = Metric(metrics.matthews_corrcoef, verbosity=verbosity)
        # Note sensitivity = recall
        recall_metric = Metric(metrics.recall_score, verbosity=verbosity)
        all_metrics = [
            accuracy_metric, mcc_metric, recall_metric, roc_auc_metric
        ]
        # ROC-AUC drives model selection.
        metric = roc_auc_metric
    else:
        raise ValueError("Invalid mode %s" % mode)

    # Random search over learning_rate/decay (5 log-uniform samples each);
    # all other hyperparameters are fixed to a single value.
    params_dict = {
        "activation": ["relu"],
        "momentum": [.9],
        "batch_size": [50],
        "init": ["glorot_uniform"],
        "data_shape": [train_dataset.get_data_shape()],
        "learning_rate": np.power(10., np.random.uniform(-5, -3, size=5)),
        "decay": np.power(10, np.random.uniform(-6, -4, size=5)),
        "nb_hidden": [1000],
        "nb_epoch": [40],
        "nesterov": [False],
        "dropout": [.5],
        "nb_layers": [1],
        "batchnorm": [False],
    }

    # SingleTaskDNN is used directly as the model constructor.
    optimizer = HyperparamOpt(SingleTaskDNN,
                              bace_tasks, {task: mode
                                           for task in bace_tasks},
                              verbosity=verbosity)
    best_dnn, best_hyperparams, all_results = optimizer.hyperparam_search(
        params_dict, train_dataset, valid_dataset, transformers, metric=metric)

    # Evaluate the best model on every non-empty subset, writing per-example
    # predictions (csv_out) and aggregate stats (stats_out) to disk.
    if len(train_dataset) > 0:
        dnn_train_evaluator = Evaluator(best_dnn, train_dataset, transformers)
        csv_out = "dnn_%s_%s_train.csv" % (mode, split)
        stats_out = "dnn_%s_%s_train_stats.txt" % (mode, split)
        dnn_train_score = dnn_train_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("DNN Train set %s: %s" % (metric.name, str(dnn_train_score)))

    if len(valid_dataset) > 0:
        dnn_valid_evaluator = Evaluator(best_dnn, valid_dataset, transformers)
        csv_out = "dnn_%s_%s_valid.csv" % (mode, split)
        stats_out = "dnn_%s_%s_valid_stats.txt" % (mode, split)
        dnn_valid_score = dnn_valid_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("DNN Valid set %s: %s" % (metric.name, str(dnn_valid_score)))

    if len(test_dataset) > 0:
        dnn_test_evaluator = Evaluator(best_dnn, test_dataset, transformers)
        csv_out = "dnn_%s_%s_test.csv" % (mode, split)
        stats_out = "dnn_%s_%s_test_stats.txt" % (mode, split)
        dnn_test_score = dnn_test_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("DNN Test set %s: %s" % (metric.name, str(dnn_test_score)))

    if len(crystal_dataset) > 0:
        dnn_crystal_evaluator = Evaluator(best_dnn, crystal_dataset,
                                          transformers)
        csv_out = "dnn_%s_%s_crystal.csv" % (mode, split)
        stats_out = "dnn_%s_%s_crystal_stats.txt" % (mode, split)
        dnn_crystal_score = dnn_crystal_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("DNN Crystal set %s: %s" % (metric.name, str(dnn_crystal_score)))
Example #12
0
def bace_rf_model(mode="classification", verbosity="high", split="20-80"):
  """Train random forests on BACE dataset.

  Parameters
  ----------
  mode: str
      "regression" or "classification"; selects both the sklearn model
      class and the set of evaluation metrics.
  verbosity: str
      Verbosity level forwarded to metrics and evaluators.
  split: str
      Train/test split identifier forwarded to load_bace.
  """
  (bace_tasks, train_dataset, valid_dataset, test_dataset, crystal_dataset,
   transformers) = load_bace(mode=mode, transform=False, split=split)

  if mode == "regression":
    r2_metric = Metric(metrics.r2_score, verbosity=verbosity)
    rms_metric = Metric(metrics.rms_score, verbosity=verbosity)
    mae_metric = Metric(metrics.mae_score, verbosity=verbosity)
    all_metrics = [r2_metric, rms_metric, mae_metric]
    # R^2 drives model selection; the rest are reported only.
    metric = r2_metric
    model_class = RandomForestRegressor
  elif mode == "classification":
    roc_auc_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    accuracy_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
    mcc_metric = Metric(metrics.matthews_corrcoef, verbosity=verbosity)
    # Note sensitivity = recall
    recall_metric = Metric(metrics.recall_score, verbosity=verbosity)
    model_class = RandomForestClassifier
    all_metrics = [accuracy_metric, mcc_metric, recall_metric, roc_auc_metric]
    # ROC-AUC drives model selection.
    metric = roc_auc_metric 
  else:
    raise ValueError("Invalid mode %s" % mode)

  def model_builder(tasks, task_types, params_dict, model_dir, verbosity=verbosity):
      # Adapter: HyperparamOpt calls this with one sampled params_dict and
      # expects a model wrapper back.
      n_estimators = params_dict["n_estimators"]
      max_features = params_dict["max_features"]
      return SklearnModel(
          tasks, task_types, params_dict, model_dir,
          model_instance=model_class(n_estimators=n_estimators,
                                     max_features=max_features))
  # Grid: 2 forest sizes x 4 max_features settings = 8 configurations.
  params_dict = {
      "n_estimators": [10, 100],
      "batch_size": [None],
      "data_shape": [train_dataset.get_data_shape()],
      "max_features": ["auto", "sqrt", "log2", None],
      }
  optimizer = HyperparamOpt(model_builder, bace_tasks,
                            {task: mode for task in bace_tasks})
  best_rf, best_rf_hyperparams, all_rf_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      metric=metric)

  # Evaluate the best model on every non-empty subset, writing per-example
  # predictions (csv_out) and aggregate stats (stats_out) to disk.
  if len(train_dataset) > 0:
    rf_train_evaluator = Evaluator(best_rf, train_dataset, transformers,
                                   verbosity=verbosity)
    csv_out = "rf_%s_%s_train.csv" % (mode, split)
    stats_out = "rf_%s_%s_train_stats.txt" % (mode, split)
    rf_train_score = rf_train_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Train set scores: %s" % (str(rf_train_score)))

  if len(valid_dataset) > 0:
    rf_valid_evaluator = Evaluator(best_rf, valid_dataset, transformers,
                                   verbosity=verbosity)
    csv_out = "rf_%s_%s_valid.csv" % (mode, split)
    stats_out = "rf_%s_%s_valid_stats.txt" % (mode, split)
    rf_valid_score = rf_valid_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Valid set scores: %s" % (str(rf_valid_score)))

  if len(test_dataset) > 0:
    rf_test_evaluator = Evaluator(best_rf, test_dataset, transformers,
                                  verbosity=verbosity)
    csv_out = "rf_%s_%s_test.csv" % (mode, split)
    stats_out = "rf_%s_%s_test_stats.txt" % (mode, split)
    rf_test_score = rf_test_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Test set: %s" % (str(rf_test_score)))

  if len(crystal_dataset) > 0:
    rf_crystal_evaluator = Evaluator(best_rf, crystal_dataset, transformers,
                                     verbosity)
    csv_out = "rf_%s_%s_crystal.csv" % (mode, split)
    stats_out = "rf_%s_%s_crystal_stats.txt" % (mode, split)
    rf_crystal_score = rf_crystal_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Crystal set: %s" % (str(rf_crystal_score)))
Example #13
0
def bace_dnn_model(mode="classification", verbosity="high", split="20-80"):
    """Train fully-connected DNNs on BACE dataset.

    Keras-builder variant: wraps a MultiTaskDNN in KerasModel and searches
    learning_rate/decay via random sampling.

    Parameters
    ----------
    mode: str
        "regression" or "classification"; selects the evaluation metrics.
    verbosity: str
        Verbosity level forwarded to metrics.
    split: str
        Train/test split identifier forwarded to load_bace.
    """
    (bace_tasks, train_dataset, valid_dataset, test_dataset, crystal_dataset,
     transformers) = load_bace(mode=mode, transform=True, split=split)

    if mode == "regression":
        r2_metric = Metric(metrics.r2_score, verbosity=verbosity)
        rms_metric = Metric(metrics.rms_score, verbosity=verbosity)
        mae_metric = Metric(metrics.mae_score, verbosity=verbosity)
        all_metrics = [r2_metric, rms_metric, mae_metric]
        # R^2 drives model selection; the rest are reported only.
        metric = r2_metric
    elif mode == "classification":
        roc_auc_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
        accuracy_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
        mcc_metric = Metric(metrics.matthews_corrcoef, verbosity=verbosity)
        # Note sensitivity = recall
        recall_metric = Metric(metrics.recall_score, verbosity=verbosity)
        all_metrics = [
            accuracy_metric, mcc_metric, recall_metric, roc_auc_metric
        ]
        # ROC-AUC drives model selection.
        metric = roc_auc_metric
    else:
        raise ValueError("Invalid mode %s" % mode)

    # Random search: 5 log-uniform samples each for learning_rate and decay.
    params_dict = {
        "learning_rate": np.power(10., np.random.uniform(-5, -3, size=5)),
        "decay": np.power(10, np.random.uniform(-6, -4, size=5)),
        "nb_epoch": [40]
    }

    n_features = train_dataset.get_data_shape()[0]

    def model_builder(model_params, model_dir):
        # NOTE(review): task type is hard-coded to "classification" here even
        # when mode == "regression" -- confirm this is intentional.
        keras_model = MultiTaskDNN(len(bace_tasks),
                                   n_features,
                                   "classification",
                                   dropout=.5,
                                   **model_params)
        return KerasModel(keras_model, model_dir)

    optimizer = HyperparamOpt(model_builder, verbosity="low")
    best_dnn, best_hyperparams, all_results = optimizer.hyperparam_search(
        params_dict, train_dataset, valid_dataset, transformers, metric=metric)

    # Evaluate the best model on every non-empty subset, writing per-example
    # predictions (csv_out) and aggregate stats (stats_out) to disk.
    if len(train_dataset) > 0:
        dnn_train_evaluator = Evaluator(best_dnn, train_dataset, transformers)
        csv_out = "dnn_%s_%s_train.csv" % (mode, split)
        stats_out = "dnn_%s_%s_train_stats.txt" % (mode, split)
        dnn_train_score = dnn_train_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("DNN Train set %s: %s" % (metric.name, str(dnn_train_score)))

    if len(valid_dataset) > 0:
        dnn_valid_evaluator = Evaluator(best_dnn, valid_dataset, transformers)
        csv_out = "dnn_%s_%s_valid.csv" % (mode, split)
        stats_out = "dnn_%s_%s_valid_stats.txt" % (mode, split)
        dnn_valid_score = dnn_valid_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("DNN Valid set %s: %s" % (metric.name, str(dnn_valid_score)))

    if len(test_dataset) > 0:
        dnn_test_evaluator = Evaluator(best_dnn, test_dataset, transformers)
        csv_out = "dnn_%s_%s_test.csv" % (mode, split)
        stats_out = "dnn_%s_%s_test_stats.txt" % (mode, split)
        dnn_test_score = dnn_test_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("DNN Test set %s: %s" % (metric.name, str(dnn_test_score)))

    if len(crystal_dataset) > 0:
        dnn_crystal_evaluator = Evaluator(best_dnn, crystal_dataset,
                                          transformers)
        csv_out = "dnn_%s_%s_crystal.csv" % (mode, split)
        stats_out = "dnn_%s_%s_crystal_stats.txt" % (mode, split)
        dnn_crystal_score = dnn_crystal_evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print("DNN Crystal set %s: %s" % (metric.name, str(dnn_crystal_score)))