Example #1
  def test_sklearn_multitask_regression_overfit(self):
    """Test SKLearn singletask-to-multitask overfits tiny regression data."""
    n_tasks = 2
    tasks = ["task%d" % task for task in range(n_tasks)]
    n_samples = 10
    n_features = 3
    
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.rand(n_samples, n_tasks)
    w = np.ones((n_samples, n_tasks))

    dataset = DiskDataset.from_numpy(self.train_dir, X, y, w, ids)

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity, task_averager=np.mean)
    def model_builder(model_dir):
      sklearn_model = RandomForestRegressor()
      return SklearnModel(sklearn_model, model_dir)
    model = SingletaskToMultitask(tasks, model_builder, self.model_dir)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] > .7
Example #2
    def test_sklearn_multitask_classification_overfit(self):
        """Test SKLearn singletask-to-multitask overfits tiny data."""
        n_tasks = 10
        tasks = ["task%d" % task for task in range(n_tasks)]
        task_types = {task: "classification" for task in tasks}
        n_samples = 10
        n_features = 3

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "batch_size": None,
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)

        def model_builder(tasks,
                          task_types,
                          model_params,
                          model_dir,
                          verbosity=None):
            return SklearnModel(tasks,
                                task_types,
                                model_params,
                                model_dir,
                                mode="classification",
                                model_instance=RandomForestClassifier(),
                                verbosity=verbosity)

        model = SingletaskToMultitask(tasks,
                                      task_types,
                                      model_params,
                                      self.model_dir,
                                      model_builder,
                                      verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .9
Example #3
  def test_singletask_to_multitask_classification(self):
    splittype = "scaffold"
    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = []
    output_transformers = []
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: "classification" for task in tasks}
    input_file = "multitask_example.csv"

    n_features = 10
    n_tasks = len(tasks)
    # Define train dataset
    n_train = 100
    X_train = np.random.rand(n_train, n_features)
    y_train = np.random.randint(2, size=(n_train, n_tasks))
    w_train = np.ones_like(y_train)
    ids_train = ["C"] * n_train
    train_dataset = Dataset.from_numpy(self.train_dir,
                                       X_train, y_train, w_train, ids_train,
                                       tasks)

    # Define test dataset
    n_test = 10
    X_test = np.random.rand(n_test, n_features)
    y_test = np.random.randint(2, size=(n_test, n_tasks))
    w_test = np.ones_like(y_test)
    ids_test = ["C"] * n_test
    test_dataset = Dataset.from_numpy(self.test_dir,
                                      X_test, y_test, w_test, ids_test,
                                      tasks)

    params_dict = {
        "batch_size": 32,
        "data_shape": train_dataset.get_data_shape()
    }
    classification_metrics = [Metric(metrics.roc_auc_score)]
    def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
      return SklearnModel(tasks, task_types, model_params, model_dir,
                          model_instance=LogisticRegression())
    multitask_model = SingletaskToMultitask(tasks, task_types, params_dict,
                                            self.model_dir, model_builder)

    # Fit trained model
    multitask_model.fit(train_dataset)
    multitask_model.save()

    # Eval multitask_model on train
    evaluator = Evaluator(multitask_model, train_dataset, output_transformers,
                          verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval multitask_model on test
    evaluator = Evaluator(multitask_model, test_dataset, output_transformers,
                          verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
Example #4
  def test_sklearn_multitask_classification(self):
    """Test that sklearn models can learn on simple multitask classification."""
    np.random.seed(123)
    n_tasks = 4
    dataset = sklearn.datasets.load_digits(n_class=2)
    X, y = dataset.data, dataset.target
    y = np.reshape(y, (len(y), 1))
    y = np.hstack([y] * n_tasks)
    
    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train * n_samples)

    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    tasks = train_dataset.get_task_names()
    task_types = {task: "classification" for task in tasks}

    model_params = {
      "batch_size": None,
      "data_shape": train_dataset.get_data_shape()
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
      return SklearnModel(tasks, task_types, model_params, model_dir,
                          mode="classification",
                          model_instance=LogisticRegression(),
                          verbosity=verbosity)
    model = SingletaskToMultitask(tasks, task_types, model_params, self.model_dir,
                                  model_builder, verbosity=verbosity)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance([classification_metric])
    print("train_scores")
    print(train_scores)

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    print("scores")
    print(scores)

    for score in scores[classification_metric.name]:
      assert score > .5
Example #5
    def test_sklearn_multitask_classification(self):
        """Test that sklearn models can learn on simple multitask classification."""
        np.random.seed(123)
        n_tasks = 4
        tasks = range(n_tasks)
        dataset = sklearn.datasets.load_digits(n_class=2)
        X, y = dataset.data, dataset.target
        y = np.reshape(y, (len(y), 1))
        y = np.hstack([y] * n_tasks)

        frac_train = .7
        n_samples = len(X)
        n_train = int(frac_train * n_samples)
        X_train, y_train = X[:n_train], y[:n_train]
        X_test, y_test = X[n_train:], y[n_train:]
        train_dataset = DiskDataset.from_numpy(self.train_dir, X_train,
                                               y_train)
        test_dataset = DiskDataset.from_numpy(self.test_dir, X_test, y_test)

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)

        def model_builder(model_dir):
            sklearn_model = LogisticRegression()
            return SklearnModel(sklearn_model, model_dir)

        model = SingletaskToMultitask(tasks, model_builder, self.model_dir)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        transformers = []
        train_evaluator = Evaluator(model,
                                    train_dataset,
                                    transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [classification_metric])

        # Eval model on test
        transformers = []
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        for score in scores[classification_metric.name]:
            assert score > .5
Example #6
    def test_singletask_to_multitask_classification(self):
        n_features = 10
        n_tasks = 17
        tasks = range(n_tasks)
        # Define train dataset
        n_train = 100
        X_train = np.random.rand(n_train, n_features)
        y_train = np.random.randint(2, size=(n_train, n_tasks))
        w_train = np.ones_like(y_train)
        ids_train = ["C"] * n_train
        train_dataset = DiskDataset.from_numpy(self.train_dir, X_train,
                                               y_train, w_train, ids_train)

        # Define test dataset
        n_test = 10
        X_test = np.random.rand(n_test, n_features)
        y_test = np.random.randint(2, size=(n_test, n_tasks))
        w_test = np.ones_like(y_test)
        ids_test = ["C"] * n_test
        test_dataset = DiskDataset.from_numpy(self.test_dir, X_test, y_test,
                                              w_test, ids_test)

        transformers = []
        classification_metrics = [Metric(metrics.roc_auc_score)]

        def model_builder(model_dir):
            sklearn_model = LogisticRegression()
            return SklearnModel(sklearn_model, model_dir)

        multitask_model = SingletaskToMultitask(tasks, model_builder,
                                                self.model_dir)

        # Fit trained model
        multitask_model.fit(train_dataset)
        multitask_model.save()

        # Eval multitask_model on train
        evaluator = Evaluator(multitask_model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)

        # Eval multitask_model on test
        evaluator = Evaluator(multitask_model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
Example #7
  def test_sklearn_multitask_regression_overfit(self):
    """Test SKLearn singletask-to-multitask overfits tiny regression data."""
    n_tasks = 2
    tasks = ["task%d" % task for task in range(n_tasks)]
    task_types = {task: "regression" for task in tasks}
    n_samples = 10
    n_features = 3
    
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.rand(n_samples, n_tasks)
    w = np.ones((n_samples, n_tasks))

    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {
      "batch_size": None,
      "data_shape": dataset.get_data_shape()
    }

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
      return SklearnModel(tasks, task_types, model_params, model_dir,
                          mode="regression",
                          model_instance=RandomForestRegressor(),
                          verbosity=verbosity)
    model = SingletaskToMultitask(tasks, task_types, model_params, self.model_dir,
                                  model_builder, verbosity=verbosity)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] > .7
Example #8
print("About to perform train/valid/test split.")
splitter = RandomSplitter(verbosity=verbosity)
print("Performing new split.")
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    nci_dataset, train_dir, valid_dir, test_dir)

classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")
def model_builder(model_dir):
  sklearn_model = RandomForestRegressor(n_estimators=500)
  return SklearnModel(sklearn_model, model_dir)
model = SingletaskToMultitask(nci_tasks, model_builder, model_dir)

# Fit trained model
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance([classification_metric])

print("Train scores")
print(train_scores)

valid_evaluator = Evaluator(model, valid_dataset, transformers, verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance([classification_metric])

print("Validation scores")
print(valid_scores)
Example #9
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    return SklearnModel(tasks,
                        task_types,
                        model_params,
                        model_dir,
                        model_instance=RandomForestClassifier(
                            class_weight="balanced", n_estimators=500),
                        verbosity=verbosity)


model = SingletaskToMultitask(tox21_tasks,
                              tox21_task_types,
                              params_dict,
                              model_dir,
                              model_builder,
                              verbosity=verbosity)

# Fit trained model
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model,
                            train_dataset,
                            transformers,
                            verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance(
    [classification_metric])

print("Train scores")
print(train_scores)

valid_evaluator = Evaluator(model,
                            valid_dataset,
                            transformers,
                            verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance(
    [classification_metric])

print("Validation scores")
print(valid_scores)
Example #10
    def test_sklearn_multitask_classification(self):
        """Test that sklearn models can learn on simple multitask classification."""
        np.random.seed(123)
        n_tasks = 4
        dataset = sklearn.datasets.load_digits(n_class=2)
        X, y = dataset.data, dataset.target
        y = np.reshape(y, (len(y), 1))
        y = np.hstack([y] * n_tasks)

        frac_train = .7
        n_samples = len(X)
        n_train = int(frac_train * n_samples)

        X_train, y_train = X[:n_train], y[:n_train]
        X_test, y_test = X[n_train:], y[n_train:]

        train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
        test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

        tasks = train_dataset.get_task_names()
        task_types = {task: "classification" for task in tasks}

        model_params = {
            "batch_size": None,
            "data_shape": train_dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)

        def model_builder(tasks,
                          task_types,
                          model_params,
                          model_dir,
                          verbosity=None):
            return SklearnModel(tasks,
                                task_types,
                                model_params,
                                model_dir,
                                mode="classification",
                                model_instance=LogisticRegression(),
                                verbosity=verbosity)

        model = SingletaskToMultitask(tasks,
                                      task_types,
                                      model_params,
                                      self.model_dir,
                                      model_builder,
                                      verbosity=verbosity)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        transformers = []
        train_evaluator = Evaluator(model,
                                    train_dataset,
                                    transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [classification_metric])
        print("train_scores")
        print(train_scores)

        # Eval model on test
        transformers = []
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])
        print("scores")
        print(scores)

        for score in scores[classification_metric.name]:
            assert score > .5
Example #11
    def test_singletask_to_multitask_classification(self):
        splittype = "scaffold"
        compound_featurizers = [CircularFingerprint(size=1024)]
        complex_featurizers = []
        output_transformers = []
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]
        task_types = {task: "classification" for task in tasks}
        input_file = "multitask_example.csv"

        n_features = 10
        n_tasks = len(tasks)
        # Define train dataset
        n_train = 100
        X_train = np.random.rand(n_train, n_features)
        y_train = np.random.randint(2, size=(n_train, n_tasks))
        w_train = np.ones_like(y_train)
        ids_train = ["C"] * n_train
        train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train,
                                           w_train, ids_train, tasks)

        # Define test dataset
        n_test = 10
        X_test = np.random.rand(n_test, n_features)
        y_test = np.random.randint(2, size=(n_test, n_tasks))
        w_test = np.ones_like(y_test)
        ids_test = ["C"] * n_test
        test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test,
                                          w_test, ids_test, tasks)

        params_dict = {
            "batch_size": 32,
            "data_shape": train_dataset.get_data_shape()
        }
        classification_metrics = [Metric(metrics.roc_auc_score)]

        def model_builder(tasks,
                          task_types,
                          model_params,
                          model_dir,
                          verbosity=None):
            return SklearnModel(tasks,
                                task_types,
                                model_params,
                                model_dir,
                                model_instance=LogisticRegression())

        multitask_model = SingletaskToMultitask(tasks, task_types, params_dict,
                                                self.model_dir, model_builder)

        # Fit trained model
        multitask_model.fit(train_dataset)
        multitask_model.save()

        # Eval multitask_model on train
        evaluator = Evaluator(multitask_model,
                              train_dataset,
                              output_transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)

        # Eval multitask_model on test
        evaluator = Evaluator(multitask_model,
                              test_dataset,
                              output_transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
Example #12
tox_tasks, (tox_train, tox_valid,
            tox_test), tox_transformers = dc.molnet.load_tox21()

classification_metric = Metric(
    metrics.roc_auc_score, np.mean, mode="classification")


def model_builder(model_dir):
  sklearn_model = RandomForestClassifier(
      class_weight="balanced", n_estimators=500, n_jobs=-1)
  return dc.models.SklearnModel(sklearn_model, model_dir)


print(tox_train.get_task_names())
print(tox_tasks)
tox_model = SingletaskToMultitask(tox_tasks, model_builder)
tox_model.fit(tox_train)

# Load sider models now

sider_tasks, (
    sider_train, sider_valid,
    sider_test), sider_transformers = dc.molnet.load_sider(split="random")

sider_model = SingletaskToMultitask(sider_tasks, model_builder)
sider_model.fit(sider_train)

# Load sweetlead dataset now. Pass in dataset object and appropriate
# transformers to predict functions

sweet_tasks, (sweet_dataset, _, _), sweet_transformers = dc.molnet.load_sweet()
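
# A minimal sketch of the prediction step described in the comment above,
# assuming DeepChem's Model.predict(dataset, transformers) API; the names
# tox_sweet_preds and sider_sweet_preds are illustrative, not from the
# original script.
tox_sweet_preds = tox_model.predict(sweet_dataset, transformers=sweet_transformers)
sider_sweet_preds = sider_model.predict(sweet_dataset, transformers=sweet_transformers)
print("Tox21 predictions on sweetlead:", tox_sweet_preds.shape)
print("SIDER predictions on sweetlead:", sider_sweet_preds.shape)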