Example #1
    def train_valid_test_split(self,
                               dataset,
                               train_dir,
                               valid_dir,
                               test_dir,
                               frac_train=.8,
                               frac_valid=.1,
                               frac_test=.1,
                               seed=None,
                               log_every_n=1000):

        # Obtain original x, y, and w arrays and shuffle
        X, y, w, ids = self.__randomize_arrays(dataset.to_numpy())
        X_train, y_train, w_train, ids_train, X_test, y_test, w_test, ids_test = self.__split(
            X, y, w, ids, frac_train)

        # calculate percent split for valid (out of test and valid)
        valid_percentage = frac_valid / (frac_valid + frac_test)
        # split test data into valid and test, treating sub test set also as sparse
        X_valid, y_valid, w_valid, ids_valid, X_test, y_test, w_test, ids_test = self.__split(
            X_test, y_test, w_test, ids_test, valid_percentage)

        # turn back into dataset objects
        train_data = Dataset.from_numpy(train_dir, X_train, y_train, w_train,
                                        ids_train)
        valid_data = Dataset.from_numpy(valid_dir, X_valid, y_valid, w_valid,
                                        ids_valid)
        test_data = Dataset.from_numpy(test_dir, X_test, y_test, w_test,
                                       ids_test)
        return train_data, valid_data, test_data
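
A minimal numpy-only sketch of the same two-stage split arithmetic (hypothetical array names, no Dataset objects): after the train cut, the remainder is split again using frac_valid / (frac_valid + frac_test) as the fraction that becomes the validation set.

import numpy as np

X = np.random.rand(100, 8)
frac_train, frac_valid, frac_test = .8, .1, .1

# First cut: train vs. the rest.
n_train = int(frac_train * len(X))
X_train, X_rest = X[:n_train], X[n_train:]

# Second cut: the rest is split into valid and test.
valid_fraction = frac_valid / (frac_valid + frac_test)
n_valid = int(valid_fraction * len(X_rest))
X_valid, X_test = X_rest[:n_valid], X_rest[n_valid:]

assert len(X_train) + len(X_valid) + len(X_test) == len(X)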
Example #2
    def test_move_load(self):
        """Test that datasets can be moved and loaded."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        data_dir = os.path.join(self.base_dir, "data")
        moved_data_dir = os.path.join(self.base_dir, "moved_data")
        dataset_file = os.path.join(current_dir,
                                    "../../models/tests/example.csv")

        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        loader = DataLoader(tasks=tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir)

        X, y, w, ids = dataset.to_numpy()
        shutil.move(data_dir, moved_data_dir)

        moved_dataset = Dataset(moved_data_dir, reload=True)

        X_moved, y_moved, w_moved, ids_moved = moved_dataset.to_numpy()

        np.testing.assert_allclose(X, X_moved)
        np.testing.assert_allclose(y, y_moved)
        np.testing.assert_allclose(w, w_moved)
        np.testing.assert_array_equal(ids, ids_moved)
Example #3
  def test_move_load(self):
    """Test that datasets can be moved and loaded."""
    verbosity = "high"
    current_dir = os.path.dirname(os.path.realpath(__file__))
    data_dir = os.path.join(self.base_dir, "data")
    moved_data_dir = os.path.join(self.base_dir, "moved_data")
    dataset_file = os.path.join(
        current_dir, "../../models/tests/example.csv")

    featurizer = CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    loader = DataLoader(tasks=tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir)

    X, y, w, ids = dataset.to_numpy()
    shutil.move(data_dir, moved_data_dir)

    moved_dataset = Dataset(
        moved_data_dir, reload=True)

    X_moved, y_moved, w_moved, ids_moved = moved_dataset.to_numpy()

    np.testing.assert_allclose(X, X_moved)
    np.testing.assert_allclose(y, y_moved)
    np.testing.assert_allclose(w, w_moved)
    np.testing.assert_array_equal(ids, ids_moved)
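
The same move-and-reload check, sketched with plain numpy files instead of the Dataset class (hypothetical paths; the real Dataset directory layout is not shown here):

import os, shutil, tempfile
import numpy as np

base_dir = tempfile.mkdtemp()
data_dir = os.path.join(base_dir, "data")
moved_data_dir = os.path.join(base_dir, "moved_data")
os.makedirs(data_dir)

X = np.random.rand(10, 4)
np.save(os.path.join(data_dir, "X.npy"), X)

# Move the whole directory and reload from its new location.
shutil.move(data_dir, moved_data_dir)
X_moved = np.load(os.path.join(moved_data_dir, "X.npy"))

np.testing.assert_allclose(X, X_moved)
shutil.rmtree(base_dir)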
Example #4
    def test_sklearn_regression(self):
        """Test that sklearn models can learn on simple regression datasets."""
        np.random.seed(123)
        dataset = sklearn.datasets.load_diabetes()
        X, y = dataset.data, dataset.target

        frac_train = .7
        n_samples = len(X)

        n_train = int(frac_train * n_samples)
        X_train, y_train = X[:n_train], y[:n_train]
        X_test, y_test = X[n_train:], y[n_train:]

        train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
        test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

        tasks = train_dataset.get_task_names()
        task_types = {task: "regression" for task in tasks}

        model_params = {
            "batch_size": None,
            "data_shape": train_dataset.get_data_shape()
        }

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        model = SklearnModel(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             mode="regression",
                             model_instance=LinearRegression())

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        transformers = []
        train_evaluator = Evaluator(model,
                                    train_dataset,
                                    transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [regression_metric])
        print("train_scores")
        print(train_scores)

        # Eval model on test
        transformers = []
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])
        print("scores")
        print(scores)

        assert scores[regression_metric.name] > .5
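
For comparison, a sketch of the same fit-then-score workflow written directly against scikit-learn, without the Dataset/Evaluator wrappers (illustrative only; the split and metric mirror the test above):

import numpy as np
import sklearn.datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

np.random.seed(123)
data = sklearn.datasets.load_diabetes()
X, y = data.data, data.target

n_train = int(.7 * len(X))
X_train, y_train = X[:n_train], y[:n_train]
X_test, y_test = X[n_train:], y[n_train:]

model = LinearRegression()
model.fit(X_train, y_train)
print("test r2:", r2_score(y_test, model.predict(X_test)))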
Example #5
  def test_singletask_to_multitask_classification(self):
    splittype = "scaffold"
    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = []
    output_transformers = []
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: "classification" for task in tasks}
    input_file = "multitask_example.csv"

    n_features = 10
    n_tasks = len(tasks)
    # Define train dataset
    n_train = 100
    X_train = np.random.rand(n_train, n_features)
    y_train = np.random.randint(2, size=(n_train, n_tasks))
    w_train = np.ones_like(y_train)
    ids_train = ["C"] * n_train
    train_dataset = Dataset.from_numpy(self.train_dir,
                                       X_train, y_train, w_train, ids_train,
                                       tasks)

    # Define test dataset
    n_test = 10
    X_test = np.random.rand(n_test, n_features)
    y_test = np.random.randint(2, size=(n_test, n_tasks))
    w_test = np.ones_like(y_test)
    ids_test = ["C"] * n_test
    test_dataset = Dataset.from_numpy(self.test_dir,
                                      X_test, y_test, w_test, ids_test,
                                      tasks)

    params_dict = {
        "batch_size": 32,
        "data_shape": train_dataset.get_data_shape()
    }
    classification_metrics = [Metric(metrics.roc_auc_score)]
    def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
      return SklearnModel(tasks, task_types, model_params, model_dir,
                          model_instance=LogisticRegression())
    multitask_model = SingletaskToMultitask(tasks, task_types, params_dict,
                                            self.model_dir, model_builder)

    # Fit trained model
    multitask_model.fit(train_dataset)
    multitask_model.save()

    # Eval multitask_model on train
    evaluator = Evaluator(multitask_model, train_dataset, output_transformers,
                          verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval multitask_model on test
    evaluator = Evaluator(multitask_model, test_dataset, output_transformers,
                          verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
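
The idea behind SingletaskToMultitask, sketched with plain scikit-learn: fit one classifier per task column and stack the per-task predictions back into a multitask matrix (hypothetical shapes; no persistence or per-task weighting).

import numpy as np
from sklearn.linear_model import LogisticRegression

n_samples, n_features, n_tasks = 100, 10, 17
X = np.random.rand(n_samples, n_features)
Y = np.random.randint(2, size=(n_samples, n_tasks))

# One independent singletask model per column of Y.
models = [LogisticRegression().fit(X, Y[:, t]) for t in range(n_tasks)]

# Stack per-task predictions back into a multitask matrix.
Y_pred = np.column_stack([m.predict(X) for m in models])
assert Y_pred.shape == Y.shape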
Example #6
  def test_sklearn_multitask_classification(self):
    """Test that sklearn models can learn on simple multitask classification."""
    np.random.seed(123)
    n_tasks = 4
    dataset = sklearn.datasets.load_digits(n_class=2)
    X, y = dataset.data, dataset.target
    y = np.reshape(y, (len(y), 1))
    y = np.hstack([y] * n_tasks)
    
    frac_train = .7
    n_samples = len(X)
    
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    tasks = train_dataset.get_task_names()
    task_types = {task: "classification" for task in tasks}

    model_params = {
      "batch_size": None,
      "data_shape": train_dataset.get_data_shape()
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
      return SklearnModel(tasks, task_types, model_params, model_dir,
                          mode="classification",
                          model_instance=LogisticRegression(),
                          verbosity=verbosity)
    model = SingletaskToMultitask(tasks, task_types, model_params, self.model_dir,
                                  model_builder, verbosity=verbosity)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance([classification_metric])
    print("train_scores")
    print(train_scores)

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    print("scores")
    print(scores)

    for score in scores[classification_metric.name]:
      assert score > .5
Example #7
  def test_singletask_to_multitask_sklearn_hyperparam_opt(self):
    """Test of hyperparam_opt with singletask_to_multitask."""
    splittype = "scaffold"
    output_transformers = []
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: "classification" for task in tasks}
    input_file = "multitask_example.csv"
      
    n_features = 10
    n_tasks = len(tasks)
    # Define train dataset
    n_train = 100
    X_train = np.random.rand(n_train, n_features)
    y_train = np.random.randint(2, size=(n_train, n_tasks))
    w_train = np.ones_like(y_train)
    ids_train = ["C"] * n_train
    train_dataset = Dataset.from_numpy(self.train_dir,
                                       X_train, y_train, w_train, ids_train,
                                       tasks)

    # Define validation dataset
    n_valid = 10
    X_valid = np.random.rand(n_valid, n_features)
    y_valid = np.random.randint(2, size=(n_valid, n_tasks))
    w_valid = np.ones_like(y_valid)
    ids_valid = ["C"] * n_valid
    valid_dataset = Dataset.from_numpy(self.valid_dir,
                                       X_valid, y_valid, w_valid, ids_valid,
                                       tasks)
    params_dict = {
        "batch_size": [32],
        "data_shape": [train_dataset.get_data_shape()],
    }
    classification_metric = Metric(metrics.matthews_corrcoef, np.mean,
                                   mode="classification")
    def model_builder(tasks, task_types, model_params, task_model_dir,
                      verbosity=None):
      return SklearnModel(tasks, task_types, model_params, task_model_dir,
                          model_instance=LogisticRegression())
    def multitask_model_builder(tasks, task_types, params_dict, logdir=None,
                                verbosity=None):
      return SingletaskToMultitask(tasks, task_types, params_dict,
                                   self.model_dir, model_builder)

    optimizer = HyperparamOpt(multitask_model_builder, tasks, task_types,
                              verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      classification_metric, logdir=None)
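
hyperparam_search above iterates over the list-valued entries of params_dict; a minimal sketch of that grid-enumeration pattern, with a hypothetical stand-in for "build, fit, and score a model":

import itertools

params_dict = {"batch_size": [16, 32], "learning_rate": [1e-3, 1e-2]}

def score_model(params):
    # Placeholder for "build model with params, fit on train, score on valid".
    return -abs(params["learning_rate"] - 1e-3) - abs(params["batch_size"] - 32)

keys = sorted(params_dict)
best_params, best_score = None, float("-inf")
for values in itertools.product(*(params_dict[k] for k in keys)):
    params = dict(zip(keys, values))
    score = score_model(params)
    if score > best_score:
        best_params, best_score = params, score

print(best_params, best_score)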
Example #8
  def test_sklearn_classification(self):
    """Test that sklearn models can learn on simple classification datasets."""
    np.random.seed(123)
    dataset = sklearn.datasets.load_digits(n_class=2)
    X, y = dataset.data, dataset.target

    frac_train = .7
    n_samples = len(X)
    
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    print("X_train.shape, y_train.shape, X_test.shape, y_test.shape")
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    tasks = train_dataset.get_task_names()
    task_types = {task: "classification" for task in tasks}

    model_params = {
      "batch_size": None,
      "data_shape": train_dataset.get_data_shape()
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="classification",
                         model_instance=LogisticRegression())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance([classification_metric])
    print("train_scores")
    print(train_scores)

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    print("scores")
    print(scores)

    assert scores[classification_metric.name] > .5
Example #9
  def test_samples_move(self):
    """Test that featurized samples can be moved and reloaded."""
    verbosity = "high"
    data_dir = os.path.join(self.base_dir, "data")
    moved_data_dir = os.path.join(self.base_dir, "moved_data")
    dataset_file = os.path.join(
        self.current_dir, "example.csv")

    featurizer = CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    loader = DataLoader(tasks=tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    featurized_dataset = loader.featurize(
        dataset_file, data_dir)
    n_dataset = len(featurized_dataset)
  
    # Now perform move
    shutil.move(data_dir, moved_data_dir)

    moved_featurized_dataset = Dataset(
        data_dir=moved_data_dir, reload=True)

    assert len(moved_featurized_dataset) == n_dataset
Example #10
  def test_to_singletask(self):
    """Test that to_singletask works."""
    num_datapoints = 100
    num_features = 10
    num_tasks = 10
    # Generate data
    X = np.random.rand(num_datapoints, num_features)
    y = np.random.randint(2, size=(num_datapoints, num_tasks))
    w = np.random.randint(2, size=(num_datapoints, num_tasks))
    ids = np.array(["id"] * num_datapoints)
    
    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")

    task_dirs = []
    try:
      for task in range(num_tasks):
        task_dirs.append(tempfile.mkdtemp())
      singletask_datasets = dataset.to_singletask(task_dirs)
      for task in range(num_tasks):
        singletask_dataset = singletask_datasets[task]
        X_task, y_task, w_task, ids_task = singletask_dataset.to_numpy()
        w_nonzero = w[:, task] != 0
        np.testing.assert_array_equal(X_task, X[w_nonzero])
        np.testing.assert_array_equal(y_task.flatten(), y[:, task][w_nonzero])
        np.testing.assert_array_equal(w_task.flatten(), w[:, task][w_nonzero])
        np.testing.assert_array_equal(ids_task, ids[w_nonzero])
    finally:
      # Cleanup
      for task_dir in task_dirs:
        shutil.rmtree(task_dir)
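
What to_singletask is expected to do, expressed directly in numpy: for each task, keep only the rows whose weight is nonzero (hypothetical arrays).

import numpy as np

X = np.random.rand(100, 10)
y = np.random.randint(2, size=(100, 10))
w = np.random.randint(2, size=(100, 10))

singletask = []
for task in range(y.shape[1]):
    keep = w[:, task] != 0
    singletask.append((X[keep], y[keep, task], w[keep, task]))

X0, y0, w0 = singletask[0]
assert len(X0) == len(y0) == len(w0) == int((w[:, 0] != 0).sum())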
Example #11
    def test_shuffle_each_shard(self):
        """Test that shuffle_each_shard works."""
        n_samples = 100
        n_tasks = 10
        n_features = 10

        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.random.randint(2, size=(n_samples, n_tasks))
        ids = np.arange(n_samples)
        dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids)
        dataset.reshard(shard_size=10)

        dataset.shuffle_each_shard()
        X_s, y_s, w_s, ids_s = dataset.to_numpy()
        assert X_s.shape == X.shape
        assert y_s.shape == y.shape
        assert ids_s.shape == ids.shape
        assert w_s.shape == w.shape

        # The ids should now store the performed permutation. Check that the
        # original dataset is recoverable.
        for i in range(n_samples):
            np.testing.assert_array_equal(X_s[i], X[ids_s[i]])
            np.testing.assert_array_equal(y_s[i], y[ids_s[i]])
            np.testing.assert_array_equal(w_s[i], w[ids_s[i]])
            np.testing.assert_array_equal(ids_s[i], ids[ids_s[i]])
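
The recoverability check relies on ids being 0..n-1 before shuffling, so the shuffled ids are themselves the permutation; a short numpy sketch of that invariant (hypothetical arrays):

import numpy as np

n = 10
X = np.random.rand(n, 3)
ids = np.arange(n)

perm = np.random.permutation(n)
X_s, ids_s = X[perm], ids[perm]

# ids_s records where each shuffled row came from, so the original is recoverable.
np.testing.assert_array_equal(X_s, X[ids_s])
np.testing.assert_array_equal(X, X_s[np.argsort(ids_s)])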
Example #12
  def test_get_shape(self):
    """Test that get_shape works."""
    num_datapoints = 100
    num_features = 10
    num_tasks = 10
    # Generate data
    X = np.random.rand(num_datapoints, num_features)
    y = np.random.randint(2, size=(num_datapoints, num_tasks))
    w = np.random.randint(2, size=(num_datapoints, num_tasks))
    ids = np.array(["id"] * num_datapoints)
    
    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")

    X_shape, y_shape, w_shape, ids_shape = dataset.get_shape()
    print("type(X_shape), type(y_shape), type(w_shape), type(ids_shape)")
    print(type(X_shape), type(y_shape), type(w_shape), type(ids_shape))
    print("type(X.shape), type(y.shape), type(w.shape), type(ids.shape)")
    print(type(X.shape), type(y.shape), type(w.shape), type(ids.shape))
    print("X_shape, y_shape, w_shape, ids_shape")
    print(X_shape, y_shape, w_shape, ids_shape)
    print("X.shape, y.shape, w.shape, ids.shape")
    print(X.shape, y.shape, w.shape, ids.shape)
    assert X_shape == X.shape
    assert y_shape == y.shape
    assert w_shape == w.shape
    assert ids_shape == ids.shape
Example #13
  def test_shuffle_shards(self):
    """Test that shuffle_shards works."""
    n_samples = 100
    n_tasks = 10
    n_features = 10

    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(2, size=(n_samples, n_tasks))
    w = np.random.randint(2, size=(n_samples, n_tasks))
    ids = np.arange(n_samples)
    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids)
    dataset.reshard(shard_size=10)
    dataset.shuffle_shards()

    X_s, y_s, w_s, ids_s = dataset.to_numpy()

    assert X_s.shape == X.shape
    assert y_s.shape == y.shape
    assert ids_s.shape == ids.shape
    assert w_s.shape == w.shape

    # The ids should now store the performed permutation. Check that the
    # original dataset is recoverable.
    for i in range(n_samples):
      np.testing.assert_array_equal(X_s[i], X[ids_s[i]])
      np.testing.assert_array_equal(y_s[i], y[ids_s[i]])
      np.testing.assert_array_equal(w_s[i], w[ids_s[i]])
      np.testing.assert_array_equal(ids_s[i], ids[ids_s[i]])
Example #14
  def test_to_singletask(self):
    """Test that to_singletask works."""
    num_datapoints = 100
    num_features = 10
    num_tasks = 10
    # Generate data
    X = np.random.rand(num_datapoints, num_features)
    y = np.random.randint(2, size=(num_datapoints, num_tasks))
    w = np.random.randint(2, size=(num_datapoints, num_tasks))
    ids = np.array(["id"] * num_datapoints)
    
    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")

    task_dirs = []
    try:
      for task in range(num_tasks):
        task_dirs.append(tempfile.mkdtemp())
      singletask_datasets = dataset.to_singletask(task_dirs)
      for task in range(num_tasks):
        singletask_dataset = singletask_datasets[task]
        X_task, y_task, w_task, ids_task = singletask_dataset.to_numpy()
        w_nonzero = w[:, task] != 0
        np.testing.assert_array_equal(X_task, X[w_nonzero])
        np.testing.assert_array_equal(y_task.flatten(), y[:, task][w_nonzero])
        np.testing.assert_array_equal(w_task.flatten(), w[:, task][w_nonzero])
        np.testing.assert_array_equal(ids_task, ids[w_nonzero])
    finally:
      # Cleanup
      for task_dir in task_dirs:
        shutil.rmtree(task_dir)
Example #15
    def test_tf_skewed_classification_overfit(self):
        """Test tensorflow models can overfit 0/1 datasets with few actives."""
        tasks = ["task0"]
        task_types = {task: "classification" for task in tasks}
        n_samples = 100
        n_features = 3
        n_tasks = len(tasks)
        n_classes = 2

        # Generate dummy dataset
        np.random.seed(123)
        p = .05
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.binomial(1, p, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "layer_sizes": [1500],
            "dropouts": [.0],
            "learning_rate": 0.003,
            "momentum": .9,
            "batch_size": n_samples,
            "num_classification_tasks": 1,
            "num_classes": n_classes,
            "num_features": n_features,
            "weight_init_stddevs": [1.],
            "bias_init_consts": [1.],
            "nb_epoch": 200,
            "penalty": 0.0,
            "optimizer": "adam",
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)
        model = TensorflowModel(tasks,
                                task_types,
                                model_params,
                                self.model_dir,
                                tf_class=TensorflowMultiTaskClassifier,
                                verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .8
Example #16
    def test_sklearn_multitask_classification_overfit(self):
        """Test SKLearn singletask-to-multitask overfits tiny data."""
        n_tasks = 10
        tasks = ["task%d" % task for task in range(n_tasks)]
        task_types = {task: "classification" for task in tasks}
        n_samples = 10
        n_features = 3

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "batch_size": None,
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)

        def model_builder(tasks,
                          task_types,
                          model_params,
                          model_dir,
                          verbosity=None):
            return SklearnModel(tasks,
                                task_types,
                                model_params,
                                model_dir,
                                mode="classification",
                                model_instance=RandomForestClassifier(),
                                verbosity=verbosity)

        model = SingletaskToMultitask(tasks,
                                      task_types,
                                      model_params,
                                      self.model_dir,
                                      model_builder,
                                      verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .9
Example #17
def load_pdbbind(pdbbind_dir, base_dir, reload=True):
    """Load PDBBind datasets. Does not do train/test split"""
    # Set some global variables up top
    reload = True
    verbosity = "high"
    model = "logistic"
    regen = False

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")

    # Load PDBBind dataset
    labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
    pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
    tasks = ["-logKd/Ki"]
    print("About to load contents.")
    contents_df = load_pdbbind_labels(labels_file)
    ids = contents_df["PDB code"].values
    y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

    # Define featurizers
    grid_featurizer = GridFeaturizer(voxel_width=16.0,
                                     feature_types="voxel_combined",
                                     voxel_feature_types=[
                                         "ecfp", "splif", "hbond", "pi_stack",
                                         "cation_pi", "salt_bridge"
                                     ],
                                     ecfp_power=9,
                                     splif_power=9,
                                     parallel=True,
                                     flatten=True)
    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = [grid_featurizer]

    # Featurize Dataset
    features = []
    for pdb_code in ids:
        pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
        computed_feature = compute_pdbbind_feature(compound_featurizers,
                                                   complex_featurizers,
                                                   pdb_subdir, pdb_code)
        if len(computed_feature) == 0:
            computed_feature = np.zeros(1024)
        features.append(computed_feature)
    X = np.vstack(features)
    w = np.ones_like(y)

    dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
    transformers = []

    return tasks, dataset, transformers
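
The featurization loop above follows a common pattern: compute a feature vector per id, fall back to a zero vector when featurization fails, then stack the rows into a matrix. A standalone sketch with a hypothetical featurize function:

import numpy as np

N_FEATURES = 1024

def featurize(pdb_code):
    # Hypothetical stand-in: real code would featurize the protein-ligand complex.
    if pdb_code.startswith("bad"):
        return np.array([])  # simulate a featurization failure
    return np.random.rand(N_FEATURES)

ids = ["1abc", "bad1", "2xyz"]
features = []
for pdb_code in ids:
    computed = featurize(pdb_code)
    if len(computed) == 0:
        computed = np.zeros(N_FEATURES)  # fallback keeps rows aligned with ids
    features.append(computed)

X = np.vstack(features)
assert X.shape == (len(ids), N_FEATURES)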
Example #18
    def test_tf_multitask_classification_overfit(self):
        """Test tf multitask overfits tiny data."""
        n_tasks = 10
        tasks = ["task%d" % task for task in range(n_tasks)]
        task_types = {task: "classification" for task in tasks}
        n_samples = 10
        n_features = 3
        n_classes = 2

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        #y = np.random.randint(n_classes, size=(n_samples, n_tasks))
        y = np.zeros((n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "layer_sizes": [1000],
            "dropouts": [.0],
            "learning_rate": 0.0003,
            "momentum": .9,
            "batch_size": n_samples,
            "num_classification_tasks": n_tasks,
            "num_classes": n_classes,
            "num_features": n_features,
            "weight_init_stddevs": [.1],
            "bias_init_consts": [1.],
            "nb_epoch": 100,
            "penalty": 0.0,
            "optimizer": "adam",
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.accuracy_score,
                                       verbosity=verbosity)
        model = TensorflowModel(tasks,
                                task_types,
                                model_params,
                                self.model_dir,
                                tf_class=TensorflowMultiTaskClassifier,
                                verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .9
Example #19
    def test_tf_reload(self):
        """Test that tensorflow models can overfit simple classification datasets."""
        tasks = ["task0"]
        task_types = {task: "classification" for task in tasks}
        n_samples = 10
        n_features = 3
        n_tasks = len(tasks)
        n_classes = 2

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(n_classes, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "layer_sizes": [1000],
            "dropouts": [0.0],
            "learning_rate": 0.003,
            "momentum": 0.9,
            "batch_size": n_samples,
            "num_classification_tasks": 1,
            "num_classes": n_classes,
            "num_features": n_features,
            "weight_init_stddevs": [1.0],
            "bias_init_consts": [1.0],
            "nb_epoch": 100,
            "penalty": 0.0,
            "optimizer": "adam",
            "data_shape": dataset.get_data_shape(),
        }

        verbosity = "high"
        classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
        model = TensorflowModel(
            tasks, task_types, model_params, self.model_dir, tf_class=TensorflowMultiTaskClassifier, verbosity=verbosity
        )

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Load trained model
        reloaded_model = TensorflowModel(
            tasks, task_types, model_params, self.model_dir, tf_class=TensorflowMultiTaskClassifier, verbosity=verbosity
        )
        reloaded_model.reload()
        assert reloaded_model.eval_model._restored_model

        # Eval model on train
        transformers = []
        evaluator = Evaluator(reloaded_model, dataset, transformers, verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > 0.9
Example #20
    def test_tf_regression_overfit(self):
        """Test that TensorFlow models can overfit simple regression datasets."""
        tasks = ["task0"]
        task_types = {task: "regression" for task in tasks}
        n_samples = 10
        n_features = 3
        n_tasks = len(tasks)

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.zeros((n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "layer_sizes": [1000],
            "dropouts": [.0],
            "learning_rate": 0.003,
            "momentum": .9,
            "batch_size": n_samples,
            "num_regression_tasks": 1,
            "num_features": n_features,
            "weight_init_stddevs": [np.sqrt(6) / np.sqrt(1000)],
            "bias_init_consts": [1.],
            "nb_epoch": 100,
            "penalty": 0.0,
            "optimizer": "momentum",
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        regression_metric = Metric(metrics.mean_squared_error,
                                   verbosity=verbosity)
        model = TensorflowModel(tasks,
                                task_types,
                                model_params,
                                self.model_dir,
                                tf_class=TensorflowMultiTaskRegressor,
                                verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])

        assert scores[regression_metric.name] < .1
Example #21
    def test_keras_skewed_classification_overfit(self):
        """Test keras models can overfit 0/1 datasets with few actives."""
        tasks = ["task0"]
        task_types = {task: "classification" for task in tasks}
        n_samples = 100
        n_features = 3
        n_tasks = len(tasks)

        # Generate dummy dataset
        np.random.seed(123)
        p = .05
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.binomial(1, p, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "nb_hidden": 1000,
            "activation": "relu",
            "dropout": .0,
            "learning_rate": .15,
            "momentum": .9,
            "nesterov": False,
            "decay": 1e-4,
            "batch_size": n_samples,
            "nb_epoch": 200,
            "init": "glorot_uniform",
            "nb_layers": 1,
            "batchnorm": False,
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)
        model = MultiTaskDNN(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .9
Example #22
    def test_keras_reload(self):
        """Test that trained keras models can be reloaded correctly."""
        g = tf.Graph()
        sess = tf.Session(graph=g)
        K.set_session(sess)
        with g.as_default():
            tasks = ["task0"]
            task_types = {task: "classification" for task in tasks}
            n_samples = 10
            n_features = 3
            n_tasks = len(tasks)

            # Generate dummy dataset
            np.random.seed(123)
            ids = np.arange(n_samples)
            X = np.random.rand(n_samples, n_features)
            y = np.random.randint(2, size=(n_samples, n_tasks))
            w = np.ones((n_samples, n_tasks))

            dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

            model_params = {
                "nb_hidden": 1000,
                "activation": "relu",
                "dropout": 0.0,
                "learning_rate": 0.15,
                "momentum": 0.9,
                "nesterov": False,
                "decay": 1e-4,
                "batch_size": n_samples,
                "nb_epoch": 200,
                "init": "glorot_uniform",
                "nb_layers": 1,
                "batchnorm": False,
                "data_shape": dataset.get_data_shape(),
            }

            verbosity = "high"
            classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
            model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir, verbosity=verbosity)

            # Fit trained model
            model.fit(dataset)
            model.save()

            # Load trained model
            reloaded_model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir, verbosity=verbosity)
            reloaded_model.reload()

            # Eval model on train
            transformers = []
            evaluator = Evaluator(reloaded_model, dataset, transformers, verbosity=verbosity)
            scores = evaluator.compute_model_performance([classification_metric])

            assert scores[classification_metric.name] > 0.9
Example #23
  def test_tf_skewed_classification_overfit(self):
    """Test tensorflow models can overfit 0/1 datasets with few actives."""
    tasks = ["task0"]
    task_types = {task: "classification" for task in tasks}
    n_samples = 100
    n_features = 3
    n_tasks = len(tasks)
    n_classes = 2
    
    # Generate dummy dataset
    np.random.seed(123)
    p = .05
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
  
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {
      "layer_sizes": [1500],
      "dropouts": [.0],
      "learning_rate": 0.003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_classification_tasks": 1,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [1.],
      "bias_init_consts": [1.],
      "nb_epoch": 200,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = TensorflowModel(
        tasks, task_types, model_params, self.model_dir,
        tf_class=TensorflowMultiTaskClassifier,
        verbosity=verbosity)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .8
Example #24
  def train_valid_test_split(self, dataset, train_dir,
                             valid_dir, test_dir, frac_train=.8,
                             frac_valid=.1, frac_test=.1, seed=None,
                             log_every_n=1000):

    # Obtain original x, y, and w arrays and shuffle
    X, y, w, ids = self.__randomize_arrays(dataset.to_numpy())
    X_train, y_train, w_train, ids_train, X_test, y_test, w_test, ids_test = self.__split(X, y, w, ids, frac_train)

    # calculate percent split for valid (out of test and valid)
    valid_percentage = frac_valid / (frac_valid + frac_test)
    # split test data into valid and test, treating sub test set also as sparse
    X_valid, y_valid, w_valid, ids_valid, X_test, y_test, w_test, ids_test = self.__split(X_test, y_test, w_test,
                                                                                          ids_test, valid_percentage)

    # turn back into dataset objects
    train_data = Dataset.from_numpy(train_dir, X_train, y_train, w_train, ids_train)
    valid_data = Dataset.from_numpy(valid_dir, X_valid, y_valid, w_valid, ids_valid)
    test_data = Dataset.from_numpy(test_dir, X_test, y_test, w_test, ids_test)
    return train_data, valid_data, test_data
Example #25
def load_muv(base_dir, reload=True):
  """Load MUV datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load MUV dataset
  print("About to load MUV dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/muv.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize MUV dataset
  print("About to featurize MUV dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
                          'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
                          'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
                          'MUV-466', 'MUV-832'])

  loader = DataLoader(tasks=all_MUV_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)
  
  return all_MUV_tasks, dataset, transformers
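
load_muv uses a simple cache-on-disk pattern: featurize only when the data directory has not been populated yet, otherwise reload the existing featurization. A minimal sketch of the same pattern with numpy files (hypothetical paths and a stand-in featurization step):

import os
import tempfile
import numpy as np

def load_features(data_dir, reload=True):
    cache = os.path.join(data_dir, "X.npy")
    if reload and os.path.exists(cache):
        return np.load(cache)          # reload previously featurized data
    os.makedirs(data_dir, exist_ok=True)
    X = np.random.rand(100, 1024)      # stand-in for the real featurization step
    np.save(cache, X)
    return X

data_dir = os.path.join(tempfile.gettempdir(), "muv_features_demo")
X = load_features(data_dir)
assert X.shape == (100, 1024)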
Example #26
    def test_keras_multitask_regression_overfit(self):
        """Test keras multitask overfits tiny data."""
        n_tasks = 10
        tasks = ["task%d" % task for task in range(n_tasks)]
        task_types = {task: "regression" for task in tasks}
        n_samples = 10
        n_features = 3

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "nb_hidden": 1000,
            "activation": "relu",
            "dropout": .0,
            "learning_rate": .15,
            "momentum": .9,
            "nesterov": False,
            "decay": 1e-4,
            "batch_size": n_samples,
            "nb_epoch": 200,
            "init": "glorot_uniform",
            "nb_layers": 1,
            "batchnorm": False,
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        model = MultiTaskDNN(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])

        assert scores[regression_metric.name] > .9
Example #27
    def test_sklearn_reload(self):
        """Test that trained model can be reloaded correctly."""
        tasks = ["task0"]
        task_types = {task: "classification" for task in tasks}
        n_samples = 10
        n_features = 3
        n_tasks = len(tasks)

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "batch_size": None,
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)
        model = SklearnModel(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             mode="classification",
                             model_instance=RandomForestClassifier())

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Load trained model
        reloaded_model = SklearnModel(tasks,
                                      task_types,
                                      model_params,
                                      self.model_dir,
                                      mode="classification")
        reloaded_model.reload()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(reloaded_model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .9
Example #28
  def test_tf_multitask_regression_overfit(self):
    """Test tf multitask overfits tiny data."""
    n_tasks = 10
    tasks = ["task%d" % task for task in range(n_tasks)]
    task_types = {task: "regression" for task in tasks}
    n_samples = 10
    n_features = 3
    n_classes = 2
    
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    #y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    y = np.zeros((n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
  
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {
      "layer_sizes": [1000],
      "dropouts": [.0],
      "learning_rate": 0.0003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_regression_tasks": n_tasks,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [.1],
      "bias_init_consts": [1.],
      "nb_epoch": 100,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
    }

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = TensorflowModel(
        tasks, task_types, model_params, self.model_dir,
        tf_class=TensorflowMultiTaskRegressor,
        verbosity=verbosity)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] > .9
Example #29
def load_tox21(base_dir, reload=True):
  """Load Tox21 datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  samples_dir = os.path.join(base_dir, "samples")
  data_dir = os.path.join(base_dir, "dataset")

  # Load Tox21 dataset
  print("About to load Tox21 dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/tox21.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
                     'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
                     'SR-HSE', 'SR-MMP', 'SR-p53']

  if not reload or not os.path.exists(data_dir):
    loader = DataLoader(tasks=all_tox21_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(
        dataset_file, data_dir, shard_size=8192)
  else:
    dataset = Dataset(data_dir, all_tox21_tasks, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  if not reload:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)
  
  return all_tox21_tasks, dataset, transformers
Example #30
  def test_keras_multitask_regression_overfit(self):
    """Test keras multitask overfits tiny data."""
    g = tf.Graph()
    sess = tf.Session(graph=g)
    K.set_session(sess)
    with g.as_default():
      n_tasks = 10
      tasks = ["task%d" % task for task in range(n_tasks)]
      task_types = {task: "regression" for task in tasks}
      n_samples = 10
      n_features = 3
      
      # Generate dummy dataset
      np.random.seed(123)
      ids = np.arange(n_samples)
      X = np.random.rand(n_samples, n_features)
      y = np.random.randint(2, size=(n_samples, n_tasks))
      w = np.ones((n_samples, n_tasks))
    
      dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

      model_params = {
          "nb_hidden": 1000,
          "activation": "relu",
          "dropout": .0,
          "learning_rate": .15,
          "momentum": .9,
          "nesterov": False,
          "decay": 1e-4,
          "batch_size": n_samples,
          "nb_epoch": 200,
          "init": "glorot_uniform",
          "nb_layers": 1,
          "batchnorm": False,
          "data_shape": dataset.get_data_shape()
      }

      verbosity = "high"
      regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
      model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir,
                           verbosity=verbosity)

      # Fit trained model
      model.fit(dataset)
      model.save()

      # Eval model on train
      transformers = []
      evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
      scores = evaluator.compute_model_performance([regression_metric])

      assert scores[regression_metric.name] > .9
Example #31
  def test_get_shape(self):
    """Test that get_shape works."""
    num_datapoints = 100
    num_features = 10
    num_tasks = 10
    # Generate data
    X = np.random.rand(num_datapoints, num_features)
    y = np.random.randint(2, size=(num_datapoints, num_tasks))
    w = np.random.randint(2, size=(num_datapoints, num_tasks))
    ids = np.array(["id"] * num_datapoints)
    
    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")

    X_shape, y_shape, w_shape, ids_shape = dataset.get_shape()
    assert X_shape == X.shape
    assert y_shape == y.shape
    assert w_shape == w.shape
    assert ids_shape == ids.shape
Example #32
    def test_multitask_data(self):
        """Test that data associated with a tasks stays associated with it."""
        tasks = ["task0", "task1"]
        n_samples = 100
        n_features = 3
        n_tasks = len(tasks)

        # Generate dummy dataset
        ids = np.array(["C"] * n_samples, dtype=object)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
        X_out, y_out, w_out, _ = dataset.to_numpy()
        np.testing.assert_allclose(X, X_out)
        np.testing.assert_allclose(y, y_out)
        np.testing.assert_allclose(w, w_out)
Example #33
  def test_multitask_data(self):
    """Test that data associated with a task stays associated with it."""
    tasks = ["task0", "task1"]
    n_samples = 100
    n_features = 3
    n_tasks = len(tasks)

    # Generate dummy dataset
    ids = np.array(["C"] * n_samples, dtype=object)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(2, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))

    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
    X_out, y_out, w_out, _ = dataset.to_numpy()
    np.testing.assert_allclose(X, X_out)
    np.testing.assert_allclose(y, y_out)
    np.testing.assert_allclose(w, w_out)
Example #34
    def test_sklearn_reload(self):
        """Test that trained model can be reloaded correctly."""
        tasks = ["task0"]
        task_types = {task: "classification" for task in tasks}
        n_samples = 10
        n_features = 3
        n_tasks = len(tasks)

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {"batch_size": None, "data_shape": dataset.get_data_shape()}

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
        model = SklearnModel(
            tasks,
            task_types,
            model_params,
            self.model_dir,
            mode="classification",
            model_instance=RandomForestClassifier(),
        )

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Load trained model
        reloaded_model = SklearnModel(tasks, task_types, model_params, self.model_dir, mode="classification")
        reloaded_model.reload()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(reloaded_model, dataset, transformers, verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > 0.9
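
A plain scikit-learn analogue of the save/reload round trip above, using joblib (not necessarily what SklearnModel.save does internally; shown only to illustrate the check that a reloaded model predicts identically):

import os
import tempfile
import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier

np.random.seed(123)
X = np.random.rand(10, 3)
y = np.random.randint(2, size=10)

model = RandomForestClassifier().fit(X, y)
path = os.path.join(tempfile.mkdtemp(), "model.joblib")
joblib.dump(model, path)

reloaded = joblib.load(path)
np.testing.assert_array_equal(model.predict(X), reloaded.predict(X))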
Example #35
    def test_sklearn_regression_overfit(self):
        """Test that sklearn models can overfit simple regression datasets."""
        tasks = ["task0"]
        task_types = {task: "regression" for task in tasks}
        n_samples = 10
        n_features = 3
        n_tasks = len(tasks)

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.rand(n_samples, n_tasks)
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "batch_size": None,
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        model = SklearnModel(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             mode="regression",
                             model_instance=RandomForestRegressor())

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])

        assert scores[regression_metric.name] > .7
Example No. 36
    def test_merge(self):
        """Test that datasets can be merged."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        first_data_dir = os.path.join(self.base_dir, "first_dataset")
        second_data_dir = os.path.join(self.base_dir, "second_dataset")
        merged_data_dir = os.path.join(self.base_dir, "merged_data")

        dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        loader = DataLoader(tasks=tasks, smiles_field="smiles", featurizer=featurizer, verbosity=verbosity)
        first_dataset = loader.featurize(dataset_file, first_data_dir)
        second_dataset = loader.featurize(dataset_file, second_data_dir)

        merged_dataset = Dataset.merge(merged_data_dir, [first_dataset, second_dataset])

        assert len(merged_dataset) == len(first_dataset) + len(second_dataset)
Example No. 37
  def test_sklearn_multitask_regression_overfit(self):
    """Test SKLearn singletask-to-multitask overfits tiny regression data."""
    n_tasks = 2
    tasks = ["task%d" % task for task in range(n_tasks)]
    task_types = {task: "regression" for task in tasks}
    n_samples = 10
    n_features = 3
    
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.rand(n_samples, n_tasks)
    w = np.ones((n_samples, n_tasks))

    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {
      "batch_size": None,
      "data_shape": dataset.get_data_shape()
    }

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
      return SklearnModel(tasks, task_types, model_params, model_dir,
                          mode="regression",
                          model_instance=RandomForestRegressor(),
                          verbosity=verbosity)
    model = SingletaskToMultitask(tasks, task_types, model_params, self.model_dir,
                                  model_builder, verbosity=verbosity)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] > .7
Example No. 38
  def test_select(self):
    """Test that dataset select works."""
    num_datapoints = 10
    num_features = 10
    num_tasks = 1
    X = np.random.rand(num_datapoints, num_features)
    y = np.random.randint(2, size=(num_datapoints, num_tasks))
    w = np.ones((num_datapoints, num_tasks))
    ids = np.array(["id"] * num_datapoints)
    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids)

    select_dir = tempfile.mkdtemp()
    indices = [0, 4, 5, 8]
    select_dataset = dataset.select(select_dir, indices)
    X_sel, y_sel, w_sel, ids_sel = select_dataset.to_numpy()
    np.testing.assert_array_equal(X[indices], X_sel)
    np.testing.assert_array_equal(y[indices], y_sel)
    np.testing.assert_array_equal(w[indices], w_sel)
    np.testing.assert_array_equal(ids[indices], ids_sel)
    shutil.rmtree(select_dir)
Example No. 40
def classify(document):
    doc = [line for line in document.strip().split('\r\n') if line]
    mol = [Chem.MolFromSmiles(x) for x in doc]
    # Drop SMILES that RDKit could not parse so the fingerprint rows stay
    # aligned with the column labels assigned to `yy` below.
    doc = [x for x, m in zip(doc, mol) if m is not None]
    fp = [
        AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024) for m in mol
        if m is not None
    ]
    fp = np.array(fp)
    if len(fp.shape) == 1:
        fp = np.reshape(fp, (1, -1))
    o = np.ones((fp.shape[0], len(kinase_tasks)))
    d = Dataset.from_numpy(data_dir, fp, o, tasks=kinase_tasks)
    y = np.squeeze(np.delete(model.predict_proba(d), 0, 2))
    if len(y.shape) == 1:
        y = np.reshape(y, (1, -1))
    yy = pd.DataFrame(y)
    yy.columns = kinase_tasks
    yy = yy.T
    yy.columns = doc
    yy.index.name = 'kinase'
    yy.to_csv(os.path.join(data_dir, 'pred.csv'))
    return doc, yy
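A minimal usage sketch for the classify() helper above, assuming the module-level model, data_dir, and kinase_tasks objects from this snippet are already loaded; the SMILES strings are placeholders.

# Hypothetical invocation; `model`, `data_dir`, and `kinase_tasks` are assumed
# to be defined at module level as in the snippet above.
smiles_document = "CCO\r\nc1ccccc1\r\nCC(=O)Oc1ccccc1C(=O)O"
parsed_smiles, predictions = classify(smiles_document)
print(predictions.head())  # rows are kinases, columns are the input SMILES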
Example No. 41
  def test_sklearn_skewed_classification_overfit(self):
    """Test sklearn models can overfit 0/1 datasets with few actives."""
    tasks = ["task0"]
    task_types = {task: "classification" for task in tasks}
    n_samples = 100
    n_features = 3
    n_tasks = len(tasks)
    
    # Generate dummy dataset
    np.random.seed(123)
    p = .05
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
  
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {
      "batch_size": None,
      "data_shape": dataset.get_data_shape()
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="classification",
                         model_instance=RandomForestClassifier())

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .9
Example No. 42
  def test_sklearn_regression_overfit(self):
    """Test that sklearn models can overfit simple regression datasets."""
    tasks = ["task0"]
    task_types = {task: "regression" for task in tasks}
    n_samples = 10
    n_features = 3
    n_tasks = len(tasks)
    
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.rand(n_samples, n_tasks)
    w = np.ones((n_samples, n_tasks))

    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {
      "batch_size": None,
      "data_shape": dataset.get_data_shape()
    }

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] > .7
Example No. 43
    def test_merge(self):
        """Test that datasets can be merged."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        first_data_dir = os.path.join(self.base_dir, "first_dataset")
        second_data_dir = os.path.join(self.base_dir, "second_dataset")
        merged_data_dir = os.path.join(self.base_dir, "merged_data")

        dataset_file = os.path.join(current_dir,
                                    "../../models/tests/example.csv")

        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        loader = DataLoader(tasks=tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        first_dataset = loader.featurize(dataset_file, first_data_dir)
        second_dataset = loader.featurize(dataset_file, second_data_dir)

        merged_dataset = Dataset.merge(merged_data_dir,
                                       [first_dataset, second_dataset])

        assert len(merged_dataset) == len(first_dataset) + len(second_dataset)
Example No. 44
    def test_singletask_to_multitask_classification(self):
        splittype = "scaffold"
        compound_featurizers = [CircularFingerprint(size=1024)]
        complex_featurizers = []
        output_transformers = []
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]
        task_types = {task: "classification" for task in tasks}
        input_file = "multitask_example.csv"

        n_features = 10
        n_tasks = len(tasks)
        # Define train dataset
        n_train = 100
        X_train = np.random.rand(n_train, n_features)
        y_train = np.random.randint(2, size=(n_train, n_tasks))
        w_train = np.ones_like(y_train)
        ids_train = ["C"] * n_train
        train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train,
                                           w_train, ids_train, tasks)

        # Define test dataset
        n_test = 10
        X_test = np.random.rand(n_test, n_features)
        y_test = np.random.randint(2, size=(n_test, n_tasks))
        w_test = np.ones_like(y_test)
        ids_test = ["C"] * n_test
        test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test,
                                          w_test, ids_test, tasks)

        params_dict = {
            "batch_size": 32,
            "data_shape": train_dataset.get_data_shape()
        }
        classification_metrics = [Metric(metrics.roc_auc_score)]

        def model_builder(tasks,
                          task_types,
                          model_params,
                          model_dir,
                          verbosity=None):
            return SklearnModel(tasks,
                                task_types,
                                model_params,
                                model_dir,
                                model_instance=LogisticRegression())

        multitask_model = SingletaskToMultitask(tasks, task_types, params_dict,
                                                self.model_dir, model_builder)

        # Fit trained model
        multitask_model.fit(train_dataset)
        multitask_model.save()

        # Eval multitask_model on train
        evaluator = Evaluator(multitask_model,
                              train_dataset,
                              output_transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)

        # Eval multitask_model on test
        evaluator = Evaluator(multitask_model,
                              test_dataset,
                              output_transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
Example No. 45
tox21_tasks, tox21_dataset, transformers = load_tox21(data_dir, reload=reload)
num_train = 7200
X, y, w, ids = tox21_dataset.to_numpy()
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train], y[num_train:]
w_train, w_valid = w[:num_train], w[num_train:]
ids_train, ids_valid = ids[:num_train], ids[num_train:]

# Not sure if we need to constantly delete these directories...
if os.path.exists(train_dir):
    shutil.rmtree(train_dir)
train_dataset = Dataset.from_numpy(train_dir,
                                   X_train,
                                   y_train,
                                   w_train,
                                   ids_train,
                                   tox21_tasks,
                                   verbosity=verbosity)

if os.path.exists(valid_dir):
    shutil.rmtree(valid_dir)
valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid, w_valid,
                                   ids_valid, tox21_tasks, verbosity=verbosity)

# Fit models
tox21_task_types = {task: "classification" for task in tox21_tasks}

classification_metric = Metric(metrics.roc_auc_score,
                               np.mean,
                               verbosity=verbosity,
Example No. 46
print("About to perform train/valid/test split.")
num_train = int(.8 * len(dataset))
X, y, w, ids = dataset.to_numpy()
num_tasks = 120
pcba_tasks = pcba_tasks[:num_tasks]
print("Using following tasks")
print(pcba_tasks)
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
ids_train, ids_valid = ids[:num_train], ids[num_train:]

if os.path.exists(train_dir):
  shutil.rmtree(train_dir)
train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
                                   w_train, ids_train, pcba_tasks)

if os.path.exists(valid_dir):
  shutil.rmtree(valid_dir)
valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
                                   w_valid, ids_valid, pcba_tasks)

# Fit Logistic Regression models
pcba_task_types = {task: "classification" for task in pcba_tasks}


classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")
params_dict = { 
    "batch_size": 64,
Example No. 47
def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0, feature_types="voxel_combined",
      # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
      # causes segfaults.
      #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      #"salt_bridge"], ecfp_power=9, splif_power=9,
      voxel_feature_types=["ecfp", "splif", "hbond", 
      "salt_bridge"], ecfp_power=9, splif_power=9,
      parallel=True, flatten=True,
      verbosity=verbosity)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]
  
  # Featurize Dataset
  features = []
  feature_len = None
  y_inds = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_grid_feature(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    if feature_len is None:
      feature_len = len(computed_feature)
    if len(computed_feature) != feature_len:
      print("Featurization failed for %s!" % pdb_code)
      continue
    y_inds.append(ind)
    features.append(computed_feature)
  ############################################################# DEBUG
  y, ids = y[y_inds], ids[y_inds]
  ############################################################# DEBUG
  X = np.vstack(features)
  w = np.ones_like(y)
   
  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  
  return tasks, dataset, transformers
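A hedged usage sketch for load_core_pdbbind_grid; the paths below are placeholders for a local PDBBind v2013 core-set download and a scratch analysis directory.

# Hypothetical paths; point these at a local PDBBind core set and a scratch dir.
pdbbind_dir = "/data/pdbbind/v2013-core"
base_dir = "/tmp/pdbbind_analysis"
tasks, pdbbind_dataset, transformers = load_core_pdbbind_grid(pdbbind_dir, base_dir)
print("Featurized %d complexes for tasks %s" % (len(pdbbind_dataset), str(tasks)))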
Example No. 48
    def test_sklearn_multitask_classification(self):
        """Test that sklearn models can learn on simple multitask classification."""
        np.random.seed(123)
        n_tasks = 4
        dataset = sklearn.datasets.load_digits(n_class=2)
        X, y = dataset.data, dataset.target
        y = np.reshape(y, (len(y), 1))
        y = np.hstack([y] * n_tasks)

        frac_train = .7
        n_samples = len(X)

        n_train = int(frac_train * n_samples)
        X_train, y_train = X[:n_train], y[:n_train]
        X_test, y_test = X[n_train:], y[n_train:]

        train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
        test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

        tasks = train_dataset.get_task_names()
        task_types = {task: "classification" for task in tasks}

        model_params = {
            "batch_size": None,
            "data_shape": train_dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)

        def model_builder(tasks,
                          task_types,
                          model_params,
                          model_dir,
                          verbosity=None):
            return SklearnModel(tasks,
                                task_types,
                                model_params,
                                model_dir,
                                mode="classification",
                                model_instance=LogisticRegression(),
                                verbosity=verbosity)

        model = SingletaskToMultitask(tasks,
                                      task_types,
                                      model_params,
                                      self.model_dir,
                                      model_builder,
                                      verbosity=verbosity)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        transformers = []
        train_evaluator = Evaluator(model,
                                    train_dataset,
                                    transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [classification_metric])
        print("train_scores")
        print(train_scores)

        # Eval model on test
        transformers = []
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])
        print("scores")
        print(scores)

        for score in scores[classification_metric.name]:
            assert score > .5
Example No. 49
  def test_tf_skewed_missing_classification_overfit(self):
    """TF, skewed data, few actives

    Test tensorflow models overfit 0/1 datasets with missing data and few
    actives. This is intended to be as close to singletask MUV datasets as
    possible.
    """
    
    tasks = ["task0"]
    task_types = {task: "classification" for task in tasks}
    n_samples = 5120
    n_features = 6
    n_tasks = len(tasks)
    n_classes = 2
    
    # Generate dummy dataset
    np.random.seed(123)
    p = .002
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
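    # Actives are rare here (p = .002), so the block below upweights the positive
    # labels to roughly balance total positive and negative weight.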
    y_flat, w_flat = np.squeeze(y), np.squeeze(w)
    y_nonzero = y_flat[w_flat != 0]
    num_nonzero = np.count_nonzero(y_nonzero)
    weight_nonzero = len(y_nonzero)/num_nonzero
    w_flat[y_flat != 0] = weight_nonzero
    w = np.reshape(w_flat, (n_samples, n_tasks))
  
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {
      "layer_sizes": [1200],
      "dropouts": [.0],
      "learning_rate": 0.003,
      "momentum": .9,
      "batch_size": 75,
      "num_classification_tasks": 1,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [1.],
      "bias_init_consts": [1.],
      "nb_epoch": 250,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = TensorflowModel(
        tasks, task_types, model_params, self.model_dir,
        tf_class=TensorflowMultiTaskClassifier,
        verbosity=verbosity)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .8
Example No. 50
print("About to perform train/valid/test split.")
num_train = int(.8 * len(dataset))
X, y, w, ids = dataset.to_numpy()
num_tasks = 120
pcba_tasks = pcba_tasks[:num_tasks]
print("Using following tasks")
print(pcba_tasks)
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
ids_train, ids_valid = ids[:num_train], ids[num_train:]

if os.path.exists(train_dir):
  shutil.rmtree(train_dir)
train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
                                   w_train, ids_train, pcba_tasks)

if os.path.exists(valid_dir):
  shutil.rmtree(valid_dir)
valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
                                   w_valid, ids_valid, pcba_tasks)

# Fit Logistic Regression models
pcba_task_types = {task: "classification" for task in pcba_tasks}


classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")
params_dict = { 
    "batch_size": None,
Example No. 51
    def featurize(self,
                  input_files,
                  data_dir,
                  shard_size=8192,
                  num_shards_per_batch=24,
                  worker_pool=None,
                  logging=True,
                  debug=False):
        """Featurize provided files and write to specified location."""
        ############################################################## TIMING
        time1 = time.time()
        ############################################################## TIMING
        log("Loading raw samples now.", self.verbosity)
        log("shard_size: %d" % shard_size, self.verbosity)
        log("num_shards_per_batch: %d" % num_shards_per_batch, self.verbosity)

        # Allow users to specify a single file for featurization
        if not isinstance(input_files, list):
            input_files = [input_files]

        if not os.path.exists(data_dir):
            os.makedirs(data_dir)

        # Construct partial function to write datasets.
        if not len(input_files):
            return None
        input_type = get_input_type(input_files[0])

        if logging:
            mp.log_to_stderr()
        if worker_pool is None:
            if logging:
                worker_pool = LoggingPool(processes=1)
            else:
                worker_pool = mp.Pool(processes=1)
        log("Spawning workers now.", self.verbosity)
        metadata_rows = []
        data_iterator = zip(
            it.repeat((self, shard_size, input_type, data_dir)),
            enumerate(load_data(input_files, shard_size, self.verbosity)))
        # Turns out python map is terrible and exhausts the generator as given.
        # Solution seems to be to manually pull out N elements from the iterator,
        # then to map on only those N elements. BLECH. Python should do a better
        # job here.
        num_batches = 0
        ############################################################## TIMING
        time2 = time.time()
        log("TIMING: pre-map featurization took %0.3f s" % (time2 - time1))
        ############################################################## TIMING
        while True:
            log("About to start processing next batch of shards",
                self.verbosity)
            ############################################################## TIMING
            time1 = time.time()
            ############################################################## TIMING
            iterator = itertools.islice(data_iterator, num_shards_per_batch)
            if not debug:
                batch_metadata = worker_pool.map(featurize_map_function,
                                                 iterator)
            else:
                batch_metadata = []
                for elt in iterator:
                    batch_metadata.append(featurize_map_function(elt))
            ############################################################## TIMING
            time2 = time.time()
            log("TIMING: map call on batch took %0.3f s" % (time2 - time1),
                self.verbosity)
            ############################################################## TIMING
            if batch_metadata:
                metadata_rows.extend(
                    [elt for elt in batch_metadata if elt is not None])
                num_batches += 1
                log(
                    "Featurized %d datapoints\n" %
                    (shard_size * num_shards_per_batch * num_batches),
                    self.verbosity)
            else:
                break
        ############################################################## TIMING
        time1 = time.time()
        ############################################################## TIMING

        # TODO(rbharath): This whole bit with metadata_rows is an awkward way of
        # creating a Dataset. Is there a more elegant solution?
        dataset = Dataset(data_dir=data_dir,
                          metadata_rows=metadata_rows,
                          reload=reload,
                          verbosity=self.verbosity)
        ############################################################## TIMING
        time2 = time.time()
        print("TIMING: dataset construction took %0.3f s" % (time2 - time1),
              self.verbosity)
        ############################################################## TIMING
        return dataset
Example No. 52
  def test_multiload(self):
    """Check can re-use featurization for multiple task selections.

    TODO(rbharath): This test seems silly after the recent round of
                    refactoring. Can it be removed?
    """
    # Only for debug!
    np.random.seed(123)

    # Set some global variables up top
    reload = True
    verbosity = "high"


    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(self.base_dir, "dataset")
    train_dir = os.path.join(self.base_dir, "train_dataset")
    valid_dir = os.path.join(self.base_dir, "valid_dataset")
    test_dir = os.path.join(self.base_dir, "test_dataset")
    model_dir = os.path.join(self.base_dir, "model")

    # Load dataset
    print("About to load dataset.")
    dataset_file = os.path.join(
        current_dir, "../../models/tests/multitask_example.csv")
    dataset = load_from_disk(dataset_file)
    print("Columns of dataset: %s" % str(dataset.columns.values))
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))

    # Featurize tox21 dataset
    print("About to featurize dataset.")
    featurizer = CircularFingerprint(size=1024)
    all_tasks = ["task%d"%i for i in range(17)] 

    ####### Do featurization
    loader = DataLoader(tasks=all_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(
        dataset_file, data_dir)

    # Do train/valid split.
    X_multi, y_multi, w_multi, ids_multi = dataset.to_numpy()


    ####### Do singletask load
    y_tasks, w_tasks = [], []
    for ind, task in enumerate(all_tasks):
      print("Processing task %s" % task)
      dataset = Dataset(data_dir, verbosity=verbosity, reload=reload)

      X_task, y_task, w_task, ids_task = dataset.to_numpy()
      y_tasks.append(y_task[:, ind])
      w_tasks.append(w_task[:, ind])

    ################## Do comparison
    for ind, task in enumerate(all_tasks):
      y_multi_task = y_multi[:, ind]
      w_multi_task = w_multi[:, ind]

      y_task = y_tasks[ind]
      w_task = w_tasks[ind]

      np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
      np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())
Example No. 53
import os

from flask import Flask
from deepchem.datasets import Dataset
from deepchem.models.tensorflow_models import TensorflowModel
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.embed import components
from bokeh.palettes import Plasma256
import pandas as pd

app = Flask(__name__)
Plasma256.extend(Plasma256[::-1])
cur_dir = os.path.dirname(__file__)
data_dir = os.path.join(cur_dir, 'data')
base_dir = '/data/ballen/ML/kinaseDeepLearningAllKinase_081516'
test_dir = os.path.join(base_dir, 'test_dataset_random')
kinase_tasks = Dataset(test_dir, reload=True).get_task_names()
kinase_task_types = {task: 'classification' for task in kinase_tasks}
params_dict = {
    "activation": "relu",
    "momentum": .9,
    "batch_size": 128,
    "init": "glorot_uniform",
    "data_shape": (1024, ),
    "learning_rate": 1e-3,
    "decay": 1e-6,
    "nb_hidden": (2000, 500),
    "nb_epoch": 100,
    "nesterov": False,
    "dropouts": (.5, .5),
    "nb_layers": 2,
    "batchnorm": False,
Example No. 54
model_dir = os.path.join(base_dir, "model")

print("About to perform train/valid/test split.")
num_train = int(.8 * len(dataset))
X, y, w, ids = dataset.to_numpy()
num_tasks = 17
muv_tasks = muv_tasks[:num_tasks]
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
ids_train, ids_valid = ids[:num_train], ids[num_train:]

if os.path.exists(train_dir):
  shutil.rmtree(train_dir)
train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
                                   w_train, ids_train, muv_tasks,
                                   verbosity=verbosity)

if os.path.exists(valid_dir):
  shutil.rmtree(valid_dir)
valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
                                   w_valid, ids_valid, muv_tasks,
                                   verbosity=verbosity)

# Fit Logistic Regression models
muv_task_types = {task: "classification" for task in muv_tasks}


classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")
Example No. 55
def load_pcba(base_dir, reload=True):
  """Load PCBA datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/pcba.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
      'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
      'PCBA-1721','PCBA-2100','PCBA-2101','PCBA-2147','PCBA-2242',
      'PCBA-2326','PCBA-2451','PCBA-2517','PCBA-2528','PCBA-2546',
      'PCBA-2549','PCBA-2551','PCBA-2662','PCBA-2675','PCBA-2676',
      'PCBA-411','PCBA-463254','PCBA-485281','PCBA-485290','PCBA-485294',
      'PCBA-485297','PCBA-485313','PCBA-485314','PCBA-485341','PCBA-485349',
      'PCBA-485353','PCBA-485360','PCBA-485364','PCBA-485367','PCBA-492947',
      'PCBA-493208','PCBA-504327','PCBA-504332','PCBA-504333','PCBA-504339',
      'PCBA-504444','PCBA-504466','PCBA-504467','PCBA-504706','PCBA-504842',
      'PCBA-504845','PCBA-504847','PCBA-504891','PCBA-540276','PCBA-540317',
      'PCBA-588342','PCBA-588453','PCBA-588456','PCBA-588579','PCBA-588590',
      'PCBA-588591','PCBA-588795','PCBA-588855','PCBA-602179','PCBA-602233',
      'PCBA-602310','PCBA-602313','PCBA-602332','PCBA-624170','PCBA-624171',
      'PCBA-624173','PCBA-624202','PCBA-624246','PCBA-624287','PCBA-624288',
      'PCBA-624291','PCBA-624296','PCBA-624297','PCBA-624417','PCBA-651635',
      'PCBA-651644','PCBA-651768','PCBA-651965','PCBA-652025','PCBA-652104',
      'PCBA-652105','PCBA-652106','PCBA-686970','PCBA-686978','PCBA-686979',
      'PCBA-720504','PCBA-720532','PCBA-720542','PCBA-720551','PCBA-720553',
      'PCBA-720579','PCBA-720580','PCBA-720707','PCBA-720708','PCBA-720709',
      'PCBA-720711','PCBA-743255','PCBA-743266','PCBA-875','PCBA-881',
      'PCBA-883','PCBA-884','PCBA-885','PCBA-887','PCBA-891','PCBA-899',
      'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
      'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']

  loader = DataLoader(tasks=all_PCBA_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]

  if regen:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)
  
  return all_PCBA_tasks, dataset, transformers
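A possible invocation of load_pcba above, sketched under the assumption that base_dir is a scratch directory with room for the featurized PCBA shards.

# Hypothetical scratch directory for the featurized PCBA data.
pcba_tasks, pcba_dataset, pcba_transformers = load_pcba("/tmp/pcba_analysis", reload=False)
print("%d tasks, %d compounds" % (len(pcba_tasks), len(pcba_dataset)))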
Example No. 56
  def test_sklearn_transformed_regression(self):
    """Test that sklearn models can learn on simple transformed regression datasets."""
    np.random.seed(123)
    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target

    frac_train = .7
    n_samples = len(X)
    
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)


    # Eval model on train
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for transformer in transformers:
        transformer.transform(train_dataset)
    for transformer in transformers:
        transformer.transform(test_dataset)

    tasks = train_dataset.get_task_names()
    task_types = {task: "regression" for task in tasks}

    model_params = {
      "batch_size": None,
      "data_shape": train_dataset.get_data_shape()
    }

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=LinearRegression())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance([regression_metric])
    print("train_scores")
    print(train_scores)

    assert train_scores[regression_metric.name] > .5

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])
    print("scores")
    print(scores)

    assert scores[regression_metric.name] > .5