Ejemplo n.º 1
0
    def test_sklearn_reload(self):
        """Check a fitted SklearnModel survives a save/reload round trip."""
        tasks = ["task0"]
        task_types = {t: "classification" for t in tasks}
        n_samples, n_features = 10, 3
        n_tasks = len(tasks)

        # Build a small random classification dataset.
        np.random.seed(123)
        sample_ids = np.arange(n_samples)
        features = np.random.rand(n_samples, n_features)
        labels = np.random.randint(2, size=(n_samples, n_tasks))
        weights = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, features, labels,
                                     weights, sample_ids, tasks)

        model_params = {
            "batch_size": None,
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)
        model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                             mode="classification",
                             model_instance=RandomForestClassifier())

        # Train and persist the model.
        model.fit(dataset)
        model.save()

        # Bring the model back from disk.
        reloaded_model = SklearnModel(tasks, task_types, model_params,
                                      self.model_dir, mode="classification")
        reloaded_model.reload()

        # The reloaded model should still score well on the training data.
        evaluator = Evaluator(reloaded_model, dataset, [],
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .9
Ejemplo n.º 2
0
  def test_sklearn_classification_overfit(self):
    """Verify sklearn models can overfit a tiny classification dataset."""
    n_samples, n_features, n_tasks = 10, 3, 1

    # Small random dataset the forest should memorize easily.
    np.random.seed(123)
    sample_ids = np.arange(n_samples)
    features = np.random.rand(n_samples, n_features)
    labels = np.random.randint(2, size=(n_samples, n_tasks))
    weights = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(features, labels, weights, sample_ids)

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(RandomForestClassifier(), self.model_dir)

    # Train and persist.
    model.fit(dataset)
    model.save()

    # Training-set ROC-AUC should be near-perfect on memorized data.
    evaluator = Evaluator(model, dataset, [], verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .9
Ejemplo n.º 3
0
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Return a SklearnModel wrapping a 500-tree random forest regressor."""
    forest = RandomForestRegressor(n_estimators=500)
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        model_instance=forest, verbosity=verbosity)
Ejemplo n.º 4
0
    def test_sklearn_regression(self):
        """Test that sklearn models can learn on simple regression datasets.

        Trains LinearRegression on the diabetes dataset and checks the
        held-out r2 score clears a modest threshold.
        """
        np.random.seed(123)
        dataset = sklearn.datasets.load_diabetes()
        X, y = dataset.data, dataset.target

        frac_train = .7
        n_samples = len(X)
        # Fix: the split point must be an int -- slicing with the float
        # `frac_train * n_samples` raises TypeError under numpy.
        n_train = int(frac_train * n_samples)

        X_train, y_train = X[:n_train], y[:n_train]
        X_test, y_test = X[n_train:], y[n_train:]

        train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
        test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

        tasks = train_dataset.get_task_names()
        task_types = {task: "regression" for task in tasks}

        model_params = {
            "batch_size": None,
            "data_shape": train_dataset.get_data_shape()
        }

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        model = SklearnModel(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             mode="regression",
                             model_instance=LinearRegression())

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        transformers = []
        train_evaluator = Evaluator(model,
                                    train_dataset,
                                    transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [regression_metric])
        print("train_scores")
        print(train_scores)

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])
        print("scores")
        print(scores)

        assert scores[regression_metric.name] > .5
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Return a SklearnModel backed by a balanced 500-tree random forest."""
    forest = RandomForestClassifier(class_weight="balanced", n_estimators=500)
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        model_instance=forest, verbosity=verbosity)
 def model_builder(tasks,
                   task_types,
                   model_params,
                   model_dir,
                   verbosity=None):
     """Build a SklearnModel around a LogisticRegression classifier.

     Fix: the fourth parameter was named ``model_builder``, shadowing
     this function itself while actually carrying the model directory;
     renamed to ``model_dir``. ``verbosity`` is accepted for interface
     parity with sibling builders but not forwarded (as before).
     """
     return SklearnModel(tasks,
                         task_types,
                         model_params,
                         model_dir,
                         model_instance=LogisticRegression())
Ejemplo n.º 7
0
    def test_singletask_sklearn_rf_ECFP_regression_API(self):
        """Test of singletask RF ECFP regression API.

        Featurizes example.csv with 1024-bit circular fingerprints,
        scaffold-splits it, then fits and evaluates a
        RandomForestRegressor through the SklearnModel wrapper.
        """
        # NOTE(review): `splittype` is unused -- the split below is
        # hard-coded to ScaffoldSplitter.
        splittype = "scaffold"
        featurizer = CircularFingerprint(size=1024)
        model_params = {}
        tasks = ["log-solubility"]
        task_type = "regression"
        task_types = {task: task_type for task in tasks}
        input_file = os.path.join(self.current_dir, "example.csv")
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        # Only y is normalized; X is left as raw fingerprints. The
        # transformers are handed to the evaluators below, not applied
        # to the datasets here.
        input_transformers = []
        output_transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        transformers = input_transformers + output_transformers
        model_params["data_shape"] = train_dataset.get_data_shape()
        regression_metrics = [
            Metric(metrics.r2_score),
            Metric(metrics.mean_squared_error),
            Metric(metrics.mean_absolute_error)
        ]

        model = SklearnModel(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             mode="regression",
                             model_instance=RandomForestRegressor())

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        # Scores are discarded: this is an API smoke test, not a
        # performance assertion.
        evaluator = Evaluator(model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(regression_metrics)

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(regression_metrics)
Ejemplo n.º 8
0
 def model_builder(tasks,
                   task_types,
                   model_params,
                   model_dir,
                   verbosity=None):
     """Build a regression-mode SklearnModel around LinearRegression."""
     linear = LinearRegression()
     return SklearnModel(tasks, task_types, model_params, model_dir,
                         mode="regression", model_instance=linear,
                         verbosity=verbosity)
Ejemplo n.º 9
0
 def model_builder(tasks,
                   task_types,
                   model_params,
                   model_dir,
                   verbosity=None):
     """Build a classification-mode SklearnModel using a random forest."""
     forest = RandomForestClassifier()
     return SklearnModel(tasks, task_types, model_params, model_dir,
                         mode="classification", model_instance=forest,
                         verbosity=verbosity)
Ejemplo n.º 10
0
def rf_model_builder(tasks, task_types, params_dict, model_dir, verbosity=None):
    """Build a regression random forest from the given hyperparameters.

    `params_dict` must provide "n_estimators" and "max_features".
    `verbosity` is accepted for interface compatibility but not used.
    """
    forest = RandomForestRegressor(
        n_estimators=params_dict["n_estimators"],
        max_features=params_dict["max_features"])
    return SklearnModel(tasks, task_types, params_dict, model_dir,
                        mode="regression", model_instance=forest)
Ejemplo n.º 11
0
 def model_builder(tasks,
                   task_types,
                   model_params,
                   model_dir,
                   verbosity=None):
     """Build a classification-mode SklearnModel using logistic regression."""
     logreg = LogisticRegression()
     return SklearnModel(tasks, task_types, model_params, model_dir,
                         mode="classification", model_instance=logreg,
                         verbosity=verbosity)
Ejemplo n.º 12
0
 def model_builder(tasks,
                   task_types,
                   params_dict,
                   model_dir,
                   verbosity=None):
     """Build a SklearnModel from hyperparameters in ``params_dict``.

     Fix: the default was ``verbosity=verbosity``, which is evaluated at
     definition time and raises NameError unless an enclosing
     ``verbosity`` happens to exist; use None like the sibling builders.
     ``model_class`` is a free variable supplied by the enclosing scope.
     """
     n_estimators = params_dict["n_estimators"]
     max_features = params_dict["max_features"]
     return SklearnModel(tasks,
                         task_types,
                         params_dict,
                         model_dir,
                         model_instance=model_class(
                             n_estimators=n_estimators,
                             max_features=max_features))
Ejemplo n.º 13
0
    def test_sklearn_transformed_regression(self):
        """Check sklearn models learn on X/y-transformed regression data."""
        np.random.seed(123)
        diabetes = sklearn.datasets.load_diabetes()
        X, y = diabetes.data, diabetes.target

        # 70/30 train/test split.
        frac_train = .7
        n_train = int(frac_train * len(X))
        train_dataset = DiskDataset.from_numpy(self.train_dir, X[:n_train],
                                               y[:n_train])
        test_dataset = DiskDataset.from_numpy(self.test_dir, X[n_train:],
                                              y[n_train:])

        # Normalize and clip features, then normalize targets; all
        # transformers are fit on the training split only.
        transformers = [
            NormalizationTransformer(transform_X=True, dataset=train_dataset),
            ClippingTransformer(transform_X=True, dataset=train_dataset),
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        for split in (train_dataset, test_dataset):
            for transformer in transformers:
                transformer.transform(split)

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        model = SklearnModel(LinearRegression(), self.model_dir)

        # Train and persist.
        model.fit(train_dataset)
        model.save()

        # Training r2 should be reasonable.
        train_evaluator = Evaluator(model, train_dataset, transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [regression_metric])
        assert train_scores[regression_metric.name] > .5

        # Held-out r2 should also clear the bar.
        evaluator = Evaluator(model, test_dataset, transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])
        assert scores[regression_metric.name] > .5
Ejemplo n.º 14
0
  def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API.

    Fixes: removed the unused `splittype` local, and renamed the
    transform loop variable, which previously shadowed (and clobbered)
    the featurized `dataset`.
    """
    featurizer = RDKitDescriptors()
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")

    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    # Normalize/clip X and normalize y; transformers are fit on the
    # training split only, then applied to both splits.
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for split in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(split)

    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    sklearn_model = RandomForestRegressor()
    model = SklearnModel(sklearn_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train (scores discarded; API smoke test).
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
Ejemplo n.º 15
0
    def test_sklearn_regression_overfit(self):
        """Verify sklearn models can overfit a tiny regression dataset."""
        tasks = ["task0"]
        task_types = {t: "regression" for t in tasks}
        n_samples, n_features = 10, 3
        n_tasks = len(tasks)

        # Random regression data the forest should memorize.
        np.random.seed(123)
        sample_ids = np.arange(n_samples)
        features = np.random.rand(n_samples, n_features)
        targets = np.random.rand(n_samples, n_tasks)
        weights = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, features, targets,
                                     weights, sample_ids, tasks)

        model_params = {
            "batch_size": None,
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                             mode="regression",
                             model_instance=RandomForestRegressor())

        # Train and persist.
        model.fit(dataset)
        model.save()

        # Training-set r2 should be high on memorized data.
        evaluator = Evaluator(model, dataset, [], verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])

        assert scores[regression_metric.name] > .7
Ejemplo n.º 16
0
    def test_sklearn_classification(self):
        """Check sklearn models learn a simple two-class digits problem."""
        np.random.seed(123)
        digits = sklearn.datasets.load_digits(n_class=2)
        X, y = digits.data, digits.target

        # 70/30 train/test split.
        frac_train = .7
        n_train = int(frac_train * len(X))
        train_dataset = DiskDataset.from_numpy(self.train_dir, X[:n_train],
                                               y[:n_train])
        test_dataset = DiskDataset.from_numpy(self.test_dir, X[n_train:],
                                              y[n_train:])

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)
        model = SklearnModel(LogisticRegression(), self.model_dir)

        # Train and persist.
        model.fit(train_dataset)
        model.save()

        # Score the training split (computed but not asserted on).
        train_evaluator = Evaluator(model, train_dataset, [],
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [classification_metric])

        # Held-out ROC-AUC must beat chance comfortably.
        evaluator = Evaluator(model, test_dataset, [], verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])
        assert scores[classification_metric.name] > .5
Ejemplo n.º 17
0
  def __init__(self, pad=5):
    """Download a pretrained pocket RF model and set up featurizers.

    Parameters
    ----------
    pad: int
      Padding forwarded to ConvexHullPocketFinder (units presumably
      Angstroms -- TODO confirm against ConvexHullPocketFinder).
    """
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Load binding pocket model
    # NOTE(review): downloads into the process's current working
    # directory, then moves the result into a fresh temp dir that is
    # never cleaned up. The commands are built via str.split(), so any
    # path containing spaces would break; consider urllib/tarfile/shutil.
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    call((
        "wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
    ).split())
    call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
    call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")

    # Fit model on dataset
    # (Actually just reloads the downloaded sklearn model from disk.)
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()

    # Create featurizers
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)
Ejemplo n.º 18
0
# Persist the raw (pre-prediction) targets for later comparison.
# NOTE(review): `modeldir` is joined with plain "+" -- it must end in a
# path separator for these CSVs to land inside the directory.
pd.DataFrame(train_dataset.y,
             columns=['prediction']).to_csv(modeldir + "train_original.csv")
pd.DataFrame(valid_dataset.y,
             columns=['prediction']).to_csv(modeldir + "valid_original.csv")

for estimator in n_estimators:

    print('n_estimators = {0}'.format(estimator))
    #Create model
    sklmodel = RandomForestRegressor(n_estimators=estimator,
                                     criterion="mse",
                                     max_features=max_features,
                                     bootstrap=True,
                                     oob_score=False,
                                     n_jobs=int(cpus / 2))
    model = SklearnModel(sklmodel, modeldir)
    model.fit(train_dataset)

    #Append trains cores and results
    train_scores = model.evaluate(
        train_dataset,
        [metric, dc.metrics.Metric(dc.metrics.mae_score)])
    train_results = np.concatenate(
        (train_results, list(train_scores.values())))
    valid_scores = model.evaluate(
        valid_dataset,
        [metric, dc.metrics.Metric(dc.metrics.mae_score)])
    test_results = np.concatenate((test_results, list(valid_scores.values())))

    #Append trains cores and results
    predict_train = pd.DataFrame(
Ejemplo n.º 19
0
 def model_builder(model_dir):
     """Return a SklearnModel wrapping a fresh LogisticRegression."""
     return SklearnModel(LogisticRegression(), model_dir)
Ejemplo n.º 20
0
# One regression task per PDBBind target.
pdbbind_task_types = {task: "regression" for task in pdbbind_tasks}

# NOTE(review): misnamed -- this holds an r2 (regression) metric, not a
# classification metric.
classification_metric = Metric(metrics.r2_score,
                               verbosity=verbosity,
                               mode="regression")
params_dict = {
    "batch_size": None,
    "data_shape": train_dataset.get_data_shape(),
}

# Start from a clean model directory.
if os.path.exists(model_dir):
    shutil.rmtree(model_dir)
os.makedirs(model_dir)
model = SklearnModel(pdbbind_tasks,
                     pdbbind_task_types,
                     params_dict,
                     model_dir,
                     model_instance=RandomForestRegressor(n_estimators=500),
                     verbosity=verbosity)

# Fit trained model
model.fit(train_dataset)
model.save()

# Score the fitted model on its own training split.
train_evaluator = Evaluator(model,
                            train_dataset,
                            transformers,
                            verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance(
    [classification_metric])

print("Train scores")
Ejemplo n.º 21
0
    # Get supports on test-set
    # Yields (task, support-set) pairs: n_trials support draws of n_pos
    # positives / n_neg negatives per task (n_pos, n_neg, n_trials,
    # replace, metric, model_dir, fold come from the enclosing scope).
    support_generator = SupportGenerator(
        test_dataset, range(len(test_dataset.get_task_names())), n_pos, n_neg,
        n_trials, replace)

    # Compute accuracies
    # One score list per task index.
    task_scores = {
        task: []
        for task in range(len(test_dataset.get_task_names()))
    }
    for (task, support) in support_generator:
        # Train model on support
        # A fresh class-balanced forest is fit per support draw.
        sklearn_model = RandomForestClassifier(class_weight="balanced",
                                               n_estimators=50)
        model = SklearnModel(sklearn_model, model_dir)
        model.fit(support)

        # Test model
        # Evaluate on the task's remaining examples (support held out).
        task_dataset = get_task_dataset_minus_support(test_dataset, support,
                                                      task)
        y_pred = model.predict_proba(task_dataset)
        score = metric.compute_metric(task_dataset.y, y_pred, task_dataset.w)
        #print("Score on task %s is %s" % (str(task), str(score)))
        task_scores[task].append(score)

    # Join information for all tasks.
    # Average the per-trial scores into one number per task.
    mean_task_scores = {}
    for task in range(len(test_dataset.get_task_names())):
        mean_task_scores[task] = np.mean(np.array(task_scores[task]))
    print("Fold %s" % str(fold))
Ejemplo n.º 22
0
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
  """Return a SklearnModel around a class-balanced logistic regression."""
  logreg = LogisticRegression(class_weight="balanced")
  return SklearnModel(tasks, task_types, model_params, model_dir,
                      model_instance=logreg, verbosity=verbosity)
def generate_rf_model():
    """Return a 500-tree RF regressor wrapped in a SklearnModel.

    The current directory serves as the model directory.
    """
    return SklearnModel(RandomForestRegressor(n_estimators=500), ".")
Ejemplo n.º 24
0
 def model_builder(model_dir):
   """Return a SklearnModel wrapping a default RandomForestRegressor."""
   return SklearnModel(RandomForestRegressor(), model_dir)
Ejemplo n.º 25
0
 def model_builder(model_dir):
   """Return a SklearnModel wrapping a default RandomForestClassifier."""
   return SklearnModel(RandomForestClassifier(), model_dir)
Ejemplo n.º 26
0
def model_builder(model_dir):
    """Return a SklearnModel around a 500-tree random forest regressor."""
    return SklearnModel(RandomForestRegressor(n_estimators=500), model_dir)
Ejemplo n.º 27
0
def model_builder(model_dir):
  """Return a SklearnModel around a balanced 500-tree random forest."""
  forest = RandomForestClassifier(class_weight="balanced", n_estimators=500)
  return SklearnModel(forest, model_dir)
Ejemplo n.º 28
0
 def rf_model_builder(model_params, model_dir):
     """Return a SklearnModel around an RF built from `model_params`."""
     return SklearnModel(RandomForestClassifier(**model_params), model_dir)
Ejemplo n.º 29
0
        featurizer = deepchem.feat.WeaveFeaturizer(),
        transformers = 2,
        modelname = MPNNModel,
        model_file = model_dir + "mpnn_model",
        dataset_file = data_dir + 'To_predict.csv',
        fname = 'PredictedMPNN.csv',
        parentdir = data_dir,
        newdir = newdir)
    flag_predicted = False;

# Run the random-forest predictor when requested, or when no specific
# model list was given.
if len(models) == 0 or "RandomForest" in models:
    print("-Evaluating Random Forest Model", flush = True)
    predictchem.predict_csv_from_model(
        featurizer = deepchem.feat.CircularFingerprint(size=1024),
        transformers = 2,
        # Reloads a saved sklearn model from model_dir + "random_forest";
        # NOTE(review): model_dir must end in a path separator.
        modelname = SklearnModel(model_dir = model_dir + "random_forest"),
        model_file = "", #No need for model_file
        dataset_file = data_dir + 'To_predict.csv',
        fname = 'PredictedForest.csv',
        parentdir = data_dir,
        newdir = newdir,
        modeltype = "sklearn")
    flag_predicted = False;

if len(models) == 0 or "KRR" in models:
    print("-Evaluating Kernel Ridge Regression", flush = True)
    predictchem.predict_csv_from_model(
        featurizer = deepchem.feat.CircularFingerprint(size=1024),
        transformers = 2,
        modelname = SklearnModel(model_dir = model_dir + "krr_model"),
        model_file = "", #No need for model_file