Esempio n. 1
0
  def test_sklearn_classification_overfit(self):
    """Check that an sklearn classifier can overfit a tiny random dataset."""
    n_samples, n_features, n_tasks = 10, 3, 1

    # Build a small random binary-classification dataset.
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(2, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(RandomForestClassifier(), self.model_dir)

    # Train and persist the model.
    model.fit(dataset)
    model.save()

    # An unconstrained random forest should score almost perfectly on
    # its own training data.
    evaluator = Evaluator(model, dataset, [], verbosity=verbosity)
    scores = evaluator.compute_model_performance([metric])
    assert scores[metric.name] > .9
Esempio n. 2
0
    def test_sklearn_regression(self):
        """Test that sklearn models can learn on simple regression datasets.

        Trains LinearRegression on the diabetes dataset and asserts the
        held-out R^2 score exceeds 0.5.
        """
        np.random.seed(123)
        dataset = sklearn.datasets.load_diabetes()
        X, y = dataset.data, dataset.target

        frac_train = .7
        n_samples = len(X)
        # Slice indices must be integers: indexing with the float product
        # frac_train * n_samples raises TypeError on Python 3 / modern NumPy.
        n_train = int(frac_train * n_samples)
        X_train, y_train = X[:n_train], y[:n_train]
        X_test, y_test = X[n_train:], y[n_train:]

        train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
        test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

        tasks = train_dataset.get_task_names()
        task_types = {task: "regression" for task in tasks}

        model_params = {
            "batch_size": None,
            "data_shape": train_dataset.get_data_shape()
        }

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        model = SklearnModel(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             mode="regression",
                             model_instance=LinearRegression())

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        transformers = []
        train_evaluator = Evaluator(model,
                                    train_dataset,
                                    transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [regression_metric])
        print("train_scores")
        print(train_scores)

        # Eval model on test
        transformers = []
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])
        print("scores")
        print(scores)

        assert scores[regression_metric.name] > .5
Esempio n. 3
0
    def test_singletask_sklearn_rf_ECFP_regression_API(self):
        """Test of singletask RF ECFP regression API.

        Featurizes a small CSV with circular fingerprints, scaffold-splits,
        normalizes labels, and smoke-tests train/test evaluation of a
        random-forest regressor.
        """
        splittype = "scaffold"
        featurizer = CircularFingerprint(size=1024)
        model_params = {}
        tasks = ["log-solubility"]
        task_type = "regression"
        task_types = {task: task_type for task in tasks}
        input_file = os.path.join(self.current_dir, "example.csv")
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        input_transformers = []
        output_transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        transformers = input_transformers + output_transformers
        # BUG FIX: the transformers were constructed (and later handed to
        # Evaluator for untransforming predictions) but never applied to the
        # datasets, so the model was fit on raw labels while evaluation
        # un-normalized its predictions. Apply them, matching the sibling
        # RDKIT/sharded tests in this file.
        for data in [train_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(data)
        model_params["data_shape"] = train_dataset.get_data_shape()
        regression_metrics = [
            Metric(metrics.r2_score),
            Metric(metrics.mean_squared_error),
            Metric(metrics.mean_absolute_error)
        ]

        model = SklearnModel(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             mode="regression",
                             model_instance=RandomForestRegressor())

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        evaluator = Evaluator(model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(regression_metrics)

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(regression_metrics)
Esempio n. 4
0
    def test_sklearn_reload(self):
        """Verify a saved sklearn model reloads with its behavior intact."""
        tasks = ["task0"]
        task_types = {task: "classification" for task in tasks}
        n_samples, n_features, n_tasks = 10, 3, len(tasks)

        # Construct a small random binary dataset.
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))
        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {"batch_size": None,
                        "data_shape": dataset.get_data_shape()}

        verbosity = "high"
        metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
        model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                             mode="classification",
                             model_instance=RandomForestClassifier())

        # Train and write the model to disk.
        model.fit(dataset)
        model.save()

        # Reload from disk into a fresh wrapper object.
        reloaded_model = SklearnModel(tasks, task_types, model_params,
                                      self.model_dir, mode="classification")
        reloaded_model.reload()

        # The reloaded model should still overfit its training data.
        evaluator = Evaluator(reloaded_model, dataset, [],
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([metric])
        assert scores[metric.name] > .9
Esempio n. 5
0
  def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API."""
    splittype = "scaffold"
    featurizer = RDKitDescriptors()
    tasks = ["log-solubility"]
    task_types = {task: "regression" for task in tasks}
    model_params = {}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks, smiles_field=self.smiles_field,
                        featurizer=featurizer, verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    # Scaffold-split into train/test.
    train_dataset, test_dataset = ScaffoldSplitter().train_test_split(
        dataset, self.train_dir, self.test_dir)

    # Normalize/clip features and normalize labels, fitting statistics on
    # the training split only.
    transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset),
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    for split in (train_dataset, test_dataset):
      for transformer in transformers:
        transformer.transform(split)

    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())

    # Fit and persist.
    model.fit(train_dataset)
    model.save()

    # Smoke-test evaluation on both splits.
    for split in (train_dataset, test_dataset):
      evaluator = Evaluator(model, split, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(regression_metrics)
Esempio n. 6
0
  def test_sklearn_classification(self):
    """Test that sklearn models can learn on simple classification datasets.

    Trains LogisticRegression on a two-class digits subset and asserts the
    held-out ROC-AUC exceeds 0.5.
    """
    np.random.seed(123)
    dataset = sklearn.datasets.load_digits(n_class=2)
    X, y = dataset.data, dataset.target

    frac_train = .7
    n_samples = len(X)
    # Slice indices must be integers: indexing with the float product
    # frac_train * n_samples raises TypeError on Python 3 / modern NumPy.
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    print("X_train.shape, y_train.shape, X_test.shape, y_test.shape")
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    tasks = train_dataset.get_task_names()
    task_types = {task: "classification" for task in tasks}

    model_params = {
      "batch_size": None,
      "data_shape": train_dataset.get_data_shape()
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="classification",
                         model_instance=LogisticRegression())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance([classification_metric])
    print("train_scores")
    print(train_scores)

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    print("scores")
    print(scores)

    assert scores[classification_metric.name] > .5
Esempio n. 7
0
  def test_singletask_sklearn_rf_ECFP_regression_sharded_API(self):
    """Test of singletask RF ECFP regression API: sharded edition."""
    splittype = "scaffold"
    featurizer = CircularFingerprint(size=1024)
    model_params = {}
    tasks = ["label"]
    task_types = {task: "regression" for task in tasks}
    # pdbbind_core has ~200 examples.
    input_file = os.path.join(
        self.current_dir, "../../../datasets/pdbbind_core_df.pkl.gz")

    loader = DataLoader(tasks=tasks, smiles_field=self.smiles_field,
                        featurizer=featurizer, verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    # Scaffold-split into train/test.
    train_dataset, test_dataset = ScaffoldSplitter().train_test_split(
        dataset, self.train_dir, self.test_dir)

    # Normalize labels using training-set statistics.
    transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    for split in (train_dataset, test_dataset):
      for transformer in transformers:
        transformer.transform(split)

    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())

    # Fit and persist.
    model.fit(train_dataset)
    model.save()

    # Smoke-test evaluation on both splits.
    for split in (train_dataset, test_dataset):
      evaluator = Evaluator(model, split, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(regression_metrics)
Esempio n. 8
0
    def test_sklearn_transformed_regression(self):
        """Test that sklearn models can learn on simple transformed regression datasets."""
        np.random.seed(123)
        diabetes = sklearn.datasets.load_diabetes()
        X, y = diabetes.data, diabetes.target

        # 70/30 train/test split.
        frac_train = .7
        n_train = int(frac_train * len(X))
        train_dataset = DiskDataset.from_numpy(self.train_dir, X[:n_train],
                                               y[:n_train])
        test_dataset = DiskDataset.from_numpy(self.test_dir, X[n_train:],
                                              y[n_train:])

        # Normalize/clip features and normalize labels, with statistics
        # fitted on the training split only.
        transformers = [
            NormalizationTransformer(transform_X=True, dataset=train_dataset),
            ClippingTransformer(transform_X=True, dataset=train_dataset),
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        for split in (train_dataset, test_dataset):
            for transformer in transformers:
                transformer.transform(split)

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        model = SklearnModel(LinearRegression(), self.model_dir)

        # Fit and persist the model.
        model.fit(train_dataset)
        model.save()

        # R^2 should be reasonable on both the train and test splits.
        for split in (train_dataset, test_dataset):
            evaluator = Evaluator(model, split, transformers,
                                  verbosity=verbosity)
            scores = evaluator.compute_model_performance([regression_metric])
            assert scores[regression_metric.name] > .5
Esempio n. 9
0
  def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API."""
    splittype = "scaffold"
    featurizer = RDKitDescriptors()
    tasks = ["log-solubility"]
    task_types = {task: "regression" for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks, smiles_field=self.smiles_field,
                        featurizer=featurizer, verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    # Scaffold-split into train/test.
    train_dataset, test_dataset = ScaffoldSplitter().train_test_split(
        dataset, self.train_dir, self.test_dir)

    # Normalize/clip features and normalize labels, fitting statistics on
    # the training split only.
    transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset),
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    for split in (train_dataset, test_dataset):
      for transformer in transformers:
        transformer.transform(split)

    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    model = SklearnModel(RandomForestRegressor(), self.model_dir)

    # Fit and persist.
    model.fit(train_dataset)
    model.save()

    # Smoke-test evaluation on both splits.
    for split in (train_dataset, test_dataset):
      evaluator = Evaluator(model, split, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(regression_metrics)
Esempio n. 10
0
    def test_sklearn_regression_overfit(self):
        """Check that an sklearn regressor can overfit a tiny random dataset."""
        tasks = ["task0"]
        task_types = {task: "regression" for task in tasks}
        n_samples, n_features, n_tasks = 10, 3, len(tasks)

        # Random features and random regression targets.
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.rand(n_samples, n_tasks)
        w = np.ones((n_samples, n_tasks))
        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {"batch_size": None,
                        "data_shape": dataset.get_data_shape()}

        verbosity = "high"
        metric = Metric(metrics.r2_score, verbosity=verbosity)
        model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                             mode="regression",
                             model_instance=RandomForestRegressor())

        # Fit and persist.
        model.fit(dataset)
        model.save()

        # Training-set R^2 should be high for an overfit forest.
        evaluator = Evaluator(model, dataset, [], verbosity=verbosity)
        scores = evaluator.compute_model_performance([metric])
        assert scores[metric.name] > .7
Esempio n. 11
0
    def test_sklearn_reload(self):
        """Ensure a persisted sklearn model reloads and still scores well."""
        tasks = ["task0"]
        task_types = {task: "classification" for task in tasks}
        n_samples, n_features, n_tasks = 10, 3, len(tasks)

        # Random binary-classification data.
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))
        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "batch_size": None,
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
        model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                             mode="classification",
                             model_instance=RandomForestClassifier())

        # Train and write the model to disk.
        model.fit(dataset)
        model.save()

        # Reload the persisted model into a fresh wrapper.
        reloaded_model = SklearnModel(tasks, task_types, model_params,
                                      self.model_dir, mode="classification")
        reloaded_model.reload()

        # The reloaded model should still overfit the training data.
        evaluator = Evaluator(reloaded_model, dataset, [],
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([metric])
        assert scores[metric.name] > 0.9
Esempio n. 12
0
    def test_sklearn_classification(self):
        """Test that sklearn models can learn on simple classification datasets."""
        np.random.seed(123)
        digits = sklearn.datasets.load_digits(n_class=2)
        X, y = digits.data, digits.target

        # 70/30 train/test split.
        frac_train = .7
        n_train = int(frac_train * len(X))
        train_dataset = DiskDataset.from_numpy(self.train_dir, X[:n_train],
                                               y[:n_train])
        test_dataset = DiskDataset.from_numpy(self.test_dir, X[n_train:],
                                              y[n_train:])

        verbosity = "high"
        metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
        model = SklearnModel(LogisticRegression(), self.model_dir)

        # Fit and persist the model.
        model.fit(train_dataset)
        model.save()

        # Evaluate on train (smoke test only).
        train_evaluator = Evaluator(model, train_dataset, [],
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance([metric])

        # Evaluate on test; ROC-AUC should beat chance.
        evaluator = Evaluator(model, test_dataset, [], verbosity=verbosity)
        scores = evaluator.compute_model_performance([metric])
        assert scores[metric.name] > .5
Esempio n. 13
0
  def test_sklearn_skewed_classification_overfit(self):
    """Test sklearn models can overfit 0/1 datasets with few actives."""
    tasks = ["task0"]
    task_types = {task: "classification" for task in tasks}
    n_samples, n_features, n_tasks = 100, 3, len(tasks)

    # Heavily skewed labels: roughly 5% positives.
    np.random.seed(123)
    p = .05
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {"batch_size": None,
                    "data_shape": dataset.get_data_shape()}

    verbosity = "high"
    metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="classification",
                         model_instance=RandomForestClassifier())

    # Fit and persist.
    model.fit(dataset)
    model.save()

    # Even with few actives the forest should overfit its training data.
    evaluator = Evaluator(model, dataset, [], verbosity=verbosity)
    scores = evaluator.compute_model_performance([metric])
    assert scores[metric.name] > .9
Esempio n. 14
0
  def test_sklearn_regression_overfit(self):
    """Check that an sklearn regressor can overfit a tiny random dataset."""
    tasks = ["task0"]
    task_types = {task: "regression" for task in tasks}
    n_samples, n_features, n_tasks = 10, 3, len(tasks)

    # Random features and random regression targets.
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.rand(n_samples, n_tasks)
    w = np.ones((n_samples, n_tasks))
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {"batch_size": None,
                    "data_shape": dataset.get_data_shape()}

    verbosity = "high"
    metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())

    # Fit and persist.
    model.fit(dataset)
    model.save()

    # Training-set R^2 should be high for an overfit forest.
    evaluator = Evaluator(model, dataset, [], verbosity=verbosity)
    scores = evaluator.compute_model_performance([metric])
    assert scores[metric.name] > .7
Esempio n. 15
0
  # NOTE(review): fragment of a larger function — `fold`, `test_dataset`,
  # `n_pos`, `n_neg`, `n_trials`, `replace`, `model_dir`, `metric`, and
  # get_task_dataset_minus_support come from the enclosing scope, which is
  # not visible here.
  # Task-index range covered by this fold (computed but unused below —
  # presumably used later or leftover; verify against the full function).
  fold_tasks = range(fold * len(test_dataset.get_task_names()),
                     (fold+1) * len(test_dataset.get_task_names()))

  # Get supports on test-set
  support_generator = SupportGenerator(
      test_dataset, range(len(test_dataset.get_task_names())), n_pos, n_neg,
      n_trials, replace)

  # Compute accuracies
  # One score list per task index; each support trial appends one score.
  task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}
  for (task, support) in support_generator:
    # Train model on support
    # A fresh balanced random forest is trained per support set.
    sklearn_model = RandomForestClassifier(
        class_weight="balanced", n_estimators=50)
    model = SklearnModel(sklearn_model, model_dir)
    model.fit(support)

    # Test model
    # Evaluate on the task's remaining examples (support removed).
    task_dataset = get_task_dataset_minus_support(test_dataset, support, task)
    y_pred = model.predict_proba(task_dataset)
    score = metric.compute_metric(
        task_dataset.y, y_pred, task_dataset.w)
    #print("Score on task %s is %s" % (str(task), str(score)))
    task_scores[task].append(score)

  # Join information for all tasks.
  # Average the per-trial scores for each task and report for this fold.
  mean_task_scores = {}
  for task in range(len(test_dataset.get_task_names())):
    mean_task_scores[task] = np.mean(np.array(task_scores[task]))
  print("Fold %s" % str(fold))
  print(mean_task_scores)
Esempio n. 16
0
  def test_sklearn_transformed_regression(self):
    """Test that sklearn models can learn on simple transformed regression datasets.

    Normalizes/clips features and normalizes labels, then checks that a
    linear model reaches R^2 > 0.5 on both train and test splits.
    """
    np.random.seed(123)
    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target

    frac_train = .7
    n_samples = len(X)
    # BUG FIX: slice indices must be integers; indexing with the float
    # product frac_train * n_samples raises TypeError on Python 3 / modern
    # NumPy.
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    # Normalize/clip features and normalize labels with train-set stats.
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for transformer in transformers:
      transformer.transform(train_dataset)
    for transformer in transformers:
      transformer.transform(test_dataset)

    tasks = train_dataset.get_task_names()
    task_types = {task: "regression" for task in tasks}

    model_params = {
      "batch_size": None,
      "data_shape": train_dataset.get_data_shape()
    }

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=LinearRegression())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    train_evaluator = Evaluator(model, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [regression_metric])
    print("train_scores")
    print(train_scores)

    assert train_scores[regression_metric.name] > .5

    # Eval model on test
    # BUG FIX: pass the fitted transformers (previously reset to []) so the
    # test-set evaluation untransforms predictions the same way as the
    # train-set evaluation above and the sibling transformed-regression test.
    evaluator = Evaluator(model, test_dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])
    print("scores")
    print(scores)

    assert scores[regression_metric.name] > .5
Esempio n. 17
0
             columns=['prediction']).to_csv(modeldir + "train_original.csv")
pd.DataFrame(valid_dataset.y,
             columns=['prediction']).to_csv(modeldir + "valid_original.csv")

for estimator in n_estimators:

    print('n_estimators = {0}'.format(estimator))
    #Create model
    sklmodel = RandomForestRegressor(n_estimators=estimator,
                                     criterion="mse",
                                     max_features=max_features,
                                     bootstrap=True,
                                     oob_score=False,
                                     n_jobs=int(cpus / 2))
    model = SklearnModel(sklmodel, modeldir)
    model.fit(train_dataset)

    #Append trains cores and results
    train_scores = model.evaluate(
        train_dataset,
        [metric, dc.metrics.Metric(dc.metrics.mae_score)])
    train_results = np.concatenate(
        (train_results, list(train_scores.values())))
    valid_scores = model.evaluate(
        valid_dataset,
        [metric, dc.metrics.Metric(dc.metrics.mae_score)])
    test_results = np.concatenate((test_results, list(valid_scores.values())))

    #Append trains cores and results
    predict_train = pd.DataFrame(
        model.predict(train_dataset),
Esempio n. 18
0
    # NOTE(review): fragment of a larger function — `test_dataset`, `n_pos`,
    # `n_neg`, `n_trials`, `replace`, `model_dir`, `metric`, `fold`, and
    # get_task_dataset_minus_support come from the enclosing scope, which is
    # not visible here.
    # Get supports on test-set
    support_generator = SupportGenerator(
        test_dataset, range(len(test_dataset.get_task_names())), n_pos, n_neg,
        n_trials, replace)

    # Compute accuracies
    # One score list per task index; each support trial appends one score.
    task_scores = {
        task: []
        for task in range(len(test_dataset.get_task_names()))
    }
    for (task, support) in support_generator:
        # Train model on support
        # A fresh balanced random forest is trained per support set.
        sklearn_model = RandomForestClassifier(class_weight="balanced",
                                               n_estimators=50)
        model = SklearnModel(sklearn_model, model_dir)
        model.fit(support)

        # Test model
        # Evaluate on the task's remaining examples (support removed).
        task_dataset = get_task_dataset_minus_support(test_dataset, support,
                                                      task)
        y_pred = model.predict_proba(task_dataset)
        score = metric.compute_metric(task_dataset.y, y_pred, task_dataset.w)
        #print("Score on task %s is %s" % (str(task), str(score)))
        task_scores[task].append(score)

    # Join information for all tasks.
    # Average the per-trial scores for each task and report for this fold.
    mean_task_scores = {}
    for task in range(len(test_dataset.get_task_names())):
        mean_task_scores[task] = np.mean(np.array(task_scores[task]))
    print("Fold %s" % str(fold))
    print(mean_task_scores)