def test_sklearn_multitask_regression_overfit(self):
    """Test SKLearn singletask-to-multitask overfits tiny regression data."""
    num_tasks = 2
    tasks = ["task%d" % t for t in range(num_tasks)]
    num_samples, num_features = 10, 3

    # Build a tiny random regression dataset; a forest should memorize it.
    np.random.seed(123)
    ids = np.arange(num_samples)
    X = np.random.rand(num_samples, num_features)
    y = np.random.rand(num_samples, num_tasks)
    w = np.ones((num_samples, num_tasks))
    dataset = DiskDataset.from_numpy(self.train_dir, X, y, w, ids)

    verbosity = "high"
    regression_metric = Metric(
        metrics.r2_score, verbosity=verbosity, task_averager=np.mean)

    def model_builder(model_dir):
        # One random forest regressor per task.
        return SklearnModel(RandomForestRegressor(), model_dir)

    model = SingletaskToMultitask(tasks, model_builder, self.model_dir)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train; overfitting should push train R^2 high.
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])
    assert scores[regression_metric.name] > .7
def test_sklearn_multitask_classification_overfit(self):
    """Test SKLearn singletask-to-multitask overfits tiny data."""
    n_tasks = 10
    tasks = ["task%d" % t for t in range(n_tasks)]
    task_types = dict((task, "classification") for task in tasks)
    n_samples, n_features = 10, 3

    # Tiny random binary-label dataset with uniform weights.
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(2, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {
        "batch_size": None,
        "data_shape": dataset.get_data_shape()
    }
    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)

    def model_builder(tasks, task_types, model_params, model_dir,
                      verbosity=None):
        # One random forest classifier per task.
        return SklearnModel(tasks, task_types, model_params, model_dir,
                            mode="classification",
                            model_instance=RandomForestClassifier(),
                            verbosity=verbosity)

    model = SingletaskToMultitask(tasks, task_types, model_params,
                                  self.model_dir, model_builder,
                                  verbosity=verbosity)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train; memorization should yield near-perfect ROC-AUC.
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > .9
def test_sklearn_multitask_classification(self):
    """Test that sklearn models can learn on simple multitask classification."""
    np.random.seed(123)
    n_tasks = 4
    tasks = range(n_tasks)

    # Two-class digits data, with the single label replicated across tasks.
    digits = sklearn.datasets.load_digits(n_class=2)
    X, y = digits.data, digits.target
    y = np.hstack([np.reshape(y, (len(y), 1))] * n_tasks)

    # 70/30 train/test split.
    frac_train = .7
    n_train = int(frac_train * len(X))
    train_dataset = DiskDataset.from_numpy(
        self.train_dir, X[:n_train], y[:n_train])
    test_dataset = DiskDataset.from_numpy(
        self.test_dir, X[n_train:], y[n_train:])

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)

    def model_builder(model_dir):
        # One logistic regression per task.
        return SklearnModel(LogisticRegression(), model_dir)

    model = SingletaskToMultitask(tasks, model_builder, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [classification_metric])

    # Eval model on test; every task should beat random (ROC-AUC .5).
    evaluator = Evaluator(model, test_dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    for score in scores[classification_metric.name]:
        assert score > .5
def test_singletask_to_multitask_classification(self):
    """Smoke test: fit/evaluate SingletaskToMultitask on random data."""
    n_features = 10
    n_tasks = 17
    tasks = range(n_tasks)

    def make_random_dataset(n_rows, data_dir):
        # Random binary multitask dataset with uniform weights.
        X = np.random.rand(n_rows, n_features)
        y = np.random.randint(2, size=(n_rows, n_tasks))
        w = np.ones_like(y)
        ids = ["C"] * n_rows
        return DiskDataset.from_numpy(data_dir, X, y, w, ids)

    # Define train dataset
    train_dataset = make_random_dataset(100, self.train_dir)
    # Define test dataset
    test_dataset = make_random_dataset(10, self.test_dir)

    transformers = []
    classification_metrics = [Metric(metrics.roc_auc_score)]

    def model_builder(model_dir):
        # One logistic regression per task.
        return SklearnModel(LogisticRegression(), model_dir)

    multitask_model = SingletaskToMultitask(tasks, model_builder,
                                            self.model_dir)

    # Fit trained model
    multitask_model.fit(train_dataset)
    multitask_model.save()

    # Eval multitask_model on train, then on test.
    for split in (train_dataset, test_dataset):
        evaluator = Evaluator(multitask_model, split, transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
# Start from a clean model directory so no stale checkpoints are reused.
shutil.rmtree(model_dir)

def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    # Per-task builder: one balanced 500-tree random forest per Tox21 task.
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        model_instance=RandomForestClassifier(
                            class_weight="balanced",
                            n_estimators=500),
                        verbosity=verbosity)

# Wraps independent singletask models behind a multitask interface.
model = SingletaskToMultitask(tox21_tasks, tox21_task_types, params_dict,
                              model_dir, model_builder, verbosity=verbosity)

# Fit trained model
model.fit(train_dataset)
model.save()

# Score the fit on the training split.
train_evaluator = Evaluator(model, train_dataset, transformers,
                            verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance(
    [classification_metric])
print("Train scores")
def test_sklearn_multitask_classification(self):
    """Test that sklearn models can learn on simple multitask classification.

    Trains one LogisticRegression per task on two-class digits data (the
    single label column replicated across 4 tasks) and asserts every task
    beats random ROC-AUC on the held-out split.
    """
    np.random.seed(123)
    n_tasks = 4
    dataset = sklearn.datasets.load_digits(n_class=2)
    X, y = dataset.data, dataset.target
    y = np.reshape(y, (len(y), 1))
    y = np.hstack([y] * n_tasks)

    frac_train = .7
    n_samples = len(X)
    # BUG FIX: slice indices must be integers; `frac_train * n_samples` is a
    # float and `X[:frac_train * n_samples]` raises TypeError. Truncate once.
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    tasks = train_dataset.get_task_names()
    task_types = {task: "classification" for task in tasks}
    model_params = {
        "batch_size": None,
        "data_shape": train_dataset.get_data_shape()
    }
    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)

    def model_builder(tasks, task_types, model_params, model_dir,
                      verbosity=None):
        # One logistic regression per task.
        return SklearnModel(tasks, task_types, model_params, model_dir,
                            mode="classification",
                            model_instance=LogisticRegression(),
                            verbosity=verbosity)

    model = SingletaskToMultitask(tasks, task_types, model_params,
                                  self.model_dir, model_builder,
                                  verbosity=verbosity)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [classification_metric])
    print("train_scores")
    print(train_scores)

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    print("scores")
    print(scores)
    # Each per-task ROC-AUC must beat chance.
    for score in scores[classification_metric.name]:
        assert score > .5
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    # Per-task builder: balanced 500-tree random forest using all cores.
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        model_instance=RandomForestClassifier(
                            class_weight="balanced",
                            n_estimators=500,
                            n_jobs=-1),
                        verbosity=verbosity)

tox_model = SingletaskToMultitask(tox_tasks, tox_task_types, params_dict,
                                  tox_model_dir, model_builder,
                                  verbosity=verbosity)
# Restore previously trained per-task models from tox_model_dir rather
# than fitting from scratch.
tox_model.reload()

"""
Load sider models now
"""
base_sider_data_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_data"
sider_tasks, sider_dataset, sider_transformers = load_sider(
    base_sider_data_dir, reload=reload)

base_sider_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_analysis"
sider_train_dir = os.path.join(base_sider_dir, "train_dataset")
nci_tasks, nci_dataset, transformers = load_nci(base_dir)
(train_dataset, valid_dataset, test_dataset) = nci_dataset

# Mean ROC-AUC across tasks.
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")

def model_builder(model_dir):
    # NOTE(review): a RandomForestRegressor is paired with a ROC-AUC metric
    # in "classification" mode here — confirm whether RandomForestClassifier
    # was intended, or whether regression outputs are deliberately scored as
    # ranking scores.
    sklearn_model = RandomForestRegressor(n_estimators=500)
    return SklearnModel(sklearn_model, model_dir)

model = SingletaskToMultitask(nci_tasks, model_builder, model_dir)

# Fit trained model
model.fit(train_dataset)
model.save()

# Report performance on the training split.
train_evaluator = Evaluator(model, train_dataset, transformers,
                            verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance(
    [classification_metric])
print("Train scores")
print(train_scores)
def multitask_model_builder(model_params, model_dir):
    """Build a SingletaskToMultitask model of per-task random forests.

    Every task gets its own RandomForestClassifier configured from the
    same `model_params` hyperparameter dict.
    """
    def build_task_model(task_model_dir):
        # One identically-configured forest per task.
        return SklearnModel(RandomForestClassifier(**model_params),
                            task_model_dir)

    return SingletaskToMultitask(tasks, build_task_model, model_dir)
def test_singletask_to_multitask_classification(self):
    """Smoke test: fit/evaluate SingletaskToMultitask on random data.

    Builds random binary train/test datasets with 17 tasks and checks the
    per-task LogisticRegression wrapper fits and evaluates end to end.

    FIXES: removed unused locals (`splittype`, `compound_featurizers`,
    `complex_featurizers`, `input_file`) and renamed the inner builder's
    misleading fourth parameter `model_builder` -> `model_dir` (it receives
    the model directory, not a builder).
    """
    tasks = ["task%d" % i for i in range(17)]
    task_types = {task: "classification" for task in tasks}
    output_transformers = []
    n_features = 10
    n_tasks = len(tasks)
    # Define train dataset
    n_train = 100
    X_train = np.random.rand(n_train, n_features)
    y_train = np.random.randint(2, size=(n_train, n_tasks))
    w_train = np.ones_like(y_train)
    ids_train = ["C"] * n_train
    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train,
                                       w_train, ids_train, tasks)
    # Define test dataset
    n_test = 10
    X_test = np.random.rand(n_test, n_features)
    y_test = np.random.randint(2, size=(n_test, n_tasks))
    w_test = np.ones_like(y_test)
    ids_test = ["C"] * n_test
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test,
                                      w_test, ids_test, tasks)
    params_dict = {
        "batch_size": 32,
        "data_shape": train_dataset.get_data_shape()
    }
    classification_metrics = [Metric(metrics.roc_auc_score)]

    def model_builder(tasks, task_types, model_params, model_dir,
                      verbosity=None):
        # One logistic regression per task, written under model_dir.
        return SklearnModel(tasks, task_types, model_params, model_dir,
                            model_instance=LogisticRegression())

    multitask_model = SingletaskToMultitask(tasks, task_types, params_dict,
                                            self.model_dir, model_builder)
    # Fit trained model
    multitask_model.fit(train_dataset)
    multitask_model.save()
    # Eval multitask_model on train
    evaluator = Evaluator(multitask_model, train_dataset,
                          output_transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
    # Eval multitask_model on test
    evaluator = Evaluator(multitask_model, test_dataset,
                          output_transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
tox_tasks, (tox_train, tox_valid, tox_test), tox_transformers = dc.molnet.load_tox21()

# Mean ROC-AUC across tasks.
classification_metric = Metric(
    metrics.roc_auc_score, np.mean, mode="classification")

def model_builder(model_dir):
    # Balanced class weights compensate for label imbalance; 500 trees,
    # parallelized across all cores.
    sklearn_model = RandomForestClassifier(
        class_weight="balanced", n_estimators=500, n_jobs=-1)
    return dc.models.SklearnModel(sklearn_model, model_dir)

print(tox_train.get_task_names())
print(tox_tasks)

tox_model = SingletaskToMultitask(tox_tasks, model_builder)
tox_model.fit(tox_train)

# Load sider models now
sider_tasks, (
    sider_train, sider_valid,
    sider_test), sider_transformers = dc.molnet.load_sider(split="random")

# Same builder reused: one forest per SIDER task.
sider_model = SingletaskToMultitask(sider_tasks, model_builder)
sider_model.fit(sider_train)

# Load sweetlead dataset now. Pass in dataset object and appropriate
# transformers to predict functions
sweet_tasks, (sweet_dataset, _, _), sweet_transformers = dc.molnet.load_sweet()
(train_dataset, valid_dataset) = tox21_datasets # Fit models classification_metric = Metric(metrics.roc_auc_score, np.mean, verbosity=verbosity, mode="classification") def model_builder(model_dir): sklearn_model = RandomForestClassifier(class_weight="balanced", n_estimators=500) return SklearnModel(sklearn_model, model_dir) model = SingletaskToMultitask(tox21_tasks, model_builder, model_dir) # Fit trained model model.fit(train_dataset) model.save() train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity) train_scores = train_evaluator.compute_model_performance( [classification_metric]) print("Train scores") print(train_scores)
pcba_tasks, pcba_datasets, transformers = load_pcba()
(train_dataset, valid_dataset, test_dataset) = pcba_datasets

# Mean ROC-AUC across tasks.
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbose=is_verbose,
                               mode="classification")

def model_builder(model_dir):
    # Balanced class weights compensate for PCBA's heavy label imbalance.
    sklearn_model = RandomForestClassifier(class_weight="balanced",
                                           n_estimators=500)
    return SklearnModel(sklearn_model, model_dir)

model = SingletaskToMultitask(pcba_tasks, model_builder, model_dir)

# Fit trained model
model.fit(train_dataset)
model.save()

# Report performance on the training split.
train_evaluator = Evaluator(model, train_dataset, transformers,
                            verbose=is_verbose)
train_scores = train_evaluator.compute_model_performance(
    [classification_metric])
print("Train scores")
print(train_scores)
def multitask_model_builder(tasks, task_types, params_dict, logdir=None,
                            verbosity=None):
    # Adapter matching the builder signature expected by the caller.
    # NOTE(review): `logdir` and `verbosity` are accepted but ignored — the
    # model always writes to self.model_dir and uses the enclosing scope's
    # `model_builder`; confirm that is intentional.
    return SingletaskToMultitask(tasks, task_types, params_dict,
                                 self.model_dir, model_builder)