def train_valid_test_split(self, dataset, train_dir, valid_dir, test_dir,
                           frac_train=.8, frac_valid=.1, frac_test=.1,
                           seed=None, log_every_n=1000):
  # Obtain original X, y, and w arrays and shuffle
  X, y, w, ids = self.__randomize_arrays(dataset.to_numpy())
  (X_train, y_train, w_train, ids_train,
   X_test, y_test, w_test, ids_test) = self.__split(X, y, w, ids, frac_train)
  # Calculate the fraction of the held-out data that goes to valid
  valid_percentage = frac_valid / (frac_valid + frac_test)
  # Split the held-out data into valid and test
  (X_valid, y_valid, w_valid, ids_valid,
   X_test, y_test, w_test, ids_test) = self.__split(
       X_test, y_test, w_test, ids_test, valid_percentage)
  # Turn the arrays back into Dataset objects
  train_data = Dataset.from_numpy(train_dir, X_train, y_train, w_train,
                                  ids_train)
  valid_data = Dataset.from_numpy(valid_dir, X_valid, y_valid, w_valid,
                                  ids_valid)
  test_data = Dataset.from_numpy(test_dir, X_test, y_test, w_test, ids_test)
  return train_data, valid_data, test_data
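# A minimal, self-contained sketch (plain numpy; the helper name and toy
# numbers are illustrative, not part of this codebase) of the same two-stage
# split logic used above: carve off the training rows first, then divide the
# remainder between valid and test using frac_valid / (frac_valid + frac_test).
import numpy as np

def two_stage_split_indices(n_samples, frac_train=.8, frac_valid=.1,
                            frac_test=.1, seed=None):
  """Return (train, valid, test) index arrays for a shuffled split."""
  rng = np.random.RandomState(seed)
  perm = rng.permutation(n_samples)
  n_train = int(frac_train * n_samples)
  rest = perm[n_train:]
  # Fraction of the held-out rows that becomes the validation set
  valid_percentage = frac_valid / (frac_valid + frac_test)
  n_valid = int(valid_percentage * len(rest))
  return perm[:n_train], rest[:n_valid], rest[n_valid:]

# Example: 100 samples split 80/10/10
train_idx, valid_idx, test_idx = two_stage_split_indices(100, seed=0)
assert (len(train_idx), len(valid_idx), len(test_idx)) == (80, 10, 10)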
def test_sklearn_regression(self):
  """Test that sklearn models can learn on simple regression datasets."""
  np.random.seed(123)
  dataset = sklearn.datasets.load_diabetes()
  X, y = dataset.data, dataset.target
  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)  # slice indices must be integers
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
  test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)
  tasks = train_dataset.get_task_names()
  task_types = {task: "regression" for task in tasks}
  model_params = {
      "batch_size": None,
      "data_shape": train_dataset.get_data_shape()
  }
  verbosity = "high"
  regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="regression", model_instance=LinearRegression())
  # Fit trained model
  model.fit(train_dataset)
  model.save()
  # Eval model on train
  transformers = []
  train_evaluator = Evaluator(model, train_dataset, transformers,
                              verbosity=verbosity)
  train_scores = train_evaluator.compute_model_performance(
      [regression_metric])
  print("train_scores")
  print(train_scores)
  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  print("scores")
  print(scores)
  assert scores[regression_metric.name] > .5
def test_singletask_to_multitask_classification(self):
  splittype = "scaffold"
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = []
  output_transformers = []
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  task_types = {task: "classification" for task in tasks}
  input_file = "multitask_example.csv"
  n_features = 10
  n_tasks = len(tasks)
  # Define train dataset
  n_train = 100
  X_train = np.random.rand(n_train, n_features)
  y_train = np.random.randint(2, size=(n_train, n_tasks))
  w_train = np.ones_like(y_train)
  ids_train = ["C"] * n_train
  train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train,
                                     w_train, ids_train, tasks)
  # Define test dataset
  n_test = 10
  X_test = np.random.rand(n_test, n_features)
  y_test = np.random.randint(2, size=(n_test, n_tasks))
  w_test = np.ones_like(y_test)
  ids_test = ["C"] * n_test
  test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test,
                                    w_test, ids_test, tasks)
  params_dict = {
      "batch_size": 32,
      "data_shape": train_dataset.get_data_shape()
  }
  classification_metrics = [Metric(metrics.roc_auc_score)]

  # The fourth argument is the per-task model directory; name it model_dir
  # instead of shadowing the model_builder name.
  def model_builder(tasks, task_types, model_params, model_dir,
                    verbosity=None):
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        model_instance=LogisticRegression())

  multitask_model = SingletaskToMultitask(tasks, task_types, params_dict,
                                          self.model_dir, model_builder)
  # Fit trained model
  multitask_model.fit(train_dataset)
  multitask_model.save()
  # Eval multitask_model on train
  evaluator = Evaluator(multitask_model, train_dataset, output_transformers,
                        verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)
  # Eval multitask_model on test
  evaluator = Evaluator(multitask_model, test_dataset, output_transformers,
                        verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)
def test_sklearn_multitask_classification(self):
  """Test that sklearn models can learn on simple multitask classification."""
  np.random.seed(123)
  n_tasks = 4
  dataset = sklearn.datasets.load_digits(n_class=2)
  X, y = dataset.data, dataset.target
  y = np.reshape(y, (len(y), 1))
  y = np.hstack([y] * n_tasks)
  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)  # slice indices must be integers
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
  test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)
  tasks = train_dataset.get_task_names()
  task_types = {task: "classification" for task in tasks}
  model_params = {
      "batch_size": None,
      "data_shape": train_dataset.get_data_shape()
  }
  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)

  def model_builder(tasks, task_types, model_params, model_dir,
                    verbosity=None):
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        mode="classification",
                        model_instance=LogisticRegression(),
                        verbosity=verbosity)

  model = SingletaskToMultitask(tasks, task_types, model_params,
                                self.model_dir, model_builder,
                                verbosity=verbosity)
  # Fit trained model
  model.fit(train_dataset)
  model.save()
  # Eval model on train
  transformers = []
  train_evaluator = Evaluator(model, train_dataset, transformers,
                              verbosity=verbosity)
  train_scores = train_evaluator.compute_model_performance(
      [classification_metric])
  print("train_scores")
  print(train_scores)
  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  print("scores")
  print(scores)
  for score in scores[classification_metric.name]:
    assert score > .5
def test_sklearn_classification(self):
  """Test that sklearn models can learn on simple classification datasets."""
  np.random.seed(123)
  dataset = sklearn.datasets.load_digits(n_class=2)
  X, y = dataset.data, dataset.target
  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)  # slice indices must be integers
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  print("X_train.shape, y_train.shape, X_test.shape, y_test.shape")
  print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
  train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
  test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)
  tasks = train_dataset.get_task_names()
  task_types = {task: "classification" for task in tasks}
  model_params = {
      "batch_size": None,
      "data_shape": train_dataset.get_data_shape()
  }
  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="classification",
                       model_instance=LogisticRegression())
  # Fit trained model
  model.fit(train_dataset)
  model.save()
  # Eval model on train
  transformers = []
  train_evaluator = Evaluator(model, train_dataset, transformers,
                              verbosity=verbosity)
  train_scores = train_evaluator.compute_model_performance(
      [classification_metric])
  print("train_scores")
  print(train_scores)
  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  print("scores")
  print(scores)
  assert scores[classification_metric.name] > .5
def test_singletask_to_multitask_sklearn_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask_to_multitask."""
  splittype = "scaffold"
  output_transformers = []
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  task_types = {task: "classification" for task in tasks}
  input_file = "multitask_example.csv"
  n_features = 10
  n_tasks = len(tasks)
  # Define train dataset
  n_train = 100
  X_train = np.random.rand(n_train, n_features)
  y_train = np.random.randint(2, size=(n_train, n_tasks))
  w_train = np.ones_like(y_train)
  ids_train = ["C"] * n_train
  train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train,
                                     w_train, ids_train, tasks)
  # Define validation dataset
  n_valid = 10
  X_valid = np.random.rand(n_valid, n_features)
  y_valid = np.random.randint(2, size=(n_valid, n_tasks))
  w_valid = np.ones_like(y_valid)
  ids_valid = ["C"] * n_valid
  valid_dataset = Dataset.from_numpy(self.valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, tasks)
  params_dict = {
      "batch_size": [32],
      "data_shape": [train_dataset.get_data_shape()],
  }
  classification_metric = Metric(metrics.matthews_corrcoef, np.mean,
                                 mode="classification")

  def model_builder(tasks, task_types, model_params, task_model_dir,
                    verbosity=None):
    return SklearnModel(tasks, task_types, model_params, task_model_dir,
                        model_instance=LogisticRegression())

  def multitask_model_builder(tasks, task_types, params_dict, logdir=None,
                              verbosity=None):
    return SingletaskToMultitask(tasks, task_types, params_dict,
                                 self.model_dir, model_builder)

  optimizer = HyperparamOpt(multitask_model_builder, tasks, task_types,
                            verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      classification_metric, logdir=None)
def test_tf_skewed_classification_overfit(self):
  """Test tensorflow models can overfit 0/1 datasets with few actives."""
  tasks = ["task0"]
  task_types = {task: "classification" for task in tasks}
  n_samples = 100
  n_features = 3
  n_tasks = len(tasks)
  n_classes = 2
  # Generate dummy dataset
  np.random.seed(123)
  p = .05
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.binomial(1, p, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "layer_sizes": [1500],
      "dropouts": [.0],
      "learning_rate": 0.003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_classification_tasks": 1,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [1.],
      "bias_init_consts": [1.],
      "nb_epoch": 200,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  model = TensorflowModel(tasks, task_types, model_params, self.model_dir,
                          tf_class=TensorflowMultiTaskClassifier,
                          verbosity=verbosity)
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .8
def test_shuffle_shards(self):
  """Test that shuffle_shards works."""
  n_samples = 100
  n_tasks = 10
  n_features = 10
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.random.randint(2, size=(n_samples, n_tasks))
  ids = np.arange(n_samples)
  dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids)
  dataset.reshard(shard_size=10)
  dataset.shuffle_shards()
  X_s, y_s, w_s, ids_s = dataset.to_numpy()
  assert X_s.shape == X.shape
  assert y_s.shape == y.shape
  assert ids_s.shape == ids.shape
  assert w_s.shape == w.shape
  # The ids should now store the performed permutation. Check that the
  # original dataset is recoverable.
  for i in range(n_samples):
    np.testing.assert_array_equal(X_s[i], X[ids_s[i]])
    np.testing.assert_array_equal(y_s[i], y[ids_s[i]])
    np.testing.assert_array_equal(w_s[i], w[ids_s[i]])
    np.testing.assert_array_equal(ids_s[i], ids[ids_s[i]])
def test_to_singletask(self):
  """Test that to_singletask works."""
  num_datapoints = 100
  num_features = 10
  num_tasks = 10
  # Generate data
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.random.randint(2, size=(num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)
  dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")
  task_dirs = []
  try:
    for task in range(num_tasks):
      task_dirs.append(tempfile.mkdtemp())
    singletask_datasets = dataset.to_singletask(task_dirs)
    for task in range(num_tasks):
      singletask_dataset = singletask_datasets[task]
      X_task, y_task, w_task, ids_task = singletask_dataset.to_numpy()
      w_nonzero = w[:, task] != 0
      np.testing.assert_array_equal(X_task, X[w_nonzero])
      np.testing.assert_array_equal(y_task.flatten(), y[:, task][w_nonzero])
      np.testing.assert_array_equal(w_task.flatten(), w[:, task][w_nonzero])
      np.testing.assert_array_equal(ids_task, ids[w_nonzero])
  finally:
    # Cleanup
    for task_dir in task_dirs:
      shutil.rmtree(task_dir)
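# A numpy-only sketch (hypothetical toy data) of the per-task masking that
# to_singletask is expected to perform: for each task, keep only the rows
# whose weight is nonzero, i.e. the rows where that task is actually labeled.
import numpy as np

X = np.arange(12).reshape(4, 3)                   # 4 samples, 3 features
y = np.array([[1, 0], [0, 1], [1, 1], [0, 0]])    # 2 tasks
w = np.array([[1, 0], [1, 1], [0, 1], [1, 1]])    # 0 marks a missing label

for task in range(y.shape[1]):
  mask = w[:, task] != 0
  X_task, y_task = X[mask], y[mask, task]
  print("task %d keeps rows %s" % (task, np.flatnonzero(mask)))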
def test_shuffle_each_shard(self):
  """Test that shuffle_each_shard works."""
  n_samples = 100
  n_tasks = 10
  n_features = 10
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.random.randint(2, size=(n_samples, n_tasks))
  ids = np.arange(n_samples)
  dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids)
  dataset.reshard(shard_size=10)
  dataset.shuffle_each_shard()
  X_s, y_s, w_s, ids_s = dataset.to_numpy()
  assert X_s.shape == X.shape
  assert y_s.shape == y.shape
  assert ids_s.shape == ids.shape
  assert w_s.shape == w.shape
  # The ids should now store the performed permutation. Check that the
  # original dataset is recoverable.
  for i in range(n_samples):
    np.testing.assert_array_equal(X_s[i], X[ids_s[i]])
    np.testing.assert_array_equal(y_s[i], y[ids_s[i]])
    np.testing.assert_array_equal(w_s[i], w[ids_s[i]])
    np.testing.assert_array_equal(ids_s[i], ids[ids_s[i]])
def test_sklearn_multitask_classification_overfit(self):
  """Test SKLearn singletask-to-multitask overfits tiny data."""
  n_tasks = 10
  tasks = ["task%d" % task for task in range(n_tasks)]
  task_types = {task: "classification" for task in tasks}
  n_samples = 10
  n_features = 3
  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "batch_size": None,
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)

  def model_builder(tasks, task_types, model_params, model_dir,
                    verbosity=None):
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        mode="classification",
                        model_instance=RandomForestClassifier(),
                        verbosity=verbosity)

  model = SingletaskToMultitask(tasks, task_types, model_params,
                                self.model_dir, model_builder,
                                verbosity=verbosity)
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .9
def test_tf_multitask_classification_overfit(self):
  """Test tf multitask overfits tiny data."""
  n_tasks = 10
  tasks = ["task%d" % task for task in range(n_tasks)]
  task_types = {task: "classification" for task in tasks}
  n_samples = 10
  n_features = 3
  n_classes = 2
  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  #y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "layer_sizes": [1000],
      "dropouts": [.0],
      "learning_rate": 0.0003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_classification_tasks": n_tasks,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [.1],
      "bias_init_consts": [1.],
      "nb_epoch": 100,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
  model = TensorflowModel(tasks, task_types, model_params, self.model_dir,
                          tf_class=TensorflowMultiTaskClassifier,
                          verbosity=verbosity)
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .9
def load_pdbbind(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split."""
  # Set some global variables up top
  reload = True  # NOTE: overrides the reload argument above
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis. The base_dir holds the results of
  # all analysis.
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0, feature_types="voxel_combined",
      voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
                           "salt_bridge"],
      ecfp_power=9, splif_power=9, parallel=True, flatten=True)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]

  # Featurize Dataset
  features = []
  for pdb_code in ids:
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_feature(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    if len(computed_feature) == 0:
      # Fall back to an all-zero feature vector if featurization failed
      computed_feature = np.zeros(1024)
    features.append(computed_feature)
  X = np.vstack(features)
  w = np.ones_like(y)
  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  return tasks, dataset, transformers
def test_tf_reload(self):
  """Test that trained tensorflow models can be reloaded correctly."""
  tasks = ["task0"]
  task_types = {task: "classification" for task in tasks}
  n_samples = 10
  n_features = 3
  n_tasks = len(tasks)
  n_classes = 2
  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "layer_sizes": [1000],
      "dropouts": [0.0],
      "learning_rate": 0.003,
      "momentum": 0.9,
      "batch_size": n_samples,
      "num_classification_tasks": 1,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [1.0],
      "bias_init_consts": [1.0],
      "nb_epoch": 100,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape(),
  }
  verbosity = "high"
  classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
  model = TensorflowModel(tasks, task_types, model_params, self.model_dir,
                          tf_class=TensorflowMultiTaskClassifier,
                          verbosity=verbosity)
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Load trained model
  reloaded_model = TensorflowModel(tasks, task_types, model_params,
                                   self.model_dir,
                                   tf_class=TensorflowMultiTaskClassifier,
                                   verbosity=verbosity)
  reloaded_model.reload()
  assert reloaded_model.eval_model._restored_model
  # Eval model on train
  transformers = []
  evaluator = Evaluator(reloaded_model, dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > 0.9
def test_keras_skewed_classification_overfit(self):
  """Test keras models can overfit 0/1 datasets with few actives."""
  tasks = ["task0"]
  task_types = {task: "classification" for task in tasks}
  n_samples = 100
  n_features = 3
  n_tasks = len(tasks)
  # Generate dummy dataset
  np.random.seed(123)
  p = .05
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.binomial(1, p, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "nb_hidden": 1000,
      "activation": "relu",
      "dropout": .0,
      "learning_rate": .15,
      "momentum": .9,
      "nesterov": False,
      "decay": 1e-4,
      "batch_size": n_samples,
      "nb_epoch": 200,
      "init": "glorot_uniform",
      "nb_layers": 1,
      "batchnorm": False,
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir,
                       verbosity=verbosity)
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .9
def test_tf_regression_overfit(self):
  """Test that TensorFlow models can overfit simple regression datasets."""
  tasks = ["task0"]
  task_types = {task: "regression" for task in tasks}
  n_samples = 10
  n_features = 3
  n_tasks = len(tasks)
  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "layer_sizes": [1000],
      "dropouts": [.0],
      "learning_rate": 0.003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_regression_tasks": 1,
      "num_features": n_features,
      "weight_init_stddevs": [np.sqrt(6) / np.sqrt(1000)],
      "bias_init_consts": [1.],
      "nb_epoch": 100,
      "penalty": 0.0,
      "optimizer": "momentum",
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  regression_metric = Metric(metrics.mean_squared_error, verbosity=verbosity)
  model = TensorflowModel(tasks, task_types, model_params, self.model_dir,
                          tf_class=TensorflowMultiTaskRegressor,
                          verbosity=verbosity)
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] < .1
def test_keras_reload(self):
  """Test that trained keras models can be reloaded correctly."""
  g = tf.Graph()
  sess = tf.Session(graph=g)
  K.set_session(sess)
  with g.as_default():
    tasks = ["task0"]
    task_types = {task: "classification" for task in tasks}
    n_samples = 10
    n_features = 3
    n_tasks = len(tasks)
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(2, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
    model_params = {
        "nb_hidden": 1000,
        "activation": "relu",
        "dropout": 0.0,
        "learning_rate": 0.15,
        "momentum": 0.9,
        "nesterov": False,
        "decay": 1e-4,
        "batch_size": n_samples,
        "nb_epoch": 200,
        "init": "glorot_uniform",
        "nb_layers": 1,
        "batchnorm": False,
        "data_shape": dataset.get_data_shape(),
    }
    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score,
                                   verbosity=verbosity)
    model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir,
                         verbosity=verbosity)
    # Fit trained model
    model.fit(dataset)
    model.save()
    # Load trained model
    reloaded_model = MultiTaskDNN(tasks, task_types, model_params,
                                  self.model_dir, verbosity=verbosity)
    reloaded_model.reload()
    # Eval model on train
    transformers = []
    evaluator = Evaluator(reloaded_model, dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > 0.9
def test_keras_multitask_regression_overfit(self):
  """Test keras multitask overfits tiny data."""
  n_tasks = 10
  tasks = ["task%d" % task for task in range(n_tasks)]
  task_types = {task: "regression" for task in tasks}
  n_samples = 10
  n_features = 3
  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "nb_hidden": 1000,
      "activation": "relu",
      "dropout": .0,
      "learning_rate": .15,
      "momentum": .9,
      "nesterov": False,
      "decay": 1e-4,
      "batch_size": n_samples,
      "nb_epoch": 200,
      "init": "glorot_uniform",
      "nb_layers": 1,
      "batchnorm": False,
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
  model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir,
                       verbosity=verbosity)
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] > .9
def test_tf_multitask_regression_overfit(self):
  """Test tf multitask overfits tiny data."""
  n_tasks = 10
  tasks = ["task%d" % task for task in range(n_tasks)]
  task_types = {task: "regression" for task in tasks}
  n_samples = 10
  n_features = 3
  n_classes = 2
  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  #y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "layer_sizes": [1000],
      "dropouts": [.0],
      "learning_rate": 0.0003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_regression_tasks": n_tasks,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [.1],
      "bias_init_consts": [1.],
      "nb_epoch": 100,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
  model = TensorflowModel(tasks, task_types, model_params, self.model_dir,
                          tf_class=TensorflowMultiTaskRegressor,
                          verbosity=verbosity)
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] > .9
def test_sklearn_reload(self):
  """Test that trained model can be reloaded correctly."""
  tasks = ["task0"]
  task_types = {task: "classification" for task in tasks}
  n_samples = 10
  n_features = 3
  n_tasks = len(tasks)
  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "batch_size": None,
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="classification",
                       model_instance=RandomForestClassifier())
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Load trained model
  reloaded_model = SklearnModel(tasks, task_types, model_params,
                                self.model_dir, mode="classification")
  reloaded_model.reload()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(reloaded_model, dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .9
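# A generic sketch of the save/reload round-trip exercised above, written
# against plain sklearn + joblib. SklearnModel.save()/reload() presumably
# persists its model_instance in some comparable way; nothing here is the
# actual SklearnModel serialization code.
import os
import tempfile
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import joblib

X = np.random.rand(10, 3)
y = np.random.randint(2, size=10)
clf = RandomForestClassifier(n_estimators=10).fit(X, y)
path = os.path.join(tempfile.mkdtemp(), "model.joblib")
joblib.dump(clf, path)        # save
reloaded = joblib.load(path)  # reload
assert (clf.predict(X) == reloaded.predict(X)).all()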
def test_keras_multitask_regression_overfit(self):
  """Test keras multitask overfits tiny data."""
  g = tf.Graph()
  sess = tf.Session(graph=g)
  K.set_session(sess)
  with g.as_default():
    n_tasks = 10
    tasks = ["task%d" % task for task in range(n_tasks)]
    task_types = {task: "regression" for task in tasks}
    n_samples = 10
    n_features = 3
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(2, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
    model_params = {
        "nb_hidden": 1000,
        "activation": "relu",
        "dropout": .0,
        "learning_rate": .15,
        "momentum": .9,
        "nesterov": False,
        "decay": 1e-4,
        "batch_size": n_samples,
        "nb_epoch": 200,
        "init": "glorot_uniform",
        "nb_layers": 1,
        "batchnorm": False,
        "data_shape": dataset.get_data_shape()
    }
    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir,
                         verbosity=verbosity)
    # Fit trained model
    model.fit(dataset)
    model.save()
    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])
    assert scores[regression_metric.name] > .9
def test_multitask_data(self):
  """Test that data associated with a task stays associated with it."""
  tasks = ["task0", "task1"]
  n_samples = 100
  n_features = 3
  n_tasks = len(tasks)
  # Generate dummy dataset
  ids = np.array(["C"] * n_samples, dtype=object)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  X_out, y_out, w_out, _ = dataset.to_numpy()
  np.testing.assert_allclose(X, X_out)
  np.testing.assert_allclose(y, y_out)
  np.testing.assert_allclose(w, w_out)
def test_get_shape(self):
  """Test that get_shape works."""
  num_datapoints = 100
  num_features = 10
  num_tasks = 10
  # Generate data
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.random.randint(2, size=(num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)
  dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")
  X_shape, y_shape, w_shape, ids_shape = dataset.get_shape()
  assert X_shape == X.shape
  assert y_shape == y.shape
  assert w_shape == w.shape
  assert ids_shape == ids.shape
def test_sklearn_regression_overfit(self):
  """Test that sklearn models can overfit simple regression datasets."""
  tasks = ["task0"]
  task_types = {task: "regression" for task in tasks}
  n_samples = 10
  n_features = 3
  n_tasks = len(tasks)
  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.rand(n_samples, n_tasks)
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "batch_size": None,
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="regression",
                       model_instance=RandomForestRegressor())
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] > .7
def test_sklearn_multitask_regression_overfit(self):
  """Test SKLearn singletask-to-multitask overfits tiny regression data."""
  n_tasks = 2
  tasks = ["task%d" % task for task in range(n_tasks)]
  task_types = {task: "regression" for task in tasks}
  n_samples = 10
  n_features = 3
  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.rand(n_samples, n_tasks)
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "batch_size": None,
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  regression_metric = Metric(metrics.r2_score, verbosity=verbosity)

  def model_builder(tasks, task_types, model_params, model_dir,
                    verbosity=None):
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        mode="regression",
                        model_instance=RandomForestRegressor(),
                        verbosity=verbosity)

  model = SingletaskToMultitask(tasks, task_types, model_params,
                                self.model_dir, model_builder,
                                verbosity=verbosity)
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] > .7
def test_select(self):
  """Test that dataset select works."""
  num_datapoints = 10
  num_features = 10
  num_tasks = 1
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.ones((num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)
  dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids)
  select_dir = tempfile.mkdtemp()
  indices = [0, 4, 5, 8]
  select_dataset = dataset.select(select_dir, indices)
  X_sel, y_sel, w_sel, ids_sel = select_dataset.to_numpy()
  np.testing.assert_array_equal(X[indices], X_sel)
  np.testing.assert_array_equal(y[indices], y_sel)
  np.testing.assert_array_equal(w[indices], w_sel)
  np.testing.assert_array_equal(ids[indices], ids_sel)
  shutil.rmtree(select_dir)
def classify(document):
  # One SMILES string per line. Drop entries RDKit cannot parse so the
  # fingerprint rows stay aligned with the output columns.
  doc = document.strip().split('\r\n')
  mol_pairs = [(smiles, Chem.MolFromSmiles(smiles)) for smiles in doc]
  doc = [smiles for smiles, mol in mol_pairs if mol is not None]
  fp = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
        for smiles, mol in mol_pairs if mol is not None]
  fp = np.array(fp)
  if len(fp.shape) == 1:
    fp = np.reshape(fp, (1, -1))
  o = np.ones((fp.shape[0], len(kinase_tasks)))
  d = Dataset.from_numpy(data_dir, fp, o, tasks=kinase_tasks)
  # Drop the class-0 slice of predict_proba's output along axis 2, keeping
  # the probability of the active class for each task.
  y = np.squeeze(np.delete(model.predict_proba(d), 0, 2))
  if len(y.shape) == 1:
    y = np.reshape(y, (1, -1))
  yy = pd.DataFrame(y)
  yy.columns = kinase_tasks
  yy = yy.T
  yy.columns = doc
  yy.index.name = 'kinase'
  yy.to_csv(os.path.join(data_dir, 'pred.csv'))
  return doc, yy
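# The featurization step in isolation, as a minimal sketch (requires RDKit;
# the SMILES strings are arbitrary examples): each parseable molecule becomes
# a 1024-bit Morgan fingerprint of radius 2, matching the classify() code.
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

smiles = ["CCO", "c1ccccc1", "not-a-smiles"]
mols = [Chem.MolFromSmiles(s) for s in smiles]    # invalid SMILES -> None
fps = np.array([AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024)
                for m in mols if m is not None])
print(fps.shape)  # (2, 1024): the unparseable entry was dropped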
def test_sklearn_skewed_classification_overfit(self):
  """Test sklearn models can overfit 0/1 datasets with few actives."""
  tasks = ["task0"]
  task_types = {task: "classification" for task in tasks}
  n_samples = 100
  n_features = 3
  n_tasks = len(tasks)
  # Generate dummy dataset
  np.random.seed(123)
  p = .05
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.binomial(1, p, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "batch_size": None,
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="classification",
                       model_instance=RandomForestClassifier())
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .9
print("About to perform train/valid/test split.") num_train = .8 * len(dataset) X, y, w, ids = dataset.to_numpy() num_tasks = 120 pcba_tasks = pcba_tasks[:num_tasks] print("Using following tasks") print(pcba_tasks) X_train, X_valid = X[:num_train], X[num_train:] y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks] w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks] ids_train, ids_valid = ids[:num_train], ids[num_train:] if os.path.exists(train_dir): shutil.rmtree(train_dir) train_dataset = Dataset.from_numpy(train_dir, X_train, y_train, w_train, ids_train, pcba_tasks) if os.path.exists(valid_dir): shutil.rmtree(valid_dir) valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid, w_valid, ids_valid, pcba_tasks) # Fit Logistic Regression models pcba_task_types = {task: "classification" for task in pcba_tasks} classification_metric = Metric(metrics.roc_auc_score, np.mean, verbosity=verbosity, mode="classification") params_dict = { "batch_size": None,
def test_tf_skewed_missing_classification_overfit(self):
  """TF, skewed data, few actives

  Test tensorflow models overfit 0/1 datasets with missing data and few
  actives. This is intended to be as close to singletask MUV datasets as
  possible.
  """
  tasks = ["task0"]
  task_types = {task: "classification" for task in tasks}
  n_samples = 5120
  n_features = 6
  n_tasks = len(tasks)
  n_classes = 2
  # Generate dummy dataset
  np.random.seed(123)
  p = .002
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.binomial(1, p, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  # Upweight the rare actives: each active gets weight n_samples/n_actives,
  # so actives and inactives carry roughly equal total weight.
  y_flat, w_flat = np.squeeze(y), np.squeeze(w)
  y_nonzero = y_flat[w_flat != 0]
  num_nonzero = np.count_nonzero(y_nonzero)
  weight_nonzero = len(y_nonzero) / num_nonzero
  w_flat[y_flat != 0] = weight_nonzero
  w = np.reshape(w_flat, (n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  model_params = {
      "layer_sizes": [1200],
      "dropouts": [.0],
      "learning_rate": 0.003,
      "momentum": .9,
      "batch_size": 75,
      "num_classification_tasks": 1,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [1.],
      "bias_init_consts": [1.],
      "nb_epoch": 250,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
  }
  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  model = TensorflowModel(tasks, task_types, model_params, self.model_dir,
                          tf_class=TensorflowMultiTaskClassifier,
                          verbosity=verbosity)
  # Fit trained model
  model.fit(dataset)
  model.save()
  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .8
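# A quick self-contained check (plain numpy) of the reweighting rule used
# above: giving each active a weight of n_samples / n_actives makes the total
# weight on actives roughly equal to the total weight on inactives.
import numpy as np

np.random.seed(0)
y = np.random.binomial(1, .002, size=5120).astype(float)
w = np.ones_like(y)
w[y != 0] = len(y) / np.count_nonzero(y)
print(w[y != 0].sum(), w[y == 0].sum())  # ~5120 vs ~5110: roughly balanced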
tox21_tasks, tox21_dataset, transformers = load_tox21(data_dir, reload=reload)
num_train = 7200
X, y, w, ids = tox21_dataset.to_numpy()
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train], y[num_train:]
w_train, w_valid = w[:num_train], w[num_train:]
ids_train, ids_valid = ids[:num_train], ids[num_train:]
# Not sure if we need to constantly delete these directories...
if os.path.exists(train_dir):
  shutil.rmtree(train_dir)
train_dataset = Dataset.from_numpy(train_dir, X_train, y_train, w_train,
                                   ids_train, tox21_tasks,
                                   verbosity=verbosity)
if os.path.exists(valid_dir):
  shutil.rmtree(valid_dir)
valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid, w_valid,
                                   ids_valid, tox21_tasks,
                                   verbosity=verbosity)
# Fit models
tox21_task_types = {task: "classification" for task in tox21_tasks}
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split."""
  # Set some global variables up top
  reload = True  # NOTE: overrides the reload argument above
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis. The base_dir holds the results of
  # all analysis.
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0, feature_types="voxel_combined",
      # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
      # causes segfaults.
      #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      #                     "salt_bridge"],
      voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
      ecfp_power=9, splif_power=9, parallel=True, flatten=True,
      verbosity=verbosity)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]

  # Featurize Dataset
  features = []
  feature_len = None
  y_inds = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_grid_feature(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    if feature_len is None:
      feature_len = len(computed_feature)
    if len(computed_feature) != feature_len:
      print("Featurization failed for %s!" % pdb_code)
      continue
    y_inds.append(ind)
    features.append(computed_feature)
  # Keep labels and ids only for the successfully featurized complexes
  y = y[y_inds]
  ids = ids[y_inds]
  X = np.vstack(features)
  w = np.ones_like(y)
  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  return tasks, dataset, transformers
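# A small numpy sketch of why the feature_len bookkeeping above matters:
# np.vstack requires every feature vector to have the same length, so
# malformed vectors must be dropped together with their labels and ids.
# The toy data here is hypothetical.
import numpy as np

features = [np.ones(4), np.ones(2), np.ones(4)]   # second vector is malformed
y = np.array([1.0, 2.0, 3.0])
keep = [i for i, f in enumerate(features) if len(f) == len(features[0])]
X = np.vstack([features[i] for i in keep])
y = y[keep]
print(X.shape, y.shape)  # (2, 4) (2,)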
print("About to perform train/valid/test split.") num_train = .8 * len(dataset) X, y, w, ids = dataset.to_numpy() num_tasks = 120 pcba_tasks = pcba_tasks[:num_tasks] print("Using following tasks") print(pcba_tasks) X_train, X_valid = X[:num_train], X[num_train:] y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks] w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks] ids_train, ids_valid = ids[:num_train], ids[num_train:] if os.path.exists(train_dir): shutil.rmtree(train_dir) train_dataset = Dataset.from_numpy(train_dir, X_train, y_train, w_train, ids_train, pcba_tasks) if os.path.exists(valid_dir): shutil.rmtree(valid_dir) valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid, w_valid, ids_valid, pcba_tasks) # Fit Logistic Regression models pcba_task_types = {task: "classification" for task in pcba_tasks} classification_metric = Metric(metrics.roc_auc_score, np.mean, verbosity=verbosity, mode="classification") params_dict = { "batch_size": 64,
def test_sklearn_transformed_regression(self):
  """Test that sklearn models can learn on simple transformed regression
  datasets."""
  np.random.seed(123)
  dataset = sklearn.datasets.load_diabetes()
  X, y = dataset.data, dataset.target
  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)  # slice indices must be integers
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
  test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

  # Transform datasets: standardize and clip X, standardize y. Statistics
  # are fit on the training split, then applied to both splits.
  input_transformers = [
      NormalizationTransformer(transform_X=True, dataset=train_dataset),
      ClippingTransformer(transform_X=True, dataset=train_dataset)]
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  for transformer in transformers:
    transformer.transform(train_dataset)
  for transformer in transformers:
    transformer.transform(test_dataset)
  tasks = train_dataset.get_task_names()
  task_types = {task: "regression" for task in tasks}
  model_params = {
      "batch_size": None,
      "data_shape": train_dataset.get_data_shape()
  }
  verbosity = "high"
  regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="regression", model_instance=LinearRegression())
  # Fit trained model
  model.fit(train_dataset)
  model.save()
  # Eval model on train
  train_evaluator = Evaluator(model, train_dataset, transformers,
                              verbosity=verbosity)
  train_scores = train_evaluator.compute_model_performance(
      [regression_metric])
  print("train_scores")
  print(train_scores)
  assert train_scores[regression_metric.name] > .5
  # Eval model on test
  transformers = []
  evaluator = Evaluator(model, test_dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  print("scores")
  print(scores)
  assert scores[regression_metric.name] > .5
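# A plain-numpy sketch of the transformer pattern tested above (the helper
# name is illustrative; NormalizationTransformer/ClippingTransformer are
# presumably comparable): statistics are fit on the training split only,
# then applied to both splits, with clipping after standardization.
import numpy as np

def fit_standardizer(X_train, clip=5.0):
  mu, sigma = X_train.mean(axis=0), X_train.std(axis=0)
  def transform(X):
    return np.clip((X - mu) / sigma, -clip, clip)
  return transform

np.random.seed(0)
X_train, X_test = np.random.randn(70, 3), 10 * np.random.randn(30, 3)
transform = fit_standardizer(X_train)
X_train_t, X_test_t = transform(X_train), transform(X_test)
print(X_train_t.mean(axis=0).round(2))  # ~0 on the split the stats came from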
model_dir = os.path.join(base_dir, "model")
print("About to perform train/valid/test split.")
num_train = int(.8 * len(dataset))  # slice indices must be integers
X, y, w, ids = dataset.to_numpy()
num_tasks = 17
muv_tasks = muv_tasks[:num_tasks]
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
ids_train, ids_valid = ids[:num_train], ids[num_train:]
if os.path.exists(train_dir):
  shutil.rmtree(train_dir)
train_dataset = Dataset.from_numpy(train_dir, X_train, y_train, w_train,
                                   ids_train, muv_tasks, verbosity=verbosity)
if os.path.exists(valid_dir):
  shutil.rmtree(valid_dir)
valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid, w_valid,
                                   ids_valid, muv_tasks, verbosity=verbosity)
# Fit Logistic Regression models
muv_task_types = {task: "classification" for task in muv_tasks}
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity, mode="classification")