def test_tf_skewed_classification_overfit(self):
  """Test tensorflow models can overfit 0/1 datasets with few actives."""
  tasks = ["task0"]
  task_types = {task: "classification" for task in tasks}
  n_samples = 100
  n_features = 3
  n_tasks = len(tasks)
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  p = .05
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.binomial(1, p, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

  model_params = {
      "layer_sizes": [1500],
      "dropouts": [.0],
      "learning_rate": 0.003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_classification_tasks": 1,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [1.],
      "bias_init_consts": [1.],
      "nb_epoch": 200,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
  }

  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  model = TensorflowModel(
      tasks, task_types, model_params, self.model_dir,
      tf_class=TensorflowMultiTaskClassifier,
      verbosity=verbosity)

  # Fit trained model
  model.fit(dataset)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .8
def test_tf_multitask_regression_overfit(self):
  """Test tf multitask overfits tiny data."""
  n_tasks = 10
  tasks = ["task%d" % task for task in range(n_tasks)]
  task_types = {task: "regression" for task in tasks}
  n_samples = 10
  n_features = 3
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

  model_params = {
      "layer_sizes": [1000],
      "dropouts": [.0],
      "learning_rate": 0.0003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_regression_tasks": n_tasks,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [.1],
      "bias_init_consts": [1.],
      "nb_epoch": 100,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
  }

  verbosity = "high"
  regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
  model = TensorflowModel(
      tasks, task_types, model_params, self.model_dir,
      tf_class=TensorflowMultiTaskRegressor,
      verbosity=verbosity)

  # Fit trained model
  model.fit(dataset)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] > .9
def test_tf_reload(self):
  """Test that trained tensorflow models can be saved and reloaded."""
  n_samples = 10
  n_features = 3
  n_tasks = 1
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
  tensorflow_model = TensorflowMultiTaskClassifier(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      verbosity=verbosity)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(dataset)
  model.save()

  # Load trained model
  reloaded_tensorflow_model = TensorflowMultiTaskClassifier(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      verbosity=verbosity)
  reloaded_model = TensorflowModel(reloaded_tensorflow_model, self.model_dir)
  reloaded_model.reload()

  # Eval reloaded model on train
  transformers = []
  evaluator = Evaluator(reloaded_model, dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .6
def test_tf_reload(self):
  """Test that trained tensorflow models can be saved and reloaded."""
  tasks = ["task0"]
  task_types = {task: "classification" for task in tasks}
  n_samples = 10
  n_features = 3
  n_tasks = len(tasks)
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

  model_params = {
      "layer_sizes": [1000],
      "dropouts": [0.0],
      "learning_rate": 0.003,
      "momentum": 0.9,
      "batch_size": n_samples,
      "num_classification_tasks": 1,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [1.0],
      "bias_init_consts": [1.0],
      "nb_epoch": 100,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape(),
  }

  verbosity = "high"
  classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
  model = TensorflowModel(
      tasks, task_types, model_params, self.model_dir,
      tf_class=TensorflowMultiTaskClassifier,
      verbosity=verbosity)

  # Fit trained model
  model.fit(dataset)
  model.save()

  # Load trained model
  reloaded_model = TensorflowModel(
      tasks, task_types, model_params, self.model_dir,
      tf_class=TensorflowMultiTaskClassifier,
      verbosity=verbosity)
  reloaded_model.reload()
  assert reloaded_model.eval_model._restored_model

  # Eval reloaded model on train
  transformers = []
  evaluator = Evaluator(reloaded_model, dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > 0.9
def test_tf_regression_overfit(self):
  """Test that TensorFlow models can overfit simple regression datasets."""
  n_samples = 10
  n_features = 3
  n_tasks = 1

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  regression_metric = Metric(metrics.mean_squared_error, verbosity=verbosity)
  # TODO(rbharath): This breaks with optimizer="momentum". Why?
  tensorflow_model = TensorflowMultiTaskRegressor(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      learning_rate=0.003, weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)],
      batch_size=n_samples, verbosity=verbosity)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(dataset, nb_epoch=100)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] < .1
def test_tf_classification_overfit(self):
  """Test that tensorflow models can overfit simple classification datasets."""
  n_samples = 10
  n_features = 3
  n_tasks = 1
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
  tensorflow_model = TensorflowMultiTaskClassifier(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      learning_rate=0.0003, weight_init_stddevs=[.1],
      batch_size=n_samples, verbosity=verbosity)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(dataset, nb_epoch=100)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .9
def test_tf_multitask_regression_overfit(self):
  """Test tf multitask overfits tiny data."""
  n_tasks = 10
  n_samples = 10
  n_features = 3
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  regression_metric = Metric(metrics.mean_squared_error, verbosity=verbosity,
                             task_averager=np.mean, mode="regression")
  tensorflow_model = TensorflowMultiTaskRegressor(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      learning_rate=0.0003, weight_init_stddevs=[.1],
      batch_size=n_samples, verbosity=verbosity)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(dataset, nb_epoch=50)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] < .1
def test_tf_multitask_classification_overfit(self):
  """Test tf multitask overfits tiny data."""
  n_tasks = 10
  tasks = ["task%d" % task for task in range(n_tasks)]
  task_types = {task: "classification" for task in tasks}
  n_samples = 10
  n_features = 3
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

  model_params = {
      "layer_sizes": [1000],
      "dropouts": [.0],
      "learning_rate": 0.0003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_classification_tasks": n_tasks,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [.1],
      "bias_init_consts": [1.],
      "nb_epoch": 100,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
  }

  verbosity = "high"
  classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
  model = TensorflowModel(
      tasks, task_types, model_params, self.model_dir,
      tf_class=TensorflowMultiTaskClassifier,
      verbosity=verbosity)

  # Fit trained model
  model.fit(dataset)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .9
def test_tf_regression_overfit(self):
  """Test that TensorFlow models can overfit simple regression datasets."""
  tasks = ["task0"]
  task_types = {task: "regression" for task in tasks}
  n_samples = 10
  n_features = 3
  n_tasks = len(tasks)

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

  model_params = {
      "layer_sizes": [1000],
      "dropouts": [.0],
      "learning_rate": 0.003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_regression_tasks": 1,
      "num_features": n_features,
      "weight_init_stddevs": [np.sqrt(6) / np.sqrt(1000)],
      "bias_init_consts": [1.],
      "nb_epoch": 100,
      "penalty": 0.0,
      "optimizer": "momentum",
      "data_shape": dataset.get_data_shape()
  }

  verbosity = "high"
  regression_metric = Metric(metrics.mean_squared_error, verbosity=verbosity)
  model = TensorflowModel(
      tasks, task_types, model_params, self.model_dir,
      tf_class=TensorflowMultiTaskRegressor,
      verbosity=verbosity)

  # Fit trained model
  model.fit(dataset)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] < .1
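# Aside on the magic constant above: sqrt(6)/sqrt(1000) ~= 0.0775 looks
# like a Glorot/Xavier-style scale, sqrt(6/fan), with fan tied to the
# hidden width of 1000 set in layer_sizes. That reading is inferred from
# the numbers, not stated by the test.
import numpy as np
print(np.sqrt(6) / np.sqrt(1000))  # 0.07745..., i.e. sqrt(6/1000)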
def test_tf_skewed_missing_classification_overfit(self):
  """TF, skewed data, few actives

  Test tensorflow models overfit 0/1 datasets with missing data and few
  actives. This is intended to be as close to singletask MUV datasets as
  possible.
  """
  n_samples = 5120
  n_features = 6
  n_tasks = 1
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  p = .002
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.binomial(1, p, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))

  # Upweight the rare actives so they aren't swamped by the inactives.
  y_flat, w_flat = np.squeeze(y), np.squeeze(w)
  y_nonzero = y_flat[w_flat != 0]
  num_nonzero = np.count_nonzero(y_nonzero)
  weight_nonzero = len(y_nonzero) / num_nonzero
  w_flat[y_flat != 0] = weight_nonzero
  w = np.reshape(w_flat, (n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  tensorflow_model = TensorflowMultiTaskClassifier(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      learning_rate=0.003, weight_init_stddevs=[1.],
      batch_size=n_samples, verbosity=verbosity)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(dataset, nb_epoch=50)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .8
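# A minimal numpy-only sketch of the reweighting used above: each active
# gets weight N / n_actives while inactives keep weight 1, so the two
# classes end up with roughly equal total weight (actives sum to exactly
# N, inactives to N - n_actives). Variable names here are illustrative,
# not taken from the test.
import numpy as np

np.random.seed(123)
N, p = 5120, .002
y = np.random.binomial(1, p, size=N)
w = np.ones(N)
n_actives = np.count_nonzero(y)
w[y != 0] = float(N) / n_actives  # ~10 actives -> weight ~512 apiece
print(w[y != 0].sum(), w[y == 0].sum())  # both ~5120: roughly balanced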
def test_singletask_tf_mlp_ECFP_classification_API(self):
  """Straightforward test of Tensorflow singletask deepchem classification API."""
  n_features = 1024
  featurizer = CircularFingerprint(size=n_features)

  tasks = ["outcome"]
  input_file = os.path.join(self.current_dir, "example_classification.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  classification_metrics = [Metric(metrics.roc_auc_score),
                            Metric(metrics.matthews_corrcoef),
                            Metric(metrics.recall_score),
                            Metric(metrics.accuracy_score)]

  tensorflow_model = TensorflowMultiTaskClassifier(
      len(tasks), n_features, self.model_dir)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)
def test_singletask_tf_mlp_ECFP_classification_API(self):
  """Straightforward test of Tensorflow singletask deepchem classification API."""
  featurizer = CircularFingerprint(size=1024)
  tasks = ["outcome"]
  task_type = "classification"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example_classification.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  model_params = {
      "batch_size": 2,
      "num_classification_tasks": 1,
      "num_features": 1024,
      "layer_sizes": [1024],
      "weight_init_stddevs": [1.],
      "bias_init_consts": [0.],
      "dropouts": [.5],
      "num_classes": 2,
      "nb_epoch": 1,
      "penalty": 0.0,
      "optimizer": "adam",
      "learning_rate": .001,
      "data_shape": train_dataset.get_data_shape()
  }

  classification_metrics = [Metric(metrics.roc_auc_score),
                            Metric(metrics.matthews_corrcoef),
                            Metric(metrics.recall_score),
                            Metric(metrics.accuracy_score)]

  model = TensorflowModel(
      tasks, task_types, model_params, self.model_dir,
      tf_class=TensorflowMultiTaskClassifier)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)
ids_train, ids_valid = ids[:num_train], ids[num_train:]

train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                       w_train, ids_train, pdbbind_tasks)
valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                       w_valid, ids_valid, pdbbind_tasks)

classification_metric = Metric(metrics.pearson_r2_score, verbosity=verbosity,
                               mode="regression")

n_features = dataset.get_data_shape()[0]
tensorflow_model = TensorflowMultiTaskRegressor(
    len(pdbbind_tasks), n_features, model_dir, dropouts=[.25],
    learning_rate=0.0003, weight_init_stddevs=[.1],
    batch_size=64, verbosity=verbosity)
model = TensorflowModel(tensorflow_model, model_dir)

# Fit trained model
model.fit(train_dataset, nb_epoch=20)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers,
                            verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance(
    [classification_metric])
print("Train scores")
print(train_scores)

valid_evaluator = Evaluator(model, valid_dataset, transformers,
                            verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance(
    [classification_metric])
print("Validation scores")
def model_builder(model_params, model_dir):
  tensorflow_model = TensorflowMultiTaskClassifier(
      len(tasks), n_features, model_dir, **model_params)
  return TensorflowModel(tensorflow_model, model_dir)
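# The two-argument builder above is the shape DeepChem's grid-search
# driver of this era expects. Below is a hedged usage sketch, not code
# from this repo: the HyperparamOpt name/import and the hyperparam_search
# argument order are recalled from contemporaneous DeepChem and may differ
# by version; params_dict, the datasets, transformers and metric are
# placeholders standing in for objects built as in the tests above.
optimizer = HyperparamOpt(model_builder, verbosity="low")
best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, transformers, metric)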
"nesterov": False, "dropouts": (.5, .5), "nb_layers": 2, "batchnorm": False, "layer_sizes": (2000, 500), "weight_init_stddevs": (.1, .1), "bias_init_consts": (1., 1.), "num_classes": 2, "penalty": 0., "optimizer": "sgd", "num_classification_tasks": len(kinase_task_types) } model_dir = os.path.join(base_dir, 'model_2000x500_128_allKinase_081516') model = TensorflowModel(kinase_tasks, kinase_task_types, params_dict, model_dir, tf_class=TensorflowMultiTaskClassifier, verbosity='high') def classify(document): doc = document.strip().split('\r\n') mol = [Chem.MolFromSmiles(x) for x in doc if x is not None] fp = [ AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024) for x in mol if x is not None ] fp = np.array(fp) if len(fp.shape) == 1: fp = np.reshape(fp, (1, -1)) o = np.ones((fp.shape[0], len(kinase_tasks)))
"dropouts": [.25], "num_classification_tasks": len(muv_tasks), "num_classes": 2, "penalty": .0, "optimizer": "momentum", "learning_rate": .001, "momentum": .9, } if os.path.exists(model_dir): shutil.rmtree(model_dir) os.makedirs(model_dir) model = TensorflowModel(muv_tasks, muv_task_types, params_dict, model_dir, tf_class=TensorflowMultiTaskClassifier, verbosity=verbosity) # Fit trained model model.fit(train_dataset) model.save() train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity) train_scores = train_evaluator.compute_model_performance( [classification_metric]) print("Train scores")
# Load Tox21 dataset
n_features = 1024
tox21_tasks, tox21_datasets, transformers = load_tox21(data_dir, reload=False)

# Do train/valid split.
train_dataset, valid_dataset = tox21_datasets

# Fit models
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity, mode="classification")
tensorflow_model = TensorflowMultiTaskClassifier(
    len(tox21_tasks), n_features, model_dir, dropouts=[.25],
    learning_rate=0.0003, weight_init_stddevs=[1.],
    batch_size=32, verbosity=verbosity)
model = TensorflowModel(tensorflow_model, model_dir)

# Fit trained model
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers,
                            verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance(
    [classification_metric])
print("Train scores")
print(train_scores)

valid_evaluator = Evaluator(model, valid_dataset, transformers,
                            verbosity=verbosity)
def test_tf_skewed_missing_classification_overfit(self):
  """TF, skewed data, few actives

  Test tensorflow models overfit 0/1 datasets with missing data and few
  actives. This is intended to be as close to singletask MUV datasets as
  possible.
  """
  tasks = ["task0"]
  task_types = {task: "classification" for task in tasks}
  n_samples = 5120
  n_features = 6
  n_tasks = len(tasks)
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  p = .002
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.binomial(1, p, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))

  y_flat, w_flat = np.squeeze(y), np.squeeze(w)
  y_nonzero = y_flat[w_flat != 0]
  num_nonzero = np.count_nonzero(y_nonzero)
  weight_nonzero = len(y_nonzero) / num_nonzero
  w_flat[y_flat != 0] = weight_nonzero
  w = np.reshape(w_flat, (n_samples, n_tasks))
  dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

  model_params = {
      "layer_sizes": [1200],
      "dropouts": [.0],
      "learning_rate": 0.003,
      "momentum": .9,
      "batch_size": 75,
      "num_classification_tasks": 1,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [1.],
      "bias_init_consts": [1.],
      "nb_epoch": 250,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
  }

  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  model = TensorflowModel(
      tasks, task_types, model_params, self.model_dir,
      tf_class=TensorflowMultiTaskClassifier,
      verbosity=verbosity)

  # Fit trained model
  model.fit(dataset)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .8
"bias_init_consts": [1.], "dropouts": [.25], "num_classification_tasks": len(pcba_tasks), "num_classes": 2, "penalty": .0, "optimizer": "momentum", "learning_rate": .001, "momentum": .9, } if os.path.exists(model_dir): shutil.rmtree(model_dir) os.makedirs(model_dir) model = TensorflowModel(pcba_tasks, pcba_task_types, params_dict, model_dir, tf_class=TensorflowMultiTaskClassifier, verbosity=verbosity) # Fit trained model model.fit(train_dataset) model.save() train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity) train_scores = train_evaluator.compute_model_performance([classification_metric]) print("Train scores") print(train_scores) valid_evaluator = Evaluator(model, valid_dataset, transformers, verbosity=verbosity) valid_scores = valid_evaluator.compute_model_performance([classification_metric])
def model_builder(tasks, task_types, params_dict, logdir, verbosity=None):
  return TensorflowModel(
      tasks, task_types, params_dict, logdir,
      tf_class=TensorflowMultiTaskClassifier,
      verbosity=verbosity)
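# A hypothetical invocation of the old-style builder above; tasks,
# task_types and params_dict mirror the dicts constructed in the tests
# earlier in this section, train_dataset stands in for a fitted-on
# dataset, and the logdir path is made up.
tasks = ["task0"]
task_types = {task: "classification" for task in tasks}
model = model_builder(tasks, task_types, params_dict, "/tmp/tf_model",
                      verbosity="high")
model.fit(train_dataset)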