def scaffold_test_train_test_split(self):
  """Test ScaffoldSplitter train/test split on the example dataset.

  Featurizes example.csv with 1024-bit circular fingerprints, then checks
  that a scaffold-based train/test split yields an 8/2 partition.
  """
  tasks = ["log-solubility"]
  input_file = os.path.join(self.current_dir, "example.csv")
  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  # Splits featurized samples into train/test
  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)
  assert len(train_dataset) == 8
  assert len(test_dataset) == 2
def test_singletask_scaffold_k_fold_split(self):
  """
  Test singletask ScaffoldSplitter class.
  """
  solubility_dataset = self.load_solubility_data()
  scaffold_splitter = ScaffoldSplitter()
  ids_set = set(solubility_dataset.ids)

  K = 5
  fold_dirs = [tempfile.mkdtemp() for _ in range(K)]
  fold_datasets = scaffold_splitter.k_fold_split(solubility_dataset, fold_dirs)

  for fold_index, fold_dataset in enumerate(fold_datasets):
    # Each of the K folds should hold 10/K == 2 compounds.
    assert len(fold_dataset) == 2
    # Compounds in this fold must come from the original dataset.
    fold_ids_set = set(fold_dataset.ids)
    assert fold_ids_set.issubset(ids_set)
    # Folds must be pairwise disjoint.
    for other_index, other_fold_dataset in enumerate(fold_datasets):
      if fold_index == other_index:
        continue
      assert fold_ids_set.isdisjoint(set(other_fold_dataset.ids))

  # Merging the folds back together must reproduce the original dataset.
  merge_dir = tempfile.mkdtemp()
  merged_dataset = DiskDataset.merge(merge_dir, fold_datasets)
  assert len(merged_dataset) == len(solubility_dataset)
  assert sorted(merged_dataset.ids) == sorted(solubility_dataset.ids)
def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of Tensorflow multitask deepchem classification API."""
  task_type = "classification"
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  task_types = {task: task_type for task in tasks}

  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  # Single-point grid: every hyperparameter has exactly one candidate, so the
  # search just exercises the hyperparam-opt machinery end to end.
  params_dict = {"activation": ["relu"],
                 "momentum": [.9],
                 "batch_size": [50],
                 "init": ["glorot_uniform"],
                 "data_shape": [train_dataset.get_data_shape()],
                 "learning_rate": [1e-3],
                 "decay": [1e-6],
                 "nb_hidden": [1000],
                 "nb_epoch": [1],
                 "nesterov": [False],
                 "dropouts": [(.5,)],
                 "nb_layers": [1],
                 "batchnorm": [False],
                 "layer_sizes": [(1000,)],
                 "weight_init_stddevs": [(.1,)],
                 "bias_init_consts": [(1.,)],
                 "num_classes": [2],
                 "penalty": [0.],
                 "optimizer": ["sgd"],
                 "num_classification_tasks": [len(task_types)]}

  def model_builder(tasks, task_types, params_dict, logdir, verbosity=None):
    # Wrap the raw Tensorflow graph class in the generic TensorflowModel API.
    return TensorflowModel(tasks, task_types, params_dict, logdir,
                           tf_class=TensorflowMultiTaskClassifier,
                           verbosity=verbosity)

  optimizer = HyperparamOpt(model_builder, tasks, task_types, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_singletask_sklearn_rf_ECFP_regression_API(self):
  """Test of singletask RF ECFP regression API."""
  featurizer = CircularFingerprint(size=1024)
  model_params = {}
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)
  ]
  # NOTE(review): unlike sibling tests, these transformers are never applied
  # to the datasets here; they are only handed to Evaluator — confirm this is
  # intentional.
  transformers = input_transformers + output_transformers
  model_params["data_shape"] = train_dataset.get_data_shape()
  regression_metrics = [
      Metric(metrics.r2_score),
      Metric(metrics.mean_squared_error),
      Metric(metrics.mean_absolute_error)
  ]

  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="regression",
                       model_instance=RandomForestRegressor())

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)
def test_multitask_keras_mlp_ECFP_classification_API(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  g = tf.Graph()
  sess = tf.Session(graph=g)
  K.set_session(sess)
  with g.as_default():
    task_type = "classification"
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    # TODO(rbharath): Turning off dropout to make tests behave.
    model_params = {"nb_hidden": 10,
                    "activation": "relu",
                    "dropout": .0,
                    "learning_rate": .01,
                    "momentum": .9,
                    "nesterov": False,
                    "decay": 1e-4,
                    "batch_size": 5,
                    "nb_epoch": 2,
                    "init": "glorot_uniform",
                    "nb_layers": 1,
                    "batchnorm": False}

    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}
    featurizer = CircularFingerprint(size=1024)
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    transformers = []
    model_params["data_shape"] = train_dataset.get_data_shape()
    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]

    model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
  """Test of singletask RF RDKIT-descriptor regression API."""
  featurizer = RDKitDescriptors()
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  model_params = {}
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  # Descriptor features get normalized and clipped; labels get normalized.
  input_transformers = [
      NormalizationTransformer(transform_X=True, dataset=train_dataset),
      ClippingTransformer(transform_X=True, dataset=train_dataset)]
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  # Apply the fitted transformers to both splits in place.
  for split_dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(split_dataset)

  model_params["data_shape"] = train_dataset.get_data_shape()
  regression_metrics = [Metric(metrics.r2_score),
                        Metric(metrics.mean_squared_error),
                        Metric(metrics.mean_absolute_error)]

  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="regression",
                       model_instance=RandomForestRegressor())

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)
def test_singletask_scaffold_split(self):
  """
  Test singletask ScaffoldSplitter class.
  """
  solubility_dataset = self.load_solubility_data()
  scaffold_splitter = ScaffoldSplitter()
  # An 80/10/10 split of the 10-compound dataset should give 8/1/1.
  train_data, valid_data, test_data = scaffold_splitter.train_valid_test_split(
      solubility_dataset,
      self.train_dir,
      self.valid_dir,
      self.test_dir,
      frac_train=0.8,
      frac_valid=0.1,
      frac_test=0.1)
  assert len(train_data) == 8
  assert len(valid_data) == 1
  assert len(test_data) == 1
def test_multitask_scaffold_split(self):
  """
  Test multitask ScaffoldSplitter class.
  """
  multitask_dataset = self.load_multitask_data()
  scaffold_splitter = ScaffoldSplitter()
  # An 80/10/10 split of the 10-compound dataset should give 8/1/1.
  train_data, valid_data, test_data = scaffold_splitter.train_valid_test_split(
      multitask_dataset,
      self.train_dir,
      self.valid_dir,
      self.test_dir,
      frac_train=0.8,
      frac_valid=0.1,
      frac_test=0.1)
  assert len(train_data) == 8
  assert len(valid_data) == 1
  assert len(test_data) == 1
def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None):
  """Loads or reloads a small version of MUV dataset."""
  # NOTE(review): the `reload` parameter is not used in this body — confirm
  # whether callers still rely on it.
  # Load MUV dataset
  raw_dataset = load_from_disk(dataset_file)
  print("Number of examples in dataset: %s" % str(raw_dataset.shape[0]))

  print("About to featurize compounds")
  featurizer = CircularFingerprint(size=1024)
  MUV_tasks = ['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
               'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
               'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
               'MUV-466', 'MUV-832']
  loader = DataLoader(tasks=MUV_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  featurized = loader.featurize(dataset_file, self.data_dir)
  assert len(featurized) == len(raw_dataset)

  print("About to split compounds into train/valid/test")
  splitter = ScaffoldSplitter(verbosity=verbosity)
  frac_train, frac_valid, frac_test = .8, .1, .1
  train_dataset, valid_dataset, test_dataset = \
      splitter.train_valid_test_split(
          featurized, self.train_dir, self.valid_dir, self.test_dir,
          log_every_n=1000, frac_train=frac_train,
          frac_test=frac_test, frac_valid=frac_valid)
  # Splits are sometimes slightly off the requested fractions, so compare
  # approximately rather than exactly.
  for subset, frac in ((train_dataset, frac_train),
                       (valid_dataset, frac_valid),
                       (test_dataset, frac_test)):
    assert relative_difference(len(subset), frac * len(featurized)) < 1e-3

  # TODO(rbharath): Transformers don't play nice with reload! Namely,
  # reloading will cause the transform to be reapplied. This is undesirable in
  # almost all cases. Need to understand a method to fix this.
  transformers = [
      BalancingTransformer(transform_w=True, dataset=train_dataset)
  ]
  print("Transforming datasets")
  for subset in [train_dataset, valid_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(subset)

  return (len(train_dataset), len(valid_dataset), len(test_dataset))
def test_singletask_sklearn_rf_ECFP_regression_sharded_API(self):
  """Test of singletask RF ECFP regression API: sharded edition."""
  featurizer = CircularFingerprint(size=1024)
  model_params = {}
  tasks = ["label"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(
      self.current_dir, "../../../datasets/pdbbind_core_df.pkl.gz")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  for split_dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(split_dataset)

  # NOTE(review): the original comment claimed a shard size was set above to
  # force multiple shards (pdbbind_core has ~200 examples), but no shard size
  # is passed anywhere here — confirm whether the loader's default sharding
  # still exercises the multi-shard path.
  model_params["data_shape"] = train_dataset.get_data_shape()
  regression_metrics = [Metric(metrics.r2_score),
                        Metric(metrics.mean_squared_error),
                        Metric(metrics.mean_absolute_error)]

  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="regression",
                       model_instance=RandomForestRegressor())

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)
def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None):
  """Loads or reloads a small version of MUV dataset."""
  # Load MUV dataset
  raw_dataset = load_from_disk(dataset_file)
  print("Number of examples in dataset: %s" % str(raw_dataset.shape[0]))

  print("About to featurize compounds")
  featurizer = CircularFingerprint(size=1024)
  MUV_tasks = ['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
               'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
               'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
               'MUV-466', 'MUV-832']
  loader = DataLoader(tasks=MUV_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  featurized = loader.featurize(dataset_file, self.data_dir)
  assert len(featurized) == len(raw_dataset)

  print("About to split compounds into train/valid/test")
  splitter = ScaffoldSplitter(verbosity=verbosity)
  frac_train, frac_valid, frac_test = .8, .1, .1
  train_dataset, valid_dataset, test_dataset = \
      splitter.train_valid_test_split(
          featurized, self.train_dir, self.valid_dir, self.test_dir,
          log_every_n=1000, frac_train=frac_train,
          frac_test=frac_test, frac_valid=frac_valid)
  # Splits are sometimes slightly off the requested fractions, so compare
  # approximately rather than exactly.
  for subset, frac in ((train_dataset, frac_train),
                       (valid_dataset, frac_valid),
                       (test_dataset, frac_test)):
    assert relative_difference(len(subset), frac * len(featurized)) < 1e-3

  # TODO(rbharath): Transformers don't play nice with reload! Namely,
  # reloading will cause the transform to be reapplied. This is undesirable in
  # almost all cases. Need to understand a method to fix this.
  transformers = [
      BalancingTransformer(transform_w=True, dataset=train_dataset)]
  print("Transforming datasets")
  for subset in [train_dataset, valid_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(subset)

  return (len(train_dataset), len(valid_dataset), len(test_dataset))
def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
  """Test of singletask RF RDKIT-descriptor regression API."""
  featurizer = RDKitDescriptors()
  tasks = ["log-solubility"]
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  # Descriptor features get normalized and clipped; labels get normalized.
  input_transformers = [
      NormalizationTransformer(transform_X=True, dataset=train_dataset),
      ClippingTransformer(transform_X=True, dataset=train_dataset)]
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  # Apply the fitted transformers to both splits in place.
  for split_dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(split_dataset)

  regression_metrics = [Metric(metrics.r2_score),
                        Metric(metrics.mean_squared_error),
                        Metric(metrics.mean_absolute_error)]

  sklearn_model = RandomForestRegressor()
  model = SklearnModel(sklearn_model, self.model_dir)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)
def test_multitask_order(self):
  """Test that order of tasks in multitask datasets is preserved."""
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  # Task ordering must survive featurization and splitting.
  assert train_dataset.get_task_names() == tasks
  assert test_dataset.get_task_names() == tasks
def test_singletask_tf_mlp_ECFP_classification_API(self):
  """Straightforward test of Tensorflow singletask deepchem classification API."""
  n_features = 1024
  featurizer = CircularFingerprint(size=n_features)
  tasks = ["outcome"]
  input_file = os.path.join(self.current_dir, "example_classification.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  for split_dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(split_dataset)

  classification_metrics = [Metric(metrics.roc_auc_score),
                            Metric(metrics.matthews_corrcoef),
                            Metric(metrics.recall_score),
                            Metric(metrics.accuracy_score)]
  tensorflow_model = TensorflowMultiTaskClassifier(
      len(tasks), n_features, self.model_dir)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)
def test_multitask_keras_mlp_ECFP_classification_API(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  g = tf.Graph()
  sess = tf.Session(graph=g)
  K.set_session(sess)
  with g.as_default():
    task_type = "classification"
    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    n_features = 1024
    featurizer = CircularFingerprint(size=n_features)
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    transformers = []
    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]
    # Dropout disabled to keep the tiny test run deterministic-ish.
    keras_model = MultiTaskDNN(len(tasks), n_features, "classification",
                               dropout=0.)
    model = KerasModel(keras_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  task_type = "classification"
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  n_features = 1024
  featurizer = CircularFingerprint(size=n_features)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  # Two-point grid over hidden-layer width.
  params_dict = {"n_hidden": [5, 10]}

  def model_builder(model_params, model_dir):
    keras_model = MultiTaskDNN(len(tasks), n_features, task_type,
                               dropout=0., **model_params)
    return KerasModel(keras_model, model_dir)

  optimizer = HyperparamOpt(model_builder, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  task_type = "classification"
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  task_types = {task: task_type for task in tasks}
  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  # Only "nb_hidden" has two candidates; every other hyperparameter is fixed.
  params_dict = {"nb_hidden": [5, 10],
                 "activation": ["relu"],
                 "dropout": [.5],
                 "learning_rate": [.01],
                 "momentum": [.9],
                 "nesterov": [False],
                 "decay": [1e-4],
                 "batch_size": [5],
                 "nb_epoch": [2],
                 "init": ["glorot_uniform"],
                 "nb_layers": [1],
                 "batchnorm": [False],
                 "data_shape": [train_dataset.get_data_shape()]}

  optimizer = HyperparamOpt(MultiTaskDNN, tasks, task_types, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_multitask_order(self):
  """Test that order of tasks in multitask datasets is preserved."""
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  # Task ordering must survive featurization and splitting.
  assert train_dataset.get_task_names() == tasks
  assert test_dataset.get_task_names() == tasks
def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask RF ECFP regression API."""
  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)
  ]
  # NOTE(review): valid_dataset is used for the search below but only train
  # and test are transformed here — confirm this is intentional.
  for split_dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(split_dataset)

  params_dict = {"n_estimators": [10, 100]}
  metric = Metric(metrics.r2_score)

  def rf_model_builder(model_params, model_dir):
    return SklearnModel(RandomForestRegressor(**model_params), model_dir)

  optimizer = HyperparamOpt(rf_model_builder, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def _load_mol_dataset(dataset_file,
                      tasks,
                      split="stratified",
                      test_size=0.1,
                      valid_size=0.1,
                      min_size=0,
                      max_size=None,
                      **kwargs):
  """Featurize, split, balance and graph-transform a molecule CSV dataset.

  Returns a tuple of ([train, valid, test] MolDatasets, input feature size,
  output size).
  """
  train_size = 1.0 - (test_size + valid_size)
  loader = CSVLoader(tasks=tasks,
                     smiles_field="smiles",
                     featurizer=RawFeaturizer(),
                     verbose=False,
                     log_every_n=10000)
  dataset = loader.featurize(dataset_file)

  splitter_by_name = {
      'index': IndexSplitter(),
      'random': RandomSplitter(),
      'scaffold': ScaffoldSplitter(),
      'butina': ButinaSplitter(),
      'stratified': RandomStratifiedSplitter()
  }
  chosen_splitter = splitter_by_name[split]
  train, valid, test = chosen_splitter.train_valid_test_split(
      dataset,
      frac_train=train_size,
      frac_valid=valid_size,
      frac_test=test_size)

  # compute data balance information on train
  balancer = BalancingTransformer(transform_w=True, dataset=train)
  train = balancer.transform(train)
  valid = balancer.transform(valid)
  test = balancer.transform(test)

  transformer = GraphTransformer(mol_size=[min_size, max_size], **kwargs)
  datasets = []
  for subset in (train, valid, test):
    # `kept` indexes the molecules the transformer accepted; labels, weights
    # and raw molecules are filtered to match.
    X, kept = transformer(subset.ids, dtype=np.float32, ignore_errors=False)
    y = subset.y[kept, :]
    w = subset.w[kept, :]
    raw_mols = subset.X[kept]
    datasets.append(MolDataset(X, y, raw_mols, w=w, pad_to=max_size))

  # Sizes are taken from the last subset processed (test), matching the
  # original behavior.
  in_size = X[0][-1].shape[-1]
  out_size = 1 if len(y.shape) == 1 else y.shape[-1]
  return datasets, in_size, out_size
def test_multitask_order(self):
  """Test that order of tasks in multitask datasets is preserved."""
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  # Task ordering must survive featurization and splitting.
  assert train_dataset.get_task_names() == tasks
  assert test_dataset.get_task_names() == tasks
def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask RF ECFP regression API."""
  splittype = "scaffold"
  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  # NOTE(review): valid_dataset is used for the search below but only train
  # and test are transformed here — confirm this is intentional.
  for split_dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(split_dataset)

  params_dict = {
      "n_estimators": [10, 100],
      "max_features": ["auto"],
      "data_shape": train_dataset.get_data_shape()
  }
  metric = Metric(metrics.r2_score)

  optimizer = HyperparamOpt(rf_model_builder, tasks, task_types,
                            verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers, metric,
      logdir=None)
def partition_train_val_test(smiles, dataset):
  """Split a molecule dataset (SMILES) with deepchem built-ins.

  Args:
    smiles: sequence of SMILES strings wrapped into a MockDataset.
    dataset: dataset name selecting the splitter — "BBBP"/"BACE" use
      scaffold splitting, "TOX21" uses random splitting.

  Returns:
    Dict with "train_inds", "val_inds" and "test_inds" index lists.

  Raises:
    ValueError: if `dataset` is not one of the supported names (previously
      this fell through and raised an opaque NameError).
  """
  ds = MockDataset(smiles)
  if dataset in ("BBBP", "BACE"):
    splitter = ScaffoldSplitter()
  elif dataset == "TOX21":
    splitter = RandomSplitter()
  else:
    raise ValueError(f"Unknown dataset: {dataset!r}")
  train_inds, val_inds, test_inds = splitter.split(ds)
  return {
      "train_inds": train_inds,
      "val_inds": val_inds,
      "test_inds": test_inds
  }
def test_splits():
    """Compare MultitaskScaffoldSplitter (1 and 20 generations) to a plain scaffold split.

    Reads a three-target CSV, produces three splits, writes each to CSV,
    plots their statistics, and asserts that the genetic-algorithm split
    (more generations) increases the median train/test compound distance
    while keeping every subset non-empty.
    """
    clean()
    init_data()
    smiles_col = 'compound_id'
    id_col = 'compound_id'
    output_dir = 'plots'
    frac_train = 0.8
    frac_test = 0.1
    frac_valid = 0.1
    num_super_scaffolds = 40
    num_generations = 20
    dfw = 1  # chemical distance importance weight
    rfw = 1  # split fraction importance weight
    total_df = pd.read_csv('KCNA5_KCNH2_SCN5A_data.csv', dtype={id_col: str})
    response_cols = [
        'target_KCNA5_standard_value', 'target_KCNH2_standard_value',
        'target_SCN5A_activity'
    ]

    # -------------------------------------------------------------------------
    # one generation multitask scaffold split
    mss = MultitaskScaffoldSplitter()
    mss_split_df = split_with(total_df, mss, smiles_col=smiles_col,
                              id_col=id_col, response_cols=response_cols,
                              diff_fitness_weight=dfw, ratio_fitness_weight=rfw,
                              num_generations=1,
                              num_super_scaffolds=num_super_scaffolds,
                              frac_train=frac_train, frac_test=frac_test,
                              frac_valid=frac_valid)
    mss_split_df.to_csv('one_gen_split.csv', index=False)
    # No compounds should be lost or duplicated by the split.
    assert len(total_df) == len(mss_split_df)
    # Round-trip through CSV; ids must stay strings for the later joins.
    split_a = pd.read_csv('one_gen_split.csv', dtype={'cmpd_id': str})
    split_a_ss = SplitStats(total_df, split_a, smiles_col=smiles_col,
                            id_col=id_col, response_cols=response_cols)
    split_a_ss.make_all_plots(
        dist_path=os.path.join(output_dir, 'multitask_1gen'))

    # -------------------------------------------------------------------------
    # multiple generation mulittask scaffold split
    mss = MultitaskScaffoldSplitter()
    mss_split_df = split_with(total_df, mss, smiles_col=smiles_col,
                              id_col=id_col, response_cols=response_cols,
                              diff_fitness_weight=dfw, ratio_fitness_weight=rfw,
                              num_generations=num_generations,
                              num_super_scaffolds=num_super_scaffolds,
                              frac_train=frac_train, frac_test=frac_test,
                              frac_valid=frac_valid)
    mss_split_df.to_csv('twenty_gen_split.csv', index=False)
    assert len(total_df) == len(mss_split_df)
    split_b = pd.read_csv('twenty_gen_split.csv', dtype={'cmpd_id': str})
    split_b_ss = SplitStats(total_df, split_b, smiles_col=smiles_col,
                            id_col=id_col, response_cols=response_cols)
    split_b_ss.make_all_plots(
        dist_path=os.path.join(output_dir, f'multitask_{num_generations}gen'))

    # -------------------------------------------------------------------------
    # regular scaffold split (baseline for comparison)
    ss = ScaffoldSplitter()
    ss_split_df = split_with(total_df, ss, smiles_col=smiles_col,
                             id_col=id_col, response_cols=response_cols,
                             frac_train=frac_train, frac_test=frac_test,
                             frac_valid=frac_valid)
    ss_split_df.to_csv('ss_split.csv', index=False)
    assert len(total_df) == len(ss_split_df)
    split_c = pd.read_csv('ss_split.csv', dtype={'cmpd_id': str})
    split_c_ss = SplitStats(total_df, split_c, smiles_col=smiles_col,
                            id_col=id_col, response_cols=response_cols)
    split_c_ss.make_all_plots(dist_path=os.path.join(output_dir, 'scaffold_'))

    # median train/test compound distance should have gone up
    assert np.median(split_a_ss.dists) <= np.median(split_b_ss.dists)
    assert np.median(split_c_ss.dists) <= np.median(split_b_ss.dists)

    # no subset should contain 0 samples
    assert np.min(
        np.concatenate([
            split_a_ss.train_fracs, split_a_ss.valid_fracs,
            split_a_ss.test_fracs
        ])) > 0
    assert np.min(
        np.concatenate([
            split_b_ss.train_fracs, split_b_ss.valid_fracs,
            split_b_ss.test_fracs
        ])) > 0
    clean()
def test_singletask_tf_mlp_ECFP_classification_API(self):
    """Straightforward test of Tensorflow singletask deepchem classification API.

    Featurizes a classification CSV with ECFP fingerprints, scaffold-splits
    it, fits a single-task TF MLP, and evaluates it on train and test.
    """
    # Removed dead code: splittype, a duplicate task_type assignment, and
    # empty transformer lists that were immediately reassigned.
    tasks = ["outcome"]
    task_type = "classification"
    task_types = {task: task_type for task in tasks}
    featurizer = CircularFingerprint(size=1024)
    input_file = os.path.join(self.current_dir, "example_classification.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)
    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    # Fit transformer statistics on train only, then apply to both splits.
    # NOTE(review): normalizing y on a classification task looks suspect —
    # the labels are class ids, not continuous values; confirm intent.
    transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    for split in [train_dataset, test_dataset]:
        for transformer in transformers:
            transformer.transform(split)
    model_params = {
        "batch_size": 2,
        "num_classification_tasks": 1,
        "num_features": 1024,
        "layer_sizes": [1024],
        "weight_init_stddevs": [1.],
        "bias_init_consts": [0.],
        "dropouts": [.5],
        "num_classes": 2,
        "nb_epoch": 1,
        "penalty": 0.0,
        "optimizer": "adam",
        "learning_rate": .001,
        "data_shape": train_dataset.get_data_shape()
    }
    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]
    model = TensorflowModel(tasks, task_types, model_params, self.model_dir,
                            tf_class=TensorflowMultiTaskClassifier)
    # Fit trained model
    model.fit(train_dataset)
    model.save()
    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
def test_multitask_keras_mlp_ECFP_classification_API(self):
    """Straightforward test of Keras multitask deepchem classification API."""
    from deepchem.models.keras_models.fcnet import MultiTaskDNN
    task_type = "classification"
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    model_params = dict(nb_hidden=10,
                        activation="relu",
                        dropout=.5,
                        learning_rate=.01,
                        momentum=.9,
                        nesterov=False,
                        decay=1e-4,
                        batch_size=5,
                        nb_epoch=2,
                        init="glorot_uniform",
                        nb_layers=1,
                        batchnorm=False)
    input_path = os.path.join(self.current_dir, "multitask_example.csv")
    # Seventeen tasks: task0 .. task16, all classification.
    tasks = ["task%d" % idx for idx in range(17)]
    task_types = dict.fromkeys(tasks, task_type)
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=CircularFingerprint(size=1024),
                        verbosity="low")
    dataset = loader.featurize(input_path, self.data_dir)
    train_dataset, test_dataset = ScaffoldSplitter().train_test_split(
        dataset, self.train_dir, self.test_dir)
    transformers = []
    model_params["data_shape"] = train_dataset.get_data_shape()
    classification_metrics = [
        Metric(fn) for fn in (metrics.roc_auc_score,
                              metrics.matthews_corrcoef,
                              metrics.recall_score,
                              metrics.accuracy_score)
    ]
    model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir)
    # Fit trained model
    model.fit(train_dataset)
    model.save()
    # Eval model on train, then on test.
    for eval_dataset in (train_dataset, test_dataset):
        evaluator = Evaluator(model, eval_dataset, transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
def test_singletask_tf_mlp_ECFP_classification_API(self):
    """Straightforward test of Tensorflow singletask deepchem classification API.

    NOTE(review): this function appears to duplicate an earlier test of the
    same name in this file; if both live in one class, only the later
    definition runs — consider removing one copy.
    """
    # Removed dead code: splittype, a duplicate task_type assignment, and
    # empty transformer lists that were immediately reassigned.
    tasks = ["outcome"]
    task_type = "classification"
    task_types = {task: task_type for task in tasks}
    featurizer = CircularFingerprint(size=1024)
    input_file = os.path.join(self.current_dir, "example_classification.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)
    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    # Fit transformer statistics on train only, then apply to both splits.
    # NOTE(review): normalizing y on a classification task looks suspect —
    # the labels are class ids, not continuous values; confirm intent.
    transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    for split in [train_dataset, test_dataset]:
        for transformer in transformers:
            transformer.transform(split)
    model_params = {
        "batch_size": 2,
        "num_classification_tasks": 1,
        "num_features": 1024,
        "layer_sizes": [1024],
        "weight_init_stddevs": [1.],
        "bias_init_consts": [0.],
        "dropouts": [.5],
        "num_classes": 2,
        "nb_epoch": 1,
        "penalty": 0.0,
        "optimizer": "adam",
        "learning_rate": .001,
        "data_shape": train_dataset.get_data_shape()
    }
    classification_metrics = [
        Metric(metrics.roc_auc_score),
        Metric(metrics.matthews_corrcoef),
        Metric(metrics.recall_score),
        Metric(metrics.accuracy_score)
    ]
    model = TensorflowModel(tasks, task_types, model_params, self.model_dir,
                            tf_class=TensorflowMultiTaskClassifier)
    # Fit trained model
    model.fit(train_dataset)
    model.save()
    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)