def test_drop(self):
  """Test on dataset where RDKit fails on some strings."""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  len_full = 25
  current_dir = os.path.dirname(os.path.realpath(__file__))
  data_dir = os.path.join(self.base_dir, "dataset")
  model_dir = os.path.join(self.base_dir, "model")

  print("About to load emols dataset.")
  dataset_file = os.path.join(current_dir, "mini_emols.csv")

  # Featurize emols dataset
  print("About to featurize datasets.")
  featurizer = CircularFingerprint(size=1024)
  emols_tasks = ['activity']

  loader = DataLoader(tasks=emols_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir, debug=True, logging=False)

  X, y, w, ids = dataset.to_numpy()
  print("ids.shape, X.shape, y.shape, w.shape")
  print(ids.shape, X.shape, y.shape, w.shape)
  assert len(X) == len(y) == len(w) == len(ids)
def test_subset(self):
  """Tests that subsetting of datasets works."""
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))
  data_dir = os.path.join(self.base_dir, "dataset")
  subset_dir = os.path.join(self.base_dir, "subset")
  dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir, shard_size=2)

  shard_nums = [1, 2]

  orig_ids = dataset.get_ids()
  _, _, _, ids_1 = dataset.get_shard(1)
  _, _, _, ids_2 = dataset.get_shard(2)

  subset = dataset.subset(subset_dir, shard_nums)
  after_ids = dataset.get_ids()

  assert len(subset) == 4
  assert sorted(subset.get_ids()) == sorted(np.concatenate([ids_1, ids_2]))
  assert list(orig_ids) == list(after_ids)
def test_subset(self):
  """Tests that subsetting of datasets works."""
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))
  data_dir = os.path.join(self.base_dir, "dataset")
  subset_dir = os.path.join(self.base_dir, "subset")
  dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir, shard_size=2)

  shard_nums = [1, 2]

  orig_ids = dataset.ids
  _, _, _, ids_1 = dataset.get_shard(1)
  _, _, _, ids_2 = dataset.get_shard(2)

  subset = dataset.subset(subset_dir, shard_nums)
  after_ids = dataset.ids

  assert len(subset) == 4
  assert sorted(subset.ids) == sorted(np.concatenate([ids_1, ids_2]))
  assert list(orig_ids) == list(after_ids)
def test_samples_move(self):
  """Test that featurized samples can be moved and reloaded."""
  verbosity = "high"
  data_dir = os.path.join(self.base_dir, "data")
  moved_data_dir = os.path.join(self.base_dir, "moved_data")
  dataset_file = os.path.join(self.current_dir, "example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  featurized_dataset = loader.featurize(dataset_file, data_dir)
  n_dataset = len(featurized_dataset)

  # Now perform move
  shutil.move(data_dir, moved_data_dir)

  moved_featurized_dataset = Dataset(data_dir=moved_data_dir, reload=True)
  assert len(moved_featurized_dataset) == n_dataset
def random_test_train_valid_test_split(self):
  """Test of random train/valid/test split with RF ECFP regression setup."""
  input_transforms = []
  output_transforms = ["normalize"]
  model_params = {}
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example.csv")
  featurizer = CircularFingerprint(size=1024)

  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  # Splits featurized samples into train/valid/test
  splitter = RandomSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)
  assert len(train_dataset) == 8
  assert len(valid_dataset) == 1
  assert len(test_dataset) == 1
def test_move_load(self):
  """Test that datasets can be moved and loaded."""
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))
  data_dir = os.path.join(self.base_dir, "data")
  moved_data_dir = os.path.join(self.base_dir, "moved_data")
  dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir)
  X, y, w, ids = dataset.to_numpy()

  shutil.move(data_dir, moved_data_dir)
  # reload=True so the moved dataset is loaded from disk rather than recomputed
  moved_dataset = Dataset(moved_data_dir, reload=True)
  X_moved, y_moved, w_moved, ids_moved = moved_dataset.to_numpy()

  np.testing.assert_allclose(X, X_moved)
  np.testing.assert_allclose(y, y_moved)
  np.testing.assert_allclose(w, w_moved)
  np.testing.assert_array_equal(ids, ids_moved)
def random_test_train_valid_test_split_from_sdf(self):
  """Test of singletask CoulombMatrixEig regression on .sdf file."""
  splittype = "random"
  input_transforms = []
  output_transforms = ["normalize"]
  model_params = {}
  tasks = ["atomization_energy"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  current_dir = os.path.dirname(os.path.abspath(__file__))
  input_file = os.path.join(current_dir, "data/water.sdf")
  featurizer = CoulombMatrixEig(6, remove_hydrogens=False)

  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      mol_field="mol",
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  # Splits featurized samples into train/valid/test
  splitter = RandomSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)
  assert len(train_dataset) == 8
  assert len(valid_dataset) == 1
  assert len(test_dataset) == 1
def test_move_load(self):
  """Test that datasets can be moved and loaded."""
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))
  data_dir = os.path.join(self.base_dir, "data")
  moved_data_dir = os.path.join(self.base_dir, "moved_data")
  dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir)
  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)

  shutil.move(data_dir, moved_data_dir)
  moved_dataset = DiskDataset(moved_data_dir, reload=True)
  X_moved, y_moved, w_moved, ids_moved = (moved_dataset.X, moved_dataset.y,
                                          moved_dataset.w, moved_dataset.ids)

  np.testing.assert_allclose(X, X_moved)
  np.testing.assert_allclose(y, y_moved)
  np.testing.assert_allclose(w, w_moved)
  np.testing.assert_array_equal(ids, ids_moved)
def test_reshard_shuffle(self):
  """Test that datasets can be resharded and shuffled."""
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))
  data_dir = os.path.join(self.base_dir, "dataset")
  dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir, shard_size=2)

  X_orig, y_orig, w_orig, orig_ids = dataset.to_numpy()
  orig_len = len(dataset)

  dataset.reshard_shuffle(reshard_size=1)
  X_new, y_new, w_new, new_ids = dataset.to_numpy()

  assert len(dataset) == orig_len
  # The shuffling should have switched up the ordering
  assert not np.array_equal(orig_ids, new_ids)
  # But all the same entries should still be present
  assert sorted(orig_ids) == sorted(new_ids)
  # All the data should have same shape
  assert X_orig.shape == X_new.shape
  assert y_orig.shape == y_new.shape
  assert w_orig.shape == w_new.shape
def featurize_and_split(input_file, feature_dir, samples_dir, train_dir,
                        test_dir, splittype, feature_types, input_transforms,
                        output_transforms, tasks, feature_files=None):
  """Featurize inputs with NNScore and do train-test split."""
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      protein_pdb_field="protein_pdb",
                      ligand_pdb_field="ligand_pdb",
                      verbose=True)
  if feature_files is None:
    print("About to featurize.")
    samples = loader.featurize(input_file, feature_dir, samples_dir,
                               shard_size=8)
    print("Completed Featurization")
  else:
    # Transform data into arrays for ML
    samples = FeaturizedSamples(samples_dir, feature_files, reload_data=False)

  # Split into train/test
  train_samples, test_samples = samples.train_test_split(
      splittype, train_dir, test_dir)
  print("Finished train test split.")

  train_dataset = Dataset(train_dir, train_samples, feature_types)
  test_dataset = Dataset(test_dir, test_samples, feature_types)
  print("Finished creating train test datasets")

  # Transforming train/test data
  train_dataset.transform(input_transforms, output_transforms)
  test_dataset.transform(input_transforms, output_transforms)
  print("Finished Transforming train test data.")

  return train_dataset, test_dataset
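# A minimal usage sketch for featurize_and_split. All paths, the feature-type
# list, and the task name below are hypothetical placeholders for illustration,
# not values taken from this codebase.
train_dataset, test_dataset = featurize_and_split(
    input_file="pdbbind_subset.csv",   # hypothetical input CSV
    feature_dir="/tmp/features",
    samples_dir="/tmp/samples",
    train_dir="/tmp/train",
    test_dir="/tmp/test",
    splittype="random",
    feature_types=["user_specified"],  # hypothetical feature types
    input_transforms=[],
    output_transforms=["normalize"],
    tasks=["label"])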
def test_drop(self):
  """Test on dataset where RDKit fails on some strings."""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  len_full = 25
  current_dir = os.path.dirname(os.path.realpath(__file__))
  data_dir = os.path.join(self.base_dir, "dataset")
  model_dir = os.path.join(self.base_dir, "model")

  print("About to load emols dataset.")
  dataset_file = os.path.join(current_dir, "mini_emols.csv")

  # Featurize emols dataset
  print("About to featurize datasets.")
  featurizer = CircularFingerprint(size=1024)
  emols_tasks = ['activity']

  loader = DataLoader(tasks=emols_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir, debug=True, logging=False)

  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  print("ids.shape, X.shape, y.shape, w.shape")
  print(ids.shape, X.shape, y.shape, w.shape)
  assert len(X) == len(y) == len(w) == len(ids)
def test_graph_conv_singletask_classification_overfit(self):
  """Test graph-conv singletask classification overfits tiny data."""
  g = tf.Graph()
  sess = tf.Session(graph=g)
  K.set_session(sess)
  with g.as_default():
    n_tasks = 1
    n_samples = 10
    n_features = 3
    n_classes = 2

    # Load mini log-solubility dataset.
    splittype = "scaffold"
    featurizer = ConvMolFeaturizer()
    tasks = ["outcome"]
    task_type = "classification"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example_classification.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    verbosity = "high"
    classification_metric = Metric(metrics.accuracy_score,
                                   verbosity=verbosity)

    #n_atoms = 50
    n_feat = 71
    batch_size = 10
    graph_model = SequentialGraphModel(n_feat)
    graph_model.add(GraphConv(64, activation='relu'))
    graph_model.add(BatchNormalization(epsilon=1e-5, mode=1))
    graph_model.add(GraphPool())
    # Gather Projection
    graph_model.add(Dense(128, activation='relu'))
    graph_model.add(BatchNormalization(epsilon=1e-5, mode=1))
    graph_model.add(GraphGather(batch_size, activation="tanh"))

    with self.test_session() as sess:
      model = MultitaskGraphClassifier(
          sess, graph_model, n_tasks, self.model_dir, batch_size=batch_size,
          learning_rate=1e-3, learning_rate_decay_time=1000,
          optimizer_type="adam", beta1=.9, beta2=.999, verbosity="high")

      # Fit trained model
      model.fit(dataset, nb_epoch=20)
      model.save()

      # Eval model on train
      transformers = []
      evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
      scores = evaluator.compute_model_performance([classification_metric])

      print("scores")
      print(scores)
      assert scores[classification_metric.name] > .85
def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of Tensorflow multitask deepchem classification API."""
  splittype = "scaffold"
  task_type = "classification"

  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  task_types = {task: task_type for task in tasks}

  featurizer = CircularFingerprint(size=1024)

  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  params_dict = {"activation": ["relu"],
                 "momentum": [.9],
                 "batch_size": [50],
                 "init": ["glorot_uniform"],
                 "data_shape": [train_dataset.get_data_shape()],
                 "learning_rate": [1e-3],
                 "decay": [1e-6],
                 "nb_hidden": [1000],
                 "nb_epoch": [1],
                 "nesterov": [False],
                 "dropouts": [(.5,)],
                 "nb_layers": [1],
                 "batchnorm": [False],
                 "layer_sizes": [(1000,)],
                 "weight_init_stddevs": [(.1,)],
                 "bias_init_consts": [(1.,)],
                 "num_classes": [2],
                 "penalty": [0.],
                 "optimizer": ["sgd"],
                 "num_classification_tasks": [len(task_types)]}

  def model_builder(tasks, task_types, params_dict, logdir, verbosity=None):
    return TensorflowModel(tasks, task_types, params_dict, logdir,
                           tf_class=TensorflowMultiTaskClassifier,
                           verbosity=verbosity)

  optimizer = HyperparamOpt(model_builder, tasks, task_types, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_singletask_sklearn_rf_ECFP_regression_API(self):
  """Test of singletask RF ECFP regression API."""
  splittype = "scaffold"
  featurizer = CircularFingerprint(size=1024)
  model_params = {}
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  model_params["data_shape"] = train_dataset.get_data_shape()
  regression_metrics = [Metric(metrics.r2_score),
                        Metric(metrics.mean_squared_error),
                        Metric(metrics.mean_absolute_error)]

  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="regression",
                       model_instance=RandomForestRegressor())

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)
def load_muv(base_dir, reload=True):
  """Load MUV datasets. Does not do train/test split"""
  # Set some global variables up top
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load MUV dataset
  print("About to load MUV dataset.")
  dataset_file = os.path.join(current_dir, "../../datasets/muv.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize MUV dataset
  print("About to featurize MUV dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859',
                          'MUV-644', 'MUV-548', 'MUV-852', 'MUV-600',
                          'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858',
                          'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466',
                          'MUV-832'])

  loader = DataLoader(tasks=all_MUV_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  return all_MUV_tasks, dataset, transformers
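# A hedged usage sketch for load_muv: the temporary base directory is an
# illustrative assumption; the (tasks, dataset, transformers) return order
# follows the function above.
import tempfile

muv_base_dir = tempfile.mkdtemp()
all_MUV_tasks, muv_dataset, muv_transformers = load_muv(
    muv_base_dir, reload=False)
print("Loaded %d MUV tasks" % len(all_MUV_tasks))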
def test_multitask_keras_mlp_ECFP_classification_API(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  g = tf.Graph()
  sess = tf.Session(graph=g)
  K.set_session(sess)
  with g.as_default():
    task_type = "classification"
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    # TODO(rbharath): Turning off dropout to make tests behave.
    model_params = {"nb_hidden": 10, "activation": "relu",
                    "dropout": .0, "learning_rate": .01,
                    "momentum": .9, "nesterov": False,
                    "decay": 1e-4, "batch_size": 5,
                    "nb_epoch": 2, "init": "glorot_uniform",
                    "nb_layers": 1, "batchnorm": False}

    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    featurizer = CircularFingerprint(size=1024)

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    transformers = []
    model_params["data_shape"] = train_dataset.get_data_shape()
    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]

    model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
def load_tox21(base_dir, reload=True):
  """Load Tox21 datasets. Does not do train/test split"""
  # Set some global variables up top
  verbosity = "high"
  model = "logistic"

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  samples_dir = os.path.join(base_dir, "samples")
  data_dir = os.path.join(base_dir, "dataset")

  # Load Tox21 dataset
  print("About to load Tox21 dataset.")
  dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
                     'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
                     'SR-HSE', 'SR-MMP', 'SR-p53']

  if not reload or not os.path.exists(data_dir):
    loader = DataLoader(tasks=all_tox21_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir, shard_size=8192)
  else:
    dataset = Dataset(data_dir, all_tox21_tasks, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if not reload:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  return all_tox21_tasks, dataset, transformers
def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
  """Test of singletask RF RDKIT-descriptor regression API."""
  splittype = "scaffold"
  featurizer = RDKitDescriptors()
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  model_params = {}
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  input_transformers = [
      NormalizationTransformer(transform_X=True, dataset=train_dataset),
      ClippingTransformer(transform_X=True, dataset=train_dataset)]
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  model_params["data_shape"] = train_dataset.get_data_shape()
  regression_metrics = [Metric(metrics.r2_score),
                        Metric(metrics.mean_squared_error),
                        Metric(metrics.mean_absolute_error)]

  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="regression",
                       model_instance=RandomForestRegressor())

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)
def load_feat_multitask_data(self):
  """Load example with numerical features, tasks."""
  if os.path.exists(self.data_dir):
    shutil.rmtree(self.data_dir)
  features = ["feat0", "feat1", "feat2", "feat3", "feat4", "feat5"]
  featurizer = UserDefinedFeaturizer(features)
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5"]
  input_file = os.path.join(
      self.current_dir, "../../models/tests/feat_multitask_example.csv")
  loader = DataLoader(tasks=tasks,
                      featurizer=featurizer,
                      id_field="id",
                      verbosity="low")
  return loader.featurize(input_file, self.data_dir)
def load_classification_data(self):
  """Loads classification data from example.csv"""
  if os.path.exists(self.data_dir):
    shutil.rmtree(self.data_dir)
  featurizer = CircularFingerprint(size=1024)
  tasks = ["outcome"]
  task_type = "classification"
  input_file = os.path.join(
      self.current_dir, "../../models/tests/example_classification.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  return loader.featurize(input_file, self.data_dir)
def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None):
  """Loads or reloads a small version of MUV dataset."""
  # Load MUV dataset
  raw_dataset = load_from_disk(dataset_file)
  print("Number of examples in dataset: %s" % str(raw_dataset.shape[0]))

  print("About to featurize compounds")
  featurizer = CircularFingerprint(size=1024)
  MUV_tasks = ['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
               'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
               'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
               'MUV-466', 'MUV-832']
  loader = DataLoader(tasks=MUV_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, self.data_dir)
  assert len(dataset) == len(raw_dataset)

  print("About to split compounds into train/valid/test")
  splitter = ScaffoldSplitter(verbosity=verbosity)
  frac_train, frac_valid, frac_test = .8, .1, .1
  train_dataset, valid_dataset, test_dataset = \
      splitter.train_valid_test_split(
          dataset, self.train_dir, self.valid_dir, self.test_dir,
          log_every_n=1000, frac_train=frac_train,
          frac_test=frac_test, frac_valid=frac_valid)
  # Do an approximate comparison since splits are sometimes slightly off from
  # the exact fraction.
  assert relative_difference(
      len(train_dataset), frac_train * len(dataset)) < 1e-3
  assert relative_difference(
      len(valid_dataset), frac_valid * len(dataset)) < 1e-3
  assert relative_difference(
      len(test_dataset), frac_test * len(dataset)) < 1e-3

  # TODO(rbharath): Transformers don't play nice with reload! Namely,
  # reloading will cause the transform to be reapplied. This is undesirable in
  # almost all cases. Need to understand a method to fix this.
  transformers = [
      BalancingTransformer(transform_w=True, dataset=train_dataset)]
  print("Transforming datasets")
  for dataset in [train_dataset, valid_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  return (len(train_dataset), len(valid_dataset), len(test_dataset))
def test_singletask_sklearn_rf_ECFP_regression_sharded_API(self):
  """Test of singletask RF ECFP regression API: sharded edition."""
  splittype = "scaffold"
  featurizer = CircularFingerprint(size=1024)
  model_params = {}
  tasks = ["label"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(
      self.current_dir, "../../../datasets/pdbbind_core_df.pkl.gz")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  # pdbbind_core has ~200 examples.
  model_params["data_shape"] = train_dataset.get_data_shape()
  regression_metrics = [Metric(metrics.r2_score),
                        Metric(metrics.mean_squared_error),
                        Metric(metrics.mean_absolute_error)]

  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="regression",
                       model_instance=RandomForestRegressor())

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)
def test_log_solubility_dataset(self):
  """Test of loading for simple log-solubility dataset."""
  current_dir = os.path.dirname(os.path.realpath(__file__))
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")

  tasks = ["log-solubility"]
  smiles_field = "smiles"
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=CircularFingerprint(size=1024),
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)
  assert len(dataset) == 10
def load_sparse_multitask_dataset(self):
  """Load sparse tox multitask data, sample dataset."""
  if os.path.exists(self.data_dir):
    shutil.rmtree(self.data_dir)
  featurizer = CircularFingerprint(size=1024)
  tasks = ["task1", "task2", "task3", "task4", "task5", "task6", "task7",
           "task8", "task9"]
  input_file = os.path.join(
      self.current_dir, "../../models/tests/sparse_multitask_example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity="low")
  return loader.featurize(input_file, self.data_dir)
def load_gaussian_cdf_data(self):
  """Load example with numbers sampled from Gaussian normal distribution.

  Each feature and task is a column of values that is sampled from a normal
  distribution of mean 0, stdev 1.
  """
  if os.path.exists(self.data_dir):
    shutil.rmtree(self.data_dir)
  features = ["feat0", "feat1"]
  featurizer = UserDefinedFeaturizer(features)
  tasks = ["task0", "task1"]
  input_file = os.path.join(
      self.current_dir, "../../models/tests/gaussian_cdf_example.csv")
  loader = DataLoader(tasks=tasks,
                      featurizer=featurizer,
                      id_field="id",
                      verbosity=None)
  return loader.featurize(input_file, self.data_dir)
def load_multitask_data(self):
  """Load example multitask data."""
  if os.path.exists(self.data_dir):
    shutil.rmtree(self.data_dir)
  featurizer = CircularFingerprint(size=1024)
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  input_file = os.path.join(
      self.current_dir, "../../models/tests/multitask_example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  return loader.featurize(input_file, self.data_dir)
def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
  """Test of singletask RF RDKIT-descriptor regression API."""
  splittype = "scaffold"
  featurizer = RDKitDescriptors()
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  input_transformers = [
      NormalizationTransformer(transform_X=True, dataset=train_dataset),
      ClippingTransformer(transform_X=True, dataset=train_dataset)]
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  regression_metrics = [Metric(metrics.r2_score),
                        Metric(metrics.mean_squared_error),
                        Metric(metrics.mean_absolute_error)]

  sklearn_model = RandomForestRegressor()
  model = SklearnModel(sklearn_model, self.model_dir)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)
def test_multitask_order(self):
  """Test that order of tasks in multitask datasets is preserved."""
  from deepchem.models.keras_models.fcnet import MultiTaskDNN
  splittype = "scaffold"
  output_transformers = []
  input_transformers = []
  task_type = "classification"
  # TODO(rbharath): There should be some automatic check to ensure that all
  # required model_params are specified.
  model_params = {"nb_hidden": 10, "activation": "relu",
                  "dropout": .5, "learning_rate": .01,
                  "momentum": .9, "nesterov": False,
                  "decay": 1e-4, "batch_size": 5,
                  "nb_epoch": 2, "init": "glorot_uniform",
                  "nb_layers": 1, "batchnorm": False}

  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  task_types = {task: task_type for task in tasks}

  featurizer = CircularFingerprint(size=1024)

  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  assert train_dataset.get_task_names() == tasks
  assert test_dataset.get_task_names() == tasks
def test_singletask_tf_mlp_ECFP_classification_API(self):
  """Straightforward test of Tensorflow singletask deepchem classification API."""
  n_features = 1024
  featurizer = CircularFingerprint(size=n_features)
  tasks = ["outcome"]
  input_file = os.path.join(self.current_dir, "example_classification.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  classification_metrics = [Metric(metrics.roc_auc_score),
                            Metric(metrics.matthews_corrcoef),
                            Metric(metrics.recall_score),
                            Metric(metrics.accuracy_score)]

  tensorflow_model = TensorflowMultiTaskClassifier(
      len(tasks), n_features, self.model_dir)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)
def test_multitask_keras_mlp_ECFP_classification_API(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  g = tf.Graph()
  sess = tf.Session(graph=g)
  K.set_session(sess)
  with g.as_default():
    task_type = "classification"
    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]

    n_features = 1024
    featurizer = CircularFingerprint(size=n_features)
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    transformers = []
    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]

    keras_model = MultiTaskDNN(len(tasks), n_features, "classification",
                               dropout=0.)
    model = KerasModel(keras_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  task_type = "classification"
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]

  n_features = 1024
  featurizer = CircularFingerprint(size=n_features)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  params_dict = {"n_hidden": [5, 10]}

  def model_builder(model_params, model_dir):
    keras_model = MultiTaskDNN(len(tasks), n_features, task_type,
                               dropout=0., **model_params)
    return KerasModel(keras_model, model_dir)

  optimizer = HyperparamOpt(model_builder, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_merge(self):
  """Test that datasets can be merged."""
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))
  first_data_dir = os.path.join(self.base_dir, "first_dataset")
  second_data_dir = os.path.join(self.base_dir, "second_dataset")
  merged_data_dir = os.path.join(self.base_dir, "merged_data")
  dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  first_dataset = loader.featurize(dataset_file, first_data_dir)
  second_dataset = loader.featurize(dataset_file, second_data_dir)

  merged_dataset = Dataset.merge(merged_data_dir,
                                 [first_dataset, second_dataset])
  assert len(merged_dataset) == len(first_dataset) + len(second_dataset)
def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  task_type = "classification"
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  task_types = {task: task_type for task in tasks}

  featurizer = CircularFingerprint(size=1024)

  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  params_dict = {"nb_hidden": [5, 10],
                 "activation": ["relu"],
                 "dropout": [.5],
                 "learning_rate": [.01],
                 "momentum": [.9],
                 "nesterov": [False],
                 "decay": [1e-4],
                 "batch_size": [5],
                 "nb_epoch": [2],
                 "init": ["glorot_uniform"],
                 "nb_layers": [1],
                 "batchnorm": [False],
                 "data_shape": [train_dataset.get_data_shape()]}

  optimizer = HyperparamOpt(MultiTaskDNN, tasks, task_types, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_multitask_order(self):
  """Test that order of tasks in multitask datasets is preserved."""
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  assert train_dataset.get_task_names() == tasks
  assert test_dataset.get_task_names() == tasks
def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask RF ECFP regression API."""
  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  params_dict = {"n_estimators": [10, 100]}
  metric = Metric(metrics.r2_score)

  def rf_model_builder(model_params, model_dir):
    sklearn_model = RandomForestRegressor(**model_params)
    return SklearnModel(sklearn_model, model_dir)

  optimizer = HyperparamOpt(rf_model_builder, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
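# A hedged post-processing sketch for the hyperparameter search above. Treating
# all_results as a mapping from hyperparameter settings to validation scores is
# an assumption based on how the tuple is unpacked above; printing it lets the
# search be audited by hand.
for hyperparams, score in all_results.items():
  print(hyperparams, score)
print("Best hyperparameters found:")
print(best_hyperparams)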
def test_merge(self):
  """Test that datasets can be merged."""
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))
  first_data_dir = os.path.join(self.base_dir, "first_dataset")
  second_data_dir = os.path.join(self.base_dir, "second_dataset")
  merged_data_dir = os.path.join(self.base_dir, "merged_data")
  dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  first_dataset = loader.featurize(dataset_file, first_data_dir)
  second_dataset = loader.featurize(dataset_file, second_data_dir)

  merged_dataset = DiskDataset.merge(merged_data_dir,
                                     [first_dataset, second_dataset])
  assert len(merged_dataset) == len(first_dataset) + len(second_dataset)
def load_solubility_data(self):
  """Loads solubility data from example.csv"""
  if os.path.exists(self.data_dir):
    shutil.rmtree(self.data_dir)
  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  task_type = "regression"
  input_file = os.path.join(self.current_dir,
                            "../../models/tests/example.csv")
  # Use a distinct name for the loader so it doesn't shadow the featurizer.
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  return loader.featurize(input_file, self.data_dir)
def load_sweet(base_dir, reload=True, frac_train=.8):
  """Load sweet datasets. Does not do train/test split"""
  # Set some global variables up top
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load SWEET dataset
  print("About to load SWEET dataset.")
  dataset_file = os.path.join(current_dir, "./sweet.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize SWEET dataset
  print("About to featurize SWEET dataset.")
  featurizer = CircularFingerprint(size=1024)
  SWEET_tasks = dataset.columns.values[1:].tolist()

  loader = DataLoader(tasks=SWEET_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 17
  # Cast to int so the value can be used as a slice index.
  num_train = int(frac_train * len(dataset))
  SWEET_tasks = SWEET_tasks[:num_tasks]
  print("Using following tasks")
  print(SWEET_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                         w_train, ids_train, SWEET_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, SWEET_tasks)
  return SWEET_tasks, (train_dataset, valid_dataset), transformers
def test_multiload(self):
  """Check can re-use featurization for multiple task selections.

  TODO(rbharath): This test seems silly after the recent round of
  refactoring. Can it be removed?
  """
  # Only for debug!
  np.random.seed(123)

  # Set some global variables up top
  reload = True
  verbosity = "high"

  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(self.base_dir, "dataset")
  train_dir = os.path.join(self.base_dir, "train_dataset")
  valid_dir = os.path.join(self.base_dir, "valid_dataset")
  test_dir = os.path.join(self.base_dir, "test_dataset")
  model_dir = os.path.join(self.base_dir, "model")

  # Load dataset
  print("About to load dataset.")
  dataset_file = os.path.join(
      current_dir, "../../models/tests/multitask_example.csv")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize tox21 dataset
  print("About to featurize dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_tasks = ["task%d" % i for i in range(17)]

  # Do featurization
  loader = DataLoader(tasks=all_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir)

  # Do train/valid split.
  X_multi, y_multi, w_multi, ids_multi = dataset.to_numpy()

  # Do singletask load
  y_tasks, w_tasks = [], []
  for ind, task in enumerate(all_tasks):
    print("Processing task %s" % task)
    dataset = Dataset(data_dir, verbosity=verbosity, reload=reload)
    X_task, y_task, w_task, ids_task = dataset.to_numpy()
    y_tasks.append(y_task[:, ind])
    w_tasks.append(w_task[:, ind])

  # Do comparison
  for ind, task in enumerate(all_tasks):
    y_multi_task = y_multi[:, ind]
    w_multi_task = w_multi[:, ind]
    y_task = y_tasks[ind]
    w_task = w_tasks[ind]
    np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
    np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())
def test_singletask_tf_mlp_ECFP_classification_API(self):
  """Straightforward test of Tensorflow singletask deepchem classification API."""
  splittype = "scaffold"
  task_type = "classification"
  featurizer = CircularFingerprint(size=1024)
  tasks = ["outcome"]
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example_classification.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  model_params = {
      "batch_size": 2,
      "num_classification_tasks": 1,
      "num_features": 1024,
      "layer_sizes": [1024],
      "weight_init_stddevs": [1.],
      "bias_init_consts": [0.],
      "dropouts": [.5],
      "num_classes": 2,
      "nb_epoch": 1,
      "penalty": 0.0,
      "optimizer": "adam",
      "learning_rate": .001,
      "data_shape": train_dataset.get_data_shape()
  }
  classification_metrics = [Metric(metrics.roc_auc_score),
                            Metric(metrics.matthews_corrcoef),
                            Metric(metrics.recall_score),
                            Metric(metrics.accuracy_score)]

  model = TensorflowModel(tasks, task_types, model_params, self.model_dir,
                          tf_class=TensorflowMultiTaskClassifier)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)
def load_nci(base_dir, reload=True, force_transform=False):
  """Load NCI datasets. Does not do train/test split"""
  # Set some global variables up top
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      print("Deleting dir in nci_datasets.py")
      print(base_dir)
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load nci dataset
  print("About to load NCI dataset.")
  dataset_file1_path = os.path.join(current_dir,
                                    "../../datasets/nci_1.csv.gz")
  dataset_file2_path = os.path.join(current_dir,
                                    "../../datasets/nci_2.csv.gz")
  dataset_paths = [dataset_file1_path, dataset_file2_path]
  dataset = load_sharded_csv(dataset_paths)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize nci dataset
  print("About to featurize nci dataset.")
  featurizer = CircularFingerprint(size=1024)
  # This was a sorted list originally in muv_datasets.py, but the csv is
  # ordered, so the sort was removed.
  all_nci_tasks = (['CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226',
                    'SR', 'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226',
                    'NCI-H23', 'NCI-H322M', 'NCI-H460', 'NCI-H522',
                    'COLO 205', 'HCC-2998', 'HCT-116', 'HCT-15', 'HT29',
                    'KM12', 'SW-620', 'SF-268', 'SF-295', 'SF-539', 'SNB-19',
                    'SNB-75', 'U251', 'LOX IMVI', 'MALME-3M', 'M14',
                    'MDA-MB-435', 'SK-MEL-2', 'SK-MEL-28', 'SK-MEL-5',
                    'UACC-257', 'UACC-62', 'IGR-OV1', 'OVCAR-3', 'OVCAR-4',
                    'OVCAR-5', 'OVCAR-8', 'NCI/ADR-RES', 'SK-OV-3', '786-0',
                    'A498', 'ACHN', 'CAKI-1', 'RXF 393', 'SN12C', 'TK-10',
                    'UO-31', 'PC-3', 'DU-145', 'MCF7', 'MDA-MB-231/ATCC',
                    'MDA-MB-468', 'HS 578T', 'BT-549', 'T-47D'])

  loader = DataLoader(tasks=all_nci_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_paths, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = []
  if regen or force_transform:
    print("About to transform data")
    transformers = [
        NormalizationTransformer(transform_y=True, dataset=dataset)]
    for transformer in transformers:
      transformer.transform(dataset)

  return all_nci_tasks, dataset, transformers
def load_bace(mode="regression", transform=True, split="20-80"):
  """Load BACE-1 dataset as regression/classification problem."""
  reload = True
  verbosity = "high"
  regen = False
  assert split in ["20-80", "80-20"]

  current_dir = os.path.dirname(os.path.realpath(__file__))
  if split == "20-80":
    dataset_file = os.path.join(
        current_dir, "../../datasets/desc_canvas_aug30.csv")
  elif split == "80-20":
    dataset_file = os.path.join(
        current_dir, "../../datasets/rev8020split_desc.csv")
  dataset = load_from_disk(dataset_file)
  num_display = 10
  pretty_columns = (
      "[" + ",".join(["'%s'" % column
                      for column in dataset.columns.values[:num_display]])
      + ",...]")

  crystal_dataset_file = os.path.join(
      current_dir, "../../datasets/crystal_desc_canvas_aug30.csv")
  crystal_dataset = load_from_disk(crystal_dataset_file)

  print("Columns of dataset: %s" % pretty_columns)
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))
  print("Number of examples in crystal dataset: %s"
        % str(crystal_dataset.shape[0]))

  # Make directories to store the raw and featurized datasets.
  base_dir = tempfile.mkdtemp()
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")
  test_dir = os.path.join(base_dir, "test_dataset")
  model_dir = os.path.join(base_dir, "model")
  crystal_dir = os.path.join(base_dir, "crystal")

  if mode == "regression":
    bace_tasks = ["pIC50"]
  elif mode == "classification":
    bace_tasks = ["Class"]
  else:
    raise ValueError("Unknown mode %s" % mode)
  featurizer = UserDefinedFeaturizer(user_specified_features)
  loader = DataLoader(tasks=bace_tasks,
                      smiles_field="mol",
                      id_field="CID",
                      featurizer=featurizer)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
  if not reload or not os.path.exists(crystal_dir):
    crystal_dataset = loader.featurize(crystal_dataset_file, crystal_dir)
  else:
    crystal_dataset = Dataset(crystal_dir, reload=True)

  if (not reload or not os.path.exists(train_dir)
      or not os.path.exists(valid_dir) or not os.path.exists(test_dir)):
    regen = True
    splitter = SpecifiedSplitter(dataset_file, "Model", verbosity=verbosity)
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, train_dir, valid_dir, test_dir)
  else:
    train_dataset = Dataset(train_dir, reload=True)
    valid_dataset = Dataset(valid_dir, reload=True)
    test_dataset = Dataset(test_dir, reload=True)

  # NOTE THE RENAMING: for the 20-80 split the valid and test roles are
  # deliberately swapped.
  if split == "20-80":
    valid_dataset, test_dataset = test_dataset, valid_dataset

  print("Number of compounds in train set")
  print(len(train_dataset))
  print("Number of compounds in validation set")
  print(len(valid_dataset))
  print("Number of compounds in test set")
  print(len(test_dataset))
  print("Number of compounds in crystal set")
  print(len(crystal_dataset))

  if transform and regen:
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    if mode == "regression":
      output_transformers = [
          NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    else:
      output_transformers = []
  else:
    input_transformers, output_transformers = [], []

  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, valid_dataset, test_dataset, crystal_dataset]:
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  return (bace_tasks, train_dataset, valid_dataset, test_dataset,
          crystal_dataset, output_transformers)
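# Usage sketch for load_bace. A minimal example, not part of the original
# module; the tuple unpacking mirrors the return signature above.
#
#   (bace_tasks, train_dataset, valid_dataset, test_dataset,
#    crystal_dataset, output_transformers) = load_bace(
#        mode="classification", transform=True, split="20-80")
#   print("Train/valid/test: %d/%d/%d compounds"
#         % (len(train_dataset), len(valid_dataset), len(test_dataset)))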
def test_singletask_matches_multitask_load(self):
  """Check that singletask load and multitask load of dataset are same."""
  # Fix the seed for reproducibility (only for debugging!)
  np.random.seed(123)
  # Set some global variables up top
  reload = True
  verbosity = "high"
  base_dir = tempfile.mkdtemp()
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")
  test_dir = os.path.join(base_dir, "test_dataset")
  model_dir = os.path.join(base_dir, "model")

  # Load dataset
  print("About to load dataset.")
  dataset_file = os.path.join(
      current_dir, "../../models/tests/multitask_example.csv")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize multitask dataset
  print("About to featurize dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_tasks = ["task%d" % i for i in range(17)]
  # For debugging purposes
  n_tasks = 17
  tasks = all_tasks[0:n_tasks]

  # Do multitask load
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir)
  X_multi, y_multi, w_multi, ids_multi = dataset.to_numpy()

  # Do singletask loads, one task at a time
  y_tasks, w_tasks, ids_tasks = [], [], []
  for task in tasks:
    print("Processing task %s" % task)
    if os.path.exists(data_dir):
      shutil.rmtree(data_dir)
    loader = DataLoader(tasks=[task],
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir)
    X_task, y_task, w_task, ids_task = dataset.to_numpy()
    y_tasks.append(y_task)
    w_tasks.append(w_task)
    ids_tasks.append(ids_task)

  # Compare each multitask column against the corresponding singletask load
  for ind, task in enumerate(tasks):
    y_multi_task = y_multi[:, ind]
    w_multi_task = w_multi[:, ind]
    y_task = y_tasks[ind]
    w_task = w_tasks[ind]
    ids_task = ids_tasks[ind]
    np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
    np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())
  shutil.rmtree(base_dir)
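# The invariant the test above checks can be stated without any deepchem
# machinery: column `ind` of the multitask label (or weight) matrix must equal
# the singletask load of task `ind`. A minimal pure-numpy sketch of that
# check; the helper name is hypothetical and not part of the original code.
import numpy as np

def assert_task_columns_match(multi, singles):
  """multi: (n_samples, n_tasks) array; singles: list of per-task arrays."""
  for ind, single in enumerate(singles):
    np.testing.assert_allclose(multi[:, ind].flatten(),
                               np.asarray(single).flatten())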
featurized_samples_file = os.path.join(data_dir, "featurized_samples.joblib")
feature_dir = os.path.join(base_dir, "features")
if not os.path.exists(feature_dir):
  os.makedirs(feature_dir)
samples_dir = os.path.join(base_dir, "samples")
if not os.path.exists(samples_dir):
  os.makedirs(samples_dir)
loader = DataLoader(tasks=["label"],
                    smiles_field="smiles",
                    protein_pdb_field="protein_pdb",
                    ligand_pdb_field="ligand_pdb",
                    compound_featurizers=compound_featurizers,
                    complex_featurizers=complex_featurizers,
                    id_field="complex_id",
                    verbose=False)

# Stand up an ipyparallel worker pool and featurize across it.
from ipyparallel import Client
c = Client()
print("c.ids")
print(c.ids)
dview = c[:]
featurized_samples = loader.featurize(dataset_file,
                                      feature_dir,
                                      samples_dir,
                                      worker_pool=dview,
                                      shard_size=1024)
save_to_disk(featurized_samples, featurized_samples_file)
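# A note on the worker pool handed to featurize() above: an ipyparallel
# DirectView exposes map()-style scattering of work across engines. A minimal
# sketch of standing up such a pool, assuming a cluster was started separately
# (e.g. `ipcluster start -n 4`); the squaring job is only a placeholder to
# show the pool is alive.
#
#   from ipyparallel import Client
#   client = Client()              # connect to the running cluster
#   dview = client[:]              # DirectView over all engines
#   print(dview.map_sync(lambda x: x * x, range(8)))  # [0, 1, 4, 9, ...]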
def load_pcba(base_dir, reload=True):
  """Load PCBA datasets. Does not do train/test split."""
  # Set some global variables up top
  verbosity = "high"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/pcba.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_PCBA_tasks = [
      'PCBA-1030', 'PCBA-1379', 'PCBA-1452', 'PCBA-1454', 'PCBA-1457',
      'PCBA-1458', 'PCBA-1460', 'PCBA-1461', 'PCBA-1468', 'PCBA-1469',
      'PCBA-1471', 'PCBA-1479', 'PCBA-1631', 'PCBA-1634', 'PCBA-1688',
      'PCBA-1721', 'PCBA-2100', 'PCBA-2101', 'PCBA-2147', 'PCBA-2242',
      'PCBA-2326', 'PCBA-2451', 'PCBA-2517', 'PCBA-2528', 'PCBA-2546',
      'PCBA-2549', 'PCBA-2551', 'PCBA-2662', 'PCBA-2675', 'PCBA-2676',
      'PCBA-411', 'PCBA-463254', 'PCBA-485281', 'PCBA-485290', 'PCBA-485294',
      'PCBA-485297', 'PCBA-485313', 'PCBA-485314', 'PCBA-485341', 'PCBA-485349',
      'PCBA-485353', 'PCBA-485360', 'PCBA-485364', 'PCBA-485367', 'PCBA-492947',
      'PCBA-493208', 'PCBA-504327', 'PCBA-504332', 'PCBA-504333', 'PCBA-504339',
      'PCBA-504444', 'PCBA-504466', 'PCBA-504467', 'PCBA-504706', 'PCBA-504842',
      'PCBA-504845', 'PCBA-504847', 'PCBA-504891', 'PCBA-540276', 'PCBA-540317',
      'PCBA-588342', 'PCBA-588453', 'PCBA-588456', 'PCBA-588579', 'PCBA-588590',
      'PCBA-588591', 'PCBA-588795', 'PCBA-588855', 'PCBA-602179', 'PCBA-602233',
      'PCBA-602310', 'PCBA-602313', 'PCBA-602332', 'PCBA-624170', 'PCBA-624171',
      'PCBA-624173', 'PCBA-624202', 'PCBA-624246', 'PCBA-624287', 'PCBA-624288',
      'PCBA-624291', 'PCBA-624296', 'PCBA-624297', 'PCBA-624417', 'PCBA-651635',
      'PCBA-651644', 'PCBA-651768', 'PCBA-651965', 'PCBA-652025', 'PCBA-652104',
      'PCBA-652105', 'PCBA-652106', 'PCBA-686970', 'PCBA-686978', 'PCBA-686979',
      'PCBA-720504', 'PCBA-720532', 'PCBA-720542', 'PCBA-720551', 'PCBA-720553',
      'PCBA-720579', 'PCBA-720580', 'PCBA-720707', 'PCBA-720708', 'PCBA-720709',
      'PCBA-720711', 'PCBA-743255', 'PCBA-743266', 'PCBA-875', 'PCBA-881',
      'PCBA-883', 'PCBA-884', 'PCBA-885', 'PCBA-887', 'PCBA-891', 'PCBA-899',
      'PCBA-902', 'PCBA-903', 'PCBA-904', 'PCBA-912', 'PCBA-914', 'PCBA-915',
      'PCBA-924', 'PCBA-925', 'PCBA-926', 'PCBA-927', 'PCBA-938', 'PCBA-995']

  loader = DataLoader(tasks=all_PCBA_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  return all_PCBA_tasks, dataset, transformers
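# Usage sketch for load_pcba, analogous to load_nci above. A minimal example,
# not part of the original module; the module name `pcba_datasets` is an
# assumption.
#
#   import tempfile
#   from pcba_datasets import load_pcba
#
#   work_dir = tempfile.mkdtemp()
#   pcba_tasks, pcba_dataset, pcba_transformers = load_pcba(work_dir)
#   print("%d PCBA assays, %d compounds"
#         % (len(pcba_tasks), len(pcba_dataset)))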