def test_get_featurized_data_scaffold():
    """Tries to load a previously prefeaturized dataset; otherwise creates the
    featurization, sets n_features, and dumps a pickle file of the transformers
    if they exist. Implemented in super. The dataset object from file is a
    delaney dataset using an ecfp featurizer with a default scaffold split.
    Featurization of the dataset is tested extensively in test_featurization.py.
    """
    (params_from_file, dataset_obj_from_file, df_delaney) = utils.delaney_objects()
    (params_from_file_scaffold, dataset_obj_from_file_scaffold,
     df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                         splitter="scaffold")
    dataset_obj_from_file_scaffold.params.transformers = True
    dataset_obj_from_file_scaffold.get_featurized_data()
    test_list = []
    test_list.append(
        isinstance(dataset_obj_from_file_scaffold.dataset,
                   dc.data.datasets.DiskDataset))
    test_list.append(
        len(dataset_obj_from_file_scaffold.dataset) == len(df_delaney))
    test_list.append(
        dataset_obj_from_file.n_features == dataset_obj_from_file.params.ecfp_size)
    test_list.append(
        len(dataset_obj_from_file.dataset.y) == len(
            dataset_obj_from_file.dataset.ids))
    assert all(test_list)
def test_combine_training_data_scaffold():
    """Concatenates train and valid from self.train_valid_dsets[0] into a
    combined DiskDataset. Implemented in super.
    """
    (params_from_file, dataset_obj_from_file, df_delaney) = utils.delaney_objects()
    (params_from_file_scaffold, dataset_obj_from_file_scaffold,
     df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                         splitter="scaffold")
    dataset_obj_from_file_scaffold.combined_training_data()
    (orig_train, orig_valid) = dataset_obj_from_file_scaffold.train_valid_dsets[0]
    test_list = []
    test_list.append(
        isinstance(dataset_obj_from_file_scaffold.combined_train_valid_data, DD))
    test_list.append(
        len(dataset_obj_from_file_scaffold.combined_train_valid_data.y) == len(
            dataset_obj_from_file_scaffold.combined_train_valid_data.ids))
    concat_train_valid = np.concatenate((orig_train.ids, orig_valid.ids))
    test_list.append(
        (concat_train_valid ==
         dataset_obj_from_file_scaffold.combined_train_valid_data.ids).all())
    test_list.append(len(orig_train.y) == len(orig_train.ids))
    assert all(test_list)
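
# Hedged sketch (illustration only, not part of the original tests): the id-ordering
# check above assumes combined_train_valid_data is simply train followed by valid.
# That concatenation can be mirrored by hand with DeepChem; _example_combine is a
# hypothetical helper, not part of the pipeline's API.
def _example_combine(train, valid):
    """Illustrative only: concatenate two DeepChem datasets, train rows first."""
    import numpy as np
    from deepchem.data import NumpyDataset
    return NumpyDataset(
        np.concatenate((train.X, valid.X)),
        np.concatenate((train.y, valid.y)),
        ids=np.concatenate((train.ids, valid.ids)))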
def test_get_split_metadata():
    """Pulls a dictionary that contains the splitting strategy and splitter
    used to generate the model.
    """
    (params_from_file, dataset_obj_from_file, df_delaney) = utils.delaney_objects()
    (params_from_file_scaffold, dataset_obj_from_file_scaffold,
     df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                         splitter="scaffold")
    out_dict = dataset_obj_from_file.get_split_metadata()
    test_list = []
    test_list.append(
        out_dict["split_strategy"] == dataset_obj_from_file.params.split_strategy)
    test_list.append(
        out_dict["splitter"] == dataset_obj_from_file.params.splitter)
    # TODO: num_folds does not match. Need to identify the difference in num_folds.
    # test_list.append(out_dict["Splitting"]["num_folds"] == dataset_obj_from_file.splitting.num_folds)
    test_list.append(
        out_dict["split_valid_frac"] == dataset_obj_from_file.params.split_valid_frac)
    test_list.append(
        out_dict["split_test_frac"] == dataset_obj_from_file.params.split_test_frac)
    test_list.append(
        out_dict["split_uuid"] == dataset_obj_from_file.split_uuid)
    assert all(test_list)
def test_get_dataset_tasks():
    """Tests task extraction with self.params.response_cols as a single value
    or a list. For datastore datasets, if y is not defined, tasks should be
    extracted from the dataset metadata itself. Returns True if tasks are
    found, False if they are not.
    """
    (params_from_file, dataset_obj_from_file, df_delaney) = utils.delaney_objects()
    (params_from_file_scaffold, dataset_obj_from_file_scaffold,
     df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                         splitter="scaffold")
    test_list = []
    flag_tasks_from_file = dataset_obj_from_file.get_dataset_tasks(
        delaney_from_disk)
    test_list.append(flag_tasks_from_file)
    test_list.append(
        dataset_obj_from_file.tasks == params_from_file.response_cols)
    subset_delaney = delaney_from_disk[[
        'Compound ID', 'smiles', 'measured log solubility in mols per litre',
        'Polar Surface Area'
    ]]

    # ksm: These tests are commented out because we no longer support datasets where response_cols is None.
    #flag_tasks_from_file_noy = dataset_obj_from_file_noy.get_dataset_tasks(subset_delaney)
    #test_list.append(flag_tasks_from_file_noy)
    #test_list.append(sorted(dataset_obj_from_file_noy.tasks) == sorted(['measured log solubility in mols per litre','Polar Surface Area']))
    #flag_tasks_from_file_failure = dataset_obj_from_file_noy.get_dataset_tasks(subset_delaney[['Compound ID','smiles']])
    #test_list.append(not flag_tasks_from_file_failure)

    if not datastore_is_down:
        flag_tasks_from_ds = dataset_obj_from_datastore.get_dataset_tasks(
            df_datastore)
        test_list.append(flag_tasks_from_ds)
        test_list.append(
            dataset_obj_from_datastore.tasks == params_from_ds.response_cols)
        subset_datastore = df_datastore[[
            'compound_id', 'rdkit_smiles', 'PIC50'
        ]]
        # TODO (ksm): The following test fails because the task name stored in the dataset metadata
        # differs from the response_cols parameter specified when the ModelDataset was created for
        # this dataset. Need to find a better dataset.
        # flag_tasks_from_ds_noy = dataset_obj_from_datastore_noy.get_dataset_tasks(subset_datastore)
        # test_list.append(flag_tasks_from_ds_noy)
        # test_list.append(sorted(dataset_obj_from_datastore_noy.tasks) == sorted(['PIC50']))

        # ksm: For datastore datasets, the following test never fails because the task name is
        # retrieved from the dataset metadata.
        #dataset_obj_from_datastore_noy.tasks = None
        #flag_tasks_from_ds_failure = dataset_obj_from_datastore_noy.get_dataset_tasks(subset_datastore[['compound_id','rdkit_smiles']])
        #test_list.append(not flag_tasks_from_ds_failure)

    # TODO (ksm): Add tests for a multitask dataset
    print(test_list)
    assert all(test_list)
def test_load_full_dataset():
    """Full dataset is properly loaded. Compares against datastore_functions
    and dataframe loading for the FileDataset and DatastoreDataset subclasses.
    """
    (params_from_file, dataset_obj_from_file, df_delaney) = utils.delaney_objects()
    (params_from_file_scaffold, dataset_obj_from_file_scaffold,
     df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                         splitter="scaffold")
    from_method = dataset_obj_from_file.load_full_dataset()
    assert from_method.equals(delaney_from_disk)
    if not datastore_is_down:
        from_method_datastore = dataset_obj_from_datastore.load_full_dataset()
        assert from_method_datastore.equals(df_datastore)
def test_save_split_dataset():
    """Saves the compound IDs and SMILES strings for a split subset.
    Implemented in super.
    """
    (params_from_file, dataset_obj_from_file, df_delaney) = utils.delaney_objects()
    (params_from_file_scaffold, dataset_obj_from_file_scaffold,
     df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                         splitter="scaffold")
    dataset_obj_from_file.save_split_dataset()
    # Renamed from 'dir' to avoid shadowing the builtin dir()
    dataset_dir = os.path.dirname(dataset_obj_from_file.params.dataset_key)
    split_path = '{0}/{1}'.format(dataset_dir,
                                  dataset_obj_from_file._get_split_key())
    assert os.path.isfile(split_path)
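
# Hedged follow-up (illustration only, not part of the original test): the saved split
# file is a CSV mapping compound IDs to their assigned subsets, so it can be inspected
# with pandas. The 'subset' column name is an assumption about AMPL's split-file layout,
# and _example_inspect_split is a hypothetical helper.
def _example_inspect_split(split_path):
    """Illustrative only: summarize how many compounds landed in each subset."""
    import pandas as pd
    split_df = pd.read_csv(split_path)
    return split_df['subset'].value_counts()  # e.g. counts of train/valid/test rows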
def test_check_task_columns():
    """Checks that self.tasks exists, then checks that all of the requested
    self.tasks exist within the dataframe. Throws an exception if
    self.get_dataset_tasks returns False or if prediction tasks are missing.
    Tests both exception raising on bad task columns and the success path.
    """
    (params_from_file, dataset_obj_from_file, df_delaney) = utils.delaney_objects()
    (params_from_file_scaffold, dataset_obj_from_file_scaffold,
     df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                         splitter="scaffold")
    #with pytest.raises(Exception):
    #    dataset_obj_from_file_wrongy.check_task_columns(delaney_from_disk)
    dataset_obj_from_file.check_task_columns(delaney_from_disk)
    assert dataset_obj_from_file.tasks == params_from_file.response_cols
    if not datastore_is_down:
        #with pytest.raises(Exception):
        #    dataset_obj_from_datastore_wrongy.check_task_columns(df_datastore)
        dataset_obj_from_datastore.get_dataset_tasks(df_datastore)
        assert dataset_obj_from_datastore.tasks == params_from_ds.response_cols
def test_create_model_dataset():
    """Tests that classes are properly generated from the factory method,
    asserting that the correct methods exist and are callable.
    """
    (params_from_file, dataset_obj_from_file, df_delaney) = utils.delaney_objects()
    (params_from_file_scaffold, dataset_obj_from_file_scaffold,
     df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                         splitter="scaffold")
    test_list = []
    test_list.append(
        isinstance(dataset_obj_from_file, model_dataset.FileDataset))
    methods = [
        "load_full_dataset", "check_task_columns", "load_featurized_data",
        "get_featurized_data", "get_dataset_tasks", "split_dataset",
        "get_split_metadata", "save_split_dataset", "load_presplit_dataset",
        "save_featurized_data", "combined_training_data"
    ]

    # Testing datastore
    if not datastore_is_down:
        test_list.append(
            isinstance(dataset_obj_from_datastore,
                       model_dataset.DatastoreDataset))
        for method in methods:
            test_list.append(
                callable(getattr(dataset_obj_from_datastore, method)))
        test_list.append(dataset_obj_from_datastore.ds_client is not None)
        test_list.append(dataset_obj_from_datastore.dataset_name)

    # Testing from file
    for method in methods:
        test_list.append(callable(getattr(dataset_obj_from_file, method)))
    test_list.append(
        isinstance(dataset_obj_from_file.featurization,
                   feat.DynamicFeaturization))
    test_list.append(
        isinstance(dataset_obj_from_file.splitting,
                   split.TrainValidTestSplitting))
    test_list.append(dataset_obj_from_file.dataset_name)
    assert all(test_list)
def test_load_presplit_dataset():
    """Loads the split files from disk. Uses splitting.get_split_prefix to
    specify the path of the split file and splitting.select_dset_by_attr_ids
    to select rows. Returns True or False. Initializes self.train_valid_attr,
    self.train_valid_dsets, self.test_attr, and self.test_dset. Implemented
    in super.
    """
    (params_from_file, dataset_obj_from_file, df_delaney) = utils.delaney_objects()
    (params_from_file_scaffold, dataset_obj_from_file_scaffold,
     df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                         splitter="scaffold")
    dataset_obj_from_file.get_featurized_data()
    dataset_obj_from_file.split_dataset()
    (orig_train, orig_valid) = dataset_obj_from_file.train_valid_dsets[0]
    (orig_train_attr, orig_valid_attr) = dataset_obj_from_file.train_valid_attr[0]
    dataset_obj_from_file.save_split_dataset()

    # Need to pass the split_uuid to recover the split we just saved
    (params_from_file2, dataset_obj_from_file2, df_delaney2) = utils.delaney_objects(
        split_uuid=dataset_obj_from_file.split_uuid)
    dataset_obj_from_file2.get_featurized_data()
    dataset_obj_from_file2.load_presplit_dataset()
    (train, valid) = dataset_obj_from_file2.train_valid_dsets[0]
    (train_attr, valid_attr) = dataset_obj_from_file2.train_valid_attr[0]

    test_list = []
    test_list.append((sorted(train.y) == sorted(orig_train.y)))
    test_list.append((sorted(valid.y) == sorted(orig_valid.y)))
    test_list.append(
        set(train_attr.index.values) == set(orig_train_attr.index.values))
    test_list.append(
        set(valid_attr.index.values) == set(orig_valid_attr.index.values))
    test_list.append((sorted(dataset_obj_from_file.test_dset.y) == sorted(
        dataset_obj_from_file2.test_dset.y)))
    test_list.append(
        set(dataset_obj_from_file.test_attr.index.values) == set(
            dataset_obj_from_file2.test_attr.index.values))
    assert all(test_list)
def test_split_dataset():
    """Uses the split_dataset method of splitting to split the data.
    Implemented in super. Because the various splitting strategies are heavily
    tested in test_splitting.py, this test simply ensures that the attributes
    are appropriately created.
    """
    (params_from_file, dataset_obj_from_file, df_delaney) = utils.delaney_objects()
    (params_from_file_scaffold, dataset_obj_from_file_scaffold,
     df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                         splitter="scaffold")
    dataset_obj_from_file.split_dataset()
    (train, valid) = dataset_obj_from_file.train_valid_dsets[0]
    (train_attr, valid_attr) = dataset_obj_from_file.train_valid_attr[0]
    test_list = []
    test_list.append(
        len(dataset_obj_from_file.dataset) == len(train) + len(valid) +
        len(dataset_obj_from_file.test_dset))
    test_list.append(set(train.ids.tolist()) == set(train_attr.index.tolist()))
    test_list.append(set(valid.ids.tolist()) == set(valid_attr.index.tolist()))
    test_list.append(
        set(dataset_obj_from_file.test_dset.ids.tolist()) == set(
            dataset_obj_from_file.test_attr.index.tolist()))

    # Testing that k-fold splits are properly generated
    dataset_obj_from_file.params.split_strategy = 'k_fold_cv'
    dataset_obj_from_file.split_dataset()
    (train, valid) = dataset_obj_from_file.train_valid_dsets[0]
    (train_attr, valid_attr) = dataset_obj_from_file.train_valid_attr[0]
    test_list.append(
        len(dataset_obj_from_file.dataset) == len(train) + len(valid) +
        len(dataset_obj_from_file.test_dset))
    test_list.append(set(train.ids.tolist()) == set(train_attr.index.tolist()))
    test_list.append(set(valid.ids.tolist()) == set(valid_attr.index.tolist()))
    test_list.append(
        set(dataset_obj_from_file.test_dset.ids.tolist()) == set(
            dataset_obj_from_file.test_attr.index.tolist()))
    assert all(test_list)
import deepchem as dc
try:
    from mol_vae_features import MoleculeVAEFeaturizer
    mol_vae_supported = True
except ModuleNotFoundError:
    mol_vae_supported = False
import utils_testing as utils

#WARNING: assuming model_dataset.py can create a functional data object.
#WARNING: assuming that config_delaney.json and config_datastore_cav12.json are in the current directory.

ownership = 'gskusers-ad'

datastore_is_down = utils.datastore_status()

(delaney_params_ecfp, data_obj_ecfp, df_delaney) = utils.delaney_objects()
featurizer_ecfp = data_obj_ecfp.featurization
data_obj_ecfp.check_task_columns(df_delaney)

(delaney_params_graphconv, data_obj_graphconv,
 df_delaney) = utils.delaney_objects(featurizer="graphconv")
featurizer_graphconv = data_obj_graphconv.featurization
data_obj_graphconv.check_task_columns(df_delaney)

if mol_vae_supported:
    (delaney_params_molvae, data_obj_molvae,
     df_delaney) = utils.delaney_objects(featurizer="molvae")
    featurizer_molvae = data_obj_molvae.featurization

if not datastore_is_down:
    (datastore_params, mdl_datastore, df_datastore) = utils.datastore_objects()
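
# Hedged sketch (not in the original tests): the datastore_is_down flag above could
# equally drive pytest's skip machinery, so datastore-dependent tests show up as
# skipped rather than silently passing. 'needs_datastore' is a hypothetical name.
import pytest
needs_datastore = pytest.mark.skipif(datastore_is_down,
                                     reason="datastore is unreachable")
# Usage: decorate a test with @needs_datastore instead of wrapping its body in an
# 'if not datastore_is_down:' block.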
import inspect
import os

currentdir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)

import atomsci.ddm.pipeline.featurization as feat
import atomsci.ddm.pipeline.model_datasets as model_datasets
import atomsci.ddm.pipeline.splitting as split
import deepchem as dc
import utils_testing as utils
from deepchem.data import DiskDataset

stratified_fixed = False

# ksm: In latest code, ModelDataset defers creating its splitting object until the call
# to ModelDataset.split_dataset(). So we have to do that first in order to access the
# splitting object.
(params_random, data_obj_random,
 df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                     splitter="random")
os.makedirs(params_random.output_dir, exist_ok=True)
data_obj_random.get_featurized_data()
data_obj_random.split_dataset()
splitter_random = data_obj_random.splitting

(params_scaffold, data_obj_scaffold,
 df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                     splitter="scaffold")
data_obj_scaffold.get_featurized_data()
data_obj_scaffold.split_dataset()
splitter_scaffold = data_obj_scaffold.splitting

(params_stratified, data_obj_stratified,
 df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                     splitter="stratified")
import atomsci.ddm.pipeline.model_datasets as model_dataset
import atomsci.ddm.pipeline.model_wrapper as model_wrapper
import atomsci.ddm.pipeline.model_pipeline as MP
import atomsci.ddm.pipeline.perf_data as perf_data
import utils_testing as utils
import copy
import pdb

"""This testing script assumes that /ds/data/public/delaney/delaney-processed.csv is
still on the same path on twintron. Assumes that the dataset with the dataset_key:
/ds/projdata/gsk_data/GSK_derived/PK_parameters/gsk_blood_plasma_partition_rat_crit_res_data.csv
under the bucket gskdata and with the object_oid: 5af0e6368003ff018de33db5 still exists.
"""

# The dataset object from file is a delaney dataset using an ecfp featurizer with a
# default scaffold split.
datastore_is_down = utils.datastore_status()

MP_delaney_ecfp_train_valid_test_random = utils.delaney_pipeline()
(delaney_params, mdl_dataset_delaney, delaney_df) = utils.delaney_objects()

general_params = {
    'dataset_key': './delaney-processed.csv',
    'featurizer': 'ecfp',
    'response_cols': 'measured log solubility in mols per litre',
    'id_col': 'Compound ID',
    'smiles_col': 'smiles',
    'output_dir': 'pytest',
    'model_type': 'NN',
    'splitter': 'scaffold',
    'prediction_type': 'regression',
    'baseline_epoch': '7',
    'max_epochs': '10',
    'datastore': 'False',
    'save_results': 'False'
}
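
# Hedged usage sketch (illustration only, not part of the original tests): how a raw
# params dict like general_params is typically consumed downstream. This assumes
# AMPL's parameter_parser.wrapper accepts a plain dict of string-valued options and
# returns a parsed namespace; _example_build_pipeline is a hypothetical helper.
def _example_build_pipeline(params_dict):
    """Illustrative only: turn a raw params dict into a ModelPipeline."""
    import atomsci.ddm.pipeline.parameter_parser as parse
    pparams = parse.wrapper(params_dict)  # parse/typecast the raw string options
    return MP.ModelPipeline(pparams)      # MP is model_pipeline, imported above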