# --- ToxCast benchmark: load data and build a stratified train/valid/test split ---
# Verbosity level passed through to deepchem objects below.
verbosity = "high"

# Location of the featurized ToxCast dataset on disk.
base_tox_data_dir = "/home/apappu/deepchem-models/toxcast_models/toxcast/toxcast_data"
tox_tasks, tox_dataset, tox_transformers = load_tox(base_tox_data_dir, reload=reload)

#removes directory if present -- warning
base_tox_dir = "/home/apappu/deepchem-models/toxcast_models/toxcast/toxcast_analysis"
tox_train_dir = os.path.join(base_tox_dir, "train_dataset")
tox_valid_dir = os.path.join(base_tox_dir, "valid_dataset")
tox_test_dir = os.path.join(base_tox_dir, "test_dataset")
tox_model_dir = os.path.join(base_tox_dir, "model")

# Stratified splitting keeps sparsely-labeled tasks represented in each subset.
tox_splitter = StratifiedSplitter()
#default split is 80-10-10 train-valid-test split
tox_train_dataset, tox_valid_dataset, tox_test_dataset = tox_splitter.train_valid_test_split(
    tox_dataset, tox_train_dir, tox_valid_dir, tox_test_dir)

# Fit Logistic Regression models
# Every ToxCast task is treated as a binary classification problem.
tox_task_types = {task: "classification" for task in tox_tasks}
# Mean ROC-AUC across tasks is the evaluation metric.
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")
params_dict = {
    "batch_size": None,
    "data_shape": tox_train_dataset.get_data_shape(),
    # NOTE(review): this dict literal is truncated in the visible chunk --
    # the remaining entries and the closing brace lie outside this view.
# --- ToxCast benchmark: load data and build a stratified train/valid/test split ---
# Verbosity level passed through to deepchem objects below.
verbosity = "high"

# Location of the featurized ToxCast dataset on disk.
base_tox_data_dir = "/home/apappu/deepchem-models/toxcast_models/toxcast/toxcast_data"
tox_tasks, tox_dataset, tox_transformers = load_tox(
    base_tox_data_dir, reload=reload)

#removes directory if present -- warning
base_tox_dir = "/home/apappu/deepchem-models/toxcast_models/toxcast/toxcast_analysis"
tox_train_dir = os.path.join(base_tox_dir, "train_dataset")
tox_valid_dir = os.path.join(base_tox_dir, "valid_dataset")
tox_test_dir = os.path.join(base_tox_dir, "test_dataset")
tox_model_dir = os.path.join(base_tox_dir, "model")

# Stratified splitting keeps sparsely-labeled tasks represented in each subset.
tox_splitter = StratifiedSplitter()
#default split is 80-10-10 train-valid-test split
tox_train_dataset, tox_valid_dataset, tox_test_dataset = tox_splitter.train_valid_test_split(
    tox_dataset, tox_train_dir, tox_valid_dir, tox_test_dir)

# Fit Logistic Regression models
# Every ToxCast task is treated as a binary classification problem.
tox_task_types = {task: "classification" for task in tox_tasks}
# Mean ROC-AUC across tasks is the evaluation metric.
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")
params_dict = {
    "batch_size": None,
    "data_shape": tox_train_dataset.get_data_shape(),
    # NOTE(review): this dict literal is truncated in the visible chunk --
    # the remaining entries and the closing brace lie outside this view.
def test_stratified_multitask_split(self):
    """
    Test multitask StratifiedSplitter class.

    First verifies that the fixture dataset is genuinely sparse: at least
    one task column of the weight matrix ``w`` is all zeros within the
    first half of the samples. Then performs an 80/10/10 stratified split
    and asserts that every task keeps at least one nonzero weight in each
    of the train/valid/test subsets.
    """
    # Ensure sparse dataset is actually sparse before exercising the split.
    sparse_dataset = self.load_sparse_multitask_dataset()
    X, y, w, ids = sparse_dataset.to_numpy()

    # Sparsity is determined by the number of w weights that are 0 for a
    # given task. Each row of w corresponds to a sample; each column to a
    # task -- e.g., analyze the third column for the third sparse task.
    frac_train = 0.5
    cutoff = int(frac_train * w.shape[0])
    w = w[:cutoff, :]

    # Find the first all-zero task column, if any (None => not sparse).
    sparse_col = next(
        (i for i, col in enumerate(w.T) if not np.any(col)), None)
    if sparse_col is None:
        print("Test dataset isn't sparse -- test failed")
    else:
        print("Column %d is sparse -- expected" % sparse_col)
    assert sparse_col is not None

    stratified_splitter = StratifiedSplitter()
    train_data, valid_data, test_data = \
        stratified_splitter.train_valid_test_split(
            sparse_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)

    # Verify that each task in every split has some hits (nonzero weights).
    names = ("train_data", "valid_data", "test_data")
    for name, dataset in zip(names, (train_data, valid_data, test_data)):
        X, y, w, ids = dataset.to_numpy()
        for col in w.T:
            has_hits = np.any(col)
            if not has_hits:
                print("Fail -- one column doesn't have results")
                print("%s failed" % name)
            assert has_hits
        print("%s passed" % name)
    print("end of stratified test")
def test_stratified_multitask_split(self):
    """
    Test multitask StratifiedSplitter class.

    Confirms the fixture dataset is sparse (some task column of the weight
    matrix ``w`` is all zeros within the first half of the rows), then
    checks that an 80/10/10 stratified split leaves every task with at
    least one nonzero weight in each of the train/valid/test subsets.
    """
    # Ensure sparse dataset is actually sparse before exercising the split.
    sparse_dataset = self.load_sparse_multitask_dataset()
    X, y, w, ids = sparse_dataset.to_numpy()

    # Sparsity is determined by the number of w weights that are 0 for a
    # given task. Rows of w are samples; columns are tasks.
    frac_train = 0.5
    cutoff = int(frac_train * w.shape[0])
    w = w[:cutoff, :]

    # Scan task columns for one that is entirely zero.
    sparse_flag = False
    col_index = 0
    for col_index, col in enumerate(w.T):
        if not np.any(col):  # check to see if any columns are all zero
            sparse_flag = True
            break
    if not sparse_flag:
        print("Test dataset isn't sparse -- test failed")
    else:
        print("Column %d is sparse -- expected" % col_index)
    assert sparse_flag

    stratified_splitter = StratifiedSplitter()
    train_data, valid_data, test_data = \
        stratified_splitter.train_valid_test_split(
            sparse_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)

    # Each task in each resulting subset must retain some hits.
    splits = (("train_data", train_data),
              ("valid_data", valid_data),
              ("test_data", test_data))
    for name, dataset in splits:
        X, y, w, ids = dataset.to_numpy()
        for col in w.T:
            has_hits = np.any(col)
            if not has_hits:
                print("Fail -- one column doesn't have results")
                print("%s failed" % name)
            assert has_hits
        print("%s passed" % name)
    print("end of stratified test")