def _load_dense_dataset(dataset, valid_size=0.1, test_size=0.1, min_size=0,
                        max_size=None, **kwargs):
  train_size = 1.0 - (test_size + valid_size)
  graphs = _read_graphfile(dataname=dataset, min_nodes=min_size,
                           max_nodes=max_size)
  labels = []
  for G in graphs:
    for u in G.nodes():
      if G.nodes[u].get("feat") is None:
        # fall back to node label if node attributes are not found
        G.nodes[u]['feat'] = np.array(G.nodes[u]['label'])
    labels.append(G.graph['label'])
  n_tasks = len(set(labels))
  labels = np.asarray(labels)
  dataset = NumpyDataset(graphs, y=labels, n_tasks=n_tasks)
  splitter = RandomSplitter()  # splits.RandomStratifiedSplitter()
  train, valid, test = splitter.train_valid_test_split(
      dataset, frac_train=train_size, frac_valid=valid_size,
      frac_test=test_size)
  datasets = []
  for dt in (train, valid, test):
    datasets.append(NetworkXGraphDataset(dt.X, dt.y, w=None, pad_to=max_size))
  in_size = datasets[0].X[0].shape[-1]
  return datasets, in_size, n_tasks
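
# Hedged usage sketch of _load_dense_dataset (a private helper, not a public
# API). The dataset name "ENZYMES" and the max_size value are illustrative
# assumptions about the benchmark files _read_graphfile expects.
datasets, in_size, n_tasks = _load_dense_dataset(
    "ENZYMES", valid_size=0.1, test_size=0.1, max_size=126)
train, valid, test = datasets
print("node feature size:", in_size, "num classes:", n_tasks)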
def random_test_train_valid_test_split_from_sdf(self):
  """Test of singletask CoulombMatrixEig regression on .sdf file."""
  splittype = "random"
  input_transforms = []
  output_transforms = ["normalize"]
  model_params = {}
  tasks = ["atomization_energy"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  current_dir = os.path.dirname(os.path.abspath(__file__))
  input_file = os.path.join(current_dir, "data/water.sdf")
  featurizer = CoulombMatrixEig(6, remove_hydrogens=False)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      mol_field="mol",
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  # Splits featurized samples into train/valid/test
  splitter = RandomSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)
  assert len(train_dataset) == 8
  assert len(valid_dataset) == 1
  assert len(test_dataset) == 1
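
# Hedged sketch of the same SDF-loading flow on the modern DeepChem API,
# where SDF handling moved from DataLoader to SDFLoader. The path
# "data/water.sdf" refers to the test asset above and is assumed to exist.
import deepchem as dc

featurizer = dc.feat.CoulombMatrixEig(max_atoms=6, remove_hydrogens=False)
loader = dc.data.SDFLoader(tasks=["atomization_energy"], featurizer=featurizer)
dataset = loader.create_dataset("data/water.sdf")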
def random_test_train_valid_test_split(self):
  """Test of singletask RF ECFP regression API."""
  input_transforms = []
  output_transforms = ["normalize"]
  model_params = {}
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example.csv")
  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  # Splits featurized samples into train/valid/test
  splitter = RandomSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)
  assert len(train_dataset) == 8
  assert len(valid_dataset) == 1
  assert len(test_dataset) == 1
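
# Hedged sketch of the ECFP featurization step above on the modern DeepChem
# API (deepchem >= 2.4 assumed; the SMILES strings are illustrative):
import deepchem as dc

featurizer = dc.feat.CircularFingerprint(size=1024)
X = featurizer.featurize(["CCO", "c1ccccc1"])
print(X.shape)  # (2, 1024)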
def test_singletask_random_split(self):
  """
  Test singletask RandomSplitter class.
  """
  solubility_dataset = self.load_solubility_data()
  random_splitter = RandomSplitter()
  train_data, valid_data, test_data = \
      random_splitter.train_valid_test_split(
          solubility_dataset,
          self.train_dir, self.valid_dir, self.test_dir,
          frac_train=0.8, frac_valid=0.1, frac_test=0.1)
  assert len(train_data) == 8
  assert len(valid_data) == 1
  assert len(test_data) == 1

  merge_dir = tempfile.mkdtemp()
  merged_dataset = DiskDataset.merge(merge_dir,
                                     [train_data, valid_data, test_data])
  assert sorted(merged_dataset.ids) == sorted(solubility_dataset.ids)
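
# A minimal, self-contained sketch of the 8/1/1 arithmetic the asserts above
# rely on (modern DeepChem API assumed; synthetic data stands in for the
# 10-sample solubility set):
import numpy as np
import deepchem as dc

X = np.random.rand(10, 4)
y = np.random.rand(10, 1)
dataset = dc.data.NumpyDataset(X, y)

splitter = dc.splits.RandomSplitter()
train, valid, test = splitter.train_valid_test_split(
    dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)
assert (len(train), len(valid), len(test)) == (8, 1, 1)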
base_dir = "/tmp/nci_rf" train_dir = os.path.join(base_dir, "train_dataset") valid_dir = os.path.join(base_dir, "valid_dataset") test_dir = os.path.join(base_dir, "test_dataset") model_dir = os.path.join(base_dir, "model") if os.path.exists(base_dir): shutil.rmtree(base_dir) os.makedirs(base_dir) nci_tasks, nci_dataset, transformers = load_nci( base_dir, reload=reload, force_transform=force_transform) print("About to perform train/valid/test split.") splitter = RandomSplitter(verbosity=verbosity) print("Performing new split.") train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split( nci_dataset, train_dir, valid_dir, test_dir) classification_metric = Metric(metrics.roc_auc_score, np.mean, verbosity=verbosity, mode="classification") def model_builder(model_dir): sklearn_model = RandomForestRegressor(n_estimators=500) return SklearnModel(sklearn_model, model_dir) model = SingletaskToMultitask(nci_tasks, model_builder, model_dir) # Fit trained model model.fit(train_dataset) model.save() train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity) train_scores = train_evaluator.compute_model_performance([classification_metric])
""" base_sider_data_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_data" sider_tasks, sider_dataset, sider_transformers = load_sider( base_sider_data_dir, reload=reload) base_sider_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_analysis" sider_train_dir = os.path.join(base_sider_dir, "train_dataset") sider_valid_dir = os.path.join(base_sider_dir, "valid_dataset") sider_test_dir = os.path.join(base_sider_dir, "test_dataset") sider_model_dir = os.path.join(base_sider_dir, "model") sider_splitter = RandomSplitter() sider_train_dataset, sider_valid_dataset, sider_test_dataset = sider_splitter.train_valid_test_split( sider_dataset, sider_train_dir, sider_valid_dir, sider_test_dir) # Fit Logistic Regression models sider_task_types = {task: "classification" for task in sider_tasks} params_dict = { "batch_size": None, "data_shape": sider_train_dataset.get_data_shape(), } sider_model = SingletaskToMultitask(sider_tasks, sider_task_types, params_dict, sider_model_dir, model_builder, verbosity=verbosity)
base_dir = "/tmp/nci_rf" train_dir = os.path.join(base_dir, "train_dataset") valid_dir = os.path.join(base_dir, "valid_dataset") test_dir = os.path.join(base_dir, "test_dataset") model_dir = os.path.join(base_dir, "model") if os.path.exists(base_dir): shutil.rmtree(base_dir) os.makedirs(base_dir) nci_tasks, nci_dataset, transformers = load_nci( base_dir, reload=reload, force_transform=force_transform) print("About to perform train/valid/test split.") splitter = RandomSplitter(verbosity=verbosity) print("Performing new split.") train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split( nci_dataset, train_dir, valid_dir, test_dir) classification_metric = Metric(metrics.roc_auc_score, np.mean, verbosity=verbosity, mode="classification") def model_builder(model_dir): sklearn_model = RandomForestRegressor(n_estimators=500) return SklearnModel(sklearn_model, model_dir) model = SingletaskToMultitask(nci_tasks, model_builder, model_dir) # Fit trained model
def test_singletask_random_split(self):
  """
  Test singletask RandomSplitter class.
  """
  solubility_dataset = self.load_solubility_data()
  random_splitter = RandomSplitter()
  train_data, valid_data, test_data = \
      random_splitter.train_valid_test_split(
          solubility_dataset,
          self.train_dir, self.valid_dir, self.test_dir,
          frac_train=0.8, frac_valid=0.1, frac_test=0.1)
  assert len(train_data) == 8
  assert len(valid_data) == 1
  assert len(test_data) == 1

def test_singletask_scaffold_split(self):
  """
  Test singletask ScaffoldSplitter class.
  """
  solubility_dataset = self.load_solubility_data()
  scaffold_splitter = ScaffoldSplitter()
  train_data, valid_data, test_data = \
      scaffold_splitter.train_valid_test_split(
          solubility_dataset,
          self.train_dir, self.valid_dir, self.test_dir,
          frac_train=0.8, frac_valid=0.1, frac_test=0.1)
  assert len(train_data) == 8
  assert len(valid_data) == 1
  assert len(test_data) == 1

def test_multitask_random_split(self):
  """
  Test multitask RandomSplitter class.
  """
  multitask_dataset = self.load_multitask_data()
  random_splitter = RandomSplitter()
  train_data, valid_data, test_data = \
      random_splitter.train_valid_test_split(
          multitask_dataset,
          self.train_dir, self.valid_dir, self.test_dir,
          frac_train=0.8, frac_valid=0.1, frac_test=0.1)
  assert len(train_data) == 8
  assert len(valid_data) == 1
  assert len(test_data) == 1

def test_multitask_scaffold_split(self):
  """
  Test multitask ScaffoldSplitter class.
  """
  multitask_dataset = self.load_multitask_data()
  scaffold_splitter = ScaffoldSplitter()
  train_data, valid_data, test_data = \
      scaffold_splitter.train_valid_test_split(
          multitask_dataset,
          self.train_dir, self.valid_dir, self.test_dir,
          frac_train=0.8, frac_valid=0.1, frac_test=0.1)
  assert len(train_data) == 8
  assert len(valid_data) == 1
  assert len(test_data) == 1

def test_stratified_multitask_split(self):
  """
  Test multitask StratifiedSplitter class.
  """
  # Ensure the sparse dataset is actually sparse.
  sparse_dataset = self.load_sparse_multitask_dataset()
  X, y, w, ids = sparse_dataset.to_numpy()
  # Sparsity is determined by the number of w weights that are 0 for a given
  # task. Each row of the w array corresponds to a sample and each column to
  # a task -- e.g., the third column holds the weights for the third task.
  frac_train = 0.5
  cutoff = int(frac_train * w.shape[0])
  w = w[:cutoff, :]

  sparse_flag = False
  col_index = 0
  for col in w.T:
    if not np.any(col):  # check to see if this column is all zero
      sparse_flag = True
      break
    col_index += 1
  if not sparse_flag:
    print("Test dataset isn't sparse -- test failed")
  else:
    print("Column %d is sparse -- expected" % col_index)
  assert sparse_flag

  stratified_splitter = StratifiedSplitter()
  train_data, valid_data, test_data = \
      stratified_splitter.train_valid_test_split(
          sparse_dataset,
          self.train_dir, self.valid_dir, self.test_dir,
          frac_train=0.8, frac_valid=0.1, frac_test=0.1)

  datasets = [train_data, valid_data, test_data]
  dataset_index = 0
  for dataset in datasets:
    X, y, w, ids = dataset.to_numpy()
    # Verify that each task in each split has some hits.
    for col in w.T:
      if not np.any(col):
        print("Fail -- one column doesn't have results")
        if dataset_index == 0:
          print("train_data failed")
        elif dataset_index == 1:
          print("valid_data failed")
        elif dataset_index == 2:
          print("test_data failed")
      assert np.any(col)
    if dataset_index == 0:
      print("train_data passed")
    elif dataset_index == 1:
      print("valid_data passed")
    elif dataset_index == 2:
      print("test_data passed")
    dataset_index += 1
  print("end of stratified test")
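
# Standalone NumPy sketch of the sparsity check in the stratified test above:
# a task is "sparse" in a block of rows when its weight column is all zeros.
import numpy as np

w = np.ones((10, 3))
w[:, 2] = 0.0  # third task has no labeled samples in this block
sparse_cols = [i for i, col in enumerate(w.T) if not np.any(col)]
assert sparse_cols == [2]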
""" base_sider_data_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_data" sider_tasks, sider_dataset, sider_transformers = load_sider( base_sider_data_dir, reload=reload) base_sider_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_analysis" sider_train_dir = os.path.join(base_sider_dir, "train_dataset") sider_valid_dir = os.path.join(base_sider_dir, "valid_dataset") sider_test_dir = os.path.join(base_sider_dir, "test_dataset") sider_model_dir = os.path.join(base_sider_dir, "model") sider_splitter = RandomSplitter() sider_train_dataset, sider_valid_dataset, sider_test_dataset = sider_splitter.train_valid_test_split( sider_dataset, sider_train_dir, sider_valid_dir, sider_test_dir) # Fit Logistic Regression models sider_task_types = {task: "classification" for task in sider_tasks} params_dict = { "batch_size": None, "data_shape": sider_train_dataset.get_data_shape(), } sider_model = SingletaskToMultitask(sider_tasks, sider_task_types, params_dict, sider_model_dir, model_builder, verbosity=verbosity) sider_model.reload() """ Load sweetlead dataset now. Pass in dataset object and appropriate transformers to predict functions