def random_test_train_valid_test_split(self):
    """Test of singletask RF ECFP regression API.

    Featurizes example.csv with circular fingerprints, then checks that
    RandomSplitter produces an 8/1/1 train/valid/test split.
    """
    # Removed unused locals (input_transforms, output_transforms,
    # model_params, task_type, task_types) and a redundant second
    # os.path.join on the already-absolute input_file path.
    tasks = ["log-solubility"]
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizer = CircularFingerprint(size=1024)
    loader = DataLoader(
        tasks=tasks,
        smiles_field=self.smiles_field,
        featurizer=featurizer,
        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/valid/test.
    splitter = RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1
def _load_dense_dataset(dataset, valid_size=0.1, test_size=0.1, min_size=0, max_size=None, **kwargs):
    """Load a dense graph dataset and split it into train/valid/test.

    Reads graphs from `dataset`, backfills missing node features from node
    labels, splits randomly, and wraps each split in a NetworkXGraphDataset.
    Returns (datasets, input feature size, number of tasks).
    """
    train_size = 1.0 - (valid_size + test_size)
    graphs = _read_graphfile(dataname=dataset, min_nodes=min_size, max_nodes=max_size)

    graph_labels = []
    for graph in graphs:
        for node in graph.nodes():
            # Fall back to the node label if node attributes are not found.
            if graph.nodes[node].get("feat") is None:
                graph.nodes[node]["feat"] = np.array(graph.nodes[node]["label"])
        graph_labels.append(graph.graph["label"])

    n_tasks = len(set(graph_labels))
    labels = np.asarray(graph_labels)
    full_dataset = NumpyDataset(graphs, y=labels, n_tasks=n_tasks)

    splitter = RandomSplitter()  # splits.RandomStratifiedSplitter()
    train, valid, test = splitter.train_valid_test_split(
        full_dataset,
        frac_train=train_size,
        frac_valid=valid_size,
        frac_test=test_size)

    datasets = [
        NetworkXGraphDataset(part.X, part.y, w=None, pad_to=max_size)
        for part in (train, valid, test)
    ]
    in_size = datasets[0].X[0].shape[-1]
    return datasets, in_size, n_tasks
def test_singletask_random_k_fold_split(self):
    """Test singletask RandomSplitter k-fold splitting.

    Checks fold sizes, membership in the original dataset, pairwise
    disjointness, and that merging the folds recovers the full dataset.
    """
    solubility_dataset = self.load_solubility_data()
    splitter = RandomSplitter()
    original_ids = set(solubility_dataset.ids)
    n_folds = 5
    fold_dirs = [tempfile.mkdtemp() for _ in range(n_folds)]
    fold_datasets = splitter.k_fold_split(solubility_dataset, fold_dirs)

    fold_id_sets = [set(fold.ids) for fold in fold_datasets]
    for i, fold_ids in enumerate(fold_id_sets):
        # Each fold holds 10 / k == 2 compounds.
        assert len(fold_datasets[i]) == 2
        # Compounds in this fold are a subset of the original compounds.
        assert fold_ids.issubset(original_ids)
        # No two folds share a compound.
        for j, other_ids in enumerate(fold_id_sets):
            if i == j:
                continue
            assert fold_ids.isdisjoint(other_ids)

    merge_dir = tempfile.mkdtemp()
    merged_dataset = DiskDataset.merge(merge_dir, fold_datasets)
    assert len(merged_dataset) == len(solubility_dataset)
    assert sorted(merged_dataset.ids) == sorted(solubility_dataset.ids)
def random_test_train_valid_test_split_from_sdf(self):
    """Test of singletask CoulombMatrixEig regression on .sdf file.

    Featurizes data/water.sdf with Coulomb matrix eigenvalues, then checks
    that RandomSplitter produces an 8/1/1 train/valid/test split.
    """
    # Removed unused locals (splittype, input_transforms, output_transforms,
    # model_params, task_type, task_types) and the redundant
    # os.path.join(self.current_dir, input_file) — input_file is already
    # an absolute path built from __file__.
    tasks = ["atomization_energy"]
    current_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(current_dir, "data/water.sdf")
    featurizer = CoulombMatrixEig(6, remove_hydrogens=False)
    loader = DataLoader(
        tasks=tasks,
        smiles_field=self.smiles_field,
        mol_field="mol",
        featurizer=featurizer,
        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/valid/test.
    splitter = RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1
def test_multitask_random_split(self):
    """Test multitask RandomSplitter class.

    An 8/1/1 split is expected from a 10-sample multitask dataset.
    """
    dataset = self.load_multitask_data()
    splitter = RandomSplitter()
    train_data, valid_data, test_data = splitter.train_valid_test_split(
        dataset,
        self.train_dir,
        self.valid_dir,
        self.test_dir,
        frac_train=0.8,
        frac_valid=0.1,
        frac_test=0.1)
    assert (len(train_data), len(valid_data), len(test_data)) == (8, 1, 1)
def _load_mol_dataset(dataset_file, tasks, split="stratified", test_size=0.1, valid_size=0.1, min_size=0, max_size=None, **kwargs):
    """Featurize a CSV molecule dataset, split it, balance it, and graph-transform it.

    Loads SMILES from `dataset_file`, splits with the strategy named by
    `split`, balances task weights using statistics from the training split,
    and converts each split into a MolDataset.
    Returns (datasets, input feature size, output size).
    """
    train_size = 1.0 - (test_size + valid_size)
    loader = CSVLoader(
        tasks=tasks,
        smiles_field="smiles",
        featurizer=RawFeaturizer(),
        verbose=False,
        log_every_n=10000)
    full_dataset = loader.featurize(dataset_file)

    # Dispatch on the requested split strategy; KeyError for unknown names,
    # same as the original lookup.
    splitter = {
        'index': IndexSplitter(),
        'random': RandomSplitter(),
        'scaffold': ScaffoldSplitter(),
        'butina': ButinaSplitter(),
        'stratified': RandomStratifiedSplitter()
    }[split]
    train, valid, test = splitter.train_valid_test_split(
        full_dataset,
        frac_train=train_size,
        frac_valid=valid_size,
        frac_test=test_size)

    # Compute data balance information on train, then apply it to all splits.
    balancer = BalancingTransformer(transform_w=True, dataset=train)
    train, valid, test = (balancer.transform(part) for part in (train, valid, test))

    transformer = GraphTransformer(mol_size=[min_size, max_size], **kwargs)
    datasets = []
    for part in (train, valid, test):
        X, ids = transformer(part.ids, dtype=np.float32, ignore_errors=False)
        y = part.y[ids, :]
        w = part.w[ids, :]
        datasets.append(MolDataset(X, y, part.X[ids], w=w, pad_to=max_size))

    # X and y still refer to the last (test) split here, as in the original.
    in_size = X[0][-1].shape[-1]
    out_size = 1 if len(y.shape) == 1 else y.shape[-1]
    return datasets, in_size, out_size
def test_singletask_random_split(self):
    """Test singletask RandomSplitter class.

    Checks the 8/1/1 split sizes and that merging the splits back together
    recovers exactly the original compounds.
    """
    solubility_dataset = self.load_solubility_data()
    splitter = RandomSplitter()
    train_data, valid_data, test_data = splitter.train_valid_test_split(
        solubility_dataset,
        self.train_dir,
        self.valid_dir,
        self.test_dir,
        frac_train=0.8,
        frac_valid=0.1,
        frac_test=0.1)
    assert (len(train_data), len(valid_data), len(test_data)) == (8, 1, 1)

    merge_dir = tempfile.mkdtemp()
    merged_dataset = DiskDataset.merge(
        merge_dir, [train_data, valid_data, test_data])
    assert sorted(merged_dataset.ids) == sorted(solubility_dataset.ids)
def partition_train_val_test(smiles, dataset):
    """Split a molecule dataset (SMILES) with deepchem built-ins.

    Parameters:
        smiles: sequence of SMILES strings wrapped in a MockDataset.
        dataset: one of "BBBP", "BACE" (scaffold split) or "TOX21"
            (random split).

    Returns a dict with "train_inds", "val_inds", and "test_inds".

    Raises:
        ValueError: for an unrecognized dataset name. (The original left
        `splitter` unbound in that case, producing a confusing NameError.)
    """
    ds = MockDataset(smiles)
    # BBBP and BACE both use scaffold splitting; merge the duplicate branches.
    if dataset in ("BBBP", "BACE"):
        splitter = ScaffoldSplitter()
    elif dataset == "TOX21":
        splitter = RandomSplitter()
    else:
        raise ValueError("Unknown dataset: %r" % (dataset,))
    train_inds, val_inds, test_inds = splitter.split(ds)
    return {
        "train_inds": train_inds,
        "val_inds": val_inds,
        "test_inds": test_inds
    }
Load sider models now """ base_sider_data_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_data" sider_tasks, sider_dataset, sider_transformers = load_sider( base_sider_data_dir, reload=reload) base_sider_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_analysis" sider_train_dir = os.path.join(base_sider_dir, "train_dataset") sider_valid_dir = os.path.join(base_sider_dir, "valid_dataset") sider_test_dir = os.path.join(base_sider_dir, "test_dataset") sider_model_dir = os.path.join(base_sider_dir, "model") sider_splitter = RandomSplitter() sider_train_dataset, sider_valid_dataset, sider_test_dataset = sider_splitter.train_valid_test_split( sider_dataset, sider_train_dir, sider_valid_dir, sider_test_dir) # Fit Logistic Regression models sider_task_types = {task: "classification" for task in sider_tasks} params_dict = { "batch_size": None, "data_shape": sider_train_dataset.get_data_shape(), } sider_model = SingletaskToMultitask(sider_tasks, sider_task_types, params_dict, sider_model_dir,
# Script chunk: load the NCI dataset, split it randomly, and set up a
# random-forest model builder.
# NOTE(review): `reload` and `verbosity` are not defined anywhere in this
# chunk — presumably this code sits inside a function or after module-level
# assignments not visible here; confirm before refactoring.
force_transform = False
base_dir = "/tmp/nci_rf"
train_dir = os.path.join(base_dir, "train_dataset")
valid_dir = os.path.join(base_dir, "valid_dataset")
test_dir = os.path.join(base_dir, "test_dataset")
model_dir = os.path.join(base_dir, "model")
# Start from a clean working directory on every run.
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
os.makedirs(base_dir)
nci_tasks, nci_dataset, transformers = load_nci(
    base_dir, reload=reload, force_transform=force_transform)
print("About to perform train/valid/test split.")
splitter = RandomSplitter(verbosity=verbosity)
print("Performing new split.")
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    nci_dataset, train_dir, valid_dir, test_dir)
# NOTE(review): the metric is classification ROC-AUC but the builder below
# constructs a RandomForestRegressor — looks inconsistent; verify intent.
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")


def model_builder(model_dir):
    # Build a 500-tree random forest wrapped as a deepchem SklearnModel.
    sklearn_model = RandomForestRegressor(n_estimators=500)
    return SklearnModel(sklearn_model, model_dir)
def get_model(model_name: str):
    """Construct a model for the given featurizer/model name.

    NOTE(review): relies on module-level names `model_obj`, `wang_tasks`,
    and `wang_train` that are bound elsewhere (see the __main__ loop below);
    confirm they are set before this is called.
    """
    if model_name == "ECFP":
        # ECFP models take an explicit input feature size.
        model = model_obj(len(wang_tasks),
                          wang_train.get_data_shape()[0],
                          batch_size=50,
                          tensorboard_log_frequency=25)
    else:
        model = model_obj(len(wang_tasks),
                          batch_size=50,
                          mode='regression',
                          tensorboard_log_frequency=25)
    return model


# Splitters to benchmark; the commented entries are intentionally disabled.
splitter_dict = {
    "Random": RandomSplitter(),
    #"Scaffold": ScaffoldSplitterNew(),
    #"MolecularWeight": MolecularWeightSplitterNew(),
    #"Butina": ButinaSplitterNew(),
}

if __name__ == "__main__":
    # Benchmark every (splitter, model) combination.
    # NOTE(review): this loop body appears truncated in this chunk — it ends
    # right after `featurizer = model_name`; the remainder is not visible here.
    results = {}
    for splitter_name, splitter in splitter_dict.items():
        logging.info(f"Generating scaffolds with {splitter_name}")
        results[splitter_name] = {}
        for model_name, model_obj in model_dict.items():
            logging.info(f"Using {model_name} as a model")
            results[splitter_name][model_name] = {}
            featurizer = model_name
def test_singletask_random_split(self):
    """ Test singletask RandomSplitter class. """
    solubility_dataset = self.load_solubility_data()
    random_splitter = RandomSplitter()
    # Expect an 8/1/1 split of the 10-sample solubility dataset.
    train_data, valid_data, test_data = \
        random_splitter.train_valid_test_split(
            solubility_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1


def test_singletask_scaffold_split(self):
    """ Test singletask ScaffoldSplitter class. """
    solubility_dataset = self.load_solubility_data()
    scaffold_splitter = ScaffoldSplitter()
    # Expect an 8/1/1 split of the 10-sample solubility dataset.
    train_data, valid_data, test_data = \
        scaffold_splitter.train_valid_test_split(
            solubility_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1


def test_multitask_random_split(self):
    """ Test multitask RandomSplitter class. """
    multitask_dataset = self.load_multitask_data()
    random_splitter = RandomSplitter()
    # Expect an 8/1/1 split of the 10-sample multitask dataset.
    train_data, valid_data, test_data = \
        random_splitter.train_valid_test_split(
            multitask_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1


def test_multitask_scaffold_split(self):
    """ Test multitask ScaffoldSplitter class. """
    multitask_dataset = self.load_multitask_data()
    scaffold_splitter = ScaffoldSplitter()
    # Expect an 8/1/1 split of the 10-sample multitask dataset.
    train_data, valid_data, test_data = \
        scaffold_splitter.train_valid_test_split(
            multitask_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1


def test_stratified_multitask_split(self):
    """ Test multitask StratifiedSplitter class """
    # ensure sparse dataset is actually sparse
    sparse_dataset = self.load_sparse_multitask_dataset()
    X, y, w, ids = sparse_dataset.to_numpy()
    """
    sparsity is determined by number of w weights that are 0 for a given task
    structure of w np array is such that each row corresponds to a sample --
    e.g., analyze third column for third sparse task
    """
    # Only inspect the first half of samples when hunting for a sparse task.
    frac_train = 0.5
    cutoff = int(frac_train * w.shape[0])
    w = w[:cutoff, :]
    sparse_flag = False
    col_index = 0
    # A task (column) whose weights are all zero is sparse.
    for col in w.T:
        if not np.any(col):
            # check to see if any columns are all zero
            sparse_flag = True
            break
        col_index += 1
    if not sparse_flag:
        print("Test dataset isn't sparse -- test failed")
    else:
        print("Column %d is sparse -- expected" % col_index)
    assert sparse_flag
    stratified_splitter = StratifiedSplitter()
    train_data, valid_data, test_data = \
        stratified_splitter.train_valid_test_split(
            sparse_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1
        )
    datasets = [train_data, valid_data, test_data]
    dataset_index = 0
    for dataset in datasets:
        X, y, w, ids = dataset.to_numpy()
        # verify that each task in the train dataset has some hits
        for col in w.T:
            if not np.any(col):
                # Diagnostic prints before the assert pinpoint which split failed.
                print("Fail -- one column doesn't have results")
                if dataset_index == 0:
                    print("train_data failed")
                elif dataset_index == 1:
                    print("valid_data failed")
                elif dataset_index == 2:
                    print("test_data failed")
            assert np.any(col)
        if dataset_index == 0:
            print("train_data passed")
        elif dataset_index == 1:
            print("valid_data passed")
        elif dataset_index == 2:
            print("test_data passed")
        dataset_index += 1
    print("end of stratified test")
    # NOTE(review): trivially-true assert kept from the original; it only
    # marks the end of the test.
    assert 1 == 1