def test_get_molnet_bbbp_dataset_with_smiles():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp,
                                         return_smiles=True)
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    smileses = datasets['smiles']
    datasets = datasets['dataset']
    assert len(smileses) == 3
    assert len(datasets) == 3

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.int32

        # --- Test number of dataset ---
        assert len(dataset) == expect_bbbp_lengths[i]
        assert len(smileses[i]) == expect_bbbp_lengths[i]

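# For reference, a minimal non-test sketch of the same call (assuming
# chainer-chemistry is installed and the same imports as the tests above).
# The returned dict carries parallel train/valid/test entries for both the
# featurized datasets and the raw SMILES strings.
def example_bbbp_usage():
    pp = AtomicNumberPreprocessor()
    data = molnet.get_molnet_dataset('bbbp', preprocessor=pp,
                                     return_smiles=True)
    train, valid, test = data['dataset']
    train_smiles, valid_smiles, test_smiles = data['smiles']
    atoms, label = train[0]  # atomic-number array (int32) and a binary label
    return atoms, label
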
def test_get_molnet_bbbp_dataset_change_split_ratio():
    # test with custom split ratio
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp,
                                         frac_train=0.5, frac_valid=0.3,
                                         frac_test=0.2)
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    datasets = datasets['dataset']
    assert len(datasets) == 3
    assert type(datasets[0]) == NumpyTupleDataset
    assert type(datasets[1]) == NumpyTupleDataset
    assert type(datasets[2]) == NumpyTupleDataset

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.int32

        # --- Test number of dataset ---
        assert len(dataset) == expect_bbbp_lengths2[i]

def test_get_molnet_qm7_dataset():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('qm7', preprocessor=pp)
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    datasets = datasets['dataset']
    assert len(datasets) == 3
    assert type(datasets[0]) == NumpyTupleDataset
    assert type(datasets[1]) == NumpyTupleDataset
    assert type(datasets[2]) == NumpyTupleDataset

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.float32  # qm7 is a regression task

        # --- Test number of dataset ---
        assert len(dataset) == expect_qm7_lengths[i]

def test_get_molnet_grid_featurized_pdbbind_dataset():
    # test default behavior
    datasets = molnet.get_molnet_dataset('pdbbind_grid',
                                         pdbbind_subset='core',
                                         split='random')
    assert 'dataset' in datasets.keys()
    datasets = datasets['dataset']
    assert len(datasets) == 3
    assert type(datasets[0]) == NumpyTupleDataset
    assert type(datasets[1]) == NumpyTupleDataset
    assert type(datasets[2]) == NumpyTupleDataset

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.float32

        # --- Test number of dataset ---
        assert len(dataset) == expect_featurized_pdbbind_lengths[i]

def test_get_molnet_pdbbind_dataset_with_pdb_id():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    time_list = numpy.random.randint(1000, size=168).tolist()
    datasets = molnet.get_molnet_dataset('pdbbind_smiles', preprocessor=pp,
                                         pdbbind_subset='core',
                                         return_pdb_id=True,
                                         time_list=time_list,
                                         split='random')
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    assert 'pdb_id' in datasets.keys()
    pdb_ids = datasets['pdb_id']
    datasets = datasets['dataset']
    assert len(pdb_ids) == 3
    assert len(datasets) == 3

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.float32

        # --- Test number of dataset ---
        assert len(dataset) == expect_pdbbind_lengths[i]
        assert len(pdb_ids[i]) == expect_pdbbind_lengths[i]

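# The tests above assume a module header along these lines (a sketch based on
# chainer-chemistry's public layout). The expect_*_lengths fixtures hold the
# expected train/valid/test sizes; their concrete values depend on the dataset
# release and split ratios, so they are not reproduced here.
import numpy

from chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor
from chainer_chemistry.datasets import molnet
from chainer_chemistry.datasets import NumpyTupleDataset
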
def main():
    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    # Supported preprocessing/network list
    parser.add_argument('--method', type=str, default='nfpdrop',
                        choices=['nfpdrop', 'ggnndrop', 'nfp', 'ggnn'])
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--layer', '-n', type=int, default=3)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--m', '-m', type=int, default=30)
    args = parser.parse_args()

    dataset_name = 'delaney'
    # labels = "measured log solubility in mols per litre"
    labels = None

    # Dataset preparation
    print('Preprocessing dataset...')
    method = args.method
    if 'nfp' in method:
        preprocess_method = 'nfp'
    elif 'ggnn' in method:
        preprocess_method = 'ggnn'
    else:
        raise ValueError('Unexpected method', method)
    preprocessor = preprocess_method_dict[preprocess_method]()
    data = get_molnet_dataset(dataset_name, preprocessor, labels=labels,
                              return_smiles=True, frac_train=1.0,
                              frac_valid=0.0, frac_test=0.0)
    dataset = data['dataset'][0]
    smiles = data['smiles'][0]

    epoch = args.epoch
    gpu = args.gpu
    n_unit_list = [32]
    random_state = np.random.RandomState(args.seed)
    n = len(dataset)
    M = args.m
    order = np.arange(n)
    random_state.shuffle(order)
    batchsize = args.batchsize

    for n_unit in n_unit_list:
        n_layer = args.layer
        n_split = 5
        for idx in range(n_split):
            print('Start training: ', idx + 1, "/", n_split)
            dir_path = get_dir_path(batchsize, n_unit, n_layer, M, method)
            os.makedirs(dir_path, exist_ok=True)
            np.save(os.path.join(dir_path, "smiles.npy"), np.array(smiles))
            train(gpu, method, epoch, batchsize, n_unit, n_layer, dataset,
                  smiles, M, n_split, idx, order)

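# main() references helpers that are not part of this excerpt: train(...)
# runs the actual cross-validation training and is omitted, and get_dir_path
# is sketched below with a hypothetical implementation that just encodes the
# hyperparameters into a results directory name. The imports are assumptions
# following chainer-chemistry's public layout.
import argparse
import os

import numpy as np

from chainer_chemistry.dataset.preprocessors import preprocess_method_dict
from chainer_chemistry.datasets.molnet import get_molnet_dataset


def get_dir_path(batchsize, n_unit, n_layer, M, method):
    # Hypothetical helper: one results directory per hyperparameter setting.
    return 'results/{}_b{}_u{}_l{}_M{}'.format(
        method, batchsize, n_unit, n_layer, M)


if __name__ == '__main__':
    main()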