Example #1
def test_get_molnet_bbbp_dataset_with_smiles():
    # test with return_smiles=True
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp,
                                         return_smiles=True)

    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    smileses = datasets['smiles']
    datasets = datasets['dataset']
    assert len(smileses) == 3
    assert len(datasets) == 3

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = np.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == np.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == np.int32
        assert len(dataset) == expect_bbbp_lengths[i]
        assert len(smileses[i]) == expect_bbbp_lengths[i]
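
For reference, a minimal sketch of the call this test exercises; the import paths are assumed from the chainer-chemistry package layout, and fixtures such as expect_bbbp_lengths are defined elsewhere in the test module:

from chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor
from chainer_chemistry.datasets import molnet

# Assumed import paths; this mirrors the call under test above.
pp = AtomicNumberPreprocessor()
data = molnet.get_molnet_dataset('bbbp', preprocessor=pp, return_smiles=True)
train, valid, test = data['dataset']  # the three NumpyTupleDataset splits
train_smiles = data['smiles'][0]      # SMILES strings aligned with the train split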

Example #2
def test_get_molnet_bbbp_dataset_change_split_ratio():
    # test with a custom split ratio
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('bbbp',
                                         preprocessor=pp,
                                         frac_train=0.5,
                                         frac_valid=0.3,
                                         frac_test=0.2)
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    datasets = datasets['dataset']
    assert len(datasets) == 3
    assert type(datasets[0]) == NumpyTupleDataset
    assert type(datasets[1]) == NumpyTupleDataset
    assert type(datasets[2]) == NumpyTupleDataset

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.int32
        assert len(dataset) == expect_bbbp_lengths2[i]
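
(The three fractions above, 0.5/0.3/0.2, sum to 1.0, which the split fractions are expected to do; expect_bbbp_lengths2 is a module-level fixture holding the split sizes for that ratio.)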

Example #3
def test_get_molnet_qm7_dataset():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('qm7', preprocessor=pp)
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    datasets = datasets['dataset']
    assert len(datasets) == 3
    assert type(datasets[0]) == NumpyTupleDataset
    assert type(datasets[1]) == NumpyTupleDataset
    assert type(datasets[2]) == NumpyTupleDataset

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.float32

        # --- Test number of dataset ---
        assert len(dataset) == expect_qm7_lengths[i]
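
Note the dtype difference: BBBP is a binary-classification set, so its labels are numpy.int32 in the tests above, while QM7 is a regression set and its labels are numpy.float32.
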
Example #4
def test_get_molnet_grid_featurized_pdbbind_dataset():
    # test default behavior
    datasets = molnet.get_molnet_dataset('pdbbind_grid',
                                         pdbbind_subset='core',
                                         split='random')
    assert 'dataset' in datasets.keys()
    datasets = datasets['dataset']
    assert len(datasets) == 3
    assert type(datasets[0]) == NumpyTupleDataset
    assert type(datasets[1]) == NumpyTupleDataset
    assert type(datasets[2]) == NumpyTupleDataset

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.float32

        # --- Test number of dataset ---
        assert len(dataset) == expect_featurized_pdbbind_lengths[i]
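
'pdbbind_grid' loads a pre-featurized (grid) form of PDBBind, which is presumably why this test passes no preprocessor and asserts no 'smiles' key; pdbbind_subset='core' selects which PDBBind subset is fetched.
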
Example #5
def test_get_molnet_pdbbind_dataset_with_pdb_id():
    # test with return_pdb_id=True and a time_list
    pp = AtomicNumberPreprocessor()
    time_list = numpy.random.randint(1000, size=168).tolist()
    datasets = molnet.get_molnet_dataset('pdbbind_smiles',
                                         preprocessor=pp,
                                         pdbbind_subset='core',
                                         return_pdb_id=True,
                                         time_list=time_list,
                                         split='random')
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    assert 'pdb_id' in datasets.keys()
    pdb_ids = datasets['pdb_id']
    datasets = datasets['dataset']
    assert len(pdb_ids) == 3
    assert len(datasets) == 3

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.float32

        # --- Test number of dataset ---
        assert len(dataset) == expect_pdbbind_lengths[i]
        assert len(pdb_ids[i]) == expect_pdbbind_lengths[i]
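
With return_pdb_id=True the returned dict gains a 'pdb_id' key holding three lists aligned with the train/valid/test splits; the 168-entry time_list is the length this test assumes for the 'core' subset.
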
Example #6
def main():
    # Supported preprocessing/network list
    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--method',
                        type=str,
                        default='nfpdrop',
                        choices=['nfpdrop', 'ggnndrop', 'nfp', 'ggnn'])
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--layer', '-n', type=int, default=3)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--m', '-m', type=int, default=30)
    args = parser.parse_args()

    dataset_name = 'delaney'
    # labels = "measured log solubility in mols per litre"
    labels = None

    # Dataset preparation
    print('Preprocessing dataset...')
    method = args.method
    if 'nfp' in method:
        preprocess_method = 'nfp'
    elif 'ggnn' in method:
        preprocess_method = 'ggnn'
    else:
        raise ValueError('Unexpected method', method)
    preprocessor = preprocess_method_dict[preprocess_method]()
    data = get_molnet_dataset(dataset_name,
                              preprocessor,
                              labels=labels,
                              return_smiles=True,
                              frac_train=1.0,
                              frac_valid=0.0,
                              frac_test=0.0)
    dataset = data['dataset'][0]
    smiles = data['smiles'][0]

    epoch = args.epoch
    gpu = args.gpu

    n_unit_list = [32]
    random_state = np.random.RandomState(args.seed)
    n = len(dataset)

    M = args.m
    order = np.arange(n)
    random_state.shuffle(order)
    batchsize = args.batchsize
    for n_unit in n_unit_list:
        n_layer = args.layer
        n_split = 5
        for idx in range(n_split):
            print('Start training: ', idx + 1, "/", n_split)
            dir_path = get_dir_path(batchsize, n_unit, n_layer, M, method)
            os.makedirs(dir_path, exist_ok=True)
            np.save(os.path.join(dir_path, "smiles.npy"), np.array(smiles))
            train(gpu, method, epoch, batchsize, n_unit, n_layer, dataset,
                  smiles, M, n_split, idx, order)
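
This main() relies on imports (argparse, os, numpy as np) and helpers (train, get_dir_path, preprocess_method_dict, get_molnet_dataset) defined or imported elsewhere in the script. Assuming it is saved as train_delaney.py (a hypothetical name), the flags defined above would be used along these lines:

python train_delaney.py --method nfp --epoch 20 --batchsize 32 --layer 3 --gpu -1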