Example #1
def test_atomic_number_non_default_max_atoms_preprocessor(mol):
    preprocessor = AtomicNumberPreprocessor(max_atoms=5)
    ret_atom_array = preprocessor.get_input_features(mol)
    expect_atom_array = numpy.array([6, 7, 6, 8],
                                    dtype=numpy.int32)
    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)

    preprocessor = AtomicNumberPreprocessor(max_atoms=3)
    with pytest.raises(MolFeatureExtractionError):
        preprocessor.get_input_features(mol)
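
These preprocessor tests assume a `mol` fixture whose atoms are C, N, C, O in
that order. A minimal sketch of such a fixture, assuming RDKit is available:
the SMILES 'CN=C=O' (methyl isocyanate) yields exactly the atomic numbers
[6, 7, 6, 8] asserted above and in Examples #22 and #23.

import pytest
from rdkit import Chem


@pytest.fixture
def mol():
    # Methyl isocyanate: atoms C, N, C, O -> atomic numbers [6, 7, 6, 8].
    return Chem.MolFromSmiles('CN=C=O')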
Example #2
def test_get_molnet_bbbp_dataset_with_smiles():
    # test with return_smiles=True
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp,
                                         return_smiles=True)

    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    smileses = datasets['smiles']
    datasets = datasets['dataset']
    assert len(smileses) == 3
    assert len(datasets) == 3

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == np.int32
        assert len(dataset) == expect_bbbp_lengths[i]
        assert len(smileses[i]) == expect_bbbp_lengths[i]
Example #3
def test_get_molnet_bbbp_dataset_change_split_ratio():
    # test with a non-default split ratio
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('bbbp',
                                         preprocessor=pp,
                                         frac_train=0.5,
                                         frac_valid=0.3,
                                         frac_test=0.2)
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    datasets = datasets['dataset']
    assert len(datasets) == 3
    assert type(datasets[0]) == NumpyTupleDataset
    assert type(datasets[1]) == NumpyTupleDataset
    assert type(datasets[2]) == NumpyTupleDataset

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.int32
        assert len(dataset) == expect_bbbp_lengths2[i]
Example #4
def test_get_molnet_pdbbind_dataset_with_pdb_id():
    # test 'pdbbind_smiles' with return_pdb_id=True and a random split
    pp = AtomicNumberPreprocessor()
    time_list = numpy.random.randint(1000, size=168).tolist()
    datasets = molnet.get_molnet_dataset('pdbbind_smiles',
                                         preprocessor=pp,
                                         pdbbind_subset='core',
                                         return_pdb_id=True,
                                         time_list=time_list,
                                         split='random')
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    assert 'pdb_id' in datasets.keys()
    pdb_ids = datasets['pdb_id']
    datasets = datasets['dataset']
    assert len(pdb_ids) == 3
    assert len(datasets) == 3

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.float32

        # --- Test number of dataset ---
        assert len(dataset) == expect_pdbbind_lengths[i]
        assert len(pdb_ids[i]) == expect_pdbbind_lengths[i]
Example #5
def test_get_molnet_qm7_dataset():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('qm7', preprocessor=pp)
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    datasets = datasets['dataset']
    assert len(datasets) == 3
    assert type(datasets[0]) == NumpyTupleDataset
    assert type(datasets[1]) == NumpyTupleDataset
    assert type(datasets[2]) == NumpyTupleDataset

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.float32

        # --- Test number of dataset ---
        assert len(dataset) == expect_qm7_lengths[i]
Example #6
def test_get_qm9_smiles():
    # test with return_smiles=True
    pp = AtomicNumberPreprocessor()
    dataset, smiles = qm9.get_qm9(preprocessor=pp, return_smiles=True)

    # --- Test dataset is correctly obtained ---
    index = numpy.random.choice(len(dataset), None)
    atoms, label = dataset[index]

    assert atoms.ndim == 1  # (atom, )
    assert atoms.dtype == numpy.int32
    # (atom from, atom to) or (edge_type, atom from, atom to)
    assert label.ndim == 1
    assert label.shape[0] == QM9_NUM_LABEL
    assert label.dtype == numpy.float32

    # --- Test number of dataset ---
    assert len(dataset) == QM9_NUM_DATASET
    assert len(smiles) == QM9_NUM_DATASET

    # --- Test order of dataset ---
    atoms0, labels0 = dataset[0]
    assert smiles[0] == 'C'
    assert numpy.alltrue(atoms0 == numpy.array([6], dtype=numpy.int32))

    atoms7777, labels7777 = dataset[7777]
    assert smiles[7777] == 'CC1=NCCC(C)O1'
    assert numpy.alltrue(
        atoms7777 == numpy.array([6, 6, 7, 6, 6, 6, 6, 8], dtype=numpy.int32))

    atoms133884, labels133884 = dataset[133884]
    assert smiles[133884] == 'C1N2C3C4C5OC13C2C54'
    assert numpy.alltrue(atoms133884 == numpy.array(
        [6, 7, 6, 6, 6, 8, 6, 6, 6], dtype=numpy.int32))
Example #7
def get_qm9(preprocessor=None, labels=None, retain_smiles=False):
    """Downloads, caches and preprocesses QM9 dataset.

    Args:
        preprocessor (BasePreprocessor): Preprocessor.
            This should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        retain_smiles (bool): If set to ``True``,
            smiles list is also returned.

    Returns:
        dataset, which is composed of `features` that depend on the given
        `preprocessor`.

    """
    labels = labels or get_qm9_label_names()
    if isinstance(labels, str):
        labels = [labels, ]

    def postprocess_label(label_list):
        # This is a regression task; cast labels to float32.
        return numpy.asarray(label_list, dtype=numpy.float32)

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES1')
    dataset = parser.parse(get_qm9_filepath(), retain_smiles=retain_smiles)
    if retain_smiles:
        return dataset, parser.smiles
    else:
        return dataset
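
A usage sketch for the function above, assuming the QM9 CSV is already cached
and that 'mu' (the dipole moment) is among the QM9 label names:

dataset, smiles = get_qm9(labels='mu', retain_smiles=True)
atoms, label = dataset[0]
print(smiles[0])  # 'C' (methane), per the order test in Example #6
print(atoms)      # array([6], dtype=int32)
print(label)      # 1-d float32 array holding the 'mu' value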
Example #8
def test_atomic_number_preprocessor_default():
    preprocessor = AtomicNumberPreprocessor()
    dataset = SmilesParser(preprocessor).parse(
        ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'])['dataset']
    index = numpy.random.choice(len(dataset), None)
    atoms, = dataset[index]

    assert atoms.ndim == 1
    assert atoms.dtype == numpy.int32
Example #9
def test_atomic_number_preprocessor_with_tox21():
    preprocessor = AtomicNumberPreprocessor()
    dataset = SDFFileParser(preprocessor) \
        .parse(get_tox21_filepath('train'))['dataset']
    index = numpy.random.choice(len(dataset), None)
    atoms, = dataset[index]

    assert atoms.ndim == 1
    assert atoms.dtype == numpy.int32
Example #10
def test_atomic_number_preprocessor_with_tox21():
    preprocessor = AtomicNumberPreprocessor()

    # labels=None as default, and label information is not returned.
    dataset = SDFFileParser(preprocessor).parse(get_tox21_filepath('train'))
    index = numpy.random.choice(len(dataset), None)
    atoms, = dataset[index]

    assert atoms.ndim == 1  # (atom, )
    assert atoms.dtype == numpy.int32
Example #11
def test_atomic_number_non_default_max_atoms_preprocessor(mol):
    preprocessor = AtomicNumberPreprocessor(max_atoms=5)
    ret_atom_array = preprocessor.get_input_features(mol)
    expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32)
    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)

    preprocessor = AtomicNumberPreprocessor(max_atoms=3)
    with pytest.raises(MolFeatureExtractionError):
        preprocessor.get_input_features(mol)
Example #12
def test_get_zinc_smiles():
    # test smiles extraction and dataset order
    pp = AtomicNumberPreprocessor()
    target_index = [0, 7777, 249454]  # set target_index for fast testing...
    dataset, smiles = zinc.get_zinc250k(preprocessor=pp,
                                        return_smiles=True,
                                        target_index=target_index)

    # --- Test dataset is correctly obtained ---
    index = numpy.random.choice(len(dataset), None)
    atoms, label = dataset[index]

    assert atoms.ndim == 1  # (atom, )
    assert atoms.dtype == numpy.int32
    # (atom from, atom to) or (edge_type, atom from, atom to)
    assert label.ndim == 1
    assert label.shape[0] == ZINC250K_NUM_LABEL
    assert label.dtype == numpy.float32

    # --- Test number of dataset ---
    assert len(dataset) == len(target_index)
    assert len(smiles) == len(target_index)

    # --- Test order of dataset ---
    assert smiles[0] == 'CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1'
    atoms0, labels0 = dataset[0]
    assert numpy.alltrue(atoms0 == numpy.array([
        6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 8, 7, 6, 6, 6, 6, 6, 6, 9, 6, 6
    ], dtype=numpy.int32))
    assert numpy.alltrue(labels0 == numpy.array(
        [5.0506, 0.70201224, 2.0840945], dtype=numpy.float32))

    assert smiles[1] == 'CCCc1cc(NC(=O)Nc2ccc3c(c2)OCCO3)n(C)n1'
    atoms7777, labels7777 = dataset[1]
    assert numpy.alltrue(atoms7777 == numpy.array(
        [6, 6, 6, 6, 6, 6, 7, 6, 8, 7, 6, 6, 6, 6, 6, 6, 8, 6, 6, 8, 7, 6, 7],
        dtype=numpy.int32))
    assert numpy.alltrue(labels7777 == numpy.array(
        [2.7878, 0.9035222, 2.3195992], dtype=numpy.float32))

    assert smiles[2] == 'O=C(CC(c1ccccc1)c1ccccc1)N1CCN(S(=O)(=O)c2ccccc2[N+](=O)[O-])CC1'  # NOQA
    atoms249454, labels249454 = dataset[2]
    assert numpy.alltrue(atoms249454 == numpy.array([
        8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 7, 16, 8, 8,
        6, 6, 6, 6, 6, 6, 7, 8, 8, 6, 6
    ], dtype=numpy.int32))
    assert numpy.alltrue(labels249454 == numpy.array(
        [3.6499, 0.37028658, 2.2142494], dtype=numpy.float32))
Example #13
def get_tox21(preprocessor=None, labels=None, return_smiles=False):
    """Downloads, caches and preprocesses Tox21 dataset.

    Args:
        preprocessor (BasePreprocessor): Preprocessor.
            This should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        return_smiles (bool): If set to True, smiles array is also returned.

    Returns:
        The 3-tuple consisting of train, validation and test
        datasets, respectively. Each dataset is composed of `features`,
        which depend on the given `preprocessor`.
    """
    labels = labels or get_tox21_label_names()
    if isinstance(labels, str):
        labels = [labels, ]

    def postprocess_label(label_list):
        # Assign -1 where a label is missing; `sigmoid_cross_entropy`
        # then skips these entries when computing the loss.
        t = numpy.array([-1 if label is None else label for label in
                         label_list], dtype=numpy.int32)
        return t

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()
    parser = SDFFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels)

    train_result = parser.parse(get_tox21_filepath('train'),
                                return_smiles=return_smiles)
    val_result = parser.parse(get_tox21_filepath('val'),
                              return_smiles=return_smiles)
    test_result = parser.parse(get_tox21_filepath('test'),
                               return_smiles=return_smiles)

    if return_smiles:
        train, train_smiles = train_result['dataset'], train_result['smiles']
        val, val_smiles = val_result['dataset'], val_result['smiles']
        test, test_smiles = test_result['dataset'], test_result['smiles']
        return train, val, test, train_smiles, val_smiles, test_smiles
    else:
        train = train_result['dataset']
        val = val_result['dataset']
        test = test_result['dataset']
        return train, val, test
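
A usage sketch of the two return shapes above: the default call yields the
three splits, while return_smiles=True widens the result to a 6-tuple.

train, val, test = get_tox21()
train, val, test, train_smiles, val_smiles, test_smiles = \
    get_tox21(return_smiles=True)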
Example #14
def test_get_tox21():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    train, val, test = tox21.get_tox21(preprocessor=pp)

    # --- Test dataset is correctly obtained ---
    for dataset in [train, val, test]:
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        assert label.ndim == 1
        assert label.shape[0] == TOX21_NUM_LABEL
        assert label.dtype == numpy.int32
Example #15
def get_qm9(preprocessor=None,
            labels=None,
            return_smiles=False,
            target_index=None):
    """Downloads, caches and preprocesses QM9 dataset.

    Args:
        preprocessor (BasePreprocessor): Preprocessor.
            This should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        return_smiles (bool): If set to ``True``,
            smiles array is also returned.
        target_index (list or None): target index list to partially extract
            dataset. If None (default), all examples are parsed.

    Returns:
        dataset, which is composed of `features` that depend on the given
        `preprocessor`.

    """
    labels = labels or get_qm9_label_names()
    if isinstance(labels, str):
        labels = [
            labels,
        ]

    def postprocess_label(label_list):
        # This is a regression task; cast labels to float32.
        return numpy.asarray(label_list, dtype=numpy.float32)

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels,
                           smiles_col='SMILES1')
    result = parser.parse(get_qm9_filepath(),
                          return_smiles=return_smiles,
                          target_index=target_index)

    if return_smiles:
        return result['dataset'], result['smiles']
    else:
        return result['dataset']
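
A usage sketch of `target_index`, which the ZINC test in Example #12 uses in
the same way to keep parsing fast (the indices here are illustrative):

# Parse only three rows of QM9 instead of the full dataset.
dataset, smiles = get_qm9(return_smiles=True, target_index=[0, 7777, 100])
assert len(dataset) == 3
assert len(smiles) == 3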
Example #16
def test_get_zinc():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    dataset = zinc.get_zinc250k(preprocessor=pp)

    # --- Test dataset is correctly obtained ---
    index = numpy.random.choice(len(dataset), None)
    atoms, label = dataset[index]

    assert atoms.ndim == 1  # (atom, )
    assert atoms.dtype == numpy.int32
    assert label.ndim == 1
    assert label.shape[0] == ZINC250K_NUM_LABEL
    assert label.dtype == numpy.float32

    # --- Test number of dataset ---
    assert len(dataset) == ZINC250K_NUM_DATASET
Example #17
def test_get_qm9():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    dataset = qm9.get_qm9(preprocessor=pp)

    # --- Test dataset is correctly obtained ---
    index = numpy.random.choice(len(dataset), None)
    atoms, label = dataset[index]

    assert atoms.ndim == 1  # (atom, )
    assert atoms.dtype == numpy.int32
    # (atom from, atom to) or (edge_type, atom from, atom to)
    assert label.ndim == 1
    assert label.shape[0] == QM9_NUM_LABEL
    assert label.dtype == numpy.float32

    # --- Test number of dataset ---
    assert len(dataset) == QM9_NUM_DATASET
Example #18
def get_molnet_dataset(dataset_name,
                       preprocessor=None,
                       labels=None,
                       split=None,
                       frac_train=.8,
                       frac_valid=.1,
                       frac_test=.1,
                       seed=777,
                       return_smiles=False,
                       target_index=None,
                       task_index=0,
                       **kwargs):
    """Downloads, caches and preprocess MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. For details of
            MoleculeNet, please refer to the
            `official site <http://moleculenet.ai/datasets-1>`_.
            For the dataset_name values available in chainer_chemistry,
            please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split the dataset into
            train, validation and test sets. If `None`, this function uses
            the splitter recommended by MoleculeNet. Alternatively, you can
            pass an instance of BaseSplitter, or choose from 'random',
            'stratified' and 'scaffold'.
        return_smiles (bool): If set to ``True``,
            smiles array is also returned.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in dataset for stratification.
            (Stratified Splitter only)
    Returns (dict):
        Dictionary containing the dataset, already split into train, valid
        and test sets, together with a 1-d numpy array of dtype=object
        (string) holding the SMILES for each example, or `None`.

    """
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [
            labels,
        ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':

        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':

        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor,
                           labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        split = dataset_config['split'] if split is None else split

        if isinstance(split, str):
            splitter = split_method_dict[split]()
        elif isinstance(split, BaseSplitter):
            splitter = split
        else:
            raise TypeError("split must be None, str or instance of"
                            " BaseSplitter, but got {}".format(type(split)))

        if isinstance(splitter, ScaffoldSplitter):
            get_smiles = True
        else:
            get_smiles = return_smiles

        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=get_smiles,
                              target_index=target_index,
                              **kwargs)
        dataset = result['dataset']
        smiles = result['smiles']
        train_ind, valid_ind, test_ind = \
            splitter.train_valid_test_split(dataset, smiles_list=smiles,
                                            task_index=task_index,
                                            frac_train=frac_train,
                                            frac_valid=frac_valid,
                                            frac_test=frac_test, **kwargs)
        train = NumpyTupleDataset(*dataset.features[train_ind])
        valid = NumpyTupleDataset(*dataset.features[valid_ind])
        test = NumpyTupleDataset(*dataset.features[test_ind])

        result['dataset'] = (train, valid, test)
        if return_smiles:
            train_smiles = smiles[train_ind]
            valid_smiles = smiles[valid_ind]
            test_smiles = smiles[test_ind]
            result['smiles'] = (train_smiles, valid_smiles, test_smiles)
        else:
            result['smiles'] = None
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise ValueError('dataset_type={} is not supported'.format(
            dataset_config['dataset_type']))
    return result
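
A usage sketch of the splitter handling above, using the 'bbbp' dataset from
the earlier tests with an explicit scaffold split (one of the accepted split
strings):

data = get_molnet_dataset('bbbp', split='scaffold', return_smiles=True)
train, valid, test = data['dataset']
train_smiles, valid_smiles, test_smiles = data['smiles']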
Example #19
def get_molnet_dataset(dataset_name,
                       preprocessor=None,
                       labels=None,
                       split='random',
                       frac_train=.8,
                       frac_valid=.1,
                       frac_test=.1,
                       seed=777,
                       return_smiles=False,
                       target_index=None):
    """Downloads, caches and preprocess MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. For details of
            MoleculeNet, please refer to the
            `official site <http://moleculenet.ai/datasets-1>`_.
            For the dataset_name values available in chainer_chemistry,
            please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        return_smiles (bool): If set to ``True``,
            smiles array is also returned.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
    Returns (dict):
        Dictionary containing the dataset, already split into train, valid
        and test sets, together with a 1-d numpy array of dtype=object
        (string) holding the SMILES for each example, or `None`.

    """
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [
            labels,
        ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':

        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':

        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor,
                           labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=return_smiles,
                              target_index=target_index)
        # TODO(motoki): splitting function or class
        dataset = result['dataset']
        if split == 'random':
            perm = numpy.random.permutation(len(dataset))
            dataset = NumpyTupleDataset(*dataset.features[perm])
            train_data_size = int(len(dataset) * frac_train)
            valid_data_size = int(len(dataset) * frac_valid)
            train = NumpyTupleDataset(*dataset.features[:train_data_size])
            valid = NumpyTupleDataset(
                *dataset.features[train_data_size:train_data_size +
                                  valid_data_size])
            test = NumpyTupleDataset(*dataset.features[train_data_size +
                                                       valid_data_size:])

            result['dataset'] = (train, valid, test)
            if return_smiles:
                smiles = result['smiles'][perm]
                train_smiles = smiles[:train_data_size]
                valid_smiles = smiles[train_data_size:train_data_size +
                                      valid_data_size]
                test_smiles = smiles[train_data_size + valid_data_size:]
                result['smiles'] = (train_smiles, valid_smiles, test_smiles)
            else:
                result['smiles'] = None
        else:
            raise NotImplementedError
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise NotImplementedError
    return result
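
The random split in this older version is plain index arithmetic over a
permutation; a standalone sketch of the same slicing with the default
fractions:

import numpy

n = 10  # stand-in for len(dataset)
perm = numpy.random.permutation(n)
n_train = int(n * 0.8)   # frac_train
n_valid = int(n * 0.1)   # frac_valid
train_ind = perm[:n_train]
valid_ind = perm[n_train:n_train + n_valid]
test_ind = perm[n_train + n_valid:]  # the remainder, roughly frac_test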
Example #20
def test_atomic_number_preprocessor_assert_raises():
    with pytest.raises(ValueError):
        AtomicNumberPreprocessor(max_atoms=3, out_size=2)  # NOQA
Example #21
def get_pdbbind_smiles(pdbbind_subset, preprocessor=None, labels=None,
                       split=None, frac_train=.8, frac_valid=.1,
                       frac_test=.1, return_smiles=False, return_pdb_id=True,
                       target_index=None, task_index=0, time_list=None,
                       **kwargs):
    """Downloads, caches and preprocess PDBbind dataset.

    Args:
        pdbbind_subset (str): PDBbind dataset subset name. For details of
            each subset, please refer to the `official site
            <http://www.pdbbind.org.cn/download/pdbbind_2017_intro.pdf>`_
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split the dataset into
            train, validation and test sets. If `None`, this function uses
            the splitter recommended by MoleculeNet. Alternatively, you can
            pass an instance of BaseSplitter, or choose from 'random',
            'stratified' and 'scaffold'.
        return_smiles (bool): If set to ``True``,
            smiles array is also returned.
        return_pdb_id (bool): If set to ``True``,
            PDB ID array is also returned.
            This argument is only used when you select 'pdbbind_smiles'.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in dataset for stratification.
            (Stratified Splitter only)
    Returns (dict):
        Dictionary containing the dataset, already split into train, valid
        and test sets, together with 1-d numpy arrays of dtype=object
        (string) holding the SMILES and pdb_id for each example, or `None`.

    """
    config = molnet_default_config['pdbbind_smiles']
    labels = labels or config['tasks']
    if isinstance(labels, str):
        labels = [labels, ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    parser = CSVFileParser(preprocessor, labels=labels,
                           smiles_col=config['smiles_columns'],
                           postprocess_label=postprocess_label)
    split = config['split'] if split is None else split
    if isinstance(split, str):
        splitter = split_method_dict[split]()
    elif isinstance(split, BaseSplitter):
        splitter = split
    else:
        raise TypeError("split must be None, str or instance of"
                        " BaseSplitter, but got {}".format(type(split)))

    result = parser.parse(get_molnet_filepath('pdbbind_smiles',
                                              pdbbind_subset=pdbbind_subset),
                          return_smiles=return_smiles,
                          return_is_successful=True,
                          target_index=target_index)
    dataset = result['dataset']
    smiles = result['smiles']
    is_successful = result['is_successful']

    if return_pdb_id:
        df = pandas.read_csv(
            get_molnet_filepath('pdbbind_smiles',
                                pdbbind_subset=pdbbind_subset))
        pdb_id = df['id'][is_successful]
    else:
        pdb_id = None

    train_ind, valid_ind, test_ind = \
        splitter.train_valid_test_split(dataset, time_list=time_list,
                                        smiles_list=smiles,
                                        task_index=task_index,
                                        frac_train=frac_train,
                                        frac_valid=frac_valid,
                                        frac_test=frac_test, **kwargs)
    train = NumpyTupleDataset(*dataset.features[train_ind])
    valid = NumpyTupleDataset(*dataset.features[valid_ind])
    test = NumpyTupleDataset(*dataset.features[test_ind])

    result['dataset'] = (train, valid, test)

    if return_smiles:
        train_smiles = smiles[train_ind]
        valid_smiles = smiles[valid_ind]
        test_smiles = smiles[test_ind]
        result['smiles'] = (train_smiles, valid_smiles, test_smiles)
    else:
        result['smiles'] = None

    if return_pdb_id:
        train_pdb_id = pdb_id[train_ind]
        valid_pdb_id = pdb_id[valid_ind]
        test_pdb_id = pdb_id[test_ind]
        result['pdb_id'] = (train_pdb_id, valid_pdb_id, test_pdb_id)
    else:
        result['pdb_id'] = None
    return result
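
A usage sketch mirroring the test in Example #4: a random split of the 'core'
subset with PDB IDs returned alongside the splits (assumes the PDBbind SMILES
CSV is cached):

result = get_pdbbind_smiles('core', return_pdb_id=True, split='random')
train, valid, test = result['dataset']
train_ids, valid_ids, test_ids = result['pdb_id']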
Example #22
def test_atomic_number_preprocessor(mol):
    preprocessor = AtomicNumberPreprocessor(max_atoms=5, out_size=10)
    ret_atom_array = preprocessor.get_input_features(mol)
    expect_atom_array = numpy.array([6, 7, 6, 8, 0, 0, 0, 0, 0, 0],
                                    dtype=numpy.int32)
    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)
Example #23
def test_atomic_number_default_preprocessor(mol):
    preprocessor = AtomicNumberPreprocessor()
    ret_atom_array = preprocessor.get_input_features(mol)
    expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32)
    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)

def pp():
    # Returns the default preprocessor (no max_atoms / out_size limits).
    return AtomicNumberPreprocessor()