def test_atomic_number_non_default_max_atoms_preprocessor(mol):
    preprocessor = AtomicNumberPreprocessor(max_atoms=5)
    ret_atom_array = preprocessor.get_input_features(mol)
    expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32)
    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)

    preprocessor = AtomicNumberPreprocessor(max_atoms=3)
    with pytest.raises(MolFeatureExtractionError):
        preprocessor.get_input_features(mol)

def test_get_molnet_bbbp_dataset_with_smiles():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp,
                                         return_smiles=True)
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    smileses = datasets['smiles']
    datasets = datasets['dataset']
    assert len(smileses) == 3
    assert len(datasets) == 3

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = np.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == np.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == np.int32

        assert len(dataset) == expect_bbbp_lengths[i]
        assert len(smileses[i]) == expect_bbbp_lengths[i]

def test_get_molnet_bbbp_dataset_change_split_ratio():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp,
                                         frac_train=0.5, frac_valid=0.3,
                                         frac_test=0.2)
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    datasets = datasets['dataset']
    assert len(datasets) == 3
    assert type(datasets[0]) == NumpyTupleDataset
    assert type(datasets[1]) == NumpyTupleDataset
    assert type(datasets[2]) == NumpyTupleDataset

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.int32

        assert len(dataset) == expect_bbbp_lengths2[i]

def test_get_molnet_pdbbind_dataset_with_pdb_id():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    time_list = numpy.random.randint(1000, size=168).tolist()
    datasets = molnet.get_molnet_dataset('pdbbind_smiles', preprocessor=pp,
                                         pdbbind_subset='core',
                                         return_pdb_id=True,
                                         time_list=time_list, split='random')
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    assert 'pdb_id' in datasets.keys()
    pdb_ids = datasets['pdb_id']
    datasets = datasets['dataset']
    assert len(pdb_ids) == 3
    assert len(datasets) == 3

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.float32

        # --- Test number of dataset ---
        assert len(dataset) == expect_pdbbind_lengths[i]
        assert len(pdb_ids[i]) == expect_pdbbind_lengths[i]

def test_get_molnet_qm7_dataset():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    datasets = molnet.get_molnet_dataset('qm7', preprocessor=pp)
    assert 'smiles' in datasets.keys()
    assert 'dataset' in datasets.keys()
    datasets = datasets['dataset']
    assert len(datasets) == 3
    assert type(datasets[0]) == NumpyTupleDataset
    assert type(datasets[1]) == NumpyTupleDataset
    assert type(datasets[2]) == NumpyTupleDataset

    # Test each train, valid and test dataset
    for i, dataset in enumerate(datasets):
        # --- Test dataset is correctly obtained ---
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        # (atom from, atom to) or (edge_type, atom from, atom to)
        assert label.ndim == 1
        assert label.shape[0] == 1
        assert label.dtype == numpy.float32

        # --- Test number of dataset ---
        assert len(dataset) == expect_qm7_lengths[i]

def test_get_qm9_smiles():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    dataset, smiles = qm9.get_qm9(preprocessor=pp, return_smiles=True)

    # --- Test dataset is correctly obtained ---
    index = numpy.random.choice(len(dataset), None)
    atoms, label = dataset[index]

    assert atoms.ndim == 1  # (atom, )
    assert atoms.dtype == numpy.int32
    # (atom from, atom to) or (edge_type, atom from, atom to)
    assert label.ndim == 1
    assert label.shape[0] == QM9_NUM_LABEL
    assert label.dtype == numpy.float32

    # --- Test number of dataset ---
    assert len(dataset) == QM9_NUM_DATASET
    assert len(smiles) == QM9_NUM_DATASET

    # --- Test order of dataset ---
    atoms0, labels0 = dataset[0]
    assert smiles[0] == 'C'
    assert numpy.alltrue(atoms0 == numpy.array([6], dtype=numpy.int32))

    atoms7777, labels7777 = dataset[7777]
    assert smiles[7777] == 'CC1=NCCC(C)O1'
    assert numpy.alltrue(
        atoms7777 == numpy.array([6, 6, 7, 6, 6, 6, 6, 8], dtype=numpy.int32))

    atoms133884, labels133884 = dataset[133884]
    assert smiles[133884] == 'C1N2C3C4C5OC13C2C54'
    assert numpy.alltrue(atoms133884 == numpy.array(
        [6, 7, 6, 6, 6, 8, 6, 6, 6], dtype=numpy.int32))

def get_qm9(preprocessor=None, labels=None, retain_smiles=False):
    """Downloads, caches and preprocesses QM9 dataset.

    Args:
        preprocessor (BasePreprocessor): Preprocessor.
            This should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        retain_smiles (bool): If set to ``True``, smiles list is also
            returned.

    Returns:
        dataset, which is composed of `features`, which depends on
        `preprocess_method`.
    """
    labels = labels or get_qm9_label_names()
    if isinstance(labels, str):
        labels = [labels, ]

    def postprocess_label(label_list):
        # This is a regression task, so cast to float values.
        return numpy.asarray(label_list, dtype=numpy.float32)

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES1')
    dataset = parser.parse(get_qm9_filepath(), retain_smiles=retain_smiles)

    if retain_smiles:
        return dataset, parser.smiles
    else:
        return dataset

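# A minimal usage sketch of the get_qm9 variant above. Hedged: the import
# paths and the 'homo' label name are assumptions, not shown in this file.
from chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor
from chainer_chemistry.datasets import qm9

dataset, smiles = qm9.get_qm9(preprocessor=AtomicNumberPreprocessor(),
                              labels='homo', retain_smiles=True)
atoms, label = dataset[0]  # atomic-number array and the 'homo' target
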
def test_atomic_number_preprocessor_default():
    preprocessor = AtomicNumberPreprocessor()
    dataset = SmilesParser(preprocessor).parse(
        ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'])['dataset']
    index = numpy.random.choice(len(dataset), None)
    atoms, = dataset[index]
    assert atoms.ndim == 1
    assert atoms.dtype == numpy.int32

def test_atomic_number_preprocessor_with_tox21():
    preprocessor = AtomicNumberPreprocessor()
    dataset = SDFFileParser(preprocessor) \
        .parse(get_tox21_filepath('train'))['dataset']
    index = numpy.random.choice(len(dataset), None)
    atoms, = dataset[index]
    assert atoms.ndim == 1
    assert atoms.dtype == numpy.int32

def test_atomic_number_preprocessor_with_tox21():
    preprocessor = AtomicNumberPreprocessor()
    # labels=None as default, and label information is not returned.
    dataset = SDFFileParser(preprocessor).parse(get_tox21_filepath('train'))
    index = numpy.random.choice(len(dataset), None)
    atoms, = dataset[index]
    assert atoms.ndim == 1  # (atom, )
    assert atoms.dtype == numpy.int32

def test_get_zinc_smiles():
    # test smiles extraction and dataset order
    pp = AtomicNumberPreprocessor()
    target_index = [0, 7777, 249454]  # set target_index for fast testing...
    dataset, smiles = zinc.get_zinc250k(preprocessor=pp, return_smiles=True,
                                        target_index=target_index)

    # --- Test dataset is correctly obtained ---
    index = numpy.random.choice(len(dataset), None)
    atoms, label = dataset[index]

    assert atoms.ndim == 1  # (atom, )
    assert atoms.dtype == numpy.int32
    # (atom from, atom to) or (edge_type, atom from, atom to)
    assert label.ndim == 1
    assert label.shape[0] == ZINC250K_NUM_LABEL
    assert label.dtype == numpy.float32

    # --- Test number of dataset ---
    assert len(dataset) == len(target_index)
    assert len(smiles) == len(target_index)

    # --- Test order of dataset ---
    assert smiles[0] == 'CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1'
    atoms0, labels0 = dataset[0]
    assert numpy.alltrue(atoms0 == numpy.array([
        6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 8, 7, 6, 6, 6, 6, 6, 6, 9, 6, 6
    ], dtype=numpy.int32))
    assert numpy.alltrue(labels0 == numpy.array(
        [5.0506, 0.70201224, 2.0840945], dtype=numpy.float32))

    assert smiles[1] == 'CCCc1cc(NC(=O)Nc2ccc3c(c2)OCCO3)n(C)n1'
    atoms7777, labels7777 = dataset[1]
    assert numpy.alltrue(atoms7777 == numpy.array(
        [6, 6, 6, 6, 6, 6, 7, 6, 8, 7, 6, 6, 6, 6, 6, 6, 8, 6, 6, 8, 7, 6, 7],
        dtype=numpy.int32))
    assert numpy.alltrue(labels7777 == numpy.array(
        [2.7878, 0.9035222, 2.3195992], dtype=numpy.float32))

    assert smiles[2] == 'O=C(CC(c1ccccc1)c1ccccc1)N1CCN(S(=O)(=O)c2ccccc2[N+](=O)[O-])CC1'  # NOQA
    atoms249454, labels249454 = dataset[2]
    assert numpy.alltrue(atoms249454 == numpy.array([
        8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 7, 16, 8, 8,
        6, 6, 6, 6, 6, 6, 7, 8, 8, 6, 6
    ], dtype=numpy.int32))
    assert numpy.alltrue(labels249454 == numpy.array(
        [3.6499, 0.37028658, 2.2142494], dtype=numpy.float32))

def get_tox21(preprocessor=None, labels=None, return_smiles=False):
    """Downloads, caches and preprocesses Tox21 dataset.

    Args:
        preprocessor (BasePreprocessor): Preprocessor.
            This should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        return_smiles (bool): If set to ``True``, smiles array is also
            returned.

    Returns:
        The 3-tuple consisting of train, validation and test datasets,
        respectively. Each dataset is composed of `features`, which depends
        on `preprocess_method`.
    """
    labels = labels or get_tox21_label_names()
    if isinstance(labels, str):
        labels = [labels, ]

    def postprocess_label(label_list):
        # Set -1 where the label is not found; this corresponds to not
        # calculating the loss with `sigmoid_cross_entropy`.
        t = numpy.array([-1 if label is None else label
                         for label in label_list], dtype=numpy.int32)
        return t

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()
    parser = SDFFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels)
    train_result = parser.parse(get_tox21_filepath('train'),
                                return_smiles=return_smiles)
    val_result = parser.parse(get_tox21_filepath('val'),
                              return_smiles=return_smiles)
    test_result = parser.parse(get_tox21_filepath('test'),
                               return_smiles=return_smiles)

    if return_smiles:
        train, train_smiles = train_result['dataset'], train_result['smiles']
        val, val_smiles = val_result['dataset'], val_result['smiles']
        test, test_smiles = test_result['dataset'], test_result['smiles']
        return train, val, test, train_smiles, val_smiles, test_smiles
    else:
        train = train_result['dataset']
        val = val_result['dataset']
        test = test_result['dataset']
        return train, val, test

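# A minimal usage sketch of get_tox21 above. Hedged: the import paths are
# assumptions; the label width corresponds to TOX21_NUM_LABEL in the tests.
from chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor
from chainer_chemistry.datasets import tox21

train, val, test = tox21.get_tox21(preprocessor=AtomicNumberPreprocessor())
atoms, label = train[0]  # atomic numbers and per-task labels (-1 = missing)
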
def test_get_tox21():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    train, val, test = tox21.get_tox21(preprocessor=pp)

    # --- Test dataset is correctly obtained ---
    for dataset in [train, val, test]:
        index = numpy.random.choice(len(dataset), None)
        atoms, label = dataset[index]

        assert atoms.ndim == 1  # (atom, )
        assert atoms.dtype == numpy.int32
        assert label.ndim == 1
        assert label.shape[0] == TOX21_NUM_LABEL
        assert label.dtype == numpy.int32

def get_qm9(preprocessor=None, labels=None, return_smiles=False,
            target_index=None):
    """Downloads, caches and preprocesses QM9 dataset.

    Args:
        preprocessor (BasePreprocessor): Preprocessor.
            This should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        return_smiles (bool): If set to ``True``, smiles array is also
            returned.
        target_index (list or None): target index list to partially extract
            dataset. If None (default), all examples are parsed.

    Returns:
        dataset, which is composed of `features`, which depends on
        `preprocess_method`.
    """
    from chainer_chemistry.dataset.parsers.csv_file_parser import CSVFileParser

    labels = labels or get_qm9_label_names()
    if isinstance(labels, str):
        labels = [labels, ]

    def postprocess_label(label_list):
        # This is a regression task, so cast to float values.
        return numpy.asarray(label_list, dtype=numpy.float32)

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES1')
    result = parser.parse(get_qm9_filepath(), return_smiles=return_smiles,
                          target_index=target_index)

    if return_smiles:
        return result['dataset'], result['smiles']
    else:
        return result['dataset']

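# Hypothetical call of the get_qm9 variant above, using target_index to parse
# only a handful of rows for a quick check (import paths are assumptions).
from chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor
from chainer_chemistry.datasets import qm9

dataset, smiles = qm9.get_qm9(preprocessor=AtomicNumberPreprocessor(),
                              return_smiles=True, target_index=[0, 1, 2])
assert len(dataset) == 3 and len(smiles) == 3
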
def test_get_zinc():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    dataset = zinc.get_zinc250k(preprocessor=pp)

    # --- Test dataset is correctly obtained ---
    index = numpy.random.choice(len(dataset), None)
    atoms, label = dataset[index]

    assert atoms.ndim == 1  # (atom, )
    assert atoms.dtype == numpy.int32
    assert label.ndim == 1
    assert label.shape[0] == ZINC250K_NUM_LABEL
    assert label.dtype == numpy.float32

    # --- Test number of dataset ---
    assert len(dataset) == ZINC250K_NUM_DATASET

def test_get_qm9():
    # test default behavior
    pp = AtomicNumberPreprocessor()
    dataset = qm9.get_qm9(preprocessor=pp)

    # --- Test dataset is correctly obtained ---
    index = numpy.random.choice(len(dataset), None)
    atoms, label = dataset[index]

    assert atoms.ndim == 1  # (atom, )
    assert atoms.dtype == numpy.int32
    # (atom from, atom to) or (edge_type, atom from, atom to)
    assert label.ndim == 1
    assert label.shape[0] == QM9_NUM_LABEL
    assert label.dtype == numpy.float32

    # --- Test number of dataset ---
    assert len(dataset) == QM9_NUM_DATASET

def get_molnet_dataset(dataset_name, preprocessor=None, labels=None,
                       split=None, frac_train=.8, frac_valid=.1, frac_test=.1,
                       seed=777, return_smiles=False, target_index=None,
                       task_index=0, **kwargs):
    """Downloads, caches and preprocesses MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            detail of MoleculeNet, please refer to the
            `official site <http://moleculenet.ai/datasets-1>`_.
            If you would like to know which dataset_name is available for
            chainer_chemistry, please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split dataset into train,
            validation and test. If `None`, this function uses the splitter
            that is recommended by MoleculeNet. Additionally, you can use an
            instance of BaseSplitter or choose it from 'random', 'stratified'
            and 'scaffold'.
        return_smiles (bool): If set to ``True``, smiles array is also
            returned.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in dataset for stratification.
            (Stratified Splitter only)

    Returns (dict):
        Dictionary that contains a dataset already split into train, valid
        and test datasets, and a 1-d numpy array with dtype=object(string)
        which is a vector of smiles for each example, or `None`.
    """
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [labels, ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':
        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':
        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor, labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        split = dataset_config['split'] if split is None else split

        if isinstance(split, str):
            splitter = split_method_dict[split]()
        elif isinstance(split, BaseSplitter):
            splitter = split
        else:
            raise TypeError("split must be None, str or instance of"
                            " BaseSplitter, but got {}".format(type(split)))

        if isinstance(splitter, ScaffoldSplitter):
            get_smiles = True
        else:
            get_smiles = return_smiles

        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=get_smiles,
                              target_index=target_index, **kwargs)
        dataset = result['dataset']
        smiles = result['smiles']
        train_ind, valid_ind, test_ind = \
            splitter.train_valid_test_split(dataset, smiles_list=smiles,
                                            task_index=task_index,
                                            frac_train=frac_train,
                                            frac_valid=frac_valid,
                                            frac_test=frac_test, **kwargs)
        train = NumpyTupleDataset(*dataset.features[train_ind])
        valid = NumpyTupleDataset(*dataset.features[valid_ind])
        test = NumpyTupleDataset(*dataset.features[test_ind])

        result['dataset'] = (train, valid, test)
        if return_smiles:
            train_smiles = smiles[train_ind]
            valid_smiles = smiles[valid_ind]
            test_smiles = smiles[test_ind]
            result['smiles'] = (train_smiles, valid_smiles, test_smiles)
        else:
            result['smiles'] = None
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise ValueError('dataset_type={} is not supported'.format(
            dataset_config['dataset_type']))

    return result

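# Sketch of calling get_molnet_dataset above; 'bbbp' and the 'scaffold'
# splitter name come from the docstring and the tests earlier in this file,
# while the import paths are assumptions.
from chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor
from chainer_chemistry.datasets import molnet

result = molnet.get_molnet_dataset('bbbp',
                                   preprocessor=AtomicNumberPreprocessor(),
                                   split='scaffold', return_smiles=True)
train, valid, test = result['dataset']
train_smiles, valid_smiles, test_smiles = result['smiles']
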
def get_molnet_dataset(dataset_name, preprocessor=None, labels=None,
                       split='random', frac_train=.8, frac_valid=.1,
                       frac_test=.1, seed=777, return_smiles=False,
                       target_index=None):
    """Downloads, caches and preprocesses MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            detail of MoleculeNet, please refer to the
            `official site <http://moleculenet.ai/datasets-1>`_.
            If you would like to know which dataset_name is available for
            chainer_chemistry, please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        return_smiles (bool): If set to ``True``, smiles array is also
            returned.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.

    Returns (dict):
        Dictionary that contains a dataset already split into train, valid
        and test datasets, and a 1-d numpy array with dtype=object(string)
        which is a vector of smiles for each example, or `None`.
    """
    from chainer_chemistry.dataset.parsers.csv_file_parser import CSVFileParser

    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [labels, ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':
        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':
        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor, labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=return_smiles,
                              target_index=target_index)
        # TODO(motoki): splitting function or class
        dataset = result['dataset']
        if split == 'random':
            perm = numpy.random.permutation(len(dataset))
            dataset = NumpyTupleDataset(*dataset.features[perm])
            train_data_size = int(len(dataset) * frac_train)
            valid_data_size = int(len(dataset) * frac_valid)
            train = NumpyTupleDataset(*dataset.features[:train_data_size])
            valid = NumpyTupleDataset(
                *dataset.features[train_data_size:train_data_size
                                  + valid_data_size])
            test = NumpyTupleDataset(
                *dataset.features[train_data_size + valid_data_size:])

            result['dataset'] = (train, valid, test)
            if return_smiles:
                smiles = result['smiles'][perm]
                train_smiles = smiles[:train_data_size]
                valid_smiles = smiles[train_data_size:train_data_size
                                      + valid_data_size]
                test_smiles = smiles[train_data_size + valid_data_size:]
                result['smiles'] = (train_smiles, valid_smiles, test_smiles)
            else:
                result['smiles'] = None
        else:
            raise NotImplementedError
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise NotImplementedError

    return result

def test_atomic_number_preprocessor_assert_raises():
    with pytest.raises(ValueError):
        AtomicNumberPreprocessor(max_atoms=3, out_size=2)  # NOQA

def get_pdbbind_smiles(pdbbind_subset, preprocessor=None, labels=None,
                       split=None, frac_train=.8, frac_valid=.1, frac_test=.1,
                       return_smiles=False, return_pdb_id=True,
                       target_index=None, task_index=0, time_list=None,
                       **kwargs):
    """Downloads, caches and preprocesses PDBbind dataset.

    Args:
        pdbbind_subset (str): PDBbind dataset subset name. If you want to know
            the detail of the subset, please refer to the
            `official site <http://www.pdbbind.org.cn/download/pdbbind_2017_intro.pdf>`_.
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split dataset into train,
            validation and test. If `None`, this function uses the splitter
            that is recommended by MoleculeNet. Additionally, you can use an
            instance of BaseSplitter or choose it from 'random', 'stratified'
            and 'scaffold'.
        return_smiles (bool): If set to ``True``, smiles array is also
            returned.
        return_pdb_id (bool): If set to ``True``, PDB ID array is also
            returned. This argument is only used when you select
            'pdbbind_smiles'.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in dataset for stratification.
            (Stratified Splitter only)

    Returns (dict):
        Dictionary that contains a dataset already split into train, valid
        and test datasets, and 1-d numpy arrays with dtype=object(string)
        which are vectors of smiles and pdb_id for each example, or `None`.
    """
    config = molnet_default_config['pdbbind_smiles']
    labels = labels or config['tasks']
    if isinstance(labels, str):
        labels = [labels, ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    parser = CSVFileParser(preprocessor, labels=labels,
                           smiles_col=config['smiles_columns'],
                           postprocess_label=postprocess_label)
    split = config['split'] if split is None else split
    if isinstance(split, str):
        splitter = split_method_dict[split]()
    elif isinstance(split, BaseSplitter):
        splitter = split
    else:
        raise TypeError("split must be None, str or instance of"
                        " BaseSplitter, but got {}".format(type(split)))

    result = parser.parse(get_molnet_filepath('pdbbind_smiles',
                                              pdbbind_subset=pdbbind_subset),
                          return_smiles=return_smiles,
                          return_is_successful=True,
                          target_index=target_index)
    dataset = result['dataset']
    smiles = result['smiles']
    is_successful = result['is_successful']
    if return_pdb_id:
        df = pandas.read_csv(
            get_molnet_filepath('pdbbind_smiles',
                                pdbbind_subset=pdbbind_subset))
        pdb_id = df['id'][is_successful]
    else:
        pdb_id = None

    train_ind, valid_ind, test_ind = \
        splitter.train_valid_test_split(dataset, time_list=time_list,
                                        smiles_list=smiles,
                                        task_index=task_index,
                                        frac_train=frac_train,
                                        frac_valid=frac_valid,
                                        frac_test=frac_test, **kwargs)
    train = NumpyTupleDataset(*dataset.features[train_ind])
    valid = NumpyTupleDataset(*dataset.features[valid_ind])
    test = NumpyTupleDataset(*dataset.features[test_ind])

    result['dataset'] = (train, valid, test)
    if return_smiles:
        train_smiles = smiles[train_ind]
        valid_smiles = smiles[valid_ind]
        test_smiles = smiles[test_ind]
        result['smiles'] = (train_smiles, valid_smiles, test_smiles)
    else:
        result['smiles'] = None
    if return_pdb_id:
        train_pdb_id = pdb_id[train_ind]
        valid_pdb_id = pdb_id[valid_ind]
        test_pdb_id = pdb_id[test_ind]
        result['pdb_id'] = (train_pdb_id, valid_pdb_id, test_pdb_id)
    else:
        result['pdb_id'] = None

    return result

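# Sketch of calling get_pdbbind_smiles above, mirroring the pdbbind test
# earlier in this file ('core' subset, random split, PDB IDs returned);
# the import path is an assumption.
from chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor

result = get_pdbbind_smiles('core', preprocessor=AtomicNumberPreprocessor(),
                            split='random', return_pdb_id=True)
train, valid, test = result['dataset']
train_pdb_id, valid_pdb_id, test_pdb_id = result['pdb_id']
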
def test_atomic_number_preprocessor(mol):
    preprocessor = AtomicNumberPreprocessor(max_atoms=5, out_size=10)
    ret_atom_array = preprocessor.get_input_features(mol)
    expect_atom_array = numpy.array([6, 7, 6, 8, 0, 0, 0, 0, 0, 0],
                                    dtype=numpy.int32)
    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)

def test_atomic_number_default_preprocessor(mol):
    preprocessor = AtomicNumberPreprocessor()
    ret_atom_array = preprocessor.get_input_features(mol)
    expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32)
    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)

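# The `mol` fixture used by the preprocessor tests above is not defined in
# this file; a hypothetical definition consistent with the expected atom
# array [6, 7, 6, 8] (C, N, C, O) could use RDKit like this:
import pytest
from rdkit import Chem


@pytest.fixture
def mol():
    # Methyl isocyanate CN=C=O: atoms C, N, C, O -> atomic numbers 6, 7, 6, 8
    return Chem.MolFromSmiles('CN=C=O')
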
def pp():
    return AtomicNumberPreprocessor()