Example #1
def load_dataset(method, labels, prefix='input', num_data=-1):
    policy = _CacheNamePolicy(method, labels, prefix, num_data=num_data)
    train_path = policy.get_train_file_path()
    val_path = policy.get_val_file_path()
    test_path = policy.get_test_file_path()

    train, val, test = None, None, None
    print()
    if os.path.exists(policy.cache_dir):
        print('load from cache {}'.format(policy.cache_dir))
        train = NumpyTupleDataset.load(train_path)
        val = NumpyTupleDataset.load(val_path)
        test = NumpyTupleDataset.load(test_path)
    if train is None or val is None or test is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # Use `num_data` examples for train
            target_index = numpy.arange(num_data)
            train, val, test = D.get_tox21(
                preprocessor, labels=labels,
                train_target_index=target_index, val_target_index=None,
                test_target_index=None
            )
        else:
            train, val, test = D.get_tox21(preprocessor, labels=labels)
        # Cache dataset
        policy.create_cache_directory()
        NumpyTupleDataset.save(train_path, train)
        NumpyTupleDataset.save(val_path, val)
        NumpyTupleDataset.save(test_path, test)
    return train, val, test
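A minimal usage sketch (not part of the original example) follows; the method key 'nfp' is an assumption about what `preprocess_method_dict` provides, and the tuple layout per example depends on the chosen preprocessor.

# Hedged usage sketch; 'nfp' is an assumed key of preprocess_method_dict.
train, val, test = load_dataset('nfp', labels=None, prefix='input')
print(len(train), len(val), len(test))
example = train[0]  # a tuple of feature arrays (plus the label) per example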
Example #2
def load_dataset(method, labels, prefix='input', num_data=-1):
    policy = _CacheNamePolicy(method, labels, prefix, num_data=num_data)
    train_path = policy.get_train_file_path()
    val_path = policy.get_val_file_path()
    test_path = policy.get_test_file_path()

    train, val, test = None, None, None
    if os.path.exists(policy.cache_dir):
        print('load from cache {}'.format(policy.cache_dir))
        train = NumpyTupleDataset.load(train_path)
        val = NumpyTupleDataset.load(val_path)
        test = NumpyTupleDataset.load(test_path)
    if train is None or val is None or test is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # Use `num_data` examples for train
            target_index = numpy.arange(num_data)
            train, val, test = D.get_tox21(preprocessor,
                                           labels=labels,
                                           train_target_index=target_index,
                                           val_target_index=None,
                                           test_target_index=None)
        else:
            train, val, test = D.get_tox21(preprocessor, labels=labels)
        # Cache dataset
        policy.create_cache_directory()
        NumpyTupleDataset.save(train_path, train)
        NumpyTupleDataset.save(val_path, val)
        NumpyTupleDataset.save(test_path, test)

    return train, val, test
def test_train_valid_regression_split(reg_dataset):
    splitter = StratifiedSplitter()
    train_ind, valid_ind = splitter.train_valid_split(reg_dataset)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 90
    assert valid_ind.shape[0] == 10

    train = NumpyTupleDataset(*reg_dataset.features[train_ind])
    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])
    assert 45.0 < train.features[:, -1].mean() < 55.0
    assert 45.0 < valid.features[:, -1].mean() < 55.0
def test_train_valid_classification_split(cls_dataset):
    splitter = StratifiedSplitter()
    train_ind, valid_ind = splitter.train_valid_split(cls_dataset)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 27
    assert valid_ind.shape[0] == 3

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    assert (train.features[:, -1] == 1).sum() == 9
    assert (valid.features[:, -1] == 1).sum() == 1
Example #5
def get_pdbbind_grid(pdbbind_subset,
                     split=None,
                     frac_train=.8,
                     frac_valid=.1,
                     frac_test=.1,
                     task_index=0,
                     **kwargs):
    """Downloads, caches and grid-featurize PDBbind dataset.

    Args:
        pdbbind_subset (str): PDBbind dataset subset name. For details of the
            subsets, please refer to the `official site
            <http://www.pdbbind.org.cn/download/pdbbind_2017_intro.pdf>`_.
        split (str or BaseSplitter or None): How to split the dataset into
            train, validation and test sets. If `None`, this function uses the
            splitter recommended by MoleculeNet. Alternatively, you can pass an
            instance of BaseSplitter or choose one of 'random', 'stratified'
            and 'scaffold'.
        task_index (int): Target task index in dataset for stratification.
            (Stratified Splitter only)
    Returns (dict):
        Dictionary that contains the dataset already split into train,
        valid and test sets, plus 1-d numpy arrays with dtype=object (string)
        holding the smiles and pdb_id of each example, or `None`.

    """
    result = {}
    dataset = get_grid_featurized_pdbbind_dataset(pdbbind_subset)
    if split is None:
        split = molnet_default_config['pdbbind_grid']['split']
    if isinstance(split, str):
        splitter = split_method_dict[split]()
    elif isinstance(split, BaseSplitter):
        splitter = split
    else:
        raise TypeError("split must be None, str, or instance of"
                        " BaseSplitter, but got {}".format(type(split)))
    time_list = get_pdbbind_time()
    train_ind, valid_ind, test_ind = \
        splitter.train_valid_test_split(dataset, time_list=time_list,
                                        smiles_list=None,
                                        task_index=task_index,
                                        frac_train=frac_train,
                                        frac_valid=frac_valid,
                                        frac_test=frac_test, **kwargs)
    train = NumpyTupleDataset(*dataset.features[train_ind])
    valid = NumpyTupleDataset(*dataset.features[valid_ind])
    test = NumpyTupleDataset(*dataset.features[test_ind])

    result['dataset'] = (train, valid, test)
    result['smiles'] = None
    return result
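A hedged call for illustration; the subset name 'core' is an assumption about PDBbind subset naming and is not taken from the example above.

# Hedged usage sketch; 'core' is an assumed PDBbind subset name.
result = get_pdbbind_grid('core', split='random')
train, valid, test = result['dataset']
print(len(train), len(valid), len(test))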
Example #6
def _test_roc_auc_evaluator_with_labels(data1):
    """test `pos_labels` and `ignore_labels` behavior"""

    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data1)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = ROCAUCEvaluator(
        iterator, predictor, name='val',
        pos_labels=[1, 2], ignore_labels=-1,
    )

    # --- test evaluate ---
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected_roc_auc = 0.75
    # print('observation ', observation)
    assert observation['target/roc_auc'] == expected_roc_auc

    # --- test __call__ ---
    result = evaluator()
    # print('result ', result)
    assert result['val/main/roc_auc'] == expected_roc_auc
Example #7
def _test_balanced_serial_iterator_no_batch_balancing():
    x = numpy.arange(8)
    t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1])
    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t),
                                      batch_size=9,
                                      labels=t,
                                      ignore_labels=-1,
                                      batch_balancing=False)
    # In this case, label 1 has the most examples (3).
    # When BalancedSerialIterator runs, every label is sampled 3 times in one
    # epoch so that all labels appear equally often.
    # Therefore, the number of examples is "augmented" to 9:
    # 3 (number of label types) * 3 (maximum number of examples for one label)
    expect_N_augmented = 9
    assert iterator.N_augmented == expect_N_augmented
    # iterator.show_label_stats()  # we can show label stats

    batch = iterator.next()

    assert len(batch) == 9
    labels_batch = numpy.array([example[-1] for example in batch])

    assert numpy.sum(labels_batch == 0) == 3
    assert numpy.sum(labels_batch == 1) == 3
    assert numpy.sum(labels_batch == 2) == 3
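As a hedged follow-up (not from the test above, and assuming the same imports), one epoch of the same iterator with batch_balancing=True can be drained to confirm that every non-ignored label is drawn three times.

# Hedged sketch: drain one epoch and count how often each label is drawn.
x = numpy.arange(8)
t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1])
it = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=3,
                            labels=t, ignore_labels=-1, batch_balancing=True)
counts = {}
while it.epoch < 1:
    for _, label in it.next():
        counts[int(label)] = counts.get(int(label), 0) + 1
print(counts)  # each of the labels 0, 1 and 2 should appear 3 times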
Example #8
def create_datasets(atom_arrays, adj_arrays, teach_signals, wle_arrays=None):
    """
    Expand the atomic_num_arrays with the expanded labels,
    then return valid datasets (tuple of NumpyTupleDataset)

    Args:
        atom_arrays: 3-tuple of list of lists.
                        atom_arrays[i][j][k] is the id of an atom
                        i: train/val/test
                        j: index of a sample (i.e. molecule)
                        k: index of an atom
        adj_arrays: list of list of numpy.array, all mols' adjacency tensors
        teach_signals: list of list of numpy.array,
                          all teacher (supervision) signals
        wle_arrays: None (for WLE) or 3-tuple of list of lists (for CWLE and GWLE).

    Returns: 3-tuple of valid datasets (train/val/test), each a NumpyTupleDataset

    """

    output_datasets = []

    # ToDo: try another indexing, e.g. original node label + extensions
    assert len(atom_arrays) == len(adj_arrays) == len(teach_signals)
    if wle_arrays is not None:
        assert len(atom_arrays) == len(wle_arrays)
    for i in range(len(atom_arrays)):
        # Axes 0 and 1 of the adj-arrays were swapped earlier; re-swap them
        set_adj_arrays = np.array(adj_arrays[i])
        for m in range(len(set_adj_arrays)):
            set_adj_arrays[m] = np.swapaxes(set_adj_arrays[m], 0, 1)

        if wle_arrays is None:
            dataset = NumpyTupleDataset(np.array(atom_arrays[i]),
                                        set_adj_arrays,
                                        np.array(teach_signals[i]))
        else:
            dataset = NumpyTupleDataset(np.array(atom_arrays[i]),
                                        set_adj_arrays,
                                        np.array(wle_arrays[i]),
                                        np.array(teach_signals[i]))
        output_datasets.append(dataset)
    # end expanded-for

    return output_datasets
Example #9
def small_datasets():
    N_1 = 3
    N_2 = 5

    # atom labels: integer ids 0 to N-1
    atom_array_1 = np.arange(N_1)
    atom_array_2 = np.arange(N_2)

    # adj-array, manually
    # fully connected. expanded labels are a permutation of 0,1,2
    adj_array_1 = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]).astype(np.int32)
    # node 0 --> 0-1.2
    # node 1 --> 1-0.2
    # node 2 --> 2-0.1

    adj_array_2 = np.array([[1, 1, 0, 0, 1], [1, 1, 0, 0, 1], [0, 0, 1, 1, 0],
                            [0, 0, 1, 1, 0], [1, 1, 0, 0,
                                              1]]).astype(np.float32)
    # node 0 --> 0-1.4
    # node 1 --> 1-0.4
    # node 2 --> 2-3
    # node 3 --> 3-2
    # node 4 --> 4-0.1

    # supervised labels, dummy
    teach_signal_1 = np.array(1).astype(np.int32)
    teach_signal_2 = np.array(0).astype(np.int32)

    # gather each field into one numpy array
    # (dtype=object because the per-molecule arrays differ in size)
    atom_arrays = np.array([atom_array_1, atom_array_2], dtype=object)
    adj_arrays = np.array([adj_array_1, adj_array_2], dtype=object)
    teach_signals = np.array([teach_signal_1, teach_signal_2])

    # train/val/test dataset, respectively
    datasets = [
        NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals),
        NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals),
        NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals)
    ]
    return datasets
def test_classification_split_by_labels_ndarray(cls_dataset, cls_label):
    splitter = StratifiedSplitter()
    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,
                                                     labels=cls_label)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 24
    assert valid_ind.shape[0] == 3
    assert test_ind.shape[0] == 3

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    test = NumpyTupleDataset(*cls_dataset.features[test_ind])
    assert (train.features[:, -1] == 1).sum() == 8
    assert (valid.features[:, -1] == 1).sum() == 1
    assert (test.features[:, -1] == 1).sum() == 1

    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,
                                                     labels=cls_label,
                                                     frac_train=0.5,
                                                     frac_valid=0.3,
                                                     frac_test=0.2)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 15
    assert valid_ind.shape[0] == 9
    assert test_ind.shape[0] == 6

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    test = NumpyTupleDataset(*cls_dataset.features[test_ind])
    assert (train.features[:, -1] == 1).sum() == 5
    assert (valid.features[:, -1] == 1).sum() == 3
    assert (test.features[:, -1] == 1).sum() == 2
def test_regression_split(reg_dataset):
    splitter = StratifiedSplitter()
    train_ind, valid_ind, test_ind = splitter._split(reg_dataset)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 80
    assert valid_ind.shape[0] == 10
    assert test_ind.shape[0] == 10

    train = NumpyTupleDataset(*reg_dataset.features[train_ind])
    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])
    test = NumpyTupleDataset(*reg_dataset.features[test_ind])
    assert 45.0 < train.features[:, -1].mean() < 55.0
    assert 45.0 < valid.features[:, -1].mean() < 55.0
    assert 45.0 < test.features[:, -1].mean() < 55.0

    train_ind, valid_ind, test_ind = splitter._split(reg_dataset,
                                                     frac_train=0.5,
                                                     frac_valid=0.3,
                                                     frac_test=0.2)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 50
    assert valid_ind.shape[0] == 30
    assert test_ind.shape[0] == 20

    train = NumpyTupleDataset(*reg_dataset.features[train_ind])
    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])
    test = NumpyTupleDataset(*reg_dataset.features[test_ind])
    assert 45.0 < train.features[:, -1].mean() < 55.0
    assert 45.0 < valid.features[:, -1].mean() < 55.0
    assert 45.0 < test.features[:, -1].mean() < 55.0
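The tests above call the splitter's private `_split`; below is a hedged sketch of the public `train_valid_test_split` call on a toy labelled NumpyTupleDataset (the toy data and the expected split sizes are assumptions based on the default fractions).

# Hedged sketch; assumes the same imports as the tests above.
rng = numpy.random.RandomState(0)
labels = numpy.repeat(numpy.arange(2, dtype=numpy.int32), 50)
dataset = NumpyTupleDataset(rng.rand(100, 4).astype(numpy.float32), labels)
splitter = StratifiedSplitter()
train_ind, valid_ind, test_ind = splitter.train_valid_test_split(dataset)
print(train_ind.shape[0], valid_ind.shape[0], test_ind.shape[0])  # ~80/10/10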
Example #12
def _test_balanced_serial_iterator_serialization_with_batch_balancing():
    x = numpy.arange(8)
    t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1])
    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t),
                                      batch_size=3,
                                      labels=t,
                                      ignore_labels=-1,
                                      batch_balancing=True)
    batch1 = iterator.next()  # NOQA
    batch2 = iterator.next()  # NOQA
    batch3 = iterator.next()  # NOQA

    assert iterator.current_position == 0
    assert iterator.epoch == 1
    assert iterator.is_new_epoch

    target = dict()
    iterator.serialize(DummySerializer(target))
    current_index_list_orig = dict()
    current_pos_orig = dict()
    for label, index_iterator in iterator.labels_iterator_dict.items():
        ii_label = 'index_iterator_{}'.format(label)
        current_index_list_orig[ii_label] = index_iterator.current_index_list
        current_pos_orig[ii_label] = index_iterator.current_pos

    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t),
                                      batch_size=3,
                                      labels=t,
                                      ignore_labels=-1,
                                      batch_balancing=True)
    iterator.serialize(DummyDeserializer(target))
    assert iterator.current_position == 0
    assert iterator.epoch == 1
    assert iterator.is_new_epoch
    for label, index_iterator in iterator.labels_iterator_dict.items():
        ii_label = 'index_iterator_{}'.format(label)
        assert numpy.array_equal(index_iterator.current_index_list,
                                 current_index_list_orig[ii_label])
        assert index_iterator.current_pos == current_pos_orig[ii_label]
Example #13
def get_grid_featurized_pdbbind_dataset(subset):
    """Downloads and caches grid featurized PDBBind dataset.

    Args:
        subset (str): subset name of PDBBind dataset.

    Returns (NumpyTupleDataset):
        grid featurized PDBBind dataset.

    """
    x_path, y_path = get_grid_featurized_pdbbind_filepath(subset)
    x = joblib.load(x_path).astype('i')
    y = joblib.load(y_path).astype('f')
    dataset = NumpyTupleDataset(x, y)
    return dataset
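A hedged usage sketch; 'core' is an assumed subset name, and the two arrays per example follow from the `x`, `y` pair built above.

# Hedged usage sketch; 'core' is an assumed subset name.
dataset = get_grid_featurized_pdbbind_dataset('core')
print(len(dataset))
x0, y0 = dataset[0]  # grid features and affinity value of the first complex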
def _test_prc_auc_evaluator_raise_error(data, raise_value_error=True):

    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = PRCAUCEvaluator(
        iterator, predictor, name='train',
        pos_labels=1, ignore_labels=None,
        raise_value_error=raise_value_error
    )
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    return observation['target/prc_auc']
Example #15
def load_dataset(method, labels, prefix='input'):
    policy = _CacheNamePolicy(method, labels, prefix)
    train_path = policy.get_train_file_path()
    val_path = policy.get_val_file_path()
    test_path = policy.get_test_file_path()

    train, val, test = None, None, None
    print()
    if os.path.exists(policy.cache_dir):
        print('load from cache {}'.format(policy.cache_dir))
        train = NumpyTupleDataset.load(train_path)
        val = NumpyTupleDataset.load(val_path)
        test = NumpyTupleDataset.load(test_path)
    if train is None or val is None or test is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        train, val, test = D.get_tox21(preprocessor, labels=labels)
        # Cache dataset
        policy.create_cache_directory()
        NumpyTupleDataset.save(train_path, train)
        NumpyTupleDataset.save(val_path, val)
        NumpyTupleDataset.save(test_path, test)
    return train, val, test
Example #16
def _test_r2_score_evaluator(inputs):
    predictor = DummyPredictor()
    x0, x1, _ = inputs
    dataset = NumpyTupleDataset(x0, x1)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = R2ScoreEvaluator(iterator, predictor, name='train')
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected = r2_score(x0, x1)
    assert observation['target/r2_score'][0] == pytest.approx(expected)

    # --- test __call__ ---
    result = evaluator()
    assert result['train/main/r2_score'][0] == pytest.approx(expected)
Example #17
def _test_balanced_serial_iterator_with_batch_balancing():
    x = numpy.arange(8)
    t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1])
    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t),
                                      batch_size=3,
                                      labels=t,
                                      ignore_labels=-1,
                                      batch_balancing=True)
    expect_N_augmented = 9
    assert iterator.N_augmented == expect_N_augmented
    batch1 = iterator.next()
    batch2 = iterator.next()
    batch3 = iterator.next()
    for batch in [batch1, batch2, batch3]:
        assert len(batch) == 3
        labels_batch = numpy.array([example[-1] for example in batch])
        assert numpy.sum(labels_batch == 0) == 1
        assert numpy.sum(labels_batch == 1) == 1
        assert numpy.sum(labels_batch == 2) == 1
Example #18
def cwle_datasets():
    B = 10
    D_atom = 5
    D_wle = 50
    K_large = 10000

    atom_arrays = [np.full((B, D_atom), K_large) for _ in range(3)]
    adj_arrays = [np.eye(B, dtype=np.int32) for _ in range(3)]
    wle_arrays = [
        np.arange(B * D_wle, dtype=np.int32).reshape(B, -1) for _ in range(3)
    ]
    signal_arrays = [np.full(B, K_large) for _ in range(3)]

    print(wle_arrays[0].shape)

    datasets = [
        NumpyTupleDataset(atom_arrays[i], adj_arrays[i], wle_arrays[i],
                          signal_arrays[i]) for i in range(3)
    ]
    return datasets
def _test_prc_auc_evaluator_default_args(data0):

    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data0)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = PRCAUCEvaluator(
        iterator, predictor, name='train',
        pos_labels=1, ignore_labels=None
    )
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected_prc_auc = 0.7916
    # the expected value is rounded, so compare with an absolute tolerance
    assert observation['target/prc_auc'] == pytest.approx(expected_prc_auc,
                                                          abs=1e-3)

    # --- test __call__ ---
    result = evaluator()
    assert result['train/main/prc_auc'] == pytest.approx(expected_prc_auc,
                                                         abs=1e-3)
Example #20
def _test_roc_auc_evaluator_default_args(data0):

    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data0)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = ROCAUCEvaluator(
        iterator, predictor, name='train',
        pos_labels=1, ignore_labels=None
    )
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected_roc_auc = 0.75
    # print('observation ', observation)
    assert observation['target/roc_auc'] == expected_roc_auc

    # --- test __call__ ---
    result = evaluator()
    # print('result ', result)
    assert result['train/main/roc_auc'] == expected_roc_auc
def _test_prc_auc_evaluator_with_labels(data1):
    """test `pos_labels` and `ignore_labels` behavior"""

    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data1)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = PRCAUCEvaluator(
        iterator, predictor, name='val',
        pos_labels=[1, 2], ignore_labels=-1,
    )

    # --- test evaluate ---
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected_prc_auc = 0.7916
    # the expected value is rounded, so compare with an absolute tolerance
    assert observation['target/prc_auc'] == pytest.approx(expected_prc_auc,
                                                          abs=1e-3)

    # --- test __call__ ---
    result = evaluator()
    assert result['val/main/prc_auc'] == pytest.approx(expected_prc_auc,
                                                       abs=1e-3)
Example #22
    def parse(self, filepath, retain_smiles=False):
        """parse csv file using `preprocessor`

        Label is extracted from `labels` columns and input features are
        extracted from smiles information in `smiles` column.

        Args:
            filepath (str): file path to be parsed.
            retain_smiles (bool): If set to True, smiles list is saved to
                `smiles` property.

        Returns: Dataset

        """
        logger = self.logger
        pp = self.preprocessor
        if retain_smiles:
            self.smiles = []  # Initialize

        # counter = 0
        if isinstance(pp, MolPreprocessor):
            try:
                # `read_csv` is the recommended method for pandas 0.18.x and
                # later.
                df = pandas.read_csv(filepath)
            except AttributeError as e:
                # `DataFrame.from_csv` is deprecated in newer pandas; fall
                # back to it only for older versions.
                df = pandas.DataFrame.from_csv(filepath)

            features = None
            smiles_index = df.columns.get_loc(self.smiles_col)
            if self.labels is None:
                labels_index = []  # dummy list
            else:
                labels_index = [df.columns.get_loc(c) for c in self.labels]

            total_count = df.shape[0]
            fail_count = 0
            success_count = 0
            for row in tqdm(df.itertuples(index=False), total=df.shape[0]):
                smiles = row[smiles_index]
                # TODO(Nakago): Check.
                # currently it assumes list
                labels = [row[i] for i in labels_index]
                try:
                    mol = Chem.MolFromSmiles(smiles)
                    if mol is None:
                        fail_count += 1
                        continue
                    # Note that a SMILES expression is not unique.
                    # We re-obtain the SMILES from `mol` so that it stays
                    # consistent with the extracted input features.
                    # Here, `smiles` and `standardized_smiles` express the
                    # same molecule, but the strings may differ!
                    standardized_smiles, mol = pp.prepare_smiles_and_mol(mol)
                    input_features = pp.get_input_features(mol)

                    # Extract label
                    if self.postprocess_label is not None:
                        labels = self.postprocess_label(labels)

                    if retain_smiles:
                        assert standardized_smiles == Chem.MolToSmiles(mol)
                        self.smiles.append(standardized_smiles)
                        # logger.debug('[DEBUG] smiles {}, standard_smiles {}'
                        #              .format(smiles, standardized_smiles))
                except MolFeatureExtractionError as e:
                    # This is an expected error raised when feature extraction
                    # fails; skip this molecule.
                    fail_count += 1
                    continue
                except Exception as e:
                    logger.warning('parse(), type: {}, {}'.format(
                        type(e).__name__, e.args))
                    logger.info(traceback.format_exc())
                    fail_count += 1
                    continue
                # Initialize features: list of list
                if features is None:
                    if isinstance(input_features, tuple):
                        num_features = len(input_features)
                    else:
                        num_features = 1
                    if self.labels is not None:
                        num_features += 1
                    features = [[] for _ in range(num_features)]

                if isinstance(input_features, tuple):
                    for i in range(len(input_features)):
                        features[i].append(input_features[i])
                else:
                    features[0].append(input_features)
                if self.labels is not None:
                    features[len(features) - 1].append(labels)
                success_count += 1
            ret = []

            for feature in features:
                try:
                    feat_array = numpy.asarray(feature)
                except ValueError:
                    # Temporary workaround; see
                    # https://stackoverflow.com/questions/26885508/why-do-i-get-error-trying-to-cast-np-arraysome-list-valueerror-could-not-broa
                    feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                    feat_array[:] = feature[:]
                ret.append(feat_array)
            result = tuple(ret)
            logger.info(
                'Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'.format(
                    fail_count, success_count, total_count))
        else:
            # Spec not finalized yet for general case
            result = pp.process(filepath)

        if isinstance(result, tuple):
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(*result)
            return NumpyTupleDataset(*result)
        else:
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(result)
            return NumpyTupleDataset(result)
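A hedged usage sketch for this older `parse` signature; it assumes the method belongs to a CSVFileParser-style class constructed with a preprocessor, label columns and a SMILES column name. The file and column names below are placeholders, not part of the original code.

# Hedged sketch; class availability, column and file names are assumptions.
from chainer_chemistry.dataset.parsers import CSVFileParser
from chainer_chemistry.dataset.preprocessors import NFPPreprocessor
parser = CSVFileParser(NFPPreprocessor(), labels=['activity'],
                       smiles_col='smiles')
dataset = parser.parse('compounds.csv', retain_smiles=True)
print(len(dataset), len(parser.smiles))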
Example #23
File: parsers.py  Project: Minys233/GCN-BMP
    def parse(self,
              filepath,
              return_smiles_pair=False,
              return_smiles_pair_original=False,
              target_index=None,
              return_is_successful=False):

        smiles2ssp_filename = "smiles2ssp.pkl"
        smiles2ssp_path = "/home/chenx/drug_mining/representation_learning/chainer-chemistry/examples/ddi/dataset/drug_list"
        smiles2ssp_filepath = os.path.join(smiles2ssp_path,
                                           smiles2ssp_filename)
        with open(smiles2ssp_filepath, 'rb') as pkl_reader:
            smiles2vec = pickle.load(pkl_reader)

        df = pandas.read_csv(filepath)

        logger = self.logger
        pp = self.preprocessor
        smiles_pair_list = []
        smiles_pair_list_original = []
        is_successful_list = []

        # counter = 0
        if isinstance(pp, MolPreprocessor):
            # No influence.
            if target_index is not None:
                df = df.iloc[target_index]

            features = None
            smiles_1_index = df.columns.get_loc(self.smiles_cols[0])
            smiles_2_index = df.columns.get_loc(self.smiles_cols[1])
            if self.labels is None:
                labels_index = []  # dummy list
            else:
                labels_index = [df.columns.get_loc(c) for c in self.labels]

            total_count = df.shape[0]
            fail_count = 0
            success_count = 0
            # iteration on every row within the csv file
            for row in tqdm(df.itertuples(index=False), total=df.shape[0]):
                smiles_1 = row[smiles_1_index]
                smiles_2 = row[smiles_2_index]
                # currently it assumes list
                labels = [int(row[i]) for i in labels_index]
                try:
                    mol_1 = Chem.MolFromSmiles(smiles_1)
                    mol_2 = Chem.MolFromSmiles(smiles_2)
                    if mol_1 is None or mol_2 is None:
                        fail_count += 1
                        if return_is_successful:
                            is_successful_list.append(False)
                        continue

                    # input_features_1 = pp.get_input_features(mol_1)
                    # input_features_2 = pp.get_input_features(mol_2)

                    input_features_1 = smiles2vec[smiles_1]
                    input_features_2 = smiles2vec[smiles_2]

                    # Extract label
                    if self.postprocess_label is not None:
                        labels = self.postprocess_label(labels)

                    # if return_smiles_pair:
                    #     smiles_pair_list.append([canonical_smiles_1, canonical_smiles_2])
                    if return_smiles_pair:
                        smiles_pair_list.append([smiles_1, smiles_2])
                    if return_smiles_pair_original:
                        smiles_pair_list_original.append([smiles_1, smiles_2])

                except MolFeatureExtractionError as e:
                    # This is an expected error raised when feature extraction
                    # fails; skip this molecule.
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue
                except Exception as e:
                    logger.warning('parse(), type: {}, {}'.format(
                        type(e).__name__, e.args))
                    logger.info(traceback.format_exc())
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue
                # Initialize features: list of list
                if features is None:
                    if isinstance(input_features_1, tuple):
                        num_features_1 = len(input_features_1)
                    else:
                        num_features_1 = 1
                    if isinstance(input_features_2, tuple):
                        num_features_2 = len(input_features_2)
                    else:
                        num_features_2 = 1
                    num_features = num_features_1 + num_features_2
                    if self.labels is not None:
                        num_features += 1
                    # list of list, a sublist corresponding to a certain feature
                    features = [[] for _ in range(num_features)]
                # for every row in csv file
                if isinstance(input_features_1, tuple):
                    for i in range(len(input_features_1)):
                        # features[i] a list containing the i-th feature
                        features[i].append(input_features_1[i])
                else:
                    features[0].append(input_features_1)
                # offset = len(input_features_1)
                offset = num_features_1
                if isinstance(input_features_2, tuple):
                    for i in range(len(input_features_2)):
                        features[offset + i].append(input_features_2[i])
                else:
                    features[offset].append(input_features_2)

                # last column corresponding to targeted label
                if self.labels is not None:
                    features[len(features) - 1].append(labels)

                success_count += 1
                if return_is_successful:
                    is_successful_list.append(True)

            ret = []
            for feature in features:
                try:
                    feat_array = numpy.asarray(feature)
                except ValueError:
                    # Temporary workaround; see
                    # https://stackoverflow.com/questions/26885508/why-do-i-get-error-trying-to-cast-np-arraysome-list-valueerror-could-not-broa
                    feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                    feat_array[:] = feature[:]
                ret.append(feat_array)
            result = tuple(ret)
            logger.info(
                'Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'.format(
                    fail_count, success_count, total_count))
        else:
            raise NotImplementedError

        smiles_pairs = numpy.array(
            smiles_pair_list) if return_smiles_pair else None
        smiles_pairs_original = numpy.array(
            smiles_pair_list_original) if return_smiles_pair_original else None
        if return_is_successful:
            is_successful = numpy.array(is_successful_list)
        else:
            is_successful = None

        if isinstance(result, tuple):
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(*result)
            dataset = NumpyTupleDataset(*result)
        else:
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(result)
            dataset = NumpyTupleDataset(result)
        return {
            "dataset": dataset,
            "smiles_pair": smiles_pairs,
            "smiles_pair_original": smiles_pairs_original,
            "is_successful": is_successful
        }
Example #24
File: parsers.py  Project: Minys233/GCN-BMP
    def parse(self,
              filepath,
              return_smiles_pair=False,
              return_smiles_pair_original=False,
              target_index=None,
              return_is_successful=False):
        """parse DataFrame using `preprocessor`

        Label is extracted from `labels` columns and input features are
        extracted from smiles information in `smiles` column.

        Args:
            filepath (str): file path to be parsed.
            return_smiles_pair (bool): If set to `True`, a list of SMILES
                pairs from which input features were successfully made is
                returned in the key 'smiles_pair'.
                If set to `False`, `None` is returned in the key 'smiles_pair'.
            target_index (list or None): target index list to partially extract
                dataset. If None (default), all examples are parsed.
            return_is_successful (bool): If set to `True`, a boolean list is
                returned in the key 'is_successful'. It indicates whether
                preprocessing succeeded for each SMILES pair.
                If set to `False`, `None` is returned in the key 'is_successful'.

        Returns (dict): dictionary that contains the Dataset and a 1-d numpy
            array with dtype=object (string) which is a vector of smiles pairs
            for each example, or None.

        """
        df = pandas.read_csv(filepath)

        logger = self.logger
        pp = self.preprocessor
        smiles_pair_list = []
        smiles_pair_list_original = []
        is_successful_list = []

        # counter = 0
        if isinstance(pp, MolPreprocessor):
            # No influence.
            if target_index is not None:
                df = df.iloc[target_index]

            features = None
            smiles_1_index = df.columns.get_loc(self.smiles_cols[0])
            smiles_2_index = df.columns.get_loc(self.smiles_cols[1])
            if self.labels is None:
                labels_index = []  # dummy list
            else:
                labels_index = [df.columns.get_loc(c) for c in self.labels]

            total_count = df.shape[0]
            fail_count = 0
            success_count = 0
            # iteration on every row within the csv file
            for row in tqdm(df.itertuples(index=False), total=df.shape[0]):
                smiles_1 = row[smiles_1_index]
                smiles_2 = row[smiles_2_index]
                # currently it assumes list
                labels = [int(row[i]) for i in labels_index]
                try:
                    mol_1 = Chem.MolFromSmiles(smiles_1)
                    mol_2 = Chem.MolFromSmiles(smiles_2)
                    if mol_1 is None or mol_2 is None:
                        fail_count += 1
                        if return_is_successful:
                            is_successful_list.append(False)
                        continue
                    # Note that smiles expression is not unique.
                    # we obtain canonical smiles
                    # canonical_smiles_1, mol_1 = pp.prepare_smiles_and_mol(mol_1)
                    # input_features_1 = pp.get_input_features(mol_1)
                    # canonical_smiles_2, mol_2 = pp.prepare_smiles_and_mol(mol_2)
                    # input_features_2 = pp.get_input_features(mol_2)

                    input_features_1 = pp.get_input_features(mol_1)
                    input_features_2 = pp.get_input_features(mol_2)

                    # Extract label
                    if self.postprocess_label is not None:
                        labels = self.postprocess_label(labels)

                    # if return_smiles_pair:
                    #     smiles_pair_list.append([canonical_smiles_1, canonical_smiles_2])
                    if return_smiles_pair:
                        smiles_pair_list.append([smiles_1, smiles_2])
                    if return_smiles_pair_original:
                        smiles_pair_list_original.append([smiles_1, smiles_2])

                except MolFeatureExtractionError as e:
                    # This is an expected error raised when feature extraction
                    # fails; skip this molecule.
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue
                except Exception as e:
                    logger.warning('parse(), type: {}, {}'.format(
                        type(e).__name__, e.args))
                    logger.info(traceback.format_exc())
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue
                # Initialize features: list of list
                if features is None:
                    if isinstance(input_features_1, tuple):
                        num_features_1 = len(input_features_1)
                    else:
                        num_features_1 = 1
                    if isinstance(input_features_2, tuple):
                        num_features_2 = len(input_features_2)
                    else:
                        num_features_2 = 1
                    num_features = num_features_1 + num_features_2
                    if self.labels is not None:
                        num_features += 1
                    # list of list, a sublist corresponding to a certain feature
                    features = [[] for _ in range(num_features)]
                # for every row in csv file
                if isinstance(input_features_1, tuple):
                    for i in range(len(input_features_1)):
                        # features[i] a list containing the i-th feature
                        features[i].append(input_features_1[i])
                else:
                    features[0].append(input_features_1)
                # offset by the number of features of the first molecule
                # (len() would be wrong when input_features_1 is not a tuple)
                offset = num_features_1
                if isinstance(input_features_2, tuple):
                    for i in range(len(input_features_2)):
                        features[offset + i].append(input_features_2[i])
                else:
                    features[offset].append(input_features_2)

                # last column corresponding to targeted label
                if self.labels is not None:
                    features[len(features) - 1].append(labels)

                success_count += 1
                if return_is_successful:
                    is_successful_list.append(True)

            ret = []
            for feature in features:
                try:
                    feat_array = numpy.asarray(feature)
                except ValueError:
                    # Temporary workaround; see
                    # https://stackoverflow.com/questions/26885508/why-do-i-get-error-trying-to-cast-np-arraysome-list-valueerror-could-not-broa
                    feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                    feat_array[:] = feature[:]
                ret.append(feat_array)
            result = tuple(ret)
            logger.info(
                'Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'.format(
                    fail_count, success_count, total_count))
        else:
            raise NotImplementedError

        smiles_pairs = numpy.array(
            smiles_pair_list) if return_smiles_pair else None
        smiles_pairs_original = numpy.array(
            smiles_pair_list_original) if return_smiles_pair_original else None
        if return_is_successful:
            is_successful = numpy.array(is_successful_list)
        else:
            is_successful = None

        if isinstance(result, tuple):
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(*result)
            dataset = NumpyTupleDataset(*result)
        else:
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(result)
            dataset = NumpyTupleDataset(result)
        return {
            "dataset": dataset,
            "smiles_pair": smiles_pairs,
            "smiles_pair_original": smiles_pairs_original,
            "is_successful": is_successful
        }
Example #25
def get_molnet_dataset(dataset_name,
                       preprocessor=None,
                       labels=None,
                       split=None,
                       frac_train=.8,
                       frac_valid=.1,
                       frac_test=.1,
                       seed=777,
                       return_smiles=False,
                       target_index=None,
                       task_index=0,
                       **kwargs):
    """Downloads, caches and preprocess MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            detail of MoleculeNet, please refer to
            `official site <http://moleculenet.ai/datasets-1>`_.
            If you would like to know which dataset_name values are available
            in chainer_chemistry, please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split the dataset into
            train, validation and test sets. If `None`, this function uses the
            splitter recommended by MoleculeNet. Alternatively, you can pass an
            instance of BaseSplitter or choose one of 'random', 'stratified'
            and 'scaffold'.
        return_smiles (bool): If set to ``True``,
            smiles array is also returned.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in dataset for stratification.
            (Stratified Splitter only)
    Returns (dict):
        Dictionary that contains the dataset already split into train,
        valid and test sets, and a 1-d numpy array with dtype=object (string)
        which is a vector of smiles for each example, or `None`.

    """
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [
            labels,
        ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':

        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':

        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor,
                           labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        split = dataset_config['split'] if split is None else split

        if isinstance(split, str):
            splitter = split_method_dict[split]()
        elif isinstance(split, BaseSplitter):
            splitter = split
        else:
            raise TypeError("split must be None, str or instance of"
                            " BaseSplitter, but got {}".format(type(split)))

        if isinstance(splitter, ScaffoldSplitter):
            get_smiles = True
        else:
            get_smiles = return_smiles

        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=get_smiles,
                              target_index=target_index,
                              **kwargs)
        dataset = result['dataset']
        smiles = result['smiles']
        train_ind, valid_ind, test_ind = \
            splitter.train_valid_test_split(dataset, smiles_list=smiles,
                                            task_index=task_index,
                                            frac_train=frac_train,
                                            frac_valid=frac_valid,
                                            frac_test=frac_test, **kwargs)
        train = NumpyTupleDataset(*dataset.features[train_ind])
        valid = NumpyTupleDataset(*dataset.features[valid_ind])
        test = NumpyTupleDataset(*dataset.features[test_ind])

        result['dataset'] = (train, valid, test)
        if return_smiles:
            train_smiles = smiles[train_ind]
            valid_smiles = smiles[valid_ind]
            test_smiles = smiles[test_ind]
            result['smiles'] = (train_smiles, valid_smiles, test_smiles)
        else:
            result['smiles'] = None
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise ValueError('dataset_type={} is not supported'.format(
            dataset_config['dataset_type']))
    return result
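A hedged usage sketch; 'bbbp' is one MoleculeNet name configured in `molnet_config.py` and is used here only as an assumed example.

# Hedged usage sketch; 'bbbp' is an assumed dataset_name.
data = get_molnet_dataset('bbbp', return_smiles=True)
train, valid, test = data['dataset']
train_smiles, valid_smiles, test_smiles = data['smiles']
print(len(train), len(valid), len(test))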
Example #26
def get_molnet_dataset(dataset_name,
                       preprocessor=None,
                       labels=None,
                       split='random',
                       frac_train=.8,
                       frac_valid=.1,
                       frac_test=.1,
                       seed=777,
                       return_smiles=False,
                       target_index=None):
    """Downloads, caches and preprocesses a MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            detail of MoleculeNet, please refer to the
            `official site <http://moleculenet.ai/datasets-1>`_.
            If you would like to know which dataset_name values are available
            in chainer_chemistry, please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, the default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        return_smiles (bool): If set to ``True``,
            smiles array is also returned.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
    Returns (dict):
        Dictionary that contains the dataset already split into train,
        valid and test sets, and a 1-d numpy array with dtype=object (string)
        which is a vector of smiles for each example, or `None`.

    """
    from chainer_chemistry.dataset.parsers.csv_file_parser import CSVFileParser
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [
            labels,
        ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':

        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':

        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor,
                           labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=return_smiles,
                              target_index=target_index)
        # TODO(motoki): splitting function or class
        dataset = result['dataset']
        if split == 'random':
            perm = numpy.random.permutation(len(dataset))
            dataset = NumpyTupleDataset(*dataset.features[perm])
            train_data_size = int(len(dataset) * frac_train)
            valid_data_size = int(len(dataset) * frac_valid)
            train = NumpyTupleDataset(*dataset.features[:train_data_size])
            valid = NumpyTupleDataset(
                *dataset.features[train_data_size:train_data_size +
                                  valid_data_size])
            test = NumpyTupleDataset(*dataset.features[train_data_size +
                                                       valid_data_size:])

            result['dataset'] = (train, valid, test)
            if return_smiles:
                smiles = result['smiles'][perm]
                train_smiles = smiles[:train_data_size]
                valid_smiles = smiles[train_data_size:train_data_size +
                                      valid_data_size]
                test_smiles = smiles[train_data_size + valid_data_size:]
                result['smiles'] = (train_smiles, valid_smiles, test_smiles)
            else:
                result['smiles'] = None
        else:
            raise NotImplementedError
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise NotImplementedError
    return result
def indexer(data):
    dataset = NumpyTupleDataset(*data)
    indexer = NumpyTupleDatasetFeatureIndexer(dataset)
    return indexer
Example #28
    def parse(self,
              df,
              return_smiles=False,
              target_index=None,
              return_is_successful=False):
        """parse DataFrame using `preprocessor`

        Label is extracted from `labels` columns and input features are
        extracted from smiles information in `smiles` column.

        Args:
            df (pandas.DataFrame): dataframe to be parsed.
            return_smiles (bool): If set to `True`, a list of the SMILES from
                which input features were successfully made is returned in
                the key 'smiles'.
                If set to `False`, `None` is returned in the key 'smiles'.
            target_index (list or None): target index list to partially extract
                dataset. If None (default), all examples are parsed.
            return_is_successful (bool): If set to `True`, a boolean list is
                returned in the key 'is_successful'. It indicates whether
                preprocessing succeeded for each SMILES.
                If set to `False`, `None` is returned in the key 'is_successful'.

        Returns (dict): dictionary that contains the Dataset and a 1-d numpy
            array with dtype=object (string) which is a vector of smiles for
            each example, or None.

        """
        logger = self.logger
        pp = self.preprocessor
        smiles_list = []
        is_successful_list = []

        # counter = 0
        if isinstance(pp, MolPreprocessor):
            if target_index is not None:
                df = df.iloc[target_index]

            features = None
            smiles_index = df.columns.get_loc(self.smiles_col)
            if self.labels is None:
                labels_index = []  # dummy list
            else:
                labels_index = [df.columns.get_loc(c) for c in self.labels]

            total_count = df.shape[0]
            fail_count = 0
            success_count = 0
            for row in tqdm(df.itertuples(index=False), total=df.shape[0]):
                smiles = row[smiles_index]
                # TODO(Nakago): Check.
                # currently it assumes list
                labels = [row[i] for i in labels_index]
                try:
                    mol = Chem.MolFromSmiles(smiles)
                    if mol is None:
                        fail_count += 1
                        if return_is_successful:
                            is_successful_list.append(False)
                        continue
                    # Note that a SMILES expression is not unique;
                    # we use the canonical SMILES here.
                    canonical_smiles, mol = pp.prepare_smiles_and_mol(mol)
                    input_features = pp.get_input_features(mol)

                    # Extract label
                    if self.postprocess_label is not None:
                        labels = self.postprocess_label(labels)

                    if return_smiles:
                        assert canonical_smiles == Chem.MolToSmiles(mol)
                        smiles_list.append(canonical_smiles)
                        # logger.debug('[DEBUG] smiles {}, standard_smiles {}'
                        #              .format(smiles, standardized_smiles))
                except MolFeatureExtractionError as e:
                    # This is an expected error raised when feature extraction
                    # fails; skip this molecule.
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue
                except Exception as e:
                    logger.warning('parse() error, type: {}, {}'.format(
                        type(e).__name__, e.args))
                    logger.info(traceback.format_exc())
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue
                # Initialize features: list of list
                if features is None:
                    if isinstance(input_features, tuple):
                        num_features = len(input_features)
                    else:
                        num_features = 1
                    if self.labels is not None:
                        num_features += 1
                    features = [[] for _ in range(num_features)]

                if isinstance(input_features, tuple):
                    for i in range(len(input_features)):
                        features[i].append(input_features[i])
                else:
                    features[0].append(input_features)
                if self.labels is not None:
                    features[len(features) - 1].append(labels)
                success_count += 1
                if return_is_successful:
                    is_successful_list.append(True)
            ret = []

            for feature in features:
                try:
                    feat_array = numpy.asarray(feature)
                except ValueError:
                    # Temporary workaround to convert an object-type list into
                    # a numpy array. See:
                    # https://stackoverflow.com/questions/26885508/why-do-i-get-error-trying-to-cast-np-arraysome-list-valueerror-could-not-broa
                    feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                    feat_array[:] = feature[:]
                ret.append(feat_array)
            result = tuple(ret)
            logger.info(
                'Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'.format(
                    fail_count, success_count, total_count))
        else:
            raise NotImplementedError

        smileses = numpy.array(smiles_list) if return_smiles else None
        if return_is_successful:
            is_successful = numpy.array(is_successful_list)
        else:
            is_successful = None

        if isinstance(result, tuple):
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(*result)
            dataset = NumpyTupleDataset(*result)
        else:
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(result)
            dataset = NumpyTupleDataset(result)
        return {
            "dataset": dataset,
            "smiles": smileses,
            "is_successful": is_successful
        }
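    # --- Hypothetical usage sketch (not part of the original class) ---
    # Assuming this `parse` method belongs to a DataFrame-based parser such as
    # chainer_chemistry's DataFrameParser, combined with a preprocessor like
    # AtomicNumberPreprocessor; the column names 'SMILES' and 'value' are
    # placeholders for illustration.
    #
    #   import pandas
    #   from chainer_chemistry.dataset.parsers import DataFrameParser
    #   from chainer_chemistry.dataset.preprocessors import \
    #       AtomicNumberPreprocessor
    #
    #   df = pandas.DataFrame({'SMILES': ['C', 'CC', 'CCO'],
    #                          'value': [0.1, 0.2, 0.3]})
    #   parser = DataFrameParser(AtomicNumberPreprocessor(),
    #                            labels='value', smiles_col='SMILES')
    #   result = parser.parse(df, return_smiles=True, return_is_successful=True)
    #   dataset = result['dataset']        # NumpyTupleDataset (features + labels)
    #   smiles = result['smiles']          # canonical SMILES of successful rows
    #   ok_mask = result['is_successful']  # per-row preprocessing success flags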
    def create_dataset(self, *args, **kwargs):
        return NumpyTupleDataset(*args)
    def parse(self,
              filepath,
              return_smiles=False,
              target_index=None,
              return_is_successful=False):
        """parse sdf file using `preprocessor`

        Note that label is extracted from preprocessor's method.

        Args:
            filepath (str): file path to be parsed.
            return_smiles (bool): If set to `True`, this function returns the
                preprocessed dataset together with a SMILES list.
                If set to `False`, this function returns the preprocessed
                dataset and `None`.
            target_index (list or None): target index list to partially extract
                dataset. If None (default), all examples are parsed.
            return_is_successful (bool): If set to `True`, a boolean list is
                returned under the key 'is_successful', indicating whether
                preprocessing succeeded for each SMILES.
                If set to `False`, `None` is returned under the key
                'is_successful'.

        Returns (dict): dictionary that contains the preprocessed dataset and,
            under the key 'smiles', either a 1-d numpy array (dtype=object) of
            SMILES strings for each example or `None`.

        """
        logger = self.logger
        pp = self.preprocessor
        smiles_list = []
        is_successful_list = []

        if isinstance(pp, MolPreprocessor):
            mol_supplier = Chem.SDMolSupplier(filepath)

            if target_index is None:
                target_index = list(range(len(mol_supplier)))

            features = None

            total_count = len(mol_supplier)
            fail_count = 0
            success_count = 0
            for index in tqdm(target_index):
                # `mol_supplier` does not accept numpy.integer, so we must
                # cast the index to a plain Python int.
                mol = mol_supplier[int(index)]

                if mol is None:
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue
                try:
                    # Labels need to be extracted from `mol` before
                    # standardizing the SMILES.
                    if self.labels is not None:
                        label = pp.get_label(mol, self.labels)
                        if self.postprocess_label is not None:
                            label = self.postprocess_label(label)

                    # Note that a SMILES expression is not unique;
                    # we use the canonical SMILES here.
                    smiles = Chem.MolToSmiles(mol)
                    mol = Chem.MolFromSmiles(smiles)
                    canonical_smiles, mol = pp.prepare_smiles_and_mol(mol)
                    input_features = pp.get_input_features(mol)

                    # Initialize features: list of list
                    if features is None:
                        if isinstance(input_features, tuple):
                            num_features = len(input_features)
                        else:
                            num_features = 1
                        if self.labels is not None:
                            num_features += 1
                        features = [[] for _ in range(num_features)]

                    if return_smiles:
                        assert canonical_smiles == Chem.MolToSmiles(mol)
                        smiles_list.append(canonical_smiles)
                except MolFeatureExtractionError as e:
                    # This is an expected error raised when feature extraction
                    # fails; skip this molecule.
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue
                except Exception as e:
                    logger.warning('parse() error, type: {}, {}'.format(
                        type(e).__name__, e.args))
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue

                if isinstance(input_features, tuple):
                    for i in range(len(input_features)):
                        features[i].append(input_features[i])
                else:
                    features[0].append(input_features)
                if self.labels is not None:
                    features[len(features) - 1].append(label)
                success_count += 1
                if return_is_successful:
                    is_successful_list.append(True)

            ret = []

            for feature in features:
                try:
                    feat_array = numpy.asarray(feature)
                except ValueError:
                    # Temporary workaround to convert an object-type list into
                    # a numpy array. See https://goo.gl/kgJXwb
                    feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                    feat_array[:] = feature[:]
                ret.append(feat_array)
            result = tuple(ret)
            logger.info(
                'Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'.format(
                    fail_count, success_count, total_count))
        else:
            # Spec not finalized yet for general case
            result = pp.process(filepath)

        smileses = numpy.array(smiles_list) if return_smiles else None
        if return_is_successful:
            is_successful = numpy.array(is_successful_list)
        else:
            is_successful = None

        if isinstance(result, tuple):
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(*result)
            dataset = NumpyTupleDataset(*result)
        else:
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(result)
            dataset = NumpyTupleDataset(result)
        return {
            "dataset": dataset,
            "smiles": smileses,
            "is_successful": is_successful
        }
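    # --- Hypothetical usage sketch (not part of the original class) ---
    # Assuming this `parse` method belongs to an SDF parser such as
    # chainer_chemistry's SDFFileParser; 'molecules.sdf' and the label name
    # 'activity' are placeholders for illustration.
    #
    #   from chainer_chemistry.dataset.parsers import SDFFileParser
    #   from chainer_chemistry.dataset.preprocessors import \
    #       AtomicNumberPreprocessor
    #
    #   parser = SDFFileParser(AtomicNumberPreprocessor(), labels='activity')
    #   result = parser.parse('molecules.sdf', return_smiles=True)
    #   dataset = result['dataset']  # NumpyTupleDataset built from the SDF
    #   smiles = result['smiles']    # canonical SMILES, aligned with dataset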
    def parse(self, filepath, return_smiles=False):
        """parse sdf file using `preprocessor`

        Note that label is extracted from preprocessor's method.

        Args:
            filepath (str): file path to be parsed.
            return_smiles (bool): If set to `True`, this function returns the
                preprocessed dataset together with a SMILES list.
                If set to `False`, this function returns the preprocessed
                dataset and `None`.

        Returns (dict): dictionary that contains the preprocessed dataset and,
            under the key 'smiles', either a 1-d numpy array (dtype=object) of
            SMILES strings for each example or `None`.

        """
        logger = self.logger
        pp = self.preprocessor
        smiles_list = []

        if isinstance(pp, MolPreprocessor):
            mol_supplier = Chem.SDMolSupplier(filepath)

            features = None

            total_count = len(mol_supplier)
            fail_count = 0
            success_count = 0
            for mol in tqdm(mol_supplier):
                if mol is None:
                    total_count -= 1
                    continue
                try:
                    # Labels need to be extracted from `mol` before
                    # standardizing the SMILES.
                    if self.labels is not None:
                        label = pp.get_label(mol, self.labels)
                        if self.postprocess_label is not None:
                            label = self.postprocess_label(label)

                    # Note that a SMILES expression is not unique.
                    # We re-obtain the SMILES from `mol` so that the order of
                    # the SMILES strings stays consistent with the order of
                    # the input features.
                    # Here, `smiles` and `standardized_smiles` express the
                    # same molecule, but the two strings may differ!
                    smiles = Chem.MolToSmiles(mol)
                    mol = Chem.MolFromSmiles(smiles)
                    standardized_smiles, mol = pp.prepare_smiles_and_mol(mol)
                    input_features = pp.get_input_features(mol)

                    # Initialize features: list of list
                    if features is None:
                        if isinstance(input_features, tuple):
                            num_features = len(input_features)
                        else:
                            num_features = 1
                        if self.labels is not None:
                            num_features += 1
                        features = [[] for _ in range(num_features)]

                    if return_smiles:
                        assert standardized_smiles == Chem.MolToSmiles(mol)
                        smiles_list.append(standardized_smiles)
                except MolFeatureExtractionError as e:
                    # This is an expected error raised when feature extraction
                    # fails; skip this molecule.
                    fail_count += 1
                    continue
                except Exception as e:
                    logger.warning('parse() error, type: {}, {}'
                                   .format(type(e).__name__, e.args))
                    continue

                if isinstance(input_features, tuple):
                    for i in range(len(input_features)):
                        features[i].append(input_features[i])
                else:
                    features[0].append(input_features)
                if self.labels is not None:
                    features[len(features) - 1].append(label)
                success_count += 1

            ret = []

            for feature in features:
                try:
                    feat_array = numpy.asarray(feature)
                except ValueError:
                    # Temporary workaround to convert an object-type list into
                    # a numpy array. See https://goo.gl/kgJXwb
                    feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                    feat_array[:] = feature[:]
                ret.append(feat_array)
            result = tuple(ret)
            logger.info('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'
                        .format(fail_count, success_count, total_count))
        else:
            # Spec not finalized yet for general case
            result = pp.process(filepath)

        smileses = numpy.array(smiles_list) if return_smiles else None

        if isinstance(result, tuple):
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(*result)
            return {"dataset": NumpyTupleDataset(*result), "smiles": smileses}
        else:
            if self.postprocess_fn is not None:
                result = self.postprocess_fn(result)
            return {"dataset": NumpyTupleDataset(result), "smiles": smileses}