def load_dataset(method, labels, prefix='input', num_data=-1):
    policy = _CacheNamePolicy(method, labels, prefix, num_data=num_data)
    train_path = policy.get_train_file_path()
    val_path = policy.get_val_file_path()
    test_path = policy.get_test_file_path()

    train, val, test = None, None, None
    print()
    if os.path.exists(policy.cache_dir):
        print('load from cache {}'.format(policy.cache_dir))
        train = NumpyTupleDataset.load(train_path)
        val = NumpyTupleDataset.load(val_path)
        test = NumpyTupleDataset.load(test_path)
    if train is None or val is None or test is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # Use `num_data` examples for train
            target_index = numpy.arange(num_data)
            train, val, test = D.get_tox21(
                preprocessor, labels=labels,
                train_target_index=target_index,
                val_target_index=None, test_target_index=None
            )
        else:
            train, val, test = D.get_tox21(preprocessor, labels=labels)
        # Cache dataset
        policy.create_cache_directory()
        NumpyTupleDataset.save(train_path, train)
        NumpyTupleDataset.save(val_path, val)
        NumpyTupleDataset.save(test_path, test)
    return train, val, test
def load_dataset(method, labels, prefix='input', num_data=-1):
    policy = _CacheNamePolicy(method, labels, prefix, num_data=num_data)
    train_path = policy.get_train_file_path()
    val_path = policy.get_val_file_path()
    test_path = policy.get_test_file_path()

    train, val, test = None, None, None
    if os.path.exists(policy.cache_dir):
        print('load from cache {}'.format(policy.cache_dir))
        train = NumpyTupleDataset.load(train_path)
        val = NumpyTupleDataset.load(val_path)
        test = NumpyTupleDataset.load(test_path)
    if train is None or val is None or test is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # Use `num_data` examples for train
            target_index = numpy.arange(num_data)
            train, val, test = D.get_tox21(
                preprocessor, labels=labels,
                train_target_index=target_index,
                val_target_index=None, test_target_index=None)
        else:
            train, val, test = D.get_tox21(preprocessor, labels=labels)
        # Cache dataset
        policy.create_cache_directory()
        NumpyTupleDataset.save(train_path, train)
        NumpyTupleDataset.save(val_path, val)
        NumpyTupleDataset.save(test_path, test)
    return train, val, test
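# Usage sketch for `load_dataset` above (illustrative, not from the original
# scripts). Assumptions: 'nfp' is a key of preprocess_method_dict and
# 'NR-AR' is one of the Tox21 task names.
train, val, test = load_dataset('nfp', ['NR-AR'], num_data=100)
print(len(train))  # 100, since `num_data` limits the training set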
def test_train_valid_regression_split(reg_dataset):
    splitter = StratifiedSplitter()
    train_ind, valid_ind = splitter.train_valid_split(reg_dataset)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 90
    assert valid_ind.shape[0] == 10

    train = NumpyTupleDataset(*reg_dataset.features[train_ind])
    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])
    assert 45.0 < train.features[:, -1].mean() < 55.0
    assert 45.0 < valid.features[:, -1].mean() < 55.0
def test_train_valid_classification_split(cls_dataset):
    splitter = StratifiedSplitter()
    train_ind, valid_ind = splitter.train_valid_split(cls_dataset)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 27
    assert valid_ind.shape[0] == 3

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    assert (train.features[:, -1] == 1).sum() == 9
    assert (valid.features[:, -1] == 1).sum() == 1
def get_pdbbind_grid(pdbbind_subset, split=None, frac_train=.8,
                     frac_valid=.1, frac_test=.1, task_index=0, **kwargs):
    """Downloads, caches and grid-featurizes the PDBbind dataset.

    Args:
        pdbbind_subset (str): PDBbind dataset subset name. For details of
            the subsets, please refer to the `official site
            <http://www.pdbbind.org.cn/download/pdbbind_2017_intro.pdf>`_
        split (str or BaseSplitter or None): How to split the dataset into
            train, validation and test. If `None`, this function uses the
            splitter recommended by MoleculeNet. Alternatively, you can pass
            an instance of BaseSplitter, or choose from 'random',
            'stratified' and 'scaffold'.
        task_index (int): Target task index in the dataset for
            stratification. (Stratified Splitter only)

    Returns (dict): Dictionary that contains the dataset already split into
        train, valid and test, and 1-d numpy arrays with
        dtype=object (string) which are vectors of smiles and pdb_id for
        each example, or `None`.
    """
    result = {}
    dataset = get_grid_featurized_pdbbind_dataset(pdbbind_subset)
    if split is None:
        split = molnet_default_config['pdbbind_grid']['split']
    if isinstance(split, str):
        splitter = split_method_dict[split]()
    elif isinstance(split, BaseSplitter):
        splitter = split
    else:
        raise TypeError("split must be None, str, or an instance of"
                        " BaseSplitter, but got {}".format(type(split)))
    time_list = get_pdbbind_time()
    train_ind, valid_ind, test_ind = \
        splitter.train_valid_test_split(dataset, time_list=time_list,
                                        smiles_list=None,
                                        task_index=task_index,
                                        frac_train=frac_train,
                                        frac_valid=frac_valid,
                                        frac_test=frac_test, **kwargs)
    train = NumpyTupleDataset(*dataset.features[train_ind])
    valid = NumpyTupleDataset(*dataset.features[valid_ind])
    test = NumpyTupleDataset(*dataset.features[test_ind])
    result['dataset'] = (train, valid, test)
    result['smiles'] = None
    return result
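# Usage sketch for `get_pdbbind_grid` (illustrative). Assumption: 'core' is
# a valid PDBbind subset name for the grid-featurized data.
data = get_pdbbind_grid('core', split='random')
train, valid, test = data['dataset']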
def _test_roc_auc_evaluator_with_labels(data1):
    """test `pos_labels` and `ignore_labels` behavior"""
    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data1)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = ROCAUCEvaluator(
        iterator, predictor, name='val',
        pos_labels=[1, 2], ignore_labels=-1,
    )

    # --- test evaluate ---
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected_roc_auc = 0.75
    # print('observation ', observation)
    assert observation['target/roc_auc'] == expected_roc_auc

    # --- test __call__ ---
    result = evaluator()
    # print('result ', result)
    assert result['val/main/roc_auc'] == expected_roc_auc
def _test_balanced_serial_iterator_no_batch_balancing():
    x = numpy.arange(8)
    t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1])
    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=9,
                                      labels=t, ignore_labels=-1,
                                      batch_balancing=False)
    # In this case, we have 3 examples of label=1.
    # When BalancedSerialIterator runs, all label examples are sampled
    # 3 times in one epoch.
    # Therefore, the number of examples is "augmented" to 9:
    # 3 (number of label types) * 3 (maximum number of examples in one label)
    expect_N_augmented = 9
    assert iterator.N_augmented == expect_N_augmented

    # iterator.show_label_stats()  # we can show label stats
    batch = iterator.next()
    assert len(batch) == 9
    labels_batch = numpy.array([example[-1] for example in batch])
    assert numpy.sum(labels_batch == 0) == 3
    assert numpy.sum(labels_batch == 1) == 3
    assert numpy.sum(labels_batch == 2) == 3
def create_datasets(atom_arrays, adj_arrays, teach_signals, wle_arrays=None):
    """Expand the atomic-number arrays with the expanded labels, then return
    valid datasets (tuple of NumpyTupleDataset).

    Args:
        atom_arrays: 3-tuple of list of lists. atom_arrays[i][j][k] is the
            id of an atom.
            i: train/val/test
            j: index of a sample (i.e. molecule)
            k: index of an atom
        adj_arrays: list of list of numpy.array, all molecules' adjacency
            tensors
        teach_signals: list of list of numpy.array, all teacher
            (supervision) signals
        wle_arrays: None (for WLE) or 3-tuple of list of lists (for CWLE
            and GWLE).

    Returns:
        3-tuple of valid datasets (train/val/test), each a NumpyTupleDataset
    """
    output_datasets = []

    # ToDo: try another indexing, e.g. original node label + extensions
    assert len(atom_arrays) == len(adj_arrays) == len(teach_signals)
    if wle_arrays is not None:
        assert len(atom_arrays) == len(wle_arrays)

    for i in range(len(atom_arrays)):
        # We have swapped axes 0 and 1 of the adjacency arrays: re-swap.
        set_adj_arrays = np.array(adj_arrays[i])
        for m in range(len(set_adj_arrays)):
            set_adj_arrays[m] = np.swapaxes(set_adj_arrays[m], 0, 1)

        if wle_arrays is None:
            dataset = NumpyTupleDataset(np.array(atom_arrays[i]),
                                        set_adj_arrays,
                                        np.array(teach_signals[i]))
        else:
            dataset = NumpyTupleDataset(np.array(atom_arrays[i]),
                                        set_adj_arrays,
                                        np.array(wle_arrays[i]),
                                        np.array(teach_signals[i]))
        output_datasets.append(dataset)

    return output_datasets
def small_datasets():
    N_1 = 3
    N_2 = 5

    # atom labels: 0 to N-1
    atom_array_1 = np.arange(N_1)
    atom_array_2 = np.arange(N_2)

    # adjacency arrays, built manually
    # fully connected; the expanded labels are a permutation of 0, 1, 2
    adj_array_1 = np.array([[1, 1, 1],
                            [1, 1, 1],
                            [1, 1, 1]]).astype(np.int32)
    # node 0 --> 0-1.2
    # node 1 --> 1-0.2
    # node 2 --> 2-0.1
    adj_array_2 = np.array([[1, 1, 0, 0, 1],
                            [1, 1, 0, 0, 1],
                            [0, 0, 1, 1, 0],
                            [0, 0, 1, 1, 0],
                            [1, 1, 0, 0, 1]]).astype(np.float32)
    # node 0 --> 0-1.4
    # node 1 --> 1-0.4
    # node 2 --> 2-3
    # node 3 --> 3-2
    # node 4 --> 4-0.1

    # supervision labels, dummy
    # (np.int is a deprecated alias of the built-in int)
    teach_signal_1 = np.array(1).astype(int)
    teach_signal_2 = np.array(0).astype(int)

    # concatenate into single numpy arrays
    atom_arrays = np.array([atom_array_1, atom_array_2])
    adj_arrays = np.array([adj_array_1, adj_array_2])
    teach_signals = np.array([teach_signal_1, teach_signal_2])

    # train/val/test datasets, respectively
    datasets = [
        NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals),
        NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals),
        NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals)
    ]
    return datasets
def test_classification_split_by_labels_ndarray(cls_dataset, cls_label):
    splitter = StratifiedSplitter()
    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,
                                                     labels=cls_label)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 24
    assert valid_ind.shape[0] == 3
    assert test_ind.shape[0] == 3

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    test = NumpyTupleDataset(*cls_dataset.features[test_ind])
    assert (train.features[:, -1] == 1).sum() == 8
    assert (valid.features[:, -1] == 1).sum() == 1
    assert (test.features[:, -1] == 1).sum() == 1

    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,
                                                     labels=cls_label,
                                                     frac_train=0.5,
                                                     frac_valid=0.3,
                                                     frac_test=0.2)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 15
    assert valid_ind.shape[0] == 9
    assert test_ind.shape[0] == 6

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    test = NumpyTupleDataset(*cls_dataset.features[test_ind])
    assert (train.features[:, -1] == 1).sum() == 5
    assert (valid.features[:, -1] == 1).sum() == 3
    assert (test.features[:, -1] == 1).sum() == 2
def test_regression_split(reg_dataset):
    splitter = StratifiedSplitter()
    train_ind, valid_ind, test_ind = splitter._split(reg_dataset)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 80
    assert valid_ind.shape[0] == 10
    assert test_ind.shape[0] == 10

    train = NumpyTupleDataset(*reg_dataset.features[train_ind])
    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])
    test = NumpyTupleDataset(*reg_dataset.features[test_ind])
    assert 45.0 < train.features[:, -1].mean() < 55.0
    assert 45.0 < valid.features[:, -1].mean() < 55.0
    assert 45.0 < test.features[:, -1].mean() < 55.0

    train_ind, valid_ind, test_ind = splitter._split(reg_dataset,
                                                     frac_train=0.5,
                                                     frac_valid=0.3,
                                                     frac_test=0.2)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 50
    assert valid_ind.shape[0] == 30
    assert test_ind.shape[0] == 20

    train = NumpyTupleDataset(*reg_dataset.features[train_ind])
    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])
    test = NumpyTupleDataset(*reg_dataset.features[test_ind])
    assert 45.0 < train.features[:, -1].mean() < 55.0
    assert 45.0 < valid.features[:, -1].mean() < 55.0
    assert 45.0 < test.features[:, -1].mean() < 55.0
def _test_balanced_serial_iterator_serialization_with_batch_balancing():
    x = numpy.arange(8)
    t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1])
    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=3,
                                      labels=t, ignore_labels=-1,
                                      batch_balancing=True)
    batch1 = iterator.next()  # NOQA
    batch2 = iterator.next()  # NOQA
    batch3 = iterator.next()  # NOQA
    assert iterator.current_position == 0
    assert iterator.epoch == 1
    assert iterator.is_new_epoch

    target = dict()
    iterator.serialize(DummySerializer(target))

    current_index_list_orig = dict()
    current_pos_orig = dict()
    for label, index_iterator in iterator.labels_iterator_dict.items():
        ii_label = 'index_iterator_{}'.format(label)
        current_index_list_orig[ii_label] = index_iterator.current_index_list
        current_pos_orig[ii_label] = index_iterator.current_pos

    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=3,
                                      labels=t, ignore_labels=-1,
                                      batch_balancing=True)
    iterator.serialize(DummyDeserializer(target))

    assert iterator.current_position == 0
    assert iterator.epoch == 1
    assert iterator.is_new_epoch
    for label, index_iterator in iterator.labels_iterator_dict.items():
        ii_label = 'index_iterator_{}'.format(label)
        assert numpy.array_equal(index_iterator.current_index_list,
                                 current_index_list_orig[ii_label])
        assert index_iterator.current_pos == current_pos_orig[ii_label]
def get_grid_featurized_pdbbind_dataset(subset):
    """Downloads and caches the grid-featurized PDBbind dataset.

    Args:
        subset (str): subset name of the PDBbind dataset.

    Returns (NumpyTupleDataset): grid-featurized PDBbind dataset.
    """
    x_path, y_path = get_grid_featurized_pdbbind_filepath(subset)
    x = joblib.load(x_path).astype('i')
    y = joblib.load(y_path).astype('f')
    dataset = NumpyTupleDataset(x, y)
    return dataset
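# Usage sketch (illustrative; assumes 'core' is a valid subset name).
dataset = get_grid_featurized_pdbbind_dataset('core')
print(len(dataset))  # number of (x, y) examples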
def _test_prc_auc_evaluator_raise_error(data, raise_value_error=True):
    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = PRCAUCEvaluator(
        iterator, predictor, name='train',
        pos_labels=1, ignore_labels=None,
        raise_value_error=raise_value_error
    )
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()
    return observation['target/prc_auc']
def load_dataset(method, labels, prefix='input'):
    policy = _CacheNamePolicy(method, labels, prefix)
    train_path = policy.get_train_file_path()
    val_path = policy.get_val_file_path()
    test_path = policy.get_test_file_path()

    train, val, test = None, None, None
    print()
    if os.path.exists(policy.cache_dir):
        print('load from cache {}'.format(policy.cache_dir))
        train = NumpyTupleDataset.load(train_path)
        val = NumpyTupleDataset.load(val_path)
        test = NumpyTupleDataset.load(test_path)
    if train is None or val is None or test is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        train, val, test = D.get_tox21(preprocessor, labels=labels)
        # Cache dataset
        policy.create_cache_directory()
        NumpyTupleDataset.save(train_path, train)
        NumpyTupleDataset.save(val_path, val)
        NumpyTupleDataset.save(test_path, test)
    return train, val, test
def _test_r2_score_evaluator(inputs):
    predictor = DummyPredictor()
    x0, x1, _ = inputs
    dataset = NumpyTupleDataset(x0, x1)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = R2ScoreEvaluator(iterator, predictor, name='train')
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected = r2_score(x0, x1)
    # `pytest.approx` only checks a value when used inside an assertion.
    assert observation['target/r2_score'][0] == pytest.approx(expected)

    # --- test __call__ ---
    result = evaluator()
    assert result['train/main/r2_score'][0] == pytest.approx(expected)
def _test_balanced_serial_iterator_with_batch_balancing():
    x = numpy.arange(8)
    t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1])
    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=3,
                                      labels=t, ignore_labels=-1,
                                      batch_balancing=True)
    expect_N_augmented = 9
    assert iterator.N_augmented == expect_N_augmented

    batch1 = iterator.next()
    batch2 = iterator.next()
    batch3 = iterator.next()
    for batch in [batch1, batch2, batch3]:
        assert len(batch) == 3
        labels_batch = numpy.array([example[-1] for example in batch])
        assert numpy.sum(labels_batch == 0) == 1
        assert numpy.sum(labels_batch == 1) == 1
        assert numpy.sum(labels_batch == 2) == 1
def cwle_datasets():
    B = 10
    D_atom = 5
    D_wle = 50
    K_large = 10000

    atom_arrays = [np.full((B, D_atom), K_large) for _ in range(3)]
    adj_arrays = [np.eye(B, dtype=np.int32) for _ in range(3)]
    wle_arrays = [
        np.arange(B * D_wle, dtype=np.int32).reshape(B, -1)
        for _ in range(3)
    ]
    signal_arrays = [np.full(B, K_large) for _ in range(3)]
    print(wle_arrays[0].shape)

    datasets = [
        NumpyTupleDataset(atom_arrays[i], adj_arrays[i], wle_arrays[i],
                          signal_arrays[i])
        for i in range(3)
    ]
    return datasets
def _test_prc_auc_evaluator_default_args(data0):
    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data0)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = PRCAUCEvaluator(
        iterator, predictor, name='train',
        pos_labels=1, ignore_labels=None
    )
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    # The expected value is truncated to four digits, hence the loose
    # tolerance. `pytest.approx` only checks a value inside an assertion.
    expected_prc_auc = 0.7916
    assert observation['target/prc_auc'] == pytest.approx(
        expected_prc_auc, rel=1e-3)

    # --- test __call__ ---
    result = evaluator()
    assert result['train/main/prc_auc'] == pytest.approx(
        expected_prc_auc, rel=1e-3)
def _test_roc_auc_evaluator_default_args(data0):
    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data0)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = ROCAUCEvaluator(
        iterator, predictor, name='train',
        pos_labels=1, ignore_labels=None
    )
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected_roc_auc = 0.75
    # print('observation ', observation)
    assert observation['target/roc_auc'] == expected_roc_auc

    # --- test __call__ ---
    result = evaluator()
    # print('result ', result)
    assert result['train/main/roc_auc'] == expected_roc_auc
def _test_prc_auc_evaluator_with_labels(data1):
    """test `pos_labels` and `ignore_labels` behavior"""
    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data1)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = PRCAUCEvaluator(
        iterator, predictor, name='val',
        pos_labels=[1, 2], ignore_labels=-1,
    )

    # --- test evaluate ---
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    # The expected value is truncated to four digits, hence the loose
    # tolerance. `pytest.approx` only checks a value inside an assertion.
    expected_prc_auc = 0.7916
    assert observation['target/prc_auc'] == pytest.approx(
        expected_prc_auc, rel=1e-3)

    # --- test __call__ ---
    result = evaluator()
    assert result['val/main/prc_auc'] == pytest.approx(
        expected_prc_auc, rel=1e-3)
def parse(self, filepath, retain_smiles=False):
    """parse csv file using `preprocessor`

    Labels are extracted from the `labels` columns and input features are
    extracted from the smiles information in the `smiles` column.

    Args:
        filepath (str): file path to be parsed.
        retain_smiles (bool): If set to True, the smiles list is saved to
            the `smiles` property.

    Returns: Dataset
    """
    logger = self.logger
    pp = self.preprocessor
    if retain_smiles:
        self.smiles = []  # Initialize

    # counter = 0
    if isinstance(pp, MolPreprocessor):
        try:
            # `read_csv` is the recommended method in pandas versions
            # after 0.18.x.
            df = pandas.read_csv(filepath)
        except AttributeError as e:
            # `DataFrame.from_csv` is deprecated in newer versions of
            # pandas, but is kept as a fallback for older versions.
            df = pandas.DataFrame.from_csv(filepath)
        features = None
        smiles_index = df.columns.get_loc(self.smiles_col)
        if self.labels is None:
            labels_index = []  # dummy list
        else:
            labels_index = [df.columns.get_loc(c) for c in self.labels]

        total_count = df.shape[0]
        fail_count = 0
        success_count = 0
        for row in tqdm(df.itertuples(index=False), total=df.shape[0]):
            smiles = row[smiles_index]
            # TODO(Nakago): Check.
            # currently it assumes list
            labels = [row[i] for i in labels_index]
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    fail_count += 1
                    continue
                # Note that the smiles expression is not unique.
                # We should re-obtain smiles from `mol` so that the smiles
                # order does not contradict the input features' order.
                # Here, `smiles` and `standardized_smiles` express the same
                # molecule, but the expression may be different!
                standardized_smiles, mol = pp.prepare_smiles_and_mol(mol)
                input_features = pp.get_input_features(mol)

                # Extract label
                if self.postprocess_label is not None:
                    labels = self.postprocess_label(labels)

                if retain_smiles:
                    assert standardized_smiles == Chem.MolToSmiles(mol)
                    self.smiles.append(standardized_smiles)
                    # logger.debug('[DEBUG] smiles {}, standard_smiles {}'
                    #              .format(smiles, standardized_smiles))
            except MolFeatureExtractionError as e:
                # Expected error: feature extraction failed, skip this
                # molecule.
                fail_count += 1
                continue
            except Exception as e:
                logger.warning('parse(), type: {}, {}'.format(
                    type(e).__name__, e.args))
                logger.info(traceback.format_exc())
                fail_count += 1
                continue
            # Initialize features: list of list
            if features is None:
                if isinstance(input_features, tuple):
                    num_features = len(input_features)
                else:
                    num_features = 1
                if self.labels is not None:
                    num_features += 1
                features = [[] for _ in range(num_features)]

            if isinstance(input_features, tuple):
                for i in range(len(input_features)):
                    features[i].append(input_features[i])
            else:
                features[0].append(input_features)
            if self.labels is not None:
                features[len(features) - 1].append(labels)
            success_count += 1
        ret = []
        for feature in features:
            try:
                feat_array = numpy.asarray(feature)
            except ValueError:
                # Temporary workaround.
                # See https://stackoverflow.com/questions/26885508/why-do-i-get-error-trying-to-cast-np-arraysome-list-valueerror-could-not-broa
                feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                feat_array[:] = feature[:]
            ret.append(feat_array)
        result = tuple(ret)
        logger.info('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'
                    .format(fail_count, success_count, total_count))
    else:
        # Spec not finalized yet for general case
        result = pp.process(filepath)

    if isinstance(result, tuple):
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(*result)
        return NumpyTupleDataset(*result)
    else:
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(result)
        return NumpyTupleDataset(result)
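# Usage sketch for the csv `parse` above (illustrative). Assumptions: this
# method lives on CSVFileParser, NFPPreprocessor is one of the available
# preprocessors, and 'dataset.csv' is a hypothetical file with a 'smiles'
# column and a 'value' label column.
parser = CSVFileParser(NFPPreprocessor(), labels=['value'],
                       smiles_col='smiles')
dataset = parser.parse('dataset.csv', retain_smiles=True)
smiles = parser.smiles  # populated because retain_smiles=True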
def parse(self, filepath, return_smiles_pair=False,
          return_smiles_pair_original=False, target_index=None,
          return_is_successful=False):
    smiles2ssp_filename = "smiles2ssp.pkl"
    smiles2ssp_path = "/home/chenx/drug_mining/representation_learning/chainer-chemistry/examples/ddi/dataset/drug_list"
    smiles2ssp_filepath = os.path.join(smiles2ssp_path, smiles2ssp_filename)
    with open(smiles2ssp_filepath, 'rb') as pkl_reader:
        smiles2vec = pickle.load(pkl_reader)

    df = pandas.read_csv(filepath)

    logger = self.logger
    pp = self.preprocessor
    smiles_pair_list = []
    smiles_pair_list_original = []
    is_successful_list = []

    # counter = 0
    if isinstance(pp, MolPreprocessor):
        # No influence.
        if target_index is not None:
            df = df.iloc[target_index]

        features = None
        smiles_1_index = df.columns.get_loc(self.smiles_cols[0])
        smiles_2_index = df.columns.get_loc(self.smiles_cols[1])
        if self.labels is None:
            labels_index = []  # dummy list
        else:
            labels_index = [df.columns.get_loc(c) for c in self.labels]

        total_count = df.shape[0]
        fail_count = 0
        success_count = 0
        # iterate over every row of the csv file
        for row in tqdm(df.itertuples(index=False), total=df.shape[0]):
            smiles_1 = row[smiles_1_index]
            smiles_2 = row[smiles_2_index]
            # currently it assumes list
            labels = [int(row[i]) for i in labels_index]
            try:
                mol_1 = Chem.MolFromSmiles(smiles_1)
                mol_2 = Chem.MolFromSmiles(smiles_2)
                if mol_1 is None or mol_2 is None:
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue

                # input_features_1 = pp.get_input_features(mol_1)
                # input_features_2 = pp.get_input_features(mol_2)
                input_features_1 = smiles2vec[smiles_1]
                input_features_2 = smiles2vec[smiles_2]

                # Extract label
                if self.postprocess_label is not None:
                    labels = self.postprocess_label(labels)

                # if return_smiles_pair:
                #     smiles_pair_list.append(
                #         [canonical_smiles_1, canonical_smiles_2])
                if return_smiles_pair:
                    smiles_pair_list.append([smiles_1, smiles_2])
                if return_smiles_pair_original:
                    smiles_pair_list_original.append([smiles_1, smiles_2])
            except MolFeatureExtractionError as e:
                # Expected error: feature extraction failed, skip this
                # molecule.
                fail_count += 1
                if return_is_successful:
                    is_successful_list.append(False)
                continue
            except Exception as e:
                logger.warning('parse(), type: {}, {}'.format(
                    type(e).__name__, e.args))
                logger.info(traceback.format_exc())
                fail_count += 1
                if return_is_successful:
                    is_successful_list.append(False)
                continue
            # Initialize features: list of list
            if features is None:
                if isinstance(input_features_1, tuple):
                    num_features_1 = len(input_features_1)
                else:
                    num_features_1 = 1
                if isinstance(input_features_2, tuple):
                    num_features_2 = len(input_features_2)
                else:
                    num_features_2 = 1
                num_features = num_features_1 + num_features_2
                if self.labels is not None:
                    num_features += 1
                # list of lists, one sublist per feature
                features = [[] for _ in range(num_features)]

            if isinstance(input_features_1, tuple):
                for i in range(len(input_features_1)):
                    # features[i] is a list containing the i-th feature
                    features[i].append(input_features_1[i])
            else:
                features[0].append(input_features_1)
            # offset = len(input_features_1)
            offset = num_features_1
            if isinstance(input_features_2, tuple):
                for i in range(len(input_features_2)):
                    features[offset + i].append(input_features_2[i])
            else:
                features[offset].append(input_features_2)
            # last column corresponds to the target label
            if self.labels is not None:
                features[len(features) - 1].append(labels)
            success_count += 1
            if return_is_successful:
                is_successful_list.append(True)
        ret = []
        for feature in features:
            try:
                feat_array = numpy.asarray(feature)
            except ValueError:
                # Temporary workaround.
                # See https://stackoverflow.com/questions/26885508/why-do-i-get-error-trying-to-cast-np-arraysome-list-valueerror-could-not-broa
                feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                feat_array[:] = feature[:]
            ret.append(feat_array)
        result = tuple(ret)
        logger.info('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'
                    .format(fail_count, success_count, total_count))
    else:
        raise NotImplementedError

    smiles_pairs = numpy.array(
        smiles_pair_list) if return_smiles_pair else None
    smiles_pairs_original = numpy.array(
        smiles_pair_list_original) if return_smiles_pair_original else None
    if return_is_successful:
        is_successful = numpy.array(is_successful_list)
    else:
        is_successful = None

    if isinstance(result, tuple):
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(*result)
        dataset = NumpyTupleDataset(*result)
    else:
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(result)
        dataset = NumpyTupleDataset(result)
    return {
        "dataset": dataset,
        "smiles_pair": smiles_pairs,
        "smiles_pair_original": smiles_pairs_original,
        "is_successful": is_successful
    }
def parse(self, filepath, return_smiles_pair=False,
          return_smiles_pair_original=False, target_index=None,
          return_is_successful=False):
    """parse csv file using `preprocessor`

    Labels are extracted from the `labels` columns and input features are
    extracted from the smiles information in the smiles columns.

    Args:
        filepath (str): file path to be parsed.
        return_smiles_pair (bool): If set to `True`, the smiles-pair list
            is returned in the key 'smiles_pair'; it is a list of SMILES
            pairs from which input features were successfully made.
            If set to `False`, `None` is returned in the key 'smiles_pair'.
        target_index (list or None): target index list to partially extract
            dataset. If None (default), all examples are parsed.
        return_is_successful (bool): If set to `True`, a boolean list is
            returned in the key 'is_successful'. It represents whether
            preprocessing succeeded for each SMILES pair. If set to False,
            `None` is returned in the key 'is_successful'.

    Returns (dict): dictionary that contains Dataset, 1-d numpy array with
        dtype=object (string) which is a vector of smiles for each example
        or None.
    """
    df = pandas.read_csv(filepath)

    logger = self.logger
    pp = self.preprocessor
    smiles_pair_list = []
    smiles_pair_list_original = []
    is_successful_list = []

    # counter = 0
    if isinstance(pp, MolPreprocessor):
        # No influence.
        if target_index is not None:
            df = df.iloc[target_index]

        features = None
        smiles_1_index = df.columns.get_loc(self.smiles_cols[0])
        smiles_2_index = df.columns.get_loc(self.smiles_cols[1])
        if self.labels is None:
            labels_index = []  # dummy list
        else:
            labels_index = [df.columns.get_loc(c) for c in self.labels]

        total_count = df.shape[0]
        fail_count = 0
        success_count = 0
        # iterate over every row of the csv file
        for row in tqdm(df.itertuples(index=False), total=df.shape[0]):
            smiles_1 = row[smiles_1_index]
            smiles_2 = row[smiles_2_index]
            # currently it assumes list
            labels = [int(row[i]) for i in labels_index]
            try:
                mol_1 = Chem.MolFromSmiles(smiles_1)
                mol_2 = Chem.MolFromSmiles(smiles_2)
                if mol_1 is None or mol_2 is None:
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue
                # Note that the smiles expression is not unique.
                # We could obtain canonical smiles:
                # canonical_smiles_1, mol_1 = pp.prepare_smiles_and_mol(mol_1)
                # input_features_1 = pp.get_input_features(mol_1)
                # canonical_smiles_2, mol_2 = pp.prepare_smiles_and_mol(mol_2)
                # input_features_2 = pp.get_input_features(mol_2)
                input_features_1 = pp.get_input_features(mol_1)
                input_features_2 = pp.get_input_features(mol_2)

                # Extract label
                if self.postprocess_label is not None:
                    labels = self.postprocess_label(labels)

                # if return_smiles_pair:
                #     smiles_pair_list.append(
                #         [canonical_smiles_1, canonical_smiles_2])
                if return_smiles_pair:
                    smiles_pair_list.append([smiles_1, smiles_2])
                if return_smiles_pair_original:
                    smiles_pair_list_original.append([smiles_1, smiles_2])
            except MolFeatureExtractionError as e:
                # Expected error: feature extraction failed, skip this
                # molecule.
                fail_count += 1
                if return_is_successful:
                    is_successful_list.append(False)
                continue
            except Exception as e:
                logger.warning('parse(), type: {}, {}'.format(
                    type(e).__name__, e.args))
                logger.info(traceback.format_exc())
                fail_count += 1
                if return_is_successful:
                    is_successful_list.append(False)
                continue
            # Initialize features: list of list
            if features is None:
                if isinstance(input_features_1, tuple):
                    num_features_1 = len(input_features_1)
                else:
                    num_features_1 = 1
                if isinstance(input_features_2, tuple):
                    num_features_2 = len(input_features_2)
                else:
                    num_features_2 = 1
                num_features = num_features_1 + num_features_2
                if self.labels is not None:
                    num_features += 1
                # list of lists, one sublist per feature
                features = [[] for _ in range(num_features)]

            if isinstance(input_features_1, tuple):
                for i in range(len(input_features_1)):
                    # features[i] is a list containing the i-th feature
                    features[i].append(input_features_1[i])
            else:
                features[0].append(input_features_1)
            # use num_features_1 so this also works when input_features_1
            # is not a tuple
            offset = num_features_1
            if isinstance(input_features_2, tuple):
                for i in range(len(input_features_2)):
                    features[offset + i].append(input_features_2[i])
            else:
                features[offset].append(input_features_2)
            # last column corresponds to the target label
            if self.labels is not None:
                features[len(features) - 1].append(labels)
            success_count += 1
            if return_is_successful:
                is_successful_list.append(True)
        ret = []
        for feature in features:
            try:
                feat_array = numpy.asarray(feature)
            except ValueError:
                # Temporary workaround.
                # See https://stackoverflow.com/questions/26885508/why-do-i-get-error-trying-to-cast-np-arraysome-list-valueerror-could-not-broa
                feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                feat_array[:] = feature[:]
            ret.append(feat_array)
        result = tuple(ret)
        logger.info('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'
                    .format(fail_count, success_count, total_count))
    else:
        raise NotImplementedError

    smiles_pairs = numpy.array(
        smiles_pair_list) if return_smiles_pair else None
    smiles_pairs_original = numpy.array(
        smiles_pair_list_original) if return_smiles_pair_original else None
    if return_is_successful:
        is_successful = numpy.array(is_successful_list)
    else:
        is_successful = None

    if isinstance(result, tuple):
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(*result)
        dataset = NumpyTupleDataset(*result)
    else:
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(result)
        dataset = NumpyTupleDataset(result)
    return {
        "dataset": dataset,
        "smiles_pair": smiles_pairs,
        "smiles_pair_original": smiles_pairs_original,
        "is_successful": is_successful
    }
def get_molnet_dataset(dataset_name, preprocessor=None, labels=None,
                       split=None, frac_train=.8, frac_valid=.1,
                       frac_test=.1, seed=777, return_smiles=False,
                       target_index=None, task_index=0, **kwargs):
    """Downloads, caches and preprocesses a MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. For details of
            MoleculeNet, please refer to the `official site
            <http://moleculenet.ai/datasets-1>`_. To see which dataset_name
            values are available in chainer_chemistry, please refer to
            `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor. It should be chosen
            based on the network to be trained. If it is None, the default
            `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split the dataset into
            train, validation and test. If `None`, this function uses the
            splitter recommended by MoleculeNet. Alternatively, you can pass
            an instance of BaseSplitter, or choose from 'random',
            'stratified' and 'scaffold'.
        return_smiles (bool): If set to ``True``, the smiles array is also
            returned.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in the dataset for
            stratification. (Stratified Splitter only)

    Returns (dict): Dictionary that contains the dataset already split into
        train, valid and test, and a 1-d numpy array with
        dtype=object (string) which is a vector of smiles for each example,
        or `None`.
    """
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [labels, ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':
        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':
        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor, labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        split = dataset_config['split'] if split is None else split

        if isinstance(split, str):
            splitter = split_method_dict[split]()
        elif isinstance(split, BaseSplitter):
            splitter = split
        else:
            raise TypeError("split must be None, str or an instance of"
                            " BaseSplitter, but got {}".format(type(split)))

        if isinstance(splitter, ScaffoldSplitter):
            get_smiles = True
        else:
            get_smiles = return_smiles

        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=get_smiles,
                              target_index=target_index, **kwargs)
        dataset = result['dataset']
        smiles = result['smiles']
        train_ind, valid_ind, test_ind = \
            splitter.train_valid_test_split(dataset, smiles_list=smiles,
                                            task_index=task_index,
                                            frac_train=frac_train,
                                            frac_valid=frac_valid,
                                            frac_test=frac_test, **kwargs)
        train = NumpyTupleDataset(*dataset.features[train_ind])
        valid = NumpyTupleDataset(*dataset.features[valid_ind])
        test = NumpyTupleDataset(*dataset.features[test_ind])

        result['dataset'] = (train, valid, test)
        if return_smiles:
            train_smiles = smiles[train_ind]
            valid_smiles = smiles[valid_ind]
            test_smiles = smiles[test_ind]
            result['smiles'] = (train_smiles, valid_smiles, test_smiles)
        else:
            result['smiles'] = None
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name,
                                                        'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name,
                                                        'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name,
                                                       'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'],
                             valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'],
                            valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise ValueError('dataset_type={} is not supported'.format(
            dataset_config['dataset_type']))

    return result
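# Usage sketch for `get_molnet_dataset` (illustrative). Assumption: 'bbbp'
# is one of the dataset names configured in molnet_config.py.
data = get_molnet_dataset('bbbp', split='scaffold', return_smiles=True)
train, valid, test = data['dataset']
train_smiles, valid_smiles, test_smiles = data['smiles']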
def get_molnet_dataset(dataset_name, preprocessor=None, labels=None,
                       split='random', frac_train=.8, frac_valid=.1,
                       frac_test=.1, seed=777, return_smiles=False,
                       target_index=None):
    """Downloads, caches and preprocesses a MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. For details of
            MoleculeNet, please refer to the `official site
            <http://moleculenet.ai/datasets-1>`_. To see which dataset_name
            values are available in chainer_chemistry, please refer to
            `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor. It should be chosen
            based on the network to be trained. If it is None, the default
            `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        return_smiles (bool): If set to ``True``, the smiles array is also
            returned.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.

    Returns (dict): Dictionary that contains the dataset already split into
        train, valid and test, and a 1-d numpy array with
        dtype=object (string) which is a vector of smiles for each example,
        or `None`.
    """
    from chainer_chemistry.dataset.parsers.csv_file_parser import \
        CSVFileParser
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [labels, ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':
        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':
        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor, labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=return_smiles,
                              target_index=target_index)
        # TODO(motoki): splitting function or class
        dataset = result['dataset']
        if split == 'random':
            perm = numpy.random.permutation(len(dataset))
            dataset = NumpyTupleDataset(*dataset.features[perm])
            train_data_size = int(len(dataset) * frac_train)
            valid_data_size = int(len(dataset) * frac_valid)
            train = NumpyTupleDataset(*dataset.features[:train_data_size])
            valid = NumpyTupleDataset(
                *dataset.features[train_data_size:
                                  train_data_size + valid_data_size])
            test = NumpyTupleDataset(
                *dataset.features[train_data_size + valid_data_size:])

            result['dataset'] = (train, valid, test)
            if return_smiles:
                smiles = result['smiles'][perm]
                train_smiles = smiles[:train_data_size]
                valid_smiles = smiles[train_data_size:
                                      train_data_size + valid_data_size]
                test_smiles = smiles[train_data_size + valid_data_size:]
                result['smiles'] = (train_smiles, valid_smiles, test_smiles)
            else:
                result['smiles'] = None
        else:
            raise NotImplementedError
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name,
                                                        'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name,
                                                        'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name,
                                                       'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'],
                             valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'],
                            valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise NotImplementedError

    return result
def indexer(data):
    dataset = NumpyTupleDataset(*data)
    indexer = NumpyTupleDatasetFeatureIndexer(dataset)
    return indexer
def parse(self, df, return_smiles=False, target_index=None,
          return_is_successful=False):
    """parse DataFrame using `preprocessor`

    Labels are extracted from the `labels` columns and input features are
    extracted from the smiles information in the `smiles` column.

    Args:
        df (pandas.DataFrame): dataframe to be parsed.
        return_smiles (bool): If set to `True`, the smiles list is returned
            in the key 'smiles'; it is a list of SMILES from which input
            features were successfully made. If set to `False`, `None` is
            returned in the key 'smiles'.
        target_index (list or None): target index list to partially extract
            dataset. If None (default), all examples are parsed.
        return_is_successful (bool): If set to `True`, a boolean list is
            returned in the key 'is_successful'. It represents whether
            preprocessing succeeded for each SMILES. If set to False,
            `None` is returned in the key 'is_successful'.

    Returns (dict): dictionary that contains Dataset, 1-d numpy array with
        dtype=object (string) which is a vector of smiles for each example
        or None.
    """
    logger = self.logger
    pp = self.preprocessor
    smiles_list = []
    is_successful_list = []

    # counter = 0
    if isinstance(pp, MolPreprocessor):
        if target_index is not None:
            df = df.iloc[target_index]

        features = None
        smiles_index = df.columns.get_loc(self.smiles_col)
        if self.labels is None:
            labels_index = []  # dummy list
        else:
            labels_index = [df.columns.get_loc(c) for c in self.labels]

        total_count = df.shape[0]
        fail_count = 0
        success_count = 0
        for row in tqdm(df.itertuples(index=False), total=df.shape[0]):
            smiles = row[smiles_index]
            # TODO(Nakago): Check.
            # currently it assumes list
            labels = [row[i] for i in labels_index]
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    fail_count += 1
                    if return_is_successful:
                        is_successful_list.append(False)
                    continue
                # Note that the smiles expression is not unique.
                # We obtain canonical smiles.
                canonical_smiles, mol = pp.prepare_smiles_and_mol(mol)
                input_features = pp.get_input_features(mol)

                # Extract label
                if self.postprocess_label is not None:
                    labels = self.postprocess_label(labels)

                if return_smiles:
                    assert canonical_smiles == Chem.MolToSmiles(mol)
                    smiles_list.append(canonical_smiles)
                    # logger.debug('[DEBUG] smiles {}, standard_smiles {}'
                    #              .format(smiles, standardized_smiles))
            except MolFeatureExtractionError as e:
                # Expected error: feature extraction failed, skip this
                # molecule.
                fail_count += 1
                if return_is_successful:
                    is_successful_list.append(False)
                continue
            except Exception as e:
                logger.warning('parse(), type: {}, {}'.format(
                    type(e).__name__, e.args))
                logger.info(traceback.format_exc())
                fail_count += 1
                if return_is_successful:
                    is_successful_list.append(False)
                continue
            # Initialize features: list of list
            if features is None:
                if isinstance(input_features, tuple):
                    num_features = len(input_features)
                else:
                    num_features = 1
                if self.labels is not None:
                    num_features += 1
                features = [[] for _ in range(num_features)]

            if isinstance(input_features, tuple):
                for i in range(len(input_features)):
                    features[i].append(input_features[i])
            else:
                features[0].append(input_features)
            if self.labels is not None:
                features[len(features) - 1].append(labels)
            success_count += 1
            if return_is_successful:
                is_successful_list.append(True)
        ret = []
        for feature in features:
            try:
                feat_array = numpy.asarray(feature)
            except ValueError:
                # Temporary workaround.
                # See https://stackoverflow.com/questions/26885508/why-do-i-get-error-trying-to-cast-np-arraysome-list-valueerror-could-not-broa
                feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                feat_array[:] = feature[:]
            ret.append(feat_array)
        result = tuple(ret)
        logger.info('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'
                    .format(fail_count, success_count, total_count))
    else:
        raise NotImplementedError

    smileses = numpy.array(smiles_list) if return_smiles else None
    if return_is_successful:
        is_successful = numpy.array(is_successful_list)
    else:
        is_successful = None

    if isinstance(result, tuple):
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(*result)
        dataset = NumpyTupleDataset(*result)
    else:
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(result)
        dataset = NumpyTupleDataset(result)
    return {
        "dataset": dataset,
        "smiles": smileses,
        "is_successful": is_successful
    }
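# Usage sketch for the DataFrame `parse` above (illustrative). Assumptions:
# this method lives on DataFrameParser, NFPPreprocessor is one of the
# available preprocessors, and 'value' is a hypothetical label column.
df = pandas.DataFrame({'smiles': ['CCO', 'c1ccccc1'], 'value': [0.5, 1.2]})
parser = DataFrameParser(NFPPreprocessor(), labels=['value'],
                         smiles_col='smiles')
result = parser.parse(df, return_smiles=True, return_is_successful=True)
dataset, smiles = result['dataset'], result['smiles']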
def create_dataset(self, *args, **kwargs):
    return NumpyTupleDataset(*args)
def parse(self, filepath, return_smiles=False, target_index=None,
          return_is_successful=False):
    """parse sdf file using `preprocessor`

    Note that the label is extracted by the preprocessor's method.

    Args:
        filepath (str): file path to be parsed.
        return_smiles (bool): If set to True, this function returns the
            preprocessed dataset and the smiles list. If set to False, this
            function returns the preprocessed dataset and `None`.
        target_index (list or None): target index list to partially extract
            dataset. If None (default), all examples are parsed.
        return_is_successful (bool): If set to `True`, a boolean list is
            returned in the key 'is_successful'. It represents whether
            preprocessing succeeded for each molecule. If set to False,
            `None` is returned in the key 'is_successful'.

    Returns (dict): dictionary that contains Dataset, 1-d numpy array with
        dtype=object (string) which is a vector of smiles for each example
        or None.
    """
    logger = self.logger
    pp = self.preprocessor
    smiles_list = []
    is_successful_list = []

    if isinstance(pp, MolPreprocessor):
        mol_supplier = Chem.SDMolSupplier(filepath)
        if target_index is None:
            target_index = list(range(len(mol_supplier)))

        features = None

        total_count = len(mol_supplier)
        fail_count = 0
        success_count = 0
        for index in tqdm(target_index):
            # `mol_supplier` does not accept numpy.integer; we must use int
            mol = mol_supplier[int(index)]

            if mol is None:
                fail_count += 1
                if return_is_successful:
                    is_successful_list.append(False)
                continue
            try:
                # Labels need to be extracted from `mol` before
                # standardizing smiles.
                if self.labels is not None:
                    label = pp.get_label(mol, self.labels)
                    if self.postprocess_label is not None:
                        label = self.postprocess_label(label)

                # Note that the smiles expression is not unique.
                # We obtain canonical smiles.
                smiles = Chem.MolToSmiles(mol)
                mol = Chem.MolFromSmiles(smiles)
                canonical_smiles, mol = pp.prepare_smiles_and_mol(mol)
                input_features = pp.get_input_features(mol)

                # Initialize features: list of list
                if features is None:
                    if isinstance(input_features, tuple):
                        num_features = len(input_features)
                    else:
                        num_features = 1
                    if self.labels is not None:
                        num_features += 1
                    features = [[] for _ in range(num_features)]

                if return_smiles:
                    assert canonical_smiles == Chem.MolToSmiles(mol)
                    smiles_list.append(canonical_smiles)
            except MolFeatureExtractionError as e:
                # Expected error: feature extraction failed, skip this
                # molecule.
                fail_count += 1
                if return_is_successful:
                    is_successful_list.append(False)
                continue
            except Exception as e:
                logger.warning('parse() error, type: {}, {}'.format(
                    type(e).__name__, e.args))
                fail_count += 1
                if return_is_successful:
                    is_successful_list.append(False)
                continue

            if isinstance(input_features, tuple):
                for i in range(len(input_features)):
                    features[i].append(input_features[i])
            else:
                features[0].append(input_features)
            if self.labels is not None:
                features[len(features) - 1].append(label)
            success_count += 1
            if return_is_successful:
                is_successful_list.append(True)
        ret = []
        for feature in features:
            try:
                feat_array = numpy.asarray(feature)
            except ValueError:
                # Temporary workaround to convert an object-type list into
                # a numpy array.
                # See https://goo.gl/kgJXwb
                feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                feat_array[:] = feature[:]
            ret.append(feat_array)
        result = tuple(ret)
        logger.info('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'
                    .format(fail_count, success_count, total_count))
    else:
        # Spec not finalized yet for general case
        result = pp.process(filepath)

    smileses = numpy.array(smiles_list) if return_smiles else None
    if return_is_successful:
        is_successful = numpy.array(is_successful_list)
    else:
        is_successful = None

    if isinstance(result, tuple):
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(*result)
        dataset = NumpyTupleDataset(*result)
    else:
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(result)
        dataset = NumpyTupleDataset(result)
    return {
        "dataset": dataset,
        "smiles": smileses,
        "is_successful": is_successful
    }
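# Usage sketch for the sdf `parse` above (illustrative). Assumptions: this
# method lives on SDFFileParser, NFPPreprocessor is one of the available
# preprocessors, and 'dataset.sdf' is a hypothetical file whose molecules
# carry a 'value' property usable as the label.
parser = SDFFileParser(NFPPreprocessor(), labels=['value'])
result = parser.parse('dataset.sdf', return_smiles=True,
                      return_is_successful=True)
dataset = result['dataset']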
def parse(self, filepath, return_smiles=False):
    """parse sdf file using `preprocessor`

    Note that the label is extracted by the preprocessor's method.

    Args:
        filepath (str): file path to be parsed.
        return_smiles (bool): If set to True, this function returns the
            preprocessed dataset and the smiles list. If set to False, this
            function returns the preprocessed dataset and `None`.

    Returns (dict): dictionary that contains Dataset, 1-d numpy array with
        dtype=object (string) which is a vector of smiles for each example
        or None.
    """
    logger = self.logger
    pp = self.preprocessor
    smiles_list = []

    if isinstance(pp, MolPreprocessor):
        mol_supplier = Chem.SDMolSupplier(filepath)

        features = None

        total_count = len(mol_supplier)
        fail_count = 0
        success_count = 0
        for mol in tqdm(mol_supplier):
            if mol is None:
                total_count -= 1
                continue
            try:
                # Labels need to be extracted from `mol` before
                # standardizing smiles.
                if self.labels is not None:
                    label = pp.get_label(mol, self.labels)
                    if self.postprocess_label is not None:
                        label = self.postprocess_label(label)

                # Note that the smiles expression is not unique.
                # We should re-obtain smiles from `mol` so that the smiles
                # order does not contradict the input features' order.
                # Here, `smiles` and `standardized_smiles` express the same
                # molecule, but the expression may be different!
                smiles = Chem.MolToSmiles(mol)
                mol = Chem.MolFromSmiles(smiles)
                standardized_smiles, mol = pp.prepare_smiles_and_mol(mol)
                input_features = pp.get_input_features(mol)

                # Initialize features: list of list
                if features is None:
                    if isinstance(input_features, tuple):
                        num_features = len(input_features)
                    else:
                        num_features = 1
                    if self.labels is not None:
                        num_features += 1
                    features = [[] for _ in range(num_features)]

                if return_smiles:
                    assert standardized_smiles == Chem.MolToSmiles(mol)
                    smiles_list.append(standardized_smiles)
            except MolFeatureExtractionError as e:
                # Expected error: feature extraction failed, skip this
                # molecule.
                fail_count += 1
                continue
            except Exception as e:
                logger.warning('parse() error, type: {}, {}'
                               .format(type(e).__name__, e.args))
                continue
            if isinstance(input_features, tuple):
                for i in range(len(input_features)):
                    features[i].append(input_features[i])
            else:
                features[0].append(input_features)
            if self.labels is not None:
                features[len(features) - 1].append(label)
            success_count += 1
        ret = []
        for feature in features:
            try:
                feat_array = numpy.asarray(feature)
            except ValueError:
                # Temporary workaround to convert an object-type list into
                # a numpy array.
                # See https://goo.gl/kgJXwb
                feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)
                feat_array[:] = feature[:]
            ret.append(feat_array)
        result = tuple(ret)
        logger.info('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'
                    .format(fail_count, success_count, total_count))
    else:
        # Spec not finalized yet for general case
        result = pp.process(filepath)

    smileses = numpy.array(smiles_list) if return_smiles else None
    if isinstance(result, tuple):
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(*result)
        return {"dataset": NumpyTupleDataset(*result), "smiles": smileses}
    else:
        if self.postprocess_fn is not None:
            result = self.postprocess_fn(result)
        # Wrap the single result array exactly once.
        return {"dataset": NumpyTupleDataset(result), "smiles": smileses}