class TestSmilesToSeq(unittest.TestCase):
    """Unit tests covering the SmilesToSeq featurizer."""

    def setUp(self):
        """Create the SmilesToSeq instance shared by the tests below."""
        max_len, pad_len = 35, 5
        data_csv = os.path.join(
            os.path.dirname(__file__), "data", "chembl_25_small.csv")
        # The character-to-index map is derived from the bundled CSV file.
        vocab = create_char_to_idx(data_csv, max_len=max_len)
        self.feat = SmilesToSeq(
            char_to_idx=vocab, max_len=max_len, pad_len=pad_len)

    def test_smiles_to_seq_featurize(self):
        """Check the leading and trailing dimensions of featurized output."""
        smiles = ["Cn1c(=O)c2c(ncn2C)n(C)c1=O", "CC(=O)N1CN(C(C)=O)C(O)C1O"]
        # The expected sequence length is max_len plus pad_len counted twice.
        padded_len = self.feat.max_len + 2 * self.feat.pad_len
        encoded = self.feat.featurize(smiles)
        n_rows, last_dim = encoded.shape[0], encoded.shape[-1]
        assert n_rows == len(smiles)
        assert last_dim == padded_len

    def test_reconstruct_from_seq(self):
        """Check that a featurized SMILES string can be recovered."""
        smiles = ["Cn1c(=O)c2c(ncn2C)n(C)c1=O"]
        encoded = self.feat.featurize(smiles)
        # Pass one sequence at a time; per the original note, array-style
        # (batched) input is not supported by smiles_from_seq.
        first_seq = encoded[0]
        reconstructed_smile = self.feat.smiles_from_seq(first_seq)
        assert smiles[0] == reconstructed_smile
class TestSmilesFeaturizers(TestCase):
    """Tests for SMILES featurizers.

    The cases in this class exercise SmilesToSeq on RDKit molecule inputs.
    Assertions use the TestCase assertion methods rather than the nose
    ``assert_equals`` helper, so the tests do not depend on nose.
    """

    def setUp(self):
        """Create the SmilesToSeq instance shared by the tests below."""
        pad_len = 5
        max_len = 35
        filename = os.path.join(
            os.path.dirname(__file__), "data", "chembl_25_small.csv")
        # The character-to-index map is derived from the bundled CSV file.
        char_to_idx = create_char_to_idx(filename, max_len=max_len)
        self.feat = SmilesToSeq(
            char_to_idx=char_to_idx, max_len=max_len, pad_len=pad_len)

    @staticmethod
    def _to_mols(smiles):
        """Convert a list of SMILES strings into RDKit molecules."""
        # Imported lazily so that importing this module does not require
        # rdkit; only running these tests does.
        from rdkit import Chem
        return [Chem.MolFromSmiles(smile) for smile in smiles]

    def test_smiles_to_seq_featurize(self):
        """Test SmilesToSeq featurization."""
        smiles = ["Cn1c(=O)c2c(ncn2C)n(C)c1=O", "CC(=O)N1CN(C(C)=O)C(O)C1O"]
        mols = self._to_mols(smiles)
        # The expected sequence length is max_len plus pad_len counted twice.
        expected_seq_len = self.feat.max_len + 2 * self.feat.pad_len
        features = self.feat.featurize(mols)
        self.assertEqual(features.shape[0], len(smiles))
        self.assertEqual(features.shape[-1], expected_seq_len)

    def test_reconstruct_from_seq(self):
        """Test SMILES reconstruction from features."""
        smiles = ["Cn1c(=O)c2c(ncn2C)n(C)c1=O"]
        mols = self._to_mols(smiles)
        features = self.feat.featurize(mols)
        # smiles_from_seq is called with a single featurized row.
        reconstructed_smile = self.feat.smiles_from_seq(features[0])
        self.assertEqual(smiles[0], reconstructed_smile)
def setUp(self):
    """Create the SmilesToSeq instance stored on ``self.feat``."""
    max_len, pad_len = 35, 5
    data_csv = os.path.join(
        os.path.dirname(__file__), "data", "chembl_25_small.csv")
    # The character-to-index map is derived from the bundled CSV file.
    vocab = create_char_to_idx(data_csv, max_len=max_len)
    self.feat = SmilesToSeq(
        char_to_idx=vocab, max_len=max_len, pad_len=pad_len)
def get_dataset(mode="classification",
                featurizer="smiles2seq",
                max_seq_len=20,
                data_points=10,
                n_tasks=5):
    """Build a small in-memory dataset with random labels.

    The bundled ``chembl_25_small.csv`` is featurized, the first
    ``data_points`` rows are kept, and labels are replaced by random values
    with unit weights.

    Parameters
    ----------
    mode: str, default classification
      ``"classification"`` yields random 0/1 labels and a mean ROC-AUC
      metric; any other value yields normally distributed labels and a
      mean-absolute-error metric.
    featurizer: str, default smiles2seq
      Either ``"smiles2seq"`` or ``"smiles2img"``.
    max_seq_len: int, default 20
      Number of leading feature columns kept (``smiles2seq`` only).
    data_points: int, default 10
      Number of rows kept from the featurized dataset.
    n_tasks: int, default 5
      Number of label/weight columns generated.

    Returns
    -------
    tuple
      ``(dataset, metric, char_to_idx)`` for ``smiles2seq`` and
      ``(dataset, metric)`` for ``smiles2img``.

    Raises
    ------
    ValueError
      If ``featurizer`` is not one of the supported names.
    """
    # Reject unknown featurizers up front; previously this fell through the
    # if/elif and failed later on an unbound ``feat``.
    if featurizer not in ("smiles2seq", "smiles2img"):
        raise ValueError(
            "Featurizer of type {} is not supported".format(featurizer))

    use_seq = featurizer == "smiles2seq"
    dataset_file = os.path.join(
        os.path.dirname(__file__), "chembl_25_small.csv")

    if use_seq:
        max_len = 250
        pad_len = 10
        char_to_idx = create_char_to_idx(
            dataset_file, max_len=max_len, smiles_field="smiles")
        feat = SmilesToSeq(
            char_to_idx=char_to_idx, max_len=max_len, pad_len=pad_len)
    else:
        img_size = 80
        img_spec = "engd"
        res = 0.5
        feat = SmilesToImage(img_size=img_size, img_spec=img_spec, res=res)

    loader = dc.data.CSVLoader(
        tasks=chembl25_tasks, smiles_field='smiles', featurizer=feat)
    dataset = loader.create_dataset(
        inputs=[dataset_file], shard_size=10000, data_dir=tempfile.mkdtemp())

    w = np.ones(shape=(data_points, n_tasks))
    if mode == 'classification':
        y = np.random.randint(0, 2, size=(data_points, n_tasks))
        metric = dc.metrics.Metric(
            dc.metrics.roc_auc_score, np.mean, mode="classification")
    else:
        y = np.random.normal(size=(data_points, n_tasks))
        metric = dc.metrics.Metric(
            dc.metrics.mean_absolute_error, mode="regression")

    # Sequence features are additionally truncated to max_seq_len columns.
    if use_seq:
        X = dataset.X[:data_points, :max_seq_len]
    else:
        X = dataset.X[:data_points]
    dataset = dc.data.NumpyDataset(X, y, w, dataset.ids[:data_points])

    if use_seq:
        return dataset, metric, char_to_idx
    return dataset, metric
def _chembl25_transformers(transformer_type, dataset):
    """Return the list of y-transformers constructed from ``dataset``."""
    if transformer_type == "minmax":
        transformer_cls = dc.trans.MinMaxTransformer
    else:
        transformer_cls = dc.trans.NormalizationTransformer
    return [
        transformer_cls(transform_X=False, transform_y=True, dataset=dataset)
    ]


def load_chembl25(featurizer="smiles2seq",
                  split="random",
                  data_dir=None,
                  save_dir=None,
                  split_seed=None,
                  reload=True,
                  transformer_type='minmax',
                  **kwargs):
    """Loads the ChEMBL25 dataset, featurizes it, and does a split.

    Parameters
    ----------
    featurizer: str, default smiles2seq
      Featurizer to use; ``"smiles2seq"`` or ``"smiles2img"``.
    split: str or None, default random
      Splitter to use: ``"index"``, ``"random"`` or ``"scaffold"``. If None,
      the dataset is transformed and returned without splitting.
    data_dir: str, default None
      Directory to download data to, or load dataset from. If None,
      ``DEFAULT_DIR`` is used.
    save_dir: str, default None
      Directory to save the featurized dataset to. If None, ``DEFAULT_DIR``
      is used.
    split_seed: int, default None
      Seed to be used for splitting the dataset
    reload: bool, default True
      Whether to reload a previously saved dataset (and save the split
      dataset after building it).
    transformer_type: str, default minmax
      ``"minmax"`` selects a MinMaxTransformer; any other value selects a
      NormalizationTransformer. Only ``y`` is transformed.
    **kwargs
      Optional overrides: ``max_len`` (250) and ``pad_len`` (10) for
      ``smiles2seq``; ``img_size`` (80), ``img_spec`` ("engd") and ``res``
      (0.5) for ``smiles2img``; ``frac_train`` (4/6), ``frac_valid`` (1/6)
      and ``frac_test`` (1/6) for the split.

    Returns
    -------
    tuple
      ``(chembl25_tasks, datasets, transformers)``. ``datasets`` is
      ``(train, valid, test)`` after a split, ``(dataset, None, None)`` when
      ``split`` is None, or whatever was restored from disk on reload.

    Raises
    ------
    ValueError
      If ``featurizer`` is not supported.
    KeyError
      If ``split`` is neither None nor a supported splitter name.
    """
    if data_dir is None:
        data_dir = DEFAULT_DIR
    if save_dir is None:
        save_dir = DEFAULT_DIR

    save_folder = os.path.join(save_dir, "chembl_25-featurized",
                               str(featurizer))
    if featurizer == "smiles2img":
        img_spec = kwargs.get("img_spec", "engd")
        save_folder = os.path.join(save_folder, img_spec)

    if reload:
        if not os.path.exists(save_folder):
            logger.warning(
                "{} does not exist. Reconstructing dataset.".format(
                    save_folder))
        else:
            logger.info("{} exists. Restoring dataset.".format(save_folder))
            loaded, dataset, transformers = (
                dc.utils.save.load_dataset_from_disk(save_folder))
            if loaded:
                return chembl25_tasks, dataset, transformers

    # Fail fast: validate the arguments before the large download and the
    # featurization pass, keeping the exception types raised previously.
    if featurizer not in ("smiles2seq", "smiles2img"):
        raise ValueError(
            "Featurizer of type {} is not supported".format(featurizer))
    if split is not None and split not in ("index", "random", "scaffold"):
        raise KeyError(split)

    dataset_file = os.path.join(data_dir, "chembl_25.csv.gz")
    if not os.path.exists(dataset_file):
        logger.warning(
            "File {} not found. Downloading dataset. (~555 MB)".format(
                dataset_file))
        dc.utils.download_url(url=CHEMBL_URL, dest_dir=data_dir)

    if featurizer == "smiles2seq":
        max_len = kwargs.get('max_len', 250)
        pad_len = kwargs.get('pad_len', 10)
        char_to_idx = create_char_to_idx(
            dataset_file, max_len=max_len, smiles_field="smiles")
        feat = SmilesToSeq(
            char_to_idx=char_to_idx, max_len=max_len, pad_len=pad_len)
    else:
        img_size = kwargs.get("img_size", 80)
        img_spec = kwargs.get("img_spec", "engd")
        res = kwargs.get("res", 0.5)
        feat = SmilesToImage(img_size=img_size, img_spec=img_spec, res=res)

    loader = dc.data.CSVLoader(
        tasks=chembl25_tasks, smiles_field='smiles', featurizer=feat)
    dataset = loader.featurize(
        input_files=[dataset_file], shard_size=10000, data_dir=save_folder)

    if split is None:
        transformers = _chembl25_transformers(transformer_type, dataset)
        logger.info("Split is None, about to transform dataset.")
        for transformer in transformers:
            dataset = transformer.transform(dataset)
        return chembl25_tasks, (dataset, None, None), transformers

    splitters = {
        'index': dc.splits.IndexSplitter(),
        'random': dc.splits.RandomSplitter(),
        'scaffold': dc.splits.ScaffoldSplitter(),
    }
    logger.info("About to split data with {} splitter.".format(split))
    splitter = splitters[split]

    frac_train = kwargs.get('frac_train', 4 / 6)
    frac_valid = kwargs.get('frac_valid', 1 / 6)
    frac_test = kwargs.get('frac_test', 1 / 6)

    train, valid, test = splitter.train_valid_test_split(
        dataset,
        seed=split_seed,
        frac_train=frac_train,
        frac_test=frac_test,
        frac_valid=frac_valid)

    # Transformers are constructed from the training split only and then
    # applied to all three splits.
    transformers = _chembl25_transformers(transformer_type, train)
    for transformer in transformers:
        train = transformer.transform(train)
        valid = transformer.transform(valid)
        test = transformer.transform(test)

    if reload:
        dc.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                           transformers)

    return chembl25_tasks, (train, valid, test), transformers