Example #1
0
 def setUp(self):
     """Create the ``SmilesToSeq`` featurizer and store it on ``self.feat``.

     The character-to-index mapping is built from the
     ``data/chembl_25_small.csv`` file that sits next to this module.
     """
     here = os.path.dirname(__file__)
     csv_path = os.path.join(here, "data", "chembl_25_small.csv")
     seq_limit, padding = 35, 5
     vocab = create_char_to_idx(csv_path, max_len=seq_limit)
     self.feat = SmilesToSeq(char_to_idx=vocab,
                             max_len=seq_limit,
                             pad_len=padding)
Example #2
0
def test_smiles2vec_reload():
    """Test that smiles2vec models can be saved and reloaded.

    A small ``Smiles2Vec`` classifier is fit on randomly generated labels,
    a second model is constructed with the same ``model_dir`` and restored,
    and the predictions of both models on the training data are compared
    for exact equality.
    """
    dataset_file = os.path.join(os.path.dirname(__file__),
                                "chembl_25_small.csv")
    max_len = 250
    pad_len = 10
    max_seq_len = 20
    n_tasks = 5
    data_points = 10

    char_to_idx = create_char_to_idx(dataset_file,
                                     max_len=max_len,
                                     smiles_field="smiles")
    feat = dc.feat.SmilesToSeq(char_to_idx=char_to_idx,
                               max_len=max_len,
                               pad_len=pad_len)

    loader = dc.data.CSVLoader(tasks=CHEMBL25_TASKS,
                               smiles_field='smiles',
                               featurizer=feat)

    # The on-disk dataset is only needed long enough to pull a few rows into
    # memory, so its directory is removed as soon as the block exits.
    with tempfile.TemporaryDirectory() as data_dir:
        disk_dataset = loader.create_dataset(inputs=[dataset_file],
                                             shard_size=10000,
                                             data_dir=data_dir)
        X = disk_dataset.X[:data_points, :max_seq_len]
        ids = disk_dataset.ids[:data_points]

    # Labels are random: the test checks save/restore, not model quality.
    y = np.random.randint(0, 2, size=(data_points, n_tasks))
    w = np.ones(shape=(data_points, n_tasks))
    dataset = dc.data.NumpyDataset(X, y, w, ids)

    # Both models share one checkpoint directory, which is cleaned up once
    # the comparison is done.
    with tempfile.TemporaryDirectory() as model_dir:
        model = dc.models.Smiles2Vec(char_to_idx=char_to_idx,
                                     max_seq_len=max_seq_len,
                                     use_conv=True,
                                     n_tasks=n_tasks,
                                     model_dir=model_dir,
                                     mode="classification")
        model.fit(dataset, nb_epoch=3)

        # Reload Trained Model
        reloaded_model = dc.models.Smiles2Vec(char_to_idx=char_to_idx,
                                              max_seq_len=max_seq_len,
                                              use_conv=True,
                                              n_tasks=n_tasks,
                                              model_dir=model_dir,
                                              mode="classification")
        reloaded_model.restore()

        # Check predictions match on original dataset
        origpred = model.predict(dataset)
        reloadpred = reloaded_model.predict(dataset)
        assert np.all(origpred == reloadpred)
def get_dataset(mode="classification",
                featurizer="smiles2seq",
                max_seq_len=20,
                data_points=10,
                n_tasks=5):
    """Build a small in-memory dataset with random labels for model tests.

    The SMILES strings come from ``chembl_25_small.csv`` next to this module;
    the labels are generated randomly and the weights are all ones.

    Parameters
    ----------
    mode: str, default "classification"
        ``"classification"`` yields random 0/1 labels and a mean ROC-AUC
        metric; any other value yields normally distributed labels and a
        mean-absolute-error metric.
    featurizer: str, default "smiles2seq"
        Either ``"smiles2seq"`` or ``"smiles2img"``.
    max_seq_len: int, default 20
        Number of leading feature columns kept per sample (only used for
        ``"smiles2seq"``).
    data_points: int, default 10
        Number of samples kept from the loaded dataset.
    n_tasks: int, default 5
        Number of label columns to generate.

    Returns
    -------
    tuple
        ``(dataset, metric, char_to_idx)`` for ``"smiles2seq"`` and
        ``(dataset, metric)`` for ``"smiles2img"``.

    Raises
    ------
    ValueError
        If ``featurizer`` is not one of the supported names.
    """
    dataset_file = os.path.join(os.path.dirname(__file__),
                                "chembl_25_small.csv")

    if featurizer == "smiles2seq":
        max_len = 250
        pad_len = 10
        char_to_idx = create_char_to_idx(dataset_file,
                                         max_len=max_len,
                                         smiles_field="smiles")
        feat = SmilesToSeq(char_to_idx=char_to_idx,
                           max_len=max_len,
                           pad_len=pad_len)

    elif featurizer == "smiles2img":
        img_size = 80
        img_spec = "engd"
        res = 0.5
        feat = SmilesToImage(img_size=img_size, img_spec=img_spec, res=res)

    else:
        # Fail with a clear message instead of an UnboundLocalError on `feat`.
        raise ValueError(
            "Featurizer of type {} is not supported".format(featurizer))

    loader = dc.data.CSVLoader(tasks=chembl25_tasks,
                               smiles_field='smiles',
                               featurizer=feat)

    # The on-disk dataset is only a staging area; pull the rows we need into
    # memory and let the temporary directory be removed.
    with tempfile.TemporaryDirectory() as data_dir:
        disk_dataset = loader.create_dataset(inputs=[dataset_file],
                                             shard_size=10000,
                                             data_dir=data_dir)
        X = disk_dataset.X[:data_points]
        ids = disk_dataset.ids[:data_points]

    w = np.ones(shape=(data_points, n_tasks))

    if mode == 'classification':
        y = np.random.randint(0, 2, size=(data_points, n_tasks))
        metric = dc.metrics.Metric(dc.metrics.roc_auc_score,
                                   np.mean,
                                   mode="classification")
    else:
        y = np.random.normal(size=(data_points, n_tasks))
        metric = dc.metrics.Metric(dc.metrics.mean_absolute_error,
                                   mode="regression")

    if featurizer == "smiles2seq":
        # Keep only the leading columns so the width matches max_seq_len.
        dataset = dc.data.NumpyDataset(X[:, :max_seq_len], y, w, ids)
        return dataset, metric, char_to_idx

    dataset = dc.data.NumpyDataset(X, y, w, ids)
    return dataset, metric
def load_chembl25(featurizer="smiles2seq",
                  split="random",
                  data_dir=None,
                  save_dir=None,
                  split_seed=None,
                  reload=True,
                  transformer_type='minmax',
                  **kwargs):
    """Load the ChEMBL25 dataset, featurize it, and optionally split it.

    Parameters
    ----------
    featurizer: str, default "smiles2seq"
        Featurizer to use. One of ``"ECFP"``, ``"GraphConv"``, ``"Weave"``,
        ``"Raw"``, ``"smiles2seq"`` or ``"smiles2img"``.
    split: str or None, default "random"
        Splitter to use: ``"index"``, ``"random"`` or ``"scaffold"``. If
        None, the dataset is transformed and returned unsplit.
    data_dir: str, default None
        Directory to download data to, or load the raw dataset from. If
        None, ``DEFAULT_DIR`` is used.
    save_dir: str, default None
        Directory to save the featurized dataset to. If None,
        ``DEFAULT_DIR`` is used.
    split_seed: int, default None
        Seed to be used for splitting the dataset.
    reload: bool, default True
        Whether to restore a previously saved featurized dataset, and to
        save the split dataset after building it.
    transformer_type: str, default "minmax"
        ``"minmax"`` selects ``MinMaxTransformer``; any other value selects
        ``NormalizationTransformer``. Only ``y`` is transformed.
    **kwargs
        Optional settings read by this function: ``max_len`` and ``pad_len``
        (smiles2seq); ``img_size``, ``img_spec`` and ``res`` (smiles2img);
        ``frac_train``, ``frac_valid`` and ``frac_test`` (splitting).

    Returns
    -------
    tuple
        ``(tasks, datasets, transformers)``. When a split is performed
        ``datasets`` is ``(train, valid, test)``; when ``split`` is None it
        is ``(dataset, None, None)``. When a saved dataset is restored,
        ``datasets`` is whatever ``load_dataset_from_disk`` returned.

    Raises
    ------
    ValueError
        If ``featurizer`` is not a supported name.
    KeyError
        If ``split`` is not None and not a known splitter name.
    """
    if data_dir is None:
        data_dir = DEFAULT_DIR
    if save_dir is None:
        save_dir = DEFAULT_DIR

    save_folder = os.path.join(save_dir, "chembl_25-featurized",
                               str(featurizer))
    if featurizer == "smiles2img":
        # Must use the same default as the featurizer below; otherwise data
        # featurized with one image spec is cached under another spec's name.
        img_spec = kwargs.get("img_spec", "engd")
        save_folder = os.path.join(save_folder, img_spec)

    if reload:
        if not os.path.exists(save_folder):
            logger.warning("{} does not exist. Reconstructing dataset.".format(
                save_folder))
        else:
            logger.info("{} exists. Restoring dataset.".format(save_folder))
            loaded, dataset, transformers = dc.utils.data_utils.load_dataset_from_disk(
                save_folder)
            if loaded:
                return chembl25_tasks, dataset, transformers

    dataset_file = os.path.join(data_dir, "chembl_25.csv.gz")

    if not os.path.exists(dataset_file):
        logger.warning(
            "File {} not found. Downloading dataset. (~555 MB)".format(
                dataset_file))
        dc.utils.data_utils.download_url(url=CHEMBL_URL, dest_dir=data_dir)

    # Resolve the featurizer name to an instance. The package is referenced
    # through the `dc` alias, like every other call in this function.
    if featurizer == 'ECFP':
        featurizer = dc.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer = dc.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
        featurizer = dc.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
        featurizer = dc.feat.RawFeaturizer()
    elif featurizer == "smiles2seq":
        max_len = kwargs.get('max_len', 250)
        pad_len = kwargs.get('pad_len', 10)
        char_to_idx = create_char_to_idx(dataset_file,
                                         max_len=max_len,
                                         smiles_field="smiles")
        featurizer = SmilesToSeq(char_to_idx=char_to_idx,
                                 max_len=max_len,
                                 pad_len=pad_len)
    elif featurizer == "smiles2img":
        img_size = kwargs.get("img_size", 80)
        img_spec = kwargs.get("img_spec", "engd")
        res = kwargs.get("res", 0.5)
        featurizer = SmilesToImage(img_size=img_size,
                                   img_spec=img_spec,
                                   res=res)
    else:
        raise ValueError(
            "Featurizer of type {} is not supported".format(featurizer))

    loader = dc.data.CSVLoader(tasks=chembl25_tasks,
                               smiles_field='smiles',
                               featurizer=featurizer)
    dataset = loader.featurize(input_files=[dataset_file],
                               shard_size=10000,
                               data_dir=save_folder)

    if split is None:
        # No split requested: fit the transformer on the full dataset.
        transformers = _chembl25_y_transformers(transformer_type, dataset)

        logger.info("Split is None, about to transform dataset.")
        for transformer in transformers:
            dataset = transformer.transform(dataset)
        return chembl25_tasks, (dataset, None, None), transformers

    splitters = {
        'index': dc.splits.IndexSplitter(),
        'random': dc.splits.RandomSplitter(),
        'scaffold': dc.splits.ScaffoldSplitter(),
    }

    logger.info("About to split data with {} splitter.".format(split))
    splitter = splitters[split]

    frac_train = kwargs.get('frac_train', 4 / 6)
    frac_valid = kwargs.get('frac_valid', 1 / 6)
    frac_test = kwargs.get('frac_test', 1 / 6)

    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         seed=split_seed,
                                                         frac_train=frac_train,
                                                         frac_test=frac_test,
                                                         frac_valid=frac_valid)

    # Fit on the training split only, then apply to all three splits.
    transformers = _chembl25_y_transformers(transformer_type, train)

    for transformer in transformers:
        train = transformer.transform(train)
        valid = transformer.transform(valid)
        test = transformer.transform(test)

    if reload:
        dc.utils.data_utils.save_dataset_to_disk(save_folder, train, valid,
                                                 test, transformers)

    return chembl25_tasks, (train, valid, test), transformers


def _chembl25_y_transformers(transformer_type, dataset):
    """Return the list of y-only transformers fitted on ``dataset``."""
    if transformer_type == "minmax":
        transformer_cls = dc.trans.MinMaxTransformer
    else:
        transformer_cls = dc.trans.NormalizationTransformer
    return [
        transformer_cls(transform_X=False, transform_y=True, dataset=dataset)
    ]