Example #1
  def transform(self, dataset):
    """Performs power transform on data."""
    X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
    # Default to passing data through unchanged so all variables are defined
    # even when only one of transform_X / transform_y is set.
    X_t = X
    y_t = y
    w_t = w
    ids_t = ids
    n_powers = len(self.powers)
    if self.transform_X:
      X_t = np.power(X, self.powers[0])
      for i in range(1, n_powers):
        X_t = np.hstack((X_t, np.power(X, self.powers[i])))
    if self.transform_y:
      print("y will not be transformed by PowerTransformer, for now.")
      """
      y_t = np.power(y, self.powers[0])
      for i in range(1, n_powers):
      	y_t = np.hstack((y_t,np.power(y, self.powers[i])))
      X_t = X
      """

    # TODO (rbharath): Find a more elegant solution to saving the data?
    shutil.rmtree(dataset.data_dir)
    os.makedirs(dataset.data_dir)
    DiskDataset.from_numpy(X_t, y_t, w_t, ids_t, data_dir=dataset.data_dir)
    return dataset
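A minimal NumPy-only sketch of the column expansion that transform_X performs above, using a hypothetical powers list of [1, 2, 0.5]:

import numpy as np

powers = [1, 2, 0.5]                    # hypothetical powers
X = np.array([[1.0, 4.0],
              [9.0, 16.0]])

# One block of columns per power, stacked horizontally as in transform().
X_t = np.hstack([np.power(X, p) for p in powers])
print(X_t.shape)                        # (2, 6)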
Example #2
    def transform(self, dataset):
        """Performs power transform on data."""
        X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
        # Default to passing data through unchanged so all variables are defined
        # even when only one of transform_X / transform_y is set.
        X_t = X
        y_t = y
        w_t = w
        ids_t = ids
        n_powers = len(self.powers)
        if self.transform_X:
            X_t = np.power(X, self.powers[0])
            for i in range(1, n_powers):
                X_t = np.hstack((X_t, np.power(X, self.powers[i])))
        if self.transform_y:
            print("y will not be transformed by PowerTransformer, for now.")
            """
      y_t = np.power(y, self.powers[0])
      for i in range(1, n_powers):
      	y_t = np.hstack((y_t,np.power(y, self.powers[i])))
      X_t = X
      """

        # TODO (rbharath): Find a more elegant solution to saving the data?
        shutil.rmtree(dataset.data_dir)
        os.makedirs(dataset.data_dir)
        DiskDataset.from_numpy(X_t, y_t, w_t, ids_t, data_dir=dataset.data_dir)
        return dataset
Example #3
  def _to_singletask(dataset, task_dirs):
    """Transforms a multitask dataset to a collection of singletask datasets."""
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    log("Splitting multitask dataset into singletask datasets", dataset.verbosity)
    task_metadata_rows = {task: [] for task in tasks}
    for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
      log("Processing shard %d" % shard_num, dataset.verbosity)
      basename = "dataset-%d" % shard_num
      for task_num, task in enumerate(tasks):
        log("\tTask %s" % task, dataset.verbosity)
        w_task = w[:, task_num]
        y_task = y[:, task_num]

        # Extract those datapoints which are present for this task
        X_nonzero = X[w_task != 0]
        num_datapoints = X_nonzero.shape[0]
        y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
        w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
        ids_nonzero = ids[w_task != 0]

        if X_nonzero.size > 0: 
          task_metadata_rows[task].append(
            DiskDataset.write_data_to_disk(
                task_dirs[task_num], basename, [task],
                X_nonzero, y_nonzero, w_nonzero, ids_nonzero))
    
    task_datasets = [
        DiskDataset(data_dir=task_dirs[task_num],
                metadata_rows=task_metadata_rows[task],
                verbosity=dataset.verbosity)
        for (task_num, task) in enumerate(tasks)]
    return task_datasets
Example #4
def load_tox21(featurizer='ECFP', split='index'):
    """Load Tox21 datasets. Does not do train/test split"""
    # Featurize Tox21 dataset
    print("About to featurize Tox21 dataset.")
    current_dir = os.path.dirname(os.path.realpath(__file__))

    dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
    data_dir = deepchem.utils.get_data_dir()

    tox21_tasks = [
        'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
        'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
    ]

    dataset_dir = os.path.join(data_dir, "tox21", featurizer, split)
    train, valid, test = os.path.join(dataset_dir, 'train'), os.path.join(
        dataset_dir, 'valid'), os.path.join(dataset_dir, 'test')
    if os.path.isdir(dataset_dir):
        train, valid, test = DiskDataset(data_dir=train), DiskDataset(
            data_dir=valid), DiskDataset(data_dir=test)
        transformers = [
            dc.trans.BalancingTransformer(transform_w=True, dataset=train)
        ]
        return tox21_tasks, (train, valid, test), transformers
    if featurizer == 'ECFP':
        featurizer_func = dc.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer_func = dc.feat.ConvMolFeaturizer()
    elif featurizer == 'AdjMatrix':
        featurizer_func = dc.feat.AdjacencyFingerprint(num_atoms_feature=True)
    loader = dc.data.CSVLoader(tasks=tox21_tasks,
                               smiles_field="smiles",
                               featurizer=featurizer_func)
    dataset = loader.featurize(dataset_file, shard_size=8192)

    # Initialize transformers
    transformers = [
        dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]

    print("About to transform data")
    for transformer in transformers:
        dataset = transformer.transform(dataset)

    splitters = {
        'index': dc.splits.IndexSplitter(),
        'random': dc.splits.RandomSplitter(),
        'scaffold': dc.splits.ScaffoldSplitter(),
        'butina': dc.splits.ButinaSplitter()
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         train_dir=train,
                                                         valid_dir=valid,
                                                         test_dir=test)

    return tox21_tasks, (train, valid, test), transformers
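A hedged usage sketch of the loader above, assuming it is importable from the surrounding module and that the DeepChem data directory is writable. The returned balancing transformers have already been applied to the featurized data:

tasks, (train, valid, test), transformers = load_tox21(featurizer='ECFP',
                                                       split='index')
print(len(tasks))        # 12 Tox21 assays
print(train.X.shape)     # (n_train, 1024) circular fingerprints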
Example #5
    def k_fold_split(self, dataset, k, directories=None, **kwargs):
        """
    Parameters
    ----------
    dataset: `dc.data.Dataset`
      Dataset to do a k-fold split
    k: int
      Number of folds to split `dataset` into.
    directories: list[str]
      List of 2*k directory paths in which the resulting disk datasets are saved.

    Returns
    -------
    List of length k of (train, cv) tuples, where `train` and `cv` are both
    `Dataset` objects.
    """
        logger.info("Computing K-fold split")
        if directories is None:
            directories = [tempfile.mkdtemp() for _ in range(2 * k)]
        else:
            assert len(directories) == 2 * k
        cv_datasets = []
        train_ds_base = None
        train_datasets = []
        # rem_dataset is remaining portion of dataset
        if isinstance(dataset, DiskDataset):
            rem_dataset = dataset
        else:
            rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y,
                                                 dataset.w, dataset.ids)
        for fold in range(k):
            # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
            # to k-1.
            frac_fold = 1. / (k - fold)
            train_dir, cv_dir = directories[2 * fold], directories[2 * fold +
                                                                   1]
            fold_inds, rem_inds, _ = self.split(rem_dataset,
                                                frac_train=frac_fold,
                                                frac_valid=1 - frac_fold,
                                                frac_test=0,
                                                **kwargs)
            cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
            cv_datasets.append(cv_dataset)
            rem_dataset = rem_dataset.select(rem_inds)

            train_ds_to_merge = filter(lambda x: x is not None,
                                       [train_ds_base, rem_dataset])
            train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
            train_dataset = DiskDataset.merge(train_ds_to_merge,
                                              merge_dir=train_dir)
            train_datasets.append(train_dataset)

            update_train_base_merge = filter(lambda x: x is not None,
                                             [train_ds_base, cv_dataset])
            train_ds_base = DiskDataset.merge(update_train_base_merge)
        return list(zip(train_datasets, cv_datasets))
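A short usage sketch for k_fold_split, assuming a RandomSplitter and a small random dataset materialized as a DiskDataset:

import numpy as np
import deepchem as dc

X = np.random.rand(20, 4)
y = np.random.rand(20, 1)
dataset = dc.data.DiskDataset.from_numpy(X, y)

splitter = dc.splits.RandomSplitter()
# Returns a list of k (train, cv) Dataset pairs, as documented above.
folds = splitter.k_fold_split(dataset, k=4)
for train, cv in folds:
    print(len(train), len(cv))   # roughly 15 / 5 per fold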
Example #6
  def _to_singletask(dataset, task_dirs):
    """Transforms a multitask dataset to a collection of singletask datasets."""
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    log("Splitting multitask dataset into singletask datasets", dataset.verbosity)
    task_metadata_rows = {task: [] for task in tasks}
    for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
      log("Processing shard %d" % shard_num, dataset.verbosity)
      basename = "dataset-%d" % shard_num
      for task_num, task in enumerate(tasks):
        log("\tTask %s" % task, dataset.verbosity)
        w_task = w[:, task_num]
        y_task = y[:, task_num]

        # Extract those datapoints which are present for this task
        X_nonzero = X[w_task != 0]
        num_datapoints = X_nonzero.shape[0]
        y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
        w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
        ids_nonzero = ids[w_task != 0]

        if X_nonzero.size > 0: 
          task_metadata_rows[task].append(
            DiskDataset.write_data_to_disk(
                task_dirs[task_num], basename, [task],
                X_nonzero, y_nonzero, w_nonzero, ids_nonzero))
    
    task_datasets = [
        DiskDataset(data_dir=task_dirs[task_num],
                metadata_rows=task_metadata_rows[task],
                verbosity=dataset.verbosity)
        for (task_num, task) in enumerate(tasks)]
    return task_datasets
Example #7
  def _to_singletask(dataset, task_dirs):
    """Transforms a multitask dataset to a collection of singletask datasets."""
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    logger.info("Splitting multitask dataset into singletask datasets")
    task_datasets = [
        DiskDataset.create_dataset([], task_dirs[task_num], [task.item()])
        for (task_num, task) in enumerate(tasks)
    ]
    #task_metadata_rows = {task: [] for task in tasks}
    for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
      logger.info("Processing shard %d" % shard_num)
      basename = "dataset-%d" % shard_num
      for task_num, task in enumerate(tasks):
        logger.info("\tTask %s" % task)
        if len(w.shape) == 1:
          w_task = w
        elif w.shape[1] == 1:
          w_task = w[:, 0]
        else:
          w_task = w[:, task_num]
        y_task = y[:, task_num]

        # Extract those datapoints which are present for this task
        X_nonzero = X[w_task != 0]
        num_datapoints = X_nonzero.shape[0]
        y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
        w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
        ids_nonzero = ids[w_task != 0]

        task_datasets[task_num].add_shard(X_nonzero, y_nonzero, w_nonzero,
                                          ids_nonzero)

    return task_datasets
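A NumPy-only sketch of the per-task masking used above, where a zero weight marks a missing label for that task (toy arrays with hypothetical values):

import numpy as np

# 4 samples, 2 tasks; w == 0 marks labels that are absent for a task.
y = np.array([[1., 0.], [0., 1.], [1., 1.], [0., 0.]])
w = np.array([[1., 0.], [1., 1.], [0., 1.], [1., 1.]])

task_num = 0
w_task = w[:, task_num]
y_task = y[:, task_num]

mask = w_task != 0
y_nonzero = y_task[mask].reshape(-1, 1)   # column vector of present labels
w_nonzero = w_task[mask].reshape(-1, 1)
print(mask.sum(), y_nonzero.shape)        # 3 (3, 1)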
Example #8
    def featurize(self, input_files, data_dir=None, shard_size=8192):
        """Featurize provided files and write to specified location."""
        log("Loading raw samples now.", self.verbose)
        log("shard_size: %d" % shard_size, self.verbose)

        if not isinstance(input_files, list):
            input_files = [input_files]

        def shard_generator():
            for shard_num, shard in enumerate(
                    self.get_shards(input_files, shard_size)):
                time1 = time.time()
                X, valid_inds = self.featurize_shard(shard)
                ids, y, w = convert_df_to_numpy(shard, self.tasks,
                                                self.id_field)
                # Filter out examples where featurization failed.
                ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
                assert len(X) == len(ids) == len(y) == len(w)
                time2 = time.time()
                log(
                    "TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1), self.verbose)
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(), data_dir,
                                          self.tasks)
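A hedged sketch of how a loader like the one above is typically driven. The CSV filename and task name are hypothetical; depending on the DeepChem version the SMILES column is passed as smiles_field (older API) or feature_field, and featurization is triggered by featurize() or create_dataset():

import deepchem as dc

featurizer = dc.feat.CircularFingerprint(size=1024)
loader = dc.data.CSVLoader(tasks=["activity"],        # hypothetical task column
                           smiles_field="smiles",
                           featurizer=featurizer)
dataset = loader.featurize("activity_data.csv",       # hypothetical file
                           shard_size=8192)
print(dataset.X.shape, dataset.y.shape)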
Example #9
    def test_fit(self):
        tf_enc = TensorflowMoleculeEncoder.zinc_encoder()

        smiles = [
            "Cn1cnc2c1c(=O)n(C)c(=O)n2C",
            "O=C(O)[C@@H]1/C(=C/CO)O[C@@H]2CC(=O)N21",
            "Cn1c2nncnc2c(=O)n(C)c1=O", "Cn1cnc2c1c(=O)[nH]c(=O)n2C",
            "NC(=O)c1ncc[nH]c1=O", "O=C1OCc2c1[nH]c(=O)[nH]c2=O",
            "Cn1c(N)c(N)c(=O)n(C)c1=O", "CNc1nc2c([nH]1)c(=O)[nH]c(=O)n2C",
            "CC(=O)N1CN(C(C)=O)[C@@H](O)[C@@H]1O",
            "CC(=O)N1CN(C(C)=O)[C@H](O)[C@H]1O", "Cc1[nH]c(=O)[nH]c(=O)c1CO",
            "O=C1NCCCc2c1no[n+]2[O-]", "Cc1nc(C(N)=O)c(N)n1CCO",
            "O=c1[nH]cc(N2CCOCC2)c(=O)[nH]1"
        ]

        featurizer = dc.feat.one_hot.OneHotFeaturizer(zinc_charset, 120)
        mols = [Chem.MolFromSmiles(x) for x in smiles]
        features = featurizer.featurize(mols)

        dataset = DiskDataset.from_numpy(features, features)
        prediction = tf_enc.predict_on_batch(dataset.X)
        tf_de = TensorflowMoleculeDecoder.zinc_decoder()
        one_hot_decoded = tf_de.predict_on_batch(prediction)
        decoded_smiles = featurizer.untransform(one_hot_decoded)
        assert_equals(len(decoded_smiles), len(smiles))
Example #10
    def create_dataset(self,
                       input_files: OneOrMany[str],
                       data_dir: Optional[str] = None,
                       shard_size: Optional[int] = None) -> DiskDataset:
        """Creates a `Dataset` from input FASTA files.

    At present, FASTA support is limited and only allows for one-hot
    featurization, and doesn't allow for sharding.

    Parameters
    ----------
    input_files: list
      List of fasta files.
    data_dir: str, optional
      Name of directory where featurized data is stored.
    shard_size: int, optional
      For now, this argument is ignored and each FASTA file gets its
      own shard. 

    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    """
        if isinstance(input_files, str):
            input_files = [input_files]

        def shard_generator():
            for input_file in input_files:
                X = encode_fasta_sequence(input_file)
                ids = np.ones(len(X))
                # (X, y, w, ids)
                yield X, None, None, ids

        return DiskDataset.create_dataset(shard_generator(), data_dir)
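A hedged usage sketch for the FASTA loader above, assuming a hypothetical FASTA file on disk; each input file becomes its own shard, as noted in the docstring:

import deepchem as dc

loader = dc.data.FASTALoader()
dataset = loader.create_dataset("sequences.fasta")   # hypothetical FASTA file
print(dataset.X.shape)                               # one-hot encoded sequences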
Example #11
  def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location."""
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

    if not isinstance(input_files, list):
      input_files = [input_files]

    def shard_generator():
      for shard_num, shard in enumerate(
          self.get_shards(input_files, shard_size)):
        time1 = time.time()
        X, valid_inds = self.featurize_shard(shard)
        ids = shard[self.id_field].values
        ids = ids[valid_inds]
        if len(self.tasks) > 0:
          # Featurize task results iff they exist.
          y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
          # Filter out examples where featurization failed.
          y, w = (y[valid_inds], w[valid_inds])
          assert len(X) == len(ids) == len(y) == len(w)
        else:
          # For prospective data where results are unknown, it makes
          # no sense to have y values or weights.
          y, w = (None, None)
          assert len(X) == len(ids)

        time2 = time.time()
        log("TIMING: featurizing shard %d took %0.3f s" %
            (shard_num, time2 - time1), self.verbose)
        yield X, y, w, ids

    return DiskDataset.create_dataset(
        shard_generator(), data_dir, self.tasks, verbose=self.verbose)
Example #12
    def create_dataset(self, input_files, data_dir=None, shard_size=8192):
        """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `input_files` and uses `self.featurizer` to featurize the
    data in these input files.  For large files, automatically shards
    into smaller chunks of `shard_size` datapoints for convenience.
    Returns a `Dataset` object that contains the featurized dataset.

    This implementation assumes that the helper methods `_get_shards`
    and `_featurize_shard` are implemented and that each shard
    returned by `_get_shards` is a pandas dataframe.  You may choose
    to reuse or override this method in your subclass implementations.

    Parameters
    ----------
    input_files: list
      List of input filenames.
    data_dir: str, optional
      Directory to store featurized dataset.
    shard_size: int, optional
      Number of examples stored in each shard.

    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    """
        logger.info("Loading raw samples now.")
        logger.info("shard_size: %d" % shard_size)

        if not isinstance(input_files, list):
            input_files = [input_files]

        def shard_generator():
            for shard_num, shard in enumerate(
                    self._get_shards(input_files, shard_size)):
                time1 = time.time()
                X, valid_inds = self._featurize_shard(shard)
                ids = shard[self.id_field].values
                ids = ids[valid_inds]
                if len(self.tasks) > 0:
                    # Featurize task results iff they exist.
                    y, w = _convert_df_to_numpy(shard, self.tasks)
                    # Filter out examples where featurization failed.
                    y, w = (y[valid_inds], w[valid_inds])
                    assert len(X) == len(ids) == len(y) == len(w)
                else:
                    # For prospective data where results are unknown, it
                    # makes no sense to have y values or weights.
                    y, w = (None, None)
                    assert len(X) == len(ids)

                time2 = time.time()
                logger.info("TIMING: featurizing shard %d took %0.3f s" %
                            (shard_num, time2 - time1))
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(), data_dir,
                                          self.tasks)
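A minimal sketch of the shard-generator pattern these loaders rely on, feeding toy NumPy shards to DiskDataset.create_dataset instead of featurized dataframes (the data directory is left to the default temporary directory):

import numpy as np
from deepchem.data import DiskDataset

def shard_generator():
    # Two toy shards of (X, y, w, ids), mirroring the generator above.
    for shard_num in range(2):
        X = np.random.rand(8, 4)
        y = np.random.rand(8, 1)
        w = np.ones((8, 1))
        ids = np.arange(shard_num * 8, (shard_num + 1) * 8)
        yield X, y, w, ids

dataset = DiskDataset.create_dataset(shard_generator(), tasks=["task0"])
print(len(dataset))   # 16 datapoints across 2 shards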
Example #13
  def featurize_complexes(self, mol_files, protein_files):
    pool = multiprocessing.Pool()
    results = []
    for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_files)):
      log_message = "Featurizing %d / %d" % (i, len(mol_files))
      results.append(
          pool.apply_async(_featurize_complex,
                           (self, mol_file, protein_pdb, log_message)))
    pool.close()
    features = []
    failures = []
    for ind, result in enumerate(results):
      new_features = result.get()
      # Handle loading failures which return None
      if new_features is not None:
        features.append(new_features)
      else:
        failures.append(ind)

    features = np.asarray(features)
    labels = np.delete(self.labels, failures)
    dataset = DiskDataset.from_numpy(features, labels)

    # Fit atomic conv model
    self.atomic_conv_model.fit(dataset, nb_epoch=self.epochs)

    # Add the Atomic Convolution layers to fetches
    layers_to_fetch = list()
    for layer in self.atomic_conv_model.layers.values():
      if isinstance(layer,
                    dc.models.tensorgraph.models.atomic_conv.AtomicConvolution):
        layers_to_fetch.append(layer)

    # Extract the atomic convolution features
    atomic_conv_features = list()
    feed_dict_generator = self.atomic_conv_model.default_generator(
        dataset=dataset, epochs=1)

    for feed_dict in self.atomic_conv_model._create_feed_dicts(
        feed_dict_generator, training=False):
      frag1_conv, frag2_conv, complex_conv = self.atomic_conv_model._run_graph(
          outputs=layers_to_fetch, feed_dict=feed_dict, training=False)
      concatenated = np.concatenate(
          [frag1_conv, frag2_conv, complex_conv], axis=1)
      atomic_conv_features.append(concatenated)

    batch_size = self.atomic_conv_model.batch_size

    if len(features) % batch_size != 0:
      num_batches = (len(features) // batch_size) + 1
      num_to_skip = num_batches * batch_size - len(features)
    else:
      num_to_skip = 0

    atomic_conv_features = np.asarray(atomic_conv_features)
    atomic_conv_features = atomic_conv_features[-num_to_skip:]
    atomic_conv_features = np.squeeze(atomic_conv_features)

    return atomic_conv_features, failures
Example #14
    def featurize_complexes(self, mol_files, protein_files):
        pool = multiprocessing.Pool()
        results = []
        for i, (mol_file,
                protein_pdb) in enumerate(zip(mol_files, protein_files)):
            log_message = "Featurizing %d / %d" % (i, len(mol_files))
            results.append(
                pool.apply_async(_featurize_complex,
                                 (self, mol_file, protein_pdb, log_message)))
        pool.close()
        features = []
        failures = []
        for ind, result in enumerate(results):
            new_features = result.get()
            # Handle loading failures which return None
            if new_features is not None:
                features.append(new_features)
            else:
                failures.append(ind)

        features = np.asarray(features)
        labels = np.delete(self.labels, failures)
        dataset = DiskDataset.from_numpy(features, labels)

        # Fit atomic conv model
        self.atomic_conv_model.fit(dataset, nb_epoch=self.epochs)

        # Add the Atomic Convolution layers to fetches
        layers_to_fetch = list()
        for layer in self.atomic_conv_model.layers.values():
            if isinstance(layer, dc.models.atomic_conv.AtomicConvolution):
                layers_to_fetch.append(layer)

        # Extract the atomic convolution features
        atomic_conv_features = list()
        feed_dict_generator = self.atomic_conv_model.default_generator(
            dataset=dataset, epochs=1)

        for feed_dict in self.atomic_conv_model._create_feed_dicts(
                feed_dict_generator, training=False):
            frag1_conv, frag2_conv, complex_conv = self.atomic_conv_model._run_graph(
                outputs=layers_to_fetch, feed_dict=feed_dict, training=False)
            concatenated = np.concatenate(
                [frag1_conv, frag2_conv, complex_conv], axis=1)
            atomic_conv_features.append(concatenated)

        batch_size = self.atomic_conv_model.batch_size

        if len(features) % batch_size != 0:
            num_batches = (len(features) // batch_size) + 1
            num_to_skip = num_batches * batch_size - len(features)
        else:
            num_to_skip = 0

        atomic_conv_features = np.asarray(atomic_conv_features)
        atomic_conv_features = atomic_conv_features[-num_to_skip:]
        atomic_conv_features = np.squeeze(atomic_conv_features)

        return atomic_conv_features, failures
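The remainder arithmetic at the end of featurize_complexes, isolated as a small sketch with hypothetical sizes (predictions are produced in full batches, so padded entries must be accounted for):

batch_size = 16
n_samples = 37                              # hypothetical number of complexes
num_batches = -(-n_samples // batch_size)   # ceiling division: 3 batches
num_to_skip = num_batches * batch_size - n_samples
print(num_batches, num_to_skip)             # 3 11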
Example #15
    def create_dataset(self,
                       inputs: Sequence[Any],
                       data_dir: Optional[str] = None,
                       shard_size: Optional[int] = 8192) -> DiskDataset:
        """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `inputs` and uses `self.featurizer` to featurize the
    data in these input files.  For large files, automatically shards
    into smaller chunks of `shard_size` datapoints for convenience.
    Returns a `Dataset` object that contains the featurized dataset.

    This implementation assumes that the helper methods `_get_shards`
    and `_featurize_shard` are implemented and that each shard
    returned by `_get_shards` is a pandas dataframe.  You may choose
    to reuse or override this method in your subclass implementations.

    Parameters
    ----------
    inputs: Sequence[Any]
      List of inputs to process. Entries can be arbitrary objects so long as
      they are understood by `self.featurizer`
    data_dir: str, optional (default None)
      Directory to store featurized dataset.
    shard_size: int, optional (default 8192)
      Number of examples stored in each shard.

    Returns
    -------
    DiskDataset
      A `DiskDataset` object containing a featurized representation of data
      from `inputs`.
    """
        logger.info("Loading raw samples now.")
        logger.info("shard_size: %s" % str(shard_size))

        if not isinstance(inputs, list):
            try:
                inputs = list(inputs)
            except TypeError:
                inputs = [inputs]

        def shard_generator():
            global_index = 0
            for shard_num, shard in enumerate(
                    self._get_shards(inputs, shard_size)):
                time1 = time.time()
                X, y, w, ids = self._featurize_shard(shard, global_index)
                global_index += len(shard)

                time2 = time.time()
                logger.info("TIMING: featurizing shard %d took %0.3f s" %
                            (shard_num, time2 - time1))
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(), data_dir,
                                          self.tasks)
Example #16
def test_select_attrs_by_dset_smiles():
    # Test that the method can subset an attr dataframe according to a DiskDataset. In this case, the attr_ids need to be converted back to SMILES to match the input dataset.
    dataset = DiskDataset.from_numpy(
        test_scaffold.X,
        test_scaffold.y,
        ids=test_scaffold_attr[data_obj_scaffold.params.smiles_col].values)

    newDD = split.select_attrs_by_dset_smiles(
        dataset, data_obj_scaffold.attr, data_obj_scaffold.params.smiles_col)
    assert newDD.equals(test_scaffold_attr)
Example #17
def test_select_dset_by_attr_ids_using_smiles():
    # Test that the method can subset a dataset according to its attr ids into the correct DeepChem DiskDataset. In this case, the attr_ids are converted back to SMILES to match the input dataset.
    dataset = DiskDataset.from_numpy(
        data_obj_scaffold.dataset.X,
        data_obj_scaffold.dataset.y,
        ids=data_obj_scaffold.attr[data_obj_scaffold.params.smiles_col].values)
    newdf = pd.DataFrame({'compound_ids': test_scaffold_attr.index.tolist()},
                         index=test_scaffold_attr.smiles)
    newDD = split.select_dset_by_attr_ids(dataset, newdf)
    assert (newDD.y == test_scaffold.y).all()
Example #18
    def split(self, dataset, frac_split, split_dirs=None):
        """
    Method that does bulk of splitting dataset.
    """
        if split_dirs is not None:
            assert len(split_dirs) == 2
        else:
            split_dirs = [tempfile.mkdtemp(), tempfile.mkdtemp()]

        # Handle edge case where frac_split is 1
        if frac_split == 1:
            dataset_1 = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                               dataset.ids)
            dataset_2 = None
            return dataset_1, dataset_2
        X, y, w, ids = randomize_arrays(
            (dataset.X, dataset.y, dataset.w, dataset.ids))
        if len(y.shape) == 1:
            y = np.expand_dims(y, 1)
        if len(w.shape) == 1:
            w = np.expand_dims(w, 1)
        split_indices = self.get_task_split_indices(y, w, frac_split)

        # Create weight matrices for the two halves.
        w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
        for task, split_index in enumerate(split_indices):
            # Copy weights up to the split index into the first half and the
            # remainder into the second half.
            w_1[:split_index, task] = w[:split_index, task]
            w_2[split_index:, task] = w[split_index:, task]

        # Check whether any rows in w_1 or w_2 are all zeros.
        rows_1 = w_1.any(axis=1)
        X_1, y_1, w_1, ids_1 = X[rows_1], y[rows_1], w_1[rows_1], ids[rows_1]
        dataset_1 = DiskDataset.from_numpy(X_1, y_1, w_1, ids_1)

        rows_2 = w_2.any(axis=1)
        X_2, y_2, w_2, ids_2 = X[rows_2], y[rows_2], w_2[rows_2], ids[rows_2]
        dataset_2 = DiskDataset.from_numpy(X_2, y_2, w_2, ids_2)

        return dataset_1, dataset_2
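A NumPy-only sketch of the per-task weight split above, with hypothetical split indices:

import numpy as np

# 5 samples, 2 tasks; all labels present. Cut each task at a hypothetical index.
w = np.ones((5, 2))
split_indices = [3, 2]

w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
for task, split_index in enumerate(split_indices):
    w_1[:split_index, task] = w[:split_index, task]
    w_2[split_index:, task] = w[split_index:, task]

rows_1 = w_1.any(axis=1)   # rows that carry weight in the first half
rows_2 = w_2.any(axis=1)
print(rows_1.sum(), rows_2.sum())   # 3 3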
Example #19
  def target_4_dataset_save(self, dataset, file):
    compound = dataset.ids.tolist()
    # get_task_names() may return an array; make it a list so .index() works below.
    target = list(dataset.get_task_names())
    print(target)
    w = dataset.w
    print('w.shape')
    print(w.shape)
    compound_4_target = []
    target_4 = ['P21728', 'P14416', 'P08908', 'P28223']

    target_4 = sorted(target_4, key=lambda x: target.index(x))
    target_4_index = [target.index(i) for i in target_4]
    print('target_4')
    print(target_4_index)
    # Keep compounds with a nonzero weight (i.e. a measurement) for at least
    # one of the four selected targets.
    for i in range(len(compound)):
      z = 0
      for j in target_4_index:
        if w[i, j] > 0:
          z = z + 1
      if z > 0:
        compound_4_target.append(i)

    dataset1 = dataset.select(compound_4_target)
    print(compound_4_target)
    cpd = compound_4_target
    metadata_rows = []
    shard_generator = self.shard_generator(cpd, target_4_index, dataset1)
    for shard_num, (X, y, w, ids) in enumerate(shard_generator):
      basename = "shard-%d" % shard_num
      metadata_rows.append(
          DiskDataset.write_data_to_disk(file, basename, target_4, X, y, w,
                                         ids))
    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    self.save_metadata(target_4, metadata_df, file)
Example #20
  def split(self, dataset, frac_split, split_dirs=None):
    """
    Method that does bulk of splitting dataset.
    """
    if split_dirs is not None:
      assert len(split_dirs) == 2
    else:
      split_dirs = [tempfile.mkdtemp(), tempfile.mkdtemp()]

    # Handle edge case where frac_split is 1
    if frac_split == 1:
      dataset_1 = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                         dataset.ids)
      dataset_2 = None
      return dataset_1, dataset_2
    X, y, w, ids = randomize_arrays((dataset.X, dataset.y, dataset.w,
                                     dataset.ids))
    if len(y.shape) == 1:
      y = np.expand_dims(y, 1)
    if len(w.shape) == 1:
      w = np.expand_dims(w, 1)
    split_indices = self.get_task_split_indices(y, w, frac_split)

    # Create weight matrices for the two halves.
    w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
    for task, split_index in enumerate(split_indices):
      # Copy weights up to the split index into the first half and the
      # remainder into the second half.
      w_1[:split_index, task] = w[:split_index, task]
      w_2[split_index:, task] = w[split_index:, task]

    # Check whether any rows in w_1 or w_2 are all zeros.
    rows_1 = w_1.any(axis=1)
    X_1, y_1, w_1, ids_1 = X[rows_1], y[rows_1], w_1[rows_1], ids[rows_1]
    dataset_1 = DiskDataset.from_numpy(X_1, y_1, w_1, ids_1)

    rows_2 = w_2.any(axis=1)
    X_2, y_2, w_2, ids_2 = X[rows_2], y[rows_2], w_2[rows_2], ids[rows_2]
    dataset_2 = DiskDataset.from_numpy(X_2, y_2, w_2, ids_2)

    return dataset_1, dataset_2
Example #21
    def featurize_complexes(self, mol_files, protein_files):
        features = []
        failures = []
        for i, (mol_file,
                protein_pdb) in enumerate(zip(mol_files, protein_files)):
            logging.info("Featurizing %d / %d" % (i, len(mol_files)))
            new_features = self._featurize_complex(mol_file, protein_pdb)
            # Handle loading failures which return None
            if new_features is not None:
                features.append(new_features)
            else:
                failures.append(i)

        features = np.asarray(features)
        labels = np.delete(self.labels, failures)
        dataset = DiskDataset.from_numpy(features, labels)

        # Fit atomic conv model
        self.atomic_conv_model.fit(dataset, nb_epoch=self.epochs)

        # Add the Atomic Convolution layers to fetches
        layers_to_fetch = [
            self.atomic_conv_model._frag1_conv,
            self.atomic_conv_model._frag2_conv,
            self.atomic_conv_model._complex_conv
        ]

        # Extract the atomic convolution features
        atomic_conv_features = list()
        batch_generator = self.atomic_conv_model.default_generator(
            dataset=dataset, epochs=1)

        for X, y, w in batch_generator:
            frag1_conv, frag2_conv, complex_conv = self.atomic_conv_model.predict_on_generator(
                [(X, y, w)], outputs=layers_to_fetch)
            concatenated = np.concatenate(
                [frag1_conv, frag2_conv, complex_conv], axis=1)
            atomic_conv_features.append(concatenated)

        batch_size = self.atomic_conv_model.batch_size

        if len(features) % batch_size != 0:
            num_batches = (len(features) // batch_size) + 1
            num_to_skip = num_batches * batch_size - len(features)
        else:
            num_to_skip = 0

        atomic_conv_features = np.asarray(atomic_conv_features)
        atomic_conv_features = atomic_conv_features[-num_to_skip:]
        atomic_conv_features = np.squeeze(atomic_conv_features)

        return atomic_conv_features, failures
Example #22
    def featurize(self, input_files, data_dir=None, shard_size=8192):
        """Featurize provided files and write to specified location.
    
    For large datasets, automatically shards into smaller chunks
    for convenience.

    Parameters
    ----------
    input_files: list
      List of input filenames.
    data_dir: str
      (Optional) Directory to store featurized dataset.
    shard_size: int
      (Optional) Number of examples stored in each shard.
    """
        log("Loading raw samples now.", self.verbose)
        log("shard_size: %d" % shard_size, self.verbose)

        if not isinstance(input_files, list):
            input_files = [input_files]

        def shard_generator():
            for shard_num, shard in enumerate(
                    self.get_shards(input_files, shard_size)):
                time1 = time.time()
                X, valid_inds = self.featurize_shard(shard)
                ids = shard[self.id_field].values
                ids = ids[valid_inds]
                if len(self.tasks) > 0:
                    # Featurize task results iff they exist.
                    y, w = convert_df_to_numpy(shard, self.tasks,
                                               self.id_field)
                    # Filter out examples where featurization failed.
                    y, w = (y[valid_inds], w[valid_inds])
                    assert len(X) == len(ids) == len(y) == len(w)
                else:
                    # For prospective data where results are unknown, it makes
                    # no sense to have y values or weights.
                    y, w = (None, None)
                    assert len(X) == len(ids)

                time2 = time.time()
                log(
                    "TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1), self.verbose)
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(),
                                          data_dir,
                                          self.tasks,
                                          verbose=self.verbose)
Example #23
def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
    """Load PDBBind datasets. Does not do train/test split"""
    # Set some global variables up top
    reload = True
    verbosity = "high"
    model = "logistic"
    regen = False
    neighbor_cutoff = 4
    max_num_neighbors = 10

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")

    # Load PDBBind dataset
    labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
    pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
    tasks = ["-logKd/Ki"]
    print("About to load contents.")
    contents_df = load_pdbbind_labels(labels_file)
    ids = contents_df["PDB code"].values
    y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

    # Define featurizers
    featurizer = NeighborListComplexAtomicCoordinates(max_num_neighbors,
                                                      neighbor_cutoff)

    # Featurize Dataset
    features = []
    for ind, pdb_code in enumerate(ids):
        print("Processing %s" % str(pdb_code))
        pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
        computed_feature = compute_pdbbind_coordinate_features(
            featurizer, pdb_subdir, pdb_code)
        features.append(computed_feature)
    X = np.array(features, dtype=object)
    w = np.ones_like(y)

    dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
    transformers = []

    return tasks, dataset, transformers
Example #24
def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False
  neighbor_cutoff = 4
  max_num_neighbors = 10

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  featurizer = NeighborListComplexAtomicCoordinates(
      max_num_neighbors, neighbor_cutoff)
  
  # Featurize Dataset
  features = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_coordinate_features(
        featurizer, pdb_subdir, pdb_code)
    features.append(computed_feature)
  X = np.array(features, dtype=object)
  w = np.ones_like(y)
   
  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  
  return tasks, dataset, transformers
Example #25
  def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location."""
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

    if not isinstance(input_files, list):
      input_files = [input_files]
    def shard_generator():
      for shard_num, shard in enumerate(self.get_shards(input_files, shard_size)):
        time1 = time.time()
        X, valid_inds = self.featurize_shard(shard)
        ids, y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)  
        # Filter out examples where featurization failed.
        ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
        assert len(X) == len(ids) == len(y) == len(w)
        time2 = time.time()
        log("TIMING: featurizing shard %d took %0.3f s" % (shard_num, time2-time1),
            self.verbose)
        yield X, y, w, ids
    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)
Example #26
  def featurize(self, input_files, data_dir=None):
    """Featurizes fasta files.

    Parameters
    ----------
    input_files: list
      List of fasta files.
    data_dir: str
      (Optional) Name of directory where featurized data is stored.
    """
    if not isinstance(input_files, list):
      input_files = [input_files]

    def shard_generator():
      for input_file in input_files:
        X = encode_fasta_sequence(input_file)
        ids = np.ones(len(X))
        # (X, y, w, ids)
        yield X, None, None, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)
Example #27
  def featurize(self, input_files, data_dir=None):
    """Featurizes fasta files.

    Parameters
    ----------
    input_files: list
      List of fasta files.
    data_dir: str
      (Optional) Name of directory where featurized data is stored.
    """
    if not isinstance(input_files, list):
      input_files = [input_files]

    def shard_generator():
      for input_file in input_files:
        X = encode_fasta_sequence(input_file)
        ids = np.ones(len(X))
        # (X, y, w, ids)
        yield X, None, None, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)
Example #28
  def test_fit(self):
    tf_enc = TensorflowMoleculeEncoder.zinc_encoder()

    smiles = [
        "Cn1cnc2c1c(=O)n(C)c(=O)n2C", "O=C(O)[C@@H]1/C(=C/CO)O[C@@H]2CC(=O)N21",
        "Cn1c2nncnc2c(=O)n(C)c1=O", "Cn1cnc2c1c(=O)[nH]c(=O)n2C",
        "NC(=O)c1ncc[nH]c1=O", "O=C1OCc2c1[nH]c(=O)[nH]c2=O",
        "Cn1c(N)c(N)c(=O)n(C)c1=O", "CNc1nc2c([nH]1)c(=O)[nH]c(=O)n2C",
        "CC(=O)N1CN(C(C)=O)[C@@H](O)[C@@H]1O",
        "CC(=O)N1CN(C(C)=O)[C@H](O)[C@H]1O", "Cc1[nH]c(=O)[nH]c(=O)c1CO",
        "O=C1NCCCc2c1no[n+]2[O-]", "Cc1nc(C(N)=O)c(N)n1CCO",
        "O=c1[nH]cc(N2CCOCC2)c(=O)[nH]1"
    ]

    featurizer = dc.feat.one_hot.OneHotFeaturizer(zinc_charset, 120)
    mols = [Chem.MolFromSmiles(x) for x in smiles]
    features = featurizer.featurize(mols)

    dataset = DiskDataset.from_numpy(features, features)
    prediction = tf_enc.predict_on_batch(dataset.X)
    tf_de = TensorflowMoleculeDecoder.zinc_decoder()
    one_hot_decoded = tf_de.predict_on_batch(prediction)
    decoded_smiles = featurizer.untransform(one_hot_decoded)
    assert_equals(len(decoded_smiles), len(smiles))
Example #29
    def create_dataset(self,
                       input_files: OneOrMany[str],
                       data_dir: Optional[str] = None,
                       shard_size: Optional[int] = 8192) -> DiskDataset:
        """Creates a `Dataset` from input JSON files.

    Parameters
    ----------
    input_files: OneOrMany[str]
      List of JSON filenames.
    data_dir: Optional[str], default None
      Name of directory where featurized data is stored.
    shard_size: Optional[int], default 8192
      Shard size when loading data.

    Returns
    -------
    dataset: dc.data.Dataset
      A `Dataset` object containing a featurized representation of data
      from `input_files`.

    """
        if not isinstance(input_files, list):
            try:
                if isinstance(input_files, str):
                    input_files = [input_files]
                else:
                    input_files = list(input_files)
            except TypeError:
                raise ValueError(
                    "input_files is of an unrecognized form. Must be one filename or a list of filenames."
                )

        def shard_generator():
            """Yield X, y, w, and ids for shards."""
            for shard_num, shard in enumerate(
                    self._get_shards(input_files, shard_size)):

                time1 = time.time()
                X, valid_inds = self._featurize_shard(shard)
                if self.id_field:
                    ids = shard[self.id_field].values
                else:
                    ids = np.ones(len(X))
                ids = ids[valid_inds]

                if len(self.tasks) > 0:
                    # Featurize task results if they exist.
                    y, w = _convert_df_to_numpy(shard, self.tasks)

                    if self.label_field:
                        y = shard[self.label_field]
                    if self.weight_field:
                        w = shard[self.weight_field]

                    # Filter out examples where featurization failed.
                    y, w = (y[valid_inds], w[valid_inds])
                    assert len(X) == len(ids) == len(y) == len(w)
                else:
                    # For prospective data where results are unknown, it
                    # makes no sense to have y values or weights.
                    y, w = (None, None)
                    assert len(X) == len(ids)

                time2 = time.time()
                logger.info("TIMING: featurizing shard %d took %0.3f s" %
                            (shard_num, time2 - time1))
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(), data_dir)
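A hedged usage sketch for a JSON loader of this kind. The file name, field names, and task are hypothetical; the keyword names follow the newer DeepChem JsonLoader API:

import deepchem as dc

featurizer = dc.feat.CircularFingerprint(size=1024)
loader = dc.data.JsonLoader(tasks=["activity"],      # hypothetical task field
                            feature_field="smiles",
                            id_field="compound_id",
                            featurizer=featurizer)
dataset = loader.create_dataset("records.json",      # hypothetical JSON file
                                shard_size=8192)
print(dataset.X.shape)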
Example #30
    def split_dataset(self, dataset, attr_df, smiles_col):
        #smiles_col is a hack for now until deepchem fixes their scaffold and butina splitters
        """Splits dataset into training, testing and validation sets.
        
        Args:
            dataset (deepchem Dataset): full featurized dataset

            attr_df (Pandas DataFrame): dataframe containing SMILES strings indexed by compound IDs,

            smiles_col (string): name of SMILES column (hack for now until deepchem fixes scaffold and butina splitters)
        
        Returns:
            [(train, valid)], test, [(train_attr, valid_attr)], test_attr:

            train (deepchem Dataset): training dataset.

            valid (deepchem Dataset): validation dataset.

            test (deepchem Dataset): testing dataset.

            train_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for training set.

            valid_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for validation set.

            test_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for test set.
            
        Raises:
            Exception if there are duplicate ids or smiles strings in the dataset or the attr_df

        """
        dataset_dup = False
        if check_if_dupe_smiles_dataset(dataset, attr_df, smiles_col):
            log.info("Duplicate ids or smiles in the dataset, will deduplicate first and assign all records per compound ID to same partition")
            dataset_dup = True
            dataset_ori = copy.deepcopy(dataset)
            id_df = pd.DataFrame({'indices' : np.arange(len(dataset.ids), dtype=np.int32), "compound_id": [str(e) for e in dataset.ids]})
            sel_df = id_df.drop_duplicates(subset="compound_id")
            dataset = dataset.select(sel_df.indices.values)

        if self.needs_smiles():
            # Some DeepChem splitters require compound IDs in dataset to be SMILES strings. Swap in the
            # SMILES strings now; we'll reverse this later.
            dataset = DiskDataset.from_numpy(dataset.X, dataset.y, w=dataset.w, ids=attr_df.drop_duplicates(subset=smiles_col)[smiles_col].values)
            if dataset_dup:
                dataset_ori = DiskDataset.from_numpy(dataset_ori.X, dataset_ori.y, w=dataset_ori.w, ids=attr_df[smiles_col].values)

        # Under k-fold CV, the training/validation splits are determined by num_folds; only the test set fraction
        # is directly specified through command line parameters. If we use Butina splitting, we can't control
        # the test set size either.
        train_frac = 1.0 - self.params.split_test_frac

        # Use DeepChem train_test_split() to select held-out test set; then use k_fold_split on the
        # training set to split it into training/validation folds.
        if self.split == 'butina':
            train_cv, test, _ = self.splitter.train_valid_test_split(dataset)
            self.splitter = dc.splits.ScaffoldSplitter()
            train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)
        else:
            # TODO: Add special handling for AVE splitter
            train_cv, test = self.splitter.train_test_split(dataset, seed=np.random.seed(123), frac_train=train_frac)
            train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)

        train_valid_dsets = []
        train_valid_attr = []

        if self.needs_smiles():
            # Now that DeepChem splitters have done their work, replace the SMILES strings in the split 
            # dataset objects with actual compound IDs.
            for train, valid in train_cv_pairs:
                # assign the subsets to the original dataset if duplicated compounds exist
                if dataset_dup:
                    train = select_dset_by_id_list(dataset_ori, train.ids)
                    valid = select_dset_by_id_list(dataset_ori, valid.ids)
                train_attr = select_attrs_by_dset_smiles(train, attr_df, smiles_col)
                train = DiskDataset.from_numpy(train.X, train.y, w=train.w, ids=train_attr.index.values)

                valid_attr = select_attrs_by_dset_smiles(valid, attr_df, smiles_col)
                valid = DiskDataset.from_numpy(valid.X, valid.y, w=valid.w, ids=valid_attr.index.values)

                train_valid_dsets.append((train, valid))
                train_valid_attr.append((train_attr, valid_attr))

            if dataset_dup:
                test = select_dset_by_id_list(dataset_ori, test.ids)
            test_attr = select_attrs_by_dset_smiles(test, attr_df, smiles_col)
            test = DiskDataset.from_numpy(test.X, test.y, w=test.w, ids=test_attr.index.values)
        else:
            # Otherwise just subset the ID-to-SMILES maps.
            for train, valid in train_cv_pairs:
                if dataset_dup:
                    train = select_dset_by_id_list(dataset_ori, train.ids)
                    valid = select_dset_by_id_list(dataset_ori, valid.ids)
                train_attr = select_attrs_by_dset_ids(train, attr_df)
                valid_attr = select_attrs_by_dset_ids(valid, attr_df)
                train_valid_attr.append((train_attr, valid_attr))
            train_valid_dsets = train_cv_pairs
            if dataset_dup:
                test = select_dset_by_id_list(dataset_ori, test.ids)
            test_attr = select_attrs_by_dset_ids(test, attr_df)

        return train_valid_dsets, test, train_valid_attr, test_attr
Example #31
    def split_dataset(self, dataset, attr_df, smiles_col):
        #smiles_col is a hack for now until deepchem fixes their scaffold and butina splitters
        """Splits dataset into training, testing and validation sets.
        
        For ave_min, random, scaffold, index splits
            self.params.split_valid_frac & self.params.split_test_frac should be defined and 
            train_frac = 1.0 - self.params.split_valid_frac - self.params.split_test_frac
        
        For butina split, test size is not user defined, and depends on available clusters that qualify for placement in the test set
            train_frac = 1.0 - self.params.split_valid_frac
        
        For temporal split, test size is also not user defined, and depends on number of compounds with dates after cutoff date.
            train_frac = 1.0 - self.params.split_valid_frac
        Args:
            dataset (deepchem Dataset): full featurized dataset

            attr_df (Pandas DataFrame): dataframe containing SMILES strings indexed by compound IDs,

            smiles_col (string): name of SMILES column (hack for now until deepchem fixes scaffold and butina splitters)
        
        Returns:
            [(train, valid)], test, [(train_attr, valid_attr)], test_attr:
            train (deepchem Dataset): training dataset.

            valid (deepchem Dataset): validation dataset.

            test (deepchem Dataset): testing dataset.

            train_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for training set.

            valid_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for validation set.

            test_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for test set.
            
        Raises:
            Exception if there are duplicate ids or smiles strings in the dataset or the attr_df

        """
        if check_if_dupe_smiles_dataset(dataset, attr_df, smiles_col):
            raise Exception("Duplicate ids or smiles in the dataset")

        log.warning("Splitting data by %s" % self.params.splitter)

        if self.needs_smiles():
            # Some DeepChem splitters require compound IDs in dataset to be SMILES strings. Swap in the
            # SMILES strings now; we'll reverse this later.
            dataset = DiskDataset.from_numpy(dataset.X,
                                             dataset.y,
                                             ids=attr_df[smiles_col].values,
                                             verbose=False)

        if self.split == 'butina':
            #train_valid, test = self.splitter.train_test_split(dataset, cutoff=self.params.butina_cutoff)
            # Can't use train_test_split with Butina because Butina splits into train and valid sets only.
            train_valid, test, _ = self.splitter.train_valid_test_split(
                dataset)
            self.splitter = dc.splits.ScaffoldSplitter()
            # With Butina splitting, we don't have control over the size of the test set
            train_frac = 1.0 - self.params.split_valid_frac
            train, valid = self.splitter.train_test_split(
                train_valid, seed=np.random.seed(123), frac_train=train_frac)
        elif self.split == 'ave_min':
            # AVEMinSplitter also only does train-valid splits, but at least nested splits seem to work.
            # TODO: Change this if we modify AVE splitter to do 3-way splits internally.
            train_valid_frac = 1.0 - self.params.split_test_frac
            train_frac = train_valid_frac - self.params.split_valid_frac
            log.info("Performing split for test set")
            train_valid, test, _ = self.splitter.train_valid_test_split(
                dataset,
                frac_train=train_valid_frac,
                frac_valid=self.params.split_test_frac,
                frac_test=0.0)
            log.info("Performing split of training and validation sets")
            train, valid, _ = self.splitter.train_valid_test_split(
                train_valid,
                frac_train=train_frac / train_valid_frac,
                frac_valid=self.params.split_valid_frac / train_valid_frac,
                frac_test=0.0)
            log.info(
                "Results of 3-way split: %d training, %d validation, %d test compounds"
                % (train.X.shape[0], valid.X.shape[0], test.X.shape[0]))
        elif self.split == 'temporal':
            # TemporalSplitter requires that we pass attr_df so it can get the dates for each compound
            train_frac = 1.0 - self.params.split_valid_frac
            train, valid, test = self.splitter.train_valid_test_split(
                dataset,
                attr_df,
                frac_train=train_frac,
                frac_valid=self.params.split_valid_frac)
        else:
            train_frac = 1.0 - self.params.split_valid_frac - self.params.split_test_frac
            train, valid, test = self.splitter.train_valid_test_split(
                dataset,
                frac_train=train_frac,
                frac_valid=self.params.split_valid_frac,
                frac_test=self.params.split_test_frac,
                seed=np.random.seed(123))

        # Extract the ID-to-SMILES maps from attr_df for each subset.
        if self.needs_smiles():
            # Now that DeepChem splitters have done their work, replace the SMILES strings in the split
            # dataset objects with actual compound IDs.
            train_attr = select_attrs_by_dset_smiles(train, attr_df,
                                                     smiles_col)
            train = DiskDataset.from_numpy(train.X,
                                           train.y,
                                           ids=train_attr.index.values,
                                           verbose=False)

            valid_attr = select_attrs_by_dset_smiles(valid, attr_df,
                                                     smiles_col)
            valid = DiskDataset.from_numpy(valid.X,
                                           valid.y,
                                           ids=valid_attr.index.values,
                                           verbose=False)

            test_attr = select_attrs_by_dset_smiles(test, attr_df, smiles_col)
            test = DiskDataset.from_numpy(test.X,
                                          test.y,
                                          ids=test_attr.index.values,
                                          verbose=False)
        else:
            # Otherwise just subset the ID-to-SMILES maps.
            train_attr = select_attrs_by_dset_ids(train, attr_df)
            valid_attr = select_attrs_by_dset_ids(valid, attr_df)
            test_attr = select_attrs_by_dset_ids(test, attr_df)

        # Note grouping of train/valid return values as tuple lists, to match format of
        # KFoldSplitting.split_dataset().
        return [(train, valid)], test, [(train_attr, valid_attr)], test_attr
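
The grouped return format above (single-element lists of train/valid pairs) matches the k-fold variant of split_dataset(), so callers can unpack both the same way. A minimal usage sketch, with hypothetical splitter and data-object names and a hypothetical SMILES column:

[(train, valid)], test, [(train_attr, valid_attr)], test_attr = \
    splitter.split_dataset(data.dataset, data.attr, "rdkit_smiles")
print(train.X.shape[0], valid.X.shape[0], test.X.shape[0])
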
Esempio n. 32
0
    def split_dataset(self, dataset, attr_df, smiles_col):
        #smiles_col is a hack for now until deepchem fixes their scaffold and butina splitters
        """Splits dataset into training, testing and validation sets.
        
        Args:
            dataset (deepchem Dataset): full featurized dataset

            attr_df (Pandas DataFrame): dataframe containing SMILES strings indexed by compound IDs,

            smiles_col (string): name of SMILES column (hack for now until deepchem fixes scaffold and butina splitters)
        
        Returns:
            [(train, valid)], test, [(train_attr, valid_attr)], test_attr:

            train (deepchem Dataset): training dataset.

            valid (deepchem Dataset): validation dataset.

            test (deepchem Dataset): testing dataset.

            train_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for training set.

            valid_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for validation set.

            test_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for test set.
            
        Raises:
            Exception if there are duplicate ids or smiles strings in the dataset or the attr_df

        """
        if check_if_dupe_smiles_dataset(dataset, attr_df, smiles_col):
            raise Exception("Duplicate ids or smiles in the dataset")
        if self.needs_smiles():
            # Some DeepChem splitters require compound IDs in dataset to be SMILES strings. Swap in the
            # SMILES strings now; we'll reverse this later.
            dataset = DiskDataset.from_numpy(dataset.X,
                                             dataset.y,
                                             ids=attr_df[smiles_col].values,
                                             verbose=False)

        # Under k-fold CV, the training/validation splits are determined by num_folds; only the test set fraction
        # is directly specified through command line parameters. If we use Butina splitting, we can't control
        # the test set size either.
        train_frac = 1.0 - self.params.split_test_frac

        # Use DeepChem train_test_split() to select held-out test set; then use k_fold_split on the
        # training set to split it into training/validation folds.
        if self.split == 'butina':
            # TODO: Splitter.train_test_split() doesn't provide a way to pass the cutoff parameter
            # through to the ButinaSplitter.split() function. Simple fix would be to reimplement
            # train_test_split() here (it's not a complicated function). For now, allow cutoff to default.
            #train_cv, test = self.splitter.train_test_split(dataset, cutoff=self.params.butina_cutoff)
            train_cv, test, _ = self.splitter.train_valid_test_split(dataset)
            self.splitter = dc.splits.ScaffoldSplitter()
            train_cv_pairs = self.splitter.k_fold_split(
                train_cv, self.num_folds)
        else:
            # TODO: Add special handling for AVE splitter
            train_cv, test = self.splitter.train_test_split(
                dataset, seed=np.random.seed(123), frac_train=train_frac)
            train_cv_pairs = self.splitter.k_fold_split(
                train_cv, self.num_folds)

        train_valid_dsets = []
        train_valid_attr = []

        if self.needs_smiles():
            # Now that DeepChem splitters have done their work, replace the SMILES strings in the split
            # dataset objects with actual compound IDs.
            for train, valid in train_cv_pairs:
                train_attr = select_attrs_by_dset_smiles(
                    train, attr_df, smiles_col)
                train = DiskDataset.from_numpy(train.X,
                                               train.y,
                                               ids=train_attr.index.values,
                                               verbose=False)

                valid_attr = select_attrs_by_dset_smiles(
                    valid, attr_df, smiles_col)
                valid = DiskDataset.from_numpy(valid.X,
                                               valid.y,
                                               ids=valid_attr.index.values,
                                               verbose=False)

                train_valid_dsets.append((train, valid))
                train_valid_attr.append((train_attr, valid_attr))

            test_attr = select_attrs_by_dset_smiles(test, attr_df, smiles_col)
            test = DiskDataset.from_numpy(test.X,
                                          test.y,
                                          ids=test_attr.index.values,
                                          verbose=False)
        else:
            # Otherwise just subset the ID-to-SMILES maps.
            for train, valid in train_cv_pairs:
                train_attr = select_attrs_by_dset_ids(train, attr_df)
                valid_attr = select_attrs_by_dset_ids(valid, attr_df)
                train_valid_attr.append((train_attr, valid_attr))
            train_valid_dsets = train_cv_pairs
            test_attr = select_attrs_by_dset_ids(test, attr_df)

        return train_valid_dsets, test, train_valid_attr, test_attr
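
For the k-fold variant above, the first return value is a list of (train, valid) DiskDataset pairs, one per fold. A minimal sketch of looping over the folds, again with hypothetical names:

train_valid_dsets, test, train_valid_attr, test_attr = \
    splitter.split_dataset(data.dataset, data.attr, "rdkit_smiles")
for fold, (train, valid) in enumerate(train_valid_dsets):
    print("fold %d: %d train / %d valid compounds" %
          (fold, train.X.shape[0], valid.X.shape[0]))
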
Esempio n. 33
0
  def k_fold_split(self, dataset, k, directories=None, **kwargs):
    """
    Parameters
    ----------
    dataset: Dataset
    Dataset to do a k-fold split

    k: int
    number of folds

    directories: list of str
    list of length 2*k filepaths to save the result disk-datasets

    kwargs

    Returns
    -------
    list of length k tuples of (train, cv)

    """
    """
    :param dataset:
    :param k:
    :param directories:
    :param kwargs:
    :return: list of length k tuples of (train, cv)
    """
    log("Computing K-fold split", self.verbose)
    if directories is None:
      directories = [tempfile.mkdtemp() for _ in range(2 * k)]
    else:
      assert len(directories) == 2 * k
    cv_datasets = []
    train_ds_base = None
    train_datasets = []
    # rem_dataset is remaining portion of dataset
    if isinstance(dataset, DiskDataset):
      rem_dataset = dataset
    else:
      rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                           dataset.ids)
    for fold in range(k):
      # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
      # to k-1.
      frac_fold = 1. / (k - fold)
      train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
      fold_inds, rem_inds, _ = self.split(
          rem_dataset,
          frac_train=frac_fold,
          frac_valid=1 - frac_fold,
          frac_test=0,
          **kwargs)
      cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
      cv_datasets.append(cv_dataset)
      rem_dataset = rem_dataset.select(rem_inds)

      train_ds_to_merge = filter(lambda x: x is not None,
                                 [train_ds_base, rem_dataset])
      train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
      train_dataset = DiskDataset.merge(train_ds_to_merge, merge_dir=train_dir)
      train_datasets.append(train_dataset)

      update_train_base_merge = filter(lambda x: x is not None,
                                       [train_ds_base, cv_dataset])
      train_ds_base = DiskDataset.merge(update_train_base_merge)
    return list(zip(train_datasets, cv_datasets))
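
DeepChem's stock splitters expose the same k_fold_split() interface, so the easiest way to see the returned (train, cv) pairs is on a tiny synthetic dataset; this is only a sketch with made-up data:

import numpy as np
import deepchem as dc
from deepchem.data import DiskDataset

# 30 random compounds split into 3 folds: each pair is roughly 20 train / 10 cv.
X = np.random.rand(30, 8)
y = np.random.rand(30, 1)
dataset = DiskDataset.from_numpy(X, y)
folds = dc.splits.RandomSplitter().k_fold_split(dataset, 3)
for train, cv in folds:
    print(len(train), len(cv))
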
Esempio n. 34
0
  def load_dataset(
      self, name: str, reload: bool
  ) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
    """Load the dataset.

    Parameters
    ----------
    name: str
      the name of the dataset, used to identify the directory on disk
    reload: bool
      if True, the first call for a particular featurizer and splitter will cache
      the datasets to disk, and subsequent calls will reload the cached datasets.
    """
    # Build the path to the dataset on disk.

    featurizer_name = str(self.featurizer)
    splitter_name = 'None' if self.splitter is None else str(self.splitter)
    save_folder = os.path.join(self.save_dir, name + "-featurized",
                               featurizer_name, splitter_name)
    if len(self.transformers) > 0:
      transformer_name = '_'.join(
          t.get_directory_name() for t in self.transformers)
      save_folder = os.path.join(save_folder, transformer_name)

    # Try to reload cached datasets.

    if reload:
      if self.splitter is None:
        if os.path.exists(save_folder):
          transformers = dc.utils.data_utils.load_transformers(save_folder)
          return self.tasks, (DiskDataset(save_folder),), transformers
      else:
        loaded, all_dataset, transformers = dc.utils.data_utils.load_dataset_from_disk(
            save_folder)
        if all_dataset is not None:
          return self.tasks, all_dataset, transformers

    # Create the dataset

    logger.info("About to featurize %s dataset." % name)
    dataset = self.create_dataset()

    # Split and transform the dataset.

    if self.splitter is None:
      transformer_dataset: Dataset = dataset
    else:
      logger.info("About to split dataset with {} splitter.".format(
          self.splitter.__class__.__name__))
      train, valid, test = self.splitter.train_valid_test_split(dataset)
      transformer_dataset = train
    transformers = [
        t.create_transformer(transformer_dataset) for t in self.transformers
    ]
    logger.info("About to transform data.")
    if self.splitter is None:
      for transformer in transformers:
        dataset = transformer.transform(dataset)
      if reload and isinstance(dataset, DiskDataset):
        dataset.move(save_folder)
        dc.utils.data_utils.save_transformers(save_folder, transformers)
      return self.tasks, (dataset,), transformers

    for transformer in transformers:
      train = transformer.transform(train)
      valid = transformer.transform(valid)
      test = transformer.transform(test)
    if reload and isinstance(train, DiskDataset) and isinstance(
        valid, DiskDataset) and isinstance(test, DiskDataset):
      dc.utils.data_utils.save_dataset_to_disk(save_folder, train, valid, test,
                                               transformers)
    return self.tasks, (train, valid, test), transformers
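
The reload branch above leans on DeepChem's disk-cache helpers; a minimal sketch of the same save/reload round trip outside the loader class, with made-up data and a placeholder cache path:

import numpy as np
import deepchem as dc
from deepchem.data import DiskDataset

save_folder = "/tmp/demo-featurized"  # placeholder cache directory
train = DiskDataset.from_numpy(np.random.rand(8, 4), np.zeros(8))
valid = DiskDataset.from_numpy(np.random.rand(2, 4), np.zeros(2))
test = DiskDataset.from_numpy(np.random.rand(2, 4), np.zeros(2))

# Cache the split datasets, then reload them the way load_dataset() does.
dc.utils.data_utils.save_dataset_to_disk(save_folder, train, valid, test, [])
loaded, all_dataset, transformers = dc.utils.data_utils.load_dataset_from_disk(save_folder)
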
Esempio n. 35
0
def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
    """Load PDBBind datasets. Does not do train/test split"""
    # Set some global variables up top
    reload = True
    verbosity = "high"
    model = "logistic"
    regen = False

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")

    # Load PDBBind dataset
    labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
    pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
    tasks = ["-logKd/Ki"]
    print("About to load contents.")
    contents_df = load_pdbbind_labels(labels_file)
    ids = contents_df["PDB code"].values
    y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

    # Define featurizers
    grid_featurizer = GridFeaturizer(
        voxel_width=16.0,
        feature_types="voxel_combined",
        # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
        # causes segfaults.
        #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
        #"salt_bridge"], ecfp_power=9, splif_power=9,
        voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
        ecfp_power=9,
        splif_power=9,
        parallel=True,
        flatten=True,
        verbosity=verbosity)
    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = [grid_featurizer]

    # Featurize Dataset
    features = []
    feature_len = None
    y_inds = []
    for ind, pdb_code in enumerate(ids):
        print("Processing %s" % str(pdb_code))
        pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
        computed_feature = compute_pdbbind_grid_feature(
            compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
        if feature_len is None:
            feature_len = len(computed_feature)
        if len(computed_feature) != feature_len:
            print("Featurization failed for %s!" % pdb_code)
            continue
        y_inds.append(ind)
        features.append(computed_feature)
    y = y[y_inds]
    X = np.vstack(features)
    w = np.ones_like(y)

    dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
    transformers = []

    return tasks, dataset, transformers
Esempio n. 36
0
def load_uspto(featurizer="plain",
               split=None,
               num_to_load=10000,
               reload=True,
               verbose=False):
    """Load USPTO dataset.

  For now, only loads the subset of data for 2008-2011 reactions. See https://figshare.com/articles/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873 for more details.

  The full dataset contains some 400K reactions. Featurizing the full dataset causes an out-of-memory error on a development laptop, so for now a truncated subset is returned.

  Reloading is not entirely supported for this dataset.
  """
    # Most reaction-dataset ML tasks predict products from reactants. Both are
    # contained in the rxn objects that are output, so there is no "tasks"
    # field.
    uspto_tasks = []
    if split is not None:
        raise ValueError("Train/valid/test not yet supported.")
    # Download USPTO dataset
    data_dir = deepchem.utils.get_data_dir()
    if reload:
        save_dir = os.path.join(data_dir, "uspto/" + featurizer + "/")
        loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
            save_dir)
        if loaded:
            return uspto_tasks, all_dataset, transformers

    dataset_file = os.path.join(data_dir,
                                "2008-2011_USPTO_reactionSmiles_filtered.zip")
    if not os.path.exists(dataset_file):
        deepchem.utils.download_url(
            "https://bitbucket.org/dan2097/patent-reaction-extraction/downloads/2008-2011_USPTO_reactionSmiles_filtered.zip"
        )
    # Unzip
    unzip_dir = os.path.join(data_dir,
                             "2008-2011_USPTO_reactionSmiles_filtered")
    if not os.path.exists(unzip_dir):
        deepchem.utils.unzip_file(dataset_file, dest_dir=unzip_dir)
    # The unzipped file is a tab-separated values file (despite the .txt extension)
    filename = os.path.join(unzip_dir,
                            "2008-2011_USPTO_reactionSmiles_filtered.txt")
    rxns = []
    from rdkit.Chem import rdChemReactions
    with open(filename) as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for ind, row in enumerate(reader):
            if ind > num_to_load:
                break
            if verbose:
                print("Loading reaction %d" % ind)
            # The first element in the row is the reaction smarts
            smarts = row[0]
            # Some reaction SMARTS have extraneous information at the end, of
            # the form "|f:0", that causes parsing to fail. Not sure what this
            # information is, so just ignore it for now.
            smarts = smarts.split(" ")[0]
            rxn = rdChemReactions.ReactionFromSmarts(smarts)
            rxns.append(rxn)
    rxn_array = np.array(rxns)
    # Make up dummy labels since DiskDataset.from_numpy doesn't allow
    # creation from just features for now.
    y = np.ones(len(rxn_array))
    # TODO: This dataset isn't saved to disk so reload doesn't happen.
    rxn_dataset = DiskDataset.from_numpy(rxn_array, y)
    transformers = []
    return uspto_tasks, (rxn_dataset, None, None), transformers
Esempio n. 37
0
    def featurize(self, input_files, in_memory=True):
        """Featurizes image files.

    Parameters
    ----------
    input_files: list
      Each file in this list should either be of a supported image format
      (.png, .tif only for now) or of a compressed folder of image files
      (only .zip for now).
    in_memory: bool
      If true, return in-memory NumpyDataset. Else return DiskDataset.
    """
        if not isinstance(input_files, list):
            input_files = [input_files]

        image_files = []
        # Sometimes zip files contain directories within. Traverse directories
        while len(input_files) > 0:
            remainder = []
            for input_file in input_files:
                filename, extension = os.path.splitext(input_file)
                # TODO(rbharath): Add support for more extensions
                if os.path.isdir(input_file):
                    dirfiles = [
                        os.path.join(input_file, subfile)
                        for subfile in os.listdir(input_file)
                    ]
                    remainder += dirfiles
                elif extension == ".zip":
                    zip_dir = tempfile.mkdtemp()
                    zip_ref = zipfile.ZipFile(input_file, 'r')
                    zip_ref.extractall(path=zip_dir)
                    zip_ref.close()
                    zip_files = [
                        os.path.join(zip_dir, name)
                        for name in zip_ref.namelist()
                    ]
                    for zip_file in zip_files:
                        _, extension = os.path.splitext(zip_file)
                        if extension in [".png", ".tif"]:
                            image_files.append(zip_file)
                elif extension in [".png", ".tif"]:
                    image_files.append(input_file)
                else:
                    raise ValueError("Unsupported file format")
            input_files = remainder

        images = []
        for image_file in image_files:
            _, extension = os.path.splitext(image_file)
            if extension == ".png":
                image = misc.imread(image_file)
                images.append(image)
            elif extension == ".tif":
                im = Image.open(image_file)
                imarray = np.array(im)
                images.append(imarray)
            else:
                raise ValueError("Unsupported image filetype for %s" %
                                 image_file)
        images = np.array(images)
        if in_memory:
            return NumpyDataset(images)
        else:
            # from_numpy currently requires labels. Make dummy labels
            labels = np.zeros(len(images))
            return DiskDataset.from_numpy(images, labels)
Esempio n. 38
0
def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0, feature_types="voxel_combined",
      # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
      # causes segfaults.
      #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      #"salt_bridge"], ecfp_power=9, splif_power=9,
      voxel_feature_types=["ecfp", "splif", "hbond", 
      "salt_bridge"], ecfp_power=9, splif_power=9,
      parallel=True, flatten=True)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]
  
  # Featurize Dataset
  features = []
  feature_len = None
  y_inds = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_grid_feature(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    if feature_len is None:
      feature_len = len(computed_feature)
    if len(computed_feature) != feature_len:
      print("Featurization failed for %s!" % pdb_code)
      continue
    y_inds.append(ind)
    features.append(computed_feature)
  y = y[y_inds]
  X = np.vstack(features)
  w = np.ones_like(y)
   
  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  
  return tasks, dataset, transformers
        train_attr.smiles.tolist(),
        valid_attr.smiles.tolist(),
        test_attr.smiles.tolist()
    ], [])
    test.append(len(full_dataset_smiles) == len(set(full_dataset_smiles)))
    assert all(test)


#***********************************************************************************

([(train, valid)], test_scaffold, [(train_attr, valid_attr)],
 test_scaffold_attr) = splitter_scaffold.split_dataset(
     data_obj_scaffold.dataset, data_obj_scaffold.attr,
     data_obj_scaffold.params.smiles_col)
dataset_scaffold = DiskDataset.from_numpy(data_obj_scaffold.dataset.X,
                                          data_obj_scaffold.dataset.y,
                                          ids=data_obj_scaffold.attr.index)


def test_select_dset_by_attr_ids_using_smiles():
    # Test that the method can subset a dataset according to its attr IDs into the
    # correct DeepChem DiskDataset. Here the attr IDs are converted back to SMILES
    # to match the input dataset.
    dataset = DiskDataset.from_numpy(
        data_obj_scaffold.dataset.X,
        data_obj_scaffold.dataset.y,
        ids=data_obj_scaffold.attr[data_obj_scaffold.params.smiles_col].values)
    newdf = pd.DataFrame({'compound_ids': test_scaffold_attr.index.tolist()},
                         index=test_scaffold_attr.smiles)
    newDD = split.select_dset_by_attr_ids(dataset, newdf)
    assert (newDD.y == test_scaffold.y).all()

Esempio n. 40
0
# REPLACE WITH DOWNLOADED PDBBIND EXAMPLE
pdbbind_dir = "/tmp/deep-docking/datasets/pdbbind"
pdbbind_tasks, dataset, transformers = load_core_pdbbind_grid(
    pdbbind_dir, base_dir)

print("About to perform train/valid/test split.")
num_train = int(.8 * len(dataset))
X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)

X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train], y[num_train:]
w_train, w_valid = w[:num_train], w[num_train:]
ids_train, ids_valid = ids[:num_train], ids[num_train:]

train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                   w_train, ids_train, pdbbind_tasks)
valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                   w_valid, ids_valid, pdbbind_tasks)

classification_metric = Metric(metrics.pearson_r2_score, verbosity=verbosity,
                               mode="regression")

n_features = dataset.get_data_shape()[0]
tensorflow_model = TensorflowMultiTaskRegressor(
    len(pdbbind_tasks), n_features, model_dir, dropouts=[.25],
    learning_rate=0.0003, weight_init_stddevs=[.1],
    batch_size=64, verbosity=verbosity)
model = TensorflowModel(tensorflow_model, model_dir)

# Fit trained model
model.fit(train_dataset, nb_epoch=20)
Esempio n. 41
0
    def create_dataset(self,
                       inputs: Union[OneOrMany[str], Tuple[Any]],
                       data_dir: Optional[str] = None,
                       shard_size: Optional[int] = 8192,
                       in_memory: bool = False) -> Dataset:
        """Creates and returns a `Dataset` object by featurizing provided image files and labels/weights.

    Parameters
    ----------
    inputs: `Union[OneOrMany[str], Tuple[Any]]`
      The inputs provided should be one of the following

      - filename
      - list of filenames
      - Tuple (list of filenames, labels)
      - Tuple (list of filenames, labels, weights)

      Each file in a given list of filenames should either be of a supported
      image format (.png, .tif only for now) or of a compressed folder of
      image files (only .zip for now). If `labels` or `weights` are provided,
      they must correspond to the sorted order of all filenames provided, with
      one label/weight per file.

    data_dir: str, optional
      Directory to store featurized dataset.
    in_memory: bool
      If true, return in-memory NumpyDataset. Else return ImageDataset.

    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`, `labels`, and `weights`.

    """
        labels, weights = None, None
        if isinstance(inputs, tuple):
            if len(inputs) == 1:
                input_files = inputs[0]
                if isinstance(input_files, str):
                    input_files = [input_files]
            elif len(inputs) == 2:
                input_files, labels = inputs
            elif len(inputs) == 3:
                input_files, labels, weights = inputs
            else:
                raise ValueError("Input must be a tuple of length 1, 2, or 3")
        else:
            input_files = inputs
        if isinstance(input_files, str):
            input_files = [input_files]

        image_files = []
        # Sometimes zip files contain directories within. Traverse directories
        while len(input_files) > 0:
            remainder = []
            for input_file in input_files:
                filename, extension = os.path.splitext(input_file)
                extension = extension.lower()
                # TODO(rbharath): Add support for more extensions
                if os.path.isdir(input_file):
                    dirfiles = [
                        os.path.join(input_file, subfile)
                        for subfile in os.listdir(input_file)
                    ]
                    remainder += dirfiles
                elif extension == ".zip":
                    zip_dir = tempfile.mkdtemp()
                    zip_ref = zipfile.ZipFile(input_file, 'r')
                    zip_ref.extractall(path=zip_dir)
                    zip_ref.close()
                    zip_files = [
                        os.path.join(zip_dir, name)
                        for name in zip_ref.namelist()
                    ]
                    for zip_file in zip_files:
                        _, extension = os.path.splitext(zip_file)
                        extension = extension.lower()
                        if extension in [".png", ".tif"]:
                            image_files.append(zip_file)
                elif extension in [".png", ".tif"]:
                    image_files.append(input_file)
                else:
                    raise ValueError("Unsupported file format")
            input_files = remainder

        # Sort image files
        image_files = sorted(image_files)

        if in_memory:
            if data_dir is None:
                return NumpyDataset(self.load_img(image_files),
                                    y=labels,
                                    w=weights,
                                    ids=image_files)
            else:
                dataset = DiskDataset.from_numpy(self.load_img(image_files),
                                                 y=labels,
                                                 w=weights,
                                                 ids=image_files,
                                                 tasks=self.tasks,
                                                 data_dir=data_dir)
                if shard_size is not None:
                    dataset.reshard(shard_size)
                return dataset
        else:
            return ImageDataset(image_files,
                                y=labels,
                                w=weights,
                                ids=image_files)
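
A minimal sketch of driving this create_dataset() method through what is presumably DeepChem's ImageLoader (the method signature matches); the zip archive name below is a placeholder:

import deepchem as dc

# Hypothetical archive of .png images, with no labels or weights provided.
loader = dc.data.ImageLoader()
dataset = loader.create_dataset("cell_images.zip", in_memory=False)
print(len(dataset))
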
def load_gpcr(dataset_file,
              featurizer='ECFP',
              transformers=True,
              reload=True,
              sep='OnePositiveSplit',
              K=5,
              frac_train=0.8,
              frac_valid=0.1,
              frac_test=0.1):
    # frac_train/frac_valid/frac_test default to an assumed 80/10/10 split;
    # they are only used by the generic splitter branch at the end.
    #    data_dir=os.path.dirname(dataset_file)

    save_dir = os.path.join(
        os.path.dirname(dataset_file),
        '.'.join(os.path.basename(dataset_file).split('.')[:-1]), "ecfp",
        "split")
    train, valid, test = os.path.join(save_dir, 'train'), os.path.join(
        save_dir, 'valid'), os.path.join(save_dir, 'test')
    fopen = open(dataset_file, "r")
    ss = fopen.readlines()
    m = ss[0].strip('\n').split(',')
    m.remove('SMILES')
    if os.path.isdir(save_dir):
        if reload:
            dataset, train_dataset, valid_dataset, test_dataset = DiskDataset(
                data_dir=save_dir), DiskDataset(data_dir=train), DiskDataset(
                    data_dir=valid), DiskDataset(data_dir=test)
            transformers = [
                deepchem.trans.NormalizationTransformer(transform_w=True,
                                                        dataset=train_dataset)
            ]
            all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
            return m, all_dataset, transformers
    if featurizer == 'ECFP':
        featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
        featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
        featurizer = deepchem.feat.RawFeaturizer()
    elif featurizer == 'AdjacencyConv':
        featurizer = deepchem.feat.AdjacencyFingerprint(max_n_atoms=150,
                                                        max_valence=6)
    elif featurizer == 'SelfDefine':
        featurizer = deepchem.feat.UserDefinedFeaturizer(feature_field)
    loader = deepchem.data.CSVLoader(tasks=m,
                                     smiles_field="SMILES",
                                     featurizer=featurizer)
    dataset = loader.featurize(dataset_file,
                               data_dir=save_dir,
                               shard_size=8192)
    #    dataset = loader.featurize(dataset_file, shard_size=8192)
    # Initialize transformers
    if transformers:
        transformers = [
            deepchem.trans.NormalizationTransformer(transform_w=True,
                                                    dataset=dataset)
        ]
        for transformer in transformers:
            dataset = transformer.transform(dataset)
    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
        'random_stratified': deepchem.splits.RandomStratifiedSplitter(),
        'scaffold': deepchem.splits.ScaffoldSplitter(),
        'butina': deepchem.splits.ButinaSplitter(),
        'task': deepchem.splits.TaskSplitter(),
        'Harmonious_positive': Harmonious_positive(),
        'OnePositiveSplit': OnePositiveSplit()
    }
    splitter = splitters[sep]
    if sep == 'task':
        fold_datasets = splitter.k_fold_split(dataset, K)
        all_dataset = fold_datasets
    elif sep == 'Harmonious_positive':
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset)
        train_dataset = DiskDataset.from_numpy(train_dataset.X,
                                               train_dataset.y,
                                               train_dataset.w,
                                               train_dataset.ids,
                                               dataset.tasks,
                                               data_dir=train)
        valid_dataset = DiskDataset.from_numpy(valid_dataset.X,
                                               valid_dataset.y,
                                               valid_dataset.w,
                                               valid_dataset.ids,
                                               dataset.tasks,
                                               data_dir=valid)
        test_dataset = DiskDataset.from_numpy(test_dataset.X,
                                              test_dataset.y,
                                              test_dataset.w,
                                              test_dataset.ids,
                                              dataset.tasks,
                                              data_dir=test)
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
    elif sep == 'Harmonious_positive' and K:
        #        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        #                                dataset,
        #                                frac_train=frac_train,
        #                                frac_valid=0,
        #                                frac_test=1- frac_train,
        #                                )
        #        train_dataset = DiskDataset.from_numpy(train_dataset.X,train_dataset.y,train_dataset.w,train_dataset.ids,
        #                                               dataset.tasks,data_dir=train)
        #        train_dataset.reshard(8192)
        #        test_dataset  = DiskDataset.from_numpy(test_dataset.X,test_dataset.y,test_dataset.w,test_dataset.ids,
        #                                               dataset.tasks,data_dir=test)
        #        test_dataset.reshard(8192)
        #        fold_dataset = splitter.k_fold_split(
        #                train_dataset, K, directories=[os.path.join(valid,str(i)) for i in range(K)],verbose=True)
        fold_dataset = splitter.k_fold_split(
            dataset,
            K,
            directories=[os.path.join(valid, str(i)) for i in range(K)],
            verbose=True)
        folds = []
        for i in range(K):
            print('merge fold dataset {}...'.format(i))
            train_fold = DiskDataset.merge(
                [fold_dataset[j] for j in range(K) if j != i],
                merge_dir=os.path.join(valid, str(i), 'train_fold'))
            test_fold = DiskDataset.merge([fold_dataset[i]],
                                          merge_dir=os.path.join(
                                              valid, str(i), 'valid_fold'))
            folds.append([train_fold, test_fold])
        all_dataset = (dataset, [], folds, [])
    else:
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset,
            train_dir=train,
            valid_dir=valid,
            test_dir=test,
            frac_train=frac_train,
            frac_valid=frac_valid,
            frac_test=frac_test)
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)

#    else:
#        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset,train_dir=train, valid_dir=valid, test_dir=test)
#        all_dataset = (dataset,train_dataset, valid_dataset, test_dataset)
#    if reload:
#        deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,transformers)
    return m, all_dataset, transformers