Example #1
    def train_valid_test_split(self,
                               dataset,
                               train_dir=None,
                               valid_dir=None,
                               test_dir=None,
                               frac_train=.8,
                               frac_valid=.1,
                               frac_test=.1,
                               seed=None,
                               log_every_n=1000,
                               verbose=True):
        """
        Splits the dataset into train/validation/test sets.
        Returns Dataset objects.
        """
        log("Computing train/valid/test indices", self.verbose)
        train_inds, valid_inds, test_inds = self.split(dataset,
                                                       frac_train=frac_train,
                                                       frac_test=frac_test,
                                                       frac_valid=frac_valid,
                                                       log_every_n=log_every_n)
        if train_dir is None:
            train_dir = tempfile.mkdtemp()
        if valid_dir is None:
            valid_dir = tempfile.mkdtemp()
        if test_dir is None:
            test_dir = tempfile.mkdtemp()
        train_dataset = dataset.select(train_inds, train_dir)
        if frac_valid != 0:
            valid_dataset = dataset.select(valid_inds, valid_dir)
        else:
            valid_dataset = None
        test_dataset = dataset.select(test_inds, test_dir)

        return train_dataset, valid_dataset, test_dataset
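For orientation, here is a small self-contained sketch (plain NumPy, not the class's API) of the index bookkeeping that `split` and `select` perform in the method above: indices are partitioned by fraction, then each index set pulls its rows out of the data.

import numpy as np

n = 10
frac_train, frac_valid = 0.8, 0.1
indices = np.arange(n)
train_cut = int(frac_train * n)
valid_cut = int((frac_train + frac_valid) * n)
train_inds = indices[:train_cut]
valid_inds = indices[train_cut:valid_cut]
test_inds = indices[valid_cut:]

X = np.random.rand(n, 3)
X_train, X_valid, X_test = X[train_inds], X[valid_inds], X[test_inds]
print(X_train.shape, X_valid.shape, X_test.shape)  # (8, 3) (1, 3) (1, 3)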
Example #2
 def featurize(self,
               protein_file,
               pockets,
               pocket_atoms_map,
               pocket_coords,
               verbose=False):
   """
   Calculate atomic coodinates.
   """
   import mdtraj
   protein = mdtraj.load(protein_file)
   n_pockets = len(pockets)
   n_residues = len(BindingPocketFeaturizer.residues)
   res_map = dict(zip(BindingPocketFeaturizer.residues, range(n_residues)))
   all_features = torch.zeros((n_pockets, n_residues))
   for pocket_num, (pocket, coords) in enumerate(zip(pockets, pocket_coords)):
     pocket_atoms = pocket_atoms_map[pocket]
     for ind, atom in enumerate(pocket_atoms):
       atom_name = str(protein.top.atom(atom))
       # atom_name is of format RESX-ATOMTYPE
       # where X is a 1 to 4 digit number
       residue = atom_name[:3]
       if residue not in res_map:
         log("Warning: Non-standard residue in PDB file", verbose)
         continue
        atomtype = atom_name.split("-")[1]  # parsed but unused; only residue counts are recorded
       all_features[pocket_num, res_map[residue]] += 1
   return all_features
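The core of the featurizer is the residue-count accumulation. Below is a self-contained illustration of that idea using plain NumPy and made-up atom names (not mdtraj output); the residue list stands in for BindingPocketFeaturizer.residues.

import numpy as np

residues = ["ALA", "GLY", "SER"]  # stand-in for BindingPocketFeaturizer.residues
res_map = {res: i for i, res in enumerate(residues)}
pocket_atom_names = ["ALA1-CA", "ALA1-CB", "GLY2-CA", "XYZ3-CA"]  # RESX-ATOMTYPE format

features = np.zeros(len(residues))
for atom_name in pocket_atom_names:
    residue = atom_name[:3]
    if residue not in res_map:
        continue  # non-standard residue, as in the warning above
    features[res_map[residue]] += 1
print(features)  # [2. 1. 0.]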
Example #3
    def compute_model_performance(self,
                                  metrics,
                                  csv_out=None,
                                  stats_out=None,
                                  per_task_metrics=False):
        """
    Computes statistics of model on test data and saves results to csv.
    Parameters
    ----------
    metrics: list
      List of Pytorch.Chemistry.metrics.Metric objects
    csv_out: str, optional
      Filename to write CSV of model predictions.
    stats_out: str, optional
      Filename to write computed statistics.
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
    """
        y = self.dataset.y
        y = undo_transforms(y, self.output_transformers)
        w = self.dataset.w

        if not len(metrics):
            return {}
        else:
            mode = metrics[0].mode
        y_pred = self.model.predict(self.dataset, self.output_transformers)
        if mode == "classification":
            y_pred_print = np.argmax(y_pred, -1)
        else:
            y_pred_print = y_pred
        multitask_scores = {}
        all_task_scores = {}

        if csv_out is not None:
            log("Saving predictions to %s" % csv_out, self.verbose)
            self.output_predictions(y_pred_print, csv_out)

        # Compute multitask metrics
        for metric in metrics:
            if per_task_metrics:
                multitask_scores[
                    metric.name], computed_metrics = metric.compute_metric(
                        y, y_pred, w, per_task_metrics=True)
                all_task_scores[metric.name] = computed_metrics
            else:
                multitask_scores[metric.name] = metric.compute_metric(
                    y, y_pred, w, per_task_metrics=False)

        if stats_out is not None:
            log("Saving stats to %s" % stats_out, self.verbose)
            self.output_statistics(multitask_scores, stats_out)

        if not per_task_metrics:
            return multitask_scores
        else:
            return multitask_scores, all_task_scores
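The per-task versus aggregate scoring pattern above can be illustrated standalone. The sketch below uses a plain masked-error function in place of the Metric objects; the weight matrix `w` marks which labels are present, exactly as in the snippet.

import numpy as np

def masked_mae(y, y_pred, w):
    """Mean absolute error over entries with nonzero weight."""
    mask = w != 0
    return np.abs(y[mask] - y_pred[mask]).mean()

y = np.array([[1.0, 0.0], [2.0, 3.0]])
y_pred = np.array([[1.5, 0.0], [2.0, 2.0]])
w = np.array([[1.0, 0.0], [1.0, 1.0]])  # second task has one missing label

per_task = [float(masked_mae(y[:, t], y_pred[:, t], w[:, t])) for t in range(y.shape[1])]
print(per_task, float(np.mean(per_task)))  # [0.25, 1.0] 0.625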
Example #4
 def fit(self, dataset, nb_epoch=10, batch_size=50, **kwargs):
   """
   Fits a model on data in a Dataset object.
   """
   # TODO(rbharath/enf): We need a structured way to deal with potential GPU
   #                     memory overflows.
   for epoch in range(nb_epoch):
     log("Starting epoch %s" % str(epoch + 1), self.verbose)
     losses = []
     for (X_batch, y_batch, w_batch,
          ids_batch) in dataset.iterbatches(batch_size):
       losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
     log("Avg loss for epoch %d: %f" % (epoch + 1, np.array(losses).mean()),
         self.verbose)
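The fit loop above is a standard epoch/minibatch pattern. A generic, runnable version is sketched below; `iterbatches` and `fit_on_batch` here are stand-ins, not the actual Dataset/Model API.

import numpy as np

def iterbatches(X, y, batch_size):
    # Yield successive (X, y) minibatches.
    for start in range(0, len(X), batch_size):
        yield X[start:start + batch_size], y[start:start + batch_size]

def fit_on_batch(X_batch, y_batch):
    # Dummy "loss" in place of a real gradient step.
    return float(np.mean((X_batch.sum(axis=1) - y_batch) ** 2))

X, y = np.random.rand(100, 4), np.random.rand(100)
for epoch in range(2):
    losses = [fit_on_batch(xb, yb) for xb, yb in iterbatches(X, y, batch_size=50)]
    print("Avg loss for epoch %d: %f" % (epoch + 1, np.mean(losses)))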
Example #5
 def __init__(self, tasks, model_builder, model_dir=None, verbose=True):
     super(SingletaskToMultitask, self).__init__(self,
                                                 model_dir=model_dir,
                                                 verbose=verbose)
     self.tasks = tasks
     self.task_model_dirs = {}
     self.model_builder = model_builder
     log("About to initialize singletask to multitask model", self.verbose)
     for task in self.tasks:
         task_model_dir = os.path.join(self.model_dir, str(task))
         if not os.path.exists(task_model_dir):
             os.makedirs(task_model_dir)
         log("Initializing directory for task %s" % task, self.verbose)
         self.task_model_dirs[task] = task_model_dir
Example #6
 def fit(self, dataset, **kwargs):
     """
 Updates all singletask models with new information.
 Warning: This current implementation is only functional for sklearn models.
 """
     if not isinstance(dataset, DiskDataset):
         raise ValueError(
             'SingletaskToMultitask only works with DiskDatasets')
     log("About to create task-specific datasets", self.verbose)
     task_datasets = self._create_task_datasets(dataset)
     for ind, task in enumerate(self.tasks):
         log("Fitting model for task %s" % task, self.verbose)
         task_model = self.model_builder(self.task_model_dirs[task])
         task_model.fit(task_datasets[ind], **kwargs)
         task_model.save()
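The two snippets above share a simple pattern: one directory and one model per task. The standalone sketch below shows just the directory bookkeeping (os/tempfile only; no model library assumed).

import os
import tempfile

tasks = ["task_a", "task_b"]
model_dir = tempfile.mkdtemp()
task_model_dirs = {}
for task in tasks:
    # One subdirectory per task, created under the shared model directory.
    task_model_dir = os.path.join(model_dir, task)
    os.makedirs(task_model_dir, exist_ok=True)
    task_model_dirs[task] = task_model_dir
print(task_model_dirs)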
Example #7
 def _create_task_datasets(self, dataset):
     """Make directories to hold data for tasks"""
     task_data_dirs = []
     for task in self.tasks:
         task_data_dir = os.path.join(self.model_dir, str(task) + "_data")
         if os.path.exists(task_data_dir):
             shutil.rmtree(task_data_dir)
         os.makedirs(task_data_dir)
         task_data_dirs.append(task_data_dir)
     task_datasets = self._to_singletask(dataset, task_data_dirs)
     for task, task_dataset in zip(self.tasks, task_datasets):
         log(
             "Dataset for task %s has shape %s" %
             (task, str(task_dataset.get_shape())), self.verbose)
     return task_datasets
Example #8
    def k_fold_split(self,
                     dataset,
                     k,
                     directories=None,
                     seed=None,
                     log_every_n=None,
                     **kwargs):
        """
        Splits compounds into k-folds using stratified sampling.
        Overriding base class k_fold_split.
        Parameters
        ----------
        dataset: dc.data.Dataset object
          Dataset.
        k: int
          Number of folds.
        seed: int (Optional, Default None)
          Random seed.
        log_every_n: int (Optional, Default None)
          Log every n examples (not currently used).
        Returns
        -------
        fold_datasets: List
          List containing dc.data.Dataset objects
        """
        log("Computing K-fold split", self.verbose)
        if directories is None:
            directories = [tempfile.mkdtemp() for _ in range(k)]
        else:
            assert len(directories) == k

        y_s = dataset.y[:, self.task_number]
        sortidx = np.argsort(y_s)
        sortidx_list = np.array_split(sortidx, k)

        fold_datasets = []
        for fold in range(k):
            fold_dir = directories[fold]
            fold_ind = sortidx_list[fold]
            fold_dataset = dataset.select(fold_ind, fold_dir)
            fold_datasets.append(fold_dataset)
        return fold_datasets
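The fold construction above boils down to an argsort followed by array_split. A self-contained illustration of those mechanics: examples are ordered by their y value and the sorted order is cut into k chunks, each chunk becoming one fold.

import numpy as np

y = np.array([5.0, 1.0, 3.0, 9.0, 2.0, 7.0])
k = 3
sortidx = np.argsort(y)  # indices ordered by target value
fold_indices = np.array_split(sortidx, k)
for fold, inds in enumerate(fold_indices):
    print("fold %d:" % fold, inds, y[inds])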
Example #9
 def k_fold_split(self, dataset, k, directories=None, **kwargs):
     """Needs custom implementation due to ragged splits for stratification."""
     log("Computing K-fold split", self.verbose)
     if directories is None:
         directories = [tempfile.mkdtemp() for _ in range(k)]
     else:
         assert len(directories) == k
     fold_datasets = []
     # rem_dataset is remaining portion of dataset
     rem_dataset = dataset
     for fold in range(k):
         # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
         # to k-1.
         frac_fold = 1. / (k - fold)
         fold_dir = directories[fold]
         rem_dir = tempfile.mkdtemp()
         fold_dataset, rem_dataset = self.split(rem_dataset, frac_fold,
                                                [fold_dir, rem_dir])
         fold_datasets.append(fold_dataset)
     return fold_datasets
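A quick check of the 1 / (k - fold) fraction used above: taking that fraction of the remaining data at each step yields k equal-sized folds.

k, n_remaining = 5, 100
for fold in range(k):
    frac_fold = 1.0 / (k - fold)
    fold_size = round(frac_fold * n_remaining)
    n_remaining -= fold_size
    print("fold %d: frac=%.2f size=%d" % (fold, frac_fold, fold_size))  # every fold has size 20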
Example #10
    def _to_singletask(dataset, task_dirs):
        """Transforms a multitask dataset to a collection of singletask datasets."""
        tasks = dataset.get_task_names()
        assert len(tasks) == len(task_dirs)
        log("Splitting multitask dataset into singletask datasets",
            dataset.verbose)
        task_datasets = [
            DiskDataset.create_dataset([], task_dirs[task_num], [task])
            for (task_num, task) in enumerate(tasks)
        ]
        #task_metadata_rows = {task: [] for task in tasks}
        for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
            log("Processing shard %d" % shard_num, dataset.verbose)
            basename = "dataset-%d" % shard_num
            for task_num, task in enumerate(tasks):
                log("\tTask %s" % task, dataset.verbose)
                w_task = w[:, task_num]
                y_task = y[:, task_num]

                # Extract those datapoints which are present for this task
                X_nonzero = X[w_task != 0]
                num_datapoints = X_nonzero.shape[0]
                y_nonzero = np.reshape(y_task[w_task != 0],
                                       (num_datapoints, 1))
                w_nonzero = np.reshape(w_task[w_task != 0],
                                       (num_datapoints, 1))
                ids_nonzero = ids[w_task != 0]

                task_datasets[task_num].add_shard(X_nonzero, y_nonzero,
                                                  w_nonzero, ids_nonzero)

        return task_datasets
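The core masking step from _to_singletask, shown standalone: keep only the rows where the task's weight is nonzero, reshaping y and w to single columns.

import numpy as np

X = np.arange(8).reshape(4, 2)
y = np.array([[1.0, 0.0], [2.0, 5.0], [3.0, 0.0], [4.0, 6.0]])
w = np.array([[1.0, 0.0], [1.0, 1.0], [1.0, 0.0], [0.0, 1.0]])

task_num = 1
w_task, y_task = w[:, task_num], y[:, task_num]
X_nonzero = X[w_task != 0]
y_nonzero = y_task[w_task != 0].reshape(-1, 1)
w_nonzero = w_task[w_task != 0].reshape(-1, 1)
print(X_nonzero, y_nonzero.ravel())  # rows 1 and 3 only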
Example #11
 def featurize_complexes(self,
                         mol_pdbs,
                         protein_pdbs,
                         verbose=True,
                         log_every_n=1000):
     """
 Calculate features for mol/protein complexes.
 Parameters
 ----------
 mol_pdbs: list
   List of PDBs for molecules. Each PDB should be a list of lines of the
   PDB file.
 protein_pdbs: list
   List of PDBs for proteins. Each PDB should be a list of lines of the
   PDB file.
 """
     features = []
     for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs,
                                                    protein_pdbs)):
         if verbose and i % log_every_n == 0:
             log("Featurizing %d / %d" % (i, len(mol_pdbs)))
         features.append(self._featurize_complex(mol_pdb, protein_pdb))
     features = np.asarray(features)
     return features
Example #12
 def split(self,
           dataset,
           frac_train=.8,
           frac_valid=.1,
           frac_test=.1,
           log_every_n=1000):
     """
     Splits internal compounds into train/validation/test by scaffold.
     """
     np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
     scaffolds = {}
     log("About to generate scaffolds", self.verbose)
     data_len = len(dataset)
     for ind, smiles in enumerate(dataset.ids):
         if ind % log_every_n == 0:
             log("Generating scaffold %d/%d" % (ind, data_len),
                 self.verbose)
         scaffold = generate_scaffold(smiles)
         if scaffold not in scaffolds:
             scaffolds[scaffold] = [ind]
         else:
             scaffolds[scaffold].append(ind)
     # Sort from largest to smallest scaffold sets
     scaffolds = {key: sorted(value) for key, value in scaffolds.items()}
     scaffold_sets = [
         scaffold_set
         for (scaffold,
              scaffold_set) in sorted(scaffolds.items(),
                                      key=lambda x: (len(x[1]), x[1][0]),
                                      reverse=True)
     ]
     train_cutoff = frac_train * len(dataset)
     valid_cutoff = (frac_train + frac_valid) * len(dataset)
     train_inds, valid_inds, test_inds = [], [], []
     log("About to sort in scaffold sets", self.verbose)
     for scaffold_set in scaffold_sets:
         if len(train_inds) + len(scaffold_set) > train_cutoff:
             if len(train_inds) + len(valid_inds) + len(
                     scaffold_set) > valid_cutoff:
                 test_inds += scaffold_set
             else:
                 valid_inds += scaffold_set
         else:
             train_inds += scaffold_set
     return train_inds, valid_inds, test_inds
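The assignment loop above is a greedy group split: whole scaffold sets go into train until the train cutoff would be exceeded, then into valid, then into test, so no scaffold is split across subsets. A self-contained trace with toy "scaffold sets":

scaffold_sets = [[0, 1, 2, 3, 4], [5, 6, 7], [8], [9]]
n = 10
train_cutoff, valid_cutoff = 0.8 * n, 0.9 * n
train_inds, valid_inds, test_inds = [], [], []
for scaffold_set in scaffold_sets:
    if len(train_inds) + len(scaffold_set) > train_cutoff:
        if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
            test_inds += scaffold_set
        else:
            valid_inds += scaffold_set
    else:
        train_inds += scaffold_set
print(train_inds, valid_inds, test_inds)  # train=[0..7], valid=[8], test=[9]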
Example #13
    def k_fold_split(self, dataset, k, directories=None, **kwargs):
        """
    Parameters
    ----------
    dataset: Dataset
    Dataset to do a k-fold split
    k: int
    number of folds
    directories: list of str
    list of length 2*k filepaths to save the result disk-datasets
    kwargs
    Returns
    -------
    list of length k tuples of (train, cv)
    """
        """
    :param dataset:
    :param k:
    :param directories:
    :param kwargs:
    :return: list of length k tuples of (train, cv)
    """
        log("Computing K-fold split", self.verbose)
        if directories is None:
            directories = [tempfile.mkdtemp() for _ in range(2 * k)]
        else:
            assert len(directories) == 2 * k
        cv_datasets = []
        train_ds_base = None
        train_datasets = []
        # rem_dataset is remaining portion of dataset
        if isinstance(dataset, DiskDataset):
            rem_dataset = dataset
        else:
            rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y,
                                                 dataset.w, dataset.ids)
        for fold in range(k):
            # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
            # to k-1.
            frac_fold = 1. / (k - fold)
            train_dir, cv_dir = directories[2 * fold], directories[2 * fold +
                                                                   1]
            fold_inds, rem_inds, _ = self.split(rem_dataset,
                                                frac_train=frac_fold,
                                                frac_valid=1 - frac_fold,
                                                frac_test=0)
            cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
            cv_datasets.append(cv_dataset)
            rem_dataset = rem_dataset.select(rem_inds)

            train_ds_to_merge = filter(lambda x: x is not None,
                                       [train_ds_base, rem_dataset])
            train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
            train_dataset = DiskDataset.merge(train_ds_to_merge,
                                              merge_dir=train_dir)
            train_datasets.append(train_dataset)

            update_train_base_merge = filter(lambda x: x is not None,
                                             [train_ds_base, cv_dataset])
            train_ds_base = DiskDataset.merge(update_train_base_merge)
        return list(zip(train_datasets, cv_datasets))
Example #14
    def hyperparam_search(self,
                          params_dict,
                          train_dataset,
                          valid_dataset,
                          output_transformers,
                          metric,
                          use_max=True,
                          logdir=None):
        """Perform hyperparams search according to params_dict.
    Each key to hyperparams_dict is a model_param. The values should be a list
    of potential values for that hyperparam.
    TODO(rbharath): This shouldn't be stored in a temporary directory.
    """
        hyperparams = params_dict.keys()
        hyperparam_vals = params_dict.values()
        for hyperparam_list in params_dict.values():
            assert isinstance(hyperparam_list, collections.abc.Iterable)

        number_combinations = reduce(mul,
                                     [len(vals) for vals in hyperparam_vals])

        if use_max:
            best_validation_score = -np.inf
        else:
            best_validation_score = np.inf
        best_hyperparams = None
        best_model, best_model_dir = None, None
        all_scores = {}
        for ind, hyperparameter_tuple in enumerate(
                itertools.product(*hyperparam_vals)):
            model_params = {}
            log("Fitting model %d/%d" % (ind + 1, number_combinations),
                self.verbose)
            for hyperparam, hyperparam_val in zip(hyperparams,
                                                  hyperparameter_tuple):
                model_params[hyperparam] = hyperparam_val
            log("hyperparameters: %s" % str(model_params), self.verbose)

            if logdir is not None:
                model_dir = os.path.join(logdir, str(ind))
                log("model_dir is %s" % model_dir, self.verbose)
                try:
                    os.makedirs(model_dir)
                except OSError:
                    if not os.path.isdir(model_dir):
                        log(
                            "Error creating model_dir, using tempfile directory",
                            self.verbose)
                        model_dir = tempfile.mkdtemp()
            else:
                model_dir = tempfile.mkdtemp()

            model = self.model_class(model_params, model_dir)
            model.fit(train_dataset, **model_params)
            model.save()

            evaluator = Evaluator(model, valid_dataset, output_transformers)
            multitask_scores = evaluator.compute_model_performance([metric])
            valid_score = multitask_scores[metric.name]
            all_scores[str(hyperparameter_tuple)] = valid_score

            if (use_max and valid_score >= best_validation_score) or (
                    not use_max and valid_score <= best_validation_score):
                best_validation_score = valid_score
                best_hyperparams = hyperparameter_tuple
                if best_model_dir is not None:
                    shutil.rmtree(best_model_dir)
                best_model_dir = model_dir
                best_model = model
            else:
                shutil.rmtree(model_dir)

            log(
                "Model %d/%d, Metric %s, Validation set %s: %f" %
                (ind + 1, number_combinations, metric.name, ind, valid_score),
                self.verbose)
            log("\tbest_validation_score so far: %f" % best_validation_score,
                self.verbose)
        if best_model is None:
            log("No models trained correctly.", self.verbose)
            # arbitrarily return last model
            best_model, best_hyperparams = model, hyperparameter_tuple
            return best_model, best_hyperparams, all_scores
        train_evaluator = Evaluator(best_model, train_dataset,
                                    output_transformers)
        multitask_scores = train_evaluator.compute_model_performance([metric])
        train_score = multitask_scores[metric.name]
        log("Best hyperparameters: %s" % str(best_hyperparams), self.verbose)
        log("train_score: %f" % train_score, self.verbose)
        log("validation_score: %f" % best_validation_score, self.verbose)
        return best_model, best_hyperparams, all_scores
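At its heart, the search above is a grid enumeration over itertools.product of the per-parameter value lists, keeping the best-scoring combination. A standalone sketch (the scoring function is a stand-in for train-plus-evaluate on the validation set):

import itertools
from functools import reduce
from operator import mul

params_dict = {"learning_rate": [1e-3, 1e-4], "batch_size": [32, 64]}
hyperparams = list(params_dict.keys())
hyperparam_vals = list(params_dict.values())
number_combinations = reduce(mul, [len(vals) for vals in hyperparam_vals])

def dummy_score(model_params):
    # Stand-in for fitting a model and scoring it on the validation set.
    return -model_params["learning_rate"] * model_params["batch_size"]

best_score, best_params = -float("inf"), None
for ind, tup in enumerate(itertools.product(*hyperparam_vals)):
    model_params = dict(zip(hyperparams, tup))
    score = dummy_score(model_params)
    print("Model %d/%d: %s -> %f" % (ind + 1, number_combinations, model_params, score))
    if score >= best_score:
        best_score, best_params = score, model_params
print("Best:", best_params, best_score)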