Example #1
    def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
        """Creates a new DiskDataset

        Parameters
        ----------
        shard_generator: Iterable
          An iterable (either a list or generator) that provides tuples of data
          (X, y, w, ids). Each tuple will be written to a separate shard on disk.
        data_dir: str
          Filename for data directory. Creates a temp directory if none specified.
        tasks: list
          List of tasks for this dataset.
        verbose: bool, optional
          If True, log timing information during dataset construction.

        Returns
        -------
        A DiskDataset backed by the shards written to data_dir.
        """
        if data_dir is None:
            data_dir = tempfile.mkdtemp()
        elif not os.path.exists(data_dir):
            os.makedirs(data_dir)

        metadata_rows = []
        time1 = time.time()
        for shard_num, (X, y, w, ids) in enumerate(shard_generator):
            basename = "shard-%d" % shard_num
            metadata_rows.append(
                DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y,
                                               w, ids))
        metadata_df = DiskDataset._construct_metadata(metadata_rows)
        save_metadata(tasks, metadata_df, data_dir)
        time2 = time.time()
        log("TIMING: dataset construction took %0.3f s" % (time2 - time1),
            verbose)
        return DiskDataset(data_dir, verbose=verbose)
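A minimal usage sketch for the example above, assuming the DeepChem-style DiskDataset.create_dataset shown here and toy NumPy arrays (all names and shapes below are placeholders):

    import numpy as np

    # Toy data: 100 samples, 16 features, 2 tasks.
    X = np.random.rand(100, 16)
    y = np.random.randint(2, size=(100, 2))
    w = np.ones((100, 2))
    ids = np.arange(100)

    def shard_generator(shard_size=50):
        # Yield (X, y, w, ids) tuples; each tuple becomes one shard on disk.
        for start in range(0, len(X), shard_size):
            stop = start + shard_size
            yield X[start:stop], y[start:stop], w[start:stop], ids[start:stop]

    dataset = DiskDataset.create_dataset(shard_generator(),
                                         tasks=["task0", "task1"])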
Example #2
    def featurize_complexes(self,
                            mol_pdbs,
                            protein_pdbs,
                            verbose=True,
                            log_every_n=1000):
        """
        Calculate features for mol/protein complexes.

        Parameters
        ----------
        mol_pdbs: list
          List of PDBs for molecules. Each PDB should be a list of lines of the
          PDB file.
        protein_pdbs: list
          List of PDBs for proteins. Each PDB should be a list of lines of the
          PDB file.
        verbose: bool, optional
          If True, log progress every log_every_n complexes.
        log_every_n: int, optional
          How often (in number of complexes) to log progress.

        Returns
        -------
        np.ndarray of features, one entry per mol/protein complex.
        """
        features = []
        for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs,
                                                       protein_pdbs)):
            if verbose and i % log_every_n == 0:
                log("Featurizing %d / %d" % (i, len(mol_pdbs)))
            features.append(self._featurize_complex(mol_pdb, protein_pdb))
        features = np.asarray(features)
        return features
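A hedged usage sketch: SomeComplexFeaturizer stands in for any hypothetical subclass implementing _featurize_complex, and the PDB paths are placeholders.

    def read_pdb_lines(path):
        # The docstring above expects each PDB as a list of file lines.
        with open(path) as f:
            return f.readlines()

    mol_pdbs = [read_pdb_lines("ligand_0.pdb"), read_pdb_lines("ligand_1.pdb")]
    protein_pdbs = [read_pdb_lines("protein_0.pdb"),
                    read_pdb_lines("protein_1.pdb")]

    featurizer = SomeComplexFeaturizer()  # hypothetical concrete subclass
    features = featurizer.featurize_complexes(mol_pdbs, protein_pdbs)
    print(features.shape)  # one feature entry per mol/protein pair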
Example #3
    def __init__(self, data_dir, verbose=True):
        """
        Turns featurized dataframes into numpy files, writes them & metadata to disk.
        """
        self.data_dir = data_dir
        self.verbose = verbose

        log("Loading dataset from disk.", self.verbose)
        self.tasks, self.metadata_df = self.load_metadata()
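A short usage sketch, assuming data_dir already contains the shards and metadata written by create_dataset (the path is a placeholder):

    # Reload a dataset previously written to disk.
    dataset = DiskDataset("/path/to/existing/data_dir", verbose=True)
    print(dataset.tasks)             # task names restored from metadata
    print(len(dataset.metadata_df))  # one row per shard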
Example #4
    def sparse_shuffle(self):
        """Shuffling that exploits data sparsity to shuffle large datasets.

        Only for 1-dimensional feature vectors (does not work for tensorial
        featurizations).
        """
        time1 = time.time()
        shard_size = self.get_shard_size()
        num_shards = self.get_number_shards()
        X_sparses, ys, ws, ids = [], [], [], []
        num_features = None
        for i in range(num_shards):
            (X_s, y_s, w_s, ids_s) = self.get_shard(i)
            if num_features is None:
                num_features = X_s.shape[1]
            X_sparse = sparsify_features(X_s)
            X_sparses.append(X_sparse)
            ys.append(y_s)
            ws.append(w_s)
            ids.append(np.atleast_1d(np.squeeze(ids_s)))
        # Get full dataset in memory
        (X_sparse, y, w, ids) = (np.vstack(X_sparses), np.vstack(ys),
                                 np.vstack(ws), np.concatenate(ids))
        # Shuffle in memory
        num_samples = len(X_sparse)
        permutation = np.random.permutation(num_samples)
        X_sparse, y, w, ids = (X_sparse[permutation], y[permutation],
                               w[permutation], ids[permutation])
        # Write shuffled shards out to disk
        for i in range(num_shards):
            start, stop = i * shard_size, (i + 1) * shard_size
            (X_sparse_s, y_s, w_s,
             ids_s) = (X_sparse[start:stop], y[start:stop], w[start:stop],
                       ids[start:stop])
            X_s = densify_features(X_sparse_s, num_features)
            self.set_shard(i, X_s, y_s, w_s, ids_s)
        time2 = time.time()
        log("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1),
            self.verbose)
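A minimal sketch combining this with Example #1; as the docstring notes, it only applies to 1-D feature vectors (data and names below are placeholders):

    import numpy as np

    # Mostly-zero 1-D feature vectors, so sparsification is worthwhile.
    X = np.random.rand(1000, 32)
    X[X < 0.9] = 0.0
    y = np.random.rand(1000, 1)
    w = np.ones((1000, 1))
    ids = np.arange(1000)

    def shards(shard_size=250):
        for start in range(0, len(X), shard_size):
            stop = start + shard_size
            yield X[start:stop], y[start:stop], w[start:stop], ids[start:stop]

    dataset = DiskDataset.create_dataset(shards(), tasks=["task0"])
    dataset.sparse_shuffle()  # shuffles samples across all shards, in place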
Example #5
    def compute_metric(self,
                       y_true,
                       y_pred,
                       w=None,
                       n_classes=2,
                       filter_nans=False,
                       per_task_metrics=False,
                       plot=False,
                       all_metrics=None,
                       is_training_set=False,
                       no_concordance_index=True,
                       tasks=None,
                       model_name=None,
                       verbose=False):
        """Compute a performance metric for each task.

        Parameters
        ----------
        y_true: np.ndarray
          An np.ndarray containing true values for each task.
        y_pred: np.ndarray
          An np.ndarray containing predicted values for each task.
        w: np.ndarray, optional
          An np.ndarray containing weights for each datapoint.
        n_classes: int, optional
          Number of classes in data for classification tasks.
        filter_nans: bool, optional
          Remove NaN values in computed metrics
        per_task_metrics: bool, optional
          If true, return computed metric for each task on multitask dataset.
        plot: bool, optional
          If true, generate plots while computing per-task metric values.
        all_metrics: list, optional
          Additional metrics passed through to the per-task computation.
        is_training_set: bool, optional
          Whether y_true/y_pred come from the training set.
        no_concordance_index: bool, optional
          If true, skip the concordance index computation.
        tasks: list, optional
          Task names; if given, its length must match the number of tasks.
        model_name: str, optional
          Model name used when plotting and logging.
        verbose: bool, optional
          If true, log the computed metrics and evaluation time.

        Returns
        -------
        An np.ndarray containing metric values for each task.
        """

        if len(y_true.shape) > 1:
            n_samples, n_tasks = y_true.shape[0], y_true.shape[1]
        else:
            n_samples, n_tasks = y_true.shape[0], 1
        if tasks is not None:
            assert len(tasks) == n_tasks
        if self.mode == "classification":
            y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
        else:
            y_pred = np.reshape(y_pred, (n_samples, n_tasks))
        y_true = np.reshape(y_true, (n_samples, n_tasks))
        if w is None or len(w) == 0:
            w = np.ones_like(y_true)
        assert y_true.shape[0] == y_pred.shape[0] == w.shape[0]
        computed_metrics = []
        excluded_single_tasks_dict = {}
        task_name_to_metric_value = {}
        time_start = time.time()

        # WARNING: the following block mirrors a similar block in driver.py.
        # Changing one without updating the other can cause inconsistencies,
        # so keep the two in sync.
        excluded_metatasks_dict = {}
        metatask_to_task = {}
        taskind_to_metatask = {}
        meta_task_list = []
        aggregated_task_names = copy.deepcopy(tasks)
        if len(self.aggregate_list) > 0:
            assert tasks is not None
            for meta_task_name in self.aggregate_list:
                for i, task_name in enumerate(tasks):
                    if not re.search(meta_task_name, task_name, re.I):
                        continue
                    # Only construct the corresponding entry when there are truly such hits.
                    if meta_task_name not in metatask_to_task:
                        metatask_to_task[meta_task_name] = []
                        aggregated_task_names.append(meta_task_name)
                    if i not in taskind_to_metatask:
                        taskind_to_metatask[i] = meta_task_name
                    if meta_task_name not in meta_task_list:
                        meta_task_list.append(meta_task_name)
                    aggregated_task_names.remove(task_name)
                    pair = (i, task_name)
                    metatask_to_task[meta_task_name].append(pair)

        n_aggregated_tasks = len(aggregated_task_names)
        do_aggregation = len(taskind_to_metatask) > 0

        # These counters are used by the calls below even when arithmetic_mean
        # is disabled, so initialize them unconditionally.
        total_datapoints = 0
        num_observations = []
        aggregated_num_obs = {
            meta_task_name: 0
            for meta_task_name in metatask_to_task
        }
        if self.arithmetic_mean:
            # assert self.weighted_metric_of_each_endpoint
            for task in range(n_tasks):
                w_task = w[:, task]
                this_datapoints = sum(w_task != 0)
                num_observations.append(this_datapoints)
                total_datapoints += this_datapoints
                if not do_aggregation:
                    continue
                if task in taskind_to_metatask:
                    meta_task_name = taskind_to_metatask[task]
                    aggregated_num_obs[meta_task_name] += this_datapoints

        self.get_metric_values_for_tasks(
            excluded_single_tasks_dict,
            computed_metrics,
            tasks,
            y_true,
            y_pred,
            w,
            plot,
            taskind_to_metatask=taskind_to_metatask,
            num_observations=num_observations,
            all_metrics=all_metrics,
            is_training_set=is_training_set,
            no_concordance_index=no_concordance_index,
            model_name=model_name,
            do_aggregation=do_aggregation,
            task_name_to_metric_value=task_name_to_metric_value)

        # Compute the metric values for the aggregated meta-tasks.
        if do_aggregation:
            self.plot_and_get_metric_values_for_metatasks(
                meta_task_list,
                metatask_to_task,
                computed_metrics,
                y_true,
                y_pred,
                w,
                plot=plot,
                all_metrics=all_metrics,
                is_training_set=is_training_set,
                no_concordance_index=no_concordance_index,
                model_name=model_name,
                excluded_metatasks_dict=excluded_metatasks_dict,
                aggregated_num_obs=aggregated_num_obs,
                task_name_to_metric_value=task_name_to_metric_value,
                num_observations=num_observations)

        # Sanity check: computed_metrics must be ordered consistently with
        # aggregated_task_names.
        for i, task_name in enumerate(aggregated_task_names):
            metric_val = task_name_to_metric_value[task_name]
            np.testing.assert_almost_equal(metric_val,
                                           computed_metrics[i],
                                           decimal=5)

        weighted_metrics = []
        # This block populates the weighted_metrics.
        if self.arithmetic_mean:
            # TODO: we should extract this block as a function.
            # def fill_weighted_metrics_list(self, weighted_metrics, total_datapoints, num_observations):
            excluded_datapoints = 0
            for task_ind in excluded_single_tasks_dict:
                excluded_datapoints += excluded_single_tasks_dict[task_ind]
            for meta_task_name in excluded_metatasks_dict:
                excluded_datapoints += excluded_metatasks_dict[meta_task_name]

            total_datapoints -= excluded_datapoints
            sum_coefficient = 0
            included_n_tasks = sum(np.invert(np.isnan(computed_metrics)))
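            # Each included task gets weight included_n_tasks * n_obs / total_datapoints;
            # the weights sum to included_n_tasks (asserted below), so averaging
            # weighted_metrics gives an observation-weighted mean over included tasks.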
            for task in range(n_tasks):
                # This block only processes the tasks that are not aggregated.
                if task in taskind_to_metatask:
                    continue
                if task in excluded_single_tasks_dict:
                    weighted_metrics.append(np.nan)
                    continue
                task_coefficient = included_n_tasks * num_observations[
                    task] / total_datapoints
                sum_coefficient += task_coefficient
                weighted_metrics.append(task_coefficient *
                                        computed_metrics[task])

            # Now deal with aggregated tasks.
            for meta_task_name in meta_task_list:
                if meta_task_name in excluded_metatasks_dict:
                    weighted_metrics.append(np.nan)
                    continue
                num_obs = aggregated_num_obs[meta_task_name]
                task_coefficient = included_n_tasks * num_obs / total_datapoints
                sum_coefficient += task_coefficient
                weighted_metrics.append(
                    task_coefficient *
                    task_name_to_metric_value[meta_task_name])

            np.testing.assert_almost_equal(sum_coefficient,
                                           included_n_tasks,
                                           decimal=6)

        time_end = time.time()

        if not self.arithmetic_mean:
            weighted_metrics = copy.deepcopy(computed_metrics)
            assert not do_aggregation
            # if do_aggregation:
            #   warnings.warn("Aggregating in metric %s yet not using arithmetic mean. " % (self.metric.__name__))
            #   # TODO: calculating the simple average of the aggregated tasks

        if verbose:
            log(
                "computed_metrics %s: %s" %
                (self.metric.__name__, str(computed_metrics)), self.verbose)
            task_names = aggregated_task_names if do_aggregation else tasks
            log("corresponding to tasks: %s" % (str(task_names)), self.verbose)
            print("time spent on evaluation: ", time_end - time_start)

        if n_tasks == 1:
            computed_metrics = computed_metrics[0]
        if not self.is_multitask:
            return computed_metrics

        if filter_nans:
            computed_metrics = np.array(computed_metrics)
            computed_metrics = computed_metrics[~np.isnan(computed_metrics)]
        if self.compute_energy_metric:
            # TODO(rbharath, joegomes): What is this magic number?
            force_error = self.task_averager(
                computed_metrics[1:]) * 4961.47596096
            print("Force error (metric: np.mean(%s)): %f kJ/mol/A" %
                  (self.name, force_error))
            return computed_metrics[0]
        if not per_task_metrics:
            return self.task_averager(weighted_metrics)
        if len(self.aggregate_list) > 0:
            assert do_aggregation
        return self.task_averager(weighted_metrics), computed_metrics
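A hedged usage sketch for compute_metric. The Metric constructor below is an assumption about the class this method belongs to; only the compute_metric call itself follows the code above, and the task names are placeholders.

    import numpy as np
    from sklearn.metrics import roc_auc_score

    # Hypothetical two-task classification evaluation.
    n_samples, n_tasks, n_classes = 200, 2, 2
    y_true = np.random.randint(n_classes, size=(n_samples, n_tasks))
    y_pred = np.random.rand(n_samples, n_tasks, n_classes)
    y_pred /= y_pred.sum(axis=2, keepdims=True)  # per-task class probabilities

    metric = Metric(roc_auc_score, mode="classification")  # assumed constructor
    mean_score, per_task_scores = metric.compute_metric(
        y_true, y_pred, n_classes=n_classes,
        per_task_metrics=True, tasks=["task0", "task1"])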