def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
    """Creates a new DiskDataset.

    Parameters
    ----------
    shard_generator: Iterable
        An iterable (either a list or generator) that provides tuples of data
        (X, y, w, ids). Each tuple will be written to a separate shard on disk.
    data_dir: str
        Filename for data directory. Creates a temp directory if none specified.
    tasks: list
        List of tasks for this dataset.
    """
    if data_dir is None:
        data_dir = tempfile.mkdtemp()
    elif not os.path.exists(data_dir):
        os.makedirs(data_dir)

    metadata_rows = []
    time1 = time.time()
    for shard_num, (X, y, w, ids) in enumerate(shard_generator):
        basename = "shard-%d" % shard_num
        metadata_rows.append(
            DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y, w, ids))

    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    save_metadata(tasks, metadata_df, data_dir)
    time2 = time.time()
    log("TIMING: dataset construction took %0.3f s" % (time2 - time1), verbose)
    return DiskDataset(data_dir, verbose=verbose)
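# Hedged usage sketch for create_dataset (not part of the original module).
# It builds a tiny two-shard dataset from an in-memory generator; the shapes,
# task name, and helper name _example_create_dataset are illustrative only,
# and DiskDataset.create_dataset is assumed to be the function defined above.
def _example_create_dataset():
    import numpy as np

    def shard_generator():
        for shard_num in range(2):  # two shards, each 10 samples x 5 features
            X = np.random.rand(10, 5)
            y = np.random.rand(10, 1)
            w = np.ones((10, 1))
            ids = np.arange(10) + 10 * shard_num  # unique ids across shards
            yield (X, y, w, ids)

    # Omitting data_dir writes the shards to a fresh temp directory.
    return DiskDataset.create_dataset(shard_generator(), tasks=["task0"])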
def featurize_complexes(self, mol_pdbs, protein_pdbs, verbose=True, log_every_n=1000):
    """Calculate features for mol/protein complexes.

    Parameters
    ----------
    mol_pdbs: list
        List of PDBs for molecules. Each PDB should be a list of lines of the
        PDB file.
    protein_pdbs: list
        List of PDBs for proteins. Each PDB should be a list of lines of the
        PDB file.
    """
    features = []
    for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs, protein_pdbs)):
        if verbose and i % log_every_n == 0:
            log("Featurizing %d / %d" % (i, len(mol_pdbs)))
        features.append(self._featurize_complex(mol_pdb, protein_pdb))
    features = np.asarray(features)
    return features
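# Hedged usage sketch for featurize_complexes (illustrative only). The helper
# name and file paths are placeholders; "featurizer" must be an instance of a
# concrete subclass that implements _featurize_complex.
def _example_featurize_complexes(featurizer, mol_pdb_paths, protein_pdb_paths):
    # featurize_complexes expects each PDB as a list of file lines.
    mol_pdbs = [open(path).readlines() for path in mol_pdb_paths]
    protein_pdbs = [open(path).readlines() for path in protein_pdb_paths]
    features = featurizer.featurize_complexes(mol_pdbs, protein_pdbs)
    # One feature entry per mol/protein pair.
    assert len(features) == len(mol_pdbs)
    return features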
def __init__(self, data_dir, verbose=True):
    """Turns featurized dataframes into numpy files, writes them & metadata to disk."""
    self.data_dir = data_dir
    self.verbose = verbose

    log("Loading dataset from disk.", self.verbose)
    self.tasks, self.metadata_df = self.load_metadata()
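# Hedged usage sketch: re-open a dataset directory previously written by
# create_dataset (the path below is a placeholder). The constructor only reads
# the saved tasks and metadata; shard contents are read later, e.g. via
# get_shard.
def _example_reload_dataset(data_dir="/tmp/example_dataset"):
    return DiskDataset(data_dir, verbose=False)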
def sparse_shuffle(self):
    """Shuffling that exploits data sparsity to shuffle large datasets.

    Only for 1-dimensional feature vectors (does not work for tensorial
    featurizations).
    """
    time1 = time.time()
    shard_size = self.get_shard_size()
    num_shards = self.get_number_shards()
    X_sparses, ys, ws, ids = [], [], [], []
    num_features = None
    for i in range(num_shards):
        (X_s, y_s, w_s, ids_s) = self.get_shard(i)
        if num_features is None:
            num_features = X_s.shape[1]
        X_sparse = sparsify_features(X_s)
        X_sparses, ys, ws, ids = (X_sparses + [X_sparse], ys + [y_s],
                                  ws + [w_s],
                                  ids + [np.atleast_1d(np.squeeze(ids_s))])
    # Get full dataset in memory
    (X_sparse, y, w, ids) = (np.vstack(X_sparses), np.vstack(ys),
                             np.vstack(ws), np.concatenate(ids))
    # Shuffle in memory
    num_samples = len(X_sparse)
    permutation = np.random.permutation(num_samples)
    X_sparse, y, w, ids = (X_sparse[permutation], y[permutation],
                           w[permutation], ids[permutation])
    # Write shuffled shards out to disk
    for i in range(num_shards):
        start, stop = i * shard_size, (i + 1) * shard_size
        (X_sparse_s, y_s, w_s, ids_s) = (X_sparse[start:stop], y[start:stop],
                                         w[start:stop], ids[start:stop])
        X_s = densify_features(X_sparse_s, num_features)
        self.set_shard(i, X_s, y_s, w_s, ids_s)
    time2 = time.time()
    log("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1), self.verbose)
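# Hedged sketch of the shuffle-and-reshard idea used above, with dense arrays
# standing in for sparsify_features/densify_features (whose representation is
# not shown here). The helper name is illustrative. The key point: stack all
# shards, apply a single permutation to X, y, w, and ids together, then slice
# back into shards of the original size.
def _example_shuffle_and_reshard(shards, shard_size):
    import numpy as np
    X = np.vstack([s[0] for s in shards])
    y = np.vstack([s[1] for s in shards])
    w = np.vstack([s[2] for s in shards])
    ids = np.concatenate([np.atleast_1d(np.squeeze(s[3])) for s in shards])
    permutation = np.random.permutation(len(X))
    X, y, w, ids = X[permutation], y[permutation], w[permutation], ids[permutation]
    return [(X[start:start + shard_size], y[start:start + shard_size],
             w[start:start + shard_size], ids[start:start + shard_size])
            for start in range(0, len(X), shard_size)]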
def compute_metric(self,
                   y_true,
                   y_pred,
                   w=None,
                   n_classes=2,
                   filter_nans=False,
                   per_task_metrics=False,
                   plot=False,
                   all_metrics=None,
                   is_training_set=False,
                   no_concordance_index=True,
                   tasks=None,
                   model_name=None,
                   verbose=False):
    """Compute a performance metric for each task.

    Parameters
    ----------
    y_true: np.ndarray
        An np.ndarray containing true values for each task.
    y_pred: np.ndarray
        An np.ndarray containing predicted values for each task.
    w: np.ndarray, optional
        An np.ndarray containing weights for each datapoint.
    n_classes: int, optional
        Number of classes in data for classification tasks.
    filter_nans: bool, optional
        Remove NaN values in computed metrics.
    per_task_metrics: bool, optional
        If true, return the computed metric for each task on a multitask
        dataset.

    Returns
    -------
    A numpy nd.array containing metric values for each task.
    """
    if len(y_true.shape) > 1:
        n_samples, n_tasks = y_true.shape[0], y_true.shape[1]
    else:
        n_samples, n_tasks = y_true.shape[0], 1
    if tasks is not None:
        assert len(tasks) == n_tasks
    if self.mode == "classification":
        y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
    else:
        y_pred = np.reshape(y_pred, (n_samples, n_tasks))
    y_true = np.reshape(y_true, (n_samples, n_tasks))

    if w is None or len(w) == 0:
        w = np.ones_like(y_true)
    assert y_true.shape[0] == y_pred.shape[0] == w.shape[0]

    computed_metrics = []
    excluded_single_tasks_dict = {}
    task_name_to_metric_value = {}
    time_start = time.time()

    # WARNING: this block works as-is, but it mirrors a similar block in
    # driver.py; changing one without the other can cause inconsistencies.
    excluded_metatasks_dict = {}
    metatask_to_task = {}
    taskind_to_metatask = {}
    meta_task_list = []
    aggregated_task_names = copy.deepcopy(tasks)
    if len(self.aggregate_list) > 0:
        assert tasks is not None
        for meta_task_name in self.aggregate_list:
            for i, task_name in enumerate(tasks):
                if not re.search(meta_task_name, task_name, re.I):
                    continue
                # Only construct the corresponding entries when there truly
                # are such hits.
                if meta_task_name not in metatask_to_task:
                    metatask_to_task[meta_task_name] = []
                    aggregated_task_names.append(meta_task_name)
                if i not in taskind_to_metatask:
                    taskind_to_metatask[i] = meta_task_name
                if meta_task_name not in meta_task_list:
                    meta_task_list.append(meta_task_name)
                aggregated_task_names.remove(task_name)
                pair = (i, task_name)
                metatask_to_task[meta_task_name].append(pair)

    n_aggregated_tasks = len(aggregated_task_names)
    do_aggregation = len(taskind_to_metatask) > 0

    # Initialize these before the arithmetic_mean block so the calls below do
    # not raise NameError when self.arithmetic_mean is False.
    num_observations = []
    aggregated_num_obs = {}
    if self.arithmetic_mean:
        # assert self.weighted_metric_of_each_endpoint
        total_datapoints = 0
        aggregated_num_obs = {
            meta_task_name: 0 for meta_task_name in metatask_to_task
        }
        for task in range(n_tasks):
            w_task = w[:, task]
            this_datapoints = sum(w_task != 0)
            num_observations.append(this_datapoints)
            total_datapoints += this_datapoints
            if not do_aggregation:
                continue
            if task in taskind_to_metatask:
                meta_task_name = taskind_to_metatask[task]
                aggregated_num_obs[meta_task_name] += this_datapoints

    # Compute the metric values for the individual (non-aggregated) tasks.
    self.get_metric_values_for_tasks(
        excluded_single_tasks_dict,
        computed_metrics,
        tasks,
        y_true,
        y_pred,
        w,
        plot,
        taskind_to_metatask=taskind_to_metatask,
        num_observations=num_observations,
        all_metrics=all_metrics,
        is_training_set=is_training_set,
        no_concordance_index=no_concordance_index,
        model_name=model_name,
        do_aggregation=do_aggregation,
        task_name_to_metric_value=task_name_to_metric_value)

    if do_aggregation:
        self.plot_and_get_metric_values_for_metatasks(
            meta_task_list,
            metatask_to_task,
            computed_metrics,
            y_true,
            y_pred,
            w,
            plot=plot,
            all_metrics=all_metrics,
            is_training_set=is_training_set,
            no_concordance_index=no_concordance_index,
            model_name=model_name,
            excluded_metatasks_dict=excluded_metatasks_dict,
            aggregated_num_obs=aggregated_num_obs,
            task_name_to_metric_value=task_name_to_metric_value,
            num_observations=num_observations)
        # Check that computed_metrics holds its elements in the order of
        # aggregated_task_names.
        for i, task_name in enumerate(aggregated_task_names):
            metric_val = task_name_to_metric_value[task_name]
            np.testing.assert_almost_equal(metric_val, computed_metrics[i],
                                           decimal=5)

    weighted_metrics = []
    # This block populates weighted_metrics.
    if self.arithmetic_mean:
        # TODO: extract this block into a function, e.g.
        # def fill_weighted_metrics_list(self, weighted_metrics,
        #                                total_datapoints, num_observations):
        excluded_datapoints = 0
        for task_ind in excluded_single_tasks_dict:
            excluded_datapoints += excluded_single_tasks_dict[task_ind]
        for meta_task_name in excluded_metatasks_dict:
            excluded_datapoints += excluded_metatasks_dict[meta_task_name]
        total_datapoints -= excluded_datapoints

        sum_coefficient = 0
        included_n_tasks = sum(np.invert(np.isnan(computed_metrics)))
        for task in range(n_tasks):
            # Only process the tasks that are not aggregated.
            if task in taskind_to_metatask:
                continue
            if task in excluded_single_tasks_dict:
                weighted_metrics.append(np.nan)
                continue
            task_coefficient = included_n_tasks * num_observations[
                task] / total_datapoints
            sum_coefficient += task_coefficient
            weighted_metrics.append(task_coefficient * computed_metrics[task])
        # Now deal with the aggregated tasks.
        for meta_task_name in meta_task_list:
            if meta_task_name in excluded_metatasks_dict:
                weighted_metrics.append(np.nan)
                continue
            num_obs = aggregated_num_obs[meta_task_name]
            task_coefficient = included_n_tasks * num_obs / total_datapoints
            sum_coefficient += task_coefficient
            weighted_metrics.append(
                task_coefficient * task_name_to_metric_value[meta_task_name])
        np.testing.assert_almost_equal(sum_coefficient, included_n_tasks,
                                       decimal=6)

    time_end = time.time()
    if not self.arithmetic_mean:
        weighted_metrics = copy.deepcopy(computed_metrics)
        assert not do_aggregation
        # if do_aggregation:
        #     warnings.warn("Aggregating in metric %s yet not using "
        #                   "arithmetic mean." % (self.metric.__name__))
        #     # TODO: calculate the simple average of the aggregated tasks.

    if verbose:
        log("computed_metrics %s: %s" % (self.metric.__name__,
                                         str(computed_metrics)), self.verbose)
        if not do_aggregation:
            log("corresponding to tasks: %s" % (str(tasks)), self.verbose)
        else:
            log("corresponding to tasks: %s" % (str(aggregated_task_names)),
                self.verbose)
        print("time spent on evaluation: ", time_end - time_start)

    if n_tasks == 1:
        computed_metrics = computed_metrics[0]
    if not self.is_multitask:
        return computed_metrics
    if filter_nans:
        computed_metrics = np.array(computed_metrics)
        computed_metrics = computed_metrics[~np.isnan(computed_metrics)]
    if self.compute_energy_metric:
        # TODO(rbharath, joegomes): What is this magic number?
        force_error = self.task_averager(computed_metrics[1:]) * 4961.47596096
        print("Force error (metric: np.mean(%s)): %f kJ/mol/A" % (self.name,
                                                                  force_error))
        return computed_metrics[0]
    if not per_task_metrics:
        return self.task_averager(weighted_metrics)
    if len(self.aggregate_list) > 0:
        assert do_aggregation
    return self.task_averager(weighted_metrics), computed_metrics
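# Hedged numeric sketch of the weighting used in the arithmetic_mean branch of
# compute_metric above (illustrative only; assumes task_averager is a plain
# mean). Each non-excluded task gets the coefficient
# included_n_tasks * n_obs[task] / total_datapoints, the coefficients sum to
# included_n_tasks, and averaging the weighted metrics reproduces the
# observation-weighted mean of the per-task metrics.
def _example_weighted_metric_average():
    import numpy as np
    n_obs = np.array([100, 50, 50])         # observations per task
    metrics = np.array([0.80, 0.60, 0.70])  # per-task metric values
    total = n_obs.sum()
    included_n_tasks = len(metrics)
    coefficients = included_n_tasks * n_obs / total
    weighted_metrics = coefficients * metrics
    assert np.isclose(coefficients.sum(), included_n_tasks)
    # Both expressions below equal 0.725.
    return np.mean(weighted_metrics), np.average(metrics, weights=n_obs)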