def evaluate(self, dataset, metric, n_pos, n_neg, n_trials=1000,
             exclude_support=True):
    """Evaluate performance on dataset according to metric.

    Evaluates the performance of the trained model by sampling supports
    randomly for each task in dataset. For each sampled support, the
    accuracy of the model with support provided is computed on all data
    for that task. If exclude_support is True (the default), the support
    set is excluded from this accuracy calculation. exclude_support
    should be set to False if the model's memorization capacity is to be
    evaluated.

    Since the accuracy on a task is dependent on the choice of random
    support, the evaluation experiment is repeated n_trials times for
    each task (each task gets n_trials experiments). The computed
    accuracies are averaged across trials.

    TODO(rbharath): Currently does not support any transformers.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to test on.
    metric: dc.metrics.Metric
      Evaluation metric.
    n_pos: int
      Number of positive samples per support.
    n_neg: int
      Number of negative samples per support.
    n_trials: int, optional
      Number of random supports sampled per task.
    exclude_support: bool, optional
      Whether support set should be excluded when computing model accuracy.

    Returns
    -------
    tuple of dict
      (mean_task_scores, std_task_scores), each mapping task index to the
      mean/std of metric scores across trials for that task.
    """
    # Tasks are indexed by position in the dataset's task list.
    test_tasks = range(len(dataset.get_task_names()))
    task_scores = {task: [] for task in test_tasks}
    support_generator = SupportGenerator(dataset, n_pos, n_neg, n_trials)
    for ind, (task, support) in enumerate(support_generator):
        print("Eval sample %d from task %s" % (ind, str(task)))
        # TODO(rbharath): Add test for get_task_dataset_minus_support for
        # multitask case with missing data...
        if exclude_support:
            print("Removing support datapoints for eval.")
            task_dataset = get_task_dataset_minus_support(dataset, support,
                                                          task)
        else:
            print("Keeping support datapoints for eval.")
            task_dataset = get_task_dataset(dataset, task)
        y_pred = self.predict_proba(support, task_dataset)
        task_scores[task].append(
            metric.compute_metric(task_dataset.y, y_pred, task_dataset.w))

    # Join information for all tasks. Build the score array once per task
    # rather than once for the mean and again for the std.
    mean_task_scores = {}
    std_task_scores = {}
    for task in test_tasks:
        scores = np.array(task_scores[task])
        mean_task_scores[task] = np.mean(scores)
        std_task_scores[task] = np.std(scores)
    return mean_task_scores, std_task_scores
def evaluate(self, dataset, metric, n_pos, n_neg, n_trials=1000,
             exclude_support=True):
    """Evaluate performance on dataset according to metric.

    Samples n_trials random supports per task and scores the model's
    predictions (with each support provided) on that task's data. When
    exclude_support is True (default) the sampled support datapoints are
    removed from the evaluation set; pass False to measure memorization
    of the support itself. Scores are aggregated (mean/std) over trials.

    TODO(rbharath): Currently does not support any transformers.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to test on.
    metric: dc.metrics.Metric
      Evaluation metric.
    n_pos: int
      Number of positive samples per support.
    n_neg: int
      Number of negative samples per support.
    n_trials: int, optional
      Number of random supports sampled per task.
    exclude_support: bool, optional
      Whether support set should be excluded when computing model accuracy.
    """
    all_tasks = range(len(dataset.get_task_names()))
    scores_by_task = {t: [] for t in all_tasks}
    generator = SupportGenerator(dataset, n_pos, n_neg, n_trials)
    for sample_idx, (task, support) in enumerate(generator):
        print("Eval sample %d from task %s" % (sample_idx, str(task)))
        # TODO(rbharath): Add test for get_task_dataset_minus_support for
        # multitask case with missing data...
        if not exclude_support:
            print("Keeping support datapoints for eval.")
            eval_dataset = get_task_dataset(dataset, task)
        else:
            print("Removing support datapoints for eval.")
            eval_dataset = get_task_dataset_minus_support(dataset, support,
                                                          task)
        predictions = self.predict_proba(support, eval_dataset)
        trial_score = metric.compute_metric(eval_dataset.y, predictions,
                                            eval_dataset.w)
        scores_by_task[task].append(trial_score)

    # Aggregate trial scores per task.
    mean_task_scores = {
        t: np.mean(np.array(scores_by_task[t])) for t in all_tasks
    }
    std_task_scores = {
        t: np.std(np.array(scores_by_task[t])) for t in all_tasks
    }
    return mean_task_scores, std_task_scores