def evaluate(self,
             dataset,
             metric,
             n_pos,
             n_neg,
             n_trials=1000,
             exclude_support=True):
  """Evaluate performance on dataset according to metric.

  Evaluates the performance of the trained model by sampling supports randomly
  for each task in dataset. For each sampled support, the accuracy of the
  model with support provided is computed on all data for that task. If
  exclude_support is True (the default), the support set is excluded from this
  accuracy calculation. exclude_support should be set to False if the model's
  memorization capacity is to be evaluated.

  Since the accuracy on a task depends on the choice of random support, the
  evaluation experiment is repeated n_trials times for each task. (Each task
  gets n_trials experiments.) The computed accuracies are averaged across
  trials.

  TODO(rbharath): Currently does not support any transformers.

  Parameters
  ----------
  dataset: dc.data.Dataset
    Dataset to test on.
  metric: dc.metrics.Metric
    Evaluation metric.
  n_pos: int
    Number of positive samples per support.
  n_neg: int
    Number of negative samples per support.
  n_trials: int, optional
    Number of supports sampled (and scored) per task.
  exclude_support: bool, optional
    Whether support set should be excluded when computing model accuracy.
  """
  # Get batches
  test_tasks = range(len(dataset.get_task_names()))
  task_scores = {task: [] for task in test_tasks}
  support_generator = SupportGenerator(dataset, n_pos, n_neg, n_trials)
  for ind, (task, support) in enumerate(support_generator):
    print("Eval sample %d from task %s" % (ind, str(task)))
    # TODO(rbharath): Add test for get_task_dataset_minus_support for
    # multitask case with missing data...
    if exclude_support:
      print("Removing support datapoints for eval.")
      task_dataset = get_task_dataset_minus_support(dataset, support, task)
    else:
      print("Keeping support datapoints for eval.")
      task_dataset = get_task_dataset(dataset, task)
    # Score predictions (conditioned on this support) on the task data.
    y_pred = self.predict_proba(support, task_dataset)
    task_scores[task].append(
        metric.compute_metric(task_dataset.y, y_pred, task_dataset.w))

  # Join information for all tasks.
  mean_task_scores = {}
  std_task_scores = {}
  for task in test_tasks:
    mean_task_scores[task] = np.mean(np.array(task_scores[task]))
    std_task_scores[task] = np.std(np.array(task_scores[task]))
  return mean_task_scores, std_task_scores
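# Usage sketch for evaluate() (illustrative comment only, not part of this
# module): the Tox21 loader and the `model` object below are assumptions;
# substitute whatever model and dataset this class is actually used with.
# Only the call pattern follows the signature above.
#
#   import deepchem as dc
#
#   tasks, (train, valid, test), _ = dc.molnet.load_tox21()
#   metric = dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification")
#   mean_scores, std_scores = model.evaluate(
#       test, metric, n_pos=1, n_neg=9, n_trials=20)
#   for task in mean_scores:
#     print("Task %s: %.3f +/- %.3f" % (task, mean_scores[task],
#                                       std_scores[task]))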
def old_fit(self,
            dataset,
            n_trials=1000,
            n_steps_per_trial=1,
            n_pos=1,
            n_neg=9,
            log_every_n_samples=10,
            replace=True,
            **kwargs):
  """Fits model on dataset.

  Note that fitting for support models is quite different from fitting for
  other deep models. Fitting is a two-level process. We perform n_trials,
  where for each trial we randomly sample a support set for a given task and,
  independently, a test set from that same task. The SupportGenerator class
  iterates over the tasks in random order.

  Parameters
  ----------
  dataset: dc.data.Dataset
    Dataset to fit model on.
  n_trials: int, optional
    Number of (support, test) pairs to sample and train on.
  n_steps_per_trial: int, optional
    Number of gradient descent steps to take per support.
  n_pos: int, optional
    Number of positive examples per support.
  n_neg: int, optional
    Number of negative examples per support.
  log_every_n_samples: int, optional
    Displays info every log_every_n_samples samples.
  replace: bool, optional
    Whether or not to use replacement when sampling supports/tests.
  """
  time_start = time.time()
  # Perform the optimization
  n_tasks = len(dataset.get_task_names())
  feed_total, run_total, test_total = 0, 0, 0
  # Create different support sets
  support_generator = SupportGenerator(dataset, range(n_tasks), n_pos, n_neg,
                                       n_trials)
  recent_losses = []
  for ind, (task, support) in enumerate(support_generator):
    if ind % log_every_n_samples == 0:
      print("Sample %d from task %s" % (ind, str(task)))
    # Get batch to try it out on
    test_start = time.time()
    test = get_single_task_test(dataset, self.test_batch_size, task, replace)
    test_end = time.time()
    test_total += (test_end - test_start)
    feed_start = time.time()
    feed_dict = self.construct_feed_dict(test, support)
    feed_end = time.time()
    feed_total += (feed_end - feed_start)
    for step in range(n_steps_per_trial):
      # Train on support set, batch pair
      run_start = time.time()
      _, loss = self.sess.run(
          [self.train_op, self.loss_op], feed_dict=feed_dict)
      run_end = time.time()
      run_total += (run_end - run_start)
    if ind % log_every_n_samples == 0:
      # Guard against averaging an empty list (e.g. on the very first logged
      # sample), which would otherwise produce a NaN.
      if recent_losses:
        mean_loss = np.mean(np.array(recent_losses))
        print("\tmean loss is %s" % str(mean_loss))
      recent_losses = []
    else:
      recent_losses.append(loss)
  time_end = time.time()
  print("old_fit took %s seconds" % str(time_end - time_start))
  print("test_total: %s" % str(test_total))
  print("feed_total: %s" % str(feed_total))
  print("run_total: %s" % str(run_total))
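# Usage sketch for old_fit() (illustrative comment only, not part of this
# module): trains on randomly sampled (support, test) pairs per task, then
# evaluates with evaluate() above. The constructor shown is a placeholder
# assumption, not this project's confirmed API.
#
#   model = SupportGraphClassifier(...)  # hypothetical constructor/arguments
#   model.old_fit(train, n_trials=2000, n_steps_per_trial=1,
#                 n_pos=1, n_neg=9, log_every_n_samples=50)
#   mean_scores, std_scores = model.evaluate(
#       valid, metric, n_pos=1, n_neg=9, n_trials=20)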