def _compute_metrics(cls, model, ts, examples, eval_size=1000, batch_size=256):
    examples_ = sample_if_large(examples, max_size=eval_size)

    # compute loss as a weighted average over batches
    losses, weights = [], []
    for batch in chunks(examples_, batch_size):
        batch_loss = model.loss(batch, ts)
        losses.append(batch_loss.data[0])
        weights.append(len(batch))
    losses, weights = np.array(losses), np.array(weights)
    loss = np.sum(losses * weights) / np.sum(weights)

    # compute perplexity
    entropy = 0.0
    num_words = 0
    for batch in chunks(examples_, batch_size):
        # change base: per-instance losses are -log_e p; convert to -log_2 p
        losses = model.per_instance_losses(batch)
        losses = losses.data.cpu().numpy()
        losses_log_2 = losses / np.log(2.0)
        # count words per example (+1 for the end-of-sequence token)
        lengths = np.array([len(ex) + 1 for ex in batch])
        entropy += np.sum(losses_log_2)
        num_words += np.sum(lengths)
    # perplexity = 2^(total entropy normalized by total word count)
    pp = 2.0 ** (1.0 / num_words * entropy)
    return round(loss, 5), round(pp, 5)
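# The helpers `sample_if_large` and `chunks` used throughout these functions
# come from the surrounding codebase; a minimal sketch of plausible
# implementations under that assumption (not the library's actual code):
import random

def sample_if_large(examples, max_size, replace=True):
    """Return all examples if there are at most max_size; otherwise a random sample."""
    if len(examples) <= max_size:
        return list(examples)
    if replace:
        return [random.choice(examples) for _ in range(max_size)]
    return random.sample(examples, max_size)

def chunks(items, chunk_size):
    """Yield successive chunk_size-sized slices of items."""
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]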
def _compute_metrics(cls, editor, examples, num_evaluate_examples, noiser,
                     batch_size=256, edit_dropout=False, draw_samples=False):
    with random_seed(0):
        sample = sample_if_large(examples, num_evaluate_examples, replace=False)
    if edit_dropout:
        noised_sample = noiser(sample)
    else:
        noised_sample = sample

    # compute loss (the caller logs it to TensorBoard)
    # need to break the sample into batches, in case the sample is too large to fit in GPU memory
    losses, weights = [], []
    for batch in chunks(noised_sample, batch_size):
        weights.append(len(batch))
        loss_var, _, _ = editor.loss(batch, draw_samples)
        losses.append(loss_var.data[0])
    losses, weights = np.array(losses), np.array(weights)
    loss = np.sum(losses * weights) / np.sum(weights)  # weighted average

    # compute BLEU score (the caller logs it to TensorBoard)
    outputs, edit_traces = editor.edit(noised_sample)
    bleus = []
    for ex, output in izip(noised_sample, outputs):
        # outputs is a list (over examples) of beams; each beam is a list of
        # hypotheses, and each hypothesis is a list of unicode tokens
        bleus.append(bleu(ex.target_words, output[0]))
    avg_bleu = np.mean(bleus)
    return loss, avg_bleu, edit_traces
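# `bleu` above is the codebase's own sentence-level BLEU helper. A roughly
# equivalent stand-in could be built on NLTK; this is a hypothetical substitute
# for illustration, not the codebase's implementation:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def sentence_bleu_score(reference_tokens, hypothesis_tokens):
    """Smoothed sentence-level BLEU in [0, 1] for a single reference."""
    smoother = SmoothingFunction().method1
    return sentence_bleu([reference_tokens], hypothesis_tokens,
                         smoothing_function=smoother)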
def _evaluate(self, data_splits, big_eval):
    """Evaluate on fixed-seed samples of the train and valid splits.

    Args:
        data_splits (RetrieverDataSplits)
        big_eval (bool)
    """
    config = self.config.eval
    num_samples = config.big_num_examples if big_eval else config.num_examples
    format_name = lambda name: '{}_{}'.format('big' if big_eval else 'small', name)
    with random_seed(0):
        train_sample = sample_if_large(data_splits.train, num_samples)
        self._evaluate_split(train_sample, format_name('train'))
        valid_sample = sample_if_large(data_splits.valid, num_samples)
        self._evaluate_split(valid_sample, format_name('valid'))
def evaluate(self, step):
    print 'Evaluate at step {}'.format(step)
    num_examples = self.config.num_evaluate_examples
    # fix the seed so the same examples are sampled at every evaluation step
    with random_seed(0):
        train_sample = sample_if_large(self.train_examples, num_examples, replace=False)
    with random_seed(0):
        valid_sample = sample_if_large(self.valid_examples, num_examples, replace=False)

    train_eval = self.evaluate_on_examples(step, train_sample, self.train_visualizer)
    valid_eval = self.evaluate_on_examples(step, valid_sample, self.valid_visualizer)

    # log to CodaLab (JSON) and TensorBoard
    train_eval.json_summarize(self.workspace.codalab, step)
    train_eval.tboard_summarize(self.tb_logger, step)
    valid_eval.json_summarize(self.workspace.codalab, step)
    valid_eval.tboard_summarize(self.tb_logger, step)
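# `random_seed` is assumed to be a context manager that temporarily fixes the
# RNG state, so every evaluation draws the same sample and metrics are
# comparable across steps; a minimal sketch under that assumption:
import contextlib
import random

@contextlib.contextmanager
def random_seed(seed):
    """Seed `random` inside the block, then restore the previous RNG state."""
    state = random.getstate()
    random.seed(seed)
    try:
        yield
    finally:
        random.setstate(state)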
def evaluate_helper(examples, prefix):
    with random_seed(0):
        sample = sample_if_large(examples, num_samples, replace=False)
    eval = self.evaluate_on_examples(step=step, examples=sample, visualizer=silent_visualizer)

    # wrap with BernoulliSequenceStat, for confidence intervals
    for name, stat in eval.stats.items():
        if name.startswith('denoAcc'):
            eval.stats[name] = BernoulliSequenceStat(stat)

    with open(full_eval_path, 'a') as f:
        eval.summarize(f, prefix=prefix)
    eval.tboard_summarize(self.tb_logger, step, prefix=prefix)
    eval.json_summarize(self.workspace.codalab, step, prefix=prefix)
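# BernoulliSequenceStat is a codebase class; the underlying idea is that an
# accuracy stat is a sequence of 0/1 outcomes, which admits a simple
# normal-approximation confidence interval. A sketch of just that math:
import math

def bernoulli_confidence_interval(outcomes, z=1.96):
    """95% normal-approximation CI for the mean of a 0/1 outcome sequence."""
    n = len(outcomes)
    p = float(sum(outcomes)) / n
    half_width = z * math.sqrt(p * (1.0 - p) / n)
    return p - half_width, p + half_width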
def _compute_metrics(cls, editor, examples, num_evaluate_examples, batch_size):
    """
    Args:
        editor (Editor)
        examples (list[EditExample])
        num_evaluate_examples (int)
        batch_size (int)

    Returns:
        stats (dict[str, float])
        edit_traces (list[EditTrace]): of length num_evaluate_examples
        loss_traces (list[LossTrace]): of length num_evaluate_examples
    """
    sample = sample_if_large(examples, num_evaluate_examples, replace=False)

    # compute loss
    # need to break the sample into batches, in case the sample is too large to fit in GPU memory
    losses, loss_traces, weights, enc_losses = [], [], [], []
    for batch in verboserate(chunks(sample, batch_size), desc='Computing loss on examples'):
        weights.append(len(batch))
        loss_var, loss_trace_batch, enc_loss = editor.loss(batch)

        # convert loss Variable into float
        loss_val = loss_var.data[0]
        assert isinstance(loss_val, float)

        losses.append(loss_val)
        enc_losses.append(enc_loss)
        loss_traces.extend(loss_trace_batch)

    losses, weights = np.array(losses), np.array(weights)
    loss = np.sum(losses * weights) / np.sum(weights)  # weighted average
    enc_loss = np.sum(np.array(enc_losses) * weights) / np.sum(weights)

    # table mapping every Unicode punctuation codepoint to None
    punct_table = dict.fromkeys(
        i for i in xrange(sys.maxunicode)
        if unicodedata.category(unichr(i)).startswith('P'))

    def remove_punct(s):
        """Drop tokens that are empty after stripping punctuation."""
        new_s = []
        for t in s:
            t = unicode(t).translate(punct_table)
            if t != '':
                new_s.append(t)
        return new_s

    # each metric is paired with the reducer used to pick the best score in the beam
    metrics = {
        'bleu': (bleu, max),
        'edit_dist': (lambda s, t: edit_dist(s, t)[0] / len(s) if len(s) > 0 else len(t), min),
        'exact_match': (lambda s, t: 1.0 if remove_punct(s) == remove_punct(t) else 0.0, max),
    }
    top_results = defaultdict(list)
    top5_results = defaultdict(list)

    # compute predictions
    beams, edit_traces = editor.edit(sample, batch_size=batch_size, max_seq_length=150, verbose=True)
    for ex, beam in izip(sample, beams):
        top = beam[0]
        top5 = beam[:5]
        target = ex.target_words
        for name, (fxn, best) in metrics.items():
            top_results[name].append(fxn(top, target))
            top5_results[name].append(best(fxn(predict, target) for predict in top5))

    # compute averages
    stats_top = {name: np.mean(vals) for name, vals in top_results.items()}
    stats_top5 = {'{}_top5'.format(name): np.mean(vals) for name, vals in top5_results.items()}

    # combine into a single stats object
    stats = {'loss': loss, 'enc_loss': enc_loss}
    stats.update(stats_top)
    stats.update(stats_top5)

    return stats, edit_traces, loss_traces
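# Toy illustration of the (metric, reducer) pattern above: for each metric, the
# reducer picks the best score achievable anywhere in the top-5 beam. The beam
# and target below are made up for demonstration.
beam = [['the', 'cat', 'sat'], ['a', 'cat', 'sat'], ['the', 'cat']]
target = ['the', 'cat', 'sat']

exact = lambda s, t: 1.0 if s == t else 0.0
top1_score = exact(beam[0], target)                       # score of the single best hypothesis
top5_score = max(exact(hyp, target) for hyp in beam[:5])  # best score anywhere in the beam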
def case_sample(examples):
    """Get a random sample of supervised ParseCases."""
    with random_seed(0):
        example_sample = sample_if_large(examples, 30)
    return list(examples_to_supervised_cases(example_sample))