Example #1
    def _compute_metrics(cls,
                         model,
                         ts,
                         examples,
                         eval_size=1000,
                         batch_size=256):

        examples_ = sample_if_large(examples, max_size=eval_size)

        losses, weights = [], []
        for batch in chunks(examples_, batch_size):
            # compute loss
            batch_loss = model.loss(batch, ts)
            losses.append(batch_loss.data[0])
            weights.append(len(batch))
        losses, weights = np.array(losses), np.array(weights)
        loss = np.sum(losses * weights) / np.sum(weights)

        # compute perplexity
        entropy = 0.0
        num_words = 0
        for batch in chunks(examples_, batch_size):
            # change base
            losses = model.per_instance_losses(batch)  # per-example negative log-likelihoods, in nats
            losses = losses.data.cpu().numpy()
            losses_log_2 = losses / np.log(2.0)

            # accumulate total entropy (bits) and word count; +1 counts the end-of-sentence token
            lengths = np.array([len(ex) + 1 for ex in batch])
            entropy += np.sum(losses_log_2)
            num_words += np.sum(lengths)

        # perplexity = 2 raised to the average per-word entropy in bits
        pp = 2.0 ** (entropy / num_words)

        return round(loss, 5), round(pp, 5)
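
The helpers shared by these examples, sample_if_large and chunks, are not shown on this page. A minimal sketch consistent with the call sites above (the names and signatures come from the calls; the bodies are inferred, not taken from the source):

    import random

    def sample_if_large(examples, max_size, replace=True):
        """Return examples as-is if within max_size, else a random sample of size max_size."""
        if len(examples) <= max_size:
            return list(examples)
        if replace:
            return [random.choice(examples) for _ in range(max_size)]
        return random.sample(examples, max_size)

    def chunks(items, chunk_size):
        """Yield successive chunk_size-sized slices of items."""
        for i in range(0, len(items), chunk_size):
            yield items[i:i + chunk_size]
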
Example #2
    def _compute_metrics(cls, editor, examples, num_evaluate_examples, noiser, batch_size=256, edit_dropout=False, draw_samples=False):
        with random_seed(0):
            sample = sample_if_large(examples, num_evaluate_examples, replace=False)
        if edit_dropout:
            noised_sample = noiser(sample)
        else:
            noised_sample = sample

        # compute loss and log to TensorBoard
        # need to break the sample into batches, in case the sample is too large to fit in GPU memory
        losses, weights = [], []
        for batch in chunks(noised_sample, batch_size):
            weights.append(len(batch))
            loss_var, _, _ = editor.loss(batch, draw_samples)
            losses.append(loss_var.data[0])
        losses, weights = np.array(losses), np.array(weights)
        loss = np.sum(losses * weights) / np.sum(weights)  # weighted average

        # compute BLEU score and log to TensorBoard
        outputs, edit_traces = editor.edit(noised_sample)
        bleus = []
        for ex, output in izip(noised_sample, outputs):
            # outputs has one entry per example: a list (over beams) of token lists (unicode)
            bleus.append(bleu(ex.target_words, output[0]))
        avg_bleu = np.mean(bleus)
        return loss, avg_bleu, edit_traces
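
Several of these examples draw their sample inside a random_seed(0) context manager so that evaluation always runs on the same subset. Its implementation is not shown here; a plausible sketch, assuming it seeds Python's and NumPy's generators and restores their prior state on exit:

    import contextlib
    import random
    import numpy as np

    @contextlib.contextmanager
    def random_seed(seed):
        """Temporarily seed the random and np.random generators."""
        py_state, np_state = random.getstate(), np.random.get_state()
        random.seed(seed)
        np.random.seed(seed)
        try:
            yield
        finally:
            random.setstate(py_state)
            np.random.set_state(np_state)
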
Example #3
    def _evaluate(self, data_splits, big_eval):
        """Evaluate.
        
        Args:
            data_splits (RetrieverDataSplits)
            big_eval (bool)
        """
        config = self.config.eval
        num_samples = config.big_num_examples if big_eval else config.num_examples

        format_name = lambda name: '{}_{}'.format('big' if big_eval else 'small', name)

        with random_seed(0):
            train_sample = sample_if_large(data_splits.train, num_samples)
            self._evaluate_split(train_sample, format_name('train'))

            valid_sample = sample_if_large(data_splits.valid, num_samples)
            self._evaluate_split(valid_sample, format_name('valid'))
Example #4
    def evaluate(self, step):
        print 'Evaluate at step {}'.format(step)
        num_examples = self.config.num_evaluate_examples
        with random_seed(0):
            train_sample = sample_if_large(self.train_examples,
                                           num_examples,
                                           replace=False)
        with random_seed(0):
            valid_sample = sample_if_large(self.valid_examples,
                                           num_examples,
                                           replace=False)
        train_eval = self.evaluate_on_examples(step, train_sample,
                                               self.train_visualizer)
        valid_eval = self.evaluate_on_examples(step, valid_sample,
                                               self.valid_visualizer)

        # log metrics to CodaLab (JSON) and TensorBoard
        train_eval.json_summarize(self.workspace.codalab, step)
        train_eval.tboard_summarize(self.tb_logger, step)
        valid_eval.json_summarize(self.workspace.codalab, step)
        valid_eval.tboard_summarize(self.tb_logger, step)
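
Here each sample is drawn inside its own random_seed(0) block, so the train and valid draws are reproducible independently of how much randomness the other consumes. A quick illustration, using the hypothetical sketches above:

    examples = list(range(10000))
    with random_seed(0):
        a = sample_if_large(examples, 5, replace=False)
    with random_seed(0):
        b = sample_if_large(examples, 5, replace=False)
    assert a == b  # the seed is reset before each draw
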
Example #5
        def evaluate_helper(examples, prefix):
            with random_seed(0):
                sample = sample_if_large(examples, num_samples, replace=False)
            evaluation = self.evaluate_on_examples(
                step=step, examples=sample, visualizer=silent_visualizer)

            # wrap with BernoulliSequenceStat, for confidence intervals
            for name, stat in evaluation.stats.items():
                if name.startswith('denoAcc'):
                    evaluation.stats[name] = BernoulliSequenceStat(stat)

            with open(full_eval_path, 'a') as f:
                evaluation.summarize(f, prefix=prefix)

            evaluation.tboard_summarize(self.tb_logger, step, prefix=prefix)
            evaluation.json_summarize(self.workspace.codalab, step, prefix=prefix)
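
BernoulliSequenceStat is not defined in these snippets; per the comment, it evidently treats a sequence of 0/1 accuracy outcomes as Bernoulli trials so that a confidence interval can be reported alongside the mean. A hypothetical sketch of that idea:

    import numpy as np

    class BernoulliSequenceStat(object):
        """Hypothetical: wrap 0/1 outcomes, report a normal-approximation confidence interval."""

        def __init__(self, outcomes):
            self._outcomes = np.asarray(outcomes, dtype=float)

        @property
        def mean(self):
            return float(self._outcomes.mean())

        def confidence_interval(self, z=1.96):
            p, n = self.mean, len(self._outcomes)
            half_width = z * np.sqrt(p * (1.0 - p) / n)
            return p - half_width, p + half_width
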
Example #6
    def _compute_metrics(cls, editor, examples, num_evaluate_examples,
                         batch_size):
        """

        Args:
            editor (Editor)
            examples (list[EditExample])
            num_evaluate_examples (int)
            batch_size (int)

        Returns:
            stats (dict[str, float])
            edit_traces (list[EditTrace]): of length num_evaluate_examples
            loss_traces (list[LossTrace]): of length num_evaluate_examples

        """
        sample = sample_if_large(examples,
                                 num_evaluate_examples,
                                 replace=False)

        # compute loss
        # need to break the sample into batches, in case the sample is too large to fit in GPU memory
        losses, loss_traces, weights, enc_losses = [], [], [], []

        for batch in verboserate(chunks(sample, batch_size),
                                 desc='Computing loss on examples'):
            weights.append(len(batch))
            loss_var, loss_trace_batch, enc_loss = editor.loss(batch)

            # convert loss Variable into float
            loss_val = loss_var.data[0]
            assert isinstance(loss_val, float)
            losses.append(loss_val)
            enc_losses.append(enc_loss)

            loss_traces.extend(loss_trace_batch)

        losses, weights = np.array(losses), np.array(weights)
        loss = np.sum(losses * weights) / np.sum(weights)  # weighted average
        enc_loss = np.sum(np.array(enc_losses) * weights) / np.sum(weights)

        punct_table = dict.fromkeys(
            i for i in xrange(sys.maxunicode)
            if unicodedata.category(unichr(i)).startswith('P'))

        def remove_punct(s):
            new_s = []
            for t in s:
                t = unicode(t).translate(punct_table)
                if t != '':
                    new_s.append(t)
            return new_s

        metrics = {
            'bleu': (bleu, max),
            'edit_dist': (lambda s, t: edit_dist(s, t)[0] / len(s)
                          if len(s) > 0 else len(t), min),
            'exact_match':
            (lambda s, t: 1.0
             if remove_punct(s) == remove_punct(t) else 0.0, max)
        }

        top_results = defaultdict(list)
        top5_results = defaultdict(list)

        # compute predictions
        beams, edit_traces = editor.edit(sample,
                                         batch_size=batch_size,
                                         max_seq_length=150,
                                         verbose=True)
        for ex, beam in izip(sample, beams):
            top = beam[0]
            top5 = beam[:5]
            target = ex.target_words
            for name, (fxn, best) in metrics.items():
                top_results[name].append(fxn(top, target))
                top5_results[name].append(
                    best(fxn(predict, target) for predict in top5))

        # compute averages
        stats_top = {name: np.mean(vals) for name, vals in top_results.items()}
        stats_top5 = {
            '{}_top5'.format(name): np.mean(vals)
            for name, vals in top5_results.items()
        }

        # combine into a single stats object
        stats = {'loss': loss, 'enc_loss': enc_loss}
        stats.update(stats_top)
        stats.update(stats_top5)

        return stats, edit_traces, loss_traces
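
This example is Python 2 (izip, xrange, unichr, unicode), and the enclosing module would need sys, unicodedata, numpy, itertools.izip, and collections.defaultdict imported at the top; verboserate is presumably a tqdm-style progress wrapper around an iterable. For reference, a Python 3 equivalent of the punctuation-stripping helper:

    import sys
    import unicodedata

    # maps every Unicode punctuation codepoint to None, which str.translate deletes
    PUNCT_TABLE = dict.fromkeys(
        i for i in range(sys.maxunicode)
        if unicodedata.category(chr(i)).startswith('P'))

    def remove_punct(tokens):
        """Strip punctuation characters from each token, dropping emptied tokens."""
        stripped = (t.translate(PUNCT_TABLE) for t in tokens)
        return [t for t in stripped if t]
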
Example #7
    def case_sample(examples):
        """Get a random sample of supervised ParseCases."""
        with random_seed(0):
            example_sample = sample_if_large(examples, 30)
        return list(examples_to_supervised_cases(example_sample))