Example #1
    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        predictions = []
        scores = []

        batches = iterators.iter_batches(eval_instances,
                                         self.options.listener_eval_batch_size)
        num_batches = (len(eval_instances) -
                       1) // self.options.listener_eval_batch_size + 1

        if self.options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)

            xs, (y, ) = self._data_to_arrays(batch, test=True)

            probs = self.model.predict(xs)
            if random:
                indices = sample(probs)
                predictions.extend(indices)
            else:
                predictions.extend(probs.argmax(axis=1))
            scores_arr = np.log(probs[np.arange(len(batch)), y])
            scores.extend(scores_arr.tolist())
        progress.end_task()
        if self.options.verbosity >= 9:
            print('%s %ss:' % (self.id, 'sample' if random else 'prediction'))
            for inst, prediction in zip(eval_instances, predictions):
                print('%s -> %s' % (repr(inst.input), repr(prediction)))

        return predictions, scores
Example #2
    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        predictions = []
        scores = []

        batches = iterators.iter_batches(eval_instances, self.options.listener_eval_batch_size)
        num_batches = (len(eval_instances) - 1) // self.options.listener_eval_batch_size + 1

        if self.options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)

            xs, (y,) = self._data_to_arrays(batch, test=True)

            probs = self.model.predict(xs)
            if random:
                indices = sample(probs)
                predictions.extend(indices)
            else:
                predictions.extend(probs.argmax(axis=1))
            scores_arr = np.log(probs[np.arange(len(batch)), y])
            scores.extend(scores_arr.tolist())
        progress.end_task()
        if self.options.verbosity >= 9:
            print('%s %ss:' % (self.id, 'sample' if random else 'prediction'))
            for inst, prediction in zip(eval_instances, predictions):
                print('%s -> %s' % (repr(inst.input), repr(prediction)))

        return predictions, scores
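
The two predict_and_score examples above compute the number of batches by ceiling division before iterating. A minimal sketch of that pattern, assuming iterators.iter_batches simply yields consecutive slices of at most batch_size instances (the real helper may differ):

def iter_batches(instances, batch_size):
    # Assumed behavior: yield consecutive chunks of at most batch_size items.
    for start in range(0, len(instances), batch_size):
        yield instances[start:start + batch_size]

eval_instances = list(range(10))
batch_size = 4
num_batches = (len(eval_instances) - 1) // batch_size + 1  # ceiling division: 3
assert num_batches == len(list(iter_batches(eval_instances, batch_size)))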
Example #3
    def score(self, eval_instances, verbosity=0):
        result = []
        batches = iterators.iter_batches(eval_instances,
                                         self.options.speaker_eval_batch_size)
        num_batches = (len(eval_instances) -
                       1) // self.options.speaker_eval_batch_size + 1

        if self.options.verbosity + verbosity >= 2:
            print('Scoring')
        if self.options.verbosity + verbosity >= 1:
            progress.start_task('Score batch', num_batches)
        for batch_num, batch in enumerate(batches):
            if self.options.verbosity + verbosity >= 1:
                progress.progress(batch_num)
            batch = list(batch)

            xs, (n, ) = self._data_to_arrays(batch, test=False)
            if self.use_color_mask:
                mask = xs[3]
            else:
                mask = xs[2]

            probs = self.model.predict(xs)
            token_probs = probs[np.arange(probs.shape[0])[:, np.newaxis],
                                np.arange(probs.shape[1]), n]
            scores_arr = np.sum(np.log(token_probs) * mask, axis=1)
            scores = scores_arr.tolist()
            result.extend(scores)
        if self.options.verbosity + verbosity >= 1:
            progress.end_task()

        return result
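
The fancy indexing in score picks out the probability of each gold token at each timestep, then sums the masked log probabilities per instance. A toy illustration of that indexing, with the (batch, time, vocab) shapes and the meaning of mask assumed:

import numpy as np

probs = np.full((2, 3, 4), 0.25)          # assumed shape: (batch=2, time=3, vocab=4)
n = np.array([[0, 1, 2], [3, 0, 0]])      # gold token indices per timestep
mask = np.array([[1, 1, 1], [1, 1, 0]])   # 1 = real token, 0 = padding

token_probs = probs[np.arange(probs.shape[0])[:, np.newaxis],
                    np.arange(probs.shape[1]), n]
scores_arr = np.sum(np.log(token_probs) * mask, axis=1)
print(scores_arr)  # per-instance log-likelihood of the gold sequence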
Example #4
    def predict(self, eval_instances, random=False, verbosity=0):
        result = []
        batches = iterators.iter_batches(eval_instances,
                                         self.options.speaker_eval_batch_size)
        num_batches = (len(eval_instances) -
                       1) // self.options.speaker_eval_batch_size + 1

        eos_index = self.seq_vec.vectorize(['</s>'])[0]

        if self.options.verbosity + verbosity >= 2:
            print('Predicting')
        if self.options.verbosity + verbosity >= 1:
            progress.start_task('Predict batch', num_batches)
        for batch_num, batch in enumerate(batches):
            if self.options.verbosity + verbosity >= 1:
                progress.progress(batch_num)
            batch = list(batch)

            (c, _p, mask), (_y, ) = self._data_to_arrays(batch, test=True)
            assert mask.all()  # We shouldn't be masking anything in prediction

            beam_size = 1 if random else self.options.speaker_beam_size
            done = np.zeros((len(batch), beam_size), dtype=bool)
            beam = np.zeros((len(batch), beam_size, self.seq_vec.max_len),
                            dtype=np.int32)
            beam[:, :, 0] = self.seq_vec.vectorize(['<s>'])[0]
            beam_scores = np.log(np.zeros((len(batch), beam_size)))
            beam_scores[:, 0] = 0.0

            c = np.repeat(c, beam_size, axis=0)
            mask = np.repeat(mask, beam_size, axis=0)

            for length in range(1, self.seq_vec.max_len):
                if done.all():
                    break
                p = beam.reshape(
                    (beam.shape[0] * beam.shape[1], beam.shape[2]))[:, :-1]
                probs = self.model.predict([c, p, mask])
                if random:
                    indices = sample(probs[:, length - 1, :])
                    beam[:, 0, length] = indices
                    done = np.logical_or(done, indices == eos_index)
                else:
                    assert probs.shape[1] == p.shape[1], (probs.shape[1],
                                                          p.shape[1])
                    assert probs.shape[2] == len(
                        self.seq_vec.tokens), (probs.shape[2],
                                               len(self.seq_vec.tokens))
                    scores = np.log(probs)[:, length - 1, :].reshape(
                        (beam.shape[0], beam.shape[1], probs.shape[2]))
                    beam_search_step(scores, length, beam, beam_scores, done,
                                     eos_index)
            outputs = self.seq_vec.unvectorize_all(beam[:, 0, :])
            result.extend([' '.join(strip_invalid_tokens(o)) for o in outputs])
        if self.options.verbosity + verbosity >= 1:
            progress.end_task()

        return result
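
Several of these examples call a sample(probs) helper that is not shown. A plausible sketch, assuming it draws one class index per row of a probability matrix (the actual helper in the source project may be implemented differently):

import numpy as np

def sample(probs, rng=np.random):
    # Draw one index per row from a (num_rows, num_classes) probability matrix.
    cumulative = np.cumsum(probs, axis=1)
    cumulative[:, -1] = 1.0  # guard against rows that don't sum to exactly 1
    draws = rng.uniform(size=(probs.shape[0], 1))
    return np.argmax(cumulative >= draws, axis=1)

print(sample(np.array([[0.1, 0.9], [1.0, 0.0]])))  # e.g. [1 0]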
Example #5
    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        from fields import build_instance

        options = self.get_options()
        predictions = []
        scores = []
        base_is_listener = self.override_listener()
        assert options.listener, 'Eval data should be listener data for DirectRefGameLearner'

        true_batch_size = options.listener_eval_batch_size // options.num_distractors
        batches = iterators.iter_batches(eval_instances, true_batch_size)
        num_batches = (len(eval_instances) - 1) // true_batch_size + 1

        if options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)
            assert batch[0].alt_outputs, 'No context given for direct listener testing'
            context = len(batch[0].alt_outputs)
            if self.options.direct_base_uses_context:
                output_grid = [
                    build_instance(inst.input, target, inst.alt_outputs,
                                   base_is_listener) for inst in batch
                    for target in range(len(inst.alt_outputs))
                ]
            else:
                output_grid = [
                    build_instance(inst.input, color, None, base_is_listener)
                    for inst in batch for color in inst.alt_outputs
                ]
            assert len(output_grid) == context * len(batch), \
                'Context must be the same number of colors for all examples'
            true_indices = np.array([inst.output for inst in batch])
            grid_scores = self.base.score(output_grid, verbosity=verbosity)
            log_probs = np.array(grid_scores).reshape((len(batch), context))
            # Renormalize over only the context colors
            log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis]
            # Cap confidences to reasonable values
            if options.direct_min_score is not None and options.direct_min_score <= 0.0:
                log_probs = np.maximum(options.direct_min_score, log_probs)
                # Normalize again (so we always return log probabilities)
                log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis]
            assert log_probs.shape == (len(batch), context)
            pred_indices = np.argmax(log_probs, axis=1)
            predictions.extend(pred_indices.tolist())
            # Extract the score of the true color
            scores.extend(log_probs[np.arange(len(batch)),
                                    true_indices].tolist())
        progress.end_task()

        return predictions, scores
Example #6
    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        from fields import build_instance

        options = self.get_options()
        predictions = []
        scores = []
        base_is_listener = self.override_listener()
        assert options.listener, 'Eval data should be listener data for DirectRefGameLearner'

        true_batch_size = options.listener_eval_batch_size // options.num_distractors
        batches = iterators.iter_batches(eval_instances, true_batch_size)
        num_batches = (len(eval_instances) - 1) // true_batch_size + 1

        if options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)
            assert batch[0].alt_outputs, 'No context given for direct listener testing'
            context = len(batch[0].alt_outputs)
            if self.options.direct_base_uses_context:
                output_grid = [build_instance(inst.input, target, inst.alt_outputs,
                                              base_is_listener)
                               for inst in batch for target in range(len(inst.alt_outputs))]
            else:
                output_grid = [build_instance(inst.input, color, None, base_is_listener)
                               for inst in batch for color in inst.alt_outputs]
            assert len(output_grid) == context * len(batch), \
                'Context must be the same number of colors for all examples'
            true_indices = np.array([inst.output for inst in batch])
            grid_scores = self.base.score(output_grid, verbosity=verbosity)
            log_probs = np.array(grid_scores).reshape((len(batch), context))
            # Renormalize over only the context colors
            log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis]
            # Cap confidences to reasonable values
            if options.direct_min_score is not None and options.direct_min_score <= 0.0:
                log_probs = np.maximum(options.direct_min_score, log_probs)
                # Normalize again (so we always return log probabilities)
                log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis]
            assert log_probs.shape == (len(batch), context)
            pred_indices = np.argmax(log_probs, axis=1)
            predictions.extend(pred_indices.tolist())
            # Extract the score of the true color
            scores.extend(log_probs[np.arange(len(batch)), true_indices].tolist())
        progress.end_task()

        return predictions, scores
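
The central step in the two DirectRefGameLearner examples is renormalizing the base scores over just the context colors so that each row becomes a proper log distribution. A standalone illustration of that step (logsumexp is assumed to come from scipy; the shapes are toy):

import numpy as np
from scipy.special import logsumexp

log_probs = np.log(np.array([[0.2, 0.3, 0.5],   # 2 instances, 3 context colors
                             [0.1, 0.1, 0.8]]))
log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis]
print(np.exp(log_probs).sum(axis=1))  # each row now sums to 1
print(np.argmax(log_probs, axis=1))   # predicted color index per instance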
Example #7
def output_csv():
    options = config.options(read=True)

    output = html_report.get_output(options.run_dir, options.split)
    insts = get_trial_data(output, options.test_size, options.run_dir)

    print(','.join('ex%d%s' % (ex, part)
                   for ex in range(BATCH_SIZE)
                   for part in ['cid', 'system', 'desc', 'target', 'c1', 'c2', 'c3']))

    for i, batch in enumerate(iterators.iter_batches(insts, BATCH_SIZE)):
        batch = list(batch)
        if len(batch) != BATCH_SIZE:
            continue
        print(','.join('"%d:%d","%s","%s","%s","%s","%s","%s"' %
                       ((i, j, inst.source, inst.input, inst.output) +
                        tuple(html_report.web_color(c) for c in inst.alt_outputs[:3]))
                       for j, inst in enumerate(batch)))
Example #8
def output_csv():
    options = config.options(read=True)

    output = html_report.get_output(options.run_dir, options.split)
    insts = get_trial_data(output, options.test_size, options.run_dir)

    print(','.join(
        'ex%d%s' % (ex, part) for ex in range(BATCH_SIZE)
        for part in ['cid', 'system', 'desc', 'target', 'c1', 'c2', 'c3']))

    for i, batch in enumerate(iterators.iter_batches(insts, BATCH_SIZE)):
        batch = list(batch)
        if len(batch) != BATCH_SIZE:
            continue
        print(','.join(
            '"%d:%d","%s","%s","%s","%s","%s","%s"' %
            ((i, j, inst.source, inst.input, inst.output) +
             tuple(html_report.web_color(c) for c in inst.alt_outputs[:3]))
            for j, inst in enumerate(batch)))
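
The header written by output_csv interleaves the seven per-example fields for each example in a batch. A quick check of that layout, with BATCH_SIZE assumed to be 2 for illustration (the real constant is defined elsewhere in the module):

BATCH_SIZE = 2
parts = ['cid', 'system', 'desc', 'target', 'c1', 'c2', 'c3']
print(','.join('ex%d%s' % (ex, part)
               for ex in range(BATCH_SIZE) for part in parts))
# ex0cid,ex0system,ex0desc,ex0target,ex0c1,ex0c2,ex0c3,ex1cid,...,ex1c3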
Example #9
    def train(self, training_instances, validation_instances='ignored', metrics='ignored'):
        self.build_graph()
        env = gym.make(cards_env.register())

        self.init_params()

        if self.options.verbosity >= 1:
            progress.start_task('Epoch', self.options.pg_train_epochs)

        for epoch in range(self.options.pg_train_epochs):
            if self.options.verbosity >= 1:
                progress.progress(epoch)

            batches = iterators.iter_batches(training_instances,
                                             self.options.pg_batch_size)
            num_batches = (len(training_instances) - 1) // self.options.pg_batch_size + 1

            if self.options.verbosity >= 1:
                progress.start_task('Batch', num_batches)

            try:
                for batch_num, batch in enumerate(batches):
                    if self.options.verbosity >= 1:
                        progress.progress(batch_num)
                    step = epoch * num_batches + batch_num
                    self.train_one_batch(list(batch), env, t=step)
                    if step % 10 == 0:
                        check_prefix = config.get_file_path('checkpoint')
                        self.saver.save(self.session, check_prefix, global_step=step)
            except KeyboardInterrupt:
                self.summary_writer.flush()
                raise

            if self.options.verbosity >= 1:
                progress.end_task()

        if self.options.verbosity >= 1:
            progress.end_task()
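
The training loop above keeps a single global step counter across epochs, so the checkpoint every 10 steps is based on total batches seen rather than batches within the current epoch. A toy trace of that bookkeeping:

num_batches = 7
for epoch in range(3):
    for batch_num in range(num_batches):
        step = epoch * num_batches + batch_num
        if step % 10 == 0:
            print('would checkpoint at global step', step)  # prints 0, 10, 20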
Example #10
    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        options = config.options()
        predictions = []
        scores = []

        all_utts = self.base.seq_vec.tokens
        sym_vec = vectorizers.SymbolVectorizer()
        sym_vec.add_all(all_utts)
        prior_scores = self.prior_scores(all_utts)

        base_is_listener = (type(self.base) in listener.LISTENERS.values())

        true_batch_size = options.listener_eval_batch_size // len(all_utts)
        batches = iterators.iter_batches(eval_instances, true_batch_size)
        num_batches = (len(eval_instances) - 1) // true_batch_size + 1

        if options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)
            context = len(
                batch[0].alt_inputs) if batch[0].alt_inputs is not None else 0
            if context:
                output_grid = [
                    (instance.Instance(utt, color)
                     if base_is_listener else instance.Instance(color, utt))
                    for inst in batch for color in inst.alt_inputs
                    for utt in sym_vec.tokens
                ]
                assert len(output_grid) == context * len(batch) * len(all_utts), \
                    'Context must be the same number of colors for all examples'
                true_indices = np.array([inst.input for inst in batch])
            else:
                output_grid = [
                    (instance.Instance(utt, inst.input) if base_is_listener
                     else instance.Instance(inst.input, utt)) for inst in batch
                    for utt in sym_vec.tokens
                ]
                true_indices = sym_vec.vectorize_all(
                    [inst.input for inst in batch])
                if len(true_indices.shape) == 2:
                    # Sequence vectorizer; we're only using single tokens for now.
                    true_indices = true_indices[:, 0]
            grid_scores = self.base.score(output_grid, verbosity=verbosity)
            if context:
                log_probs = np.array(grid_scores).reshape(
                    (len(batch), context, len(all_utts)))
                orig_log_probs = log_probs[np.arange(len(batch)),
                                           true_indices, :]
                # Renormalize over only the context colors, and extract the score of
                # the true color.
                log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis, :]
                log_probs = log_probs[np.arange(len(batch)), true_indices, :]
            else:
                log_probs = np.array(grid_scores).reshape(
                    (len(batch), len(all_utts)))
                orig_log_probs = log_probs
            assert log_probs.shape == (len(batch), len(all_utts))
            # Add in the prior scores, if used (S1 \propto L0 * P)
            if prior_scores is not None:
                log_probs = log_probs + 0.5 * prior_scores
            if options.exhaustive_base_weight:
                w = options.exhaustive_base_weight
                log_probs = w * orig_log_probs + (1.0 - w) * log_probs
            # Normalize across utterances. Note that the listener returns probability
            # densities over colors.
            log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis]
            if random:
                pred_indices = sample(np.exp(log_probs))
            else:
                pred_indices = np.argmax(log_probs, axis=1)
            predictions.extend(sym_vec.unvectorize_all(pred_indices))
            scores.extend(log_probs[np.arange(len(batch)),
                                    true_indices].tolist())
        progress.end_task()

        return predictions, scores
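
The example above combines the base listener scores with an utterance prior and an optional blend with the original scores, all in log space, before renormalizing over utterances. A toy sketch of that combination (the 0.5 prior weight and w mirror the code above; the values are illustrative):

import numpy as np
from scipy.special import logsumexp

l0_log_probs = np.log(np.array([[0.7, 0.2, 0.1]]))   # base scores per utterance
prior_scores = np.log(np.array([0.3, 0.3, 0.4]))     # utterance prior
w = 0.2                                              # exhaustive_base_weight

log_probs = l0_log_probs + 0.5 * prior_scores         # S1 \propto L0 * P
log_probs = w * l0_log_probs + (1.0 - w) * log_probs  # blend with base scores
log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis]
print(np.exp(log_probs))  # normalized distribution over utterances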
Example #11
    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        options = self.get_options()
        predictions = []
        scores = []

        if options.verbosity + verbosity >= 2:
            print('Building alternative utterance list')
        sym_vec = vectorizers.SymbolVectorizer()
        sym_vec.add_all([inst.input for inst in self.get_dataset(self.base)])

        assert eval_instances[0].alt_outputs, \
            'Context required for L(S(L)): %s' % eval_instances[0].__dict__
        context_len = len(eval_instances[0].alt_outputs)
        if options.exhaustive_num_samples > 0:
            num_alt_utts = options.exhaustive_num_samples * context_len + 1
            num_sample_sets = options.exhaustive_num_sample_sets
        else:
            num_alt_utts = len(sym_vec.tokens) + 1
            num_sample_sets = 1
        true_batch_size = max(
            options.listener_eval_batch_size //
            (num_alt_utts * num_sample_sets * context_len), 1)
        batches = iterators.iter_batches(eval_instances, true_batch_size)
        num_batches = (len(eval_instances) - 1) // true_batch_size + 1

        if options.exhaustive_output_speaker_samples:
            self.truncate_utterances_files('s1_samples.%s.jsons',
                                           num_sample_sets)
        if options.exhaustive_output_speaker_predictions:
            self.truncate_utterances_files('s1_predictions.%s.jsons',
                                           num_sample_sets)
        if options.exhaustive_output_all_grids:
            self.truncate_utterances_files('grids.%s.jsons.gz', 1)

        if options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)
            output_grid = self.build_grid(batch, sym_vec.tokens)
            assert len(output_grid) == len(batch) * num_sample_sets * context_len * num_alt_utts, \
                'Context must be the same number of colors for all examples %s' % \
                ((len(output_grid), len(batch), num_sample_sets, context_len, num_alt_utts),)
            true_indices = np.array([inst.output for inst in batch])
            grid_scores = self.base.score(output_grid, verbosity=verbosity)
            l0_log_probs = np.array(grid_scores).reshape(
                (len(batch), num_sample_sets, context_len, num_alt_utts))
            # Renormalize over only the context colors, and extract the score of
            # the true color according to the base model.
            l0_log_probs -= logsumexp(l0_log_probs, axis=2)[:, :,
                                                            np.newaxis, :]
            assert l0_log_probs.shape == (len(batch), num_sample_sets,
                                          context_len,
                                          num_alt_utts), l0_log_probs.shape
            orig_log_probs = l0_log_probs[np.arange(len(batch)), 0, :, 0]
            assert orig_log_probs.shape == (len(batch),
                                            context_len), orig_log_probs.shape
            # Apply temperature parameter before speaker.
            utilities = options.exhaustive_inv_temperature * l0_log_probs
            # Normalize across utterances. Note that the listener returns probability
            # densities over colors.
            s1_log_probs = utilities - logsumexp(utilities, axis=3)[:, :, :,
                                                                    np.newaxis]
            assert s1_log_probs.shape == (len(batch), num_sample_sets,
                                          context_len,
                                          num_alt_utts), s1_log_probs.shape
            if options.exhaustive_output_speaker_samples or \
                    options.exhaustive_output_speaker_predictions:
                speaker_dist = s1_log_probs[np.arange(len(batch)), :,
                                            true_indices, 1:]
                if options.exhaustive_output_speaker_samples:
                    speaker_sample_indices = sample(np.exp(speaker_dist))
                    self.write_speaker_utterances('s1_samples.%s.jsons',
                                                  output_grid,
                                                  speaker_sample_indices,
                                                  l0_log_probs.shape)
                if options.exhaustive_output_speaker_predictions:
                    speaker_pred_indices = np.argmax(speaker_dist, axis=2)
                    self.write_speaker_utterances('s1_predictions.%s.jsons',
                                                  output_grid,
                                                  speaker_pred_indices,
                                                  l0_log_probs.shape)
            # Normalize again across context colors.
            l2_log_probs = s1_log_probs - logsumexp(
                s1_log_probs, axis=2)[:, :, np.newaxis, :]
            assert l2_log_probs.shape == (len(batch), num_sample_sets,
                                          context_len,
                                          num_alt_utts), l2_log_probs.shape
            # Extract the score of each color for the input utterance according to the L2 model.
            log_probs = l2_log_probs[np.arange(len(batch)), :, :, 0]
            assert log_probs.shape == (len(batch), num_sample_sets,
                                       context_len), log_probs.shape
            # Blend L0 and L2 (if enabled) to produce final score.
            if options.exhaustive_base_weight:
                w = options.exhaustive_base_weight
                # Bump zero probabilities up to epsilon ~= 3e-23, because previously we would
                # only have -inf log probs, but now if w < 0 we could get NaNs.
                log_probs = (
                    w * np.maximum(orig_log_probs[:, np.newaxis, :], -52.0) +
                    (1.0 - w) * np.maximum(log_probs, -52.0))
            # Normalize across context one more time to prevent cheating when
            # blending.
            log_probs -= logsumexp(log_probs, axis=2)[:, :, np.newaxis]
            # Average (in probability space) over sample sets
            log_probs = logsumexp(log_probs, axis=1) - np.log(
                log_probs.shape[1])
            if options.exhaustive_output_all_grids:
                self.write_grids(output_grid, l0_log_probs, s1_log_probs,
                                 l2_log_probs, log_probs)
            if random:
                pred_indices = sample(np.exp(log_probs))
            else:
                pred_indices = np.argmax(log_probs, axis=1)
            predictions.extend(pred_indices)
            # Extract the score of the true color according to the combined model.
            scores.extend(log_probs[np.arange(len(batch)),
                                    true_indices].tolist())
        progress.end_task()

        return predictions, scores
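
The L(S(L)) pipeline above is a chain of alternating normalizations over the colors axis and the utterances axis of a four-dimensional score grid. A toy version of that chain, with all shapes and values assumed:

import numpy as np
from scipy.special import logsumexp

scores = np.random.rand(1, 1, 3, 4)                      # (batch, sample_sets, context, utts)
l0 = np.log(scores / scores.sum(axis=2, keepdims=True))  # L0: normalize over colors
s1 = l0 - logsumexp(l0, axis=3)[:, :, :, np.newaxis]     # S1: normalize over utterances
l2 = s1 - logsumexp(s1, axis=2)[:, :, np.newaxis, :]     # L2: normalize over colors again
print(np.exp(l2[0, 0, :, 0]).sum())                      # ~1.0 for the input utterance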
Example #12
    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        options = config.options()
        predictions = []
        scores = []

        all_utts = self.base.seq_vec.tokens
        sym_vec = vectorizers.SymbolVectorizer()
        sym_vec.add_all(all_utts)
        prior_scores = self.prior_scores(all_utts)

        base_is_listener = (type(self.base) in listener.LISTENERS.values())

        true_batch_size = options.listener_eval_batch_size // len(all_utts)
        batches = iterators.iter_batches(eval_instances, true_batch_size)
        num_batches = (len(eval_instances) - 1) // true_batch_size + 1

        if options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)
            context = len(batch[0].alt_inputs) if batch[0].alt_inputs is not None else 0
            if context:
                output_grid = [(instance.Instance(utt, color)
                                if base_is_listener else
                                instance.Instance(color, utt))
                               for inst in batch for color in inst.alt_inputs
                               for utt in sym_vec.tokens]
                assert len(output_grid) == context * len(batch) * len(all_utts), \
                    'Context must be the same number of colors for all examples'
                true_indices = np.array([inst.input for inst in batch])
            else:
                output_grid = [(instance.Instance(utt, inst.input)
                                if base_is_listener else
                                instance.Instance(inst.input, utt))
                               for inst in batch for utt in sym_vec.tokens]
                true_indices = sym_vec.vectorize_all([inst.input for inst in batch])
                if len(true_indices.shape) == 2:
                    # Sequence vectorizer; we're only using single tokens for now.
                    true_indices = true_indices[:, 0]
            grid_scores = self.base.score(output_grid, verbosity=verbosity)
            if context:
                log_probs = np.array(grid_scores).reshape((len(batch), context, len(all_utts)))
                orig_log_probs = log_probs[np.arange(len(batch)), true_indices, :]
                # Renormalize over only the context colors, and extract the score of
                # the true color.
                log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis, :]
                log_probs = log_probs[np.arange(len(batch)), true_indices, :]
            else:
                log_probs = np.array(grid_scores).reshape((len(batch), len(all_utts)))
                orig_log_probs = log_probs
            assert log_probs.shape == (len(batch), len(all_utts))
            # Add in the prior scores, if used (S1 \propto L0 * P)
            if prior_scores is not None:
                log_probs = log_probs + 0.5 * prior_scores
            if options.exhaustive_base_weight:
                w = options.exhaustive_base_weight
                log_probs = w * orig_log_probs + (1.0 - w) * log_probs
            # Normalize across utterances. Note that the listener returns probability
            # densities over colors.
            log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis]
            if random:
                pred_indices = sample(np.exp(log_probs))
            else:
                pred_indices = np.argmax(log_probs, axis=1)
            predictions.extend(sym_vec.unvectorize_all(pred_indices))
            scores.extend(log_probs[np.arange(len(batch)), true_indices].tolist())
        progress.end_task()

        return predictions, scores
Example #13
    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        options = self.get_options()
        predictions = []
        scores = []

        if options.verbosity + verbosity >= 2:
            print('Building alternative utterance list')
        sym_vec = vectorizers.SymbolVectorizer()
        sym_vec.add_all([inst.input for inst in self.get_dataset(self.base)])

        assert eval_instances[0].alt_outputs, \
            'Context required for L(S(L)): %s' % eval_instances[0].__dict__
        context_len = len(eval_instances[0].alt_outputs)
        if options.exhaustive_num_samples > 0:
            num_alt_utts = options.exhaustive_num_samples * context_len + 1
            num_sample_sets = options.exhaustive_num_sample_sets
        else:
            num_alt_utts = len(sym_vec.tokens) + 1
            num_sample_sets = 1
        true_batch_size = max(options.listener_eval_batch_size //
                              (num_alt_utts * num_sample_sets * context_len), 1)
        batches = iterators.iter_batches(eval_instances, true_batch_size)
        num_batches = (len(eval_instances) - 1) // true_batch_size + 1

        if options.exhaustive_output_speaker_samples:
            self.truncate_utterances_files('s1_samples.%s.jsons', num_sample_sets)
        if options.exhaustive_output_speaker_predictions:
            self.truncate_utterances_files('s1_predictions.%s.jsons', num_sample_sets)
        if options.exhaustive_output_all_grids:
            self.truncate_utterances_files('grids.%s.jsons.gz', 1)

        if options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)
            output_grid = self.build_grid(batch, sym_vec.tokens)
            assert len(output_grid) == len(batch) * num_sample_sets * context_len * num_alt_utts, \
                'Context must be the same number of colors for all examples %s' % \
                ((len(output_grid), len(batch), num_sample_sets, context_len, num_alt_utts),)
            true_indices = np.array([inst.output for inst in batch])
            grid_scores = self.base.score(output_grid, verbosity=verbosity)
            l0_log_probs = np.array(grid_scores).reshape((len(batch), num_sample_sets,
                                                          context_len, num_alt_utts))
            # Renormalize over only the context colors, and extract the score of
            # the true color according to the base model.
            l0_log_probs -= logsumexp(l0_log_probs, axis=2)[:, :, np.newaxis, :]
            assert l0_log_probs.shape == (len(batch), num_sample_sets,
                                          context_len, num_alt_utts), l0_log_probs.shape
            orig_log_probs = l0_log_probs[np.arange(len(batch)), 0, :, 0]
            assert orig_log_probs.shape == (len(batch), context_len), orig_log_probs.shape
            # Apply temperature parameter before speaker.
            utilities = options.exhaustive_inv_temperature * l0_log_probs
            # Normalize across utterances. Note that the listener returns probability
            # densities over colors.
            s1_log_probs = utilities - logsumexp(utilities, axis=3)[:, :, :, np.newaxis]
            assert s1_log_probs.shape == (len(batch), num_sample_sets,
                                          context_len, num_alt_utts), s1_log_probs.shape
            if options.exhaustive_output_speaker_samples or \
                    options.exhaustive_output_speaker_predictions:
                speaker_dist = s1_log_probs[np.arange(len(batch)), :, true_indices, 1:]
                if options.exhaustive_output_speaker_samples:
                    speaker_sample_indices = sample(np.exp(speaker_dist))
                    self.write_speaker_utterances('s1_samples.%s.jsons', output_grid,
                                                  speaker_sample_indices, l0_log_probs.shape)
                if options.exhaustive_output_speaker_predictions:
                    speaker_pred_indices = np.argmax(speaker_dist, axis=2)
                    self.write_speaker_utterances('s1_predictions.%s.jsons', output_grid,
                                                  speaker_pred_indices, l0_log_probs.shape)
            # Normalize again across context colors.
            l2_log_probs = s1_log_probs - logsumexp(s1_log_probs, axis=2)[:, :, np.newaxis, :]
            assert l2_log_probs.shape == (len(batch), num_sample_sets,
                                          context_len, num_alt_utts), l2_log_probs.shape
            # Extract the score of each color for the input utterance according to the L2 model.
            log_probs = l2_log_probs[np.arange(len(batch)), :, :, 0]
            assert log_probs.shape == (len(batch), num_sample_sets, context_len), log_probs.shape
            # Blend L0 and L2 (if enabled) to produce final score.
            if options.exhaustive_base_weight:
                w = options.exhaustive_base_weight
                # Bump zero probabilities up to epsilon ~= 3e-23, because previously we would
                # only have -inf log probs, but now if w < 0 we could get NaNs.
                log_probs = (w * np.maximum(orig_log_probs[:, np.newaxis, :], -52.0) +
                             (1.0 - w) * np.maximum(log_probs, -52.0))
            # Normalize across context one more time to prevent cheating when
            # blending.
            log_probs -= logsumexp(log_probs, axis=2)[:, :, np.newaxis]
            # Average (in probability space) over sample sets
            log_probs = logsumexp(log_probs, axis=1) - np.log(log_probs.shape[1])
            if options.exhaustive_output_all_grids:
                self.write_grids(output_grid,
                                 l0_log_probs, s1_log_probs, l2_log_probs, log_probs)
            if random:
                pred_indices = sample(np.exp(log_probs))
            else:
                pred_indices = np.argmax(log_probs, axis=1)
            predictions.extend(pred_indices)
            # Extract the score of the true color according to the combined model.
            scores.extend(log_probs[np.arange(len(batch)), true_indices].tolist())
        progress.end_task()

        return predictions, scores
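
The final averaging over sample sets in the last two examples is done in probability space without leaving log space: logsumexp over the sample-set axis minus the log of the number of sets gives the log of the mean probability. A small check with toy values:

import numpy as np
from scipy.special import logsumexp

log_probs = np.log(np.array([[[0.6, 0.4],      # (batch=1, sample_sets=2, context=2)
                              [0.2, 0.8]]]))
averaged = logsumexp(log_probs, axis=1) - np.log(log_probs.shape[1])
print(np.exp(averaged))  # [[0.4 0.6]], the per-color mean over the two sample sets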