    def evaluate_auxiliaries(
            self, examples: Dict[str, instruction_example.InstructionExample],
            prefix: str, experiment: crayon.CrayonExperiment):
        train_results_dict: Dict[str, Any] = \
            plan_metrics.plan_metric_results(self, examples)
        for name, float_value in train_results_dict.items():
            experiment.add_scalar_value(prefix + ' ' + str(name), float_value)

    def _evaluate(
            self, train_examples: Dict[str,
                                       instruction_example.InstructionExample],
            validation_examples: Dict[str,
                                      instruction_example.InstructionExample],
            first_epoch_aggregated_training: Dict[
                str,
                aggregated_instruction_example.AggregatedInstructionExample],
            validation_games: Dict[str, cereal_bar_game.CerealBarGame],
            game_arguments: game_args.GameArgs,
            evaluation_arguments: evaluation_args.EvaluationArgs,
            train_accuracy_proportion: float,
            experiment: crayon.CrayonExperiment, epoch_num: int,
            logger: evaluation_logger.EvaluationLogger):
        validation_followed_proportion = 0.
        validation_score_proportion = 0.
        with torch.no_grad():
            logger.disable_logging()
            _evaluate_and_log_metrics(
                self,
                list(train_examples.values())
                [:int(len(train_examples) * train_accuracy_proportion)],
                game_arguments, evaluation_arguments, experiment, 'train',
                epoch_num, logger)
            logger.enable_logging()

            logger.log('Epoch %d validation evaluation' % epoch_num)

            validation_card_state_accuracy = _evaluate_and_log_metrics(
                self, list(validation_examples.values()), game_arguments,
                evaluation_arguments, experiment, 'validation', epoch_num,
                logger)

            if self._end_to_end:
                full_results = action_generator_metrics.execution_accuracies(
                    self,
                    game_arguments,
                    evaluation_arguments,
                    game_examples=list(validation_games.values()),
                    logger=logger)
                logger.log('Full game evaluation results:')
                logger.log(str(full_results))
                # TODO: derive validation_followed_proportion and
                # validation_score_proportion from full_results rather than
                # logging the zero-initialized defaults below.
                experiment.add_scalar_value('val prop followed',
                                            validation_followed_proportion)
                experiment.add_scalar_value('val prop score',
                                            validation_score_proportion)

                if first_epoch_aggregated_training:
                    _evaluate_and_log_metrics(
                        self,
                        list(first_epoch_aggregated_training.values())
                        [:int(len(first_epoch_aggregated_training) * 0.1)],
                        game_arguments, evaluation_arguments, experiment,
                        'agg train epoch 0', epoch_num, logger)

        return (validation_card_state_accuracy, validation_followed_proportion,
                validation_score_proportion)
def crayon_ship_metrics(crayon_exp: CrayonExperiment,
                        mets_dict: Dict[str, float],
                        metrics: List[str],
                        iteration: int):
    """
    Ship new metrics to alband/crayon's tensorboard
    :param crayon_exp: target experiment
    :param mets_dict: dictionary holding metrics
    :param metrics: list of metrics to ship
    :param iteration: current iteration
    :return:
    """

    out_dict = {k: v for k, v in mets_dict.items() if k in metrics}
    crayon_exp.add_scalar_dict(out_dict, wall_time=-1, step=iteration + 1)
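
# A minimal usage sketch for crayon_ship_metrics. It assumes a crayon server is
# running locally and the pycrayon client is installed; the experiment name and
# metric keys below are illustrative, not taken from the training code above.
from pycrayon import CrayonClient

def _ship_metrics_example():
    client = CrayonClient(hostname='localhost')
    experiment = client.create_experiment('example_run')
    metrics_dict = {'train loss': 0.42, 'val accuracy': 0.87, 'unlogged': 1.0}
    # Only the whitelisted keys are shipped; 'unlogged' is filtered out.
    crayon_ship_metrics(experiment, metrics_dict,
                        ['train loss', 'val accuracy'], iteration=0)
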
    def _eval(self, train_examples: Dict[str, Any],
              val_examples: Dict[str, Any],
              experiment: crayon.CrayonExperiment):
        self.eval()
        with torch.no_grad():
            auxiliaries_train = dict()
            train_sample = list(
                train_examples.values())[:int(len(train_examples) * 0.1)]
            for example in train_sample:
                card_scores, auxiliaries = self.get_predictions(example)
                if Auxiliary.FINAL_CARDS in self._auxiliaries:
                    auxiliaries.update(
                        {Auxiliary.FINAL_CARDS: card_scores.squeeze()})
                auxiliaries_train[example.get_id()] = auxiliaries

            train_results_dict: Dict[str, Any] = \
                auxiliary_property_accuracies(self,
                                              train_sample,
                                              auxiliaries_train,
                                              self._args.get_decoder_args().traj_weight_by_time())

            for name, float_value in train_results_dict.items():
                experiment.add_scalar_value('train ' + str(name), float_value)

            auxiliaries_val = dict()
            for example in val_examples.values():
                card_scores, auxiliaries = self.get_predictions(example)
                if Auxiliary.FINAL_CARDS in self._auxiliaries:
                    auxiliaries.update(
                        {Auxiliary.FINAL_CARDS: card_scores.squeeze()})
                auxiliaries_val[example.get_id()] = auxiliaries
            val_results_dict: Dict[str, Any] = \
                auxiliary_property_accuracies(self,
                                              list(val_examples.values()),
                                              auxiliaries_val,
                                              self._args.get_decoder_args().traj_weight_by_time())
            for name, float_value in val_results_dict.items():
                experiment.add_scalar_value('val ' + str(name), float_value)

        if str(Auxiliary.TRAJECTORY) + ' xent' not in val_results_dict:
            val_results_dict[str(Auxiliary.TRAJECTORY) + ' xent'] = 0.
        return (val_results_dict[str(Auxiliary.FINAL_CARDS) + ' accuracy']
                if Auxiliary.FINAL_CARDS in self._auxiliaries else 0.,
                val_results_dict[str(Auxiliary.TRAJECTORY) + ' xent'])
def _evaluate_and_log_metrics(
        model: ActionGeneratorModelWrapper,
        examples: List[instruction_example.InstructionExample],
        game_arguments: game_args.GameArgs,
        evaluation_arguments: evaluation_args.EvaluationArgs,
        experiment: crayon.CrayonExperiment, prefix: str, step: int,
        logger: evaluation_logger.EvaluationLogger):
    # TODO: Should this include accuracy of auxiliary predictions? It's a bit harder to measure because the agent gets
    # off the gold trajectory, so the labels are less well-defined.
    metric_results = action_generator_metrics.execution_accuracies(
        model,
        game_arguments,
        evaluation_arguments,
        instruction_examples=examples,
        logger=logger)
    logger.log('Evaluation results:')
    logger.log(str(metric_results))

    experiment.add_scalar_value(
        prefix + ' exact acc',
        metric_results[metric.Metric.SEQUENCE_ACCURACY],
        step=step)
    experiment.add_scalar_value(prefix + ' agent distance',
                                metric_results[metric.Metric.AGENT_DISTANCE],
                                step=step)
    experiment.add_scalar_value(
        prefix + ' exact config acc',
        metric_results[metric.Metric.EXACT_ENVIRONMENT_ACCURACY],
        step=step)
    experiment.add_scalar_value(
        prefix + ' environment acc',
        metric_results[metric.Metric.RELAXED_ENVIRONMENT_ACCURACY],
        step=step)

    card_accuracy = metric_results[metric.Metric.CARD_ACCURACY]
    experiment.add_scalar_value(prefix + ' card acc', card_accuracy, step=step)

    return card_accuracy
    def _train_epoch(
            self, train_ids: List[Tuple[str, int]], epoch_idx: int,
            train_examples: Dict[str, instruction_example.InstructionExample],
            batch_size: int, optimizer: torch.optim.Optimizer,
            experiment: crayon.CrayonExperiment):
        self.train()
        num_batches: int = 0
        train_loss_sum: float = 0

        random.shuffle(train_ids)
        losses_dict = dict()
        with util.get_progressbar('epoch ' + str(epoch_idx),
                                  int(len(train_examples) /
                                      batch_size)) as pbar:
            for start_idx in range(0, len(train_examples), batch_size):
                pbar.update(num_batches)
                examples_in_batch: List[Any] = list()
                for ex_id, idx in train_ids[start_idx:start_idx + batch_size]:
                    examples_in_batch.append((train_examples[ex_id], idx))

                batch_loss, _, auxiliary_losses = train_aux_loss_batch(
                    self, examples_in_batch, optimizer)

                for auxiliary_type, losses in auxiliary_losses.items():
                    if auxiliary_type == Auxiliary.HEX_PROPERTIES:
                        for loss_name, loss in losses.items():
                            if loss_name not in losses_dict:
                                losses_dict[loss_name] = 0.
                            losses_dict[loss_name] += loss.item()
                    else:
                        loss_name = str(auxiliary_type)
                        if loss_name not in losses_dict:
                            losses_dict[loss_name] = 0.
                        losses_dict[loss_name] += losses.item()

                experiment.add_scalar_value('batch loss', batch_loss)
                if math.isnan(batch_loss):
                    raise ValueError('NaN Loss')

                train_loss_sum += batch_loss
                num_batches += 1

        avg_loss: float = float(train_loss_sum / num_batches)
        logging.info('Average loss per batch: %f', avg_loss)
        experiment.add_scalar_value('train loss', avg_loss)
        for loss, loss_sum in losses_dict.items():
            experiment.add_scalar_value('train ' + str(loss) + ' loss',
                                        float(loss_sum / num_batches))
    def train_loop(self, dataset: game_dataset.GameDataset,
                   game_arguments: game_args.GameArgs,
                   evaluation_arguments: evaluation_args.EvaluationArgs,
                   training_arguments: training_args.TrainingArgs,
                   experiment: crayon.CrayonExperiment) -> str:
        train_examples: Dict[
            str,
            instruction_example.InstructionExample] = dataset.get_examples(
                dataset_split.DatasetSplit.UPDATE)

        if self._args.use_all_trajectory():
            raise ValueError('Using the whole trajectory is not supported.')
            # Train IDs are a cross between training IDs and indices in the action sequence.
            # train_ids: List[Tuple[str, int]] = list()
            # for example_id, example in train_examples.items():
            #    for i in range(len(example.get_state_deltas())):
            #        train_ids.append((example_id, i))
        else:
            train_ids: List[Tuple[str,
                                  int]] = [(key, 0)
                                           for key in train_examples.keys()]

        val_examples: Dict[
            str,
            instruction_example.InstructionExample] = dataset.get_examples(
                dataset_split.DatasetSplit.VAL)

        # Set up the optimizer; note that clip_grad_norm_ below runs only once
        # here at setup, not after every backward pass.
        optimizer: torch.optim.Optimizer = training_arguments.get_optimizer(
            'hex')(self.parameters())
        if training_arguments.get_max_gradient() > 0:
            torch.nn.utils.clip_grad_norm_(
                self.parameters(), training_arguments.get_max_gradient())

        num_epochs: int = 0
        max_acc = 0

        patience: float = training_arguments.get_initial_patience()
        countdown: int = int(patience)

        best_epoch_filename: str = ''

        while countdown > 0:
            logging.info('Starting epoch (hex predictor) ' + str(num_epochs))
            self._train_epoch(train_ids, num_epochs, train_examples,
                              training_arguments.get_batch_size(), optimizer,
                              experiment)

            val_acc, val_xent = self._eval(train_examples, val_examples,
                                           experiment)

            save_desc: str = ''

            if val_acc > max_acc:
                logging.info('Best accuracy: ' + str(val_acc))
                max_acc = val_acc
                save_desc += '_bestacc'

            if save_desc:
                # Hacky: for now, always return the one who is best on card prediction only.
                # This seems to be better than xent.
                filename: str = \
                    os.path.join(training_arguments.get_save_dir(), 'model_' + str(num_epochs) + save_desc + '.pt')
                best_epoch_filename = filename

                patience *= training_arguments.get_patience_update_ratio()
                countdown = int(patience)
                logging.info('Resetting countdown to %d, patience is %d',
                             countdown, patience)

                self.save(filename)

            num_epochs += 1
            countdown -= 1
            experiment.add_scalar_value('countdown', countdown)

        return best_epoch_filename
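
# A standalone sketch of the patience/countdown schedule used by train_loop
# above, with hypothetical numbers: each improvement multiplies the patience by
# the update ratio and resets the countdown, which otherwise decreases by one
# per epoch until training stops.
def _patience_schedule_example(improvement_epochs,
                               initial_patience: float = 5.0,
                               update_ratio: float = 1.5) -> int:
    patience = initial_patience
    countdown = int(patience)
    epoch = 0
    while countdown > 0:
        if epoch in improvement_epochs:
            patience *= update_ratio
            countdown = int(patience)
        epoch += 1
        countdown -= 1
    # Returns the total number of epochs run before the countdown expired.
    return epoch

# For example, _patience_schedule_example({0, 3}) runs 14 epochs: the second
# improvement resets the countdown to int(5.0 * 1.5 * 1.5) = 11, which is then
# decremented once per epoch with no further improvement.
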
    def train_loop(self, dataset: game_dataset.GameDataset,
                   game_arguments: game_args.GameArgs,
                   evaluation_arguments: evaluation_args.EvaluationArgs,
                   training_arguments: training_args.TrainingArgs,
                   experiment: crayon.CrayonExperiment) -> str:

        train_examples: Dict[
            str,
            instruction_example.InstructionExample] = dataset.get_examples(
                dataset_split.DatasetSplit.UPDATE)
        validation_examples: Dict[
            str,
            instruction_example.InstructionExample] = dataset.get_examples(
                dataset_split.DatasetSplit.VALIDATION)
        validation_games: Dict[
            str, cereal_bar_game.CerealBarGame] = dataset.get_games(
                dataset_split.DatasetSplit.VALIDATION)

        # Set up the optimizer; note that clip_grad_norm_ below runs only once
        # here at setup, not after every backward pass.
        optimizer: torch.optim.Optimizer = \
            training_arguments.get_optimizer(self.get_arguments().get_task(),
                                             self.get_arguments().get_decoder_args().end_to_end())(self.parameters())
        if training_arguments.get_max_gradient() > 0:
            torch.nn.utils.clip_grad_norm_(
                self.parameters(), training_arguments.get_max_gradient())
        batch_size: int = training_arguments.get_batch_size()

        num_epochs: int = 0

        maximum_card_state_accuracy = 0
        maximum_proportion_instructions_followed = 0
        maximum_proportion_points_scored = 0

        patience: float = training_arguments.get_initial_patience()
        countdown: int = int(patience)

        # Aggregated datasets
        aggregated_train_examples: List[
            Dict[str, aggregated_instruction_example.
                 AggregatedInstructionExample]] = list()
        aggregated_validation_examples: List[Dict[str, aggregated_instruction_example.AggregatedInstructionExample]] = \
            list()

        compiled_example_set_train: Set[str] = set()
        for example in train_examples.values():
            # Need to use a string instead so it can be hashed.
            compiled_example_set_train.add(example.hash_representation())
        compiled_example_set_validation: Set[str] = set()
        for example in validation_examples.values():
            compiled_example_set_validation.add(example.hash_representation())

        best_filename: str = ''

        aggregated_buffer: Set[Tuple[str, int]] = set()

        while countdown > 0 and (not self._end_to_end or
                                 num_epochs < MAXIMUM_NUM_END_TO_END_EPOCHS):
            logging.info('Starting epoch (action predictor) ' +
                         str(num_epochs))
            total_num_examples: int = (len(train_examples) + sum([
                len(epoch_examples)
                for epoch_examples in aggregated_train_examples
            ]))
            total_num_validation_examples: int = \
                (len(validation_examples) + sum([len(epoch_examples) for epoch_examples in
                                                 aggregated_validation_examples]))

            if training_arguments.aggregate_examples():
                experiment.add_scalar_value('number of training examples',
                                            total_num_examples,
                                            step=num_epochs)
                experiment.add_scalar_value('number of validation examples',
                                            total_num_validation_examples,
                                            step=num_epochs)

            new_examples, aggregated_buffer = \
                self._train_epoch(num_epochs,
                                  train_examples,
                                  batch_size,
                                  optimizer,
                                  experiment,
                                  training_arguments.aggregate_examples(),
                                  aggregated_train_examples,
                                  compiled_example_set_train,
                                  game_arguments,
                                  evaluation_arguments,
                                  aggregated_buffer)

            if training_arguments.aggregate_examples():
                logging.info('Collected ' + str(len(new_examples)) +
                             ' new training examples!')
                logging.info('Num marked as implicit: ' + str(
                    len([ex
                         for ex in new_examples.values() if ex.implicit()])))
                aggregated_train_examples.append(new_examples)

                num_card = 0
                num_pos = 0
                for example in new_examples.values():
                    if example.get_type(
                    ) == aggregated_instruction_example.InstructionExampleType.INVALID_CARD_STATE:
                        num_card += 1
                    else:
                        num_pos += 1
                logging.info('Created ' + str(num_card) +
                             ' incorrect card state examples; ' +
                             str(num_pos) + ' position examples.')
                experiment.add_scalar_value('new invalid card examples',
                                            num_card,
                                            step=num_epochs)
                experiment.add_scalar_value('new incorrect position examples',
                                            num_pos,
                                            step=num_epochs)

                # Save the examples under a file specific to this epoch.
                if new_examples:
                    with open(
                            os.path.join(
                                training_arguments.get_save_directory(),
                                'aggregated_train_examples_epoch' +
                                str(num_epochs) + '.pkl'), 'wb') as ofile:
                        pickle.dump(new_examples, ofile)

            evaluation_filename = \
                evaluation_arguments.get_evaluation_results_filename()
            if evaluation_filename:
                evaluation_filename = os.path.join(
                    training_arguments.get_save_directory(),
                    evaluation_filename + '-' + str(num_epochs))

            logger: evaluation_logger.EvaluationLogger = evaluation_logger.EvaluationLogger(
                evaluation_filename)

            (validation_card_state_accuracy, validation_proportion_instructions_followed,
             validation_proportion_points_scored) = \
                self._evaluate(train_examples,
                               validation_examples,
                               aggregated_train_examples[0] if aggregated_train_examples else None,
                               validation_games,
                               game_arguments,
                               evaluation_arguments,
                               training_arguments.get_proportion_of_train_for_accuracy(),
                               experiment,
                               num_epochs,
                               logger)

            suffix = ''
            better = False
            if validation_card_state_accuracy > maximum_card_state_accuracy:
                logging.info('Best card acc at ' +
                             '{0:.2f}'.format(validation_card_state_accuracy) +
                             '%')
                maximum_card_state_accuracy = validation_card_state_accuracy
                suffix += '_card'
                better = True
            if validation_proportion_instructions_followed > maximum_proportion_instructions_followed:
                logging.info('Highest prop followed at %f',
                             validation_proportion_instructions_followed)
                maximum_proportion_instructions_followed = validation_proportion_instructions_followed
                suffix += '_follow'
                better = True
            if validation_proportion_points_scored > maximum_proportion_points_scored:
                logging.info('Highest prop score at %f',
                             validation_proportion_points_scored)
                maximum_proportion_points_scored = validation_proportion_points_scored
                suffix += '_score'
                better = True

            if better:
                filename = os.path.join(
                    training_arguments.get_save_directory(),
                    'model_' + str(num_epochs) + suffix + '.pt')
                best_filename = filename
                patience *= training_arguments.get_patience_update_factor()
                countdown = int(patience)
                logging.info('Resetting countdown to ' + str(countdown))
                self.save(filename)

            num_epochs += 1
            countdown -= 1
            experiment.add_scalar_value('countdown', countdown)
        return best_filename
    def _train_epoch(
            self, epoch_idx: int,
            train_examples: Dict[str, instruction_example.InstructionExample],
            batch_size: int, optimizer: torch.optim.Optimizer,
            experiment: crayon.CrayonExperiment, aggregate_examples: bool,
            aggregated_train_examples: List[Dict[
                str,
                aggregated_instruction_example.AggregatedInstructionExample]],
            compiled_example_set: Set[str], game_arguments: game_args.GameArgs,
            evaluation_arguments: evaluation_args.EvaluationArgs,
            aggregated_buffer: Set[Tuple[str, int]]):
        self.train()
        num_batches: int = 0
        train_loss_sum: float = 0

        # The train IDs set pairs example IDs with the epoch in which they were created (-1 for static data).
        # It is augmented below with the IDs of examples aggregated during this epoch.
        train_ids_set = {(key, -1) for key in train_examples.keys()}

        losses_dict = dict()
        main_loss_sum: float = 0

        total_num_examples_seen: int = 0

        # Aggregate new training examples. Similar to DAgger (Ross et al. 2011), this first aggregates examples, then
        # trains on them (plus the original static data); see the standalone sketch after this method.
        new_aggregated_examples = None
        if aggregate_examples:
            raise ValueError('Example aggregation not supported yet.')
            logging.info('Aggregated buffer has ' +
                         str(len(aggregated_buffer)) + ' examples.')
            new_aggregated_examples: Dict[str, aggregated_instruction_example.AggregatedInstructionExample] = \
                perform_example_aggregation(train_examples,
                                            compiled_example_set,
                                            epoch_idx,
                                            game_arguments,
                                            evaluation_arguments,
                                            self,
                                            allow_duplicates=True)  # Allow duplicate examples during training
            new_ids: List[Tuple[str, int]] = [
                (key, epoch_idx) for key in new_aggregated_examples.keys()
            ]
            logging.info('Generated ' + str(len(new_aggregated_examples)) +
                         ' new examples.')

            # Add it to the compiled examples
            for example_id, new_example in new_aggregated_examples.items():
                # Remove the epoch at the end!
                compiled_example_set.add(new_example.hash_rep())

            # Add to the aggregated buffer
            aggregated_buffer = aggregated_buffer | set(new_ids)

            # Now randomly choose a subset, and combine with the train IDs
            # This doesn't guarantee that the most recently collected examples appear in this epoch
            # random.sample requires a sequence, so convert the buffer first.
            train_ids_set = train_ids_set | set(
                random.sample(list(aggregated_buffer),
                              min(len(train_examples),
                                  len(aggregated_buffer))))

            example_ids = list(train_examples.keys())
            random.shuffle(example_ids)
            inorder_ids: List[str] = list()
            for ex_id in example_ids:
                inorder_ids.extend([
                    ex for ex in train_ids_set
                    if '-'.join(ex[0].split('-')[:2]) == ex_id
                ])
            logging.info('IDs used for batch: ')
            logging.info(','.join([ex_id[0] for ex_id in inorder_ids]))
            if len(set(inorder_ids)) != len(inorder_ids):
                raise ValueError(
                    'Inorder IDs was not a set -- has duplicate members, ' +
                    str(len(set(inorder_ids))) + ' vs. ' +
                    str(len(inorder_ids)))
            if set(inorder_ids) != set(train_ids_set):
                # These should just be examples that are too long because the original example is too long.
                logging.info('Train IDs not in inorder IDs:')
                logging.info(','.join([
                    ex_id[0] for ex_id in set(train_ids_set) - set(inorder_ids)
                ]))
                logging.info('Inorder IDs not in train IDs:')
                logging.info(','.join([
                    ex_id[0] for ex_id in set(inorder_ids) - set(train_ids_set)
                ]))
                logging.warning(
                    'Set of inorder IDs is not the same as the set of train ids'
                )
        else:
            inorder_ids = list(train_ids_set)
            random.shuffle(inorder_ids)

        logging.info('Starting epoch #%d with %d examples.', epoch_idx,
                     len(inorder_ids))

        with util.get_progressbar('epoch ' + str(epoch_idx),
                                  len(inorder_ids)) as pbar:
            for start_idx in range(0, len(inorder_ids), batch_size):
                sampled_ids = inorder_ids[
                    start_idx:min(start_idx + batch_size, len(inorder_ids))]

                # Construct the list of examples that should be used to update the model parameters.
                update_examples: List[Union[AggregatedExample,
                                            Example]] = list()

                for identifier, epoch in sampled_ids:
                    # This example was sampled from the first epoch.
                    if epoch < 0:
                        update_examples.append(train_examples[identifier])
                    elif epoch == epoch_idx:
                        # This example was recently added (i.e., during this epoch).
                        update_examples.append(
                            new_aggregated_examples[identifier])
                    else:
                        # Take from the aggregated dataset for the specified epoch.
                        update_examples.append(
                            aggregated_train_examples[epoch][identifier])

                # Do an update, and log the performance of the metrics.
                loss, main_loss, auxiliary_losses = \
                    batch_loss.apply_batch_loss(self, update_examples, optimizer)

                for auxiliary_type, losses in auxiliary_losses.items():
                    loss_name = str(auxiliary_type)
                    if loss_name not in losses_dict:
                        losses_dict[loss_name] = 0.
                    losses_dict[loss_name] += losses.item()

                experiment.add_scalar_value('batch loss', loss)
                if math.isnan(loss):
                    raise ValueError('NaN Loss')

                train_loss_sum += loss
                main_loss_sum += main_loss.item()
                num_batches += 1

                total_num_examples_seen += len(sampled_ids)
                pbar.update(total_num_examples_seen)

        avg_loss: float = float(train_loss_sum / num_batches)
        logging.info('Average loss per batch: %f', avg_loss)
        experiment.add_scalar_value('train loss', avg_loss)
        experiment.add_scalar_value('train action prediction loss',
                                    float(main_loss_sum / num_batches))
        for loss, loss_sum in losses_dict.items():
            experiment.add_scalar_value('train ' + str(loss) + ' loss',
                                        float(loss_sum / num_batches))

        # Return the new aggregated examples so the caller can add them to the training set.
        return new_aggregated_examples, aggregated_buffer
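
# A standalone sketch (with hypothetical IDs) of the DAgger-style mixing done in
# _train_epoch above: every epoch trains on all of the static example IDs plus a
# random sample, capped at the static-set size, drawn from the growing buffer of
# aggregated examples.
import random
from typing import Set, Tuple

def _mix_with_aggregated(static_ids: Set[Tuple[str, int]],
                         aggregated_buffer: Set[Tuple[str, int]]
                         ) -> Set[Tuple[str, int]]:
    sample_size = min(len(static_ids), len(aggregated_buffer))
    # random.sample needs a sequence; sorting makes runs reproducible under a fixed seed.
    sampled = set(random.sample(sorted(aggregated_buffer), sample_size))
    return static_ids | sampled

# e.g. _mix_with_aggregated({('ex-0', -1), ('ex-1', -1)},
#                           {('ex-0', 2), ('ex-1', 2), ('ex-0', 3)})
# returns the two static IDs plus two IDs sampled from the aggregated buffer.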