Example no. 1
    def _run_one_iteration(self, iteration, eval_mode):
        """Runs one iteration in separate thread, logs and checkpoints results.

    Same as parent Runner implementation except that summary statistics are
    directly logged instead of being returned.

    Args:
      iteration: int, current iteration number, used as a global_step for saving
        Tensorboard summaries.
      eval_mode: bool, whether this is an evaluation iteration.
    """
        statistics = iteration_statistics.IterationStatistics()
        iteration_name = '{}iteration {}'.format('eval ' if eval_mode else '',
                                                 iteration)
        tf.logging.info('Starting %s.', iteration_name)
        run_phase = self._run_eval_phase if eval_mode else self._run_train_phase
        num_episodes, average_reward = run_phase(statistics)
        with self._output_lock:
            logging_iteration = iteration if eval_mode else self._completed_iteration
            self._log_experiment(logging_iteration,
                                 statistics,
                                 suffix='_eval' if eval_mode else '')
            self._save_tensorboard_summaries(
                logging_iteration,
                num_episodes,
                average_reward,
                tag='Eval' if eval_mode else 'Train')
            if not eval_mode:
                self._checkpoint_experiment(self._completed_iteration)
                self._completed_iteration += 1
        tf.logging.info('Completed %s.', iteration_name)
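A hypothetical driver for the threaded variant above (illustrative only; run_iterations and eval_every are not from the source): training iterations run on the main thread while evaluation iterations run on background threads, and the _output_lock inside _run_one_iteration keeps logging, summary writing and checkpointing serialized across them.

import threading

def run_iterations(runner, num_iterations, eval_every=10):
    """Illustrative sketch: train on the main thread, evaluate in the background."""
    eval_threads = []
    for iteration in range(num_iterations):
        # Training updates _completed_iteration and checkpoints under the lock.
        runner._run_one_iteration(iteration, eval_mode=False)
        if iteration % eval_every == 0:
            # Evaluation runs concurrently; its output is serialized by _output_lock.
            thread = threading.Thread(
                target=runner._run_one_iteration, args=(iteration, True))
            thread.start()
            eval_threads.append(thread)
    for thread in eval_threads:
        thread.join()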
Example no. 2
    def testAddManyValues(self):
        my_pi = 3.14159

        statistics = iteration_statistics.IterationStatistics()

        # Add a number of items. Each item is added to the list corresponding to its
        # given key.
        statistics.append({
            'rewards': 0,
            'nouns': 'reinforcement',
            'angles': my_pi
        })
        # Add a second item to the 'nouns' list.
        statistics.append({'nouns': 'learning'})

        # There are three lists.
        self.assertEqual(3, len(statistics.data_lists))
        self.assertEqual(1, len(statistics.data_lists['rewards']))
        self.assertEqual(2, len(statistics.data_lists['nouns']))
        self.assertEqual(1, len(statistics.data_lists['angles']))

        self.assertEqual(0, statistics.data_lists['rewards'][0])
        self.assertEqual('reinforcement', statistics.data_lists['nouns'][0])
        self.assertEqual('learning', statistics.data_lists['nouns'][1])
        self.assertEqual(my_pi, statistics.data_lists['angles'][0])
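For reference, a minimal sketch of the container these tests exercise; the real dopamine.discrete_domains.iteration_statistics.IterationStatistics may differ in detail.

class IterationStatistics(object):
    """Accumulates per-iteration metrics into one list per key."""

    def __init__(self):
        self.data_lists = {}

    def append(self, data_pairs):
        """Appends each value in data_pairs to the list stored under its key."""
        for key, value in data_pairs.items():
            if key not in self.data_lists:
                self.data_lists[key] = []
            self.data_lists[key].append(value)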
Example no. 3
    def _run_one_iteration(self, iteration):
        """Runs one iteration of agent/environment interaction.

    An iteration involves running several episodes until a certain number of
    steps are obtained. The interleaving of train/eval phases implemented here
    is to match the implementation of Mnih et al. (2015).

    Args:
      iteration: int, current iteration number, used as a global_step for saving
        Tensorboard summaries.

    Returns:
      A dict containing summary statistics for this iteration.
    """
        statistics = iteration_statistics.IterationStatistics()
        tf.logging.info('Starting iteration %d', iteration)
        num_episodes_train, average_reward_train = self._run_train_phase(
            statistics)
        num_episodes_eval, average_reward_eval = self._run_eval_phase(
            statistics)

        self._save_tensorboard_summaries(iteration, num_episodes_train,
                                         average_reward_train,
                                         num_episodes_eval,
                                         average_reward_eval)
        if not self.testing:
            self.fout_test.write(
                '%d %f %d\n' %
                (iteration, average_reward_eval, num_episodes_eval))
            self.fout_test.flush()
        return statistics.data_lists
Example no. 4
  def _run_one_iteration(self, iteration):
    """Runs one iteration of agent/environment interaction.

    An iteration involves running several episodes until a certain number of
    steps are obtained. The interleaving of train/eval phases implemented here
    is to match the implementation of Mnih et al. (2015).

    Args:
      iteration: int, current iteration number, used as a global_step for saving
        Tensorboard summaries.

    Returns:
      A dict containing summary statistics for this iteration.
    """
    statistics = iteration_statistics.IterationStatistics()
    logging.info('Starting iteration %d', iteration)
    num_episodes_train, average_reward_train, average_steps_per_second = (
        self._run_train_phase(statistics))
    active_num_episodes_eval, active_average_reward_eval = self._run_eval_phase(
        statistics, 'active')
    passive_num_episodes_eval, passive_average_reward_eval = (
        self._run_eval_phase(statistics, 'passive'))

    self._save_tensorboard_summaries(iteration, num_episodes_train,
                                     average_reward_train,
                                     active_num_episodes_eval,
                                     active_average_reward_eval,
                                     passive_num_episodes_eval,
                                     passive_average_reward_eval,
                                     average_steps_per_second)
    return statistics.data_lists
Example no. 5
    def _run_one_iteration(self, iteration):
        """Runs one iteration of agent/environment interaction.

    An iteration involves running several episodes until a certain number of
    steps are obtained. The interleaving of train/eval phases implemented here
    is to match the implementation of Mnih et al. (2015).

    Args:
      iteration: int, current iteration number, used as a global_step for saving
        Tensorboard summaries.

    Returns:
      A dict containing summary statistics for this iteration.
    """
        statistics = iteration_statistics.IterationStatistics()
        tf.logging.info('Starting iteration %d', iteration)

        # Perform the training phase, during which the agent learns.
        self._agent.eval_mode = False
        start_time = time.time()
        number_steps, sum_returns, num_episodes = self._run_one_phase(
            self._training_steps, statistics, 'train')
        average_return = sum_returns / num_episodes if num_episodes > 0 else 0.0
        print("Average return", round(average_return, 2))
        print("Number_steps", number_steps)
        print("Num episodes", num_episodes)
        statistics.append({'train_average_return': average_return})
        time_delta = time.time() - start_time
        print("Time", int(time_delta))
        tf.logging.info(
            'Average undiscounted return per training episode: %.2f',
            average_return)
        tf.logging.info('Average training steps per second: %.2f',
                        number_steps / time_delta)
        num_episodes_train, average_reward_train = num_episodes, average_return

        # Perform the evaluation phase -- no learning.
        self._agent.eval_mode = True
        _, eval_sum_returns, eval_num_episodes = self._run_one_phase(
            self._evaluation_steps, statistics, 'eval')
        eval_average_return = eval_sum_returns / eval_num_episodes if eval_num_episodes > 0 else 0.0
        print("Test average return", round(eval_average_return, 2))
        print("Test num episodes", eval_num_episodes)
        print('---------------------------------------')
        tf.logging.info(
            'Average undiscounted return per evaluation episode: %.2f',
            eval_average_return)
        statistics.append({'eval_average_return': eval_average_return})
        num_episodes_eval, average_reward_eval = eval_num_episodes, eval_average_return

        self._save_tensorboard_summaries(iteration, num_episodes_train,
                                         average_reward_train,
                                         num_episodes_eval,
                                         average_reward_eval)
        return statistics.data_lists
Example no. 6
    def testAddOneValue(self):
        statistics = iteration_statistics.IterationStatistics()

        # The statistics data structure should be empty a-priori.
        self.assertEqual(0, len(statistics.data_lists))

        statistics.append({'key1': 0})
        # We should have exactly one list, containing one value.
        self.assertEqual(1, len(statistics.data_lists))
        self.assertEqual(1, len(statistics.data_lists['key1']))
        self.assertEqual(0, statistics.data_lists['key1'][0])
Example no. 7
    def _run_one_iteration(self, iteration):
        """Runs one iteration of agent/environment interaction."""
        statistics = iteration_statistics.IterationStatistics()
        tf.logging.info('Starting iteration %d', iteration)
        self._run_train_phase()

        num_episodes_eval, average_reward_eval = self._run_eval_phase(
            statistics)

        self._save_tensorboard_summaries(iteration, num_episodes_eval,
                                         average_reward_eval)
        return statistics.data_lists
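A possible companion for the eval-only variant above, sketched under the assumption that the module imports TensorFlow 1.x as tf and that the runner holds a tf.summary.FileWriter as self._summary_writer; the method name matches the call above, but the tag names are illustrative.

    def _save_tensorboard_summaries(self, iteration, num_episodes_eval,
                                    average_reward_eval):
        """Illustrative sketch: writes the two evaluation metrics used above."""
        summary = tf.Summary(value=[
            tf.Summary.Value(tag='Eval/NumEpisodes',
                             simple_value=num_episodes_eval),
            tf.Summary.Value(tag='Eval/AverageReturns',
                             simple_value=average_reward_eval),
        ])
        self._summary_writer.add_summary(summary, iteration)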
Example no. 8
    def _run_one_iteration(self, iteration):
        """Runs one iteration of agent/environment interaction."""
        # Note: nothing is appended to `statistics` in this variant, so an
        # empty dict of lists is returned at the end.
        statistics = iteration_statistics.IterationStatistics()
        tf.logging.info("Starting iteration %d", iteration)
        # pylint: disable=protected-access
        if not self._agent._replay_suffix:
            # Reload the replay buffer
            self._agent._replay.memory.reload_buffer(num_buffers=5)
        # pylint: enable=protected-access
        self._run_train_phase()
        self.offline_evaluation(iteration)

        return statistics.data_lists
Example no. 9
    def _run_one_iteration(self, iteration):
        """Runs one iteration of agent/environment interaction."""
        statistics = iteration_statistics.IterationStatistics()
        logging.info('Starting iteration %d', iteration)
        if not self._agent.replay_suffix:
            # Reload the replay buffer at every iteration
            self._agent._replay.reload_data()  # pylint: disable=protected-access
        self._run_train_phase()

        num_episodes_eval, average_reward_eval = self._run_eval_phase(
            statistics)

        self._save_tensorboard_summaries(iteration, num_episodes_eval,
                                         average_reward_eval)
        return statistics.data_lists
Example no. 10
    def _run_one_iteration(self, iteration):
        """Runs one iteration of agent/environment interaction.

    An iteration involves running several episodes until a certain number of
    steps are obtained. The interleaving of train/eval phases implemented here
    is to match the implementation of Mnih et al. (2015).

    Args:
      iteration: int, current iteration number, used as a global_step for saving
        Tensorboard summaries.

    Returns:
      A dict containing summary statistics for this iteration.
    """
        statistics = iteration_statistics.IterationStatistics()
        tf.logging.info('Starting iteration %d', iteration)

        num_episodes_train, average_reward_train = self._run_train_phase(
            statistics)
        num_episodes_eval, average_reward_eval = self._run_eval_phase(
            statistics)
        self._run_evalrandom_phase(statistics)
        print("EPSILON:")
        print(
            self._agent.epsilon_fn(self._agent.epsilon_decay_period,
                                   self._agent.training_steps,
                                   self._agent.min_replay_history,
                                   self._agent.epsilon_train))
        print("QUNATILE VALUES:")
        self._agent._record_observation([0, 2, 1, 1, 2, 0, 0, 0, 0])
        print(
            self._agent._sess.run(
                self._agent._q_values, {
                    self._agent.state_ph: self._agent.state,
                    self._agent.validmoves_ph: [0, 5, 6, 7, 8]
                }))

        #self._save_tensorboard_summaries(iteration, num_episodes_train,
        #                                 average_reward_train, num_episodes_eval,
        #                                 average_reward_eval)
        if not self.testing:
            self.fout_test.write(
                '%d %f %d\n' %
                (iteration, average_reward_eval, num_episodes_eval))
            self.fout_test.flush()
        return statistics.data_lists
Example no. 11
    def _train_one_step(self, epoch, Phi, Psi, left_vec, key, optim,
                        opt_state):  # pylint: disable=invalid-name
        """Training function."""
        statistics = iteration_statistics.IterationStatistics()
        logging.info('Starting epoch %d', epoch)
        start_time = time.time()
        Phi, opt_state, grads = estimates.nabla_phi_analytical(
            Phi,
            Psi,
            key,
            optim,  # pylint: disable=invalid-name
            opt_state,
            self._estimator,
            self._alpha,
            self._use_l2_reg,
            self._reg_coeff,
            self._use_penalty,
            self._j,
            self._num_rows)
        time_delta = time.time() - start_time
        statistics.append({'Time/epoch': time_delta})
        statistics.append({'representation': Phi})
        gm_distances = utils.grassman_distance(Phi, left_vec[:, :FLAGS.d])
        statistics.append({'GM_distances': gm_distances})
        frob_norms = utils.outer_objective_mc(Phi, Psi)
        statistics.append({'Frob_norms': frob_norms})
        phi_norms = jnp.sum(jnp.square(Phi))
        statistics.append({'phi_norms': phi_norms})
        grad_norms = jnp.sum(jnp.square(grads))
        phi_ranks = jnp.linalg.matrix_rank(Phi)
        statistics.append({'phi_ranks': phi_ranks})
        statistics.append({'grad_norms': grad_norms})
        if FLAGS.d == 1:
            dot_products = (Phi.T @ left_vec[:, :FLAGS.d] /
                            (jnp.linalg.norm(Phi) *
                             jnp.linalg.norm(left_vec[:, :FLAGS.d])))[0][0]
            statistics.append({'Dot_products': dot_products})
        else:
            dot_products = 0.

        # if epoch % self._summary_writer_frequency == 0:
        self._save_tensorboard_summaries(epoch, frob_norms, gm_distances,
                                         dot_products, phi_norms, grad_norms,
                                         phi_ranks)
        return statistics.data_lists, Phi, opt_state
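utils.grassman_distance is not shown in the snippet; below is a generic JAX sketch of a Grassmann distance between column spans, consistent with how it is called above (illustrative, not the repository's implementation).

import jax.numpy as jnp

def grassmann_distance(a, b):
    """Illustrative sketch: Grassmann distance between the column spans of a and b."""
    qa, _ = jnp.linalg.qr(a)
    qb, _ = jnp.linalg.qr(b)
    # Principal angles come from the singular values of the overlap matrix.
    cosines = jnp.linalg.svd(qa.T @ qb, compute_uv=False)
    angles = jnp.arccos(jnp.clip(cosines, -1.0, 1.0))
    return jnp.sqrt(jnp.sum(angles ** 2))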
Example no. 12
    def _run_one_iteration(self, iteration):
        """Runs one iteration of agent/environment interaction.
    An iteration involves running several episodes until a certain number of
    steps are obtained. This method differs from the `_run_one_iteration` method
    in the base `Runner` class in that it only runs the train phase.
    Args:
      iteration: int, current iteration number, used as a global_step for saving
        Tensorboard summaries.
    Returns:
      A dict containing summary statistics for this iteration.
    """
        statistics = iteration_statistics.IterationStatistics()
        num_episodes_train, average_reward_train = self._run_train_phase(
            statistics)

        self._save_tensorboard_summaries(iteration, num_episodes_train,
                                         average_reward_train)
        return statistics.data_lists
Example no. 13
    def _run_one_iteration(self, iteration):
        """Runs one iteration of agent/environment interaction."""
        statistics = iteration_statistics.IterationStatistics()
        tf.logging.info('Starting iteration %d', iteration)
        for _ in range(self._training_maxi_steps):
            # pylint: disable=protected-access
            if not self._agent._replay_suffix:
                # Reload the replay buffer
                self._agent._replay.memory.reload_buffer(num_buffers=4)
            # pylint: enable=protected-access
            self._run_train_phase()

        num_episodes_eval, average_reward_eval = self._run_eval_phase(
            statistics)

        self._save_tensorboard_summaries(iteration, num_episodes_eval,
                                         average_reward_eval)

        # Call the optional end-of-iteration hook if the agent defines one.
        # Note: this also silently swallows an AttributeError raised inside
        # the hook itself.
        try:
            self._agent.iteration_end_hook()
        except AttributeError:
            pass

        return statistics.data_lists
Example no. 14
 def _run_one_iteration(self, iteration):
     """Runs one evaluation-only iteration and returns its statistics."""
     statistics = iteration_statistics.IterationStatistics()
     logging.info('Starting iteration %d', iteration)
     _, _ = self._run_eval_phase(statistics)
     return statistics.data_lists
Example no. 15
 def _run_one_iteration(self, iteration):
     """Runs one evaluation-only iteration and returns its statistics."""
     from dopamine.discrete_domains import iteration_statistics
     statistics = iteration_statistics.IterationStatistics()
     tf.logging.info('Starting iteration %d', iteration)
     _, _ = self._run_eval_phase(statistics)
     return statistics.data_lists
Example no. 16
 def testMissingValue(self):
     statistics = iteration_statistics.IterationStatistics()
     with self.assertRaises(KeyError):
         _ = statistics.data_lists['missing_key']
Example no. 17
    def _run_one_iteration(self, iteration, firstiteration):
        """Runs one iteration of agent/environment interaction.
    An iteration involves running several episodes until a certain number of
    steps are obtained. The interleaving of train/eval phases implemented here
    is to match the implementation of Mnih et al. (2015).
    Args:
      iteration: int, current iteration number, used as a global_step for saving
        Tensorboard summaries.
      firstiteration: bool, whether this is the first iteration of the run.
    Returns:
      A dict containing summary statistics for this iteration.
    """

        #if firstiteration:
        #  experiment_data = self._agent2.bundle_and_checkpoint(self.PATH + "test",
        #                                                    iteration)
        #  if experiment_data:
        #    experiment_data['current_iteration'] = iteration
        #    experiment_data['logs'] = self._logger.data
        #    self._checkpointertest.save_checkpoint(iteration, experiment_data)
        #if iteration==25:
        #    experiment_data = self._checkpointertest.load_checkpoint(0)
        #    x=False
        #   x=self._agent2.unbundle(self.PATH + "test", 0, experiment_data)
        #

        #self.to_graph = tf.Graph() # graph where everything above will be copied to

        #self._q_argmax = tf.contrib.copy_graph.copy_op_to_graph(self._agent._q_argmax, self.to_graph,[])
        #self._q_argmax = tf.contrib.copy_graph.copy_op_to_graph(self._agent.state_ph, self.to_graph,[])
        #self._q_argmax = tf.contrib.copy_graph.copy_op_to_graph(self._agent.validmoves_ph, self.to_graph,[])

        if firstiteration:
            self._my_checkpoint_experiment(iteration, 'latest', 1)
            self._my_checkpoint_experiment(iteration, 'latest', 2)
        self._my_checkpoint_experiment(iteration, 'player', 1)

        q_argmax2 = self._agent._sess.run(
            self._agent._q_values, {
                self._agent.state_ph: [[[[0], [0], [0], [0], [0], [0], [0],
                                         [0], [0], [50]]]],
                self._agent.validmoves_ph: [1, 3, 5, 7, 8]
            })

        #print("Q Before loading", q_argmax2)

        self._my_initialize_resume('crpt', 'latest', self.latest1, 11)
        self._my_initialize_resume('crpt', 'latest', self.latest2, 22)

        statistics = iteration_statistics.IterationStatistics()
        tf.logging.info('Starting iteration %d', iteration)
        print("SELF.COUNTER:", self.counter)
        if iteration > 50 and self.counter >= 1:
            self.counter = 0
            self.player1_turn_training = not self.player1_turn_training

        if iteration < 50000:
            num_episodes_train, average_reward_train = self._run_generic_phase(
                statistics, 'train1')

            num_episodes_train, average_reward_train = self._run_generic_phase(
                statistics, 'train2')
            print("TRAIN TRAIN1")
        else:
            if self.player1_turn_training:
                num_episodes_train, average_reward_train = self._run_generic_phase(
                    statistics, 'train1')
                print("TRAIN TRAIN2")
            else:
                num_episodes_train, average_reward_train = self._run_generic_phase(
                    statistics, 'train2')
                print("TRAIN TRAIN3")

        num_episodes_eval, average_reward_eval = self._run_eval_phase(
            statistics)

        print("PLAYER1 TURN TRAINING:", self.player1_turn_training)
        print("EVAL EVAL")

        #numep, evalaveragereward =  self._run_generic_phase(
        #    statistics,'preveval')

        print("LATEST1:", self.latest1)
        print("LATEST2:", self.latest2)

        numep, oldvsnew = self._run_generic_phase(statistics, 'oldvsnew')
        numep, newvsold = self._run_generic_phase(statistics, 'newvsold')
        numep, oldvsold = self._run_generic_phase(statistics, 'oldvsold')
        print("OLD VS NEW:", oldvsnew)
        print("NEW VS OLD:", newvsold)
        print("OLD VS OLD:", oldvsold)
        numep, oldvsboss = self._run_generic_phase(statistics, 'oldvsboss')
        print("OLD VS BOSS", oldvsboss)
        numep, evalrandom = self._run_generic_phase(statistics, 'evalrandom')
        print("EVALRANDOM:", evalrandom)
        print("PLAYER1 TURN TRAINING2:", self.player1_turn_training)
        if self.player1_turn_training:
            self.compare_result_against = newvsold
            self.who_to_change = 1
            self.who_to_change_latest = self.latest1
            if newvsold > 0.05:
                self.counter += 1
            else:
                self.counter = 0
        else:
            self.compare_result_against = oldvsnew
            self.who_to_change = 2
            self.who_to_change_latest = self.latest2
            if oldvsnew < 0.05:
                self.counter += 1
            else:
                self.counter = 0

        if iteration < 50000:
            self._my_checkpoint_experiment(iteration, 'latest', 1)
            self.latest1 = iteration
            self._my_checkpoint_experiment(iteration, 'latest', 2)
            self.latest2 = iteration
            self.counter = 0
        else:
            if self.player1_turn_training:
                if oldvsold + 0.05 < self.compare_result_against:
                    self._my_checkpoint_experiment(iteration, 'latest',
                                                   self.who_to_change)
                    if self.who_to_change == 1:
                        self.latest1 = iteration
                    else:
                        self.latest2 = iteration
                    print("CROOK: Changing player " + str(self.who_to_change) +
                          " version")
                else:
                    self._my_initialize_resume('crpt', 'latest',
                                               self.who_to_change_latest,
                                               self.who_to_change)
                    print("CROOK: Staying with old player " +
                          str(self.who_to_change))
            else:
                if oldvsold - 0.05 > self.compare_result_against:
                    self._my_checkpoint_experiment(iteration, 'latest',
                                                   self.who_to_change)
                    if self.who_to_change == 1:
                        self.latest1 = iteration
                    else:
                        self.latest2 = iteration
                    print("CROOK: Changing player " + str(self.who_to_change) +
                          " version")
                else:
                    self._my_initialize_resume('crpt', 'latest',
                                               self.who_to_change_latest,
                                               self.who_to_change)
                    print("CROOK: Staying with old player " +
                          str(self.who_to_change))

        #variables = tf.trainable_variables()
        #print("Weight matrix: {0}".format(self._agent._sess.run(variables[0])))

        print("EPSILON:")
        print(
            self._agent.epsilon_fn(self._agent.epsilon_decay_period,
                                   self._agent.training_steps,
                                   self._agent.min_replay_history,
                                   self._agent.epsilon_train))

        #self._save_tensorboard_summaries(iteration, num_episodes_train,
        #                                 average_reward_train, num_episodes_eval,
        #                                 average_reward_eval)
        if not self.testing:
            self.fout_test.write(
                '%d %f %d\n' %
                (iteration, average_reward_eval, num_episodes_eval))
            self.fout_test.flush()
        return statistics.data_lists
Example no. 18
 def run_inference_test(self):
     """Runs a single evaluation phase of self.inference_steps steps."""
     statistics = iteration_statistics.IterationStatistics()
     _ = self._run_one_phase(self.inference_steps, statistics, 'eval')