    def play_episode(self, env: Environment) -> Episode:
        env.reset()
        episode_steps = []
        total_reward: Reward = 0.0

        while not env.is_done():
            episode_step, reward = self.step(env)
            episode_steps.append(episode_step)
            total_reward += reward

        episode = Episode(steps=episode_steps, reward=total_reward)
        return episode
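
A minimal usage sketch for the method above (hypothetical: agent is any object exposing play_episode, and env any concrete Environment implementation from the surrounding project):

# Hypothetical usage -- agent and env are assumed to exist already.
episode = agent.play_episode(env)
print('steps: %d  total reward: %.2f' % (len(episode.steps), episode.reward))
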
Example #2
import numpy as np

# Environment and A3C are assumed to be defined elsewhere in the
# surrounding project; only the Trainer class is shown in this excerpt.
class Trainer(object):
    """ Class for Training a Local Network / ONE agent """
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate,
                 grad_applier,
                 show_env=False,
                 local_t_max=20,
                 max_global_time_step=10 * 10**7,
                 gamma=0.99,
                 save_interval_step=100 * 1000,
                 env='Breakout-v0',
                 device='/cpu:0'):

        self.thread_index = thread_index
        # TensorFlow placeholder fed with the decayed learning
        # rate at each update (see _process_a3c)
        self.learning_rate = learning_rate
        self.env = env

        # Whether to render the environment
        # or not during training (default is
        # True for one of the agents) - change
        # this in main.py
        self.show_env = show_env

        # Discount factor for the reward
        self.gamma = gamma

        # Number of "epochs"
        self.max_global_time_step = max_global_time_step

        # Maximum number of local steps per rollout (LSTM unroll length)
        self.local_t_max = local_t_max

        # Number of actions the agent can take
        self.action_size = Environment.get_action_size(env)

        self.local_network = A3C(self.action_size, self.thread_index, device)

        self.global_network = global_network

        # Build computational graph
        self.local_network._create_network()

        # Build computational graph for the losses
        # and gradients
        self.local_network.prepare_a3c_loss()
        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.a3c_loss, global_network.get_vars(),
            self.local_network.get_vars())

        # Sync the weights of the local network with those
        # of the main network
        self.sync = self.local_network.sync_from(global_network)

        # Initialize time step, learning rate, etc
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

    def build_environment(self):
        """ Create the environment """
        self.environment = Environment(self.env, show_env=self.show_env)

    def stop(self):
        """ Terminate the environment """
        self.environment.stop()

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """ Save Score to Tensorboard """
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)

        # Write to disk
        summary_writer.flush()

    def choose_action(self, pi_values):
        """
        Sample from the learned policy
        distribution

        :param pi_values:
            Probability distribution over
            the available actions
        """
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def concat_action_reward(self, action, action_size, reward):
        """
        Return one hot vectored action and reward.
        """
        action_reward = np.zeros([action_size + 1], dtype='float32')
        action_reward[action] = 1.0
        action_reward[-1] = float(reward)
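        # e.g. (illustrative values) action=1, action_size=3, reward=0.5
        # gives action_reward = [0., 1., 0., 0.5]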
        return action_reward

    def _decay_learning_rate(self, global_time_step):
        """ Decay the learning rate linearly """
        time_left = self.max_global_time_step - global_time_step
        learning_rate = self.initial_learning_rate * time_left \
                        / self.max_global_time_step
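        # e.g. (illustrative) halfway through training time_left equals
        # max_global_time_step / 2, so the rate is initial_learning_rate / 2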

        # Clip learning rate at 0.0
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def _process_a3c(self, sess, global_t, summary_writer, summary_op,
                     score_input):
        """
        Process max_local_t steps/frames in the
        A3C network

        :param sess:
            TensorFlow session object

        :param global_t:
            Global time step (number of steps
            processed by the global/shared network)
        """
        # Rollout buffers: states, last action-rewards, actions, rewards, values
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        # Synchronize with global network
        sess.run(self.sync)

        # Initial local time step
        self.local_t = 0

        # Whether we hit a terminal state or not
        terminal_end = False
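        # Remember the LSTM state at the start of the rollout; the
        # gradient pass below replays the rollout from this state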
        start_lstm_state = self.local_network.lstm_state_out

        # Roll out at most local_t_max time steps
        for _ in range(self.local_t_max):
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = self.concat_action_reward(
                last_action, self.action_size, last_reward)

            # Compute policy and value function
            pi_, value_ = self.local_network.run_pi_value(
                sess, self.environment.last_state, last_action_reward)

            # Sample an action from the newly computed policy
            action = self.choose_action(pi_)

            # Append results to placeholders...
            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            # Execute the action in the environment
            new_state, reward, terminal = self.environment.process(action)

            rewards.append(reward)
            self.episode_reward += reward

            self.local_t += 1

            if terminal:
                # Environment hit a terminal state
                terminal_end = True

                # ----------------
                # PRINT STATISTICS
                # ----------------

                print('Time step: %5d k - Score: %3d' %
                      (global_t / 1000, self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                # Reset the episode statistics, the environment
                # and the LSTM state before the next episode
                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        # ---------
        # BACK-PROP
        # ---------

        # Bootstrap value R: 0 if the rollout ended in a terminal
        # state, otherwise the value estimate of the last state.
        # Rewards are then discounted backwards from t - 1 to t_start.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_last_value(sess, new_state,
                                                  last_action_reward)

        # Reverse placeholders
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # To compute the gradients we build a minibatch of
        # at most local_t_max transitions
        batch_s = []
        batch_a = []
        batch_adv = []
        batch_R = []

        # For printing
        R_non_discounted = R

        # Compute n-step returns and advantages by discounting backwards
        # (a standalone sketch of this loop follows the class)
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.array([0] * self.action_size)
            a[ai] = 1.0

            batch_s.append(si)
            batch_a.append(a)

            # Convert np.array -> float because
            # the advantage and reward placeholders
            # expect shape [None, ] not [None, 1]
            batch_adv.append(float(adv))
            batch_R.append(float(R))

        batch_s.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        # Decay learning rate
        cur_learning_rate = self._decay_learning_rate(global_t)

        # Create feed_dict for gradient_applier
        feed_dict = {
            self.local_network.input: batch_s,
            self.local_network.last_action_reward: last_action_rewards,
            self.local_network.a: batch_a,
            self.local_network.adv: batch_adv,
            self.local_network.R: batch_R,
            self.local_network.lstm_state: start_lstm_state,
            self.learning_rate: cur_learning_rate
        }

        # Compute gradients and apply them to the global network weights
        sess.run(self.apply_gradients, feed_dict=feed_dict)
        """
        # ----------------
        # PRINT STATISTICS
        # ----------------

        # Compute losses
        total_loss, policy_loss, value_loss = self.local_network.run_losses(sess,
                                                                            feed_dict)

        total_loss = np.mean(total_loss)
        policy_loss = np.mean(policy_loss)
        value_loss = np.mean(value_loss)

        if global_t % 1000 == 0:
            print('Time Step: %6d k Reward: %3d - Total Loss: %.4f - '
                  'Policy Loss: %.4f - Value Loss: %.4f' %
                  (global_t / 1000, float(R_non_discounted), total_loss,
                   policy_loss, value_loss))

            # Save to log file
            with open(LOG_FILE, 'a') as f:
                f.write('Reward: %3d - Total Loss: %.4f - Policy Loss: %.4f '
                  '- Value Loss: %.4f \n' %
                  (float(R), total_loss, policy_loss, value_loss))
        """

        # Return the number of local steps taken so the
        # caller can advance the global time step
        return self.local_t
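
The reversed loop in _process_a3c implements the standard n-step return and advantage computation. A small self-contained sketch of the same arithmetic (illustrative numbers only, independent of the Trainer class above):

import numpy as np

# Illustrative 3-step rollout (hypothetical numbers)
rewards = [1.0, 0.0, 2.0]      # r_t, r_{t+1}, r_{t+2}
values = [0.5, 0.4, 0.9]       # V(s_t), V(s_{t+1}), V(s_{t+2})
R = 1.2                        # bootstrap value of the last state (0.0 if terminal)
gamma = 0.99

returns, advantages = [], []
# Walk the rollout backwards, exactly like the reversed lists in _process_a3c
for r, v in zip(reversed(rewards), reversed(values)):
    R = r + gamma * R          # discounted n-step return
    returns.append(R)
    advantages.append(R - v)   # advantage estimate used for the policy loss

# Restore chronological order, as done with batch_R / batch_adv
returns.reverse()
advantages.reverse()
print(np.round(returns, 3), np.round(advantages, 3))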