def play_episode(self, env: Environment) -> Episode:
    env.reset()
    episode_steps = []
    total_reward: Reward = 0.0
    while not env.is_done():
        episode_step, reward = self.step(env)
        episode_steps.append(episode_step)
        total_reward += reward
    episode = Episode(steps=episode_steps, reward=total_reward)
    return episode
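
# The Environment, Episode and Reward names used above are defined
# elsewhere in the project. Below is a minimal sketch of the two
# containers, assuming only what play_episode itself uses; the field
# types here are assumptions, not the original definitions.
from typing import Any, List, NamedTuple

Reward = float


class Episode(NamedTuple):
    steps: List[Any]
    reward: Reward
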
import numpy as np

# A3C, Environment and the gradient applier used below are imported
# from the project's other modules.


class Trainer(object):
    """
    Class for training a local network / ONE agent
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate, grad_applier, show_env=False,
                 local_t_max=20, max_global_time_step=10 * 10**7,
                 gamma=0.99, save_interval_step=100 * 1000,
                 env='Breakout-v0', device='/cpu:0'):
        self.thread_index = thread_index
        self.learning_rate = learning_rate
        self.env = env

        # Whether to render the environment or not during training
        # (main.py sets this to True for one of the agents)
        self.show_env = show_env

        # Discount factor for the reward
        self.gamma = gamma

        # Number of "epochs"
        self.max_global_time_step = max_global_time_step

        # Number of steps for the LSTM
        self.local_t_max = local_t_max

        # Number of actions the agent can take
        self.action_size = Environment.get_action_size(env)

        self.local_network = A3C(self.action_size, self.thread_index, device)
        self.global_network = global_network

        # Build computational graph
        self.local_network._create_network()

        # Build computational graph for the losses and gradients
        self.local_network.prepare_a3c_loss()
        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.a3c_loss,
            global_network.get_vars(),
            self.local_network.get_vars())

        # Sync the weights of the local network with those
        # of the main network
        self.sync = self.local_network.sync_from(global_network)

        # Initialize time step, learning rate, etc.
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

    def build_environment(self):
        """
        Create the environment
        """
        self.environment = Environment(self.env, show_env=self.show_env)

    def stop(self):
        """
        Terminate the environment
        """
        self.environment.stop()

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """
        Save the score to TensorBoard
        """
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        # Write to disk
        summary_writer.flush()

    def choose_action(self, pi_values):
        """
        Sample an action from the learned policy distribution

        :param pi_values: Probability distribution over the actions
        """
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def concat_action_reward(self, action, action_size, reward):
        """
        Return the one-hot encoded action concatenated with the reward,
        e.g. action=2, action_size=4, reward=1.0 -> [0., 0., 1., 0., 1.]
        """
        action_reward = np.zeros([action_size + 1], dtype='float32')
        action_reward[action] = 1.0
        action_reward[-1] = float(reward)
        return action_reward

    def _decay_learning_rate(self, global_time_step):
        """
        Decay the learning rate linearly
        """
        time_left = self.max_global_time_step - global_time_step
        learning_rate = self.initial_learning_rate * time_left \
            / self.max_global_time_step

        # Clip the learning rate at 0.0
        if learning_rate < 0.0:
            learning_rate = 0.0

        return learning_rate

    def _process_a3c(self, sess, global_t, summary_writer, summary_op,
                     score_input):
        """
        Process local_t_max steps/frames in the A3C network

        :param sess: TensorFlow session object
        :param global_t: Global time step (number of steps processed
                         by the global/shared network)
        """
        # States of the LSTM
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        # Synchronize with the global network
        sess.run(self.sync)

        # Initial local time step
        self.local_t = 0

        # Whether we hit a terminal state or not
        terminal_end = False

        start_lstm_state = self.local_network.lstm_state_out

        # Loop for local_t_max time steps
        for _ in range(self.local_t_max):
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = self.concat_action_reward(
                last_action, self.action_size, last_reward)

            # Compute the policy and value function
            pi_, value_ = self.local_network.run_pi_value(
                sess, self.environment.last_state, last_action_reward)

            # Pick an action given the newly computed policy
            action = self.choose_action(pi_)

            # Append results to the rollout buffers
            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            # Process the chosen action
            new_state, reward, terminal = self.environment.process(action)

            rewards.append(reward)
            self.episode_reward += reward
            self.local_t += 1

            if terminal:
                # Environment hit a terminal state
                terminal_end = True

                # ----------------
                # PRINT STATISTICS
                # ----------------
                print('Time step: %5d k - Score: %3d'
                      % (global_t / 1000, self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward,
                                   global_t)

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        # ---------
        # BACK-PROP
        # ---------
        # We discount the rewards from t - 1 down to t_start. The
        # bootstrap value R is 0 if we hit a terminal state, otherwise
        # it is the value function of the last state.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_last_value(sess, new_state,
                                                  last_action_reward)

        # Reverse the rollout buffers
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # To compute the gradients we build a minibatch of at most
        # local_t_max transitions
        batch_s = []
        batch_a = []
        batch_adv = []
        batch_R = []

        # For printing
        R_non_discounted = R

        # Discounting: working backwards from the bootstrap value R,
        # each iteration below computes the n-step return
        # R_i = r_i + gamma * R_{i+1} and the advantage
        # A_i = R_i - V(s_i) that scales the policy gradient term of
        # the A3C loss.
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size], dtype='float32')
            a[ai] = 1.0

            batch_s.append(si)
            batch_a.append(a)
            # Convert np.array -> float because the advantage and
            # reward placeholders expect shape [None, ], not [None, 1]
            batch_adv.append(float(adv))
            batch_R.append(float(R))

        # Restore chronological order
        batch_s.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        # Decay the learning rate
        cur_learning_rate = self._decay_learning_rate(global_t)

        # Create the feed_dict for the gradient applier
        feed_dict = {
            self.local_network.input: batch_s,
            self.local_network.last_action_reward: last_action_rewards,
            self.local_network.a: batch_a,
            self.local_network.adv: batch_adv,
            self.local_network.R: batch_R,
            self.local_network.lstm_state: start_lstm_state,
            self.learning_rate: cur_learning_rate
        }

        # Compute gradients and update the weights
        sess.run(self.apply_gradients, feed_dict=feed_dict)

        """
        # ----------------
        # PRINT STATISTICS
        # ----------------
        # Compute losses
        total_loss, policy_loss, value_loss = \
            self.local_network.run_losses(sess, feed_dict)
        total_loss = np.mean(total_loss)
        policy_loss = np.mean(policy_loss)
        value_loss = np.mean(value_loss)

        if global_t % 1000 == 0:
            print('Time Step: %6d k Reward: %3d - Total Loss: %.4f - '
                  'Policy Loss: %.4f - Value Loss: %.4f'
                  % (global_t / 1000, float(R_non_discounted),
                     total_loss, policy_loss, value_loss))

        # Save to the log file
        with open(LOG_FILE, 'a') as f:
            f.write('Reward: %3d - Total Loss: %.4f - Policy Loss: %.4f '
                    '- Value Loss: %.4f \n'
                    % (float(R), total_loss, policy_loss, value_loss))
        """

        # Return the number of steps taken so that the caller can
        # update global_time_step
        return self.local_t
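

# A standalone sketch of the n-step return / advantage computation
# performed by the reversed loop in _process_a3c, written with plain
# Python lists so it can be run in isolation. The function name and
# the toy numbers in the example are illustrative and not part of the
# original code.
def n_step_returns_and_advantages(rewards, values, bootstrap_value,
                                  gamma=0.99):
    """Iterate backwards from the bootstrap value, computing
    R_i = r_i + gamma * R_{i+1} and A_i = R_i - V(s_i), then restore
    chronological order."""
    R = bootstrap_value
    returns, advantages = [], []
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R
        returns.append(float(R))
        advantages.append(float(R - v))
    returns.reverse()
    advantages.reverse()
    return returns, advantages


# Example: rewards [0, 0, 1], predicted values [0.4, 0.5, 0.6] and a
# bootstrap value of 0.5 give returns of roughly [1.465, 1.480, 1.495]
# in chronological order:
# n_step_returns_and_advantages([0.0, 0.0, 1.0], [0.4, 0.5, 0.6], 0.5)
#
# How a worker thread might drive the Trainer (assumed; the actual
# loop lives in main.py, which is not shown here): each thread calls
# _process_a3c repeatedly, adding its return value (self.local_t) to
# the shared counter until max_global_time_step is reached; every call
# first syncs the local network and ends with one gradient update of
# the global network.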