def compute_returns(rewards, terminal, gamma, clip=False, c=1.89):
    """Compute expected return."""
    length = np.shape(rewards)[0]
    returns = np.empty_like(rewards, dtype=np.float32)

    if clip:
        rewards = np.clip(rewards, -1., 1.)
    else:
        # When the reward is 1, t(r=1) = 0.412, which is less than half of
        # the raw reward. This down-scaling slows training on Atari games
        # whose raw rewards lie in the range (-1, 1). To compensate for the
        # down-scaled reward, we add the constant c = sign(r) * 1.89 so that
        # t(r=1 + sign(r) * 1.89) ~ 1.
        rewards = np.sign(rewards) * c + rewards

    # Iterate backwards over the trajectory. Note this assumes the final
    # transition is terminal; otherwise returns[i + 1] would be indexed
    # past the end of the array.
    for i in reversed(range(length)):
        if terminal[i]:
            returns[i] = rewards[i] if clip else transform_h(rewards[i])
        else:
            if clip:
                returns[i] = rewards[i] + gamma * returns[i + 1]
            else:
                # apply transformed expected return
                exp_r_t = gamma * transform_h_inv(returns[i + 1])
                returns[i] = transform_h(rewards[i] + exp_r_t)
    return returns
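# NOTE: transform_h and transform_h_inv are not defined in this excerpt; they
# are imported elsewhere in the repository. Below is a minimal sketch of what
# they are assumed to be, i.e. the value transform of Pohlen et al. (2018)
# that the comment in compute_returns refers to. The epsilon value is an
# assumption, not taken from this codebase.
import numpy as np

EPS = 1e-2  # assumed regularizer; the repo may use a different value


def transform_h(x, eps=EPS):
    """h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x."""
    return np.sign(x) * (np.sqrt(np.abs(x) + 1.) - 1.) + eps * x


def transform_h_inv(x, eps=EPS):
    """Closed-form inverse of transform_h above."""
    return np.sign(x) * (
        ((np.sqrt(1. + 4. * eps * (np.abs(x) + 1. + eps)) - 1.)
         / (2. * eps)) ** 2 - 1.)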
def update_a3c(self, sess, actions, states, rewards, values, global_t):
    cumsum_reward = 0.0
    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_state = []
    batch_action = []
    batch_adv = []
    batch_cumsum_reward = []

    # compute and accumulate gradients
    for (ai, ri, si, vi) in zip(actions, rewards, states, values):
        if self.transformed_bellman:
            ri = np.sign(ri) * self.reward_constant + ri
            cumsum_reward = transform_h(
                ri + self.gamma * transform_h_inv(cumsum_reward))
        else:
            cumsum_reward = ri + self.gamma * cumsum_reward
        advantage = cumsum_reward - vi

        # convert action to one-hot vector
        a = np.zeros([self.action_size])
        a[ai] = 1

        batch_state.append(si)
        batch_action.append(a)
        batch_adv.append(advantage)
        batch_cumsum_reward.append(cumsum_reward)

    cur_learning_rate = self._anneal_learning_rate(
        global_t, self.initial_learning_rate)

    feed_dict = {
        self.local_a3c.s: batch_state,
        self.local_a3c.a: batch_action,
        self.local_a3c.advantage: batch_adv,
        self.local_a3c.cumulative_reward: batch_cumsum_reward,
        self.learning_rate_input: cur_learning_rate,
    }

    sess.run(self.rollout_apply_gradients, feed_dict=feed_dict)

    return batch_adv
def compute_return_for_state(self, rewards, terminal):
    """Compute expected return."""
    length = np.shape(rewards)[0]
    returns = np.empty_like(rewards, dtype=np.float32)

    if self.reward_clipped:
        rewards = np.clip(rewards, -1., 1.)
    else:
        rewards = np.sign(rewards) * self.reward_constant + rewards

    # Iterate backwards; as in compute_returns, the final transition is
    # assumed to be terminal.
    for i in reversed(range(length)):
        if terminal[i]:
            returns[i] = rewards[i] if self.reward_clipped \
                else transform_h(rewards[i])
        else:
            if self.reward_clipped:
                returns[i] = rewards[i] + self.gamma * returns[i + 1]
            else:
                # apply transformed expected return
                exp_r_t = self.gamma * transform_h_inv(returns[i + 1])
                returns[i] = transform_h(rewards[i] + exp_r_t)

    # only the return of the first state in the trajectory is needed here
    return returns[0]
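# Usage sketch (hypothetical, for illustration only): given a toy three-step
# trajectory where only the last step is terminal, the module-level
# compute_returns above yields either plain discounted n-step returns
# (clip=True) or transformed-Bellman returns
# R_t = h(r_t + gamma * h^-1(R_{t+1})) (clip=False), e.g.:
#
#     rewards = np.array([0., 1., 5.], dtype=np.float32)
#     terminal = np.array([False, False, True])
#     returns = compute_returns(rewards, terminal, gamma=0.99, clip=False)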
def train(self, sess, global_t, train_rewards):
    """Train A3C."""
    states = []
    fullstates = []
    actions = []
    rewards = []
    values = []
    rho = []

    terminal_pseudo = False  # loss of life
    terminal_end = False  # real terminal

    # copy weights from shared to local
    sess.run(self.sync)

    start_local_t = self.local_t

    # t_max times loop
    for i in range(self.local_t_max):
        state = cv2.resize(self.game_state.s_t,
                           self.local_net.in_shape[:-1],
                           interpolation=cv2.INTER_AREA)
        fullstate = self.game_state.clone_full_state()

        pi_, value_, logits_ = self.local_net.run_policy_and_value(
            sess, state)
        action = self.pick_action(logits_)

        states.append(state)
        fullstates.append(fullstate)
        actions.append(action)
        values.append(value_)

        if self.thread_idx == self.log_idx \
                and self.local_t % self.log_interval == 0:
            log_msg1 = "lg={}".format(
                np.array_str(logits_, precision=4, suppress_small=True))
            log_msg2 = "pi={}".format(
                np.array_str(pi_, precision=4, suppress_small=True))
            log_msg3 = "V={:.4f}".format(value_)
            logger.debug(log_msg1)
            logger.debug(log_msg2)
            logger.debug(log_msg3)

        # process game
        self.game_state.step(action)

        # receive game result
        reward = self.game_state.reward
        terminal = self.game_state.terminal

        self.episode_reward += reward

        if self.use_sil:
            # save states in episode memory
            self.episode.add_item(self.game_state.s_t, fullstate,
                                  action, reward, terminal)

        if self.reward_type == 'CLIP':
            reward = np.sign(reward)

        rewards.append(reward)

        self.local_t += 1
        self.episode_steps += 1
        global_t += 1

        # s_t1 -> s_t
        self.game_state.update()

        if terminal:
            terminal_pseudo = True

            env = self.game_state.env
            name = 'EpisodicLifeEnv'
            if get_wrapper_by_name(env, name).was_real_done:
                # reduce log freq
                if self.thread_idx == self.log_idx:
                    log_msg = "train: worker={} global_t={} local_t={}".format(
                        self.thread_idx, global_t, self.local_t)
                    score_str = colored(
                        "score={}".format(self.episode_reward), "magenta")
                    steps_str = colored(
                        "steps={}".format(self.episode_steps), "blue")
                    log_msg += " {} {}".format(score_str, steps_str)
                    logger.debug(log_msg)

                train_rewards['train'][global_t] = (self.episode_reward,
                                                    self.episode_steps)
                self.record_summary(score=self.episode_reward,
                                    steps=self.episode_steps,
                                    episodes=None, global_t=global_t,
                                    mode='Train')

                self.episode_reward = 0
                self.episode_steps = 0
                terminal_end = True

            self.game_state.reset(hard_reset=False)
            break

    cumsum_reward = 0.0
    if not terminal:
        state = cv2.resize(self.game_state.s_t,
                           self.local_net.in_shape[:-1],
                           interpolation=cv2.INTER_AREA)
        cumsum_reward = self.local_net.run_value(sess, state)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_state = []
    batch_action = []
    batch_adv = []
    batch_cumsum_reward = []

    # compute and accumulate gradients
    for (ai, ri, si, vi) in zip(actions, rewards, states, values):
        if self.transformed_bellman:
            ri = np.sign(ri) * self.reward_constant + ri
            cumsum_reward = transform_h(
                ri + self.gamma * transform_h_inv(cumsum_reward))
        else:
            cumsum_reward = ri + self.gamma * cumsum_reward
        advantage = cumsum_reward - vi

        # convert action to one-hot vector
        a = np.zeros([self.action_size])
        a[ai] = 1

        batch_state.append(si)
        batch_action.append(a)
        batch_adv.append(advantage)
        batch_cumsum_reward.append(cumsum_reward)

    cur_learning_rate = self._anneal_learning_rate(
        global_t, self.initial_learning_rate)

    feed_dict = {
        self.local_net.s: batch_state,
        self.local_net.a: batch_action,
        self.local_net.advantage: batch_adv,
        self.local_net.cumulative_reward: batch_cumsum_reward,
        self.learning_rate_input: cur_learning_rate,
    }

    sess.run(self.apply_gradients, feed_dict=feed_dict)

    t = self.local_t - self.prev_local_t
    if (self.thread_idx == self.log_idx
            and t >= self.perf_log_interval):
        self.prev_local_t += self.perf_log_interval
        elapsed_time = time.time() - self.start_time
        steps_per_sec = global_t / elapsed_time
        logger.info("worker-{}, log_worker-{}".format(
            self.thread_idx, self.log_idx))
        logger.info("Performance : {} STEPS in {:.0f} sec. {:.0f}"
                    " STEPS/sec. {:.2f}M STEPS/hour.".format(
                        global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t, terminal_end, terminal_pseudo