def reset(self, hard_reset=False):
    if self.episode_life and hard_reset:
        get_wrapper_by_name(self.env, 'EpisodicLifeEnv').was_real_done = True

    x_t = self.env.reset()
    self.prev_x_t = x_t
    self.x_t = x_t
    self.s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    self.full_state = self.env.unwrapped.clone_full_state()
    self.lives = self.env.unwrapped.ale.lives()
    self.reward = 0
    self.terminal = False
    self.loss_life = False
    self.gain_life = False
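# `get_wrapper_by_name` is used throughout this section but not defined in it.
# Below is a minimal sketch of what such a helper could look like, assuming the
# standard gym wrapper chain where each wrapper exposes an inner `.env`. The
# name `get_wrapper_by_name_sketch` is hypothetical; the project's own helper
# may differ in details (e.g. error handling).
def get_wrapper_by_name_sketch(env, classname):
    """Walk the gym wrapper chain and return the wrapper whose class name
    matches `classname` (e.g. 'EpisodicLifeEnv')."""
    current = env
    while True:
        if current.__class__.__name__ == classname:
            return current
        elif hasattr(current, 'env'):
            current = current.env
        else:
            raise ValueError("Couldn't find wrapper named {}".format(classname))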
def test_game(self, sess):
    self.game_state.reset(hard_reset=True)

    max_steps = 25000
    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0

    while max_steps > 0:
        model_pi = self.net.run_policy(sess, self.game_state.s_t)
        action, confidence = self.choose_action_with_high_confidence(
            model_pi, exclude_noop=False)

        # take action
        self.game_state.step(action)
        terminal = self.game_state.terminal

        episode_reward += self.game_state.reward
        episode_steps += 1
        max_steps -= 1

        # s_t = s_t1
        self.game_state.update()

        if terminal:
            if get_wrapper_by_name(self.game_state.env,
                                   'EpisodicLifeEnv').was_real_done:
                n_episodes += 1
                score_str = colored("score={}".format(episode_reward),
                                    "magenta")
                steps_str = colored("steps={}".format(episode_steps), "blue")
                log_data = (n_episodes, score_str, steps_str, total_steps)
                # logger.debug("test: trial={} {} {} total_steps={}"
                #              .format(*log_data))
                total_reward += episode_reward
                total_steps += episode_steps
                episode_reward = 0
                episode_steps = 0

            self.game_state.reset(hard_reset=False)

    if n_episodes == 0:
        total_reward = episode_reward
        total_steps = episode_steps
    else:
        # (timestep, total sum of rewards, total # of steps before terminating)
        total_reward = total_reward / n_episodes
        total_steps = total_steps // n_episodes

    log_data = (total_reward, total_steps, n_episodes)
    logger.info("test: final score={} final steps={} # trials={}"
                .format(*log_data))
    return log_data
def test_game(self, sess):
    """Evaluate game with current network model.

    Keyword argument:
    sess -- tf session
    """
    self.game_state.reset(hard_reset=True)

    max_steps = 25000
    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0

    while max_steps > 0:
        state = cv2.resize(self.game_state.s_t,
                           self.net.in_shape[:-1],
                           interpolation=cv2.INTER_AREA)
        model_pi = self.net.run_policy(sess, state)
        action, confidence = self.choose_action_with_high_confidence(
            model_pi, exclude_noop=False)

        # take action
        self.game_state.step(action)
        terminal = self.game_state.terminal

        episode_reward += self.game_state.reward
        episode_steps += 1
        max_steps -= 1

        # s_t = s_t1
        self.game_state.update()

        if terminal:
            was_real_done = get_wrapper_by_name(
                self.game_state.env, 'EpisodicLifeEnv').was_real_done

            if was_real_done:
                n_episodes += 1
                score_str = colored("score={}".format(episode_reward),
                                    "magenta")
                steps_str = colored("steps={}".format(episode_steps), "blue")
                log_data = (n_episodes, score_str, steps_str, total_steps)
                logger.debug("test: trial={} {} {} total_steps={}"
                             .format(*log_data))
                total_reward += episode_reward
                total_steps += episode_steps
                episode_reward = 0
                episode_steps = 0

            self.game_state.reset(hard_reset=False)

    if n_episodes == 0:
        total_reward = episode_reward
        total_steps = episode_steps
    else:
        total_reward = total_reward / n_episodes
        total_steps = total_steps // n_episodes

    log_data = (total_reward, total_steps, n_episodes)
    logger.info("test: final score={} final steps={} # trials={}"
                .format(*log_data))
    return log_data
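# `choose_action_with_high_confidence` is called above (and in several
# functions below) but is not shown in this section. A minimal sketch of the
# assumed behavior, under the assumption that `model_pi` is a flat probability
# vector over actions: pick the most probable action and report its probability
# as the confidence, optionally masking out NOOP at index 0. Elsewhere in this
# section `model_pi[action][0][0]` is indexed, so the real helper likely
# handles a different output shape as well; this sketch is illustrative only.
import numpy as np

def choose_action_with_high_confidence_sketch(model_pi, exclude_noop=True):
    start_idx = 1 if exclude_noop else 0
    action = int(np.argmax(model_pi[start_idx:])) + start_idx
    confidence = model_pi[action]
    return action, confidence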
def test_keys(env_id):
    import cv2
    from skimage.measure import compare_ssim
    from skimage import io, filters
    from collections import deque

    test_game = GameState(env_id=env_id, display=True, human_demo=True)

    terminal = False
    skip = 0
    state = test_game.x_t
    sys_state = None
    sys_states = deque(maxlen=100)
    last_num_steps = 0
    last_num_ctr = 0
    max_repeat = 5

    while True:
        sys_state = test_game.clone_full_state()
        sys_states.append((sys_state, test_game.get_episode_frame_number()))

        a = test_game.env.human_agent_action
        test_game.step(a)

        # new_state = test_game.x_t
        # (score, diff) = compare_ssim(state, new_state, full=True)
        # logger.info("SSIM: {}".format(score))
        # state = new_state
        # edges = filters.sobel(state)
        # cv2.imshow("edges", test_game.x_t)
        # cv2.waitKey(1)

        if test_game.gain_life:
            logger.info("Gain Life")

        if test_game.loss_life:
            logger.warn("Lost life!")
            logger.info("frame number={}".format(
                test_game.get_episode_frame_number()))
            restore = True
            last_num_ctr += 1
            if last_num_steps == 0:
                last_num_steps = len(sys_states)
                logger.info('last_num_steps={}'.format(last_num_steps))
            elif last_num_steps > len(sys_states):
                logger.info('last_num_ctr={}'.format(last_num_ctr))
                if last_num_ctr == max_repeat:
                    restore = False

            if restore:
                full_state, frame_num = sys_states.popleft()
                logger.info("\trestore frame number={}".format(frame_num))
                test_game.restore_full_state(full_state)
                steps = 0
                sys_states.clear()

        if test_game.reward > 0:
            last_num_steps = 0
            last_num_ctr = 0
            sys_states.clear()
        elif test_game.reward < 0:
            logger.info("reward={}".format(test_game.reward))
            restore = True
            last_num_ctr += 1
            if last_num_steps == 0:
                last_num_steps = len(sys_states)
                logger.info('last_num_steps={}'.format(last_num_steps))
            elif last_num_steps > len(sys_states):
                logger.info('last_num_ctr={}'.format(last_num_ctr))
                if last_num_ctr == max_repeat:
                    restore = False

            if restore:
                full_state, frame_num = sys_states.popleft()
                logger.info("\trestore frame number={}".format(frame_num))
                test_game.restore_full_state(full_state)
                steps = 0
                sys_states.clear()

        if get_wrapper_by_name(test_game.env, 'EpisodicLifeEnv').was_real_done:
            break
        elif test_game.terminal:
            test_game.reset(hard_reset=False)

        sleep(0.0167)

    # cv2.destroyAllWindows()
    test_game.close()
    del test_game.env
    del test_game
def testing(self, sess, max_steps, global_t, folder, worker=None):
    """Evaluate A3C."""
    assert worker is not None
    assert not worker.is_refresh_thread
    assert not worker.is_sil_thread

    logger.info("Evaluate policy at global_t={}...".format(global_t))

    # copy weights from shared to local
    sess.run(worker.sync)

    episode_buffer = []
    worker.game_state.reset(hard_reset=True)
    episode_buffer.append(worker.game_state.get_screen_rgb())

    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0

    while max_steps > 0:
        state = cv2.resize(worker.game_state.s_t,
                           worker.local_net.in_shape[:-1],
                           interpolation=cv2.INTER_AREA)
        pi_, value_, logits_ = \
            worker.local_net.run_policy_and_value(sess, state)

        if False:
            action = np.random.choice(range(worker.action_size), p=pi_)
        else:
            action = worker.pick_action(logits_)

        # take action
        worker.game_state.step(action)
        terminal = worker.game_state.terminal

        if n_episodes == 0 and global_t % 5000000 == 0:
            episode_buffer.append(worker.game_state.get_screen_rgb())

        episode_reward += worker.game_state.reward
        episode_steps += 1
        max_steps -= 1

        # s_t = s_t1
        worker.game_state.update()

        if terminal:
            env = worker.game_state.env
            name = 'EpisodicLifeEnv'
            if get_wrapper_by_name(env, name).was_real_done:
                # make a video every 5M training steps,
                # using the first episode tested
                if n_episodes == 0 and global_t % 5000000 == 0:
                    time_per_step = 0.0167
                    images = np.array(episode_buffer)
                    file = 'frames/image{ep:010d}'.format(ep=global_t)
                    duration = len(images) * time_per_step
                    make_movie(images, str(folder / file),
                               duration=duration,
                               true_image=True, salience=False)
                    episode_buffer = []

                n_episodes += 1
                score_str = colored("score={}".format(episode_reward),
                                    "yellow")
                steps_str = colored("steps={}".format(episode_steps), "cyan")
                log_data = (global_t, worker.thread_idx, self.thread_idx,
                            n_episodes, score_str, steps_str, total_steps)
                logger.debug("test: global_t={} test_worker={} cur_worker={}"
                             " trial={} {} {}"
                             " total_steps={}".format(*log_data))
                total_reward += episode_reward
                total_steps += episode_steps
                episode_reward = 0
                episode_steps = 0

            worker.game_state.reset(hard_reset=False)

    if n_episodes == 0:
        total_reward = episode_reward
        total_steps = episode_steps
    else:
        total_reward = total_reward / n_episodes
        total_steps = total_steps // n_episodes

    log_data = (global_t, worker.thread_idx, self.thread_idx,
                total_reward, total_steps, n_episodes)
    logger.info("test: global_t={} test_worker={} cur_worker={}"
                " final score={} final steps={}"
                " # trials={}".format(*log_data))

    worker.record_summary(score=total_reward, steps=total_steps,
                          episodes=n_episodes, global_t=global_t,
                          mode='A3C_Test')

    # reset variables used in training
    worker.episode_reward = 0
    worker.episode_steps = 0
    worker.game_state.reset(hard_reset=True)
    worker.last_rho = 0.

    if worker.use_sil:
        # ensure no states left from a non-terminating episode
        worker.episode.reset()

    return (total_reward, total_steps, n_episodes)
def test_loaded_classifier(self, global_t, max_eps, sess, worker=None,
                           model=None):
    """Evaluate game with current classifier model."""
    assert model is not None
    assert sess is not None
    assert worker is not None

    logger.info(
        "Testing loaded classifier at global_t={}...".format(global_t))

    worker.game_state.reset(hard_reset=True)

    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0
    reward_list = []

    # testing loaded classifier for max_eps episodes
    while n_episodes < max_eps:
        state = cv2.resize(worker.game_state.s_t,
                           model.in_shape[:-1],
                           interpolation=cv2.INTER_AREA)
        model_pi = model.run_policy(sess, state)
        action, _ = self.choose_action_with_high_confidence(
            model_pi, exclude_noop=False)

        # take action
        worker.game_state.step(action)
        terminal = worker.game_state.terminal

        episode_reward += worker.game_state.reward
        episode_steps += 1

        # s_t = s_t1
        worker.game_state.update()

        if terminal:
            was_real_done = get_wrapper_by_name(
                worker.game_state.env, 'EpisodicLifeEnv').was_real_done

            if was_real_done:
                n_episodes += 1
                score_str = colored("score={}".format(episode_reward),
                                    "magenta")
                steps_str = colored("steps={}".format(episode_steps), "blue")
                log_data = (n_episodes, score_str, steps_str,
                            worker.thread_idx, self.thread_idx, total_steps)
                logger.debug(
                    "(fixed) classifier test: trial={} {} {} "
                    "test_worker={} cur_worker={} total_steps={}".format(
                        *log_data))
                total_reward += episode_reward
                reward_list.append(episode_reward)
                total_steps += episode_steps
                episode_reward = 0
                episode_steps = 0

            worker.game_state.reset(hard_reset=False)

    if n_episodes == 0:
        total_reward = episode_reward
        total_steps = episode_steps
    else:
        total_reward = total_reward / n_episodes
        total_steps = total_steps // n_episodes

    log_data = (global_t, worker.thread_idx, self.thread_idx,
                total_reward, total_steps, n_episodes)
    logger.info(
        "classifier test: global_t={} test_worker={} cur_worker={} "
        "final score={} final steps={} # trials={}".format(*log_data))

    self.record_summary(score=total_reward, steps=total_steps,
                        episodes=n_episodes, global_t=global_t,
                        mode='Classifier_Test')

    return (total_reward, total_steps, n_episodes, reward_list)
def testing(self, sess, max_steps, global_t, folder, demo_memory_cam=None):
    logger.info("Evaluate policy at global_t={}...".format(global_t))

    # copy weights from shared to local
    sess.run(self.sync)

    if demo_memory_cam is not None and global_t % 5000000 == 0:
        self.generate_cam_video(sess, 0.03, global_t, folder, demo_memory_cam)

    episode_buffer = []
    self.game_state.reset(hard_reset=True)
    episode_buffer.append(self.game_state.get_screen_rgb())

    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0

    while max_steps > 0:
        # pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
        pi_, value_, logits_ = self.local_network.run_policy_and_value(
            sess, self.game_state.s_t)
        if False:
            action = np.random.choice(range(self.action_size), p=pi_)
        else:
            action = self.choose_action(logits_)

        if self.use_pretrained_model_as_advice:
            psi = self.psi if self.psi > 0.001 else 0.0
            if psi > np.random.rand():
                model_pi = self.pretrained_model.run_policy(
                    self.pretrained_model_sess, self.game_state.s_t)
                model_action, confidence = \
                    self.choose_action_with_high_confidence(
                        model_pi, exclude_noop=False)
                if (model_action > self.shaping_actions
                        and confidence >= self.advice_confidence):
                    action = model_action

        # take action
        self.game_state.step(action)
        terminal = self.game_state.terminal

        if n_episodes == 0 and global_t % 5000000 == 0:
            episode_buffer.append(self.game_state.get_screen_rgb())

        episode_reward += self.game_state.reward
        episode_steps += 1
        max_steps -= 1

        # s_t = s_t1
        self.game_state.update()

        if terminal:
            if get_wrapper_by_name(self.game_state.env,
                                   'EpisodicLifeEnv').was_real_done:
                if n_episodes == 0 and global_t % 5000000 == 0:
                    time_per_step = 0.0167
                    images = np.array(episode_buffer)
                    make_movie(
                        images,
                        folder + '/frames/image{ep:010d}'.format(ep=global_t),
                        duration=len(images) * time_per_step,
                        true_image=True, salience=False)
                    episode_buffer = []

                n_episodes += 1
                score_str = colored("score={}".format(episode_reward),
                                    "magenta")
                steps_str = colored("steps={}".format(episode_steps), "blue")
                log_data = (global_t, self.thread_index, n_episodes,
                            score_str, steps_str, total_steps)
                logger.debug(
                    "test: global_t={} worker={} trial={} {} {} total_steps={}"
                    .format(*log_data))
                total_reward += episode_reward
                total_steps += episode_steps
                episode_reward = 0
                episode_steps = 0

            self.game_state.reset(hard_reset=False)
            if self.use_lstm:
                self.local_network.reset_state()

    if n_episodes == 0:
        total_reward = episode_reward
        total_steps = episode_steps
    else:
        # (timestep, total sum of rewards, total # of steps before terminating)
        total_reward = total_reward / n_episodes
        total_steps = total_steps // n_episodes

    log_data = (global_t, self.thread_index, total_reward, total_steps,
                n_episodes)
    logger.info(
        "test: global_t={} worker={} final score={} final steps={} # trials={}"
        .format(*log_data))

    self.record_summary(score=total_reward, steps=total_steps,
                        episodes=n_episodes, global_t=global_t, mode='Test')

    # reset variables used in training
    self.episode_reward = 0
    self.episode_steps = 0
    self.game_state.reset(hard_reset=True)
    self.last_rho = 0.

    if self.is_demo_thread:
        self.replay_mem_reset()

    if self.use_lstm:
        self.local_network.reset_state()

    return total_reward, total_steps, n_episodes
def process(self, sess, global_t, train_rewards):
    states = []
    actions = []
    rewards = []
    values = []
    rho = []

    terminal_end = False

    # copy weights from shared to local
    sess.run(self.sync)

    start_local_t = self.local_t

    if self.use_lstm:
        start_lstm_state = self.local_network.lstm_state_out

    # t_max times loop
    for i in range(self.local_t_max):
        pi_, value_, logits_ = self.local_network.run_policy_and_value(
            sess, self.game_state.s_t)
        action = self.choose_action(logits_)

        model_pi = None
        confidence = 0.
        if self.use_pretrained_model_as_advice:
            self.psi = 0.9999 * (0.9999 ** global_t) \
                if self.psi > 0.001 else 0.0  # 0.99995 works
            if self.psi > np.random.rand():
                model_pi = self.pretrained_model.run_policy(
                    self.pretrained_model_sess, self.game_state.s_t)
                model_action, confidence = \
                    self.choose_action_with_high_confidence(
                        model_pi, exclude_noop=False)
                if (model_action > self.shaping_actions
                        and confidence >= self.advice_confidence):
                    action = model_action
                    self.advice_ctr += 1

        if self.use_pretrained_model_as_reward_shaping:
            # if action > 0:
            if model_pi is None:
                model_pi = self.pretrained_model.run_policy(
                    self.pretrained_model_sess, self.game_state.s_t)
            confidence = model_pi[action][0][0]
            if (action > self.shaping_actions
                    and confidence >= self.advice_confidence):
                # rho.append(round(confidence, 5))
                rho.append(self.shaping_reward)
                self.shaping_ctr += 1
            else:
                rho.append(0.)
            # self.shaping_ctr += 1

        states.append(self.game_state.s_t)
        actions.append(action)
        values.append(value_)

        if self.thread_index == 0 and self.local_t % self.log_interval == 0:
            log_msg1 = "lg={}".format(
                np.array_str(logits_, precision=4, suppress_small=True))
            log_msg2 = "pi={}".format(
                np.array_str(pi_, precision=4, suppress_small=True))
            log_msg3 = "V={:.4f}".format(value_)
            if self.use_pretrained_model_as_advice:
                log_msg3 += " psi={:.4f}".format(self.psi)
            logger.debug(log_msg1)
            logger.debug(log_msg2)
            logger.debug(log_msg3)

        # process game
        self.game_state.step(action)

        # receive game result
        reward = self.game_state.reward
        terminal = self.game_state.terminal

        if self.use_pretrained_model_as_reward_shaping:
            if reward < 0 or reward > 0:
                rho[i] = 0.
                j = i - 1
                while j > i - 5:
                    if rewards[j] != 0:
                        break
                    rho[j] = 0.
                    j -= 1
            # if self.game_state.loss_life:
            # if self.game_state.gain_life or reward > 0:
            #     rho[i] = 0.
            #     j = i - 1
            #     k = 1
            #     while j >= 0:
            #         if rewards[j] != 0:
            #             rho[j] = self.shaping_reward * (self.gamma ** -1)
            #             break
            #         rho[j] = self.shaping_reward / k
            #         j -= 1
            #         k += 1

        self.episode_reward += reward

        if self.reward_type == 'LOG':
            reward = np.sign(reward) * np.log(1 + np.abs(reward))
        elif self.reward_type == 'CLIP':
            # clip reward
            reward = np.sign(reward)

        rewards.append(reward)

        self.local_t += 1
        self.episode_steps += 1
        global_t += 1

        # s_t1 -> s_t
        self.game_state.update()

        if terminal:
            if get_wrapper_by_name(self.game_state.env,
                                   'EpisodicLifeEnv').was_real_done:
                log_msg = "train: worker={} global_t={}".format(
                    self.thread_index, global_t)
                if self.use_pretrained_model_as_advice:
                    log_msg += " advice_ctr={}".format(self.advice_ctr)
                if self.use_pretrained_model_as_reward_shaping:
                    log_msg += " shaping_ctr={}".format(self.shaping_ctr)
                score_str = colored("score={}".format(self.episode_reward),
                                    "magenta")
                steps_str = colored("steps={}".format(self.episode_steps),
                                    "blue")
                log_msg += " {} {}".format(score_str, steps_str)
                logger.debug(log_msg)

                train_rewards['train'][global_t] = (self.episode_reward,
                                                    self.episode_steps)
                self.record_summary(score=self.episode_reward,
                                    steps=self.episode_steps,
                                    episodes=None, global_t=global_t,
                                    mode='Train')
                self.episode_reward = 0
                self.episode_steps = 0

            terminal_end = True
            self.last_rho = 0.
            if self.use_lstm:
                self.local_network.reset_state()
            self.game_state.reset(hard_reset=False)
            break

    cumulative_reward = 0.0
    if not terminal:
        cumulative_reward = self.local_network.run_value(
            sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_state = []
    batch_action = []
    batch_adv = []
    batch_cumulative_reward = []

    if self.use_pretrained_model_as_reward_shaping:
        rho.reverse()
        rho.append(self.last_rho)
        self.last_rho = rho[0]
        i = 0

        # compute and accumulate gradients
        for (ai, ri, si, vi) in zip(actions, rewards, states, values):
            # Wiewiora et al. (2003) Principled Methods for Advising RL Agents
            # Look-Back Advice
            # F = rho[i] - (self.shaping_gamma**-1) * rho[i+1]
            # F = rho[i] - self.shaping_gamma * rho[i+1]
            f = (self.shaping_gamma ** -1) * rho[i] - rho[i + 1]
            if (i == 0 and terminal) or (f != 0 and (ri > 0 or ri < 0)):
                # logger.warn("averted additional F in absorbing state")
                f = 0.

            # if (F < 0. and ri > 0) or (F > 0. and ri < 0):
            #     logger.warn("Negative reward shaping F={} ri={} rho[s]={}"
            #                 " rhos[s-1]={}".format(F, ri, rho[i], rho[i+1]))
            #     F = 0.

            cumulative_reward = (ri + f * self.shaping_factor) \
                + self.gamma * cumulative_reward
            advantage = cumulative_reward - vi

            a = np.zeros([self.action_size])
            a[ai] = 1

            batch_state.append(si)
            batch_action.append(a)
            batch_adv.append(advantage)
            batch_cumulative_reward.append(cumulative_reward)
            i += 1
    else:
        def h(z, eps=10**-2):
            return (np.sign(z) * (np.sqrt(np.abs(z) + 1.) - 1.)) + (eps * z)

        def h_inv(z, eps=10**-2):
            return np.sign(z) * (np.square(
                (np.sqrt(1 + 4 * eps * (np.abs(z) + 1 + eps)) - 1)
                / (2 * eps)) - 1)

        def h_log(z, eps=.6):
            return np.sign(z) * np.log(1. + np.abs(z)) * eps

        def h_inv_log(z, eps=.6):
            return np.sign(z) * (np.exp(np.abs(z) / eps) - 1)

        # compute and accumulate gradients
        for (ai, ri, si, vi) in zip(actions, rewards, states, values):
            if self.transformed_bellman:
                cumulative_reward = h(
                    ri + self.gamma * h_inv(cumulative_reward))
            else:
                cumulative_reward = ri + self.gamma * cumulative_reward
            advantage = cumulative_reward - vi

            # convert action to one-hot vector
            a = np.zeros([self.action_size])
            a[ai] = 1

            batch_state.append(si)
            batch_action.append(a)
            batch_adv.append(advantage)
            batch_cumulative_reward.append(cumulative_reward)

    cur_learning_rate = self._anneal_learning_rate(global_t)

    if self.use_lstm:
        batch_state.reverse()
        batch_action.reverse()
        batch_adv.reverse()
        batch_cumulative_reward.reverse()

        sess.run(self.apply_gradients,
                 feed_dict={
                     self.local_network.s: batch_state,
                     self.local_network.a: batch_action,
                     self.local_network.advantage: batch_adv,
                     self.local_network.cumulative_reward:
                         batch_cumulative_reward,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_action)],
                     self.learning_rate_input: cur_learning_rate,
                 })
    else:
        sess.run(self.apply_gradients,
                 feed_dict={
                     self.local_network.s: batch_state,
                     self.local_network.a: batch_action,
                     self.local_network.advantage: batch_adv,
                     self.local_network.cumulative_reward:
                         batch_cumulative_reward,
                     self.learning_rate_input: cur_learning_rate,
                 })

    if (self.thread_index == 0
            and self.local_t - self.prev_local_t >= self.performance_log_interval):
        self.prev_local_t += self.performance_log_interval
        elapsed_time = time.time() - self.start_time
        steps_per_sec = global_t / elapsed_time
        logger.info(
            "Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec."
            " {:.2f}M STEPS/hour".format(
                global_t, elapsed_time, steps_per_sec,
                steps_per_sec * 3600 / 1000000.))

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t, terminal_end
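# The nested helpers h and h_inv above implement the transformed Bellman
# operator h(z) = sign(z) * (sqrt(|z| + 1) - 1) + eps * z and its closed-form
# inverse. A small self-contained check (an assumed usage example, not part of
# the original code) that h_inv really inverts h for the default eps; the
# local redefinitions mirror the nested functions above:
import numpy as np

def _h(z, eps=10**-2):
    return (np.sign(z) * (np.sqrt(np.abs(z) + 1.) - 1.)) + (eps * z)

def _h_inv(z, eps=10**-2):
    return np.sign(z) * (np.square(
        (np.sqrt(1 + 4 * eps * (np.abs(z) + 1 + eps)) - 1) / (2 * eps)) - 1)

if __name__ == '__main__':
    z = np.linspace(-100., 100., 11)
    # round-trip through the transform recovers the original values
    assert np.allclose(_h_inv(_h(z)), z, atol=1e-6)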
def testing_model(self, sess, max_steps, global_t, folder,
                  demo_memory_cam=None, demo_cam_human=False):
    logger.info("Testing model at global_t={}...".format(global_t))

    # copy weights from shared to local
    sess.run(self.sync)

    if demo_memory_cam is not None:
        self.generate_cam_video(sess, 0.03, global_t, folder,
                                demo_memory_cam, demo_cam_human)
        return
    else:
        self.game_state.reset(hard_reset=True)
        max_steps += 4
        test_memory = ReplayMemory(
            84, 84,
            np.random.RandomState(),
            max_steps=max_steps,
            phi_length=4,
            num_actions=self.game_state.env.action_space.n,
            wrap_memory=False,
            full_state_size=self.game_state.clone_full_state().shape[0])
        for _ in range(4):
            test_memory.add(
                self.game_state.x_t, 0,
                self.game_state.reward,
                self.game_state.terminal,
                self.game_state.lives,
                fullstate=self.game_state.full_state)

    episode_buffer = []
    test_memory_cam = []

    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0
    terminal = False

    while True:
        # pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
        test_memory_cam.append(self.game_state.s_t)
        episode_buffer.append(self.game_state.get_screen_rgb())
        pi_, value_, logits_ = self.local_network.run_policy_and_value(
            sess, self.game_state.s_t)
        # action = self.choose_action(logits_)
        action = np.argmax(pi_)

        # take action
        self.game_state.step(action)
        terminal = self.game_state.terminal
        memory_full = episode_steps == max_steps - 5
        terminal_ = terminal or memory_full

        # store the transition to replay memory
        test_memory.add(
            self.game_state.x_t1, action,
            self.game_state.reward, terminal_,
            self.game_state.lives,
            fullstate=self.game_state.full_state1)

        # update the old values
        episode_reward += self.game_state.reward
        episode_steps += 1

        # s_t = s_t1
        self.game_state.update()

        if terminal_:
            if get_wrapper_by_name(
                    self.game_state.env,
                    'EpisodicLifeEnv').was_real_done or memory_full:
                time_per_step = 0.03
                images = np.array(episode_buffer)
                make_movie(
                    images,
                    folder + '/frames/image{ep:010d}'.format(ep=global_t),
                    duration=len(images) * time_per_step,
                    true_image=True, salience=False)
                break

            self.game_state.reset(hard_reset=False)
            if self.use_lstm:
                self.local_network.reset_state()

    total_reward = episode_reward
    total_steps = episode_steps
    log_data = (global_t, self.thread_index, total_reward, total_steps)
    logger.info(
        "test: global_t={} worker={} final score={} final steps={}".format(
            *log_data))

    self.generate_cam_video(sess, 0.03, global_t, folder,
                            np.array(test_memory_cam))
    test_memory.save(name='test_cam', folder=folder, resize=True)

    if self.use_lstm:
        self.local_network.reset_state()

    return
def train(self, sess, global_t, train_rewards):
    """Train A3C."""
    states = []
    fullstates = []
    actions = []
    rewards = []
    values = []
    rho = []

    terminal_pseudo = False  # loss of life
    terminal_end = False  # real terminal

    # copy weights from shared to local
    sess.run(self.sync)

    start_local_t = self.local_t

    # t_max times loop
    for i in range(self.local_t_max):
        state = cv2.resize(self.game_state.s_t,
                           self.local_net.in_shape[:-1],
                           interpolation=cv2.INTER_AREA)
        fullstate = self.game_state.clone_full_state()

        pi_, value_, logits_ = self.local_net.run_policy_and_value(
            sess, state)
        action = self.pick_action(logits_)

        states.append(state)
        fullstates.append(fullstate)
        actions.append(action)
        values.append(value_)

        if self.thread_idx == self.log_idx \
                and self.local_t % self.log_interval == 0:
            log_msg1 = "lg={}".format(
                np.array_str(logits_, precision=4, suppress_small=True))
            log_msg2 = "pi={}".format(
                np.array_str(pi_, precision=4, suppress_small=True))
            log_msg3 = "V={:.4f}".format(value_)
            logger.debug(log_msg1)
            logger.debug(log_msg2)
            logger.debug(log_msg3)

        # process game
        self.game_state.step(action)

        # receive game result
        reward = self.game_state.reward
        terminal = self.game_state.terminal

        self.episode_reward += reward

        if self.use_sil:
            # save states in episode memory
            self.episode.add_item(self.game_state.s_t, fullstate, action,
                                  reward, terminal)

        if self.reward_type == 'CLIP':
            reward = np.sign(reward)

        rewards.append(reward)

        self.local_t += 1
        self.episode_steps += 1
        global_t += 1

        # s_t1 -> s_t
        self.game_state.update()

        if terminal:
            terminal_pseudo = True
            env = self.game_state.env
            name = 'EpisodicLifeEnv'
            if get_wrapper_by_name(env, name).was_real_done:
                # reduce log freq
                if self.thread_idx == self.log_idx:
                    log_msg = "train: worker={} global_t={} local_t={}".format(
                        self.thread_idx, global_t, self.local_t)
                    score_str = colored(
                        "score={}".format(self.episode_reward), "magenta")
                    steps_str = colored(
                        "steps={}".format(self.episode_steps), "blue")
                    log_msg += " {} {}".format(score_str, steps_str)
                    logger.debug(log_msg)

                train_rewards['train'][global_t] = (self.episode_reward,
                                                    self.episode_steps)
                self.record_summary(score=self.episode_reward,
                                    steps=self.episode_steps,
                                    episodes=None, global_t=global_t,
                                    mode='Train')
                self.episode_reward = 0
                self.episode_steps = 0
                terminal_end = True

            self.game_state.reset(hard_reset=False)
            break

    cumsum_reward = 0.0
    if not terminal:
        state = cv2.resize(self.game_state.s_t,
                           self.local_net.in_shape[:-1],
                           interpolation=cv2.INTER_AREA)
        cumsum_reward = self.local_net.run_value(sess, state)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_state = []
    batch_action = []
    batch_adv = []
    batch_cumsum_reward = []

    # compute and accumulate gradients
    for (ai, ri, si, vi) in zip(actions, rewards, states, values):
        if self.transformed_bellman:
            ri = np.sign(ri) * self.reward_constant + ri
            cumsum_reward = transform_h(
                ri + self.gamma * transform_h_inv(cumsum_reward))
        else:
            cumsum_reward = ri + self.gamma * cumsum_reward
        advantage = cumsum_reward - vi

        # convert action to one-hot vector
        a = np.zeros([self.action_size])
        a[ai] = 1

        batch_state.append(si)
        batch_action.append(a)
        batch_adv.append(advantage)
        batch_cumsum_reward.append(cumsum_reward)

    cur_learning_rate = self._anneal_learning_rate(
        global_t, self.initial_learning_rate)

    feed_dict = {
        self.local_net.s: batch_state,
        self.local_net.a: batch_action,
        self.local_net.advantage: batch_adv,
        self.local_net.cumulative_reward: batch_cumsum_reward,
        self.learning_rate_input: cur_learning_rate,
    }

    sess.run(self.apply_gradients, feed_dict=feed_dict)

    t = self.local_t - self.prev_local_t
    if self.thread_idx == self.log_idx and t >= self.perf_log_interval:
        self.prev_local_t += self.perf_log_interval
        elapsed_time = time.time() - self.start_time
        steps_per_sec = global_t / elapsed_time
        logger.info("worker-{}, log_worker-{}".format(
            self.thread_idx, self.log_idx))
        logger.info("Performance : {} STEPS in {:.0f} sec. {:.0f}"
                    " STEPS/sec. {:.2f}M STEPS/hour.".format(
                        global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t, terminal_end, terminal_pseudo
def run(self):
    # load if starting from a checkpoint
    wall_t = self._load()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    # only reset when it doesn't evaluate first when it enters loop below
    if self.global_t % self.eval_freq != 0:
        self._reset(hard_reset=True)

    # only executed at the very beginning of training and never again
    if self.global_t == 0 and self.train_with_demo_steps > 0:
        self.train_with_demo_memory_only()

    # load one demo for cam
    if self.load_demo_cam:
        # note, tuple length has to be >=2. pad 0 if len==1
        demo_cam_id = tuple(map(int, self.demo_cam_id.split(",")))
        if len(demo_cam_id) == 1:
            demo_cam_id = (*demo_cam_id, '0')
        demo_cam, _, total_rewards_cam, _ = load_memory(
            name=None,
            demo_memory_folder=self.demo_memory_folder,
            demo_ids=demo_cam_id,
            imgs_normalized=False)

        max_idx, _ = max(total_rewards_cam.items(), key=lambda a: a[1])
        size_max_idx_mem = len(demo_cam[max_idx])
        self.test_cam_si = np.zeros(
            (size_max_idx_mem,
             demo_cam[max_idx].height,
             demo_cam[max_idx].width,
             demo_cam[max_idx].phi_length),
            dtype=np.float32)
        for i in range(size_max_idx_mem):
            s0, _, _, _, _, _, _, _ = demo_cam[max_idx][i]
            self.test_cam_si[i] = np.copy(s0)
        logger.info("loaded demo {} for testing CAM".format(demo_cam_id))

    # set start time
    start_time = time.time() - wall_t

    logger.info("replay memory size={}".format(self.replay_memory.size))

    sub_total_reward = 0.0
    sub_steps = 0

    while self.global_t < self.train_max_steps:
        # Evaluation of policy
        if self.global_t % self.eval_freq == 0:
            terminal = 0
            total_reward, total_steps, n_episodes = self.test()
            # re-initialize game for training
            self._reset(hard_reset=True)
            sub_total_reward = 0.0
            sub_steps = 0
            time.sleep(0.5)

        if self.global_t % self.copy_freq == 0:
            self.net.update_target_network(slow=False)

        # choose an action epsilon greedily
        ## self._update_state_input(observation)
        readout_t = self.net.evaluate(self.game_state.s_t)[0]
        action = get_action_index(
            readout_t,
            is_random=(random.random() <= self.epsilon
                       or self.global_t <= self.observe),
            n_actions=self.game_state.env.action_space.n)

        # scale down epsilon
        if self.epsilon > self.final_epsilon and self.global_t > self.observe:
            self.epsilon -= \
                (self.init_epsilon - self.final_epsilon) / self.explore

        ##### HUMAN ADVICE OVERRIDE ACTION #####
        if self.use_human_advice and self.psi > self.final_epsilon:
            use_advice = False
            # After n exploration steps, decay psi
            if (self.global_t - self.observe) >= self.explore:
                self.psi *= self.init_psi

            # TODO: Determine if I want advice during observation
            # or only during exploration
            if random.random() > self.final_epsilon:
                psi_cond = True if self.psi == self.init_psi \
                    else (self.psi > random.random())
                if psi_cond:
                    action_advice = self.human_net.evaluate(
                        self.game_state.s_t)[0]
                    action_human = np.argmax(action_advice)
                    if action_advice[action_human] >= self.confidence:
                        action = action_human
                        use_advice = True
        ##### HUMAN ADVICE OVERRIDE ACTION #####

        # Training
        # run the selected action and observe next state and reward
        self.game_state.step(action)
        terminal = self.game_state.terminal
        terminal_ = terminal or ((self.global_t + 1) % self.eval_freq == 0)

        # store the transition in D
        ## self.replay_memory.add_sample(observation, action, reward,
        ##                               (1 if terminal_ else 0))
        self.replay_memory.add(
            self.game_state.x_t1, action,
            self.game_state.reward, terminal_,
            self.game_state.lives,
            fullstate=self.game_state.full_state1)

        # update the old values
        sub_total_reward += self.game_state.reward
        sub_steps += 1
        self.global_t += 1
        self.game_state.update()

        # only train if done observing
        if self.global_t > self.observe \
                and self.global_t % self.update_freq == 0:
            s_j_batch, a_batch, r_batch, terminals, s_j1_batch = \
                self.replay_memory.sample(self.batch,
                                          reward_type=self.reward_type)
            # perform gradient step
            self.net.train(s_j_batch, a_batch, r_batch, s_j1_batch,
                           terminals, self.global_t)
            # self.net.add_summary(summary, self.global_t)

        if terminal:
            if get_wrapper_by_name(self.game_state.env,
                                   'EpisodicLifeEnv').was_real_done:
                self.rewards['train'][self.global_t] = (sub_total_reward,
                                                        sub_steps)
                score_str = colored("score={}".format(sub_total_reward),
                                    "magenta")
                steps_str = colored("steps={}".format(sub_steps), "blue")
                log_data = (self.global_t, score_str, steps_str)
                logger.debug("train: global_t={} {} {}".format(*log_data))
                self.net.record_summary(score=sub_total_reward,
                                        steps=sub_steps,
                                        episodes=None,
                                        global_t=self.global_t,
                                        mode='Train')
                sub_total_reward = 0.0
                sub_steps = 0

            self._reset(hard_reset=False)

        # save progress every SAVE_FREQ iterations
        if self.global_t % self.save_freq == 0:
            wall_t = time.time() - start_time
            logger.info('Total time: {} seconds'.format(wall_t))
            wall_t_fname = self.folder + '/' + 'wall_t.' + str(self.global_t)
            epsilon_fname = self.folder + '/epsilon'

            logger.info('Now saving data. Please wait')
            with open(wall_t_fname, 'w') as f:
                f.write(str(wall_t))
            with open(epsilon_fname, 'w') as f:
                f.write(str(self.epsilon))

            self.net.save(self.global_t)

            self.replay_memory.save(name=self.name, folder=self.folder,
                                    resize=False)
            pickle.dump(
                self.rewards,
                open(self.folder + '/' + self.name.replace('-', '_')
                     + '-dqn-rewards.pkl', 'wb'),
                pickle.HIGHEST_PROTOCOL)
            logger.info('Data saved!')

        # log information
        state = ""
        if self.global_t - 1 < self.observe:
            state = "observe"
        elif self.global_t - 1 < self.observe + self.explore:
            state = "explore"
        else:
            state = "train"

        if (self.global_t - 1) % 10000 == 0:
            if self.use_human_advice:
                log_data = (state, self.global_t - 1, self.epsilon,
                            self.psi, use_advice, action,
                            np.max(readout_t))
                logger.debug(
                    "{0:}: global_t={1:} epsilon={2:.4f} psi={3:.4f}"
                    " advice={4:} action={5:} q_max={6:.4f}".format(
                        *log_data))
            else:
                log_data = (state, self.global_t - 1, self.epsilon,
                            action, np.max(readout_t))
                logger.debug(
                    "{0:}: global_t={1:} epsilon={2:.4f} action={3:} "
                    "q_max={4:.4f}".format(*log_data))
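# `get_action_index` (used in run() above and in test() below) is not defined
# in this section. A minimal sketch of the assumed epsilon-greedy behavior:
# return a random action index when `is_random` is set, otherwise the greedy
# argmax of the Q-value readout. The name and signature are assumptions
# mirroring the call sites; the project's actual helper may differ.
import random
import numpy as np

def get_action_index_sketch(readout_t, is_random=False, n_actions=None):
    if is_random:
        return random.randrange(n_actions)
    return int(np.argmax(readout_t))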
def test(self, render=False):
    logger.info("Evaluate policy at global_t={}...".format(self.global_t))

    episode_buffer = []
    self.game_state.reset(hard_reset=True)
    episode_buffer.append(self.game_state.get_screen_rgb())

    max_steps = self.eval_max_steps
    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0

    # use one demonstration data to record cam
    # only need to make movie for demo data once
    # if self.global_t == 0:
    cam, state, action = self.calculate_cam(self.test_cam_si)
    cam_plus_img = []
    cam_side_img = []

    for i in range(len(cam)):
        # overlay cam-state
        overlay = np.uint8(cam[i]).copy()
        output = np.uint8(state[i]).copy()
        alpha = 0.3
        cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)

        # create a title space for action
        title_space = np.zeros((20, 84, 3), np.uint8)
        title_space[:] = (255, 255, 255)
        cv2.putText(title_space, "{}".format(ACTION_MEANING[action[i]]),
                    (20, 14), cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)

        # concatenate title and state
        vcat_output = cv2.vconcat((title_space, output))
        cam_plus_img.append(vcat_output)

        # side-by-side cam-state
        hcat_cam_state = cv2.hconcat(
            (np.uint8(cam[i]).copy(), np.uint8(state[i]).copy()))
        title_space = np.zeros((20, 84 * 2, 3), np.uint8)
        title_space[:] = (255, 255, 255)
        vcat_title_camstate = cv2.vconcat((title_space, hcat_cam_state))
        cv2.putText(vcat_title_camstate,
                    "{}".format(ACTION_MEANING[action[i]]),
                    (20, 14), cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)
        cam_side_img.append(vcat_title_camstate)

    time_per_step = 0.0167
    make_movie(
        cam_plus_img,
        self.folder + '/frames/demo-cam_plus_img{ep:010d}'.format(
            ep=(self.global_t)),
        duration=len(cam) * time_per_step,
        true_image=True,
        salience=False)
    make_movie(
        cam_side_img,
        self.folder + '/frames/demo-cam_side_img{ep:010d}'.format(
            ep=(self.global_t)),
        duration=len(state) * time_per_step,
        true_image=True,
        salience=False)
    del cam, state, action, cam_plus_img, cam_side_img

    while max_steps > 0:
        readout_t = self.net.evaluate(self.game_state.s_t)[0]
        action = get_action_index(
            readout_t,
            is_random=(random.random() <= 0.05),
            n_actions=self.game_state.env.action_space.n)

        # take action
        self.game_state.step(action)
        terminal = self.game_state.terminal

        if n_episodes == 0 and self.global_t % 2000000 == 0:
            episode_buffer.append(self.game_state.get_screen_rgb())

        episode_reward += self.game_state.reward
        episode_steps += 1
        max_steps -= 1

        # s_t = s_t1
        self.game_state.update()

        if terminal:
            if get_wrapper_by_name(self.game_state.env,
                                   'EpisodicLifeEnv').was_real_done:
                if n_episodes == 0 and self.global_t % 2000000 == 0:
                    time_per_step = 0.0167
                    images = np.array(episode_buffer)
                    make_movie(
                        images,
                        self.folder + '/frames/image{ep:010d}'.format(
                            ep=(self.global_t)),
                        duration=len(images) * time_per_step,
                        true_image=True,
                        salience=False)
                    episode_buffer = []

                n_episodes += 1
                score_str = colored("score={}".format(episode_reward),
                                    "magenta")
                steps_str = colored("steps={}".format(episode_steps), "blue")
                log_data = (self.global_t, n_episodes, score_str, steps_str,
                            total_steps)
                logger.debug(
                    "test: global_t={} trial={} {} {} total_steps={}".format(
                        *log_data))
                total_reward += episode_reward
                total_steps += episode_steps
                episode_reward = 0
                episode_steps = 0

            self.game_state.reset(hard_reset=False)

    if n_episodes == 0:
        total_reward = episode_reward
        total_steps = episode_steps
    else:
        # (timestep, total sum of rewards, total # of steps before terminating)
        total_reward = total_reward / n_episodes
        total_steps = total_steps // n_episodes

    log_data = (self.global_t, total_reward, total_steps, n_episodes)
    logger.debug(
        "test: global_t={} final score={} final steps={} # episodes={}".format(
            *log_data))

    self.net.record_summary(score=total_reward, steps=total_steps,
                            episodes=n_episodes, global_t=self.global_t,
                            mode='Test')

    self.rewards['eval'][self.global_t] = (total_reward, total_steps,
                                           n_episodes)
    return total_reward, total_steps, n_episodes
def rollout(self, a3c_sess, folder, pretrain_sess, global_t, past_state,
            add_all_rollout, ep_max_steps, nstep_bc, update_in_rollout):
    """Perform one rollout until terminal."""
    a3c_sess.run(self.sync_a3c)
    if nstep_bc > 0:
        pretrain_sess.run(self.sync_pretrained)

    _, fs, old_a, old_return, _, _ = past_state

    states = []
    actions = []
    rewards = []
    values = []
    terminals = []
    confidences = []

    rollout_ctr, rollout_added_ctr = 0, 0
    rollout_new_return, rollout_old_return = 0, 0

    terminal_pseudo = False  # loss of life
    terminal_end = False  # real terminal
    add = False

    self.rolloutgame.reset(hard_reset=True)
    self.rolloutgame.restore_full_state(fs)

    # check if restore successful
    fs_check = self.rolloutgame.clone_full_state()
    assert fs_check.all() == fs.all()
    del fs_check

    start_local_t = self.local_t

    self.rolloutgame.step(0)

    # prevent rollout too long, set max_ep_steps to be lower than ALE default
    # see https://github.com/openai/gym/blob/54f22cf4db2e43063093a1b15d968a57a32b6e90/gym/envs/__init__.py#L635
    # but in all games tested, no rollout exceeds ep_max_steps
    while ep_max_steps > 0:
        state = cv2.resize(self.rolloutgame.s_t,
                           self.local_a3c.in_shape[:-1],
                           interpolation=cv2.INTER_AREA)
        fullstate = self.rolloutgame.clone_full_state()

        if nstep_bc > 0:
            # LiDER-TA or BC
            model_pi = self.local_pretrained.run_policy(pretrain_sess, state)
            action, confidence = self.choose_action_with_high_confidence(
                model_pi, exclude_noop=False)
            # not using "confidences" for anything
            confidences.append(confidence)
            nstep_bc -= 1
        else:
            # LiDER, refresh with current policy
            pi_, _, logits_ = self.local_a3c.run_policy_and_value(
                a3c_sess, state)
            action = self.pick_action(logits_)
            confidences.append(pi_[action])

        value_ = self.local_a3c.run_value(a3c_sess, state)
        values.append(value_)
        states.append(state)
        actions.append(action)

        self.rolloutgame.step(action)
        ep_max_steps -= 1

        reward = self.rolloutgame.reward
        terminal = self.rolloutgame.terminal
        terminals.append(terminal)

        self.episode_reward += reward

        self.episode.add_item(self.rolloutgame.s_t, fullstate, action,
                              reward, terminal, from_rollout=True)

        if self.reward_type == 'CLIP':
            reward = np.sign(reward)
        rewards.append(reward)

        self.local_t += 1
        self.episode_steps += 1
        global_t += 1

        self.rolloutgame.update()

        if terminal:
            terminal_pseudo = True
            env = self.rolloutgame.env
            name = 'EpisodicLifeEnv'
            rollout_ctr += 1
            terminal_end = get_wrapper_by_name(env, name).was_real_done

            new_return = self.compute_return_for_state(rewards, terminals)
            if not add_all_rollout:
                if new_return > old_return:
                    add = True
            else:
                add = True

            if add:
                rollout_added_ctr += 1
                rollout_new_return += new_return
                rollout_old_return += old_return
                # update policy immediately using a good rollout
                if update_in_rollout:
                    batch_adv = self.update_a3c(a3c_sess, actions, states,
                                                rewards, values, global_t)

            self.episode_reward = 0
            self.episode_steps = 0
            self.rolloutgame.reset(hard_reset=True)
            break

    diff_local_t = self.local_t - start_local_t

    return diff_local_t, terminal_end, terminal_pseudo, rollout_ctr, \
        rollout_added_ctr, add, rollout_new_return, rollout_old_return
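# `compute_return_for_state` is used above to compare the rollout's return
# against `old_return` but is not shown in this section. A minimal sketch of
# the assumed behavior (discounted return of the rollout as seen from its
# first state, with the backup cut at terminal transitions); the project's
# version may additionally clip or transform rewards. The name and default
# gamma are assumptions for illustration only.
def compute_return_for_state_sketch(rewards, terminals, gamma=0.99):
    ret = 0.
    # walk the trajectory backwards, zeroing the bootstrap at terminals
    for r, t in zip(reversed(rewards), reversed(terminals)):
        ret = r + gamma * (0. if t else ret)
    return ret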