def calculate_cam(self, test_cam_si):
    state = []
    action_onehot = []
    action_array = []

    for i in range(len(test_cam_si)):
        # pick an epsilon-greedy (epsilon=0.05) action for each demo state
        readout_t = self.net.evaluate(test_cam_si[i])[0]
        action = get_action_index(
            readout_t,
            is_random=(random.random() <= 0.05),
            n_actions=self.game_state.env.action_space.n)
        action_array.append(action)

        a_onehot = np.zeros(self.game_state.env.action_space.n)
        a_onehot[action] = 1
        action_onehot.append(a_onehot)

        # collapse the stacked input frames into one grayscale image
        state.append(np.mean(test_cam_si[i], axis=-1))

    conv_value, conv_grad, gbgrad = self.net.grad_cam(
        test_cam_si, action_onehot)

    cam = []
    img = []
    for i in range(len(conv_value)):
        cam_tmp = self.visualize(conv_value[i], conv_grad[i])
        cam.append(cam_tmp)

        # fake RGB channels for demo images
        state_tmp = cv2.merge((state[i], state[i], state[i]))
        img.append(state_tmp)

    return np.array(cam), np.array(img), action_array
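
# --- Illustrative sketch (assumption): `self.visualize` above is defined
# elsewhere in this class and is not shown in this section. A standard
# Grad-CAM reduction of the conv activations and gradients it receives would
# look like the following; the actual helper may differ in normalization and
# colormap choice.
import cv2
import numpy as np

def grad_cam_heatmap(conv_value, conv_grad, out_size=(84, 84)):
    """Collapse one frame's conv activations/gradients into a heatmap."""
    # channel weights: global-average-pool the gradients (H, W, C) -> (C,)
    weights = np.mean(conv_grad, axis=(0, 1))
    # weighted sum of activation maps, then ReLU to keep positive evidence
    cam = np.maximum(np.tensordot(conv_value, weights, axes=([2], [0])), 0)
    # normalize to [0, 255], upscale to the input resolution, and colorize
    cam = cam / (cam.max() + 1e-8)
    cam = cv2.resize(np.uint8(255 * cam), out_size)
    return cv2.applyColorMap(cam, cv2.COLORMAP_JET)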
def run(self, minutes_limit=5, episode=0, num_episodes=0, demo_type=0,
        model_net=None, replay_memory=None, total_memory=0):
    if self.create_movie:
        movie_images = []

    rewards = {'train': [], 'eval': []}

    full_episode = False
    if minutes_limit == 0:
        full_episode = True
    timeout = 60 * minutes_limit
    t = 0
    total_reward = 0.0

    # re-initialize game for evaluation
    self._reset(replay_memory, hard_reset=True)

    rew = self.game_state.reward
    terminal = False
    lives = self.game_state.lives
    # loss_life = self.game_state.loss_life
    # gain_life = self.game_state.gain_life and not loss_life

    if self.pause_onstart:
        root = Tk()
        root.withdraw()
        messagebox.showinfo(
            self.name,
            "Start episode {} of {}. total memory={}. "
            "Press OK to start playing".format(
                episode, num_episodes, total_memory))

    # regular game
    start_time = datetime.datetime.now()
    timeout_start = time.time()

    actions = deque()

    # pace the loop at self.hertz steps per second (e.g., 60 Hz)
    dtm = time.time()
    pulse = 1.0 / self.hertz
    while True:
        dtm += pulse
        delay = dtm - time.time()
        if delay > 0:
            time.sleep(delay)
        else:
            dtm = time.time()

        if not terminal:
            if demo_type == 1:  # RANDOM AGENT
                action = np.random.randint(self.game_state.n_actions)
            elif demo_type == 2:  # MODEL AGENT
                # query the model only on skip-aligned steps
                # (the original used `sub_t`, which is undefined here)
                if t % self._skip == 0:
                    self._update_state_input(self.game_state.s_t)
                    readout_t = model_net.evaluate(self.state_input)[0]
                    action = get_action_index(
                        readout_t, is_random=False,
                        n_actions=self.game_state.n_actions)
            else:  # HUMAN
                action = self.game_state.env.human_agent_action

        actions.append(action)
        self.game_state.step(action)
        rew += self.game_state.reward
        lives = self.game_state.lives
        # loss_life = loss_life or self.game_state.loss_life
        # gain_life = (gain_life or self.game_state.gain_life) and not loss_life
        total_reward += self.game_state.reward
        t += 1

        if self.create_movie:
            movie_images.append(self.game_state.get_screen_rgb())

        # Ensure that D does not reach max memory, which mitigates
        # problems when combining different human demo files
        if (replay_memory.size + 3) == replay_memory.max_steps:
            logger.warn("Memory max limit reached!")
            terminal = True
        elif not full_episode:
            terminal = time.time() > timeout_start + timeout

        # add memory every 4th frame even if demo uses skip=1
        if (self.game_state.get_episode_frame_number() % self._skip == 0
                or terminal or self.game_state.terminal):
            self.obs_buffer[0] = self.game_state.x_t
            self.obs_buffer[1] = self.game_state.x_t1
            # max-pool consecutive frames to remove sprite flicker
            max_obs = self.obs_buffer.max(axis=0)
            # cv2.imshow('max obs', max_obs)
            # cv2.imshow('current', self.game_state.x_t1)
            # cv2.waitKey(1)

            # store the transition in D
            replay_memory.add(
                max_obs, actions.popleft(), rew,
                terminal or self.game_state.terminal, lives,
                fullstate=self.game_state.full_state1)
            actions.clear()
            rew = 0

        if terminal or (self.game_state.episode_life
                        and get_wrapper_by_name(
                            self.game_state.env,
                            'EpisodicLifeEnv').was_real_done):
            root = Tk()
            root.withdraw()
            messagebox.showinfo(
                self.name,
                "Time's up!" if terminal else "Game ended!")
            break

        if self.game_state.terminal:
            self._reset(replay_memory, hard_reset=False)
            continue

        self.game_state.update()

    end_time = datetime.datetime.now()
    duration = end_time - start_time
    logger.info("Duration: {}".format(duration))
    logger.info("Total steps: {}".format(t))
    logger.info("Total reward: {}".format(total_reward))
    logger.info("Total Replay memory saved: {}".format(replay_memory.size))

    replay_memory.save(name=self.name, folder=self.folder, resize=True)
    if self.create_movie:
        time_per_step = 0.0167
        make_movie(
            movie_images, str(self.folder / "demo"),
            duration=len(movie_images) * time_per_step,
            true_image=True, salience=False)

    return (total_reward, t, start_time, end_time, duration,
            replay_memory.size)
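
# --- Illustrative sketch (assumption): the dtm/pulse arithmetic in run()
# above paces the loop at self.hertz steps per second without drift. A
# standalone equivalent of that pattern, with hypothetical names:
import time

def run_at_fixed_rate(step_fn, hertz, n_steps):
    """Call step_fn() n_steps times at roughly `hertz` calls per second."""
    pulse = 1.0 / hertz
    deadline = time.time()
    for _ in range(n_steps):
        deadline += pulse
        delay = deadline - time.time()
        if delay > 0:
            time.sleep(delay)       # early: sleep off the remainder
        else:
            deadline = time.time()  # behind: resynchronize instead of rushing
        step_fn()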
def run(self):
    # load if starting from a checkpoint
    wall_t = self._load()

    # get the first state by doing nothing and preprocess the image to 84x84x4
    # only reset when it doesn't evaluate first when it enters the loop below
    if self.global_t % self.eval_freq != 0:
        self._reset(hard_reset=True)

    # only executed at the very beginning of training and never again
    if self.global_t == 0 and self.train_with_demo_steps > 0:
        self.train_with_demo_memory_only()

    # load one demo episode for CAM
    if self.load_demo_cam:
        # note: demo_ids has to be a tuple of length >= 2; pad with 0 if
        # only one id was given
        demo_cam_id = tuple(map(int, self.demo_cam_id.split(",")))
        if len(demo_cam_id) == 1:
            demo_cam_id = (*demo_cam_id, 0)
        demo_cam, _, total_rewards_cam, _ = load_memory(
            name=None,
            demo_memory_folder=self.demo_memory_folder,
            demo_ids=demo_cam_id,
            imgs_normalized=False)

        # keep the demo episode with the highest total reward
        max_idx, _ = max(total_rewards_cam.items(), key=lambda a: a[1])
        size_max_idx_mem = len(demo_cam[max_idx])
        self.test_cam_si = np.zeros(
            (size_max_idx_mem,
             demo_cam[max_idx].height,
             demo_cam[max_idx].width,
             demo_cam[max_idx].phi_length),
            dtype=np.float32)
        for i in range(size_max_idx_mem):
            s0, _, _, _, _, _, _, _ = demo_cam[max_idx][i]
            self.test_cam_si[i] = np.copy(s0)
        logger.info("loaded demo {} for testing CAM".format(demo_cam_id))

    # set start time
    start_time = time.time() - wall_t

    logger.info("replay memory size={}".format(self.replay_memory.size))

    sub_total_reward = 0.0
    sub_steps = 0

    while self.global_t < self.train_max_steps:
        # evaluate the policy every eval_freq steps
        if self.global_t % self.eval_freq == 0:
            terminal = 0
            total_reward, total_steps, n_episodes = self.test()
            # re-initialize game for training
            self._reset(hard_reset=True)
            sub_total_reward = 0.0
            sub_steps = 0
            time.sleep(0.5)

        # sync the target network every copy_freq steps
        if self.global_t % self.copy_freq == 0:
            self.net.update_target_network(slow=False)

        # choose an action epsilon-greedily
        ## self._update_state_input(observation)
        readout_t = self.net.evaluate(self.game_state.s_t)[0]
        action = get_action_index(
            readout_t,
            is_random=(random.random() <= self.epsilon
                       or self.global_t <= self.observe),
            n_actions=self.game_state.env.action_space.n)

        # scale down epsilon (linear anneal over `explore` steps)
        if self.epsilon > self.final_epsilon and self.global_t > self.observe:
            self.epsilon -= (
                self.init_epsilon - self.final_epsilon) / self.explore

        ##### HUMAN ADVICE OVERRIDE ACTION #####
        if self.use_human_advice and self.psi > self.final_epsilon:
            use_advice = False
            # after n exploration steps, decay psi
            if (self.global_t - self.observe) >= self.explore:
                self.psi *= self.init_psi

            # TODO: determine if advice should apply during observation
            # or only during exploration
            if random.random() > self.final_epsilon:
                psi_cond = (True if self.psi == self.init_psi
                            else self.psi > random.random())
                if psi_cond:
                    action_advice = self.human_net.evaluate(
                        self.game_state.s_t)[0]
                    action_human = np.argmax(action_advice)
                    # override only when the human model is confident enough
                    if action_advice[action_human] >= self.confidence:
                        action = action_human
                        use_advice = True
        ##### HUMAN ADVICE OVERRIDE ACTION #####

        # Training
        # run the selected action and observe next state and reward
        self.game_state.step(action)
        terminal = self.game_state.terminal
        terminal_ = terminal or ((self.global_t + 1) % self.eval_freq == 0)

        # store the transition in D
        ## self.replay_memory.add_sample(observation, action, reward, (1 if terminal_ else 0))
        self.replay_memory.add(
            self.game_state.x_t1, action,
            self.game_state.reward, terminal_,
            self.game_state.lives,
            fullstate=self.game_state.full_state1)

        # update the old values
        sub_total_reward += self.game_state.reward
        sub_steps += 1
        self.global_t += 1
        self.game_state.update()

        # only train if done observing
        if (self.global_t > self.observe
                and self.global_t % self.update_freq == 0):
            s_j_batch, a_batch, r_batch, terminals, s_j1_batch = \
                self.replay_memory.sample(
                    self.batch, reward_type=self.reward_type)
            # perform gradient step
            self.net.train(s_j_batch, a_batch, r_batch,
                           s_j1_batch, terminals, self.global_t)
            # self.net.add_summary(summary, self.global_t)

        if terminal:
            if get_wrapper_by_name(
                    self.game_state.env, 'EpisodicLifeEnv').was_real_done:
                self.rewards['train'][self.global_t] = (
                    sub_total_reward, sub_steps)
                score_str = colored(
                    "score={}".format(sub_total_reward), "magenta")
                steps_str = colored("steps={}".format(sub_steps), "blue")
                log_data = (self.global_t, score_str, steps_str)
                logger.debug("train: global_t={} {} {}".format(*log_data))
                self.net.record_summary(
                    score=sub_total_reward, steps=sub_steps, episodes=None,
                    global_t=self.global_t, mode='Train')
                sub_total_reward = 0.0
                sub_steps = 0
            self._reset(hard_reset=False)

        # save progress every save_freq iterations
        if self.global_t % self.save_freq == 0:
            wall_t = time.time() - start_time
            logger.info('Total time: {} seconds'.format(wall_t))

            wall_t_fname = self.folder + '/' + 'wall_t.' + str(self.global_t)
            epsilon_fname = self.folder + '/epsilon'

            logger.info('Now saving data. Please wait')
            with open(wall_t_fname, 'w') as f:
                f.write(str(wall_t))
            with open(epsilon_fname, 'w') as f:
                f.write(str(self.epsilon))

            self.net.save(self.global_t)
            self.replay_memory.save(
                name=self.name, folder=self.folder, resize=False)
            with open(self.folder + '/'
                      + self.name.replace('-', '_') + '-dqn-rewards.pkl',
                      'wb') as f:
                pickle.dump(self.rewards, f, pickle.HIGHEST_PROTOCOL)
            logger.info('Data saved!')

        # log information
        if self.global_t - 1 < self.observe:
            state = "observe"
        elif self.global_t - 1 < self.observe + self.explore:
            state = "explore"
        else:
            state = "train"

        if (self.global_t - 1) % 10000 == 0:
            if self.use_human_advice:
                log_data = (state, self.global_t - 1, self.epsilon,
                            self.psi, use_advice, action,
                            np.max(readout_t))
                logger.debug(
                    "{0:}: global_t={1:} epsilon={2:.4f} psi={3:.4f} "
                    "advice={4:} action={5:} q_max={6:.4f}".format(*log_data))
            else:
                log_data = (state, self.global_t - 1, self.epsilon,
                            action, np.max(readout_t))
                logger.debug(
                    "{0:}: global_t={1:} epsilon={2:.4f} action={3:} "
                    "q_max={4:.4f}".format(*log_data))
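
# --- Illustrative sketch (assumption): the per-step decrement in run() above
# implements a linear anneal from init_epsilon to final_epsilon over
# `explore` steps once `observe` warm-up steps have passed; in closed form:
def epsilon_at(step, observe, explore, init_eps, final_eps):
    """Exploration rate after `step` global steps under linear annealing."""
    if step <= observe:
        return init_eps
    frac = min(1.0, (step - observe) / float(explore))
    return init_eps - frac * (init_eps - final_eps)

# e.g., with observe=50000, explore=1000000, init_eps=1.0, final_eps=0.1:
# epsilon_at(550000, 50000, 1000000, 1.0, 0.1) == 0.55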
def test(self, render=False):
    logger.info("Evaluate policy at global_t={}...".format(self.global_t))

    episode_buffer = []
    self.game_state.reset(hard_reset=True)
    episode_buffer.append(self.game_state.get_screen_rgb())

    max_steps = self.eval_max_steps
    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0

    # use one demonstration episode to record CAM
    # only need to make a movie of the demo data once
    # if self.global_t == 0:
    cam, state, action = self.calculate_cam(self.test_cam_si)
    cam_plus_img = []
    cam_side_img = []

    for i in range(len(cam)):
        # overlay cam on state
        overlay = np.uint8(cam[i]).copy()
        output = np.uint8(state[i]).copy()
        alpha = 0.3
        cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)

        # create a title space for the action label
        title_space = np.zeros((20, 84, 3), np.uint8)
        title_space[:] = (255, 255, 255)
        cv2.putText(title_space, "{}".format(ACTION_MEANING[action[i]]),
                    (20, 14), cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)

        # concatenate title and state
        vcat_output = cv2.vconcat((title_space, output))
        cam_plus_img.append(vcat_output)

        # side-by-side cam and state
        hcat_cam_state = cv2.hconcat(
            (np.uint8(cam[i]).copy(), np.uint8(state[i]).copy()))
        title_space = np.zeros((20, 84 * 2, 3), np.uint8)
        title_space[:] = (255, 255, 255)
        vcat_title_camstate = cv2.vconcat((title_space, hcat_cam_state))
        cv2.putText(vcat_title_camstate,
                    "{}".format(ACTION_MEANING[action[i]]),
                    (20, 14), cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)
        cam_side_img.append(vcat_title_camstate)

    time_per_step = 0.0167
    make_movie(
        cam_plus_img,
        self.folder + '/frames/demo-cam_plus_img{ep:010d}'.format(
            ep=self.global_t),
        duration=len(cam) * time_per_step,
        true_image=True, salience=False)
    make_movie(
        cam_side_img,
        self.folder + '/frames/demo-cam_side_img{ep:010d}'.format(
            ep=self.global_t),
        duration=len(state) * time_per_step,
        true_image=True, salience=False)
    del cam, state, action, cam_plus_img, cam_side_img

    while max_steps > 0:
        readout_t = self.net.evaluate(self.game_state.s_t)[0]
        action = get_action_index(
            readout_t,
            is_random=(random.random() <= 0.05),
            n_actions=self.game_state.env.action_space.n)

        # take action
        self.game_state.step(action)
        terminal = self.game_state.terminal

        if n_episodes == 0 and self.global_t % 2000000 == 0:
            episode_buffer.append(self.game_state.get_screen_rgb())

        episode_reward += self.game_state.reward
        episode_steps += 1
        max_steps -= 1

        # s_t = s_t1
        self.game_state.update()

        if terminal:
            if get_wrapper_by_name(
                    self.game_state.env, 'EpisodicLifeEnv').was_real_done:
                if n_episodes == 0 and self.global_t % 2000000 == 0:
                    time_per_step = 0.0167
                    images = np.array(episode_buffer)
                    make_movie(
                        images,
                        self.folder + '/frames/image{ep:010d}'.format(
                            ep=self.global_t),
                        duration=len(images) * time_per_step,
                        true_image=True, salience=False)
                    episode_buffer = []
                n_episodes += 1
                score_str = colored(
                    "score={}".format(episode_reward), "magenta")
                steps_str = colored("steps={}".format(episode_steps), "blue")
                log_data = (self.global_t, n_episodes, score_str,
                            steps_str, total_steps)
                logger.debug(
                    "test: global_t={} trial={} {} {} total_steps={}".format(
                        *log_data))
                total_reward += episode_reward
                total_steps += episode_steps
                episode_reward = 0
                episode_steps = 0
            self.game_state.reset(hard_reset=False)

    if n_episodes == 0:
        # no episode finished within the step budget; report the partial run
        total_reward = episode_reward
        total_steps = episode_steps
    else:
        # average over the completed evaluation episodes
        total_reward = total_reward / n_episodes
        total_steps = total_steps // n_episodes

    log_data = (self.global_t, total_reward, total_steps, n_episodes)
    logger.debug(
        "test: global_t={} final score={} final steps={} "
        "# episodes={}".format(*log_data))
    self.net.record_summary(
        score=total_reward, steps=total_steps,
        episodes=n_episodes, global_t=self.global_t, mode='Test')

    # (timestep, total sum of rewards, total # of steps before terminating)
    self.rewards['eval'][self.global_t] = (
        total_reward, total_steps, n_episodes)
    return total_reward, total_steps, n_episodes
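
# --- Illustrative sketch (assumption): `get_action_index` is imported from a
# utility module not shown in this section. Given how it is called above, a
# minimal equivalent of the epsilon-greedy selection it performs would be:
import random
import numpy as np

def get_action_index_sketch(readout, is_random, n_actions):
    """Pick a uniformly random action if is_random, else the greedy argmax."""
    if is_random:
        return random.randrange(n_actions)
    return int(np.argmax(readout))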