Example #1
    def calculate_cam(self, test_cam_si):
        state = []
        action_onehot = []
        action_array = []

        for i in range(len(test_cam_si)):
            readout_t = self.net.evaluate(test_cam_si[i])[0]
            action = get_action_index(
                readout_t,
                is_random=(random.random() <= 0.05),
                n_actions=self.game_state.env.action_space.n)
            action_array.append(action)
            a_onehot = np.zeros(self.game_state.env.action_space.n)
            a_onehot[action] = 1
            action_onehot.append(a_onehot)

            # collapse the stacked frames into a single grayscale image for display
            state.append(np.mean(test_cam_si[i], axis=-1))

        conv_value, conv_grad, gbgrad = self.net.grad_cam(
            test_cam_si, action_onehot)
        cam = []
        img = []

        for i in range(len(conv_value)):
            cam_tmp = self.visualize(conv_value[i], conv_grad[i])
            cam.append(cam_tmp)

            # replicate the grayscale state into 3 channels so it can be
            # blended with the RGB CAM heatmap
            state_tmp = cv2.merge((state[i], state[i], state[i]))
            img.append(state_tmp)

        return np.array(cam), np.array(img), action_array
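
The `visualize` helper called above is not shown in this example. Below is a minimal sketch of the standard Grad-CAM weighting, assuming `conv_output` is one convolutional feature map of shape (H, W, C) and `conv_grad` is the gradient of the selected action's score with respect to that map; the function name, signature, and the 84x84 output size are illustrative, not taken from the original code.

import cv2
import numpy as np

def visualize(conv_output, conv_grad, out_size=(84, 84)):
    """Illustrative sketch: turn a conv feature map and its gradient into a Grad-CAM heatmap."""
    # channel weights: global-average-pool the gradients over the spatial dims
    weights = np.mean(conv_grad, axis=(0, 1))              # shape (C,)
    # weighted sum of the feature maps, rectified
    cam = np.maximum(np.sum(weights * conv_output, axis=-1), 0)
    # normalize to [0, 1] and upsample to the input resolution
    cam = cam / (cam.max() + 1e-8)
    cam = cv2.resize(cam.astype(np.float32), out_size)
    # convert to a 3-channel heatmap in [0, 255] so it can be blended with the state
    return cv2.applyColorMap(np.uint8(255 * cam), cv2.COLORMAP_JET)
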
Example #2
    def run(self, minutes_limit=5, episode=0, num_episodes=0, demo_type=0,
            model_net=None, replay_memory=None, total_memory=0):
        if self.create_movie:
            movie_images = []

        rewards = {'train':[], 'eval':[]}

        full_episode = False
        if minutes_limit == 0:
            full_episode = True
        timeout = 60 * minutes_limit
        t = 0
        total_reward = 0.0

        # re-initialize the game before recording this demo episode
        self._reset(replay_memory, hard_reset=True)

        rew = self.game_state.reward
        terminal = False
        lives = self.game_state.lives
        # loss_life = self.game_state.loss_life
        # gain_life = self.game_state.gain_life and not loss_life

        if self.pause_onstart:
            root = Tk()
            root.withdraw()

            messagebox.showinfo(
                self.name,
                "Start episode {} of {}. total memory={}. "
                "Press OK to start playing".format(episode, num_episodes, total_memory))

        # regular game
        start_time = datetime.datetime.now()
        timeout_start = time.time()

        actions = deque()

        dtm = time.time()
        pulse = 1.0 / self.hertz

        while True:
            dtm += pulse
            delay = dtm - time.time()
            if delay > 0:
                time.sleep(delay)  # 60 Hz
            else:
                dtm = time.time()

            if not terminal:
                if demo_type == 1:  # RANDOM AGENT
                    action = np.random.randint(self.game_state.n_actions)
                elif demo_type == 2:  # MODEL AGENT
                    if t % self._skip == 0:
                        self._update_state_input(self.game_state.s_t)
                        readout_t = model_net.evaluate(self.state_input)[0]
                        action = get_action_index(readout_t, is_random=False, n_actions=self.game_state.n_actions)
                else: # HUMAN
                    action = self.game_state.env.human_agent_action

            actions.append(action)
            self.game_state.step(action)
            rew += self.game_state.reward
            lives = self.game_state.lives
            # loss_life = loss_life or self.game_state.loss_life
            # gain_life = (gain_life or self.game_state.gain_life) and not loss_life
            total_reward += self.game_state.reward
            t += 1

            if self.create_movie:
                movie_images.append(self.game_state.get_screen_rgb())

            # Stop before D reaches its max size; this avoids problems
            # when combining different human demo files
            if (replay_memory.size + 3) == replay_memory.max_steps:
                logger.warn("Memory max limit reached!")
                terminal = True
            elif not full_episode:
                terminal = time.time() > timeout_start + timeout

            # store a transition every `self._skip`-th frame even if the demo itself runs with skip=1
            if self.game_state.get_episode_frame_number() % self._skip == 0 or terminal or self.game_state.terminal:
                self.obs_buffer[0] = self.game_state.x_t
                self.obs_buffer[1] = self.game_state.x_t1
                max_obs = self.obs_buffer.max(axis=0)
                # cv2.imshow('max obs', max_obs)
                # cv2.imshow('current', self.game_state.x_t1)
                # cv2.waitKey(1)

                # store the transition in D
                replay_memory.add(
                    max_obs,
                    actions.popleft(),
                    rew,
                    terminal or self.game_state.terminal,
                    lives,
                    fullstate=self.game_state.full_state1)
                actions.clear()
                rew = 0

                if terminal or (self.game_state.episode_life and get_wrapper_by_name(self.game_state.env, 'EpisodicLifeEnv').was_real_done):
                    root = Tk()
                    root.withdraw()
                    messagebox.showinfo(self.name, "Times up!" if terminal else "Game ended!")
                    break

                if self.game_state.terminal:
                    self._reset(replay_memory, hard_reset=False)
                    continue

            self.game_state.update()

        end_time = datetime.datetime.now()
        duration = end_time - start_time
        logger.info("Duration: {}".format(duration))
        logger.info("Total steps: {}".format(t))
        logger.info("Total reward: {}".format(total_reward))
        logger.info("Total Replay memory saved: {}".format(replay_memory.size))

        replay_memory.save(name=self.name, folder=self.folder, resize=True)
        if self.create_movie:
            time_per_step = 0.0167
            make_movie(
                movie_images, str(self.folder / "demo"),
                duration=len(movie_images)*time_per_step,
                true_image=True, salience=False)

        return total_reward, t, start_time, end_time, duration, replay_memory.size
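
The `dtm`/`pulse` bookkeeping at the top of the while loop above paces the demo at a fixed rate without letting timing error accumulate. The standalone sketch below shows the same pattern; the function name, the `step_fn` callback, and the step count are illustrative only.

import time

def run_at_fixed_rate(step_fn, hertz=60.0, n_steps=600):
    """Illustrative sketch: call step_fn at a fixed rate, sleeping off the rest of each slot."""
    pulse = 1.0 / hertz       # length of one time slot in seconds
    dtm = time.time()         # scheduled end of the current slot
    for _ in range(n_steps):
        dtm += pulse
        delay = dtm - time.time()
        if delay > 0:
            time.sleep(delay)     # early: wait out the rest of the slot
        else:
            dtm = time.time()     # late: resync instead of trying to catch up
        step_fn()

# e.g. run_at_fixed_rate(lambda: None, hertz=60.0, n_steps=60) runs for roughly one second
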
Example #3
    def run(self):
        # load if starting from a checkpoint
        wall_t = self._load()

        # get the first state by doing nothing and preprocess the image to 84x84x4
        # skip the reset if an evaluation runs first thing in the loop below
        if self.global_t % self.eval_freq != 0:
            self._reset(hard_reset=True)

        # only executed at the very beginning of training and never again
        if self.global_t == 0 and self.train_with_demo_steps > 0:
            self.train_with_demo_memory_only()

        # load one demo for cam
        if self.load_demo_cam:
            # note: demo_ids must have at least two entries; pad with 0 if only one is given
            demo_cam_id = tuple(map(int, self.demo_cam_id.split(",")))
            if len(demo_cam_id) == 1:
                demo_cam_id = (*demo_cam_id, 0)
            demo_cam, _, total_rewards_cam, _ = load_memory(
                name=None,
                demo_memory_folder=self.demo_memory_folder,
                demo_ids=demo_cam_id,
                imgs_normalized=False)

            max_idx, _ = max(total_rewards_cam.items(), key=lambda a: a[1])
            size_max_idx_mem = len(demo_cam[max_idx])
            self.test_cam_si = np.zeros(
                (size_max_idx_mem, demo_cam[max_idx].height,
                 demo_cam[max_idx].width, demo_cam[max_idx].phi_length),
                dtype=np.float32)
            for i in range(size_max_idx_mem):
                s0, _, _, _, _, _, _, _ = demo_cam[max_idx][i]
                self.test_cam_si[i] = np.copy(s0)
            logger.info("loaded demo {} for testing CAM".format(demo_cam_id))

        # set start time
        start_time = time.time() - wall_t

        logger.info("replay memory size={}".format(self.replay_memory.size))
        sub_total_reward = 0.0
        sub_steps = 0

        while self.global_t < self.train_max_steps:
            # Evaluation of policy
            if self.global_t % self.eval_freq == 0:
                terminal = 0
                total_reward, total_steps, n_episodes = self.test()
                # re-initialize game for training
                self._reset(hard_reset=True)
                sub_total_reward = 0.0
                sub_steps = 0
                time.sleep(0.5)

            if self.global_t % self.copy_freq == 0:
                self.net.update_target_network(slow=False)

            # choose an action epsilon greedily
            ## self._update_state_input(observation)
            readout_t = self.net.evaluate(self.game_state.s_t)[0]
            action = get_action_index(
                readout_t,
                is_random=(random.random() <= self.epsilon
                           or self.global_t <= self.observe),
                n_actions=self.game_state.env.action_space.n)

            # scale down epsilon
            if self.epsilon > self.final_epsilon and self.global_t > self.observe:
                self.epsilon -= (self.init_epsilon -
                                 self.final_epsilon) / self.explore

            ##### HUMAN ADVICE OVERRIDE ACTION #####
            use_advice = False
            if self.use_human_advice and self.psi > self.final_epsilon:
                # After n exploration steps, decay psi
                if (self.global_t - self.observe) >= self.explore:
                    self.psi *= self.init_psi

                # TODO: Determine if I want advice during observation or only during exploration
                if random.random() > self.final_epsilon:
                    psi_cond = True if self.psi == self.init_psi else (
                        self.psi > random.random())
                    if psi_cond:
                        action_advice = self.human_net.evaluate(
                            self.game_state.s_t)[0]
                        action_human = np.argmax(action_advice)
                        if action_advice[action_human] >= self.confidence:
                            action = action_human
                            use_advice = True
            ##### HUMAN ADVICE OVERRIDE ACTION #####

            # Training
            # run the selected action and observe next state and reward
            self.game_state.step(action)
            terminal = self.game_state.terminal
            terminal_ = terminal or ((self.global_t + 1) % self.eval_freq == 0)

            # store the transition in D
            ## self.replay_memory.add_sample(observation, action, reward, (1 if terminal_ else 0))
            self.replay_memory.add(self.game_state.x_t1,
                                   action,
                                   self.game_state.reward,
                                   terminal_,
                                   self.game_state.lives,
                                   fullstate=self.game_state.full_state1)

            # update the old values
            sub_total_reward += self.game_state.reward
            sub_steps += 1
            self.global_t += 1
            self.game_state.update()

            # only train if done observing
            if self.global_t > self.observe and self.global_t % self.update_freq == 0:
                s_j_batch, a_batch, r_batch, terminals, s_j1_batch = self.replay_memory.sample(
                    self.batch, reward_type=self.reward_type)
                # perform gradient step
                self.net.train(s_j_batch, a_batch, r_batch, s_j1_batch,
                               terminals, self.global_t)
                # self.net.add_summary(summary, self.global_t)

            if terminal:
                if get_wrapper_by_name(self.game_state.env,
                                       'EpisodicLifeEnv').was_real_done:
                    self.rewards['train'][self.global_t] = (sub_total_reward,
                                                            sub_steps)
                    score_str = colored("score={}".format(sub_total_reward),
                                        "magenta")
                    steps_str = colored("steps={}".format(sub_steps), "blue")
                    log_data = (self.global_t, score_str, steps_str)
                    logger.debug("train: global_t={} {} {}".format(*log_data))
                    self.net.record_summary(score=sub_total_reward,
                                            steps=sub_steps,
                                            episodes=None,
                                            global_t=self.global_t,
                                            mode='Train')
                    sub_total_reward = 0.0
                    sub_steps = 0
                self._reset(hard_reset=False)

            # save progress every SAVE_FREQ iterations
            if self.global_t % self.save_freq == 0:
                wall_t = time.time() - start_time
                logger.info('Total time: {} seconds'.format(wall_t))
                wall_t_fname = self.folder + '/' + 'wall_t.' + str(
                    self.global_t)
                epsilon_fname = self.folder + '/epsilon'

                logger.info('Now saving data. Please wait')
                with open(wall_t_fname, 'w') as f:
                    f.write(str(wall_t))
                with open(epsilon_fname, 'w') as f:
                    f.write(str(self.epsilon))

                self.net.save(self.global_t)

                self.replay_memory.save(name=self.name,
                                        folder=self.folder,
                                        resize=False)
                pickle.dump(
                    self.rewards,
                    open(
                        self.folder + '/' + self.name.replace('-', '_') +
                        '-dqn-rewards.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
                logger.info('Data saved!')

            # log information
            state = ""
            if self.global_t - 1 < self.observe:
                state = "observe"
            elif self.global_t - 1 < self.observe + self.explore:
                state = "explore"
            else:
                state = "train"

            if (self.global_t - 1) % 10000 == 0:
                if self.use_human_advice:
                    log_data = (state, self.global_t - 1,
                                self.epsilon, self.psi, use_advice, action,
                                np.max(readout_t))
                    logger.debug(
                        "{0:}: global_t={1:} epsilon={2:.4f} psi={3:.4f} "
                        "advice={4:} action={5:} q_max={6:.4f}".format(
                            *log_data))
                else:
                    log_data = (state, self.global_t - 1, self.epsilon, action,
                                np.max(readout_t))
                    logger.debug(
                        "{0:}: global_t={1:} epsilon={2:.4f} action={3:} "
                        "q_max={4:.4f}".format(*log_data))
Example #4
    def test(self, render=False):
        logger.info("Evaluate policy at global_t={}...".format(self.global_t))

        episode_buffer = []
        self.game_state.reset(hard_reset=True)
        episode_buffer.append(self.game_state.get_screen_rgb())

        max_steps = self.eval_max_steps
        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0

        # use one demonstration episode to record CAM
        # only need to make the CAM movie for the demo data once
        # if self.global_t == 0:
        cam, state, action = self.calculate_cam(self.test_cam_si)
        cam_plus_img = []
        cam_side_img = []

        for i in range(len(cam)):
            # overlay cam-state
            overlay = np.uint8(cam[i]).copy()
            output = np.uint8(state[i]).copy()
            alpha = 0.3
            cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)
            # create a title space for action
            title_space = np.zeros((20, 84, 3), np.uint8)
            title_space[:] = (255, 255, 255)
            cv2.putText(title_space, "{}".format(ACTION_MEANING[action[i]]),
                        (20, 14), cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)
            # concatenate title and state
            vcat_output = cv2.vconcat((title_space, output))
            cam_plus_img.append(vcat_output)

            # side-by-side cam-state
            hcat_cam_state = cv2.hconcat(
                (np.uint8(cam[i]).copy(), np.uint8(state[i]).copy()))
            title_space = np.zeros((20, 84 * 2, 3), np.uint8)
            title_space[:] = (255, 255, 255)
            vcat_title_camstate = cv2.vconcat((title_space, hcat_cam_state))
            cv2.putText(vcat_title_camstate,
                        "{}".format(ACTION_MEANING[action[i]]), (20, 14),
                        cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)
            cam_side_img.append(vcat_title_camstate)

        time_per_step = 0.0167
        make_movie(
            cam_plus_img,
            self.folder +
            '/frames/demo-cam_plus_img{ep:010d}'.format(ep=(self.global_t)),
            duration=len(cam) * time_per_step,
            true_image=True,
            salience=False)
        make_movie(
            cam_side_img,
            self.folder +
            '/frames/demo-cam_side_img{ep:010d}'.format(ep=(self.global_t)),
            duration=len(state) * time_per_step,
            true_image=True,
            salience=False)
        del cam, state, action, cam_plus_img, cam_side_img

        while max_steps > 0:
            readout_t = self.net.evaluate(self.game_state.s_t)[0]
            action = get_action_index(
                readout_t,
                is_random=(random.random() <= 0.05),
                n_actions=self.game_state.env.action_space.n)

            # take action
            self.game_state.step(action)
            terminal = self.game_state.terminal

            if n_episodes == 0 and self.global_t % 2000000 == 0:
                episode_buffer.append(self.game_state.get_screen_rgb())

            episode_reward += self.game_state.reward
            episode_steps += 1
            max_steps -= 1

            # s_t = s_t1
            self.game_state.update()

            if terminal:
                if get_wrapper_by_name(self.game_state.env,
                                       'EpisodicLifeEnv').was_real_done:
                    if n_episodes == 0 and self.global_t % 2000000 == 0:
                        time_per_step = 0.0167
                        images = np.array(episode_buffer)
                        make_movie(images,
                                   self.folder +
                                   '/frames/image{ep:010d}'.format(
                                       ep=(self.global_t)),
                                   duration=len(images) * time_per_step,
                                   true_image=True,
                                   salience=False)
                        episode_buffer = []
                    n_episodes += 1
                    score_str = colored("score={}".format(episode_reward),
                                        "magenta")
                    steps_str = colored("steps={}".format(episode_steps),
                                        "blue")
                    log_data = (self.global_t, n_episodes, score_str,
                                steps_str, total_steps)
                    logger.debug(
                        "test: global_t={} trial={} {} {} total_steps={}".
                        format(*log_data))
                    total_reward += episode_reward
                    total_steps += episode_steps
                    episode_reward = 0
                    episode_steps = 0
                self.game_state.reset(hard_reset=False)

        if n_episodes == 0:
            total_reward = episode_reward
            total_steps = episode_steps
        else:
            # average score and steps over the completed episodes
            total_reward = total_reward / n_episodes
            total_steps = total_steps // n_episodes

        log_data = (self.global_t, total_reward, total_steps, n_episodes)
        logger.debug(
            "test: global_t={} final score={} final steps={} # episodes={}".
            format(*log_data))
        self.net.record_summary(score=total_reward,
                                steps=total_steps,
                                episodes=n_episodes,
                                global_t=self.global_t,
                                mode='Test')

        self.rewards['eval'][self.global_t] = (total_reward, total_steps,
                                               n_episodes)
        return total_reward, total_steps, n_episodes
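
The per-frame compositing in the CAM loop above (alpha-blend the heatmap onto the state, then stack a white title bar carrying the action name) can be summarized as below. The helper name and the `action_label` parameter (standing in for `ACTION_MEANING[action[i]]`) are illustrative; the 84-pixel width, 20-pixel title bar, and 0.3 alpha mirror the values used above.

import cv2
import numpy as np

def compose_cam_frame(cam, state, action_label, alpha=0.3, width=84):
    """Illustrative sketch: blend a CAM heatmap over the state and prepend a labeled title bar."""
    overlay = np.uint8(cam).copy()
    output = np.uint8(state).copy()
    # output becomes alpha * overlay + (1 - alpha) * output
    cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)
    # white 20-pixel strip with the action name printed in black
    title = np.full((20, width, 3), 255, np.uint8)
    cv2.putText(title, str(action_label), (20, 14),
                cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)
    return cv2.vconcat((title, output))
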