Example #1
    def generate_cam_video(self,
                           sess,
                           time_per_step,
                           global_t,
                           folder,
                           demo_memory_cam,
                           demo_cam_human=False):
        # use one demonstration trajectory to record the CAM video
        # (the movie for the demo data only needs to be made once)
        cam_side_img = self.generate_cam(sess, demo_memory_cam, global_t)

        path = '/frames/demo-cam_side_img'
        if demo_cam_human:
            path += '_human'

        make_movie(cam_side_img,
                   folder + '{}{ep:010d}'.format(path, ep=(global_t)),
                   duration=len(cam_side_img) * time_per_step,
                   true_image=True,
                   salience=False)
        del cam_side_img
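All of these examples hand a list of RGB frames to a project-specific make_movie helper, with duration computed as frame count times seconds per frame. As a rough stand-in for experimentation (not the project's implementation), a few lines of imageio can turn the same kind of frame list into a clip:

import imageio
import numpy as np

def frames_to_gif(frames, out_path, fps=60):
    """Write a list of HxWx3 uint8 frames to an animated GIF at the given frame rate."""
    imageio.mimsave(out_path, frames, fps=fps)

# Hypothetical usage: 120 black 84x84 frames -> a 2-second clip.
frames = [np.zeros((84, 84, 3), np.uint8) for _ in range(120)]
frames_to_gif(frames, '/tmp/demo_clip.gif', fps=60)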
Example #2
    def run(self, minutes_limit=5, episode=0, num_episodes=0, demo_type=0,
            model_net=None, replay_memory=None, total_memory=0):
        if self.create_movie:
            movie_images = []

        rewards = {'train':[], 'eval':[]}

        full_episode = (minutes_limit == 0)
        timeout = 60 * minutes_limit
        t = 0
        total_reward = 0.0

        # re-initialize game for evaluation
        self._reset(replay_memory, hard_reset=True)

        rew = self.game_state.reward
        terminal = False
        lives = self.game_state.lives
        # loss_life = self.game_state.loss_life
        # gain_life = self.game_state.gain_life and not loss_life

        if self.pause_onstart:
            root = Tk()
            root.withdraw()

            messagebox.showinfo(
                self.name,
                "Start episode {} of {}. total memory={}. "
                "Press OK to start playing".format(episode, num_episodes, total_memory))

        # regular game
        start_time = datetime.datetime.now()
        timeout_start = time.time()

        actions = deque()

        dtm = time.time()
        pulse = 1.0 / self.hertz

        while True:
            dtm += pulse
            delay = dtm - time.time()
            if delay > 0:
                time.sleep(delay)  # pace the loop at self.hertz steps per second (60 Hz)
            else:
                dtm = time.time()

            if not terminal:
                if demo_type == 1:  # RANDOM AGENT
                    action = np.random.randint(self.game_state.n_actions)
                elif demo_type == 2:  # MODEL AGENT
                    if t % self._skip == 0:  # query the model only every _skip-th step
                        self._update_state_input(self.game_state.s_t)
                        readout_t = model_net.evaluate(self.state_input)[0]
                        action = get_action_index(readout_t, is_random=False, n_actions=self.game_state.n_actions)
                else: # HUMAN
                    action = self.game_state.env.human_agent_action

            actions.append(action)
            self.game_state.step(action)
            rew += self.game_state.reward
            lives = self.game_state.lives
            # loss_life = loss_life or self.game_state.loss_life
            # gain_life = (gain_life or self.game_state.gain_life) and not loss_life
            total_reward += self.game_state.reward
            t += 1

            if self.create_movie:
                movie_images.append(self.game_state.get_screen_rgb())

            # Ensure that replay memory D never reaches its maximum size; this
            # mitigates problems when combining different human demo files
            if (replay_memory.size + 3) == replay_memory.max_steps:
                logger.warning("Memory max limit reached!")
                terminal = True
            elif not full_episode:
                terminal = time.time() > timeout_start + timeout

            # add memory every 4th frame even if demo uses skip=1
            if self.game_state.get_episode_frame_number() % self._skip == 0 or terminal or self.game_state.terminal:
                self.obs_buffer[0] = self.game_state.x_t
                self.obs_buffer[1] = self.game_state.x_t1
                max_obs = self.obs_buffer.max(axis=0)
                # cv2.imshow('max obs', max_obs)
                # cv2.imshow('current', self.game_state.x_t1)
                # cv2.waitKey(1)

                # store the transition in D
                replay_memory.add(
                    max_obs,
                    actions.popleft(),
                    rew,
                    terminal or self.game_state.terminal,
                    lives,
                    fullstate=self.game_state.full_state1)
                actions.clear()
                rew = 0

                if terminal or (self.game_state.episode_life and get_wrapper_by_name(self.game_state.env, 'EpisodicLifeEnv').was_real_done):
                    root = Tk()
                    root.withdraw()
                    messagebox.showinfo(self.name, "Time's up!" if terminal else "Game ended!")
                    break

                if self.game_state.terminal:
                    self._reset(replay_memory, hard_reset=False)
                    continue

            self.game_state.update()

        end_time = datetime.datetime.now()
        duration = end_time - start_time
        logger.info("Duration: {}".format(duration))
        logger.info("Total steps: {}".format(t))
        logger.info("Total reward: {}".format(total_reward))
        logger.info("Total Replay memory saved: {}".format(replay_memory.size))

        replay_memory.save(name=self.name, folder=self.folder, resize=True)
        if self.create_movie:
            time_per_step = 0.0167
            make_movie(
                movie_images, str(self.folder / "demo"),
                duration=len(movie_images)*time_per_step,
                true_image=True, salience=False)

        return total_reward, t, start_time, end_time, duration, replay_memory.size
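The dtm/pulse bookkeeping in the loop above is a fixed-rate pacing pattern: it keeps human play sampled at roughly self.hertz steps per second and resynchronizes when a step overruns its time slot. A standalone sketch of the same pattern (names here are illustrative, not from the project):

import time

def run_at_fixed_rate(step_fn, hertz=60.0, max_steps=300):
    """Call step_fn roughly hertz times per second, resyncing after overruns."""
    pulse = 1.0 / hertz
    next_tick = time.time()
    for _ in range(max_steps):
        next_tick += pulse
        delay = next_tick - time.time()
        if delay > 0:
            time.sleep(delay)        # ahead of schedule: wait out the remainder
        else:
            next_tick = time.time()  # fell behind: drop the backlog and resync
        step_fn()

run_at_fixed_rate(lambda: None, hertz=60.0, max_steps=60)  # ~1 second of no-ops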
Example #3
    def testing(self, sess, max_steps, global_t, folder, worker=None):
        """Evaluate A3C."""
        assert worker is not None
        assert not worker.is_refresh_thread
        assert not worker.is_sil_thread

        logger.info("Evaluate policy at global_t={}...".format(global_t))

        # copy weights from shared to local
        sess.run(worker.sync)

        episode_buffer = []
        worker.game_state.reset(hard_reset=True)
        episode_buffer.append(worker.game_state.get_screen_rgb())

        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0

        while max_steps > 0:
            state = cv2.resize(worker.game_state.s_t,
                               worker.local_net.in_shape[:-1],
                               interpolation=cv2.INTER_AREA)
            pi_, value_, logits_ = \
                worker.local_net.run_policy_and_value(sess, state)

            if False:  # set to True to sample the action from pi_ instead
                action = np.random.choice(range(worker.action_size), p=pi_)
            else:
                action = worker.pick_action(logits_)

            # take action
            worker.game_state.step(action)
            terminal = worker.game_state.terminal

            if n_episodes == 0 and global_t % 5000000 == 0:
                episode_buffer.append(worker.game_state.get_screen_rgb())

            episode_reward += worker.game_state.reward
            episode_steps += 1
            max_steps -= 1

            # s_t = s_t1
            worker.game_state.update()

            if terminal:
                env = worker.game_state.env
                name = 'EpisodicLifeEnv'
                if get_wrapper_by_name(env, name).was_real_done:
                    # make a video every 5M training steps, using the first episode tested
                    if n_episodes == 0 and global_t % 5000000 == 0:
                        time_per_step = 0.0167
                        images = np.array(episode_buffer)
                        file = 'frames/image{ep:010d}'.format(ep=global_t)
                        duration = len(images) * time_per_step
                        make_movie(images,
                                   str(folder / file),
                                   duration=duration,
                                   true_image=True,
                                   salience=False)
                        episode_buffer = []

                    n_episodes += 1
                    score_str = colored("score={}".format(episode_reward),
                                        "yellow")
                    steps_str = colored("steps={}".format(episode_steps),
                                        "cyan")
                    log_data = (global_t, worker.thread_idx, self.thread_idx,
                                n_episodes, score_str, steps_str, total_steps)
                    logger.debug(
                        "test: global_t={} test_worker={} cur_worker={}"
                        " trial={} {} {}"
                        " total_steps={}".format(*log_data))
                    total_reward += episode_reward
                    total_steps += episode_steps
                    episode_reward = 0
                    episode_steps = 0

                worker.game_state.reset(hard_reset=False)

        if n_episodes == 0:
            total_reward = episode_reward
            total_steps = episode_steps
        else:
            total_reward = total_reward / n_episodes
            total_steps = total_steps // n_episodes

        log_data = (global_t, worker.thread_idx, self.thread_idx, total_reward,
                    total_steps, n_episodes)
        logger.info("test: global_t={} test_worker={} cur_worker={}"
                    " final score={} final steps={}"
                    " # trials={}".format(*log_data))

        worker.record_summary(score=total_reward,
                              steps=total_steps,
                              episodes=n_episodes,
                              global_t=global_t,
                              mode='A3C_Test')

        # reset variables used in training
        worker.episode_reward = 0
        worker.episode_steps = 0
        worker.game_state.reset(hard_reset=True)
        worker.last_rho = 0.

        if worker.use_sil:
            # ensure no states left from a non-terminating episode
            worker.episode.reset()
        return (total_reward, total_steps, n_episodes)
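get_wrapper_by_name is how these examples reach the EpisodicLifeEnv wrapper and check was_real_done, i.e. whether the underlying game truly ended rather than one life being lost. The helper itself is not shown here; a plausible sketch, assuming standard Gym-style wrappers that expose the wrapped environment via .env:

def get_wrapper_by_name(env, classname):
    """Walk the wrapper chain and return the first wrapper whose class name matches."""
    current = env
    while True:
        if current.__class__.__name__ == classname:
            return current
        if hasattr(current, 'env'):
            current = current.env  # descend one wrapper level
        else:
            raise ValueError("Wrapper {} not found".format(classname))

# Usage as in the examples above (env is a wrapped Atari environment):
# real_done = get_wrapper_by_name(env, 'EpisodicLifeEnv').was_real_done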
Example #4
    def testing(self, sess, max_steps, global_t, folder, demo_memory_cam=None):
        logger.info("Evaluate policy at global_t={}...".format(global_t))
        # copy weights from shared to local
        sess.run(self.sync)

        if demo_memory_cam is not None and global_t % 5000000 == 0:
            self.generate_cam_video(sess, 0.03, global_t, folder,
                                    demo_memory_cam)

        episode_buffer = []
        self.game_state.reset(hard_reset=True)
        episode_buffer.append(self.game_state.get_screen_rgb())

        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0
        while max_steps > 0:
            #pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            pi_, value_, logits_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            if False:  # set to True to sample the action from pi_ instead
                action = np.random.choice(range(self.action_size), p=pi_)
            else:
                action = self.choose_action(logits_)

            if self.use_pretrained_model_as_advice:
                psi = self.psi if self.psi > 0.001 else 0.0
                if psi > np.random.rand():
                    model_pi = self.pretrained_model.run_policy(
                        self.pretrained_model_sess, self.game_state.s_t)
                    model_action, confidence = self.choose_action_with_high_confidence(
                        model_pi, exclude_noop=False)
                    if model_action > self.shaping_actions and confidence >= self.advice_confidence:
                        action = model_action

            # take action
            self.game_state.step(action)
            terminal = self.game_state.terminal

            if n_episodes == 0 and global_t % 5000000 == 0:
                episode_buffer.append(self.game_state.get_screen_rgb())

            episode_reward += self.game_state.reward
            episode_steps += 1
            max_steps -= 1

            # s_t = s_t1
            self.game_state.update()

            if terminal:
                if get_wrapper_by_name(self.game_state.env,
                                       'EpisodicLifeEnv').was_real_done:
                    if n_episodes == 0 and global_t % 5000000 == 0:
                        time_per_step = 0.0167
                        images = np.array(episode_buffer)
                        make_movie(
                            images,
                            folder +
                            '/frames/image{ep:010d}'.format(ep=global_t),
                            duration=len(images) * time_per_step,
                            true_image=True,
                            salience=False)
                        episode_buffer = []
                    n_episodes += 1
                    score_str = colored("score={}".format(episode_reward),
                                        "magenta")
                    steps_str = colored("steps={}".format(episode_steps),
                                        "blue")
                    log_data = (global_t, self.thread_index, n_episodes,
                                score_str, steps_str, total_steps)
                    logger.debug(
                        "test: global_t={} worker={} trial={} {} {} total_steps={}"
                        .format(*log_data))
                    total_reward += episode_reward
                    total_steps += episode_steps
                    episode_reward = 0
                    episode_steps = 0

                self.game_state.reset(hard_reset=False)
                if self.use_lstm:
                    self.local_network.reset_state()

        if n_episodes == 0:
            total_reward = episode_reward
            total_steps = episode_steps
        else:
            # average the score and steps over the completed evaluation episodes
            total_reward = total_reward / n_episodes
            total_steps = total_steps // n_episodes

        log_data = (global_t, self.thread_index, total_reward, total_steps,
                    n_episodes)
        logger.info(
            "test: global_t={} worker={} final score={} final steps={} # trials={}"
            .format(*log_data))

        self.record_summary(score=total_reward,
                            steps=total_steps,
                            episodes=n_episodes,
                            global_t=global_t,
                            mode='Test')

        # reset variables used in training
        self.episode_reward = 0
        self.episode_steps = 0
        self.game_state.reset(hard_reset=True)
        self.last_rho = 0.
        if self.is_demo_thread:
            self.replay_mem_reset()

        if self.use_lstm:
            self.local_network.reset_state()
        return total_reward, total_steps, n_episodes
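Example #4's use_pretrained_model_as_advice branch occasionally (with probability psi) queries a pretrained policy and lets it override the learner's action when its confidence clears a threshold. A distilled sketch of that gating, with illustrative names and without the extra shaping-action check from the original:

import numpy as np

def maybe_take_advice(policy_action, advisor_probs, psi=0.1,
                      confidence_threshold=0.75, rng=np.random):
    """With probability psi, replace policy_action by the advisor's most confident
    action, but only if that confidence is at least confidence_threshold."""
    if psi > rng.rand():
        advised = int(np.argmax(advisor_probs))
        if float(advisor_probs[advised]) >= confidence_threshold:
            return advised
    return policy_action

# Hypothetical usage: the advisor strongly prefers action 2; psi gates how often it is consulted.
action = maybe_take_advice(0, np.array([0.1, 0.1, 0.8]), psi=0.5)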
Example #5
    def testing_model(self,
                      sess,
                      max_steps,
                      global_t,
                      folder,
                      demo_memory_cam=None,
                      demo_cam_human=False):
        logger.info("Testing model at global_t={}...".format(global_t))
        # copy weights from shared to local
        sess.run(self.sync)

        if demo_memory_cam is not None:
            self.generate_cam_video(sess, 0.03, global_t, folder,
                                    demo_memory_cam, demo_cam_human)
            return
        else:
            self.game_state.reset(hard_reset=True)
            max_steps += 4
            test_memory = ReplayMemory(
                84,
                84,
                np.random.RandomState(),
                max_steps=max_steps,
                phi_length=4,
                num_actions=self.game_state.env.action_space.n,
                wrap_memory=False,
                full_state_size=self.game_state.clone_full_state().shape[0])
            for _ in range(4):
                test_memory.add(self.game_state.x_t,
                                0,
                                self.game_state.reward,
                                self.game_state.terminal,
                                self.game_state.lives,
                                fullstate=self.game_state.full_state)

        episode_buffer = []
        test_memory_cam = []

        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0
        terminal = False
        while True:
            #pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            test_memory_cam.append(self.game_state.s_t)
            episode_buffer.append(self.game_state.get_screen_rgb())
            pi_, value_, logits_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            #action = self.choose_action(logits_)
            action = np.argmax(pi_)

            # take action
            self.game_state.step(action)
            terminal = self.game_state.terminal
            memory_full = episode_steps == max_steps - 5
            terminal_ = terminal or memory_full

            # store the transition to replay memory
            test_memory.add(self.game_state.x_t1,
                            action,
                            self.game_state.reward,
                            terminal_,
                            self.game_state.lives,
                            fullstate=self.game_state.full_state1)

            # update the old values
            episode_reward += self.game_state.reward
            episode_steps += 1

            # s_t = s_t1
            self.game_state.update()

            if terminal_:
                if get_wrapper_by_name(
                        self.game_state.env,
                        'EpisodicLifeEnv').was_real_done or memory_full:
                    time_per_step = 0.03
                    images = np.array(episode_buffer)
                    make_movie(images,
                               folder +
                               '/frames/image{ep:010d}'.format(ep=global_t),
                               duration=len(images) * time_per_step,
                               true_image=True,
                               salience=False)
                    break

                self.game_state.reset(hard_reset=False)
                if self.use_lstm:
                    self.local_network.reset_state()

        total_reward = episode_reward
        total_steps = episode_steps
        log_data = (global_t, self.thread_index, total_reward, total_steps)
        logger.info(
            "test: global_t={} worker={} final score={} final steps={}".format(
                *log_data))

        self.generate_cam_video(sess, 0.03, global_t, folder,
                                np.array(test_memory_cam))
        test_memory.save(name='test_cam', folder=folder, resize=True)

        if self.use_lstm:
            self.local_network.reset_state()

        return
Example #6
    def test(self, render=False):
        logger.info("Evaluate policy at global_t={}...".format(self.global_t))

        episode_buffer = []
        self.game_state.reset(hard_reset=True)
        episode_buffer.append(self.game_state.get_screen_rgb())

        max_steps = self.eval_max_steps
        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0

        # use one demonstration data to record cam
        # only need to make movie for demo data once
        # if self.global_t == 0:
        cam, state, action = self.calculate_cam(self.test_cam_si)
        cam_plus_img = []
        cam_side_img = []

        for i in range(len(cam)):
            # overlay cam-state
            overlay = np.uint8(cam[i]).copy()
            output = np.uint8(state[i]).copy()
            alpha = 0.3
            cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)
            # create a title space for action
            title_space = np.zeros((20, 84, 3), np.uint8)
            title_space[:] = (255, 255, 255)
            cv2.putText(title_space, "{}".format(ACTION_MEANING[action[i]]),
                        (20, 14), cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)
            # concatenate title and state
            vcat_output = cv2.vconcat((title_space, output))
            cam_plus_img.append(vcat_output)

            # side-by-side cam-state
            hcat_cam_state = cv2.hconcat(
                (np.uint8(cam[i]).copy(), np.uint8(state[i]).copy()))
            title_space = np.zeros((20, 84 * 2, 3), np.uint8)
            title_space[:] = (255, 255, 255)
            vcat_title_camstate = cv2.vconcat((title_space, hcat_cam_state))
            cv2.putText(vcat_title_camstate,
                        "{}".format(ACTION_MEANING[action[i]]), (20, 14),
                        cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)
            cam_side_img.append(vcat_title_camstate)

        time_per_step = 0.0167
        make_movie(
            cam_plus_img,
            self.folder +
            '/frames/demo-cam_plus_img{ep:010d}'.format(ep=(self.global_t)),
            duration=len(cam) * time_per_step,
            true_image=True,
            salience=False)
        make_movie(
            cam_side_img,
            self.folder +
            '/frames/demo-cam_side_img{ep:010d}'.format(ep=(self.global_t)),
            duration=len(state) * time_per_step,
            true_image=True,
            salience=False)
        del cam, state, action, cam_plus_img, cam_side_img

        while max_steps > 0:
            readout_t = self.net.evaluate(self.game_state.s_t)[0]
            action = get_action_index(
                readout_t,
                is_random=(random.random() <= 0.05),
                n_actions=self.game_state.env.action_space.n)

            # take action
            self.game_state.step(action)
            terminal = self.game_state.terminal

            if n_episodes == 0 and self.global_t % 2000000 == 0:
                episode_buffer.append(self.game_state.get_screen_rgb())

            episode_reward += self.game_state.reward
            episode_steps += 1
            max_steps -= 1

            # s_t = s_t1
            self.game_state.update()

            if terminal:
                if get_wrapper_by_name(self.game_state.env,
                                       'EpisodicLifeEnv').was_real_done:
                    if n_episodes == 0 and self.global_t % 2000000 == 0:
                        time_per_step = 0.0167
                        images = np.array(episode_buffer)
                        make_movie(images,
                                   self.folder +
                                   '/frames/image{ep:010d}'.format(
                                       ep=(self.global_t)),
                                   duration=len(images) * time_per_step,
                                   true_image=True,
                                   salience=False)
                        episode_buffer = []
                    n_episodes += 1
                    score_str = colored("score={}".format(episode_reward),
                                        "magenta")
                    steps_str = colored("steps={}".format(episode_steps),
                                        "blue")
                    log_data = (self.global_t, n_episodes, score_str,
                                steps_str, total_steps)
                    logger.debug(
                        "test: global_t={} trial={} {} {} total_steps={}".
                        format(*log_data))
                    total_reward += episode_reward
                    total_steps += episode_steps
                    episode_reward = 0
                    episode_steps = 0
                self.game_state.reset(hard_reset=False)

        if n_episodes == 0:
            total_reward = episode_reward
            total_steps = episode_steps
        else:
            # average the score and steps over the completed evaluation episodes
            total_reward = total_reward / n_episodes
            total_steps = total_steps // n_episodes

        log_data = (self.global_t, total_reward, total_steps, n_episodes)
        logger.debug(
            "test: global_t={} final score={} final steps={} # episodes={}".
            format(*log_data))
        self.net.record_summary(score=total_reward,
                                steps=total_steps,
                                episodes=n_episodes,
                                global_t=self.global_t,
                                mode='Test')

        self.rewards['eval'][self.global_t] = (total_reward, total_steps,
                                               n_episodes)
        return total_reward, total_steps, n_episodes
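The DQN-style evaluation in Example #6 picks actions epsilon-greedily: get_action_index(readout_t, is_random=(random.random() <= 0.05), ...) takes a random action about 5% of the time and the greedy one otherwise. A minimal sketch of what such a helper typically does (the project's actual signature may differ):

import random
import numpy as np

def get_action_index(readout, is_random=False, n_actions=None):
    """Return a uniformly random action index if is_random, else the argmax of readout."""
    if is_random:
        return random.randrange(n_actions)
    return int(np.argmax(readout))

# Hypothetical Q-values for a 6-action game, evaluated with epsilon = 0.05:
q_values = np.array([0.1, 0.4, 0.05, 0.2, 0.15, 0.1])
action = get_action_index(q_values, is_random=(random.random() <= 0.05), n_actions=6)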