Example #1
 def reset(self, hard_reset=False):
     if self.episode_life and hard_reset:
         get_wrapper_by_name(self.env,
                             'EpisodicLifeEnv').was_real_done = True
     x_t = self.env.reset()
     self.prev_x_t = x_t
     self.x_t = x_t
     self.s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
     self.full_state = self.env.unwrapped.clone_full_state()
     self.lives = self.env.unwrapped.ale.lives()
     self.reward = 0
     self.terminal = False
     self.loss_life = False
     self.gain_life = False
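
These snippets repeatedly call a `get_wrapper_by_name` helper that is never shown. A minimal sketch of what such a helper typically does (walking the gym wrapper chain until the class name matches) is given below; the `env.env` traversal and the `ValueError` are assumptions, not the project's actual implementation.

import gym


def get_wrapper_by_name(env, classname):
    # walk the wrapper chain env -> env.env -> ... until the class name matches
    wrapper = env
    while True:
        if wrapper.__class__.__name__ == classname:
            return wrapper
        if isinstance(wrapper, gym.Wrapper):
            wrapper = wrapper.env
        else:
            raise ValueError('wrapper {} not found'.format(classname))
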
Example #2
    def test_game(self, sess):
        self.game_state.reset(hard_reset=True)

        max_steps = 25000
        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0
        while max_steps > 0:
            model_pi = self.net.run_policy(sess, self.game_state.s_t)
            action, confidence = self.choose_action_with_high_confidence(model_pi, exclude_noop=False)

            # take action
            self.game_state.step(action)
            terminal = self.game_state.terminal
            episode_reward += self.game_state.reward
            episode_steps += 1
            max_steps -= 1

            # s_t = s_t1
            self.game_state.update()

            if terminal:
                if get_wrapper_by_name(self.game_state.env, 'EpisodicLifeEnv').was_real_done:
                    n_episodes += 1
                    score_str = colored("score={}".format(episode_reward), "magenta")
                    steps_str = colored("steps={}".format(episode_steps), "blue")
                    log_data = (n_episodes, score_str, steps_str, total_steps)
                    #logger.debug("test: trial={} {} {} total_steps={}".format(*log_data))
                    total_reward += episode_reward
                    total_steps += episode_steps
                    episode_reward = 0
                    episode_steps = 0

                self.game_state.reset(hard_reset=False)

        if n_episodes == 0:
            total_reward = episode_reward
            total_steps = episode_steps
        else:
            # (timestep, total sum of rewards, total # of steps before terminating)
            total_reward = total_reward / n_episodes
            total_steps = total_steps // n_episodes

        log_data = (total_reward, total_steps, n_episodes)
        logger.info("test: final score={} final steps={} # trials={}".format(*log_data))
        return log_data
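
`choose_action_with_high_confidence` is called throughout these examples but never defined. Given how it is used (it returns an action and a confidence from a policy output, with an `exclude_noop` flag), a plausible standalone sketch is shown below; the flat-probability-vector shape and the assumption that action 0 is NOOP are guesses, not the actual method.

import numpy as np


def choose_action_with_high_confidence(model_pi, exclude_noop=True):
    # pick the highest-probability action; optionally skip NOOP (assumed index 0)
    pi = np.asarray(model_pi).flatten()
    start = 1 if exclude_noop else 0
    action = start + int(np.argmax(pi[start:]))
    confidence = float(pi[action])
    return action, confidence
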
Example #3
    def test_game(self, sess):
        """Evaluate game with current network model.

        Keyword argument:
        sess -- tf session
        """
        self.game_state.reset(hard_reset=True)

        max_steps = 25000
        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0
        while max_steps > 0:
            state = cv2.resize(self.game_state.s_t,
                               self.net.in_shape[:-1],
                               interpolation=cv2.INTER_AREA)
            model_pi = self.net.run_policy(sess, state)
            action, confidence = self.choose_action_with_high_confidence(
                model_pi, exclude_noop=False)

            # take action
            self.game_state.step(action)
            terminal = self.game_state.terminal
            episode_reward += self.game_state.reward
            episode_steps += 1
            max_steps -= 1

            # s_t = s_t1
            self.game_state.update()

            if terminal:
                was_real_done = get_wrapper_by_name(
                    self.game_state.env, 'EpisodicLifeEnv').was_real_done

                if was_real_done:
                    n_episodes += 1
                    score_str = colored("score={}".format(
                        episode_reward), "magenta")
                    steps_str = colored("steps={}".format(
                        episode_steps), "blue")
                    log_data = (n_episodes, score_str, steps_str, total_steps)
                    logger.debug("test: trial={} {} {} total_steps={}"
                                 .format(*log_data))
                    total_reward += episode_reward
                    total_steps += episode_steps
                    episode_reward = 0
                    episode_steps = 0

                self.game_state.reset(hard_reset=False)

        if n_episodes == 0:
            total_reward = episode_reward
            total_steps = episode_steps
        else:
            total_reward = total_reward / n_episodes
            total_steps = total_steps // n_episodes

        log_data = (total_reward, total_steps, n_episodes)
        logger.info("test: final score={} final steps={} # trials={}"
                    .format(*log_data))
        return log_data
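
Example #3 and several later examples resize the stacked observation to the network input shape with `cv2.INTER_AREA`. Note that `cv2.resize` expects its target size as (width, height); for the square inputs used here the ordering does not matter. A tiny illustration on a dummy 84x84x4 stack (shapes are illustrative, not taken from the repository):

import cv2
import numpy as np

s_t = np.random.rand(84, 84, 4).astype(np.float32)  # stacked frames (H, W, C)
in_shape = (88, 88, 4)                               # hypothetical net input shape

# in_shape[:-1] drops the channel dimension; channels are preserved by resize
state = cv2.resize(s_t, in_shape[:-1], interpolation=cv2.INTER_AREA)
print(state.shape)  # (88, 88, 4)
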
Example #4
def test_keys(env_id):
    import cv2
    from skimage.measure import compare_ssim
    from skimage import io, filters
    from collections import deque
    test_game = GameState(env_id=env_id, display=True, human_demo=True)
    terminal = False
    skip = 0
    state = test_game.x_t
    sys_state = None
    sys_states = deque(maxlen=100)
    last_num_steps = 0
    last_num_ctr = 0
    max_repeat = 5
    while True:
        sys_state = test_game.clone_full_state()
        sys_states.append((sys_state, test_game.get_episode_frame_number()))
        a = test_game.env.human_agent_action
        test_game.step(a)
        # new_state = test_game.x_t
        # (score, diff) = compare_ssim(state, new_state, full=True)
        # logger.info("SSIM: {}".format(score))
        # state = new_state
        # edges = filters.sobel(state)
        # cv2.imshow("edges", test_game.x_t)
        # cv2.waitKey(1)
        if test_game.gain_life:
            logger.info("Gain Life")
        if test_game.loss_life:
            logger.warn("Lost life!")
            logger.info("frame number={}".format(
                test_game.get_episode_frame_number()))
            restore = True
            last_num_ctr += 1
            if last_num_steps == 0:
                last_num_steps = len(sys_states)
                logger.info('last_num_steps={}'.format(last_num_steps))
            elif last_num_steps > len(sys_states):
                logger.info('last_num_ctr={}'.format(last_num_ctr))
                if last_num_ctr == max_repeat:
                    restore = False
            if restore:
                full_state, frame_num = sys_states.popleft()
                logger.info("\trestore frame number={}".format(frame_num))
                test_game.restore_full_state(full_state)
            steps = 0
            sys_states.clear()
        if test_game.reward > 0:
            last_num_steps = 0
            last_num_ctr = 0
            sys_states.clear()
        elif test_game.reward < 0:
            logger.info("reward={}".format(test_game.reward))
            restore = True
            last_num_ctr += 1
            if last_num_steps == 0:
                last_num_steps = len(sys_states)
                logger.info('last_num_steps={}'.format(last_num_steps))
            elif last_num_steps > len(sys_states):
                logger.info('last_num_ctr={}'.format(last_num_ctr))
                if last_num_ctr == max_repeat:
                    restore = False
            if restore:
                full_state, frame_num = sys_states.popleft()
                logger.info("\trestore frame number={}".format(frame_num))
                test_game.restore_full_state(full_state)
            steps = 0
            sys_states.clear()

        if get_wrapper_by_name(test_game.env, 'EpisodicLifeEnv').was_real_done:
            break
        elif test_game.terminal:
            test_game.reset(hard_reset=False)
        sleep(0.0167)

    # cv2.destroyAllWindows()
    test_game.close()
    del test_game.env
    del test_game
    def testing(self, sess, max_steps, global_t, folder, worker=None):
        """Evaluate A3C."""
        assert worker is not None
        assert not worker.is_refresh_thread
        assert not worker.is_sil_thread

        logger.info("Evaluate policy at global_t={}...".format(global_t))

        # copy weights from shared to local
        sess.run(worker.sync)

        episode_buffer = []
        worker.game_state.reset(hard_reset=True)
        episode_buffer.append(worker.game_state.get_screen_rgb())

        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0

        while max_steps > 0:
            state = cv2.resize(worker.game_state.s_t,
                               worker.local_net.in_shape[:-1],
                               interpolation=cv2.INTER_AREA)
            pi_, value_, logits_ = \
                worker.local_net.run_policy_and_value(sess, state)

            # greedy/deterministic action for evaluation; a stochastic
            # alternative would sample from the policy directly:
            # action = np.random.choice(range(worker.action_size), p=pi_)
            action = worker.pick_action(logits_)

            # take action
            worker.game_state.step(action)
            terminal = worker.game_state.terminal

            if n_episodes == 0 and global_t % 5000000 == 0:
                episode_buffer.append(worker.game_state.get_screen_rgb())

            episode_reward += worker.game_state.reward
            episode_steps += 1
            max_steps -= 1

            # s_t = s_t1
            worker.game_state.update()

            if terminal:
                env = worker.game_state.env
                name = 'EpisodicLifeEnv'
                if get_wrapper_by_name(env, name).was_real_done:
                    # make a video every 5M training steps, using the first episode tested
                    if n_episodes == 0 and global_t % 5000000 == 0:
                        time_per_step = 0.0167
                        images = np.array(episode_buffer)
                        file = 'frames/image{ep:010d}'.format(ep=global_t)
                        duration = len(images) * time_per_step
                        make_movie(images,
                                   str(folder / file),
                                   duration=duration,
                                   true_image=True,
                                   salience=False)
                        episode_buffer = []

                    n_episodes += 1
                    score_str = colored("score={}".format(episode_reward),
                                        "yellow")
                    steps_str = colored("steps={}".format(episode_steps),
                                        "cyan")
                    log_data = (global_t, worker.thread_idx, self.thread_idx,
                                n_episodes, score_str, steps_str, total_steps)
                    logger.debug(
                        "test: global_t={} test_worker={} cur_worker={}"
                        " trial={} {} {}"
                        " total_steps={}".format(*log_data))
                    total_reward += episode_reward
                    total_steps += episode_steps
                    episode_reward = 0
                    episode_steps = 0

                worker.game_state.reset(hard_reset=False)

        if n_episodes == 0:
            total_reward = episode_reward
            total_steps = episode_steps
        else:
            total_reward = total_reward / n_episodes
            total_steps = total_steps // n_episodes

        log_data = (global_t, worker.thread_idx, self.thread_idx, total_reward,
                    total_steps, n_episodes)
        logger.info("test: global_t={} test_worker={} cur_worker={}"
                    " final score={} final steps={}"
                    " # trials={}".format(*log_data))

        worker.record_summary(score=total_reward,
                              steps=total_steps,
                              episodes=n_episodes,
                              global_t=global_t,
                              mode='A3C_Test')

        # reset variables used in training
        worker.episode_reward = 0
        worker.episode_steps = 0
        worker.game_state.reset(hard_reset=True)
        worker.last_rho = 0.

        if worker.use_sil:
            # ensure no states left from a non-terminating episode
            worker.episode.reset()
        return (total_reward, total_steps, n_episodes)
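
`make_movie` is an external helper used above to dump evaluation episodes to video; it is not shown in these examples. A rough stand-in built on moviepy's `ImageSequenceClip` is sketched below. The signature mirrors the call sites (`duration`, `true_image`, `salience`), but the body is an assumption, not the project's implementation.

import numpy as np
from moviepy.editor import ImageSequenceClip


def make_movie(images, fname, duration=2.0, true_image=False, salience=False):
    # write a sequence of RGB frames to `fname`.mp4
    # sketch only: `true_image` and `salience` are accepted to match the
    # call sites above but ignored here
    frames = [np.asarray(img).astype(np.uint8) for img in images]
    fps = max(1, int(round(len(frames) / duration)))
    clip = ImageSequenceClip(frames, fps=fps)
    clip.write_videofile(fname + '.mp4', fps=fps)
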
    def test_loaded_classifier(self,
                               global_t,
                               max_eps,
                               sess,
                               worker=None,
                               model=None):
        """Evaluate game with current classifier model."""
        assert model is not None
        assert sess is not None
        assert worker is not None

        logger.info(
            "Testing loaded classifier at global_t={}...".format(global_t))

        worker.game_state.reset(hard_reset=True)

        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0
        reward_list = []

        # test the loaded classifier for max_eps (e.g. 50) episodes
        while n_episodes < max_eps:
            state = cv2.resize(worker.game_state.s_t,
                               model.in_shape[:-1],
                               interpolation=cv2.INTER_AREA)
            model_pi = model.run_policy(sess, state)

            action, _ = self.choose_action_with_high_confidence(
                model_pi, exclude_noop=False)

            # take action
            worker.game_state.step(action)
            terminal = worker.game_state.terminal
            episode_reward += worker.game_state.reward
            episode_steps += 1

            # s_t = s_t1
            worker.game_state.update()

            if terminal:
                was_real_done = get_wrapper_by_name(
                    worker.game_state.env, 'EpisodicLifeEnv').was_real_done

                if was_real_done:
                    n_episodes += 1
                    score_str = colored("score={}".format(episode_reward),
                                        "magenta")
                    steps_str = colored("steps={}".format(episode_steps),
                                        "blue")
                    log_data = (n_episodes, score_str, steps_str,
                                worker.thread_idx, self.thread_idx,
                                total_steps)
                    logger.debug(
                        "(fixed) classifier test: trial={} {} {} "
                        "test_worker={} cur_worker={} total_steps={}".format(
                            *log_data))
                    total_reward += episode_reward
                    reward_list.append(episode_reward)
                    total_steps += episode_steps
                    episode_reward = 0
                    episode_steps = 0

                worker.game_state.reset(hard_reset=False)

        if n_episodes == 0:
            total_reward = episode_reward
            total_steps = episode_steps
        else:
            total_reward = total_reward / n_episodes
            total_steps = total_steps // n_episodes

        log_data = (global_t, worker.thread_idx, self.thread_idx, total_reward,
                    total_steps, n_episodes)
        logger.info(
            "classifier test: global_t={} test_worker={} cur_worker={} "
            "final score={} final steps={} # trials={}".format(*log_data))
        self.record_summary(score=total_reward,
                            steps=total_steps,
                            episodes=n_episodes,
                            global_t=global_t,
                            mode='Classifier_Test')

        return (total_reward, total_steps, n_episodes, reward_list)
Example #7
    def testing(self, sess, max_steps, global_t, folder, demo_memory_cam=None):
        logger.info("Evaluate policy at global_t={}...".format(global_t))
        # copy weights from shared to local
        sess.run(self.sync)

        if demo_memory_cam is not None and global_t % 5000000 == 0:
            self.generate_cam_video(sess, 0.03, global_t, folder,
                                    demo_memory_cam)

        episode_buffer = []
        self.game_state.reset(hard_reset=True)
        episode_buffer.append(self.game_state.get_screen_rgb())

        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0
        while max_steps > 0:
            #pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            pi_, value_, logits_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            # greedy/deterministic action for evaluation; a stochastic
            # alternative would sample from the policy directly:
            # action = np.random.choice(range(self.action_size), p=pi_)
            action = self.choose_action(logits_)

            if self.use_pretrained_model_as_advice:
                psi = self.psi if self.psi > 0.001 else 0.0
                if psi > np.random.rand():
                    model_pi = self.pretrained_model.run_policy(
                        self.pretrained_model_sess, self.game_state.s_t)
                    model_action, confidence = self.choose_action_with_high_confidence(
                        model_pi, exclude_noop=False)
                    if model_action > self.shaping_actions and confidence >= self.advice_confidence:
                        action = model_action

            # take action
            self.game_state.step(action)
            terminal = self.game_state.terminal

            if n_episodes == 0 and global_t % 5000000 == 0:
                episode_buffer.append(self.game_state.get_screen_rgb())

            episode_reward += self.game_state.reward
            episode_steps += 1
            max_steps -= 1

            # s_t = s_t1
            self.game_state.update()

            if terminal:
                if get_wrapper_by_name(self.game_state.env,
                                       'EpisodicLifeEnv').was_real_done:
                    if n_episodes == 0 and global_t % 5000000 == 0:
                        time_per_step = 0.0167
                        images = np.array(episode_buffer)
                        make_movie(
                            images,
                            folder +
                            '/frames/image{ep:010d}'.format(ep=global_t),
                            duration=len(images) * time_per_step,
                            true_image=True,
                            salience=False)
                        episode_buffer = []
                    n_episodes += 1
                    score_str = colored("score={}".format(episode_reward),
                                        "magenta")
                    steps_str = colored("steps={}".format(episode_steps),
                                        "blue")
                    log_data = (global_t, self.thread_index, n_episodes,
                                score_str, steps_str, total_steps)
                    logger.debug(
                        "test: global_t={} worker={} trial={} {} {} total_steps={}"
                        .format(*log_data))
                    total_reward += episode_reward
                    total_steps += episode_steps
                    episode_reward = 0
                    episode_steps = 0

                self.game_state.reset(hard_reset=False)
                if self.use_lstm:
                    self.local_network.reset_state()

        if n_episodes == 0:
            total_reward = episode_reward
            total_steps = episode_steps
        else:
            # (timestep, total sum of rewards, total # of steps before terminating)
            total_reward = total_reward / n_episodes
            total_steps = total_steps // n_episodes

        log_data = (global_t, self.thread_index, total_reward, total_steps,
                    n_episodes)
        logger.info(
            "test: global_t={} worker={} final score={} final steps={} # trials={}"
            .format(*log_data))

        self.record_summary(score=total_reward,
                            steps=total_steps,
                            episodes=n_episodes,
                            global_t=global_t,
                            mode='Test')

        # reset variables used in training
        self.episode_reward = 0
        self.episode_steps = 0
        self.game_state.reset(hard_reset=True)
        self.last_rho = 0.
        if self.is_demo_thread:
            self.replay_mem_reset()

        if self.use_lstm:
            self.local_network.reset_state()
        return total_reward, total_steps, n_episodes
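
The workers above pick actions with `choose_action(logits_)` / `pick_action(logits_)` rather than sampling from `pi_` directly. A common way to do this is to sample from the softmax of the logits; the helper below is a sketch of that idea, not the repository's exact method.

import numpy as np


def pick_action(logits):
    # sample an action index from unnormalized logits (sketch)
    logits = np.asarray(logits, dtype=np.float64).flatten()
    # subtract the max for numerical stability before exponentiating
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))
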
Example #8
    def process(self, sess, global_t, train_rewards):
        states = []
        actions = []
        rewards = []
        values = []
        rho = []

        terminal_end = False

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if self.use_lstm:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(self.local_t_max):
            pi_, value_, logits_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            action = self.choose_action(logits_)

            model_pi = None
            confidence = 0.
            if self.use_pretrained_model_as_advice:
                self.psi = (0.9999 * (0.9999 ** global_t)
                            if self.psi > 0.001 else 0.0)  # 0.99995 works
                if self.psi > np.random.rand():
                    model_pi = self.pretrained_model.run_policy(
                        self.pretrained_model_sess, self.game_state.s_t)
                    model_action, confidence = self.choose_action_with_high_confidence(
                        model_pi, exclude_noop=False)
                    if (model_action > self.shaping_actions
                            and confidence >= self.advice_confidence):
                        action = model_action
                        self.advice_ctr += 1
            if self.use_pretrained_model_as_reward_shaping:
                #if action > 0:
                if model_pi is None:
                    model_pi = self.pretrained_model.run_policy(
                        self.pretrained_model_sess, self.game_state.s_t)
                    confidence = model_pi[action][0][0]
                if (action > self.shaping_actions
                        and confidence >= self.advice_confidence):
                    #rho.append(round(confidence, 5))
                    rho.append(self.shaping_reward)
                    self.shaping_ctr += 1
                else:
                    rho.append(0.)
                #self.shaping_ctr += 1

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            if self.thread_index == 0 and self.local_t % self.log_interval == 0:
                log_msg1 = "lg={}".format(
                    np.array_str(logits_, precision=4, suppress_small=True))
                log_msg2 = "pi={}".format(
                    np.array_str(pi_, precision=4, suppress_small=True))
                log_msg3 = "V={:.4f}".format(value_)
                if self.use_pretrained_model_as_advice:
                    log_msg3 += " psi={:.4f}".format(self.psi)
                logger.debug(log_msg1)
                logger.debug(log_msg2)
                logger.debug(log_msg3)

            # process game
            self.game_state.step(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal
            if self.use_pretrained_model_as_reward_shaping:
                if reward < 0 or reward > 0:
                    rho[i] = 0.
                    j = i - 1
                    while j > i - 5:
                        if rewards[j] != 0:
                            break
                        rho[j] = 0.
                        j -= 1
            #     if self.game_state.loss_life:
            #     if self.game_state.gain_life or reward > 0:
            #         rho[i] = 0.
            #         j = i-1
            #         k = 1
            #         while j >= 0:
            #             if rewards[j] != 0:
            #                 rho[j] = self.shaping_reward * (self.gamma ** -1)
            #                 break
            #             rho[j] = self.shaping_reward / k
            #             j -= 1
            #             k += 1

            self.episode_reward += reward

            if self.reward_type == 'LOG':
                reward = np.sign(reward) * np.log(1 + np.abs(reward))
            elif self.reward_type == 'CLIP':
                # clip reward
                reward = np.sign(reward)

            rewards.append(reward)

            self.local_t += 1
            self.episode_steps += 1
            global_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                if get_wrapper_by_name(self.game_state.env,
                                       'EpisodicLifeEnv').was_real_done:
                    log_msg = "train: worker={} global_t={}".format(
                        self.thread_index, global_t)
                    if self.use_pretrained_model_as_advice:
                        log_msg += " advice_ctr={}".format(self.advice_ctr)
                    if self.use_pretrained_model_as_reward_shaping:
                        log_msg += " shaping_ctr={}".format(self.shaping_ctr)
                    score_str = colored("score={}".format(self.episode_reward),
                                        "magenta")
                    steps_str = colored("steps={}".format(self.episode_steps),
                                        "blue")
                    log_msg += " {} {}".format(score_str, steps_str)
                    logger.debug(log_msg)
                    train_rewards['train'][global_t] = (self.episode_reward,
                                                        self.episode_steps)
                    self.record_summary(score=self.episode_reward,
                                        steps=self.episode_steps,
                                        episodes=None,
                                        global_t=global_t,
                                        mode='Train')
                    self.episode_reward = 0
                    self.episode_steps = 0
                    terminal_end = True

                self.last_rho = 0.
                if self.use_lstm:
                    self.local_network.reset_state()
                self.game_state.reset(hard_reset=False)
                break

        cumulative_reward = 0.0
        if not terminal:
            cumulative_reward = self.local_network.run_value(
                sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_state = []
        batch_action = []
        batch_adv = []
        batch_cumulative_reward = []

        if self.use_pretrained_model_as_reward_shaping:
            rho.reverse()
            rho.append(self.last_rho)
            self.last_rho = rho[0]
            i = 0
            # compute and accumulate gradients
            for (ai, ri, si, vi) in zip(actions, rewards, states, values):
                # Wiewiora et al.(2003) Principled Methods for Advising RL agents
                # Look-Back Advice
                #F = rho[i] - (self.shaping_gamma**-1) * rho[i+1]
                #F = rho[i] - self.shaping_gamma * rho[i+1]
                f = (self.shaping_gamma**-1) * rho[i] - rho[i + 1]
                if (i == 0 and terminal) or (f != 0 and (ri > 0 or ri < 0)):
                    #logger.warn("averted additional F in absorbing state")
                    f = 0.
                # if (F < 0. and ri > 0) or (F > 0. and ri < 0):
                #     logger.warn("Negative reward shaping F={} ri={} rho[s]={} rhos[s-1]={}".format(F, ri, rho[i], rho[i+1]))
                #     F = 0.
                cumulative_reward = (ri + f * self.shaping_factor
                                     ) + self.gamma * cumulative_reward
                advantage = cumulative_reward - vi

                a = np.zeros([self.action_size])
                a[ai] = 1

                batch_state.append(si)
                batch_action.append(a)
                batch_adv.append(advantage)
                batch_cumulative_reward.append(cumulative_reward)
                i += 1
        else:

            def h(z, eps=10**-2):
                return (np.sign(z) *
                        (np.sqrt(np.abs(z) + 1.) - 1.)) + (eps * z)

            def h_inv(z, eps=10**-2):
                return np.sign(z) * (np.square(
                    (np.sqrt(1 + 4 * eps *
                             (np.abs(z) + 1 + eps)) - 1) / (2 * eps)) - 1)

            def h_log(z, eps=.6):
                return (np.sign(z) * np.log(1. + np.abs(z)) * eps)

            def h_inv_log(z, eps=.6):
                return np.sign(z) * (np.exp(np.abs(z) / eps) - 1)

            # compute and accumulate gradients
            for (ai, ri, si, vi) in zip(actions, rewards, states, values):
                if self.transformed_bellman:
                    cumulative_reward = h(ri + self.gamma *
                                          h_inv(cumulative_reward))
                else:
                    cumulative_reward = ri + self.gamma * cumulative_reward
                advantage = cumulative_reward - vi

                # convert action to one-hot vector
                a = np.zeros([self.action_size])
                a[ai] = 1

                batch_state.append(si)
                batch_action.append(a)
                batch_adv.append(advantage)
                batch_cumulative_reward.append(cumulative_reward)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if self.use_lstm:
            batch_state.reverse()
            batch_action.reverse()
            batch_adv.reverse()
            batch_cumulative_reward.reverse()

            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_state,
                         self.local_network.a: batch_action,
                         self.local_network.advantage: batch_adv,
                         self.local_network.cumulative_reward:
                         batch_cumulative_reward,
                         self.local_network.initial_lstm_state:
                         start_lstm_state,
                         self.local_network.step_size: [len(batch_action)],
                         self.learning_rate_input: cur_learning_rate
                     })
        else:
            sess.run(self.apply_gradients,
                     feed_dict={
                         self.local_network.s: batch_state,
                         self.local_network.a: batch_action,
                         self.local_network.advantage: batch_adv,
                         self.local_network.cumulative_reward:
                         batch_cumulative_reward,
                         self.learning_rate_input: cur_learning_rate
                     })

        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         self.performance_log_interval):
            self.prev_local_t += self.performance_log_interval
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            logger.info(
                "Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, terminal_end
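
The `h` / `h_inv` pair above implements the transformed Bellman operator (a squashed value target and its closed-form inverse). The two should round-trip to the identity; a quick standalone check, restating the helpers with the same eps=1e-2, looks like this.

import numpy as np


def h(z, eps=1e-2):
    # squashing function from Example #8
    return np.sign(z) * (np.sqrt(np.abs(z) + 1.) - 1.) + eps * z


def h_inv(z, eps=1e-2):
    # closed-form inverse of h
    return np.sign(z) * (np.square(
        (np.sqrt(1 + 4 * eps * (np.abs(z) + 1 + eps)) - 1) / (2 * eps)) - 1)


z = np.linspace(-100., 100., 11)
assert np.allclose(h_inv(h(z)), z)  # the pair round-trips to identity
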
Example #9
    def testing_model(self,
                      sess,
                      max_steps,
                      global_t,
                      folder,
                      demo_memory_cam=None,
                      demo_cam_human=False):
        logger.info("Testing model at global_t={}...".format(global_t))
        # copy weights from shared to local
        sess.run(self.sync)

        if demo_memory_cam is not None:
            self.generate_cam_video(sess, 0.03, global_t, folder,
                                    demo_memory_cam, demo_cam_human)
            return
        else:
            self.game_state.reset(hard_reset=True)
            max_steps += 4
            test_memory = ReplayMemory(
                84,
                84,
                np.random.RandomState(),
                max_steps=max_steps,
                phi_length=4,
                num_actions=self.game_state.env.action_space.n,
                wrap_memory=False,
                full_state_size=self.game_state.clone_full_state().shape[0])
            for _ in range(4):
                test_memory.add(self.game_state.x_t,
                                0,
                                self.game_state.reward,
                                self.game_state.terminal,
                                self.game_state.lives,
                                fullstate=self.game_state.full_state)

        episode_buffer = []
        test_memory_cam = []

        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0
        terminal = False
        while True:
            #pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            test_memory_cam.append(self.game_state.s_t)
            episode_buffer.append(self.game_state.get_screen_rgb())
            pi_, value_, logits_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            #action = self.choose_action(logits_)
            action = np.argmax(pi_)

            # take action
            self.game_state.step(action)
            terminal = self.game_state.terminal
            memory_full = episode_steps == max_steps - 5
            terminal_ = terminal or memory_full

            # store the transition to replay memory
            test_memory.add(self.game_state.x_t1,
                            action,
                            self.game_state.reward,
                            terminal_,
                            self.game_state.lives,
                            fullstate=self.game_state.full_state1)

            # update the old values
            episode_reward += self.game_state.reward
            episode_steps += 1

            # s_t = s_t1
            self.game_state.update()

            if terminal_:
                if get_wrapper_by_name(
                        self.game_state.env,
                        'EpisodicLifeEnv').was_real_done or memory_full:
                    time_per_step = 0.03
                    images = np.array(episode_buffer)
                    make_movie(images,
                               folder +
                               '/frames/image{ep:010d}'.format(ep=global_t),
                               duration=len(images) * time_per_step,
                               true_image=True,
                               salience=False)
                    break

                self.game_state.reset(hard_reset=False)
                if self.use_lstm:
                    self.local_network.reset_state()

        total_reward = episode_reward
        total_steps = episode_steps
        log_data = (global_t, self.thread_index, total_reward, total_steps)
        logger.info(
            "test: global_t={} worker={} final score={} final steps={}".format(
                *log_data))

        self.generate_cam_video(sess, 0.03, global_t, folder,
                                np.array(test_memory_cam))
        test_memory.save(name='test_cam', folder=folder, resize=True)

        if self.use_lstm:
            self.local_network.reset_state()

        return
Example #10
    def train(self, sess, global_t, train_rewards):
        """Train A3C."""
        states = []
        fullstates = []
        actions = []
        rewards = []
        values = []
        rho = []

        terminal_pseudo = False  # loss of life
        terminal_end = False  # real terminal

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for i in range(self.local_t_max):
            state = cv2.resize(self.game_state.s_t,
                               self.local_net.in_shape[:-1],
                               interpolation=cv2.INTER_AREA)
            fullstate = self.game_state.clone_full_state()

            pi_, value_, logits_ = self.local_net.run_policy_and_value(
                sess, state)
            action = self.pick_action(logits_)

            states.append(state)
            fullstates.append(fullstate)
            actions.append(action)
            values.append(value_)

            if self.thread_idx == self.log_idx \
               and self.local_t % self.log_interval == 0:
                log_msg1 = "lg={}".format(
                    np.array_str(logits_, precision=4, suppress_small=True))
                log_msg2 = "pi={}".format(
                    np.array_str(pi_, precision=4, suppress_small=True))
                log_msg3 = "V={:.4f}".format(value_)
                logger.debug(log_msg1)
                logger.debug(log_msg2)
                logger.debug(log_msg3)

            # process game
            self.game_state.step(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            if self.use_sil:
                # save states in episode memory
                self.episode.add_item(self.game_state.s_t, fullstate, action,
                                      reward, terminal)

            if self.reward_type == 'CLIP':
                reward = np.sign(reward)

            rewards.append(reward)

            self.local_t += 1
            self.episode_steps += 1
            global_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_pseudo = True

                env = self.game_state.env
                name = 'EpisodicLifeEnv'
                if get_wrapper_by_name(env, name).was_real_done:
                    # reduce log freq
                    if self.thread_idx == self.log_idx:
                        log_msg = "train: worker={} global_t={} local_t={}".format(
                            self.thread_idx, global_t, self.local_t)
                        score_str = colored(
                            "score={}".format(self.episode_reward), "magenta")
                        steps_str = colored(
                            "steps={}".format(self.episode_steps), "blue")
                        log_msg += " {} {}".format(score_str, steps_str)
                        logger.debug(log_msg)

                    train_rewards['train'][global_t] = (self.episode_reward,
                                                        self.episode_steps)
                    self.record_summary(score=self.episode_reward,
                                        steps=self.episode_steps,
                                        episodes=None,
                                        global_t=global_t,
                                        mode='Train')
                    self.episode_reward = 0
                    self.episode_steps = 0
                    terminal_end = True

                self.game_state.reset(hard_reset=False)
                break

        cumsum_reward = 0.0
        if not terminal:
            state = cv2.resize(self.game_state.s_t,
                               self.local_net.in_shape[:-1],
                               interpolation=cv2.INTER_AREA)
            cumsum_reward = self.local_net.run_value(sess, state)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_state = []
        batch_action = []
        batch_adv = []
        batch_cumsum_reward = []

        # compute and accumulate gradients
        for (ai, ri, si, vi) in zip(actions, rewards, states, values):
            if self.transformed_bellman:
                ri = np.sign(ri) * self.reward_constant + ri
                cumsum_reward = transform_h(ri + self.gamma *
                                            transform_h_inv(cumsum_reward))
            else:
                cumsum_reward = ri + self.gamma * cumsum_reward
            advantage = cumsum_reward - vi

            # convert action to one-hot vector
            a = np.zeros([self.action_size])
            a[ai] = 1

            batch_state.append(si)
            batch_action.append(a)
            batch_adv.append(advantage)
            batch_cumsum_reward.append(cumsum_reward)

        cur_learning_rate = self._anneal_learning_rate(
            global_t, self.initial_learning_rate)

        feed_dict = {
            self.local_net.s: batch_state,
            self.local_net.a: batch_action,
            self.local_net.advantage: batch_adv,
            self.local_net.cumulative_reward: batch_cumsum_reward,
            self.learning_rate_input: cur_learning_rate,
        }

        sess.run(self.apply_gradients, feed_dict=feed_dict)

        t = self.local_t - self.prev_local_t
        if (self.thread_idx == self.log_idx and t >= self.perf_log_interval):
            self.prev_local_t += self.perf_log_interval
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            logger.info("worker-{}, log_worker-{}".format(
                self.thread_idx, self.log_idx))
            logger.info("Performance : {} STEPS in {:.0f} sec. {:.0f}"
                        " STEPS/sec. {:.2f}M STEPS/hour.".format(
                            global_t, elapsed_time, steps_per_sec,
                            steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, terminal_end, terminal_pseudo
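
`_anneal_learning_rate` is not shown in any of these examples (and its signature differs between Example #8 and Example #10). Most A3C implementations decay the learning rate linearly to zero over the total number of training steps; the sketch below follows that convention, with `max_global_time_step` and the linear schedule as assumptions.

def _anneal_learning_rate(global_t, initial_learning_rate,
                          max_global_time_step=10 ** 8):
    # linearly decay the learning rate to zero over training (sketch)
    lr = initial_learning_rate * \
        (max_global_time_step - global_t) / max_global_time_step
    return max(lr, 0.0)
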
Example #11
    def run(self):
        # load if starting from a checkpoint
        wall_t = self._load()

        # get the first state by doing nothing and preprocess the image to 80x80x4
        # only reset here if evaluation will not run first in the loop below
        # (the evaluation branch performs its own reset)
        if self.global_t % self.eval_freq != 0:
            self._reset(hard_reset=True)

        # only executed at the very beginning of training and never again
        if self.global_t == 0 and self.train_with_demo_steps > 0:
            self.train_with_demo_memory_only()

        # load one demo for cam
        if self.load_demo_cam:
            # note, tuple length has to be >=2. pad 0 if len==1
            demo_cam_id = tuple(map(int, self.demo_cam_id.split(",")))
            if len(demo_cam_id) == 1:
                demo_cam_id = (*demo_cam_id, 0)
            demo_cam, _, total_rewards_cam, _ = load_memory(
                name=None,
                demo_memory_folder=self.demo_memory_folder,
                demo_ids=demo_cam_id,
                imgs_normalized=False)

            max_idx, _ = max(total_rewards_cam.items(), key=lambda a: a[1])
            size_max_idx_mem = len(demo_cam[max_idx])
            self.test_cam_si = np.zeros(
                (size_max_idx_mem, demo_cam[max_idx].height,
                 demo_cam[max_idx].width, demo_cam[max_idx].phi_length),
                dtype=np.float32)
            for i in range(size_max_idx_mem):
                s0, _, _, _, _, _, _, _ = demo_cam[max_idx][i]
                self.test_cam_si[i] = np.copy(s0)
            logger.info("loaded demo {} for testing CAM".format(demo_cam_id))

        # set start time
        start_time = time.time() - wall_t

        logger.info("replay memory size={}".format(self.replay_memory.size))
        sub_total_reward = 0.0
        sub_steps = 0

        while self.global_t < self.train_max_steps:
            # Evaluation of policy
            if self.global_t % self.eval_freq == 0:
                terminal = 0
                total_reward, total_steps, n_episodes = self.test()
                # re-initialize game for training
                self._reset(hard_reset=True)
                sub_total_reward = 0.0
                sub_steps = 0
                time.sleep(0.5)

            if self.global_t % self.copy_freq == 0:
                self.net.update_target_network(slow=False)

            # choose an action epsilon greedily
            ## self._update_state_input(observation)
            readout_t = self.net.evaluate(self.game_state.s_t)[0]
            action = get_action_index(
                readout_t,
                is_random=(random.random() <= self.epsilon
                           or self.global_t <= self.observe),
                n_actions=self.game_state.env.action_space.n)

            # scale down epsilon
            if self.epsilon > self.final_epsilon and self.global_t > self.observe:
                self.epsilon -= (self.init_epsilon -
                                 self.final_epsilon) / self.explore

            ##### HUMAN ADVICE OVERRIDE ACTION #####
            if self.use_human_advice and self.psi > self.final_epsilon:
                use_advice = False
                # After n exploration steps, decay psi
                if (self.global_t - self.observe) >= self.explore:
                    self.psi *= self.init_psi

                # TODO: Determine if I want advice during observation or only during exploration
                if random.random() > self.final_epsilon:
                    psi_cond = True if self.psi == self.init_psi else (
                        self.psi > random.random())
                    if psi_cond:
                        action_advice = self.human_net.evaluate(
                            self.game_state.s_t)[0]
                        action_human = np.argmax(action_advice)
                        if action_advice[action_human] >= self.confidence:
                            action = action_human
                            use_advice = True
            ##### HUMAN ADVICE OVERRIDE ACTION #####

            # Training
            # run the selected action and observe next state and reward
            self.game_state.step(action)
            terminal = self.game_state.terminal
            terminal_ = terminal or ((self.global_t + 1) % self.eval_freq == 0)

            # store the transition in D
            ## self.replay_memory.add_sample(observation, action, reward, (1 if terminal_ else 0))
            self.replay_memory.add(self.game_state.x_t1,
                                   action,
                                   self.game_state.reward,
                                   terminal_,
                                   self.game_state.lives,
                                   fullstate=self.game_state.full_state1)

            # update the old values
            sub_total_reward += self.game_state.reward
            sub_steps += 1
            self.global_t += 1
            self.game_state.update()

            # only train if done observing
            if self.global_t > self.observe and self.global_t % self.update_freq == 0:
                s_j_batch, a_batch, r_batch, terminals, s_j1_batch = self.replay_memory.sample(
                    self.batch, reward_type=self.reward_type)
                # perform gradient step
                self.net.train(s_j_batch, a_batch, r_batch, s_j1_batch,
                               terminals, self.global_t)
                # self.net.add_summary(summary, self.global_t)

            if terminal:
                if get_wrapper_by_name(self.game_state.env,
                                       'EpisodicLifeEnv').was_real_done:
                    self.rewards['train'][self.global_t] = (sub_total_reward,
                                                            sub_steps)
                    score_str = colored("score={}".format(sub_total_reward),
                                        "magenta")
                    steps_str = colored("steps={}".format(sub_steps), "blue")
                    log_data = (self.global_t, score_str, steps_str)
                    logger.debug("train: global_t={} {} {}".format(*log_data))
                    self.net.record_summary(score=sub_total_reward,
                                            steps=sub_steps,
                                            episodes=None,
                                            global_t=self.global_t,
                                            mode='Train')
                    sub_total_reward = 0.0
                    sub_steps = 0
                self._reset(hard_reset=False)

            # save progress every SAVE_FREQ iterations
            if self.global_t % self.save_freq == 0:
                wall_t = time.time() - start_time
                logger.info('Total time: {} seconds'.format(wall_t))
                wall_t_fname = self.folder + '/' + 'wall_t.' + str(
                    self.global_t)
                epsilon_fname = self.folder + '/epsilon'

                logger.info('Now saving data. Please wait')
                with open(wall_t_fname, 'w') as f:
                    f.write(str(wall_t))
                with open(epsilon_fname, 'w') as f:
                    f.write(str(self.epsilon))

                self.net.save(self.global_t)

                self.replay_memory.save(name=self.name,
                                        folder=self.folder,
                                        resize=False)
                pickle.dump(
                    self.rewards,
                    open(
                        self.folder + '/' + self.name.replace('-', '_') +
                        '-dqn-rewards.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
                logger.info('Data saved!')

            # log information
            state = ""
            if self.global_t - 1 < self.observe:
                state = "observe"
            elif self.global_t - 1 < self.observe + self.explore:
                state = "explore"
            else:
                state = "train"

            if (self.global_t - 1) % 10000 == 0:
                if self.use_human_advice:
                    log_data = (state, self.global_t - 1,
                                self.epsilon, self.psi, use_advice, action,
                                np.max(readout_t))
                    logger.debug(
                        "{0:}: global_t={1:} epsilon={2:.4f} psi={3:.4f} "
                        "advice={4:} action={5:} q_max={6:.4f}".format(
                            *log_data))
                else:
                    log_data = (state, self.global_t - 1, self.epsilon, action,
                                np.max(readout_t))
                    logger.debug(
                        "{0:}: global_t={1:} epsilon={2:.4f} action={3:} "
                        "q_max={4:.4f}".format(*log_data))
Example #12
    def test(self, render=False):
        logger.info("Evaluate policy at global_t={}...".format(self.global_t))

        episode_buffer = []
        self.game_state.reset(hard_reset=True)
        episode_buffer.append(self.game_state.get_screen_rgb())

        max_steps = self.eval_max_steps
        total_reward = 0
        total_steps = 0
        episode_reward = 0
        episode_steps = 0
        n_episodes = 0

        # use one demonstration episode to record CAM
        # the demo CAM movie only needs to be made once
        # if self.global_t == 0:
        cam, state, action = self.calculate_cam(self.test_cam_si)
        cam_plus_img = []
        cam_side_img = []

        for i in range(len(cam)):
            # overlay cam-state
            overlay = np.uint8(cam[i]).copy()
            output = np.uint8(state[i]).copy()
            alpha = 0.3
            cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)
            # create a title space for action
            title_space = np.zeros((20, 84, 3), np.uint8)
            title_space[:] = (255, 255, 255)
            cv2.putText(title_space, "{}".format(ACTION_MEANING[action[i]]),
                        (20, 14), cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)
            # concatenate the title bar and the state image
            vcat_output = cv2.vconcat((title_space, output))
            cam_plus_img.append(vcat_output)

            # side-by-side cam-state
            hcat_cam_state = cv2.hconcat(
                (np.uint8(cam[i]).copy(), np.uint8(state[i]).copy()))
            title_space = np.zeros((20, 84 * 2, 3), np.uint8)
            title_space[:] = (255, 255, 255)
            vcat_title_camstate = cv2.vconcat((title_space, hcat_cam_state))
            cv2.putText(vcat_title_camstate,
                        "{}".format(ACTION_MEANING[action[i]]), (20, 14),
                        cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)
            cam_side_img.append(vcat_title_camstate)

        time_per_step = 0.0167
        make_movie(
            cam_plus_img,
            self.folder +
            '/frames/demo-cam_plus_img{ep:010d}'.format(ep=(self.global_t)),
            duration=len(cam) * time_per_step,
            true_image=True,
            salience=False)
        make_movie(
            cam_side_img,
            self.folder +
            '/frames/demo-cam_side_img{ep:010d}'.format(ep=(self.global_t)),
            duration=len(state) * time_per_step,
            true_image=True,
            salience=False)
        del cam, state, action, cam_plus_img, cam_side_img

        while max_steps > 0:
            readout_t = self.net.evaluate(self.game_state.s_t)[0]
            action = get_action_index(
                readout_t,
                is_random=(random.random() <= 0.05),
                n_actions=self.game_state.env.action_space.n)

            # take action
            self.game_state.step(action)
            terminal = self.game_state.terminal

            if n_episodes == 0 and self.global_t % 2000000 == 0:
                episode_buffer.append(self.game_state.get_screen_rgb())

            episode_reward += self.game_state.reward
            episode_steps += 1
            max_steps -= 1

            # s_t = s_t1
            self.game_state.update()

            if terminal:
                if get_wrapper_by_name(self.game_state.env,
                                       'EpisodicLifeEnv').was_real_done:
                    if n_episodes == 0 and self.global_t % 2000000 == 0:
                        time_per_step = 0.0167
                        images = np.array(episode_buffer)
                        make_movie(images,
                                   self.folder +
                                   '/frames/image{ep:010d}'.format(
                                       ep=(self.global_t)),
                                   duration=len(images) * time_per_step,
                                   true_image=True,
                                   salience=False)
                        episode_buffer = []
                    n_episodes += 1
                    score_str = colored("score={}".format(episode_reward),
                                        "magenta")
                    steps_str = colored("steps={}".format(episode_steps),
                                        "blue")
                    log_data = (self.global_t, n_episodes, score_str,
                                steps_str, total_steps)
                    logger.debug(
                        "test: global_t={} trial={} {} {} total_steps={}".
                        format(*log_data))
                    total_reward += episode_reward
                    total_steps += episode_steps
                    episode_reward = 0
                    episode_steps = 0
                self.game_state.reset(hard_reset=False)

        if n_episodes == 0:
            total_reward = episode_reward
            total_steps = episode_steps
        else:
            # (timestep, total sum of rewards, total # of steps before terminating)
            total_reward = total_reward / n_episodes
            total_steps = total_steps // n_episodes

        log_data = (self.global_t, total_reward, total_steps, n_episodes)
        logger.debug(
            "test: global_t={} final score={} final steps={} # episodes={}".
            format(*log_data))
        self.net.record_summary(score=total_reward,
                                steps=total_steps,
                                episodes=n_episodes,
                                global_t=self.global_t,
                                mode='Test')

        self.rewards['eval'][self.global_t] = (total_reward, total_steps,
                                               n_episodes)
        return total_reward, total_steps, n_episodes
    def rollout(self, a3c_sess, folder, pretrain_sess, global_t, past_state,
                add_all_rollout, ep_max_steps, nstep_bc, update_in_rollout):
        """Perform one rollout until terminal."""
        a3c_sess.run(self.sync_a3c)
        if nstep_bc > 0:
            pretrain_sess.run(self.sync_pretrained)

        _, fs, old_a, old_return, _, _ = past_state

        states = []
        actions = []
        rewards = []
        values = []
        terminals = []
        confidences = []

        rollout_ctr, rollout_added_ctr = 0, 0
        rollout_new_return, rollout_old_return = 0, 0

        terminal_pseudo = False  # loss of life
        terminal_end = False  # real terminal
        add = False

        self.rolloutgame.reset(hard_reset=True)
        self.rolloutgame.restore_full_state(fs)
        # check if restore successful
        fs_check = self.rolloutgame.clone_full_state()
        assert np.array_equal(fs_check, fs)
        del fs_check

        start_local_t = self.local_t
        self.rolloutgame.step(0)

        # prevent overly long rollouts: keep ep_max_steps below the ALE default
        # see https://github.com/openai/gym/blob/54f22cf4db2e43063093a1b15d968a57a32b6e90/gym/envs/__init__.py#L635
        # but in all games tested, no rollout exceeds ep_max_steps
        while ep_max_steps > 0:
            state = cv2.resize(self.rolloutgame.s_t,
                       self.local_a3c.in_shape[:-1],
                       interpolation=cv2.INTER_AREA)
            fullstate = self.rolloutgame.clone_full_state()

            if nstep_bc > 0: # LiDER-TA or BC
                model_pi = self.local_pretrained.run_policy(pretrain_sess, state)
                action, confidence = self.choose_action_with_high_confidence(
                                          model_pi, exclude_noop=False)
                confidences.append(confidence) # not using "confidences" for anything
                nstep_bc -= 1
            else: # LiDER, refresh with current policy
                pi_, _, logits_ = self.local_a3c.run_policy_and_value(a3c_sess,
                                                                      state)
                action = self.pick_action(logits_)
                confidences.append(pi_[action])

            value_ = self.local_a3c.run_value(a3c_sess, state)
            values.append(value_)
            states.append(state)
            actions.append(action)

            self.rolloutgame.step(action)

            ep_max_steps -= 1

            reward = self.rolloutgame.reward
            terminal = self.rolloutgame.terminal
            terminals.append(terminal)

            self.episode_reward += reward

            self.episode.add_item(self.rolloutgame.s_t, fullstate, action,
                                  reward, terminal, from_rollout=True)

            if self.reward_type == 'CLIP':
                reward = np.sign(reward)
            rewards.append(reward)

            self.local_t += 1
            self.episode_steps += 1
            global_t += 1

            self.rolloutgame.update()

            if terminal:
                terminal_pseudo = True
                env = self.rolloutgame.env
                name = 'EpisodicLifeEnv'
                rollout_ctr += 1
                terminal_end = get_wrapper_by_name(env, name).was_real_done

                new_return = self.compute_return_for_state(rewards, terminals)

                if not add_all_rollout:
                    if new_return > old_return:
                        add = True
                else:
                    add = True

                if add:
                    rollout_added_ctr += 1
                    rollout_new_return += new_return
                    rollout_old_return += old_return
                    # update the policy immediately using a good rollout
                    if update_in_rollout:
                        batch_adv = self.update_a3c(a3c_sess, actions, states, rewards, values, global_t)

                self.episode_reward = 0
                self.episode_steps = 0
                self.rolloutgame.reset(hard_reset=True)
                break

        diff_local_t = self.local_t - start_local_t

        return diff_local_t, terminal_end, terminal_pseudo, rollout_ctr, \
               rollout_added_ctr, add, rollout_new_return, rollout_old_return
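
`compute_return_for_state`, used above to decide whether a refreshed rollout beats the stored return, is not shown in these examples. A sketch of a discounted return computed from the collected per-step rewards and terminal flags (with a hypothetical gamma of 0.99) would be:

def compute_return_for_state(rewards, terminals, gamma=0.99):
    # discounted return of the first state in a rollout (sketch)
    # `rewards` and `terminals` are the per-step lists collected above;
    # accumulation stops once a terminal step is reached
    ret = 0.0
    discount = 1.0
    for r, done in zip(rewards, terminals):
        ret += discount * r
        if done:
            break
        discount *= gamma
    return ret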