    def _learn_loop(self, multi_env):
        """Main training loop."""
        step, env_steps = self.session.run([self.actor_step, self.total_env_steps])

        env_obs = multi_env.reset()
        observations, goals = main_observation(env_obs), goal_observation(env_obs)
        buffer = PPOBuffer()

        def end_of_training(s, es):
            return s >= self.params.train_for_steps or es > self.params.train_for_env_steps

        while not end_of_training(step, env_steps):
            timing = Timing()
            num_steps = 0
            batch_start = time.time()

            buffer.reset()

            with timing.timeit('experience'):
                # collecting experience
                for rollout_step in range(self.params.rollout):
                    actions, action_probs, values = self.actor_critic.invoke(self.session, observations, goals=goals)

                    # wait for all the workers to complete an environment step
                    env_obs, rewards, dones, infos = multi_env.step(actions)
                    self.process_infos(infos)
                    new_observations, new_goals = main_observation(env_obs), goal_observation(env_obs)

                    # add experience from all environments to the current buffer
                    buffer.add(observations, actions, action_probs, rewards, dones, values, goals)
                    observations = new_observations
                    goals = new_goals

                    num_steps += num_env_steps(infos)

                # last step values are required for TD-return calculation
                _, _, values = self.actor_critic.invoke(self.session, observations, goals=goals)
                buffer.values.append(values)

            env_steps += num_steps

            # calculate discounted returns and GAE
            buffer.finalize_batch(self.params.gamma, self.params.gae_lambda)

            # update actor and critic
            with timing.timeit('train'):
                step = self._train(buffer, env_steps)

            avg_reward = multi_env.calc_avg_rewards(n=self.params.stats_episodes)
            avg_length = multi_env.calc_avg_episode_lengths(n=self.params.stats_episodes)
            fps = num_steps / (time.time() - batch_start)

            self._maybe_print(step, env_steps, avg_reward, avg_length, fps, timing)
            self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps)
            self._maybe_update_avg_reward(avg_reward, multi_env.stats_num_episodes())
            self._maybe_coverage_summaries(env_steps)
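
# The finalize_batch() call above turns per-step rewards, dones and values (plus the one
# extra bootstrap value appended after the rollout) into discounted returns and GAE
# advantages. A minimal sketch of that computation, assuming numpy arrays shaped
# [rollout, num_envs] (an illustration of the technique, not the project's actual PPOBuffer code):
import numpy as np

def finalize_batch_sketch(rewards, dones, values, gamma, gae_lambda):
    """rewards/dones: [T, num_envs]; values: [T + 1, num_envs], last row is the bootstrap value."""
    rewards = np.asarray(rewards, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)
    values = np.asarray(values, dtype=np.float32)

    num_steps = rewards.shape[0]
    advantages = np.zeros_like(rewards)
    gae = np.zeros(rewards.shape[1], dtype=np.float32)

    # backward recursion: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    for t in reversed(range(num_steps)):
        not_done = 1.0 - dones[t]
        delta = rewards[t] + gamma * values[t + 1] * not_done - values[t]
        gae = delta + gamma * gae_lambda * not_done * gae
        advantages[t] = gae

    returns = advantages + values[:-1]  # lambda-returns used as critic targets
    return returns, advantages
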
Example #2
def run_policy_loop(agent, env, max_num_episodes, fps=7, max_num_frames=None, deterministic=False):
    """Execute the policy and render onto the screen, using the standard agent interface."""
    agent.initialize()

    episode_rewards = []
    num_frames = 0

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False
        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)
        if goal_obs is not None:
            goal_obs_rgb = cv2.cvtColor(goal_obs, cv2.COLOR_BGR2RGB)
            cv2.imshow('goal', cv2.resize(goal_obs_rgb, (500, 500)))
            cv2.waitKey(500)
        episode_reward = 0

        while not done:
            start = time.time()
            env.render()
            if fps < 1000:
                time.sleep(1.0 / fps)
            action = agent.best_action([obs], goals=[goal_obs], deterministic=deterministic)
            env_obs, rew, done, _ = env.step(action)
            obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)
            episode_reward += rew
            log.info('Actual fps: %.1f', 1.0 / (time.time() - start))

            num_frames += 1
            if max_frames_reached(num_frames):
                break

        env.render()
        time.sleep(0.2)

        episode_rewards.append(episode_reward)
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f', episode_reward, len(last_episodes), avg_reward,
        )

        if max_frames_reached(num_frames):
            break

    agent.finalize()
    env.close()
    cv2.destroyAllWindows()
    return 0
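
# reset_with_info() is used above but not defined in this snippet. A plausible minimal
# implementation (an assumption, not necessarily the project's actual helper) normalizes
# env.reset() so the caller always gets an (observation, info) pair:
def reset_with_info(env):
    """Return (obs, info) whether or not env.reset() provides an info dict."""
    result = env.reset()
    if isinstance(result, tuple) and len(result) == 2:
        return result  # reset() already returned (obs, info)
    return result, {}  # older-style reset() returned only the observation
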
def enjoy(params,
          env_id,
          max_num_episodes=1000,
          max_num_frames=None,
          show_automap=False):
    def make_env_func():
        e = create_env(env_id, mode='test', show_automap=show_automap)
        e.seed(0)
        return e

    params = params.load()
    params.num_envs = 1  # during evaluation we only use a single env
    agent = AgentTMAX(make_env_func, params)
    env = make_env_func()

    agent.initialize()

    global persistent_map
    if agent.params.persistent_map_checkpoint is not None:
        persistent_map = TopologicalMap.create_empty()
        persistent_map.maybe_load_checkpoint(
            agent.params.persistent_map_checkpoint)

    global current_landmark

    episode_rewards = []
    num_frames = 0

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False

        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)
        prev_obs = obs
        if current_landmark is None:
            current_landmark = obs

        if goal_obs is not None:
            goal_obs_rgb = cv2.cvtColor(goal_obs, cv2.COLOR_BGR2RGB)
            cv2.imshow('goal', cv2.resize(goal_obs_rgb, (500, 500)))
            cv2.waitKey(500)

        episode_reward, episode_frames = 0, 0

        if not agent.tmax_mgr.initialized:
            agent.tmax_mgr.initialize([obs], [info], env_steps=0)
            persistent_map = agent.tmax_mgr.dense_persistent_maps[-1]
            sparse_persistent_map = agent.tmax_mgr.sparse_persistent_maps[-1]
            log.debug('Num landmarks in sparse map: %d',
                      sparse_persistent_map.num_landmarks())

        agent.curiosity.initialized = True
        agent.tmax_mgr.mode[0] = TmaxMode.EXPLORATION
        agent.tmax_mgr.locomotion_final_targets[0] = None
        agent.tmax_mgr.locomotion_targets[0] = None

        start_episode = time.time()
        t = Timing()

        # terminate, pause, current_actions and policy_type are module-level globals,
        # presumably updated by an interactive input handler that is not part of this snippet
        while not done and not terminate and not max_frames_reached(num_frames):
            with t.timeit('one_frame'):
                env.render()
                cv2.waitKey(1)  # to prevent window from fading

                if pause:
                    time.sleep(0.01)
                    continue

                if len(current_actions) > 0:
                    # key combinations are not handled; this path exists purely for manual testing
                    action = current_actions[-1]
                else:
                    action = 0

                if policy_type == PolicyType.PLAYER:
                    pass
                elif policy_type == PolicyType.RANDOM:
                    action = env.action_space.sample()
                elif policy_type == PolicyType.AGENT:
                    agent.tmax_mgr.mode[0] = TmaxMode.EXPLORATION
                    action, *_ = agent.policy_step([prev_obs], [obs],
                                                   [goal_obs], None, None)
                    action = action[0]
                elif policy_type == PolicyType.LOCOMOTION:
                    agent.tmax_mgr.mode[0] = TmaxMode.LOCOMOTION
                    action, _, _ = agent.loco_actor_critic.invoke(
                        agent.session,
                        [obs],
                        [current_landmark],
                        None,
                        None,
                        [1.0],
                    )
                    action = action[0]

                env_obs, rew, done, info = env.step(action)
                next_obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)

                _, _ = agent.tmax_mgr.update(
                    [obs],
                    [next_obs],
                    [rew],
                    [done],
                    [info],
                    num_frames,
                    t,
                    verbose=True,
                )

                prev_obs = obs
                obs = next_obs

                calc_distance_to_memory(agent, sparse_persistent_map, obs)
                calc_value_estimate(agent, obs)

                episode_reward += rew

                num_frames += 1
                episode_frames += 1

            took_seconds = t.one_frame
            desired_fps = 15  # actions are repeated for 4 frames, so this corresponds to ~60 rendered fps
            wait_seconds = (1.0 / desired_fps) - took_seconds
            wait_seconds = max(0.0, wait_seconds)
            if wait_seconds > EPS:
                time.sleep(wait_seconds)

        env.render()
        log.info('Actual fps: %.1f',
                 episode_frames / (time.time() - start_episode))
        time.sleep(0.2)

        episode_rewards.append(episode_reward)
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f',
            episode_reward,
            len(last_episodes),
            avg_reward,
        )

        if max_frames_reached(num_frames) or terminate:
            break

    agent.finalize()
    env.close()
    cv2.destroyAllWindows()

    return 0
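
# Timing objects appear throughout these examples (timing.timeit('experience'),
# timing.add_time('curiosity'), and the t.one_frame attribute read above). A minimal
# sketch of such a helper, assuming a dict with attribute access that stores elapsed
# seconds per named section (not necessarily the project's own Timing class):
import time
from contextlib import contextmanager

class Timing(dict):
    """Per-section wall-clock timings with attribute access, e.g. t.one_frame."""
    def __getattr__(self, key):
        return self[key]

    @contextmanager
    def timeit(self, key):
        # record the duration of this section (overwrites any previous value)
        start = time.time()
        yield
        self[key] = time.time() - start

    @contextmanager
    def add_time(self, key):
        # accumulate durations across repeated calls
        start = time.time()
        yield
        self[key] = self.get(key, 0.0) + (time.time() - start)
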
def enjoy(params,
          env_id,
          max_num_episodes=1,
          max_num_frames=1e10,
          render=False):
    def make_env_func():
        e = create_env(env_id, mode='train', skip_frames=True)
        e.seed(0)
        return e

    agent = AgentRandom(make_env_func, params.load())
    env = make_env_func()

    # this helps with screen recording
    pause_at_the_beginning = False
    if pause_at_the_beginning:
        env.render()
        log.info('Press any key to start...')
        cv2.waitKey()

    agent.initialize()

    episode_rewards = []
    num_frames = 0

    histogram = setup_histogram(agent)

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False
        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)

        episode_reward = []

        while not done and not max_frames_reached(num_frames):
            start = time.time()
            if render:
                env.render()

            action = agent.best_action([obs],
                                       goals=[goal_obs],
                                       deterministic=False)
            env_obs, rew, done, info = env.step(action)
            if done:
                log.warning('Done flag is true %d, rew: %.3f, num_frames %d',
                            done, rew, num_frames)

            update_coverage(agent, [info], histogram)

            episode_reward.append(rew)

            if num_frames % 100 == 0:
                log.info('fps: %.1f, rew: %d, done: %s, frames %d',
                         1.0 / (time.time() - start), rew, done, num_frames)

            write_summaries(agent, histogram, num_frames)

            num_frames += num_env_steps([info])

        if render:
            env.render()
        time.sleep(0.2)

        episode_rewards.append(sum(episode_reward))
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f',
            sum(episode_reward),
            len(last_episodes),
            avg_reward,
        )

        if max_frames_reached(num_frames):
            break

    write_summaries(agent, histogram, num_frames, force=True)

    agent.finalize()
    env.close()
    cv2.destroyAllWindows()
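
# main_observation(), goal_observation() and num_env_steps() are helpers these snippets
# rely on but do not define. One plausible reading, based on how they are used above
# (dict observations with an optional goal, frame-skipped envs reporting consumed frames
# in info) -- an assumption, not the project's actual code:
def main_observation(env_obs):
    """Primary observation: the 'obs' entry of a dict observation, or the observation itself."""
    if isinstance(env_obs, (list, tuple)):
        return [main_observation(o) for o in env_obs]
    return env_obs['obs'] if isinstance(env_obs, dict) and 'obs' in env_obs else env_obs

def goal_observation(env_obs):
    """Goal observation if present, otherwise None."""
    if isinstance(env_obs, (list, tuple)):
        return [goal_observation(o) for o in env_obs]
    return env_obs.get('goal') if isinstance(env_obs, dict) else None

def num_env_steps(infos):
    """Total number of underlying environment frames for a batch of info dicts."""
    return sum(info.get('num_frames', 1) for info in infos)
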
Example #5
    def _learn_loop(self, multi_env):
        """Main training loop."""
        # env_steps is what we report to tensorboard (and thus in our results),
        # while actor_step serves as the global step for training
        step, env_steps = self.session.run(
            [self.actor_step, self.total_env_steps])

        env_obs = multi_env.reset()
        obs, goals = main_observation(env_obs), goal_observation(env_obs)

        buffer = CuriousPPOBuffer()
        trajectory_buffer = TrajectoryBuffer(self.params.num_envs)
        self.curiosity.set_trajectory_buffer(trajectory_buffer)

        def end_of_training(s, es):
            return s >= self.params.train_for_steps or es > self.params.train_for_env_steps

        while not end_of_training(step, env_steps):
            timing = Timing()
            num_steps = 0
            batch_start = time.time()

            buffer.reset()

            with timing.timeit('experience'):
                # collecting experience
                for rollout_step in range(self.params.rollout):
                    actions, action_probs, values = self._policy_step(
                        obs, goals)

                    # wait for all the workers to complete an environment step
                    env_obs, rewards, dones, infos = multi_env.step(actions)

                    if self.params.graceful_episode_termination:
                        rewards = list(rewards)
                        for i in range(self.params.num_envs):
                            if dones[i] and infos[i].get('prev') is not None:
                                if infos[i]['prev'].get('terminated_by_timer', False):
                                    log.info('Env %d terminated by timer', i)
                                    rewards[i] += values[i]

                    if not self.params.random_exploration:
                        trajectory_buffer.add(obs, actions, infos, dones)

                    next_obs, new_goals = main_observation(env_obs), goal_observation(env_obs)

                    # calculate curiosity bonus
                    with timing.add_time('curiosity'):
                        if not self.params.random_exploration:
                            bonuses = self.curiosity.generate_bonus_rewards(
                                self.session,
                                obs,
                                next_obs,
                                actions,
                                dones,
                                infos,
                            )
                            rewards = self.params.extrinsic_reward_coeff * np.array(rewards) + bonuses

                    # add experience from environment to the current buffer
                    buffer.add(obs, next_obs, actions, action_probs, rewards,
                               dones, values, goals)

                    obs, goals = next_obs, new_goals
                    self.process_infos(infos)
                    num_steps += num_env_steps(infos)

                # last step values are required for TD-return calculation
                _, _, values = self._policy_step(obs, goals)
                buffer.values.append(values)

            env_steps += num_steps

            # calculate discounted returns and GAE
            buffer.finalize_batch(self.params.gamma, self.params.gae_lambda)

            # update actor, critic, and the curiosity module (CM)
            with timing.timeit('train'):
                step = self._train_with_curiosity(step, buffer, env_steps,
                                                  timing)

            avg_reward = multi_env.calc_avg_rewards(
                n=self.params.stats_episodes)
            avg_length = multi_env.calc_avg_episode_lengths(
                n=self.params.stats_episodes)

            self._maybe_update_avg_reward(avg_reward,
                                          multi_env.stats_num_episodes())
            self._maybe_trajectory_summaries(trajectory_buffer, env_steps)
            self._maybe_coverage_summaries(env_steps)
            self.curiosity.additional_summaries(
                env_steps,
                self.summary_writer,
                self.params.stats_episodes,
                map_img=self.map_img,
                coord_limits=self.coord_limits,
            )

            trajectory_buffer.reset_trajectories()

            fps = num_steps / (time.time() - batch_start)
            self._maybe_print(step, env_steps, avg_reward, avg_length, fps,
                              timing)
            self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps)
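
# The graceful_episode_termination branch above bootstraps episodes that were cut off by a
# time limit rather than by a true terminal state: the critic's value estimate stands in
# for the return the agent could not collect, so a timeout does not look like a real death.
# The same idea in isolation (a sketch mirroring the logic above, with hypothetical names):
def bootstrap_timeout_reward(reward, done, terminated_by_timer, value_estimate):
    """Add V(s_T) to the final reward when the episode ended only because of a time limit."""
    if done and terminated_by_timer:
        return reward + value_estimate
    return reward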