def record_trajectory(params, env_id):
    def make_env_func():
        e = create_env(env_id, skip_frames=True)
        e.seed(0)
        return e

    env = make_env_func()
    map_img, coord_limits = generate_env_map(make_env_func)

    env_obs, info = reset_with_info(env)
    obs = main_observation(env_obs)
    done = False

    m = TopologicalMap(obs,
                       directed_graph=False,
                       initial_info=info,
                       verbose=True)

    trajectory = Trajectory(env_idx=-1)
    frame = 0

    t = Timing()

    while not done and not terminate:
        with t.timeit('one_frame'):
            env.render()

            if len(current_actions) > 0:
                action = current_actions[-1]
            else:
                action = 0

            trajectory.add(obs, action, info)
            m.add_landmark(obs, info, update_curr_landmark=True)

            env_obs, rew, done, info = env.step(action)
            obs = main_observation(env_obs)

        took_seconds = t.one_frame
        desired_fps = 15
        wait_seconds = (1.0 / desired_fps) - took_seconds
        wait_seconds = max(0.0, wait_seconds)
        time.sleep(wait_seconds)

        frame += 1

    env.render()
    time.sleep(0.2)

    trajectory_dir = trajectory.save(params.experiment_dir())
    m.save_checkpoint(trajectory_dir,
                      map_img=map_img,
                      coord_limits=coord_limits,
                      verbose=True)

    env.close()
    return 0
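
These loops read several module-level names (terminate, current_actions, pause, current_landmark, EPS) and a reset_with_info helper that are defined elsewhere in the full scripts, presumably driven by a keyboard handler. A minimal sketch of plausible stand-ins, purely as assumptions about what they look like:

# Assumed module-level state; in the real scripts a key handler updates these.
terminate = False         # set to True to stop the playback/recording loops
pause = False             # set to True to freeze a loop without exiting
current_actions = []      # most recently pressed action keys (last one wins)
current_landmark = None   # observation used as a locomotion target
EPS = 1e-5                # assumed small threshold used before sleeping

def reset_with_info(env):
    """Reset an env and always return (obs, info), even for old-style envs."""
    result = env.reset()
    if isinstance(result, tuple) and len(result) == 2:
        return result     # newer API: reset() already returns (obs, info)
    return result, {}     # older API: reset() returns only the observation
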
Example #2
def build_graph(params, env_id, max_num_episodes=1000):
    def make_env_func():
        e = create_env(env_id, mode='test', skip_frames=False)
        e.seed(0)
        return e

    checkpoint_dir = model_dir(params.experiment_dir())
    map_img, coord_limits = generate_env_map(make_env_func)
    env = make_env_func()

    m = None

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        obs = main_observation(env_obs)
        done = False

        if m is None:
            m = TopologicalMap(obs, directed_graph=False, initial_info=info, verbose=True)
            m.maybe_load_checkpoint(checkpoint_dir)

        while not done and not terminate:
            env.render()

            if len(current_actions) > 0:
                action = current_actions[-1]
            else:
                action = 0

            env_obs, rew, done, info = env.step(action)
            obs = main_observation(env_obs)

            global add_landmark
            if add_landmark:
                # noinspection PyProtectedMember
                new_idx = m._add_new_node(obs=obs, pos=get_position(info), angle=get_angle(info))
                log.info('Added landmark idx %d', new_idx)
                add_landmark = False

                res = m.save_checkpoint(checkpoint_dir, map_img=map_img, coord_limits=coord_limits, verbose=True)
                cv2.imshow('map', cv2.imread(res.graph_filename))
                cv2.waitKey(50)

        if terminate:
            break
        else:
            env.render()
            time.sleep(0.2)

    m.save_checkpoint(checkpoint_dir, map_img=map_img, coord_limits=coord_limits, verbose=True)
    log.debug('Set breakpoint here to edit graph edges before saving...')

    log.info('Saving to %s...', checkpoint_dir)
    m.save_checkpoint(checkpoint_dir, map_img=map_img, coord_limits=coord_limits, verbose=True)

    env.close()
    return 0
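
build_graph reads a global add_landmark flag that is presumably flipped by an external key handler. One simple way to drive it from an OpenCV window (an assumption, not the project's actual input handling; it requires an OpenCV window to have focus) is to poll cv2.waitKey inside the loop:

import cv2

add_landmark = False  # flipped to True when the user asks to store a landmark

def poll_add_landmark_key(key_code=ord('l')):
    """Set the global add_landmark flag when the chosen key is pressed
    in any OpenCV window (hypothetical helper, not from the project)."""
    global add_landmark
    key = cv2.waitKey(1) & 0xFF
    if key == key_code:
        add_landmark = True
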
    def _learn_loop(self, multi_env):
        """Main training loop."""
        step, env_steps = self.session.run([self.actor_step, self.total_env_steps])

        env_obs = multi_env.reset()
        observations, goals = main_observation(env_obs), goal_observation(env_obs)
        buffer = PPOBuffer()

        def end_of_training(s, es):
            return s >= self.params.train_for_steps or es > self.params.train_for_env_steps

        while not end_of_training(step, env_steps):
            timing = Timing()
            num_steps = 0
            batch_start = time.time()

            buffer.reset()

            with timing.timeit('experience'):
                # collecting experience
                for rollout_step in range(self.params.rollout):
                    actions, action_probs, values = self.actor_critic.invoke(self.session, observations, goals=goals)

                    # wait for all the workers to complete an environment step
                    env_obs, rewards, dones, infos = multi_env.step(actions)
                    self.process_infos(infos)
                    new_observations, new_goals = main_observation(env_obs), goal_observation(env_obs)

                    # add experience from all environments to the current buffer
                    buffer.add(observations, actions, action_probs, rewards, dones, values, goals)
                    observations = new_observations
                    goals = new_goals

                    num_steps += num_env_steps(infos)

                # last step values are required for TD-return calculation
                _, _, values = self.actor_critic.invoke(self.session, observations, goals=goals)
                buffer.values.append(values)

            env_steps += num_steps

            # calculate discounted returns and GAE
            buffer.finalize_batch(self.params.gamma, self.params.gae_lambda)

            # update actor and critic
            with timing.timeit('train'):
                step = self._train(buffer, env_steps)

            avg_reward = multi_env.calc_avg_rewards(n=self.params.stats_episodes)
            avg_length = multi_env.calc_avg_episode_lengths(n=self.params.stats_episodes)
            fps = num_steps / (time.time() - batch_start)

            self._maybe_print(step, env_steps, avg_reward, avg_length, fps, timing)
            self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps)
            self._maybe_update_avg_reward(avg_reward, multi_env.stats_num_episodes())
            self._maybe_coverage_summaries(env_steps)
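
buffer.finalize_batch(gamma, gae_lambda) computes the discounted returns and GAE advantages used by the PPO update. The project's PPOBuffer does this internally; a minimal NumPy sketch of the standard recurrence it implements (illustrative only, not the project's code):

import numpy as np

def discounted_returns_and_gae(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
    """Standard GAE(lambda). `values` must contain one extra bootstrap value
    for the state after the last step (len(values) == len(rewards) + 1)."""
    num_steps = len(rewards)
    advantages = np.zeros(num_steps, dtype=np.float32)
    last_adv = 0.0
    for t in reversed(range(num_steps)):
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        last_adv = delta + gamma * gae_lambda * nonterminal * last_adv
        advantages[t] = last_adv
    returns = advantages + np.asarray(values[:num_steps], dtype=np.float32)
    return returns, advantages
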
Example #4
def run_policy_loop(agent, env, max_num_episodes, fps=7, max_num_frames=None, deterministic=False):
    """Execute the policy and render onto the screen, using the standard agent interface."""
    agent.initialize()

    episode_rewards = []
    num_frames = 0

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False
        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)
        if goal_obs is not None:
            goal_obs_rgb = cv2.cvtColor(goal_obs, cv2.COLOR_BGR2RGB)
            cv2.imshow('goal', cv2.resize(goal_obs_rgb, (500, 500)))
            cv2.waitKey(500)
        episode_reward = 0

        while not done:
            start = time.time()
            env.render()
            if fps < 1000:
                time.sleep(1.0 / fps)
            action = agent.best_action([obs], goals=[goal_obs], deterministic=deterministic)
            env_obs, rew, done, _ = env.step(action)
            obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)
            episode_reward += rew
            log.info('Actual fps: %.1f', 1.0 / (time.time() - start))

            num_frames += 1
            if max_frames_reached(num_frames):
                break

        env.render()
        time.sleep(0.2)

        episode_rewards.append(episode_reward)
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f', episode_reward, len(last_episodes), avg_reward,
        )

        if max_frames_reached(num_frames):
            break

    agent.finalize()
    env.close()
    cv2.destroyAllWindows()
    return 0
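
run_policy_loop sleeps a fixed 1 / fps regardless of how long the step itself took, while the other loops in this listing subtract the measured frame time first. A small helper capturing that second pattern (names are illustrative, not from the project):

import time

def limit_fps(frame_start, desired_fps=15):
    """Sleep out whatever is left of the frame budget so the loop runs at
    roughly desired_fps; does nothing if the frame already took too long."""
    elapsed = time.time() - frame_start
    wait_seconds = (1.0 / desired_fps) - elapsed
    if wait_seconds > 0:
        time.sleep(wait_seconds)

# usage inside a loop body:
#   start = time.time()
#   ...do one frame of work...
#   limit_fps(start, desired_fps=15)
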
Example #5
def evaluate_locomotion_agent(agent, multi_env):
    num_envs = multi_env.num_envs

    observations = main_observation(multi_env.reset())
    obs_prev = observations
    infos = multi_env.info()

    agent.tmax_mgr.initialize(observations, infos, 1)
    m = agent.tmax_mgr.dense_persistent_maps[-1]

    navigator = Navigator(agent)
    for env_i in range(num_envs):
        navigator.reset(env_i, m)

    # sample final goals
    all_targets = list(m.graph.nodes)
    if len(all_targets) > 0:
        all_targets.remove(0)

    final_goal_idx = random.sample(all_targets, num_envs)
    log.info('Goals: %r', final_goal_idx)

    # noinspection PyProtectedMember
    navigator._ensure_paths_to_goal_calculated([m] * num_envs, final_goal_idx)
    path_lengths = [0] * num_envs
    for env_i in range(num_envs):
        location, path_length = 0, 0
        while location != final_goal_idx[env_i]:
            location = navigator.paths[env_i][location]
            path_length += 1
        path_lengths[env_i] = path_length

    frames = 0
    next_target, next_target_d = navigator.get_next_target(
        [m] * num_envs, observations, final_goal_idx, [frames] * num_envs,
    )
    next_target_obs = [m.get_observation(t) for t in next_target]

    avg_speed = [-1] * num_envs
    success = [False] * num_envs

    t = Timing()
    while True:
        with t.timeit('frame'):
            with t.timeit('policy'):
                actions = policy_step(agent, obs_prev, observations, next_target_obs, final_goal_idx)

            with t.timeit('step'):
                env_obs, rew, done, info = multi_env.step(actions)

            obs_prev = observations
            observations = main_observation(env_obs)

            with t.timeit('navigator'):
                next_target, next_target_d = navigator.get_next_target(
                    [m] * num_envs, observations, final_goal_idx, [frames] * num_envs,
                )

            for env_i in range(num_envs):
                if final_goal_idx[env_i] is None:
                    continue

                if next_target[env_i] is None:
                    log.warning(
                        'Agent %d got lost in %d steps trying to reach %d', env_i, frames, final_goal_idx[env_i],
                    )
                    final_goal_idx[env_i] = None
                else:
                    if next_target[env_i] == final_goal_idx[env_i] and next_target_d[env_i] < 0.1:
                        success[env_i] = True
                        avg_speed[env_i] = path_lengths[env_i] / (frames + 1)
                        log.debug(
                            'Agent %d reached goal %d in %d steps, avg. speed %.3f',
                            env_i, final_goal_idx[env_i], frames, avg_speed[env_i],
                        )
                        final_goal_idx[env_i] = None

                    next_target_obs[env_i] = m.get_observation(next_target[env_i])

            frames += 1
            if frames > 5000:
                log.error('Timeout! 5000 frames was not enough to finish locomotion!')
                break

        finished = [g is None for g in final_goal_idx]
        if all(finished):
            log.info('Done!')
            break
        else:
            if frames % 10 == 0:
                frame_repeat = 4
                fps = (1.0 / t.frame) * frame_repeat * num_envs
                log.info('%d agents remaining, fps %.3f, time %s', num_envs - sum(finished), fps, t)

    return success, avg_speed
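
The path-length loop above follows the navigator's successor table hop by hop. Since m.graph.nodes suggests the topological map is backed by a NetworkX graph, the same quantity could also be computed directly (a sketch under that assumption):

import networkx as nx

def path_length_to_goal(graph, goal_idx, start_idx=0):
    """Number of edges on a shortest path from start_idx to goal_idx."""
    return nx.shortest_path_length(graph, source=start_idx, target=goal_idx)
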
def train_loop(agent, multi_env):
    params = agent.params

    observations = main_observation(multi_env.reset())
    infos = multi_env.info()

    trajectory_buffer = TrajectoryBuffer(multi_env.num_envs)

    step, env_steps = agent.session.run([agent.curiosity.distance.step, agent.total_env_steps])

    loop_time = deque([], maxlen=2500)
    advanced_steps = deque([], maxlen=2500)

    t = Timing()

    complete_trajectories = []
    num_to_process = 20

    test_buffer = Buffer()
    num_test_data = 5000

    while True:
        with t.timeit('loop'):
            with t.timeit('step'):
                actions = np.random.randint(0, agent.actor_critic.num_actions, params.num_envs)
                new_obs, rewards, dones, new_infos = multi_env.step(actions)

            with t.timeit('misc'):
                trajectory_buffer.add(observations, actions, infos, dones)

                observations = main_observation(new_obs)
                infos = new_infos

                num_steps_delta = num_env_steps(infos)
                env_steps += num_steps_delta

                complete_trajectories.extend(trajectory_buffer.complete_trajectories)
                trajectory_buffer.reset_trajectories()

            with t.timeit('train'):
                while len(complete_trajectories) > num_to_process:
                    buffer = generate_training_data(complete_trajectories[:num_to_process], params)
                    complete_trajectories = complete_trajectories[num_to_process:]

                    if len(test_buffer) <= 0:
                        buffer.shuffle_data()

                        test_buffer = Buffer()
                        test_buffer.add_buff(buffer, max_to_add=num_test_data)
                    else:
                        step = agent.curiosity.distance.train(buffer, env_steps, agent)

                    agent.curiosity.distance.calc_test_error(test_buffer, env_steps, agent)

            if t.train > 1.0:
                log.debug('Training time: %s', t)

        loop_time.append(t.loop)
        advanced_steps.append(num_steps_delta)

        if env_steps % 100 == 0:
            avg_fps = sum(advanced_steps) / sum(loop_time)
            log.info('Step %d, avg. fps %.1f, training steps %d, timing: %s', env_steps, avg_fps, step, t)
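
Both training loops carve out a fixed test set the first time enough data is available and never train on it afterwards, so the reported test error stays an honest generalization estimate. The idea in isolation (hypothetical helper, not the project's Buffer API):

import random

def hold_out_test_set(samples, num_test):
    """Shuffle once and split into (train, test); the test part must never
    be passed to the training step afterwards."""
    samples = list(samples)
    random.shuffle(samples)
    return samples[num_test:], samples[:num_test]
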
def train_loop(agent, multi_env):
    params = agent.params

    observations = main_observation(multi_env.reset())
    infos = multi_env.info()

    trajectory_buffer = TmaxTrajectoryBuffer(multi_env.num_envs)
    locomotion_buffer = LocomotionBuffer(params)

    num_test_data = 5000
    locomotion_buffer_test = LocomotionBuffer(params)

    step, env_steps = agent.session.run(
        [agent.locomotion.step, agent.total_env_steps])

    loop_time = deque([], maxlen=2500)
    advanced_steps = deque([], maxlen=2500)

    t = Timing()

    while True:
        with t.timeit('loop'):
            with t.timeit('step'):
                actions = np.random.randint(0, agent.actor_critic.num_actions,
                                            params.num_envs)
                new_obs, rewards, dones, new_infos = multi_env.step(actions)

            with t.timeit('misc'):
                trajectory_buffer.add(
                    observations,
                    actions,
                    infos,
                    dones,
                    tmax_mgr=agent.tmax_mgr,
                    is_random=[True] * params.num_envs,
                )

                observations = main_observation(new_obs)
                infos = new_infos

                num_steps_delta = num_env_steps(infos)
                env_steps += num_steps_delta

            with t.timeit('train'):
                locomotion_buffer.extract_data(
                    trajectory_buffer.complete_trajectories)
                trajectory_buffer.reset_trajectories()

                if (len(locomotion_buffer.buffer) >=
                        params.locomotion_experience_replay_buffer):
                    if len(locomotion_buffer_test.buffer) <= 0:
                        log.info(
                            'Prepare test data that we will never see during training...'
                        )
                        locomotion_buffer.shuffle_data()
                        locomotion_buffer_test.buffer.add_buff(
                            locomotion_buffer.buffer, max_to_add=num_test_data)

                        # noinspection PyProtectedMember
                        log.info(
                            'Test buffer size %d, capacity %d',
                            locomotion_buffer_test.buffer._size,
                            locomotion_buffer_test.buffer._capacity,
                        )
                    else:
                        step = train_locomotion_net(agent, locomotion_buffer,
                                                    params, env_steps)

                    locomotion_buffer.reset()
                    calc_test_error(agent, locomotion_buffer_test, params,
                                    env_steps)
                    calc_test_error(agent,
                                    locomotion_buffer_test,
                                    params,
                                    env_steps,
                                    bn_training=True)

            if t.train > 1.0:
                log.debug('Train time: %s', t)

        loop_time.append(t.loop)
        advanced_steps.append(num_steps_delta)

        if env_steps % 100 == 0:
            avg_fps = sum(advanced_steps) / sum(loop_time)
            log.info('Step %d, avg. fps %.1f, training steps %d, timing: %s',
                     env_steps, avg_fps, step, t)
def enjoy(params,
          env_id,
          max_num_episodes=1,
          max_num_frames=1e10,
          render=False):
    def make_env_func():
        e = create_env(env_id, mode='train', skip_frames=True)
        e.seed(0)
        return e

    agent = AgentRandom(make_env_func, params.load())
    env = make_env_func()

    # this helps with screen recording
    pause_at_the_beginning = False
    if pause_at_the_beginning:
        env.render()
        log.info('Press any key to start...')
        cv2.waitKey()

    agent.initialize()

    episode_rewards = []
    num_frames = 0

    histogram = setup_histogram(agent)

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False
        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)

        episode_reward = []

        while not done and not max_frames_reached(num_frames):
            start = time.time()
            if render:
                env.render()

            action = agent.best_action([obs],
                                       goals=[goal_obs],
                                       deterministic=False)
            env_obs, rew, done, info = env.step(action)
            if done:
                log.warning('Done flag is true %d, rew: %.3f, num_frames %d',
                            done, rew, num_frames)

            update_coverage(agent, [info], histogram)

            episode_reward.append(rew)

            if num_frames % 100 == 0:
                log.info('fps: %.1f, rew: %d, done: %s, frames %d',
                         1.0 / (time.time() - start), rew, done, num_frames)

            write_summaries(agent, histogram, num_frames)

            num_frames += num_env_steps([info])

        if render:
            env.render()
        time.sleep(0.2)

        episode_rewards.append(sum(episode_reward))
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f',
            sum(episode_reward),
            len(last_episodes),
            avg_reward,
        )

        if max_frames_reached(num_frames):
            break

    write_summaries(agent, histogram, num_frames, force=True)

    agent.finalize()
    env.close()
    cv2.destroyAllWindows()
def enjoy(params,
          env_id,
          max_num_episodes=1000,
          max_num_frames=None,
          show_automap=False):
    def make_env_func():
        e = create_env(env_id, mode='test', show_automap=show_automap)
        e.seed(0)
        return e

    params = params.load()
    params.num_envs = 1  # during execution we're only using one env
    agent = AgentTMAX(make_env_func, params)
    env = make_env_func()

    agent.initialize()

    global persistent_map
    if agent.params.persistent_map_checkpoint is not None:
        persistent_map = TopologicalMap.create_empty()
        persistent_map.maybe_load_checkpoint(
            agent.params.persistent_map_checkpoint)

    global current_landmark

    episode_rewards = []
    num_frames = 0

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False

        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)
        prev_obs = obs
        if current_landmark is None:
            current_landmark = obs

        if goal_obs is not None:
            goal_obs_rgb = cv2.cvtColor(goal_obs, cv2.COLOR_BGR2RGB)
            cv2.imshow('goal', cv2.resize(goal_obs_rgb, (500, 500)))
            cv2.waitKey(500)

        episode_reward, episode_frames = 0, 0

        if not agent.tmax_mgr.initialized:
            agent.tmax_mgr.initialize([obs], [info], env_steps=0)
            persistent_map = agent.tmax_mgr.dense_persistent_maps[-1]
            sparse_persistent_map = agent.tmax_mgr.sparse_persistent_maps[-1]
            log.debug('Num landmarks in sparse map: %d',
                      sparse_persistent_map.num_landmarks())

        agent.curiosity.initialized = True
        agent.tmax_mgr.mode[0] = TmaxMode.EXPLORATION
        agent.tmax_mgr.locomotion_final_targets[0] = None
        agent.tmax_mgr.locomotion_targets[0] = None

        start_episode = time.time()
        t = Timing()

        while not done and not terminate and not max_frames_reached(num_frames):
            with t.timeit('one_frame'):
                env.render()
                cv2.waitKey(1)  # to prevent window from fading

                if pause:
                    time.sleep(0.01)
                    continue

                if len(current_actions) > 0:
                    # key combinations are not handled, but this is purely for testing
                    action = current_actions[-1]
                else:
                    action = 0

                if policy_type == PolicyType.PLAYER:
                    pass
                elif policy_type == PolicyType.RANDOM:
                    action = env.action_space.sample()
                elif policy_type == PolicyType.AGENT:
                    agent.tmax_mgr.mode[0] = TmaxMode.EXPLORATION
                    action, *_ = agent.policy_step([prev_obs], [obs],
                                                   [goal_obs], None, None)
                    action = action[0]
                elif policy_type == PolicyType.LOCOMOTION:
                    agent.tmax_mgr.mode[0] = TmaxMode.LOCOMOTION
                    action, _, _ = agent.loco_actor_critic.invoke(
                        agent.session,
                        [obs],
                        [current_landmark],
                        None,
                        None,
                        [1.0],
                    )
                    action = action[0]

                env_obs, rew, done, info = env.step(action)
                next_obs = main_observation(env_obs)
                goal_obs = goal_observation(env_obs)

                _, _ = agent.tmax_mgr.update(
                    [obs],
                    [next_obs],
                    [rew],
                    [done],
                    [info],
                    num_frames,
                    t,
                    verbose=True,
                )

                prev_obs = obs
                obs = next_obs

                calc_distance_to_memory(agent, sparse_persistent_map, obs)
                calc_value_estimate(agent, obs)

                episode_reward += rew

                num_frames += 1
                episode_frames += 1

            took_seconds = t.one_frame
            desired_fps = 15  # frames are repeated 4x here, so this is effectively 60 fps
            wait_seconds = (1.0 / desired_fps) - took_seconds
            wait_seconds = max(0.0, wait_seconds)
            if wait_seconds > EPS:
                time.sleep(wait_seconds)

        env.render()
        log.info('Actual fps: %.1f',
                 episode_frames / (time.time() - start_episode))
        time.sleep(0.2)

        episode_rewards.append(episode_reward)
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f',
            episode_reward,
            len(last_episodes),
            avg_reward,
        )

        if max_frames_reached(num_frames) or terminate:
            break

    agent.finalize()
    env.close()
    cv2.destroyAllWindows()

    return 0
Example #10
def test_locomotion(params, env_id):
    def make_env_func():
        e = create_env(env_id, skip_frames=True)
        e.seed(0)
        return e

    # params = params.load()
    # params.ensure_serialized()

    params.num_envs = 1
    # params.naive_locomotion = True

    agent = AgentTMAX(make_env_func, params)

    agent.initialize()

    env = make_env_func()

    env_obs, info = reset_with_info(env)
    obs_prev = obs = main_observation(env_obs)
    done = False

    if params.persistent_map_checkpoint is not None:
        loaded_persistent_map = TopologicalMap.create_empty()
        loaded_persistent_map.maybe_load_checkpoint(
            params.persistent_map_checkpoint)
    else:
        agent.tmax_mgr.initialize([obs], [info], 1)
        loaded_persistent_map = agent.tmax_mgr.dense_persistent_maps[-1]

    m = loaded_persistent_map

    t = Timing()

    log.info('Num landmarks: %d', m.num_landmarks())
    final_goal_idx = 49

    log.info('Locomotion goal is %d', final_goal_idx)

    # localizer = Localizer(m, agent)

    final_goal_obs = m.get_observation(final_goal_idx)
    cv2.namedWindow('next_target')
    cv2.moveWindow('next_target', 800, 100)
    cv2.namedWindow('final_goal')
    cv2.moveWindow('final_goal', 1400, 100)
    display_obs('next_target', obs)
    display_obs('final_goal', final_goal_obs)
    cv2.waitKey(1)

    # localizer.current_landmark = 0
    # next_target = localizer.get_next_target(obs, final_goal_idx)
    # next_target_obs = m.get_observation(next_target)

    frame = 0

    if params.naive_locomotion:
        navigator = NavigatorNaive(agent)
    else:
        navigator = Navigator(agent)

    navigator.reset(0, m)

    next_target, next_target_d = navigator.get_next_target(
        [m],
        [obs],
        [final_goal_idx],
        [frame],
    )
    next_target, next_target_d = next_target[0], next_target_d[0]
    next_target_obs = m.get_observation(next_target)

    while not done and not terminate:
        with t.timeit('one_frame'):
            env.render()
            if not pause:
                # randomly alternate between deterministic and sampled actions
                deterministic = random.random() >= 0.5

                if params.naive_locomotion:
                    action = navigator.replay_action([0])[0]
                else:
                    action = agent.locomotion.navigate(
                        agent.session,
                        [obs_prev],
                        [obs],
                        [next_target_obs],
                        deterministic=deterministic,
                    )[0]

                env_obs, rew, done, info = env.step(action)

                log.info('Action is %d', action)
                obs_prev = obs
                obs = main_observation(env_obs)

                next_target, next_target_d = navigator.get_next_target(
                    [m],
                    [obs],
                    [final_goal_idx],
                    [frame],
                )
                next_target, next_target_d = next_target[0], next_target_d[0]
                if next_target is None:
                    log.error('We are lost!')
                else:
                    log.info('Next target is %d with distance %.3f!',
                             next_target, next_target_d)
                    display_obs('next_target', next_target_obs)
                    cv2.waitKey(1)

                if next_target is not None:
                    next_target_obs = m.get_observation(next_target)

                log.info('Frame %d...', frame)

        took_seconds = t.one_frame
        desired_fps = 10
        wait_seconds = (1.0 / desired_fps) - took_seconds
        wait_seconds = max(0.0, wait_seconds)
        if wait_seconds > EPS:
            time.sleep(wait_seconds)

        if not pause:
            frame += 1

    log.info('After loop')

    env.render()
    time.sleep(0.05)

    env.close()
    agent.finalize()
    return 0
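
display_obs is a project helper; a plausible stand-in (an assumption about its behavior, based on how it is called above) simply shows the observation in a named OpenCV window:

import cv2

def display_obs(window_name, obs, size=(420, 420)):
    """Show an observation image in an OpenCV window; assumes obs is an
    RGB uint8 array, which OpenCV expects in BGR order."""
    cv2.imshow(window_name, cv2.resize(cv2.cvtColor(obs, cv2.COLOR_RGB2BGR), size))
    cv2.waitKey(1)
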
Example #11
    def _learn_loop(self, multi_env):
        """Main training loop."""
        # env_steps used in tensorboard (and thus, our results)
        # actor_step used as global step for training
        step, env_steps = self.session.run(
            [self.actor_step, self.total_env_steps])

        env_obs = multi_env.reset()
        obs, goals = main_observation(env_obs), goal_observation(env_obs)

        buffer = CuriousPPOBuffer()
        trajectory_buffer = TrajectoryBuffer(self.params.num_envs)
        self.curiosity.set_trajectory_buffer(trajectory_buffer)

        def end_of_training(s, es):
            return s >= self.params.train_for_steps or es > self.params.train_for_env_steps

        while not end_of_training(step, env_steps):
            timing = Timing()
            num_steps = 0
            batch_start = time.time()

            buffer.reset()

            with timing.timeit('experience'):
                # collecting experience
                for rollout_step in range(self.params.rollout):
                    actions, action_probs, values = self._policy_step(
                        obs, goals)

                    # wait for all the workers to complete an environment step
                    env_obs, rewards, dones, infos = multi_env.step(actions)

                    if self.params.graceful_episode_termination:
                        rewards = list(rewards)
                        for i in range(self.params.num_envs):
                            if dones[i] and infos[i].get('prev') is not None:
                                if infos[i]['prev'].get(
                                        'terminated_by_timer', False):
                                    log.info('Env %d terminated by timer', i)
                                    rewards[i] += values[i]

                    if not self.params.random_exploration:
                        trajectory_buffer.add(obs, actions, infos, dones)

                    next_obs = main_observation(env_obs)
                    new_goals = goal_observation(env_obs)

                    # calculate curiosity bonus
                    with timing.add_time('curiosity'):
                        if not self.params.random_exploration:
                            bonuses = self.curiosity.generate_bonus_rewards(
                                self.session,
                                obs,
                                next_obs,
                                actions,
                                dones,
                                infos,
                            )
                            rewards = self.params.extrinsic_reward_coeff * np.array(
                                rewards) + bonuses

                    # add experience from environment to the current buffer
                    buffer.add(obs, next_obs, actions, action_probs, rewards,
                               dones, values, goals)

                    obs, goals = next_obs, new_goals
                    self.process_infos(infos)
                    num_steps += num_env_steps(infos)

                # last step values are required for TD-return calculation
                _, _, values = self._policy_step(obs, goals)
                buffer.values.append(values)

            env_steps += num_steps

            # calculate discounted returns and GAE
            buffer.finalize_batch(self.params.gamma, self.params.gae_lambda)

            # update actor and critic and CM
            with timing.timeit('train'):
                step = self._train_with_curiosity(step, buffer, env_steps,
                                                  timing)

            avg_reward = multi_env.calc_avg_rewards(
                n=self.params.stats_episodes)
            avg_length = multi_env.calc_avg_episode_lengths(
                n=self.params.stats_episodes)

            self._maybe_update_avg_reward(avg_reward,
                                          multi_env.stats_num_episodes())
            self._maybe_trajectory_summaries(trajectory_buffer, env_steps)
            self._maybe_coverage_summaries(env_steps)
            self.curiosity.additional_summaries(
                env_steps,
                self.summary_writer,
                self.params.stats_episodes,
                map_img=self.map_img,
                coord_limits=self.coord_limits,
            )

            trajectory_buffer.reset_trajectories()

            fps = num_steps / (time.time() - batch_start)
            self._maybe_print(step, env_steps, avg_reward, avg_length, fps,
                              timing)
            self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps)
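
The graceful_episode_termination branch above adds the critic's value estimate to the reward when an episode ends only because of a time limit, so the truncated return is bootstrapped instead of being treated as a true terminal state. Restated on its own (hypothetical argument names, illustrative only):

import numpy as np

def bootstrap_timeout_rewards(rewards, values, dones, timed_out):
    """Add the value estimate to rewards of episodes cut off by a timer,
    approximating the return the agent would still have collected."""
    rewards = np.array(rewards, dtype=np.float32)
    for i in range(len(rewards)):
        if dones[i] and timed_out[i]:
            rewards[i] += values[i]
    return rewards
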