def test_multi_env_performance(test, env_type, num_envs, num_workers):
    """Benchmark a vectorized MultiEnv by stepping all environments with a fixed action and measuring FPS."""
    t = Timing()
    with t.timeit('init'):
        multi_env = MultiEnv(num_envs,
                             num_workers,
                             test.make_env,
                             stats_episodes=100)
        total_num_frames, frames = 20000, 0

    with t.timeit('first_reset'):
        multi_env.reset()

    next_print = print_step = 10000
    with t.timeit('experience'):
        while frames < total_num_frames:
            _, _, done, info = multi_env.step([0] * num_envs)
            frames += num_env_steps(info)
            if frames > next_print:
                log.info('Collected %d frames of experience...', frames)
                next_print += print_step

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames in parallel, %.1f FPS',
              t.experience, total_num_frames, fps)
    log.debug('Timing: %s', t)

    multi_env.close()
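
# A minimal sketch of how the benchmark above might be invoked; the TestObject
# container and the CartPole environment are assumptions for illustration, not
# part of the original code (which passes a test fixture providing make_env).
import gym
from collections import namedtuple

TestObject = namedtuple('TestObject', ['make_env'])

def run_multi_env_benchmark():
    test = TestObject(make_env=lambda *args, **kwargs: gym.make('CartPole-v1'))
    test_multi_env_performance(test, env_type='cartpole', num_envs=16, num_workers=4)
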
    def _learn_loop(self, multi_env):
        """Main training loop."""
        step, env_steps = self.session.run([self.actor_step, self.total_env_steps])

        env_obs = multi_env.reset()
        observations, goals = main_observation(env_obs), goal_observation(env_obs)
        buffer = PPOBuffer()

        def end_of_training(s, es):
            return s >= self.params.train_for_steps or es > self.params.train_for_env_steps

        while not end_of_training(step, env_steps):
            timing = Timing()
            num_steps = 0
            batch_start = time.time()

            buffer.reset()

            with timing.timeit('experience'):
                # collecting experience
                for rollout_step in range(self.params.rollout):
                    actions, action_probs, values = self.actor_critic.invoke(self.session, observations, goals=goals)

                    # wait for all the workers to complete an environment step
                    env_obs, rewards, dones, infos = multi_env.step(actions)
                    self.process_infos(infos)
                    new_observations, new_goals = main_observation(env_obs), goal_observation(env_obs)

                    # add experience from all environments to the current buffer
                    buffer.add(observations, actions, action_probs, rewards, dones, values, goals)
                    observations = new_observations
                    goals = new_goals

                    num_steps += num_env_steps(infos)

                # last step values are required for TD-return calculation
                _, _, values = self.actor_critic.invoke(self.session, observations, goals=goals)
                buffer.values.append(values)

            env_steps += num_steps

            # calculate discounted returns and GAE
            buffer.finalize_batch(self.params.gamma, self.params.gae_lambda)

            # update actor and critic
            with timing.timeit('train'):
                step = self._train(buffer, env_steps)

            avg_reward = multi_env.calc_avg_rewards(n=self.params.stats_episodes)
            avg_length = multi_env.calc_avg_episode_lengths(n=self.params.stats_episodes)
            fps = num_steps / (time.time() - batch_start)

            self._maybe_print(step, env_steps, avg_reward, avg_length, fps, timing)
            self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps)
            self._maybe_update_avg_reward(avg_reward, multi_env.stats_num_episodes())
            self._maybe_coverage_summaries(env_steps)
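
# A rough sketch of the discounted-return / GAE pass that finalize_batch performs
# above; this is the standard GAE(lambda) recurrence over flat per-env arrays and
# is an assumption, not necessarily the exact PPOBuffer implementation.
import numpy as np

def gae_advantages(rewards, values, dones, gamma, gae_lambda):
    """values contains one extra element: the bootstrap value after the last step."""
    num_steps = len(rewards)
    advantages = np.zeros(num_steps, dtype=np.float32)
    gae = 0.0
    for t in reversed(range(num_steps)):
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        gae = delta + gamma * gae_lambda * nonterminal * gae
        advantages[t] = gae
    returns = advantages + np.asarray(values[:-1], dtype=np.float32)
    return advantages, returns
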
    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        self._num_steps += num_env_steps([info])

        # enforce the frame budget: mark episodes we cut short so the caller
        # can distinguish a timeout from a true terminal state
        if not done and self._num_steps >= self._terminate_in:
            done = True
            info[self.terminated_by_timer] = True

        return observation, reward, done, info
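
# For context, a minimal sketch of the wrapper class the step() method above
# likely belongs to: a gym.Wrapper enforcing a frame budget. Everything except
# the attributes already referenced above (_num_steps, _terminate_in,
# terminated_by_timer) is an assumption.
import gym

class TimeLimitWrapper(gym.Wrapper):
    terminated_by_timer = 'terminated_by_timer'  # info key set on timeout

    def __init__(self, env, limit):
        super().__init__(env)
        self._terminate_in = limit  # terminate the episode after this many frames
        self._num_steps = 0

    def reset(self, **kwargs):
        self._num_steps = 0
        return self.env.reset(**kwargs)
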
def test_env_performance(make_env, env_type, verbose=False):
    """Benchmark a single environment with random actions on one CPU and measure FPS."""
    t = Timing()
    with t.timeit('init'):
        env = make_env(AttrDict({'worker_index': 0, 'vector_index': 0}))
        total_num_frames, frames = 10000, 0

    with t.timeit('first_reset'):
        env.reset()

    t.reset = t.step = 1e-9
    num_resets = 0
    with t.timeit('experience'):
        while frames < total_num_frames:
            done = False

            start_reset = time.time()
            env.reset()

            t.reset += time.time() - start_reset
            num_resets += 1

            while not done and frames < total_num_frames:
                start_step = time.time()
                if verbose:
                    env.render()
                    time.sleep(1.0 / 40)

                obs, rew, done, info = env.step(env.action_space.sample())
                if verbose:
                    log.info('Received reward %.3f', rew)

                t.step += time.time() - start_step
                frames += num_env_steps([info])

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames on one CPU, %.1f FPS',
              t.experience,
              total_num_frames,
              fps)
    log.debug('Avg. reset time %.3f s', t.reset / num_resets)
    log.debug('Timing: %s', t)
    env.close()
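
# A minimal sketch of the AttrDict helper passed to make_env above, assuming it
# is simply a dict whose keys are also accessible as attributes.
class AttrDict(dict):
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    def __setattr__(self, key, value):
        self[key] = value
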
def test_env_performance(test, env_type):
    """Benchmark a single environment driven by AgentRandom and measure FPS."""
    t = Timing()
    with t.timeit('init'):
        env = test.make_env()
        total_num_frames, frames = 4000, 0
        agent = AgentRandom(test.make_env, {})

    with t.timeit('first_reset'):
        env.reset()

    t.reset = t.step = 1e-9
    num_resets = 0
    with t.timeit('experience'):
        while frames < total_num_frames:
            done = False

            start_reset = time.time()
            env.reset()

            t.reset += time.time() - start_reset
            num_resets += 1

            while not done and frames < total_num_frames:
                start_step = time.time()
                obs, rew, done, info = env.step(agent.best_action())
                t.step += time.time() - start_step
                frames += num_env_steps([info])

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames on one CPU, %.1f FPS',
              t.experience, total_num_frames, fps)
    log.debug('Avg. reset time %.3f s', t.reset / num_resets)
    log.debug('Timing: %s', t)

    env.close()
def train_loop(agent, multi_env):
    """Collect experience with random actions and train the curiosity distance network online."""
    params = agent.params

    observations = main_observation(multi_env.reset())
    infos = multi_env.info()

    trajectory_buffer = TrajectoryBuffer(multi_env.num_envs)

    step, env_steps = agent.session.run([agent.curiosity.distance.step, agent.total_env_steps])

    loop_time = deque([], maxlen=2500)
    advanced_steps = deque([], maxlen=2500)

    t = Timing()

    complete_trajectories = []
    num_to_process = 20

    test_buffer = Buffer()
    num_test_data = 5000

    while True:
        with t.timeit('loop'):
            with t.timeit('step'):
                actions = np.random.randint(0, agent.actor_critic.num_actions, params.num_envs)
                new_obs, rewards, dones, new_infos = multi_env.step(actions)

            with t.timeit('misc'):
                trajectory_buffer.add(observations, actions, infos, dones)

                observations = main_observation(new_obs)
                infos = new_infos

                num_steps_delta = num_env_steps(infos)
                env_steps += num_steps_delta

                complete_trajectories.extend(trajectory_buffer.complete_trajectories)
                trajectory_buffer.reset_trajectories()

            with t.timeit('train'):
                while len(complete_trajectories) > num_to_process:
                    buffer = generate_training_data(complete_trajectories[:num_to_process], params)
                    complete_trajectories = complete_trajectories[num_to_process:]

                    if len(test_buffer) <= 0:
                        buffer.shuffle_data()

                        test_buffer = Buffer()
                        test_buffer.add_buff(buffer, max_to_add=num_test_data)
                    else:
                        step = agent.curiosity.distance.train(buffer, env_steps, agent)

                    agent.curiosity.distance.calc_test_error(test_buffer, env_steps, agent)

            if t.train > 1.0:
                log.debug('Training time: %s', t)

        loop_time.append(t.loop)
        advanced_steps.append(num_steps_delta)

        if env_steps % 100 == 0:
            avg_fps = sum(advanced_steps) / sum(loop_time)
            log.info('Step %d, avg. fps %.1f, training steps %d, timing: %s', env_steps, avg_fps, step, t)
def train_loop(agent, multi_env):
    """Collect experience with random actions and train the locomotion network online."""
    params = agent.params

    observations = main_observation(multi_env.reset())
    infos = multi_env.info()

    trajectory_buffer = TmaxTrajectoryBuffer(multi_env.num_envs)
    locomotion_buffer = LocomotionBuffer(params)

    num_test_data = 5000
    locomotion_buffer_test = LocomotionBuffer(params)

    step, env_steps = agent.session.run(
        [agent.locomotion.step, agent.total_env_steps])

    loop_time = deque([], maxlen=2500)
    advanced_steps = deque([], maxlen=2500)

    t = Timing()

    while True:
        with t.timeit('loop'):
            with t.timeit('step'):
                actions = np.random.randint(0, agent.actor_critic.num_actions,
                                            params.num_envs)
                new_obs, rewards, dones, new_infos = multi_env.step(actions)

            with t.timeit('misc'):
                trajectory_buffer.add(
                    observations,
                    actions,
                    infos,
                    dones,
                    tmax_mgr=agent.tmax_mgr,
                    is_random=[True] * params.num_envs,
                )

                observations = main_observation(new_obs)
                infos = new_infos

                num_steps_delta = num_env_steps(infos)
                env_steps += num_steps_delta

            with t.timeit('train'):
                locomotion_buffer.extract_data(
                    trajectory_buffer.complete_trajectories)
                trajectory_buffer.reset_trajectories()

                if len(locomotion_buffer.buffer) >= params.locomotion_experience_replay_buffer:
                    if len(locomotion_buffer_test.buffer) <= 0:
                        log.info('Preparing held-out test data never used during training...')
                        locomotion_buffer.shuffle_data()
                        locomotion_buffer_test.buffer.add_buff(
                            locomotion_buffer.buffer, max_to_add=num_test_data)

                        # noinspection PyProtectedMember
                        log.info(
                            'Test buffer size %d, capacity %d',
                            locomotion_buffer_test.buffer._size,
                            locomotion_buffer_test.buffer._capacity,
                        )
                    else:
                        step = train_locomotion_net(agent, locomotion_buffer,
                                                    params, env_steps)

                    locomotion_buffer.reset()
                    calc_test_error(agent, locomotion_buffer_test, params,
                                    env_steps)
                    calc_test_error(agent,
                                    locomotion_buffer_test,
                                    params,
                                    env_steps,
                                    bn_training=True)

            if t.train > 1.0:
                log.debug('Train time: %s', t)

        loop_time.append(t.loop)
        advanced_steps.append(num_steps_delta)

        if env_steps % 100 == 0:
            avg_fps = sum(advanced_steps) / sum(loop_time)
            log.info('Step %d, avg. fps %.1f, training steps %d, timing: %s',
                     env_steps, avg_fps, step, t)
def enjoy(params,
          env_id,
          max_num_episodes=1,
          max_num_frames=1e10,
          render=False):
    """Run a random agent in the environment, logging rewards and writing coverage summaries."""
    def make_env_func():
        e = create_env(env_id, mode='train', skip_frames=True)
        e.seed(0)
        return e

    agent = AgentRandom(make_env_func, params.load())
    env = make_env_func()

    # this helps with screen recording
    pause_at_the_beginning = False
    if pause_at_the_beginning:
        env.render()
        log.info('Press any key to start...')
        cv2.waitKey()

    agent.initialize()

    episode_rewards = []
    num_frames = 0

    histogram = setup_histogram(agent)

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False
        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)

        episode_reward = []

        while not done and not max_frames_reached(num_frames):
            start = time.time()
            if render:
                env.render()

            action = agent.best_action([obs],
                                       goals=[goal_obs],
                                       deterministic=False)
            env_obs, rew, done, info = env.step(action)
            if done:
                log.warning('Done flag is true %d, rew: %.3f, num_frames %d',
                            done, rew, num_frames)

            update_coverage(agent, [info], histogram)

            episode_reward.append(rew)

            if num_frames % 100 == 0:
                log.info('fps: %.1f, rew: %d, done: %s, frames %d',
                         1.0 / (time.time() - start), rew, done, num_frames)

            write_summaries(agent, histogram, num_frames)

            num_frames += num_env_steps([info])

        if render:
            env.render()
        time.sleep(0.2)

        episode_rewards.append(sum(episode_reward))
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f',
            sum(episode_reward),
            len(last_episodes),
            avg_reward,
        )

        if max_frames_reached(num_frames):
            break

    write_summaries(agent, histogram, num_frames, force=True)

    agent.finalize()
    env.close()
    cv2.destroyAllWindows()
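
# A hypothetical entry point for the evaluation loop above; parse_args_random
# is a placeholder for whatever builds the params object and env id in the
# original project, not a real helper from it.
import sys

def main():
    params, env_id = parse_args_random()  # hypothetical CLI / config parsing
    return enjoy(params, env_id, max_num_episodes=10, render=True)

if __name__ == '__main__':
    sys.exit(main())
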
    def _learn_loop(self, multi_env):
        """Main training loop."""
        # env_steps used in tensorboard (and thus, our results)
        # actor_step used as global step for training
        step, env_steps = self.session.run(
            [self.actor_step, self.total_env_steps])

        env_obs = multi_env.reset()
        obs, goals = main_observation(env_obs), goal_observation(env_obs)

        buffer = CuriousPPOBuffer()
        trajectory_buffer = TrajectoryBuffer(self.params.num_envs)
        self.curiosity.set_trajectory_buffer(trajectory_buffer)

        def end_of_training(s, es):
            return s >= self.params.train_for_steps or es > self.params.train_for_env_steps

        while not end_of_training(step, env_steps):
            timing = Timing()
            num_steps = 0
            batch_start = time.time()

            buffer.reset()

            with timing.timeit('experience'):
                # collecting experience
                for rollout_step in range(self.params.rollout):
                    actions, action_probs, values = self._policy_step(
                        obs, goals)

                    # wait for all the workers to complete an environment step
                    env_obs, rewards, dones, infos = multi_env.step(actions)

                    if self.params.graceful_episode_termination:
                        rewards = list(rewards)
                        for i in range(self.params.num_envs):
                            # an episode cut short by the time limit is not a true
                            # terminal state, so bootstrap its last reward with the
                            # value estimate instead of treating the return as zero
                            if dones[i] and infos[i].get('prev') is not None:
                                if infos[i]['prev'].get('terminated_by_timer', False):
                                    log.info('Env %d terminated by timer', i)
                                    rewards[i] += values[i]

                    if not self.params.random_exploration:
                        trajectory_buffer.add(obs, actions, infos, dones)

                    next_obs = main_observation(env_obs)
                    new_goals = goal_observation(env_obs)

                    # calculate curiosity bonus
                    with timing.add_time('curiosity'):
                        if not self.params.random_exploration:
                            bonuses = self.curiosity.generate_bonus_rewards(
                                self.session,
                                obs,
                                next_obs,
                                actions,
                                dones,
                                infos,
                            )
                            extrinsic = self.params.extrinsic_reward_coeff * np.array(rewards)
                            rewards = extrinsic + bonuses

                    # add experience from environment to the current buffer
                    buffer.add(obs, next_obs, actions, action_probs, rewards,
                               dones, values, goals)

                    obs, goals = next_obs, new_goals
                    self.process_infos(infos)
                    num_steps += num_env_steps(infos)

                # last step values are required for TD-return calculation
                _, _, values = self._policy_step(obs, goals)
                buffer.values.append(values)

            env_steps += num_steps

            # calculate discounted returns and GAE
            buffer.finalize_batch(self.params.gamma, self.params.gae_lambda)

            # update actor and critic and CM
            with timing.timeit('train'):
                step = self._train_with_curiosity(step, buffer, env_steps,
                                                  timing)

            avg_reward = multi_env.calc_avg_rewards(
                n=self.params.stats_episodes)
            avg_length = multi_env.calc_avg_episode_lengths(
                n=self.params.stats_episodes)

            self._maybe_update_avg_reward(avg_reward,
                                          multi_env.stats_num_episodes())
            self._maybe_trajectory_summaries(trajectory_buffer, env_steps)
            self._maybe_coverage_summaries(env_steps)
            self.curiosity.additional_summaries(
                env_steps,
                self.summary_writer,
                self.params.stats_episodes,
                map_img=self.map_img,
                coord_limits=self.coord_limits,
            )

            trajectory_buffer.reset_trajectories()

            fps = num_steps / (time.time() - batch_start)
            self._maybe_print(step, env_steps, avg_reward, avg_length, fps,
                              timing)
            self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps)