def test_multi_env_performance(test, env_type, num_envs, num_workers):
    t = Timing()
    with t.timeit('init'):
        multi_env = MultiEnv(num_envs, num_workers, test.make_env, stats_episodes=100)
    total_num_frames, frames = 20000, 0

    with t.timeit('first_reset'):
        multi_env.reset()

    next_print = print_step = 10000
    with t.timeit('experience'):
        while frames < total_num_frames:
            _, _, done, info = multi_env.step([0] * num_envs)
            frames += num_env_steps(info)
            if frames > next_print:
                log.info('Collected %d frames of experience...', frames)
                next_print += print_step

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames in parallel, %.1f FPS', t.experience, total_num_frames, fps)
    log.debug('Timing: %s', t)

    multi_env.close()
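# All of the functions in this file rely on a Timing helper defined elsewhere in
# the repo. What follows is a minimal sketch, not the actual implementation,
# assuming only what the call sites imply: timeit()/add_time() are context
# managers, and each measured duration is stored as an attribute (so
# `t.experience` works after `with t.timeit('experience'):`, and `t.reset += ...`
# works after direct assignment).
import time
from contextlib import contextmanager


class Timing:
    @contextmanager
    def timeit(self, key):
        # measure wall-clock time of the enclosed block, store as an attribute
        start = time.time()
        yield
        setattr(self, key, time.time() - start)

    @contextmanager
    def add_time(self, key):
        # like timeit(), but accumulates over repeated invocations
        start = time.time()
        yield
        setattr(self, key, getattr(self, key, 0.0) + time.time() - start)

    def __str__(self):
        return ', '.join(f'{k}: {v:.3f}' for k, v in vars(self).items())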
def _learn_loop(self, multi_env): """Main training loop.""" step, env_steps = self.session.run([self.actor_step, self.total_env_steps]) env_obs = multi_env.reset() observations, goals = main_observation(env_obs), goal_observation(env_obs) buffer = PPOBuffer() def end_of_training(s, es): return s >= self.params.train_for_steps or es > self.params.train_for_env_steps while not end_of_training(step, env_steps): timing = Timing() num_steps = 0 batch_start = time.time() buffer.reset() with timing.timeit('experience'): # collecting experience for rollout_step in range(self.params.rollout): actions, action_probs, values = self.actor_critic.invoke(self.session, observations, goals=goals) # wait for all the workers to complete an environment step env_obs, rewards, dones, infos = multi_env.step(actions) self.process_infos(infos) new_observations, new_goals = main_observation(env_obs), goal_observation(env_obs) # add experience from all environments to the current buffer buffer.add(observations, actions, action_probs, rewards, dones, values, goals) observations = new_observations goals = new_goals num_steps += num_env_steps(infos) # last step values are required for TD-return calculation _, _, values = self.actor_critic.invoke(self.session, observations, goals=goals) buffer.values.append(values) env_steps += num_steps # calculate discounted returns and GAE buffer.finalize_batch(self.params.gamma, self.params.gae_lambda) # update actor and critic with timing.timeit('train'): step = self._train(buffer, env_steps) avg_reward = multi_env.calc_avg_rewards(n=self.params.stats_episodes) avg_length = multi_env.calc_avg_episode_lengths(n=self.params.stats_episodes) fps = num_steps / (time.time() - batch_start) self._maybe_print(step, env_steps, avg_reward, avg_length, fps, timing) self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps) self._maybe_update_avg_reward(avg_reward, multi_env.stats_num_episodes()) self._maybe_coverage_summaries(env_steps)
def step(self, action):
    observation, reward, done, info = self.env.step(action)
    self._num_steps += num_env_steps([info])

    # never override a natural episode end; otherwise terminate by timer
    # and mark the episode so the learner can distinguish truncation from failure
    if not done and self._num_steps >= self._terminate_in:
        done = True
        info[self.terminated_by_timer] = True

    return observation, reward, done, info
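# The step() above is a method of a time-limit wrapper. For context, a minimal
# sketch of what the enclosing class might look like; the constructor argument
# names and the info key constant are assumptions, the real wrapper in the repo
# may differ (e.g. by randomizing the limit slightly).
import gym


class TimeLimitWrapper(gym.Wrapper):
    terminated_by_timer = 'terminated_by_timer'

    def __init__(self, env, limit):
        super().__init__(env)
        self._terminate_in = limit  # max agent frames before forced termination
        self._num_steps = 0

    def reset(self, **kwargs):
        self._num_steps = 0
        return self.env.reset(**kwargs)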
def test_env_performance(make_env, env_type, verbose=False):
    t = Timing()
    with t.timeit('init'):
        env = make_env(AttrDict({'worker_index': 0, 'vector_index': 0}))
    total_num_frames, frames = 10000, 0

    with t.timeit('first_reset'):
        env.reset()

    t.reset = t.step = 1e-9
    num_resets = 0
    with t.timeit('experience'):
        while frames < total_num_frames:
            done = False

            start_reset = time.time()
            env.reset()
            t.reset += time.time() - start_reset
            num_resets += 1

            while not done and frames < total_num_frames:
                start_step = time.time()
                if verbose:
                    env.render()
                    time.sleep(1.0 / 40)

                obs, rew, done, info = env.step(env.action_space.sample())
                if verbose:
                    log.info('Received reward %.3f', rew)

                t.step += time.time() - start_step
                frames += num_env_steps([info])

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames on one CPU, %.1f FPS', t.experience, total_num_frames, fps)
    log.debug('Avg. reset time %.3f s', t.reset / num_resets)
    log.debug('Timing: %s', t)

    env.close()
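# Frame counting everywhere in this file goes through num_env_steps() rather
# than a simple +1, because frame-skipping envs advance several simulator frames
# per agent action. A minimal sketch of the helper, assuming each info dict may
# report the number of frames actually simulated under a 'num_frames' key (the
# exact key used by the repo's envs may differ):
def num_env_steps(infos):
    """Total simulator frames advanced by one (possibly vectorized) step."""
    total_num_frames = 0
    for info in infos:
        total_num_frames += info.get('num_frames', 1)  # default 1 if env does not report frame skip
    return total_num_frames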
def test_env_performance(test, env_type):
    t = Timing()
    with t.timeit('init'):
        env = test.make_env()

    total_num_frames, frames = 4000, 0
    agent = AgentRandom(test.make_env, {})

    with t.timeit('first_reset'):
        env.reset()

    t.reset = t.step = 1e-9
    num_resets = 0
    with t.timeit('experience'):
        while frames < total_num_frames:
            done = False

            start_reset = time.time()
            env.reset()
            t.reset += time.time() - start_reset
            num_resets += 1

            while not done and frames < total_num_frames:
                start_step = time.time()
                obs, rew, done, info = env.step(agent.best_action())
                t.step += time.time() - start_step
                frames += num_env_steps([info])

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames on one CPU, %.1f FPS', t.experience, total_num_frames, fps)
    log.debug('Avg. reset time %.3f s', t.reset / num_resets)
    log.debug('Timing: %s', t)

    env.close()
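# This test (and `enjoy` below) uses AgentRandom as a trivial baseline agent.
# A minimal sketch of such an agent, assuming it only needs an action space to
# sample from; the repo's real class shares the full Agent interface (hence the
# make_env_func/params constructor and the no-op lifecycle methods).
class AgentRandom:
    def __init__(self, make_env_func, params):
        env = make_env_func()
        self.action_space = env.action_space
        env.close()

    def initialize(self):
        pass

    def best_action(self, *args, **kwargs):
        # uniformly random action, regardless of observations
        return self.action_space.sample()

    def finalize(self):
        pass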
def train_loop(agent, multi_env):
    params = agent.params

    observations = main_observation(multi_env.reset())
    infos = multi_env.info()

    trajectory_buffer = TrajectoryBuffer(multi_env.num_envs)

    step, env_steps = agent.session.run([agent.curiosity.distance.step, agent.total_env_steps])

    loop_time = deque([], maxlen=2500)
    advanced_steps = deque([], maxlen=2500)

    t = Timing()

    complete_trajectories = []
    num_to_process = 20

    test_buffer = Buffer()
    num_test_data = 5000

    while True:
        with t.timeit('loop'):
            with t.timeit('step'):
                actions = np.random.randint(0, agent.actor_critic.num_actions, params.num_envs)
                new_obs, rewards, dones, new_infos = multi_env.step(actions)

            with t.timeit('misc'):
                trajectory_buffer.add(observations, actions, infos, dones)

                observations = main_observation(new_obs)
                infos = new_infos

                num_steps_delta = num_env_steps(infos)
                env_steps += num_steps_delta

                complete_trajectories.extend(trajectory_buffer.complete_trajectories)
                trajectory_buffer.reset_trajectories()

            with t.timeit('train'):
                while len(complete_trajectories) > num_to_process:
                    buffer = generate_training_data(complete_trajectories[:num_to_process], params)
                    complete_trajectories = complete_trajectories[num_to_process:]

                    if len(test_buffer) <= 0:
                        # hold out the first batch as test data that is never trained on
                        buffer.shuffle_data()
                        test_buffer = Buffer()
                        test_buffer.add_buff(buffer, max_to_add=num_test_data)
                    else:
                        step = agent.curiosity.distance.train(buffer, env_steps, agent)

                    agent.curiosity.distance.calc_test_error(test_buffer, env_steps, agent)

        if t.train > 1.0:
            log.debug('Training time: %s', t)

        loop_time.append(t.loop)
        advanced_steps.append(num_steps_delta)

        if env_steps % 100 == 0:
            avg_fps = sum(advanced_steps) / sum(loop_time)
            log.info('Step %d, avg. fps %.1f, training steps %d, timing: %s', env_steps, avg_fps, step, t)
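# Both random-exploration loops in this file rely on a trajectory buffer that
# slices per-env experience streams into whole episodes. A minimal sketch,
# assuming a trajectory is just a per-episode list of (obs, action, info)
# tuples; the repo's actual TrajectoryBuffer / TmaxTrajectoryBuffer store more
# fields and accept extra arguments.
class TrajectoryBuffer:
    def __init__(self, num_envs):
        self.current = [[] for _ in range(num_envs)]  # in-progress episode per env
        self.complete_trajectories = []

    def add(self, observations, actions, infos, dones):
        for i, done in enumerate(dones):
            self.current[i].append((observations[i], actions[i], infos[i]))
            if done:
                # episode finished in env i: move it to the completed list
                self.complete_trajectories.append(self.current[i])
                self.current[i] = []

    def reset_trajectories(self):
        # called once the consumer has drained complete_trajectories
        self.complete_trajectories = []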
def train_loop(agent, multi_env):
    params = agent.params

    observations = main_observation(multi_env.reset())
    infos = multi_env.info()

    trajectory_buffer = TmaxTrajectoryBuffer(multi_env.num_envs)
    locomotion_buffer = LocomotionBuffer(params)

    num_test_data = 5000
    locomotion_buffer_test = LocomotionBuffer(params)

    step, env_steps = agent.session.run([agent.locomotion.step, agent.total_env_steps])

    loop_time = deque([], maxlen=2500)
    advanced_steps = deque([], maxlen=2500)

    t = Timing()

    while True:
        with t.timeit('loop'):
            with t.timeit('step'):
                actions = np.random.randint(0, agent.actor_critic.num_actions, params.num_envs)
                new_obs, rewards, dones, new_infos = multi_env.step(actions)

            with t.timeit('misc'):
                trajectory_buffer.add(
                    observations, actions, infos, dones,
                    tmax_mgr=agent.tmax_mgr, is_random=[True] * params.num_envs,
                )
                observations = main_observation(new_obs)
                infos = new_infos

                num_steps_delta = num_env_steps(infos)
                env_steps += num_steps_delta

            with t.timeit('train'):
                locomotion_buffer.extract_data(trajectory_buffer.complete_trajectories)
                trajectory_buffer.reset_trajectories()

                if len(locomotion_buffer.buffer) >= params.locomotion_experience_replay_buffer:
                    if len(locomotion_buffer_test.buffer) <= 0:
                        log.info('Prepare test data that we will never see during training...')
                        locomotion_buffer.shuffle_data()
                        locomotion_buffer_test.buffer.add_buff(locomotion_buffer.buffer, max_to_add=num_test_data)

                        # noinspection PyProtectedMember
                        log.info(
                            'Test buffer size %d, capacity %d',
                            locomotion_buffer_test.buffer._size, locomotion_buffer_test.buffer._capacity,
                        )
                    else:
                        step = train_locomotion_net(agent, locomotion_buffer, params, env_steps)

                    locomotion_buffer.reset()
                    calc_test_error(agent, locomotion_buffer_test, params, env_steps)
                    # also measure test error in batchnorm training mode, for comparison
                    calc_test_error(agent, locomotion_buffer_test, params, env_steps, bn_training=True)

        if t.train > 1.0:
            log.debug('Train time: %s', t)

        loop_time.append(t.loop)
        advanced_steps.append(num_steps_delta)

        if env_steps % 100 == 0:
            avg_fps = sum(advanced_steps) / sum(loop_time)
            log.info('Step %d, avg. fps %.1f, training steps %d, timing: %s', env_steps, avg_fps, step, t)
def enjoy(params, env_id, max_num_episodes=1, max_num_frames=1e10, render=False):
    def make_env_func():
        e = create_env(env_id, mode='train', skip_frames=True)
        e.seed(0)
        return e

    agent = AgentRandom(make_env_func, params.load())
    env = make_env_func()

    # this helps with screen recording
    pause_at_the_beginning = False
    if pause_at_the_beginning:
        env.render()
        log.info('Press any key to start...')
        cv2.waitKey()

    agent.initialize()

    episode_rewards = []
    num_frames = 0

    histogram = setup_histogram(agent)

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False
        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)

        episode_reward = []

        while not done and not max_frames_reached(num_frames):
            start = time.time()
            if render:
                env.render()

            action = agent.best_action([obs], goals=[goal_obs], deterministic=False)
            env_obs, rew, done, info = env.step(action)
            if done:
                log.warning('Done flag is true %d, rew: %.3f, num_frames %d', done, rew, num_frames)

            update_coverage(agent, [info], histogram)

            episode_reward.append(rew)

            if num_frames % 100 == 0:
                log.info('fps: %.1f, rew: %d, done: %s, frames %d', 1.0 / (time.time() - start), rew, done, num_frames)
                write_summaries(agent, histogram, num_frames)

            num_frames += num_env_steps([info])

        if render:
            env.render()
            time.sleep(0.2)

        episode_rewards.append(sum(episode_reward))
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f',
            sum(episode_reward), len(last_episodes), avg_reward,
        )

        if max_frames_reached(num_frames):
            break

    write_summaries(agent, histogram, num_frames, force=True)

    agent.finalize()
    env.close()
    cv2.destroyAllWindows()
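# `reset_with_info` above is a small helper: classic gym reset() returns only
# the observation, while this code also wants an initial info dict. A minimal
# sketch under that assumption; the real helper may pull actual initial info
# out of the env instead of returning an empty dict.
def reset_with_info(env):
    """Reset the env and also return an initial info dict (empty if unavailable)."""
    obs = env.reset()
    info = {}  # hypothetical: a real implementation might query the env for initial info
    return obs, info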
def _learn_loop(self, multi_env): """Main training loop.""" # env_steps used in tensorboard (and thus, our results) # actor_step used as global step for training step, env_steps = self.session.run( [self.actor_step, self.total_env_steps]) env_obs = multi_env.reset() obs, goals = main_observation(env_obs), goal_observation(env_obs) buffer = CuriousPPOBuffer() trajectory_buffer = TrajectoryBuffer(self.params.num_envs) self.curiosity.set_trajectory_buffer(trajectory_buffer) def end_of_training(s, es): return s >= self.params.train_for_steps or es > self.params.train_for_env_steps while not end_of_training(step, env_steps): timing = Timing() num_steps = 0 batch_start = time.time() buffer.reset() with timing.timeit('experience'): # collecting experience for rollout_step in range(self.params.rollout): actions, action_probs, values = self._policy_step( obs, goals) # wait for all the workers to complete an environment step env_obs, rewards, dones, infos = multi_env.step(actions) if self.params.graceful_episode_termination: rewards = list(rewards) for i in range(self.params.num_envs): if dones[i] and infos[i].get('prev') is not None: if infos[i]['prev'].get( 'terminated_by_timer', False): log.info('Env %d terminated by timer', i) rewards[i] += values[i] if not self.params.random_exploration: trajectory_buffer.add(obs, actions, infos, dones) next_obs, new_goals = main_observation( env_obs), goal_observation(env_obs) # calculate curiosity bonus with timing.add_time('curiosity'): if not self.params.random_exploration: bonuses = self.curiosity.generate_bonus_rewards( self.session, obs, next_obs, actions, dones, infos, ) rewards = self.params.extrinsic_reward_coeff * np.array( rewards) + bonuses # add experience from environment to the current buffer buffer.add(obs, next_obs, actions, action_probs, rewards, dones, values, goals) obs, goals = next_obs, new_goals self.process_infos(infos) num_steps += num_env_steps(infos) # last step values are required for TD-return calculation _, _, values = self._policy_step(obs, goals) buffer.values.append(values) env_steps += num_steps # calculate discounted returns and GAE buffer.finalize_batch(self.params.gamma, self.params.gae_lambda) # update actor and critic and CM with timing.timeit('train'): step = self._train_with_curiosity(step, buffer, env_steps, timing) avg_reward = multi_env.calc_avg_rewards( n=self.params.stats_episodes) avg_length = multi_env.calc_avg_episode_lengths( n=self.params.stats_episodes) self._maybe_update_avg_reward(avg_reward, multi_env.stats_num_episodes()) self._maybe_trajectory_summaries(trajectory_buffer, env_steps) self._maybe_coverage_summaries(env_steps) self.curiosity.additional_summaries( env_steps, self.summary_writer, self.params.stats_episodes, map_img=self.map_img, coord_limits=self.coord_limits, ) trajectory_buffer.reset_trajectories() fps = num_steps / (time.time() - batch_start) self._maybe_print(step, env_steps, avg_reward, avg_length, fps, timing) self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps)