def record_trajectory(params, env_id):
    def make_env_func():
        e = create_env(env_id, skip_frames=True)
        e.seed(0)
        return e

    env = make_env_func()
    map_img, coord_limits = generate_env_map(make_env_func)

    env_obs, info = reset_with_info(env)
    obs = main_observation(env_obs)
    done = False

    m = TopologicalMap(obs, directed_graph=False, initial_info=info, verbose=True)
    trajectory = Trajectory(env_idx=-1)

    frame = 0
    t = Timing()

    while not done and not terminate:
        with t.timeit('one_frame'):
            env.render()

            # last pressed key wins; 0 is the no-op action
            if len(current_actions) > 0:
                action = current_actions[-1]
            else:
                action = 0

            trajectory.add(obs, action, info)
            m.add_landmark(obs, info, update_curr_landmark=True)

            env_obs, rew, done, info = env.step(action)
            obs = main_observation(env_obs)

        # throttle the loop to a fixed framerate for human control
        took_seconds = t.one_frame
        desired_fps = 15
        wait_seconds = (1.0 / desired_fps) - took_seconds
        wait_seconds = max(0.0, wait_seconds)
        time.sleep(wait_seconds)

        frame += 1

    env.render()
    time.sleep(0.2)

    trajectory_dir = trajectory.save(params.experiment_dir())
    m.save_checkpoint(trajectory_dir, map_img=map_img, coord_limits=coord_limits, verbose=True)

    env.close()
    return 0
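# --- Hedged sketch (not part of the original sources) -----------------------
# The interactive functions in this file (record_trajectory, build_graph,
# enjoy, test_locomotion) read module-level globals such as current_actions,
# terminate, pause and add_landmark, which are expected to be populated by a
# keyboard listener running on a background thread. The key bindings and the
# key-to-action mapping below are illustrative assumptions only.
from pynput import keyboard

current_actions = []      # stack of currently pressed action keys
terminate = False
pause = False
add_landmark = False

KEY_TO_ACTION = {'w': 1, 's': 2, 'a': 3, 'd': 4}  # hypothetical mapping


def on_press(key):
    global terminate, pause, add_landmark
    try:
        char = key.char
    except AttributeError:
        return  # special (non-character) key
    if char in KEY_TO_ACTION:
        current_actions.append(KEY_TO_ACTION[char])
    elif char == 'q':
        terminate = True
    elif char == 'p':
        pause = not pause
    elif char == 'l':
        add_landmark = True


def on_release(key):
    try:
        char = key.char
    except AttributeError:
        return
    if char in KEY_TO_ACTION and KEY_TO_ACTION[char] in current_actions:
        current_actions.remove(KEY_TO_ACTION[char])


keyboard_listener = keyboard.Listener(on_press=on_press, on_release=on_release)
keyboard_listener.start()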
def build_graph(params, env_id, max_num_episodes=1000):
    def make_env_func():
        e = create_env(env_id, mode='test', skip_frames=False)
        e.seed(0)
        return e

    checkpoint_dir = model_dir(params.experiment_dir())
    map_img, coord_limits = generate_env_map(make_env_func)

    env = make_env_func()
    m = None

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        obs = main_observation(env_obs)
        done = False

        if m is None:
            m = TopologicalMap(obs, directed_graph=False, initial_info=info, verbose=True)
            m.maybe_load_checkpoint(checkpoint_dir)

        while not done and not terminate:
            env.render()

            if len(current_actions) > 0:
                action = current_actions[-1]
            else:
                action = 0

            env_obs, rew, done, info = env.step(action)
            obs = main_observation(env_obs)

            global add_landmark
            if add_landmark:
                # noinspection PyProtectedMember
                new_idx = m._add_new_node(obs=obs, pos=get_position(info), angle=get_angle(info))
                log.info('Added landmark idx %d', new_idx)
                add_landmark = False

                res = m.save_checkpoint(checkpoint_dir, map_img=map_img, coord_limits=coord_limits, verbose=True)
                cv2.imshow('map', cv2.imread(res.graph_filename))
                cv2.waitKey(50)

        if terminate:
            break
        else:
            env.render()
            time.sleep(0.2)

    m.save_checkpoint(checkpoint_dir, map_img=map_img, coord_limits=coord_limits, verbose=True)
    log.debug('Set breakpoint here to edit graph edges before saving...')

    log.info('Saving to %s...', checkpoint_dir)
    m.save_checkpoint(checkpoint_dir, map_img=map_img, coord_limits=coord_limits, verbose=True)

    env.close()
    return 0
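# --- Hedged sketch (not part of the original sources) -----------------------
# At the "edit graph edges before saving" breakpoint in build_graph() one can
# connect or disconnect hand-picked landmarks. This assumes TopologicalMap
# exposes a networkx graph via `m.graph`, consistent with `m.graph.nodes`
# used in evaluate_locomotion_agent() below; edge attributes, if any, are
# not reproduced here.
def add_manual_edges(m, edges):
    """Connect hand-picked landmark index pairs, e.g. edges=[(14, 27)]."""
    for i, j in edges:
        if i in m.graph.nodes and j in m.graph.nodes:
            m.graph.add_edge(i, j)
        else:
            log.warning('Cannot add edge (%d, %d): missing landmark', i, j)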
def _learn_loop(self, multi_env):
    """Main training loop."""
    step, env_steps = self.session.run([self.actor_step, self.total_env_steps])

    env_obs = multi_env.reset()
    observations, goals = main_observation(env_obs), goal_observation(env_obs)

    buffer = PPOBuffer()

    def end_of_training(s, es):
        return s >= self.params.train_for_steps or es > self.params.train_for_env_steps

    while not end_of_training(step, env_steps):
        timing = Timing()
        num_steps = 0
        batch_start = time.time()

        buffer.reset()

        with timing.timeit('experience'):
            # collecting experience
            for rollout_step in range(self.params.rollout):
                actions, action_probs, values = self.actor_critic.invoke(self.session, observations, goals=goals)

                # wait for all the workers to complete an environment step
                env_obs, rewards, dones, infos = multi_env.step(actions)
                self.process_infos(infos)
                new_observations, new_goals = main_observation(env_obs), goal_observation(env_obs)

                # add experience from all environments to the current buffer
                buffer.add(observations, actions, action_probs, rewards, dones, values, goals)
                observations = new_observations
                goals = new_goals

                num_steps += num_env_steps(infos)

            # last step values are required for TD-return calculation
            _, _, values = self.actor_critic.invoke(self.session, observations, goals=goals)
            buffer.values.append(values)

        env_steps += num_steps

        # calculate discounted returns and GAE
        buffer.finalize_batch(self.params.gamma, self.params.gae_lambda)

        # update actor and critic
        with timing.timeit('train'):
            step = self._train(buffer, env_steps)

        avg_reward = multi_env.calc_avg_rewards(n=self.params.stats_episodes)
        avg_length = multi_env.calc_avg_episode_lengths(n=self.params.stats_episodes)
        fps = num_steps / (time.time() - batch_start)

        self._maybe_print(step, env_steps, avg_reward, avg_length, fps, timing)
        self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps)
        self._maybe_update_avg_reward(avg_reward, multi_env.stats_num_episodes())
        self._maybe_coverage_summaries(env_steps)
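# --- Hedged sketch (not the actual PPOBuffer implementation) ----------------
# finalize_batch(gamma, gae_lambda) above is assumed to compute GAE advantages
# and returns from the stored rewards, values and dones, using the standard
# recurrence:
#     delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
#     A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
#     R_t     = A_t + V(s_t)
import numpy as np


def gae_advantages_and_returns(rewards, values, dones, gamma, gae_lambda):
    """GAE for a single environment.

    rewards, dones: length T; values: length T + 1 (bootstrap value appended,
    mirroring `buffer.values.append(values)` in the loop above).
    """
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    last_adv = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        last_adv = delta + gamma * gae_lambda * nonterminal * last_adv
        advantages[t] = last_adv
    returns = advantages + np.asarray(values[:T], dtype=np.float32)
    return advantages, returns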
def run_policy_loop(agent, env, max_num_episodes, fps=7, max_num_frames=None, deterministic=False):
    """Execute the policy and render onto the screen, using the standard agent interface."""
    agent.initialize()

    episode_rewards = []
    num_frames = 0

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False
        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)

        if goal_obs is not None:
            goal_obs_rgb = cv2.cvtColor(goal_obs, cv2.COLOR_BGR2RGB)
            cv2.imshow('goal', cv2.resize(goal_obs_rgb, (500, 500)))
            cv2.waitKey(500)

        episode_reward = 0

        while not done:
            start = time.time()
            env.render()
            if fps < 1000:
                time.sleep(1.0 / fps)

            action = agent.best_action([obs], goals=[goal_obs], deterministic=deterministic)
            env_obs, rew, done, _ = env.step(action)
            obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)
            episode_reward += rew

            log.info('Actual fps: %.1f', 1.0 / (time.time() - start))
            num_frames += 1

            if max_frames_reached(num_frames):
                break

        env.render()
        time.sleep(0.2)

        episode_rewards.append(episode_reward)
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f',
            episode_reward, len(last_episodes), avg_reward,
        )

        if max_frames_reached(num_frames):
            break

    agent.finalize()
    env.close()
    cv2.destroyAllWindows()
    return 0
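# --- Hedged usage sketch (not from the original sources) --------------------
# run_policy_loop() only relies on the standard agent interface
# (initialize / best_action / finalize), so any agent in this codebase can be
# visualized with it. The AgentRandom construction below mirrors how it is
# instantiated in enjoy() further down; the env id is whatever you pass in.
def visualize_random_agent(params, env_id):
    def make_env_func():
        e = create_env(env_id, mode='test')
        e.seed(0)
        return e

    agent = AgentRandom(make_env_func, params.load())
    env = make_env_func()
    return run_policy_loop(agent, env, max_num_episodes=5, fps=30)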
def evaluate_locomotion_agent(agent, multi_env):
    num_envs = multi_env.num_envs

    observations = main_observation(multi_env.reset())
    obs_prev = observations
    infos = multi_env.info()

    agent.tmax_mgr.initialize(observations, infos, 1)
    m = agent.tmax_mgr.dense_persistent_maps[-1]

    navigator = Navigator(agent)
    for env_i in range(num_envs):
        navigator.reset(env_i, m)

    # sample final goals
    all_targets = list(m.graph.nodes)
    if len(all_targets) > 0:
        all_targets.remove(0)

    final_goal_idx = random.sample(all_targets, num_envs)
    log.info('Goals: %r', final_goal_idx)

    # noinspection PyProtectedMember
    navigator._ensure_paths_to_goal_calculated([m] * num_envs, final_goal_idx)

    path_lengths = [0] * num_envs
    for env_i in range(num_envs):
        location, path_length = 0, 0
        while location != final_goal_idx[env_i]:
            location = navigator.paths[env_i][location]
            path_length += 1
        path_lengths[env_i] = path_length

    frames = 0
    next_target, next_target_d = navigator.get_next_target(
        [m] * num_envs, observations, final_goal_idx, [frames] * num_envs,
    )
    next_target_obs = [m.get_observation(t) for t in next_target]

    avg_speed = [-1] * num_envs
    success = [False] * num_envs

    t = Timing()
    while True:
        with t.timeit('frame'):
            with t.timeit('policy'):
                actions = policy_step(agent, obs_prev, observations, next_target_obs, final_goal_idx)

            with t.timeit('step'):
                env_obs, rew, done, info = multi_env.step(actions)

            obs_prev = observations
            observations = main_observation(env_obs)

            with t.timeit('navigator'):
                next_target, next_target_d = navigator.get_next_target(
                    [m] * num_envs, observations, final_goal_idx, [frames] * num_envs,
                )

            for env_i in range(num_envs):
                if final_goal_idx[env_i] is None:
                    continue

                if next_target[env_i] is None:
                    log.warning(
                        'Agent %d got lost in %d steps trying to reach %d',
                        env_i, frames, final_goal_idx[env_i],
                    )
                    final_goal_idx[env_i] = None
                else:
                    if next_target[env_i] == final_goal_idx[env_i] and next_target_d[env_i] < 0.1:
                        success[env_i] = True
                        avg_speed[env_i] = path_lengths[env_i] / (frames + 1)
                        log.debug(
                            'Agent %d reached goal %d in %d steps, avg. speed %.3f',
                            env_i, final_goal_idx[env_i], frames, avg_speed[env_i],
                        )
                        final_goal_idx[env_i] = None

                    next_target_obs[env_i] = m.get_observation(next_target[env_i])

            frames += 1

        if frames > 5000:
            log.error('Timeout! 5000 frames was not enough to finish locomotion!')
            break

        finished = [g is None for g in final_goal_idx]
        if all(finished):
            log.info('Done!')
            break
        else:
            if frames % 10 == 0:
                frame_repeat = 4
                fps = (1.0 / t.frame) * frame_repeat * num_envs
                log.info('%d agents remaining, fps %.3f, time %s', num_envs - sum(finished), fps, t)

    return success, avg_speed
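# --- Hedged usage sketch (not from the original sources) --------------------
# evaluate_locomotion_agent() returns a per-env success flag and an average
# "speed" (precomputed shortest-path length divided by frames spent); a
# simple aggregation of those results could look like this.
def summarize_locomotion_eval(agent, multi_env):
    success, avg_speed = evaluate_locomotion_agent(agent, multi_env)
    success_rate = sum(success) / len(success)
    speeds = [s for s in avg_speed if s > 0]  # -1 means the goal was never reached
    mean_speed = sum(speeds) / len(speeds) if speeds else 0.0
    log.info('Locomotion success rate %.3f, mean speed %.3f', success_rate, mean_speed)
    return success_rate, mean_speed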
def train_loop(agent, multi_env):
    params = agent.params

    observations = main_observation(multi_env.reset())
    infos = multi_env.info()

    trajectory_buffer = TrajectoryBuffer(multi_env.num_envs)

    step, env_steps = agent.session.run([agent.curiosity.distance.step, agent.total_env_steps])

    loop_time = deque([], maxlen=2500)
    advanced_steps = deque([], maxlen=2500)

    t = Timing()

    complete_trajectories = []
    num_to_process = 20

    test_buffer = Buffer()
    num_test_data = 5000

    while True:
        with t.timeit('loop'):
            with t.timeit('step'):
                # collect experience with a random policy
                actions = np.random.randint(0, agent.actor_critic.num_actions, params.num_envs)
                new_obs, rewards, dones, new_infos = multi_env.step(actions)

            with t.timeit('misc'):
                trajectory_buffer.add(observations, actions, infos, dones)

                observations = main_observation(new_obs)
                infos = new_infos

                num_steps_delta = num_env_steps(infos)
                env_steps += num_steps_delta

                complete_trajectories.extend(trajectory_buffer.complete_trajectories)
                trajectory_buffer.reset_trajectories()

            with t.timeit('train'):
                while len(complete_trajectories) > num_to_process:
                    buffer = generate_training_data(complete_trajectories[:num_to_process], params)
                    complete_trajectories = complete_trajectories[num_to_process:]

                    if len(test_buffer) <= 0:
                        # first full batch is held out as the test set
                        buffer.shuffle_data()
                        test_buffer = Buffer()
                        test_buffer.add_buff(buffer, max_to_add=num_test_data)
                    else:
                        step = agent.curiosity.distance.train(buffer, env_steps, agent)
                        agent.curiosity.distance.calc_test_error(test_buffer, env_steps, agent)

        if t.train > 1.0:
            log.debug('Training time: %s', t)

        loop_time.append(t.loop)
        advanced_steps.append(num_steps_delta)

        if env_steps % 100 == 0:
            avg_fps = sum(advanced_steps) / sum(loop_time)
            log.info('Step %d, avg. fps %.1f, training steps %d, timing: %s', env_steps, avg_fps, step, t)
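# --- Hedged sketch (not the actual generate_training_data) ------------------
# generate_training_data() above is assumed to turn complete trajectories into
# labeled observation pairs for the distance (reachability) network: pairs
# close in time along a trajectory get label 1, pairs far apart get label 0.
# The thresholds and the `trajectory.obs` attribute are assumptions.
import random


def distance_pairs_from_trajectory(trajectory, close_threshold=5, far_threshold=25, num_pairs=100):
    obs = trajectory.obs  # assumed attribute: per-step observations
    pairs = []
    if len(obs) < 2:
        return pairs
    while len(pairs) < num_pairs:
        i = random.randrange(len(obs))
        if random.random() < 0.5:
            j = min(len(obs) - 1, i + random.randint(1, close_threshold))
            label = 1  # temporally close -> "reachable"
        else:
            if i + far_threshold >= len(obs):
                continue
            j = random.randint(i + far_threshold, len(obs) - 1)
            label = 0  # temporally far -> "not easily reachable"
        pairs.append((obs[i], obs[j], label))
    return pairs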
def train_loop(agent, multi_env):
    params = agent.params

    observations = main_observation(multi_env.reset())
    infos = multi_env.info()

    trajectory_buffer = TmaxTrajectoryBuffer(multi_env.num_envs)
    locomotion_buffer = LocomotionBuffer(params)

    num_test_data = 5000
    locomotion_buffer_test = LocomotionBuffer(params)

    step, env_steps = agent.session.run([agent.locomotion.step, agent.total_env_steps])

    loop_time = deque([], maxlen=2500)
    advanced_steps = deque([], maxlen=2500)

    t = Timing()

    while True:
        with t.timeit('loop'):
            with t.timeit('step'):
                # collect experience with a random policy
                actions = np.random.randint(0, agent.actor_critic.num_actions, params.num_envs)
                new_obs, rewards, dones, new_infos = multi_env.step(actions)

            with t.timeit('misc'):
                trajectory_buffer.add(
                    observations, actions, infos, dones,
                    tmax_mgr=agent.tmax_mgr, is_random=[True] * params.num_envs,
                )

                observations = main_observation(new_obs)
                infos = new_infos

                num_steps_delta = num_env_steps(infos)
                env_steps += num_steps_delta

            with t.timeit('train'):
                locomotion_buffer.extract_data(trajectory_buffer.complete_trajectories)
                trajectory_buffer.reset_trajectories()

                if len(locomotion_buffer.buffer) >= params.locomotion_experience_replay_buffer:
                    if len(locomotion_buffer_test.buffer) <= 0:
                        log.info('Prepare test data that we will never see during training...')
                        locomotion_buffer.shuffle_data()
                        locomotion_buffer_test.buffer.add_buff(locomotion_buffer.buffer, max_to_add=num_test_data)
                        # noinspection PyProtectedMember
                        log.info(
                            'Test buffer size %d, capacity %d',
                            locomotion_buffer_test.buffer._size, locomotion_buffer_test.buffer._capacity,
                        )
                    else:
                        step = train_locomotion_net(agent, locomotion_buffer, params, env_steps)

                    locomotion_buffer.reset()

                    # test error in both batch-norm modes to spot a train/inference mismatch
                    calc_test_error(agent, locomotion_buffer_test, params, env_steps)
                    calc_test_error(agent, locomotion_buffer_test, params, env_steps, bn_training=True)

        if t.train > 1.0:
            log.debug('Train time: %s', t)

        loop_time.append(t.loop)
        advanced_steps.append(num_steps_delta)

        if env_steps % 100 == 0:
            avg_fps = sum(advanced_steps) / sum(loop_time)
            log.info('Step %d, avg. fps %.1f, training steps %d, timing: %s', env_steps, avg_fps, step, t)
def enjoy(params, env_id, max_num_episodes=1, max_num_frames=1e10, render=False):
    def make_env_func():
        e = create_env(env_id, mode='train', skip_frames=True)
        e.seed(0)
        return e

    agent = AgentRandom(make_env_func, params.load())
    env = make_env_func()

    # this helps with screen recording
    pause_at_the_beginning = False
    if pause_at_the_beginning:
        env.render()
        log.info('Press any key to start...')
        cv2.waitKey()

    agent.initialize()

    episode_rewards = []
    num_frames = 0

    histogram = setup_histogram(agent)

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False
        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)

        episode_reward = []

        while not done and not max_frames_reached(num_frames):
            start = time.time()
            if render:
                env.render()

            action = agent.best_action([obs], goals=[goal_obs], deterministic=False)
            env_obs, rew, done, info = env.step(action)
            if done:
                log.warning('Done flag is true %d, rew: %.3f, num_frames %d', done, rew, num_frames)

            update_coverage(agent, [info], histogram)

            episode_reward.append(rew)

            if num_frames % 100 == 0:
                log.info('fps: %.1f, rew: %d, done: %s, frames %d', 1.0 / (time.time() - start), rew, done, num_frames)

            write_summaries(agent, histogram, num_frames)

            num_frames += num_env_steps([info])

        if render:
            env.render()
        time.sleep(0.2)

        episode_rewards.append(sum(episode_reward))
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f',
            sum(episode_reward), len(last_episodes), avg_reward,
        )

        if max_frames_reached(num_frames):
            break

    write_summaries(agent, histogram, num_frames, force=True)

    agent.finalize()
    env.close()
    cv2.destroyAllWindows()
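# --- Hedged sketch (not the actual setup_histogram / update_coverage) -------
# The coverage summaries in enjoy() are assumed to accumulate a 2D visitation
# histogram over agent coordinates. A minimal numpy version is sketched here;
# get_position() is assumed to return (x, y) world coordinates, and
# coord_limits is assumed to be the (x_min, y_min, x_max, y_max) box returned
# by generate_env_map().
import numpy as np


def make_visitation_histogram(resolution=64):
    return np.zeros((resolution, resolution), dtype=np.int64)


def update_visitation_histogram(histogram, infos, coord_limits):
    x_min, y_min, x_max, y_max = coord_limits
    res = histogram.shape[0]
    for info in infos:
        x, y = get_position(info)  # assumed to return world coordinates
        col = int(np.clip((x - x_min) / (x_max - x_min) * (res - 1), 0, res - 1))
        row = int(np.clip((y - y_min) / (y_max - y_min) * (res - 1), 0, res - 1))
        histogram[row, col] += 1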
def enjoy(params, env_id, max_num_episodes=1000, max_num_frames=None, show_automap=False):
    def make_env_func():
        e = create_env(env_id, mode='test', show_automap=show_automap)
        e.seed(0)
        return e

    params = params.load()
    params.num_envs = 1  # during execution we're only using one env
    agent = AgentTMAX(make_env_func, params)
    env = make_env_func()

    agent.initialize()

    global persistent_map
    if agent.params.persistent_map_checkpoint is not None:
        persistent_map = TopologicalMap.create_empty()
        persistent_map.maybe_load_checkpoint(agent.params.persistent_map_checkpoint)

    global current_landmark

    episode_rewards = []
    num_frames = 0

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False

        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)
        prev_obs = obs
        if current_landmark is None:
            current_landmark = obs

        if goal_obs is not None:
            goal_obs_rgb = cv2.cvtColor(goal_obs, cv2.COLOR_BGR2RGB)
            cv2.imshow('goal', cv2.resize(goal_obs_rgb, (500, 500)))
            cv2.waitKey(500)

        episode_reward, episode_frames = 0, 0

        if not agent.tmax_mgr.initialized:
            agent.tmax_mgr.initialize([obs], [info], env_steps=0)
            persistent_map = agent.tmax_mgr.dense_persistent_maps[-1]
            sparse_persistent_map = agent.tmax_mgr.sparse_persistent_maps[-1]
            log.debug('Num landmarks in sparse map: %d', sparse_persistent_map.num_landmarks())

        agent.curiosity.initialized = True
        agent.tmax_mgr.mode[0] = TmaxMode.EXPLORATION
        agent.tmax_mgr.locomotion_final_targets[0] = None
        agent.tmax_mgr.locomotion_targets[0] = None

        start_episode = time.time()
        t = Timing()

        while not done and not terminate and not max_frames_reached(num_frames):
            with t.timeit('one_frame'):
                env.render()
                cv2.waitKey(1)  # to prevent window from fading

                if pause:
                    time.sleep(0.01)
                    continue

                if len(current_actions) > 0:
                    # key combinations are not handled, but this is purely for testing
                    action = current_actions[-1]
                else:
                    action = 0

                if policy_type == PolicyType.PLAYER:
                    pass
                elif policy_type == PolicyType.RANDOM:
                    action = env.action_space.sample()
                elif policy_type == PolicyType.AGENT:
                    agent.tmax_mgr.mode[0] = TmaxMode.EXPLORATION
                    action, *_ = agent.policy_step([prev_obs], [obs], [goal_obs], None, None)
                    action = action[0]
                elif policy_type == PolicyType.LOCOMOTION:
                    agent.tmax_mgr.mode[0] = TmaxMode.LOCOMOTION
                    action, _, _ = agent.loco_actor_critic.invoke(
                        agent.session, [obs], [current_landmark], None, None, [1.0],
                    )
                    action = action[0]

                env_obs, rew, done, info = env.step(action)
                next_obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)

                _, _ = agent.tmax_mgr.update(
                    [obs], [next_obs], [rew], [done], [info], num_frames, t, verbose=True,
                )

                prev_obs = obs
                obs = next_obs

                calc_distance_to_memory(agent, sparse_persistent_map, obs)
                calc_value_estimate(agent, obs)

                episode_reward += rew

                num_frames += 1
                episode_frames += 1

            took_seconds = t.one_frame
            desired_fps = 15  # (4-repeated here, which means actually 60fps)
            wait_seconds = (1.0 / desired_fps) - took_seconds
            wait_seconds = max(0.0, wait_seconds)
            if wait_seconds > EPS:
                time.sleep(wait_seconds)

        env.render()
        log.info('Actual fps: %.1f', episode_frames / (time.time() - start_episode))
        time.sleep(0.2)

        episode_rewards.append(episode_reward)
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f',
            episode_reward, len(last_episodes), avg_reward,
        )

        if max_frames_reached(num_frames) or terminate:
            break

    agent.finalize()

    env.close()
    cv2.destroyAllWindows()
    return 0
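# --- Hedged sketch (the actual PolicyType definition lives elsewhere in the
# repo; this mirrors the four modes referenced by enjoy() above) -------------
from enum import Enum


class PolicyType(Enum):
    PLAYER = 0      # actions come from the keyboard (current_actions)
    RANDOM = 1      # actions sampled from env.action_space
    AGENT = 2       # exploration policy via agent.policy_step()
    LOCOMOTION = 3  # locomotion policy navigating towards current_landmark


policy_type = PolicyType.PLAYER  # assumed default; toggled interactively
current_landmark = None
persistent_map = None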
def test_locomotion(params, env_id):
    def make_env_func():
        e = create_env(env_id, skip_frames=True)
        e.seed(0)
        return e

    # params = params.load()
    # params.ensure_serialized()
    params.num_envs = 1
    # params.naive_locomotion = True
    agent = AgentTMAX(make_env_func, params)

    agent.initialize()

    env = make_env_func()

    env_obs, info = reset_with_info(env)
    obs_prev = obs = main_observation(env_obs)
    done = False

    if params.persistent_map_checkpoint is not None:
        loaded_persistent_map = TopologicalMap.create_empty()
        loaded_persistent_map.maybe_load_checkpoint(params.persistent_map_checkpoint)
    else:
        agent.tmax_mgr.initialize([obs], [info], 1)
        loaded_persistent_map = agent.tmax_mgr.dense_persistent_maps[-1]

    m = loaded_persistent_map

    t = Timing()

    log.info('Num landmarks: %d', m.num_landmarks())

    final_goal_idx = 49
    log.info('Locomotion goal is %d', final_goal_idx)

    # localizer = Localizer(m, agent)
    final_goal_obs = m.get_observation(final_goal_idx)

    cv2.namedWindow('next_target')
    cv2.moveWindow('next_target', 800, 100)
    cv2.namedWindow('final_goal')
    cv2.moveWindow('final_goal', 1400, 100)

    display_obs('next_target', obs)
    display_obs('final_goal', final_goal_obs)
    cv2.waitKey(1)

    # localizer.current_landmark = 0
    # next_target = localizer.get_next_target(obs, final_goal_idx)
    # next_target_obs = m.get_observation(next_target)

    frame = 0

    if params.naive_locomotion:
        navigator = NavigatorNaive(agent)
    else:
        navigator = Navigator(agent)

    navigator.reset(0, m)
    next_target, next_target_d = navigator.get_next_target(
        [m], [obs], [final_goal_idx], [frame],
    )
    next_target, next_target_d = next_target[0], next_target_d[0]
    next_target_obs = m.get_observation(next_target)

    while not done and not terminate:
        with t.timeit('one_frame'):
            env.render()

            if not pause:
                # alternate randomly between sampled and argmax actions
                deterministic = random.random() >= 0.5

                if params.naive_locomotion:
                    action = navigator.replay_action([0])[0]
                else:
                    action = agent.locomotion.navigate(
                        agent.session, [obs_prev], [obs], [next_target_obs], deterministic=deterministic,
                    )[0]

                env_obs, rew, done, info = env.step(action)
                log.info('Action is %d', action)

                obs_prev = obs
                obs = main_observation(env_obs)

                next_target, next_target_d = navigator.get_next_target(
                    [m], [obs], [final_goal_idx], [frame],
                )
                next_target, next_target_d = next_target[0], next_target_d[0]

                if next_target is None:
                    log.error('We are lost!')
                else:
                    log.info('Next target is %d with distance %.3f!', next_target, next_target_d)

                display_obs('next_target', next_target_obs)
                cv2.waitKey(1)

                if next_target is not None:
                    next_target_obs = m.get_observation(next_target)

                log.info('Frame %d...', frame)

        took_seconds = t.one_frame
        desired_fps = 10
        wait_seconds = (1.0 / desired_fps) - took_seconds
        wait_seconds = max(0.0, wait_seconds)
        if wait_seconds > EPS:
            time.sleep(wait_seconds)

        if not pause:
            frame += 1

    log.info('After loop')

    env.render()
    time.sleep(0.05)
    env.close()

    agent.finalize()
    return 0
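# --- Hedged sketch of the display_obs() helper used above -------------------
# This mirrors how the goal observation is shown in run_policy_loop() and
# enjoy(): convert to RGB, upscale, and show in a named OpenCV window. The
# actual helper may differ in window size or color handling.
import cv2


def display_obs(win_name, obs):
    obs_rgb = cv2.cvtColor(obs, cv2.COLOR_BGR2RGB)
    cv2.imshow(win_name, cv2.resize(obs_rgb, (500, 500)))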
def _learn_loop(self, multi_env):
    """Main training loop."""
    # env_steps used in tensorboard (and thus, our results)
    # actor_step used as global step for training
    step, env_steps = self.session.run([self.actor_step, self.total_env_steps])

    env_obs = multi_env.reset()
    obs, goals = main_observation(env_obs), goal_observation(env_obs)

    buffer = CuriousPPOBuffer()
    trajectory_buffer = TrajectoryBuffer(self.params.num_envs)
    self.curiosity.set_trajectory_buffer(trajectory_buffer)

    def end_of_training(s, es):
        return s >= self.params.train_for_steps or es > self.params.train_for_env_steps

    while not end_of_training(step, env_steps):
        timing = Timing()
        num_steps = 0
        batch_start = time.time()

        buffer.reset()

        with timing.timeit('experience'):
            # collecting experience
            for rollout_step in range(self.params.rollout):
                actions, action_probs, values = self._policy_step(obs, goals)

                # wait for all the workers to complete an environment step
                env_obs, rewards, dones, infos = multi_env.step(actions)

                if self.params.graceful_episode_termination:
                    rewards = list(rewards)
                    for i in range(self.params.num_envs):
                        if dones[i] and infos[i].get('prev') is not None:
                            if infos[i]['prev'].get('terminated_by_timer', False):
                                log.info('Env %d terminated by timer', i)
                                rewards[i] += values[i]

                if not self.params.random_exploration:
                    trajectory_buffer.add(obs, actions, infos, dones)

                next_obs, new_goals = main_observation(env_obs), goal_observation(env_obs)

                # calculate curiosity bonus
                with timing.add_time('curiosity'):
                    if not self.params.random_exploration:
                        bonuses = self.curiosity.generate_bonus_rewards(
                            self.session, obs, next_obs, actions, dones, infos,
                        )
                        rewards = self.params.extrinsic_reward_coeff * np.array(rewards) + bonuses

                # add experience from environment to the current buffer
                buffer.add(obs, next_obs, actions, action_probs, rewards, dones, values, goals)

                obs, goals = next_obs, new_goals

                self.process_infos(infos)
                num_steps += num_env_steps(infos)

            # last step values are required for TD-return calculation
            _, _, values = self._policy_step(obs, goals)
            buffer.values.append(values)

        env_steps += num_steps

        # calculate discounted returns and GAE
        buffer.finalize_batch(self.params.gamma, self.params.gae_lambda)

        # update actor and critic and CM
        with timing.timeit('train'):
            step = self._train_with_curiosity(step, buffer, env_steps, timing)

        avg_reward = multi_env.calc_avg_rewards(n=self.params.stats_episodes)
        avg_length = multi_env.calc_avg_episode_lengths(n=self.params.stats_episodes)

        self._maybe_update_avg_reward(avg_reward, multi_env.stats_num_episodes())
        self._maybe_trajectory_summaries(trajectory_buffer, env_steps)
        self._maybe_coverage_summaries(env_steps)
        self.curiosity.additional_summaries(
            env_steps, self.summary_writer, self.params.stats_episodes,
            map_img=self.map_img, coord_limits=self.coord_limits,
        )

        trajectory_buffer.reset_trajectories()

        fps = num_steps / (time.time() - batch_start)
        self._maybe_print(step, env_steps, avg_reward, avg_length, fps, timing)
        self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps)
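# --- Hedged interface sketch (not from the original sources) ----------------
# The curiosity module plugged into _learn_loop() above is only exercised
# through the calls visible in the loop. A no-op stub satisfying that
# interface (useful for ablations with intrinsic reward disabled) could look
# like this; the class name is hypothetical.
import numpy as np


class NoopCuriosity:
    def set_trajectory_buffer(self, trajectory_buffer):
        self.trajectory_buffer = trajectory_buffer

    def generate_bonus_rewards(self, session, obs, next_obs, actions, dones, infos):
        return np.zeros(len(obs), dtype=np.float32)  # no intrinsic reward

    def additional_summaries(self, env_steps, summary_writer, stats_episodes, map_img=None, coord_limits=None):
        pass  # nothing extra to log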