def train_agent(real_env, learner, world_model_dir, hparams, epoch):
  """Train the PPO agent in the simulated environment."""
  initial_frame_chooser = rl_utils.make_initial_frame_chooser(
      real_env, hparams.frame_stack_size, hparams.simulation_random_starts,
      hparams.simulation_flip_first_random_for_beginning
  )
  env_fn = make_simulated_env_fn_from_hparams(
      real_env, hparams,
      batch_size=hparams.simulated_batch_size,
      initial_frame_chooser=initial_frame_chooser,
      model_dir=world_model_dir,
      sim_video_dir=os.path.join(
          learner.agent_model_dir, "sim_videos_{}".format(epoch)
      )
  )
  base_algo_str = hparams.base_algo
  train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  if hparams.wm_policy_param_sharing:
    train_hparams.optimizer_zero_grads = True

  rl_utils.update_hparams_from_hparams(
      train_hparams, hparams, base_algo_str + "_"
  )

  final_epoch = hparams.epochs - 1
  is_special_epoch = (epoch + 3) == final_epoch or (epoch + 7) == final_epoch
  is_final_epoch = epoch == final_epoch
  env_step_multiplier = 3 if is_final_epoch else 2 if is_special_epoch else 1
  learner.train(
      env_fn, train_hparams, simulated=True, save_continuously=True,
      epoch=epoch, env_step_multiplier=env_step_multiplier
  )
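
def _example_env_step_multiplier(epoch, total_epochs):
  # A minimal sketch (not part of the pipeline) of the step-multiplier
  # schedule used in `train_agent` above: train for extra simulated steps in
  # the final epoch and in the two "special" epochs that fall 3 and 7 epochs
  # before it. `total_epochs` is a hypothetical stand-in for `hparams.epochs`.
  final_epoch = total_epochs - 1
  is_special_epoch = (epoch + 3) == final_epoch or (epoch + 7) == final_epoch
  is_final_epoch = epoch == final_epoch
  return 3 if is_final_epoch else 2 if is_special_epoch else 1

# For example, with total_epochs=15: epochs 7 and 11 yield 2, epoch 14
# (the final epoch) yields 3, and every other epoch yields 1.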
def __init__(self, real_env, world_model_dir, hparams, random_starts,
             setable_initial_frames=False):
  """Init.

  Args:
    real_env: gym environment.
    world_model_dir: path to world model checkpoint directory.
    hparams: hparams for the rlmb pipeline.
    random_starts: whether to restart the world model from random frames,
      or only from initial ones (from the beginning of episodes). Valid
      only when `setable_initial_frames` is set to False.
    setable_initial_frames: if True, initial frames for the world model
      should be set with `add_to_initial_stack`.
  """
  self._setable_initial_frames = setable_initial_frames
  if self._setable_initial_frames:
    real_obs_shape = real_env.observation_space.shape
    shape = (1, hparams.frame_stack_size) + real_obs_shape
    self._initial_frames = np.zeros(shape=shape, dtype=np.uint8)
    def initial_frame_chooser(batch_size):
      assert batch_size == 1
      return self._initial_frames
  else:
    initial_frame_chooser = rl_utils.make_initial_frame_chooser(
        real_env, hparams.frame_stack_size,
        simulation_random_starts=random_starts,
        simulation_flip_first_random_for_beginning=False)
  env_fn = make_simulated_env_fn_from_hparams(
      real_env, hparams, batch_size=1,
      initial_frame_chooser=initial_frame_chooser,
      model_dir=world_model_dir,
  )
  env = env_fn(in_graph=False)
  self.env = FlatBatchEnv(env)
  self.observation_space = self.env.observation_space
  self.action_space = self.env.action_space
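
def _example_setable_start(real_env, world_model_dir, hparams, warmup_frames):
  # A hedged usage sketch for the `setable_initial_frames=True` branch above.
  # `SimulatedEnv` is an assumed name for the enclosing class (its declaration
  # is not shown here), `add_to_initial_stack` is the setter named in the
  # docstring, and `warmup_frames` is a hypothetical iterable of
  # `frame_stack_size` real observations.
  sim = SimulatedEnv(real_env, world_model_dir, hparams, random_starts=False,
                     setable_initial_frames=True)
  for frame in warmup_frames:
    sim.add_to_initial_stack(frame)
  # The world model now starts its rollout from the frames just set.
  return sim.env.reset()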
def eval_fn(env, loop_hparams, policy_hparams, policy_dir, sampling_temp):
  """Eval function."""
  base_env = env
  env = rl_utils.BatchStackWrapper(env, loop_hparams.frame_stack_size)
  agent = make_agent_from_hparams(
      agent_type, base_env, env, loop_hparams, policy_hparams,
      planner_hparams, model_dir, policy_dir, sampling_temp, video_writers)

  if eval_mode == "agent_simulated":
    real_env = base_env.new_like(batch_size=1)
    stacked_env = rl_utils.BatchStackWrapper(
        real_env, loop_hparams.frame_stack_size)
    collect_frames_for_random_starts(
        real_env, stacked_env, agent, loop_hparams.frame_stack_size,
        random_starts_step_limit, log_every_steps)
    initial_frame_chooser = rl_utils.make_initial_frame_chooser(
        real_env, loop_hparams.frame_stack_size,
        simulation_random_starts=True,
        simulation_flip_first_random_for_beginning=False,
        split=None,
    )
    env_fn = rl.make_simulated_env_fn_from_hparams(
        real_env, loop_hparams, batch_size=loop_hparams.eval_batch_size,
        initial_frame_chooser=initial_frame_chooser, model_dir=model_dir)
    sim_env = env_fn(in_graph=False)
    env = rl_utils.BatchStackWrapper(sim_env, loop_hparams.frame_stack_size)

  kwargs = {}
  if not agent.records_own_videos:
    kwargs["video_writers"] = video_writers
  step_limit = base_env.rl_env_max_episode_steps
  if step_limit == -1:
    step_limit = None
  rl_utils.run_rollouts(
      env, agent, env.reset(), log_every_steps=log_every_steps,
      step_limit=step_limit, **kwargs)
  if eval_mode == "agent_real":
    assert len(base_env.current_epoch_rollouts()) == env.batch_size
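
def _example_stacked_rollout(env, agent, loop_hparams, log_every_steps=20):
  # A hedged sketch of the wrap-then-rollout pattern used in `eval_fn` above:
  # stack the last `frame_stack_size` observations so the agent sees a frame
  # stack, then let `run_rollouts` drive the agent from the initial
  # observations. The default `log_every_steps` value is an assumption.
  stacked = rl_utils.BatchStackWrapper(env, loop_hparams.frame_stack_size)
  rl_utils.run_rollouts(stacked, agent, stacked.reset(),
                        log_every_steps=log_every_steps)
  return stacked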
def make_simulated_gym_env(real_env, world_model_dir, hparams, random_starts):
  """Gym environment with world model."""
  initial_frame_chooser = rl_utils.make_initial_frame_chooser(
      real_env, hparams.frame_stack_size,
      simulation_random_starts=random_starts,
      simulation_flip_first_random_for_beginning=False)
  env_fn = make_simulated_env_fn_from_hparams(
      real_env, hparams, batch_size=1,
      initial_frame_chooser=initial_frame_chooser,
      model_dir=world_model_dir)
  env = env_fn(in_graph=False)
  flat_env = FlatBatchEnv(env)
  return flat_env
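
def _example_simulated_rollout(real_env, world_model_dir, hparams,
                               num_steps=8):
  # A hedged usage sketch for `make_simulated_gym_env`: take a few random
  # actions in the flat (batch-size-1) simulated environment. The 4-tuple
  # step interface assumes `FlatBatchEnv` follows the standard gym API, which
  # is not shown in this excerpt.
  env = make_simulated_gym_env(
      real_env, world_model_dir, hparams, random_starts=True)
  obs = env.reset()
  for _ in range(num_steps):
    obs, reward, done, _ = env.step(env.action_space.sample())
    if done:
      obs = env.reset()
  return obs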
def evaluate_world_model(real_env, hparams, world_model_dir, debug_video_path):
  """Evaluate the world model (reward accuracy)."""
  frame_stack_size = hparams.frame_stack_size
  rollout_subsequences = []
  def initial_frame_chooser(batch_size):
    assert batch_size == len(rollout_subsequences)
    return np.stack([
        [frame.observation.decode()
         for frame in subsequence[:frame_stack_size]]
        for subsequence in rollout_subsequences
    ])

  env_fn = make_simulated_env_fn_from_hparams(
      real_env, hparams, batch_size=hparams.wm_eval_batch_size,
      initial_frame_chooser=initial_frame_chooser, model_dir=world_model_dir
  )
  sim_env = env_fn(in_graph=False)
  subsequence_length = int(
      max(hparams.wm_eval_rollout_ratios) * hparams.simulated_rollout_length
  )
  rollouts = real_env.current_epoch_rollouts(
      split=tf.estimator.ModeKeys.EVAL,
      minimal_rollout_frames=(subsequence_length + frame_stack_size)
  )

  # Side-by-side debug video: simulated frames, real frames and their
  # difference.
  video_writer = common_video.WholeVideoWriter(
      fps=10, output_path=debug_video_path, file_format="avi"
  )

  # Reward accuracy is measured at several rollout horizons ("lengths").
  reward_accuracies_by_length = {
      int(ratio * hparams.simulated_rollout_length): []
      for ratio in hparams.wm_eval_rollout_ratios
  }
  for _ in range(hparams.wm_eval_num_batches):
    rollout_subsequences[:] = random_rollout_subsequences(
        rollouts, hparams.wm_eval_batch_size,
        subsequence_length + frame_stack_size
    )

    eval_subsequences = [
        subsequence[(frame_stack_size - 1):]
        for subsequence in rollout_subsequences
    ]

    # Check that the initial observation is the same in the real and
    # simulated rollout.
    sim_init_obs = sim_env.reset()
    def decode_real_obs(index):
      return np.stack([
          subsequence[index].observation.decode()
          for subsequence in eval_subsequences  # pylint: disable=cell-var-from-loop
      ])
    real_init_obs = decode_real_obs(0)
    assert np.all(sim_init_obs == real_init_obs)

    debug_frame_batches = []
    def append_debug_frame_batch(sim_obs, real_obs, sim_cum_rews,
                                 real_cum_rews, sim_rews, real_rews):
      """Add a debug frame."""
      rews = [[sim_cum_rews, sim_rews], [real_cum_rews, real_rews]]
      headers = []
      for j in range(len(sim_obs)):
        local_nps = []
        for i in range(2):
          img = PIL_Image().new("RGB", (sim_obs.shape[-2], 11))
          draw = PIL_ImageDraw().Draw(img)
          draw.text(
              (0, 0), "c:{:3}, r:{:3}".format(
                  int(rews[i][0][j]), int(rews[i][1][j])),
              fill=(255, 0, 0))
          local_nps.append(np.asarray(img))
        local_nps.append(np.zeros_like(local_nps[0]))
        headers.append(np.concatenate(local_nps, axis=1))
      errs = absolute_hinge_difference(sim_obs, real_obs)
      headers = np.stack(headers)
      debug_frame_batches.append(  # pylint: disable=cell-var-from-loop
          np.concatenate([headers,
                          np.concatenate([sim_obs, real_obs, errs], axis=2)],
                         axis=1)
      )
    append_debug_frame_batch(
        sim_init_obs, real_init_obs, np.zeros(hparams.wm_eval_batch_size),
        np.zeros(hparams.wm_eval_batch_size),
        np.zeros(hparams.wm_eval_batch_size),
        np.zeros(hparams.wm_eval_batch_size))

    # Replay the recorded actions in the simulated env and compare cumulative
    # rewards against the real rollout at each evaluation horizon.
    (sim_cum_rewards, real_cum_rewards) = (
        np.zeros(hparams.wm_eval_batch_size) for _ in range(2)
    )
    for i in range(subsequence_length):
      actions = [subsequence[i].action for subsequence in eval_subsequences]
      (sim_obs, sim_rewards, _) = sim_env.step(actions)
      sim_cum_rewards += sim_rewards

      real_rewards = np.array([
          subsequence[i + 1].reward for subsequence in eval_subsequences
      ])
      real_cum_rewards += real_rewards
      for (length, reward_accuracies) in six.iteritems(
          reward_accuracies_by_length
      ):
        if i + 1 == length:
          reward_accuracies.append(
              np.sum(sim_cum_rewards == real_cum_rewards) /
              len(real_cum_rewards)
          )

      real_obs = decode_real_obs(i + 1)
      append_debug_frame_batch(sim_obs, real_obs, sim_cum_rewards,
                               real_cum_rewards, sim_rewards, real_rewards)

    for debug_frames in np.stack(debug_frame_batches, axis=1):
      for debug_frame in debug_frames:
        video_writer.write(debug_frame)

  video_writer.finish_to_disk()

  return {
      "reward_accuracy/at_{}".format(length): np.mean(reward_accuracies)
      for (length, reward_accuracies) in six.iteritems(
          reward_accuracies_by_length
      )
  }
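
def _example_log_wm_metrics(real_env, hparams, world_model_dir, epoch):
  # A hedged sketch of consuming the metrics dict returned by
  # `evaluate_world_model` above: one "reward_accuracy/at_<length>" entry per
  # evaluation horizon. The "/tmp" video path and the tf.logging calls are
  # illustrative choices, not part of the pipeline.
  metrics = evaluate_world_model(
      real_env, hparams, world_model_dir,
      debug_video_path="/tmp/wm_eval_epoch_{}.avi".format(epoch))
  for (name, value) in sorted(six.iteritems(metrics)):
    tf.logging.info("world model %s = %.3f", name, value)
  return metrics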