@classmethod
def create_for_env(
    cls,
    env: EnvWrapper,
    policy: Optional[Policy],
    *,
    device: Union[str, torch.device] = "cpu",
    obs_preprocessor=None,
    action_extractor=None,
    **kwargs,
):
    """
    Create an agent for `env`. If `policy` is not given, we fall back to a
    random policy for the environment.
    """
    if isinstance(device, str):
        device = torch.device(device)
    if obs_preprocessor is None:
        obs_preprocessor = env.get_obs_preprocessor(device=device)
    if action_extractor is None:
        action_extractor = env.get_action_extractor()
    if policy is None:
        policy = make_random_policy_for_env(env)
    return cls(
        policy,
        obs_preprocessor=obs_preprocessor,
        action_extractor=action_extractor,
        **kwargs,
    )
@classmethod
def create_for_env(
    cls,
    env: EnvWrapper,
    policy: Policy,
    *,
    device: Union[str, torch.device] = "cpu",
    obs_preprocessor=None,
    action_extractor=None,
    **kwargs,
):
    """
    Variant of `create_for_env` that requires an explicit `policy`.
    """
    if isinstance(device, str):
        device = torch.device(device)
    if obs_preprocessor is None:
        obs_preprocessor = env.get_obs_preprocessor(device=device)
    if action_extractor is None:
        action_extractor = env.get_action_extractor()
    return cls(
        policy,
        obs_preprocessor=obs_preprocessor,
        action_extractor=action_extractor,
        **kwargs,
    )
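# Usage sketch (an assumption, not from the source): build an agent for
# CartPole with the factory above. `Gym` and `Agent` refer to ReAgent's env
# wrapper and agent class; the import paths and env name are assumed.
def _example_create_agent():
    from reagent.gym.envs import Gym  # assumed import path
    from reagent.gym.agents.agent import Agent  # assumed import path

    env = Gym(env_name="CartPole-v0")
    # With policy=None, the first variant falls back to a random policy.
    agent = Agent.create_for_env(env, policy=None, device="cpu")
    return agent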
def _create_replay_buffer_and_insert(env: EnvWrapper):
    env.seed(1)
    replay_buffer = ReplayBuffer(replay_capacity=6, batch_size=1)
    replay_buffer_inserter = make_replay_buffer_inserter(env)
    obs = env.reset()
    inserted = []
    terminal = False
    i = 0
    while not terminal and i < 5:
        logger.info(f"Iteration: {i}")
        action = env.action_space.sample()
        next_obs, reward, terminal, _ = env.step(action)
        inserted.append(
            {
                "observation": obs,
                "action": action,
                "reward": reward,
                "terminal": terminal,
            }
        )
        transition = Transition(
            mdp_id=0,
            sequence_number=i,
            observation=obs,
            action=action,
            reward=reward,
            terminal=terminal,
            log_prob=0.0,
        )
        replay_buffer_inserter(replay_buffer, transition)
        obs = next_obs
        i += 1
    return replay_buffer, inserted
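# Usage sketch (an assumption, not from the source): exercise the helper above
# on a small Gym env and check that every logged step landed in the buffer.
# The `Gym` import path and the buffer's `size` property are assumptions.
def _example_replay_buffer_roundtrip():
    from reagent.gym.envs import Gym  # assumed import path

    env = Gym(env_name="CartPole-v0")
    replay_buffer, inserted = _create_replay_buffer_and_insert(env)
    # One buffer entry per logged step (at most 5 here), assuming the
    # buffer exposes a `size` property.
    assert replay_buffer.size == len(inserted)
    return replay_buffer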
def run_test_episode_buffer(
    env: EnvWrapper,
    policy: Policy,
    trainer: Trainer,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool = False,
):
    training_policy = policy
    post_episode_callback = train_post_episode(env, trainer, use_gpu)
    # pyre-fixme[16]: `EnvWrapper` has no attribute `seed`.
    env.seed(SEED)
    # pyre-fixme[16]: `EnvWrapper` has no attribute `action_space`.
    env.action_space.seed(SEED)
    train_rewards = train_policy(
        env,
        training_policy,
        num_train_episodes,
        post_step=None,
        post_episode=post_episode_callback,
        use_gpu=use_gpu,
    )
    # Check whether the max score passed the score bar; since we explore during
    # training, individual returns can be poor (a source of flakiness in C51 and
    # QRDQN), so we compare the best episode against the bar.
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
    )
    serving_policy = policy
    eval_rewards = eval_policy(env, serving_policy, num_eval_episodes, serving=False)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than {passing_score_bar}.\n"
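# Wiring sketch (an assumption, not from the source): how the harness above
# might be invoked from a test. `build_trainer` is a hypothetical stand-in for
# whatever constructs the Trainer and its Policy in the real tests, and the
# episode counts and score bar are illustrative.
def _example_run_cartpole_test():
    from reagent.gym.envs import Gym  # assumed import path

    env = Gym(env_name="CartPole-v0")
    trainer, policy = build_trainer(env)  # hypothetical helper
    run_test_episode_buffer(
        env,
        policy,
        trainer,
        num_train_episodes=60,
        passing_score_bar=100.0,
        num_eval_episodes=20,
    )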
def run_episode(
    env: EnvWrapper, agent: Agent, mdp_id: int = 0, max_steps: Optional[int] = None
) -> Trajectory:
    """
    Run one episode and return the resulting Trajectory. After `max_steps`
    (if specified), the environment is assumed to be terminal. The `mdp_id`
    of the episode can also be specified.
    """
    trajectory = Trajectory()
    obs = env.reset()
    possible_actions_mask = env.possible_actions_mask
    terminal = False
    num_steps = 0
    while not terminal:
        action, log_prob = agent.act(obs, possible_actions_mask)
        next_obs, reward, terminal, _ = env.step(action)
        next_possible_actions_mask = env.possible_actions_mask
        if max_steps is not None and num_steps >= max_steps:
            terminal = True
        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=float(reward),
            terminal=bool(terminal),
            log_prob=log_prob,
            possible_actions_mask=possible_actions_mask,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        possible_actions_mask = next_possible_actions_mask
        num_steps += 1
    agent.post_episode(trajectory)
    return trajectory
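# Usage sketch (an assumption, not from the source): roll out one capped
# episode and sum its rewards. Assumes `Trajectory` keeps its transitions in a
# `transitions` list, and reuses the assumed `Gym`/`Agent` imports above.
def _example_rollout_reward():
    from reagent.gym.envs import Gym  # assumed import path
    from reagent.gym.agents.agent import Agent  # assumed import path

    env = Gym(env_name="CartPole-v0")
    agent = Agent.create_for_env(env, policy=None)
    trajectory = run_episode(env, agent, mdp_id=0, max_steps=200)
    # Undiscounted episode return, assuming `trajectory.transitions` exists.
    episode_reward = sum(t.reward for t in trajectory.transitions)
    return episode_reward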
@classmethod
def create_for_env_with_serving_policy(
    cls,
    env: EnvWrapper,
    serving_policy: Policy,
    *,
    obs_preprocessor=None,
    action_extractor=None,
    **kwargs,
):
    # `device` shouldn't be provided here, as serving is CPU-only.
    if obs_preprocessor is None:
        obs_preprocessor = env.get_serving_obs_preprocessor()
    if action_extractor is None:
        action_extractor = env.get_serving_action_extractor()
    return cls(
        serving_policy,
        obs_preprocessor=obs_preprocessor,
        action_extractor=action_extractor,
        **kwargs,
    )
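# Usage sketch (an assumption, not from the source): build a CPU-only serving
# agent. A serving policy would typically wrap an exported predictor;
# `create_serving_policy` is a hypothetical stand-in for that step, and the
# `Agent` import path is assumed.
def _example_create_serving_agent(env, predictor):
    from reagent.gym.agents.agent import Agent  # assumed import path

    serving_policy = create_serving_policy(env, predictor)  # hypothetical helper
    return Agent.create_for_env_with_serving_policy(env, serving_policy)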