def offline_gym(
    env: str,
    pkl_path: str,
    num_episodes_for_data_batch: int,
    max_steps: Optional[int],
    seed: Optional[int] = None,
):
    """
    Generate samples from a DiscreteRandomPolicy on the Gym environment and
    save the results as a pickled pandas DataFrame.
    """
    initialize_seed(seed)
    env = EnvFactory.make(env)

    policy = DiscreteRandomPolicy.create_for_env(env)
    dataset = RLDataset()
    for i in range(num_episodes_for_data_batch):
        logger.info(f"Starting episode {i}")
        post_step = log_data_post_step(dataset=dataset, mdp_id=str(i), env=env)
        agent = Agent.create_for_env(env, policy, post_transition_callback=post_step)
        run_episode(env=env, agent=agent, max_steps=max_steps)

    logger.info(f"Saving dataset with {len(dataset)} samples to {pkl_path}")
    df = dataset.to_pandas_df()
    df.to_pickle(pkl_path)
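# A minimal usage sketch for offline_gym (hypothetical, not part of this
# module): the environment id, output path, and counts below are illustrative
# assumptions.
def _example_offline_gym():
    offline_gym(
        env="CartPole-v0",  # assumed Gym environment id
        pkl_path="/tmp/cartpole_random.pkl",  # assumed output location
        num_episodes_for_data_batch=150,
        max_steps=200,
        seed=42,
    )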
def fill_replay_buffer(env, replay_buffer: ReplayBuffer, desired_size: int):
    """Fill replay buffer with random transitions until size reaches desired_size."""
    assert (
        0 < desired_size and desired_size <= replay_buffer._replay_capacity
    ), f"It's not true that 0 < {desired_size} <= {replay_buffer._replay_capacity}."
    assert replay_buffer.size < desired_size, (
        f"Replay buffer already has {replay_buffer.size} elements "
        f"(more than desired_size = {desired_size})."
    )
    logger.info(
        f"Starting to fill replay buffer using random policy to size: {desired_size}."
    )
    random_policy = make_random_policy_for_env(env)
    post_step = add_replay_buffer_post_step(replay_buffer, env=env)
    agent = Agent.create_for_env(
        env, policy=random_policy, post_transition_callback=post_step
    )
    max_episode_steps = env.max_steps
    with tqdm(
        total=desired_size - replay_buffer.size,
        desc=(
            f"Filling replay buffer from {replay_buffer.size} "
            f"to size {desired_size} using random policy"
        ),
    ) as pbar:
        mdp_id = 0
        while replay_buffer.size < desired_size:
            last_size = replay_buffer.size
            max_steps = desired_size - replay_buffer.size - 1
            if max_episode_steps is not None:
                max_steps = min(max_episode_steps, max_steps)
            run_episode(env=env, agent=agent, mdp_id=mdp_id, max_steps=max_steps)
            size_delta = replay_buffer.size - last_size
            # The assertion below is commented out because it can't handle
            # input samples with seq_len > 1. This should be treated as a bug
            # and needs to be fixed in the future.
            # assert (
            #     size_delta >= 0
            # ), f"size delta is {size_delta} which should be non-negative."
            pbar.update(n=size_delta)
            mdp_id += 1
            if size_delta <= 0:
                # Replay buffer size isn't increasing, so stop early.
                break

    if replay_buffer.size >= desired_size:
        logger.info(
            f"Successfully filled replay buffer to size: {replay_buffer.size}!"
        )
    else:
        logger.info(
            f"Stopped early and filled replay buffer to size: {replay_buffer.size}."
        )
def fill_replay_buffer(env: Env, replay_buffer: ReplayBuffer, desired_size: int):
    """Fill replay buffer with random transitions until size reaches desired_size."""
    assert (
        0 < desired_size and desired_size <= replay_buffer._replay_capacity
    ), f"It's not true that 0 < {desired_size} <= {replay_buffer._replay_capacity}."
    assert replay_buffer.size < desired_size, (
        f"Replay buffer already has {replay_buffer.size} elements "
        f"(more than desired_size = {desired_size})."
    )
    logger.info(f"Starting to fill replay buffer to size: {desired_size}.")
    random_policy = make_random_policy_for_env(env)
    post_step = add_replay_buffer_post_step(replay_buffer, env=env)
    agent = Agent.create_for_env(
        env, policy=random_policy, post_transition_callback=post_step
    )
    max_episode_steps = get_max_steps(env)
    with tqdm(
        total=desired_size - replay_buffer.size,
        desc=f"Filling replay buffer from {replay_buffer.size} to size {desired_size}",
    ) as pbar:
        mdp_id = 0
        while replay_buffer.size < desired_size:
            last_size = replay_buffer.size
            max_steps = desired_size - replay_buffer.size - 1
            if max_episode_steps is not None:
                max_steps = min(max_episode_steps, max_steps)
            run_episode(env=env, agent=agent, mdp_id=mdp_id, max_steps=max_steps)
            size_delta = replay_buffer.size - last_size
            assert (
                size_delta >= 0
            ), f"size delta is {size_delta}, which should be non-negative."
            pbar.update(n=size_delta)
            mdp_id += 1
            if size_delta == 0:
                # Replay buffer size isn't increasing, so stop early.
                break

    if replay_buffer.size >= desired_size:
        logger.info(
            f"Successfully filled replay buffer to size: {replay_buffer.size}!"
        )
    else:
        logger.info(
            f"Stopped early and filled replay buffer to size: {replay_buffer.size}."
        )
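# A minimal usage sketch for fill_replay_buffer (hypothetical helper; it
# reuses the EnvFactory/ReplayBuffer APIs seen elsewhere in this code, and the
# environment id and sizes are illustrative assumptions).
def _example_fill_replay_buffer():
    env = EnvFactory.make("CartPole-v0")  # assumed environment id
    replay_buffer = ReplayBuffer.create_from_env(
        env=env, replay_memory_size=10000, batch_size=32
    )
    # Collect 1000 random-policy transitions before any training starts.
    fill_replay_buffer(env=env, replay_buffer=replay_buffer, desired_size=1000)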
def __iter__(self):
    self.env.reset()
    for i in range(self.num_episodes):
        trajectory = run_episode(
            self.env, self.agent, max_steps=self.max_steps, mdp_id=i
        )
        yield trajectory.to_dict()
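# Because __iter__ yields one trajectory dict per episode, the enclosing
# dataset class can be consumed lazily in a plain for-loop. A hypothetical
# sketch (the class name EpisodicDataset is an assumption about the enclosing
# type, not confirmed by this snippet):
#
#     for trajectory_dict in EpisodicDataset(
#         env=env, agent=agent, num_episodes=10, max_steps=200
#     ):
#         process(trajectory_dict)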
def evaluate_gym(
    env: str,
    model,
    eval_temperature: float,
    num_eval_episodes: int,
    passing_score_bar: float,
    max_steps: Optional[int] = None,
):
    predictor = DiscreteDqnTorchPredictor(model)
    predictor.softmax_temperature = eval_temperature

    env = EnvFactory.make(env)
    policy = TorchPredictorPolicy(predictor)
    agent = Agent(policy=policy, action_extractor=lambda x: x.item())

    rewards = []
    for _ in range(num_eval_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        rewards.append(ep_reward)

    avg_reward = np.mean(rewards)
    logger.info(
        f"Average reward over {num_eval_episodes} episodes is {avg_reward}.\n"
        f"List of rewards: {rewards}"
    )
    assert (
        avg_reward >= passing_score_bar
    ), f"{avg_reward} fails to pass the bar of {passing_score_bar}!"
def train_policy(
    env: EnvWrapper,
    training_policy: Policy,
    num_train_episodes: int,
    post_step: Optional[PostStep] = None,
    post_episode: Optional[PostEpisode] = None,
    use_gpu: bool = False,
) -> np.ndarray:
    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        post_episode_callback=post_episode,
        device=device,
    )
    running_reward = 0
    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        with trange(num_train_episodes, unit=" epoch") as t:
            for i in t:
                # Note: run_episode also performs a training step for the
                # agent, if specified in post_step.
                trajectory = run_episode(env=env, agent=agent, mdp_id=i, max_steps=200)
                ep_reward = trajectory.calculate_cumulative_reward()
                train_rewards.append(ep_reward)
                running_reward *= REWARD_DECAY
                running_reward += (1 - REWARD_DECAY) * ep_reward
                t.set_postfix(reward=running_reward)

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")
    return np.array(train_rewards)
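# A minimal usage sketch for train_policy (hypothetical; the environment id is
# an assumption, and a random policy stands in for a real trainable policy
# just to show the call shape):
def _example_train_policy():
    env = EnvFactory.make("CartPole-v0")  # assumed environment id
    policy = make_random_policy_for_env(env)  # stand-in for a trained policy
    train_rewards = train_policy(
        env=env,
        training_policy=policy,
        num_train_episodes=20,
        use_gpu=False,
    )
    logger.info(f"Mean training reward: {np.mean(train_rewards)}")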
def evaluate_gym(
    env_name: str,
    model: torch.nn.Module,
    eval_temperature: float,
    num_eval_episodes: int,
    passing_score_bar: float,
    max_steps: Optional[int] = None,
):
    env: gym.Env = EnvFactory.make(env_name)
    policy = create_predictor_policy_from_model(
        env, model, eval_temperature=eval_temperature
    )
    # Since we already return the softmax action, override action_extractor.
    agent = Agent.create_for_env(
        env, policy=policy, action_extractor=policy.get_action_extractor()
    )

    rewards = []
    for _ in range(num_eval_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        rewards.append(ep_reward)

    avg_reward = np.mean(rewards)
    logger.info(
        f"Average reward over {num_eval_episodes} episodes is {avg_reward}.\n"
        f"List of rewards: {rewards}"
    )
    assert (
        avg_reward >= passing_score_bar
    ), f"{avg_reward} fails to pass the bar of {passing_score_bar}!"
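# A minimal usage sketch for this evaluate_gym variant (hypothetical values;
# `trained_model` is assumed to be a torch.nn.Module produced elsewhere, and
# the environment id and score bar are illustrative):
def _example_evaluate_gym(trained_model: torch.nn.Module):
    evaluate_gym(
        env_name="CartPole-v0",  # assumed environment id
        model=trained_model,
        eval_temperature=1.0,
        num_eval_episodes=20,
        passing_score_bar=100.0,
        max_steps=200,
    )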
def run_test(
    env: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_episodes: int,
    max_steps: Optional[int],
    last_score_bar: float,
):
    env = EnvFactory.make(env)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is {normalization}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=False,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    policy = manager.create_policy()

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )
    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        replay_burnin=train_after_ts,
    )
    agent = Agent.create_for_env(
        env, policy=policy, post_transition_callback=post_step
    )

    reward_history = []
    for i in range(num_episodes):
        logger.info(f"running episode {i}")
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        reward_history.append(ep_reward)

    assert reward_history[-1] >= last_score_bar, (
        f"Reward after {len(reward_history)} episodes is {reward_history[-1]}, "
        f"which is less than {last_score_bar}.\n"
        f"Full reward history: {reward_history}"
    )
    return reward_history
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    try:
        # pyre-fixme[16]: `Env` has no attribute `state_feature_config_provider`.
        manager.state_feature_config_provider = env.state_feature_config_provider
        logger.info(
            f"Using environment's state_feature_config_provider.\n"
            f"{manager.state_feature_config_provider}"
        )
    except AttributeError:
        logger.info("state_feature_config_provider override not applicable")

    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # First fill the replay buffer to burn_in.
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} (len {len(trajectory)})"
                f" with reward {ep_reward}."
            )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")

    # Check whether the max score passed the score bar; since we explore during
    # training, the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
    )

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    logger.info(f"average: {np.mean(eval_rewards)};\tmax: {np.max(eval_rewards)}")
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"which is less than {passing_score_bar}.\n"
    )
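# Note on the burn-in in run_test above: train_after_ts is clamped to at least
# one minibatch so the first training step always has enough samples. For
# example, with trainer.minibatch_size = 512 and train_after_ts = 100, the
# buffer is prefilled to 512 transitions before any gradient update runs.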
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    # First fill the replay buffer to burn_in.
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        # pyre-fixme[6]: Expected `Union[str, torch.device]` for 4th param but got
        #  `Optional[torch.device]`.
        device=device,
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"Reward after {len(train_rewards)} episodes is {train_rewards[-1]}, "
        f"which is less than {passing_score_bar}.\n"
        f"Full reward history: {train_rewards}"
    )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"which is less than {passing_score_bar}.\n"
        f"Full eval rewards: {eval_rewards}."
    )

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    saved_mdnrnn_path: Optional[str] = None,
):
    """Train an agent on states embedded by the MDNRNN."""
    env = EnvFactory.make(env_name)
    env.seed(SEED)

    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer, device, env
    )
    if saved_mdnrnn_path is None:
        # Train from scratch.
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # Load a pretrained model and just evaluate it.
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path)
        )

    # Create the embedding dataset.
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )

    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(embed_env),
    )
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer, device, env
    )
    num_batch_per_epoch = embed_rb.size // batch_size
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch_tensor(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            agent_trainer.train(preprocessed_batch)

    # Evaluate the model.
    rewards = []
    policy = agent_manager.create_policy(serving=False)
    agent = Agent.create_for_env(embed_env, policy=policy, device=device)
    for i in range(num_agent_eval_epochs):
        ep_reward = run_episode(env=embed_env, agent=agent)
        rewards.append(ep_reward)
        logger.info(f"Finished eval episode {i} with reward {ep_reward}.")
    logger.info(f"Average eval reward is {np.mean(rewards)}.")
    assert (
        np.mean(rewards) >= passing_score_bar
    ), f"Average reward {np.mean(rewards)} doesn't pass our bar {passing_score_bar}."
    return rewards
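# A hypothetical invocation sketch for the MDNRNN pipeline above (the model
# manager unions, environment id, and counts are illustrative assumptions, not
# tested configurations):
#
#     train_mdnrnn_and_train_on_embedded_env(
#         env_name="CartPole-v0",
#         embedding_model=mdnrnn_model_union,   # assumed ModelManager__Union
#         num_embedding_train_transitions=10000,
#         seq_len=10,
#         batch_size=64,
#         num_embedding_train_epochs=5,
#         train_model=dqn_model_union,          # assumed ModelManager__Union
#         num_state_embed_transitions=5000,
#         num_agent_train_epochs=10,
#         num_agent_eval_epochs=5,
#         use_gpu=False,
#         passing_score_bar=50.0,
#     )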
def run_test(
    env: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is {normalization}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        replay_burnin=train_after_ts,
        device=device,
    )

    training_policy = manager.create_policy(serving=False)
    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    train_rewards = []
    for i in range(num_train_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        train_rewards.append(ep_reward)
        logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"Reward after {len(train_rewards)} episodes is {train_rewards[-1]}, "
        f"which is less than {passing_score_bar}.\n"
        f"Full reward history: {train_rewards}"
    )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    def gym_to_reagent_serving(obs: np.ndarray) -> Tuple[torch.Tensor, torch.Tensor]:
        # Serving-side predictors expect a (value, presence) pair of tensors.
        obs_tensor = torch.tensor(obs).float().unsqueeze(0)
        presence_tensor = torch.ones_like(obs_tensor)
        return (obs_tensor, presence_tensor)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env(
        env, policy=serving_policy, obs_preprocessor=gym_to_reagent_serving
    )

    eval_rewards = []
    for i in range(num_eval_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        eval_rewards.append(ep_reward)
        logger.info(f"Finished eval episode {i} with reward {ep_reward}.")

    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"which is less than {passing_score_bar}.\n"
        f"Full eval rewards: {eval_rewards}."
    )

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
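# A shape sketch for gym_to_reagent_serving above (illustrative; the 4-dim
# observation is an assumption matching CartPole): a raw observation of shape
# (4,) becomes a (1, 4) float tensor plus an all-ones presence mask of the
# same shape, i.e. the (value, presence) pair the serving policy consumes.
def _example_obs_preprocessing():
    obs = np.zeros(4, dtype=np.float32)  # assumed 4-dim observation
    value = torch.tensor(obs).float().unsqueeze(0)  # shape (1, 4)
    presence = torch.ones_like(value)  # shape (1, 4)
    assert value.shape == presence.shape == (1, 4)
    return value, presence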