def train_seq2reward_and_compute_reward_mse(
    env_name: str,
    model: ModelManager__Union,
    num_train_transitions: int,
    num_test_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    use_gpu: bool,
    saved_seq2reward_path: Optional[str] = None,
):
    """
    Train Seq2Reward Network and compute reward mse.

    A seeded ``Gym`` environment is created, a test replay buffer is filled
    with ``num_test_transitions`` transitions, and the network is either
    trained through ``train_seq2reward`` or restored from
    ``saved_seq2reward_path``. The loss reported by ``trainer.get_loss`` on
    the full test buffer is returned.

    Args:
        env_name: name handed to ``Gym`` to build the environment
        model: union wrapper; ``model.value`` is the manager that builds the trainer
        num_train_transitions: forwarded to ``train_seq2reward``
        num_test_transitions: capacity and fill size of the test replay buffer
        seq_len: used as the replay buffer ``stack_size``
        batch_size: batch size of the test replay buffer and of training
        num_train_epochs: forwarded to ``train_seq2reward``
        use_gpu: build the trainer for GPU and place batches on ``cuda``
        saved_seq2reward_path: when given, training is skipped and the
            network weights are loaded from this path instead

    Returns:
        The test loss as a Python scalar (result of ``.item()``).
    """
    env = Gym(env_name=env_name)
    env.seed(SEED)

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    # Hand the factory a real torch.device; it previously received a bare
    # string, which the type checker flagged as the wrong type.
    device = torch.device("cuda" if use_gpu else "cpu")
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)

    test_replay_buffer = ReplayBuffer(
        replay_capacity=num_test_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, test_replay_buffer, num_test_transitions)

    if saved_seq2reward_path is None:
        # train from scratch
        trainer = train_seq2reward(
            env=env,
            trainer=trainer,
            trainer_preprocessor=trainer_preprocessor,
            num_train_transitions=num_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            test_replay_buffer=test_replay_buffer,
        )
    else:
        # load a pretrained model, and just evaluate it
        # map_location remaps stored tensors onto this run's device, so a
        # checkpoint written on a GPU host can still be read on a CPU-only one.
        state_dict = torch.load(saved_seq2reward_path, map_location=device)
        trainer.seq2reward_network.load_state_dict(state_dict)

    state_dim = env.observation_space.shape[0]
    with torch.no_grad():
        trainer.seq2reward_network.eval()
        # Evaluate on everything currently stored in the test buffer.
        test_batch = test_replay_buffer.sample_transition_batch(
            batch_size=test_replay_buffer.size
        )
        preprocessed_test_batch = trainer_preprocessor(test_batch)
        adhoc_action_padding(preprocessed_test_batch, state_dim=state_dim)
        losses = trainer.get_loss(preprocessed_test_batch)
        detached_losses = losses.cpu().detach().item()
        # Put the network back into training mode before returning.
        trainer.seq2reward_network.train()
    return detached_losses
def create_for_trainer(
    cls,
    trainer,
    env: EnvWrapper,
    agent: Agent,
    replay_buffer: ReplayBuffer,
    batch_size: int,
    training_frequency: int = 1,
    num_episodes: Optional[int] = None,
    max_steps: Optional[int] = None,
    trainer_preprocessor=None,
    replay_buffer_inserter=None,
    device=None,
):
    """
    Build an instance of ``cls``, deriving missing helpers from the trainer/env.

    Args:
        trainer: only used to derive ``trainer_preprocessor`` when none is given
        env: environment; forwarded to ``cls`` and to the helper factories
        agent: forwarded to ``cls``
        replay_buffer: forwarded to ``cls``
        batch_size: forwarded to ``cls``
        training_frequency: forwarded to ``cls``
        num_episodes: forwarded to ``cls``
        max_steps: forwarded to ``cls``
        trainer_preprocessor: optional; built with
            ``make_replay_buffer_trainer_preprocessor`` when ``None``
        replay_buffer_inserter: optional; built with
            ``make_replay_buffer_inserter`` when ``None``
        device: device used only when the preprocessor has to be built here.
            Accepts a ``torch.device`` or a device string; ``None`` (the
            default) keeps the previous behaviour of using CPU.

    Returns:
        ``cls(...)`` constructed from the arguments above. Neither ``trainer``
        nor ``device`` is passed on to ``cls``.
    """
    if trainer_preprocessor is None:
        # Resolve the device lazily: it is irrelevant when the caller already
        # supplied a preprocessor.
        if device is None:
            device = torch.device("cpu")
        elif isinstance(device, str):
            device = torch.device(device)
        trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
            trainer, device, env
        )

    if replay_buffer_inserter is None:
        replay_buffer_inserter = make_replay_buffer_inserter(env)

    return cls(
        env=env,
        agent=agent,
        replay_buffer=replay_buffer,
        batch_size=batch_size,
        training_frequency=training_frequency,
        num_episodes=num_episodes,
        max_steps=max_steps,
        trainer_preprocessor=trainer_preprocessor,
        replay_buffer_inserter=replay_buffer_inserter,
    )
def run_test_offline(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    num_batches_per_epoch: int,
    num_train_epochs: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    minibatch_size: int,
    use_gpu: bool,
):
    """
    Train a model offline from a replay buffer filled by a random policy.

    The replay buffer is filled to capacity once, the trainer is then fed
    ``num_batches_per_epoch`` sampled batches for each of ``num_train_epochs``
    epochs, and the result of ``evaluate_cem`` over ``num_eval_episodes`` is
    compared against ``passing_score_bar``.

    Args:
        env_name: name handed to ``Gym`` to build the environment
        model: union wrapper; ``model.value`` is the manager that builds the trainer
        replay_memory_size: replay buffer capacity, also the number of
            transitions collected before training
        num_batches_per_epoch: training batches sampled per epoch
        num_train_epochs: number of epochs
        passing_score_bar: minimum mean evaluation reward required
        num_eval_episodes: passed to the final ``evaluate_cem`` call
        minibatch_size: replay buffer batch size
        use_gpu: build the trainer for GPU and place batches on ``cuda``

    Raises:
        AssertionError: if the mean evaluation reward is below ``passing_score_bar``.
    """
    env = Gym(env_name=env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    # first fill the replay buffer to burn_in
    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )
    # always fill full RB
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=env,
        replay_buffer=replay_buffer,
        desired_size=replay_memory_size,
        agent=agent,
    )

    # Always pass a concrete device. The CPU case used to pass None, which is
    # not the type the preprocessor factory is declared to accept.
    device = torch.device("cuda" if use_gpu else "cpu")
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)

    writer = SummaryWriter()
    try:
        with summary_writer_context(writer):
            for epoch in range(num_train_epochs):
                logger.info(f"Evaluating before epoch {epoch}: ")
                # The result is not used here; the call is kept so every epoch
                # is still preceded by one evaluation episode as before.
                evaluate_cem(env, manager, 1)
                for _ in tqdm(range(num_batches_per_epoch)):
                    train_batch = replay_buffer.sample_transition_batch()
                    preprocessed_batch = trainer_preprocessor(train_batch)
                    trainer.train(preprocessed_batch)
    finally:
        # Release the writer even if training raises.
        writer.close()

    logger.info(f"Evaluating after training for {num_train_epochs} epochs: ")
    eval_rewards = evaluate_cem(env, manager, num_eval_episodes)
    mean_rewards = np.mean(eval_rewards)
    assert (
        mean_rewards >= passing_score_bar
    ), f"{mean_rewards} doesn't pass the bar {passing_score_bar}."
def train_with_replay_buffer_post_step(
    replay_buffer: ReplayBuffer,
    env: gym.Env,
    trainer: Trainer,
    training_freq: int,
    batch_size: int,
    trainer_preprocessor=None,
    device: Union[str, torch.device] = "cpu",
    replay_buffer_inserter=None,
) -> PostStep:
    """
    Build the post_step callback that trains an agent from a replay buffer (RB).

    Args:
        replay_buffer: buffer that receives each transition and is sampled from
        env: used to derive the default preprocessor / inserter
        trainer: responsible for having a .train method to train the model
        training_freq: how many steps in between trains
        batch_size: how big of a batch to sample
        trainer_preprocessor: format RB output for trainer.train; derived
            from the trainer, device and env when omitted
        device: device (or device string) given to the derived preprocessor
        replay_buffer_inserter: callable that writes one transition into the
            RB; derived from the env when omitted

    Returns:
        A callable ``post_step(obs, action, reward, terminal, log_prob)``.
    """
    torch_device = torch.device(device) if isinstance(device, str) else device

    preprocess = trainer_preprocessor
    if preprocess is None:
        preprocess = make_replay_buffer_trainer_preprocessor(
            trainer, torch_device, env
        )

    insert = replay_buffer_inserter
    if insert is None:
        insert = make_replay_buffer_inserter(env)

    # Number of completed post_step calls, shared with the closure below.
    steps_done = 0

    def post_step(
        obs: Any, action: Any, reward: float, terminal: bool, log_prob: float
    ) -> None:
        nonlocal steps_done

        insert(replay_buffer, obs, action, reward, terminal, log_prob)

        # Train on the very first call and then every `training_freq` calls.
        if steps_done % training_freq == 0:
            assert replay_buffer.size >= batch_size
            sampled = replay_buffer.sample_transition_batch_tensor(
                batch_size=batch_size
            )
            trainer.train(preprocess(sampled))

        # Only counted once the step (including any training) has succeeded.
        steps_done += 1

    return post_step
def create_for_trainer(
    cls,
    trainer,
    env: EnvWrapper,
    replay_buffer: ReplayBuffer,
    batch_size: int,
    num_batches: int,
    trainer_preprocessor=None,
    device=None,
):
    """
    Build an instance of ``cls``, deriving the preprocessor when none is given.

    Args:
        trainer: only used to derive ``trainer_preprocessor`` when it is ``None``
        env: forwarded to ``cls`` and to the preprocessor factory
        replay_buffer: forwarded to ``cls``
        batch_size: forwarded to ``cls``
        num_batches: forwarded to ``cls``
        trainer_preprocessor: optional; built with
            ``make_replay_buffer_trainer_preprocessor`` when ``None``
        device: device for the derived preprocessor; any falsy value
            (including the default ``None``) means CPU

    Returns:
        ``cls(...)`` built from ``env``, ``replay_buffer``, ``batch_size``,
        ``num_batches`` and the resolved preprocessor.
    """
    # Truthiness check on purpose: every falsy device falls back to CPU.
    if not device:
        device = torch.device("cpu")

    preprocessor = trainer_preprocessor
    if preprocessor is None:
        preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)

    init_kwargs = dict(
        env=env,
        replay_buffer=replay_buffer,
        batch_size=batch_size,
        num_batches=num_batches,
        trainer_preprocessor=preprocessor,
    )
    return cls(**init_kwargs)
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    # pyre-fixme[9]: saved_mdnrnn_path has type `str`; used as `None`.
    saved_mdnrnn_path: str = None,
):
    """
    Train an agent on embedded states by the MDNRNN.

    Steps, in order:
      1. train the MDNRNN with ``train_mdnrnn`` or load it from ``saved_mdnrnn_path``;
      2. build an embedded replay buffer with ``create_embed_rl_dataset``;
      3. train the agent trainer on batches sampled from that buffer;
      4. run ``num_agent_eval_epochs`` episodes in a ``StateEmbedEnvironment``
         and compare the mean reward with ``passing_score_bar``.

    Args:
        env_name: name handed to ``EnvFactory.make``
        embedding_model: union wrapper whose ``.value`` builds the MDNRNN trainer
        num_embedding_train_transitions: forwarded to ``train_mdnrnn``
        seq_len: sequence length for MDNRNN training and embedding
        batch_size: batch size for both training stages
        num_embedding_train_epochs: forwarded to ``train_mdnrnn``
        train_model: union wrapper whose ``.value`` builds the agent trainer
        num_state_embed_transitions: forwarded to ``create_embed_rl_dataset``
        num_agent_train_epochs: epochs over the embedded replay buffer
        num_agent_eval_epochs: number of evaluation episodes
        use_gpu: build trainers for GPU and place batches on ``cuda``
        passing_score_bar: minimum average evaluation reward required
        saved_mdnrnn_path: when given, MDNRNN training is skipped and weights
            are loaded from this path

    Returns:
        The list of per-episode evaluation rewards.

    Raises:
        AssertionError: if the average reward is below ``passing_score_bar``.
    """
    env = EnvFactory.make(env_name)
    env.seed(SEED)

    # Computed once and reused for both trainers. The string form is kept for
    # Agent.create_for_env, which received a string before; the preprocessor
    # factories get a torch.device.
    device_name = "cuda" if use_gpu else "cpu"
    device = torch.device(device_name)

    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer, device, env
    )

    if saved_mdnrnn_path is None:
        # train from scratch
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # load a pretrained model, and just evaluate it
        # map_location remaps stored tensors onto this run's device, so a
        # checkpoint written on a GPU host can still be read on a CPU-only one.
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path, map_location=device)
        )

    # create embedding dataset
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )

    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(embed_env),
    )
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer, device, env
    )

    # One "epoch" is as many batches as fit into the embedded buffer.
    num_batch_per_epoch = embed_rb.size // batch_size
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch_tensor(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            agent_trainer.train(preprocessed_batch)

    # evaluate model
    rewards = []
    policy = agent_manager.create_policy(serving=False)
    agent = Agent.create_for_env(embed_env, policy=policy, device=device_name)
    for i in range(num_agent_eval_epochs):
        ep_reward = run_episode(env=embed_env, agent=agent)
        rewards.append(ep_reward)
        logger.info(f"Finished eval episode {i} with reward {ep_reward}.")

    # Compute the mean once; it is both logged and checked.
    mean_reward = np.mean(rewards)
    logger.info(f"Average eval reward is {mean_reward}.")
    assert (
        mean_reward >= passing_score_bar
    ), f"average reward doesn't pass our bar {passing_score_bar}"
    return rewards
def train_mdnrnn_and_compute_feature_stats(
    env_name: str,
    model: ModelManager__Union,
    num_train_transitions: int,
    num_test_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    use_gpu: bool,
    saved_mdnrnn_path: Optional[str] = None,
):
    """
    Train MDNRNN Memory Network and compute feature importance/sensitivity.

    A test replay buffer is filled with ``num_test_transitions`` transitions,
    the MDNRNN is trained with ``train_mdnrnn`` or restored from
    ``saved_mdnrnn_path``, and both statistics are computed on the whole test
    buffer with gradients disabled.

    Args:
        env_name: name handed to ``EnvFactory.make``
        model: union wrapper; ``model.value`` is the manager that builds the trainer
        num_train_transitions: forwarded to ``train_mdnrnn``
        num_test_transitions: size of the test replay buffer
        seq_len: used as the replay buffer ``stack_size``
        batch_size: batch size of the test replay buffer and of training
        num_train_epochs: forwarded to ``train_mdnrnn``
        use_gpu: build the trainer for GPU and place batches on ``cuda``
        saved_mdnrnn_path: when given, training is skipped and the MDNRNN
            weights are loaded from this path

    Returns:
        Tuple ``(feature_importance, feature_sensitivity)`` as produced by
        ``calculate_feature_importance`` and ``calculate_feature_sensitivity``.
    """
    env: gym.Env = EnvFactory.make(env_name)
    env.seed(SEED)

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    # Give the factory a torch.device rather than a bare device string.
    device = torch.device("cuda" if use_gpu else "cpu")
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)

    test_replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=num_test_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, test_replay_buffer, num_test_transitions)

    if saved_mdnrnn_path is None:
        # train from scratch
        trainer = train_mdnrnn(
            env=env,
            trainer=trainer,
            trainer_preprocessor=trainer_preprocessor,
            num_train_transitions=num_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            test_replay_buffer=test_replay_buffer,
        )
    else:
        # load a pretrained model, and just evaluate it
        # map_location remaps stored tensors onto this run's device, so a
        # checkpoint written on a GPU host can still be read on a CPU-only one.
        state_dict = torch.load(saved_mdnrnn_path, map_location=device)
        trainer.memory_network.mdnrnn.load_state_dict(state_dict)

    with torch.no_grad():
        trainer.memory_network.mdnrnn.eval()
        # Use everything currently stored in the test buffer as one batch.
        test_batch = test_replay_buffer.sample_transition_batch_tensor(
            batch_size=test_replay_buffer.size
        )
        preprocessed_test_batch = trainer_preprocessor(test_batch)
        feature_importance = calculate_feature_importance(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )
        feature_sensitivity = calculate_feature_sensitivity(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )
        # Put the network back into training mode before returning.
        trainer.memory_network.mdnrnn.train()
    return feature_importance, feature_sensitivity
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    # pyre-fixme[9]: saved_mdnrnn_path has type `str`; used as `None`.
    saved_mdnrnn_path: str = None,
):
    """
    Train an agent on embedded states by the MDNRNN.

    Steps, in order:
      1. train the MDNRNN with ``train_mdnrnn`` or load it from ``saved_mdnrnn_path``;
      2. build an embedded replay buffer with ``create_embed_rl_dataset``;
      3. train the agent trainer on batches sampled from that buffer;
      4. evaluate with ``evaluate_for_n_episodes`` in a ``StateEmbedEnvironment``
         and compare the mean reward with ``passing_score_bar``.

    Args:
        env_name: name handed to ``Gym`` to build the environment
        embedding_model: union wrapper whose ``.value`` builds the MDNRNN trainer
        num_embedding_train_transitions: forwarded to ``train_mdnrnn``
        seq_len: sequence length for MDNRNN training and embedding
        batch_size: batch size for both training stages
        num_embedding_train_epochs: forwarded to ``train_mdnrnn``
        train_model: union wrapper whose ``.value`` builds the agent trainer
        num_state_embed_transitions: forwarded to ``create_embed_rl_dataset``
        num_agent_train_epochs: epochs over the embedded replay buffer
        num_agent_eval_epochs: number of evaluation episodes
        use_gpu: build trainers for GPU and place batches on ``cuda``
        passing_score_bar: minimum average evaluation reward required
        saved_mdnrnn_path: when given, MDNRNN training is skipped and weights
            are loaded from this path

    Returns:
        The rewards returned by ``evaluate_for_n_episodes``.

    Raises:
        AssertionError: if the average reward is below ``passing_score_bar``.
    """
    env = Gym(env_name=env_name)
    env.seed(SEED)

    # Computed once and reused for both trainers. The preprocessor factories
    # get a real torch.device (the type they are declared to take); the string
    # form is kept for Agent.create_for_env, which received a string before.
    device_name = "cuda" if use_gpu else "cpu"
    device = torch.device(device_name)

    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer,
        device,
        env,
    )

    if saved_mdnrnn_path is None:
        # train from scratch
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # load a pretrained model, and just evaluate it
        # map_location remaps stored tensors onto this run's device, so a
        # checkpoint written on a GPU host can still be read on a CPU-only one.
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path, map_location=device)
        )

    # create embedding dataset
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )

    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
        #  `StateEmbedEnvironment`.
        normalization_data_map=build_normalizer(embed_env),
    )
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer,
        device,
        env,
    )

    # One "epoch" is as many batches as fit into the embedded buffer.
    num_batch_per_epoch = embed_rb.size // batch_size
    # FIXME: This has to be wrapped in dataloader
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            # FIXME: This should be fitted with Lightning's trainer
            agent_trainer.train(preprocessed_batch)

    # evaluate model
    policy = agent_manager.create_policy(serving=False)
    # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
    #  `StateEmbedEnvironment`.
    agent = Agent.create_for_env(embed_env, policy=policy, device=device_name)

    # num_processes=1 needed to avoid workers from dying on CircleCI tests
    rewards = evaluate_for_n_episodes(
        n=num_agent_eval_epochs,
        # pyre-fixme[6]: Expected `EnvWrapper` for 2nd param but got
        #  `StateEmbedEnvironment`.
        env=embed_env,
        agent=agent,
        num_processes=1,
    )
    assert (
        np.mean(rewards) >= passing_score_bar
    ), f"average reward doesn't pass our bar {passing_score_bar}"
    return rewards