def train_policy(
    env: EnvWrapper,
    training_policy: Policy,
    num_train_episodes: int,
    post_step: Optional[PostStep] = None,
    post_episode: Optional[PostEpisode] = None,
    use_gpu: bool = False,
) -> np.ndarray:
    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        post_episode_callback=post_episode,
        device=device,
    )
    running_reward = 0
    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        with trange(num_train_episodes, unit=" episode") as t:
            for i in t:
                # Note: run_episode also performs a training step for the agent,
                # if specified in post_step
                trajectory = run_episode(env=env, agent=agent, mdp_id=i, max_steps=200)
                ep_reward = trajectory.calculate_cumulative_reward()
                train_rewards.append(ep_reward)
                running_reward *= REWARD_DECAY
                running_reward += (1 - REWARD_DECAY) * ep_reward
                t.set_postfix(reward=running_reward)

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")
    return np.array(train_rewards)
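# A minimal usage sketch for train_policy. The env construction and the
# random-policy helper below are assumptions for illustration, not necessarily
# the exact ReAgent API.
def example_train_policy_usage():
    env = EnvWrapper(gym.make("CartPole-v0"))  # hypothetical: wrap a Gym env
    policy = make_random_policy_for_env(env)  # hypothetical helper
    rewards = train_policy(env, policy, num_train_episodes=10)
    logger.info(f"mean training reward: {np.mean(rewards)}")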
def test_add_custom_scalars(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        writer.add_custom_scalars = MagicMock()
        with summary_writer_context(writer):
            SummaryWriterContext.add_custom_scalars_multilinechart(
                ["a", "b"], category="cat", title="title"
            )
            with self.assertRaisesRegex(
                AssertionError, "Title \\(title\\) is already in category \\(cat\\)"
            ):
                SummaryWriterContext.add_custom_scalars_multilinechart(
                    ["c", "d"], category="cat", title="title"
                )
            SummaryWriterContext.add_custom_scalars_multilinechart(
                ["e", "f"], category="cat", title="title2"
            )
            SummaryWriterContext.add_custom_scalars_multilinechart(
                ["g", "h"], category="cat2", title="title"
            )

        SummaryWriterContext.add_custom_scalars(writer)
        writer.add_custom_scalars.assert_called_once_with(
            {
                "cat": {
                    "title": ["Multiline", ["a", "b"]],
                    "title2": ["Multiline", ["e", "f"]],
                },
                "cat2": {"title": ["Multiline", ["g", "h"]]},
            }
        )
def train_workflow(
    self,
    train_dataset: Dataset,
    eval_dataset: Optional[Dataset],
    normalization_data_map: Dict[str, NormalizationData],
    model,  # reagent.workflow.model_managers.ModelManager__Union
    num_epochs: int,
    use_gpu: bool,
    parent_workflow_id: int,
    child_workflow_id: int,
    reward_options: Optional[RewardOptions] = None,
    warmstart_path: Optional[str] = None,
) -> RLTrainingOutput:
    manager = model.value

    writer = SummaryWriter()
    logger.info("TensorBoard logging location is: {}".format(writer.log_dir))

    warmstart_input_path = warmstart_path or None
    manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=reward_options,
        normalization_data_map=normalization_data_map,
        warmstart_path=warmstart_input_path,
    )

    with summary_writer_context(writer):
        train_output = manager.train(train_dataset, eval_dataset, num_epochs)

    # TODO: make this a parameter
    torchscript_output_path = f"model_{round(time.time())}.torchscript"
    serving_module = manager.build_serving_module()
    torch.jit.save(serving_module, torchscript_output_path)
    logger.info(f"Saved torchscript model to {torchscript_output_path}")

    return dataclasses.replace(train_output, output_path=torchscript_output_path)
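# Sketch: the TorchScript artifact saved above round-trips with torch.jit.load
# and can be served without the Python model classes. The path below is a
# hypothetical instance of the f"model_{round(time.time())}.torchscript"
# pattern used in train_workflow.
def example_load_serving_module():
    serving_module = torch.jit.load("model_1600000000.torchscript")  # hypothetical path
    logger.info(f"Loaded serving module: {serving_module}")
    return serving_module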
def test_swallowing_exception(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        writer.add_scalar = MagicMock(side_effect=NotImplementedError("test"))
        writer.exceptions_to_ignore = (NotImplementedError, KeyError)
        with summary_writer_context(writer):
            SummaryWriterContext.add_scalar("test", torch.ones(1))
def test_writing_stack(self):
    with TemporaryDirectory() as tmp_dir1, TemporaryDirectory() as tmp_dir2:
        writer1 = SummaryWriter(tmp_dir1)
        writer1.add_scalar = MagicMock()
        writer2 = SummaryWriter(tmp_dir2)
        writer2.add_scalar = MagicMock()
        with summary_writer_context(writer1):
            with summary_writer_context(writer2):
                SummaryWriterContext.add_scalar("test2", torch.ones(1))
            SummaryWriterContext.add_scalar("test1", torch.zeros(1))
        writer1.add_scalar.assert_called_once_with(
            "test1", torch.zeros(1), global_step=0
        )
        writer2.add_scalar.assert_called_once_with(
            "test2", torch.ones(1), global_step=0
        )
def test_not_swallowing_exception(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        writer.add_scalar = MagicMock(side_effect=NotImplementedError("test"))
        with self.assertRaisesRegex(
            NotImplementedError, "test"
        ), summary_writer_context(writer):
            SummaryWriterContext.add_scalar("test", torch.ones(1))
def test_writing(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        writer.add_scalar = MagicMock()
        with summary_writer_context(writer):
            SummaryWriterContext.add_scalar("test", torch.ones(1))
        writer.add_scalar.assert_called_once_with(
            "test", torch.ones(1), global_step=0
        )
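# For reference, a minimal sketch of the context-manager semantics these tests
# exercise: a stack of writers, a shared global step, and a per-writer
# exceptions_to_ignore allowlist. This is an illustration inferred from the
# tests, not the actual SummaryWriterContext/summary_writer_context code.
from contextlib import contextmanager


class SketchSummaryWriterContext:
    _writers = []  # innermost active writer is last
    _global_step = 0

    @classmethod
    def add_scalar(cls, key, value):
        if not cls._writers:
            return None  # no-op (returning None) when no writer is active
        writer = cls._writers[-1]
        try:
            return writer.add_scalar(key, value, global_step=cls._global_step)
        except getattr(writer, "exceptions_to_ignore", ()):
            return None  # swallow only the exception types the writer allows

    @classmethod
    def increase_global_step(cls):
        cls._global_step += 1


@contextmanager
def sketch_summary_writer_context(writer):
    # Push on enter, pop on exit; a None writer makes the context a no-op.
    if writer is not None:
        SketchSummaryWriterContext._writers.append(writer)
    try:
        yield writer
    finally:
        if writer is not None:
            SketchSummaryWriterContext._writers.pop()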
def run_test_offline(
    env_name: str,
    max_steps: Optional[int],
    model: ModelManager__Union,
    replay_memory_size: int,
    num_batches_per_epoch: int,
    num_train_epochs: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    # first fill the replay buffer to burn_in
    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )
    # always fill full RB
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=replay_memory_size
    )

    device = torch.device("cuda") if use_gpu else None
    # pyre-fixme[6]: Expected `device` for 2nd param but got `Optional[torch.device]`.
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)

    writer = SummaryWriter()
    with summary_writer_context(writer):
        for epoch in range(num_train_epochs):
            logger.info(f"Evaluating before epoch {epoch}: ")
            eval_rewards = evaluate_cem(env, manager, max_steps, 1)
            for _ in tqdm(range(num_batches_per_epoch)):
                train_batch = replay_buffer.sample_transition_batch_tensor()
                preprocessed_batch = trainer_preprocessor(train_batch)
                trainer.train(preprocessed_batch)

    logger.info(f"Evaluating after training for {num_train_epochs} epochs: ")
    eval_rewards = evaluate_cem(env, manager, max_steps, num_eval_episodes)
    mean_rewards = np.mean(eval_rewards)
    assert (
        mean_rewards >= passing_score_bar
    ), f"{mean_rewards} doesn't pass the bar {passing_score_bar}."
def test_global_step(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        writer.add_scalar = MagicMock()
        with summary_writer_context(writer):
            SummaryWriterContext.add_scalar("test", torch.ones(1))
            SummaryWriterContext.increase_global_step()
            SummaryWriterContext.add_scalar("test", torch.zeros(1))
        writer.add_scalar.assert_has_calls(
            [
                call("test", torch.ones(1), global_step=0),
                call("test", torch.zeros(1), global_step=1),
            ]
        )
        self.assertEqual(2, len(writer.add_scalar.mock_calls))
def train_workflow(
    self,
    train_dataset: Dataset,
    eval_dataset: Optional[Dataset],
    normalization_data_map: Dict[str, NormalizationData],
    num_epochs: int,
    use_gpu: bool,
    named_model_ids: ModuleNameToEntityId,
    child_workflow_id: int,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
) -> RLTrainingOutput:
    writer = SummaryWriter()
    logger.info("TensorBoard logging location is: {}".format(writer.log_dir))

    warmstart_input_path = warmstart_path or None
    self.initialize_trainer(
        use_gpu=use_gpu,
        # pyre-fixme[6]: Expected `RewardOptions` for 2nd param but got
        #  `Optional[RewardOptions]`.
        reward_options=reward_options,
        normalization_data_map=normalization_data_map,
        warmstart_path=warmstart_input_path,
    )

    if not reader_options:
        reader_options = ReaderOptions()

    with summary_writer_context(writer):
        train_output = self.train(
            train_dataset, eval_dataset, num_epochs, reader_options
        )

    output_paths = {}
    for module_name, serving_module in self.build_serving_modules().items():
        # TODO: make this a parameter; module_name is included so that saving
        # several modules in the same second doesn't collide on one path
        torchscript_output_path = f"{module_name}_{round(time.time())}.torchscript"
        torch.jit.save(serving_module, torchscript_output_path)
        logger.info(f"Saved {module_name} to {torchscript_output_path}")
        output_paths[module_name] = torchscript_output_path
    return dataclasses.replace(train_output, output_paths=output_paths)
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    try:
        # pyre-fixme[16]: `Env` has no attribute `state_feature_config_provider`.
        manager.state_feature_config_provider = env.state_feature_config_provider
        logger.info(
            f"Using environment's state_feature_config_provider.\n"
            f"{manager.state_feature_config_provider}"
        )
    except AttributeError:
        logger.info("state_feature_config_provider override not applicable")

    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} (len {len(trajectory)})"
                f" with reward {ep_reward}."
            )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")

    # Check whether the max score passed the score bar; we explore during
    # training, so the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
    )

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    logger.info(f"average: {np.mean(eval_rewards)};\tmax: {np.max(eval_rewards)}")
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}.\n"
    )
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        # pyre-fixme[6]: Expected `Union[str, torch.device]` for 4th param but got
        #  `Optional[torch.device]`.
        device=device,
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]}, "
        f"less than {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}"
    )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}."
    )

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
def test_swallowing_histogram_value_error(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        with summary_writer_context(writer):
            # All-equal values can make TensorBoard's histogram bucketing raise
            # ValueError; the context is expected to swallow it.
            SummaryWriterContext.add_histogram("bad_histogram", torch.ones(100, 1))
def test_with_none(self):
    with summary_writer_context(None):
        self.assertIsNone(SummaryWriterContext.add_scalar("test", torch.ones(1)))
def single_process_main(gpu_index, *args):
    params = args[0]
    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    rl_parameters = from_json(params["rl"], RLParameters)
    training_parameters = from_json(params["training"], TrainingParameters)
    rainbow_parameters = from_json(params["rainbow"], RainbowDQNParameters)

    model_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )
    state_normalization = BaseWorkflow.read_norm_file(params["state_norm_data_path"])
    action_normalization = BaseWorkflow.read_norm_file(params["action_norm_data_path"])

    writer = SummaryWriter(log_dir=params["model_output_path"])
    logger.info("TensorBoard logging location is: {}".format(writer.log_dir))

    if params["use_all_avail_gpus"]:
        BaseWorkflow.init_multiprocessing(
            int(params["num_processes_per_node"]),
            int(params["num_nodes"]),
            int(params["node_index"]),
            gpu_index,
            params["init_method"],
        )

    workflow = ParametricDqnWorkflow(
        model_params,
        state_normalization,
        action_normalization,
        params["use_gpu"],
        params["use_all_avail_gpus"],
    )

    state_sorted_features, _ = sort_features_by_normalization(state_normalization)
    action_sorted_features, _ = sort_features_by_normalization(action_normalization)
    preprocess_handler = ParametricDqnPreprocessHandler(
        StringKeySparseToDenseProcessor(state_sorted_features),
        StringKeySparseToDenseProcessor(action_sorted_features),
    )

    train_dataset = JSONDatasetReader(
        params["training_data_path"],
        batch_size=training_parameters.minibatch_size,
        preprocess_handler=preprocess_handler,
    )
    eval_dataset = JSONDatasetReader(
        params["eval_data_path"], batch_size=16, preprocess_handler=preprocess_handler
    )

    with summary_writer_context(writer):
        workflow.train_network(train_dataset, eval_dataset, int(params["epochs"]))

    if int(params["node_index"]) == 0 and gpu_index == 0:
        workflow.save_models(params["model_output_path"])
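# Sketch of how a single_process_main like the one above is typically launched:
# torch.multiprocessing.spawn passes the process index as the first argument,
# which matches gpu_index. The params dict is assumed to come from a JSON config
# as elsewhere in this workflow; the main() wrapper itself is illustrative.
def example_launch(params):
    if params["use_all_avail_gpus"]:
        import torch.multiprocessing as mp

        mp.spawn(
            single_process_main,
            args=(params,),
            nprocs=int(params["num_processes_per_node"]),
        )
    else:
        single_process_main(0, params)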