def identify_and_train_network(
    input_table_spec: TableSpec,
    model: ModelManager__Union,
    num_epochs: int,
    use_gpu: Optional[bool] = None,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
    validator: Optional[ModelValidator__Union] = None,
    publisher: Optional[ModelPublisher__Union] = None,
) -> RLTrainingOutput:
    """
    Run feature identification (or data-module preparation) and then train.

    Exactly one of ``setup_data`` / ``normalization_data_map`` is produced:
    if the model manager supplies a data module, its ``prepare_data()`` output
    is used; otherwise feature identification is run on the input table.
    Training itself is delegated to ``query_and_train``.

    Args:
        input_table_spec: table holding the training data.
        model: tagged union wrapping the model manager to train.
        num_epochs: number of training epochs, forwarded to the trainer.
        use_gpu: if None, autodetect via ``torch.cuda.is_available()``.
        reward_options / reader_options / resource_options: optional configs;
            missing reward/reader options default to fresh instances.
        warmstart_path: optional checkpoint to warm-start from.
        validator / publisher: optional post-training validation/publishing.

    Returns:
        The ``RLTrainingOutput`` from ``query_and_train``.
    """
    if use_gpu is None:
        # Plain re-assignment: re-annotating a parameter (`use_gpu: bool = ...`)
        # is a type error (pyre-fixme[35] "Target cannot be annotated").
        use_gpu = torch.cuda.is_available()

    reward_options = reward_options or RewardOptions()
    reader_options = reader_options or ReaderOptions()
    manager = model.value

    normalization_data_map = None
    setup_data = None

    data_module = manager.get_data_module(
        input_table_spec=input_table_spec,
        reward_options=reward_options,
        reader_options=reader_options,
    )
    if data_module is not None:
        setup_data = data_module.prepare_data()
    else:
        normalization_data_map = manager.run_feature_identification(input_table_spec)

    return query_and_train(
        input_table_spec,
        model,
        num_epochs,
        use_gpu=use_gpu,
        setup_data=setup_data,
        normalization_data_map=normalization_data_map,
        reward_options=reward_options,
        reader_options=reader_options,
        resource_options=resource_options,
        warmstart_path=warmstart_path,
        validator=validator,
        publisher=publisher,
    )
def __init__(
    self,
    *,
    input_table_spec: Optional[TableSpec] = None,
    reward_options: Optional[RewardOptions] = None,
    setup_data: Optional[Dict[str, bytes]] = None,
    saved_setup_data: Optional[Dict[str, bytes]] = None,
    reader_options: Optional[ReaderOptions] = None,
    model_manager=None,
):
    """
    Keyword-only initializer for the data module.

    Args:
        input_table_spec: table to read data from (may be None when
            ``setup_data`` is supplied directly).
        reward_options: reward config; defaults to a fresh ``RewardOptions()``.
        setup_data: pre-serialized setup payload, if already prepared.
        saved_setup_data: previously saved setup payload; defaults to ``{}``.
        reader_options: reader config; defaults to a fresh ``ReaderOptions()``.
        model_manager: owning model manager (unannotated in source).
    """
    super().__init__()
    self.input_table_spec = input_table_spec
    # `or` replaces both None and any falsy options object with a default.
    self.reward_options = reward_options or RewardOptions()
    self.reader_options = reader_options or ReaderOptions()
    self._model_manager = model_manager
    self.setup_data = setup_data
    self.saved_setup_data = saved_setup_data or {}
    # Guard flag so setup work runs at most once.
    self._setup_done = False
def setUp(self):
    """Build the networks and parameters needed to construct a QR-DQN trainer."""
    # preparing various components for qr-dqn trainer initialization
    self.params = QRDQNTrainerParameters(actions=["1", "2"], num_atoms=11)
    self.reward_options = RewardOptions()
    self.metrics_to_score = get_metrics_to_score(
        self.reward_options.metric_reward_values
    )
    self.state_dim = 10
    self.action_dim = 2
    self.sizes = [20, 20]
    self.num_atoms = 11
    self.activations = ["relu", "relu"]
    self.dropout_ratio = 0
    self.q_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        num_atoms=self.num_atoms,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q_network_target = self.q_network.get_target_network()
    # Small random feature batch (5 samples x state_dim features) for forward passes.
    self.x = FeatureData(float_features=torch.rand(5, 10))
    self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
    # CPE nets emit one head per tracked metric plus one for the reward,
    # replicated per action.
    self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
        # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
        self.params.actions
    )
    self.reward_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe_target = self.q_network_cpe.get_target_network()
def __init__(
    self,
    *,
    input_table_spec: Optional[TableSpec] = None,
    reward_options: Optional[RewardOptions] = None,
    setup_data: Optional[Dict[str, bytes]] = None,
    saved_setup_data: Optional[Dict[str, bytes]] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    model_manager=None,
):
    """
    Keyword-only initializer for the data module.

    Args:
        input_table_spec: table to read data from (may be None when
            ``setup_data`` is supplied directly).
        reward_options: reward config; defaults to a fresh ``RewardOptions()``.
        setup_data: pre-serialized setup payload, if already prepared.
        saved_setup_data: previously saved setup payload; defaults to ``{}``.
        reader_options: reader config; defaults to a fresh ``ReaderOptions()``.
        resource_options: resource config; defaults to CPU-only
            (``ResourceOptions(gpu=0)``).
        model_manager: owning model manager (unannotated in source).
    """
    super().__init__()
    self.input_table_spec = input_table_spec
    # `or` replaces both None and any falsy options object with a default.
    self.reward_options = reward_options or RewardOptions()
    self.reader_options = reader_options or ReaderOptions()
    self.resource_options = resource_options or ResourceOptions(gpu=0)
    self._model_manager = model_manager
    self.setup_data = setup_data
    self.saved_setup_data = saved_setup_data or {}
    # Guard flag so setup work runs at most once.
    self._setup_done = False
    # Counters tracking how often each dataloader factory was invoked
    # (used by tests to verify call counts).
    self._num_train_data_loader_calls = 0
    self._num_val_data_loader_calls = 0
    self._num_test_data_loader_calls = 0
def run_test(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
    minibatch_size: Optional[int] = None,
):
    """
    Train a policy online in ``env`` and assert it clears ``passing_score_bar``.

    Handles both Lightning trainers (fit via ``pl.Trainer`` on a replay-buffer
    dataset) and legacy trainers (train via a per-step callback). For legacy
    trainers, ``minibatch_size`` defaults to ``trainer.minibatch_size``;
    Lightning trainers require the caller to pass ``minibatch_size`` explicitly
    (asserted below). After training, a serving policy is evaluated and its
    mean reward must also clear the bar.
    """
    env = env.value
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if not isinstance(trainer, pl.LightningModule):
        if minibatch_size is None:
            minibatch_size = trainer.minibatch_size
        assert minibatch_size == trainer.minibatch_size
    assert minibatch_size is not None

    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if isinstance(trainer, pl.LightningModule):
        agent = Agent.create_for_env(env, policy=training_policy)
        # TODO: Simplify this setup by creating LightningDataModule
        dataset = ReplayBufferDataset.create_for_trainer(
            trainer,
            env,
            agent,
            replay_buffer,
            batch_size=minibatch_size,
            training_frequency=train_every_ts,
            num_episodes=num_train_episodes,
            max_steps=200,
        )
        data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
        # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
        pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu))
        pl_trainer.fit(trainer, data_loader)

        # TODO: Also check train_reward
    else:
        post_step = train_with_replay_buffer_post_step(
            replay_buffer=replay_buffer,
            env=env,
            trainer=trainer,
            training_freq=train_every_ts,
            batch_size=trainer.minibatch_size,
            device=device,
        )
        env.seed(SEED)
        env.action_space.seed(SEED)
        train_rewards = train_policy(
            env,
            training_policy,
            num_train_episodes,
            post_step=post_step,
            post_episode=None,
            use_gpu=use_gpu,
        )
        # Check whether the max score passed the score bar; we explore during training
        # the return could be bad (leading to flakiness in C51 and QRDQN).
        assert np.max(train_rewards) >= passing_score_bar, (
            f"max reward ({np.max(train_rewards)}) after training for "
            f"{len(train_rewards)} episodes is less than < {passing_score_bar}.\n"
        )

    serving_policy = manager.create_policy(serving=True)
    eval_rewards = eval_policy(env, serving_policy, num_eval_episodes, serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    saved_mdnrnn_path: Optional[str] = None,
):
    """
    Train an agent on embedded states by the MDNRNN.

    First trains (or loads) an MDNRNN memory network, builds a replay
    dataset of state embeddings, wraps the env so observations are
    embedded, trains an agent on the wrapped env, then evaluates it.

    Args:
        env_name: Gym environment id.
        embedding_model / train_model: model-manager unions for the MDNRNN
            and the downstream agent respectively.
        num_embedding_train_transitions, seq_len, batch_size,
        num_embedding_train_epochs: MDNRNN training configuration.
        num_state_embed_transitions: size of the embedding dataset.
        num_agent_train_epochs / num_agent_eval_epochs: agent train/eval budget.
        use_gpu: run on CUDA when True.
        passing_score_bar: minimum acceptable mean eval reward.
        saved_mdnrnn_path: if given, load this pretrained MDNRNN instead of
            training from scratch. (Annotated Optional[str] — the old
            ``str = None`` annotation was wrong, per its pyre-fixme[9].)

    Returns:
        Per-episode eval rewards of the trained agent.
    """
    env = Gym(env_name=env_name)
    env.seed(SEED)

    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer,
        # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
        device,
        env,
    )
    if saved_mdnrnn_path is None:
        # train from scratch
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # load a pretrained model, and just evaluate it
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path)
        )

    # create embedding dataset
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )
    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
        #  `StateEmbedEnvironment`.
        normalization_data_map=build_normalizer(embed_env),
    )
    device = "cuda" if use_gpu else "cpu"
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer,
        # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
        device,
        env,
    )
    num_batch_per_epoch = embed_rb.size // batch_size
    # FIXME: This has to be wrapped in dataloader
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            # FIXME: This should be fitted with Lightning's trainer
            agent_trainer.train(preprocessed_batch)

    # evaluate model
    policy = agent_manager.create_policy(serving=False)
    # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
    #  `StateEmbedEnvironment`.
    agent = Agent.create_for_env(embed_env, policy=policy, device=device)
    # num_processes=1 needed to avoid workers from dying on CircleCI tests
    rewards = evaluate_for_n_episodes(
        n=num_agent_eval_epochs,
        # pyre-fixme[6]: Expected `EnvWrapper` for 2nd param but got
        #  `StateEmbedEnvironment`.
        env=embed_env,
        agent=agent,
        num_processes=1,
    )
    assert (
        np.mean(rewards) >= passing_score_bar
    ), f"average reward doesn't pass our bar {passing_score_bar}"
    return rewards
def query_and_train(
    input_table_spec: TableSpec,
    model: ModelManager__Union,
    num_epochs: int,
    use_gpu: bool,
    *,
    setup_data: Optional[Dict[str, bytes]] = None,
    saved_setup_data: Optional[Dict[str, bytes]] = None,
    normalization_data_map: Optional[Dict[str, NormalizationData]] = None,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
    validator: Optional[ModelValidator__Union] = None,
    publisher: Optional[ModelPublisher__Union] = None,
    named_model_ids: Optional[ModuleNameToEntityId] = None,
    recurring_period: Optional[RecurringPeriod] = None,
) -> RLTrainingOutput:
    """
    Query training/eval data and run the full training workflow.

    Exactly one of ``setup_data`` / ``normalization_data_map`` must end up
    set (enforced below): data-module-based managers use ``setup_data``;
    legacy managers query train/eval datasets from ``normalization_data_map``.
    Optionally validates and publishes the trained model afterwards.
    """
    child_workflow_id = get_workflow_id()
    if named_model_ids is None:
        named_model_ids = get_new_named_entity_ids(model.value.serving_module_names())

    logger.info("Starting query")

    reward_options = reward_options or RewardOptions()
    reader_options = reader_options or ReaderOptions()
    resource_options = resource_options or ResourceOptions()
    manager = model.value

    if saved_setup_data is not None:

        def _maybe_get_bytes(v) -> bytes:
            if isinstance(v, bytes):
                return v

            # HACK: FBLearner sometimes pack bytes into Blob
            return v.data

        saved_setup_data = {k: _maybe_get_bytes(v) for k, v in saved_setup_data.items()}

    if setup_data is None:
        data_module = manager.get_data_module(
            input_table_spec=input_table_spec,
            reward_options=reward_options,
            reader_options=reader_options,
            saved_setup_data=saved_setup_data,
        )
        if data_module is not None:
            setup_data = data_module.prepare_data()
            # Throw away existing normalization data map
            normalization_data_map = None

    # Exactly one of the two inputs may be set past this point.
    if sum([int(setup_data is not None), int(normalization_data_map is not None)]) != 1:
        raise ValueError("setup_data and normalization_data_map are mutually exclusive")

    train_dataset = None
    eval_dataset = None
    if normalization_data_map is not None:
        # Legacy path: query datasets directly from the input table.
        calc_cpe_in_training = manager.should_generate_eval_dataset
        sample_range_output = get_sample_range(input_table_spec, calc_cpe_in_training)
        train_dataset = manager.query_data(
            input_table_spec=input_table_spec,
            sample_range=sample_range_output.train_sample_range,
            reward_options=reward_options,
        )
        eval_dataset = None
        if calc_cpe_in_training:
            eval_dataset = manager.query_data(
                input_table_spec=input_table_spec,
                sample_range=sample_range_output.eval_sample_range,
                reward_options=reward_options,
            )

    logger.info("Starting training")

    results = manager.train_workflow(
        train_dataset,
        eval_dataset,
        num_epochs=num_epochs,
        use_gpu=use_gpu,
        setup_data=setup_data,
        normalization_data_map=normalization_data_map,
        named_model_ids=named_model_ids,
        child_workflow_id=child_workflow_id,
        reward_options=reward_options,
        reader_options=reader_options,
        resource_options=resource_options,
        warmstart_path=warmstart_path,
    )

    if validator is not None:
        results = run_validator(validator, results)

    if publisher is not None:
        results = run_publisher(
            publisher,
            model,
            results,
            named_model_ids,
            child_workflow_id,
            recurring_period,
        )
    return results
def build_trainer(
    self,
    normalization_data_map: Dict[str, NormalizationData],
    use_gpu: bool,
    reward_options: Optional[RewardOptions] = None,
) -> DiscreteCRRTrainer:
    """
    Build a DiscreteCRRTrainer: actor + Q1 (and optional Q2 for double-Q),
    each with a target network, plus CPE networks when
    ``eval_parameters.calc_cpe_in_training`` is set.

    Args:
        normalization_data_map: normalization data keyed by NormalizationKey.
        use_gpu: accepted for interface parity; not read in this body.
        reward_options: defaults to a fresh ``RewardOptions()``.

    Returns:
        A fully wired ``DiscreteCRRTrainer``.
    """
    actor_net_builder = self.actor_net_builder.value
    actor_network = actor_net_builder.build_actor(
        normalization_data_map[NormalizationKey.STATE], len(self.action_names)
    )
    actor_network_target = actor_network.get_target_network()

    # The arguments to q_network1 and q_network2 below are modeled after those in discrete_dqn.py
    critic_net_builder = self.critic_net_builder.value

    q1_network = critic_net_builder.build_q_network(
        self.state_feature_config,
        normalization_data_map[NormalizationKey.STATE],
        len(self.action_names),
    )
    q1_network_target = q1_network.get_target_network()

    q2_network = q2_network_target = None
    # pyre-fixme[16]: `CRRTrainerParameters` has no attribute
    #  `double_q_learning`.
    if self.trainer_param.double_q_learning:
        q2_network = critic_net_builder.build_q_network(
            self.state_feature_config,
            normalization_data_map[NormalizationKey.STATE],
            len(self.action_names),
        )
        q2_network_target = q2_network.get_target_network()

    reward_options = reward_options or RewardOptions()
    metrics_to_score = get_metrics_to_score(reward_options.metric_reward_values)

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if self.eval_parameters.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(
            # pyre-fixme[16]: `CRRTrainerParameters` has no attribute `actions`.
            self.trainer_param.actions
        )

        cpe_net_builder = self.cpe_net_builder.value
        reward_network = cpe_net_builder.build_q_network(
            self.state_feature_config,
            normalization_data_map[NormalizationKey.STATE],
            num_output_nodes,
        )
        q_network_cpe = cpe_net_builder.build_q_network(
            self.state_feature_config,
            normalization_data_map[NormalizationKey.STATE],
            num_output_nodes,
        )

        q_network_cpe_target = q_network_cpe.get_target_network()

    trainer = DiscreteCRRTrainer(
        actor_network=actor_network,
        actor_network_target=actor_network_target,
        q1_network=q1_network,
        q1_network_target=q1_network_target,
        reward_network=reward_network,
        q2_network=q2_network,
        q2_network_target=q2_network_target,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=metrics_to_score,
        evaluation=self.eval_parameters,
        # pyre-fixme[16]: `CRRTrainerParameters` has no attribute `asdict`.
        **self.trainer_param.asdict(),
    )
    return trainer
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Train a policy online in ``env_name`` via per-step replay-buffer updates,
    asserting the LAST training episode's reward clears ``passing_score_bar``,
    then evaluate a serving policy and assert the mean eval reward does too.
    """
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        # pyre-fixme[6]: Expected `Union[str, torch.device]` for 4th param but got
        #  `Optional[torch.device]`.
        device=device,
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]},"
        f"less than < {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}"
    )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    # squeeze(1): evaluate_for_n_episodes returns shape (n, 1).
    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)},"
        f"less than < {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}."
    )

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
def query_and_train(
    input_table_spec: TableSpec,
    model: ModelManager__Union,
    normalization_data_map: Dict[str, NormalizationData],
    num_epochs: int,
    use_gpu: bool,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
    validator: Optional[ModelValidator__Union] = None,
    publisher: Optional[ModelPublisher__Union] = None,
    parent_workflow_id: Optional[int] = None,
    recurring_period: Optional[RecurringPeriod] = None,
) -> RLTrainingOutput:
    """
    Query train (and optionally eval) datasets from the input table, run the
    training workflow, then optionally validate and publish the result.

    ``parent_workflow_id`` defaults to this run's own workflow id when not
    given (i.e. a non-recurring run is its own parent).
    """
    child_workflow_id = get_workflow_id()
    if parent_workflow_id is None:
        parent_workflow_id = child_workflow_id

    logger.info("Starting query")

    reward_options = reward_options or RewardOptions()
    reader_options = reader_options or ReaderOptions()
    resource_options = resource_options or ResourceOptions()
    manager = model.value
    calc_cpe_in_training = manager.should_generate_eval_dataset
    # Split the table into train/eval sample ranges.
    sample_range_output = get_sample_range(input_table_spec, calc_cpe_in_training)
    train_dataset = manager.query_data(
        input_table_spec=input_table_spec,
        sample_range=sample_range_output.train_sample_range,
        reward_options=reward_options,
    )
    eval_dataset = None
    if calc_cpe_in_training:
        eval_dataset = manager.query_data(
            input_table_spec=input_table_spec,
            sample_range=sample_range_output.eval_sample_range,
            reward_options=reward_options,
        )

    logger.info("Starting training")
    results = manager.train_workflow(
        train_dataset,
        eval_dataset,
        normalization_data_map,
        num_epochs,
        use_gpu,
        parent_workflow_id=parent_workflow_id,
        child_workflow_id=child_workflow_id,
        reward_options=reward_options,
        reader_options=reader_options,
        resource_options=resource_options,
        warmstart_path=warmstart_path,
    )

    if validator is not None:
        results = run_validator(validator, results)

    if publisher is not None:
        results = run_publisher(
            publisher,
            model,
            results,
            parent_workflow_id,
            child_workflow_id,
            recurring_period,
        )
    return results
def train_seq2reward_and_compute_reward_mse(
    env_name: str,
    model: ModelManager__Union,
    num_train_transitions: int,
    num_test_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    use_gpu: bool,
    saved_seq2reward_path: Optional[str] = None,
):
    """Train Seq2Reward Network and compute reward mse."""
    env = Gym(env_name=env_name)
    # pyre-fixme[16]: `Gym` has no attribute `seed`.
    env.seed(SEED)

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        trainer, device, env
    )
    # Held-out buffer of stacked sequences used only for the final loss.
    test_replay_buffer = ReplayBuffer(
        replay_capacity=num_test_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, test_replay_buffer, num_test_transitions)

    if saved_seq2reward_path is None:
        # train from scratch
        trainer = train_seq2reward(
            env=env,
            trainer=trainer,
            trainer_preprocessor=trainer_preprocessor,
            num_train_transitions=num_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            test_replay_buffer=test_replay_buffer,
        )
    else:
        # load a pretrained model, and just evaluate it
        trainer.seq2reward_network.load_state_dict(torch.load(saved_seq2reward_path))
    # pyre-fixme[16]: `Gym` has no attribute `observation_space`.
    state_dim = env.observation_space.shape[0]
    with torch.no_grad():
        # Switch to eval mode for the held-out loss, restore train mode after.
        trainer.seq2reward_network.eval()
        test_batch = test_replay_buffer.sample_transition_batch(
            batch_size=test_replay_buffer.size
        )
        preprocessed_test_batch = trainer_preprocessor(test_batch)
        adhoc_padding(preprocessed_test_batch, state_dim=state_dim)
        losses = trainer.get_loss(preprocessed_test_batch)
        detached_losses = losses.cpu().detach().item()
        trainer.seq2reward_network.train()
    return detached_losses
def setUp(self):
    """Build the networks, parameters, and a sample batch for a CRR trainer."""
    # preparing various components for qr-dqn trainer initialization
    self.batch_size = 3

    self.state_dim = 10
    self.action_dim = 2
    self.num_layers = 2
    self.sizes = [20 for _ in range(self.num_layers)]
    self.num_atoms = 11
    self.activations = ["relu" for _ in range(self.num_layers)]
    self.dropout_ratio = 0
    self.exploration_variance = 1e-10

    self.actions = [str(i) for i in range(self.action_dim)]
    self.params = CRRTrainerParameters(actions=self.actions)
    self.reward_options = RewardOptions()
    self.metrics_to_score = get_metrics_to_score(
        self.reward_options.metric_reward_values
    )

    self.actor_network = FullyConnectedActor(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        exploration_variance=self.exploration_variance,
    )
    self.actor_network_target = self.actor_network.get_target_network()

    self.q1_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q1_network_target = self.q1_network.get_target_network()

    self.q2_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q2_network_target = self.q2_network.get_target_network()

    # CPE nets emit one head per tracked metric plus one for the reward,
    # replicated per action.
    self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
        self.params.actions
    )
    self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
    self.reward_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe_target = self.q_network_cpe.get_target_network()

    # One fixed synthetic minibatch (batch_size transitions) for trainer tests.
    self.inp = DiscreteDqnInput(
        state=FeatureData(
            float_features=torch.rand(self.batch_size, self.state_dim)
        ),
        next_state=FeatureData(
            float_features=torch.rand(self.batch_size, self.state_dim)
        ),
        reward=torch.ones(self.batch_size, 1),
        time_diff=torch.ones(self.batch_size, 1) * 2,
        step=torch.ones(self.batch_size, 1) * 2,
        not_terminal=torch.ones(
            self.batch_size, 1
        ),  # todo: check terminal behavior
        action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
        next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
        possible_actions_mask=torch.ones(self.batch_size, self.action_dim),
        possible_next_actions_mask=torch.ones(self.batch_size, self.action_dim),
        extras=ExtraData(action_probability=torch.ones(self.batch_size, 1)),
    )
def run_test_offline(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    num_batches_per_epoch: int,
    num_train_epochs: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    minibatch_size: int,
    use_gpu: bool,
):
    """
    Offline (batch) RL test: fill a replay buffer to capacity with a random
    policy, fit the trainer on it with Lightning, then evaluate via CEM and
    assert the mean reward clears ``passing_score_bar``.
    """
    env = Gym(env_name=env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.build_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    # first fill the replay buffer to burn_in
    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )
    # always fill full RB
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=env,
        replay_buffer=replay_buffer,
        desired_size=replay_memory_size,
        agent=agent,
    )

    device = torch.device("cuda") if use_gpu else None
    dataset = OfflineReplayBufferDataset.create_for_trainer(
        trainer,
        env,
        replay_buffer,
        batch_size=minibatch_size,
        num_batches=num_batches_per_epoch,
        device=device,
    )
    data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
    # Unique root dir keeps concurrent test runs from clobbering each other's logs.
    pl_trainer = pl.Trainer(
        max_epochs=num_train_epochs,
        gpus=int(use_gpu),
        deterministic=True,
        default_root_dir=f"lightning_log_{str(uuid.uuid4())}",
    )
    pl_trainer.fit(trainer, data_loader)

    logger.info(f"Evaluating after training for {num_train_epochs} epochs: ")
    eval_rewards = evaluate_cem(env, manager, trainer, num_eval_episodes)
    mean_rewards = np.mean(eval_rewards)
    assert (
        mean_rewards >= passing_score_bar
    ), f"{mean_rewards} doesn't pass the bar {passing_score_bar}."
def run_test(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Train a policy online against ``env`` and assert it clears the score bar.

    The replay buffer is burned in with at least one minibatch before
    training; training happens via a per-step callback. Afterwards a serving
    policy is evaluated and its mean reward must also clear the bar.
    """
    env = env.value
    norm_data = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(norm_data)}")

    model_manager = model.value
    rl_trainer = model_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=norm_data,
    )
    policy_for_training = model_manager.create_policy(serving=False)

    compute_device = torch.device("cuda") if use_gpu else torch.device("cpu")
    buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=rl_trainer.minibatch_size
    )

    # Burn in: the buffer must hold at least one minibatch before training.
    burn_in = max(train_after_ts, rl_trainer.minibatch_size)
    fill_replay_buffer(env=env, replay_buffer=buffer, desired_size=burn_in)

    step_callback = train_with_replay_buffer_post_step(
        replay_buffer=buffer,
        env=env,
        trainer=rl_trainer,
        training_freq=train_every_ts,
        batch_size=rl_trainer.minibatch_size,
        device=compute_device,
    )

    env.seed(SEED)
    env.action_space.seed(SEED)

    train_rewards = train_policy(
        env,
        policy_for_training,
        num_train_episodes,
        post_step=step_callback,
        post_episode=None,
        use_gpu=use_gpu,
    )

    # Check whether the max score passed the score bar; we explore during training
    # the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than < {passing_score_bar}.\n"
    )

    serving_policy = model_manager.create_policy(serving=True)
    eval_rewards = eval_policy(env, serving_policy, num_eval_episodes, serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"
def run_test(
    env: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Train a policy online in env ``env`` (created via EnvFactory) with a
    replay-buffer post-step callback, asserting the LAST training episode
    clears ``passing_score_bar``; then evaluate a serving policy (with a
    gym→ReAgent observation preprocessor) and assert its mean reward too.
    """
    env = EnvFactory.make(env)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is {normalization}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        replay_burnin=train_after_ts,
        device=device,
    )

    training_policy = manager.create_policy(serving=False)
    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    train_rewards = []
    for i in range(num_train_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        train_rewards.append(ep_reward)
        logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]},"
        f"less than < {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}"
    )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    # Convert a raw gym observation into the (values, presence) tensor pair
    # expected by the serving policy.
    def gym_to_reagent_serving(obs: np.array) -> Tuple[torch.Tensor, torch.Tensor]:
        obs_tensor = torch.tensor(obs).float().unsqueeze(0)
        presence_tensor = torch.ones_like(obs_tensor)
        return (obs_tensor, presence_tensor)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env(
        env, policy=serving_policy, obs_preprocessor=gym_to_reagent_serving
    )

    eval_rewards = []
    for i in range(num_eval_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        eval_rewards.append(ep_reward)
        logger.info(f"Finished eval episode {i} with reward {ep_reward}.")

    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)},"
        f"less than < {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}."
    )

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
def train_mdnrnn_and_compute_feature_stats(
    env_name: str,
    model: ModelManager__Union,
    num_train_transitions: int,
    num_test_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    use_gpu: bool,
    saved_mdnrnn_path: Optional[str] = None,
):
    """Train MDNRNN Memory Network and compute feature importance/sensitivity."""
    env: gym.Env = EnvFactory.make(env_name)
    env.seed(SEED)

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)
    # Held-out buffer of stacked sequences used only for the feature stats.
    test_replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=num_test_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, test_replay_buffer, num_test_transitions)

    if saved_mdnrnn_path is None:
        # train from scratch
        trainer = train_mdnrnn(
            env=env,
            trainer=trainer,
            trainer_preprocessor=trainer_preprocessor,
            num_train_transitions=num_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            test_replay_buffer=test_replay_buffer,
        )
    else:
        # load a pretrained model, and just evaluate it
        trainer.memory_network.mdnrnn.load_state_dict(torch.load(saved_mdnrnn_path))

    with torch.no_grad():
        # Switch to eval mode for the stats computation, restore train mode after.
        trainer.memory_network.mdnrnn.eval()
        test_batch = test_replay_buffer.sample_transition_batch_tensor(
            batch_size=test_replay_buffer.size
        )
        preprocessed_test_batch = trainer_preprocessor(test_batch)
        feature_importance = calculate_feature_importance(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )

        feature_sensitivity = calculate_feature_sensitivity(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )

        trainer.memory_network.mdnrnn.train()
    return feature_importance, feature_sensitivity
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    saved_mdnrnn_path: Optional[str] = None,
):
    """Train an agent on states embedded by an MDNRNN memory network.

    Steps:
      1. Train (or, when ``saved_mdnrnn_path`` is given, load) an MDNRNN
         embedding model on transitions from ``env_name``.
      2. Build a replay buffer of embedded states and wrap the env in a
         ``StateEmbedEnvironment``.
      3. Train ``train_model`` on the embedded environment for
         ``num_agent_train_epochs`` epochs, then evaluate it for
         ``num_agent_eval_epochs`` episodes.

    Returns:
        List of per-episode eval rewards. Raises ``AssertionError`` if their
        mean does not reach ``passing_score_bar``.
    """
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )
    device = "cuda" if use_gpu else "cpu"
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer, device, env
    )
    if saved_mdnrnn_path is None:
        # train from scratch
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # load a pretrained model, and just evaluate it
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path)
        )

    # create embedding dataset
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )
    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(embed_env),
    )
    # NOTE(review): the preprocessor is built against the raw `env`, not
    # `embed_env` — presumably only the trainer/device matter here, but
    # confirm this is intentional.
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer, device, env
    )
    # Train the agent off the pre-built embedding replay buffer.
    num_batch_per_epoch = embed_rb.size // batch_size
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch_tensor(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            agent_trainer.train(preprocessed_batch)

    # evaluate model
    rewards = []
    policy = agent_manager.create_policy(serving=False)
    agent = Agent.create_for_env(embed_env, policy=policy, device=device)
    for i in range(num_agent_eval_epochs):
        ep_reward = run_episode(env=embed_env, agent=agent)
        rewards.append(ep_reward)
        logger.info(f"Finished eval episode {i} with reward {ep_reward}.")
    logger.info(f"Average eval reward is {np.mean(rewards)}.")
    assert (
        np.mean(rewards) >= passing_score_bar
    ), f"average reward doesn't pass our bar {passing_score_bar}"
    return rewards
def run_test_replay_buffer(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
    minibatch_size: Optional[int] = None,
):
    """
    Run an online learning test with a replay buffer. The replay buffer is pre-filled, then the training starts. Each transition is added to the replay buffer immediately after it takes place.
    """
    env = env.value
    # Seed everything (Lightning global, env, and action space) for determinism.
    # pyre-fixme[16]: Module `pl` has no attribute `seed_everything`.
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")
    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)
    # Legacy (non-Lightning) trainers carry their own minibatch_size; the
    # caller-supplied value must agree with it. Lightning trainers must be
    # given minibatch_size explicitly, or the assert below fires.
    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if not isinstance(trainer, pl.LightningModule):
        if minibatch_size is None:
            minibatch_size = trainer.minibatch_size
        assert minibatch_size == trainer.minibatch_size
    assert minibatch_size is not None
    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )
    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer using random policy; ensure at least one
    # full minibatch is available before training starts
    train_after_ts = max(train_after_ts, minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )
    agent = Agent.create_for_env(env, policy=training_policy, device=device)
    # TODO: Simplify this setup by creating LightningDataModule
    dataset = ReplayBufferDataset.create_for_trainer(
        trainer,
        env,
        agent,
        replay_buffer,
        batch_size=minibatch_size,
        training_frequency=train_every_ts,
        num_episodes=num_train_episodes,
        max_steps=200,
        device=device,
    )
    data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
    # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
    pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu))
    # Note: the fit() function below also evaluates the agent along the way
    # and adds the new transitions to the replay buffer, so it is training
    # on incrementally larger and larger buffers.
    pl_trainer.fit(trainer, data_loader)

    # TODO: Also check train_reward

    serving_policy = manager.create_policy(serving=True)

    eval_rewards = eval_policy(env, serving_policy, num_eval_episodes, serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """Online training test against a Gym env using a post-step replay buffer.

    Burns in the replay buffer with a random policy, trains the model while
    collecting ``num_train_episodes`` episodes (training every
    ``train_every_ts`` steps once ``train_after_ts`` transitions exist),
    then evaluates the serving policy for ``num_eval_episodes`` episodes.

    Raises:
        AssertionError: if the max training reward or the mean eval reward
            is below ``passing_score_bar``.
    """
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")
    manager = model.value
    try:
        # pyre-fixme[16]: `Env` has no attribute `state_feature_config_provider`.
        manager.state_feature_config_provider = env.state_feature_config_provider
        logger.info(
            f"Using environment's state_feature_config_provider.\n"
            f"{manager.state_feature_config_provider}"
        )
    except AttributeError:
        logger.info("state_feature_config_provider override not applicable")
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)
    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )
    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )
    # Trains every train_every_ts env steps, from the growing replay buffer.
    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )
    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )
    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} (len {len(trajectory)})"
                f" with reward {ep_reward}."
            )
    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")
    # Check whether the max score passed the score bar; we explore during training
    # the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
    )
    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)
    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)
    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    logger.info(f"average: {np.mean(eval_rewards)};\tmax: {np.max(eval_rewards)}")
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}.\n"
    )
def run_test_online_episode(
    env: Env__Union,
    model: ModelManager__Union,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Run an online learning test. At the end of each episode training is run on the trajectory.
    """
    env = env.value
    # Seed everything (Lightning global, env, and action space) for determinism.
    # pyre-fixme[16]: Module `pl` has no attribute `seed_everything`.
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")
    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    policy = manager.create_policy(serving=False)
    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    agent = Agent.create_for_env(env, policy, device=device)
    # Two training paths: Lightning trainers fit on an EpisodicDataset,
    # legacy trainers use the per-episode callback in train_policy.
    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if isinstance(trainer, pl.LightningModule):
        # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
        pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu), deterministic=True)
        dataset = EpisodicDataset(
            env=env, agent=agent, num_episodes=num_train_episodes, seed=SEED
        )
        pl_trainer.fit(trainer, dataset)
    else:
        post_episode_callback = train_post_episode(env, trainer, use_gpu)
        _ = train_policy(
            env,
            policy,
            num_train_episodes,
            post_step=None,
            post_episode=post_episode_callback,
            use_gpu=use_gpu,
        )
    # Evaluate in a single process so results stay deterministic under the seed.
    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"