def test_no_soft_update(self):
    # Target network shares its parameters with the online network, so the
    # soft update should leave everything untouched ("no soft update").
    model = Model()
    target_model = model
    for target_param, param in zip(target_model.parameters(), model.parameters()):
        self.assertIs(target_param, param)

    # Take one optimizer step so the (shared) parameters actually change.
    optimizer = torch.optim.Adam(model.parameters())
    x = torch.tensor([1, 2], dtype=torch.int64)
    emb = model(x)
    loss = emb.sum()
    loss.backward()
    optimizer.step()

    params = list(model.parameters())
    self.assertEqual(1, len(params))
    param = params[0].detach().numpy()

    trainer = RLTrainer(rl_parameters=RLParameters(), use_gpu=False)
    trainer._soft_update(model, target_model, 0.1)

    # Because the parameters are shared, the target network is still exactly
    # equal to the online network after the update.
    target_params = list(target_model.parameters())
    self.assertEqual(1, len(target_params))
    target_param = target_params[0].detach().numpy()

    npt.assert_array_equal(target_param, param)
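# Context for the assertions above: the soft-update rule blends the target
# network toward the online network, but skips parameters that are literally
# the same tensor object in both modules -- which is why sharing parameters
# yields "no soft update". A minimal sketch of such an update, assuming the
# standard target <- tau * online + (1 - tau) * target rule (not necessarily
# the exact implementation in this repo):
def soft_update_sketch(network, target_network, tau: float) -> None:
    for t_param, param in zip(target_network.parameters(), network.parameters()):
        if t_param is param:
            # Shared parameter object: nothing to blend (the "no soft update" case).
            continue
        with torch.no_grad():
            # target <- tau * online + (1 - tau) * target
            t_param.copy_(tau * param + (1.0 - tau) * t_param)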
def __init__(
    self,
    q_network,
    q_network_target,
    metrics_to_score=None,
    loss_reporter=None,
    use_gpu: bool = False,
    actions: List[str] = field(default_factory=list),  # noqa: B008
    rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
    double_q_learning: bool = True,
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    num_atoms: int = 51,
    qmin: float = -100,
    qmax: float = 200,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    evaluation: EvaluationParameters = field(  # noqa: B008
        default_factory=EvaluationParameters
    ),
) -> None:
    RLTrainer.__init__(
        self,
        rl,
        use_gpu=use_gpu,
        metrics_to_score=metrics_to_score,
        actions=actions,
        loss_reporter=loss_reporter,
    )
    self.double_q_learning = double_q_learning
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step
    self._actions = actions
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = optimizer.make_optimizer(q_network.parameters())
    self.qmin = qmin
    self.qmax = qmax
    self.num_atoms = num_atoms
    self.support = torch.linspace(
        self.qmin, self.qmax, self.num_atoms, device=self.device
    )
    self.scale_support = (self.qmax - self.qmin) / (self.num_atoms - 1.0)
    self.reward_boosts = torch.zeros([1, len(self._actions)], device=self.device)
    if rl.reward_boost is not None:
        # pyre-fixme[16]: Optional type has no attribute `keys`.
        for k in rl.reward_boost.keys():
            i = self._actions.index(k)
            # pyre-fixme[16]: Optional type has no attribute `__getitem__`.
            self.reward_boosts[0, i] = rl.reward_boost[k]
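# Worked example for the support arithmetic above: with the defaults
# qmin=-100, qmax=200, num_atoms=51, torch.linspace places the atoms exactly
# scale_support = (200 - (-100)) / (51 - 1) = 6.0 apart. A quick standalone
# check (plain PyTorch, no trainer required):
qmin, qmax, num_atoms = -100.0, 200.0, 51
support = torch.linspace(qmin, qmax, num_atoms)
scale_support = (qmax - qmin) / (num_atoms - 1.0)
assert scale_support == 6.0
# Adjacent atoms are separated by scale_support (up to float rounding).
assert torch.allclose(
    support[1:] - support[:-1], torch.full((num_atoms - 1,), scale_support)
)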
def __init__(
    self,
    q_network,
    q_network_target,
    parameters: C51TrainerParameters,
    use_gpu=False,
    metrics_to_score=None,
    loss_reporter=None,
) -> None:
    RLTrainer.__init__(
        self,
        parameters.rl,
        use_gpu=use_gpu,
        metrics_to_score=metrics_to_score,
        actions=parameters.actions,
        loss_reporter=loss_reporter,
    )
    self.double_q_learning = parameters.double_q_learning
    self.minibatch_size = parameters.minibatch_size
    self.minibatches_per_step = parameters.minibatches_per_step or 1
    self._actions = parameters.actions if parameters.actions is not None else []
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = self._get_optimizer(q_network, parameters.optimizer)
    self.qmin = parameters.qmin
    self.qmax = parameters.qmax
    self.num_atoms = parameters.num_atoms
    # Fixed support of num_atoms evenly spaced atom locations in [qmin, qmax].
    self.support = torch.linspace(
        self.qmin, self.qmax, self.num_atoms, device=self.device
    )
    # Per-action additive reward boosts, indexed by position in self._actions.
    self.reward_boosts = torch.zeros([1, len(self._actions)], device=self.device)
    if parameters.rl.reward_boost is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_boosts[0, i] = parameters.rl.reward_boost[k]
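# Neither constructor above shows how self.support is consumed. In C51 the
# training step projects the reward-shifted, gamma-discounted next-state
# distribution back onto this fixed support. A minimal sketch of that standard
# categorical projection (generic C51, not code from this repo; tensor shapes
# and argument names are assumptions):
def project_distribution(next_dist, rewards, not_done, gamma, support, qmin, qmax):
    """next_dist: (batch, num_atoms) probabilities for the greedy next action;
    rewards, not_done: (batch, 1)."""
    num_atoms = next_dist.shape[1]
    delta_z = (qmax - qmin) / (num_atoms - 1.0)

    # Bellman-update each atom location and clamp it back into [qmin, qmax].
    tz = (rewards + not_done * gamma * support.unsqueeze(0)).clamp(qmin, qmax)
    b = (tz - qmin) / delta_z  # fractional bin index of each shifted atom
    lower = b.floor().long()
    upper = b.ceil().long()
    # When b lands exactly on a bin, lower == upper; nudge so no mass is dropped.
    lower[(upper > 0) & (lower == upper)] -= 1
    upper[(lower < num_atoms - 1) & (lower == upper)] += 1

    # Distribute each atom's probability mass between its two neighboring bins.
    proj = torch.zeros_like(next_dist)
    proj.scatter_add_(1, lower, next_dist * (upper.float() - b))
    proj.scatter_add_(1, upper, next_dist * (b - lower.float()))
    return proj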
def train_gym_offline_rl(
    gym_env: OpenAIGymEnvironment,
    replay_buffer: OpenAIGymMemoryPool,
    model_type: str,
    trainer: RLTrainer,
    predictor: OnPolicyPredictor,
    test_run_name: str,
    score_bar: Optional[float],
    max_steps: int,
    avg_over_num_episodes: int,
    offline_train_epochs: int,
    num_batch_per_epoch: Optional[int],
    bcq_imitator_hyper_params: Optional[Dict[str, Any]] = None,
):
    if num_batch_per_epoch is None:
        num_batch_per_epoch = replay_buffer.size // trainer.minibatch_size
    assert num_batch_per_epoch > 0, "The size of replay buffer is not sufficient"

    logger.info(
        "{} offline transitions in replay buffer.\n"
        "Training will take {} epochs, with each epoch having {} mini-batches"
        " and each mini-batch having {} samples".format(
            replay_buffer.size,
            offline_train_epochs,
            num_batch_per_epoch,
            trainer.minibatch_size,
        )
    )

    avg_reward_history, epoch_history = [], []

    # Pre-train a GBDT imitator if doing batch constrained q-learning in Gym
    if getattr(trainer, "bcq", None):
        assert bcq_imitator_hyper_params is not None
        gbdt = GradientBoostingClassifier(
            n_estimators=bcq_imitator_hyper_params["gbdt_trees"],
            max_depth=bcq_imitator_hyper_params["max_depth"],
        )
        samples = replay_buffer.sample_memories(replay_buffer.size, model_type)
        X, y = samples.states.numpy(), torch.max(samples.actions, dim=1)[1].numpy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        logger.info("Fitting GBDT...")
        gbdt.fit(X_train, y_train)
        train_score = round(gbdt.score(X_train, y_train) * 100, 1)
        test_score = round(gbdt.score(X_test, y_test) * 100, 1)
        logger.info(
            "GBDT train accuracy {}% || test accuracy {}%".format(
                train_score, test_score
            )
        )
        trainer.bcq_imitator = gbdt.predict_proba  # type: ignore

    # Offline training
    for i_epoch in range(offline_train_epochs):
        for _ in range(num_batch_per_epoch):
            samples = replay_buffer.sample_memories(
                trainer.minibatch_size, model_type
            )
            samples.set_device(trainer.device)
            trainer.train(samples)

        batch_td_loss = float(
            torch.mean(
                torch.tensor(
                    [stat.td_loss for stat in trainer.loss_reporter.incoming_stats]
                )
            )
        )
        trainer.loss_reporter.flush()
        logger.info(
            "Average TD loss: {} in epoch {}".format(batch_td_loss, i_epoch + 1)
        )

        # test model performance for this epoch
        avg_rewards, avg_discounted_rewards = gym_env.run_ep_n_times(
            avg_over_num_episodes, predictor, test=True, max_steps=max_steps
        )
        avg_reward_history.append(avg_rewards)
        # For offline training, use epoch number as timestep history since
        # we have a fixed batch of data to count epochs over.
        epoch_history.append(i_epoch)
        logger.info(
            "Achieved an average reward score of {} over {} evaluations"
            " after epoch {}.".format(avg_rewards, avg_over_num_episodes, i_epoch)
        )
        if score_bar is not None and avg_rewards > score_bar:
            logger.info(
                "Avg. reward history for {}: {}".format(
                    test_run_name, avg_reward_history
                )
            )
            return avg_reward_history, epoch_history, trainer, predictor, gym_env

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history, epoch_history, trainer, predictor, gym_env
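# Hypothetical invocation, only to show which arguments are expected; every
# literal below is a placeholder chosen for illustration, not a value from this repo:
avg_reward_history, epoch_history, trainer, predictor, env = train_gym_offline_rl(
    gym_env=gym_env,                  # an OpenAIGymEnvironment built elsewhere
    replay_buffer=replay_buffer,      # pre-filled OpenAIGymMemoryPool
    model_type=model_type,
    trainer=trainer,
    predictor=predictor,
    test_run_name="offline_smoke_test",
    score_bar=195.0,                  # stop early once average reward exceeds this
    max_steps=200,
    avg_over_num_episodes=20,
    offline_train_epochs=30,
    num_batch_per_epoch=None,         # derived as replay_buffer.size // minibatch_size
    bcq_imitator_hyper_params={"gbdt_trees": 100, "max_depth": 3},
)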