def test_no_soft_update(self):
        model = Model()
        # Share parameters between the model and its target so the soft update
        # below has nothing to blend; the test name reflects that the update is
        # effectively a no-op, which is what the final equality check relies on.
        target_model = model

        for target_param, param in zip(target_model.parameters(),
                                       model.parameters()):
            self.assertIs(target_param, param)

        optimizer = torch.optim.Adam(model.parameters())

        x = torch.tensor([1, 2], dtype=torch.int64)
        emb = model(x)

        loss = emb.sum()

        loss.backward()
        optimizer.step()

        params = list(model.parameters())
        self.assertEqual(1, len(params))
        param = params[0].detach().numpy()

        trainer = RLTrainer(rl_parameters=RLParameters(), use_gpu=False)
        trainer._soft_update(model, target_model, 0.1)

        target_params = list(target_model.parameters())
        self.assertEqual(1, len(target_params))
        target_param = target_params[0].detach().numpy()

        npt.assert_array_equal(target_param, param)
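For context, here is a minimal standalone sketch of the Polyak-style blend that a `_soft_update` helper of this kind typically performs (theta_target <- tau * theta + (1 - tau) * theta_target, as in the DDPG paper). It is an illustration under that assumption, not ReAgent's exact implementation:

import torch


def soft_update(network: torch.nn.Module,
                target_network: torch.nn.Module,
                tau: float) -> None:
    # Blend each target parameter toward the online network's parameter.
    # Parameters shared between the two modules are skipped; the blend would
    # be a no-op for them anyway, which is what the test above exercises.
    with torch.no_grad():
        for t_param, param in zip(target_network.parameters(),
                                  network.parameters()):
            if t_param is param:
                continue
            t_param.copy_(tau * param + (1.0 - tau) * t_param)

With tau=1.0 this amounts to a hard copy of the online weights; with tau=0.1, as in the test, the target only tracks 10% of the way toward the online network per call.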
Example #2
    def __init__(
        self,
        q_network,
        q_network_target,
        metrics_to_score=None,
        loss_reporter=None,
        use_gpu: bool = False,
        actions: List[str] = field(default_factory=list),  # noqa: B008
        rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
        double_q_learning: bool = True,
        minibatch_size: int = 1024,
        minibatches_per_step: int = 1,
        num_atoms: int = 51,
        qmin: float = -100,
        qmax: float = 200,
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        evaluation: EvaluationParameters = field(  # noqa: B008
            default_factory=EvaluationParameters),
    ) -> None:
        RLTrainer.__init__(
            self,
            rl,
            use_gpu=use_gpu,
            metrics_to_score=metrics_to_score,
            actions=actions,
            loss_reporter=loss_reporter,
        )

        self.double_q_learning = double_q_learning
        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step
        self._actions = actions
        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = optimizer.make_optimizer(
            q_network.parameters())
        self.qmin = qmin
        self.qmax = qmax
        self.num_atoms = num_atoms
        self.support = torch.linspace(self.qmin,
                                      self.qmax,
                                      self.num_atoms,
                                      device=self.device)
        self.scale_support = (self.qmax - self.qmin) / (self.num_atoms - 1.0)

        self.reward_boosts = torch.zeros([1, len(self._actions)],
                                         device=self.device)
        if rl.reward_boost is not None:
            # pyre-fixme[16]: Optional type has no attribute `keys`.
            for k in rl.reward_boost.keys():
                i = self._actions.index(k)
                # pyre-fixme[16]: Optional type has no attribute `__getitem__`.
                self.reward_boosts[0, i] = rl.reward_boost[k]
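The `support` tensor built in this constructor (`num_atoms` evenly spaced atoms between `qmin` and `qmax`) is what turns C51's categorical return distribution into scalar Q-values. Below is a minimal sketch of that reduction, assuming the network emits per-action logits over atoms; the batch size, shapes, and softmax placement are illustrative assumptions, not ReAgent's exact API:

import torch

num_actions, num_atoms, qmin, qmax = 2, 51, -100.0, 200.0
support = torch.linspace(qmin, qmax, num_atoms)    # atom locations z_i

# Hypothetical network output: one logit per (state, action, atom).
logits = torch.randn(32, num_actions, num_atoms)   # batch of 32 states
pmf = torch.softmax(logits, dim=-1)                # p_i(s, a), sums to 1 over atoms

# Expected return per action: Q(s, a) = sum_i p_i(s, a) * z_i
q_values = (pmf * support).sum(dim=-1)             # shape (32, num_actions)
greedy_actions = q_values.argmax(dim=1)

The `scale_support` value stored above is simply the spacing between adjacent atoms, which the distributional Bellman update needs when projecting the shifted target distribution back onto this fixed support.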
Example #3
    def __init__(
        self,
        q_network,
        q_network_target,
        parameters: C51TrainerParameters,
        use_gpu=False,
        metrics_to_score=None,
        loss_reporter=None,
    ) -> None:
        RLTrainer.__init__(
            self,
            parameters.rl,
            use_gpu=use_gpu,
            metrics_to_score=metrics_to_score,
            actions=parameters.actions,
            loss_reporter=loss_reporter,
        )

        self.double_q_learning = parameters.double_q_learning
        self.minibatch_size = parameters.minibatch_size
        self.minibatches_per_step = parameters.minibatches_per_step or 1
        self._actions = parameters.actions if parameters.actions is not None else []
        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = self._get_optimizer(q_network,
                                                       parameters.optimizer)
        self.qmin = parameters.qmin
        self.qmax = parameters.qmax
        self.num_atoms = parameters.num_atoms
        self.support = torch.linspace(self.qmin,
                                      self.qmax,
                                      self.num_atoms,
                                      device=self.device)

        self.reward_boosts = torch.zeros([1, len(self._actions)],
                                         device=self.device)
        if parameters.rl.reward_boost is not None:
            for k in parameters.rl.reward_boost.keys():
                i = self._actions.index(k)
                self.reward_boosts[0, i] = parameters.rl.reward_boost[k]
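Both constructors build the same `reward_boosts` row vector with one entry per action. The sketch below shows, under the assumption that training batches carry one-hot action tensors, how such a boost is typically folded into the observed rewards; the exact place ReAgent applies it inside `train()` may differ:

import torch

actions = ["up", "down", "left", "right"]
reward_boost = {"up": 0.5, "left": -1.0}

# Same construction as in the __init__ methods above.
reward_boosts = torch.zeros(1, len(actions))
for k, v in reward_boost.items():
    reward_boosts[0, actions.index(k)] = v

# Hypothetical batch: one-hot actions and raw rewards.
batch_actions = torch.eye(len(actions))[torch.tensor([0, 2, 3])]   # up, left, right
rewards = torch.tensor([[1.0], [0.0], [2.0]])

# Each transition is boosted by the entry of the action it took.
boosted = rewards + (batch_actions * reward_boosts).sum(dim=1, keepdim=True)
# boosted == [[1.5], [-1.0], [2.0]]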
Example #4
def train_gym_offline_rl(
    gym_env: OpenAIGymEnvironment,
    replay_buffer: OpenAIGymMemoryPool,
    model_type: str,
    trainer: RLTrainer,
    predictor: OnPolicyPredictor,
    test_run_name: str,
    score_bar: Optional[float],
    max_steps: int,
    avg_over_num_episodes: int,
    offline_train_epochs: int,
    num_batch_per_epoch: Optional[int],
    bcq_imitator_hyper_params: Optional[Dict[str, Any]] = None,
):
    if num_batch_per_epoch is None:
        num_batch_per_epoch = replay_buffer.size // trainer.minibatch_size
    assert num_batch_per_epoch > 0, "The size of replay buffer is not sufficient"

    logger.info(
        "{} offline transitions in replay buffer.\n"
        "Training will take {} epochs, with each epoch having {} mini-batches"
        " and each mini-batch having {} samples".format(
            replay_buffer.size,
            offline_train_epochs,
            num_batch_per_epoch,
            trainer.minibatch_size,
        ))

    avg_reward_history, epoch_history = [], []

    # Pre-train a GBDT imitator if doing batch constrained q-learning in Gym
    if getattr(trainer, "bcq", None):
        assert bcq_imitator_hyper_params is not None
        gbdt = GradientBoostingClassifier(
            n_estimators=bcq_imitator_hyper_params["gbdt_trees"],
            max_depth=bcq_imitator_hyper_params["max_depth"],
        )
        samples = replay_buffer.sample_memories(replay_buffer.size, model_type)
        X, y = samples.states.numpy(), torch.max(samples.actions,
                                                 dim=1)[1].numpy()
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.1)
        logger.info("Fitting GBDT...")
        gbdt.fit(X_train, y_train)
        train_score = round(gbdt.score(X_train, y_train) * 100, 1)
        test_score = round(gbdt.score(X_test, y_test) * 100, 1)
        logger.info("GBDT train accuracy {}% || test accuracy {}%".format(
            train_score, test_score))
        trainer.bcq_imitator = gbdt.predict_proba  # type: ignore

    # Offline training
    for i_epoch in range(offline_train_epochs):
        for _ in range(num_batch_per_epoch):
            samples = replay_buffer.sample_memories(trainer.minibatch_size,
                                                    model_type)
            samples.set_device(trainer.device)
            trainer.train(samples)

        batch_td_loss = float(
            torch.mean(
                torch.tensor([
                    stat.td_loss
                    for stat in trainer.loss_reporter.incoming_stats
                ])))
        trainer.loss_reporter.flush()
        logger.info("Average TD loss: {} in epoch {}".format(
            batch_td_loss, i_epoch + 1))

        # test model performance for this epoch
        avg_rewards, avg_discounted_rewards = gym_env.run_ep_n_times(
            avg_over_num_episodes, predictor, test=True, max_steps=max_steps)
        avg_reward_history.append(avg_rewards)

        # For offline training, use epoch number as timestep history since
        # we have a fixed batch of data to count epochs over.
        epoch_history.append(i_epoch)
        logger.info(
            "Achieved an average reward score of {} over {} evaluations"
            " after epoch {}.".format(avg_rewards, avg_over_num_episodes,
                                      i_epoch + 1))
        if score_bar is not None and avg_rewards > score_bar:
            logger.info("Avg. reward history for {}: {}".format(
                test_run_name, avg_reward_history))
            return avg_reward_history, epoch_history, trainer, predictor, gym_env

    logger.info("Avg. reward history for {}: {}".format(
        test_run_name, avg_reward_history))
    return avg_reward_history, epoch_history, trainer, predictor, gym_env
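For reference, `trainer.bcq_imitator` is set above to `gbdt.predict_proba`, i.e. a callable returning per-action behavior probabilities for a batch of states. The sketch below shows, assuming the relative-probability threshold used in discrete BCQ, how such an imitator can mask Q-values so the greedy policy only considers actions the logged behavior policy would plausibly take; the function name and `drop_threshold` parameter are illustrative, not ReAgent's API:

import numpy as np


def bcq_masked_argmax(q_values: np.ndarray,
                      states: np.ndarray,
                      imitator_predict_proba,
                      drop_threshold: float = 0.3) -> np.ndarray:
    # Keep only actions whose imitator probability is at least
    # `drop_threshold` times the probability of the most likely action.
    probs = imitator_predict_proba(states)                 # (batch, num_actions)
    allowed = probs >= drop_threshold * probs.max(axis=1, keepdims=True)
    masked_q = np.where(allowed, q_values, -np.inf)
    return masked_q.argmax(axis=1)

Disallowed actions are sent to -inf before the argmax, so they can never be selected even when their raw Q-value is the highest in the batch.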