Code example #1
# Imports assumed for a standalone run; exact module paths may vary across ReAgent versions.
import pytorch_lightning as pl
from reagent.core.parameters import Seq2RewardTrainerParameters
from reagent.models.seq2reward_model import Seq2RewardNetwork
from reagent.training.world_model.seq2reward_trainer import Seq2RewardTrainer

SEED = 0  # the excerpt references SEED without defining it; any fixed value works


def train_seq2reward_model(training_data, learning_rate=0.01, num_epochs=5):
    # Batches are time-major: (sequence length, batch size, number of actions).
    SEQ_LEN, batch_size, NUM_ACTION = next(iter(training_data)).action.shape
    assert SEQ_LEN == 6 and NUM_ACTION == 2

    seq2reward_network = Seq2RewardNetwork(
        state_dim=NUM_ACTION,
        action_dim=NUM_ACTION,
        num_hiddens=64,
        num_hidden_layers=2,
    )

    trainer_param = Seq2RewardTrainerParameters(
        learning_rate=learning_rate,
        multi_steps=SEQ_LEN,
        action_names=["0", "1"],
        gamma=1.0,
        view_q_value=True,
    )

    trainer = Seq2RewardTrainer(
        seq2reward_network=seq2reward_network, params=trainer_param
    )

    # Fix the seed and use deterministic training for reproducibility.
    pl.seed_everything(SEED)
    pl_trainer = pl.Trainer(max_epochs=num_epochs, deterministic=True)
    pl_trainer.fit(trainer, training_data)

    return trainer
Code example #2
def build_trainer(self, use_gpu: bool) -> Seq2RewardTrainer:
    seq2reward_network = self.net_builder.value.build_value_network(
        self.state_normalization_data
    )
    trainer = Seq2RewardTrainer(
        seq2reward_network=seq2reward_network, params=self.trainer_param
    )
    return trainer
Code example #3
File: seq2reward_model.py Project: zachkeer/ReAgent
    def build_trainer(self) -> Seq2RewardTrainer:
        seq2reward_network = self.net_builder.value.build_value_network(
            self.state_normalization_data)

        if self.use_gpu:
            seq2reward_network = seq2reward_network.cuda()

        return Seq2RewardTrainer(seq2reward_network=seq2reward_network,
                                 params=self.trainer_param)
Code example #4
    def build_trainer(self) -> Seq2RewardTrainer:
        seq2reward_network = self.net_builder.value.build_value_network(
            self.state_normalization_data)
        trainer = Seq2RewardTrainer(seq2reward_network=seq2reward_network,
                                    params=self.trainer_param)
        if self.use_gpu:
            # Move the trainer's networks and the precomputed action-permutation tensor to the GPU.
            trainer.seq2reward_network = trainer.seq2reward_network.cuda()
            trainer.step_predict_network = trainer.step_predict_network.cuda()
            trainer.all_permut = trainer.all_permut.cuda()

        return trainer
Code example #5
def build_trainer(
    self,
    normalization_data_map: Dict[str, NormalizationData],
    use_gpu: bool,
    reward_options: Optional[RewardOptions] = None,
) -> Seq2RewardTrainer:
    seq2reward_network = self.net_builder.value.build_value_network(
        normalization_data_map[NormalizationKey.STATE])
    trainer = Seq2RewardTrainer(seq2reward_network=seq2reward_network,
                                params=self.trainer_param)
    return trainer
Code example #6
# Imports assumed for a standalone run; exact module paths may vary across ReAgent
# versions (this example uses the older, pre-Lightning Seq2RewardTrainer API).
import torch
from reagent.core.parameters import Seq2RewardTrainerParameters
from reagent.models.seq2reward_model import Seq2RewardNetwork
from reagent.training.world_model.seq2reward_trainer import Seq2RewardTrainer, get_Q


def train_and_eval_seq2reward_model(training_data,
                                    eval_data,
                                    learning_rate=0.01,
                                    num_epochs=5):
    # Batches are time-major: (sequence length, batch size, number of actions).
    SEQ_LEN, batch_size, NUM_ACTION = training_data[0].action.shape
    assert SEQ_LEN == 6 and NUM_ACTION == 2

    seq2reward_network = Seq2RewardNetwork(
        state_dim=NUM_ACTION,
        action_dim=NUM_ACTION,
        num_hiddens=64,
        num_hidden_layers=2,
    )

    trainer_param = Seq2RewardTrainerParameters(
        learning_rate=learning_rate,
        multi_steps=SEQ_LEN,
        action_names=["0", "1"],
        batch_size=batch_size,
        gamma=1.0,
        view_q_value=True,
    )

    trainer = Seq2RewardTrainer(seq2reward_network=seq2reward_network,
                                params=trainer_param)

    # Manual training loop over epochs and batches (pre-Lightning ReAgent API).
    for _ in range(num_epochs):
        for batch in training_data:
            trainer.train(batch)

    # Average the MSE loss over the evaluation batches.
    total_eval_mse_loss = 0
    for batch in eval_data:
        mse_loss, _ = trainer.get_loss(batch)
        total_eval_mse_loss += mse_loss.cpu().detach().item()
    eval_mse_loss = total_eval_mse_loss / len(eval_data)

    # Q-values for a fixed initial state, obtained by scoring every possible
    # length-SEQ_LEN action sequence (trainer.all_permut) with the learned model.
    initial_state = torch.Tensor([[0, 0]])
    q_values = torch.squeeze(
        get_Q(
            trainer.seq2reward_network,
            initial_state,
            trainer.all_permut,
        ))
    return eval_mse_loss, q_values
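
For intuition about the Q-value computation above: trainer.all_permut holds every possible length-SEQ_LEN action sequence as one-hot vectors, and get_Q scores each sequence with the learned seq2reward network to derive Q-values for the given initial state. The sketch below is an illustrative assumption in plain PyTorch, not ReAgent code; the helper name enumerate_action_sequences is hypothetical.

# Illustrative sketch (assumption): the tensor of all one-hot action sequences
# that trainer.all_permut conceptually represents.
import itertools

import torch
import torch.nn.functional as F


def enumerate_action_sequences(seq_len: int, num_action: int) -> torch.Tensor:
    """Return all action sequences as one-hots, shape (seq_len, num_action**seq_len, num_action)."""
    index_seqs = torch.tensor(
        list(itertools.product(range(num_action), repeat=seq_len)), dtype=torch.int64
    )  # (num_action**seq_len, seq_len)
    one_hot = F.one_hot(index_seqs, num_action).float()  # (num_perm, seq_len, num_action)
    return one_hot.transpose(0, 1)  # time-major, matching the (SEQ_LEN, batch, NUM_ACTION) layout


all_permut = enumerate_action_sequences(seq_len=6, num_action=2)
print(all_permut.shape)  # torch.Size([6, 64, 2])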