    def test_single_step_synthetic_reward(self):
        state_dim = 10
        action_dim = 2
        sizes = [256, 128]
        activations = ["sigmoid", "relu"]
        last_layer_activation = "leaky_relu"
        reward_net = SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
        dnn = reward_net.export_mlp()
        # dnn[0] is a concat layer
        assert dnn[1].in_features == state_dim + action_dim
        assert dnn[1].out_features == 256
        assert dnn[2]._get_name() == "Sigmoid"
        assert dnn[3].in_features == 256
        assert dnn[3].out_features == 128
        assert dnn[4]._get_name() == "ReLU"
        assert dnn[5].in_features == 128
        assert dnn[5].out_features == 1
        assert dnn[6]._get_name() == "LeakyReLU"

        valid_step = torch.tensor([[1], [2], [3]])
        batch_size = 3
        seq_len = 4
        mask = synthetic_reward._gen_mask(valid_step, batch_size, seq_len)
        assert torch.all(
            mask == torch.tensor([[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0],
                                  [0.0, 1.0, 1.0, 1.0]]))
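
The mask assertion above implies that _gen_mask flags the last valid_step[i] positions of each length-seq_len sequence. A minimal sketch of that behavior (an illustration only, not ReAgent's actual implementation; the helper name is made up) would be:

import torch

def _gen_mask_sketch(valid_step: torch.Tensor, batch_size: int, seq_len: int) -> torch.Tensor:
    # valid_step has shape (batch_size, 1); set the trailing valid_step[i]
    # positions of each row to 1.0 and everything else to 0.0.
    positions = torch.arange(seq_len).repeat(batch_size, 1)  # (batch_size, seq_len)
    return (positions >= (seq_len - valid_step)).float()

# With valid_step = [[1], [2], [3]] and seq_len = 4 this reproduces the
# expected mask asserted in the test above.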
Example #2
    def test_linear_reward_parametric_reward(self):
        """
        Reward at each step is a linear function of state and action.
        However, we can only observe the aggregated reward at the last step.
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
        optimizer = Optimizer__Union(SGD=classes["SGD"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            )
        )
        weight, data = create_data(
            state_dim, action_dim, seq_len, batch_size, num_batches
        )
        threshold = 0.1
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
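
create_data and train_and_eval are helpers from the surrounding test module and are not shown in these snippets. A hedged sketch of the data side, assuming exactly what the docstring states (per-step reward is linear in state and action, but only the sum over the sequence is observed), might look as follows; the function name and batch layout are illustrative, not the real helper:

import torch

def create_linear_reward_data_sketch(state_dim, action_dim, seq_len, batch_size, num_batches):
    # Hypothetical stand-in for create_data(): draw a fixed linear weight, then
    # yield batches whose per-step rewards are linear in (state, action) while
    # only the aggregated (summed) reward is exposed to the learner.
    weight = torch.randn(state_dim + action_dim)

    def generator():
        for _ in range(num_batches):
            states = torch.randn(seq_len, batch_size, state_dim)
            actions = torch.randn(seq_len, batch_size, action_dim)
            step_rewards = torch.cat((states, actions), dim=-1) @ weight  # (seq_len, batch_size)
            aggregated_reward = step_rewards.sum(dim=0)  # observed only at the last step
            yield states, actions, aggregated_reward

    return weight, generator

Returning (weight, generator) matches the unpacking used in the next example, where the generator is called to iterate over batches.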
Example #3
    def test_linear_reward_parametric_reward(self):
        """
        Reward at each step is a linear function of state and action.
        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
        optimizer = Optimizer__Union(SGD=classes["SGD"]())
        trainer = RewardNetTrainer(reward_net, optimizer)

        weight, data_generator = create_data(state_dim, action_dim, seq_len,
                                             batch_size, num_batches)
        threshold = 0.1
        reach_threshold = False
        for batch in data_generator():
            loss = trainer.train(batch)
            if loss < threshold:
                reach_threshold = True
                break

        assert reach_threshold, f"last loss={loss}"
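
Unlike Example #2, this version stops as soon as a single batch loss clears the threshold instead of reporting an average evaluation loss from train_and_eval. That helper is not shown in these snippets; a rough stand-in that relies only on the trainer.train(batch) call demonstrated above (the real helper presumably evaluates on held-out batches) could be:

def train_and_eval_sketch(trainer, data_generator, num_eval_batches=100):
    # Hypothetical stand-in for train_and_eval(): train on every batch and use
    # the mean loss over the last num_eval_batches batches as a rough proxy
    # for the average evaluation loss returned by the real helper.
    losses = [float(trainer.train(batch)) for batch in data_generator()]
    tail = losses[-num_eval_batches:] if losses else [float("inf")]
    return sum(tail) / len(tail)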
Example #4
    def build_synthetic_reward_network(
        self,
        state_normalization_data: NormalizationData,
        action_normalization_data: Optional[NormalizationData] = None,
        discrete_action_names: Optional[List[str]] = None,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        if not discrete_action_names:
            assert action_normalization_data is not None
            action_dim = get_num_output_features(
                action_normalization_data.dense_normalization_parameters)
        else:
            action_dim = len(discrete_action_names)
        return SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.sizes,
            activations=self.activations,
            last_layer_activation=self.last_layer_activation,
        )
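
The only branching in this builder is how the action dimension is derived: the number of action names for a discrete action space, otherwise the number of dense action features. A small standalone illustration of that rule (the helper name and second argument are made up for this note):

from typing import List, Optional

def resolve_action_dim_sketch(
    discrete_action_names: Optional[List[str]],
    dense_action_feature_count: Optional[int],
) -> int:
    # Mirrors the branch in build_synthetic_reward_network: discrete actions
    # are sized by their names, parametric actions by their dense feature count.
    if discrete_action_names:
        return len(discrete_action_names)
    assert dense_action_feature_count is not None
    return dense_action_feature_count

# e.g. resolve_action_dim_sketch(["left", "right"], None) == 2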
Example #5
    def _test_linear_reward_parametric_reward(
            self, ground_truth_reward_from_multiple_steps=False):
        """
        Reward at each step is a linear function of present state and action.
        However, we can only observe aggregated reward at the last step

        This model will fail to learn when ground-truth reward is a function of
        multiple steps' states and actions.
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SyntheticRewardNet(
            SingleStepSyntheticRewardNet(
                state_dim=state_dim,
                action_dim=action_dim,
                sizes=sizes,
                activations=activations,
                last_layer_activation=last_layer_activation,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        if ground_truth_reward_from_multiple_steps:
            weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                                batch_size, num_batches)
        else:
            weight, data = create_data(state_dim, action_dim, seq_len,
                                       batch_size, num_batches)
        avg_eval_loss = train_and_eval(trainer, data)
        return avg_eval_loss
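
The docstring spells out how the returned loss is meant to be used: with create_data (single-step ground truth) the model should reach a low loss, while with create_sequence_data (ground truth spanning several steps) it should not. A hedged sketch of test methods that could wrap this helper, written for the same test class; the thresholds are borrowed from the earlier examples and the real tests' assertions may differ:

    def test_single_step_reward_is_learnable(self):
        # Single-step ground truth: the single-step model should fit well.
        avg_eval_loss = self._test_linear_reward_parametric_reward(
            ground_truth_reward_from_multiple_steps=False
        )
        assert avg_eval_loss < 0.1

    def test_multi_step_reward_is_not_learnable(self):
        # Multi-step ground truth: the same model is expected to fit poorly.
        avg_eval_loss = self._test_linear_reward_parametric_reward(
            ground_truth_reward_from_multiple_steps=True
        )
        assert avg_eval_loss > 0.1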