Example 1
 def make_fully_connected(
     cls,
     state_dim: int,
     action_dim: int,
     layers: List[int],
     activations: List[str],
     use_batch_norm: bool = False,
 ):
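     # The last entry of `layers` doubles as the width of the shared state embedding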
     state_embedding_dim = layers[-1]
     shared_network = FullyConnectedDQN(
         state_dim,
         state_embedding_dim,
         sizes=layers[:-1],
         activations=activations[:-1],
         normalized_output=True,
     )
     advantage_network = FullyConnectedCritic(
         state_embedding_dim,
         action_dim,
         sizes=[state_embedding_dim // 2],
         activations=activations[-1:],
     )
     value_network = FullyConnectedDQN(
         state_embedding_dim,
         1,
         sizes=[state_embedding_dim // 2],
         activations=activations[-1:],
     )
     return ParametricDuelingQNetwork(
         shared_network=shared_network,
         advantage_network=advantage_network,
         value_network=value_network,
     )
Example 2
    def test_forward_pass(self):
        state_dim = 1
        action_dim = 2
        input = PreprocessedState.from_tensor(state=torch.tensor([[2.0]]))
        bcq_drop_threshold = 0.20

        q_network = FullyConnectedDQN(state_dim,
                                      action_dim,
                                      sizes=[2],
                                      activations=["relu"])
        # Set weights of q-network to make it deterministic
        q_net_layer_0_w = torch.tensor([[1.2], [0.9]])
        q_network.state_dict()["fc.layers.0.weight"].data.copy_(
            q_net_layer_0_w)
        q_net_layer_0_b = torch.tensor([0.0, 0.0])
        q_network.state_dict()["fc.layers.0.bias"].data.copy_(q_net_layer_0_b)
        q_net_layer_1_w = torch.tensor([[0.5, -0.5], [1.0, 1.0]])
        q_network.state_dict()["fc.layers.1.weight"].data.copy_(
            q_net_layer_1_w)
        q_net_layer_1_b = torch.tensor([0.0, 0.0])
        q_network.state_dict()["fc.layers.1.bias"].data.copy_(q_net_layer_1_b)

        imitator_network = FullyConnectedNetwork(
            layers=[state_dim, 2, action_dim], activations=["relu", "linear"])
        # Set weights of imitator network to make it deterministic
        im_net_layer_0_w = torch.tensor([[1.2], [0.9]])
        imitator_network.state_dict()["layers.0.weight"].data.copy_(
            im_net_layer_0_w)
        im_net_layer_0_b = torch.tensor([0.0, 0.0])
        imitator_network.state_dict()["layers.0.bias"].data.copy_(
            im_net_layer_0_b)
        im_net_layer_1_w = torch.tensor([[0.5, 1.5], [1.0, 2.0]])
        imitator_network.state_dict()["layers.1.weight"].data.copy_(
            im_net_layer_1_w)
        im_net_layer_1_b = torch.tensor([0.0, 0.0])
        imitator_network.state_dict()["layers.1.bias"].data.copy_(
            im_net_layer_1_b)

        imitator_probs = torch.nn.functional.softmax(imitator_network(
            input.state.float_features),
                                                     dim=1)
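        # With the hand-set weights, state 2.0 produces imitator logits [3.9, 6.0],
        # so the softmax is roughly [0.109, 0.891]; only action 0 falls below the
        # 0.20 drop threshold.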
        bcq_mask = imitator_probs < bcq_drop_threshold
        assert bcq_mask[0][0] == 1
        assert bcq_mask[0][1] == 0

        model = BatchConstrainedDQN(
            state_dim=state_dim,
            q_network=q_network,
            imitator_network=imitator_network,
            bcq_drop_threshold=bcq_drop_threshold,
        )
        final_q_values = model(input)
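        # The q-network maps state 2.0 to q-values [0.3, 4.2]; the masked action 0
        # is pushed down to -1e10 so it can never be selected.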
        assert final_q_values.q_values[0][0] == -1e10
        assert abs(final_q_values.q_values[0][1] - 4.2) < 0.0001
Example 3
 def test_save_load_batch_norm(self):
     state_dim = 8
     action_dim = 4
     model = FullyConnectedDQN(
         state_dim,
         action_dim,
         sizes=[8, 4],
         activations=["relu", "relu"],
         use_batch_norm=True,
     )
     # Freezing batch_norm
     model.eval()
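     # Batch norm layers add their own weights and running statistics, so the
     # saved model has more parameters than the plain network in test_save_load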
     expected_num_params, expected_num_inputs, expected_num_outputs = 21, 1, 1
     check_save_load(self, model, expected_num_params, expected_num_inputs,
                     expected_num_outputs)
Example 4
    def test_discrete_wrapper_with_id_list_none(self):
        state_normalization_parameters = {i: _cont_norm() for i in range(1, 5)}
        state_preprocessor = Preprocessor(state_normalization_parameters,
                                          False)
        action_dim = 2
        dqn = FullyConnectedDQN(
            state_dim=len(state_normalization_parameters),
            action_dim=action_dim,
            sizes=[16],
            activations=["relu"],
        )
        dqn_with_preprocessor = DiscreteDqnWithPreprocessorWithIdList(
            dqn, state_preprocessor)
        action_names = ["L", "R"]
        wrapper = DiscreteDqnPredictorWrapperWithIdList(
            dqn_with_preprocessor, action_names)
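        # input_prototype() returns a one-example batch in the model's expected input format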
        input_prototype = dqn_with_preprocessor.input_prototype()
        output_action_names, q_values = wrapper(*input_prototype)
        self.assertEqual(action_names, output_action_names)
        self.assertEqual(q_values.shape, (1, 2))

        expected_output = dqn(
            rlt.PreprocessedState.from_tensor(
                state_preprocessor(*input_prototype[0]))).q_values
        self.assertTrue((expected_output == q_values).all())
Example 5
 def test_basic(self):
     state_dim = 8
     action_dim = 4
     model = FullyConnectedDQN(
         state_dim,
         action_dim,
         sizes=[8, 4],
         activations=["relu", "relu"],
         use_batch_norm=True,
     )
     input = model.input_prototype()
     self.assertEqual((1, state_dim), input.state.float_features.shape)
     # Batch norm requires more than one example in training mode; eval() avoids that
     model.eval()
     q_values = model(input)
     self.assertEqual((1, action_dim), q_values.q_values.shape)
Example 6
    def test_forward_pass(self):
        torch.manual_seed(123)
        state_dim = 1
        action_dim = 2
        state = rlt.FeatureData(torch.tensor([[2.0]]))
        bcq_drop_threshold = 0.20

        q_network = FullyConnectedDQN(state_dim,
                                      action_dim,
                                      sizes=[2],
                                      activations=["relu"])
        init.constant_(q_network.fc.dnn[-2].bias, 3.0)
        imitator_network = FullyConnectedNetwork(
            layers=[state_dim, 2, action_dim], activations=["relu", "linear"])

        imitator_probs = torch.nn.functional.softmax(imitator_network(
            state.float_features),
                                                     dim=1)
        bcq_mask = imitator_probs < bcq_drop_threshold
        npt.assert_array_equal(bcq_mask.detach(), [[True, False]])

        model = BatchConstrainedDQN(
            state_dim=state_dim,
            q_network=q_network,
            imitator_network=imitator_network,
            bcq_drop_threshold=bcq_drop_threshold,
        )
        final_q_values = model(state)
        npt.assert_array_equal(final_q_values.detach(), [[-1e10, 3.0]])
Example 7
 def setUp(self):
     # preparing various components for qr-dqn trainer initialization
     self.params = QRDQNTrainerParameters(actions=["1", "2"], num_atoms=11)
     self.reward_options = RewardOptions()
     self.metrics_to_score = get_metrics_to_score(
         self.reward_options.metric_reward_values
     )
     self.state_dim = 10
     self.action_dim = 2
     self.sizes = [20, 20]
     self.num_atoms = 11
     self.activations = ["relu", "relu"]
     self.dropout_ratio = 0
     self.q_network = FullyConnectedDQN(
         state_dim=self.state_dim,
         action_dim=self.action_dim,
         sizes=self.sizes,
         num_atoms=self.num_atoms,
         activations=self.activations,
         dropout_ratio=self.dropout_ratio,
     )
     self.q_network_target = self.q_network.get_target_network()
     self.x = FeatureData(float_features=torch.rand(5, 10))
     self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
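     # Metrics + reward: one output per tracked metric plus the reward, for each action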
     self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
         # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
         self.params.actions
     )
     self.reward_network = FullyConnectedDQN(
         state_dim=self.state_dim,
         action_dim=self.num_output_nodes,
         sizes=self.sizes,
         activations=self.activations,
     )
     self.q_network_cpe = FullyConnectedDQN(
         state_dim=self.state_dim,
         action_dim=self.num_output_nodes,
         sizes=self.sizes,
         activations=self.activations,
     )
     self.q_network_cpe_target = self.q_network_cpe.get_target_network()
Example 8
    def make_fully_connected(
        cls,
        state_dim: int,
        action_dim: int,
        layers: List[int],
        activations: List[str],
        num_atoms: Optional[int] = None,
        use_batch_norm: bool = False,
    ):
        assert len(layers) > 0, "Must have at least one layer"
        state_embedding_dim = layers[-1]
        assert state_embedding_dim % 2 == 0, "The last size must be divisible by 2"
        shared_network = FullyConnectedDQN(
            state_dim,
            state_embedding_dim,
            sizes=layers[:-1],
            activations=activations[:-1],
            normalized_output=True,
            use_batch_norm=use_batch_norm,
        )
        advantage_network = FullyConnectedDQN(
            state_embedding_dim,
            action_dim,
            sizes=[state_embedding_dim // 2],
            activations=activations[-1:],
            num_atoms=num_atoms,
        )
        value_network = FullyConnectedDQN(
            state_embedding_dim,
            1,
            sizes=[state_embedding_dim // 2],
            activations=activations[-1:],
            num_atoms=num_atoms,
        )

        return cls(
            shared_network=shared_network,
            advantage_network=advantage_network,
            value_network=value_network,
        )
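A minimal construction sketch for this factory. The owning class and import path are assumptions inferred from the `cls(...)` return (in ReAgent this classmethod is likely exposed on DuelingQNetwork); the dimensions below are arbitrary.

# Hedged sketch: class name and import path are assumptions, not taken from the snippets above.
from reagent.models.dueling_q_network import DuelingQNetwork

q_network = DuelingQNetwork.make_fully_connected(
    state_dim=10,
    action_dim=4,
    layers=[128, 64],              # the last entry (64) is the shared embedding width and must be even
    activations=["relu", "relu"],  # one activation per entry in `layers`
)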
Example 9
 def test_save_load(self):
     state_dim = 8
     action_dim = 4
     model = FullyConnectedDQN(
         state_dim,
         action_dim,
         sizes=[8, 4],
         activations=["relu", "relu"],
         use_batch_norm=False,
     )
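     # 3 linear layers, each contributing a weight and a bias tensor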
     expected_num_params, expected_num_inputs, expected_num_outputs = 6, 1, 1
     check_save_load(self, model, expected_num_params, expected_num_inputs,
                     expected_num_outputs)
Example 10
 def build_q_network(
     self,
     state_feature_config: rlt.ModelFeatureConfig,
     state_normalization_parameters: Dict[int, NormalizationParameters],
     output_dim: int,
 ) -> ModelBase:
     state_dim = self._get_input_dim(state_normalization_parameters)
     return FullyConnectedDQN(
         state_dim=state_dim,
         action_dim=output_dim,
         sizes=self.sizes,
         activations=self.activations,
         dropout_ratio=self.dropout_ratio,
     )
Example 11
 def build_q_network(
     self,
     state_normalization_data: NormalizationData,
     output_dim: int,
     num_atoms: int,
 ) -> ModelBase:
     state_dim = self._get_input_dim(state_normalization_data)
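     # When num_atoms is set, the network emits a value distribution per action
     # (as in QR-DQN) instead of a single scalar q-value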
     return FullyConnectedDQN(
         state_dim=state_dim,
         action_dim=output_dim,
         sizes=self.sizes,
         num_atoms=num_atoms,
         activations=self.activations,
         dropout_ratio=self.dropout_ratio,
     )
Example 12
 def build_q_network(
     self,
     state_feature_config: rlt.ModelFeatureConfig,
     state_normalization_data: NormalizationData,
     output_dim: int,
 ) -> ModelBase:
     state_dim = self._get_input_dim(state_normalization_data)
     return FullyConnectedDQN(
         state_dim=state_dim,
         action_dim=output_dim,
         sizes=self.sizes,
         activations=self.activations,
         dropout_ratio=self.dropout_ratio,
         use_batch_norm=self.use_batch_norm,
     )
Example 13
 def test_save_load(self):
     state_dim = 8
     action_dim = 4
     q_network = FullyConnectedDQN(state_dim,
                                   action_dim,
                                   sizes=[8, 4],
                                   activations=["relu", "relu"])
     imitator_network = FullyConnectedNetwork(
         layers=[state_dim, 8, 4, action_dim],
         activations=["relu", "relu", "linear"])
     model = BatchConstrainedDQN(
         state_dim=state_dim,
         q_network=q_network,
         imitator_network=imitator_network,
         bcq_drop_threshold=0.05,
     )
     # 6 for DQN + 6 for Imitator Network + 2 for BCQ constants
     expected_num_params, expected_num_inputs, expected_num_outputs = 14, 1, 1
     check_save_load(self, model, expected_num_params, expected_num_inputs,
                     expected_num_outputs)
Example 14
    def test_basic(self):
        state_dim = 8
        action_dim = 4
        q_network = FullyConnectedDQN(state_dim,
                                      action_dim,
                                      sizes=[8, 4],
                                      activations=["relu", "relu"])
        imitator_network = FullyConnectedNetwork(
            layers=[state_dim, 8, 4, action_dim],
            activations=["relu", "relu", "linear"])
        model = BatchConstrainedDQN(
            state_dim=state_dim,
            q_network=q_network,
            imitator_network=imitator_network,
            bcq_drop_threshold=0.05,
        )

        input = model.input_prototype()
        self.assertEqual((1, state_dim), input.state.float_features.shape)
        q_values = model(input)
        self.assertEqual((1, action_dim), q_values.q_values.shape)
Example 15
 def build_q_network(
     self,
     state_normalization_parameters: Dict[int, NormalizationParameters],
     output_dim: int,
     num_atoms: int,
     qmin: int,
     qmax: int,
 ) -> ModelBase:
     state_dim = self._get_input_dim(state_normalization_parameters)
     distributional_network = FullyConnectedDQN(
         state_dim=state_dim,
         action_dim=output_dim,
         num_atoms=num_atoms,
         sizes=self.sizes,
         activations=self.activations,
         use_batch_norm=False,
         dropout_ratio=0.0,
     )
     return CategoricalDQN(distributional_network,
                           qmin=qmin,
                           qmax=qmax,
                           num_atoms=num_atoms)
Example 16
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []

    if model.rainbow.quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    elif model.rainbow.categorical:
        q_network = CategoricalDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            qmin=model.rainbow.qmin,
            qmax=model.rainbow.qmax,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
            use_gpu=use_gpu,
        )
    elif model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(  # type: ignore
            layers=[get_num_output_features(normalization_parameters)] +
            model.training.layers[1:-1] + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    if (use_all_avail_gpus and not model.rainbow.categorical
            and not model.rainbow.quantile):
        q_network = q_network.get_distributed_data_parallel_model()
        reward_network = (reward_network.get_distributed_data_parallel_model()
                          if reward_network else None)
        q_network_cpe = (q_network_cpe.get_distributed_data_parallel_model()
                         if q_network_cpe else None)

    if model.rainbow.quantile:
        assert (not use_all_avail_gpus
                ), "use_all_avail_gpus not implemented for distributional RL"
        parameters = QRDQNTrainerParameters.from_discrete_action_model_parameters(
            model)
        return QRDQNTrainer(
            q_network,
            q_network_target,
            parameters,
            use_gpu,
            metrics_to_score=metrics_to_score,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )

    elif model.rainbow.categorical:
        assert (not use_all_avail_gpus
                ), "use_all_avail_gpus not implemented for distributional RL"
        return C51Trainer(
            q_network,
            q_network_target,
            C51TrainerParameters.from_discrete_action_model_parameters(model),
            use_gpu,
            metrics_to_score=metrics_to_score,
        )

    else:
        parameters = DQNTrainerParameters.from_discrete_action_model_parameters(
            model)
        return DQNTrainer(
            q_network,
            q_network_target,
            reward_network,
            parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
        )
Example 17
class TestCRR(unittest.TestCase):
    def setUp(self):
        # preparing various components for CRR trainer initialization
        self.batch_size = 3
        self.state_dim = 10
        self.action_dim = 2
        self.num_layers = 2
        self.sizes = [20 for _ in range(self.num_layers)]
        self.num_atoms = 11
        self.activations = ["relu" for _ in range(self.num_layers)]
        self.dropout_ratio = 0
        self.exploration_variance = 1e-10

        self.actions = [str(i) for i in range(self.action_dim)]
        self.params = CRRTrainerParameters(actions=self.actions)
        self.reward_options = RewardOptions()
        self.metrics_to_score = get_metrics_to_score(
            self.reward_options.metric_reward_values
        )

        self.actor_network = FullyConnectedActor(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            exploration_variance=self.exploration_variance,
        )
        self.actor_network_target = self.actor_network.get_target_network()

        self.q1_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q1_network_target = self.q1_network.get_target_network()

        self.q2_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q2_network_target = self.q2_network.get_target_network()

        self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            self.params.actions
        )
        self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
        self.reward_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe_target = self.q_network_cpe.get_target_network()
        self.inp = DiscreteDqnInput(
            state=FeatureData(
                float_features=torch.rand(self.batch_size, self.state_dim)
            ),
            next_state=FeatureData(
                float_features=torch.rand(self.batch_size, self.state_dim)
            ),
            reward=torch.ones(self.batch_size, 1),
            time_diff=torch.ones(self.batch_size, 1) * 2,
            step=torch.ones(self.batch_size, 1) * 2,
            not_terminal=torch.ones(
                self.batch_size, 1
            ),  # todo: check terminal behavior
            action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
            next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
            possible_actions_mask=torch.ones(self.batch_size, self.action_dim),
            possible_next_actions_mask=torch.ones(self.batch_size, self.action_dim),
            extras=ExtraData(action_probability=torch.ones(self.batch_size, 1)),
        )

    @staticmethod
    def dummy_log(*args, **kwargs):
        # replaces calls to self.log(), which otherwise require the PyTorch Lightning trainer to be initialized
        return None

    def _construct_trainer(self, new_params=None, no_cpe=False, no_q2=False):
        trainer = DiscreteCRRTrainer(
            actor_network=self.actor_network,
            actor_network_target=self.actor_network_target,
            q1_network=self.q1_network,
            q1_network_target=self.q1_network_target,
            q2_network=(None if no_q2 else self.q2_network),
            q2_network_target=(None if no_q2 else self.q2_network_target),
            reward_network=(None if no_cpe else self.reward_network),
            q_network_cpe=(None if no_cpe else self.q_network_cpe),
            q_network_cpe_target=(None if no_cpe else self.q_network_cpe_target),
            metrics_to_score=self.metrics_to_score,
            evaluation=EvaluationParameters(
                calc_cpe_in_training=(False if no_cpe else True)
            ),
            # pyre-fixme[16]: `CRRTrainerParameters` has no attribute `asdict`.
            **(new_params if new_params is not None else self.params).asdict()
        )
        trainer.log = self.dummy_log
        return trainer

    def test_init(self):
        trainer = self._construct_trainer()
        self.assertTrue((torch.isclose(trainer.reward_boosts, torch.zeros(2))).all())
        param_copy = CRRTrainerParameters(
            actions=self.actions,
            rl=RLParameters(reward_boost={i: int(i) + 1 for i in self.actions}),
        )
        reward_boost_trainer = self._construct_trainer(new_params=param_copy)
        self.assertTrue(
            (
                torch.isclose(
                    reward_boost_trainer.reward_boosts, torch.tensor([1.0, 2.0])
                )
            ).all()
        )

    def test_train_step_gen(self):
        mse_backward_type = type(
            torch.nn.functional.mse_loss(
                torch.tensor([1.0], requires_grad=True), torch.zeros(1)
            ).grad_fn
        )
        add_backward_type = type(
            (
                torch.tensor([1.0], requires_grad=True)
                + torch.tensor([1.0], requires_grad=True)
            ).grad_fn
        )
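        # The grad_fn type of each yielded loss serves as a cheap fingerprint of
        # the operation that produced it.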
        # vanilla
        trainer = self._construct_trainer()
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 6)
        self.assertEqual(type(losses[0].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[1].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[2].grad_fn), add_backward_type)
        self.assertEqual(type(losses[3].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[4].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[5].grad_fn), add_backward_type)

        # no CPE
        trainer = self._construct_trainer(no_cpe=True)
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)

        # no q2 net
        trainer = self._construct_trainer(no_q2=True)
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 5)

        # use_target_actor
        params_copy = CRRTrainerParameters(actions=self.actions, use_target_actor=True)
        trainer = self._construct_trainer(new_params=params_copy)
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 6)

        # delayed policy update
        params_copy = CRRTrainerParameters(
            actions=self.actions, delayed_policy_update=2
        )
        trainer = self._construct_trainer(new_params=params_copy)
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 6)
        self.assertEqual(losses[2], None)

        # entropy
        params_copy = CRRTrainerParameters(actions=self.actions, entropy_coeff=1.0)
        trainer = self._construct_trainer(new_params=params_copy)
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 6)

    def test_q_network_property(self):
        trainer = self._construct_trainer()
        self.assertEqual(trainer.q_network, trainer.q1_network)

    def test_configure_optimizers(self):
        trainer = self._construct_trainer()
        optimizers = trainer.configure_optimizers()
        self.assertEqual(len(optimizers), 6)
        train_step_yield_order = [
            trainer.q1_network,
            trainer.q2_network,
            trainer.actor_network,
            trainer.reward_network,
            trainer.q_network_cpe,
            trainer.q1_network,
        ]
        for i in range(len(train_step_yield_order)):
            opt_param = optimizers[i]["optimizer"].param_groups[0]["params"][0]
            loss_param = list(train_step_yield_order[i].parameters())[0]
            self.assertTrue(torch.all(torch.isclose(opt_param, loss_param)))
        trainer = self._construct_trainer(no_cpe=True)
        optimizers = trainer.configure_optimizers()
        self.assertEqual(len(optimizers), 4)
        trainer = self._construct_trainer(no_q2=True)
        optimizers = trainer.configure_optimizers()
        self.assertEqual(len(optimizers), 5)

    def test_get_detached_model_outputs(self):
        trainer = self._construct_trainer()
        action_scores, _ = trainer.get_detached_model_outputs(
            FeatureData(float_features=torch.rand(self.batch_size, self.state_dim))
        )
        self.assertEqual(action_scores.shape[0], self.batch_size)
        self.assertEqual(action_scores.shape[1], self.action_dim)

    def test_validation_step(self):
        trainer = self._construct_trainer()
        edp = trainer.validation_step(self.inp, batch_idx=1)
        out = trainer.actor_network(self.inp.state)
        # Note: in current code EDP assumes policy induced by q-net instead of actor
        self.assertTrue(torch.all(torch.isclose(edp.optimal_q_values, out.action)))
Example 18
    def setUp(self):
        # preparing various components for CRR trainer initialization
        self.batch_size = 3
        self.state_dim = 10
        self.action_dim = 2
        self.num_layers = 2
        self.sizes = [20 for _ in range(self.num_layers)]
        self.num_atoms = 11
        self.activations = ["relu" for _ in range(self.num_layers)]
        self.dropout_ratio = 0
        self.exploration_variance = 1e-10

        self.actions = [str(i) for i in range(self.action_dim)]
        self.params = CRRTrainerParameters(actions=self.actions)
        self.reward_options = RewardOptions()
        self.metrics_to_score = get_metrics_to_score(
            self.reward_options.metric_reward_values
        )

        self.actor_network = FullyConnectedActor(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            exploration_variance=self.exploration_variance,
        )
        self.actor_network_target = self.actor_network.get_target_network()

        self.q1_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q1_network_target = self.q1_network.get_target_network()

        self.q2_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q2_network_target = self.q2_network.get_target_network()

        self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            self.params.actions
        )
        self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
        self.reward_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe_target = self.q_network_cpe.get_target_network()
        self.inp = DiscreteDqnInput(
            state=FeatureData(
                float_features=torch.rand(self.batch_size, self.state_dim)
            ),
            next_state=FeatureData(
                float_features=torch.rand(self.batch_size, self.state_dim)
            ),
            reward=torch.ones(self.batch_size, 1),
            time_diff=torch.ones(self.batch_size, 1) * 2,
            step=torch.ones(self.batch_size, 1) * 2,
            not_terminal=torch.ones(
                self.batch_size, 1
            ),  # todo: check terminal behavior
            action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
            next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
            possible_actions_mask=torch.ones(self.batch_size, self.action_dim),
            possible_next_actions_mask=torch.ones(self.batch_size, self.action_dim),
            extras=ExtraData(action_probability=torch.ones(self.batch_size, 1)),
        )
Example 19
class TestQRDQN(unittest.TestCase):
    def setUp(self):
        # preparing various components for qr-dqn trainer initialization
        self.params = QRDQNTrainerParameters(actions=["1", "2"], num_atoms=11)
        self.reward_options = RewardOptions()
        self.metrics_to_score = get_metrics_to_score(
            self.reward_options.metric_reward_values
        )
        self.state_dim = 10
        self.action_dim = 2
        self.sizes = [20, 20]
        self.num_atoms = 11
        self.activations = ["relu", "relu"]
        self.dropout_ratio = 0
        self.q_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            num_atoms=self.num_atoms,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q_network_target = self.q_network.get_target_network()
        self.x = FeatureData(float_features=torch.rand(5, 10))
        self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
        self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
            self.params.actions
        )
        self.reward_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe_target = self.q_network_cpe.get_target_network()

    def _construct_trainer(self, new_params=None, no_cpe=False):
        reward_network = self.reward_network
        q_network_cpe = self.q_network_cpe
        q_network_cpe_target = self.q_network_cpe_target
        evaluation = self.eval_parameters
        params = self.params

        if new_params is not None:
            params = new_params
        if no_cpe:
            reward_network = q_network_cpe = q_network_cpe_target = None
            evaluation = EvaluationParameters(calc_cpe_in_training=False)

        return QRDQNTrainer(
            q_network=self.q_network,
            q_network_target=self.q_network_target,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=self.metrics_to_score,
            evaluation=evaluation,
            # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `asdict`.
            **params.asdict()
        )

    def test_init(self):
        trainer = self._construct_trainer()
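        # QR-DQN uses quantile midpoints (2i + 1) / (2 * num_atoms), i = 0..num_atoms-1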
        quantiles = (0.5 + torch.arange(self.num_atoms).float()) / float(self.num_atoms)
        self.assertTrue((torch.isclose(trainer.quantiles, quantiles)).all())
        self.assertTrue((torch.isclose(trainer.reward_boosts, torch.zeros(2))).all())
        param_copy = QRDQNTrainerParameters(
            actions=["1", "2"],
            num_atoms=11,
            rl=RLParameters(reward_boost={"1": 1, "2": 2}),
        )
        reward_boost_trainer = self._construct_trainer(new_params=param_copy)
        self.assertTrue(
            (
                torch.isclose(
                    reward_boost_trainer.reward_boosts, torch.tensor([1.0, 2.0])
                )
            ).all()
        )

    def test_train_step_gen(self):
        inp = DiscreteDqnInput(
            state=FeatureData(float_features=torch.rand(3, 10)),
            next_state=FeatureData(float_features=torch.rand(3, 10)),
            reward=torch.ones(3, 1),
            time_diff=torch.ones(3, 1) * 2,
            step=torch.ones(3, 1) * 2,
            not_terminal=torch.ones(3, 1),  # todo: check terminal behavior
            action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
            next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
            possible_actions_mask=torch.ones(3, 2),
            possible_next_actions_mask=torch.ones(3, 2),
            extras=ExtraData(),
        )
        mse_backward_type = type(
            torch.nn.functional.mse_loss(
                torch.tensor([1.0], requires_grad=True), torch.zeros(1)
            ).grad_fn
        )
        add_backward_type = type(
            (
                torch.tensor([1.0], requires_grad=True)
                + torch.tensor([1.0], requires_grad=True)
            ).grad_fn
        )
        mean_backward_type = type(
            torch.tensor([1.0, 2.0], requires_grad=True).mean().grad_fn
        )

        # vanilla
        trainer = self._construct_trainer()
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)
        self.assertEqual(type(losses[0].grad_fn), mean_backward_type)
        self.assertEqual(type(losses[1].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[2].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[3].grad_fn), add_backward_type)

        # no CPE
        trainer = self._construct_trainer(no_cpe=True)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 2)

        # seq_num
        param_copy = QRDQNTrainerParameters(
            actions=["1", "2"],
            num_atoms=11,
            rl=RLParameters(use_seq_num_diff_as_time_diff=True),
        )
        trainer = self._construct_trainer(new_params=param_copy)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)

        # multi_steps
        param_copy = QRDQNTrainerParameters(
            actions=["1", "2"], num_atoms=11, rl=RLParameters(multi_steps=2)
        )
        trainer = self._construct_trainer(new_params=param_copy)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)

        # non_max_q
        param_copy = QRDQNTrainerParameters(
            actions=["1", "2"], num_atoms=11, rl=RLParameters(maxq_learning=False)
        )
        trainer = self._construct_trainer(new_params=param_copy)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)

    def test_configure_optimizers(self):
        trainer = self._construct_trainer()
        optimizers = trainer.configure_optimizers()
        self.assertEqual(len(optimizers), 4)
        train_step_yield_order = [
            trainer.q_network,
            trainer.reward_network,
            trainer.q_network_cpe,
            trainer.q_network,
        ]
        for i in range(len(train_step_yield_order)):
            opt_param = optimizers[i]["optimizer"].param_groups[0]["params"][0]
            loss_param = list(train_step_yield_order[i].parameters())[0]
            self.assertTrue(torch.all(torch.isclose(opt_param, loss_param)))

        trainer = self._construct_trainer(no_cpe=True)
        optimizers = trainer.configure_optimizers()
        self.assertEqual(len(optimizers), 2)

    def test_get_detached_model_outputs(self):
        trainer = self._construct_trainer()
        q_out, q_target = trainer.get_detached_model_outputs(self.x)
        self.assertEqual(q_out.shape[0], q_target.shape[0])
        self.assertEqual(q_out.shape[1], q_target.shape[1])