Example 1
    def test_forward_pass(self):
        state_dim = 1
        action_dim = 2
        input = StateInput(state=FeatureVector(
            float_features=torch.tensor([[2.0]])))
        bcq_drop_threshold = 0.20

        q_network = FullyConnectedDQN(state_dim,
                                      action_dim,
                                      sizes=[2],
                                      activations=["relu"])
        # Set weights of q-network to make it deterministic
        q_net_layer_0_w = torch.tensor([[1.2], [0.9]])
        q_network.state_dict()["fc.layers.0.weight"].data.copy_(
            q_net_layer_0_w)
        q_net_layer_0_b = torch.tensor([0.0, 0.0])
        q_network.state_dict()["fc.layers.0.bias"].data.copy_(q_net_layer_0_b)
        q_net_layer_1_w = torch.tensor([[0.5, -0.5], [1.0, 1.0]])
        q_network.state_dict()["fc.layers.1.weight"].data.copy_(
            q_net_layer_1_w)
        q_net_layer_1_b = torch.tensor([0.0, 0.0])
        q_network.state_dict()["fc.layers.1.bias"].data.copy_(q_net_layer_1_b)

        imitator_network = FullyConnectedNetwork(
            layers=[state_dim, 2, action_dim], activations=["relu", "linear"])
        # Set weights of imitator network to make it deterministic
        im_net_layer_0_w = torch.tensor([[1.2], [0.9]])
        imitator_network.state_dict()["layers.0.weight"].data.copy_(
            im_net_layer_0_w)
        im_net_layer_0_b = torch.tensor([0.0, 0.0])
        imitator_network.state_dict()["layers.0.bias"].data.copy_(
            im_net_layer_0_b)
        im_net_layer_1_w = torch.tensor([[0.5, 1.5], [1.0, 2.0]])
        imitator_network.state_dict()["layers.1.weight"].data.copy_(
            im_net_layer_1_w)
        im_net_layer_1_b = torch.tensor([0.0, 0.0])
        imitator_network.state_dict()["layers.1.bias"].data.copy_(
            im_net_layer_1_b)

        imitator_probs = torch.nn.functional.softmax(
            imitator_network(input.state.float_features), dim=1)
        bcq_mask = imitator_probs < bcq_drop_threshold
        assert bcq_mask[0][0] == 1
        assert bcq_mask[0][1] == 0

        model = BatchConstrainedDQN(
            state_dim=state_dim,
            q_network=q_network,
            imitator_network=imitator_network,
            bcq_drop_threshold=bcq_drop_threshold,
        )
        final_q_values = model(input)
        assert final_q_values.q_values[0][0] == -1e10
        assert abs(final_q_values.q_values[0][1] - 4.2) < 0.0001
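
A quick hand-check of the two final assertions (a minimal sketch using plain tensor ops rather than the ReAgent model classes; both networks were given identical first-layer weights above, so they share the same hidden activation):

    import torch
    import torch.nn.functional as F

    # Replay of the deterministic forward pass with the hand-set weights (all biases are zero).
    x = torch.tensor([[2.0]])
    h = torch.relu(x @ torch.tensor([[1.2], [0.9]]).t())     # hidden: [[2.4, 1.8]]
    q = h @ torch.tensor([[0.5, -0.5], [1.0, 1.0]]).t()      # q-network head: [[0.3, 4.2]]
    logits = h @ torch.tensor([[0.5, 1.5], [1.0, 2.0]]).t()  # imitator head: [[3.9, 6.0]]
    probs = F.softmax(logits, dim=1)                         # ~[[0.109, 0.891]]
    mask = probs < 0.20                                      # [[True, False]]
    q_masked = q.masked_fill(mask, -1e10)                    # [[-1e10, 4.2]]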
Example 2
    def get_modular_sarsa_trainer_reward_boost(
        self,
        environment,
        reward_shape,
        dueling,
        use_gpu=False,
        use_all_avail_gpus=False,
        clip_grad_norm=None,
    ):
        parameters = self.get_sarsa_parameters(environment, reward_shape,
                                               dueling, clip_grad_norm)
        q_network = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_data_parallel_model()
                reward_network = reward_network.get_data_parallel_model()

        q_network_target = q_network.get_target_network()
        trainer = _DQNTrainer(q_network, q_network_target, reward_network,
                              parameters, use_gpu)
        return trainer
Example 3
    def test_save_load_batch_norm(self):
        state_dim = 8
        action_dim = 4
        model = FullyConnectedDQN(
            state_dim,
            action_dim,
            sizes=[8, 4],
            activations=["relu", "relu"],
            use_batch_norm=True,
        )
        # Freezing batch_norm
        model.eval()
        expected_num_params, expected_num_inputs, expected_num_outputs = 21, 1, 1
        check_save_load(self, model, expected_num_params, expected_num_inputs,
                        expected_num_outputs)
Example 4
    def test_discrete_wrapper(self):
        state_normalization_parameters = {i: _cont_norm() for i in range(1, 5)}
        state_preprocessor = Preprocessor(state_normalization_parameters,
                                          False)
        action_dim = 2
        dqn = FullyConnectedDQN(
            state_dim=len(state_normalization_parameters),
            action_dim=action_dim,
            sizes=[16],
            activations=["relu"],
        )
        dqn_with_preprocessor = DiscreteDqnWithPreprocessor(
            dqn, state_preprocessor)
        action_names = ["L", "R"]
        wrapper = DiscreteDqnPredictorWrapper(dqn_with_preprocessor,
                                              action_names)
        input_prototype = dqn_with_preprocessor.input_prototype()
        output_action_names, q_values = wrapper(*input_prototype)
        self.assertEqual(action_names, output_action_names)
        self.assertEqual(q_values.shape, (1, 2))

        expected_output = dqn(
            rlt.PreprocessedState.from_tensor(
                state_preprocessor(*input_prototype[0]))).q_values
        self.assertTrue((expected_output == q_values).all())
Example 5
    def test_basic(self):
        state_dim = 8
        action_dim = 4
        model = FullyConnectedDQN(
            state_dim,
            action_dim,
            sizes=[8, 4],
            activations=["relu", "relu"],
            use_batch_norm=True,
        )
        input = model.input_prototype()
        self.assertEqual((1, state_dim), input.state.float_features.shape)
        # Using batch norm requires more than 1 example in training, avoid that
        model.eval()
        q_values = model(input)
        self.assertEqual((1, action_dim), q_values.q_values.shape)
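
The model.eval() call is what makes the single-example forward pass above work: a batch-norm layer in training mode cannot normalize a batch of one. A minimal illustration with a plain PyTorch layer (assuming FullyConnectedDQN's batch norm behaves like a standard nn.BatchNorm1d):

    import torch
    import torch.nn as nn

    bn = nn.BatchNorm1d(8)
    bn.train()
    try:
        bn(torch.randn(1, 8))    # a batch of one has no batch variance to normalize with
    except ValueError as err:
        print(err)               # "Expected more than 1 value per channel when training ..."
    bn.eval()
    out = bn(torch.randn(1, 8))  # fine in eval mode: running statistics are used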
Example 6
    def build_q_network(
        self,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        output_dim: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_parameters)
        return FullyConnectedDQN(
            state_dim=state_dim,
            action_dim=output_dim,
            sizes=self.config.sizes,
            activations=self.config.activations,
            dropout_ratio=self.config.dropout_ratio,
        )
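
The builder above reads only three fields from self.config. A sketch of what such a config object might look like; the dataclass name and default values here are assumptions for illustration, not taken from the snippet:

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class FullyConnectedConfig:  # hypothetical stand-in for self.config
        sizes: List[int] = field(default_factory=lambda: [128, 64])
        activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
        dropout_ratio: float = 0.0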
Example 7
    def test_save_load(self):
        state_dim = 8
        action_dim = 4
        model = FullyConnectedDQN(
            state_dim,
            action_dim,
            sizes=[8, 4],
            activations=["relu", "relu"],
            use_batch_norm=False,
        )
        expected_num_params, expected_num_inputs, expected_num_outputs = 6, 1, 1
        check_save_load(self, model, expected_num_params, expected_num_inputs,
                        expected_num_outputs)
Example 8
    def test_save_load(self):
        state_dim = 8
        action_dim = 4
        q_network = FullyConnectedDQN(state_dim,
                                      action_dim,
                                      sizes=[8, 4],
                                      activations=["relu", "relu"])
        imitator_network = FullyConnectedNetwork(
            layers=[state_dim, 8, 4, action_dim],
            activations=["relu", "relu", "linear"])
        model = BatchConstrainedDQN(
            state_dim=state_dim,
            q_network=q_network,
            imitator_network=imitator_network,
            bcq_drop_threshold=0.05,
        )
        # 6 for DQN + 6 for Imitator Network + 2 for BCQ constants
        expected_num_params, expected_num_inputs, expected_num_outputs = 14, 1, 1
        check_save_load(self, model, expected_num_params, expected_num_inputs,
                        expected_num_outputs)
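
The count in the comment can be reproduced by hand, assuming FullyConnectedDQN(8, 4, sizes=[8, 4]) and FullyConnectedNetwork(layers=[8, 8, 4, 4]) each resolve to three linear layers with one weight and one bias apiece:

    dqn_layer_dims = [8, 8, 4, 4]        # state_dim -> sizes=[8, 4] -> action_dim
    imitator_layer_dims = [8, 8, 4, 4]   # layers=[state_dim, 8, 4, action_dim]
    num_dqn_tensors = 2 * (len(dqn_layer_dims) - 1)            # 6
    num_imitator_tensors = 2 * (len(imitator_layer_dims) - 1)  # 6
    num_bcq_constants = 2                # the two BCQ constants noted in the comment
    assert num_dqn_tensors + num_imitator_tensors + num_bcq_constants == 14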
Example 9
    def test_basic(self):
        state_dim = 8
        action_dim = 4
        q_network = FullyConnectedDQN(state_dim,
                                      action_dim,
                                      sizes=[8, 4],
                                      activations=["relu", "relu"])
        imitator_network = FullyConnectedNetwork(
            layers=[state_dim, 8, 4, action_dim],
            activations=["relu", "relu", "linear"])
        model = BatchConstrainedDQN(
            state_dim=state_dim,
            q_network=q_network,
            imitator_network=imitator_network,
            bcq_drop_threshold=0.05,
        )

        input = model.input_prototype()
        self.assertEqual((1, state_dim), input.state.float_features.shape)
        q_values = model(input)
        self.assertEqual((1, action_dim), q_values.q_values.shape)
Example 10
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []

    if model.rainbow.quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    elif model.rainbow.categorical:
        q_network = CategoricalDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            qmin=model.rainbow.qmin,
            qmax=model.rainbow.qmax,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
            use_gpu=use_gpu,
        )
    elif model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(  # type: ignore
            layers=[get_num_output_features(normalization_parameters)] +
            model.training.layers[1:-1] + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    if (use_all_avail_gpus and not model.rainbow.categorical
            and not model.rainbow.quantile):
        q_network = q_network.get_distributed_data_parallel_model()
        reward_network = (reward_network.get_distributed_data_parallel_model()
                          if reward_network else None)
        q_network_cpe = (q_network_cpe.get_distributed_data_parallel_model()
                         if q_network_cpe else None)

    if model.rainbow.quantile:
        assert (not use_all_avail_gpus
                ), "use_all_avail_gpus not implemented for distributional RL"
        return QRDQNTrainer(
            q_network,
            q_network_target,
            model,
            use_gpu,
            metrics_to_score=metrics_to_score,
        )

    elif model.rainbow.categorical:
        assert (not use_all_avail_gpus
                ), "use_all_avail_gpus not implemented for distributional RL"
        return C51Trainer(
            q_network,
            q_network_target,
            model,
            use_gpu,
            metrics_to_score=metrics_to_score,
        )

    else:
        return DQNTrainer(
            q_network,
            q_network_target,
            reward_network,
            model,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
        )
Example 11
    def get_modular_sarsa_trainer_reward_boost(
        self,
        environment,
        reward_shape,
        dueling,
        categorical,
        quantile,
        use_gpu=False,
        use_all_avail_gpus=False,
        clip_grad_norm=None,
    ):
        assert not quantile or not categorical
        parameters = self.get_sarsa_parameters(environment, reward_shape,
                                               dueling, categorical, quantile,
                                               clip_grad_norm)

        if quantile:
            if dueling:
                q_network = DuelingQuantileDQN(
                    layers=[
                        get_num_output_features(environment.normalization)
                    ] + parameters.training.layers[1:-1] +
                    [len(environment.ACTIONS)],
                    activations=parameters.training.activations,
                    num_atoms=parameters.rainbow.num_atoms,
                )
            else:
                q_network = QuantileDQN(
                    state_dim=get_num_output_features(
                        environment.normalization),
                    action_dim=len(environment.ACTIONS),
                    num_atoms=parameters.rainbow.num_atoms,
                    sizes=parameters.training.layers[1:-1],
                    activations=parameters.training.activations[:-1],
                )
        elif categorical:
            assert not dueling
            q_network = CategoricalDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                num_atoms=parameters.rainbow.num_atoms,
                qmin=-100,
                qmax=200,
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
        else:
            if dueling:
                q_network = DuelingQNetwork(
                    layers=[
                        get_num_output_features(environment.normalization)
                    ] + parameters.training.layers[1:-1] +
                    [len(environment.ACTIONS)],
                    activations=parameters.training.activations,
                )
            else:
                q_network = FullyConnectedDQN(
                    state_dim=get_num_output_features(
                        environment.normalization),
                    action_dim=len(environment.ACTIONS),
                    sizes=parameters.training.layers[1:-1],
                    activations=parameters.training.activations[:-1],
                )

        q_network_cpe, q_network_cpe_target, reward_network = None, None, None

        if parameters.evaluation and parameters.evaluation.calc_cpe_in_training:
            q_network_cpe = FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
            q_network_cpe_target = q_network_cpe.get_target_network()
            reward_network = FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )

        if use_gpu:
            q_network = q_network.cuda()
            if parameters.evaluation and parameters.evaluation.calc_cpe_in_training:
                reward_network = reward_network.cuda()
                q_network_cpe = q_network_cpe.cuda()
                q_network_cpe_target = q_network_cpe_target.cuda()
            if use_all_avail_gpus and not categorical:
                q_network = q_network.get_distributed_data_parallel_model()
                reward_network = (
                    reward_network.get_distributed_data_parallel_model())
                q_network_cpe = (
                    q_network_cpe.get_distributed_data_parallel_model())
                q_network_cpe_target = (
                    q_network_cpe_target.get_distributed_data_parallel_model())

        if quantile:
            trainer = QRDQNTrainer(
                q_network,
                q_network.get_target_network(),
                parameters,
                use_gpu,
                reward_network=reward_network,
                q_network_cpe=q_network_cpe,
                q_network_cpe_target=q_network_cpe_target,
            )
        elif categorical:
            trainer = C51Trainer(q_network, q_network.get_target_network(),
                                 parameters, use_gpu)
        else:
            parameters = DQNTrainerParameters.from_discrete_action_model_parameters(
                parameters)
            trainer = DQNTrainer(
                q_network,
                q_network.get_target_network(),
                reward_network,
                parameters,
                use_gpu,
                q_network_cpe=q_network_cpe,
                q_network_cpe_target=q_network_cpe_target,
            )
        return trainer
Example 12
    def get_modular_sarsa_trainer_reward_boost(
        self,
        environment,
        reward_shape,
        dueling,
        categorical,
        use_gpu=False,
        use_all_avail_gpus=False,
        clip_grad_norm=None,
    ):
        parameters = self.get_sarsa_parameters(environment, reward_shape,
                                               dueling, categorical,
                                               clip_grad_norm)

        if not categorical:
            q_network = FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
            q_network_cpe = FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
            reward_network = FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
        else:
            q_network = CategoricalDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                num_atoms=51,
                qmin=-100,
                qmax=200,
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )

        if use_gpu:
            q_network = q_network.cuda()
            if not categorical:
                # reward/CPE networks only exist in the non-categorical branch above
                reward_network = reward_network.cuda()
                q_network_cpe = q_network_cpe.cuda()
            if use_all_avail_gpus and not categorical:
                q_network = q_network.get_distributed_data_parallel_model()
                reward_network = (
                    reward_network.get_distributed_data_parallel_model())
                q_network_cpe = (
                    q_network_cpe.get_distributed_data_parallel_model())

        if not categorical:
            trainer = DQNTrainer(
                q_network,
                q_network.get_target_network(),
                reward_network,
                parameters,
                use_gpu,
                q_network_cpe=q_network_cpe,
                q_network_cpe_target=q_network_cpe.get_target_network(),
            )
        else:
            trainer = C51Trainer(q_network, q_network.get_target_network(),
                                 parameters, use_gpu)
        return trainer
Example 13
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []
    if model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(
            layers=[get_num_output_features(normalization_parameters)] +
            model.training.layers[1:-1] + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    return DQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        model,
        use_gpu,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=metrics_to_score,
    )