def test_discrete_action(self):
    state_dim = 8
    action_dim = 4
    model = DuelingQNetwork(
        layers=[state_dim, 8, 4, action_dim],
        activations=["relu", "relu", "linear"],
        use_batch_norm=True,
    )
    input = model.input_prototype()
    self.assertEqual((1, state_dim), input.state.float_features.shape)
    # Using batch norm requires more than 1 example in training, avoid that
    model.eval()
    q_values = model(input)
    self.assertEqual((1, action_dim), q_values.q_values.shape)
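# A minimal sketch of the dueling aggregation that these tests exercise,
# assuming a plain torch interface (the value/advantage tensors here are
# hypothetical inputs, not DuelingQNetwork's actual attributes): Q(s, a) is
# computed as V(s) + A(s, a) - mean_a A(s, a), so the state value and the
# per-action advantages are learned in separate streams.
import torch


def dueling_q_values(value: torch.Tensor, advantage: torch.Tensor) -> torch.Tensor:
    # value: (batch, 1), advantage: (batch, action_dim)
    return value + advantage - advantage.mean(dim=1, keepdim=True)


# Example: a batch of 1 state with 4 actions yields q_values of shape (1, 4),
# matching the (1, action_dim) assertion in the test above.
q_values = dueling_q_values(torch.zeros(1, 1), torch.randn(1, 4))
assert q_values.shape == (1, 4)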
def test_save_load_discrete_action_batch_norm(self):
    state_dim = 8
    action_dim = 4
    model = DuelingQNetwork(
        layers=[state_dim, 8, 4, action_dim],
        activations=["relu", "relu", "linear"],
        use_batch_norm=True,
    )
    # Freezing batch_norm
    model.eval()
    # The expected number of params is the same because DuelingQNetwork always
    # initializes the batch norm layers even when it doesn't use them.
    expected_num_params, expected_num_inputs, expected_num_outputs = 22, 1, 1
    check_save_load(
        self, model, expected_num_params, expected_num_inputs, expected_num_outputs
    )
def setUp(self):
    # preparing various components for PPO trainer initialization
    self.batch_size = 3
    self.state_dim = 10
    self.action_dim = 2
    self.num_layers = 2
    self.sizes = [20 for _ in range(self.num_layers)]
    self.activations = ["relu" for _ in range(self.num_layers)]
    self.use_layer_norm = False
    self.softmax_temperature = 1
    self.actions = [str(i) for i in range(self.action_dim)]
    self.params = PPOTrainerParameters(actions=self.actions, normalize=False)
    self.reward_options = RewardOptions()
    self.metrics_to_score = get_metrics_to_score(
        self.reward_options.metric_reward_values
    )
    self.policy_network = DuelingQNetwork.make_fully_connected(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        layers=self.sizes,
        activations=self.activations,
    )
    self.sampler = SoftmaxActionSampler(temperature=self.softmax_temperature)
    self.policy = Policy(scorer=self.policy_network, sampler=self.sampler)
    self.value_network = FloatFeatureFullyConnected(
        state_dim=self.state_dim,
        output_dim=1,
        sizes=self.sizes,
        activations=self.activations,
        use_layer_norm=self.use_layer_norm,
    )
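# A minimal sketch of what the SoftmaxActionSampler above does conceptually,
# assuming it samples actions in proportion to softmax(q_values / temperature);
# this standalone helper is illustrative and is not the sampler's real API.
import torch


def sample_action(q_values: torch.Tensor, temperature: float = 1.0) -> torch.Tensor:
    # q_values: (batch, action_dim) scores produced by the policy network (scorer)
    probs = torch.softmax(q_values / temperature, dim=1)
    # Draw one action index per batch element from the categorical distribution
    return torch.multinomial(probs, num_samples=1).squeeze(1)


actions = sample_action(torch.randn(3, 2), temperature=1.0)  # shape (3,)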
def build_q_network(
    self,
    state_feature_config: rlt.ModelFeatureConfig,
    state_normalization_data: NormalizationData,
    output_dim: int,
) -> ModelBase:
    state_dim = self._get_input_dim(state_normalization_data)
    return DuelingQNetwork.make_fully_connected(
        state_dim, output_dim, self.sizes, self.activations
    )
def test_save_load_discrete_action(self):
    state_dim = 8
    action_dim = 4
    model = DuelingQNetwork.make_fully_connected(
        state_dim, action_dim, layers=[8, 4], activations=["relu", "relu"]
    )
    expected_num_params, expected_num_inputs, expected_num_outputs = 22, 1, 1
    check_save_load(
        self, model, expected_num_params, expected_num_inputs, expected_num_outputs
    )
def build_q_network(
    self,
    state_feature_config: rlt.ModelFeatureConfig,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    output_dim: int,
) -> ModelBase:
    state_dim = self._get_input_dim(state_normalization_parameters)
    return DuelingQNetwork(
        layers=[state_dim] + self.sizes + [output_dim],
        activations=self.activations + ["linear"],
    )
def test_save_load_discrete_action(self):
    state_dim = 8
    action_dim = 4
    model = DuelingQNetwork(
        layers=[state_dim, 8, 4, action_dim],
        activations=["relu", "relu", "linear"],
        use_batch_norm=False,
    )
    expected_num_params, expected_num_inputs, expected_num_outputs = 22, 1, 1
    check_save_load(
        self, model, expected_num_params, expected_num_inputs, expected_num_outputs
    )
def build_q_network(
    self,
    state_normalization_data: NormalizationData,
    output_dim: int,
    num_atoms: int,
) -> ModelBase:
    state_dim = self._get_input_dim(state_normalization_data)
    return DuelingQNetwork.make_fully_connected(
        state_dim,
        output_dim,
        layers=self.sizes,
        activations=self.activations,
        num_atoms=num_atoms,
    )
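# A rough sketch of how a per-action distributional head is typically shaped
# when num_atoms is passed, assuming the common convention of one row of atoms
# per action; the module below is illustrative, not DuelingQNetwork's internals.
import torch
import torch.nn as nn


class DistributionalHead(nn.Module):
    def __init__(self, hidden_dim: int, action_dim: int, num_atoms: int):
        super().__init__()
        self.action_dim = action_dim
        self.num_atoms = num_atoms
        self.linear = nn.Linear(hidden_dim, action_dim * num_atoms)

    def forward(self, h: torch.Tensor) -> torch.Tensor:
        # (batch, hidden_dim) -> (batch, action_dim, num_atoms)
        return self.linear(h).view(-1, self.action_dim, self.num_atoms)


logits = DistributionalHead(20, 2, 51)(torch.randn(3, 20))
assert logits.shape == (3, 2, 51)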
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []

    if model.rainbow.quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    elif model.rainbow.categorical:
        q_network = CategoricalDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            qmin=model.rainbow.qmin,
            qmax=model.rainbow.qmax,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
            use_gpu=use_gpu,
        )
    elif model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(  # type: ignore
            layers=[get_num_output_features(normalization_parameters)]
            + model.training.layers[1:-1]
            + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    if (
        use_all_avail_gpus
        and not model.rainbow.categorical
        and not model.rainbow.quantile
    ):
        q_network = q_network.get_distributed_data_parallel_model()
        reward_network = (
            reward_network.get_distributed_data_parallel_model()
            if reward_network
            else None
        )
        q_network_cpe = (
            q_network_cpe.get_distributed_data_parallel_model()
            if q_network_cpe
            else None
        )

    if model.rainbow.quantile:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        parameters = QRDQNTrainerParameters.from_discrete_action_model_parameters(model)
        return QRDQNTrainer(
            q_network,
            q_network_target,
            parameters,
            use_gpu,
            metrics_to_score=metrics_to_score,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    elif model.rainbow.categorical:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        return C51Trainer(
            q_network,
            q_network_target,
            C51TrainerParameters.from_discrete_action_model_parameters(model),
            use_gpu,
            metrics_to_score=metrics_to_score,
        )
    else:
        parameters = DQNTrainerParameters.from_discrete_action_model_parameters(model)
        return DQNTrainer(
            q_network,
            q_network_target,
            reward_network,
            parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
        )
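# A condensed sketch of the dispatch above: the rainbow flags pick both the
# Q-network class and the trainer, and the CPE networks are only built when
# calc_cpe_in_training is set. The helper below merely mirrors that routing
# for illustration; it is not part of the module.
def trainer_kind(quantile: bool, categorical: bool, dueling: bool) -> str:
    if quantile:
        return "QRDQNTrainer with QuantileDQN"
    if categorical:
        return "C51Trainer with CategoricalDQN"
    if dueling:
        return "DQNTrainer with DuelingQNetwork"
    return "DQNTrainer with FullyConnectedDQN"


assert trainer_kind(False, False, True) == "DQNTrainer with DuelingQNetwork"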