def test_forward_pass(self):
    state_dim = 1
    action_dim = 2
    input = StateInput(state=FeatureVector(float_features=torch.tensor([[2.0]])))
    bcq_drop_threshold = 0.20
    q_network = FullyConnectedDQN(
        state_dim, action_dim, sizes=[2], activations=["relu"]
    )
    # Set weights of q-network to make it deterministic
    q_net_layer_0_w = torch.tensor([[1.2], [0.9]])
    q_network.state_dict()["fc.layers.0.weight"].data.copy_(q_net_layer_0_w)
    q_net_layer_0_b = torch.tensor([0.0, 0.0])
    q_network.state_dict()["fc.layers.0.bias"].data.copy_(q_net_layer_0_b)
    q_net_layer_1_w = torch.tensor([[0.5, -0.5], [1.0, 1.0]])
    q_network.state_dict()["fc.layers.1.weight"].data.copy_(q_net_layer_1_w)
    q_net_layer_1_b = torch.tensor([0.0, 0.0])
    q_network.state_dict()["fc.layers.1.bias"].data.copy_(q_net_layer_1_b)
    imitator_network = FullyConnectedNetwork(
        layers=[state_dim, 2, action_dim], activations=["relu", "linear"]
    )
    # Set weights of imitator network to make it deterministic
    im_net_layer_0_w = torch.tensor([[1.2], [0.9]])
    imitator_network.state_dict()["layers.0.weight"].data.copy_(im_net_layer_0_w)
    im_net_layer_0_b = torch.tensor([0.0, 0.0])
    imitator_network.state_dict()["layers.0.bias"].data.copy_(im_net_layer_0_b)
    im_net_layer_1_w = torch.tensor([[0.5, 1.5], [1.0, 2.0]])
    imitator_network.state_dict()["layers.1.weight"].data.copy_(im_net_layer_1_w)
    im_net_layer_1_b = torch.tensor([0.0, 0.0])
    imitator_network.state_dict()["layers.1.bias"].data.copy_(im_net_layer_1_b)
    imitator_probs = torch.nn.functional.softmax(
        imitator_network(input.state.float_features), dim=1
    )
    bcq_mask = imitator_probs < bcq_drop_threshold
    assert bcq_mask[0][0] == 1
    assert bcq_mask[0][1] == 0
    model = BatchConstrainedDQN(
        state_dim=state_dim,
        q_network=q_network,
        imitator_network=imitator_network,
        bcq_drop_threshold=bcq_drop_threshold,
    )
    final_q_values = model(input)
    assert final_q_values.q_values[0][0] == -1e10
    assert abs(final_q_values.q_values[0][1] - 4.2) < 0.0001
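# A minimal hand-check of the numbers asserted above (an illustrative sketch, not
# part of the original test; the helper name is hypothetical): with the weights
# set in test_forward_pass, the Q-network maps state [[2.0]] through
# relu([2.4, 1.8]) to [0.3, 4.2], while the imitator produces logits [3.9, 6.0],
# whose softmax (~[0.109, 0.891]) falls below the 0.20 threshold for action 0,
# so the BCQ output becomes [-1e10, 4.2], matching the assertions.
def _bcq_forward_pass_by_hand():
    import torch  # already imported at module scope; repeated so the sketch stands alone

    state = torch.tensor([[2.0]])
    # Shared first layer: Linear weight is (out, in), so forward is x @ W.t()
    hidden = torch.relu(state @ torch.tensor([[1.2], [0.9]]).t())            # [[2.4, 1.8]]
    q_values = hidden @ torch.tensor([[0.5, -0.5], [1.0, 1.0]]).t()          # [[0.3, 4.2]]
    imitator_logits = hidden @ torch.tensor([[0.5, 1.5], [1.0, 2.0]]).t()    # [[3.9, 6.0]]
    imitator_probs = torch.softmax(imitator_logits, dim=1)                   # ~[[0.109, 0.891]]
    mask = imitator_probs < 0.20                                             # [[True, False]]
    # Masked actions get a very large negative value, as the test's assertions show
    return torch.where(mask, torch.full_like(q_values, -1e10), q_values)     # [[-1e10, 4.2]]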
def get_modular_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    parameters = self.get_sarsa_parameters(
        environment, reward_shape, dueling, clip_grad_norm
    )
    q_network = FullyConnectedDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=len(environment.ACTIONS),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    reward_network = FullyConnectedDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=len(environment.ACTIONS),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    if use_gpu:
        q_network = q_network.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_data_parallel_model()
            reward_network = reward_network.get_data_parallel_model()
    q_network_target = q_network.get_target_network()
    trainer = _DQNTrainer(
        q_network, q_network_target, reward_network, parameters, use_gpu
    )
    return trainer
def test_save_load_batch_norm(self):
    state_dim = 8
    action_dim = 4
    model = FullyConnectedDQN(
        state_dim,
        action_dim,
        sizes=[8, 4],
        activations=["relu", "relu"],
        use_batch_norm=True,
    )
    # Freezing batch_norm
    model.eval()
    expected_num_params, expected_num_inputs, expected_num_outputs = 21, 1, 1
    check_save_load(
        self, model, expected_num_params, expected_num_inputs, expected_num_outputs
    )
def test_discrete_wrapper(self):
    state_normalization_parameters = {i: _cont_norm() for i in range(1, 5)}
    state_preprocessor = Preprocessor(state_normalization_parameters, False)
    action_dim = 2
    dqn = FullyConnectedDQN(
        state_dim=len(state_normalization_parameters),
        action_dim=action_dim,
        sizes=[16],
        activations=["relu"],
    )
    dqn_with_preprocessor = DiscreteDqnWithPreprocessor(dqn, state_preprocessor)
    action_names = ["L", "R"]
    wrapper = DiscreteDqnPredictorWrapper(dqn_with_preprocessor, action_names)
    input_prototype = dqn_with_preprocessor.input_prototype()
    output_action_names, q_values = wrapper(*input_prototype)
    self.assertEqual(action_names, output_action_names)
    self.assertEqual(q_values.shape, (1, 2))
    expected_output = dqn(
        rlt.PreprocessedState.from_tensor(state_preprocessor(*input_prototype[0]))
    ).q_values
    self.assertTrue((expected_output == q_values).all())
def test_basic(self):
    state_dim = 8
    action_dim = 4
    model = FullyConnectedDQN(
        state_dim,
        action_dim,
        sizes=[8, 4],
        activations=["relu", "relu"],
        use_batch_norm=True,
    )
    input = model.input_prototype()
    self.assertEqual((1, state_dim), input.state.float_features.shape)
    # Using batch norm requires more than 1 example in training, avoid that
    model.eval()
    q_values = model(input)
    self.assertEqual((1, action_dim), q_values.q_values.shape)
def build_q_network(
    self,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    output_dim: int,
) -> ModelBase:
    state_dim = self._get_input_dim(state_normalization_parameters)
    return FullyConnectedDQN(
        state_dim=state_dim,
        action_dim=output_dim,
        sizes=self.config.sizes,
        activations=self.config.activations,
        dropout_ratio=self.config.dropout_ratio,
    )
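# Illustrative usage sketch (not part of the builder class; the helper name and
# the concrete dimensions/sizes are assumed values): a network built this way is
# exercised the same way as in the unit tests in this file — construct it, take
# an input prototype, and read the per-action q_values from the forward pass.
def _example_q_network_usage():
    q_network = FullyConnectedDQN(
        state_dim=4,
        action_dim=2,
        sizes=[16],
        activations=["relu"],
    )
    input = q_network.input_prototype()   # single-example prototype input
    q_values = q_network(input).q_values  # tensor of shape (1, action_dim)
    return q_values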
def test_save_load(self):
    state_dim = 8
    action_dim = 4
    model = FullyConnectedDQN(
        state_dim,
        action_dim,
        sizes=[8, 4],
        activations=["relu", "relu"],
        use_batch_norm=False,
    )
    expected_num_params, expected_num_inputs, expected_num_outputs = 6, 1, 1
    check_save_load(
        self, model, expected_num_params, expected_num_inputs, expected_num_outputs
    )
def test_save_load(self):
    state_dim = 8
    action_dim = 4
    q_network = FullyConnectedDQN(
        state_dim, action_dim, sizes=[8, 4], activations=["relu", "relu"]
    )
    imitator_network = FullyConnectedNetwork(
        layers=[state_dim, 8, 4, action_dim], activations=["relu", "relu", "linear"]
    )
    model = BatchConstrainedDQN(
        state_dim=state_dim,
        q_network=q_network,
        imitator_network=imitator_network,
        bcq_drop_threshold=0.05,
    )
    # 6 for DQN + 6 for Imitator Network + 2 for BCQ constants
    expected_num_params, expected_num_inputs, expected_num_outputs = 14, 1, 1
    check_save_load(
        self, model, expected_num_params, expected_num_inputs, expected_num_outputs
    )
def test_basic(self):
    state_dim = 8
    action_dim = 4
    q_network = FullyConnectedDQN(
        state_dim, action_dim, sizes=[8, 4], activations=["relu", "relu"]
    )
    imitator_network = FullyConnectedNetwork(
        layers=[state_dim, 8, 4, action_dim], activations=["relu", "relu", "linear"]
    )
    model = BatchConstrainedDQN(
        state_dim=state_dim,
        q_network=q_network,
        imitator_network=imitator_network,
        bcq_drop_threshold=0.05,
    )
    input = model.input_prototype()
    self.assertEqual((1, state_dim), input.state.float_features.shape)
    q_values = model(input)
    self.assertEqual((1, action_dim), q_values.q_values.shape)
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []

    if model.rainbow.quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    elif model.rainbow.categorical:
        q_network = CategoricalDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            qmin=model.rainbow.qmin,
            qmax=model.rainbow.qmax,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
            use_gpu=use_gpu,
        )
    elif model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(  # type: ignore
            layers=[get_num_output_features(normalization_parameters)]
            + model.training.layers[1:-1]
            + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    if (
        use_all_avail_gpus
        and not model.rainbow.categorical
        and not model.rainbow.quantile
    ):
        q_network = q_network.get_distributed_data_parallel_model()
        reward_network = (
            reward_network.get_distributed_data_parallel_model()
            if reward_network
            else None
        )
        q_network_cpe = (
            q_network_cpe.get_distributed_data_parallel_model()
            if q_network_cpe
            else None
        )

    if model.rainbow.quantile:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        return QRDQNTrainer(
            q_network,
            q_network_target,
            model,
            use_gpu,
            metrics_to_score=metrics_to_score,
        )
    elif model.rainbow.categorical:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        return C51Trainer(
            q_network,
            q_network_target,
            model,
            use_gpu,
            metrics_to_score=metrics_to_score,
        )
    else:
        return DQNTrainer(
            q_network,
            q_network_target,
            reward_network,
            model,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
        )
def get_modular_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    categorical,
    quantile,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    assert not quantile or not categorical
    parameters = self.get_sarsa_parameters(
        environment, reward_shape, dueling, categorical, quantile, clip_grad_norm
    )

    if quantile:
        if dueling:
            q_network = DuelingQuantileDQN(
                layers=[get_num_output_features(environment.normalization)]
                + parameters.training.layers[1:-1]
                + [len(environment.ACTIONS)],
                activations=parameters.training.activations,
                num_atoms=parameters.rainbow.num_atoms,
            )
        else:
            q_network = QuantileDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                num_atoms=parameters.rainbow.num_atoms,
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
    elif categorical:
        assert not dueling
        q_network = CategoricalDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            num_atoms=parameters.rainbow.num_atoms,
            qmin=-100,
            qmax=200,
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
    else:
        if dueling:
            q_network = DuelingQNetwork(
                layers=[get_num_output_features(environment.normalization)]
                + parameters.training.layers[1:-1]
                + [len(environment.ACTIONS)],
                activations=parameters.training.activations,
            )
        else:
            q_network = FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )

    q_network_cpe, q_network_cpe_target, reward_network = None, None, None
    if parameters.evaluation and parameters.evaluation.calc_cpe_in_training:
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        q_network_cpe_target = q_network_cpe.get_target_network()
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )

    if use_gpu:
        q_network = q_network.cuda()
        if parameters.evaluation.calc_cpe_in_training:
            reward_network = reward_network.cuda()
            q_network_cpe = q_network_cpe.cuda()
            q_network_cpe_target = q_network_cpe_target.cuda()
        if use_all_avail_gpus and not categorical:
            q_network = q_network.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
            q_network_cpe = q_network_cpe.get_distributed_data_parallel_model()
            q_network_cpe_target = (
                q_network_cpe_target.get_distributed_data_parallel_model()
            )

    if quantile:
        trainer = QRDQNTrainer(
            q_network,
            q_network.get_target_network(),
            parameters,
            use_gpu,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    elif categorical:
        trainer = C51Trainer(
            q_network, q_network.get_target_network(), parameters, use_gpu
        )
    else:
        parameters = DQNTrainerParameters.from_discrete_action_model_parameters(
            parameters
        )
        trainer = DQNTrainer(
            q_network,
            q_network.get_target_network(),
            reward_network,
            parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    return trainer
def get_modular_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    categorical,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    parameters = self.get_sarsa_parameters(
        environment, reward_shape, dueling, categorical, clip_grad_norm
    )
    if not categorical:
        q_network = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
    else:
        q_network = CategoricalDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            num_atoms=51,
            qmin=-100,
            qmax=200,
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )

    if use_gpu:
        q_network = q_network.cuda()
        if not categorical:
            # The CPE and reward networks only exist in the non-categorical branch
            reward_network = reward_network.cuda()
            q_network_cpe = q_network_cpe.cuda()
        if use_all_avail_gpus and not categorical:
            q_network = q_network.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
            q_network_cpe = q_network_cpe.get_distributed_data_parallel_model()

    if not categorical:
        trainer = DQNTrainer(
            q_network,
            q_network.get_target_network(),
            reward_network,
            parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe.get_target_network(),
        )
    else:
        trainer = C51Trainer(
            q_network, q_network.get_target_network(), parameters, use_gpu
        )
    return trainer
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []

    if model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(
            layers=[get_num_output_features(normalization_parameters)]
            + model.training.layers[1:-1]
            + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    return DQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        model,
        use_gpu,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=metrics_to_score,
    )