def make_fully_connected(
    cls,
    state_dim: int,
    action_dim: int,
    layers: List[int],
    activations: List[str],
    use_batch_norm: bool = False,
):
    """Build a ParametricDuelingQNetwork out of fully-connected pieces.

    The last entry of ``layers`` is the shared state-embedding width; the
    advantage and value heads each get one hidden layer of half that width.

    Args:
        state_dim: dimension of the state input.
        action_dim: dimension of the action input to the advantage head.
        layers: hidden sizes; the final entry is the state embedding size.
        activations: one activation name per entry of ``layers``.
        use_batch_norm: enable batch norm in the shared trunk.
    """
    # Guard against malformed specs up front, mirroring the sibling
    # make_fully_connected builder in this codebase.
    assert len(layers) > 0, "Must have at least one layer"
    state_embedding_dim = layers[-1]
    assert state_embedding_dim % 2 == 0, "The last size must be divisible by 2"
    shared_network = FullyConnectedDQN(
        state_dim,
        state_embedding_dim,
        sizes=layers[:-1],
        activations=activations[:-1],
        normalized_output=True,
        # Bug fix: use_batch_norm was accepted but silently ignored before.
        use_batch_norm=use_batch_norm,
    )
    advantage_network = FullyConnectedCritic(
        state_embedding_dim,
        action_dim,
        sizes=[state_embedding_dim // 2],
        activations=activations[-1:],
    )
    value_network = FullyConnectedDQN(
        state_embedding_dim,
        1,
        sizes=[state_embedding_dim // 2],
        activations=activations[-1:],
    )
    return ParametricDuelingQNetwork(
        shared_network=shared_network,
        advantage_network=advantage_network,
        value_network=value_network,
    )
def test_forward_pass(self):
    """End-to-end BCQ forward pass with hand-set deterministic weights.

    With input state 2.0 the q-network's hidden layer is relu([2.4, 1.8])
    and its output is [0.3, 4.2]; the imitator's logits are [3.9, 6.0], so
    softmax gives ~[0.109, 0.891].  Action 0 falls below the 0.20 drop
    threshold and must be masked to -1e10 by the BCQ model.
    """
    state_dim = 1
    action_dim = 2
    input = PreprocessedState.from_tensor(state=torch.tensor([[2.0]]))
    bcq_drop_threshold = 0.20
    q_network = FullyConnectedDQN(state_dim, action_dim, sizes=[2], activations=["relu"])
    # Set weights of q-network to make it deterministic
    q_net_layer_0_w = torch.tensor([[1.2], [0.9]])
    q_network.state_dict()["fc.layers.0.weight"].data.copy_(
        q_net_layer_0_w)
    q_net_layer_0_b = torch.tensor([0.0, 0.0])
    q_network.state_dict()["fc.layers.0.bias"].data.copy_(q_net_layer_0_b)
    q_net_layer_1_w = torch.tensor([[0.5, -0.5], [1.0, 1.0]])
    q_network.state_dict()["fc.layers.1.weight"].data.copy_(
        q_net_layer_1_w)
    q_net_layer_1_b = torch.tensor([0.0, 0.0])
    q_network.state_dict()["fc.layers.1.bias"].data.copy_(q_net_layer_1_b)
    imitator_network = FullyConnectedNetwork(
        layers=[state_dim, 2, action_dim], activations=["relu", "linear"])
    # Set weights of imitator network to make it deterministic
    im_net_layer_0_w = torch.tensor([[1.2], [0.9]])
    imitator_network.state_dict()["layers.0.weight"].data.copy_(
        im_net_layer_0_w)
    im_net_layer_0_b = torch.tensor([0.0, 0.0])
    imitator_network.state_dict()["layers.0.bias"].data.copy_(
        im_net_layer_0_b)
    im_net_layer_1_w = torch.tensor([[0.5, 1.5], [1.0, 2.0]])
    imitator_network.state_dict()["layers.1.weight"].data.copy_(
        im_net_layer_1_w)
    im_net_layer_1_b = torch.tensor([0.0, 0.0])
    imitator_network.state_dict()["layers.1.bias"].data.copy_(
        im_net_layer_1_b)
    imitator_probs = torch.nn.functional.softmax(imitator_network(
        input.state.float_features), dim=1)
    bcq_mask = imitator_probs < bcq_drop_threshold
    # Only action 0's probability (~0.109) is under the 0.20 threshold.
    assert bcq_mask[0][0] == 1
    assert bcq_mask[0][1] == 0
    model = BatchConstrainedDQN(
        state_dim=state_dim,
        q_network=q_network,
        imitator_network=imitator_network,
        bcq_drop_threshold=bcq_drop_threshold,
    )
    final_q_values = model(input)
    # Masked action is forced to -1e10; the surviving Q-value is 4.2.
    assert final_q_values.q_values[0][0] == -1e10
    assert abs(final_q_values.q_values[0][1] - 4.2) < 0.0001
def test_save_load_batch_norm(self):
    """Verify that a batch-norm FullyConnectedDQN survives save/load."""
    model = FullyConnectedDQN(
        8,  # state_dim
        4,  # action_dim
        sizes=[8, 4],
        activations=["relu", "relu"],
        use_batch_norm=True,
    )
    # Eval mode freezes batch-norm statistics before serialization.
    model.eval()
    # Batch norm contributes extra parameters: 21 total, 1 input, 1 output.
    check_save_load(self, model, 21, 1, 1)
def test_discrete_wrapper_with_id_list_none(self):
    """The predictor wrapper must match preprocessing + scoring done by hand."""
    norm_params = {feature_id: _cont_norm() for feature_id in range(1, 5)}
    preprocessor = Preprocessor(norm_params, False)
    dqn = FullyConnectedDQN(
        state_dim=len(norm_params),
        action_dim=2,
        sizes=[16],
        activations=["relu"],
    )
    dqn_with_preprocessor = DiscreteDqnWithPreprocessorWithIdList(
        dqn, preprocessor)
    action_names = ["L", "R"]
    wrapper = DiscreteDqnPredictorWrapperWithIdList(
        dqn_with_preprocessor, action_names)
    prototype = dqn_with_preprocessor.input_prototype()
    names_out, q_values = wrapper(*prototype)
    self.assertEqual(action_names, names_out)
    self.assertEqual(q_values.shape, (1, 2))
    # Running the preprocessor and DQN directly should give identical scores.
    expected = dqn(
        rlt.PreprocessedState.from_tensor(preprocessor(*prototype[0]))
    ).q_values
    self.assertTrue((expected == q_values).all())
def test_basic(self):
    """Smoke-test the forward pass of a batch-norm FullyConnectedDQN."""
    state_dim, action_dim = 8, 4
    model = FullyConnectedDQN(
        state_dim,
        action_dim,
        sizes=[8, 4],
        activations=["relu", "relu"],
        use_batch_norm=True,
    )
    prototype = model.input_prototype()
    self.assertEqual((1, state_dim), prototype.state.float_features.shape)
    # Batch norm needs more than one sample while training; eval mode
    # sidesteps that for this single-example check.
    model.eval()
    output = model(prototype)
    self.assertEqual((1, action_dim), output.q_values.shape)
def test_forward_pass(self):
    """BCQ should replace low-imitator-probability actions with -1e10."""
    torch.manual_seed(123)
    state_dim, action_dim = 1, 2
    state = rlt.FeatureData(torch.tensor([[2.0]]))
    threshold = 0.20
    q_network = FullyConnectedDQN(
        state_dim, action_dim, sizes=[2], activations=["relu"]
    )
    # Pin the output bias so surviving Q-values are known exactly.
    init.constant_(q_network.fc.dnn[-2].bias, 3.0)
    imitator_network = FullyConnectedNetwork(
        layers=[state_dim, 2, action_dim], activations=["relu", "linear"]
    )
    probs = torch.nn.functional.softmax(
        imitator_network(state.float_features), dim=1
    )
    mask = probs < threshold
    # With this seed, only action 0 falls under the drop threshold.
    npt.assert_array_equal(mask.detach(), [[True, False]])
    model = BatchConstrainedDQN(
        state_dim=state_dim,
        q_network=q_network,
        imitator_network=imitator_network,
        bcq_drop_threshold=threshold,
    )
    result = model(state)
    # Masked action forced to -1e10; the kept action retains its Q-value.
    npt.assert_array_equal(result.detach(), [[-1e10, 3.0]])
def setUp(self):
    """Build the networks and fixtures shared by the QR-DQN trainer tests."""
    # preparing various components for qr-dqn trainer initialization
    self.params = QRDQNTrainerParameters(actions=["1", "2"], num_atoms=11)
    self.reward_options = RewardOptions()
    self.metrics_to_score = get_metrics_to_score(
        self.reward_options.metric_reward_values
    )
    self.state_dim = 10
    self.action_dim = 2
    self.sizes = [20, 20]
    self.num_atoms = 11
    self.activations = ["relu", "relu"]
    self.dropout_ratio = 0
    # Main Q-network (built with num_atoms) and its target copy.
    self.q_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        num_atoms=self.num_atoms,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q_network_target = self.q_network.get_target_network()
    # A batch of 5 random states used by forward-pass checks.
    self.x = FeatureData(float_features=torch.rand(5, 10))
    self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
    # CPE heads emit one output per (metric + reward) per action.
    self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
        # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
        self.params.actions
    )
    self.reward_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe_target = self.q_network_cpe.get_target_network()
def make_fully_connected(
    cls,
    state_dim: int,
    action_dim: int,
    layers: List[int],
    activations: List[str],
    num_atoms: Optional[int] = None,
    use_batch_norm: bool = False,
):
    """Construct a dueling Q-network from a fully-connected layer spec.

    The final entry of ``layers`` is the shared state-embedding width; the
    advantage and value heads each get one hidden layer of half that width.
    """
    assert len(layers) > 0, "Must have at least one layer"
    state_embedding_dim = layers[-1]
    assert state_embedding_dim % 2 == 0, "The last size must be divisible by 2"
    head_sizes = [state_embedding_dim // 2]
    head_activations = activations[-1:]
    trunk = FullyConnectedDQN(
        state_dim,
        state_embedding_dim,
        sizes=layers[:-1],
        activations=activations[:-1],
        normalized_output=True,
        use_batch_norm=use_batch_norm,
    )
    advantage_head = FullyConnectedDQN(
        state_embedding_dim,
        action_dim,
        sizes=head_sizes,
        activations=head_activations,
        num_atoms=num_atoms,
    )
    value_head = FullyConnectedDQN(
        state_embedding_dim,
        1,
        sizes=head_sizes,
        activations=head_activations,
        num_atoms=num_atoms,
    )
    return cls(
        shared_network=trunk,
        advantage_network=advantage_head,
        value_network=value_head,
    )
def test_save_load(self):
    """Verify that a plain (no batch norm) FullyConnectedDQN survives save/load."""
    model = FullyConnectedDQN(
        8,  # state_dim
        4,  # action_dim
        sizes=[8, 4],
        activations=["relu", "relu"],
        use_batch_norm=False,
    )
    # Expect 6 saved parameters, one graph input, and one graph output.
    check_save_load(self, model, 6, 1, 1)
def build_q_network(
    self,
    state_feature_config: rlt.ModelFeatureConfig,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    output_dim: int,
) -> ModelBase:
    """Create a fully-connected Q-network sized from the normalization params."""
    return FullyConnectedDQN(
        state_dim=self._get_input_dim(state_normalization_parameters),
        action_dim=output_dim,
        sizes=self.sizes,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
def build_q_network(
    self,
    state_normalization_data: NormalizationData,
    output_dim: int,
    num_atoms: int,
) -> ModelBase:
    """Create a fully-connected Q-network with ``num_atoms`` outputs per action."""
    input_dim = self._get_input_dim(state_normalization_data)
    return FullyConnectedDQN(
        state_dim=input_dim,
        action_dim=output_dim,
        sizes=self.sizes,
        num_atoms=num_atoms,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
def build_q_network(
    self,
    state_feature_config: rlt.ModelFeatureConfig,
    state_normalization_data: NormalizationData,
    output_dim: int,
) -> ModelBase:
    """Create a fully-connected Q-network, optionally with batch norm."""
    return FullyConnectedDQN(
        state_dim=self._get_input_dim(state_normalization_data),
        action_dim=output_dim,
        sizes=self.sizes,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
        use_batch_norm=self.use_batch_norm,
    )
def test_save_load(self):
    """BCQ model (DQN + imitator + constants) should survive save/load."""
    state_dim, action_dim = 8, 4
    q_network = FullyConnectedDQN(
        state_dim, action_dim, sizes=[8, 4], activations=["relu", "relu"]
    )
    imitator_network = FullyConnectedNetwork(
        layers=[state_dim, 8, 4, action_dim],
        activations=["relu", "relu", "linear"],
    )
    model = BatchConstrainedDQN(
        state_dim=state_dim,
        q_network=q_network,
        imitator_network=imitator_network,
        bcq_drop_threshold=0.05,
    )
    # 6 params for the DQN + 6 for the imitator + 2 BCQ constants = 14.
    check_save_load(self, model, 14, 1, 1)
def test_basic(self):
    """Smoke-test BatchConstrainedDQN's input prototype and forward shape."""
    state_dim, action_dim = 8, 4
    q_network = FullyConnectedDQN(
        state_dim, action_dim, sizes=[8, 4], activations=["relu", "relu"]
    )
    imitator_network = FullyConnectedNetwork(
        layers=[state_dim, 8, 4, action_dim],
        activations=["relu", "relu", "linear"],
    )
    model = BatchConstrainedDQN(
        state_dim=state_dim,
        q_network=q_network,
        imitator_network=imitator_network,
        bcq_drop_threshold=0.05,
    )
    proto = model.input_prototype()
    self.assertEqual((1, state_dim), proto.state.float_features.shape)
    scores = model(proto)
    self.assertEqual((1, action_dim), scores.q_values.shape)
def build_q_network(
    self,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    output_dim: int,
    num_atoms: int,
    qmin: int,
    qmax: int,
) -> ModelBase:
    """Build a CategoricalDQN over a fully-connected distributional backbone."""
    backbone = FullyConnectedDQN(
        state_dim=self._get_input_dim(state_normalization_parameters),
        action_dim=output_dim,
        num_atoms=num_atoms,
        sizes=self.sizes,
        activations=self.activations,
        use_batch_norm=False,
        dropout_ratio=0.0,
    )
    return CategoricalDQN(backbone, qmin=qmin, qmax=qmax, num_atoms=num_atoms)
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
    metrics_to_score=None,
):
    """Build a discrete-action trainer (QR-DQN, C51, or DQN) from parameters.

    The q-network architecture is selected from ``model.rainbow`` flags:
    quantile -> QuantileDQN, categorical -> CategoricalDQN, dueling ->
    DuelingQNetwork, otherwise a plain FullyConnectedDQN.  CPE networks are
    built only when ``model.evaluation.calc_cpe_in_training`` is set.
    """
    metrics_to_score = metrics_to_score or []
    if model.rainbow.quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    elif model.rainbow.categorical:
        q_network = CategoricalDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            qmin=model.rainbow.qmin,
            qmax=model.rainbow.qmax,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
            use_gpu=use_gpu,
        )
    elif model.rainbow.dueling_architecture:
        # DuelingQNetwork takes the full layer list, including input/output.
        q_network = DuelingQNetwork(  # type: ignore
            layers=[get_num_output_features(normalization_parameters)]
            + model.training.layers[1:-1]
            + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()
    q_network_target = q_network.get_target_network()
    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()
        q_network_cpe_target = q_network_cpe.get_target_network()
    # Distributed data-parallel wrapping is only supported for the
    # non-distributional variants (see asserts below).
    if (
        use_all_avail_gpus
        and not model.rainbow.categorical
        and not model.rainbow.quantile
    ):
        q_network = q_network.get_distributed_data_parallel_model()
        reward_network = (
            reward_network.get_distributed_data_parallel_model()
            if reward_network
            else None
        )
        q_network_cpe = (
            q_network_cpe.get_distributed_data_parallel_model()
            if q_network_cpe
            else None
        )
    if model.rainbow.quantile:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        parameters = QRDQNTrainerParameters.from_discrete_action_model_parameters(
            model)
        return QRDQNTrainer(
            q_network,
            q_network_target,
            parameters,
            use_gpu,
            metrics_to_score=metrics_to_score,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    elif model.rainbow.categorical:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        return C51Trainer(
            q_network,
            q_network_target,
            C51TrainerParameters.from_discrete_action_model_parameters(model),
            use_gpu,
            metrics_to_score=metrics_to_score,
        )
    else:
        parameters = DQNTrainerParameters.from_discrete_action_model_parameters(
            model)
        return DQNTrainer(
            q_network,
            q_network_target,
            reward_network,
            parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
        )
class TestCRR(unittest.TestCase):
    """Unit tests for DiscreteCRRTrainer (actor + twin critics + CPE heads)."""

    def setUp(self):
        """Build networks, trainer params, and a 3-sample input batch."""
        # preparing various components for the CRR trainer initialization
        self.batch_size = 3
        self.state_dim = 10
        self.action_dim = 2
        self.num_layers = 2
        self.sizes = [20 for _ in range(self.num_layers)]
        self.num_atoms = 11
        self.activations = ["relu" for _ in range(self.num_layers)]
        self.dropout_ratio = 0
        self.exploration_variance = 1e-10
        self.actions = [str(i) for i in range(self.action_dim)]
        self.params = CRRTrainerParameters(actions=self.actions)
        self.reward_options = RewardOptions()
        self.metrics_to_score = get_metrics_to_score(
            self.reward_options.metric_reward_values
        )
        self.actor_network = FullyConnectedActor(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            exploration_variance=self.exploration_variance,
        )
        self.actor_network_target = self.actor_network.get_target_network()
        self.q1_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q1_network_target = self.q1_network.get_target_network()
        self.q2_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q2_network_target = self.q2_network.get_target_network()
        # CPE heads emit one output per (metric + reward) per action.
        self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            self.params.actions
        )
        self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
        self.reward_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe_target = self.q_network_cpe.get_target_network()
        # A fixed 3-sample batch reused by every train/validation test below.
        self.inp = DiscreteDqnInput(
            state=FeatureData(
                float_features=torch.rand(self.batch_size, self.state_dim)
            ),
            next_state=FeatureData(
                float_features=torch.rand(self.batch_size, self.state_dim)
            ),
            reward=torch.ones(self.batch_size, 1),
            time_diff=torch.ones(self.batch_size, 1) * 2,
            step=torch.ones(self.batch_size, 1) * 2,
            not_terminal=torch.ones(
                self.batch_size, 1
            ),  # todo: check terminal behavior
            action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
            next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
            possible_actions_mask=torch.ones(self.batch_size, self.action_dim),
            possible_next_actions_mask=torch.ones(self.batch_size, self.action_dim),
            extras=ExtraData(action_probability=torch.ones(self.batch_size, 1)),
        )

    @staticmethod
    def dummy_log(*args, **kwargs):
        # Replaces calls to self.log(), which otherwise require the PyTorch
        # Lightning trainer to be initialized.
        return None

    def _construct_trainer(self, new_params=None, no_cpe=False, no_q2=False):
        """Helper: build a DiscreteCRRTrainer, optionally without CPE or q2."""
        trainer = DiscreteCRRTrainer(
            actor_network=self.actor_network,
            actor_network_target=self.actor_network_target,
            q1_network=self.q1_network,
            q1_network_target=self.q1_network_target,
            q2_network=(None if no_q2 else self.q2_network),
            q2_network_target=(None if no_q2 else self.q2_network_target),
            reward_network=(None if no_cpe else self.reward_network),
            q_network_cpe=(None if no_cpe else self.q_network_cpe),
            q_network_cpe_target=(None if no_cpe else self.q_network_cpe_target),
            metrics_to_score=self.metrics_to_score,
            evaluation=EvaluationParameters(
                calc_cpe_in_training=(False if no_cpe else True)
            ),
            # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `asdict`.
            **(new_params if new_params is not None else self.params).asdict()
        )
        trainer.log = self.dummy_log
        return trainer

    def test_init(self):
        """Default reward boosts are zeros; explicit boost params are honored."""
        trainer = self._construct_trainer()
        self.assertTrue((torch.isclose(trainer.reward_boosts, torch.zeros(2))).all())
        param_copy = CRRTrainerParameters(
            actions=self.actions,
            rl=RLParameters(reward_boost={i: int(i) + 1 for i in self.actions}),
        )
        reward_boost_trainer = self._construct_trainer(new_params=param_copy)
        self.assertTrue(
            (
                torch.isclose(
                    reward_boost_trainer.reward_boosts, torch.tensor([1.0, 2.0])
                )
            ).all()
        )

    def test_train_step_gen(self):
        """The generator yields the expected number and kind of losses."""
        # grad_fn types act as fingerprints for which op produced each loss.
        mse_backward_type = type(
            torch.nn.functional.mse_loss(
                torch.tensor([1.0], requires_grad=True), torch.zeros(1)
            ).grad_fn
        )
        add_backward_type = type(
            (
                torch.tensor([1.0], requires_grad=True)
                + torch.tensor([1.0], requires_grad=True)
            ).grad_fn
        )
        # vanilla
        trainer = self._construct_trainer()
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 6)
        self.assertEqual(type(losses[0].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[1].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[2].grad_fn), add_backward_type)
        self.assertEqual(type(losses[3].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[4].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[5].grad_fn), add_backward_type)
        # no CPE
        trainer = self._construct_trainer(no_cpe=True)
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)
        # no q2 net
        trainer = self._construct_trainer(no_q2=True)
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 5)
        # use_target_actor
        params_copy = CRRTrainerParameters(actions=self.actions, use_target_actor=True)
        trainer = self._construct_trainer(new_params=params_copy)
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 6)
        # delayed policy update
        params_copy = CRRTrainerParameters(
            actions=self.actions, delayed_policy_update=2
        )
        trainer = self._construct_trainer(new_params=params_copy)
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 6)
        # With delayed_policy_update=2, the actor loss slot is None at batch 1.
        self.assertEqual(losses[2], None)
        # entropy
        params_copy = CRRTrainerParameters(actions=self.actions, entropy_coeff=1.0)
        trainer = self._construct_trainer(new_params=params_copy)
        loss_gen = trainer.train_step_gen(self.inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 6)

    def test_q_network_property(self):
        """The q_network property aliases q1_network."""
        trainer = self._construct_trainer()
        self.assertEqual(trainer.q_network, trainer.q1_network)

    def test_configure_optimizers(self):
        """Optimizers line up one-to-one with the networks yielded in training."""
        trainer = self._construct_trainer()
        optimizers = trainer.configure_optimizers()
        self.assertEqual(len(optimizers), 6)
        train_step_yield_order = [
            trainer.q1_network,
            trainer.q2_network,
            trainer.actor_network,
            trainer.reward_network,
            trainer.q_network_cpe,
            trainer.q1_network,
        ]
        for i in range(len(train_step_yield_order)):
            opt_param = optimizers[i]["optimizer"].param_groups[0]["params"][0]
            loss_param = list(train_step_yield_order[i].parameters())[0]
            self.assertTrue(torch.all(torch.isclose(opt_param, loss_param)))
        trainer = self._construct_trainer(no_cpe=True)
        optimizers = trainer.configure_optimizers()
        self.assertEqual(len(optimizers), 4)
        trainer = self._construct_trainer(no_q2=True)
        optimizers = trainer.configure_optimizers()
        self.assertEqual(len(optimizers), 5)

    def test_get_detached_model_outputs(self):
        """Detached action scores have shape (batch_size, action_dim)."""
        trainer = self._construct_trainer()
        action_scores, _ = trainer.get_detached_model_outputs(
            FeatureData(float_features=torch.rand(self.batch_size, self.state_dim))
        )
        self.assertEqual(action_scores.shape[0], self.batch_size)
        self.assertEqual(action_scores.shape[1], self.action_dim)

    def test_validation_step(self):
        """validation_step's EDP q-values should match the actor's outputs."""
        trainer = self._construct_trainer()
        edp = trainer.validation_step(self.inp, batch_idx=1)
        out = trainer.actor_network(self.inp.state)
        # Note: in current code EDP assumes policy induced by q-net instead of actor
        self.assertTrue(torch.all(torch.isclose(edp.optimal_q_values, out.action)))
def setUp(self):
    """Build networks, trainer params, and a 3-sample batch for CRR tests."""
    # preparing various components for the CRR trainer initialization
    self.batch_size = 3
    self.state_dim = 10
    self.action_dim = 2
    self.num_layers = 2
    self.sizes = [20 for _ in range(self.num_layers)]
    self.num_atoms = 11
    self.activations = ["relu" for _ in range(self.num_layers)]
    self.dropout_ratio = 0
    self.exploration_variance = 1e-10
    self.actions = [str(i) for i in range(self.action_dim)]
    self.params = CRRTrainerParameters(actions=self.actions)
    self.reward_options = RewardOptions()
    self.metrics_to_score = get_metrics_to_score(
        self.reward_options.metric_reward_values
    )
    self.actor_network = FullyConnectedActor(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        exploration_variance=self.exploration_variance,
    )
    self.actor_network_target = self.actor_network.get_target_network()
    # Twin critics (q1/q2) with frozen target copies.
    self.q1_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q1_network_target = self.q1_network.get_target_network()
    self.q2_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q2_network_target = self.q2_network.get_target_network()
    # CPE heads emit one output per (metric + reward) per action.
    self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
        self.params.actions
    )
    self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
    self.reward_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe_target = self.q_network_cpe.get_target_network()
    # A fixed 3-sample batch reused by the train/validation tests.
    self.inp = DiscreteDqnInput(
        state=FeatureData(
            float_features=torch.rand(self.batch_size, self.state_dim)
        ),
        next_state=FeatureData(
            float_features=torch.rand(self.batch_size, self.state_dim)
        ),
        reward=torch.ones(self.batch_size, 1),
        time_diff=torch.ones(self.batch_size, 1) * 2,
        step=torch.ones(self.batch_size, 1) * 2,
        not_terminal=torch.ones(
            self.batch_size, 1
        ),  # todo: check terminal behavior
        action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
        next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
        possible_actions_mask=torch.ones(self.batch_size, self.action_dim),
        possible_next_actions_mask=torch.ones(self.batch_size, self.action_dim),
        extras=ExtraData(action_probability=torch.ones(self.batch_size, 1)),
    )
class TestQRDQN(unittest.TestCase):
    """Unit tests for QRDQNTrainer (quantile-regression DQN)."""

    def setUp(self):
        """Build networks and fixtures shared by the QR-DQN trainer tests."""
        # preparing various components for qr-dqn trainer initialization
        self.params = QRDQNTrainerParameters(actions=["1", "2"], num_atoms=11)
        self.reward_options = RewardOptions()
        self.metrics_to_score = get_metrics_to_score(
            self.reward_options.metric_reward_values
        )
        self.state_dim = 10
        self.action_dim = 2
        self.sizes = [20, 20]
        self.num_atoms = 11
        self.activations = ["relu", "relu"]
        self.dropout_ratio = 0
        self.q_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            num_atoms=self.num_atoms,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q_network_target = self.q_network.get_target_network()
        # A batch of 5 random states used by forward-pass checks.
        self.x = FeatureData(float_features=torch.rand(5, 10))
        self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
        # CPE heads emit one output per (metric + reward) per action.
        self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
            self.params.actions
        )
        self.reward_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe_target = self.q_network_cpe.get_target_network()

    def _construct_trainer(self, new_params=None, no_cpe=False):
        """Helper: build a QRDQNTrainer, optionally without the CPE heads."""
        reward_network = self.reward_network
        q_network_cpe = self.q_network_cpe
        q_network_cpe_target = self.q_network_cpe_target
        evaluation = self.eval_parameters
        params = self.params
        if new_params is not None:
            params = new_params
        if no_cpe:
            reward_network = q_network_cpe = q_network_cpe_target = None
            evaluation = EvaluationParameters(calc_cpe_in_training=False)
        return QRDQNTrainer(
            q_network=self.q_network,
            q_network_target=self.q_network_target,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=self.metrics_to_score,
            evaluation=evaluation,
            # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `asdict`.
            **params.asdict()
        )

    def test_init(self):
        """Quantile midpoints and reward boosts are initialized as expected."""
        trainer = self._construct_trainer()
        # Quantile midpoints: (i + 0.5) / num_atoms for i in [0, num_atoms).
        quantiles = (0.5 + torch.arange(self.num_atoms).float()) / float(self.num_atoms)
        self.assertTrue((torch.isclose(trainer.quantiles, quantiles)).all())
        self.assertTrue((torch.isclose(trainer.reward_boosts, torch.zeros(2))).all())
        param_copy = QRDQNTrainerParameters(
            actions=["1", "2"],
            num_atoms=11,
            rl=RLParameters(reward_boost={"1": 1, "2": 2}),
        )
        reward_boost_trainer = self._construct_trainer(new_params=param_copy)
        self.assertTrue(
            (
                torch.isclose(
                    reward_boost_trainer.reward_boosts, torch.tensor([1.0, 2.0])
                )
            ).all()
        )

    def test_train_step_gen(self):
        """The generator yields the expected losses under several RL configs."""
        inp = DiscreteDqnInput(
            state=FeatureData(float_features=torch.rand(3, 10)),
            next_state=FeatureData(float_features=torch.rand(3, 10)),
            reward=torch.ones(3, 1),
            time_diff=torch.ones(3, 1) * 2,
            step=torch.ones(3, 1) * 2,
            not_terminal=torch.ones(3, 1),  # todo: check terminal behavior
            action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
            next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
            possible_actions_mask=torch.ones(3, 2),
            possible_next_actions_mask=torch.ones(3, 2),
            extras=ExtraData(),
        )
        # grad_fn types act as fingerprints for which op produced each loss.
        mse_backward_type = type(
            torch.nn.functional.mse_loss(
                torch.tensor([1.0], requires_grad=True), torch.zeros(1)
            ).grad_fn
        )
        add_backward_type = type(
            (
                torch.tensor([1.0], requires_grad=True)
                + torch.tensor([1.0], requires_grad=True)
            ).grad_fn
        )
        mean_backward_type = type(
            torch.tensor([1.0, 2.0], requires_grad=True).mean().grad_fn
        )
        # vanilla
        trainer = self._construct_trainer()
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)
        self.assertEqual(type(losses[0].grad_fn), mean_backward_type)
        self.assertEqual(type(losses[1].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[2].grad_fn), mse_backward_type)
        self.assertEqual(type(losses[3].grad_fn), add_backward_type)
        # no CPE
        trainer = self._construct_trainer(no_cpe=True)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 2)
        # seq_num
        param_copy = QRDQNTrainerParameters(
            actions=["1", "2"],
            num_atoms=11,
            rl=RLParameters(use_seq_num_diff_as_time_diff=True),
        )
        trainer = self._construct_trainer(new_params=param_copy)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)
        # multi_steps
        param_copy = QRDQNTrainerParameters(
            actions=["1", "2"], num_atoms=11, rl=RLParameters(multi_steps=2)
        )
        trainer = self._construct_trainer(new_params=param_copy)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)
        # non_max_q
        param_copy = QRDQNTrainerParameters(
            actions=["1", "2"], num_atoms=11, rl=RLParameters(maxq_learning=False)
        )
        trainer = self._construct_trainer(new_params=param_copy)
        loss_gen = trainer.train_step_gen(inp, batch_idx=1)
        losses = list(loss_gen)
        self.assertEqual(len(losses), 4)

    def test_configure_optimizers(self):
        """Optimizers line up one-to-one with the networks yielded in training."""
        trainer = self._construct_trainer()
        optimizers = trainer.configure_optimizers()
        self.assertEqual(len(optimizers), 4)
        train_step_yield_order = [
            trainer.q_network,
            trainer.reward_network,
            trainer.q_network_cpe,
            trainer.q_network,
        ]
        for i in range(len(train_step_yield_order)):
            opt_param = optimizers[i]["optimizer"].param_groups[0]["params"][0]
            loss_param = list(train_step_yield_order[i].parameters())[0]
            self.assertTrue(torch.all(torch.isclose(opt_param, loss_param)))
        trainer = self._construct_trainer(no_cpe=True)
        optimizers = trainer.configure_optimizers()
        self.assertEqual(len(optimizers), 2)

    def test_get_detached_model_outputs(self):
        """Detached Q outputs and targets have matching (batch, action) shapes."""
        trainer = self._construct_trainer()
        q_out, q_target = trainer.get_detached_model_outputs(self.x)
        # NOTE(review): the third positional argument to assertEqual is the
        # failure *message*, not a third comparand — the literals 3 and 2 are
        # not actually checked here.  Worth fixing to explicit shape asserts.
        self.assertEqual(q_out.shape[0], q_target.shape[0], 3)
        self.assertEqual(q_out.shape[1], q_target.shape[1], 2)