def build_trainer(
    self,
    normalization_data_map: Dict[str, NormalizationData],
    use_gpu: bool,
    reward_options: Optional[RewardOptions] = None,
) -> ParametricDQNTrainer:
    net_builder = self.net_builder.value
    # pyre-fixme[16]: `ParametricDQN` has no attribute `_q_network`.
    self._q_network = net_builder.build_q_network(
        normalization_data_map[NormalizationKey.STATE],
        normalization_data_map[NormalizationKey.ACTION],
    )

    # Metrics + reward
    reward_options = reward_options or RewardOptions()
    metrics_to_score = get_metrics_to_score(reward_options.metric_reward_values)
    reward_output_dim = len(metrics_to_score) + 1
    reward_network = net_builder.build_q_network(
        normalization_data_map[NormalizationKey.STATE],
        normalization_data_map[NormalizationKey.ACTION],
        output_dim=reward_output_dim,
    )

    q_network_target = self._q_network.get_target_network()
    return ParametricDQNTrainer(
        q_network=self._q_network,
        q_network_target=q_network_target,
        reward_network=reward_network,
        # pyre-fixme[16]: `ParametricDQNTrainerParameters` has no attribute `asdict`.
        **self.trainer_param.asdict(),
    )
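# Illustrative sketch (not part of the source, helper name is hypothetical):
# reward_output_dim is len(metrics_to_score) + 1 because, as
# create_from_tensors_parametric_dqn below slices it, column 0 of the reward
# network output is the predicted reward and the remaining columns are the
# per-metric predictions.
def _split_reward_head_sketch(reward_and_metrics: torch.Tensor):
    predicted_reward = reward_and_metrics[:, :1]   # shape: (batch, 1)
    predicted_metrics = reward_and_metrics[:, 1:]  # shape: (batch, num_metrics)
    return predicted_reward, predicted_metrics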
def build_trainer(self) -> ParametricDQNTrainer:
    net_builder = self.net_builder.value
    # pyre-fixme[16]: `ParametricDQN` has no attribute `_q_network`.
    self._q_network = net_builder.build_q_network(
        self.state_normalization_data, self.action_normalization_data
    )

    # Metrics + reward
    reward_output_dim = len(self.metrics_to_score) + 1
    reward_network = net_builder.build_q_network(
        self.state_normalization_data,
        self.action_normalization_data,
        output_dim=reward_output_dim,
    )

    if self.use_gpu:
        self._q_network = self._q_network.cuda()
        reward_network = reward_network.cuda()

    q_network_target = self._q_network.get_target_network()
    # pyre-fixme[29]: `Type[ParametricDQNTrainer]` is not a function.
    return ParametricDQNTrainer(
        q_network=self._q_network,
        q_network_target=q_network_target,
        reward_network=reward_network,
        use_gpu=self.use_gpu,
        # pyre-fixme[16]: `ParametricDQNTrainerParameters` has no attribute `asdict`.
        **self.trainer_param.asdict(),
    )
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedCritic(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedCritic(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    q_network_target = q_network.get_target_network()

    if use_gpu:
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()

    if use_all_avail_gpus:
        q_network = q_network.get_distributed_data_parallel_model()
        q_network_target = q_network_target.get_distributed_data_parallel_model()
        reward_network = reward_network.get_distributed_data_parallel_model()

    # pyre-fixme[28]: Unexpected keyword argument `rl`.
    trainer_parameters = ParametricDQNTrainerParameters(
        rl=model.rl,
        double_q_learning=model.rainbow.double_q_learning,
        minibatch_size=model.training.minibatch_size,
        optimizer=OptimizerParameters(
            optimizer=model.training.optimizer,
            learning_rate=model.training.learning_rate,
            l2_decay=model.training.l2_decay,
        ),
    )

    return ParametricDQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        use_gpu=use_gpu,
        # pyre-fixme[16]: `ParametricDQNTrainerParameters` has no attribute `asdict`.
        **trainer_parameters.asdict(),
    )
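# Illustrative usage only (not from the source, wrapper name is hypothetical):
# assumes `model` is an already-populated ContinuousActionModelParameters and
# that the per-feature NormalizationParameters dicts have been computed
# elsewhere (e.g. by ReAgent's normalization pipeline). Keyword arguments match
# the factory signature above.
def _example_build_parametric_dqn_trainer(
    model: ContinuousActionModelParameters,
    state_norm: Dict[int, NormalizationParameters],
    action_norm: Dict[int, NormalizationParameters],
) -> ParametricDQNTrainer:
    return create_parametric_dqn_trainer_from_params(
        model=model,
        state_normalization_parameters=state_norm,
        action_normalization_parameters=action_norm,
        use_gpu=torch.cuda.is_available(),
        use_all_avail_gpus=False,
    )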
def create_from_tensors_parametric_dqn(
    cls,
    trainer: ParametricDQNTrainer,
    mdp_ids: torch.Tensor,
    sequence_numbers: torch.Tensor,
    states: rlt.FeatureData,
    actions: rlt.FeatureData,
    propensities: torch.Tensor,
    rewards: torch.Tensor,
    possible_actions_mask: torch.Tensor,
    possible_actions: rlt.FeatureData,
    max_num_actions: int,
    metrics: Optional[torch.Tensor] = None,
):
    old_q_train_state = trainer.q_network.training
    old_reward_train_state = trainer.reward_network.training
    trainer.q_network.train(False)
    trainer.reward_network.train(False)

    tiled_state = states.float_features.repeat(1, max_num_actions).reshape(
        -1, states.float_features.shape[1]
    )
    assert possible_actions is not None
    # Get Q-value of action taken
    possible_actions_state_concat = (rlt.FeatureData(tiled_state), possible_actions)

    # FIXME: model_values, model_values_for_logged_action, and model_metrics_values
    # should be calculated using q_network_cpe (as in discrete dqn).
    # q_network_cpe has not been added in parametric dqn yet.
    model_values = trainer.q_network(*possible_actions_state_concat)
    optimal_q_values, _ = trainer.get_detached_q_values(*possible_actions_state_concat)
    eval_action_idxs = None

    assert (
        model_values.shape[1] == 1
        and model_values.shape[0]
        == possible_actions_mask.shape[0] * possible_actions_mask.shape[1]
    ), (
        "Invalid shapes: "
        + str(model_values.shape)
        + " != "
        + str(possible_actions_mask.shape)
    )
    model_values = model_values.reshape(possible_actions_mask.shape)
    optimal_q_values = optimal_q_values.reshape(possible_actions_mask.shape)
    model_propensities = masked_softmax(
        optimal_q_values, possible_actions_mask, trainer.rl_temperature
    )

    rewards_and_metric_rewards = trainer.reward_network(*possible_actions_state_concat)
    model_rewards = rewards_and_metric_rewards[:, :1]
    assert (
        model_rewards.shape[0] * model_rewards.shape[1]
        == possible_actions_mask.shape[0] * possible_actions_mask.shape[1]
    ), (
        "Invalid shapes: "
        + str(model_rewards.shape)
        + " != "
        + str(possible_actions_mask.shape)
    )
    model_rewards = model_rewards.reshape(possible_actions_mask.shape)

    model_metrics = rewards_and_metric_rewards[:, 1:]
    model_metrics = model_metrics.reshape(possible_actions_mask.shape[0], -1)

    model_values_for_logged_action = trainer.q_network(states, actions)
    model_rewards_and_metrics_for_logged_action = trainer.reward_network(states, actions)
    model_rewards_for_logged_action = model_rewards_and_metrics_for_logged_action[:, :1]

    action_dim = possible_actions.float_features.shape[1]
    action_mask = torch.all(
        possible_actions.float_features.view(-1, max_num_actions, action_dim)
        == actions.float_features.unsqueeze(dim=1),
        dim=2,
    ).float()
    assert torch.all(action_mask.sum(dim=1) == 1)
    num_metrics = model_metrics.shape[1] // max_num_actions

    model_metrics_values = None
    model_metrics_for_logged_action = None
    model_metrics_values_for_logged_action = None
    if num_metrics > 0:
        # FIXME: calculate model_metrics_values when q_network_cpe is added
        # to parametric dqn
        model_metrics_values = model_values.repeat(1, num_metrics)

    trainer.q_network.train(old_q_train_state)
    trainer.reward_network.train(old_reward_train_state)

    return cls(
        mdp_id=mdp_ids,
        sequence_number=sequence_numbers,
        logged_propensities=propensities,
        logged_rewards=rewards,
        action_mask=action_mask,
        model_rewards=model_rewards,
        model_rewards_for_logged_action=model_rewards_for_logged_action,
        model_values=model_values,
        model_values_for_logged_action=model_values_for_logged_action,
        model_metrics_values=model_metrics_values,
        model_metrics_values_for_logged_action=model_metrics_values_for_logged_action,
        model_propensities=model_propensities,
        logged_metrics=metrics,
        model_metrics=model_metrics,
        model_metrics_for_logged_action=model_metrics_for_logged_action,
        # Will compute later
        logged_values=None,
        logged_metrics_values=None,
        possible_actions_mask=possible_actions_mask,
        optimal_q_values=optimal_q_values,
        eval_action_idxs=eval_action_idxs,
    )
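# Minimal sketch (an assumption, not the library's implementation) of what the
# masked_softmax helper used above computes when deriving model_propensities:
# a temperature-scaled softmax over Q-values in which actions with mask == 0
# receive zero probability mass.
def _masked_softmax_sketch(
    q_values: torch.Tensor, mask: torch.Tensor, temperature: float
) -> torch.Tensor:
    scores = q_values / temperature
    # Push masked-out entries toward -inf so softmax assigns them ~0 probability.
    scores = scores.masked_fill(mask == 0, float("-inf"))
    probs = torch.softmax(scores, dim=1)
    # Guard against rows where every action is masked out (softmax yields NaN).
    return torch.nan_to_num(probs, nan=0.0)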