def test_masked_softmax(self):
    # Positive value case
    x = torch.tensor([[15.0, 6.0, 9.0], [3.0, 2.0, 1.0]])
    temperature = 1
    mask = torch.tensor([[1.0, 0.0, 1.0], [0.0, 1.0, 1.0]])
    out = masked_softmax(x, mask, temperature)
    expected_out = torch.tensor([[0.9975, 0.0000, 0.0025], [0, 0.7311, 0.2689]])
    npt.assert_array_almost_equal(out, expected_out, 4)

    # Positive value case (masked-out value would otherwise overflow to inf)
    x = torch.tensor([[150.0, 2.0]])
    temperature = 0.01
    mask = torch.tensor([[0.0, 1.0]])
    out = masked_softmax(x, mask, temperature)
    expected_out = torch.tensor([[0.0, 1.0]])
    npt.assert_array_almost_equal(out, expected_out, 4)

    # Negative value case
    x = torch.tensor([[-10.0, -1.0, -5.0]])
    temperature = 0.01
    mask = torch.tensor([[1.0, 1.0, 0.0]])
    out = masked_softmax(x, mask, temperature)
    expected_out = torch.tensor([[0.0, 1.0, 0.0]])
    npt.assert_array_almost_equal(out, expected_out, 4)

    # Case where all values in a row are masked
    x = torch.tensor([[-5.0, 4.0, 3.0], [2.0, 1.0, 2.0]])
    temperature = 1
    mask = torch.tensor([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
    out = masked_softmax(x, mask, temperature)
    expected_out = torch.tensor([[0.0, 0.0, 0.0], [0.4223, 0.1554, 0.4223]])
    npt.assert_array_almost_equal(out, expected_out, 4)
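# For reference, the sketch below is a minimal masked_softmax consistent with the
# test cases above: logits are divided by the temperature, masked-out entries are
# pushed toward -inf, and a fully masked row comes out as all zeros. The body
# (in particular the -1e20 constant) is an assumption for illustration, not
# necessarily the library's actual implementation.
import torch
import torch.nn.functional as F


def masked_softmax(
    x: torch.Tensor, mask: torch.Tensor, temperature: float
) -> torch.Tensor:
    # Scale by temperature, then make masked-out logits so small that they
    # receive (numerically) zero probability.
    scores = x / temperature + (1.0 - mask) * -1e20
    # Softmax over the action dimension; multiplying by the mask guarantees
    # exact zeros for masked entries and turns a fully masked row into all zeros.
    return F.softmax(scores, dim=1) * mask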
def policy_given_q_values(
    q_scores: torch.Tensor,
    action_names: List[str],
    softmax_temperature: float,
    possible_actions_presence: Optional[torch.Tensor] = None,
) -> DqnPolicyActionSet:
    assert q_scores.shape[0] == 1 and len(q_scores.shape) == 2
    if possible_actions_presence is None:
        possible_actions_presence = torch.ones_like(q_scores)
    possible_actions_presence = possible_actions_presence.reshape(1, -1)
    assert possible_actions_presence.shape == q_scores.shape

    # Set impossible actions so low that they can't be picked
    q_scores -= (1.0 - possible_actions_presence) * 1e10  # type: ignore

    q_scores_softmax = (
        masked_softmax(q_scores, possible_actions_presence, softmax_temperature)
        .detach()
        .numpy()[0]
    )
    # Fall back to a uniform distribution if the probabilities are degenerate
    if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
        q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
    greedy_act_idx = int(torch.argmax(q_scores))
    softmax_act_idx = int(np.random.choice(q_scores.size()[1], p=q_scores_softmax))
    return DqnPolicyActionSet(
        greedy=greedy_act_idx,
        softmax=softmax_act_idx,
        greedy_act_name=action_names[greedy_act_idx],
        softmax_act_name=action_names[softmax_act_idx],
        softmax_act_prob=q_scores_softmax[softmax_act_idx],
    )
def policy_given_q_values(
    q_scores: torch.Tensor,
    softmax_temperature: float,
    possible_actions_presence: torch.Tensor,
) -> DqnPolicyActionSet:
    assert q_scores.shape[0] == 1 and len(q_scores.shape) == 2
    possible_actions_presence = possible_actions_presence.reshape(1, -1)
    assert possible_actions_presence.shape == q_scores.shape

    # Set impossible actions so low that they can't be picked
    q_scores -= (1.0 - possible_actions_presence) * 1e10

    q_scores_softmax_numpy = (
        masked_softmax(
            q_scores.reshape(1, -1), possible_actions_presence, softmax_temperature
        )
        .detach()
        .numpy()[0]
    )
    if (
        np.isnan(q_scores_softmax_numpy).any()
        or np.max(q_scores_softmax_numpy) < 1e-3
    ):
        q_scores_softmax_numpy[:] = 1.0 / q_scores_softmax_numpy.shape[0]
    greedy_act_idx = int(torch.argmax(q_scores))
    softmax_act_idx = int(
        np.random.choice(q_scores.size()[1], p=q_scores_softmax_numpy)
    )
    return DqnPolicyActionSet(
        greedy=greedy_act_idx,
        softmax=softmax_act_idx,
        softmax_act_prob=float(q_scores_softmax_numpy[softmax_act_idx]),
    )
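# A hypothetical call of the variant directly above (made-up numbers; assumes
# torch, DqnPolicyActionSet, and masked_softmax are importable from the library):
q_scores = torch.tensor([[1.0, 5.0, 3.0]])
presence = torch.tensor([[1.0, 0.0, 1.0]])  # action 1 is unavailable

action_set = policy_given_q_values(
    q_scores, softmax_temperature=1.0, possible_actions_presence=presence
)
# The greedy head always picks the best *available* action (index 2 here), while
# the softmax head samples an available action with the probabilities returned
# by masked_softmax.
assert action_set.greedy == 2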
def policy(
    self, state: torch.Tensor, possible_actions_presence: torch.Tensor
) -> DqnPolicyActionSet:
    assert state.size()[0] == 1, "Only pass in one state when getting a policy"
    if self.use_gpu:
        state = state.cuda()
    q_scores = self.predict(state)
    assert q_scores.shape[0] == 1

    # Set impossible actions so low that they can't be picked
    q_scores -= (1.0 - possible_actions_presence) * 1e10  # type: ignore

    q_scores_softmax = masked_softmax(
        q_scores, possible_actions_presence, self.trainer.rl_temperature
    ).numpy()[0]
    if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
        q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
    return DqnPolicyActionSet(
        greedy=int(torch.argmax(q_scores)),
        softmax=int(np.random.choice(q_scores.size()[1], p=q_scores_softmax)),
    )
def policy(
    self,
    states_tiled: torch.Tensor,
    possible_actions_with_presence: Tuple[torch.Tensor, torch.Tensor],
):
    possible_actions, possible_actions_presence = possible_actions_with_presence
    assert states_tiled.size()[0] == possible_actions.size()[0]
    assert possible_actions.size()[1] == self.action_dim
    assert possible_actions.size()[0] == possible_actions_presence.size()[0]
    if self.use_gpu:
        states_tiled = states_tiled.cuda()
        possible_actions = possible_actions.cuda()

    q_scores = self.predict(states_tiled, possible_actions).reshape(
        [1, self.action_dim]
    )
    possible_actions_presence = (possible_actions_presence.sum(dim=1) > 0).float()

    # Set impossible actions so low that they can't be picked
    q_scores -= (
        1.0 - possible_actions_presence.reshape(1, self.action_dim)  # type: ignore
    ) * 1e10

    q_scores_softmax_numpy = masked_softmax(
        q_scores.reshape(1, -1),
        possible_actions_presence.reshape(1, -1),
        self.trainer.rl_temperature,
    ).numpy()[0]
    if (
        np.isnan(q_scores_softmax_numpy).any()
        or np.max(q_scores_softmax_numpy) < 1e-3
    ):
        q_scores_softmax_numpy[:] = 1.0 / q_scores_softmax_numpy.shape[0]
    return DqnPolicyActionSet(
        greedy=int(torch.argmax(q_scores)),
        softmax=int(np.random.choice(q_scores.size()[1], p=q_scores_softmax_numpy)),
    )
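# In the parametric policy above, possible_actions_presence arrives with one row
# per candidate action and one column per action feature; an action counts as
# available if any of its features are present. A small illustration with
# made-up values (three candidate actions, two features each):
presence_per_feature = torch.tensor([[1.0, 1.0], [0.0, 0.0], [1.0, 0.0]])
(presence_per_feature.sum(dim=1) > 0).float()
# -> tensor([1., 0., 1.]): the second action is masked out downstream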
def create_from_tensors_parametric_dqn(
    cls,
    trainer: ParametricDQNTrainer,
    mdp_ids: np.ndarray,
    sequence_numbers: torch.Tensor,
    states: rlt.PreprocessedFeatureVector,
    actions: rlt.PreprocessedFeatureVector,
    propensities: torch.Tensor,
    rewards: torch.Tensor,
    possible_actions_mask: torch.Tensor,
    possible_actions: rlt.PreprocessedFeatureVector,
    max_num_actions: int,
    metrics: Optional[torch.Tensor] = None,
):
    # Switch the networks to eval mode; their previous mode is restored below
    old_q_train_state = trainer.q_network.training
    old_reward_train_state = trainer.reward_network.training
    trainer.q_network.train(False)
    trainer.reward_network.train(False)

    state_action_pairs = rlt.PreprocessedStateAction(state=states, action=actions)

    tiled_state = states.float_features.repeat(1, max_num_actions).reshape(
        -1, states.float_features.shape[1]
    )
    assert possible_actions is not None
    # Get Q-value of action taken
    possible_actions_state_concat = rlt.PreprocessedStateAction(
        state=rlt.PreprocessedFeatureVector(float_features=tiled_state),
        action=possible_actions,
    )

    # FIXME: model_values, model_values_for_logged_action, and model_metrics_values
    # should be calculated using q_network_cpe (as in discrete DQN);
    # q_network_cpe has not been added to parametric DQN yet.
    model_values = trainer.q_network(
        possible_actions_state_concat
    ).q_value  # type: ignore
    optimal_q_values, _ = trainer.get_detached_q_values(
        possible_actions_state_concat.state, possible_actions_state_concat.action
    )
    eval_action_idxs = None

    assert (
        model_values.shape[1] == 1
        and model_values.shape[0]
        == possible_actions_mask.shape[0] * possible_actions_mask.shape[1]
    ), (
        "Invalid shapes: "
        + str(model_values.shape)
        + " != "
        + str(possible_actions_mask.shape)
    )
    model_values = model_values.reshape(possible_actions_mask.shape)
    optimal_q_values = optimal_q_values.reshape(possible_actions_mask.shape)
    model_propensities = masked_softmax(
        optimal_q_values, possible_actions_mask, trainer.rl_temperature
    )

    rewards_and_metric_rewards = trainer.reward_network(
        possible_actions_state_concat
    ).q_value  # type: ignore

    model_rewards = rewards_and_metric_rewards[:, :1]
    assert (
        model_rewards.shape[0] * model_rewards.shape[1]
        == possible_actions_mask.shape[0] * possible_actions_mask.shape[1]
    ), (
        "Invalid shapes: "
        + str(model_rewards.shape)
        + " != "
        + str(possible_actions_mask.shape)
    )
    model_rewards = model_rewards.reshape(possible_actions_mask.shape)

    model_metrics = rewards_and_metric_rewards[:, 1:]
    model_metrics = model_metrics.reshape(possible_actions_mask.shape[0], -1)

    model_values_for_logged_action = trainer.q_network(state_action_pairs).q_value
    model_rewards_and_metrics_for_logged_action = trainer.reward_network(
        state_action_pairs
    ).q_value
    model_rewards_for_logged_action = model_rewards_and_metrics_for_logged_action[
        :, :1
    ]

    # Build a one-hot mask over the possible actions marking the logged action
    action_dim = possible_actions.float_features.shape[1]
    action_mask = torch.all(
        possible_actions.float_features.view(-1, max_num_actions, action_dim)
        == actions.float_features.unsqueeze(dim=1),
        dim=2,
    ).float()
    assert torch.all(action_mask.sum(dim=1) == 1)

    num_metrics = model_metrics.shape[1] // max_num_actions

    model_metrics_values = None
    model_metrics_for_logged_action = None
    model_metrics_values_for_logged_action = None
    if num_metrics > 0:
        # FIXME: calculate model_metrics_values once q_network_cpe is added
        # to parametric DQN
        model_metrics_values = model_values.repeat(1, num_metrics)

    trainer.q_network.train(old_q_train_state)  # type: ignore
    trainer.reward_network.train(old_reward_train_state)  # type: ignore

    return cls(
        mdp_id=mdp_ids,
        sequence_number=sequence_numbers,
        logged_propensities=propensities,
        logged_rewards=rewards,
        action_mask=action_mask,
        model_rewards=model_rewards,
        model_rewards_for_logged_action=model_rewards_for_logged_action,
        model_values=model_values,
        model_values_for_logged_action=model_values_for_logged_action,
        model_metrics_values=model_metrics_values,
        model_metrics_values_for_logged_action=model_metrics_values_for_logged_action,
        model_propensities=model_propensities,
        logged_metrics=metrics,
        model_metrics=model_metrics,
        model_metrics_for_logged_action=model_metrics_for_logged_action,
        # Will compute later
        logged_values=None,
        logged_metrics_values=None,
        possible_actions_mask=possible_actions_mask,
        optimal_q_values=optimal_q_values,
        eval_action_idxs=eval_action_idxs,
    )
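# The action_mask above is built by comparing each tiled possible action with the
# logged action, feature by feature. A small worked example with made-up values
# (max_num_actions=2, action_dim=2, a single logged step):
possible = torch.tensor([[1.0, 0.0], [0.0, 1.0]])  # tiled possible actions
logged = torch.tensor([[0.0, 1.0]])  # the action that was actually taken

torch.all(possible.view(-1, 2, 2) == logged.unsqueeze(dim=1), dim=2).float()
# -> tensor([[0., 1.]]): a one-hot row marking which possible action was logged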
def create_from_tensors_dqn(
    cls,
    trainer: DQNTrainer,
    mdp_ids: np.ndarray,
    sequence_numbers: torch.Tensor,
    states: rlt.PreprocessedFeatureVector,
    actions: rlt.PreprocessedFeatureVector,
    propensities: torch.Tensor,
    rewards: torch.Tensor,
    possible_actions_mask: torch.Tensor,
    metrics: Optional[torch.Tensor] = None,
):
    # Switch the networks to eval mode; their previous mode is restored below
    old_q_train_state = trainer.q_network.training
    old_reward_train_state = trainer.reward_network.training
    old_q_cpe_train_state = trainer.q_network_cpe.training
    trainer.q_network.train(False)
    trainer.reward_network.train(False)
    trainer.q_network_cpe.train(False)

    num_actions = trainer.num_actions
    action_mask = actions.float()  # type: ignore

    rewards = trainer.boost_rewards(rewards, actions)  # type: ignore
    model_values = trainer.q_network_cpe(
        rlt.PreprocessedState(state=states)
    ).q_values[:, 0:num_actions]
    optimal_q_values, _ = trainer.get_detached_q_values(
        states  # type: ignore
    )
    eval_action_idxs = trainer.get_max_q_values(  # type: ignore
        optimal_q_values, possible_actions_mask
    )[1]
    model_propensities = masked_softmax(
        optimal_q_values, possible_actions_mask, trainer.rl_temperature
    )
    assert model_values.shape == actions.shape, (  # type: ignore
        "Invalid shape: "
        + str(model_values.shape)  # type: ignore
        + " != "
        + str(actions.shape)  # type: ignore
    )
    assert model_values.shape == possible_actions_mask.shape, (  # type: ignore
        "Invalid shape: "
        + str(model_values.shape)  # type: ignore
        + " != "
        + str(possible_actions_mask.shape)  # type: ignore
    )
    model_values_for_logged_action = torch.sum(
        model_values * action_mask, dim=1, keepdim=True
    )

    rewards_and_metric_rewards = trainer.reward_network(
        rlt.PreprocessedState(state=states)
    )

    # In case the Q-network module is reused here, unwrap its q_values output
    if hasattr(rewards_and_metric_rewards, "q_values"):
        rewards_and_metric_rewards = rewards_and_metric_rewards.q_values

    model_rewards = rewards_and_metric_rewards[:, 0:num_actions]
    assert model_rewards.shape == actions.shape, (  # type: ignore
        "Invalid shape: "
        + str(model_rewards.shape)  # type: ignore
        + " != "
        + str(actions.shape)  # type: ignore
    )
    model_rewards_for_logged_action = torch.sum(
        model_rewards * action_mask, dim=1, keepdim=True
    )

    model_metrics = rewards_and_metric_rewards[:, num_actions:]

    assert model_metrics.shape[1] % num_actions == 0, (
        "Invalid metrics shape: "
        + str(model_metrics.shape)
        + " "
        + str(num_actions)
    )
    num_metrics = model_metrics.shape[1] // num_actions

    if num_metrics == 0:
        model_metrics_values = None
        model_metrics_for_logged_action = None
        model_metrics_values_for_logged_action = None
    else:
        model_metrics_values = trainer.q_network_cpe(
            rlt.PreprocessedState(state=states)
        )
        # Backward compatibility
        if hasattr(model_metrics_values, "q_values"):
            model_metrics_values = model_metrics_values.q_values
        model_metrics_values = model_metrics_values[:, num_actions:]
        assert (
            model_metrics_values.shape[1] == num_actions * num_metrics
        ), (  # type: ignore
            "Invalid shape: "
            + str(model_metrics_values.shape[1])  # type: ignore
            + " != "
            + str(actions.shape[1] * num_metrics)  # type: ignore
        )

        model_metrics_for_logged_action_list = []
        model_metrics_values_for_logged_action_list = []
        for metric_index in range(num_metrics):
            metric_start = metric_index * num_actions
            metric_end = (metric_index + 1) * num_actions
            model_metrics_for_logged_action_list.append(
                torch.sum(
                    model_metrics[:, metric_start:metric_end] * action_mask,
                    dim=1,
                    keepdim=True,
                )
            )
            model_metrics_values_for_logged_action_list.append(
                torch.sum(
                    model_metrics_values[:, metric_start:metric_end] * action_mask,
                    dim=1,
                    keepdim=True,
                )
            )
        model_metrics_for_logged_action = torch.cat(
            model_metrics_for_logged_action_list, dim=1
        )
        model_metrics_values_for_logged_action = torch.cat(
            model_metrics_values_for_logged_action_list, dim=1
        )

    trainer.q_network_cpe.train(old_q_cpe_train_state)  # type: ignore
    trainer.q_network.train(old_q_train_state)  # type: ignore
    trainer.reward_network.train(old_reward_train_state)  # type: ignore

    return cls(
        mdp_id=mdp_ids,
        sequence_number=sequence_numbers,
        logged_propensities=propensities,
        logged_rewards=rewards,
        action_mask=action_mask,
        model_rewards=model_rewards,
        model_rewards_for_logged_action=model_rewards_for_logged_action,
        model_values=model_values,
        model_values_for_logged_action=model_values_for_logged_action,
        model_metrics_values=model_metrics_values,
        model_metrics_values_for_logged_action=model_metrics_values_for_logged_action,
        model_propensities=model_propensities,
        logged_metrics=metrics,
        model_metrics=model_metrics,
        model_metrics_for_logged_action=model_metrics_for_logged_action,
        # Will compute later
        logged_values=None,
        logged_metrics_values=None,
        possible_actions_mask=possible_actions_mask,
        optimal_q_values=optimal_q_values,
        eval_action_idxs=eval_action_idxs,
    )
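# The discrete-DQN method above repeatedly reduces per-action model outputs to
# the logged action by multiplying with the one-hot action_mask and summing over
# the action dimension. With made-up numbers:
model_values = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
action_mask = torch.tensor([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]])
torch.sum(model_values * action_mask, dim=1, keepdim=True)
# -> tensor([[2.], [4.]]): the model's value for the action that was logged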
def _calculate_cpes(
    self,
    training_batch,
    states,
    next_states,
    all_action_scores,
    all_next_action_scores,
    logged_action_idxs,
    discount_tensor,
    not_done_mask,
):
    if not self.calc_cpe_in_training:
        return None, None, None

    if training_batch.extras.metrics is None:
        metrics_reward_concat_real_vals = training_batch.training_input.reward
    else:
        metrics_reward_concat_real_vals = torch.cat(
            (training_batch.training_input.reward, training_batch.extras.metrics),
            dim=1,
        )

    model_propensities_next_states = masked_softmax(
        all_next_action_scores,
        training_batch.training_input.possible_next_actions_mask
        if self.maxq_learning
        else training_batch.training_input.next_action,
        self.rl_temperature,
    )

    with torch.enable_grad():
        ######### Train separate reward network for CPE evaluation #############
        # FIXME: the reward network should be outputting a tensor,
        # not a q-value object
        reward_estimates = self.reward_network(states).q_values
        reward_estimates_for_logged_actions = reward_estimates.gather(
            1, self.reward_idx_offsets + logged_action_idxs
        )
        reward_loss = F.mse_loss(
            reward_estimates_for_logged_actions, metrics_reward_concat_real_vals
        )
        reward_loss.backward()
        self._maybe_run_optimizer(
            self.reward_network_optimizer, self.minibatches_per_step
        )

        ######### Train separate q-network for CPE evaluation #############
        metric_q_values = self.q_network_cpe(states).q_values.gather(
            1, self.reward_idx_offsets + logged_action_idxs
        )
        all_metrics_target_q_values = torch.chunk(
            self.q_network_cpe_target(next_states).q_values.detach(),
            len(self.metrics_to_score),
            dim=1,
        )
        target_metric_q_values = []
        for i, per_metric_target_q_values in enumerate(all_metrics_target_q_values):
            per_metric_next_q_values = torch.sum(
                per_metric_target_q_values * model_propensities_next_states,
                1,
                keepdim=True,
            )
            per_metric_next_q_values = per_metric_next_q_values * not_done_mask
            per_metric_target_q_values = metrics_reward_concat_real_vals[
                :, i : i + 1
            ] + (discount_tensor * per_metric_next_q_values)
            target_metric_q_values.append(per_metric_target_q_values)

        target_metric_q_values = torch.cat(target_metric_q_values, dim=1)
        metric_q_value_loss = self.q_network_loss(
            metric_q_values, target_metric_q_values
        )
        metric_q_value_loss.backward()
        self._maybe_run_optimizer(
            self.q_network_cpe_optimizer, self.minibatches_per_step
        )

    # Use the soft update rule to update target network
    self._maybe_soft_update(
        self.q_network_cpe,
        self.q_network_cpe_target,
        self.tau,
        self.minibatches_per_step,
    )

    model_propensities = masked_softmax(
        all_action_scores,
        training_batch.training_input.possible_actions_mask
        if self.maxq_learning
        else training_batch.training_input.action,
        self.rl_temperature,
    )
    model_rewards = reward_estimates[
        :,
        torch.arange(
            self.reward_idx_offsets[0],
            self.reward_idx_offsets[0] + self.num_actions,
        ),
    ]
    return reward_loss, model_rewards, model_propensities
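# The mask handed to masked_softmax in _calculate_cpes is either the
# possible-actions mask (max-Q learning) or the logged one-hot action
# (SARSA-style), which changes what the resulting propensities mean.
# A small illustration with made-up Q-values:
q = torch.tensor([[1.0, 2.0, 4.0]])

# Max-Q learning: probability mass spreads over every possible action.
masked_softmax(q, torch.tensor([[1.0, 1.0, 1.0]]), 1.0)
# -> approximately [[0.0420, 0.1142, 0.8438]]

# SARSA-style: the one-hot action acts as the mask, so all of the mass
# lands on the logged action.
masked_softmax(q, torch.tensor([[0.0, 1.0, 0.0]]), 1.0)
# -> [[0., 1., 0.]]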