def evaluate(self, batch: MemoryNetworkInput):
    """Calculate state feature sensitivity due to actions: randomly permuting
    actions and see how much the prediction of next state feature deviates.

    Returns a dict with key "feature_sensitivity" mapped to a numpy array of
    one mean absolute deviation per state feature.
    """
    assert isinstance(batch, MemoryNetworkInput)
    # Run the world model in inference mode; restored to train mode at the end.
    self.trainer.memory_network.mdnrnn.eval()

    # NOTE(review): assumes next_state.float_features is (seq_len, batch, state_dim)
    seq_len, batch_size, state_dim = batch.next_state.float_features.size()
    state_feature_num = self.state_feature_num
    feature_sensitivity = torch.zeros(state_feature_num)

    # the input of world_model has seq-len as the first dimension
    mdnrnn_output = self.trainer.memory_network(
        batch.state, FeatureData(batch.action)
    )
    predicted_next_state_means = mdnrnn_output.mus

    shuffled_mdnrnn_output = self.trainer.memory_network(
        batch.state,
        # shuffle the actions (permute along the batch dimension)
        FeatureData(batch.action[:, torch.randperm(batch_size), :]),
    )
    shuffled_predicted_next_state_means = shuffled_mdnrnn_output.mus

    assert (
        predicted_next_state_means.size()
        == shuffled_predicted_next_state_means.size()
        == (seq_len, batch_size, self.trainer.params.num_gaussians, state_dim)
    )

    # Each raw feature may span several columns; boundaries delimit them.
    state_feature_boundaries = self.sorted_state_feature_start_indices + [state_dim]

    for i in range(state_feature_num):
        boundary_start, boundary_end = (
            state_feature_boundaries[i],
            state_feature_boundaries[i + 1],
        )
        # Mean (over seq, batch, gaussian) of the per-feature summed absolute
        # prediction difference caused by shuffling the actions.
        abs_diff = torch.mean(
            torch.sum(
                torch.abs(
                    shuffled_predicted_next_state_means[
                        :, :, :, boundary_start:boundary_end
                    ]
                    - predicted_next_state_means[
                        :, :, :, boundary_start:boundary_end
                    ]
                ),
                dim=3,
            )
        )
        feature_sensitivity[i] = abs_diff.cpu().detach().item()

    # Restore training mode.
    self.trainer.memory_network.mdnrnn.train()
    logger.info(
        "**** Debug tool feature sensitivity ****: {}".format(feature_sensitivity)
    )
    return {"feature_sensitivity": feature_sensitivity.numpy()}
def act(
    self, obs: rlt.FeatureData, possible_actions_mask: Optional[np.ndarray] = None
) -> rlt.ActorOutput:
    """Act randomly regardless of the observation."""
    # pyre-fixme[35]: Target cannot be annotated.
    obs: torch.Tensor = obs.float_features
    assert obs.dim() >= 2, f"obs has shape {obs.shape} (dim < 2)"
    n_rows = obs.size(0)
    # pyre-fixme[6]: Expected `Union[torch.Size, torch.Tensor]` for 1st param
    # but got `Tuple[int]`.
    sampled = self.dist.sample((n_rows,))
    # Coordinates are assumed i.i.d., so the joint log-prob is the per-dim sum.
    joint_log_prob = self.dist.log_prob(sampled).sum(1)
    return rlt.ActorOutput(action=sampled, log_prob=joint_log_prob)
def test_get_detached_model_outputs(self):
    """Detached model outputs should have shape (batch_size, action_dim)."""
    trainer = self._construct_trainer()
    dummy_state = FeatureData(
        float_features=torch.rand(self.batch_size, self.state_dim)
    )
    scores, _ = trainer.get_detached_model_outputs(dummy_state)
    self.assertEqual(scores.shape[0], self.batch_size)
    self.assertEqual(scores.shape[1], self.action_dim)
def get_parametric_input(max_num_actions: int, obs: rlt.FeatureData):
    """Tile the observation once per action and build the matching
    possible-actions tensor on the observation's device."""
    features = obs.float_features
    assert (
        len(features.shape) == 2
    ), f"{features.shape} is not (batch_size, state_dim)."
    batch_size = features.shape[0]
    possible_actions = get_possible_actions_for_gym(batch_size, max_num_actions)
    possible_actions = possible_actions.to(features.device)
    tiled_obs = obs.get_tiled_batch(max_num_actions)
    return tiled_obs, possible_actions
def _get_unmasked_q_values(
    self, q_network, state: rlt.FeatureData, slate: rlt.DocList
) -> torch.Tensor:
    """
    Gets the q values from the model and target networks
    """
    n_rows, n_docs, _ = slate.float_features.shape
    # TODO: Probably should create a new model type
    tiled_state = state.repeat_interleave(n_docs, dim=0)
    flat_scores = q_network(tiled_state, slate.as_feature_data())
    return flat_scores.view(n_rows, n_docs)
def score(state: rlt.FeatureData) -> torch.Tensor:
    """Softmax-selection-probability-weighted q-scores over each state's
    candidate docs."""
    docs = state.candidate_docs
    assert docs is not None
    tiled_state = state.repeat_interleave(repeats=num_candidates, axis=0)
    # Score in inference mode, then restore training mode.
    q_network.eval()
    raw_scores = q_network(tiled_state, docs.as_feature_data())
    scores = raw_scores.view(-1, num_candidates)
    q_network.train()
    select_prob = F.softmax(docs.value, dim=1)
    assert select_prob.shape == scores.shape
    return select_prob * scores
def test_fully_connected(self):
    """A FullyConnected value net should map (batch, state_dim) -> (batch, 1)."""
    chooser = ValueNetBuilder__Union(
        FullyConnected=value.fully_connected.FullyConnected()
    )
    builder = chooser.value
    state_dim = 3
    norm_params = {
        feat_id: NormalizationParameters(feature_type=CONTINUOUS)
        for feat_id in range(state_dim)
    }
    normalization_data = NormalizationData(
        dense_normalization_parameters=norm_params
    )
    value_network = builder.build_value_network(normalization_data)
    batch_size = 5
    inp = FeatureData(float_features=torch.randn(batch_size, state_dim))
    out = value_network(inp)
    self.assertEqual(out.shape, (batch_size, 1))
def setUp(self):
    # preparing various components for qr-dqn trainer initialization
    self.params = QRDQNTrainerParameters(actions=["1", "2"], num_atoms=11)
    self.reward_options = RewardOptions()
    self.metrics_to_score = get_metrics_to_score(
        self.reward_options.metric_reward_values
    )
    # Network architecture knobs shared by all networks below.
    self.state_dim = 10
    self.action_dim = 2
    self.sizes = [20, 20]
    self.num_atoms = 11
    self.activations = ["relu", "relu"]
    self.dropout_ratio = 0
    # Main quantile Q-network and its target copy.
    self.q_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        num_atoms=self.num_atoms,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q_network_target = self.q_network.get_target_network()
    # Small fixed input used by individual tests.
    self.x = FeatureData(float_features=torch.rand(5, 10))
    self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
    # One output node per (metric + reward) per action for CPE networks.
    self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
        # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
        self.params.actions
    )
    self.reward_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe_target = self.q_network_cpe.get_target_network()
def act(
    self, obs: rlt.FeatureData, possible_actions_mask: Optional[np.ndarray] = None
) -> rlt.ActorOutput:
    """Act randomly regardless of the observation."""
    # pyre-fixme[35]: Target cannot be annotated.
    obs: torch.Tensor = obs.float_features
    assert obs.dim() >= 2, f"obs has shape {obs.shape} (dim < 2)"
    assert obs.shape[0] == 1, f"obs has shape {obs.shape} (0th dim != 1)"
    n_rows = obs.shape[0]
    # Uniform scores; masked-out actions are zeroed so they cannot be drawn.
    weights = torch.ones((n_rows, self.num_actions))
    weights = apply_possible_actions_mask(
        weights, possible_actions_mask, invalid_score=0.0
    )
    # sample a random action
    dist = torch.distributions.Categorical(weights)
    sampled_idx = dist.sample()
    one_hot_action = F.one_hot(sampled_idx, self.num_actions)
    return rlt.ActorOutput(
        action=one_hot_action, log_prob=dist.log_prob(sampled_idx).float()
    )
def create_from_tensors_dqn(
    cls,
    trainer: DQNTrainer,
    mdp_ids: torch.Tensor,
    sequence_numbers: torch.Tensor,
    states: rlt.FeatureData,
    actions: rlt.FeatureData,
    propensities: torch.Tensor,
    rewards: torch.Tensor,
    possible_actions_mask: torch.Tensor,
    metrics: Optional[torch.Tensor] = None,
):
    """Build an evaluation data page from a batch of logged DQN tensors.

    Runs the trainer's q / reward / CPE networks in inference mode over the
    batch, extracts model rewards, values, propensities and per-metric values
    (both per-action and for the logged action), and restores each network's
    original training mode before returning the constructed `cls` instance.
    """
    # Remember training-mode flags so they can be restored afterwards.
    old_q_train_state = trainer.q_network.training
    # pyre-fixme[16]: `DQNTrainer` has no attribute `reward_network`.
    old_reward_train_state = trainer.reward_network.training
    # pyre-fixme[16]: `DQNTrainer` has no attribute `q_network_cpe`.
    old_q_cpe_train_state = trainer.q_network_cpe.training
    trainer.q_network.train(False)
    trainer.reward_network.train(False)
    trainer.q_network_cpe.train(False)

    num_actions = trainer.num_actions
    action_mask = actions.float()

    rewards = trainer.boost_rewards(rewards, actions)
    # First num_actions columns of the CPE network are the reward q-values.
    model_values = trainer.q_network_cpe(states)[:, 0:num_actions]
    optimal_q_values, _ = trainer.get_detached_q_values(states)
    # Do we ever really use eval_action_idxs?
    eval_action_idxs = trainer.get_max_q_values(
        optimal_q_values, possible_actions_mask
    )[1]
    model_propensities = masked_softmax(
        optimal_q_values, possible_actions_mask, trainer.rl_temperature
    )
    assert model_values.shape == actions.shape, (
        "Invalid shape: " + str(model_values.shape) + " != " + str(actions.shape)
    )
    assert model_values.shape == possible_actions_mask.shape, (
        "Invalid shape: "
        + str(model_values.shape)
        + " != "
        + str(possible_actions_mask.shape)
    )
    model_values_for_logged_action = torch.sum(
        model_values * action_mask, dim=1, keepdim=True
    )

    rewards_and_metric_rewards = trainer.reward_network(states)
    # In case we reuse the modular for Q-network
    if hasattr(rewards_and_metric_rewards, "q_values"):
        # BUGFIX: this was a no-op self-assignment; the guard only makes
        # sense if we unwrap the q_values tensor from the model output.
        rewards_and_metric_rewards = rewards_and_metric_rewards.q_values

    model_rewards = rewards_and_metric_rewards[:, 0:num_actions]
    assert model_rewards.shape == actions.shape, (
        "Invalid shape: " + str(model_rewards.shape) + " != " + str(actions.shape)
    )
    model_rewards_for_logged_action = torch.sum(
        model_rewards * action_mask, dim=1, keepdim=True
    )

    # Remaining columns are per-metric rewards, num_actions columns per metric.
    model_metrics = rewards_and_metric_rewards[:, num_actions:]
    assert model_metrics.shape[1] % num_actions == 0, (
        "Invalid metrics shape: "
        + str(model_metrics.shape)
        + " "
        + str(num_actions)
    )
    num_metrics = model_metrics.shape[1] // num_actions

    if num_metrics == 0:
        model_metrics_values = None
        model_metrics_for_logged_action = None
        model_metrics_values_for_logged_action = None
    else:
        model_metrics_values = trainer.q_network_cpe(states)
        # Backward compatibility
        if hasattr(model_metrics_values, "q_values"):
            # BUGFIX: same no-op self-assignment as above; unwrap q_values.
            model_metrics_values = model_metrics_values.q_values
        model_metrics_values = model_metrics_values[:, num_actions:]
        assert (
            model_metrics_values.shape[1] == num_actions * num_metrics
        ), (
            "Invalid shape: "
            + str(model_metrics_values.shape[1])
            + " != "
            + str(actions.shape[1] * num_metrics)
        )

        model_metrics_for_logged_action_list = []
        model_metrics_values_for_logged_action_list = []
        for metric_index in range(num_metrics):
            metric_start = metric_index * num_actions
            metric_end = (metric_index + 1) * num_actions
            model_metrics_for_logged_action_list.append(
                torch.sum(
                    model_metrics[:, metric_start:metric_end] * action_mask,
                    dim=1,
                    keepdim=True,
                )
            )
            model_metrics_values_for_logged_action_list.append(
                torch.sum(
                    model_metrics_values[:, metric_start:metric_end] * action_mask,
                    dim=1,
                    keepdim=True,
                )
            )
        model_metrics_for_logged_action = torch.cat(
            model_metrics_for_logged_action_list, dim=1
        )
        model_metrics_values_for_logged_action = torch.cat(
            model_metrics_values_for_logged_action_list, dim=1
        )

    # Restore original training-mode flags.
    trainer.q_network_cpe.train(old_q_cpe_train_state)
    trainer.q_network.train(old_q_train_state)
    trainer.reward_network.train(old_reward_train_state)

    return cls(
        mdp_id=mdp_ids,
        sequence_number=sequence_numbers,
        logged_propensities=propensities,
        logged_rewards=rewards,
        action_mask=action_mask,
        model_rewards=model_rewards,
        model_rewards_for_logged_action=model_rewards_for_logged_action,
        model_values=model_values,
        model_values_for_logged_action=model_values_for_logged_action,
        model_metrics_values=model_metrics_values,
        model_metrics_values_for_logged_action=model_metrics_values_for_logged_action,
        model_propensities=model_propensities,
        logged_metrics=metrics,
        model_metrics=model_metrics,
        model_metrics_for_logged_action=model_metrics_for_logged_action,
        # Will compute later
        logged_values=None,
        logged_metrics_values=None,
        possible_actions_mask=possible_actions_mask,
        optimal_q_values=optimal_q_values,
        eval_action_idxs=eval_action_idxs,
    )
def setUp(self):
    # preparing various components for qr-dqn trainer initialization
    self.batch_size = 3
    self.state_dim = 10
    self.action_dim = 2
    self.num_layers = 2
    self.sizes = [20 for _ in range(self.num_layers)]
    self.num_atoms = 11
    self.activations = ["relu" for _ in range(self.num_layers)]
    self.dropout_ratio = 0
    # Near-zero exploration so the actor is effectively deterministic in tests.
    self.exploration_variance = 1e-10
    self.actions = [str(i) for i in range(self.action_dim)]
    self.params = CRRTrainerParameters(actions=self.actions)
    self.reward_options = RewardOptions()
    self.metrics_to_score = get_metrics_to_score(
        self.reward_options.metric_reward_values
    )
    # Actor network and its target copy.
    self.actor_network = FullyConnectedActor(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        exploration_variance=self.exploration_variance,
    )
    self.actor_network_target = self.actor_network.get_target_network()
    # Twin Q-networks (q1/q2) with target copies.
    self.q1_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q1_network_target = self.q1_network.get_target_network()
    self.q2_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q2_network_target = self.q2_network.get_target_network()
    # One output node per (metric + reward) per action for CPE networks.
    self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
        self.params.actions
    )
    self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
    self.reward_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe_target = self.q_network_cpe.get_target_network()
    # A small, fully-populated training batch used by individual tests.
    self.inp = DiscreteDqnInput(
        state=FeatureData(
            float_features=torch.rand(self.batch_size, self.state_dim)
        ),
        next_state=FeatureData(
            float_features=torch.rand(self.batch_size, self.state_dim)
        ),
        reward=torch.ones(self.batch_size, 1),
        time_diff=torch.ones(self.batch_size, 1) * 2,
        step=torch.ones(self.batch_size, 1) * 2,
        not_terminal=torch.ones(
            self.batch_size, 1
        ),  # todo: check terminal behavior
        action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
        next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
        possible_actions_mask=torch.ones(self.batch_size, self.action_dim),
        possible_next_actions_mask=torch.ones(self.batch_size, self.action_dim),
        extras=ExtraData(action_probability=torch.ones(self.batch_size, 1)),
    )
def _concat_features(self, obs: rlt.FeatureData):
    """Return user+doc features concatenated, or just the doc features when
    there are no user features."""
    if not self.has_user_feat:
        # pyre-fixme[16]: `Optional` has no attribute `float_features`.
        return obs.candidate_docs.float_features.float()
    return obs.concat_user_doc()
def evaluate(self, batch: MemoryNetworkInput):
    """Calculate feature importance: setting each state/action feature to the
    mean value and observe loss increase.

    Returns a dict with key "feature_loss_increase" mapped to a numpy array of
    loss deltas, action features first, then state features.
    """
    # Run the world model in inference mode; restored to train mode at the end.
    self.trainer.memory_network.mdnrnn.eval()

    # NOTE(review): assumes float_features is (seq_len, batch, dim) — TODO confirm
    state_features = batch.state.float_features
    action_features = batch.action
    seq_len, batch_size, state_dim = state_features.size()
    action_dim = action_features.size()[2]
    action_feature_num = self.action_feature_num
    state_feature_num = self.state_feature_num
    feature_importance = torch.zeros(action_feature_num + state_feature_num)

    # Baseline loss on the unperturbed batch.
    orig_losses = self.trainer.get_loss(batch, state_dim=state_dim)
    orig_loss = orig_losses["loss"].cpu().detach().item()
    del orig_losses

    # Each raw feature may span several columns; boundaries delimit them.
    action_feature_boundaries = self.sorted_action_feature_start_indices + [
        action_dim
    ]
    state_feature_boundaries = self.sorted_state_feature_start_indices + [state_dim]

    for i in range(action_feature_num):
        action_features = batch.action.reshape(
            (batch_size * seq_len, action_dim)
        ).data.clone()

        # if actions are discrete, an action's feature importance is the loss
        # increase due to setting all actions to this action
        if self.discrete_action:
            assert action_dim == action_feature_num
            action_vec = torch.zeros(action_dim)
            action_vec[i] = 1
            action_features[:] = action_vec
        # if actions are continuous, an action's feature importance is the loss
        # increase due to masking this action feature to its mean value
        else:
            boundary_start, boundary_end = (
                action_feature_boundaries[i],
                action_feature_boundaries[i + 1],
            )
            action_features[
                :, boundary_start:boundary_end
            ] = self.compute_median_feature_value(
                action_features[:, boundary_start:boundary_end]
            )

        action_features = action_features.reshape((seq_len, batch_size, action_dim))
        new_batch = MemoryNetworkInput(
            state=batch.state,
            action=action_features,
            next_state=batch.next_state,
            reward=batch.reward,
            time_diff=torch.ones_like(batch.reward).float(),
            not_terminal=batch.not_terminal,
            step=None,
        )
        losses = self.trainer.get_loss(new_batch, state_dim=state_dim)
        feature_importance[i] = losses["loss"].cpu().detach().item() - orig_loss
        del losses

    # State features: importance is the loss increase from masking the feature
    # to its median value across the flattened (batch * seq) rows.
    for i in range(state_feature_num):
        state_features = batch.state.float_features.reshape(
            (batch_size * seq_len, state_dim)
        ).data.clone()
        boundary_start, boundary_end = (
            state_feature_boundaries[i],
            state_feature_boundaries[i + 1],
        )
        state_features[
            :, boundary_start:boundary_end
        ] = self.compute_median_feature_value(
            state_features[:, boundary_start:boundary_end]
        )
        state_features = state_features.reshape((seq_len, batch_size, state_dim))
        new_batch = MemoryNetworkInput(
            state=FeatureData(float_features=state_features),
            action=batch.action,
            next_state=batch.next_state,
            reward=batch.reward,
            time_diff=torch.ones_like(batch.reward).float(),
            not_terminal=batch.not_terminal,
            step=None,
        )
        losses = self.trainer.get_loss(new_batch, state_dim=state_dim)
        # State features are stored after all action features.
        feature_importance[i + action_feature_num] = (
            losses["loss"].cpu().detach().item() - orig_loss
        )
        del losses

    # Restore training mode.
    self.trainer.memory_network.mdnrnn.train()
    logger.info(
        "**** Debug tool feature importance ****: {}".format(feature_importance)
    )
    return {"feature_loss_increase": feature_importance.numpy()}
def forward(self, obs: rlt.FeatureData):
    """Score each candidate: MLP over the ranking state, trailing singleton
    dimension squeezed away."""
    ranking_state = obs.get_ranking_state(self.has_user_feat)
    return self.mlp(ranking_state).squeeze(-1)
def test_train_step_gen(self):
    # Verifies the number of losses yielded by train_step_gen, and the type of
    # each loss's grad_fn, across several trainer configurations.
    inp = DiscreteDqnInput(
        state=FeatureData(float_features=torch.rand(3, 10)),
        next_state=FeatureData(float_features=torch.rand(3, 10)),
        reward=torch.ones(3, 1),
        time_diff=torch.ones(3, 1) * 2,
        step=torch.ones(3, 1) * 2,
        not_terminal=torch.ones(3, 1),  # todo: check terminal behavior
        action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
        next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
        possible_actions_mask=torch.ones(3, 2),
        possible_next_actions_mask=torch.ones(3, 2),
        extras=ExtraData(),
    )
    # grad_fn classes are not public names, so obtain them by constructing
    # tiny graphs that end in the same backward ops.
    mse_backward_type = type(
        torch.nn.functional.mse_loss(
            torch.tensor([1.0], requires_grad=True), torch.zeros(1)
        ).grad_fn
    )
    add_backward_type = type(
        (
            torch.tensor([1.0], requires_grad=True)
            + torch.tensor([1.0], requires_grad=True)
        ).grad_fn
    )
    mean_backward_type = type(
        torch.tensor([1.0, 2.0], requires_grad=True).mean().grad_fn
    )

    # vanilla: 4 losses (td, 2x CPE mse, combined) with expected grad_fn types
    trainer = self._construct_trainer()
    loss_gen = trainer.train_step_gen(inp, batch_idx=1)
    losses = list(loss_gen)
    self.assertEqual(len(losses), 4)
    self.assertEqual(type(losses[0].grad_fn), mean_backward_type)
    self.assertEqual(type(losses[1].grad_fn), mse_backward_type)
    self.assertEqual(type(losses[2].grad_fn), mse_backward_type)
    self.assertEqual(type(losses[3].grad_fn), add_backward_type)

    # no CPE: only 2 losses
    trainer = self._construct_trainer(no_cpe=True)
    loss_gen = trainer.train_step_gen(inp, batch_idx=1)
    losses = list(loss_gen)
    self.assertEqual(len(losses), 2)

    # seq_num
    param_copy = QRDQNTrainerParameters(
        actions=["1", "2"],
        num_atoms=11,
        rl=RLParameters(use_seq_num_diff_as_time_diff=True),
    )
    trainer = self._construct_trainer(new_params=param_copy)
    loss_gen = trainer.train_step_gen(inp, batch_idx=1)
    losses = list(loss_gen)
    self.assertEqual(len(losses), 4)

    # multi_steps
    param_copy = QRDQNTrainerParameters(
        actions=["1", "2"], num_atoms=11, rl=RLParameters(multi_steps=2)
    )
    trainer = self._construct_trainer(new_params=param_copy)
    loss_gen = trainer.train_step_gen(inp, batch_idx=1)
    losses = list(loss_gen)
    self.assertEqual(len(losses), 4)

    # non_max_q
    param_copy = QRDQNTrainerParameters(
        actions=["1", "2"], num_atoms=11, rl=RLParameters(maxq_learning=False)
    )
    trainer = self._construct_trainer(new_params=param_copy)
    loss_gen = trainer.train_step_gen(inp, batch_idx=1)
    losses = list(loss_gen)
    self.assertEqual(len(losses), 4)
def extract_state_first_step(batch):
    """Wrap the first time-step of the batch's state features as FeatureData."""
    first_step_features = batch.state.float_features[0]
    return FeatureData(first_step_features)