def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: Optional[np.ndarray],
    logged_propensities: Optional[np.ndarray],
    logged_rewards: Optional[np.ndarray],
    logged_values: Optional[np.ndarray],
):
    """Report the latest loss and model scores to ``evaluator``.

    Converts ``self.all_action_scores`` (when set) to a NumPy array, derives
    the softmax propensities, the arg-max action per row and, if
    ``logged_actions`` is given, the model value of each logged action, then
    forwards everything to ``evaluator.report``.

    Side effects: ``self.model_propensities`` is reset and recomputed, and
    ``self.all_action_scores`` is replaced by its NumPy form.

    Args:
        evaluator: Object whose ``report`` method receives the statistics.
        logged_actions: Logged actions, multiplied element-wise with the
            action scores (presumably one-hot rows -- confirm with caller).
        logged_propensities: Passed through to ``evaluator.report``.
        logged_rewards: Passed through to ``evaluator.report``.
        logged_values: Passed through to ``evaluator.report``.
    """
    self.model_propensities = None
    model_values_on_logged_actions = None
    maxq_action_idxs = None

    if self.all_action_scores is not None:
        # The scores are stored back on ``self`` as an ndarray, so a second
        # call must not try to call ``.cpu()`` on an ndarray again.
        if not isinstance(self.all_action_scores, np.ndarray):
            self.all_action_scores = self.all_action_scores.cpu().numpy()
        self.model_propensities = Evaluator.softmax(
            self.all_action_scores, self.rl_temperature
        )
        maxq_action_idxs = self.all_action_scores.argmax(axis=1)
        if logged_actions is not None:
            model_values_on_logged_actions = np.sum(
                (logged_actions * self.all_action_scores), axis=1, keepdims=True
            )

    evaluator.report(
        self.loss.cpu().numpy(),
        logged_actions,
        logged_propensities,
        logged_rewards,
        logged_values,
        self.model_propensities,
        self.all_action_scores,
        model_values_on_logged_actions,
        maxq_action_idxs,
    )
def policy(self, states):
    """Run the trainer's internal policy net and pick actions for ``states``.

    For a ``DiscreteActionTrainer`` the states are fed as-is. For a
    ``ContinuousActionDQNTrainer`` every state row is repeated once per
    action and paired with the matching one-hot action row.

    Returns:
        A two-element list: the arg-max index of the fetched Q-scores and an
        index sampled from their softmax distribution.

    Raises:
        NotImplementedError: If the trainer is of neither supported type.
    """
    trainer = self.trainer
    is_discrete = isinstance(trainer, DiscreteActionTrainer)

    with core.DeviceScope(self.c2_device):
        if is_discrete:
            workspace.FeedBlob("states", states)
        elif isinstance(trainer, ContinuousActionDQNTrainer):
            action_count = len(trainer.action_normalization_parameters)
            one_hot_actions = np.tile(
                np.eye(action_count, dtype=np.float32), reps=(len(states), 1)
            )
            states = np.repeat(states, repeats=action_count, axis=0)
            workspace.FeedBlob("states", states)
            workspace.FeedBlob("actions", one_hot_actions)
        else:
            raise NotImplementedError("Invalid trainer passed to GymPredictor")

        workspace.RunNetOnce(trainer.internal_policy_model.net)
        q_scores = workspace.FetchBlob(trainer.internal_policy_output)

        if is_discrete:
            # Only a single state is supported on the discrete path.
            assert q_scores.shape[0] == 1
            q_scores = q_scores[0]

        action_probs = Evaluator.softmax(
            q_scores.reshape(1, -1), trainer.rl_temperature
        )[0]
        degenerate = np.isnan(action_probs).any() or np.max(action_probs) < 1e-3
        if degenerate:
            # Fall back to a uniform distribution when the softmax is unusable.
            action_probs[:] = 1.0 / action_probs.shape[0]

        greedy_action = np.argmax(q_scores)
        sampled_action = np.random.choice(q_scores.shape[0], p=action_probs)
        return [greedy_action, sampled_action]
def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: Optional[np.ndarray],
    logged_propensities: Optional[np.ndarray],
    logged_values: Optional[np.ndarray],
):
    """Run the all-action Q-score net and report its output to ``evaluator``.

    Fetches the action scores, the arg-max action indices, the ``"rewards"``
    blob and the loss blob from the workspace, derives softmax propensities
    and forwards everything to ``evaluator.report``.

    Args:
        evaluator: Object whose ``report`` method receives the statistics.
        logged_actions: Logged actions, multiplied element-wise with the
            action scores (presumably one-hot rows -- confirm with caller).
            When ``None`` the model values on logged actions are reported
            as ``None`` instead of raising.
        logged_propensities: Passed through to ``evaluator.report``.
        logged_values: Passed through to ``evaluator.report``.
    """
    workspace.RunNet(self.all_q_score_model.net)
    all_action_scores = workspace.FetchBlob(self.all_q_score_output)
    maxq_action_idxs = workspace.FetchBlob(self.maxq_action_idxs)

    # ``logged_actions`` is declared Optional, so guard the multiplication.
    model_values_on_logged_actions = None
    if logged_actions is not None:
        model_values_on_logged_actions = np.sum(
            (logged_actions * all_action_scores), axis=1, keepdims=True
        )

    model_propensities = Evaluator.softmax(all_action_scores, self.rl_temperature)
    logged_rewards = workspace.FetchBlob("rewards")

    evaluator.report(
        workspace.FetchBlob(self.loss_blob),
        logged_actions,
        logged_propensities,
        logged_rewards,
        logged_values,
        model_propensities,
        all_action_scores,
        model_values_on_logged_actions,
        maxq_action_idxs,
    )
def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: torch.Tensor,
    logged_propensities: Optional[torch.Tensor],
    logged_rewards: torch.Tensor,
    logged_values: Optional[torch.Tensor],
):
    """Report the latest loss, reward estimates and scores to ``evaluator``.

    Tensors are moved to the CPU and converted to NumPy before being handed
    to ``evaluator.report``. ``self.model_propensities`` is reset and, when
    ``self.all_action_scores`` is set, recomputed from it.

    Args:
        evaluator: Object whose ``report`` method receives the statistics.
        logged_actions: Logged actions, multiplied element-wise with the
            action scores (presumably one-hot rows -- confirm with caller).
        logged_propensities: Optional logged propensities.
        logged_rewards: Logged rewards.
        logged_values: Optional logged values.
    """
    self.model_propensities = None
    model_values_on_logged_actions = None
    maxq_action_idxs = None

    if self.all_action_scores is not None:
        self.model_propensities = Evaluator.softmax(
            self.all_action_scores.cpu().numpy(), self.rl_temperature
        )
        # NOTE(review): this stays a torch tensor while every other argument
        # to report() is a NumPy array -- confirm that is what report expects.
        maxq_action_idxs = self.all_action_scores.argmax(dim=1, keepdim=True)
        if logged_actions is not None:
            model_values_on_logged_actions = (
                torch.sum(
                    (logged_actions * self.all_action_scores), dim=1, keepdim=True
                )
                .cpu()
                .numpy()
            )

    evaluator.report(
        self.loss.cpu().numpy(),
        # Guarded because the branch above already treats this as optional.
        logged_actions.cpu().numpy() if logged_actions is not None else None,
        logged_propensities.cpu().numpy()
        if logged_propensities is not None
        else None,
        logged_rewards.cpu().numpy(),
        logged_values.cpu().numpy() if logged_values is not None else None,
        self.model_propensities,
        self.reward_estimates.cpu().numpy(),
        # The scores may be None (see the check above); do not dereference.
        self.all_action_scores.cpu().numpy()
        if self.all_action_scores is not None
        else None,
        model_values_on_logged_actions,
        maxq_action_idxs,
    )
def policy(self, states):
    """Run the trainer's internal policy net and pick actions for ``states``.

    Only ``DiscreteActionTrainer`` is supported, and the fetched Q-scores
    must contain exactly one row.

    Returns:
        A two-element list: the arg-max index of the Q-scores and an index
        sampled from their softmax distribution.

    Raises:
        NotImplementedError: If the trainer is not a ``DiscreteActionTrainer``.
    """
    trainer = self.trainer

    with core.DeviceScope(self.c2_device):
        if not isinstance(trainer, DiscreteActionTrainer):
            raise NotImplementedError("Invalid trainer passed to GymPredictor")

        workspace.FeedBlob("states", states)
        workspace.RunNetOnce(trainer.internal_policy_model.net)
        q_scores = workspace.FetchBlob(trainer.internal_policy_output)

        # Only a single state is supported.
        assert q_scores.shape[0] == 1
        q_scores = q_scores[0]

        action_probs = Evaluator.softmax(
            q_scores.reshape(1, -1), trainer.rl_temperature
        )[0]
        degenerate = np.isnan(action_probs).any() or np.max(action_probs) < 1e-3
        if degenerate:
            # Fall back to a uniform distribution when the softmax is unusable.
            action_probs[:] = 1.0 / action_probs.shape[0]

        greedy_action = np.argmax(q_scores)
        sampled_action = np.random.choice(q_scores.shape[0], p=action_probs)
        return [greedy_action, sampled_action]
def policy(self, states):
    """Query the trainer's internal prediction and pick actions for ``states``.

    A ``DQNTrainer`` receives the states unchanged. For a
    ``ParametricDQNTrainer`` every state row is repeated once per action and
    horizontally stacked with the matching one-hot action row.

    Returns:
        A two-element list: the arg-max index of the Q-scores and an index
        sampled from their softmax distribution.

    Raises:
        NotImplementedError: If the trainer is of neither supported type.
    """
    trainer = self.trainer
    is_dqn = isinstance(trainer, DQNTrainer)

    if is_dqn:
        net_input = states
    elif isinstance(trainer, ParametricDQNTrainer):
        action_count = len(trainer.action_normalization_parameters)
        one_hot_actions = np.tile(
            np.eye(action_count, dtype=np.float32), reps=(len(states), 1)
        )
        states = np.repeat(states, repeats=action_count, axis=0)
        net_input = np.hstack((states, one_hot_actions))
    else:
        raise NotImplementedError("Invalid trainer passed to GymPredictor")

    q_scores = trainer.internal_prediction(net_input)
    if is_dqn:
        # Only a single state is supported on the DQN path.
        assert q_scores.shape[0] == 1
        q_scores = q_scores[0]

    action_probs = Evaluator.softmax(
        q_scores.reshape(1, -1), trainer.rl_temperature
    )[0]
    degenerate = np.isnan(action_probs).any() or np.max(action_probs) < 1e-3
    if degenerate:
        # Fall back to a uniform distribution when the softmax is unusable.
        action_probs[:] = 1.0 / action_probs.shape[0]

    greedy_action = np.argmax(q_scores)
    sampled_action = np.random.choice(q_scores.shape[0], p=action_probs)
    return [greedy_action, sampled_action]
def evaluate(self, predictor):
    """Evaluate ``predictor`` on the logged data and on fresh episodes.

    Computes normalised absolute errors of the predicted Q-values and reward
    estimates on the logged actions, runs the one-step, sequential, weighted
    and MAGIC doubly-robust estimators, and finally compares the average
    (discounted) reward of 100 test episodes with the logged performance.
    Every metric is appended to the matching history list on ``self`` and
    logged.

    Args:
        predictor: Object exposing ``predict``, ``estimate_reward`` and
            ``trainer``; it is also passed to ``self._env.run_ep_n_times``.
    """

    def log_estimate(label, estimate):
        # All policy-estimation results share one log layout.
        logger.info(
            "{} : normalized {:.3f} raw {:.3f}".format(
                label, estimate.normalized, estimate.raw
            )
        )

    # test only float features
    predictions = predictor.predict(self.logged_states)
    estimated_reward_values = predictor.estimate_reward(self.logged_states)
    if isinstance(predictor.trainer, ParametricDQNTrainer):
        # Reshape the outputs to one column per action.
        predictions = predictions.reshape([-1, self._env.action_dim])
        estimated_reward_values = estimated_reward_values.reshape(
            [-1, self._env.action_dim]
        )

    value_error_sum = 0.0
    reward_error_sum = 0.0
    for i in range(len(self.logged_states)):
        logged_action = self.logged_actions[i]
        logged_value = self.logged_values[i][0]
        target_value = predictions[i][logged_action]
        value_error_sum += abs(logged_value - target_value)
        logged_reward = self.logged_rewards[i][0]
        estimated_reward = estimated_reward_values[i][logged_action]
        reward_error_sum += abs(logged_reward - estimated_reward)

    # Errors are normalised by the total absolute logged magnitude.
    value_error_mean = value_error_sum / np.sum(np.abs(self.logged_values))
    reward_error_mean = reward_error_sum / np.sum(np.abs(self.logged_rewards))
    logger.info("EVAL Q-Value MAE ERROR: {0:.3f}".format(value_error_mean))
    self.mc_loss.append(value_error_mean)
    logger.info("EVAL REWARD MAE ERROR: {0:.3f}".format(reward_error_mean))
    self.reward_loss.append(reward_error_mean)

    target_propensities = Evaluator.softmax(
        predictions, GymEvaluator.SOFTMAX_TEMPERATURE
    )

    (
        reward_inverse_propensity_score,
        reward_direct_method,
        reward_doubly_robust,
    ) = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_propensities,
        target_propensities,
        estimated_reward_values,
    )
    self.reward_inverse_propensity_score.append(reward_inverse_propensity_score)
    self.reward_direct_method.append(reward_direct_method)
    self.reward_doubly_robust.append(reward_doubly_robust)
    log_estimate("Reward Inverse Propensity Score", reward_inverse_propensity_score)
    log_estimate("Reward Direct Method", reward_direct_method)
    log_estimate("Reward Doubly Robust P.E.", reward_doubly_robust)

    (
        value_inverse_propensity_score,
        value_direct_method,
        value_doubly_robust,
    ) = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_values,
        self.logged_propensities,
        target_propensities,
        predictions,
    )
    self.value_inverse_propensity_score.append(value_inverse_propensity_score)
    self.value_direct_method.append(value_direct_method)
    self.value_doubly_robust.append(value_doubly_robust)
    log_estimate("Value Inverse Propensity Score", value_inverse_propensity_score)
    log_estimate("Value Direct Method", value_direct_method)
    log_estimate("Value One-Step Doubly Robust P.E.", value_doubly_robust)

    sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        predictions,
    )
    self.value_sequential_doubly_robust.append(sequential_doubly_robust)
    log_estimate("Value Sequential Doubly Robust P.E.", sequential_doubly_robust)

    weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        predictions,
        num_j_steps=1,
        whether_self_normalize_importance_weights=True,
    )
    self.value_weighted_doubly_robust.append(weighted_doubly_robust)
    # The message previously misspelled "normalized" for this entry only.
    log_estimate(
        "Value Weighted Sequential Doubly Robust P.E.", weighted_doubly_robust
    )

    magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        predictions,
        num_j_steps=GymEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
        whether_self_normalize_importance_weights=True,
    )
    self.value_magic_doubly_robust.append(magic_doubly_robust)
    log_estimate("Value Magic Doubly Robust P.E.", magic_doubly_robust)

    avg_rewards, avg_discounted_rewards = self._env.run_ep_n_times(
        100, predictor, test=True
    )

    # Index following each terminal row, i.e. the first row of the next
    # episode; the last one is dropped because nothing follows it.
    episode_starts = np.nonzero(self.logged_terminals.squeeze())[0] + 1
    logged_discounted_performance = (
        self.logged_values[0][0] + np.sum(self.logged_values[episode_starts[:-1]])
    ) / np.sum(self.logged_terminals)
    true_discounted_value_PE = avg_discounted_rewards / logged_discounted_performance
    self.true_discounted_value_PE.append(true_discounted_value_PE)
    logger.info(
        "True Discounted Value P.E : normalized {0:.3f} raw {1:.3f}".format(
            true_discounted_value_PE, avg_discounted_rewards
        )
    )

    logged_performance = np.sum(self.logged_rewards) / np.sum(self.logged_terminals)
    true_value_PE = avg_rewards / logged_performance
    self.true_value_PE.append(true_value_PE)
    logger.info(
        "True Value P.E : normalized {0:.3f} raw {1:.3f}".format(
            true_value_PE, avg_rewards
        )
    )
def evaluate(self, predictor):
    """Evaluate ``predictor`` on the logged gridworld data.

    Prints the predicted score grid per action for every environment state,
    prints up to ten logged samples whose absolute error exceeds 0.2,
    records the mean absolute error, and runs the one-step, sequential,
    weighted and MAGIC doubly-robust estimators. Every metric is appended to
    the matching history list on ``self`` and logged.

    Args:
        predictor: Object whose ``predict`` returns one ``{action: score}``
            mapping per input row.
    """
    actions = self._env.ACTIONS

    def scores_to_matrix(score_dicts, num_rows):
        # Lay per-row action->score mappings out as a dense matrix; a missing
        # action is filled with 1e-9.
        matrix = np.zeros([len(score_dicts), len(actions)], dtype=np.float32)
        for row in range(num_rows):
            for action_index, action in enumerate(actions):
                matrix[row][action_index] = score_dicts[row].get(action, 1e-9)
        return matrix

    def log_estimate(label, estimate):
        # All policy-estimation results share one log layout.
        logger.info(
            "{} : normalized {:.3f} raw {:.3f}".format(
                label, estimate.normalized, estimate.raw
            )
        )

    # Test feeding float features & int features
    if self.use_int_features:
        float_features, int_features = self._split_int_and_float_features(
            self.logged_states
        )
        # Since all gridworld features are float types, swap these so
        # all inputs are now int_features for testing purpose
        float_features, int_features = int_features, float_features
        prediction_string = predictor.predict(float_features, int_features)
    # Test only feeding float features
    else:
        prediction_string = predictor.predict(self.logged_states)

    # Convert action string to integer
    prediction = scores_to_matrix(prediction_string, len(self.logged_states))

    # Print out scores using all states
    all_states = [{state: 1.0} for state in self._env.STATES]
    if self.use_int_features:
        all_states_float, all_states_int = self._split_int_and_float_features(
            all_states
        )
        all_states_prediction_string = predictor.predict(
            all_states_float, all_states_int
        )
    else:
        all_states_prediction_string = predictor.predict(all_states)
    all_states_prediction = scores_to_matrix(
        all_states_prediction_string, len(all_states)
    )
    # The grid size (5x5) and the number of actions (4) are hard-coded.
    for action_index in range(4):
        print(all_states_prediction[:, action_index].reshape(5, 5), "\n")

    error_sum = 0.0
    num_error_prints = 0
    for x in range(len(self.logged_states)):
        logged_value = self.logged_values[x][0]
        target_value = prediction_string[x].get(self.logged_actions[x], 1e-9)
        error = abs(logged_value - target_value)
        if num_error_prints < 10 and error > 0.2:
            print(
                "GOT THIS STATE WRONG: ",
                x,
                self._env._pos(list(self.logged_states[x].keys())[0]),
                self.logged_actions[x],
                logged_value,
                target_value,
            )
            num_error_prints += 1
            if num_error_prints == 10:
                print("MAX ERRORS PRINTED")
        error_sum += error
    error_mean = error_sum / float(len(self.logged_states))
    logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
    self.mc_loss.append(error_mean)

    target_propensities = Evaluator.softmax(
        prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE
    )

    (
        reward_inverse_propensity_score,
        reward_direct_method,
        reward_doubly_robust,
    ) = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_propensities,
        target_propensities,
        self.estimated_reward_values,
    )
    self.reward_inverse_propensity_score.append(reward_inverse_propensity_score)
    self.reward_direct_method.append(reward_direct_method)
    self.reward_doubly_robust.append(reward_doubly_robust)
    log_estimate("Reward Inverse Propensity Score", reward_inverse_propensity_score)
    log_estimate("Reward Direct Method", reward_direct_method)
    log_estimate("Reward Doubly Robust P.E.", reward_doubly_robust)

    (
        value_inverse_propensity_score,
        value_direct_method,
        value_doubly_robust,
    ) = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_values,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
    )
    self.value_inverse_propensity_score.append(value_inverse_propensity_score)
    self.value_direct_method.append(value_direct_method)
    self.value_doubly_robust.append(value_doubly_robust)
    log_estimate("Value Inverse Propensity Score", value_inverse_propensity_score)
    log_estimate("Value Direct Method", value_direct_method)
    log_estimate("Value One-Step Doubly Robust P.E.", value_doubly_robust)

    sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
    )
    self.value_sequential_doubly_robust.append(sequential_doubly_robust)
    log_estimate("Value Sequential Doubly Robust P.E.", sequential_doubly_robust)

    weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
        num_j_steps=1,
        whether_self_normalize_importance_weights=True,
    )
    self.value_weighted_doubly_robust.append(weighted_doubly_robust)
    # The message previously misspelled "normalized" for this entry only.
    log_estimate(
        "Value Weighted Sequential Doubly Robust P.E.", weighted_doubly_robust
    )

    magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
        num_j_steps=GridworldEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
        whether_self_normalize_importance_weights=True,
    )
    self.value_magic_doubly_robust.append(magic_doubly_robust)
    log_estimate("Value Magic Doubly Robust P.E.", magic_doubly_robust)
def evaluate_predictions(self, prediction, all_states_prediction):
    """Score precomputed gridworld predictions against the logged data.

    Prints the predicted score grid per action, prints up to ten logged
    samples whose absolute error exceeds 0.2, records the mean absolute
    error, and runs the one-step, sequential, weighted and MAGIC
    doubly-robust estimators. Every metric is appended to the matching
    history list on ``self`` and logged.

    Args:
        prediction: Per-sample scores indexed as ``prediction[row][action]``
            for the logged states.
        all_states_prediction: Scores for all environment states; columns
            0-3 are each reshaped to a 5x5 grid for printing.
    """

    def log_estimate(label, estimate):
        # All policy-estimation results share one log layout.
        logger.info(
            "{} : normalized {:.3f} raw {:.3f}".format(
                label, estimate.normalized, estimate.raw
            )
        )

    # The grid size (5x5) and the number of actions (4) are hard-coded.
    for action_index in range(4):
        print(all_states_prediction[:, action_index].reshape(5, 5), "\n")

    error_sum = 0.0
    num_error_prints = 0
    for x in range(len(self.logged_states)):
        int_action = self._env.action_to_index(self.logged_actions[x])
        logged_value = self.logged_values[x][0]
        target_value = prediction[x][int_action]
        error = abs(logged_value - target_value)
        if num_error_prints < 10 and error > 0.2:
            print(
                "GOT THIS STATE WRONG: ",
                x,
                self._env._pos(list(self.logged_states[x].keys())[0]),
                self.logged_actions[x],
                logged_value,
                target_value,
            )
            num_error_prints += 1
            if num_error_prints == 10:
                print("MAX ERRORS PRINTED")
        error_sum += error
    error_mean = error_sum / float(len(self.logged_states))
    logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
    self.mc_loss.append(error_mean)

    target_propensities = Evaluator.softmax(
        prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE
    )

    (
        reward_inverse_propensity_score,
        reward_direct_method,
        reward_doubly_robust,
    ) = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_propensities,
        target_propensities,
        self.estimated_reward_values,
    )
    self.reward_inverse_propensity_score.append(reward_inverse_propensity_score)
    self.reward_direct_method.append(reward_direct_method)
    self.reward_doubly_robust.append(reward_doubly_robust)
    log_estimate("Reward Inverse Propensity Score", reward_inverse_propensity_score)
    log_estimate("Reward Direct Method", reward_direct_method)
    log_estimate("Reward Doubly Robust P.E.", reward_doubly_robust)

    (
        value_inverse_propensity_score,
        value_direct_method,
        value_doubly_robust,
    ) = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_values,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
    )
    self.value_inverse_propensity_score.append(value_inverse_propensity_score)
    self.value_direct_method.append(value_direct_method)
    self.value_doubly_robust.append(value_doubly_robust)
    log_estimate("Value Inverse Propensity Score", value_inverse_propensity_score)
    log_estimate("Value Direct Method", value_direct_method)
    log_estimate("Value One-Step Doubly Robust P.E.", value_doubly_robust)

    sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
    )
    self.value_sequential_doubly_robust.append(sequential_doubly_robust)
    log_estimate("Value Sequential Doubly Robust P.E.", sequential_doubly_robust)

    weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
        num_j_steps=1,
        whether_self_normalize_importance_weights=True,
    )
    self.value_weighted_doubly_robust.append(weighted_doubly_robust)
    # The message previously misspelled "normalized" for this entry only.
    log_estimate(
        "Value Weighted Sequential Doubly Robust P.E.", weighted_doubly_robust
    )

    magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
        num_j_steps=GridworldEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
        whether_self_normalize_importance_weights=True,
    )
    self.value_magic_doubly_robust.append(magic_doubly_robust)
    log_estimate("Value Magic Doubly Robust P.E.", magic_doubly_robust)
def train(
    self,
    training_samples: TrainingDataPage,
    evaluator: Optional[Evaluator] = None
):
    """Run one training step on a minibatch.

    Updates the Q-network against a TD target (or the raw boosted reward
    while ``self.minibatch < self.reward_burnin``), updates the target
    network, fits the separate reward network on the logged actions, and
    reports both losses. When ``evaluator`` is given, a ``BatchStatsForCPE``
    built from this batch is reported to it.

    Args:
        training_samples: Minibatch of transitions; tensor shapes are
            asserted only on the very first call.
        evaluator: Optional object whose ``report`` receives the batch stats.

    Returns:
        A dict with key ``"model_rewards"`` holding the reward network's
        estimates for this batch as a NumPy array.
    """
    if self.minibatch == 0:
        # Assume that the tensors are the right shape after the first minibatch
        assert (
            training_samples.states.shape[0] == self.minibatch_size
        ), "Invalid shape: " + str(training_samples.states.shape)
        assert training_samples.actions.shape == torch.Size(
            [self.minibatch_size, len(self._actions)]
        ), "Invalid shape: " + str(training_samples.actions.shape)
        assert training_samples.rewards.shape == torch.Size(
            [self.minibatch_size, 1]
        ), "Invalid shape: " + str(training_samples.rewards.shape)
        assert (
            training_samples.next_states.shape == training_samples.states.shape
        ), "Invalid shape: " + str(training_samples.next_states.shape)
        assert (
            training_samples.not_terminals.shape == training_samples.rewards.shape
        ), "Invalid shape: " + str(training_samples.not_terminals.shape)
        if training_samples.possible_next_actions is not None:
            assert (
                training_samples.possible_next_actions.shape
                == training_samples.actions.shape
            ), "Invalid shape: " + str(training_samples.possible_next_actions.shape)
        if training_samples.propensities is not None:
            assert (
                training_samples.propensities.shape == training_samples.rewards.shape
            ), "Invalid shape: " + str(training_samples.propensities.shape)

    # Apply reward boost if specified
    reward_boosts = torch.sum(
        training_samples.actions.float() * self.reward_boosts, dim=1, keepdim=True
    )
    boosted_rewards = training_samples.rewards + reward_boosts

    # The counter is incremented before the burn-in comparisons below.
    self.minibatch += 1

    states = training_samples.states.detach().requires_grad_(True)
    actions = training_samples.actions
    rewards = boosted_rewards
    next_states = training_samples.next_states
    discount_tensor = torch.full(
        training_samples.time_diffs.shape, self.gamma
    ).type(self.dtype)
    not_done_mask = training_samples.not_terminals

    if self.use_seq_num_diff_as_time_diff:
        # Discount becomes gamma ** time_diff for each sample.
        discount_tensor = discount_tensor.pow(training_samples.time_diffs)

    if self.maxq_learning:
        # Compute max a' Q(s', a') over all possible actions using target network
        possible_next_actions = training_samples.possible_next_actions
        next_q_values = self.get_max_q_values(
            next_states, possible_next_actions, self.double_q_learning
        )
    else:
        # SARSA
        next_actions = training_samples.next_actions
        next_q_values = self.get_next_action_q_values(next_states, next_actions)

    # Zero the bootstrap term where not_done_mask is zero.
    filtered_next_q_vals = next_q_values * not_done_mask

    if self.minibatch < self.reward_burnin:
        # During burn-in the target is the boosted reward alone.
        target_q_values = rewards
    else:
        target_q_values = rewards + (discount_tensor * filtered_next_q_vals)

    # Get Q-value of action taken
    all_q_values = self.q_network(states)
    self.all_action_scores = all_q_values.detach()
    q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

    loss = self.q_network_loss(q_values, target_q_values)
    self.loss = loss.detach()

    self.q_network_optimizer.zero_grad()
    loss.backward()
    if self.gradient_handler:
        # Optional hook applied to the parameters after backward, before step.
        self.gradient_handler(self.q_network.parameters())
    self.q_network_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.q_network, self.q_network_target, 1.0)
    else:
        # Use the soft update rule to update target network
        self._soft_update(self.q_network, self.q_network_target, self.tau)

    # get reward estimates
    reward_estimates = self.reward_network(states)
    self.reward_estimates = reward_estimates.detach()
    # Select the estimate at the arg-max column of each logged action row.
    reward_estimates_for_logged_actions = reward_estimates.gather(
        1, actions.argmax(dim=1, keepdim=True)
    )
    reward_loss = F.mse_loss(reward_estimates_for_logged_actions, rewards)
    self.reward_network_optimizer.zero_grad()
    reward_loss.backward()
    self.reward_network_optimizer.step()

    self.loss_reporter.report(td_loss=float(self.loss), reward_loss=float(reward_loss))

    training_metadata = {}
    if evaluator is not None:
        model_propensities = torch.from_numpy(
            Evaluator.softmax(self.all_action_scores.cpu().numpy(), self.rl_temperature)
        )
        cpe_stats = BatchStatsForCPE(
            logged_actions=training_samples.actions,
            logged_propensities=training_samples.propensities,
            logged_rewards=rewards,
            logged_values=None,  # Compute at end of each epoch for CPE
            model_propensities=model_propensities,
            model_rewards=self.reward_estimates,
            model_values=self.all_action_scores,
            model_values_on_logged_actions=None,  # Compute at end of each epoch for CPE
            model_action_idxs=self.all_action_scores.argmax(dim=1, keepdim=True),
        )
        evaluator.report(cpe_stats)

    training_metadata["model_rewards"] = self.reward_estimates.cpu().numpy()
    return training_metadata
def train(self, training_samples: TrainingDataPage):
    """Run one training step on a minibatch.

    Updates the Q-network against a TD target (or the raw boosted reward
    while ``self.minibatch < self.reward_burnin``), then trains two auxiliary
    networks used for counterfactual policy evaluation (CPE): a reward
    network and a metric Q-network with its own target network. Losses and
    per-batch CPE inputs are sent to ``self.loss_reporter``.

    Args:
        training_samples: Minibatch of transitions; tensor shapes are
            asserted only on the very first call.

    Returns:
        A dict with key ``"model_rewards"`` holding the reward network's
        output for this batch as a NumPy array.
    """
    if self.minibatch == 0:
        # Assume that the tensors are the right shape after the first minibatch
        assert (
            training_samples.states.shape[0] == self.minibatch_size
        ), "Invalid shape: " + str(training_samples.states.shape)
        assert training_samples.actions.shape == torch.Size(
            [self.minibatch_size, len(self._actions)]
        ), "Invalid shape: " + str(training_samples.actions.shape)
        assert training_samples.rewards.shape == torch.Size(
            [self.minibatch_size, 1]
        ), "Invalid shape: " + str(training_samples.rewards.shape)
        assert (
            training_samples.next_states.shape == training_samples.states.shape
        ), "Invalid shape: " + str(training_samples.next_states.shape)
        assert (
            training_samples.not_terminal.shape == training_samples.rewards.shape
        ), "Invalid shape: " + str(training_samples.not_terminal.shape)
        if training_samples.possible_next_actions_mask is not None:
            assert (
                training_samples.possible_next_actions_mask.shape
                == training_samples.actions.shape
            ), "Invalid shape: " + str(
                training_samples.possible_next_actions_mask.shape
            )
        if training_samples.propensities is not None:
            assert (
                training_samples.propensities.shape == training_samples.rewards.shape
            ), "Invalid shape: " + str(training_samples.propensities.shape)
        if training_samples.metrics is not None:
            assert (
                training_samples.metrics.shape[0] == self.minibatch_size
            ), "Invalid shape: " + str(training_samples.metrics.shape)

    boosted_rewards = self.boost_rewards(
        training_samples.rewards, training_samples.actions
    )

    # The counter is incremented before the burn-in comparisons below.
    self.minibatch += 1

    states = training_samples.states.detach().requires_grad_(True)
    actions = training_samples.actions
    rewards = boosted_rewards
    discount_tensor = torch.full(
        training_samples.time_diffs.shape, self.gamma
    ).type(self.dtype)
    not_done_mask = training_samples.not_terminal

    if self.use_seq_num_diff_as_time_diff:
        # Discount becomes gamma ** time_diff for each sample.
        discount_tensor = discount_tensor.pow(training_samples.time_diffs)

    all_next_q_values, all_next_q_values_target = self.get_detached_q_values(
        training_samples.next_states
    )
    if self.maxq_learning:
        # Compute max a' Q(s', a') over all possible actions using target network
        next_q_values, max_q_action_idxs = self.get_max_q_values(
            all_next_q_values,
            all_next_q_values_target,
            training_samples.possible_next_actions_mask,
        )
    else:
        # SARSA
        next_q_values, max_q_action_idxs = self.get_max_q_values(
            all_next_q_values,
            all_next_q_values_target,
            training_samples.next_actions,
        )

    # Zero the bootstrap term where not_done_mask is zero.
    filtered_next_q_vals = next_q_values * not_done_mask

    if self.minibatch < self.reward_burnin:
        # During burn-in the target is the boosted reward alone.
        target_q_values = rewards
    else:
        target_q_values = rewards + (discount_tensor * filtered_next_q_vals)

    # Get Q-value of action taken
    all_q_values = self.q_network(states)
    self.all_action_scores = all_q_values.detach()
    q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

    loss = self.q_network_loss(q_values, target_q_values)
    self.loss = loss.detach()

    self.q_network_optimizer.zero_grad()
    loss.backward()
    if self.gradient_handler:
        # Optional hook applied to the parameters after backward, before step.
        self.gradient_handler(self.q_network.parameters())
    self.q_network_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.q_network, self.q_network_target, 1.0)
    else:
        # Use the soft update rule to update target network
        self._soft_update(self.q_network, self.q_network_target, self.tau)

    # The CPE networks regress onto the metrics (if any) concatenated with
    # the un-boosted rewards.
    if training_samples.metrics is None:
        metrics_reward_concat_real_vals = training_samples.rewards
    else:
        metrics_reward_concat_real_vals = torch.cat(
            (training_samples.metrics, training_samples.rewards), dim=1
        )

    ######### Train separate reward network for CPE evaluation #############
    reward_estimates = self.reward_network(states)
    logged_action_idxs = actions.argmax(dim=1, keepdim=True)
    reward_estimates_for_logged_actions = reward_estimates.gather(
        1, self.reward_idx_offsets + logged_action_idxs
    )
    reward_loss = F.mse_loss(
        reward_estimates_for_logged_actions, metrics_reward_concat_real_vals
    )
    self.reward_network_optimizer.zero_grad()
    reward_loss.backward()
    self.reward_network_optimizer.step()

    ######### Train separate q-network for CPE evaluation #############
    metric_q_values = self.q_network_cpe(states).gather(
        1, self.reward_idx_offsets + logged_action_idxs
    )
    # The bootstrap term must come from the NEXT states: max_q_action_idxs
    # was selected from the next-state Q-values above, so evaluating the
    # target network on the current states would mix up s and s'.
    metric_target_q_values = self.q_network_cpe_target(
        training_samples.next_states
    ).detach()
    max_q_values_metrics = metric_target_q_values.gather(
        1, self.reward_idx_offsets + max_q_action_idxs
    )
    filtered_max_q_values_metrics = max_q_values_metrics * not_done_mask
    if self.minibatch < self.reward_burnin:
        target_metric_q_values = metrics_reward_concat_real_vals
    else:
        target_metric_q_values = metrics_reward_concat_real_vals + (
            discount_tensor * filtered_max_q_values_metrics
        )
    metric_q_value_loss = self.q_network_loss(metric_q_values, target_metric_q_values)
    self.q_network_cpe.zero_grad()
    metric_q_value_loss.backward()
    self.q_network_cpe_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.q_network_cpe, self.q_network_cpe_target, 1.0)
    else:
        # Use the soft update rule to update target network
        self._soft_update(self.q_network_cpe, self.q_network_cpe_target, self.tau)

    model_propensities = torch.from_numpy(
        Evaluator.softmax(self.all_action_scores.cpu().numpy(), self.rl_temperature)
    )

    self.loss_reporter.report(
        td_loss=self.loss,
        reward_loss=reward_loss,
        logged_actions=logged_action_idxs,
        logged_propensities=training_samples.propensities,
        logged_rewards=rewards,
        logged_values=None,  # Compute at end of each epoch for CPE
        model_propensities=model_propensities,
        # Columns [offset, offset + num_actions) of the reward network output.
        model_rewards=reward_estimates[
            :,
            torch.arange(
                self.reward_idx_offsets[0],
                self.reward_idx_offsets[0] + self.num_actions,
            ),
        ],
        model_values=self.all_action_scores,
        model_values_on_logged_actions=None,  # Compute at end of each epoch for CPE
        model_action_idxs=self.all_action_scores.argmax(dim=1, keepdim=True),
    )

    training_metadata = {}
    training_metadata["model_rewards"] = reward_estimates.detach().cpu().numpy()
    return training_metadata
def evaluate(self, predictor):
    """Evaluate ``predictor`` on the logged gridworld data.

    Records the mean absolute error between logged values and the predicted
    score of each logged action, then runs the doubly-robust policy
    estimation once on the logged values and once on the logged rewards.
    Every metric is appended to the matching history list on ``self`` and
    logged.

    Args:
        predictor: Object whose ``predict`` returns one ``{action: score}``
            mapping per input row.
    """
    # Test feeding float features & int features
    if self.use_int_features:
        float_features, int_features = self._split_int_and_float_features(
            self.logged_states
        )
        # Since all gridworld features are float types, swap these so
        # all inputs are now int_features for testing purpose
        float_features, int_features = int_features, float_features
        prediction_string = predictor.predict(float_features, int_features)
    # Test only feeding float features
    else:
        prediction_string = predictor.predict(self.logged_states)

    sample_count = len(self.logged_states)
    env_actions = self._env.ACTIONS

    # Convert action string to integer
    prediction = np.zeros(
        [len(prediction_string), len(env_actions)], dtype=np.float32
    )
    for row in range(sample_count):
        row_scores = prediction_string[row]
        for column, action in enumerate(env_actions):
            prediction[row][column] = row_scores[action]

    error_sum = sum(
        (
            abs(self.logged_values[row][0] - prediction_string[row][self.logged_actions[row]])
            for row in range(sample_count)
        ),
        0.0,
    )
    error_mean = error_sum / float(sample_count)
    logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
    self.mc_loss.append(error_mean)

    target_propensities = Evaluator.softmax(
        prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE
    )

    value_estimates = self.doubly_robust_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_values,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
    )
    (
        value_inverse_propensity_score,
        value_direct_method,
        value_doubly_robust,
    ) = value_estimates
    self.value_inverse_propensity_score.append(value_inverse_propensity_score)
    self.value_direct_method.append(value_direct_method)
    self.value_doubly_robust.append(value_doubly_robust)
    logger.info(
        "Value Inverse Propensity Score : {0:.3f}".format(
            value_inverse_propensity_score
        )
    )
    logger.info("Value Direct Method : {0:.3f}".format(value_direct_method))
    logger.info("Value Doubly Robust P.E. : {0:.3f}".format(value_doubly_robust))

    reward_estimates = self.doubly_robust_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_propensities,
        target_propensities,
        self.estimated_reward_values,
    )
    (
        reward_inverse_propensity_score,
        reward_direct_method,
        reward_doubly_robust,
    ) = reward_estimates
    self.reward_inverse_propensity_score.append(reward_inverse_propensity_score)
    self.reward_direct_method.append(reward_direct_method)
    self.reward_doubly_robust.append(reward_doubly_robust)
    logger.info(
        "Reward Inverse Propensity Score: {0:.3f}".format(
            reward_inverse_propensity_score
        )
    )
    logger.info("Reward Direct Method : {0:.3f}".format(reward_direct_method))
    logger.info("Reward Doubly Robust P.E. : {0:.3f}".format(reward_doubly_robust))