def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: Optional[np.ndarray],
    logged_propensities: Optional[np.ndarray],
):
    workspace.RunNet(self.all_q_score_model.net)
    all_action_scores = workspace.FetchBlob(self.all_q_score_output)
    maxq_action_idxs = workspace.FetchBlob(self.maxq_action_idxs)
    model_values_on_logged_actions = np.sum(
        (logged_actions * all_action_scores), axis=1, keepdims=True
    )
    model_propensities = Evaluator.softmax(all_action_scores, self.rl_temperature)
    logged_rewards = workspace.FetchBlob("rewards")

    cpe_stats = BatchStatsForCPE(
        td_loss=workspace.FetchBlob(self.loss_blob),
        logged_actions=logged_actions,
        logged_propensities=logged_propensities,
        logged_rewards=logged_rewards,
        logged_values=None,
        model_propensities=model_propensities,
        model_rewards=None,
        model_values=all_action_scores,
        model_values_on_logged_actions=model_values_on_logged_actions,
        model_action_idxs=maxq_action_idxs,
    )
    evaluator.report(cpe_stats)
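
# For reference, the temperature softmax used above to turn Q-scores into model
# propensities can be sketched as below. This is a minimal NumPy sketch under the
# usual definition; softmax_with_temperature is a hypothetical name and the actual
# Evaluator.softmax implementation may differ in its details.
import numpy as np


def softmax_with_temperature(scores: np.ndarray, temperature: float) -> np.ndarray:
    # Scale by temperature, subtract the per-row max for numerical stability,
    # then normalize each row to sum to 1.
    scaled = scores / temperature
    scaled = scaled - scaled.max(axis=1, keepdims=True)
    exp_scores = np.exp(scaled)
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)
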
def train(self, training_samples: TrainingDataPage, evaluator=None) -> None:
    if self.minibatch == 0:
        # Assume that the tensors are the right shape after the first minibatch
        assert (
            training_samples.states.shape[0] == self.minibatch_size
        ), "Invalid shape: " + str(training_samples.states.shape)
        assert (
            training_samples.actions.shape[0] == self.minibatch_size
        ), "Invalid shape: " + str(training_samples.actions.shape)
        assert training_samples.rewards.shape == torch.Size(
            [self.minibatch_size, 1]
        ), "Invalid shape: " + str(training_samples.rewards.shape)
        assert (
            training_samples.next_states.shape == training_samples.states.shape
        ), "Invalid shape: " + str(training_samples.next_states.shape)
        assert (
            training_samples.not_terminals.shape == training_samples.rewards.shape
        ), "Invalid shape: " + str(training_samples.not_terminals.shape)
        if self.use_seq_num_diff_as_time_diff:
            assert (
                training_samples.time_diffs.shape == training_samples.rewards.shape
            ), "Invalid shape: " + str(training_samples.time_diffs.shape)

    self.minibatch += 1

    states = training_samples.states.detach().requires_grad_(True)
    actions = training_samples.actions.detach().requires_grad_(True)

    # As far as ddpg is concerned all actions are [-1, 1] due to actor tanh
    actions = rescale_torch_tensor(
        actions,
        new_min=self.min_action_range_tensor_training,
        new_max=self.max_action_range_tensor_training,
        prev_min=self.min_action_range_tensor_serving,
        prev_max=self.max_action_range_tensor_serving,
    )
    rewards = training_samples.rewards
    next_states = training_samples.next_states
    time_diffs = training_samples.time_diffs
    discount_tensor = torch.tensor(np.full(rewards.shape, self.gamma)).type(
        self.dtype
    )
    not_done_mask = training_samples.not_terminals

    # Optimize the critic network subject to mean squared error:
    # L = ([r + gamma * Q(s2, a2)] - Q(s1, a1)) ^ 2
    q_s1_a1 = self.critic(torch.cat((states, actions), dim=1))
    next_actions = self.actor_target(next_states)

    next_state_actions = torch.cat((next_states, next_actions), dim=1)
    q_s2_a2 = self.critic_target(next_state_actions)
    filtered_q_s2_a2 = not_done_mask * q_s2_a2

    if self.use_seq_num_diff_as_time_diff:
        discount_tensor = discount_tensor.pow(time_diffs)

    if self.minibatch < self.reward_burnin:
        target_q_values = rewards
    else:
        target_q_values = rewards + (discount_tensor * filtered_q_s2_a2)

    # compute loss and update the critic network
    critic_predictions = q_s1_a1
    loss_critic = self.q_network_loss(critic_predictions, target_q_values.detach())
    loss_critic_for_eval = loss_critic.detach()
    self.critic_optimizer.zero_grad()
    loss_critic.backward()
    self.critic_optimizer.step()

    # Optimize the actor network subject to the following:
    # max mean(Q(s1, a1)) or min -mean(Q(s1, a1))
    loss_actor = -self.critic(torch.cat((states, self.actor(states)), dim=1)).mean()
    self.actor_optimizer.zero_grad()
    loss_actor.backward()
    self.actor_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.actor, self.actor_target, 1.0)
        self._soft_update(self.critic, self.critic_target, 1.0)
    else:
        # Use the soft update rule to update both target networks
        self._soft_update(self.actor, self.actor_target, self.tau)
        self._soft_update(self.critic, self.critic_target, self.tau)

    if evaluator is not None:
        cpe_stats = BatchStatsForCPE(td_loss=loss_critic_for_eval.cpu().numpy())
        evaluator.report(cpe_stats)
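
# The _soft_update calls above follow the standard Polyak-averaging target update.
# A minimal sketch, assuming plain torch.nn.Module networks; soft_update is a
# hypothetical name and the trainer's actual helper may differ.
import torch


def soft_update(network: torch.nn.Module, target_network: torch.nn.Module, tau: float) -> None:
    # target <- tau * online + (1 - tau) * target, applied parameter-wise.
    for param, target_param in zip(network.parameters(), target_network.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)


# With tau=1.0 this simply copies the online weights, which is what the
# reward-burnin branch relies on.
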
def train(self, training_batch, evaluator=None) -> None:
    if hasattr(training_batch, "as_parametric_sarsa_training_batch"):
        training_batch = training_batch.as_parametric_sarsa_training_batch()

    learning_input = training_batch.training_input
    self.minibatch += 1

    reward = learning_input.reward
    discount_tensor = torch.full_like(reward, self.gamma)
    not_done_mask = learning_input.not_terminal

    if self.use_seq_num_diff_as_time_diff:
        # TODO: Implement this in another diff
        raise NotImplementedError

    if self.maxq_learning:
        # Compute max a' Q(s', a') over all possible actions using target network
        next_q_values = self.get_max_q_values(
            learning_input.tiled_next_state,
            learning_input.possible_next_actions,
            self.double_q_learning,
        )
    else:
        # SARSA
        next_q_values = self.get_next_action_q_values(
            learning_input.next_state, learning_input.next_action
        )

    filtered_max_q_vals = next_q_values.reshape(-1, 1) * not_done_mask

    if self.minibatch < self.reward_burnin:
        target_q_values = reward
    else:
        target_q_values = reward + (discount_tensor * filtered_max_q_vals)

    # Get Q-value of action taken
    current_state_action = rlt.StateAction(
        state=learning_input.state, action=learning_input.action
    )
    q_values = self.q_network(current_state_action).q_value
    self.all_action_scores = q_values.detach()

    value_loss = self.q_network_loss(q_values, target_q_values)
    self.loss = value_loss.detach()

    self.q_network_optimizer.zero_grad()
    value_loss.backward()
    if self.gradient_handler:
        self.gradient_handler(self.q_network.parameters())
    self.q_network_optimizer.step()

    # TODO: Maybe soft_update should belong to the target network
    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.q_network, self.q_network_target, 1.0)
    else:
        # Use the soft update rule to update target network
        self._soft_update(self.q_network, self.q_network_target, self.tau)

    # get reward estimates
    reward_estimates = self.reward_network(current_state_action).q_value
    reward_loss = F.mse_loss(reward_estimates, reward)
    self.reward_network_optimizer.zero_grad()
    reward_loss.backward()
    self.reward_network_optimizer.step()

    self.loss_reporter.report(
        td_loss=float(self.loss), reward_loss=float(reward_loss)
    )

    if evaluator is not None:
        cpe_stats = BatchStatsForCPE(
            model_values_on_logged_actions=self.all_action_scores
        )
        evaluator.report(cpe_stats)
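
# When double_q_learning is enabled, get_max_q_values typically selects the argmax
# action with the online network but scores it with the target network, which
# reduces over-estimation bias. A minimal sketch under that assumption, for
# per-action Q matrices of shape [batch, num_actions]; double_q_max is a
# hypothetical helper name.
import torch


def double_q_max(q_online: torch.Tensor, q_target: torch.Tensor) -> torch.Tensor:
    best_actions = q_online.argmax(dim=1, keepdim=True)  # choose with online net
    return q_target.gather(1, best_actions)              # evaluate with target net
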
def train(self, training_samples: TrainingDataPage, evaluator=None) -> None:
    if self.minibatch == 0:
        # Assume that the tensors are the right shape after the first minibatch
        assert (
            training_samples.states.shape[0] == self.minibatch_size
        ), "Invalid shape: " + str(training_samples.states.shape)
        assert (
            training_samples.actions.shape[0] == self.minibatch_size
        ), "Invalid shape: " + str(training_samples.actions.shape)
        assert training_samples.rewards.shape == torch.Size(
            [self.minibatch_size, 1]
        ), "Invalid shape: " + str(training_samples.rewards.shape)
        assert (
            training_samples.next_states.shape == training_samples.states.shape
        ), "Invalid shape: " + str(training_samples.next_states.shape)
        assert (
            training_samples.not_terminals.shape == training_samples.rewards.shape
        ), "Invalid shape: " + str(training_samples.not_terminals.shape)
        assert training_samples.possible_next_actions_state_concat.shape[1] == (
            training_samples.states.shape[1] + training_samples.actions.shape[1]
        ), "Invalid shape: " + str(
            training_samples.possible_next_actions_state_concat.shape
        )
        assert training_samples.possible_next_actions_lengths.shape == torch.Size(
            [self.minibatch_size]
        ), "Invalid shape: " + str(
            training_samples.possible_next_actions_lengths.shape
        )

    self.minibatch += 1

    states = training_samples.states.detach().requires_grad_(True)
    actions = training_samples.actions
    state_action_pairs = torch.cat((states, actions), dim=1)

    rewards = training_samples.rewards
    discount_tensor = torch.full(training_samples.time_diffs.shape, self.gamma).type(
        self.dtype
    )
    not_done_mask = training_samples.not_terminals

    if self.use_seq_num_diff_as_time_diff:
        discount_tensor = discount_tensor.pow(training_samples.time_diffs)

    if self.maxq_learning:
        # Compute max a' Q(s', a') over all possible actions using target network
        next_q_values = self.get_max_q_values(
            training_samples.possible_next_actions_state_concat,
            training_samples.possible_next_actions_lengths,
            self.double_q_learning,
        )
    else:
        # SARSA
        next_state_action_pairs = torch.cat(
            (training_samples.next_states, training_samples.next_actions), dim=1
        )
        next_q_values = self.get_next_action_q_values(next_state_action_pairs)

    filtered_max_q_vals = next_q_values.reshape(-1, 1) * not_done_mask

    if self.minibatch < self.reward_burnin:
        target_q_values = rewards
    else:
        target_q_values = rewards + (discount_tensor * filtered_max_q_vals)

    # Get Q-value of action taken
    q_values = self.q_network(state_action_pairs)
    all_action_scores = q_values.detach()
    self.model_values_on_logged_actions = q_values.detach()

    value_loss = self.q_network_loss(q_values, target_q_values)
    self.loss = value_loss.detach()

    self.q_network_optimizer.zero_grad()
    value_loss.backward()
    if self.gradient_handler:
        self.gradient_handler(self.q_network.parameters())
    self.q_network_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.q_network, self.q_network_target, 1.0)
    else:
        # Use the soft update rule to update target network
        self._soft_update(self.q_network, self.q_network_target, self.tau)

    # get reward estimates
    reward_estimates = self.reward_network(state_action_pairs)
    reward_loss = F.mse_loss(reward_estimates, rewards)
    self.reward_network_optimizer.zero_grad()
    reward_loss.backward()
    self.reward_network_optimizer.step()

    self.loss_reporter.report(
        td_loss=float(self.loss), reward_loss=float(reward_loss)
    )

    if evaluator is not None:
        cpe_stats = BatchStatsForCPE(
            model_values_on_logged_actions=all_action_scores
        )
        evaluator.report(cpe_stats)
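
# possible_next_actions_state_concat flattens every (next_state, possible_action)
# pair into one batch, and possible_next_actions_lengths records how many possible
# actions each state has. A minimal sketch of taking the per-state max over such a
# ragged layout; max_q_over_ragged_actions is a hypothetical helper, and the real
# get_max_q_values may be vectorized differently and also handle double Q-learning.
import torch


def max_q_over_ragged_actions(q_values_flat: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    maxes = []
    offset = 0
    for n in lengths.tolist():
        if n == 0:
            # No possible next action (e.g. terminal state): contribute zero.
            maxes.append(q_values_flat.new_zeros(1))
        else:
            maxes.append(q_values_flat[offset:offset + n].max().reshape(1))
        offset += n
    return torch.cat(maxes).reshape(-1, 1)
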
def train(
    self, training_samples: TrainingDataPage, evaluator: Optional[Evaluator] = None
):
    if self.minibatch == 0:
        # Assume that the tensors are the right shape after the first minibatch
        assert (
            training_samples.states.shape[0] == self.minibatch_size
        ), "Invalid shape: " + str(training_samples.states.shape)
        assert training_samples.actions.shape == torch.Size(
            [self.minibatch_size, len(self._actions)]
        ), "Invalid shape: " + str(training_samples.actions.shape)
        assert training_samples.rewards.shape == torch.Size(
            [self.minibatch_size, 1]
        ), "Invalid shape: " + str(training_samples.rewards.shape)
        assert (
            training_samples.next_states.shape == training_samples.states.shape
        ), "Invalid shape: " + str(training_samples.next_states.shape)
        assert (
            training_samples.not_terminals.shape == training_samples.rewards.shape
        ), "Invalid shape: " + str(training_samples.not_terminals.shape)
        if training_samples.possible_next_actions is not None:
            assert (
                training_samples.possible_next_actions.shape
                == training_samples.actions.shape
            ), "Invalid shape: " + str(training_samples.possible_next_actions.shape)
        if training_samples.propensities is not None:
            assert (
                training_samples.propensities.shape == training_samples.rewards.shape
            ), "Invalid shape: " + str(training_samples.propensities.shape)

    # Apply reward boost if specified
    reward_boosts = torch.sum(
        training_samples.actions.float() * self.reward_boosts, dim=1, keepdim=True
    )
    boosted_rewards = training_samples.rewards + reward_boosts

    self.minibatch += 1
    states = training_samples.states.detach().requires_grad_(True)
    actions = training_samples.actions
    rewards = boosted_rewards
    next_states = training_samples.next_states
    discount_tensor = torch.full(training_samples.time_diffs.shape, self.gamma).type(
        self.dtype
    )
    not_done_mask = training_samples.not_terminals

    if self.use_seq_num_diff_as_time_diff:
        discount_tensor = discount_tensor.pow(training_samples.time_diffs)

    if self.maxq_learning:
        # Compute max a' Q(s', a') over all possible actions using target network
        possible_next_actions = training_samples.possible_next_actions
        next_q_values = self.get_max_q_values(
            next_states, possible_next_actions, self.double_q_learning
        )
    else:
        # SARSA
        next_actions = training_samples.next_actions
        next_q_values = self.get_next_action_q_values(next_states, next_actions)

    filtered_next_q_vals = next_q_values * not_done_mask

    if self.minibatch < self.reward_burnin:
        target_q_values = rewards
    else:
        target_q_values = rewards + (discount_tensor * filtered_next_q_vals)

    # Get Q-value of action taken
    all_q_values = self.q_network(states)
    self.all_action_scores = all_q_values.detach()
    q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

    loss = self.q_network_loss(q_values, target_q_values)
    self.loss = loss.detach()

    self.q_network_optimizer.zero_grad()
    loss.backward()
    if self.gradient_handler:
        self.gradient_handler(self.q_network.parameters())
    self.q_network_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.q_network, self.q_network_target, 1.0)
    else:
        # Use the soft update rule to update target network
        self._soft_update(self.q_network, self.q_network_target, self.tau)

    # get reward estimates
    reward_estimates = self.reward_network(states)
    self.reward_estimates = reward_estimates.detach()
    reward_estimates_for_logged_actions = reward_estimates.gather(
        1, actions.argmax(dim=1, keepdim=True)
    )
    reward_loss = F.mse_loss(reward_estimates_for_logged_actions, rewards)
    self.reward_network_optimizer.zero_grad()
    reward_loss.backward()
    self.reward_network_optimizer.step()

    self.loss_reporter.report(
        td_loss=float(self.loss), reward_loss=float(reward_loss)
    )

    training_metadata = {}
    if evaluator is not None:
        model_propensities = torch.from_numpy(
            Evaluator.softmax(
                self.all_action_scores.cpu().numpy(), self.rl_temperature
            )
        )
        cpe_stats = BatchStatsForCPE(
            logged_actions=training_samples.actions,
            logged_propensities=training_samples.propensities,
            logged_rewards=rewards,
            logged_values=None,  # Compute at end of each epoch for CPE
            model_propensities=model_propensities,
            model_rewards=self.reward_estimates,
            model_values=self.all_action_scores,
            model_values_on_logged_actions=None,  # Compute at end of each epoch for CPE
            model_action_idxs=self.all_action_scores.argmax(dim=1, keepdim=True),
        )
        evaluator.report(cpe_stats)
        training_metadata["model_rewards"] = self.reward_estimates.cpu().numpy()

    return training_metadata
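
# A small worked example of the reward-boost step above, with hypothetical numbers:
# reward_boosts holds one boost per discrete action, and multiplying by the one-hot
# logged actions picks out the boost for the action that was actually taken.
import torch

reward_boosts = torch.tensor([[0.0, 0.5, -1.0]])               # boost per action (hypothetical)
logged_actions = torch.tensor([[0, 1, 0], [1, 0, 0]]).float()  # one-hot logged actions
boost_per_row = torch.sum(logged_actions * reward_boosts, dim=1, keepdim=True)
# boost_per_row == [[0.5], [0.0]]; it is then added to the logged rewards.
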
def train(self, training_batch, evaluator=None) -> None:
    """
    IMPORTANT: the input action here is assumed to be preprocessed to match the
    range of the output of the actor.
    """
    if hasattr(training_batch, "as_parametric_sarsa_training_batch"):
        training_batch = training_batch.as_parametric_sarsa_training_batch()

    learning_input = training_batch.training_input
    self.minibatch += 1

    state = learning_input.state
    action = learning_input.action
    reward = learning_input.reward
    discount = torch.full_like(reward, self.gamma)
    not_done_mask = learning_input.not_terminal

    current_state_action = rlt.StateAction(state=state, action=action)

    q1_value = self.q1_network(current_state_action).q_value
    min_q_value = q1_value

    if self.q2_network:
        q2_value = self.q2_network(current_state_action).q_value
        min_q_value = torch.min(q1_value, q2_value)

    # Use the minimum as target, ensure no gradient going through
    min_q_value = min_q_value.detach()

    #
    # First, optimize value network; minimizing MSE between
    # V(s) & Q(s, a) - log(pi(a|s))
    #

    state_value = self.value_network(state.float_features)  # .q_value

    with torch.no_grad():
        log_prob_a = self.actor_network.get_log_prob(state, action.float_features)
        target_value = min_q_value - self.entropy_temperature * log_prob_a

    value_loss = F.mse_loss(state_value, target_value)
    self.value_network_optimizer.zero_grad()
    value_loss.backward()
    self.value_network_optimizer.step()

    #
    # Second, optimize Q networks; minimizing MSE between
    # Q(s, a) & r + discount * V'(next_s)
    #

    with torch.no_grad():
        next_state_value = (
            self.value_network_target(learning_input.next_state.float_features)
            * not_done_mask
        )

        if self.minibatch < self.reward_burnin:
            target_q_value = reward
        else:
            target_q_value = reward + discount * next_state_value

    q1_loss = F.mse_loss(q1_value, target_q_value)
    self.q1_network_optimizer.zero_grad()
    q1_loss.backward()
    self.q1_network_optimizer.step()

    if self.q2_network:
        q2_loss = F.mse_loss(q2_value, target_q_value)
        self.q2_network_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_network_optimizer.step()

    #
    # Lastly, optimize the actor; minimizing KL-divergence between action propensity
    # & softmax of value. Due to reparameterization trick, it ends up being
    # log_prob(actor_action) - Q(s, actor_action)
    #

    actor_output = self.actor_network(rlt.StateInput(state=state))

    state_actor_action = rlt.StateAction(
        state=state, action=rlt.FeatureVector(float_features=actor_output.action)
    )
    q1_actor_value = self.q1_network(state_actor_action).q_value
    min_q_actor_value = q1_actor_value
    if self.q2_network:
        q2_actor_value = self.q2_network(state_actor_action).q_value
        min_q_actor_value = torch.min(q1_actor_value, q2_actor_value)

    actor_loss = torch.mean(
        self.entropy_temperature * actor_output.log_prob - min_q_actor_value
    )
    self.actor_network_optimizer.zero_grad()
    actor_loss.backward()
    self.actor_network_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.value_network, self.value_network_target, 1.0)
    else:
        # Use the soft update rule to update both target networks
        self._soft_update(self.value_network, self.value_network_target, self.tau)

    if evaluator is not None:
        cpe_stats = BatchStatsForCPE(
            td_loss=q1_loss.detach().cpu().numpy(),
            model_values_on_logged_actions=q1_value.detach().cpu().numpy(),
        )
        evaluator.report(cpe_stats)
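
# get_log_prob above evaluates log pi(a|s) under the actor's policy. For a
# tanh-squashed Gaussian actor, a common choice for SAC (the actual actor here may
# differ), the computation can be sketched as below; tanh_gaussian_log_prob is a
# hypothetical name.
import torch


def tanh_gaussian_log_prob(mean, log_std, squashed_action, epsilon=1e-6):
    # Invert the tanh squashing, evaluate the Gaussian log-density, then apply
    # the change-of-variables correction for tanh.
    clipped = squashed_action.clamp(-1.0 + epsilon, 1.0 - epsilon)
    pre_tanh = 0.5 * (torch.log1p(clipped) - torch.log1p(-clipped))  # atanh
    normal = torch.distributions.Normal(mean, log_std.exp())
    log_prob = normal.log_prob(pre_tanh) - torch.log(1.0 - clipped.pow(2) + epsilon)
    return log_prob.sum(dim=1, keepdim=True)
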
def train(self, training_batch, evaluator=None) -> None:
    """
    IMPORTANT: the input action here is assumed to be preprocessed to match the
    range of the output of the actor.
    """
    if hasattr(training_batch, "as_parametric_sarsa_training_batch"):
        training_batch = training_batch.as_parametric_sarsa_training_batch()

    learning_input = training_batch.training_input
    self.minibatch += 1

    state = learning_input.state
    action = learning_input.action
    reward = learning_input.reward
    discount = torch.full_like(reward, self.gamma)
    not_done_mask = learning_input.not_terminal

    if self._should_scale_action_in_train():
        action = rlt.FeatureVector(
            rescale_torch_tensor(
                action.float_features,
                new_min=self.min_action_range_tensor_training,
                new_max=self.max_action_range_tensor_training,
                prev_min=self.min_action_range_tensor_serving,
                prev_max=self.max_action_range_tensor_serving,
            )
        )

    current_state_action = rlt.StateAction(state=state, action=action)

    q1_value = self.q1_network(current_state_action).q_value
    min_q_value = q1_value

    if self.q2_network:
        q2_value = self.q2_network(current_state_action).q_value
        min_q_value = torch.min(q1_value, q2_value)

    # Use the minimum as target, ensure no gradient going through
    min_q_value = min_q_value.detach()

    #
    # First, optimize value network; minimizing MSE between
    # V(s) & Q(s, a) - log(pi(a|s))
    #

    state_value = self.value_network(state.float_features)  # .q_value

    if self.logged_action_uniform_prior:
        log_prob_a = torch.zeros_like(min_q_value)
        target_value = min_q_value
    else:
        with torch.no_grad():
            log_prob_a = self.actor_network.get_log_prob(
                state, action.float_features
            )
            log_prob_a = log_prob_a.clamp(-20.0, 20.0)
            target_value = min_q_value - self.entropy_temperature * log_prob_a

    value_loss = F.mse_loss(state_value, target_value)
    self.value_network_optimizer.zero_grad()
    value_loss.backward()
    self.value_network_optimizer.step()

    #
    # Second, optimize Q networks; minimizing MSE between
    # Q(s, a) & r + discount * V'(next_s)
    #

    with torch.no_grad():
        next_state_value = (
            self.value_network_target(learning_input.next_state.float_features)
            * not_done_mask
        )

        if self.minibatch < self.reward_burnin:
            target_q_value = reward
        else:
            target_q_value = reward + discount * next_state_value

    q1_loss = F.mse_loss(q1_value, target_q_value)
    self.q1_network_optimizer.zero_grad()
    q1_loss.backward()
    self.q1_network_optimizer.step()

    if self.q2_network:
        q2_loss = F.mse_loss(q2_value, target_q_value)
        self.q2_network_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_network_optimizer.step()

    #
    # Lastly, optimize the actor; minimizing KL-divergence between action propensity
    # & softmax of value. Due to reparameterization trick, it ends up being
    # log_prob(actor_action) - Q(s, actor_action)
    #

    actor_output = self.actor_network(rlt.StateInput(state=state))

    state_actor_action = rlt.StateAction(
        state=state, action=rlt.FeatureVector(float_features=actor_output.action)
    )
    q1_actor_value = self.q1_network(state_actor_action).q_value
    min_q_actor_value = q1_actor_value
    if self.q2_network:
        q2_actor_value = self.q2_network(state_actor_action).q_value
        min_q_actor_value = torch.min(q1_actor_value, q2_actor_value)

    actor_loss = (
        self.entropy_temperature * actor_output.log_prob - min_q_actor_value
    )
    # Do this in 2 steps so we can log histogram of actor loss
    actor_loss_mean = actor_loss.mean()
    self.actor_network_optimizer.zero_grad()
    actor_loss_mean.backward()
    self.actor_network_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.value_network, self.value_network_target, 1.0)
    else:
        # Use the soft update rule to update both target networks
        self._soft_update(self.value_network, self.value_network_target, self.tau)

    # Logging at the end to schedule all the cuda operations first
    if (
        self.tensorboard_logging_freq is not None
        and self.minibatch % self.tensorboard_logging_freq == 0
    ):
        SummaryWriterContext.add_histogram("q1/logged_state_value", q1_value)
        if self.q2_network:
            SummaryWriterContext.add_histogram("q2/logged_state_value", q2_value)
        SummaryWriterContext.add_histogram("log_prob_a", log_prob_a)
        SummaryWriterContext.add_histogram("value_network/target", target_value)
        SummaryWriterContext.add_histogram(
            "q_network/next_state_value", next_state_value
        )
        SummaryWriterContext.add_histogram("q_network/target_q_value", target_q_value)
        SummaryWriterContext.add_histogram(
            "actor/min_q_actor_value", min_q_actor_value
        )
        SummaryWriterContext.add_histogram(
            "actor/action_log_prob", actor_output.log_prob
        )
        SummaryWriterContext.add_histogram("actor/loss", actor_loss)

    if evaluator is not None:
        cpe_stats = BatchStatsForCPE(
            td_loss=q1_loss.detach().cpu().numpy(),
            logged_rewards=reward.detach().cpu().numpy(),
            model_values_on_logged_actions=q1_value.detach().cpu().numpy(),
            model_propensities=actor_output.log_prob.exp().detach().cpu().numpy(),
            model_values=min_q_actor_value.detach().cpu().numpy(),
        )
        evaluator.report(cpe_stats)
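
# rescale_torch_tensor above maps actions from the serving range into the training
# range. A minimal sketch, assuming a straightforward linear mapping between the
# two ranges; rescale_tensor is a hypothetical name and the library helper may
# handle edge cases differently.
import torch


def rescale_tensor(x, new_min, new_max, prev_min, prev_max):
    # Linearly map x from [prev_min, prev_max] into [new_min, new_max].
    prev_range = prev_max - prev_min
    new_range = new_max - new_min
    return ((x - prev_min) / prev_range) * new_range + new_min
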