def policy(self, states):
    if isinstance(self.trainer, DQNTrainer):
        input = states
    elif isinstance(self.trainer, ParametricDQNTrainer):
        # Score every (state, action) pair: repeat each state once per action
        # and pair it with a tiled one-hot action matrix, concatenated column-wise.
        num_actions = len(self.trainer.action_normalization_parameters)
        actions = np.eye(num_actions, dtype=np.float32)
        actions = np.tile(actions, reps=(len(states), 1))
        states = np.repeat(states, repeats=num_actions, axis=0)
        input = np.hstack((states, actions))
    else:
        raise NotImplementedError("Invalid trainer passed to GymPredictor")
    q_scores = self.trainer.internal_prediction(input)
    if isinstance(self.trainer, DQNTrainer):
        assert q_scores.shape[0] == 1
        q_scores = q_scores[0]
    # Convert Q-scores into a sampling distribution; fall back to uniform if the
    # softmax is degenerate (NaNs or vanishingly small probabilities).
    q_scores_softmax = softmax(
        torch.from_numpy(q_scores.reshape(1, -1)), self.trainer.rl_temperature
    ).numpy()[0]
    if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
        q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
    # Return both the greedy action and a softmax-sampled (exploratory) action.
    policies = [
        np.argmax(q_scores),
        np.random.choice(q_scores.shape[0], p=q_scores_softmax),
    ]
    return policies
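
# Illustrative sketch (not part of the predictor above): the ParametricDQNTrainer
# branch scores every (state, action) pair, so each state row is repeated once per
# action and stacked against a tiled one-hot action matrix. The function name and
# shapes below are assumptions chosen for this standalone example, not library code.
import numpy as np

def build_parametric_input(states: np.ndarray, num_actions: int) -> np.ndarray:
    """Return an array of shape (len(states) * num_actions, state_dim + num_actions)."""
    actions = np.eye(num_actions, dtype=np.float32)           # one one-hot row per action
    actions = np.tile(actions, reps=(len(states), 1))         # repeat the action block per state
    repeated_states = np.repeat(states, repeats=num_actions, axis=0)  # each state once per action
    return np.hstack((repeated_states, actions))

# Example: 2 states of dimension 3 with 4 discrete actions -> 8 rows of width 7.
_demo = build_parametric_input(np.zeros((2, 3), dtype=np.float32), num_actions=4)
assert _demo.shape == (8, 7)
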
def policy(self, states):
    if isinstance(self.trainer, DQNTrainer):
        input = [states]
    elif isinstance(self.trainer, ParametricDQNTrainer):
        # Score every (state, action) pair; here states and one-hot actions are
        # passed to the trainer as separate arrays rather than concatenated.
        num_actions = self.action_dim
        actions = np.eye(num_actions, dtype=np.float32)
        actions = np.tile(actions, reps=(len(states), 1))
        states = np.repeat(states, repeats=num_actions, axis=0)
        input = (states, actions)
    else:
        raise NotImplementedError("Invalid trainer passed to GymPredictor")
    q_scores = self.trainer.internal_prediction(*input)
    if isinstance(self.trainer, DQNTrainer):
        assert q_scores.shape[0] == 1
        q_scores = q_scores[0]
    q_scores_softmax = softmax(
        torch.from_numpy(q_scores.reshape(1, -1)), self.trainer.rl_temperature
    ).numpy()[0]
    if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
        q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
    policies = [
        np.argmax(q_scores),
        np.random.choice(q_scores.shape[0], p=q_scores_softmax),
    ]
    return policies
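
# Illustrative sketch (not part of the predictors above): both policy() variants
# return a greedy action plus a softmax-sampled action, falling back to a uniform
# distribution when the softmax is degenerate. This standalone version uses plain
# numpy/torch; the function name and temperature value are assumptions for the example.
import numpy as np
import torch
import torch.nn.functional as F

def greedy_and_softmax_actions(q_scores: np.ndarray, temperature: float = 1.0):
    """Return (greedy_action_idx, sampled_action_idx) for a 1-D array of Q-scores."""
    probs = F.softmax(
        torch.from_numpy(q_scores.reshape(1, -1)) / temperature, dim=1
    ).numpy()[0]
    # Guard against NaNs or vanishingly small probabilities by sampling uniformly.
    if np.isnan(probs).any() or np.max(probs) < 1e-3:
        probs[:] = 1.0 / probs.shape[0]
    return np.argmax(q_scores), np.random.choice(q_scores.shape[0], p=probs)

# Example usage with made-up Q-scores for 4 actions.
greedy_idx, sampled_idx = greedy_and_softmax_actions(
    np.array([0.1, 0.4, -0.2, 0.05], dtype=np.float32), temperature=0.5
)
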
def train(self, training_samples: TrainingDataPage):
    if self.minibatch == 0:
        # Assume that the tensors are the right shape after the first minibatch
        assert (
            training_samples.states.shape[0] == self.minibatch_size
        ), "Invalid shape: " + str(training_samples.states.shape)
        assert training_samples.actions.shape == torch.Size(
            [self.minibatch_size, len(self._actions)]
        ), "Invalid shape: " + str(training_samples.actions.shape)
        assert training_samples.rewards.shape == torch.Size(
            [self.minibatch_size, 1]
        ), "Invalid shape: " + str(training_samples.rewards.shape)
        assert (
            training_samples.next_states.shape == training_samples.states.shape
        ), "Invalid shape: " + str(training_samples.next_states.shape)
        assert (
            training_samples.not_terminal.shape == training_samples.rewards.shape
        ), "Invalid shape: " + str(training_samples.not_terminal.shape)
        if training_samples.possible_next_actions_mask is not None:
            assert (
                training_samples.possible_next_actions_mask.shape
                == training_samples.actions.shape
            ), "Invalid shape: " + str(
                training_samples.possible_next_actions_mask.shape
            )
        if training_samples.propensities is not None:
            assert (
                training_samples.propensities.shape == training_samples.rewards.shape
            ), "Invalid shape: " + str(training_samples.propensities.shape)
        if training_samples.metrics is not None:
            assert (
                training_samples.metrics.shape[0] == self.minibatch_size
            ), "Invalid shape: " + str(training_samples.metrics.shape)

    boosted_rewards = self.boost_rewards(
        training_samples.rewards, training_samples.actions
    )

    self.minibatch += 1
    states = training_samples.states.detach().requires_grad_(True)
    actions = training_samples.actions
    rewards = boosted_rewards
    discount_tensor = torch.full(
        training_samples.time_diffs.shape, self.gamma
    ).type(self.dtype)
    not_done_mask = training_samples.not_terminal

    if self.use_seq_num_diff_as_time_diff:
        time_diff = training_samples.time_diffs / self.time_diff_unit_length
        discount_tensor = discount_tensor.pow(time_diff)

    all_next_q_values, all_next_q_values_target = self.get_detached_q_values(
        training_samples.next_states
    )

    if self.bcq:
        # Batch constrained q-learning
        on_policy_actions = self.bcq_imitator(training_samples.next_states)
        on_policy_action_probs = softmax(on_policy_actions, temperature=1)
        filter_values = (
            on_policy_action_probs
            / on_policy_action_probs.max(keepdim=True, dim=1)[0]
        )
        action_on_policy = (filter_values >= self.bcq_drop_threshold).float()
        training_samples.possible_next_actions_mask *= action_on_policy

    if self.maxq_learning:
        # Compute max a' Q(s', a') over all possible actions using target network
        next_q_values, max_q_action_idxs = self.get_max_q_values_with_target(
            all_next_q_values,
            all_next_q_values_target,
            training_samples.possible_next_actions_mask,
        )
    else:
        # SARSA
        next_q_values, max_q_action_idxs = self.get_max_q_values_with_target(
            all_next_q_values,
            all_next_q_values_target,
            training_samples.next_actions,
        )

    filtered_next_q_vals = next_q_values * not_done_mask

    if self.minibatch < self.reward_burnin:
        target_q_values = rewards
    else:
        target_q_values = rewards + (discount_tensor * filtered_next_q_vals)

    # Get Q-value of action taken
    all_q_values = self.q_network(states)
    self.all_action_scores = all_q_values.detach()
    q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

    loss = self.q_network_loss(q_values, target_q_values)
    self.loss = loss.detach()

    self.q_network_optimizer.zero_grad()
    loss.backward()
    if self.gradient_handler:
        self.gradient_handler(self.q_network.parameters())
    if self.clip_grad_norm is not None:
        torch.nn.utils.clip_grad_norm_(
            self.q_network.parameters(), self.clip_grad_norm
        )
    self.q_network_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.q_network, self.q_network_target, 1.0)
    else:
        # Use the soft update rule to update target network
        self._soft_update(self.q_network, self.q_network_target, self.tau)

    bcq_loss = None
    if self.bcq:
        # Batch constrained q-learning
        action_preds = self.bcq_imitator(states)
        imitator_loss = torch.nn.CrossEntropyLoss()
        # Classification label is index of action with value 1
        bcq_loss = imitator_loss(action_preds, torch.max(actions, dim=1)[1])
        self.bcq_imitator_optimizer.zero_grad()
        bcq_loss.backward()
        self.bcq_imitator_optimizer.step()

    logged_action_idxs = actions.argmax(dim=1, keepdim=True)
    reward_loss, model_rewards, model_propensities = self.calculate_cpes(
        training_samples,
        states,
        logged_action_idxs,
        max_q_action_idxs,
        discount_tensor,
        not_done_mask,
    )

    self.loss_reporter.report(
        td_loss=self.loss,
        imitator_loss=bcq_loss,
        reward_loss=reward_loss,
        logged_actions=logged_action_idxs,
        logged_propensities=training_samples.propensities,
        logged_rewards=rewards,
        logged_values=None,  # Compute at end of each epoch for CPE
        model_propensities=model_propensities,
        model_rewards=model_rewards,
        model_values=self.all_action_scores,
        model_values_on_logged_actions=None,  # Compute at end of each epoch for CPE
        model_action_idxs=self.get_max_q_values(
            self.all_action_scores, training_samples.possible_actions_mask
        )[1],
    )
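
# Illustrative sketch (not part of the trainer above): the core of the TD update is
# the target r + gamma * max_a' Q_target(s', a') masked by "not terminal", followed
# by a Polyak (soft) update of the target network with mixing factor tau. All tensor
# values, network sizes, and names below are assumptions made up for this example.
import torch

def td_targets(rewards, next_q_values, not_done_mask, gamma):
    """rewards, next_q_values, not_done_mask: tensors of shape (batch, 1)."""
    return rewards + gamma * next_q_values * not_done_mask

def soft_update(online_net, target_net, tau):
    """target <- tau * online + (1 - tau) * target, applied parameter-wise."""
    with torch.no_grad():
        for online_p, target_p in zip(online_net.parameters(), target_net.parameters()):
            target_p.copy_(tau * online_p + (1.0 - tau) * target_p)

# Example usage with tiny made-up networks and a batch of 2 transitions.
online = torch.nn.Linear(3, 2)
target = torch.nn.Linear(3, 2)
targets = td_targets(
    rewards=torch.tensor([[1.0], [0.0]]),
    next_q_values=torch.tensor([[0.5], [0.7]]),
    not_done_mask=torch.tensor([[1.0], [0.0]]),
    gamma=0.99,
)
soft_update(online, target, tau=0.005)
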