def estimate(
    self,
    batch: SampleBatchType,
) -> OffPolicyEstimate:
    self.check_can_estimate_for(batch)
    estimates = []
    # Split data into train and test batches
    for train_episodes, test_episodes in train_test_split(
        batch,
        self.train_test_split_val,
        self.k,
    ):

        # Train Q-function
        if train_episodes:
            # Reinitialize model
            self.model.reset()
            train_batch = SampleBatch.concat_samples(train_episodes)
            losses = self.train(train_batch)
            self.losses.append(losses)

        # Calculate doubly robust OPE estimates
        for episode in test_episodes:
            rewards, old_prob = episode["rewards"], episode["action_prob"]
            new_prob = np.exp(self.action_log_likelihood(episode))

            v_old = 0.0
            v_new = 0.0
            q_values = self.model.estimate_q(
                episode[SampleBatch.OBS], episode[SampleBatch.ACTIONS]
            )
            q_values = convert_to_numpy(q_values)

            all_actions = np.zeros([episode.count, self.policy.action_space.n])
            all_actions[:] = np.arange(self.policy.action_space.n)
            # Two transposes required for torch.distributions to work
            tmp_episode = episode.copy()
            tmp_episode[SampleBatch.ACTIONS] = all_actions.T
            action_probs = np.exp(self.action_log_likelihood(tmp_episode)).T
            v_values = self.model.estimate_v(episode[SampleBatch.OBS], action_probs)
            v_values = convert_to_numpy(v_values)

            for t in reversed(range(episode.count)):
                v_old = rewards[t] + self.gamma * v_old
                v_new = v_values[t] + (new_prob[t] / old_prob[t]) * (
                    rewards[t] + self.gamma * v_new - q_values[t]
                )
            v_new = v_new.item()

            estimates.append(
                OffPolicyEstimate(
                    self.name,
                    {
                        "v_old": v_old,
                        "v_new": v_new,
                        "v_gain": v_new / max(1e-8, v_old),
                    },
                )
            )
    return estimates
def on_policy_output(
    self, env_id: str, agent_id: str, output: PolicyOutputType
):
    # Buffer latest output states for next input __call__.
    action, states, _ = output
    agent_state = self._states[env_id][agent_id]
    agent_state.action = convert_to_numpy(action)
    agent_state.states = convert_to_numpy(states)
def get_state(self, sess: Optional["tf.Session"] = None):
    if sess:
        return sess.run(self._tf_state_op)
    eps = self.epsilon_schedule(self.last_timestep)
    return {
        "cur_epsilon": convert_to_numpy(eps) if self.framework != "tf" else eps,
        "last_timestep": convert_to_numpy(self.last_timestep)
        if self.framework != "tf"
        else self.last_timestep,
    }
def estimate(self, batch: SampleBatchType) -> Dict[str, Any]:
    """Compute off-policy estimates.

    Args:
        batch: The SampleBatch to run off-policy estimation on.

    Returns:
        A dict consisting of the following metrics:
        - v_behavior: The discounted return averaged over episodes in the batch.
        - v_behavior_std: The standard deviation corresponding to v_behavior.
        - v_target: The estimated discounted return for `self.policy`,
          averaged over episodes in the batch.
        - v_target_std: The standard deviation corresponding to v_target.
        - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes.
        - v_gain_std: The standard deviation corresponding to v_gain.
    """
    batch = self.convert_ma_batch_to_sample_batch(batch)
    self.check_action_prob_in_batch(batch)
    estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
    # Calculate doubly robust OPE estimates
    for episode in batch.split_by_episode():
        rewards, old_prob = episode["rewards"], episode["action_prob"]
        log_likelihoods = compute_log_likelihoods_from_input_dict(
            self.policy, episode
        )
        new_prob = np.exp(convert_to_numpy(log_likelihoods))

        v_behavior = 0.0
        v_target = 0.0
        q_values = self.model.estimate_q(episode)
        q_values = convert_to_numpy(q_values)
        v_values = self.model.estimate_v(episode)
        v_values = convert_to_numpy(v_values)
        assert q_values.shape == v_values.shape == (episode.count,)

        for t in reversed(range(episode.count)):
            v_behavior = rewards[t] + self.gamma * v_behavior
            v_target = v_values[t] + (new_prob[t] / old_prob[t]) * (
                rewards[t] + self.gamma * v_target - q_values[t]
            )
        v_target = v_target.item()

        estimates["v_behavior"].append(v_behavior)
        estimates["v_target"].append(v_target)
        estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
    estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
    estimates["v_behavior"] = np.mean(estimates["v_behavior"])
    estimates["v_target_std"] = np.std(estimates["v_target"])
    estimates["v_target"] = np.mean(estimates["v_target"])
    estimates["v_gain_std"] = np.std(estimates["v_gain"])
    estimates["v_gain"] = np.mean(estimates["v_gain"])
    return estimates
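# --- Illustrative sketch (not RLlib code) ---
# The backward loop above is the standard doubly robust (DR) estimator. Below
# is a minimal, self-contained numpy sketch of that recursion for a single
# episode, using made-up rewards, propensities, and Q/V values in place of a
# SampleBatch and a fitted Q-model.
import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0, 2.0])     # r_t
old_prob = np.array([0.5, 0.4, 0.6])    # behavior policy pi_b(a_t | s_t)
new_prob = np.array([0.6, 0.5, 0.5])    # target policy pi(a_t | s_t)
q_values = np.array([2.5, 1.8, 2.0])    # Q(s_t, a_t) from the fitted model
v_values = np.array([2.4, 1.7, 1.9])    # V(s_t) = sum_a pi(a | s_t) * Q(s_t, a)

v_behavior, v_target = 0.0, 0.0
for t in reversed(range(len(rewards))):
    # Monte-Carlo return of the logged (behavior) data.
    v_behavior = rewards[t] + gamma * v_behavior
    # DR update: model value plus importance-weighted TD correction.
    v_target = v_values[t] + (new_prob[t] / old_prob[t]) * (
        rewards[t] + gamma * v_target - q_values[t]
    )

print(v_behavior, v_target, v_target / max(v_behavior, 1e-8))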
def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
    if self.config["worker_index"]:
        return convert_to_numpy({"worker_loss": self.loss_obj.loss})
    else:
        return convert_to_numpy(
            {
                "cur_kl_coeff": self.kl_coeff_val,
                "cur_lr": self.cur_lr,
                "total_loss": self.loss_obj.loss,
                "policy_loss": self.loss_obj.mean_policy_loss,
                "vf_loss": self.loss_obj.mean_vf_loss,
                "kl_loss": self.loss_obj.mean_kl_loss,
                "inner_kl": self.loss_obj.mean_inner_kl,
                "entropy": self.loss_obj.mean_entropy,
            }
        )
def postprocess_nstep_and_prio(
    policy: Policy, batch: SampleBatch, other_agent=None, episode=None
) -> SampleBatch:
    # N-step Q adjustments.
    if policy.config["n_step"] > 1:
        adjust_nstep(policy.config["n_step"], policy.config["gamma"], batch)

    # Create dummy prio-weights (1.0) in case we don't have any in
    # the batch.
    if PRIO_WEIGHTS not in batch:
        batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS])

    # Prioritize on the worker side.
    if batch.count > 0 and policy.config["replay_buffer_config"].get(
        "worker_side_prioritization", False
    ):
        td_errors = policy.compute_td_error(
            batch[SampleBatch.OBS],
            batch[SampleBatch.ACTIONS],
            batch[SampleBatch.REWARDS],
            batch[SampleBatch.NEXT_OBS],
            batch[SampleBatch.DONES],
            batch[PRIO_WEIGHTS],
        )
        # Retain compatibility with old-style Replay args
        epsilon = policy.config.get("replay_buffer_config", {}).get(
            "prioritized_replay_eps"
        ) or policy.config.get("prioritized_replay_eps")
        if epsilon is None:
            raise ValueError("prioritized_replay_eps not defined in config.")

        new_priorities = np.abs(convert_to_numpy(td_errors)) + epsilon
        batch[PRIO_WEIGHTS] = new_priorities

    return batch
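# --- Illustrative sketch (not RLlib code) ---
# For context on the n-step branch above: a simplified stand-in for what an
# n-step adjustment conceptually does. RLlib's `adjust_nstep` additionally
# handles episode boundaries and mutates the batch in place; this is only an
# approximation for illustration.
import numpy as np

def nstep_adjust_sketch(n_step, gamma, rewards, new_obs, dones):
    """Each step's reward becomes the discounted sum of the next n_step
    rewards, and next-obs/done are taken from n_step - 1 steps ahead
    (clipped at the end of the batch)."""
    T = len(rewards)
    out_r = np.array(rewards, dtype=float)
    out_obs = np.array(new_obs)
    out_done = np.array(dones)
    for i in range(T):
        for j in range(1, n_step):
            if i + j < T:
                out_r[i] += gamma ** j * rewards[i + j]
                out_obs[i] = new_obs[i + j]
                out_done[i] = dones[i + j]
    return out_r, out_obs, out_done

# 3-step returns with gamma=0.9 on a 4-step batch of all-1.0 rewards.
r, o, d = nstep_adjust_sketch(
    3, 0.9, rewards=[1.0, 1.0, 1.0, 1.0], new_obs=[1, 2, 3, 4],
    dones=[False, False, False, True],
)
print(r, o, d)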
def postprocess_nstep_and_prio(
    policy: Policy, batch: SampleBatch, other_agent=None, episode=None
) -> SampleBatch:
    # N-step Q adjustments.
    if policy.config["n_step"] > 1:
        _adjust_nstep(
            policy.config["n_step"],
            policy.config["gamma"],
            batch[SampleBatch.CUR_OBS],
            batch[SampleBatch.ACTIONS],
            batch[SampleBatch.REWARDS],
            batch[SampleBatch.NEXT_OBS],
            batch[SampleBatch.DONES],
        )

    if PRIO_WEIGHTS not in batch:
        batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS])

    # Prioritize on the worker side.
    if batch.count > 0 and policy.config["worker_side_prioritization"]:
        td_errors = policy.compute_td_error(
            batch[SampleBatch.CUR_OBS],
            batch[SampleBatch.ACTIONS],
            batch[SampleBatch.REWARDS],
            batch[SampleBatch.NEXT_OBS],
            batch[SampleBatch.DONES],
            batch[PRIO_WEIGHTS],
        )
        new_priorities = (
            np.abs(convert_to_numpy(td_errors))
            + policy.config["prioritized_replay_eps"]
        )
        batch[PRIO_WEIGHTS] = new_priorities

    return batch
def extra_compute_grad_fetches(self):
    if extra_learn_fetches_fn:
        fetches = convert_to_numpy(extra_learn_fetches_fn(self))
        # Auto-add empty learner stats dict if needed.
        return dict({LEARNER_STATS_KEY: {}}, **fetches)
    else:
        return parent_cls.extra_compute_grad_fetches(self)
def centralized_critic_postprocessing(
    policy, sample_batch, other_agent_batches=None, episode=None
):
    pytorch = policy.config["framework"] == "torch"
    if (pytorch and hasattr(policy, "compute_central_vf")) or (
        not pytorch and policy.loss_initialized()
    ):
        assert other_agent_batches is not None
        [(_, opponent_batch)] = list(other_agent_batches.values())

        # Also record the opponent obs and actions in the trajectory.
        sample_batch[OPPONENT_OBS] = opponent_batch[SampleBatch.CUR_OBS]
        sample_batch[OPPONENT_ACTION] = opponent_batch[SampleBatch.ACTIONS]

        # Overwrite default VF prediction with the central VF.
        if pytorch:
            sample_batch[SampleBatch.VF_PREDS] = (
                policy.compute_central_vf(
                    convert_to_torch_tensor(
                        sample_batch[SampleBatch.CUR_OBS], policy.device
                    ),
                    convert_to_torch_tensor(sample_batch[OPPONENT_OBS], policy.device),
                    convert_to_torch_tensor(
                        sample_batch[OPPONENT_ACTION], policy.device
                    ),
                )
                .cpu()
                .detach()
                .numpy()
            )
        else:
            sample_batch[SampleBatch.VF_PREDS] = convert_to_numpy(
                policy.compute_central_vf(
                    sample_batch[SampleBatch.CUR_OBS],
                    sample_batch[OPPONENT_OBS],
                    sample_batch[OPPONENT_ACTION],
                )
            )
    else:
        # Policy hasn't been initialized yet, use zeros.
        sample_batch[OPPONENT_OBS] = np.zeros_like(sample_batch[SampleBatch.CUR_OBS])
        sample_batch[OPPONENT_ACTION] = np.zeros_like(sample_batch[SampleBatch.ACTIONS])
        sample_batch[SampleBatch.VF_PREDS] = np.zeros_like(
            sample_batch[SampleBatch.REWARDS], dtype=np.float32
        )

    completed = sample_batch["dones"][-1]
    if completed:
        last_r = 0.0
    else:
        last_r = sample_batch[SampleBatch.VF_PREDS][-1]

    train_batch = compute_advantages(
        sample_batch,
        last_r,
        policy.config["gamma"],
        policy.config["lambda"],
        use_gae=policy.config["use_gae"],
    )
    return train_batch
def learn_on_batch(self, postprocessed_batch):
    # Callback handling.
    learn_stats = {}
    self.callbacks.on_learn_on_batch(
        policy=self, train_batch=postprocessed_batch, result=learn_stats
    )

    pad_batch_to_sequences_of_same_size(
        postprocessed_batch,
        max_seq_len=self._max_seq_len,
        shuffle=False,
        batch_divisibility_req=self.batch_divisibility_req,
        view_requirements=self.view_requirements,
    )

    self._is_training = True
    postprocessed_batch = self._lazy_tensor_dict(postprocessed_batch)
    postprocessed_batch.set_training(True)
    stats = self._learn_on_batch_helper(postprocessed_batch)
    stats.update(
        {
            "custom_metrics": learn_stats,
            NUM_AGENT_STEPS_TRAINED: postprocessed_batch.count,
        }
    )
    return convert_to_numpy(stats)
def get_state(self, sess: Optional["tf.Session"] = None):
    """Returns the current scale value.

    Returns:
        Union[float,tf.Tensor[float]]: The current scale value.
    """
    if sess:
        return sess.run(
            dict(
                self._tf_state_op,
                **{
                    "ou_state": self.ou_state,
                },
            )
        )

    state = super().get_state()
    return dict(
        state,
        **{
            "ou_state": convert_to_numpy(self.ou_state)
            if self.framework != "tf"
            else self.ou_state,
        },
    )
def get_state(self, sess: Optional["tf.Session"] = None):
    """Returns the current scale value.

    Returns:
        Union[float,tf.Tensor[float]]: The current scale value.
    """
    if sess:
        return sess.run(self._tf_state_op)

    scale = self.scale_schedule(self.last_timestep)
    return {
        "cur_scale": convert_to_numpy(scale) if self.framework != "tf" else scale,
        "last_timestep": convert_to_numpy(self.last_timestep)
        if self.framework != "tf"
        else self.last_timestep,
    }
def action_log_likelihood(self, batch: SampleBatchType) -> TensorType:
    """Returns log likelihood for actions in given batch for policy.

    Computes likelihoods by passing the observations through the current
    policy's `compute_log_likelihoods()` method.

    Args:
        batch: The SampleBatch or MultiAgentBatch to calculate action
            log likelihoods from. This batch/batches must contain OBS
            and ACTIONS keys.

    Returns:
        The log likelihoods of the actions in the batch, given the
        observations and the policy.
    """
    num_state_inputs = 0
    for k in batch.keys():
        if k.startswith("state_in_"):
            num_state_inputs += 1
    state_keys = ["state_in_{}".format(i) for i in range(num_state_inputs)]
    log_likelihoods: TensorType = self.policy.compute_log_likelihoods(
        actions=batch[SampleBatch.ACTIONS],
        obs_batch=batch[SampleBatch.OBS],
        state_batches=[batch[k] for k in state_keys],
        prev_action_batch=batch.get(SampleBatch.PREV_ACTIONS),
        prev_reward_batch=batch.get(SampleBatch.PREV_REWARDS),
        actions_normalized=True,
    )
    log_likelihoods = convert_to_numpy(log_likelihoods)
    return log_likelihoods
def estimate(self, batch: SampleBatchType) -> Dict[str, Any]:
    """Compute off-policy estimates.

    Args:
        batch: The SampleBatch to run off-policy estimation on.

    Returns:
        A dict consisting of the following metrics:
        - v_behavior: The discounted return averaged over episodes in the batch.
        - v_behavior_std: The standard deviation corresponding to v_behavior.
        - v_target: The estimated discounted return for `self.policy`,
          averaged over episodes in the batch.
        - v_target_std: The standard deviation corresponding to v_target.
        - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes.
        - v_gain_std: The standard deviation corresponding to v_gain.
    """
    batch = self.convert_ma_batch_to_sample_batch(batch)
    self.check_action_prob_in_batch(batch)
    estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
    for episode in batch.split_by_episode():
        rewards, old_prob = episode["rewards"], episode["action_prob"]
        log_likelihoods = compute_log_likelihoods_from_input_dict(
            self.policy, episode
        )
        new_prob = np.exp(convert_to_numpy(log_likelihoods))

        # calculate importance ratios
        p = []
        for t in range(episode.count):
            if t == 0:
                pt_prev = 1.0
            else:
                pt_prev = p[t - 1]
            p.append(pt_prev * new_prob[t] / old_prob[t])
        for t, v in enumerate(p):
            if t >= len(self.filter_values):
                self.filter_values.append(v)
                self.filter_counts.append(1.0)
            else:
                self.filter_values[t] += v
                self.filter_counts[t] += 1.0

        # calculate stepwise weighted IS estimate
        v_behavior = 0.0
        v_target = 0.0
        for t in range(episode.count):
            v_behavior += rewards[t] * self.gamma ** t
            w_t = self.filter_values[t] / self.filter_counts[t]
            v_target += p[t] / w_t * rewards[t] * self.gamma ** t

        estimates["v_behavior"].append(v_behavior)
        estimates["v_target"].append(v_target)
        estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
    estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
    estimates["v_behavior"] = np.mean(estimates["v_behavior"])
    estimates["v_target_std"] = np.std(estimates["v_target"])
    estimates["v_target"] = np.mean(estimates["v_target"])
    estimates["v_gain_std"] = np.std(estimates["v_gain"])
    estimates["v_gain"] = np.mean(estimates["v_gain"])
    return estimates
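# --- Illustrative sketch (not RLlib code) ---
# The per-timestep weights above implement stepwise weighted importance
# sampling (WIS): p_t is the cumulative importance ratio up to step t, and
# w_t averages p_t over all episodes seen so far. A minimal numpy sketch
# with two made-up episodes of equal length:
import numpy as np

gamma = 0.99
episodes = [
    # (rewards, behavior probs pi_b(a_t | s_t), target probs pi(a_t | s_t))
    (np.array([1.0, 0.0, 2.0]), np.array([0.5, 0.4, 0.6]), np.array([0.6, 0.5, 0.5])),
    (np.array([0.0, 1.0, 1.0]), np.array([0.3, 0.5, 0.5]), np.array([0.4, 0.4, 0.6])),
]

# Cumulative importance ratios p_t per episode and their mean w_t across
# episodes (the running filter_values / filter_counts in the code above).
ratios = [np.cumprod(new_p / old_p) for _, old_p, new_p in episodes]
w = np.mean(ratios, axis=0)

discounts = gamma ** np.arange(3)
for (rewards, _, _), p in zip(episodes, ratios):
    v_behavior = float(np.sum(discounts * rewards))
    v_target = float(np.sum(p / w * discounts * rewards))
    print(v_behavior, v_target, v_target / max(v_behavior, 1e-8))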
def get_state(self) -> Union[Dict[str, TensorType], List[TensorType]]:
    state = super().get_state()
    state["_optimizer_variables"] = []
    for i, o in enumerate(self._optimizers):
        optim_state_dict = convert_to_numpy(o.state_dict())
        state["_optimizer_variables"].append(optim_state_dict)
    # Add exploration state.
    state["_exploration_state"] = self.exploration.get_state()
    return state
def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
    q_t = torch.stack(self.get_tower_stats("q_t"))
    stats = {
        "actor_loss": torch.mean(torch.stack(self.get_tower_stats("actor_loss"))),
        "critic_loss": torch.mean(torch.stack(self.get_tower_stats("critic_loss"))),
        "mean_q": torch.mean(q_t),
        "max_q": torch.max(q_t),
        "min_q": torch.min(q_t),
    }
    return convert_to_numpy(stats)
def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
    stats = {
        "policy_loss": self.p_loss,
        "total_loss": self.total_loss,
    }
    if self.config["beta"] != 0.0:
        stats["moving_average_sqd_adv_norm"] = self._moving_average_sqd_adv_norm
        stats["vf_explained_var"] = self.explained_variance
        stats["vf_loss"] = self.v_loss
    return convert_to_numpy(stats)
def test_fqe_model(self):
    # Test FQETorchModel for:
    # (1) Check that it does not modify the underlying batch during training.
    # (2) Check that the stopping criteria from FQE are working correctly.
    # (3) Check that using fqe._compute_action_probs equals brute force
    #     iterating over all actions with policy.compute_log_likelihoods.
    fqe = FQETorchModel(
        policy=self.algo.get_policy(),
        gamma=self.gamma,
        **self.q_model_config,
    )
    tmp_batch = copy.deepcopy(self.batch)
    losses = fqe.train(self.batch)

    # Make sure FQETorchModel.train() does not modify self.batch.
    check(tmp_batch, self.batch)

    # Make sure FQE stopping criteria are respected.
    assert len(losses) == fqe.n_iters or losses[-1] < fqe.delta, (
        f"FQE.train() terminated early in {len(losses)} steps with final loss "
        f"{losses[-1]} for n_iters: {fqe.n_iters} and delta: {fqe.delta}"
    )

    # Test fqe._compute_action_probs against the "brute force" method of
    # computing log_prob for each possible action individually using
    # policy.compute_log_likelihoods.
    obs = torch.tensor(self.batch["obs"], device=fqe.device)
    action_probs = fqe._compute_action_probs(obs)
    action_probs = convert_to_numpy(action_probs)

    tmp_probs = []
    for act in range(fqe.policy.action_space.n):
        tmp_actions = np.zeros_like(self.batch["actions"]) + act
        log_probs = fqe.policy.compute_log_likelihoods(
            actions=tmp_actions,
            obs_batch=self.batch["obs"],
        )
        tmp_probs.append(torch.exp(log_probs))
    tmp_probs = torch.stack(tmp_probs).transpose(0, 1)
    tmp_probs = convert_to_numpy(tmp_probs)
    check(action_probs, tmp_probs, decimals=3)
def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
    return convert_to_numpy(
        {
            "cur_lr": self.cur_lr,
            "entropy_coeff": self.entropy_coeff,
            "policy_entropy": torch.mean(
                torch.stack(self.get_tower_stats("entropy"))
            ),
            "policy_loss": torch.mean(torch.stack(self.get_tower_stats("pi_err"))),
            "vf_loss": torch.mean(torch.stack(self.get_tower_stats("value_err"))),
        }
    )
def action_prob(self, batch: SampleBatchType) -> np.ndarray:
    """Returns the log likelihoods of the batch actions under the current policy."""
    num_state_inputs = 0
    for k in batch.keys():
        if k.startswith("state_in_"):
            num_state_inputs += 1
    state_keys = ["state_in_{}".format(i) for i in range(num_state_inputs)]
    log_likelihoods: TensorType = self.policy.compute_log_likelihoods(
        actions=batch[SampleBatch.ACTIONS],
        obs_batch=batch[SampleBatch.CUR_OBS],
        state_batches=[batch[k] for k in state_keys],
        prev_action_batch=batch.data.get(SampleBatch.PREV_ACTIONS),
        prev_reward_batch=batch.data.get(SampleBatch.PREV_REWARDS),
    )
    return convert_to_numpy(log_likelihoods)
def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Returns the calculated loss in a stats dict.

    Args:
        train_batch: The data used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    return convert_to_numpy(
        {
            "policy_loss": torch.mean(
                torch.stack(self.get_tower_stats("policy_loss"))
            ),
        }
    )
def compute_actions_from_input_dict(
    self,
    input_dict: Dict[str, TensorType],
    explore: bool = None,
    timestep: Optional[int] = None,
    episodes: Optional[List[Episode]] = None,
    **kwargs,
) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:
    if not self.config.get("eager_tracing") and not tf1.executing_eagerly():
        tf1.enable_eager_execution()
    self._is_training = False

    explore = explore if explore is not None else self.explore
    timestep = timestep if timestep is not None else self.global_timestep
    if isinstance(timestep, tf.Tensor):
        timestep = int(timestep.numpy())

    # Pass lazy (eager) tensor dict to Model as `input_dict`.
    input_dict = self._lazy_tensor_dict(input_dict)
    input_dict.set_training(False)

    # Pack internal state inputs into (separate) list.
    state_batches = [
        input_dict[k] for k in input_dict.keys() if "state_in" in k[:8]
    ]
    self._state_in = state_batches
    self._is_recurrent = state_batches != []

    # Call the exploration before_compute_actions hook.
    self.exploration.before_compute_actions(
        timestep=timestep, explore=explore, tf_sess=self.get_session()
    )

    ret = self._compute_actions_helper(
        input_dict,
        state_batches,
        # TODO: Passing episodes into a traced method does not work.
        None if self.config["eager_tracing"] else episodes,
        explore,
        timestep,
    )
    # Update our global timestep by the batch size.
    self.global_timestep.assign_add(tree.flatten(ret[0])[0].shape.as_list()[0])
    return convert_to_numpy(ret)
def compute_gradients(
    self, postprocessed_batch: SampleBatch
) -> Tuple[ModelGradients, Dict[str, TensorType]]:
    pad_batch_to_sequences_of_same_size(
        postprocessed_batch,
        shuffle=False,
        max_seq_len=self._max_seq_len,
        batch_divisibility_req=self.batch_divisibility_req,
        view_requirements=self.view_requirements,
    )

    self._is_training = True
    self._lazy_tensor_dict(postprocessed_batch)
    postprocessed_batch.set_training(True)

    grads_and_vars, grads, stats = self._compute_gradients_helper(
        postprocessed_batch
    )
    return convert_to_numpy((grads, stats))
def estimate(self, batch: SampleBatchType) -> OffPolicyEstimate:
    self.check_can_estimate_for(batch)
    estimates = []
    # Split data into train and test batches
    for train_episodes, test_episodes in train_test_split(
        batch,
        self.train_test_split_val,
        self.k,
    ):

        # Train Q-function
        if train_episodes:
            # Reinitialize model
            self.model.reset()
            train_batch = SampleBatch.concat_samples(train_episodes)
            losses = self.train(train_batch)
            self.losses.append(losses)

        # Calculate direct method OPE estimates
        for episode in test_episodes:
            rewards = episode["rewards"]
            v_old = 0.0
            v_new = 0.0
            for t in range(episode.count):
                v_old += rewards[t] * self.gamma ** t

            init_step = episode[0:1]
            init_obs = np.array([init_step[SampleBatch.OBS]])
            all_actions = np.arange(self.policy.action_space.n, dtype=float)
            init_step[SampleBatch.ACTIONS] = all_actions
            action_probs = np.exp(self.action_log_likelihood(init_step))
            v_value = self.model.estimate_v(init_obs, action_probs)
            v_new = convert_to_numpy(v_value).item()

            estimates.append(
                OffPolicyEstimate(
                    self.name,
                    {
                        "v_old": v_old,
                        "v_new": v_new,
                        "v_gain": v_new / max(1e-8, v_old),
                    },
                )
            )
    return estimates
def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
    return convert_to_numpy(
        {
            "cur_lr": self.cur_lr,
            "total_loss": torch.mean(
                torch.stack(self.get_tower_stats("total_loss"))
            ),
            "policy_loss": torch.mean(torch.stack(self.get_tower_stats("pi_loss"))),
            "entropy": torch.mean(
                torch.stack(self.get_tower_stats("mean_entropy"))
            ),
            "entropy_coeff": self.entropy_coeff,
            "var_gnorm": global_norm(self.model.trainable_variables()),
            "vf_loss": torch.mean(torch.stack(self.get_tower_stats("vf_loss"))),
            "vf_explained_var": torch.mean(
                torch.stack(self.get_tower_stats("vf_explained_var"))
            ),
        }
    )
def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Stats function for APPO. Returns a dict with important loss stats.

    Args:
        train_batch: The SampleBatch (already) used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    stats_dict = {
        "cur_lr": self.cur_lr,
        "total_loss": torch.mean(torch.stack(self.get_tower_stats("total_loss"))),
        "policy_loss": torch.mean(
            torch.stack(self.get_tower_stats("mean_policy_loss"))
        ),
        "entropy": torch.mean(torch.stack(self.get_tower_stats("mean_entropy"))),
        "entropy_coeff": self.entropy_coeff,
        "var_gnorm": global_norm(self.model.trainable_variables()),
        "vf_loss": torch.mean(torch.stack(self.get_tower_stats("mean_vf_loss"))),
        "vf_explained_var": torch.mean(
            torch.stack(self.get_tower_stats("vf_explained_var"))
        ),
    }

    if self.config["vtrace"]:
        is_stat_mean = torch.mean(self._is_ratio, [0, 1])
        is_stat_var = torch.var(self._is_ratio, [0, 1])
        stats_dict["mean_IS"] = is_stat_mean
        stats_dict["var_IS"] = is_stat_var

    if self.config["use_kl_loss"]:
        stats_dict["kl"] = torch.mean(
            torch.stack(self.get_tower_stats("mean_kl_loss"))
        )
        stats_dict["KL_Coeff"] = self.kl_coeff

    return convert_to_numpy(stats_dict)
def estimate(self, batch: SampleBatchType) -> Dict[str, Any]:
    """Compute off-policy estimates.

    Args:
        batch: The SampleBatch to run off-policy estimation on.

    Returns:
        A dict consisting of the following metrics:
        - v_behavior: The discounted return averaged over episodes in the batch.
        - v_behavior_std: The standard deviation corresponding to v_behavior.
        - v_target: The estimated discounted return for `self.policy`,
          averaged over episodes in the batch.
        - v_target_std: The standard deviation corresponding to v_target.
        - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes.
        - v_gain_std: The standard deviation corresponding to v_gain.
    """
    batch = self.convert_ma_batch_to_sample_batch(batch)
    self.check_action_prob_in_batch(batch)
    estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
    # Calculate Direct Method OPE estimates
    for episode in batch.split_by_episode():
        rewards = episode["rewards"]

        v_behavior = 0.0
        v_target = 0.0
        for t in range(episode.count):
            v_behavior += rewards[t] * self.gamma ** t

        init_step = episode[0:1]
        v_target = self.model.estimate_v(init_step)
        v_target = convert_to_numpy(v_target).item()

        estimates["v_behavior"].append(v_behavior)
        estimates["v_target"].append(v_target)
        estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
    estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
    estimates["v_behavior"] = np.mean(estimates["v_behavior"])
    estimates["v_target_std"] = np.std(estimates["v_target"])
    estimates["v_target"] = np.mean(estimates["v_target"])
    estimates["v_gain_std"] = np.std(estimates["v_gain"])
    estimates["v_gain"] = np.mean(estimates["v_gain"])
    return estimates
def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
    return convert_to_numpy(
        {
            "cur_kl_coeff": self.kl_coeff,
            "cur_lr": self.cur_lr,
            "total_loss": torch.mean(
                torch.stack(self.get_tower_stats("total_loss"))
            ),
            "policy_loss": torch.mean(
                torch.stack(self.get_tower_stats("mean_policy_loss"))
            ),
            "vf_loss": torch.mean(
                torch.stack(self.get_tower_stats("mean_vf_loss"))
            ),
            "vf_explained_var": torch.mean(
                torch.stack(self.get_tower_stats("vf_explained_var"))
            ),
            "kl": torch.mean(torch.stack(self.get_tower_stats("mean_kl_loss"))),
            "entropy": torch.mean(
                torch.stack(self.get_tower_stats("mean_entropy"))
            ),
            "entropy_coeff": self.entropy_coeff,
        }
    )
def _process_policy_eval_results(
    *,
    to_eval: Dict[PolicyID, List[PolicyEvalData]],
    eval_results: Dict[PolicyID, Tuple[TensorStructType, StateBatch, dict]],
    active_episodes: Dict[str, MultiAgentEpisode],
    active_envs: Set[int],
    off_policy_actions: MultiEnvDict,
    policies: Dict[PolicyID, Policy],
    clip_actions: bool,
) -> Dict[EnvID, Dict[AgentID, EnvActionType]]:
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Args:
        to_eval (Dict[PolicyID, List[PolicyEvalData]]): Mapping of policy
            IDs to lists of PolicyEvalData objects.
        eval_results (Dict[PolicyID, List]): Mapping of policy IDs to list
            of actions, rnn-out states, extra-action-fetches dicts.
        active_episodes (Dict[str, MultiAgentEpisode]): Mapping from
            episode ID to currently ongoing MultiAgentEpisode object.
        active_envs (Set[int]): Set of non-terminated env ids.
        off_policy_actions (dict): Doubly keyed dict of env-ids -> agent ids
            -> off-policy-action, returned by a `BaseEnv.poll()` call.
        policies (Dict[PolicyID, Policy]): Mapping from policy ID to Policy.
        clip_actions (bool): Whether to clip actions to the action space's
            bounds.

    Returns:
        actions_to_send: Nested dict of env id -> agent id -> actions to be
            sent to Env (np.ndarrays).
    """
    actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = defaultdict(dict)

    # type: int
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    # type: PolicyID, List[PolicyEvalData]
    for policy_id, eval_data in to_eval.items():
        actions: TensorStructType = eval_results[policy_id][0]
        actions = convert_to_numpy(actions)

        rnn_out_cols: StateBatch = eval_results[policy_id][1]
        pi_info_cols: dict = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        # Store RNN state ins/outs and extra-action fetches to episode.
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy: Policy = _get_or_raise(policies, policy_id)
        # Split action-component batches into single action rows.
        actions: List[EnvActionType] = unbatch(actions)

        # type: int, EnvActionType
        for i, action in enumerate(actions):
            # Clip if necessary.
            if clip_actions:
                clipped_action = clip_action(action, policy.action_space_struct)
            else:
                clipped_action = action

            env_id: int = eval_data[i].env_id
            agent_id: AgentID = eval_data[i].agent_id
            episode: MultiAgentEpisode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i] for k, v in pi_info_cols.items()}
            )
            if (
                env_id in off_policy_actions
                and agent_id in off_policy_actions[env_id]
            ):
                episode._set_last_action(
                    agent_id, off_policy_actions[env_id][agent_id]
                )
            else:
                episode._set_last_action(agent_id, action)

            assert agent_id not in actions_to_send[env_id]
            actions_to_send[env_id][agent_id] = clipped_action

    return actions_to_send
def _compute_action_helper(
    self, input_dict, state_batches, seq_lens, explore, timestep
):
    """Shared forward pass logic (w/ and w/o trajectory view API).

    Returns:
        A tuple consisting of a) actions, b) state_out, c) extra_fetches.
    """
    explore = explore if explore is not None else self.config["explore"]
    timestep = timestep if timestep is not None else self.global_timestep
    self._is_recurrent = state_batches is not None and state_batches != []

    # Switch to eval mode.
    if self.model:
        self.model.eval()

    if is_overridden(self.action_sampler_fn):
        action_dist = dist_inputs = None
        actions, logp, state_out = self.action_sampler_fn(
            self.model,
            obs_batch=input_dict,
            state_batches=state_batches,
            explore=explore,
            timestep=timestep,
        )
    else:
        # Call the exploration before_compute_actions hook.
        self.exploration.before_compute_actions(explore=explore, timestep=timestep)
        if is_overridden(self.action_distribution_fn):
            dist_inputs, dist_class, state_out = self.action_distribution_fn(
                self.model,
                obs_batch=input_dict,
                state_batches=state_batches,
                seq_lens=seq_lens,
                explore=explore,
                timestep=timestep,
                is_training=False,
            )
        else:
            dist_class = self.dist_class
            dist_inputs, state_out = self.model(input_dict, state_batches, seq_lens)

        if not (
            isinstance(dist_class, functools.partial)
            or issubclass(dist_class, TorchDistributionWrapper)
        ):
            raise ValueError(
                "`dist_class` ({}) not a TorchDistributionWrapper "
                "subclass! Make sure your `action_distribution_fn` or "
                "`make_model_and_action_dist` return a correct "
                "distribution class.".format(dist_class.__name__)
            )
        action_dist = dist_class(dist_inputs, self.model)

        # Get the exploration action from the forward results.
        actions, logp = self.exploration.get_exploration_action(
            action_distribution=action_dist, timestep=timestep, explore=explore
        )

    input_dict[SampleBatch.ACTIONS] = actions

    # Add default and custom fetches.
    extra_fetches = self.extra_action_out(
        input_dict, state_batches, self.model, action_dist
    )

    # Action-dist inputs.
    if dist_inputs is not None:
        extra_fetches[SampleBatch.ACTION_DIST_INPUTS] = dist_inputs

    # Action-logp and action-prob.
    if logp is not None:
        extra_fetches[SampleBatch.ACTION_PROB] = torch.exp(logp.float())
        extra_fetches[SampleBatch.ACTION_LOGP] = logp

    # Update our global timestep by the batch size.
    self.global_timestep += len(input_dict[SampleBatch.CUR_OBS])

    return convert_to_numpy((actions, state_out, extra_fetches))