def export_model(network):
    vec_obs_size = 16
    num_vis_obs = 0
    dummy_vec_obs = [torch.zeros([1] + [vec_obs_size])]
    dummy_vis_obs = []
    dummy_var_len_obs = []
    dummy_masks = torch.ones([1] + [0])
    dummy_memories = torch.zeros([1] + [1] + [256])
    dummy_input = (
        dummy_vec_obs,
        dummy_vis_obs,
        dummy_var_len_obs,
        dummy_masks,
        dummy_memories,
    )
    input_names = ['vector_observation', 'action_masks', 'recurrent_in']
    dynamic_axes = {name: {0: "batch"} for name in input_names}
    output_names = [
        'version_number',
        'memory_size',
        'continuous_actions',
        'continuous_action_output_shape',
        'action',
        'is_continuous_control',
        'action_output_shape',
        'recurrent_out',
    ]
    dynamic_axes.update({'continuous_actions': {0: "batch"}})
    dynamic_axes.update({'action': {0: "batch"}})
    torch.onnx.export(
        network,
        dummy_input,
        EXPORT_FILE,
        opset_version=9,
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=dynamic_axes,
    )
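# Hypothetical usage sketch (not part of the original source): EXPORT_FILE is
# assumed to be a module-level path to a writable *.onnx file, and `network` is
# assumed to be a torch.nn.Module whose forward accepts the five-tuple
# (vec_obs, vis_obs, var_len_obs, masks, memories) built above and returns one
# tensor per entry of output_names, in that order.
#
#     EXPORT_FILE = "exported_policy.onnx"
#     export_model(trained_network)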
def __init__(self, policy):
    # ONNX only supports input in NCHW (channel-first) format, and Barracuda
    # also expects data in NCHW. Any multi-dimensional input should follow
    # that convention, otherwise the Barracuda import will fail.
    self.policy = policy
    observation_specs = self.policy.behavior_spec.observation_specs
    batch_dim = [1]
    seq_len_dim = [1]
    num_obs = len(observation_specs)
    dummy_obs = [
        torch.zeros(batch_dim + list(ModelSerializer._get_onnx_shape(obs_spec.shape)))
        for obs_spec in observation_specs
    ]
    dummy_masks = torch.ones(
        batch_dim + [sum(self.policy.behavior_spec.action_spec.discrete_branches)]
    )
    dummy_memories = torch.zeros(
        batch_dim + seq_len_dim + [self.policy.export_memory_size]
    )
    self.dummy_input = (dummy_obs, dummy_masks, dummy_memories)
    self.input_names = [TensorNames.get_observation_name(i) for i in range(num_obs)]
    self.input_names += [
        TensorNames.action_mask_placeholder,
        TensorNames.recurrent_in_placeholder,
    ]
    self.dynamic_axes = {name: {0: "batch"} for name in self.input_names}
    self.output_names = [TensorNames.version_number, TensorNames.memory_size]
    if self.policy.behavior_spec.action_spec.continuous_size > 0:
        self.output_names += [
            TensorNames.continuous_action_output,
            TensorNames.continuous_action_output_shape,
        ]
        self.dynamic_axes.update({TensorNames.continuous_action_output: {0: "batch"}})
    if self.policy.behavior_spec.action_spec.discrete_size > 0:
        self.output_names += [
            TensorNames.discrete_action_output,
            TensorNames.discrete_action_output_shape,
        ]
        self.dynamic_axes.update({TensorNames.discrete_action_output: {0: "batch"}})
    if self.policy.export_memory_size > 0:
        self.output_names += [TensorNames.recurrent_output]
def __init__(self, policy):
    # ONNX only supports input in NCHW (channel-first) format, and Barracuda
    # also expects data in NCHW. Any multi-dimensional input should follow
    # that convention, otherwise the Barracuda import will fail.
    self.policy = policy
    batch_dim = [1]
    seq_len_dim = [1]
    dummy_vec_obs = [torch.zeros(batch_dim + [self.policy.vec_obs_size])]
    # Create input shapes in NCHW
    # (they are NHWC in self.policy.behavior_spec.observation_shapes).
    dummy_vis_obs = [
        torch.zeros(batch_dim + [shape[2], shape[0], shape[1]])
        for shape in self.policy.behavior_spec.observation_shapes
        if len(shape) == 3
    ]
    dummy_masks = torch.ones(
        batch_dim + [sum(self.policy.behavior_spec.action_spec.discrete_branches)]
    )
    dummy_memories = torch.zeros(
        batch_dim + seq_len_dim + [self.policy.export_memory_size]
    )
    self.dummy_input = (dummy_vec_obs, dummy_vis_obs, dummy_masks, dummy_memories)
    self.input_names = (
        ["vector_observation"]
        + [f"visual_observation_{i}" for i in range(self.policy.vis_obs_size)]
        + ["action_masks", "memories"]
    )
    self.dynamic_axes = {name: {0: "batch"} for name in self.input_names}
    self.output_names = ["version_number", "memory_size"]
    if self.policy.behavior_spec.action_spec.continuous_size > 0:
        self.output_names += [
            "continuous_actions",
            "continuous_action_output_shape",
        ]
        self.dynamic_axes.update({"continuous_actions": {0: "batch"}})
    if self.policy.behavior_spec.action_spec.discrete_size > 0:
        self.output_names += ["discrete_actions", "discrete_action_output_shape"]
        self.dynamic_axes.update({"discrete_actions": {0: "batch"}})
    if (
        self.policy.behavior_spec.action_spec.continuous_size == 0
        or self.policy.behavior_spec.action_spec.discrete_size == 0
    ):
        self.output_names += [
            "action",
            "is_continuous_control",
            "action_output_shape",
        ]
        self.dynamic_axes.update({"action": {0: "batch"}})
def generate_input_helper(pattern):
    _input = torch.zeros((batch_size, 0, size))
    for i in range(len(pattern)):
        if i % 2 == 0:
            _input = torch.cat(
                [_input, torch.rand((batch_size, pattern[i], size))], dim=1
            )
        else:
            _input = torch.cat(
                [_input, torch.zeros((batch_size, pattern[i], size))], dim=1
            )
    return _input
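# Usage sketch for generate_input_helper (illustrative, not from the original
# tests). The helper closes over `batch_size` and `size` from the enclosing
# scope, so they must be bound before calling it. A pattern of [3, 2, 4] yields
# 3 random entries, then 2 zero (padding) entries, then 4 random entries along
# dim 1, which is how the masking tests build padded inputs.
batch_size, size = 2, 8
example_input = generate_input_helper([3, 2, 4])
assert example_input.shape == (batch_size, 3 + 2 + 4, size)
assert torch.all(example_input[:, 3:5, :] == 0)  # the zeroed padding segment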
def test_gaussian_dist_instance():
    torch.manual_seed(0)
    act_size = 4
    dist_instance = GaussianDistInstance(
        torch.zeros(1, act_size), torch.ones(1, act_size)
    )
    action = dist_instance.sample()
    assert action.shape == (1, act_size)

    for log_prob in dist_instance.log_prob(torch.zeros((1, act_size))).flatten():
        # Log prob of standard normal at 0
        assert log_prob == pytest.approx(-0.919, abs=0.01)
    for ent in dist_instance.entropy().flatten():
        # Entropy of a standard normal, based on 1/2 + ln(sqrt(2*pi) * sigma)
        assert ent == pytest.approx(1.42, abs=0.01)
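# Where the constants above come from (a verification sketch, not part of the
# original tests): for a standard normal, log p(0) = -0.5 * ln(2*pi) ≈ -0.919,
# and the differential entropy 1/2 + ln(sqrt(2*pi) * sigma) ≈ 1.419 for sigma = 1.
import math

assert math.isclose(-0.5 * math.log(2 * math.pi), -0.919, abs_tol=1e-3)
assert math.isclose(0.5 + math.log(math.sqrt(2 * math.pi)), 1.419, abs_tol=1e-3)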
def test_gaussian_distribution(conditional_sigma, tanh_squash):
    torch.manual_seed(0)
    hidden_size = 16
    act_size = 4
    sample_embedding = torch.ones((1, 16))
    gauss_dist = GaussianDistribution(
        hidden_size,
        act_size,
        conditional_sigma=conditional_sigma,
        tanh_squash=tanh_squash,
    )

    # Make sure backprop works
    force_action = torch.zeros((1, act_size))
    optimizer = torch.optim.Adam(gauss_dist.parameters(), lr=3e-3)

    for _ in range(50):
        dist_inst = gauss_dist(sample_embedding)[0]
        if tanh_squash:
            assert isinstance(dist_inst, TanhGaussianDistInstance)
        else:
            assert isinstance(dist_inst, GaussianDistInstance)
        log_prob = dist_inst.log_prob(force_action)
        loss = torch.nn.functional.mse_loss(log_prob, -2 * torch.ones(log_prob.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    for prob in log_prob.flatten():
        assert prob == pytest.approx(-2, abs=0.1)
def get_trajectory_value_estimates(
    self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
    n_obs = len(self.policy.behavior_spec.observation_specs)
    current_obs = ObsUtil.from_buffer(batch, n_obs)
    # Convert to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]

    memory = torch.zeros([1, 1, self.policy.m_size])
    next_obs = [obs.unsqueeze(0) for obs in next_obs]

    value_estimates, next_memory = self.policy.actor_critic.critic_pass(
        current_obs, memory, sequence_length=batch.num_experiences
    )
    next_value_estimate, _ = self.policy.actor_critic.critic_pass(
        next_obs, next_memory, sequence_length=1
    )

    for name, estimate in value_estimates.items():
        value_estimates[name] = ModelUtils.to_numpy(estimate)
        next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])

    if done:
        for k in next_value_estimate:
            if not self.reward_signals[k].ignore_done:
                next_value_estimate[k] = 0.0

    return value_estimates, next_value_estimate
def __init__(
    self,
    hidden_size: int,
    num_outputs: int,
    conditional_sigma: bool = False,
    tanh_squash: bool = False,
):
    super().__init__()
    self.conditional_sigma = conditional_sigma
    self.mu = linear_layer(
        hidden_size,
        num_outputs,
        kernel_init=Initialization.KaimingHeNormal,
        kernel_gain=0.1,
        bias_init=Initialization.Zero,
    )
    self.tanh_squash = tanh_squash
    if conditional_sigma:
        self.log_sigma = linear_layer(
            hidden_size,
            num_outputs,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=0.1,
            bias_init=Initialization.Zero,
        )
    else:
        self.log_sigma = nn.Parameter(torch.zeros(1, num_outputs, requires_grad=True))
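# A minimal sketch of the forward pass these layers support, assuming the
# DistInstance classes exercised by the tests above; the clamp bounds on
# log_sigma are an assumption for numerical stability, not taken from the
# original source. A one-element list is returned to match the `[0]` indexing
# in test_gaussian_distribution.
def forward(self, inputs: torch.Tensor):
    mu = self.mu(inputs)
    if self.conditional_sigma:
        # sigma is predicted from the same embedding as mu
        log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
    else:
        # a single learned sigma, broadcast across the batch
        log_sigma = self.log_sigma
    if self.tanh_squash:
        return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
    return [GaussianDistInstance(mu, torch.exp(log_sigma))]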
def test_tanh_gaussian_dist_instance():
    torch.manual_seed(0)
    act_size = 4
    dist_instance = TanhGaussianDistInstance(
        torch.zeros(1, act_size), torch.ones(1, act_size)
    )
    for _ in range(10):
        action = dist_instance.sample()
        assert action.shape == (1, act_size)
        assert torch.max(action) < 1.0 and torch.min(action) > -1.0
def _update_batch(
    self, mini_batch_demo: Dict[str, np.ndarray], n_sequences: int
) -> Dict[str, float]:
    """
    Helper function for update_batch.
    """
    vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
    act_masks = None
    if self.policy.use_continuous_act:
        expert_actions = ModelUtils.list_to_tensor(mini_batch_demo["actions"])
    else:
        raw_expert_actions = ModelUtils.list_to_tensor(
            mini_batch_demo["actions"], dtype=torch.long
        )
        expert_actions = ModelUtils.actions_to_onehot(
            raw_expert_actions, self.policy.act_size
        )
        act_masks = ModelUtils.list_to_tensor(
            np.ones(
                (
                    self.n_sequences * self.policy.sequence_length,
                    sum(self.policy.behavior_spec.discrete_action_branches),
                ),
                dtype=np.float32,
            )
        )

    memories = []
    if self.policy.use_recurrent:
        memories = torch.zeros(1, self.n_sequences, self.policy.m_size)

    if self.policy.use_vis_obs:
        vis_obs = []
        for idx, _ in enumerate(
            self.policy.actor_critic.network_body.visual_processors
        ):
            vis_ob = ModelUtils.list_to_tensor(mini_batch_demo["visual_obs%d" % idx])
            vis_obs.append(vis_ob)
    else:
        vis_obs = []

    selected_actions, all_log_probs, _, _ = self.policy.sample_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        memories=memories,
        seq_len=self.policy.sequence_length,
        all_log_probs=True,
    )
    bc_loss = self._behavioral_cloning_loss(
        selected_actions, all_log_probs, expert_actions
    )
    self.optimizer.zero_grad()
    bc_loss.backward()
    self.optimizer.step()
    run_out = {"loss": bc_loss.item()}
    return run_out
def test_visual_encoder_trains(vis_class, size):
    torch.manual_seed(0)
    image_size = (size, size, 1)
    batch = 100

    inputs = torch.cat(
        [torch.zeros((batch,) + image_size), torch.ones((batch,) + image_size)], dim=0
    )
    target = torch.cat([torch.zeros((batch,)), torch.ones((batch,))], dim=0)
    enc = vis_class(image_size[0], image_size[1], image_size[2], 1)
    optimizer = torch.optim.Adam(enc.parameters(), lr=0.001)

    for _ in range(15):
        prediction = enc(inputs)[:, 0]
        loss = torch.mean((target - prediction) ** 2)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    assert loss.item() < 0.05
def compute_loss(
    self, policy_batch: AgentBuffer, expert_batch: AgentBuffer
) -> Tuple[torch.Tensor, Dict[str, np.ndarray]]:
    """
    Given a policy mini_batch and an expert mini_batch, computes the loss of the
    discriminator and a dictionary of stats to report.
    """
    total_loss = torch.zeros(1)
    stats_dict: Dict[str, np.ndarray] = {}
    policy_estimate, policy_mu = self.compute_estimate(
        policy_batch, use_vail_noise=True
    )
    expert_estimate, expert_mu = self.compute_estimate(
        expert_batch, use_vail_noise=True
    )
    stats_dict["Policy/GAIL Policy Estimate"] = policy_estimate.mean().item()
    stats_dict["Policy/GAIL Expert Estimate"] = expert_estimate.mean().item()
    discriminator_loss = -(
        torch.log(expert_estimate + self.EPSILON)
        + torch.log(1.0 - policy_estimate + self.EPSILON)
    ).mean()
    stats_dict["Losses/GAIL Loss"] = discriminator_loss.item()
    total_loss += discriminator_loss
    if self._settings.use_vail:
        # KL divergence loss (encourages the latent representation to be normal)
        kl_loss = torch.mean(
            -torch.sum(
                1
                + (self._z_sigma ** 2).log()
                - 0.5 * expert_mu ** 2
                - 0.5 * policy_mu ** 2
                - (self._z_sigma ** 2),
                dim=1,
            )
        )
        vail_loss = self._beta * (kl_loss - self.mutual_information)
        with torch.no_grad():
            self._beta.data = torch.max(
                self._beta + self.alpha * (kl_loss - self.mutual_information),
                torch.tensor(0.0),
            )
        total_loss += vail_loss
        stats_dict["Policy/GAIL Beta"] = self._beta.item()
        stats_dict["Losses/GAIL KL Loss"] = kl_loss.item()
    if self.gradient_penalty_weight > 0.0:
        gradient_magnitude_loss = (
            self.gradient_penalty_weight
            * self.compute_gradient_magnitude(policy_batch, expert_batch)
        )
        stats_dict["Policy/GAIL Grad Mag Loss"] = gradient_magnitude_loss.item()
        total_loss += gradient_magnitude_loss
    return total_loss, stats_dict
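# Sanity-check sketch (not part of the original source): the VAIL KL term above
# uses the closed form KL(N(mu, sigma^2) || N(0, 1)) = -0.5 * (1 + log sigma^2
# - mu^2 - sigma^2), with the mu^2 contribution split evenly between the expert
# and policy batches. The closed form agrees with torch.distributions:
from torch.distributions import Normal, kl_divergence

_mu, _sigma = torch.tensor([0.3]), torch.tensor([0.8])
_closed_form = -0.5 * (1 + (_sigma ** 2).log() - _mu ** 2 - _sigma ** 2)
assert torch.allclose(
    _closed_form, kl_divergence(Normal(_mu, _sigma), Normal(0.0, 1.0))
)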
def test_get_probs_and_entropy():
    inp_size = 4
    act_size = 2
    action_model, masks = create_action_model(inp_size, act_size)

    _continuous_dist = GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2)))
    act_size = 2
    test_prob = torch.tensor([[1.0 - 0.1 * (act_size - 1)] + [0.1] * (act_size - 1)])
    _discrete_dist_list = [
        CategoricalDistInstance(test_prob),
        CategoricalDistInstance(test_prob),
    ]
    dist_tuple = DistInstances(_continuous_dist, _discrete_dist_list)

    agent_action = AgentAction(
        torch.zeros((1, 2)), [torch.tensor([0]), torch.tensor([1])]
    )

    log_probs, entropies = action_model._get_probs_and_entropy(agent_action, dist_tuple)

    assert log_probs.continuous_tensor.shape == (1, 2)
    assert len(log_probs.discrete_list) == 2
    for _disc in log_probs.discrete_list:
        assert _disc.shape == (1,)
    assert len(log_probs.all_discrete_list) == 2
    for _disc in log_probs.all_discrete_list:
        assert _disc.shape == (1, 2)

    for clp in log_probs.continuous_tensor[0]:
        # Log prob of standard normal at 0
        assert clp == pytest.approx(-0.919, abs=0.01)

    assert log_probs.discrete_list[0] > log_probs.discrete_list[1]

    for ent, val in zip(entropies[0], [1.4189, 0.6191, 0.6191]):
        assert ent == pytest.approx(val, abs=0.01)
def test_get_probs_and_entropy():
    # Test continuous
    # Add two dists to the list. This isn't done in the code but we'd like to support it.
    dist_list = [
        GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2))),
        GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2))),
    ]
    action_list = [torch.zeros((1, 2)), torch.zeros((1, 2))]
    log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
        action_list, dist_list
    )
    assert log_probs.shape == (1, 2, 2)
    assert entropies.shape == (1, 2, 2)
    assert all_probs is None

    for log_prob in log_probs.flatten():
        # Log prob of standard normal at 0
        assert log_prob == pytest.approx(-0.919, abs=0.01)

    for ent in entropies.flatten():
        # Entropy of standard normal
        assert ent == pytest.approx(1.42, abs=0.01)

    # Test discrete
    # Add two dists to the list.
    act_size = 2
    test_prob = torch.tensor(
        [[1.0 - 0.1 * (act_size - 1)] + [0.1] * (act_size - 1)]
    )  # High prob for first action
    dist_list = [CategoricalDistInstance(test_prob), CategoricalDistInstance(test_prob)]
    action_list = [torch.tensor([0]), torch.tensor([1])]
    log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
        action_list, dist_list
    )
    assert all_probs.shape == (1, len(dist_list) * act_size)
    assert entropies.shape == (1, len(dist_list))
    # Make sure the first action has a higher probability than the others.
    assert log_probs.flatten()[0] > log_probs.flatten()[1]
def get_trajectory_value_estimates(
    self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
    vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
    if self.policy.use_vis_obs:
        visual_obs = []
        for idx, _ in enumerate(
            self.policy.actor_critic.network_body.visual_processors
        ):
            visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
            visual_obs.append(visual_ob)
    else:
        visual_obs = []

    memory = torch.zeros([1, 1, self.policy.m_size])

    vec_vis_obs = SplitObservations.from_observations(next_obs)
    next_vec_obs = [
        ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0)
    ]
    next_vis_obs = [
        ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
        for _vis_ob in vec_vis_obs.visual_observations
    ]

    value_estimates, next_memory = self.policy.actor_critic.critic_pass(
        vector_obs, visual_obs, memory, sequence_length=batch.num_experiences
    )
    next_value_estimate, _ = self.policy.actor_critic.critic_pass(
        next_vec_obs, next_vis_obs, next_memory, sequence_length=1
    )

    for name, estimate in value_estimates.items():
        value_estimates[name] = ModelUtils.to_numpy(estimate)
        next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])

    if done:
        for k in next_value_estimate:
            if not self.reward_signals[k].ignore_done:
                next_value_estimate[k] = 0.0

    return value_estimates, next_value_estimate
def __init__(self, input_size, output_size, hyper_input_size, layer_size, num_layers):
    """
    Hyper Network module. This module will use the hyper_input tensor to generate
    the weights of the main network. The main network is a single fully connected
    layer.
    :param input_size: The size of the input of the main network
    :param output_size: The size of the output of the main network
    :param hyper_input_size: The size of the input of the hypernetwork that will
        generate the main network.
    :param layer_size: The number of hidden units in the layers of the hypernetwork
    :param num_layers: The number of layers of the hypernetwork
    """
    super().__init__()
    self.input_size = input_size
    self.output_size = output_size

    layer_in_size = hyper_input_size
    layers = []
    for _ in range(num_layers):
        layers.append(
            linear_layer(
                layer_in_size,
                layer_size,
                kernel_init=Initialization.KaimingHeNormal,
                kernel_gain=1.0,
                bias_init=Initialization.Zero,
            )
        )
        layers.append(Swish())
        layer_in_size = layer_size
    flat_output = linear_layer(
        layer_size,
        input_size * output_size,
        kernel_init=Initialization.KaimingHeNormal,
        kernel_gain=0.1,
        bias_init=Initialization.Zero,
    )

    # Re-initializing the weights of the last layer of the hypernetwork
    bound = math.sqrt(1 / (layer_size * self.input_size))
    flat_output.weight.data.uniform_(-bound, bound)

    self.hypernet = torch.nn.Sequential(*layers, LayerNorm(), flat_output)

    # The hypernetwork will not generate the bias of the main network layer
    self.bias = torch.nn.Parameter(torch.zeros(output_size))
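# A sketch of the forward pass this state supports (a hedged reading of the
# module's intent, not taken verbatim from the original source): the generated
# flat weights are reshaped into a per-sample (input_size, output_size) matrix
# and applied with a batched matrix multiply.
def forward(self, input_activation: torch.Tensor, hyper_input: torch.Tensor) -> torch.Tensor:
    flat_weights = self.hypernet(hyper_input)  # (batch, input_size * output_size)
    batch_size = input_activation.size(0)
    weights = flat_weights.view(batch_size, self.input_size, self.output_size)
    # (batch, 1, input_size) @ (batch, input_size, output_size) -> (batch, output_size)
    return torch.bmm(input_activation.unsqueeze(1), weights).squeeze(1) + self.bias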
def _update_batch(
    self, mini_batch_demo: AgentBuffer, n_sequences: int
) -> Dict[str, float]:
    """
    Helper function for update_batch.
    """
    np_obs = ObsUtil.from_buffer(
        mini_batch_demo, len(self.policy.behavior_spec.observation_specs)
    )
    # Convert to tensors
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
    act_masks = None
    expert_actions = AgentAction.from_buffer(mini_batch_demo)
    if self.policy.behavior_spec.action_spec.discrete_size > 0:
        act_masks = ModelUtils.list_to_tensor(
            np.ones(
                (
                    self.n_sequences * self.policy.sequence_length,
                    sum(self.policy.behavior_spec.action_spec.discrete_branches),
                ),
                dtype=np.float32,
            )
        )

    memories = []
    if self.policy.use_recurrent:
        memories = torch.zeros(1, self.n_sequences, self.policy.m_size)

    selected_actions, log_probs, _, _ = self.policy.sample_actions(
        tensor_obs,
        masks=act_masks,
        memories=memories,
        seq_len=self.policy.sequence_length,
    )
    bc_loss = self._behavioral_cloning_loss(
        selected_actions, log_probs, expert_actions
    )
    self.optimizer.zero_grad()
    bc_loss.backward()
    self.optimizer.step()
    run_out = {"loss": bc_loss.item()}
    return run_out
def test_multi_head_attention_masking():
    epsilon = 0.0001
    n_h, emb_size = 4, 12
    n_k, n_q, b = 13, 14, 15
    mha = MultiHeadAttention(emb_size, n_h)

    # create a key input with some keys all 0
    query = torch.ones((b, n_q, emb_size))
    key = torch.ones((b, n_k, emb_size))
    value = torch.ones((b, n_k, emb_size))
    mask = torch.zeros((b, n_k))
    for i in range(n_k):
        if i % 3 == 0:
            key[:, i, :] = 0
            mask[:, i] = 1

    _, attention = mha.forward(query, key, value, n_q, n_k, mask)

    for i in range(n_k):
        if i % 3 == 0:
            assert torch.sum(attention[:, :, :, i] ** 2) < epsilon
        else:
            assert torch.sum(attention[:, :, :, i] ** 2) > epsilon
def __init__(self, policy):
    # ONNX only supports input in NCHW (channel-first) format, and Barracuda
    # also expects data in NCHW. Any multi-dimensional input should follow
    # that convention, otherwise the Barracuda import will fail.
    self.policy = policy
    observation_specs = self.policy.behavior_spec.observation_specs
    batch_dim = [1]
    seq_len_dim = [1]
    vec_obs_size = 0
    for obs_spec in observation_specs:
        if len(obs_spec.shape) == 1:
            vec_obs_size += obs_spec.shape[0]
    num_vis_obs = sum(1 for obs_spec in observation_specs if len(obs_spec.shape) == 3)
    dummy_vec_obs = [torch.zeros(batch_dim + [vec_obs_size])]
    # Create input shapes in NCHW
    # (they are NHWC in observation_specs.shape).
    dummy_vis_obs = [
        torch.zeros(
            batch_dim + [obs_spec.shape[2], obs_spec.shape[0], obs_spec.shape[1]]
        )
        for obs_spec in observation_specs
        if len(obs_spec.shape) == 3
    ]
    dummy_var_len_obs = [
        torch.zeros(batch_dim + [obs_spec.shape[0], obs_spec.shape[1]])
        for obs_spec in observation_specs
        if len(obs_spec.shape) == 2
    ]
    dummy_masks = torch.ones(
        batch_dim + [sum(self.policy.behavior_spec.action_spec.discrete_branches)]
    )
    dummy_memories = torch.zeros(
        batch_dim + seq_len_dim + [self.policy.export_memory_size]
    )
    self.dummy_input = (
        dummy_vec_obs,
        dummy_vis_obs,
        dummy_var_len_obs,
        dummy_masks,
        dummy_memories,
    )
    self.input_names = [TensorNames.vector_observation_placeholder]
    for i in range(num_vis_obs):
        self.input_names.append(TensorNames.get_visual_observation_name(i))
    for i, obs_spec in enumerate(observation_specs):
        if len(obs_spec.shape) == 2:
            self.input_names.append(TensorNames.get_observation_name(i))
    self.input_names += [
        TensorNames.action_mask_placeholder,
        TensorNames.recurrent_in_placeholder,
    ]
    self.dynamic_axes = {name: {0: "batch"} for name in self.input_names}
    self.output_names = [TensorNames.version_number, TensorNames.memory_size]
    if self.policy.behavior_spec.action_spec.continuous_size > 0:
        self.output_names += [
            TensorNames.continuous_action_output,
            TensorNames.continuous_action_output_shape,
        ]
        self.dynamic_axes.update({TensorNames.continuous_action_output: {0: "batch"}})
    if self.policy.behavior_spec.action_spec.discrete_size > 0:
        self.output_names += [
            TensorNames.discrete_action_output,
            TensorNames.discrete_action_output_shape,
        ]
        self.dynamic_axes.update({TensorNames.discrete_action_output: {0: "batch"}})
    if (
        self.policy.behavior_spec.action_spec.continuous_size == 0
        or self.policy.behavior_spec.action_spec.discrete_size == 0
    ):
        self.output_names += [
            TensorNames.action_output_deprecated,
            TensorNames.is_continuous_control_deprecated,
            TensorNames.action_output_shape_deprecated,
        ]
        self.dynamic_axes.update({TensorNames.action_output_deprecated: {0: "batch"}})
    if self.policy.export_memory_size > 0:
        self.output_names += [TensorNames.recurrent_output]
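# A hedged sketch of the export step these fields exist to feed; the method
# name, output path handling, and opset are assumptions modeled on the
# standalone export_model above, not taken from the original class.
def export_policy_model(self, output_filepath: str) -> None:
    torch.onnx.export(
        self.policy.actor_critic,  # module whose forward matches dummy_input
        self.dummy_input,
        f"{output_filepath}.onnx",
        opset_version=9,
        input_names=self.input_names,
        output_names=self.output_names,
        dynamic_axes=self.dynamic_axes,
    )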
def __init__(self, vec_obs_size: int):
    super().__init__()
    self.register_buffer("normalization_steps", torch.tensor(1))
    self.register_buffer("running_mean", torch.zeros(vec_obs_size))
    self.register_buffer("running_variance", torch.ones(vec_obs_size))
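# A minimal sketch of the companion methods this state supports, assuming a
# Welford-style running update in which running_variance accumulates summed
# squared deviations (hence the division by normalization_steps in forward);
# the clamp range is an assumption, not taken from the original source.
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
    return torch.clamp(
        (inputs - self.running_mean)
        / torch.sqrt(self.running_variance / self.normalization_steps),
        -5,
        5,
    )

def update(self, vector_input: torch.Tensor) -> None:
    with torch.no_grad():
        steps_increment = vector_input.size(0)
        total_new_steps = self.normalization_steps + steps_increment
        input_to_old_mean = vector_input - self.running_mean
        new_mean = self.running_mean + (input_to_old_mean / total_new_steps).sum(0)
        input_to_new_mean = vector_input - new_mean
        self.running_variance = self.running_variance + (
            input_to_new_mean * input_to_old_mean
        ).sum(0)
        self.running_mean = new_mean
        self.normalization_steps = total_new_steps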
def get_trajectory_value_estimates(
    self,
    batch: AgentBuffer,
    next_obs: List[np.ndarray],
    done: bool,
    agent_id: str = "",
) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]:
    """
    Get value estimates and memories for a trajectory, in batch form.
    :param batch: An AgentBuffer that consists of a trajectory.
    :param next_obs: the next observation (after the trajectory). Used for
        bootstrapping if this is not a terminal trajectory.
    :param done: Set true if this is a terminal trajectory.
    :param agent_id: Agent ID of the agent that this trajectory belongs to.
    :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)],
        the final value estimate as a Dict of [name, float], and optionally (if using memories)
        an AgentBufferField of initial critic memories to be used during update.
    """
    n_obs = len(self.policy.behavior_spec.observation_specs)

    if agent_id in self.critic_memory_dict:
        memory = self.critic_memory_dict[agent_id]
    else:
        memory = (
            torch.zeros((1, 1, self.critic.memory_size))
            if self.policy.use_recurrent
            else None
        )

    # Convert to tensors
    current_obs = [
        ModelUtils.list_to_tensor(obs) for obs in ObsUtil.from_buffer(batch, n_obs)
    ]
    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
    next_obs = [obs.unsqueeze(0) for obs in next_obs]

    # If we're using LSTM, we want to get all the intermediate memories.
    all_next_memories: Optional[AgentBufferField] = None

    # To prevent memory leak and improve performance, evaluate with no_grad.
    with torch.no_grad():
        if self.policy.use_recurrent:
            (
                value_estimates,
                all_next_memories,
                next_memory,
            ) = self._evaluate_by_sequence(current_obs, memory)
        else:
            value_estimates, next_memory = self.critic.critic_pass(
                current_obs, memory, sequence_length=batch.num_experiences
            )

    # Store the memory for the next trajectory. This should NOT have a gradient.
    self.critic_memory_dict[agent_id] = next_memory

    next_value_estimate, _ = self.critic.critic_pass(
        next_obs, next_memory, sequence_length=1
    )

    for name, estimate in value_estimates.items():
        value_estimates[name] = ModelUtils.to_numpy(estimate)
        next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])

    if done:
        for k in next_value_estimate:
            if not self.reward_signals[k].ignore_done:
                next_value_estimate[k] = 0.0
        if agent_id in self.critic_memory_dict:
            self.critic_memory_dict.pop(agent_id)
    return value_estimates, next_value_estimate, all_next_memories
def get_trajectory_and_baseline_value_estimates(
    self,
    batch: AgentBuffer,
    next_obs: List[np.ndarray],
    next_groupmate_obs: List[List[np.ndarray]],
    done: bool,
    agent_id: str = "",
) -> Tuple[
    Dict[str, np.ndarray],
    Dict[str, np.ndarray],
    Dict[str, float],
    Optional[AgentBufferField],
    Optional[AgentBufferField],
]:
    """
    Get value estimates, baseline estimates, and memories for a trajectory, in batch form.
    :param batch: An AgentBuffer that consists of a trajectory.
    :param next_obs: the next observation (after the trajectory). Used for
        bootstrapping if this is not a terminal trajectory.
    :param next_groupmate_obs: the next observations from other members of the group.
    :param done: Set true if this is a terminal trajectory.
    :param agent_id: Agent ID of the agent that this trajectory belongs to.
    :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)],
        the baseline estimates as a Dict, the final value estimate as a Dict of [name, float],
        and optionally (if using memories) an AgentBufferField of initial critic and baseline
        memories to be used during update.
    """
    n_obs = len(self.policy.behavior_spec.observation_specs)

    current_obs = ObsUtil.from_buffer(batch, n_obs)
    groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs)

    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
    groupmate_obs = [
        [ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs]
        for _groupmate_obs in groupmate_obs
    ]

    groupmate_actions = AgentAction.group_from_buffer(batch)

    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
    next_obs = [obs.unsqueeze(0) for obs in next_obs]

    next_groupmate_obs = [
        ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_groupmate_obs
    ]
    # Expand dimensions of next critic obs
    next_groupmate_obs = [
        [_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_groupmate_obs
    ]

    if agent_id in self.value_memory_dict:
        # The agent_id should always be in both since they are added together
        _init_value_mem = self.value_memory_dict[agent_id]
        _init_baseline_mem = self.baseline_memory_dict[agent_id]
    else:
        _init_value_mem = (
            torch.zeros((1, 1, self.critic.memory_size))
            if self.policy.use_recurrent
            else None
        )
        _init_baseline_mem = (
            torch.zeros((1, 1, self.critic.memory_size))
            if self.policy.use_recurrent
            else None
        )

    all_obs = (
        [current_obs] + groupmate_obs if groupmate_obs is not None else [current_obs]
    )
    all_next_value_mem: Optional[AgentBufferField] = None
    all_next_baseline_mem: Optional[AgentBufferField] = None
    with torch.no_grad():
        if self.policy.use_recurrent:
            (
                value_estimates,
                baseline_estimates,
                all_next_value_mem,
                all_next_baseline_mem,
                next_value_mem,
                next_baseline_mem,
            ) = self._evaluate_by_sequence_team(
                current_obs,
                groupmate_obs,
                groupmate_actions,
                _init_value_mem,
                _init_baseline_mem,
            )
        else:
            value_estimates, next_value_mem = self.critic.critic_pass(
                all_obs, _init_value_mem, sequence_length=batch.num_experiences
            )
            groupmate_obs_and_actions = (groupmate_obs, groupmate_actions)
            baseline_estimates, next_baseline_mem = self.critic.baseline(
                current_obs,
                groupmate_obs_and_actions,
                _init_baseline_mem,
                sequence_length=batch.num_experiences,
            )
    # Store the memory for the next trajectory
    self.value_memory_dict[agent_id] = next_value_mem
    self.baseline_memory_dict[agent_id] = next_baseline_mem

    all_next_obs = (
        [next_obs] + next_groupmate_obs
        if next_groupmate_obs is not None
        else [next_obs]
    )

    next_value_estimates, _ = self.critic.critic_pass(
        all_next_obs, next_value_mem, sequence_length=1
    )

    for name, estimate in baseline_estimates.items():
        baseline_estimates[name] = ModelUtils.to_numpy(estimate)

    for name, estimate in value_estimates.items():
        value_estimates[name] = ModelUtils.to_numpy(estimate)

    # the baseline and V should not be conditioned on the same done flag
    for name, estimate in next_value_estimates.items():
        next_value_estimates[name] = ModelUtils.to_numpy(estimate)

    if done:
        for k in next_value_estimates:
            if not self.reward_signals[k].ignore_done:
                next_value_estimates[k][-1] = 0.0

    return (
        value_estimates,
        baseline_estimates,
        next_value_estimates,
        all_next_value_mem,
        all_next_baseline_mem,
    )