Code Example #1
def export_model(network):
    vec_obs_size = 16
    num_vis_obs = 0
    dummy_vec_obs = [torch.zeros([1] + [vec_obs_size])]
    dummy_vis_obs = []
    dummy_var_len_obs = []
    dummy_masks = torch.ones([1] + [0])
    dummy_memories = torch.zeros([1] + [1] + [256])
    dummy_input = (
        dummy_vec_obs,
        dummy_vis_obs,
        dummy_var_len_obs,
        dummy_masks,
        dummy_memories,
    )
    input_names = ['vector_observation', 'action_masks', 'recurrent_in']
    dynamic_axes = {name: {0: "batch"} for name in input_names}
    output_names = [
        'version_number', 'memory_size', 'continuous_actions',
        'continuous_action_output_shape', 'action', 'is_continuous_control',
        'action_output_shape', 'recurrent_out'
    ]
    dynamic_axes.update({'continuous_actions': {0: "batch"}})
    dynamic_axes.update({'action': {0: "batch"}})

    torch.onnx.export(network,
                      dummy_input,
                      EXPORT_FILE,
                      opset_version=9,
                      input_names=input_names,
                      output_names=output_names,
                      dynamic_axes=dynamic_axes)
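
The export above follows the standard torch.onnx.export pattern: a dummy input tuple that matches the model's forward signature, explicit input_names and output_names, and dynamic_axes so the batch dimension stays flexible at inference time. Below is a minimal, self-contained sketch of the same pattern; the SmallNet module and the output file name are hypothetical and not taken from ml-agents.

import torch
import torch.nn as nn


class SmallNet(nn.Module):
    # Hypothetical stand-in network whose forward matches the dummy input.
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(16, 4)

    def forward(self, vector_observation):
        return self.fc(vector_observation)


net = SmallNet()
dummy_vec_obs = torch.zeros(1, 16)  # batch of 1, 16-dim vector observation
torch.onnx.export(
    net,
    (dummy_vec_obs,),
    "small_net.onnx",
    opset_version=9,
    input_names=["vector_observation"],
    output_names=["continuous_actions"],
    dynamic_axes={
        "vector_observation": {0: "batch"},
        "continuous_actions": {0: "batch"},
    },
)
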
Code Example #2
    def __init__(self, policy):
        # ONNX only supports input in NCHW (channel-first) format.
        # Barracuda also expects to get data in NCHW.
        # Any multi-dimensional input should follow that format; otherwise it
        # will cause problems with the Barracuda import.
        self.policy = policy
        observation_specs = self.policy.behavior_spec.observation_specs
        batch_dim = [1]
        seq_len_dim = [1]
        num_obs = len(observation_specs)

        dummy_obs = [
            torch.zeros(batch_dim +
                        list(ModelSerializer._get_onnx_shape(obs_spec.shape)))
            for obs_spec in observation_specs
        ]

        dummy_masks = torch.ones(
            batch_dim +
            [sum(self.policy.behavior_spec.action_spec.discrete_branches)])
        dummy_memories = torch.zeros(batch_dim + seq_len_dim +
                                     [self.policy.export_memory_size])

        self.dummy_input = (dummy_obs, dummy_masks, dummy_memories)

        self.input_names = [
            TensorNames.get_observation_name(i) for i in range(num_obs)
        ]
        self.input_names += [
            TensorNames.action_mask_placeholder,
            TensorNames.recurrent_in_placeholder,
        ]

        self.dynamic_axes = {name: {0: "batch"} for name in self.input_names}

        self.output_names = [
            TensorNames.version_number, TensorNames.memory_size
        ]
        if self.policy.behavior_spec.action_spec.continuous_size > 0:
            self.output_names += [
                TensorNames.continuous_action_output,
                TensorNames.continuous_action_output_shape,
            ]
            self.dynamic_axes.update(
                {TensorNames.continuous_action_output: {
                    0: "batch"
                }})
        if self.policy.behavior_spec.action_spec.discrete_size > 0:
            self.output_names += [
                TensorNames.discrete_action_output,
                TensorNames.discrete_action_output_shape,
            ]
            self.dynamic_axes.update(
                {TensorNames.discrete_action_output: {
                    0: "batch"
                }})

        if self.policy.export_memory_size > 0:
            self.output_names += [TensorNames.recurrent_output]
Code Example #3
    def __init__(self, policy):
        # ONNX only supports input in NCHW (channel-first) format.
        # Barracuda also expects to get data in NCHW.
        # Any multi-dimensional input should follow that format; otherwise it
        # will cause problems with the Barracuda import.
        self.policy = policy
        batch_dim = [1]
        seq_len_dim = [1]
        dummy_vec_obs = [torch.zeros(batch_dim + [self.policy.vec_obs_size])]
        # Create the input shapes in NCHW format
        # (they are NHWC in self.policy.behavior_spec.observation_shapes).
        dummy_vis_obs = [
            torch.zeros(batch_dim + [shape[2], shape[0], shape[1]])
            for shape in self.policy.behavior_spec.observation_shapes
            if len(shape) == 3
        ]
        dummy_masks = torch.ones(
            batch_dim + [sum(self.policy.behavior_spec.action_spec.discrete_branches)]
        )
        dummy_memories = torch.zeros(
            batch_dim + seq_len_dim + [self.policy.export_memory_size]
        )

        self.dummy_input = (dummy_vec_obs, dummy_vis_obs, dummy_masks, dummy_memories)

        self.input_names = (
            ["vector_observation"]
            + [f"visual_observation_{i}" for i in range(self.policy.vis_obs_size)]
            + ["action_masks", "memories"]
        )
        self.dynamic_axes = {name: {0: "batch"} for name in self.input_names}

        self.output_names = ["version_number", "memory_size"]
        if self.policy.behavior_spec.action_spec.continuous_size > 0:
            self.output_names += [
                "continuous_actions",
                "continuous_action_output_shape",
            ]
            self.dynamic_axes.update({"continuous_actions": {0: "batch"}})
        if self.policy.behavior_spec.action_spec.discrete_size > 0:
            self.output_names += ["discrete_actions", "discrete_action_output_shape"]
            self.dynamic_axes.update({"discrete_actions": {0: "batch"}})
        if (
            self.policy.behavior_spec.action_spec.continuous_size == 0
            or self.policy.behavior_spec.action_spec.discrete_size == 0
        ):
            self.output_names += [
                "action",
                "is_continuous_control",
                "action_output_shape",
            ]
            self.dynamic_axes.update({"action": {0: "batch"}})
Code Example #4
 def generate_input_helper(pattern):
     _input = torch.zeros((batch_size, 0, size))
     for i in range(len(pattern)):
         if i % 2 == 0:
             _input = torch.cat(
                 [_input,
                  torch.rand((batch_size, pattern[i], size))],
                 dim=1)
         else:
             _input = torch.cat(
                 [_input,
                  torch.zeros((batch_size, pattern[i], size))],
                 dim=1)
     return _input
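
generate_input_helper builds a (batch_size, sum(pattern), size) tensor by concatenating blocks along dim 1, alternating random data (even pattern indices) with all-zero padding (odd pattern indices); batch_size and size come from the enclosing test. A self-contained sketch of the same idea, with batch_size and size chosen arbitrarily here:

import torch

batch_size, size = 2, 8  # arbitrary values for illustration


def generate_input_helper(pattern):
    _input = torch.zeros((batch_size, 0, size))
    for i, block_len in enumerate(pattern):
        block = torch.rand if i % 2 == 0 else torch.zeros
        _input = torch.cat([_input, block((batch_size, block_len, size))], dim=1)
    return _input


x = generate_input_helper([3, 2, 4])  # shape (2, 3 + 2 + 4, 8)
assert x.shape == (batch_size, 9, size)
assert torch.all(x[:, 3:5, :] == 0)  # the odd-indexed block is zero padding
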
Code Example #5
def test_gaussian_dist_instance():
    torch.manual_seed(0)
    act_size = 4
    dist_instance = GaussianDistInstance(torch.zeros(1, act_size),
                                         torch.ones(1, act_size))
    action = dist_instance.sample()
    assert action.shape == (1, act_size)
    for log_prob in dist_instance.log_prob(torch.zeros(
        (1, act_size))).flatten():
        # Log prob of standard normal at 0
        assert log_prob == pytest.approx(-0.919, abs=0.01)

    for ent in dist_instance.entropy().flatten():
        # entropy of standard normal at 0, based on 1/2 + ln(sqrt(2pi)sigma)
        assert ent == pytest.approx(1.42, abs=0.01)
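
The expected constants in this test follow directly from the standard normal density: log p(0) = -0.5 * ln(2 * pi), roughly -0.919, and the differential entropy for sigma = 1 is 1/2 + ln(sigma * sqrt(2 * pi)), roughly 1.419, matching the comments above. A quick check of that arithmetic:

import math

log_prob_at_zero = -0.5 * math.log(2 * math.pi)  # ~ -0.9189
entropy_sigma_one = 0.5 + math.log(math.sqrt(2 * math.pi))  # ~ 1.4189
print(round(log_prob_at_zero, 4), round(entropy_sigma_one, 4))
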
Code Example #6
def test_gaussian_distribution(conditional_sigma, tanh_squash):
    torch.manual_seed(0)
    hidden_size = 16
    act_size = 4
    sample_embedding = torch.ones((1, 16))
    gauss_dist = GaussianDistribution(
        hidden_size,
        act_size,
        conditional_sigma=conditional_sigma,
        tanh_squash=tanh_squash,
    )

    # Make sure backprop works
    force_action = torch.zeros((1, act_size))
    optimizer = torch.optim.Adam(gauss_dist.parameters(), lr=3e-3)

    for _ in range(50):
        dist_inst = gauss_dist(sample_embedding)[0]
        if tanh_squash:
            assert isinstance(dist_inst, TanhGaussianDistInstance)
        else:
            assert isinstance(dist_inst, GaussianDistInstance)
        log_prob = dist_inst.log_prob(force_action)
        loss = torch.nn.functional.mse_loss(log_prob,
                                            -2 * torch.ones(log_prob.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    for prob in log_prob.flatten():
        assert prob == pytest.approx(-2, abs=0.1)
Code Example #7
    def get_trajectory_value_estimates(
            self, batch: AgentBuffer, next_obs: List[np.ndarray],
            done: bool) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
        n_obs = len(self.policy.behavior_spec.observation_specs)
        current_obs = ObsUtil.from_buffer(batch, n_obs)

        # Convert to tensors
        current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
        next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]

        memory = torch.zeros([1, 1, self.policy.m_size])

        next_obs = [obs.unsqueeze(0) for obs in next_obs]

        value_estimates, next_memory = self.policy.actor_critic.critic_pass(
            current_obs, memory, sequence_length=batch.num_experiences)

        next_value_estimate, _ = self.policy.actor_critic.critic_pass(
            next_obs, next_memory, sequence_length=1)

        for name, estimate in value_estimates.items():
            value_estimates[name] = ModelUtils.to_numpy(estimate)
            next_value_estimate[name] = ModelUtils.to_numpy(
                next_value_estimate[name])

        if done:
            for k in next_value_estimate:
                if not self.reward_signals[k].ignore_done:
                    next_value_estimate[k] = 0.0

        return value_estimates, next_value_estimate
Code Example #8
 def __init__(
     self,
     hidden_size: int,
     num_outputs: int,
     conditional_sigma: bool = False,
     tanh_squash: bool = False,
 ):
     super().__init__()
     self.conditional_sigma = conditional_sigma
     self.mu = linear_layer(
         hidden_size,
         num_outputs,
         kernel_init=Initialization.KaimingHeNormal,
         kernel_gain=0.1,
         bias_init=Initialization.Zero,
     )
     self.tanh_squash = tanh_squash
     if conditional_sigma:
         self.log_sigma = linear_layer(
             hidden_size,
             num_outputs,
             kernel_init=Initialization.KaimingHeNormal,
             kernel_gain=0.1,
             bias_init=Initialization.Zero,
         )
     else:
         self.log_sigma = nn.Parameter(
             torch.zeros(1, num_outputs, requires_grad=True))
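
The forward pass of GaussianDistribution is not part of this excerpt. Based on the fields initialized above, a plausible sketch is that it evaluates the mu head, picks either the conditional or the global log_sigma, and wraps the result in a GaussianDistInstance or TanhGaussianDistInstance. Treat the following as an illustrative assumption rather than the actual implementation:

def forward(self, inputs):
    # Hypothetical forward pass for the module above (not shown in the excerpt).
    mu = self.mu(inputs)
    if self.conditional_sigma:
        log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
    else:
        # Expand the single learned log-sigma row to the batch size.
        log_sigma = self.log_sigma.expand(inputs.shape[0], -1)
    if self.tanh_squash:
        return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
    return [GaussianDistInstance(mu, torch.exp(log_sigma))]
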
Code Example #9
def test_tanh_gaussian_dist_instance():
    torch.manual_seed(0)
    act_size = 4
    dist_instance = TanhGaussianDistInstance(torch.zeros(1, act_size),
                                             torch.ones(1, act_size))
    for _ in range(10):
        action = dist_instance.sample()
        assert action.shape == (1, act_size)
        assert torch.max(action) < 1.0 and torch.min(action) > -1.0
Code Example #10
File: module.py, Project: zt1217396582/ml-agents
    def _update_batch(self, mini_batch_demo: Dict[str, np.ndarray],
                      n_sequences: int) -> Dict[str, float]:
        """
        Helper function for update_batch.
        """
        vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
        act_masks = None
        if self.policy.use_continuous_act:
            expert_actions = ModelUtils.list_to_tensor(
                mini_batch_demo["actions"])
        else:
            raw_expert_actions = ModelUtils.list_to_tensor(
                mini_batch_demo["actions"], dtype=torch.long)
            expert_actions = ModelUtils.actions_to_onehot(
                raw_expert_actions, self.policy.act_size)
            act_masks = ModelUtils.list_to_tensor(
                np.ones(
                    (
                        self.n_sequences * self.policy.sequence_length,
                        sum(self.policy.behavior_spec.discrete_action_branches
                            ),
                    ),
                    dtype=np.float32,
                ))

        memories = []
        if self.policy.use_recurrent:
            memories = torch.zeros(1, self.n_sequences, self.policy.m_size)

        if self.policy.use_vis_obs:
            vis_obs = []
            for idx, _ in enumerate(
                    self.policy.actor_critic.network_body.visual_processors):
                vis_ob = ModelUtils.list_to_tensor(
                    mini_batch_demo["visual_obs%d" % idx])
                vis_obs.append(vis_ob)
        else:
            vis_obs = []

        selected_actions, all_log_probs, _, _ = self.policy.sample_actions(
            vec_obs,
            vis_obs,
            masks=act_masks,
            memories=memories,
            seq_len=self.policy.sequence_length,
            all_log_probs=True,
        )
        bc_loss = self._behavioral_cloning_loss(selected_actions,
                                                all_log_probs, expert_actions)
        self.optimizer.zero_grad()
        bc_loss.backward()

        self.optimizer.step()
        run_out = {"loss": bc_loss.item()}
        return run_out
Code Example #11
def test_visual_encoder_trains(vis_class, size):
    torch.manual_seed(0)
    image_size = (size, size, 1)
    batch = 100

    inputs = torch.cat([
        torch.zeros((batch, ) + image_size),
        torch.ones((batch, ) + image_size)
    ],
                       dim=0)
    target = torch.cat([torch.zeros((batch, )), torch.ones((batch, ))], dim=0)
    enc = vis_class(image_size[0], image_size[1], image_size[2], 1)
    optimizer = torch.optim.Adam(enc.parameters(), lr=0.001)

    for _ in range(15):
        prediction = enc(inputs)[:, 0]
        loss = torch.mean((target - prediction)**2)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    assert loss.item() < 0.05
Code Example #12
 def compute_loss(
     self, policy_batch: AgentBuffer, expert_batch: AgentBuffer
 ) -> Tuple[torch.Tensor, Dict[str, np.ndarray]]:
     """
     Given a policy mini_batch and an expert mini_batch, computes the loss of the discriminator.
     """
     total_loss = torch.zeros(1)
     stats_dict: Dict[str, np.ndarray] = {}
     policy_estimate, policy_mu = self.compute_estimate(
         policy_batch, use_vail_noise=True
     )
     expert_estimate, expert_mu = self.compute_estimate(
         expert_batch, use_vail_noise=True
     )
     stats_dict["Policy/GAIL Policy Estimate"] = policy_estimate.mean().item()
     stats_dict["Policy/GAIL Expert Estimate"] = expert_estimate.mean().item()
     discriminator_loss = -(
         torch.log(expert_estimate + self.EPSILON)
         + torch.log(1.0 - policy_estimate + self.EPSILON)
     ).mean()
     stats_dict["Losses/GAIL Loss"] = discriminator_loss.item()
     total_loss += discriminator_loss
     if self._settings.use_vail:
         # KL divergence loss (encourage latent representation to be normal)
         kl_loss = torch.mean(
             -torch.sum(
                 1
                 + (self._z_sigma ** 2).log()
                 - 0.5 * expert_mu ** 2
                 - 0.5 * policy_mu ** 2
                 - (self._z_sigma ** 2),
                 dim=1,
             )
         )
         vail_loss = self._beta * (kl_loss - self.mutual_information)
         with torch.no_grad():
             self._beta.data = torch.max(
                 self._beta + self.alpha * (kl_loss - self.mutual_information),
                 torch.tensor(0.0),
             )
         total_loss += vail_loss
         stats_dict["Policy/GAIL Beta"] = self._beta.item()
         stats_dict["Losses/GAIL KL Loss"] = kl_loss.item()
     if self.gradient_penalty_weight > 0.0:
         gradient_magnitude_loss = (
             self.gradient_penalty_weight
             * self.compute_gradient_magnitude(policy_batch, expert_batch)
         )
         stats_dict["Policy/GAIL Grad Mag Loss"] = gradient_magnitude_loss.item()
         total_loss += gradient_magnitude_loss
     return total_loss, stats_dict
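
The expression inside torch.sum in the VAIL branch is the closed-form KL divergence KL(N(mu, sigma^2) || N(0, 1)) = -0.5 * sum(1 + log sigma^2 - mu^2 - sigma^2), summed for the expert and policy batches with a shared learned sigma, which is why each mu^2 term carries a factor of 0.5. A small numeric check of that identity, with arbitrary shapes:

import torch


def kl_vs_standard_normal(mu, sigma):
    # Closed-form KL(N(mu, sigma^2) || N(0, 1)) summed over latent dimensions.
    return -0.5 * torch.sum(1 + (sigma ** 2).log() - mu ** 2 - sigma ** 2, dim=1)


expert_mu = torch.randn(3, 4)
policy_mu = torch.randn(3, 4)
z_sigma = torch.rand(4) + 0.5  # shared sigma, analogous to self._z_sigma
as_in_loss = -torch.sum(
    1 + (z_sigma ** 2).log() - 0.5 * expert_mu ** 2 - 0.5 * policy_mu ** 2 - z_sigma ** 2,
    dim=1,
)
assert torch.allclose(
    as_in_loss,
    kl_vs_standard_normal(expert_mu, z_sigma) + kl_vs_standard_normal(policy_mu, z_sigma),
)
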
Code Example #13
def test_get_probs_and_entropy():
    inp_size = 4
    act_size = 2
    action_model, masks = create_action_model(inp_size, act_size)

    _continuous_dist = GaussianDistInstance(torch.zeros((1, 2)),
                                            torch.ones((1, 2)))
    act_size = 2
    test_prob = torch.tensor([[1.0 - 0.1 * (act_size - 1)] + [0.1] *
                              (act_size - 1)])
    _discrete_dist_list = [
        CategoricalDistInstance(test_prob),
        CategoricalDistInstance(test_prob),
    ]
    dist_tuple = DistInstances(_continuous_dist, _discrete_dist_list)

    agent_action = AgentAction(torch.zeros(
        (1, 2)), [torch.tensor([0]), torch.tensor([1])])

    log_probs, entropies = action_model._get_probs_and_entropy(
        agent_action, dist_tuple)

    assert log_probs.continuous_tensor.shape == (1, 2)
    assert len(log_probs.discrete_list) == 2
    for _disc in log_probs.discrete_list:
        assert _disc.shape == (1, )
    assert len(log_probs.all_discrete_list) == 2
    for _disc in log_probs.all_discrete_list:
        assert _disc.shape == (1, 2)

    for clp in log_probs.continuous_tensor[0]:
        # Log prob of standard normal at 0
        assert clp == pytest.approx(-0.919, abs=0.01)

    assert log_probs.discrete_list[0] > log_probs.discrete_list[1]

    for ent, val in zip(entropies[0], [1.4189, 0.6191, 0.6191]):
        assert ent == pytest.approx(val, abs=0.01)
Code Example #14
File: test_utils.py, Project: rahzaazhar/ml-agents
def test_get_probs_and_entropy():
    # Test continuous
    # Add two dists to the list. This isn't done in the code but we'd like to support it.
    dist_list = [
        GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2))),
        GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2))),
    ]
    action_list = [torch.zeros((1, 2)), torch.zeros((1, 2))]
    log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
        action_list, dist_list
    )
    assert log_probs.shape == (1, 2, 2)
    assert entropies.shape == (1, 2, 2)
    assert all_probs is None

    for log_prob in log_probs.flatten():
        # Log prob of standard normal at 0
        assert log_prob == pytest.approx(-0.919, abs=0.01)

    for ent in entropies.flatten():
        # entropy of standard normal at 0
        assert ent == pytest.approx(1.42, abs=0.01)

    # Test discrete
    # Add two dists to the list.
    act_size = 2
    test_prob = torch.tensor(
        [[1.0 - 0.1 * (act_size - 1)] + [0.1] * (act_size - 1)]
    )  # High prob for first action
    dist_list = [CategoricalDistInstance(test_prob), CategoricalDistInstance(test_prob)]
    action_list = [torch.tensor([0]), torch.tensor([1])]
    log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
        action_list, dist_list
    )
    assert all_probs.shape == (1, len(dist_list * act_size))
    assert entropies.shape == (1, len(dist_list))
    # Make sure the first action has a higher probability than the others.
    assert log_probs.flatten()[0] > log_probs.flatten()[1]
Code Example #15
    def get_trajectory_value_estimates(
            self, batch: AgentBuffer, next_obs: List[np.ndarray],
            done: bool) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
        vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
        if self.policy.use_vis_obs:
            visual_obs = []
            for idx, _ in enumerate(
                    self.policy.actor_critic.network_body.visual_processors):
                visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" %
                                                            idx])
                visual_obs.append(visual_ob)
        else:
            visual_obs = []

        memory = torch.zeros([1, 1, self.policy.m_size])

        vec_vis_obs = SplitObservations.from_observations(next_obs)
        next_vec_obs = [
            ModelUtils.list_to_tensor(
                vec_vis_obs.vector_observations).unsqueeze(0)
        ]
        next_vis_obs = [
            ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
            for _vis_ob in vec_vis_obs.visual_observations
        ]

        value_estimates, next_memory = self.policy.actor_critic.critic_pass(
            vector_obs,
            visual_obs,
            memory,
            sequence_length=batch.num_experiences)

        next_value_estimate, _ = self.policy.actor_critic.critic_pass(
            next_vec_obs, next_vis_obs, next_memory, sequence_length=1)

        for name, estimate in value_estimates.items():
            value_estimates[name] = ModelUtils.to_numpy(estimate)
            next_value_estimate[name] = ModelUtils.to_numpy(
                next_value_estimate[name])

        if done:
            for k in next_value_estimate:
                if not self.reward_signals[k].ignore_done:
                    next_value_estimate[k] = 0.0

        return value_estimates, next_value_estimate
Code Example #16
    def __init__(self, input_size, output_size, hyper_input_size, layer_size,
                 num_layers):
        """
        Hyper Network module. This module will use the hyper_input tensor to generate
        the weights of the main network. The main network is a single fully connected
        layer.
        :param input_size: The size of the input of the main network
        :param output_size: The size of the output of the main network
        :param hyper_input_size: The size of the input of the hypernetwork that will
        generate the main network.
        :param layer_size: The number of hidden units in the layers of the hypernetwork
        :param num_layers: The number of layers of the hypernetwork
        """
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size

        layer_in_size = hyper_input_size
        layers = []
        for _ in range(num_layers):
            layers.append(
                linear_layer(
                    layer_in_size,
                    layer_size,
                    kernel_init=Initialization.KaimingHeNormal,
                    kernel_gain=1.0,
                    bias_init=Initialization.Zero,
                ))
            layers.append(Swish())
            layer_in_size = layer_size
        flat_output = linear_layer(
            layer_size,
            input_size * output_size,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=0.1,
            bias_init=Initialization.Zero,
        )

        # Re-initializing the weights of the last layer of the hypernetwork
        bound = math.sqrt(1 / (layer_size * self.input_size))
        flat_output.weight.data.uniform_(-bound, bound)

        self.hypernet = torch.nn.Sequential(*layers, LayerNorm(), flat_output)

        # The hypernetwork will not generate the bias of the main network layer
        self.bias = torch.nn.Parameter(torch.zeros(output_size))
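
The forward pass of this hypernetwork is not included in the excerpt. Given the docstring, the flat (input_size * output_size) output would be reshaped into a per-sample weight matrix and applied with a batched matmul, plus the shared bias. The method below is a sketch of that idea, offered as an assumption rather than the actual implementation:

    def forward(self, input_activation, hyper_input):
        # Hypothetical forward pass: generate per-sample weights from hyper_input
        # and apply them as a fully connected layer to input_activation.
        batch_size = hyper_input.size(0)
        flat_weights = self.hypernet(hyper_input)  # (B, input_size * output_size)
        weights = flat_weights.view(batch_size, self.input_size, self.output_size)
        # (B, 1, input_size) @ (B, input_size, output_size) -> (B, output_size)
        return torch.bmm(input_activation.unsqueeze(1), weights).squeeze(1) + self.bias
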
Code Example #17
File: module.py, Project: MalCoward/AIForGamesEngines
    def _update_batch(
        self, mini_batch_demo: AgentBuffer, n_sequences: int
    ) -> Dict[str, float]:
        """
        Helper function for update_batch.
        """
        np_obs = ObsUtil.from_buffer(
            mini_batch_demo, len(self.policy.behavior_spec.observation_specs)
        )
        # Convert to tensors
        tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
        act_masks = None
        expert_actions = AgentAction.from_buffer(mini_batch_demo)
        if self.policy.behavior_spec.action_spec.discrete_size > 0:

            act_masks = ModelUtils.list_to_tensor(
                np.ones(
                    (
                        self.n_sequences * self.policy.sequence_length,
                        sum(self.policy.behavior_spec.action_spec.discrete_branches),
                    ),
                    dtype=np.float32,
                )
            )

        memories = []
        if self.policy.use_recurrent:
            memories = torch.zeros(1, self.n_sequences, self.policy.m_size)

        selected_actions, log_probs, _, _ = self.policy.sample_actions(
            tensor_obs,
            masks=act_masks,
            memories=memories,
            seq_len=self.policy.sequence_length,
        )
        bc_loss = self._behavioral_cloning_loss(
            selected_actions, log_probs, expert_actions
        )
        self.optimizer.zero_grad()
        bc_loss.backward()

        self.optimizer.step()
        run_out = {"loss": bc_loss.item()}
        return run_out
Code Example #18
def test_multi_head_attention_masking():
    epsilon = 0.0001
    n_h, emb_size = 4, 12
    n_k, n_q, b = 13, 14, 15
    mha = MultiHeadAttention(emb_size, n_h)
    # create a key input with some keys all 0
    query = torch.ones((b, n_q, emb_size))
    key = torch.ones((b, n_k, emb_size))
    value = torch.ones((b, n_k, emb_size))

    mask = torch.zeros((b, n_k))
    for i in range(n_k):
        if i % 3 == 0:
            key[:, i, :] = 0
            mask[:, i] = 1

    _, attention = mha.forward(query, key, value, n_q, n_k, mask)

    for i in range(n_k):
        if i % 3 == 0:
            assert torch.sum(attention[:, :, :, i]**2) < epsilon
        else:
            assert torch.sum(attention[:, :, :, i]**2) > epsilon
Code Example #19
    def __init__(self, policy):
        # ONNX only supports input in NCHW (channel-first) format.
        # Barracuda also expects to get data in NCHW.
        # Any multi-dimensional input should follow that format; otherwise it
        # will cause problems with the Barracuda import.
        self.policy = policy
        observation_specs = self.policy.behavior_spec.observation_specs
        batch_dim = [1]
        seq_len_dim = [1]
        vec_obs_size = 0
        for obs_spec in observation_specs:
            if len(obs_spec.shape) == 1:
                vec_obs_size += obs_spec.shape[0]
        num_vis_obs = sum(1 for obs_spec in observation_specs
                          if len(obs_spec.shape) == 3)
        dummy_vec_obs = [torch.zeros(batch_dim + [vec_obs_size])]
        # Create the input shapes in NCHW format
        # (they are NHWC in observation_specs.shape).
        dummy_vis_obs = [
            torch.zeros(
                batch_dim +
                [obs_spec.shape[2], obs_spec.shape[0], obs_spec.shape[1]])
            for obs_spec in observation_specs if len(obs_spec.shape) == 3
        ]

        dummy_var_len_obs = [
            torch.zeros(batch_dim + [obs_spec.shape[0], obs_spec.shape[1]])
            for obs_spec in observation_specs if len(obs_spec.shape) == 2
        ]

        dummy_masks = torch.ones(
            batch_dim +
            [sum(self.policy.behavior_spec.action_spec.discrete_branches)])
        dummy_memories = torch.zeros(batch_dim + seq_len_dim +
                                     [self.policy.export_memory_size])

        self.dummy_input = (
            dummy_vec_obs,
            dummy_vis_obs,
            dummy_var_len_obs,
            dummy_masks,
            dummy_memories,
        )

        self.input_names = [TensorNames.vector_observation_placeholder]
        for i in range(num_vis_obs):
            self.input_names.append(TensorNames.get_visual_observation_name(i))
        for i, obs_spec in enumerate(observation_specs):
            if len(obs_spec.shape) == 2:
                self.input_names.append(TensorNames.get_observation_name(i))
        self.input_names += [
            TensorNames.action_mask_placeholder,
            TensorNames.recurrent_in_placeholder,
        ]

        self.dynamic_axes = {name: {0: "batch"} for name in self.input_names}

        self.output_names = [
            TensorNames.version_number, TensorNames.memory_size
        ]
        if self.policy.behavior_spec.action_spec.continuous_size > 0:
            self.output_names += [
                TensorNames.continuous_action_output,
                TensorNames.continuous_action_output_shape,
            ]
            self.dynamic_axes.update(
                {TensorNames.continuous_action_output: {
                    0: "batch"
                }})
        if self.policy.behavior_spec.action_spec.discrete_size > 0:
            self.output_names += [
                TensorNames.discrete_action_output,
                TensorNames.discrete_action_output_shape,
            ]
            self.dynamic_axes.update(
                {TensorNames.discrete_action_output: {
                    0: "batch"
                }})
        if (self.policy.behavior_spec.action_spec.continuous_size == 0
                or self.policy.behavior_spec.action_spec.discrete_size == 0):
            self.output_names += [
                TensorNames.action_output_deprecated,
                TensorNames.is_continuous_control_deprecated,
                TensorNames.action_output_shape_deprecated,
            ]
            self.dynamic_axes.update(
                {TensorNames.action_output_deprecated: {
                    0: "batch"
                }})

        if self.policy.export_memory_size > 0:
            self.output_names += [TensorNames.recurrent_output]
Code Example #20
 def __init__(self, vec_obs_size: int):
     super().__init__()
     self.register_buffer("normalization_steps", torch.tensor(1))
     self.register_buffer("running_mean", torch.zeros(vec_obs_size))
     self.register_buffer("running_variance", torch.ones(vec_obs_size))
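
Only the constructor of this normalizer is shown; the registered buffers (step count, running mean, running variance) are saved with the model but never trained. The methods below are a hedged sketch of how such buffers are typically updated and applied, not the excerpted implementation:

def update(self, vector_input):
    # Hypothetical online update of the running mean/variance buffers.
    steps_increment = vector_input.size(0)
    total_new_steps = self.normalization_steps + steps_increment
    input_to_old_mean = vector_input - self.running_mean
    new_mean = self.running_mean + input_to_old_mean.sum(0) / total_new_steps
    input_to_new_mean = vector_input - new_mean
    self.running_variance = self.running_variance + (
        input_to_new_mean * input_to_old_mean
    ).sum(0)
    self.running_mean = new_mean
    self.normalization_steps = total_new_steps

def forward(self, vector_input):
    # Normalize with the running statistics; clamping keeps outliers bounded.
    std = torch.sqrt(self.running_variance / self.normalization_steps.float())
    return torch.clamp((vector_input - self.running_mean) / (std + 1e-8), -5, 5)
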
Code Example #21
File: torch_optimizer.py, Project: joomon/ml-agents
    def get_trajectory_value_estimates(
        self,
        batch: AgentBuffer,
        next_obs: List[np.ndarray],
        done: bool,
        agent_id: str = "",
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]:
        """
        Get value estimates and memories for a trajectory, in batch form.
        :param batch: An AgentBuffer that consists of a trajectory.
        :param next_obs: the next observation (after the trajectory). Used for bootstrapping
            if this is not a terminal trajectory.
        :param done: Set true if this is a terminal trajectory.
        :param agent_id: Agent ID of the agent that this trajectory belongs to.
        :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)],
            the final value estimate as a Dict of [name, float], and optionally (if using memories)
            an AgentBufferField of initial critic memories to be used during update.
        """
        n_obs = len(self.policy.behavior_spec.observation_specs)

        if agent_id in self.critic_memory_dict:
            memory = self.critic_memory_dict[agent_id]
        else:
            memory = (
                torch.zeros((1, 1, self.critic.memory_size))
                if self.policy.use_recurrent
                else None
            )

        # Convert to tensors
        current_obs = [
            ModelUtils.list_to_tensor(obs) for obs in ObsUtil.from_buffer(batch, n_obs)
        ]
        next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]

        next_obs = [obs.unsqueeze(0) for obs in next_obs]

        # If we're using LSTM, we want to get all the intermediate memories.
        all_next_memories: Optional[AgentBufferField] = None

        # To prevent memory leak and improve performance, evaluate with no_grad.
        with torch.no_grad():
            if self.policy.use_recurrent:
                (
                    value_estimates,
                    all_next_memories,
                    next_memory,
                ) = self._evaluate_by_sequence(current_obs, memory)
            else:
                value_estimates, next_memory = self.critic.critic_pass(
                    current_obs, memory, sequence_length=batch.num_experiences
                )

        # Store the memory for the next trajectory. This should NOT have a gradient.
        self.critic_memory_dict[agent_id] = next_memory

        next_value_estimate, _ = self.critic.critic_pass(
            next_obs, next_memory, sequence_length=1
        )

        for name, estimate in value_estimates.items():
            value_estimates[name] = ModelUtils.to_numpy(estimate)
            next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])

        if done:
            for k in next_value_estimate:
                if not self.reward_signals[k].ignore_done:
                    next_value_estimate[k] = 0.0
            if agent_id in self.critic_memory_dict:
                self.critic_memory_dict.pop(agent_id)
        return value_estimates, next_value_estimate, all_next_memories
Code Example #22
    def get_trajectory_and_baseline_value_estimates(
        self,
        batch: AgentBuffer,
        next_obs: List[np.ndarray],
        next_groupmate_obs: List[List[np.ndarray]],
        done: bool,
        agent_id: str = "",
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray], Dict[str, float],
               Optional[AgentBufferField], Optional[AgentBufferField], ]:
        """
        Get value estimates, baseline estimates, and memories for a trajectory, in batch form.
        :param batch: An AgentBuffer that consists of a trajectory.
        :param next_obs: the next observation (after the trajectory). Used for bootstrapping
            if this is not a terminal trajectory.
        :param next_groupmate_obs: the next observations from other members of the group.
        :param done: Set true if this is a terminal trajectory.
        :param agent_id: Agent ID of the agent that this trajectory belongs to.
        :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)],
            the baseline estimates as a Dict, the final value estimate as a Dict of [name, float], and
            optionally (if using memories) an AgentBufferField of initial critic and baseline memories to be used
            during update.
        """

        n_obs = len(self.policy.behavior_spec.observation_specs)

        current_obs = ObsUtil.from_buffer(batch, n_obs)
        groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs)

        current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
        groupmate_obs = [[
            ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs
        ] for _groupmate_obs in groupmate_obs]

        groupmate_actions = AgentAction.group_from_buffer(batch)

        next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
        next_obs = [obs.unsqueeze(0) for obs in next_obs]

        next_groupmate_obs = [
            ModelUtils.list_to_tensor_list(_list_obs)
            for _list_obs in next_groupmate_obs
        ]
        # Expand dimensions of next critic obs
        next_groupmate_obs = [[_obs.unsqueeze(0) for _obs in _list_obs]
                              for _list_obs in next_groupmate_obs]

        if agent_id in self.value_memory_dict:
            # The agent_id should always be in both since they are added together
            _init_value_mem = self.value_memory_dict[agent_id]
            _init_baseline_mem = self.baseline_memory_dict[agent_id]
        else:
            _init_value_mem = (torch.zeros((1, 1, self.critic.memory_size))
                               if self.policy.use_recurrent else None)
            _init_baseline_mem = (torch.zeros((1, 1, self.critic.memory_size))
                                  if self.policy.use_recurrent else None)

        all_obs = ([current_obs] + groupmate_obs
                   if groupmate_obs is not None else [current_obs])
        all_next_value_mem: Optional[AgentBufferField] = None
        all_next_baseline_mem: Optional[AgentBufferField] = None
        with torch.no_grad():
            if self.policy.use_recurrent:
                (
                    value_estimates,
                    baseline_estimates,
                    all_next_value_mem,
                    all_next_baseline_mem,
                    next_value_mem,
                    next_baseline_mem,
                ) = self._evaluate_by_sequence_team(
                    current_obs,
                    groupmate_obs,
                    groupmate_actions,
                    _init_value_mem,
                    _init_baseline_mem,
                )
            else:
                value_estimates, next_value_mem = self.critic.critic_pass(
                    all_obs,
                    _init_value_mem,
                    sequence_length=batch.num_experiences)
                groupmate_obs_and_actions = (groupmate_obs, groupmate_actions)
                baseline_estimates, next_baseline_mem = self.critic.baseline(
                    current_obs,
                    groupmate_obs_and_actions,
                    _init_baseline_mem,
                    sequence_length=batch.num_experiences,
                )
        # Store the memory for the next trajectory
        self.value_memory_dict[agent_id] = next_value_mem
        self.baseline_memory_dict[agent_id] = next_baseline_mem

        all_next_obs = ([next_obs] + next_groupmate_obs
                        if next_groupmate_obs is not None else [next_obs])

        next_value_estimates, _ = self.critic.critic_pass(all_next_obs,
                                                          next_value_mem,
                                                          sequence_length=1)

        for name, estimate in baseline_estimates.items():
            baseline_estimates[name] = ModelUtils.to_numpy(estimate)

        for name, estimate in value_estimates.items():
            value_estimates[name] = ModelUtils.to_numpy(estimate)

        # The baseline and V should not be on the same done flag
        for name, estimate in next_value_estimates.items():
            next_value_estimates[name] = ModelUtils.to_numpy(estimate)

        if done:
            for k in next_value_estimates:
                if not self.reward_signals[k].ignore_done:
                    next_value_estimates[k][-1] = 0.0

        return (
            value_estimates,
            baseline_estimates,
            next_value_estimates,
            all_next_value_mem,
            all_next_baseline_mem,
        )