Code example #1
File: test_sample_batch.py  Project: Yard1/ray
    def test_copy(self):
        s = SampleBatch({
            "a": np.array([1, 2, 3, 2, 3, 4]),
            "b": {
                "c": np.array([4, 5, 6, 5, 6, 7])
            },
            "seq_lens": [2, 3, 1],
            "state_in_0": [1.0, 3.0, 4.0],
        })
        s_copy = s.copy(shallow=False)
        s_copy["a"][0] = 100
        s_copy["b"]["c"][0] = 200
        s_copy["seq_lens"][0] = 3
        s_copy["seq_lens"][1] = 2
        s_copy["state_in_0"][0] = 400.0
        self.assertNotEqual(s["a"][0], s_copy["a"][0])
        self.assertNotEqual(s["b"]["c"][0], s_copy["b"]["c"][0])
        self.assertNotEqual(s["seq_lens"][0], s_copy["seq_lens"][0])
        self.assertNotEqual(s["seq_lens"][1], s_copy["seq_lens"][1])
        self.assertNotEqual(s["state_in_0"][0], s_copy["state_in_0"][0])

        s_copy = s.copy(shallow=True)
        s_copy["a"][0] = 100
        s_copy["b"]["c"][0] = 200
        s_copy["seq_lens"][0] = 3
        s_copy["seq_lens"][1] = 2
        s_copy["state_in_0"][0] = 400.0
        self.assertEqual(s["a"][0], s_copy["a"][0])
        self.assertEqual(s["b"]["c"][0], s_copy["b"]["c"][0])
        self.assertEqual(s["seq_lens"][0], s_copy["seq_lens"][0])
        self.assertEqual(s["seq_lens"][1], s_copy["seq_lens"][1])
        self.assertEqual(s["state_in_0"][0], s_copy["state_in_0"][0])
Code example #2
    def _learn_on_batch(self, samples: SampleBatch):

        (
            policies_idx_to_train,
            policies_to_train,
        ) = self._get_policies_idx_to_train_with_current_batch()
        if len(policies_idx_to_train) == 0:
            return self.learner_stats
        logging.debug(f"policies_idx_to_train {policies_idx_to_train}")
        self._init_log_learn_on_batch(policies_idx_to_train)

        for policy_n, algo in zip(policies_idx_to_train, policies_to_train):
            samples_copy = samples.copy()
            samples_copy = self._modify_batch_for_policy(
                policy_n, samples_copy)

            if (len(samples_copy[samples_copy.ACTIONS]) >
                    len(samples[samples.ACTIONS]) // 2):
                self._to_log[f"learn_on_batch_algo{policy_n}"] = len(
                    samples_copy[samples_copy.ACTIONS])
                self.learner_stats["learner_stats"][
                    f"algo{policy_n}"] = algo.learn_on_batch(samples_copy)
            else:
                self.learner_stats["learner_stats"][f"algo{policy_n}"] = {}

        return self.learner_stats
Code example #3
    def from_batch(self, train_batch: SampleBatch,
                   is_training: bool = True) -> (TensorType, List[TensorType]):
        """Convenience function that calls this model with a tensor batch.

        All this does is unpack the tensor batch to call this model with the
        right input dict, state, and seq len arguments.
        """

        input_dict = train_batch.copy()
        input_dict["is_training"] = is_training
        states = []
        i = 0
        while "state_in_{}".format(i) in input_dict:
            states.append(input_dict["state_in_{}".format(i)])
            i += 1
        ret = self.__call__(input_dict, states, input_dict.get("seq_lens"))
        return ret
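A minimal usage sketch for from_batch (the names my_model and train_batch are assumptions for illustration, not part of the snippet above): the method just forwards the batch through the model, collecting any recurrent state columns automatically.

# Hypothetical usage: "my_model" is assumed to be a ModelV2 subclass exposing
# the from_batch() method above, and "train_batch" a SampleBatch that already
# contains an "obs" column (plus optional "state_in_*" and "seq_lens" columns).
logits, state_out = my_model.from_batch(train_batch, is_training=False)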
Code example #4
    def learn_on_batch(self, samples: SampleBatch):
        learner_stats = {"learner_stats": {}}

        # Update LR used in optimizer
        self.optimizer()

        for policy_n, algo in enumerate(self.algorithms):

            samples_copy = samples.copy()
            samples_copy = self._modify_batch_for_policy(
                policy_n, samples_copy)

            if len(samples_copy[samples_copy.ACTIONS]) > 0:
                learner_stats["learner_stats"][
                    f"learner_stats_algo{policy_n}"] = algo.learn_on_batch(
                        samples_copy)
                # self.to_log[f'algo{policy_n}_cur_lr'] = algo.cur_lr
            # For debugging purpose log the true lr (to be compared to algo.cur_lr)
            # for j, opt in enumerate(algo._optimizers):
            #     self.to_log[f"algo_{policy_n}_{j}_lr"] = [p["lr"] for p in opt.param_groups][0]

        return learner_stats
Code example #5
    def test_pg_loss_functions(self):
        """Tests the PG loss function math."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of n time steps.
        train_batch = SampleBatch({
            SampleBatch.OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]]),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, 1.0, 1.0]),
            SampleBatch.DONES:
            np.array([False, False, True]),
            SampleBatch.EPS_ID:
            np.array([1234, 1234, 1234]),
            SampleBatch.AGENT_INDEX:
            np.array([0, 0, 0]),
        })

        for fw, sess in framework_iterator(config, session=True):
            dist_cls = (Categorical if fw != "torch" else TorchCategorical)
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            vars = policy.model.trainable_variables()
            if sess:
                vars = policy.get_session().run(vars)

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
            # [2.9701, 1.99, 1.0]
            train_batch_ = pg.post_process_advantages(policy,
                                                      train_batch.copy())
            if fw == "torch":
                train_batch_ = policy._lazy_tensor_dict(train_batch_)

            # Check Advantage values.
            check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

            # Actual loss results.
            if sess:
                results = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(train_batch_,
                                                           shuffle=False))
            else:
                results = (pg.pg_tf_loss if fw in ["tf2", "tfe"] else
                           pg.pg_torch_loss)(policy,
                                             policy.model,
                                             dist_class=dist_cls,
                                             train_batch=train_batch_)

            # Calculate expected results.
            if fw != "torch":
                expected_logits = fc(fc(train_batch_[SampleBatch.OBS],
                                        vars[0],
                                        vars[1],
                                        framework=fw),
                                     vars[2],
                                     vars[3],
                                     framework=fw)
            else:
                expected_logits = fc(fc(train_batch_[SampleBatch.OBS],
                                        vars[2],
                                        vars[3],
                                        framework=fw),
                                     vars[0],
                                     vars[1],
                                     framework=fw)
            expected_logp = dist_cls(expected_logits, policy.model).logp(
                train_batch_[SampleBatch.ACTIONS])
            adv = train_batch_[Postprocessing.ADVANTAGES]
            if sess:
                expected_logp = sess.run(expected_logp)
            elif fw == "torch":
                expected_logp = expected_logp.detach().cpu().numpy()
                adv = adv.detach().cpu().numpy()
            else:
                expected_logp = expected_logp.numpy()
            expected_loss = -np.mean(expected_logp * adv)
            check(results, expected_loss, decimals=4)
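The advantage values asserted above follow from plain discounted-return arithmetic with gamma = 0.99 and no value baseline. A small standalone check (NumPy only, not part of the test itself) reproduces them:

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 1.0, 1.0])

# Discounted returns computed backwards: R_t = r_t + gamma * R_{t+1}.
returns = np.zeros_like(rewards)
running = 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    returns[t] = running

print(returns)  # [2.9701 1.99   1.    ] -- matches the values checked above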
Code example #6
def compute_gae_for_sample_batch(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:
    """Adds GAE (generalized advantage estimations) to a trajectory.
    The trajectory contains only data from one episode and from one agent.
    - If `config.batch_mode=truncate_episodes` (default), sample_batch may
    contain a truncated (at-the-end) episode, in case the
    `config.rollout_fragment_length` was reached by the sampler.
    - If `config.batch_mode=complete_episodes`, sample_batch will contain
    exactly one episode (no matter how long).
    New columns can be added to sample_batch and existing ones may be altered.
    Args:
        policy (Policy): The Policy used to generate the trajectory
            (`sample_batch`)
        sample_batch (SampleBatch): The SampleBatch to postprocess.
        other_agent_batches (Optional[Dict[AgentID, SampleBatch]]): Optional
            dict of AgentIDs mapping to other agents' trajectory data (from the
            same episode). NOTE: The other agents use the same policy.
        episode (Optional[MultiAgentEpisode]): Optional multi-agent episode
            object in which the agents operated.
    Returns:
        SampleBatch: The postprocessed, modified SampleBatch (or a new one).
    """

    # On the first call, the trajectory view API populates the info column with
    # an np.zeros((n,)) array; in that case its dtype is float32 and we have to
    # ignore it. For regular calls, we extract the rewards from the info dicts
    # into samplebatch_infos_rewards, which then holds the rewards of all
    # agents keyed by agent index.
    samplebatch_infos_rewards = {'0': sample_batch[SampleBatch.INFOS]}
    if not sample_batch[SampleBatch.INFOS].dtype == "float32":
        samplebatch_infos = SampleBatch.concat_samples([
            SampleBatch({k: [v]
                         for k, v in s.items()})
            for s in sample_batch[SampleBatch.INFOS]
        ])
        samplebatch_infos_rewards = SampleBatch.concat_samples([
            SampleBatch({str(k): [v]
                         for k, v in s.items()})
            for s in samplebatch_infos["rewards"]
        ])

    if not isinstance(policy.action_space, gym.spaces.tuple.Tuple):
        raise InvalidActionSpace("Expect tuple action space")

    # Sample batches for each agent.
    batches = []
    for key, action_space in zip(samplebatch_infos_rewards.keys(),
                                 policy.action_space):
        i = int(key)
        sample_batch_agent = sample_batch.copy()
        sample_batch_agent[SampleBatch.REWARDS] = (
            samplebatch_infos_rewards[key])
        if isinstance(action_space, gym.spaces.box.Box):
            assert len(action_space.shape) == 1
            a_w = action_space.shape[0]
        elif isinstance(action_space, gym.spaces.discrete.Discrete):
            a_w = 1
        else:
            raise InvalidActionSpace(
                "Expect gym.spaces.box or gym.spaces.discrete action space")

        sample_batch_agent[SampleBatch.ACTIONS] = sample_batch[
            SampleBatch.ACTIONS][:, a_w * i:a_w * (i + 1)]
        sample_batch_agent[SampleBatch.VF_PREDS] = sample_batch[
            SampleBatch.VF_PREDS][:, i]

        # Trajectory is actually complete -> last r=0.0.
        if sample_batch[SampleBatch.DONES][-1]:
            last_r = 0.0
        # Trajectory has been truncated -> last r=VF estimate of last obs.
        else:
            # Input dict is provided to us automatically via the Model's
            # requirements. It's a single-timestep (last one in trajectory)
            # input_dict.
            # Create an input dict according to the Model's requirements.
            input_dict = policy.model.get_input_dict(sample_batch,
                                                     index="last")
            all_values = policy._value(**input_dict,
                                       seq_lens=input_dict.seq_lens)
            last_r = all_values[i].item()

        # Adds the policy logits, VF preds, and advantages to the batch,
        # using GAE ("generalized advantage estimation") or not.
        batches.append(
            compute_advantages(sample_batch_agent,
                               last_r,
                               policy.config["gamma"],
                               policy.config["lambda"],
                               use_gae=policy.config["use_gae"],
                               use_critic=policy.config.get(
                                   "use_critic", True)))

    # Now take the original sample batch and overwrite the following columns
    # with the per-agent values stacked along the last axis.
    for k in [
            SampleBatch.REWARDS,
            SampleBatch.VF_PREDS,
            Postprocessing.ADVANTAGES,
            Postprocessing.VALUE_TARGETS,
    ]:
        sample_batch[k] = np.stack([b[k] for b in batches], axis=-1)

    return sample_batch
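A hedged sketch of how such a postprocessing function is typically wired into an RLlib policy. The with_updates pattern below is an assumption about the surrounding project (it is the mechanism exposed by template-built policies in older RLlib versions), not something shown in the snippet itself.

# Hypothetical wiring: plug the custom per-agent GAE postprocessor into a PPO
# policy. This assumes an RLlib version whose template-built policies expose
# `.with_updates(...)`; adapt to `build_policy_class` on newer versions.
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy

MultiAgentValuePPOPolicy = PPOTorchPolicy.with_updates(
    name="MultiAgentValuePPOPolicy",
    postprocess_fn=compute_gae_for_sample_batch,
)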
Code example #7
    def train(self, batch: SampleBatch) -> TensorType:
        """Trains self.q_model using FQE loss on given batch.

        Args:
            batch: A SampleBatch of episodes to train on

        Returns:
            A list of losses for each training iteration
        """
        losses = []
        minibatch_size = self.minibatch_size or batch.count
        # Copy batch for shuffling
        batch = batch.copy(shallow=True)
        for _ in range(self.n_iters):
            minibatch_losses = []
            batch.shuffle()
            for idx in range(0, batch.count, minibatch_size):
                minibatch = batch[idx : idx + minibatch_size]
                obs = torch.tensor(minibatch[SampleBatch.OBS], device=self.device)
                actions = torch.tensor(
                    minibatch[SampleBatch.ACTIONS],
                    device=self.device,
                    dtype=int,
                )
                rewards = torch.tensor(
                    minibatch[SampleBatch.REWARDS], device=self.device
                )
                next_obs = torch.tensor(
                    minibatch[SampleBatch.NEXT_OBS], device=self.device
                )
                dones = torch.tensor(
                    minibatch[SampleBatch.DONES], device=self.device, dtype=float
                )

                # Compute Q-values for current obs
                q_values, _ = self.q_model({"obs": obs}, [], None)
                q_acts = torch.gather(q_values, -1, actions.unsqueeze(-1)).squeeze(-1)

                next_action_probs = self._compute_action_probs(next_obs)

                # Compute Q-values for next obs
                with torch.no_grad():
                    next_q_values, _ = self.target_q_model({"obs": next_obs}, [], None)

                # Compute estimated state value next_v = E_{a ~ pi(s)} [Q(next_obs,a)]
                next_v = torch.sum(next_q_values * next_action_probs, axis=-1)
                targets = rewards + (1 - dones) * self.gamma * next_v
                loss = (targets - q_acts) ** 2
                loss = torch.mean(loss)
                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad.clip_grad_norm_(
                    self.q_model.variables(), self.clip_grad_norm
                )
                self.optimizer.step()
                minibatch_losses.append(loss.item())
            iter_loss = sum(minibatch_losses) / len(minibatch_losses)
            losses.append(iter_loss)
            if iter_loss < self.delta:
                break
            self.update_target()
        return losses
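A short usage sketch, assuming an already-constructed FQE-style estimator instance (here called fqe_model, a hypothetical name) that exposes the train() method above, plus an offline SampleBatch of episodes:

# Hypothetical call site: "fqe_model" is assumed to be an instance of the
# estimator class this method belongs to, and "batch" a SampleBatch with
# OBS/ACTIONS/REWARDS/NEXT_OBS/DONES columns.
losses = fqe_model.train(batch)
print(f"FQE iterations run: {len(losses)}, final TD loss: {losses[-1]:.5f}")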
Code example #8
def ppo_loss(policy: Policy, model: ModelV2,
             dist_class: Type[TorchDistributionWrapper],
             train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]:
    """ TODO: Write documentation.
    """
    # Compute original ppo loss
    total_loss = ppo_surrogate_loss(policy, model, dist_class, train_batch)

    # Shallow copy the input batch.
    # Be careful to access fields through the original batch, so that accessed
    # keys are properly tracked; this tracking is used to discard unused
    # components of the policy's view requirements.
    train_batch_copy = train_batch.copy(shallow=True)

    # Extract the mean of the predicted action from the logits.
    # There is no need to perform another model forward pass since the original
    # PPO loss already does it, so just grab the last output.
    action_logits = model._last_output
    if issubclass(dist_class, TorchDiagGaussian):
        action_mean_true, _ = torch.chunk(action_logits, 2, dim=1)
    else:
        action_dist = dist_class(action_logits, model)
        action_mean_true = action_dist.deterministic_sample()

    if policy.config["caps_temporal_reg"] > 0.0:
        # Compute the mean action corresponding to the previous observation
        observation_prev = train_batch["_prev_obs"]
        train_batch_copy["obs"] = observation_prev
        action_logits_prev, _ = model(train_batch_copy)
        if issubclass(dist_class, TorchDiagGaussian):
            action_mean_prev, _ = torch.chunk(action_logits_prev, 2, dim=1)
        else:
            action_dist_prev = dist_class(action_logits_prev, model)
            action_mean_prev = action_dist_prev.deterministic_sample()

        # Minimize the difference between the successive action means
        policy._mean_temporal_caps_loss = torch.mean(
            (action_mean_prev - action_mean_true)**2)

        # Add temporal smoothness loss to total loss
        total_loss += policy.config["caps_temporal_reg"] * \
            policy._mean_temporal_caps_loss

    if policy.config["caps_spatial_reg"] > 0.0 or \
            policy.config["symmetric_policy_reg"] > 0.0:
        # Generate a noisy observation based on the specified sensitivity
        offset = 0
        observation_true = train_batch["obs"]
        observation_noisy = observation_true.clone()
        batch_dim = observation_true.shape[:-1]
        observation_space = policy.observation_space.original_space
        for scale in observation_space.sensitivity.values():
            scale = torch.from_numpy(scale.copy()).to(
                dtype=torch.float32, device=observation_true.device)
            unit_noise = torch.randn((*batch_dim, len(scale)),
                                     device=observation_true.device)
            slice_idx = slice(offset, offset + len(scale))
            observation_noisy[..., slice_idx].addcmul_(scale, unit_noise)
            offset += len(scale)

        # Compute the mean action corresponding to the noisy observation
        train_batch_copy["obs"] = observation_noisy
        action_logits_noisy, _ = model(train_batch_copy)
        if issubclass(dist_class, TorchDiagGaussian):
            action_mean_noisy, _ = torch.chunk(action_logits_noisy, 2, dim=1)
        else:
            action_dist_noisy = dist_class(action_logits_noisy, model)
            action_mean_noisy = action_dist_noisy.deterministic_sample()

    if policy.config["caps_spatial_reg"] > 0.0:
        # Minimize the difference between the original action mean and the
        # one corresponding to the noisy observation.
        policy._mean_spatial_caps_loss = torch.mean(
            (action_mean_noisy - action_mean_true)**2)

        # Add spatial smoothness loss to total loss
        total_loss += policy.config["caps_spatial_reg"] * \
            policy._mean_spatial_caps_loss

    if policy.config["caps_global_reg"] > 0.0:
        # Minimize the magnitude of action mean
        policy._mean_global_caps_loss = torch.mean(action_mean_true**2)

        # Add global smoothness loss to total loss
        total_loss += policy.config["caps_global_reg"] * \
            policy._mean_global_caps_loss

    if policy.config["symmetric_policy_reg"] > 0.0:
        # Compute the mirrored observation
        offset = 0
        observation_mirror = torch.empty_like(observation_true)
        observation_space = policy.observation_space.original_space
        for mirror_mat in observation_space.mirror_mat.values():
            mirror_mat = torch.from_numpy(mirror_mat.T.copy()).to(
                dtype=torch.float32, device=observation_true.device)
            slice_idx = slice(offset, offset + len(mirror_mat))
            torch.mm(observation_true[..., slice_idx],
                     mirror_mat,
                     out=observation_mirror[..., slice_idx])
            offset += len(mirror_mat)

        # Compute the mean action corresponding to the mirrored observation
        train_batch_copy["obs"] = observation_mirror
        action_logits_mirror, _ = model(train_batch_copy)
        if issubclass(dist_class, TorchDiagGaussian):
            action_mean_mirror, _ = torch.chunk(action_logits_mirror, 2, dim=1)
        else:
            action_dist_mirror = dist_class(action_logits_mirror, model)
            action_mean_mirror = action_dist_mirror.deterministic_sample()
        action_mirror_mat = policy.action_space.mirror_mat
        action_mirror_mat = torch.from_numpy(action_mirror_mat.T.copy()).to(
            dtype=torch.float32, device=observation_true.device)
        action_mean_mirror = action_mean_mirror @ action_mirror_mat

        # Minimize the asymmetry of the policy output
        policy._mean_symmetric_policy_loss = torch.mean(
            (action_mean_mirror - action_mean_true)**2)

        # Add policy symmetry loss to total loss
        total_loss += policy.config["symmetric_policy_reg"] * \
            policy._mean_symmetric_policy_loss

    return total_loss
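For reference, a hypothetical config fragment showing the keys that ppo_loss reads; the numeric values are purely illustrative and not taken from the source.

# Illustrative only: these are the config keys consulted by ppo_loss() above.
caps_ppo_config_overrides = {
    "caps_temporal_reg": 0.01,     # weight of the temporal smoothness term
    "caps_spatial_reg": 0.01,      # weight of the spatial smoothness term
    "caps_global_reg": 0.001,      # weight of the global action-magnitude term
    "symmetric_policy_reg": 0.1,   # weight of the policy symmetry term
}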