def _train_once(self, trainer, epoch):
    """Obtain samples and train for one epoch.

    Args:
        trainer (Trainer): Experiment trainer, which may be used to
            obtain samples.
        epoch (int): The current epoch.

    Returns:
        List[float]: Losses.

    """
    batch = self._obtain_samples(trainer, epoch)
    # Shuffle the batch, then split it into roughly equal minibatches.
    indices = np.random.permutation(len(batch.actions))
    minibatches = np.array_split(indices, self._minibatches_per_epoch)
    losses = []
    for minibatch in minibatches:
        observations = as_torch(batch.observations[minibatch])
        actions = as_torch(batch.actions[minibatch])
        self._optimizer.zero_grad()
        loss = self._compute_loss(observations, actions)
        loss.backward()
        losses.append(loss.item())
        self._optimizer.step()
    return losses
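# A standalone sketch (not part of this repo) of the shuffle-then-split
# minibatch pattern used in `_train_once` above. The array size and split
# count are illustrative assumptions.
import numpy as np

data = np.arange(10)          # stand-in for batch.actions
minibatches_per_epoch = 3

indices = np.random.permutation(len(data))
# Unlike np.split, np.array_split tolerates uneven splits, so the last
# minibatches may be one element smaller.
for minibatch in np.array_split(indices, minibatches_per_epoch):
    print(data[minibatch])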
def _optimize_qf(self, timesteps):
    """Optimize the Q-function for one gradient step.

    Args:
        timesteps (TimeStepBatch): Processed batch data.

    Returns:
        torch.Tensor: Loss of the Q-values predicted by the Q-network.
        torch.Tensor: y targets used to fit the Q-network.
        torch.Tensor: Q-values selected for the actions taken.

    """
    observations = as_torch(timesteps.observations)
    rewards = as_torch(timesteps.rewards).reshape(-1, 1)
    rewards *= self._reward_scale
    actions = as_torch(timesteps.actions)
    next_observations = as_torch(timesteps.next_observations)
    terminals = as_torch(timesteps.terminals).reshape(-1, 1)

    next_inputs = next_observations
    inputs = observations
    with torch.no_grad():
        if self._double_q:
            # Use the online qf to pick the optimal actions.
            selected_actions = torch.argmax(self._qf(next_inputs), dim=1)
            # Use the target qf to get Q-values for those actions.
            selected_actions = selected_actions.long().unsqueeze(1)
            best_qvals = torch.gather(self._target_qf(next_inputs),
                                      dim=1,
                                      index=selected_actions)
        else:
            target_qvals = self._target_qf(next_inputs)
            best_qvals, _ = torch.max(target_qvals, 1)
            best_qvals = best_qvals.unsqueeze(1)

    rewards_clipped = rewards
    if self._clip_reward is not None:
        rewards_clipped = torch.clamp(rewards, -1 * self._clip_reward,
                                      self._clip_reward)
    y_target = (rewards_clipped +
                (1.0 - terminals) * self._discount * best_qvals)
    y_target = y_target.squeeze(1)

    # optimize qf
    qvals = self._qf(inputs)
    # Actions are one-hot, so this selects the Q-value of the action taken.
    selected_qs = torch.sum(qvals * actions, dim=1)
    qval_loss = F.smooth_l1_loss(selected_qs, y_target)

    self._qf_optimizer.zero_grad()
    qval_loss.backward()

    # Optionally clip the gradients of the Q-network being optimized.
    if self._clip_grad is not None:
        torch.nn.utils.clip_grad_norm_(self._qf.parameters(),
                                       self._clip_grad)
    self._qf_optimizer.step()

    return (qval_loss.detach(), y_target, selected_qs.detach())
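# A self-contained toy illustration of the double-Q target selection done in
# `_optimize_qf` above: the online network picks the action, the target
# network scores it. The tensors and discount are made-up values, not from
# this repo.
import torch

q_online = torch.tensor([[1.0, 3.0], [2.0, 0.5]])   # online qf(next_obs)
q_target = torch.tensor([[0.9, 2.5], [1.8, 0.4]])   # target qf(next_obs)
rewards = torch.tensor([[1.0], [0.0]])
terminals = torch.tensor([[0.0], [1.0]])
discount = 0.99

best_actions = torch.argmax(q_online, dim=1).unsqueeze(1)
best_qvals = torch.gather(q_target, dim=1, index=best_actions)
y_target = rewards + (1.0 - terminals) * discount * best_qvals
print(y_target.squeeze(1))  # tensor([3.4750, 0.0000])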
def get_action(self, observation):
    r"""Get a single action given an observation.

    Args:
        observation (np.ndarray): Observation from the environment.
            Shape is :math:`env_spec.observation_space`.

    Returns:
        tuple:
            * np.ndarray: Predicted action. Shape is
                :math:`env_spec.action_space`.
            * dict:
                * np.ndarray[float]: Mean of the distribution.
                * np.ndarray[float]: Log of the standard deviation of
                    the distribution.

    """
    # Flatten unstructured or multi-dimensional observations into a
    # single flat vector before batching.
    if not isinstance(observation, np.ndarray) and not isinstance(
            observation, torch.Tensor):
        observation = self._env_spec.observation_space.flatten(observation)
    elif isinstance(observation,
                    np.ndarray) and len(observation.shape) > 1:
        observation = self._env_spec.observation_space.flatten(observation)
    elif isinstance(observation,
                    torch.Tensor) and len(observation.shape) > 1:
        observation = torch.flatten(observation)
    with torch.no_grad():
        if not isinstance(observation, torch.Tensor):
            observation = as_torch(observation)
        # Add a batch dimension, delegate to the batched API, then strip
        # the batch dimension back off.
        observation = observation.unsqueeze(0)
        action, agent_infos = self.get_actions(observation)
        return action[0], {k: v[0] for k, v in agent_infos.items()}
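# A minimal sketch of the batch-of-one pattern `get_action` relies on: a
# single observation is promoted to a batch, run through the batched API,
# then the batch dimension is stripped again. Shapes are illustrative.
import torch

obs = torch.zeros(4)               # a single flat observation
batch = obs.unsqueeze(0)           # shape (1, 4): batch of one
actions = torch.zeros(1, 2)        # stand-in for get_actions(batch) output
action = actions[0]                # strip the batch dimension back off
print(batch.shape, action.shape)   # torch.Size([1, 4]) torch.Size([2])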
def test_double_dqn_loss(setup):
    algo, env, buff, _, batch_size = setup

    algo._double_q = True

    trainer = Trainer(snapshot_config)
    trainer.setup(algo, env)

    paths = trainer.obtain_episodes(0, batch_size=batch_size)
    buff.add_episode_batch(paths)
    timesteps = buff.sample_timesteps(algo._buffer_batch_size)
    timesteps_copy = copy.deepcopy(timesteps)

    observations = as_torch(timesteps.observations)
    rewards = as_torch(timesteps.rewards).reshape(-1, 1)
    actions = as_torch(timesteps.actions)
    next_observations = as_torch(timesteps.next_observations)
    terminals = as_torch(timesteps.terminals).reshape(-1, 1)

    next_inputs = next_observations
    inputs = observations
    with torch.no_grad():
        # Double-Q loss: the online qf picks the actions...
        selected_actions = torch.argmax(algo._qf(next_inputs), dim=1)
        # ...and the target qf provides the Q-values for those actions.
        selected_actions = selected_actions.long().unsqueeze(1)
        best_qvals = torch.gather(algo._target_qf(next_inputs),
                                  dim=1,
                                  index=selected_actions)

    rewards_clipped = rewards
    y_target = (rewards_clipped +
                (1.0 - terminals) * algo._discount * best_qvals)
    y_target = y_target.squeeze(1)

    # Recompute the qf loss and check it matches the algorithm's.
    qvals = algo._qf(inputs)
    selected_qs = torch.sum(qvals * actions, dim=1)
    qval_loss = F.smooth_l1_loss(selected_qs, y_target)

    algo_loss, algo_targets, algo_selected_qs = algo._optimize_qf(
        timesteps_copy)
    env.close()

    assert (qval_loss.detach() == algo_loss).all()
    assert (y_target == algo_targets).all()
    assert (selected_qs == algo_selected_qs).all()
def get_actions(self, observations):
    """Get actions given observations.

    Args:
        observations (np.ndarray): Batch of observations, which should
            have shape :math:`(N, O)`.

    Returns:
        np.ndarray: Predicted actions. Has shape :math:`(N, A)`.
        dict: Empty, since this policy does not produce a distribution.

    """
    with torch.no_grad():
        return self(as_torch(observations)).cpu().numpy(), dict()
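# A minimal usage sketch with a hypothetical MLP stand-in (not a class from
# this repo) showing the batched call contract above: (N, O) observations
# in, (N, A) numpy actions plus an empty info dict out.
import numpy as np
import torch
from torch import nn

class _ToyPolicy(nn.Module):
    def __init__(self, obs_dim=4, action_dim=2):
        super().__init__()
        self._net = nn.Linear(obs_dim, action_dim)

    def forward(self, observations):
        return self._net(observations)

    def get_actions(self, observations):
        with torch.no_grad():
            obs = torch.as_tensor(observations, dtype=torch.float32)
            return self(obs).cpu().numpy(), dict()

policy = _ToyPolicy()
actions, info = policy.get_actions(np.zeros((3, 4), dtype=np.float32))
print(actions.shape, info)  # (3, 2) {}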
def get_actions(self, observations):
    r"""Get actions given observations.

    Args:
        observations (np.ndarray): Observations from the environment.
            Shape is :math:`batch_dim \bullet env_spec.observation_space`.

    Returns:
        tuple:
            * np.ndarray: Predicted actions.
                :math:`batch_dim \bullet env_spec.action_space`.
            * dict:
                * np.ndarray[float]: Mean of the distribution.
                * np.ndarray[float]: Log of the standard deviation of
                    the distribution.

    """
    if not isinstance(observations[0], np.ndarray) and not isinstance(
            observations[0], torch.Tensor):
        observations = self._env_spec.observation_space.flatten_n(
            observations)

    # Users frequently pass lists of torch tensors or lists of numpy
    # arrays; stack those into a single batch here.
    if isinstance(observations, list):
        if isinstance(observations[0], np.ndarray):
            observations = np.stack(observations)
        elif isinstance(observations[0], torch.Tensor):
            observations = torch.stack(observations)

    if isinstance(observations[0],
                  np.ndarray) and len(observations[0].shape) > 1:
        observations = self._env_spec.observation_space.flatten_n(
            observations)
    elif isinstance(observations[0],
                    torch.Tensor) and len(observations[0].shape) > 1:
        observations = torch.flatten(observations, start_dim=1)
    with torch.no_grad():
        if not isinstance(observations, torch.Tensor):
            observations = as_torch(observations)
        dist, info = self.forward(observations)
        return dist.sample().cpu().numpy(), {
            k: v.detach().cpu().numpy()
            for (k, v) in info.items()
        }
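# A self-contained sketch of the sample-and-detach pattern at the end of
# `get_actions` above: draw actions from the policy distribution and move
# the distribution parameters back to numpy. The Normal parameters are
# made-up values, not from this repo.
import torch

dist = torch.distributions.Normal(loc=torch.zeros(3, 2),
                                  scale=torch.ones(3, 2))
info = {'mean': dist.mean, 'log_std': dist.stddev.log()}

actions = dist.sample().cpu().numpy()
info_np = {k: v.detach().cpu().numpy() for k, v in info.items()}
print(actions.shape, sorted(info_np))  # (3, 2) ['log_std', 'mean']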