Code Example #1
import math

import torch

# NOTE: DOUBLE, device, estimate_advantages, ppo_step, the actor/critic
# networks, their optimizers (opt_p, opt_v) and the hyper-parameters
# (gamma, tau, mini_batch_size, ppo_epochs, epsilon) are expected to be
# defined elsewhere in the module.


def train(memory):
    # draw every stored transition and move it to the training device
    batch = memory.sample()
    batch_states = DOUBLE(batch.state).to(device)
    batch_actions = DOUBLE(batch.action).to(device)
    batch_log_probs = DOUBLE(batch.log_prob).to(device)
    batch_masks = DOUBLE(batch.mask).to(device)
    batch_rewards = DOUBLE(batch.reward).to(device)
    batch_size = batch_states.shape[0]

    # value estimates are only used as targets, so no gradient is tracked
    with torch.no_grad():
        batch_values = critic(batch_states)

    batch_advantages, batch_returns = estimate_advantages(
        batch_rewards, batch_masks, batch_values, gamma, tau)

    # mini-batch PPO update
    mini_batch_num = int(math.ceil(batch_size / mini_batch_size))
    for _ in range(ppo_epochs):
        # shuffle the batch once per epoch
        idx = torch.randperm(batch_size)

        for i in range(mini_batch_num):
            mini_batch_idx = idx[i * mini_batch_size:
                                 min((i + 1) * mini_batch_size, batch_size)]

            mini_batch_states = batch_states[mini_batch_idx]
            mini_batch_actions = batch_actions[mini_batch_idx]
            mini_batch_log_probs = batch_log_probs[mini_batch_idx]
            mini_batch_returns = batch_returns[mini_batch_idx]
            mini_batch_advantages = batch_advantages[mini_batch_idx]

            ppo_step(actor, critic, opt_p, opt_v, 1,
                     mini_batch_states, mini_batch_actions, mini_batch_returns,
                     mini_batch_advantages, mini_batch_log_probs, epsilon, 1e-3)
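
This example and the two below rely on an estimate_advantages helper that is not shown in the snippets. The following is a minimal sketch of what such a helper typically computes, namely Generalized Advantage Estimation (GAE), assuming the same (rewards, masks, values, gamma, tau) signature, 1-D tensors, and advantage normalization; the project's actual implementation may differ in these details.

import torch


def estimate_advantages(rewards, masks, values, gamma, tau):
    """Generalized Advantage Estimation over a flat batch of transitions.

    masks[i] == 0 marks the last step of an episode, so bootstrapping is
    cut at episode boundaries. All arguments are assumed here to be 1-D
    tensors of equal length.
    """
    advantages = torch.zeros_like(rewards)
    returns = torch.zeros_like(rewards)

    prev_value = 0.0
    prev_advantage = 0.0
    prev_return = 0.0
    for i in reversed(range(rewards.size(0))):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[i] + gamma * prev_value * masks[i] - values[i]
        # GAE recursion: A_t = delta_t + gamma * tau * A_{t+1}
        advantages[i] = delta + gamma * tau * prev_advantage * masks[i]
        # discounted return used as the critic's regression target
        returns[i] = rewards[i] + gamma * prev_return * masks[i]

        prev_value = values[i]
        prev_advantage = advantages[i]
        prev_return = returns[i]

    # normalizing advantages usually stabilizes the PPO update
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages, returns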
Code Example #2
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalars(
            "PPO_mini_batch", {
                "total reward": log['total_reward'],
                "average reward": log['avg_reward'],
                "min reward": log['min_episode_reward'],
                "max reward": log['max_episode_reward'],
                "num steps": log['num_steps']
            }, i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_log_prob = FLOAT(batch.log_prob).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_mask = FLOAT(batch.mask).to(device)
        batch_size = batch_state.shape[0]

        with torch.no_grad():
            batch_values = self.value_net(batch_state)

        batch_advantages, batch_returns = estimate_advantages(
            batch_reward, batch_mask, batch_values, self.gamma, self.tau)
        v_loss, p_loss = torch.empty(1), torch.empty(1)

        mini_batch_num = int(math.ceil(batch_size / self.ppo_mini_batch_size))

        # update with mini-batch
        for _ in range(self.ppo_epochs):
            index = torch.randperm(batch_size)

            for i in range(mini_batch_num):
                ind = index[i * self.ppo_mini_batch_size:
                            min((i + 1) * self.ppo_mini_batch_size, batch_size)]
                state = batch_state[ind]
                action = batch_action[ind]
                returns = batch_returns[ind]
                advantages = batch_advantages[ind]
                old_log_pis = batch_log_prob[ind]

                v_loss, p_loss = ppo_step(self.policy_net, self.value_net,
                                          self.optimizer_p, self.optimizer_v,
                                          1, state, action, returns,
                                          advantages, old_log_pis,
                                          self.clip_epsilon, 1e-3)

        return v_loss, p_loss
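
Both learn() implementations delegate the actual gradient update to ppo_step, which is defined elsewhere in the project. Below is a rough sketch of the clipped-surrogate update such a function is presumed to perform; the parameter names, the reading of the positional 1 as a number of critic optimization passes, the reading of the trailing 1e-3 as an L2 regularization weight, and the policy_net.get_log_prob(states, actions) method are all assumptions of this sketch rather than the project's verbatim code.

import torch
import torch.nn as nn


def ppo_step(policy_net, value_net, optimizer_policy, optimizer_value,
             value_iters, states, actions, returns, advantages,
             old_log_probs, clip_epsilon, l2_reg):
    # --- critic update: regress V(s) onto the empirical returns ---
    value_loss = torch.zeros(1)
    for _ in range(value_iters):
        values_pred = value_net(states)
        value_loss = nn.functional.mse_loss(values_pred, returns)
        # L2 penalty on the critic weights (l2_reg is assumed to be the
        # trailing 1e-3 argument in the calls above)
        for param in value_net.parameters():
            value_loss = value_loss + param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

    # --- actor update: PPO clipped surrogate objective ---
    # the policy network is assumed to expose get_log_prob(states, actions)
    log_probs = policy_net.get_log_prob(states, actions)
    ratio = torch.exp(log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()

    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()

    return value_loss, policy_loss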
Code Example #3
File: ppo.py  Project: EdenGabriel/DeepRL_Algorithms
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalars(
            "ppo", {
                "total reward": log['total_reward'],
                "average reward": log['avg_reward'],
                "min reward": log['min_episode_reward'],
                "max reward": log['max_episode_reward'],
                "num steps": log['num_steps']
            }, i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = DOUBLE(batch.state).to(device)
        batch_action = DOUBLE(batch.action).to(device)
        batch_reward = DOUBLE(batch.reward).to(device)
        batch_mask = DOUBLE(batch.mask).to(device)
        batch_log_prob = DOUBLE(batch.log_prob).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)
        v_loss, p_loss = torch.empty(1), torch.empty(1)
        for _ in range(self.ppo_epochs):
            v_loss, p_loss = ppo_step(self.policy_net, self.value_net,
                                      self.optimizer_p, self.optimizer_v, 1,
                                      batch_state, batch_action, batch_return,
                                      batch_advantage, batch_log_prob,
                                      self.clip_epsilon, 1e-3)

        # refresh the old-policy snapshot with the freshly updated weights
        self.policy_net_old.load_state_dict(self.policy_net.state_dict())
        return v_loss, p_loss
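
For context, a minimal driver loop around either learn() method could look like the sketch below. The PPO agent class name, its constructor arguments, the iteration count, and the log directory are hypothetical; only the learn(writer, i_iter) call and the returned (v_loss, p_loss) pair come from the examples above.

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/ppo")  # hypothetical log directory
agent = PPO(env_id="Hopper-v2")             # hypothetical agent class and arguments

for i_iter in range(1, 501):                # iteration count chosen arbitrarily
    v_loss, p_loss = agent.learn(writer, i_iter)
    # log the losses returned by learn() next to the reward scalars it records
    writer.add_scalar("loss/critic", v_loss.item(), i_iter)
    writer.add_scalar("loss/policy", p_loss.item(), i_iter)

writer.close()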