Example #1
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalar("total reward", log['total_reward'], i_iter)
        writer.add_scalar("average reward", log['avg_reward'], i_iter)
        writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
        writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
        writer.add_scalar("num steps", log['num_steps'], i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        with torch.no_grad():
            batch_value = self.ac_net.get_value(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)

        alg_step_stats = a2c_step(self.ac_net, self.optimizer_ac, batch_state,
                                  batch_action, batch_return, batch_advantage,
                                  self.value_net_coeff, self.entropy_coeff)

        return alg_step_stats
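
The helper estimate_advantages(rewards, masks, values, gamma, tau) used throughout these examples is not shown on this page. Judging by its arguments and the (advantage, return) pair it yields, it is a Generalized Advantage Estimation (GAE) routine; the following is only a minimal sketch under that assumption, for [N, 1] tensors with mask = 0 at episode boundaries.

import torch


def estimate_advantages(rewards, masks, values, gamma, tau):
    """Sketch of GAE(lambda): tau plays the role of lambda, masks zero out episode ends."""
    deltas = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)

    prev_value, prev_advantage = 0.0, 0.0
    for t in reversed(range(rewards.size(0))):
        # one-step TD residual: r_t + gamma * V(s_{t+1}) - V(s_t)
        deltas[t] = rewards[t] + gamma * prev_value * masks[t] - values[t]
        # recursive GAE: delta_t + gamma * tau * A_{t+1}
        advantages[t] = deltas[t] + gamma * tau * prev_advantage * masks[t]
        prev_value = values[t, 0]
        prev_advantage = advantages[t, 0]

    returns = values + advantages  # regression targets for the value network
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages, returns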
Example #2
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalar("total reward", log['total_reward'], i_iter)
        writer.add_scalar("average reward", log['avg_reward'], i_iter)
        writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
        writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
        writer.add_scalar("num steps", log['num_steps'], i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_mask = FLOAT(batch.mask).to(device)
        batch_log_prob = FLOAT(batch.log_prob).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)

        # update by TRPO
        trpo_step(self.policy_net, self.value_net, batch_state, batch_action,
                  batch_return, batch_advantage, batch_log_prob, self.max_kl,
                  self.damping, 1e-3, None)
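
FLOAT, DOUBLE, and device also come from the surrounding project rather than from these snippets. A plausible stand-in, assuming they are nothing more than dtype-bound tensor constructors plus a global device handle:

import torch

# Assumed definitions; the real project may name or place these differently.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
FLOAT = torch.FloatTensor    # FLOAT(batch.state) -> float32 CPU tensor, then .to(device)
DOUBLE = torch.DoubleTensor  # DOUBLE(batch.state) -> float64 CPU tensor, then .to(device)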
Example #3
def train(memory):
    batch = memory.sample()
    batch_states = DOUBLE(batch.state).to(device)
    batch_actions = DOUBLE(batch.action).to(device)
    batch_log_probs = DOUBLE(batch.log_prob).to(device)
    batch_masks = DOUBLE(batch.mask).to(device)
    batch_rewards = DOUBLE(batch.reward).to(device)
    batch_size = batch_states.shape[0]

    with torch.no_grad():
        batch_values = critic(batch_states)

    batch_advantages, batch_returns = estimate_advantages(batch_rewards, batch_masks, batch_values, gamma,
                                                          tau)

    # mini-batch ppo update
    mini_batch_num = int(math.ceil(batch_size / mini_batch_size))
    for _ in range(ppo_epochs):
        idx = torch.randperm(batch_size)

        for i in range(mini_batch_num):
            mini_batch_idx = idx[i * mini_batch_size: min((i + 1) * mini_batch_size, batch_size)]

            mini_batch_states, mini_batch_actions, mini_batch_log_probs, mini_batch_returns, mini_batch_advantages = \
                batch_states[mini_batch_idx], batch_actions[mini_batch_idx], batch_log_probs[mini_batch_idx], \
                batch_returns[mini_batch_idx], batch_advantages[mini_batch_idx]

            ppo_step(actor, critic, opt_p, opt_v, 1, mini_batch_states, mini_batch_actions,
                     mini_batch_returns, mini_batch_advantages, mini_batch_log_probs, epsilon, 1e-3)
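
ppo_step itself is defined elsewhere. The GAIL example further down calls it with keyword arguments (policy_net, value_net, optimizer_policy, optimizer_value, optim_value_iternum, states, actions, returns, advantages, old_log_probs, clip_epsilon, l2_reg), so a minimal clipped-surrogate sketch consistent with that signature could look as follows. The get_log_prob helper on the policy and the exact return values are assumptions; the examples on this page variously unpack two losses, three losses, or a stats dict.

import torch


def ppo_step(policy_net, value_net, optimizer_policy, optimizer_value,
             optim_value_iternum, states, actions, returns, advantages,
             old_log_probs, clip_epsilon, l2_reg):
    """Sketch of one PPO update: value regression plus the clipped surrogate objective."""
    # critic update: fit V(s) to the empirical returns, with an L2 weight penalty
    for _ in range(optim_value_iternum):
        value_loss = (value_net(states) - returns).pow(2).mean()
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

    # actor update: clipped importance-sampling ratio against the old log-probs
    log_probs = policy_net.get_log_prob(states, actions)  # assumed policy helper
    ratio = torch.exp(log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()

    return value_loss, policy_loss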
Example #4
    def train(self, memory):
        batch = memory.sample()
        batch_states = FLOAT(batch.state).to(device)
        batch_actions = FLOAT(batch.action).to(device)
        batch_log_probs = FLOAT(batch.log_prob).to(device)
        batch_masks = FLOAT(batch.mask).to(device)
        batch_rewards = FLOAT(batch.reward).to(device)
        batch_size = batch_states.shape[0]

        with torch.no_grad():
            batch_values = self.value(batch_states)

        batch_advantages, batch_returns = estimate_advantages(
            batch_rewards, batch_masks, batch_values, self.gamma, self.tau)

        # mini-batch ppo update
        mini_batch_num = int(math.ceil(batch_size / self.mini_batch_size))
        for _ in range(self.ppo_epochs):
            idx = torch.randperm(batch_size)

            for i in range(mini_batch_num):
                mini_batch_idx = idx[i * self.mini_batch_size:min(
                    (i + 1) * self.mini_batch_size, batch_size)]

                mini_batch_states, mini_batch_actions, mini_batch_log_probs, mini_batch_returns, mini_batch_advantages = \
                    batch_states[mini_batch_idx], batch_actions[mini_batch_idx], batch_log_probs[mini_batch_idx], \
                    batch_returns[mini_batch_idx], batch_advantages[mini_batch_idx]

                self.ppo_step(mini_batch_states, mini_batch_actions,
                              mini_batch_returns, mini_batch_advantages,
                              mini_batch_log_probs, self.epsilon, 1e-3)
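
The memory object passed into these methods behaves like a rollout buffer whose parameterless sample() returns every stored transition column-wise; the field list appears as a comment in Example #9. A sketch of that interface, with everything beyond the field names assumed:

import random
from collections import namedtuple

# Field names taken from the comment in Example #9.
Transition = namedtuple(
    'Transition',
    ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob'))


class Memory:
    """Sketch of the rollout buffer: sample() with no argument returns everything."""

    def __init__(self):
        self.memory = []

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size=None):
        # Column-wise view: batch.state is a tuple of states, batch.action a tuple
        # of actions, and so on, ready to be wrapped by FLOAT(...)/DOUBLE(...).
        if batch_size is None:
            return Transition(*zip(*self.memory))
        return Transition(*zip(*random.sample(self.memory, batch_size)))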
Example #5
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalars(
            "PPO_mini_batch", {
                "total reward": log['total_reward'],
                "average reward": log['avg_reward'],
                "min reward": log['min_episode_reward'],
                "max reward": log['max_episode_reward'],
                "num steps": log['num_steps']
            }, i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_log_prob = FLOAT(batch.log_prob).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_mask = FLOAT(batch.mask).to(device)
        batch_size = batch_state.shape[0]

        with torch.no_grad():
            batch_values = self.value_net(batch_state)

        batch_advantages, batch_returns = estimate_advantages(
            batch_reward, batch_mask, batch_values, self.gamma, self.tau)
        v_loss, p_loss = torch.empty(1), torch.empty(1)

        mini_batch_num = int(math.ceil(batch_size / self.ppo_mini_batch_size))

        # update with mini-batch
        for _ in range(self.ppo_epochs):
            index = torch.randperm(batch_size)

            for i in range(mini_batch_num):
                ind = index[slice(
                    i * self.ppo_mini_batch_size,
                    min(batch_size, (i + 1) * self.ppo_mini_batch_size))]
                state, action, returns, advantages, old_log_pis = \
                    batch_state[ind], batch_action[ind], batch_returns[ind], \
                    batch_advantages[ind], batch_log_prob[ind]

                v_loss, p_loss = ppo_step(self.policy_net, self.value_net,
                                          self.optimizer_p, self.optimizer_v,
                                          1, state, action, returns,
                                          advantages, old_log_pis,
                                          self.clip_epsilon, 1e-3)

        return v_loss, p_loss
Example #6
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalars(
            "ppo", {
                "total reward": log['total_reward'],
                "average reward": log['avg_reward'],
                "min reward": log['min_episode_reward'],
                "max reward": log['max_episode_reward'],
                "num steps": log['num_steps']
            }, i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = DOUBLE(batch.state).to(device)
        batch_action = DOUBLE(batch.action).to(device)
        batch_reward = DOUBLE(batch.reward).to(device)
        batch_mask = DOUBLE(batch.mask).to(device)
        batch_log_prob = DOUBLE(batch.log_prob).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)
        v_loss, p_loss = torch.empty(1), torch.empty(1)
        for _ in range(self.ppo_epochs):
            v_loss, p_loss = ppo_step(self.policy_net, self.value_net,
                                      self.optimizer_p, self.optimizer_v, 1,
                                      batch_state, batch_action, batch_return,
                                      batch_advantage, batch_log_prob,
                                      self.clip_epsilon, 1e-3)

        self.policy_net_old.load_state_dict(self.policy_net.state_dict())
        return v_loss, p_loss
Example #7
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalars(
            "vpg", {
                "total reward": log['total_reward'],
                "average reward": log['avg_reward'],
                "min reward": log['min_episode_reward'],
                "max reward": log['max_episode_reward'],
                "num steps": log['num_steps']
            }, i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)
        v_loss, p_loss = vpg_step(self.policy_net, self.value_net,
                                  self.optimizer_p, self.optimizer_v,
                                  self.vpg_epochs, batch_state, batch_action,
                                  batch_return, batch_advantage, 1e-3)
        return v_loss, p_loss
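
By analogy with the call above, vpg_step(policy_net, value_net, optimizer_p, optimizer_v, epochs, states, actions, returns, advantages, l2_reg) presumably performs a vanilla policy-gradient update with the value network as a baseline. A sketch under that assumption, again relying on an assumed get_log_prob helper on the policy:

import torch


def vpg_step(policy_net, value_net, optimizer_policy, optimizer_value,
             epochs, states, actions, returns, advantages, l2_reg):
    """Sketch of a vanilla policy gradient step matching the call in the VPG example."""
    value_loss, policy_loss = None, None
    for _ in range(epochs):
        # baseline: regress V(s) onto the empirical returns, with an L2 penalty
        value_loss = (value_net(states) - returns).pow(2).mean()
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

        # REINFORCE objective weighted by the advantage estimates
        log_probs = policy_net.get_log_prob(states, actions)  # assumed policy helper
        policy_loss = -(log_probs * advantages).mean()
        optimizer_policy.zero_grad()
        policy_loss.backward()
        optimizer_policy.step()

    return value_loss, policy_loss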
Example #8
    def learn(self, writer, i_iter):
        memory, log = self.collector.collect_samples(
            self.config["train"]["generator"]["sample_batch_size"])

        self.policy.train()
        self.value.train()
        self.discriminator.train()

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalar("gail/average reward", log['avg_reward'], i_iter)
        writer.add_scalar("gail/num steps", log['num_steps'], i_iter)

        # collect generated batch
        # gen_batch = self.collect_samples(self.config["ppo"]["sample_batch_size"])
        gen_batch = memory.sample()
        gen_batch_state = FLOAT(gen_batch.state).to(
            device)  # [batch size, state size]
        gen_batch_action = FLOAT(gen_batch.action).to(
            device)  # [batch size, action size]
        gen_batch_old_log_prob = FLOAT(gen_batch.log_prob).to(
            device)  # [batch size, 1]
        gen_batch_mask = FLOAT(gen_batch.mask).to(device)  # [batch, 1]

        ####################################################
        # update discriminator
        ####################################################
        d_optim_i_iters = self.config["train"]["discriminator"]["optim_step"]
        if i_iter % d_optim_i_iters == 0:
            for step, (expert_batch_state, expert_batch_action) in enumerate(
                    self.expert_dataset.train_loader):
                if step >= d_optim_i_iters:
                    break
                # calculate probs and logits
                gen_prob, gen_logits = self.discriminator(
                    gen_batch_state, gen_batch_action)
                expert_prob, expert_logits = self.discriminator(
                    expert_batch_state.to(device),
                    expert_batch_action.to(device))

                # calculate accuracy
                gen_acc = torch.mean((gen_prob < 0.5).float())
                expert_acc = torch.mean((expert_prob > 0.5).float())

                # calculate regression loss
                expert_labels = torch.ones_like(expert_prob)
                gen_labels = torch.zeros_like(gen_prob)
                e_loss = self.discriminator_func(expert_prob,
                                                 target=expert_labels)
                g_loss = self.discriminator_func(gen_prob, target=gen_labels)
                d_loss = e_loss + g_loss

                # calculate entropy loss
                logits = torch.cat([gen_logits, expert_logits], 0)
                entropy = ((1. - torch.sigmoid(logits)) * logits -
                           torch.nn.functional.logsigmoid(logits)).mean()
                entropy_loss = -self.config["train"]["discriminator"]["ent_coeff"] * entropy

                total_loss = d_loss + entropy_loss

                self.optimizer_discriminator.zero_grad()
                total_loss.backward()
                self.optimizer_discriminator.step()

        # only log discriminator stats on iterations where it was actually updated
        if i_iter % d_optim_i_iters == 0:
            writer.add_scalar('discriminator/d_loss', d_loss.item(), i_iter)
            writer.add_scalar('discriminator/e_loss', e_loss.item(), i_iter)
            writer.add_scalar('discriminator/g_loss', g_loss.item(), i_iter)
            writer.add_scalar('discriminator/ent', entropy.item(), i_iter)
            writer.add_scalar('discriminator/expert_acc', expert_acc.item(), i_iter)
            writer.add_scalar('discriminator/gen_acc', gen_acc.item(), i_iter)

        ####################################################
        # update policy by ppo [mini_batch]
        ####################################################

        with torch.no_grad():
            gen_batch_value = self.value(gen_batch_state)
            d_out, _ = self.discriminator(gen_batch_state, gen_batch_action)
            gen_batch_reward = -torch.log(1 - d_out + 1e-6)

        gen_batch_advantage, gen_batch_return = estimate_advantages(
            gen_batch_reward, gen_batch_mask, gen_batch_value,
            self.config["train"]["generator"]["gamma"],
            self.config["train"]["generator"]["tau"])

        ppo_optim_i_iters = self.config["train"]["generator"]["optim_step"]
        ppo_mini_batch_size = self.config["train"]["generator"][
            "mini_batch_size"]

        for _ in range(ppo_optim_i_iters):
            if ppo_mini_batch_size > 0:
                gen_batch_size = gen_batch_state.shape[0]
                optim_iter_num = int(
                    math.ceil(gen_batch_size / ppo_mini_batch_size))
                perm = torch.randperm(gen_batch_size)

                for i in range(optim_iter_num):
                    ind = perm[slice(
                        i * ppo_mini_batch_size,
                        min((i + 1) * ppo_mini_batch_size, gen_batch_size))]
                    mini_batch_state = gen_batch_state[ind]
                    mini_batch_action = gen_batch_action[ind]
                    mini_batch_advantage = gen_batch_advantage[ind]
                    mini_batch_return = gen_batch_return[ind]
                    mini_batch_old_log_prob = gen_batch_old_log_prob[ind]

                    v_loss, p_loss, ent_loss = ppo_step(
                        policy_net=self.policy,
                        value_net=self.value,
                        optimizer_policy=self.optimizer_policy,
                        optimizer_value=self.optimizer_value,
                        optim_value_iternum=self.config["value"]
                        ["optim_value_iter"],
                        states=mini_batch_state,
                        actions=mini_batch_action,
                        returns=mini_batch_return,
                        old_log_probs=mini_batch_old_log_prob,
                        advantages=mini_batch_advantage,
                        clip_epsilon=self.config["train"]["generator"]
                        ["clip_ratio"],
                        l2_reg=self.config["value"]["l2_reg"])
            else:
                v_loss, p_loss, ent_loss = ppo_step(
                    policy_net=self.policy,
                    value_net=self.value,
                    optimizer_policy=self.optimizer_policy,
                    optimizer_value=self.optimizer_value,
                    optim_value_iternum=self.config["value"]
                    ["optim_value_iter"],
                    states=gen_batch_state,
                    actions=gen_batch_action,
                    returns=gen_batch_return,
                    old_log_probs=gen_batch_old_log_prob,
                    advantages=gen_batch_advantage,
                    clip_epsilon=self.config["train"]["generator"]
                    ["clip_ratio"],
                    l2_reg=self.config["value"]["l2_reg"])

        writer.add_scalar('generator/p_loss', p_loss, i_iter)
        writer.add_scalar('generator/v_loss', v_loss, i_iter)
        writer.add_scalar('generator/ent_loss', ent_loss, i_iter)

        print(f" Training episode:{i_iter} ".center(80, "#"))
        print('d_gen_prob:', gen_prob.mean().item())
        print('d_expert_prob:', expert_prob.mean().item())
        print('d_loss:', d_loss.item())
        print('e_loss:', e_loss.item())
        print("d/bernoulli_entropy:", entropy.item())
Example #9
    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalar("total reward", log['total_reward'], i_iter)
        writer.add_scalar("average reward", log['avg_reward'], i_iter)
        writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
        writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
        writer.add_scalar("num steps", log['num_steps'], i_iter)

        batch = memory.sample()  # sample all items in memory
        #  ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_mask = FLOAT(batch.mask).to(device)
        batch_log_prob = FLOAT(batch.log_prob).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)

        alg_step_stats = {}
        if self.ppo_mini_batch_size:
            batch_size = batch_state.shape[0]
            mini_batch_num = int(
                math.ceil(batch_size / self.ppo_mini_batch_size))

            # update with mini-batch
            for _ in range(self.ppo_epochs):
                index = torch.randperm(batch_size)

                for i in range(mini_batch_num):
                    ind = index[slice(
                        i * self.ppo_mini_batch_size,
                        min(batch_size, (i + 1) * self.ppo_mini_batch_size))]
                    state, action, returns, advantages, old_log_pis = \
                        batch_state[ind], batch_action[ind], batch_return[ind], \
                        batch_advantage[ind], batch_log_prob[ind]

                    alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                              self.optimizer_p,
                                              self.optimizer_v, 1, state,
                                              action, returns, advantages,
                                              old_log_pis, self.clip_epsilon,
                                              1e-3)
        else:
            for _ in range(self.ppo_epochs):
                alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                          self.optimizer_p, self.optimizer_v,
                                          1, batch_state, batch_action,
                                          batch_return, batch_advantage,
                                          batch_log_prob, self.clip_epsilon,
                                          1e-3)

        return alg_step_stats