Example #1
File: ppo.py Project: CAiM-lab/PPO
# Imports assumed by this excerpt (standard library and PyTorch); the
# ActorCritic network and the SubprocVecEnv wrapper are project-local modules
# whose import paths are not shown here.
import os
import pickle
import random
import logging
from datetime import datetime

import numpy as np
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter  # or: from tensorboardX import SummaryWriter

logger = logging.getLogger(__name__)


class PPO(object):
    """Main PPO class"""
    def __init__(self, args):
        """"Constructor which allows the PPO class to initialize the attributes of the class"""
        self.args = args
        self.random_seed()
        # Check if GPU is available via CUDA driver
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        # Initialize the actor critic class
        self.actor_critic = ActorCritic(
            self.args.nb_states, self.args.nb_actions,
            self.args.hidden_layer_size).to(self.device)
        # Define the optimizer used for the optimization of the surrogate loss
        self.optimizer = self.args.optimizer(self.actor_critic.parameters(),
                                             self.args.lr)

        # For training, multiple instances of the environment are needed (shoulder model)
        self.envs = [self.make_env() for i in range(self.args.num_envs)]
        self.envs = SubprocVecEnv(self.envs)
        # A single test env is used to validate the learning progress during training
        self.env_test = self.args.env
        self.env_test.seed(self.args.seed)
        self.env_test.set_scaling(self.args.output_scaling)

        # Lists logged to TensorBoard to visualize the learning process
        self.test_rewards = []
        self.loss = []
        self.lr = []
        self.actor_grad_weight = []
        self.action_bang_bang = []

        self.lr.append(self.args.lr)

        # Create the output directory and TensorBoard writer (training mode only)
        if self.args.play is False:
            self.output_path = "trained_models" + '/PPO_{}'.format(
                datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
            os.mkdir(self.output_path)
            self.writer = SummaryWriter(self.output_path)

        # Per-frame learning-rate decrement used by scheduler()
        self.delta = (self.args.lr - self.args.lr_end) / 1e6

    def train(self):
        """Main training function"""
        frame_idx = 0
        state = self.envs.reset()
        mean_100_reward = -np.inf
        self.info()

        while frame_idx < self.args.max_frames:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []
            entropy = self.args.entropy

            for _ in range(self.args.nb_steps):
                state = torch.FloatTensor(state).to(self.device)
                dist, value = self.actor_critic(state)
                action = dist.sample()
                # Make sure action is loaded to CPU (not GPU)
                next_state, reward, done, _ = self.envs.step(
                    action.cpu().numpy())

                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(
                    torch.FloatTensor(reward).unsqueeze(1).to(self.device))
                masks.append(
                    torch.FloatTensor(1 - done).unsqueeze(1).to(self.device))

                states.append(state)
                actions.append(action)
                state = next_state
                frame_idx += 1
                #self.scheduler()

                # Evaluate training process and write data to tensorboard
                if frame_idx % 1000 == 0:
                    test_reward = np.mean(
                        [self.test_env(self.args.vis) for _ in range(10)])
                    self.test_rewards.append(test_reward)

                    if self.args.play is False:
                        # Mean over the last (up to) 100 evaluation rewards
                        mean_reward = np.round(
                            np.mean(self.test_rewards[-100:]), 0)
                        print("Mean reward: ", mean_reward)
                        if mean_100_reward < mean_reward:
                            mean_100_reward = mean_reward
                            self.save_network(mean_100_reward)
                        if len(self.test_rewards) >= 10:
                            self.writer.add_scalar(
                                'data/reward',
                                np.mean(self.test_rewards[-10:]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/ppo_loss', np.mean(self.loss[-10:]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/nb_actions_outside_range',
                                np.mean(self.action_bang_bang[-10:]),
                                frame_idx * self.args.num_envs)

                    # if test_reward > threshold_reward: early_stop = True

            next_state = torch.FloatTensor(next_state).to(self.device)
            _, next_value = self.actor_critic(next_state)
            returns = self.calc_gae(next_value, rewards, masks, values,
                                    self.args.gamma, self.args.tau)

            # detach() removes these tensors from the graph, i.e. these operations are ignored in gradient calculations
            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values
            self.ppo_update(self.args.ppo_epochs, self.args.mini_batch_size,
                            states, actions, log_probs, returns, advantage,
                            self.args.clip)

    def make_env(self):
        # Private thunk handed to SubprocVecEnv; each worker calls it to build its env
        def _trunk():
            env = self.args.env  # in this simple case the TestEnv() instance is used (see OpenAI Gym for more envs)
            env.seed(self.args.seed)
            env.set_scaling(self.args.output_scaling)
            return env

        return _trunk

    def test_env(self, vis=False):
        state = self.env_test.reset()
        if vis:
            self.env_test.render()
        done = False
        total_reward = 0
        action_bang_bang = 0
        step = 0
        while not done:
            step += 1
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            dist, _ = self.actor_critic(state)
            action = dist.sample().cpu().numpy()[0]
            force = action * self.args.output_scaling
            next_state, reward, done, _ = self.env_test.step(action)
            if force > 0.5 or force < -0.5:
                action_bang_bang += 1
            state = next_state
            if vis:
                self.env_test.render()
            total_reward += reward
        self.action_bang_bang.append(action_bang_bang / step)
        return total_reward

    # Static methods: plain functions that can be called from an instance or the class
    @staticmethod
    def calc_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
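        # Generalized Advantage Estimation (Schulman et al., 2016), computed
        # backwards over the rollout:
        #   delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        #   A_t     = delta_t + gamma * tau * mask_t * A_{t+1}
        # The returned targets are A_t + V(s_t); mask_t = 0 at episode ends,
        # which stops bootstrapping across boundaries.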
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[
                step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    @staticmethod
    def ppo_iter(mini_batch_size, states, actions, log_probs, returns,
                 advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield states[rand_ids, :], actions[rand_ids, :], log_probs[
                rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]
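        # Note: np.random.randint samples indices with replacement, so within an
        # epoch some transitions may be reused and others skipped.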

    def ppo_update(self,
                   ppo_epochs,
                   mini_batch_size,
                   states,
                   actions,
                   log_probs,
                   returns,
                   advantages,
                   clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(
                    mini_batch_size, states, actions, log_probs, returns,
                    advantages):
                dist, value = self.actor_critic(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
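                # PPO clipped surrogate objective (Schulman et al., 2017):
                #   L_clip = E[min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A)]
                # with eps = clip_param; taking the elementwise min gives a
                # pessimistic (lower-bound) objective.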
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                    1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
                self.loss.append(loss.item())
                # Reset accumulated gradients before backpropagation
                self.optimizer.zero_grad()
                loss.backward()
                if self.args.grad_norm is not None:
                    nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                             self.args.grad_norm)
                self.optimizer.step()

    def save_network(self, reward):
        network_path = self.output_path + "/network" + str(reward)
        pickle.dump(self.actor_critic.state_dict(), open(network_path, "wb"))

    def load_network(self, path):
        network_new = pickle.load(open(path, "rb"))
        self.actor_critic.load_state_dict(network_new)

    def random_seed(self):
        torch.manual_seed(self.args.seed)
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)

    def scheduler(self):
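        # Linearly decay the learning rate of each param group by self.delta per
        # call, clamping from below at args.lr_end.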
        for g in self.optimizer.param_groups:
            lr = g["lr"]
            if self.args.lr_end > lr:
                lr = self.args.lr_end
            else:
                lr -= self.delta
            self.lr.append(lr)
            g["lr"] = lr

    def info(self):
        fhandler = logging.FileHandler(filename=self.output_path + 'mylog.log',
                                       mode='a')
        logger.addHandler(fhandler)
        logger.info("--- INFO ---")
        logger.info("args: {}".format(self.args))
Example #2
                    help='rewards discount factor')
parser.add_argument('--entropy_weight', default=0.0001, type=float)
parser.add_argument('--alpha', default=0.95, type=float)
parser.add_argument('--type', default='notrpo', type=str, help='if trpo')
parser.add_argument('--render', action='store_true', help='render')

args = parser.parse_args()
# print(args)
torch.manual_seed(args.seed)

env = gym.make("CartPole-v0")
replay_buffer = ReplayBuffer(args.capacity, args.max_episode_length)
model = ActorCritic(env.observation_space.shape[0], env.action_space.n).cuda()
average_model = ActorCritic(env.observation_space.shape[0],
                            env.action_space.n).cuda()
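# The second network presumably tracks an exponentially averaged copy of the
# policy (cf. ACER-style trust regions, with --alpha as the averaging rate);
# note that it is not attached to the optimizer below.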
optimizer = optim.Adam(model.parameters())

frame_idx = 0
test_rewards = []
episode_count = 0
step_count = 0
state = env.reset()

running_rew = 0
plotcount = 0

while frame_idx < args.max_frames:

    policies = []
    average_policies = []
    actions = []
Example #3
# Assumed imports for this excerpt; ActorCritic is a project-local module, and
# `device` is defined at module level in the original file (recreated here).
import torch
from torch.optim import Adam

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Learner(object):
    def __init__(self, opt, q_batch):
        self.opt = opt
        self.q_batch = q_batch
        self.network = ActorCritic(opt).to(device)
        self.optimizer = Adam(self.network.parameters(), lr=opt.lr)
        self.network.share_memory()

    def learning(self):
        torch.manual_seed(self.opt.seed)
        coef_hat = torch.Tensor([[self.opt.coef_hat]]).to(device)
        rho_hat = torch.Tensor([[self.opt.rho_hat]]).to(device)
        while True:
            # Fetch a batch of trajectories from the queue; shapes:
            #   s[batch, n_step+1, 3, width, height]
            #   a[batch, n_step, a_space]
            #   rew[batch, n_step]
            #   a_prob[batch, n_step, a_space]
            s, a, rew, prob = self.q_batch.get(block=True)
            ###########################
            # variables we need later #
            ###########################
            v, coef, rho, entropies, log_prob = [], [], [], [], []
            cx = torch.zeros(self.opt.batch_size, 256).to(device)
            hx = torch.zeros(self.opt.batch_size, 256).to(device)
            for step in range(s.size(1)):
                # value[batch], logit[batch, 12]
                value, logit, (hx, cx) = self.network(
                    (s[:, step, ...], (hx, cx)))
                v.append(value)
                # s holds n_step+1 frames while a holds only n_step actions;
                # the extra iteration just collects the bootstrap value.
                if step >= a.size(1):
                    break

                # π/μ[batch]
                # TODO: cumprod might produce runtime problem
                logit_a = a[:, step, :] * logit.detach() + (
                    1 - a[:, step, :]) * (1 - logit.detach())
                prob_a = a[:, step, :] * prob[:, step, :] + (
                    1 - a[:, step, :]) * (1 - prob[:, step, :])
                is_rate = torch.cumprod(logit_a / (prob_a + 1e-6), dim=1)[:, -1]
                coef.append(torch.min(coef_hat, is_rate))
                rho.append(torch.min(rho_hat, is_rate))
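                # Truncated importance weights in the spirit of V-trace (IMPALA,
                # Espeholt et al., 2018): the "coef" weights bound how far
                # corrections propagate backwards, while "rho" bounds the weight
                # on each temporal-difference term.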

                # enpy_aspace[batch, 12]: per-action Bernoulli entropy; summing
                # over the action space gives the entropy[batch, 1].
                # Do NOT detach here: the entropy bonus must contribute gradients.
                enpy_aspace = -torch.log(logit) * logit - torch.log(
                    1 - logit) * (1 - logit)
                enpy = enpy_aspace.sum(dim=1, keepdim=True)
                entropies.append(enpy)

                # Probability that the taken action is selected by the target
                # policy: prob_pi_a[batch, 12] per action dimension, and its
                # log_prob[batch, 1] after summing over the action space.
                # Do NOT detach here: the policy gradient flows through log_prob.
                prob_pi_a = (a[:, step, :] *
                             logit) + (1 - a[:, step, :]) * (1 - logit)
                log_prob_pi_a = torch.log(prob_pi_a).sum(dim=1, keepdim=True)
                log_prob.append(log_prob_pi_a)
                # prob_pi_a = torch.cumprod(prob_pi_a, dim=1)[:, -1:]
                # log_prob_pi_a = torch.log(prob_pi_a)

            ####################
            # calculating loss #
            ####################
            policy_loss = 0
            value_loss = 0
            # gae = torch.zeros(self.opt.batch_size, 1)
            for rev_step in reversed(range(s.size(1) - 1)):
                # Advantage term for the policy gradient, using the corrected
                # target v_(s+1) = V(s+1) + the correction accumulated so far
                fix_vp = rew[:, rev_step] + self.opt.gamma * (
                    v[rev_step + 1] + value_loss) - v[rev_step]

                # value_loss[batch]
                td = rew[:, rev_step] + self.opt.gamma * v[rev_step +
                                                           1] - v[rev_step]
                value_loss = self.opt.gamma * coef[
                    rev_step] * value_loss + rho[rev_step] * td

                # policy_loss = policy_loss - log_probs[i] * Variable(gae)
                # the td must be detach from network-v

                # # delta_t[batch]
                # delta_t = rew[:, rev_step] + self.opt.gamma * v[rev_step + 1] - v[rev_step]
                # gae = gae * self.opt.gamma + delta_t.detach()

                policy_loss = policy_loss \
                              - rho[rev_step] * log_prob[rev_step] * fix_vp.detach() \
                              - self.opt.entropy_coef * entropies[rev_step]

            self.optimizer.zero_grad()
            policy_loss = policy_loss.sum()
            value_loss = value_loss.sum()
            loss = policy_loss + self.opt.value_loss_coef * value_loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                                           self.opt.max_grad_norm)
            print("v_loss {:.3f} p_loss {:.3f}".format(value_loss.item(),
                                                       policy_loss.item()))
            self.optimizer.step()
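For reference, here is a minimal, self-contained sketch (not part of this project) of the n-step V-trace target recursion that the value_loss accumulation above resembles, following IMPALA (Espeholt et al., 2018). The function name, tensor shapes, and toy values are assumptions for illustration.

# Standalone V-trace target sketch; assumes the truncated importance weights
# rho_t and c_t have already been computed, as in the loop above.
import torch


def vtrace_targets(rewards, values, bootstrap_value, rhos, cs, gamma=0.99):
    """rewards, rhos, cs, values: [T]; bootstrap_value: scalar V(x_T)."""
    values_ext = torch.cat([values, bootstrap_value.view(1)])
    correction = torch.zeros(())              # running v_s - V(x_s)
    targets = torch.empty_like(rewards)
    for t in reversed(range(rewards.shape[0])):
        delta = rhos[t] * (rewards[t] + gamma * values_ext[t + 1] - values_ext[t])
        correction = delta + gamma * cs[t] * correction
        targets[t] = values_ext[t] + correction
    return targets


# Toy usage with T = 4 steps and constant truncated weights of 0.8
targets = vtrace_targets(torch.ones(4), torch.zeros(4), torch.zeros(()),
                         0.8 * torch.ones(4), 0.8 * torch.ones(4))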