Example #1
class ParallelActorCritic(object):
    """
    The method is also known as A2C, i.e. (Parallel) Advantage Actor-Critic.
    https://blog.openai.com/baselines-acktr-a2c/
    https://arxiv.org/abs/1705.04862
    """

    CHECKPOINT_SUBDIR = 'checkpoints/'
    SUMMARY_FILE = 'summaries.pkl4'  #pickle, protocol=4
    CHECKPOINT_LAST = 'checkpoint_last.pth'
    CHECKPOINT_BEST = 'checkpoint_best.pth'

    save_every = 10**6
    print_every = 10240
    eval_every = 20 * 10240

    def __init__(self, network, batch_env, args):
        logging.debug('PAAC init is started')
        self.checkpoint_dir = join_path(args.debugging_folder,
                                        self.CHECKPOINT_SUBDIR)
        ensure_dir(self.checkpoint_dir)

        checkpoint = self._load_latest_checkpoint(self.checkpoint_dir)
        self.last_saving_step = checkpoint['last_step'] if checkpoint else 0

        self.global_step = self.last_saving_step
        self.network = network
        self.batch_env = batch_env
        self.optimizer = optim.RMSprop(
            self.network.parameters(),
            lr=args.initial_lr,
            eps=args.e,
        )  # RMSprop defaults: momentum=0., centered=False, weight_decay=0

        if checkpoint:
            logging.info('Restoring agent variables from previous run')
            self.network.load_state_dict(checkpoint['network_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        self.lr_scheduler = LinearAnnealingLR(self.optimizer,
                                              args.lr_annealing_steps)
        # The PyTorch documentation recommends using the CUDA_VISIBLE_DEVICES
        # environment variable to select a particular GPU.
        self.device = self.network._device

        self.gamma = args.gamma  # future rewards discount factor
        self.entropy_coef = args.entropy_regularisation_strength
        self.loss_scaling = args.loss_scaling  #5.
        self.critic_coef = args.critic_coef  #0.25
        self.total_steps = args.max_global_steps
        self.rollout_steps = args.rollout_steps
        self.clip_norm = args.clip_norm
        self.num_emulators = batch_env.num_emulators

        self.evaluate = None
        self.reshape_r = lambda r: np.clip(r, -1., 1.)
        self.compute_returns = n_step_returns
        if args.clip_norm_type == 'global':
            self.clip_gradients = nn.utils.clip_grad_norm_
        elif args.clip_norm_type == 'local':
            self.clip_gradients = utils.clip_local_grad_norm
        elif args.clip_norm_type == 'ignore':
            self.clip_gradients = lambda params, _: utils.global_grad_norm(
                params)
        else:
            raise ValueError('Norm type ({}) is not recognized'.format(
                args.clip_norm_type))
        logging.debug('Paac init is done')

        self.curr_learning = True
        # Curriculum schedule for labyrinth lengths: 1. 5-10;  2. 15-20;  3. 40-50;  4. 90-100
        self.starting_length = [[5, 10], [5, 10], [5, 10], [5, 10],
                                [15, 20], [15, 20], [15, 20], [15, 20]]
        self.checking_length = [15, 20]

    def train(self):
        """
        Main actor-learner loop for parallel advantage actor-critic learning.
        """
        logging.info('Starting training at step %d' % self.global_step)
        logging.debug('Device: {}'.format(self.device))

        counter = 0
        global_step_start = self.global_step
        average_loss = utils.MovingAverage(
            0.01, ['actor', 'critic', 'entropy', 'grad_norm'])

        total_rewards, training_stats, total_length = [], [], []

        num_emulators = self.batch_env.num_emulators
        total_episode_rewards = np.zeros(num_emulators)

        #stores 0.0 in i-th element if the episode in i-th emulator has just started, otherwise stores 1.0
        #mask is used to cut rnn_state and episode rewards between episodes.
        mask_t = th.zeros(num_emulators).to(self.device)

        # Feedforward networks also take an rnn_state argument; for them it is simply empty.
        rnn_state = self.network.init_rnn_state(num_emulators)

        states, infos = self.batch_env.reset_all()
        self.batch_env.set_difficulty(self.starting_length)

        if self.evaluate is not None:
            stats = self.evaluate(self.network)
            training_stats.append((self.global_step, stats))

        start_time = time.time()
        while self.global_step < self.total_steps:

            loop_start_time = time.time()
            values, log_probs, rewards, entropies, masks = [], [], [], [], []
            self.network.detach_rnn_state(rnn_state)

            for t in range(self.rollout_steps):
                outputs = self.choose_action(states, infos,
                                             mask_t.unsqueeze(1), rnn_state)
                a_t, v_t, log_probs_t, entropy_t, rnn_state = outputs
                states, rs, dones, infos = self.batch_env.next(a_t)

                tensor_rs = th.from_numpy(self.reshape_r(rs)).to(self.device)
                rewards.append(tensor_rs)
                entropies.append(entropy_t)
                log_probs.append(log_probs_t)
                values.append(v_t)

                mask_t = 1.0 - th.from_numpy(dones).to(
                    self.device)  #dones.dtype == np.float32
                masks.append(
                    mask_t)  #1.0 if episode is not done, 0.0 otherwise

                done_mask = dones.astype(bool)
                total_episode_rewards += rs

                if any(done_mask):
                    total_rewards.extend(total_episode_rewards[done_mask])
                    total_episode_rewards[done_mask] = 0.

            next_v = self.predict_values(states, infos, mask_t.unsqueeze(1),
                                         rnn_state)

            update_stats = self.update_weights(next_v, rewards, masks, values,
                                               log_probs, entropies)
            average_loss.update(**update_stats)

            self.global_step += num_emulators * self.rollout_steps
            counter += 1

            if counter % (self.print_every //
                          (num_emulators * self.rollout_steps)) == 0:
                curr_time = time.time()
                self._training_info(
                    total_rewards=total_rewards,
                    average_speed=(self.global_step - global_step_start) /
                    (curr_time - start_time),
                    loop_speed=(num_emulators * self.rollout_steps) /
                    (curr_time - loop_start_time),
                    update_stats=average_loss)

            if counter % (self.eval_every //
                          (num_emulators * self.rollout_steps)) == 0:
                if self.evaluate is not None:
                    stats = self.evaluate(self.network)
                    if stats.final_res > 0.95:
                        print(stats.final_res, 'stats.final_res')
                        # With curriculum learning, once final_res exceeds 95%
                        # enlarge the labyrinth length.
                        if self.curr_learning:
                            print(self.curr_learning, 'self.curr_learning')
                            self.change_length_labyrinth()
                    training_stats.append((self.global_step, stats))

            if self.global_step - self.last_saving_step >= self.save_every:
                self._save_progress(self.checkpoint_dir,
                                    summaries=training_stats,
                                    is_best=False)
                training_stats = []
                self.last_saving_step = self.global_step

        self._save_progress(self.checkpoint_dir, is_best=False)
        logging.info('Training ended at step %d' % self.global_step)

    def choose_action(self, states, infos, masks, rnn_states):
        values, distr, rnn_states = self.network(states, infos, masks,
                                                 rnn_states)
        acts = distr.sample().detach()
        log_probs = distr.log_prob(acts)
        entropy = distr.entropy()
        return acts, values.squeeze(dim=1), log_probs, entropy, rnn_states

    def predict_values(self, states, infos, masks, rnn_states):
        values = self.network(states, infos, masks, rnn_states)[0]
        return values.squeeze(dim=1)

    def update_weights(self, next_v, rewards, masks, values, log_probs,
                       entropies):
        returns = self.compute_returns(next_v.detach(), rewards, masks,
                                       self.gamma)

        loss, update_data = self.compute_loss(th.cat(returns), th.cat(values),
                                              th.cat(log_probs),
                                              th.cat(entropies))

        self.lr_scheduler.adjust_learning_rate(self.global_step)
        self.optimizer.zero_grad()
        loss.backward()
        global_norm = self.clip_gradients(self.network.parameters(),
                                          self.clip_norm)
        self.optimizer.step()

        update_data['grad_norm'] = global_norm
        return update_data

    def compute_loss(self, returns, values, log_probs, entropies):
        advantages = returns - values

        critic_loss = self.critic_coef * advantages.pow(2).mean()  # minimize
        # minimize -log(policy(a|s)) * advantage(s,a)
        actor_loss = th.neg(log_probs * advantages.detach()).mean()
        entropy_loss = self.entropy_coef * entropies.mean()  # maximize entropy

        loss = self.loss_scaling * (actor_loss + critic_loss - entropy_loss)

        loss_data = {
            'actor': actor_loss.item(),
            'critic': critic_loss.item(),
            'entropy': entropy_loss.item()
        }
        return loss, loss_data

    @classmethod
    def _load_latest_checkpoint(cls, dir):
        last_chkpt_path = join_path(dir, cls.CHECKPOINT_LAST)
        if isfile(last_chkpt_path):
            return th.load(last_chkpt_path)
        return None

    def _save_progress(self, dir, summaries=None, is_best=False):
        last_chkpt_path = join_path(dir, self.CHECKPOINT_LAST)
        state = {
            'last_step': self.global_step,
            'network_state_dict': self.network.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict()
        }
        th.save(state, last_chkpt_path)
        logging.info('The state of the agent is saved at step #%d' %
                     self.global_step)

        if (summaries is not None) and len(summaries) > 0:
            summaries_path = join_path(dir, self.SUMMARY_FILE)
            utils.save_summary(summaries, summaries_path)

        if is_best:
            best_chkpt_path = join_path(dir, self.CHECKPOINT_BEST)
            shutil.copyfile(last_chkpt_path, best_chkpt_path)

    def _training_info(self, total_rewards, average_speed, loop_speed,
                       update_stats):
        last_twenty = np.mean(total_rewards[-20:]) if len(total_rewards) else 0.
        logger_msg = "Ran {0} steps, at {1:.3f} fps (avg {2:.3f} fps), last 20 episodes avg {3:.5f}"

        lines = [
            '',
        ]
        lines.append(
            logger_msg.format(self.global_step, loop_speed, average_speed,
                              last_twenty))
        lines.append(str(update_stats))
        logging.info(yellow('\n'.join(lines)))

    def change_length_labyrinth(self):
        self.checking_length = list(np.array(self.checking_length) + [10, 10])
        for i in range(8):
            self.starting_length[i] = list(
                np.array(self.starting_length[i]) + [10, 10])
        print(self.checking_length, 'self.checking_length')
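
Example #1 assigns self.compute_returns = n_step_returns, but the helper itself is not part of the listing. The sketch below is only an assumption of what such a function could look like, given how it is called in update_weights: it receives the bootstrap value predicted after the last rollout step, the per-step reward and mask tensors, and the discount factor, and returns a list of n-step return tensors aligned with values.

def n_step_returns(next_v, rewards, masks, gamma):
    # Hypothetical sketch, not the original helper.
    # next_v  -- bootstrap value V(s_T) predicted after the last rollout step
    # rewards -- list of length T with reward tensors of shape (num_emulators,)
    # masks   -- list of length T with 1.0/0.0 tensors (0.0 where an episode just ended)
    # gamma   -- discount factor for future rewards
    returns = [None] * len(rewards)
    R = next_v
    for t in reversed(range(len(rewards))):
        # The mask resets the return at episode boundaries.
        R = rewards[t] + gamma * masks[t] * R
        returns[t] = R
    return returns
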
Example #2
class PAACLearner(object):
    CHECKPOINT_SUBDIR = 'checkpoints/'
    SUMMARY_FILE = 'summaries.pkl4'  #pickle, protocol=4
    CHECKPOINT_LAST = 'checkpoint_last.pth'
    CHECKPOINT_BEST = 'checkpoint_best.pth'

    save_every = 10**6
    print_every = 10240
    eval_every = 20 * 10240

    def __init__(self, network_creator, batch_env, args):
        logging.debug('PAAC init is started')
        self.args = copy.copy(vars(args))
        self.checkpoint_dir = join_path(self.args['debugging_folder'],
                                        self.CHECKPOINT_SUBDIR)
        ensure_dir(self.checkpoint_dir)

        checkpoint = self._load_latest_checkpoint(self.checkpoint_dir)
        self.last_saving_step = checkpoint['last_step'] if checkpoint else 0

        self.final_rewards = []
        self.global_step = self.last_saving_step
        self.network = network_creator()
        self.batch_env = batch_env
        self.optimizer = optim.RMSprop(
            self.network.parameters(),
            lr=self.args['initial_lr'],
            eps=self.args['e'],
        )  # RMSprop defaults: momentum=0., centered=False, weight_decay=0

        if checkpoint:
            logging.info('Restoring agent variables from previous run')
            self.network.load_state_dict(checkpoint['network_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        self.lr_scheduler = LinearAnnealingLR(self.optimizer,
                                              self.args['lr_annealing_steps'])
        # The PyTorch documentation recommends using the CUDA_VISIBLE_DEVICES
        # environment variable to select a particular GPU.
        self.use_cuda = self.args['device'] == 'gpu'
        self.use_rnn = hasattr(
            self.network, 'get_initial_state'
        )  #get_initial_state should return state of the rnn layers
        self._tensors = torch.cuda if self.use_cuda else torch

        self.action_codes = np.eye(
            batch_env.num_actions)  # envs receive actions in one-hot encoding!
        self.gamma = self.args['gamma']  # future rewards discount factor
        self.entropy_coef = self.args['entropy_regularisation_strength']
        self.loss_scaling = self.args['loss_scaling']  #5.
        self.critic_coef = self.args['critic_coef']  #0.25
        self.eval_func = None

        if self.args['clip_norm_type'] == 'global':
            self.clip_gradients = nn.utils.clip_grad_norm_
        elif self.args['clip_norm_type'] == 'local':
            self.clip_gradients = utils.clip_local_grad_norm
        elif self.args['clip_norm_type'] == 'ignore':
            self.clip_gradients = lambda params, _: utils.global_grad_norm(
                params)
        else:
            raise ValueError('Norm type ({}) is not recognized'.format(
                self.args['clip_norm_type']))
        logging.debug('Paac init is done')

    def train(self):
        """
        Main actor-learner loop for parallel advantage actor-critic learning.
        """
        logging.info('Starting training at step %d' % self.global_step)
        logging.debug('use_cuda == {}'.format(self.use_cuda))

        counter = 0
        global_step_start = self.global_step
        average_loss = utils.MovingAverage(0.01, ['total', 'actor', 'critic'])
        total_rewards, training_stats = [], []

        if self.eval_func is not None:
            stats = self.evaluate(verbose=True)
            training_stats.append((self.global_step, stats))

        #num_actions = self.args['num_actions']
        num_emulators = self.args['num_envs']
        max_local_steps = self.args['max_local_steps']
        max_global_steps = self.args['max_global_steps']
        clip_norm = self.args['clip_norm']
        rollout_steps = num_emulators * max_local_steps

        states, infos = self.batch_env.reset_all()

        emulator_steps = np.zeros(num_emulators, dtype=int)
        total_episode_rewards = np.zeros(num_emulators)
        not_done_masks = torch.zeros(max_local_steps, num_emulators).type(
            self._tensors.FloatTensor)
        if self.use_rnn:
            hx_init, cx_init = self.network.get_initial_state(num_emulators)
            hx, cx = hx_init, cx_init
        else:  #for feedforward nets just ignore this argument
            hx, cx = None, None

        start_time = time.time()
        while self.global_step < max_global_steps:
            loop_start_time = time.time()
            values, log_probs, rewards, entropies = [], [], [], []
            if self.use_rnn:
                # Detach so that backprop is truncated at rollout boundaries.
                hx, cx = hx.detach(), cx.detach()

            for t in range(max_local_steps):
                outputs = self.choose_action(states, infos, (hx, cx))
                a_t, v_t, log_probs_t, entropy_t, (hx, cx) = outputs
                states, rs, dones, infos = self.batch_env.next(a_t)

                #actions_sum += a_t
                rewards.append(np.clip(rs, -1., 1.))
                entropies.append(entropy_t)
                log_probs.append(log_probs_t)
                values.append(v_t)
                is_done = torch.from_numpy(dones).type(
                    self._tensors.FloatTensor)
                not_done_masks[t] = 1.0 - is_done

                done_mask = dones.astype(bool)
                total_episode_rewards += rs
                emulator_steps += 1

                total_rewards.extend(total_episode_rewards[done_mask])
                total_episode_rewards[done_mask] = 0.
                emulator_steps[done_mask] = 0
                if self.use_rnn and any(done_mask):
                    # Clear the lstm states of the terminated emulators.
                    done_idx = is_done.nonzero().view(-1)
                    # hx and cx are needed for the backward pass,
                    # so we can't modify them in-place.
                    hx, cx = hx.clone(), cx.clone()
                    hx[done_idx, :] = hx_init[done_idx, :].detach()
                    cx[done_idx, :] = cx_init[done_idx, :].detach()

            self.global_step += rollout_steps
            next_v = self.predict_values(states, infos, (hx, cx))
            R = next_v.detach().view(-1)

            delta_v = []
            for t in reversed(range(max_local_steps)):
                rs = Variable(torch.from_numpy(rewards[t])).type(
                    self._tensors.FloatTensor)
                not_done_t = Variable(not_done_masks[t])
                R = rs + self.gamma * R * not_done_t
                delta_v_t = R - values[t].view(-1)
                delta_v.append(delta_v_t)

            loss, actor_loss, critic_loss = self.compute_loss(
                torch.cat(delta_v, 0),
                torch.cat(log_probs, 0).view(-1),
                torch.cat(entropies, 0).view(-1))

            self.lr_scheduler.adjust_learning_rate(self.global_step)
            self.optimizer.zero_grad()
            loss.backward()
            global_norm = self.clip_gradients(self.network.parameters(),
                                              clip_norm)
            self.optimizer.step()

            average_loss.update(total=loss.data.item(),
                                actor=actor_loss.item(),
                                critic=critic_loss.item())

            counter += 1
            if counter % (self.print_every // rollout_steps) == 0:
                curr_time = time.time()
                self._training_info(
                    total_rewards=total_rewards,
                    average_speed=(self.global_step - global_step_start) /
                    (curr_time - start_time),
                    loop_speed=rollout_steps / (curr_time - loop_start_time),
                    moving_averages=average_loss,
                    grad_norms=global_norm)

            if counter % (self.eval_every // rollout_steps) == 0:
                if (self.eval_func is not None):
                    stats = self.evaluate(verbose=True)
                    training_stats.append((self.global_step, stats))

            if self.global_step - self.last_saving_step >= self.save_every:
                self._save_progress(self.checkpoint_dir,
                                    summaries=training_stats,
                                    is_best=False)
                training_stats = []
                self.last_saving_step = self.global_step

        self._save_progress(self.checkpoint_dir, is_best=False)
        logging.info('Training ended at step %d' % self.global_step)

    def choose_action(self, states, infos, rnn_states):
        if self.use_rnn:
            values, a_logits, rnn_states = self.network(
                states, infos, rnn_states)
        else:
            values, a_logits = self.network(states, infos)  #without rnn_state

        probs = F.softmax(a_logits, dim=1)
        log_probs = F.log_softmax(a_logits, dim=1)
        entropy = torch.neg((log_probs * probs)).sum(1)
        acts = probs.multinomial(1).detach()
        selected_log_probs = log_probs.gather(1, acts)

        check_log_zero(log_probs.data)
        acts_one_hot = self.action_codes[acts.data.cpu().view(-1).numpy(), :]
        return acts_one_hot, values, selected_log_probs, entropy, rnn_states

    def predict_values(self, states, infos, rnn_states):
        if self.use_rnn:
            return self.network(states, infos, rnn_states)[0]
        return self.network(states, infos)[0]

    def compute_loss(self, delta_v, selected_log_probs, entropies):
        # delta_v = target_value - v_t, which is basically the advantage A(s_t, a_t).
        # detach() prevents gradients of the actor loss from flowing into the critic.
        advantages = delta_v.detach()
        actor_loss = selected_log_probs * advantages + self.entropy_coef * entropies
        actor_loss = torch.neg(torch.mean(actor_loss, 0))  #-1. * actor_loss
        critic_loss = self.critic_coef * torch.mean(delta_v.pow(2), 0)
        loss = self.loss_scaling * (actor_loss + critic_loss)
        return loss, actor_loss, critic_loss

    @classmethod
    def _load_latest_checkpoint(cls, dir):
        last_chkpt_path = join_path(dir, cls.CHECKPOINT_LAST)
        if isfile(last_chkpt_path):
            return torch.load(last_chkpt_path)
        return None

    def _save_progress(self, dir, summaries=None, is_best=False):
        last_chkpt_path = join_path(dir, self.CHECKPOINT_LAST)
        state = {
            'last_step': self.global_step,
            'network_state_dict': self.network.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict()
        }
        torch.save(state, last_chkpt_path)
        logging.info('The state of the agent is saved at step #%d' %
                     self.global_step)

        if (summaries is not None) and len(summaries) > 0:
            summaries_path = join_path(dir, self.SUMMARY_FILE)
            utils.save_summary(summaries, summaries_path)

        if is_best:
            best_chkpt_path = join_path(dir, self.CHECKPOINT_BEST)
            shutil.copyfile(last_chkpt_path, best_chkpt_path)

    def _training_info(self, total_rewards, average_speed, loop_speed,
                       moving_averages, grad_norms):
        last_ten = np.mean(total_rewards[-10:]) if len(total_rewards) else 0.
        logger_msg = "Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"

        lines = [
            '',
        ]
        lines.append(
            logger_msg.format(self.global_step, loop_speed, average_speed,
                              last_ten))
        lines.append(str(moving_averages))
        lines.append('grad_norm: {}'.format(grad_norms))
        logging.info(yellow('\n'.join(lines)))

    def evaluate(self, verbose=True):
        num_steps, rewards = self.eval_func(*self.eval_args,
                                            **self.eval_kwargs)

        mean_steps = np.mean(num_steps)
        min_r, max_r = np.min(rewards), np.max(rewards)
        mean_r, std_r = np.mean(rewards), np.std(rewards)

        stats = TrainingStats(mean_r, max_r, min_r, std_r, mean_steps)
        if verbose:
            lines = [
                'Performed {0} tests:'.format(len(num_steps)),
                'Mean number of steps: {0:.3f}'.format(mean_steps),
                'Mean R: {0:.2f} | Std of R: {1:.3f}'.format(mean_r, std_r)
            ]
            logging.info(red('\n'.join(lines)))

        return stats

    def set_eval_function(self, eval_func, *args, **kwargs):
        self.eval_func = eval_func
        self.eval_args = args
        self.eval_kwargs = kwargs
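
Example #2 leaves evaluation pluggable: set_eval_function stores an arbitrary callable together with its arguments, and evaluate() only assumes that the callable returns a pair (num_steps, rewards) with one entry per test episode. Below is a minimal wiring sketch; dummy_eval and the commented-out constructor calls are illustrative stand-ins, not parts of the listing.

import numpy as np


def dummy_eval(num_episodes=10):
    # Hypothetical stand-in for a real evaluation routine; it only has to
    # return (num_steps, rewards), one entry per test episode.
    num_steps = np.random.randint(50, 200, size=num_episodes)
    rewards = np.random.uniform(-1.0, 1.0, size=num_episodes)
    return num_steps, rewards


# learner = PAACLearner(network_creator, batch_env, args)  # constructor arguments as above
# learner.set_eval_function(dummy_eval, num_episodes=20)
# stats = learner.evaluate(verbose=True)  # logs mean steps, mean and std of rewards
# learner.train()  # evaluate() now also runs every eval_every steps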