Example No. 1
class Agent(object):
    def __init__(self, cfg, seed=1):
        self.cfg = cfg
        self.env = NormalizeWrapper(
            self.cfg['environment'],
            norm_obs=self.cfg['normalize_observations'],
            norm_reward=self.cfg['normalize_rewards'],
            clip_obs=self.cfg['clip_observations'],
            clip_reward=self.cfg['clip_rewards'],
            gamma=self.cfg['gamma_env_normalization'],
            epsilon=self.cfg['num_stab_envnorm'])

        # only Discrete and Box action spaces are supported
        self.discrete = isinstance(self.env.action_space, gym.spaces.Discrete)

        # get input / output dimensions
        self.input_dim = self.env.observation_space.shape

        if self.discrete:
            self.n_actions = self.env.action_space.n
        else:
            self.n_actions = self.env.action_space.shape[0]
            self.action_space_means = (self.env.action_space.high +
                                       self.env.action_space.low) / 2.0
            self.action_space_magnitude = (self.env.action_space.high -
                                           self.env.action_space.low) / 2.0

        if self.cfg['model_load_path_prefix']:
            self.load_model(self.cfg['model_load_path_prefix'])
        else:
            self.actor = self._build_network(self.cfg['actor_model'],
                                             self.input_dim, self.n_actions)
            self.critic = self._build_network(self.cfg['critic_model'],
                                              self.input_dim, 1)
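            # state-independent log standard deviation of the Gaussian policy
            # (ignored for discrete action spaces)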
            self.log_std_stateless = tf.Variable(tf.zeros(self.n_actions,
                                                          dtype=tf.float32),
                                                 trainable=True)

        self.actor_optimizer = Adam(learning_rate=self.cfg['adam_actor_alpha'],
                                    epsilon=self.cfg['adam_actor_epsilon'])
        self.critic_optimizer = Adam(
            learning_rate=self.cfg['adam_critic_alpha'],
            epsilon=self.cfg['adam_critic_epsilon'])

        ## MEMORY
        self._reset_memory()

        ## TENSORBOARD metrics and writers
        self.start_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        self.train_log_dir = f"logs/ppoagent/{self.env.get_name()}/{str(self.start_time)}"
        self.train_summary_writer = tf.summary.create_file_writer(
            self.train_log_dir)

        # logging losses
        self.tb_actor_loss = tf.keras.metrics.Mean('actor_losses/total_loss',
                                                   dtype=tf.float32)
        self.tb_ppo_loss = tf.keras.metrics.Mean('actor_losses/ppo_loss',
                                                 dtype=tf.float32)
        self.tb_entropy_loss = tf.keras.metrics.Mean(
            'actor_losses/entropy_loss', dtype=tf.float32)
        self.tb_actor_regloss = tf.keras.metrics.Mean('actor_losses/reg_loss',
                                                      dtype=tf.float32)

        self.tb_critic_loss = tf.keras.metrics.Mean('critic_losses/total_loss',
                                                    dtype=tf.float32)
        self.tb_value_loss = tf.keras.metrics.Mean('critic_losses/value_loss',
                                                   dtype=tf.float32)
        self.tb_critic_regloss = tf.keras.metrics.Mean(
            'critic_losses/reg_loss', dtype=tf.float32)

        if self.cfg['tb_log_graph']:
            tb_log_model_graph(self.train_summary_writer, self.actor,
                               self.train_log_dir, 'actor_model')
            tb_log_model_graph(self.train_summary_writer, self.critic,
                               self.train_log_dir, 'critic_model')

        cfg_as_list = [[str(k), str(v)] for k, v in self.cfg.items()]

        with self.train_summary_writer.as_default():
            tf.summary.text(name='hyperparameters',
                            data=tf.convert_to_tensor(cfg_as_list),
                            step=0)

    def _reset_memory(self):
        self.state_memory, self.not_done_memory = [], []
        self.action_memory, self.action_dist_memory = [], []
        self.reward_memory, self.v_est_memory = [], []
        self.last_vest_buffer = 0.0

    def _build_network(self, network_model, input_dim, output_dim):
        model = network_model(input_dim, output_dim)
        model.build(input_shape=input_dim)
        return model

    def save_model(self, filepath):
        file_prefix = f'{filepath}/models/{self.step}'
        os.makedirs(file_prefix, exist_ok=True)

        tf.keras.models.save_model(self.actor,
                                   f'{file_prefix}/actor.h5',
                                   overwrite=True,
                                   include_optimizer=False,
                                   save_format='h5')
        tf.keras.models.save_model(self.critic,
                                   f'{file_prefix}/critic.h5',
                                   overwrite=True,
                                   include_optimizer=False,
                                   save_format='h5')
        np.save(f'{file_prefix}/logstd.npy', self.log_std_stateless.numpy())
        self.env.save(f'{file_prefix}/env.pkl')

    def load_model(self, file_prefix):
        self.actor = tf.keras.models.load_model(f'{file_prefix}/actor.h5',
                                                compile=False)
        self.critic = tf.keras.models.load_model(f'{file_prefix}/critic.h5',
                                                 compile=False)
        self.log_std_stateless = tf.Variable(
            np.load(f'{file_prefix}/logstd.npy'), trainable=True)
        self.env = NormalizeWrapper.load(f'{file_prefix}/env.pkl',
                                         self.cfg['environment'])

    def _get_dist(self, means, log_stds):
        if self.discrete:
            return tfd.Categorical(logits=means)
        else:
            return tfd.Normal(loc=means, scale=K.exp(log_stds))

    def actor_choose(self, state):
        a_mu = self.actor(K.expand_dims(state, axis=0))[0]
        dist = self._get_dist(a_mu, self.log_std_stateless)

        if self.discrete:
            scaled_action = unscaled_action = dist.sample().numpy()
        else:
            unscaled_action = np.clip(dist.sample(), -1.0, 1.0)

            if self.cfg['scale_actions']:
                scaled_action = self.action_space_means + unscaled_action * self.action_space_magnitude
            else:
                scaled_action = unscaled_action

        return scaled_action, unscaled_action, a_mu

    def critic_evaluate(self, state):
        return self.critic(K.expand_dims(state, axis=0))[0]

    def store_transition(self, state, action, action_dist, reward, v_est,
                         not_done):
        self.state_memory.append(state)
        self.not_done_memory.append(not_done)
        self.action_memory.append(action)
        self.action_dist_memory.append(action_dist)
        self.reward_memory.append(reward)
        self.v_est_memory.append(v_est)

    def _calculate_returns_and_advantages(self, v_ests, rewards, not_dones):
        vests = np.asarray(v_ests + [self.last_vest_buffer]).flatten()
        rews = np.asarray(rewards).flatten()
        notdones = np.asarray(not_dones).flatten()

        # calculate empirical returns (discounted rewards) for the collected rollout
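        # R_t = r_t + gamma * (1 - done_t) * R_{t+1}, seeded with the bootstrap
        # value estimate (computed as a right scan over the trajectory)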
        def discounted_return_fn(accumulated_discounted_reward,
                                 reward_discount):
            reward, discount = reward_discount
            return accumulated_discounted_reward * discount + reward

        discounts = self.cfg['gae_gamma'] * notdones
        returns = npscanr(discounted_return_fn, self.last_vest_buffer,
                          list(zip(rews, discounts)))

        # calculate advantages from TD residuals (see GAE paper, eq. 16)
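        # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        # A_t = delta_t + (gamma * lambda) * A_{t+1}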
        def weighted_cumulative_td_fn(accumulated_td, weights_td_tuple):
            td, weighted_discount = weights_td_tuple
            return accumulated_td * weighted_discount + td

        deltas = rews + discounts * vests[1:] - vests[:-1]
        advantages = npscanr(
            weighted_cumulative_td_fn, 0,
            list(zip(deltas, discounts * self.cfg['gae_lambda'])))

        return returns, advantages

    def _ppo_clip_loss(self, log_pi_new, log_pi_old, advantage):
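        # clipped surrogate objective (PPO paper, eq. 7):
        # L_CLIP = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)]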
        ratio = K.exp(log_pi_new - log_pi_old)
        clip_ratio = K.clip(ratio,
                            min_value=1 - self.cfg['ppo_clip'](self.step),
                            max_value=1 + self.cfg['ppo_clip'](self.step))

        surrogate1 = ratio * advantage
        surrogate2 = clip_ratio * advantage

        # negate: the optimizer performs gradient descent, but the clipped
        # surrogate objective is to be maximized
        return -K.mean(K.minimum(surrogate1, surrogate2))

    def _value_loss(self, values, values_old, returns):
        # flatten everything to shape (batch,) so the element-wise ops below
        # do not accidentally broadcast (batch, 1) against (batch,)
        values = K.reshape(values, (-1,))
        values_old = K.reshape(values_old, (-1,))
        returns = K.reshape(returns, (-1,))

        clipped_vest = K.clip(
            values,
            min_value=values_old - self.cfg['vest_clip'](self.step),
            max_value=values_old + self.cfg['vest_clip'](self.step))

        surrogate1 = K.square(values - returns)
        surrogate2 = K.square(clipped_vest - returns)

        # minimize the (clipped) squared error; note that some PPO
        # implementations take K.maximum of the two surrogates instead
        return K.mean(K.minimum(surrogate1, surrogate2))

    def _entropy_loss(self, mu, log_std):
        return -K.mean(self._get_dist(mu, log_std).entropy())

    def _reg_loss(self, model):
        if model.losses:
            return tf.math.add_n(model.losses)
        else:
            return 0.0

    def _train(self, states, actions, actions_dist, returns, advantages,
               v_ests):
        x_states, y_true_actions_dist = states, actions_dist
        y_true_actions, y_pred_vest_old, y_true_returns = actions, v_ests, returns
        old_log_std = tf.Variable(self.log_std_stateless.value(),
                                  dtype=tf.float32)

        sample_amt = len(self.action_memory)
        sample_range = np.arange(sample_amt)
        batches_amt = sample_amt // self.cfg['batchsize']

        if self.cfg['permutate']:
            np.random.shuffle(sample_range)

        for _ in range(self.cfg['epochs']):
            for i in range(batches_amt):
                if self.cfg['shuffle']:
                    np.random.shuffle(sample_range)
                    sample_idx = sample_range[:self.cfg['batchsize']]
                else:
                    sample_idx = sample_range[i *
                                              self.cfg['batchsize']:(i + 1) *
                                              self.cfg['batchsize']]

                batch_states = np.asarray([x_states[j] for j in sample_idx])
                batch_action_dist = np.asarray(
                    [y_true_actions_dist[j] for j in sample_idx])

                batch_y_true_actions = np.asarray(
                    [y_true_actions[j] for j in sample_idx])
                batch_y_true_returns = np.asarray(
                    [y_true_returns[j] for j in sample_idx])
                batch_advantage = np.asarray(
                    [advantages[j] for j in sample_idx])
                batch_y_pred_vest_old = np.asarray(
                    [y_pred_vest_old[j] for j in sample_idx])

                if self.cfg['normalize_advantages']:
                    batch_advantage = (batch_advantage -
                                       batch_advantage.mean()) / np.maximum(
                                           batch_advantage.std(),
                                           self.cfg['num_stab_advnorm'])

                with tf.GradientTape(persistent=True) as tape:
                    batch_y_pred_mu = self.actor(batch_states)
                    batch_y_pred_vest = self.critic(batch_states)

                    log_pi_new = self._get_dist(
                        batch_y_pred_mu,
                        self.log_std_stateless).log_prob(batch_y_true_actions)
                    log_pi_old = self._get_dist(
                        batch_action_dist,
                        old_log_std).log_prob(batch_y_true_actions)

                    # with multiple action dimensions, the per-dimension
                    # log-probs are summed: log p(a_0, ..., a_N) = sum_i log p(a_i)
                    # (independent Gaussians, continuous case only)
                    if not self.discrete:
                        log_pi_new = K.sum(log_pi_new, axis=-1)
                        log_pi_old = K.sum(log_pi_old, axis=-1)

                    # loss calculation
                    ppo_clip_loss = self._ppo_clip_loss(
                        log_pi_new=log_pi_new,
                        log_pi_old=log_pi_old,
                        advantage=batch_advantage)
                    entropy_loss = self.cfg['entropy_factor'](
                        self.step) * self._entropy_loss(
                            batch_y_pred_mu, self.log_std_stateless)
                    reg_loss_actor = self.cfg[
                        'actor_regloss_factor'] * self._reg_loss(self.actor)
                    actor_loss = ppo_clip_loss + entropy_loss + reg_loss_actor

                    value_loss = self.cfg[
                        'value_loss_factor'] * self._value_loss(
                            batch_y_pred_vest, batch_y_pred_vest_old,
                            batch_y_true_returns)
                    reg_loss_critic = self.cfg[
                        'critic_regloss_factor'] * self._reg_loss(self.critic)
                    critic_loss = value_loss + reg_loss_critic

                    # tensorboard logging
                    self.tb_actor_loss(actor_loss)
                    self.tb_ppo_loss(ppo_clip_loss)
                    self.tb_entropy_loss(entropy_loss)
                    self.tb_actor_regloss(reg_loss_actor)

                    self.tb_critic_loss(critic_loss)
                    self.tb_value_loss(value_loss)
                    self.tb_critic_regloss(reg_loss_critic)

                if not self.discrete:
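                    # the state-independent log std is a free tf.Variable and not
                    # part of actor.trainable_variables, so it gets its own step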
                    gradient = tape.gradient(actor_loss,
                                             [self.log_std_stateless])
                    self.actor_optimizer.apply_gradients(
                        zip(gradient, [self.log_std_stateless]))

                gradient = tape.gradient(actor_loss,
                                         self.actor.trainable_variables)
                gradient, _ = tf.clip_by_global_norm(
                    gradient, clip_norm=self.cfg['clip_policy_gradient_norm'])
                self.actor_optimizer.apply_gradients(
                    zip(gradient, self.actor.trainable_variables))

                gradient = tape.gradient(critic_loss,
                                         self.critic.trainable_variables)
                self.critic_optimizer.apply_gradients(
                    zip(gradient, self.critic.trainable_variables))

    def train(self):
        # calculate returns and advantages
        self.returns, self.advantages = self._calculate_returns_and_advantages(
            self.v_est_memory, self.reward_memory, self.not_done_memory)

        # train agent
        self._train(self.state_memory, self.action_memory,
                    self.action_dist_memory, self.returns, self.advantages,
                    self.v_est_memory)
        self._log_training()
        self._reset_memory()

    def _log_training(self):
        with self.train_summary_writer.as_default():
            # log losses
            tf.summary.scalar('actor_losses/total_loss',
                              self.tb_actor_loss.result(),
                              step=self.step)
            tf.summary.scalar('actor_losses/ppo_loss',
                              self.tb_ppo_loss.result(),
                              step=self.step)
            tf.summary.scalar('actor_losses/entropy_loss',
                              self.tb_entropy_loss.result(),
                              step=self.step)
            tf.summary.scalar('actor_losses/reg_loss',
                              self.tb_actor_regloss.result(),
                              step=self.step)

            tf.summary.scalar('critic_losses/total_loss',
                              self.tb_critic_loss.result(),
                              step=self.step)
            tf.summary.scalar('critic_losses/value_loss',
                              self.tb_value_loss.result(),
                              step=self.step)
            tf.summary.scalar('critic_losses/reg_loss',
                              self.tb_critic_regloss.result(),
                              step=self.step)

            # log returns and advantages
            tf.summary.scalar('env_metrics/avg_returns_per_step',
                              np.average(self.returns),
                              step=self.step)
            tf.summary.scalar('env_metrics/avg_advantages_per_step',
                              np.average(self.advantages),
                              step=self.step)
            tf.summary.histogram('env_metrics/returns_per_step',
                                 self.returns,
                                 step=self.step)
            tf.summary.histogram('env_metrics/advantages_per_step',
                                 self.advantages,
                                 step=self.step)

            # log optimizer statistics
            tf.summary.scalar('optimizer/actor_lr',
                              self.actor_optimizer._decayed_lr(tf.float32),
                              step=self.step)
            tf.summary.scalar('optimizer/critic_lr',
                              self.critic_optimizer._decayed_lr(tf.float32),
                              step=self.step)

        self.tb_actor_loss.reset_states()
        self.tb_ppo_loss.reset_states()
        self.tb_entropy_loss.reset_states()
        self.tb_actor_regloss.reset_states()

        self.tb_critic_loss.reset_states()
        self.tb_value_loss.reset_states()
        self.tb_critic_regloss.reset_states()

    def _log_episode(self, observations, actions, scores, episode, step):
        epscore = foldl(operator.add, scores)
        with self.train_summary_writer.as_default():
            tf.summary.scalar('env_metrics/episode_score_per_step',
                              epscore,
                              step=step)
            tf.summary.scalar('env_metrics/episode_score_per_episode',
                              epscore,
                              step=episode)
            tf.summary.histogram('env_metrics/rewards_per_episode',
                                 scores,
                                 step=episode)

            # observations logging
            obs = np.asarray(observations)
            for i in range(obs.shape[1]):
                tf.summary.histogram(
                    f'env_metrics_obs/observation_{i}_per_episode',
                    obs[:, i],
                    step=episode)

            # action logging
            if self.discrete:
                pass  # TODO log actions for discrete envs
            else:
                acts = np.asarray(actions)
                for i in range(acts.shape[1]):
                    tf.summary.histogram(
                        f'env_metrics_acts/action_{i}_per_episode',
                        acts[:, i],
                        step=episode)

            # std logging
            for i in range(self.log_std_stateless.shape[0]):
                tf.summary.scalar(
                    f'env_metrics_acts/std_action_{i}_per_episode',
                    np.exp(self.log_std_stateless[i]),
                    step=step)

    def learn(self):
        s, episode, done = self.env.reset(), 0, False
        observations, actions, scores = [], [], []

        for self.step in tqdm(range(self.cfg['total_steps'])):
            # choose and take an action, advance environment and store data
            self.env.render()
            observations.append(self.env.unnormalize_obs(s))

            scaled_a, unscaled_a, a_dist = self.actor_choose(s)
            actions.append(unscaled_a)

            s_, r, done, _ = self.env.step(scaled_a)
            scores.append(self.env.unnormalize_reward(r))

            v_est = self.critic_evaluate(s)

            if self.cfg['clip_eplength'] and len(
                    observations) > self.cfg['clip_eplength'](self.step):
                done = True

            self.store_transition(s, unscaled_a, a_dist, r, v_est, not done)
            s = s_

            # reset the environment when the episode terminates
            if done:
                self._log_episode(observations, actions, scores, episode,
                                  self.step)
                s, done = self.env.reset(), False
                observations, actions, scores = [], [], []
                episode += 1

            if (self.step % self.cfg['model_save_interval'] == 0
                    or self.step == self.cfg['total_steps'] - 1):
                self.save_model(self.train_log_dir)
                self.save_model(self.train_log_dir)

            if self.step % self.cfg['rollout'] == 0 and self.step > 0:
                self.cfg['adam_actor_alpha'].update_rollout_step(self.step)
                self.cfg['adam_critic_alpha'].update_rollout_step(self.step)

                self.last_vest_buffer = self.critic_evaluate(s_)
                self.train()
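

# Minimal, self-contained sketch (not part of the agent above): illustrates the
# return / advantage recursions used in _calculate_returns_and_advantages on
# toy data. `_scanr` is an assumed stand-in for the project's `npscanr` helper
# (a right scan that returns every intermediate accumulator value).
if __name__ == '__main__':
    import numpy as np

    def _scanr(fn, init, xs):
        acc, out = init, []
        for x in reversed(xs):
            acc = fn(acc, x)
            out.append(acc)
        return np.asarray(out[::-1])

    gamma, lam = 0.99, 0.95
    rewards = np.array([1.0, 0.0, 1.0])
    not_dones = np.array([1.0, 1.0, 0.0])
    values = np.array([0.5, 0.4, 0.6, 0.0])  # V(s_0..s_2) plus bootstrap V(s_3)

    discounts = gamma * not_dones
    # R_t = r_t + gamma * (1 - done_t) * R_{t+1}
    returns = _scanr(lambda acc, rd: acc * rd[1] + rd[0], values[-1],
                     list(zip(rewards, discounts)))
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t); A_t = delta_t + gamma*lam*A_{t+1}
    deltas = rewards + discounts * values[1:] - values[:-1]
    advantages = _scanr(lambda acc, dw: acc * dw[1] + dw[0], 0.0,
                        list(zip(deltas, discounts * lam)))
    print('returns:', returns)
    print('advantages:', advantages)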
Example No. 2
        if batch % 150 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(
                epoch, batch, train_loss.result()))

    # print()
    # print(summarize(clean_words(documents_val[i1])))
    # print(summarize(clean_words(documents_val[i2])))
    # print(summarize(clean_words(documents_val[i3])))
    # print(summarize(clean_words(documents_val[i4])))
    # print()

    val_loss_ = validate().numpy()
    history['val'].append((epoch, val_loss_))
    print('\n* Train Loss {:.4f}'.format(train_loss.result()))
    history['train'].append((epoch, train_loss.result().numpy()))

    if best_val_loss - val_loss_ > 0.05:
        ckpt_save_path4 = ckpt_manager4.save()
        print('\nSaving checkpoint for epoch {} at {}'.format(
            epoch, ckpt_save_path4))
        best_val_loss = val_loss_

    hist(history)
    print('Current Lr: ', optimizer2._decayed_lr('float32').numpy())
    print('\nTime taken for this epoch: {:.2f} secs\n'.format(time.time() -
                                                              start))
    print('=' * 40)

# save weights
transformer.save_weights(
    'drive/My Drive/Doc_Sum/Transformer/models/CNN_Dailymail.h5')
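
# Minimal restore sketch (illustrative only): weights saved with save_weights()
# contain parameters but not the architecture, so the model must be rebuilt
# with the identical layer structure before calling load_weights(). A generic
# toy model is used here, not the project's Transformer.
import tensorflow as tf

def build_toy_model():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation='relu', input_shape=(4,)),
        tf.keras.layers.Dense(1),
    ])

toy = build_toy_model()
toy.save_weights('toy_weights.h5')

restored = build_toy_model()  # must match the saved architecture
restored.load_weights('toy_weights.h5')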