def run_policy(env, policy, scaler, num_episodes, max_timesteps, mode):
    """ Run the policy for num_episodes episodes and collect trajectories;
    the observation scaler is only updated when mode == 'save'. """

    total_steps = 0
    trajectories = []
    traj_len_list = []

    for itr in range(num_episodes):
        observes, actions, rewards, unscaled_obs = run_episode(
            env, policy, scaler, max_timesteps=max_timesteps)

        total_steps += observes.shape[0]

        traj_len_list.append(len(observes))

        trajectory = {
            'observes': observes,
            'actions': actions,
            'rewards': rewards,
            'unscaled_obs': unscaled_obs
        }
        trajectories.append(trajectory)

    unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
    if mode == 'save':  # only update the scaler while training the policy, to avoid biasing evaluation
        scaler.update(unscaled)
    logger.record_dicts({
        "_MeanReward": np.mean([t['rewards'].sum() for t in trajectories]),
        'Steps': total_steps,
    })

    return trajectories, traj_len_list
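
A minimal self-contained sketch (not part of the source) of the aggregation run_policy performs after the episode loop, using NumPy only; the trajectory dicts below are synthetic stand-ins for what run_episode would return:

import numpy as np

# Synthetic trajectories with the same keys run_policy collects.
rng = np.random.default_rng(0)
trajectories = [
    {'observes': rng.normal(size=(50, 4)),
     'actions': rng.normal(size=(50, 2)),
     'rewards': rng.normal(size=(50,)),
     'unscaled_obs': rng.normal(size=(50, 4))}
    for _ in range(3)
]

# The same statistics the function logs: total steps and mean un-discounted return.
total_steps = sum(t['observes'].shape[0] for t in trajectories)
mean_reward = np.mean([t['rewards'].sum() for t in trajectories])

# The concatenated unscaled observations are what feed scaler.update() in 'save' mode.
unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
print(total_steps, mean_reward, unscaled.shape)
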
    def update(self, observes, actions, advantages, use_lr_adjust, ada_kl_penalty):
        """ Update policy based on observations, actions and advantages

        Args:
            observes: observations, shape = (N, obs_dim)
            actions: actions, shape = (N, act_dim)
            advantages: advantages, shape = (N,)
            phi_value: phi_value, shape = (N,)
            phi_act_g: phi_act_g, shape = (N, act_dim)
        """
        feed_dict = {self.obs_ph: observes,
                     self.act_ph: actions,
                     self.advantages_ph: advantages,
                     self.beta_ph: self.beta,
                     self.eta_ph: self.eta,
                     self.lr_ph: self.lr * self.lr_multiplier,
                     self.lr_phi_ph: self.lr_phi}
        old_means_np, old_log_vars_np = self.sess.run([self.means, self.log_vars],
                                                      feed_dict)
        feed_dict[self.old_log_vars_ph] = old_log_vars_np
        feed_dict[self.old_means_ph] = old_means_np
        loss, kl, entropy = 0, 0, 0
        
        if self.c_ph == 1.:
            # Update phi function & policy network
            logger.log("Training Phi for %d epochs"%self.phi_epochs)
            
            for _ in progressbar(range(self.phi_epochs), "Train Phi:", 25):
                self.sess.run(self.phi_train_op, feed_dict)
                phi_loss = self.sess.run(self.phi_loss, feed_dict)

            logger.record_tabular("Phi_loss", phi_loss)
        
        # Training policy
        logger.log("Training Policy for %d epochs"%self.epochs)
        for _ in progressbar(range(self.epochs), "Train Policy", 25):
            self.sess.run(self.train_op, feed_dict)
            loss, kl, entropy = self.sess.run([self.loss, self.kl, self.entropy], feed_dict)
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break

        if ada_kl_penalty:
            if kl > self.kl_targ * 2:  # servo beta to reach D_KL target
                self.beta = np.minimum(35, 1.5 * self.beta)  # max clip beta
                if use_lr_adjust and self.beta > 30 and self.lr_multiplier > 0.1:
                    self.lr_multiplier /= 1.5
            elif kl < self.kl_targ / 2:
                self.beta = np.maximum(1 / 35, self.beta / 1.5)  # min clip beta
                if use_lr_adjust and self.beta < (1 / 30) and self.lr_multiplier < 10:
                    self.lr_multiplier *= 1.5

        logger.record_dicts({
            'PolicyLoss': loss,
            'PolicyEntropy': entropy,
            'KL': kl,
            'Beta': self.beta,
            '_lr_multiplier': self.lr_multiplier})
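
The adaptive KL penalty above can be read in isolation. The sketch below is illustrative (not part of the source) and reproduces the same beta / learning-rate servo rule with the clipping constants used in update():

import numpy as np

def adapt_kl_penalty(kl, kl_targ, beta, lr_multiplier, use_lr_adjust=True):
    """Servo beta (and optionally the learning-rate multiplier) toward the KL
    target: beta is clipped to [1/35, 35] and the multiplier scales by 1.5x
    once beta approaches either limit."""
    if kl > kl_targ * 2:                      # KL too large: strengthen the penalty
        beta = np.minimum(35, 1.5 * beta)
        if use_lr_adjust and beta > 30 and lr_multiplier > 0.1:
            lr_multiplier /= 1.5
    elif kl < kl_targ / 2:                    # KL too small: relax the penalty
        beta = np.maximum(1 / 35, beta / 1.5)
        if use_lr_adjust and beta < (1 / 30) and lr_multiplier < 10:
            lr_multiplier *= 1.5
    return beta, lr_multiplier

# Example: the measured KL overshoots the target, so beta grows.
print(adapt_kl_penalty(kl=0.05, kl_targ=0.01, beta=1.0, lr_multiplier=1.0))  # (1.5, 1.0)
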
    def update(self, load_policy, observes, actions,
               advantages, use_lr_adjust, ada_kl_penalty, c=1):
        """ Train the phi network and, when load_policy == 'save', also update
        the policy network; c is fed to the c_ph placeholder. """
 
        feed_dict = {self.obs_ph: observes,
                     self.act_ph: actions,
                     self.advantages_ph: advantages,
                     self.beta_ph: self.beta,
                     self.eta_ph: self.eta,
                     self.lr_ph: self.lr * self.lr_multiplier,
                     self.lr_phi_ph: self.lr_phi,
                     self.c_ph:c}

        old_means_np, old_log_vars_np = self.sess.run([self.means, self.log_vars],
                                                      feed_dict)
        feed_dict[self.old_log_vars_ph] = old_log_vars_np
        feed_dict[self.old_means_ph] = old_means_np
        loss, kl, entropy = 0, 0, 0

        for _ in range(self.phi_epochs):
            self.sess.run(self.phi_train_op, feed_dict)
        
        if load_policy == 'save':
            for e in range(self.epochs):
                self.sess.run(self.train_op, feed_dict)
                loss, kl, entropy = self.sess.run([self.loss, 
                        self.kl, self.entropy], feed_dict)
                if kl > self.kl_targ * 4:  
                    break
          
            if ada_kl_penalty:
                if kl > self.kl_targ * 2:  # servo beta to reach D_KL target
                    self.beta = np.minimum(35, 1.5 * self.beta)  # max clip beta
                    if use_lr_adjust and self.beta > 30 and self.lr_multiplier > 0.1:
                        self.lr_multiplier /= 1.5
                elif kl < self.kl_targ / 2:
                    self.beta = np.maximum(1 / 35, self.beta / 1.5)  # min clip beta
                    if use_lr_adjust and self.beta < (1 / 30) and self.lr_multiplier < 10:
                        self.lr_multiplier *= 1.5

            logger.record_dicts({
                'PolicyLoss': loss,
                'PolicyEntropy': entropy,
                'KL': kl,
                'Beta': self.beta,
                '_lr_multiplier': self.lr_multiplier})
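
A hedged usage sketch of this variant; the shapes, hyperparameter values and the policy object are hypothetical placeholders. The c argument is fed to self.c_ph, and load_policy == 'save' selects the branch that also trains the policy network:

# Hypothetical shapes, with obs_dim / act_dim taken from the environment:
#   observes:   (N, obs_dim) scaled observations
#   actions:    (N, act_dim) sampled actions
#   advantages: (N,)         advantage estimates
#
# policy.update(load_policy='save',       # also trains the policy network
#               observes=observes, actions=actions, advantages=advantages,
#               use_lr_adjust=True, ada_kl_penalty=True,
#               c=1)                      # value fed to self.c_ph
#
# Any other load_policy value only runs the phi training epochs and skips the
# policy update and the KL-penalty adaptation.
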
Example n. 4
def run_policy(env, policy, scaler, batch_size, max_timesteps):
    """ Run policy and collect data for a minimum of min_steps and min_episodes

    Args:
        env: ai gym environment
        policy: policy object with sample() method
        scaler: scaler object, used to scale/offset each observation dimension
            to a similar range
        episodes: total episodes to run
        max_timesteps: max timesteps per episode to run

    Returns: list of trajectory dictionaries, list length = number of episodes
        'observes' : NumPy array of states from episode
        'actions' : NumPy array of actions from episode
        'rewards' : NumPy array of (un-discounted) rewards from episode
        'unscaled_obs' : NumPy array of (un-discounted) rewards from episode
    """
    total_steps = 0
    trajectories = []

    while total_steps < batch_size:
        observes, actions, rewards, unscaled_obs = run_episode(
            env, policy, scaler, max_timesteps=max_timesteps)
        total_steps += observes.shape[0]
        trajectory = {
            'observes': observes,
            'actions': actions,
            'rewards': rewards,
            'unscaled_obs': unscaled_obs
        }
        trajectories.append(trajectory)

    unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
    scaler.update(unscaled)  # update running statistics for scaling observations

    logger.record_dicts({
        "_MeanReward": np.mean([t['rewards'].sum() for t in trajectories]),
        'Steps': total_steps,
    })

    return trajectories
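
Both run_policy variants assume a run_episode helper with the interface sketched below. This stub is illustrative only (the sizes are hypothetical) and documents the expected return shapes rather than the real rollout logic:

import numpy as np

def run_episode_stub(env, policy, scaler, max_timesteps):
    """Illustrative stand-in for run_episode: four NumPy arrays with one row
    per timestep, matching what run_policy unpacks."""
    steps, obs_dim, act_dim = 10, 4, 2          # hypothetical sizes
    observes = np.zeros((steps, obs_dim))       # scaled observations
    actions = np.zeros((steps, act_dim))        # sampled actions
    rewards = np.zeros(steps)                   # un-discounted rewards
    unscaled_obs = np.zeros((steps, obs_dim))   # raw observations for scaler.update
    return observes, actions, rewards, unscaled_obs
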
def log_batch_stats(observes, actions, advantages, disc_sum_rew, episode):
    """ Log summary statistics of a training batch (observations, actions,
    advantages and discounted returns) via the project's logger. """

    logger.record_dicts({
        '_mean_obs': np.mean(observes),
        '_min_obs': np.min(observes),
        '_max_obs': np.max(observes),
        '_mean_act': np.mean(actions),
        '_max_act': np.max(actions),
        '_std_act': np.mean(np.var(actions, axis=0)),
        '_mean_adv': np.mean(advantages),
        '_min_adv': np.min(advantages),
        '_max_adv': np.max(advantages),
        '_std_adv': np.var(advantages),
        '_mean_discrew': np.mean(disc_sum_rew),
        '_min_discrew': np.min(disc_sum_rew),
        '_max_discrew': np.max(disc_sum_rew),
        '_std_discrew': np.var(disc_sum_rew)})
    
    logger.dump_tabular()
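
A short usage example for log_batch_stats with synthetic arrays; the shapes are arbitrary and the project's logger is assumed to be configured:

import numpy as np

rng = np.random.default_rng(0)
log_batch_stats(observes=rng.normal(size=(100, 4)),
                actions=rng.normal(size=(100, 2)),
                advantages=rng.normal(size=(100,)),
                disc_sum_rew=rng.normal(size=(100,)),
                episode=1)
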
Example n. 6
    def fit(self, x, y):
        """ Fit model to current data batch + previous data batch

        Args:
            x: features
            y: target
        """
        num_batches = max(x.shape[0] // 256, 1)
        batch_size = x.shape[0] // num_batches
        y_hat = self.predict(x)  # check explained variance prior to update
        old_exp_var = 1 - np.var(y - y_hat) / np.var(y)
        if self.replay_buffer_x is None:
            x_train, y_train = x, y
        else:
            x_train = np.concatenate([x, self.replay_buffer_x])
            y_train = np.concatenate([y, self.replay_buffer_y])
        self.replay_buffer_x = x
        self.replay_buffer_y = y
        for e in range(self.epochs):
            x_train, y_train = shuffle(x_train, y_train)
            for j in range(num_batches):
                start = j * batch_size
                end = (j + 1) * batch_size
                feed_dict = {
                    self.obs_ph: x_train[start:end, :],
                    self.val_ph: y_train[start:end]
                }
                _, l = self.sess.run([self.train_op, self.loss],
                                     feed_dict=feed_dict)
        y_hat = self.predict(x)
        loss = np.mean(np.square(y_hat - y))  # mean squared error after update
        exp_var = 1 - np.var(y - y_hat) / np.var(y)  # diagnose over-fitting of val func

        logger.record_dicts({
            'VarFuncLoss': loss,
            'ExplainedVarNew': exp_var,
            'ExplainedVarOld': old_exp_var
        })
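
The explained-variance diagnostic computed before and after the fit can be checked in isolation. A minimal NumPy sketch of the same formula (1 means a perfect fit, 0 means no better than predicting the mean):

import numpy as np

def explained_variance(y, y_hat):
    """Same diagnostic as in fit(): 1 - Var(residual) / Var(target)."""
    return 1.0 - np.var(y - y_hat) / np.var(y)

y = np.array([1.0, 2.0, 3.0, 4.0])
print(explained_variance(y, y))                          # 1.0: perfect predictions
print(explained_variance(y, np.full_like(y, y.mean())))  # 0.0: constant predictor

Note that fit() also trains on the previous batch (replay_buffer_x / replay_buffer_y) concatenated with the current one, which smooths value-function updates across successive policy iterations.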