Example #1
    def finish_path(self, last_val=0):
        """
        Call this at the end of a trajectory, or when one gets cut off
        by an epoch ending. This looks back in the buffer to where the
        trajectory started, and uses rewards and value estimates from
        the whole trajectory to compute advantage estimates with GAE-Lambda,
        as well as compute the rewards-to-go for each state, to use as
        the targets for the value function.

        The "last_val" argument should be 0 if the trajectory ended
        because the agent reached a terminal state (died), and otherwise
        should be V(s_T), the value function estimated for the last state.
        This allows us to bootstrap the reward-to-go calculation to account
        for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
        """

        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)
        
        # the next two lines implement GAE-Lambda advantage calculation
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)
        
        # the next line computes rewards-to-go, to be targets for the value function
        self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]
        
        self.path_start_idx = self.ptr
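
All of the examples in this section call a core.discount_cumsum helper. As a point of reference, the following is a minimal sketch of what such a helper typically computes, namely the discounted cumulative sum used to turn deltas into GAE advantages and rewards into rewards-to-go. The scipy.signal.lfilter trick shown here is one common implementation and should be read as an assumption about core, not a copy of its actual contents.

import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    """Discounted cumulative sum along the first axis.

    Given x = [x0, x1, x2], returns
        [x0 + discount * x1 + discount**2 * x2,
         x1 + discount * x2,
         x2]
    """
    # Filtering the reversed sequence with these coefficients computes
    # y[t] = x[t] + discount * y[t+1]; reversing again restores the order.
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]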
Example #2
 def finish_path(self, last_val=0):
     path_slice = slice(self.path_start_idx, self.ptr)
     rews = np.append(self.rew_buf[path_slice], last_val)
     vals = np.append(self.val_buf[path_slice], last_val)
     deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
     self.adv_buf[path_slice] = core.discount_cumsum(
         deltas, self.gamma * self.lam)
     self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]
     self.path_start_idx = self.ptr
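
The docstring in Example #1 explains when last_val should be 0 versus V(s_T). A hypothetical driver loop that applies that rule at episode ends and epoch cutoffs is sketched below; env, ac (an actor-critic with a step method), and buf are illustrative names following Spinning Up-style interfaces, not objects taken from the examples above.

import torch

def run_epoch(env, ac, buf, steps_per_epoch, max_ep_len):
    # Hypothetical collection loop; env, ac, and buf are assumed to follow
    # Spinning Up-style interfaces (ac.step returns action, value, logp).
    o, ep_len = env.reset(), 0
    for t in range(steps_per_epoch):
        a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))
        next_o, r, d, _ = env.step(a)
        buf.store(o, a, r, v, logp)
        o, ep_len = next_o, ep_len + 1

        timeout = ep_len == max_ep_len
        terminal = d or timeout
        epoch_ended = t == steps_per_epoch - 1
        if terminal or epoch_ended:
            if timeout or epoch_ended:
                # Trajectory cut off before a true terminal state:
                # bootstrap the return with the critic's value estimate.
                _, last_val, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
            else:
                # The agent reached a terminal state: no bootstrap value.
                last_val = 0
            buf.finish_path(last_val)
            o, ep_len = env.reset(), 0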
Example #3
    def finish_path(self, last_value=0):
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_value)
        vals = np.append(self.val_buf[path_slice], last_value)

        # The next two lines implement GAE-Lambda advantage calculation
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)

        # The next line computes the rewards-to-go, to be the targets for the value function
        self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]

        self.path_start_idx = self.ptr
Example #4
    def finish_path(self, last_val=0):
        path_slice = slice(self.path_start_index, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        # The next two lines implement GAE Lambda
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = core.discount_cumsum(
            deltas, self.gamma * self.lam)

        # The next line computes rewards to go
        self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]

        self.path_start_index = self.ptr
Example #5
    def finish_path(self, last_val=0):
        """
            Call this at the end of a trajectory; it computes the advantage at
            each timestep and combines them into the GAE-Lambda estimate.
            If the trajectory ended because the agent reached a terminal state
            (failure), last_val should be 0; otherwise last_val should be V(s_T).
            This allows us to bootstrap the reward-to-go calculation to account
            for time-steps beyond the arbitrary episode horizon (or epoch cutoff).
        """
        path_slice = slice(self.path_start_idx, self.ptr)  # slice from the trajectory start to ptr
        rews = np.append(self.rew_buf[path_slice], last_val)  # rewards for this trajectory
        vals = np.append(self.val_buf[path_slice], last_val)  # value estimates for this trajectory

        # GAE-Lambda advantage calculation, used to update the actor
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = core.discount_cumsum(
            deltas, self.gamma * self.lam)

        # the next line computes rewards-to-go, to be targets for the value function (used to update the critic)
        self.ret_buf[path_slice] = core.discount_cumsum(
            rews, self.gamma)[:-1]  # the value-function targets are Monte Carlo returns
        self.path_start_idx = self.ptr
Example #6
def evaluateq(actor, env, eval_episodes=None, critic=None):
    """
    Computes the score of an actor over a given number of evaluation runs,
    and calculates the error between the critic's estimate (evaluate_q) and
    the true Q value (true_q), where true_q is computed via Monte Carlo.
    """
    global test_num

    def policy(state):
        state = FloatTensor(state.reshape(-1))
        action = actor(state).cpu().data.numpy().flatten()
        return np.clip(action, -max_action, max_action)

    scores = []
    for _ in range(eval_episodes):
        steps = 0
        score = 0
        obs = deepcopy(env.reset())

        # Evaluate the critic's Q estimate for the first (state, action) pair.
        action = policy(obs)
        obs_t = torch.unsqueeze(FloatTensor(obs), 0)
        act_t = torch.unsqueeze(FloatTensor(action), 0)
        qeval = critic(obs_t, act_t)[0]
        qeval = torch.squeeze(qeval, 0).cpu().data.numpy()[0]
        rewList = []

        done = False
        while not (done or (steps == env._max_episode_steps)):
            # get next action and act
            action = policy(obs)
            n_obs, reward, done, _ = env.step(action)
            rewList.append(reward)
            score += reward
            steps += 1
            obs = n_obs
            if done:
                env.reset()
        rew_array = np.array(rewList)
        qmc = core.discount_cumsum(rew_array, 0.99)[0]
        scores.append(score)
        qerr = qeval - qmc
        Qevaluations.append([qeval, qmc, qerr, score, test_num])
        test_num += 1
        if (test_num + 1) > 980:
            test = pd.DataFrame(
                columns=['Qeval', 'Qmc', 'Qerr', 'EvaluationReturn', 'Epoch'],
                data=Qevaluations)
            test.to_csv(goal_path + '/' + 'progress.csv')
    return scores
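
Example #6 uses core.discount_cumsum(rew_array, 0.99)[0] as the Monte Carlo estimate of the first state-action value. A small sanity check of that identity, written against the discount_cumsum sketch above (not the actual core module), could look like this:

import numpy as np

# Verify that the first element of the discounted cumulative sum equals the
# plain Monte Carlo return sum_t gamma**t * r_t for a toy reward sequence.
gamma = 0.99
rews = np.array([1.0, 0.0, 2.0, -1.0])
mc_return = sum(gamma ** t * r for t, r in enumerate(rews))
assert np.isclose(discount_cumsum(rews, gamma)[0], mc_return)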