Example #1
def update_hyperstate(agent, hyperstate_params, hyperparameters_state,
                      hyperparameters_reward, datum, learn_diff):
    state, action, reward, next_state, _ = [
        np.atleast_2d(np.copy(dat)) for dat in datum
    ]
    Llower_state, Xy_state, Llower_reward, Xy_reward = hyperstate_params

    state_action = np.concatenate([state, action], axis=-1)
    state_ = next_state - state if learn_diff else next_state

    basis_state = _basis(state_action, agent.random_matrix_state,
                         agent.bias_state, agent.basis_dim_state,
                         hyperparameters_state[0], hyperparameters_state[1])
    # cholupdate expects an upper-triangular factor, so transpose each
    # factor in the batch, apply the rank-1 update in place, and
    # transpose back.
    Llower_state = Llower_state.transpose([0, 2, 1])
    for i in range(len(Llower_state)):
        cholupdate(Llower_state[i], basis_state[i].copy())
    Llower_state = Llower_state.transpose([0, 2, 1])
    # Accumulate the Phi^T y sufficient statistics for the state model.
    Xy_state += np.matmul(basis_state[..., None, :].transpose([0, 2, 1]),
                          state_[..., None, :])

    basis_reward = _basis(state_action, agent.random_matrix_reward,
                          agent.bias_reward, agent.basis_dim_reward,
                          hyperparameters_reward[0], hyperparameters_reward[1])
    # Same rank-1 update and Phi^T y accumulation for the reward model.
    Llower_reward = Llower_reward.transpose([0, 2, 1])
    for i in range(len(Llower_reward)):
        cholupdate(Llower_reward[i], basis_reward[i].copy())
    Llower_reward = Llower_reward.transpose([0, 2, 1])
    Xy_reward += np.matmul(basis_reward[..., None, :].transpose([0, 2, 1]),
                           reward[..., None, :])

    return [Llower_state, Xy_state, Llower_reward, Xy_reward]
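Both updates above assume a cholupdate helper that performs an in-place rank-1 update of an upper-triangular Cholesky factor (hence the transposes around each call). The repo presumably ships its own; the following is a minimal NumPy sketch of the standard algorithm, offered as an assumption rather than the repo's definition. Both R and x are mutated in place.

import numpy as np

def cholupdate(R, x):
    """In-place rank-1 Cholesky update: R^T R  ->  R^T R + x x^T.

    R must be upper triangular; both R and x are overwritten.
    """
    n = x.size
    for k in range(n):
        r = np.hypot(R[k, k], x[k])  # updated diagonal entry
        c = r / R[k, k]
        s = x[k] / R[k, k]
        R[k, k] = r
        if k + 1 < n:
            R[k, k + 1:] = (R[k, k + 1:] + s * x[k + 1:]) / c
            x[k + 1:] = c * x[k + 1:] - s * R[k, k + 1:]
    return R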
Example #2
    def _reward(self, state, action, sess, Llower, Xy, hyperparameters):
        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(sess, state, action)
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(state, action)
        else:
            state_action = np.concatenate([state, action], axis=-1)
            length_scale, signal_sd, noise_sd, prior_sd = hyperparameters
            basis = _basis(state_action, self.random_matrices[-1],
                           self.biases[-1], self.basis_dims[-1], length_scale,
                           signal_sd)
            if self.use_mean_reward == 1:
                # Use the predictive mean only: zero out the predictive noise.
                pred_sigma = np.zeros([len(basis), 1])
            else:
                # Predictive variance via a triangular solve against the
                # lower Cholesky factor: noise_sd^2 * (1 + ||L^{-1} phi||^2).
                LinvXT = scipy.linalg.solve_triangular(Llower,
                                                       basis.T,
                                                       lower=True)
                pred_sigma = np.sum(np.square(LinvXT),
                                    axis=0) * noise_sd**2 + noise_sd**2
                pred_sigma = pred_sigma[..., np.newaxis]
            # Predictive mean: phi^T A^{-1} Phi^T y, with A = L L^T.
            tmp0 = scipy.linalg.solve_triangular(Llower, basis.T, lower=True).T
            tmp1 = scipy.linalg.solve_triangular(Llower, Xy, lower=True)
            pred_mu = np.matmul(tmp0, tmp1)

            # pred_sigma is a variance, so take its square root for the
            # standard deviation expected by np.random.normal.
            reward = np.stack([
                np.random.normal(loc=loc, scale=np.sqrt(scale))
                for loc, scale in zip(pred_mu, pred_sigma)
            ], axis=0)
        return reward
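The triangular solves in the else branch implement the standard Bayesian linear regression predictions against a precomputed Cholesky factor L of A = (noise_sd/prior_sd)^2 I + Phi^T Phi: pred_mu = phi A^{-1} Phi^T y and pred_sigma = noise_sd^2 (1 + phi A^{-1} phi^T). A small self-contained check of those identities (all names below are illustrative, not from the repo):

import numpy as np
import scipy.linalg

rng = np.random.default_rng(0)
Phi = rng.normal(size=(50, 8))   # training features
y = rng.normal(size=(50, 1))     # training targets
phi = rng.normal(size=(3, 8))    # test features
noise_sd, prior_sd = 0.1, 1.0

A = (noise_sd / prior_sd)**2 * np.eye(8) + Phi.T @ Phi
Llower = np.linalg.cholesky(A)
Xy = Phi.T @ y

# Mean and variance via triangular solves, as in the snippet above.
Linv_phi = scipy.linalg.solve_triangular(Llower, phi.T, lower=True)
pred_mu = Linv_phi.T @ scipy.linalg.solve_triangular(Llower, Xy, lower=True)
pred_sigma = noise_sd**2 * (1.0 + np.sum(Linv_phi**2, axis=0))

# Both agree with the direct dense solves.
assert np.allclose(pred_mu, phi @ np.linalg.solve(A, Xy))
assert np.allclose(pred_sigma,
                   noise_sd**2 *
                   (1.0 + np.sum(phi * np.linalg.solve(A, phi.T).T, axis=1)))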
Example #3
    def _update_hyperstate(self, X, y, update_hyperstate):
        if update_hyperstate:
            basis = _basis(X, self.random_matrix, self.bias, self.basis_dim,
                           self.length_scale, self.signal_sd)

            self.Llower_tiled = self.Llower_tiled.transpose([0, 2, 1])
            assert len(self.Llower_tiled) == len(basis)
            for i in range(len(self.Llower_tiled)):
                cholupdate(self.Llower_tiled[i], basis[i].copy())
            self.Llower_tiled = self.Llower_tiled.transpose([0, 2, 1])

            # Accumulate the Phi^T y sufficient statistics.
            self.Xy_tiled += np.matmul(
                basis[:, None, :].transpose([0, 2, 1]), y[:, None, :])
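_basis itself does not appear in any of these examples. Given the random_matrix, bias, length_scale, and signal_sd arguments, the most plausible reading is a random Fourier feature map approximating an RBF kernel; the sketch below is a guess at its shape, not the repo's definition, and the array shapes in the docstring are assumptions.

import numpy as np

def _basis(X, random_matrix, bias, basis_dim, length_scale, signal_sd):
    """Hypothetical random-Fourier-feature map.

    Assumed shapes: X (batch, d), random_matrix (d, basis_dim),
    bias (basis_dim,). Returns (batch, basis_dim) features.
    """
    proj = X @ random_matrix / length_scale
    return signal_sd * np.sqrt(2.0 / basis_dim) * np.cos(proj + bias)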
Example #4
    def _predict(self, X, update_hyperstate):
        if update_hyperstate:
            basis = _basis(X, self.random_matrix, self.bias, self.basis_dim,
                           self.length_scale, self.signal_sd)
            basis = np.expand_dims(basis, axis=1)

            # One triangular solve serves both the variance and the mean.
            LinvXT = solve_triangular(self.Llower_tiled,
                                      np.transpose(basis, [0, 2, 1]))
            pred_sigma = np.sum(np.square(LinvXT),
                                axis=1) * self.noise_sd**2 + self.noise_sd**2
            tmp0 = np.transpose(LinvXT, [0, 2, 1])
            tmp1 = solve_triangular(self.Llower_tiled, self.Xy_tiled)
            pred_mu = np.matmul(tmp0, tmp1)
            pred_mu = np.squeeze(pred_mu, axis=-1)
            return pred_mu, pred_sigma
        else:
            return RegressionWrapper._predict(self, X)
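Note that solve_triangular is called here on stacked (batch, n, n) factors, which scipy.linalg.solve_triangular does not accept, so the repo presumably defines a batched wrapper. A minimal sketch of such a wrapper, assuming lower-triangular factors:

import numpy as np
import scipy.linalg

def solve_triangular(L_batch, B_batch):
    """Batch-wise lower-triangular solve: stacks L_i^{-1} B_i over i."""
    return np.stack([
        scipy.linalg.solve_triangular(L, B, lower=True)
        for L, B in zip(L_batch, B_batch)
    ], axis=0)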
Example #5
    def _reward(self, state, action, state_action, sess, Llower, Xy,
                hyperparameters):
        basis = None
        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(sess, state, action)
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(state, action)
        else:
            length_scale, signal_sd, noise_sd, prior_sd = hyperparameters
            basis = _basis(state_action, self.random_matrix_reward,
                           self.bias_reward, self.basis_dim_reward,
                           length_scale, signal_sd)
            basis = basis[:, None, ...]
            mu, sigma = self._predict(Llower, Xy, basis, noise_sd)
            if self.use_mean_reward == 1:
                sigma = np.zeros_like(sigma)
            reward = mu + np.sqrt(sigma) * np.random.standard_normal(
                size=mu.shape)
        return reward, basis
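The _predict(Llower, Xy, basis, noise_sd) helper used here (and again in Example #7) is also not shown. Given how its outputs are consumed (mu + sqrt(sigma) * noise), a consistent sketch is the batched form of the mean/variance computation from Example #2, reusing the batched solve_triangular wrapper sketched after Example #4; treat the signature and shapes as assumptions.

    def _predict(self, Llower, Xy, basis, noise_sd):
        """Hypothetical batched predictive mean and variance.

        Assumed shapes: Llower (batch, n, n), Xy (batch, n, 1),
        basis (batch, 1, n). Returns mu (batch, 1), sigma (batch, 1).
        """
        Linv_phi = solve_triangular(Llower, np.transpose(basis, [0, 2, 1]))
        sigma = np.sum(np.square(Linv_phi), axis=1) * noise_sd**2 + noise_sd**2
        mu = np.matmul(np.transpose(Linv_phi, [0, 2, 1]),
                       solve_triangular(Llower, Xy))
        return np.squeeze(mu, axis=-1), sigma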
Example #6
    def _loss(self, thetas, X, Llowers, Xy, hyperparameters, sess):
        rng_state = np.random.get_state()
        try:
            np.random.seed(2)

            rewards = []
            state = X
            for unroll_step in range(self.unroll_steps):
                action = self._forward(thetas, state)
                reward = self._reward(state, action, sess, Llowers[-1], Xy[-1],
                                      hyperparameters[-1])
                rewards.append((self.discount_factor**unroll_step) * reward)
                state_action = np.concatenate([state, action], axis=-1)

                means = []
                covs = []
                for i in range(self.state_dim):
                    length_scale, signal_sd, noise_sd, prior_sd = \
                        hyperparameters[i]
                    basis = _basis(state_action, self.random_matrices[i],
                                   self.biases[i], self.basis_dims[i],
                                   length_scale, signal_sd)

                    # Predictive variance via a triangular solve against
                    # the lower Cholesky factor.
                    LinvXT = scipy.linalg.solve_triangular(Llowers[i],
                                                           basis.T,
                                                           lower=True)
                    pred_sigma = np.sum(np.square(LinvXT),
                                        axis=0) * noise_sd**2 + noise_sd**2
                    pred_sigma = pred_sigma[..., np.newaxis]
                    # Predictive mean via two triangular solves.
                    tmp0 = scipy.linalg.solve_triangular(Llowers[i],
                                                         basis.T,
                                                         lower=True).T
                    tmp1 = scipy.linalg.solve_triangular(Llowers[i],
                                                         Xy[i],
                                                         lower=True)
                    pred_mu = np.matmul(tmp0, tmp1)

                    means.append(pred_mu)
                    covs.append(pred_sigma)
                means = np.concatenate(means, axis=-1)
                covs = np.concatenate(covs, axis=-1)

                state_ = np.stack([
                    np.random.multivariate_normal(mean=mean, cov=np.diag(cov))
                    for mean, cov in zip(means, covs)
                ], axis=0)
                state = state + state_ if self.learn_diff else state_
                state = np.clip(state, self.observation_space_low,
                                self.observation_space_high)
            rewards = np.concatenate(rewards, axis=-1)
            rewards = np.sum(rewards, axis=-1)
            loss = -np.mean(rewards)
            np.random.set_state(rng_state)
            return loss
        except Exception as e:
            np.random.set_state(rng_state)
            print(e, 'Returning 10e100.')
            return 10e100
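Both _loss variants pin NumPy's global RNG to a fixed seed for the rollout and restore the caller's state on exit, presumably so repeated evaluations of the loss see identical noise (common random numbers for a derivative-free optimizer). A small context manager capturing that pattern, illustrative rather than from the repo:

import contextlib
import numpy as np

@contextlib.contextmanager
def fixed_seed(seed):
    """Temporarily seed np.random, restoring the caller's RNG state."""
    rng_state = np.random.get_state()
    try:
        np.random.seed(seed)
        yield
    finally:
        np.random.set_state(rng_state)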
Example #7
    def _loss(self,
              thetas,
              X,
              Llower_state,
              XXtr_state,
              Xytr_state,
              hyperparameters_state,
              Llower_reward,
              XXtr_reward,
              Xytr_reward,
              hyperparameters_reward,
              sess=None):
        X = X.copy()
        Llower_state = Llower_state.copy()
        XXtr_state = XXtr_state.copy()
        Xytr_state = Xytr_state.copy()
        hyperparameters_state = hyperparameters_state.copy()
        if self.learn_reward:
            Llower_reward = Llower_reward.copy()
            XXtr_reward = XXtr_reward.copy()
            Xytr_reward = Xytr_reward.copy()
            hyperparameters_reward = hyperparameters_reward.copy()
        rng_state = np.random.get_state()
        np.random.seed(2)

        rewards = []
        state = X
        for unroll_step in range(self.unroll_steps):
            action = self._forward(thetas,
                                   state,
                                   hyperstate_params=[
                                       Llower_state, Xytr_state, Llower_reward,
                                       Xytr_reward
                                   ])
            state_action = np.concatenate([state, action], axis=-1)

            reward, basis_reward = self._reward(state, action, state_action,
                                                sess, Llower_reward,
                                                Xytr_reward,
                                                hyperparameters_reward)
            rewards.append((self.discount_factor**unroll_step) * reward)

            length_scale, signal_sd, noise_sd, prior_sd = hyperparameters_state
            basis_state = _basis(state_action, self.random_matrix_state,
                                 self.bias_state, self.basis_dim_state,
                                 length_scale, signal_sd)
            basis_state = basis_state[:, None, ...]
            mu, sigma = self._predict(Llower_state, Xytr_state, basis_state,
                                      noise_sd)
            state_ = mu + np.sqrt(sigma) * np.random.standard_normal(
                size=mu.shape)

            if self.learn_diff:
                state_tmp = state.copy()
                state = np.clip(state + state_, self.observation_space_low,
                                self.observation_space_high)
                state_ = state - state_tmp
            else:
                state_ = np.clip(state_, self.observation_space_low,
                                 self.observation_space_high)
                state = state_.copy()

            if self.update_hyperstate == 1 or self.policy_use_hyperstate == 1:
                # Update the state hyperstate.
                Llower_state = Llower_state.transpose([0, 2, 1])
                for i in range(len(Llower_state)):
                    cholupdate(Llower_state[i], basis_state[i, 0].copy())
                Llower_state = Llower_state.transpose([0, 2, 1])
                Xytr_state += np.matmul(basis_state.transpose([0, 2, 1]),
                                        state_[..., None, :])

                # Update the reward hyperstate. basis_reward is None when the
                # reward function is not learned, so the Xytr_reward update
                # must stay inside this guard.
                if self.learn_reward:
                    Llower_reward = Llower_reward.transpose([0, 2, 1])
                    for i in range(len(Llower_reward)):
                        cholupdate(Llower_reward[i], basis_reward[i, 0].copy())
                    Llower_reward = Llower_reward.transpose([0, 2, 1])
                    Xytr_reward += np.matmul(
                        basis_reward.transpose([0, 2, 1]),
                        reward[..., None, :])

        rewards = np.concatenate(rewards, axis=-1)
        rewards = np.sum(rewards, axis=-1)
        loss = -np.mean(rewards)
        np.random.set_state(rng_state)
        return loss
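A note on the learn_diff branch above: when the model predicts state deltas, the rollout clips in absolute state space and then recovers the delta that was actually applied, so the hyperstate update later in the loop regresses on the post-clipping target. A toy illustration with made-up values:

import numpy as np

state = np.array([0.9])
delta = np.array([0.5])                        # raw model prediction
high = np.array([1.0])
clipped = np.clip(state + delta, -high, high)  # -> [1.0]
effective_delta = clipped - state              # -> [0.1], not 0.5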