def update_hyperstate(agent, hyperstate_params, hyperparameters_state, hyperparameters_reward, datum, learn_diff):
    # Fold one observed transition into the hyperstate: a rank-1 update of each
    # Cholesky factor plus accumulation of the basis^T * target statistics, for
    # both the state model and the reward model.
    state, action, reward, next_state, _ = [np.atleast_2d(np.copy(dat)) for dat in datum]
    Llower_state, Xy_state, Llower_reward, Xy_reward = hyperstate_params
    state_action = np.concatenate([state, action], axis=-1)
    state_ = next_state - state if learn_diff else next_state

    # State model: cholupdate works in place on the transposed (upper-triangular) factors.
    basis_state = _basis(state_action, agent.random_matrix_state, agent.bias_state,
                         agent.basis_dim_state, hyperparameters_state[0], hyperparameters_state[1])
    Llower_state = Llower_state.transpose([0, 2, 1])
    for i in range(len(Llower_state)):
        cholupdate(Llower_state[i], basis_state[i].copy())
    Llower_state = Llower_state.transpose([0, 2, 1])
    Xy_state += np.matmul(basis_state[..., None, :].transpose([0, 2, 1]), state_[..., None, :])

    # Reward model: same update with the reward basis and reward targets.
    basis_reward = _basis(state_action, agent.random_matrix_reward, agent.bias_reward,
                          agent.basis_dim_reward, hyperparameters_reward[0], hyperparameters_reward[1])
    Llower_reward = Llower_reward.transpose([0, 2, 1])
    for i in range(len(Llower_reward)):
        cholupdate(Llower_reward[i], basis_reward[i].copy())
    Llower_reward = Llower_reward.transpose([0, 2, 1])
    Xy_reward += np.matmul(basis_reward[..., None, :].transpose([0, 2, 1]), reward[..., None, :])

    return [Llower_state, Xy_state, Llower_reward, Xy_reward]
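# The `_basis` feature map used throughout this file is defined elsewhere in the
# repo. As a point of reference only, a minimal random-Fourier-feature sketch
# with the same argument list is shown below; the exact scaling and nonlinearity
# of the real `_basis` are assumptions here, not a statement of its implementation.
import numpy as np

def _basis_sketch(X, random_matrix, bias, basis_dim, length_scale, signal_sd):
    # Fixed random projections followed by a cosine nonlinearity, approximating
    # an RBF kernel with the given length scale and signal standard deviation.
    x_omega = np.matmul(X / length_scale, random_matrix) + bias
    return signal_sd * np.sqrt(2.0 / basis_dim) * np.cos(x_omega)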
def _reward(self, state, action, sess, Llower, Xy, hyperparameters):
    if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
        reward = self.reward_function.build_np(sess, state, action)
    elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
        reward = self.reward_function.build_np(state, action)
    else:
        # Sample rewards from the Bayesian linear-regression predictive in random-feature space.
        state_action = np.concatenate([state, action], axis=-1)
        length_scale, signal_sd, noise_sd, prior_sd = hyperparameters
        basis = _basis(state_action, self.random_matrices[-1], self.biases[-1],
                       self.basis_dims[-1], length_scale, signal_sd)
        #tmp = (noise_sd/prior_sd)**2*np.eye(self.basis_dims[-1]) + XX
        if self.use_mean_reward == 1:
            pred_sigma = np.zeros([len(basis), 1])
        else:
            #pred_sigma = noise_sd**2 + np.sum(np.multiply(basis, noise_sd**2*scipy.linalg.solve(tmp, basis.T, sym_pos=True).T), axis=-1, keepdims=True)
            #TODO: fix this.
            LinvXT = scipy.linalg.solve_triangular(Llower, basis.T, lower=True)
            pred_sigma = np.sum(np.square(LinvXT), axis=0) * noise_sd**2 + noise_sd**2
            pred_sigma = pred_sigma[..., np.newaxis]
        #pred_mu = np.matmul(basis, scipy.linalg.solve(tmp, Xy, sym_pos=True))
        tmp0 = scipy.linalg.solve_triangular(Llower, basis.T, lower=True).T
        tmp1 = scipy.linalg.solve_triangular(Llower, Xy, lower=True)
        pred_mu = np.matmul(tmp0, tmp1)
        # pred_sigma holds predictive variances; np.random.normal expects a standard deviation.
        reward = np.stack([np.random.normal(loc=loc, scale=np.sqrt(scale))
                           for loc, scale in zip(pred_mu, pred_sigma)], axis=0)
    return reward
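# Sanity-check sketch (hypothetical helper, assuming Llower is the lower Cholesky
# factor of A = (noise_sd/prior_sd)**2 * I + Phi.T @ Phi, cf. the commented-out
# `tmp` line above, and Xy = Phi.T @ y): the two triangular solves in `_reward`
# should reproduce the direct-solve predictive mean and variance of Bayesian
# linear regression in feature space.
import numpy as np
import scipy.linalg

def check_predictive_sketch(Phi, y, phi_star, noise_sd, prior_sd):
    A = (noise_sd / prior_sd)**2 * np.eye(Phi.shape[1]) + Phi.T.dot(Phi)
    Xy = Phi.T.dot(y)
    Llower = scipy.linalg.cholesky(A, lower=True)
    # Cholesky route, mirroring _reward above.
    Linv_phi = scipy.linalg.solve_triangular(Llower, phi_star.T, lower=True)
    mu_chol = Linv_phi.T.dot(scipy.linalg.solve_triangular(Llower, Xy, lower=True))
    var_chol = noise_sd**2 * (1.0 + np.sum(np.square(Linv_phi), axis=0))
    # Direct route.
    mu_direct = phi_star.dot(np.linalg.solve(A, Xy))
    var_direct = noise_sd**2 * (1.0 + np.sum(phi_star * np.linalg.solve(A, phi_star.T).T, axis=-1))
    return np.allclose(mu_chol, mu_direct) and np.allclose(var_chol, var_direct)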
def _update_hyperstate(self, X, y, update_hyperstate):
    if update_hyperstate:
        basis = _basis(X, self.random_matrix, self.bias, self.basis_dim,
                       self.length_scale, self.signal_sd)
        self.Llower_tiled = self.Llower_tiled.transpose([0, 2, 1])
        assert len(self.Llower_tiled) == len(basis)
        for i in range(len(self.Llower_tiled)):
            cholupdate(self.Llower_tiled[i], basis[i].copy())
        self.Llower_tiled = self.Llower_tiled.transpose([0, 2, 1])
        self.Xy_tiled += np.matmul(basis[:, None, :].transpose([0, 2, 1]), y[:, None, :])
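# `cholupdate` is assumed to perform an in-place rank-1 update of an
# upper-triangular Cholesky factor (hence the transposes around each call), so
# that R_new.T @ R_new == R.T @ R + np.outer(x, x). A minimal NumPy sketch of
# that operation, for reference only; the real cholupdate may be a compiled routine.
import numpy as np

def cholupdate_sketch(R, x):
    # R: upper-triangular factor, updated in place. x: update vector (also overwritten).
    n = x.shape[0]
    for k in range(n):
        r = np.hypot(R[k, k], x[k])
        c = r / R[k, k]
        s = x[k] / R[k, k]
        R[k, k] = r
        if k + 1 < n:
            R[k, k + 1:] = (R[k, k + 1:] + s * x[k + 1:]) / c
            x[k + 1:] = c * x[k + 1:] - s * R[k, k + 1:]
    return R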
def _predict(self, X, update_hyperstate):
    if update_hyperstate:
        basis = _basis(X, self.random_matrix, self.bias, self.basis_dim,
                       self.length_scale, self.signal_sd)
        basis = np.expand_dims(basis, axis=1)
        LinvXT = solve_triangular(self.Llower_tiled, np.transpose(basis, [0, 2, 1]))
        pred_sigma = np.sum(np.square(LinvXT), axis=1) * self.noise_sd**2 + self.noise_sd**2
        # Reuse the triangular solve above instead of recomputing it.
        tmp0 = np.transpose(LinvXT, [0, 2, 1])
        tmp1 = solve_triangular(self.Llower_tiled, self.Xy_tiled)
        pred_mu = np.matmul(tmp0, tmp1)
        pred_mu = np.squeeze(pred_mu, axis=-1)
        return pred_mu, pred_sigma
    else:
        return RegressionWrapper._predict(self, X)
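# The bare `solve_triangular` used in `_predict` operates on batched (3-D)
# factors, unlike scipy.linalg.solve_triangular; its actual definition lives
# elsewhere in the repo. A minimal batched equivalent, assuming lower-triangular
# factors of shape [batch, dim, dim], could look like this sketch:
import numpy as np
import scipy.linalg

def batched_solve_triangular_sketch(L_batch, B_batch, lower=True):
    # Solve L_batch[i] @ X[i] = B_batch[i] independently for each batch element.
    return np.stack([scipy.linalg.solve_triangular(L, B, lower=lower)
                     for L, B in zip(L_batch, B_batch)], axis=0)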
def _reward(self, state, action, state_action, sess, Llower, Xy, hyperparameters):
    basis = None
    if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
        reward = self.reward_function.build_np(sess, state, action)
    elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
        reward = self.reward_function.build_np(state, action)
    else:
        #state_action = np.concatenate([state, action], axis=-1)
        length_scale, signal_sd, noise_sd, prior_sd = hyperparameters
        basis = _basis(state_action, self.random_matrix_reward, self.bias_reward,
                       self.basis_dim_reward, length_scale, signal_sd)
        basis = basis[:, None, ...]
        mu, sigma = self._predict(Llower, Xy, basis, noise_sd)
        if self.use_mean_reward == 1:
            sigma = np.zeros_like(sigma)
        reward = mu + np.sqrt(sigma) * np.random.standard_normal(size=mu.shape)
    return reward, basis
def _loss(self, thetas, X, Llowers, Xy, hyperparameters, sess):
    rng_state = np.random.get_state()
    try:
        np.random.seed(2)
        rewards = []
        state = X
        for unroll_step in xrange(self.unroll_steps):
            action = self._forward(thetas, state)
            reward = self._reward(state, action, sess, Llowers[-1], Xy[-1], hyperparameters[-1])
            rewards.append((self.discount_factor**unroll_step) * reward)
            state_action = np.concatenate([state, action], axis=-1)

            # Predictive mean and variance of each state dimension under its own model.
            means = []
            covs = []
            for i in range(self.state_dim):
                length_scale, signal_sd, noise_sd, prior_sd = hyperparameters[i]
                basis = _basis(state_action, self.random_matrices[i], self.biases[i],
                               self.basis_dims[i], length_scale, signal_sd)
                #tmp = (noise_sd/prior_sd)**2*np.eye(self.basis_dims[i]) + XX[i]
                #pred_sigma = noise_sd**2 + np.sum(np.multiply(basis, noise_sd**2*scipy.linalg.solve(tmp, basis.T, sym_pos=True).T), axis=-1, keepdims=True)
                #pred_mu = np.matmul(basis, scipy.linalg.solve(tmp, Xy[i], sym_pos=True))
                #TODO: fix this.
                LinvXT = scipy.linalg.solve_triangular(Llowers[i], basis.T, lower=True)
                pred_sigma = np.sum(np.square(LinvXT), axis=0) * noise_sd**2 + noise_sd**2
                pred_sigma = pred_sigma[..., np.newaxis]
                # Reuse the triangular solve above instead of recomputing it.
                tmp0 = LinvXT.T
                tmp1 = scipy.linalg.solve_triangular(Llowers[i], Xy[i], lower=True)
                pred_mu = np.matmul(tmp0, tmp1)
                means.append(pred_mu)
                covs.append(pred_sigma)
            means = np.concatenate(means, axis=-1)
            covs = np.concatenate(covs, axis=-1)

            # Sample the next state and clip it to the observation space.
            state_ = np.stack([np.random.multivariate_normal(mean=mean, cov=np.diag(cov))
                               for mean, cov in zip(means, covs)], axis=0)
            state = state + state_ if self.learn_diff else state_
            state = np.clip(state, self.observation_space_low, self.observation_space_high)

        rewards = np.concatenate(rewards, axis=-1)
        rewards = np.sum(rewards, axis=-1)
        loss = -np.mean(rewards)
        np.random.set_state(rng_state)
        return loss
    except Exception as e:
        np.random.set_state(rng_state)
        print e, 'Returning 10e100.'
        return 10e100
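# The per-row np.random.multivariate_normal loop above draws each next state
# from independent Gaussians, since the covariance passed in is diagonal. An
# equivalent (in distribution) vectorised sketch, matching the
# mu + sqrt(sigma) * standard_normal form used by the hyperstate variant below:
import numpy as np

def sample_diag_gaussian_sketch(means, variances):
    # means, variances: arrays of shape [batch, state_dim], variances >= 0.
    return means + np.sqrt(variances) * np.random.standard_normal(size=means.shape)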
def _loss(self, thetas, X, Llower_state, XXtr_state, Xytr_state, hyperparameters_state,
          Llower_reward, XXtr_reward, Xytr_reward, hyperparameters_reward, sess=None):
    # Work on copies so the caller's hyperstate is not modified by the imagined rollout.
    X = X.copy()
    Llower_state = Llower_state.copy()
    XXtr_state = XXtr_state.copy()
    Xytr_state = Xytr_state.copy()
    hyperparameters_state = hyperparameters_state.copy()
    if self.learn_reward:
        Llower_reward = Llower_reward.copy()
        XXtr_reward = XXtr_reward.copy()
        Xytr_reward = Xytr_reward.copy()
        hyperparameters_reward = hyperparameters_reward.copy()
    rng_state = np.random.get_state()
    #try:
    np.random.seed(2)
    rewards = []
    state = X
    for unroll_step in xrange(self.unroll_steps):
        action = self._forward(thetas, state,
                               hyperstate_params=[Llower_state, Xytr_state, Llower_reward, Xytr_reward])
        state_action = np.concatenate([state, action], axis=-1)
        reward, basis_reward = self._reward(state, action, state_action, sess,
                                            Llower_reward, Xytr_reward, hyperparameters_reward)
        rewards.append((self.discount_factor**unroll_step) * reward)

        # Sample the next state from the state model's predictive distribution.
        length_scale, signal_sd, noise_sd, prior_sd = hyperparameters_state
        basis_state = _basis(state_action, self.random_matrix_state, self.bias_state,
                             self.basis_dim_state, length_scale, signal_sd)
        basis_state = basis_state[:, None, ...]
        mu, sigma = self._predict(Llower_state, Xytr_state, basis_state, noise_sd)
        state_ = mu + np.sqrt(sigma) * np.random.standard_normal(size=mu.shape)

        if self.learn_diff:
            state_tmp = state.copy()
            state = np.clip(state + state_, self.observation_space_low, self.observation_space_high)
            state_ = state - state_tmp
        else:
            state_ = np.clip(state_, self.observation_space_low, self.observation_space_high)
            state = state_.copy()

        if self.update_hyperstate == 1 or self.policy_use_hyperstate == 1:
            # Update state hyperstate.
            Llower_state = Llower_state.transpose([0, 2, 1])
            for i in range(len(Llower_state)):
                cholupdate(Llower_state[i], basis_state[i, 0].copy())
            Llower_state = Llower_state.transpose([0, 2, 1])
            Xytr_state += np.matmul(basis_state.transpose([0, 2, 1]), state_[..., None, :])

            # Update reward hyperstate.
            if self.learn_reward:
                Llower_reward = Llower_reward.transpose([0, 2, 1])
                for i in range(len(Llower_reward)):
                    cholupdate(Llower_reward[i], basis_reward[i, 0].copy())
                Llower_reward = Llower_reward.transpose([0, 2, 1])
                Xytr_reward += np.matmul(basis_reward.transpose([0, 2, 1]), reward[..., None, :])

    rewards = np.concatenate(rewards, axis=-1)
    rewards = np.sum(rewards, axis=-1)
    loss = -np.mean(rewards)
    np.random.set_state(rng_state)
    return loss
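# Sanity-check sketch (hypothetical test, assuming the rank-1 semantics of
# cholupdate noted earlier): after each in-rollout update, a lower factor should
# satisfy Llower_new @ Llower_new.T == Llower_old @ Llower_old.T + np.outer(phi, phi),
# i.e. the Gram matrix grows by one rank-1 term per imagined transition while
# Xytr accumulates phi * target.
import numpy as np

def check_rank1_update_sketch(Llower_old, Llower_new, phi, atol=1e-8):
    return np.allclose(Llower_new.dot(Llower_new.T),
                       Llower_old.dot(Llower_old.T) + np.outer(phi, phi),
                       atol=atol)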