import bisect
import math

import numpy as np

import lqg1d


def _compute_grad_exact(self, pol):
    # Note: this reads like a method of an agent/experiment class; the
    # original fragment had an empty signature, so `self` and `pol` are
    # restored here from their use in the body.
    # Lazily create the LQG environment the first time it is needed.
    if 'env' not in self.__dict__:
        self.env = lqg1d.LQG1D()
    sigma = pol.sigma
    M = self.env.max_pos
    ENV_GAMMA = self.env.gamma
    ENV_VOLUME = 2 * self.env.max_action
    ENV_R = np.asscalar(self.env.R)
    ENV_Q = np.asscalar(self.env.Q)
    ENV_B = np.asscalar(self.env.B)
    ENV_MAX_ACTION = self.env.max_action
    MAX_REWARD = ENV_Q * M**2 + ENV_R * ENV_MAX_ACTION**2

    # Constants of the adaptive step size:
    #   alphaStar(sigma) = C1 * sigma^3 / (C2 * sigma + C3)
    C1 = (1 - ENV_GAMMA)**3 * math.sqrt(2 * math.pi)
    C2 = ENV_GAMMA * math.sqrt(2 * math.pi) * MAX_REWARD * M**2
    C3 = 2 * (1 - ENV_GAMMA) * ENV_VOLUME * MAX_REWARD * M**2
    m = 1

    # c = utils.computeLoss(MAX_REWARD, M, ENV_GAMMA, ENV_VOLUME, sigma)
    c = pol.penaltyCoeff(MAX_REWARD, M, ENV_GAMMA, ENV_VOLUME)
    # d = utils.computeLossSigma(MAX_REWARD, M, ENV_GAMMA, ENV_VOLUME, sigma)
    alphaStar = 1 / (2 * c)

    gradK = self.env.grad_K(np.asscalar(pol.theta_mat), sigma)
    gradMixed = self.env.grad_mixed(np.asscalar(pol.theta_mat), sigma)

    # d(alphaStar)/d(sigma), from the quotient rule applied to
    # C1 * sigma^3 / (C2 * sigma + C3).
    grad_sigma_alpha_star = sigma**2 * (
        2 * C1 * C2 * sigma + 3 * C1 * C3) / (m * (C2 * sigma + C3)**2)
    # d(gradK^2)/d(sigma) by the chain rule.
    grad_sigma_norm_grad_theta = 2 * gradK * gradMixed

    # Compute the gradient for sigma
    grad_local_step = (1 / 2) * gradK**2 * grad_sigma_alpha_star
    grad_far_sighted = (1 / 2) * alphaStar * grad_sigma_norm_grad_theta
    gradDelta = grad_local_step + grad_far_sighted
    # Chain rule through the parameterization sigma = exp(w).
    gradDeltaW = gradDelta * math.exp(pol.w)
    return gradMixed, gradDeltaW
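# A minimal sanity check (not part of the original code): it verifies the
# closed-form derivative of alphaStar(sigma) = C1*sigma^3 / (C2*sigma + C3)
# used above against a central finite difference. The constants here are
# arbitrary placeholder values, not the ones computed from the environment.
def _check_grad_sigma_alpha_star(C1=0.05, C2=3.0, C3=7.0, sigma=0.5, h=1e-6):
    def alpha_star(s):
        return C1 * s**3 / (C2 * s + C3)

    closed_form = sigma**2 * (
        2 * C1 * C2 * sigma + 3 * C1 * C3) / (C2 * sigma + C3)**2
    numeric = (alpha_star(sigma + h) - alpha_star(sigma - h)) / (2 * h)
    assert abs(closed_form - numeric) < 1e-8, (closed_form, numeric)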
def _compute_exploration_bonus(states, actions, N, S_space, beta):
    # The original fragment lacked a function header; the name and signature
    # above are reconstructed from the free variables used in the body.
    # Here N is the table of visit counts (distinct from the trajectory
    # count N defined below).
    T = states.shape[0]
    exp_bon = np.zeros(T)
    for t in range(0, T):
        s = states[t]
        a = actions[t]
        # Locate the state and the action on the discretization grid.
        x = bisect.bisect(S_space, s)
        y = bisect.bisect(S_space, a)
        # Count-based bonus: beta / sqrt(N(s, a)).
        N[x, y] += 1
        exp_bon[t] = beta * np.sqrt(1 / N[x, y])
    return exp_bon


#####################################################
# Define the environment and the policy
#####################################################
env = lqg1d.LQG1D(initial_state_type='random')
policy = policy_gaussian

#####################################################
# Experiment parameters
#####################################################
# We will collect N trajectories per iteration
N = 60
# Each trajectory will have at most T time steps
T = 100
# Number of policy parameter updates
n_itr = 100
# Number of epochs
epochs = 5
# Set the discount factor for the problem
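#####################################################
# Usage sketch for the exploration bonus (illustrative)
#####################################################
# A minimal example, not part of the original experiments: the grid, count
# table, and trajectory data below are placeholder values chosen only to
# show how the bonus function is called. bisect.bisect returns indices in
# [0, len(grid)], hence the (size + 1)-shaped count table.
S_grid = np.linspace(-env.max_pos, env.max_pos, 50)
visit_counts = np.zeros((S_grid.size + 1, S_grid.size + 1))
demo_states = np.random.uniform(-env.max_pos, env.max_pos, size=10)
demo_actions = np.random.uniform(-env.max_action, env.max_action, size=10)
demo_bonus = _compute_exploration_bonus(
    demo_states, demo_actions, visit_counts, S_grid, beta=0.1)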