# Shared dependencies for the model classes below (the fragments originate
# from separate modules of the same project). numpy and json are used
# throughout; KernelRepresentation, ScheduledParameter, ModelParameters and
# GradType are project-local classes whose import paths are not shown in this
# excerpt.
import json

import numpy as np


class KSCGDModel(object):
    def __init__(self, stateCount, config):
        self.V = KernelRepresentation(stateCount, 1, config)
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-6)
        # Representation error budget
        self.eps = ScheduledParameter('RepresentationError', config)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')
        # TD-loss expectation approximation rate
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Running estimate of our expected TD-loss
        self.y = 0.

    def bellman_error(self, s, a, r, s_):
        if s_ is None:
            return r - self.V(s)
        else:
            return r + self.gamma * self.V(s_) - self.V(s)

    def model_error(self):
        return 0.5 * self.lossL * self.V.normsq()

    def train(self, step, sample):
        self.eta.step(step)
        self.eps.step(step)
        self.beta.step(step)
        # Unpack sample
        s, a, r, s_ = sample
        # Compute TD error
        delta = self.bellman_error(s, a, r, s_)
        # Running average of the TD error
        self.y += self.beta.value * (delta - self.y)
        # Gradient step
        self.V.shrink(1. - self.eta.value * self.lossL)
        if s_ is None:
            self.V.append(s, -self.eta.value * self.y * np.array([[-1.]]))
        else:
            W = np.zeros((2, 1))
            W[0] = -1
            W[1] = self.gamma
            self.V.append(np.vstack((s, s_)), -self.eta.value * self.y * W)
        # Prune
        modelOrder = len(self.V.D)
        self.V.prune(self.eps.value * self.eta.value ** 2)
        modelOrder_ = len(self.V.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_) ** 2 + self.model_error()
        return (float(loss), float(modelOrder_), self.eta.value, self.beta.value)

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order', 'Step Size', 'Averaging Coefficient')
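# Usage sketch (illustrative, not part of the original source): one way to
# drive KSCGDModel on a stream of transitions. The gym-style environment
# interface, the (1, stateCount) row-vector convention for states, and the
# config keys/schedule format consumed by ScheduledParameter are assumptions.
def _example_kscgd_usage(config_section, env, policy, episodes=10):
    model = KSCGDModel(stateCount=env.observation_space.shape[0],
                       config=config_section)
    step = 0
    for _ in range(episodes):
        s = np.reshape(env.reset(), (1, -1))
        done = False
        while not done:
            a = policy(s)
            s_raw, r, done, _ = env.step(a)
            # Terminal next-states are passed as None, matching bellman_error()
            s_ = None if done else np.reshape(s_raw, (1, -1))
            # train() returns (loss, model order, step size, averaging coefficient)
            loss, order, eta, beta = model.train(step, (s, a, r, s_))
            if s_ is not None:
                s = s_
            step += 1
    return model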
class KQGreedyModel(object):
    def __init__(self, stateCount, actionCount, config):
        self.Q = KernelRepresentation(stateCount + actionCount, 2, config)
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # TD-loss expectation approximation rate
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Running estimate of our expected TD-loss
        self.y = 0  # np.zeros((0,1))

    def train(self, step, x, x_, nonterminal, delta, g, gamma):
        self.eta.step(step)
        self.beta.step(step)
        self.Q.shrink(1. - self.eta.value * self.lossL)
        # Stack sample points
        X = np.vstack((x, x_[nonterminal]))
        W = np.zeros((len(X), 2))
        N = float(len(delta))
        # Compute gradient weights
        W[:len(x), 0] = self.eta.value / N * delta
        W[len(x):, 0] = -self.eta.value / N * gamma * g[nonterminal][:]
        W[:len(x), 1] = self.beta.value / N * (delta[:] - g[:])
        self.Q.append(X, W)
        # Prune
        self.Q.prune((self.eps / N) ** 2 * self.eta.value ** 2 / self.beta.value)

    def evaluate(self, xs):
        "Evaluate the Q function for a list of (s,a) pairs."
        return np.reshape(self.Q(np.array(xs))[:, 0], (-1, 1))  # self.Q(np.array(xs))

    def evaluateOne(self, x):
        "Evaluate the Q function for a single (s,a) pair."
        return self.Q(x)[:, 0]

    def maximize(self, ss):
        "Find the maximizing action for a batch of states."
        return [self.Q.argmax(s) for s in ss]

    def maximizeOne(self, s):
        "Find the maximizing action for a single state."
        return self.Q.argmax(s)

    def model_error(self):
        return 0.5 * self.lossL * self.Q.normsq()
class KSARSAModel2(object):
    def __init__(self, stateCount, actionCount, config):
        self.Q = KernelRepresentation(stateCount + actionCount, 1, config)
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # TD-loss expectation approximation rate
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')
        # Running estimate of our expected TD-loss
        self.y = 0.

    def bellman_error(self, s, a, r, s_, a_):
        x = np.concatenate((np.reshape(s, (1, -1)), np.reshape(a, (1, -1))), axis=1)
        if s_ is None:
            return r - self.Q(x)
        else:
            x_ = np.concatenate((np.reshape(s_, (1, -1)), np.reshape(a_, (1, -1))), axis=1)
            return r + self.gamma * self.Q(x_) - self.Q(x)

    def model_error(self):
        return 0.5 * self.lossL * self.Q.normsq()

    def predict(self, s):
        # "Predict the Q function values for a batch of states."
        # return self.Q(s)
        pass

    def predictOne(self, s):
        # "Predict the Q function values for a single state."
        # return self.Q(s.reshape(1, len(s))).flatten()
        pass

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order')
class KTDModel(object):
    def __init__(self, stateCount, config):
        self.V = KernelRepresentation(stateCount, 1, config)
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-6)
        # Representation error budget
        self.eps = ScheduledParameter('RepresentationError', config)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def bellman_error(self, s, a, r, s_):
        if s_ is None:
            return r - self.V(s)
        else:
            return r + self.gamma * self.V(s_) - self.V(s)

    def model_error(self):
        return 0.5 * self.lossL * self.V.normsq()

    def train(self, step, sample):
        self.eta.step(step)
        self.eps.step(step)
        # Unpack sample
        s, a, r, s_ = sample
        # Compute TD error
        delta = self.bellman_error(s, a, r, s_)
        # Gradient step
        self.V.shrink(1. - self.eta.value * self.lossL)
        self.V.append(s, self.eta.value * delta)
        # Prune
        modelOrder = len(self.V.D)
        self.V.prune(self.eps.value * self.eta.value ** 2)
        modelOrder_ = len(self.V.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_) ** 2 + self.model_error()
        return (float(loss), float(modelOrder_), self.eta.value)

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order', 'Step Size')
class KQLearningModel(object):
    def __init__(self, stateCount, actionCount, config):
        self.Q = KernelRepresentation(stateCount, actionCount, config)
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def bellman_error(self, s, a, r, s_):
        if s_ is None:
            return r - self.predictOne(s)[a]
        else:
            return r + self.gamma * self.predictOne(s_).max() - self.predictOne(s)[a]

    def model_error(self):
        return 0.5 * self.lossL * self.Q.normsq()

    def train(self, step, sample):
        pass

    def predict(self, s):
        "Predict the Q function values for a batch of states."
        return self.Q(s)

    def predictOne(self, s):
        "Predict the Q function values for a single state."
        return self.Q(s.reshape(1, len(s))).flatten()

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order')
class KGreedyQModel(object):
    def __init__(self, stateCount, actionCount, config):
        self.Q = KernelRepresentation(stateCount + actionCount, 2, config)
        # Learning rates
        self.eta = ScheduledParameter('LearningRate', config)
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def get_q(self, x):
        return self.Q(x)[0][0]

    def get_g(self, x):
        return self.Q(x)[0][1]

    def bellman_error(self, s, a, r, s_):
        x = np.concatenate((np.reshape(s, (1, -1)), np.reshape(a, (1, -1))), axis=1)
        if s_ is None:
            return r - self.get_q(x)
        else:
            a_ = self.Q.argmax(s_)
            x_ = np.concatenate((np.reshape(s_, (1, -1)), np.reshape(a_, (1, -1))), axis=1)
            return r + self.gamma * self.get_q(x_) - self.get_q(x)

    def bellman_error2(self, x, r, x_):
        if x_ is None:
            return r - self.get_q(x)
        else:
            return r + self.gamma * self.get_q(x_) - self.get_q(x)

    def model_error(self):
        return 0.5 * self.lossL * self.Q.normsq()

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order')

    def train(self, step, sample):
        self.eta.step(step)
        self.beta.step(step)
        # Unpack sample and compute error
        s, a, r, s_ = sample
        x = np.concatenate((np.reshape(np.array(s), (1, -1)),
                            np.reshape(np.array(a), (1, -1))), axis=1)
        if s_ is None:
            x_ = None
        else:
            a_ = self.Q.argmax(s_)
            x_ = np.concatenate((np.reshape(np.array(s_), (1, -1)),
                                 np.reshape(np.array(a_), (1, -1))), axis=1)
        delta = self.bellman_error2(x, r, x_)
        # Gradient step
        self.Q.shrink(1. - self.eta.value * self.lossL)
        if s_ is None:
            W = np.zeros((1, 2))
            W[0, 0] = self.eta.value * delta
            W[0, 1] = self.beta.value * (delta - self.get_g(x))
            self.Q.append(x, W)
        else:
            W = np.zeros((2, 2))
            W[0, 0] = self.eta.value * delta
            W[1, 0] = -self.eta.value * self.gamma * self.get_g(x)
            W[0, 1] = self.beta.value * (delta - self.get_g(x))
            self.Q.append(np.vstack((x, x_)), W)
        # Prune
        self.Q.prune(self.eps ** 2 * self.eta.value ** 2 / self.beta.value)
        modelOrder_ = self.Q.model_order()
        # Compute new error
        loss = 0.5 * self.bellman_error2(x, r, x_) ** 2 + self.model_error()  # TODO should we have model error here?
        return (float(loss), float(modelOrder_))
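# Usage sketch (illustrative, not in the original source): one stochastic
# update of KGreedyQModel on a continuous-action transition, followed by a
# greedy action query. Shapes follow the class's own conventions (1-D state
# and action arrays; s_ is None at episode end); everything else is assumed.
def _example_kgreedyq_step(model, s, a, r, s_, step=0):
    # train() returns (loss, model order) for logging
    loss, model_order = model.train(step, (s, a, r, s_))
    # Greedy action for the next state, e.g. inside an epsilon-greedy policy
    a_greedy = None if s_ is None else model.Q.argmax(s_)
    return loss, model_order, a_greedy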
class Model:
    def __init__(self, indim, outdim, grad_type):
        self.f = KernelRepresentation(indim, outdim, ModelParameters.config)
        self.indim = indim
        self.outdim = outdim
        self.grad_type = grad_type
        self.delta = 0

    def model_error(self):
        return 0.5 * ModelParameters.lossL * self.f.normsq()

    def predict(self, x):
        # Predict the function values for a batch of inputs.
        return self.f(x)

    def predictOne(self, x):
        # Predict the function values for a single input.
        return self.f(np.reshape(x, (1, -1)))[0]

    def ucb(self, x):
        # Upper confidence bound: |momentum| plus one standard deviation.
        return np.abs(self.mom(x)) + np.sqrt(self.var(x))

    def val(self, x):
        return self.predictOne(x)[0] + ModelParameters.mu

    def mom(self, x):
        return self.predictOne(x)[1] + 100.0

    def var(self, x):
        return max(0, self.predictOne(x)[2] + ModelParameters.sigma)

    def train(self, sample):
        x, y = sample
        grad = (self.val(x) - y)
        grad_sq = grad ** 2
        # Gradient weights for the three outputs (value, momentum, variance)
        W = np.zeros((3,))
        if self.grad_type == GradType.SGD:
            # Simple SGD
            W[0] = -ModelParameters.eta * grad
        elif self.grad_type == GradType.MOM:
            W[0] = -ModelParameters.eta * self.mom(x)
            W[1] = (-ModelParameters.beta1 * self.mom(x) + ModelParameters.beta1 * grad)
        elif self.grad_type == GradType.VAR:
            grad_var = (grad - self.mom(x)) ** 2
            W[0] = -ModelParameters.eta * self.mom(x)
            W[1] = (-ModelParameters.beta1 * self.mom(x) + ModelParameters.beta1 * grad)
            W[2] = (-ModelParameters.beta2 * self.var(x) + ModelParameters.beta2 * grad_var)
        elif self.grad_type == GradType.MOMENTUM:
            W[0] = -ModelParameters.eta * self.mom(x) / (np.sqrt(self.var(x) + ModelParameters.eta ** 2))
            W[1] = (-ModelParameters.beta1 * self.mom(x) + ModelParameters.beta1 * grad)
            W[2] = (-ModelParameters.beta2 * self.var(x) + ModelParameters.beta2 * grad_sq)
        elif self.grad_type == GradType.DELTA:
            W[0] = -ModelParameters.eta * self.delta
            self.delta = self.delta + (-ModelParameters.beta1 * self.delta + ModelParameters.beta1 * grad)
        else:
            raise ValueError('Unknown gradient type: {}'.format(self.grad_type))
        # Gradient step
        self.f.shrink(1. - ModelParameters.lossL)
        self.f.append(np.array(x), np.reshape(W, (1, -1)))
        # Prune
        self.f.prune(ModelParameters.eps)
        return (grad_sq / 2, len(self.f.D))

    def loss(self, sample):
        x, y = sample
        return 0.5 * (self.val(x) - y) ** 2

    def point_density(self, x):
        return np.sum(self.f.kernel.f(x, self.f.D))

    @staticmethod
    def compose(f1, f2):
        # Merge two models: at each dictionary point, keep the component model
        # that wins under the criterion selected by grad_type.
        f = Model(f1.indim, f1.outdim, f1.grad_type)
        d = np.vstack([f1.f.D, f2.f.D])
        thresh = np.shape(f1.f.D)[0]
        W = np.zeros((3,))
        for i in np.random.permutation(np.shape(d)[0]):
            x = d[i, :]
            if f.grad_type == GradType.SGD:
                # Higher local point density wins (reciprocal here so larger is better)
                if f1.point_density(x) >= f2.point_density(x) and i < thresh:
                    W[1] = -f.mom(x) + f1.mom(x)
                    W[2] = -f.var(x) + f1.var(x)
                    W[0] = -f.val(x) + f1.val(x)
                    f.f.append(np.array(x), np.reshape(W, (1, -1)))
                elif f1.point_density(x) < f2.point_density(x) and i >= thresh:
                    W[1] = -f.mom(x) + f2.mom(x)
                    W[2] = -f.var(x) + f2.var(x)
                    W[0] = -f.val(x) + f2.val(x)
                    f.f.append(np.array(x), np.reshape(W, (1, -1)))
            elif f.grad_type == GradType.MOM:
                # Lower momentum wins
                if f1.mom(x) <= f2.mom(x) and i < thresh:
                    W[1] = -f.mom(x) + f1.mom(x)
                    W[2] = -f.var(x) + f1.var(x)
                    W[0] = -f.val(x) + f1.val(x)
                    f.f.append(np.array(x), np.reshape(W, (1, -1)))
                elif f1.mom(x) > f2.mom(x) and i >= thresh:
                    W[1] = -f.mom(x) + f2.mom(x)
                    W[2] = -f.var(x) + f2.var(x)
                    W[0] = -f.val(x) + f2.val(x)
                    f.f.append(np.array(x), np.reshape(W, (1, -1)))
            elif f.grad_type == GradType.VAR:
                # Lower upper confidence bound wins
                if f1.ucb(x) <= f2.ucb(x) and i < thresh:
                    W[1] = -f.mom(x) + f1.mom(x)
                    W[2] = -f.var(x) + f1.var(x)
                    W[0] = -f.val(x) + f1.val(x)
                    f.f.append(np.array(x), np.reshape(W, (1, -1)))
                elif f1.ucb(x) > f2.ucb(x) and i >= thresh:
                    W[1] = -f.mom(x) + f2.mom(x)
                    W[2] = -f.var(x) + f2.var(x)
                    W[0] = -f.val(x) + f2.val(x)
                    f.f.append(np.array(x), np.reshape(W, (1, -1)))
        return f
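# Usage sketch (illustrative, not in the original source): fit two Model
# instances on separate data shards and merge them with compose(). The shard
# iterables of (feature vector, target) pairs, the GradType.SGD choice, and
# the assumption that ModelParameters has been configured elsewhere are all
# hypothetical.
def _example_model_compose(shard_a, shard_b, indim=2):
    # Three outputs: value, momentum, variance (as read by val/mom/var above)
    m1 = Model(indim, 3, GradType.SGD)
    m2 = Model(indim, 3, GradType.SGD)
    for x, y in shard_a:
        m1.train((x, y))
    for x, y in shard_b:
        m2.train((x, y))
    # Under the SGD rule, the locally denser model wins at each dictionary point
    merged = Model.compose(m1, m2)
    return merged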
class KNAFModel(object):
    def __init__(self, stateCount, actionCount, config):
        # Get dimensions of V, pi and L
        self.dim_v = 1
        self.dim_p = actionCount
        self.dim_l = 1  # (1 + actionCount) * actionCount / 2  # TODO: full lower-triangular L
        self.dim_a = self.dim_v + self.dim_p + self.dim_l
        # Get action space
        self.min_act = np.reshape(json.loads(config.get('MinAction')), (-1, 1))
        self.max_act = np.reshape(json.loads(config.get('MaxAction')), (-1, 1))
        # Initialize L
        self.init_l = config.getfloat('InitL', 0.01)
        # Represent V, pi, L in one RKHS
        self.vpl = KernelRepresentation(stateCount, self.dim_a, config)
        # Learning rates
        self.eta_v = ScheduledParameter('LearningRateV', config)
        self.eta_p = ScheduledParameter('LearningRateP', config)
        self.eta_l = ScheduledParameter('LearningRateL', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-6)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def get_q(self, s, a):
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)
        if self.dim_p > 1:
            return self.get_v(s) - 0.5 * ((a - pi).T).dot(lmat).dot(lmat.T).dot(a - pi)
        else:
            return np.array([self.get_v(s) - 0.5 * (a - pi) * lmat * lmat * (a - pi)])

    def get_v(self, s):
        return np.array([self.predictOne(s)[0, 0]])

    def get_pi(self, s):
        pi = self.predictOne(s)[0, 1:self.dim_p + 1]
        return np.reshape(np.clip(pi, self.min_act, self.max_act), (-1,))

    def get_lmat(self, s):
        lmat = np.zeros((self.dim_p, self.dim_p))
        temp = self.predictOne(s)
        if self.dim_p > 1:
            lmat[np.tril_indices(self.dim_p)] = temp[self.dim_p + 1:]
            return lmat + self.init_l * np.eye(self.dim_p)
        else:
            return np.array([temp[0, 2] + self.init_l])

    def bellman_error(self, s, a, r, s_):
        if s_ is None:
            return r - self.get_q(s, a)
        else:
            return r + self.gamma * self.get_v(s_) - self.get_q(s, a)

    def model_error(self):
        return 0.5 * self.lossL * self.vpl.normsq()

    def predict(self, s):
        # Predict the Q function values for a batch of states.
        return self.vpl(s)

    def predictOne(self, s):
        # Predict the Q function values for a single state.
        return self.vpl(np.reshape(s, (1, -1)))

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order')

    def train(self, step, sample):
        self.eta_v.step(step)
        self.eta_p.step(step)
        self.eta_l.step(step)
        # Unpack sample
        s, a, r, s_ = sample
        # Compute error
        delta = self.bellman_error(s, a, r, s_)
        # Gradient step
        self.vpl.shrink(1. - self.lossL)
        # V gradient
        W = np.zeros((self.dim_a,))
        W[0] = -1 * self.eta_v.value
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)
        # Pi gradient
        if self.dim_p > 1:
            W[1:self.dim_p + 1] = -self.eta_p.value * np.matmul(
                np.matmul(lmat, np.transpose(lmat)), a - pi)
            lgrad_temp = np.matmul(np.matmul(np.transpose(lmat), a - pi),
                                   np.transpose(a - pi))
        else:
            lgrad_temp = lmat * (a - pi) * (a - pi)
            W[1] = -self.eta_p.value * lmat * lmat * (a - pi)
        # L gradient
        if self.dim_p > 1:
            W[self.dim_p + 1:self.dim_a] = np.reshape(
                lgrad_temp[np.tril_indices(self.dim_p)], (-1, 1)) * self.eta_l.value
        else:
            W[-1] = lgrad_temp * self.eta_l.value
        self.vpl.append(np.array(s), -delta * np.reshape(W, (1, -1)))
        # Prune
        self.vpl.prune(self.eps)
        modelOrder_ = len(self.vpl.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_) ** 2  # + self.model_error()
        return (float(loss), float(modelOrder_))
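# Usage sketch (illustrative, not in the original source): greedy action
# selection with Gaussian exploration for KNAFModel. In the NAF decomposition
# the greedy action is the pi head of the model; the noise scale and the
# clipping to the configured action bounds are assumptions about how an agent
# would use this class.
def _example_knaf_act(model, s, noise_std=0.1):
    a = model.get_pi(s)                                        # greedy (mean) action
    a = a + np.random.normal(0.0, noise_std, size=a.shape)     # exploration noise
    return np.clip(a, model.min_act.ravel(), model.max_act.ravel())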
class KQLearningModel(object):
    def __init__(self, stateCount, actionCount, config):
        self.Q = KernelRepresentation(stateCount + actionCount, 1, config)
        self.algorithm = config.get('Algorithm', 'td').lower()  # 'gtd', 'td' or 'hybrid'
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)
        self.phi = config.getfloat('Phi', 0.0)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # TD-loss expectation approximation rate
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Running estimate of our expected TD-loss
        self.y = 0

    def train(self, step, x, x_, nonterminal, delta, gamma, rand_act=None):
        self.eta.step(step)
        self.beta.step(step)
        yy = self.y + self.beta.value * (delta - self.y)
        self.Q.shrink(1. - self.eta.value * self.lossL)
        # Stack sample points
        if self.algorithm == 'hybrid':
            nonterminal = list(set(nonterminal) & set(rand_act))
        if self.algorithm == 'gtd' or self.algorithm == 'hybrid':
            X = np.vstack((x, x_[nonterminal]))
            W = np.zeros((len(X), 1))
            N = float(len(delta))
            W[:len(x)] = self.eta.value / N * yy
            W[len(x):] = -self.phi * self.eta.value / N * gamma * yy[nonterminal]
            self.y = np.mean(yy)  # Running average of the TD error
        elif self.algorithm == 'td':
            X = x
            N = float(len(delta))
            W = self.eta.value / N * yy
            self.y = np.mean(yy)  # Running average of the TD error
        else:
            raise ValueError('Unknown algorithm: {}'.format(self.algorithm))
        self.Q.append(X, W)
        # Prune
        self.Q.prune((self.eps * self.eta.value ** 2) ** 2)

    def evaluate(self, xs):
        "Evaluate the Q function for a list of (s,a) pairs."
        return self.Q(np.array(xs))

    def evaluateOne(self, x):
        "Evaluate the Q function for a single (s,a) pair."
        return self.Q(x)

    def maximize(self, ss):
        "Find the maximizing action for a batch of states."
        return [self.Q.argmax(s) for s in ss]

    def maximizeOne(self, s):
        "Find the maximizing action for a single state."
        return self.Q.argmax(s)

    def model_error(self):
        return 0.5 * self.lossL * self.Q.normsq()
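# Usage sketch (illustrative, not in the original source): the argument shapes
# that KQLearningModel.train above appears to expect for a mini-batch of B
# transitions with d-dimensional (state, action) features. The batch below is
# random placeholder data; only the shapes are meant to be informative, and
# every transition is treated as non-terminal for simplicity.
def _example_kqlearning_batch(model, B=8, d=5, gamma=0.99, step=0):
    x = np.random.randn(B, d)            # current (s, a) features
    x_ = np.random.randn(B, d)           # greedy next-state (s, a) features
    nonterminal = list(range(B))         # indices of non-terminal transitions
    r = np.random.randn(B, 1)
    # TD errors for the batch, computed with the model's own Q estimates
    delta = r + gamma * model.evaluate(x_) - model.evaluate(x)
    model.train(step, x, x_, nonterminal, delta, gamma, rand_act=list(range(B)))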
class KNAFIIDModel(object):
    def __init__(self, stateCount, actionCount, config):
        # Get dimensions of V, pi and L
        self.dim_v = 1
        self.dim_p = actionCount
        self.dim_l = 1  # (1 + actionCount) * actionCount / 2  # TODO: full lower-triangular L
        self.dim_a = self.dim_v + self.dim_p + self.dim_l
        # Get action space
        self.min_act = np.reshape(json.loads(config.get('MinAction')), (-1, 1))
        self.max_act = np.reshape(json.loads(config.get('MaxAction')), (-1, 1))
        # Initialize L
        self.init_l = config.getfloat('InitL', 0.01)
        # Represent V, pi, L in one RKHS
        self.vpl = KernelRepresentation(stateCount, self.dim_a, config)
        # Learning rates
        self.eta_v = ScheduledParameter('LearningRateV', config)
        self.eta_p = ScheduledParameter('LearningRateP', config)
        self.eta_l = ScheduledParameter('LearningRateL', config)
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)
        # self.phi = config.getfloat('Phi', 1)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def get_q(self, s, a):
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)
        return np.array([self.get_v(s) - 0.5 * (a - pi) * lmat * lmat * (a - pi)])

    def get_v(self, s):
        return np.array([self.predictOne(s)[0, 0]])

    def get_pi(self, s):
        pi = self.predictOne(s)[0, 1:self.dim_p + 1]
        return np.reshape(np.clip(pi, self.min_act, self.max_act), (-1,))

    def get_lmat(self, s):
        lmat = np.zeros((self.dim_p, self.dim_p))
        temp = self.predictOne(s)
        if self.dim_p > 1:
            lmat[np.tril_indices(self.dim_p)] = temp[self.dim_p + 1:]
            return lmat + self.init_l * np.eye(self.dim_p)
        else:
            return np.array([temp[0, 2] + self.init_l])

    def train(self, step, sample):
        self.eta_v.step(step)
        self.eta_p.step(step)
        self.eta_l.step(step)
        # Unpack the first transition from the nested sample structure
        s, a, r, s_ = (sample[0][1][0], sample[0][1][1],
                       sample[0][1][2], sample[0][1][3])
        delta = self.bellman_error(s, a, r, s_)
        # Gradient step
        self.vpl.shrink(1. - self.lossL)
        W = np.zeros((self.dim_a,))
        W[0] = -1 * self.eta_v.value
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)
        lgrad_temp = lmat * (a - pi) * (a - pi)
        W[1] = -self.eta_p.value * lmat * lmat * (a - pi)
        W[-1] = lgrad_temp * self.eta_l.value
        self.vpl.append(np.array(s), -delta * np.reshape(W, (1, -1)))
        # Prune
        self.vpl.prune(self.eps)
        modelOrder_ = len(self.vpl.D)
        loss = 0.5 * self.bellman_error(s, a, r, s_) ** 2  # + self.model_error()
        return (float(loss), float(modelOrder_))

    def bellman_error(self, s, a, r, s_):
        if s_ is None:
            return r - self.get_q(s, a)
        else:
            return r + self.gamma * self.get_v(s_) - self.get_q(s, a)

    def predict(self, s):
        # Predict the Q function values for a batch of states.
        return self.vpl(s)

    def predictOne(self, s):
        # Predict the Q function values for a single state.
        return self.vpl(np.reshape(s, (1, -1)))

    def model_error(self):
        return 0.5 * self.lossL * self.vpl.normsq()