# Imports assumed by the classes below. The module paths for the project-local
# helpers (KernelRepresentation, ScheduledParameter, PrioritizedMemory) are
# placeholders and may differ in this repository.
import json
import pickle
import random

import numpy as np

from function import KernelRepresentation    # assumed module path
from corerl.core import ScheduledParameter   # assumed module path
from corerl.memory import PrioritizedMemory  # assumed module path


class KQGreedyModel(object):
    def __init__(self, stateCount, actionCount, config):
        self.Q = KernelRepresentation(stateCount + actionCount, 2, config)
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # TD-loss expectation approximation rate
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Running estimate of our expected TD-loss
        self.y = 0

    def train(self, step, x, x_, nonterminal, delta, g, gamma):
        self.eta.step(step)
        self.beta.step(step)
        self.Q.shrink(1. - self.eta.value * self.lossL)
        # Stack sample points
        X = np.vstack((x, x_[nonterminal]))
        W = np.zeros((len(X), 2))
        N = float(len(delta))
        # Compute gradient weights
        W[:len(x), 0] = self.eta.value / N * delta
        W[len(x):, 0] = -self.eta.value / N * gamma * g[nonterminal]
        W[:len(x), 1] = self.beta.value / N * (delta - g)
        self.Q.append(X, W)
        # Prune
        self.Q.prune((self.eps / N)**2 * self.eta.value**2 / self.beta.value)

    def evaluate(self, xs):
        "Evaluate the Q function for a list of (s,a) pairs."
        return np.reshape(self.Q(np.array(xs))[:, 0], (-1, 1))

    def evaluateOne(self, x):
        "Evaluate the Q function for a single (s,a) pair."
        return self.Q(x)[:, 0]

    def maximize(self, ss):
        "Find the maximizing action for a batch of states."
        return [self.Q.argmax(s) for s in ss]

    def maximizeOne(self, s):
        "Find the maximizing action for a single state."
        return self.Q.argmax(s)

    def model_error(self):
        return 0.5 * self.lossL * self.Q.normsq()
class KQTabAgent(object):
    def __init__(self, env, config):
        self.config = config
        self.folder = config.get('Folder', 'exp')
        self.dim_s = env.stateCount
        self.dim_a = env.actionCount
        self.sarsa_steps = config.getint('SARSASteps', 100000)
        self.last_avg_error = 0
        self.avg_error = 0
        self.model = KQTabModel(config)
        # How many steps we have observed
        self.steps = 0
        # ---- Configure exploration
        self.epsilon = ScheduledParameter('ExplorationRate', config)
        self.epsilon.step(0)
        # ---- Configure rewards
        self.gamma = config.getfloat('RewardDiscount')

    def act(self, s, stochastic=True):
        "Decide what action to take in state s."
        if stochastic and (random.random() < self.epsilon.value):
            action = self.model.getRandomAction()
        else:
            (s1n, s2n) = self.model.getStateIndex(s)
            action = self.model.getAction(np.argmax(self.model.table[s1n, s2n, :]))
        return np.array([action])

    def observe(self, sample):
        self.lastSample = sample
        self.steps += 1
        self.epsilon.step(self.steps)

    def improve(self):
        loss = self.model.train(self.steps, self.lastSample)
        return loss

    @property
    def metrics_names(self):
        return self.model.metrics_names
class KSARSAAgent2(object):
    def __init__(self, env, config):
        self.stateCount = env.stateCount
        self.actionCount = env.actionCount
        self.max_act = 5
        # We can switch between SCGD and TD learning here
        algorithm = config.get('Algorithm', 'TD')
        if algorithm.lower() == 'scgd':
            self.model = KSARSAModelSCGD2(self.stateCount, self.actionCount, config)
        elif algorithm.lower() == 'td':
            self.model = KSARSAModelTD2(self.stateCount, self.actionCount, config)
        else:
            raise ValueError('Unknown algorithm: {}'.format(algorithm))
        # How many steps we have observed
        self.steps = 0
        # ---- Configure exploration
        self.epsilon = ScheduledParameter('ExplorationRate', config)
        self.epsilon.step(0)
        # ---- Configure rewards
        self.gamma = config.getfloat('RewardDiscount')

    def act(self, s, stochastic=True):
        "Decide what action to take in state s."
        if stochastic and (random.random() < self.epsilon.value):
            return np.random.uniform(-self.max_act, self.max_act, (self.actionCount, 1))
        else:
            return self.model.Q.argmax(s)

    def observe(self, sample):
        self.lastSample = sample
        self.steps += 1
        self.epsilon.step(self.steps)

    def improve(self):
        return self.model.train(self.steps, self.lastSample)

    def bellman_error(self, s, a, r, s_, a_):
        return self.model.bellman_error(s, a, r, s_, a_)

    def model_error(self):
        return self.model.model_error()

    @property
    def metrics_names(self):
        return self.model.metrics_names
class KGreedyQAgent(object):
    def __init__(self, env, config):
        self.stateCount = env.stateCount
        self.actionCount = env.actionCount
        # ---- Configure exploration
        self.epsilon = ScheduledParameter('ExplorationRate', config)
        self.epsilon.step(0)
        self.min_act = np.reshape(json.loads(config.get('MinAction')), (-1, 1))
        self.max_act = np.reshape(json.loads(config.get('MaxAction')), (-1, 1))
        self.act_mult = config.getfloat('ActMultiplier', 1)
        # How many steps we have observed
        self.steps = 0
        self.lastSample = None
        # Initialize model
        self.model = KGreedyQModel(self.stateCount, self.actionCount, config)

    def act(self, s, stochastic=True):
        "Decide what action to take in state s."
        if stochastic and (random.random() < self.epsilon.value):
            a = np.random.uniform(self.act_mult * self.min_act,
                                  self.act_mult * self.max_act)
        else:
            a = self.model.Q.argmax(s)
        return np.reshape(np.clip(a, self.min_act, self.max_act), (-1,))

    def observe(self, sample):
        self.lastSample = sample
        self.steps += 1
        self.epsilon.step(self.steps)

    def improve(self):
        return self.model.train(self.steps, self.lastSample)

    def bellman_error(self, s, a, r, s_):
        return self.model.bellman_error(s, a, r, s_)

    def model_error(self):
        return self.model.model_error()

    @property
    def metrics_names(self):
        return self.model.metrics_names
class KTDModel(object):
    def __init__(self, stateCount, config):
        self.V = KernelRepresentation(stateCount, 1, config)
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-6)
        # Representation error budget
        self.eps = ScheduledParameter('RepresentationError', config)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def bellman_error(self, s, a, r, s_):
        if s_ is None:
            return r - self.V(s)
        else:
            return r + self.gamma * self.V(s_) - self.V(s)

    def model_error(self):
        return 0.5 * self.lossL * self.V.normsq()

    def train(self, step, sample):
        self.eta.step(step)
        self.eps.step(step)
        # Unpack sample
        s, a, r, s_ = sample
        # Compute error
        delta = self.bellman_error(s, a, r, s_)
        # Gradient step
        self.V.shrink(1. - self.eta.value * self.lossL)
        self.V.append(s, self.eta.value * delta)
        # Prune
        modelOrder = len(self.V.D)
        self.V.prune(self.eps.value * self.eta.value**2)
        modelOrder_ = len(self.V.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_)**2 + self.model_error()
        return (float(loss), float(modelOrder_), self.eta.value)

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order', 'Step Size')
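# Illustrative only (not used by the classes in this file): a minimal sketch of
# the configuration keys that KTDModel reads, assuming the config object behaves
# like a configparser section (config.get / config.getfloat). The schedule syntax
# accepted by ScheduledParameter is not shown here, so plain constants are used
# as placeholder values; the section name and this helper are hypothetical.
def _example_ktd_config():
    from configparser import ConfigParser
    parser = ConfigParser()
    parser.read_dict({'KTD': {
        'LearningRate': '0.1',          # eta
        'Regularization': '1e-6',       # lossL
        'RepresentationError': '0.01',  # eps (pruning budget)
        'RewardDiscount': '0.99',       # gamma
    }})
    return parser['KTD']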
class KSARSAModelSCGD2(KSARSAModel2):
    def __init__(self, stateCount, actionCount, config):
        super(KSARSAModelSCGD2, self).__init__(stateCount, actionCount, config)
        # TD-loss expectation approximation rate
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Running estimate of our expected TD-loss
        self.y = 0.

    def train(self, step, sample):
        self.eta.step(step)
        self.beta.step(step)
        # Unpack sample
        s, a, r, s_, a_ = sample
        # Compute error
        delta = self.bellman_error(s, a, r, s_, a_)
        # Running average
        self.y += self.beta.value * (delta - self.y)
        # Gradient step
        self.Q.shrink(1. - self.eta.value * self.lossL)
        x = np.concatenate((np.reshape(s, (1, -1)), np.reshape(a, (1, -1))), axis=1)
        if s_ is None:
            self.Q.append(x, self.eta.value * self.y)
        else:
            x_ = np.concatenate((np.reshape(s_, (1, -1)), np.reshape(a_, (1, -1))), axis=1)
            W = np.zeros((2, 1))
            W[0] = -1.
            W[1] = self.gamma
            self.Q.append(np.vstack((x, x_)), -self.eta.value * self.y * W)
        # Prune
        modelOrder = len(self.Q.D)
        self.Q.prune(self.eps * self.eta.value**2)
        modelOrder_ = len(self.Q.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_, a_)**2 + self.model_error()
        return (float(loss), float(modelOrder_))
class KSCGDModel(object):
    def __init__(self, stateCount, config):
        self.V = KernelRepresentation(stateCount, 1, config)
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-6)
        # Representation error budget
        self.eps = ScheduledParameter('RepresentationError', config)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')
        # TD-loss expectation approximation rate
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Running estimate of our expected TD-loss
        self.y = 0.

    def bellman_error(self, s, a, r, s_):
        if s_ is None:
            return r - self.V(s)
        else:
            return r + self.gamma * self.V(s_) - self.V(s)

    def model_error(self):
        return 0.5 * self.lossL * self.V.normsq()

    def train(self, step, sample):
        self.eta.step(step)
        self.eps.step(step)
        self.beta.step(step)
        # Unpack sample
        s, a, r, s_ = sample
        # Compute error
        delta = self.bellman_error(s, a, r, s_)
        # Running average
        self.y += self.beta.value * (delta - self.y)
        # Gradient step
        self.V.shrink(1. - self.eta.value * self.lossL)
        if s_ is None:
            self.V.append(s, -self.eta.value * self.y * np.array([[-1.]]))
        else:
            W = np.zeros((2, 1))
            W[0] = -1.
            W[1] = self.gamma
            self.V.append(np.vstack((s, s_)), -self.eta.value * self.y * W)
        # Prune
        modelOrder = len(self.V.D)
        self.V.prune(self.eps.value * self.eta.value**2)
        modelOrder_ = len(self.V.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_)**2 + self.model_error()
        return (float(loss), float(modelOrder_), self.eta.value, self.beta.value)

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order', 'Step Size', 'Averaging Coefficient')
class KNAFAgent(object):
    def __init__(self, env, config):
        self.stateCount = env.stateCount
        self.actionCount = env.actionCount
        # How many steps we have observed
        self.steps = 0
        self.gamma = config.getfloat('RewardDiscount')
        # ---- Configure exploration
        self.epsilon = ScheduledParameter('ExplorationRate', config)
        self.epsilon.step(0)
        self.noise_var = ScheduledParameter('NoiseVariance', config)
        self.noise_var.step(0)
        self.min_act = np.reshape(json.loads(config.get('MinAction')), (-1, 1))
        self.max_act = np.reshape(json.loads(config.get('MaxAction')), (-1, 1))
        # ---- Initialize model
        if config.get('LoadModel'):
            fname = config.get('LoadModel')
            self.model = pickle.load(open(fname, "rb"))
        else:
            self.model = KNAFModel(self.stateCount, self.actionCount, config)

    def act(self, s, stochastic=True):
        "Decide what action to take in state s."
        a = self.model.get_pi(s)
        if stochastic:
            # If exploring, add Gaussian noise to the greedy action
            a = a + np.random.normal(0, self.noise_var.value, self.actionCount)
        return np.reshape(np.clip(a, self.min_act, self.max_act), (-1,))

    def observe(self, sample):
        self.lastSample = sample
        self.steps += 1
        self.epsilon.step(self.steps)
        self.noise_var.step(self.steps)

    def improve(self):
        return self.model.train(self.steps, self.lastSample)

    def bellman_error(self, s, a, r, s_):
        return self.model.bellman_error(s, a, r, s_)

    def model_error(self):
        return self.model.model_error()

    @property
    def metrics_names(self):
        return self.model.metrics_names
class KQLearningModelSCGD(KQLearningModel):
    def __init__(self, stateCount, actionCount, config):
        super(KQLearningModelSCGD, self).__init__(stateCount, actionCount, config)
        # TD-loss expectation approximation rate
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Running estimate of our expected TD-loss
        self.y = 0.

    def train(self, step, sample):
        self.eta.step(step)
        self.beta.step(step)
        # Unpack sample
        s, a, r, s_ = sample
        # Compute error
        delta = self.bellman_error(s, a, r, s_)
        # Running average of TD-error
        self.y += self.beta.value * (delta - self.y)
        # Gradient step
        self.Q.shrink(1. - self.eta.value * self.lossL)
        if s_ is None:
            W = np.zeros((1, self.Q.W.shape[1]))
            W[0, a] = -1.
            self.Q.append(s, -self.eta.value * self.y * W)
        else:
            a_ = self.predictOne(s_).argmax()
            W = np.zeros((2, self.Q.W.shape[1]))
            W[0, a] = -1.
            # Note: the next-state weight is left at 0 rather than self.gamma,
            # so the gradient through Q(s_, a_) is dropped in this update.
            W[1, a_] = 0  # self.gamma
            self.Q.append(np.vstack((s, s_)), -self.eta.value * self.y * W)
        # Prune
        modelOrder = len(self.Q.D)
        self.Q.prune(self.eps * self.eta.value**2)
        modelOrder_ = len(self.Q.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_)**2 + self.model_error()
        return (float(loss), float(modelOrder_))
class KGreedyQModel(object):
    def __init__(self, stateCount, actionCount, config):
        self.Q = KernelRepresentation(stateCount + actionCount, 2, config)
        # Learning rates
        self.eta = ScheduledParameter('LearningRate', config)
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def get_q(self, x):
        return self.Q(x)[0][0]

    def get_g(self, x):
        return self.Q(x)[0][1]

    def bellman_error(self, s, a, r, s_):
        x = np.concatenate((np.reshape(s, (1, -1)), np.reshape(a, (1, -1))), axis=1)
        if s_ is None:
            return r - self.get_q(x)
        else:
            a_ = self.Q.argmax(s_)
            x_ = np.concatenate((np.reshape(s_, (1, -1)), np.reshape(a_, (1, -1))), axis=1)
            return r + self.gamma * self.get_q(x_) - self.get_q(x)

    def bellman_error2(self, x, r, x_):
        if x_ is None:
            return r - self.get_q(x)
        else:
            return r + self.gamma * self.get_q(x_) - self.get_q(x)

    def model_error(self):
        return 0.5 * self.lossL * self.Q.normsq()

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order')

    def train(self, step, sample):
        self.eta.step(step)
        self.beta.step(step)
        # Unpack sample and compute error
        s, a, r, s_ = sample
        x = np.concatenate((np.reshape(np.array(s), (1, -1)),
                            np.reshape(np.array(a), (1, -1))), axis=1)
        if s_ is None:
            x_ = None
        else:
            a_ = self.Q.argmax(s_)
            x_ = np.concatenate((np.reshape(np.array(s_), (1, -1)),
                                 np.reshape(np.array(a_), (1, -1))), axis=1)
        delta = self.bellman_error2(x, r, x_)
        # Gradient step
        self.Q.shrink(1. - self.eta.value * self.lossL)
        if s_ is None:
            W = np.zeros((1, 2))
            W[0, 0] = self.eta.value * delta
            W[0, 1] = self.beta.value * (delta - self.get_g(x))
            self.Q.append(x, W)
        else:
            W = np.zeros((2, 2))
            W[0, 0] = self.eta.value * delta
            W[1, 0] = -self.eta.value * self.gamma * self.get_g(x)
            W[0, 1] = self.beta.value * (delta - self.get_g(x))
            self.Q.append(np.vstack((x, x_)), W)
        # Prune
        self.Q.prune(self.eps**2 * self.eta.value**2 / self.beta.value)
        modelOrder_ = self.Q.model_order()
        # Compute new error
        loss = 0.5 * self.bellman_error2(x, r, x_)**2 + self.model_error()  # TODO: should we include model error here?
        return (float(loss), float(modelOrder_))
class KNAFModel(object):
    def __init__(self, stateCount, actionCount, config):
        # Dimensions of V, pi and L
        self.dim_v = 1
        self.dim_p = actionCount
        self.dim_l = 1  # TODO: generalize to (1 + actionCount) * actionCount / 2
        self.dim_a = self.dim_v + self.dim_p + self.dim_l
        # Action space bounds
        self.min_act = np.reshape(json.loads(config.get('MinAction')), (-1, 1))
        self.max_act = np.reshape(json.loads(config.get('MaxAction')), (-1, 1))
        # Initialization of L
        self.init_l = config.getfloat('InitL', 0.01)
        # Represent V, pi, L in one RKHS
        self.vpl = KernelRepresentation(stateCount, self.dim_a, config)
        # Learning rates
        self.eta_v = ScheduledParameter('LearningRateV', config)
        self.eta_p = ScheduledParameter('LearningRateP', config)
        self.eta_l = ScheduledParameter('LearningRateL', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-6)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def get_q(self, s, a):
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)
        if self.dim_p > 1:
            return self.get_v(s) - 0.5 * ((a - pi).T).dot(lmat).dot(lmat.T).dot(a - pi)
        else:
            return np.array([self.get_v(s) - 0.5 * (a - pi) * lmat * lmat * (a - pi)])

    def get_v(self, s):
        return np.array([self.predictOne(s)[0, 0]])

    def get_pi(self, s):
        pi = self.predictOne(s)[0, 1:self.dim_p + 1]
        return np.reshape(np.clip(pi, self.min_act, self.max_act), (-1,))

    def get_lmat(self, s):
        lmat = np.zeros((self.dim_p, self.dim_p))
        temp = self.predictOne(s)  # shape (1, dim_a); index row 0 below
        if self.dim_p > 1:
            lmat[np.tril_indices(self.dim_p)] = temp[0, self.dim_p + 1:]
            return lmat + self.init_l * np.eye(self.dim_p)
        else:
            return np.array([temp[0, 2] + self.init_l])

    def bellman_error(self, s, a, r, s_):
        if s_ is None:
            return r - self.get_q(s, a)
        else:
            return r + self.gamma * self.get_v(s_) - self.get_q(s, a)

    def model_error(self):
        return 0.5 * self.lossL * self.vpl.normsq()

    def predict(self, s):
        "Predict the Q function values for a batch of states."
        return self.vpl(s)

    def predictOne(self, s):
        "Predict the Q function values for a single state."
        return self.vpl(np.reshape(s, (1, -1)))

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order')

    def train(self, step, sample):
        self.eta_v.step(step)
        self.eta_p.step(step)
        self.eta_l.step(step)
        # Unpack sample
        s, a, r, s_ = sample
        # Compute error
        delta = self.bellman_error(s, a, r, s_)
        # Gradient step
        self.vpl.shrink(1. - self.lossL)
        W = np.zeros((self.dim_a,))
        # V gradient
        W[0] = -1. * self.eta_v.value
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)
        # Pi and L gradients
        if self.dim_p > 1:
            W[1:self.dim_p + 1] = -self.eta_p.value * np.matmul(
                np.matmul(lmat, np.transpose(lmat)), a - pi)
            lgrad_temp = np.matmul(np.matmul(np.transpose(lmat), a - pi),
                                   np.transpose(a - pi))
            W[self.dim_p + 1:self.dim_a] = self.eta_l.value * np.reshape(
                lgrad_temp[np.tril_indices(self.dim_p)], (-1,))
        else:
            lgrad_temp = lmat * (a - pi) * (a - pi)
            W[1] = -self.eta_p.value * lmat * lmat * (a - pi)
            W[-1] = lgrad_temp * self.eta_l.value
        self.vpl.append(np.array(s), -delta * np.reshape(W, (1, -1)))
        # Prune
        self.vpl.prune(self.eps)
        modelOrder_ = len(self.vpl.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_)**2  # + self.model_error()
        return (float(loss), float(modelOrder_))
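# Illustrative only: for a scalar action, KNAFModel.get_q composes the three
# heads of the RKHS model as a quadratic (NAF-style) advantage,
#   Q(s, a) = V(s) - 0.5 * (a - pi(s)) * L(s)^2 * (a - pi(s)),
# so the greedy action is always pi(s). The default numbers below are made up.
def _example_naf_q(v=1.0, pi=0.3, lmat=0.5, a=0.7):
    return v - 0.5 * (a - pi) * lmat * lmat * (a - pi)  # 0.98 for the defaults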
class KQLearningAgentPER(object):
    def __init__(self, env, config):
        self.stateCount = env.stateCount
        self.actionCount = env.actionCount
        self.min_act = env.env.action_space.low
        self.max_act = env.env.action_space.high
        self.max_model_order = config.getfloat('MaxModelOrder', 10000)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')
        self.act_mult = config.getfloat('ActMultiplier', 1)
        self.rand_act = True
        # ---- Configure batch size
        self.batchSize = config.getint('MinibatchSize', 16)
        # We can switch between GTD, TD and hybrid learning here
        algorithm = config.get('Algorithm', 'gtd').lower()
        if algorithm in ('gtd', 'td', 'hybrid'):
            self.model = KQLearningModel(self.stateCount, self.actionCount, config)
        else:
            raise ValueError('Unknown algorithm: {}'.format(algorithm))
        # How many steps we have observed
        self.steps = 0
        # ---- Configure exploration
        self.epsilon = ScheduledParameter('ExplorationRate', config)
        self.epsilon.step(0)
        # ---- Configure prioritized experience replay
        self.memory = None
        self.eps = config.getfloat('ExperiencePriorityMinimum', 0.01)
        self.alpha = config.getfloat('ExperiencePriorityExponent', 1.)

    def _getStates(self, batch):
        def assemble(s, a):
            return np.concatenate((s.reshape((1, -1)), a.reshape((1, -1))), axis=1).flatten()

        x = np.array([assemble(e[0], e[1]) for (_, e) in batch])
        x_ = np.zeros((len(batch), self.stateCount + self.actionCount))
        nonterminal = []
        rand_act = []
        for i, (_, e) in enumerate(batch):
            if e[3] is not None:
                a_ = self.model.maximizeOne(e[3])
                x_[i] = assemble(e[3], a_)
                nonterminal.append(i)
            if self.model.algorithm == 'hybrid':
                # Assumes samples initially added to the buffer came from random actions
                if len(e) == 4 or e[4]:
                    rand_act.append(i)
        r = np.array([e[2] for (_, e) in batch])
        if self.model.algorithm == 'hybrid':
            return x, x_, nonterminal, r, rand_act
        else:
            return x, x_, nonterminal, r

    def _computeError(self, x, x_, nonterminal, r, rand_act=None):
        error = r.reshape((-1, 1)) - self.model.evaluate(x)
        error[nonterminal] += self.gamma * self.model.evaluate(x_[nonterminal])
        return error

    def bellman_error(self, s, a, r, s_):
        x = np.concatenate((np.reshape(s, (1, -1)), np.reshape(a, (1, -1))), axis=1)
        if s_ is None:
            return r - self.model.evaluateOne(x)
        else:
            a_ = self.model.maximizeOne(s_)
            x_ = np.concatenate((np.reshape(s_, (1, -1)), np.reshape(a_, (1, -1))), axis=1)
            return r + self.gamma * self.model.evaluateOne(x_) - self.model.evaluateOne(x)

    def act(self, s, stochastic=True):
        "Decide what action to take in state s."
        if stochastic and (random.random() < self.epsilon.value):
            a = np.random.uniform(self.act_mult * self.min_act,
                                  self.act_mult * self.max_act)
            self.rand_act = True
        else:
            a = self.model.Q.argmax(s)
            self.rand_act = False
        return np.reshape(np.clip(a, self.min_act, self.max_act), (-1,))

    def observe(self, sample):
        error = self._computeError(*self._getStates([(0, sample)]))
        if self.model.algorithm == 'hybrid':
            s, a, r, s_ = sample
            sample = (s, a, r, s_, self.rand_act)
        self.memory.add(sample, np.abs((error[0] + self.eps)**self.alpha))
        self.steps += 1
        self.epsilon.step(self.steps)

    def improve(self):
        batch = self.memory.sample(self.batchSize)
        if self.model.algorithm == 'hybrid':
            x, x_, nt, r, rand_act = self._getStates(batch)
        else:
            x, x_, nt, r = self._getStates(batch)
        # Compute Bellman error
        error = self._computeError(x, x_, nt, r)
        # Update model
        if self.model.algorithm == 'hybrid':
            self.model.train(self.steps, x, x_, nt, error, self.gamma, rand_act)
        else:
            self.model.train(self.steps, x, x_, nt, error, self.gamma)
        # Compute updated error
        error = self._computeError(x, x_, nt, r)
        # Update priorities in memory
        if type(self.memory) == PrioritizedMemory:
            for (idx, _), delta in zip(batch, error):
                self.memory.update(idx, (np.abs(delta) + self.eps)**self.alpha)
        # Compute our average minibatch loss
        loss = 0.5 * np.mean(error**2) + self.model.model_error()
        # Compute our model order
        model_order = len(self.model.Q.D)
        # Report metrics
        return (float(loss), float(model_order))

    def model_error(self):
        return self.model.model_error()

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order')
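# Illustrative only: KQLearningAgentPER leaves self.memory as None, so a replay
# buffer must be attached before observe()/improve() are called. The
# PrioritizedMemory constructor signature used below (a single capacity
# argument) is an assumption; it is not defined in this file.
def _attach_replay_memory(agent, capacity=100000):
    agent.memory = PrioritizedMemory(capacity)
    return agent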
class KQLearningModel(object):
    def __init__(self, stateCount, actionCount, config):
        self.Q = KernelRepresentation(stateCount + actionCount, 1, config)
        # Algorithm: 'gtd', 'td' or 'hybrid'
        self.algorithm = config.get('Algorithm', 'td').lower()
        # Learning rate
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)
        self.phi = config.getfloat('Phi', 0.0)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # TD-loss expectation approximation rate
        self.beta = ScheduledParameter('ExpectationRate', config)
        # Running estimate of our expected TD-loss
        self.y = 0

    def train(self, step, x, x_, nonterminal, delta, gamma, rand_act=None):
        self.eta.step(step)
        self.beta.step(step)
        yy = self.y + self.beta.value * (delta - self.y)
        self.Q.shrink(1. - self.eta.value * self.lossL)
        # Stack sample points
        if self.algorithm == 'hybrid':
            nonterminal = list(set(nonterminal) & set(rand_act))
        if self.algorithm == 'gtd' or self.algorithm == 'hybrid':
            X = np.vstack((x, x_[nonterminal]))
            W = np.zeros((len(X), 1))
            N = float(len(delta))
            W[:len(x)] = self.eta.value / N * yy
            W[len(x):] = -self.phi * self.eta.value / N * gamma * yy[nonterminal]
            self.y = np.mean(yy)  # Running average of TD error
        elif self.algorithm == 'td':
            X = x
            N = float(len(delta))
            W = self.eta.value / N * yy
            self.y = np.mean(yy)  # Running average of TD error
        else:
            raise ValueError('Unknown algorithm: {}'.format(self.algorithm))
        self.Q.append(X, W)
        # Prune
        self.Q.prune((self.eps * self.eta.value**2)**2)

    def evaluate(self, xs):
        "Evaluate the Q function for a list of (s,a) pairs."
        return self.Q(np.array(xs))

    def evaluateOne(self, x):
        "Evaluate the Q function for a single (s,a) pair."
        return self.Q(x)

    def maximize(self, ss):
        "Find the maximizing action for a batch of states."
        return [self.Q.argmax(s) for s in ss]

    def maximizeOne(self, s):
        "Find the maximizing action for a single state."
        return self.Q.argmax(s)

    def model_error(self):
        return 0.5 * self.lossL * self.Q.normsq()
class KNAFIIDModel(object):
    def __init__(self, stateCount, actionCount, config):
        # Dimensions of V, pi and L
        self.dim_v = 1
        self.dim_p = actionCount
        self.dim_l = 1  # TODO: generalize to (1 + actionCount) * actionCount / 2
        self.dim_a = self.dim_v + self.dim_p + self.dim_l
        # Action space bounds
        self.min_act = np.reshape(json.loads(config.get('MinAction')), (-1, 1))
        self.max_act = np.reshape(json.loads(config.get('MaxAction')), (-1, 1))
        # Initialization of L
        self.init_l = config.getfloat('InitL', 0.01)
        # Represent V, pi, L in one RKHS
        self.vpl = KernelRepresentation(stateCount, self.dim_a, config)
        # Learning rates
        self.eta_v = ScheduledParameter('LearningRateV', config)
        self.eta_p = ScheduledParameter('LearningRateP', config)
        self.eta_l = ScheduledParameter('LearningRateL', config)
        self.eta = ScheduledParameter('LearningRate', config)
        # Regularization
        self.lossL = config.getfloat('Regularization', 1e-4)
        # Representation error budget
        self.eps = config.getfloat('RepresentationError', 1.0)
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')

    def get_q(self, s, a):
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)
        return np.array([self.get_v(s) - 0.5 * (a - pi) * lmat * lmat * (a - pi)])

    def get_v(self, s):
        return np.array([self.predictOne(s)[0, 0]])

    def get_pi(self, s):
        pi = self.predictOne(s)[0, 1:self.dim_p + 1]
        return np.reshape(np.clip(pi, self.min_act, self.max_act), (-1,))

    def get_lmat(self, s):
        lmat = np.zeros((self.dim_p, self.dim_p))
        temp = self.predictOne(s)  # shape (1, dim_a); index row 0 below
        if self.dim_p > 1:
            lmat[np.tril_indices(self.dim_p)] = temp[0, self.dim_p + 1:]
            return lmat + self.init_l * np.eye(self.dim_p)
        else:
            return np.array([temp[0, 2] + self.init_l])

    def train(self, step, sample):
        self.eta_v.step(step)
        self.eta_p.step(step)
        self.eta_l.step(step)
        # Unpack the (index, experience) pair returned by the replay memory
        s, a, r, s_ = sample[0][1][0], sample[0][1][1], sample[0][1][2], sample[0][1][3]
        delta = self.bellman_error(s, a, r, s_)
        # Gradient step
        self.vpl.shrink(1. - self.lossL)
        W = np.zeros((self.dim_a,))
        W[0] = -1. * self.eta_v.value
        lmat = self.get_lmat(s)
        pi = self.get_pi(s)
        lgrad_temp = lmat * (a - pi) * (a - pi)
        W[1] = -self.eta_p.value * lmat * lmat * (a - pi)
        W[-1] = lgrad_temp * self.eta_l.value
        self.vpl.append(np.array(s), -delta * np.reshape(W, (1, -1)))
        # Prune
        self.vpl.prune(self.eps)
        modelOrder_ = len(self.vpl.D)
        # Compute new error
        loss = 0.5 * self.bellman_error(s, a, r, s_)**2  # + self.model_error()
        return (float(loss), float(modelOrder_))

    def bellman_error(self, s, a, r, s_):
        if s_ is None:
            return r - self.get_q(s, a)
        else:
            return r + self.gamma * self.get_v(s_) - self.get_q(s, a)

    def predict(self, s):
        "Predict the Q function values for a batch of states."
        return self.vpl(s)

    def predictOne(self, s):
        "Predict the Q function values for a single state."
        return self.vpl(np.reshape(s, (1, -1)))

    def model_error(self):
        return 0.5 * self.lossL * self.vpl.normsq()
class KNAFIIDAgent(object):
    def __init__(self, env, config):
        self.stateCount = env.stateCount
        self.actionCount = env.actionCount
        self.min_act = env.env.action_space.low
        self.max_act = env.env.action_space.high
        self.memory = None
        # Reward discount
        self.gamma = config.getfloat('RewardDiscount')
        # ---- Configure batch size
        self.batchSize = config.getint('MinibatchSize', 1)
        self.model = KNAFIIDModel(self.stateCount, self.actionCount, config)
        # How many steps we have observed
        self.steps = 0
        # ---- Configure exploration
        self.epsilon = ScheduledParameter('ExplorationRate', config)
        self.epsilon.step(0)
        # ---- Configure prioritized experience replay
        self.eps = config.getfloat('ExperiencePriorityMinimum', 0.01)
        self.alpha = config.getfloat('ExperiencePriorityExponent', 1.)
        self.noise_var = ScheduledParameter('NoiseVariance', config)
        self.noise_var.step(0)

    def act(self, s, stochastic=True):
        "Decide what action to take in state s."
        a = self.model.get_pi(s)
        if stochastic:
            # If exploring, add Gaussian noise to the greedy action
            a = a + np.random.normal(0, self.noise_var.value, self.actionCount)
        return np.reshape(np.clip(a, self.min_act, self.max_act), (-1,))

    def observe(self, sample):
        error = self.model.bellman_error(sample[0], sample[1], sample[2], sample[3])
        self.memory.add(sample, np.abs((error[0] + self.eps)**self.alpha))
        self.steps += 1
        self.epsilon.step(self.steps)
        self.noise_var.step(self.steps)

    def improve(self):
        sample = self.memory.sample(1)
        s, a, r, s_ = sample[0][1][0], sample[0][1][1], sample[0][1][2], sample[0][1][3]
        # Update model
        self.model.train(self.steps, sample)
        # Compute updated error
        error = self.model.bellman_error(s, a, r, s_)
        # Compute our average minibatch loss
        loss = 0.5 * np.mean(error**2) + self.model.model_error()
        # Compute our model order
        modelOrder = len(self.model.vpl.D)
        # Report metrics
        return (float(loss), float(modelOrder))

    def model_error(self):
        return self.model.model_error()

    def bellman_error(self, s, a, r, s_):
        return self.model.bellman_error(s, a, r, s_)

    @property
    def metrics_names(self):
        return ('Training Loss', 'Model Order')
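# Illustrative only: a minimal interaction loop showing how these agents are
# driven (act -> environment step -> observe -> improve). The environment
# wrapper `env` and its reset/step interface are assumptions; terminal
# transitions are reported with s_ = None, matching the bellman_error
# handling in the models above.
def _example_episode(agent, env):
    s = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        a = agent.act(s, stochastic=True)
        s_, r, done = env.step(a)
        agent.observe((s, a, r, None if done else s_))
        agent.improve()
        total_reward += r
        s = s_
    return total_reward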