def weight_example():
    import matplotlib.pyplot as plt
    # s2_means = np.array([-100.0, 10.0])
    # s2_vars = np.array([1.0, 10.0])
    # c_mean = 5.0
    # hist = []
    # var_avg = []
    # mean_disp = []
    # for log_c_var in np.arange(-2.5, 5.0, 0.1):
    #     outputs = adfq_fun.posterior_adfq(s2_means, s2_vars, c_mean,
    #                                       np.exp(log_c_var), 0.0, 0.9,
    #                                       terminal=0, varTH=1e-10)
    #     hist.append(outputs[1])
    #     var_avg.append(np.sum(outputs[2][1] * outputs[2][2]))
    # f, ax = plt.subplots()
    # ax.plot(np.exp(np.arange(-2.5, 5.0, 0.1)), np.exp(np.arange(-2.5, 5.0, 0.1)), 'k--')
    # ax.plot(np.exp(np.arange(-2.5, 5.0, 0.1)), hist)
    # ax.plot(np.exp(np.arange(-2.5, 5.0, 0.1)), var_avg)
    # ax.plot(np.exp(np.arange(-2.5, 5.0, 0.1)), np.array(hist) - np.array(var_avg))
    # ax.legend(['prior', 'new variance', 'avg variance', 'mean dispersion'])
    # plt.show()
    # pdb.set_trace()

    # Sweep a grid of next-state means and record the ADFQ variance update.
    hist = []
    X = np.arange(-20, 20, 0.1)
    Y = np.arange(-20, 20, 0.1)
    n_vars = np.array([100.0, 100.0])
    for n1 in X:
        for n2 in Y:
            n_means = np.array([n1, n2])
            outputs = adfq_fun.posterior_adfq(n_means, n_vars, 0.0, 1000.0,
                                              0.0, 0.9, terminal=0,
                                              varTH=1e-10)
            hist.append(outputs[1])
    hist = np.reshape(hist, (X.shape[0], Y.shape[0]))

    from mpl_toolkits.mplot3d import Axes3D
    from matplotlib import cm
    from matplotlib.ticker import LinearLocator, FormatStrFormatter

    X, Y = np.meshgrid(X, Y)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    surf = ax.plot_wireframe(X, Y, hist, rstride=10, cstride=10)
    ax.set_xlabel('s_tp1 mean 1')
    ax.set_ylabel('s_tp1 mean 2')
    ax.set_zlabel('Variance update')
    # fig.colorbar(surf, shrink=0.5, aspect=5)
    plt.show()
    pdb.set_trace()
def fun(rewards, test_num):
    alpha = 0.5
    print("Test%d: r=%.2f, r=%.2f" % (test_num, rewards[0], rewards[1]))
    s2_means = np.array([0.0, 0.0])
    s2_vars = np.array([10.0, 10.0])
    c_mean = s2_means[0]
    c_var = s2_vars[0]
    Q = np.zeros((3, 2))

    outputs = adfq_fun.posterior_adfq(s2_means, s2_vars, c_mean, c_var,
                                      rewards[0], 0.9, terminal=0,
                                      varTH=1e-10)
    s2_means[0] = outputs[0]
    s2_vars[0] = outputs[1]
    print("t=1 mean: ", s2_means)
    print("t=1 var: ", s2_vars)

    outputs = adfq_fun.posterior_adfq(s2_means, s2_vars, c_mean, c_var,
                                      rewards[1], 0.9, terminal=0,
                                      varTH=1e-10)
    s2_means[0] = outputs[0]
    s2_vars[0] = outputs[1]
    print("t=2 mean: ", s2_means)
    print("t=2 var: ", s2_vars)

    print("Before Q: ", Q[1, :])
    for r in rewards:
        Q[1, 0] = (1 - alpha) * Q[1, 0] + alpha * (r + 0.9 * max(Q[1, :]))
    print("After Q: ", Q[1, :])
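# A minimal usage sketch (not part of the original file): `fun` above runs two
# sequential ADFQ updates and two Q-learning updates for the same reward pair,
# so it can be exercised directly. The reward values below are assumptions
# chosen only for illustration.
def fun_example():
    fun(np.array([1.0, -1.0]), test_num=1)
    fun(np.array([0.0, 0.0]), test_num=2)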
def adfq_update(q_means, q_vars, sars_tuples):
    print("\nInitial ADFQ Mean and Variance")
    print("means:")
    print(q_means[:2, :])
    print("vars:")
    print(q_vars[:2, :])
    visits = np.zeros(q_means.shape)
    for (i, sars) in enumerate(sars_tuples):
        visits[sars[0], sars[1]] += 1.0
        print("t=%d" % i)
        prior = [q_means[sars[0], sars[1]], q_vars[sars[0], sars[1]]]
        targets = [
            sars[2] + GAMMA * q_means[sars[-1], :],
            GAMMA * GAMMA * q_vars[sars[-1], :]
        ]
        td_err = sars[2] + GAMMA * q_means[sars[-1], :] - q_means[sars[0], sars[1]]
        outputs = adfq_fun.posterior_adfq(q_means[sars[-1], :],
                                          q_vars[sars[-1], :],
                                          q_means[sars[0], sars[1]],
                                          q_vars[sars[0], sars[1]],
                                          sars[2],
                                          GAMMA,
                                          terminal=0,
                                          varTH=1e-10)
        print("sars:", sars)
        q_means[sars[0], sars[1]] = outputs[0]
        q_vars[sars[0], sars[1]] = outputs[1]
        print("means:")
        print(q_means[:2, :])
        print("vars:")
        print(q_vars[:2, :])
        print("mu_bs:", outputs[2][0].astype(np.float16))
        print("target:", sars[2] + GAMMA * q_means[sars[-1], :])
        print("var_bs:", outputs[2][1].astype(np.float16))
        print("k_bs:", outputs[2][2].astype(np.float16))
        print("TD error:", td_err)
        if i > 11:
            display_update(prior, targets, outputs)
            q_tab[sars[0], sars[1]] = q_update(
                sars, q_tab, alpha=1. / (visits[sars[0], sars[1]]))
            print("If Q-learning with alpha:%.2f" % (1. / (visits[sars[0], sars[1]])))
            print(q_tab)
        else:
            q_tab = np.copy(q_means)
    return q_means, q_vars, q_tab
def td_err_effect():
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    s2_means = np.array([0.0, 5.0])
    s2_vars = np.array([10.0, 10.0])
    c_var = 1.0
    td_errs = []
    var_hist = []
    comp = []
    for c_mean in np.arange(-10.0, 20.0, 0.5):
        # s2_means[1] = c_mean - 10.0
        # if len(comp) > 1:
        #     if not(comp[-1][0]) and not(comp[-1][1]):
        #         pdb.set_trace()
        outputs = adfq_fun.posterior_adfq(s2_means, s2_vars, c_mean, c_var,
                                          0.0, 0.9, terminal=0, varTH=1e-10)
        td_errs.append(0.0 + 0.9 * s2_means - c_mean)
        td_targets = 0.9 * s2_means
        comp.append([
            outputs[2][0][0] < td_targets[1],
            outputs[2][0][1] < td_targets[0]
        ])
        var_hist.append(outputs[1])
    # td_errs = np.abs(td_errs)
    td_errs = np.array(td_errs)
    pdb.set_trace()
    ax.plot(td_errs[:, 0], td_errs[:, 1], var_hist, 'bo-')
    ax.set_xlabel('TD error 1')
    ax.set_ylabel('TD error 2')
    ax.set_zlabel('Variance update')
    plt.show()
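# A minimal sketch (assumptions: `adfq_fun` is importable and `posterior_adfq`
# keeps the positional signature used throughout this file). It makes explicit
# the output layout the helpers above rely on: outputs[0] is the updated mean,
# outputs[1] the updated variance, and outputs[2] the per-action statistics
# (mu_bs, var_bs, k_bs).
def posterior_adfq_output_example():
    n_means = np.array([0.0, 5.0])   # next-state action means
    n_vars = np.array([10.0, 10.0])  # next-state action variances
    outputs = adfq_fun.posterior_adfq(n_means, n_vars, 1.0, 10.0, 0.0, 0.9,
                                      terminal=0, varTH=1e-10)
    print("new mean:", outputs[0])
    print("new variance:", outputs[1])
    print("mu_bs, var_bs, k_bs:", outputs[2][0], outputs[2][1], outputs[2][2])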
def learning(self,
             actionPolicy,
             actionParam=None,
             updatePolicy='adfq',
             eval_greedy=False,
             draw=False,
             varTH=1e-5,
             updateParam=None,
             asymptotic=False,
             asymptotic_trigger=1e-8,
             useScale=False,
             noise=0.0,
             noise_c=0.0,
             batch_size=0):
    """Train with ADFQ.

    Parameters
    ----------
    actionPolicy : action policy. See the "action_selection" function below.
    actionParam : a hyperparameter for the chosen action policy, if necessary.
    updatePolicy : 'adfq' for the ADFQ algorithm, 'numeric' for the ADFQ-Numeric
        update, 'adfq-v2' for the ADFQ V2 update (appendix).
    eval_greedy : True to evaluate the current policy during learning.
    draw : True to print out the simulation (for grid and maze domains).
    varTH : variance threshold.
    asymptotic : True to use the asymptotic update.
    asymptotic_trigger : a value to decide when to start the asymptotic update
        if "asymptotic == True".
    useScale : use the scaling trick.
    noise : for the stochastic case, a small noise added to variance[s, a].
    batch_size : batch size. 0 if experience replay is not used.
    """
    if len(self.rewards) == self.env.timeH:
        print("The object has already learned")
        return None

    if (actionPolicy == 'offline') and (len(actionParam) != self.env.timeH):
        print(len(actionParam), self.env.timeH)
        raise ValueError(
            'The given action trajectory does not match with the number of learning steps.'
        )

    np.random.seed()
    self.varTH = varTH

    if batch_size > 0:
        s = self.env.reset(self.np_random)
        while len(self.replayMem[(0, 0)]) < self.memory_size:
            a = np.random.choice(self.env.anum)
            r, s_n, done = self.env.observe(s, a, self.np_random)
            self.store({
                'state': s,
                'action': a,
                'reward': r,
                'state_n': s_n,
                'terminal': done
            })

    s = self.env.reset(self.np_random)
    self.log_scale = 0.0
    while self.step < self.env.timeH:
        if self.step % (int(self.env.timeH / util.EVAL_NUM)) == 0:
            self.Q_err.append(self.err())

        a = self.action_selection(s, actionPolicy, actionParam)

        # Observation
        r, s_n, done = self.env.observe(s, a, self.np_random)
        self.rewards.append(r)
        self.visits[s][a] += 1
        if batch_size > 0:
            self.store({
                'state': s,
                'action': a,
                'reward': r,
                'state_n': s_n,
                'terminal': done
            })
            batch = self.get_batch(s, a, batch_size)
            n_means = self.means[batch['state_n'], :]
            n_vars = self.vars[batch['state_n'], :]
            c_mean = self.means[batch['state'], batch['action']]
            c_var = self.vars[batch['state'], batch['action']]
            reward = batch['reward']
            terminal = batch['terminal']
        else:
            # Record
            self.states.append(s)
            self.actions.append(a)
            n_means = self.means[s_n]
            n_vars = self.vars[s_n]
            c_mean = self.means[s][a]
            c_var = self.vars[s][a]
            reward = r
            terminal = done

        # Update
        self.varTH = varTH / np.exp(self.log_scale, dtype=util.DTYPE)
        if updatePolicy == 'adfq':
            new_mean, new_var, _ = adfq_fun.posterior_adfq(
                n_means,
                n_vars,
                c_mean,
                c_var,
                reward,
                self.discount,
                terminal,
                scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                varTH=self.varTH,
                asymptotic=asymptotic,
                asymptotic_trigger=asymptotic_trigger,
                noise=noise / (1. + self.visits[s][a]),
                noise_c=noise_c / (1. + self.visits[s][a]),
                batch=(batch_size > 0))
        elif updatePolicy == 'numeric':
            new_mean, new_var, _ = adfq_fun.posterior_numeric(
                n_means,
                n_vars,
                c_mean,
                c_var,
                reward,
                self.discount,
                terminal,
                scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                varTH=self.varTH,
                noise=noise / (1. + self.visits[s][a]),
                noise_c=noise_c / (1. + self.visits[s][a]),
                batch=(batch_size > 0))
        elif updatePolicy == 'adfq-v2':
            new_mean, new_var, _ = adfq_fun.posterior_adfq_v2(
                n_means,
                n_vars,
                c_mean,
                c_var,
                reward,
                self.discount,
                terminal,
                scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                varTH=self.varTH,
                asymptotic=asymptotic,
                asymptotic_trigger=asymptotic_trigger,
                noise=noise,
                batch=(batch_size > 0))
        elif updatePolicy == 'hybrid':
            new_mean, new_var, _ = adfq_fun.posterior_hybrid(
                n_means,
                n_vars,
                c_mean,
                c_var,
                reward,
                self.discount,
                terminal,
                scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                varTH=self.varTH,
                noise=noise,
                batch=(batch_size > 0))
        else:
            raise ValueError("No such update policy")

        self.means[s][a] = np.mean(new_mean)
        self.vars[s][a] = np.mean(new_var)  # np.maximum(self.varTH, new_var)

        if useScale:
            delta = np.log(np.mean(self.vars[self.env.eff_states, :]))
            self.vars[self.env.eff_states, :] = np.exp(
                np.log(self.vars[self.env.eff_states, :]) - delta,
                dtype=np.float64)
            self.log_scale = np.maximum(-100.0, self.log_scale + delta)

        if draw:
            self.draw(s, a, self.step, r)

        if eval_greedy and ((self.step + 1) %
                            (int(self.env.timeH / util.EVAL_NUM)) == 0):
            count, rew, _, _ = self.greedy_policy(
                lambda x: self.get_action_egreedy(x, util.EVAL_EPS))
            self.test_counts.append(count)
            self.test_rewards.append(rew)

        s = self.env.reset(self.np_random) if done else s_n
        self.step += 1
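# A standalone sketch of the `useScale` rescaling used in `learning` above
# (illustrative only; the variable names and values here are assumptions).
# The variances are divided by their mean, and the removed factor is
# accumulated in a running log scale so the true variances remain
# recoverable as vars * exp(log_scale).
def scaling_trick_example():
    vars_ = np.array([1e-6, 4e-6, 2e-6])
    log_scale = 0.0
    delta = np.log(np.mean(vars_))
    vars_ = np.exp(np.log(vars_) - delta)          # rescaled to mean ~1
    log_scale = np.maximum(-100.0, log_scale + delta)
    print(vars_, log_scale, vars_ * np.exp(log_scale))  # last term recovers the originals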
def batch_test():
    # Test example
    n_means = np.array(
        [[0.8811533, 0.9458812, 0.78800523, 0.7855228, 0.8548022, 1.0071998,
          0.8347546, 0.8921166, 0.9238528, 0.92809606, 0.9720104, 0.8862572],
         [0.818557, 0.7337841, 1.1270436, 1.239617, 1.5188323, 1.0134017,
          1.0220736, 1.0529686, 1.1472604, 1.0060027, 0.79905474, 1.0601841],
         [8.18557, 0.7337841, 1.1270436, -1.239617, 1.5188323, 1.0134017,
          -10.220736, 1.0529686, -11.472604, 1.0060027, 79.905474, 1.0601841]],
        dtype=np.float32)
    n_vars = np.array(
        [[56.72953, 50.670547, 57.65268, 57.328403, 58.38417, 57.436073,
          56.893364, 54.29608, 57.82789, 51.035233, 50.11123, 56.001694],
         [62.661198, 69.064156, 48.007538, 44.541973, 28.289375, 49.62694,
          56.438583, 51.90718, 51.234684, 46.22086, 53.456276, 41.784817],
         [0.626611, 69.064156, 48.007538, 0.44541973, 0.282893, 49.62694,
          56.438583, 51.90718, 0.051234, 46.22086, 0.053456, 41.784817]],
        dtype=np.float32)
    c_mean = np.array([0.9727963, 0.78518265, 6.2386910], dtype=np.float32)
    c_var = np.array([50.026917, 69.16801, 10.0012345], dtype=np.float64)
    reward = np.array([-0.0119087, -0., 1.0], dtype=np.float64)
    discount = 0.99
    terminal = [0, 1, 0]

    out_batch = adfq_fun.posterior_adfq(n_means, n_vars, c_mean, c_var,
                                        reward, discount, terminal=terminal,
                                        batch=True)
    for i in range(len(n_means)):
        out = adfq_fun.posterior_adfq(n_means[i], n_vars[i], c_mean[i],
                                      c_var[i], reward[i], discount,
                                      terminal=terminal[i])
        if np.abs(out[0] - out_batch[0][i]) > 1e-5:
            print("MISMATCH Mean for ENTRY.%d:" % i, out[0], out_batch[0][i])
        else:
            print("PASS Mean for ENTRY.%d" % i)
        if np.abs(out[1] - out_batch[1][i]) > 1e-5:
            print("MISMATCH Variance for ENTRY.%d:" % i, out[1], out_batch[1][i])
        else:
            print("PASS Variance for ENTRY.%d" % i)
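# Assumed entry point (not confirmed by the original file): the helper
# functions above are self-contained, so a main guard such as this one
# could run the batch consistency check directly.
if __name__ == "__main__":
    batch_test()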
def learning(self,
             actionPolicy,
             actionParam,
             updatePolicy='adfq',
             eval_greedy=False,
             draw=False,
             varTH=1e-10,
             updateParam=None,
             asymptotic=False,
             asymptotic_trigger=1e-8,
             useScale=False,
             noise=0.0,
             batch_size=0,
             change=True,
             beta=0.0):
    """Train with ADFQ.

    Parameters
    ----------
    actionPolicy : action policy. See the "action_selection" function below.
    actionParam : a hyperparameter for the chosen action policy, if necessary.
    updatePolicy : 'adfq' for the ADFQ algorithm, 'numeric' for the ADFQ-Numeric
        update, 'adfq-v2' for the ADFQ V2 update (appendix).
    eval_greedy : True to evaluate the current policy during learning.
    draw : True to print out the simulation (for grid and maze domains).
    varTH : variance threshold.
    asymptotic : True to use the asymptotic update.
    asymptotic_trigger : a value to decide when to start the asymptotic update
        if "asymptotic == True".
    useScale : use the scaling trick.
    noise : for the stochastic case, a small noise added to variance[s, a].
    batch_size : batch size. 0 if experience replay is not used.
    """
    if len(self.rewards) == self.env.timeH:
        print("The object has already learned")
        return None

    if (actionPolicy == 'offline') and (len(actionParam) != self.env.timeH):
        print(len(actionParam), self.env.timeH)
        raise ValueError(
            'The given action trajectory does not match with the number of learning steps.'
        )

    np.random.seed()
    self.Q_target = np.array(self.env.optQ(self.discount))
    self.varTH = varTH
    records = {'t': [], 'k': [], 'var': [], 'mean': []}

    if batch_size > 0:
        s = self.env.reset(self.np_random)
        while len(self.replayMem[(0, 0)]) < self.memory_size:
            a = np.random.choice(self.env.anum)
            r, s_n, done = self.env.observe(s, a, self.np_random)
            self.store({
                'state': s,
                'action': a,
                'reward': r,
                'state_n': s_n,
                'terminal': done
            })

    s = self.env.reset(self.np_random)
    self.log_scale = 0.0
    temp = []
    while self.step < self.env.timeH:
        if change and (self.step == self.env.changePt):  # 0.5 * self.env.timeH
            self.env.change()
            self.Q_target = np.array(
                self.env.optQ(self.discount, changed=True))

        if self.step % (int(self.env.timeH / util.EVAL_NUM)) == 0:
            self.Q_err.append(self.err())

        a = self.action_selection(s, actionPolicy, actionParam)

        # Observation
        r, s_n, done = self.env.observe(s, a, self.np_random)
        self.rewards.append(r)
        self.visits[s][a] += 1
        if batch_size > 0:
            self.store({
                'state': s,
                'action': a,
                'reward': r,
                'state_n': s_n,
                'terminal': done
            })
            batch = self.get_batch(s, a, batch_size)
            n_means = self.means[batch['state_n'], :]
            n_vars = self.vars[batch['state_n'], :]
            c_mean = self.means[batch['state'], batch['action']]
            c_var = self.vars[batch['state'], batch['action']]
            reward = batch['reward']
            terminal = batch['terminal']
        else:
            # Record
            self.states.append(s)
            self.actions.append(a)
            n_means = self.means[s_n]
            n_vars = self.vars[s_n]
            c_mean = self.means[s][a]
            c_var = self.vars[s][a]
            reward = r
            terminal = done

        # Update
        self.varTH = varTH / np.exp(self.log_scale, dtype=util.DTYPE)
        if updatePolicy == 'adfq':
            new_mean, new_var, stats = adfq_fun.posterior_adfq(
                n_means,
                n_vars,
                c_mean,
                c_var,
                reward,
                self.discount,
                terminal,
                scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                varTH=self.varTH,
                asymptotic=asymptotic,
                asymptotic_trigger=asymptotic_trigger,
                noise=noise,
                batch=(batch_size > 0))
        elif updatePolicy == 'numeric':
            new_mean, new_var, _ = adfq_fun.posterior_numeric(
                n_means,
                n_vars,
                c_mean,
                c_var,
                reward,
                self.discount,
                terminal,
                scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                varTH=self.varTH,
                noise=noise,
                batch=(batch_size > 0))
        elif updatePolicy == 'adfq-v2':
            new_mean, new_var, stats = adfq_fun.posterior_adfq_v2(
                n_means,
                n_vars,
                c_mean,
                c_var,
                reward,
                self.discount,
                terminal,
                scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                varTH=self.varTH,
                asymptotic=asymptotic,
                asymptotic_trigger=asymptotic_trigger,
                noise=noise,
                batch=(batch_size > 0))
        elif updatePolicy == 'hybrid':
            new_mean, new_var, _ = adfq_fun.posterior_hybrid(
                n_means,
                n_vars,
                c_mean,
                c_var,
                reward,
                self.discount,
                terminal,
                scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                varTH=self.varTH,
                noise=noise,
                batch=(batch_size > 0))
        else:
            raise ValueError("No such update policy")

        td_err = reward + self.discount * n_means - c_mean
        # np.clip(np.abs(reward + self.discount*n_means - c_mean), 0.1, 10.0)
        add_vars = c_var + self.discount**2 * n_vars
        # penalty = np.dot(stats[2], norm.cdf(td_err, 0.0, 0.001*np.sqrt(add_vars))) - 0.5
        # penalty = 50*(np.tanh(0.1*(np.dot(stats[2], td_err**2/add_vars) - 50.0)) + 1.0)
        gate_bound = 1.0
        penalty = np.dot(stats[2], td_err**2 / add_vars)
        gate_const = 1.0 if penalty > gate_bound else 0.0
        # penalty *= gate_const
        steepness = 0.01
        midpoint = 5.0
        penalty = gate_const * 30.0 / (1. + np.exp(-steepness * (penalty - midpoint)))
        temp.append([np.dot(stats[2], td_err**2 / add_vars), penalty])

        if s == 1 and a == 3:
            records['t'].append(self.step)
            records['k'].append(stats[2])
            records['mean'].append(copy.deepcopy(self.means))
            records['var'].append(copy.deepcopy(self.vars))
        # print("t:%d, var:%.4f, penalty:%.4f" % (self.step, new_var, penalty))

        self.means[s][a] = np.mean(new_mean)
        self.vars[s][a] = np.mean(new_var) + beta * penalty  # np.maximum(self.varTH, new_var)

        if useScale:
            delta = np.log(np.mean(self.vars[self.env.eff_states, :]))
            self.vars[self.env.eff_states, :] = np.exp(
                np.log(self.vars[self.env.eff_states, :]) - delta,
                dtype=np.float64)
            self.log_scale = np.maximum(-100.0, self.log_scale + delta)

        if draw:
            # self.var_plot()
            self.draw(s, a, self.step, r)

        if eval_greedy and ((self.step + 1) %
                            (int(self.env.timeH / util.EVAL_NUM)) == 0):
            count, rew, _, _ = self.greedy_policy(
                lambda x: self.get_action_egreedy(x, util.EVAL_EPS))
            self.test_counts.append(count)
            self.test_rewards.append(rew)

        s = self.env.reset(self.np_random) if done else s_n
        self.step += 1

    return records, temp
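# A standalone sketch of the gated variance penalty computed in the second
# `learning` variant above (illustrative only; the argument names and the
# default `beta` are assumptions). A k-weighted squared TD error is passed
# through a hard gate and a logistic squashing, then scaled by `beta` before
# being added to the updated variance.
def variance_penalty_example(td_err, add_vars, k_weights, beta=0.1):
    gate_bound = 1.0
    steepness = 0.01
    midpoint = 5.0
    raw = np.dot(k_weights, td_err**2 / add_vars)
    gate_const = 1.0 if raw > gate_bound else 0.0
    penalty = gate_const * 30.0 / (1. + np.exp(-steepness * (raw - midpoint)))
    return beta * penalty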