# assumes: from numpy import array; Transform and Approximator come from the
# surrounding project
class IterableApproxArray:
    def __init__(self, lists):
        if isinstance(lists, Transform):
            group = lists.lists + [lists.timings]
            self.srcArray = array(list(zip(*group)))
            self.dimSize = len(group)
        else:
            self.srcArray = array(list(zip(*lists)))
            self.dimSize = len(lists)
        self.indices = None
        self.weights = None
        self.approx = [None for i in range(self.dimSize)]
        self.approximator = Approximator(self.dimSize)
        self.maxError = None

    def approximate(self):
        self.indices, self.weights = self.approximator.approximateIterable(
            self.srcArray, self.indices, self.weights)
        for i in range(self.dimSize):
            self.approx[i] = [self.srcArray[j][i] for j in self.indices]
        item = self.approximator.findMaxError(self.weights)
        self.maxError = self.weights[item][1]

    def approximateByError(self, err):
        indices = self.approximator.approximate(self.srcArray, err)
        for i in range(self.dimSize):
            self.approx[i] = [self.srcArray[j][i] for j in indices]
        self.maxError = None

    def clean(self):
        self.indices = None
        self.weights = None
        self.approx = [None for i in range(self.dimSize)]
        self.maxError = None
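# A minimal usage sketch for IterableApproxArray. Everything below is
# illustrative: the three equal-length lists stand in for real coordinate and
# timing channels, and the refinement loop assumes maxError holds the worst
# remaining per-point error after each approximate() call.
xs = [0.0, 1.0, 2.0, 3.0]
ys = [0.0, 1.0, 4.0, 9.0]
ts = [0.0, 0.1, 0.2, 0.3]

arr = IterableApproxArray([xs, ys, ts])

# One-shot approximation to a fixed error bound:
arr.approximateByError(0.01)
print(arr.approx)  # dimSize lists of approximated samples

# Or iterative refinement, feeding indices/weights back in each round:
arr.clean()
arr.approximate()
while arr.maxError is not None and arr.maxError > 0.01:
    arr.approximate()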
def speed_test():
    from approximator import Approximator
    import time
    import tensorflow as tf
    from learn.preprocessing import faster_featurize
    import settings
    # Environment is assumed to live alongside load_DS in this module
    from environment import load_DS, Environment
    settings.init()
    load_DS('dataset/krk.epd')
    settings.params['PL'] = list('KRkr')
    model_fn = 'Models/stem_leaf/TDLeaf/TDLeaf_stem_or_leaf_7__03_07/TDLeaf_stem_or_leaf_7__03_07-1_13299-0'
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(model_fn + '.meta')
        saver.restore(sess, model_fn)
        approx = Approximator(sess)
        V = approx.value
        F = faster_featurize
        avg_time1 = 0
        avg_time2 = 0
        avg_time3 = 0
        n_games = 20
        for _ in xrange(n_games):
            env = Environment()
            flag = False
            mv_cnt = 0
            time1 = 0
            time2 = 0
            time3 = 0
            while not flag:
                if env.is_terminal():
                    flag = True
                else:
                    start = time.time()
                    a, score = alphabeta_native(V, F, env, 3,
                                                -float('inf'), float('inf'))
                    end = time.time()
                    a2, score2 = alphabeta_batch_hist(V, F, env,
                                                      list(env.hist.keys()), 3,
                                                      -float('inf'), float('inf'))
                    end2 = time.time()
                    a3, score3 = alphabeta_batch(V, F, env, 3,
                                                 -float('inf'), float('inf'))
                    end3 = time.time()
                    env.perform_action(a)
                    time1 += end - start
                    time2 += end2 - end
                    time3 += end3 - end2
                    mv_cnt += 1
            avg_time1 += time1 / mv_cnt
            avg_time2 += time2 / mv_cnt
            avg_time3 += time3 / mv_cnt
    # per-move averages over the simulated games (the source divided by 100
    # even though only 20 games are played; dividing by n_games is the
    # presumed intent)
    print avg_time1 / n_games, avg_time2 / n_games, avg_time3 / n_games
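# The checkpoint-loading dance above recurs throughout these scripts. A
# minimal standalone sketch of the TensorFlow 1.x pattern, assuming a
# hypothetical checkpoint prefix 'my_model' (the .meta file stores the graph,
# the prefix the weights):
import tensorflow as tf  # TF 1.x API

model_fn = 'my_model'  # hypothetical checkpoint prefix, no extension
with tf.Session() as sess:
    # rebuild the graph from the .meta file, then load the weights into it
    saver = tf.train.import_meta_graph(model_fn + '.meta')
    saver.restore(sess, model_fn)
    # wrap the live session, as the scripts in this collection do
    approx = Approximator(sess)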
def model_play(m1, m2, play_file):
    # script playing models against each other (slow: the session is rebuilt
    # on every move)
    import cPickle as cp
    with open(play_file, 'rb') as f:
        states = cp.load(f)
    scores = [0, 0]
    for s in states:
        for i in xrange(2):
            if i == 0:
                M = [m1, m2]
            else:
                M = [m2, m1]
            e = Environment(s)
            board = chess.Board.from_epd(e.current_state)[0]
            while not board.is_game_over(claim_draw=True):
                print board, '\n\n'
                with tf.Session() as sess:
                    saver = tf.train.import_meta_graph(
                        M[int(e.get_turn())] + '.meta')
                    saver.restore(sess, M[int(e.get_turn())])
                    approx = Approximator(sess)
                    agent = tdstem.TDStemPlayAgent(approx, depth=3)
                    a, _, _ = agent.play(e)
                    board.push_uci(a)
            if board.result() == '1-0':
                if i == 0:
                    scores[0] += 1
                else:
                    scores[1] += 1
            elif board.result() == '0-1':
                if i == 0:
                    scores[1] += 1
                else:
                    scores[0] += 1
            print scores

# Orphaned fragment from a different simulation routine; `agents`, `env`,
# `mv_cnt` and `start_turn` are defined in code not included here.
while not board.is_game_over(claim_draw=True):
    a, _, _ = agents[env.get_turn()].play(env)
    board.push_uci(a)
    mv_cnt += 1
if board.result() == '1-0':
    if start_turn:
        wdl = 1
    else:
        wdl = -1
elif board.result() == '0-1':
    if start_turn:
        wdl = -1
    else:
        wdl = 1  # the source assigned -1 in both branches; +1 for the
                 # non-starting side is the presumed intent
else:
    wdl = 0
def play(model_fn, color, start_board=None, sim=False, depth=3):
    env = Environment(start_board)
    pol = GreedyPolicy()
    with tf.Session() as sess:
        if model_fn is not None:
            saver = tf.train.import_meta_graph(model_fn + '.meta')
            saver.restore(sess, model_fn)
        approx = Approximator(sess)
        a = [None, None]
        if sim:
            a[int(color)] = tdstem.TDStemPlayAgent(approx, depth=depth)
            a[int(not color)] = opt.OptimalAgent()
        else:
            a[int(not color)] = tdstem.TDStemPlayAgent(approx, depth=depth)
            a[int(color)] = tdstem.TDStemPlayAgent(approx, depth=depth)
        oa = opt.OptimalAgent()
        flag = False
        name = str(raw_input("What's your name? "))
        print "Let's play a game, %s!" % (str(name))
        while not flag:
            time.sleep(2)
            env.draw()
            print 'DTM: {}'.format(np.abs(oa.approx.tb.probe_dtm(
                chess.Board.from_epd(env.current_state)[0])))
            if env.is_game_over():
                print env.result()
                flag = True
            else:
                print 'Evaluation: {}'.format(a[int(color)].get_av_pairs(env))
                print 'Optimal moves: {}'.format(oa.get_best_moves(env))
                start = time.time()
                if env.get_turn() == color:
                    if sim:
                        a[int(color)].play(env)
                    else:
                        suc = False
                        while not suc:
                            m = str(raw_input('YOUR MOVE: '))
                            try:
                                env.perform_action(m)
                                suc = True
                            except Exception:
                                # re-prompt on an illegal move (the source
                                # re-raised here, which defeated the retry
                                # loop)
                                print 'Illegal move, try again.'
                else:
                    a[int(not color)].play(env)
def test():
    from approximator import Approximator
    import time
    import tensorflow as tf
    from learn.preprocessing import faster_featurize
    env = Environment(draw_r=-1, move_r=0.001)
    env.reset()
    model_fn = 'Models/DeepTDy_m8_krk_3-4_cont__07_05/DeepTDy_m8_krk_3-4_cont__07_05-0_0173614-0'
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(model_fn + '.meta')
        saver.restore(sess, model_fn)
        approx = Approximator(sess)
        V = approx.value
        F = faster_featurize
        flag = False
        mv_cnt = 0
        time1 = 0
        time2 = 0
        trans = dict()
        while not flag:
            env.draw()
            print env.hist
            print '\n'
            if env.is_terminal():
                print env.result()
                flag = True
            else:
                start = time.time()
                a, score = alphabeta_batch(V, F, env, 3,
                                           -float('inf'), float('inf'))
                end = time.time()
                a2, score2, leaf = alphabeta_batch_hist_leaf(
                    V, F, env, list(env.hist.keys()), 3,
                    -float('inf'), float('inf'))
                end2 = time.time()
                # assert np.abs(score - score2) < 0.001
                env.perform_action(a2)
                time1 += end - start
                time2 += end2 - end
                mv_cnt += 1
        print('\nLeaf:')
        Environment(state=leaf).draw()
        print('AB-Minimax Batch: {}\tAB-Minimax hist: {}'.format(
            time1 / mv_cnt, time2 / mv_cnt))
#!/usr/bin/env python3
from approximator import Approximator

if __name__ == "__main__":
    approx = Approximator(n=20, interval=(-2, 1), params=(0.5, 0))
    res = approx.search(amp=1)
    print("c = {}, d = {}".format(res[0], res[1]))
class ApproxQAgent(Agent):
    '''Q-learning agent implemented with an approximate value function.
    '''
    def __init__(self, env: Env = None, trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise Exception("agent should have an environment")
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]
        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]
        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        self.PQ = self.Q.clone()  # the network whose parameters are updated
        return

    def _decayed_epsilon(self, cur_episode: int, min_epsilon: float,
                         max_epsilon: float, target_episode: int) -> float:
        '''Get an epsilon within a fixed range, decayed linearly per episode.
        '''
        slope = (min_epsilon - max_epsilon) / target_episode
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):
        '''Produce an action from the value network being updated (PQ).
        '''
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):
        '''Copy the policy network PQ (with its parameters) into the network
        Q that outputs the target Q values.
        '''
        self.Q = self.PQ.clone()

    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs):
        trans_pieces = self.sample(batch_size)  # sample Transitions from memory
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        y_batch = self.Q(states_0)  # returns a numpy-format result
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * \
            (~ is_done)  # when is_done, Q_target == reward_1
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x=X_batch, y=y_batch,
                           learning_rate=learning_rate, epochs=epochs)
        mean_loss = loss.sum().data[0] / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma=0.99, learning_rate=1e-5, max_episodes=1000,
                 batch_size=64, min_epsilon=0.2, epsilon_factor=0.1,
                 epochs=1):
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env.reset()
            # self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state
                a0 = self.performPolicy(s0, epsilon)
                s1, r1, is_done, info, total_reward = self.act(a0)
                # self.env.render()
                step_in_episode += 1
                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, batch_size,
                                                    learning_rate, epochs)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".format(
                self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
        return
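# A minimal usage sketch for the agent above, assuming a Gym environment with
# a Box observation space and a Discrete action space; 'CartPole-v0' is used
# purely as an illustration, not as the environment the class was written for.
import gym

env = gym.make('CartPole-v0')  # Box(4,) observations, Discrete(2) actions
agent = ApproxQAgent(env=env, trans_capacity=20000, hidden_dim=16)
# Fills the replay memory while training; epsilon decays linearly from 1.0
# to min_epsilon over the first epsilon_factor * max_episodes episodes.
agent.learning(gamma=0.99,
               learning_rate=1e-5,
               max_episodes=500,
               batch_size=64,
               min_epsilon=0.2,
               epsilon_factor=0.1,
               epochs=1)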
class ApproxQAgent(Agent):
    def __init__(self, env: Env = None, trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise Exception("agent should have an environment")
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]
        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]
        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        self.PQ = self.Q.clone()
        return

    def _decayed_epsilon(self, cur_episode: int, min_epsilon: float,
                         max_epsilon: float, target_episode: int) -> float:
        slope = (min_epsilon - max_epsilon) / target_episode
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):
        self.Q = self.PQ.clone()

    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs,
                           r, s):  # r and s are passed in but unused here
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        y_batch = self.Q(states_0)
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * \
            (~ is_done)
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x=X_batch, y=y_batch,
                           learning_rate=learning_rate, epochs=epochs)
        mean_loss = loss.sum().data[0] / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma=0.99, learning_rate=1e-5, max_episodes=1000,
                 batch_size=64, min_epsilon=0.2, epsilon_factor=0.1,
                 epochs=1):
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        # NOTE: the source reused a single handle named `file` for both CSVs
        # and closed it mid-training; two handles closed after the training
        # loop are the presumed intent.
        dist_file = open('dqn.csv', 'w')
        dist_file.write("Episode" + "," + "Distance" + "\n")
        tot_dis = 0
        reward_file = open('reward.csv', 'w')
        reward_file.write("Steps in Episode" + "," + "reward" + "\n")
        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env._reset()
            self.env._render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state
                a0 = self.performPolicy(s0, epsilon)
                s1, r1, is_done, dis_info = self.env._step_b(a0)
                self.env._render()
                step_in_episode += 1
                tot_dis += r1
                print("Step in Episode :: ", step_in_episode)
                print("Distance of agent from goal :: ", dis_info)
                reward_file.write(str(step_in_episode) + "," +
                                  str(tot_dis) + "\n")
                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, batch_size,
                                                    learning_rate, epochs,
                                                    r1, s1)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".format(
                self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
            # print("Episode :: ", num_episode)
            # print("Distance of agent from goal :: ", dis_info)
        dist_file.close()
        reward_file.close()
        return
def comparison_stem_leaf_kqk():
    model_fn_leaf = 'Models/KQK/TDL/network'
    with open('Models/KQK/TDL_BAD/sim', 'rb') as f:
        A, evaldict, S = cp.load(f)
    wc_l = np.mean(np.array(evaldict['wc']))
    we_l = np.mean(np.array(evaldict['we']))
    lhs_l = np.mean(np.array(evaldict['lhs']))
    # t = stem['']
    print wc_l, we_l, lhs_l

    # TDS is the stem model; the source reused the name model_fn_leaf here
    model_fn_stem = 'Models/KQK/TDS/network'
    with open('Models/KQK/TDS_BAD/sim', 'rb') as f:
        A, evaldict, S = cp.load(f)
    wc_s = np.mean(np.array(evaldict['wc']))
    we_s = np.mean(np.array(evaldict['we']))
    lhs_s = np.mean(np.array(evaldict['lhs']))
    print wc_s, we_s, lhs_s

    with open('Models/KQK/TDL_BAD/meta', 'rb') as f:
        leaf = cp.load(f)
    with open('Models/KQK/TDS_BAD/meta', 'rb') as f:
        stem = cp.load(f)
    mps_s = np.mean(np.array(stem['mps']))
    mps_l = np.mean(np.array(leaf['mps']))
    ntot_s = stem['episodes']
    ntot_l = leaf['episodes']
    el_s = stem['elapsed_time']
    el_l = leaf['elapsed_time']
    print mps_s, mps_l, ntot_s, ntot_l, el_s, el_l

    import tablebases
    with open('Models/KQK/TDL/meta', 'rb') as f:
        leaf = cp.load(f)
    with open('Models/KQK/TDS/meta', 'rb') as f:
        stem = cp.load(f)
    settings.init()
    load_DS('dataset/kqk_fics.epd')
    settings.params['PL'] = 'KQkq'
    settings.params['USE_DSET'] = True
    N_l = np.array([0] + leaf['N'], dtype=float)
    N_s = np.array([0] + stem['N'], dtype=float)
    eps_l = leaf['eps']
    eps_s = stem['eps']
    w_l = np.array([0] + leaf['w_list']) / N_l
    w_s = np.array([0] + stem['w_list']) / N_s
    e_l = np.cumsum(N_l)
    e_s = np.cumsum(N_s)
    stages_s = [t[0] for t in stem['lambda']]
    stages_l = [t[0] for t in leaf['lambda']]
    print leaf['lambda']
    l_l = leaf['avg_len']
    l_s = stem['avg_len']

    plt.figure(1)
    plt.subplot(111)
    line_stem, = plt.plot(e_s, w_s, label='TD-Stem' + r'$(\lambda)$')
    line_leaf, = plt.plot(e_l, w_l, label='TD-Leaf' + r'$(\lambda)$')
    for i in stages_s:
        plt.axvline(x=i, color='#99ccff')
    for i in stages_l:
        plt.axvline(x=i, color='#ffc266')
    plt.xlabel(r'$N$')
    plt.ylabel('winning rate')
    plt.legend(handles=[line_leaf, line_stem])
    plt.xlim(0, max(max(e_l), max(e_s)))
    plt.ylim(0, 1)
    plt.show()

    mps_s = np.mean(np.array(stem['mps']))
    mps_l = np.mean(np.array(leaf['mps']))
    ntot_s = stem['episodes']
    ntot_l = leaf['episodes']
    el_s = stem['elapsed_time']
    el_l = leaf['elapsed_time']
    print mps_s, mps_l, ntot_s, ntot_l, el_s, el_l

    model_fn_stem = 'Models/KQK/TDS/network'
    with open('Models/KQK/TDS/sim2', 'rb') as f:
        A, evaldict, S = cp.load(f)
    wc_s = np.mean(np.array(evaldict['wc']))
    we_s = np.mean(np.array(evaldict['we']))
    lhs_s = np.mean(np.array(evaldict['lhs']))
    print wc_s, we_s, lhs_s
    tw = [t for t in A if tablebases.probe_result(t[-1]) == 1]
    td = [t for t in A if tablebases.probe_result(t[-1]) == 0]
    tb = [t for t in A if tablebases.probe_result(t[-1]) == -1]
    Sw = [t[-1] for t in tw]
    dtmw = [t[1] for t in tw]
    print min(dtmw)
    wdlw = [t[0] for t in tw]
    Sb = [t[-1] for t in tb]
    dtmb = [t[1] for t in tb]
    wdlb = [t[0] for t in tb]
    print min(dtmb)
    vw_s = Approximator.V(Sw, model_fn_stem)
    vb_s = Approximator.V(Sb, model_fn_stem)
    hist_wcs = 20 * [0]
    hist_wes = 20 * [0]
    hist_lhss = 20 * [0]
    hist_dcs = 20 * [0]
    avg_vs = 20 * [0]
    std_vs = 20 * [0]
    avg_vsb = 20 * [0]
    std_vsb = 20 * [0]
    # print A
    for i in xrange(len(hist_wcs)):
        hist_wcs[i] = np.mean(np.array(
            [wc(t) for t in A if wc(t) is not None and t[1] == i + 1]))
        hist_wes[i] = np.mean(np.array(
            [we(t) for t in A if we(t) is not None and t[1] == i + 1]))
        hist_lhss[i] = np.mean(np.array(
            [lhs(t) for t in A if lhs(t) is not None and t[1] == i + 1]))
        hist_dcs[i] = np.mean(np.array(
            [dc(t) for t in A if dc(t) is not None and t[1] == i + 1]))
        avg_vs[i] = np.mean(np.array(
            [vw_s[j] for j in xrange(vw_s.shape[0]) if dtmw[j] == i + 1]))
        std_vs[i] = np.std(np.array(
            [vw_s[j] for j in xrange(vw_s.shape[0])
             if dtmw[j] == i + 1 and wdlw[j] == 1]))
        avg_vsb[i] = np.mean(np.array(
            [vb_s[j] for j in xrange(vb_s.shape[0]) if dtmb[j] == i + 1]))
        std_vsb[i] = np.std(np.array(
            [vb_s[j] for j in xrange(vb_s.shape[0])
             if dtmb[j] == i + 1 and wdlb[j] == 1]))

    model_fn_leaf = 'Models/KQK/TDL/network'
    with open('Models/KQK/TDL/sim2', 'rb') as f:
        A, evaldict, S = cp.load(f)
    wc_l = np.mean(np.array(evaldict['wc']))
    we_l = np.mean(np.array(evaldict['we']))
    lhs_l = np.mean(np.array(evaldict['lhs']))
    print wc_l, we_l, lhs_l
    tw = [t for t in A if tablebases.probe_result(t[-1]) == 1]
    td = [t for t in A if tablebases.probe_result(t[-1]) == 0]
    tb = [t for t in A if tablebases.probe_result(t[-1]) == -1]
    Sw = [t[-1] for t in tw]
    dtmw = [t[1] for t in tw]
    wdlw = [t[0] for t in tw]
    Sb = [t[-1] for t in tb]
    dtmb = [t[1] for t in tb]
    wdlb = [t[0] for t in tb]
    vw_l = Approximator.V(Sw, model_fn_leaf)
    vb_l = Approximator.V(Sb, model_fn_leaf)
    hist_wcl = 20 * [0]
    hist_wel = 20 * [0]
    hist_lhsl = 20 * [0]
    hist_dcl = 20 * [0]
    avg_vl = 20 * [0]
    std_vl = 20 * [0]
    avg_vlb = 20 * [0]
    std_vlb = 20 * [0]
    for i in xrange(len(hist_wcs)):
        hist_wcl[i] = np.mean(np.array(
            [wc(t) for t in A if wc(t) is not None and t[1] == i + 1]))
        hist_wel[i] = np.mean(np.array(
            [we(t) for t in A if we(t) is not None and t[1] == i + 1]))
        hist_lhsl[i] = np.mean(np.array(
            [lhs(t) for t in A if lhs(t) is not None and t[1] == i + 1]))
        hist_dcl[i] = np.mean(np.array(
            [dc(t) for t in A if dc(t) is not None and t[1] == i + 1]))
        avg_vl[i] = np.mean(np.array(
            [vw_l[j] for j in xrange(vw_l.shape[0]) if dtmw[j] == i + 1]))
        std_vl[i] = np.std(np.array(
            [vw_l[j] for j in xrange(vw_l.shape[0])
             if dtmw[j] == i + 1 and wdlw[j] == 1]))
        avg_vlb[i] = np.mean(np.array(
            [vb_l[j] for j in xrange(vb_l.shape[0]) if dtmb[j] == i + 1]))
        std_vlb[i] = np.std(np.array(
            [vb_l[j] for j in xrange(vb_l.shape[0])
             if dtmb[j] == i + 1 and wdlb[j] == 1]))

    x = np.array(range(1, len(hist_wcs) + 1))
    plt.figure(2)
    plt.subplot(111)
    b1 = plt.bar(x - 1. / 6, hist_wcs, width=1. / 3, align='center',
                 label='TD-Stem' + r'$(\lambda)$ ')
    b2 = plt.bar(x + 1. / 6, hist_wcl, width=1. / 3, align='center',
                 label='TD-Leaf' + r'$(\lambda)$ ')
    # plt.title('kqk endgame win conversion rate')
    plt.legend(handles=[b1, b2])
    plt.xlabel('DTM')
    plt.ylabel('WCR')
    plt.xlim(0, x.max())
    plt.ylim(0, 1)
    plt.xticks(x, x)
    plt.show()

    plt.figure(3)
    plt.subplot(111)
    b1 = plt.bar(x - 1. / 6, hist_wes, width=1. / 3, align='center',
                 label='TD-Stem' + r'$(\lambda)$ ')
    b2 = plt.bar(x + 1. / 6, hist_wel, width=1. / 3, align='center',
                 label='TD-Leaf' + r'$(\lambda)$ ')
    # plt.title('kqk endgame win efficiency')
    plt.legend(handles=[b1, b2])
    plt.xlabel('DTM')
    plt.ylabel('WE')
    plt.xlim(0, x.max())
    plt.ylim(0, 1)
    plt.xticks(x, x)
    plt.show()

    plt.figure(4)
    plt.subplot(111)
    b1 = plt.bar(x - 1. / 6, hist_lhss, width=1. / 3, align='center',
                 label='TD-Stem' + r'$(\lambda)$ ')
    b2 = plt.bar(x + 1. / 6, hist_lhsl, width=1. / 3, align='center',
                 label='TD-Leaf' + r'$(\lambda)$ ')
    # plt.title('kqk endgame loss holding score')
    plt.legend(handles=[b1, b2])
    plt.xlabel('DTM')
    plt.ylabel('LHS')
    plt.xlim(0, x.max())
    plt.ylim(0, 1)
    plt.xticks(x, x)
    plt.show()

    plt.figure(5)
    b1, = plt.plot(x, avg_vs, label='TD-Stem' + r'$(\lambda)$ ')
    b2, = plt.plot(x, avg_vl, label='TD-Leaf' + r'$(\lambda)$ ')
    s1, = plt.plot(x, np.array(avg_vs) + 2 * np.array(std_vs), color='#99ccff')
    s2, = plt.plot(x, np.array(avg_vs) - 2 * np.array(std_vs), color='#99ccff')
    s3, = plt.plot(x, np.array(avg_vl) + 2 * np.array(std_vl), color='#ffc266')
    s4, = plt.plot(x, np.array(avg_vl) - 2 * np.array(std_vl), color='#ffc266')
    c1, = plt.plot(x, avg_vsb, label='TD-Stem' + r'$(\lambda)$ ',
                   color=b1.get_color())
    c2, = plt.plot(x, avg_vlb, label='TD-Leaf' + r'$(\lambda)$ ',
                   color=b2.get_color())
    t1, = plt.plot(x, np.array(avg_vsb) + 2 * np.array(std_vsb), color='#99ccff')
    t2, = plt.plot(x, np.array(avg_vsb) - 2 * np.array(std_vsb), color='#99ccff')
    t3, = plt.plot(x, np.array(avg_vlb) + 2 * np.array(std_vlb), color='#ffc266')
    t4, = plt.plot(x, np.array(avg_vlb) - 2 * np.array(std_vlb), color='#ffc266')
    plt.xticks(x, x)
    # plt.title('krk endgame win conversion rate')
    plt.legend(handles=[b1, b2])
    plt.xlabel('DTM')
    plt.ylabel('E[V]')
    plt.xlim(0, x.max())
    # plt.ylim(0, 1)
    plt.show()
def comparison_stem_leaf():
    settings.init()
    settings.params['USE_DSET'] = True
    settings.params['PL'] = 'KRkr'
    load_DS('dataset/krk.epd')
    with open('Models/stem_leaf/TDLeaf/TDLeaf_stem_or_leaf_7__03_07/stem_or_leaf_7_meta.sv', 'rb') as f:
        leaf = cp.load(f)
    with open('Models/stem_leaf/TDStem/TDStem_stem_or_leaf_7__28_06/stem_or_leaf_7_meta.sv', 'rb') as f:
        stem = cp.load(f)
    print leaf.keys()
    print stem.keys()
    N_l = leaf['N'][0]
    N_s = stem['N'][0]
    # print N_l, N_s
    w_l = leaf['w_list']
    r_l = leaf['r_lists']
    l_l = leaf['avg_len']
    w_s = stem['w_list']
    r_s = stem['r_lists']
    l_s = stem['avg_len']
    mps_s = np.mean(np.array(stem['mps']))
    mps_l = np.mean(np.array(leaf['mps']))
    ntot_s = stem['episodes']
    ntot_l = leaf['episodes']
    el_s = stem['elapsed_time']
    el_l = leaf['elapsed_time']
    print mps_s, mps_l, ntot_s, ntot_l, el_s, el_l

    ep_s2 = [0]
    rate_s = []
    cumsum = 0
    for i in xrange(len(w_s)):
        if i < 53:
            cumsum += 5000
            rate_s.append(5000.)
        elif i < 73:
            cumsum += 500
            rate_s.append(500.)
        else:
            cumsum += 250
            rate_s.append(250.)
        ep_s2.append(cumsum)
    wr_s = np.array([0] + w_s) / np.array([1] + rate_s)
    rrw_s = 5000 * np.array(r_s[0]) / np.array(rate_s)
    rrb_s = 5000 * np.array(r_s[1]) / np.array(rate_s)

    ep_l2 = [0]
    rate_l = []
    cumsum = 0
    for i in xrange(len(w_l)):
        if i < 63:
            cumsum += 5000
            rate_l.append(5000.)
        elif i < 83:
            cumsum += 500
            rate_l.append(500.)
        else:
            cumsum += 250
            rate_l.append(250.)
        ep_l2.append(cumsum)
    wr_l = np.array([0] + w_l) / np.array([1] + rate_l)
    rrw_l = 5000 * np.array(r_l[0]) / np.array(rate_l)
    rrb_l = 5000 * np.array(r_l[1]) / np.array(rate_l)

    plt.figure(1)
    plt.subplot(111)
    line_stem, = plt.plot(ep_s2, wr_s, label='TD-Stem' + r'$(\lambda)$')
    for i in [100000, 170000, 264000, 275000, 283200, 291000]:
        plt.axvline(x=i, color='#99ccff')
    line_leaf, = plt.plot(ep_l2, wr_l, label='TD-Leaf' + r'$(\lambda)$')
    for i in [120000, 220000, 315000, 325250, 333000, 341000]:
        plt.axvline(x=i, color='#ffc266')
    plt.xlabel(r'$N$')
    plt.ylabel('winning rate')
    plt.legend(handles=[line_leaf, line_stem])
    plt.xlim(0, max(ep_l2))
    plt.ylim(0, 1)
    # plt.title('krk endgame learning curve')
    plt.show()

    mps_s = np.mean(np.array(stem['mps']))
    mps_l = np.mean(np.array(leaf['mps']))
    ntot_s = stem['episodes']
    ntot_l = leaf['episodes']
    el_s = stem['elapsed_time']
    el_l = leaf['elapsed_time']
    print mps_s, mps_l, ntot_s, ntot_l, el_s, el_l

    model_fn = 'Models/stem_leaf/TDStem/TDStem_stem_or_leaf_7__28_06/TDStem_stem_or_leaf_7__28_06-1_23116-0'
    with open('Models/stem_leaf/TDStem/sim', 'rb') as f:
        A, evaldict, S = cp.load(f)
    wc_s = np.mean(np.array(evaldict['wc']))
    we_s = np.mean(np.array(evaldict['we']))
    lhs_s = np.mean(np.array(evaldict['lhs']))
    print wc_s, we_s, lhs_s
    S = [t[-1] for t in A]
    dtm = [t[1] for t in A]
    wdl = [t[0] for t in A]
    v = Approximator.V(S, model_fn)
    hist_wcs = 33 * [0]
    hist_wes = 33 * [0]
    hist_lhss = 33 * [0]
    avg_vs = 33 * [0]
    std_vs = 33 * [0]
    # print A
    for i in xrange(len(hist_wcs)):
        hist_wcs[i] = np.mean(np.array(
            [wc(t) for t in A if wc(t) is not None and t[1] == i]))
        hist_wes[i] = np.mean(np.array(
            [we(t) for t in A if we(t) is not None and t[1] == i]))
        hist_lhss[i] = np.mean(np.array(
            [lhs(t) for t in A if lhs(t) is not None and t[1] == i]))
        avg_vs[i] = np.mean(np.array(
            [v[j] for j in xrange(v.shape[0]) if dtm[j] == i and wdl[j] == 1]))
        std_vs[i] = np.std(np.array(
            [v[j] for j in xrange(v.shape[0]) if dtm[j] == i and wdl[j] == 1]))

    model_fn = 'Models/stem_leaf/TDLeaf/TDLeaf_stem_or_leaf_7__03_07/TDLeaf_stem_or_leaf_7__03_07-1_13299-0'
    with open('Models/stem_leaf/TDLeaf/sim', 'rb') as f:
        A, evaldict, S = cp.load(f)
    wc_l = np.mean(np.array(evaldict['wc']))
    we_l = np.mean(np.array(evaldict['we']))
    lhs_l = np.mean(np.array(evaldict['lhs']))
    print wc_l, we_l, lhs_l
    S = [t[-1] for t in A]
    dtm = [t[1] for t in A]
    wdl = [t[0] for t in A]
    v = Approximator.V(S, model_fn)
    hist_wcl = 33 * [0]
    hist_wel = 33 * [0]
    hist_lhsl = 33 * [0]
    avg_vl = 33 * [0]
    std_vl = 33 * [0]
    # print A
    for i in xrange(len(hist_wcl)):
        hist_wcl[i] = np.mean(np.array(
            [wc(t) for t in A if wc(t) is not None and t[1] == i + 1]))
        hist_wel[i] = np.mean(np.array(
            [we(t) for t in A if we(t) is not None and t[1] == i + 1]))
        hist_lhsl[i] = np.mean(np.array(
            [lhs(t) for t in A if lhs(t) is not None and t[1] == i + 1]))
        avg_vl[i] = np.mean(np.array(
            [v[j] for j in xrange(v.shape[0])
             if dtm[j] == i + 1 and wdl[j] == 1]))
        std_vl[i] = np.std(np.array(
            [v[j] for j in xrange(v.shape[0])
             if dtm[j] == i + 1 and wdl[j] == 1]))

    x = np.array(range(1, len(hist_wcs) + 1))
    plt.figure(2)
    plt.subplot(111)
    b1 = plt.bar(x - 1. / 6, hist_wcs, width=1. / 3, align='center',
                 label='TD-Stem' + r'$(\lambda)$ ')
    b2 = plt.bar(x + 1. / 6, hist_wcl, width=1. / 3, align='center',
                 label='TD-Leaf' + r'$(\lambda)$ ')
    # plt.title('krk endgame win conversion rate')
    plt.legend(handles=[b1, b2])
    plt.xlabel('DTM')
    plt.ylabel('WCR')
    plt.show()

    plt.figure(3)
    plt.subplot(111)
    b1 = plt.bar(x - 1. / 6, hist_wes, width=1. / 3, align='center',
                 label='TD-Stem' + r'$(\lambda)$ ')
    b2 = plt.bar(x + 1. / 6, hist_wel, width=1. / 3, align='center',
                 label='TD-Leaf' + r'$(\lambda)$ ')
    # plt.title('krk endgame win efficiency')
    plt.legend(handles=[b1, b2])
    plt.xlabel('DTM')
    plt.ylabel('WE')
    plt.show()

    plt.figure(4)
    plt.subplot(111)
    b1 = plt.bar(x - 1. / 6, hist_lhss, width=1. / 3, align='center',
                 label='TD-Stem' + r'$(\lambda)$ ')
    b2 = plt.bar(x + 1. / 6, hist_lhsl, width=1. / 3, align='center',
                 label='TD-Leaf' + r'$(\lambda)$ ')
    # plt.title('krk endgame loss holding score')
    plt.legend(handles=[b1, b2])
    plt.xlabel('DTM')
    plt.ylabel('LHS')
    plt.show()

    plt.figure(5)
    plt.subplot(111)
    b1, = plt.plot(x, avg_vs, label='TD-Stem' + r'$(\lambda)$ ')
    b2, = plt.plot(x, avg_vl, label='TD-Leaf' + r'$(\lambda)$ ')
    s1, = plt.plot(x, np.array(avg_vs) + 2 * np.array(std_vs), color='#99ccff')
    s2, = plt.plot(x, np.array(avg_vs) - 2 * np.array(std_vs), color='#99ccff')
    s3, = plt.plot(x, np.array(avg_vl) + 2 * np.array(std_vl), color='#ffc266')
    s4, = plt.plot(x, np.array(avg_vl) - 2 * np.array(std_vl), color='#ffc266')
    # plt.title('krk endgame value function')
    plt.legend(handles=[b1, b2])
    plt.xlabel('DTM')
    plt.ylabel(r'$V$')
    plt.show()
class ApproxQAgent(Agent):
    '''Q-learning agent implemented with an approximate value function.
    # Function
      1 value function approximation
      2 based on Experience Replay, which removes the correlation between
        transitions within a single episode and so yields a better
        approximation
      3 DQN
    '''
    def __init__(self, env: Env = None, trans_capacity=20000,
                 hidden_dim: int = 16):
        '''Set input_dim (w.r.t. observation_space) and output_dim (w.r.t.
        action_space), call super(...).__init__(...), and create
        self.Q = Approximator(...) plus self.PQ = self.Q.clone()
        (PQ is the network whose parameters are updated).
        # args
          env: environment of this agent
          trans_capacity: <int> max number of transitions in memory
          hidden_dim: <int> number of nodes in the hidden layer
        '''
        if env is None:
            raise Exception("agent should have an environment")
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            # e.g. observation_space >> Box(6,), .shape >> (6,)
            self.input_dim = env.observation_space.shape[0]
        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]
        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        self.PQ = self.Q.clone()  # the network whose parameters are updated
        return

    def _decayed_epsilon(self, cur_episode: int, min_epsilon: float,
                         max_epsilon: float, target_episode: int) -> float:
        '''Get an epsilon within a fixed range.
        # return
          epsilon <float>, decaying from max_epsilon (at cur_episode = 0)
          towards min_epsilon as cur_episode grows
        '''
        slope = (min_epsilon - max_epsilon) / target_episode
        intercept = max_epsilon
        # slope * cur_episode is negative
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):
        '''Produce an action from the value network being updated (PQ).
        # args
          s: state s0 <6x1 ndarray>
          epsilon: None means greedy, otherwise epsilon-greedy
        # return
          an action a0 <int> w.r.t. PQ (policy evaluation), chosen with
          decayed epsilon-greedy (policy improvement)
        '''
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        # could be omitted when there is only a single policy
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):
        '''Copy the policy network PQ (with its parameters) into the network
        Q that outputs the target Q values.
        '''
        self.Q = self.PQ.clone()

    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs):
        # Sample Transitions randomly from experience; returns a <list> of
        # batch_size Transition objects (s0, a0, reward, s1, is_done).
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])  # ndarray
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        # ndarray of rows [Q(s0)(a_0), Q(s0)(a_1), ...]: the Q values of all
        # actions in state s0.
        # y_batch = self.Q(states_0)  # main difference is in the a0 dimension
        y_batch = self.PQ(states_0)  # only Q(s, a0, w) differs, in the a0 dimension
        # matrix-wise calculation
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * \
            (~ is_done)  # when is_done, Q_target == reward_1
        # Attention:
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x=X_batch, y=y_batch,
                           learning_rate=learning_rate, epochs=epochs)
        mean_loss = loss.sum().data[0] / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma=0.99, learning_rate=1e-5, max_episodes=1000,
                 batch_size=64, min_epsilon=0.2, epsilon_factor=0.1,
                 epochs=1):
        '''Build up experience; once there are enough transitions in memory,
        start learning from that experience and compute the loss.
        # Arguments
          gamma = 0.99          # discount factor, in [0, 1]
          learning_rate = 1e-5  # scale of each batch-learning step
          max_episodes = 1000   # maximum number of training episodes
          batch_size = 64
          min_epsilon = 0.2
          epsilon_factor = 0.1  # ratio of the episode index at which
                                # min_epsilon is first reached to
                                # max_episodes; the smaller the ratio, the
                                # more episodes run with min_epsilon
          epochs = 1            # number of training passes per batch
        '''
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:
            # for each episode up to max_episodes, accumulate the loss
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env.reset()
            self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:  # for every transition
                s0 = self.state  # self.state changes inside self.act(a0)
                # get an action w.r.t. PQ using decayed epsilon-greedy
                a0 = self.performPolicy(s0, epsilon)
                # inside self.act(a0): self.state = s1; act also stores the
                # transition in the episode's trans_list (and the episode in
                # the experience) and accumulates total_reward
                s1, r1, is_done, info, total_reward = self.act(a0)
                self.env.render()
                step_in_episode += 1
                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, batch_size,
                                                    learning_rate, epochs)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".format(
                self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
        return
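# The masking trick in _learn_from_memory above is worth seeing with concrete
# numbers. A self-contained sketch in plain numpy with toy values: ~is_done
# zeroes the bootstrap term for terminal transitions, and the fancy-indexing
# line overwrites only the taken action's Q value.
import numpy as np

gamma = 0.99
reward_1 = np.array([1.0, 0.0, 0.5])
is_done = np.array([True, False, False])  # numpy bools: ~ is elementwise NOT
q_next = np.array([[0.2, 0.8],            # Q(s1) rows, one per transition
                   [0.4, 0.1],
                   [0.3, 0.9]])
actions_0 = np.array([0, 1, 1])           # actions actually taken

# Terminal transitions contribute reward only: ~is_done acts as [0, 1, 1].
Q_target = reward_1 + gamma * np.max(q_next, axis=1) * (~is_done)
print(Q_target)  # [1.    0.396 1.391]

# Overwrite the taken action's entry; the other actions keep their old
# targets, so they contribute zero error to the regression.
y_batch = np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]])
y_batch[np.arange(3), actions_0] = Q_target
print(y_batch)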
# Fragment of a simulation CLI script; the opening lines (imports and the
# parser's first arguments) are truncated in the source. The lines marked
# "reconstructed" below are inferred: '-N' from args.N, '-c' and '-o' from
# args.c and args.o used further down.
import argparse
parser = argparse.ArgumentParser()  # reconstructed
parser.add_argument('-N',           # reconstructed flag name
                    help='number of episodes to play', type=int)
parser.add_argument('-p', help='piece cfg')
parser.add_argument('-D', help='dset file')
parser.add_argument('-R', default=10, type=int,
                    help='number of random moves to play before registration')
parser.add_argument('-d', default=3, type=int, help='depth')
parser.add_argument('-w', action='store_true')
parser.add_argument('-c', help='model checkpoint')  # reconstructed
parser.add_argument('-o', help='output file')       # reconstructed
args = parser.parse_args()

settings.init()
settings.params['USE_DSET'] = True
settings.params['PL'] = args.p
load_DS(args.D)
settings.params['RAND'] = args.R
settings.params['OC_DEPTH'] = args.d

model_fn = args.c
with tf.Session() as sess:
    saver = tf.train.import_meta_graph(model_fn + '.meta')
    saver.restore(sess, model_fn)
    approx = Approximator(sess)
    agent = tdstem.TDStemPlayAgent(approx, depth=3)  # depth hard-coded despite -d
    A, evaldict, all_s = opt.recursive_eval_sim(agent, N=args.N, w=args.w)
with open(args.o, 'wb') as f:
    cp.dump((A, evaldict, all_s), f)
class ApproxQAgent(Agent):
    '''Q-learning agent implemented with an approximate value function.
    '''
    def __init__(self, env: Env = None, trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise Exception("agent should have an environment")
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        # adapt to different state and action space types
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]
        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]
        # print("{},{}".format(self.input_dim, self.output_dim))
        # number of neurons in the hidden layer
        self.hidden_dim = hidden_dim
        # The key is the two statements below, which declare two approximate
        # value functions (networks):
        # Q computes values and produces the loss; its parameters are held
        # fixed for a period of time.
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        # PQ generates the policy; its parameters are updated frequently.
        self.PQ = self.Q.clone()
        return

    def _learning_from_memory(self, gamma, batch_size, learning_rate, epochs):
        # sample Transitions randomly from memory
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        # this invokes the approximator's __call__ method
        y_batch = self.Q(states_0)
        # batched, so the line below is matrix arithmetic; np.max with axis=1
        # takes the row-wise maximum, and on a numpy bool array ~ is
        # elementwise NOT, so terminal transitions keep only their reward
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * \
            (~ is_done)
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x=X_batch, y=y_batch,
                           learning_rate=learning_rate, epochs=epochs)
        mean_loss = loss.sum().item() / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma=0.99, learning_rate=1e-5, max_episodes=1000,
                 batch_size=64, min_epsilon=0.2, epsilon_factor=0.1,
                 epochs=1):
        '''learning mainly builds up the experience; once enough experience
        has accumulated, it starts learning from that experience as well.
        '''
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env.reset()
            self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state
                a0 = self.performPolicy(s0, epsilon)
                # act() also records the Transition into the Experience
                s1, r1, is_done, info, total_reward = self.act(a0)
                # self.env.render()
                step_in_episode += 1
                # once the experience holds enough transitions, start
                # learning from it
                if self.total_trans > batch_size:
                    loss += self._learning_from_memory(gamma, batch_size,
                                                       learning_rate, epochs)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".format(
                self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
        return

    def _decayed_epsilon(self, cur_episode: int, min_epsilon: float,
                         max_epsilon: float, target_episode: int) -> float:
        '''Get an epsilon within a fixed range.
        '''
        slope = (min_epsilon - max_epsilon) / target_episode
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):
        '''Produce an action from the value network being updated (PQ).
        '''
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):
        '''Copy the policy network PQ (with its parameters) into the network
        Q that outputs the target Q values.
        '''
        self.Q = self.PQ.clone()
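# A quick self-contained check of _decayed_epsilon's linear schedule (plain
# Python, toy numbers): with max_epsilon = 1.0, min_epsilon = 0.2 and
# target_episode = 100, epsilon falls linearly and then stays clamped at the
# floor.
def decayed_epsilon(cur_episode, min_epsilon, max_epsilon, target_episode):
    # same linear schedule as ApproxQAgent._decayed_epsilon
    slope = (min_epsilon - max_epsilon) / target_episode
    return max(min_epsilon, slope * cur_episode + max_epsilon)

for ep in (0, 25, 50, 100, 500):
    print(ep, decayed_epsilon(ep, 0.2, 1.0, 100))
# -> 1.0, 0.8, 0.6, 0.2, 0.2 (up to float rounding)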