Example #1
class IterableApproxArray:
    def __init__(self, lists):
        if isinstance(lists, Transform):
            group = lists.lists + [lists.timings]
            self.srcArray = array(list(zip(*group)))
            self.dimSize = len(group)
        else:
            self.srcArray = array(list(zip(*lists)))
            self.dimSize = len(lists)
        self.indices = None
        self.weights = None
        self.approx = [None for i in range(0, self.dimSize)]
        self.approximator = Approximator(self.dimSize)
        self.maxError = None

    def approximate(self):
        self.indices, self.weights = self.approximator.approximateIterable(self.srcArray, self.indices, self.weights)
        for i in range(0, self.dimSize):
            self.approx[i] = [self.srcArray[j][i] for j in self.indices]
        item = self.approximator.findMaxError(self.weights)
        self.maxError = self.weights[item][1]

    def approximateByError(self, err):
        indices = self.approximator.approximate(self.srcArray, err)
        for i in range(0, self.dimSize):
            self.approx[i] = [self.srcArray[j][i] for j in indices]
        self.maxError = None

    def clean(self):
        self.indices = None
        self.weights = None
        self.approx = [None for i in range(0, self.dimSize)]
        self.maxError = None
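
A hypothetical driver for the class above, showing how it appears intended to be used: build the array from parallel lists, call approximate() repeatedly until maxError is small, then read the simplified lists. The sample data, the 0.05 threshold, and the assumption that repeated approximate() calls reduce maxError are illustrative guesses; the project's Approximator and numpy's array are assumed to be importable as in the snippet.

# Hypothetical usage sketch for IterableApproxArray (not part of the original source).
xs = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
ys = [0.0, 0.8, 1.5, 1.9, 2.0, 1.8]

arr = IterableApproxArray([xs, ys])    # dimSize == 2
arr.approximate()                      # first pass fills indices, weights and maxError
while arr.maxError > 0.05:             # refine until the worst error is acceptable (assumed semantics)
    arr.approximate()
print(arr.approx)                      # the simplified [xs, ys] lists
arr.clean()                            # reset state so the object can be reused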
Example #2
    def __init__(self, env: Env = None,
                       trans_capacity = 20000,
                       hidden_dim: int = 16):
        '''set input_dim(w.r.t. obs.space) and output_dim(w.r.t. action_space)...

        super(...).__init__(...),
        self.Q = Approximator(...)
        self.PQ = self.Q.clone() #PQ for updating parameters

        #args
            env: environment of this agent
            trans_capacity:<int>max num. of transitions in memory
            hidden_dim:<int>num. of nodes in hidden layer
        '''
        if env is None:
            raise "agent should have an environment"
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]     #e.g. observation_space>>Box(6,), .shape>>(6,)

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n                #
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input = self.input_dim,
                              dim_output = self.output_dim,
                              dim_hidden = self.hidden_dim)
        self.PQ = self.Q.clone()  # network whose parameters are updated frequently
        return
Example #3
    def __init__(self,
                 env: Env = None,
                 trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise "agent should have an environment"
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        self.PQ = self.Q.clone()  # network whose parameters are updated frequently
        return
Example #4
    def __init__(self, env: Env = None,
                 trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise Exception("agent should have an environment")
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1

        # Adapt to different observation- and action-space types
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]
        # print("{},{}".format(self.input_dim, self.output_dim))

        # number of neurons in the hidden layer
        self.hidden_dim = hidden_dim
        # The key is in the two statements below, which declare two approximate value functions.
        # The variable Q is an approximate function (network) that computes values and produces the loss;
        # its parameters are held fixed for a period of time.
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        # The variable PQ is an approximate function (network) that generates the policy;
        # its parameters are updated frequently.
        self.PQ = self.Q.clone()
        return
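
Examples #2 to #4 all build two copies of the same approximator: PQ is trained every step, while Q is held fixed and only refreshed from PQ after an update, which stabilises the bootstrap target. The sketch below illustrates that pattern with a trivial NumPy "network" and made-up data; TinyNet and all the numbers are assumptions for illustration, not the project's Approximator.

import copy
import numpy as np

class TinyNet:
    """Stand-in for Approximator: linear Q(s) = s @ W (illustration only)."""
    def __init__(self, n_in, n_out):
        self.W = np.zeros((n_in, n_out))
    def __call__(self, s):
        return np.asarray(s) @ self.W
    def clone(self):
        return copy.deepcopy(self)

PQ = TinyNet(n_in=4, n_out=2)      # trained every step
Q = PQ.clone()                     # frozen target network

s0 = np.random.rand(8, 4)          # fake batch of states
s1 = np.random.rand(8, 4)
r = np.random.rand(8)
done = np.zeros(8, dtype=bool)

# the target uses the frozen Q, mirroring _learn_from_memory above
q_target = r + 0.99 * np.max(Q(s1), axis=1) * (~done)
PQ.W += 1e-3 * np.random.randn(*PQ.W.shape)   # stand-in for PQ.fit(...)
Q = PQ.clone()                     # refresh the target network afterwards
print(q_target.shape)              # (8,)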
Example #5
    def __init__(self, lists):
        if isinstance(lists, Transform):
            group = lists.lists + [lists.timings]
            self.srcArray = array(list(zip(*group)))
            self.dimSize = len(group)
        else:
            self.srcArray = array(list(zip(*lists)))
            self.dimSize = len(lists)
        self.indices = None
        self.weights = None
        self.approx = [None for i in range(0, self.dimSize)]
        self.approximator = Approximator(self.dimSize)
        self.maxError = None
Example #6
def speed_test():
    from approximator import Approximator
    import time
    import tensorflow as tf
    from learn.preprocessing import faster_featurize
    import settings
    from environment import load_DS

    settings.init()
    load_DS('dataset/krk.epd')
    settings.params['PL'] = list('KRkr')
    model_fn = 'Models/stem_leaf/TDLeaf/TDLeaf_stem_or_leaf_7__03_07/TDLeaf_stem_or_leaf_7__03_07-1_13299-0'
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(model_fn + '.meta')
        saver.restore(sess, model_fn)
        approx = Approximator(sess)
        V = approx.value
        F = faster_featurize

        avg_time1 = 0
        avg_time2 = 0
        avg_time3 = 0

        for _ in xrange(20):
            env = Environment()

            flag = False
            mv_cnt = 0
            time1 = 0
            time2 = 0
            time3 = 0
            while not flag:
                if env.is_terminal():
                    flag = True

                else:
                    start = time.time()
                    a, score = alphabeta_native(V, F, env, 3, -float('inf'),
                                                float('inf'))
                    end = time.time()
                    a2, score2 = alphabeta_batch_hist(V, F, env,
                                                      list(env.hist.keys()), 3,
                                                      -float('inf'),
                                                      float('inf'))
                    end2 = time.time()
                    a3, score = alphabeta_batch(V, F, env, 3, -float('inf'),
                                                float('inf'))
                    end3 = time.time()
                    env.perform_action(a)
                    time1 += end - start
                    time2 += end2 - end
                    time3 += end3 - end2
                    mv_cnt += 1

            avg_time1 += time1 / mv_cnt
            avg_time2 += time2 / mv_cnt
            avg_time3 += time3 / mv_cnt

        print avg_time1 / 100, avg_time2 / 100, avg_time3 / 100
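
speed_test above times the three alpha-beta variants with back-to-back time.time() calls. A small helper built on time.perf_counter (higher resolution, monotonic, Python 3) can make such comparisons less error-prone; the sketch below is a generic pattern, not part of the original code.

import time

def timed(fn, *args, **kwargs):
    """Run fn once and return (result, elapsed seconds)."""
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    return result, time.perf_counter() - start

# example with a throwaway workload
_, elapsed = timed(sum, range(1000000))
print('elapsed: %.6f s' % elapsed)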
Example #7
def model_play(m1, m2, play_file):
    # script playing models against each other. (slooow)
    import cPickle as cp
    with open(play_file, 'rb') as f:
        states = cp.load(f)

    scores = [0, 0]

    for s in states:
        for i in xrange(2):
            if i == 0:
                M = [m1, m2]
            else:
                M = [m2, m1]

            e = Environment(s)
            board = chess.Board.from_epd(e.current_state)[0]

            while not board.is_game_over(claim_draw=True):
                print board, '\n\n'

                with tf.Session() as sess:
                    saver = tf.train.import_meta_graph(M[int(e.get_turn())] +
                                                       '.meta')
                    saver.restore(sess, M[int(e.get_turn())])
                    approx = Approximator(sess)
                    agent = tdstem.TDStemPlayAgent(approx, depth=3)

                    a, _, _ = agent.play(e)
                    board.push_uci(a)

            if board.result() == '1-0':
                if i == 0: scores[0] += 1
                else: scores[1] += 1
            elif board.result() == '0-1':
                if i == 0: scores[1] += 1
                else: scores[0] += 1

    print scores

    while not board.is_game_over(claim_draw=True):
        a, _, _ = agents[env.get_turn()].play(env)
        board.push_uci(a)
        mv_cnt += 1

        if board.result() == '1-0':
            if start_turn:
                wdl = 1
            else:
                wdl = -1
        elif board.result() == '0-1':
            if start_turn:
                wdl = -1
            else:
                wdl = 1
        else:
            wdl = 0
Example #8
def play(model_fn, color, start_board=None, sim=False, depth=3):

    env=Environment(start_board)
    pol=GreedyPolicy()
    with tf.Session() as sess:
        if model_fn is not None: 
            saver=tf.train.import_meta_graph(model_fn+'.meta')
            saver.restore(sess,model_fn)

        approx=Approximator(sess)
        a=[None,None]
        if sim:
            a[int(color)]=tdstem.TDStemPlayAgent(approx,depth=depth)
            a[int(not color)]=opt.OptimalAgent()
            
        else:
            a[int(not color)]=tdstem.TDStemPlayAgent(approx,depth=depth)
            a[int(color)]=tdstem.TDStemPlayAgent(approx,depth=depth)

        oa=opt.OptimalAgent()

        flag=False
        
        name=str(raw_input("What's your name? "))
        print "Let's play a game, %s!" %(str(name))

        while not flag:
            time.sleep(2)

            env.draw() 
            print 'DTM: {}'.format(np.abs(oa.approx.tb.probe_dtm(chess.Board.from_epd(env.current_state)[0])))

            if env.is_game_over():
                print env.result()
                flag=True

            else:
                print 'Evaluation: {}'.format(a[int(color)].get_av_pairs(env))
                print 'Optimal moves: {}'.format(oa.get_best_moves(env))
                start=time.time()

                if env.get_turn()==color:
                    if sim:
                        a[int(color)].play(env)
                    else:
                        suc=False
                        while not suc:
                            m=str(raw_input('YOUR MOVE: '))
                            try:
                                env.perform_action(m)
                                suc=True
                            except Exception:
                                print 'Illegal move, try again.'

                else:
                    a[int(not color)].play(env)
Example #9
def test():
    from approximator import Approximator
    import time
    import tensorflow as tf
    from learn.preprocessing import faster_featurize
    env = Environment(draw_r=-1, move_r=0.001)
    env.reset()
    model_fn = 'Models/DeepTDy_m8_krk_3-4_cont__07_05/DeepTDy_m8_krk_3-4_cont__07_05-0_0173614-0'
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(model_fn + '.meta')
        saver.restore(sess, model_fn)
        approx = Approximator(sess)
        V = approx.value
        F = faster_featurize
        flag = False
        mv_cnt = 0
        time1 = 0
        time2 = 0
        trans = dict()
        while not flag:
            env.draw()
            print env.hist
            print '\n'

            if env.is_terminal():
                print env.result()
                flag = True

            else:
                start = time.time()
                a, score = alphabeta_batch(V, F, env, 3, -float('inf'),
                                           float('inf'))
                end = time.time()
                a2, score2, leaf = alphabeta_batch_hist_leaf(
                    V, F, env, list(env.hist.keys()), 3, -float('inf'),
                    float('inf'))
                end2 = time.time()
                #assert np.abs(score-score2)<0.001
                env.perform_action(a2)
                time1 += end - start
                time2 += end2 - end
                mv_cnt += 1
                print('\nLeaf:')
                Environment(state=leaf).draw()

        print('AB-Minimax Batch: {}\tAB-Minimax hist:{}'.format(
            time1 / mv_cnt, time2 / mv_cnt))
Example #10
#!/usr/bin/env python3

from approximator import Approximator

if __name__ == "__main__":
    approx = Approximator(n=20, interval=(-2, 1), params=(0.5, 0))
    res = approx.search(amp=1)
    print("c = {}, d = {}".format(res[0], res[1]))
Example #11
class ApproxQAgent(Agent):
    '''Q-learning agent implemented with an approximate value function
    '''
    def __init__(self, env: Env = None,
                       trans_capacity = 20000,
                       hidden_dim: int = 16):
        if env is None:
            raise "agent should have an environment"
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input = self.input_dim,
                              dim_output = self.output_dim,
                              dim_hidden = self.hidden_dim)
        self.PQ = self.Q.clone()  # network whose parameters are updated frequently
        return

    def _decayed_epsilon(self,cur_episode: int, 
                              min_epsilon: float, 
                              max_epsilon: float, 
                              target_episode: int) -> float:
        '''Return an epsilon that decays linearly within the given range
        '''
        slope = (min_epsilon - max_epsilon) / (target_episode)
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon = None):
        '''Produce an action from the policy-updating value function (network) PQ
        '''
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))
        
    def performPolicy(self, s, epsilon = None):
        return self._curPolicy(s, epsilon)


    def _update_Q_net(self):
        '''Copy the policy-updating Q network (with its parameters) to the network that outputs target Q values
        '''
        self.Q = self.PQ.clone()

    
    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs):
        trans_pieces = self.sample(batch_size)  # randomly sample Transitions from memory
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        y_batch = self.Q(states_0)  # result in numpy format

        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1)*\
            (~ is_done)  # when is_done, Q_target == reward_1
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x = X_batch, 
                           y = y_batch, 
                           learning_rate = learning_rate,
                           epochs = epochs)

        mean_loss = loss.sum().data[0] / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma = 0.99,
                       learning_rate=1e-5, 
                       max_episodes=1000, 
                       batch_size = 64,
                       min_epsilon = 0.2,
                       epsilon_factor = 0.1,
                       epochs = 1):

        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode = num_episode,
                                            min_epsilon = min_epsilon, 
                                            max_epsilon = 1,
                                            target_episode = target_episode)
            self.state = self.env.reset()
            # self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state

                a0  = self.performPolicy(s0, epsilon)
                s1, r1, is_done, info, total_reward = self.act(a0)
                # self.env.render()
                step_in_episode += 1
                
                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, 
                                                    batch_size, 
                                                    learning_rate,
                                                    epochs)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".
                format(self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1

        return   
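
_decayed_epsilon implements a straight-line schedule: epsilon falls from max_epsilon at episode 0 to min_epsilon at target_episode and is clamped there afterwards. The standalone sketch below reproduces that formula so the shape of the schedule is easy to check; the sample numbers are arbitrary.

def decayed_epsilon(cur_episode, min_epsilon, max_epsilon, target_episode):
    # linear interpolation from max_epsilon down to min_epsilon, then clamped
    slope = (min_epsilon - max_epsilon) / target_episode
    return max(min_epsilon, slope * cur_episode + max_epsilon)

for ep in (0, 25, 50, 75, 100, 200):
    print(ep, round(decayed_epsilon(ep, min_epsilon=0.2, max_epsilon=1.0, target_episode=100), 2))
# prints: 0 1.0, 25 0.8, 50 0.6, 75 0.4, 100 0.2, 200 0.2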
Example #12
class ApproxQAgent(Agent):
    def __init__(self,
                 env: Env = None,
                 trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise "agent should have an environment"
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        self.PQ = self.Q.clone()
        return

    def _decayed_epsilon(self, cur_episode: int, min_epsilon: float,
                         max_epsilon: float, target_episode: int) -> float:

        slope = (min_epsilon - max_epsilon) / (target_episode)
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):

        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):

        self.Q = self.PQ.clone()

    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs, r,
                           s):
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        y_batch = self.Q(states_0)

        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * \
                   (~ is_done)
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x=X_batch,
                           y=y_batch,
                           learning_rate=learning_rate,
                           epochs=epochs)

        mean_loss = loss.sum().data[0] / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self,
                 gamma=0.99,
                 learning_rate=1e-5,
                 max_episodes=1000,
                 batch_size=64,
                 min_epsilon=0.2,
                 epsilon_factor=0.1,
                 epochs=1):

        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor

        file = open('dqn.csv', 'w')
        file.write("Episode" + "," + "Distance" + "\n")
        tot_dis = 0

        file = open('reward.csv', 'w')
        file.write("Steps in Episode" + "," + "reward" + "\n")
        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env._reset()
            self.env._render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state

                a0 = self.performPolicy(s0, epsilon)
                s1, r1, is_done, dis_info = self.env._step_b(a0)
                self.env._render()
                step_in_episode += 1

                tot_dis += r1
                print("Step in Episode :: ", step_in_episode)
                print("Distance of agent from goal :: ", dis_info)
                file.write(str(step_in_episode) + "," + str(tot_dis) + "\n")

                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, batch_size,
                                                    learning_rate, epochs, r1,
                                                    s1)

            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".format(
                self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
            #print("Episode :: ", num_episode)
            # print("Distance of agent from goal :: ", dis_info)
        file.close()
        return
Example #13
def comparison_stem_leaf_kqk():

    model_fn_leaf='Models/KQK/TDL/network'
    with open('Models/KQK/TDL_BAD/sim','rb') as f:
        A,evaldict,S=cp.load(f)
    wc_l=np.mean(np.array(evaldict['wc']))
    we_l=np.mean(np.array(evaldict['we']))
    lhs_l=np.mean(np.array(evaldict['lhs']))
    #t=stem[''] 
    print wc_l, we_l, lhs_l
    model_fn_leaf='Models/KQK/TDS/network'
    with open('Models/KQK/TDS_BAD/sim','rb') as f:
        A,evaldict,S=cp.load(f)
    wc_s=np.mean(np.array(evaldict['wc']))
    we_s=np.mean(np.array(evaldict['we']))
    lhs_s=np.mean(np.array(evaldict['lhs']))
    #t=stem[''] 
    print wc_s, we_s, lhs_s
    with open('Models/KQK/TDL_BAD/meta','rb') as f:
        leaf=cp.load(f)
    with open('Models/KQK/TDS_BAD/meta','rb') as f:
        stem=cp.load(f)
    mps_s=np.mean(np.array(stem['mps']))
    mps_l=np.mean(np.array(leaf['mps']))
    ntot_s=stem['episodes']
    ntot_l=leaf['episodes']
    el_s=stem['elapsed_time']
    el_l=leaf['elapsed_time']
    print mps_s, mps_l, ntot_s, ntot_l, el_s, el_l



    import tablebases
    with open('Models/KQK/TDL/meta','rb') as f:
        leaf=cp.load(f)
    with open('Models/KQK/TDS/meta','rb') as f:
        stem=cp.load(f)

    settings.init()
    load_DS('dataset/kqk_fics.epd')
    settings.params['PL']='KQkq'
    settings.params['USE_DSET']=True

    N_l=np.array([0]+leaf['N'],dtype=float) 
    N_s=np.array([0]+stem['N'],dtype=float) 
    eps_l=leaf['eps'] 
    eps_s=stem['eps']
    w_l=np.array([0]+leaf['w_list'])/N_l
    w_s=np.array([0]+stem['w_list'])/N_s
    e_l=np.cumsum(N_l)
    e_s=np.cumsum(N_s)

    stages_s=[t[0] for t in stem['lambda']]
    stages_l=[t[0] for t in leaf['lambda']]
    print leaf['lambda']

    l_l=leaf['avg_len'] 
    l_s=stem['avg_len']

    plt.figure(1)
    plt.subplot(111)
    line_stem, =plt.plot(e_s,w_s,label='TD-Stem'+r'$(\lambda)$')
    line_leaf, =plt.plot(e_l,w_l,label='TD-Leaf'+r'$(\lambda)$')
    for i in stages_s:
        plt.axvline(x=i,color='#99ccff')
    for i in stages_l:
        plt.axvline(x=i,color='#ffc266')
    plt.xlabel(r'$N$')
    plt.ylabel('winning rate')
    plt.legend(handles=[line_leaf,line_stem])
    plt.xlim(0,max(max(e_l),max(e_s)))
    plt.ylim(0,1)
    plt.show()

    mps_s=np.mean(np.array(stem['mps']))
    mps_l=np.mean(np.array(leaf['mps']))
    ntot_s=stem['episodes']
    ntot_l=leaf['episodes']
    el_s=stem['elapsed_time']
    el_l=leaf['elapsed_time']

    print mps_s, mps_l, ntot_s, ntot_l, el_s, el_l

    model_fn_stem='Models/KQK/TDS/network'
    with open('Models/KQK/TDS/sim2','rb') as f:
        A,evaldict,S=cp.load(f)

    wc_s=np.mean(np.array(evaldict['wc']))
    we_s=np.mean(np.array(evaldict['we']))
    lhs_s=np.mean(np.array(evaldict['lhs']))
    #t=stem['']
    print wc_s, we_s, lhs_s

    tw=[t for t in A if tablebases.probe_result(t[-1])==1]
    td=[t for t in A if tablebases.probe_result(t[-1])==0]
    tb=[t for t in A if tablebases.probe_result(t[-1])==-1]

    Sw=[t[-1] for t in tw]
    dtmw=[t[1] for t in tw]
    print min(dtmw)
    wdlw=[t[0] for t in tw]

    Sb=[t[-1] for t in tb]
    dtmb=[t[1] for t in tb]
    wdlb=[t[0] for t in tb]
    print min(dtmb)

    vw_s=Approximator.V(Sw,model_fn_stem)
    vb_s=Approximator.V(Sb,model_fn_stem)

    hist_wcs=20*[0]
    hist_wes=20*[0]
    hist_lhss=20*[0]
    hist_dcs=20*[0]
    avg_vs=20*[0]
    std_vs=20*[0]
    avg_vsb=20*[0]
    std_vsb=20*[0]
    #print A
    for i in xrange(len(hist_wcs)):
        hist_wcs[i]=np.mean(np.array([wc(t) for t in A if wc(t) is not None and 
                    t[1]==i+1]))
        hist_wes[i]=np.mean(np.array([we(t) for t in A if we(t) is not None and 
                    t[1]==i+1]))
        hist_lhss[i]=np.mean(np.array([lhs(t) for t in A if lhs(t) is not None and 
                    t[1]==i+1]))
        hist_dcs[i]=np.mean(np.array([dc(t) for t in A if dc(t) is not None and 
                    t[1]==i+1]))
        avg_vs[i]=np.mean(np.array([vw_s[j] for j in xrange(vw_s.shape[0]) if
                                   dtmw[j]==i+1 ]))
        std_vs[i]=np.std(np.array([vw_s[j] for j in xrange(vw_s.shape[0]) if
                                   dtmw[j]==i+1 and wdlw[j]==1]))
        avg_vsb[i]=np.mean(np.array([vb_s[j] for j in xrange(vb_s.shape[0]) if
                                   dtmb[j]==i+1 ]))
        std_vsb[i]=np.std(np.array([vb_s[j] for j in xrange(vb_s.shape[0]) if
                                   dtmb[j]==i+1 and wdlb[j]==1]))

    model_fn_leaf='Models/KQK/TDL/network'
    with open('Models/KQK/TDL/sim2','rb') as f:
        A,evaldict,S=cp.load(f)
    wc_l=np.mean(np.array(evaldict['wc']))
    we_l=np.mean(np.array(evaldict['we']))
    lhs_l=np.mean(np.array(evaldict['lhs']))
    #t=stem['']
    print wc_l, we_l, lhs_l

    tw=[t for t in A if tablebases.probe_result(t[-1])==1]
    td=[t for t in A if tablebases.probe_result(t[-1])==0]
    tb=[t for t in A if tablebases.probe_result(t[-1])==-1]
    Sw=[t[-1] for t in tw]
    dtmw=[t[1] for t in tw]
    wdlw=[t[0] for t in tw]
    Sb=[t[-1] for t in tb]
    dtmb=[t[1] for t in tb]
    wdlb=[t[0] for t in tb]

    vw_l=Approximator.V(Sw,model_fn_leaf)
    vb_l=Approximator.V(Sb,model_fn_leaf)

    hist_wcl=20*[0]
    hist_wel=20*[0]
    hist_lhsl=20*[0]
    hist_dcl=20*[0]
    avg_vl=20*[0]
    std_vl=20*[0]
    avg_vlb=20*[0]
    std_vlb=20*[0]

    for i in xrange(len(hist_wcs)):
        hist_wcl[i]=np.mean(np.array([wc(t) for t in A if wc(t) is not None and 
                    t[1]==i+1]))
        hist_wel[i]=np.mean(np.array([we(t) for t in A if we(t) is not None and 
                    t[1]==i+1]))
        hist_lhsl[i]=np.mean(np.array([lhs(t) for t in A if lhs(t) is not None and 
                    t[1]==i+1]))
        hist_dcl[i]=np.mean(np.array([dc(t) for t in A if dc(t) is not None and 
                    t[1]==i+1]))
        avg_vl[i]=np.mean(np.array([vw_l[j] for j in xrange(vw_l.shape[0]) if
                                   dtmw[j]==i+1 ]))
        std_vl[i]=np.std(np.array([vw_l[j] for j in xrange(vw_l.shape[0]) if
                                   dtmw[j]==i+1 and wdlw[j]==1]))
        avg_vlb[i]=np.mean(np.array([vb_l[j] for j in xrange(vb_l.shape[0]) if
                                   dtmb[j]==i+1 ]))
        std_vlb[i]=np.std(np.array([vb_l[j] for j in xrange(vb_l.shape[0]) if
                                   dtmb[j]==i+1 and wdlb[j]==1]))

    x=np.array(range(1,len(hist_wcs)+1))
    plt.figure(2)
    plt.subplot(111)
    b1=plt.bar(x-1./6, hist_wcs,width=1./3,align='center',label='TD-Stem'+r'$(\lambda)$ ')
    b2=plt.bar(x+1./6, hist_wcl,width=1./3,align='center',label='TD-Leaf'+r'$(\lambda)$ ')
    #plt.title('kqk endgame win conversion rate')
    plt.legend(handles=[b1,b2])
    plt.xlabel('DTM')
    plt.ylabel('WCR')
    plt.xlim(0,x.max())
    plt.ylim(0,1)
    plt.xticks(x,x)
    plt.show()

    plt.figure(3)
    plt.subplot(111)
    b1=plt.bar(x-1./6, hist_wes,width=1./3,align='center',label='TD-Stem'+r'$(\lambda)$ ')
    b2=plt.bar(x+1./6, hist_wel,width=1./3,align='center',label='TD-Leaf'+r'$(\lambda)$ ')
    #plt.title('kqk endgame win efficiency')
    plt.legend(handles=[b1,b2])
    plt.xlabel('DTM')
    plt.ylabel('WE')
    plt.xlim(0,x.max())
    plt.ylim(0,1)
    plt.xticks(x,x)
    plt.show()

    plt.figure(4)
    plt.subplot(111)
    b1=plt.bar(x-1./6, hist_lhss,width=1./3,align='center',label='TD-Stem'+r'$(\lambda)$ ')
    b2=plt.bar(x+1./6, hist_lhsl,width=1./3,align='center',label='TD-Leaf'+r'$(\lambda)$ ')
    #plt.title('kqk endgame loss holding score')
    plt.legend(handles=[b1,b2])
    plt.xlabel('DTM')
    plt.ylabel('LHS')
    plt.xlim(0,x.max())
    plt.ylim(0,1)
    plt.xticks(x,x)
    plt.show()

    plt.figure(5)
    b1,=plt.plot(x,avg_vs,label='TD-Stem'+r'$(\lambda)$ ')
    b2,=plt.plot(x,avg_vl,label='TD-Leaf'+r'$(\lambda)$ ')
    s1,=plt.plot(x,np.array(avg_vs)+2*np.array(std_vs),color='#99ccff')
    s2,=plt.plot(x,np.array(avg_vs)-2*np.array(std_vs),color='#99ccff')
    s3,=plt.plot(x,np.array(avg_vl)+2*np.array(std_vl),color='#ffc266')
    s4,=plt.plot(x,np.array(avg_vl)-2*np.array(std_vl),color='#ffc266')

    c1,=plt.plot(x,avg_vsb,label='TD-Stem'+r'$(\lambda)$ ',color=b1.get_color())
    c2,=plt.plot(x,avg_vlb,label='TD-Leaf'+r'$(\lambda)$ ',color=b2.get_color())
    t1,=plt.plot(x,np.array(avg_vsb)+2*np.array(std_vsb),color='#99ccff')
    t2,=plt.plot(x,np.array(avg_vsb)-2*np.array(std_vsb),color='#99ccff')
    t3,=plt.plot(x,np.array(avg_vlb)+2*np.array(std_vlb),color='#ffc266')
    t4,=plt.plot(x,np.array(avg_vlb)-2*np.array(std_vlb),color='#ffc266')
    plt.xticks(x,x)

    #plt.title('krk endgame win conversion rate')
    plt.legend(handles=[b1,b2])
    plt.xlabel('DTM')
    plt.ylabel('E[V]')
    plt.xlim(0,x.max())
    #plt.ylim(0,1)
    plt.show()
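
The DTM histograms above place two bar series side by side by offsetting each series by half a bar width (plus or minus 1/6 with width 1/3). The minimal sketch below shows just that plotting pattern with synthetic data, as a readable reference for the calls used in this and the next example; the values are made up.

import numpy as np
import matplotlib.pyplot as plt

x = np.arange(1, 11)                 # DTM values (synthetic)
stem = np.random.rand(10)            # fake WCR values for TD-Stem
leaf = np.random.rand(10)            # fake WCR values for TD-Leaf

width = 1.0 / 3
b1 = plt.bar(x - width / 2, stem, width=width, align='center', label='TD-Stem')
b2 = plt.bar(x + width / 2, leaf, width=width, align='center', label='TD-Leaf')
plt.legend(handles=[b1, b2])
plt.xlabel('DTM')
plt.ylabel('WCR')
plt.xticks(x, x)
plt.show()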
Example #14
def comparison_stem_leaf():
    settings.init()
    settings.params['USE_DSET']=True
    settings.params['PL']='KRkr'
    load_DS('dataset/krk.epd')
    
    with open('Models/stem_leaf/TDLeaf/TDLeaf_stem_or_leaf_7__03_07/stem_or_leaf_7_meta.sv','rb') as f:
        leaf=cp.load(f)
    with open('Models/stem_leaf/TDStem/TDStem_stem_or_leaf_7__28_06/stem_or_leaf_7_meta.sv','rb') as f:
        stem=cp.load(f)
    print leaf.keys()
    print stem.keys()
    N_l=leaf['N'][0]
    N_s=stem['N'][0]
    #print N_l, N_s
    w_l=leaf['w_list']
    r_l=leaf['r_lists']
    l_l=leaf['avg_len']
    w_s=stem['w_list']
    r_s=stem['r_lists']
    l_s=stem['avg_len']

    mps_s=np.mean(np.array(stem['mps']))
    mps_l=np.mean(np.array(leaf['mps']))
    ntot_s=stem['episodes']
    ntot_l=leaf['episodes']
    el_s=stem['elapsed_time']
    el_l=leaf['elapsed_time']

    print mps_s, mps_l, ntot_s, ntot_l, el_s, el_l

    ep_s2=[0]
    rate_s=[]
    cumsum=0
    for i in xrange(len(w_s)):
        if i<53:
            cumsum+=5000
            rate_s.append(5000.)
        elif i<73:
            cumsum+=500
            rate_s.append(500.)
        else:
            cumsum+=250
            rate_s.append(250.)
        ep_s2.append(cumsum)
    wr_s=np.array([0]+w_s)/np.array([1]+rate_s)
    rrw_s=5000*np.array(r_s[0])/np.array(rate_s)
    rrb_s=5000*np.array(r_s[1])/np.array(rate_s)

    ep_l2=[0]
    rate_l=[]
    cumsum=0
    for i in xrange(len(w_l)):
        if i<63:
            cumsum+=5000
            rate_l.append(5000.)
        elif i<83:
            cumsum+=500
            rate_l.append(500.)
        else:
            cumsum+=250
            rate_l.append(250.)
        ep_l2.append(cumsum)
    wr_l=np.array([0]+w_l)/np.array([1]+rate_l)
    rrw_l=5000*np.array(r_l[0])/np.array(rate_l)
    rrb_l=5000*np.array(r_l[1])/np.array(rate_l)

    plt.figure(1)
    plt.subplot(111)
    line_stem, =plt.plot(ep_s2,wr_s,label='TD-Stem'+r'$(\lambda)$')
    for i in [100000,170000,264000,275000,283200,291000]:
        plt.axvline(x=i,color='#99ccff')
    line_leaf, =plt.plot(ep_l2,wr_l,label='TD-Leaf'+r'$(\lambda)$')
    for i in [120000,220000,315000,325250,333000,341000]:
        plt.axvline(x=i,color='#ffc266')
    plt.xlabel(r'$N$')
    plt.ylabel('winning rate')
    plt.legend(handles=[line_leaf,line_stem])
    plt.xlim(0,max(ep_l2))
    plt.ylim(0,1)
    #plt.title('krk endgame learning curve')

    plt.show()

    mps_s=np.mean(np.array(stem['mps']))
    mps_l=np.mean(np.array(leaf['mps']))
    ntot_s=stem['episodes']
    ntot_l=leaf['episodes']
    el_s=stem['elapsed_time']
    el_l=leaf['elapsed_time']

    print mps_s, mps_l, ntot_s, ntot_l, el_s, el_l

    model_fn='Models/stem_leaf/TDStem/TDStem_stem_or_leaf_7__28_06/TDStem_stem_or_leaf_7__28_06-1_23116-0'
    with open('Models/stem_leaf/TDStem/sim','rb') as f:
        A,evaldict,S=cp.load(f)
    
    wc_s=np.mean(np.array(evaldict['wc']))
    we_s=np.mean(np.array(evaldict['we']))
    lhs_s=np.mean(np.array(evaldict['lhs']))
    #t=stem['']
    print wc_s, we_s, lhs_s

    S=[t[-1] for t in A]
    dtm=[t[1] for t in A]
    wdl=[t[0] for t in A]
    v=Approximator.V(S,model_fn)
    hist_wcs=33*[0]
    hist_wes=33*[0]
    hist_lhss=33*[0]
    avg_vs=33*[0]
    std_vs=33*[0]
    #print A
    for i in xrange(len(hist_wcs)):
        hist_wcs[i]=np.mean(np.array([wc(t) for t in A if wc(t) is not None and 
                    t[1]==i]))
        hist_wes[i]=np.mean(np.array([we(t) for t in A if we(t) is not None and 
                    t[1]==i]))
        hist_lhss[i]=np.mean(np.array([lhs(t) for t in A if lhs(t) is not None and 
                    t[1]==i]))
        avg_vs[i]=np.mean(np.array([v[j] for j in xrange(v.shape[0]) if
                                   dtm[j]==i and wdl[j]==1]))
        std_vs[i]=np.std(np.array([v[j] for j in xrange(v.shape[0]) if
                                   dtm[j]==i and wdl[j]==1]))
        
    model_fn='Models/stem_leaf/TDLeaf/TDLeaf_stem_or_leaf_7__03_07/TDLeaf_stem_or_leaf_7__03_07-1_13299-0'
    with open('Models/stem_leaf/TDLeaf/sim','rb') as f:
        A,evaldict,S=cp.load(f)
    wc_l=np.mean(np.array(evaldict['wc']))
    we_l=np.mean(np.array(evaldict['we']))
    lhs_l=np.mean(np.array(evaldict['lhs']))
    #t=stem['']
    print wc_l, we_l, lhs_l

    S=[t[-1] for t in A]
    dtm=[t[1] for t in A]
    wdl=[t[0] for t in A]
    v=Approximator.V(S,model_fn)
    hist_wcl=33*[0]
    hist_wel=33*[0]
    hist_lhsl=33*[0]
    avg_vl=33*[0]
    std_vl=33*[0]
    #print A
    for i in xrange(len(hist_wcl)):
        hist_wcl[i]=np.mean(np.array([wc(t) for t in A if wc(t) is not None and 
                    t[1]==i+1]))
        hist_wel[i]=np.mean(np.array([we(t) for t in A if we(t) is not None and 
                    t[1]==i+1]))
        hist_lhsl[i]=np.mean(np.array([lhs(t) for t in A if lhs(t) is not None and 
                    t[1]==i+1]))
        avg_vl[i]=np.mean(np.array([v[j] for j in xrange(v.shape[0]) if
                                   dtm[j]==i+1 and wdl[j]==1]))
        std_vl[i]=np.std(np.array([v[j] for j in xrange(v.shape[0]) if
                                   dtm[j]==i+1 and wdl[j]==1]))
    
    x=np.array(range(1,len(hist_wcs)+1))

    plt.figure(2)
    plt.subplot(111)
    b1=plt.bar(x-1./6, hist_wcs,width=1./3,align='center',label='TD-Stem'+r'$(\lambda)$ ')
    b2=plt.bar(x+1./6, hist_wcl,width=1./3,align='center',label='TD-Leaf'+r'$(\lambda)$ ')
    #plt.title('krk endgame win conversion rate')
    plt.legend(handles=[b1,b2])
    plt.xlabel('DTM')
    plt.ylabel('WCR')
    plt.show()

    plt.figure(3)
    plt.subplot(111)
    b1=plt.bar(x-1./6, hist_wes,width=1./3,align='center',label='TD-Stem'+r'$(\lambda)$ ')
    b2=plt.bar(x+1./6, hist_wel,width=1./3,align='center',label='TD-Leaf'+r'$(\lambda)$ ')
    #plt.title('krk endgame win efficiency')
    plt.legend(handles=[b1,b2])
    plt.xlabel('DTM')
    plt.ylabel('WE')
    plt.show()

    plt.figure(4)
    plt.subplot(111)
    b1=plt.bar(x-1./6, hist_lhss,width=1./3,align='center',label='TD-Stem'+r'$(\lambda)$ ')
    b2=plt.bar(x+1./6, hist_lhsl,width=1./3,align='center',label='TD-Leaf'+r'$(\lambda)$ ')
    #plt.title('krk endgame loss holding score')
    plt.legend(handles=[b1,b2])
    plt.xlabel('DTM')
    plt.ylabel('LHS')
    plt.show()

    plt.figure(5)
    plt.subplot(111)
    b1,=plt.plot(x,avg_vs,label='TD-Stem'+r'$(\lambda)$ ')
    b2,=plt.plot(x,avg_vl,label='TD-Leaf'+r'$(\lambda)$ ')
    s1,=plt.plot(x,np.array(avg_vs)+2*np.array(std_vs),color='#99ccff')
    s2,=plt.plot(x,np.array(avg_vs)-2*np.array(std_vs),color='#99ccff')
    s3,=plt.plot(x,np.array(avg_vl)+2*np.array(std_vl),color='#ffc266')
    s4,=plt.plot(x,np.array(avg_vl)-2*np.array(std_vl),color='#ffc266')
    #plt.title('krk endgame value function')
    plt.legend(handles=[b1,b2])
    plt.xlabel('DTM')
    plt.ylabel(r'$V$')
    plt.show()
Example #15
class ApproxQAgent(Agent):
    '''Q-learning agent implemented with an approximate value function

    #Function
        1 value function approximation
        2 based on Experience Replay, which helps break the correlation between transitions within a single episode,
        in order to get a better approximation
        3 DQN
    '''
    def __init__(self, env: Env = None,
                       trans_capacity = 20000,
                       hidden_dim: int = 16):
        '''set input_dim(w.r.t. obs.space) and output_dim(w.r.t. action_space)...

        super(...).__init__(...),
        self.Q = Approximator(...)
        self.PQ = self.Q.clone() #PQ for updating parameters

        #args
            env: environment of this agent
            trans_capacity:<int>max num. of transitions in memory
            hidden_dim:<int>num. of nodes in hidden layer
        '''
        if env is None:
            raise "agent should have an environment"
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]     #e.g. observation_space>>Box(6,), .shape>>(6,)

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n                #
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input = self.input_dim,
                              dim_output = self.output_dim,
                              dim_hidden = self.hidden_dim)
        self.PQ = self.Q.clone()  # network whose parameters are updated frequently
        return

    def _decayed_epsilon(self,cur_episode: int, 
                              min_epsilon: float, 
                              max_epsilon: float, 
                              target_episode: int) -> float:
        '''Return an epsilon that decays linearly within the given range

        #return
            epsilon<float>changing from max_epsilon(when cur_episode=0) to min_epsilon w.r.t. cur_episode
        '''
        slope = (min_epsilon - max_epsilon) / (target_episode)
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)        #slope*cur_episode is negative

    def _curPolicy(self, s, epsilon = None):
        '''Produce an action from the policy-updating value function (network) PQ

        #args
            s: state s0<6x1 ndarray>
            epsilon: =None means greedy, otherwise epsilon greedy
        #return
            an action a0<int> w.r.t. PQ(policy evaluation) using decayed epsilon-greedy(policy improvement)
        '''
        Q_s = self.PQ(s)                    #
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))
        
    def performPolicy(self, s, epsilon = None):
        # can be omitted if there is only one policy
        return self._curPolicy(s, epsilon)


    def _update_Q_net(self):
        '''Copy the policy-updating Q network (with its parameters) to the network that outputs target Q values
        '''
        self.Q = self.PQ.clone()

    
    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs):
        # randomly sample Transitions from experience; returns a <list> of batch_size Transition objects (s0, a0, reward, s1, is_done)
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])  #ndarray
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0

        # ndarray of rows [Q(s0)(a_0), Q(s0)(a_1), ...]: the Q values of every action in state s0
        # y_batch = self.Q(states_0)    # would differ only in the a0 dimension
        y_batch = self.PQ(states_0)     # only Q(s, a, w) in the a0 dimension differs, and that entry is overwritten below anyway

        # matrix-wise calculation
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1)*\
            (~ is_done)  # when is_done, Q_target == reward_1

        # Attention: only the entries of the actions actually taken are overwritten
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x = X_batch, 
                           y = y_batch, 
                           learning_rate = learning_rate,
                           epochs = epochs)

        mean_loss = loss.sum().data[0] / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma = 0.99,
                       learning_rate=1e-5, 
                       max_episodes=1000, 
                       batch_size = 64,
                       min_epsilon = 0.2,
                       epsilon_factor = 0.1,
                       epochs = 1):
        '''Construct experience; once enough transitions have been stored, start learning from experience and compute the loss

        Methods details see below

        #Arguments
            gamma = 0.99,           # discount factor, in [0, 1]
            learning_rate=1e-5,     # learning rate used when learning from memory
            max_episodes=1000,      # maximum number of training episodes
            batch_size = 64,
            min_epsilon = 0.2,
            epsilon_factor = 0.1,   # ratio of the episode index at which min_epsilon
                                    # starts being used to the max episode index; the
                                    # smaller it is, the more episodes use min_epsilon
            epochs = 1):            # number of training passes per batch
        '''
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:           #for each episode until max_episode,  get loss
            epsilon = self._decayed_epsilon(cur_episode = num_episode,
                                            min_epsilon = min_epsilon, 
                                            max_epsilon = 1,
                                            target_episode = target_episode)
            self.state = self.env.reset()
            self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00    #
            is_done = False
            while not is_done:  # for every transition
                s0 = self.state                                     # self.state is changed inside self.act(a0)
                a0 = self.performPolicy(s0, epsilon)                # get an action w.r.t. PQ using decayed epsilon-greedy
                s1, r1, is_done, info, total_reward = self.act(a0)  # inside self.act(a0): self.state = s1
                # act() also stores the transition in the experience (episode_list / trans_list) and accumulates total_reward
                self.env.render()
                step_in_episode += 1
                
                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, 
                                                    batch_size, 
                                                    learning_rate,
                                                    epochs)

            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".
                format(self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1

        return   
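
The line `y_batch[np.arange(len(X_batch)), actions_0] = Q_target` overwrites, for each transition, only the Q value of the action that was actually taken, leaving the other action columns as the network's own predictions so they contribute no error to the fit. The tiny NumPy sketch below demonstrates that fancy-indexing step in isolation with made-up numbers.

import numpy as np

y_batch = np.array([[0.1, 0.2, 0.3],
                    [0.4, 0.5, 0.6]])       # predicted Q values for 2 states, 3 actions
actions = np.array([2, 0])                  # actions actually taken
q_target = np.array([1.0, -1.0])            # bootstrapped targets

y_batch[np.arange(len(y_batch)), actions] = q_target
print(y_batch)
# [[ 0.1  0.2  1. ]
#  [-1.   0.5  0.6]]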
Example #16
                        help='number of episodes to play',
                        type=int)
    parser.add_argument('-p', help='piece cfg')
    parser.add_argument('-D', help='dset file')
    parser.add_argument(
        '-R',
        default=10,
        type=int,
        help='number of random moves to play before registration')
    parser.add_argument('-d', default=3, type=int, help='depth')
    parser.add_argument('-w', action='store_true')

    args = parser.parse_args()

    settings.init()
    settings.params['USE_DSET'] = True
    settings.params['PL'] = args.p
    load_DS(args.D)
    settings.params['RAND'] = args.R
    settings.params['OC_DEPTH'] = args.d

    model_fn = args.c
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(model_fn + '.meta')
        saver.restore(sess, model_fn)
        approx = Approximator(sess)
        agent = tdstem.TDStemPlayAgent(approx, depth=3)
        A, evaldict, all_s = opt.recursive_eval_sim(agent, N=args.N, w=args.w)
        with open(args.o, 'wb') as f:
            cp.dump((A, evaldict, all_s), f)
Example #17
class ApproxQAgent(Agent):
    '''Q-learning agent implemented with an approximate value function
    '''

    def __init__(self, env: Env = None,
                 trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise Exception("agent should have an environment")
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1

        # Adapt to different observation- and action-space types
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]
        # print("{},{}".format(self.input_dim, self.output_dim))

        # number of neurons in the hidden layer
        self.hidden_dim = hidden_dim
        # The key is in the two statements below, which declare two approximate value functions.
        # The variable Q is an approximate function (network) that computes values and produces the loss;
        # its parameters are held fixed for a period of time.
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        # The variable PQ is an approximate function (network) that generates the policy;
        # its parameters are updated frequently.
        self.PQ = self.Q.clone()
        return

    def _learning_from_memory(self, gamma, batch_size, learning_rate, epochs):
        # randomly sample Transitions from memory
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        # this invokes the approximator's __call__ method
        y_batch = self.Q(states_0)

        # A batch is used, so the code is matrix arithmetic.
        # np.max with axis=1 takes the maximum over each row (across actions); axis=0 over each column.
        # ~ on a NumPy boolean array is element-wise negation, so terminal transitions drop the bootstrap term
        #   (on plain Python bools, ~True == -2 and ~False == -1, which is not what happens here).
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * (~ is_done)
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x=X_batch,
                           y=y_batch,
                           learning_rate=learning_rate,
                           epochs=epochs)
        mean_loss = loss.sum().item() / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma=0.99,
                 learning_rate=1e-5,
                 max_episodes=1000,
                 batch_size=64,
                 min_epsilon=0.2,
                 epsilon_factor=0.1,
                 epochs=1):
        '''The main job of learning() is to build up experience; once enough experience has been collected, learning from that experience starts
        '''
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env.reset()
            self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state
                a0 = self.performPolicy(s0, epsilon)
                # the act method encapsulates recording the Transition into the Experience
                s1, r1, is_done, info, total_reward = self.act(a0)
                # self.env.render()
                step_in_episode += 1
                # once the experience holds enough Transitions, start learning from it
                if self.total_trans > batch_size:
                    loss += self._learning_from_memory(gamma,
                                                       batch_size,
                                                       learning_rate,
                                                       epochs)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".
                  format(self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
        return

    def _decayed_epsilon(self, cur_episode: int,
                         min_epsilon: float,
                         max_epsilon: float,
                         target_episode: int) -> float:
        '''Return an epsilon that decays linearly within the given range
        '''
        slope = (min_epsilon - max_epsilon) / (target_episode)
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):
        '''Produce an action from the policy-updating value function (network) PQ
        '''
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):
        '''Copy the policy-updating Q network (with its parameters) to the network that outputs target Q values
        '''
        self.Q = self.PQ.clone()
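
As the comment in _learning_from_memory notes, `~` behaves differently on a NumPy boolean array than on a plain Python bool: on the array it is element-wise logical negation, which is what makes `(~ is_done)` zero out the bootstrap term for terminal transitions. A short standalone check, independent of the class above:

import numpy as np

is_done = np.array([False, True, False])
print(~is_done)          # [ True False  True]  -> element-wise negation
print(1.0 * (~is_done))  # [1. 0. 1.]           -> masks the gamma * max Q term
# By contrast, on plain Python bools ~True == -2 and ~False == -1 (bitwise integer complement).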