class ApproxQAgent(Agent):
    '''Q-learning agent implemented with an approximate (neural-network) value function.
    '''
    def __init__(self, env: Env = None,
                       trans_capacity = 20000,
                       hidden_dim: int = 16):
        if env is None:
            raise "agent should have an environment"
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input = self.input_dim,
                              dim_output = self.output_dim,
                              dim_hidden = self.hidden_dim)
        self.PQ = self.Q.clone()  # the network whose parameters are updated (generates the policy)
        return

    def _decayed_epsilon(self,cur_episode: int, 
                              min_epsilon: float, 
                              max_epsilon: float, 
                              target_episode: int) -> float:
        '''Return an epsilon that decays linearly from max_epsilon to min_epsilon.
        '''
        slope = (min_epsilon - max_epsilon) / (target_episode)
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)
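        # e.g. with min_epsilon=0.2, max_epsilon=1.0 and target_episode=100:
        # episode 0 gives 1.0, episode 50 gives 0.6, and every episode from 100
        # onwards is clamped to min_epsilon (0.2)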

    def _curPolicy(self, s, epsilon = None):
        '''Generate an action from the value function (network) whose parameters are being updated.
        '''
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))
        
    def performPolicy(self, s, epsilon = None):
        return self._curPolicy(s, epsilon)


    def _update_Q_net(self):
        '''Copy the policy-updating Q network (together with its parameters) to the network that outputs the target Q values.
        '''
        self.Q = self.PQ.clone()

    
    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs):
        trans_pieces = self.sample(batch_size)  # randomly sample Transitions from memory
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        y_batch = self.Q(states_0)  # result as a numpy array

        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1)*\
            (~ is_done)  # when is_done, Q_target == reward_1
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x = X_batch, 
                           y = y_batch, 
                           learning_rate = learning_rate,
                           epochs = epochs)

        mean_loss = loss.sum().item() / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma = 0.99,
                       learning_rate=1e-5, 
                       max_episodes=1000, 
                       batch_size = 64,
                       min_epsilon = 0.2,
                       epsilon_factor = 0.1,
                       epochs = 1):

        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode = num_episode,
                                            min_epsilon = min_epsilon, 
                                            max_epsilon = 1,
                                            target_episode = target_episode)
            self.state = self.env.reset()
            # self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state

                a0  = self.performPolicy(s0, epsilon)
                s1, r1, is_done, info, total_reward = self.act(a0)
                # self.env.render()
                step_in_episode += 1
                
                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, 
                                                    batch_size, 
                                                    learning_rate,
                                                    epochs)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".
                format(self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1

        return   
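
A minimal usage sketch for the class above, assuming an older Gym API in which reset() returns the observation directly, the project's Approximator and Agent base classes (not shown on this page), and imports along the lines of `import numpy as np`, `from random import random`, `from gym import Env, spaces`. The environment name and hyperparameters here are illustrative only:

# Hypothetical usage of ApproxQAgent (illustrative environment and hyperparameters)
import gym

env = gym.make("CartPole-v0")      # Box observation space, Discrete action space
agent = ApproxQAgent(env,
                     trans_capacity=10000,
                     hidden_dim=16)
agent.learning(gamma=0.99,
               learning_rate=1e-3,
               max_episodes=300,
               batch_size=64,
               min_epsilon=0.2,
               epsilon_factor=0.3,
               epochs=2)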
Example #2
class ApproxQAgent(Agent):
    def __init__(self,
                 env: Env = None,
                 trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise "agent should have an environment"
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        self.PQ = self.Q.clone()
        return

    def _decayed_epsilon(self, cur_episode: int, min_epsilon: float,
                         max_epsilon: float, target_episode: int) -> float:

        slope = (min_epsilon - max_epsilon) / (target_episode)
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):

        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):

        self.Q = self.PQ.clone()

    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs, r,
                           s):
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        y_batch = self.Q(states_0)

        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * \
                   (~ is_done)
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x=X_batch,
                           y=y_batch,
                           learning_rate=learning_rate,
                           epochs=epochs)

        mean_loss = loss.sum().item() / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self,
                 gamma=0.99,
                 learning_rate=1e-5,
                 max_episodes=1000,
                 batch_size=64,
                 min_epsilon=0.2,
                 epsilon_factor=0.1,
                 epochs=1):

        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor

        dist_file = open('dqn.csv', 'w')
        dist_file.write("Episode" + "," + "Distance" + "\n")
        tot_dis = 0

        reward_file = open('reward.csv', 'w')
        reward_file.write("Steps in Episode" + "," + "reward" + "\n")
        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env._reset()
            self.env._render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state

                a0 = self.performPolicy(s0, epsilon)
                s1, r1, is_done, dis_info = self.env._step_b(a0)
                self.env._render()
                step_in_episode += 1

                tot_dis += r1
                print("Step in Episode :: ", step_in_episode)
                print("Distance of agent from goal :: ", dis_info)
                reward_file.write(str(step_in_episode) + "," + str(tot_dis) + "\n")

                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, batch_size,
                                                    learning_rate, epochs, r1,
                                                    s1)

            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".format(
                self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
            #print("Episode :: ", num_episode)
            # print("Distance of agent from goal :: ", dis_info)
        dist_file.close()
        reward_file.close()
        return
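
The heart of _learn_from_memory in both snippets above is one vectorized line that builds the DQN target. A standalone NumPy illustration with made-up numbers (Q_s1 stands in for self.Q(states_1)) shows how ~is_done zeroes the bootstrap term for terminal transitions, and how the fancy-index assignment touches only the actions actually taken:

import numpy as np

gamma = 0.99
reward_1 = np.array([1.0, 0.0, 1.0])
is_done = np.array([False, False, True])
Q_s1 = np.array([[0.5, 0.7],          # Q values of both actions in each next state
                 [0.2, 0.1],
                 [0.9, 0.3]])

Q_target = reward_1 + gamma * np.max(Q_s1, axis=1) * (~is_done)
print(Q_target)                       # [1.693 0.198 1.   ] -- the terminal row keeps only its reward

# only the entries of the taken actions are overwritten
y_batch = np.zeros((3, 2))
actions_0 = np.array([1, 0, 1])
y_batch[np.arange(3), actions_0] = Q_target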
Example #3
File: agents.py  Project: jh-yi/test_RL
class ApproxQAgent(Agent):
    '''Q-learning agent implemented with an approximate value function.

    #Function
        1 value function approximation
        2 based on Experience Replay, which helps break the correlation between transitions
          within a single episode and thus yields a better approximation
        3 DQN
    '''
    def __init__(self, env: Env = None,
                       trans_capacity = 20000,
                       hidden_dim: int = 16):
        '''Set input_dim (w.r.t. observation_space) and output_dim (w.r.t. action_space)...

        super(...).__init__(...),
        self.Q = Approximator(...)
        self.PQ = self.Q.clone() # PQ for updating parameters

        #args
            env: environment of this agent
            trans_capacity:<int> max num. of transitions in memory
            hidden_dim:<int> num. of nodes in the hidden layer
        '''
        if env is None:
            raise "agent should have an environment"
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]     #e.g. observation_space>>Box(6,), .shape>>(6,)

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n                #
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input = self.input_dim,
                              dim_output = self.output_dim,
                              dim_hidden = self.hidden_dim)
        self.PQ = self.Q.clone() # the network whose parameters are updated
        return

    def _decayed_epsilon(self,cur_episode: int, 
                              min_epsilon: float, 
                              max_epsilon: float, 
                              target_episode: int) -> float:
        '''Return an epsilon that decays linearly within the given range.

        #return
            epsilon<float> decaying from max_epsilon (when cur_episode=0) towards min_epsilon w.r.t. cur_episode
        '''
        slope = (min_epsilon - max_epsilon) / (target_episode)
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)        #slope*cur_episode is negative

    def _curPolicy(self, s, epsilon = None):
        '''Generate an action from the value function (network) whose parameters are being updated.

        #args
            s: state s0 <6x1 ndarray>
            epsilon: None means greedy, otherwise epsilon-greedy
        #return
            an action a0<int> w.r.t. PQ (policy evaluation) using decayed epsilon-greedy (policy improvement)
        '''
        Q_s = self.PQ(s)                    #
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))
        
    def performPolicy(self, s, epsilon = None):
        # could be omitted if there is only one policy
        return self._curPolicy(s, epsilon)


    def _update_Q_net(self):
        '''Copy the policy-updating Q network (together with its parameters) to the network that outputs the target Q values.
        '''
        self.Q = self.PQ.clone()

    
    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs):
        # randomly sample Transitions from the experience; returns a <list> of batch_size Transition objects (each holds s0, a0, reward, s1, is_done)
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])  #ndarray
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0

        # ndarray whose rows are [Q(s0)(a_0), Q(s0)(a_1), ...], i.e. the Q values of every action in state s0
        # y_batch = self.Q(states_0)    # the main difference lies in the a0 dimension
        y_batch = self.PQ(states_0)     # only Q(s, a, w) in the a0 dimension differs, and that entry is overwritten below anyway

        # matrix-wise (vectorized) calculation
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1)*\
            (~ is_done)  # when is_done, Q_target == reward_1

        # Attention: only the entries of the taken actions are replaced by Q_target
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x = X_batch, 
                           y = y_batch, 
                           learning_rate = learning_rate,
                           epochs = epochs)

        mean_loss = loss.sum().item() / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma = 0.99,
                       learning_rate=1e-5, 
                       max_episodes=1000, 
                       batch_size = 64,
                       min_epsilon = 0.2,
                       epsilon_factor = 0.1,
                       epochs = 1):
        '''Construct experience; once there are enough transitions in the experience, start learning from it and compute the loss.

        Method details are below.

        #Arguments
            gamma = 0.99,           # discount factor, in [0, 1]
            learning_rate=1e-5,     # learning rate
            max_episodes=1000,      # maximum number of training episodes
            batch_size = 64,        # size of each sampled mini-batch
            min_epsilon = 0.2,
            epsilon_factor = 0.1,   # ratio of the episode index at which min_epsilon takes effect
                                    # to the maximum number of episodes; the smaller this ratio,
                                    # the more episodes run with min_epsilon
            epochs = 1):            # number of training passes per batch
        '''
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:           # for each episode until max_episodes, accumulate the loss
            epsilon = self._decayed_epsilon(cur_episode = num_episode,
                                            min_epsilon = min_epsilon, 
                                            max_epsilon = 1,
                                            target_episode = target_episode)
            self.state = self.env.reset()
            self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00    #
            is_done = False
            while not is_done:  # for every transition
                s0 = self.state                                     # self.state is updated inside self.act(a0)
                a0 = self.performPolicy(s0, epsilon)                # get action w.r.t. PQ using decayed epsilon-greedy
                s1, r1, is_done, info, total_reward = self.act(a0)  # inside self.act(a0): self.state = s1
                # act also stores the transition in the experience (episode_list / trans_list) and accumulates total_reward
                self.env.render()
                step_in_episode += 1
                
                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, 
                                                    batch_size, 
                                                    learning_rate,
                                                    epochs)

            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".
                format(self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1

        return   
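
All three snippets above call self.sample(), self.total_trans and self.act() from an Agent base class that is not shown on this page. A minimal sketch of the replay-memory part of that interface, written as an assumption rather than the original implementation, could look like this:

import random
from collections import namedtuple

# fields match what _learn_from_memory reads from each sampled transition
Transition = namedtuple("Transition", ["s0", "a0", "reward", "is_done", "s1"])

class ReplayMemory:
    '''Hypothetical experience store exposing sample() and total_trans.'''
    def __init__(self, capacity=20000):
        self.capacity = capacity
        self.transitions = []

    def push(self, s0, a0, reward, is_done, s1):
        # drop the oldest transition once the capacity is reached
        if len(self.transitions) >= self.capacity:
            self.transitions.pop(0)
        self.transitions.append(Transition(s0, a0, reward, is_done, s1))

    @property
    def total_trans(self):
        return len(self.transitions)

    def sample(self, batch_size=64):
        # uniform random sampling breaks the temporal correlation within an episode
        return random.sample(self.transitions, k=batch_size)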
Example #4
class ApproxQAgent(Agent):
    '''Q-learning agent implemented with an approximate value function.
    '''

    def __init__(self, env: Env = None,
                 trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise Exception("agent should have an environment")
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1

        # adapt to different observation and action space types
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]
        # print("{},{}".format(self.input_dim, self.output_dim))

        # number of neurons in the hidden layer
        self.hidden_dim = hidden_dim
        # The key is the two statements below, which declare two approximate value functions.
        # Q is the approximator (network) that computes the target values and produces the loss;
        # its parameters are kept fixed for a period of time.
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        # PQ is the approximator that generates the policy; its parameters are updated frequently
        self.PQ = self.Q.clone()
        return

    def _learning_from_memory(self, gamma, batch_size, learning_rate, epochs):
        # randomly sample Transitions from memory
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        # this invokes the Approximator's __call__ method
        y_batch = self.Q(states_0)

        # The code operates on the whole batch as matrix arithmetic.
        # np.max(..., axis=1) takes the maximum over each row, i.e. over the actions of each next state.
        # is_done is a NumPy boolean array, so ~is_done is its element-wise logical NOT:
        # the bootstrap term is zeroed out for terminal transitions.
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * (~ is_done)
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target
        # loss is a torch Variable with size of 1
        loss = self.PQ.fit(x=X_batch,
                           y=y_batch,
                           learning_rate=learning_rate,
                           epochs=epochs)
        mean_loss = loss.sum().item() / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma=0.99,
                 learning_rate=1e-5,
                 max_episodes=1000,
                 batch_size=64,
                 min_epsilon=0.2,
                 epsilon_factor=0.1,
                 epochs=1):
        '''The main job of learning() is to build up experience; once enough experience has been collected, learning from that experience is started as well.
        '''
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env.reset()
            self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state
                a0 = self.performPolicy(s0, epsilon)
                # act() wraps the process of recording the Transition into the Experience
                s1, r1, is_done, info, total_reward = self.act(a0)
                # self.env.render()
                step_in_episode += 1
                # once there are enough transitions in the experience, start learning from it
                if self.total_trans > batch_size:
                    loss += self._learning_from_memory(gamma,
                                                       batch_size,
                                                       learning_rate,
                                                       epochs)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".
                  format(self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
        return

    def _decayed_epsilon(self, cur_episode: int,
                         min_epsilon: float,
                         max_epsilon: float,
                         target_episode: int) -> float:
        '''Return an epsilon that decays linearly from max_epsilon to min_epsilon.
        '''
        slope = (min_epsilon - max_epsilon) / (target_episode)
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):
        '''Generate an action from the value function (network) whose parameters are being updated.
        '''
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):
        '''Copy the policy-updating Q network (together with its parameters) to the network that outputs the target Q values.
        '''
        self.Q = self.PQ.clone()
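
None of the examples include the Approximator they depend on. A minimal sketch of what such a class could look like, assuming PyTorch and the calling pattern used above (a __call__ that takes and returns NumPy arrays, a fit() that returns a scalar loss tensor, and a clone() that deep-copies the network). This is an assumption for illustration, not the original implementation:

import copy
import numpy as np
import torch
import torch.nn as nn

class Approximator:
    '''Hypothetical stand-in for the Approximator used by the agents above.'''
    def __init__(self, dim_input, dim_output, dim_hidden=16):
        self.net = nn.Sequential(
            nn.Linear(dim_input, dim_hidden),
            nn.ReLU(),
            nn.Linear(dim_hidden, dim_output),
        )

    def __call__(self, x):
        # accept a single state or a batch of states as NumPy, return NumPy Q values
        x_t = torch.as_tensor(np.atleast_2d(np.asarray(x, dtype=np.float32)))
        with torch.no_grad():
            return self.net(x_t).numpy()

    def fit(self, x, y, learning_rate=1e-5, epochs=1):
        optimizer = torch.optim.SGD(self.net.parameters(), lr=learning_rate)
        x_t = torch.as_tensor(np.asarray(x, dtype=np.float32))
        y_t = torch.as_tensor(np.asarray(y, dtype=np.float32))
        loss = torch.zeros(1)
        for _ in range(epochs):
            optimizer.zero_grad()
            loss = nn.functional.mse_loss(self.net(x_t), y_t)
            loss.backward()
            optimizer.step()
        return loss          # scalar tensor, so loss.sum().item() as used above works

    def clone(self):
        # deep copy of the network together with its current parameters
        return copy.deepcopy(self)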