class A2Cagent(object):
    def __init__(self, env):

        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        # create actor and critic networks
        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.CRITIC_LEARNING_RATE)

        # save the results
        self.save_epi_reward = []

    ## computing Advantages and targets: y_k = r_k + gamma*V(s_k+1), A(s_k, a_k)= y_k - V(s_k)
    def advantage_td_target(self, reward, v_value, next_v_value, done):
        if done:
            y_k = reward
            advantage = y_k - v_value
        else:
            y_k = reward + self.GAMMA * next_v_value
            advantage = y_k - v_value
        return advantage, y_k
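    # Worked example (illustrative numbers, not from the original code): with
    # GAMMA = 0.95, reward = 1.0, V(s_k) = 0.5, V(s_k+1) = 0.6 and done = False,
    # y_k = 1.0 + 0.95 * 0.6 = 1.57 and A(s_k, a_k) = 1.57 - 0.5 = 1.07.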

    ## convert (list of np.array) to np.array
    def unpack_batch(self, batch):
        unpack = batch[0]
        for idx in range(len(batch) - 1):
            unpack = np.append(unpack, batch[idx + 1], axis=0)

        return unpack
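    # Note: with batch entries of shape (1, d), the np.append loop above is
    # equivalent to np.concatenate(batch, axis=0); the explicit loop is kept
    # as written in this example.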

    ## train the agent
    def train(self, max_episode_num):

        for ep in range(int(max_episode_num)):

            # initialize batch
            batch_state, batch_action, batch_td_target, batch_advantage = [], [], [], []
            # reset episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset()  # shape of state from gym (3,)

            while not done:

                # visualize the environment
                #self.env.render()
                # pick an action (shape of gym action = (action_dim,) )
                action = self.actor.get_action(state)
                # clip continuous action to be within action_bound
                action = np.clip(action, -self.action_bound, self.action_bound)
                # observe reward, new_state, shape of output of gym (state_dim,)
                next_state, reward, done, _ = self.env.step(action)
                # change shape (state_dim,) -> (1, state_dim), same to action, next_state
                state = np.reshape(state, [1, self.state_dim])
                next_state = np.reshape(next_state, [1, self.state_dim])
                action = np.reshape(action, [1, self.action_dim])
                reward = np.reshape(reward, [1, 1])
                # compute next v_value
                v_value = self.critic.model.predict(state)
                next_v_value = self.critic.model.predict(next_state)
                # compute advantage and TD target
                train_reward = (reward + 8) / 8  # <-- normalization: scale the Pendulum reward (roughly [-16.3, 0]) to about [-1, 1]
                advantage, y_i = self.advantage_td_target(
                    train_reward, v_value, next_v_value, done)

                # append to the batch
                batch_state.append(state)
                batch_action.append(action)
                batch_td_target.append(y_i)
                batch_advantage.append(advantage)

                # continue until batch becomes full
                if len(batch_state) < self.BATCH_SIZE:
                    # update current state
                    state = next_state[0]
                    episode_reward += reward[0]
                    time += 1
                    continue

                # if batch is full, start to train networks on batch
                # extract batched states, actions, td_targets, advantages
                states = self.unpack_batch(batch_state)
                actions = self.unpack_batch(batch_action)
                td_targets = self.unpack_batch(batch_td_target)
                advantages = self.unpack_batch(batch_advantage)
                # clear the batch
                batch_state, batch_action, batch_td_target, batch_advantage = [], [], [], []
                # train critic
                self.critic.train_on_batch(states, td_targets)
                # train actor
                self.actor.train(states, actions, advantages)

                # update current state
                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            ## display rewards every episode
            print('Episode: ', ep + 1, 'Time: ', time, 'Reward: ',
                  episode_reward)

            self.save_epi_reward.append(episode_reward)

            ## save weights every 10 episodes
            if ep % 10 == 0:
                self.actor.save_weights("./save_weights/pendulum_actor.h5")
                self.critic.save_weights("./save_weights/pendulum_critic.h5")

        np.savetxt('./save_weights/pendulum_epi_reward.txt',
                   self.save_epi_reward)
        print(self.save_epi_reward)

    ## plot the episode rewards
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()
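

# A minimal driver sketch (an addition, not part of the original example): it
# assumes an older Gym release that still provides Pendulum-v0 and the
# 4-tuple env.step() API used above, plus the Actor and Critic classes that
# A2Cagent references.
import gym


def main():
    max_episode_num = 1000            # illustrative episode budget
    env = gym.make("Pendulum-v0")     # continuous-action pendulum task
    agent = A2Cagent(env)             # build the actor and critic networks
    agent.train(max_episode_num)      # run the A2C training loop
    agent.plot_result()               # plot the per-episode rewards


if __name__ == "__main__":
    main()
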
class A2Cagent(object):
    def __init__(self, env):

        self.sess = tf.Session()
        K.set_session(self.sess)

        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        # create actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.CRITIC_LEARNING_RATE)

        # initialize for later gradient calculation
        self.sess.run(
            tf.global_variables_initializer())  # <-- optional here; the code also runs without it

        # save the results
        self.save_epi_reward = []

    ## computing Advantages and targets: y_k = r_k + gamma*V(s_k+1), A(s_k, a_k)= y_k - V(s_k)
    def advantage_td_target(self, reward, v_value, next_v_value, done):
        if done:
            y_k = reward
            advantage = y_k - v_value
        else:
            y_k = reward + self.GAMMA * next_v_value
            advantage = y_k - v_value
        return advantage, y_k

    ## train the agent
    def train(self, max_episode_num):

        for ep in range(int(max_episode_num)):

            # initialize batch
            states, actions, td_targets, advantages = [], [], [], []
            # reset episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset()  # shape of state from gym (3,)

            while not done:

                # visualize the environment
                #self.env.render()
                # pick an action (shape of gym action = (action_dim,) )
                action = self.actor.get_action(state)
                # clip continuous action to be within action_bound
                action = np.clip(action, -self.action_bound, self.action_bound)
                # observe reward, new_state, shape of output of gym (state_dim,)
                next_state, reward, done, _ = self.env.step(action)
                # compute next v_value
                v_value = self.critic.predict(state)
                next_v_value = self.critic.predict(next_state)
                # compute advantage and TD target
                train_reward = (reward + 8) / 8  # <-- normalization: scale the Pendulum reward (roughly [-16.3, 0]) to about [-1, 1]
                advantage, y_i = self.advantage_td_target(
                    train_reward, v_value, next_v_value, done)

                # append to the batch
                states.append(state)
                actions.append(action)
                td_targets.append(y_i)
                advantages.append(advantage)

                # if batch is full, start to train networks on batch
                if len(states) == self.BATCH_SIZE:
                    # train critic
                    self.critic.train_on_batch(states, td_targets)
                    # train actor
                    self.actor.train(states, actions, advantages)

                    # clear the batch
                    states, actions, td_targets, advantages = [], [], [], []

                # update current state
                state = next_state
                episode_reward += reward
                time += 1

            ## display rewards every episode
            print('Episode: ', ep + 1, 'Time: ', time, 'Reward: ',
                  episode_reward)

            self.save_epi_reward.append(episode_reward)

            ## save weights every 10 episodes
            if ep % 10 == 0:
                self.actor.save_weights("./save_weights/pendulum_actor.h5")
                self.critic.save_weights("./save_weights/pendulum_critic.h5")

        np.savetxt('./save_weights/pendulum_epi_reward.txt',
                   self.save_epi_reward)
        print(self.save_epi_reward)

    ## plot the episode rewards
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()
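

# Compatibility note (an assumption, not part of the original example): the
# session setup above (tf.Session, K.set_session, tf.global_variables_initializer)
# targets TensorFlow 1.x. A rough equivalent under TensorFlow 2.x goes through
# the v1 compatibility layer, for example:
import tensorflow as tf

tf.compat.v1.disable_eager_execution()                  # graph/session mode needs eager execution off
sess = tf.compat.v1.Session()                           # stands in for tf.Session()
tf.compat.v1.keras.backend.set_session(sess)            # stands in for K.set_session(sess)
sess.run(tf.compat.v1.global_variables_initializer())   # stands in for tf.global_variables_initializer()
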
Example 3
class A2Cagent(object):
    def __init__(self, env):
        # set up a TensorFlow session
        self.sess = tf.Session()
        K.set_session(self.sess)

        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        # environment
        self.env = env
        # state dimension
        self.state_dim = env.observation_space.shape[0]
        # action dimension
        self.action_dim = env.action_space.shape[0]
        # action bound (maximum action magnitude)
        self.action_bound = env.action_space.high[0]

        # create actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.CRITIC_LEARNING_RATE)

        # initialization needed for later gradient calculation
        self.sess.run(tf.global_variables_initializer())

        # store the total reward obtained in each episode
        self.save_epi_reward = []

    ## compute the advantage and the TD target
    def advantage_td_target(self, reward, v_value, next_v_value, done):
        if done:  ## when the episode is done
            y_k = reward
            advantage = y_k - v_value
        else:
            y_k = reward + self.GAMMA * next_v_value
            advantage = y_k - v_value
        return advantage, y_k

    ## extract the data stored in the batch (TODO: check how the batch is populated)
    def unpack_batch(self, batch):
        unpack = batch[0]
        for idx in range(len(batch) - 1):
            unpack = np.append(unpack, batch[idx + 1], axis=0)
        return unpack

    ## train the agent
    def train(self, max_episode_num):

        # repeat the following for each episode
        for ep in range(int(max_episode_num)):
            # initialize the batch
            batch_state, batch_action, batch_td_target, batch_advantage = [], [], [], []
            # reset episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset()

            while not done:
                # visualize the environment
                self.env.render()
                # pick an action
                action = self.actor.get_action(state)
                # clip the action to the action bound
                action = np.clip(action, -self.action_bound, self.action_bound)
                # observe the next state and reward
                next_state, reward, done, _ = self.env.step(action)

                # reshape (state_dim,) -> (1, state_dim); same for action and next_state
                state = np.reshape(state, [1, self.state_dim])
                next_state = np.reshape(next_state, [1, self.state_dim])
                action = np.reshape(action, [1, self.action_dim])
                reward = np.reshape(reward, [1, 1])

                # compute the state values
                v_value = self.critic.model.predict(state)
                next_v_value = self.critic.model.predict(next_state)

                # compute the advantage and the TD target
                train_reward = (reward + 8) / 8  # scale the Pendulum reward (roughly [-16.3, 0]) to about [-1, 1]
                advantage, y_i = self.advantage_td_target(
                    train_reward, v_value, next_v_value, done)

                # append to the batch
                batch_state.append(state)
                batch_action.append(action)
                batch_td_target.append(y_i)
                batch_advantage.append(advantage)

                # keep collecting (without training) until the batch is full
                if len(batch_state) < self.BATCH_SIZE:
                    # update current state
                    state = next_state[0]
                    episode_reward += reward[0]
                    time += 1
                    continue

                # once the batch is full, train the networks
                # extract the data from the batch
                states = self.unpack_batch(batch_state)
                actions = self.unpack_batch(batch_action)
                td_targets = self.unpack_batch(batch_td_target)
                advantages = self.unpack_batch(batch_advantage)

                # batch flushing
                batch_state, batch_action, batch_td_target, batch_advantage = [], [], [], []

                # update (train) the critic network
                self.critic.train_on_batch(states, td_targets)
                # update (train) the actor network
                self.actor.train(states, actions, advantages)

                # update current state
                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            # print the reward for this episode
            print('Episode:', ep + 1, 'Time:', time, 'Reward:', episode_reward)
            self.save_epi_reward.append(episode_reward)

            # save the network weights to file every 10 episodes
            if ep % 10 == 0:
                self.actor.save_weights('./save_weights/pendulum_actor.h5')
                self.critic.save_weights('./save_weights/pendulum_critic.h5')
        # after training, save the accumulated episode rewards
        np.savetxt('./save_weights/pendulum_epi_reward.txt',
                   self.save_epi_reward)

    # plot the episode rewards
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()
Example 4
class A2Cagent(object):         # object: kept for compatibility with Python 2's old-style classes; with Python 3 only, (object) can be omitted
    def __init__(self, env):    # class initializer

        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        # environment
        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound (maximum action magnitude)
        self.action_bound = env.action_space.high[0]

        ## A2C algorithm step 1: initialize the critic network parameters phi and the actor network parameters theta
        # create actor and critic networks
        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim, self.CRITIC_LEARNING_RATE)

        # save the results (stores the total reward obtained per episode)
        self.save_epi_reward = []

    ## computing Advantages and targets: y(k) = r(k) + gamma * V(s_k + 1), A(s_k, a_k) = y_k - V(s_k)
    def advantage_td_target(self, reward, v_value, next_v_value, done):
        if done:
            y_k = reward
            advantage = y_k - v_value
        else:
            y_k = reward + self.GAMMA * next_v_value
            advantage = y_k - v_value
        return advantage, y_k


    ## train the agent
    def train(self, max_episode_num):
        ## A2C algorithm step 2: repeat the following for each episode
        for ep in range(int(max_episode_num)):
            # initialize batch
            # initialize the batches for states, actions, TD targets, and advantages
            states, actions, td_targets, advantages = [], [], [], []
            # reset episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state x0
            state = self.env.reset()    # shape of state from gym (3,)

            ## A2C algorithm step 2: inner repeat loop
            while not done:
                # visualize the environment
                # self.env.render()

                ## A2C algorithm 2.1.1: select an action stochastically from the policy u_i ~ pi_theta(u_i | x_i)
                # pick an action (shape of gym action = (action_dim,))
                action = self.actor.get_action(state)
                # clip continuous action to be within action_bound [-2, 2]
                action = np.clip(action, -self.action_bound, self.action_bound)

                ## A2C algorithm 2.1.2: execute u_i and observe the reward r(x_i, u_i) and the next state x_{i+1}
                # observe reward, new_state, shape of output of gym (state_dim,)
                next_state, reward, done, _ = self.env.step(action)

                # compute next v_value
                v_value = self.critic.predict(state)
                next_v_value = self.critic.predict(next_state)

                # compute advantage and TD target
                train_reward = (reward + 8) / 8  # <-- normalization: scale the Pendulum reward (roughly [-16.3, 0]) to about [-1, 1]
                advantage, y_i = self.advantage_td_target(train_reward, v_value, next_v_value, done)

                # append to the batch
                states.append(state)
                actions.append(action)
                td_targets.append(y_i)
                advantages.append(advantage)

                # if batch is full, start to train networks on batch
                if len(states) == self.BATCH_SIZE:
                    ## A2C 2.5: update (train) the critic network (regression toward the TD targets)
                    # train critic
                    self.critic.train_on_batch(states, td_targets)
                    ## A2C 2.6: update (train) the actor network
                    # train actor
                    self.actor.train(states, actions, advantages)

                    # clear the batch
                    states, actions, td_targets, advantages = [], [], [], []

                # update current state
                state = next_state
                episode_reward += reward
                time += 1

            ## display rewards every episode
            print('Episode: ', ep + 1, 'Time: ', time, 'Reward: ', episode_reward)

            self.save_epi_reward.append(episode_reward)

            ## save weights every 10 episodes
            if ep % 10 == 0:
                self.actor.save_weights("./save_weights/pendulum_actor.h5")
                self.critic.save_weights("./save_weights/pendulum_critic.h5")

        np.savetxt('./save_weights/pendulum_epi_reward.txt', self.save_epi_reward)
        print(self.save_epi_reward)

    ## plot the episode rewards
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()