class A3CWorker(threading.Thread):
    """ Create a worker thread """

    def __init__(self, worker_name, env_count, sess, global_actor, global_critic, max_episode_num):
        threading.Thread.__init__(self)

        # hyperparameters
        self.GAMMA = 0.95
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.ENTROPY_BETA = 0.01
        self.t_MAX = 4  # n-step TD
        self.max_episode_num = max_episode_num

        # self.env = gym.make(env_name)
        self.env = ns3env.Ns3Env(port=(port + env_count), stepTime=stepTime, startSim=startSim,
                                 simSeed=seed, simArgs=simArgs, debug=debug)
        self.worker_name = worker_name
        self.sess = sess

        self.global_actor = global_actor
        self.global_critic = global_critic

        # state variable dimension
        # self.state_dim = self.env.observation_space.shape[0]
        self.state_dim = state_dimension
        # action dimension
        # self.action_dim = self.env.action_space.shape[0]
        self.action_dim = action_dimension
        # action maximum boundary
        # self.action_bound = int(self.env.action_space.high[0])
        self.action_bound = action_max_bound

        # create worker actor and critic NN
        self.worker_actor = Worker_Actor(self.sess, self.state_dim, self.action_dim,
                                         self.action_bound, self.ACTOR_LEARNING_RATE,
                                         self.ENTROPY_BETA, self.global_actor)
        self.worker_critic = Worker_Critic(self.sess, self.state_dim, self.action_dim,
                                           self.CRITIC_LEARNING_RATE, self.global_critic)

        # copy parameters from the global NN to the worker NN
        self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
        self.worker_critic.model.set_weights(self.global_critic.model.get_weights())

    ## calculate the n-step TD target
    def n_step_td_target(self, rewards, next_v_value, done):
        td_targets = np.zeros_like(rewards)
        cumulative = 0
        if not done:
            cumulative = next_v_value
        for k in reversed(range(0, len(rewards))):
            cumulative = self.GAMMA * cumulative + rewards[k]
            td_targets[k] = cumulative
        return td_targets

    ## extract data from the batch
    def unpack_batch(self, batch):
        unpack = batch[0]
        for idx in range(len(batch) - 1):
            unpack = np.append(unpack, batch[idx + 1], axis=0)
        return unpack

    ## worker train function; run() is the thread entry point
    def run(self):
        # declare the globals shared by all workers
        global global_episode_count, global_step
        global global_episode_reward

        # print worker execution
        print(self.worker_name, "starts ----")

        trace = Traces(1)
        default_cwnd = 536
        rtt_alpha = 0.6
        cwnd_alpha = 0.6
        rtt_ewma = EWMATrace(rtt_alpha)
        cwnd_ewma = EWMATrace(cwnd_alpha)
        loss_window_size = 100
        losstrace = LossTrace(1, loss_window_size)
        seg_acked = 0

        # repeat episodes
        while global_episode_count <= int(self.max_episode_num):
            # initialize batch
            batch_state, batch_action, batch_reward = [], [], []
            # initialize episode
            step, episode_reward, done = 0, 0, False
            # reset environment and observe initial state
            state = self.env.reset()
            cwnd = state[5]
            U_old = 0
            losstrace.clear()

            # repeat until the episode ends
            while not done:
                # rendering
                # self.env.render()

                # extract action
                losstrace.step()  # may need socketUuid
                state_tr = extract_state_without_ewma(state, cwnd_ewma, rtt_ewma, losstrace)
                action = self.worker_actor.get_action(state_tr, self.sess)
                print(":", action, ":")

                # action boundary clipping
                # action = int(np.clip(action, -self.action_bound, self.action_bound))
                new_cwnd = cwnd + action_mapping[action]
                if new_cwnd < default_cwnd:
                    new_cwnd = default_cwnd
                if trace.check_validate(state):
                    trace.add_action(action)
                new_ssThresh = int(cwnd / 2)
                actions = [new_ssThresh, new_cwnd]

                U_reward, U_old = get_reward(state, losstrace, U_old)

                # observe next state and reward
                next_state, reward, done, _ = self.env.step(actions)
                next_state_tr = extract_state(next_state, cwnd_ewma, rtt_ewma, losstrace)
                # reward_new = get_reward(state)
                cwnd = next_state[5]
                if next_state[11] == 0 and next_state[0] == 1:
                    losstrace.loss(1, step)
                if trace.check_validate(next_state):
                    trace.add_cwnd(cwnd)
                    trace.add_rtt(next_state[9])
                    trace.add_reward(U_reward)
                seg_acked += next_state[7]

                # shape translation
                state = np.reshape(state_tr, [1, self.state_dim])
                reward = np.reshape(U_reward, [1, 1])
                action = np.reshape(action, [1, self.action_dim])

                # save batch
                batch_state.append(state)
                batch_action.append(action)
                batch_reward.append(reward)

                # update state
                state = next_state
                step += 1
                episode_reward += reward[0]

                # if the batch is filled, start worker training
                with self.sess.as_default():
                    with self.sess.graph.as_default():
                        if len(batch_state) == self.t_MAX or done:
                            # extract data from batch
                            states = self.unpack_batch(batch_state)
                            actions = self.unpack_batch(batch_action)
                            rewards = self.unpack_batch(batch_reward)
                            # clear batch
                            batch_state, batch_action, batch_reward = [], [], []
                            # calculate n-step TD targets and advantages
                            next_state = np.reshape(next_state_tr, [1, self.state_dim])
                            next_v_value = self.worker_critic.model.predict(next_state)
                            n_step_td_targets = self.n_step_td_target(rewards, next_v_value, done)
                            v_values = self.worker_critic.model.predict(states)
                            advantages = n_step_td_targets - v_values
                            # update the global critic and actor NN
                            self.worker_critic.train(states, n_step_td_targets)
                            self.worker_actor.train(states, actions, advantages)
                            # copy global parameters to the worker NN
                            self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
                            self.worker_critic.model.set_weights(self.global_critic.model.get_weights())
                            # update global step
                            global_step += 1

                # if the episode is done
                if done:
                    # update global episode count
                    global_episode_count += 1
                    # print episode rewards
                    print('Worker name: ', self.worker_name, ", Episode: ", global_episode_count,
                          ', Step: ', step, ', Reward: ', episode_reward)
                    global_episode_reward.append(episode_reward)
                    # save the weights every 10th episode
                    if global_episode_count % 10 == 0:
                        self.global_actor.save_weights("./save_weights/_actor.h5")
                        self.global_critic.save_weights("./save_weights/_critic.h5")
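# ---------------------------------------------------------------------------
# Hypothetical launch sketch (not part of the original listing): one way the
# A3CWorker threads above could be started against a shared TF1-style Keras
# session.  Assumptions: the module-level names read inside A3CWorker (port,
# stepTime, startSim, seed, simArgs, debug, state_dimension, action_dimension,
# action_max_bound, action_mapping) and the shared counters
# (global_episode_count, global_step, global_episode_reward) are defined
# elsewhere in this module; global_actor / global_critic are pre-built global
# networks compatible with Worker_Actor / Worker_Critic.  launch_workers and
# its parameters are illustrative names only.
# ---------------------------------------------------------------------------
import tensorflow as tf        # assumed TF 1.x, matching the Session-based code above
from keras import backend as K  # assumed Keras 2.x on the TF1 backend

def launch_workers(global_actor, global_critic, num_workers=4, max_episode_num=500):
    sess = tf.Session()
    K.set_session(sess)  # let the Keras models above share this session

    # one worker per parallel ns-3 environment (ports port+0 ... port+num_workers-1)
    workers = [A3CWorker('worker%d' % i, i, sess, global_actor, global_critic, max_episode_num)
               for i in range(num_workers)]

    # initialize all TF1 variables once every worker network has been built
    sess.run(tf.global_variables_initializer())

    # start the worker threads and wait for all of them to finish
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()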
class A3Cworker(threading.Thread):
    ''' Create a worker thread '''

    def __init__(self, worker_name, env_name, sess, global_actor, global_critic, max_episode_num):
        threading.Thread.__init__(self)

        # hyperparameters
        self.GAMMA = 0.95
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.ENTROPY_BETA = 0.01
        self.t_MAX = 4  # n-step TD
        self.max_episode_num = max_episode_num

        # create the worker's environment
        self.env = gym.make(env_name)
        self.worker_name = worker_name
        self.sess = sess

        # share the global networks
        self.global_actor = global_actor
        self.global_critic = global_critic

        # state variable dimension
        self.state_dim = self.env.observation_space.shape[0]
        # action dimension
        self.action_dim = self.env.action_space.shape[0]
        # action maximum boundary
        self.action_bound = self.env.action_space.high[0]

        # create the worker actor and critic networks
        self.worker_actor = Worker_Actor(self.sess, self.state_dim, self.action_dim,
                                         self.action_bound, self.ACTOR_LEARNING_RATE,
                                         self.ENTROPY_BETA, self.global_actor)
        self.worker_critic = Worker_Critic(self.sess, self.state_dim, self.action_dim,
                                           self.CRITIC_LEARNING_RATE, self.global_critic)

        # copy the global network parameters to the worker networks
        self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
        self.worker_critic.model.set_weights(self.global_critic.model.get_weights())

    ## calculate the n-step TD targets
    def n_step_td_target(self, rewards, next_v_value, done):
        td_targets = np.zeros_like(rewards)
        cumulative = 0
        if not done:
            cumulative = next_v_value
        for k in reversed(range(0, len(rewards))):  # over the trajectory, t = 0, ..., len(rewards) (= T)
            cumulative = self.GAMMA * cumulative + rewards[k]
            td_targets[k] = cumulative
        return td_targets  # vector of TD targets, one per time step

    ## extract the data stored in the batch
    def unpack_batch(self, batch):
        unpack = batch[0]
        for idx in range(len(batch) - 1):
            unpack = np.append(unpack, batch[idx + 1], axis=0)
        return unpack

    def run(self):  # threading.Thread dispatches to run(), so the method must keep this name
        # declare the globals shared by all workers
        global global_episode_count, global_step, global_episode_reward

        # print worker execution
        print(self.worker_name, 'starts ---')

        # repeat episodes
        while global_episode_count <= int(self.max_episode_num):
            # initialize batch
            batch_state, batch_action, batch_reward = [], [], []
            # initialize episode
            step, episode_reward, done = 0, 0, False
            # reset environment and observe initial state
            state = self.env.reset()

            # repeat until the episode ends
            while not done:
                # rendering; commented out because multithreading would open multiple windows
                # self.env.render()

                # extract action
                action = self.worker_actor.get_action(state)
                # action boundary clipping
                action = np.clip(action, -self.action_bound, self.action_bound)
                # observe next state and reward
                next_state, reward, done, _ = self.env.step(action)

                # shape translation
                state = np.reshape(state, [1, self.state_dim])
                reward = np.reshape(reward, [1, 1])
                action = np.reshape(action, [1, self.action_dim])

                # save batch
                batch_state.append(state)
                batch_action.append(action)
                batch_reward.append((reward + 8) / 8)  # rescale rewards from [-16, 0] to [-1, 1]

                # update state
                state = next_state
                step += 1
                episode_reward += reward[0]

                # if the batch is filled, start worker training
                if len(batch_state) == self.t_MAX or done:
                    # extract data from batch
                    states = self.unpack_batch(batch_state)
                    actions = self.unpack_batch(batch_action)
                    rewards = self.unpack_batch(batch_reward)
                    # clear batch
                    batch_state, batch_action, batch_reward = [], [], []
                    # calculate n-step TD targets and advantages
                    next_state = np.reshape(next_state, [1, self.state_dim])
                    next_v_value = self.worker_critic.model.predict(next_state)  # scalar value estimate
                    n_step_td_targets = self.n_step_td_target(rewards, next_v_value, done)
                    v_values = self.worker_critic.model.predict(states)  # one value per time step
                    advantages = n_step_td_targets - v_values
                    # update the global critic and actor networks
                    self.worker_critic.train(states, n_step_td_targets)
                    self.worker_actor.train(states, actions, advantages)
                    # copy the global network parameters to the worker networks
                    self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
                    self.worker_critic.model.set_weights(self.global_critic.model.get_weights())
                    # update global step
                    global_step += 1

                # if the episode is done
                if done:
                    # update global episode count
                    global_episode_count += 1
                    # print the episode reward
                    print('Worker name:', self.worker_name, ', Episode:', global_episode_count,
                          ', Step:', step, ', Reward:', episode_reward)
                    global_episode_reward.append(episode_reward)
                    # save the network parameters to file every 10th episode
                    if global_episode_count % 10 == 0:
                        self.global_actor.save_weights('./save_weights/pendulum_actor.h5')
                        self.global_critic.save_weights('./save_weights/pendulum_critic.h5')
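# ---------------------------------------------------------------------------
# Standalone check of the backward recursion used by n_step_td_target (the
# numbers are hypothetical, not from the source).  For a 4-step batch that did
# not terminate, the target for step k is
#   r_k + gamma*r_{k+1} + ... + gamma^(T-1-k)*r_{T-1} + gamma^(T-k)*V(s_T),
# i.e. the discounted n-step return bootstrapped with the critic's value of
# the state that follows the batch.
# ---------------------------------------------------------------------------
import numpy as np

GAMMA = 0.95
rewards = np.array([[1.0], [0.5], [0.0], [2.0]])  # shape (4, 1), like unpack_batch output
next_v_value = 3.0                                # critic's bootstrap value V(s_T)
done = False

td_targets = np.zeros_like(rewards)
cumulative = 0 if done else next_v_value
for k in reversed(range(len(rewards))):
    cumulative = GAMMA * cumulative + rewards[k]
    td_targets[k] = cumulative

# the first target equals the full 4-step discounted return plus the bootstrap term
direct = 1.0 + GAMMA * 0.5 + GAMMA ** 2 * 0.0 + GAMMA ** 3 * 2.0 + GAMMA ** 4 * 3.0
assert np.isclose(td_targets[0, 0], direct)  # both are about 5.6333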
class A3Cworker(threading.Thread): """ local agent network (worker) """ def __init__(self, worker_name, env_name, sess, global_actor, global_critic, max_episode_num): threading.Thread.__init__(self) #self.lock = threading.Lock() # hyperparameters self.GAMMA = 0.95 self.ACTOR_LEARNING_RATE = 0.0001 self.CRITIC_LEARNING_RATE = 0.001 self.ENTROPY_BETA = 0.01 self.t_MAX = 4 # t-step prediction self.max_episode_num = max_episode_num # environment self.env = gym.make(env_name) self.worker_name = worker_name self.sess = sess # global network sharing self.global_actor = global_actor self.global_critic = global_critic # get state dimension self.state_dim = self.env.observation_space.shape[0] # get action dimension self.action_dim = self.env.action_space.shape[0] # get action bound self.action_bound = self.env.action_space.high[0] # create local actor and critic networks self.worker_actor = Worker_Actor(self.sess, self.state_dim, self.action_dim, self.action_bound, self.ACTOR_LEARNING_RATE, self.ENTROPY_BETA, self.global_actor) self.worker_critic = Worker_Critic(self.sess, self.state_dim, self.action_dim, self.CRITIC_LEARNING_RATE, self.global_critic) # initial transfer global network parameters to worker network parameters self.worker_actor.model.set_weights( self.global_actor.model.get_weights()) self.worker_critic.model.set_weights( self.global_critic.model.get_weights()) ## computing Advantages and targets: y_k = r_k + gamma*V(s_k+1), A(s_k, a_k)= y_k - V(s_k) def n_step_td_target(self, rewards, next_v_value, done): td_targets = np.zeros_like(rewards) cumulative = 0 if not done: cumulative = next_v_value for k in reversed(range(0, len(rewards))): cumulative = self.GAMMA * cumulative + rewards[k] td_targets[k] = cumulative return td_targets # train each worker def run(self): global global_episode_count, global_step global global_episode_reward # total episode across all workers print(self.worker_name, "starts ---") while global_episode_count <= int(self.max_episode_num): # initialize batch states, actions, rewards = [], [], [] # reset episode step, episode_reward, done = 0, 0, False # reset the environment and observe the first state state = self.env.reset() # shape of state from gym (3,) while not done: # visualize the environment #self.env.render() # pick an action (shape of gym action = (action_dim,) ) action = self.worker_actor.get_action(state) # clip continuous action to be within action_bound action = np.clip(action, -self.action_bound, self.action_bound) # observe reward, new_state, shape of output of gym (state_dim,) next_state, reward, done, _ = self.env.step(action) # append to the batch states.append(state) actions.append(action) rewards.append((reward + 8) / 8) # <-- normalization # if batch is full or episode ends, start to train worker on batch if len(states) == self.t_MAX or done: # compute n-step TD target and advantage prediction next_v_value = self.worker_critic.predict([next_state]) n_step_td_targets = self.n_step_td_target( rewards, next_v_value, done) v_values = self.worker_critic.predict(states) advantages = n_step_td_targets - v_values #with self.lock: # update global critic self.worker_critic.train(states, n_step_td_targets) # update global actor self.worker_actor.train(states, actions, advantages) # transfer global network parameters to worker network parameters self.worker_actor.model.set_weights( self.global_actor.model.get_weights()) self.worker_critic.model.set_weights( self.global_critic.model.get_weights()) # clear the batch states, actions, rewards = [], [], [] # 
update global step global_step += 1 # update state and step state = next_state step += 1 episode_reward += reward # update global episode count global_episode_count += 1 ## display rewards every episode print('Worker name:', self.worker_name, ', Episode: ', global_episode_count, ', Step: ', step, ', Reward: ', episode_reward) global_episode_reward.append(episode_reward) ## save weights every episode if global_episode_count % 10 == 0: self.global_actor.save_weights( "./save_weights/pendulum_actor.h5") self.global_critic.save_weights( "./save_weights/pendulum_critic.h5")
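# ---------------------------------------------------------------------------
# Quick standalone check of the reward scaling used by both Pendulum workers
# (illustrative values only): Pendulum-v0 rewards lie roughly in [-16.27, 0],
# so (reward + 8) / 8 maps them to roughly [-1, 1], which keeps the critic's
# TD targets in a small range.
# ---------------------------------------------------------------------------
for r in (-16.0, -8.0, 0.0):      # roughly the worst, middle and best raw rewards
    print(r, '->', (r + 8) / 8)   # -1.0, 0.0, 1.0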