class A3CWorker(threading.Thread):
    """ Create a worker thread """

    def __init__(self, worker_name, env_count, sess, global_actor, global_critic, max_episode_num):
        threading.Thread.__init__(self)

        # hyperparameters
        self.GAMMA = 0.95
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.ENTROPY_BETA = 0.01
        self.t_MAX = 4  # n-step TD
        self.max_episode_num = max_episode_num

        # self.env = gym.make(env_name)
        self.env = ns3env.Ns3Env(port=(port + env_count), stepTime=stepTime, startSim=startSim,
                                 simSeed=seed, simArgs=simArgs, debug=debug)
        self.worker_name = worker_name
        self.sess = sess

        self.global_actor = global_actor
        self.global_critic = global_critic

        # state variable dimension
        # self.state_dim = self.env.observation_space.shape[0]
        self.state_dim = state_dimension
        # action dimension
        # self.action_dim = self.env.action_space.shape[0]
        self.action_dim = action_dimension
        # action maximum boundary
        # self.action_bound = int(self.env.action_space.high[0])
        self.action_bound = action_max_bound

        # create worker actor and critic NN
        self.worker_actor = Worker_Actor(self.sess, self.state_dim, self.action_dim,
                                         self.action_bound, self.ACTOR_LEARNING_RATE,
                                         self.ENTROPY_BETA, self.global_actor)
        self.worker_critic = Worker_Critic(self.sess, self.state_dim, self.action_dim,
                                           self.CRITIC_LEARNING_RATE, self.global_critic)

        # copy parameters from the global NN to the worker NN
        self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
        self.worker_critic.model.set_weights(self.global_critic.model.get_weights())

    ## calculate the n-step TD target
    def n_step_td_target(self, rewards, next_v_value, done):
        td_targets = np.zeros_like(rewards)
        cumulative = 0
        if not done:
            cumulative = next_v_value
        for k in reversed(range(0, len(rewards))):
            cumulative = self.GAMMA * cumulative + rewards[k]
            td_targets[k] = cumulative
        return td_targets

    ## extract data from the batch
    def unpack_batch(self, batch):
        unpack = batch[0]
        for idx in range(len(batch) - 1):
            unpack = np.append(unpack, batch[idx + 1], axis=0)
        return unpack

    ## worker train function; run() is the thread entry point
    def run(self):
        # declare the globals shared by all workers
        global global_episode_count, global_step
        global global_episode_reward

        # print worker execution
        print(self.worker_name, "starts ----")

        trace = Traces(1)
        default_cwnd = 536
        rtt_alpha = 0.6
        cwnd_alpha = 0.6
        rtt_ewma = EWMATrace(rtt_alpha)
        cwnd_ewma = EWMATrace(cwnd_alpha)
        loss_window_size = 100
        losstrace = LossTrace(1, loss_window_size)
        seg_acked = 0

        # repeat episodes
        while global_episode_count <= int(self.max_episode_num):
            # initialize batch
            batch_state, batch_action, batch_reward = [], [], []
            # initialize episode
            step, episode_reward, done = 0, 0, False
            # reset environment and observe initial state
            state = self.env.reset()
            cwnd = state[5]
            U_old = 0
            losstrace.clear()

            # repeat until the episode ends
            while not done:
                # rendering
                # self.env.render()

                # extract action
                losstrace.step()  # may need socketUuid
                state_tr = extract_state_without_ewma(state, cwnd_ewma, rtt_ewma, losstrace)
                action = self.worker_actor.get_action(state_tr, self.sess)
                print(":", action, ":")

                # action boundary clipping
                # action = int(np.clip(action, -self.action_bound, self.action_bound))
                new_cwnd = cwnd + action_mapping[action]
                if new_cwnd < default_cwnd:
                    new_cwnd = default_cwnd
                if trace.check_validate(state):
                    trace.add_action(action)
                new_ssThresh = int(cwnd / 2)
                actions = [new_ssThresh, new_cwnd]

                U_reward, U_old = get_reward(state, losstrace, U_old)

                # observe next state and reward
                next_state, reward, done, _ = self.env.step(actions)
                next_state_tr = extract_state(next_state, cwnd_ewma, rtt_ewma, losstrace)
                # reward_new = get_reward(state)
                cwnd = next_state[5]
                if next_state[11] == 0 and next_state[0] == 1:
                    losstrace.loss(1, step)
                if trace.check_validate(next_state):
                    trace.add_cwnd(cwnd)
                    trace.add_rtt(next_state[9])
                    trace.add_reward(U_reward)
                seg_acked += next_state[7]

                # shape translation
                state = np.reshape(state_tr, [1, self.state_dim])
                reward = np.reshape(U_reward, [1, 1])
                action = np.reshape(action, [1, self.action_dim])

                # save batch
                batch_state.append(state)
                batch_action.append(action)
                batch_reward.append(reward)

                # update state
                state = next_state
                step += 1
                episode_reward += reward[0]

                # if the batch is filled, start worker training
                with self.sess.as_default():
                    with self.sess.graph.as_default():
                        if len(batch_state) == self.t_MAX or done:
                            # extract data from batch
                            states = self.unpack_batch(batch_state)
                            actions = self.unpack_batch(batch_action)
                            rewards = self.unpack_batch(batch_reward)
                            # clear batch
                            batch_state, batch_action, batch_reward = [], [], []
                            # calculate n-step TD targets and advantages
                            next_state = np.reshape(next_state_tr, [1, self.state_dim])
                            next_v_value = self.worker_critic.model.predict(next_state)
                            n_step_td_targets = self.n_step_td_target(rewards, next_v_value, done)
                            v_values = self.worker_critic.model.predict(states)
                            advantages = n_step_td_targets - v_values
                            # update the global critic and actor NN
                            self.worker_critic.train(states, n_step_td_targets)
                            self.worker_actor.train(states, actions, advantages)
                            # copy global parameters to the worker NN
                            self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
                            self.worker_critic.model.set_weights(self.global_critic.model.get_weights())
                            # update global step
                            global_step += 1

                # if the episode is done
                if done:
                    # update global episode count
                    global_episode_count += 1
                    # print episode rewards
                    print('Worker name: ', self.worker_name, ", Episode: ", global_episode_count,
                          ', Step: ', step, ', Reward: ', episode_reward)
                    global_episode_reward.append(episode_reward)
                    # save the weights every 10th episode
                    if global_episode_count % 10 == 0:
                        self.global_actor.save_weights("./save_weights/_actor.h5")
                        self.global_critic.save_weights("./save_weights/_critic.h5")
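# ---------------------------------------------------------------------------
# Hypothetical launch sketch (not part of the original listing): one way the
# A3CWorker threads above could be started against a shared TF1-style Keras
# session.  Assumptions: the module-level names read inside A3CWorker (port,
# stepTime, startSim, seed, simArgs, debug, state_dimension, action_dimension,
# action_max_bound, action_mapping) and the shared counters
# (global_episode_count, global_step, global_episode_reward) are defined
# elsewhere in this module; global_actor / global_critic are pre-built global
# networks compatible with Worker_Actor / Worker_Critic.  launch_workers and
# its parameters are illustrative names only.
# ---------------------------------------------------------------------------
import tensorflow as tf        # assumed TF 1.x, matching the Session-based code above
from keras import backend as K  # assumed Keras 2.x on the TF1 backend

def launch_workers(global_actor, global_critic, num_workers=4, max_episode_num=500):
    sess = tf.Session()
    K.set_session(sess)  # let the Keras models above share this session

    # one worker per parallel ns-3 environment (ports port+0 ... port+num_workers-1)
    workers = [A3CWorker('worker%d' % i, i, sess, global_actor, global_critic, max_episode_num)
               for i in range(num_workers)]

    # initialize all TF1 variables once every worker network has been built
    sess.run(tf.global_variables_initializer())

    # start the worker threads and wait for all of them to finish
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()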
class A3Cworker(threading.Thread):
    ''' Create a worker thread '''

    def __init__(self, worker_name, env_name, sess, global_actor, global_critic, max_episode_num):
        threading.Thread.__init__(self)

        # hyperparameters
        self.GAMMA = 0.95
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.ENTROPY_BETA = 0.01
        self.t_MAX = 4  # n-step TD
        self.max_episode_num = max_episode_num

        # create the worker's environment
        self.env = gym.make(env_name)
        self.worker_name = worker_name
        self.sess = sess

        # share the global networks
        self.global_actor = global_actor
        self.global_critic = global_critic

        # state variable dimension
        self.state_dim = self.env.observation_space.shape[0]
        # action dimension
        self.action_dim = self.env.action_space.shape[0]
        # action maximum boundary
        self.action_bound = self.env.action_space.high[0]

        # create the worker actor and critic networks
        self.worker_actor = Worker_Actor(self.sess, self.state_dim, self.action_dim,
                                         self.action_bound, self.ACTOR_LEARNING_RATE,
                                         self.ENTROPY_BETA, self.global_actor)
        self.worker_critic = Worker_Critic(self.sess, self.state_dim, self.action_dim,
                                           self.CRITIC_LEARNING_RATE, self.global_critic)

        # copy the global network parameters to the worker networks
        self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
        self.worker_critic.model.set_weights(self.global_critic.model.get_weights())

    ## calculate the n-step TD targets
    def n_step_td_target(self, rewards, next_v_value, done):
        td_targets = np.zeros_like(rewards)
        cumulative = 0
        if not done:
            cumulative = next_v_value
        for k in reversed(range(0, len(rewards))):  # over the trajectory, t = 0, ..., len(rewards) (= T)
            cumulative = self.GAMMA * cumulative + rewards[k]
            td_targets[k] = cumulative
        return td_targets  # vector of TD targets, one per time step

    ## extract the data stored in the batch
    def unpack_batch(self, batch):
        unpack = batch[0]
        for idx in range(len(batch) - 1):
            unpack = np.append(unpack, batch[idx + 1], axis=0)
        return unpack

    def run(self):  # threading.Thread dispatches to run(), so the method must keep this name
        # declare the globals shared by all workers
        global global_episode_count, global_step, global_episode_reward

        # print worker execution
        print(self.worker_name, 'starts ---')

        # repeat episodes
        while global_episode_count <= int(self.max_episode_num):
            # initialize batch
            batch_state, batch_action, batch_reward = [], [], []
            # initialize episode
            step, episode_reward, done = 0, 0, False
            # reset environment and observe initial state
            state = self.env.reset()

            # repeat until the episode ends
            while not done:
                # rendering; commented out because multithreading would open multiple windows
                # self.env.render()

                # extract action
                action = self.worker_actor.get_action(state)
                # action boundary clipping
                action = np.clip(action, -self.action_bound, self.action_bound)
                # observe next state and reward
                next_state, reward, done, _ = self.env.step(action)

                # shape translation
                state = np.reshape(state, [1, self.state_dim])
                reward = np.reshape(reward, [1, 1])
                action = np.reshape(action, [1, self.action_dim])

                # save batch
                batch_state.append(state)
                batch_action.append(action)
                batch_reward.append((reward + 8) / 8)  # rescale rewards from [-16, 0] to [-1, 1]

                # update state
                state = next_state
                step += 1
                episode_reward += reward[0]

                # if the batch is filled, start worker training
                if len(batch_state) == self.t_MAX or done:
                    # extract data from batch
                    states = self.unpack_batch(batch_state)
                    actions = self.unpack_batch(batch_action)
                    rewards = self.unpack_batch(batch_reward)
                    # clear batch
                    batch_state, batch_action, batch_reward = [], [], []
                    # calculate n-step TD targets and advantages
                    next_state = np.reshape(next_state, [1, self.state_dim])
                    next_v_value = self.worker_critic.model.predict(next_state)  # scalar value estimate
                    n_step_td_targets = self.n_step_td_target(rewards, next_v_value, done)
                    v_values = self.worker_critic.model.predict(states)  # one value per time step
                    advantages = n_step_td_targets - v_values
                    # update the global critic and actor networks
                    self.worker_critic.train(states, n_step_td_targets)
                    self.worker_actor.train(states, actions, advantages)
                    # copy the global network parameters to the worker networks
                    self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
                    self.worker_critic.model.set_weights(self.global_critic.model.get_weights())
                    # update global step
                    global_step += 1

                # if the episode is done
                if done:
                    # update global episode count
                    global_episode_count += 1
                    # print the episode reward
                    print('Worker name:', self.worker_name, ', Episode:', global_episode_count,
                          ', Step:', step, ', Reward:', episode_reward)
                    global_episode_reward.append(episode_reward)
                    # save the network parameters to file every 10th episode
                    if global_episode_count % 10 == 0:
                        self.global_actor.save_weights('./save_weights/pendulum_actor.h5')
                        self.global_critic.save_weights('./save_weights/pendulum_critic.h5')
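# ---------------------------------------------------------------------------
# Standalone check of the backward recursion used by n_step_td_target (the
# numbers are hypothetical, not from the source).  For a 4-step batch that did
# not terminate, the target for step k is
#   r_k + gamma*r_{k+1} + ... + gamma^(T-1-k)*r_{T-1} + gamma^(T-k)*V(s_T),
# i.e. the discounted n-step return bootstrapped with the critic's value of
# the state that follows the batch.
# ---------------------------------------------------------------------------
import numpy as np

GAMMA = 0.95
rewards = np.array([[1.0], [0.5], [0.0], [2.0]])  # shape (4, 1), like unpack_batch output
next_v_value = 3.0                                # critic's bootstrap value V(s_T)
done = False

td_targets = np.zeros_like(rewards)
cumulative = 0 if done else next_v_value
for k in reversed(range(len(rewards))):
    cumulative = GAMMA * cumulative + rewards[k]
    td_targets[k] = cumulative

# the first target equals the full 4-step discounted return plus the bootstrap term
direct = 1.0 + GAMMA * 0.5 + GAMMA ** 2 * 0.0 + GAMMA ** 3 * 2.0 + GAMMA ** 4 * 3.0
assert np.isclose(td_targets[0, 0], direct)  # both are about 5.6333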
class A3Cworker(threading.Thread): """ local agent network (worker) """ def __init__(self, worker_name, env_name, sess, global_actor, global_critic, max_episode_num): threading.Thread.__init__(self) #self.lock = threading.Lock() # hyperparameters self.GAMMA = 0.95 self.ACTOR_LEARNING_RATE = 0.0001 self.CRITIC_LEARNING_RATE = 0.001 self.ENTROPY_BETA = 0.01 self.t_MAX = 4 # t-step prediction self.max_episode_num = max_episode_num # environment self.env = gym.make(env_name) self.worker_name = worker_name self.sess = sess # global network sharing self.global_actor = global_actor self.global_critic = global_critic # get state dimension self.state_dim = self.env.observation_space.shape[0] # get action dimension self.action_dim = self.env.action_space.shape[0] # get action bound self.action_bound = self.env.action_space.high[0] # create local actor and critic networks self.worker_actor = Worker_Actor(self.sess, self.state_dim, self.action_dim, self.action_bound, self.ACTOR_LEARNING_RATE, self.ENTROPY_BETA, self.global_actor) self.worker_critic = Worker_Critic(self.sess, self.state_dim, self.action_dim, self.CRITIC_LEARNING_RATE, self.global_critic) # initial transfer global network parameters to worker network parameters self.worker_actor.model.set_weights( self.global_actor.model.get_weights()) self.worker_critic.model.set_weights( self.global_critic.model.get_weights()) ## computing Advantages and targets: y_k = r_k + gamma*V(s_k+1), A(s_k, a_k)= y_k - V(s_k) def n_step_td_target(self, rewards, next_v_value, done): td_targets = np.zeros_like(rewards) cumulative = 0 if not done: cumulative = next_v_value for k in reversed(range(0, len(rewards))): cumulative = self.GAMMA * cumulative + rewards[k] td_targets[k] = cumulative return td_targets # train each worker def run(self): global global_episode_count, global_step global global_episode_reward # total episode across all workers print(self.worker_name, "starts ---") while global_episode_count <= int(self.max_episode_num): # initialize batch states, actions, rewards = [], [], [] # reset episode step, episode_reward, done = 0, 0, False # reset the environment and observe the first state state = self.env.reset() # shape of state from gym (3,) while not done: # visualize the environment #self.env.render() # pick an action (shape of gym action = (action_dim,) ) action = self.worker_actor.get_action(state) # clip continuous action to be within action_bound action = np.clip(action, -self.action_bound, self.action_bound) # observe reward, new_state, shape of output of gym (state_dim,) next_state, reward, done, _ = self.env.step(action) # append to the batch states.append(state) actions.append(action) rewards.append((reward + 8) / 8) # <-- normalization # if batch is full or episode ends, start to train worker on batch if len(states) == self.t_MAX or done: # compute n-step TD target and advantage prediction next_v_value = self.worker_critic.predict([next_state]) n_step_td_targets = self.n_step_td_target( rewards, next_v_value, done) v_values = self.worker_critic.predict(states) advantages = n_step_td_targets - v_values #with self.lock: # update global critic self.worker_critic.train(states, n_step_td_targets) # update global actor self.worker_actor.train(states, actions, advantages) # transfer global network parameters to worker network parameters self.worker_actor.model.set_weights( self.global_actor.model.get_weights()) self.worker_critic.model.set_weights( self.global_critic.model.get_weights()) # clear the batch states, actions, rewards = [], [], [] # 
update global step global_step += 1 # update state and step state = next_state step += 1 episode_reward += reward # update global episode count global_episode_count += 1 ## display rewards every episode print('Worker name:', self.worker_name, ', Episode: ', global_episode_count, ', Step: ', step, ', Reward: ', episode_reward) global_episode_reward.append(episode_reward) ## save weights every episode if global_episode_count % 10 == 0: self.global_actor.save_weights( "./save_weights/pendulum_actor.h5") self.global_critic.save_weights( "./save_weights/pendulum_critic.h5")
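# ---------------------------------------------------------------------------
# Quick standalone check of the reward scaling used by both Pendulum workers
# (illustrative values only): Pendulum-v0 rewards lie roughly in [-16.27, 0],
# so (reward + 8) / 8 maps them to roughly [-1, 1], which keeps the critic's
# TD targets in a small range.
# ---------------------------------------------------------------------------
for r in (-16.0, -8.0, 0.0):      # roughly the worst, middle and best raw rewards
    print(r, '->', (r + 8) / 8)   # -1.0, 0.0, 1.0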