import numpy as np
import matplotlib.pyplot as plt

# Actor and Critic are Keras-based networks defined in the accompanying modules (not shown here).
class A2Cagent(object):

    def __init__(self, env):
        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        # create actor and critic networks
        self.actor = Actor(self.state_dim, self.action_dim,
                           self.action_bound, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.CRITIC_LEARNING_RATE)

        # save the results
        self.save_epi_reward = []

    ## computing advantages and TD targets: y_k = r_k + gamma*V(s_k+1), A(s_k, a_k) = y_k - V(s_k)
    def advantage_td_target(self, reward, v_value, next_v_value, done):
        if done:
            y_k = reward
            advantage = y_k - v_value
        else:
            y_k = reward + self.GAMMA * next_v_value
            advantage = y_k - v_value
        return advantage, y_k

    ## convert a list of np.array into a single np.array
    def unpack_batch(self, batch):
        unpack = batch[0]
        for idx in range(len(batch) - 1):
            unpack = np.append(unpack, batch[idx + 1], axis=0)
        return unpack

    ## train the agent
    def train(self, max_episode_num):
        for ep in range(int(max_episode_num)):
            # initialize batch
            batch_state, batch_action, batch_td_target, batch_advantage = [], [], [], []
            # reset episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset()  # shape of state from gym: (3,)

            while not done:
                # visualize the environment
                #self.env.render()
                # pick an action (shape of gym action: (action_dim,))
                action = self.actor.get_action(state)
                # clip continuous action to be within action_bound
                action = np.clip(action, -self.action_bound, self.action_bound)
                # observe reward and next_state (shape: (state_dim,))
                next_state, reward, done, _ = self.env.step(action)

                # change shape (state_dim,) -> (1, state_dim); same for action and next_state
                state = np.reshape(state, [1, self.state_dim])
                next_state = np.reshape(next_state, [1, self.state_dim])
                action = np.reshape(action, [1, self.action_dim])
                reward = np.reshape(reward, [1, 1])

                # compute current and next v_value
                v_value = self.critic.model.predict(state)
                next_v_value = self.critic.model.predict(next_state)

                # compute advantage and TD target
                train_reward = (reward + 8) / 8  # <-- reward normalization
                advantage, y_i = self.advantage_td_target(
                    train_reward, v_value, next_v_value, done)

                # append to the batch
                batch_state.append(state)
                batch_action.append(action)
                batch_td_target.append(y_i)
                batch_advantage.append(advantage)

                # keep collecting until the batch becomes full
                if len(batch_state) < self.BATCH_SIZE:
                    # update current state
                    state = next_state[0]
                    episode_reward += reward[0]
                    time += 1
                    continue

                # once the batch is full, train the networks on it
                # extract batched states, actions, td_targets, advantages
                states = self.unpack_batch(batch_state)
                actions = self.unpack_batch(batch_action)
                td_targets = self.unpack_batch(batch_td_target)
                advantages = self.unpack_batch(batch_advantage)

                # clear the batch
                batch_state, batch_action, batch_td_target, batch_advantage = [], [], [], []

                # train critic
                self.critic.train_on_batch(states, td_targets)
                # train actor
                self.actor.train(states, actions, advantages)

                # update current state
                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            ## display rewards every episode
            print('Episode: ', ep + 1, 'Time: ', time, 'Reward: ', episode_reward)
            self.save_epi_reward.append(episode_reward)

            ## save weights every 10 episodes
            if ep % 10 == 0:
                self.actor.save_weights("./save_weights/pendulum_actor.h5")
                self.critic.save_weights("./save_weights/pendulum_critic.h5")

        # save the accumulated episode rewards to file after training
        np.savetxt('./save_weights/pendulum_epi_reward.txt', self.save_epi_reward)
        print(self.save_epi_reward)

    ## plot episode rewards
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()
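# --- Usage sketch (addition, not part of the original listing) ---
# A minimal example of how the agent above could be driven, assuming the OpenAI Gym
# pendulum task (consistent with the 3-dimensional state, the [-2, 2] action bound,
# and the "pendulum_*" weight file names used in the listing). The episode budget
# and the main() wrapper are illustrative assumptions.
import gym

def main():
    max_episode_num = 1000            # assumed training budget
    env = gym.make('Pendulum-v0')     # 'Pendulum-v1' in newer Gym releases
    agent = A2Cagent(env)             # build the actor and critic networks

    agent.train(max_episode_num)      # collect batches and update both networks
    agent.plot_result()               # plot the accumulated episode rewards

if __name__ == '__main__':
    main()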
class A2Cagent(object):

    def __init__(self, env):
        # set up the TensorFlow session
        self.sess = tf.Session()
        K.set_session(self.sess)

        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        # environment
        self.env = env
        # state dimension
        self.state_dim = env.observation_space.shape[0]
        # action dimension
        self.action_dim = env.action_space.shape[0]
        # maximum size of an action
        self.action_bound = env.action_space.high[0]

        # create the actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.CRITIC_LEARNING_RATE)

        # initialize variables for the later gradient computation
        self.sess.run(tf.global_variables_initializer())

        # storage for the total reward obtained in each episode
        self.save_epi_reward = []

    ## compute the advantage and the TD target
    def advantage_td_target(self, reward, v_value, next_v_value, done):
        if done:  # the episode has terminated
            y_k = reward
            advantage = y_k - v_value
        else:
            y_k = reward + self.GAMMA * next_v_value
            advantage = y_k - v_value
        return advantage, y_k

    ## extract the data stored in the batch (list of np.array -> single np.array)
    def unpack_batch(self, batch):
        unpack = batch[0]
        for idx in range(len(batch) - 1):
            unpack = np.append(unpack, batch[idx + 1], axis=0)
        return unpack

    ## train the agent
    def train(self, max_episode_num):
        # repeat for each episode
        for ep in range(int(max_episode_num)):
            # initialize the batch
            batch_state, batch_action, batch_td_target, batch_advantage = [], [], [], []
            # reset the episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the initial state
            state = self.env.reset()

            while not done:
                # visualize the environment
                self.env.render()
                # pick an action
                action = self.actor.get_action(state)
                # clip the action to the action bound
                action = np.clip(action, -self.action_bound, self.action_bound)
                # observe the next state and the reward
                next_state, reward, done, _ = self.env.step(action)

                # reshape
                state = np.reshape(state, [1, self.state_dim])
                next_state = np.reshape(next_state, [1, self.state_dim])
                action = np.reshape(action, [1, self.action_dim])
                reward = np.reshape(reward, [1, 1])

                # compute the state values
                v_value = self.critic.model.predict(state)
                next_v_value = self.critic.model.predict(next_state)

                # compute the advantage and the TD target
                train_reward = (reward + 8) / 8
                advantage, y_i = self.advantage_td_target(
                    train_reward, v_value, next_v_value, done)

                # store in the batch
                batch_state.append(state)
                batch_action.append(action)
                batch_td_target.append(y_i)
                batch_advantage.append(advantage)

                # keep collecting (without training) until the batch is full
                if len(batch_state) < self.BATCH_SIZE:
                    # update the current state
                    state = next_state[0]
                    episode_reward += reward[0]
                    time += 1
                    continue

                # once the batch is full, start training
                # extract the data from the batch
                states = self.unpack_batch(batch_state)
                actions = self.unpack_batch(batch_action)
                td_targets = self.unpack_batch(batch_td_target)
                advantages = self.unpack_batch(batch_advantage)

                # batch flushing
                batch_state, batch_action, batch_td_target, batch_advantage = [], [], [], []

                # update the critic network
                self.critic.train_on_batch(states, td_targets)
                # update the actor network
                self.actor.train(states, actions, advantages)

                # update the current state
                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            # print the reward of each episode
            print('Episode:', ep + 1, 'Time:', time, 'Reward:', episode_reward)
            self.save_epi_reward.append(episode_reward)

            # save the network parameters to file every 10 episodes
            if ep % 10 == 0:
                self.actor.save_weights('./save_weights/pendulum_actor.h5')
                self.critic.save_weights('./save_weights/pendulum_critic.h5')

        # after training, save the accumulated episode rewards
        np.savetxt('./save_weights/pendulum_epi_reward.txt', self.save_epi_reward)

    ## plot the episode rewards
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()
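# --- Compatibility note (addition, not part of the original listing) ---
# The session-based setup above (tf.Session, K.set_session,
# tf.global_variables_initializer) is TensorFlow 1.x style. A sketch of the
# imports it assumes; whether the backend comes from standalone Keras or
# tf.keras is an assumption, since the import lines are not shown here.
import tensorflow as tf            # TensorFlow 1.x
from keras import backend as K     # or: from tensorflow.keras import backend as K

# Under TensorFlow 2.x the same calls are only reachable through the
# compatibility module, roughly (an assumption about the reader's setup):
# import tensorflow.compat.v1 as tf
# tf.disable_eager_execution()
# K = tf.keras.backend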
class A2Cagent(object):

    def __init__(self, env):
        self.sess = tf.Session()
        K.set_session(self.sess)

        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        # create actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.CRITIC_LEARNING_RATE)

        # initialize variables for the later gradient calculation
        self.sess.run(tf.global_variables_initializer())  # <-- also runs without this line

        # save the results
        self.save_epi_reward = []

    ## computing advantages and targets: y_k = r_k + gamma*V(s_k+1), A(s_k, a_k) = y_k - V(s_k)
    def advantage_td_target(self, reward, v_value, next_v_value, done):
        if done:
            y_k = reward
            advantage = y_k - v_value
        else:
            y_k = reward + self.GAMMA * next_v_value
            advantage = y_k - v_value
        return advantage, y_k

    ## train the agent
    def train(self, max_episode_num):
        for ep in range(int(max_episode_num)):
            # initialize batch
            states, actions, td_targets, advantages = [], [], [], []
            # reset episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset()  # shape of state from gym: (3,)

            while not done:
                # visualize the environment
                #self.env.render()
                # pick an action (shape of gym action: (action_dim,))
                action = self.actor.get_action(state)
                # clip continuous action to be within action_bound
                action = np.clip(action, -self.action_bound, self.action_bound)
                # observe reward and new_state (shape: (state_dim,))
                next_state, reward, done, _ = self.env.step(action)

                # compute current and next v_value
                v_value = self.critic.predict(state)
                next_v_value = self.critic.predict(next_state)

                # compute advantage and TD target
                train_reward = (reward + 8) / 8  # <-- reward normalization
                advantage, y_i = self.advantage_td_target(
                    train_reward, v_value, next_v_value, done)

                # append to the batch
                states.append(state)
                actions.append(action)
                td_targets.append(y_i)
                advantages.append(advantage)

                # if the batch is full, train the networks on it
                if len(states) == self.BATCH_SIZE:
                    # train critic
                    self.critic.train_on_batch(states, td_targets)
                    # train actor
                    self.actor.train(states, actions, advantages)
                    # clear the batch
                    states, actions, td_targets, advantages = [], [], [], []

                # update current state
                state = next_state
                episode_reward += reward
                time += 1

            ## display rewards every episode
            print('Episode: ', ep + 1, 'Time: ', time, 'Reward: ', episode_reward)
            self.save_epi_reward.append(episode_reward)

            ## save weights every 10 episodes
            if ep % 10 == 0:
                self.actor.save_weights("./save_weights/pendulum_actor.h5")
                self.critic.save_weights("./save_weights/pendulum_critic.h5")

        # save the accumulated episode rewards to file after training
        np.savetxt('./save_weights/pendulum_epi_reward.txt', self.save_epi_reward)
        print(self.save_epi_reward)

    ## plot episode rewards
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()
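# --- Interface sketch (addition): what A2Cagent assumes of Actor and Critic ---
# Reconstructed only from the calls made in the listings above; the real classes
# (network architecture, loss, and gradient code) live in separate modules that
# are not part of this document. This skeleton is illustrative, not runnable training code.
class Actor(object):
    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate):
        ...  # build the policy network; the session argument appears only in the TF 1.x variants

    def get_action(self, state):
        ...  # sample an action u ~ pi_theta(u | state)

    def train(self, states, actions, advantages):
        ...  # policy-gradient update weighted by the advantages

    def save_weights(self, path):
        ...  # write the network weights to an .h5 file


class Critic(object):
    def __init__(self, state_dim, action_dim, learning_rate):
        ...  # build the state-value network V_phi

    def predict(self, state):
        ...  # return V_phi(state); some variants call self.critic.model.predict instead

    def train_on_batch(self, states, td_targets):
        ...  # regression of V_phi toward the TD targets y_k

    def save_weights(self, path):
        ...  # write the network weights to an .h5 file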
# (object): kept for compatibility with Python 2 old-style classes; with Python 3 only, (object) can be omitted
class A2Cagent(object):

    ## class initializer
    def __init__(self, env):
        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        # environment
        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound (maximum size of an action)
        self.action_bound = env.action_space.high[0]

        ## A2C algorithm, step 1: initialize the critic network parameters phi and the actor network parameters theta
        # create actor and critic networks
        self.actor = Actor(self.state_dim, self.action_dim,
                           self.action_bound, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.CRITIC_LEARNING_RATE)

        # save the results (total reward obtained in each episode)
        self.save_epi_reward = []

    ## computing advantages and targets: y_k = r_k + gamma * V(s_k+1), A(s_k, a_k) = y_k - V(s_k)
    def advantage_td_target(self, reward, v_value, next_v_value, done):
        if done:
            y_k = reward
            advantage = y_k - v_value
        else:
            y_k = reward + self.GAMMA * next_v_value
            advantage = y_k - v_value
        return advantage, y_k

    ## train the agent
    def train(self, max_episode_num):
        ## A2C algorithm, step 2: repeat for each episode
        for ep in range(int(max_episode_num)):
            # initialize the batches for states, actions, TD targets, and advantages
            states, actions, td_targets, advantages = [], [], [], []
            # reset episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state x0
            state = self.env.reset()  # shape of state from gym: (3,)

            ## A2C algorithm, step 2: inner repeat (time steps within an episode)
            while not done:
                # visualize the environment
                # self.env.render()

                ## A2C algorithm, step 2.1.1: sample an action stochastically from the policy, u_i ~ pi_theta(u_i | x_i)
                # pick an action (shape of gym action: (action_dim,))
                action = self.actor.get_action(state)
                # clip continuous action to be within action_bound [-2, 2]
                action = np.clip(action, -self.action_bound, self.action_bound)

                ## A2C algorithm, step 2.1.2: execute u_i, then observe the reward r(x_i, u_i) and the next state x_{i+1}
                # observe reward and new_state (shape: (state_dim,))
                next_state, reward, done, _ = self.env.step(action)

                # compute current and next v_value
                v_value = self.critic.predict(state)
                next_v_value = self.critic.predict(next_state)

                # compute advantage and TD target
                train_reward = (reward + 8) / 8  # <-- normalization
                advantage, y_i = self.advantage_td_target(train_reward, v_value, next_v_value, done)

                # append to the batch
                states.append(state)
                actions.append(action)
                td_targets.append(y_i)
                advantages.append(advantage)

                # if batch is full, start to train networks on batch
                if len(states) == self.BATCH_SIZE:
                    ## A2C algorithm, step 2.5: update (train) the critic network by regression toward the TD targets
                    # train critic
                    self.critic.train_on_batch(states, td_targets)
                    ## A2C algorithm, step 2.6: update (train) the actor network
                    # train actor
                    self.actor.train(states, actions, advantages)
                    # clear the batch
                    states, actions, td_targets, advantages = [], [], [], []

                # update current state
                state = next_state
                episode_reward += reward
                time += 1

            ## display rewards every episode
            print('Episode: ', ep + 1, 'Time: ', time, 'Reward: ', episode_reward)
            self.save_epi_reward.append(episode_reward)

            ## save weights every 10 episodes
            if ep % 10 == 0:
                self.actor.save_weights("./save_weights/pendulum_actor.h5")
                self.critic.save_weights("./save_weights/pendulum_critic.h5")

        # save the accumulated episode rewards to file after training
        np.savetxt('./save_weights/pendulum_epi_reward.txt', self.save_epi_reward)
        print(self.save_epi_reward)

    ## plot episode rewards
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()
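# --- Worked example (addition): the TD target and advantage from the annotated steps above ---
# A small numeric check of advantage_td_target, y_k = r_k + gamma * V(s_{k+1}) and
# A(s_k, a_k) = y_k - V(s_k). The reward and the critic values below are made up
# for illustration, not produced by a trained critic.
GAMMA = 0.95
reward, v_value, next_v_value = -0.5, -3.0, -2.0

y_k = reward + GAMMA * next_v_value          # -0.5 + 0.95 * (-2.0) = -2.4
advantage = y_k - v_value                    # -2.4 - (-3.0) = 0.6  -> better than the critic expected

# On the last step of an episode (done == True) there is no bootstrapped term:
y_k_terminal = reward                        # -0.5
advantage_terminal = y_k_terminal - v_value  # -0.5 - (-3.0) = 2.5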