class A3CLocal:
    def __init__(self, config):
        self.config = config
        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)
        # create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        # create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)

    # select an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # append a transition to the replay memory
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # read out and clear the replay memory
    def get_replay(self):
        # arrange the history as arrays
        replay_memory = np.array(self.replay_memory, dtype=object)
        self.replay_memory.clear()

        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        return states, actions, rewards, next_states

    # copy the global networks' weights into the local networks
    def update_local_model(self, actor_dict, critic_dict):
        self.actor.load_state_dict(actor_dict)
        self.critic.load_state_dict(critic_dict)

    # release GPU memory
    def close(self):
        del self.actor
        del self.critic
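# The classes in this section rely on imports and on PolicyNet, ValueNet, and a
# torch `device` handle that are defined elsewhere in the project. The sketch
# below is NOT the original definition; it is a minimal assumption consistent
# with how they are called here: PolicyNet returns a batch of softmax action
# probabilities (get_action samples them with np.random.choice), and ValueNet
# returns a scalar state value. The hidden-layer size is an illustrative guess.
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class PolicyNet(nn.Module):
    # state -> action probabilities (each row sums to 1)
    def __init__(self, n_state, n_action, n_hidden=64):  # n_hidden is an assumption
        super().__init__()
        self.fc1 = nn.Linear(n_state, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_action)

    def forward(self, x):
        x = x.view(-1, self.fc1.in_features)  # keep a batch dimension; get_action indexes [0]
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=1)


class ValueNet(nn.Module):
    # state -> scalar state value
    def __init__(self, n_state, n_output=1, n_hidden=64):  # n_hidden is an assumption
        super().__init__()
        self.fc1 = nn.Linear(n_state, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_output)

    def forward(self, x):
        x = x.view(-1, self.fc1.in_features)
        x = F.relu(self.fc1(x))
        return self.fc2(x)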
class A3CGlobal:
    def __init__(self, config):
        self.config = config
        # create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.actor_lr)
        # create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.critic_lr)

    # compute discounted returns, bootstrapping from next_value if the episode is not done
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # update the policy network and the value network with a batch of transitions
    def train_model(self, states, actions, rewards, next_states, done):
        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        # bootstrap value of the last next_state; detach so the target is treated as a constant
        next_values = self.critic(next_states).view(-1).detach()
        # compute returns
        returns = self.get_returns(rewards, done, next_values[-1])
        values = self.critic(states).view(-1)

        # train the value network
        critic_loss = self.train_critic(values, returns)
        # train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss

    # update the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    # update the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # release GPU memory
    def close(self):
        del self.actor
        del self.critic
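# Usage sketch (not part of the original code): one A3C worker loop combining
# A3CLocal and A3CGlobal. In A3C several of these workers would run in parallel
# threads, all sharing the same A3CGlobal instance. The environment name, the
# classic Gym API (reset() -> state, step() -> (state, reward, done, info)),
# and the n_episode / n_step parameters are assumptions for illustration.
import gym


def run_worker(global_agent, config, n_episode=1000, n_step=20):  # hypothetical parameters
    env = gym.make("CartPole-v1")  # environment choice is an assumption
    local_agent = A3CLocal(config)
    # start from the current global weights
    local_agent.update_local_model(global_agent.actor.state_dict(),
                                   global_agent.critic.state_dict())
    for episode in range(n_episode):
        state = env.reset()
        done, step = False, 0
        while not done:
            state_in = np.reshape(state, [1, config.n_state])
            action = local_agent.get_action(state_in)
            next_state, reward, done, _ = env.step(action)
            local_agent.append_replay(state_in, action, reward,
                                      np.reshape(next_state, [1, config.n_state]))
            state = next_state
            step += 1
            # every n_step transitions (or at episode end) push the batch to the
            # global networks, then pull the updated weights back
            if step % n_step == 0 or done:
                states, actions, rewards, next_states = local_agent.get_replay()
                global_agent.train_model(states, actions, rewards, next_states, done)
                local_agent.update_local_model(global_agent.actor.state_dict(),
                                               global_agent.critic.state_dict())
    local_agent.close()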
class PGAgent:
    def __init__(self, config):
        self.config = config
        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)
        # create the policy network
        self.model = PolicyNet(self.config.n_state, self.config.n_action)
        self.model.to(device)
        self.model_optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config.learning_rate)

    # select an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.model(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # append a transition to the history
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # compute discounted returns (normalized for numerical stability)
    def get_returns(self, rewards):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        if 1 < len(returns):
            returns -= torch.mean(returns)
            returns /= (torch.std(returns) + 1.e-7)
        return returns

    # update the policy network once per finished episode
    def train_model(self):
        # arrange the history as arrays
        replay_memory = np.array(self.replay_memory, dtype=object)
        self.replay_memory.clear()

        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)

        # compute returns
        returns = self.get_returns(rewards)

        loss = self.train_policy(states, actions, returns)
        return loss

    # update the policy network
    def train_policy(self, states, actions, returns):
        policy = self.model(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * returns
        loss = -torch.mean(cross_entropy)

        self.model_optimizer.zero_grad()
        loss.backward()
        self.model_optimizer.step()

        return loss.item()

    # save the model weights to a file
    def save(self):
        torch.save(self.model.state_dict(), self.config.save_file)

    # load the model weights from a file
    def load(self):
        self.model.load_state_dict(torch.load(self.config.save_file))

    # release GPU memory
    def close(self):
        del self.model
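# Usage sketch (not part of the original code): a REINFORCE training loop for
# PGAgent. One update is performed per finished episode (Monte Carlo returns).
# The environment name, episode count, and classic Gym API are assumptions.
import gym


def train_pg(config, n_episode=1000):  # hypothetical parameter
    env = gym.make("CartPole-v1")  # environment choice is an assumption
    agent = PGAgent(config)
    for episode in range(n_episode):
        state = env.reset()
        done, score = False, 0.0
        while not done:
            state_in = np.reshape(state, [1, config.n_state])
            action = agent.get_action(state_in)
            next_state, reward, done, _ = env.step(action)
            agent.append_replay(state_in, action, reward,
                                np.reshape(next_state, [1, config.n_state]))
            state = next_state
            score += reward
        # Monte Carlo update: train once on the whole episode; the replay
        # memory is cleared inside train_model
        loss = agent.train_model()
    agent.save()
    agent.close()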
class A2CAgent:
    def __init__(self, config):
        self.config = config
        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)
        # create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.actor_lr)
        # create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.critic_lr)

    # select an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # append a transition to the history
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # compute discounted returns, bootstrapping from next_value if the episode is not done
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # update the policy network and the value network with the collected transitions
    def train_model(self, done):
        # arrange the history as arrays
        replay_memory = np.array(self.replay_memory, dtype=object)
        self.replay_memory.clear()

        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        # bootstrap value of the last next_state; detach so the target is treated as a constant
        next_values = self.critic(next_states).view(-1).detach()
        # compute returns
        returns = self.get_returns(rewards, done, next_values[-1])
        values = self.critic(states).view(-1)

        # train the value network
        critic_loss = self.train_critic(values, returns)
        # train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss

    # update the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    # update the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # save the model weights to files
    def save(self):
        torch.save(self.actor.state_dict(), self.config.save_file + ".actor")
        torch.save(self.critic.state_dict(), self.config.save_file + ".critic")

    # load the model weights from files
    def load(self):
        self.actor.load_state_dict(torch.load(self.config.save_file + ".actor"))
        self.critic.load_state_dict(torch.load(self.config.save_file + ".critic"))

    # release GPU memory
    def close(self):
        del self.actor
        del self.critic
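# Usage sketch (not part of the original code): an A2C training loop. The agent
# is updated every n_step transitions and at episode end, bootstrapping from the
# critic when the episode has not finished. The environment name, episode count,
# n_step size, and classic Gym API are assumptions.
import gym


def train_a2c(config, n_episode=1000, n_step=20):  # hypothetical parameters
    env = gym.make("CartPole-v1")  # environment choice is an assumption
    agent = A2CAgent(config)
    for episode in range(n_episode):
        state = env.reset()
        done, step = False, 0
        while not done:
            state_in = np.reshape(state, [1, config.n_state])
            action = agent.get_action(state_in)
            next_state, reward, done, _ = env.step(action)
            agent.append_replay(state_in, action, reward,
                                np.reshape(next_state, [1, config.n_state]))
            state = next_state
            step += 1
            if step % n_step == 0 or done:
                actor_loss, critic_loss = agent.train_model(done)
    agent.save()
    agent.close()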