# Single-process baseline: a vanilla DQN training loop.
# Assumes module-level definitions from earlier in the code:
# DQN, Memory, converter, make_19action, append_sample, update_network,
# update_target, writer, ENV_NAME, LR, num_channels.
def run():
    policy_net = DQN(num_channels, 19).cuda()
    target_net = DQN(num_channels, 19).cuda()
    optimizer = optim.Adam(policy_net.parameters(), LR)
    memory = Memory(50000)
    env = gym.make(ENV_NAME)
    env.make_interactive(port=6666, realtime=False)

    max_epi = 100
    n_step = 2
    update_period = 10
    gamma = 0.99
    total_steps = 0

    # linear epsilon decay schedule
    epsilon = 0.95
    endEpsilon = 0.01
    stepDrop = (epsilon - endEpsilon) / max_epi

    for num_epi in range(max_epi):
        obs = env.reset()
        state = converter(ENV_NAME, obs).cuda()
        state = state.float()
        done = False
        total_reward = 0
        steps = 0
        if epsilon > endEpsilon:
            epsilon -= stepDrop

        while not done:
            steps += 1
            total_steps += 1
            a_out = policy_net.sample_action(state, epsilon)
            action_index = a_out
            action = make_19action(env, action_index)
            obs_prime, reward, done, info = env.step(action)
            total_reward += reward
            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                writer.add_scalar('Rewards/train', total_reward, num_epi)
                break
            state_prime = converter(ENV_NAME, obs_prime).cuda()
            # store the transition with its initial TD-error priority
            append_sample(memory, policy_net, target_net, state,
                          action_index, reward, state_prime, done)
            state = state_prime
            if memory.size() > 1000:
                update_network(policy_net, target_net, memory, 2,
                               optimizer, total_steps)
                if total_steps % 2000 == 0:
                    update_target(policy_net, target_net)
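The loop above calls update_target and a prioritized Memory that are defined elsewhere. Below is a minimal sketch of the interface it assumes: the hard target copy is the standard DQN update, and the list-based buffer is a simplified stand-in for the author's prioritized replay (not the actual implementation, which is presumably SumTree-based).

import random

def update_target(policy_net, target_net):
    # Hard update: copy the online network's weights into the target network.
    target_net.load_state_dict(policy_net.state_dict())

class Memory:
    """Simplified stand-in for the prioritized buffer used above:
    add(error, transition), size(), and proportional sampling."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []        # transitions
        self.priorities = []  # |TD error| per transition

    def add(self, error, transition):
        if len(self.data) >= self.capacity:
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append(transition)
        self.priorities.append(float(error) + 1e-6)  # avoid zero priority

    def size(self):
        return len(self.data)

    def sample(self, batch_size):
        # sample indices proportionally to their priorities
        idxs = random.choices(range(len(self.data)),
                              weights=self.priorities, k=batch_size)
        return [self.data[i] for i in idxs], idxs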
class Actor:
    def __init__(self, learner, actor_idx, epsilon):
        # environment initialization
        import gym
        import minerl
        self.actor_idx = actor_idx
        self.env = gym.make("MineRLTreechop-v0")
        self.port_number = int("12340") + actor_idx
        print("actor environment %d initialized successfully" % self.actor_idx)
        self.shared_network_cpu = ray.get(learner.get_network.remote())
        # self.shared_memory = ray.get(shared_memory_id)
        # print("shared memory assigned successfully")

        # network initialization
        self.actor_network = DQN(19).cpu()
        self.actor_target_network = DQN(19).cpu()
        self.actor_network.load_state_dict(self.shared_network_cpu.state_dict())
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        print("actor network %d initialized successfully" % self.actor_idx)
        self.initialized = False
        self.epi_counter = 0

        # exploration info
        self.epsilon = epsilon
        self.max_step = 100
        self.local_buffer_size = 100
        self.local_buffer = deque(maxlen=self.local_buffer_size)
        project_name = 'apex_dqfd_Actor%d' % actor_idx
        wandb.init(project=project_name, entity='neverparadise')

        # 1. Copy the network parameters
        # 2. Explore the environment (reset, act)
        # 3. Store transitions in the local buffer
        # 4. Compute priorities
        # 5. Push to the global buffer
        # 6. Periodically update the network

    def get_initialized(self):
        return self.initialized

    def get_counter(self):
        return self.epi_counter

    # Each environment instance explores with its own epsilon.
    # Transitions are stored in the local buffer during exploration;
    # once the local buffer holds enough of them, they are pushed to the global buffer.
    def explore(self, learner, shared_memory):
        self.env.make_interactive(port=self.port_number, realtime=False)
        self.initialized = True
        for num_epi in range(self.max_step):
            obs = self.env.reset()
            state = converter(obs).cpu()
            state = state.float()
            done = False
            total_reward = 0
            steps = 0
            total_steps = 0
            self.epsilon = 0.5  # reset each episode before the per-actor decay
            if self.epsilon > endEpsilon:  # endEpsilon, stepDrop: module-level constants
                self.epsilon -= stepDrop / (self.actor_idx + 1)

            # n-step transition buffers
            n_step = 2
            n_step_state_buffer = deque(maxlen=n_step)
            n_step_action_buffer = deque(maxlen=n_step)
            n_step_reward_buffer = deque(maxlen=n_step)
            n_step_n_rewards_buffer = deque(maxlen=n_step)
            n_step_next_state_buffer = deque(maxlen=n_step)
            n_step_done_buffer = deque(maxlen=n_step)
            gamma_list = [0.99 ** i for i in range(n_step)]

            while not done:
                steps += 1
                total_steps += 1
                a_out = self.actor_network.sample_action(state, self.epsilon)
                action_index = a_out
                action = make_action(self.env, action_index)
                # action['attack'] = 1
                obs_prime, reward, done, info = self.env.step(action)
                total_reward += reward
                state_prime = converter(obs_prime)

                # add to local buffer
                n_step_state_buffer.append(state)
                n_step_action_buffer.append(action_index)
                n_step_reward_buffer.append(reward)
                n_step_next_state_buffer.append(state_prime)
                n_step_done_buffer.append(done)
                # discounted n-step return over the buffered rewards
                n_rewards = sum([gamma * reward for gamma, reward
                                 in zip(gamma_list, n_step_reward_buffer)])
                n_step_n_rewards_buffer.append(n_rewards)

                if len(n_step_state_buffer) >= n_step:
                    # compute priorities and push to the global buffer
                    for i in range(n_step):
                        self.append_sample(shared_memory,
                                           self.actor_network,
                                           self.actor_target_network,
                                           n_step_state_buffer[i],
                                           n_step_action_buffer[i],
                                           n_step_reward_buffer[i],
                                           n_step_next_state_buffer[i],
                                           n_step_done_buffer[i],
                                           n_step_n_rewards_buffer[i])
                        if n_step_done_buffer[i]:
                            break

                state = state_prime.float().cpu()
                if done:
                    break

            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                wandb.log({"rewards": total_reward})
                self.update_params(learner)
                # if num_epi % 5 == 0 and num_epi != 0:
                #     print("actor network is updated")

    def env_close(self):
        self.env.close()

    def update_params(self, learner):
        shared_network = ray.get(learner.get_network.remote())
        self.actor_network.load_state_dict(shared_network.state_dict())

    def append_sample(self, memory, model, target_model, state, action,
                      reward, next_state, done, n_rewards):
        # Calculating priority (TD error)
        target = model(state.float()).data
        old_val = target[0][action].cpu()
        target_val = target_model(next_state.float()).data.cpu()
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + 0.99 * torch.max(target_val)
        error = abs(old_val - target[0][action])
        error = error.cpu()
        memory.add.remote(error, [state, action, reward, next_state, done, n_rewards])
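In explore, transitions reach the global buffer through memory.add.remote(...), which implies the shared buffer is itself a Ray actor. A minimal sketch of that wrapper, assuming the simplified Memory class sketched earlier (ReplayBuffer is a hypothetical name, not from the source):

import ray

@ray.remote
class ReplayBuffer:
    # Wraps the prioritized buffer in a Ray actor so that multiple
    # Actor processes can push transitions into one shared buffer.
    def __init__(self, capacity):
        self.memory = Memory(capacity)

    def add(self, error, transition):
        self.memory.add(error, transition)

    def size(self):
        return self.memory.size()

    def sample(self, batch_size):
        return self.memory.sample(batch_size)

# usage, matching append_sample above:
# shared_memory = ReplayBuffer.remote(50000)
# shared_memory.add.remote(error, [state, action, reward, next_state, done, n_rewards])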
class Actor:
    def __init__(self, learner, param_server, actor_idx, epsilon,
                 num_channels=3, num_actions=19):
        # environment initialization
        import gym
        import minerl
        self.actor_idx = actor_idx
        self.env = gym.make(ENV_NAME)
        self.port_number = int("12340") + actor_idx
        print("actor environment %d initialized successfully" % self.actor_idx)
        self.env.make_interactive(port=self.port_number, realtime=False)
        self.learner_state_dict = ray.get(learner.get_state_dict.remote())
        print("getting learner state dict finished...")

        # network initialization
        self.actor_network = DQN(num_channels, num_actions).cuda()
        self.actor_target_network = DQN(num_channels, num_actions).cuda()
        self.actor_network.load_state_dict(self.learner_state_dict)
        self.actor_target_network.load_state_dict(self.learner_state_dict)
        print("actor network %d initialized successfully" % self.actor_idx)
        self.param_server = param_server
        self.epi_counter = 0
        self.max_epi = 100
        self.n_step = 4
        self.update_period = 4
        self.gamma = 0.99

        # exploration info
        self.epsilon = epsilon
        self.endEpsilon = 0.01
        self.stepDrop = (self.epsilon - self.endEpsilon) / self.max_epi
        self.local_buffer_size = 100
        self.local_buffer = deque(maxlen=self.local_buffer_size)
        self.writer = SummaryWriter(f'runs/apex/actor{self.actor_idx}')

        # 1. Copy the network parameters
        # 2. Explore the environment (reset, act)
        # 3. Store transitions in the local buffer
        # 4. Compute priorities
        # 5. Push to the global buffer
        # 6. Periodically update the network

    def get_epi_counter(self):
        return self.epi_counter

    def update_params(self, learner):
        ray.get(self.param_server.pull_from_learner.remote(learner))
        policy_params, target_params = ray.get(self.param_server.push_to_actor.remote())
        self.actor_network.load_state_dict(policy_params)
        self.actor_target_network.load_state_dict(target_params)

    def append_sample(self, memory, state, action, reward, next_state,
                      done, n_rewards=None):
        # Calculating priority (TD error)
        target = self.actor_network(state).data
        old_val = target[0][action].cpu()
        target_val = self.actor_target_network(next_state).data.cpu()
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + 0.99 * torch.max(target_val)
        error = abs(old_val - target[0][action])
        error = error.cpu()
        state_ = state.cpu()
        next_state_ = next_state.cpu()
        if isinstance(memory, Memory):  # local (in-process) buffer
            if n_rewards is None:
                memory.add(error, [state_, action, reward, next_state_, done])
            else:
                memory.add(error, (state_, action, reward, next_state_, done, n_rewards))
        else:  # Ray actor handle to the shared buffer
            if n_rewards is None:
                memory.add.remote(error, [state_, action, reward, next_state_, done])
            else:
                memory.add.remote(error, (state_, action, reward, next_state_, done, n_rewards))

    def explore(self, learner, memory):
        for num_epi in range(self.max_epi):
            obs = self.env.reset()
            state = converter(ENV_NAME, obs).cuda()
            state = state.float()
            done = False
            total_reward = 0
            steps = 0
            total_steps = 0
            if self.epsilon > self.endEpsilon:
                self.epsilon -= self.stepDrop

            # initialize local n-step buffers
            n_step = self.n_step
            n_step_state_buffer = deque(maxlen=n_step)
            n_step_action_buffer = deque(maxlen=n_step)
            n_step_reward_buffer = deque(maxlen=n_step)
            n_step_n_rewards_buffer = deque(maxlen=n_step)
            n_step_next_state_buffer = deque(maxlen=n_step)
            n_step_done_buffer = deque(maxlen=n_step)
            gamma_list = [self.gamma ** i for i in range(n_step)]

            while not done:
                steps += 1
                total_steps += 1
                a_out = self.actor_network.sample_action(state, self.epsilon)
                action_index = a_out
                action = make_19action(self.env, action_index)
                obs_prime, reward, done, info = self.env.step(action)
                total_reward += reward
                state_prime = converter(ENV_NAME, obs_prime).cuda()

                # put transition in local buffer
                n_step_state_buffer.append(state)
                n_step_action_buffer.append(action_index)
                n_step_reward_buffer.append(reward)
                n_step_next_state_buffer.append(state_prime)
                n_step_done_buffer.append(done)
                # discounted n-step return over the buffered rewards
                n_rewards = sum([gamma * reward for gamma, reward
                                 in zip(gamma_list, n_step_reward_buffer)])
                n_step_n_rewards_buffer.append(n_rewards)

                if len(n_step_state_buffer) >= n_step:
                    # compute priorities and push to the shared buffer
                    for i in range(n_step):
                        self.append_sample(memory,
                                           n_step_state_buffer[i],
                                           n_step_action_buffer[i],
                                           n_step_reward_buffer[i],
                                           n_step_next_state_buffer[i],
                                           n_step_done_buffer[i],
                                           n_step_n_rewards_buffer[i])
                        if n_step_done_buffer[i]:
                            break

                state = state_prime
                self.actor_network.cuda()
                self.actor_target_network.cuda()

                if done:
                    print("%d episode is done" % num_epi)
                    print("total rewards : %d " % total_reward)
                    self.writer.add_scalar('Rewards/train', total_reward, num_epi)
                    self.epi_counter += 1
                    if num_epi % self.update_period == 0:
                        self.update_params(learner)
                    break
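This second Actor pulls weights through a param_server with pull_from_learner and push_to_actor handles that are not shown here. A hypothetical sketch consistent with those calls; the source only exposes learner.get_state_dict, so both the policy and target parameters are assumed to be served from that one state dict:

import ray

@ray.remote
class ParamServer:
    """Hypothetical parameter server matching the calls in update_params.
    Stores the latest learner weights and hands them out to actors."""
    def __init__(self):
        self.policy_params = None
        self.target_params = None

    def pull_from_learner(self, learner):
        # fetch the newest weights from the learner actor
        state_dict = ray.get(learner.get_state_dict.remote())
        self.policy_params = state_dict
        self.target_params = state_dict  # assumed: same dict serves both networks

    def push_to_actor(self):
        return self.policy_params, self.target_params

# usage, mirroring Actor.update_params:
# param_server = ParamServer.remote()
# ray.get(param_server.pull_from_learner.remote(learner))
# policy_params, target_params = ray.get(param_server.push_to_actor.remote())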