Example #1
def run():
    # Relies on module-level definitions from the surrounding project (not shown):
    # DQN, Memory, converter, make_19action, append_sample, update_network,
    # update_target, writer (a SummaryWriter), the gym/optim imports, and the
    # constants ENV_NAME, num_channels and LR.
    policy_net = DQN(num_channels, 19).cuda()
    target_net = DQN(num_channels, 19).cuda()
    optimizer = optim.Adam(policy_net.parameters(), LR)
    memory = Memory(50000)
    env = gym.make(ENV_NAME)
    env.make_interactive(port=6666, realtime=False)
    max_epi = 100
    n_step = 2
    update_period = 10
    gamma = 0.99

    total_steps = 0
    # Linear epsilon decay from 0.95 down to 0.01 over max_epi episodes.
    epsilon = 0.95
    endEpsilon = 0.01
    stepDrop = (epsilon - endEpsilon) / max_epi

    for num_epi in range(max_epi):
        obs = env.reset()
        state = converter(ENV_NAME, obs).cuda()
        state = state.float()
        done = False
        total_reward = 0
        steps = 0
        # Decay exploration once per episode until it reaches endEpsilon.
        if epsilon > endEpsilon:
            epsilon -= stepDrop

        while not done:
            steps += 1
            total_steps += 1
            a_out = policy_net.sample_action(state, epsilon)
            action_index = a_out
            action = make_19action(env, action_index)
            obs_prime, reward, done, info = env.step(action)

            total_reward += reward

            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                writer.add_scalar('Rewards/train', total_reward, num_epi)
                break

            state_prime = converter(ENV_NAME, obs_prime).cuda()
            append_sample(memory, policy_net, target_net, state, action_index,
                          reward, state_prime, done)
            state = state_prime

            # Start learning once the replay memory holds enough warm-up transitions.
            if memory.size() > 1000:
                update_network(policy_net, target_net, memory, 2, optimizer,
                               total_steps)

            # Every 2000 environment steps, sync the target network with the policy network.
            if total_steps % 2000 == 0:
                update_target(policy_net, target_net)
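The helper update_target is referenced above but not included in this snippet. As a minimal sketch, assuming it performs the usual hard copy of the online weights into the target network (consistent with it being called once every 2000 steps), it could look like:

def update_target(policy_net, target_net):
    # Hard update: overwrite the target network with the current policy weights.
    target_net.load_state_dict(policy_net.state_dict())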
Example #2
class Actor:
    def __init__(self, learner, actor_idx, epsilon):
        # environment initialization
        import gym
        import minerl
        self.actor_idx = actor_idx
        self.env = gym.make("MineRLTreechop-v0")
        self.port_number = 12340 + actor_idx
        print("actor environment %d initialize successfully" % self.actor_idx)
        self.shared_network_cpu = ray.get(learner.get_network.remote())
        # self.shared_memory = ray.get(shared_memory_id)
        # print("shared memory assign successfully")

        # network initialization
        self.actor_network = DQN(19).cpu()
        self.actor_target_network = DQN(19).cpu()
        self.actor_network.load_state_dict(self.shared_network_cpu.state_dict())
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        print("actor network %d initialize successfully" % self.actor_idx)

        self.initialized = False
        self.epi_counter = 0
        # exploring info
        self.epsilon = epsilon
        self.max_step = 100
        self.local_buffer_size = 100
        self.local_buffer = deque(maxlen=self.local_buffer_size)

        project_name = 'apex_dqfd_Actor%d' % actor_idx
        wandb.init(project=project_name, entity='neverparadise')

    # 1. Copy the network parameters from the learner
    # 2. Explore the environment (reset, act)
    # 3. Store transitions in the local buffer
    # 4. Compute priorities
    # 5. Push to the global buffer
    # 6. Periodically update the network

    def get_initialized(self):
        return self.initialized

    def get_counter(self):
        return self.epi_counter

    # Each environment instance explores according to its own epsilon.
    # During exploration, transitions are stored in the local buffer.
    # Once the local buffer holds enough transitions, they are added to the global buffer.

    def explore(self, learner, shared_memory):
        self.env.make_interactive(port=self.port_number, realtime=False)
        self.initialized = True

        # Note: self.max_step caps the number of episodes this actor runs.
        for num_epi in range(self.max_step):
            obs = self.env.reset()
            state = converter(obs).cpu()
            state = state.float()
            done = False
            total_reward = 0
            steps = 0
            total_steps = 0
            # endEpsilon and stepDrop are presumably module-level constants (not shown);
            # epsilon is reset to 0.5 at the start of every episode, then decayed
            # more slowly for higher-index actors.
            self.epsilon = 0.5
            if self.epsilon > endEpsilon:
                self.epsilon -= stepDrop / (self.actor_idx + 1)

            # Sliding n-step window used to accumulate discounted n-step returns.
            n_step = 2
            n_step_state_buffer = deque(maxlen=n_step)
            n_step_action_buffer = deque(maxlen=n_step)
            n_step_reward_buffer = deque(maxlen=n_step)
            n_step_n_rewards_buffer = deque(maxlen=n_step)
            n_step_next_state_buffer = deque(maxlen=n_step)
            n_step_done_buffer = deque(maxlen=n_step)
            gamma_list = [0.99 ** i for i in range(n_step)]

            while not done:
                steps += 1
                total_steps += 1
                a_out = self.actor_network.sample_action(state, self.epsilon)
                action_index = a_out
                action = make_action(self.env, action_index)
                #action['attack'] = 1
                obs_prime, reward, done, info = self.env.step(action)
                total_reward += reward
                state_prime = converter(obs_prime)

                # local buffer add
                n_step_state_buffer.append(state)
                n_step_action_buffer.append(action_index)
                n_step_reward_buffer.append(reward)
                n_step_next_state_buffer.append(state_prime)
                n_step_done_buffer.append(done)
                n_rewards = sum([gamma * reward for gamma, reward in zip(gamma_list, n_step_reward_buffer)])
                n_step_n_rewards_buffer.append(n_rewards)

                if len(n_step_state_buffer) >= n_step:
                    # Once the n-step window is full, compute priorities and
                    # push the buffered transitions to the shared (global) memory.
                    for i in range(n_step):
                        self.append_sample(shared_memory, self.actor_network, self.actor_target_network, \
                                           n_step_state_buffer[i], \
                                           n_step_action_buffer[i], n_step_reward_buffer[i], \
                                           n_step_next_state_buffer[i], \
                                           n_step_done_buffer[i], \
                                           n_step_n_rewards_buffer[i])
                        if (n_step_done_buffer[i]):
                            break
                state = state_prime.float().cpu()
                if done:
                    break

            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                wandb.log({"rewards": total_reward})
                self.update_params(learner)

            #if (num_epi % 5 == 0 and num_epi != 0):
            #    print("actor network is updated ")

    def env_close(self):
        self.env.close()

    def update_params(self, learner):
        shared_network = ray.get(learner.get_network.remote())
        self.actor_network.load_state_dict(shared_network.state_dict())

    def append_sample(self, memory, model, target_model, state, action, reward, next_state, done, n_rewards):
        # Calculating priority (TD error)
        target = model(state.float()).data
        old_val = target[0][action].cpu()
        target_val = target_model(next_state.float()).data.cpu()
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + 0.99 * torch.max(target_val)

        error = abs(old_val - target[0][action])
        error = error.cpu()
        memory.add.remote(error, [state, action, reward, next_state, done, n_rewards])
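This Actor talks to the learner and the replay buffer through Ray handles (learner.get_network.remote(), memory.add.remote(...)), so it is meant to run as a Ray actor alongside them. A minimal launch sketch, assuming hypothetical Learner and Memory classes that expose those remote methods (neither is shown in this example):

import ray

ray.init()
learner = ray.remote(Learner).remote()            # hypothetical remote learner
shared_memory = ray.remote(Memory).remote(50000)  # hypothetical remote replay buffer

RemoteActor = ray.remote(Actor)
actors = [RemoteActor.remote(learner, actor_idx=i, epsilon=eps)
          for i, eps in enumerate([0.5, 0.3, 0.1])]
# Each actor explores with its own epsilon and pushes prioritized n-step
# transitions into the shared buffer (steps 1-6 in the comments above).
ray.get([actor.explore.remote(learner, shared_memory) for actor in actors])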
Example #3
class Actor:
    def __init__(self,
                 learner,
                 param_server,
                 actor_idx,
                 epsilon,
                 num_channels=3,
                 num_actions=19):
        # environment initialization
        import gym
        import minerl
        self.actor_idx = actor_idx
        self.env = gym.make(ENV_NAME)
        self.port_number = 12340 + actor_idx
        print("actor environment %d initialize successfully" % self.actor_idx)
        self.env.make_interactive(port=self.port_number, realtime=False)
        self.learner_state_dict = ray.get(learner.get_state_dict.remote())
        print("getting learner state dict finished...")
        # network initialization
        self.actor_network = DQN(num_channels, num_actions).cuda()
        self.actor_target_network = DQN(num_channels, num_actions).cuda()
        self.actor_network.load_state_dict(self.learner_state_dict)
        self.actor_target_network.load_state_dict(self.learner_state_dict)
        print("actor network %d initialize successfully" % self.actor_idx)

        self.param_server = param_server
        self.epi_counter = 0
        self.max_epi = 100
        self.n_step = 4
        self.update_period = 4
        self.gamma = 0.99

        # exploring info
        self.epsilon = epsilon
        self.endEpsilon = 0.01
        self.stepDrop = (self.epsilon - self.endEpsilon) / self.max_epi
        self.local_buffer_size = 100
        self.local_buffer = deque(maxlen=self.local_buffer_size)

        self.writer = SummaryWriter(f'runs/apex/actor{self.actor_idx}')

        # 1. Copy the network parameters from the learner
        # 2. Explore the environment (reset, act)
        # 3. Store transitions in the local buffer
        # 4. Compute priorities
        # 5. Push to the global buffer
        # 6. Periodically update the network

    def get_epi_counter(self):
        return self.epi_counter

    def update_params(self, learner):
        ray.get(self.param_server.pull_from_learner.remote(learner))
        policy_params, target_params = ray.get(
            self.param_server.push_to_actor.remote())
        self.actor_network.load_state_dict(policy_params)
        self.actor_target_network.load_state_dict(target_params)

    def append_sample(self,
                      memory,
                      state,
                      action,
                      reward,
                      next_state,
                      done,
                      n_rewards=None):
        # Calculating priority (TD error)
        target = self.actor_network(state).data
        old_val = target[0][action].cpu()
        target_val = self.actor_target_network(next_state).data.cpu()
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + 0.99 * torch.max(target_val)

        error = abs(old_val - target[0][action])
        error = error.cpu()
        state_ = state.cpu()
        next_state_ = next_state.cpu()

        if isinstance(memory, Memory):
            # local (in-process) replay memory
            if n_rewards is None:
                memory.add(error, [state_, action, reward, next_state_, done])
            else:
                memory.add(
                    error,
                    (state_, action, reward, next_state_, done, n_rewards))

        else:
            # Ray remote replay memory actor
            if n_rewards is None:
                memory.add.remote(error,
                                  [state_, action, reward, next_state_, done])
            else:
                memory.add.remote(
                    error,
                    (state_, action, reward, next_state_, done, n_rewards))

    def explore(self, learner, memory):
        for num_epi in range(self.max_epi):
            obs = self.env.reset()
            state = converter(ENV_NAME, obs).cuda()
            state = state.float()
            done = False
            total_reward = 0
            steps = 0
            total_steps = 0
            if (self.epsilon > self.endEpsilon):
                self.epsilon -= self.stepDrop

            # initialize the n-step transition buffers
            n_step = self.n_step
            n_step_state_buffer = deque(maxlen=n_step)
            n_step_action_buffer = deque(maxlen=n_step)
            n_step_reward_buffer = deque(maxlen=n_step)
            n_step_n_rewards_buffer = deque(maxlen=n_step)
            n_step_next_state_buffer = deque(maxlen=n_step)
            n_step_done_buffer = deque(maxlen=n_step)
            gamma_list = [self.gamma**i for i in range(n_step)]

            while not done:
                steps += 1
                total_steps += 1
                a_out = self.actor_network.sample_action(state, self.epsilon)
                action_index = a_out
                action = make_19action(self.env, action_index)
                obs_prime, reward, done, info = self.env.step(action)
                total_reward += reward
                state_prime = converter(ENV_NAME, obs_prime).cuda()

                # put transition in local buffer
                n_step_state_buffer.append(state)
                n_step_action_buffer.append(action_index)
                n_step_reward_buffer.append(reward)
                n_step_next_state_buffer.append(state_prime)
                n_step_done_buffer.append(done)
                n_rewards = sum([
                    gamma * reward
                    for gamma, reward in zip(gamma_list, n_step_reward_buffer)
                ])
                n_step_n_rewards_buffer.append(n_rewards)

                if (len(n_step_state_buffer) >= n_step):
                    # Compute Priorities
                    for i in range(n_step):
                        self.append_sample(memory, n_step_state_buffer[i],
                                           n_step_action_buffer[i],
                                           n_step_reward_buffer[i],
                                           n_step_next_state_buffer[i],
                                           n_step_done_buffer[i],
                                           n_step_n_rewards_buffer[i])
                        if (n_step_done_buffer[i]):
                            break
                state = state_prime
                self.actor_network.cuda()
                self.actor_target_network.cuda()

                if done:
                    print("%d episode is done" % num_epi)
                    print("total rewards : %d " % total_reward)
                    self.writer.add_scalar('Rewards/train', total_reward,
                                           num_epi)
                    self.epi_counter += 1
                    # Periodically pull fresh weights from the learner via the parameter server.
                    if num_epi % self.update_period == 0:
                        self.update_params(learner)
                    break
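update_params() above relies on a parameter-server actor exposing pull_from_learner and push_to_actor, which is not part of this example. A minimal sketch of such a Ray actor, assuming the learner also exposes a get_target_state_dict remote method for the target weights (only get_state_dict appears in the code above):

import ray

@ray.remote
class ParamServer:
    def __init__(self):
        self.policy_params = None
        self.target_params = None

    def pull_from_learner(self, learner):
        # Fetch and cache the latest weights from the learner actor.
        self.policy_params = ray.get(learner.get_state_dict.remote())
        self.target_params = ray.get(learner.get_target_state_dict.remote())

    def push_to_actor(self):
        # Hand the cached (policy, target) state dicts to the requesting actor.
        return self.policy_params, self.target_params

Caching the weights in the server keeps the exploration actors from all querying the learner directly on every update.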