Code example #1: a DRQN training loop for the MineRL MineRLNavigateDense-v0 environment, with epsilon-greedy exploration, an episodic replay memory, and soft target-network updates.

import random

import gym
import minerl  # noqa: F401 -- importing minerl registers the MineRL environments with gym
import numpy as np
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

# DRQN, EpisodeMemory, EpisodeBuffer, converter, make_6action and train are
# project-specific helpers assumed to be importable from the surrounding code base.


def run():
    model_name = "drqn_pomdp_random"
    env_name = "MineRLNavigateDense-v0"
    seed = 1

    env = gym.make(env_name)
    #env.make_interactive(realtime=False, port=6666)

    device = torch.device("cuda")
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # seed PyTorch as well for reproducibility
    writer = SummaryWriter('runs/' + env_name + "_" + model_name)

    # Training hyperparameters
    batch_size = 2
    learning_rate = 1e-3
    memory_size = 50000  # (unused below; the episode buffer is capped by max_epi_num instead)
    min_epi_num = 1  # start training once at least this many episodes are stored
    target_update_period = 2  # environment steps between soft target-network updates

    # Epsilon-greedy exploration schedule and soft-update coefficient
    eps_start = 0.1
    eps_end = 0.001
    eps_decay = 0.995
    tau = 1e-2

    random_update = True  # sample random sub-sequences from stored episodes
    n_step = 4
    max_epi = 10000  # number of training episodes
    max_epi_len = 10000  # maximum stored length of a single episode
    max_epi_step = 30000  # step limit per episode

    num_channels = 4  # converter() produces observations of shape [1, 4, 64, 64]
    num_actions = 6  # discrete action set built by make_6action()
    batch_first = False
    policy_net = DRQN(num_channels=num_channels, num_actions=num_actions,
                      batch_first=batch_first).to(device).float()
    target_net = DRQN(num_channels=num_channels, num_actions=num_actions,
                      batch_first=batch_first).to(device).float()
    target_net.load_state_dict(policy_net.state_dict())
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

    score = 0
    total_score = 0

    epsilon = eps_start

    # Replay data is kept on the CPU; observations are moved to the GPU only
    # when the networks are queried.
    memory_device = torch.device("cpu")

    memory = EpisodeMemory(random_update=random_update,
                           max_epi_num=20,  # the buffer holds at most 20 episodes
                           max_epi_len=max_epi_len,
                           batch_size=batch_size,
                           n_step=n_step)

    for e in range(max_epi):
        state = env.reset()
        obs = converter(env_name, state).to(memory_device)  # obs: [1, 4, 64, 64]
        done = False

        episode_record = EpisodeBuffer()
        hidden = policy_net.init_hidden_state(batch_first=batch_first,
                                              batch_size=batch_size,
                                              training=False)
        for t in range(max_epi_step):
            # Epsilon-greedy action from the recurrent policy network.
            action_index, hidden = policy_net.sample_action(
                obs.to(device), epsilon, hidden)
            action = make_6action(env, action_index)
            s_prime, reward, done, info = env.step(action)
            obs_prime = converter(env_name, s_prime).to(memory_device)
            done_mask = 0.0 if done else 1.0  # 1 while running, 0 at termination

            batch_action = torch.tensor([action_index]).unsqueeze(0).to(memory_device)
            batch_reward = torch.tensor([reward]).unsqueeze(0).to(memory_device)
            batch_done = torch.tensor([done_mask]).unsqueeze(0).to(memory_device)
            # Rewards are scaled by 1/10 before being stored.
            episode_record.put(
                [obs, batch_action, batch_reward / 10.0, obs_prime, batch_done])
            obs = obs_prime
            score += reward
            total_score += reward

            if len(memory) > min_epi_num:
                train(writer,
                      policy_net,
                      target_net,
                      memory,
                      optimizer,
                      batch_size,
                      gamma=0.99)

                # Soft update: target <- tau * policy + (1 - tau) * target
                if (t + 1) % target_update_period == 0:
                    for target_param, local_param in zip(
                            target_net.parameters(), policy_net.parameters()):
                        target_param.data.copy_(tau * local_param.data +
                                                (1.0 - tau) * target_param.data)

            if done:
                print(f"Score of # {e} episode : {score}")
                break
        memory.put(episode_record)
        epsilon = max(eps_end, epsilon * eps_decay)

        if e % 5 == 0:  # checkpoint the policy network every 5 episodes
            torch.save(policy_net, model_name + '.pth')
        writer.add_scalar('Rewards per episodes', score, e)
        score = 0

    writer.close()
    env.close()
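
The training loop above only uses the DRQN model through three entry points: the constructor, init_hidden_state(), and sample_action(). As a rough illustration of that assumed interface (not the project's actual architecture), a minimal sketch could look like the following; the convolutional encoder, the hidden size of 128, and the layer shapes are placeholders chosen to match 4x64x64 observations and 6 actions.

import random

import torch
import torch.nn as nn


class DRQN(nn.Module):
    """Minimal interface sketch: CNN encoder -> LSTM -> Q-value head."""

    def __init__(self, num_channels=4, num_actions=6, batch_first=False,
                 hidden_size=128):
        super().__init__()
        self.num_actions = num_actions
        self.batch_first = batch_first
        self.hidden_size = hidden_size
        self.encoder = nn.Sequential(
            nn.Conv2d(num_channels, 16, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2), nn.ReLU(),
            nn.Flatten())  # 4x64x64 input -> 32x6x6 feature map
        self.lstm = nn.LSTM(32 * 6 * 6, hidden_size, batch_first=batch_first)
        self.head = nn.Linear(hidden_size, num_actions)

    def init_hidden_state(self, batch_first=False, batch_size=1, training=False):
        # One (h, c) pair per sequence during training, a single pair while acting.
        # (batch_first does not affect the hidden-state layout in PyTorch.)
        n = batch_size if training else 1
        device = next(self.parameters()).device
        return (torch.zeros(1, n, self.hidden_size, device=device),
                torch.zeros(1, n, self.hidden_size, device=device))

    def forward(self, obs, hidden):
        # obs: [batch, channels, H, W] for a single environment step.
        feat = self.encoder(obs)
        feat = feat.unsqueeze(1) if self.batch_first else feat.unsqueeze(0)
        q, hidden = self.lstm(feat, hidden)
        return self.head(q), hidden

    def sample_action(self, obs, epsilon, hidden):
        # Epsilon-greedy selection; the hidden state is advanced either way.
        q, hidden = self.forward(obs, hidden)
        if random.random() < epsilon:
            return random.randint(0, self.num_actions - 1), hidden
        return q.argmax(dim=-1).item(), hidden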
Code example #2: a DRQN Agent class with epsilon-greedy action selection, a hard-updated target network, and a TD training step over fixed-length sequences of transitions.

import numpy as np
import torch

# DRQN and the configuration module cf (providing DEVICE, LEARNING_RATE,
# BATCH_SIZE, l_sequence and gamma) are assumed to come from the project.


class Agent:
    def __init__(self, env, n_input, n_output):
        self.env = env
        self.epsilon = 1.0
        self.epsilon_decay = 0  # step counter used to delay the start of epsilon decay
        self.net = DRQN(n_input, n_output).to(cf.DEVICE)      # online network
        self.tgt_net = DRQN(n_input, n_output).to(cf.DEVICE)  # target network
        self.optimizer = torch.optim.Adam(self.net.parameters(),
                                          lr=cf.LEARNING_RATE)

    def action(self, state, hidden):
        # Epsilon-greedy action selection; the recurrent hidden state is
        # advanced even when a random action is taken. Note that Q-values are
        # read from the target network here.
        state = state.unsqueeze(0).unsqueeze(0)  # -> [batch=1, seq=1, n_input]
        with torch.no_grad():
            q_value, hidden = self.tgt_net(state, hidden)
        _, action = torch.max(q_value, 2)
        self.epsilon_decay += 1
        self.update_epsilon()
        if np.random.rand() <= self.epsilon:
            return self.env.action_space.sample(), hidden
        else:
            return action.item(), hidden

    def update_epsilon(self):
        # Start decaying epsilon only after the first 1000 action selections,
        # then anneal linearly down to a floor of 0.02.
        if self.epsilon_decay > 1000:
            self.epsilon = max(self.epsilon - 0.00005, 0.02)

    def update_tgt(self):
        # Hard update: copy the online network's weights into the target network.
        self.tgt_net.load_state_dict(self.net.state_dict())

    def train_model(self, batch):
        current_states, rewards, actions, next_states, dones = batch

        # Reshape the sampled transitions into [batch, sequence, feature] tensors;
        # everything is moved to cf.DEVICE so inputs and targets live on one device.
        states_v = torch.stack(current_states).view(
            cf.BATCH_SIZE, cf.l_sequence, self.net.n_input).to(cf.DEVICE)
        next_states_v = torch.stack(next_states).view(
            cf.BATCH_SIZE, cf.l_sequence, self.net.n_input).to(cf.DEVICE)
        actions_v = torch.stack(actions).view(
            cf.BATCH_SIZE, cf.l_sequence, -1).long().to(cf.DEVICE)
        rewards_v = torch.stack(rewards).view(
            cf.BATCH_SIZE, cf.l_sequence, -1).to(cf.DEVICE)
        dones_v = torch.stack(dones).view(
            cf.BATCH_SIZE, cf.l_sequence, -1).to(cf.DEVICE)

        # Q(s, a) for the actions actually taken, from the online network.
        state_action_values, _ = self.net(states_v)
        state_action_values = state_action_values.gather(2, actions_v)

        # Bootstrapped targets from the target network (no gradient flows back).
        next_state_values, _ = self.tgt_net(next_states_v)
        next_state_values = next_state_values.max(2, keepdim=True)[0].detach()

        # dones_v is assumed to be a non-terminal mask (1.0 while the episode
        # is running, 0.0 at termination), so terminal steps use only the reward.
        expected_state_action_values = (dones_v * cf.gamma * next_state_values
                                        + rewards_v)
        loss = torch.nn.functional.mse_loss(state_action_values,
                                            expected_state_action_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
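
train_model() expects batch to unpack into five lists (current_states, rewards, actions, next_states, dones) whose stacked tensors can be viewed as [cf.BATCH_SIZE, cf.l_sequence, ...]. The buffer below is an illustrative sketch of one way to produce that layout by sampling contiguous sub-sequences from stored episodes; the class name SequenceReplayBuffer, its max_episodes argument, and the episode storage format are assumptions, not part of the original code.

import random
from collections import deque


class SequenceReplayBuffer:
    """Illustrative buffer: stores whole episodes and samples contiguous
    sub-sequences of length cf.l_sequence in the layout that
    Agent.train_model() unpacks. cf is the same config module used above."""

    def __init__(self, max_episodes=1000):
        self.episodes = deque(maxlen=max_episodes)

    def add_episode(self, transitions):
        # `transitions` is a list of (state, reward, action, next_state, done_mask)
        # tensor tuples, one per environment step.
        if len(transitions) >= cf.l_sequence:
            self.episodes.append(transitions)

    def __len__(self):
        return len(self.episodes)

    def sample(self):
        states, rewards, actions, next_states, dones = [], [], [], [], []
        for episode in random.sample(list(self.episodes), cf.BATCH_SIZE):
            start = random.randint(0, len(episode) - cf.l_sequence)
            for s, r, a, s2, d in episode[start:start + cf.l_sequence]:
                states.append(s)
                rewards.append(r)
                actions.append(a)
                next_states.append(s2)
                dones.append(d)
        # Each list now holds BATCH_SIZE * l_sequence per-step tensors, so
        # torch.stack(...).view(cf.BATCH_SIZE, cf.l_sequence, -1) in
        # train_model() groups them back into per-sequence rows.
        return states, rewards, actions, next_states, dones

With such a buffer, agent.train_model(buffer.sample()) can be called once enough episodes have been collected, and agent.update_tgt() invoked periodically to refresh the target network.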