import random

import gym
import numpy as np
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

# DRQN, EpisodeMemory, EpisodeBuffer, converter, make_6action and train
# are project-local helpers assumed to be importable from this package.


def run():
    model_name = "drqn_pomdp_random"
    env_name = "MineRLNavigateDense-v0"
    seed = 1

    env = gym.make(env_name)
    # env.make_interactive(realtime=False, port=6666)
    device = torch.device("cuda")

    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    writer = SummaryWriter('runs/' + env_name + "_" + model_name)

    # Hyperparameters
    batch_size = 2
    learning_rate = 1e-3
    memory_size = 50000        # unused here; EpisodeMemory below caps stored episodes at 20
    min_epi_num = 1            # start training once this many episodes are stored
    target_update_period = 2   # soft-update the target net every N steps
    eps_start = 0.1
    eps_end = 0.001
    eps_decay = 0.995
    tau = 1e-2                 # soft-update mixing coefficient
    random_update = True       # sample random sub-sequences from stored episodes
    n_step = 4                 # length of the sampled sub-sequences
    max_epi = 10000
    max_epi_len = 10000
    max_epi_step = 30000
    num_channels = 4
    batch_first = False

    policy_net = DRQN(num_channels=num_channels, num_actions=6,
                      batch_first=batch_first).to(device).float()
    target_net = DRQN(num_channels=num_channels, num_actions=6,
                      batch_first=batch_first).to(device).float()
    target_net.load_state_dict(policy_net.state_dict())

    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

    score = 0
    total_score = 0
    epsilon = eps_start

    # Transitions live on the CPU and are moved to the GPU only for
    # inference and training.
    memory_device = torch.device("cpu")
    memory = EpisodeMemory(random_update=random_update,
                           max_epi_num=20,
                           max_epi_len=max_epi_len,
                           batch_size=batch_size,
                           n_step=n_step)

    for e in range(max_epi):
        state = env.reset()
        obs = converter(env_name, state).to(memory_device)  # obs: [1, 4, 64, 64]
        done = False

        episode_record = EpisodeBuffer()
        hidden = policy_net.init_hidden_state(batch_first=batch_first,
                                              batch_size=batch_size,
                                              training=False)

        for t in range(max_epi_step):
            # Epsilon-greedy action from the recurrent policy; the hidden
            # state carries observation history across steps of the episode.
            action_index, hidden = policy_net.sample_action(obs.to(device),
                                                            epsilon, hidden)
            action = make_6action(env, action_index)

            s_prime, reward, done, info = env.step(action)
            obs_prime = converter(env_name, s_prime).to(memory_device)

            done_mask = 0.0 if done else 1.0  # 1.0 while the episode continues
            batch_action = torch.tensor([action_index]).unsqueeze(0).to(memory_device)
            batch_reward = torch.tensor([reward]).unsqueeze(0).to(memory_device)
            batch_done = torch.tensor([done_mask]).unsqueeze(0).to(memory_device)

            episode_record.put([obs, batch_action, batch_reward / 10.0,
                                obs_prime, batch_done])

            obs = obs_prime
            score += reward
            total_score += reward

            if len(memory) > min_epi_num:
                train(writer, policy_net, target_net, memory, optimizer,
                      batch_size, gamma=0.99)

                if (t + 1) % target_update_period == 0:
                    # Soft update: blend the policy weights into the target net.
                    for target_param, local_param in zip(target_net.parameters(),
                                                         policy_net.parameters()):
                        target_param.data.copy_(tau * local_param.data
                                                + (1.0 - tau) * target_param.data)

            if done:
                print(f"Score of # {e} episode : {score}")
                break

        memory.put(episode_record)
        epsilon = max(eps_end, epsilon * eps_decay)

        if e % 5 == 0:  # checkpoint every five episodes
            torch.save(policy_net, model_name + '.pth')

        writer.add_scalar('Rewards per episodes', score, e)
        score = 0

    writer.close()
    env.close()
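# run() delegates the gradient step to a train() helper that is not shown in
# this section. The sketch below is a plausible, hedged reconstruction, NOT
# the project's actual implementation. It assumes: memory.sample() returns
# (samples, seq_len) where samples is a list of batch_size dicts whose 'obs',
# 'acts', 'rews', 'next_obs' and 'done' entries are tensors stacked over time
# (shape [seq_len, ...]); the DRQN forward returns (q_values, hidden) with
# q_values shaped [seq_len, batch, num_actions] when batch_first=False; and
# init_hidden_state has the signature used in run() above.
import torch
import torch.nn.functional as F


def train(writer, policy_net, target_net, memory, optimizer,
          batch_size, gamma=0.99):
    samples, seq_len = memory.sample()  # assumed API, see note above
    device = next(policy_net.parameters()).device

    # Stack into time-major tensors: [seq_len, batch, ...] (batch_first=False).
    obs = torch.stack([s['obs'] for s in samples], dim=1).to(device)
    acts = torch.stack([s['acts'] for s in samples], dim=1).to(device)
    rews = torch.stack([s['rews'] for s in samples], dim=1).to(device)
    next_obs = torch.stack([s['next_obs'] for s in samples], dim=1).to(device)
    done_mask = torch.stack([s['done'] for s in samples], dim=1).to(device)

    # Fresh hidden states; sampled sub-sequences start "cold".
    h = policy_net.init_hidden_state(batch_first=False,
                                     batch_size=batch_size, training=True)
    h_t = target_net.init_hidden_state(batch_first=False,
                                       batch_size=batch_size, training=True)

    # Q(s, a) along the sequence for the actions actually taken.
    q_out, _ = policy_net(obs, h)
    q_a = q_out.gather(2, acts.long())

    # One-step bootstrapped targets from the frozen target network; done_mask
    # is 1.0 while the episode continues, so terminal steps do not bootstrap.
    with torch.no_grad():
        q_next, _ = target_net(next_obs, h_t)
        q_next_max = q_next.max(dim=2, keepdim=True)[0]
    target = rews + gamma * q_next_max * done_mask

    loss = F.smooth_l1_loss(q_a, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Crude update counter for TensorBoard logging.
    train.n_updates = getattr(train, "n_updates", 0) + 1
    writer.add_scalar('Training loss', loss.item(), train.n_updates)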
# `cf` is the project's config module (DEVICE, BATCH_SIZE, l_sequence,
# gamma, LEARNING_RATE).
class Agent:
    def __init__(self, env, n_input, n_output):
        self.env = env
        self.epsilon = 1.0
        self.epsilon_decay = 0  # counts action() calls before annealing starts
        self.net = DRQN(n_input, n_output).to(cf.DEVICE)
        self.tgt_net = DRQN(n_input, n_output).to(cf.DEVICE)
        self.optimizer = torch.optim.Adam(self.net.parameters(),
                                          lr=cf.LEARNING_RATE)

    def action(self, state, hidden):
        # Add batch and sequence dimensions: [obs] -> [1, 1, obs].
        state = state.unsqueeze(0).unsqueeze(0)
        # Greedy actions come from the online network, not the target network.
        q_value, hidden = self.net(state, hidden)
        _, action = torch.max(q_value, 2)

        self.epsilon_decay += 1
        self.update_epsilon()

        # Epsilon-greedy: the forward pass above still runs, so the recurrent
        # hidden state advances even when a random action is taken.
        if np.random.rand() <= self.epsilon:
            return self.env.action_space.sample(), hidden
        else:
            return action.item(), hidden

    def update_epsilon(self):
        # Hold epsilon at 1.0 for the first 1000 steps, then anneal
        # linearly down to 0.02.
        if self.epsilon_decay > 1000:
            self.epsilon = max(self.epsilon - 0.00005, 0.02)

    def update_tgt(self):
        self.tgt_net.load_state_dict(self.net.state_dict())

    def train_model(self, batch):
        current_states, rewards, actions, next_states, dones = batch

        states_v = torch.stack(current_states).view(
            cf.BATCH_SIZE, cf.l_sequence, self.net.n_input).to(cf.DEVICE)
        next_states_v = torch.stack(next_states).view(
            cf.BATCH_SIZE, cf.l_sequence, self.net.n_input).to(cf.DEVICE)
        actions_v = torch.stack(actions).view(
            cf.BATCH_SIZE, cf.l_sequence, -1).long().to(cf.DEVICE)
        rewards_v = torch.stack(rewards).view(
            cf.BATCH_SIZE, cf.l_sequence, -1).to(cf.DEVICE)
        # dones is expected to be a "not done" mask (1.0 while the episode
        # continues, 0.0 at the terminal step) so it zeroes out the bootstrap.
        dones_v = torch.stack(dones).view(
            cf.BATCH_SIZE, cf.l_sequence, -1).to(cf.DEVICE)

        # Q(s, a) for the actions actually taken, per step of the sequence.
        state_action_values, _ = self.net(states_v)
        state_action_values = state_action_values.gather(2, actions_v)

        # Bootstrapped targets from the frozen target network.
        next_state_values, _ = self.tgt_net(next_states_v)
        next_state_values = next_state_values.max(2, keepdim=True)[0].detach()

        expected_state_action_values = (dones_v * cf.gamma * next_state_values
                                        + rewards_v)

        loss = torch.nn.functional.mse_loss(state_action_values,
                                            expected_state_action_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
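# A hedged usage sketch for Agent, not part of the original code. It assumes
# a gym-style env whose observations are flat float vectors of size n_input,
# that the DRQN forward accepts hidden=None to initialise its own state, and
# a hypothetical SequenceReplay buffer whose put()/sample_sequences()/len()
# methods return cf.BATCH_SIZE sequences of length cf.l_sequence in the
# (states, rewards, actions, next_states, dones) order train_model() unpacks.
import torch


def train_agent(env, agent, buffer, episodes=500):
    for ep in range(episodes):
        obs = torch.as_tensor(env.reset(), dtype=torch.float32)
        hidden = None  # assumed: DRQN initialises its hidden state on None
        done, ep_reward = False, 0.0

        while not done:
            action, hidden = agent.action(obs.to(cf.DEVICE), hidden)
            next_obs, reward, done, _ = env.step(action)
            next_obs = torch.as_tensor(next_obs, dtype=torch.float32)

            # Store 1.0 while the episode continues so the Bellman backup
            # in train_model() is zeroed at terminal steps.
            buffer.put(obs, reward, action, next_obs, 0.0 if done else 1.0)

            obs = next_obs
            ep_reward += reward

            # Train once the buffer holds enough full sequences.
            if len(buffer) >= cf.BATCH_SIZE:
                agent.train_model(buffer.sample_sequences())

        if ep % 10 == 0:
            agent.update_tgt()  # periodically sync the target network
        print(f"episode {ep}: reward {ep_reward:.1f}")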