import numpy as np
import torch
import torch.nn.functional as F

# ConvDQN, DQN and BasicBuffer are assumed to be defined/imported elsewhere.


class DoubleDQNAgent:

    def __init__(self, env, use_conv=True, learning_rate=3e-4, gamma=0.99,
                 tau=0.01, buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model1 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
            self.model2 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
        else:
            self.model1 = DQN(env.observation_space.shape,
                              env.action_space.n).to(self.device)
            self.model2 = DQN(env.observation_space.shape,
                              env.action_space.n).to(self.device)

        self.optimizer1 = torch.optim.Adam(self.model1.parameters())
        self.optimizer2 = torch.optim.Adam(self.model2.parameters())

    def get_action(self, state, eps=0.20):
        # Epsilon-greedy exploration: np.random.random() draws uniformly from
        # [0, 1), which is what the comparison against eps requires
        # (np.random.randn() would sample a standard normal instead).
        if np.random.random() < eps:
            return self.env.action_space.sample()

        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        qvals = self.model1.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())
        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        # resize tensors so every term of the target has shape (batch_size, 1)
        actions = actions.view(actions.size(0), 1)
        rewards = rewards.view(rewards.size(0), 1)
        dones = dones.view(dones.size(0), 1)

        # current Q estimates for the actions actually taken
        curr_Q1 = self.model1.forward(states).gather(1, actions)
        curr_Q2 = self.model2.forward(states).gather(1, actions)

        # clipped double-Q target: element-wise minimum of the two networks'
        # maximum next-state values, masked out for terminal transitions
        next_Q1 = self.model1.forward(next_states)
        next_Q2 = self.model2.forward(next_states)
        next_Q = torch.min(torch.max(next_Q1, 1)[0],
                           torch.max(next_Q2, 1)[0])
        next_Q = next_Q.view(next_Q.size(0), 1)
        expected_Q = rewards + (1 - dones) * self.gamma * next_Q

        loss1 = F.mse_loss(curr_Q1, expected_Q.detach())
        loss2 = F.mse_loss(curr_Q2, expected_Q.detach())

        return loss1, loss2

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss1, loss2 = self.compute_loss(batch)

        self.optimizer1.zero_grad()
        loss1.backward()
        self.optimizer1.step()

        self.optimizer2.zero_grad()
        loss2.backward()
        self.optimizer2.step()
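# A minimal training-loop sketch for DoubleDQNAgent. It assumes the classic
# Gym API (env.reset() returns an observation, env.step() returns a 4-tuple)
# and that BasicBuffer exposes push() and __len__(); the environment name,
# episode count and batch size below are illustrative, not part of the agent.
import gym

env = gym.make("CartPole-v1")
agent = DoubleDQNAgent(env, use_conv=False)  # flat observations -> MLP DQN

max_episodes = 300
batch_size = 32

for episode in range(max_episodes):
    state = env.reset()
    episode_reward = 0.0
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)
        state = next_state
        episode_reward += reward
    print(f"Episode {episode}: return {episode_reward}")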
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# ReplayBuffer and the checkpoint-aware DQN network are assumed to be
# defined/imported elsewhere.


class DQNAgent(object):

    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=0.9999, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn',
                 device='cuda:0'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.device = device

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        # Create policy and target DQN models
        self.policy = DQN(self.n_actions, input_dims=self.input_dims,
                          name=self.env_name + '_' + 'policy',
                          chkpt_dir=self.chkpt_dir)
        self.target = DQN(self.n_actions, input_dims=self.input_dims,
                          name=self.env_name + '_' + 'target',
                          chkpt_dir=self.chkpt_dir)

        # Put on correct device (GPU or CPU)
        self.policy.to(device)
        self.target.to(device)

        # Optimizer
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

        # Loss
        self.loss = nn.MSELoss()

    def choose_action(self, observation):
        # Choose an action epsilon-greedily from the policy network
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation],
                                 dtype=torch.float).to(self.device)
            actions = self.policy.forward(state)
            action = torch.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = torch.tensor(state).to(self.device)
        rewards = torch.tensor(reward).to(self.device)
        dones = torch.tensor(done).to(self.device)
        actions = torch.tensor(action).to(self.device)
        states_ = torch.tensor(new_state).to(self.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        # Periodically copy the policy network's weights into the target network
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.target.load_state_dict(self.policy.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon *= self.eps_dec

    def save_models(self):
        self.policy.save_checkpoint()

    def load_models(self):
        self.policy.load_checkpoint()

    def learn(self):
        # Wait until the buffer holds at least one full batch of transitions
        if self.memory.mem_cntr < self.batch_size:
            return

        self.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        # Q-values of the actions actually taken, and the bootstrapped target
        # from the target network, zeroed out for terminal states
        q_pred = self.policy.forward(states)[indices, actions]
        q_next = self.target.forward(states_).max(dim=1)[0]
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next

        loss = self.loss(q_target, q_pred).to(self.device)
        loss.backward()
        self.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
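# A rough usage sketch for this agent: construct it from a Gym environment,
# run episodes, and checkpoint whenever the episode return improves. The
# environment, hyperparameters and episode count are illustrative assumptions,
# as is the classic (pre-0.26) Gym reset/step API.
import gym
import torch

env = gym.make('CartPole-v1')
agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                 n_actions=env.action_space.n,
                 input_dims=env.observation_space.shape,
                 mem_size=50000, batch_size=32,
                 eps_min=0.01, eps_dec=0.9999, replace=1000,
                 algo='DQNAgent', env_name='CartPole-v1',
                 chkpt_dir='tmp/dqn',
                 device='cuda:0' if torch.cuda.is_available() else 'cpu')

best_score = -float('inf')
for episode in range(500):
    observation = env.reset()
    done = False
    score = 0.0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, _ = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()          # no-op until the buffer holds one full batch
        observation = observation_
        score += reward
    if score > best_score:     # save the best-performing policy so far
        best_score = score
        agent.save_models()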
import numpy as np
import torch
import torch.nn as nn

# ConvDQN, DQN and BasicBuffer are assumed to be defined/imported elsewhere.


class DQNAgent:

    def __init__(self, env, use_conv=True, learning_rate=3e-4, gamma=0.99,
                 buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        self.use_conv = use_conv
        if self.use_conv:
            self.model = ConvDQN(env.observation_space.shape,
                                 env.action_space.n).to(self.device)
        else:
            self.model = DQN(env.observation_space.shape,
                             env.action_space.n).to(self.device)

        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()

    def get_action(self, state, eps=0.20):
        # Epsilon-greedy exploration: with probability eps take a random
        # action, otherwise act greedily with respect to the Q-network
        if np.random.random() < eps:
            return self.env.action_space.sample()

        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())
        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        # current Q-values for the actions that were actually taken
        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1))
        curr_Q = curr_Q.squeeze(1)

        # bootstrapped target: max_a' Q(s', a'), masked for terminal states;
        # rewards carry an extra trailing dimension from the buffer, hence
        # the squeeze
        next_Q = self.model.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        expected_Q = rewards.squeeze(1) + (1 - dones) * self.gamma * max_next_Q

        loss = self.MSE_loss(curr_Q, expected_Q.detach())
        return loss

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
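# An illustrative greedy-evaluation helper (not part of the original class):
# after training, roll out the policy with eps=0.0 to measure performance,
# again assuming the classic Gym reset/step API used above.
def evaluate(agent, n_episodes=10):
    """Run n_episodes greedy rollouts and return the mean episode return."""
    returns = []
    for _ in range(n_episodes):
        state = agent.env.reset()
        done, total = False, 0.0
        while not done:
            action = agent.get_action(state, eps=0.0)  # purely greedy w.r.t. Q
            state, reward, done, _ = agent.env.step(action)
            total += reward
        returns.append(total)
    return sum(returns) / len(returns)

# e.g. after a training run: print(evaluate(agent))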