def __init__(self, observation_shape, num_actions, model_path=None, device='cuda:0',
             gamma=0.99, learning_rate=0.001, weight_decay=0.0, clip_gradient=True,
             optim_name='Adam', huber_loss=False):
    """Set up a DQN agent: load or build the Q-network and pick an optimizer.

    Args:
        observation_shape: shape of a single observation, forwarded to VoxelDQN.
        num_actions: size of the discrete action space.
        model_path: optional path to a pickled full model; when given it is
            loaded instead of constructing a fresh VoxelDQN.
        device: torch device string the model is moved to.
        gamma: discount factor kept for later TD computations.
        learning_rate: optimizer step size.
        weight_decay: L2 penalty for SGD/RMSProp (Adam deliberately skips it here;
            decay is presumably applied manually elsewhere — decoupled style).
        clip_gradient: whether gradient-norm clipping should be used later.
        optim_name: one of "SGD", "RMSProp", "Adam".
        huber_loss: whether to use the Huber loss for TD errors.
    """
    # Stash hyper-parameters for use by the training methods.
    self.num_actions = num_actions
    self.gamma = gamma
    self.device = device
    self.huber_loss = huber_loss
    self.clip_gradient = clip_gradient
    self.optim_name = optim_name
    self.weight_decay = weight_decay

    # Either resume from a serialized model or start from scratch.
    network = torch.load(model_path) if model_path is not None \
        else VoxelDQN(observation_shape, num_actions)
    self.model = network.to(device)

    # Optimizer factory keyed by name. NOTE: Adam intentionally gets no
    # weight_decay argument, mirroring the original behavior.
    builders = {
        "SGD": lambda params: optim.SGD(params, lr=learning_rate,
                                        weight_decay=weight_decay),
        "RMSProp": lambda params: optim.RMSprop(params, lr=learning_rate,
                                                weight_decay=weight_decay),
        "Adam": lambda params: optim.Adam(params, lr=learning_rate),
    }
    # As in the original, an unrecognized name leaves self.optimizer unset.
    if optim_name in builders:
        self.optimizer = builders[optim_name](self.model.parameters())
def __init__(self, observation_shape, num_actions, device='cuda:0', gamma=0.99,
             learning_rate=0.001, weight_decay=0.0, update_tar_interval=1000,
             clip_gradient=True, optim_name='Adam'):
    """Set up a double-DQN agent: online network, frozen target copy, optimizer.

    Args:
        observation_shape: shape of a single observation, forwarded to VoxelDQN.
        num_actions: size of the discrete action space.
        device: torch device string both networks are moved to.
        gamma: discount factor for TD targets.
        learning_rate: optimizer step size.
        weight_decay: L2 penalty for SGD/RMSProp (Adam deliberately omits it here).
        update_tar_interval: number of steps between target-network syncs.
        clip_gradient: whether gradient-norm clipping should be used later.
        optim_name: one of "SGD", "RMSProp", "Adam".
    """
    self.num_actions = num_actions
    self.gamma = gamma
    self.device = device
    self.clip_gradient = clip_gradient
    self.optim_name = optim_name
    self.weight_decay = weight_decay
    self.update_tar_interval = update_tar_interval

    # Online network plus a target network initialized to the same weights.
    self.model = VoxelDQN(observation_shape, num_actions).to(device)
    self.target_model = VoxelDQN(observation_shape, num_actions).to(device)
    self.target_model.load_state_dict(self.model.state_dict())

    if optim_name == "SGD":
        self.optimizer = optim.SGD(self.model.parameters(), lr=learning_rate,
                                   weight_decay=weight_decay)
    elif optim_name == "RMSProp":
        # BUG FIX: torch.optim's class is spelled RMSprop (lowercase "prop");
        # the original optim.RMSProp raised AttributeError on this branch.
        self.optimizer = optim.RMSprop(self.model.parameters(), lr=learning_rate,
                                       weight_decay=weight_decay)
    elif optim_name == "Adam":
        # Adam intentionally gets no weight_decay (decoupled decay elsewhere).
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
class DQNAgent:
    """Deep Q-Network agent with action masking.

    Holds the Q-network, selects epsilon-greedy masked actions, and performs
    one-step TD updates. Uses a single network (no target net) for both the
    online Q-values and the bootstrap target.
    """

    def __init__(self, observation_shape, num_actions, model_path=None, device='cuda:0',
                 gamma=0.99, learning_rate=0.001, weight_decay=0.0, clip_gradient=True,
                 optim_name='Adam', huber_loss=False):
        """Load or build the Q-network and construct the chosen optimizer.

        Args:
            observation_shape: shape of one observation, forwarded to VoxelDQN.
            num_actions: size of the discrete action space.
            model_path: optional path to a pickled full model to resume from.
            device: torch device string.
            gamma: discount factor for TD targets.
            learning_rate: optimizer step size.
            weight_decay: L2 penalty for SGD/RMSProp; for Adam it is applied
                manually in compute_td_loss (decoupled, AdamW-style).
            clip_gradient: clip gradient norm to 10 before each step.
            optim_name: one of "SGD", "RMSProp", "Adam".
            huber_loss: use Huber loss instead of MSE for the TD error.
        """
        self.num_actions = num_actions
        self.gamma = gamma
        self.device = device
        self.huber_loss = huber_loss
        self.clip_gradient = clip_gradient
        self.optim_name = optim_name
        self.weight_decay = weight_decay
        if model_path is not None:
            self.model = torch.load(model_path).to(device)
        else:
            self.model = VoxelDQN(observation_shape, num_actions).to(device)
        if optim_name == "SGD":
            self.optimizer = optim.SGD(self.model.parameters(), lr=learning_rate,
                                       weight_decay=weight_decay)
        elif optim_name == "RMSProp":
            self.optimizer = optim.RMSprop(self.model.parameters(), lr=learning_rate,
                                           weight_decay=weight_decay)
        elif optim_name == "Adam":
            # No weight_decay here on purpose — see compute_td_loss.
            self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def act(self, state, mask, epsilon=0.05):
        """Pick an action epsilon-greedily among the actions allowed by `mask`.

        Args:
            state: a single observation (array-like, no batch dimension).
            mask: per-action validity mask; multiplied into the softmaxed
                Q-values for the greedy branch and used as a boolean index for
                the random branch (assumed boolean-compatible — TODO confirm).
            epsilon: probability of taking a uniformly random valid action.

        Returns:
            int: the chosen action index.
        """
        if random.random() > epsilon:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0).to(self.device)
            mask = torch.tensor(mask, dtype=torch.float).to(self.device)
            q_value = self.model(state)
            # Softmax makes every entry positive so that multiplying by the
            # mask reliably zeroes out invalid actions before the argmax.
            # FIX: pass dim explicitly — implicit-dim softmax is deprecated.
            q_value = F.softmax(q_value, dim=1)
            q_value *= mask
            action = q_value.max(1)[1].item()
            # print("Action: ", action)
        else:
            action = np.random.choice(np.arange(self.num_actions)[mask])
            # print("Action: ", action, "(random)")
        return action

    def compute_td_loss(self, state, action, reward, next_state, done, mask):
        """One gradient step on the one-step TD error for a batch.

        Args:
            state, action, reward, next_state, done: batched tensors.
            mask: validity mask applied to the next-state Q-values so the
                bootstrap max only ranges over allowed actions.

        Returns:
            float: the scalar loss value for logging.
        """
        state = state.to(self.device)
        action = action.to(self.device)
        reward = reward.to(self.device)
        next_state = next_state.to(self.device)
        done = done.to(self.device)
        mask = mask.to(self.device)

        q_values = self.model(state)
        next_q_values = self.model(next_state)
        next_q_values *= mask

        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_values.max(1)[0]
        # Target is detached below, so no gradient flows through the bootstrap.
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        if self.huber_loss:
            loss = huber_loss(q_value, expected_q_value.detach(), delta=10.0)
        else:
            loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip_gradient:
            nn.utils.clip_grad_norm_(self.model.parameters(), 10)
        if self.optim_name == 'Adam':
            # Decoupled (AdamW-style) weight decay applied manually, since the
            # Adam optimizer was built without weight_decay.
            # FIX: the old two-positional Tensor.add(scalar, tensor) overload is
            # deprecated/removed; use the alpha= keyword form instead.
            for group in self.optimizer.param_groups:
                for param in group['params']:
                    param.data = param.data.add(
                        param.data, alpha=-self.weight_decay * group['lr'])
        self.optimizer.step()
        return loss.item()

    def load_weights(self, model_path):
        """Load a state dict from `model_path` into the model; no-op if None."""
        if model_path is None:
            return
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, output, tag=''):
        """Save the model's state dict to '<output>/model_<tag>.pkl'."""
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, tag))