class DropDQNAgent(nn.Module):
    """DQN agent whose Q-network is an MC-Dropout network.

    Predictive uncertainty over Q-values comes from ``n_sample`` stochastic
    forward passes; the TD target is fit with a Gaussian negative
    log-likelihood of the (mean, uncertainty) pair returned by
    ``MCDropout.sample``.

    NOTE(review): ``train`` shadows ``nn.Module.train(mode)``; kept as-is
    to preserve the public interface — confirm callers expect this.
    """

    def __init__(self, s_dim, a_dim, h_dim, h_act=nn.ReLU,
                 buffer_size=100000, batch_size=32, lr=1e-4,
                 gamma=0.95, theta=0.01, dropout=0.5, weight_decay=0.1,
                 noise_level=None, n_sample=5, *args, **kwargs):
        super(DropDQNAgent, self).__init__()
        # Online net and a target clone; the clone only moves via soft updates.
        self.q_net = MCDropout(in_dim=s_dim, o_dim=a_dim, h_dim=h_dim,
                               h_act=h_act, dropout=dropout,
                               noise_level=noise_level)
        self.target_net = MCDropout(in_dim=s_dim, o_dim=a_dim, h_dim=h_dim,
                                    h_act=h_act, dropout=dropout,
                                    noise_level=noise_level, agent=False)
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size
        self.optimizer = Adam(self.q_net.parameters(), lr=lr,
                              weight_decay=weight_decay)
        self.gamma = gamma
        self.theta = theta
        self.noise_level = noise_level
        self.n_sample = n_sample
        self.a_dim = a_dim

    # T.B.U.
    def forward(self, x):
        """MC-sampled output in training mode, one deterministic pass otherwise."""
        if self.training:
            self.q_net.agent = True
            return self.q_net.sample(x, self.n_sample)
        self.q_net.agent = False
        return self.q_net(x)

    def save_memory(self, ex):
        """Push one experience tuple into the replay buffer."""
        self.buffer.push(ex)

    # T.B.U.
    def train(self, k=1, max_norm=None):
        """Run ``k`` gradient steps on replayed batches; return the mean loss.

        ``max_norm``, when given, enables gradient-norm clipping.
        """
        loss_hist = []
        self.q_net.agent = True
        for _ in range(k):
            s, a, r, t, mask = get_batch(self.buffer.sample(self.batch_size))

            # Bootstrapped TD target from the target network.
            next_mu, _ = self.target_net(t)
            best_next = next_mu.max(-1, keepdim=True)[0]
            target = r + self.gamma * mask * best_next.detach()

            # Gaussian NLL of the target under the predictive distribution
            # of the action actually taken.
            mu, sigma = self.q_net.sample(s, self.n_sample)
            taken_mu = mu.gather(-1, a)
            taken_sigma = sigma.gather(-1, a)
            loss = -D.Normal(taken_mu, taken_sigma).log_prob(target).mean()

            self.optimizer.zero_grad()
            loss.backward()
            if max_norm is not None:
                clip_grad_norm_(self.q_net.parameters(), max_norm)
            self.optimizer.step()
            loss_hist.append(loss.item())

        self.target_update()
        return np.mean(loss_hist)

    def train_start(self):
        """True once the buffer can serve a full batch."""
        return len(self.buffer) >= self.batch_size

    def target_update(self):
        """Polyak-average online weights into the target network."""
        for tgt, src in zip(self.target_net.parameters(),
                            self.q_net.parameters()):
            tgt.data.mul_(1 - self.theta).add_(self.theta * src.data)
class B3DQNAgent(nn.Module):
    """DQN agent with a Bayes-by-backprop (``BayesNet``) Q-network.

    Training minimizes a weighted ELBO: ``lamb * KL-surrogate`` (variational
    posterior minus prior log-probabilities) minus the Gaussian
    log-likelihood of the TD target.

    NOTE(review): ``train`` shadows ``nn.Module.train(mode)``; kept as-is
    to preserve the public interface.
    """

    def __init__(self, s_dim, a_dim, h_dim, h_act=nn.ReLU,
                 buffer_size=100000, batch_size=32, lr=1e-4,
                 gamma=0.95, theta=0.01, noise_level=None,
                 n_sample=5, *args, **kwargs):
        super(B3DQNAgent, self).__init__()
        self.q_net = BayesNet(in_dim=s_dim, o_dim=a_dim, h_dim=h_dim,
                              h_act=h_act, noise_level=noise_level)
        self.target_net = BayesNet(in_dim=s_dim, o_dim=a_dim, h_dim=h_dim,
                                   h_act=h_act, noise_level=noise_level)
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size
        self.optimizer = Adam(self.q_net.parameters(), lr=lr)
        self.gamma = gamma
        self.theta = theta
        self.noise_level = noise_level
        self.n_sample = n_sample
        self.a_dim = a_dim

    def forward(self, x):
        """Weight-sampled output in training mode, mean forward otherwise."""
        if self.training:
            self.q_net.train()
            return self.q_net.sample(x, self.n_sample)
        self.q_net.eval()
        return self.q_net(x)

    def save_memory(self, ex):
        """Push one experience tuple into the replay buffer."""
        self.buffer.push(ex)

    def train(self, lamb, k=1, max_norm=None):
        """Run ``k`` variational updates; ``lamb`` weights the KL term.

        Returns the mean loss over the ``k`` updates.
        """
        history = []
        self.q_net.train()
        for _ in range(k):
            s, a, r, t, mask = get_batch(self.buffer.sample(self.batch_size))

            # Target net runs in eval mode so the TD target is deterministic.
            self.target_net.eval()
            next_mu, _ = self.target_net(t)
            best_next = next_mu.max(-1, keepdim=True)[0]
            target = r + self.gamma * mask * best_next.detach()

            # Sample weights, score the taken action under N(mu, sigma).
            preds, uns, log_prior, log_var_post = self.q_net.sample(
                s, self.n_sample, True)
            taken_mu = preds.gather(-1, a)
            taken_sigma = uns.gather(-1, a)
            ll = D.Normal(taken_mu, taken_sigma).log_prob(target).mean()
            loss = lamb * (log_var_post - log_prior) - ll

            self.optimizer.zero_grad()
            loss.backward()
            if max_norm is not None:
                clip_grad_norm_(self.q_net.parameters(), max_norm)
            self.optimizer.step()
            history.append(loss.item())

        self.target_update()
        return np.mean(history)

    def train_start(self):
        """True once the buffer can serve a full batch."""
        return len(self.buffer) >= self.batch_size

    def target_update(self):
        """Polyak-average online weights into the target network."""
        # Original switches the target net back to train mode first; preserved.
        self.target_net.train()
        for tgt, src in zip(self.target_net.parameters(),
                            self.q_net.parameters()):
            tgt.data.mul_(1 - self.theta).add_(self.theta * src.data)
class DQNAgent(nn.Module):
    """Vanilla DQN agent: MLP Q-network, replay buffer, soft target updates.

    NOTE(review): ``train`` shadows ``nn.Module.train(mode)``; kept as-is
    to preserve the public interface.
    """

    def __init__(self, s_dim, a_dim, h_dim, h_act=nn.ReLU,
                 buffer_size=100000, batch_size=32, lr=1e-4,
                 gamma=0.95, theta=0.01, *args, **kwargs):
        super(DQNAgent, self).__init__()
        self.q_net = SimpleMLP(in_dim=s_dim, o_dim=a_dim,
                               h_dim=h_dim, h_act=h_act)
        self.target_net = SimpleMLP(in_dim=s_dim, o_dim=a_dim,
                                    h_dim=h_dim, h_act=h_act)
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size
        self.optimizer = Adam(self.q_net.parameters(), lr=lr)
        self.gamma = gamma
        self.theta = theta
        self.a_dim = a_dim

    def forward(self, x):
        """Q-values for state(s) ``x`` from the online network."""
        return self.q_net(x)

    def save_memory(self, ex):
        """Push one experience tuple into the replay buffer."""
        self.buffer.push(ex)

    def train(self, k=1, max_norm=None):
        """Run ``k`` MSE TD-updates on replayed batches; return the mean loss."""
        history = []
        for _ in range(k):
            s, a, r, t, mask = get_batch(self.buffer.sample(self.batch_size))
            best_next = self.target_net(t).max(-1, keepdim=True)[0]
            target = r + self.gamma * mask * best_next.detach()
            q_taken = self.q_net(s).gather(-1, a)
            loss = F.mse_loss(q_taken, target)

            self.optimizer.zero_grad()
            loss.backward()
            if max_norm is not None:
                clip_grad_norm_(self.q_net.parameters(), max_norm)
            self.optimizer.step()
            history.append(loss.item())

        self.target_update()
        return np.mean(history)

    def train_start(self):
        """True once the buffer can serve a full batch."""
        return len(self.buffer) >= self.batch_size

    def target_update(self):
        """Polyak-average online weights into the target network."""
        for tgt, src in zip(self.target_net.parameters(),
                            self.q_net.parameters()):
            tgt.data.mul_(1 - self.theta).add_(self.theta * src.data)
class BootDQNAgent(nn.Module):
    """Bootstrapped DQN agent with ``n_model`` independent Q-heads.

    In training mode ``forward`` uses the externally selected
    ``current_head``; in eval mode it returns all heads' outputs
    concatenated along dim 0. Each head trains on its own replay batch
    (implicit bootstrapping).

    NOTE(review): ``current_head`` starts as ``None``; the caller must set
    it before a training-mode forward. ``train`` shadows
    ``nn.Module.train(mode)``; kept to preserve the public interface.
    """

    def __init__(self, s_dim, a_dim, h_dim, h_act=nn.ReLU,
                 buffer_size=100000, batch_size=32, lr=1e-4,
                 gamma=0.95, theta=0.01, n_model=5, *args, **kwargs):
        super(BootDQNAgent, self).__init__()
        online = [SimpleMLP(in_dim=s_dim, o_dim=a_dim, h_dim=h_dim,
                            h_act=h_act) for _ in range(n_model)]
        frozen = [SimpleMLP(in_dim=s_dim, o_dim=a_dim, h_dim=h_dim,
                            h_act=h_act) for _ in range(n_model)]
        self.q_nets = nn.ModuleList(online)
        self.target_nets = nn.ModuleList(frozen)
        self.target_nets.load_state_dict(self.q_nets.state_dict())
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size
        # One optimizer per head so heads update independently.
        self.optimizers = [Adam(net.parameters(), lr=lr)
                           for net in self.q_nets]
        self.gamma = gamma
        self.theta = theta
        self.n_model = n_model
        self.current_head = None
        self.a_dim = a_dim

    def forward(self, x):
        """Selected head in training mode; all heads (dim-0 concat) in eval."""
        if self.training:
            return self.q_nets[self.current_head](x)
        outs = [net(x) for net in self.q_nets]
        return torch.cat(outs, dim=0)

    def save_memory(self, ex):
        """Push one experience tuple into the replay buffer."""
        self.buffer.push(ex)

    def train(self, k=1, max_norm=None):
        """Run ``k`` rounds of per-head TD-updates; return the mean loss."""
        history = []
        for _ in range(k):
            for idx, net in enumerate(self.q_nets):
                # Fresh batch per head -> bootstrapped diversity.
                s, a, r, t, mask = get_batch(
                    self.buffer.sample(self.batch_size))
                best_next = self.target_nets[idx](t).max(-1, keepdim=True)[0]
                target = r + self.gamma * mask * best_next.detach()
                q_taken = net(s).gather(-1, a)
                loss = F.mse_loss(q_taken, target)

                opt = self.optimizers[idx]
                opt.zero_grad()
                loss.backward()
                if max_norm is not None:
                    clip_grad_norm_(net.parameters(), max_norm)
                opt.step()
                history.append(loss.item())

        self.target_update()
        return np.mean(history)

    def train_start(self):
        """True once the buffer can serve a full batch."""
        return len(self.buffer) >= self.batch_size

    def target_update(self):
        """Polyak-average all online heads into the target heads."""
        for tgt, src in zip(self.target_nets.parameters(),
                            self.q_nets.parameters()):
            tgt.data.mul_(1 - self.theta).add_(self.theta * src.data)
class EnDQNAgent(nn.Module):
    """Deep-ensemble DQN agent.

    Maintains ``n_model`` Q-heads (an ``EnModel``) plus a soft-updated
    target ensemble. Each head is trained on its own replay batch with a
    Gaussian negative log-likelihood of the TD target under the head's
    predicted (mean, variance).

    Bug fix: both ``EnModel`` constructions previously hard-coded
    ``n_model=5``, ignoring the ``n_model`` argument; any other value made
    the training loop (``range(self.n_model)``) and the optimizer list
    inconsistent with the actual number of heads. The argument is now
    forwarded. Default stays 5, so existing callers are unaffected.

    NOTE(review): ``train`` shadows ``nn.Module.train(mode)``; kept as-is
    to preserve the public interface.
    """

    def __init__(self, s_dim, a_dim, h_dim, h_act=nn.ReLU,
                 buffer_size=100000, batch_size=32, lr=1e-4,
                 gamma=0.95, theta=0.01, n_model=5, *args, **kwargs):
        super(EnDQNAgent, self).__init__()
        # Forward n_model instead of the former hard-coded 5.
        self.q_nets = EnModel(in_dim=s_dim, o_dim=a_dim, h_dim=h_dim,
                              h_act=h_act, n_model=n_model)
        self.target_nets = EnModel(in_dim=s_dim, o_dim=a_dim, h_dim=h_dim,
                                   h_act=h_act, n_model=n_model)
        self.target_nets.load_state_dict(self.q_nets.state_dict())
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size
        # One optimizer per head so each head updates on its own batch.
        self.optimizers = [
            Adam(head.parameters(), lr=lr) for head in self.q_nets.heads
        ]
        self.gamma = gamma
        self.theta = theta
        self.n_model = n_model
        self.a_dim = a_dim

    # T.B.U.
    def forward(self, x):
        """Forward through the whole ensemble."""
        return self.q_nets(x)

    def save_memory(self, ex):
        """Push one experience tuple into the replay buffer."""
        self.buffer.push(ex)

    # T.B.U.
    def train(self, k=1, max_norm=None):
        """Run ``k`` rounds of per-head NLL updates; return the mean loss.

        ``max_norm``, when given, enables gradient-norm clipping.
        """
        losses = []
        for _ in range(k):
            for m in range(self.n_model):
                q_net = self.q_nets.heads[m]
                target_net = self.target_nets.heads[m]
                optimizer = self.optimizers[m]
                # Fresh batch per head -> implicit bootstrapping.
                experiences = self.buffer.sample(self.batch_size)
                s, a, r, t, mask = get_batch(experiences)
                next_q_mu, _ = target_net(t)
                next_q = next_q_mu.max(-1, keepdim=True)[0]
                target = r + self.gamma * mask * next_q.detach()
                pred_mu, pred_var = q_net(s)
                pred = pred_mu.gather(-1, a)
                # Variance -> std dev for the Normal's scale parameter.
                un = pred_var.sqrt().gather(-1, a)
                loss = -1. * D.Normal(pred, un).log_prob(target).mean()
                optimizer.zero_grad()
                loss.backward()
                if max_norm is not None:
                    clip_grad_norm_(q_net.parameters(), max_norm)
                optimizer.step()
                losses.append(loss.item())
        self.target_update()
        return np.mean(losses)

    def train_start(self):
        """True once the buffer can serve a full batch."""
        return (len(self.buffer) >= self.batch_size)

    def target_update(self):
        """Polyak-average the online ensemble into the target ensemble."""
        for target, param in zip(self.target_nets.parameters(),
                                 self.q_nets.parameters()):
            target.data = (1 - self.theta) * target.data \
                + self.theta * param.data