# Assumes torch, torch.optim as optim and torch.nn.functional as F are imported, and that
# Actor, Critic, ReplayBuffer and the hyperparameters device, buffer_size, batch_size,
# gamma and tau are defined at module level.
class DDPG:
    def __init__(self, state_space, action_space):
        # Local and target networks
        self.actor = Actor(state_space, action_space).to(device)
        self.critic = Critic(state_space, action_space).to(device)
        self.actor_target = Actor(state_space, action_space).to(device)
        self.critic_target = Critic(state_space, action_space).to(device)
        # Start the targets as exact copies of the local networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.actor_optimiser = optim.Adam(self.actor.parameters(), lr=1e-3)
        self.critic_optimiser = optim.Adam(self.critic.parameters(), lr=1e-3)
        self.mem = ReplayBuffer(buffer_size)

    def act(self, state, add_noise=False):
        return self.actor.act(state, add_noise)

    def save(self, fn):
        torch.save(self.actor.state_dict(), "{}_actor_model.pth".format(fn))
        torch.save(self.critic.state_dict(), "{}_critic_model.pth".format(fn))

    def learn(self):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.mem.sample(batch_size)
        state_batch = torch.FloatTensor(state_batch).to(device)
        action_batch = torch.FloatTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(device)
        masks = torch.FloatTensor(masks).to(device)

        # Update critic
        self.update_critic(states=state_batch, next_states=next_state_batch,
                           actions=action_batch, rewards=reward_batch, dones=masks)
        # Update actor
        self.update_actor(states=state_batch)
        # Update target networks
        self.update_target_networks()

    def update_actor(self, states):
        # Maximise Q(s, pi(s)) by minimising its negative
        actions_pred = self.actor(states)
        loss = -self.critic(states, actions_pred).mean()
        self.actor_optimiser.zero_grad()
        loss.backward()
        self.actor_optimiser.step()

    def update_critic(self, states, next_states, actions, rewards, dones):
        # One-step TD target computed from the target networks (no gradient through them)
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            y_i = rewards + gamma * self.critic_target(next_states, next_actions) * (1 - dones)
        expected_Q = self.critic(states, actions)
        loss = F.mse_loss(expected_Q, y_i)
        self.critic_optimiser.zero_grad()
        loss.backward()
        self.critic_optimiser.step()

    def update_target_networks(self):
        # Polyak averaging: target <- tau * local + (1 - tau) * target
        for target, local in zip(self.actor_target.parameters(), self.actor.parameters()):
            target.data.copy_(tau * local.data + (1.0 - tau) * target.data)
        for target, local in zip(self.critic_target.parameters(), self.critic.parameters()):
            target.data.copy_(tau * local.data + (1.0 - tau) * target.data)
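
# A minimal sketch (not part of the original code) of Actor / Critic networks that satisfy
# the interface the class above assumes: actor(state) -> action, actor.act(state, add_noise),
# critic(state, action) -> Q-value. Layer sizes, the tanh output bound and the Gaussian
# exploration noise are illustrative assumptions.
import numpy as np
import torch
import torch.nn as nn


class Actor(nn.Module):
    def __init__(self, state_space, action_space, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_space, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, action_space), nn.Tanh(),  # actions bounded in [-1, 1]
        )

    def forward(self, state):
        return self.net(state)

    def act(self, state, add_noise=False, noise_scale=0.1):
        # Greedy action with optional Gaussian exploration noise (noise_scale is assumed)
        net_device = next(self.parameters()).device
        with torch.no_grad():
            state = torch.FloatTensor(state).unsqueeze(0).to(net_device)
            action = self.net(state).squeeze(0).cpu().numpy()
        if add_noise:
            action = action + noise_scale * np.random.randn(*action.shape)
        return np.clip(action, -1.0, 1.0)


class Critic(nn.Module):
    def __init__(self, state_space, action_space, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_space + action_space, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, state, action):
        # Q(s, a): concatenate state and action before the first layer
        return self.net(torch.cat([state, action], dim=-1))
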
class DDPG:
    def __init__(self, s_dim, a_dim, device, hidden, capacity, batch_size,
                 lr_actor, lr_critic, variance_start, variance_decay, variance_min,
                 gamma, tau):
        # Hyper-parameters
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.device = device
        self.hidden = hidden
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.capacity = capacity
        self.batch_size = batch_size
        self.var = variance_start        # std of the Gaussian exploration noise
        self.var_decay = variance_decay
        self.var_min = variance_min
        self.gamma = gamma
        self.tau = tau

        # Networks: local and target copies of actor and critic
        self.actor = Actor(s_dim, hidden, a_dim).to(device)
        self.actor_target = Actor(s_dim, hidden, a_dim).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)

        self.critic = Critic(s_dim, a_dim, hidden).to(device)
        self.critic_target = Critic(s_dim, a_dim, hidden).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=lr_critic)

        # Replay buffer (memory)
        self.memory = ReplayBuffer(capacity, batch_size, device)

    def get_action(self, s):
        # Deterministic policy output plus Gaussian exploration noise, clipped to [-1, 1]
        with torch.no_grad():
            s = torch.FloatTensor(s).to(self.device)
            a = self.actor(s).cpu().numpy()
        a = np.clip(np.random.normal(a, self.var), -1., 1.)
        return a

    def learn(self):
        # Sample a minibatch from memory
        s, a, s_, r, done = self.memory.get_sample()

        # Update critic towards the one-step TD target built from the target networks
        with torch.no_grad():
            td_target = r + (1 - done) * self.gamma * self.critic_target(s_, self.actor_target(s_))
        q = self.critic(s, a)
        critic_loss = F.mse_loss(q, td_target)
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        # Update actor by maximising Q(s, pi(s))
        q = self.critic(s, self.actor(s))
        actor_loss = -torch.mean(q)
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()

        # Soft-update the target networks
        self.soft_update(self.critic_target, self.critic)
        self.soft_update(self.actor_target, self.actor)

        # Decay the exploration noise
        self.var = max(self.var * self.var_decay, self.var_min)

    def soft_update(self, target, source):
        # target <- (1 - tau) * target + tau * source
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
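
# A minimal sketch (not part of the original code) of the ReplayBuffer interface used above:
# ReplayBuffer(capacity, batch_size, device) with get_sample() returning device tensors.
# The store_transition() name and uniform random sampling are illustrative assumptions.
import random
from collections import deque

import numpy as np
import torch


class ReplayBuffer:
    def __init__(self, capacity, batch_size, device):
        self.memory = deque(maxlen=int(capacity))
        self.batch_size = batch_size
        self.device = device
        self.counter = 0

    def store_transition(self, s, a, s_, r, done):
        self.memory.append((s, a, s_, r, float(done)))
        self.counter += 1

    def get_sample(self):
        batch = random.sample(list(self.memory), self.batch_size)
        s, a, s_, r, done = map(np.array, zip(*batch))
        # rewards and done flags are reshaped to [batch, 1] to match the critic output
        return (torch.FloatTensor(s).to(self.device),
                torch.FloatTensor(a).to(self.device),
                torch.FloatTensor(s_).to(self.device),
                torch.FloatTensor(r).unsqueeze(-1).to(self.device),
                torch.FloatTensor(done).unsqueeze(-1).to(self.device))
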
# Assumes torch, torch.nn.functional as F, os and json are imported, MultivariateNormal is
# imported from torch.distributions, and Actor / Critic are defined at module level.
class PPO:
    def __init__(self, path, s_dim=3, a_dim=1, hidden=64, actor_lr=1e-4,
                 critic_lr=1e-4, memory_len=64, batch_size=32, update_epoch=10,
                 gamma=0.9, lambda_=0.95, epsilon=0.2):
        # Hyper-parameters
        self.path = path
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.hidden = hidden
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.memory_len = memory_len
        self.batch_size = batch_size
        self.update_epoch = update_epoch
        self.gamma = gamma
        self.lambda_ = lambda_
        self.epsilon = epsilon

        # Networks
        self.actor = Actor(s_dim, a_dim, hidden)
        self.actor_old = Actor(s_dim, a_dim, hidden)
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic = Critic(s_dim, hidden)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=self.critic_lr)

        # On-policy memory
        self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

        # Resume from previous results if any exist
        if not os.listdir(self.path + '/Net'):
            # nothing previous to resume from
            print('init completed')
        else:
            # resume previous networks and memory
            print('loading completed')
            self.actor.load_state_dict(torch.load(self.path + '/Net/Actor.pth'))
            self.critic.load_state_dict(torch.load(self.path + '/Net/Critic.pth'))
            with open(self.path + '/Net/Memory_s.json', 'r') as f:
                self.memory_s = json.load(f)
            with open(self.path + '/Net/Memory_a.json', 'r') as f:
                self.memory_a = json.load(f)
            with open(self.path + '/Net/Memory_s_.json', 'r') as f:
                self.memory_s_ = json.load(f)
            with open(self.path + '/Net/Memory_r.json', 'r') as f:
                self.memory_r = json.load(f)
            with open(self.path + '/Net/Memory_done.json', 'r') as f:
                self.memory_done = json.load(f)
        self.actor_old.load_state_dict(self.actor.state_dict())

    def store_network(self):
        # Persist networks and memory so training can be resumed later
        torch.save(self.actor.state_dict(), self.path + '/Net/Actor.pth')
        torch.save(self.critic.state_dict(), self.path + '/Net/Critic.pth')
        with open(self.path + '/Net/Memory_s.json', 'w') as f:
            json.dump(self.memory_s, f)
        with open(self.path + '/Net/Memory_a.json', 'w') as f:
            json.dump(self.memory_a, f)
        with open(self.path + '/Net/Memory_s_.json', 'w') as f:
            json.dump(self.memory_s_, f)
        with open(self.path + '/Net/Memory_r.json', 'w') as f:
            json.dump(self.memory_r, f)
        with open(self.path + '/Net/Memory_done.json', 'w') as f:
            json.dump(self.memory_done, f)

    def choose_action(self, s):
        # Sample from a diagonal Gaussian policy and clip to the action bounds
        with torch.no_grad():
            s = torch.tensor(s, dtype=torch.float)
            mean, std = self.actor(s)
            cov = torch.diag_embed(std)
            dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
            a = dist.sample()
            a = torch.clamp(a, -1., 1.).numpy().tolist()
        return a

    def store_transition(self, s, a, s_, r, done):
        # Store the transition; trigger an update once memory_len steps are collected
        self.memory_s.append(s)
        self.memory_a.append(a)
        self.memory_s_.append(s_)
        self.memory_r.append(r)
        self.memory_done.append(1 if done else 0)
        if len(self.memory_r) == self.memory_len:
            # prepare the data
            s = torch.tensor(self.memory_s, dtype=torch.float)        # [memory_len, s_dim]
            a = torch.tensor(self.memory_a, dtype=torch.float)        # [memory_len, a_dim]
            r = torch.tensor(self.memory_r, dtype=torch.float)        # [memory_len]
            s_ = torch.tensor(self.memory_s_, dtype=torch.float)      # [memory_len, s_dim]
            done = torch.tensor(self.memory_done, dtype=torch.float)  # [memory_len]
            self._learn(s, a, s_, r, done)

    def _learn(self, s, a, s_, r, done):
        gae = self._gae(s, r, s_, done)      # [memory_len, 1]
        r = self._discounted_r(r, s_, done)  # [memory_len, 1]

        # Freeze the old policy and compute its log probabilities once
        self.actor_old.load_state_dict(self.actor.state_dict())
        old_log_prob = self._log_prob(s, a, old=True)  # [memory_len, 1]

        # Several epochs of minibatch updates over the collected rollout
        for _ in range(self.update_epoch):
            for index in range(0, self.memory_len, self.batch_size):
                self.update_actor(s[index:index + self.batch_size],
                                  a[index:index + self.batch_size],
                                  gae[index:index + self.batch_size],
                                  old_log_prob[index:index + self.batch_size])
                self.update_critic(s[index:index + self.batch_size],
                                   r[index:index + self.batch_size])

        # Empty the on-policy memory
        self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

    def _log_prob(self, s, a, old=False):
        # Log probability of actions under the current (or old) diagonal Gaussian policy
        if old:
            with torch.no_grad():
                mean, std = self.actor_old(s)
        else:
            mean, std = self.actor(s)
        std = torch.stack([std] * mean.shape[0], dim=0)
        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        log_prob = dist.log_prob(a).unsqueeze(dim=-1)
        return log_prob

    def _gae(self, s, r, s_, done):
        # Generalised advantage estimation
        with torch.no_grad():
            v = self.critic(s).squeeze()    # [memory_len]
            v_ = self.critic(s_).squeeze()  # [memory_len]
            delta = r + self.gamma * v_ - v
            length = r.shape[0]
            gae = torch.zeros(size=[length])
            running_add = 0
            for t in range(length - 1, -1, -1):
                gae[t] = running_add * self.gamma * self.lambda_ * (1 - done[t]) + delta[t]
                running_add = gae[t]
        return torch.unsqueeze(gae, dim=-1)

    def _discounted_r(self, r, s_, done):
        # Discounted return, bootstrapped from the critic at episode/rollout boundaries
        with torch.no_grad():
            length = len(r)
            discounted_r = torch.zeros(size=[length])
            v_ = self.critic(s_)
            running_add = 0
            for t in range(length - 1, -1, -1):
                if done[t] == 1 or t == length - 1:
                    discounted_r[t] = v_[t] * self.gamma + r[t]
                else:
                    discounted_r[t] = running_add * self.gamma + r[t]
                running_add = discounted_r[t]
        return discounted_r.unsqueeze(dim=-1)

    def update_actor(self, s, a, gae, old_log_prob):
        # Clipped surrogate objective
        log_prob = self._log_prob(s, a)
        ratio = torch.exp(log_prob - old_log_prob)
        surr1 = ratio * gae
        surr2 = torch.clamp(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon) * gae
        loss = -torch.mean(torch.min(surr1, surr2))
        # small bonus on the learnable log-std; adding this term to PPO is effective for this task
        loss = loss - 0.001 * self.actor.log_std.mean()
        self.actor_opt.zero_grad()
        loss.backward()
        self.actor_opt.step()

    def update_critic(self, s, r):
        # Regress the value function towards the discounted returns
        v = self.critic(s)
        loss = F.mse_loss(v, r)
        self.critic_opt.zero_grad()
        loss.backward()
        self.critic_opt.step()
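
# A minimal sketch (not part of the original code) of Actor / Critic networks matching the
# interface the PPO class above assumes: the actor returns (mean, std) with a learnable,
# state-independent log_std attribute, and the critic maps a state to a scalar value.
# Layer sizes and the tanh-bounded mean are illustrative assumptions.
import torch
import torch.nn as nn


class Actor(nn.Module):
    def __init__(self, s_dim, a_dim, hidden):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(s_dim, hidden), nn.Tanh(),
            nn.Linear(hidden, hidden), nn.Tanh(),
        )
        self.mean_head = nn.Sequential(nn.Linear(hidden, a_dim), nn.Tanh())  # mean in [-1, 1]
        self.log_std = nn.Parameter(torch.zeros(a_dim))  # state-independent log std

    def forward(self, s):
        mean = self.mean_head(self.body(s))
        std = torch.exp(self.log_std)
        return mean, std


class Critic(nn.Module):
    def __init__(self, s_dim, hidden):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(s_dim, hidden), nn.Tanh(),
            nn.Linear(hidden, hidden), nn.Tanh(),
            nn.Linear(hidden, 1),
        )

    def forward(self, s):
        return self.net(s)
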
# Assumes torch, numpy as np, torch.nn.functional as F, os and json are imported, and that
# Actor, Critic and Memory are defined at module level.
class DDPG:
    def __init__(self,
                 path,
                 s_dim=3,            # state space dimension
                 a_dim=1,            # action space dimension
                 hidden=64,          # hidden layer width
                 device='cuda',      # training device
                 capacity=2e3,       # replay memory size
                 batch_size=256,     # training batch size
                 start_lr_step=512,  # step at which learning starts
                 gamma=0.9,          # reward discount factor
                 var_init=1.,        # initial exploration variance
                 var_decay=0.9999,   # variance decay rate
                 var_min=0.1,        # minimum variance
                 actor_lr=1e-3,      # actor learning rate
                 critic_lr=3e-4,     # critic learning rate
                 actor_tau=0.1,      # actor soft-update rate
                 critic_tau=0.2,     # critic soft-update rate
                 ):
        # Initialise all required parameters
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.hidden = hidden
        # my current test machine cannot use a GPU, so this falls back to CPU there
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.capacity = capacity
        self.batch_size = batch_size
        self.start_lr_step = start_lr_step
        self.gamma = gamma
        self.var = var_init
        self.var_decay = var_decay
        self.var_min = var_min
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_tau = actor_tau
        self.critic_tau = critic_tau
        self.path = path
        self.counter = 0  # not used yet

        # Initialise networks
        self.actor = Actor(s_dim, a_dim, hidden).to(self.device)
        self.actor_target = Actor(s_dim, a_dim, hidden).to(self.device)
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic = Critic(s_dim, a_dim, hidden).to(self.device)
        self.critic_target = Critic(s_dim, a_dim, hidden).to(self.device)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=self.critic_lr)

        # Initialise replay memory
        self.memory = Memory(capacity, batch_size, self.device)

        # Resume from previous results if any exist
        if not os.listdir(self.path + '/Net'):
            # nothing previous to resume from
            print('init completed')
            self.actor_target.load_state_dict(self.actor.state_dict())
            self.critic_target.load_state_dict(self.critic.state_dict())
        else:
            # resume previous networks and memory
            print('loading completed')
            self.actor.load_state_dict(torch.load(self.path + '/Net/Actor.pth'))
            self.actor_target.load_state_dict(torch.load(self.path + '/Net/Actor_Target.pth'))
            self.critic.load_state_dict(torch.load(self.path + '/Net/Critic.pth'))
            self.critic_target.load_state_dict(torch.load(self.path + '/Net/Critic_Target.pth'))
            with open(self.path + '/Net/Memory.json', 'r') as f:
                self.memory.memory = json.load(f)
            with open(self.path + '/Net/Counter.json', 'r') as f:
                self.memory.counter = json.load(f)
            with open(self.path + '/Net/Var.json', 'r') as f:
                self.var = json.load(f)

    def choose_action(self, s):
        # Deterministic action plus Gaussian exploration noise, clipped to [-1, 1]
        with torch.no_grad():
            s = torch.tensor(s, dtype=torch.float, device=self.device)
            a = self.actor(s).cpu().numpy()
        a = np.clip(np.random.normal(loc=a, scale=self.var), -1., 1.)
        # action: pitch_pos only
        return a

    def store_transition(self, s, a, s_, r, done):
        # Store the experience in the replay memory; learn once enough steps are collected
        self.memory.store_transition(s, a, s_, r, done)
        if self.memory.counter >= self.start_lr_step:
            s, a, s_, r, done = self.memory.get_sample()
            self._learn(s, a, s_, r, done)

    def store_network(self):
        # Persist networks, memory and exploration variance so training can be resumed
        # print('I stored actor in:', self.path + '/Net/Actor.pth')
        torch.save(self.actor.state_dict(), self.path + '/Net/Actor.pth')
        torch.save(self.actor_target.state_dict(), self.path + '/Net/Actor_Target.pth')
        torch.save(self.critic.state_dict(), self.path + '/Net/Critic.pth')
        torch.save(self.critic_target.state_dict(), self.path + '/Net/Critic_Target.pth')
        with open(self.path + '/Net/Memory.json', 'w') as f:
            json.dump(self.memory.memory, f)
        with open(self.path + '/Net/Counter.json', 'w') as f:
            json.dump(self.memory.counter, f)
        with open(self.path + '/Net/Var.json', 'w') as f:
            json.dump(self.var, f)
        print(self.var, self.memory.counter)

    def _learn(self, s, a, s_, r, done):
        # Update critic towards the one-step TD target built from the target networks
        with torch.no_grad():
            td_target = r + (1 - done) * self.gamma * self.critic_target(s_, self.actor_target(s_))
        q = self.critic(s, a)
        critic_loss = F.mse_loss(q, td_target)
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # Update actor by maximising Q(s, pi(s))
        q = self.critic(s, self.actor(s))
        actor_loss = -torch.mean(q)
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # Update target networks
        _soft_update(self.critic_target, self.critic, self.critic_tau)
        _soft_update(self.actor_target, self.actor, self.actor_tau)

        # Decay the exploration variance
        self.var = max(self.var * self.var_decay, self.var_min)
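
# The class above calls a module-level _soft_update helper that is not shown here. A minimal
# sketch of it, mirroring the soft_update method of the earlier DDPG class; the exact original
# implementation is assumed.
def _soft_update(target, source, tau):
    # target <- (1 - tau) * target + tau * source
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)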