import torch
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCritic:
    def __init__(
            self,
            s_dim,
            a_num,
            device,
            hidden,
            lr_actor,
            lr_critic,
            gamma,
    ):
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_num = a_num
        self.device = device
        self.hidden = hidden
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.gamma = gamma

        # network initialization
        self.actor = Actor(s_dim, hidden, a_num).to(self.device)
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic = Critic(s_dim, hidden).to(self.device)
        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=lr_critic)
        # no memory in this algorithm

    def get_action(self, s):
        s = torch.FloatTensor(s).to(self.device)
        prob_weights = self.actor(s)
        # select action w.r.t. the action probabilities
        dist = Categorical(prob_weights)
        action = (dist.sample()).detach().item()
        return action

    def learn(self, s, a, s_, r, done):
        done = 1 if done else 0
        # torch.LongTensor / torch.FloatTensor expect list-like inputs;
        # to turn a scalar into a Tensor, use torch.tensor()
        s = torch.tensor(s, dtype=torch.float, device=self.device)
        a = torch.tensor(a, dtype=torch.long, device=self.device)
        s_ = torch.tensor(s_, dtype=torch.float, device=self.device)
        r = torch.tensor(r, dtype=torch.float, device=self.device)

        # update for critic
        v = self.critic(s)
        with torch.no_grad():
            v_ = self.critic(s_)
        td_target = r + (1 - done) * self.gamma * v_
        td_error = td_target - v
        critic_loss = F.mse_loss(v, td_target)
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        # update for actor
        # detach the TD error so the actor loss does not try to backpropagate
        # through the critic graph that was already freed above
        prob = self.actor(s)
        dist = Categorical(prob)
        actor_loss = -td_error.detach() * dist.log_prob(a)
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()
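# A minimal driving loop for the one-step actor-critic above -- a sketch only,
# assuming a Gym-style discrete-action task (CartPole-v1 here) with the classic
# API where reset() returns the observation and step() returns a 4-tuple. The
# environment name and the hyperparameter values are illustrative assumptions,
# not taken from this file.
import gym

env = gym.make('CartPole-v1')
agent = ActorCritic(s_dim=env.observation_space.shape[0],
                    a_num=env.action_space.n,
                    device='cpu', hidden=64,
                    lr_actor=1e-3, lr_critic=1e-3, gamma=0.99)
for episode in range(500):
    s = env.reset()
    done = False
    while not done:
        a = agent.get_action(s)
        s_, r, done, _ = env.step(a)
        agent.learn(s, a, s_, r, done)  # one-step TD update after every transition
        s = s_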
import copy

import numpy as np
import torch
import torch.nn.functional as F


class TD3:
    def __init__(self, s_dim, a_dim, capacity, batch_size, lr_actor, lr_critic,
                 alpha, beta, p_with_pi, hidden, reg_coe, var_init, var_decay,
                 var_min, gamma, tau, policy_noise, noise_clip, policy_freq):
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.alpha = alpha
        self.beta = beta
        self.p_with_pi = p_with_pi
        self.hidden = hidden
        self.reg_coe = reg_coe
        self.capacity = capacity
        self.batch_size = batch_size
        self.var = var_init
        self.var_decay = var_decay
        self.var_min = var_min
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.train_it = 0

        # Network
        self.actor = Actor(s_dim, a_dim, hidden)
        self.actor_target = copy.deepcopy(self.actor)
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor, weight_decay=reg_coe)
        self.critic = Critic(s_dim, a_dim, hidden)
        self.critic_target = copy.deepcopy(self.critic)
        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=lr_critic, weight_decay=reg_coe)

        # replay buffer, or memory
        self.memory = PER(capacity, batch_size, alpha, beta)

    def get_action(self, s):
        with torch.no_grad():
            a = self.actor(torch.FloatTensor(s))
        # add randomness to action selection for exploration
        a = a.numpy()
        a = np.clip(np.random.normal(a, self.var), -1., 1.)
        return a

    def learn(self):
        self.train_it += 1
        # note: the PER importance-sampling weight is not applied to the loss here
        s, a, s_, r, done, weight, samples_index = self.memory.get_sample()
        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = torch.clip(
                torch.randn_like(a) * self.policy_noise,
                -self.noise_clip, self.noise_clip)
            a_ = torch.clip(self.actor_target(s_) + noise, -1., 1.)
            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(s_, a_)
            target_Q = torch.min(target_Q1, target_Q2)
            td_target = r + (1 - done) * self.gamma * target_Q

        # update critic
        q1, q2 = self.critic(s, a)
        td_error = (q1 - td_target)**2 + (q2 - td_target)**2
        # critic_loss = F.mse_loss(q1, td_target) + F.mse_loss(q2, td_target)
        critic_loss = torch.mean(td_error)
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        if not self.p_with_pi:
            new_priority = torch.abs(td_error.squeeze()).detach().numpy() + \
                (np.e ** -10)
            # + (np.e ** -10))**self.memory.alpha
            self.memory.priority[samples_index] = new_priority

        if self.train_it % self.policy_freq == 0:
            # update actor
            q = self.critic.Q1(s, self.actor(s))
            actor_loss = -torch.mean(q)
            self.opt_actor.zero_grad()
            actor_loss.backward()
            self.opt_actor.step()

            if self.p_with_pi:
                new_priority = torch.abs(td_error.squeeze()).detach().numpy() + \
                    torch.pow(q.squeeze(), 2).detach().numpy() + \
                    (np.e ** -10)
                # + (np.e ** -10))**self.memory.alpha
                self.memory.priority[samples_index] = new_priority

            # update target network
            self.soft_update(self.critic_target, self.critic)
            self.soft_update(self.actor_target, self.actor)

        # update variance
        self.var = max(self.var * self.var_decay, self.var_min)

    def soft_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim


class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i), masking out terminal states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
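# The Agent above relies on an OUNoise helper that is not shown in this file.
# The sketch below is one common Ornstein-Uhlenbeck implementation; the default
# parameters (mu=0.0, theta=0.15, sigma=0.2) are conventional choices and an
# assumption, not values taken from this code base.
import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state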
import torch
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCritic:
    def __init__(
            self,
            s_dim,
            a_num,
            device,
            hidden,
            lr_actor,
            lr_critic,
            memory_len,
            gamma,
            lambda_,
    ):
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_num = a_num
        self.device = device
        self.hidden = hidden
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.memory_len = memory_len
        self.gamma = gamma
        self.lambda_ = lambda_

        # network initialization
        self.actor = Actor(s_dim, hidden, a_num).to(self.device)
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic = Critic(s_dim, hidden).to(self.device)
        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=lr_critic)

        # on-policy rollout memory, emptied after every update
        self.memory_s = []
        self.memory_a = []
        self.memory_s_ = []
        self.memory_r = []
        self.memory_done = []

    def get_action(self, s):
        s = torch.FloatTensor(s).to(self.device)
        prob_weights = self.actor(s)
        # select action w.r.t. the action probabilities
        dist = Categorical(prob_weights)
        action = (dist.sample()).detach().item()
        return action

    def store_transition(self, s, a, s_, r, done):
        self.memory_s.append(s)
        self.memory_a.append(a)
        self.memory_s_.append(s_)
        self.memory_r.append(r)
        self.memory_done.append(1 if done else 0)
        if len(self.memory_r) > self.memory_len:
            self._learn()

    def _GAE(self, s, r, s_, done):
        with torch.no_grad():
            v = self.critic(s).squeeze()
            v_ = self.critic(s_).squeeze()
            delta = r + self.gamma * v_ * (1 - done) - v
            length = r.shape[0]
            GAE = torch.zeros(size=[length], device=self.device)
            running_add = 0
            for t in range(length - 1, -1, -1):
                running_add = delta[t] + running_add * \
                    self.gamma * self.lambda_ * (1 - done[t])
                GAE[t] = running_add
        return GAE

    def _discounted_r(self, r, done):
        length = r.shape[0]
        discounted_r = torch.zeros([length], device=self.device)
        running_add = 0
        for t in range(length - 1, -1, -1):
            running_add = running_add * self.gamma * (1 - done[t]) + r[t]
            discounted_r[t] = running_add
        return discounted_r

    def _learn(self):
        # torch.LongTensor / torch.FloatTensor expect list-like inputs;
        # to turn a scalar into a Tensor, use torch.tensor()
        s = torch.tensor(self.memory_s, dtype=torch.float).to(self.device)
        a = torch.tensor(self.memory_a, dtype=torch.long).to(self.device)
        s_ = torch.tensor(self.memory_s_, dtype=torch.float).to(self.device)
        r = torch.tensor(self.memory_r, dtype=torch.float).to(self.device)
        done = torch.tensor(self.memory_done, dtype=torch.float).to(self.device)
        GAE = self._GAE(s, r, s_, done)
        discounted_r = self._discounted_r(r, done)

        # update for critic
        v = self.critic(s).squeeze()
        critic_loss = F.mse_loss(v, discounted_r)
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        # update for actor
        prob = self.actor(s)
        dist = Categorical(prob)
        actor_loss = -torch.sum(GAE.detach() * dist.log_prob(a))
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()

        # renew the memory
        self.memory_s = []
        self.memory_a = []
        self.memory_s_ = []
        self.memory_r = []
        self.memory_done = []
import torch
import torch.nn.functional as F
import torch.optim as optim


class DDPG:
    def __init__(self, state_space, action_space):
        self.actor = Actor(state_space, action_space).to(device)
        self.critic = Critic(state_space, action_space).to(device)
        self.actor_target = Actor(state_space, action_space).to(device)
        self.critic_target = Critic(state_space, action_space).to(device)
        # start the target networks from the same weights as the local networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actor_optimiser = optim.Adam(self.actor.parameters(), lr=1e-3)
        self.critic_optimiser = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.mem = ReplayBuffer(buffer_size)

    def act(self, state, add_noise=False):
        return self.actor.act(state, add_noise)

    def save(self, fn):
        torch.save(self.actor.state_dict(), "{}_actor_model.pth".format(fn))
        torch.save(self.critic.state_dict(), "{}_critic_model.pth".format(fn))

    def learn(self):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.mem.sample(batch_size)
        state_batch = torch.FloatTensor(state_batch).to(device)
        action_batch = torch.FloatTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(device)
        masks = torch.FloatTensor(masks).to(device)

        # Update critic
        self.update_critic(states=state_batch, next_states=next_state_batch,
                           actions=action_batch, rewards=reward_batch, dones=masks)
        # Update actor
        self.update_actor(states=state_batch)
        # Update target networks
        self.update_target_networks()

    def update_actor(self, states):
        actions_pred = self.actor(states)
        loss = -self.critic(states, actions_pred).mean()
        self.actor_optimiser.zero_grad()
        loss.backward()
        self.actor_optimiser.step()

    def update_critic(self, states, next_states, actions, rewards, dones):
        # compute the TD target without tracking gradients through the target networks
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            y_i = rewards + (gamma * self.critic_target(next_states, next_actions) * (1 - dones))
        expected_Q = self.critic(states, actions)
        loss = F.mse_loss(expected_Q, y_i)
        self.critic_optimiser.zero_grad()
        loss.backward()
        self.critic_optimiser.step()

    def update_target_networks(self):
        for target, local in zip(self.actor_target.parameters(), self.actor.parameters()):
            target.data.copy_(tau * local.data + (1.0 - tau) * target.data)
        for target, local in zip(self.critic_target.parameters(), self.critic.parameters()):
            target.data.copy_(tau * local.data + (1.0 - tau) * target.data)
import copy

import numpy as np
import torch
import torch.nn.functional as F


class TD3:
    def __init__(self, s_dim, a_dim, capacity, batch_size, lr_actor, lr_critic,
                 hidden, var_init, var_decay, var_min, gamma, tau,
                 policy_noise, noise_clip, policy_freq):
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.hidden = hidden
        self.capacity = capacity
        self.batch_size = batch_size
        self.var = var_init
        self.var_decay = var_decay
        self.var_min = var_min
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.train_it = 0

        # Network
        self.actor = Actor(s_dim, a_dim, hidden)
        self.actor_target = copy.deepcopy(self.actor)
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic = Critic(s_dim, a_dim, hidden)
        self.critic_target = copy.deepcopy(self.critic)
        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=lr_critic)

        # replay buffer, or memory
        self.memory = ReplayBuffer(capacity, batch_size)

    def get_action(self, s):
        with torch.no_grad():
            a = self.actor(torch.FloatTensor(s))
        # add randomness to action selection for exploration
        a = a.numpy()
        a = np.clip(np.random.normal(a, self.var), -1., 1.)
        return a

    def learn(self):
        self.train_it += 1
        s, a, s_, r, done = self.memory.get_sample()
        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = torch.randn_like(a) * self.policy_noise
            noise = torch.clip(noise, -self.noise_clip, self.noise_clip)
            a_ = self.actor_target(s_) + noise
            a_ = torch.clip(a_, -1., 1.)
            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(s_, a_)
            target_Q = torch.min(target_Q1, target_Q2)
            td_target = r + (1 - done) * self.gamma * target_Q

        # update critic
        q1, q2 = self.critic(s, a)
        critic_loss = F.mse_loss(q1, td_target) + F.mse_loss(q2, td_target)
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        if self.train_it % self.policy_freq == 0:
            # update actor
            # Both approaches work here: take the min of the two critics,
            # or use Q1 directly (the commented-out line).
            q1, q2 = self.critic(s, self.actor(s))
            q = torch.min(q1, q2)
            # q = self.critic.Q1(s, self.actor(s))
            actor_loss = -torch.mean(q)
            self.opt_actor.zero_grad()
            actor_loss.backward()
            self.opt_actor.step()

            # update target network
            self.soft_update(self.critic_target, self.critic)
            self.soft_update(self.actor_target, self.actor)

        # update variance
        self.var = max(self.var * self.var_decay, self.var_min)

    def soft_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
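# This TD3 class assumes a ReplayBuffer whose get_sample() returns
# (s, a, s_, r, done) as torch tensors. A minimal compatible sketch is given
# below; the method name store_transition and the [batch, 1] shape chosen for
# r and done (so they broadcast against the critic outputs) are assumptions.
import random
import numpy as np
import torch

class ReplayBuffer:
    def __init__(self, capacity, batch_size):
        self.capacity = int(capacity)
        self.batch_size = batch_size
        self.buffer = []
        self.position = 0

    def store_transition(self, s, a, s_, r, done):
        # overwrite the oldest transition once the buffer is full
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (s, a, s_, r, float(done))
        self.position = (self.position + 1) % self.capacity

    def get_sample(self):
        # caller is expected to wait until at least batch_size transitions are stored
        batch = random.sample(self.buffer, self.batch_size)
        s, a, s_, r, done = map(np.array, zip(*batch))
        return (torch.FloatTensor(s), torch.FloatTensor(a), torch.FloatTensor(s_),
                torch.FloatTensor(r).unsqueeze(-1), torch.FloatTensor(done).unsqueeze(-1))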
import json
import os

import torch
import torch.nn.functional as F
from torch.distributions import MultivariateNormal


class PPO:
    def __init__(self,
                 path,
                 s_dim=3,
                 a_dim=1,
                 hidden=64,
                 actor_lr=1e-4,
                 critic_lr=1e-4,
                 memory_len=64,
                 batch_size=32,
                 update_epoch=10,
                 gamma=0.9,
                 lambda_=0.95,
                 epsilon=0.2):
        # Parameter initialization
        self.path = path
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.hidden = hidden
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.memory_len = memory_len
        self.batch_size = batch_size
        self.update_epoch = update_epoch
        self.gamma = gamma
        self.lambda_ = lambda_
        self.epsilon = epsilon

        # network initialization
        self.actor = Actor(s_dim, a_dim, hidden)
        self.actor_old = Actor(s_dim, a_dim, hidden)
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic = Critic(s_dim, hidden)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=self.critic_lr)

        # memory initialization
        self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

        # resume from previous results if available
        if not os.listdir(self.path + '/Net'):
            # nothing to resume from
            print('init completed')
        else:
            # load the previous networks and memory
            print('loading completed')
            self.actor.load_state_dict(torch.load(self.path + '/Net/Actor.pth'))
            self.critic.load_state_dict(torch.load(self.path + '/Net/Critic.pth'))
            with open(self.path + '/Net/Memory_s.json', 'r') as f:
                self.memory_s = json.load(f)
            with open(self.path + '/Net/Memory_a.json', 'r') as f:
                self.memory_a = json.load(f)
            with open(self.path + '/Net/Memory_s_.json', 'r') as f:
                self.memory_s_ = json.load(f)
            with open(self.path + '/Net/Memory_r.json', 'r') as f:
                self.memory_r = json.load(f)
            with open(self.path + '/Net/Memory_done.json', 'r') as f:
                self.memory_done = json.load(f)
        self.actor_old.load_state_dict(self.actor.state_dict())

    def store_network(self):
        torch.save(self.actor.state_dict(), self.path + '/Net/Actor.pth')
        torch.save(self.critic.state_dict(), self.path + '/Net/Critic.pth')
        with open(self.path + '/Net/Memory_s.json', 'w') as f:
            json.dump(self.memory_s, f)
        with open(self.path + '/Net/Memory_a.json', 'w') as f:
            json.dump(self.memory_a, f)
        with open(self.path + '/Net/Memory_s_.json', 'w') as f:
            json.dump(self.memory_s_, f)
        with open(self.path + '/Net/Memory_r.json', 'w') as f:
            json.dump(self.memory_r, f)
        with open(self.path + '/Net/Memory_done.json', 'w') as f:
            json.dump(self.memory_done, f)

    def choose_action(self, s):
        with torch.no_grad():
            s = torch.tensor(s, dtype=torch.float)
            mean, std = self.actor(s)
            cov = torch.diag_embed(std)
            dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
            a = dist.sample()
            a = torch.clamp(a, -1., 1.).numpy().tolist()
        return a

    def store_transition(self, s, a, s_, r, done):
        # store transition
        self.memory_s.append(s)
        self.memory_a.append(a)
        self.memory_s_.append(s_)
        self.memory_r.append(r)
        self.memory_done.append(1 if done else 0)
        if len(self.memory_r) == self.memory_len:
            # prepare the data
            s = torch.tensor(self.memory_s, dtype=torch.float)        # [memory_len, s_dim]
            a = torch.tensor(self.memory_a, dtype=torch.float)        # [memory_len, 1(a_dim)]
            r = torch.tensor(self.memory_r, dtype=torch.float)        # [memory_len]
            s_ = torch.tensor(self.memory_s_, dtype=torch.float)      # [memory_len, s_dim]
            done = torch.tensor(self.memory_done, dtype=torch.float)  # [memory_len]
            self._learn(s, a, s_, r, done)

    def _learn(self, s, a, s_, r, done):
        gae = self._gae(s, r, s_, done)      # [memory_len, 1]
        r = self._discounted_r(r, s_, done)  # [memory_len, 1]
        # calculate old log probability
        self.actor_old.load_state_dict(self.actor.state_dict())
        old_log_prob = self._log_prob(s, a, old=True)  # [memory_len, 1]
        # batch update the network
        for i in range(self.update_epoch):
            for index in range(0, self.memory_len, self.batch_size):
                self.update_actor(s[index:index + self.batch_size],
                                  a[index:index + self.batch_size],
                                  gae[index:index + self.batch_size],
                                  old_log_prob[index:index + self.batch_size])
                self.update_critic(s[index:index + self.batch_size],
                                   r[index:index + self.batch_size])
        # empty the memory
        self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

    def _log_prob(self, s, a, old=False):
        # calculate the log probability
        if old:
            with torch.no_grad():
                mean, std = self.actor_old(s)
        else:
            mean, std = self.actor(s)
        std = torch.stack([std] * mean.shape[0], dim=0)
        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        log_prob = dist.log_prob(a).unsqueeze(dim=-1)
        return log_prob

    def _gae(self, s, r, s_, done):
        # calculate the generalized advantage estimate
        with torch.no_grad():
            v = self.critic(s).squeeze()    # [memory_len]
            v_ = self.critic(s_).squeeze()  # [memory_len]
            delta = r + self.gamma * v_ - v
            length = r.shape[0]
            gae = torch.zeros(size=[length])
            running_add = 0
            for t in range(length - 1, -1, -1):
                gae[t] = running_add * self.gamma * self.lambda_ * (1 - done[t]) + delta[t]
                running_add = gae[t]
            return torch.unsqueeze(gae, dim=-1)

    def _discounted_r(self, r, s_, done):
        # calculate the discounted reward
        with torch.no_grad():
            length = len(r)
            discounted_r = torch.zeros(size=[length])
            v_ = self.critic(s_)
            running_add = 0
            for t in range(length - 1, -1, -1):
                if done[t] == 1 or t == length - 1:
                    discounted_r[t] = v_[t] * self.gamma + r[t]
                else:
                    discounted_r[t] = running_add * self.gamma + r[t]
                running_add = discounted_r[t]
            return discounted_r.unsqueeze(dim=-1)

    def update_actor(self, s, a, gae, old_log_prob):
        # calculate the actor loss (clipped surrogate objective)
        log_prob = self._log_prob(s, a)
        ratio = torch.exp(log_prob - old_log_prob)
        surr1 = ratio * gae
        surr2 = torch.clamp(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon) * gae
        loss = -torch.mean(torch.min(surr1, surr2))
        loss = loss - 0.001 * self.actor.log_std  # adding this term to PPO helps in this task
        # update
        self.actor_opt.zero_grad()
        loss.backward()
        self.actor_opt.step()

    def update_critic(self, s, r):
        # calculate critic loss
        v = self.critic(s)
        loss = F.mse_loss(v, r)
        # update
        self.critic_opt.zero_grad()
        loss.backward()
        self.critic_opt.step()
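# A possible driving loop for the PPO class above -- a sketch only, under the
# assumption of a Pendulum-style Gym task with the classic 4-tuple step() API,
# a 2.0 action-rescaling factor, and an existing <path>/Net directory for the
# checkpoints. None of these names or values come from the class itself.
import gym
import numpy as np

env = gym.make('Pendulum-v1')
agent = PPO(path='.', s_dim=3, a_dim=1)  # expects the directory ./Net to exist
for episode in range(200):
    s = env.reset()
    for t in range(200):
        a = agent.choose_action(s.tolist())
        s_, r, done, _ = env.step(2.0 * np.array(a))  # rescale from [-1, 1] to the env's range
        agent.store_transition(s.tolist(), a, s_.tolist(), float(r), done)
        s = s_
    agent.store_network()  # persist networks and memory between runs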
import numpy as np
import torch
from torch.distributions import Categorical


class A2C:
    def __init__(
            self,
            s_dim,
            a_num,
            device,
            hidden,
            lr_actor,
            lr_critic,
            max_len,
            gamma,
    ):
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_num = a_num
        self.device = device
        self.hidden = hidden
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.max_len = max_len
        self.gamma = gamma

        # network initialization
        self.actor = Actor(s_dim, hidden, a_num).to(self.device)
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic = Critic(s_dim, hidden).to(self.device)
        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=lr_critic)

        # define memory
        self.memory_s = []
        self.memory_a = []
        self.memory_r = []

    def get_action(self, s):
        s = torch.FloatTensor(s).to(self.device)
        prob_weights = self.actor(s)
        # select action w.r.t. the action probabilities
        dist = Categorical(prob_weights)
        action = (dist.sample()).detach().item()
        return action

    def store_transition(self, s, a, s_, r, done):
        self.memory_s.append(s)
        self.memory_a.append(a)
        self.memory_r.append(r)
        if len(self.memory_r) >= self.max_len or done:
            discounted_r = self._discounted_r(self.memory_r, s_, done)
            s = torch.FloatTensor(self.memory_s).to(self.device)
            a = torch.LongTensor(self.memory_a).to(self.device)
            r = torch.FloatTensor(discounted_r).to(self.device)
            self._learn(s, a, r)

    def _learn(self, s, a, r):
        # update critic
        v = self.critic(s).squeeze()  # squeeze so the advantage broadcasts with r and log_prob: [N]
        advantage = r - v
        critic_loss = torch.mean(torch.pow(advantage, 2))
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        # update actor
        prob = self.actor(s)
        dist = Categorical(prob)
        log_prob = dist.log_prob(a)
        actor_loss = -torch.mean(log_prob * advantage.detach())
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()

        # renew the memory
        self.memory_s = []
        self.memory_a = []
        self.memory_r = []

    def _discounted_r(self, r, s_, done):
        length = len(r)
        discounted_r = np.zeros(length)
        # bootstrap from the critic's value of the last next-state unless the episode ended
        running_add = 0 if done else self.critic(
            torch.FloatTensor(s_).to(self.device)).item()
        for t in range(length - 1, -1, -1):
            running_add = r[t] + running_add * self.gamma
            discounted_r[t] = running_add
        return discounted_r
import copy

import torch
import torch.nn.functional as F
from torch.distributions import Normal


class SAC:
    def __init__(self, s_dim, a_dim, hidden, capacity, batch_size, lr, gamma,
                 tau, log_prob_reg):
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.hidden = hidden
        self.lr = lr
        self.capacity = capacity
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.log_prob_reg = log_prob_reg

        # Network
        self.actor = Actor(s_dim, a_dim, hidden)
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr)
        self.critic = Critic(s_dim, a_dim, hidden)
        self.critic_target = copy.deepcopy(self.critic)
        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=lr)

        # temperature alpha, learned towards the target entropy
        self.target_entropy = -a_dim
        self.alpha = torch.tensor(1, dtype=torch.float, requires_grad=True)
        self.opt_alpha = torch.optim.Adam([self.alpha], lr=lr)

        # replay buffer, memory
        self.memory = ReplayBuffer(capacity, batch_size)

    def get_action(self, s):
        s = torch.tensor(data=s, dtype=torch.float)
        mean, std = self.actor(s)
        normal = Normal(mean, std)
        z = normal.rsample()
        a = torch.tanh(z)
        return a.detach().numpy().tolist()

    def _log_prob(self, s):
        mean, std = self.actor(s)
        dist = Normal(mean, std)
        u = dist.rsample()
        a = torch.tanh(u)
        # tanh-squashing correction of the Gaussian log density
        log_prob = dist.log_prob(u) - torch.log(1 - a.pow(2) + self.log_prob_reg)
        log_prob = log_prob.sum(-1, keepdim=True)
        return a, log_prob

    def learn(self):
        # samples from memory
        s, a, s_, r = self.memory.get_sample()

        # update q net
        with torch.no_grad():
            a_, log_prob_ = self._log_prob(s_)
            q1_, q2_ = self.critic_target(s_, a_)
            q_target = r + self.gamma * (torch.min(q1_, q2_) - self.alpha * log_prob_)
        q1, q2 = self.critic(s, a)
        q_loss = F.mse_loss(q1, q_target) + F.mse_loss(q2, q_target)
        self.opt_critic.zero_grad()
        q_loss.backward()
        self.opt_critic.step()

        # update policy net
        a_new, log_prob_new = self._log_prob(s)
        q_new = self.critic.Q1(s, a_new)
        # q1_new, q2_new = self.critic(s, a_new)
        # q_new = torch.min(q1_new, q2_new)  # both approaches are viable
        policy_loss = torch.mean(self.alpha * log_prob_new - q_new)
        self.opt_actor.zero_grad()
        policy_loss.backward()
        self.opt_actor.step()

        # update temperature alpha
        alpha_loss = -torch.mean(self.alpha * (log_prob_new + self.target_entropy).detach())
        self.opt_alpha.zero_grad()
        alpha_loss.backward()
        self.opt_alpha.step()

        # update target net
        self.soft_update(self.critic_target, self.critic)

    def soft_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
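# The _log_prob correction above follows from the change of variables for a
# tanh-squashed Gaussian: if u ~ N(mean, std) and a = tanh(u), then
#     log pi(a|s) = log N(u; mean, std) - sum_i log(1 - tanh(u_i)^2)
# and self.log_prob_reg only guards against log(0). A quick standalone check of
# that identity (illustrative values, not part of the class):
import torch
from torch.distributions import Normal

mean, std = torch.zeros(2), torch.ones(2)
dist = Normal(mean, std)
u = dist.rsample()
a = torch.tanh(u)
log_prob = (dist.log_prob(u) - torch.log(1 - a.pow(2) + 1e-6)).sum()
print(float(log_prob))  # log-density of the squashed sample a under the policy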
import torch
import torch.nn.functional as F
from torch.distributions import MultivariateNormal


class PPO:
    def __init__(self, s_dim, a_dim, bound, hidden, device, lr, memory_len,
                 batch_size, update_epoch, gamma, lambda_, epsilon):
        # Parameter initialization
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.bound = bound
        self.hidden = hidden
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.lr = lr
        self.memory_len = memory_len
        self.batch_size = batch_size
        self.update_epoch = update_epoch
        self.gamma = gamma
        self.lambda_ = lambda_
        self.epsilon = epsilon

        # network initialization
        self.actor = Actor(s_dim, a_dim, hidden).to(self.device)
        self.actor_old = Actor(s_dim, a_dim, hidden).to(self.device)
        self.actor_old.load_state_dict(self.actor.state_dict())
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=self.lr)
        self.critic = Critic(s_dim).to(self.device)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=self.lr)

        # memory initialization
        self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

    def get_action(self, s):
        # sample an action from the Gaussian policy
        s = torch.tensor(s, dtype=torch.float, device=self.device)
        mean, std = self.actor(s)
        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        a = dist.sample()
        a = torch.clamp(a * self.bound, -self.bound, self.bound)
        # Because action_dim equals 1 in this environment, we use .item().
        # When action_dim > 1, use .numpy() instead.
        return a.item()

    def learn(self, s, a, s_, r, done):
        # store transition
        self.memory_s.append(s)
        self.memory_a.append(a / self.bound)
        self.memory_s_.append(s_)
        self.memory_r.append(r)
        self.memory_done.append(1 if done else 0)
        if len(self.memory_r) == self.memory_len:
            # prepare the data
            s = torch.tensor(self.memory_s, dtype=torch.float,
                             device=self.device)                    # [memory_len, s_dim]
            a = torch.tensor(self.memory_a, dtype=torch.float,
                             device=self.device).unsqueeze(dim=-1)  # [memory_len, 1(a_dim)]
            r = torch.tensor(self.memory_r, dtype=torch.float,
                             device=self.device)                    # [memory_len]
            s_ = torch.tensor(self.memory_s_, dtype=torch.float,
                              device=self.device)                   # [memory_len, s_dim]
            gae = self._gae(s, r, s_, self.memory_done)
            r = self._discounted_r(r, s_, self.memory_done)
            # calculate old log probability
            self.actor_old.load_state_dict(self.actor.state_dict())
            old_log_prob = self._log_prob(s, a, old=True)  # [memory_len, 1]
            # batch update the network
            for i in range(self.update_epoch):
                for index in range(0, self.memory_len, self.batch_size):
                    self.update_actor(s[index:index + self.batch_size],
                                      a[index:index + self.batch_size],
                                      gae[index:index + self.batch_size],
                                      old_log_prob[index:index + self.batch_size])
                    self.update_critic(s[index:index + self.batch_size],
                                       r[index:index + self.batch_size])
            # empty the memory
            self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

    def _log_prob(self, s, a, old=False):
        # calculate the log probability
        if old:
            with torch.no_grad():
                mean, std = self.actor_old(s)
        else:
            mean, std = self.actor(s)
        std = torch.stack([std] * mean.shape[0], dim=0)
        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        log_prob = dist.log_prob(a).unsqueeze(dim=-1)
        return log_prob

    def _gae(self, s, r, s_, done):
        # calculate the generalized advantage estimate
        with torch.no_grad():
            v = self.critic(s).squeeze()    # [memory_len]
            v_ = self.critic(s_).squeeze()  # [memory_len]
            delta = r + self.gamma * v_ - v
            length = r.shape[0]
            gae = torch.zeros(size=[length], device=self.device)
            running_add = 0
            for t in range(length - 1, -1, -1):
                gae[t] = running_add * self.gamma * self.lambda_ * (1 - done[t]) + delta[t]
                running_add = gae[t]
            return torch.unsqueeze(gae, dim=-1)

    def _discounted_r(self, r, s_, done):
        # calculate the discounted reward
        with torch.no_grad():
            length = len(r)
            discounted_r = torch.zeros(size=[length], device=self.device)
            v_ = self.critic(s_)
            running_add = 0
            for t in range(length - 1, -1, -1):
                if done[t] == 1 or t == length - 1:
                    discounted_r[t] = v_[t] * self.gamma + r[t]
                else:
                    discounted_r[t] = running_add * self.gamma + r[t]
                # discounted_r[t] = running_add * self.gamma + r[t]
                running_add = discounted_r[t]
            return discounted_r.unsqueeze(dim=-1)

    def _entropy(self, s, a):
        mean, std = self.actor(s)
        std = torch.stack([std] * mean.shape[0], dim=0)
        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        entropy = dist.entropy()
        return entropy

    def update_actor(self, s, a, gae, old_log_prob):
        # calculate the actor loss (clipped surrogate objective)
        log_prob = self._log_prob(s, a)
        ratio = torch.exp(log_prob - old_log_prob)
        surr1 = ratio * gae
        surr2 = torch.clamp(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon) * gae
        loss = -torch.mean(torch.min(surr1, surr2))
        # loss = loss - 0.001 * self.actor.entropy()  # this entropy term is better left out in this task
        # update
        self.actor_opt.zero_grad()
        loss.backward()
        self.actor_opt.step()

    def update_critic(self, s, r):
        # calculate critic loss
        v = self.critic(s)
        loss = F.mse_loss(v, r)
        # update
        self.critic_opt.zero_grad()
        loss.backward()
        self.critic_opt.step()
import numpy as np
import torch
import torch.nn.functional as F


class DDPG:
    def __init__(self, s_dim, a_dim, device, hidden, capacity, batch_size,
                 lr_actor, lr_critic, variance_start, variance_decay,
                 variance_min, gamma, tau):
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.device = device
        self.hidden = hidden
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.capacity = capacity
        self.batch_size = batch_size
        self.var = variance_start
        self.var_decay = variance_decay
        self.var_min = variance_min
        self.gamma = gamma
        self.tau = tau

        # Network
        self.actor = Actor(s_dim, hidden, a_dim).to(device)
        self.actor_target = Actor(s_dim, hidden, a_dim).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic = Critic(s_dim, a_dim, hidden).to(device)
        self.critic_target = Critic(s_dim, a_dim, hidden).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=lr_critic)

        # replay buffer, or memory
        self.memory = ReplayBuffer(capacity, batch_size, device)

    def get_action(self, s):
        with torch.no_grad():
            s = torch.FloatTensor(s).to(self.device)
            a = self.actor(s).cpu().numpy()  # .cpu() so this also works when training on GPU
        a = np.clip(np.random.normal(a, self.var), -1., 1.)
        return a

    def learn(self):
        # samples from memory
        s, a, s_, r, done = self.memory.get_sample()

        # update critic
        with torch.no_grad():
            td_target = r + (1 - done) * self.gamma * self.critic_target(s_, self.actor_target(s_))
        q = self.critic(s, a)
        critic_loss = F.mse_loss(q, td_target)
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        # update actor
        q = self.critic(s, self.actor(s))
        actor_loss = -torch.mean(q)
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()

        # update target network
        self.soft_update(self.critic_target, self.critic)
        self.soft_update(self.actor_target, self.actor)

        # update variance
        self.var = max(self.var * self.var_decay, self.var_min)

    def soft_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
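# The exploration noise above anneals geometrically: after n calls to learn(),
# var = max(variance_start * variance_decay**n, variance_min). The helper below
# (with illustrative default values, not taken from this file) shows how many
# updates such a schedule needs to reach its floor.
import math

def steps_to_min_variance(var_start=1.0, var_decay=0.9995, var_min=0.1):
    # smallest n such that var_start * var_decay**n <= var_min
    return math.ceil(math.log(var_min / var_start) / math.log(var_decay))

print(steps_to_min_variance())  # about 4605 updates with these example values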
import json
import os

import numpy as np
import torch
import torch.nn.functional as F


class DDPG:
    def __init__(self,
                 path,
                 s_dim=3,            # state-space dimension
                 a_dim=1,            # action-space dimension
                 hidden=64,          # hidden-layer width
                 device='cuda',      # training device
                 capacity=2e3,       # replay-buffer capacity
                 batch_size=256,     # training batch size
                 start_lr_step=512,  # number of stored transitions before learning starts
                 gamma=0.9,          # reward discount factor
                 var_init=1.,        # initial exploration variance
                 var_decay=0.9999,   # variance decay factor
                 var_min=0.1,        # minimum variance
                 actor_lr=1e-3,      # actor learning rate
                 critic_lr=3e-4,     # critic learning rate
                 actor_tau=0.1,      # actor soft-update rate
                 critic_tau=0.2,     # critic soft-update rate
                 ):
        # initialize all required parameters
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.hidden = hidden
        # the current test machine has no GPU, so GPU training will be added later
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.capacity = capacity
        self.batch_size = batch_size
        self.start_lr_step = start_lr_step
        self.gamma = gamma
        self.var = var_init
        self.var_decay = var_decay
        self.var_min = var_min
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_tau = actor_tau
        self.critic_tau = critic_tau
        # not used yet
        self.path = path
        self.counter = 0

        # network initialization
        self.actor = Actor(s_dim, a_dim, hidden)
        self.actor_target = Actor(s_dim, a_dim, hidden)
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic = Critic(s_dim, a_dim, hidden)
        self.critic_target = Critic(s_dim, a_dim, hidden)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=self.critic_lr)

        # replay buffer initialization
        self.memory = Memory(capacity, batch_size, self.device)

        # resume from previous results if available
        if not os.listdir(self.path + '/Net'):
            # nothing to resume from
            print('init completed')
            self.actor_target.load_state_dict(self.actor.state_dict())
            self.critic_target.load_state_dict(self.critic.state_dict())
        else:
            # load the previous networks and memory
            print('loading completed')
            self.actor.load_state_dict(torch.load(self.path + '/Net/Actor.pth'))
            self.actor_target.load_state_dict(torch.load(self.path + '/Net/Actor_Target.pth'))
            self.critic.load_state_dict(torch.load(self.path + '/Net/Critic.pth'))
            self.critic_target.load_state_dict(torch.load(self.path + '/Net/Critic_Target.pth'))
            with open(self.path + '/Net/Memory.json', 'r') as f:
                self.memory.memory = json.load(f)
            with open(self.path + '/Net/Counter.json', 'r') as f:
                self.memory.counter = json.load(f)
            with open(self.path + '/Net/Var.json', 'r') as f:
                self.var = json.load(f)

    def choose_action(self, s):
        with torch.no_grad():
            s = torch.tensor(s, dtype=torch.float)
            a = self.actor(s).numpy()
        a = np.clip(np.random.normal(loc=a, scale=self.var), -1., 1.)
        # action: pitch_pos only
        return a

    def store_transition(self, s, a, s_, r, done):
        # store the transition in the replay buffer
        self.memory.store_transition(s, a, s_, r, done)
        if self.memory.counter >= self.start_lr_step:
            s, a, s_, r, done = self.memory.get_sample()
            self._learn(s, a, s_, r, done)

    def store_network(self):
        # print('I stored actor in:', self.path + '/Net/Actor.pth')
        torch.save(self.actor.state_dict(), self.path + '/Net/Actor.pth')
        torch.save(self.actor_target.state_dict(), self.path + '/Net/Actor_Target.pth')
        torch.save(self.critic.state_dict(), self.path + '/Net/Critic.pth')
        torch.save(self.critic_target.state_dict(), self.path + '/Net/Critic_Target.pth')
        with open(self.path + '/Net/Memory.json', 'w') as f:
            json.dump(self.memory.memory, f)
        with open(self.path + '/Net/Counter.json', 'w') as f:
            json.dump(self.memory.counter, f)
        with open(self.path + '/Net/Var.json', 'w') as f:
            json.dump(self.var, f)
        print(self.var, self.memory.counter)

    def _learn(self, s, a, s_, r, done):
        # update critic
        td_target = r + (1 - done) * self.gamma * self.critic_target(s_, self.actor_target(s_))
        q = self.critic(s, a)
        critic_loss = F.mse_loss(q, td_target)
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor
        q = self.critic(s, self.actor(s))
        actor_loss = -torch.mean(q)
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # update target networks (_soft_update is a module-level helper defined elsewhere)
        _soft_update(self.critic_target, self.critic, self.critic_tau)
        _soft_update(self.actor_target, self.actor, self.actor_tau)

        # update variance
        self.var = max(self.var * self.var_decay, self.var_min)