import copy
import itertools
import pickle
from itertools import count

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# GaussianPolicy, DeterministicPolicy, QNetwork, ReplayBuffer, Q_net,
# HyperNet, Preference, and the `algorithms` base class are defined
# elsewhere in this repo.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class SAC(algorithms):
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = GaussianPolicy(state_dim, action_dim, 64,
                                    self.env.action_space).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)

        self.critic_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_1 = optim.Adam(self.critic_1.parameters(), self.args.lr)
        self.critic_target_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_1.load_state_dict(self.critic_1.state_dict())

        self.critic_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_2 = optim.Adam(self.critic_2.parameters(), self.args.lr)
        self.critic_target_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_2.load_state_dict(self.critic_2.state_dict())

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.global_steps = 0

    def update(self):
        for it in range(self.args.update_iteration):
            # sample a batch of transitions from the replay buffer
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # sample the next action and compute the entropy-regularized target Q
            with torch.no_grad():
                next_action, log_prob, _ = self.actor.sample(next_state)
                target_Q1 = self.critic_target_1(next_state, next_action)
                target_Q2 = self.critic_target_2(next_state, next_action)
                target_Q = torch.min(target_Q1, target_Q2) - self.args.alpha * log_prob
                y_Q = reward + self.args.gamma * (1 - done) * target_Q

            # update both critics toward the shared target
            current_Q1 = self.critic_1(state, action)
            critic_loss1 = F.mse_loss(current_Q1, y_Q)
            self.critic_optimizer_1.zero_grad()
            critic_loss1.backward()
            self.critic_optimizer_1.step()

            current_Q2 = self.critic_2(state, action)
            critic_loss2 = F.mse_loss(current_Q2, y_Q)
            self.critic_optimizer_2.zero_grad()
            critic_loss2.backward()
            self.critic_optimizer_2.step()

            # update the actor to maximize the entropy-regularized minimum Q
            actor_action, actor_log_prob, _ = self.actor.sample(state)
            Q1 = self.critic_1(state, actor_action)
            Q2 = self.critic_2(state, actor_action)
            actor_loss = -(torch.min(Q1, Q2) - self.args.alpha * actor_log_prob).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # soft-update the target networks
            for param, target_param in zip(self.critic_1.parameters(),
                                           self.critic_target_1.parameters()):
                target_param.data.copy_((1 - self.args.tau) * target_param.data +
                                        self.args.tau * param.data)
            for param, target_param in zip(self.critic_2.parameters(),
                                           self.critic_target_2.parameters()):
                target_param.data.copy_((1 - self.args.tau) * target_param.data +
                                        self.args.tau * param.data)

    def train(self):
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(
                    torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]
                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                self.replay_buffer.push(
                    (state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                              .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    ep_r = 0
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()
                self.save(i + 1)

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            state = self.env.reset()
            done = False
            total_rews = 0
            time_step = 0
            while not done:
                with torch.no_grad():
                    # use the deterministic mean action for evaluation
                    _, _, action = self.actor.sample(
                        torch.FloatTensor([state]).to(device))
                    action = action.cpu().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
                time_step += 1
            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)
        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        return rewards.max(), rewards.min(), rewards.mean()

    def save(self, episode):
        file_name = self.weights_file(episode)
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic_1': self.critic_1.state_dict(),
                'critic_2': self.critic_2.state_dict(),
                'critic_target_1': self.critic_target_1.state_dict(),
                'critic_target_2': self.critic_target_2.state_dict()
            }, file_name)
        print("save model to " + file_name)

    def load(self, episode):
        file_name = self.weights_file(episode)
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic_1.load_state_dict(checkpoint['critic_1'])
        self.critic_2.load_state_dict(checkpoint['critic_2'])
        self.critic_target_1.load_state_dict(checkpoint['critic_target_1'])
        self.critic_target_2.load_state_dict(checkpoint['critic_target_2'])
        print("successfully load model from " + file_name)
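# The SAC class above relies on a GaussianPolicy whose sample() returns
# (action, log_prob, mean). The actual policy is defined elsewhere in this
# repo; the class below is only a minimal sketch of that conventional SAC
# interface: a tanh-squashed Gaussian with the change-of-variables
# correction on the log-probability, rescaled to the environment's action
# bounds. Hidden layout and the log-std clamp range are illustrative
# assumptions, not the repo's actual values.
import torch.nn as nn


class GaussianPolicySketch(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim, action_space):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, hidden_dim), nn.ReLU(),
                                 nn.Linear(hidden_dim, hidden_dim), nn.ReLU())
        self.mean_head = nn.Linear(hidden_dim, action_dim)
        self.log_std_head = nn.Linear(hidden_dim, action_dim)
        # rescale the tanh output from [-1, 1] to [low, high]
        self.register_buffer(
            'scale', torch.FloatTensor((action_space.high - action_space.low) / 2.0))
        self.register_buffer(
            'bias', torch.FloatTensor((action_space.high + action_space.low) / 2.0))

    def sample(self, state):
        h = self.net(state)
        mean = self.mean_head(h)
        log_std = self.log_std_head(h).clamp(-20, 2)
        dist = torch.distributions.Normal(mean, log_std.exp())
        x = dist.rsample()   # reparameterized sample, keeps gradients
        y = torch.tanh(x)    # squash to [-1, 1]
        action = y * self.scale + self.bias
        # tanh change-of-variables correction, summed over action dimensions
        log_prob = dist.log_prob(x) - torch.log(self.scale * (1 - y.pow(2)) + 1e-6)
        log_prob = log_prob.sum(dim=1, keepdim=True)
        mean_action = torch.tanh(mean) * self.scale + self.bias
        return action, log_prob, mean_action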
class DDPG(algorithms):
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = DeterministicPolicy(state_dim, action_dim, 64,
                                         self.env.action_space).to(device)
        self.actor_target = DeterministicPolicy(state_dim, action_dim, 64,
                                                self.env.action_space).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)

        self.critic = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr)

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.global_steps = 0

        if self.args.last_episode > 0:
            self.load(self.args.last_episode)

    def update(self):
        for it in range(self.args.update_iteration):
            # sample a batch of transitions from the replay buffer
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # compute the target Q value
            next_action, _, _ = self.actor_target.sample(next_state)
            target_Q = self.critic_target(next_state, next_action)
            target_Q = reward + ((1 - done) * self.args.gamma * target_Q).detach()

            # get the current Q estimate
            current_Q = self.critic(state, action)

            # compute the critic loss and update the critic
            critic_loss = F.mse_loss(current_Q, target_Q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # compute the actor loss and update the actor
            actor_action, _, _ = self.actor.sample(state)
            actor_loss = -self.critic(state, actor_action).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # soft-update the target networks
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) * target_param.data)
            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) * target_param.data)

            self.num_actor_update_iteration += 1
            self.num_critic_update_iteration += 1

    def train(self):
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(
                    torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]
                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                self.replay_buffer.push(
                    (state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                              .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()
                self.save(i + 1)

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            total_rews = 0
            time_step = 0
            done = False
            state = self.env.reset()
            while not done:
                with torch.no_grad():
                    # use the deterministic mean action for evaluation
                    _, _, action = self.actor.sample(
                        torch.FloatTensor([state]).to(device))
                    action = action.cpu().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
                time_step += 1
            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)
        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        print("mean reward {}, max reward {}".format(rewards.mean(), rewards.max()))

    def load(self, episode=None):
        file_name = self.weights_file(episode)
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])
        print("successfully load model from " + file_name)

    def save(self, episode=None):
        file_name = self.weights_file(episode)
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic': self.critic.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, file_name)
        print("save model to " + file_name)
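# Both SAC and DDPG above assume a ReplayBuffer exposing a .storage list,
# .push(transition), and .sample(batch_size) returning per-field arrays
# (x, y, u, r, d) = (state, next_state, action, reward, done). The real
# buffer is defined elsewhere in the repo; the class below is a minimal
# sketch of that interface using a fixed-capacity ring buffer.
import random


class ReplayBufferSketch:
    def __init__(self, capacity):
        self.capacity = capacity
        self.storage = []
        self.ptr = 0

    def push(self, transition):
        # append until full, then overwrite the oldest transition
        if len(self.storage) < self.capacity:
            self.storage.append(transition)
        else:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.storage, batch_size)
        x, y, u, r, d = [], [], [], [], []
        for state, next_state, action, reward, done in batch:
            x.append(np.array(state, copy=False))
            y.append(np.array(next_state, copy=False))
            u.append(np.array(action, copy=False))
            r.append(reward)
            d.append(done)
        # rewards and dones get a trailing axis so they broadcast
        # against (batch, 1) Q-value tensors
        return (np.array(x), np.array(y), np.array(u),
                np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1))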
# Goal-conditioned DDPG variant (kept in a separate module; note that the
# class name shadows the DDPG class above if both are imported together).
class DDPG():
    def __init__(self, args, env=None):
        self.args = args

        # actor
        self.actor = DeterministicPolicy(128).to(device)
        self.actor_target = DeterministicPolicy(128).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)

        # critics
        self.critic = QNetwork(128).to(device)
        self.critic_target = QNetwork(128).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr)

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.global_steps = 0
        self.action_scale = torch.FloatTensor([[20, 1]]).to(device)
        self.env = env
        # self.load()

    def update(self):
        for it in range(self.args.update_iteration):
            # sample a batch of transitions from the replay buffer
            obs, local_goal, next_obs, next_goal, action, reward, done = \
                self.replay_buffer.sample(self.args.batch_size)
            obs = torch.FloatTensor(obs).to(device)
            local_goal = torch.FloatTensor(local_goal).to(device)
            next_obs = torch.FloatTensor(next_obs).to(device)
            next_goal = torch.FloatTensor(next_goal).to(device)
            action = torch.FloatTensor(action).to(device)
            reward = torch.FloatTensor(reward).to(device)
            done = torch.FloatTensor(done).to(device)

            # compute the target Q value; actions are normalized by
            # action_scale before being fed to the critic
            next_action, _ = self.actor_target.sample(next_obs, next_goal)
            target_Q = self.critic_target(next_obs, next_goal,
                                          next_action / self.action_scale)
            target_Q = reward + ((1 - done) * self.args.gamma * target_Q).detach()

            # get the current Q estimate
            current_Q = self.critic(obs, local_goal, action)

            # compute the critic loss and update the critic
            critic_loss = F.mse_loss(current_Q, target_Q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # compute the actor loss and update the actor
            actor_action, _ = self.actor.sample(obs, local_goal)
            actor_loss = -self.critic(obs, local_goal,
                                      actor_action / self.action_scale).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # soft-update the target networks
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) * target_param.data)
            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) * target_param.data)

            self.num_actor_update_iteration += 1
            self.num_critic_update_iteration += 1

    def train(self):
        for i in range(self.args.max_episode):
            obs, local_goal = self.env.reset()
            ep_r = 0
            for t in count():
                action, _ = self.actor.sample(
                    torch.FloatTensor(obs).to(device),
                    torch.FloatTensor(local_goal).to(device))
                action = action.cpu().detach().numpy()[0]
                next_obs, next_goal, done, reward = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                # observations, goals, and actions are normalized before storage
                self.replay_buffer.push(
                    (obs / 4.0, local_goal / 20.0, next_obs / 4.0,
                     next_goal / 20.0, action / np.array([20, 1]), reward,
                     float(done)))
                obs = next_obs
                local_goal = next_goal

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                              .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    break

            # start updating once the buffer is 20% full
            if len(self.replay_buffer.storage) >= self.args.capacity * 0.2:
                self.update()
                self.save()

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            total_rews = 0
            time_step = 0
            done = False
            obs, local_goal = self.env.reset()
            while not done:
                action = self.predict(obs / 4.0, local_goal / 20.0)
                obs, local_goal, done, reward = self.env.step(action)
                if render:
                    self.env.render()
                total_rews += reward
                time_step += 1
                if time_step > self.args.max_length_trajectory:
                    break
            rewards.append(total_rews)
        rewards = np.array(rewards)
        print("mean reward {}, max reward {}, min reward {}".format(
            rewards.mean(), rewards.max(), rewards.min()))

    def predict(self, obs, local_goal):
        with torch.no_grad():
            action = self.actor.forward(
                torch.FloatTensor(obs).to(device),
                torch.FloatTensor(local_goal).to(device))
            action = action.cpu().numpy()[0]
        return action

    def load(self, episode=None):
        file_name = "weights/DDPG.pt"
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])
        print("successfully load model from " + file_name)

    def save(self, episode=None):
        file_name = "weights/DDPG.pt"
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic': self.critic.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, file_name)
        print("save model to " + file_name)
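# A hypothetical driver for the goal-conditioned DDPG above, showing which
# hyperparameters the class reads off `args`. All field values and the
# `NavEnv` environment name are illustrative assumptions, not part of this
# repo; substitute your own environment exposing the (obs, local_goal)
# reset/step interface used by train().
from types import SimpleNamespace

if __name__ == "__main__":
    args = SimpleNamespace(
        lr=1e-3,                    # Adam learning rate for actor and critic
        capacity=100000,            # replay buffer size
        batch_size=128,             # transitions per gradient step
        gamma=0.99,                 # discount factor
        tau=0.005,                  # soft target-update rate
        update_iteration=10,        # gradient steps per update() call
        max_episode=1000,           # training episodes
        max_length_trajectory=500,  # step cap per episode
        print_log=10,               # evaluate/log every N episodes
    )
    agent = DDPG(args, env=NavEnv())  # NavEnv is a hypothetical stand-in
    agent.train()
    agent.evaluate(number=5, render=True)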
class Agents():
    def __init__(self, args):
        self.args = args
        self.policy = [Q_net(args) for _ in range(args.n_agents)]
        self.hyperNet = HyperNet(args)
        self.policy_target = [copy.deepcopy(p) for p in self.policy]
        self.hyperNet_target = copy.deepcopy(self.hyperNet)
        self.replayBuffer = ReplayBuffer(args)
        self.preference_pool = Preference(args)

        # a single optimizer over all per-agent policies plus the hypernetwork
        policy_param = [policy.parameters() for policy in self.policy]
        self.optim = torch.optim.Adam(itertools.chain(*policy_param,
                                                      self.hyperNet.parameters()),
                                      lr=self.args.learning_rate)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optim,
                                                            step_size=100,
                                                            gamma=0.9,
                                                            last_epoch=-1)
        self.step = 0

    def choose_action(self, obs, preference, epsilon):
        obs = np.array(obs).transpose((1, 0, 2))
        preference = np.array(preference).transpose((1, 0, 2))
        act = np.array([
            self.policy[i].choose_action(obs[i], preference[i], epsilon)
            for i in range(self.args.n_agents)
        ])
        return act.transpose((2, 0, 1))

    def learn(self):
        def combine(obs, pref):
            # concatenate one observation with each preference vector
            ow = []
            n_pref = len(pref)
            for w in range(n_pref):
                ow.append(torch.cat([obs, pref[w]]).unsqueeze(0).to(self.args.device))
            ow = torch.cat(ow, dim=0)
            return ow.unsqueeze(0)

        sample = self.replayBuffer.sample(self.args.batch_size)
        batch_w = self.preference_pool.sample(self.args.batch_size_p, train=True)
        obs = sample["obs"]
        obs_ = sample["next_obs"]
        act = sample["act"]
        rew = sample["rew"]
        state = sample["state"]
        state_ = sample["next_state"]

        # build (next_obs, preference) inputs for every batch element,
        # sampled preference, and agent
        Q_ = []
        for i in range(self.args.batch_size):
            Q_.append([])
            for j in range(self.args.batch_size_p):
                Q_[i].append(
                    torch.cat([
                        combine(obs_[a][i], batch_w[a])
                        for a in range(self.args.n_agents)
                    ], dim=0).unsqueeze(0))
            Q_[i] = torch.cat(Q_[i], dim=0)
        Q_ = torch.cat(Q_, dim=0).permute(1, 0, 2, 3)

        # per-agent target Q values
        Q_ = torch.cat([
            self.policy[a].get_target_q(Q_[a], batch_w[a][0]).unsqueeze(0)
            for a in range(self.args.n_agents)
        ], dim=0)
        Q_ = Q_.squeeze(-1).permute(2, 0, 1).view(-1, self.args.n_agents * 3)

        # tile observations and preferences so every (transition, preference)
        # pair appears once in the flattened batch
        obs = [
            torch.cat([obs[i] for _ in range(self.args.batch_size_p)])
            for i in range(self.args.n_agents)
        ]
        w = copy.deepcopy(batch_w[0])
        batch_w = [
            batch_w[i].data.cpu().numpy().repeat(self.args.batch_size, axis=0)
            for i in range(self.args.n_agents)
        ]
        Q = torch.cat([
            self.policy[i].get_q(obs[i], batch_w[i], act[i])
            for i in range(self.args.n_agents)
        ], dim=-1)

        # mix per-agent utilities into Q_tot and form the TD target
        Q_tot = self.hyperNet.get_Q_tot(state, w, Q)
        Q_tot_target = self.hyperNet_target.get_Q_tot(state_, w, Q_).detach()
        rew = rew.unsqueeze(0).repeat([self.args.batch_size_p, 1, 1]).view(-1, self.args.n_obj)

        loss = self.loss_func(Q_tot, Q_tot_target, rew, w)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        self.lr_scheduler.step()

    def loss_func(self, Q, Q_target, R, w):
        R = self.convert_type(R)
        w = self.convert_type(w)
        y = R + Q_target
        w = w.repeat([self.args.batch_size, 1]).view(-1, self.args.n_obj)
        # La: L2 distance between the vector-valued target y and Q
        La = torch.norm(y - Q, p=2, dim=-1).mean()
        # Lb: gap between the scalarized target w.y and scalarized w.Q
        wy = torch.bmm(w.unsqueeze(1), y.unsqueeze(-1))
        wq = torch.bmm(w.unsqueeze(1), Q.unsqueeze(-1))
        Lb = torch.abs(wy - wq).mean()
        # only the vector loss is used; the combined form is kept for reference
        # loss = La + Lb
        loss = La
        return loss

    def push(self, traj):
        self.replayBuffer.push(traj["obs"], traj["acts"], traj["rew"],
                               traj["next_obs"], traj["done"], traj["state"],
                               traj["next_state"], traj["pref"])

    def update_target(self):
        self.step += 1
        if self.step % 1000 == 0:
            print("updating target nets")
            self.hyperNet_target.load_state_dict(self.hyperNet.state_dict())
            for i in range(len(self.policy)):
                self.policy_target[i].load_state_dict(self.policy[i].state_dict())

    def convert_type(self, input):
        if not isinstance(input, torch.Tensor):
            input = torch.Tensor(input)
        if input.device != torch.device(self.args.device):
            input = input.to(self.args.device)
        return input

    def save_model(self, ep, path='./model/MOQMIX/'):
        print("saving model")
        state = {}
        for i in range(len(self.policy)):
            state['policy{0}'.format(i)] = self.policy[i].state_dict()
            state['target_policy{0}'.format(i)] = self.policy_target[i].state_dict()
        state['hyperNet'] = self.hyperNet.state_dict()
        state['target_hyperNet'] = self.hyperNet_target.state_dict()
        state['optim'] = self.optim.state_dict()
        state['lr_scheduler'] = self.lr_scheduler.state_dict()
        state['epoch'] = ep
        torch.save(state, path + "model.pth")

    def load_model(self, path='./model/MOQMIX/', device='cpu'):
        state = torch.load(path + "model.pth", map_location=device)
        for i in range(len(self.policy)):
            self.policy[i].load_state_dict(state['policy{0}'.format(i)])
            self.policy_target[i].load_state_dict(state['target_policy{0}'.format(i)])
        self.hyperNet.load_state_dict(state['hyperNet'])
        self.hyperNet_target.load_state_dict(state['target_hyperNet'])
        self.optim.load_state_dict(state['optim'])
        self.lr_scheduler.load_state_dict(state['lr_scheduler'])
        return state['epoch']
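# A toy check of the two terms in Agents.loss_func above: La is the L2
# distance between the vector-valued TD target y = R + Q_target and Q,
# while Lb compares their scalarizations under the preference weights w.
# Shapes follow the code above ((batch, n_obj) tensors); the random
# numbers are made up purely to exercise the shapes.
if __name__ == "__main__":
    batch, n_obj = 4, 2
    Q = torch.randn(batch, n_obj)
    y = torch.randn(batch, n_obj)           # stand-in for R + Q_target
    w = torch.rand(batch, n_obj)
    w = w / w.sum(dim=-1, keepdim=True)     # normalize preference weights

    La = torch.norm(y - Q, p=2, dim=-1).mean()
    wy = torch.bmm(w.unsqueeze(1), y.unsqueeze(-1))
    wq = torch.bmm(w.unsqueeze(1), Q.unsqueeze(-1))
    Lb = torch.abs(wy - wq).mean()
    print("La (vector loss): {:.4f}, Lb (scalarized gap): {:.4f}".format(
        La.item(), Lb.item()))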