class SAC(algorithms):
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = GaussianPolicy(state_dim, action_dim, 64,
                                    self.env.action_space).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          self.args.lr)

        self.critic_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_1 = optim.Adam(self.critic_1.parameters(),
                                             self.args.lr)
        self.critic_target_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_1.load_state_dict(self.critic_1.state_dict())

        self.critic_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_2 = optim.Adam(self.critic_2.parameters(),
                                             self.args.lr)
        self.critic_target_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_2.load_state_dict(self.critic_2.state_dict())

        self.replay_buffer = ReplayBuffer(self.args.capacity)

        self.global_steps = 0

    def update(self):
        for it in range(self.args.update_iteration):
            # sample from replay buffer
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # get the next action and compute target Q
            with torch.no_grad():
                next_action, log_prob, _ = self.actor.sample(next_state)
                target_Q1 = self.critic_target_1(next_state, next_action)
                target_Q2 = self.critic_target_2(next_state, next_action)
                target_Q = torch.min(target_Q1,
                                     target_Q2) - self.args.alpha * log_prob
                y_Q = reward + self.args.gamma * (1 - done) * target_Q

            # update critic
            current_Q1 = self.critic_1(state, action)
            critic_loss1 = F.mse_loss(current_Q1, y_Q)
            self.critic_optimizer_1.zero_grad()
            critic_loss1.backward()
            self.critic_optimizer_1.step()

            current_Q2 = self.critic_2(state, action)
            critic_loss2 = F.mse_loss(current_Q2, y_Q)
            self.critic_optimizer_2.zero_grad()
            critic_loss2.backward()
            self.critic_optimizer_2.step()

            # update actor
            actor_action, actor_log_prob, _ = self.actor.sample(state)
            Q1 = self.critic_1(state, actor_action)
            Q2 = self.critic_2(state, actor_action)
            actor_loss = -(torch.min(Q1, Q2) -
                           self.args.alpha * actor_log_prob).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target network
            for param, target_param in zip(self.critic_1.parameters(),
                                           self.critic_target_1.parameters()):
                target_param.data.copy_((1 - self.args.tau) *
                                        target_param.data +
                                        self.args.tau * param.data)

            for param, target_param in zip(self.critic_2.parameters(),
                                           self.critic_target_2.parameters()):
                target_param.data.copy_((1 - self.args.tau) *
                                        target_param.data +
                                        self.args.tau * param.data)

    def train(self):
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(
                    torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]
                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                self.replay_buffer.push(
                    (state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print(
                            "Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                            .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    ep_r = 0
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()

        self.save(i + 1)

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            state = self.env.reset()
            done = False
            total_rews = 0
            time_step = 0
            while not done:
                with torch.no_grad():
                    # use the mean action (third return value), as in the DDPG example below
                    _, _, action = self.actor.sample(
                        torch.FloatTensor([state]).to(device))
                    action = action.cpu().detach().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
                time_step += 1

            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)
        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        return rewards.max(), rewards.min(), rewards.mean()

    def save(self, episode):
        file_name = self.weights_file(episode)
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic_1': self.critic_1.state_dict(),
                'critic_2': self.critic_2.state_dict(),
                'critic_target_1': self.critic_target_1.state_dict(),
                'critic_target_2': self.critic_target_2.state_dict()
            }, file_name)
        print("save model to " + file_name)

    def load(self, episode):
        file_name = self.weights_file(episode)
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic_1.load_state_dict(checkpoint['critic_1'])
        self.critic_2.load_state_dict(checkpoint['critic_2'])
        self.critic_target_1.load_state_dict(checkpoint['critic_target_1'])
        self.critic_target_2.load_state_dict(checkpoint['critic_target_2'])
        print("successfully load model from " + file_name)
Example 2
class DDPG(algorithms):
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = DeterministicPolicy(state_dim, action_dim, 64,
                                         self.env.action_space).to(device)
        self.actor_target = DeterministicPolicy(
            state_dim, action_dim, 64, self.env.action_space).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          self.args.lr)

        self.critic = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           self.args.lr)

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.global_steps = 0

        if self.args.last_episode > 0:
            self.load(self.args.last_episode)

    def update(self):
        for it in range(self.args.update_iteration):
            # sample from replay buffer
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # compute the target Q value
            next_action, _, _ = self.actor_target.sample(next_state)
            target_Q = self.critic_target(next_state, next_action)
            target_Q = reward + (
                (1 - done) * self.args.gamma * target_Q).detach()

            # get current Q estimate
            current_Q = self.critic(state, action)

            # compute critic loss and update
            critic_loss = F.mse_loss(current_Q, target_Q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # compute actor loss
            actor_action, _, _ = self.actor.sample(state)
            actor_loss = -self.critic(state, actor_action).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target model
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) *
                                        target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) *
                                        target_param.data)

            self.num_actor_update_iteration += 1
            self.num_critic_update_iteration += 1

    def train(self):
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(
                    torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]

                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                self.replay_buffer.push(
                    (state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print(
                            "Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                            .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()
        self.save(i + 1)

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            total_rews = 0
            time_step = 0
            done = False
            state = self.env.reset()
            while not done:
                with torch.no_grad():
                    # use the mean action
                    _, _, action = self.actor.sample(
                        torch.FloatTensor([state]).to(device))
                    action = action.cpu().detach().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
                time_step += 1

            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)
        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        print("mean reward {}, max reward {}".format(rewards.mean(),
                                                     rewards.max()))

    def load(self, episode=None):
        file_name = self.weights_file(episode)
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])
        print("successfully load model from " + file_name)

    def save(self, episode=None):
        file_name = self.weights_file(episode)
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic': self.critic.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, file_name)
        print("save model to " + file_name)
Example 3
class DDPG():
	def __init__(self, args, env = None):
		self.args = args
		# actor
		self.actor = DeterministicPolicy(128).to(device)
		self.actor_target = DeterministicPolicy(128).to(device)
		self.actor_target.load_state_dict(self.actor.state_dict())
		self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)
		# critics
		self.critic = QNetwork(128).to(device)
		self.critic_target = QNetwork(128).to(device)
		self.critic_target.load_state_dict(self.critic.state_dict())
		self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr)

		self.replay_buffer = ReplayBuffer(self.args.capacity)
		self.num_critic_update_iteration = 0
		self.num_actor_update_iteration = 0
		self.num_training = 0
		self.global_steps = 0

		self.action_scale = torch.FloatTensor([[20, 1]]).to(device)
		self.env = env
		#self.load()

	def update(self):
		for it in range(self.args.update_iteration):
			# sample from replay buffer
			obs, local_goal, next_obs, next_goal, action, reward, done = self.replay_buffer.sample(self.args.batch_size)
			obs = torch.FloatTensor(obs).to(device)
			local_goal = torch.FloatTensor(local_goal).to(device)
			next_obs = torch.FloatTensor(next_obs).to(device)
			next_goal = torch.FloatTensor(next_goal).to(device)
			action = torch.FloatTensor(action).to(device)
			reward = torch.FloatTensor(reward).to(device)
			done = torch.FloatTensor(done).to(device)

			# compute the target Q value
			next_action, _ = self.actor_target.sample(next_obs, next_goal)
			target_Q = self.critic_target(next_obs, next_goal, next_action / self.action_scale)
			target_Q = reward + ((1-done) * self.args.gamma * target_Q).detach()

			# get current Q estimate
			current_Q = self.critic(obs, local_goal, action)

			# compute critic loss and update
			critic_loss = F.mse_loss(current_Q, target_Q)
			self.critic_optimizer.zero_grad()
			critic_loss.backward()
			self.critic_optimizer.step()

			# compute actor loss
			actor_action, _ = self.actor.sample(obs, local_goal)
			actor_loss = -self.critic(obs, local_goal, actor_action / self.action_scale).mean()
			self.actor_optimizer.zero_grad()
			actor_loss.backward()
			self.actor_optimizer.step()

			# update target model 
			for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
				target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data)

			for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
				target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data)

			self.num_actor_update_iteration += 1
			self.num_critic_update_iteration += 1

	def train(self):
		for i in range(self.args.max_episode):
			obs, local_goal = self.env.reset()
			ep_r = 0

			for t in count():
				action, _ = self.actor.sample(torch.FloatTensor(obs).to(device), torch.FloatTensor(local_goal).to(device))
				action = action.cpu().detach().numpy()[0]

				next_obs, next_goal, done, reward = self.env.step(action)
				self.global_steps += 1
				ep_r += reward
				self.replay_buffer.push((obs / 4.0, local_goal / 20., next_obs / 4.0, next_goal / 20., action / np.array([20, 1]), reward, float(done)))
				obs = next_obs
				local_goal = next_goal

				if done or t > self.args.max_length_trajectory:
					if i % self.args.print_log == 0:
						print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}".format(i, ep_r, t, self.global_steps))
						self.evaluate(10, False)
					break

			if len(self.replay_buffer.storage) >= self.args.capacity * 0.2:
				self.update()

		self.save()

	def evaluate(self, number = 1, render = True):
		rewards = []
		for _ in range(number):
			total_rews = 0
			time_step = 0
			done = False
			obs, local_goal = self.env.reset()
			while not done:
				action = self.predict(obs / 4., local_goal / 20.)
				# with torch.no_grad():
				# 	# use the mean action
				# 	_, action = self.actor.sample(torch.FloatTensor(obs).to(device) / 4., torch.FloatTensor(local_goal).to(device) / 20)
				# 	action = action.cpu().detach().numpy()[0]

				obs, local_goal, done, reward = self.env.step(action)
				
				if render:
					self.env.render()
				total_rews += reward
				time_step += 1
				if time_step > self.args.max_length_trajectory:
					break
				#print(str(action) + "  " + str(local_goal))
				if done:
					break

			rewards.append(total_rews)
		rewards = np.array(rewards)
		print("mean reward {}, max reward {}, min reward {}".format(rewards.mean(), rewards.max(), rewards.min()))

	def predict(self, obs, local_goal):
		with torch.no_grad():
			action = self.actor.forward(torch.FloatTensor(obs).to(device), torch.FloatTensor(local_goal).to(device))
		action = action.cpu().detach().numpy()[0]
		return action

	def load(self, episode = None):
		file_name = "weights/DDPG.pt"
		checkpoint = torch.load(file_name)
		self.actor.load_state_dict(checkpoint['actor'])
		self.actor_target.load_state_dict(checkpoint['actor_target'])
		self.critic.load_state_dict(checkpoint['critic'])
		self.critic_target.load_state_dict(checkpoint['critic_target'])
		print("successfully load model from " + file_name)

	def save(self, episode = None):
		file_name = "weights/DDPG.pt"
		torch.save({'actor' : self.actor.state_dict(),
					'critic' : self.critic.state_dict(),
					'actor_target' : self.actor_target.state_dict(),
					'critic_target' : self.critic_target.state_dict()}, file_name)
		print("save model to " + file_name)
Example 4
class Agents():
    def __init__(self, args):
        self.args = args
        self.policy = [Q_net(args) for _ in range(args.n_agents)]
        self.hyperNet = HyperNet(args)
        self.policy_target = [copy.deepcopy(p) for p in self.policy]
        self.hyperNet_target = copy.deepcopy(self.hyperNet)
        self.replayBuffer = ReplayBuffer(args)
        self.preference_pool = Preference(args)
        policy_param = [policy.parameters() for policy in self.policy]
        self.optim = torch.optim.Adam(itertools.chain(
            *policy_param, self.hyperNet.parameters()),
                                      lr=self.args.learning_rate)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optim,
                                                            step_size=100,
                                                            gamma=0.9,
                                                            last_epoch=-1)
        self.step = 0

    def choose_action(self, obs, preference, epsilon):
        obs = np.array(obs).transpose((1, 0, 2))
        preference = np.array(preference).transpose((1, 0, 2))
        act = np.array([
            self.policy[i].choose_action(obs[i], preference[i], epsilon)
            for i in range(self.args.n_agents)
        ])

        return act.transpose((2, 0, 1))

    def learn(self):
        def combine(obs, pref):
            ow = []
            n_pref = len(pref)
            for w in range(n_pref):
                ow.append(
                    torch.cat([obs,
                               pref[w]]).unsqueeze(0).to(self.args.device))
            ow = torch.cat(ow, dim=0)
            return ow.unsqueeze(0)

        sample = self.replayBuffer.sample(self.args.batch_size)
        batch_w = self.preference_pool.sample(self.args.batch_size_p,
                                              train=True)
        obs = sample["obs"]
        obs_ = sample["next_obs"]
        act = sample["act"]
        rew = sample["rew"]
        state = sample["state"]
        state_ = sample["next_state"]
        Q_ = []
        ####################################################################
        for i in range(self.args.batch_size):
            Q_.append([])
            for j in range(self.args.batch_size_p):
                Q_[i].append(
                    torch.cat([
                        combine(obs_[a][i], batch_w[a])
                        for a in range(self.args.n_agents)
                    ],
                              dim=0).unsqueeze(0))
            Q_[i] = torch.cat(Q_[i], dim=0)
        Q_ = torch.cat(Q_, dim=0).permute(1, 0, 2, 3)
        ####################################################################
        Q_ = torch.cat([
            self.policy[a].get_target_q(Q_[a], batch_w[a][0]).unsqueeze(0)
            for a in range(self.args.n_agents)
        ],
                       dim=0)
        Q_ = Q_.squeeze(-1).permute(2, 0, 1).view(-1, self.args.n_agents * 3)
        obs = [
            torch.cat([obs[i] for _ in range(self.args.batch_size_p)])
            for i in range(self.args.n_agents)
        ]
        w = copy.deepcopy(batch_w[0])
        batch_w = [
            batch_w[i].data.cpu().numpy().repeat(self.args.batch_size, axis=0)
            for i in range(self.args.n_agents)
        ]
        Q = torch.cat([
            self.policy[i].get_q(obs[i], batch_w[i], act[i])
            for i in range(self.args.n_agents)
        ],
                      dim=-1)
        Q_tot = self.hyperNet.get_Q_tot(state, w, Q)
        Q_tot_target = self.hyperNet_target.get_Q_tot(state_, w, Q_).detach()
        rew = rew.unsqueeze(0).repeat([self.args.batch_size_p, 1,
                                       1]).view(-1, self.args.n_obj)
        loss = self.loss_func(Q_tot, Q_tot_target, rew, w)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        self.lr_scheduler.step()
        # print("learning rate:", self.optim)

    def loss_func(self, Q, Q_target, R, w):
        R = self.convert_type(R)
        w = self.convert_type(w)
        y = R + Q_target
        w = w.repeat([self.args.batch_size, 1]).view(-1, self.args.n_obj)
        # La: mean L2 distance between the target vector y and the predicted Q vector
        La = torch.norm(y - Q, p=2, dim=-1).mean()
        # Lb: mean absolute gap between the preference-weighted scalarizations w^T y and w^T Q
        wy = torch.bmm(w.unsqueeze(1), y.unsqueeze(-1))
        wq = torch.bmm(w.unsqueeze(1), Q.unsqueeze(-1))
        Lb = torch.abs(wy - wq).mean()
        # only La is used below; the combined loss La + Lb is left disabled
        # loss = La + Lb
        loss = La
        return loss

    def push(self, traj):
        self.replayBuffer.push(traj["obs"], traj["acts"], traj["rew"],
                               traj["next_obs"], traj["done"], traj["state"],
                               traj["next_state"], traj["pref"])

    def update_target(self):
        self.step += 1
        if self.step % 1000 == 0:
            print("updating target nets")
            self.hyperNet_target.load_state_dict(self.hyperNet.state_dict())
            for i in range(len(self.policy)):
                self.policy_target[i].load_state_dict(
                    self.policy[i].state_dict())

    def convert_type(self, input):
        if not isinstance(input, torch.Tensor):
            input = torch.Tensor(input)
        if input.device != torch.device(self.args.device):
            input = input.to(self.args.device)

        return input

    def save_model(self, ep, path='./model/MOQMIX/'):
        print("saving model")
        state = {}
        for i in range(len(self.policy)):
            state['policy{0}'.format(i)] = self.policy[i].state_dict()
            state['target_policy{0}'.format(
                i)] = self.policy_target[i].state_dict()
        state['hyperNet'] = self.hyperNet.state_dict()
        state['target_hyperNet'] = self.hyperNet_target.state_dict()
        state['optim'] = self.optim.state_dict()
        state['lr_scheduler'] = self.lr_scheduler.state_dict()
        state['epoch'] = ep
        torch.save(state, path + "model.pth")

    def load_model(self, path='./model/MOQMIX/', device='cpu'):
        state = torch.load(path + "model.pth", map_location=device)
        for i in range(len(self.policy)):
            self.policy[i].load_state_dict(state['policy{0}'.format(i)])
            self.policy_target[i].load_state_dict(
                state['target_policy{0}'.format(i)])
        self.hyperNet.load_state_dict(state['hyperNet'])
        self.hyperNet_target.load_state_dict(state['target_hyperNet'])
        self.optim.load_state_dict(state['optim'])
        self.lr_scheduler.load_state_dict(state['lr_scheduler'])
        return state['epoch']
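To make the two terms in loss_func concrete, the following self-contained snippet evaluates them on dummy tensors (shapes chosen only for illustration): La penalizes the vector distance between the target y = R + Q_target and Q, while Lb, computed but left disabled above, penalizes the gap between the preference-weighted scalarizations w^T y and w^T Q.

import torch

torch.manual_seed(0)
batch, n_obj = 4, 2                     # illustrative sizes
Q = torch.rand(batch, n_obj)            # multi-objective Q estimates
y = torch.rand(batch, n_obj)            # targets R + Q_target
w = torch.rand(batch, n_obj)            # one preference vector per sample

La = torch.norm(y - Q, p=2, dim=-1).mean()
wy = torch.bmm(w.unsqueeze(1), y.unsqueeze(-1))   # batched dot product w^T y -> (batch, 1, 1)
wq = torch.bmm(w.unsqueeze(1), Q.unsqueeze(-1))   # batched dot product w^T Q
Lb = torch.abs(wy - wq).mean()
print(La.item(), Lb.item())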