Code example #1
File: train.py  Project: yinjiangjin/MERL
def evaluate(weights, args, NUM_EVALS=10, render=False):
	"""Rollout Worker runs a simulation in the environment to generate experiences and fitness values

		Parameters:
			args (object): Parameter class
			id (int): Specific Id unique to each worker spun
			task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
			result_pipe (pipe): Sender end of the pipe used to report back results
			data_bucket (shared list object): A shared list object managed by a manager that is used to store experience tuples
			models_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
			store_transition (bool): Log experiences to exp_list?
			random_baseline (bool): Test with random action for baseline purpose?

		Returns:
			None
	"""
	env = RoverDomainPython(args, NUM_EVALS)
	model = MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents)

	# Load the candidate weights into the model (weights may be raw tensors or nn.Parameters)
	for i, param in enumerate(model.parameters()):
		try:
			param.data = weights[i]
		except Exception:
			param.data = weights[i].data



	fitness = [None for _ in range(NUM_EVALS)]
	frame = 0
	joint_state = env.reset()
	joint_state = utils.to_tensor(np.array(joint_state))

	while True: #unless done

		joint_action = [model.clean_action(joint_state[i, :], head=i).detach().numpy() for i in range(args.config.num_agents)]

		#JOINT ACTION [agent_id, universe_id, action]
		#Bound Action
		joint_action = np.array(joint_action).clip(-1.0, 1.0)
		next_state, reward, done, global_reward = env.step(joint_action)  # Simulate one step in environment
		#State --> [agent_id, universe_id, obs]
		#reward --> [agent_id, universe_id]
		#done --> [universe_id]
		#info --> [universe_id]

		next_state = utils.to_tensor(np.array(next_state))

		#Grab global reward as fitnesses
		for i, grew in enumerate(global_reward):
			if grew is not None:
				fitness[i] = grew


		joint_state = next_state
		frame+=NUM_EVALS

		#DONE flag received in every universe
		if sum(done) == len(done):
			break

	return sum(fitness)/len(fitness)
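
A minimal usage sketch for the evaluate() rollout above. It assumes this train.py's Parameters, MultiHeadActor and evaluate are in scope; the weight list simply mirrors model.parameters(), and the call is illustrative rather than the project's actual entry point.

# Hedged usage sketch: build a candidate weight list and score it with evaluate()
args = Parameters()                                       # hyper-parameters (state/action dims, num_agents, ...)
model = MultiHeadActor(args.state_dim, args.action_dim,
                       args.hidden_size, args.config.num_agents)
candidate_weights = [p.data.clone() for p in model.parameters()]  # one tensor per parameter, in order
avg_fitness = evaluate(candidate_weights, args, NUM_EVALS=10)     # mean global reward over 10 parallel universes
print('average fitness:', avg_fitness)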
Code example #2
    def __init__(self, args, id):
        self.args = args
        self.id = id

        #### Rollout Actor is a template used for MP #####
        self.manager = Manager()
        self.rollout_actor = self.manager.list()
        for _ in range(args.config.num_agents):

            if args.ps == 'trunk':
                self.rollout_actor.append(
                    MultiHeadActor(args.state_dim, args.action_dim,
                                   args.hidden_size, args.config.num_agents))
            else:
                if args.algo_name == 'TD3':
                    self.rollout_actor.append(
                        Actor(args.state_dim,
                              args.action_dim,
                              args.hidden_size,
                              policy_type='DeterministicPolicy'))
                else:
                    self.rollout_actor.append(
                        Actor(args.state_dim,
                              args.action_dim,
                              args.hidden_size,
                              policy_type='GaussianPolicy'))

            if self.args.ps == 'full' or self.args.ps == 'trunk':
                break  #Only need one for homogeneous workloads
Code example #3
	def __init__(self, args, id):
		self.args = args
		self.id = id

		#### Rollout Actor is a template used for MP #####
		self.manager = Manager()
		self.rollout_actor = self.manager.list()
		self.rollout_actor.append(MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents))
Code example #4
File: off_policy_algo.py  Project: wsg1873/MERL
	def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, num_agents, init_w = True):

		self.algo_name = algo_name
		self.gamma = gamma
		self.tau = tau
		self.total_update = 0
		self.agent_id = id
		self.use_gpu = use_gpu
		self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag], '.csv', save_iteration=1000, conv_size=1000)
		self.num_agents = num_agents

		#Initialize actors
		self.policy = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		if init_w: self.policy.apply(utils.init_weights)
		self.policy_target = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		utils.hard_update(self.policy_target, self.policy)
		self.policy_optim = Adam(self.policy.parameters(), actor_lr)


		self.critics = [QNetwork(state_dim*num_agents, action_dim*num_agents, hidden_size*3) for _ in range(num_agents)]

		self.critics_target = [QNetwork(state_dim*num_agents, action_dim*num_agents, hidden_size*3) for _ in range(num_agents)]
		if init_w:
			for critic, critic_target in zip(self.critics, self.critics_target):
				critic.apply(utils.init_weights)
				utils.hard_update(critic_target, critic)
		self.critic_optims = [Adam(critic.parameters(), critic_lr) for critic in self.critics]


		self.loss = nn.MSELoss()

		if use_gpu:
			self.policy_target.cuda(); self.policy.cuda()
			for critic, critic_target in zip(self.critics, self.critics_target):
				critic.cuda()
				critic_target.cuda()


		self.num_critic_updates = 0

		#Statistics Tracker
		#self.action_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q = {'min':None, 'max': None, 'mean':None, 'std':None}
Code example #5
	def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, num_agents, init_w = True):

		self.algo_name = algo_name
		self.gamma = gamma
		self.tau = tau
		self.total_update = 0
		self.agent_id = id
		self.actualize = actualize
		self.use_gpu = use_gpu
		self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag, 'alz_score'+savetag,'alz_policy'+savetag], '.csv', save_iteration=1000, conv_size=1000)

		#Initialize actors
		self.policy = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		if init_w: self.policy.apply(utils.init_weights)
		self.policy_target = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		utils.hard_update(self.policy_target, self.policy)
		self.policy_optim = Adam(self.policy.parameters(), actor_lr)


		self.critic = QNetwork(state_dim, action_dim,hidden_size)
		if init_w: self.critic.apply(utils.init_weights)
		self.critic_target = QNetwork(state_dim, action_dim, hidden_size)
		utils.hard_update(self.critic_target, self.critic)
		self.critic_optim = Adam(self.critic.parameters(), critic_lr)

		if actualize:
			self.ANetwork = ActualizationNetwork(state_dim, action_dim, hidden_size)
			if init_w: self.ANetwork.apply(utils.init_weights)
			self.actualize_optim = Adam(self.ANetwork.parameters(), critic_lr)
			self.actualize_lr = 0.2
			if use_gpu: self.ANetwork.cuda()

		self.loss = nn.MSELoss()

		if use_gpu:
			self.policy_target.cuda(); self.critic_target.cuda(); self.policy.cuda(); self.critic.cuda()
		self.num_critic_updates = 0

		#Statistics Tracker
		#self.action_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.alz_score = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.alz_policy = {'min':None, 'max': None, 'mean':None, 'std':None}
Code example #6
	def __init__(self, args, id):
		self.args = args
		self.id = id

		###Initialize neuroevolution module###
		self.evolver = SSNE(self.args)

		########Initialize population
		self.manager = Manager()
		self.popn = self.manager.list()
		for _ in range(args.popn_size):
			self.popn.append(MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents))
			self.popn[-1].eval()

		#### INITIALIZE PG ALGO #####
		if self.args.is_matd3 or args.is_maddpg:
			algo_name = 'TD3' if self.args.is_matd3 else 'DDPG'
			self.algo = MATD3(id, algo_name, args.state_dim, args.action_dim, args.hidden_size, args.actor_lr,
			                args.critic_lr, args.gamma, args.tau, args.savetag, args.aux_save, 
			                args.use_gpu, args.config.num_agents, args.init_w)

		else:
			self.algo = MultiTD3(id, 'TD3', args.state_dim, args.action_dim, args.hidden_size, args.actor_lr,
			                args.critic_lr, args.gamma, args.tau, args.savetag, args.aux_save,
			                args.use_gpu, args.config.num_agents, args.init_w)


		#### Rollout Actor is a template used for MP #####
		self.rollout_actor = self.manager.list()
		self.rollout_actor.append(MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents))

		#Initialize buffer
		self.buffer = [Buffer(args.buffer_size, buffer_gpu=False) for _ in range(args.config.num_agents)]

		#Agent metrics
		self.fitnesses = [[] for _ in range(args.popn_size)]

		###Best Policy HOF####
		self.champ_ind = 0
Code example #7
class MATD3(object):
	"""Classes implementing TD3 and DDPG off-policy learners

		 Parameters:
			   args (object): Parameter class


	 """
	def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, num_agents, init_w = True):

		self.algo_name = algo_name
		self.gamma = gamma
		self.tau = tau
		self.total_update = 0
		self.agent_id = id
		self.use_gpu = use_gpu
		self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag], '.csv', save_iteration=1000, conv_size=1000)
		self.num_agents = num_agents

		#Initialize actors
		self.policy = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		if init_w: self.policy.apply(utils.init_weights)
		self.policy_target = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		utils.hard_update(self.policy_target, self.policy)
		self.policy_optim = Adam(self.policy.parameters(), actor_lr)


		self.critics = [QNetwork(state_dim*num_agents, action_dim*num_agents, hidden_size*2) for _ in range(num_agents)]
		self.critics_target = [QNetwork(state_dim*num_agents, action_dim*num_agents, hidden_size*2) for _ in range(num_agents)]
		if init_w:
			for critic, critic_target in zip(self.critics, self.critics_target):
				critic.apply(utils.init_weights)
				utils.hard_update(critic_target, critic)
		self.critic_optims = [Adam(critic.parameters(), critic_lr) for critic in self.critics]


		self.loss = nn.MSELoss()

		if use_gpu:
			self.policy_target.cuda(); self.policy.cuda()
			for critic, critic_target in zip(self.critics, self.critics_target):
				critic.cuda()
				critic_target.cuda()


		self.num_critic_updates = 0

		#Statistics Tracker
		#self.action_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q = {'min':None, 'max': None, 'mean':None, 'std':None}



	def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, agent_id, num_epoch=1, **kwargs):
		"""Runs a step of Bellman upodate and policy gradient using a batch of experiences

			 Parameters:
				  state_batch (tensor): Current States
				  next_state_batch (tensor): Next States
				  action_batch (tensor): Actions
				  reward_batch (tensor): Rewards
				  done_batch (tensor): Done batch
				  agent_id (int): Index of the agent whose critic and policy head are being updated
				  num_epoch (int): Number of learning iterations to run with the same data

			 Returns:
				   None

		 """

		if isinstance(state_batch, list):
			state_batch = torch.cat(state_batch)
			next_state_batch = torch.cat(next_state_batch)
			action_batch = torch.cat(action_batch)
			reward_batch = torch.cat(reward_batch)
			done_batch = torch.cat(done_batch)
		batch_size = len(state_batch)

		for _ in range(num_epoch):
			########### CRITIC UPDATE ####################

			#Compute next q-val, next_v and target
			with torch.no_grad():


				#Compute next action batch
				next_action_batch = torch.cat([self.policy_target.clean_action(next_state_batch[:, id, :], id) for id in range(self.num_agents)], 1)
				if self.algo_name == 'TD3':
					# Policy Noise
					policy_noise = np.random.normal(0, kwargs['policy_noise'], (action_batch.size()[0], action_batch.size()[1] * action_batch.size()[2]))
					policy_noise = torch.clamp(torch.Tensor(policy_noise), -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])
					next_action_batch += policy_noise.cuda() if self.use_gpu else policy_noise
				next_action_batch = torch.clamp(next_action_batch, -1, 1)

				#Compute Q-val and value of next state masking by done

				q1, q2 = self.critics_target[agent_id].forward(next_state_batch.view(batch_size, -1), next_action_batch)
				q1 = (1 - done_batch) * q1
				q2 = (1 - done_batch) * q2
				#next_val = (1 - done_batch) * next_val

				#Select which q to use as next-q (depends on algo)
				if self.algo_name == 'TD3':next_q = torch.min(q1, q2)
				elif self.algo_name == 'DDPG': next_q = q1

				#Compute target q and target val
				target_q = reward_batch[:,agent_id].unsqueeze(1) + (self.gamma * next_q)
				#if self.args.use_advantage: target_val = reward_batch + (self.gamma * next_val)



			self.critic_optims[agent_id].zero_grad()
			current_q1, current_q2 = self.critics[agent_id].forward((state_batch.view(batch_size, -1)), (action_batch.view(batch_size, -1)))
			utils.compute_stats(current_q1, self.q)

			dt = self.loss(current_q1, target_q)
			# if self.args.use_advantage:
			#     dt = dt + self.loss(current_val, target_val)
			#     utils.compute_stats(current_val, self.val)

			if self.algo_name == 'TD3': dt = dt + self.loss(current_q2, target_q)
			utils.compute_stats(dt, self.q_loss)

			# if self.args.critic_constraint:
			#     if dt.item() > self.args.critic_constraint_w:
			#         dt = dt * (abs(self.args.critic_constraint_w / dt.item()))
			dt.backward()

			self.critic_optims[agent_id].step()
			self.num_critic_updates += 1

			#Delayed Actor Update
			if self.num_critic_updates % kwargs['policy_ups_freq'] == 0 or self.algo_name == 'DDPG':

				agent_action = self.policy.clean_action(state_batch[:,agent_id,:], agent_id)
				joint_action = action_batch.clone()
				joint_action[:,agent_id,:] = agent_action[:]

				#print(np.max(torch.abs(joint_action - action_batch).detach().cpu().numpy()), np.max(torch.abs(joint_action[:,agent_id,:] - agent_action).detach().cpu().numpy()))
				# # Trust Region constraint
				# if self.args.trust_region_actor:
				#     with torch.no_grad(): old_actor_actions = self.actor_target.forward(state_batch)
				#     actor_actions = action_batch - old_actor_actions


				Q1, Q2 = self.critics[agent_id].forward(state_batch.view(batch_size, -1), joint_action.view(batch_size, -1))

				# if self.args.use_advantage: policy_loss = -(Q1 - val)
				policy_loss = -Q1

				utils.compute_stats(-policy_loss,self.policy_loss)
				policy_loss = policy_loss.mean()


				self.policy_optim.zero_grad()



				policy_loss.backward(retain_graph=True)
				#nn.utils.clip_grad_norm_(self.actor.parameters(), 10)
				# if self.args.action_loss:
				#     action_loss = torch.abs(actor_actions-0.5)
				#     utils.compute_stats(action_loss, self.action_loss)
				#     action_loss = action_loss.mean() * self.args.action_loss_w
				#     action_loss.backward()
				#     #if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9 #Decay action_w loss if action loss is larger than policy gradient loss
				self.policy_optim.step()


			# if self.args.hard_update:
			#     if self.num_critic_updates % self.args.hard_update_freq == 0:
			#         if self.num_critic_updates % self.args.policy_ups_freq == 0: self.hard_update(self.actor_target, self.actor)
			#         self.hard_update(self.critic_target, self.critic)


			if self.num_critic_updates % kwargs['policy_ups_freq'] == 0 or self.algo_name == 'DDPG': utils.soft_update(self.policy_target, self.policy, self.tau)
			for critic, critic_target in zip(self.critics, self.critics_target):
				utils.soft_update(critic_target, critic, self.tau)

			self.total_update += 1
			if self.agent_id == 0:
				self.tracker.update([self.q['mean'], self.q_loss['mean'], self.policy_loss['mean']] ,self.total_update)
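
A hedged sketch of exercising MATD3.update_parameters with synthetic batches. The tensor shapes are inferred from how the method indexes its inputs (states/actions laid out as [batch, agent, dim], per-agent rewards as [batch, num_agents], a team-wide done flag as [batch, 1]); the constructor values are placeholders, and it is assumed that MultiHeadActor.clean_action returns [batch, action_dim] and QNetwork.forward a pair of [batch, 1] Q-values, as the update code implies.

# Hedged sketch with placeholder dimensions (the real values come from the Parameters class)
import torch

state_dim, action_dim, hidden_size, num_agents, batch = 20, 2, 100, 3, 256

algo = MATD3(id=0, algo_name='TD3', state_dim=state_dim, action_dim=action_dim,
             hidden_size=hidden_size, actor_lr=1e-3, critic_lr=1e-3, gamma=0.97, tau=5e-3,
             savetag='demo', foldername='/tmp/', actualize=False, use_gpu=False,
             num_agents=num_agents)

state_batch = torch.randn(batch, num_agents, state_dim)           # joint observations
next_state_batch = torch.randn(batch, num_agents, state_dim)
action_batch = torch.rand(batch, num_agents, action_dim) * 2 - 1  # joint actions in [-1, 1]
reward_batch = torch.randn(batch, num_agents)                     # per-agent rewards
done_batch = torch.zeros(batch, 1)                                # episode-termination mask

# One TD3-style update of agent 0's centralized critic (and, every 2nd call, the shared policy)
algo.update_parameters(state_batch, next_state_batch, action_batch, reward_batch, done_batch,
                       agent_id=0, policy_noise=0.2, policy_noise_clip=0.5, policy_ups_freq=2)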
Code example #8
File: off_policy_algo.py  Project: wsg1873/MERL
class MultiTD3(object):
	"""Classes implementing TD3 and DDPG off-policy learners

		 Parameters:
			   args (object): Parameter class


	 """
	def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, num_agents, init_w = True):

		self.algo_name = algo_name
		self.gamma = gamma
		self.tau = tau
		self.total_update = 0
		self.agent_id = id
		self.actualize = actualize
		self.use_gpu = use_gpu
		self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag, 'alz_score'+savetag,'alz_policy'+savetag], '.csv', save_iteration=1000, conv_size=1000)

		#Initialize actors
		self.policy = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		if init_w: self.policy.apply(utils.init_weights)
		self.policy_target = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		utils.hard_update(self.policy_target, self.policy)
		self.policy_optim = Adam(self.policy.parameters(), actor_lr)


		self.critic = QNetwork(state_dim, action_dim,hidden_size)
		if init_w: self.critic.apply(utils.init_weights)
		self.critic_target = QNetwork(state_dim, action_dim, hidden_size)
		utils.hard_update(self.critic_target, self.critic)
		self.critic_optim = Adam(self.critic.parameters(), critic_lr)

		if actualize:
			self.ANetwork = ActualizationNetwork(state_dim, action_dim, hidden_size)
			if init_w: self.ANetwork.apply(utils.init_weights)
			self.actualize_optim = Adam(self.ANetwork.parameters(), critic_lr)
			self.actualize_lr = 0.2
			if use_gpu: self.ANetwork.cuda()

		self.loss = nn.MSELoss()

		if use_gpu:
			self.policy_target.cuda(); self.critic_target.cuda(); self.policy.cuda(); self.critic.cuda()
		self.num_critic_updates = 0

		#Statistics Tracker
		#self.action_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.alz_score = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.alz_policy = {'min':None, 'max': None, 'mean':None, 'std':None}
		#self.val = {'min':None, 'max': None, 'mean':None, 'std':None}
		#self.value_loss = {'min':None, 'max': None, 'mean':None, 'std':None}


	def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, global_reward, agent_id, num_epoch=1, **kwargs):
		"""Runs a step of Bellman upodate and policy gradient using a batch of experiences

			 Parameters:
				  state_batch (tensor): Current States
				  next_state_batch (tensor): Next States
				  action_batch (tensor): Actions
				  reward_batch (tensor): Rewards
				  done_batch (tensor): Done batch
				  global_reward (tensor): Team (global) rewards, used by the actualization network
				  agent_id (int): Index of the policy head being updated
				  num_epoch (int): Number of learning iterations to run with the same data

			 Returns:
				   None

		 """

		if isinstance(state_batch, list):
			state_batch = torch.cat(state_batch)
			next_state_batch = torch.cat(next_state_batch)
			action_batch = torch.cat(action_batch)
			reward_batch = torch.cat(reward_batch)
			done_batch = torch.cat(done_batch)
			global_reward = torch.cat(global_reward)

		for _ in range(num_epoch):
			########### CRITIC UPDATE ####################

			#Compute next q-val, next_v and target
			with torch.no_grad():
				#Policy Noise
				policy_noise = np.random.normal(0, kwargs['policy_noise'], (action_batch.size()[0], action_batch.size()[1]))
				policy_noise = torch.clamp(torch.Tensor(policy_noise), -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])

				#Compute next action batch (noise term parenthesized so the conditional only selects the noise)
				next_action_batch = self.policy_target.clean_action(next_state_batch, agent_id) + (policy_noise.cuda() if self.use_gpu else policy_noise)
				next_action_batch = torch.clamp(next_action_batch, -1, 1)

				#Compute Q-val and value of next state masking by done
				q1, q2 = self.critic_target.forward(next_state_batch, next_action_batch)
				q1 = (1 - done_batch) * q1
				q2 = (1 - done_batch) * q2
				#next_val = (1 - done_batch) * next_val

				#Select which q to use as next-q (depends on algo)
				if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min': next_q = torch.min(q1, q2)
				elif self.algo_name == 'DDPG': next_q = q1
				elif self.algo_name == 'TD3_max': next_q = torch.max(q1, q2)

				#Compute target q and target val
				target_q = reward_batch + (self.gamma * next_q)
				#if self.args.use_advantage: target_val = reward_batch + (self.gamma * next_val)

			if self.actualize:
				##########Actualization Network Update
				current_Ascore = self.ANetwork.forward(state_batch, action_batch)
				utils.compute_stats(current_Ascore, self.alz_score)
				target_Ascore = (self.actualize_lr) * (global_reward * 10.0) + (1 - self.actualize_lr) * current_Ascore.detach()
				actualize_loss = self.loss(target_Ascore, current_Ascore).mean()



			self.critic_optim.zero_grad()
			current_q1, current_q2 = self.critic.forward((state_batch), (action_batch))
			utils.compute_stats(current_q1, self.q)

			dt = self.loss(current_q1, target_q)
			# if self.args.use_advantage:
			#     dt = dt + self.loss(current_val, target_val)
			#     utils.compute_stats(current_val, self.val)

			if self.algo_name == 'TD3' or self.algo_name == 'TD3_max': dt = dt + self.loss(current_q2, target_q)
			utils.compute_stats(dt, self.q_loss)

			# if self.args.critic_constraint:
			#     if dt.item() > self.args.critic_constraint_w:
			#         dt = dt * (abs(self.args.critic_constraint_w / dt.item()))
			dt.backward()

			self.critic_optim.step()
			self.num_critic_updates += 1

			if self.actualize:
				self.actualize_optim.zero_grad()
				actualize_loss.backward()
				self.actualize_optim.step()


			#Delayed Actor Update
			if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

				actor_actions = self.policy.clean_action(state_batch, agent_id)

				# # Trust Region constraint
				# if self.args.trust_region_actor:
				#     with torch.no_grad(): old_actor_actions = self.actor_target.forward(state_batch)
				#     actor_actions = action_batch - old_actor_actions


				Q1, Q2 = self.critic.forward(state_batch, actor_actions)

				# if self.args.use_advantage: policy_loss = -(Q1 - val)
				policy_loss = -Q1

				utils.compute_stats(-policy_loss,self.policy_loss)
				policy_loss = policy_loss.mean()

				###Actualize Policy Update
				if self.actualize:
					A1 = self.ANetwork.forward(state_batch, actor_actions)
					utils.compute_stats(A1, self.alz_policy)
					policy_loss += -A1.mean()



				self.policy_optim.zero_grad()



				policy_loss.backward(retain_graph=True)
				#nn.utils.clip_grad_norm_(self.actor.parameters(), 10)
				# if self.args.action_loss:
				#     action_loss = torch.abs(actor_actions-0.5)
				#     utils.compute_stats(action_loss, self.action_loss)
				#     action_loss = action_loss.mean() * self.args.action_loss_w
				#     action_loss.backward()
				#     #if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9 #Decay action_w loss if action loss is larger than policy gradient loss
				self.policy_optim.step()


			# if self.args.hard_update:
			#     if self.num_critic_updates % self.args.hard_update_freq == 0:
			#         if self.num_critic_updates % self.args.policy_ups_freq == 0: self.hard_update(self.actor_target, self.actor)
			#         self.hard_update(self.critic_target, self.critic)


			if self.num_critic_updates % kwargs['policy_ups_freq'] == 0: utils.soft_update(self.policy_target, self.policy, self.tau)
			utils.soft_update(self.critic_target, self.critic, self.tau)

			self.total_update += 1
			if self.agent_id == 0:
				self.tracker.update([self.q['mean'], self.q_loss['mean'], self.policy_loss['mean'],self.alz_score['mean'], self.alz_policy['mean']] ,self.total_update)
Code example #9
File: train.py  Project: yinjiangjin/MERL


if __name__ == "__main__":
	args = Parameters()  # Create the Parameters class
	train_env = RoverDomainPython(args, 10)
	test_env = RoverDomainPython(args, 100)


	#test_tracker = utils.Tracker(args.metric_save, [args.log_fname], '.csv')  # Initiate tracker
	torch.manual_seed(args.seed)
	np.random.seed(args.seed)
	random.seed(args.seed)  # Seeds

	total_frames = 0
	all_scores = [-1.0]
	all_test = [-1.0]
	model = MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents)


	print_threshold = 1000000
	###### TRAINING LOOP ########
	while True:

		if args.dist == 'uniform':
			model.apply(sample_weight_uniform)
		elif args.dist == 'normal':
			model.apply(sample_weight_normal)
		else:
			raise Exception('Unknown distribution')

		score, frame = evaluate(train_env, model, 10)
		total_frames += frame
Code example #10
class MultiTD3(object):
    """Classes implementing TD3 and DDPG off-policy learners




	 """
    def __init__(self,
                 id,
                 algo_name,
                 state_dim,
                 action_dim,
                 hidden_size,
                 actor_lr,
                 critic_lr,
                 gamma,
                 tau,
                 savetag,
                 foldername,
                 use_gpu,
                 num_agents,
                 init_w=True):

        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau
        self.total_update = 0
        self.agent_id = id
        self.use_gpu = use_gpu
        self.tracker = utils.Tracker(
            foldername,
            ['q_' + savetag, 'qloss_' + savetag, 'policy_loss_' + savetag],
            '.csv',
            save_iteration=1000,
            conv_size=1000)

        #Initialize actors
        self.policy = MultiHeadActor(state_dim, action_dim, hidden_size,
                                     num_agents)
        if init_w: self.policy.apply(utils.init_weights)
        self.policy_target = MultiHeadActor(state_dim, action_dim, hidden_size,
                                            num_agents)
        utils.hard_update(self.policy_target, self.policy)
        self.policy_optim = Adam(self.policy.parameters(), actor_lr)

        self.critic = QNetwork(state_dim, action_dim, hidden_size)
        if init_w: self.critic.apply(utils.init_weights)
        self.critic_target = QNetwork(state_dim, action_dim, hidden_size)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        if use_gpu:
            self.policy_target.cuda()
            self.critic_target.cuda()
            self.policy.cuda()
            self.critic.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.policy_loss = {
            'min': None,
            'max': None,
            'mean': None,
            'std': None
        }
        self.q_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
        self.q = {'min': None, 'max': None, 'mean': None, 'std': None}

    def update_parameters(self,
                          state_batch,
                          next_state_batch,
                          action_batch,
                          reward_batch,
                          done_batch,
                          global_reward,
                          agent_id,
                          num_epoch=1,
                          **kwargs):
        """Runs a step of Bellman upodate and policy gradient using a batch of experiences



		 """

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)
            global_reward = torch.cat(global_reward)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            #Compute next q-val, next_v and target
            with torch.no_grad():

                #Policy Noise
                policy_noise = np.random.normal(
                    0, kwargs['policy_noise'],
                    (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -kwargs['policy_noise_clip'],
                                           kwargs['policy_noise_clip'])

                #Compute next action batch (noise term parenthesized so the conditional only selects the noise)
                next_action_batch = self.policy_target.clean_action(
                    next_state_batch, agent_id) + (policy_noise.cuda()
                                                   if self.use_gpu else policy_noise)
                next_action_batch = torch.clamp(next_action_batch, -1, 1)

                #Compute Q-val and value of next state masking by done
                q1, q2 = self.critic_target.forward(next_state_batch,
                                                    next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2

                #Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3': next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG': next_q = q1

                #Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)

            self.critic_optim.zero_grad()
            current_q1, current_q2 = self.critic.forward((state_batch),
                                                         (action_batch))
            utils.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)

            if self.algo_name == 'TD3':
                dt = dt + self.loss(current_q2, target_q)
            utils.compute_stats(dt, self.q_loss)
            dt.backward()

            self.critic_optim.step()
            self.num_critic_updates += 1

            #Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

                actor_actions = self.policy.clean_action(state_batch, agent_id)
                Q1, Q2 = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1

                utils.compute_stats(-policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                self.policy_optim.zero_grad()

                policy_loss.backward(retain_graph=True)
                self.policy_optim.step()

            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                utils.soft_update(self.policy_target, self.policy, self.tau)
            utils.soft_update(self.critic_target, self.critic, self.tau)

            self.total_update += 1
            if self.agent_id == 0:
                self.tracker.update([
                    self.q['mean'], self.q_loss['mean'],
                    self.policy_loss['mean']
                ], self.total_update)
Code example #11
    def __init__(self, args, id):
        self.args = args
        self.id = id

        ###Initialize neuroevolution module###
        self.evolver = SSNE(self.args)

        ########Initialize population
        self.manager = Manager()
        self.popn = self.manager.list()
        for _ in range(args.popn_size):
            if args.ps == 'trunk':
                self.popn.append(
                    MultiHeadActor(args.state_dim, args.action_dim,
                                   args.hidden_size, args.config.num_agents))

            else:
                if args.algo_name == 'TD3':
                    self.popn.append(
                        Actor(args.state_dim,
                              args.action_dim,
                              args.hidden_size,
                              policy_type='DeterministicPolicy'))
                else:
                    self.popn.append(
                        Actor(args.state_dim,
                              args.action_dim,
                              args.hidden_size,
                              policy_type='GaussianPolicy'))
            self.popn[-1].eval()

        #### INITIALIZE PG ALGO #####
        if args.ps == 'trunk':

            if self.args.is_matd3 or args.is_maddpg:
                algo_name = 'TD3' if self.args.is_matd3 else 'DDPG'
                self.algo = MATD3(id, algo_name, args.state_dim,
                                  args.action_dim, args.hidden_size,
                                  args.actor_lr, args.critic_lr, args.gamma,
                                  args.tau, args.savetag, args.aux_save,
                                  args.actualize, args.use_gpu,
                                  args.config.num_agents, args.init_w)

            else:
                self.algo = MultiTD3(id, args.algo_name, args.state_dim,
                                     args.action_dim, args.hidden_size,
                                     args.actor_lr, args.critic_lr, args.gamma,
                                     args.tau, args.savetag, args.aux_save,
                                     args.actualize, args.use_gpu,
                                     args.config.num_agents, args.init_w)

        else:
            if args.algo_name == 'TD3':
                self.algo = TD3(id, args.algo_name, args.state_dim,
                                args.action_dim, args.hidden_size,
                                args.actor_lr, args.critic_lr, args.gamma,
                                args.tau, args.savetag, args.aux_save,
                                args.actualize, args.use_gpu, args.init_w)
            else:
                self.algo = SAC(id, args.state_dim, args.action_dim,
                                args.hidden_size, args.gamma, args.critic_lr,
                                args.actor_lr, args.tau, args.alpha,
                                args.target_update_interval, args.savetag,
                                args.aux_save, args.actualize, args.use_gpu)

        #### Rollout Actor is a template used for MP #####
        self.rollout_actor = self.manager.list()

        if args.ps == 'trunk':
            self.rollout_actor.append(
                MultiHeadActor(args.state_dim, args.action_dim,
                               args.hidden_size, args.config.num_agents))
        else:
            if args.algo_name == 'TD3':
                self.rollout_actor.append(
                    Actor(args.state_dim,
                          args.action_dim,
                          args.hidden_size,
                          policy_type='DeterministicPolicy'))
            else:
                self.rollout_actor.append(
                    Actor(args.state_dim,
                          args.action_dim,
                          args.hidden_size,
                          policy_type='GaussianPolicy'))

        #Initialize buffer
        if args.ps == 'trunk':
            self.buffer = [
                Buffer(args.buffer_size,
                       buffer_gpu=False,
                       filter_c=args.filter_c)
                for _ in range(args.config.num_agents)
            ]
        else:
            self.buffer = Buffer(args.buffer_size,
                                 buffer_gpu=False,
                                 filter_c=args.filter_c)

        #Agent metrics
        self.fitnesses = [[] for _ in range(args.popn_size)]

        ###Best Policy HOF####
        self.champ_ind = 0
Code example #12
File: train.py  Project: yinjiangjin/MERL
		self.best_fname = 'best_' + self.savetag





if __name__ == "__main__":
	args = Parameters()  # Create the Parameters class


	test_tracker = utils.Tracker(args.metric_save, [args.log_fname], '.csv')  # Initiate tracker
	torch.manual_seed(args.seed)
	np.random.seed(args.seed)
	random.seed(args.seed)  # Seeds

	model = MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents)
	#model.cuda()
	# EvolutionModule runs the population in a ThreadPool, so
	# if you need to inject other arguments, you can do that
	# using the partial tool
	partial_func = partial(evaluate, args=args)
	mother_parameters = list(model.parameters())

	es = EvolutionModule(
		mother_parameters, partial_func, population_size=args.popsize,
		sigma=args.sigma, learning_rate=args.lr,
		threadcount=100, cuda=False, render_test=False
	)

	###### TRAINING LOOP ########
	total_frames = 0
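
The snippet cuts off right after the ES setup. Below is a speculative continuation of the training loop: es.run(...) assumes the EvolutionModule follows a pytorch-es style interface (run(iterations, print_step) returning the evolved weights), and the iteration counts are placeholders, not values taken from this project.

	# Speculative continuation (assumed pytorch-es style run() API, placeholder iteration counts)
	final_weights = es.run(iterations=1000, print_step=10)
	final_score = evaluate(final_weights, args, NUM_EVALS=10)  # re-score the evolved weights
	print('Final average fitness after ES training:', final_score)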
Code example #13
    def __init__(self, args, id):
        self.args = args
        self.id = id

        #### Rollout Actor is a template used for MP #####
        self.manager = Manager()
        self.rollout_actor = self.manager.list()

        for agent_id in range(args.config.num_agents):

            if (self.args.EVALUATE):  # LOAD model

                #filename = self.args.model_directory + str(
                #	agent_id) + "_actor_pop10_roll50_envrover_heterogeneous_fire_truck_uav_long_range_lidar_action_" + str(self.args.action_space)+ "_seed" + str(self.args.seed)+"-rewardglobal"

                filename = self.args.model_directory + str(
                    agent_id
                ) + "_actor_pop10_roll50_envrover_heterogeneous_fire_truck_uav_long_range_lidar_action_" + str(
                    self.args.action_space) + "_seed" + str(
                        self.args.seed) + "-rewardglobal_pg"

                m = torch.load(filename)
                temp_model = Actor(args.state_dim,
                                   args.action_dim,
                                   args.hidden_size,
                                   policy_type='DeterministicPolicy')

                temp_model.load_state_dict(m)

                if args.ps == 'trunk':
                    self.rollout_actor.append(
                        MultiHeadActor(args.state_dim, args.action_dim,
                                       args.hidden_size,
                                       args.config.num_agents))
                else:
                    if args.algo_name == 'TD3':
                        #self.rollout_actor.append(Actor(args.state_dim, args.action_dim, args.hidden_size, policy_type='DeterministicPolicy').load_state_dict(torch.load(filename)))
                        self.rollout_actor.append(temp_model)
                    else:
                        self.rollout_actor.append(
                            Actor(args.state_dim,
                                  args.action_dim,
                                  args.hidden_size,
                                  policy_type='GaussianPolicy'))
            else:

                if args.ps == 'trunk':
                    self.rollout_actor.append(
                        MultiHeadActor(args.state_dim, args.action_dim,
                                       args.hidden_size,
                                       args.config.num_agents))
                else:
                    if args.algo_name == 'TD3':
                        self.rollout_actor.append(
                            Actor(args.state_dim,
                                  args.action_dim,
                                  args.hidden_size,
                                  policy_type='DeterministicPolicy'))
                    else:
                        self.rollout_actor.append(
                            Actor(args.state_dim,
                                  args.action_dim,
                                  args.hidden_size,
                                  policy_type='GaussianPolicy'))

            if self.args.ps == 'full' or self.args.ps == 'trunk':
                break  #Only need one for homogeneous workloads