Example #1
    def __init__(self, args, id):
        self.args = args
        self.id = id

        #### Rollout Actor is a template used for MP #####
        self.manager = Manager()
        self.rollout_actor = self.manager.list()
        for _ in range(args.config.num_agents):

            if args.ps == 'trunk':
                self.rollout_actor.append(
                    MultiHeadActor(args.state_dim, args.action_dim,
                                   args.hidden_size, args.config.num_agents))
            else:
                if args.algo_name == 'TD3':
                    self.rollout_actor.append(
                        Actor(args.state_dim,
                              args.action_dim,
                              args.hidden_size,
                              policy_type='DeterministicPolicy'))
                else:
                    self.rollout_actor.append(
                        Actor(args.state_dim,
                              args.action_dim,
                              args.hidden_size,
                              policy_type='GaussianPolicy'))

            if self.args.ps == 'full' or self.args.ps == 'trunk':
                break  #Only need one for homogeneous workloads
Example #2
    def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w = True):

        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau

        #Initialize actors
        self.actor = Actor(state_dim, action_dim, wwid)
        if init_w: self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)


        self.critic = Critic(state_dim, action_dim)
        if init_w: self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.policy_loss = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.critic_loss = {'mean':[]}
        self.q = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.val = {'min':[], 'max': [], 'mean':[], 'std':[]}
Example #3
    def __init__(self, args):

        self.args = args

        self.actor = Actor(args)
        self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(args)
        self.critic.apply(utils.init_weights)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        self.hard_update(
            self.actor_target,
            self.actor)  # Make sure the target starts with the same weights
        self.hard_update(self.critic_target, self.critic)
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}
Example #4
    def __init__(self, state_dim, action_dim, gamma, tau, buffer_size,
                 is_mem_cuda, out_act):

        self.actor = Actor(state_dim,
                           action_dim,
                           is_evo=False,
                           out_act=out_act)
        self.actor_target = Actor(state_dim,
                                  action_dim,
                                  is_evo=False,
                                  out_act=out_act)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = gamma
        self.tau = tau
        self.loss = nn.MSELoss()
        self.replay_buffer = ReplayMemory(buffer_size, is_mem_cuda)
        self.exploration_noise = OUNoise(action_dim)

        hard_update(self.actor_target,
                    self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)
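The OUNoise exploration helper constructed above is not defined in these examples. A minimal sketch of a standard Ornstein-Uhlenbeck noise process, assuming the conventional mu/theta/sigma parameterization and the .noise() interface used in Example #18 (the actual class in the source repository may differ):

import numpy as np

class OUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck process)."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(action_dim) * mu

    def reset(self):
        # Re-center the process at the start of an episode
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # Mean-reverting drift plus Gaussian diffusion
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state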
Example #5
class A2C(object):
    def __init__(self, args):

        self.args = args

        self.actor = Actor(args)
        self.actor_target = Actor(args)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(args)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        hard_update(self.actor_target,
                    self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

    def update_parameters(self, batch):
        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        done_batch = torch.cat(batch.done)
        # Critic Update (targets are computed without tracking gradients)
        vals = self.critic.forward(state_batch)
        with torch.no_grad():
            new_vals = self.critic.forward(next_state_batch) * (1 - done_batch)
            targets = reward_batch + self.gamma * new_vals
        self.critic_optim.zero_grad()
        dt = self.loss(vals, targets)
        dt.backward()
        self.critic_optim.step()

        # Actor Update
        self.actor_optim.zero_grad()
        state_batch = utils.to_tensor(utils.to_numpy(state_batch))
        targets = utils.to_tensor(utils.to_numpy(targets))
        vals = utils.to_tensor(utils.to_numpy(vals))
        action_logs = self.actor.forward(state_batch)
        entropy_loss = torch.mean(entropy(torch.exp(action_logs)))
        action_logs = F.log_softmax(action_logs, dim=1)
        dt = targets - vals
        alogs = []
        for i, action in enumerate(action_batch):
            action_i = int(action.cpu().data.numpy())
            alogs.append(action_logs[i, action_i])
        alogs = torch.stack(alogs).unsqueeze(0)

        policy_loss = -torch.mean(dt * alogs.t())
        actor_loss = policy_loss - entropy_loss
        actor_loss.backward()
        self.actor_optim.step()
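The loop above that builds alogs collects the log-probability of each taken action one sample at a time. A vectorized sketch of the same step, assuming action_batch holds integer action indices (as the int(...) conversion suggests):

# Hypothetical vectorized replacement for the per-sample loop above
action_idx = action_batch.long().view(-1, 1)           # (N, 1) indices of the taken actions
alogs = action_logs.gather(1, action_idx)              # (N, 1) log-probabilities of those actions
policy_loss = -torch.mean((targets - vals) * alogs)    # same policy-gradient objective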
Example #6
    def __init__(
        self,
        CERL_agent,
        num_workers,
        trainers,
        pomdp_adv=False
    ):  # trainers: the first entry is the blue agent, the second is the red model
        self.num_workers = num_workers
        self.trainers = trainers
        self.pomdp_adv = pomdp_adv
        self.args = CERL_agent.args
        self.drqn = CERL_agent.args.drqn  #denotes whether the blue agent uses a DRQN
        if self.pomdp_adv:
            self.trainers = [trainers[0],
                             None]  #make sure the red model is never used
        self.buffer_gpu = CERL_agent.args.buffer_gpu
        self.batch_size = CERL_agent.args.batch_size
        self.algo = CERL_agent.args.algo
        self.state_dim = CERL_agent.args.state_dim
        self.action_dim = CERL_agent.args.action_dim
        self.buffer = Buffer(BUFFER_SIZE,
                             self.buffer_gpu)  #initialize own replay buffer
        self.data_bucket = self.buffer.tuples
        self.evo_task_pipes = [Pipe() for _ in range(self.num_workers)]
        self.evo_result_pipes = [Pipe() for _ in range(self.num_workers)]
        self.actual_red_worker = Actor(
            CERL_agent.args.state_dim, CERL_agent.args.action_dim, -1,
            'dis')  #this model is shared across the workers
        self.actual_red_worker.share_memory()
        self.td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': CERL_agent.args.action_low,
            'action_high': CERL_agent.args.action_high,
            'cerl_args': self.args
        }
        self.renew_learner()  #currently we do not create a new learner for each iteration
        self.rollout_bucket = [
            self.actual_red_worker for i in range(num_workers)
        ]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 3, self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], False,
                          self.data_bucket, self.rollout_bucket, 'dummy_name',
                          None, 'dis', self.trainers, False, self.pomdp_adv))
            for id in range(num_workers)
        ]

        for worker in self.workers:
            worker.start()
        self.evo_flag = [True for _ in range(self.num_workers)]
Example #7
	def __init__(self, id, num_inputs, action_dim, hidden_size, gamma, critic_lr, actor_lr, tau, alpha, target_update_interval, savetag, foldername, actualize, use_gpu):

		self.num_inputs = num_inputs
		self.action_space = action_dim
		self.gamma = gamma
		self.tau = 0.005
		self.alpha = 0.2
		self.policy_type = "Gaussian"
		self.target_update_interval = 1
		self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'value_'+savetag, 'value_loss_'+savetag, 'policy_loss_'+savetag, 'mean_loss_'+savetag, 'std_loss_'+savetag], '.csv',save_iteration=1000, conv_size=1000)
		self.total_update = 0
		self.agent_id = id
		self.actualize = actualize

		self.critic = QNetwork(self.num_inputs, self.action_space, hidden_size)
		self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)
		self.soft_q_criterion = nn.MSELoss()

		if self.policy_type == "Gaussian":
			self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='GaussianPolicy')
			self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr)

			self.value = ValueNetwork(self.num_inputs, hidden_size)
			self.value_target = ValueNetwork(self.num_inputs, hidden_size)
			self.value_optim = Adam(self.value.parameters(), lr=critic_lr)
			utils.hard_update(self.value_target, self.value)
			self.value_criterion = nn.MSELoss()
		else:
			self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='DeterministicPolicy')
			self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr)

			self.critic_target = QNetwork(self.num_inputs, self.action_space, hidden_size)
			utils.hard_update(self.critic_target, self.critic)

		self.policy.cuda()
		self.value.cuda()
		self.value_target.cuda()
		self.critic.cuda()

		#Statistics Tracker
		self.q = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.val = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.value_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.mean_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.std_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
Example #8
    def __init__(self, args):
        self.args = args
        self.evolver = SSNE(self.args)

        #MP TOOLS
        self.manager = Manager()

        #Init population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            self.pop.append(Actor(args))
            #self.pop[-1].apply(utils.init_weights)
        self.best_policy = Actor(args)
        #Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()

        if SEED_POP: self.load_seed(args.model_save, self.pop)

        #Init BUFFER
        self.replay_buffer = Buffer(100000, self.args.data_folder)

        #MP TOOLS
        self.exp_list = self.manager.list()
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]

        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(i, self.evo_task_pipes[i][1],
                          self.evo_result_pipes[i][1], None, self.exp_list,
                          self.pop, DIFFICULTY, USE_RS, True,
                          USE_SYNTHETIC_TARGET, XBIAS, ZBIAS, PHASE_LEN, None,
                          EP_LEN, JGS)) for i in range(args.pop_size)
        ]

        for worker in self.evo_workers:
            worker.start()

        #Trackers
        self.buffer_added = 0
        self.best_score = 0.0
        self.frames_seen = 0.0
        self.best_shaped_score = None
        self.eval_flag = [True for _ in range(args.pop_size)]
Example #9
def create_actor(name="ModelName", height=185, sex=1):
    actor = Actor()
    actor.name = name
    actor.height = height
    actor.sex = sex
    actor.save()
    return actor
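A short usage sketch for the helper above; the Actor model and its name/height/sex fields come from the function itself, and the argument values are illustrative:

new_actor = create_actor(name="Jane Doe", height=170, sex=0)
print(new_actor.name, new_actor.height, new_actor.sex)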
Example #10
    def __init__(self, args):

        self.args = args

        self.actor = Actor(args, init=True)
        self.actor_target = Actor(args, init=True)
        self.actor_optim = Adam(self.actor.parameters(), lr=0.5e-4)

        self.critic = Critic(args)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=0.5e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        hard_update(self.actor_target,
                    self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)
Example #11
    def __init__(self, sample_budget):
        self.sample_budget = sample_budget
        dummy_args = Parameters()
        #Load all Critics
        critic_template = Critic(dummy_args)
        self.critic_ensemble = utils.load_all_models_dir(
            CRITIC_DIR, critic_template)

        #Load all Actors
        actor_template = Actor(dummy_args)
        self.actor_ensemble = utils.load_all_models_dir(
            ACTOR_DIR, actor_template)
Example #12
	def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, init_w = True):

		self.algo_name = algo_name; self.gamma = gamma; self.tau = tau; self.total_update = 0; self.agent_id = id;	self.actualize = actualize; self.use_gpu = use_gpu
		self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag, 'alz_score'+savetag,'alz_policy'+savetag], '.csv', save_iteration=1000, conv_size=1000)

		#Initialize actors
		self.policy = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy')
		if init_w: self.policy.apply(utils.init_weights)
		self.policy_target = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy')
		utils.hard_update(self.policy_target, self.policy)
		self.policy_optim = Adam(self.policy.parameters(), actor_lr)


		self.critic = QNetwork(state_dim, action_dim,hidden_size)
		if init_w: self.critic.apply(utils.init_weights)
		self.critic_target = QNetwork(state_dim, action_dim, hidden_size)
		utils.hard_update(self.critic_target, self.critic)
		self.critic_optim = Adam(self.critic.parameters(), critic_lr)

		if actualize:
			self.ANetwork = ActualizationNetwork(state_dim, action_dim, hidden_size)
			if init_w: self.ANetwork.apply(utils.init_weights)
			self.actualize_optim = Adam(self.ANetwork.parameters(), critic_lr)
			self.actualize_lr = 0.2
			if use_gpu: self.ANetwork.cuda()

		self.loss = nn.MSELoss()

		if use_gpu:
			self.policy_target.cuda(); self.critic_target.cuda(); self.policy.cuda(); self.critic.cuda()
		self.num_critic_updates = 0

		#Statistics Tracker
		#self.action_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.alz_score = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.alz_policy = {'min':None, 'max': None, 'mean':None, 'std':None}
Example #13
    def __init__(self, args):

        self.args = args

        self.actor = Actor(args)
        if args.init_w: self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)
        self.optim = Adam(self.actor.parameters(), lr=5e-4)

        self.vfunc = ValueFunc(args)
        if args.init_w: self.vfunc.apply(utils.init_weights)

        self.gamma = args.gamma
        self.loss = nn.SmoothL1Loss()  #nn.MSELoss()

        #self.actor.cuda(); self.vfunc.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}
Example #14
    def __init__(self, n_agents, dim_obs, dim_act, batch_size,
                 capacity, episodes_before_train):
        self.actors = [Actor(dim_obs, dim_act) for i in range(n_agents)]
        self.critics = [Critic(n_agents, dim_obs,
                               dim_act) for i in range(n_agents)]
        self.actors_target = deepcopy(self.actors)
        self.critics_target = deepcopy(self.critics)

        self.n_agents = n_agents
        self.n_states = dim_obs
        self.n_actions = dim_act
        self.memory = ReplayMemory(capacity)
        self.batch_size = batch_size
        self.use_cuda = th.cuda.is_available()
        self.episodes_before_train = episodes_before_train

        self.GAMMA = 0.5
        self.tau = 0.0001

        self.var = [1.0 for i in range(n_agents)]
        self.critic_optimizer = [Adam(x.parameters(),
                                      lr=0.00005) for x in self.critics]
        self.actor_optimizer = [Adam(x.parameters(),
                                     lr=0.00005) for x in self.actors]

        if self.use_cuda:
            for x in self.actors:
                x.cuda()
            for x in self.critics:
                x.cuda()
            for x in self.actors_target:
                x.cuda()
            for x in self.critics_target:
                x.cuda()

        self.steps_done = 0
        self.episode_done = 0
Example #15
class Off_Policy_Algo(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class


     """
    def __init__(self,
                 wwid,
                 algo_name,
                 state_dim,
                 action_dim,
                 actor_lr,
                 critic_lr,
                 gamma,
                 tau,
                 init_w=True):

        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau

        self.HLoss = HLoss()
        #Initialize actors
        self.actor = Actor(state_dim, action_dim, wwid, self.algo_name)
        if init_w: self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid, self.algo_name)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)

        self.critic = Critic(state_dim, action_dim)
        if init_w: self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        if torch.cuda.is_available():
            self.actor_target.cuda()
            self.critic_target.cuda()
            self.actor.cuda()
            self.critic.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def save_net(self, path):
        torch.save(self.actor.state_dict(), path)

    def act(self, state):
        return self.actor(state)

    def share_memory(self):
        self.actor.share_memory()
        self.actor_target.share_memory()
        self.critic.share_memory()
        self.critic_target.share_memory()

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self,
                          state_batch,
                          next_state_batch,
                          action_batch,
                          reward_batch,
                          done_batch,
                          num_epoch=1,
                          **kwargs):
        """Runs a step of Bellman upodate and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data

             Returns:
                   None

         """

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            #Compute next q-val, next_v and target
            with torch.no_grad():
                #Policy Noise
                policy_noise = np.random.normal(
                    0, kwargs['policy_noise'],
                    (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -kwargs['policy_noise_clip'],
                                           kwargs['policy_noise_clip'])

                #Compute next action_batch
                #next_action_batch = self.actor_target.turn_max_into_onehot(self.actor_target.Gumbel_softmax_sample_distribution(next_state_batch, use_cuda=True))\
                #        if self.algo_name == 'dis' else self.actor_target.forward(next_state_batch) + policy_noise.cuda()  #this should use one-hot from logits
                next_action_batch = self.actor_target.turn_max_into_onehot(self.actor_target.forward(next_state_batch)) \
                    if self.algo_name == 'dis' else self.actor_target.forward(next_state_batch) + policy_noise.cuda()  # this should use one-hot from logits
                if random.random() < 0.0001:
                    print('off_policy line 114, changed next action batch')
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                #Compute Q-val and value of next state masking by done
                q1, q2, _ = self.critic_target.forward(next_state_batch,
                                                       next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2

                #Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min' or self.algo_name == 'dis':
                    next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG':
                    next_q = q1
                elif self.algo_name == 'TD3_max':
                    next_q = torch.max(q1, q2)

                #Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward(
                state_batch, action_batch)  #here the action batch should be the soft version
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)

            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max' or self.algo_name == 'dis':
                dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())
            #print(dt.item(), "off_policy_algo line 136")

            dt.backward()

            self.critic_optim.step()
            self.num_critic_updates += 1

            #Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

                actor_actions = self.actor.Gumbel_softmax_sample_distribution(state_batch, use_cuda=True)\
                    if self.algo_name == 'dis' else self.actor.forward(state_batch)
                #actor_actions = self.actor.forward(state_batch)
                #if random.random() < 0.001: print('actor action changed')
                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1 + 0.1 * self.HLoss(
                    actor_actions
                )  # HLoss is a single scalar, directly regularized logits?

                if random.random() < 0.0005:
                    print('added entropy regularization, off_policy_algo 161')

                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                #print(policy_loss, 'off_policy line 157')
                self.actor_optim.zero_grad()

                policy_loss.backward(retain_graph=True)
                self.actor_optim.step()

                #if random.random() <= 0.001:
                #    self.test_actor_gradient_descent(state_batch)

            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                utils.soft_update(self.actor_target, self.actor, self.tau)
            utils.soft_update(self.critic_target, self.critic, self.tau)

    def test_actor_gradient_descent(self, state_batch):
        #this method tests whether running gradient descent on the actor actually decreases the loss
        print("test_actor_gradient_descent, off_policy_algo line 179")
        for i in range(10):
            actor_actions = self.actor.forward(state_batch)
            print("logits_",
                  self.actor.w_out(self.actor.logits(state_batch))[0])
            print("action_batch", actor_actions[0])
            Q1, Q2, val = self.critic.forward(state_batch, actor_actions)
            policy_loss = -Q1
            policy_loss = policy_loss.mean()
            print("policy_loss at i = ", i, " is ", policy_loss)
            self.actor_optim.zero_grad()
            policy_loss.backward(retain_graph=True)
            print("gradient_", self.actor.f1.bias.grad[0])
            self.actor_optim.step()
            print("bias_", self.actor.f1.bias[0])
Example #16
class PPO(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class


     """
    def __init__(self, args):

        self.args = args

        self.actor = Actor(args)
        if args.init_w: self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)
        self.optim = Adam(self.actor.parameters(), lr=5e-4)

        self.vfunc = ValueFunc(args)
        if args.init_w: self.vfunc.apply(utils.init_weights)

        self.gamma = args.gamma
        self.loss = nn.SmoothL1Loss()  #nn.MSELoss()

        #self.actor.cuda(); self.vfunc.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_gae(self, trajectory, gamma=0.99, tau=0.95):
        with torch.no_grad():
            values = []
            next_values = []
            rewards = []
            masks = []
            states = []
            actions = []

            for entry in trajectory:
                states.append(torch.tensor(entry[0]))
                actions.append(torch.tensor(entry[1]))
                values.append(self.vfunc(torch.Tensor(entry[0])))
                rewards.append(torch.Tensor(entry[3]))
                masks.append(torch.Tensor(entry[5]))
            values.append(self.vfunc(torch.Tensor(entry[2])))

            gae = 0.0
            returns = []
            for step in reversed(range(len(rewards))):
                delta = rewards[step] + gamma * values[
                    step + 1] * masks[step] - values[step]
                gae = delta + gamma * tau * masks[step] * gae
                returns.insert(0, gae + values[step])

        return states, actions, values, returns

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self,
                          states,
                          actions,
                          log_probs,
                          returns,
                          advantages,
                          ppo_epochs=8,
                          mini_batch_size=128,
                          clip_param=0.2):
        """Runs a step of Bellman upodate and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data

             Returns:
                   None

         """

        for _ in range(ppo_epochs):
            ind = random.sample(range(len(states)), mini_batch_size)
            mini_s = states[ind]
            mini_a = actions[ind]
            mini_ret = returns[ind]
            mini_adv = advantages[ind]

            #PPO Update
            new_action, value = self.actor(mini_s), self.vfunc(mini_s)

            ratio = mini_a - new_action
            surr1 = ratio * mini_adv
            surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                1.0 + clip_param) * mini_adv

            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = (mini_ret - value).pow(2).mean()

            loss = 0.5 * critic_loss + actor_loss

            self.optim.zero_grad()
            loss.backward()
            self.optim.step()

    def soft_update(self, target, source, tau):
        """Soft update from target network to source

            Parameters:
                  target (object): A pytorch model
                  source (object): A pytorch model
                  tau (float): Tau parameter

            Returns:
                None

        """

        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        """Hard update (clone) from target network to source

            Parameters:
                  target (object): A pytorch model
                  source (object): A pytorch model

            Returns:
                None
        """

        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
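compute_gae above returns the states, actions, value estimates and GAE returns for a trajectory, while update_parameters additionally expects advantages (its log_probs argument is accepted but unused). A sketch of the glue between the two, under the assumption that the advantage is the GAE return minus the value estimate:

# Hypothetical glue code; compute_gae appends one extra bootstrap value, so drop it.
states, actions, values, returns = ppo.compute_gae(trajectory)      # ppo: a PPO instance
states = torch.stack(states).float()
actions = torch.stack(actions)
returns = torch.stack(returns).detach()
values = torch.stack(values[:-1]).detach()
advantages = returns - values
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)  # optional normalization
ppo.update_parameters(states, actions, None, returns, advantages)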
Example #17
class TD3_DDPG(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class


     """
    def __init__(self, args):

        self.args = args
        self.algo = args.algo

        self.actor = Actor(args)
        if args.init_w: self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)
        self.actor_optim = Adam(self.actor.parameters(), lr=5e-5)

        self.critic = Critic(args)
        if args.init_w: self.critic.apply(utils.init_weights)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=5e-4)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        self.hard_update(
            self.actor_target,
            self.actor)  # Make sure the target starts with the same weights
        self.hard_update(self.critic_target, self.critic)
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self,
                          state_batch,
                          next_state_batch,
                          action_batch,
                          reward_batch,
                          done_batch,
                          num_epoch=1):
        """Runs a step of Bellman upodate and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data

             Returns:
                   None

         """

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            #Compute next q-val, next_v and target
            with torch.no_grad():
                #Policy Noise
                policy_noise = np.random.normal(
                    0, self.args.policy_noise,
                    (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -self.args.policy_noise_clip,
                                           self.args.policy_noise_clip)

                #Compute next action_batch
                next_action_batch = self.actor_target.forward(
                    next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                #Compute Q-val and value of next state masking by done
                q1, q2, next_val = self.critic_target.forward(
                    next_state_batch, next_action_batch)
                if self.args.use_done_mask:
                    q1 = (1 - done_batch) * q1
                    q2 = (1 - done_batch) * q2
                    next_val = (1 - done_batch) * next_val

                #Clamp Q-vals
                if self.args.q_clamp is not None:
                    q1 = torch.clamp(q1, -self.args.q_clamp, self.args.q_clamp)
                    q2 = torch.clamp(q2, -self.args.q_clamp, self.args.q_clamp)

                #Select which q to use as next-q (depends on algo)
                if self.algo == 'TD3' or self.algo == 'TD3_actor_min':
                    next_q = torch.min(q1, q2)
                elif self.algo == 'DDPG':
                    next_q = q1
                elif self.algo == 'TD3_max':
                    next_q = torch.max(q1, q2)

                #Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)
                if self.args.use_advantage:
                    target_val = reward_batch + (self.gamma * next_val)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward(
                (state_batch), (action_batch))
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)
            if self.args.use_advantage:
                dt = dt + self.loss(current_val, target_val)
                self.compute_stats(current_val, self.val)

            if self.algo == 'TD3' or self.algo == 'TD3_max':
                dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            if self.args.critic_constraint:
                if dt.item() > self.args.critic_constraint_w:
                    dt = dt * (abs(self.args.critic_constraint_w / dt.item()))
            dt.backward()

            self.critic_optim.step()
            self.num_critic_updates += 1

            #Delayed Actor Update
            if self.num_critic_updates % self.args.policy_ups_freq == 0:

                actor_actions = self.actor.forward(state_batch)

                # Trust Region constraint
                if self.args.trust_region_actor:
                    with torch.no_grad():
                        old_actor_actions = self.actor_target.forward(
                            state_batch)
                    actor_actions = action_batch - old_actor_actions

                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                if self.args.use_advantage: policy_loss = -(Q1 - val)
                else: policy_loss = -Q1
                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                self.actor_optim.zero_grad()

                policy_loss.backward(retain_graph=True)
                #nn.utils.clip_grad_norm_(self.actor.parameters(), 10)
                if self.args.action_loss:
                    action_loss = torch.abs(actor_actions - 0.5)
                    self.compute_stats(action_loss, self.action_loss)
                    action_loss = action_loss.mean() * self.args.action_loss_w
                    action_loss.backward()
                    #if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9 #Decay action_w loss if action loss is larger than policy gradient loss
                self.actor_optim.step()

            if self.args.hard_update:
                if self.num_critic_updates % self.args.hard_update_freq == 0:
                    if self.num_critic_updates % self.args.policy_ups_freq == 0:
                        self.hard_update(self.actor_target, self.actor)
                    self.hard_update(self.critic_target, self.critic)

            else:
                if self.num_critic_updates % self.args.policy_ups_freq == 0:
                    self.soft_update(self.actor_target, self.actor, self.tau)
                self.soft_update(self.critic_target, self.critic, self.tau)

    def soft_update(self, target, source, tau):
        """Soft update from target network to source

            Parameters:
                  target (object): A pytorch model
                  source (object): A pytorch model
                  tau (float): Tau parameter

            Returns:
                None

        """

        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        """Hard update (clone) from target network to source

            Parameters:
                  target (object): A pytorch model
                  source (object): A pytorch model

            Returns:
                None
        """

        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
Example #18
class Actor_Critic(object):
    def __init__(self, state_dim, action_dim, gamma, tau, buffer_size,
                 is_mem_cuda, out_act):

        self.actor = Actor(state_dim,
                           action_dim,
                           is_evo=False,
                           out_act=out_act)
        self.actor_target = Actor(state_dim,
                                  action_dim,
                                  is_evo=False,
                                  out_act=out_act)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = gamma
        self.tau = tau
        self.loss = nn.MSELoss()
        self.replay_buffer = ReplayMemory(buffer_size, is_mem_cuda)
        self.exploration_noise = OUNoise(action_dim)

        hard_update(self.actor_target,
                    self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

    def act(self, state, is_noise):
        state = utils.to_tensor(state).unsqueeze(0)
        action = self.actor.forward(state)
        action = action.detach().numpy().flatten()
        if is_noise: action += self.exploration_noise.noise()
        return action

    def train_from_batch(self, batch):
        env_state_batch = torch.cat(batch.state)
        goal_batch = torch.cat(batch.goal)
        uvfa_states = torch.cat((env_state_batch, goal_batch), dim=1).detach()
        next_env_state_batch = torch.cat(batch.next_state)
        next_uvfa_states = torch.cat((next_env_state_batch, goal_batch),
                                     dim=1).detach()
        action_batch = torch.cat(batch.action).detach()
        reward_batch = torch.cat(batch.reward).detach()

        #if self.args.use_done_mask: done_batch = torch.cat(batch.done)

        #Load everything to GPU if not already
        # if self.args.is_memory_cuda and not self.args.is_cuda:
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.critic.cuda()
        uvfa_states = uvfa_states.cuda()
        next_uvfa_states = next_uvfa_states.cuda()
        action_batch = action_batch.cuda()
        reward_batch = reward_batch.cuda()
        #     if self.args.use_done_mask: done_batch = done_batch.cuda()

        #Critic Update
        with torch.no_grad():
            next_action_batch = self.actor_target.forward(next_uvfa_states)
            next_q = self.critic_target.forward(next_uvfa_states,
                                                next_action_batch)
            #if self.args.use_done_mask: next_q = next_q * ( 1 - done_batch.float()) #Done mask
            target_q = reward_batch + (self.gamma * next_q)

        self.critic_optim.zero_grad()
        current_q = self.critic.forward((uvfa_states.detach()),
                                        (action_batch.detach()))
        dt = self.loss(current_q, target_q)
        dt.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.critic_optim.step()

        #Actor Update
        self.actor_optim.zero_grad()
        policy_loss = -self.critic.forward(
            (uvfa_states), self.actor.forward((uvfa_states)))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.actor_optim.step()

        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        #Nets back to CPU if using memory_cuda
        self.actor.cpu()
        self.actor_target.cpu()
        self.critic_target.cpu()
        self.critic.cpu()
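A usage sketch for this goal-conditioned (UVFA-style) agent; obs, goal and batch are illustrative, and batch is assumed to be a namedtuple of tensor lists with the state/goal/next_state/action/reward fields read in train_from_batch:

# Hypothetical driver snippet
uvfa_state = np.concatenate([obs, goal])             # act() is assumed to take the concatenated input
action = agent.act(uvfa_state, is_noise=True)        # OU exploration noise added to the action
# ... step the environment, store the transition, then periodically:
agent.train_from_batch(batch)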
Example #19
class Off_Policy_Algo(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class


     """
    def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w = True):

        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau

        #Initialize actors
        self.actor = Actor(state_dim, action_dim, wwid)
        if init_w: self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)


        self.critic = Critic(state_dim, action_dim)
        if init_w: self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.policy_loss = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.critic_loss = {'mean':[]}
        self.q = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.val = {'min':[], 'max': [], 'mean':[], 'std':[]}

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1, **kwargs):
        """Runs a step of Bellman upodate and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data

             Returns:
                   None

         """

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            #Compute next q-val, next_v and target
            with torch.no_grad():
                #Policy Noise
                policy_noise = np.random.normal(0, kwargs['policy_noise'], (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise), -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])

                #Compute next action_batch
                next_action_batch = self.actor_target.forward(next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0,1)

                #Compute Q-val and value of next state masking by done
                q1, q2, _ = self.critic_target.forward(next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2

                #Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min': next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG': next_q = q1
                elif self.algo_name == 'TD3_max': next_q = torch.max(q1, q2)

                #Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)


            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward((state_batch), (action_batch))
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)

            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max': dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            dt.backward()

            self.critic_optim.step()
            self.num_critic_updates += 1


            #Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

                actor_actions = self.actor.forward(state_batch)
                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1

                self.compute_stats(policy_loss,self.policy_loss)
                policy_loss = policy_loss.mean()


                self.actor_optim.zero_grad()



                policy_loss.backward(retain_graph=True)
                self.actor_optim.step()


            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0: utils.soft_update(self.actor_target, self.actor, self.tau)
            utils.soft_update(self.critic_target, self.critic, self.tau)
Example #20
class SAC(object):
	def __init__(self, id, num_inputs, action_dim, hidden_size, gamma, critic_lr, actor_lr, tau, alpha, target_update_interval, savetag, foldername, actualize, use_gpu):

		self.num_inputs = num_inputs
		self.action_space = action_dim
		self.gamma = gamma
		self.tau = 0.005
		self.alpha = 0.2
		self.policy_type = "Gaussian"
		self.target_update_interval = 1
		self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'value_'+savetag, 'value_loss_'+savetag, 'policy_loss_'+savetag, 'mean_loss_'+savetag, 'std_loss_'+savetag], '.csv',save_iteration=1000, conv_size=1000)
		self.total_update = 0
		self.agent_id = id
		self.actualize = actualize

		self.critic = QNetwork(self.num_inputs, self.action_space, hidden_size)
		self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)
		self.soft_q_criterion = nn.MSELoss()

		if self.policy_type == "Gaussian":
			self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='GaussianPolicy')
			self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr)

			self.value = ValueNetwork(self.num_inputs, hidden_size)
			self.value_target = ValueNetwork(self.num_inputs, hidden_size)
			self.value_optim = Adam(self.value.parameters(), lr=critic_lr)
			utils.hard_update(self.value_target, self.value)
			self.value_criterion = nn.MSELoss()
		else:
			self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='DeterministicPolicy')
			self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr)

			self.critic_target = QNetwork(self.num_inputs, self.action_space, hidden_size)
			utils.hard_update(self.critic_target, self.critic)

		self.policy.cuda()
		self.value.cuda()
		self.value_target.cuda()
		self.critic.cuda()

		#Statistics Tracker
		self.q = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.val = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.value_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.mean_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.std_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}



	# def select_action(self, state, eval=False):
	#     state = torch.FloatTensor(state).unsqueeze(0)
	#     if eval == False:
	#         self.policy.train()
	#         action, _, _, _, _ = self.policy.evaluate(state)
	#     else:
	#         self.policy.eval()
	#         _, _, _, action, _ = self.policy.evaluate(state)
	#
	#     # action = torch.tanh(action)
	#     action = action.detach().cpu().numpy()
	#     return action[0]

	def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, mask_batch, updates, **ignore):
		# state_batch = torch.FloatTensor(state_batch)
		# next_state_batch = torch.FloatTensor(next_state_batch)
		# action_batch = torch.FloatTensor(action_batch)
		# reward_batch = torch.FloatTensor(reward_batch)
		# mask_batch = torch.FloatTensor(np.float32(mask_batch))

		# reward_batch = reward_batch.unsqueeze(1)  # reward_batch = [batch_size, 1]
		# mask_batch = mask_batch.unsqueeze(1)  # mask_batch = [batch_size, 1]

		"""
		Use two Q-functions to mitigate positive bias in the policy improvement step that is known
		to degrade performance of value based methods. Two Q-functions also significantly speed
		up training, especially on harder tasks.
		"""
		expected_q1_value, expected_q2_value = self.critic(state_batch, action_batch)
		new_action, log_prob, _, mean, log_std = self.policy.noisy_action(state_batch, return_only_action=False)
		utils.compute_stats(expected_q1_value, self.q)


		if self.policy_type == "Gaussian":
			"""
			Including a separate function approximator for the soft value can stabilize training.
			"""
			expected_value = self.value(state_batch)
			utils.compute_stats(expected_value, self.val)
			target_value = self.value_target(next_state_batch)
			next_q_value = reward_batch + mask_batch * self.gamma * target_value  # r(st,at) + γV_target(st+1)
		else:
			"""
			There is no need in principle to include a separate function approximator for the state value.
			We use a target critic network for the deterministic policy and remove the value network entirely.
			"""
			next_state_action, _, _, _, _, = self.policy.noisy_action(next_state_batch, return_only_action=False)
			target_critic_1, target_critic_2 = self.critic_target(next_state_batch, next_state_action)
			target_critic = torch.min(target_critic_1, target_critic_2)
			next_q_value = reward_batch + mask_batch * self.gamma * target_critic  # r(st,at) + γQ_target(st+1)

		"""
		Soft Q-function parameters can be trained to minimize the soft Bellman residual
		JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
		∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
		"""
		q1_value_loss = self.soft_q_criterion(expected_q1_value, next_q_value.detach())
		q2_value_loss = self.soft_q_criterion(expected_q2_value, next_q_value.detach())
		utils.compute_stats(q1_value_loss, self.q_loss)
		q1_new, q2_new = self.critic(state_batch, new_action)
		expected_new_q_value = torch.min(q1_new, q2_new)

		if self.policy_type == "Gaussian":
			"""
			Including a separate function approximator for the soft value can stabilize training and is convenient to
			train simultaneously with the other networks.
			Update V towards the min of the two Q-functions to reduce the overestimation bias from function-approximation error.
			JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - log π(at|st)]))^2]
			∇JV = ∇V(st)(V(st) - Q(st,at) + logπ(at|st))
			"""
			next_value = expected_new_q_value - (self.alpha * log_prob)
			value_loss = self.value_criterion(expected_value, next_value.detach())
			utils.compute_stats(value_loss, self.value_loss)
		else:
			pass

		"""
		Reparameterization trick is used to get a low variance estimator
		f(εt;st) = action sampled from the policy
		εt is an input noise vector, sampled from some fixed distribution
		Jπ = 𝔼st∼D,εt∼N[logπ(f(εt;st)|st)−Q(st,f(εt;st))]
		∇Jπ =∇log π + ([∇at log π(at|st) − ∇at Q(st,at)])∇f(εt;st)
		"""
		policy_loss = ((self.alpha * log_prob) - expected_new_q_value)
		utils.compute_stats(policy_loss, self.policy_loss)
		policy_loss = policy_loss.mean()

		# Regularization Loss
		mean_loss = 0.001 * mean.pow(2)
		std_loss = 0.001 * log_std.pow(2)
		utils.compute_stats(mean_loss, self.mean_loss)
		utils.compute_stats(std_loss, self.std_loss)
		mean_loss = mean_loss.mean()
		std_loss = std_loss.mean()


		policy_loss += mean_loss + std_loss

		self.critic_optim.zero_grad()
		q1_value_loss.backward()
		self.critic_optim.step()

		self.critic_optim.zero_grad()
		q2_value_loss.backward()
		self.critic_optim.step()

		if self.policy_type == "Gaussian":
			self.value_optim.zero_grad()
			value_loss.backward()
			self.value_optim.step()
		else:
			value_loss = torch.tensor(0.)

		self.policy_optim.zero_grad()
		policy_loss.backward()
		self.policy_optim.step()

		self.total_update += 1
		if self.agent_id == 0:
			self.tracker.update([self.q['mean'], self.q_loss['mean'], self.val['mean'], self.value_loss['mean']
								, self.policy_loss['mean'], self.mean_loss['mean'], self.std_loss['mean']], self.total_update)

		"""
		We periodically update the target weights to match the current value-function weights.
		The target parameters are updated after every n (= args.target_update_interval) updates.
		"""
		if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
			utils.soft_update(self.critic_target, self.critic, self.tau)

		elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
			utils.soft_update(self.value_target, self.value, self.tau)
		return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(), policy_loss.item()

	# Save model parameters
	def save_model(self, env_name, suffix="", actor_path=None, critic_path=None, value_path=None):
		if not os.path.exists('models/'):
			os.makedirs('models/')

		if actor_path is None:
			actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
		if critic_path is None:
			critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
		if value_path is None:
			value_path = "models/sac_value_{}_{}".format(env_name, suffix)
		print('Saving models to {}, {} and {}'.format(actor_path, critic_path, value_path))
		torch.save(self.value.state_dict(), value_path)
		torch.save(self.policy.state_dict(), actor_path)
		torch.save(self.critic.state_dict(), critic_path)

	# Load model parameters
	def load_model(self, actor_path, critic_path, value_path):
		print('Loading models from {}, {} and {}'.format(actor_path, critic_path, value_path))
		if actor_path is not None:
			self.policy.load_state_dict(torch.load(actor_path))
		if critic_path is not None:
			self.critic.load_state_dict(torch.load(critic_path))
		if value_path is not None:
			self.value.load_state_dict(torch.load(value_path))
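
The SAC learner above leans on utils.hard_update and utils.soft_update to keep its target networks in sync. That utils module is not part of this snippet, but the helpers it needs are typically just a parameter copy and a Polyak average, along the lines of the following sketch (an assumption about utils, mirroring the standalone soft_update/hard_update shown in a later example):

def hard_update(target, source):
    # Clone: copy every parameter of source into target.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
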
Ejemplo n.º 21
0
                    self.ns = torch.cat((self.ns, torch.Tensor(ns)), 0)
                    self.a = torch.cat((self.a, torch.Tensor(a)), 0)
                    self.r = torch.cat((self.r, torch.Tensor(r)), 0)
                    self.done = torch.cat((self.done, torch.Tensor(done)), 0)

                self.num_entries = len(self.s)
                if self.num_entries > DATA_LIMIT: break


        print('BUFFER LOADED WITH', self.num_entries, 'SAMPLES')

        # self.s = self.s.pin_memory()
        # self.ns = self.ns.pin_memory()
        # self.a = self.a.pin_memory()
        # self.r = self.r.pin_memory()
        # self.done = self.done.pin_memory()




args = Parameters()

pg_model = Actor(args)
pg_model.load_state_dict(torch.load('R_Skeleton/models/champ'))


evo_model = Actor(args)
evo_model.load_state_dict(torch.load('R_Skeleton/rl_models/td3_best0.95_RS_PROP0.9__ADV_-5.0_-7.5_-5.0_0.0'))


k = None
Ejemplo n.º 22
0
    def __init__(self,
                 args):  # need to initialize rollout_workers to have the blue agent
        self.args = args
        self.evolver = SSNE(
            self.args)  # this evolver implements neuro-evolution

        # MP TOOLS
        self.manager = Manager()

        self.mutate_algos = [
            Mutation_Add(self),
            Mutation_Delete(self),
            Mutation_Exchange(self)
        ]  #store all the mutate algorithm objects
        # Genealogy tool
        self.genealogy = Genealogy()

        # Init BUFFER
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        #if SA_FLAG:
        self.metrics = []
        self.last_portfolio = None
        self.T_max = 30
        self.T = self.T_max
        self.T_min = 0.2
        self.decay_rate = 0.975

        # Initialize population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, wwid))
            elif ALGO == 'TD3':
                self.pop.append(
                    Actor(args.state_dim, args.action_dim, wwid, ALGO))
                # use ALGO to distinguish different net architectures
            elif ALGO == 'dis' or ALGO == 'TD3_tennis':
                self.pop.append(
                    Actor(args.state_dim, args.action_dim, wwid, ALGO))
            else:
                assert False, "invalid algorithm type"

        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim,
                                              args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1, ALGO)
            if ALGO == 'dis':
                self.average_policy = AverageActor(args.state_dim,
                                                   args.action_dim,
                                                   -2,
                                                   ALGO,
                                                   self.pop,
                                                   self.replay_buffer,
                                                   args.buffer_gpu,
                                                   args.batch_size,
                                                   iterations=10)
                self.average_policy.share_memory()

        self.best_policy.share_memory()

        # added by macheng, share the best policy accross processes (used as internal belief update models for blue)

        # now we assign shared blue_trainer, we should train this agent such that the roll_out workers are also up to date
        # should make sure that self.best_policy (emergent learner) is also shared
        if ALGO == 'dis' or ALGO == 'TD3_tennis':
            assert hasattr(
                args, "blue_trainer"
            ), "must have blue_agent trainer to initialize rollout_worker, see line 109, class Parameter definition"
        if ALGO == 'dis':
            trainers = [args.blue_trainer, self.average_policy]
        else:
            trainers = [args.blue_trainer, None
                        ] if ALGO == 'TD3_tennis' else []

        self.trainers = trainers

        self.blue_dqn = args.blue_trainer

        # Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()
        # Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args,
                                              self.genealogy, PORTFOLIO_ID)
        self.complement_portfolio = [
        ]  #complement of the portfolio; whatever is not in the portfolio is stored here
        self.total_rollout_bucket = self.manager.list(
        )  #macheng: total_rollout_bucket represents the whole set of rollout models; rollout_bucket now resizes dynamically according to the portfolio, for SA
        self.rollout_bucket = self.total_rollout_bucket
        #self.rollout_bucket = self.manager.list()
        #print("rollout_bucker needs to be updated, main.py line 239 ")
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, -1))
            else:
                self.rollout_bucket.append(
                    Actor(args.state_dim, args.action_dim, -1, ALGO))
        # Initialize shared data bucket
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################
        # Evolutionary population Rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(id, 0, self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], False,
                          self.data_bucket, self.pop, ENV_NAME, None, ALGO,
                          self.trainers)) for id in range(args.pop_size)
        ]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        # Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 1, self.task_pipes[id][1],
                          self.result_pipes[id][0], True, self.data_bucket,
                          self.rollout_bucket, ENV_NAME, args.noise_std, ALGO,
                          self.trainers)) for id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(
                GaussianPolicy(args.state_dim, args.action_dim,
                               args.hidden_size, -1))
        else:
            self.test_bucket.append(
                Actor(args.state_dim, args.action_dim, -1, ALGO))

        # 5 Test workers
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(id, 2, self.test_task_pipes[id][1],
                          self.test_result_pipes[id][0], False, None,
                          self.test_bucket, ENV_NAME, args.noise_std, ALGO,
                          self.trainers)) for id in range(TEST_SIZE)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        # Meta-learning controller (Resource Distribution)
        self.allocation = [
        ]  #Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(
                i % len(self.portfolio))  #Start uniformly (equal resources)
        # self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count':0} for _ in range(len(self.portfolio))] #Track node statistics (each node is a learner), to compute UCB scores

        # Trackers
        self.best_score = -np.inf
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None
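
The Pipe/Process plumbing set up above follows a simple request/response handshake: the main process sends an (id, gen) tuple down a task pipe and later receives an (id, fitness, frames) tuple back on the matching result pipe. A minimal standalone sketch of that pattern, with a toy stand-in for rollout_worker (names and payloads here are illustrative, not the project's code):

from multiprocessing import Pipe, Process


def toy_rollout_worker(task_conn, result_conn):
    # Hypothetical stand-in for rollout_worker: wait for a task, send back a result tuple.
    net_id, gen = task_conn.recv()
    result_conn.send((net_id, 123.4, 500))  # (id, fitness, num_frames) -- dummy values


if __name__ == '__main__':
    task_pipe, result_pipe = Pipe(), Pipe()
    worker = Process(target=toy_rollout_worker, args=(task_pipe[1], result_pipe[0]))
    worker.start()
    task_pipe[0].send((0, 0))        # ask worker 0 to roll out generation 0
    print(result_pipe[1].recv())     # the train() loop joins on results like this one
    worker.join()
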
Ejemplo n.º 23
0
    def __init__(self, args, id):
        self.args = args
        self.id = id

        ###Initialize neuroevolution module###
        self.evolver = SSNE(self.args)

        ########Initialize population
        self.manager = Manager()
        self.popn = self.manager.list()
        for _ in range(args.popn_size):
            if args.ps == 'trunk':
                self.popn.append(
                    MultiHeadActor(args.state_dim, args.action_dim,
                                   args.hidden_size, args.config.num_agents))

            else:
                if args.algo_name == 'TD3':
                    self.popn.append(
                        Actor(args.state_dim,
                              args.action_dim,
                              args.hidden_size,
                              policy_type='DeterministicPolicy'))
                else:
                    self.popn.append(
                        Actor(args.state_dim,
                              args.action_dim,
                              args.hidden_size,
                              policy_type='GaussianPolicy'))
            self.popn[-1].eval()

        #### INITIALIZE PG ALGO #####
        if args.ps == 'trunk':

            if self.args.is_matd3 or args.is_maddpg:
                algo_name = 'TD3' if self.args.is_matd3 else 'DDPG'
                self.algo = MATD3(id, algo_name, args.state_dim,
                                  args.action_dim, args.hidden_size,
                                  args.actor_lr, args.critic_lr, args.gamma,
                                  args.tau, args.savetag, args.aux_save,
                                  args.actualize, args.use_gpu,
                                  args.config.num_agents, args.init_w)

            else:
                self.algo = MultiTD3(id, args.algo_name, args.state_dim,
                                     args.action_dim, args.hidden_size,
                                     args.actor_lr, args.critic_lr, args.gamma,
                                     args.tau, args.savetag, args.aux_save,
                                     args.actualize, args.use_gpu,
                                     args.config.num_agents, args.init_w)

        else:
            if args.algo_name == 'TD3':
                self.algo = TD3(id, args.algo_name, args.state_dim,
                                args.action_dim, args.hidden_size,
                                args.actor_lr, args.critic_lr, args.gamma,
                                args.tau, args.savetag, args.aux_save,
                                args.actualize, args.use_gpu, args.init_w)
            else:
                self.algo = SAC(id, args.state_dim, args.action_dim,
                                args.hidden_size, args.gamma, args.critic_lr,
                                args.actor_lr, args.tau, args.alpha,
                                args.target_update_interval, args.savetag,
                                args.aux_save, args.actualize, args.use_gpu)

        #### Rollout Actor is a template used for MP #####
        self.rollout_actor = self.manager.list()

        if args.ps == 'trunk':
            self.rollout_actor.append(
                MultiHeadActor(args.state_dim, args.action_dim,
                               args.hidden_size, args.config.num_agents))
        else:
            if args.algo_name == 'TD3':
                self.rollout_actor.append(
                    Actor(args.state_dim,
                          args.action_dim,
                          args.hidden_size,
                          policy_type='DeterministicPolicy'))
            else:
                self.rollout_actor.append(
                    Actor(args.state_dim,
                          args.action_dim,
                          args.hidden_size,
                          policy_type='GaussianPolicy'))

        #Initialize buffer
        if args.ps == 'trunk':
            self.buffer = [
                Buffer(args.buffer_size,
                       buffer_gpu=False,
                       filter_c=args.filter_c)
                for _ in range(args.config.num_agents)
            ]
        else:
            self.buffer = Buffer(args.buffer_size,
                                 buffer_gpu=False,
                                 filter_c=args.filter_c)

        #Agent metrics
        self.fitnesses = [[] for _ in range(args.popn_size)]

        ###Best Policy HOF####
        self.champ_ind = 0
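
The constructor above pulls a long list of fields off args. A hypothetical, minimal configuration object that would satisfy this __init__ might look as follows (every value is illustrative, and the enclosing class name is not shown in the snippet):

from types import SimpleNamespace

config = SimpleNamespace(num_agents=3)
args = SimpleNamespace(
    ps='trunk',                      # parameter-sharing mode: 'trunk', 'full', or anything else for independent actors
    algo_name='TD3',
    is_matd3=False, is_maddpg=False,
    state_dim=20, action_dim=4, hidden_size=128,
    actor_lr=1e-4, critic_lr=1e-3, gamma=0.99, tau=5e-3,
    alpha=0.2, target_update_interval=1,
    savetag='demo', aux_save='/tmp/', actualize=False, use_gpu=False, init_w=True,
    popn_size=10, buffer_size=100000, filter_c=1,
    config=config,
)
# agent = Agent(args, id=0)  # hypothetical class name; only its __init__ is shown above
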
Ejemplo n.º 24
0
class TD3(object):
	"""Classes implementing TD3 and DDPG off-policy learners

		 Parameters:
			   args (object): Parameter class


	 """
	def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, init_w = True):

		self.algo_name = algo_name; self.gamma = gamma; self.tau = tau; self.total_update = 0; self.agent_id = id;	self.actualize = actualize; self.use_gpu = use_gpu
		self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag, 'alz_score'+savetag,'alz_policy'+savetag], '.csv', save_iteration=1000, conv_size=1000)

		#Initialize actors
		self.policy = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy')
		if init_w: self.policy.apply(utils.init_weights)
		self.policy_target = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy')
		utils.hard_update(self.policy_target, self.policy)
		self.policy_optim = Adam(self.policy.parameters(), actor_lr)


		self.critic = QNetwork(state_dim, action_dim,hidden_size)
		if init_w: self.critic.apply(utils.init_weights)
		self.critic_target = QNetwork(state_dim, action_dim, hidden_size)
		utils.hard_update(self.critic_target, self.critic)
		self.critic_optim = Adam(self.critic.parameters(), critic_lr)

		if actualize:
			self.ANetwork = ActualizationNetwork(state_dim, action_dim, hidden_size)
			if init_w: self.ANetwork.apply(utils.init_weights)
			self.actualize_optim = Adam(self.ANetwork.parameters(), critic_lr)
			self.actualize_lr = 0.2
			if use_gpu: self.ANetwork.cuda()

		self.loss = nn.MSELoss()

		if use_gpu:
			self.policy_target.cuda(); self.critic_target.cuda(); self.policy.cuda(); self.critic.cuda()
		self.num_critic_updates = 0

		#Statistics Tracker
		#self.action_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.alz_score = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.alz_policy = {'min':None, 'max': None, 'mean':None, 'std':None}
		#self.val = {'min':None, 'max': None, 'mean':None, 'std':None}
		#self.value_loss = {'min':None, 'max': None, 'mean':None, 'std':None}


	def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, global_reward, num_epoch=1, **kwargs):
		"""Runs a step of Bellman upodate and policy gradient using a batch of experiences

			 Parameters:
				  state_batch (tensor): Current States
				  next_state_batch (tensor): Next States
				  action_batch (tensor): Actions
				  reward_batch (tensor): Rewards
				  done_batch (tensor): Done batch
				  num_epoch (int): Number of learning iteration to run with the same data

			 Returns:
				   None

		 """

		if isinstance(state_batch, list): state_batch = torch.cat(state_batch); next_state_batch = torch.cat(next_state_batch); action_batch = torch.cat(action_batch); reward_batch = torch.cat(reward_batch); done_batch = torch.cat(done_batch); global_reward = torch.cat(global_reward)

		for _ in range(num_epoch):
			########### CRITIC UPDATE ####################

			#Compute next q-val, next_v and target
			with torch.no_grad():
				#Policy Noise
				policy_noise = np.random.normal(0, kwargs['policy_noise'], (action_batch.size()[0], action_batch.size()[1]))
				policy_noise = torch.clamp(torch.Tensor(policy_noise), -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])

				#Compute next action_batch
				next_action_batch = self.policy_target.clean_action(next_state_batch, return_only_action=True) + (policy_noise.cuda() if self.use_gpu else policy_noise)
				next_action_batch = torch.clamp(next_action_batch, -1, 1)

				#Compute Q-val and value of next state masking by done
				q1, q2 = self.critic_target.forward(next_state_batch, next_action_batch)
				q1 = (1 - done_batch) * q1
				q2 = (1 - done_batch) * q2
				#next_val = (1 - done_batch) * next_val

				#Select which q to use as next-q (depends on algo)
				if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min': next_q = torch.min(q1, q2)
				elif self.algo_name == 'DDPG': next_q = q1
				elif self.algo_name == 'TD3_max': next_q = torch.max(q1, q2)

				#Compute target q and target val
				target_q = reward_batch + (self.gamma * next_q)
				#if self.args.use_advantage: target_val = reward_batch + (self.gamma * next_val)

			if self.actualize:
				##########Actualization Network Update
				current_Ascore = self.ANetwork.forward(state_batch, action_batch)
				utils.compute_stats(current_Ascore, self.alz_score)
				target_Ascore = (self.actualize_lr) * (global_reward * 10.0) + (1 - self.actualize_lr) * current_Ascore.detach()
				actualize_loss = self.loss(target_Ascore, current_Ascore).mean()



			self.critic_optim.zero_grad()
			current_q1, current_q2 = self.critic.forward((state_batch), (action_batch))
			utils.compute_stats(current_q1, self.q)

			dt = self.loss(current_q1, target_q)
			# if self.args.use_advantage:
			#     dt = dt + self.loss(current_val, target_val)
			#     utils.compute_stats(current_val, self.val)

			if self.algo_name == 'TD3' or self.algo_name == 'TD3_max': dt = dt + self.loss(current_q2, target_q)
			utils.compute_stats(dt, self.q_loss)

			# if self.args.critic_constraint:
			#     if dt.item() > self.args.critic_constraint_w:
			#         dt = dt * (abs(self.args.critic_constraint_w / dt.item()))
			dt.backward()

			self.critic_optim.step()
			self.num_critic_updates += 1

			if self.actualize:
				self.actualize_optim.zero_grad()
				actualize_loss.backward()
				self.actualize_optim.step()


			#Delayed Actor Update
			if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

				actor_actions = self.policy.clean_action(state_batch, return_only_action=False)

				# # Trust Region constraint
				# if self.args.trust_region_actor:
				#     with torch.no_grad(): old_actor_actions = self.actor_target.forward(state_batch)
				#     actor_actions = action_batch - old_actor_actions


				Q1, Q2 = self.critic.forward(state_batch, actor_actions)

				# if self.args.use_advantage: policy_loss = -(Q1 - val)
				policy_loss = -Q1

				utils.compute_stats(-policy_loss,self.policy_loss)
				policy_loss = policy_loss.mean()

				###Actualize Policy Update
				if self.actualize:
					A1 = self.ANetwork.forward(state_batch, actor_actions)
					utils.compute_stats(A1, self.alz_policy)
					policy_loss += -A1.mean()*0.1



				self.policy_optim.zero_grad()



				policy_loss.backward(retain_graph=True)
				#nn.utils.clip_grad_norm_(self.actor.parameters(), 10)
				# if self.args.action_loss:
				#     action_loss = torch.abs(actor_actions-0.5)
				#     utils.compute_stats(action_loss, self.action_loss)
				#     action_loss = action_loss.mean() * self.args.action_loss_w
				#     action_loss.backward()
				#     #if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9 #Decay action_w loss if action loss is larger than policy gradient loss
				self.policy_optim.step()


			# if self.args.hard_update:
			#     if self.num_critic_updates % self.args.hard_update_freq == 0:
			#         if self.num_critic_updates % self.args.policy_ups_freq == 0: self.hard_update(self.actor_target, self.actor)
			#         self.hard_update(self.critic_target, self.critic)


			if self.num_critic_updates % kwargs['policy_ups_freq'] == 0: utils.soft_update(self.policy_target, self.policy, self.tau)
			utils.soft_update(self.critic_target, self.critic, self.tau)

			self.total_update += 1
			if self.agent_id == 0:
				self.tracker.update([self.q['mean'], self.q_loss['mean'], self.policy_loss['mean'],self.alz_score['mean'], self.alz_policy['mean']] ,self.total_update)
class TD3(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class


     """
    def to_cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.critic.cuda()

    def __init__(self, args):

        self.args = args

        self.actor = Actor(args)
        self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(args)
        self.critic.apply(utils.init_weights)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        self.hard_update(
            self.actor_target,
            self.actor)  # Make sure target is with the same weight
        self.hard_update(self.critic_target, self.critic)
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self,
                          state_batch,
                          next_state_batch,
                          action_batch,
                          reward_batch,
                          done_batch,
                          dpp,
                          num_epoch=1):
        """Runs a step of Bellman upodate and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data

             Returns:
                   None

         """

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            #Compute next q-val, next_v and target
            with torch.no_grad():
                #Policy Noise
                policy_noise = np.random.normal(
                    0, self.args.policy_noise,
                    (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -self.args.policy_noise_clip,
                                           self.args.policy_noise_clip)

                #Compute next action_batch
                next_action_batch = self.actor_target.forward(
                    next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                #Compute Q-val and value of next state masking by done
                q1, q2, next_val = self.critic_target.forward(
                    next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2
                next_val = (1 - done_batch) * next_val
                next_q = torch.min(q1, q2)

                #Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)
                target_val = reward_batch + (self.gamma * next_val)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward(
                (state_batch), (action_batch))
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)
            dt = dt + self.loss(current_val, target_val)
            self.compute_stats(current_val, self.val)

            dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            dt.backward()

            self.critic_optim.step()
            self.num_critic_updates += 1

            #Delayed Actor Update
            if self.num_critic_updates % self.args.policy_ups_freq == 0:

                actor_actions = self.actor.forward(state_batch)

                if dpp:
                    policy_loss = -self.shape_dpp(self.critic, self.actor,
                                                  state_batch,
                                                  self.args.sensor_model)

                else:
                    Q1, Q2, val = self.critic.forward(state_batch,
                                                      actor_actions)
                    policy_loss = -(Q1 - val)

                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()
                self.actor_optim.zero_grad()

                policy_loss.backward(retain_graph=True)
                if self.args.action_loss:
                    action_loss = torch.abs(actor_actions - 0.5)
                    self.compute_stats(action_loss, self.action_loss)
                    action_loss = action_loss.mean() * self.args.action_loss_w
                    action_loss.backward()
                    #if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9 #Decay action_w loss if action loss is larger than policy gradient loss
                self.actor_optim.step()

                if self.num_critic_updates % self.args.policy_ups_freq == 0:
                    self.soft_update(self.actor_target, self.actor, self.tau)
                self.soft_update(self.critic_target, self.critic, self.tau)

    def soft_update(self, target, source, tau):
        """Soft update from target network to source

            Parameters:
                  target (object): A pytorch model
                  source (object): A pytorch model
                  tau (float): Tau parameter

            Returns:
                None

        """

        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        """Hard update (clone) from target network to source

            Parameters:
                  target (object): A pytorch model
                  source (object): A pytorch model

            Returns:
                None
        """

        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    def shape_dpp(self, critic, actor, state, sensor_model):

        Q1, _, val = critic((state), actor((state)))
        original_T = Q1 - val

        all_adv = [original_T]

        state = utils.to_numpy(state.cpu())
        #mid_index = int(180 / self.args.angle_res)
        coupling = self.args.coupling

        max_ind = int(360 / self.args.angle_res)

        perturb_index = [
            np.argwhere(state[i, 0:max_ind] != -1).flatten()
            for i in range(len(state))
        ]
        for i, entry in enumerate(perturb_index):
            np.random.shuffle(entry)
            if len(entry) < coupling:
                perturb_index[i] = np.tile(entry, (coupling, 1)).flatten()

        for coupling_mag in range(coupling):

            empty_ind = [int(entry[coupling_mag]) for entry in perturb_index]

            if sensor_model == 'density':
                for i, ind in enumerate(empty_ind):
                    state[i, ind] = 1.0
            elif sensor_model == 'closets':
                for i, ind in enumerate(empty_ind):
                    state[i, ind] = 1.0

            shaped_state = utils.to_tensor(state).cuda()

            Q1, _, val = critic((shaped_state), actor((shaped_state)))
            adv = (Q1 - val) / (coupling_mag + 1)
            all_adv.append(adv)

        all_adv = torch.cat(all_adv, 1)
        dpp_max = torch.max(all_adv, 1)[0].unsqueeze(1)

        with torch.no_grad():
            normalizer = dpp_max / original_T

        return original_T * normalizer
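
Both TD3 critic updates above use the same target-policy smoothing trick: clipped Gaussian noise is added to the target actor's action before the target Q-values are computed. Isolated from the class, the step looks roughly like this (a sketch; actor_target here stands for any callable that maps states to actions, and the noise and clamp bounds are illustrative defaults):

import torch


def smoothed_target_action(actor_target, next_state, noise_std=0.2, noise_clip=0.5, low=-1.0, high=1.0):
    # TD3 target smoothing: perturb the target action with clipped Gaussian noise,
    # then clamp the result back into the valid action range.
    with torch.no_grad():
        action = actor_target(next_state)
        noise = torch.clamp(torch.randn_like(action) * noise_std, -noise_clip, noise_clip)
        return torch.clamp(action + noise, low, high)
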
Ejemplo n.º 26
0
 def mutate(root, info, input=None):
     ok = True
     actor_instance = Actor(name=input.name, pic=input.pic)
     actor_instance.save()
     return CreateActor(ok=ok, actor=actor_instance)
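
The mutate resolver above normally lives inside a graphene Mutation class that declares its input arguments and output fields. A minimal sketch of that surrounding boilerplate, assuming ActorInput (an input type carrying name/pic) and ActorType (the object type for the Actor model) are defined elsewhere:

import graphene


class CreateActor(graphene.Mutation):
    class Arguments:
        input = ActorInput(required=True)   # assumed input type carrying name/pic

    ok = graphene.Boolean()
    actor = graphene.Field(ActorType)       # assumed object type for the Actor model

    def mutate(root, info, input=None):
        ok = True
        actor_instance = Actor(name=input.name, pic=input.pic)
        actor_instance.save()
        return CreateActor(ok=ok, actor=actor_instance)
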
Ejemplo n.º 27
0
class CERL_Agent:
    """Main CERL class containing all methods for CERL

		Parameters:
		args (object): Parameter class with all the parameters

	"""
    def __init__(self,
                 args):  # need to initialize rollout_workers to have the blue agent
        self.args = args
        self.evolver = SSNE(
            self.args)  # this evolver implements neuro-evolution

        # MP TOOLS
        self.manager = Manager()

        self.mutate_algos = [
            Mutation_Add(self),
            Mutation_Delete(self),
            Mutation_Exchange(self)
        ]  #store all the mutate algorithm objects
        # Genealogy tool
        self.genealogy = Genealogy()

        # Init BUFFER
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        #if SA_FLAG:
        self.metrics = []
        self.last_portfolio = None
        self.T_max = 30
        self.T = self.T_max
        self.T_min = 0.2
        self.decay_rate = 0.975

        # Initialize population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, wwid))
            elif ALGO == 'TD3':
                self.pop.append(
                    Actor(args.state_dim, args.action_dim, wwid, ALGO))
                # use ALGO to distinguish different net architectures
            elif ALGO == 'dis' or ALGO == 'TD3_tennis':
                self.pop.append(
                    Actor(args.state_dim, args.action_dim, wwid, ALGO))
            else:
                assert False, "invalid algorithm type"

        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim,
                                              args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1, ALGO)
            if ALGO == 'dis':
                self.average_policy = AverageActor(args.state_dim,
                                                   args.action_dim,
                                                   -2,
                                                   ALGO,
                                                   self.pop,
                                                   self.replay_buffer,
                                                   args.buffer_gpu,
                                                   args.batch_size,
                                                   iterations=10)
                self.average_policy.share_memory()

        self.best_policy.share_memory()

        # added by macheng, share the best policy accross processes (used as internal belief update models for blue)

        # now we assign shared blue_trainer, we should train this agent such that the roll_out workers are also up to date
        # should make sure that self.best_policy (emergent learner) is also shared
        if ALGO == 'dis' or ALGO == 'TD3_tennis':
            assert hasattr(
                args, "blue_trainer"
            ), "must have blue_agent trainer to initialize rollout_worker, see line 109, class Parameter definition"
        if ALGO == 'dis':
            trainers = [args.blue_trainer, self.average_policy]
        else:
            trainers = [args.blue_trainer, None
                        ] if ALGO == 'TD3_tennis' else []

        self.trainers = trainers

        self.blue_dqn = args.blue_trainer

        # Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()
        # Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args,
                                              self.genealogy, PORTFOLIO_ID)
        self.complement_portfolio = [
        ]  #complement of the portfolio; whatever is not in the portfolio is stored here
        self.total_rollout_bucket = self.manager.list(
        )  #macheng: total_rollout_bucket represents the whole set of rollout models; rollout_bucket now resizes dynamically according to the portfolio, for SA
        self.rollout_bucket = self.total_rollout_bucket
        #self.rollout_bucket = self.manager.list()
        #print("rollout_bucker needs to be updated, main.py line 239 ")
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, -1))
            else:
                self.rollout_bucket.append(
                    Actor(args.state_dim, args.action_dim, -1, ALGO))
        # Initialize shared data bucket
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################
        # Evolutionary population Rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(id, 0, self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], False,
                          self.data_bucket, self.pop, ENV_NAME, None, ALGO,
                          self.trainers)) for id in range(args.pop_size)
        ]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        # Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 1, self.task_pipes[id][1],
                          self.result_pipes[id][0], True, self.data_bucket,
                          self.rollout_bucket, ENV_NAME, args.noise_std, ALGO,
                          self.trainers)) for id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(
                GaussianPolicy(args.state_dim, args.action_dim,
                               args.hidden_size, -1))
        else:
            self.test_bucket.append(
                Actor(args.state_dim, args.action_dim, -1, ALGO))

        # 5 Test workers
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(id, 2, self.test_task_pipes[id][1],
                          self.test_result_pipes[id][0], False, None,
                          self.test_bucket, ENV_NAME, args.noise_std, ALGO,
                          self.trainers)) for id in range(TEST_SIZE)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        # Meta-learning controller (Resource Distribution)
        self.allocation = [
        ]  #Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(
                i % len(self.portfolio))  #Start uniformly (equal resources)
        # self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count':0} for _ in range(len(self.portfolio))] #Track node statistics (each node is a learner), to compute UCB scores

        # Trackers
        self.best_score = -np.inf
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None

        # trainer contains the blue_dqn to be trained, and the red model used for belief update, red_actor is the actual red agent trained against
        # id is the actual red agent id

    def _update_SA_temperature(self):
        self.T = max(self.T * self.decay_rate, self.T_min)

    def _get_accept_rate(self):
        if RANDOM_WALK:
            return 1.0
        else:
            if self.metrics[-1] > self.metrics[-2]:
                return 1.0
            else:
                return np.exp((self.metrics[-1] - self.metrics[-2]) / self.T)

    def _mutate(self):
        while True:
            mutate_algo_index = random.choice(range(3))
            if self._try_mutate(mutate_algo_index):
                return

    def _try_mutate(self,
                    algo_index):  # 0 for add, 1 for delete, 2 for exchange
        return self.mutate_algos[algo_index].try_mutate()

    def simulated_annealing(self, metric):  #take in the current metric
        self.metrics.append(metric)
        if self.last_portfolio:  #has last_portfolio
            accept_rate = self._get_accept_rate()  #based on self.metrics[-2:]
            self._update_SA_temperature()
            if np.random.random() > accept_rate:  #reject
                self.portfolio = self.last_portfolio
                self.complement_portfolio = self.last_complement_portfolio

        self.last_portfolio = copy.copy(
            self.portfolio)  #maintain a shallow copy as a backup in case the new portfolio is rejected
        self.last_complement_portfolio = copy.copy(self.complement_portfolio)
        self._mutate()  #perturb the portfolio
        # update rollout_bucket size, only the first len(self.portfolio) rollout_buckets are visible
        self.update_rollout_bucket()
        # update allocation, to be compatible with the current portfolio
        self.update_allocation()

    def update_rollout_bucket(self):
        self.rollout_bucket = self.total_rollout_bucket[:len(self.portfolio)]

    def train_blue_dqn(
        self,
        trainers,
        env_name,
        gen,
        ALGO='dis',
        pomdp_adv=False
    ):  #in this method, rollout and training are done together, opponent sampled from the population
        NUM_EPISODE = 100  #train 100 episodes for the blue to converge to the new best response to red
        EPS_START = max(1.0 * 0.5**(gen - 10),
                        0.15) if gen >= 10 else 1.0  #initial epsilon
        EPS_END = 0.05
        EPS_DECAY = 0.995

        if ALGO == 'dis':  # make env with blue and red policy agent inside,
            assert trainers is not None
            dis_env = make_self_play_env(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                return_policy_agent=False,
                trainers=trainers
            )[0]  # trainer if not None, first is the shared DQN agent, second is the best red policy
            env = EnvironmentWrapper(
                env_name, ALGO, dis_env,
                0)  # the "0" is the index for training blue agent
        elif ALGO == 'TD3_tennis':
            no_graphics = not RENDER
            tennis_env = make_tennis_env.TennisEnvFactory(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                no_graphics=no_graphics,
                pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', ALGO, tennis_env, 0)
        else:
            env = EnvironmentWrapper(env_name, ALGO)

        blue_dqn = trainers[0]
        average_reward = 0
        eps = EPS_START

        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0

        for it in range(NUM_EPISODE):
            if not pomdp_adv:  #if pomdp_adv, make sure that TD3_actor is never used
                id = np.random.choice(np.array(range(len(self.pop))))
                red_actor = self.pop[id]
                env.set_TD3_actor(red_actor)

            fitness = 0.0
            #here fitness is simply the reward
            total_frame = 0
            state = env.reset()
            env.randomize_neu_adv()

            if pomdp_adv:
                env.try_set_pomdp_adv(
                )  #try to set the opponent to pomdp adv if the opponent is an adversary, else do nothing

            render_flag = (np.random.random() < 0.05)
            while True:  # unless done

                action = blue_dqn.act(state, eps=eps)
                # action = utils.to_numpy(action)

                next_state, reward, done, info = env.step(
                    copy.deepcopy(action), use_actual_reward=DRQN
                )  #after calling env.step, evaluator initialized later does not work
                #should be something wrong with the internal red model?
                blue_dqn.step(state, action, reward, next_state, done)

                if render_flag and self.args.render:
                    env.render()
                # next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
                state = next_state
                fitness += reward
                total_frame += 1

                # DONE FLAG IS Received
                if done:
                    average_red_reward += env.get_red_reward(
                    ) if env.get_red_reward() is not None else 0
                    average_actual_blue_reward += env.get_blue_actual_reward(
                    ) if env.get_blue_actual_reward() is not None else 0
                    red_count += 1 if env.get_red_reward() is not None else 0
                    blue_count += 1 if env.get_blue_actual_reward(
                    ) is not None else 0
                    if render_flag: env.env.close()
                    break

            average_reward += fitness
            eps = max(EPS_END, EPS_DECAY * eps)

        if gen >= 10 and gen % 5 == 0:
            blue_dqn.save_net('./pytorch_models/train_blue_dqn_step_' +
                              str(gen) + '.pth')

        average_reward /= NUM_EPISODE
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward

    def evaluate_training_fixed_blue(
            self):  #this evaluates against the training opponent (red pop)
        self.evaluator.pomdp_adv = False
        return self.evaluator.evaluate_fixed_agents(self.trainers[0],
                                                    self.trainers[1], self.pop)

    def train(self, gen, frame_tracker):
        """Main training loop to do rollouts, neureoevolution, and policy gradients

			Parameters:
				gen (int): Current epoch of training

			Returns:
				None
		"""
        ################ START ROLLOUTS ##############

        # Start Evolution rollouts
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send((id, gen))
                    self.evo_flag[id] = False

        # Sync all learners actor to cpu (rollout) actor
        # (update rollout parameter using the learner parameter, such that rollout worker is up to date)
        for i, learner in enumerate(self.portfolio):  #number of learner
            learner.algo.actor.cpu()
            utils.hard_update(
                self.rollout_bucket[i], learner.algo.actor
            )  #rollout bucket is now synchronized with learner to perform rollout for learner actors
            if torch.cuda.is_available(): learner.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id, learner_id in enumerate(
                self.allocation):  #number of rollout_size
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(
                    (learner_id, gen)
                )  #allocation records the id of the learner each bucket should run, so rollout_id indexes rollout_bucket
                self.roll_flag[rollout_id] = False

        # Start Test rollouts
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send((0, gen))

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        # main training loop
        if self.replay_buffer.__len__(
        ) > self.args.batch_size * 10:  ###BURN IN PERIOD
            self.replay_buffer.tensorify(
            )  # Tensorify the buffer for fast sampling

            # Spin up threads for each learner
            threads = [
                threading.Thread(
                    target=learner.update_parameters,
                    args=(self.replay_buffer, self.args.buffer_gpu,
                          self.args.batch_size,
                          int(self.gen_frames * self.args.gradperstep)))
                for learner in self.portfolio
            ]  #macheng: do we want to train all the learners?

            # Start threads
            for thread in threads:
                thread.start()

            # Join threads
            for thread in threads:
                thread.join()

            # Now update average_policy
            #self.average_policy.cuda()
            if ALGO == 'dis':
                self.average_policy.update(
                )  #update the average_policy parameter with supervised learning

            self.gen_frames = 0

            #########Visualize Learner Critic Function#################
            # if self.replay_buffer.__len__() % 2500 == 0:
            #	visualize_critic(self.portfolio[2], make_self_play_env(trainers=[[],[]])[0], 50)  #arguments: Learner, env, N_GRID

        ########## SOFT -JOIN ROLLOUTS FOR EVO POPULATION ############
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%)
                if len(all_fitness
                       ) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD -JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        for i in range(self.args.rollout_size):
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score: self.best_score = fitness

            self.roll_flag[i] = True

        # Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()
        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        # ms:best policy is always up to date
        # so here the best learner is saved
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(
                        self.pop[champ_index].state_dict(),
                        self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score",
                          '%.2f' % max(all_fitness))

        else:  #Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  #Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = (np.std(test_scores))

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        # NeuroEvolution's probabilistic selection and recombination step
        # ms: this epoch() method implements neuro-evolution
        if not ISOLATE_PG:  #seems pop_size and rollout_size must be 10, otherwise this will produce an error
            if gen % 5 == 0:
                self.evolver.epoch(
                    gen, self.genealogy, self.pop, all_net_ids, all_fitness,
                    self.rollout_bucket
                )  #this method also copies the learner into the evolver
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids,
                                   all_fitness, [])

        # META LEARNING - RESET ALLOCATION USING UCB
        if gen % 1 == 0:
            self.update_allocation()
        # Metrics
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid

    def update_allocation(self):
        self.allocation = ucb(len(self.allocation), self.portfolio,
                              self.args.ucb_coefficient)

    def sim_and_eval_POMDP(self):
        self.evaluator = Evaluator(
            self, 5, self.trainers,
            pomdp_adv=True)  # evaluator must be created before train_dqn
        for gen in range(1000000):
            print('gen=', gen)
            blue_score, red_score, actual_blue_score = agent.train_blue_dqn(
                agent.trainers, ENV_NAME, gen, ALGO='dis', pomdp_adv=True)
            print('Env', ENV_NAME, 'Gen', gen,
                  ", Training average: Blue agent score: ", blue_score,
                  " Red score: ", red_score, " Actual blue score: ",
                  actual_blue_score)
            blue_score, red_score, actual_blue_score, _ = self.evaluator.evaluate()  #evaluate() also returns the belief/true-type log, unused here
            print("Evaluation result: Blue agent score: ", blue_score,
                  " Red score: ", red_score, " Actual blue score: ",
                  actual_blue_score)
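
The `update_allocation` method above delegates to a `ucb` helper that is not shown in this snippet. Below is a minimal sketch of what such an allocator could look like, assuming each learner in the portfolio exposes a running mean fitness as `.value` and a rollout counter as `.visit_count` (these field names are assumptions; `update_stats` is assumed to keep them current), not the repository's actual implementation:

import math

def ucb(num_slots, portfolio, c):
    """Hypothetical UCB1-style allocator: map each of num_slots rollout
    workers to a learner index, trading off mean fitness (.value) against
    how rarely a learner has been rolled out (.visit_count)."""
    visits = [max(learner.visit_count, 1) for learner in portfolio]
    values = [learner.value for learner in portfolio]
    allocation = []
    for _ in range(num_slots):
        total = sum(visits)
        scores = [values[i] + c * math.sqrt(math.log(total) / visits[i])
                  for i in range(len(portfolio))]
        best = max(range(len(portfolio)), key=lambda i: scores[i])
        allocation.append(best)
        visits[best] += 1  # count the assignment so later slots can explore other learners
    return allocation

Incrementing the local visit count after each assignment is only a heuristic to spread slots across learners within one call; the project's actual allocator may distribute rollouts differently.
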
Ejemplo n.º 28
0
class DDPG(object):
    def __init__(self, args):

        self.args = args

        self.actor = Actor(args, init=True)
        self.actor_target = Actor(args, init=True)
        self.actor_optim = Adam(self.actor.parameters(), lr=0.5e-4)

        self.critic = Critic(args)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=0.5e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        hard_update(self.actor_target,
                    self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

    def update_parameters(self, batch):
        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        if self.args.use_done_mask: done_batch = torch.cat(batch.done)

        #Load everything to GPU if not already
        if self.args.is_memory_cuda and not self.args.is_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()
            self.critic.cuda()
            state_batch = state_batch.cuda()
            next_state_batch = next_state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            if self.args.use_done_mask: done_batch = done_batch.cuda()

        #Critic Update
        with torch.no_grad():
            #No gradients are needed through the target networks
            next_action_batch = self.actor_target.forward(next_state_batch)
            next_q = self.critic_target.forward(next_state_batch,
                                                next_action_batch)
            if self.args.use_done_mask:
                next_q = next_q * (1 - done_batch.float())  #Done mask
            target_q = reward_batch + (self.gamma * next_q)

        self.critic_optim.zero_grad()
        current_q = self.critic.forward(state_batch, action_batch)
        dt = self.loss(current_q, target_q)
        dt.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.critic_optim.step()

        #Actor Update
        self.actor_optim.zero_grad()
        policy_loss = -self.critic.forward(state_batch,
                                           self.actor.forward(state_batch))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), 10)  #clip the actor's gradients (not the critic's) for the actor step
        self.actor_optim.step()

        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        #Nets back to CPU if using memory_cuda
        if self.args.is_memory_cuda and not self.args.is_cuda:
            self.actor.cpu()
            self.actor_target.cpu()
            self.critic_target.cpu()
            self.critic.cpu()
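
This DDPG example calls `hard_update` and `soft_update` without showing their definitions; they are presumably the project's standard target-network helpers. A minimal sketch matching the conventional implementations (a verbatim copy for hard updates, Polyak averaging for soft updates):

def hard_update(target, source):
    """Copy every parameter of source into target verbatim."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)

def soft_update(target, source, tau):
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + source_param.data * tau)
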
Ejemplo n.º 29
0
    def __init__(self, args):
        self.args = args
        self.evolver = SSNE(self.args)

        #MP TOOLS
        self.manager = Manager()

        #Genealogy tool
        self.genealogy = Genealogy()

        #Initialize population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, wwid))
            else:
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid))

        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim,
                                              args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1)

        #Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()

        #Init BUFFER
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        #Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args,
                                              self.genealogy, PORTFOLIO_ID)
        self.rollout_bucket = self.manager.list()
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, -1))
            else:
                self.rollout_bucket.append(
                    Actor(args.state_dim, args.action_dim, -1))

        # Initialize shared data bucket
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################

        #Evolutionary population Rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(id, self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], False,
                          self.data_bucket, self.pop, ENV_NAME, None, ALGO))
            for id in range(args.pop_size)
        ]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        #Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, self.task_pipes[id][1], self.result_pipes[id][0],
                          True, self.data_bucket, self.rollout_bucket,
                          ENV_NAME, args.noise_std, ALGO))
            for id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        #Test bucket
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(
                GaussianPolicy(args.state_dim, args.action_dim,
                               args.hidden_size, -1))
        else:
            self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1))

        #5 Test workers
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(id, self.test_task_pipes[id][1],
                          self.test_result_pipes[id][0], False, None,
                          self.test_bucket, ENV_NAME, args.noise_std, ALGO))
            for id in range(TEST_SIZE)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        #Meta-learning controller (Resource Distribution)
        self.allocation = []  #Allocation controls the resource distribution across learners
        for i in range(args.rollout_size):
            self.allocation.append(i % len(self.portfolio))  #Start uniformly (equal resources)
        #self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count': 0} for _ in range(len(self.portfolio))] #Track per-learner node statistics (each node is a learner) used to compute UCB scores

        #Trackers
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None
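
All the worker processes spawned above run a `rollout_worker` function whose body is not part of this snippet. The sketch below shows a loop such a worker could plausibly run, under the assumption that it receives a policy index over its task pipe, rolls out one episode with that policy from the shared model bucket, optionally stores transitions in the shared data bucket, and reports `(identifier, fitness, num_frames)` back over the result pipe. The signature, message formats, and the `make_env` / `clean_action` helpers are assumptions for illustration, not the repository's actual API:

def rollout_worker(worker_id, task_pipe, result_pipe, store_data,
                   data_bucket, model_bucket, env_name, noise_std, algo):
    env = make_env(env_name)  # assumed environment factory
    while True:
        msg = task_pipe.recv()  # block until the main process dispatches work
        if msg == 'TERMINATE' or (isinstance(msg, tuple) and msg[0] == 'TERMINATE'):
            break
        identifier = msg[0] if isinstance(msg, tuple) else msg
        policy = model_bucket[identifier]  # shared Manager list of actors
        state, fitness, num_frames, done = env.reset(), 0.0, 0, False
        while not done:
            action = policy.clean_action(state)  # assumed inference helper
            next_state, reward, done, _ = env.step(action)
            if store_data and data_bucket is not None:
                data_bucket.append([state, next_state, action, reward, done])
            state = next_state
            fitness += reward
            num_frames += 1
        result_pipe.send((identifier, fitness, num_frames))

This mirrors the bookkeeping on the receiving side earlier in the document, where the main process reads `entry[0]` as the learner/population index, `entry[1]` as the fitness, and `entry[2]` as the frame count before refreshing the replay buffer.
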
Ejemplo n.º 30
0
class Evaluator(object):
    def __init__(self, CERL_agent, num_workers, trainers,
                 pomdp_adv=False):  #trainers: first is the blue agent, second is the red model
        self.num_workers = num_workers
        self.trainers = trainers
        self.pomdp_adv = pomdp_adv
        self.args = CERL_agent.args
        self.drqn = CERL_agent.args.drqn  #whether the blue agent uses a DRQN
        if self.pomdp_adv:
            self.trainers = [trainers[0], None]  #make sure the red model is never used
        self.buffer_gpu = CERL_agent.args.buffer_gpu
        self.batch_size = CERL_agent.args.batch_size
        self.algo = CERL_agent.args.algo
        self.state_dim = CERL_agent.args.state_dim
        self.action_dim = CERL_agent.args.action_dim
        self.buffer = Buffer(BUFFER_SIZE,
                             self.buffer_gpu)  #initialize own replay buffer
        self.data_bucket = self.buffer.tuples
        self.evo_task_pipes = [Pipe() for _ in range(self.num_workers)]
        self.evo_result_pipes = [Pipe() for _ in range(self.num_workers)]
        self.actual_red_worker = Actor(
            CERL_agent.args.state_dim, CERL_agent.args.action_dim, -1,
            'dis')  #this model is shared across the workers
        self.actual_red_worker.share_memory()
        self.td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': CERL_agent.args.action_low,
            'action_high': CERL_agent.args.action_high,
            'cerl_args': self.args
        }
        self.renew_learner()  #currently a new learner is not created for each iteration
        self.rollout_bucket = [
            self.actual_red_worker for i in range(num_workers)
        ]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 3, self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], False,
                          self.data_bucket, self.rollout_bucket, 'dummy_name',
                          None, 'dis', self.trainers, False, self.pomdp_adv))
            for id in range(num_workers)
        ]

        for worker in self.workers:
            worker.start()
        self.evo_flag = [True for _ in range(self.num_workers)]

    #def initialize(self, actor_in):  #use the given actor parameter to initialize the red actor
    #    utils.hard_update(self.actual_red_actor, actor_in)

    def renew_learner(self):  #create a new learning agent with randomly initialized parameters
        self.learner = Learner(-1,
                               self.algo,
                               self.state_dim,
                               self.action_dim,
                               actor_lr=5e-5,
                               critic_lr=1e-3,
                               gamma=0.99,
                               tau=5e-3,
                               init_w=True,
                               **self.td3args)
        self.actual_red_actor = self.learner.algo.actor

    def collect_trajectory(self):
        utils.hard_update(self.actual_red_worker,
                          self.actual_red_actor)  #first sync the worker copy with the learner's actor

        #launch rollout_workers
        for id, actor in enumerate(self.rollout_bucket):
            if self.evo_flag[id]:
                self.evo_task_pipes[id][0].send(
                    (id, 0))  #second argument in send is dummy
                self.evo_flag[id] = False

        #wait for the rollout to complete and record fitness
        all_fitness = []
        for i in range(self.num_workers):
            entry = self.evo_result_pipes[i][1].recv()
            all_fitness.append(entry[1])
            self.evo_flag[i] = True

        self.buffer.referesh()  #update replay buffer

        return all_fitness

    def train_red(self, training_iterations):  #alternate between collect_trajectory and parameter updates
        while len(self.buffer) < self.batch_size * 10:  ###BURN-IN PERIOD
            self.collect_trajectory()

        for i in range(training_iterations):
            self.collect_trajectory()
            self.buffer.tensorify()  # Tensorify the buffer for fast sampling
            self.learner.update_parameters(self.buffer, self.buffer_gpu,
                                           self.batch_size, 2)  #2 update steps

    def evaluate(self):  #evaluate the blue agent's policy by training a red agent against it; afterwards clear the replay buffer (and optionally renew the learner)
        self.train_red(TRAIN_ITERATION)
        self.clear_buffer()
        #self.renew_learner()
        return self.evaluate_fixed_agents(
            self.trainers[0], self.trainers[1],
            [self.actual_red_actor])  #returns the averaged evaluation metrics

    def evaluate_fixed_agents(self, blue_dqn, red_model, red_actor_list,
                              num_iterations=25):  #evaluate the given agents' performance, using randomized neutral and red agents
        if self.algo == 'dis':  # build the env with the blue and red policy agents inside
            dis_env = make_self_play_env(
                seed=0,
                return_policy_agent=False,
                trainers=[blue_dqn, red_model]
            )[0]  # if trainers is not None: first is the shared DQN agent, second is the best red policy
            env = EnvironmentWrapper(
                '', self.algo, dis_env,
                0)  # the "0" is the index of the blue agent being trained
        elif self.algo == 'TD3_tennis':
            tennis_env = make_tennis_env.TennisEnvFactory(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                no_graphics=True,
                pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', self.algo, tennis_env, 0)
        else:
            raise Exception("evaluate_fixed_agents only supports the 'dis' and 'TD3_tennis' environments")
        average_reward = 0
        eps = 0
        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0
        belief_and_true_type_list = []
        assert len(red_actor_list) > 0, "make sure to input a non-empty list of candidate red actors"
        for it in range(num_iterations):
            belief_and_true_type = []
            if not self.pomdp_adv:  # if pomdp_adv, make sure that TD3_actor is never used
                red_actor = random.choice(red_actor_list)
                env.set_TD3_actor(red_actor)
            fitness = 0.0  # here fitness is simply the cumulative episode reward
            state = env.reset()
            belief_and_true_type.append(env.belief_and_true_type())
            env.randomize_neu_adv()

            if self.pomdp_adv:
                env.try_set_pomdp_adv()  # set the opponent to a POMDP adversary if it is an adversary; otherwise do nothing

            render_flag = (np.random.random() < 0.05)
            while True:  # unless done
                action = blue_dqn.act(state, eps=eps)
                next_state, reward, done, info = env.step(
                    copy.deepcopy(action), use_actual_reward=self.drqn)
                belief_and_true_type.append(env.belief_and_true_type())
                if render_flag and self.args.render:
                    env.render()

                state = next_state
                fitness += reward

                if done:
                    red_reward = env.get_red_reward()
                    blue_actual_reward = env.get_blue_actual_reward()
                    if red_reward is not None:
                        average_red_reward += red_reward
                        red_count += 1
                    if blue_actual_reward is not None:
                        average_actual_blue_reward += blue_actual_reward
                        blue_count += 1
                    if render_flag: env.env.close()
                    break
            belief_and_true_type_list.append(belief_and_true_type)
            average_reward += fitness
        average_reward /= num_iterations
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward, belief_and_true_type_list

    def clear_buffer(self):
        self.buffer.clear_buffer_data()  #reinitialize replay buffer

    def kill_processes(self):
        for id, actor in enumerate(self.rollout_bucket):
            self.evo_task_pipes[id][0].send(
                ('TERMINATE', 0))  #second argument in send is dummy

    def __del__(self):
        self.kill_processes()
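
Tying the pieces together, here is a hypothetical way to drive this Evaluator directly, mirroring sim_and_eval_POMDP from the earlier example; `cerl_agent`, `blue_dqn`, and `red_model` are placeholders for objects defined elsewhere in the repository:

# Sketch only: assumes a trained CERL agent and a [blue_dqn, red_model] trainer pair already exist.
evaluator = Evaluator(cerl_agent, num_workers=5, trainers=[blue_dqn, red_model], pomdp_adv=True)

# evaluate() trains a fresh red adversary against the fixed blue policy, clears the buffer,
# and returns the averaged scores plus the belief/true-type log.
blue_score, red_score, actual_blue_score, belief_log = evaluator.evaluate()
print("Blue:", blue_score, "Red:", red_score, "Actual blue:", actual_blue_score)

# Shut down the rollout worker processes explicitly when done.
evaluator.kill_processes()
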