Example 1
    def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w = True):

        self.algo_name = algo_name; self.gamma = gamma; self.tau = tau

        #Initialize actors
        self.actor = Actor(state_dim, action_dim, wwid)
        if init_w: self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)


        self.critic = Critic(state_dim, action_dim)
        if init_w: self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        self.actor_target.cuda(); self.critic_target.cuda(); self.actor.cuda(); self.critic.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.policy_loss = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.critic_loss = {'mean':[]}
        self.q = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.val = {'min':[], 'max': [], 'mean':[], 'std':[]}
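Example 1 relies on helper functions from the project's `utils` module that are not reproduced here (`hard_update`, `init_weights`, and the `soft_update` counterpart implied by the stored `tau`). The sketch below shows what such helpers conventionally do in DDPG-style code; the actual implementations and signatures inside `utils` are assumed, not copied from the repository.

import torch.nn as nn

def hard_update(target, source):
    # Copy every parameter of source into target (assumed behaviour of utils.hard_update).
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)

def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- (1 - tau) * theta_target + tau * theta_source.
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(t.data * (1.0 - tau) + s.data * tau)

def init_weights(m):
    # Used with module.apply(init_weights); touches Linear layers only.
    if isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)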
Example 2
	def make_champ_team(self, agents):
		for agent_id, agent in enumerate(agents):
			if self.args.popn_size <= 1: #Testing without Evo
				agent.update_rollout_actor()
				mod.hard_update(self.rollout_actor[agent_id], agent.rollout_actor[0])
			else:
				mod.hard_update(self.rollout_actor[agent_id], agent.popn[agent.champ_ind])
Example 3
	def __init__(self, id, num_inputs, action_dim, hidden_size, gamma, critic_lr, actor_lr, tau, alpha, target_update_interval, savetag, foldername, actualize, use_gpu):

		self.num_inputs = num_inputs
		self.action_space = action_dim
		self.gamma = gamma
		self.tau = tau
		self.alpha = alpha
		self.policy_type = "Gaussian"
		self.target_update_interval = target_update_interval
		self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'value_'+savetag, 'value_loss_'+savetag, 'policy_loss_'+savetag, 'mean_loss_'+savetag, 'std_loss_'+savetag], '.csv',save_iteration=1000, conv_size=1000)
		self.total_update = 0
		self.agent_id = id
		self.actualize = actualize

		self.critic = QNetwork(self.num_inputs, self.action_space, hidden_size)
		self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)
		self.soft_q_criterion = nn.MSELoss()

		if self.policy_type == "Gaussian":
			self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='GaussianPolicy')
			self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr)

			self.value = ValueNetwork(self.num_inputs, hidden_size)
			self.value_target = ValueNetwork(self.num_inputs, hidden_size)
			self.value_optim = Adam(self.value.parameters(), lr=critic_lr)
			utils.hard_update(self.value_target, self.value)
			self.value_criterion = nn.MSELoss()
		else:
			self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='DeterministicPolicy')
			self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr)

			self.critic_target = QNetwork(self.num_inputs, self.action_space, hidden_size)
			utils.hard_update(self.critic_target, self.critic)

		if use_gpu:
			self.policy.cuda(); self.critic.cuda()
			if self.policy_type == "Gaussian":
				self.value.cuda(); self.value_target.cuda()
			else:
				self.critic_target.cuda()

		#Statistics Tracker
		self.q = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.val = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.value_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.mean_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.std_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
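The SAC-style learner above stores `tau`, `target_update_interval`, and a `value_target` network, but its update method is not part of the snippet. As a hedged illustration only, the target refresh implied by those fields would look roughly like this; `soft_update` is any helper with the (target, source, tau) signature sketched after Example 1, and the attribute names follow the constructor above.

def refresh_value_target(agent, soft_update):
    # Illustrative only: Polyak-average the online value network into its target
    # every `target_update_interval` gradient updates, using the stored tau.
    agent.total_update += 1
    if agent.total_update % agent.target_update_interval == 0:
        soft_update(agent.value_target, agent.value, agent.tau)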
Example 4
    def collect_trajectory(self):
        utils.hard_update(self.actual_red_worker, self.actual_red_actor)  #first sync the actor

        #launch rollout_workers
        for id, actor in enumerate(self.rollout_bucket):
            if self.evo_flag[id]:
                self.evo_task_pipes[id][0].send((id, 0))  #second argument in send is a dummy
                self.evo_flag[id] = False

        #wait for the rollout to complete and record fitness
        all_fitness = []
        for i in range(self.num_workers):
            entry = self.evo_result_pipes[i][1].recv()
            all_fitness.append(entry[1])
            self.evo_flag[i] = True

        self.buffer.referesh()  #update replay buffer

        return all_fitness
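collect_trajectory assumes each rollout worker is driven through a pair of multiprocessing pipes (`evo_task_pipes` to hand out work, `evo_result_pipes` to collect fitness). The toy script below, with a hypothetical `toy_worker`, isolates that send/recv pattern; it is not the project's worker code.

import multiprocessing as mp

def toy_worker(task_conn, result_conn):
    # Block until a task id arrives, fake a rollout, and report (id, fitness).
    while True:
        task_id, _ = task_conn.recv()          # second element is a dummy, as in the snippet
        fitness = float(task_id) * 0.1         # placeholder for an episode return
        result_conn.send((task_id, fitness))

if __name__ == '__main__':
    task_pipes = [mp.Pipe() for _ in range(2)]
    result_pipes = [mp.Pipe() for _ in range(2)]
    workers = [mp.Process(target=toy_worker,
                          args=(task_pipes[i][1], result_pipes[i][0]),
                          daemon=True) for i in range(2)]
    for w in workers:
        w.start()
    for i in range(2):
        task_pipes[i][0].send((i, 0))
    all_fitness = [result_pipes[i][1].recv()[1] for i in range(2)]
    print(all_fitness)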
Example 5
	def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, num_agents, init_w = True):

		self.algo_name = algo_name; self.gamma = gamma; self.tau = tau; self.total_update = 0; self.agent_id = id; self.use_gpu = use_gpu
		self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag], '.csv', save_iteration=1000, conv_size=1000)
		self.num_agents = num_agents

		#Initialize actors
		self.policy = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		if init_w: self.policy.apply(utils.init_weights)
		self.policy_target = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		utils.hard_update(self.policy_target, self.policy)
		self.policy_optim = Adam(self.policy.parameters(), actor_lr)


		self.critics = [QNetwork(state_dim*num_agents, action_dim*num_agents, hidden_size*3) for _ in range(num_agents)]

		self.critics_target = [QNetwork(state_dim*num_agents, action_dim*num_agents, hidden_size*3) for _ in range(num_agents)]
		if init_w:
			for critic, critic_target in zip(self.critics, self.critics_target):
				critic.apply(utils.init_weights)
				utils.hard_update(critic_target, critic)
		self.critic_optims = [Adam(critic.parameters(), critic_lr) for critic in self.critics]


		self.loss = nn.MSELoss()

		if use_gpu:
			self.policy_target.cuda(); self.policy.cuda()
			for critic, critic_target in zip(self.critics, self.critics_target):
				critic.cuda()
				critic_target.cuda()


		self.num_critic_updates = 0

		#Statistics Tracker
		#self.action_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q = {'min':None, 'max': None, 'mean':None, 'std':None}
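MultiHeadActor itself is not shown. A plausible minimal sketch consistent with the constructor arguments (state_dim, action_dim, hidden_size, num_agents) is a shared trunk with one action head per agent, as below; the real class in the repository may differ.

import torch
import torch.nn as nn

class MultiHeadActorSketch(nn.Module):
    # Illustrative only: shared feature trunk, one tanh-squashed action head per agent.
    def __init__(self, state_dim, action_dim, hidden_size, num_agents):
        super().__init__()
        self.trunk = nn.Sequential(nn.Linear(state_dim, hidden_size), nn.ReLU(),
                                   nn.Linear(hidden_size, hidden_size), nn.ReLU())
        self.heads = nn.ModuleList(
            [nn.Linear(hidden_size, action_dim) for _ in range(num_agents)])

    def forward(self, state, agent_id):
        return torch.tanh(self.heads[agent_id](self.trunk(state)))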
Example 6
	def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, num_agents, init_w = True):

		self.algo_name = algo_name; self.gamma = gamma; self.tau = tau; self.total_update = 0; self.agent_id = id; self.actualize = actualize; self.use_gpu = use_gpu
		self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag, 'alz_score'+savetag,'alz_policy'+savetag], '.csv', save_iteration=1000, conv_size=1000)

		#Initialize actors
		self.policy = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		if init_w: self.policy.apply(utils.init_weights)
		self.policy_target = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
		utils.hard_update(self.policy_target, self.policy)
		self.policy_optim = Adam(self.policy.parameters(), actor_lr)


		self.critic = QNetwork(state_dim, action_dim,hidden_size)
		if init_w: self.critic.apply(utils.init_weights)
		self.critic_target = QNetwork(state_dim, action_dim, hidden_size)
		utils.hard_update(self.critic_target, self.critic)
		self.critic_optim = Adam(self.critic.parameters(), critic_lr)

		if actualize:
			self.ANetwork = ActualizationNetwork(state_dim, action_dim, hidden_size)
			if init_w: self.ANetwork.apply(utils.init_weights)
			self.actualize_optim = Adam(self.ANetwork.parameters(), critic_lr)
			self.actualize_lr = 0.2
			if use_gpu: self.ANetwork.cuda()

		self.loss = nn.MSELoss()

		if use_gpu:
			self.policy_target.cuda(); self.critic_target.cuda(); self.policy.cuda(); self.critic.cuda()
		self.num_critic_updates = 0

		#Statistics Tracker
		#self.action_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.q = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.alz_score = {'min':None, 'max': None, 'mean':None, 'std':None}
		self.alz_policy = {'min':None, 'max': None, 'mean':None, 'std':None}
Example 7
    def train(self, gen, frame_tracker):
        """Main training loop to do rollouts, neureoevolution, and policy gradients

			Parameters:
				gen (int): Current epoch of training

			Returns:
				max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
		"""
        ################ START ROLLOUTS ##############

        #Start Evolution rollouts
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send(id)
                    self.evo_flag[id] = False

        #Sync each learner's actor to its CPU (rollout) actor
        for i, learner in enumerate(self.portfolio):
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
            learner.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id, learner_id in enumerate(self.allocation):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(learner_id)
                self.roll_flag[rollout_id] = False

        #Start Test rollouts
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if len(self.replay_buffer) > self.args.batch_size * 10:  ###BURN IN PERIOD
            self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling

            #Spin up threads for each learner
            threads = [
                threading.Thread(
                    target=learner.update_parameters,
                    args=(self.replay_buffer, self.args.buffer_gpu,
                          self.args.batch_size,
                          int(self.gen_frames * self.args.gradperstep)))
                for learner in self.portfolio
            ]

            # Start threads
            for thread in threads:
                thread.start()

            #Join threads
            for thread in threads:
                thread.join()
            self.gen_frames = 0

        ########## SOFT -JOIN ROLLOUTS FOR EVO POPULATION ############
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%)
                if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD -JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        for i in range(self.args.rollout_size):
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score: self.best_score = fitness

            self.roll_flag[i] = True

        #Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()
        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(
                        self.pop[champ_index].state_dict(),
                        self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score",
                          '%.2f' % max(all_fitness))

        else:  #Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  #Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = np.std(test_scores)

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        #NeuroEvolution's probabilistic selection and recombination step
        if not ISOLATE_PG:
            if gen % 5 == 0:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids,
                                   all_fitness, self.rollout_bucket)
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids,
                                   all_fitness, [])

        #META LEARNING - RESET ALLOCATION USING UCB
        if gen % 1 == 0:
            self.allocation = ucb(len(self.allocation), self.portfolio,
                                  self.args.ucb_coefficient)

        #Metrics
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
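The allocation reset above calls a `ucb(...)` helper that is not included in the snippet. Below is a minimal UCB1-style sketch of such an allocator; the `value` and `visits` attributes are placeholders for whatever running statistics the real Learner objects track (update_stats above suggests they keep fitness and frame counts), so treat this as an assumption rather than the repository's implementation.

import math

def ucb_sketch(num_rollouts, portfolio, coefficient):
    # Allocate each rollout slot to the learner with the highest UCB1 score.
    visits = [max(getattr(l, 'visits', 1), 1) for l in portfolio]
    values = [getattr(l, 'value', 0.0) for l in portfolio]
    allocation = []
    for _ in range(num_rollouts):
        total = sum(visits)
        scores = [v + coefficient * math.sqrt(math.log(total) / n)
                  for v, n in zip(values, visits)]
        pick = scores.index(max(scores))
        allocation.append(pick)
        visits[pick] += 1   # count the slot so later picks spread across learners
    return allocation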
Example 8
    def train(self, gen, frame_tracker):
        """Main training loop to do rollouts, neureoevolution, and policy gradients

			Parameters:
				gen (int): Current epoch of training

			Returns:
				max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
		"""
        ################ START ROLLOUTS ##############

        # Start Evolution rollouts
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send((id, gen))
                    self.evo_flag[id] = False

        # Sync each learner's actor to its CPU (rollout) actor
        # (update the rollout copy with the learner's parameters so the rollout worker is up to date)
        for i, learner in enumerate(self.portfolio):  # one per learner
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)  # rollout bucket is now synchronized with the learner and can perform rollouts for it
            if torch.cuda.is_available(): learner.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id, learner_id in enumerate(self.allocation):  # one per rollout slot
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send((learner_id, gen))  # allocation records which learner each rollout bucket runs, so rollout_id indexes rollout_bucket
                self.roll_flag[rollout_id] = False

        # Start Test rollouts
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send((0, gen))

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        # main training loop
        if len(self.replay_buffer) > self.args.batch_size * 10:  ###BURN IN PERIOD
            self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling

            # Spin up threads for each learner
            threads = [
                threading.Thread(
                    target=learner.update_parameters,
                    args=(self.replay_buffer, self.args.buffer_gpu,
                          self.args.batch_size,
                          int(self.gen_frames * self.args.gradperstep)))
                for learner in self.portfolio
            ]  #macheng: do we want to train all the learners?

            # Start threads
            for thread in threads:
                thread.start()

            # Join threads
            for thread in threads:
                thread.join()

            # Now update average_policy
            #self.average_policy.cuda()
            if ALGO == 'dis':
                self.average_policy.update()  # update the average_policy parameters with supervised learning

            self.gen_frames = 0

            #########Visualize Learner Critic Function#################
            # if self.replay_buffer.__len__() % 2500 == 0:
            #	visualize_critic(self.portfolio[2], make_self_play_env(trainers=[[],[]])[0], 50)  #arguments: Learner, env, N_GRID

        ########## SOFT -JOIN ROLLOUTS FOR EVO POPULATION ############
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%)
                if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD -JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        for i in range(self.args.rollout_size):
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score: self.best_score = fitness

            self.roll_flag[i] = True

        # Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()
        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        # ms: the best policy is always kept up to date, so the best learner is saved here
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(
                        self.pop[champ_index].state_dict(),
                        self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score",
                          '%.2f' % max(all_fitness))

        else:  #Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  #Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = np.std(test_scores)

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        # NeuroEvolution's probabilistic selection and recombination step
        # ms: this epoch() method implements neuro-evolution
        if not ISOLATE_PG:  # it seems pop_size and rollout_size must be 10, otherwise this will produce an error
            if gen % 5 == 0:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness,
                                   self.rollout_bucket)  # this method also copies the learners into the evolver
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids,
                                   all_fitness, [])

        # META LEARNING - RESET ALLOCATION USING UCB
        if gen % 1 == 0:
            self.update_allocation()
        # Metrics
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
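Both train() variants use the same "soft-join" idiom: poll every evolution result pipe and keep collecting until at least asynch_frac of the population has reported. Stripped of the surrounding bookkeeping, and with a short sleep added to avoid the busy-wait in the original, the pattern looks like this:

import time

def soft_join_sketch(result_pipes, pop_size, asynch_frac):
    # Collect (net_id, fitness, num_frames) tuples until enough of the population reports.
    results = []
    while len(results) / pop_size < asynch_frac:
        for conn in result_pipes:
            if conn.poll():                  # non-blocking check
                results.append(conn.recv())
        time.sleep(0.001)                    # yield instead of spinning
    return results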
Example 9
    def evolve(self, pop, net_inds, fitness_evals, migration, states):
        """Method to implement a round of selection and mutation operation
			Parameters:
				  pop (shared_list): Population of models
				  net_inds (list): Indices of individuals evaluated this generation
				  fitness_evals (list of lists): Fitness values for evaluated individuals
				  migration (object): Policies from learners to be synced into population
			Returns:
				anchors[0] (multipoint scheme) or new_elitists[0] (standard scheme)
		"""

        self.gen += 1

        # Convert the list of fitness values corresponding to each individual into a float [CCEA Reduction]
        if isinstance(fitness_evals[0], list):
            for i in range(len(fitness_evals)):
                if self.ccea_reduction == "mean":
                    fitness_evals[i] = sum(fitness_evals[i]) / len(
                        fitness_evals[i])
                elif self.ccea_reduction == "leniency":
                    fitness_evals[i] = max(fitness_evals[i])
                elif self.ccea_reduction == "min":
                    fitness_evals[i] = min(fitness_evals[i])
                else:
                    sys.exit('Incorrect CCEA Reduction scheme')

        # Append new fitness to lineage
        lineage_scores = []  # Tracks the average lineage score for the generation
        for ind, fitness in zip(net_inds, fitness_evals):
            self.lineage[ind].append(fitness)
            lineage_scores.append(
                0.75 * sum(self.lineage[ind]) / len(self.lineage[ind]) +
                0.25 * fitness
            )  # Current fitness is weighted higher than lineage info
            if len(self.lineage[ind]) > self.lineage_depth:
                self.lineage[ind].pop(0)  # Housekeeping

        # Entire epoch is handled with indices; Index rank nets by fitness evaluation (0 is the best after reversing)
        index_rank = self.list_argsort(fitness_evals)
        index_rank.reverse()
        elitist_index = index_rank[:self.num_elites]  # Elitist indexes safeguard

        # Lineage rankings to elitists
        lineage_rank = self.list_argsort(lineage_scores[:])
        lineage_rank.reverse()
        elitist_index = elitist_index + lineage_rank[:int(self.num_elites)]

        # Take out copies in elitist indices
        elitist_index = list(set(elitist_index))

        #################### MULTI_POINT SEARCH WITH ANCHORS/PROBES/BLENDS AND EXPLICIT DIVERSITY-BASED SEPARATION
        if self.scheme == 'multipoint':

            # Compute anchors
            anchor_inds = self.get_anchors(states, pop, net_inds[:],
                                           np.array(lineage_rank[:]))

            # Remove duplicates between anchors and elitists
            elitist_index = [ind for ind in elitist_index if ind not in anchor_inds]

            ##################### TRANSFER INDICES BACK TO POP INDICES: Change from ind in net_inds to ind referring to the real ind in pop ###############################
            elites = [net_inds[i] for i in elitist_index]
            anchors = [net_inds[i] for i in anchor_inds]
            anchor_fitnesses = [fitness_evals[i] for i in anchor_inds]
            anchor_index_ranks = [index_rank.index(i) for i in anchor_inds]
            #######################################################################################################################################################

            # Unselects are the individuals left in the population
            unselects = [
                ind for ind in net_inds
                if ind not in elites and ind not in anchors
            ]

            # Inheritance step (sync learners to population)
            for policy in migration:
                replacee = unselects.pop(0)
                utils.hard_update(target=pop[replacee], source=policy)
                # wwid = genealogy.asexual(int(policy.wwid.item()))
                # pop[replacee].wwid[0] = wwid
                self.lineage[replacee] = []  # Reinitialize as empty

            # Sample anchors from a probability distribution formed of their relative fitnesses using a roulette wheel
            probe_allocation_inds = self.roulette_wheel(anchor_fitnesses, len(unselects) - self.num_blends)
            sampled_anchors = [anchors[i] for i in probe_allocation_inds]

            # Mutate the anchors to form probes
            for anchor_ind in sampled_anchors:
                # Mutate to form probes from anchors
                replacee = unselects.pop(0)
                utils.hard_update(target=pop[replacee], source=pop[anchor_ind])
                self.lineage[replacee] = [utils.list_mean(self.lineage[anchor_ind])]  # Inherit lineage from the anchor
                self.mutate_inplace(pop[replacee])
            # genealogy.mutation(int(pop[replacee].wwid.item()), gen)

            if random.random() < 0.1:
                print('Evo_Info #Anchors', len(anchors), '#Probes_allocation',
                      [sampled_anchors.count(i) for i in anchors], '#elites',
                      len(elites), '#Blends', len(unselects), '#Migration',
                      len(migration), 'Nets', len(net_inds),
                      'Anchor fitness Ranks', anchor_index_ranks)

            ###### Create the blends to fill the rest of the unselects by crossovers #########
            # Number of unselects left should be even
            if len(unselects) % 2 != 0:
                unselects.append(unselects[random.randint(0, len(unselects) - 1)])

            for i, j in zip(unselects[0::2], unselects[1::2]):
                off_i = random.choice(anchors)
                while True:
                    off_j = random.choice(anchors)
                    if off_j != off_i: break

                utils.hard_update(target=pop[i], source=pop[off_i])
                utils.hard_update(target=pop[j], source=pop[off_j])
                self.crossover_inplace(pop[i], pop[j])
                # wwid1 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
                # wwid2 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
                # pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2
                self.lineage[i] = [
                    0.5 * utils.list_mean(self.lineage[off_i]) +
                    0.5 * utils.list_mean(self.lineage[off_j])
                ]
                self.lineage[j] = [
                    0.5 * utils.list_mean(self.lineage[off_i]) +
                    0.5 * utils.list_mean(self.lineage[off_j])
                ]

            return anchors[0]

        ####################### OLD EVOLVER WITHOUT MULTI_POINT SEARCH ###########
        elif self.scheme == 'standard':

            # Selection step
            offsprings = self.selection_tournament(
                index_rank,
                num_offsprings=len(index_rank) - len(elitist_index) -
                len(migration),
                tournament_size=3)

            # Transcribe ranked indexes from now on to refer to net indexes
            elitist_index = [net_inds[i] for i in elitist_index]
            offsprings = [net_inds[i] for i in offsprings]

            # Figure out unselected candidates
            unselects = []
            new_elitists = []
            for i in range(len(pop)):
                if i in offsprings or i in elitist_index:
                    continue
                else:
                    unselects.append(i)
            random.shuffle(unselects)

            # Check for migration's performance
            for ind in self.migrating_inds:
                if ind in offsprings or ind in elitist_index:
                    self.rl_res['selects'] += 1
                else:
                    self.rl_res['discarded'] += 1
            self.migrating_inds = []

            # Inheritance step (sync learners to population)
            for policy in migration:
                replacee = unselects.pop(0)
                utils.hard_update(target=pop[replacee], source=policy)
                self.migrating_inds.append(replacee)
                self.lineage[replacee] = [
                    sum(lineage_scores) / len(lineage_scores)
                ]  # Initialize as average

            # Elitism step, assigning elite candidates to some unselects
            for i in elitist_index:
                if len(unselects) >= 1:
                    replacee = unselects.pop(0)
                elif len(offsprings) >= 1:
                    replacee = offsprings.pop(0)
                else:
                    continue
                new_elitists.append(replacee)
                utils.hard_update(target=pop[replacee], source=pop[i])
                # wwid = genealogy.asexual(int(pop[i].wwid.item()))
                # pop[replacee].wwid[0] = wwid
                # genealogy.elite(wwid, gen)

                self.lineage[replacee] = self.lineage[i][:]

            # Crossover for unselected genes with 100 percent probability
            if len(unselects) % 2 != 0:  # Number of unselects left should be even
                unselects.append(unselects[random.randint(0, len(unselects) - 1)])
            for i, j in zip(unselects[0::2], unselects[1::2]):
                off_i = random.choice(new_elitists)
                off_j = random.choice(offsprings)
                utils.hard_update(target=pop[i], source=pop[off_i])
                utils.hard_update(target=pop[j], source=pop[off_j])
                self.crossover_inplace(pop[i], pop[j])
                # wwid1 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
                # wwid2 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
                # pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2

                self.lineage[i] = [
                    0.5 * utils.list_mean(self.lineage[off_i]) +
                    0.5 * utils.list_mean(self.lineage[off_j])
                ]
                self.lineage[j] = [
                    0.5 * utils.list_mean(self.lineage[off_i]) +
                    0.5 * utils.list_mean(self.lineage[off_j])
                ]

            # Crossover for selected offsprings
            for i, j in zip(offsprings[0::2], offsprings[1::2]):
                if random.random() < self.crossover_prob:
                    self.crossover_inplace(pop[i], pop[j])
                    # wwid1 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
                    # wwid2 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
                    # pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2
                    self.lineage[i] = [
                        0.5 * utils.list_mean(self.lineage[i]) +
                        0.5 * utils.list_mean(self.lineage[j])
                    ]
                    self.lineage[j] = [
                        0.5 * utils.list_mean(self.lineage[i]) +
                        0.5 * utils.list_mean(self.lineage[j])
                    ]

            # Mutate all genes in the population except the new elitists
            for i in range(len(pop)):
                if i not in new_elitists:  # Spare the new elitists
                    if random.random() < self.mutation_prob:
                        self.mutate_inplace(pop[i])
            # genealogy.mutation(int(pop[net_i].wwid.item()), gen)

            self.all_offs[:] = offsprings[:]
            return new_elitists[0]

        else:
            sys.exit('Incorrect Evolution Scheme')
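The multipoint branch samples anchors through self.roulette_wheel(anchor_fitnesses, n), which is not shown. A plain fitness-proportional sampler consistent with that call signature could look like the following; it is an assumption, not the repository's implementation.

import random

def roulette_wheel_sketch(fitnesses, num_samples):
    # Sample indices with probability proportional to (shifted) fitness.
    low = min(fitnesses)
    weights = [f - low + 1e-6 for f in fitnesses]   # shift so weights stay positive
    total = sum(weights)
    probs = [w / total for w in weights]
    picks = []
    for _ in range(num_samples):
        r, acc = random.random(), 0.0
        for idx, p in enumerate(probs):
            acc += p
            if r <= acc:
                picks.append(idx)
                break
        else:
            picks.append(len(probs) - 1)            # guard against round-off
    return picks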
Example 10
	def epoch(self, gen, genealogy, pop, net_inds, fitness_evals, migration):
		"""Method to implement a round of selection and mutation operation

			Parameters:
				  pop (shared_list): Population of models
				  net_inds (list): Indices of individuals evaluated this generation
				  fitness_evals (list): Fitness values for evaluated individuals
				  migration (object): Policies from learners to be synced into population

			Returns:
				None

		"""

		self.gen+= 1; num_elitists = int(self.args.elite_fraction * len(fitness_evals))
		if num_elitists < 2: num_elitists = 2


		# Entire epoch is handled with indices; Index rank nets by fitness evaluation (0 is the best after reversing)
		index_rank = self.list_argsort(fitness_evals); index_rank.reverse()
		elitist_index = index_rank[:num_elitists]  # Elitist indexes safeguard

		# Selection step
		offsprings = self.selection_tournament(index_rank, num_offsprings=len(index_rank) - len(elitist_index) - len(migration), tournament_size=3)

		#Transcribe ranked indexes from now on to refer to net indexes
		elitist_index = [net_inds[i] for i in elitist_index]
		offsprings = [net_inds[i] for i in offsprings]

		#Figure out unselected candidates
		unselects = []; new_elitists = []
		for net_i in net_inds:
			if net_i in offsprings or net_i in elitist_index:
				continue
			else:
				unselects.append(net_i)
		random.shuffle(unselects)

		#Inheritance step (sync learners to population)
		for policy in migration:
			replacee = unselects.pop(0)
			utils.hard_update(target=pop[replacee], source=policy)
			wwid = genealogy.asexual(int(policy.wwid.item()))
			pop[replacee].wwid[0] = wwid

		# Elitism step, assigning elite candidates to some unselects
		for i in elitist_index:
			try: replacee = unselects.pop(0)
			except IndexError: replacee = offsprings.pop(0)
			new_elitists.append(replacee)
			utils.hard_update(target=pop[replacee], source=pop[i])
			wwid = genealogy.asexual(int(pop[i].wwid.item()))
			pop[replacee].wwid[0] = wwid
			genealogy.elite(wwid, gen)

			#self.lineage[replacee] = self.lineage[i]

		# Crossover for unselected genes with 100 percent probability
		if len(unselects) % 2 != 0:  # Number of unselects left should be even
			unselects.append(unselects[random.randint(0, len(unselects)-1)])
		for i, j in zip(unselects[0::2], unselects[1::2]):
			off_i = random.choice(new_elitists);
			off_j = random.choice(offsprings)
			utils.hard_update(target=pop[i], source=pop[off_i])
			utils.hard_update(target=pop[j], source=pop[off_j])
			self.crossover_inplace(pop[i], pop[j])
			wwid1 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
			wwid2 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
			pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2

			#self.lineage[i] = (self.lineage[off_i]+self.lineage[off_j])/2
			#self.lineage[j] = (self.lineage[off_i] + self.lineage[off_j]) / 2

		# Crossover for selected offsprings
		for i, j in zip(offsprings[0::2], offsprings[1::2]):
			if random.random() < self.args.crossover_prob:
				self.crossover_inplace(pop[i], pop[j])
				wwid1 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
				wwid2 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
				pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2


		# Mutate all genes in the population except the new elitists
		for net_i in net_inds:
			if net_i not in new_elitists:  # Spare the new elitists
				if random.random() < self.args.mutation_prob:
					self.mutate_inplace(pop[net_i])
					genealogy.mutation(int(pop[net_i].wwid.item()), gen)


		self.all_offs[:] = offsprings[:]
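selection_tournament is called here and in the other evolvers with the fitness-ranked index list, the number of offspring wanted, and a tournament size. A standard k-way tournament sketch matching that usage is shown below; it is a guess at the helper's behaviour, not its actual code.

import random

def selection_tournament_sketch(index_rank, num_offsprings, tournament_size):
    # index_rank[0] is the fittest individual, as in the snippets above.
    offspring = []
    for _ in range(num_offsprings):
        contestants = random.sample(range(len(index_rank)), tournament_size)
        winner_pos = min(contestants)        # smaller position == better rank
        offspring.append(index_rank[winner_pos])
    return offspring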
Example 11
	def evolve(self, pop, net_inds, fitness_evals, migration):
		"""Method to implement a round of selection and mutation operation

			Parameters:
				  pop (shared_list): Population of models
				  net_inds (list): Indices of individuals evaluated this generation
				  fitness_evals (list of lists): Fitness values for evaluated individuals
				  migration (object): Policies from learners to be synced into population

			Returns:
				new_elitists[0], the index of the first new elitist

		"""

		self.gen+= 1


		#Convert the list of fitness values corresponding to each individual into a float [CCEA Reduction]
		if isinstance(fitness_evals[0], list):
			for i in range(len(fitness_evals)):
				if self.ccea_reduction == "mean": fitness_evals[i] = sum(fitness_evals[i])/len(fitness_evals[i])
				elif self.ccea_reduction == "leniency":fitness_evals[i] = max(fitness_evals[i])
				elif self.ccea_reduction == "min": fitness_evals[i] = min(fitness_evals[i])
				else: sys.exit('Incorrect CCEA Reduction scheme')




		# Entire epoch is handled with indices; Index rank nets by fitness evaluation (0 is the best after reversing)
		index_rank = self.list_argsort(fitness_evals); index_rank.reverse()
		elitist_index = index_rank[:self.num_elites]  # Elitist indexes safeguard


		# Selection step
		offsprings = self.selection_tournament(index_rank,
		                                       num_offsprings=len(index_rank) - len(elitist_index) - len(migration) - 1,
		                                       tournament_size=3)

		# Transcribe ranked indexes from now on to refer to net indexes
		elitist_index = [net_inds[i] for i in elitist_index]
		offsprings = [net_inds[i] for i in offsprings]

		# Figure out unselected candidates
		unselects = []; new_elitists = []
		for i in range(len(pop)):
			if i in offsprings or i in elitist_index:
				continue
			else:
				unselects.append(i)
		random.shuffle(unselects)

		# Inheritance step (sync learners to population)
		for policy in migration:
			replacee = unselects.pop(0)
			utils.hard_update(target=pop[replacee], source=policy)


		# Elitism step, assigning elite candidates to some unselects
		for i in elitist_index:
			if len(unselects) >= 1: replacee = unselects.pop(0)
			elif len(offsprings) >= 1: replacee = offsprings.pop(0)
			else: continue
			new_elitists.append(replacee)
			utils.hard_update(target=pop[replacee], source=pop[i])


		# Crossover for unselected genes with 100 percent probability
		if len(unselects) % 2 != 0:  # Number of unselects left should be even
			unselects.append(unselects[random.randint(0, len(unselects) - 1)])
		for i, j in zip(unselects[0::2], unselects[1::2]):
			off_i = random.choice(new_elitists);
			off_j = random.choice(offsprings)
			utils.hard_update(target=pop[i], source=pop[off_i])
			utils.hard_update(target=pop[j], source=pop[off_j])
			self.crossover_inplace(pop[i], pop[j])


		# Crossover for selected offsprings
		for i, j in zip(offsprings[0::2], offsprings[1::2]):
			if random.random() < self.crossover_prob:
				self.crossover_inplace(pop[i], pop[j])


		# Mutate all genes in the population except the new elitists
		for i in range(len(pop)):
			if i not in new_elitists:  # Spare the new elitists
				if random.random() < self.mutation_prob:
					self.mutate_inplace(pop[i])
			# genealogy.mutation(int(pop[net_i].wwid.item()), gen)

		self.all_offs[:] = offsprings[:]
		return new_elitists[0]
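mutate_inplace and crossover_inplace are used by every evolver but never shown. Below is a minimal parameter-space sketch of Gaussian mutation and tensor-swap crossover for PyTorch modules; the real operators in the repository may differ, and the mutation fraction/strength here are made-up defaults.

import random
import torch

def mutate_inplace_sketch(net, mut_strength=0.1, mut_frac=0.1):
    # Add Gaussian noise to a random subset of each weight tensor, in place.
    with torch.no_grad():
        for param in net.parameters():
            mask = (torch.rand_like(param) < mut_frac).float()
            param.add_(mask * torch.randn_like(param) * mut_strength)

def crossover_inplace_sketch(net1, net2):
    # Swap roughly half of the weight tensors between two networks, in place.
    with torch.no_grad():
        for p1, p2 in zip(net1.parameters(), net2.parameters()):
            if random.random() < 0.5:
                tmp = p1.data.clone()
                p1.data.copy_(p2.data)
                p2.data.copy_(tmp)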
Example 12
    def update_rollout_actor(self):
        for actor in self.rollout_actor:
            self.algo.policy.cpu()
            mod.hard_update(actor, self.algo.policy)
            if self.args.use_gpu: self.algo.policy.cuda()
Example 13
    def evolve(self, pop, net_inds, fitness_evals, migration):
        """Method to implement a round of selection and mutation operation

			Parameters:
				  pop (shared_list): Population of models
				  net_inds (list): Indices of individuals evaluated this generation
				  fitness_evals (list of lists): Fitness values for evaluated individuals
				  migration (object): Policies from learners to be synced into population

			Returns:
				new_elitists[0], the index of the first new elitist

		"""

        self.gen += 1

        #Convert the list of fitness values corresponding to each individual into a float [CCEA Reduction]
        if isinstance(fitness_evals[0], list):
            for i in range(len(fitness_evals)):
                if self.ccea_reduction == "mean":
                    fitness_evals[i] = sum(fitness_evals[i]) / len(
                        fitness_evals[i])
                elif self.ccea_reduction == "leniency":
                    fitness_evals[i] = max(fitness_evals[i])
                elif self.ccea_reduction == "min":
                    fitness_evals[i] = min(fitness_evals[i])
                else:
                    sys.exit('Incorrect CCEA Reduction scheme')

        #Append new fitness to lineage
        lineage_scores = []  #Tracks the average lineage score for the generation
        for ind, fitness in zip(net_inds, fitness_evals):
            self.lineage[ind].append(fitness)
            lineage_scores.append(
                0.75 * sum(self.lineage[ind]) / len(self.lineage[ind]) + 0.25 *
                fitness)  #Current fitness is weighted higher than lineage info
            if len(self.lineage[ind]) > self.lineage_depth:
                self.lineage[ind].pop(0)  #Housekeeping

        # Entire epoch is handled with indices; Index rank nets by fitness evaluation (0 is the best after reversing)
        index_rank = self.list_argsort(fitness_evals)
        index_rank.reverse()
        elitist_index = index_rank[:self.num_elites]  # Elitist indexes safeguard

        #Lineage rankings to elitists
        lineage_rank = self.list_argsort(lineage_scores[:])
        lineage_rank.reverse()
        elitist_index = elitist_index + lineage_rank[:int(self.num_elites)]

        #Take out copies in elitist indices
        elitist_index = list(set(elitist_index))

        # Selection step
        offsprings = self.selection_tournament(index_rank,
                                               num_offsprings=len(index_rank) -
                                               len(elitist_index) -
                                               len(migration),
                                               tournament_size=3)

        # Transcribe ranked indexes from now on to refer to net indexes
        elitist_index = [net_inds[i] for i in elitist_index]
        offsprings = [net_inds[i] for i in offsprings]

        # Figure out unselected candidates
        unselects = []
        new_elitists = []
        for i in range(len(pop)):
            if i in offsprings or i in elitist_index:
                continue
            else:
                unselects.append(i)
        random.shuffle(unselects)

        # Inheritance step (sync learners to population)
        for policy in migration:
            replacee = unselects.pop(0)
            utils.hard_update(target=pop[replacee], source=policy)
            # wwid = genealogy.asexual(int(policy.wwid.item()))
            # pop[replacee].wwid[0] = wwid
            self.lineage[replacee] = [
                sum(lineage_scores) / len(lineage_scores)
            ]  # Initialize as average

        # Elitism step, assigning elite candidates to some unselects
        for i in elitist_index:
            if len(unselects) >= 1: replacee = unselects.pop(0)
            elif len(offsprings) >= 1: replacee = offsprings.pop(0)
            else: continue
            new_elitists.append(replacee)
            utils.hard_update(target=pop[replacee], source=pop[i])
            # wwid = genealogy.asexual(int(pop[i].wwid.item()))
            # pop[replacee].wwid[0] = wwid
            # genealogy.elite(wwid, gen)

            self.lineage[replacee] = self.lineage[i][:]

        # Crossover for unselected genes with 100 percent probability
        if len(unselects) % 2 != 0:  # Number of unselects left should be even
            unselects.append(unselects[random.randint(0, len(unselects) - 1)])
        for i, j in zip(unselects[0::2], unselects[1::2]):
            off_i = random.choice(new_elitists)
            off_j = random.choice(offsprings)
            utils.hard_update(target=pop[i], source=pop[off_i])
            utils.hard_update(target=pop[j], source=pop[off_j])
            self.crossover_inplace(pop[i], pop[j])
            # wwid1 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
            # wwid2 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
            # pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2

            self.lineage[i] = [
                0.5 * utils.list_mean(self.lineage[off_i]) +
                0.5 * utils.list_mean(self.lineage[off_j])
            ]
            self.lineage[j] = [
                0.5 * utils.list_mean(self.lineage[off_i]) +
                0.5 * utils.list_mean(self.lineage[off_j])
            ]

        # Crossover for selected offsprings
        for i, j in zip(offsprings[0::2], offsprings[1::2]):
            if random.random() < self.crossover_prob:
                self.crossover_inplace(pop[i], pop[j])
                # wwid1 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
                # wwid2 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
                # pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2
                self.lineage[i] = [
                    0.5 * utils.list_mean(self.lineage[i]) +
                    0.5 * utils.list_mean(self.lineage[j])
                ]
                self.lineage[j] = [
                    0.5 * utils.list_mean(self.lineage[i]) +
                    0.5 * utils.list_mean(self.lineage[j])
                ]

        # Mutate all genes in the population except the new elitists
        for i in range(len(pop)):
            if i not in new_elitists:  # Spare the new elitists
                if random.random() < self.mutation_prob:
                    self.mutate_inplace(pop[i])
            # genealogy.mutation(int(pop[net_i].wwid.item()), gen)

        self.all_offs[:] = offsprings[:]
        return new_elitists[0]
Example 14
    def train(self, gen):
        """Main training loop to do rollouts, neureoevolution, and policy gradients

            Parameters:
                gen (int): Current epoch of training

            Returns:
                max fitness, champion episode length, all_fitness, all_eplens, all_shaped_fitness
        """

        ################ ROLLOUTS ##############
        #Start Evo rollouts
        for id, actor in enumerate(self.pop):
            if self.eval_flag[id]:
                self.evo_task_pipes[id][0].send(True)
                self.eval_flag[id] = False

        ########## SOFT -JOIN ROLLOUTS ############
        all_fitness = []
        all_net_ids = []
        all_eplens = []
        all_shaped_fitness = []
        while True:
            for i in range(self.args.pop_size):
                if self.evo_result_pipes[i][0].poll():
                    entry = self.evo_result_pipes[i][0].recv()
                    all_fitness.append(entry[1])
                    all_net_ids.append(entry[0])
                    all_eplens.append(entry[2])
                    self.frames_seen += entry[2]
                    all_shaped_fitness.append(entry[3])
                    self.eval_flag[i] = True

            # Soft-join (50%)
            if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                break

        # Add ALL EXPERIENCE COLLECTED TO MEMORY concurrently
        for _ in range(len(self.exp_list)):
            exp = self.exp_list.pop()
            self.add_experience(exp[0], exp[1], exp[2], exp[3], exp[4], exp[5])
        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
        if max(all_fitness) > self.best_score:
            self.best_score = max(all_fitness)
            utils.hard_update(self.best_policy, self.pop[champ_index])
            if SAVE:
                torch.save(self.pop[champ_index].state_dict(),
                           self.args.model_save + 'erl_best' + SAVE_TAG)
                print("Best policy saved with score",
                      '%.2f' % max(all_fitness))

        #Save champion periodically
        if gen % 5 == 0 and max(all_fitness) > (self.best_score -
                                                100) and SAVE:
            torch.save(self.pop[champ_index].state_dict(),
                       self.args.model_save + 'champ' + SAVE_TAG)
            torch.save(self.pop[champ_index].state_dict(),
                       self.args.rl_models + 'champ' + SAVE_TAG)
            print("Champ saved with score ", '%.2f' % max(all_fitness))

        if gen % 20 == 0 and SAVE:
            torch.save(
                self.pop[self.evolver.lineage.index(max(
                    self.evolver.lineage))].state_dict(),
                self.args.model_save + 'eugenic_champ' + SAVE_TAG)
            print("Eugenic Champ saved with score ",
                  '%.2f' % max(self.evolver.lineage))

        if USE_RS:
            all_shaped_fitness = np.array(all_shaped_fitness)
            if self.best_shaped_score is None:
                self.best_shaped_score = [
                    0.0 for _ in range(all_shaped_fitness.shape[1])
                ]  #First time run (set the best shaped score size to track a variable # of shaped fitnesses)

            max_shaped_fit = [max(a) for a in all_shaped_fitness.transpose()]

            for metric_id in range(len(max_shaped_fit)):

                if max_shaped_fit[metric_id] > self.best_shaped_score[
                        metric_id]:
                    self.best_shaped_score[metric_id] = max_shaped_fit[
                        metric_id]
                    shaped_champ_ind = all_net_ids[np.argmax(
                        all_shaped_fitness[:, metric_id])]
                    if SAVE:
                        torch.save(
                            self.pop[shaped_champ_ind].state_dict(),
                            self.args.model_save + 'shaped_erl_best' +
                            str(metric_id) + SAVE_TAG)
                        print(
                            "Best Shaped ERL policy saved with true score",
                            '%.2f' % all_fitness[np.argmax(
                                all_shaped_fitness[:, metric_id])],
                            'and shaped score of ',
                            '%.2f' % max_shaped_fit[metric_id],
                            'for metric id', str(metric_id))

        else:
            max_shaped_fit = None

        #NeuroEvolution's probabilistic selection and recombination step
        self.evolver.epoch(self.pop, all_net_ids, all_fitness,
                           all_shaped_fitness)

        # Synch RL Agent to NE periodically
        if gen % 5 == 0:
            self.evolver.sync_rl(self.args.rl_models, self.pop)

        return max(all_fitness), all_eplens[all_fitness.index(
            max(all_fitness))], all_fitness, all_eplens, all_shaped_fitness
def rollout_worker(args, worker_id, task_pipe, result_pipe, noise, data_bucket,
                   models_bucket, model_template):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

        Parameters:
            worker_id (int): Specific Id unique to each worker spun
            task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
            result_pipe (pipe): Sender end of the pipe used to report back results
            noise (object): A noise generator object
            data_bucket (shared list object): A shared list object managed by a manager that is used to store experience tuples
            models_bucket (shared list object): A shared list object managed by a manager that holds the models (actors) to run
            model_template (object): Template network used to instantiate the local model copies

        Returns:
            None
    """

    env = Task_Rovers(args)
    models = [model_template for _ in range(args.num_rover)]
    for m in models:
        m.eval()

    while True:
        RENDER = task_pipe.recv()  #Wait until a signal is received to start rollout

        # Get the current model state from the population
        for m, bucket_model in zip(models, models_bucket):
            utils.hard_update(m, bucket_model)

        fitness = 0.0
        joint_state = env.reset()
        rollout_trajectory = [[] for _ in range(args.num_rover)]
        joint_state = utils.to_tensor(np.array(joint_state))
        while True:  #unless done

            joint_action = [
                models[i].forward(joint_state[i, :]).detach().numpy()
                for i in range(args.num_rover)
            ]
            if noise is not None:
                for action in joint_action:
                    action += noise.noise()

            next_state, reward, done, info = env.step(
                joint_action)  # Simulate one step in environment

            next_state = utils.to_tensor(np.array(next_state))
            fitness += sum(reward) / args.coupling

            #If storing transitions
            for i in range(args.num_rover):
                rollout_trajectory[i].append([
                    np.expand_dims(utils.to_numpy(joint_state)[i, :], 0),
                    np.expand_dims(np.array(joint_action)[i, :], 0),
                    np.expand_dims(utils.to_numpy(next_state)[i, :], 0),
                    np.expand_dims(np.array([reward[i]]), 0),
                    np.expand_dims(np.array([done]), 0)
                ])

            joint_state = next_state

            #DONE FLAG IS Received
            if done:
                if RENDER: env.render()
                #Push experiences to main
                for rover_id in range(args.num_rover):
                    for entry in rollout_trajectory[rover_id]:
                        for i in range(len(entry[0])):
                            data_bucket[rover_id].append([
                                entry[0], entry[1], entry[2], entry[3],
                                entry[4]
                            ])
                break

        #Send the fitness back using the result pipe
        result_pipe.send([fitness])
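rollout_worker is designed to be driven over pipes from a trainer process. The sketch below shows one hypothetical way to wire it up with multiprocessing; args, the shared buckets, and the helper name are placeholders standing in for objects created elsewhere in the project.

import multiprocessing as mp

def launch_rollout_workers_sketch(args, num_workers, data_bucket, models_bucket, model_template):
    # Spawn workers and return the pipe ends the trainer keeps.
    task_pipes, result_pipes, procs = [], [], []
    for worker_id in range(num_workers):
        task_send, task_recv = mp.Pipe()
        result_send, result_recv = mp.Pipe()
        p = mp.Process(target=rollout_worker,
                       args=(args, worker_id, task_recv, result_send,
                             None, data_bucket, models_bucket, model_template),
                       daemon=True)
        p.start()
        procs.append(p)
        task_pipes.append(task_send)
        result_pipes.append(result_recv)
    return task_pipes, result_pipes, procs

# Typical use: task_pipes[i].send(False) to trigger a non-rendered rollout,
# then result_pipes[i].recv() to read back [fitness].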