def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w=True):
    self.algo_name = algo_name; self.gamma = gamma; self.tau = tau

    # Initialize actors
    self.actor = Actor(state_dim, action_dim, wwid)
    if init_w: self.actor.apply(utils.init_weights)
    self.actor_target = Actor(state_dim, action_dim, wwid)
    utils.hard_update(self.actor_target, self.actor)
    self.actor_optim = Adam(self.actor.parameters(), actor_lr)

    # Initialize critics
    self.critic = Critic(state_dim, action_dim)
    if init_w: self.critic.apply(utils.init_weights)
    self.critic_target = Critic(state_dim, action_dim)
    utils.hard_update(self.critic_target, self.critic)
    self.critic_optim = Adam(self.critic.parameters(), critic_lr)

    self.loss = nn.MSELoss()

    self.actor_target.cuda(); self.critic_target.cuda(); self.actor.cuda(); self.critic.cuda()
    self.num_critic_updates = 0

    # Statistics Tracker
    self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
    self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
    self.critic_loss = {'mean': []}
    self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
    self.val = {'min': [], 'max': [], 'mean': [], 'std': []}
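# For reference, a minimal sketch of the target-network sync helpers that the
# constructors in this file rely on. These are assumptions about what
# utils.hard_update / utils.soft_update do, not this repo's actual code; they
# follow the standard DDPG pattern of a full copy and a Polyak average.
def hard_update_sketch(target, source):
    # Copy every parameter from source into target verbatim
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update_sketch(target, source, tau):
    # Polyak average: target <- (1 - tau) * target + tau * source
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)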
def make_champ_team(self, agents):
    for agent_id, agent in enumerate(agents):
        if self.args.popn_size <= 1:  # Testing without Evo
            agent.update_rollout_actor()
            mod.hard_update(self.rollout_actor[agent_id], agent.rollout_actor[0])
        else:
            mod.hard_update(self.rollout_actor[agent_id], agent.popn[agent.champ_ind])
def __init__(self, id, num_inputs, action_dim, hidden_size, gamma, critic_lr, actor_lr, tau, alpha,
             target_update_interval, savetag, foldername, actualize, use_gpu):
    self.num_inputs = num_inputs
    self.action_space = action_dim
    self.gamma = gamma
    self.tau = 0.005  # Hardcoded: silently overrides the tau argument
    self.alpha = 0.2  # Hardcoded: silently overrides the alpha argument
    self.policy_type = "Gaussian"
    self.target_update_interval = 1  # Hardcoded: silently overrides the target_update_interval argument
    self.tracker = utils.Tracker(foldername,
                                 ['q_' + savetag, 'qloss_' + savetag, 'value_' + savetag,
                                  'value_loss_' + savetag, 'policy_loss_' + savetag,
                                  'mean_loss_' + savetag, 'std_loss_' + savetag],
                                 '.csv', save_iteration=1000, conv_size=1000)
    self.total_update = 0
    self.agent_id = id
    self.actualize = actualize

    self.critic = QNetwork(self.num_inputs, self.action_space, hidden_size)
    self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)
    self.soft_q_criterion = nn.MSELoss()

    if self.policy_type == "Gaussian":
        self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='GaussianPolicy')
        self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr)
        self.value = ValueNetwork(self.num_inputs, hidden_size)
        self.value_target = ValueNetwork(self.num_inputs, hidden_size)
        self.value_optim = Adam(self.value.parameters(), lr=critic_lr)
        utils.hard_update(self.value_target, self.value)
        self.value_criterion = nn.MSELoss()
    else:
        # Unreachable while policy_type is hardcoded to "Gaussian"
        self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='DeterministicPolicy')
        self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr)
        self.critic_target = QNetwork(self.num_inputs, self.action_space, hidden_size)
        utils.hard_update(self.critic_target, self.critic)

    self.policy.cuda()
    self.critic.cuda()
    if self.policy_type == "Gaussian":  # Guard: self.value only exists on the Gaussian path
        self.value.cuda()
        self.value_target.cuda()

    # Statistics Tracker
    self.q = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.val = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.value_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.policy_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.mean_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.std_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.q_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
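# A hedged sketch (not this repo's code) of the value-network regression that
# the Gaussian branch above sets up, per the original SAC formulation:
# V(s) is fit toward Q(s, a') - alpha * log pi(a'|s) for a freshly sampled a'.
# Assumptions: the policy exposes a sampler returning (action, log_prob), the
# critic takes (state, action), and utils provides a soft_update helper
# analogous to the sketch after the first constructor.
import torch

def sac_value_step(agent, state_batch):
    new_action, log_prob = agent.policy.sample(state_batch)  # assumed sampler API
    with torch.no_grad():
        target_v = agent.critic(state_batch, new_action) - agent.alpha * log_prob
    value_loss = agent.value_criterion(agent.value(state_batch), target_v)
    agent.value_optim.zero_grad()
    value_loss.backward()
    agent.value_optim.step()
    if agent.total_update % agent.target_update_interval == 0:
        utils.soft_update(agent.value_target, agent.value, agent.tau)  # assumed helper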
def collect_trajectory(self):
    utils.hard_update(self.actual_red_worker, self.actual_red_actor)  # First sync the actor

    # Launch rollout workers
    for id, actor in enumerate(self.rollout_bucket):
        if self.evo_flag[id]:
            self.evo_task_pipes[id][0].send((id, 0))  # Second argument in send is a dummy
            self.evo_flag[id] = False

    # Wait for the rollouts to complete and record fitness
    all_fitness = []
    for i in range(self.num_workers):
        entry = self.evo_result_pipes[i][1].recv()
        all_fitness.append(entry[1])
        self.evo_flag[i] = True

    self.buffer.referesh()  # Update replay buffer
    return all_fitness
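# A minimal, self-contained illustration of the task/result pipe handshake
# that collect_trajectory() and the train() loops below use: the main process
# sends a task id down a task pipe, the worker replies (id, fitness, frames)
# on a result pipe. The worker body here is a toy stand-in, not repo code.
from multiprocessing import Process, Pipe

def _toy_worker(task_conn, result_conn):
    while True:
        task_id, gen = task_conn.recv()            # block until a task arrives
        fitness = float(task_id + gen)             # stand-in for a real rollout
        result_conn.send((task_id, fitness, 1))    # (id, fitness, num_frames)

if __name__ == '__main__':
    task_pipe, result_pipe = Pipe(), Pipe()
    Process(target=_toy_worker, args=(task_pipe[1], result_pipe[0]), daemon=True).start()
    task_pipe[0].send((0, 0))
    print(result_pipe[1].recv())  # -> (0, 0.0, 1)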
def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau,
             savetag, foldername, actualize, use_gpu, num_agents, init_w=True):
    self.algo_name = algo_name; self.gamma = gamma; self.tau = tau
    self.total_update = 0; self.agent_id = id; self.use_gpu = use_gpu
    self.tracker = utils.Tracker(foldername, ['q_' + savetag, 'qloss_' + savetag, 'policy_loss_' + savetag],
                                 '.csv', save_iteration=1000, conv_size=1000)
    self.num_agents = num_agents

    # Initialize actors
    self.policy = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
    if init_w: self.policy.apply(utils.init_weights)
    self.policy_target = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
    utils.hard_update(self.policy_target, self.policy)
    self.policy_optim = Adam(self.policy.parameters(), actor_lr)

    # Centralized critics: one per agent, each over the joint state/action
    self.critics = [QNetwork(state_dim * num_agents, action_dim * num_agents, hidden_size * 3)
                    for _ in range(num_agents)]
    self.critics_target = [QNetwork(state_dim * num_agents, action_dim * num_agents, hidden_size * 3)
                           for _ in range(num_agents)]
    # Initialize critic weights and sync targets (targets synced regardless of init_w)
    for critic, critic_target in zip(self.critics, self.critics_target):
        if init_w: critic.apply(utils.init_weights)
        utils.hard_update(critic_target, critic)
    self.critic_optims = [Adam(critic.parameters(), critic_lr) for critic in self.critics]

    self.loss = nn.MSELoss()

    if use_gpu:
        self.policy_target.cuda(); self.policy.cuda()
        for critic, critic_target in zip(self.critics, self.critics_target):
            critic.cuda()
            critic_target.cuda()
    self.num_critic_updates = 0

    # Statistics Tracker
    # self.action_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.policy_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.q_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.q = {'min': None, 'max': None, 'mean': None, 'std': None}
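# A hedged sketch of how a batch would be shaped for the centralized critics
# above: each critic consumes the joint state and joint action, so per-agent
# tensors of shape (batch, dim) are concatenated along the feature axis into
# (batch, dim * num_agents). The two-argument QNetwork call is an assumption.
import torch

def joint_critic_input(states_per_agent, actions_per_agent):
    # states_per_agent / actions_per_agent: lists of (batch, dim) tensors, one per agent
    joint_state = torch.cat(states_per_agent, dim=1)    # (batch, state_dim * num_agents)
    joint_action = torch.cat(actions_per_agent, dim=1)  # (batch, action_dim * num_agents)
    return joint_state, joint_action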
def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau,
             savetag, foldername, actualize, use_gpu, num_agents, init_w=True):
    self.algo_name = algo_name; self.gamma = gamma; self.tau = tau
    self.total_update = 0; self.agent_id = id; self.actualize = actualize; self.use_gpu = use_gpu
    self.tracker = utils.Tracker(foldername,
                                 ['q_' + savetag, 'qloss_' + savetag, 'policy_loss_' + savetag,
                                  'alz_score' + savetag, 'alz_policy' + savetag],
                                 '.csv', save_iteration=1000, conv_size=1000)

    # Initialize actors
    self.policy = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
    if init_w: self.policy.apply(utils.init_weights)
    self.policy_target = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
    utils.hard_update(self.policy_target, self.policy)
    self.policy_optim = Adam(self.policy.parameters(), actor_lr)

    self.critic = QNetwork(state_dim, action_dim, hidden_size)
    if init_w: self.critic.apply(utils.init_weights)
    self.critic_target = QNetwork(state_dim, action_dim, hidden_size)
    utils.hard_update(self.critic_target, self.critic)
    self.critic_optim = Adam(self.critic.parameters(), critic_lr)

    if actualize:
        self.ANetwork = ActualizationNetwork(state_dim, action_dim, hidden_size)
        if init_w: self.ANetwork.apply(utils.init_weights)
        self.actualize_optim = Adam(self.ANetwork.parameters(), critic_lr)
        self.actualize_lr = 0.2
        if use_gpu: self.ANetwork.cuda()

    self.loss = nn.MSELoss()

    if use_gpu:
        self.policy_target.cuda(); self.critic_target.cuda(); self.policy.cuda(); self.critic.cuda()
    self.num_critic_updates = 0

    # Statistics Tracker
    # self.action_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.policy_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.q_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.q = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.alz_score = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.alz_policy = {'min': None, 'max': None, 'mean': None, 'std': None}
def train(self, gen, frame_tracker):
    """Main training loop to do rollouts, neuroevolution, and policy gradients

        Parameters:
            gen (int): Current epoch of training

        Returns:
            max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
    """
    ################ START ROLLOUTS ##############
    # Start Evolution rollouts
    if not ISOLATE_PG:
        for id, actor in enumerate(self.pop):
            if self.evo_flag[id]:
                self.evo_task_pipes[id][0].send(id)
                self.evo_flag[id] = False

    # Sync all learners' actors to the cpu (rollout) actors
    for i, learner in enumerate(self.portfolio):
        learner.algo.actor.cpu()
        utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
        learner.algo.actor.cuda()

    # Start Learner rollouts
    for rollout_id, learner_id in enumerate(self.allocation):
        if self.roll_flag[rollout_id]:
            self.task_pipes[rollout_id][0].send(learner_id)
            self.roll_flag[rollout_id] = False

    # Start Test rollouts
    if gen % 5 == 0:
        self.test_flag = True
        for pipe in self.test_task_pipes: pipe[0].send(0)

    ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
    if self.replay_buffer.__len__() > self.args.batch_size * 10:  ### BURN-IN PERIOD
        self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling

        # Spin up threads for each learner
        threads = [threading.Thread(target=learner.update_parameters,
                                    args=(self.replay_buffer, self.args.buffer_gpu, self.args.batch_size,
                                          int(self.gen_frames * self.args.gradperstep)))
                   for learner in self.portfolio]

        # Start threads
        for thread in threads: thread.start()

        # Join threads
        for thread in threads: thread.join()
        self.gen_frames = 0

    ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
    if not ISOLATE_PG:
        all_fitness = []; all_net_ids = []; all_eplens = []
        while True:
            for i in range(self.args.pop_size):
                if self.evo_result_pipes[i][1].poll():
                    entry = self.evo_result_pipes[i][1].recv()
                    all_fitness.append(entry[1]); all_net_ids.append(entry[0]); all_eplens.append(entry[2])
                    self.gen_frames += entry[2]; self.total_frames += entry[2]
                    self.evo_flag[i] = True

            # Soft-join (50%)
            if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac: break

    ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
    for i in range(self.args.rollout_size):
        entry = self.result_pipes[i][1].recv()
        learner_id = entry[0]; fitness = entry[1]; num_frames = entry[2]
        self.portfolio[learner_id].update_stats(fitness, num_frames)

        self.gen_frames += num_frames; self.total_frames += num_frames
        if fitness > self.best_score: self.best_score = fitness

        self.roll_flag[i] = True

    # Refresh buffer (housekeeping task - pruning to keep under capacity)
    self.replay_buffer.referesh()
    ######################### END OF PARALLEL ROLLOUTS ################

    ############ PROCESS MAX FITNESS #############
    if not ISOLATE_PG:
        champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
        utils.hard_update(self.test_bucket[0], self.pop[champ_index])
        if max(all_fitness) > self.best_score:
            self.best_score = max(all_fitness)
            utils.hard_update(self.best_policy, self.pop[champ_index])
            if SAVE:
                torch.save(self.pop[champ_index].state_dict(),
                           self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                print("Best policy saved with score", '%.2f' % max(all_fitness))
    else:  # Run PG in isolation
        utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

    ###### TEST SCORE ######
    if self.test_flag:
        self.test_flag = False
        test_scores = []
        for pipe in self.test_result_pipes:  # Collect all results
            entry = pipe[1].recv()
            test_scores.append(entry[1])
        test_scores = np.array(test_scores)
        test_mean = np.mean(test_scores); test_std = np.std(test_scores)

        # Update score to trackers
        frame_tracker.update([test_mean], self.total_frames)
    else:
        test_mean, test_std = None, None

    # NeuroEvolution's probabilistic selection and recombination step
    if not ISOLATE_PG:
        if gen % 5 == 0:
            self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, self.rollout_bucket)
        else:
            self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, [])

    # META LEARNING - RESET ALLOCATION USING UCB
    if gen % 1 == 0:
        self.allocation = ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient)

    # Metrics
    if not ISOLATE_PG:
        champ_len = all_eplens[all_fitness.index(max(all_fitness))]
        champ_wwid = int(self.pop[champ_index].wwid.item())
        max_fit = max(all_fitness)
    else:
        champ_len = num_frames
        champ_wwid = int(self.rollout_bucket[0].wwid.item())
        all_fitness = [fitness]; max_fit = fitness; all_eplens = [num_frames]

    return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
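# A hedged sketch of the UCB allocation step above. The real ucb() signature
# takes (num_rollouts, portfolio, coefficient); the learner statistics used
# here (.value and .visit_count) are assumptions about what update_stats tracks.
import math

def ucb_sketch(num_rollouts, portfolio, coefficient):
    visits = [max(learner.visit_count, 1) for learner in portfolio]
    allocation = []
    for _ in range(num_rollouts):
        total = sum(visits)
        scores = [learner.value + coefficient * math.sqrt(math.log(total) / visits[i])
                  for i, learner in enumerate(portfolio)]
        pick = scores.index(max(scores))
        allocation.append(pick)
        visits[pick] += 1  # provisional count so successive slots spread out
    return allocation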
def train(self, gen, frame_tracker):
    """Main training loop to do rollouts, neuroevolution, and policy gradients

        Parameters:
            gen (int): Current epoch of training

        Returns:
            max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
    """
    ################ START ROLLOUTS ##############
    # Start Evolution rollouts
    if not ISOLATE_PG:
        for id, actor in enumerate(self.pop):
            if self.evo_flag[id]:
                self.evo_task_pipes[id][0].send((id, gen))
                self.evo_flag[id] = False

    # Sync all learners' actors to the cpu (rollout) actors
    # (update rollout parameters from the learner parameters, so rollout workers are up to date)
    for i, learner in enumerate(self.portfolio):  # One entry per learner
        learner.algo.actor.cpu()
        # rollout_bucket is now synchronized with the learner and can perform rollouts for it
        utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
        if torch.cuda.is_available(): learner.algo.actor.cuda()

    # Start Learner rollouts
    for rollout_id, learner_id in enumerate(self.allocation):  # One entry per rollout_size
        if self.roll_flag[rollout_id]:
            # allocation records which learner each bucket should run, so rollout_id indexes rollout_bucket
            self.task_pipes[rollout_id][0].send((learner_id, gen))
            self.roll_flag[rollout_id] = False

    # Start Test rollouts
    if gen % 5 == 0:
        self.test_flag = True
        for pipe in self.test_task_pipes: pipe[0].send((0, gen))

    ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
    # Main training loop
    if self.replay_buffer.__len__() > self.args.batch_size * 10:  ### BURN-IN PERIOD
        self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling

        # Spin up threads for each learner
        threads = [threading.Thread(target=learner.update_parameters,
                                    args=(self.replay_buffer, self.args.buffer_gpu, self.args.batch_size,
                                          int(self.gen_frames * self.args.gradperstep)))
                   for learner in self.portfolio]  # TODO (macheng): do we want to train all the learners?

        # Start threads
        for thread in threads: thread.start()

        # Join threads
        for thread in threads: thread.join()

        # Now update average_policy
        # self.average_policy.cuda()
        if ALGO == 'dis':
            self.average_policy.update()  # Fit the average_policy parameters via supervised learning

        self.gen_frames = 0

    ######### Visualize Learner Critic Function #################
    # if self.replay_buffer.__len__() % 2500 == 0:
    #     visualize_critic(self.portfolio[2], make_self_play_env(trainers=[[], []])[0], 50)  # arguments: Learner, env, N_GRID

    ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
    if not ISOLATE_PG:
        all_fitness = []; all_net_ids = []; all_eplens = []
        while True:
            for i in range(self.args.pop_size):
                if self.evo_result_pipes[i][1].poll():
                    entry = self.evo_result_pipes[i][1].recv()
                    all_fitness.append(entry[1]); all_net_ids.append(entry[0]); all_eplens.append(entry[2])
                    self.gen_frames += entry[2]; self.total_frames += entry[2]
                    self.evo_flag[i] = True

            # Soft-join (50%)
            if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac: break

    ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
    for i in range(self.args.rollout_size):
        entry = self.result_pipes[i][1].recv()
        learner_id = entry[0]; fitness = entry[1]; num_frames = entry[2]
        self.portfolio[learner_id].update_stats(fitness, num_frames)

        self.gen_frames += num_frames; self.total_frames += num_frames
        if fitness > self.best_score: self.best_score = fitness

        self.roll_flag[i] = True

    # Refresh buffer (housekeeping task - pruning to keep under capacity)
    self.replay_buffer.referesh()
    ######################### END OF PARALLEL ROLLOUTS ################

    ############ PROCESS MAX FITNESS #############
    # The champion (best learner) is saved here; best_policy is thus always up to date
    if not ISOLATE_PG:
        champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
        utils.hard_update(self.test_bucket[0], self.pop[champ_index])
        if max(all_fitness) > self.best_score:
            self.best_score = max(all_fitness)
            utils.hard_update(self.best_policy, self.pop[champ_index])
            if SAVE:
                torch.save(self.pop[champ_index].state_dict(),
                           self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                print("Best policy saved with score", '%.2f' % max(all_fitness))
    else:  # Run PG in isolation
        utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

    ###### TEST SCORE ######
    if self.test_flag:
        self.test_flag = False
        test_scores = []
        for pipe in self.test_result_pipes:  # Collect all results
            entry = pipe[1].recv()
            test_scores.append(entry[1])
        test_scores = np.array(test_scores)
        test_mean = np.mean(test_scores); test_std = np.std(test_scores)

        # Update score to trackers
        frame_tracker.update([test_mean], self.total_frames)
    else:
        test_mean, test_std = None, None

    # NeuroEvolution's probabilistic selection and recombination step
    # Note: epoch() implements the neuroevolution step and also copies learners into the evolver
    # Caution: pop_size and rollout_size appear pinned (e.g. to 10); other values can error here
    if not ISOLATE_PG:
        if gen % 5 == 0:
            self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, self.rollout_bucket)
        else:
            self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, [])

    # META LEARNING - RESET ALLOCATION USING UCB
    if gen % 1 == 0: self.update_allocation()

    # Metrics
    if not ISOLATE_PG:
        champ_len = all_eplens[all_fitness.index(max(all_fitness))]
        champ_wwid = int(self.pop[champ_index].wwid.item())
        max_fit = max(all_fitness)
    else:
        champ_len = num_frames
        champ_wwid = int(self.rollout_bucket[0].wwid.item())
        all_fitness = [fitness]; max_fit = fitness; all_eplens = [num_frames]

    return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
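# A minimal, runnable illustration of the thread fan-out/join pattern both
# train() variants use for learner updates. Threads suit this workload only
# because the heavy lifting happens inside torch ops that release the GIL;
# the update function below is a toy stand-in for learner.update_parameters.
import threading

def _toy_update(learner_id, results):
    results[learner_id] = learner_id * 2  # stand-in for a batch of gradient steps

if __name__ == '__main__':
    results = {}
    threads = [threading.Thread(target=_toy_update, args=(i, results)) for i in range(4)]
    for t in threads: t.start()
    for t in threads: t.join()
    print(results)  # -> {0: 0, 1: 2, 2: 4, 3: 6}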
def evolve(self, pop, net_inds, fitness_evals, migration, states):
    """Method to implement a round of selection and mutation operation

        Parameters:
            pop (shared_list): Population of models
            net_inds (list): Indices of individuals evaluated this generation
            fitness_evals (list of lists): Fitness values for evaluated individuals
            migration (object): Policies from learners to be synced into population

        Returns:
            int: Index of the first anchor (multipoint) or first new elitist (standard)
    """
    self.gen += 1

    # Convert the list of fitness values corresponding to each individual into a float [CCEA Reduction]
    if isinstance(fitness_evals[0], list):
        for i in range(len(fitness_evals)):
            if self.ccea_reduction == "mean":
                fitness_evals[i] = sum(fitness_evals[i]) / len(fitness_evals[i])
            elif self.ccea_reduction == "leniency":
                fitness_evals[i] = max(fitness_evals[i])
            elif self.ccea_reduction == "min":
                fitness_evals[i] = min(fitness_evals[i])
            else:
                sys.exit('Incorrect CCEA Reduction scheme')

    # Append new fitness to lineage
    lineage_scores = []  # Tracks the average lineage score for the generation
    for ind, fitness in zip(net_inds, fitness_evals):
        self.lineage[ind].append(fitness)
        # Current fitness is weighted higher than lineage info
        lineage_scores.append(0.75 * sum(self.lineage[ind]) / len(self.lineage[ind]) + 0.25 * fitness)
        if len(self.lineage[ind]) > self.lineage_depth:
            self.lineage[ind].pop(0)  # Housekeeping

    # Entire epoch is handled with indices; rank nets by fitness evaluation (0 is the best after reversing)
    index_rank = self.list_argsort(fitness_evals)
    index_rank.reverse()
    elitist_index = index_rank[:self.num_elites]  # Elitist indices safeguard

    # Lineage rankings to elitists
    lineage_rank = self.list_argsort(lineage_scores[:])
    lineage_rank.reverse()
    elitist_index = elitist_index + lineage_rank[:int(self.num_elites)]

    # Take out copies in elitist indices
    elitist_index = list(set(elitist_index))

    ######## MULTIPOINT SEARCH WITH ANCHORS/PROBES/BLENDS AND EXPLICIT DIVERSITY-BASED SEPARATION ########
    if self.scheme == 'multipoint':

        # Compute anchors
        anchor_inds = self.get_anchors(states, pop, net_inds[:], np.array(lineage_rank[:]))

        # Remove duplicates between anchors and elitists
        # (rebuild rather than pop while iterating, which can skip entries)
        elitist_index = [elite for elite in elitist_index if elite not in anchor_inds]

        # TRANSFER INDICES BACK TO POP INDICES: from positions in net_inds to the real indices in pop
        elites = [net_inds[i] for i in elitist_index]
        anchors = [net_inds[i] for i in anchor_inds]
        anchor_fitnesses = [fitness_evals[i] for i in anchor_inds]
        anchor_index_ranks = [index_rank.index(i) for i in anchor_inds]

        # Unselects are the individuals left in the population
        unselects = [ind for ind in net_inds if ind not in elites and ind not in anchors]

        # Inheritance step (sync learners to population)
        for policy in migration:
            replacee = unselects.pop(0)
            utils.hard_update(target=pop[replacee], source=policy)
            # wwid = genealogy.asexual(int(policy.wwid.item()))
            # pop[replacee].wwid[0] = wwid
            self.lineage[replacee] = []  # Reinitialize as empty

        # Sample anchors from a probability distribution formed of their relative fitnesses using a roulette wheel
        probe_allocation_inds = self.roulette_wheel(anchor_fitnesses, len(unselects) - self.num_blends)
        sampled_anchors = [anchors[i] for i in probe_allocation_inds]

        # Mutate the anchors to form probes
        for anchor_ind in sampled_anchors:
            replacee = unselects.pop(0)
            utils.hard_update(target=pop[replacee], source=pop[anchor_ind])
            self.lineage[replacee] = [utils.list_mean(self.lineage[anchor_ind])]  # Inherit lineage from the anchor
            self.mutate_inplace(pop[replacee])
            # genealogy.mutation(int(pop[replacee].wwid.item()), gen)

        if random.random() < 0.1:
            print('Evo_Info #Anchors', len(anchors),
                  '#Probes_allocation', [sampled_anchors.count(i) for i in anchors],
                  '#elites', len(elites), '#Blends', len(unselects),
                  '#Migration', len(migration), 'Nets', len(net_inds),
                  'Anchor fitness Ranks', anchor_index_ranks)

        ###### Create the blends to fill the rest of the unselects by crossovers ######
        # Number of unselects left should be even
        if len(unselects) % 2 != 0:
            unselects.append(unselects[random.randint(0, len(unselects) - 1)])

        for i, j in zip(unselects[0::2], unselects[1::2]):
            off_i = random.choice(anchors)
            while True:  # Draw a second, distinct anchor
                off_j = random.choice(anchors)
                if off_j != off_i: break
            utils.hard_update(target=pop[i], source=pop[off_i])
            utils.hard_update(target=pop[j], source=pop[off_j])
            self.crossover_inplace(pop[i], pop[j])
            # wwid1 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
            # wwid2 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
            # pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2
            blended_lineage = 0.5 * utils.list_mean(self.lineage[off_i]) + 0.5 * utils.list_mean(self.lineage[off_j])
            self.lineage[i] = [blended_lineage]
            self.lineage[j] = [blended_lineage]

        return anchors[0]

    ####################### OLD EVOLVER WITHOUT MULTIPOINT SEARCH ###########
    elif self.scheme == 'standard':

        # Selection step
        offsprings = self.selection_tournament(index_rank,
                                               num_offsprings=len(index_rank) - len(elitist_index) - len(migration),
                                               tournament_size=3)

        # Transcribe ranked indices from now on to refer to net indices
        elitist_index = [net_inds[i] for i in elitist_index]
        offsprings = [net_inds[i] for i in offsprings]

        # Figure out unselected candidates
        unselects = []; new_elitists = []
        for i in range(len(pop)):
            if i in offsprings or i in elitist_index:
                continue
            else:
                unselects.append(i)
        random.shuffle(unselects)

        # Check the migration's performance
        for ind in self.migrating_inds:
            if ind in offsprings or ind in elitist_index:
                self.rl_res['selects'] += 1
            else:
                self.rl_res['discarded'] += 1
        self.migrating_inds = []

        # Inheritance step (sync learners to population)
        for policy in migration:
            replacee = unselects.pop(0)
            utils.hard_update(target=pop[replacee], source=policy)
            self.migrating_inds.append(replacee)
            self.lineage[replacee] = [sum(lineage_scores) / len(lineage_scores)]  # Initialize as average

        # Elitism step, assigning elite candidates to some unselects
        for i in elitist_index:
            if len(unselects) >= 1:
                replacee = unselects.pop(0)
            elif len(offsprings) >= 1:
                replacee = offsprings.pop(0)
            else:
                continue
            new_elitists.append(replacee)
            utils.hard_update(target=pop[replacee], source=pop[i])
            # wwid = genealogy.asexual(int(pop[i].wwid.item()))
            # pop[replacee].wwid[0] = wwid
            # genealogy.elite(wwid, gen)
            self.lineage[replacee] = self.lineage[i][:]

        # Crossover for unselected genes with 100 percent probability
        if len(unselects) % 2 != 0:  # Number of unselects left should be even
            unselects.append(unselects[random.randint(0, len(unselects) - 1)])
        for i, j in zip(unselects[0::2], unselects[1::2]):
            off_i = random.choice(new_elitists)
            off_j = random.choice(offsprings)
            utils.hard_update(target=pop[i], source=pop[off_i])
            utils.hard_update(target=pop[j], source=pop[off_j])
            self.crossover_inplace(pop[i], pop[j])
            # wwid1 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
            # wwid2 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
            # pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2
            blended_lineage = 0.5 * utils.list_mean(self.lineage[off_i]) + 0.5 * utils.list_mean(self.lineage[off_j])
            self.lineage[i] = [blended_lineage]
            self.lineage[j] = [blended_lineage]

        # Crossover for selected offsprings
        for i, j in zip(offsprings[0::2], offsprings[1::2]):
            if random.random() < self.crossover_prob:
                self.crossover_inplace(pop[i], pop[j])
                # wwid1 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
                # wwid2 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
                # pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2
                blended_lineage = 0.5 * utils.list_mean(self.lineage[i]) + 0.5 * utils.list_mean(self.lineage[j])
                self.lineage[i] = [blended_lineage]
                self.lineage[j] = [blended_lineage]

        # Mutate all genes in the population except the new elitists
        for i in range(len(pop)):
            if i not in new_elitists:  # Spare the new elitists
                if random.random() < self.mutation_prob:
                    self.mutate_inplace(pop[i])
                    # genealogy.mutation(int(pop[net_i].wwid.item()), gen)

        self.all_offs[:] = offsprings[:]
        return new_elitists[0]

    else:
        sys.exit('Incorrect Evolution Scheme')
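# A hedged sketch of the fitness-proportionate sampling that roulette_wheel()
# performs in the multipoint branch above: anchors with higher fitness receive
# more probe slots. The shift makes all weights positive; the exact
# normalization used in the repo may differ.
import random

def roulette_wheel_sketch(fitnesses, num_samples):
    shift = min(fitnesses)
    weights = [f - shift + 1e-8 for f in fitnesses]  # strictly positive weights
    total = sum(weights)
    probs = [w / total for w in weights]
    return random.choices(range(len(fitnesses)), weights=probs, k=num_samples)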
def epoch(self, gen, genealogy, pop, net_inds, fitness_evals, migration):
    """Method to implement a round of selection and mutation operation

        Parameters:
            pop (shared_list): Population of models
            net_inds (list): Indices of individuals evaluated this generation
            fitness_evals (list): Fitness values for evaluated individuals
            migration (object): Policies from learners to be synced into population

        Returns:
            None
    """
    self.gen += 1
    num_elitists = int(self.args.elite_fraction * len(fitness_evals))
    if num_elitists < 2: num_elitists = 2

    # Entire epoch is handled with indices; rank nets by fitness evaluation (0 is the best after reversing)
    index_rank = self.list_argsort(fitness_evals); index_rank.reverse()
    elitist_index = index_rank[:num_elitists]  # Elitist indices safeguard

    # Selection step
    offsprings = self.selection_tournament(index_rank,
                                           num_offsprings=len(index_rank) - len(elitist_index) - len(migration),
                                           tournament_size=3)

    # Transcribe ranked indices from now on to refer to net indices
    elitist_index = [net_inds[i] for i in elitist_index]
    offsprings = [net_inds[i] for i in offsprings]

    # Figure out unselected candidates
    unselects = []; new_elitists = []
    for net_i in net_inds:
        if net_i in offsprings or net_i in elitist_index:
            continue
        else:
            unselects.append(net_i)
    random.shuffle(unselects)

    # Inheritance step (sync learners to population)
    for policy in migration:
        replacee = unselects.pop(0)
        utils.hard_update(target=pop[replacee], source=policy)
        wwid = genealogy.asexual(int(policy.wwid.item()))
        pop[replacee].wwid[0] = wwid

    # Elitism step, assigning elite candidates to some unselects
    for i in elitist_index:
        try:
            replacee = unselects.pop(0)
        except IndexError:  # No unselects left; fall back to an offspring slot
            replacee = offsprings.pop(0)
        new_elitists.append(replacee)
        utils.hard_update(target=pop[replacee], source=pop[i])
        wwid = genealogy.asexual(int(pop[i].wwid.item()))
        pop[replacee].wwid[0] = wwid
        genealogy.elite(wwid, gen)
        # self.lineage[replacee] = self.lineage[i]

    # Crossover for unselected genes with 100 percent probability
    if len(unselects) % 2 != 0:  # Number of unselects left should be even
        unselects.append(unselects[random.randint(0, len(unselects) - 1)])
    for i, j in zip(unselects[0::2], unselects[1::2]):
        off_i = random.choice(new_elitists); off_j = random.choice(offsprings)
        utils.hard_update(target=pop[i], source=pop[off_i])
        utils.hard_update(target=pop[j], source=pop[off_j])
        self.crossover_inplace(pop[i], pop[j])
        wwid1 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
        wwid2 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
        pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2
        # self.lineage[i] = (self.lineage[off_i] + self.lineage[off_j]) / 2
        # self.lineage[j] = (self.lineage[off_i] + self.lineage[off_j]) / 2

    # Crossover for selected offsprings
    for i, j in zip(offsprings[0::2], offsprings[1::2]):
        if random.random() < self.args.crossover_prob:
            self.crossover_inplace(pop[i], pop[j])
            wwid1 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
            wwid2 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
            pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2

    # Mutate all genes in the population except the new elitists
    for net_i in net_inds:
        if net_i not in new_elitists:  # Spare the new elitists
            if random.random() < self.args.mutation_prob:
                self.mutate_inplace(pop[net_i])
                genealogy.mutation(int(pop[net_i].wwid.item()), gen)

    self.all_offs[:] = offsprings[:]
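# A hedged sketch of what selection_tournament() (used above and in the
# evolvers) presumably does: repeatedly draw tournament_size contestants and
# keep the winner, i.e. the contestant at the best (lowest) rank position,
# since index_rank is ordered best-first. The repo's tie-breaking and
# deduplication policy may differ.
import random

def selection_tournament_sketch(index_rank, num_offsprings, tournament_size):
    winners = []
    for _ in range(num_offsprings):
        contestants = random.sample(range(len(index_rank)), tournament_size)
        winner_pos = min(contestants)  # best-first ordering: lowest position wins
        winners.append(index_rank[winner_pos])
    return winners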
def evolve(self, pop, net_inds, fitness_evals, migration):
    """Method to implement a round of selection and mutation operation

        Parameters:
            pop (shared_list): Population of models
            net_inds (list): Indices of individuals evaluated this generation
            fitness_evals (list of lists): Fitness values for evaluated individuals
            migration (object): Policies from learners to be synced into population

        Returns:
            int: Index of the first new elitist
    """
    self.gen += 1

    # Convert the list of fitness values corresponding to each individual into a float [CCEA Reduction]
    if isinstance(fitness_evals[0], list):
        for i in range(len(fitness_evals)):
            if self.ccea_reduction == "mean":
                fitness_evals[i] = sum(fitness_evals[i]) / len(fitness_evals[i])
            elif self.ccea_reduction == "leniency":
                fitness_evals[i] = max(fitness_evals[i])
            elif self.ccea_reduction == "min":
                fitness_evals[i] = min(fitness_evals[i])
            else:
                sys.exit('Incorrect CCEA Reduction scheme')

    # Entire epoch is handled with indices; rank nets by fitness evaluation (0 is the best after reversing)
    index_rank = self.list_argsort(fitness_evals); index_rank.reverse()
    elitist_index = index_rank[:self.num_elites]  # Elitist indices safeguard

    # Selection step
    offsprings = self.selection_tournament(index_rank,
                                           num_offsprings=len(index_rank) - len(elitist_index) - len(migration) - 1,
                                           tournament_size=3)

    # Transcribe ranked indices from now on to refer to net indices
    elitist_index = [net_inds[i] for i in elitist_index]
    offsprings = [net_inds[i] for i in offsprings]

    # Figure out unselected candidates
    unselects = []; new_elitists = []
    for i in range(len(pop)):
        if i in offsprings or i in elitist_index:
            continue
        else:
            unselects.append(i)
    random.shuffle(unselects)

    # Inheritance step (sync learners to population)
    for policy in migration:
        replacee = unselects.pop(0)
        utils.hard_update(target=pop[replacee], source=policy)

    # Elitism step, assigning elite candidates to some unselects
    for i in elitist_index:
        if len(unselects) >= 1:
            replacee = unselects.pop(0)
        elif len(offsprings) >= 1:
            replacee = offsprings.pop(0)
        else:
            continue
        new_elitists.append(replacee)
        utils.hard_update(target=pop[replacee], source=pop[i])

    # Crossover for unselected genes with 100 percent probability
    if len(unselects) % 2 != 0:  # Number of unselects left should be even
        unselects.append(unselects[random.randint(0, len(unselects) - 1)])
    for i, j in zip(unselects[0::2], unselects[1::2]):
        off_i = random.choice(new_elitists); off_j = random.choice(offsprings)
        utils.hard_update(target=pop[i], source=pop[off_i])
        utils.hard_update(target=pop[j], source=pop[off_j])
        self.crossover_inplace(pop[i], pop[j])

    # Crossover for selected offsprings
    for i, j in zip(offsprings[0::2], offsprings[1::2]):
        if random.random() < self.crossover_prob:
            self.crossover_inplace(pop[i], pop[j])

    # Mutate all genes in the population except the new elitists
    for i in range(len(pop)):
        if i not in new_elitists:  # Spare the new elitists
            if random.random() < self.mutation_prob:
                self.mutate_inplace(pop[i])
                # genealogy.mutation(int(pop[net_i].wwid.item()), gen)

    self.all_offs[:] = offsprings[:]
    return new_elitists[0]
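# Hedged sketches of the two variation operators the evolvers above call.
# These follow common ERL-style implementations (swap random weight-matrix
# rows between parents; perturb a fraction of weights with Gaussian noise);
# the repo's crossover_inplace / mutate_inplace may differ in detail.
import random
import torch

def crossover_inplace_sketch(gene1, gene2, row_swap_prob=0.05):
    with torch.no_grad():
        for p1, p2 in zip(gene1.parameters(), gene2.parameters()):
            if p1.dim() == 2:  # weight matrices: swap a few random rows between parents
                for row in range(p1.shape[0]):
                    if random.random() < row_swap_prob:
                        tmp = p1.data[row].clone()
                        p1.data[row] = p2.data[row]
                        p2.data[row] = tmp

def mutate_inplace_sketch(gene, mut_frac=0.1, mut_strength=0.1):
    with torch.no_grad():
        for param in gene.parameters():
            mask = (torch.rand_like(param) < mut_frac).float()  # mutate ~mut_frac of weights
            param.add_(mask * torch.randn_like(param) * mut_strength)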
def update_rollout_actor(self):
    for actor in self.rollout_actor:
        self.algo.policy.cpu()
        mod.hard_update(actor, self.algo.policy)
        if self.args.use_gpu: self.algo.policy.cuda()
def evolve(self, pop, net_inds, fitness_evals, migration):
    """Method to implement a round of selection and mutation operation

        Parameters:
            pop (shared_list): Population of models
            net_inds (list): Indices of individuals evaluated this generation
            fitness_evals (list of lists): Fitness values for evaluated individuals
            migration (object): Policies from learners to be synced into population

        Returns:
            int: Index of the first new elitist
    """
    self.gen += 1

    # Convert the list of fitness values corresponding to each individual into a float [CCEA Reduction]
    if isinstance(fitness_evals[0], list):
        for i in range(len(fitness_evals)):
            if self.ccea_reduction == "mean":
                fitness_evals[i] = sum(fitness_evals[i]) / len(fitness_evals[i])
            elif self.ccea_reduction == "leniency":
                fitness_evals[i] = max(fitness_evals[i])
            elif self.ccea_reduction == "min":
                fitness_evals[i] = min(fitness_evals[i])
            else:
                sys.exit('Incorrect CCEA Reduction scheme')

    # Append new fitness to lineage
    lineage_scores = []  # Tracks the average lineage score for the generation
    for ind, fitness in zip(net_inds, fitness_evals):
        self.lineage[ind].append(fitness)
        # Current fitness is weighted higher than lineage info
        lineage_scores.append(0.75 * sum(self.lineage[ind]) / len(self.lineage[ind]) + 0.25 * fitness)
        if len(self.lineage[ind]) > self.lineage_depth:
            self.lineage[ind].pop(0)  # Housekeeping

    # Entire epoch is handled with indices; rank nets by fitness evaluation (0 is the best after reversing)
    index_rank = self.list_argsort(fitness_evals)
    index_rank.reverse()
    elitist_index = index_rank[:self.num_elites]  # Elitist indices safeguard

    # Lineage rankings to elitists
    lineage_rank = self.list_argsort(lineage_scores[:])
    lineage_rank.reverse()
    elitist_index = elitist_index + lineage_rank[:int(self.num_elites)]

    # Take out copies in elitist indices
    elitist_index = list(set(elitist_index))

    # Selection step
    offsprings = self.selection_tournament(index_rank,
                                           num_offsprings=len(index_rank) - len(elitist_index) - len(migration),
                                           tournament_size=3)

    # Transcribe ranked indices from now on to refer to net indices
    elitist_index = [net_inds[i] for i in elitist_index]
    offsprings = [net_inds[i] for i in offsprings]

    # Figure out unselected candidates
    unselects = []; new_elitists = []
    for i in range(len(pop)):
        if i in offsprings or i in elitist_index:
            continue
        else:
            unselects.append(i)
    random.shuffle(unselects)

    # Inheritance step (sync learners to population)
    for policy in migration:
        replacee = unselects.pop(0)
        utils.hard_update(target=pop[replacee], source=policy)
        # wwid = genealogy.asexual(int(policy.wwid.item()))
        # pop[replacee].wwid[0] = wwid
        self.lineage[replacee] = [sum(lineage_scores) / len(lineage_scores)]  # Initialize as average

    # Elitism step, assigning elite candidates to some unselects
    for i in elitist_index:
        if len(unselects) >= 1:
            replacee = unselects.pop(0)
        elif len(offsprings) >= 1:
            replacee = offsprings.pop(0)
        else:
            continue
        new_elitists.append(replacee)
        utils.hard_update(target=pop[replacee], source=pop[i])
        # wwid = genealogy.asexual(int(pop[i].wwid.item()))
        # pop[replacee].wwid[0] = wwid
        # genealogy.elite(wwid, gen)
        self.lineage[replacee] = self.lineage[i][:]

    # Crossover for unselected genes with 100 percent probability
    if len(unselects) % 2 != 0:  # Number of unselects left should be even
        unselects.append(unselects[random.randint(0, len(unselects) - 1)])
    for i, j in zip(unselects[0::2], unselects[1::2]):
        off_i = random.choice(new_elitists)
        off_j = random.choice(offsprings)
        utils.hard_update(target=pop[i], source=pop[off_i])
        utils.hard_update(target=pop[j], source=pop[off_j])
        self.crossover_inplace(pop[i], pop[j])
        # wwid1 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
        # wwid2 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen)
        # pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2
        blended_lineage = 0.5 * utils.list_mean(self.lineage[off_i]) + 0.5 * utils.list_mean(self.lineage[off_j])
        self.lineage[i] = [blended_lineage]
        self.lineage[j] = [blended_lineage]

    # Crossover for selected offsprings
    for i, j in zip(offsprings[0::2], offsprings[1::2]):
        if random.random() < self.crossover_prob:
            self.crossover_inplace(pop[i], pop[j])
            # wwid1 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
            # wwid2 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen)
            # pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2
            blended_lineage = 0.5 * utils.list_mean(self.lineage[i]) + 0.5 * utils.list_mean(self.lineage[j])
            self.lineage[i] = [blended_lineage]
            self.lineage[j] = [blended_lineage]

    # Mutate all genes in the population except the new elitists
    for i in range(len(pop)):
        if i not in new_elitists:  # Spare the new elitists
            if random.random() < self.mutation_prob:
                self.mutate_inplace(pop[i])
                # genealogy.mutation(int(pop[net_i].wwid.item()), gen)

    self.all_offs[:] = offsprings[:]
    return new_elitists[0]
def train(self, gen):
    """Main training loop to do rollouts, neuroevolution, and policy gradients

        Parameters:
            gen (int): Current epoch of training

        Returns:
            max_fitness, champ_eplen, all_fitness, all_eplens, all_shaped_fitness
    """
    ################ ROLLOUTS ##############
    # Start Evo rollouts
    for id, actor in enumerate(self.pop):
        if self.eval_flag[id]:
            self.evo_task_pipes[id][0].send(True)
            self.eval_flag[id] = False

    ########## SOFT-JOIN ROLLOUTS ############
    all_fitness = []; all_net_ids = []; all_eplens = []; all_shaped_fitness = []
    while True:
        for i in range(self.args.pop_size):
            if self.evo_result_pipes[i][0].poll():
                entry = self.evo_result_pipes[i][0].recv()
                all_fitness.append(entry[1]); all_net_ids.append(entry[0]); all_eplens.append(entry[2])
                self.frames_seen += entry[2]
                all_shaped_fitness.append(entry[3])
                self.eval_flag[i] = True

        # Soft-join (50%)
        if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac: break

    # Add ALL EXPERIENCE COLLECTED TO MEMORY concurrently
    for _ in range(len(self.exp_list)):
        exp = self.exp_list.pop()
        self.add_experience(exp[0], exp[1], exp[2], exp[3], exp[4], exp[5])
    ######################### END OF PARALLEL ROLLOUTS ################

    ############ PROCESS MAX FITNESS #############
    champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
    if max(all_fitness) > self.best_score:
        self.best_score = max(all_fitness)
        utils.hard_update(self.best_policy, self.pop[champ_index])
        if SAVE:
            torch.save(self.pop[champ_index].state_dict(), self.args.model_save + 'erl_best' + SAVE_TAG)
            print("Best policy saved with score", '%.2f' % max(all_fitness))

    # Save champion periodically
    if gen % 5 == 0 and max(all_fitness) > (self.best_score - 100) and SAVE:
        torch.save(self.pop[champ_index].state_dict(), self.args.model_save + 'champ' + SAVE_TAG)
        torch.save(self.pop[champ_index].state_dict(), self.args.rl_models + 'champ' + SAVE_TAG)
        print("Champ saved with score ", '%.2f' % max(all_fitness))

    if gen % 20 == 0 and SAVE:
        torch.save(self.pop[self.evolver.lineage.index(max(self.evolver.lineage))].state_dict(),
                   self.args.model_save + 'eugenic_champ' + SAVE_TAG)
        print("Eugenic Champ saved with score ", '%.2f' % max(self.evolver.lineage))

    if USE_RS:
        all_shaped_fitness = np.array(all_shaped_fitness)
        if self.best_shaped_score is None:
            # First-time run: size best_shaped_score to track a variable number of shaped fitnesses
            self.best_shaped_score = [0.0 for _ in range(all_shaped_fitness.shape[1])]

        max_shaped_fit = [max(a) for a in all_shaped_fitness.transpose()]
        for metric_id in range(len(max_shaped_fit)):
            if max_shaped_fit[metric_id] > self.best_shaped_score[metric_id]:
                self.best_shaped_score[metric_id] = max_shaped_fit[metric_id]
                shaped_champ_ind = all_net_ids[np.argmax(all_shaped_fitness[:, metric_id])]
                if SAVE:
                    torch.save(self.pop[shaped_champ_ind].state_dict(),
                               self.args.model_save + 'shaped_erl_best' + str(metric_id) + SAVE_TAG)
                    print("Best Shaped ERL policy saved with true score",
                          '%.2f' % all_fitness[np.argmax(all_shaped_fitness[:, metric_id])],
                          'and shaped score of ', '%.2f' % max_shaped_fit[metric_id],
                          'for metric id', str(metric_id))
    else:
        max_shaped_fit = None

    # NeuroEvolution's probabilistic selection and recombination step
    self.evolver.epoch(self.pop, all_net_ids, all_fitness, all_shaped_fitness)

    # Sync RL Agent to NE periodically
    if gen % 5 == 0:
        self.evolver.sync_rl(self.args.rl_models, self.pop)

    return max(all_fitness), all_eplens[all_fitness.index(max(all_fitness))], all_fitness, all_eplens, all_shaped_fitness
def rollout_worker(args, worker_id, task_pipe, result_pipe, noise, data_bucket, models_bucket, model_template):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

        Parameters:
            args (object): Parameter class
            worker_id (int): Specific Id unique to each worker spun
            task_pipe (pipe): Receiver end of the task pipe used to receive the signal to start a rollout
            result_pipe (pipe): Sender end of the pipe used to report back results
            noise (object): A noise generator object
            data_bucket (list of shared lists): Shared lists managed by a manager, used to store experience tuples
            models_bucket (shared list object): A shared list object managed by a manager, used to store all the models (actors)
            model_template (object): Actor template used to instantiate the worker's local models

        Returns:
            None
    """
    env = Task_Rovers(args)
    # Distinct copies per rover (requires `import copy`); a shared reference
    # would make every rover run the same net after the hard_update loop below
    models = [copy.deepcopy(model_template) for _ in range(args.num_rover)]
    for m in models: m.eval()

    while True:
        RENDER = task_pipe.recv()  # Wait until a signal is received to start rollout

        # Get the current model state from the population
        for m, bucket_model in zip(models, models_bucket):
            utils.hard_update(m, bucket_model)

        fitness = 0.0
        joint_state = env.reset()
        rollout_trajectory = [[] for _ in range(args.num_rover)]
        joint_state = utils.to_tensor(np.array(joint_state))

        while True:  # Unless done
            joint_action = [models[i].forward(joint_state[i, :]).detach().numpy()
                            for i in range(args.num_rover)]
            if noise is not None:
                for action in joint_action:
                    action += noise.noise()

            next_state, reward, done, info = env.step(joint_action)  # Simulate one step in environment
            next_state = utils.to_tensor(np.array(next_state))
            fitness += sum(reward) / args.coupling

            # Store transitions
            for i in range(args.num_rover):
                rollout_trajectory[i].append([
                    np.expand_dims(utils.to_numpy(joint_state)[i, :], 0),
                    np.expand_dims(np.array(joint_action)[i, :], 0),
                    np.expand_dims(utils.to_numpy(next_state)[i, :], 0),
                    np.expand_dims(np.array([reward[i]]), 0),
                    np.expand_dims(np.array([done]), 0)
                ])
            joint_state = next_state

            # DONE FLAG IS received
            if done:
                if RENDER: env.render()
                # Push experiences to main
                for rover_id in range(args.num_rover):
                    for entry in rollout_trajectory[rover_id]:
                        for i in range(len(entry[0])):
                            data_bucket[rover_id].append([entry[0], entry[1], entry[2], entry[3], entry[4]])
                break

        # Send back fitness using the result pipe
        result_pipe.send([fitness])
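# A hedged sketch of how rollout_worker above would be wired up from the main
# process. The `args` fields used here (num_rover) come from the signatures
# above; the manager-backed buckets mirror the docstring's description, but
# the exact setup in this repo may differ.
from multiprocessing import Process, Pipe, Manager

def launch_worker(args, worker_id, model_template):
    manager = Manager()
    data_bucket = [manager.list() for _ in range(args.num_rover)]       # one shared list per rover
    models_bucket = manager.list([model_template for _ in range(args.num_rover)])
    task_pipe, result_pipe = Pipe(), Pipe()
    p = Process(target=rollout_worker,
                args=(args, worker_id, task_pipe[1], result_pipe[0], None,
                      data_bucket, models_bucket, model_template))
    p.daemon = True
    p.start()
    task_pipe[0].send(False)          # False -> do not render this rollout
    fitness = result_pipe[1].recv()   # -> [fitness]
    return fitness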