class CERL_Agent:
    """Main CERL class containing all methods for CERL

    Parameters:
        args (int): Parameter class with all the parameters

    """

    def __init__(self, args):  # need to initialize rollout_workers to have blue agent
        self.args = args
        self.evolver = SSNE(self.args)  # this evolver implements neuro-evolution

        # MP TOOLS
        self.manager = Manager()
        self.mutate_algos = [Mutation_Add(self), Mutation_Delete(self), Mutation_Exchange(self)]  # store all the mutate algorithm objects

        # Genealogy tool
        self.genealogy = Genealogy()

        # Init BUFFER
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        # if SA_FLAG:
        self.metrics = []
        self.last_portfolio = None
        self.T_max = 30
        self.T = self.T_max
        self.T_min = 0.2
        self.decay_rate = 0.975

        # Initialize population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid))
            elif ALGO == 'TD3':
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid, ALGO))  # use ALGO to distinguish different net architectures
            elif ALGO in ('dis', 'TD3_tennis'):
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid, ALGO))
            else:
                assert False, "invalid algorithm type"

        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1, ALGO)

        if ALGO == 'dis':
            self.average_policy = AverageActor(args.state_dim, args.action_dim, -2, ALGO, self.pop,
                                               self.replay_buffer, args.buffer_gpu, args.batch_size,
                                               iterations=10)
            self.average_policy.share_memory()

        self.best_policy.share_memory()  # added by macheng, share the best policy across processes (used as internal belief update model for blue)

        # now we assign a shared blue_trainer; we should train this agent so that the rollout workers are also up to date
        # should make sure that self.best_policy (emergent learner) is also shared
        if ALGO in ('dis', 'TD3_tennis'):
            assert hasattr(args, "blue_trainer"), \
                "must have blue_agent trainer to initialize rollout_worker, see line 109, class Parameter definition"
        if ALGO == 'dis':
            trainers = [args.blue_trainer, self.average_policy]
        else:
            trainers = [args.blue_trainer, None] if ALGO == 'TD3_tennis' else []

        self.trainers = trainers
        self.blue_dqn = args.blue_trainer

        # Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()

        # Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID)
        self.complement_portfolio = []  # complement of the portfolio; whatever is not in the portfolio is stored here
        self.total_rollout_bucket = self.manager.list()  # macheng: total_rollout_bucket holds the whole set of rollout models; rollout_bucket is dynamically resized according to the portfolio, for SA
        self.rollout_bucket = self.total_rollout_bucket
        # self.rollout_bucket = self.manager.list()
        # print("rollout_bucket needs to be updated, main.py line 239")
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
            else:
                self.rollout_bucket.append(Actor(args.state_dim, args.action_dim, -1, ALGO))

        # Initialize shared data bucket
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################
        # Evolutionary population Rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [Process(target=rollout_worker,
                                    args=(id, 0, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0],
                                          False, self.data_bucket, self.pop, ENV_NAME, None, ALGO, self.trainers))
                            for id in range(args.pop_size)]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        # Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker,
                                args=(id, 1, self.task_pipes[id][1], self.result_pipes[id][0],
                                      True, self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO, self.trainers))
                        for id in range(args.rollout_size)]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
        else:
            self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1, ALGO))

        # 5 Test workers
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [Process(target=rollout_worker,
                                     args=(id, 2, self.test_task_pipes[id][1], self.test_result_pipes[id][0],
                                           False, None, self.test_bucket, ENV_NAME, args.noise_std, ALGO, self.trainers))
                             for id in range(TEST_SIZE)]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        # Meta-learning controller (Resource Distribution)
        self.allocation = []  # Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(i % len(self.portfolio))  # Start uniformly (equal resources)
        # self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count': 0} for _ in range(len(self.portfolio))]  # Track node statistics (each node is a learner) to compute UCB scores

        # Trackers
        self.best_score = -np.inf
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None

    # trainer contains the blue_dqn to be trained and the red model used for belief update; red_actor is the actual red agent trained against
    # id is the actual red agent id
    def _update_SA_temperature(self):
        self.T = max(self.T * self.decay_rate, self.T_min)

    def _get_accept_rate(self):
        # (a standalone sketch of this acceptance rule and cooling schedule follows this class)
        if RANDOM_WALK:
            return 1.0
        else:
            if self.metrics[-1] > self.metrics[-2]:
                return 1.0
            else:
                return np.exp((self.metrics[-1] - self.metrics[-2]) / self.T)

    def _mutate(self):
        while True:
            mutate_algo_index = random.choice(range(3))
            if self._try_mutate(mutate_algo_index):
                return

    def _try_mutate(self, algo_index):
        # 0 for add, 1 for delete, 2 for exchange
        return self.mutate_algos[algo_index].try_mutate()

    def simulated_annealing(self, metric):  # take in the current metric
        self.metrics.append(metric)
        if self.last_portfolio:  # has last_portfolio
            accept_rate = self._get_accept_rate()  # based on self.metrics[-2:]
            self._update_SA_temperature()
            if np.random.random() > accept_rate:  # reject
                self.portfolio = self.last_portfolio
                self.complement_portfolio = self.last_complement_portfolio

        self.last_portfolio = copy.copy(self.portfolio)  # maintain a shallow copy
        self.last_complement_portfolio = copy.copy(self.complement_portfolio)
        self._mutate()  # perturb the portfolio

        # update rollout_bucket size; only the first len(self.portfolio) rollout buckets are visible
        self.update_rollout_bucket()
        # update allocation, to be compatible with the current portfolio
        self.update_allocation()

    def update_rollout_bucket(self):
        self.rollout_bucket = self.total_rollout_bucket[:len(self.portfolio)]

    def train_blue_dqn(self, trainers, env_name, gen, ALGO='dis', pomdp_adv=False):
        # in this method, rollout and training are done together; the opponent is sampled from the population
        NUM_EPISODE = 100  # train 100 episodes for the blue to converge to the new best response to red
        EPS_START = max(1.0 * 0.5**(gen - 10), 0.15) if gen >= 10 else 1.0  # initial epsilon
        EPS_END = 0.05
        EPS_DECAY = 0.995

        if ALGO == 'dis':  # make env with blue and red policy agent inside, assert trainers is not None
            dis_env = make_self_play_env(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                return_policy_agent=False,
                trainers=trainers)[0]  # if trainers is not None, first is the shared DQN agent, second is the best red policy
            env = EnvironmentWrapper(env_name, ALGO, dis_env, 0)  # the "0" is the index for training the blue agent
        elif ALGO == 'TD3_tennis':
            no_graphics = not RENDER
            tennis_env = make_tennis_env.TennisEnvFactory(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                no_graphics=no_graphics,
                pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', ALGO, tennis_env, 0)
        else:
            env = EnvironmentWrapper(env_name, ALGO)

        blue_dqn = trainers[0]
        average_reward = 0
        eps = EPS_START

        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0

        for it in range(NUM_EPISODE):
            if not pomdp_adv:  # if pomdp_adv, make sure that TD3_actor is never used
                id = np.random.choice(np.array(range(len(self.pop))))
                red_actor = self.pop[id]
                env.set_TD3_actor(red_actor)

            fitness = 0.0  # here fitness is simply the reward
            total_frame = 0
            state = env.reset()
            env.randomize_neu_adv()

            if pomdp_adv:
                env.try_set_pomdp_adv()  # try to set the opponent to pomdp adv if the opponent is an adversary, else do nothing

            render_flag = (np.random.random() < 0.05)
            while True:  # unless done
                action = blue_dqn.act(state, eps=eps)
                # action = utils.to_numpy(action)
                next_state, reward, done, info = env.step(copy.deepcopy(action), use_actual_reward=DRQN)
                # after calling env.step, an evaluator initialized later does not work
                # should be something wrong with the internal red model?
                blue_dqn.step(state, action, reward, next_state, done)

                if render_flag and self.args.render:
                    env.render()
                # next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
                state = next_state
                fitness += reward
                total_frame += 1

                # DONE FLAG IS Received
                if done:
                    average_red_reward += env.get_red_reward() if env.get_red_reward() is not None else 0
                    average_actual_blue_reward += env.get_blue_actual_reward() if env.get_blue_actual_reward() is not None else 0
                    red_count += 1 if env.get_red_reward() is not None else 0
                    blue_count += 1 if env.get_blue_actual_reward() is not None else 0
                    if render_flag:
                        env.env.close()
                    break

            average_reward += fitness
            eps = max(EPS_END, EPS_DECAY * eps)

        if gen >= 10 and gen % 5 == 0:
            blue_dqn.save_net('./pytorch_models/train_blue_dqn_step_' + str(gen) + '.pth')

        average_reward /= NUM_EPISODE
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward

    def evaluate_training_fixed_blue(self):  # evaluate against the training opponents (red pop)
        self.evaluator.pomdp_adv = False
        return self.evaluator.evaluate_fixed_agents(self.trainers[0], self.trainers[1], self.pop)

    def train(self, gen, frame_tracker):
        """Main training loop to do rollouts, neuroevolution, and policy gradients

        Parameters:
            gen (int): Current epoch of training

        Returns:
            None
        """
        ################ START ROLLOUTS ##############
        # Start Evolution rollouts
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send((id, gen))
                    self.evo_flag[id] = False

        # Sync all learners' actors to the cpu (rollout) actors
        # (update the rollout parameters from the learner parameters, so that the rollout workers are up to date)
        for i, learner in enumerate(self.portfolio):  # number of learners
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)  # rollout bucket is now synchronized with the learner to perform rollouts for learner actors
            if torch.cuda.is_available():
                learner.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id, learner_id in enumerate(self.allocation):  # number of rollout_size
            if self.roll_flag[rollout_id]:
                # allocation records the id of the learner that the bucket should run, so rollout_id is the id of the rollout_bucket
                self.task_pipes[rollout_id][0].send((learner_id, gen))
                self.roll_flag[rollout_id] = False

        # Start Test rollouts
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send((0, gen))

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        # main training loop
        if self.replay_buffer.__len__() > self.args.batch_size * 10:  ### BURN IN PERIOD
            self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling

            # Spin up threads for each learner
            threads = [threading.Thread(target=learner.update_parameters,
                                        args=(self.replay_buffer, self.args.buffer_gpu, self.args.batch_size,
                                              int(self.gen_frames * self.args.gradperstep)))
                       for learner in self.portfolio]  # macheng: do we want to train all the learners?
            # Start threads
            for thread in threads:
                thread.start()

            # Join threads
            for thread in threads:
                thread.join()

            # Now update average_policy
            # self.average_policy.cuda()
            if ALGO == 'dis':
                self.average_policy.update()  # update the average_policy parameters with supervised learning

            self.gen_frames = 0

            ######### Visualize Learner Critic Function #################
            # if self.replay_buffer.__len__() % 2500 == 0:
            #     visualize_critic(self.portfolio[2], make_self_play_env(trainers=[[],[]])[0], 50)  # arguments: Learner, env, N_GRID

        ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%)
                if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        for i in range(self.args.rollout_size):
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score:
                self.best_score = fitness

            self.roll_flag[i] = True

        # Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        # ms: best policy is always up to date
        # so here the best learner is saved
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(self.pop[champ_index].state_dict(),
                               self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score", '%.2f' % max(all_fitness))
        else:  # Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  # Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = np.std(test_scores)

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        # NeuroEvolution's probabilistic selection and recombination step
        # ms: this epoch() method implements neuro-evolution
        if not ISOLATE_PG:  # seems pop_size and rollout_size must be 10, otherwise this will produce an error
            if gen % 5 == 0:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness,
                                   self.rollout_bucket)  # this method also copies the learners to the evolver
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, [])

        # META LEARNING - RESET ALLOCATION USING UCB
        if gen % 1 == 0:
            self.update_allocation()

        # Metrics
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid

    def update_allocation(self):
        self.allocation = ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient)

    def sim_and_eval_POMDP(self):
        self.evaluator = Evaluator(self, 5, self.trainers, pomdp_adv=True)  # evaluator must be created before train_dqn
        for gen in range(1000000):
            print('gen=', gen)
            blue_score, red_score, actual_blue_score = self.train_blue_dqn(self.trainers, ENV_NAME, gen,
                                                                           ALGO='dis', pomdp_adv=True)
            print('Env', ENV_NAME, 'Gen', gen,
                  ", Training average: Blue agent score: ", blue_score,
                  " Red score: ", red_score,
                  " Actual blue score: ", actual_blue_score)
            blue_score, red_score, actual_blue_score = self.evaluator.evaluate()
            print("Evaluation result: Blue agent score: ", blue_score,
                  " Red score: ", red_score,
                  " Actual blue score: ", actual_blue_score)
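
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the Metropolis-style
# acceptance rule and geometric cooling schedule that
# CERL_Agent.simulated_annealing / _get_accept_rate / _update_SA_temperature
# use above, written as standalone functions so the criterion is easy to see
# in isolation. The function names and the example numbers below are
# hypothetical; the class above remains the authoritative version.
import numpy as np


def accept_probability(new_metric, old_metric, temperature):
    # Always accept an improvement; otherwise accept with probability
    # exp((new - old) / T), which shrinks as the temperature cools.
    if new_metric > old_metric:
        return 1.0
    return float(np.exp((new_metric - old_metric) / temperature))


def decay_temperature(temperature, decay_rate=0.975, t_min=0.2):
    # Geometric cooling floored at t_min (same defaults as the class above).
    return max(temperature * decay_rate, t_min)


# e.g. accept_probability(-5.0, -3.0, 30.0) ~= 0.94    (worse move, hot: usually accepted)
#      accept_probability(-5.0, -3.0, 0.2)  ~= 4.5e-5  (worse move, cold: almost never)
#      decay_temperature(30.0)              == 29.25
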
class CERL_Agent:
    """Main CERL class containing all methods for CERL

    Parameters:
        args (int): Parameter class with all the parameters

    """

    def __init__(self, args):
        self.args = args
        self.evolver = SSNE(self.args)

        # MP TOOLS
        self.manager = Manager()

        # Genealogy tool
        self.genealogy = Genealogy()

        # Initialize population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid))
            else:
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid))

        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1)

        # Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()

        # Init BUFFER
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        # Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID)
        self.rollout_bucket = self.manager.list()
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
            else:
                self.rollout_bucket.append(Actor(args.state_dim, args.action_dim, -1))

        # Initialize shared data bucket
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################
        # Evolutionary population Rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [Process(target=rollout_worker,
                                    args=(id, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0],
                                          False, self.data_bucket, self.pop, ENV_NAME, None, ALGO))
                            for id in range(args.pop_size)]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        # Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker,
                                args=(id, self.task_pipes[id][1], self.result_pipes[id][0],
                                      True, self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO))
                        for id in range(args.rollout_size)]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
        else:
            self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1))

        # 5 Test workers
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [Process(target=rollout_worker,
                                     args=(id, self.test_task_pipes[id][1], self.test_result_pipes[id][0],
                                           False, None, self.test_bucket, ENV_NAME, args.noise_std, ALGO))
                             for id in range(TEST_SIZE)]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        # Meta-learning controller (Resource Distribution)
        self.allocation = []  # Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(i % len(self.portfolio))  # Start uniformly (equal resources)
        # self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count': 0} for _ in range(len(self.portfolio))]  # Track node statistics (each node is a learner) to compute UCB scores
        # Trackers
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None

    def train(self, gen, frame_tracker):
        """Main training loop to do rollouts, neuroevolution, and policy gradients

        Parameters:
            gen (int): Current epoch of training

        Returns:
            None
        """
        ################ START ROLLOUTS ##############
        # Start Evolution rollouts
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send(id)
                    self.evo_flag[id] = False

        # Sync all learners' actors to the cpu (rollout) actors
        for i, learner in enumerate(self.portfolio):
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
            learner.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id, learner_id in enumerate(self.allocation):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(learner_id)
                self.roll_flag[rollout_id] = False

        # Start Test rollouts
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__() > self.args.batch_size * 10:  ### BURN IN PERIOD
            self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling

            # Spin up threads for each learner
            threads = [threading.Thread(target=learner.update_parameters,
                                        args=(self.replay_buffer, self.args.buffer_gpu, self.args.batch_size,
                                              int(self.gen_frames * self.args.gradperstep)))
                       for learner in self.portfolio]

            # Start threads
            for thread in threads:
                thread.start()

            # Join threads
            for thread in threads:
                thread.join()

            self.gen_frames = 0

        ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%)
                if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        for i in range(self.args.rollout_size):
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score:
                self.best_score = fitness

            self.roll_flag[i] = True

        # Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(self.pop[champ_index].state_dict(),
                               self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score", '%.2f' % max(all_fitness))
        else:  # Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  # Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = np.std(test_scores)

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        # NeuroEvolution's probabilistic selection and recombination step
        if not ISOLATE_PG:
            if gen % 5 == 0:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, self.rollout_bucket)
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, [])

        # META LEARNING - RESET ALLOCATION USING UCB
        # (an illustrative standalone UCB sketch follows this class)
        if gen % 1 == 0:
            self.allocation = ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient)

        # Metrics
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
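
# ---------------------------------------------------------------------------
# Illustrative sketch (not from the codebase): the kind of allocation rule the
# `ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient)` call
# above is expected to compute, i.e. mapping each rollout worker to a learner
# by an upper-confidence score built from the statistics that
# `learner.update_stats(fitness, num_frames)` accumulates. The `values` /
# `visit_counts` inputs mirror the commented-out `learner_stats` fields above;
# the real `ucb` helper's signature and allocation strategy (e.g. sampling
# workers in proportion to the scores rather than greedily) may differ.
import math


def ucb_allocation(num_rollouts, values, visit_counts, ucb_coefficient=0.9):
    """Return one learner index per rollout worker (greedy illustrative variant)."""
    total_visits = max(sum(visit_counts), 1)
    scores = []
    for value, visits in zip(values, visit_counts):
        if visits == 0:
            scores.append(float('inf'))  # unvisited learners get top priority
        else:
            exploit = value / visits
            explore = ucb_coefficient * math.sqrt(math.log(total_visits) / visits)
            scores.append(exploit + explore)
    best = max(range(len(scores)), key=lambda i: scores[i])
    return [best] * num_rollouts


# e.g. ucb_allocation(10, values=[12.0, 30.0, 8.0], visit_counts=[4, 5, 3])
# -> [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]  (learner 1 has the highest mean fitness)
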
class ERL_Agent:
    """Main ERL class containing all methods for ERL

    Parameters:
        args (int): Parameter class with all the parameters

    """

    def __init__(self, args):
        self.args = args
        self.evolver = SSNE(self.args)

        # MP TOOLS
        self.manager = Manager()

        # Init population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            self.pop.append(Actor(args))
            # self.pop[-1].apply(utils.init_weights)
        self.best_policy = Actor(args)

        # Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()
        if SEED_POP:
            self.load_seed(args.model_save, self.pop)

        # Init BUFFER
        self.replay_buffer = Buffer(100000, self.args.data_folder)

        # MP TOOLS
        self.exp_list = self.manager.list()

        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [Process(target=rollout_worker,
                                    args=(i, self.evo_task_pipes[i][1], self.evo_result_pipes[i][1], None,
                                          self.exp_list, self.pop, DIFFICULTY, USE_RS, True, USE_SYNTHETIC_TARGET,
                                          XBIAS, ZBIAS, PHASE_LEN, None, EP_LEN, JGS))
                            for i in range(args.pop_size)]
        for worker in self.evo_workers:
            worker.start()

        # Trackers
        self.buffer_added = 0
        self.best_score = 0.0
        self.frames_seen = 0.0
        self.best_shaped_score = None
        self.eval_flag = [True for _ in range(args.pop_size)]

    def load_seed(self, dir, pop):
        """Read models from drive and sync them into the population

        Parameters:
            dir (str): Folder location to pull models from
            pop (shared_list): population of models

        Returns:
            None
        """
        list_files = os.listdir(dir)
        print(list_files)
        for i, model in enumerate(list_files):
            try:
                pop[i].load_state_dict(torch.load(dir + model))
                pop[i].eval()
            except:
                print(model, 'Failed to load')

    def add_experience(self, state, action, next_state, reward, done_probs, done):
        """Process and send experiences to be added to the buffer

        Parameters:
            state (ndarray): Current State
            next_state (ndarray): Next State
            action (ndarray): Action
            reward (ndarray): Reward
            done_probs (ndarray): Temporal distance to done (# action steps after which the skeleton fell over)
            done (ndarray): Done

        Returns:
            None
        """
        self.buffer_added += 1
        self.replay_buffer.push(state, next_state, action, reward, done_probs, done)
        if self.buffer_added % 100000 == 0:
            self.replay_buffer.save()

    def train(self, gen):
        """Main training loop to do rollouts, neuroevolution, and policy gradients

        Parameters:
            gen (int): Current epoch of training

        Returns:
            None
        """
        ################ ROLLOUTS ##############
        # Start Evo rollouts
        for id, actor in enumerate(self.pop):
            if self.eval_flag[id]:
                self.evo_task_pipes[id][0].send(True)
                self.eval_flag[id] = False

        ########## SOFT-JOIN ROLLOUTS ############
        # (a minimal standalone example of this soft-join pattern follows this class)
        all_fitness = []
        all_net_ids = []
        all_eplens = []
        all_shaped_fitness = []
        while True:
            for i in range(self.args.pop_size):
                if self.evo_result_pipes[i][0].poll():
                    entry = self.evo_result_pipes[i][0].recv()
                    all_fitness.append(entry[1])
                    all_net_ids.append(entry[0])
                    all_eplens.append(entry[2])
                    self.frames_seen += entry[2]
                    all_shaped_fitness.append(entry[3])
                    self.eval_flag[i] = True

            # Soft-join (50%)
            if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                break

        # Add ALL EXPERIENCE COLLECTED TO MEMORY concurrently
        for _ in range(len(self.exp_list)):
            exp = self.exp_list.pop()
            self.add_experience(exp[0], exp[1], exp[2], exp[3], exp[4], exp[5])

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
        if max(all_fitness) > self.best_score:
            self.best_score = max(all_fitness)
            utils.hard_update(self.best_policy, self.pop[champ_index])
            if SAVE:
                torch.save(self.pop[champ_index].state_dict(), self.args.model_save + 'erl_best' + SAVE_TAG)
                print("Best policy saved with score", '%.2f' % max(all_fitness))

        # Save champion periodically
        if gen % 5 == 0 and max(all_fitness) > (self.best_score - 100) and SAVE:
            torch.save(self.pop[champ_index].state_dict(), self.args.model_save + 'champ' + SAVE_TAG)
            torch.save(self.pop[champ_index].state_dict(), self.args.rl_models + 'champ' + SAVE_TAG)
            print("Champ saved with score ", '%.2f' % max(all_fitness))

        if gen % 20 == 0 and SAVE:
            torch.save(self.pop[self.evolver.lineage.index(max(self.evolver.lineage))].state_dict(),
                       self.args.model_save + 'eugenic_champ' + SAVE_TAG)
            print("Eugenic Champ saved with score ", '%.2f' % max(self.evolver.lineage))

        if USE_RS:
            all_shaped_fitness = np.array(all_shaped_fitness)
            if self.best_shaped_score is None:
                # First-time run (set the best shaped score size to track a variable # of shaped fitnesses)
                self.best_shaped_score = [0.0 for _ in range(all_shaped_fitness.shape[1])]

            max_shaped_fit = [max(a) for a in all_shaped_fitness.transpose()]
            for metric_id in range(len(max_shaped_fit)):
                if max_shaped_fit[metric_id] > self.best_shaped_score[metric_id]:
                    self.best_shaped_score[metric_id] = max_shaped_fit[metric_id]
                    shaped_champ_ind = all_net_ids[np.argmax(all_shaped_fitness[:, metric_id])]
                    if SAVE:
                        torch.save(self.pop[shaped_champ_ind].state_dict(),
                                   self.args.model_save + 'shaped_erl_best' + str(metric_id) + SAVE_TAG)
                        print("Best Shaped ERL policy saved with true score",
                              '%.2f' % all_fitness[np.argmax(all_shaped_fitness[:, metric_id])],
                              'and shaped score of ', '%.2f' % max_shaped_fit[metric_id],
                              'for metric id', str(metric_id))
        else:
            max_shaped_fit = None

        # NeuroEvolution's probabilistic selection and recombination step
        self.evolver.epoch(self.pop, all_net_ids, all_fitness, all_shaped_fitness)

        # Sync RL Agent to NE periodically
        if gen % 5 == 0:
            self.evolver.sync_rl(self.args.rl_models, self.pop)

        return max(all_fitness), all_eplens[all_fitness.index(max(all_fitness))], all_fitness, all_eplens, all_shaped_fitness
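
# ---------------------------------------------------------------------------
# Illustrative sketch (not from the codebase): the asynchronous "soft-join"
# used by the train() methods above, reduced to a self-contained toy example.
# Rollout workers report over Pipes, and the main process keeps polling until
# a fraction (`asynch_frac`, e.g. 0.5) of them has reported instead of
# blocking on every worker. `toy_rollout_worker`, `soft_join`, and the random
# workload are hypothetical stand-ins for the real rollout_worker.
import random
import time
from multiprocessing import Pipe, Process


def toy_rollout_worker(worker_id, conn):
    # Stand-in for rollout_worker: pretend an episode takes a random time,
    # then send back (net_id, fitness, episode_length).
    time.sleep(random.uniform(0.1, 1.0))
    conn.send((worker_id, random.random(), random.randint(100, 1000)))
    conn.close()


def soft_join(result_pipes, pop_size, asynch_frac=0.5):
    # Poll every result pipe and collect whatever has arrived; stop as soon as
    # the requested fraction of the population has reported.
    results = []
    collected = [False] * pop_size
    while len(results) / pop_size < asynch_frac:
        for i, (parent_end, _child_end) in enumerate(result_pipes):
            if not collected[i] and parent_end.poll():
                results.append(parent_end.recv())
                collected[i] = True
        time.sleep(0.01)  # avoid a tight busy-wait in this toy example
    return results


if __name__ == '__main__':
    POP_SIZE = 8
    pipes = [Pipe() for _ in range(POP_SIZE)]
    workers = [Process(target=toy_rollout_worker, args=(i, pipes[i][1]))
               for i in range(POP_SIZE)]
    for w in workers:
        w.start()
    print(soft_join(pipes, POP_SIZE))  # returns once about half the workers have reported
    for w in workers:
        w.join()
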