def train(flags):  # pylint: disable=too-many-branches, too-many-statements
    ray.init()
    if flags.xpid is None:
        flags.xpid = "torchbeast-%s" % time.strftime("%Y%m%d-%H%M%S")
    flags.replay_batch_size = int(flags.batch_size * flags.replay_ratio)

    stat_keys = [
        "total_loss",
        "mean_episode_return",
        "pg_loss",
        "baseline_loss",
        "entropy_loss",
    ]
    logger = logging.getLogger("logfile")

    flags.device = None
    if not flags.disable_cuda and torch.cuda.is_available():
        logger.error("Using CUDA.")
        flags.device = torch.device("cuda")
    else:
        logger.error("Not using CUDA.")
        flags.device = torch.device("cpu")

    env = create_env(flags)

    actors = []
    for i in range(flags.num_actors):
        actors.append(Actor.remote(flags, i))

    learner = Learner.remote(flags, actors, env.observation_space.shape[0],
                             env.action_space.n, stat_keys)
    learner_handle = learner.train.remote()
    ray.wait([learner_handle])
    ray.wait([actors[0].print_timings.remote()])
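
# Minimal usage sketch (not from the original file): build a flags object with the
# fields train() reads directly and launch training. create_env, Actor, and Learner
# above will typically require additional flags beyond those listed here, so the
# actual call is left commented out.
from types import SimpleNamespace

example_flags = SimpleNamespace(
    xpid=None,          # filled in by train() with a timestamped id
    batch_size=32,
    replay_ratio=0.5,   # replay_batch_size = int(batch_size * replay_ratio)
    disable_cuda=False,
    num_actors=4,
)
# train(example_flags)  # requires the remaining flags expected by create_env/Actor/Learner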
def __init__(self):
    """Parameter class stores all parameters for policy gradient

    Parameters:
        None

    Returns:
        None
    """
    self.seed = SEED
    self.asynch_frac = 1.0  # Asynchronicity of NeuroEvolution
    self.algo = ALGO
    self.drqn = DRQN
    self.isolate_pg = ISOLATE_PG
    self.render = RENDER
    self.batch_size = BATCHSIZE  # Batch size
    self.noise_std = 0.1  # Std of Gaussian exploration noise
    self.ucb_coefficient = 0.25  # Exploration coefficient in UCB (was 0.9)
    self.gradperstep = GRADPERSTEP
    self.buffer_gpu = BUFFER_GPU
    self.rollout_size = ROLLOUT_SIZE  # Size of learner rollouts

    # NeuroEvolution stuff
    self.pop_size = POP_SIZE
    self.elite_fraction = 0.2
    self.crossover_prob = 0.15
    self.mutation_prob = 0.90

    ####### unused ########
    self.extinction_prob = 0.005  # Probability of an extinction event
    # Probability of extinction for each genome, given an extinction event
    self.extinction_magnituide = 0.5
    self.weight_magnitude_limit = 10000000
    self.mut_distribution = 1  # 1-Gaussian, 2-Laplace, 3-Uniform

    if ALGO == 'dis':
        a = make_self_play_env(trainers=[[], []])  # does not actually need trainers; we only want blue_agent_trainer
        dummy_env, blue_agent_trainer = make_self_play_env(
            trainers=[[], []], blue_use_drqn=DRQN)  # blue_agent_trainer is actually a list of two trainers
        self.blue_trainer = blue_agent_trainer[0]
        self.blue_trainer.share_memory()
        self.action_dim = dummy_env.action_dim
        self.state_dim = dummy_env.state_dim
        self.action_low = 0
        self.action_high = 1
    elif ALGO == 'TD3_tennis':
        no_graphics = not RENDER
        dummy_env, self.action_dim, self.state_dim = make_tennis_env.TennisEnvFactory(
            seed=SEED, no_graphics=no_graphics, pid=-1).getEnv()
        self.action_low = -1.0
        self.action_high = +1.0  # according to the Unity documentation
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': self.action_low,
            'action_high': self.action_high,
            'cerl_args': self
        }
        self.blue_trainer = Learner(-1, 'TD3', self.state_dim, self.action_dim,
                                    actor_lr=5e-5, critic_lr=1e-3, gamma=0.99,
                                    tau=5e-3, init_w=True, **td3args)
        self.blue_trainer.share_memory()
    else:
        dummy_env = gym.make(ENV_NAME)
        self.state_dim = dummy_env.observation_space.shape[0]
        self.action_dim = dummy_env.action_space.shape[0]
        self.action_low = float(dummy_env.action_space.low[0])
        self.action_high = float(dummy_env.action_space.high[0])

    # Save Results
    self.savefolder = 'Results/'
    if not os.path.exists('Results/'):
        os.makedirs('Results/')
    if not os.path.exists('pytorch_models/'):
        os.makedirs('pytorch_models/')
    self.aux_folder = self.savefolder + 'Auxiliary/'
    if not os.path.exists(self.aux_folder):
        os.makedirs(self.aux_folder)
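
# Minimal usage sketch (not from the original file). Parameters takes no arguments;
# it reads module-level constants (SEED, ALGO, DRQN, BATCHSIZE, ...) assumed to be
# defined elsewhere in this file and derives state/action dimensions from a dummy env.
args = Parameters()
print('state_dim:', args.state_dim, 'action_dim:', args.action_dim)
print('action range: [%s, %s]' % (args.action_low, args.action_high))
print('results written under:', args.savefolder)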
def initialize_portfolio(portfolio, args, genealogy, portfolio_id):
    """Portfolio of learners

    Parameters:
        portfolio (list): Incoming list of learners
        args (object): Parameter class
        genealogy (object): Genealogy tracker used to assign wwids
        portfolio_id (int): Id selecting which set of learners to construct

    Returns:
        portfolio (list): Portfolio of learners
    """
    if portfolio_id == 10:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=0.9, tau=5e-3,
                    init_w=True, **td3args))

        # Learner 3
        wwid = genealogy.new_id('learner_3')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=0.99, tau=5e-3,
                    init_w=True, **td3args))

        # Learner 4
        wwid = genealogy.new_id('learner_4')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=0.997, tau=5e-3,
                    init_w=True, **td3args))

        # Learner 4
        wwid = genealogy.new_id('learner_4')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=0.9995, tau=5e-3,
                    init_w=True, **td3args))

    if portfolio_id == 11:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=0.9, tau=5e-3,
                    init_w=True, **td3args))

    if portfolio_id == 12:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=0.99, tau=5e-3,
                    init_w=True, **td3args))

    if portfolio_id == 13:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=0.997, tau=5e-3,
                    init_w=True, **td3args))

    if portfolio_id == 14:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=0.9995, tau=5e-3,
                    init_w=True, **td3args))

    ############## MOTIVATING EXAMPLE ##############
    if portfolio_id == 100:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=0.0, tau=5e-3,
                    init_w=True, **td3args))

        # Learner 2
        wwid = genealogy.new_id('learner_2')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=1.0, tau=5e-3,
                    init_w=True, **td3args))

    if portfolio_id == 101:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 3
        wwid = genealogy.new_id('learner_3')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=0.0, tau=5e-3,
                    init_w=True, **td3args))

    if portfolio_id == 102:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid, 'TD3', args.state_dim, args.action_dim,
                    actor_lr=1e-3, critic_lr=1e-3, gamma=1.0, tau=5e-3,
                    init_w=True, **td3args))

    return portfolio
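
# Minimal usage sketch (assumed wiring, not from the original file): build the
# hyperparameter object and a learner portfolio for one of the ids handled above.
# Genealogy is assumed to be the id-tracking helper that provides new_id().
example_args = Parameters()
example_genealogy = Genealogy()
example_portfolio = initialize_portfolio([], example_args, example_genealogy, portfolio_id=12)
print('portfolio size:', len(example_portfolio))  # id 12 adds a single TD3 learner with gamma=0.99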
class Evaluator(object):
    def __init__(self, CERL_agent, num_workers, trainers, pomdp_adv=False):
        # trainers: first is the blue agent, second is the red model
        self.num_workers = num_workers
        self.trainers = trainers
        self.pomdp_adv = pomdp_adv
        self.args = CERL_agent.args
        self.drqn = CERL_agent.args.drqn  # denotes whether blue uses DRQN
        if self.pomdp_adv:
            self.trainers = [trainers[0], None]  # make sure the red model is never used
        self.buffer_gpu = CERL_agent.args.buffer_gpu
        self.batch_size = CERL_agent.args.batch_size
        self.algo = CERL_agent.args.algo
        self.state_dim = CERL_agent.args.state_dim
        self.action_dim = CERL_agent.args.action_dim
        self.buffer = Buffer(BUFFER_SIZE, self.buffer_gpu)  # initialize own replay buffer
        self.data_bucket = self.buffer.tuples
        self.evo_task_pipes = [Pipe() for _ in range(self.num_workers)]
        self.evo_result_pipes = [Pipe() for _ in range(self.num_workers)]
        self.actual_red_worker = Actor(CERL_agent.args.state_dim,
                                       CERL_agent.args.action_dim, -1,
                                       'dis')  # this model is shared across the workers
        self.actual_red_worker.share_memory()
        self.td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': CERL_agent.args.action_low,
            'action_high': CERL_agent.args.action_high,
            'cerl_args': self.args
        }
        self.renew_learner()  # the learner is created once here rather than renewed each iteration
        self.rollout_bucket = [self.actual_red_worker for i in range(num_workers)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 3, self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], False,
                          self.data_bucket, self.rollout_bucket, 'dummy_name',
                          None, 'dis', self.trainers, False, self.pomdp_adv))
            for id in range(num_workers)
        ]
        for worker in self.workers:
            worker.start()
        self.evo_flag = [True for _ in range(self.num_workers)]

    # def initialize(self, actor_in):  # use the given actor parameters to initialize the red actor
    #     utils.hard_update(self.actual_red_actor, actor_in)

    def renew_learner(self):  # create a new learning agent with randomized initial parameters
        self.learner = Learner(-1, self.algo, self.state_dim, self.action_dim,
                               actor_lr=5e-5, critic_lr=1e-3, gamma=0.99,
                               tau=5e-3, init_w=True, **self.td3args)
        self.actual_red_actor = self.learner.algo.actor

    def collect_trajectory(self):
        utils.hard_update(self.actual_red_worker, self.actual_red_actor)  # first sync the actor

        # launch rollout workers
        for id, actor in enumerate(self.rollout_bucket):
            if self.evo_flag[id]:
                self.evo_task_pipes[id][0].send((id, 0))  # second argument in send is a dummy
                self.evo_flag[id] = False

        # wait for the rollouts to complete and record fitness
        all_fitness = []
        for i in range(self.num_workers):
            entry = self.evo_result_pipes[i][1].recv()
            all_fitness.append(entry[1])
            self.evo_flag[i] = True

        self.buffer.referesh()  # update replay buffer
        return all_fitness

    def train_red(self, training_iterations):
        # alternate between collect_trajectory and parameter updates
        while len(self.buffer) < self.batch_size * 10:  ### BURN-IN PERIOD
            self.collect_trajectory()

        for i in range(training_iterations):
            self.collect_trajectory()
            self.buffer.tensorify()  # tensorify the buffer for fast sampling
            self.learner.update_parameters(self.buffer, self.buffer_gpu,
                                           self.batch_size, 2)  # 2 update steps

    def evaluate(self):
        # evaluate the quality of the blue agent policy by training a red agent against it;
        # after evaluation, erase the replay buffer and renew the learner
        self.train_red(TRAIN_ITERATION)
        self.clear_buffer()
        # self.renew_learner()
        return self.evaluate_fixed_agents(
            self.trainers[0], self.trainers[1],
            [self.actual_red_actor])  # calculate the mean and std of the evaluation metric
    def evaluate_fixed_agents(self, blue_dqn, red_model, red_actor_list,
                              num_iterations=25):
        # evaluate the performance of the given agents, using random neutral and red agents
        if self.algo == 'dis':
            # make an env with the blue and red policy agents inside
            dis_env = make_self_play_env(
                seed=0, return_policy_agent=False,
                trainers=[blue_dqn, red_model]
            )[0]  # if trainers is not None, first is the shared DQN agent, second is the best red policy
            env = EnvironmentWrapper('', self.algo, dis_env, 0)  # the "0" is the index for training the blue agent
        elif self.algo == 'TD3_tennis':
            tennis_env = make_tennis_env.TennisEnvFactory(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                no_graphics=True, pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', self.algo, tennis_env, 0)
        else:
            raise Exception("evaluate_fixed_agents only supports the 'dis' and 'TD3_tennis' environments")

        average_reward = 0
        eps = 0
        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0
        belief_and_true_type_list = []
        assert red_actor_list is not None and len(red_actor_list) > 0, \
            "make sure to input a non-empty list of possible red actors"

        for it in range(num_iterations):
            belief_and_true_type = []
            if not self.pomdp_adv:  # if pomdp_adv, make sure that the TD3 actor is never used
                red_actor = random.choice(red_actor_list)
                env.set_TD3_actor(red_actor)
            fitness = 0.0  # here fitness is simply the reward
            state = env.reset()
            belief_and_true_type.append(env.belief_and_true_type())
            env.randomize_neu_adv()

            if self.pomdp_adv:
                env.try_set_pomdp_adv()  # set the opponent to a POMDP adversary if it is an adversary, else do nothing

            render_flag = (np.random.random() < 0.05)
            while True:  # loop until done
                action = blue_dqn.act(state, eps=eps)
                next_state, reward, done, info = env.step(
                    copy.deepcopy(action), use_actual_reward=self.drqn)
                belief_and_true_type.append(env.belief_and_true_type())
                if render_flag and self.args.render:
                    env.render()

                state = next_state
                fitness += reward

                if done:
                    average_red_reward += env.get_red_reward() if env.get_red_reward() is not None else 0
                    average_actual_blue_reward += env.get_blue_actual_reward() if env.get_blue_actual_reward() is not None else 0
                    red_count += 1 if env.get_red_reward() is not None else 0
                    blue_count += 1 if env.get_blue_actual_reward() is not None else 0
                    if render_flag:
                        env.env.close()
                    break

            belief_and_true_type_list.append(belief_and_true_type)
            average_reward += fitness

        average_reward /= num_iterations
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward, belief_and_true_type_list

    def clear_buffer(self):
        self.buffer.clear_buffer_data()  # reinitialize replay buffer

    def kill_processes(self):
        for id, actor in enumerate(self.rollout_bucket):
            self.evo_task_pipes[id][0].send(('TERMINATE', 0))  # second argument in send is a dummy

    def __del__(self):
        self.kill_processes()
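
# Minimal usage sketch (assumed wiring, not from the original file): given a CERL
# agent that exposes .args with the fields read in Evaluator.__init__, train a red
# adversary against the blue policy and report the evaluation metrics.
# "cerl_agent", "blue_trainer", and "red_model" are placeholder names.
evaluator = Evaluator(cerl_agent, num_workers=4,
                      trainers=[blue_trainer, red_model], pomdp_adv=False)
avg_reward, avg_red_reward, avg_blue_reward, beliefs = evaluator.evaluate()
print(avg_reward, avg_red_reward, avg_blue_reward)
evaluator.kill_processes()  # terminate the rollout workers explicitly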