class Visualizer(object):
    """Load a trained policy from train_directory and report statistics over 100 evaluation episodes."""

    def __init__(self, game, network, train_directory):
        self.game = game
        env_name = '%sNoFrameskip-v4' % game
        env = gym.make(env_name)
        # env = gym.wrappers.Monitor(env, '/tmp/temp_%s' % game, mode='evaluation', force=True)
        vb_file = os.path.join(train_directory, "vb.npy")
        vb = np.load(vb_file)
        parameters_file = 'parameters_81'
        self.policy = Policy(env, network, "relu")
        parameters_path = os.path.join(train_directory, parameters_file)
        print('Using parameters file %s \n' % parameters_path)
        with open(parameters_path, 'rb') as f:
            parameters = pickle.load(f)['parameters']
        self.policy.set_parameters(parameters)
        self.policy.set_vb(vb)

    def play_game(self):
        rews = [0] * 100
        for i in range(100):
            rew, step = self.policy.rollout()
            rews[i] = rew
        print(np.mean(rews))
        print(np.max(rews))
        print(rews)
class Visualizer(object):
    """Load a saved parameter file from train_directory, wrap the env in a Monitor and render one episode."""

    def __init__(self, game, network, train_directory):
        self.game = game
        env_name = '%sNoFrameskip-v4' % game
        env = gym.make(env_name)
        env = gym.wrappers.Monitor(env, '/tmp/temp_%s' % game, mode='evaluation', force=True)
        vb_file = os.path.join(train_directory, "vb.npy")
        vb = np.load(vb_file)
        parameters_file = sorted(os.listdir(train_directory))[-3]
        self.policy = Policy(env, network, "elu")
        parameters_path = os.path.join(train_directory, parameters_file)
        print('Using parameters file %s \n' % parameters_path)
        with open(parameters_path, 'rb') as f:
            parameters = pickle.load(f)['params']
        self.policy.set_parameters(parameters)
        self.policy.set_vb(vb)

    def play_game(self):
        print(self.policy.rollout(render=True))
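# ----------------------------------------------------------------------------
# Usage sketch (not part of the original scripts): how either Visualizer variant
# above might be driven from the command line. The argument names and defaults
# below are assumptions for illustration only; the repository's real entry point
# may parse its arguments differently.
# ----------------------------------------------------------------------------
import argparse


def _visualize_cli():
    parser = argparse.ArgumentParser(description='Replay a trained policy')
    parser.add_argument('--game', default='Breakout',
                        help='Atari game name, e.g. Breakout for BreakoutNoFrameskip-v4')
    parser.add_argument('--network', default='Nature',
                        help='Network architecture name passed through to Policy')
    parser.add_argument('--train_directory', required=True,
                        help='Directory containing vb.npy and the saved parameter files')
    args = parser.parse_args()

    vis = Visualizer(args.game, args.network, args.train_directory)
    vis.play_game()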
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # One cpu (rank 0) will evaluate results
    train_cpus = cpus - 1

    k = 10
    epoch = 5
    m = 1

    # Deduce population size
    lam = train_cpus * ep_per_cpu

    # Create environment
    env = gym.make(env_name)

    # Create policy (Deep Neural Network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env, network=configuration['network'], nonlin_name=configuration['nonlin_name'])

    # Create reference batch used for normalization
    # It will be overwritten with vb from worker with rank 0
    vb = policy.get_vb()

    # Extract vector with current parameters.
    # Parameter vector has length 1693380
    parameters = policy.get_parameters()
    shape = policy.parameter_shapes

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts in the same position
    # comm.Bcast([parameters, MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Create optimizer with user defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(train_cpus, parameters, shape, lam, rank,
                               configuration["settings"], epoch, m)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)

    # Initial evaluation of the starting parameters (k rollouts per worker)
    rews = [0]
    e_r = 0
    p = optimizer.get_parameters()
    policy.set_parameters(p)
    for j in range(k):
        e_rew, e_len = policy.rollout()
        e_r += e_rew
    rews[0] = e_r / k
    optimizer.rew = e_r / k
    msg = np.array(rews)
    pp = p

    # Only rank 0 worker will log information from the training
    logger = Logger(optimizer.log_path(game, configuration['network'], run_name))
    if rank == 0:
        # Initialize logger, save virtual batch and save some basic stuff at the beginning
        logger.save_vb(vb)
        logger.log('Game'.ljust(25) + '%s' % game)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'])
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'])
        logger.log('Number of CPUs'.ljust(25) + '%d' % cpus)
        logger.log('Population'.ljust(25) + '%d' % lam)
        logger.log('Dimensionality'.ljust(25) + '%d' % len(parameters))

        # Log basic info from the optimizer
        # optimizer.log_basic(logger)

        # Rank 0 contributes empty buffers to the initial gather
        msg = np.zeros(1)
        pp = np.zeros(optimizer.n)

    results = np.empty((cpus, 1))
    ppp = np.empty((cpus, optimizer.n))
    comm.Allgather([msg, MPI.FLOAT], [results, MPI.FLOAT])
    comm.Allgather([pp, MPI.FLOAT], [ppp, MPI.FLOAT])
    results = results[1:, :]
    ppp = ppp[1:, :].flatten()
    rews = results[:, :1].flatten()
    BestScore = max(rews)
    Bestid = np.argmax(rews)
    BestFound = ppp[Bestid * optimizer.n:(Bestid + 1) * optimizer.n]
    if rank == 0:
        logger.log('Best'.ljust(25) + '%f' % BestScore)

    # We will count number of steps
    # frames = 4 * steps
    steps_passed = 0
    iteration = 1
    while steps_passed <= 25000000:
        # Iteration start time
        iter_start_time = time.time()

        if iteration % epoch == 1:
            optimizer.sigupdatelist = np.zeros(optimizer.n)
        llambda = np.random.normal(1, 0.1 - 0.1 * steps_passed / 25000000)

        # Workers that run train episodes
        optimizer.RandomGrouping()
        for ii in range(m):
            optimizer.groupnum = ii
            if rank != 0:
                # Empty arrays for each episode. We save: length, reward, noise index
                lens1 = [0]
                rews1 = [0]
                orew = [0]
                # sig1 = [0]

                # For each episode in this CPU we get new parameters,
                # update policy network and perform policy rollout
                e_r = 0
                e_l = 0
                p = optimizer.get_parameters1()
                policy.set_parameters(p)
                for j in range(k):
                    e_rew, e_len = policy.rollout()
                    e_r += e_rew
                    e_l += e_len
                lens1[0] = e_l
                rews1[0] = e_r / k
                optimizer.rew1 = e_r / k
                orew[0] = optimizer.rew
                if iteration % epoch == 1:
                    sig1 = optimizer.sigmalist

                # Aggregate information, will later send it to each worker using MPI
                msg1 = np.array(rews1 + lens1 + orew, dtype=np.float64)
                pp1 = optimizer.parameters1
                if iteration % epoch == 1:
                    sigmsg1 = sig1

            # Worker rank 0 that runs evaluation episodes
            else:
                # Empty array, evaluation results are not used for the update
                msg1 = np.zeros(3, dtype=np.float64)
                pp1 = optimizer.parameters
                if iteration % epoch == 1:
                    sigmsg1 = np.zeros(optimizer.n)

            # MPI stuff
            # Initialize array which will be updated with information from all workers using MPI
            results1 = np.empty((cpus, 3), dtype=np.float64)
            ppp1 = np.empty((cpus, optimizer.n))
            if iteration % epoch == 1:
                sigmsgs1 = np.empty((cpus, optimizer.n))
            comm.Allgather([msg1, MPI.FLOAT], [results1, MPI.FLOAT])
            comm.Allgather([pp1, MPI.FLOAT], [ppp1, MPI.FLOAT])
            if iteration % epoch == 1:
                comm.Allgather([sigmsg1, MPI.FLOAT], [sigmsgs1, MPI.FLOAT])
            ppp1 = ppp1[1:, :].flatten()
            if iteration % epoch == 1:
                sigmsgs1 = sigmsgs1[1:, :].flatten()

            # Skip empty evaluation results from worker with id 0
            results1 = results1[1:, :]

            # Extract rewards, lengths and previous rewards
            rews1 = results1[:, :1].flatten()
            lens1 = results1[:, 1:2].flatten()
            oreward = results1[:, 2:].flatten()
            newBestidx = np.argmax(rews1)
            if np.max(rews1) > BestScore:
                BestScore = rews1[newBestidx]
                BestFound = ppp1[newBestidx * optimizer.n:(newBestidx + 1) * optimizer.n]

            # Update parameters, sigmas, rewards
            if rank != 0:
                optimizer.update(ppp, BestScore, sigmsgs1, llambda)

        # Steps passed = Sum of episode steps from all offsprings
        steps = np.sum(lens1)
        steps_passed += steps

        # Write some logs for this iteration
        # Using logs we are able to recover solution saved
        # after 1 hour of training or after 1 billion frames
        if rank == 0:
            eval_mean_rew = np.mean(oreward)
            eval_mean_rew1 = np.mean(rews1)
            iteration_time = (time.time() - iter_start_time)
            time_elapsed = (time.time() - start_time) / 60
            logger.log('------------------------------------')
            logger.log('Iteration'.ljust(25) + '%f' % iteration)
            logger.log('EvalMeanReward'.ljust(25) + '%f' % eval_mean_rew)
            logger.log('EvalMeanReward1'.ljust(25) + '%f' % eval_mean_rew1)
            logger.log('StepsThisIter'.ljust(25) + '%f' % steps)
            logger.log('StepsSinceStart'.ljust(25) + '%f' % steps_passed)
            logger.log('IterationTime'.ljust(25) + '%f' % iteration_time)
            logger.log('TimeSinceStart'.ljust(25) + '%f' % time_elapsed)
            logger.log('Best'.ljust(25) + '%f' % BestScore)

            # Give optimizer a chance to log its own stuff
            # optimizer.log(logger)
            logger.log('------------------------------------')

            # Every 20 iterations evaluate the best found parameters over 30 rollouts
            if iteration % 20 == 1:
                fin_rews = 0
                p = BestFound
                policy.set_parameters(p)
                for i in range(30):
                    e_rew, e_len = policy.rollout()
                    fin_rews += e_rew
                fin_eval = fin_rews / 30
            else:
                fin_eval = 0

            # Write stuff for training curve plot
            stat_string = "{},\t{},\t{},\t{}\n".format(
                steps_passed, (time.time() - start_time), eval_mean_rew1, fin_eval)
            logger.write_general_stat(stat_string)
            logger.write_optimizer_stat(optimizer.stat_string())

            # Save currently proposed solution every 20 iterations
            if iteration % 20 == 1:
                logger.save_parameters(BestFound, iteration)
        else:
            if iteration % epoch == 0:
                optimizer.updatesigma()
        iteration += 1

    # Test the best parameters found
    if rank == 0:
        final_rews = []
        p = BestFound
        policy.set_parameters(p)
        for i in range(200):
            e_rew, e_len = policy.rollout()
            final_rews.append(e_rew)
        final_eval = np.mean(final_rews)
        logger.log('Final'.ljust(25) + '%f' % final_eval)
        logger.save_parameters(BestFound, iteration)
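# ----------------------------------------------------------------------------
# Standalone sketch (not from the original code): the training loop above gathers
# one row of [reward, episode_length, previous_reward] plus a flattened parameter
# vector from every worker, drops the evaluation worker's row, and picks the best
# candidate by reward. The snippet below simulates those gathered buffers with
# NumPy only, so the slicing and best-candidate selection can be checked without
# MPI. Shapes and values are illustrative.
# ----------------------------------------------------------------------------
import numpy as np


def _check_gather_layout(cpus=4, n=6, seed=0):
    rng = np.random.default_rng(seed)

    # Row i holds [reward, episode_length, previous_reward] from worker i;
    # worker 0 (the evaluation worker) contributes zeros, as in the loop above.
    results1 = rng.normal(size=(cpus, 3))
    results1[0] = 0.0
    ppp1 = rng.normal(size=(cpus, n))     # one flattened parameter vector per worker

    # Drop the evaluation worker's row, then flatten the parameter block.
    results1 = results1[1:, :]
    flat_params = ppp1[1:, :].flatten()

    rews1 = results1[:, :1].flatten()
    best_idx = int(np.argmax(rews1))
    best_params = flat_params[best_idx * n:(best_idx + 1) * n]
    return best_idx, rews1[best_idx], best_params.shape   # (index, reward, (n,))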
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # One cpu (rank 0) will evaluate results
    train_cpus = cpus - 1

    # Deduce population size
    lam = train_cpus * ep_per_cpu

    # Create environment
    env = gym.make(env_name)

    # Create policy (Deep Neural Network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env, network=configuration['network'], nonlin_name=configuration['nonlin_name'])

    # Create reference batch used for normalization
    # It will be overwritten with vb from worker with rank 0
    vb = policy.get_vb()

    # Extract vector with current parameters.
    parameters = policy.get_parameters()

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts in the same position
    comm.Bcast([parameters, MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)

    # Create optimizer with user defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(parameters, lam, rank, configuration["settings"])

    # Only rank 0 worker will log information from the training
    logger = None
    if rank == 0:
        # Initialize logger, save virtual batch and save some basic stuff at the beginning
        logger = Logger(optimizer.log_path(game, configuration['network'], run_name))
        logger.save_vb(vb)

        # Log basic stuff
        logger.log('Game'.ljust(25) + '%s' % game)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'])
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'])
        logger.log('Number of CPUs'.ljust(25) + '%d' % cpus)
        logger.log('Population'.ljust(25) + '%d' % lam)
        logger.log('Dimensionality'.ljust(25) + '%d' % len(parameters))

        # Log basic info from the optimizer
        optimizer.log_basic(logger)

    # We will count number of steps
    # frames = 4 * steps (3 * steps for SpaceInvaders)
    steps_passed = 0
    while steps_passed <= 25000000:
        # Iteration start time
        iter_start_time = time.time()

        # Workers that run train episodes
        if rank != 0:
            # Empty arrays for each episode. We save: length, reward, noise index
            lens = [0] * ep_per_cpu
            rews = [0] * ep_per_cpu
            inds = [0] * ep_per_cpu

            # For each episode in this CPU we get new parameters,
            # update policy network and perform policy rollout
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                lens[i] = e_len
                rews[i] = e_rew
                inds[i] = ind

            # Aggregate information, will later send it to each worker using MPI
            msg = np.array(rews + lens + inds, dtype=np.int32)

        # Worker rank 0 that runs evaluation episodes
        else:
            rews = [0] * ep_per_cpu
            lens = [0] * ep_per_cpu
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                rews[i] = e_rew
                lens[i] = e_len
            eval_mean_rew = np.mean(rews)
            eval_max_rew = np.max(rews)

            # Empty array, evaluation results are not used for the update
            msg = np.zeros(3 * ep_per_cpu, dtype=np.int32)

        # MPI stuff
        # Initialize array which will be updated with information from all workers using MPI
        results = np.empty((cpus, 3 * ep_per_cpu), dtype=np.int32)
        comm.Allgather([msg, MPI.INT], [results, MPI.INT])

        # Skip empty evaluation results from worker with id 0
        results = results[1:, :]

        # Extract IDs and rewards
        rews = results[:, :ep_per_cpu].flatten()
        lens = results[:, ep_per_cpu:(2 * ep_per_cpu)].flatten()
        ids = results[:, (2 * ep_per_cpu):].flatten()

        # Update parameters
        optimizer.update(ids=ids, rewards=rews)

        # Steps passed = Sum of episode steps from all offsprings
        steps = np.sum(lens)
        steps_passed += steps

        # Write some logs for this iteration
        # Using logs we are able to recover solution saved
        # after 1 hour of training or after 1 billion frames
        if rank == 0:
            iteration_time = (time.time() - iter_start_time)
            time_elapsed = (time.time() - start_time) / 60
            train_mean_rew = np.mean(rews)
            train_max_rew = np.max(rews)
            logger.log('------------------------------------')
            logger.log('Iteration'.ljust(25) + '%f' % optimizer.iteration)
            logger.log('EvalMeanReward'.ljust(25) + '%f' % eval_mean_rew)
            logger.log('EvalMaxReward'.ljust(25) + '%f' % eval_max_rew)
            logger.log('TrainMeanReward'.ljust(25) + '%f' % train_mean_rew)
            logger.log('TrainMaxReward'.ljust(25) + '%f' % train_max_rew)
            logger.log('StepsSinceStart'.ljust(25) + '%f' % steps_passed)
            logger.log('StepsThisIter'.ljust(25) + '%f' % steps)
            logger.log('IterationTime'.ljust(25) + '%f' % iteration_time)
            logger.log('TimeSinceStart'.ljust(25) + '%f' % time_elapsed)

            # Give optimizer a chance to log its own stuff
            optimizer.log(logger)
            logger.log('------------------------------------')

            # Write stuff for training curve plot
            stat_string = "{},\t{},\t{},\t{},\t{},\t{}\n".format(
                steps_passed, (time.time() - start_time), eval_mean_rew,
                eval_max_rew, train_mean_rew, train_max_rew)
            logger.write_general_stat(stat_string)
            logger.write_optimizer_stat(optimizer.stat_string())

            # Save currently proposed solution every 20 iterations
            if optimizer.iteration % 20 == 1:
                logger.save_parameters(optimizer.parameters, optimizer.iteration)

    # Test the final solution
    if rank == 0:
        final_rews = []
        for i in range(200):
            indd, p = optimizer.get_parameters()
            policy.set_parameters(p)
            e_rew, e_len = policy.rollout()
            final_rews.append(e_rew)
        final_eval = np.mean(final_rews)
        logger.log('Final'.ljust(25) + '%f' % final_eval)
        logger.save_parameters(optimizer.parameters, optimizer.iteration)
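# ----------------------------------------------------------------------------
# Hypothetical helpers (not part of the repository): the int32 message built in
# the loop above concatenates ep_per_cpu rewards, then ep_per_cpu episode lengths,
# then ep_per_cpu noise indices. Splitting the packing and unpacking into two
# small functions makes that layout explicit and easy to unit-test.
# ----------------------------------------------------------------------------
import numpy as np


def pack_results(rews, lens, inds):
    # Rewards, lengths and noise indices are concatenated in that order.
    return np.array(list(rews) + list(lens) + list(inds), dtype=np.int32)


def unpack_results(results, ep_per_cpu):
    # `results` has one row per training worker; the evaluation row was already dropped.
    rews = results[:, :ep_per_cpu].flatten()
    lens = results[:, ep_per_cpu:2 * ep_per_cpu].flatten()
    inds = results[:, 2 * ep_per_cpu:].flatten()
    return rews, lens, inds


# Example with two training workers and ep_per_cpu = 2:
# msg = pack_results([21, 35], [800, 950], [3, 7])
# rews, lens, inds = unpack_results(np.stack([msg, msg]), ep_per_cpu=2)
# rews -> [21, 35, 21, 35]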
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # Meta population
    meta_pop_size = 5
    meta_pop_active = list(range(meta_pop_size))
    next_halving_time = 40  # minutes until the next successive-halving step
    mu_list = [5, 10, 20, 50, 100]

    # The first meta_pop_size cpus will evaluate results
    train_cpus = cpus - meta_pop_size

    # Deduce population size
    lam = train_cpus * ep_per_cpu

    # Create environment
    env = gym.make(env_name)

    # Create policy (Deep Neural Network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env, network=configuration['network'], nonlin_name=configuration['nonlin_name'])

    # Create reference batch used for normalization
    # It will be overwritten with vb from worker with rank 0
    vb = policy.get_vb()

    # Extract one parameter vector per meta-population member.
    parameters_list = [policy.get_parameters() for count in range(meta_pop_size)]

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts in the same position
    for i in range(meta_pop_size):
        comm.Bcast([parameters_list[i], MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)

    if rank < meta_pop_size:
        parent_id = rank
        eval_moving_avg = 0
        forget_factor = 0.9
    if rank >= meta_pop_size:
        parent_id = int((rank - meta_pop_size) // (train_cpus / meta_pop_size))

    # Create optimizer with user defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(parameters_list, lam, rank, meta_pop_size, parent_id,
                               mu_list[parent_id], configuration["settings"])

    # Only the first meta_pop_size workers will log information from the training
    logger = None
    if rank < meta_pop_size:
        # TODO: Improve logger for meta pop
        # Initialize logger, save virtual batch and save some basic stuff at the beginning
        logger = Logger(optimizer.log_path(game, configuration['network'], run_name))
        if rank == 0:
            logger.save_vb(vb)

        # Log basic stuff
        logger.log('Game'.ljust(25) + '%s' % game, rank)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'], rank)
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'], rank)
        logger.log('Number of CPUs'.ljust(25) + '%d' % cpus, rank)
        logger.log('Population'.ljust(25) + '%d' % lam, rank)
        logger.log('Dimensionality'.ljust(25) + '%d' % len(parameters_list[0]), rank)

        # Log basic info from the optimizer
        # optimizer.log_basic(logger)

    # We will count number of steps
    # frames = 4 * steps (3 * steps for SpaceInvaders)
    steps_passed = 0
    while True:
        # Iteration start time
        iter_start_time = time.time()

        # Workers that run train episodes
        if rank >= meta_pop_size:
            # Empty arrays for each episode. We save: length, reward, noise index
            lens = [0] * ep_per_cpu
            rews = [0] * ep_per_cpu
            inds = [0] * ep_per_cpu
            parent_id_arr = [0] * ep_per_cpu

            # For each episode in this CPU we get new parameters,
            # update policy network and perform policy rollout
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                lens[i] = e_len
                rews[i] = e_rew
                inds[i] = ind
                parent_id_arr[i] = parent_id

            # Aggregate information, will later send it to each worker using MPI
            msg = np.array(rews + lens + inds + parent_id_arr, dtype=np.int32)

        # Workers with rank < meta_pop_size run evaluation episodes
        else:
            rews = [0] * ep_per_cpu
            lens = [0] * ep_per_cpu
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                rews[i] = e_rew
                lens[i] = e_len
            eval_mean_rew = np.mean(rews)
            eval_max_rew = np.max(rews)
            print("real mean {}".format(eval_mean_rew))

            # Exponential moving average of the evaluation reward
            eval_moving_avg = eval_mean_rew + forget_factor * (eval_moving_avg - eval_mean_rew)
            print("mean eval for rank {} is {}".format(rank, eval_moving_avg))

            # Evaluation results are not used for the parameter update, but Allgather
            # requires every rank to send the same number of elements, so pad the
            # message to 4 * ep_per_cpu ints and put the moving average in slot 0.
            msg = np.zeros(4 * ep_per_cpu, dtype=np.int32)
            msg[0] = int(eval_moving_avg)

        # MPI stuff
        # Initialize array which will be updated with information from all workers using MPI
        results = np.empty((cpus, 4 * ep_per_cpu), dtype=np.int32)
        comm.Allgather([msg, MPI.INT], [results, MPI.INT])
        eval_results = results[:meta_pop_size, 0]

        # Skip the evaluation workers' rows
        results = results[meta_pop_size:, :]

        # Extract rewards, lengths, noise IDs and parent IDs
        rews = results[:, :ep_per_cpu].flatten()
        lens = results[:, ep_per_cpu:(2 * ep_per_cpu)].flatten()
        ids = results[:, (2 * ep_per_cpu):(3 * ep_per_cpu)].flatten()
        par_id = results[:, (3 * ep_per_cpu):].flatten()

        # Group results by meta-population member
        rews_list = [0] * meta_pop_size
        ids_list = [0] * meta_pop_size
        train_mean_reward = [0] * meta_pop_size
        train_max_reward = [0] * meta_pop_size
        for id in meta_pop_active:
            rewards_id = [i for i, x in enumerate(par_id) if x == id]
            if not rewards_id:
                print("Warning: no training episodes for meta-population member {}".format(id))
            rews_list[id] = [rews[i] for i in rewards_id]
            train_mean_reward[id] = np.mean(rews_list[id])
            train_max_reward[id] = np.max(rews_list[id])
            ids_list[id] = [ids[i] for i in rewards_id]

        # Update parameters
        for i in meta_pop_active:
            optimizer.update(ids=ids_list[i], rewards=rews_list[i])

        # =============== Successive halving ==================
        if next_halving_time <= ((time.time() - start_time) / 60):
            print("Assigning weights of top performers to bottom performers at minute {}".format(
                (time.time() - start_time) / 60))
            print("Eval rewards list {}".format(eval_results))
            ranking = sorted(range(len(eval_results)), key=lambda k: eval_results[k], reverse=True)
            print("ranking {}".format(ranking))
            bottom = ranking[int(0.6 * meta_pop_size):]
            print("bottom {}".format(bottom))
            if parent_id in bottom:
                optimizer.assign_weights(ranking[int(len(ranking) - ranking.index(parent_id) - 1)])
                print("rank {} switch from {} to {}".format(
                    rank, parent_id, ranking[int(len(ranking) - ranking.index(parent_id) - 1)]))
            next_halving_time += 40
            # print("Halving now time passed {}".format(((time.time()-start_time)/60)))
            # eval_mean = []
            # for rank_i in range(meta_population):
            #     # print(eval_results[rank_i, :ep_per_cpu])
            #     eval_mean.append(np.mean(eval_results[rank_i, :ep_per_cpu]))
            # print("halving rewards list {}".format(eval_mean))
            # ranking = sorted(range(len(eval_mean)), key=lambda k: eval_mean[k], reverse=True)
            # print("ranking {}".format(ranking))
            # bottom = ranking[int(half_pop // 2):]
            # print("bottom {}".format(bottom))
            # if parent_id in bottom:
            #     old = parent_id
            #     parent_id = int(ranking.index(parent_id) - len(ranking) // 2)
            #     print("switch from {} to {}".format(old, parent_id))
            # next_halving_time *= 2
            # half_pop /= 2
            # ep_per_cpu //= 2

        # Steps passed = Sum of episode steps from all offsprings
        steps = np.sum(lens)
        steps_passed += steps

        # Write some logs for this iteration
        # Using logs we are able to recover solution saved
        # after 1 hour of training or after 1 billion frames
        if rank < meta_pop_size:
            iteration_time = (time.time() - iter_start_time)
            time_elapsed = (time.time() - start_time) / 60
            train_mean_rew = np.mean(rews)
            train_max_rew = np.max(rews)
            logger.log('------------------------------------', rank)
            logger.log('Iteration'.ljust(25) + '%f' % (optimizer.iteration // meta_pop_size), rank)
            logger.log('EvalMeanReward'.ljust(25) + '%f' % eval_moving_avg, rank)
            logger.log('EvalMaxReward'.ljust(25) + '%f' % eval_max_rew, rank)
            logger.log('TrainMeanReward'.ljust(25) + '%f' % train_mean_rew, rank)
            logger.log('TrainMaxReward'.ljust(25) + '%f' % train_max_rew, rank)
            logger.log('StepsSinceStart'.ljust(25) + '%f' % steps_passed, rank)
            logger.log('StepsThisIter'.ljust(25) + '%f' % steps, rank)
            logger.log('IterationTime'.ljust(25) + '%f' % iteration_time, rank)
            logger.log('TimeSinceStart'.ljust(25) + '%f' % time_elapsed, rank)

            # Give optimizer a chance to log its own stuff
            # optimizer.log(logger)
            logger.log('------------------------------------', rank)

            # Write stuff for training curve plot
            stat_string = "{},\t{},\t{},\t{},\t{},\t{}\n".format(
                steps_passed, (time.time() - start_time), eval_moving_avg,
                eval_max_rew, train_mean_rew, train_max_rew)
            logger.write_general_stat(stat_string, rank)
            # logger.write_optimizer_stat(optimizer.stat_string())

            # Save currently proposed solution every 20 iterations
            if optimizer.iteration % 20 == 1:
                logger.save_parameters(optimizer.parameters, optimizer.iteration, rank)