class Visualizer(object):
    def __init__(self, game, network, train_directory):
        self.game = game
        env_name = '%sNoFrameskip-v4' % game
        env = gym.make(env_name)
        # env = gym.wrappers.Monitor(env, '/tmp/temp_%s' % game, mode='evaluation', force=True)
        vb_file = os.path.join(train_directory, "vb.npy")
        vb = np.load(vb_file)
        parameters_file = 'parameters_81'
        self.policy = Policy(env, network, "relu")
        parameters_path = os.path.join(train_directory, parameters_file)
        print('Using parameters file %s \n' % parameters_path)
        with open(parameters_path, 'rb') as f:
            parameters = pickle.load(f)['parameters']
        self.policy.set_parameters(parameters)
        self.policy.set_vb(vb)

    def play_game(self):
        rews = [0] * 100
        for i in range(100):
            rew, step = self.policy.rollout()
            rews[i] = rew
        print(np.mean(rews))
        print(np.max(rews))
        print(rews)
class Visualizer(object):
    def __init__(self, game, network, train_directory):
        self.game = game
        env_name = '%sNoFrameskip-v4' % game
        env = gym.make(env_name)
        env = gym.wrappers.Monitor(env, '/tmp/temp_%s' % game, mode='evaluation', force=True)
        vb_file = os.path.join(train_directory, "vb.npy")
        vb = np.load(vb_file)
        parameters_file = sorted(os.listdir(train_directory))[-3]
        self.policy = Policy(env, network, "elu")
        parameters_path = os.path.join(train_directory, parameters_file)
        print('Using parameters file %s \n' % parameters_path)
        with open(parameters_path, 'rb') as f:
            parameters = pickle.load(f)['params']
        self.policy.set_parameters(parameters)
        self.policy.set_vb(vb)

    def play_game(self):
        print(self.policy.rollout(render=True))
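A minimal sketch of how a Visualizer like the one above might be driven from the command line. The entry point, argument names, and defaults below are assumptions for illustration, not the original script's CLI.

# Hypothetical entry point; argument names and defaults are assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--game', default='Breakout')          # Atari game name (illustrative default)
    parser.add_argument('--network', required=True)            # network name expected by Policy
    parser.add_argument('--train_directory', required=True)    # directory with vb.npy and parameter files
    cli = parser.parse_args()

    vis = Visualizer(cli.game, cli.network, cli.train_directory)
    vis.play_game()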
def main():
    args = parse_args()
    args.rom_path = args.rom_path + utils.game_file(args.game_name)
    print(args)
    if args.mode == 0:
        drrn = DRRN(args)
        drrn.train(batch_size=args.batch_size, epochs=args.qnet_iter)
    elif args.mode == 1:
        policy = Policy(args)
        policy.train(epochs=args.policy_epoch)
    else:
        print("Argument Error!!")
def main():
    args = parse_args()
    print(args)
    args.rom_path = args.rom_path + utils.game_file(args.game_name)
    data_path = args.data_path.replace('GAME', args.game_name)
    if args.seed is None:
        import random
        args.seed = random.randint(0, 1000)
    np.random.seed(args.seed)

    import sentencepiece as spm
    sp = spm.SentencePieceProcessor()
    sp.Load('spm_models/unigram_8k.model')

    log_dir = data_path + '/%s_trial_%d/round_%d/' % (args.uct_type, args.trial, args.round)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    env = JerichoEnv(args.rom_path, args.seed, args.env_step_limit)
    env.create()
    visited_transitions = []
    ob, info = env.reset()
    done = False
    cum_reward = info['score']
    step = 0

    if args.load_cache:
        try:
            valid_action_dict = np.load('cache/%s_valid_action_dict.npy' % args.game_name, allow_pickle=True)[()]
        except EOFError:
            print("EOFError: skip loading cache..")
            valid_action_dict = None
        except OSError:
            print("OSError: skip loading cache..")
            valid_action_dict = None
    else:
        valid_action_dict = None
    actions_info = None
    prev_action = '<START>'

    if args.round == 0:
        policy = None
    elif args.round > 0:
        policy = Policy(args)
        policy.load_weights(
            'weights/%s/round_%s/%s_weight_policy_best_seed%d.pickle' %
            (args.game_name, args.round - 1, args.uct_type, args.trial))
        args.load_path = 'weights/%s/round_%s/%s_weight_q_best_seed%d.pickle' % (
            args.game_name, args.round - 1, args.uct_type, args.trial)
    else:
        raise NotImplementedError

    import time
    start = time.time()

    log_file = log_dir + 'mcts_log_d%02d_s%d_e%d_%02d.txt' \
        % (args.max_depth, args.simulation_per_act, args.exploration_constant, args.seed)
    data = open(log_file, 'w')
    replay_buffer_filename = log_dir + 'mcts_replay_d%02d_%02d.txt' % (args.max_depth, args.seed)
    replay_buffer_file = open(replay_buffer_filename, 'w')

    for cur_depth in range(args.max_episode_len):
        agent = MCTSAgent(args, env.copy(), policy, uct_type=args.uct_type,
                          valid_action_dict=valid_action_dict,
                          actions_info=actions_info, log_dir=log_dir,
                          visited_transitions=visited_transitions,
                          replay_file=replay_buffer_file)
        prev_action_str = '[PREV_ACTION] ' + prev_action + '\n'
        root_node, action, visited_transitions = agent.search(ob, info, cur_depth)

        data.write('#######################################################\n')
        state_str = '[OBS] ' + ob + '\n' + '[LOOK] ' + info['look'] + '\n' + '[INV] ' + info['inv'] + '\n'
        valid_action_strs = ['[VALID_ACTION] ' + valid + '\n' for valid in info['valid']]
        action_str = '[ACTION] ' + action + '\n'
        data.write(state_str)
        for valid_action_str in valid_action_strs:
            data.write(valid_action_str)
        data.write(action_str)
        data.write(prev_action_str)

        ob, reward, done, info = env.step(action)
        cum_reward += reward
        score = info['score']
        step += 1

        next_ob_text = ob + info['look'] + info['inv']
        if '*** You have won ***' in next_ob_text or '*** You have died ***' in next_ob_text:
            score = int(next_ob_text.split('you scored ')[1].split(' out of')[0])
            reward = score - cum_reward

        data.write('Reward: %d, Cum_reward: %d \n' % (reward, score))
        for action_node in root_node.children:
            data.write('%s Q_val: %f Q_hat: %f count: %d \n' %
                       (action_node.action, action_node.Q, action_node.Q_hat, action_node.N))
        prev_action = action

        print('##########################')
        print('STEP: %s' % step)
        print(root_node.state)
        print()
        print('BEST_ACTION: ', action)
        print()
        print('Valid actions:', [action.action for action in root_node.children])
        print('Q-values', [action.Q for action in root_node.children])
        print('Q-hat', [action.Q_hat for action in root_node.children])
        print('Final Q', [action.Q + action.Q_hat for action in root_node.children])
        print('Maximum Q', [0 if len(action.Rs) == 0 else max(action.Rs) for action in root_node.children])
        print('Count of actions', [action.N for action in root_node.children])
        print('Action Probs:', [prob for prob in root_node.children_probs])
        print()
        print('Reward: %s, CUM_Reward: %s' % (reward, score))
        print()
        print(ob + info['look'] + info['inv'])
        print(flush=True)

        valid_action_dict = agent.valid_action_dict
        actions_info = [agent.actions, agent.actions_e]
        if args.save_cache:
            np.save('cache/%s_valid_action_dict.npy' % args.game_name, valid_action_dict)

        if '*** You have won ***' in next_ob_text or '*** You have died ***' in next_ob_text:
            break

    print('TOTAL TIME: ', time.time() - start)
    data.close()
    replay_buffer_file.close()
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # One cpu (rank 0) will evaluate results
    train_cpus = cpus - 1
    k = 10
    epoch = 5
    m = 1

    # Deduce population size
    lam = train_cpus * ep_per_cpu

    # Create environment
    env = gym.make(env_name)

    # Create policy (Deep Neural Network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env,
                    network=configuration['network'],
                    nonlin_name=configuration['nonlin_name'])

    # Create reference batch used for normalization
    # It will be overwritten with vb from worker with rank 0
    vb = policy.get_vb()

    # Extract vector with current parameters.
    # parameters have length of 1693380
    parameters = policy.get_parameters()
    shape = policy.parameter_shapes

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts in the same position
    # comm.Bcast([parameters, MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Create optimizer with user defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(train_cpus, parameters, shape, lam, rank,
                               configuration["settings"], epoch, m)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)

    rews = [0]
    e_r = 0
    p = optimizer.get_parameters()
    policy.set_parameters(p)
    for j in range(k):
        e_rew, e_len = policy.rollout()
        e_r += e_rew
    rews[0] = e_r / k
    optimizer.rew = e_r / k
    msg = np.array(rews)
    pp = p

    # Only rank 0 worker will log information from the training
    logger = Logger(optimizer.log_path(game, configuration['network'], run_name))
    if rank == 0:
        # Initialize logger, save virtual batch and save some basic stuff at the beginning
        logger.save_vb(vb)
        logger.log('Game'.ljust(25) + '%s' % game)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'])
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'])
        logger.log('Number of CPUs'.ljust(25) + '%d' % cpus)
        logger.log('Population'.ljust(25) + '%d' % lam)
        logger.log('Dimensionality'.ljust(25) + '%d' % len(parameters))
        # Log basic info from the optimizer
        # optimizer.log_basic(logger)
        msg = np.zeros(1)
        pp = np.zeros(optimizer.n)

    results = np.empty((cpus, 1))
    ppp = np.empty((cpus, optimizer.n))
    comm.Allgather([msg, MPI.FLOAT], [results, MPI.FLOAT])
    comm.Allgather([pp, MPI.FLOAT], [ppp, MPI.FLOAT])
    results = results[1:, :]
    ppp = ppp[1:, :].flatten()
    rews = results[:, :1].flatten()
    BestScore = max(rews)
    Bestid = np.argmax(rews)
    BestFound = ppp[Bestid * optimizer.n:(Bestid + 1) * optimizer.n]
    if rank == 0:
        logger.log('Best'.ljust(25) + '%f' % BestScore)

    # We will count number of steps
    # frames = 4 * steps
    steps_passed = 0
    iteration = 1
    while steps_passed <= 25000000:
        # Iteration start time
        iter_start_time = time.time()
        if iteration % epoch == 1:
            optimizer.sigupdatelist = np.zeros(optimizer.n)
        llambda = np.random.normal(1, 0.1 - 0.1 * steps_passed / 25000000)

        # Workers that run train episodes
        optimizer.RandomGrouping()
        for ii in range(m):
            optimizer.groupnum = ii
            if rank != 0:
                # Empty arrays for each episode. We save: length, reward, noise index
                lens1 = [0]
                rews1 = [0]
                orew = [0]
                # sig1 = [0]
                # For each episode in this CPU we get new parameters,
                # update policy network and perform policy rollout
                e_r = 0
                e_l = 0
                p = optimizer.get_parameters1()
                policy.set_parameters(p)
                for j in range(k):
                    e_rew, e_len = policy.rollout()
                    e_r += e_rew
                    e_l += e_len
                lens1[0] = e_l
                rews1[0] = e_r / k
                optimizer.rew1 = e_r / k
                orew[0] = optimizer.rew
                if iteration % epoch == 1:
                    sig1 = optimizer.sigmalist
                # Aggregate information, will later send it to each worker using MPI
                msg1 = np.array(rews1 + lens1 + orew, dtype=np.float64)
                pp1 = optimizer.parameters1
                if iteration % epoch == 1:
                    sigmsg1 = sig1
            # Worker rank 0 that runs evaluation episodes
            else:
                # Empty array, evaluation results are not used for the update
                msg1 = np.zeros(3, dtype=np.float64)
                pp1 = optimizer.parameters
                if iteration % epoch == 1:
                    sigmsg1 = np.zeros(optimizer.n)

            # MPI stuff
            # Initialize array which will be updated with information from all workers using MPI
            results1 = np.empty((cpus, 3), dtype=np.float64)
            ppp1 = np.empty((cpus, optimizer.n))
            if iteration % epoch == 1:
                sigmsgs1 = np.empty((cpus, optimizer.n))
            comm.Allgather([msg1, MPI.FLOAT], [results1, MPI.FLOAT])
            comm.Allgather([pp1, MPI.FLOAT], [ppp1, MPI.FLOAT])
            if iteration % epoch == 1:
                comm.Allgather([sigmsg1, MPI.FLOAT], [sigmsgs1, MPI.FLOAT])
            ppp1 = ppp1[1:, :].flatten()
            if iteration % epoch == 1:
                sigmsgs1 = sigmsgs1[1:, :].flatten()

            # Skip empty evaluation results from worker with id 0
            results1 = results1[1:, :]

            # Extract IDs and rewards
            rews1 = results1[:, :1].flatten()
            lens1 = results1[:, 1:2].flatten()
            oreward = results1[:, 2:].flatten()
            newBestidx = np.argmax(rews1)
            if np.max(rews1) > BestScore:
                BestScore = rews1[newBestidx]
                BestFound = ppp1[newBestidx * optimizer.n:(newBestidx + 1) * optimizer.n]

            # Update parameters, sigmas, rews
            if rank != 0:
                optimizer.update(ppp, BestScore, sigmsgs1, llambda)

            # Steps passed = Sum of episode steps from all offsprings
            steps = np.sum(lens1)
            steps_passed += steps

            # Write some logs for this iteration
            # Using logs we are able to recover solution saved
            # after 1 hour of training or after 1 billion frames
            if rank == 0:
                eval_mean_rew = np.mean(oreward)
                eval_mean_rew1 = np.mean(rews1)
                iteration_time = (time.time() - iter_start_time)
                time_elapsed = (time.time() - start_time) / 60
                logger.log('------------------------------------')
                logger.log('Iteration'.ljust(25) + '%f' % iteration)
                logger.log('EvalMeanReward'.ljust(25) + '%f' % eval_mean_rew)
                logger.log('EvalMeanReward1'.ljust(25) + '%f' % eval_mean_rew1)
                logger.log('StepsThisIter'.ljust(25) + '%f' % steps)
                logger.log('StepsSinceStart'.ljust(25) + '%f' % steps_passed)
                logger.log('IterationTime'.ljust(25) + '%f' % iteration_time)
                logger.log('TimeSinceStart'.ljust(25) + '%f' % time_elapsed)
                logger.log('Best'.ljust(25) + '%f' % BestScore)
                # Give optimizer a chance to log its own stuff
                # optimizer.log(logger)
                logger.log('------------------------------------')

                if iteration % 20 == 1:
                    fin_rews = 0
                    p = BestFound
                    policy.set_parameters(p)
                    for i in range(30):
                        e_rew, e_len = policy.rollout()
                        fin_rews += e_rew
                    fin_eval = fin_rews / 30
                else:
                    fin_eval = 0

                # Write stuff for training curve plot
                stat_string = "{},\t{},\t{},\t{}\n".\
                    format(steps_passed, (time.time() - start_time), eval_mean_rew1, fin_eval)
                logger.write_general_stat(stat_string)
                logger.write_optimizer_stat(optimizer.stat_string())

                # Save currently proposed solution every 20 iterations
                if iteration % 20 == 1:
                    logger.save_parameters(BestFound, iteration)
            else:
                if iteration % epoch == 0:
                    optimizer.updatesigma()

        iteration += 1

    # Test best
    if rank == 0:
        final_rews = []
        p = BestFound
        policy.set_parameters(p)
        for i in range(200):
            e_rew, e_len = policy.rollout()
            final_rews.append(e_rew)
        final_eval = np.mean(final_rews)
        logger.log('Final'.ljust(25) + '%f' % final_eval)
        logger.save_parameters(BestFound, iteration)
class Problem:
    def __init__(self, row_sz: int, col_sz: int):
        self.grid = Grid(row_sz, col_sz)
        self.policy = Policy(env.ACTIONS, env.ETA, env.GAMMA, env.EPSILON)
        self.robby = Agent(self.grid,
                           np.random.randint(0, row_sz),
                           np.random.randint(0, col_sz),
                           self.policy)
        self.epoch = 1
        self.rewards_per_episode = []

    def run(self, n: int, m: int) -> (float, float):
        """
        Run a training session (runs the robot for m steps, n times) and create a plot,
        saved in src/training_plot.png. Then run a test session, returning the mean and
        standard deviation.
        :param n: The number of episodes
        :param m: The number of steps
        :return: mean, standard deviation
        """
        self._train(n, m)
        return self._test(n, m)

    def _train(self, episodes: int, steps: int):
        """
        Runs the robot for a certain number of steps (steps), and runs this a certain
        number of times (episodes). Each episode, the epsilon value is updated, and a new
        grid is generated along with a starting position for the robot. The total reward
        accumulated per episode is tracked in the rewards_per_episode list. A training
        plot is then created and saved.
        """
        print("Beginning Training...\n")
        for i in range(episodes):
            reward_accumulated = self.run_n_steps(steps)
            self.policy.update_e()
            self.epoch += 1
            self.grid = Grid(env.GRID_BOUND, env.GRID_BOUND)
            self.robby = Agent(self.grid,
                               np.random.randint(0, env.GRID_BOUND),
                               np.random.randint(0, env.GRID_BOUND),
                               self.policy)
            if self.epoch % 100 == 0:
                self.rewards_per_episode.append(reward_accumulated)
            print("Episode: {}, total reward for episode: {}".format(i + 1, reward_accumulated))
        self._create_training_plot()

    def _test(self, episodes: int, steps: int) -> (float, float):
        """
        Runs a test session where the epsilon value is fixed, and the robot takes a
        certain number of steps (steps) for a certain number of episodes (episodes).
        Total rewards per episode are tracked and the mean and standard deviation are
        returned.
        :param episodes: The number of episodes
        :param steps: The number of steps per episode
        """
        print("Beginning Test...\n")
        self.policy.reset_e()
        self.rewards_per_episode = []
        for i in range(episodes):
            self.grid = Grid(env.GRID_BOUND, env.GRID_BOUND)
            self.robby = Agent(self.grid,
                               np.random.randint(0, env.GRID_BOUND),
                               np.random.randint(0, env.GRID_BOUND),
                               self.policy)
            reward_accumulated = self.run_n_steps(steps)
            self.rewards_per_episode.append(reward_accumulated)
            print("Episode: {}, total reward for episode: {}".format(i + 1, reward_accumulated))
        return self._calculate_test_mean_std()

    def run_n_steps(self, steps: int) -> int:
        """
        Run the robot for a certain number of steps.
        :param steps: number of steps to run the simulation
        :return: the reward accumulated
        """
        for i in range(steps):
            self.robby.take_action()
        return self.robby.total_reward

    def _create_training_plot(self):
        episodes = [x * 100 for x in range(1, 51)]
        plt.plot(episodes, self.rewards_per_episode, 'ro')
        plt.savefig('training_plot.png', bbox_inches='tight')

    def _calculate_test_mean_std(self) -> (float, float):
        mean = np.mean(self.rewards_per_episode)
        std = np.std(self.rewards_per_episode)
        return mean, std
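A minimal usage sketch for the Problem class above. The import paths and the episode/step counts are assumptions for illustration (the training plot code expects 50 sampled points, i.e. roughly 5000 training episodes), not part of the original project.

# Illustrative only: module paths and counts below are assumptions about this project's layout.
# from src import environment as env        # constants: ACTIONS, ETA, GAMMA, EPSILON, GRID_BOUND
# from src.problem import Problem

if __name__ == '__main__':
    problem = Problem(env.GRID_BOUND, env.GRID_BOUND)   # square grid sized by the env constant
    mean, std = problem.run(n=5000, m=200)              # hypothetical: 5000 episodes of 200 steps
    print("Test mean reward: {:.2f} (std {:.2f})".format(mean, std))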
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # One cpu (rank 0) will evaluate results
    train_cpus = cpus - 1

    # Deduce population size
    lam = train_cpus * ep_per_cpu

    # Create environment
    env = gym.make(env_name)

    # Create policy (Deep Neural Network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env,
                    network=configuration['network'],
                    nonlin_name=configuration['nonlin_name'])

    # Create reference batch used for normalization
    # It will be overwritten with vb from worker with rank 0
    vb = policy.get_vb()

    # Extract vector with current parameters.
    parameters = policy.get_parameters()

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts in the same position
    comm.Bcast([parameters, MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)

    # Create optimizer with user defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(parameters, lam, rank, configuration["settings"])

    # Only rank 0 worker will log information from the training
    logger = None
    if rank == 0:
        # Initialize logger, save virtual batch and save some basic stuff at the beginning
        logger = Logger(optimizer.log_path(game, configuration['network'], run_name))
        logger.save_vb(vb)

        # Log basic stuff
        logger.log('Game'.ljust(25) + '%s' % game)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'])
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'])
        logger.log('Number of CPUs'.ljust(25) + '%d' % cpus)
        logger.log('Population'.ljust(25) + '%d' % lam)
        logger.log('Dimensionality'.ljust(25) + '%d' % len(parameters))

        # Log basic info from the optimizer
        optimizer.log_basic(logger)

    # We will count number of steps
    # frames = 4 * steps (3 * steps for SpaceInvaders)
    steps_passed = 0
    while steps_passed <= 25000000:
        # Iteration start time
        iter_start_time = time.time()

        # Workers that run train episodes
        if rank != 0:
            # Empty arrays for each episode. We save: length, reward, noise index
            lens = [0] * ep_per_cpu
            rews = [0] * ep_per_cpu
            inds = [0] * ep_per_cpu

            # For each episode in this CPU we get new parameters,
            # update policy network and perform policy rollout
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                lens[i] = e_len
                rews[i] = e_rew
                inds[i] = ind

            # Aggregate information, will later send it to each worker using MPI
            msg = np.array(rews + lens + inds, dtype=np.int32)

        # Worker rank 0 that runs evaluation episodes
        else:
            rews = [0] * ep_per_cpu
            lens = [0] * ep_per_cpu
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                rews[i] = e_rew
                lens[i] = e_len
            eval_mean_rew = np.mean(rews)
            eval_max_rew = np.max(rews)

            # Empty array, evaluation results are not used for the update
            msg = np.zeros(3 * ep_per_cpu, dtype=np.int32)

        # MPI stuff
        # Initialize array which will be updated with information from all workers using MPI
        results = np.empty((cpus, 3 * ep_per_cpu), dtype=np.int32)
        comm.Allgather([msg, MPI.INT], [results, MPI.INT])

        # Skip empty evaluation results from worker with id 0
        results = results[1:, :]

        # Extract IDs and rewards
        rews = results[:, :ep_per_cpu].flatten()
        lens = results[:, ep_per_cpu:(2 * ep_per_cpu)].flatten()
        ids = results[:, (2 * ep_per_cpu):].flatten()

        # Update parameters
        optimizer.update(ids=ids, rewards=rews)

        # Steps passed = Sum of episode steps from all offsprings
        steps = np.sum(lens)
        steps_passed += steps

        # Write some logs for this iteration
        # Using logs we are able to recover solution saved
        # after 1 hour of training or after 1 billion frames
        if rank == 0:
            iteration_time = (time.time() - iter_start_time)
            time_elapsed = (time.time() - start_time) / 60
            train_mean_rew = np.mean(rews)
            train_max_rew = np.max(rews)
            logger.log('------------------------------------')
            logger.log('Iteration'.ljust(25) + '%f' % optimizer.iteration)
            logger.log('EvalMeanReward'.ljust(25) + '%f' % eval_mean_rew)
            logger.log('EvalMaxReward'.ljust(25) + '%f' % eval_max_rew)
            logger.log('TrainMeanReward'.ljust(25) + '%f' % train_mean_rew)
            logger.log('TrainMaxReward'.ljust(25) + '%f' % train_max_rew)
            logger.log('StepsSinceStart'.ljust(25) + '%f' % steps_passed)
            logger.log('StepsThisIter'.ljust(25) + '%f' % steps)
            logger.log('IterationTime'.ljust(25) + '%f' % iteration_time)
            logger.log('TimeSinceStart'.ljust(25) + '%f' % time_elapsed)

            # Give optimizer a chance to log its own stuff
            optimizer.log(logger)
            logger.log('------------------------------------')

            # Write stuff for training curve plot
            stat_string = "{},\t{},\t{},\t{},\t{},\t{}\n".\
                format(steps_passed, (time.time() - start_time),
                       eval_mean_rew, eval_max_rew, train_mean_rew, train_max_rew)
            logger.write_general_stat(stat_string)
            logger.write_optimizer_stat(optimizer.stat_string())

            # Save currently proposed solution every 20 iterations
            if optimizer.iteration % 20 == 1:
                logger.save_parameters(optimizer.parameters, optimizer.iteration)

    # Test best
    if rank == 0:
        final_rews = []
        for i in range(200):
            indd, p = optimizer.get_parameters()
            policy.set_parameters(p)
            e_rew, e_len = policy.rollout()
            final_rews.append(e_rew)
        final_eval = np.mean(final_rews)
        logger.log('Final'.ljust(25) + '%f' % final_eval)
        logger.save_parameters(optimizer.parameters, optimizer.iteration)
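To make the Allgather bookkeeping above easier to follow, here is a small self-contained sketch (plain NumPy, no MPI) of how each training worker's flat message of rewards, lengths and noise indices is packed and then unpacked row-wise on the receiving side; the numbers are made up for illustration.

import numpy as np

ep_per_cpu = 2
# Each training worker packs [rewards..., lengths..., noise indices...] into one int32 row.
worker_msgs = [
    np.array([10, 12, 800, 750, 3, 7], dtype=np.int32),   # worker 1
    np.array([5, 20, 600, 900, 1, 4], dtype=np.int32),    # worker 2
]
# Rank 0 contributes an all-zero row that is dropped, exactly like results[1:, :] above.
results = np.vstack([np.zeros(3 * ep_per_cpu, dtype=np.int32)] + worker_msgs)
results = results[1:, :]

rews = results[:, :ep_per_cpu].flatten()
lens = results[:, ep_per_cpu:(2 * ep_per_cpu)].flatten()
ids = results[:, (2 * ep_per_cpu):].flatten()
print(rews, lens, ids)   # [10 12  5 20] [800 750 600 900] [3 7 1 4]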
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # Meta Population
    meta_pop_size = 5
    meta_pop_active = list(range(meta_pop_size))
    next_halving_time = 40  # minutes until the next halving step
    mu_list = [5, 10, 20, 50, 100]

    # One cpu (rank 0) will evaluate results
    train_cpus = cpus - meta_pop_size

    # Deduce population size
    lam = train_cpus * ep_per_cpu

    # Create environment
    env = gym.make(env_name)

    # Create policy (Deep Neural Network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env,
                    network=configuration['network'],
                    nonlin_name=configuration['nonlin_name'])

    # Create reference batch used for normalization
    # It will be overwritten with vb from worker with rank 0
    vb = policy.get_vb()

    # Extract vector with current parameters.
    parameters_list = [policy.get_parameters() for count in range(meta_pop_size)]

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts in the same position
    for i in range(meta_pop_size):
        comm.Bcast([parameters_list[i], MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)

    if rank < meta_pop_size:
        parent_id = rank
        eval_moving_avg = 0
        forget_factor = 0.9
    if rank >= meta_pop_size:
        parent_id = int((rank - meta_pop_size) // (train_cpus / meta_pop_size))

    # Create optimizer with user defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(parameters_list, lam, rank, meta_pop_size,
                               parent_id, mu_list[parent_id], configuration["settings"])

    # Only rank 0 worker will log information from the training
    logger = None
    if rank < meta_pop_size:
        # TODO: Improve logger for meta pop
        # Initialize logger, save virtual batch and save some basic stuff at the beginning
        logger = Logger(optimizer.log_path(game, configuration['network'], run_name))
        if rank == 0:
            logger.save_vb(vb)

        # Log basic stuff
        logger.log('Game'.ljust(25) + '%s' % game, rank)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'], rank)
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'], rank)
        logger.log('Number of CPUs'.ljust(25) + '%d' % cpus, rank)
        logger.log('Population'.ljust(25) + '%d' % lam, rank)
        logger.log('Dimensionality'.ljust(25) + '%d' % len(parameters_list[0]), rank)

        # Log basic info from the optimizer
        # optimizer.log_basic(logger)

    # We will count number of steps
    # frames = 4 * steps (3 * steps for SpaceInvaders)
    steps_passed = 0
    while True:
        # Iteration start time
        iter_start_time = time.time()

        # Workers that run train episodes
        if rank >= meta_pop_size:
            # Empty arrays for each episode. We save: length, reward, noise index
            lens = [0] * ep_per_cpu
            rews = [0] * ep_per_cpu
            inds = [0] * ep_per_cpu
            parent_id_arr = [0] * ep_per_cpu

            # For each episode in this CPU we get new parameters,
            # update policy network and perform policy rollout
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                lens[i] = e_len
                rews[i] = e_rew
                inds[i] = ind
                parent_id_arr[i] = parent_id

            # Aggregate information, will later send it to each worker using MPI
            msg = np.array(rews + lens + inds + parent_id_arr, dtype=np.int32)

        # Worker rank 0 that runs evaluation episodes
        else:
            rews = [0] * ep_per_cpu
            lens = [0] * ep_per_cpu
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                rews[i] = e_rew
                lens[i] = e_len
            eval_mean_rew = np.mean(rews)
            eval_max_rew = np.max(rews)
            print("real mean {}".format(eval_mean_rew))
            eval_moving_avg = eval_mean_rew + forget_factor * (eval_moving_avg - eval_mean_rew)
            print("mean eval for rank {} is {}".format(rank, eval_moving_avg))

            # Empty array, evaluation results are not used for the update
            msg = np.array(eval_moving_avg, dtype=np.int32)
            # msg = np.zeros(3 * ep_per_cpu, dtype=np.int32)

        # MPI stuff
        # Initialize array which will be updated with information from all workers using MPI
        results = np.empty((cpus, 4 * ep_per_cpu), dtype=np.int32)
        comm.Allgather([msg, MPI.INT], [results, MPI.INT])
        eval_results = results[:meta_pop_size, 0]

        # Skip empty evaluation results from worker with id 0
        results = results[meta_pop_size:, :]

        # Extract IDs and rewards
        rews = results[:, :ep_per_cpu].flatten()
        lens = results[:, ep_per_cpu:(2 * ep_per_cpu)].flatten()
        ids = results[:, (2 * ep_per_cpu):(3 * ep_per_cpu)].flatten()
        par_id = results[:, (3 * ep_per_cpu):].flatten()

        rews_list = [0] * meta_pop_size
        ids_list = [0] * meta_pop_size
        train_mean_reward = [0] * meta_pop_size
        train_max_reward = [0] * meta_pop_size
        for id in meta_pop_active:
            rewards_id = [i for i, x in enumerate(par_id) if x == id]
            if not rewards_id:
                print("Warning: no training rewards collected for parent {}".format(id))
            rews_list[id] = [rews[i] for i in rewards_id]
            train_mean_reward[id] = np.mean(rews_list[id])
            train_max_reward[id] = np.max(rews_list[id])
            ids_list[id] = [ids[i] for i in rewards_id]

        # Update parameters
        for i in meta_pop_active:
            optimizer.update(ids=ids_list[i], rewards=rews_list[i])

        # =============== Successive Halving ==================
        if next_halving_time <= ((time.time() - start_time) / 60):
            print("Assigning good weights to bad {}".format((time.time() - start_time) / 60))
            print("Eval rewards list {}".format(eval_results))
            ranking = sorted(range(len(eval_results)), key=lambda k: eval_results[k], reverse=True)
            print("ranking {}".format(ranking))
            bottom = ranking[int(0.6 * meta_pop_size):]
            print("bottom {}".format(bottom))
            if parent_id in bottom:
                optimizer.assign_weights(ranking[int(len(ranking) - ranking.index(parent_id) - 1)])
                print("rank {} switch from {} to {}".format(
                    rank, parent_id, ranking[int(len(ranking) - ranking.index(parent_id) - 1)]))
            next_halving_time += 40
            # print("Halving now time passed {}".format(((time.time()-start_time)/60)))
            # eval_mean = []
            # for rank_i in range(meta_population):
            #     # print(eval_results[rank_i, :ep_per_cpu])
            #     eval_mean.append(np.mean(eval_results[rank_i, :ep_per_cpu]))
            # print("halving rewards list {}".format(eval_mean))
            # ranking = sorted(range(len(eval_mean)), key=lambda k: eval_mean[k], reverse=True)
            # print("ranking {}".format(ranking))
            # bottom = ranking[int(half_pop // 2):]
            # print("bottom {}".format(bottom))
            # if parent_id in bottom:
            #     old = parent_id
            #     parent_id = int(ranking.index(parent_id)-len(ranking)//2)
            #     print("switch from {} to {}".format(old, parent_id))
            # next_halving_time *= 2
            # half_pop /= 2
            # ep_per_cpu //= 2

        # Steps passed = Sum of episode steps from all offsprings
        steps = np.sum(lens)
        steps_passed += steps

        # Write some logs for this iteration
        # Using logs we are able to recover solution saved
        # after 1 hour of training or after 1 billion frames
        if rank < meta_pop_size:
            iteration_time = (time.time() - iter_start_time)
            time_elapsed = (time.time() - start_time) / 60
            train_mean_rew = np.mean(rews)
            train_max_rew = np.max(rews)
            logger.log('------------------------------------', rank)
            logger.log('Iteration'.ljust(25) + '%f' % (optimizer.iteration // meta_pop_size), rank)
            logger.log('EvalMeanReward'.ljust(25) + '%f' % eval_moving_avg, rank)
            logger.log('EvalMaxReward'.ljust(25) + '%f' % eval_max_rew, rank)
            logger.log('TrainMeanReward'.ljust(25) + '%f' % train_mean_rew, rank)
            logger.log('TrainMaxReward'.ljust(25) + '%f' % train_max_rew, rank)
            logger.log('StepsSinceStart'.ljust(25) + '%f' % steps_passed, rank)
            logger.log('StepsThisIter'.ljust(25) + '%f' % steps, rank)
            logger.log('IterationTime'.ljust(25) + '%f' % iteration_time, rank)
            logger.log('TimeSinceStart'.ljust(25) + '%f' % time_elapsed, rank)
            # Give optimizer a chance to log its own stuff
            # optimizer.log(logger)
            logger.log('------------------------------------', rank)

            # Write stuff for training curve plot
            stat_string = "{},\t{},\t{},\t{},\t{},\t{}\n". \
                format(steps_passed, (time.time() - start_time),
                       eval_moving_avg, eval_max_rew, train_mean_rew, train_max_rew)
            logger.write_general_stat(stat_string, rank)
            # logger.write_optimizer_stat(optimizer.stat_string())

            # Save currently proposed solution every 20 iterations
            if optimizer.iteration % 20 == 1:
                logger.save_parameters(optimizer.parameters, optimizer.iteration, rank)
import numpy as np
import torch

from src.agent import Agent

torch.manual_seed(1)
np.random.seed(1)

# Setting up bounds
position_bounds = (-1.2, 0.5)
velocity_bounds = (-0.07, 0.07)
actions = [-1.0, 0.0, 1.0]

episodes = 1500
epochs = 150
greed_factor = 0.1

# Instanced Policy
policy = Policy(2, len(actions))

# Instanced Model
model = Model(
    position_bounds,  # Position bounds
    velocity_bounds   # Velocity bounds
)

# Instanced Agent
agent = Agent(
    policy,        # NeuralNetwork class
    model,
    actions,       # Actions array (after discretization)
    episodes,      # Max number of episodes
    epochs,        # Max number of epochs per episode
    greed_factor   # Greed factor
)
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # One cpu (rank 0) will evaluate results
    train_cpus = cpus - 1

    # Deduce population size
    lam = train_cpus * ep_per_cpu

    # Create environment
    env = gym.make(env_name)

    # Create policy (Deep Neural Network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env,
                    network=configuration['network'],
                    nonlin_name=configuration['nonlin_name'])

    # Create reference batch used for normalization
    # It will be overwritten with vb from worker with rank 0
    vb = policy.get_vb()

    # Extract vector with current parameters.
    parameters = policy.get_parameters()

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts in the same position
    comm.Bcast([parameters, MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)

    # Create optimizer with user defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(parameters, lam, rank, configuration["settings"])

    # Only rank 0 worker will log information from the training
    logger = None
    if rank == 0:
        # Initialize logger, save virtual batch and save some basic stuff at the beginning
        logger = Logger(optimizer.log_path(game, configuration['network'], run_name))
        logger.save_vb(vb)

        # Log basic stuff
        logger.log('Game'.ljust(25) + '%s' % game)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'])
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'])

        # Write stuff for training curve plot
        stat_string = "{},\t{},\t{},\t{},\t{},\t{}\n".\
            format(steps_passed, (time.time() - start_time),
                   eval_mean_rew, eval_mean_rew1, BestScore, f_eval)
        logger.write_general_stat(stat_string)
        logger.write_optimizer_stat(optimizer.stat_string())

        # Save currently proposed solution every 20 iterations
        if iteration % 20 == 1:
            logger.save_parameters(BestFound, iteration)
    else:
        if iteration % 5 == 0:
            optimizer.updatesigma(updateCount)

    comm.Bcast([ppp, MPI.FLOAT], root=0)
    comm.Bcast([rews, MPI.FLOAT], root=0)
    comm.Bcast([sigmas, MPI.FLOAT], root=0)
    iteration += 1