def main(**exp):
    """Train one parameter vector on two games with evolution strategies.

    Every iteration evaluates a population of offspring on game 0,
    re-evaluates the same noise seeds on game 1, combines both sets of
    returns through ``obtain_proc_returns`` and takes one optimizer step on
    ``state.theta``.  The updated parameters are then evaluated on both
    games and the TensorFlow session is checkpointed under the log
    directory.

    Args:
        **exp: Experiment configuration.  Keys read directly here are
            ``model``, ``games`` (indices 0 and 1), ``iterations``,
            ``learn_option``, ``l2coeff`` and ``num_test_episodes``; the
            whole dict is also handed to ``TrainingState`` and
            ``make_offspring``.

    After the last iteration the function sends SIGTERM to its own process.
    """
    log_dir = tlogger.log_dir()
    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]
    noise = SharedNoiseTable()
    rs = np.random.RandomState()

    def make_env0(b):
        return gym_tensorflow.make(game=exp["games"][0], batch_size=b)

    def make_env1(b):
        return gym_tensorflow.make(game=exp["games"][1], batch_size=b)

    workers = [
        ConcurrentWorkers(make_env0, Model, batch_size=64),
        ConcurrentWorkers(make_env1, Model, batch_size=64),
    ]
    saver = tf.train.Saver()

    tlogger.info('Start timing')
    tf_sess = tf.Session()
    tf_sess.run(tf.global_variables_initializer())

    state = TrainingState(exp)
    state.initialize(rs, noise, workers[0].model)
    workers[0].initialize(tf_sess)
    workers[1].initialize(tf_sess)

    def evaluate_offspring(worker, *offspring_args):
        """Evaluate one population on ``worker`` and return its Offspring list.

        Extra positional arguments are forwarded to ``make_offspring``.  The
        frames the worker reports for this evaluation are added to
        ``state.num_frames``.
        """
        frames_before = tf_sess.run(worker.steps_counter)
        offspring = make_offspring(exp, noise, rs, worker, state,
                                   *offspring_args)
        iterator = iter(
            worker.monitor_eval(offspring, max_frames=state.tslimit * 4))
        results = []
        for pos_seeds, pos_reward, pos_length in iterator:
            # Results are consumed two at a time: each "pos" entry is
            # followed by the "neg" entry produced from the same seeds.
            neg_seeds, neg_reward, neg_length = next(iterator)
            assert pos_seeds == neg_seeds
            results.append(
                Offspring(pos_seeds, [pos_reward, neg_reward],
                          [pos_length, neg_length]))
        state.num_frames += tf_sess.run(worker.steps_counter) - frames_before
        return results

    for iteration in range(exp['iterations']):
        tlogger.info("BEGINNING ITERATION: {}".format(iteration))

        ##############
        ### GAME 0 ###
        ##############
        game0_results = evaluate_offspring(workers[0])
        game0_rewards = [r.rewards for r in game0_results]
        game0_episode_lengths = [r.ep_len for r in game0_results]
        game0_returns_n2 = np.array(game0_rewards)
        game0_noise_inds_n = [r.seeds for r in game0_results]
        save_pickle(iteration, log_dir, "game0_rewards", game0_rewards)
        save_pickle(iteration, log_dir, "game0_episode_lengths",
                    game0_episode_lengths)

        ##############
        ### GAME 1 ###
        ##############
        # Game 1 is evaluated on the seeds that game 0 just used.
        seeds_vector = np.array(game0_noise_inds_n)
        game1_results = evaluate_offspring(workers[1], seeds_vector)
        game1_rewards = [r.rewards for r in game1_results]
        game1_episode_lengths = [r.ep_len for r in game1_results]
        game1_returns_n2 = np.array(game1_rewards)
        game1_noise_inds_n = [r.seeds for r in game1_results]
        save_pickle(iteration, log_dir, "game1_rewards", game1_rewards)
        save_pickle(iteration, log_dir, "game1_episode_lengths",
                    game1_episode_lengths)
        tlogger.info("Saving offsprings seeds")
        save_pickle(iteration, log_dir, "offsprings_seeds",
                    game1_noise_inds_n)

        ####################
        ### UPDATE THETA ###
        ####################
        game_returns = [game0_returns_n2, game1_returns_n2]
        proc_returns = obtain_proc_returns(exp['learn_option'], game_returns)
        assert game0_noise_inds_n == game1_noise_inds_n
        # Concatenate the two seed lists (game 0 first, then game 1).
        noise_inds_n = game0_noise_inds_n + game1_noise_inds_n
        num_params = workers[1].model.num_params

        g, count = batched_weighted_sum(
            proc_returns[:, 0] - proc_returns[:, 1],
            (noise.get(idx, num_params) for idx in noise_inds_n),
            batch_size=500)
        # NOTE: gradients are scaled by \theta
        returns_n2 = np.array([a.rewards for a in game0_results] +
                              [a.rewards for a in game1_results])
        g /= returns_n2.size
        assert (g.shape == (num_params, ) and g.dtype == np.float32
                and count == len(noise_inds_n))
        _, state.theta = state.optimizer.update(
            -g + exp['l2coeff'] * state.theta)
        save_pickle(iteration, log_dir, "state", state)

        ######################
        ### EVALUATE ELITE ###
        ######################
        # Take a fresh per-worker baseline so the elite evaluation frames of
        # both games are counted exactly once (the offspring frames were
        # already added by evaluate_offspring).
        elite_frames_before = [tf_sess.run(w.steps_counter) for w in workers]

        _, test_evals, test_timesteps = workers[0].monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            num_episodes=exp['num_test_episodes'] // 2)[0]
        tlogger.info("game0 elite: {}".format(np.mean(test_evals)))
        save_pickle(iteration, log_dir, 'game0_elite', test_evals)
        save_pickle(iteration, log_dir, 'game0_elite_timestemps',
                    test_timesteps)

        _, test_evals, test_timesteps = workers[1].monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            num_episodes=exp['num_test_episodes'] // 2)[0]
        tlogger.info("game1 elite: {}".format(np.mean(test_evals)))
        save_pickle(iteration, log_dir, "game1_elite", test_evals)
        save_pickle(iteration, log_dir, 'game1_elite_timestemps',
                    test_timesteps)

        state.num_frames += sum(
            tf_sess.run(w.steps_counter) - before
            for w, before in zip(workers, elite_frames_before))

        saver.save(tf_sess, "{}/model-{}".format(log_dir, state.it))
        state.it += 1

    # Terminate the whole process once training is finished.
    os.kill(os.getpid(), signal.SIGTERM)
def main(**exp):
    """Run single-game evolution-strategies training with snapshot resume.

    On start-up the function tries to unpickle ``snapshot.pkl`` from the log
    directory.  If that file does not exist a fresh ``TrainingState`` is
    created and its parameters are either rebuilt from the seeds named by
    ``exp['load_from']`` or initialised from scratch.  The loop then
    repeatedly evaluates a mirrored population, takes one optimizer step,
    evaluates the updated parameters, logs tabular statistics and rewrites
    the snapshot, until ``state.timesteps_so_far`` reaches
    ``exp['timesteps']``.

    Args:
        **exp: Experiment configuration.  Keys read directly here are
            ``model``, ``game``, ``population_size``, ``num_test_episodes``,
            ``timesteps``, ``return_proc_mode``, ``l2coeff`` and the optional
            ``load_from``; the whole dict is also handed to
            ``TrainingState``.

    Raises:
        NotImplementedError: If ``exp['return_proc_mode']`` is anything
            other than ``'centered_rank'``.
    """
    log_dir = tlogger.log_dir()
    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]
    all_tstart = time.time()

    def make_env(b):
        return gym_tensorflow.make(game=exp["game"], batch_size=b)

    worker = ConcurrentWorkers(make_env, Model, batch_size=64)
    with WorkerSession(worker) as sess:
        noise = SharedNoiseTable()
        rs = np.random.RandomState()
        tlogger.info('Start timing')
        tstart = time.time()

        try:
            load_file = os.path.join(log_dir, 'snapshot.pkl')
            # Read-only mode is enough for loading, so write-protected
            # snapshots can be resumed too.
            # NOTE: unpickling runs code from the file; only resume from
            # snapshots you trust.
            with open(load_file, 'rb') as file:
                state = pickle.load(file)
            tlogger.info("Loaded iteration {} from {}".format(
                state.it, load_file))
        except FileNotFoundError:
            tlogger.info('Failed to load snapshot')
            state = TrainingState(exp)

            if 'load_from' in exp:
                dirname = os.path.join(os.path.dirname(__file__), '..',
                                       'neuroevolution', 'ga_legacy.py')
                load_from = exp['load_from'].format(**exp)
                # The helper script writes seeds.pkl into the current
                # working directory; its exit status is not checked.
                os.system('python {} {} seeds.pkl'.format(dirname, load_from))
                with open('seeds.pkl', 'rb') as file:
                    seeds = pickle.load(file)
                state.set_theta(
                    worker.model.compute_weights_from_seeds(noise, seeds))
                tlogger.info('Loaded initial theta from {}'.format(load_from))
            else:
                state.initialize(rs, noise, worker.model)

        def make_offspring(state):
            """Yield ``(theta, idx)`` for a +/- mutation pair per noise index."""
            for _ in range(exp['population_size'] // 2):
                idx = noise.sample_index(rs, worker.model.num_params)
                mutation_power = state.sample(state.mutation_power)
                pos_theta = worker.model.compute_mutation(
                    noise, state.theta, idx, mutation_power)
                yield (pos_theta, idx)

                neg_theta = worker.model.compute_mutation(
                    noise, state.theta, idx, -mutation_power)
                # The two mutations must be symmetric around state.theta.
                diff = np.max(
                    np.abs((pos_theta + neg_theta) / 2 - state.theta))
                assert diff < 1e-5, 'Diff too large: {}'.format(diff)
                yield (neg_theta, idx)

        tlogger.info('Start training')
        _, initial_performance, _ = worker.monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            num_episodes=exp['num_test_episodes'])[0]

        while True:
            tstart_iteration = time.time()
            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break

            frames_computed_so_far = sess.run(worker.steps_counter)

            tlogger.info('Evaluating perturbations')
            iterator = iter(
                worker.monitor_eval(make_offspring(state),
                                    max_frames=state.tslimit * 4))
            results = []
            for pos_seeds, pos_reward, pos_length in iterator:
                # Each positive result is followed by its negative twin.
                neg_seeds, neg_reward, neg_length = next(iterator)
                assert pos_seeds == neg_seeds
                results.append(
                    Offspring(pos_seeds, [pos_reward, neg_reward],
                              [pos_length, neg_length]))
            state.num_frames += sess.run(
                worker.steps_counter) - frames_computed_so_far

            state.it += 1
            tlogger.record_tabular('Iteration', state.it)
            tlogger.record_tabular('MutationPower',
                                   state.sample(state.mutation_power))
            tlogger.record_tabular('TimestepLimitPerEpisode', state.tslimit)

            # Trim unwanted results
            results = results[:exp['population_size'] // 2]
            assert len(results) == exp['population_size'] // 2
            rewards = np.array([b for a in results for b in a.rewards])

            timesteps_this_iter = sum(a.training_steps for a in results)
            state.timesteps_so_far += timesteps_this_iter

            tlogger.record_tabular('PopulationEpRewMax', np.max(rewards))
            tlogger.record_tabular('PopulationEpRewMean', np.mean(rewards))
            tlogger.record_tabular('PopulationEpRewMedian',
                                   np.median(rewards))
            tlogger.record_tabular('PopulationEpCount', len(rewards))
            tlogger.record_tabular('PopulationTimesteps', timesteps_this_iter)

            # Update Theta
            returns_n2 = np.array([a.rewards for a in results])
            noise_inds_n = [a.seeds for a in results]

            if exp['return_proc_mode'] == 'centered_rank':
                proc_returns_n2 = compute_centered_ranks(returns_n2)
            else:
                raise NotImplementedError(exp['return_proc_mode'])

            # Compute and take step
            g, count = batched_weighted_sum(
                proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
                (noise.get(idx, worker.model.num_params)
                 for idx in noise_inds_n),
                batch_size=500)
            # NOTE: gradients are scaled by \theta
            g /= returns_n2.size

            assert (g.shape == (worker.model.num_params, )
                    and g.dtype == np.float32
                    and count == len(noise_inds_n))
            _, state.theta = state.optimizer.update(
                -g + exp['l2coeff'] * state.theta)

            time_elapsed_this_iter = time.time() - tstart_iteration
            state.time_elapsed += time_elapsed_this_iter

            tlogger.info('Evaluate elite')
            _, test_evals, test_timesteps = worker.monitor_eval_repeated(
                [(state.theta, 0)],
                max_frames=None,
                num_episodes=exp['num_test_episodes'])[0]
            test_timesteps = sum(test_timesteps)

            # Log Results
            tlogger.record_tabular('TestRewMean', np.mean(test_evals))
            tlogger.record_tabular('TestRewMedian', np.median(test_evals))
            tlogger.record_tabular('TestEpCount', len(test_evals))
            tlogger.record_tabular('TestEpLenSum', test_timesteps)
            tlogger.record_tabular('InitialRewMax',
                                   np.max(initial_performance))
            tlogger.record_tabular('InitialRewMean',
                                   np.mean(initial_performance))
            tlogger.record_tabular('InitialRewMedian',
                                   np.median(initial_performance))

            tlogger.record_tabular('TimestepsThisIter', timesteps_this_iter)
            tlogger.record_tabular(
                'TimestepsPerSecondThisIter',
                timesteps_this_iter / (time.time() - tstart_iteration))
            tlogger.record_tabular('TimestepsComputed', state.num_frames)
            tlogger.record_tabular('TimestepsSoFar', state.timesteps_so_far)
            tlogger.record_tabular('TimeElapsedThisIter',
                                   time_elapsed_this_iter)
            tlogger.record_tabular('TimeElapsedThisIterTotal',
                                   time.time() - tstart_iteration)
            tlogger.record_tabular('TimeElapsed', state.time_elapsed)
            tlogger.record_tabular('TimeElapsedTotal',
                                   time.time() - all_tstart)
            tlogger.dump_tabular()

            fps = state.timesteps_so_far / (time.time() - tstart)
            remaining_timesteps = exp['timesteps'] - state.timesteps_so_far
            # With no recorded timesteps the rate is zero; report an
            # unbounded ETA rather than dividing by zero.
            if fps > 0:
                eta_hours = remaining_timesteps / fps / 3600
            else:
                eta_hours = float('inf')
            tlogger.info(
                'Timesteps Per Second: {:.0f}. Elapsed: {:.2f}h ETA {:.2f}h'.
                format(fps, (time.time() - all_tstart) / 3600, eta_hours))

            if state.adaptive_tslimit:
                hit_limit_fraction = np.mean(
                    [a.training_steps >= state.tslimit for a in results])
                if hit_limit_fraction > state.incr_tslimit_threshold:
                    state.tslimit = min(
                        state.tslimit * state.tslimit_incr_ratio,
                        state.tslimit_max)
                    tlogger.info('Increased threshold to {}'.format(
                        state.tslimit))

            os.makedirs(log_dir, exist_ok=True)
            save_file = os.path.join(log_dir, 'snapshot.pkl')
            with open(save_file, 'wb') as file:
                pickle.dump(state, file)
            tlogger.info("Saved iteration {} to {}".format(
                state.it, save_file))

            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break
            results.clear()
def main(**exp):
    """Run ES training on one or two games, optionally rewarding episode length.

    ``f_isSingleTask(exp)`` selects the mode:

    * Single-task: only game 0 is evaluated.  The update direction is
      ``w * g_returns + (1 - w) * g_frames`` with ``w = exp['w']``, where the
      two terms are built from centered ranks of the game-0 returns and of
      the game-0 episode lengths respectively.
    * Two-game: game 1 is evaluated on the seeds used for game 0 and the
      returns are combined by ``obtain_proc_returns``.  No frame ranks are
      computed in this mode, so the update uses the returns term alone and
      ``exp['w']`` is not read.

    Args:
        **exp: Experiment configuration.  Keys read directly here are
            ``model``, ``games``, ``iterations``, ``l2coeff``,
            ``num_test_episodes``, plus ``w`` (single-task) or
            ``learn_option`` (two-game); the whole dict is also handed to
            ``TrainingState``, ``make_offspring`` and ``f_isSingleTask``.

    After the last iteration the function sends SIGTERM to its own process.
    """
    log_dir = tlogger.log_dir()
    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]
    noise = SharedNoiseTable()
    rs = np.random.RandomState()

    def make_env0(b):
        return gym_tensorflow.make(game=exp["games"][0], batch_size=b)

    def make_env1(b):
        return gym_tensorflow.make(game=exp["games"][1], batch_size=b)

    workers = [
        ConcurrentWorkers(make_env0, Model, batch_size=64),
        ConcurrentWorkers(make_env1, Model, batch_size=64),
    ]

    tlogger.info('Start timing')
    tf_sess = tf.Session()
    tf_sess.run(tf.global_variables_initializer())

    state = TrainingState(exp)
    state.initialize(rs, noise, workers[0].model)
    workers[0].initialize(tf_sess)
    workers[1].initialize(tf_sess)

    # Assumed to depend only on the (unchanging) configuration, so it is
    # evaluated once instead of at every use.
    single_task = f_isSingleTask(exp)

    def evaluate_offspring(worker, *offspring_args):
        """Evaluate one population on ``worker`` and return its Offspring list.

        Extra positional arguments are forwarded to ``make_offspring``.  The
        frames the worker reports for this evaluation are added to
        ``state.num_frames``.
        """
        frames_before = tf_sess.run(worker.steps_counter)
        offspring = make_offspring(exp, noise, rs, worker, state,
                                   *offspring_args)
        iterator = iter(
            worker.monitor_eval(offspring, max_frames=state.tslimit * 4))
        results = []
        for pos_seeds, pos_reward, pos_length in iterator:
            # Results are consumed two at a time: each "pos" entry is
            # followed by the "neg" entry produced from the same seeds.
            neg_seeds, neg_reward, neg_length = next(iterator)
            assert pos_seeds == neg_seeds
            results.append(
                Offspring(pos_seeds, [pos_reward, neg_reward],
                          [pos_length, neg_length]))
        state.num_frames += tf_sess.run(worker.steps_counter) - frames_before
        return results

    for iteration in range(exp['iterations']):
        tlogger.info("BEGINNING ITERATION: {}".format(iteration))

        ##############
        ### GAME 0 ###
        ##############
        game0_results = evaluate_offspring(workers[0])
        game0_rewards = [r.rewards for r in game0_results]
        game0_episode_lengths = [r.ep_len for r in game0_results]
        game0_returns_n2 = np.array(game0_rewards)
        game0_noise_inds_n = [r.seeds for r in game0_results]
        save_pickle(iteration, log_dir, "game0_rewards", game0_rewards)
        save_pickle(iteration, log_dir, "game0_episode_lengths",
                    game0_episode_lengths)

        ##############
        ### GAME 1 ###
        ##############
        if single_task:
            # Nothing is evaluated on game 1; keep the per-iteration pickles
            # well-formed by writing empty lists and the game-0 seeds.
            game1_results = []
            game1_rewards = []
            game1_episode_lengths = []
            game1_returns_n2 = game0_returns_n2
            game1_noise_inds_n = game0_noise_inds_n
        else:
            # Game 1 is evaluated on the seeds that game 0 just used.
            seeds_vector = np.array(game0_noise_inds_n)
            game1_results = evaluate_offspring(workers[1], seeds_vector)
            game1_rewards = [r.rewards for r in game1_results]
            game1_episode_lengths = [r.ep_len for r in game1_results]
            game1_returns_n2 = np.array(game1_rewards)
            game1_noise_inds_n = [r.seeds for r in game1_results]
        save_pickle(iteration, log_dir, "game1_rewards", game1_rewards)
        save_pickle(iteration, log_dir, "game1_episode_lengths",
                    game1_episode_lengths)
        tlogger.info("Saving offsprings seeds")
        save_pickle(iteration, log_dir, "offsprings_seeds",
                    game1_noise_inds_n)

        ####################
        ### UPDATE THETA ###
        ####################
        if single_task:
            proc_frames = compute_centered_ranks(
                np.asarray(game0_episode_lengths))
            proc_returns = compute_centered_ranks(game0_returns_n2)
            noise_inds_n = game0_noise_inds_n
            num_params = workers[0].model.num_params
        else:
            # No frame ranks exist for the two-game case.
            proc_frames = None
            game_returns = [game0_returns_n2, game1_returns_n2]
            proc_returns = obtain_proc_returns(exp['learn_option'],
                                               game_returns)
            assert game0_noise_inds_n == game1_noise_inds_n
            # Concatenate the two seed lists (game 0 first, then game 1).
            noise_inds_n = game0_noise_inds_n + game1_noise_inds_n
            num_params = workers[1].model.num_params

        g_returns, count = batched_weighted_sum(
            proc_returns[:, 0] - proc_returns[:, 1],
            (noise.get(idx, num_params) for idx in noise_inds_n),
            batch_size=500)

        if proc_frames is None:
            # Returns-only update; using proc_frames here would fail because
            # it is never computed in two-game mode.
            g = g_returns
        else:
            g_frames, count_frames = batched_weighted_sum(
                proc_frames[:, 0] - proc_frames[:, 1],
                (noise.get(idx, num_params) for idx in noise_inds_n),
                batch_size=500)
            assert count_frames == count
            w = exp['w']
            g = w * g_returns + (1 - w) * g_frames

        returns_n2 = np.array([a.rewards for a in game0_results] +
                              [a.rewards for a in game1_results])
        g /= returns_n2.size
        assert (g.shape == (num_params, ) and g.dtype == np.float32
                and count == len(noise_inds_n))
        _, state.theta = state.optimizer.update(
            -g + exp['l2coeff'] * state.theta)
        save_pickle(iteration, log_dir, "state", state)

        ######################
        ### EVALUATE ELITE ###
        ######################
        # Take a fresh per-worker baseline so the elite evaluation frames are
        # counted exactly once (the offspring frames were already added by
        # evaluate_offspring).
        elite_frames_before = [tf_sess.run(w.steps_counter) for w in workers]

        # All test episodes go to game 0 in single-task mode, half otherwise.
        game0_test_episodes = exp['num_test_episodes'] // (
            1 if single_task else 2)
        _, test_evals, test_timesteps = workers[0].monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            num_episodes=game0_test_episodes)[0]
        tlogger.info("game0 elite: {}".format(np.mean(test_evals)))
        tlogger.info("game0 elite frames max: {}".format(
            np.max(test_timesteps)))
        tlogger.info("game0 elite frames mean: {}".format(
            np.mean(test_timesteps)))
        tlogger.info("game0 elite frames min: {}".format(
            np.min(test_timesteps)))
        tlogger.info("game0 offspring frames max: {}".format(
            np.max(game0_episode_lengths)))
        tlogger.info("game0 offspring frames mean: {}".format(
            np.mean(game0_episode_lengths)))
        tlogger.info("game0 offspring frames min: {}".format(
            np.min(game0_episode_lengths)))
        save_pickle(iteration, log_dir, 'game0_elite', test_evals)
        save_pickle(iteration, log_dir, 'game0_elite_timestemps',
                    test_timesteps)

        if not single_task:
            _, test_evals, test_timesteps = workers[1].monitor_eval_repeated(
                [(state.theta, 0)],
                max_frames=None,
                num_episodes=exp['num_test_episodes'] // 2)[0]
            tlogger.info("game1 elite: {}".format(np.mean(test_evals)))
            save_pickle(iteration, log_dir, "game1_elite", test_evals)
            save_pickle(iteration, log_dir, 'game1_elite_timestemps',
                        test_timesteps)

        state.num_frames += sum(
            tf_sess.run(w.steps_counter) - before
            for w, before in zip(workers, elite_frames_before))
        state.it += 1

    # Terminate the whole process once training is finished.
    os.kill(os.getpid(), signal.SIGTERM)