def main(exp, log_dir):
    """Run the population-based (mutate / select / elite) training loop.

    Args:
        exp: Experiment configuration dict. Keys read here: 'model', 'game',
            'selection_threshold', 'population_size', 'timesteps',
            'validation_threshold', 'num_validation_episodes',
            'num_test_episodes', and optionally 'load_from' and
            'load_population'. The dict is also handed to ``TrainingState``.
        log_dir: Passed to ``tlogger.log_dir``; the value it returns is used
            for snapshots and for ``fitness.log``.

    Returns:
        A tuple ``(test_score, {'val': validation_score})`` built from
        ``state.curr_solution_test`` and ``state.curr_solution_val``.
    """
    log_dir = tlogger.log_dir(log_dir)
    snap_idx = 0
    snapshots = []

    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]
    all_tstart = time.time()

    def make_env(b):
        return gym_tensorflow.make(game=exp["game"], batch_size=b)

    worker = ConcurrentWorkers(make_env, Model, batch_size=64)
    with WorkerSession(worker) as sess:
        rs = np.random.RandomState()
        noise = None
        state = None
        cached_parents = []
        results = []

        def make_offspring():
            # No parents yet: start from a random individual.
            if len(cached_parents) == 0:
                return worker.model.randomize(rs, noise)
            assert len(cached_parents) == exp['selection_threshold']
            parent = cached_parents[rs.randint(len(cached_parents))]
            return worker.model.mutate(
                parent, rs, noise,
                mutation_power=state.sample(state.mutation_power))

        def build_parents(use_cache):
            """Return the (weights, seeds) pairs for the next parent set.

            The elite always gets a slot: when it is not already among the
            top ``selection_threshold`` individuals it takes the first slot
            and the last selected individual is dropped.
            """
            threshold = exp['selection_threshold']
            # Only forward ``cache`` when requested so the restart path keeps
            # calling compute_weights_from_seeds without that keyword.
            extra = {'cache': cached_parents} if use_cache else {}

            def entry(seeds):
                weights = worker.model.compute_weights_from_seeds(
                    noise, seeds, **extra)
                return (weights, seeds)

            if state.elite in state.population[:threshold]:
                return [entry(o.seeds) for o in state.population[:threshold]]
            selected = [entry(state.elite.seeds)]
            selected.extend(
                entry(o.seeds) for o in state.population[:threshold - 1])
            return selected

        tlogger.info('Start timing')
        tstart = time.time()

        load_file = os.path.join(log_dir, 'snapshot.pkl')

        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load snapshots from trusted locations.
        if 'load_from' in exp:
            filename = os.path.join(log_dir, exp['load_from'])
            # Read-only mode: the snapshot is never written through this
            # handle, and 'rb+' would fail on read-only files.
            with open(filename, 'rb') as file:
                state = pickle.load(file)
            state.timesteps_so_far = 0  # Reset timesteps to 0
            state.it = 0
            state.max_reward = 0
            state.max_avg = 0
            state.max_sd = 0
            tlogger.info('Loaded initial policy from {}'.format(filename))
        elif os.path.exists(load_file):
            try:
                with open(load_file, 'rb') as file:
                    state = pickle.load(file)
                tlogger.info("Loaded iteration {} from {}".format(
                    state.it, load_file))
            except FileNotFoundError:
                # The file vanished between the exists() check and open().
                tlogger.info('Failed to load snapshot')

        # Nothing above ever assigns a noise table, so one is always created
        # here (the former "Using noise table from snapshot" branch was
        # unreachable).
        tlogger.info("Generating new noise table")
        noise = SharedNoiseTable()

        if state is None:
            tlogger.info("Generation new TrainingState")
            state = TrainingState(exp)
            if 'load_population' in exp:
                state.copy_population(exp['load_population'])

        # Cache first population if needed (on restart)
        if state.population and exp['selection_threshold'] > 0:
            tlogger.info("Caching parents")
            restored_parents = build_parents(use_cache=False)
            cached_parents.clear()
            cached_parents.extend(restored_parents)
            tlogger.info("Done caching parents")

        while True:
            tstart_iteration = time.time()
            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break

            frames_computed_so_far = sess.run(worker.steps_counter)
            assert (len(cached_parents) == 0 and state.it == 0) or \
                len(cached_parents) == exp['selection_threshold']

            # Evaluate one episode per offspring.
            tasks = [make_offspring() for _ in range(exp['population_size'])]
            for seeds, episode_reward, episode_length in worker.monitor_eval(
                    tasks, max_frames=state.tslimit * 4):
                results.append(
                    Offspring(seeds, [episode_reward], [episode_length]))
            state.num_frames += sess.run(
                worker.steps_counter) - frames_computed_so_far

            state.it += 1
            tlogger.record_tabular('Iteration', state.it)
            tlogger.record_tabular('MutationPower',
                                   state.sample(state.mutation_power))

            # Trim unwanted results
            results = results[:exp['population_size']]
            assert len(results) == exp['population_size']
            rewards = np.array([a.fitness for a in results])
            population_timesteps = sum([a.training_steps for a in results])

            state.population = sorted(results,
                                      key=lambda x: x.fitness,
                                      reverse=True)
            state.max_reward = save_best_pop_member(
                state.max_reward, np.max(rewards), state, state.population[0])
            tlogger.record_tabular('PopulationEpRewMax', np.max(rewards))
            tlogger.record_tabular('PopulationEpRewMean', np.mean(rewards))
            tlogger.record_tabular('PopulationEpCount', len(rewards))
            tlogger.record_tabular('PopulationTimesteps',
                                   population_timesteps)
            tlogger.record_tabular('NumSelectedIndividuals',
                                   exp['selection_threshold'])

            tlogger.info('Evaluate population')
            validation_population = state.population[
                :exp['validation_threshold']]
            if state.elite is not None:
                # Re-validate the current elite in place of the last entry.
                validation_population = (
                    [state.elite] + validation_population[:-1])

            # Iterate the candidates themselves instead of
            # range(validation_threshold), which raised IndexError whenever
            # the population was smaller than the threshold.
            validation_tasks = [
                (worker.model.compute_weights_from_seeds(
                    noise, candidate.seeds, cache=cached_parents),
                 candidate.seeds)
                for candidate in validation_population
            ]
            _, population_validation, population_validation_len = zip(
                *worker.monitor_eval_repeated(
                    validation_tasks,
                    max_frames=state.tslimit * 4,
                    num_episodes=exp['num_validation_episodes']))

            # (mean, std) of the validation rewards of every candidate.
            fitness_results = [(np.mean(x), np.std(x))
                               for x in population_validation]
            it_max_avg = np.max([avg for avg, _ in fitness_results])
            it_max_sd = np.max([sd for _, sd in fitness_results])
            state.max_avg = np.max([state.max_avg, it_max_avg])
            state.max_sd = np.max([state.max_sd, it_max_sd])
            tlogger.info("Max Average: {}".format(state.max_avg))
            tlogger.info("Max Std: {}".format(state.max_sd))

            with open(os.path.join(log_dir, 'fitness.log'), 'a') as f:
                f.write("{},{},{}: {}\n".format(
                    state.it, state.max_avg, state.max_sd, ','.join([
                        "({},{})".format(x[0], x[1]) for x in fitness_results
                    ])))

            population_fitness = [
                fitness(x[0], x[1], state.max_avg, state.max_sd)
                for x in fitness_results
            ]
            tlogger.info("Fitness: {}".format(population_fitness))
            population_validation_len = [
                np.sum(x) for x in population_validation_len
            ]

            time_elapsed_this_iter = time.time() - tstart_iteration
            state.time_elapsed += time_elapsed_this_iter

            # The candidate with the smallest fitness value becomes the elite.
            population_elite_idx = np.argmin(population_fitness)
            state.elite = validation_population[population_elite_idx]
            elite_theta = worker.model.compute_weights_from_seeds(
                noise, state.elite.seeds, cache=cached_parents)
            (_, population_elite_evals,
             population_elite_evals_timesteps) = worker.monitor_eval_repeated(
                 [(elite_theta, state.elite.seeds)],
                 max_frames=None,
                 num_episodes=exp['num_test_episodes'])[0]

            # Log Results
            validation_timesteps = sum(population_validation_len)
            timesteps_this_iter = population_timesteps + validation_timesteps
            state.timesteps_so_far += timesteps_this_iter
            state.validation_timesteps_so_far += validation_timesteps

            tlogger.record_tabular(
                'TruncatedPopulationRewMean',
                np.mean([a.fitness for a in validation_population]))
            tlogger.record_tabular('TruncatedPopulationValidationFitMean',
                                   np.mean(population_fitness))
            tlogger.record_tabular('TruncatedPopulationValidationFitMax',
                                   np.max(population_fitness))
            tlogger.record_tabular('TruncatedPopulationValidationFitMin',
                                   np.min(population_fitness))
            tlogger.record_tabular('TruncatedPopulationValidationMaxAvg',
                                   state.max_avg)
            tlogger.record_tabular('TruncatedPopulationValidationMaxStd',
                                   state.max_sd)
            tlogger.record_tabular('TruncatedPopulationEliteValidationFitMin',
                                   np.min(population_fitness))
            tlogger.record_tabular("TruncatedPopulationEliteIndex",
                                   population_elite_idx)
            tlogger.record_tabular('TruncatedPopulationEliteSeeds',
                                   state.elite.seeds)
            tlogger.record_tabular('TruncatedPopulationEliteTestRewMean',
                                   np.mean(population_elite_evals))
            tlogger.record_tabular('TruncatedPopulationEliteTestRewStd',
                                   np.std(population_elite_evals))
            tlogger.record_tabular('TruncatedPopulationEliteTestEpCount',
                                   len(population_elite_evals))
            tlogger.record_tabular('TruncatedPopulationEliteTestEpLenSum',
                                   np.sum(population_elite_evals_timesteps))

            # Track the best solution seen so far, judged by the mean over
            # all validation rewards of this iteration.
            validation_mean = np.mean(population_validation)
            if validation_mean > state.curr_solution_val:
                state.curr_solution = state.elite.seeds
                state.curr_solution_val = validation_mean
                state.curr_solution_test = np.mean(population_elite_evals)

            tlogger.record_tabular('ValidationTimestepsThisIter',
                                   validation_timesteps)
            tlogger.record_tabular('ValidationTimestepsSoFar',
                                   state.validation_timesteps_so_far)
            tlogger.record_tabular('TimestepsThisIter', timesteps_this_iter)
            tlogger.record_tabular(
                'TimestepsPerSecondThisIter',
                timesteps_this_iter / (time.time() - tstart_iteration))
            tlogger.record_tabular('TimestepsComputed', state.num_frames)
            tlogger.record_tabular('TimestepsSoFar', state.timesteps_so_far)
            tlogger.record_tabular('TimeElapsedThisIter',
                                   time_elapsed_this_iter)
            tlogger.record_tabular('TimeElapsedThisIterTotal',
                                   time.time() - tstart_iteration)
            tlogger.record_tabular('TimeElapsed', state.time_elapsed)
            tlogger.record_tabular('TimeElapsedTotal',
                                   time.time() - all_tstart)
            tlogger.dump_tabular()

            fps = state.timesteps_so_far / (time.time() - tstart)
            tlogger.info(
                'Timesteps Per Second: {:.0f}. Elapsed: {:.2f}h ETA {:.2f}h'.
                format(fps, (time.time() - all_tstart) / 3600,
                       (exp['timesteps'] - state.timesteps_so_far) / fps /
                       3600))

            if state.adaptive_tslimit:
                # Raise the per-episode limit when enough episodes hit it.
                hit_limit = np.mean(
                    [a.training_steps >= state.tslimit for a in results])
                if hit_limit > state.incr_tslimit_threshold:
                    state.tslimit = min(
                        state.tslimit * state.tslimit_incr_ratio,
                        state.tslimit_max)
                    tlogger.info('Increased threshold to {}'.format(
                        state.tslimit))

            snap_idx, snapshots = save_snapshot(state, log_dir, snap_idx,
                                                snapshots)
            tlogger.info("Saved iteration {} to {}".format(
                state.it, snapshots[snap_idx - 1]))

            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break

            results.clear()

            if exp['selection_threshold'] > 0:
                tlogger.info("Caching parents")
                # Build the new list first: the old cache is still read
                # (cache=cached_parents) while the new weights are computed.
                new_parents = build_parents(use_cache=True)
                cached_parents.clear()
                cached_parents.extend(new_parents)
                tlogger.info("Done caching parents")

    return float(state.curr_solution_test), {
        'val': float(state.curr_solution_val)
    }
def main(**exp):
    """Run the two-game evolution-strategies training loop.

    Offspring are evaluated as antithetic pairs (the same noise index with
    +/- mutation power); the centred-rank difference of each pair weights its
    noise vector in the parameter update.

    Args:
        **exp: Experiment configuration. Keys read here: 'model', 'games'
            (indices 0 and 1), 'population_size', 'timesteps',
            'num_test_episodes', 'return_proc_mode' (only 'centered_rank' is
            implemented) and 'l2coeff'. The dict is also handed to
            ``TrainingState``.

    Raises:
        NotImplementedError: If ``exp['return_proc_mode']`` is not
            'centered_rank'.
    """
    log_dir = tlogger.log_dir()

    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]
    all_tstart = time.time()

    def make_env_game0(b):
        return gym_tensorflow.make(game=exp['games'][0], batch_size=b)

    def make_env_game1(b):
        return gym_tensorflow.make(game=exp['games'][1], batch_size=b)

    def make_offspring(state):
        # Yields (theta, noise_index) twice per index: +power, then -power.
        for _ in range(exp['population_size'] // 2):
            idx = noise.sample_index(rs, worker.model.num_params)
            mutation_power = state.sample(state.mutation_power)
            pos_theta = worker.model.compute_mutation(noise, state.theta, idx,
                                                      mutation_power)
            yield (pos_theta, idx)

            neg_theta = worker.model.compute_mutation(noise, state.theta, idx,
                                                      -mutation_power)
            # Sanity check: the pair must be symmetric around state.theta.
            diff = np.max(np.abs((pos_theta + neg_theta) / 2 - state.theta))
            assert diff < 1e-5, 'Diff too large: {}'.format(diff)
            yield (neg_theta, idx)

    worker = MTConcurrentWorkers([make_env_game0, make_env_game1],
                                 Model,
                                 batch_size=32)
    print("=== [mtes] worker.sess = {}".format(worker.sess))
    with WorkerSession(worker) as sess:
        print("=== [mtes] worker.sess = {}".format(worker.sess))
        noise = SharedNoiseTable()
        rs = np.random.RandomState()
        tlogger.info('Start timing')
        tstart = time.time()
        state = TrainingState(exp)
        state.initialize(rs, noise, worker.model)
        tlogger.info('Start training')
        # Baseline performance of the initial parameters, logged every
        # iteration as the Initial* columns.
        _, _, initial_performance, _ = worker.monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            num_episodes=exp['num_test_episodes'])[0]
        print("=== past worker.monitor_eval_repeated")

        while True:
            print("=== next cycle in while loop")
            tstart_iteration = time.time()
            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break

            frames_computed_so_far = sess.run(worker.steps_counter)
            print("=== frames_captured_so_far = {}".format(
                frames_computed_so_far))

            tlogger.info('Evaluating perturbations')
            iterator = iter(
                worker.monitor_eval(make_offspring(state),
                                    max_frames=state.tslimit * 4))
            results = []
            # Results arrive as consecutive (positive, negative) pairs, so
            # the second half of each pair is pulled with next().
            for _, pos_seeds, pos_reward, pos_length in iterator:
                _, neg_seeds, neg_reward, neg_length = next(iterator)
                assert pos_seeds == neg_seeds
                results.append(
                    Offspring(pos_seeds, [pos_reward, neg_reward],
                              [pos_length, neg_length]))
            state.num_frames += sess.run(
                worker.steps_counter) - frames_computed_so_far

            state.it += 1
            tlogger.record_tabular('Iteration', state.it)
            tlogger.record_tabular('MutationPower',
                                   state.sample(state.mutation_power))
            tlogger.record_tabular('TimestepLimitPerEpisode', state.tslimit)

            # Trim unwanted results
            results = results[:exp['population_size'] // 2]
            assert len(results) == exp['population_size'] // 2
            rewards = np.array([b for a in results for b in a.rewards])

            timesteps_this_iter = sum([a.training_steps for a in results])
            state.timesteps_so_far += timesteps_this_iter

            tlogger.record_tabular('PopulationEpRewMax', np.max(rewards))
            tlogger.record_tabular('PopulationEpRewMean', np.mean(rewards))
            tlogger.record_tabular('PopulationEpRewMedian',
                                   np.median(rewards))
            tlogger.record_tabular('PopulationEpCount', len(rewards))
            tlogger.record_tabular('PopulationTimesteps', timesteps_this_iter)

            # Update Theta
            returns_n2 = np.array([a.rewards for a in results])
            noise_inds_n = [a.seeds for a in results]

            if exp['return_proc_mode'] == 'centered_rank':
                proc_returns_n2 = compute_centered_ranks(returns_n2)
            else:
                raise NotImplementedError(exp['return_proc_mode'])

            # Compute and take step
            g, count = batched_weighted_sum(
                proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
                (noise.get(idx, worker.model.num_params)
                 for idx in noise_inds_n),
                batch_size=500)
            # NOTE: gradients are scaled by \theta
            g /= returns_n2.size

            assert g.shape == (worker.model.num_params, ) and \
                g.dtype == np.float32 and count == len(noise_inds_n)
            _, state.theta = state.optimizer.update(
                -g + exp['l2coeff'] * state.theta)

            time_elapsed_this_iter = time.time() - tstart_iteration
            state.time_elapsed += time_elapsed_this_iter

            tlogger.info('Evaluate elite')
            (game_index, _, test_evals,
             test_timesteps) = worker.monitor_eval_repeated(
                 [(state.theta, 0)],
                 max_frames=None,
                 num_episodes=exp['num_test_episodes'])[0]
            test_timesteps = sum(test_timesteps)

            # Log Results
            tlogger.record_tabular('TestRewMin', np.min(test_evals))
            tlogger.record_tabular('TestRewMean', np.mean(test_evals))
            tlogger.record_tabular('TestRewMedian', np.median(test_evals))
            tlogger.record_tabular('TestRewMax', np.max(test_evals))
            tlogger.record_tabular('TestEpCount', len(test_evals))

            # Split the test rewards by the game each episode was played on.
            game_stats = [[], []]
            for game, reward in zip(game_index, test_evals):
                game_stats[game].append(reward)

            for game, game_evals in enumerate(game_stats):
                prefix = 'Game{}Test'.format(game)
                # np.min/np.max raise ValueError on an empty sequence (a game
                # that received no test episode); log NaN instead so the
                # table keeps all of its columns.
                shown = game_evals if game_evals else [float('nan')]
                tlogger.record_tabular(prefix + 'RewMin', np.min(shown))
                tlogger.record_tabular(prefix + 'RewMean', np.mean(shown))
                tlogger.record_tabular(prefix + 'RewMedian',
                                       np.median(shown))
                tlogger.record_tabular(prefix + 'RewMax', np.max(shown))
                tlogger.record_tabular(prefix + 'EpCount', len(game_evals))

            tlogger.record_tabular('TestEpLenSum', test_timesteps)
            tlogger.record_tabular('InitialRewMax',
                                   np.max(initial_performance))
            tlogger.record_tabular('InitialRewMean',
                                   np.mean(initial_performance))
            tlogger.record_tabular('InitialRewMedian',
                                   np.median(initial_performance))

            tlogger.record_tabular('TimestepsThisIter', timesteps_this_iter)
            tlogger.record_tabular(
                'TimestepsPerSecondThisIter',
                timesteps_this_iter / (time.time() - tstart_iteration))
            tlogger.record_tabular('TimestepsComputed', state.num_frames)
            tlogger.record_tabular('TimestepsSoFar', state.timesteps_so_far)
            tlogger.record_tabular('TimeElapsedThisIter',
                                   time_elapsed_this_iter)
            tlogger.record_tabular('TimeElapsedThisIterTotal',
                                   time.time() - tstart_iteration)
            tlogger.record_tabular('TimeElapsed', state.time_elapsed)
            tlogger.record_tabular('TimeElapsedTotal',
                                   time.time() - all_tstart)
            tlogger.dump_tabular()

            fps = state.timesteps_so_far / (time.time() - tstart)
            tlogger.info(
                'Timesteps Per Second: {:.0f}. Elapsed: {:.2f}h ETA {:.2f}h'.
                format(fps, (time.time() - all_tstart) / 3600,
                       (exp['timesteps'] - state.timesteps_so_far) / fps /
                       3600))

            if state.adaptive_tslimit:
                # Raise the per-episode limit when enough episodes hit it.
                hit_limit = np.mean(
                    [a.training_steps >= state.tslimit for a in results])
                if hit_limit > state.incr_tslimit_threshold:
                    state.tslimit = min(
                        state.tslimit * state.tslimit_incr_ratio,
                        state.tslimit_max)
                    tlogger.info('Increased threshold to {}'.format(
                        state.tslimit))

            # Overwrite the rolling snapshot, then keep a per-generation copy.
            os.makedirs(log_dir, exist_ok=True)
            save_file = os.path.join(log_dir, 'snapshot.pkl')
            with open(save_file, 'wb') as file:
                pickle.dump(state, file, pickle.HIGHEST_PROTOCOL)
            copyfile(
                save_file,
                os.path.join(log_dir,
                             'snapshot_gen{:04d}.pkl'.format(state.it)))
            tlogger.info("Saved iteration {} to {}".format(
                state.it, save_file))

            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break
            results.clear()
def main(game,
         filename=None,
         outfile=None,
         model_name="LargeModel",
         no_video=False,
         add_text=False,
         num_runs=RUNS,
         graph=None):
    """Play ``num_runs`` episodes of ``game`` with a model and show the frames.

    Args:
        game: Game name passed to ``gym_tensorflow.make``.
        filename: Optional pickled training snapshot. A snapshot with an
            ``elite`` attribute supplies ``elite.seeds``; otherwise its
            ``theta`` is loaded directly. Without a file, ``default_seeds``
            is used.
        outfile: Optional path of an MJPG video to write the frames to.
        model_name: Name handed to ``get_model``.
        no_video: When true, no on-screen viewer is created.
        add_text: Forwarded to ``handle_frame``.
        num_runs: Number of episodes to play.
        graph: Optional path; when set the session is saved there.
    """
    seeds = default_seeds
    outvid = None
    viewer = None
    iteration = None
    state = None

    if filename:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only open trusted snapshots. Read-only mode is enough here.
        with open(filename, 'rb') as file:
            state = pickle.load(file)
        iteration = state.it
        if hasattr(state, 'elite'):
            seeds = state.elite.seeds
            # The original call passed the values as extra print() arguments
            # and never filled in the placeholders.
            print("Loading GA snapshot from elite, iteration: {} / {}".format(
                len(seeds), iteration))
        else:
            seeds = None
            print("Loading ES snapshot, iteration: {}".format(state.it))

    if outfile:
        fourcc = cv.VideoWriter_fourcc(*'MJPG')
        outvid = cv.VideoWriter(outfile, fourcc, 16,
                                (VIDEO_SIZE, VIDEO_SIZE))

    env = gym_tensorflow.make(game, 1)
    model = get_model(model_name)
    obs_op = env.observation()
    reset_op = env.reset()

    if model.requires_ref_batch:
        def make_env(b):
            # The requested batch size is ignored; a single env is built.
            return gym_tensorflow.make(game=game, batch_size=1)

        with tf.Session() as sess:
            ref_batch = gym_tensorflow.get_ref_batch(make_env, sess, 128)
            ref_batch = ref_batch[:, ...]
    else:
        ref_batch = None

    action_op = model.make_net(tf.expand_dims(obs_op, axis=1),
                               env.action_space,
                               batch_size=1,
                               ref_batch=ref_batch)
    if env.discrete_action:
        action_op = tf.argmax(action_op, axis=-1, output_type=tf.int32)
    rew_op, done_op = env.step(action_op)

    if not no_video:
        from gym.envs.classic_control import rendering
        viewer = rendering.SimpleImageViewer()

    if hasattr(env.unwrapped, 'render'):
        # Prefer the environment's own rendering over the raw observation.
        obs_op = env.unwrapped.render()

        def display_obs(im):
            if im.shape[1] > 1:
                # Merge the first two frames along axis 1 into one image.
                im = np.bitwise_or(im[0, 0, ...], im[0, 1, ...])
            else:
                im = im[0, 0, ...]
            handle_frame(im, outvid, viewer, game, iteration, add_text)
    else:
        def display_obs(im):
            # A leftover pdb.set_trace() used to stop here on every frame.
            im = im[0, :, :, -1]
            im = np.stack([im] * 3, axis=-1)
            im = (im * 255).astype(np.uint8)
            handle_frame(im, outvid, viewer, game, iteration, add_text)

    rewards = []
    try:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            model.initialize()
            tlogger.info(model.description)

            # A leftover ``import pdb; pdb.set_trace()`` used to halt the
            # script here before the weights were loaded.
            if seeds:
                noise = SharedNoiseTable()
                weights = model.compute_weights_from_seeds(noise, seeds)
                model.load(sess, 0, weights, seeds)
            else:
                weights = state.theta
                model.load(sess, 0, weights, (weights, 0))

            if graph:
                # NOTE(review): ``saver`` is not defined in this function; it
                # must exist at module level or this raises NameError.
                saver.save(sess, graph)

            for _ in range(num_runs):
                sess.run(reset_op)
                sess.run(obs_op)
                display_obs(sess.run(obs_op))

                total_rew = 0
                num_frames = 0
                while True:
                    rew, done = sess.run([rew_op, done_op])
                    num_frames += 1
                    total_rew += rew[0]
                    display_obs(sess.run(obs_op))
                    time.sleep(4 / 60)
                    if done[0]:
                        break
                rewards += [total_rew]
                print('Final reward: ', total_rew, 'after', num_frames,
                      'steps')

            print(rewards)
            print("Mean: ", np.mean(rewards))
            print("Std: ", np.std(rewards))
    finally:
        # Release the writer even when playback fails part-way through.
        if outvid:
            outvid.release()
def main(**exp):
    """Re-evaluate offspring and the elite for states saved by an earlier run.

    For every iteration the training state is loaded from
    ``exp['model_logdir']``, a fresh set of antithetic offspring is evaluated
    on game 0, and rewards, episode lengths, offspring seeds, the state and
    the elite's test results are pickled into the log directory. The process
    finally sends SIGTERM to itself.

    Args:
        **exp: Experiment configuration. Keys read here: 'model', 'games'
            (indices 0 and 1), 'iterations', 'model_logdir' and
            'num_test_episodes'. The dict is also handed to ``TrainingState``,
            ``f_isSingleTask`` and ``make_offspring``.

    Raises:
        RuntimeError: If ``f_isSingleTask(exp)`` is falsy.
    """
    log_dir = tlogger.log_dir()

    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))

    # Fail fast: the check depends on ``exp`` only, so do it before any
    # worker or TensorFlow session is created.
    if not f_isSingleTask(exp):
        raise RuntimeError(
            "This script is only designed to run single task. "
            "Check your configuration and make sure both games have the "
            "same name")

    Model = neuroevolution.models.__dict__[exp['model']]

    noise = SharedNoiseTable()
    rs = np.random.RandomState()

    def make_env0(b):
        return gym_tensorflow.make(game=exp["games"][0], batch_size=b)

    def make_env1(b):
        return gym_tensorflow.make(game=exp["games"][1], batch_size=b)

    # The batch sizes scale with f_isSingleTask(exp): (1 + flag) for the
    # first worker and (1 - flag) for the second.
    # NOTE(review): presumably the flag is 0/1, which gives 64 and 0 in the
    # single-task case required above - confirm.
    workers = [
        ConcurrentWorkers(make_env0,
                          Model,
                          batch_size=32 * (1 + f_isSingleTask(exp))),
        ConcurrentWorkers(make_env1,
                          Model,
                          batch_size=32 * (1 - f_isSingleTask(exp))),
    ]

    tlogger.info('Start timing')
    tf_sess = tf.Session()
    tf_sess.run(tf.global_variables_initializer())
    state = TrainingState(exp)
    state.initialize(rs, noise, workers[0].model)

    workers[0].initialize(tf_sess)
    workers[1].initialize(tf_sess)

    for iteration in range(exp['iterations']):
        tlogger.info("BEGINNING ITERATION: {}".format(iteration))

        # override state using one from a previous run
        state = load_pickle(iteration, exp['model_logdir'], "state")

        ##############
        ### GAME 0 ###
        ##############
        worker = workers[0]
        frames_computed_so_far = tf_sess.run(worker.steps_counter)
        game0_results = []

        iterator = iter(
            worker.monitor_eval(make_offspring(exp, noise, rs, worker, state),
                                max_frames=state.tslimit * 4))

        # Results arrive as consecutive (positive, negative) pairs.
        for pos_seeds, pos_reward, pos_length in iterator:
            neg_seeds, neg_reward, neg_length = next(iterator)
            assert pos_seeds == neg_seeds
            game0_results.append(
                Offspring(pos_seeds, [pos_reward, neg_reward],
                          [pos_length, neg_length]))
        state.num_frames += tf_sess.run(
            worker.steps_counter) - frames_computed_so_far

        game0_rewards = [result.rewards for result in game0_results]
        game0_episode_lengths = [result.ep_len for result in game0_results]
        game0_noise_inds_n = [result.seeds for result in game0_results]

        tlogger.info("game0 rewards: {}".format(np.mean(game0_rewards)))
        tlogger.info("game0 eplens: {}".format(
            np.mean(game0_episode_lengths)))

        save_pickle(iteration, log_dir, "game0_rewards", game0_rewards)
        save_pickle(iteration, log_dir, "game0_episode_lengths",
                    game0_episode_lengths)

        tlogger.info("Saving offsprings seeds")
        save_pickle(iteration, log_dir, "offsprings_seeds",
                    game0_noise_inds_n)
        save_pickle(iteration, log_dir, "state", state)

        ######################
        ### EVALUATE ELITE ###
        ######################
        # Take a fresh baseline: reusing frames_computed_so_far here counted
        # the offspring frames a second time.
        frames_before_elite = tf_sess.run(worker.steps_counter)
        _, test_evals, test_timesteps = workers[0].monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            num_episodes=exp['num_test_episodes'] //
            (2**(1 - f_isSingleTask(exp))))[0]
        tlogger.info("game0 elite: {}".format(np.mean(test_evals)))
        save_pickle(iteration, log_dir, 'game0_elite', test_evals)
        save_pickle(iteration, log_dir, 'game0_elite_timestemps',
                    test_timesteps)

        state.num_frames += tf_sess.run(
            worker.steps_counter) - frames_before_elite

        state.it += 1

    # Terminates the whole process by signalling itself.
    # NOTE(review): presumably needed because worker threads keep the
    # interpreter alive - confirm.
    os.kill(os.getpid(), signal.SIGTERM)
def main(**exp):
    """Train one parameter vector on two games with evolution strategies.

    Each iteration evaluates antithetic offspring on game 0, replays the same
    noise indices on game 1, combines both games' returns through
    ``obtain_proc_returns`` into one update of ``state.theta``, evaluates the
    updated parameters on both games and saves a TensorFlow checkpoint. The
    process finally sends SIGTERM to itself.

    Args:
        **exp: Experiment configuration. Keys read here: 'model', 'games'
            (indices 0 and 1), 'iterations', 'learn_option', 'l2coeff' and
            'num_test_episodes'. The dict is also handed to ``TrainingState``
            and ``make_offspring``.
    """
    log_dir = tlogger.log_dir()

    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]

    noise = SharedNoiseTable()
    rs = np.random.RandomState()

    def make_env0(b):
        return gym_tensorflow.make(game=exp["games"][0], batch_size=b)

    def make_env1(b):
        return gym_tensorflow.make(game=exp["games"][1], batch_size=b)

    workers = [
        ConcurrentWorkers(make_env0, Model, batch_size=64),
        ConcurrentWorkers(make_env1, Model, batch_size=64),
    ]

    saver = tf.train.Saver()

    tlogger.info('Start timing')
    tf_sess = tf.Session()
    tf_sess.run(tf.global_variables_initializer())
    state = TrainingState(exp)
    state.initialize(rs, noise, workers[0].model)

    workers[0].initialize(tf_sess)
    workers[1].initialize(tf_sess)

    def evaluate_offspring(worker, *offspring_args):
        """Evaluate one antithetic population on ``worker``.

        Extra positional arguments are appended to the ``make_offspring``
        call. The frames consumed are added to ``state.num_frames``.
        """
        frames_before = tf_sess.run(worker.steps_counter)
        evaluated = []
        iterator = iter(
            worker.monitor_eval(make_offspring(exp, noise, rs, worker, state,
                                               *offspring_args),
                                max_frames=state.tslimit * 4))
        # Results arrive as consecutive (positive, negative) pairs.
        for pos_seeds, pos_reward, pos_length in iterator:
            neg_seeds, neg_reward, neg_length = next(iterator)
            assert pos_seeds == neg_seeds
            evaluated.append(
                Offspring(pos_seeds, [pos_reward, neg_reward],
                          [pos_length, neg_length]))
        state.num_frames += tf_sess.run(worker.steps_counter) - frames_before
        return evaluated

    for iteration in range(exp['iterations']):
        tlogger.info("BEGINNING ITERATION: {}".format(iteration))

        ##############
        ### GAME 0 ###
        ##############
        game0_results = evaluate_offspring(workers[0])
        game0_returns_n2 = np.array([a.rewards for a in game0_results])
        game0_noise_inds_n = [a.seeds for a in game0_results]
        save_pickle(iteration, log_dir, "game0_rewards",
                    [a.rewards for a in game0_results])
        save_pickle(iteration, log_dir, "game0_episode_lengths",
                    [a.ep_len for a in game0_results])

        ##############
        ### GAME 1 ###
        ##############
        # Game 1 is handed the noise indices that game 0 just used.
        seeds_vector = np.array(game0_noise_inds_n)
        game1_results = evaluate_offspring(workers[1], seeds_vector)
        game1_returns_n2 = np.array([a.rewards for a in game1_results])
        game1_noise_inds_n = [a.seeds for a in game1_results]
        save_pickle(iteration, log_dir, "game1_rewards",
                    [a.rewards for a in game1_results])
        save_pickle(iteration, log_dir, "game1_episode_lengths",
                    [a.ep_len for a in game1_results])

        tlogger.info("Saving offsprings seeds")
        save_pickle(iteration, log_dir, "offsprings_seeds",
                    game1_noise_inds_n)

        ####################
        ### UPDATE THETA ###
        ####################
        game_returns = [game0_returns_n2, game1_returns_n2]
        proc_returns = obtain_proc_returns(exp['learn_option'], game_returns)

        assert game0_noise_inds_n == game1_noise_inds_n
        # concatenate the two lists
        noise_inds_n = game0_noise_inds_n + game1_noise_inds_n
        num_params = workers[1].model.num_params
        g, count = batched_weighted_sum(
            proc_returns[:, 0] - proc_returns[:, 1],
            (noise.get(idx, num_params) for idx in noise_inds_n),
            batch_size=500)

        returns_n2 = np.array([a.rewards for a in game0_results] +
                              [a.rewards for a in game1_results])
        g /= returns_n2.size

        assert g.shape == (num_params, ) and g.dtype == np.float32 and \
            count == len(noise_inds_n)
        _, state.theta = state.optimizer.update(-g +
                                                exp['l2coeff'] * state.theta)

        save_pickle(iteration, log_dir, "state", state)

        ######################
        ### EVALUATE ELITE ###
        ######################
        # Take per-worker baselines. The original reused game 1's stale
        # baseline on workers[1] only, which counted game 1's offspring
        # frames twice and ignored the frames workers[0] spends here.
        frames_before_elite = [tf_sess.run(w.steps_counter) for w in workers]

        for game, elite_worker in enumerate(workers):
            _, test_evals, test_timesteps = \
                elite_worker.monitor_eval_repeated(
                    [(state.theta, 0)],
                    max_frames=None,
                    num_episodes=exp['num_test_episodes'] // 2)[0]
            tlogger.info("game{} elite: {}".format(game,
                                                   np.mean(test_evals)))
            save_pickle(iteration, log_dir, 'game{}_elite'.format(game),
                        test_evals)
            save_pickle(iteration, log_dir,
                        'game{}_elite_timestemps'.format(game),
                        test_timesteps)

        for elite_worker, before in zip(workers, frames_before_elite):
            state.num_frames += tf_sess.run(
                elite_worker.steps_counter) - before

        saver.save(tf_sess, "{}/model-{}".format(log_dir, state.it))

        state.it += 1

    # Terminates the whole process by signalling itself.
    # NOTE(review): presumably needed because worker threads keep the
    # interpreter alive - confirm.
    os.kill(os.getpid(), signal.SIGTERM)
def main(game,
         filename=None,
         out_dir=None,
         model_name='LargeModel',
         add_text=False,
         num_runs=RUNS,
         layer=None,
         max_frames=50):
    """Play episodes of ``game`` and record videos of the frames.

    One video named ``all.mp4`` is written, plus one video per key returned
    by ``get_nn_images`` (with '/' replaced by '-' in the file name).

    Args:
        game: Game name passed to ``gym_tensorflow.make``.
        filename: Optional pickled training snapshot. A snapshot with an
            ``elite`` attribute supplies ``elite.seeds``; otherwise its
            ``theta`` is loaded directly. Without a file, ``default_seeds``
            is used.
        out_dir: Directory the videos are written to. It is joined with
            ``os.path.join``, so the default ``None`` raises TypeError.
        model_name: Name handed to ``get_model``.
        add_text: Forwarded to ``handle_frame``.
        num_runs: Number of episodes to play.
        layer: Unused by this function.
        max_frames: An episode is cut off once exactly this many frames have
            been played (previously hard-coded to 50).
    """
    seeds = default_seeds
    viewer = None
    iteration = None
    state = None

    if filename:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only open trusted snapshots. Read-only mode is enough here.
        with open(filename, 'rb') as file:
            state = pickle.load(file)
        iteration = state.it
        if hasattr(state, 'elite'):
            seeds = state.elite.seeds
            print("Loading GA snapshot from elite, iteration: {} / {}".format(
                len(seeds), iteration))
        else:
            seeds = None
            # The original call passed state.it as a second print() argument
            # and never filled in the placeholder.
            print("Loading ES snapshot, iteration: {}".format(state.it))

    fourcc = cv.VideoWriter_fourcc(*'H264')

    env = gym_tensorflow.make(game, 1)
    model = get_model(model_name)
    obs_op = env.observation()
    reset_op = env.reset()

    if model.requires_ref_batch:
        def make_env(b):
            # The requested batch size is ignored; a single env is built.
            return gym_tensorflow.make(game=game, batch_size=1)

        with tf.Session() as sess:
            ref_batch = gym_tensorflow.get_ref_batch(make_env, sess, 128)
            ref_batch = ref_batch[:, ...]
    else:
        ref_batch = None

    input_op = tf.expand_dims(obs_op, axis=1)
    action_op = model.make_net(input_op,
                               env.action_space,
                               batch_size=1,
                               ref_batch=ref_batch)
    if env.discrete_action:
        action_op = tf.argmax(action_op, axis=-1, output_type=tf.int32)
    rew_op, done_op = env.step(action_op)

    out_vids = {
        'all': cv.VideoWriter(os.path.join(out_dir, 'all.mp4'), fourcc, 16,
                              (VIDEO_SIZE, VIDEO_SIZE))
    }

    def write_frames(im, viz):
        # NOTE(review): ``im`` is reassigned on every pass, so each video
        # after the first receives the previous combine_viz output as its
        # base image - confirm this accumulation is intended.
        for key, writer in out_vids.items():
            im = combine_viz(im, viz, key)
            handle_frame(im, writer, viewer, game, iteration, add_text)

    if hasattr(env.unwrapped, 'render'):
        # Prefer the environment's own rendering over the raw observation.
        obs_op = env.unwrapped.render()

        def display_obs(im, viz):
            if im.shape[1] > 1:
                # Merge the first two frames along axis 1 into one image.
                im = np.bitwise_or(im[0, 0, ...], im[0, 1, ...])
            else:
                im = im[0, 0, ...]
            write_frames(im, viz)
    else:
        def display_obs(im, viz):
            im = im[0, :, :, -1]
            im = np.stack([im] * 3, axis=-1)
            im = (im * 255).astype(np.uint8)
            write_frames(im, viz)

    rewards = []
    try:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            model.initialize()
            tlogger.info(model.description)

            if seeds:
                noise = SharedNoiseTable()
                weights = model.compute_weights_from_seeds(noise, seeds)
                model.load(sess, 0, weights, seeds)
            else:
                weights = state.theta
                model.load(sess, 0, weights, (weights, 0))

            # One extra video per image key reported by get_nn_images.
            _, images = get_nn_images(sess, input_op, model)
            for key in images.keys():
                out_vids[key] = cv.VideoWriter(
                    os.path.join(out_dir,
                                 '{}.mp4'.format(key.replace('/', '-'))),
                    fourcc, 16, (VIDEO_SIZE, VIDEO_SIZE))

            for _ in range(num_runs):
                sess.run(reset_op)
                total_rew = 0
                num_frames = 0
                while True:
                    # Grab the frame and the images before stepping the env.
                    img = sess.run(obs_op)
                    _, images = get_nn_images(sess, input_op, model)
                    rew, done = sess.run([rew_op, done_op])
                    num_frames += 1
                    total_rew += rew[0]
                    display_obs(img, images)

                    if done[0] or num_frames == max_frames:
                        rewards += [total_rew]
                        print('Final reward: ', total_rew, 'after',
                              num_frames, 'steps')
                        break

            print(rewards)
            print("Mean: ", np.mean(rewards))
            print("Std: ", np.std(rewards))
    finally:
        # Release every writer even when playback fails part-way through.
        for writer in out_vids.values():
            writer.release()