def run_training(config=None, tuner=None, logdir=None, trial_name=None,  # pylint: disable=unused-argument
                 is_chief=True):
  """Do all training runs.

  This is the top level training function for the search-based baselines:
  random search (`config.agent.algorithm == 'rand'`) and the genetic algorithm
  (`config.agent.algorithm == 'ga'`). Run this from the main function.

  Each worker runs `FLAGS.num_repetitions // FLAGS.num_workers` local
  repetitions. Results already stored on disk for this shard are counted, and
  training resumes after them. One results dict is appended per finished
  repetition.

  Args:
    config: config_lib.Config instance containing global config (agent and
        environment hparams). If None, config will be parsed from FLAGS.config.
    tuner: (unused) A tuner instance. Leave as None if not tuning.
    logdir: Parent directory where all data from all runs will be written. If
        None, FLAGS.logdir will be used.
    trial_name: (unused) If tuning, set this to a unique string that identifies
        this trial. If `tuner` is not None, this also must be set.
    is_chief: True if this worker is the chief.

  Returns:
    If `is_chief` is true: list of results dicts read from all shards once
    every shard reports that it has finished. Each training run gets a results
    dict. Results dict contains metrics, i.e. (name, value) pairs which give
    information about the training run.
    If `is_chief` is false: None.

  Raises:
    ValueError: If FLAGS.num_workers does not divide FLAGS.num_repetitions.
    ValueError: If results previously written for this shard were produced
        with a different FLAGS.max_npe or FLAGS.num_repetitions.
  """
  if not config:
    # If custom config is not given, get it from flags.
    config = defaults.default_config_with_updates(FLAGS.config)
  if not logdir:
    logdir = FLAGS.logdir

  # Check the worker count before it is used as a divisor; otherwise a zero
  # value surfaces as a ZeroDivisionError instead of this check.
  assert FLAGS.num_workers > 0, 'There must be at least 1 worker.'
  if FLAGS.num_repetitions % FLAGS.num_workers != 0:
    raise ValueError('Number of workers must divide number of repetitions')
  num_local_reps = FLAGS.num_repetitions // FLAGS.num_workers
  logging.info('Running %d reps globally.', FLAGS.num_repetitions)
  logging.info('This worker will run %d local reps.', num_local_reps)
  if FLAGS.max_npe:
    max_generations = FLAGS.max_npe // config.batch_size
    logging.info('Max samples per rep: %d', FLAGS.max_npe)
    logging.info('Max generations per rep: %d', max_generations)
  else:
    # A falsy max_npe means no limit on the number of generations.
    max_generations = sys.maxint
    logging.info('Running unlimited generations.')

  logging.info('Starting experiment. Directory: "%s"', logdir)
  results = results_lib.Results(logdir, FLAGS.task_id)
  local_results_list = results.read_this_shard()
  if local_results_list:
    # Resuming: refuse to mix results produced under different settings.
    # Exceptions do not interpolate their arguments, so the message is
    # formatted explicitly.
    if local_results_list[0]['max_npe'] != FLAGS.max_npe:
      raise ValueError(
          'Cannot resume training. Max-NPE changed. Was %s, now %s'
          % (local_results_list[0]['max_npe'], FLAGS.max_npe))
    if local_results_list[0][
        'max_global_repetitions'] != FLAGS.num_repetitions:
      raise ValueError(
          'Cannot resume training. Number of repetitions changed. Was %s, '
          'now %s'
          % (local_results_list[0]['max_global_repetitions'],
             FLAGS.num_repetitions))
  # One results dict is stored per finished local rep, so the count of stored
  # results is the index of the next rep to run.
  start_rep = len(local_results_list)

  for rep in xrange(start_rep, num_local_reps):
    global_rep = num_local_reps * FLAGS.task_id + rep
    logging.info(
        'Starting repetition: Rep = %d. (global rep = %d)',
        rep, global_rep)

    # Save data for each rep, like checkpoints, goes into separate folders.
    run_dir = os.path.join(logdir, 'run_%d' % global_rep)

    if not tf.gfile.IsDirectory(run_dir):
      tf.gfile.MakeDirs(run_dir)
    checkpoint_writer = CheckpointWriter(run_dir,
                                         population_size=config.batch_size)

    data_manager = data.DataManager(config, run_number=global_rep)
    task_eval_fn = ga_lib.make_task_eval_fn(data_manager.rl_task)

    if config.agent.algorithm == 'rand':
      logging.info('Running random search.')
      # Random search is bounded only by max_npe, so it must be set.
      assert FLAGS.max_npe
      result = run_random_search(
          FLAGS.max_npe, run_dir, task_eval_fn, config.timestep_limit)
    else:
      assert config.agent.algorithm == 'ga'
      logging.info('Running genetic algorithm.')
      pop = ga_lib.make_population(
          ga_lib.random_individual(config.timestep_limit),
          n=config.batch_size)
      hof = utils.MaxUniquePriorityQueue(2)  # Hall of fame.
      result = ga_lib.ga_loop(
          pop,
          cxpb=config.agent.crossover_rate,
          mutpb=config.agent.mutation_rate,
          task_eval_fn=task_eval_fn,
          ngen=max_generations,
          halloffame=hof,
          checkpoint_writer=checkpoint_writer)

    logging.info('Finished rep. Num gens: %d', result.generations)

    results_dict = {
        'max_npe': FLAGS.max_npe,
        'batch_size': config.batch_size,
        'max_batches': FLAGS.max_npe // config.batch_size,
        'npe': result.num_programs,
        'max_global_repetitions': FLAGS.num_repetitions,
        'max_local_repetitions': num_local_reps,
        'code_solution': result.best_code if result.solution_found else '',
        'best_reward': result.reward,
        'num_batches': result.generations,
        'found_solution': result.solution_found,
        'task': data_manager.task_name,
        'global_rep': global_rep
    }
    logging.info('results_dict: %s', results_dict)
    results.append(results_dict)

  if is_chief:
    logging.info(
        'Worker is chief. Waiting for all workers to finish so that results '
        'can be reported to the tuner.')

    # Poll the shards once a minute until every one reports finished.
    global_results_list, shard_stats = results.read_all(
        num_shards=FLAGS.num_workers)
    while not all(s.finished for s in shard_stats):
      logging.info(
          'Still waiting on these workers: %s',
          ', '.join([
              '%d (%d reps left)'
              % (i, s.max_local_reps - s.num_local_reps_completed)
              for i, s in enumerate(shard_stats)
              if not s.finished
          ]))
      sleep(60)
      global_results_list, shard_stats = results.read_all(
          num_shards=FLAGS.num_workers)

    logging.info(
        '%d results obtained. Chief worker is exiting the experiment.',
        len(global_results_list))

    return global_results_list
def __init__(self, config, task_id, ps_tasks, num_workers, is_chief=True,
             summary_writer=None,
             dtype=tf.float32,
             summary_interval=1,
             run_number=0,
             logging_dir='/tmp', model_v=0):
  """Builds the global model, the local model, and the ops connecting them.

  Creates one `agent_lib.LMAgent` under the 'global' variable scope and one
  under the 'local' variable scope, an op that copies global parameters into
  the local model, and a train op that applies the local model's gradients
  to the global parameters. Also creates shared bookkeeping variables (best
  reward, found-solution flag, code solution, program count) and, if a
  top-k buffer file exists, loads it into the local model.

  Args:
    config: Config object forwarded to `data.DataManager` and to both
        `agent_lib.LMAgent` instances.
    task_id: Index of this worker. Used in the worker device string and in
        the per-worker file names. Must be 0 when `ps_tasks` is 0.
    ps_tasks: Passed to `tf.train.replica_device_setter` as the number of
        parameter server tasks. 0 selects the '/job:localhost' device.
    num_workers: Number of training workers. Only validated here: must be 1
        when `ps_tasks` is 0, and greater than 0 otherwise.
    is_chief: Stored as `self.is_chief`.
    summary_writer: Stored as `self.summary_writer`.
    dtype: Forwarded as `dtype` to both LMAgent instances.
    summary_interval: Stored as `self.summary_interval`.
    run_number: Forwarded to `data.DataManager` and used as the initial
        value of the `run_number` variable.
    logging_dir: Directory under which the solutions file, replay buffer
        file and top-k buffer file paths are built.
    model_v: Forwarded to the local LMAgent as `verbose_level`.
  """
  self.config = config
  self.data_manager = data.DataManager(
      config, run_number=run_number,
      do_code_simplification=not FLAGS.stop_on_success)
  self.task_id = task_id
  self.ps_tasks = ps_tasks
  self.is_chief = is_chief
  if ps_tasks == 0:
    # No parameter servers: exactly one task is expected.
    assert task_id == 0, 'No parameter servers specified. Expecting 1 task.'
    assert num_workers == 1, (
        'No parameter servers specified. Expecting 1 task.')
    worker_device = '/job:localhost/replica:%d/task:0/cpu:0' % task_id
    # worker_device = '/cpu:0'
    # ps_device = '/cpu:0'
  else:
    assert num_workers > 0, 'There must be at least 1 training worker.'
    worker_device = '/job:worker/replica:%d/task:0/cpu:0' % task_id
    # ps_device = '/job:ps/replica:0/task:0/cpu:0'
  tf.logging.info('worker_device: %s', worker_device)

  # Per-worker file paths; the task id keeps workers from sharing files.
  logging_file = os.path.join(logging_dir, 'solutions_%d.txt' % task_id)
  experience_replay_file = os.path.join(
      logging_dir, 'replay_buffer_%d.pickle' % task_id)
  self.topk_file = os.path.join(logging_dir,
                                'topk_buffer_%d.pickle' % task_id)

  tf.get_variable_scope().set_use_resource(True)

  # global model
  # Placement of everything built in this block is decided by the device
  # function returned by tf.train.replica_device_setter.
  with tf.device(
      tf.train.replica_device_setter(ps_tasks,
                                     ps_device='/job:ps/replica:0',
                                     worker_device=worker_device)):
    with tf.variable_scope('global'):
      global_model = agent_lib.LMAgent(config, dtype=dtype, is_local=False)
      # Name -> variable for the global variables that get synced.
      global_params_dict = {
          p.name: p for p in global_model.sync_variables
      }
      self.global_model = global_model
      self.global_step = make_initialized_variable(0, 'global_step',
                                                   dtype=tf.int64)

      # Best reward seen so far, starting at -10.0, plus a flag that is set
      # whenever a new best reward is recorded.
      self.global_best_reward = make_initialized_variable(
          -10.0, 'global_best_reward', dtype=tf.float64)
      self.is_best_model = make_initialized_variable(False, 'is_best_model',
                                                     dtype=tf.bool)
      self.reset_is_best_model = self.is_best_model.assign(False)
      self.global_best_reward_placeholder = tf.placeholder(
          tf.float64, [], name='global_best_reward_placeholder')
      # Writes the fed reward and sets is_best_model to True.
      self.assign_global_best_reward_op = tf.group(
          self.global_best_reward.assign(
              self.global_best_reward_placeholder),
          self.is_best_model.assign(True))

      def assign_global_best_reward_fn(session, reward):
        # Records `reward` as the new best if it is strictly greater than
        # the stored one, and returns whether it was. Both values are
        # rounded to 10 decimal places before comparing (presumably to
        # ignore floating point noise -- TODO confirm).
        reward = round(reward, 10)
        best_reward = round(session.run(self.global_best_reward), 10)
        is_best = reward > best_reward
        if is_best:
          session.run(
              self.assign_global_best_reward_op,
              {self.global_best_reward_placeholder: reward})
        return is_best

      self.assign_global_best_reward_fn = assign_global_best_reward_fn

      # Any worker will set to true when it finds a solution.
      self.found_solution_flag = make_initialized_variable(
          False, 'found_solution_flag', dtype=tf.bool)
      self.found_solution_op = self.found_solution_flag.assign(True)

      self.run_number = make_initialized_variable(run_number, 'run_number',
                                                  dtype=tf.int32)

      # Store a solution when found.
      self.code_solution_variable = tf.get_variable(
          'code_solution', [], tf.string,
          initializer=tf.constant_initializer(''))
      self.code_solution_ph = tf.placeholder(tf.string, [],
                                             name='code_solution_ph')
      self.code_solution_assign_op = self.code_solution_variable.assign(
          self.code_solution_ph)

      def assign_code_solution_fn(session, code_solution_string):
        # Writes `code_solution_string` into the code_solution variable.
        session.run(self.code_solution_assign_op,
                    {self.code_solution_ph: code_solution_string})

      self.assign_code_solution_fn = assign_code_solution_fn

      # Count all programs sampled from policy. This does not include
      # programs sampled from replay buffer.
      # This equals NPE (number of programs executed). Only programs sampled
      # from the policy need to be executed.
      self.program_count = make_initialized_variable(0, 'program_count',
                                                     dtype=tf.int64)

  # local model
  # Built on this worker's device; it receives the callbacks, ops and
  # variables created above.
  with tf.device(worker_device):
    with tf.variable_scope('local'):
      self.model = model = agent_lib.LMAgent(
          config,
          task_id=task_id,
          logging_file=logging_file,
          experience_replay_file=experience_replay_file,
          dtype=dtype,
          global_best_reward_fn=self.assign_global_best_reward_fn,
          found_solution_op=self.found_solution_op,
          assign_code_solution_fn=self.assign_code_solution_fn,
          program_count=self.program_count,
          stop_on_success=FLAGS.stop_on_success,
          verbose_level=model_v)
      local_params = model.trainable_variables
      local_params_dict = {p.name: p for p in local_params}

  # Pull global params to local model.
  def _global_to_local_scope(name):
    # Replaces the leading 'global' of a variable name with 'local'.
    # name[6:] keeps everything from the '/' after 'global' onwards.
    assert name.startswith('global/')
    return 'local' + name[6:]

  # Local variable -> global variable with the corresponding name. Every
  # synced global variable must have a local trainable counterpart, or the
  # lookup raises KeyError.
  sync_dict = {
      local_params_dict[_global_to_local_scope(p_name)]: p
      for p_name, p in global_params_dict.items()
  }
  self.sync_op = tf.group(*[
      v_local.assign(v_global) for v_local, v_global in sync_dict.items()
  ])

  # Pair local gradients with global params.
  # The resulting (gradient, global variable) pairs go to apply_gradients
  # below, so local gradients update the global variables.
  grad_var_dict = {
      gradient: sync_dict[local_var]
      for local_var, gradient in model.gradients_dict.items()
  }

  # local model
  model.make_summary_ops()  # Don't put summaries under 'local' scope.
  with tf.variable_scope('local'):
    self.train_op = model.optimizer.apply_gradients(
        grad_var_dict.items(), global_step=self.global_step)
    # Initializer for the GLOBAL_VARIABLES collection entries filtered by
    # the current variable scope name.
    self.local_init_op = tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                          tf.get_variable_scope().name))

  # Plain Python bookkeeping, not part of the graph.
  self.local_step = 0
  self.last_summary_time = time.time()
  self.summary_interval = summary_interval
  self.summary_writer = summary_writer
  self.cached_global_step = -1
  self.cached_global_npe = -1

  tf.logging.info('summary_interval: %d', self.summary_interval)

  # Load top-k buffer.
  # Only attempted when the model has a top-k buffer and the file exists.
  if self.model.top_episodes is not None and tf.gfile.Exists(
      self.topk_file):
    try:
      # NOTE(review): unpickling is only safe if this file is trusted;
      # presumably it was written by an earlier run of this worker --
      # confirm.
      with tf.gfile.FastGFile(self.topk_file, 'r') as f:
        self.model.top_episodes = cPickle.loads(f.read())
      tf.logging.info(
          'Loaded top-k buffer from disk with %d items. Location: "%s"',
          len(self.model.top_episodes), self.topk_file)
    except (cPickle.UnpicklingError, EOFError) as e:
      # An unreadable buffer file is logged and deleted rather than
      # aborting construction.
      tf.logging.warn(
          'Failed to load existing top-k buffer from disk. Removing bad file.'
          '\nLocation: "%s"\nException: %s', self.topk_file, str(e))
      tf.gfile.Remove(self.topk_file)