def main(unused_argv):
    """Prepare a clean working tree, configure logging and run the RL loop.

    The base directory is wiped and recreated, the flag files and the target
    model files are copied into it, a file handler is attached to the root
    logger, and `rl_loop()` is run under a 'Total time' timer.

    Args:
      unused_argv: ignored.
    """
    print('Wiping dir %s' % FLAGS.base_dir, flush=True)
    shutil.rmtree(FLAGS.base_dir, ignore_errors=True)

    required_dirs = [
        fsdb.models_dir(),
        fsdb.selfplay_dir(),
        fsdb.holdout_dir(),
        fsdb.eval_dir(),
        fsdb.golden_chunk_dir(),
        fsdb.working_dir(),
    ]
    for directory in required_dirs:
        ensure_dir_exists(directory)

    # Work from a private copy of the flag files so that edits to the
    # originals cannot affect a loop that is already running.
    local_flags_dir = os.path.join(FLAGS.base_dir, 'flags')
    shutil.copytree(FLAGS.flags_dir, local_flags_dir)
    FLAGS.flags_dir = local_flags_dir

    # Copy the target model files next to the other models. The source
    # location is target_path with its trailing "target.pb" stripped.
    target_prefix = FLAGS.target_path[:-len("target.pb")]
    target_files = (
        "target.pb",
        "target_raw.ckpt.data-00000-of-00001",
        "target_raw.ckpt.index",
        "target_raw.ckpt.meta",
    )
    for file_name in target_files:
        shutil.copy(target_prefix + file_name,
                    os.path.join(fsdb.models_dir(), file_name))

    # Log to a file as well, and timestamp every handler's output.
    root_logger = logging.getLogger()
    log_path = os.path.join(FLAGS.base_dir, 'rl_loop.log')
    root_logger.addHandler(logging.FileHandler(log_path))
    formatter = logging.Formatter('[%(asctime)s] %(message)s',
                                  '%Y-%m-%d %H:%M:%S')
    for handler in root_logger.handlers:
        handler.setFormatter(formatter)

    with logged_timer('Total time'):
        try:
            rl_loop()
        finally:
            # Always close the event loop, even when the RL loop fails.
            asyncio.get_event_loop().close()
def main(unused_argv):
    """Reset the working tree and iterate the RL loop until the win-rate target.

    Iterates over the values produced by `rl_loop()` and stops as soon as one
    exceeds 0.5; otherwise logs that the run failed to converge.

    Args:
      unused_argv: ignored.
    """
    print('Wiping dir %s' % FLAGS.base_dir, flush=True)
    shutil.rmtree(FLAGS.base_dir, ignore_errors=True)

    dir_getters = (
        fsdb.models_dir,
        fsdb.selfplay_dir,
        fsdb.holdout_dir,
        fsdb.eval_dir,
        fsdb.golden_chunk_dir,
        fsdb.working_dir,
    )
    for get_dir in dir_getters:
        utils.ensure_dir_exists(get_dir())

    # Keep the target model alongside the generated models.
    shutil.copy('ml_perf/target.pb', fsdb.models_dir())

    # Log to a file as well, and timestamp every handler's output.
    root_logger = logging.getLogger()
    log_path = os.path.join(FLAGS.base_dir, 'reinforcement.log')
    root_logger.addHandler(logging.FileHandler(log_path))
    formatter = logging.Formatter('[%(asctime)s] %(message)s',
                                  '%Y-%m-%d %H:%M:%S')
    for handler in root_logger.handlers:
        handler.setFormatter(formatter)

    with utils.logged_timer('Total time'):
        for target_win_rate in rl_loop():
            if target_win_rate > 0.5:
                logging.info('Passed exit criteria.')
                return None
        logging.info('Failed to converge.')
async def start_selfplay():
    """Start one `concurrent_selfplay` subprocess per configured device.

    Each subprocess runs with `--run_forever=1` and has its stdout and stderr
    redirected to `<base_dir>/selfplay_<i>.log`, which starts with the
    expanded command line.

    Returns:
      A `(processes, logs)` tuple: the spawned asyncio subprocesses and the
      open log file objects they write to, both in device order. The log
      files are returned open; this function does not close them on success.

    Raises:
      Whatever `open`, `expand_cmd_str` or `asyncio.create_subprocess_exec`
      raise. The log file for the failing device is closed before the
      exception propagates.
    """
    # NOTE(review): "$MODEL" and "%d" are passed through literally;
    # presumably the selfplay binary substitutes them - confirm.
    output_dir = os.path.join(fsdb.selfplay_dir(), "$MODEL")
    holdout_dir = os.path.join(fsdb.holdout_dir(), "$MODEL")
    model_pattern = os.path.join(fsdb.models_dir(), '%d.pb')

    logs = []
    processes = []
    for i, device in enumerate(FLAGS.selfplay_devices):
        cmd = [
            'bazel-bin/cc/concurrent_selfplay',
            '--flagfile={}'.format(
                os.path.join(FLAGS.flags_dir, 'selfplay.flags')),
            '--run_forever=1',
            '--device={}'.format(device),
            '--model={}'.format(model_pattern),
            '--output_dir={}/{}'.format(output_dir, i),
            '--holdout_dir={}/{}'.format(holdout_dir, i),
        ]
        cmd_str = await expand_cmd_str(cmd)

        f = open(os.path.join(FLAGS.base_dir, 'selfplay_%d.log' % i), 'w')
        try:
            f.write(cmd_str + '\n\n')
            f.flush()
            logging.info('Running: %s', cmd_str)
            process = await asyncio.create_subprocess_exec(
                *cmd, stdout=f, stderr=asyncio.subprocess.STDOUT)
        except BaseException:
            # Do not leak the file handle when the spawn (or the header
            # write) fails or the task is cancelled.
            f.close()
            raise
        processes.append(process)
        logs.append(f)
    return (processes, logs)
def smart_rsync(from_model_num=0, source_dir=None, dest_dir=LOCAL_DIR):
    """Rsync the per-model directories for all models at or after a number.

    Args:
      from_model_num: lowest model number to sync; negative values are
        treated as 0.
      source_dir: directory holding one sub-directory per model. Defaults to
        `fsdb.selfplay_dir()` when falsy.
      dest_dir: directory that receives one sub-directory per model.
    """
    if not source_dir:
        source_dir = fsdb.selfplay_dir()
    first_model_num = max(from_model_num, 0)

    wanted = [entry for entry in fsdb.get_models()
              if entry[0] >= first_model_num]
    for _, model_name in wanted:
        src = os.path.join(source_dir, model_name)
        dst = os.path.join(dest_dir, model_name)
        _rsync_dir(src, dst)
async def sample_training_examples(state):
    """Sample training examples from recent selfplay games.

    Takes the `FLAGS.window_size` sub-directories of the selfplay directory
    that sort last by path, samples records from them with `sample_records`
    and writes the result under the golden chunk directory.

    Args:
      state: the RL loop State instance; `state.train_model_name` names the
        output chunks.

    Returns:
      The sorted list of chunk file paths that were written (exactly 8,
      enforced by an assertion).
    """
    selfplay_subdirs = sorted(
        (entry.path for entry in os.scandir(fsdb.selfplay_dir())
         if entry.is_dir()),
        reverse=True)
    src_patterns = [
        os.path.join(subdir, '*', '*', '*.tfrecord.zz')
        for subdir in selfplay_subdirs[:FLAGS.window_size]
    ]

    chunk_dir = fsdb.golden_chunk_dir()
    dst_path = os.path.join(
        chunk_dir, '{}.tfrecord.zz'.format(state.train_model_name))
    logging.info('Writing training chunks to %s', dst_path)

    output_lines = await sample_records(src_patterns, dst_path,
                                        num_read_threads=8,
                                        num_write_threads=8,
                                        sample_frac=FLAGS.train_filter)
    logging.info('\n'.join(output_lines))

    chunk_pattern = os.path.join(
        fsdb.golden_chunk_dir(),
        '{}-*-of-*.tfrecord.zz'.format(state.train_model_name))
    chunk_paths = sorted(tf.gfile.Glob(chunk_pattern))
    # One chunk is expected per write thread requested above.
    assert len(chunk_paths) == 8
    return chunk_paths
def main(unused_argv):
    """Prepare a clean working tree, configure logging and run the RL loop.

    Args:
      unused_argv: ignored.
    """
    print('Wiping dir %s' % FLAGS.base_dir, flush=True)
    shutil.rmtree(FLAGS.base_dir, ignore_errors=True)

    dir_getters = (
        fsdb.models_dir,
        fsdb.selfplay_dir,
        fsdb.holdout_dir,
        fsdb.eval_dir,
        fsdb.golden_chunk_dir,
        fsdb.working_dir,
    )
    for get_dir in dir_getters:
        utils.ensure_dir_exists(get_dir())

    # Work from a private copy of the flag files so that edits to the
    # originals cannot affect a loop that is already running.
    local_flags_dir = os.path.join(FLAGS.base_dir, 'flags')
    shutil.copytree(FLAGS.flags_dir, local_flags_dir)
    FLAGS.flags_dir = local_flags_dir

    # Keep the target model alongside the generated models.
    shutil.copy('ml_perf/target.pb', fsdb.models_dir())

    # Log to a file as well, and timestamp every handler's output.
    root_logger = logging.getLogger()
    log_path = os.path.join(FLAGS.base_dir, 'rl_loop.log')
    root_logger.addHandler(logging.FileHandler(log_path))
    formatter = logging.Formatter('[%(asctime)s] %(message)s',
                                  '%Y-%m-%d %H:%M:%S')
    for handler in root_logger.handlers:
        handler.setFormatter(formatter)

    with utils.logged_timer('Total time'):
        try:
            rl_loop()
        finally:
            # Always close the event loop, even when the RL loop fails.
            asyncio.get_event_loop().close()
def selfplay(state):
    """Run the selfplay binary and write one golden chunk from its output.

    Args:
      state: the RL loop State instance; supplies the output and best model
        names and the random seed.
    """
    output_dir = os.path.join(fsdb.selfplay_dir(), state.output_model_name)
    holdout_dir = os.path.join(fsdb.holdout_dir(), state.output_model_name)
    model_path = os.path.join(fsdb.models_dir(), state.best_model_name)

    base_cmd = [
        'bazel-bin/cc/selfplay',
        '--parallel_games=2048',
        '--num_readouts=100',
        '--model={}.pb'.format(model_path),
        '--output_dir={}'.format(output_dir),
        '--holdout_dir={}'.format(holdout_dir),
    ]
    result = checked_run(base_cmd + cc_flags(state), 'selfplay')
    logging.info(get_lines(result, make_slice[-2:]))

    # Write examples to a single record.
    pattern = os.path.join(output_dir, '*', '*.zz')
    seed = state.seed
    random.seed(seed)
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # TODO(tommadams): This method of generating one golden chunk per
    # generation is sub-optimal because each chunk gets reused multiple times
    # for training, introducing bias. Instead, a fresh dataset should be
    # uniformly sampled out of *all* games in the training window before the
    # start of each training run.
    examples = example_buffer.ExampleBuffer(sampling_frac=1.0)

    # TODO(tommadams): parallel_fill is currently non-deterministic. Make it
    # not so.
    logging.info('Writing golden chunk from "{}"'.format(pattern))
    examples.parallel_fill(tf.gfile.Glob(pattern))
    chunk_path = os.path.join(fsdb.golden_chunk_dir(),
                              state.output_model_name + '.tfrecord.zz')
    examples.flush(chunk_path)
def run_tpu(no_resign=False):
    """Run the TPU selfplay binary with the distributed flag set.

    Sets GRPC_DEFAULT_SSL_ROOTS_FILE_PATH in the process environment before
    launching.

    Args:
      no_resign: when true, use the `distributed_flags_nr` flagfile; otherwise
        pass `--flags_path` and the regular `distributed_flags` flagfile.
    """
    os.environ['GRPC_DEFAULT_SSL_ROOTS_FILE_PATH'] = (
        '/etc/ssl/certs/ca-certificates.crt')

    model_pattern = os.path.join(fsdb.working_dir(), 'model.ckpt-%d.pb')
    flagset = [
        'bazel-bin/cc/main',
        '--mode=selfplay',
        '--engine=tpu',
        '--model={}'.format(model_pattern),
        '--output_dir={}'.format(fsdb.selfplay_dir()),
        '--holdout_dir={}'.format(fsdb.holdout_dir()),
        '--sgf_dir={}'.format(fsdb.sgf_dir()),
        '--run_forever=true',
        '--output_bigtable={}'.format(FLAGS.output_bigtable),
    ]

    # Only name the TPU when the environment provides its endpoints.
    tpu_endpoints = os.environ.get('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS')
    if tpu_endpoints is not None:
        flagset.append('--tpu_name={}'.format(tpu_endpoints))

    if no_resign:
        flagset.append('--flagfile=rl_loop/distributed_flags_nr')
    else:
        flagset.append('--flags_path={}'.format(fsdb.flags_path()))
        flagset.append('--flagfile=rl_loop/distributed_flags')

    mask_flags.checked_run(flagset)
def time_rsync(from_date, source_dir=None, dest_dir=LOCAL_DIR):
    """Rsync every hourly directory from `from_date` up to the current UTC time.

    Directories are named with the "%Y-%m-%d-%H" timestamp of their hour;
    hours whose source directory does not exist are skipped.

    Args:
      from_date: datetime of the first hour to sync. It is compared against
        `datetime.utcnow()`.
      source_dir: directory holding the hourly sub-directories. Defaults to
        `fsdb.selfplay_dir()` when falsy.
      dest_dir: directory that receives the hourly sub-directories.
    """
    if not source_dir:
        source_dir = fsdb.selfplay_dir()

    one_hour = dt.timedelta(hours=1)
    current = from_date
    while current < dt.datetime.utcnow():
        hour_name = current.strftime("%Y-%m-%d-%H")
        src = os.path.join(source_dir, hour_name)
        if tf.gfile.Exists(src):
            _rsync_dir(src, os.path.join(dest_dir, hour_name))
        current = current + one_hour
def run_tpu():
    """Run the TPU selfplay binary against the working-dir checkpoints."""
    command = ['bazel-bin/cc/main', '--mode=selfplay', '--engine=tpu']
    command.append('--checkpoint_dir={}'.format(fsdb.working_dir()))
    command.append('--output_dir={}'.format(fsdb.selfplay_dir()))
    command.append('--holdout_dir={}'.format(fsdb.holdout_dir()))
    command.append('--sgf_dir={}'.format(fsdb.sgf_dir()))
    command.append('--flags_path={}'.format(fsdb.flags_path()))
    command.append('--run_forever=true')
    command.append('--flagfile=rl_loop/distributed_flags')
    mask_flags.checked_run(command)
async def selfplay(state):
    """Run selfplay on every configured device and log the win statistics.

    One `bazel-bin/cc/selfplay` command is built per entry in
    `FLAGS.selfplay_devices`; the commands are run via `run_commands` and the
    win-stats table at the end of each output is parsed and aggregated.

    Args:
      state: the RL loop State instance.
    """
    output_dir = os.path.join(fsdb.selfplay_dir(), state.output_model_name)
    holdout_dir = os.path.join(fsdb.holdout_dir(), state.output_model_name)
    flagfile_arg = '--flagfile={}'.format(
        os.path.join(FLAGS.flags_dir, 'selfplay.flags'))

    commands = []
    num_selfplay_processes = len(FLAGS.selfplay_devices)
    if num_selfplay_processes == 1:
        # A single process plays all of the games on device 0.
        commands.append([
            'bazel-bin/cc/selfplay',
            flagfile_arg,
            '--num_games={}'.format(FLAGS.selfplay_num_games),
            '--parallel_games={}'.format(FLAGS.selfplay_num_games_per_thread),
            '--model={}:0,{}'.format(FLAGS.engine, state.best_model_path),
            '--output_dir={}/{}'.format(output_dir, 0),
            '--holdout_dir={}/{}'.format(holdout_dir, 0),
        ])
    else:
        total_games = FLAGS.selfplay_num_games
        per_thread = FLAGS.selfplay_num_games_per_thread
        divisor = num_selfplay_processes - 1
        for index, device in enumerate(FLAGS.selfplay_devices):
            # NOTE(review): the shares are computed over (processes - 1)
            # slots but index 0 also receives one, so the per-process counts
            # sum to more than selfplay_num_games - confirm this is intended.
            lower = ((index - 1) * total_games) // divisor
            upper = (index * total_games) // divisor
            device_games = upper - lower
            # Ceiling division of the device's games by games-per-thread.
            parallel_games = (device_games + per_thread - 1) // per_thread
            commands.append([
                'bazel-bin/cc/selfplay',
                flagfile_arg,
                '--num_games={}'.format(device_games),
                '--parallel_games={}'.format(parallel_games),
                '--model={}:{},{}'.format(FLAGS.engine, device,
                                          state.best_model_path),
                '--output_dir={}/{}'.format(output_dir, index),
                '--holdout_dir={}/{}'.format(holdout_dir, index),
            ])

    all_lines = await run_commands(commands)

    games_played = 0
    black_wins_total = 0
    white_wins_total = 0
    for process_lines in all_lines:
        # The win-stats table is taken from the last six lines of output.
        tail = '\n'.join(process_lines[-6:])
        logging.info(tail)
        stats = parse_win_stats_table(tail, 1)[0]
        games_played += stats.total_wins
        black_wins_total += stats.black_wins.total
        white_wins_total += stats.white_wins.total

    logging.info('Black won %0.3f, white won %0.3f',
                 black_wins_total / games_played,
                 white_wins_total / games_played)
async def bootstrap_selfplay(state):
    """Run `concurrent_selfplay` with the bootstrap flagfile.

    Output and holdout examples are written under the `0` sub-directory of
    the directories named after `state.train_model_name`. The last six lines
    of the command output are logged.

    Args:
      state: the RL loop State instance.
    """
    model_name = state.train_model_name
    output_dir = os.path.join(fsdb.selfplay_dir(), model_name)
    holdout_dir = os.path.join(fsdb.holdout_dir(), model_name)
    flagfile = os.path.join(FLAGS.flags_dir, 'bootstrap.flags')

    args = [
        'bazel-bin/cc/concurrent_selfplay',
        '--flagfile={}'.format(flagfile),
        '--output_dir={}/0'.format(output_dir),
        '--holdout_dir={}/0'.format(holdout_dir),
    ]
    output = await run(*args)
    logging.info('\n'.join(output[-6:]))
async def bootstrap_selfplay(state):
    """Run `concurrent_selfplay` with the bootstrap flagfile.

    The model spec is `extra:0.4:0.4` or `agz:0.4:0.4` depending on
    `FLAGS.use_extra_features`, and `FLAGS.min_games_per_iteration` games are
    requested. The last six lines of the command output are logged.

    Args:
      state: the RL loop State instance.
    """
    model_name = state.train_model_name
    output_dir = os.path.join(fsdb.selfplay_dir(), model_name)
    holdout_dir = os.path.join(fsdb.holdout_dir(), model_name)

    if FLAGS.use_extra_features:
        features = 'extra'
    else:
        features = 'agz'

    flagfile = os.path.join(FLAGS.flags_dir, 'bootstrap.flags')
    args = [
        'bazel-bin/cc/concurrent_selfplay',
        '--flagfile={}'.format(flagfile),
        '--model={}:0.4:0.4'.format(features),
        '--num_games={}'.format(FLAGS.min_games_per_iteration),
        '--output_dir={}/0'.format(output_dir),
        '--holdout_dir={}/0'.format(holdout_dir),
    ]
    output = await run(*args)
    logging.info('\n'.join(output[-6:]))
async def selfplay(state, flagfile='selfplay', seed_factor=0):
    """Run selfplay and, unless MULTI_SP is set, write a golden chunk.

    Args:
      state: the RL loop State instance.
      flagfile: base name (without the `.flags` suffix) of the flagfile in
        `FLAGS.flags_dir` to pass to the selfplay binary.
      seed_factor: multiplied by 100 and added to `state.seed` to form the
        seed passed to the selfplay binary.
    """
    output_dir = os.path.join(fsdb.selfplay_dir(), state.output_model_name)
    holdout_dir = os.path.join(fsdb.holdout_dir(), state.output_model_name)

    flagfile_path = os.path.join(FLAGS.flags_dir, flagfile)
    args = [
        'bazel-bin/cc/selfplay',
        '--flagfile={}.flags'.format(flagfile_path),
        '--model={}'.format(get_ckpt_path(state.best_model_path)),
        '--output_dir={}'.format(output_dir),
        '--holdout_dir={}'.format(holdout_dir),
        '--seed={}'.format(state.seed+100*seed_factor),
    ]
    lines = await run(*args)

    result = '\n'.join(lines[-6:])
    logging.info(result)

    # Parse from a wider window than what was logged above.
    result = '\n'.join(lines[-50:])
    try:
        stats = parse_win_stats_table(result, 1)[0]
        num_games = stats.total_wins
        logging.info('Black won %0.3f, white won %0.3f',
                     stats.black_wins.total / num_games,
                     stats.white_wins.total / num_games)
    except AssertionError:
        # Poplar logging might screw up lines extraction approach.
        logging.error("No results to parse: \n %s" % lines[-50:])

    if MULTI_SP:
        return

    # Write examples to a single record.
    pattern = os.path.join(output_dir, '*', '*.zz')
    seed = state.seed
    random.seed(seed)
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # TODO(tommadams): This method of generating one golden chunk per
    # generation is sub-optimal because each chunk gets reused multiple times
    # for training, introducing bias. Instead, a fresh dataset should be
    # uniformly sampled out of *all* games in the training window before the
    # start of each training run.
    examples = example_buffer.ExampleBuffer(sampling_frac=1.0)

    # TODO(tommadams): parallel_fill is currently non-deterministic. Make it
    # not so.
    logging.info('Writing golden chunk from "{}"'.format(pattern))
    examples.parallel_fill(tf.gfile.Glob(pattern))
    chunk_path = os.path.join(fsdb.golden_chunk_dir(),
                              state.output_model_name + '.tfrecord.zz')
    examples.flush(chunk_path)
def main(unused_argv):
    """Set up the (optionally multi-node) environment and run the RL loop.

    Args:
      unused_argv: ignored.
    """
    logging.getLogger('mlperf_compliance').propagate = False

    # Multi-node setup: without MPI, behave as a single rank-0 process.
    mpi_comm = None
    mpi_rank = 0
    mpi_size = 1
    if FLAGS.use_multinode:
        mpi_comm = MPI.COMM_WORLD
        mpi_rank = mpi_comm.Get_rank()
        mpi_size = mpi_comm.Get_size()
        print('[MPI Init] MPI rank {}, mpi size is {} host is {}'.format(
            mpi_rank, mpi_size, socket.gethostname()))

    print('Wiping dir %s' % FLAGS.base_dir, flush=True)
    shutil.rmtree(FLAGS.base_dir, ignore_errors=True)

    required_dirs = [
        fsdb.models_dir(),
        fsdb.selfplay_dir(),
        fsdb.holdout_dir(),
        fsdb.eval_dir(),
        fsdb.golden_chunk_dir(),
        fsdb.working_dir(),
    ]

    # Shared filesystem used for data exchange (temporary solution,
    # 5/6/2019).
    if FLAGS.use_multinode:
        ensure_dir_exists(FLAGS.shared_dir_exchange)

    for directory in required_dirs:
        ensure_dir_exists(directory)

    # Work from a private copy of the flag files so that edits to the
    # originals cannot affect a loop that is already running.
    local_flags_dir = os.path.join(FLAGS.base_dir, 'flags')
    shutil.copytree(FLAGS.flags_dir, local_flags_dir)
    FLAGS.flags_dir = local_flags_dir

    # Keep the target model (and its `.og` companion file) alongside the
    # generated models.
    shutil.copy(FLAGS.target_path,
                os.path.join(fsdb.models_dir(), 'target.pb'))
    shutil.copy(FLAGS.target_path + '.og',
                os.path.join(fsdb.models_dir(), 'target.pb.og'))

    with logged_timer('Total time from mpi_rank={}'.format(mpi_rank)):
        try:
            rl_loop(mpi_comm, mpi_rank, mpi_size)
        finally:
            # Always close the event loop, even when the RL loop fails.
            asyncio.get_event_loop().close()
async def bootstrap_selfplay(state):
    """Run bootstrap selfplay with the `random` model.

    Results are written under the fixed name '000000-000000' in the selfplay,
    holdout and SGF directories. The last six lines of the command output are
    logged.

    Args:
      state: the RL loop State instance (not read by this function).
    """
    output_name = '000000-000000'
    output_dir = os.path.join(fsdb.selfplay_dir(), output_name)
    holdout_dir = os.path.join(fsdb.holdout_dir(), output_name)
    sgf_dir = os.path.join(fsdb.sgf_dir(), output_name)

    flagfile = os.path.join(FLAGS.flags_dir, 'bootstrap.flags')
    args = [
        'bazel-bin/cc/selfplay',
        '--flagfile={}'.format(flagfile),
        '--num_games={}'.format(FLAGS.selfplay_num_games),
        '--parallel_games=32',
        '--model=random:0,0.4:0.4',
        '--output_dir={}/0'.format(output_dir),
        '--holdout_dir={}/0'.format(holdout_dir),
        '--sgf_dir={}'.format(sgf_dir),
    ]
    output = await run(*args)
    logging.info('\n'.join(output[-6:]))
async def selfplay(state, flagfile='selfplay'):
    """Run selfplay workers concurrently and divide the results into records.

    Starts `FLAGS.num_gpus_selfplay * 2` `selfplay_sub` tasks, aggregates the
    win statistics parsed from their output and then calls `divide_record` on
    the generated example files.

    Args:
      state: the RL loop State instance.
      flagfile: the name of the flagfile to use for selfplay, either
        'selfplay' (the default) or 'bootstrap'.

    Raises:
      The first exception (in task order) raised by any `selfplay_sub` task.
    """
    output_dir = os.path.join(fsdb.selfplay_dir(), state.output_model_name)
    holdout_dir = os.path.join(fsdb.holdout_dir(), state.output_model_name)

    # Instead of 2 workers in 1 process per device, we do 2 processes with
    # 1 worker each.
    loop = asyncio.get_event_loop()
    all_tasks = [
        loop.create_task(
            selfplay_sub(state, output_dir, holdout_dir, flagfile, i))
        for i in range(FLAGS.num_gpus_selfplay * 2)
    ]
    all_lines = await asyncio.gather(*all_tasks, return_exceptions=True)

    black_wins_total = white_wins_total = num_games = 0
    for lines in all_lines:
        # With return_exceptions=True a failed task yields its exception
        # object. Re-raise any exception type (including subclasses such as
        # FileNotFoundError) instead of trying to slice it as output lines.
        if isinstance(lines, BaseException):
            raise lines
        result = '\n'.join(lines[-6:])
        logging.info(result)
        stats = parse_win_stats_table(result, 1)[0]
        num_games += stats.total_wins
        black_wins_total += stats.black_wins.total
        white_wins_total += stats.white_wins.total

    logging.info('Black won %0.3f, white won %0.3f',
                 black_wins_total / num_games,
                 white_wins_total / num_games)

    # Write examples to a single record.
    pattern = os.path.join(output_dir, '*', '*.zz')
    random.seed(state.seed)
    tf.set_random_seed(state.seed)
    np.random.seed(state.seed)

    logging.info('Writing golden chunk from "{}"'.format(pattern))
    # A rank of -1 is passed when not running multi-node.
    if FLAGS.use_multinode:
        mpi_rank = MPI.COMM_WORLD.Get_rank()
    else:
        mpi_rank = -1
    divide_record(state, pattern, FLAGS.num_gpus_train, mpi_rank)
async def selfplay_multi(state, num_ipus):
    """Run `num_ipus` selfplay processes concurrently and write a golden chunk.

    Args:
      state: the RL loop State instance.
      num_ipus: number of `selfplay_sub` tasks to start.

    Raises:
      The first exception (in task order) raised by any `selfplay_sub` task.
    """
    output_dir = os.path.join(fsdb.selfplay_dir(), state.output_model_name)
    holdout_dir = os.path.join(fsdb.holdout_dir(), state.output_model_name)
    flagfile = 'selfplay'

    loop = asyncio.get_event_loop()
    all_tasks = [
        loop.create_task(
            selfplay_sub(state, output_dir, holdout_dir, flagfile, i))
        for i in range(num_ipus)
    ]
    all_lines = await asyncio.gather(*all_tasks, return_exceptions=True)

    black_wins_total = white_wins_total = num_games = 0
    for lines in all_lines:
        # With return_exceptions=True a failed task yields its exception
        # object. Re-raise any exception type (including subclasses such as
        # FileNotFoundError) instead of trying to slice it as output lines.
        if isinstance(lines, BaseException):
            raise lines
        result = '\n'.join(lines[-6:])
        logging.info(result)
        stats = parse_win_stats_table(result, 1)[0]
        num_games += stats.total_wins
        black_wins_total += stats.black_wins.total
        white_wins_total += stats.white_wins.total

    logging.info('Black won %0.3f, white won %0.3f',
                 black_wins_total / num_games,
                 white_wins_total / num_games)

    # The aggregation below mirrors the single-process selfplay path.
    # Potentially it could run in parallel with training.
    # Write examples to a single record.
    pattern = os.path.join(output_dir, '*', '*.zz')
    random.seed(state.seed)
    tf.set_random_seed(state.seed)
    np.random.seed(state.seed)

    # TODO(tommadams): This method of generating one golden chunk per
    # generation is sub-optimal because each chunk gets reused multiple times
    # for training, introducing bias. Instead, a fresh dataset should be
    # uniformly sampled out of *all* games in the training window before the
    # start of each training run.
    buffer = example_buffer.ExampleBuffer(sampling_frac=1.0)

    # TODO(tommadams): parallel_fill is currently non-deterministic. Make it
    # not so.
    logging.info('Writing golden chunk from "{}"'.format(pattern))
    buffer.parallel_fill(tf.gfile.Glob(pattern))
    buffer.flush(os.path.join(fsdb.golden_chunk_dir(),
                              state.output_model_name + '.tfrecord.zz'))
def run_cc():
    """Run the C++ selfplay binary for the latest model.

    If more than 25000 games already exist for that model, print a notice,
    sleep for ten minutes and exit the process instead.
    """
    _, model_name = fsdb.get_latest_model()

    num_games_finished = len(fsdb.get_games(model_name))
    if num_games_finished > 25000:
        print("{} has enough games! ({})".format(model_name,
                                                 num_games_finished))
        time.sleep(10 * 60)
        sys.exit()

    command = [
        'bazel-bin/cc/selfplay',
        '--model=tf,{}'.format(model_name),
        '--mode=selfplay',
    ]
    command.append(
        '--output_dir={}/{}'.format(fsdb.selfplay_dir(), model_name))
    command.append(
        '--holdout_dir={}/{}'.format(fsdb.holdout_dir(), model_name))
    command.append('--sgf_dir={}/{}'.format(fsdb.sgf_dir(), model_name))
    command.append('--flagfile=rl_loop/distributed_flags')
    mask_flags.checked_run(command)
async def sample_training_examples(state):
    """Sample training examples from recent selfplay games.

    Training examples are read from the directory hierarchy
    `selfplay_dir/device_id/model_name/timestamp/`, restricted to the
    `FLAGS.window_size` model names that sort last.

    Args:
      state: the RL loop State instance; `state.train_model_name` names the
        output chunks.

    Returns:
      The sorted list of chunk file paths that were written (exactly 8,
      enforced by an assertion).
    """
    device_dirs = [
        entry.path for entry in os.scandir(fsdb.selfplay_dir())
        if entry.is_dir()
    ]

    # Collect the model names seen under any device directory.
    model_names = {
        entry.name
        for device_dir in device_dirs
        for entry in os.scandir(device_dir)
        if entry.is_dir()
    }
    recent_models = sorted(model_names, reverse=True)[:FLAGS.window_size]

    src_patterns = [
        os.path.join(device_dir, model, '*', '*.tfrecord.zz')
        for device_dir in device_dirs
        for model in recent_models
    ]

    dst_path = os.path.join(
        fsdb.golden_chunk_dir(),
        '{}.tfrecord.zz'.format(state.train_model_name))
    logging.info('Writing training chunks to %s', dst_path)

    output_lines = await sample_records(src_patterns, dst_path,
                                        num_read_threads=8,
                                        num_write_threads=8,
                                        sample_frac=FLAGS.train_filter)
    logging.info('\n'.join(output_lines))

    chunk_pattern = os.path.join(
        fsdb.golden_chunk_dir(),
        '{}-*-of-*.tfrecord.zz'.format(state.train_model_name))
    chunk_paths = sorted(tf.gfile.Glob(chunk_pattern))
    # One chunk is expected per write thread requested above.
    assert len(chunk_paths) == 8
    return chunk_paths
def wait_for_training_examples(state, num_games):
    """Wait for training examples to be generated by the latest model.

    Polls the selfplay directory once per second. Once it contains
    `state.iter_num` entries, the entry whose name sorts last is globbed for
    example files, and the function returns when at least `num_games` are
    found.

    Args:
      state: the RL loop State instance.
      num_games: number of games to wait for.
    """
    first_time_around = True
    while True:
        # os.scandir yields entries in arbitrary order, so sort by name
        # before treating the last entry as the latest model.
        model_dirs = sorted(os.scandir(fsdb.selfplay_dir()),
                            key=lambda entry: entry.name)
        if len(model_dirs) == state.iter_num:
            pattern = os.path.join(model_dirs[-1].path, '*', '*',
                                   '*.tfrecord.zz')
            paths = tf.gfile.Glob(pattern)
            if len(paths) >= num_games:
                break
        if first_time_around:
            # Only announce the wait once, not on every poll.
            logging.info('Waiting for %d games', num_games)
            first_time_around = False
        time.sleep(1)
def wait_for_training_examples(state, selfplay_processes, num_games):
    """Wait for training examples to be generated by the latest model.

    Polls once per second, calling `check_on_selfplay` on every iteration,
    until the directory for `state.selfplay_model_name` holds at least
    `num_games` example files.

    Args:
      state: the RL loop State instance.
      selfplay_processes: passed to `check_on_selfplay` on every poll.
      num_games: number of games to wait for.
    """
    announced = False
    while True:
        check_on_selfplay(selfplay_processes)

        model_dir = os.path.join(fsdb.selfplay_dir(),
                                 state.selfplay_model_name)
        if os.path.isdir(model_dir):
            pattern = os.path.join(model_dir, '*', '*', '*.tfrecord.zz')
            found = sorted(tf.gfile.Glob(pattern))
            if len(found) >= num_games:
                return

        if not announced:
            # Only announce the wait once, not on every poll.
            logging.info('Waiting for %d games in %s', num_games, model_dir)
            announced = True
        time.sleep(1)
def main(unused_argv):
    """Prepare a clean working tree, configure logging and run the RL loop.

    The `mll` init/run markers bracket the setup: `init_start` is emitted
    first, and `init_stop` / `run_start` are emitted immediately before
    `rl_loop()`.

    Args:
      unused_argv: ignored.
    """
    mll.init_start()
    print('Wiping dir %s' % FLAGS.base_dir, flush=True)
    shutil.rmtree(FLAGS.base_dir, ignore_errors=True)

    required_dirs = [
        fsdb.models_dir(),
        fsdb.selfplay_dir(),
        fsdb.holdout_dir(),
        fsdb.eval_dir(),
        fsdb.golden_chunk_dir(),
        fsdb.working_dir(),
        fsdb.mpi_log_dir(),
    ]
    for directory in required_dirs:
        ensure_dir_exists(directory)

    # Work from a private copy of the flag files so that edits to the
    # originals cannot affect a loop that is already running.
    local_flags_dir = os.path.join(FLAGS.base_dir, 'flags')
    shutil.copytree(FLAGS.flags_dir, local_flags_dir)
    FLAGS.flags_dir = local_flags_dir

    # Keep the target model alongside the generated models.
    shutil.copy(FLAGS.target_path,
                os.path.join(fsdb.models_dir(), 'target.pb'))

    # Log to a file as well, and timestamp every handler's output.
    root_logger = logging.getLogger()
    log_path = os.path.join(FLAGS.base_dir, 'rl_loop.log')
    root_logger.addHandler(logging.FileHandler(log_path))
    formatter = logging.Formatter('[%(asctime)s] %(message)s',
                                  '%Y-%m-%d %H:%M:%S')
    for handler in root_logger.handlers:
        handler.setFormatter(formatter)

    logging.info('Selfplay nodes = {}'.format(FLAGS.selfplay_node))
    logging.info('Train nodes = {}'.format(FLAGS.train_node))
    logging.info('Eval nodes = {}'.format(FLAGS.eval_node))

    with logged_timer('Total time'):
        try:
            mll.init_stop()
            mll.run_start()
            rl_loop()
        finally:
            # Always close the event loop, even when the RL loop fails.
            asyncio.get_event_loop().close()
def selfplay(state):
    """Run the selfplay binary and write one golden chunk from its output.

    Args:
      state: the RL loop State instance; supplies the play output name, the
        play model path and the random seed.
    """
    output_name = state.play_output_name
    games_dir = os.path.join(fsdb.selfplay_dir(), output_name)
    holdout_games_dir = os.path.join(fsdb.holdout_dir(), output_name)

    base_cmd = [
        'external/minigo/cc/main',
        '--mode=selfplay',
        '--parallel_games=2048',
        '--num_readouts=100',
        '--model={}'.format(state.play_model_path),
        '--output_dir={}'.format(games_dir),
        '--holdout_dir={}'.format(holdout_games_dir),
    ]
    result = checked_run(base_cmd + cc_flags(state), 'selfplay')
    logging.info(get_lines(result, make_slice[-2:]))

    # Write examples to a single record.
    logging.info('Extracting examples')
    seed = state.seed
    random.seed(seed)
    tensorflow.set_random_seed(seed)
    numpy.random.seed(seed)

    examples = example_buffer.ExampleBuffer(sampling_frac=1.0)
    example_files = tensorflow.gfile.Glob(os.path.join(games_dir, '*.zz'))
    examples.parallel_fill(example_files)
    chunk_path = os.path.join(fsdb.golden_chunk_dir(),
                              output_name + '.tfrecord.zz')
    examples.flush(chunk_path)
def make_chunk_for(output_dir=LOCAL_DIR, local_dir=LOCAL_DIR, game_dir=None,
                   model_num=1, positions=EXAMPLES_PER_GENERATION, threads=8,
                   sampling_frac=0.02):
    """Explicitly make a golden chunk for model `model_num`.

    `model_num` need not be the most recent model. Game files are gathered
    from the models preceding it, newest first, until the estimated number of
    sampled positions exceeds `positions`.

    Args:
      output_dir: directory the chunk is written to (created if needed).
      local_dir: directory holding local copies of each model's games.
      game_dir: source of game directories to rsync from when a local copy is
        missing. Defaults to `fsdb.selfplay_dir()` when falsy.
      model_num: number of the model to build the chunk for.
      positions: number of positions wanted in the chunk.
      threads: thread count passed to `parallel_fill`.
      sampling_frac: sampling fraction passed to the ExampleBuffer.
    """
    if not game_dir:
        game_dir = fsdb.selfplay_dir()
    ensure_dir_exists(output_dir)

    earlier_models = [entry for entry in fsdb.get_models()
                      if entry[0] < model_num]
    buf = ExampleBuffer(positions, sampling_frac=sampling_frac)

    files = []
    for _, model_name in sorted(earlier_models, reverse=True):
        model_games_dir = os.path.join(local_dir, model_name)
        if not tf.gfile.Exists(model_games_dir):
            print("Rsyncing", model_name)
            _rsync_dir(os.path.join(game_dir, model_name), model_games_dir)
        files += tf.gfile.Glob(os.path.join(model_games_dir, '*.zz'))
        print("{}: {} games".format(model_name, len(files)))
        # The estimate uses 200 positions per game file.
        # NOTE(review): 200 looks like an assumed average game length -
        # confirm.
        estimated_positions = len(files) * 200 * sampling_frac
        if estimated_positions > positions:
            break

    print("Filling from {} files".format(len(files)))
    buf.parallel_fill(files, threads=threads)
    print(buf)

    output = os.path.join(output_dir, str(model_num) + '.tfrecord.zz')
    print("Writing to", output)
    buf.flush(output)
async def selfplay(state, flagfile='selfplay'):
    """Run selfplay and write training chunks to the fsdb golden_chunk_dir.

    Selfplay runs as a single process, as several local instances, or
    distributed over `FLAGS.selfplay_node`, depending on the
    `<flagfile>_mi.flags` settings. The generated example files are then
    split into `FLAGS.golden_chunk_split` parts which `gen_golden_chunk`
    processes in a multiprocessing pool.

    Args:
      state: the RL loop State instance.
      flagfile: the name of the flagfile to use for selfplay, either
        'selfplay' (the default) or 'bootstrap'.

    Returns:
      The colour bias of the games played: |white wins - black wins| divided
      by the total number of games.
    """
    output_dir = os.path.join(fsdb.selfplay_dir(), state.output_model_name)
    holdout_dir = os.path.join(fsdb.holdout_dir(), state.output_model_name)

    multi_instance, num_instance, _ = extract_multi_instance(
        ['--flagfile={}_mi.flags'.format(
            os.path.join(FLAGS.flags_dir, flagfile))])
    sp_cmd = ['bazel-bin/cc/selfplay',
              '--flagfile={}.flags'.format(
                  os.path.join(FLAGS.flags_dir, flagfile)),
              '--model={}'.format(state.best_model_path),
              '--output_dir={}'.format(output_dir),
              '--holdout_dir={}'.format(holdout_dir)]

    if not multi_instance:
        lines = await run(*sp_cmd, '--seed={}'.format(state.seed))
    elif FLAGS.selfplay_node == []:
        # Run selfplay locally.
        lines = await run(
            'python3', 'ml_perf/execute.py',
            '--num_instance={}'.format(num_instance),
            '--',
            *sp_cmd,
            '--seed={}'.format(state.seed))
    else:
        with logged_timer('selfplay mn'):
            # Run one selfplay instance per host.
            lines = await run_distributed(
                ['LD_LIBRARY_PATH=$LD_LIBRARY_PATH:cc/tensorflow'],
                num_instance, FLAGS.selfplay_node, None, None, state.seed,
                *sp_cmd)

    result = '\n'.join(lines)
    with logged_timer('parse win stats'):
        stats = parse_win_stats_table(result, 1)[0]
        num_games = stats.total_wins
        black_total = stats.black_wins.total
        white_total = stats.white_wins.total

        logging.info('Black won %0.3f, white won %0.3f',
                     black_total / num_games,
                     white_total / num_games)
        bias = abs(white_total - black_total)/num_games
        logging.info(
            'Black total %d, white total %d, total games %d, bias %0.3f.',
            black_total, white_total, num_games, bias)

    with logged_timer('generate golden chunk'):
        # Write examples to a single record.
        pattern = os.path.join(output_dir, '*', '*.zz')
        files = tf.gfile.Glob(pattern)

        random.seed(state.seed)
        tf.set_random_seed(state.seed)
        np.random.seed(state.seed)

        # TODO(tommadams): This method of generating one golden chunk per
        # generation is sub-optimal because each chunk gets reused multiple
        # times for training, introducing bias. Instead, a fresh dataset
        # should be uniformly sampled out of *all* games in the training
        # window before the start of each training run.

        # TODO(tommadams): parallel_fill is currently non-deterministic. Make
        # it not so.
        logging.info('Writing golden chunk from "{}"'.format(pattern))

        threads = FLAGS.golden_chunk_split
        chunk_size = len(files) // threads
        # Split the files into `threads` parts; the last part also takes the
        # remainder. Each work item is [part index, files in that part].
        file_list = []
        for i in range(threads):
            start = chunk_size * i
            stop = None if i == threads - 1 else chunk_size * (i + 1)
            file_list.append([i, files[start:stop]])

        # Use the pool as a context manager so the worker processes are
        # released once the (blocking) map call has finished.
        with mp.Pool(threads) as pool:
            pool.map(functools.partial(gen_golden_chunk, state=state),
                     file_list)

    return bias
def main(unused_argv):
    """Run one end-to-end pass of the pipeline as an integration test.

    Steps: bootstrap a model, play selfplay games (including one pass that
    holds out every game), build a golden chunk, train a new model, validate
    it on the held-out games, run selfplay with the trained model and finally
    evaluate the two models against each other.

    Args:
      unused_argv: ignored.
    """
    local_flags = '--flagfile=rl_loop/local_flags'

    dir_getters = (
        fsdb.models_dir,
        fsdb.selfplay_dir,
        fsdb.holdout_dir,
        fsdb.sgf_dir,
        fsdb.eval_dir,
        fsdb.golden_chunk_dir,
        fsdb.working_dir,
    )
    for get_dir in dir_getters:
        utils.ensure_dir_exists(get_dir())

    bootstrap_name = shipname.generate(0)
    bootstrap_model_path = os.path.join(fsdb.models_dir(), bootstrap_name)
    mask_flags.checked_run([
        'python3', 'bootstrap.py',
        '--export_path={}'.format(bootstrap_model_path),
        '--work_dir={}'.format(fsdb.working_dir()),
        local_flags,
    ])

    selfplay_cmd = [
        'python3', 'selfplay.py',
        '--load_file={}'.format(bootstrap_model_path),
        '--selfplay_dir={}'.format(
            os.path.join(fsdb.selfplay_dir(), bootstrap_name)),
        '--holdout_dir={}'.format(
            os.path.join(fsdb.holdout_dir(), bootstrap_name)),
        '--sgf_dir={}'.format(fsdb.sgf_dir()),
        '--holdout_pct=0',
        local_flags,
    ]

    # Selfplay twice.
    for _ in range(2):
        mask_flags.checked_run(selfplay_cmd)
    # And once more to generate a held out game for validation. This exploits
    # the flags behavior where, if a flag is passed twice, the second one
    # wins.
    mask_flags.checked_run(selfplay_cmd + ['--holdout_pct=100'])

    # Double check that at least one sgf has been generated.
    assert os.listdir(os.path.join(fsdb.sgf_dir(), 'full'))

    print("Making shuffled golden chunk from selfplay data...")
    # TODO(amj): refactor example_buffer so it can be called the same way
    # as everything else.
    eb.make_chunk_for(output_dir=fsdb.golden_chunk_dir(),
                      local_dir=fsdb.working_dir(),
                      game_dir=fsdb.selfplay_dir(),
                      model_num=1,
                      positions=64,
                      threads=8,
                      sampling_frac=1)

    chunk_glob = os.path.join(fsdb.golden_chunk_dir(), '*.tfrecord.zz')
    tf_records = sorted(gfile.Glob(chunk_glob))

    trained_model_name = shipname.generate(1)
    trained_model_path = os.path.join(fsdb.models_dir(), trained_model_name)

    # Train on shuffled game data.
    mask_flags.checked_run([
        'python3', 'train.py', *tf_records,
        '--work_dir={}'.format(fsdb.working_dir()),
        '--export_path={}'.format(trained_model_path),
        local_flags,
    ])

    # Validate the trained model on the held out game.
    mask_flags.checked_run([
        'python3', 'validate.py',
        os.path.join(fsdb.holdout_dir(), bootstrap_name),
        '--work_dir={}'.format(fsdb.working_dir()),
        local_flags,
    ])

    # Verify that the trained model works for selfplay. Again relies on the
    # second occurrence of a repeated flag winning.
    mask_flags.checked_run(
        selfplay_cmd + ['--load_file={}'.format(trained_model_path)])

    mask_flags.checked_run([
        'python3', 'evaluate.py',
        bootstrap_model_path, trained_model_path,
        '--games=1',
        '--eval_sgf_dir={}'.format(fsdb.eval_dir()),
        local_flags,
    ])
    print("Completed integration test!")
async def selfplay(state, flagfile='selfplay'):
    """Run selfplay and write a training chunk to the fsdb golden_chunk_dir.

    Selfplay runs in one of three ways: as a single local process, as
    several local instances launched through ml_perf/execute.py, or
    distributed over the hosts in FLAGS.selfplay_node. Afterwards
    ml_perf/divide_golden_chunk.py is run to split the generated games into
    golden-chunk files.

    Args:
        state: the RL loop State instance.
        flagfile: the name of the flagfile to use for selfplay, either
            'selfplay' (the default) or 'bootstrap'.
    """
    # Selfplay games go under a /tmp/minigo prefix; the holdout directory
    # is used as-is.
    output_dir = '/tmp/minigo' + os.path.join(fsdb.selfplay_dir(),
                                              state.output_model_name)
    holdout_dir = os.path.join(fsdb.holdout_dir(), state.output_model_name)

    flag_path = os.path.join(FLAGS.flags_dir, flagfile)
    multi_instance, num_instance, _ = extract_multi_instance(
        ['--flagfile={}_mi.flags'.format(flag_path)])

    sp_cmd = [
        'bazel-bin/cc/selfplay',
        '--flagfile={}.flags'.format(flag_path),
        '--model={}'.format(state.best_model_path),
        '--output_dir={}'.format(output_dir),
        '--holdout_dir={}'.format(holdout_dir),
    ]
    seed_flag = '--seed={}'.format(state.seed)

    # Any empty value (not only a literal []) means that no selfplay hosts
    # were configured, so the work stays on this machine.
    selfplay_hosts = FLAGS.selfplay_node

    # The selfplay output is not parsed for win statistics, so the returned
    # lines are discarded.
    if not multi_instance:
        await run(*sp_cmd, seed_flag)
    elif not selfplay_hosts:
        # Run all selfplay instances locally.
        await run('python3', 'ml_perf/execute.py',
                  '--num_instance={}'.format(num_instance),
                  '--', *sp_cmd, seed_flag)
    else:
        with logged_timer('selfplay mn'):
            # Run one selfplay instance per host.
            await run_distributed(
                ['LD_LIBRARY_PATH=$LD_LIBRARY_PATH:cc/tensorflow'],
                num_instance, selfplay_hosts, None, None, state.seed,
                *sp_cmd)

    with logged_timer('generate golden chunk'):
        # Write examples to golden-chunk records.
        hosts = selfplay_hosts or ['localhost']
        numa_per_node = FLAGS.physical_cores // FLAGS.numa_cores
        train_instance_num = (FLAGS.train_instance_per_numa *
                              len(FLAGS.train_node) * numa_per_node)
        # The gcd always divides train_instance_num, so floor division is
        # exact and avoids a round trip through float.
        out_files_number = train_instance_num // gcd(train_instance_num,
                                                     len(hosts))
        cmd = [
            'python3', 'ml_perf/divide_golden_chunk.py',
            '--read_path={}'.format(output_dir + "/*"),
            '--write_path={}'.format(
                os.path.join(fsdb.golden_chunk_dir(),
                             state.output_model_name + '.tfrecord.zz')),
            '--out_files_number={}'.format(out_files_number),
            '--physical_cores={}'.format(FLAGS.physical_cores),
            '--base_dir={}'.format(FLAGS.base_dir),
        ]
        await run_distributed([], 1, hosts, None, None, state.seed, *cmd)
def selfplay_noasync(state, flagfile='selfplay'):
    """Run selfplay and write a training chunk to the fsdb golden_chunk_dir.

    Spawns 2 * FLAGS.num_gpus_selfplay selfplay_mpi workers on the local
    host through MPI, waits for them at a barrier, then seeds the Python,
    TensorFlow and NumPy random generators from state.seed and hands the
    generated records to divide_record.

    Args:
        state: the RL loop State instance.
        flagfile: the name of the flagfile to use for selfplay, either
            'selfplay' (the default) or 'bootstrap'.
    """
    output_dir = os.path.join(fsdb.selfplay_dir(), state.output_model_name)
    holdout_dir = os.path.join(fsdb.holdout_dir(), state.output_model_name)

    # A rank of -1 is what divide_record receives for single-node runs.
    mpi_rank = MPI.COMM_WORLD.Get_rank() if FLAGS.use_multinode else -1

    base_seed = state.seed * FLAGS.num_gpus_selfplay * 2
    if FLAGS.use_multinode:
        # Offset the seed by rank so each node starts from a different value.
        base_seed += mpi_rank * 1433

    num_workers = 2 * FLAGS.num_gpus_selfplay

    mpi_info = MPI.Info.Create()
    # TODO: set hosts to self play nodes here.
    mpi_info.Set("host", socket.gethostname())
    mpi_info.Set("bind_to", "none")

    worker_args = [
        'bazel-bin/cc/selfplay_mpi',
        '--flagfile={}.flags'.format(os.path.join(FLAGS.flags_dir, flagfile)),
        '--model={}'.format(state.best_model_path),
        '--output_dir={}'.format(output_dir),
        '--holdout_dir={}'.format(holdout_dir),
        '--seed={}'.format(base_seed),
    ]
    icomm = MPI.COMM_SELF.Spawn("ompi_bind_DGX1.sh",
                                maxprocs=num_workers,
                                args=worker_args,
                                info=mpi_info)
    # Wait at a barrier shared with the spawned workers, then drop the
    # connection to them.
    icomm.barrier()
    icomm.Disconnect()

    # Write examples to a single record.
    pattern = os.path.join(output_dir, '*', '*.zz')
    random.seed(state.seed)
    tf.set_random_seed(state.seed)
    np.random.seed(state.seed)
    logging.info('Writing golden chunk from "{}"'.format(pattern))
    divide_record(state, pattern, FLAGS.num_gpus_train, mpi_rank)
def main(unused_argv):
    """Chain NUM_LOOP rounds of selfplay and training.

    Every round switches fsdb to a base directory named after the model it
    is about to produce, runs selfplay with the previous round's model,
    builds a golden chunk and trains the new model on it. The very first
    round additionally bootstraps the starting model. NUM_LOOP and base_dir
    are module-level names defined outside this function.

    Args:
        unused_argv: positional command-line arguments; not used.
    """
    local_flagfile = '--flagfile=rl_loop/local_flags'

    for i in range(NUM_LOOP):
        first_round = i == 0

        # The source model is the bootstrap model on the first round and
        # the previous round's output afterwards. Its path is resolved
        # before fsdb is switched to the destination base directory.
        if first_round:
            src_model_name = shipname.generate(0)
            fsdb.switch_base(os.path.join(base_dir, src_model_name))
        else:
            src_model_name = dst_model_name
        src_model_path = os.path.join(fsdb.models_dir(), src_model_name)

        if first_round:
            bootstrap_model_path = os.path.join(fsdb.models_dir(),
                                                src_model_name)
            bootstrap_cmd = [
                'python3', 'bootstrap.py',
                '--export_path={}'.format(bootstrap_model_path),
                '--work_dir={}'.format(fsdb.working_dir()),
                local_flagfile,
            ]
            mask_flags.checked_run(bootstrap_cmd)

        dst_model_name = shipname.generate(i + 1)
        fsdb.switch_base(os.path.join(base_dir, dst_model_name))

        # Create the directory layout under the destination base.
        dir_getters = (
            fsdb.models_dir,
            fsdb.selfplay_dir,
            fsdb.holdout_dir,
            fsdb.sgf_dir,
            fsdb.eval_dir,
            fsdb.golden_chunk_dir,
            fsdb.working_dir,
        )
        for get_dir in dir_getters:
            utils.ensure_dir_exists(get_dir())

        print(src_model_name)
        print(src_model_path)

        games_dir = os.path.join(fsdb.selfplay_dir(), dst_model_name)
        heldout_games_dir = os.path.join(fsdb.holdout_dir(), dst_model_name)
        selfplay_cmd = [
            'python3', 'selfplay.py',
            '--load_file={}'.format(src_model_path),
            '--selfplay_dir={}'.format(games_dir),
            '--holdout_dir={}'.format(heldout_games_dir),
            '--sgf_dir={}'.format(fsdb.sgf_dir()),
            '--holdout_pct=0',
            local_flagfile,
        ]

        # Two ordinary selfplay runs to produce training games.
        for _ in range(2):
            mask_flags.checked_run(selfplay_cmd)
        # A third run produces a held-out game for validation. A flag
        # passed twice takes its last value, so appending the override is
        # enough.
        mask_flags.checked_run(selfplay_cmd + ['--holdout_pct=100'])

        # Sanity check: selfplay must have written at least one SGF.
        full_sgf_dir = os.path.join(fsdb.sgf_dir(), 'full')
        assert os.listdir(full_sgf_dir)

        print("Making shuffled golden chunk from selfplay data...")
        # TODO(amj): refactor example_buffer so it can be called the same
        # way as everything else.
        eb.make_chunk_for(
            output_dir=fsdb.golden_chunk_dir(),
            local_dir=fsdb.working_dir(),
            game_dir=fsdb.selfplay_dir(),
            model_num=1,
            positions=64,
            threads=8,
            sampling_frac=1,
        )

        chunk_pattern = os.path.join(fsdb.golden_chunk_dir(), '*.tfrecord.zz')
        tf_records = sorted(gfile.Glob(chunk_pattern))

        # The model trained this round carries the destination name.
        trained_model_name = dst_model_name
        trained_model_path = os.path.join(fsdb.models_dir(),
                                          trained_model_name)

        # Train on the shuffled game data.
        train_cmd = ['python3', 'train.py']
        train_cmd.extend(tf_records)
        train_cmd.extend([
            '--work_dir={}'.format(fsdb.working_dir()),
            '--export_path={}'.format(trained_model_path),
            local_flagfile,
        ])
        mask_flags.checked_run(train_cmd)

    print("Finished!")