def fill_and_wait_models(bufsize=EXAMPLES_PER_GENERATION, write_dir=None,
                         threads=8, model_window=100, skip_first_rsync=False):
    """Fill a ringbuffer with positions from the most recent games, then
    continually rsync and update the buffer until a new model is promoted.

    Once it detects a new model, it dumps the buffer contents so training
    can immediately begin on the next model.

    Args:
        bufsize: number of positions the ring buffer holds.
        write_dir: directory for the flushed training chunk; defaults to
            fsdb.golden_chunk_dir().
        threads: worker threads used to fill the buffer.
        model_window: how many of the most recent models to draw games from.
        skip_first_rsync: if True, skip the initial (large) rsync of games.
    """
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    models = fsdb.get_models()[-model_window:]

    if not skip_first_rsync:
        with timer("Rsync"):
            # Pull games reaching back several models to warm the local copy.
            smart_rsync(models[-1][0] - 6)
    files = tqdm(map(files_for_model, models), total=len(models))
    buf.parallel_fill(list(itertools.chain(*files)), threads=threads)

    print("Filled buffer, watching for new games")
    while fsdb.get_latest_model()[0] == models[-1][0]:
        with timer("Rsync"):
            smart_rsync(models[-1][0] - 2)
        # Only the two newest models can still be producing games, so only
        # their files are re-scanned (total matches that slice, not all
        # models — the original passed len(models) here, inflating the bar).
        new_files = tqdm(map(files_for_model, models[-2:]),
                         total=len(models[-2:]))
        buf.update(list(itertools.chain(*new_files)))
        time.sleep(60)
    latest = fsdb.get_latest_model()

    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(os.path.join(write_dir, str(latest[0] + 1) + '.tfrecord.zz'))
def fill_and_wait_time(bufsize=EXAMPLES_PER_GENERATION, write_dir=None,
                       threads=32, start_from=None):
    # Fill a ring buffer from the most recent hourly game directories, then
    # poll for new games until a new model is promoted and enough new games
    # have accumulated (or immediately, when a "fast write" is pending);
    # finally flush the buffer as a training chunk.
    #
    # Args:
    #   bufsize: number of positions the ring buffer holds.
    #   write_dir: output directory used to decide which chunk to make;
    #     defaults to fsdb.golden_chunk_dir().
    #   threads: worker threads used to fill the buffer.
    #   start_from: datetime to rsync from; defaults to "now" (UTC, naive).
    start_from = start_from or dt.datetime.utcnow()
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    chunk_to_make, fast_write = _determine_chunk_to_make(write_dir)

    hours = fsdb.get_hour_dirs()
    with timer("Rsync"):
        # Sync from whichever is earlier: the newest hour directory or the
        # requested start point.
        # NOTE(review): the "%Y-%m-%d-%H/" format implies hour dir names
        # carry a trailing slash — confirm against fsdb.get_hour_dirs().
        time_rsync(
            min(dt.datetime.strptime(hours[-1], "%Y-%m-%d-%H/"), start_from))
        # Record the sync time so the polling loop below only re-syncs the
        # window since the last pass.
        start_from = dt.datetime.utcnow()

    hours = fsdb.get_hour_dirs()
    # Lazily glob each hour dir, newest first, skipping dirs that were not
    # mirrored locally.
    files = (tf.gfile.Glob(os.path.join(LOCAL_DIR, d, "*.zz"))
             for d in reversed(hours)
             if tf.gfile.Exists(os.path.join(LOCAL_DIR, d)))
    # Limit to the window size appropriate for this chunk.
    files = itertools.islice(files, get_window_size(chunk_to_make))

    models = fsdb.get_models()
    buf.parallel_fill(
        list(itertools.chain.from_iterable(files)), threads=threads)
    print("Filled buffer, watching for new games")

    # Keep polling while the model is unchanged OR too few new games have
    # arrived since the fill.
    while (fsdb.get_latest_model() == models[-1] or
           buf.total_updates < MINIMUM_NEW_GAMES):
        with timer("Rsync"):
            # Re-sync with a 60-minute overlap to avoid missing games that
            # landed just before the previous sync finished.
            time_rsync(start_from - dt.timedelta(minutes=60))
            start_from = dt.datetime.utcnow()
        hours = sorted(fsdb.get_hour_dirs(LOCAL_DIR))
        # Only the two most recent hour dirs can contain new games.
        new_files = list(
            map(lambda d: tf.gfile.Glob(os.path.join(LOCAL_DIR, d, '*.zz')),
                hours[-2:]))
        buf.update(list(itertools.chain.from_iterable(new_files)))
        if fast_write:
            # A fast write skips the wait entirely after one refresh.
            break
        time.sleep(30)
        if fsdb.get_latest_model() != models[-1]:
            print("New model! Waiting for games. Got",
                  buf.total_updates, "new games so far")

    latest = fsdb.get_latest_model()
    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(chunk_to_make)
def run_cc():
    """Launch C++ selfplay for the latest model.

    If the latest model already has more than 25000 finished games, print a
    notice, sleep ten minutes, and exit the process instead of playing.
    """
    _, name = fsdb.get_latest_model()
    finished = len(fsdb.get_games(name))

    # Guard: enough games already exist for this model — back off and exit.
    if finished > 25000:
        print("{} has enough games! ({})".format(name, finished))
        time.sleep(10 * 60)
        sys.exit()

    selfplay_cmd = [
        'bazel-bin/cc/selfplay',
        '--model=tf,{}'.format(name),
        '--mode=selfplay',
        '--output_dir={}/{}'.format(fsdb.selfplay_dir(), name),
        '--holdout_dir={}/{}'.format(fsdb.holdout_dir(), name),
        '--sgf_dir={}/{}'.format(fsdb.sgf_dir(), name),
        '--flagfile=rl_loop/distributed_flags',
    ]
    mask_flags.checked_run(selfplay_cmd)
def train():
    """Train a new model from gathered game data, then freeze its graphs.

    Initializes from the latest model, runs train.py, and on success freezes
    the checkpoint two ways: a TPU-rewritten graph from the work_dir
    checkpoint for the TPU selfplayers, and a non-TPU graph from the
    exported model for later GPU use.

    Returns:
        The CompletedProcess of the first failing step, or of the training
        run itself when every step succeeded.
    """
    model_num, model_name = fsdb.get_latest_model()
    print("Training on gathered game data, initializing from {}".format(
        model_name))
    new_model_num = model_num + 1
    new_model_name = shipname.generate(new_model_num)
    print("New model will be {}".format(new_model_name))
    save_file = os.path.join(fsdb.models_dir(), new_model_name)

    # TODO(jacksona): Refactor train.py to take the filepath as a flag.
    cmd = ['python3', 'train.py', '__unused_file__',
           '--use_tpu',
           '--use_bt',
           '--work_dir={}'.format(fsdb.working_dir()),
           '--tpu_name={}'.format(TPU_NAME),
           '--flagfile=rl_loop/distributed_flags',
           '--export_path={}'.format(save_file)]

    completed_process = mask_flags.run(cmd)
    # A child killed by a signal reports a *negative* returncode, so check
    # for any nonzero value; the original `> 0` treated signal deaths as
    # success.
    if completed_process.returncode != 0:
        print("Training failed!")
        return completed_process

    # Train.py already copies the {data,index,meta} files to $BUCKET/models
    # Persist the checkpoint two ways:
    # Freeze the .ckpt file in the work_dir for the TPU selfplayers
    # Freeze a non-tpu version of the graph for later GPU use.
    latest_checkpoint = tf.train.latest_checkpoint(fsdb.working_dir())

    p = freeze(latest_checkpoint, rewrite_tpu=True)
    if p.returncode != 0:
        print("== TPU freeze failed!")
        return p

    p = freeze(save_file, rewrite_tpu=False)
    if p.returncode != 0:
        print("== Model freeze failed!")
        return p

    return completed_process