def _determine_chunk_to_make(write_dir):
    """
    Returns the full path of the chunk to make (gs://...) and a boolean,
    indicating whether we should wait for a new model or if we're 'behind'
    and should just write out our current chunk immediately.
    True == write immediately.
    """
    def chunk_path(model_num):
        # Golden chunks are named for the model they will be used to train.
        return os.path.join(write_dir, str(model_num) + '.tfrecord.zz')

    latest_num = fsdb.get_models()[-1][0]
    # Last model is N. N+1 (should be) training. We should gather games
    # for N+2.
    chunk_to_make = chunk_path(latest_num + 1)
    if not tf.gfile.Exists(chunk_to_make):
        # N+1 is missing. Write it out ASAP
        print("Making chunk ASAP:", chunk_to_make)
        return chunk_to_make, True

    chunk_to_make = chunk_path(latest_num + 2)
    while tf.gfile.Exists(chunk_to_make):
        print("Chunk for next model ({}) already exists. Sleeping.".format(
            chunk_to_make))
        time.sleep(5 * 60)
        latest_num = fsdb.get_models()[-1][0]
        chunk_to_make = chunk_path(latest_num + 2)
    print("Making chunk:", chunk_to_make)
    return chunk_to_make, False
def validate(working_dir, model_num=None, validate_name=None):
    """
    Runs validate on the directories up to the most recent model, or up to
    (but not including) the model specified by `model_num`
    """
    if model_num is None:
        model_num, model_name = fsdb.get_latest_model()
    else:
        model_num = int(model_num)
        model_name = fsdb.get_model(model_num)

    # Model N was trained on games up through model N-2, so the validation
    # set should only be for models through N-2 as well, thus the
    # (model_num - 1) term.
    models = [pair for pair in fsdb.get_models() if pair[0] < model_num - 1]

    # Run on the most recent 50 generations,
    # TODO(brianklee): make this hyperparameter dependency explicit/not hardcoded
    holdout_dirs = [os.path.join(fsdb.holdout_dir(), name)
                    for _, name in models[-50:]]

    checkpoint = os.path.join(fsdb.models_dir(), model_name)
    main.validate(working_dir, *holdout_dirs,
                  checkpoint_name=checkpoint,
                  validate_name=validate_name)
def fill_and_wait_models(bufsize=dual_net.EXAMPLES_PER_GENERATION,
                         write_dir=None,
                         threads=8,
                         model_window=100,
                         skip_first_rsync=False):
    """
    Fills a ringbuffer with positions from the most recent games, then
    continually rsync's and updates the buffer until a new model is promoted.
    Once it detects a new model, it then dumps its contents for training to
    immediately begin on the next model.

    Args:
        bufsize: number of positions the example ring buffer holds.
        write_dir: where to flush the finished chunk; defaults to
            fsdb.golden_chunk_dir().
        threads: worker threads used to fill the buffer.
        model_window: how many of the most recent models to pull games from.
        skip_first_rsync: if True, skip the initial (large) rsync and fill
            the buffer from whatever is already local.
    """
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    models = fsdb.get_models()[-model_window:]
    if not skip_first_rsync:
        with timer("Rsync"):
            smart_rsync(models[-1][0] - 6)
    files = tqdm(map(files_for_model, models), total=len(models))
    buf.parallel_fill(list(itertools.chain(*files)), threads=threads)

    print("Filled buffer, watching for new games")
    while fsdb.get_latest_model()[0] == models[-1][0]:
        with timer("Rsync"):
            smart_rsync(models[-1][0] - 2)
        # Only the two most recent models can still be producing games.
        recent = models[-2:]
        # Fix: progress total previously used len(models), overstating the
        # size of this 2-element update pass.
        new_files = tqdm(map(files_for_model, recent), total=len(recent))
        buf.update(list(itertools.chain(*new_files)))
        time.sleep(60)
    latest = fsdb.get_latest_model()

    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(os.path.join(write_dir, str(latest[0] + 1) + '.tfrecord.zz'))
def smart_rsync(from_model_num=0, source_dir=None, dest_dir=LOCAL_DIR):
    """Rsync the selfplay game dirs of every model >= `from_model_num`."""
    source_dir = source_dir or fsdb.selfplay_dir()
    from_model_num = max(from_model_num, 0)  # negative numbers mean "everything"
    for num, name in fsdb.get_models():
        if num < from_model_num:
            continue
        _rsync_dir(os.path.join(source_dir, name),
                   os.path.join(dest_dir, name))
def main():
    """Sync eval games, compute ratings, and print top/recent model ratings."""
    root = os.path.abspath(
        os.path.join("sgf", fsdb.FLAGS.bucket_name, "sgf/eval"))
    sync(root, True)
    models = fsdb.get_models()
    data = wins_subset(fsdb.models_dir())
    print(len(data))
    r = compute_ratings(data)
    # Top 20 ratings, highest first. (Equivalent to ascending-sort,
    # last 20, reversed — just stated directly.)
    for v, k in sorted(((v, k) for k, v in r.items()), reverse=True)[:20]:
        print(models[model_num_for(k)][1], v)
    db = sqlite3.connect("ratings.db")
    try:
        print(db.execute("select count(*) from wins").fetchone()[0], "games")
    finally:
        # Fix: the connection was previously never closed.
        db.close()
    for m in models[-10:]:
        m_id = model_id(m[0])
        print(m[1], r.get(m_id, "model id not found({})".format(m_id)))
def fill_and_wait_time(bufsize=dual_net.EXAMPLES_PER_GENERATION,
                       write_dir=None,
                       threads=32,
                       start_from=None):
    # Time-based variant of fill_and_wait_models: fills the example buffer
    # from hour-bucketed selfplay dirs, then polls for new games until a new
    # model appears AND enough new games have arrived, then flushes a chunk.
    # `start_from` bounds how far back the initial time_rsync reaches
    # (defaults to "now").
    start_from = start_from or dt.datetime.utcnow()
    write_dir = write_dir or fsdb.golden_chunk_dir()
    buf = ExampleBuffer(bufsize)
    # fast_write=True means we're behind and should flush as soon as the
    # buffer is topped up once, without waiting for a new model.
    chunk_to_make, fast_write = _determine_chunk_to_make(write_dir)

    hours = fsdb.get_hour_dirs()
    with timer("Rsync"):
        # Sync from the older of (last hour dir seen, start_from).
        # NOTE(review): hour dirs are parsed with the "%Y-%m-%d-%H/" format —
        # assumes trailing-slash dir names; confirm against fsdb.get_hour_dirs.
        time_rsync(
            min(dt.datetime.strptime(hours[-1], "%Y-%m-%d-%H/"), start_from))
        start_from = dt.datetime.utcnow()

    hours = fsdb.get_hour_dirs()
    # Newest hours first; only keep dirs that exist locally after the rsync.
    files = (tf.gfile.Glob(os.path.join(LOCAL_DIR, d, "*.zz"))
             for d in reversed(hours)
             if tf.gfile.Exists(os.path.join(LOCAL_DIR, d)))
    # Cap how many hour-dirs feed the buffer for this chunk.
    files = itertools.islice(files, get_window_size(chunk_to_make))

    models = fsdb.get_models()
    buf.parallel_fill(
        list(itertools.chain.from_iterable(files)), threads=threads)
    print("Filled buffer, watching for new games")

    # Keep polling while the model is unchanged OR we haven't accumulated
    # MINIMUM_NEW_GAMES fresh games since filling.
    while (fsdb.get_latest_model() == models[-1]
           or buf.total_updates < MINIMUM_NEW_GAMES):
        with timer("Rsync"):
            # Overlap by an hour so we don't miss games written near the
            # previous sync boundary.
            time_rsync(start_from - dt.timedelta(minutes=60))
            start_from = dt.datetime.utcnow()
        hours = sorted(fsdb.get_hour_dirs(LOCAL_DIR))
        # Only the two most recent hour-buckets can have new games.
        new_files = list(
            map(lambda d: tf.gfile.Glob(os.path.join(LOCAL_DIR, d, '*.zz')),
                hours[-2:]))
        buf.update(list(itertools.chain.from_iterable(new_files)))
        if fast_write:
            # We're behind; write immediately instead of waiting.
            break
        time.sleep(30)
        if fsdb.get_latest_model() != models[-1]:
            # New model exists but we're still short on fresh games.
            print("New model! Waiting for games. Got",
                  buf.total_updates, "new games so far")

    latest = fsdb.get_latest_model()
    print("New model!", latest[1], "!=", models[-1][1])
    print(buf)
    buf.flush(chunk_to_make)
def backfill():
    """
    Builds the inference graph once, then runs main.convert over every
    existing model checkpoint (skipping ones already marked '-upgrade').
    """
    models = [m[1] for m in fsdb.get_models()]

    # Deliberately lazy imports: this is a heavyweight one-off maintenance
    # path, so TF/dual_net are only loaded when backfill actually runs.
    import dual_net
    import tensorflow as tf
    from tqdm import tqdm
    features, labels = dual_net.get_inference_input()
    dual_net.model_fn(features, labels, tf.estimator.ModeKeys.PREDICT,
                      dual_net.get_default_hyperparams())

    for model_name in tqdm(models):
        if model_name.endswith('-upgrade'):
            continue
        try:
            load_file = os.path.join(fsdb.models_dir(), model_name)
            # NOTE(review): dest_file is identical to load_file, so convert
            # overwrites in place; given the '-upgrade' skip above, the dest
            # may have been meant to carry a '-upgrade' suffix — confirm.
            dest_file = os.path.join(fsdb.models_dir(), model_name)
            main.convert(load_file, dest_file)
        except Exception as e:
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; report the error and keep going.
            print('failed on', model_name, e)
            continue
def make_chunk_for(output_dir=LOCAL_DIR,
                   local_dir=LOCAL_DIR,
                   game_dir=None,
                   model_num=1,
                   positions=dual_net.EXAMPLES_PER_GENERATION,
                   threads=8,
                   samples_per_game=4):
    """
    Explicitly make a golden chunk for a given model `model_num`
    (not necessarily the most recent one).

    While we haven't yet got enough samples (EXAMPLES_PER_GENERATION),
    add samples from the games of previous models, newest first.
    """
    game_dir = game_dir or fsdb.selfplay_dir()
    ensure_dir_exists(output_dir)
    earlier_models = [pair for pair in fsdb.get_models()
                      if pair[0] < model_num]

    buf = ExampleBuffer(positions)
    files = []
    # Walk the predecessor models newest-first, pulling their games down
    # locally (if needed) until we expect enough samples.
    for _, name in sorted(earlier_models, reverse=True):
        cached_dir = os.path.join(local_dir, name)
        if not tf.gfile.Exists(cached_dir):
            print("Rsyncing", name)
            _rsync_dir(os.path.join(game_dir, name), cached_dir)
        files.extend(tf.gfile.Glob(os.path.join(cached_dir, '*.zz')))
        if len(files) * samples_per_game > positions:
            break

    print("Filling from {} files".format(len(files)))
    buf.parallel_fill(files, threads=threads,
                      samples_per_game=samples_per_game)
    print(buf)
    output = os.path.join(output_dir, str(model_num) + '.tfrecord.zz')
    print("Writing to", output)
    buf.flush(output)