def rl_loop():
    """Run the reinforcement learning loop

    This tries to create a realistic way to run the reinforcement learning with
    all default parameters.
    """
    if goparams.DUMMY_MODEL:
        # Monkeypatch the hyperparams so that we get a quickly executing network.
        dual_net.get_default_hyperparams = lambda **kwargs: {
            'k': 8, 'fc_width': 16, 'num_shared_layers': 1,
            'l2_strength': 1e-4, 'momentum': 0.9}

        dual_net.TRAIN_BATCH_SIZE = 16
        dual_net.EXAMPLES_PER_GENERATION = 64

        # Monkeypatch the shuffle buffer size so we don't spin forever
        # shuffling up positions.
        preprocessing.SHUFFLE_BUFFER_SIZE = 1000

    qmeas.stop_time('selfplay_wait')
    print("Gathering game output...")
    gather()

    print("Training on gathered game data...")
    _, model_name = get_latest_model()
    new_model = train()

    if goparams.EVALUATE_PUZZLES:
        qmeas.start_time('puzzle')
        new_model_path = os.path.join(MODELS_DIR, new_model)
        sgf_files = [
            './benchmark_sgf/9x9_pro_YKSH.sgf',
            './benchmark_sgf/9x9_pro_IYMD.sgf',
            './benchmark_sgf/9x9_pro_YSIY.sgf',
            './benchmark_sgf/9x9_pro_IYHN.sgf',
        ]
        result, total_pct = predict_games.report_for_puzzles(
            new_model_path, sgf_files, 2, tries_per_move=1)
        print('accuracy = ', total_pct)
        qmeas.record('puzzle_total', total_pct)
        qmeas.record('puzzle_result', repr(result))
        qmeas.record('puzzle_summary', {'results': repr(result),
                                        'total_pct': total_pct,
                                        'model': new_model})
        qmeas._flush()
        with open(os.path.join(BASE_DIR, new_model + '-puzzles.txt'), 'w') as f:
            f.write(repr(result))
            f.write('\n' + str(total_pct) + '\n')
        qmeas.stop_time('puzzle')
        if total_pct >= goparams.TERMINATION_ACCURACY:
            print('Reached termination accuracy: ', goparams.TERMINATION_ACCURACY)
            with open('TERMINATE_FLAG', 'w') as f:
                f.write(repr(result))
                f.write('\n' + str(total_pct) + '\n')

    if goparams.EVALUATE_MODELS:
        if not evaluate(model_name, new_model):
            bury_latest_model()

def rl_loop_train():
    """Run the gather and train steps of the reinforcement learning loop

    This tries to create a realistic way to run the reinforcement learning with
    all default parameters.
    """
    qmeas.stop_time('selfplay_wait')
    print("Gathering game output...")
    gather()

    print("Training on gathered game data...")
    _, model_name = get_latest_model()
    new_model = train()

def gather(
        input_directory: 'where to look for games'='data/selfplay/',
        output_directory: 'where to put collected games'='data/training_chunks/',
        examples_per_record: 'how many tf.examples to gather in each chunk'=EXAMPLES_PER_RECORD):
    qmeas.start_time('gather')
    _ensure_dir_exists(output_directory)
    models = [model_dir.strip('/')
              for model_dir in sorted(gfile.ListDirectory(input_directory))[-50:]]

    with timer("Finding existing tfrecords..."):
        model_gamedata = {
            model: gfile.Glob(
                os.path.join(input_directory, model, '*.tfrecord.zz'))
            for model in models
        }
    print("Found %d models" % len(models))
    for model_name, record_files in sorted(model_gamedata.items()):
        print(" %s: %s files" % (model_name, len(record_files)))

    meta_file = os.path.join(output_directory, 'meta.txt')
    try:
        with gfile.GFile(meta_file, 'r') as f:
            already_processed = set(f.read().split())
    except tf.errors.NotFoundError:
        already_processed = set()

    num_already_processed = len(already_processed)

    for model_name, record_files in sorted(model_gamedata.items()):
        if set(record_files) <= already_processed:
            continue
        print("Gathering files for %s:" % model_name)
        for i, example_batch in enumerate(
                tqdm(preprocessing.shuffle_tf_examples(
                    examples_per_record, record_files))):
            output_record = os.path.join(
                output_directory,
                '{}-{}.tfrecord.zz'.format(model_name, str(i)))
            preprocessing.write_tf_examples(
                output_record, example_batch, serialize=False)
        already_processed.update(record_files)

    print("Processed %s new files" %
          (len(already_processed) - num_already_processed))
    with gfile.GFile(meta_file, 'w') as f:
        f.write('\n'.join(sorted(already_processed)))
    qmeas.stop_time('gather')

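# A minimal, self-contained sketch (not part of the original module) of the
# meta.txt bookkeeping in gather() above: a model is skipped only when *every*
# one of its record files is already listed in meta.txt, so a model whose
# directory has gained new games since the last gather is processed again.
def _gather_dedup_demo():
    already_processed = {'a.tfrecord.zz', 'b.tfrecord.zz'}
    fully_seen = ['a.tfrecord.zz', 'b.tfrecord.zz']
    gained_new_games = ['a.tfrecord.zz', 'c.tfrecord.zz']
    assert set(fully_seen) <= already_processed             # subset: skipped
    assert not set(gained_new_games) <= already_processed   # regathered
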
def train(
        working_dir: 'tf.estimator working directory.',
        chunk_dir: 'Directory where gathered training chunks are.',
        model_save_path: 'Where to export the completed generation.',
        generation_num: 'Which generation you are training.'=0):
    qmeas.start_time('train')
    tf_records = sorted(gfile.Glob(os.path.join(chunk_dir, '*.tfrecord.zz')))
    tf_records = tf_records[-1 * (WINDOW_SIZE // EXAMPLES_PER_RECORD):]

    print("Training from:", tf_records[0], "to", tf_records[-1])

    with timer("Training"):
        dual_net.train(working_dir, tf_records, generation_num)
        dual_net.export_model(working_dir, model_save_path)
    qmeas.stop_time('train')

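# A worked example (hypothetical numbers; the real run is governed by the
# module-level WINDOW_SIZE and EXAMPLES_PER_RECORD constants) of the
# training-window slice in train() above: only the most recent
# WINDOW_SIZE // EXAMPLES_PER_RECORD chunk files are kept for training.
def _training_window_demo(window_size=500000, examples_per_record=10000):
    chunks = ['chunk-%05d.tfrecord.zz' % i for i in range(100)]
    recent = chunks[-1 * (window_size // examples_per_record):]
    assert len(recent) == 50  # 500000 // 10000 = 50 newest chunks survive
    return recent
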
def rl_loop_eval():
    """Run the puzzle-evaluation step of the reinforcement learning loop

    This tries to create a realistic way to run the reinforcement learning with
    all default parameters.
    """
    (_, new_model) = get_latest_model()

    qmeas.start_time('puzzle')
    new_model_path = os.path.join(MODELS_DIR, new_model)
    sgf_files = [
        './benchmark_sgf/9x9_pro_YKSH.sgf',
        './benchmark_sgf/9x9_pro_IYMD.sgf',
        './benchmark_sgf/9x9_pro_YSIY.sgf',
        './benchmark_sgf/9x9_pro_IYHN.sgf',
    ]
    result, total_pct = predict_games.report_for_puzzles_parallel(
        new_model_path, sgf_files, 2, tries_per_move=1)
    # result, total_pct = predict_games.report_for_puzzles(
    #     new_model_path, sgf_files, 2, tries_per_move=1)
    print('accuracy = ', total_pct)
    print('result = ', result)
    mlperf_log.minigo_print(key=mlperf_log.EVAL_ACCURACY,
                            value={"epoch": iteration, "value": total_pct})
    mlperf_log.minigo_print(key=mlperf_log.EVAL_TARGET,
                            value=goparams.TERMINATION_ACCURACY)
    qmeas.record('puzzle_total', total_pct)
    qmeas.record('puzzle_result', repr(result))
    qmeas.record('puzzle_summary', {'results': repr(result),
                                    'total_pct': total_pct,
                                    'model': new_model})
    qmeas._flush()
    with open(os.path.join(BASE_DIR, new_model + '-puzzles.txt'), 'w') as f:
        f.write(repr(result))
        f.write('\n' + str(total_pct) + '\n')
    qmeas.stop_time('puzzle')
    if total_pct >= goparams.TERMINATION_ACCURACY:
        print('Reached termination accuracy: ', goparams.TERMINATION_ACCURACY)
        mlperf_log.minigo_print(key=mlperf_log.RUN_STOP,
                                value={"success": True})
        with open('TERMINATE_FLAG', 'w') as f:
            f.write(repr(result))
            f.write('\n' + str(total_pct) + '\n')
    qmeas.end()

def bootstrap(
        working_dir: 'tf.estimator working directory. If not set, defaults to a random tmp dir'=None,
        model_save_path: 'Where to export the first bootstrapped generation'=None):
    qmeas.start_time('bootstrap')
    if working_dir is None:
        with tempfile.TemporaryDirectory() as working_dir:
            _ensure_dir_exists(working_dir)
            _ensure_dir_exists(os.path.dirname(model_save_path))
            dual_net.bootstrap(working_dir)
            dual_net.export_model(working_dir, model_save_path)
    else:
        _ensure_dir_exists(working_dir)
        _ensure_dir_exists(os.path.dirname(model_save_path))
        dual_net.bootstrap(working_dir)
        dual_net.export_model(working_dir, model_save_path)
    qmeas.stop_time('bootstrap')

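# Hedged usage sketch for bootstrap(); the paths below are illustrative, not
# values from this module. Note that when working_dir is omitted, the
# temporary estimator directory is deleted as soon as the with-block exits,
# so the exported model at model_save_path is all that survives.
#
#     bootstrap(working_dir='/tmp/minigo/models_in_training',
#               model_save_path='/tmp/minigo/models/000000-bootstrap')
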
def validate(
        working_dir: 'tf.estimator working directory',
        *tf_record_dirs: 'Directories where holdout data are',
        checkpoint_name: 'Which checkpoint to evaluate (None=latest)'=None,
        validate_name: 'Name for validation set (i.e., selfplay or human)'=None):
    qmeas.start_time('validate')
    tf_records = []
    with timer("Building lists of holdout files"):
        for record_dir in tf_record_dirs:
            tf_records.extend(gfile.Glob(os.path.join(record_dir, '*.zz')))

    first_record = os.path.basename(tf_records[0])
    last_record = os.path.basename(tf_records[-1])

    with timer("Validating from {} to {}".format(first_record, last_record)):
        dual_net.validate(
            working_dir, tf_records, checkpoint_name=checkpoint_name,
            name=validate_name)
    qmeas.stop_time('validate')

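# Hedged usage sketch for validate(); paths are illustrative. Because
# tf_record_dirs is a *args parameter, several holdout directories can be
# validated in a single call:
#
#     validate('/tmp/minigo/models_in_training',
#              '/tmp/minigo/data/holdout/000000-bootstrap',
#              '/tmp/minigo/data/holdout/000001-nextmodel',
#              validate_name='selfplay')
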
def selfplay(
        load_file: "The path to the network model files",
        output_dir: "Where to write the games"="data/selfplay",
        holdout_dir: "Where to write the holdout games"="data/holdout",
        output_sgf: "Where to write the sgfs"="sgf/",
        readouts: 'How many simulations to run per move'=100,
        verbose: '>=2 will print debug info, >=3 will print boards'=1,
        resign_threshold: 'absolute value of threshold to resign at'=0.95,
        holdout_pct: 'fraction of games to hold out for validation'=0.05):
    qmeas.start_time('selfplay')
    clean_sgf = os.path.join(output_sgf, 'clean')
    full_sgf = os.path.join(output_sgf, 'full')
    _ensure_dir_exists(clean_sgf)
    _ensure_dir_exists(full_sgf)
    _ensure_dir_exists(output_dir)
    _ensure_dir_exists(holdout_dir)

    with timer("Loading weights from %s ... " % load_file):
        network = dual_net.DualNetwork(load_file)

    with timer("Playing game"):
        player = selfplay_mcts.play(
            network, readouts, resign_threshold, verbose)

    output_name = '{}-{}'.format(int(time.time() * 1000 * 1000),
                                 socket.gethostname())
    game_data = player.extract_data()
    with gfile.GFile(os.path.join(clean_sgf, '{}.sgf'.format(output_name)), 'w') as f:
        f.write(player.to_sgf(use_comments=False))
    with gfile.GFile(os.path.join(full_sgf, '{}.sgf'.format(output_name)), 'w') as f:
        f.write(player.to_sgf())

    tf_examples = preprocessing.make_dataset_from_selfplay(game_data)

    # Hold out holdout_pct of games (5% by default) for validation.
    if random.random() < holdout_pct:
        fname = os.path.join(holdout_dir, "{}.tfrecord.zz".format(output_name))
    else:
        fname = os.path.join(output_dir, "{}.tfrecord.zz".format(output_name))

    preprocessing.write_tf_examples(fname, tf_examples)
    qmeas.stop_time('selfplay')

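# A small sketch (not in the original file) of the per-game holdout split in
# selfplay() above: each finished game independently lands in holdout_dir
# with probability holdout_pct, so over N games roughly N * holdout_pct are
# held out, with binomial spread around that mean.
def _holdout_split_demo(n_games=1000, holdout_pct=0.05, seed=0):
    rng = random.Random(seed)
    held_out = sum(1 for _ in range(n_games) if rng.random() < holdout_pct)
    return held_out  # expect about 50 of 1000 games at the 5% default
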
def evaluate_both(
        prev_model: 'The path to previous model',
        cur_model: 'The path to current model',
        output_dir: 'Where to write the evaluation results'='sgf/evaluate',
        readouts: 'How many readouts to make per move.'=200,
        games: 'the number of games to play'=20,
        verbose: 'How verbose the players should be (see selfplay)'=1):
    qmeas.start_time('evaluate')
    _ensure_dir_exists(output_dir)

    winners = []
    with timer("%d games" % games):
        winners = evaluation.play_match_many_instance_both(
            prev_model, cur_model, games, readouts, output_dir, verbose)
    qmeas.stop_time('evaluate')

    white_count = 0
    for win in winners:
        if 'W' in win or 'w' in win:
            white_count += 1

    # Each pairing is played with both color assignments, so `games` pairings
    # yield 2 * games results; normalize the white win count accordingly.
    return white_count * 1.0 / (games * 2)

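# Worked example for the normalization above, under the assumption (implied
# by the games * 2 denominator) that play_match_many_instance_both plays each
# of the `games` pairings twice, once per color assignment: with games=20,
# 40 results come back, and 25 white wins would yield 25 / 40 = 0.625.
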
def evaluate(
        black_model: 'The path to the model to play black',
        white_model: 'The path to the model to play white',
        output_dir: 'Where to write the evaluation results'='sgf/evaluate',
        readouts: 'How many readouts to make per move.'=200,
        games: 'the number of games to play'=20,
        verbose: 'How verbose the players should be (see selfplay)'=1):
    qmeas.start_time('evaluate')
    _ensure_dir_exists(output_dir)

    with timer("Loading weights"):
        black_net = dual_net.DualNetwork(black_model)
        white_net = dual_net.DualNetwork(white_model)

    winners = []
    with timer("%d games" % games):
        winners = evaluation.play_match(
            black_net, white_net, games, readouts, output_dir, verbose)
    qmeas.stop_time('evaluate')

    white_count = 0
    for win in winners:
        if 'W' in win or 'w' in win:
            white_count += 1

    # Returns the fraction of games won by the white model.
    return white_count * 1.0 / games

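# A hedged sketch (helper name and threshold are hypothetical, not part of
# this module) of how a caller might gate model promotion on evaluate():
# since the return value is white's win fraction, playing the candidate as
# white lets the caller demand a minimum win rate before promoting it.
def _should_promote(prev_model_path, new_model_path, min_win_rate=0.55):
    # min_win_rate is an illustrative threshold, not a value from this module.
    win_rate = evaluate(prev_model_path, new_model_path)  # candidate as white
    return win_rate >= min_win_rate
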
def main_():
    """Run the selfplay portion of the reinforcement learning loop

    This tries to create a realistic way to run the reinforcement learning with
    all default parameters.
    """
    print('Starting self play loop.')

    qmeas.start_time('selfplay_wait')
    start_t = time.time()

    _, model_name = get_latest_model()

    num_workers = 0
    procs = []

    if sys.argv[3] == 'worker' or sys.argv[3] == 'driver':
        selfplay_dir = os.path.join(SELFPLAY_DIR, model_name)
    else:
        selfplay_dir = SELFPLAY_BACKUP_DIR

    def count_live_procs():
        return len(list(filter(lambda proc: proc.poll() is None, procs)))

    def start_worker(num_workers):
        worker_seed = hash(hash(SEED) + ITERATION) + num_workers
        cmd = ('GOPARAMS={} OMP_NUM_THREADS=1 KMP_HW_SUBSET={} '
               'KMP_AFFINITY=granularity=fine,proclist=[{}],explicit '
               'python3 selfplay_worker.py {} {} {}').format(
                   os.environ['GOPARAMS'], os.environ['KMP_HW_SUBSET'],
                   num_workers % multiprocessing.cpu_count(),
                   BASE_DIR, worker_seed, sys.argv[3])
        procs.append(subprocess.Popen(cmd, shell=True))

    def count_games():
        # returns number of games in the selfplay directory
        if not os.path.exists(selfplay_dir):
            # directory not existing implies no games have been played yet
            return 0
        return len(gfile.Glob(os.path.join(selfplay_dir, '*.zz')))

    # Generate selfplay games until the needed number of games is reached.
    if sys.argv[3] == 'worker':
        for i in range(goparams.NUM_PARALLEL_SELFPLAY):
            print('Starting Worker...')
            start_worker(num_workers)
            time.sleep(0.1)
            num_workers += 1
        sys.stdout.flush()

        while count_games() < MAX_GAMES_PER_GENERATION and not os.path.isfile("PK_FLAG"):
            time.sleep(1)
            games = count_games()
            sys.stdout.flush()

        print('Done with selfplay loop.')

        for proc in procs:
            proc.kill()

        # Sometimes the workers need extra help...
        os.system('pkill -f selfplay_worker.py')
        sys.stdout.flush()

    # Check the generated games; remove excess games.
    if sys.argv[3] == 'driver':
        # Because we use process-level parallelism for selfplaying and we don't
        # sync or communicate between processes, there could be too many games
        # played (up to 1 extra game per worker process).
        # This is a rather brutish way to ensure we train on the correct number
        # of games...
        print('There are {} games in the selfplay directory at {}'.format(
            count_games(), selfplay_dir))
        sys.stdout.flush()
        while count_games() > MAX_GAMES_PER_GENERATION:
            games = count_games()
            print('Too many selfplay games ({}/{}) ... deleting extra'.format(
                games, MAX_GAMES_PER_GENERATION))
            # This will remove the extra game files from the selfplay
            # directory... or so we hope :)
            sys.stdout.flush()
            os.system('ls {}/* -d | tail -n {} | xargs rm '.format(
                selfplay_dir, games - MAX_GAMES_PER_GENERATION))
        print('After cleanup, there are {} games in the selfplay directory at {}'.format(
            count_games(), selfplay_dir))
        sys.stdout.flush()

    # Generate backup games, in case the new model is buried and we need more
    # old games for training.
    if sys.argv[3] == 'backup':
        for i in range(goparams.NUM_PARALLEL_SELFPLAY):
            print('Starting Worker...')
            start_worker(num_workers)
            num_workers += 1
        sys.stdout.flush()

        while count_games() < MAX_GAMES_PER_GENERATION:
            time.sleep(1)
            games = count_games()
            sys.stdout.flush()

        print('Done with selfplay loop.')

        for proc in procs:
            proc.kill()

        # Sometimes the workers need extra help...
        os.system('pkill -f selfplay_worker.py')
        sys.stdout.flush()

    if sys.argv[3] == 'clean_backup':
        print('cleaning up {}'.format(SELFPLAY_BACKUP_DIR))
        os.system('rm {}/*'.format(SELFPLAY_BACKUP_DIR))

    qmeas.stop_time('selfplay_wait')

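# The driver branch above trims surplus games with a shell pipeline
# (ls | tail | xargs rm), which is brittle around unusual filenames. A
# pure-Python sketch of the same trim (assuming, as the pipeline does, that
# lexical filename order is the order that matters):
def _trim_selfplay_games(selfplay_dir, max_games):
    import glob
    games = sorted(glob.glob(os.path.join(selfplay_dir, '*')))
    for path in games[max_games:]:
        os.remove(path)
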
def rl_loop():
    """Run the reinforcement learning loop

    This is meant to be more of an integration test than a realistic way to run
    the reinforcement learning.
    """
    # Monkeypatch the hyperparams so that we get a quickly executing network.
    dual_net.get_default_hyperparams = lambda **kwargs: {
        'k': 8, 'fc_width': 16, 'num_shared_layers': 1,
        'l2_strength': 1e-4, 'momentum': 0.9}

    dual_net.TRAIN_BATCH_SIZE = 16
    dual_net.EXAMPLES_PER_GENERATION = 64

    # Monkeypatch the shuffle buffer size so we don't spin forever shuffling
    # up positions.
    preprocessing.SHUFFLE_BUFFER_SIZE = 1000

    # with tempfile.TemporaryDirectory() as base_dir:
    base_dir = "/tmp/minigo"
    # Dummy file handle standing in for the commented-out TemporaryDirectory
    # context above, preserving the original indentation.
    with open('/tmp/foo', 'w') as fff:
        working_dir = os.path.join(base_dir, 'models_in_training')
        model_save_path = os.path.join(base_dir, 'models', '000000-bootstrap')
        next_model_save_file = os.path.join(base_dir, 'models', '000001-nextmodel')
        selfplay_dir = os.path.join(base_dir, 'data', 'selfplay')
        model_selfplay_dir = os.path.join(selfplay_dir, '000000-bootstrap')
        gather_dir = os.path.join(base_dir, 'data', 'training_chunks')
        holdout_dir = os.path.join(base_dir, 'data', 'holdout', '000000-bootstrap')
        sgf_dir = os.path.join(base_dir, 'sgf', '000000-bootstrap')
        os.makedirs(os.path.join(base_dir, 'data'), exist_ok=True)

        print("Creating random initial weights...")
        main.bootstrap(working_dir, model_save_path)

        for i in range(100):
            qmeas.start_time('main-loop')
            print("Playing some games...")
            # Do two selfplay runs to test gather functionality
            qmeas.start_time('main-loop-self-play')
            for j in range(2):
                main.selfplay(
                    load_file=model_save_path,
                    output_dir=model_selfplay_dir,
                    output_sgf=sgf_dir,
                    holdout_pct=0,
                    readouts=10)
            qmeas.stop_time('main-loop-self-play')

            # Do one holdout run to test validation
            qmeas.start_time('main-loop-self-play-holdout')
            main.selfplay(
                load_file=model_save_path,
                holdout_dir=holdout_dir,
                output_dir=model_selfplay_dir,
                output_sgf=sgf_dir,
                holdout_pct=100,
                readouts=10)
            qmeas.stop_time('main-loop-self-play-holdout')

            print("See sgf files here?")
            sgf_listing = subprocess.check_output(["ls", "-l", sgf_dir + "/full"])
            print(sgf_listing.decode("utf-8"))

            print("Gathering game output...")
            qmeas.start_time('main-loop-gather')
            main.gather(input_directory=selfplay_dir, output_directory=gather_dir)
            qmeas.stop_time('main-loop-gather')

            print("Training on gathered game data...")
            qmeas.start_time('main-loop-train')
            main.train(working_dir, gather_dir, next_model_save_file,
                       generation_num=1)
            qmeas.stop_time('main-loop-train')

            print("Trying validate on 'holdout' game...")
            qmeas.start_time('main-loop-validate')
            main.validate(working_dir, holdout_dir)
            qmeas.stop_time('main-loop-validate')

            print("Verifying that new checkpoint is playable...")
            main.selfplay(
                load_file=next_model_save_file,
                holdout_dir=holdout_dir,
                output_dir=model_selfplay_dir,
                output_sgf=sgf_dir,
                readouts=10)
            qmeas.stop_time('main-loop')
    qmeas._flush()

def main_():
    """Run the selfplay portion of the reinforcement learning loop

    This tries to create a realistic way to run the reinforcement learning with
    all default parameters.
    """
    print('Starting self play loop.')

    qmeas.start_time('selfplay_wait')
    start_t = time.time()

    _, model_name = get_latest_model()

    num_workers = 0
    procs = []

    def count_live_procs():
        return len(list(filter(lambda proc: proc.poll() is None, procs)))

    def start_worker(num_workers):
        # procs.append(subprocess.Popen(cmd, shell=True,
        #                               stderr=subprocess.PIPE,
        #                               stdout=subprocess.PIPE))
        worker_seed = hash(hash(SEED) + ITERATION) + num_workers
        cmd = 'GOPARAMS={} python3 selfplay_worker.py {} {}'.format(
            os.environ['GOPARAMS'], BASE_DIR, worker_seed)
        procs.append(subprocess.Popen(cmd, shell=True))

    selfplay_dir = os.path.join(SELFPLAY_DIR, model_name)

    def count_games():
        # returns number of games in the selfplay directory
        if not os.path.exists(os.path.join(SELFPLAY_DIR, model_name)):
            # directory not existing implies no games have been played yet
            return 0
        return len(gfile.Glob(os.path.join(SELFPLAY_DIR, model_name, '*.zz')))

    print('NUM_PARALLEL_SELFPLAY = {n}'.format(n=goparams.NUM_PARALLEL_SELFPLAY))
    for i in range(goparams.NUM_PARALLEL_SELFPLAY):
        print('Starting Worker...')
        num_workers += 1
        start_worker(num_workers)
        time.sleep(1)
    sys.stdout.flush()

    while count_games() < MAX_GAMES_PER_GENERATION:
        time.sleep(10)
        games = count_games()
        print('Found Games: {}'.format(games))
        print('selfplaying: {:.2f} games/hour'.format(
            games / ((time.time() - start_t) / 60 / 60)))
        print('Worker Processes: {}'.format(count_live_procs()))
        sys.stdout.flush()

    print('Done with selfplay loop.')
    time.sleep(10)

    for proc in procs:
        proc.kill()

    # Sometimes the workers need extra help...
    time.sleep(5)
    os.system('pkill -f selfplay_worker.py')

    # Let things settle after we kill processes.
    time.sleep(10)

    # Because we use process-level parallelism for selfplaying and we don't
    # sync or communicate between processes, there could be too many games
    # played (up to 1 extra game per worker process).
    # This is a rather brutish way to ensure we train on the correct number
    # of games...
    print('There are {} games in the selfplay directory at {}'.format(
        count_games(), selfplay_dir))
    sys.stdout.flush()
    while count_games() > MAX_GAMES_PER_GENERATION:
        print('Too many selfplay games ({}/{}) ... deleting one'.format(
            count_games(), MAX_GAMES_PER_GENERATION))
        # This will remove exactly one game file from the selfplay
        # directory... or so we hope :)
        sys.stdout.flush()
        os.system('ls {}/* -d | tail -n 1 | xargs rm'.format(selfplay_dir))
        # unclear if this sleep is necessary...
        time.sleep(1)
    print('After cleanup, there are {} games in the selfplay directory at {}'.format(
        count_games(), selfplay_dir))
    sys.stdout.flush()

    qmeas.stop_time('selfplay_wait')
