def _input_fn(params):
    games = bigtable_input.GameQueue(
        FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table)
    games_nr = bigtable_input.GameQueue(
        FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table + '-nr')
    return preprocessing.get_tpu_bt_input_tensors(
        games,
        games_nr,
        params['batch_size'],
        number_of_games=FLAGS.window_size,
        random_rotation=True)
def train(*tf_records: "Records to train on"): """Train on examples.""" tf.logging.set_verbosity(tf.logging.INFO) estimator = dual_net.get_estimator() effective_batch_size = FLAGS.train_batch_size if FLAGS.use_tpu: effective_batch_size *= FLAGS.num_tpu_cores if FLAGS.use_bt: games = bigtable_input.GameQueue( FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) games_nr = bigtable_input.GameQueue( FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table + '-nr') if FLAGS.use_tpu: if FLAGS.use_bt: def _input_fn(params): return preprocessing.get_tpu_bt_input_tensors( games, games_nr, params['batch_size'], number_of_games=FLAGS.window_size, random_rotation=True) else: def _input_fn(params): return preprocessing.get_tpu_input_tensors( params['batch_size'], tf_records, random_rotation=True) # Hooks are broken with TPUestimator at the moment. hooks = [] else: def _input_fn(): return preprocessing.get_input_tensors( FLAGS.train_batch_size, tf_records, filter_amount=1.0, shuffle_buffer_size=FLAGS.shuffle_buffer_size, random_rotation=True) hooks = [UpdateRatioSessionHook(FLAGS.work_dir), EchoStepCounterHook(output_dir=FLAGS.work_dir)] steps = FLAGS.steps_to_train logging.info("Training, steps = %s, batch = %s -> %s examples", steps or '?', effective_batch_size, (steps * effective_batch_size) if steps else '?') estimator.train(_input_fn, steps=steps, hooks=hooks) if FLAGS.use_bt: bigtable_input.set_fresh_watermark(games, FLAGS.window_size)
def main(argv): """Main program. """ del argv # Unused total_games = FLAGS.training_games total_moves = FLAGS.training_moves fresh = FLAGS.training_fresh batch_size = FLAGS.batch_size output_prefix = FLAGS.output_prefix spec = bigtable_input.BigtableSpec(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) gq_r = bigtable_input.GameQueue(spec.project, spec.instance, spec.table) gq_c = bigtable_input.GameQueue(spec.project, spec.instance, spec.table + '-nr') mix = bigtable_input.mix_by_decile(total_games, total_moves, 9) trainings = [ (spec, start_r, start_c, mix, batch_size, '{}{:0>10}_{:0>10}.tfrecord.zz'.format(output_prefix, start_r, start_c)) for start_r, finish_r, start_c, finish_c in reversed( list( training_series(gq_r.latest_game_number, gq_c.latest_game_number, mix, fresh))) ] if FLAGS.starting_game: game = FLAGS.starting_game starts = [t[1] for t in trainings] where = bisect.bisect_left(starts, game) trainings = trainings[where:] if FLAGS.max_trainings: trainings = trainings[:FLAGS.max_trainings] if FLAGS.dry_run: for t in trainings: print(t) raise SystemExit concurrency = min(FLAGS.concurrency, multiprocessing.cpu_count() * 2) with tqdm(desc='Training Sets', unit_scale=2, total=len(trainings)) as pbar: for b in utils.iter_chunks(concurrency, trainings): with multiprocessing.Pool(processes=concurrency) as pool: pool.map(_export_training_set, b) pbar.update(len(b))
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    games_nr = bigtable_input.GameQueue(
        FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table + '-nr')

    while True:
        new_pct = get_95_percentile_bleak(games_nr)
        update_flagfile(fsdb.flags_path(), new_pct)
        time.sleep(60 * 3)
def _export_training_set(args):
    spec, start_r, start_c, mix, batch_size, output_url = args
    gq_r = bigtable_input.GameQueue(spec.project, spec.instance, spec.table)
    gq_c = bigtable_input.GameQueue(spec.project, spec.instance, spec.table + '-nr')
    total_moves = mix.moves_r + mix.moves_c

    with tf.Session() as sess:
        ds = bigtable_input.get_unparsed_moves_from_games(
            gq_r, gq_c, start_r, start_c, mix)
        ds = ds.batch(batch_size)
        iterator = ds.make_initializable_iterator()
        sess.run(iterator.initializer)
        get_next = iterator.get_next()
        writes = 0
        print('Writing to', output_url)
        with tf.io.TFRecordWriter(
                output_url,
                options=tf.io.TFRecordCompressionType.ZLIB) as wr:
            log_filename = '/tmp/{}_{}.log'.format(start_r, start_c)
            with open(log_filename, 'w') as progress_file:
                with tqdm(desc='Records', unit_scale=2, total=total_moves,
                          file=progress_file) as pbar:
                    while True:
                        try:
                            batch = sess.run(get_next)
                            pbar.update(len(batch))
                            for b in batch:
                                wr.write(b)
                                writes += 1
                                if (writes % 10000) == 0:
                                    wr.flush()
                        except tf.errors.OutOfRangeError:
                            break
            os.unlink(log_filename)
def train(*tf_records: "Records to train on"): """Train on examples.""" tf.logging.set_verbosity(tf.logging.INFO) estimator = dual_net.get_estimator() effective_batch_size = FLAGS.train_batch_size if FLAGS.use_tpu: effective_batch_size *= FLAGS.num_tpu_cores if FLAGS.use_tpu: if FLAGS.use_bt: def _input_fn(params): games = bigtable_input.GameQueue( FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) games_nr = bigtable_input.GameQueue( FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table + '-nr') return preprocessing.get_tpu_bt_input_tensors( games, games_nr, params['batch_size'], params['input_layout'], number_of_games=FLAGS.window_size, random_rotation=True) else: def _input_fn(params): return preprocessing.get_tpu_input_tensors( params['batch_size'], params['input_layout'], tf_records, filter_amount=FLAGS.filter_amount, shuffle_examples=FLAGS.shuffle_examples, shuffle_buffer_size=FLAGS.shuffle_buffer_size, random_rotation=True) # Hooks are broken with TPUestimator at the moment. hooks = [] else: def _input_fn(): return preprocessing.get_input_tensors( FLAGS.train_batch_size, FLAGS.input_layout, tf_records, filter_amount=FLAGS.filter_amount, shuffle_examples=FLAGS.shuffle_examples, shuffle_buffer_size=FLAGS.shuffle_buffer_size, random_rotation=True) hooks = [UpdateRatioSessionHook(FLAGS.work_dir), EchoStepCounterHook(output_dir=FLAGS.work_dir)] steps = FLAGS.steps_to_train if not steps and FLAGS.num_examples: batch_size = FLAGS.train_batch_size if FLAGS.use_tpu: batch_size *= FLAGS.num_tpu_cores steps = math.floor(FLAGS.num_examples / batch_size) logging.info("Training, steps = %s, batch = %s -> %s examples", steps or '?', effective_batch_size, (steps * effective_batch_size) if steps else '?') if FLAGS.use_bt: games = bigtable_input.GameQueue( FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) if not games.read_wait_cell(): games.require_fresh_games(20000) latest_game = games.latest_game_number index_from = max(latest_game, games.read_wait_cell()) print("== Last game before training:", latest_game, flush=True) print("== Wait cell:", games.read_wait_cell(), flush=True) try: estimator.train(_input_fn, steps=steps, hooks=hooks) if FLAGS.use_bt: bigtable_input.set_fresh_watermark(games, index_from, FLAGS.window_size) except: if FLAGS.use_bt: games.require_fresh_games(0) raise
def train(*tf_records: "Records to train on"): """Train on examples.""" tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) estimator = dual_net.get_estimator(FLAGS.num_intra_threads, FLAGS.num_inter_threads) if FLAGS.dist_train: effective_batch_size = int(FLAGS.train_batch_size / hvd.size()) global_batch_size = effective_batch_size * hvd.size() mllogger = mllog.get_mllogger() mllogger.event(key=mllog.constants.GLOBAL_BATCH_SIZE, value=global_batch_size) else: effective_batch_size = FLAGS.train_batch_size global_batch_size = FLAGS.train_batch_size logging.info("Real global batch size = {}, local batch size = {}.".format( global_batch_size, effective_batch_size)) if FLAGS.use_tpu: effective_batch_size *= FLAGS.num_tpu_cores if FLAGS.use_tpu: if FLAGS.use_bt: def _input_fn(params): games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) games_nr = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table + '-nr') return preprocessing.get_tpu_bt_input_tensors( games, games_nr, params['batch_size'], params['input_layout'], number_of_games=FLAGS.window_size, random_rotation=True) else: def _input_fn(params): return preprocessing.get_tpu_input_tensors( params['batch_size'], params['input_layout'], tf_records, filter_amount=FLAGS.filter_amount, shuffle_examples=FLAGS.shuffle_examples, shuffle_buffer_size=FLAGS.shuffle_buffer_size, random_rotation=True) # Hooks are broken with TPUestimator at the moment. hooks = [] else: def _input_fn(): return preprocessing.get_input_tensors( effective_batch_size, FLAGS.input_layout, tf_records, filter_amount=FLAGS.filter_amount, shuffle_examples=FLAGS.shuffle_examples, shuffle_buffer_size=FLAGS.shuffle_buffer_size, random_rotation=True, seed=FLAGS.training_seed, dist_train=FLAGS.dist_train, use_bf16=FLAGS.use_bfloat16) hooks = [ UpdateRatioSessionHook(FLAGS.work_dir), EchoStepCounterHook(output_dir=FLAGS.work_dir) ] if FLAGS.dist_train: hooks.append(hvd.BroadcastGlobalVariablesHook(0)) steps = FLAGS.steps_to_train if not steps and FLAGS.num_examples: batch_size = effective_batch_size if FLAGS.use_tpu: batch_size *= FLAGS.num_tpu_cores steps = math.floor(FLAGS.num_examples / batch_size) logging.info("Training, steps = %s, batch = %s -> %s examples", steps or '?', effective_batch_size, (steps * effective_batch_size) if steps else '?') if FLAGS.use_bt: games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) if not games.read_wait_cell(): games.require_fresh_games(20000) latest_game = games.latest_game_number index_from = max(latest_game, games.read_wait_cell()) print("== Last game before training:", latest_game, flush=True) print("== Wait cell:", games.read_wait_cell(), flush=True) try: estimator.train(_input_fn, steps=steps, hooks=hooks) if FLAGS.use_bt: bigtable_input.set_fresh_watermark(games, index_from, FLAGS.window_size) except: if FLAGS.use_bt: games.require_fresh_games(0) raise
def train(*tf_records: "Records to train on"): """Train on examples.""" tf.logging.set_verbosity(tf.logging.INFO) estimator = dual_net.get_estimator() effective_batch_size = FLAGS.train_batch_size if FLAGS.use_tpu: effective_batch_size *= FLAGS.num_tpu_cores elif FLAGS.use_ipu: effective_batch_size *= FLAGS.num_ipu_cores if FLAGS.use_tpu: if FLAGS.use_bt: def _input_fn(params): games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) games_nr = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table + '-nr') return preprocessing.get_tpu_bt_input_tensors( games, games_nr, params['batch_size'], number_of_games=FLAGS.window_size, random_rotation=True) else: def _input_fn(params): return preprocessing.get_tpu_input_tensors( params['batch_size'], tf_records, random_rotation=True) # Hooks are broken with TPUestimator at the moment. hooks = [] elif FLAGS.use_ipu: def _input_fn(): return preprocessing.get_ipu_input_tensors( FLAGS.train_batch_size, tf_records, filter_amount=FLAGS.filter_amount, shuffle_buffer_size=FLAGS.shuffle_buffer_size, shuffle_examples=False, random_rotation=False) hooks = [] else: def _input_fn(): return preprocessing.get_input_tensors( FLAGS.train_batch_size, tf_records, filter_amount=FLAGS.filter_amount, shuffle_buffer_size=FLAGS.shuffle_buffer_size, random_rotation=True) hooks = [ UpdateRatioSessionHook(FLAGS.work_dir), EchoStepCounterHook(output_dir=FLAGS.work_dir) ] try: if FLAGS.PROFILING: ph = ProfilerHook() hooks = [ph] except: pass steps = FLAGS.steps_to_train # step correction due to smaller batch size if FLAGS.use_ipu: steps = steps * 4096 // effective_batch_size logging.info("Training, steps = %s, batch = %s -> %s examples", steps or '?', effective_batch_size, (steps * effective_batch_size) if steps else '?') if FLAGS.use_bt: games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) if not games.read_wait_cell(): games.require_fresh_games(20000) latest_game = games.latest_game_number index_from = max(latest_game, games.read_wait_cell()) print("== Last game before training:", latest_game, flush=True) print("== Wait cell:", games.read_wait_cell(), flush=True) if DATA_BENCHMARK: benchmark_op = dataset_benchmark( dataset=_input_fn(), number_of_epochs=80, elements_per_epochs=10000, print_stats=True, # apply_options=False ) import json print("Benchmarking data pipeline:") with tf.Session() as sess: json_string = sess.run(benchmark_op) json_object = json.loads(json_string[0]) print(json_object) if not INFEED_BENCHMARK: raise NotImplementedError("Data benchmark ended.") else: print("Data benchmark ended.") if INFEED_BENCHMARK: benchmark_op = infeed_benchmark( infeed_queue=ipu_infeed_queue.IPUInfeedQueue(_input_fn(), feed_name="infeed"), number_of_epochs=80, elements_per_epochs=10000, print_stats=True, # apply_options=False ) import json print("Benchmarking data pipeline:") with tf.Session() as sess: json_string = sess.run(benchmark_op) json_object = json.loads(json_string[0]) print(json_object) raise NotImplementedError("Infeed benchmark ended.") try: estimator.train(_input_fn, steps=steps, hooks=hooks) if FLAGS.use_bt: bigtable_input.set_fresh_watermark(games, index_from, FLAGS.window_size) except: if FLAGS.use_bt: games.require_fresh_games(0) raise return estimator