Example #1
    def test_filter(self):
        raw_data = create_random_data(100)
        tfexamples = list(map(preprocessing.make_tf_example, *zip(*raw_data)))

        with tempfile.NamedTemporaryFile() as f:
            preprocessing.write_tf_examples(f.name, tfexamples)
            recovered_data = self.extract_data(f.name, filter_amount=.05)

        self.assertLess(len(recovered_data), 50)
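A note on the assertion: extract_data is called with filter_amount=.05, so only about 5% of the 100 positions should survive, which is why the test allows a generous margin of 50. Below is a minimal sketch of the kind of per-record random filtering this implies; the mechanism is an assumption, not Minigo's actual implementation.

import random

def random_filter(records, filter_amount):
    # Keep each record independently with probability `filter_amount`.
    return [r for r in records if random.random() < filter_amount]

# With filter_amount=.05, on average 5 of 100 records survive, so
# asserting len(recovered_data) < 50 leaves plenty of slack.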
Example #2
def gather(selfplay_dir, training_chunk_dir, params):
    """Gather selfplay data into large training chunk.

  Args:
    selfplay_dir: Where to look for games. Set as 'base_dir/data/selfplay/'.
    training_chunk_dir: where to put collected games. Set as
      'base_dir/data/training_chunks/'.
    params: A MiniGoParams instance of hyperparameters for the model.
  """
    # Gather selfplay data from the most recent params.gather_generation models.
    _ensure_dir_exists(training_chunk_dir)
    sorted_model_dirs = sorted(tf.gfile.ListDirectory(selfplay_dir))
    models = [
        model_dir.strip('/')
        for model_dir in sorted_model_dirs[-params.gather_generation:]
    ]

    with utils.logged_timer('Finding existing tfrecords...'):
        model_gamedata = {
            model: tf.gfile.Glob(
                os.path.join(selfplay_dir, model, '*' + _TF_RECORD_SUFFIX))
            for model in models
        }
    print('Found {} models'.format(len(models)))
    for model_name, record_files in sorted(model_gamedata.items()):
        print('    {}: {} files'.format(model_name, len(record_files)))

    meta_file = os.path.join(training_chunk_dir, 'meta.txt')
    try:
        with tf.gfile.GFile(meta_file, 'r') as f:
            already_processed = set(f.read().split())
    except tf.errors.NotFoundError:
        already_processed = set()

    num_already_processed = len(already_processed)

    for model_name, record_files in sorted(model_gamedata.items()):
        if set(record_files) <= already_processed:
            continue
        print('Gathering files from {}:'.format(model_name))
        tf_examples = preprocessing.shuffle_tf_examples(
            params.shuffle_buffer_size, params.examples_per_chunk,
            record_files)
        # Write each batch of shuffled examples to its own chunk file.
        for i, example_batch in enumerate(tf_examples):
            output_record = os.path.join(training_chunk_dir,
                                         ('{}-{}' + _TF_RECORD_SUFFIX).format(
                                             model_name, str(i)))
            preprocessing.write_tf_examples(output_record,
                                            example_batch,
                                            serialize=False)
        already_processed.update(record_files)

    print('Processed {} new files'.format(
        len(already_processed) - num_already_processed))
    with tf.gfile.GFile(meta_file, 'w') as f:
        f.write('\n'.join(sorted(already_processed)))
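For context, here is a hypothetical invocation of gather(). The attribute names (gather_generation, shuffle_buffer_size, examples_per_chunk) are the MiniGoParams fields the function actually reads; the class and values below are illustrative only.

class _IllustrativeParams(object):
    gather_generation = 50      # consider the 50 most recent models
    shuffle_buffer_size = 2000  # examples held in the shuffle buffer
    examples_per_chunk = 2048   # examples written per output chunk

gather(selfplay_dir='base_dir/data/selfplay/',
       training_chunk_dir='base_dir/data/training_chunks/',
       params=_IllustrativeParams())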
Example #3
    def test_serialize_round_trip(self):
        np.random.seed(1)
        raw_data = create_random_data(10)
        tfexamples = list(map(preprocessing.make_tf_example, *zip(*raw_data)))

        with tempfile.NamedTemporaryFile() as f:
            preprocessing.write_tf_examples(f.name, tfexamples)
            recovered_data = self.extract_data(f.name)

        self.assertEqualData(raw_data, recovered_data)
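assertEqualData is a custom helper on the test class. A plausible sketch of what it checks, assuming each item is a (features, pi, value) tuple as elsewhere in these tests (the real base class may differ):

import numpy as np

def assertEqualData(self, data1, data2):
    # Compare two lists of (features, pi, value) tuples elementwise.
    self.assertEqual(len(data1), len(data2))
    for (x1, pi1, v1), (x2, pi2, v2) in zip(data1, data2):
        np.testing.assert_array_equal(x1, x2)
        np.testing.assert_array_equal(pi1, pi2)
        self.assertEqual(v1, v2)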
Example #4
def selfplay(selfplay_dirs, selfplay_model, params):
    """Perform selfplay with a specific model.

    Args:
      selfplay_dirs: A dict to specify the directories used in selfplay.
        selfplay_dirs = {
            'output_dir': output_dir,
            'holdout_dir': holdout_dir,
            'clean_sgf': clean_sgf,
            'full_sgf': full_sgf
        }
      selfplay_model: The actual Dualnet runner for selfplay.
      params: A MiniGoParams instance of hyperparameters for the model.
    """
    with utils.logged_timer('Playing game'):
        player = selfplay_mcts.play(params.board_size, selfplay_model,
                                    params.selfplay_readouts,
                                    params.selfplay_resign_threshold,
                                    params.simultaneous_leaves,
                                    params.selfplay_verbose)

    output_name = '{}-{}'.format(int(time.time()), socket.gethostname())

    def _write_sgf_data(dir_sgf, use_comments):
        with tf.gfile.GFile(
                os.path.join(dir_sgf, '{}.sgf'.format(output_name)), 'w') as f:
            f.write(player.to_sgf(use_comments=use_comments))

    _write_sgf_data(selfplay_dirs['clean_sgf'], use_comments=False)
    _write_sgf_data(selfplay_dirs['full_sgf'], use_comments=True)

    game_data = player.extract_data()
    tf_examples = preprocessing.make_dataset_from_selfplay(game_data, params)

    # Hold out a fraction (params.holdout_pct) of games for evaluation.
    if random.random() < params.holdout_pct:
        fname = os.path.join(selfplay_dirs['holdout_dir'],
                             output_name + _TF_RECORD_SUFFIX)
    else:
        fname = os.path.join(selfplay_dirs['output_dir'],
                             output_name + _TF_RECORD_SUFFIX)

    preprocessing.write_tf_examples(fname, tf_examples)
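An illustrative call, mirroring the directory layout given in the function's docstring; the base_dir paths are placeholders, and selfplay_model and params are assumed to be defined elsewhere.

selfplay_dirs = {
    'output_dir': 'base_dir/data/selfplay/000001/',
    'holdout_dir': 'base_dir/data/holdout/000001/',
    'clean_sgf': 'base_dir/sgf/000001/clean/',
    'full_sgf': 'base_dir/sgf/000001/full/',
}
selfplay(selfplay_dirs, selfplay_model, params)  # both assumed defined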
Example #5
    def test_serialize_round_trip_no_parse(self):
        np.random.seed(1)
        raw_data = create_random_data(10)
        tfexamples = list(map(preprocessing.make_tf_example, *zip(*raw_data)))

        with tempfile.NamedTemporaryFile() as start_file, \
                tempfile.NamedTemporaryFile() as rewritten_file:
            preprocessing.write_tf_examples(start_file.name, tfexamples)
            # We want to test that the rewritten, shuffled file contains correctly
            # serialized tf.Examples.
            batch_size = 4
            batches = list(
                preprocessing.shuffle_tf_examples(1000, batch_size,
                                                  [start_file.name]))
            # 2 batches of 4, 1 incomplete batch of 2.
            self.assertEqual(len(batches), 3)

            # concatenate list of lists into one list
            all_batches = list(itertools.chain.from_iterable(batches))

            # Write the flattened, already-serialized examples in one pass.
            preprocessing.write_tf_examples(rewritten_file.name,
                                            all_batches,
                                            serialize=False)

            original_data = self.extract_data(start_file.name)
            recovered_data = self.extract_data(rewritten_file.name)

        # The data is shuffled, so sort both sides before checking equality.
        def sort_key(nparray_tuple):
            return nparray_tuple[2]

        original_data = sorted(original_data, key=sort_key)
        recovered_data = sorted(recovered_data, key=sort_key)

        self.assertEqualData(original_data, recovered_data)
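For intuition about the batch counts asserted above, here is a minimal sketch of a bounded-buffer shuffle that yields fixed-size batches. The real shuffle_tf_examples reads serialized records from TFRecord files; that reading is elided here and an in-memory iterable stands in, so the internals are an assumption, not the library's code.

import random

def shuffle_batches(buffer_size, batch_size, records):
    buf = []
    for record in records:
        buf.append(record)
        if len(buf) >= buffer_size:
            random.shuffle(buf)
            while len(buf) >= batch_size:
                yield buf[:batch_size]
                buf = buf[batch_size:]
    # Flush the remainder: with 10 records and batch_size=4 this
    # yields batches of 4, 4, and an incomplete 2, i.e. 3 batches.
    random.shuffle(buf)
    for i in range(0, len(buf), batch_size):
        yield buf[i:i + batch_size]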