Esempio n. 1
0
class GameConverter:
    """Convert SGF game records into HDF5 datasets of (state, action) training pairs."""

    def __init__(self, features: list) -> None:
        """Create the feature preprocessor used to encode board states.

        Parameters
        ----------
        - features: feature specification, handed unchanged to Preprocess
        """
        self.feature_processor = Preprocess(features)
        # Used as the feature dimension of the 'states' dataset in sgfs_to_hdf5
        self.n_features = self.feature_processor.output_dim

    def convert_game(self, file_name, bd_size: int):
        """Reads the given SGF file into an iterable of (input, output) pairs
        for neural network training

        This is a generator: nothing is parsed or validated until it is iterated.

        Each input is a GameState converted into one-hot neural net features
        Each output is an action as an (x,y) pair (passes are skipped)
        If this game's size does not match bd_size, a SizeMismatchError is raised

        Parameters
        ----------
        - file_name: path of the SGF file to read
        - bd_size: expected side length of the board

        Yields
        ------
        (nn_input, move) tuples, one per non-pass move
        """
        # The whole file is read while the handle is open, so iterating the
        # states after the 'with' block has closed the file is safe.
        with open(file_name, 'r') as file_object:
            state_action_iterator = sgf_iter_states(file_object.read(), include_end=False)

        for (state, move, _player) in state_action_iterator:
            if state.size != bd_size:
                raise SizeMismatchError()
            if move != go.PASS_MOVE:
                nn_input = self.feature_processor.state_to_tensor(state)
                yield (nn_input, move)

    def sgfs_to_hdf5(self, sgf_files, hdf5_file, bd_size: int=19, ignore_errors: bool=True, verbose: bool=False) -> None:
        """Convert all files in the iterable sgf_files into an hdf5 group to be stored in hdf5_file

        Data is written to a hidden temporary file next to hdf5_file. On success
        the temporary file replaces hdf5_file; on failure it is closed and deleted
        and the exception is re-raised.

        Parameters
        ----------
        - sgf_files: an iterable of relative or absolute paths to SGF files
        - hdf5_file: the name of the HDF5 where features will be saved
        - bd_size: side length of board of games that are loaded
        - ignore_errors: if True, issues a Warning when there is an unknown
            exception rather than halting. Note that sgf.ParseException,
            go.IllegalMove and SizeMismatchError exceptions are always skipped
        - verbose: if True, print progress and a final summary

        Results
        -------
        states : dataset with shape (n_data, board width, board height, n_features)
        actions: dataset with shape (n_data, 2) (actions are stored as x,y tuples of
                    where the move was played)
        file_offsets : group mapping from filenames to tuples of (index, length)
        features : scalar holding the comma-separated list of feature planes
        features_nb : scalar holding the number of features
        For example, to find what positions in the dataset come from 'test.sgf':
            index, length = file_offsets['test.sgf']
            test_states = states[index:index+length]
            test_actions = actions[index:index+length]
        """
        removed_games = 0
        nb_action = 0
        # make a hidden temporary file in case of a crash.
        # on success, this is renamed to hdf5_file
        tmp_file = os.path.join(os.path.dirname(hdf5_file), ".tmp." + os.path.basename(hdf5_file))
        h5f = h5.File(tmp_file, 'w')

        try:
            state_shape = (bd_size, bd_size, self.n_features)
            states = h5f.require_dataset(
                'states',
                dtype=np.uint8,
                shape=(1,) + state_shape,
                maxshape=(None,) + state_shape,  # 'None' == arbitrary size
                exact=False,  # allow non-uint8 datasets to be loaded, coerced to uint8
                chunks=(64,) + state_shape,  # approximately 1MB chunks
                compression="lzf")
            actions = h5f.require_dataset(
                'actions',
                dtype=np.uint8,
                shape=(1, 2),
                maxshape=(None, 2),
                exact=False,
                chunks=(1024, 2),
                compression="lzf")

            # 'file_offsets' is an HDF5 group so that 'file_name in file_offsets' is fast
            file_offsets = h5f.require_group('file_offsets')

            # Store comma-separated list of feature planes in the scalar field 'features'. The
            # string can be retrieved using h5py's scalar indexing: h5f['features'][()]
            # (np.bytes_ is the spelling that exists on both NumPy 1.x and 2.x;
            # the np.string_ alias was removed in NumPy 2.0.)
            h5f['features'] = np.bytes_(','.join(self.feature_processor.feature_list))
            h5f['features_nb'] = self.n_features

            if verbose:
                print("created HDF5 dataset in {}".format(tmp_file))

            next_idx = 0
            for file_name in sgf_files:
                if verbose:
                    print(file_name)
                # count number of state/action pairs yielded by this game
                n_pairs = 0
                file_start_idx = next_idx
                try:
                    for state, move in self.convert_game(file_name, bd_size):
                        # grow both datasets in lock-step, one row at a time
                        if next_idx >= len(states):
                            states.resize((next_idx + 1,) + state_shape)
                            actions.resize((next_idx + 1, 2))
                        states[next_idx] = state
                        actions[next_idx] = move
                        n_pairs += 1
                        next_idx += 1
                        nb_action += 1
                except go.IllegalMove:
                    warnings.warn("Illegal Move encountered, dropping the remainder of the game" )
                    removed_games += 1
                except sgf.ParseException:
                    warnings.warn("Could not parse, dropping game")
                    removed_games += 1
                except SizeMismatchError:
                    warnings.warn("Skipping; wrong board size")
                    removed_games += 1
                except Exception as e:
                    # catch everything else
                    if not ignore_errors:
                        # bare raise keeps the original traceback intact
                        raise
                    warnings.warn("Unkown exception with file %s  %s" % (file_name, e),
                                  stacklevel=2)
                    removed_games += 1
                finally:
                    # Pairs written before an error stay in the datasets, so
                    # their offsets are recorded even for a failed game.
                    if n_pairs > 0:
                        # '/' has special meaning in HDF5 key names, so they
                        # are replaced with ':' here
                        file_name_key = file_name.replace('/', ':')
                        file_offsets[file_name_key] = [file_start_idx, n_pairs]
                        if verbose:
                            print("\t%d state/action pairs extracted" % n_pairs)
                    elif verbose:
                        print("\t-no usable data-")
        except Exception:
            print("sgfs_to_hdf5 failed")
            # Release the handle before deleting; an open file cannot be
            # removed on every platform and would otherwise leak.
            h5f.close()
            os.remove(tmp_file)
            raise

        if verbose:
            print("FINISHED. renaming %s to %s" % (tmp_file, hdf5_file))
            print("With %s removed games" % removed_games)
            print("And a total of %s states" % nb_action)

        # processing complete; move tmp_file over hdf5_file
        h5f.close()
        # os.replace overwrites an existing target on every platform
        os.replace(tmp_file, hdf5_file)
Esempio n. 2
0
 def __init__(self, features: list) -> None:
     """Set up the feature preprocessor and remember its output dimension.

     `features` is forwarded unchanged to Preprocess.
     """
     processor = Preprocess(features)
     self.feature_processor = processor
     # Keep the preprocessor's output dimension as an attribute
     self.n_features = processor.output_dim
Esempio n. 3
0
class GameConverter:
    """Convert SGF game records into HDF5 datasets of (state, action) training pairs."""

    def __init__(self, features):
        """Create the feature preprocessor used to encode board states.

        Arguments:
        - features : feature specification, handed unchanged to Preprocess
        """
        self.feature_processor = Preprocess(features)
        # Used as the feature dimension of the 'states' dataset in sgfs_to_hdf5
        self.n_features = self.feature_processor.output_dim

    def convert_game(self, file_name, bd_size):
        """Read the given SGF file into an iterable of (input, output) pairs
        for neural network training.

        This is a generator: nothing is parsed or validated until it is iterated.

        Each input is a GameState converted into one-hot neural net features.
        Each output is an action as an (x,y) pair (passes are skipped)
        If this game's size does not match bd_size, a SizeMismatchError is raised
        """
        # The whole file is read while the handle is open, so iterating the
        # states after the 'with' block has closed the file is safe.
        with open(file_name, 'r') as file_object:
            state_action_iterator = sgf_iter_states(file_object.read(),
                                                    include_end=False)

        for (state, move, _player) in state_action_iterator:
            if state.size != bd_size:
                raise SizeMismatchError()
            if move != go.PASS_MOVE:
                nn_input = self.feature_processor.state_to_tensor(state)
                yield (nn_input, move)

    def sgfs_to_hdf5(self,
                     sgf_files,
                     hdf5_file,
                     bd_size=19,
                     ignore_errors=True):
        """Convert all files in the iterable sgf_files into an hdf5 group to be stored in hdf5_file

        Data is written to a hidden temporary file next to hdf5_file. On success
        the temporary file replaces hdf5_file; on failure it is closed and deleted
        and the exception is re-raised.

        Arguments:
        - sgf_files : an iterable of relative or absolute paths to SGF files
        - hdf5_file : the name of the HDF5 where features will be saved
        - bd_size : side length of board of games that are loaded
        - ignore_errors : if True, issues a Warning when there is an unknown
            exception rather than halting. Note that sgf.ParseException,
            go.IllegalMove and SizeMismatchError exceptions are always skipped
        The resulting file has the following properties:
            states  : dataset with shape (n_data, n_features, board width, board height)
            actions : dataset with shape (n_data, 2) (actions are stored as x,y tuples of
                      where the move was played)
            file_offsets : group mapping from filenames to tuples of (index, length)
            features : scalar holding the comma-separated list of feature planes
        For example, to find what positions in the dataset come from 'test.sgf':
            index, length = file_offsets['test.sgf']
            test_states = states[index:index+length]
            test_actions = actions[index:index+length]
        """

        # Make a hidden temporary file in case of a crash.
        # If success, this is renamed to hdf5_file
        tmp_file = os.path.join(os.path.dirname(hdf5_file),
                                ".tmp." + os.path.basename(hdf5_file))
        h5f = h5.File(tmp_file, 'w')

        try:
            state_shape = (self.n_features, bd_size, bd_size)
            states = h5f.require_dataset(
                'states',
                dtype=np.uint8,
                shape=(1,) + state_shape,
                maxshape=(None,) + state_shape,  # 'None' == arbitrary size
                exact=False,  # allow non-uint8 datasets to be loaded, coerced to uint8
                chunks=(64,) + state_shape,  # approximately 1MB chunks
                compression="lzf")
            actions = h5f.require_dataset('actions',
                                          dtype=np.uint8,
                                          shape=(1, 2),
                                          maxshape=(None, 2),
                                          exact=False,
                                          chunks=(1024, 2),
                                          compression="lzf")

            # 'file_offsets' is an HDF5 group mapping each file to its
            # (start index, length) slice of the datasets
            file_offsets = h5f.require_group('file_offsets')

            # Store comma-separated list of feature planes in the scalar field 'features'. The
            # String can be retrieved using h5py's scalar indexing: h5f['features'][()]
            # (np.bytes_ is the spelling that exists on both NumPy 1.x and 2.x;
            # the np.string_ alias was removed in NumPy 2.0.)
            h5f['features'] = np.bytes_(','.join(
                self.feature_processor.feature_list))

            next_idx = 0
            for file_name in sgf_files:
                # count number of state/action pairs yielded by this game
                n_pairs = 0
                file_start_idx = next_idx
                # Errors are handled per game so that one bad record does not
                # abort the conversion of the remaining files.
                try:
                    for state, move in self.convert_game(file_name, bd_size):
                        # grow both datasets in lock-step, one row at a time
                        if next_idx >= len(states):
                            states.resize((next_idx + 1,) + state_shape)
                            actions.resize((next_idx + 1, 2))
                        states[next_idx] = state
                        actions[next_idx] = move
                        n_pairs += 1
                        next_idx += 1
                except go.IllegalMove:
                    warnings.warn("Illegal Move encountered in %s\n"
                                  "dropping the remainder of the game" % file_name)
                except sgf.ParseException:
                    warnings.warn("Could not parse %s\n\tdropping game" % file_name)
                except SizeMismatchError:
                    warnings.warn("Skipping %s; wrong board size" % file_name)
                except Exception as e:
                    # catch everything else
                    if not ignore_errors:
                        # bare raise keeps the original traceback intact
                        raise
                    warnings.warn("Unkown exception with file %s\n\t%s" %
                                  (file_name, e),
                                  stacklevel=2)
                finally:
                    # Pairs written before an error stay in the datasets, so
                    # their offsets are recorded even for a failed game.
                    if n_pairs > 0:
                        # '/' has special meaning in HDF5 key names, so they
                        # are replaced with ':' here
                        file_name_key = file_name.replace('/', ':')
                        file_offsets[file_name_key] = [file_start_idx, n_pairs]
        except Exception:
            print("sgfs_to_hdf5 failed")
            # Release the handle before deleting; an open file cannot be
            # removed on every platform and would otherwise leak.
            h5f.close()
            os.remove(tmp_file)
            raise

        # processing complete; move tmp_file over hdf5_file
        h5f.close()
        # os.replace overwrites an existing target on every platform
        os.replace(tmp_file, hdf5_file)