class GameConverter:
    """Convert SGF game records into HDF5 datasets of (state, action) pairs
    for neural network training."""

    def __init__(self, features: list) -> None:
        # Preprocess turns a GameState into stacked one-hot feature planes;
        # output_dim is the number of planes it produces.
        self.feature_processor = Preprocess(features)
        self.n_features = self.feature_processor.output_dim

    def convert_game(self, file_name, bd_size: int):
        """Read the given SGF file and yield (input, output) pairs for neural
        network training.

        Each input is a GameState converted into one-hot neural net features;
        each output is an action as an (x, y) pair (passes are skipped).

        Raises SizeMismatchError if this game's size does not match bd_size.
        """
        # The file is read fully up front; sgf_iter_states replays the game
        # move by move, yielding (state, move, player) triples.
        with open(file_name, 'r') as file_object:
            state_action_iterator = sgf_iter_states(file_object.read(), include_end=False)
            for (state, move, player) in state_action_iterator:
                if state.size != bd_size:
                    raise SizeMismatchError()
                if move != go.PASS_MOVE:
                    nn_input = self.feature_processor.state_to_tensor(state)
                    yield (nn_input, move)

    def sgfs_to_hdf5(self, sgf_files, hdf5_file, bd_size: int = 19,
                     ignore_errors: bool = True, verbose: bool = False) -> None:
        """Convert all files in the iterable sgf_files into an hdf5 group to be
        stored in hdf5_file

        Parameters
        ----------
        - sgf_files: an iterable of relative or absolute paths to SGF files
        - hdf5_file: the name of the HDF5 where features will be saved
        - bd_size: side length of board of games that are loaded
        - ignore_errors: if True, issues a Warning when there is an unknown
          exception rather than halting. Note that sgf.ParseException and
          go.IllegalMove exceptions are always skipped
        - verbose: if True, print per-file progress information

        Results
        -------
        states : dataset with shape (n_data, board width, board height, n_features)
        actions: dataset with shape (n_data, 2) (actions are stored as x,y tuples of
                 where the move was played)
        file_offsets : group mapping from filenames to tuples of (index, length)

        For example, to find what positions in the dataset come from 'test.sgf':

            index, length = file_offsets['test.sgf']
            test_states = states[index:index+length]
            test_actions = actions[index:index+length]
        """
        removed_games = 0  # games dropped because of any error
        n_actions = 0      # total state/action pairs written

        # make a hidden temporary file in case of a crash.
        # on success, this is renamed to hdf5_file
        tmp_file = os.path.join(os.path.dirname(hdf5_file),
                                ".tmp." + os.path.basename(hdf5_file))
        h5f = h5.File(tmp_file, 'w')

        try:
            # NOTE(review): channels-last layout (n, w, h, features) — confirm
            # state_to_tensor emits a compatible shape for assignment below.
            states = h5f.require_dataset(
                'states',
                dtype=np.uint8,
                shape=(1, bd_size, bd_size, self.n_features),
                maxshape=(None, bd_size, bd_size, self.n_features),  # 'None' == arbitrary size
                exact=False,  # allow non-uint8 datasets to be loaded, coerced to uint8
                chunks=(64, bd_size, bd_size, self.n_features),  # approximately 1MB chunks
                compression="lzf")
            actions = h5f.require_dataset(
                'actions',
                dtype=np.uint8,
                shape=(1, 2),
                maxshape=(None, 2),
                exact=False,
                chunks=(1024, 2),
                compression="lzf")

            # 'file_offsets' is an HDF5 group so that 'file_name in file_offsets' is fast
            file_offsets = h5f.require_group('file_offsets')

            # Store comma-separated list of feature planes in the scalar field 'features'.
            # The string can be retrieved using h5py's scalar indexing: h5f['features'][()]
            h5f['features'] = np.string_(','.join(self.feature_processor.feature_list))
            h5f['features_nb'] = self.n_features

            if verbose:
                print("created HDF5 dataset in {}".format(tmp_file))

            next_idx = 0
            for file_name in sgf_files:
                if verbose:
                    print(file_name)
                # count number of state/action pairs yielded by this game
                n_pairs = 0
                file_start_idx = next_idx
                try:
                    for state, move in self.convert_game(file_name, bd_size):
                        # grow the datasets lazily, one row at a time
                        if next_idx >= len(states):
                            states.resize((next_idx + 1, bd_size, bd_size, self.n_features))
                            actions.resize((next_idx + 1, 2))
                        states[next_idx] = state
                        actions[next_idx] = move
                        n_pairs += 1
                        next_idx += 1
                        n_actions += 1
                except go.IllegalMove:
                    warnings.warn("Illegal Move encountered, dropping the remainder of the game")
                    removed_games += 1
                except sgf.ParseException:
                    warnings.warn("Could not parse, dropping game")
                    removed_games += 1
                except SizeMismatchError:
                    warnings.warn("Skipping; wrong board size")
                    removed_games += 1
                except Exception as e:
                    # catch everything else
                    if ignore_errors:
                        warnings.warn("Unkown exception with file %s %s" % (file_name, e),
                                      stacklevel=2)
                        removed_games += 1
                    else:
                        raise  # bare raise keeps the original traceback
                finally:
                    if n_pairs > 0:
                        # '/' has special meaning in HDF5 key names, so they
                        # are replaced with ':' here
                        file_name_key = file_name.replace('/', ':')
                        file_offsets[file_name_key] = [file_start_idx, n_pairs]
                        if verbose:
                            print("\t%d state/action pairs extracted" % n_pairs)
                    elif verbose:
                        print("\t-no usable data-")
        except Exception:
            # any failure while the temp file is open: clean it up, re-raise
            print("sgfs_to_hdf5 failed")
            os.remove(tmp_file)
            raise

        if verbose:
            print("FINISHED. renaming %s to %s" % (tmp_file, hdf5_file))
            # fixed: these two were Python 2 print statements (syntax errors
            # under Python 3, inconsistent with every other print() in the file)
            print("With %s removed games" % removed_games)
            print("And a total of %s states" % n_actions)

        # processing complete; rename tmp_file to hdf5_file
        h5f.close()
        os.rename(tmp_file, hdf5_file)
def __init__(self, features: list) -> None:
    """Build the feature preprocessor for *features* and record the number
    of output planes it produces."""
    processor = Preprocess(features)
    self.feature_processor = processor
    self.n_features = processor.output_dim
class GameConverter:
    """Convert SGF game records into HDF5 training data (channels-first layout)."""

    def __init__(self, features):
        # Preprocess maps a GameState to one-hot feature planes;
        # output_dim is the number of planes.
        self.feature_processor = Preprocess(features)
        self.n_features = self.feature_processor.output_dim

    def convert_game(self, file_name, bd_size):
        """Read the given SGF file into an iterable of (input, output) pairs
        for neural network training.

        Each input is a GameState converted into one-hot neural net features.
        Each output is an action as an (x,y) pair (passes are skipped)

        If this game's size does not match bd_size, a SizeMismatchError is raised
        """
        with open(file_name, 'r') as file_object:
            state_action_iterator = sgf_iter_states(file_object.read(), include_end=False)
            for (state, move, player) in state_action_iterator:
                if state.size != bd_size:
                    raise SizeMismatchError()
                if move != go.PASS_MOVE:
                    nn_input = self.feature_processor.state_to_tensor(state)
                    yield (nn_input, move)

    def sgfs_to_hdf5(self, sgf_files, hdf5_file, bd_size=19, ignore_errors=True):
        """Convert all files in the iterable sgf_files into an hdf5 group to be
        stored in hdf5_file

        Arguments:
        - sgf_files : an iterable of relative or absolute paths to SGF files
        - hdf5_file : the name of the HDF5 where features will be saved
        - bd_size : side length of board of games that are loaded
        - ignore_errors : if True, issues a Warning when there is an unknown
          exception rather than halting. Note that sgf.ParseException and
          go.IllegalMove exceptions are always skipped

        The resulting file has the following properties:
        states : dataset with shape (n_data, n_features, board width, board height)
        actions : dataset with shape (n_data, 2) (actions are stored as x,y tuples of
                  where the move was played)
        file_offsets : group mapping from filenames to tuples of (index, length)

        For example, to find what positions in the dataset come from 'test.sgf':

            index, length = file_offsets['test.sgf']
            test_states = states[index:index+length]
            test_actions = actions[index:index+length]
        """
        # Make a hidden temporary file in case of a crash.
        # If success, this is renamed to hdf5_file
        tmp_file = os.path.join(os.path.dirname(hdf5_file),
                                ".tmp." + os.path.basename(hdf5_file))
        h5f = h5.File(tmp_file, 'w')

        try:
            states = h5f.require_dataset(
                'states',
                dtype=np.uint8,
                shape=(1, self.n_features, bd_size, bd_size),
                maxshape=(None, self.n_features, bd_size, bd_size),  # 'None' == arbitrary size
                exact=False,  # allow non-uint8 datasets to be loaded, coerced to uint8
                chunks=(64, self.n_features, bd_size, bd_size),  # approximately 1MB chunks
                compression="lzf")
            actions = h5f.require_dataset(
                'actions',
                dtype=np.uint8,
                shape=(1, 2),
                maxshape=(None, 2),
                exact=False,
                chunks=(1024, 2),
                compression="lzf")

            # fixed: the docstring promises a 'file_offsets' group but the code
            # never created or populated it; restored below (group lookup makes
            # 'file_name in file_offsets' fast)
            file_offsets = h5f.require_group('file_offsets')

            # Store comma-separated list of feature planes in the scalar field 'features'. The
            # String can be retrieved using h5py's scalar indexing: h5f['features'][()]
            h5f['features'] = np.string_(','.join(
                self.feature_processor.feature_list))

            next_idx = 0
            for file_name in sgf_files:
                # count number of state/action pairs yielded by this game
                n_pairs = 0
                file_start_idx = next_idx
                # fixed: per-file exception handling. These handlers previously
                # hung off the outer try, so the first bad game aborted the whole
                # conversion and the trailing generic handler was unreachable.
                try:
                    for state, move in self.convert_game(file_name, bd_size):
                        if next_idx >= len(states):
                            states.resize(
                                (next_idx + 1, self.n_features, bd_size, bd_size))
                            actions.resize((next_idx + 1, 2))
                        states[next_idx] = state
                        actions[next_idx] = move
                        n_pairs += 1
                        next_idx += 1
                except go.IllegalMove:
                    warnings.warn("Illegal Move encountered in %s\n"
                                  "dropping the remainder of the game" % file_name)
                except sgf.ParseException:
                    warnings.warn("Could not parse %s\n\tdropping game" % file_name)
                except SizeMismatchError:
                    warnings.warn("Skipping %s; wrong board size" % file_name)
                except Exception as e:
                    # catch everything else
                    if ignore_errors:
                        warnings.warn("Unkown exception with file %s\n\t%s" % (file_name, e),
                                      stacklevel=2)
                    else:
                        raise  # bare raise keeps the original traceback
                finally:
                    if n_pairs > 0:
                        # '/' has special meaning in HDF5 key names, so they
                        # are replaced with ':' here
                        file_name_key = file_name.replace('/', ':')
                        file_offsets[file_name_key] = [file_start_idx, n_pairs]
        except Exception:
            # any failure while the temp file exists: clean it up, re-raise
            print("sgfs_to_hdf5 failed")
            os.remove(tmp_file)
            raise

        h5f.close()
        os.rename(tmp_file, hdf5_file)