def _load_dataset_file(dataset_filepath):
    """
    Loads a single dataset file given by its path

    :param dataset_filepath: path where the file is located
    :return: s_idcs, x, yv, yp - numpy arrays as returned by get_numpy_arrays()
    """
    store = zarr.ZipStore(dataset_filepath, mode='r')
    pgn_dataset = zarr.group(store=store)
    s_idcs, x, yv, yp = get_numpy_arrays(pgn_dataset)
    return s_idcs, x, yv, yp
def _load_dataset_file(dataset_filepath):
    """
    Loads a single dataset file given by its path

    :param dataset_filepath: path where the file is located
    :return: starting_idx: [int] - list of indices where each game starts
             x: np.ndarray - numpy array which contains the game positions
             y_value: np.ndarray - numpy array which describes the winner for each board position
             y_policy: np.ndarray - numpy array which describes the policy distribution for each board state
                       (in case of a pgn dataset the move is one-hot encoded)
    """
    return get_numpy_arrays(
        zarr.group(store=zarr.ZipStore(dataset_filepath, mode="r")))
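# Hedged usage sketch for _load_dataset_file(); the archive path below is
# hypothetical and must point to an existing zarr ZipStore produced by the
# plane-conversion step.
def _demo_load_dataset_file():  # hypothetical helper, not part of the original API
    starting_idx, x, y_value, y_policy = _load_dataset_file(
        "planes/train/part_0.zip")  # hypothetical file path
    print("games in file:", len(starting_idx))
    print("positions:", x.shape[0], "plane shape:", x.shape[1:])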
def load_pgn_dataset(
        dataset_type="train",
        part_id=0,
        verbose=True,
        normalize=False,
        q_value_ratio=0,
):
    """
    Loads one part of the pgn dataset in form of planes / multidimensional numpy array.
    It reads all files which are located in the directory that main_config assigns to the
    given dataset_type (e.g. main_config['planes_train_dir'])

    :param dataset_type: one of ['train', 'val', 'test', 'mate_in_one']
    :param part_id: Decides which part of the data set will be loaded
    :param verbose: True if the log message shall be shown
    :param normalize: True if the inputs shall be normalized to 0-1
     ! Note: this is only supported for hist-length=1 at the moment
    :param q_value_ratio: Ratio for mixing the value return with the corresponding q-value
     For a ratio of 0 no q-value information will be used. Value must be in [0, 1]
    :return: numpy-arrays:
            start_indices - defines the index where each game starts
            x - the board representation for all games
            y_value - the game outcome (-1,0,1) for each board position
            y_policy - the movement policy for the next_move played
            plys_to_end - array of how many plys to the end of the game for each position.
             This can be used to apply discounting
            pgn_dataset - the dataset file handle (you can use .tree() to show the file structure)
    """
    if dataset_type == "train":
        zarr_filepaths = glob.glob(main_config["planes_train_dir"] + "**/*.zip")
    elif dataset_type == "val":
        zarr_filepaths = glob.glob(main_config["planes_val_dir"] + "**/*.zip")
    elif dataset_type == "test":
        zarr_filepaths = glob.glob(main_config["planes_test_dir"] + "**/*.zip")
    elif dataset_type == "mate_in_one":
        zarr_filepaths = glob.glob(main_config["planes_mate_in_one_dir"] + "**/*.zip")
    else:
        raise Exception(
            'Invalid dataset type "%s" given. It must be either "train", "val", "test" or "mate_in_one"'
            % dataset_type)

    if len(zarr_filepaths) < part_id + 1:
        raise Exception(
            "There aren't enough parts available (%d parts) in the given directory for part_id=%d"
            % (len(zarr_filepaths), part_id))

    # load the zarr-files
    pgn_datasets = zarr_filepaths
    if verbose:
        logging.debug("loading: %s...", pgn_datasets[part_id])
        logging.debug("")

    pgn_dataset = zarr.group(
        store=zarr.ZipStore(pgn_datasets[part_id], mode="r"))
    # get the data
    start_indices, x, y_value, y_policy, plys_to_end, y_best_move_q = get_numpy_arrays(
        pgn_dataset)

    if verbose:
        logging.info("STATISTICS:")
        try:
            for member in pgn_dataset["statistics"]:
                print(member, list(pgn_dataset["statistics"][member]))
        except KeyError:
            logging.warning("no statistics found")

        logging.info("PARAMETERS:")
        try:
            for member in pgn_dataset["parameters"]:
                print(member, list(pgn_dataset["parameters"][member]))
        except KeyError:
            logging.warning("no parameters found")

    if q_value_ratio != 0:
        y_value = (1 - q_value_ratio) * y_value + q_value_ratio * y_best_move_q
    if normalize:
        x = x.astype(np.float32)
        # the y-vectors need to be cast as well in order to be accepted by the network
        y_value = y_value.astype(np.float32)
        y_policy = y_policy.astype(np.float32)
        # apply rescaling using a predefined scaling constant (this makes use of vectorized operations)
        x *= MATRIX_NORMALIZER
    return start_indices, x, y_value, y_policy, plys_to_end, pgn_dataset
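# Hedged sketch of the q-value mixing that load_pgn_dataset() applies when
# q_value_ratio > 0; the arrays below are made-up illustration values.
def _demo_q_value_mixing():  # hypothetical helper, not part of the original API
    q_value_ratio = 0.3
    y_value = np.array([1.0, -1.0, 0.0])        # hard game outcomes
    y_best_move_q = np.array([0.6, -0.2, 0.1])  # q-values of the best moves
    # linear interpolation between the game outcome and the q-value target
    y_mixed = (1 - q_value_ratio) * y_value + q_value_ratio * y_best_move_q
    print(y_mixed)  # -> [ 0.88 -0.76  0.03]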
def load_pgn_dataset(dataset_type='train', part_id=0,
                     print_statistics=False, print_parameters=False, verbose=True, normalize=False):
    """
    Loads one part of the pgn dataset in form of planes / multidimensional numpy array.
    It reads all files which are located in the directory that main_config assigns to the
    given dataset_type (e.g. main_config['planes_train_dir'])

    :param dataset_type: one of ['train', 'val', 'test', 'mate_in_one']
    :param part_id: Decides which part of the data set will be loaded
    :param print_statistics: Decides whether to print file statistics
    :param print_parameters: Decides whether to print the parameters with which the dataset was generated
    :param verbose: True if the log message shall be shown
    :param normalize: True if the inputs shall be normalized to 0-1
     ! Note: this is only supported for hist-length=1 at the moment
    :return: numpy-arrays:
            s_idcs - defines the index where each game starts
            x - the board representation for all games
            yv - the game outcome (-1,0,1) for each board position
            yp - the movement policy for the next_move played
            pgn_dataset - the dataset file handle (you can use .tree() to show the file structure)
    """
    if dataset_type == 'train':
        zarr_filepaths = glob.glob(main_config["planes_train_dir"] + '**/*')
    elif dataset_type == 'val':
        zarr_filepaths = glob.glob(main_config["planes_val_dir"] + '**/*')
    elif dataset_type == 'test':
        zarr_filepaths = glob.glob(main_config["planes_test_dir"] + '**/*')
    elif dataset_type == 'mate_in_one':
        zarr_filepaths = glob.glob(main_config["planes_mate_in_one_dir"] + '**/*')
    else:
        raise Exception('Invalid dataset type "%s" given. It must be either "train", "val", "test" or "mate_in_one"'
                        % dataset_type)

    if len(zarr_filepaths) < part_id + 1:
        raise Exception("There aren't enough parts available in the given directory for part_id=" + str(part_id))

    # load the zarr-files
    if verbose:
        logging.debug('loading: %s...', zarr_filepaths[part_id])
        logging.debug('')

    store = zarr.ZipStore(zarr_filepaths[part_id], mode='r')
    pgn_dataset = zarr.group(store=store)
    # get the data
    s_idcs, x, yv, yp = get_numpy_arrays(pgn_dataset)

    if print_statistics:
        logging.info('STATISTICS:')
        for member in pgn_dataset['statistics']:
            print(member, list(pgn_dataset['statistics'][member]))

    if print_parameters:
        logging.info('PARAMETERS:')
        for member in pgn_dataset['parameters']:
            print(member, list(pgn_dataset['parameters'][member]))

    if normalize:
        x = x.astype(np.float32)
        # the y-vectors need to be cast as well in order to be accepted by the network
        yv = yv.astype(np.float32)
        yp = yp.astype(np.float32)

        # !TODO replace this by function normalize_input_planes()
        mat_pos = x[:, :NB_CHANNELS_POS, :, :]
        mat_const = x[:, NB_CHANNELS_POS:, :, :]

        # iterate over all pieces except the king
        for p_type in chess.PIECE_TYPES[:-1]:
            # p_type - 1 because p_type starts with 1
            ch = CHANNEL_MAPPING_POS['prisoners'] + p_type - 1
            mat_pos[:, ch, :, :] /= MAX_NB_PRISONERS
            # the prison for black begins 5 channels later
            mat_pos[:, ch + POCKETS_SIZE_PIECE_TYPE, :, :] /= MAX_NB_PRISONERS

        ### Total Move Count
        # 500 was set as the max number of total moves
        mat_const[:, CHANNEL_MAPPING_CONST['total_mv_cnt'], :, :] /= MAX_NB_MOVES
        ### No progress count
        # after 40 moves of no progress the 40 moves rule for draw applies
        mat_const[:, CHANNEL_MAPPING_CONST['no_progress_cnt'], :, :] /= MAX_NB_NO_PROGRESS

    return s_idcs, x, yv, yp, pgn_dataset
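# Minimal sketch of the normalize_input_planes() helper that the !TODO above
# refers to; it assumes the same channel constants (NB_CHANNELS_POS,
# CHANNEL_MAPPING_POS, CHANNEL_MAPPING_CONST, MAX_NB_PRISONERS,
# POCKETS_SIZE_PIECE_TYPE, MAX_NB_MOVES, MAX_NB_NO_PROGRESS) are in scope.
# It is an illustration of the rescaling, not the original implementation.
def normalize_input_planes(x):
    """Rescales the counter channels of a batch of input planes to [0, 1]."""
    x = x.astype(np.float32)  # work on a float copy so integer planes aren't truncated
    mat_pos = x[:, :NB_CHANNELS_POS, :, :]
    mat_const = x[:, NB_CHANNELS_POS:, :, :]
    for p_type in chess.PIECE_TYPES[:-1]:  # every piece type except the king
        ch = CHANNEL_MAPPING_POS['prisoners'] + p_type - 1
        mat_pos[:, ch, :, :] /= MAX_NB_PRISONERS
        mat_pos[:, ch + POCKETS_SIZE_PIECE_TYPE, :, :] /= MAX_NB_PRISONERS  # black's pocket
    mat_const[:, CHANNEL_MAPPING_CONST['total_mv_cnt'], :, :] /= MAX_NB_MOVES
    mat_const[:, CHANNEL_MAPPING_CONST['no_progress_cnt'], :, :] /= MAX_NB_NO_PROGRESS
    return x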
def load_pgn_dataset(dataset_type="train", part_id=0, print_statistics=False, print_parameters=False, verbose=True, normalize=False): # Too many arguments (6/5) """ Loads one part of the pgn dataset in form of planes / multidimensional numpy array. It reads all files which are located either in the main_config['test_dir'] or main_config['test_dir'] :param dataset_type: either ['train', 'test', 'mate_in_one'] :param part_id: Decides which part of the data set will be loaded :param print_statistics: Decides whether to print file statistics :param print_parameters: Decide whether to print the parameters with which the dataset was generated :param verbose: True if the log message shall be shown :param normalize: True if the inputs shall be normalized to 0-1 ! Note this only supported for hist-length=1 at the moment :return: numpy-arrays: starting_idx - defines the index where each game starts x - the board representation for all games y_value - the game outcome (-1,0,1) for each board position y_policy - the movement policy for the next_move played pgn_datasets - the dataset file handle (you can use .tree() to show the file structure) """ if dataset_type == "train": zarr_filepaths = glob.glob(main_config["planes_train_dir"] + "**/*") elif dataset_type == "val": zarr_filepaths = glob.glob(main_config["planes_val_dir"] + "**/*") elif dataset_type == "test": zarr_filepaths = glob.glob(main_config["planes_test_dir"] + "**/*") elif dataset_type == "mate_in_one": zarr_filepaths = glob.glob(main_config["planes_mate_in_one_dir"] + "**/*") else: raise Exception( 'Invalid dataset type "%s" given. It must be either "train", "val", "test" or "mate_in_one"' % dataset_type) if len(zarr_filepaths) < part_id + 1: raise Exception( "There aren't enough parts available in the given directory for partid=" + str(part_id)) # load the zarr-files pgn_datasets = zarr_filepaths if verbose: logging.debug("loading: %s...", pgn_datasets[part_id]) logging.debug("") pgn_dataset = zarr.group( store=zarr.ZipStore(pgn_datasets[part_id], mode="r")) starting_idx, x, y_value, y_policy = get_numpy_arrays( pgn_dataset) # Get the data if print_statistics: logging.info("STATISTICS:") for member in pgn_dataset["statistics"]: print(member, list(pgn_dataset["statistics"][member])) if print_parameters: logging.info("PARAMETERS:") for member in pgn_dataset["parameters"]: print(member, list(pgn_dataset["parameters"][member])) if normalize: x = x.astype(np.float32) # the y-vectors need to be casted as well in order to be accepted by the network y_value = y_value.astype(np.float32) y_policy = y_policy.astype(np.float32) # apply rescaling using a predefined scaling constant (this makes use of vectorized operations) x *= MATRIX_NORMALIZER return starting_idx, x, y_value, y_policy, pgn_dataset