import os
from typing import List, Union

import numpy as np
import pandas as pd
import tensorflow as tf
from fs_s3fs import S3FS
from skimage.exposure import match_histograms

# Helpers referenced below (_construct_norm_arrays, _npz_file_lazy_dataset,
# pad_to_k, ImageSet) are assumed to be defined elsewhere in this module/package.


def npz_dir_dataset(file_dir_or_list: Union[str, List[str]],
                    features: dict,
                    metadata_path: str,
                    fold: int = None,
                    randomize: bool = True,
                    num_parallel: int = 5,
                    shuffle_size: int = 500,
                    filesystem: S3FS = None,
                    npz_from_s3: bool = False) -> tf.data.Dataset:
    """ Creates a `tf.data.Dataset` from a directory containing numpy `.npz` files.

    Files are loaded lazily when needed. `num_parallel` files are read in parallel and interleaved together.

    :param file_dir_or_list: directory containing `.npz` files or a list of paths to `.npz` files
    :param features: dict of (`field` -> `feature_name`) mappings, where `field` is the field in the `.npz` array
        and `feature_name` is the name of the feature it is saved to
    :param metadata_path: path to the input csv file with patchlet information
    :param fold: in k-fold validation, fold to consider when querying the patchlet info dataframe
    :param randomize: whether to shuffle the samples of the dataset or not, defaults to `True`
    :param num_parallel: number of files to read in parallel and interleave, defaults to 5
    :param shuffle_size: buffer size for shuffling file order, defaults to 500
    :param filesystem: filesystem to access the bucket, defaults to None
    :param npz_from_s3: if True, `.npz` files are loaded from the S3 bucket, otherwise from local disk
    :return: dataset containing examples merged from files
    """
    files = file_dir_or_list

    if npz_from_s3:
        assert filesystem is not None

    # If a directory is given, list the .npz files it contains
    if isinstance(file_dir_or_list, str):
        if filesystem and not filesystem.isdir(file_dir_or_list):
            filesystem.makedirs(file_dir_or_list)
        dir_list = os.listdir(file_dir_or_list) if not npz_from_s3 else filesystem.listdir(file_dir_or_list)
        files = [os.path.join(file_dir_or_list, f) for f in dir_list]

    fields = list(features.keys())

    # Read one file to infer the per-sample shapes
    file = next(iter(files))
    data = np.load(file) if not npz_from_s3 else np.load(filesystem.openbin(file))
    np_arrays = [data[f] for f in fields]

    # Append the normalization arrays computed from the patchlet metadata
    perc99, meanstd_mean, meanstd_median, meanstd_std = _construct_norm_arrays(file, metadata_path,
                                                                               fold, filesystem)
    np_arrays.append(perc99)
    np_arrays.append(meanstd_mean)
    np_arrays.append(meanstd_median)
    np_arrays.append(meanstd_std)

    # Output types are fixed rather than inferred from the arrays
    # (i.e. not `tuple(arr.dtype for arr in np_arrays)`), so the dataset signature stays stable
    types = (tf.uint16, tf.float32, tf.float32, tf.float32,
             tf.float64, tf.float64, tf.float64, tf.float64)
    shapes = tuple(arr.shape[1:] for arr in np_arrays)

    # Create one lazy dataset per file
    datasets = [_npz_file_lazy_dataset(file, fields, types, shapes, metadata_path, fold=fold,
                                       filesystem=filesystem, npz_from_s3=npz_from_s3)
                for file in files]
    ds = tf.data.Dataset.from_tensor_slices(datasets)

    # Shuffle file order and interleave multiple files in parallel
    if randomize:
        ds = ds.shuffle(shuffle_size)
    ds = ds.interleave(lambda x: x, cycle_length=num_parallel)

    return ds
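# A minimal usage sketch for `npz_dir_dataset`. The directory `patchlets_npz/`,
# the file `metadata.csv` and the field names `X`/`y` are illustrative
# assumptions, not names from this repository.
def _example_npz_dir_dataset():
    features = {'X': 'features', 'y': 'labels'}  # hypothetical field -> feature mapping
    ds = npz_dir_dataset('patchlets_npz/', features,
                         metadata_path='metadata.csv',
                         fold=1, num_parallel=4)
    print(ds.element_spec)  # inspect the structure of one interleaved element
    return ds.batch(8).prefetch(tf.data.experimental.AUTOTUNE)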
def read_imageset(imset_file: str,
                  filesystem: S3FS = None,
                  normalize: bool = True,
                  country_norm_df: pd.DataFrame = None,
                  norm_deimos_npz: np.lib.npyio.NpzFile = None,
                  norm_s2_npz: np.lib.npyio.NpzFile = None,
                  n_views: int = 16,
                  padding: str = 'zeros',
                  histogram_matching: bool = False) -> ImageSet:
    """ Retrieves all assets from the given directory.

    :param imset_file: name of the npz file with the sample imageset
    :param filesystem: S3 filesystem to read files directly from the bucket. By default, reads from local disk
    :param normalize: whether to normalize the data or not
    :param country_norm_df: S2 median/std normalization factors stored per country
    :param norm_deimos_npz: 1st and 99th percentile normalization factors for DEIMOS
    :param norm_s2_npz: 1st and 99th percentile normalization factors for S2
    :param n_views: number of time frames to consider in the LR sequence. If `n_views` is smaller than the number
        of available time frames, the last `n_views` time frames of the LR sequence are taken
    :param padding: strategy used to fill the LR sequence if `n_views` is greater than the available timestamps.
        Supported options are `zeros`, where 0-valued frames are appended to the features, or `repeat`, where the
        last time frame is repeated to fill the sequence
    :param histogram_matching: whether to match the histogram between the HR and the corresponding LR image
    :return: `ImageSet` holding the LR sequence, the HR image, the alphas and the DEIMOS timestamp
    """
    assert padding in ['zeros', 'repeat']

    # Read the sample arrays
    npz = np.load(filesystem.openbin(imset_file), allow_pickle=True) if filesystem \
        else np.load(imset_file, allow_pickle=True)

    features = npz['features']
    hr = npz['labels']

    if normalize:
        # Standardize S2 features with the per-country median/std factors
        country = npz['countries']
        country_stats = country_norm_df[country_norm_df.country == str(country)]
        norm_median = country_stats[['median_0', 'median_1', 'median_2', 'median_3']].values
        norm_std = country_stats[['std_0', 'std_1', 'std_2', 'std_3']].values
        features = (features - norm_median) / norm_std

        # Rescale both sources to their 1st/99th percentile range
        deimos_p1 = norm_deimos_npz['p1']
        deimos_p99 = norm_deimos_npz['p99']
        s2_p1 = norm_s2_npz['p1']
        s2_p99 = norm_s2_npz['p99']
        hr = (hr - deimos_p1) / (deimos_p99 - deimos_p1)
        features = (features - s2_p1) / (s2_p99 - s2_p1)

    alphas = np.ones(n_views)

    if histogram_matching:
        hr = match_histograms(hr, features[-1], multichannel=True)

    # Pad or crop the LR sequence to exactly `n_views` time frames
    n_feature_timestamps = len(features)
    if n_feature_timestamps < n_views:
        if padding == 'zeros':
            features = pad_to_k(features, n_views, pad_to_front=False)
            alphas[n_feature_timestamps:] = 0
        elif padding == 'repeat':
            n_pad = n_views - n_feature_timestamps
            padded = features[-1:].repeat(n_pad, axis=0)
            features = np.concatenate((features, padded))
    else:
        features = features[-n_views:, ...]

    # Tensor is `CxTxHxW`
    features = np.moveaxis(features, -1, 0)
    hr = np.moveaxis(hr, 2, 0)

    imageset = ImageSet(name=os.path.basename(imset_file),
                        timestamp_deimos=str(npz['timetamps_deimos'].item()),  # key name as stored in the .npz files
                        lr=features,
                        hr=hr,
                        alphas=alphas)

    return imageset
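# A hedged sketch of loading a single sample with normalization enabled. All
# file names here (`imageset_0001.npz`, `country_norm.csv`, `deimos_norm.npz`,
# `s2_norm.npz`) are illustrative assumptions, not files from this repository.
def _example_read_imageset():
    country_norm_df = pd.read_csv('country_norm.csv')  # per-country S2 median/std factors
    norm_deimos_npz = np.load('deimos_norm.npz')       # DEIMOS percentiles under keys 'p1'/'p99'
    norm_s2_npz = np.load('s2_norm.npz')               # S2 percentiles under keys 'p1'/'p99'
    imset = read_imageset('imageset_0001.npz',
                          normalize=True,
                          country_norm_df=country_norm_df,
                          norm_deimos_npz=norm_deimos_npz,
                          norm_s2_npz=norm_s2_npz,
                          n_views=8,
                          padding='zeros')
    print(imset.lr.shape, imset.hr.shape)  # lr is CxTxHxW after the axis move
    return imset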