Code example #1
import os
from typing import List, Optional, Union

import numpy as np
import tensorflow as tf
from fs_s3fs import S3FS

# `_construct_norm_arrays` and `_npz_file_lazy_dataset` are project-internal helpers,
# assumed to be defined alongside this function.

def npz_dir_dataset(file_dir_or_list: Union[str, List[str]],
                    features: dict,
                    metadata_path: str,
                    fold: Optional[int] = None,
                    randomize: bool = True,
                    num_parallel: int = 5,
                    shuffle_size: int = 500,
                    filesystem: S3FS = None,
                    npz_from_s3: bool = False) -> tf.data.Dataset:
    """ Creates a tf.data.Dataset from a directory containing numpy .npz files.

    Files are loaded lazily when needed. `num_parallel` files are read in parallel and interleaved together.

    :param file_dir_or_list: directory containing .npz files or a list of paths to .npz files
    :param features: dict of (`field` -> `feature_name`) mappings, where `field` is the key in the .npz archive
                     and `feature_name` is the name of the feature it is mapped to
    :param metadata_path: path to the input csv file with patchlet information
    :param fold: in k-fold validation, the fold to consider when querying the patchlet info dataframe
    :param randomize: whether to shuffle the samples of the dataset, defaults to `True`
    :param num_parallel: number of files to read in parallel and interleave, defaults to 5
    :param shuffle_size: buffer size for shuffling file order, defaults to 500
    :param filesystem: filesystem used to access the S3 bucket, defaults to None
    :param npz_from_s3: if True, npz files are loaded from the S3 bucket, otherwise from local disk
    :return: dataset containing examples merged from files
    """

    files = file_dir_or_list

    if npz_from_s3:
        assert filesystem is not None, 'Loading .npz files from S3 requires a filesystem object'

    # If a directory is given, list the .npz files it contains
    if isinstance(file_dir_or_list, str):
        if filesystem and not filesystem.isdir(file_dir_or_list):
            filesystem.makedirs(file_dir_or_list)
        dir_list = filesystem.listdir(file_dir_or_list) if npz_from_s3 else os.listdir(file_dir_or_list)
        files = [os.path.join(file_dir_or_list, f) for f in dir_list]

    fields = list(features.keys())

    # Read one file to infer array shapes
    file = next(iter(files))
    data = np.load(filesystem.openbin(file)) if npz_from_s3 else np.load(file)
    np_arrays = [data[f] for f in fields]

    # Append norm arrays
    perc99, meanstd_mean, meanstd_median, meanstd_std = _construct_norm_arrays(
        file, metadata_path, fold, filesystem)

    np_arrays.extend([perc99, meanstd_mean, meanstd_median, meanstd_std])

    # Shape and type info; the dtypes are hard-coded for the four .npz fields
    # followed by the four normalization arrays appended above
    types = (tf.uint16, tf.float32, tf.float32, tf.float32,
             tf.float64, tf.float64, tf.float64, tf.float64)
    shapes = tuple(arr.shape[1:] for arr in np_arrays)

    # Create datasets
    datasets = [
        _npz_file_lazy_dataset(file,
                               fields,
                               types,
                               shapes,
                               metadata_path,
                               fold=fold,
                               filesystem=filesystem,
                               npz_from_s3=npz_from_s3) for file in files
    ]
    # Build a dataset whose elements are the per-file datasets; `interleave` below flattens it
    ds = tf.data.Dataset.from_tensor_slices(datasets)

    # Shuffle files and interleave multiple files in parallel
    if randomize:
        ds = ds.shuffle(shuffle_size)

    ds = ds.interleave(lambda x: x, cycle_length=num_parallel)

    return ds
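
For reference, a minimal usage sketch follows. The directory, CSV path, and the four field/feature names are hypothetical placeholders; the `features` mapping must name exactly the keys stored in the .npz files (the hard-coded `types` tuple above expects four fields plus the four normalization arrays).

# Minimal usage sketch -- paths and field/feature names are hypothetical
features = {'X': 'features',
            'y_extent': 'extent',
            'y_boundary': 'boundary',
            'y_distance': 'distance'}

ds = npz_dir_dataset('/data/patchlets_npz',
                     features,
                     metadata_path='/data/patchlet_info.csv',
                     fold=1,
                     randomize=True,
                     num_parallel=5)

# Inspect one interleaved sample
for sample in ds.take(1):
    print(sample)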
Code example #2
import os

import numpy as np
import pandas as pd
from fs_s3fs import S3FS
from skimage.exposure import match_histograms

# `ImageSet` and `pad_to_k` are project-internal helpers, assumed to be
# importable from the surrounding package.

def read_imageset(imset_file: str,
                  filesystem: S3FS = None,
                  normalize: bool = True,
                  country_norm_df: pd.DataFrame = None,
                  norm_deimos_npz: np.lib.npyio.NpzFile = None,
                  norm_s2_npz: np.lib.npyio.NpzFile = None,
                  n_views: int = 16,
                  padding: str = 'zeros',
                  histogram_matching: bool = False) -> ImageSet:
    """
    Retrieves all assets from the given directory.

    :param imset_file: name of npz file with sample imageset
    :param filesystem: S3 filesystem to read files directly from bucket. Default reads from local disk
    :param normalize: whether to normalize data or not
    :param country_norm_df: S2 median/std normalization factors stored per country
    :param norm_deimos_npz: 1st and 99th percentile normalization factors for DEIMOS
    :param norm_s2_npz: 1st and 99th percentile normalization factors for S2
    :param n_views: number of time frames to consider in lrs sequence. If n_views is smaller than the available time
                    frames, `n_views` timeframes from the lrs sequence are taken in reverted order, i.e. last is first
    :param padding: strategy used to fill lrs sequence if n_views is greater than available timestamps. Supported
                    options are `zeros`, where 0 frames are prepended to features, or `repeat` where random repeats of
                    timeframes are taken
    :param histogram_matching: whether to match the histogram between the HR and the corresponding LR image
    """
    assert padding in ['zeros', 'repeat'], f'Unsupported padding strategy: {padding}'

    # Load the sample imageset archive
    if filesystem:
        npz = np.load(filesystem.openbin(imset_file), allow_pickle=True)
    else:
        npz = np.load(imset_file, allow_pickle=True)

    features = npz['features']
    hr = npz['labels']

    if normalize:
        country = npz['countries']
        country_stats = country_norm_df[country_norm_df.country == str(country)]

        norm_median = country_stats[['median_0', 'median_1', 'median_2', 'median_3']].values
        norm_std = country_stats[['std_0', 'std_1', 'std_2', 'std_3']].values
        features = (features - norm_median) / norm_std

        deimos_p1 = norm_deimos_npz['p1']
        deimos_p99 = norm_deimos_npz['p99']

        s2_p1 = norm_s2_npz['p1']
        s2_p99 = norm_s2_npz['p99']

        hr = (hr - deimos_p1) / (deimos_p99 - deimos_p1)
        features = (features - s2_p1) / (s2_p99 - s2_p1)

    # alphas flag which of the n_views timeframes hold real data (1) vs padding (0)
    alphas = np.ones(n_views)

    if histogram_matching:
        hr = match_histograms(hr, features[-1], multichannel=True)

    # Pad or crop the lrs sequence to exactly n_views timeframes
    n_feature_timestamps = len(features)
    if n_feature_timestamps < n_views:
        if padding == 'zeros':
            features = pad_to_k(features, n_views, pad_to_front=False)
            alphas[n_feature_timestamps:] = 0  # mark the appended zero-frames as padding
        elif padding == 'repeat':
            n_pad = n_views - n_feature_timestamps
            padded = features[-1:].repeat(n_pad, axis=0)
            features = np.concatenate((features, padded))
    else:
        features = features[-n_views:, ...]  # keep the most recent n_views frames

    # Move channels to the front: features TxHxWxC -> CxTxHxW, hr HxWxC -> CxHxW
    features = np.moveaxis(features, -1, 0)
    hr = np.moveaxis(hr, 2, 0)

    imageset = ImageSet(name=os.path.basename(imset_file),
                        # NOTE: the misspelled 'timetamps_deimos' key matches the source .npz files
                        timestamp_deimos=str(npz['timetamps_deimos'].item()),
                        lr=features,
                        hr=hr,
                        alphas=alphas)
    return imageset
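
A usage sketch under the same caveat: all file names below are hypothetical, `country_norm_df` must provide `country`, `median_0..3`, and `std_0..3` columns, the percentile archives must contain `p1` and `p99` arrays, and attribute access on the result assumes `ImageSet` is a namedtuple-like container.

# Minimal usage sketch -- all file names are hypothetical
country_norm_df = pd.read_csv('s2_norm_per_country.csv')  # columns: country, median_0..3, std_0..3
norm_deimos_npz = np.load('deimos_percentiles.npz')       # arrays: p1, p99
norm_s2_npz = np.load('s2_percentiles.npz')               # arrays: p1, p99

imageset = read_imageset('imageset_0001.npz',
                         normalize=True,
                         country_norm_df=country_norm_df,
                         norm_deimos_npz=norm_deimos_npz,
                         norm_s2_npz=norm_s2_npz,
                         n_views=16,
                         padding='zeros')

print(imageset.lr.shape, imageset.hr.shape)  # lr is CxTxHxW, hr is CxHxW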