Example #1
    def _get_simulated_fault_polygons(
            self, address: SimulatedFaultPolygonsAddress
    ) -> Optional[xtgeo.Polygons]:
        """Returns a Xtgeo fault polygons instance of a single realization fault polygons"""

        timer = PerfTimer()

        fault_polygons_fns: List[str] = self._locate_simulated_fault_polygons(
            attribute=address.attribute,
            name=address.name,
            realizations=[address.realization],
        )

        if len(fault_polygons_fns) == 0:
            LOGGER.warning(f"No simulated fault polygons found for {address}")
            return None
        if len(fault_polygons_fns) > 1:
            LOGGER.warning(
                f"Multiple simulated fault polygonss found for: {address}"
                "Returning first fault polygons.")

        fault_polygons = xtgeo.polygons_from_file(fault_polygons_fns[0])

        LOGGER.debug(
            f"Loaded simulated fault polygons in: {timer.elapsed_s():.2f}s")

        return fault_polygons
Example #2
        def _handle_surface_request(full_surf_address_str: str) -> flask.Response:
            LOGGER.debug(
                f"Handling surface_request: "
                f"full_surf_address_str={full_surf_address_str} "
            )

            timer = PerfTimer()

            img_cache_key = "IMG:" + full_surf_address_str
            LOGGER.debug(f"Looking for image in cache (key={img_cache_key}")

            cached_img_bytes = self._image_cache.get(img_cache_key)
            if not cached_img_bytes:
                LOGGER.error(
                    f"Error getting image for address: {full_surf_address_str}"
                )
                flask.abort(404)

            response = flask.send_file(
                io.BytesIO(cached_img_bytes), mimetype="image/png"
            )
            LOGGER.debug(
                f"Request handled from image cache in: {timer.elapsed_s():.2f}s"
            )
            return response
Example #3
    def _get_or_create_statistical_surface(
        self, address: StatisticalSurfaceAddress
    ) -> Optional[xtgeo.RegularSurface]:

        timer = PerfTimer()

        surf = self._stat_surf_cache.fetch(address)
        if surf:
            LOGGER.debug(
                f"Fetched statistical surface from cache in: {timer.elapsed_s():.2f}s"
            )
            return surf

        surf = self._create_statistical_surface(address)
        et_create_s = timer.lap_s()

        self._stat_surf_cache.store(address, surf)
        et_write_cache_s = timer.lap_s()

        LOGGER.debug(
            f"Created and wrote statistical surface to cache in: {timer.elapsed_s():.2f}s ("
            f"create={et_create_s:.2f}s, store={et_write_cache_s:.2f}s), "
            f"[stat={address.statistic}, "
            f"attr={address.attribute}, name={address.name}, date={address.datestr}]"
        )

        return surf
Example #4
    def dates(
        self,
        resampling_frequency: Optional[Frequency],
        realizations: Optional[Sequence[int]] = None,
    ) -> List[datetime.datetime]:

        if resampling_frequency is not None:
            raise ValueError("Resampling is not supported by this provider")

        timer = PerfTimer()

        table = self._get_or_read_table(["DATE", "REAL"])
        et_read_ms = timer.lap_ms()

        if realizations:
            mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
            table = table.filter(mask)
        et_filter_ms = timer.lap_ms()

        intersected_dates = find_intersected_dates_between_realizations(table)
        et_find_unique_ms = timer.lap_ms()

        LOGGER.debug(f"dates() took: {timer.elapsed_ms()}ms ("
                     f"read={et_read_ms}ms, "
                     f"filter={et_filter_ms}ms, "
                     f"find_unique={et_find_unique_ms}ms)")

        return intersected_dates.astype(datetime.datetime).tolist()
Example #5
def load_ensemble_summary_csv_file(
        csv_file: Path, ensemble_filter: Optional[str]) -> pd.DataFrame:

    LOGGER.debug(f"load_ensemble_summary_csv_file() starting - {csv_file}")
    timer = PerfTimer()

    df: pd.DataFrame = pd.read_csv(csv_file)

    if ensemble_filter is not None:
        if "ENSEMBLE" not in df.columns:
            raise ValueError(
                "Cannot filter on ensemble, no ENSEMBLE column exist in CSV file"
            )

        df = df[df["ENSEMBLE"] == ensemble_filter]

    if "ENSEMBLE" in df.columns:
        if df["ENSEMBLE"].nunique() > 1:
            raise KeyError(
                "Input data contains more than one unique ensemble name")

        df = df.drop(columns="ENSEMBLE")

    LOGGER.debug(
        f"load_ensemble_summary_csv_file() finished in: {timer.elapsed_s():.2f}s"
    )

    return df
Example #6
def surface_to_png_bytes_optimized(surface: xtgeo.RegularSurface) -> bytes:

    timer = PerfTimer()
    # Note that returned values array is a 2d masked array
    surf_values_ma: np.ma.MaskedArray = surface.values

    surf_values_ma = np.flip(surf_values_ma.transpose(), axis=0)  # type: ignore
    LOGGER.debug(f"flip/transpose: {timer.lap_s():.2f}s")

    # This will be a flat bool array with true for all valid entries
    valid_arr = np.invert(np.ma.getmaskarray(surf_values_ma).flatten())
    LOGGER.debug(f"get valid_arr: {timer.lap_s():.2f}s")

    shape = surf_values_ma.shape
    min_val = surf_values_ma.min()
    max_val = surf_values_ma.max()
    LOGGER.debug(f"minmax: {timer.lap_s():.2f}s")

    # Guard against division by zero when the surface has a constant value
    if min_val == max_val:
        scale_factor = 1.0
    else:
        scale_factor = (256 * 256 * 256 - 1) / (max_val - min_val)

    # Scale the values into the wanted range
    scaled_values_ma = (surf_values_ma - min_val) * scale_factor

    # Get a NON-masked array with all undefined entries filled with 0
    scaled_values = scaled_values_ma.filled(0)

    LOGGER.debug(f"scale and fill: {timer.lap_s():.2f}s")

    val_arr = scaled_values.astype(np.uint32).ravel()
    LOGGER.debug(f"cast and flatten: {timer.lap_s():.2f}s")

    val = val_arr.view(dtype=np.uint8)
    rgba_arr = np.empty(4 * len(val_arr), dtype=np.uint8)
    rgba_arr[0::4] = val[2::4]
    rgba_arr[1::4] = val[1::4]
    rgba_arr[2::4] = val[0::4]
    rgba_arr[3::4] = np.multiply(valid_arr, 255).astype(np.uint8)

    LOGGER.debug(f"rgba combine: {timer.lap_s():.2f}s")

    # Back to 2d shape + 1 dimension for the rgba values.
    rgba_arr_reshaped = rgba_arr.reshape((shape[0], shape[1], 4))

    image = Image.fromarray(rgba_arr_reshaped, "RGBA")
    LOGGER.debug(f"create: {timer.lap_s():.2f}s")

    byte_io = io.BytesIO()
    image.save(byte_io, format="png", compress_level=1)
    LOGGER.debug(f"save png to bytes: {timer.lap_s():.2f}s")

    byte_io.seek(0)
    ret_bytes = byte_io.read()
    LOGGER.debug(f"read bytes: {timer.lap_s():.2f}s")

    LOGGER.debug(f"Total time: {timer.elapsed_s():.2f}s")

    return ret_bytes
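The packing above stores each scaled z-value in the 24 bits spanned by the R, G and B channels (R being the most significant byte on the little-endian layout the uint8 view assumes) and uses the alpha channel as a validity mask. As a minimal, hypothetical sketch of the inverse operation, assuming the caller knows the same min/max values used for encoding (the function below is made up for illustration):

import io

import numpy as np
from PIL import Image


def png_bytes_to_values(
    png_bytes: bytes, min_val: float, max_val: float
) -> np.ma.MaskedArray:
    # Decode the RGBA image; note that the array is in image orientation,
    # i.e. still flipped/transposed relative to the original surface.values
    rgba = np.asarray(Image.open(io.BytesIO(png_bytes)).convert("RGBA"), dtype=np.uint32)

    # Reassemble the 24-bit integer: R holds the most significant byte
    packed = rgba[..., 0] * 65536 + rgba[..., 1] * 256 + rgba[..., 2]

    if max_val == min_val:
        scale_factor = 1.0
    else:
        scale_factor = (256 * 256 * 256 - 1) / (max_val - min_val)

    values = packed / scale_factor + min_val

    # Alpha == 0 marks undefined cells
    return np.ma.masked_where(rgba[..., 3] == 0, values)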
Example #7
    def publish_surface(
        self,
        qualified_address: Union[QualifiedSurfaceAddress, QualifiedDiffSurfaceAddress],
        surface: xtgeo.RegularSurface,
    ) -> None:
        timer = PerfTimer()

        if isinstance(qualified_address, QualifiedSurfaceAddress):
            base_cache_key = _address_to_str(
                qualified_address.provider_id, qualified_address.address
            )
        else:
            base_cache_key = _diff_address_to_str(
                qualified_address.provider_id_a,
                qualified_address.address_a,
                qualified_address.provider_id_b,
                qualified_address.address_b,
            )

        LOGGER.debug(
            f"Publishing surface (dim={surface.dimensions}, #cells={surface.ncol*surface.nrow}), "
            f"[base_cache_key={base_cache_key}]"
        )

        self._create_and_store_image_in_cache(base_cache_key, surface)

        LOGGER.debug(f"Surface published in: {timer.elapsed_s():.2f}s")
Example #8
    def _get_observed_surface(
        self, address: ObservedSurfaceAddress
    ) -> Optional[xtgeo.RegularSurface]:
        """Returns a Xtgeo surface instance for an observed surface"""

        timer = PerfTimer()

        surf_fns: List[str] = self._locate_observed_surfaces(
            attribute=address.attribute,
            name=address.name,
            datestr=address.datestr if address.datestr is not None else "",
        )

        if len(surf_fns) == 0:
            LOGGER.warning(f"No observed surface found for {address}")
            return None
        if len(surf_fns) > 1:
            LOGGER.warning(
                f"Multiple observed surfaces found for: {address}"
                "Returning first surface."
            )

        surf = xtgeo.surface_from_file(surf_fns[0])

        LOGGER.debug(f"Loaded simulated surface in: {timer.elapsed_s():.2f}s")

        return surf
Example #9
    def create_from_arrow_unsmry_lazy(
            self, ens_path: str,
            rel_file_pattern: str) -> EnsembleSummaryProvider:
        """Create EnsembleSummaryProvider from per-realization unsmry data in .arrow format.

        The `rel_file_pattern` parameter must specify a relative (per realization) file pattern
        that will be used to find the wanted .arrow files within each realization. The file
        pattern is relative to each realization's `runpath`.
        Typically the file pattern will be: "share/results/unsmry/*.arrow"

        The returned summary provider supports lazy resampling.
        """

        timer = PerfTimer()

        storage_key = (
            f"arrow_unsmry_lazy__{_make_hash_string(ens_path + rel_file_pattern)}"
        )
        provider = ProviderImplArrowLazy.from_backing_store(
            self._storage_dir, storage_key)
        if provider:
            LOGGER.info(
                f"Loaded lazy summary provider from backing store in {timer.elapsed_s():.2f}s ("
                f"ens_path={ens_path})")
            return provider

        # We can only import data from data source if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(
                f"Failed to load lazy summary provider for {ens_path}")

        LOGGER.info(f"Importing/saving arrow summary data for: {ens_path}")

        timer.lap_s()
        per_real_tables = load_per_realization_arrow_unsmry_files(
            ens_path, rel_file_pattern)
        if not per_real_tables:
            raise ValueError(
                f"Could not find any .arrow unsmry files for ens_path={ens_path}"
            )
        et_import_smry_s = timer.lap_s()

        ProviderImplArrowLazy.write_backing_store_from_per_realization_tables(
            self._storage_dir, storage_key, per_real_tables)
        et_write_s = timer.lap_s()

        provider = ProviderImplArrowLazy.from_backing_store(
            self._storage_dir, storage_key)
        if not provider:
            raise ValueError(
                f"Failed to load/create lazy provider for {ens_path}")

        LOGGER.info(
            f"Saved lazy summary provider to backing store in {timer.elapsed_s():.2f}s ("
            f"import_smry={et_import_smry_s:.2f}s, write={et_write_s:.2f}s, ens_path={ens_path})"
        )

        return provider
Example #10
    def create_from_per_realization_csv_file(
        self, ens_path: str, csv_file_rel_path: str
    ) -> EnsembleSummaryProvider:
        """Create EnsembleSummaryProvider from per realization CSV files.

        Note that the returned summary provider does not support resampling, nor will it
        be able to return vector metadata.
        """

        timer = PerfTimer()

        storage_key = f"per_real_csv__{_make_hash_string(ens_path + csv_file_rel_path)}"
        provider = ProviderImplArrowPresampled.from_backing_store(
            self._storage_dir, storage_key
        )

        if provider:
            LOGGER.info(
                f"Loaded summary provider (per real CSV) from backing store in "
                f"{timer.elapsed_s():.2f}s ("
                f"ens_path={ens_path}, csv_file_rel_path={csv_file_rel_path})"
            )
            return provider

        # We can only import data from CSV if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(
                f"Failed to load summary provider (per real CSV) for {ens_path}"
            )

        LOGGER.info(f"Importing/saving per real CSV summary data for: {ens_path}")

        timer.lap_s()

        ensemble_df = load_per_real_csv_file_using_fmu(ens_path, csv_file_rel_path)
        et_import_csv_s = timer.lap_s()

        ProviderImplArrowPresampled.write_backing_store_from_ensemble_dataframe(
            self._storage_dir, storage_key, ensemble_df
        )
        et_write_s = timer.lap_s()

        provider = ProviderImplArrowPresampled.from_backing_store(
            self._storage_dir, storage_key
        )

        if not provider:
            raise ValueError(
                f"Failed to load/create provider (per real CSV) for {ens_path}"
            )

        LOGGER.info(
            f"Saved summary provider (per real CSV) to backing store in {timer.elapsed_s():.2f}s ("
            f"import_csv={et_import_csv_s:.2f}s, write={et_write_s:.2f}s, "
            f"ens_path={ens_path}, csv_file_rel_path={csv_file_rel_path})"
        )

        return provider
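A minimal usage sketch for the two factory methods above; `factory` is assumed to be an instance of the factory class they belong to, and the ensemble path, file pattern and vector name are made up:

provider = factory.create_from_arrow_unsmry_lazy(
    ens_path="/scratch/my_field/my_case/realization-*/iter-0",
    rel_file_pattern="share/results/unsmry/*.arrow",
)

# The returned provider supports lazy resampling; pass None to get raw dates
df = provider.get_vectors_df(
    vector_names=["FOPT"],
    resampling_frequency=None,
    realizations=[0, 1, 2],
)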
Example #11
    def _create_statistical_surface(
        self, address: StatisticalSurfaceAddress
    ) -> Optional[xtgeo.RegularSurface]:
        surf_fns: List[str] = self._locate_simulated_surfaces(
            attribute=address.attribute,
            name=address.name,
            datestr=address.datestr if address.datestr is not None else "",
            realizations=address.realizations,
        )

        if len(surf_fns) == 0:
            LOGGER.warning(f"No input surfaces found for statistical surface {address}")
            return None

        timer = PerfTimer()

        surfaces = xtgeo.Surfaces(surf_fns)
        et_load_s = timer.lap_s()

        surf_count = len(surfaces.surfaces)
        if surf_count == 0:
            LOGGER.warning(
                f"Could not load input surfaces for statistical surface {address}"
            )
            return None

        # print("########################################################")
        # first_surf = surfaces.surfaces[0]
        # for surf in surfaces.surfaces:
        #     print(
        #         surf.dimensions,
        #         surf.xinc,
        #         surf.yinc,
        #         surf.xori,
        #         surf.yori,
        #         surf.rotation,
        #         surf.filesrc,
        #     )
        # print("########################################################")

        # Suppress numpy warnings when surfaces have undefined z-values
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", "All-NaN slice encountered")
            warnings.filterwarnings("ignore", "Mean of empty slice")
            warnings.filterwarnings("ignore", "Degrees of freedom <= 0 for slice")

            stat_surface = _calc_statistic_across_surfaces(address.statistic, surfaces)
        et_calc_s = timer.lap_s()

        LOGGER.debug(
            f"Created statistical surface in: {timer.elapsed_s():.2f}s ("
            f"load={et_load_s:.2f}s, calc={et_calc_s:.2f}s), "
            f"[#surfaces={surf_count}, stat={address.statistic}, "
            f"attr={address.attribute}, name={address.name}, date={address.datestr}]"
        )

        return stat_surface
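_calc_statistic_across_surfaces() is not part of this example. As a rough, hypothetical sketch of what a mean statistic across the loaded surfaces could look like, assuming all surfaces share the same grid definition and using plain numpy rather than any xtgeo aggregation helper:

import numpy as np
import xtgeo


def _mean_surface_sketch(surfaces: xtgeo.Surfaces) -> xtgeo.RegularSurface:
    # Use the first surface as a template for the geometry and replace its
    # values with the element-wise mean across all loaded surfaces
    template = surfaces.surfaces[0].copy()
    stacked = np.ma.stack([surf.values for surf in surfaces.surfaces])
    template.values = np.ma.mean(stacked, axis=0)
    return template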
Example #12
    def write_backing_store(
        storage_dir: Path,
        storage_key: str,
        well_file_names: List[str],
        md_logname: Optional[str],
    ) -> None:

        timer = PerfTimer()

        # All data for this provider will be stored inside a sub-directory
        # given by the storage key
        provider_dir = storage_dir / storage_key
        LOGGER.debug(f"Writing well backing store to: {provider_dir}")
        provider_dir.mkdir(parents=True, exist_ok=True)

        inventory_dict: Dict[str, dict] = {}

        LOGGER.debug(
            f"Writing {len(well_file_names)} wells into backing store...")

        timer.lap_s()
        for file_name in well_file_names:
            well = xtgeo.well_from_file(wfile=file_name, mdlogname=md_logname)

            if well.mdlogname is None:
                try:
                    well.geometrics()
                except ValueError:
                    LOGGER.debug(
                        f"Ignoring {well.name} as MD cannot be calculated")
                    continue

            print("well.mdlogname=", well.mdlogname)

            well_name = well.name
            rel_path = f"{well_name}.rmswell"
            # rel_path = f"{well_name}.hdf"

            dst_file = provider_dir / rel_path
            print("dst_file=", dst_file)
            well.to_file(wfile=dst_file, fformat="rmswell")
            # well.to_hdf(wfile=dst_file)

            inventory_dict[well_name] = {
                INV_KEY_REL_PATH: rel_path,
                INV_KEY_MD_LOGNAME: well.mdlogname,
            }

        et_copy_s = timer.lap_s()

        json_fn = provider_dir / "inventory.json"
        with open(json_fn, "w") as file:
            json.dump(inventory_dict, file)

        LOGGER.debug(f"Wrote well backing store in: {timer.elapsed_s():.2f}s ("
                     f"copy={et_copy_s:.2f}s)")
Example #13
    def create_from_ensemble_surface_files(
        self,
        ens_path: str,
        rel_surface_folder: str = "share/results/maps",
        attribute_filter: Optional[List[str]] = None,
    ) -> EnsembleSurfaceProvider:
        timer = PerfTimer()
        string_to_hash = (
            f"{ens_path}_{rel_surface_folder}" if attribute_filter is None else
            (f"{ens_path}_{rel_surface_folder}_"
             f"{'_'.join([str(attr) for attr in attribute_filter])}"))
        storage_key = f"ens__{_make_hash_string(string_to_hash)}"
        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if provider:
            LOGGER.info(
                f"Loaded surface provider from backing store in {timer.elapsed_s():.2f}s ("
                f"ens_path={ens_path})")
            return provider

        # We can only import data from data source if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(f"Failed to load surface provider for {ens_path}")

        LOGGER.info(f"Importing/copying surface data for: {ens_path}")

        timer.lap_s()
        sim_surface_files = discover_per_realization_surface_files(
            ens_path, rel_surface_folder, attribute_filter)
        obs_surface_files = discover_observed_surface_files(
            ens_path, attribute_filter)
        et_discover_s = timer.lap_s()

        # As an optimization, avoid copying the surfaces into the backing store,
        # typically when we're running in non-portable mode
        ProviderImplFile.write_backing_store(
            self._storage_dir,
            storage_key,
            sim_surfaces=sim_surface_files,
            obs_surfaces=obs_surface_files,
            avoid_copying_surfaces=self._avoid_copying_surfaces,
        )
        et_write_s = timer.lap_s()

        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if not provider:
            raise ValueError(
                f"Failed to load/create surface provider for {ens_path}")

        LOGGER.info(
            f"Saved surface provider to backing store in {timer.elapsed_s():.2f}s ("
            f"discover={et_discover_s:.2f}s, write={et_write_s:.2f}s, ens_path={ens_path})"
        )

        return provider
Example #14
    def create_from_well_files(self, well_folder: str, well_suffix: str,
                               md_logname: Optional[str]) -> WellProvider:
        timer = PerfTimer()

        file_pattern = str(Path(well_folder) / f"*{well_suffix}")
        storage_key = f"from_files__{_make_hash_string(f'{file_pattern}_{md_logname}')}"

        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if provider:
            LOGGER.info(
                f"Loaded well provider from backing store in {timer.elapsed_s():.2f}s ("
                f"file_pattern={file_pattern})")
            return provider

        # We can only import data from data source if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(
                f"Failed to load well provider for {file_pattern}")

        LOGGER.info(f"Importing/writing well data for: {file_pattern}")

        timer.lap_s()
        src_file_names = sorted([
            str(filename)
            for filename in Path(well_folder).glob(f"*{well_suffix}")
        ])
        et_discover_s = timer.lap_s()

        ProviderImplFile.write_backing_store(
            self._storage_dir,
            storage_key,
            well_file_names=src_file_names,
            md_logname=md_logname,
        )
        et_write_s = timer.lap_s()

        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if not provider:
            raise ValueError(
                f"Failed to load/create well provider for {file_pattern}")

        LOGGER.info(
            f"Saved well provider to backing store in {timer.elapsed_s():.2f}s ("
            f"discover={et_discover_s:.2f}s, write={et_write_s:.2f}s, file_pattern={file_pattern})"
        )

        return provider
Example #15
def load_per_real_csv_file_using_fmu(ens_path: str,
                                     csv_file_rel_path: str) -> pd.DataFrame:

    LOGGER.debug(
        f"load_per_real_csv_file_using_fmu() starting - {csv_file_rel_path}")
    timer = PerfTimer()

    scratch_ensemble = ScratchEnsemble("tempEnsName",
                                       ens_path,
                                       autodiscovery=True)
    df = scratch_ensemble.load_csv(csv_file_rel_path)

    LOGGER.debug(
        f"load_per_real_csv_file_using_fmu() finished in: {timer.elapsed_s():.2f}s"
    )

    return df
Example #16
    def create_from_ensemble_fault_polygons_files(
            self, ens_path: str) -> EnsembleFaultPolygonsProvider:
        timer = PerfTimer()

        storage_key = f"ens__{_make_hash_string(ens_path)}"
        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if provider:
            LOGGER.info(
                f"Loaded fault polygons provider from backing store in {timer.elapsed_s():.2f}s ("
                f"ens_path={ens_path})")
            return provider

        # We can only import data from data source if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(
                f"Failed to load fault polygons provider for {ens_path}")

        LOGGER.info(f"Importing/copying fault polygons data for: {ens_path}")

        timer.lap_s()
        sim_fault_polygons_files = discover_per_realization_fault_polygons_files(
            ens_path)

        et_discover_s = timer.lap_s()

        ProviderImplFile.write_backing_store(
            self._storage_dir,
            storage_key,
            sim_fault_polygons=sim_fault_polygons_files,
        )
        et_write_s = timer.lap_s()

        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if not provider:
            raise ValueError(
                f"Failed to load/create fault polygons provider for {ens_path}"
            )

        LOGGER.info(
            f"Saved fault polygons provider to backing store in {timer.elapsed_s():.2f}s ("
            f"discover={et_discover_s:.2f}s, write={et_write_s:.2f}s, ens_path={ens_path})"
        )

        return provider
Example #17
    def __init__(self, arrow_file_name: Path) -> None:
        self._arrow_file_name = str(arrow_file_name)

        LOGGER.debug(f"init with arrow file: {self._arrow_file_name}")
        timer = PerfTimer()

        source = pa.memory_map(self._arrow_file_name, "r")
        et_open_ms = timer.lap_ms()

        reader = pa.ipc.RecordBatchFileReader(source)
        et_create_reader_ms = timer.lap_ms()

        # Discover columns and realizations that are present in the file
        column_names_on_file = reader.schema.names
        self._vector_names: List[str] = [
            colname
            for colname in column_names_on_file
            if colname not in ["DATE", "REAL", "ENSEMBLE"]
        ]
        et_find_vec_names_ms = timer.lap_ms()

        unique_realizations_on_file = reader.read_all().column("REAL").unique()
        self._realizations: List[int] = unique_realizations_on_file.to_pylist()
        et_find_real_ms = timer.lap_ms()

        # We'll try and keep the file open for the life-span of the provider.
        # Done to try and stop blobfuse from throwing the file out of its cache.
        self._cached_reader = reader

        # For testing, uncomment code below and we will be more aggressive
        # and keep the "raw" table in memory
        self._cached_full_table = None
        # self._cached_full_table = reader.read_all()

        LOGGER.debug(
            f"init took: {timer.elapsed_s():.2f}s, "
            f"(open={et_open_ms}ms, create_reader={et_create_reader_ms}ms, "
            f"find_vec_names={et_find_vec_names_ms}ms, find_real={et_find_real_ms}ms), "
            f"#vector_names={len(self._vector_names)}, "
            f"#realization={len(self._realizations)}"
        )

        if not self._realizations:
            raise ValueError("Init from backing store failed: no realizations found")
        if not self._vector_names:
            raise ValueError("Init from backing store failed: no vector names found")
Example #18
    def get_vectors_for_date_df(
        self,
        date: datetime.datetime,
        vector_names: Sequence[str],
        realizations: Optional[Sequence[int]] = None,
    ) -> pd.DataFrame:

        timer = PerfTimer()

        columns_to_get = ["DATE", "REAL"]
        columns_to_get.extend(vector_names)
        table = self._get_or_read_table(columns_to_get)
        et_read_ms = timer.lap_ms()

        # Note that we use MS here to be aligned with storage type in arrow file
        lookup_date = pa.scalar(date, type=pa.timestamp("ms"))
        mask = pc.equal(table["DATE"], lookup_date)

        if realizations:
            real_mask = pc.is_in(table["REAL"],
                                 value_set=pa.array(realizations))
            mask = pc.and_(mask, real_mask)

        table = table.drop(["DATE"])

        # table = table.filter(mask).combine_chunks()
        table = table.filter(mask)
        et_filter_ms = timer.lap_ms()

        df = table.to_pandas()
        # df = table.to_pandas(split_blocks=True, zero_copy_only=True)
        # del table  # not necessary, but a good practice
        et_to_pandas_ms = timer.lap_ms()

        LOGGER.debug(
            f"get_vectors_for_date_df() took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"filter={et_filter_ms}ms, "
            f"to_pandas={et_to_pandas_ms}ms), "
            f"#vecs={len(vector_names)}, "
            f"#real={len(realizations) if realizations else 'all'}, "
            f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}")

        return df
Example #19
    def get_vectors_for_date_df(
        self,
        date: datetime.datetime,
        vector_names: Sequence[str],
        realizations: Optional[Sequence[int]] = None,
    ) -> pd.DataFrame:

        if not vector_names:
            raise ValueError("List of requested vector names is empty")

        timer = PerfTimer()

        columns_to_get = ["DATE", "REAL"]
        columns_to_get.extend(vector_names)
        table = self._get_or_read_table(columns_to_get)
        et_read_ms = timer.lap_ms()

        if realizations:
            real_mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
            table = table.filter(real_mask)
        et_filter_ms = timer.lap_ms()

        np_lookup_date = np.datetime64(date, "ms")
        table = sample_segmented_multi_real_table_at_date(table, np_lookup_date)

        et_resample_ms = timer.lap_ms()
        table = table.drop(["DATE"])

        df = table.to_pandas()
        et_to_pandas_ms = timer.lap_ms()

        LOGGER.debug(
            f"get_vectors_for_date_df() took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"filter={et_filter_ms}ms, "
            f"resample={et_resample_ms}ms, "
            f"to_pandas={et_to_pandas_ms}ms), "
            f"#vecs={len(vector_names)}, "
            f"#real={len(realizations) if realizations else 'all'}, "
            f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}"
        )

        return df
Example #20
    def _create_and_store_image_in_cache(
        self,
        base_cache_key: str,
        surface: xtgeo.RegularSurface,
    ) -> None:

        timer = PerfTimer()

        LOGGER.debug("Converting surface to PNG image...")
        png_bytes: bytes = surface_to_png_bytes_optimized(surface)
        LOGGER.debug(f"Got PNG image, size={(len(png_bytes) / (1024 * 1024)):.2f}MB")
        et_to_image_s = timer.lap_s()

        img_cache_key = "IMG:" + base_cache_key
        meta_cache_key = "META:" + base_cache_key

        self._image_cache.add(img_cache_key, png_bytes)

        # For debugging rotations
        # unrot_surf = surface.copy()
        # unrot_surf.unrotate()
        # unrot_surf.quickplot("/home/sigurdp/gitRoot/hk-webviz-subsurface/quickplot.png")

        deckgl_bounds, deckgl_rot = _calc_map_component_bounds_and_rot(surface)

        meta = SurfaceMeta(
            x_min=surface.xmin,
            x_max=surface.xmax,
            y_min=surface.ymin,
            y_max=surface.ymax,
            val_min=surface.values.min(),
            val_max=surface.values.max(),
            deckgl_bounds=deckgl_bounds,
            deckgl_rot_deg=deckgl_rot,
        )
        self._image_cache.add(meta_cache_key, meta)
        et_write_cache_s = timer.lap_s()

        LOGGER.debug(
            f"Created image and wrote to cache in in: {timer.elapsed_s():.2f}s ("
            f"to_image={et_to_image_s:.2f}s, write_cache={et_write_cache_s:.2f}s), "
            f"[base_cache_key={base_cache_key}]"
        )
Example #21
    def get_vectors_df(
        self,
        vector_names: Sequence[str],
        resampling_frequency: Optional[Frequency],
        realizations: Optional[Sequence[int]] = None,
    ) -> pd.DataFrame:

        if not vector_names:
            raise ValueError("List of requested vector names is empty")

        timer = PerfTimer()

        columns_to_get = ["DATE", "REAL"]
        columns_to_get.extend(vector_names)
        table = self._get_or_read_table(columns_to_get)
        et_read_ms = timer.lap_ms()

        if realizations:
            mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
            table = table.filter(mask)
        et_filter_ms = timer.lap_ms()

        if resampling_frequency is not None:
            table = resample_segmented_multi_real_table(table, resampling_frequency)
        et_resample_ms = timer.lap_ms()

        df = table.to_pandas(timestamp_as_object=True)
        et_to_pandas_ms = timer.lap_ms()

        LOGGER.debug(
            f"get_vectors_df({resampling_frequency}) took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"filter={et_filter_ms}ms, "
            f"resample={et_resample_ms}ms, "
            f"to_pandas={et_to_pandas_ms}ms), "
            f"#vecs={len(vector_names)}, "
            f"#real={len(realizations) if realizations else 'all'}, "
            f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}"
        )

        return df
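A short, hypothetical call against the method above, assuming `provider` is an instance of this lazy provider and that the `Frequency` enum exposes a MONTHLY member:

monthly_df = provider.get_vectors_df(
    vector_names=["FOPT"],  # made-up vector name
    resampling_frequency=Frequency.MONTHLY,
    realizations=None,  # None means all realizations
)
print(monthly_df[["DATE", "REAL", "FOPT"]].head())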
Example #22
        def _handle_wells_request(provider_id: str,
                                  well_names_str: str) -> flask.Response:
            LOGGER.debug(f"Handling well request: "
                         f"provider_id={provider_id} "
                         f"well_names_str={well_names_str} ")

            timer = PerfTimer()

            try:
                provider = self._id_to_provider_dict[provider_id]
                well_names_arr = well_names_str.split("~")
            # pylint: disable=bare-except
            except:
                LOGGER.error("Error decoding wells address")
                flask.abort(404)

            validate_geometry = True
            feature_arr = []
            for wname in well_names_arr:
                well_path = provider.get_well_path(wname)

                coords = list(
                    zip(well_path.x_arr, well_path.y_arr, well_path.z_arr))
                # coords = coords[0::20]
                point = geojson.Point(coordinates=[coords[0][0], coords[0][1]],
                                      validate=validate_geometry)

                geocoll = geojson.GeometryCollection(geometries=[point])

                feature = geojson.Feature(id=wname,
                                          geometry=geocoll,
                                          properties={"name": wname})
                feature_arr.append(feature)

            featurecoll = geojson.FeatureCollection(features=feature_arr)
            response = flask.Response(geojson.dumps(featurecoll),
                                      mimetype="application/geo+json")

            LOGGER.debug(f"Request handled in: {timer.elapsed_s():.2f}s")
            return response
Example #23
def load_per_realization_arrow_unsmry_files(
        ens_path: str, rel_file_pattern: str) -> Dict[int, pa.Table]:
    """Load summary data stored in per-realization arrow files.
    Returns dictionary containing a PyArrow table for each realization, indexed by
    realization number.

    `rel_file_pattern` denotes a file pattern relative to the realization's runpath,
    typical value is: "share/results/unsmry/*.arrow"
    """

    LOGGER.debug(
        f"load_per_realization_arrow_unsmry_files() starting - {ens_path}")
    LOGGER.debug(
        f"looking for .arrow files using relative pattern: {rel_file_pattern}")
    timer = PerfTimer()

    per_real_tables: Dict[int, pa.Table] = {}
    globpattern = os.path.join(ens_path, rel_file_pattern)
    files_to_process = _discover_arrow_unsmry_files(globpattern)
    if len(files_to_process) == 0:
        LOGGER.warning(f"No arrow files were discovered in: {ens_path}")
        LOGGER.warning(f"Glob pattern used: {globpattern}")
        return per_real_tables

    with ProcessPoolExecutor() as executor:
        futures = executor.map(_load_table_from_arrow_file, files_to_process)
        for i, table in enumerate(futures):
            real = files_to_process[i].real
            per_real_tables[real] = table

    # for entry in files_to_process:
    #     table = _load_table_from_arrow_file(entry)
    #     per_real_tables[entry.real] = table

    LOGGER.debug(f"load_per_realization_arrow_unsmry_files() "
                 f"finished in: {timer.elapsed_s():.2f}s")

    return per_real_tables
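A minimal usage sketch; the ensemble path and file pattern below are made up:

per_real_tables = load_per_realization_arrow_unsmry_files(
    ens_path="/scratch/my_field/my_case/realization-*/iter-0",
    rel_file_pattern="share/results/unsmry/*.arrow",
)
for real, table in per_real_tables.items():
    # Each value is a pyarrow.Table with one row per timestep
    print(f"realization {real}: {table.num_rows} rows, {table.num_columns} columns")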
Example #24
    def get_vectors_df(
        self,
        vector_names: Sequence[str],
        resampling_frequency: Optional[Frequency],
        realizations: Optional[Sequence[int]] = None,
    ) -> pd.DataFrame:

        if resampling_frequency is not None:
            raise ValueError("Resampling is not supported by this provider")

        timer = PerfTimer()

        columns_to_get = ["DATE", "REAL"]
        columns_to_get.extend(vector_names)
        table = self._get_or_read_table(columns_to_get)
        et_read_ms = timer.lap_ms()

        if realizations:
            mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
            table = table.filter(mask)
        et_filter_ms = timer.lap_ms()

        df = table.to_pandas(timestamp_as_object=True)
        # df = table.to_pandas(split_blocks=True, self_destruct=True)
        # del table  # not necessary, but a good practice
        et_to_pandas_ms = timer.lap_ms()

        LOGGER.debug(
            f"get_vectors_df() took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"filter={et_filter_ms}ms, "
            f"to_pandas={et_to_pandas_ms}ms), "
            f"#vecs={len(vector_names)}, "
            f"#real={len(realizations) if realizations else 'all'}, "
            f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}")

        return df
Example #25
    def vector_names_filtered_by_value(
        self,
        exclude_all_values_zero: bool = False,
        exclude_constant_values: bool = False,
    ) -> List[str]:

        timer = PerfTimer()

        schema = self._get_or_read_schema()
        et_read_ms = timer.lap_ms()

        per_vector_min_max = get_per_vector_min_max_from_schema_metadata(schema)
        et_get_min_max_ms = timer.lap_ms()

        ret_vec_names: List[str] = []
        for vec_name in self._vector_names:
            minval = per_vector_min_max[vec_name]["min"]
            maxval = per_vector_min_max[vec_name]["max"]

            if minval == maxval:
                if exclude_constant_values:
                    continue

                if exclude_all_values_zero and minval == 0:
                    continue

            ret_vec_names.append(vec_name)
        et_filter_ms = timer.lap_ms()

        LOGGER.debug(
            f"vector_names_filtered_by_value() took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"get_min_max={et_get_min_max_ms}ms, "
            f"filter={et_filter_ms}ms)"
        )

        return ret_vec_names
Example #26
    def dates(
        self,
        resampling_frequency: Optional[Frequency],
        realizations: Optional[Sequence[int]] = None,
    ) -> List[datetime.datetime]:

        timer = PerfTimer()

        table = self._get_or_read_table(["DATE", "REAL"])
        et_read_ms = timer.lap_ms()

        if realizations:
            mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
            table = table.filter(mask)
        et_filter_ms = timer.lap_ms()

        if resampling_frequency is not None:
            unique_dates_np = table.column("DATE").unique().to_numpy()
            min_raw_date = np.min(unique_dates_np)
            max_raw_date = np.max(unique_dates_np)
            intersected_dates = generate_normalized_sample_dates(
                min_raw_date, max_raw_date, resampling_frequency
            )
        else:
            intersected_dates = find_intersected_dates_between_realizations(table)

        et_find_unique_ms = timer.lap_ms()

        LOGGER.debug(
            f"dates({resampling_frequency}) took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"filter={et_filter_ms}ms, "
            f"find_unique={et_find_unique_ms}ms)"
        )

        return intersected_dates.astype(datetime.datetime).tolist()
Example #27
    def write_backing_store(
        storage_dir: Path,
        storage_key: str,
        sim_surfaces: List[SurfaceFileInfo],
        obs_surfaces: List[SurfaceFileInfo],
        avoid_copying_surfaces: bool,
    ) -> None:
        """If avoid_copying_surfaces if True, the specified surfaces will NOT be copied
        into the backing store, but will be referenced from their source locations.
        Note that this is only useful when running in non-portable mode and will fail
        in portable mode.
        """

        timer = PerfTimer()

        do_copy_surfs_into_store = not avoid_copying_surfaces

        # All data for this provider will be stored inside a sub-directory
        # given by the storage key
        provider_dir = storage_dir / storage_key
        LOGGER.debug(f"Writing surface backing store to: {provider_dir}")
        provider_dir.mkdir(parents=True, exist_ok=True)
        (provider_dir / REL_SIM_DIR).mkdir(parents=True, exist_ok=True)
        (provider_dir / REL_OBS_DIR).mkdir(parents=True, exist_ok=True)

        type_arr: List[SurfaceType] = []
        real_arr: List[int] = []
        attribute_arr: List[str] = []
        name_arr: List[str] = []
        datestr_arr: List[str] = []
        rel_path_arr: List[str] = []
        original_path_arr: List[str] = []

        for surfinfo in sim_surfaces:
            type_arr.append(SurfaceType.SIMULATED)
            real_arr.append(surfinfo.real)
            attribute_arr.append(surfinfo.attribute)
            name_arr.append(surfinfo.name)
            datestr_arr.append(surfinfo.datestr if surfinfo.datestr else "")
            original_path_arr.append(surfinfo.path)

            rel_path_in_store = ""
            if do_copy_surfs_into_store:
                rel_path_in_store = _compose_rel_sim_surf_pathstr(
                    real=surfinfo.real,
                    attribute=surfinfo.attribute,
                    name=surfinfo.name,
                    datestr=surfinfo.datestr,
                    extension=Path(surfinfo.path).suffix,
                )
            rel_path_arr.append(rel_path_in_store)

        # We want to strip out observed surfaces without a matching simulated surface
        valid_obs_surfaces = _find_observed_surfaces_corresponding_to_simulated(
            obs_surfaces=obs_surfaces, sim_surfaces=sim_surfaces
        )

        for surfinfo in valid_obs_surfaces:
            type_arr.append(SurfaceType.OBSERVED)
            real_arr.append(-1)
            attribute_arr.append(surfinfo.attribute)
            name_arr.append(surfinfo.name)
            datestr_arr.append(surfinfo.datestr if surfinfo.datestr else "")
            original_path_arr.append(surfinfo.path)

            rel_path_in_store = ""
            if do_copy_surfs_into_store:
                rel_path_in_store = _compose_rel_obs_surf_pathstr(
                    attribute=surfinfo.attribute,
                    name=surfinfo.name,
                    datestr=surfinfo.datestr,
                    extension=Path(surfinfo.path).suffix,
                )
            rel_path_arr.append(rel_path_in_store)

        timer.lap_s()
        if do_copy_surfs_into_store:
            LOGGER.debug(
                f"Copying {len(original_path_arr)} surfaces into backing store..."
            )
            _copy_surfaces_into_provider_dir(
                original_path_arr, rel_path_arr, provider_dir
            )
        et_copy_s = timer.lap_s()

        surface_inventory_df = pd.DataFrame(
            {
                Col.TYPE: type_arr,
                Col.REAL: real_arr,
                Col.ATTRIBUTE: attribute_arr,
                Col.NAME: name_arr,
                Col.DATESTR: datestr_arr,
                Col.REL_PATH: rel_path_arr,
                Col.ORIGINAL_PATH: original_path_arr,
            }
        )

        parquet_file_name = provider_dir / "surface_inventory.parquet"
        surface_inventory_df.to_parquet(path=parquet_file_name)

        if do_copy_surfs_into_store:
            LOGGER.debug(
                f"Wrote surface backing store in: {timer.elapsed_s():.2f}s ("
                f"copy={et_copy_s:.2f}s)"
            )
        else:
            LOGGER.debug(
                f"Wrote surface backing store without copying surfaces in: "
                f"{timer.elapsed_s():.2f}s"
            )
Example #28
        def _handle_request(provider_id: str, addr_type_str: str,
                            surf_address_str: str) -> flask.Response:
            LOGGER.debug(f"Handling request: "
                         f"provider_id={provider_id} "
                         f"addr_type_str={addr_type_str} "
                         f"surf_address_str={surf_address_str}")

            timer = PerfTimer()

            try:
                provider = self._id_to_provider_dict[provider_id]
                surf_address_dict = json.loads(unquote_plus(surf_address_str))
                address: Union[
                    StatisticalSurfaceAddress,
                    SimulatedSurfaceAddress,
                    ObservedSurfaceAddress,
                ]
                if addr_type_str == "sta":
                    address = StatisticalSurfaceAddress(**surf_address_dict)
                if addr_type_str == "sim":
                    address = SimulatedSurfaceAddress(**surf_address_dict)
                if addr_type_str == "obs":
                    address = ObservedSurfaceAddress(**surf_address_dict)
            except:
                LOGGER.error("Error decoding surface address")
                flask.abort(404)

            if self._image_cache:
                img_cache_key = (
                    f"provider_id={provider_id} "
                    f"addr_type={addr_type_str} address={surf_address_str}")
                LOGGER.debug(
                    f"Looking for image in cache (key={img_cache_key}, "
                    f"cache_type={self._image_cache.config['CACHE_TYPE']})")
                cached_img_bytes = self._image_cache.get(img_cache_key)
                if cached_img_bytes:
                    response = flask.send_file(io.BytesIO(cached_img_bytes),
                                               mimetype="image/png")
                    LOGGER.debug(
                        f"Request handled from image cache in: {timer.elapsed_s():.2f}s"
                    )
                    return response

            LOGGER.debug("Getting surface from provider...")
            timer.lap_s()
            surface = provider.get_surface(address)
            if not surface:
                LOGGER.error(f"Error getting surface for address: {address}")
                flask.abort(404)
            et_get_s = timer.lap_s()
            LOGGER.debug(
                f"Got surface (dimensions={surface.dimensions}, #cells={surface.ncol*surface.nrow})"
            )

            LOGGER.debug("Converting to PNG image...")
            png_bytes: bytes = surface_to_png_bytes(surface)
            LOGGER.debug(
                f"Got PNG image, size={(len(png_bytes) / (1024 * 1024)):.2f}MB"
            )
            et_to_image_s = timer.lap_s()

            LOGGER.debug("Sending image")
            response = flask.send_file(io.BytesIO(png_bytes),
                                       mimetype="image/png")
            et_send_s = timer.lap_s()

            if self._image_cache and img_cache_key:
                self._image_cache.add(img_cache_key, png_bytes)

            LOGGER.debug(
                f"Request handled in: {timer.elapsed_s():.2f}s ("
                f"get={et_get_s:.2f}s, to_image={et_to_image_s:.2f}s, send={et_send_s:.2f}s)"
            )

            return response
Example #29
    def write_backing_store_from_per_realization_tables(
        storage_dir: Path, storage_key: str, per_real_tables: Dict[int, pa.Table]
    ) -> None:
        # pylint: disable=too-many-locals
        @dataclass
        class Elapsed:
            concat_tables_s: float = -1
            build_add_real_col_s: float = -1
            sorting_s: float = -1
            find_and_store_min_max_s: float = -1
            write_s: float = -1

        elapsed = Elapsed()

        arrow_file_name = storage_dir / (storage_key + ".arrow")
        LOGGER.debug(f"Writing backing store to arrow file: {arrow_file_name}")
        timer = PerfTimer()

        unique_column_names = set()
        for real_num, table in per_real_tables.items():
            unique_column_names.update(table.schema.names)

            if "REAL" in table.schema.names:
                raise ValueError(
                    f"Input tables should not have REAL column (real={real_num})"
                )

            if table.schema.field("DATE").type != pa.timestamp("ms"):
                raise ValueError(
                    f"DATE column must have timestamp[ms] data type (real={real_num})"
                )

            if not _is_date_column_monotonically_increasing(table):
                offending_pair = _find_first_non_increasing_date_pair(table)
                raise ValueError(
                    f"DATE column must be monotonically increasing\n"
                    f"Error detected in realization: {real_num}\n"
                    f"First offending timestamps: {offending_pair}"
                )

        LOGGER.debug(
            f"Concatenating {len(per_real_tables)} tables with "
            f"{len(unique_column_names)} unique column names"
        )

        full_table = pa.concat_tables(per_real_tables.values(), promote=True)
        elapsed.concat_tables_s = timer.lap_s()

        real_arr = np.empty(full_table.num_rows, np.int32)
        table_start_idx = 0
        for real_num, real_table in per_real_tables.items():
            real_arr[table_start_idx : table_start_idx + real_table.num_rows] = real_num
            table_start_idx += real_table.num_rows

        full_table = full_table.add_column(0, "REAL", pa.array(real_arr))
        elapsed.build_add_real_col_s = timer.lap_s()

        # Must sort table on real since interpolations work per realization
        # and we utilize slicing for speed
        full_table = _sort_table_on_real_then_date(full_table)
        elapsed.sorting_s = timer.lap_s()

        # Find per column min/max values and store them as metadata on table's schema
        per_vector_min_max = find_min_max_for_numeric_table_columns(full_table)
        full_table = add_per_vector_min_max_to_table_schema_metadata(
            full_table, per_vector_min_max
        )
        elapsed.find_and_store_min_max_s = timer.lap_s()

        # feather.write_feather(full_table, dest=arrow_file_name)
        with pa.OSFile(str(arrow_file_name), "wb") as sink:
            with pa.RecordBatchFileWriter(sink, full_table.schema) as writer:
                writer.write_table(full_table)
        elapsed.write_s = timer.lap_s()

        LOGGER.debug(
            f"Wrote backing store to arrow file in: {timer.elapsed_s():.2f}s ("
            f"concat_tables={elapsed.concat_tables_s:.2f}s, "
            f"build_add_real_col={elapsed.build_add_real_col_s:.2f}s, "
            f"sorting={elapsed.sorting_s:.2f}s, "
            f"find_and_store_min_max={elapsed.find_and_store_min_max_s:.2f}s, "
            f"write={elapsed.write_s:.2f}s)"
        )
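The validation loop above defines the input contract: one table per realization, no REAL column, and a DATE column of type timestamp[ms] with monotonically increasing values. A tiny, hypothetical per-realization table that satisfies it:

import datetime

import pyarrow as pa

dates = pa.array(
    [datetime.datetime(2020, 1, 1), datetime.datetime(2020, 2, 1)],
    type=pa.timestamp("ms"),
)
# Vector columns (here a single made-up FOPT vector) come in addition to DATE
table_real_0 = pa.table({"DATE": dates, "FOPT": pa.array([0.0, 1234.5])})
per_real_tables = {0: table_real_0}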
Example #30
    def write_backing_store(
        storage_dir: Path,
        storage_key: str,
        sim_surfaces: List[SurfaceFileInfo],
        obs_surfaces: List[SurfaceFileInfo],
    ) -> None:

        timer = PerfTimer()

        # All data for this provider will be stored inside a sub-directory
        # given by the storage key
        provider_dir = storage_dir / storage_key
        LOGGER.debug(f"Writing surface backing store to: {provider_dir}")
        provider_dir.mkdir(parents=True, exist_ok=True)
        (provider_dir / REL_SIM_DIR).mkdir(parents=True, exist_ok=True)
        (provider_dir / REL_OBS_DIR).mkdir(parents=True, exist_ok=True)

        type_arr: List[SurfaceType] = []
        real_arr: List[int] = []
        attribute_arr: List[str] = []
        name_arr: List[str] = []
        datestr_arr: List[str] = []
        rel_path_arr: List[str] = []
        original_path_arr: List[str] = []

        for surfinfo in sim_surfaces:
            rel_path_in_store = _compose_rel_sim_surf_path(
                real=surfinfo.real,
                attribute=surfinfo.attribute,
                name=surfinfo.name,
                datestr=surfinfo.datestr,
                extension=Path(surfinfo.path).suffix,
            )
            type_arr.append(SurfaceType.SIMULATED)
            real_arr.append(surfinfo.real)
            attribute_arr.append(surfinfo.attribute)
            name_arr.append(surfinfo.name)
            datestr_arr.append(surfinfo.datestr if surfinfo.datestr else "")
            rel_path_arr.append(str(rel_path_in_store))
            original_path_arr.append(surfinfo.path)

        # We want to strip out observed surfaces without a matching simulated surface
        valid_obs_surfaces = _find_observed_surfaces_corresponding_to_simulated(
            obs_surfaces=obs_surfaces, sim_surfaces=sim_surfaces
        )

        for surfinfo in valid_obs_surfaces:
            rel_path_in_store = _compose_rel_obs_surf_path(
                attribute=surfinfo.attribute,
                name=surfinfo.name,
                datestr=surfinfo.datestr,
                extension=Path(surfinfo.path).suffix,
            )
            type_arr.append(SurfaceType.OBSERVED)
            real_arr.append(-1)
            attribute_arr.append(surfinfo.attribute)
            name_arr.append(surfinfo.name)
            datestr_arr.append(surfinfo.datestr if surfinfo.datestr else "")
            rel_path_arr.append(str(rel_path_in_store))
            original_path_arr.append(surfinfo.path)

        LOGGER.debug(f"Copying {len(original_path_arr)} surfaces into backing store...")
        timer.lap_s()
        _copy_surfaces_into_provider_dir(original_path_arr, rel_path_arr, provider_dir)
        et_copy_s = timer.lap_s()

        surface_inventory_df = pd.DataFrame(
            {
                Col.TYPE: type_arr,
                Col.REAL: real_arr,
                Col.ATTRIBUTE: attribute_arr,
                Col.NAME: name_arr,
                Col.DATESTR: datestr_arr,
                Col.REL_PATH: rel_path_arr,
                Col.ORIGINAL_PATH: original_path_arr,
            }
        )

        parquet_file_name = provider_dir / "surface_inventory.parquet"
        surface_inventory_df.to_parquet(path=parquet_file_name)

        LOGGER.debug(
            f"Wrote surface backing store in: {timer.elapsed_s():.2f}s ("
            f"copy={et_copy_s:.2f}s)"
        )