Example #1
    def _get_or_create_statistical_surface(
        self, address: StatisticalSurfaceAddress
    ) -> Optional[xtgeo.RegularSurface]:

        timer = PerfTimer()

        surf = self._stat_surf_cache.fetch(address)
        if surf:
            LOGGER.debug(
                f"Fetched statistical surface from cache in: {timer.elapsed_s():.2f}s"
            )
            return surf

        surf = self._create_statistical_surface(address)
        et_create_s = timer.lap_s()

        self._stat_surf_cache.store(address, surf)
        et_write_cache_s = timer.lap_s()

        LOGGER.debug(
            f"Created and wrote statistical surface to cache in: {timer.elapsed_s():.2f}s ("
            f"create={et_create_s:.2f}s, store={et_write_cache_s:.2f}s), "
            f"[stat={address.statistic}, "
            f"attr={address.attribute}, name={address.name}, date={address.datestr}]"
        )

        return surf
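
Every example on this page times its work with a PerfTimer exposing elapsed_s(), lap_s() and lap_ms(), but that class itself is not shown here. Below is a minimal sketch of the assumed interface, inferred only from how the timer is called in these examples:

import time


class PerfTimer:
    """Minimal stand-in for the timer used in these examples (assumed interface)."""

    def __init__(self) -> None:
        self._start_s = time.perf_counter()
        self._lap_start_s = self._start_s

    def elapsed_s(self) -> float:
        # Seconds since the timer was created
        return time.perf_counter() - self._start_s

    def lap_s(self) -> float:
        # Seconds since the previous lap call, then start a new lap
        now = time.perf_counter()
        lap_s = now - self._lap_start_s
        self._lap_start_s = now
        return lap_s

    def lap_ms(self) -> float:
        # Same as lap_s(), reported in milliseconds
        return self.lap_s() * 1000.0
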
    def create_from_arrow_unsmry_lazy(
            self, ens_path: str,
            rel_file_pattern: str) -> EnsembleSummaryProvider:
        """Create EnsembleSummaryProvider from per-realization unsmry data in .arrow format.

        The `rel_file_pattern` parameter must specify a relative (per realization) file pattern
        that will be used to find the wanted .arrow files within each realization. The file
        pattern is relative to each realization's `runpath`.
        Typically the file pattern will be: "share/results/unsmry/*.arrow"

        The returned summary provider supports lazy resampling.
        """

        timer = PerfTimer()

        storage_key = (
            f"arrow_unsmry_lazy__{_make_hash_string(ens_path + rel_file_pattern)}"
        )
        provider = ProviderImplArrowLazy.from_backing_store(
            self._storage_dir, storage_key)
        if provider:
            LOGGER.info(
                f"Loaded lazy summary provider from backing store in {timer.elapsed_s():.2f}s ("
                f"ens_path={ens_path})")
            return provider

        # We can only import data from the data source if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(
                f"Failed to load lazy summary provider for {ens_path}")

        LOGGER.info(f"Importing/saving arrow summary data for: {ens_path}")

        timer.lap_s()
        per_real_tables = load_per_realization_arrow_unsmry_files(
            ens_path, rel_file_pattern)
        if not per_real_tables:
            raise ValueError(
                f"Could not find any .arrow unsmry files for ens_path={ens_path}"
            )
        et_import_smry_s = timer.lap_s()

        ProviderImplArrowLazy.write_backing_store_from_per_realization_tables(
            self._storage_dir, storage_key, per_real_tables)
        et_write_s = timer.lap_s()

        provider = ProviderImplArrowLazy.from_backing_store(
            self._storage_dir, storage_key)
        if not provider:
            raise ValueError(
                f"Failed to load/create lazy provider for {ens_path}")

        LOGGER.info(
            f"Saved lazy summary provider to backing store in {timer.elapsed_s():.2f}s ("
            f"import_smry={et_import_smry_s:.2f}s, write={et_write_s:.2f}s, ens_path={ens_path})"
        )

        return provider
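
The method above follows the same get-or-create pattern as the other factory methods on this page: try the backing store first, otherwise import, write, and re-load. A stripped-down, self-contained sketch of that pattern under generic names (not the actual provider classes):

from pathlib import Path
from typing import Optional


def get_or_create(storage_dir: Path, storage_key: str, allow_writes: bool) -> Optional[str]:
    # Stand-in for ProviderImpl*.from_backing_store(): here, just read a text file
    def _from_backing_store() -> Optional[str]:
        fn = storage_dir / f"{storage_key}.txt"
        return fn.read_text() if fn.is_file() else None

    data = _from_backing_store()
    if data is not None:
        return data  # fast path: already present in the backing store

    if not allow_writes:
        raise ValueError(f"Failed to load data for {storage_key}")

    # Stand-in for the import + write_backing_store() steps
    storage_dir.mkdir(parents=True, exist_ok=True)
    (storage_dir / f"{storage_key}.txt").write_text("imported data")

    return _from_backing_store()  # re-load what was just written
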
    def create_from_per_realization_csv_file(
        self, ens_path: str, csv_file_rel_path: str
    ) -> EnsembleSummaryProvider:
        """Create EnsembleSummaryProvider from per realization CSV files.

        Note that the returned summary provider does not support resampling, nor will it
        be able to return vector metadata.
        """

        timer = PerfTimer()

        storage_key = f"per_real_csv__{_make_hash_string(ens_path + csv_file_rel_path)}"
        provider = ProviderImplArrowPresampled.from_backing_store(
            self._storage_dir, storage_key
        )

        if provider:
            LOGGER.info(
                f"Loaded summary provider (per real CSV) from backing store in "
                f"{timer.elapsed_s():.2f}s ("
                f"ens_path={ens_path}, csv_file_rel_path={csv_file_rel_path})"
            )
            return provider

        # We can only import data from CSV if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(
                f"Failed to load summary provider (per real CSV) for {ens_path}"
            )

        LOGGER.info(f"Importing/saving per real CSV summary data for: {ens_path}")

        timer.lap_s()

        ensemble_df = load_per_real_csv_file_using_fmu(ens_path, csv_file_rel_path)
        et_import_csv_s = timer.lap_s()

        ProviderImplArrowPresampled.write_backing_store_from_ensemble_dataframe(
            self._storage_dir, storage_key, ensemble_df
        )
        et_write_s = timer.lap_s()

        provider = ProviderImplArrowPresampled.from_backing_store(
            self._storage_dir, storage_key
        )

        if not provider:
            raise ValueError(
                f"Failed to load/create provider (per real CSV) for {ens_path}"
            )

        LOGGER.info(
            f"Saved summary provider (per real CSV) to backing store in {timer.elapsed_s():.2f}s ("
            f"import_csv={et_import_csv_s:.2f}s, write={et_write_s:.2f}s, "
            f"ens_path={ens_path}, csv_file_rel_path={csv_file_rel_path})"
        )

        return provider
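
Both methods above build their storage keys with a _make_hash_string helper that is not shown on this page. A minimal sketch of what such a helper could look like, assuming a simple digest of the input string (the real implementation may differ):

import hashlib


def _make_hash_string(string_to_hash: str) -> str:
    # Stable, filesystem-friendly key derived from the input string
    return hashlib.md5(string_to_hash.encode()).hexdigest()
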
Example #4
    def _create_statistical_surface(
        self, address: StatisticalSurfaceAddress
    ) -> Optional[xtgeo.RegularSurface]:
        surf_fns: List[str] = self._locate_simulated_surfaces(
            attribute=address.attribute,
            name=address.name,
            datestr=address.datestr if address.datestr is not None else "",
            realizations=address.realizations,
        )

        if len(surf_fns) == 0:
            LOGGER.warning(f"No input surfaces found for statistical surface {address}")
            return None

        timer = PerfTimer()

        surfaces = xtgeo.Surfaces(surf_fns)
        et_load_s = timer.lap_s()

        surf_count = len(surfaces.surfaces)
        if surf_count == 0:
            LOGGER.warning(
                f"Could not load input surfaces for statistical surface {address}"
            )
            return None

        # print("########################################################")
        # first_surf = surfaces.surfaces[0]
        # for surf in surfaces.surfaces:
        #     print(
        #         surf.dimensions,
        #         surf.xinc,
        #         surf.yinc,
        #         surf.xori,
        #         surf.yori,
        #         surf.rotation,
        #         surf.filesrc,
        #     )
        # print("########################################################")

        # Suppress numpy warnings when surfaces have undefined z-values
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", "All-NaN slice encountered")
            warnings.filterwarnings("ignore", "Mean of empty slice")
            warnings.filterwarnings("ignore", "Degrees of freedom <= 0 for slice")

            stat_surface = _calc_statistic_across_surfaces(address.statistic, surfaces)
        et_calc_s = timer.lap_s()

        LOGGER.debug(
            f"Created statistical surface in: {timer.elapsed_s():.2f}s ("
            f"load={et_load_s:.2f}s, calc={et_calc_s:.2f}s), "
            f"[#surfaces={surf_count}, stat={address.statistic}, "
            f"attr={address.attribute}, name={address.name}, date={address.datestr}]"
        )

        return stat_surface
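
The three warnings filtered above are the RuntimeWarnings NumPy emits when a nan-aware reduction only sees undefined values. A small self-contained illustration of the same pattern, independent of xtgeo (the array values are made up):

import warnings

import numpy as np

# Three "surfaces" stacked as a 3D array; the second column is undefined everywhere
stack = np.full((3, 2, 2), np.nan)
stack[:, :, 0] = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", "All-NaN slice encountered")
    warnings.filterwarnings("ignore", "Mean of empty slice")
    warnings.filterwarnings("ignore", "Degrees of freedom <= 0 for slice")

    mean_surf = np.nanmean(stack, axis=0)           # would warn: "Mean of empty slice"
    p10_surf = np.nanpercentile(stack, 10, axis=0)  # would warn: "All-NaN slice encountered"
    std_surf = np.nanstd(stack, axis=0)             # would warn: "Degrees of freedom <= 0 ..."
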
    def create_from_ensemble_surface_files(
        self,
        ens_path: str,
        rel_surface_folder: str = "share/results/maps",
        attribute_filter: Optional[List[str]] = None,
    ) -> EnsembleSurfaceProvider:
        timer = PerfTimer()
        string_to_hash = (
            f"{ens_path}_{rel_surface_folder}" if attribute_filter is None else
            (f"{ens_path}_{rel_surface_folder}_"
             f"{'_'.join([str(attr) for attr in attribute_filter])}"))
        storage_key = f"ens__{_make_hash_string(string_to_hash)}"
        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if provider:
            LOGGER.info(
                f"Loaded surface provider from backing store in {timer.elapsed_s():.2f}s ("
                f"ens_path={ens_path})")
            return provider

        # We can only import data from the data source if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(f"Failed to load surface provider for {ens_path}")

        LOGGER.info(f"Importing/copying surface data for: {ens_path}")

        timer.lap_s()
        sim_surface_files = discover_per_realization_surface_files(
            ens_path, rel_surface_folder, attribute_filter)
        obs_surface_files = discover_observed_surface_files(
            ens_path, attribute_filter)
        et_discover_s = timer.lap_s()

        # As an optimization, avoid copying the surfaces into the backing store,
        # typically when we're running in non-portable mode
        ProviderImplFile.write_backing_store(
            self._storage_dir,
            storage_key,
            sim_surfaces=sim_surface_files,
            obs_surfaces=obs_surface_files,
            avoid_copying_surfaces=self._avoid_copying_surfaces,
        )
        et_write_s = timer.lap_s()

        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if not provider:
            raise ValueError(
                f"Failed to load/create surface provider for {ens_path}")

        LOGGER.info(
            f"Saved surface provider to backing store in {timer.elapsed_s():.2f}s ("
            f"discover={et_discover_s:.2f}s, write={et_write_s:.2f}s, ens_path={ens_path})"
        )

        return provider
Example #6
    def write_backing_store(
        storage_dir: Path,
        storage_key: str,
        well_file_names: List[str],
        md_logname: Optional[str],
    ) -> None:

        timer = PerfTimer()

        # All data for this provider will be stored inside a sub-directory
        # given by the storage key
        provider_dir = storage_dir / storage_key
        LOGGER.debug(f"Writing well backing store to: {provider_dir}")
        provider_dir.mkdir(parents=True, exist_ok=True)

        inventory_dict: Dict[str, dict] = {}

        LOGGER.debug(
            f"Writing {len(well_file_names)} wells into backing store...")

        timer.lap_s()
        for file_name in well_file_names:
            well = xtgeo.well_from_file(wfile=file_name, mdlogname=md_logname)

            if well.mdlogname is None:
                try:
                    well.geometrics()
                except ValueError:
                    LOGGER.debug(
                        f"Ignoring {well.name} as MD cannot be calculated")
                    continue

            print("well.mdlogname=", well.mdlogname)

            well_name = well.name
            rel_path = f"{well_name}.rmswell"
            # rel_path = f"{well_name}.hdf"

            dst_file = provider_dir / rel_path
            print("dst_file=", dst_file)
            well.to_file(wfile=dst_file, fformat="rmswell")
            # well.to_hdf(wfile=dst_file)

            inventory_dict[well_name] = {
                INV_KEY_REL_PATH: rel_path,
                INV_KEY_MD_LOGNAME: well.mdlogname,
            }

        et_copy_s = timer.lap_s()

        json_fn = provider_dir / "inventory.json"
        with open(json_fn, "w") as file:
            json.dump(inventory_dict, file)

        LOGGER.debug(f"Wrote well backing store in: {timer.elapsed_s():.2f}s ("
                     f"copy={et_copy_s:.2f}s)")
    def create_from_well_files(self, well_folder: str, well_suffix: str,
                               md_logname: Optional[str]) -> WellProvider:
        timer = PerfTimer()

        file_pattern = str(Path(well_folder) / f"*{well_suffix}")
        storage_key = f"from_files__{_make_hash_string(f'{file_pattern}_{md_logname}')}"

        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if provider:
            LOGGER.info(
                f"Loaded well provider from backing store in {timer.elapsed_s():.2f}s ("
                f"file_pattern={file_pattern})")
            return provider

        # We can only import data from the data source if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(
                f"Failed to load well provider for {file_pattern}")

        LOGGER.info(f"Importing/writing well data for: {file_pattern}")

        timer.lap_s()
        src_file_names = sorted([
            str(filename)
            for filename in Path(well_folder).glob(f"*{well_suffix}")
        ])
        et_discover_s = timer.lap_s()

        ProviderImplFile.write_backing_store(
            self._storage_dir,
            storage_key,
            well_file_names=src_file_names,
            md_logname=md_logname,
        )
        et_write_s = timer.lap_s()

        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if not provider:
            raise ValueError(
                f"Failed to load/create well provider for {file_pattern}")

        LOGGER.info(
            f"Saved well provider to backing store in {timer.elapsed_s():.2f}s ("
            f"discover={et_discover_s:.2f}s, write={et_write_s:.2f}s, file_pattern={file_pattern})"
        )

        return provider
Example #8
    def create_from_ensemble_fault_polygons_files(
            self, ens_path: str) -> EnsembleFaultPolygonsProvider:
        timer = PerfTimer()

        storage_key = f"ens__{_make_hash_string(ens_path)}"
        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if provider:
            LOGGER.info(
                f"Loaded fault polygons provider from backing store in {timer.elapsed_s():.2f}s ("
                f"ens_path={ens_path})")
            return provider

        # We can only import data from the data source if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(
                f"Failed to load fault polygons provider for {ens_path}")

        LOGGER.info(f"Importing/copying fault polygons data for: {ens_path}")

        timer.lap_s()
        sim_fault_polygons_files = discover_per_realization_fault_polygons_files(
            ens_path)

        et_discover_s = timer.lap_s()

        ProviderImplFile.write_backing_store(
            self._storage_dir,
            storage_key,
            sim_fault_polygons=sim_fault_polygons_files,
        )
        et_write_s = timer.lap_s()

        provider = ProviderImplFile.from_backing_store(self._storage_dir,
                                                       storage_key)
        if not provider:
            raise ValueError(
                f"Failed to load/create fault polygons provider for {ens_path}"
            )

        LOGGER.info(
            f"Saved fault polygons provider to backing store in {timer.elapsed_s():.2f}s ("
            f"discover={et_discover_s:.2f}s, write={et_write_s:.2f}s, ens_path={ens_path})"
        )

        return provider
Example #9
    def _create_and_store_image_in_cache(
        self,
        base_cache_key: str,
        surface: xtgeo.RegularSurface,
    ) -> None:

        timer = PerfTimer()

        LOGGER.debug("Converting surface to PNG image...")
        png_bytes: bytes = surface_to_png_bytes_optimized(surface)
        LOGGER.debug(f"Got PNG image, size={(len(png_bytes) / (1024 * 1024)):.2f}MB")
        et_to_image_s = timer.lap_s()

        img_cache_key = "IMG:" + base_cache_key
        meta_cache_key = "META:" + base_cache_key

        self._image_cache.add(img_cache_key, png_bytes)

        # For debugging rotations
        # unrot_surf = surface.copy()
        # unrot_surf.unrotate()
        # unrot_surf.quickplot("/home/sigurdp/gitRoot/hk-webviz-subsurface/quickplot.png")

        deckgl_bounds, deckgl_rot = _calc_map_component_bounds_and_rot(surface)

        meta = SurfaceMeta(
            x_min=surface.xmin,
            x_max=surface.xmax,
            y_min=surface.ymin,
            y_max=surface.ymax,
            val_min=surface.values.min(),
            val_max=surface.values.max(),
            deckgl_bounds=deckgl_bounds,
            deckgl_rot_deg=deckgl_rot,
        )
        self._image_cache.add(meta_cache_key, meta)
        et_write_cache_s = timer.lap_s()

        LOGGER.debug(
            f"Created image and wrote to cache in in: {timer.elapsed_s():.2f}s ("
            f"to_image={et_to_image_s:.2f}s, write_cache={et_write_cache_s:.2f}s), "
            f"[base_cache_key={base_cache_key}]"
        )
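
The image and its metadata are stored under "IMG:"/"META:" prefixed keys; a hedged sketch of the corresponding lookup (hypothetical helper, assuming the cache exposes a Flask-Caching style get(), as the request handler below also relies on):

from typing import Optional, Tuple


def _get_image_and_meta_from_cache(
    image_cache, base_cache_key: str
) -> Tuple[Optional[bytes], Optional[object]]:
    # Returns (png_bytes, meta) or (None, None) when either entry is missing
    png_bytes = image_cache.get("IMG:" + base_cache_key)
    meta = image_cache.get("META:" + base_cache_key)
    if png_bytes is None or meta is None:
        return None, None
    return png_bytes, meta
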
        def _handle_request(provider_id: str, addr_type_str: str,
                            surf_address_str: str) -> flask.Response:
            LOGGER.debug(f"Handling request: "
                         f"provider_id={provider_id} "
                         f"addr_type_str={addr_type_str} "
                         f"surf_address_str={surf_address_str}")

            timer = PerfTimer()

            try:
                provider = self._id_to_provider_dict[provider_id]
                surf_address_dict = json.loads(unquote_plus(surf_address_str))
                address: Union[StatisticalSurfaceAddress,
                               SimulatedSurfaceAddress,
                               ObservedSurfaceAddress, ]
                if addr_type_str == "sta":
                    address = StatisticalSurfaceAddress(**surf_address_dict)
                if addr_type_str == "sim":
                    address = SimulatedSurfaceAddress(**surf_address_dict)
                if addr_type_str == "obs":
                    address = ObservedSurfaceAddress(**surf_address_dict)
            except Exception:
                LOGGER.error("Error decoding surface address")
                flask.abort(404)

            if self._image_cache:
                img_cache_key = (
                    f"provider_id={provider_id} "
                    f"addr_type={addr_type_str} address={surf_address_str}")
                LOGGER.debug(
                    f"Looking for image in cache (key={img_cache_key}, "
                    f"cache_type={self._image_cache.config['CACHE_TYPE']})")
                cached_img_bytes = self._image_cache.get(img_cache_key)
                if cached_img_bytes:
                    response = flask.send_file(io.BytesIO(cached_img_bytes),
                                               mimetype="image/png")
                    LOGGER.debug(
                        f"Request handled from image cache in: {timer.elapsed_s():.2f}s"
                    )
                    return response

            LOGGER.debug("Getting surface from provider...")
            timer.lap_s()
            surface = provider.get_surface(address)
            if not surface:
                LOGGER.error(f"Error getting surface for address: {address}")
                flask.abort(404)
            et_get_s = timer.lap_s()
            LOGGER.debug(
                f"Got surface (dimensions={surface.dimensions}, #cells={surface.ncol*surface.nrow})"
            )

            LOGGER.debug("Converting to PNG image...")
            png_bytes: bytes = surface_to_png_bytes(surface)
            LOGGER.debug(
                f"Got PNG image, size={(len(png_bytes) / (1024 * 1024)):.2f}MB"
            )
            et_to_image_s = timer.lap_s()

            LOGGER.debug("Sending image")
            response = flask.send_file(io.BytesIO(png_bytes),
                                       mimetype="image/png")
            et_send_s = timer.lap_s()

            if self._image_cache and img_cache_key:
                self._image_cache.add(img_cache_key, png_bytes)

            LOGGER.debug(
                f"Request handled in: {timer.elapsed_s():.2f}s ("
                f"get={et_get_s:.2f}s, to_image={et_to_image_s:.2f}s, send={et_send_s:.2f}s)"
            )

            return response
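
The handler above expects surf_address_str to be a URL-quoted JSON document that maps straight onto one of the address dataclasses. A minimal sketch of how a client could build such a string (the helper name is hypothetical; the address classes are assumed to be plain dataclasses, as implied by the **surf_address_dict construction):

import json
from dataclasses import asdict
from urllib.parse import quote_plus


def encode_surface_address(address) -> str:
    # Inverse of the json.loads(unquote_plus(...)) call in the handler above
    return quote_plus(json.dumps(asdict(address)))
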
Example #11
    def write_backing_store(
        storage_dir: Path,
        storage_key: str,
        sim_surfaces: List[SurfaceFileInfo],
        obs_surfaces: List[SurfaceFileInfo],
        avoid_copying_surfaces: bool,
    ) -> None:
        """If avoid_copying_surfaces if True, the specified surfaces will NOT be copied
        into the backing store, but will be referenced from their source locations.
        Note that this is only useful when running in non-portable mode and will fail
        in portable mode.
        """

        timer = PerfTimer()

        do_copy_surfs_into_store = not avoid_copying_surfaces

        # All data for this provider will be stored inside a sub-directory
        # given by the storage key
        provider_dir = storage_dir / storage_key
        LOGGER.debug(f"Writing surface backing store to: {provider_dir}")
        provider_dir.mkdir(parents=True, exist_ok=True)
        (provider_dir / REL_SIM_DIR).mkdir(parents=True, exist_ok=True)
        (provider_dir / REL_OBS_DIR).mkdir(parents=True, exist_ok=True)

        type_arr: List[SurfaceType] = []
        real_arr: List[int] = []
        attribute_arr: List[str] = []
        name_arr: List[str] = []
        datestr_arr: List[str] = []
        rel_path_arr: List[str] = []
        original_path_arr: List[str] = []

        for surfinfo in sim_surfaces:
            type_arr.append(SurfaceType.SIMULATED)
            real_arr.append(surfinfo.real)
            attribute_arr.append(surfinfo.attribute)
            name_arr.append(surfinfo.name)
            datestr_arr.append(surfinfo.datestr if surfinfo.datestr else "")
            original_path_arr.append(surfinfo.path)

            rel_path_in_store = ""
            if do_copy_surfs_into_store:
                rel_path_in_store = _compose_rel_sim_surf_pathstr(
                    real=surfinfo.real,
                    attribute=surfinfo.attribute,
                    name=surfinfo.name,
                    datestr=surfinfo.datestr,
                    extension=Path(surfinfo.path).suffix,
                )
            rel_path_arr.append(rel_path_in_store)

        # We want to strip out observed surfaces without a matching simulated surface
        valid_obs_surfaces = _find_observed_surfaces_corresponding_to_simulated(
            obs_surfaces=obs_surfaces, sim_surfaces=sim_surfaces
        )

        for surfinfo in valid_obs_surfaces:
            type_arr.append(SurfaceType.OBSERVED)
            real_arr.append(-1)
            attribute_arr.append(surfinfo.attribute)
            name_arr.append(surfinfo.name)
            datestr_arr.append(surfinfo.datestr if surfinfo.datestr else "")
            original_path_arr.append(surfinfo.path)

            rel_path_in_store = ""
            if do_copy_surfs_into_store:
                rel_path_in_store = _compose_rel_obs_surf_pathstr(
                    attribute=surfinfo.attribute,
                    name=surfinfo.name,
                    datestr=surfinfo.datestr,
                    extension=Path(surfinfo.path).suffix,
                )
            rel_path_arr.append(rel_path_in_store)

        timer.lap_s()
        if do_copy_surfs_into_store:
            LOGGER.debug(
                f"Copying {len(original_path_arr)} surfaces into backing store..."
            )
            _copy_surfaces_into_provider_dir(
                original_path_arr, rel_path_arr, provider_dir
            )
        et_copy_s = timer.lap_s()

        surface_inventory_df = pd.DataFrame(
            {
                Col.TYPE: type_arr,
                Col.REAL: real_arr,
                Col.ATTRIBUTE: attribute_arr,
                Col.NAME: name_arr,
                Col.DATESTR: datestr_arr,
                Col.REL_PATH: rel_path_arr,
                Col.ORIGINAL_PATH: original_path_arr,
            }
        )

        parquet_file_name = provider_dir / "surface_inventory.parquet"
        surface_inventory_df.to_parquet(path=parquet_file_name)

        if do_copy_surfs_into_store:
            LOGGER.debug(
                f"Wrote surface backing store in: {timer.elapsed_s():.2f}s ("
                f"copy={et_copy_s:.2f}s)"
            )
        else:
            LOGGER.debug(
                f"Wrote surface backing store without copying surfaces in: "
                f"{timer.elapsed_s():.2f}s"
            )
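
Because Col.REL_PATH is left empty when surfaces are not copied, the read side presumably has to fall back to Col.ORIGINAL_PATH. A hedged sketch of that lookup, using only the two columns written above (hypothetical helper, not part of this page):

from pathlib import Path


def _resolve_surface_path(provider_dir: Path, rel_path: str, original_path: str) -> Path:
    # Prefer the copy inside the backing store; otherwise use the source location
    return provider_dir / rel_path if rel_path else Path(original_path)
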
    def write_backing_store_from_ensemble_dataframe(
            storage_dir: Path, storage_key: str,
            ensemble_df: pd.DataFrame) -> None:
        @dataclass
        class Elapsed:
            convert_date_s: float = -1
            table_from_pandas_s: float = -1
            find_and_store_min_max_s: float = -1
            sorting_s: float = -1
            write_s: float = -1

        elapsed = Elapsed()

        arrow_file_name = storage_dir / (storage_key + ".arrow")
        LOGGER.debug(
            f"Writing backing store from ensemble dataframe to arrow file: {arrow_file_name}"
        )
        timer = PerfTimer()

        # Force data type in the incoming DataFrame's DATE column to datetime.datetime objects
        # This is the first step in coercing pyarrow to always store DATEs as timestamps
        ensemble_df = make_date_column_datetime_object(ensemble_df)
        elapsed.convert_date_s = timer.lap_s()

        # By default, we'll now end up with a schema that has timestamp[ns] for the DATE column
        # We therefore modify the retrieved schema and specify usage of timestamp[ms] instead
        default_schema = pa.Schema.from_pandas(ensemble_df,
                                               preserve_index=False)
        schema_to_use = _set_date_column_type_to_timestamp_ms(default_schema)

        # For experimenting with conversion to float
        # timer.lap_s()
        # schema_to_use = _create_float_downcasting_schema(schema_to_use)
        # LOGGER.info(
        #     f"Created schema for float downcasting in : {timer.lap_s():.2f}s"
        # )

        timer.lap_s()
        table = pa.Table.from_pandas(ensemble_df,
                                     schema=schema_to_use,
                                     preserve_index=False)
        elapsed.table_from_pandas_s = timer.lap_s()

        # We're done with the dataframe
        del ensemble_df

        # Find per column min/max values and then store them as metadata on table's schema
        timer.lap_ms()
        per_vector_min_max = find_min_max_for_numeric_table_columns(table)
        table = add_per_vector_min_max_to_table_schema_metadata(
            table, per_vector_min_max)
        elapsed.find_and_store_min_max_s = timer.lap_s()

        table = _sort_table_on_date_then_real(table)
        elapsed.sorting_s = timer.lap_s()

        # feather.write_feather(table, dest=arrow_file_name)
        feather.write_feather(table,
                              dest=arrow_file_name,
                              compression="uncompressed")
        elapsed.write_s = timer.lap_s()

        LOGGER.debug(
            f"Wrote backing store to arrow file in: {timer.elapsed_s():.2f}s ("
            f"convert_date={elapsed.convert_date_s:.2f}s, "
            f"table_from_pandas={elapsed.table_from_pandas_s:.2f}s, "
            f"find_and_store_min_max={elapsed.find_and_store_min_max_s:.2f}s, "
            f"sorting={elapsed.sorting_s:.2f}s, "
            f"write={elapsed.write_s:.2f}s)")
    def write_backing_store_from_per_realization_tables(
            storage_dir: Path, storage_key: str,
            per_real_tables: Dict[int, pa.Table]) -> None:
        @dataclass
        class Elapsed:
            concat_tables_s: float = -1
            build_add_real_col_s: float = -1
            sorting_s: float = -1
            find_and_store_min_max_s: float = -1
            write_s: float = -1

        elapsed = Elapsed()

        arrow_file_name = storage_dir / (storage_key + ".arrow")
        LOGGER.debug(
            f"Writing backing store from per real tables to arrow file: {arrow_file_name}"
        )
        timer = PerfTimer()

        unique_column_names = set()
        for table in per_real_tables.values():
            unique_column_names.update(table.schema.names)
        LOGGER.debug(f"Concatenating {len(per_real_tables)} tables with "
                     f"{len(unique_column_names)} unique column names")

        timer.lap_s()
        full_table = pa.concat_tables(per_real_tables.values(), promote=True)
        elapsed.concat_tables_s = timer.lap_s()

        real_arr = np.empty(full_table.num_rows, np.int32)
        table_start_idx = 0
        for real_num, real_table in per_real_tables.items():
            real_arr[table_start_idx:table_start_idx +
                     real_table.num_rows] = real_num
            table_start_idx += real_table.num_rows

        full_table = full_table.add_column(0, "REAL", pa.array(real_arr))
        elapsed.build_add_real_col_s = timer.lap_s()

        # Find per column min/max values and then store them as metadata on table's schema
        per_vector_min_max = find_min_max_for_numeric_table_columns(full_table)
        full_table = add_per_vector_min_max_to_table_schema_metadata(
            full_table, per_vector_min_max)
        elapsed.find_and_store_min_max_s = timer.lap_s()

        full_table = _sort_table_on_date_then_real(full_table)
        elapsed.sorting_s = timer.lap_s()

        # feather.write_feather(full_table, dest=arrow_file_name)
        feather.write_feather(full_table,
                              dest=arrow_file_name,
                              compression="uncompressed")
        elapsed.write_s = timer.lap_s()

        LOGGER.debug(
            f"Wrote backing store to arrow file in: {timer.elapsed_s():.2f}s ("
            f"concat_tables={elapsed.concat_tables_s:.2f}s, "
            f"build_add_real_col={elapsed.build_add_real_col_s:.2f}s, "
            f"sorting={elapsed.sorting_s:.2f}s, "
            f"find_and_store_min_max={elapsed.find_and_store_min_max_s:.2f}s, "
            f"write={elapsed.write_s:.2f}s)")
    def write_backing_store(
        storage_dir: Path,
        storage_key: str,
        sim_surfaces: List[SurfaceFileInfo],
        obs_surfaces: List[SurfaceFileInfo],
    ) -> None:

        timer = PerfTimer()

        # All data for this provider will be stored inside a sub-directory
        # given by the storage key
        provider_dir = storage_dir / storage_key
        LOGGER.debug(f"Writing surface backing store to: {provider_dir}")
        provider_dir.mkdir(parents=True, exist_ok=True)
        (provider_dir / REL_SIM_DIR).mkdir(parents=True, exist_ok=True)
        (provider_dir / REL_OBS_DIR).mkdir(parents=True, exist_ok=True)

        type_arr: List[SurfaceType] = []
        real_arr: List[int] = []
        attribute_arr: List[str] = []
        name_arr: List[str] = []
        datestr_arr: List[str] = []
        rel_path_arr: List[str] = []
        original_path_arr: List[str] = []

        for surfinfo in sim_surfaces:
            rel_path_in_store = _compose_rel_sim_surf_path(
                real=surfinfo.real,
                attribute=surfinfo.attribute,
                name=surfinfo.name,
                datestr=surfinfo.datestr,
                extension=Path(surfinfo.path).suffix,
            )
            type_arr.append(SurfaceType.SIMULATED)
            real_arr.append(surfinfo.real)
            attribute_arr.append(surfinfo.attribute)
            name_arr.append(surfinfo.name)
            datestr_arr.append(surfinfo.datestr if surfinfo.datestr else "")
            rel_path_arr.append(str(rel_path_in_store))
            original_path_arr.append(surfinfo.path)

        # We want to strip out observed surfaces without a matching simulated surface
        valid_obs_surfaces = _find_observed_surfaces_corresponding_to_simulated(
            obs_surfaces=obs_surfaces, sim_surfaces=sim_surfaces
        )

        for surfinfo in valid_obs_surfaces:
            rel_path_in_store = _compose_rel_obs_surf_path(
                attribute=surfinfo.attribute,
                name=surfinfo.name,
                datestr=surfinfo.datestr,
                extension=Path(surfinfo.path).suffix,
            )
            type_arr.append(SurfaceType.OBSERVED)
            real_arr.append(-1)
            attribute_arr.append(surfinfo.attribute)
            name_arr.append(surfinfo.name)
            datestr_arr.append(surfinfo.datestr if surfinfo.datestr else "")
            rel_path_arr.append(str(rel_path_in_store))
            original_path_arr.append(surfinfo.path)

        LOGGER.debug(f"Copying {len(original_path_arr)} surfaces into backing store...")
        timer.lap_s()
        _copy_surfaces_into_provider_dir(original_path_arr, rel_path_arr, provider_dir)
        et_copy_s = timer.lap_s()

        surface_inventory_df = pd.DataFrame(
            {
                Col.TYPE: type_arr,
                Col.REAL: real_arr,
                Col.ATTRIBUTE: attribute_arr,
                Col.NAME: name_arr,
                Col.DATESTR: datestr_arr,
                Col.REL_PATH: rel_path_arr,
                Col.ORIGINAL_PATH: original_path_arr,
            }
        )

        parquet_file_name = provider_dir / "surface_inventory.parquet"
        surface_inventory_df.to_parquet(path=parquet_file_name)

        LOGGER.debug(
            f"Wrote surface backing store in: {timer.elapsed_s():.2f}s ("
            f"copy={et_copy_s:.2f}s)"
        )
    def create_from_arrow_unsmry_presampled(
        self,
        ens_path: str,
        rel_file_pattern: str,
        sampling_frequency: Optional[Frequency],
    ) -> EnsembleSummaryProvider:
        """Create EnsembleSummaryProvider from per-realization unsmry data in .arrow format.

        The `rel_file_pattern` parameter must specify a relative (per realization) file pattern
        that will be used to find the wanted .arrow files within each realization. The file
        pattern is relative to each realization's `runpath`.
        Typically the file pattern will be: "share/results/unsmry/*.arrow"

        This factory method will sample the input data according to the specified
        `sampling_frequency` during import.

        The returned summary provider does not support lazy resampling, but will always
        return data with the frequency specified above.
        """

        timer = PerfTimer()

        freq_str = sampling_frequency.value if sampling_frequency else "raw"
        hash_str = _make_hash_string(ens_path + rel_file_pattern)
        storage_key = f"arrow_unsmry_presampled_{freq_str}__{hash_str}"
        provider = ProviderImplArrowPresampled.from_backing_store(
            self._storage_dir, storage_key)
        if provider:
            LOGGER.info(
                f"Loaded presampled summary provider from backing store in "
                f"{timer.elapsed_s():.2f}s ("
                f"sampling_frequency={sampling_frequency}, ens_path={ens_path})"
            )
            return provider

        # We can only import data from the data source if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(
                f"Failed to load presampled summary provider for {ens_path}")

        LOGGER.info(f"Importing/saving arrow summary data for: {ens_path}")

        timer.lap_s()
        per_real_tables = load_per_realization_arrow_unsmry_files(
            ens_path, rel_file_pattern)
        if not per_real_tables:
            raise ValueError(
                f"Could not find any .arrow unsmry files for ens_path={ens_path}"
            )
        et_import_smry_s = timer.lap_s()

        if sampling_frequency is not None:
            for real_num, table in per_real_tables.items():
                per_real_tables[real_num] = resample_single_real_table(
                    table, sampling_frequency)
        et_resample_s = timer.lap_s()

        ProviderImplArrowPresampled.write_backing_store_from_per_realization_tables(
            self._storage_dir, storage_key, per_real_tables)
        et_write_s = timer.lap_s()

        provider = ProviderImplArrowPresampled.from_backing_store(
            self._storage_dir, storage_key)
        if not provider:
            raise ValueError(f"Failed to load/create provider for {ens_path}")

        LOGGER.info(
            f"Saved presampled summary provider to backing store in {timer.elapsed_s():.2f}s ("
            f"import_smry={et_import_smry_s:.2f}s, "
            f"resample={et_resample_s:.2f}s, "
            f"write={et_write_s:.2f}s, "
            f"ens_path={ens_path})")

        return provider
    def write_backing_store_from_per_realization_tables(
        storage_dir: Path, storage_key: str, per_real_tables: Dict[int, pa.Table]
    ) -> None:
        # pylint: disable=too-many-locals
        @dataclass
        class Elapsed:
            concat_tables_s: float = -1
            build_add_real_col_s: float = -1
            sorting_s: float = -1
            find_and_store_min_max_s: float = -1
            write_s: float = -1

        elapsed = Elapsed()

        arrow_file_name = storage_dir / (storage_key + ".arrow")
        LOGGER.debug(f"Writing backing store to arrow file: {arrow_file_name}")
        timer = PerfTimer()

        unique_column_names = set()
        for real_num, table in per_real_tables.items():
            unique_column_names.update(table.schema.names)

            if "REAL" in table.schema.names:
                raise ValueError(
                    f"Input tables should not have REAL column (real={real_num})"
                )

            if table.schema.field("DATE").type != pa.timestamp("ms"):
                raise ValueError(
                    f"DATE column must have timestamp[ms] data type (real={real_num})"
                )

            if not _is_date_column_monotonically_increasing(table):
                offending_pair = _find_first_non_increasing_date_pair(table)
                raise ValueError(
                    f"DATE column must be monotonically increasing\n"
                    f"Error detected in realization: {real_num}\n"
                    f"First offending timestamps: {offending_pair}"
                )

        LOGGER.debug(
            f"Concatenating {len(per_real_tables)} tables with "
            f"{len(unique_column_names)} unique column names"
        )

        full_table = pa.concat_tables(per_real_tables.values(), promote=True)
        elapsed.concat_tables_s = timer.lap_s()

        real_arr = np.empty(full_table.num_rows, np.int32)
        table_start_idx = 0
        for real_num, real_table in per_real_tables.items():
            real_arr[table_start_idx : table_start_idx + real_table.num_rows] = real_num
            table_start_idx += real_table.num_rows

        full_table = full_table.add_column(0, "REAL", pa.array(real_arr))
        elapsed.build_add_real_col_s = timer.lap_s()

        # Must sort table on real since interpolations work per realization
        # and we utilize slicing for speed
        full_table = _sort_table_on_real_then_date(full_table)
        elapsed.sorting_s = timer.lap_s()

        # Find per column min/max values and store them as metadata on table's schema
        per_vector_min_max = find_min_max_for_numeric_table_columns(full_table)
        full_table = add_per_vector_min_max_to_table_schema_metadata(
            full_table, per_vector_min_max
        )
        elapsed.find_and_store_min_max_s = timer.lap_s()

        # feather.write_feather(full_table, dest=arrow_file_name)
        with pa.OSFile(str(arrow_file_name), "wb") as sink:
            with pa.RecordBatchFileWriter(sink, full_table.schema) as writer:
                writer.write_table(full_table)
        elapsed.write_s = timer.lap_s()

        LOGGER.debug(
            f"Wrote backing store to arrow file in: {timer.elapsed_s():.2f}s ("
            f"concat_tables={elapsed.concat_tables_s:.2f}s, "
            f"build_add_real_col={elapsed.build_add_real_col_s:.2f}s, "
            f"sorting={elapsed.sorting_s:.2f}s, "
            f"find_and_store_min_max={elapsed.find_and_store_min_max_s:.2f}s, "
            f"write={elapsed.write_s:.2f}s)"
        )
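
The Arrow IPC file produced with RecordBatchFileWriter above can be memory-mapped on the read side; a hedged sketch of that counterpart (hypothetical helper):

from pathlib import Path

import pyarrow as pa


def _read_arrow_ipc_file(arrow_file_name: Path) -> pa.Table:
    # Zero-copy read of the uncompressed IPC file written above
    source = pa.memory_map(str(arrow_file_name), "r")
    return pa.ipc.open_file(source).read_all()
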
    def create_from_ensemble_csv_file(
        self,
        csv_file: Path,
        ensemble_filter: Optional[str] = None,
    ) -> EnsembleSummaryProvider:
        """Create EnsembleSummaryProvider from aggregated CSV file.
        The CSV file is assumed to contain data for a single ensemble and must contain
        columns for `REAL` and `DATE` in addition to the actual numeric vectors.
        If the CSV file contains an `ENSEMBLE` column, it will be ignored, but an exception
        will be raised if it contains multiple ensemble names.

        Note that the returned summary provider does not support resampling, nor will it
        be able to return vector metadata.
        """

        timer = PerfTimer()

        storage_key = "ens_csv"
        if ensemble_filter is not None:
            storage_key += f"_filtered_on_{ensemble_filter}"
        storage_key += f"__{_make_hash_string(str(csv_file))}"

        provider = ProviderImplArrowPresampled.from_backing_store(
            self._storage_dir, storage_key)
        if provider:
            LOGGER.info(f"Loaded summary provider (CSV) from backing store in "
                        f"{timer.elapsed_s():.2f}s (csv_file={csv_file})")
            return provider

        # We can only import data from CSV if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(
                f"Failed to load summary provider (CSV) for {csv_file}")

        LOGGER.info(f"Importing/saving CSV summary data for: {csv_file}")

        timer.lap_s()
        ensemble_df = load_ensemble_summary_csv_file(csv_file, ensemble_filter)
        et_import_csv_s = timer.lap_s()

        if len(ensemble_df) == 0:
            raise ValueError("Import resulted in empty DataFrame")
        if "DATE" not in ensemble_df.columns:
            raise ValueError("No DATE column present in input data")
        if "REAL" not in ensemble_df.columns:
            raise ValueError("No REAL column present in input data")

        ProviderImplArrowPresampled.write_backing_store_from_ensemble_dataframe(
            self._storage_dir, storage_key, ensemble_df)
        et_write_s = timer.lap_s()

        provider = ProviderImplArrowPresampled.from_backing_store(
            self._storage_dir, storage_key)
        if not provider:
            raise ValueError(f"Failed to load/create provider for {csv_file}")

        LOGGER.info(
            f"Saved summary provider (CSV) to backing store in {timer.elapsed_s():.2f}s ("
            f"import_csv={et_import_csv_s:.2f}s, "
            f"write={et_write_s:.2f}s, "
            f"csv_file={csv_file})")

        return provider
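
The docstring above requires REAL and DATE columns next to the numeric vectors. A tiny illustrative frame that would pass the checks in this method (all values are made up):

import pandas as pd

ensemble_df = pd.DataFrame(
    {
        "REAL": [0, 0, 1, 1],
        "DATE": pd.to_datetime(["2020-01-01", "2020-02-01"] * 2),
        "FOPT": [0.0, 10.0, 0.0, 12.5],  # an example summary vector column
    }
)
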
Example #18
    def write_backing_store(
        storage_dir: Path,
        storage_key: str,
        sim_fault_polygons: List[FaultPolygonsFileInfo],
    ) -> None:

        timer = PerfTimer()

        # All data for this provider will be stored inside a sub-directory
        # given by the storage key
        provider_dir = storage_dir / storage_key
        LOGGER.debug(
            f"Writing fault polygons backing store to: {provider_dir}")
        provider_dir.mkdir(parents=True, exist_ok=True)
        (provider_dir / REL_SIM_DIR).mkdir(parents=True, exist_ok=True)

        type_arr: List[FaultPolygonsType] = []
        real_arr: List[int] = []
        attribute_arr: List[str] = []
        name_arr: List[str] = []
        rel_path_arr: List[str] = []
        original_path_arr: List[str] = []

        for fault_polygons_info in sim_fault_polygons:
            rel_path_in_store = _compose_rel_sim_fault_polygons_path(
                real=fault_polygons_info.real,
                attribute=fault_polygons_info.attribute,
                name=fault_polygons_info.name,
                extension=Path(fault_polygons_info.path).suffix,
            )
            type_arr.append(FaultPolygonsType.SIMULATED)
            real_arr.append(fault_polygons_info.real)
            attribute_arr.append(fault_polygons_info.attribute)
            name_arr.append(fault_polygons_info.name)
            rel_path_arr.append(str(rel_path_in_store))
            original_path_arr.append(fault_polygons_info.path)

        LOGGER.debug(
            f"Copying {len(original_path_arr)} fault polygons into backing store..."
        )
        timer.lap_s()
        _copy_fault_polygons_into_provider_dir(original_path_arr, rel_path_arr,
                                               provider_dir)
        et_copy_s = timer.lap_s()

        fault_polygons_inventory_df = pd.DataFrame(
            {
                Col.TYPE: type_arr,
                Col.REAL: real_arr,
                Col.ATTRIBUTE: attribute_arr,
                Col.NAME: name_arr,
                Col.REL_PATH: rel_path_arr,
                Col.ORIGINAL_PATH: original_path_arr,
            }
        )

        parquet_file_name = provider_dir / "fault_polygons_inventory.parquet"
        fault_polygons_inventory_df.to_parquet(path=parquet_file_name)

        LOGGER.debug(
            f"Wrote fault polygons backing store in: {timer.elapsed_s():.2f}s ("
            f"copy={et_copy_s:.2f}s)")