def create_from_arrow_unsmry_presampled(
        self,
        ens_path: str,
        rel_file_pattern: str,
        sampling_frequency: Optional[Frequency],
    ) -> EnsembleSummaryProvider:
        """Create EnsembleSummaryProvider from per-realization unsmry data in .arrow format.

        The `rel_file_pattern` parameter must specify a relative (per-realization) file pattern
        that will be used to locate the desired .arrow files within each realization. The file
        pattern is relative to each realization's `runpath`.
        Typically the file pattern will be: "share/results/unsmry/*.arrow"

        This factory method will resample the input data to the specified
        `sampling_frequency` during import.

        The returned summary provider does not support lazy resampling, but will always
        return data with the frequency specified above.
        """

        timer = PerfTimer()

        freq_str = sampling_frequency.value if sampling_frequency else "raw"
        hash_str = _make_hash_string(ens_path + rel_file_pattern)
        storage_key = f"arrow_unsmry_presampled_{freq_str}__{hash_str}"
        provider = ProviderImplArrowPresampled.from_backing_store(
            self._storage_dir, storage_key
        )
        if provider:
            LOGGER.info(
                f"Loaded presampled summary provider from backing store in "
                f"{timer.elapsed_s():.2f}s ("
                f"sampling_frequency={sampling_frequency}, ens_path={ens_path})"
            )
            return provider

        # We can only import data from the data source if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(
                f"Failed to load presampled summary provider for {ens_path}"
            )

        LOGGER.info(f"Importing/saving arrow summary data for: {ens_path}")

        timer.lap_s()
        per_real_tables = load_per_realization_arrow_unsmry_files(
            ens_path, rel_file_pattern
        )
        if not per_real_tables:
            raise ValueError(
                f"Could not find any .arrow unsmry files for ens_path={ens_path}"
            )
        et_import_smry_s = timer.lap_s()

        if sampling_frequency is not None:
            for real_num, table in per_real_tables.items():
                per_real_tables[real_num] = resample_single_real_table(
                    table, sampling_frequency
                )
        et_resample_s = timer.lap_s()

        ProviderImplArrowPresampled.write_backing_store_from_per_realization_tables(
            self._storage_dir, storage_key, per_real_tables
        )
        et_write_s = timer.lap_s()

        provider = ProviderImplArrowPresampled.from_backing_store(
            self._storage_dir, storage_key
        )
        if not provider:
            raise ValueError(f"Failed to load/create provider for {ens_path}")

        LOGGER.info(
            f"Saved presampled summary provider to backing store in {timer.elapsed_s():.2f}s ("
            f"import_smry={et_import_smry_s:.2f}s, "
            f"resample={et_resample_s:.2f}s, "
            f"write={et_write_s:.2f}s, "
            f"ens_path={ens_path})"
        )

        return provider

    def create_from_arrow_unsmry_lazy(
        self, ens_path: str, rel_file_pattern: str
    ) -> EnsembleSummaryProvider:
        """Create EnsembleSummaryProvider from per-realization unsmry data in .arrow format.

        The `rel_file_pattern` parameter must specify a relative (per-realization) file pattern
        that will be used to locate the desired .arrow files within each realization. The file
        pattern is relative to each realization's `runpath`.
        Typically the file pattern will be: "share/results/unsmry/*.arrow"

        The returned summary provider supports lazy resampling.
        """

        timer = PerfTimer()

        storage_key = (
            f"arrow_unsmry_lazy__{_make_hash_string(ens_path + rel_file_pattern)}"
        )
        provider = ProviderImplArrowLazy.from_backing_store(
            self._storage_dir, storage_key
        )
        if provider:
            LOGGER.info(
                f"Loaded lazy summary provider from backing store in {timer.elapsed_s():.2f}s ("
                f"ens_path={ens_path})"
            )
            return provider

        # We can only import data from the data source if storage writes are allowed
        if not self._allow_storage_writes:
            raise ValueError(f"Failed to load lazy summary provider for {ens_path}")

        LOGGER.info(f"Importing/saving arrow summary data for: {ens_path}")

        timer.lap_s()
        per_real_tables = load_per_realization_arrow_unsmry_files(
            ens_path, rel_file_pattern
        )
        if not per_real_tables:
            raise ValueError(
                f"Could not find any .arrow unsmry files for ens_path={ens_path}"
            )
        et_import_smry_s = timer.lap_s()

        try:
            ProviderImplArrowLazy.write_backing_store_from_per_realization_tables(
                self._storage_dir, storage_key, per_real_tables
            )
        except ValueError as exc:
            raise ValueError(f"Failed to write backing store for: {ens_path}") from exc

        et_write_s = timer.lap_s()

        provider = ProviderImplArrowLazy.from_backing_store(
            self._storage_dir, storage_key
        )
        if not provider:
            raise ValueError(f"Failed to load/create lazy provider for {ens_path}")

        LOGGER.info(
            f"Saved lazy summary provider to backing store in {timer.elapsed_s():.2f}s ("
            f"import_smry={et_import_smry_s:.2f}s, write={et_write_s:.2f}s, ens_path={ens_path})"
        )

        return provider
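
Before the lower-level write routines, here is a hedged usage sketch of the two factory methods above. The import path, the factory construction, the example paths, and the `Frequency.MONTHLY` value are illustrative assumptions; only the two `create_*` calls and their parameters come from the listing.

# Hedged usage sketch -- import path, constructor signature and argument values
# below are assumptions; only the create_* calls mirror the methods shown above.
from pathlib import Path

from webviz_subsurface._providers import (  # assumed import path
    EnsembleSummaryProviderFactory,
    Frequency,
)

factory = EnsembleSummaryProviderFactory(  # assumed constructor, parameter names
    Path("/tmp/provider_storage"),         # guessed from self._storage_dir and
    allow_storage_writes=True,             # self._allow_storage_writes above
)

# Presampled: data is resampled once at import time to a fixed frequency.
provider_presampled = factory.create_from_arrow_unsmry_presampled(
    ens_path="/scratch/my_field/my_case/realization-*/iter-0",  # illustrative
    rel_file_pattern="share/results/unsmry/*.arrow",
    sampling_frequency=Frequency.MONTHLY,  # illustrative frequency value
)

# Lazy: raw data is stored as-is and resampling happens on demand at query time.
provider_lazy = factory.create_from_arrow_unsmry_lazy(
    ens_path="/scratch/my_field/my_case/realization-*/iter-0",  # illustrative
    rel_file_pattern="share/results/unsmry/*.arrow",
)
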
Code example #3
    def write_backing_store_from_ensemble_dataframe(
            storage_dir: Path, storage_key: str,
            ensemble_df: pd.DataFrame) -> None:
        @dataclass
        class Elapsed:
            convert_date_s: float = -1
            table_from_pandas_s: float = -1
            find_and_store_min_max_s: float = -1
            sorting_s: float = -1
            write_s: float = -1

        elapsed = Elapsed()

        arrow_file_name = storage_dir / (storage_key + ".arrow")
        LOGGER.debug(
            f"Writing backing store from ensemble dataframe to arrow file: {arrow_file_name}"
        )
        timer = PerfTimer()

        # Force data type in the incoming DataFrame's DATE column to datetime.datetime objects
        # This is the first step in coercing pyarrow to always store DATEs as timestamps
        ensemble_df = make_date_column_datetime_object(ensemble_df)
        elapsed.convert_date_s = timer.lap_s()

        # By default, we'll now end up with a schema that has timestamp[ns] for the DATE column
        # We therefore modify the retrieved schema and specify usage of timestamp[ms] instead
        default_schema = pa.Schema.from_pandas(ensemble_df,
                                               preserve_index=False)
        schema_to_use = _set_date_column_type_to_timestamp_ms(default_schema)

        # For experimenting with conversion to float
        # timer.lap_s()
        # schema_to_use = _create_float_downcasting_schema(schema_to_use)
        # LOGGER.info(
        #     f"Created schema for float downcasting in : {timer.lap_s():.2f}s"
        # )

        timer.lap_s()
        table = pa.Table.from_pandas(ensemble_df,
                                     schema=schema_to_use,
                                     preserve_index=False)
        elapsed.table_from_pandas_s = timer.lap_s()

        # We're done with the dataframe
        del ensemble_df

        # Find per column min/max values and then store them as metadata on table's schema
        timer.lap_s()
        per_vector_min_max = find_min_max_for_numeric_table_columns(table)
        table = add_per_vector_min_max_to_table_schema_metadata(
            table, per_vector_min_max)
        elapsed.find_and_store_min_max_s = timer.lap_s()

        table = _sort_table_on_date_then_real(table)
        elapsed.sorting_s = timer.lap_s()

        # feather.write_feather(table, dest=arrow_file_name)
        feather.write_feather(table,
                              dest=arrow_file_name,
                              compression="uncompressed")
        elapsed.write_s = timer.lap_s()

        LOGGER.debug(
            f"Wrote backing store to arrow file in: {timer.elapsed_s():.2f}s ("
            f"convert_date={elapsed.convert_date_s:.2f}s, "
            f"table_from_pandas={elapsed.table_from_pandas_s:.2f}s, "
            f"find_and_store_min_max={elapsed.find_and_store_min_max_s:.2f}s, "
            f"sorting={elapsed.sorting_s:.2f}s, "
            f"write={elapsed.write_s:.2f}s)")
Code example #4
    def write_backing_store_from_per_realization_tables(
            storage_dir: Path, storage_key: str,
            per_real_tables: Dict[int, pa.Table]) -> None:
        @dataclass
        class Elapsed:
            concat_tables_s: float = -1
            build_add_real_col_s: float = -1
            sorting_s: float = -1
            find_and_store_min_max_s: float = -1
            write_s: float = -1

        elapsed = Elapsed()

        arrow_file_name = storage_dir / (storage_key + ".arrow")
        LOGGER.debug(
            f"Writing backing store from per real tables to arrow file: {arrow_file_name}"
        )
        timer = PerfTimer()

        unique_column_names = set()
        for table in per_real_tables.values():
            unique_column_names.update(table.schema.names)
        LOGGER.debug(f"Concatenating {len(per_real_tables)} tables with "
                     f"{len(unique_column_names)} unique column names")

        timer.lap_s()
        full_table = pa.concat_tables(per_real_tables.values(), promote=True)
        elapsed.concat_tables_s = timer.lap_s()

        real_arr = np.empty(full_table.num_rows, np.int32)
        table_start_idx = 0
        for real_num, real_table in per_real_tables.items():
            real_arr[table_start_idx:table_start_idx +
                     real_table.num_rows] = real_num
            table_start_idx += real_table.num_rows

        full_table = full_table.add_column(0, "REAL", pa.array(real_arr))
        elapsed.build_add_real_col_s = timer.lap_s()

        # Find per column min/max values and then store them as metadata on table's schema
        per_vector_min_max = find_min_max_for_numeric_table_columns(full_table)
        full_table = add_per_vector_min_max_to_table_schema_metadata(
            full_table, per_vector_min_max)
        elapsed.find_and_store_min_max_s = timer.lap_s()

        full_table = _sort_table_on_date_then_real(full_table)
        elapsed.sorting_s = timer.lap_s()

        # feather.write_feather(full_table, dest=arrow_file_name)
        feather.write_feather(full_table,
                              dest=arrow_file_name,
                              compression="uncompressed")
        elapsed.write_s = timer.lap_s()

        LOGGER.debug(
            f"Wrote backing store to arrow file in: {timer.elapsed_s():.2f}s ("
            f"concat_tables={elapsed.concat_tables_s:.2f}s, "
            f"build_add_real_col={elapsed.build_add_real_col_s:.2f}s, "
            f"sorting={elapsed.sorting_s:.2f}s, "
            f"find_and_store_min_max={elapsed.find_and_store_min_max_s:.2f}s, "
            f"write={elapsed.write_s:.2f}s)")
Code example #5
    def write_backing_store_from_per_realization_tables(
        storage_dir: Path, storage_key: str, per_real_tables: Dict[int, pa.Table]
    ) -> None:
        # pylint: disable=too-many-locals
        @dataclass
        class Elapsed:
            concat_tables_s: float = -1
            build_add_real_col_s: float = -1
            sorting_s: float = -1
            find_and_store_min_max_s: float = -1
            write_s: float = -1

        elapsed = Elapsed()

        arrow_file_name = storage_dir / (storage_key + ".arrow")
        LOGGER.debug(f"Writing backing store to arrow file: {arrow_file_name}")
        timer = PerfTimer()

        unique_column_names = set()
        for table in per_real_tables.values():
            unique_column_names.update(table.schema.names)

            if "REAL" in table.schema.names:
                raise ValueError("Input tables should not have REAL column")

            if table.schema.field("DATE").type != pa.timestamp("ms"):
                raise ValueError("DATE column must have timestamp[ms] data type")

            if not _is_date_column_sorted(table):
                raise ValueError("DATE column must be sorted")

        LOGGER.debug(
            f"Concatenating {len(per_real_tables)} tables with "
            f"{len(unique_column_names)} unique column names"
        )

        full_table = pa.concat_tables(per_real_tables.values(), promote=True)
        elapsed.concat_tables_s = timer.lap_s()

        real_arr = np.empty(full_table.num_rows, np.int32)
        table_start_idx = 0
        for real_num, real_table in per_real_tables.items():
            real_arr[table_start_idx : table_start_idx + real_table.num_rows] = real_num
            table_start_idx += real_table.num_rows

        full_table = full_table.add_column(0, "REAL", pa.array(real_arr))
        elapsed.build_add_real_col_s = timer.lap_s()

        # Must sort table on real since interpolations work per realization
        # and we utilize slicing for speed
        full_table = _sort_table_on_real_then_date(full_table)
        elapsed.sorting_s = timer.lap_s()

        # Find per column min/max values and store them as metadata on table's schema
        per_vector_min_max = find_min_max_for_numeric_table_columns(full_table)
        full_table = add_per_vector_min_max_to_table_schema_metadata(
            full_table, per_vector_min_max
        )
        elapsed.find_and_store_min_max_s = timer.lap_s()

        # feather.write_feather(full_table, dest=arrow_file_name)
        with pa.OSFile(str(arrow_file_name), "wb") as sink:
            with pa.RecordBatchFileWriter(sink, full_table.schema) as writer:
                writer.write_table(full_table)
        elapsed.write_s = timer.lap_s()

        LOGGER.debug(
            f"Wrote backing store to arrow file in: {timer.elapsed_s():.2f}s ("
            f"concat_tables={elapsed.concat_tables_s:.2f}s, "
            f"build_add_real_col={elapsed.build_add_real_col_s:.2f}s, "
            f"sorting={elapsed.sorting_s:.2f}s, "
            f"find_and_store_min_max={elapsed.find_and_store_min_max_s:.2f}s, "
            f"write={elapsed.write_s:.2f}s)"
        )
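
Finally, the sorting and validation helpers (`_sort_table_on_real_then_date`, `_sort_table_on_date_then_real`, `_is_date_column_sorted`) also live outside this excerpt. Assuming a reasonably recent pyarrow (with `Table.sort_by`), they can be sketched roughly as follows; the real implementations may differ.

import pyarrow as pa
import pyarrow.compute as pc


def _sort_table_on_real_then_date(table: pa.Table) -> pa.Table:
    # Sort by REAL first, then DATE, so each realization ends up as a
    # contiguous block of rows that can be sliced out cheaply.
    return table.sort_by([("REAL", "ascending"), ("DATE", "ascending")])


def _sort_table_on_date_then_real(table: pa.Table) -> pa.Table:
    # Same idea with the sort keys swapped, as used by the presampled writer.
    return table.sort_by([("DATE", "ascending"), ("REAL", "ascending")])


def _is_date_column_sorted(table: pa.Table) -> bool:
    # The DATE column is sorted if no element is smaller than its predecessor.
    dates = table.column("DATE")
    if len(dates) < 2:
        return True
    return pc.all(pc.greater_equal(dates[1:], dates[:-1])).as_py()
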