Python ScratchEnsemble.get_df Examples, fmu.ensemble.ScratchEnsemble.get_df Python Examples

Example #1

0

Show file

def test_reek001_scalars():
    """Test import of scalar values from files

    Files with scalar values can contain numerics or strings,
    or be empty."""

    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    reekensemble = ScratchEnsemble(
        "reektest",
        testdir + "/data/testensemble-reek001/" + "realization-*/iter-0")

    assert "OK" in reekensemble.keys()
    assert isinstance(reekensemble.get_df("OK"), pd.DataFrame)
    assert len(reekensemble.get_df("OK")) == 5

    # One of the npv.txt files contains the string "error!"
    reekensemble.load_scalar("npv.txt")
    npv = reekensemble.get_df("npv.txt")
    assert isinstance(npv, pd.DataFrame)
    assert "REAL" in npv
    assert "npv.txt" in npv  # filename is the column name
    print(npv)
    assert len(npv) == 5
    assert npv.dtypes["REAL"] == int
    assert npv.dtypes["npv.txt"] == object
    # This is undesirable, can cause trouble with aggregation
    # Try again:
    reekensemble.load_scalar("npv.txt",
                             force_reread=True,
                             convert_numeric=True)
    npv = reekensemble.get_df("npv.txt")
    assert npv.dtypes["npv.txt"] == int or npv.dtypes["npv.txt"] == float
    assert len(npv) == 4  # the error should now be removed

    reekensemble.load_scalar("emptyscalarfile")  # missing in real-4
    assert len(reekensemble.get_df("emptyscalarfile")) == 4
    assert "emptyscalarfile" in reekensemble.keys()
    # Use when filter is merged.
    # assert len(reekensemble.filter('emptyscalarfile', inplace=True)) == 4

    # If we try to read the empty files as numerical values, we should get
    # nothing back:
    with pytest.raises(ValueError):
        reekensemble.load_scalar("emptyscalarfile",
                                 force_reread=True,
                                 convert_numeric=True)

    with pytest.raises(ValueError):
        reekensemble.load_scalar("nonexistingfile")

Example #2

0

Show file

File: test_ensemble.py Project: equinor/fmu-ensemble

def test_apply(tmpdir):
    """
    Test the callback functionality
    """
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    tmpdir.chdir()

    symlink_iter(os.path.join(testdir, "data/testensemble-reek001"), "iter-0")

    ens = ScratchEnsemble("reektest", "realization-*/iter-0")

    def ex_func1():
        """Example function that will return a constant dataframe"""
        return pd.DataFrame(index=["1", "2"],
                            columns=["foo", "bar"],
                            data=[[1, 2], [3, 4]])

    result = ens.apply(ex_func1)
    assert isinstance(result, pd.DataFrame)
    assert "REAL" in result.columns
    assert len(result) == 10

    # Check that we can internalize as well
    ens.apply(ex_func1, localpath="df-1234")
    int_df = ens.get_df("df-1234")
    assert "REAL" in int_df
    assert len(int_df) == len(result)

    if SKIP_FMU_TOOLS:
        return

    # Test if we can wrap the volumetrics-parser in fmu.tools:
    # It cannot be applied directly, as we need to combine the
    # realization's root directory with the relative path coming in:

    def rms_vol2df(kwargs):
        """Example function for bridging with fmu.tools to parse volumetrics"""
        fullpath = os.path.join(kwargs["realization"].runpath(),
                                kwargs["filename"])
        # The supplied callback should not fail too easy.
        if os.path.exists(fullpath):
            return volumetrics.rmsvolumetrics_txt2df(fullpath)
        return pd.DataFrame()

    rmsvols_df = ens.apply(rms_vol2df,
                           filename="share/results/volumes/" +
                           "geogrid_vol_oil_1.txt")
    assert rmsvols_df["STOIIP_OIL"].sum() > 0
    assert len(rmsvols_df["REAL"].unique()) == 4

Example #3

0

Show file

def test_batch():
    """Test batch processing at time of object initialization"""
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    ens = ScratchEnsemble(
        "reektest",
        testdir + "/data/testensemble-reek001/" + "realization-*/iter-0",
        batch=[
            {"load_scalar": {"localpath": "npv.txt"}},
            {"load_smry": {"column_keys": "FOPT", "time_index": "yearly"}},
            {"load_smry": {"column_keys": "*", "time_index": "daily"}},
        ],
    )
    assert len(ens.get_df("npv.txt")) == 5
    assert len(ens.get_df("unsmry--daily")["FOPR"]) == 5490
    assert len(ens.get_df("unsmry--yearly")["FOPT"]) == 25

    # Also possible to batch process afterwards:
    ens = ScratchEnsemble(
        "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )
    ens.process_batch(
        batch=[
            {"load_scalar": {"localpath": "npv.txt"}},
            {"load_smry": {"column_keys": "FOPT", "time_index": "yearly"}},
            {"load_smry": {"column_keys": "*", "time_index": "daily"}},
        ]
    )
    assert len(ens.get_df("npv.txt")) == 5
    assert len(ens.get_df("unsmry--daily")["FOPR"]) == 5490
    assert len(ens.get_df("unsmry--yearly")["FOPT"]) == 25

Example #4

0

Show file

def test_noparameters():
    """Test what happens when parameters.txt is missing"""

    testdir = os.path.dirname(os.path.abspath(__file__))
    reekensemble = ScratchEnsemble(
        "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )
    # Parameters.txt exist on disk, so it is loaded:
    assert not reekensemble.parameters.empty
    # Remove it each realization:
    reekensemble.remove_data("parameters.txt")
    assert reekensemble.parameters.empty

    # However, when parameters.txt is excplicitly asked for,
    # an exception should be raised:
    with pytest.raises(KeyError):
        reekensemble.get_df("parameters.txt")

    reekensemble.load_smry(time_index="yearly", column_keys="FOPT")
    assert not reekensemble.get_df("unsmry--yearly").empty
    with pytest.raises(KeyError):
        reekensemble.get_df("unsmry--yearly", merge="parameters.txt")

Example #5

0

Show file

File: aggregate_results.py Project: magnesj/webviz-subsurface-testdata

from fmu.ensemble import ScratchEnsemble

# Gather selected CSV files from each realization
# and dump them (each indivitually merged with parameters.txt)
# to share/results

ens = ScratchEnsemble("", "realization-*/iter-0")
csv_files = [
    "volumes/geogrid--oil.csv",
    "volumes/simgrid--oil.csv",
    "volumes/simulator_volume_fipnum.csv",
    "tables/rft.csv",
    "tables/unsmry--monthly.csv",
    "tables/equil.csv",
    "tables/relperm.csv",
    "tables/pvt.csv",
]

for file in csv_files:
    ens.load_csv("share/results/" + file)
    ens.get_df(file.split("/")[1],
               merge="parameters.txt").to_csv("share/results/" + file,
                                              index=False)

Example #6

0

Show file

def test_ensemble_ecl():
    """Eclipse specific functionality"""

    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    reekensemble = ScratchEnsemble(
        "reektest",
        testdir + "/data/testensemble-reek001/" + "realization-*/iter-0")

    # Eclipse summary keys:
    assert len(reekensemble.get_smrykeys("FOPT")) == 1
    assert len(reekensemble.get_smrykeys("F*")) == 49
    assert len(reekensemble.get_smrykeys(["F*", "W*"])) == 49 + 280
    assert not reekensemble.get_smrykeys("BOGUS")

    # reading ensemble dataframe
    monthly = reekensemble.load_smry(time_index="monthly")

    monthly = reekensemble.load_smry(column_keys=["F*"], time_index="monthly")
    assert monthly.columns[0] == "REAL"  # Enforce order of columns.
    assert monthly.columns[1] == "DATE"
    assert len(monthly) == 190
    # Check that the result was cached in memory, not necessarily on disk..
    assert isinstance(reekensemble.get_df("unsmry--monthly.csv"), pd.DataFrame)

    assert len(reekensemble.keys()) == 4

    # When asking the ensemble for FOPR, we also get REAL as a column
    # in return. Note that the internal stored version will be
    # overwritten by each load_smry()
    assert len(reekensemble.load_smry(column_keys=["FOPR"]).columns) == 3
    assert len(reekensemble.load_smry(column_keys=["FOP*"]).columns) == 11
    assert len(
        reekensemble.load_smry(column_keys=["FGPR", "FOP*"]).columns) == 12

    # Check that there is now a cached version with raw dates:
    assert isinstance(reekensemble.get_df("unsmry--raw.csv"), pd.DataFrame)
    # The columns are not similar, this is allowed!'

    # If you get 3205 here, it means that you are using the union of
    # raw dates from all realizations, which is not correct
    assert len(
        reekensemble.load_smry(column_keys=["FGPR", "FOP*"]).index) == 1700

    # Date list handling:
    assert len(reekensemble.get_smry_dates(freq="report")) == 641
    assert len(reekensemble.get_smry_dates(freq="raw")) == 641
    assert len(reekensemble.get_smry_dates(freq="yearly")) == 5
    assert len(reekensemble.get_smry_dates(freq="monthly")) == 38
    assert len(reekensemble.get_smry_dates(freq="daily")) == 1098
    assert len(reekensemble.get_smry_dates(freq="last")) == 1
    assert reekensemble.get_smry_dates(
        freq="last") == reekensemble.get_smry_dates(freq="last",
                                                    end_date="2050-02-01")

    assert str(reekensemble.get_smry_dates(
        freq="report")[-1]) == "2003-01-02 00:00:00"
    assert str(
        reekensemble.get_smry_dates(freq="raw")[-1]) == "2003-01-02 00:00:00"
    assert str(reekensemble.get_smry_dates(freq="yearly")[-1]) == "2004-01-01"
    assert str(reekensemble.get_smry_dates(freq="monthly")[-1]) == "2003-02-01"
    assert str(reekensemble.get_smry_dates(freq="daily")[-1]) == "2003-01-02"
    assert str(reekensemble.get_smry_dates(freq="last")[-1]) == "2003-01-02"

    assert (str(
        reekensemble.get_smry_dates(
            freq="daily", end_date="2002-03-03")[-1]) == "2002-03-03")
    assert (str(
        reekensemble.get_smry_dates(
            freq="daily", start_date="2002-03-03")[0]) == "2002-03-03")

    # Start and end outside of orig data and on the "wrong side"
    dates = reekensemble.get_smry_dates(end_date="1999-03-03")
    assert len(dates) == 1
    assert str(dates[0]) == "1999-03-03"

    dates = reekensemble.get_smry_dates(start_date="2099-03-03")
    assert len(dates) == 1
    assert str(dates[0]) == "2099-03-03"

    # Time interpolated dataframes with summary data:
    yearly = reekensemble.get_smry_dates(freq="yearly")
    assert len(reekensemble.load_smry(column_keys=["FOPT"],
                                      time_index=yearly)) == 25
    # NB: This is cached in unsmry-custom.csv, not unsmry--yearly!
    # This usage is discouraged. Use 'yearly' in such cases.

    # Check that we can shortcut get_smry_dates:
    assert len(
        reekensemble.load_smry(column_keys=["FOPT"],
                               time_index="yearly")) == 25

    assert len(reekensemble.load_smry(column_keys=["FOPR"],
                                      time_index="last")) == 5
    assert isinstance(reekensemble.get_df("unsmry--last.csv"), pd.DataFrame)

    # Eclipse well names list
    assert len(reekensemble.get_wellnames("OP*")) == 5
    assert len(reekensemble.get_wellnames(None)) == 8
    assert len(reekensemble.get_wellnames()) == 8
    assert not reekensemble.get_wellnames("")
    assert len(reekensemble.get_wellnames(["OP*", "WI*"])) == 8

    # eclipse well groups list
    assert len(reekensemble.get_groupnames()) == 3

    # delta between two ensembles
    diff = reekensemble - reekensemble
    assert len(
        diff.get_smry(column_keys=["FOPR", "FGPR", "FWCT"]).columns) == 5

    # eclipse summary vector statistics for a given ensemble
    df_stats = reekensemble.get_smry_stats(column_keys=["FOPR", "FGPR"],
                                           time_index="monthly")
    assert isinstance(df_stats, pd.DataFrame)
    assert len(df_stats.columns) == 2
    assert isinstance(df_stats["FOPR"]["mean"], pd.Series)
    assert len(df_stats["FOPR"]["mean"].index) == 38

    # check if wild cards also work for get_smry_stats
    df_stats = reekensemble.get_smry_stats(column_keys=["FOP*", "FGP*"],
                                           time_index="monthly")
    assert len(df_stats.columns) == len(
        reekensemble.get_smrykeys(["FOP*", "FGP*"]))

    # Check webviz requirements for dataframe
    stats = df_stats.index.levels[0]
    assert "minimum" in stats
    assert "maximum" in stats
    assert "p10" in stats
    assert "p90" in stats
    assert "mean" in stats
    assert df_stats["FOPR"]["minimum"].iloc[-2] < df_stats["FOPR"][
        "maximum"].iloc[-2]

    # Check user supplied quantiles
    df_stats = reekensemble.get_smry_stats(column_keys=["FOPT"],
                                           time_index="yearly",
                                           quantiles=[0, 15, 50, 85, 100])
    statistics = df_stats.index.levels[0]
    assert "p0" in statistics
    assert "p15" in statistics
    assert "p50" in statistics
    assert "p85" in statistics
    assert "p100" in statistics

    # For oil industry, p15 on FOPT should yield a larger value than p85.
    # But the quantiles we get out follows the rest of the world
    # so we check for the opposite.
    assert df_stats["FOPT"]["p85"][-1] > df_stats["FOPT"]["p15"][-1]

    with pytest.raises(ValueError):
        reekensemble.get_smry_stats(column_keys=["FOPT"],
                                    time_index="yearly",
                                    quantiles=["foobar"])

    noquantiles = reekensemble.get_smry_stats(column_keys=["FOPT"],
                                              time_index="yearly",
                                              quantiles=[])
    assert len(noquantiles.index.levels[0]) == 3

Example #7

0

Show file

def test_reek001(tmp="TMP"):
    """Test import of a stripped 5 realization ensemble"""

    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    reekensemble = ScratchEnsemble(
        "reektest",
        testdir + "/data/testensemble-reek001/" + "realization-*/iter-0")
    assert isinstance(reekensemble, ScratchEnsemble)
    assert reekensemble.name == "reektest"
    assert len(reekensemble) == 5

    assert isinstance(reekensemble[0], ScratchRealization)

    assert len(
        reekensemble.files[reekensemble.files.LOCALPATH == "jobs.json"]) == 5
    assert (len(reekensemble.files[reekensemble.files.LOCALPATH ==
                                   "parameters.txt"]) == 5)
    assert len(
        reekensemble.files[reekensemble.files.LOCALPATH == "STATUS"]) == 5

    statusdf = reekensemble.get_df("STATUS")
    assert len(statusdf) == 250  # 5 realizations, 50 jobs in each
    assert "REAL" in statusdf.columns
    assert "FORWARD_MODEL" in statusdf.columns
    statusdf = statusdf.set_index(["REAL", "FORWARD_MODEL"]).sort_index()
    assert "DURATION" in statusdf.columns  # calculated
    assert "argList" in statusdf.columns  # from jobs.json

    # Sample check the duration for RMS in realization 4:
    assert int(statusdf.loc[4, "RMS_BATCH"]["DURATION"].values[0]) == 195

    # STATUS in real4 is modified to simulate that Eclipse never finished:
    assert numpy.isnan(statusdf.loc[4,
                                    "ECLIPSE100_2014.2"]["DURATION"].values[0])

    if not os.path.exists(tmp):
        os.mkdir(tmp)
    statusdf.to_csv(os.path.join(tmp, "status.csv"), index=False)

    # Parameters.txt
    paramsdf = reekensemble.load_txt("parameters.txt")
    assert len(paramsdf) == 5  # 5 realizations
    paramsdf = reekensemble.parameters  # also test as property
    paramsdf = reekensemble.get_df("parameters.txt")
    assert len(paramsdf) == 5
    assert len(paramsdf.columns) == 26  # 25 parameters, + REAL column
    paramsdf.to_csv(os.path.join(tmp, "params.csv"), index=False)

    # Check that the ensemble object has not tainted the realization dataframe:
    assert "REAL" not in reekensemble._realizations[0].get_df("parameters.txt")

    # The column FOO in parameters is only present in some, and
    # is present with NaN in real0:
    assert "FOO" in reekensemble.parameters.columns
    assert len(reekensemble.parameters["FOO"].dropna()) == 1
    # (NaN ine one real, and non-existing in the others is the same thing)

    # Test loading of another txt file:
    reekensemble.load_txt("outputs.txt")
    assert "NPV" in reekensemble.load_txt("outputs.txt").columns
    # Check implicit discovery
    assert "outputs.txt" in reekensemble.files["LOCALPATH"].values
    assert all([os.path.isabs(x) for x in reekensemble.files["FULLPATH"]])

    # File discovery:
    csvvolfiles = reekensemble.find_files("share/results/volumes/*csv",
                                          metadata={"GRID": "simgrid"})
    assert isinstance(csvvolfiles, pd.DataFrame)
    assert "REAL" in csvvolfiles
    assert "FULLPATH" in csvvolfiles
    assert "LOCALPATH" in csvvolfiles
    assert "BASENAME" in csvvolfiles
    # Check the explicit metadata:
    assert "GRID" in csvvolfiles
    assert csvvolfiles["GRID"].unique() == ["simgrid"]

    reekensemble.files.to_csv(os.path.join(tmp, "files.csv"), index=False)

    # Check that rediscovery does not mess things up:

    filecount = len(reekensemble.files)
    newfiles = reekensemble.find_files("share/results/volumes/*csv")
    # Also note that we skipped metadata here in rediscovery:

    assert len(reekensemble.files) == filecount
    assert len(newfiles) == len(csvvolfiles)

    # The last invocation of find_files() should not return the metadata
    assert len(newfiles.columns) + 1 == len(csvvolfiles.columns)

    # FULLPATH should always contain absolute paths
    assert all([os.path.isabs(x) for x in reekensemble.files["FULLPATH"]])

    # The metadata in the rediscovered files should have been removed
    assert len(
        reekensemble.files[reekensemble.files["GRID"] == "simgrid"]) == 0

    # CSV files
    csvpath = "share/results/volumes/simulator_volume_fipnum.csv"
    vol_df = reekensemble.load_csv(csvpath)

    # Check that we have not tainted the realization dataframes:
    assert "REAL" not in reekensemble._realizations[0].get_df(csvpath)

    assert "REAL" in vol_df
    assert len(vol_df["REAL"].unique()) == 3  # missing in 2 reals
    vol_df.to_csv(os.path.join(tmp, "simulatorvolumes.csv"), index=False)

    # Test retrival of cached data
    vol_df2 = reekensemble.get_df(csvpath)

    assert "REAL" in vol_df2
    assert len(vol_df2["REAL"].unique()) == 3  # missing in 2 reals

    # Realization deletion:
    reekensemble.remove_realizations([1, 3])
    assert len(reekensemble) == 3

    # Readd the same realizations
    reekensemble.add_realizations([
        testdir + "/data/testensemble-reek001/" + "realization-1/iter-0",
        testdir + "/data/testensemble-reek001/" + "realization-3/iter-0",
    ])
    assert len(reekensemble) == 5
    assert len(reekensemble.files) == 24

    # File discovery must be repeated for the newly added realizations
    reekensemble.find_files(
        "share/results/volumes/" + "simulator_volume_fipnum.csv",
        metadata={"GRID": "simgrid"},
    )
    assert len(reekensemble.files) == 25
    # Test addition of already added realization:
    reekensemble.add_realizations(testdir + "/data/testensemble-reek001/" +
                                  "realization-1/iter-0")
    assert len(reekensemble) == 5
    assert len(reekensemble.files) == 24  # discovered files are lost!

    keycount = len(reekensemble.keys())
    reekensemble.remove_data("parameters.txt")
    assert len(reekensemble.keys()) == keycount - 1

Example #8

0

Show file

def test_get_df():
    """Test the data retrieval functionality

    get_df() in the ensemble context is an aggregator, that will aggregate
    data from individual realaizations to the ensemble level, with
    optional merging capabilities performed on realization level."""
    testdir = os.path.dirname(os.path.abspath(__file__))
    ens = ScratchEnsemble(
        "reektest",
        testdir + "/data/testensemble-reek001/" + "realization-*/iter-0")
    smry = ens.load_smry(column_keys="FO*", time_index="yearly")
    assert not ens.get_df("unsmry--yearly").empty
    assert not ens.get_df("unsmry--yearly.csv").empty
    assert not ens.get_df("share/results/tables/unsmry--yearly").empty
    assert not ens.get_df("share/results/tables/unsmry--yearly.csv").empty
    with pytest.raises(KeyError):
        # pylint: disable=pointless-statement
        ens.get_df("unsmry--monthly")
    ens.load_smry(column_keys="FO*", time_index="monthly")
    assert not ens.get_df("unsmry--monthly").empty
    with pytest.raises(KeyError):
        # pylint: disable=pointless-statement
        ens.get_df("unsmry-monthly")

    # Tests that we can do merges directly:
    params = ens.get_df("parameters.txt")
    smryparams = ens.get_df("unsmry--yearly", merge="parameters")
    # The set union is to handle the REAL column present in both smry and params:
    assert len(smryparams.columns) == len(
        set(smry.columns).union(params.columns))

    # Test multiple merges:
    outputs = ens.load_txt("outputs.txt")
    assert len(
        ens.get_df("unsmry--yearly",
                   merge=["parameters", "outputs.txt"]).columns) == len(
                       set(smry.columns).union(params.columns).union(
                           outputs.columns))

    # Try merging dataframes:
    ens.load_csv("share/results/volumes/simulator_volume_fipnum.csv")

    # Inject a mocked dataframe to the realization, there is
    # no "add_data" API for ensembles, but we can use the apply()
    # functionality
    def fipnum2zone():
        """Helper function for injecting mocked frame into
        each realization"""
        return pd.DataFrame(
            columns=["FIPNUM", "ZONE"],
            data=[
                [1, "UpperReek"],
                [2, "MidReek"],
                [3, "LowerReek"],
                [4, "UpperReek"],
                [5, "MidReek"],
                [6, "LowerReek"],
            ],
        )

    ens.apply(fipnum2zone, localpath="fipnum2zone")
    volframe = ens.get_df("simulator_volume_fipnum", merge="fipnum2zone")

    assert "ZONE" in volframe
    assert "FIPNUM" in volframe
    assert "STOIIP_OIL" in volframe
    assert len(volframe["ZONE"].unique()) == 3

    # Merge with scalar data:
    ens.load_scalar("npv.txt")
    vol_npv = ens.get_df("simulator_volume_fipnum", merge="npv.txt")
    # (this particular data combination does not really make sense)
    assert "STOIIP_OIL" in vol_npv
    assert "npv.txt" in vol_npv