def test_volumetric_rates():
    """Check volumetric rates obtained from a virtual ensemble.

    Only the aggregation over realizations is exercised here.
    """
    if "__file__" not in globals():
        testdir = os.path.abspath(".")
    else:
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))

    ens_glob = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    reekensemble = ScratchEnsemble("reektest", ens_glob)
    reekensemble.load_smry(time_index="yearly", column_keys=["F*"])
    reekensemble.load_scalar("npv.txt")

    vens = reekensemble.to_virtual()
    vol_rates = vens.get_volumetric_rates(column_keys="FOPT", time_index="yearly")

    assert isinstance(vol_rates, pd.DataFrame)
    # FOPT was requested; the returned frame is expected to carry FOPR.
    for expected_column in ("REAL", "DATE", "FOPR"):
        assert expected_column in vol_rates
    assert len(vol_rates) == 25
def test_get_df_merge():
    """Exercise the merge argument of get_df() on a virtual ensemble."""
    if "__file__" not in globals():
        testdir = os.path.abspath(".")
    else:
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))

    def ncols(frame):
        """Number of columns in a dataframe."""
        return len(frame.columns)

    ens_glob = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    reekensemble = ScratchEnsemble("reektest", ens_glob)
    reekensemble.load_smry(time_index="yearly", column_keys=["F*"])
    reekensemble.load_scalar("npv.txt")
    reekensemble.load_csv("share/results/volumes/simulator_volume_fipnum.csv")
    outputs = reekensemble.load_txt("outputs.txt")
    vens = reekensemble.to_virtual()

    params = vens.get_df("parameters.txt")
    smrycount = ncols(vens.get_df("unsmry--yearly"))

    # Each merge shares the REAL column with the left-hand table, hence
    # one column is subtracted per merged table.
    smry_with_params = vens.get_df("unsmry--yearly", merge="parameters")
    assert ncols(smry_with_params) == ncols(params) + smrycount - 1

    params_with_outputs = vens.get_df("parameters", merge=["outputs"])
    assert ncols(params_with_outputs) == ncols(params) + ncols(outputs) - 1

    smry_with_both = vens.get_df("unsmry--yearly", merge=["parameters", "outputs"])
    assert ncols(smry_with_both) == smrycount + ncols(params) + ncols(outputs) - 2

    # Merging in a scalar adds exactly one column, in either direction:
    params_with_npv = vens.get_df("parameters", merge="npv.txt")
    assert ncols(params_with_npv) == ncols(params) + 1
    npv_with_params = vens.get_df("npv.txt", merge="parameters.txt")
    assert ncols(npv_with_params) == ncols(params) + 1

    # Merge with zone data; a mocked mapping frame is injected directly
    # into the virtual ensemble's data:
    zone_names = ["UpperReek", "MidReek", "LowerReek"] * 2
    vens.data["fipnum2zone"] = pd.DataFrame(
        columns=["FIPNUM", "ZONE"],
        data=[[fipnum, zone] for fipnum, zone in enumerate(zone_names, start=1)],
    )
    volframe = vens.get_df("simulator_volume_fipnum", merge="fipnum2zone")
    for expected_column in ("ZONE", "FIPNUM", "STOIIP_OIL"):
        assert expected_column in volframe
    assert len(volframe["ZONE"].unique()) == 3
def test_manual_aggregation():
    """Check that a hand-built mean of the realizations agrees with
    agg("mean") on the ensemble."""
    if "__file__" not in globals():
        testdir = os.path.abspath(".")
    else:
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))

    ens_glob = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    reekensemble = ScratchEnsemble("reektest", ens_glob)
    reekensemble.load_smry(time_index="yearly", column_keys=["F*"])
    reekensemble.load_csv("share/results/volumes/simulator_volume_fipnum.csv")

    # The ensemble's own aggregation into a virtual "mean" realization
    mean = reekensemble.agg("mean")

    # The same quantity built by adding the five members one by one
    # (left to right) and scaling the sum.
    realization_sum = reekensemble[0]
    for realidx in range(1, 5):
        realization_sum = realization_sum + reekensemble[realidx]
    manualmean = 1 / 5 * realization_sum

    # Both routes must give the same parameter value:
    manual_seed = manualmean["parameters"]["RMS_SEED"]
    assert mean["parameters"]["RMS_SEED"] == manual_seed
def test_get_smry_meta(tmpdir):
    """Check that summary meta-data survives virtualization and a
    round-trip to disk."""
    if "__file__" not in globals():
        testdir = os.path.abspath(".")
    else:
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))

    ens_glob = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    reekensemble = ScratchEnsemble("reekmetatest", ens_glob)

    # Without any smry loaded, virtualization carries no metadata:
    assert "__smry_metadata" not in reekensemble.to_virtual().keys()

    reekensemble.load_smry(time_index="yearly", column_keys=["F*"])
    origmeta = reekensemble.get_smry_meta()
    vens = reekensemble.to_virtual()
    assert "__smry_metadata" in vens.keys()

    # The internal storage is a DataFrame; since get_df() exposes it,
    # its type is almost part of the API and is therefore checked.
    meta = vens.get_df("__smry_metadata")
    assert isinstance(meta, pd.DataFrame)

    def check_against_vens(metadict):
        """Compare a metadata dict with the in-memory virtual ensemble."""
        assert isinstance(metadict, dict)
        # The vens only knows of F* columns, so both patterns give the
        # same frame. The smry frame has two more columns than there are
        # metadata entries (presumably REAL and DATE -- TODO confirm).
        for pattern in ("*", "F*"):
            smry = vens.get_smry(time_index="yearly", column_keys=pattern)
            assert len(metadict) + 2 == len(smry.columns)
        for vector in ("FOPT", "FWPTH"):
            assert origmeta[vector] == metadict[vector]

    # Users are supposed to go through get_smry_meta() rather than the
    # internal __smry_metadata frame:
    check_against_vens(vens.get_smry_meta())

    assert not vens.get_smry_meta([])
    assert vens.get_smry_meta(column_keys="FOPT")["FOPT"] == origmeta["FOPT"]
    assert not vens.get_smry_meta(column_keys="WOPT:NOTEXISTING")

    # The metadata must also be retrievable after dumping to disk.
    # NOTE(review): column counts are still taken from the in-memory
    # vens, not from disk_vens.
    vens_disk_path = str(tmpdir.join("vens_dumped"))
    vens.to_disk(vens_disk_path)
    disk_vens = VirtualEnsemble(fromdisk=vens_disk_path)
    check_against_vens(disk_vens.get_smry_meta())
def test_todisk_includefile(tmpdir):
    """Test that we can write VirtualEnsembles to the filesystem in a
    retrievable manner with discovered files included.

    The virtual ensemble is dumped with ``includefiles=True`` and
    ``symlinks=True``; afterwards every realization must have its
    UNSMRY file present below ``__discoveredfiles``.
    """
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")
    reekensemble = ScratchEnsemble(
        "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )

    # Dump relative to the temporary directory provided by pytest
    tmpdir.chdir()

    reekensemble.load_smry(time_index="monthly", column_keys="*")
    reekensemble.load_smry(time_index="daily", column_keys="*")
    reekensemble.load_smry(time_index="yearly", column_keys="F*")
    reekensemble.load_smry(column_keys="FOPT")
    reekensemble.load_scalar("npv.txt")
    reekensemble.load_txt("outputs.txt")

    vens = reekensemble.to_virtual()
    vens.to_disk("vens_dumped_files", delete=True, includefiles=True, symlinks=True)

    # Check all five realizations (0-4). The previous list [0, 1, 2, 4, 4]
    # skipped realization 3 and checked realization 4 twice.
    for real in range(5):
        runpath = os.path.join(
            "vens_dumped_files", "__discoveredfiles", "realization-" + str(real)
        )
        assert os.path.exists(runpath)
        assert os.path.exists(
            os.path.join(runpath, "eclipse/model/2_R001_REEK-" + str(real) + ".UNSMRY")
        )
def test_eclsumcaching():
    """Check when the realizations keep their eclsum object cached.

    Loading many UNSMRY files and keeping them all in memory is a
    problem. Garbage collection itself is not verified here, but for it
    to be possible at all the realization's _eclsum attribute must be
    None, which is what this test looks at.
    """
    if "__file__" not in globals():
        testdir = os.path.abspath(".")
    else:
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))

    dirs = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    ens = ScratchEnsemble("reektest", dirs)

    def eclsum_slots():
        """Current _eclsum attribute of every realization."""
        return [real._eclsum for (_, real) in ens.realizations.items()]

    # Caching is the default, so every slot is populated:
    ens.load_smry()
    assert all(eclsum_slots())

    # Redoing the operation without caching empties all slots.
    # (cache_eclsum=None is from v1.1.5 no longer equivalent to False)
    ens.load_smry(cache_eclsum=False)
    assert not any(eclsum_slots())

    ens.get_smry()
    assert all(eclsum_slots())

    ens.get_smry(cache_eclsum=False)
    assert not any(eclsum_slots())

    ens.get_smry_stats()
    assert all(eclsum_slots())

    ens.get_smry_stats(cache_eclsum=False)
    assert not any(eclsum_slots())

    ens.get_smry_dates()
    assert all(eclsum_slots())

    # The call above populated the cache, so it is emptied by hand
    # before the uncached call.
    for _, realization in ens.realizations.items():
        realization._eclsum = None

    ens.get_smry_dates(cache_eclsum=False)
    assert not any(eclsum_slots())
def _load_smry_dataframe_using_fmu(
    ens_path: str, frequency: Optional[Frequency]
) -> pd.DataFrame:
    """Load summary data for an ensemble into a DataFrame.

    The time index is ``frequency.value`` when a frequency is given and
    "raw" otherwise. The returned frame has its float columns downcast,
    REAL as int32, and is sorted on REAL then DATE with a fresh index.
    """
    time_index: str = frequency.value if frequency else "raw"

    print(f"## Loading data into DataFrame using FMU time_index={time_index}...")
    ensemble = ScratchEnsemble("tempEnsName", paths=ens_path)
    smry_df = ensemble.load_smry(time_index=time_index)
    smry_df = _make_date_column_datetime_object(smry_df)

    # Shrink the numeric columns: floats are downcast, REAL becomes int32
    float_columns = smry_df.select_dtypes("float").columns
    smry_df[float_columns] = smry_df[float_columns].apply(
        pd.to_numeric, downcast="float"
    )
    smry_df["REAL"] = smry_df["REAL"].astype("int32")

    # Row order is REAL first, then DATE, to align with the provider
    smry_df.sort_values(by=["REAL", "DATE"], inplace=True)
    smry_df.reset_index(drop=True, inplace=True)

    return smry_df
def test_virtual_observations():
    """Build observation objects from virtual realizations and use them
    to rank the ensemble members by similarity.

    For each aggregated (virtual) realization, the real realization with
    the smallest summed L2 mismatch on one summary vector is picked.
    """
    if "__file__" not in globals():
        testdir = os.path.abspath(".")
    else:
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))

    # The ensemble to rank:
    ens = ScratchEnsemble(
        "test", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0/"
    )
    ens.load_smry(
        column_keys=["FOPT", "FGPT", "FWPT", "FWCT", "FGOR"], time_index="yearly"
    )

    # The virtual realizations acting as "observations":
    virtreals = {
        "p90realization": ens.agg("p90"),
        "meanrealization": ens.agg("mean"),
        "p10realization": ens.agg("p10"),
    }

    summaryvector = "FOPT"

    def closest_to(virtreal):
        """Index of the realization with least total L2 mismatch."""
        # Start from an empty observation object and fill it from the
        # virtual realization's summary vector.
        obs = Observations({})
        obs.load_smry(virtreal, summaryvector, time_index="yearly")
        mismatch = obs.mismatch(ens)
        l2_per_real = mismatch.groupby("REAL").sum()["L2"]
        return l2_per_real.sort_values().index.values[0]

    representative_realizations = {}
    for virtrealname, virtreal in six.iteritems(virtreals):
        representative_realizations[virtrealname] = closest_to(virtreal)

    expected = (
        ("meanrealization", 4),
        ("p90realization", 2),
        ("p10realization", 1),
    )
    for virtrealname, realidx in expected:
        assert representative_realizations[virtrealname] == realidx
def test_get_smry_interpolation():
    """Test the summary resampling code for virtual ensembles.

    A virtual ensemble holding only yearly data is asked for monthly and
    daily summary frames, and the interpolated values are compared with
    what the originating ScratchEnsemble reports.
    """
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    reekensemble = ScratchEnsemble(
        "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )
    reekensemble.load_smry(time_index="yearly", column_keys=["F*"])
    reekensemble.load_scalar("npv.txt")
    vens_yearly = reekensemble.to_virtual()
    reekensemble.load_smry(time_index="monthly", column_keys=["F*"])
    # Create a vens that contains both monthly and yearly:
    vens_monthly = reekensemble.to_virtual()
    assert "npv.txt" in vens_monthly.keys()
    reekensemble.load_smry(time_index="daily", column_keys=["F*"])
    _ = reekensemble.to_virtual()  # monthly, yearly *and* daily

    # Resample yearly to monthly:
    monthly = vens_yearly.get_smry(column_keys="FOPT", time_index="monthly")
    assert "FOPT" in monthly.columns
    assert "REAL" in monthly.columns
    assert "DATE" in monthly.columns
    assert len(monthly["REAL"].unique()) == 5

    # 12 months pr. year, including final 1. jan, four years, 5 realizations:
    assert len(monthly) == (12 * 4 + 1) * 5

    # The reference frame does not depend on the realization, so it is
    # fetched once instead of once per loop iteration.
    true_monthly = reekensemble.get_smry(
        column_keys="FOPT", time_index="monthly"
    ).set_index("REAL")
    interpolated_monthly = monthly.set_index("REAL")

    for realidx in monthly["REAL"].unique():
        int_m = interpolated_monthly.loc[realidx].set_index("DATE")
        true_m = true_monthly.loc[realidx].set_index("DATE")
        difference = int_m["FOPT"] - true_m["FOPT"]

        # The interpolation error should be zero at each 1st of January.
        # abs() is required: without it a large negative error would
        # satisfy "< 0.0001" and go unnoticed.
        for january in ("2001-01-01", "2002-01-01", "2003-01-01"):
            assert abs(difference.loc[january]) < 0.0001

        # ... but most likely nonzero elsewhere (at least for these
        # realizations)
        for june in ("2001-06-01", "2002-06-01"):
            assert abs(difference.loc[june]) > 0

    daily = vens_yearly.get_smry(column_keys=["FOPT", "FOPR"], time_index="daily")
    assert "FOPT" in daily.columns
    assert "REAL" in daily.columns
    assert "DATE" in daily.columns
    assert len(daily["REAL"].unique()) == 5
    assert len(daily) == (365 * 4 + 2) * 5  # 2003-01-01 and 2003-01-02 at end

    # Linear interpolation will give almost unique values everywhere:
    assert len(daily["FOPT"].unique()) > (365 * 4) * 5
    # While bfill for rates cannot be more unique than the yearly input;
    # the count must stay below (yearly points) x (realizations):
    assert len(daily["FOPR"].unique()) < 4 * 5
def test_noparameters():
    """Check the ensemble's behaviour once parameters.txt is gone."""
    testdir = os.path.dirname(os.path.abspath(__file__))
    ens_glob = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    reekensemble = ScratchEnsemble("reektest", ens_glob)

    # parameters.txt is present on disk and therefore loaded:
    assert not reekensemble.parameters.empty

    # Drop it from the realizations; the aggregate is then empty ...
    reekensemble.remove_data("parameters.txt")
    assert reekensemble.parameters.empty

    # ... while explicitly asking for parameters.txt must raise:
    with pytest.raises(KeyError):
        reekensemble.get_df("parameters.txt")

    # Summary data is unaffected, but cannot be merged with the
    # removed parameters:
    reekensemble.load_smry(time_index="yearly", column_keys="FOPT")
    yearly_smry = reekensemble.get_df("unsmry--yearly")
    assert not yearly_smry.empty
    with pytest.raises(KeyError):
        reekensemble.get_df("unsmry--yearly", merge="parameters.txt")
def test_filedescriptors():
    """Test how filedescriptors are used.

    The lazy_load option to EclSum affects this, if it is set to True
    file descriptors are not closed (and True is the default).

    In order to be able to open thousands of smry files, we need
    to always close the file descriptors when possible, and therefore
    lazy_load should be set to False in realization.py

    The test counts the entries in /proc/<pid>/fd before creating an
    ensemble and after deleting it, and requires the two counts to be
    equal. Where that directory does not exist the test is skipped.
    """
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    fd_dir = "/proc/" + str(os.getpid()) + "/fd"
    if not os.path.exists(fd_dir):
        # Report as skipped rather than silently passing without
        # having tested anything.
        pytest.skip("Counting file descriptors on non-Linux not supported")

    fd_count_before = len(os.listdir(fd_dir))
    reekensemble = ScratchEnsemble(
        "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )
    reekensemble.load_smry()
    del reekensemble
    fd_count_after = len(os.listdir(fd_dir))

    # As long as lazy_load = False, the count stays constant through
    # construction, load_smry() and deletion (5, 5, 5, 5 when sampled at
    # each step). If lazy_load is True (default), the same four samples
    # read 15, 15, 25, 20 (that last number pattern reveals a (now
    # fixed) bug in EclSum).
    assert fd_count_before == fd_count_after
def _dump_smry_to_csv_using_fmu(
    ens_path: str, time_index: str, output_csv_file: str
) -> None:
    """Load an ensemble's summary data and write it to a CSV file.

    The frame is sorted on DATE then REAL before writing. Its shape and
    the unique dates and realizations are printed along the way.
    """
    ensemble = ScratchEnsemble("tempEnsName", paths=ens_path)
    smry_df = ensemble.load_smry(time_index=time_index)
    smry_df.sort_values(["DATE", "REAL"], inplace=True)

    print("Dataframe shape::", smry_df.shape)

    report = (
        ("Num unique dates:", "DATE"),
        ("Num unique reals:", "REAL"),
    )
    for label, column in report:
        unique_values = smry_df[column].unique()
        print(label, len(unique_values))
        print(unique_values)

    smry_df.to_csv(output_csv_file, index=False)
def test_ens_premature_ecl(tmpdir):
    """Check an ensemble where Eclipse has failed early in realization 1

    A copy of the test ensemble is made in the temporary directory, and
    the UNSMRY file of realization 1 is replaced by the "-failed2000"
    variant stored next to it. The resulting "failed" ensemble is then
    compared with the original one with respect to raw and resampled
    summary data, filtering, statistics and delta profiles.
    """
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    origensemble = ScratchEnsemble(
        "origreek", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )
    raw_orig_smry = origensemble.load_smry()

    # Copy the ensemble into the temporary directory so we can replace
    # the UNSMRY file in realization 1:
    tmpdir.chdir()
    shutil.copytree(testdir + "/data/testensemble-reek001", "ens_fail_real_reek001")
    unsmry_filename = (
        "ens_fail_real_reek001/realization-1/"
        + "iter-0/eclipse/model/2_R001_REEK-1.UNSMRY"
    )
    # Overwrite the copied UNSMRY with the prematurely ended one
    shutil.copy(unsmry_filename + "-failed2000", unsmry_filename)

    failensemble = ScratchEnsemble(
        "failedreek", "ens_fail_real_reek001/realization-*/iter-0"
    )
    raw_fail_smry = failensemble.load_smry()

    # This is usually superfluous when raw datetimes are obtained.
    raw_orig_smry["DATE"] = pd.to_datetime(raw_orig_smry["DATE"])
    raw_fail_smry["DATE"] = pd.to_datetime(raw_fail_smry["DATE"])

    # Homogeneous max-date in orig smry:
    assert len(raw_orig_smry.groupby("REAL").max()["DATE"].unique()) == 1
    # Different values for raw_fail:
    assert len(raw_fail_smry.groupby("REAL").max()["DATE"].unique()) == 2
    # END statement in schedule file on 2000-08-01 yields this:
    assert (
        str(raw_fail_smry.groupby("REAL").max()["DATE"].loc[1])
        == "2000-08-01 00:00:00"
    )

    # Filter away all those that did not make it to the end. In normal scenarios,
    # this would be accomplished by .filter('OK'), but not in this test context.
    max_date = str(failensemble.get_smry()["DATE"].max())
    filtered_fail_ensemble = failensemble.filter(
        "unsmry--raw", column="DATE", columncontains=max_date, inplace=False
    )
    assert len(filtered_fail_ensemble) == 4
    assert (
        len(filtered_fail_ensemble.get_smry().groupby("REAL").max()["DATE"].unique())
        == 1
    )
    # Check also get_smry():
    assert len(failensemble.get_smry().groupby("REAL").max()["DATE"].unique()) == 2

    # With time_index set to something, then all realization will get
    # interpolated onto the same date range
    assert (
        len(
            failensemble.get_smry(time_index="monthly")
            .groupby("REAL")
            .max()["DATE"]
            .unique()
        )
        == 1
    )
    # This is in fact *different* from what you would get from load_smry (issue #97)
    assert (
        len(
            failensemble.load_smry(time_index="monthly")
            .groupby("REAL")
            .max()["DATE"]
            .unique()
        )
        == 2
    )
    # (this behaviour might change, get_smry() is allowed in
    # the future to mimic load_smry())

    # Check that FOPT is very much lower in real 1 in failed ensemble:
    assert (
        failensemble.get_smry(column_keys="FOPT", time_index="monthly")
        .groupby("REAL")
        .max()["FOPT"]
        .loc[1]
        < 1500000
    )
    assert (
        origensemble.get_smry(column_keys="FOPT", time_index="monthly")
        .groupby("REAL")
        .max()["FOPT"]
        .loc[1]
        > 6000000
    )
    # Also for yearly
    assert (
        failensemble.get_smry(column_keys="FOPT", time_index="yearly")
        .groupby("REAL")
        .max()["FOPT"]
        .loc[1]
        < 1500000
    )
    assert (
        origensemble.get_smry(column_keys="FOPT", time_index="yearly")
        .groupby("REAL")
        .max()["FOPT"]
        .loc[1]
        > 6000000
    )

    fail_foprs = failensemble.get_smry(column_keys="FOPR", time_index="monthly")

    # The FOPR rate vector should be all zero after the stop
    assert (
        fail_foprs[
            (fail_foprs["REAL"] == 1) & (fail_foprs["DATE"] > datetime.date(2000, 8, 1))
        ]["FOPR"]
        .abs()
        .sum()
        == 0
    )
    # ... while a realization that ran to the end still has production:
    assert (
        fail_foprs[
            (fail_foprs["REAL"] == 0) & (fail_foprs["DATE"] > datetime.date(2000, 8, 1))
        ]["FOPR"]
        .abs()
        .sum()
        > 0
    )

    # This frame treats the "failed" realization as correct,
    # and it will affect the stats:
    fail_stats = failensemble.get_smry_stats(time_index="monthly")
    # Here, real 1 is removed
    filtered_stats = filtered_fail_ensemble.get_smry_stats(time_index="monthly")
    # Original stats
    orig_stats = origensemble.get_smry_stats(time_index="monthly")

    # The 30 last rows are the rows from 2000-09-01 to 2003-02-01:
    assert fail_stats.loc["minimum"]["FOPR"].iloc[-30:].abs().sum() == 0
    assert fail_stats.loc["minimum"]["FOPT"].iloc[-30:].unique()[0] == 1431247.125

    # Oh no, in filtered stats, the last date 2003-02-01 is
    # not included, probably a minor bug!
    # But that means that the indexing of the last 30 is a little bit rogue.
    # (this test should work even if that bug is fixed)
    assert filtered_stats.loc["minimum"]["FOPR"].iloc[-29:].abs().sum() > 0
    assert len(filtered_stats.loc["minimum"]["FOPT"].iloc[-29:].unique()) == 29

    # Mean FOPR and FOPT should be affected by the zero-padded rates:
    assert (
        fail_stats.loc["mean"].iloc[-10]["FOPR"]
        < filtered_stats.loc["mean"].iloc[-10]["FOPR"]
    )
    assert (
        fail_stats.loc["mean"].iloc[-10]["FOPR"]
        < orig_stats.loc["mean"].iloc[-10]["FOPR"]
    )
    assert (
        fail_stats.loc["mean"].iloc[-10]["FOPT"]
        < filtered_stats.loc["mean"].iloc[-10]["FOPT"]
    )
    assert (
        fail_stats.loc["mean"].iloc[-10]["FOPT"]
        < orig_stats.loc["mean"].iloc[-10]["FOPT"]
    )

    # Delta profiles:
    delta_fail = origensemble - failensemble
    # Delta profiles are given for all realizations
    delta_fail_smry = delta_fail.get_smry()
    assert len(delta_fail_smry["REAL"].unique()) == 5
    # and they all end at the same ultimate date:
    assert len(delta_fail_smry.groupby("REAL").max()["DATE"].unique()) == 1
    # BUT, there is only NaNs for values after 2000-08-01:
    assert np.isnan(
        delta_fail_smry[
            (delta_fail_smry["REAL"] == 1) & (delta_fail_smry["DATE"] > "2000-08-01")
        ]["FOPT"].unique()[0]
    )

    # Delta profiles after filtering:
    delta_filtered = origensemble - filtered_fail_ensemble
    assert len(origensemble) == 5
    assert len(filtered_fail_ensemble) == 4
    # assert len(delta_filtered) == 4
    # Only four realizations (requires #83 resolved)

    # to_virtual() and time_index can be removed when #83 is finished.
    delta_filtered_smry = delta_filtered.to_virtual().get_smry(time_index="monthly")
    # Should contain only four realizations, as one has been filtered away
    assert len(delta_filtered_smry["REAL"].unique()) == 4
    # Ultimate date is the same in all four:
    assert len(delta_filtered_smry.groupby("REAL").max()["DATE"].unique()) == 1
def test_ensemble_aggregations(tmpdir):
    """Test aggregations of ensembles, that is taking means, medians,
    p10 and so on, producing virtual realizations.

    Besides the ordering between the aggregates, the test covers dumping
    virtual realizations to disk and the ``excludekeys``/``keylist``
    arguments of agg().
    """
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    reekensemble = ScratchEnsemble(
        "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )
    reekensemble.load_smry(time_index="monthly", column_keys=["F*"])
    reekensemble.load_smry(time_index="yearly", column_keys=["F*"])
    reekensemble.load_csv("share/results/volumes/simulator_volume_fipnum.csv")
    reekensemble.load_scalar("npv.txt", convert_numeric=True)

    stats = {
        "mean": reekensemble.agg("mean"),
        "median": reekensemble.agg("median"),
        "min": reekensemble.agg("min"),
        "max": reekensemble.agg("max"),
        "p10": reekensemble.agg("p10"),  # low estimate
        "p90": reekensemble.agg("p90"),  # high estimate
    }

    # Write virtual realizations below the temporary directory
    tmpdir.chdir()
    stats["min"].to_disk("virtreal_min", delete=True)
    stats["max"].to_disk("virtreal_max", delete=True)
    stats["mean"].to_disk("virtreal_mean", delete=True)

    def seed(aggname):
        """RMS_SEED parameter of the named aggregated realization."""
        return stats[aggname]["parameters.txt"]["RMS_SEED"]

    # Ordering between the aggregates of a single parameter:
    assert seed("min") < seed("max")
    assert seed("min") <= seed("p10")
    assert seed("p10") <= seed("median")
    assert seed("median") <= seed("p90")
    assert seed("p90") <= seed("max")
    assert seed("min") <= seed("mean")
    # The upper bound on the mean. This used to be a second min-vs-max
    # comparison, which left the mean unbounded from above.
    assert seed("mean") <= seed("max")

    assert (
        stats["min"]["unsmry--monthly"]["FOPT"].iloc[-1]
        < stats["max"]["unsmry--monthly"]["FOPT"].iloc[-1]
    )

    # .loc[2] corresponds to FIPNUM=3
    assert (
        stats["min"]["simulator_volume_fipnum"].iloc[2]["STOIIP_OIL"]
        < stats["mean"]["simulator_volume_fipnum"].iloc[2]["STOIIP_OIL"]
    )
    assert (
        stats["mean"]["simulator_volume_fipnum"].loc[2]["STOIIP_OIL"]
        < stats["max"]["simulator_volume_fipnum"].loc[2]["STOIIP_OIL"]
    )

    # Aggregation of STATUS also works. Note that min and max
    # works for string columns, so the available data will vary
    # depending on aggregation method
    assert (
        stats["p10"]["STATUS"].iloc[49]["DURATION"]
        < stats["max"]["STATUS"].iloc[49]["DURATION"]
    )
    # job 49 is the Eclipse forward model

    assert "npv.txt" in stats["mean"].keys()
    assert stats["mean"]["npv.txt"] == 3382.5

    # Test agg(excludekeys=..)
    assert "STATUS" not in reekensemble.agg("mean", excludekeys="STATUS").keys()
    assert "STATUS" not in reekensemble.agg("mean", keylist=["parameters.txt"]).keys()

    assert (
        reekensemble.agg("p01")["parameters"]["RMS_SEED"]
        < reekensemble.agg("p99")["parameters"]["RMS_SEED"]
    )

    # Unknown aggregation names are rejected:
    with pytest.raises(ValueError):
        reekensemble.agg("foobar")

    # Check that include/exclude functionality in agg() works:
    assert (
        "parameters.txt"
        not in reekensemble.agg("mean", excludekeys="parameters.txt").keys()
    )
    assert (
        "parameters.txt"
        not in reekensemble.agg("mean", excludekeys=["parameters.txt"]).keys()
    )
    assert "parameters.txt" not in reekensemble.agg("mean", keylist="STATUS").keys()
    assert "parameters.txt" not in reekensemble.agg("mean", keylist=["STATUS"]).keys()

    # Shorthand notion works for keys to include, but they
    # should get returned with fully qualified paths.
    assert (
        "share/results/tables/unsmry--yearly.csv"
        in reekensemble.agg("mean", keylist="unsmry--yearly").keys()
    )
    assert (
        "share/results/tables/unsmry--yearly.csv"
        in reekensemble.agg("mean", keylist=["unsmry--yearly"]).keys()
    )
    assert isinstance(
        reekensemble.agg("mean", keylist="unsmry--yearly").get_df("unsmry--yearly"),
        pd.DataFrame,
    )
def test_filter():
    """Test filtering of realizations in ensembles

    Realizations not fulfilling tested conditions are
    dropped from the ensemble

    The statements below depend on each other: filter() calls made with
    inplace=True (or without an inplace argument) act on the ensemble
    object itself, which is why the ensemble is re-created from disk
    between some of the checks.
    """
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    dirs = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    reekensemble = ScratchEnsemble("reektest", dirs)

    # This should just require a STATUS file to be there
    # for every realization
    assert len(reekensemble.filter("STATUS")) == 5

    # Test string equivalence on numeric data:
    reekensemble.filter("parameters.txt", key="RMS_SEED", value="723121249", inplace=True)
    assert len(reekensemble) == 2

    # (False positive from pylint on this line)
    assert reekensemble.agg("mean")["parameters"]["RMS_SEED"] == 723121249

    # Test numeric equivalence (on a freshly loaded ensemble, since the
    # in-place filter above removed realizations)
    reekensemble = ScratchEnsemble("reektest", dirs)
    reekensemble.filter("parameters.txt", key="RMS_SEED", value=723121249, inplace=True)
    assert len(reekensemble) == 2
    assert reekensemble.agg("mean")["parameters"]["RMS_SEED"] == 723121249

    # From here on a full ensemble is used with non-inplace filters
    reekensemble = ScratchEnsemble("reektest", dirs)
    filtered = reekensemble.filter("parameters.txt", key="FOO", inplace=False)
    assert len(filtered) == 2
    # (NaN in one of the parameters.txt is True in this context)

    filtered = reekensemble.filter("parameters.txt", key="MULTFLT_F1", value=0.001, inplace=False)
    assert len(filtered) == 4
    # Numeric and string values give the same result:
    assert (len(
        reekensemble.filter("parameters.txt", key="FWL", value=1700, inplace=False)) == 3)
    assert (len(
        reekensemble.filter("parameters.txt", key="FWL", value="1700", inplace=False)) == 3)

    # This one is tricky, the empty string should correspond to
    # missing data - NOT IMPLEMENTED
    # assert len(reekensemble.filter('parameters.txt', key='FOO',
    #                                value='', inplace=False) == 4)

    # while no value means that the key must be present
    assert len(reekensemble.filter("parameters.txt", key="FOO", inplace=False)) == 2

    # 'key' is not accepted for things that are tables.
    with pytest.raises(ValueError):
        reekensemble.filter("STATUS", key="ECLIPSE")
    # ... and neither is 'value':
    with pytest.raises(ValueError):
        reekensemble.filter("STATUS", value="ECLIPSE")

    # Check column presence
    assert len(reekensemble.filter("STATUS", column="FORWARD_MODEL")) == 5
    assert (len(
        reekensemble.filter("STATUS", column="FORWARD_MODEL", inplace=False)) == 5)
    assert not reekensemble.filter("STATUS", column="FOOBAR", inplace=False)

    # Unknown keyword arguments are rejected:
    with pytest.raises(ValueError):
        reekensemble.filter("STATUS", wrongarg="FOOBAR", inplace=False)

    # Check for a value being present in a column:
    assert (len(
        reekensemble.filter("STATUS", column="FORWARD_MODEL", columncontains="ECLIPSE100_2014.2")) == 5)
    assert not reekensemble.filter(
        "STATUS", column="FORWARD_MODEL", columncontains="ECLIPSE100_2010.2", inplace=False,
    )

    # Filtering on summary data, which must be loaded first:
    reekensemble.load_smry()
    assert len(reekensemble.filter("unsmry--raw")) == 5
    assert len(reekensemble.filter("unsmry--raw", column="FOPT")) == 5
    assert not reekensemble.filter(
        "unsmry--raw", column="FOOBAR", inplace=False)
    assert len(
        reekensemble.filter("unsmry--raw", column="FOPT", columncontains=0)) == 5
    assert not reekensemble.filter(
        "unsmry--raw", column="FOPT", columncontains=-1000, inplace=False)
    # Float and integer values give the same result:
    assert (len(
        reekensemble.filter("unsmry--raw", column="FOPT", columncontains=6025523.0, inplace=False)) == 1)
    assert (len(
        reekensemble.filter("unsmry--raw", column="FOPT", columncontains=6025523, inplace=False)) == 1)

    # We do not support strings here (not yet)
    # assert len(reekensemble.filter('unsmry--raw', column='FOPT',
    #                                columncontains='6025523.0',
    #                                inplace=False)) == 1

    # Dates given as strings, with and without a time part:
    assert (len(
        reekensemble.filter("unsmry--raw", column="DATE", columncontains="2002-11-25", inplace=False)) == 5)
    assert (len(
        reekensemble.filter(
            "unsmry--raw", column="DATE", columncontains="2002-11-25 00:00:00", inplace=False,
        )) == 5)
    assert not reekensemble.filter(
        "unsmry--raw", column="DATE", columncontains="2002-11-25 00:00:01", inplace=False,
    )
    assert (len(
        reekensemble.filter(
            "unsmry--raw", column="DATE", columncontains="2000-01-07 02:26:15", inplace=False,
        )) == 3)
    assert not reekensemble.filter("unsmry--raw", column="DATE", columncontains="2000-01-07", inplace=False)
def test_ensemble_ecl():
    """Eclipse specific functionality

    Exercises, on the reek001 test ensemble: summary key lookup,
    load_smry() with different time indices and the frames it leaves
    behind for get_df(), get_smry_dates(), well and group name lookup,
    ensemble subtraction and get_smry_stats().
    """
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    reekensemble = ScratchEnsemble(
        "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )

    # Eclipse summary keys:
    assert len(reekensemble.get_smrykeys("FOPT")) == 1
    assert len(reekensemble.get_smrykeys("F*")) == 49
    assert len(reekensemble.get_smrykeys(["F*", "W*"])) == 49 + 280
    assert not reekensemble.get_smrykeys("BOGUS")

    # reading ensemble dataframe
    # First load without column_keys; the return value is not needed,
    # only the narrower load below is inspected.
    reekensemble.load_smry(time_index="monthly")

    monthly = reekensemble.load_smry(column_keys=["F*"], time_index="monthly")
    assert monthly.columns[0] == "REAL"  # Enforce order of columns.
    assert monthly.columns[1] == "DATE"
    assert len(monthly) == 190

    # Check that the result was cached in memory, not necessarily on disk..
    assert isinstance(reekensemble.get_df("unsmry--monthly.csv"), pd.DataFrame)

    assert len(reekensemble.keys()) == 4

    # When asking the ensemble for FOPR, we also get REAL as a column
    # in return. Note that the internal stored version will be
    # overwritten by each load_smry()
    assert len(reekensemble.load_smry(column_keys=["FOPR"]).columns) == 3
    assert len(reekensemble.load_smry(column_keys=["FOP*"]).columns) == 11
    assert len(reekensemble.load_smry(column_keys=["FGPR", "FOP*"]).columns) == 12

    # Check that there is now a cached version with raw dates:
    assert isinstance(reekensemble.get_df("unsmry--raw.csv"), pd.DataFrame)
    # The columns are not similar, this is allowed!

    # If you get 3205 here, it means that you are using the union of
    # raw dates from all realizations, which is not correct
    assert len(reekensemble.load_smry(column_keys=["FGPR", "FOP*"]).index) == 1700

    # Date list handling:
    assert len(reekensemble.get_smry_dates(freq="report")) == 641
    assert len(reekensemble.get_smry_dates(freq="raw")) == 641
    assert len(reekensemble.get_smry_dates(freq="yearly")) == 5
    assert len(reekensemble.get_smry_dates(freq="monthly")) == 38
    assert len(reekensemble.get_smry_dates(freq="daily")) == 1098
    assert len(reekensemble.get_smry_dates(freq="last")) == 1
    assert reekensemble.get_smry_dates(freq="last") == reekensemble.get_smry_dates(
        freq="last", end_date="2050-02-01"
    )

    assert str(reekensemble.get_smry_dates(freq="report")[-1]) == "2003-01-02 00:00:00"
    assert str(reekensemble.get_smry_dates(freq="raw")[-1]) == "2003-01-02 00:00:00"
    assert str(reekensemble.get_smry_dates(freq="yearly")[-1]) == "2004-01-01"
    assert str(reekensemble.get_smry_dates(freq="monthly")[-1]) == "2003-02-01"
    assert str(reekensemble.get_smry_dates(freq="daily")[-1]) == "2003-01-02"
    assert str(reekensemble.get_smry_dates(freq="last")[-1]) == "2003-01-02"

    assert (
        str(reekensemble.get_smry_dates(freq="daily", end_date="2002-03-03")[-1])
        == "2002-03-03"
    )
    assert (
        str(reekensemble.get_smry_dates(freq="daily", start_date="2002-03-03")[0])
        == "2002-03-03"
    )

    # Start and end outside of orig data and on the "wrong side"
    dates = reekensemble.get_smry_dates(end_date="1999-03-03")
    assert len(dates) == 1
    assert str(dates[0]) == "1999-03-03"

    dates = reekensemble.get_smry_dates(start_date="2099-03-03")
    assert len(dates) == 1
    assert str(dates[0]) == "2099-03-03"

    # Time interpolated dataframes with summary data:
    yearly = reekensemble.get_smry_dates(freq="yearly")
    assert len(reekensemble.load_smry(column_keys=["FOPT"], time_index=yearly)) == 25
    # NB: This is cached in unsmry-custom.csv, not unsmry--yearly!
    # This usage is discouraged. Use 'yearly' in such cases.

    # Check that we can shortcut get_smry_dates:
    assert len(reekensemble.load_smry(column_keys=["FOPT"], time_index="yearly")) == 25

    assert len(reekensemble.load_smry(column_keys=["FOPR"], time_index="last")) == 5
    assert isinstance(reekensemble.get_df("unsmry--last.csv"), pd.DataFrame)

    # Eclipse well names list
    assert len(reekensemble.get_wellnames("OP*")) == 5
    assert len(reekensemble.get_wellnames(None)) == 8
    assert len(reekensemble.get_wellnames()) == 8
    assert not reekensemble.get_wellnames("")
    assert len(reekensemble.get_wellnames(["OP*", "WI*"])) == 8

    # eclipse well groups list
    assert len(reekensemble.get_groupnames()) == 3

    # delta between two ensembles
    diff = reekensemble - reekensemble
    assert len(diff.get_smry(column_keys=["FOPR", "FGPR", "FWCT"]).columns) == 5

    # eclipse summary vector statistics for a given ensemble
    df_stats = reekensemble.get_smry_stats(
        column_keys=["FOPR", "FGPR"], time_index="monthly"
    )
    assert isinstance(df_stats, pd.DataFrame)
    assert len(df_stats.columns) == 2
    assert isinstance(df_stats["FOPR"]["mean"], pd.Series)
    assert len(df_stats["FOPR"]["mean"].index) == 38

    # check if wild cards also work for get_smry_stats
    df_stats = reekensemble.get_smry_stats(
        column_keys=["FOP*", "FGP*"], time_index="monthly"
    )
    assert len(df_stats.columns) == len(reekensemble.get_smrykeys(["FOP*", "FGP*"]))

    # Check webviz requirements for dataframe
    stats = df_stats.index.levels[0]
    assert "minimum" in stats
    assert "maximum" in stats
    assert "p10" in stats
    assert "p90" in stats
    assert "mean" in stats
    assert df_stats["FOPR"]["minimum"].iloc[-2] < df_stats["FOPR"]["maximum"].iloc[-2]

    # Check user supplied quantiles
    df_stats = reekensemble.get_smry_stats(
        column_keys=["FOPT"], time_index="yearly", quantiles=[0, 15, 50, 85, 100]
    )
    statistics = df_stats.index.levels[0]
    assert "p0" in statistics
    assert "p15" in statistics
    assert "p50" in statistics
    assert "p85" in statistics
    assert "p100" in statistics
    # For oil industry, p15 on FOPT should yield a larger value than p85.
    # But the quantiles we get out follows the rest of the world
    # so we check for the opposite.
    # Use .iloc for the last row: a bare [-1] on a Series relies on the
    # positional fallback of Series.__getitem__, which pandas has deprecated.
    assert df_stats["FOPT"]["p85"].iloc[-1] > df_stats["FOPT"]["p15"].iloc[-1]

    # Non-numeric quantiles are rejected:
    with pytest.raises(ValueError):
        reekensemble.get_smry_stats(
            column_keys=["FOPT"], time_index="yearly", quantiles=["foobar"]
        )

    # With an empty quantile list, three statistics remain in the index:
    noquantiles = reekensemble.get_smry_stats(
        column_keys=["FOPT"], time_index="yearly", quantiles=[]
    )
    assert len(noquantiles.index.levels[0]) == 3
def test_get_df():
    """Test the data retrieval functionality

    get_df() in the ensemble context is an aggregator, that will aggregate
    data from individual realizations to the ensemble level, with
    optional merging capabilities performed on realization level."""
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    ens = ScratchEnsemble(
        "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )
    smry = ens.load_smry(column_keys="FO*", time_index="yearly")

    # The loaded frame can be addressed by short name, with or without
    # the .csv extension, and by its full path:
    assert not ens.get_df("unsmry--yearly").empty
    assert not ens.get_df("unsmry--yearly.csv").empty
    assert not ens.get_df("share/results/tables/unsmry--yearly").empty
    assert not ens.get_df("share/results/tables/unsmry--yearly.csv").empty

    # Monthly data has not been loaded yet:
    with pytest.raises(KeyError):
        ens.get_df("unsmry--monthly")
    ens.load_smry(column_keys="FO*", time_index="monthly")
    assert not ens.get_df("unsmry--monthly").empty

    # Note the single dash, this name is expected not to resolve:
    with pytest.raises(KeyError):
        ens.get_df("unsmry-monthly")

    # Tests that we can do merges directly:
    params = ens.get_df("parameters.txt")
    smryparams = ens.get_df("unsmry--yearly", merge="parameters")
    # The set union is to handle the REAL column present in both smry and params:
    assert len(smryparams.columns) == len(set(smry.columns).union(params.columns))

    # Test multiple merges:
    outputs = ens.load_txt("outputs.txt")
    assert len(
        ens.get_df("unsmry--yearly", merge=["parameters", "outputs.txt"]).columns
    ) == len(set(smry.columns).union(params.columns).union(outputs.columns))

    # Try merging dataframes:
    ens.load_csv("share/results/volumes/simulator_volume_fipnum.csv")

    # Inject a mocked dataframe to the realization, there is
    # no "add_data" API for ensembles, but we can use the apply()
    # functionality
    def fipnum2zone():
        """Helper function for injecting mocked frame into
        each realization"""
        return pd.DataFrame(
            columns=["FIPNUM", "ZONE"],
            data=[
                [1, "UpperReek"],
                [2, "MidReek"],
                [3, "LowerReek"],
                [4, "UpperReek"],
                [5, "MidReek"],
                [6, "LowerReek"],
            ],
        )

    ens.apply(fipnum2zone, localpath="fipnum2zone")
    volframe = ens.get_df("simulator_volume_fipnum", merge="fipnum2zone")

    assert "ZONE" in volframe
    assert "FIPNUM" in volframe
    assert "STOIIP_OIL" in volframe
    assert len(volframe["ZONE"].unique()) == 3

    # Merge with scalar data:
    ens.load_scalar("npv.txt")
    vol_npv = ens.get_df("simulator_volume_fipnum", merge="npv.txt")
    # (this particular data combination does not really make sense)
    assert "STOIIP_OIL" in vol_npv
    assert "npv.txt" in vol_npv
def test_virtualensemble():
    """Verify the behaviour of a VirtualEnsemble made from a ScratchEnsemble

    Covers manifest handling, frame lookup and shortcuts, get_smry() and
    get_smry_stats(), realization retrieval and removal, removal and
    addition of data, and aggregation through agg().
    """
    # Fall back to the current directory to ease copying of the test
    # code into interactive sessions
    testdir = (
        os.path.dirname(os.path.abspath(__file__))
        if "__file__" in globals()
        else os.path.abspath(".")
    )

    ens_manifest = {
        "what": "A test ensemble for pytest usage",
        "coordinate_system": "The correct one",
    }
    ens_glob = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    reekensemble = ScratchEnsemble("reektest", ens_glob, manifest=ens_manifest)

    reekensemble.load_smry(time_index="yearly", column_keys=["F*"])
    reekensemble.load_smry(column_keys=["FOPT", "FOIP"])
    custom_dates = [
        datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
        for stamp in ["2000-05-03 23:15:00", "2002-04-02 15:34:23"]
    ]
    reekensemble.load_smry(column_keys=["FGPT"], time_index=custom_dates)
    reekensemble.load_scalar("npv.txt")
    reekensemble.load_txt("outputs.txt")

    vens = reekensemble.to_virtual()
    assert "coordinate_system" in vens.manifest

    # Assigning a new manifest replaces the old one:
    vens.manifest = {"foo": "bar"}
    assert "foo" in vens.manifest
    assert "coordinate_system" not in vens.manifest

    # Data for 5 realizations is expected in every summary frame
    for smrykey in ("unsmry--yearly", "unsmry--raw", "unsmry--custom"):
        assert len(vens[smrykey]["REAL"].unique()) == 5
    assert len(vens["parameters.txt"]) == 5

    assert not vens.lazy_keys()

    # The dataframe of discovered files in the ScratchRealization
    assert isinstance(vens["__files"], pd.DataFrame)
    assert not vens["__files"].empty

    assert "REAL" in vens["STATUS"].columns

    # Shorthand lookup, with and without file extension:
    for shorthand in ("unsmry--yearly", "unsmry--yearly.csv"):
        assert (
            vens.shortcut2path(shorthand) == "share/results/tables/unsmry--yearly.csv"
        )

    assert "npv.txt" in vens.keys()
    assert len(vens["npv.txt"]) == 5  # includes the 'error!' string in real4
    assert "outputs.txt" in vens.keys()
    assert len(vens["outputs.txt"]) == 4

    # get_smry() in a case where no interpolation is necessary
    fopt = vens.get_smry(column_keys=["FOPT"], time_index="yearly")
    for present in ("FOPT", "DATE", "REAL"):
        assert present in fopt.columns
    assert "FGPT" not in fopt.columns
    assert len(fopt) == 25

    # 'first' and 'last' must match the rows at the earliest and the
    # latest date of the raw data
    raw_smry = vens.get_smry(time_index="raw")
    for edge, pick in (("first", min), ("last", max)):
        actual = vens.get_smry(time_index=edge)["FOIP"].reset_index(drop=True)
        expected = raw_smry[raw_smry["DATE"] == pick(raw_smry["DATE"])][
            "FOIP"
        ].reset_index(drop=True)
        pd.testing.assert_series_equal(actual, expected)

    # Check that we can default get_smry()
    alldefaults = vens.get_smry()
    # This should glob to all columns, and monthly time frequency
    # The 'monthly' is interpolated from the 'raw', as it is most likely
    # finer resolution than 'yearly'
    assert len(alldefaults) == 185
    assert len(alldefaults.columns) == 4

    # Explicitly asking for monthly should behave as the default
    monthly_smry = vens.get_smry(time_index="monthly")
    assert len(monthly_smry) == 185
    assert len(monthly_smry.columns) == 4

    # get_smry(time_index='raw') and get_smry(time_index=None) must agree
    raw_fopt = vens.get_smry(time_index="raw")["FOPT"].reset_index(drop=True)
    none_fopt = vens.get_smry(time_index=None)["FOPT"].reset_index(drop=True)
    pd.testing.assert_series_equal(raw_fopt, none_fopt)

    # The custom smry was loaded with two dates
    assert len(vens.get_smry(time_index="custom")["DATE"].unique()) == 2

    # Eclipse summary vector statistics for a given ensemble
    df_stats = vens.get_smry_stats(column_keys=["FOPR", "FGPR"], time_index="yearly")
    assert isinstance(df_stats, pd.DataFrame)
    assert len(df_stats.columns) == 2
    assert isinstance(df_stats["FOPR"]["mean"], pd.Series)
    assert len(df_stats["FOPR"]["mean"]) == 5

    # Check webviz requirements for dataframe
    stats = df_stats.index.levels[0]
    for statname in ("minimum", "maximum", "p10", "p90", "mean"):
        assert statname in stats
    assert df_stats["FOPR"]["minimum"].iloc[-2] < df_stats["FOPR"]["maximum"].iloc[-2]

    # Test virtrealization retrieval:
    vreal = vens.get_realization(2)
    assert len(vreal.keys()) == len(vens.keys())
    assert set(vreal.keys()) == set(vens.keys())  # Order is not preserved

    # Test realization removal:
    vens.remove_realizations(3)
    assert len(vens.parameters["REAL"].unique()) == 4
    assert len(vens) == 4
    vens.remove_realizations(3)  # This will give warning
    assert len(vens.parameters["REAL"].unique()) == 4
    assert len(vens["unsmry--yearly"]["REAL"].unique()) == 4
    assert len(vens) == 4

    # Test data removal:
    vens.remove_data("parameters.txt")
    assert "parameters.txt" not in vens.keys()
    vens.remove_data("bogus")  # This should only give warning

    # Test data addition. It should(?) work also for earlier nonexisting
    betterdata = pd.DataFrame(
        {
            "REAL": [0, 1, 2, 3, 4, 5, 6, 80],
            "NPV": [1000, 2000, 1500, 2300, 6000, 3000, 800, 9],
        }
    )
    vens.append("betterdata", betterdata)
    assert "betterdata" in vens.keys()
    assert "REAL" in vens["betterdata"].columns
    assert "NPV" in vens["betterdata"].columns

    for realidx, npv in ((3, 2300), (0, 1000), (1, 2000), (2, 1500), (80, 9)):
        assert vens.get_realization(realidx).get_df("betterdata")["NPV"] == npv

    with pytest.raises(ValueError):
        vens.get_realization(9999)

    assert vens.shortcut2path("betterdata") == "betterdata"

    def agg_npv(aggregation):
        """Return NPV from betterdata in the given aggregation of vens"""
        return vens.agg(aggregation).get_df("betterdata")["NPV"]

    assert agg_npv("min") == 9
    assert agg_npv("max") == 6000
    assert agg_npv("min") < agg_npv("p07")
    assert agg_npv("p05") < agg_npv("p55")
    assert agg_npv("p46") < agg_npv("max")

    assert "REAL" not in vens.agg("min")["STATUS"].columns

    # Betterdata should be returned as a dictionary
    # (it is returned from a virtualrealization object)
    assert isinstance(vens.agg("min").get_df("betterdata"), dict)
def test_todisk(tmpdir):
    """Test that we can write VirtualEnsembles to the filesystem in a
    retrievable manner

    Args:
        tmpdir: Temporary directory fixture. The test changes working
            directory into it before anything is dumped, so all dump
            directories are relative to it.
    """
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    reekensemble = ScratchEnsemble(
        "reektest",
        testdir + "/data/testensemble-reek001/" + "realization-*/iter-0",
        manifest={"foo": "bar.com"},
    )
    reekensemble.load_smry(time_index="monthly", column_keys="*")
    reekensemble.load_smry(time_index="daily", column_keys="*")
    reekensemble.load_smry(time_index="yearly", column_keys="F*")
    reekensemble.load_scalar("npv.txt")
    reekensemble.load_txt("outputs.txt")
    vens = reekensemble.to_virtual()
    assert "foo" in vens.manifest

    tmpdir.chdir()

    vens.to_disk("vens_dumped", delete=True)
    assert len(vens) == len(reekensemble)

    fromdisk = VirtualEnsemble(fromdisk="vens_dumped")
    assert "foo" in fromdisk.manifest

    # Same number of realizations:
    assert len(fromdisk) == len(vens)

    # Should have all the same keys,
    # but change of order is fine
    assert set(vens.keys()) == set(fromdisk.keys())

    for frame in vens.keys():
        if frame == "STATUS":
            continue

        assert (vens.get_df(frame).columns == fromdisk.get_df(frame).columns).all()

        # Columns that only contains NaN will not have their
        # type preserved, this is too much to ask for, especially
        # with CSV files. So we drop columns with NaN
        # (axis must be given by keyword; newer pandas versions do not
        # accept it as a positional argument)
        virtframe = vens.get_df(frame).dropna(axis="columns")
        diskframe = fromdisk.get_df(frame).dropna(axis="columns")

        # It would be nice to be able to use pd.Dataframe.equals,
        # but it is too strict, as columns with mixed type number/strings
        # will easily be wrong.

        for column in set(virtframe.columns).intersection(set(diskframe.columns)):
            if object in (virtframe[column].dtype, diskframe[column].dtype):
                # Ensure we only compare strings when working with object dtype
                assert (
                    virtframe[column].astype(str).equals(diskframe[column].astype(str))
                )
            else:
                pd.testing.assert_series_equal(virtframe[column], diskframe[column])

    fromdisk.to_disk("vens_double_dumped", delete=True)
    # Here we could check filesystem equivalence if we want.

    vens.to_disk("vens_dumped_csv", delete=True, dumpparquet=False)
    fromcsvdisk = VirtualEnsemble(fromdisk="vens_dumped_csv")
    lazyfromdisk = VirtualEnsemble(fromdisk="vens_dumped_csv", lazy_load=True)
    assert set(vens.keys()) == set(fromcsvdisk.keys())
    assert set(vens.keys()) == set(lazyfromdisk.keys())

    # Before the first get_df(), "OK" is only registered as a lazy frame;
    # afterwards it has moved over to the loaded data.
    assert "OK" in lazyfromdisk.lazy_frames.keys()
    assert "OK" not in lazyfromdisk.data.keys()
    assert len(fromcsvdisk.get_df("OK")) == len(lazyfromdisk.get_df("OK"))
    assert "OK" not in lazyfromdisk.lazy_frames.keys()
    assert "OK" in lazyfromdisk.data.keys()
    assert len(fromcsvdisk.parameters) == len(lazyfromdisk.parameters)
    assert len(fromcsvdisk.get_df("unsmry--yearly")) == len(
        lazyfromdisk.get_df("unsmry--yearly")
    )

    if HAVE_PYARROW:
        vens.to_disk("vens_dumped_parquet", delete=True, dumpcsv=False)
        fromparquetdisk = VirtualEnsemble()
        fromparquetdisk.from_disk("vens_dumped_parquet")
        assert set(vens.keys()) == set(fromparquetdisk.keys())

        fromparquetdisk2 = VirtualEnsemble()
        fromparquetdisk2.from_disk("vens_dumped_parquet", fmt="csv")
        # Here we will miss a lot of CSV files, because we only wrote parquet:
        assert len(vens.keys()) > len(fromparquetdisk2.keys())

        # NOTE(review): kept under the HAVE_PYARROW guard since it requests
        # fmt="parquet" - confirm that this nesting is the intended one.
        fromcsvdisk2 = VirtualEnsemble()
        fromcsvdisk2.from_disk("vens_dumped_csv", fmt="parquet")
        # But even if we only try to load parquet files, when CSV
        # files are found without corresponding parquet, the CSV file
        # will be read.
        assert set(vens.keys()) == set(fromcsvdisk2.keys())

    # Test manual intervention:
    randomcsv = os.path.join("vens_dumped", "share/results/tables/randomdata.csv")
    fooframe = pd.DataFrame(data=np.random.randn(3, 3), columns=["FOO", "BAR", "COM"])
    fooframe.to_csv(randomcsv)
    manualens = VirtualEnsemble(fromdisk="vens_dumped")
    assert "share/results/tables/randomdata.csv" not in manualens.keys()

    # Now with correct column header,
    # but floating point data for realizations..
    fooframe = pd.DataFrame(data=np.random.randn(3, 3), columns=["REAL", "BAR", "COM"])
    fooframe.to_csv(randomcsv)
    manualens = VirtualEnsemble(fromdisk="vens_dumped")
    assert "share/results/tables/randomdata.csv" not in manualens.keys()

    # Now with correct column header, and with integer data for REAL..
    fooframe = pd.DataFrame(
        data=np.random.randint(low=0, high=100, size=(3, 3)),
        columns=["REAL", "BAR", "COM"],
    )
    fooframe.to_csv(randomcsv)
    manualens = VirtualEnsemble(fromdisk="vens_dumped")
    assert "share/results/tables/randomdata.csv" in manualens.keys()
def test_vens_mismatch():
    """Check mismatch calculation against virtualized ensemble data

    The mismatch of a virtual ensemble holding monthly summary data is
    compared to the mismatch of the originating ScratchEnsemble, for
    default, monthly and yearly time indices. Also covers virtualization
    without summary data loaded and removal of a realization.
    """
    # Fall back to the current directory to ease copying of the test
    # code into interactive sessions
    testdir = (
        os.path.dirname(os.path.abspath(__file__))
        if "__file__" in globals()
        else os.path.abspath(".")
    )
    ens_glob = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0/"

    ens = ScratchEnsemble("test", ens_glob)
    ens.load_smry(column_keys=["FOPT*"], time_index="monthly")
    vens = ens.to_virtual()

    # We don't need time_index now, because monthly is all we have.
    obs = Observations({"smryh": [{"key": "FOPT", "histvec": "FOPTH"}]})
    mismatch = obs.mismatch(vens)
    mismatch_raw = obs.mismatch(ens)

    assert isinstance(mismatch, pd.DataFrame)
    assert not mismatch.empty
    for colname in ("L1", "L2", "MISMATCH"):
        assert colname in mismatch.columns
    assert mismatch["MISMATCH"].sum() != mismatch_raw["MISMATCH"].sum()

    # With an explicit monthly time index, the ScratchEnsemble mismatch
    # must be equal to what the virtual ensemble gave:
    obs_monthly = Observations(
        {"smryh": [{"key": "FOPT", "histvec": "FOPTH", "time_index": "monthly"}]}
    )
    virt_sorted = mismatch.sort_values("REAL").reset_index(drop=True)
    scratch_sorted = (
        obs_monthly.mismatch(ens).sort_values("REAL").reset_index(drop=True)
    )
    assert (virt_sorted == scratch_sorted).all().all()

    # We should be able to do yearly smryh comparisons from virtualized
    # monthly profiles:
    obs_yearly = Observations(
        {"smryh": [{"key": "FOPT", "histvec": "FOPTH", "time_index": "yearly"}]}
    )
    mismatch_yearly = obs_yearly.mismatch(vens)
    assert mismatch_yearly["MISMATCH"].sum() != mismatch["MISMATCH"].sum()

    # When load_smry() is forgotten before virtualization:
    vens = ScratchEnsemble("test", ens_glob).to_virtual()
    with pytest.raises(ValueError):
        obs.mismatch(vens)

    # Removal of one realization in the virtualized ensemble:
    ens = ScratchEnsemble("test", ens_glob)
    ens.load_smry(column_keys=["FOPT*"], time_index="monthly")
    vens = ens.to_virtual()
    vens.remove_realizations(2)
    mismatch_subset = obs.mismatch(vens)
    assert 2 not in mismatch_subset["REAL"].unique()
    assert 0 in mismatch_subset["REAL"].unique()