def test_roundtrip_yaml(filename, readonly_testdata_dir):
    """Round-trip a test data set through the yaml (dict) format and back.

    The yaml format supports only a subset of what the internal dataframe
    format can hold, so the parsed input is first reduced to that subset,
    and some exceptions are hardcoded in this test function. Also pay
    attention to the way the yaml parser creates LABEL data.

    The ``readonly_testdata_dir`` fixture is requested but not referenced
    in the body.
    """
    yaml_supported_classes = ["SUMMARY_OBSERVATION", "BLOCK_OBSERVATION"]

    def _normalized(frame):
        """Order columns and rows deterministically before comparing."""
        return frame.sort_index(axis="columns").sort_values("LABEL")

    parsed = autoparse_file(filename)[1]

    # Reduce to the subset supported by yaml, and drop columns that became
    # entirely empty because of that reduction:
    in_yaml_subset = parsed["CLASS"].isin(yaml_supported_classes)
    expected = parsed[in_yaml_subset].dropna(axis="columns", how="all")

    # Convert to YAML (really dict) format and back again:
    roundtripped = obsdict2df(df2obsdict(expected))

    roundtripped = roundtripped.set_index("CLASS")
    expected = expected.set_index("CLASS")

    # WELL as used in yaml is not preservable in roundtrips, so it is
    # removed from both sides of the comparison when present.
    roundtripped = roundtripped.drop(columns="WELL", errors="ignore")
    expected = expected.drop(columns="WELL", errors="ignore")

    pd.testing.assert_frame_equal(
        _normalized(roundtripped),
        _normalized(expected),
        check_like=True,
    )
def test_obsdict2df(obsdict, expected_df):
    """Check conversion from the yaml format (a dict) to the internal
    dataframe format, for any kind of observation.

    Specifics for each class of observation are covered by their own test
    functions.
    """
    has_dates = "DATE" in expected_df
    if has_dates:
        # Convert the expected DATE column with pd.to_datetime before
        # the frames are compared.
        expected_df["DATE"] = pd.to_datetime(expected_df["DATE"])

    actual_df = obsdict2df(obsdict)

    # Column order is irrelevant, and dtypes are deliberately not compared.
    pd.testing.assert_frame_equal(
        actual_df.sort_index(axis="columns"),
        expected_df.sort_index(axis="columns"),
        check_dtype=False,
    )
def test_ertobs2df(string, expected):
    """Check the full conversion from ERT observation format to a Pandas
    dataframe, including round trips via the ERT and yaml formats.

    This exercises many of the other functions that are also tested
    individually.
    """

    def _by_column(frame):
        """Return the frame with its columns in sorted order."""
        return frame.sort_index(axis=1)

    parsed = ertobs2df(string)
    pd.testing.assert_frame_equal(
        _by_column(parsed), _by_column(expected), check_dtype=False
    )

    # Round-trip test via the ERT observation format:
    via_ert = ertobs2df(df2ertobs(parsed))
    pd.testing.assert_frame_equal(_by_column(via_ert), _by_column(parsed))

    # Round-trip test via yaml, only attempted when DATE is among the
    # expected columns:
    if "DATE" in expected:
        via_yaml = obsdict2df(df2obsdict(parsed))
        pd.testing.assert_frame_equal(_by_column(via_yaml), _by_column(parsed))
def autoparse_file(filename):
    """Detects the observation file format for a given filename.

    This is done by attempting to parse its content as each supported
    format in turn (ResInsight CSV, internal CSV, YAML, ERT) and giving up
    on a format when parsing raises or the result does not look like that
    format.

    NB: In case of ERT file formats, the include statements are
    interpreted relative to current working directory. Thus it
    is recommended to reparse with correct cwd after detecting ERT file
    format. The correct cwd for include-statement is the path of the
    ERT config file, which is outside the context of fmuobs.

    Args:
        filename (str): Path to the file to inspect.

    Returns:
        tuple: First element is a string in [resinsight, csv, yaml, ert],
        or None if the file was not recognized as any of these. The second
        element is always a dataframe. It is empty when the format was not
        recognized, and also when an ERT file triggered a
        FileNotFoundError while parsing (include files that could not be
        located).
    """
    # Attempt 1: ResInsight observation file, semicolon-separated CSV.
    try:
        dframe = pd.read_csv(filename, sep=";")
        if (
            {"DATE", "VECTOR", "VALUE", "ERROR"}.issubset(dframe.columns)
            and not dframe.empty
        ):
            logger.info("Parsed %s as a ResInsight observation file", filename)
            return ("resinsight", resinsight_df2df(dframe))
    except ValueError:
        pass

    # Attempt 2: CSV dump of the internal dataframe format.
    try:
        dframe = pd.read_csv(filename, sep=",")
        if {"CLASS", "LABEL"}.issubset(dframe.columns) and not dframe.empty:
            logger.info(
                "Parsed %s as a CSV (internal dataframe format for ertobs) file",
                filename,
            )
            if "DATE" in dframe:
                dframe["DATE"] = pd.to_datetime(dframe["DATE"])
            return ("csv", dframe)
    except ValueError:
        pass

    # Attempt 3: YAML. Only accepted when the document is a dict with a
    # non-empty "smry" or "rft" entry.
    try:
        with open(filename) as f_handle:
            obsdict = yaml.safe_load(f_handle.read())
        if isinstance(obsdict, dict) and (
            obsdict.get("smry") or obsdict.get("rft")
        ):
            logger.info("Parsed %s as a YAML file with observations", filename)
            return ("yaml", obsdict2df(obsdict))
    except yaml.scanner.ScannerError as exception:
        # This occurs if there are tabs in the file, which is not
        # allowed in a YAML file (but it can be present in ERT observation files)
        logger.debug("ScannerError while attempting yaml-parsing")
        logger.debug(str(exception))
    except ValueError:
        pass

    # Attempt 4: ERT observation format.
    try:
        with open(filename) as f_handle:
            # This function does not have information on include file paths.
            # Accept a FileNotFoundError while parsing, if we encounter that
            # it is most likely an ert file, but which needs additional hints
            # on where include files are located.
            try:
                dframe = ertobs2df(f_handle.read())
            except FileNotFoundError:
                logger.info(
                    "Parsed %s as an ERT observation file, with include statements",
                    filename,
                )
                return ("ert", pd.DataFrame())
        if {"CLASS", "LABEL"}.issubset(dframe.columns) and not dframe.empty:
            # Require at least one CLASS value that is a key of
            # CLASS_SHORTNAME before accepting the file as ERT.
            if set(dframe["CLASS"]).intersection(set(CLASS_SHORTNAME.keys())):
                logger.info("Parsed %s as an ERT observation file", filename)
                return ("ert", dframe)
    except ValueError:
        pass

    logger.error(
        "Unable to parse %s as any supported observation file format", filename
    )
    # Return an empty DataFrame *instance*, matching the other empty-result
    # branch above, so callers can safely inspect e.g. `.empty` on the result.
    return (None, pd.DataFrame())