def test_zarr_array_to_parquet_table(dataset):
    """
    Test converting from a zarr array to a parquet table, specifying a list
    of variables to store and setting 'snappy' compression.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        zarrstore: str = os.path.join(tmpdir, "temp.zarr")
        dataset.to_zarr(store=zarrstore, consolidated=True)
        zarrarray: zarr.hierarchy.Group = zarr.open_consolidated(store=zarrstore)

        parquetpath: str = os.path.join(tmpdir, "temp.parquet")
        ndarray_to_parquet(
            ndarray=zarrarray,
            parquetpath=parquetpath,
            variables=["longitude", "latitude", "h_corr", "delta_time"],
            compression="snappy",
        )

        df: dask.dataframe.core.DataFrame = dask.dataframe.read_parquet(
            path=parquetpath
        )
        assert len(df) == 1404
        assert list(df.columns) == [
            "longitude",
            "latitude",
            "h_corr_1",
            "h_corr_2",
            "delta_time_1",
            "delta_time_2",
        ]
        assert all(np.issubdtype(dtype, np.float64) for dtype in df.dtypes)
def test_xarray_dataset_to_parquet_table(dataset):
    """
    Test converting from an xarray Dataset to a parquet table, specifying a
    list of variables to store and setting 'snappy' compression.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        parquetpath: str = os.path.join(tmpdir, "temp.parquet")
        ndarray_to_parquet(
            ndarray=dataset,
            parquetpath=parquetpath,
            variables=["longitude", "latitude", "h_corr", "h_corr_sigma"],
            compression="snappy",
        )

        df: dask.dataframe.core.DataFrame = dask.dataframe.read_parquet(
            path=parquetpath
        )
        assert len(df) == 1404
        assert list(df.columns) == [
            "longitude",
            "latitude",
            "h_corr_1",
            "h_corr_2",
            "h_corr_sigma_1",
            "h_corr_sigma_2",
        ]
        assert all(np.issubdtype(dtype, np.float64) for dtype in df.dtypes)
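# Note: the column names asserted above (h_corr_1, h_corr_2, ...) come from
# ndarray_to_parquet splitting each 2D (points x cycles) variable into one
# float64 column per cycle. Below is a minimal sketch of that naming
# convention only; the helper name is illustrative and this is NOT
# deepicedrain's actual implementation.
import numpy as np
import pandas as pd


def _sketch_flatten_2d_variable(name: str, data: np.ndarray) -> pd.DataFrame:
    """Illustrative only: split a (points, cycles) array into suffixed columns."""
    return pd.DataFrame(
        data={f"{name}_{cycle + 1}": data[:, cycle] for cycle in range(data.shape[1])}
    )


# Quick self-check mirroring the assertions in the tests above
_df = _sketch_flatten_2d_variable(name="h_corr", data=np.zeros(shape=(1404, 2)))
assert list(_df.columns) == ["h_corr_1", "h_corr_2"]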
def fixture_dataframe():
    """
    Loads the sample ICESat-2 ATL11 data, and processes it into a suitable
    pandas.DataFrame format.
    """
    dataset: xr.Dataset = catalog.test_data.atl11_test_case.to_dask()
    dataset["utc_time"] = deltatime_to_utctime(dataarray=dataset.delta_time)

    with tempfile.TemporaryDirectory() as tmpdir:
        df: pd.DataFrame = ndarray_to_parquet(
            ndarray=dataset,
            parquetpath=os.path.join(tmpdir, "temp.parquet"),
            variables=["longitude", "latitude", "h_corr", "utc_time"],
            use_deprecated_int96_timestamps=True,
        )
    dataframe: pd.DataFrame = wide_to_long(
        df=df, stubnames=["h_corr", "utc_time"], j="cycle_number"
    )
    dataframe: pd.DataFrame = dataframe.reset_index(drop=True)
    # Mock up a dummy track1_track2 column based on the cycle_number
    dataframe["track1_track2"] = np.where(
        dataframe["cycle_number"] == 1, "0111_pt1x0222_pt2", "0333pt3x0111_pt1"
    )
    return dataframe
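# deepicedrain's wide_to_long above presumably wraps pandas.wide_to_long; the
# sketch below shows only the underlying pandas reshape, assuming the suffixed
# wide layout produced by ndarray_to_parquet (h_corr_1, h_corr_2, ...). The
# helper name and the "id" column are illustrative, not part of deepicedrain.
def _sketch_wide_to_long(df: pd.DataFrame, stubnames: list, j: str) -> pd.DataFrame:
    """Illustrative only: melt suffixed wide columns into long format."""
    df = df.reset_index().rename(columns={"index": "id"})
    long_df = pd.wide_to_long(df=df, stubnames=stubnames, i="id", j=j, sep="_")
    return long_df.reset_index()


# e.g. columns ["longitude", "h_corr_1", "h_corr_2"] become one row per
# (id, cycle_number) pair with a single "h_corr" column:
# _sketch_wide_to_long(df=df, stubnames=["h_corr"], j="cycle_number")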
def fixture_table():
    """
    Loads the sample ICESat-2 ATL11 data, and processes it into a suitable
    pandas.DataFrame table format.
    """
    dataset: xr.Dataset = catalog.test_data.atl11_test_case.to_dask()

    with tempfile.TemporaryDirectory() as tmpdir:
        parquetpath: str = os.path.join(tmpdir, "temp.parquet")
        table: pd.DataFrame = ndarray_to_parquet(
            ndarray=dataset,
            parquetpath=parquetpath,
            variables=["longitude", "latitude", "h_corr"],
        )
    return table
if not os.path.exists(f"ATLXI/df_dhdt_{placename}.parquet"):
    # Subset dataset to geographic region of interest
    ds_subset: xr.Dataset = region.subset(data=ds_dhdt)
    # Rename delta_time (timedelta64) to utc_time (datetime64), because
    # that's what it is
    ds_subset = ds_subset.rename(name_dict={"delta_time": "utc_time"})
    # Save to parquet format. If the dask workers get killed, reduce the
    # number of workers (e.g. 72 to 32) so that each worker will have more memory
    deepicedrain.ndarray_to_parquet(
        ndarray=ds_subset,
        parquetpath=f"ATLXI/df_dhdt_{placename.lower()}.parquet",
        variables=[
            "x",
            "x_atc",
            "y",
            "y_atc",
            "dhdt_slope",
            "referencegroundtrack",
            "h_corr",
            "utc_time",
        ],
        dropnacols=["dhdt_slope"],
        use_deprecated_int96_timestamps=True,
    )
# df_dhdt = pd.read_parquet(f"ATLXI/df_dhdt_{placename}.parquet")
df_dhdt: cudf.DataFrame = cudf.read_parquet(f"ATLXI/df_dhdt_{placename}.parquet")

# %%
# Interactive holoviews scatter plot to find referencegroundtrack needed
# Tip: Hover over the points, and find those with high 'dhdt_slope' values
viewer = deepicedrain.IceSat2Explorer(name="ICESat-2 Explorer", placename=placename)
dashboard: pn.layout.Column = pn.Column(viewer.widgets, viewer.view)
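# %%
# Hedged sketch of the worker-count fallback mentioned in the save-to-parquet
# cell above: recreate the distributed client with fewer workers so that each
# worker gets a larger share of memory. The counts are the example numbers
# from that comment (72 -> 32), not tuned values; uncomment only if needed.
# import dask.distributed
# client = dask.distributed.Client(n_workers=32, threads_per_worker=1)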
    env={"X2SYS_HOME": os.environ["X2SYS_HOME"]},
)
client

# %% [markdown]
# # Data Preparation

# %%
min_date, max_date = ("2018-10-14", "2020-07-16")

# %%
if not os.path.exists("ATLXI/df_dhdt_antarctica.parquet"):
    zarrarray = zarr.open_consolidated(store="ATLXI/ds_dhdt_antarctica.zarr", mode="r")
    _ = deepicedrain.ndarray_to_parquet(
        ndarray=zarrarray,
        parquetpath="ATLXI/df_dhdt_antarctica.parquet",
        variables=["x", "y", "dhdt_slope", "referencegroundtrack", "h_corr"],
        dropnacols=["dhdt_slope"],
    )

# %%
# Read in Antarctic Drainage Basin Boundaries shapefile into a GeoDataFrame
ice_boundaries: gpd.GeoDataFrame = (
    deepicedrain.catalog.measures_antarctic_boundaries.read()
)
drainage_basins: gpd.GeoDataFrame = ice_boundaries.query(expr="TYPE == 'GR'")

# %% [markdown]
# ## Load in ICESat-2 data (x, y, dhdt) and do initial trimming

# %%
# Read in raw x, y, dhdt_slope and referencegroundtrack data into the GPU
cudf_raw: cudf.DataFrame = cudf.read_parquet(