Example #1
def test_zarr_array_to_parquet_table(dataset):
    """
    Test converting from a zarr array to a parquet table, specifying a list of
    variables to store and setting 'snappy' compression.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        zarrstore: str = os.path.join(tmpdir, "temp.zarr")
        dataset.to_zarr(store=zarrstore, consolidated=True)
        zarrarray: zarr.hierarchy.Group = zarr.open_consolidated(store=zarrstore)

        parquetpath: str = os.path.join(tmpdir, "temp.parquet")
        ndarray_to_parquet(
            ndarray=zarrarray,
            parquetpath=parquetpath,
            variables=["longitude", "latitude", "h_corr", "delta_time"],
            compression="snappy",
        )

        df: dask.dataframe.core.DataFrame = dask.dataframe.read_parquet(
            path=parquetpath
        )
        assert len(df) == 1404
        assert list(df.columns) == [
            "longitude",
            "latitude",
            "h_corr_1",
            "h_corr_2",
            "delta_time_1",
            "delta_time_2",
        ]
        assert all(np.issubdtype(dtype, np.float64) for dtype in df.dtypes)
Example #2
def test_xarray_dataset_to_parquet_table(dataset):
    """
    Test converting from an xarray Dataset to a parquet table, specifying a
    list of variables to store and setting 'snappy' compression.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        parquetpath: str = os.path.join(tmpdir, "temp.parquet")
        ndarray_to_parquet(
            ndarray=dataset,
            parquetpath=parquetpath,
            variables=["longitude", "latitude", "h_corr", "h_corr_sigma"],
            compression="snappy",
        )

        df: dask.dataframe.core.DataFrame = dask.dataframe.read_parquet(
            path=parquetpath
        )
        assert len(df) == 1404
        assert list(df.columns) == [
            "longitude",
            "latitude",
            "h_corr_1",
            "h_corr_2",
            "h_corr_sigma_1",
            "h_corr_sigma_2",
        ]
        assert all(np.issubdtype(dtype, np.float64) for dtype in df.dtypes)
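The same call pattern works outside of a test harness. Below is a minimal standalone sketch; the synthetic dataset, its dimension names ("ref_pt", "cycle_number") and the output path are assumptions for illustration, not taken from the original tests:

import numpy as np
import xarray as xr
from deepicedrain import ndarray_to_parquet

# Tiny synthetic stand-in for the ICESat-2 ATL11 sample dataset
ds = xr.Dataset(
    data_vars={
        "longitude": ("ref_pt", np.linspace(100.0, 101.0, num=5)),
        "latitude": ("ref_pt", np.linspace(-70.0, -71.0, num=5)),
        "h_corr": (("ref_pt", "cycle_number"), np.random.rand(5, 2)),
    },
    coords={"ref_pt": np.arange(5), "cycle_number": [1, 2]},
)
# Write selected variables to a snappy-compressed parquet file
ndarray_to_parquet(
    ndarray=ds,
    parquetpath="temp.parquet",
    variables=["longitude", "latitude", "h_corr"],
    compression="snappy",
)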
Example #3
def fixture_dataframe():
    """
    Loads the sample ICESat-2 ATL11 data and processes it into a suitable
    pandas.DataFrame format.
    """
    dataset: xr.Dataset = catalog.test_data.atl11_test_case.to_dask()
    dataset["utc_time"] = deltatime_to_utctime(dataarray=dataset.delta_time)

    with tempfile.TemporaryDirectory() as tmpdir:
        df: pd.DataFrame = ndarray_to_parquet(
            ndarray=dataset,
            parquetpath=os.path.join(tmpdir, "temp.parquet"),
            variables=["longitude", "latitude", "h_corr", "utc_time"],
            use_deprecated_int96_timestamps=True,
        )
    dataframe: pd.DataFrame = wide_to_long(
        df=df, stubnames=["h_corr", "utc_time"], j="cycle_number"
    )
    dataframe: pd.DataFrame = dataframe.reset_index(drop=True)

    # Mock up a dummy track1_track2 column based on the cycle_number
    dataframe["track1_track2"] = np.where(
        dataframe["cycle_number"] == 1, "0111_pt1x0222_pt2", "0333pt3x0111_pt1"
    )
    return dataframe
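The wide_to_long call above is what turns the per-cycle columns (h_corr_1, h_corr_2, ...) back into a single h_corr column keyed by cycle_number. A rough plain-pandas equivalent of that reshape, shown with made-up values and using pandas.wide_to_long directly rather than the deepicedrain wrapper:

import pandas as pd

# Wide table with one column per cycle, as produced by ndarray_to_parquet
df_wide = pd.DataFrame(
    {
        "longitude": [100.0, 101.0],
        "latitude": [-70.0, -71.0],
        "h_corr_1": [10.0, 11.0],
        "h_corr_2": [12.0, 13.0],
    }
)
# Melt the h_corr_<n> columns into a single h_corr column indexed by cycle_number
df_long = (
    pd.wide_to_long(
        df_wide.reset_index(),
        stubnames=["h_corr"],
        i="index",
        j="cycle_number",
        sep="_",
    )
    .reset_index()
    .drop(columns="index")
)
# df_long now has one row per (point, cycle_number), with a single h_corr column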
Example #4
def fixture_table():
    """
    Loads the sample ICESat-2 ATL11 data and processes it into a suitable
    pandas.DataFrame table format.
    """
    dataset: xr.Dataset = catalog.test_data.atl11_test_case.to_dask()
    with tempfile.TemporaryDirectory() as tmpdir:
        parquetpath: str = os.path.join(tmpdir, "temp.parquet")
        table: pd.DataFrame = ndarray_to_parquet(
            ndarray=dataset,
            parquetpath=parquetpath,
            variables=["longitude", "latitude", "h_corr"],
        )
    return table
Example #5
    if not os.path.exists(f"ATLXI/df_dhdt_{placename.lower()}.parquet"):
        # Subset dataset to geographic region of interest
        ds_subset: xr.Dataset = region.subset(data=ds_dhdt)
        # Rename delta_time (timedelta64) to utc_time (datetime64), because that's what it is
        ds_subset = ds_subset.rename(name_dict={"delta_time": "utc_time"})
        # Save to parquet format. If the dask workers get killed, reduce the number
        # of workers (e.g. 72 to 32) so that each worker will have more memory
        deepicedrain.ndarray_to_parquet(
            ndarray=ds_subset,
            parquetpath=f"ATLXI/df_dhdt_{placename.lower()}.parquet",
            variables=[
                "x",
                "x_atc",
                "y",
                "y_atc",
                "dhdt_slope",
                "referencegroundtrack",
                "h_corr",
                "utc_time",
            ],
            dropnacols=["dhdt_slope"],
            use_deprecated_int96_timestamps=True,
        )
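        # Hypothetical illustration (not part of the original script): with a local
        # dask.distributed cluster, the worker count could be reduced up front so
        # that each worker gets a larger share of memory, e.g.
        #     client = dask.distributed.Client(n_workers=32, threads_per_worker=1)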
# df_dhdt = pd.read_parquet(f"ATLXI/df_dhdt_{placename.lower()}.parquet")
df_dhdt: cudf.DataFrame = cudf.read_parquet(f"ATLXI/df_dhdt_{placename.lower()}.parquet")

# %%
# Interactive holoviews scatter plot to find the referencegroundtrack(s) needed
# Tip: Hover over the points and find those with high 'dhdt_slope' values
viewer = deepicedrain.IceSat2Explorer(name="ICESat-2 Explorer", placename=placename)
dashboard: pn.layout.Column = pn.Column(viewer.widgets, viewer.view)
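To actually view it, the dashboard Column would typically be displayed in a notebook cell or served with Panel. A hedged one-liner, assuming an interactive (non-headless) session:

dashboard.show()  # or mark it with dashboard.servable() and run `panel serve` on the script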
Example #6
                                 env={"X2SYS_HOME": os.environ["X2SYS_HOME"]})
client

# %% [markdown]
# # Data Preparation

# %%
min_date, max_date = ("2018-10-14", "2020-07-16")

# %%
if not os.path.exists("ATLXI/df_dhdt_antarctica.parquet"):
    zarrarray = zarr.open_consolidated(store="ATLXI/ds_dhdt_antarctica.zarr",
                                       mode="r")
    _ = deepicedrain.ndarray_to_parquet(
        ndarray=zarrarray,
        parquetpath="ATLXI/df_dhdt_antarctica.parquet",
        variables=["x", "y", "dhdt_slope", "referencegroundtrack", "h_corr"],
        dropnacols=["dhdt_slope"],
    )

# %%
# Read in Antarctic Drainage Basin Boundaries shapefile into a GeoDataFrame
ice_boundaries: gpd.GeoDataFrame = (
    deepicedrain.catalog.measures_antarctic_boundaries.read())
drainage_basins: gpd.GeoDataFrame = ice_boundaries.query(expr="TYPE == 'GR'")

# %% [markdown]
# ## Load in ICESat-2 data (x, y, dhdt) and do initial trimming

# %%
# Read in raw x, y, dhdt_slope and referencegroundtrack data into the GPU
cudf_raw: cudf.DataFrame = cudf.read_parquet(