Example #1
def test_polygons_geofeather(tmpdir, pg_polygons_wgs84):
    """Confirm that we can round-trip polygons to / from feather file"""

    filename = tmpdir / "polygons_wgs84.feather"
    to_geofeather(pg_polygons_wgs84, filename, crs="EPSG:4326")

    assert os.path.exists(filename)

    df = from_geofeather(filename)
    cols = df.columns.drop("geometry")
    assert_frame_equal(df[cols], pg_polygons_wgs84[cols])
    assert_geometry_equal(df.geometry, pg_polygons_wgs84.geometry)

    assert df.crs == GEO_CRS
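For context, the round trip these tests exercise looks like this in isolation. A minimal sketch assuming geofeather's GeoDataFrame interface (the file name and data are illustrative):

import geopandas as gp
from shapely.geometry import Point
from geofeather import from_geofeather, to_geofeather

gdf = gp.GeoDataFrame(
    {"name": ["a", "b"]}, geometry=[Point(0, 0), Point(1, 1)], crs="EPSG:4326"
)
to_geofeather(gdf, "points.feather", crs="EPSG:4326")
roundtripped = from_geofeather("points.feather")
assert (roundtripped["name"] == gdf["name"]).all()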
Example #2
def test_points_geofeather_no_crs(tmpdir, pg_points_wgs84):
    """Confirm that we can round-trip points to / from feather file"""

    filename = tmpdir / "points_wgs84.feather"
    to_geofeather(pg_points_wgs84, filename)

    assert os.path.exists(filename)

    with pytest.warns(UserWarning):
        df = from_geofeather(filename)

    cols = df.columns.drop("geometry")
    assert_frame_equal(df[cols], pg_points_wgs84[cols])
    assert_geometry_equal(df.geometry, pg_points_wgs84.geometry)

    assert df.crs is None
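For reference, pytest.warns can also assert on the warning text, which makes the expected missing-CRS warning explicit (the match pattern here is an assumption about the message):

with pytest.warns(UserWarning, match="crs"):
    df = from_geofeather(filename)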
Example #3
def snap_to_large_waterbodies(df, to_snap):
    """Snap to nearest large waterbody.

    NOTE: only run this on dams that could not snap to flowlines, to avoid
    moving them far away.

    This captures large dam centerpoints that are not near enough to flowlines.

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """
    wb = from_geofeather(nhd_dir / "merged" / "large_waterbodies.feather").set_index(
        "wbID"
    )
    drains = (
        from_geofeather(nhd_dir / "merged" / "large_waterbody_drain_points.feather")
        .rename(columns={"id": "drainID"})
        .set_index("drainID")
    )

    near_wb = nearest(to_snap.geometry, pg.boundary(wb.geometry), NEAR_WB_TOLERANCE)
    near_wb = (
        pd.DataFrame(near_wb)
        .join(to_snap.geometry)
        .join(
            drains.reset_index()
            .set_index("wbID")[["geometry", "drainID", "lineID"]]
            .rename(columns={"geometry": "drain"}),
            on="wbID",
        )
        .dropna(subset=["drain"])
    )
    near_wb["snap_dist"] = pg.distance(near_wb.geometry, near_wb.drain)

    # drop any that are > 250 m away, these aren't useful
    near_wb = near_wb.loc[near_wb.snap_dist <= WB_DRAIN_MAX_TOLERANCE].copy()

    # take the closest drain point
    near_wb = near_wb.sort_values(by="snap_dist").groupby(level=0).first()

    ix = near_wb.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_wb.drain
    df.loc[ix, "snap_dist"] = near_wb.distance
    df.loc[ix, "snap_ref_id"] = near_wb.drainID
    df.loc[ix, "lineID"] = near_wb.lineID
    df.loc[ix, "wbID"] = near_wb.wbID

    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        WB_DRAIN_MAX_TOLERANCE,
        "m tolerance of drain point of large waterbody that is within ",
        NEAR_WB_TOLERANCE,
        "m of dam",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within {}m of large waterbodies and within {}m of the drain point of those waterbodies".format(
            len(near_wb), NEAR_WB_TOLERANCE, WB_DRAIN_MAX_TOLERANCE
        )
    )

    return df, to_snap
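The nearest() helper used throughout these examples is project-specific. A minimal sketch of its assumed contract (closest right-side match within a scalar tolerance, returned with a distance column), built on a pygeos STRtree:

import pandas as pd
import pygeos as pg

def nearest_sketch(left, right, tolerance):
    # left, right: pandas Series of pygeos geometries
    tree = pg.STRtree(right.values)
    # bounding-box candidates within tolerance, as (left_pos, right_pos) pairs
    l_pos, r_pos = tree.query_bulk(pg.buffer(left.values, tolerance))
    pairs = pd.DataFrame(
        {
            right.index.name or "index_right": right.index[r_pos],
            "distance": pg.distance(left.values[l_pos], right.values[r_pos]),
        },
        index=left.index[l_pos],
    )
    # enforce the true distance tolerance, then keep the closest match per left geometry
    pairs = pairs.loc[pairs.distance <= tolerance]
    return pairs.sort_values(by="distance").groupby(level=0).first()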
Example #4
def snap_to_flowlines(df, to_snap):
    """Snap to nearest flowline, within tolerance

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """

    for region, HUC2s in list(REGION_GROUPS.items()):
        region_start = time()

        print("\n----- {} ------\n".format(region))

        print("Reading flowlines...")
        flowlines = from_geofeather(
            nhd_dir / "clean" / region / "flowlines.feather"
        ).set_index("lineID")

        in_region = to_snap.loc[to_snap.HUC2.isin(HUC2s)]
        print(
            "Selected {:,} barriers in region to snap against {:,} flowlines".format(
                len(in_region), len(flowlines)
            )
        )

        if len(in_region) == 0:
            print("No barriers in region to snap")
            continue

        print("Finding nearest flowlines...")
        # TODO: can use near instead of nearest, and persist list of near lineIDs per barrier
        # so that we can construct subnetworks with just those
        lines = nearest(
            in_region.geometry, flowlines.geometry, in_region.snap_tolerance
        )
        lines = lines.join(in_region.geometry).join(
            flowlines.geometry.rename("line"), on="lineID",
        )

        # project the point to the line,
        # find out its distance on the line,
        # then interpolate its new coordinates
        lines["geometry"] = pg.line_interpolate_point(
            lines.line, pg.line_locate_point(lines.line, lines.geometry)
        )

        ix = lines.index
        df.loc[ix, "snapped"] = True
        df.loc[ix, "geometry"] = lines.geometry
        df.loc[ix, "snap_dist"] = lines.distance
        df.loc[ix, "snap_ref_id"] = lines.lineID
        df.loc[ix, "lineID"] = lines.lineID
        df.loc[ix, "snap_log"] = ndarray_append_strings(
            "snapped: within ",
            to_snap.loc[ix].snap_tolerance,
            "m tolerance of flowline",
        )

        to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

        print(
            "{:,} barriers snapped in region in {:.2f}s".format(
                len(ix), time() - region_start
            )
        )

    # TODO: flag those that joined to loops

    return df, to_snap
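The projection step above is standard linear referencing; in isolation (coordinates illustrative):

import pygeos as pg

line = pg.linestrings([[0, 0], [10, 0]])
point = pg.points(4, 3)
# distance along the line of the closest position, then the coordinate at that distance
snapped = pg.line_interpolate_point(line, pg.line_locate_point(line, point))
# snapped == POINT (4 0): the input point projected onto the line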
Example #5
def snap_to_nhd_dams(df, to_snap):
    """Attempt to snap points from to_snap to NHD dams.

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """

    print("Snapping to NHD dams...")
    # NOTE: id is not unique for points
    nhd_dams_poly = (
        from_geofeather(nhd_dir / "merged" / "nhd_dams_poly.feather")
        .rename(columns={"id": "damID"})
        .set_index("damID")
        .drop(columns=["index"], errors="ignore")
    )
    nhd_dams = (
        from_geofeather(nhd_dir / "merged" / "nhd_dams_pt.feather")
        .rename(columns={"id": "damID"})
        .set_index("damID")
        .drop(columns=["index"], errors="ignore")
    )
    # set nulls back to na
    nhd_dams.wbID = nhd_dams.wbID.replace(-1, np.nan)

    ### Find dams that are really close (50m) to NHD dam polygons
    # Those that have multiple dams nearby are usually part of a dam complex
    snap_start = time()
    near_nhd = nearest(
        to_snap.geometry, nhd_dams_poly.geometry, distance=NHD_DAM_TOLERANCE
    )[["damID"]]

    # snap to nearest dam point for that dam (some are > 1 km away)
    # NOTE: this will create multiple entries for some dams
    near_nhd = near_nhd.join(to_snap.geometry.rename("source_pt")).join(
        nhd_dams, on="damID"
    )
    near_nhd["snap_dist"] = pg.distance(near_nhd.geometry, near_nhd.source_pt)
    near_nhd = (
        near_nhd.reset_index().sort_values(by=["id", "snap_dist"]).groupby("id").first()
    )

    ix = near_nhd.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_nhd.geometry
    df.loc[ix, "snap_dist"] = near_nhd.snap_dist
    df.loc[ix, "snap_ref_id"] = near_nhd.damID
    df.loc[ix, "lineID"] = near_nhd.lineID
    df.loc[ix, "wbID"] = near_nhd.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ", NHD_DAM_TOLERANCE, "m of NHD dam polygon"
    )
    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()
    print(
        "Snapped {:,} dams to NHD dam polygons in {:.2f}s".format(
            len(ix), time() - snap_start
        )
    )

    ### Find dams that are close (within snapping tolerance) of NHD dam points
    snap_start = time()
    tmp = nhd_dams.reset_index()  # reset index so we have unique index to join on
    near_nhd = nearest(
        to_snap.geometry, tmp.geometry, distance=to_snap.snap_tolerance
    ).rename(columns={"distance": "snap_dist"})

    near_nhd = near_nhd.join(to_snap.geometry.rename("source_pt")).join(
        tmp, on="index_right"
    )
    near_nhd = (
        near_nhd.reset_index().sort_values(by=["id", "snap_dist"]).groupby("id").first()
    )

    ix = near_nhd.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_nhd.geometry
    df.loc[ix, "snap_dist"] = near_nhd.snap_dist
    df.loc[ix, "snap_ref_id"] = near_nhd.damID
    df.loc[ix, "lineID"] = near_nhd.lineID
    df.loc[ix, "wbID"] = near_nhd.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of NHD dam point but >",
        NHD_DAM_TOLERANCE,
        "m from NHD dam polygon",
    )
    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()
    print(
        "Snapped {:,} dams to NHD dam points in {:.2f}s".format(
            len(ix), time() - snap_start
        )
    )

    ### TODO: identify any NHD dam points that didn't get claimed  (need to do this after snapping others)

    return df, to_snap
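The sort_values / groupby / first idiom above is how ties are resolved whenever nearest() returns several candidates per barrier; a toy illustration:

import pandas as pd

matches = pd.DataFrame(
    {"id": [1, 1, 2], "snap_dist": [12.0, 3.0, 8.0], "damID": [10, 11, 12]}
)
closest = matches.sort_values(by=["id", "snap_dist"]).groupby("id").first()
# id 1 keeps damID 11 (3.0m); id 2 keeps damID 12 (8.0m)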
Example #6
def snap_to_waterbodies(df, to_snap):
    """Attempt to snap points from to_snap to waterbody drain points.

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """

    ### Attempt to snap to waterbody drain points for major waterbodies
    # Use larger tolerance for larger waterbodies
    print("Snapping to waterbodies and drain points..")
    wb = from_geofeather(nhd_dir / "merged" / "waterbodies.feather").set_index("wbID")
    drains = (
        from_geofeather(nhd_dir / "merged" / "waterbody_drain_points.feather")
        .rename(columns={"id": "drainID"})
        .set_index("drainID")
    )

    ### First pass - find the dams that are contained by waterbodies
    contained_start = time()

    in_wb = sjoin(to_snap, wb, how="inner").index_right.rename("wbID")

    # update wbID in dataset, but this doesn't mean it is snapped
    ix = in_wb.index
    df.loc[ix, "wbID"] = in_wb

    print(
        "Found {:,} dams in waterbodies in {:.2f}s".format(
            len(in_wb), time() - contained_start
        )
    )

    print("Finding nearest drain points...")
    snap_start = time()
    # join back to pygeos geoms and join to drains
    # NOTE: this may produce multiple drains for some waterbodies
    in_wb = (
        pd.DataFrame(in_wb)
        .join(to_snap[["geometry", "snap_tolerance"]])
        .join(
            drains.reset_index()
            .set_index("wbID")[["geometry", "drainID", "lineID"]]
            .rename(columns={"geometry": "drain"}),
            on="wbID",
        )
        .dropna(subset=["drain"])
    )
    in_wb["snap_dist"] = pg.distance(in_wb.geometry, in_wb.drain)

    # drop any that are > 500 m away, these aren't useful
    in_wb = in_wb.loc[in_wb.snap_dist <= 500].copy()

    # take the closest drain point
    in_wb.index.name = "index"
    in_wb = (
        in_wb.reset_index()
        .sort_values(by=["index", "snap_dist"])
        .groupby("index")
        .first()
    )

    # Any that are within the snap tolerance just snap to that drain
    close_enough = in_wb.loc[in_wb.snap_dist <= in_wb.snap_tolerance]
    ix = close_enough.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = close_enough.drain
    df.loc[ix, "snap_dist"] = close_enough.snap_dist
    df.loc[ix, "snap_ref_id"] = close_enough.drainID
    df.loc[ix, "lineID"] = close_enough.lineID
    df.loc[ix, "wbID"] = close_enough.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of drain point for waterbody that contains this dam",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within tolerance of the drain points for their waterbody in {:.2f}s".format(
            len(ix), time() - snap_start
        )
    )

    # Any that are > tolerance away from their own drain, but within tolerance of another drain
    # should snap to the other drain; these are in chains of multiple waterbodies.
    # Visually confirmed this by looking at several.
    snap_start = time()
    further = in_wb.loc[in_wb.snap_dist > in_wb.snap_tolerance].copy()
    nearest_drains = nearest(further.geometry, drains.geometry, further.snap_tolerance)

    maybe_near_neighbor = further.join(nearest_drains, rsuffix="_nearest")

    ix = maybe_near_neighbor.loc[
        maybe_near_neighbor.distance < maybe_near_neighbor.snap_dist
    ].index
    near_neighbor = (
        (
            maybe_near_neighbor.loc[ix]
            .drop(columns=["drain", "drainID", "wbID", "lineID", "snap_dist"])
            .rename(columns={"drainID_nearest": "drainID", "distance": "snap_dist"})
            .join(
                drains[["geometry", "lineID", "wbID"]].rename(
                    columns={"geometry": "drain"}
                ),
                on="drainID",
            )
        )
        .sort_values(by="snap_dist")
        .groupby(level=0)
        .first()
    )

    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_neighbor.drain
    df.loc[ix, "snap_dist"] = near_neighbor.snap_dist
    df.loc[ix, "snap_ref_id"] = near_neighbor.drainID
    df.loc[ix, "lineID"] = near_neighbor.lineID
    df.loc[ix, "wbID"] = near_neighbor.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of drain point for adjacent waterbody",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams close to drain points for an adjacent waterbody in {:.2f}s".format(
            len(ix), time() - snap_start
        )
    )

    # Any that remain and are within 250 m (WB_DRAIN_MAX_TOLERANCE) of their
    # waterbody's drain snap to that drain
    further = further.loc[
        ~further.index.isin(ix) & (further.snap_dist <= WB_DRAIN_MAX_TOLERANCE)
    ].copy()

    ix = further.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = further.drain
    df.loc[ix, "snap_dist"] = further.snap_dist
    df.loc[ix, "snap_ref_id"] = further.drainID
    df.loc[ix, "lineID"] = further.lineID
    df.loc[ix, "wbID"] = further.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "-",
        WB_DRAIN_MAX_TOLERANCE,
        "m tolerance of drain point of waterbody that contains this dam",
    )
    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within {}m of the drain points for their waterbody".format(
            len(ix), WB_DRAIN_MAX_TOLERANCE
        )
    )

    ### Find the ones that are not in a waterbody but within tolerance of a drain
    # Visually inspected several that had multiple waterbodies nearby
    # in all cases, the nearest one was sufficient
    print("Finding nearest waterbody drains for unsnapped dams...")
    snap_start = time()
    nearest_drains = nearest(to_snap.geometry, drains.geometry, to_snap.snap_tolerance)

    nearest_drains = nearest_drains.join(to_snap.geometry).join(
        drains[["geometry", "wbID", "lineID"]].rename(columns={"geometry": "drain"}),
        on="drainID",
    )

    ix = nearest_drains.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = nearest_drains.drain
    df.loc[ix, "snap_dist"] = nearest_drains.distance
    df.loc[ix, "snap_ref_id"] = nearest_drains.drainID
    df.loc[ix, "lineID"] = nearest_drains.lineID
    df.loc[ix, "wbID"] = nearest_drains.wbID

    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of drain point of waterbody (dam not in waterbody)",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within {}m of waterbody drain points".format(
            len(ix), to_snap.snap_tolerance.max()
        )
    )

    # TODO: need to track which waterbodies were claimed by dams

    return df, to_snap
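ndarray_append_strings is a project helper; its assumed behavior is elementwise string concatenation across a mix of scalars and array-likes. A plausible reconstruction:

import numpy as np

def ndarray_append_strings_sketch(*parts):
    # scalars become 0-d string arrays and broadcast against array-like parts
    arrays = [np.asarray(part).astype(str) for part in parts]
    out = arrays[0]
    for arr in arrays[1:]:
        out = np.char.add(out, arr)
    return out

# ndarray_append_strings_sketch("snapped: within ", np.array([50, 100]), "m tolerance")
# -> array(['snapped: within 50m tolerance', 'snapped: within 100m tolerance'], ...)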
df["dup_count"] = np.nan
df["dup_log"] = "not a duplicate"
df["dup_sort"] = 0  # not meaningful for waterfalls
df["ManualReview"] = 0  # not meaningful for waterfalls

dedup_start = time()
df, to_dedup = find_duplicates(df,
                               to_dedup=df.copy(),
                               tolerance=DUPLICATE_TOLERANCE)
print("Found {:,} total duplicates in {:.2f}s".format(
    len(df.loc[df.duplicate]),
    time() - dedup_start))

### Deduplicate by dams
# any that are within duplicate tolerance of dams may be duplicating those dams
dams = from_geofeather(master_dir / "dams.feather")
near_dams = nearest(df.geometry, dams.geometry, DUPLICATE_TOLERANCE)

ix = near_dams.index
df.loc[ix, "duplicate"] = True
df.loc[ix, "dup_log"] = "Within {}m of an existing dam".format(DUPLICATE_TOLERANCE)

print("Found {} waterfalls within {}m of dams".format(len(ix),
                                                      DUPLICATE_TOLERANCE))

### Join to line atts
flowlines = deserialize_dfs(
    [
        nhd_dir / "clean" / region / "flowlines.feather"
        for region in REGION_GROUPS
    ]
)
Example #8
from analysis.constants import REGION_GROUPS, CRS_WKT

data_dir = Path("data")
networks_dir = data_dir / "networks"

df = deserialize_dfs([
    networks_dir / region / "small_barriers/barriers_network.feather"
    for region in REGION_GROUPS
])

networkIDs = df.loc[df.kind == "small_barrier"].upNetID.unique()

for region in list(REGION_GROUPS.keys()):
    print("\n----------------\n processing {}".format(region))

    networks = from_geofeather(networks_dir / region / "small_barriers" /
                               "network.feather")

    # Extract only the networks associated with small barriers, the rest are dams
    networks = networks.loc[networks.networkID.isin(networkIDs),
                            ["networkID", "geometry"]]

    if len(networks) == 0:
        print("No small barriers in this region, skipping")
        continue

    print("Writing to GPKG")
    to_gpkg(
        networks.reset_index(drop=True),
        data_dir / "tiles" / "small_barriers_network{}".format(region),
        index=False,
        name="networks",
Example #9
def add_spatial_joins(df):
    """Add spatial joins needed for network analysis.

    Parameters
    ----------
    df : GeoDataFrame

    Returns
    -------
    GeoDataFrame
        has fields added by spatial joins to other datasets
    """

    print("Joining to HUC12")
    huc12 = from_geofeather(boundaries_dir / "HUC12.feather")

    df = spatial_join(df, huc12)

    # Expected: not all barriers fall cleanly within the HUC12 dataset
    print("{:,} barriers were not assigned HUC12".format(
        len(df.loc[df.HUC12.isnull()])))

    # Calculate HUC codes for other levels from HUC12
    df["HUC2"] = df["HUC12"].str.slice(0, 2)  # region
    df["HUC6"] = df["HUC12"].str.slice(0, 6)  # basin
    df["HUC8"] = df["HUC12"].str.slice(0, 8)  # subbasin

    # Read in HUC6 and join in basin name
    huc6 = (
        from_geofeather(boundaries_dir / "HUC6.feather")[["HUC6", "NAME"]]
        .rename(columns={"NAME": "Basin"})
        .set_index("HUC6")
    )
    df = df.join(huc6, on="HUC6")

    print("Joining to counties")
    counties = from_geofeather(boundaries_dir / "counties.feather")[[
        "geometry", "County", "COUNTYFIPS", "STATEFIPS"
    ]]

    df = spatial_join(df, counties)

    # Join in state name based on STATEFIPS from county
    states = deserialize_df(boundaries_dir / "states.feather")[
        ["STATEFIPS", "State"]
    ].set_index("STATEFIPS")
    df = df.join(states, on="STATEFIPS")

    # Expected: not all barriers fall cleanly within the states dataset
    print("{:,} barriers were not assigned states".format(
        len(df.loc[df.STATEFIPS.isnull()])))

    ### Level 3 & 4 Ecoregions
    print("Joining to ecoregions")
    # Only need to join in ECO4 dataset since it has both ECO3 and ECO4 codes
    eco4 = from_geofeather(boundaries_dir / "eco4.feather")[
        ["geometry", "ECO3", "ECO4"]
    ]
    df = spatial_join(df, eco4)

    # Expected: not all barriers fall cleanly within the ecoregions dataset
    print("{:,} barriers were not assigned ecoregions".format(
        len(df.loc[df.ECO4.isnull()])))

    return df
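spatial_join is project-specific; with geopandas, a left intersection join that keeps at most one match per barrier could be approximated as:

import geopandas as gp

def spatial_join_sketch(left, right):
    # left join keeps unmatched barriers (joined columns become NaN);
    # drop duplicate matches so each barrier appears once
    joined = gp.sjoin(left, right, how="left").drop(columns=["index_right"])
    return joined.loc[~joined.index.duplicated(keep="first")]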
nhd_dir = Path("data/nhd")
src_dir = nhd_dir / "raw"

start = time()
for region, HUC2s in list(REGION_GROUPS.items())[4:]:
    region_start = time()

    print("\n----- {} ------\n".format(region))

    out_dir = nhd_dir / "clean" / region
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    print("Reading flowlines...")
    flowlines = from_geofeather(
        src_dir / region / "flowlines.feather"
    ).set_index("lineID")
    joins = deserialize_df(src_dir / region / "flowline_joins.feather")
    print("Read {:,} flowlines".format(len(flowlines)))

    ### Drop underground conduits
    ix = flowlines.loc[flowlines.FType == 420].index
    print("Removing {:,} underground conduits".format(len(ix)))
    flowlines = flowlines.loc[~flowlines.index.isin(ix)].copy()
    joins = remove_joins(
        joins, ix, downstream_col="downstream_id", upstream_col="upstream_id"
    )

    ### Manual fixes for flowlines
    exclude_ids = EXCLUDE_IDS.get(region, [])
    if exclude_ids:
Example #11
SNAP_TOLERANCE = 50  # meters
DUPLICATE_TOLERANCE = 10  # meters


data_dir = Path("data")
boundaries_dir = data_dir / "boundaries"
nhd_dir = data_dir / "nhd"
barriers_dir = data_dir / "barriers"
src_dir = barriers_dir / "source"
master_dir = barriers_dir / "master"
snapped_dir = barriers_dir / "snapped"
qa_dir = barriers_dir / "qa"

start = time()

df = from_geofeather(src_dir / "sarp_small_barriers.feather")
print("Read {:,} small barriers".format(len(df)))

### Add IDs for internal use
# internal ID
df["id"] = df.index.astype("uint32")
df = df.set_index("id", drop=False)

######### Fix data issues

# Fix mixed casing of values
for column in ("CrossingType", "RoadType", "Stream", "Road"):
    df[column] = df[column].fillna("Unknown").str.title().str.strip()
    df.loc[df[column].str.len() == 0, column] = "Unknown"

# Fix line returns in stream name and road name
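The excerpt is truncated at that comment; a plausible sketch of the cleanup it describes (the exact replacement is an assumption):

for column in ("Stream", "Road"):
    df[column] = df[column].str.replace("\r\n", " ").str.replace("\n", " ").str.strip()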
Example #12
src_dir = barriers_dir / "source"
master_dir = barriers_dir / "master"
snapped_dir = barriers_dir / "snapped"
qa_dir = barriers_dir / "qa"
dams_filename = "Raw_Featureservice_SARPUniqueID.gdb"
gdb = src_dir / dams_filename

# dams that fall outside SARP
outside_layer = "Dams_Non_SARP_States_09052019"

start = time()


### Read in SARP states and merge
print("Reading dams in SARP states")
df = from_geofeather(src_dir / "sarp_dams.feather")
print("Read {:,} dams in SARP states".format(len(df)))

### Read in non-SARP states and join in
# these are for states that overlap with HUC4s that overlap with SARP states
print(
    "Reading dams that fall outside SARP states, but within HUC4s that overlap with SARP states..."
)
outside_df = (
    gp.read_file(gdb, layer=outside_layer)
    # SARPID is outdated; use SARPUniqueID instead
    .drop(columns=["SARPID"])
    .rename(columns={"SARPUniqueID": "SARPID", "Snap2018": "ManualReview"})[
        DAM_COLS + ["geometry"]
    ]
    .to_crs(CRS)
)
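When working with a file geodatabase like this, the available layers can be checked up front with fiona (which geopandas uses to read the GDB):

import fiona

print(fiona.listlayers(str(gdb)))  # should include outside_layer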
Example #13
from analysis.pygeos_compat import sjoin_geometry as sjoin, dissolve, to_gdf

from analysis.constants import REGION_GROUPS, CRS
from analysis.util import append
from analysis.prep.barriers.lib.points import nearest, neighborhoods

nhd_dir = Path("data/nhd")
src_dir = nhd_dir / "clean"
out_dir = nhd_dir / "merged"
extra_dir = nhd_dir / "extra"

start = time()

### Merge NHD lines and areas that represent dams and dam-related features
print("Reading NHD lines and areas, and merging...")
nhd_lines = from_geofeather(extra_dir / "nhd_lines.feather")
nhd_lines = nhd_lines.loc[
    nhd_lines.FType.isin([343, 369, 398]) & nhd_lines.geometry.notnull()
].copy()
# create buffers (10m) to merge with NHD areas
# from visual inspection, this helps coalesce those that are in pairs
nhd_lines["geometry"] = pg.buffer(nhd_lines.geometry, 10, quadsegs=1)

# All NHD areas indicate a dam-related feature
nhd_areas = from_geofeather(extra_dir / "nhd_areas.feather")
nhd_areas = nhd_areas.loc[nhd_areas.geometry.notnull()].copy()
# buffer polygons slightly so we can dissolve touching ones together.
nhd_areas["geometry"] = pg.buffer(nhd_areas.geometry, 5)

# Dissolve adjacent nhd lines and waterbodies together
nhd_dams = nhd_lines.append(nhd_areas, ignore_index=True, sort=False)
nearby = sjoin(nhd_dams.geometry, nhd_dams.geometry, how="inner")
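The coarse buffers above (quadsegs=1 keeps the vertex count low, which is all the dissolve step needs) can be seen in isolation:

import pygeos as pg

line = pg.linestrings([[0, 0], [100, 0]])
coarse = pg.buffer(line, 10, quadsegs=1)  # 1 segment per quarter circle at the ends
fine = pg.buffer(line, 10)                # default quadsegs=8, smoother ends
assert pg.area(coarse) < pg.area(fine)    # the coarse buffer slightly under-covers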
Example #14
from analysis.constants import REGION_GROUPS, CRS_WKT

data_dir = Path("data")
networks_dir = data_dir / "networks"

df = deserialize_dfs([
    networks_dir / region / "dams/barriers_network.feather"
    for region in REGION_GROUPS
])

networkIDs = df.loc[df.kind == "dam"].upNetID.unique()

for region in list(REGION_GROUPS.keys()):
    print("\n----------------\n processing {}".format(region))

    networks = from_geofeather(networks_dir / region / "dams" /
                               "network.feather")

    # Extract only the networks associated with dams; the rest are small barriers
    networks = networks.loc[networks.networkID.isin(networkIDs),
                            ["networkID", "geometry"]]

    if len(networks) == 0:
        print("No small barriers in this region, skipping")
        continue

    print("Writing to GPKG")
    to_gpkg(
        networks.reset_index(drop=True),
        data_dir / "tiles" / "dam_networks{}".format(region),
        index=False,
        name="networks",