def export_duplicate_areas(dups, path):
    """Export duplicate barriers for QA.

    Parameters
    ----------
    dups : GeoDataFrame
        contains "geometry", "dup_group" (group identifier), "dup_tolerance",
        "id", and "SARPID" columns
    path : str or Path
        output path
    """
    print("Exporting duplicate areas")

    dups = dups.copy()
    dups["geometry"] = pg.buffer(dups.geometry.values.data, dups.dup_tolerance)
    dissolved = dissolve(dups[["geometry", "dup_group"]], by="dup_group")
    groups = gp.GeoDataFrame(
        dups[["id", "SARPID", "dup_group"]]
        .groupby("dup_group")
        .agg({"SARPID": "unique", "id": "unique"})
        .join(dissolved.geometry, on="dup_group"),
        crs=dups.crs,
    )
    groups["id"] = groups.id.apply(lambda x: ", ".join([str(s) for s in x]))
    groups["SARPID"] = groups.SARPID.apply(lambda x: ", ".join([str(s) for s in x]))
    write_dataframe(groups, path)
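# Hypothetical usage (illustrative only, not part of the pipeline): "barriers" and
# "qa_dir" are assumed names; the input frame must already carry the "id", "SARPID",
# "dup_group", and "dup_tolerance" columns set by the upstream duplicate-marking step.
#
#   export_duplicate_areas(barriers.loc[barriers.duplicate], qa_dir / "duplicate_areas.fgb")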
def export_snap_dist_lines(df, original_locations, out_dir, prefix=""): """Creates lines from the original coordinate to the snapped coordinate to help QA/QC snapping operation. Creates geopackages in out_dir: - pre_snap_to_post_snap: line between snapped and unsnapped coordinate - pre_snap: unsnapped points - post_snap: snapped points Parameters ---------- df : DataFrame contains pygeos geometries in "geometry" column original_locations : DataFrame contains pygeos geometries in "geometry" column out_dir : Path prefix : str prefix to add to filename """ print("Exporting snap review datasets...") tmp = df.loc[ df.snapped, ["geometry", "Name", "SARPID", "snapped", "snap_dist", "snap_log" ]].join(original_locations.geometry.rename("orig_pt")) tmp["new_pt"] = tmp.geometry.copy() tmp["geometry"] = connect_points(tmp.new_pt.values.data, tmp.orig_pt.values.data) write_dataframe( tmp.drop(columns=["new_pt", "orig_pt"]).reset_index(drop=True), out_dir / f"{prefix}pre_snap_to_post_snap.fgb", ) write_dataframe( tmp.drop(columns=["geometry", "new_pt"]).rename(columns={ "orig_pt": "geometry" }).reset_index(drop=True), out_dir / f"{prefix}pre_snap.fgb", ) write_dataframe( tmp.drop(columns=["geometry", "orig_pt"]).rename(columns={ "new_pt": "geometry" }).reset_index(drop=True), out_dir / f"{prefix}post_snap.fgb", )
outer_huc4["km2"] = pg.area(outer_huc4.geometry.values.data) / 1e6 # calculate geometric difference, explode, and keep non-slivers outer_huc4["geometry"] = pg.difference(outer_huc4.geometry.values.data, state_merged) outer_huc4 = explode(outer_huc4) outer_huc4["clip_km2"] = pg.area(outer_huc4.geometry.values.data) / 1e6 outer_huc4["percent"] = 100 * outer_huc4.clip_km2 / outer_huc4.km2 keep_huc4 = outer_huc4.loc[outer_huc4.clip_km2 >= 100].HUC4.unique() outer_huc4 = outer_huc4.loc[outer_huc4.HUC4.isin(keep_huc4) & (outer_huc4.clip_km2 >= 2.5)].copy() outer_huc4 = dissolve(outer_huc4, by="HUC4", agg={ "HUC2": "first" }).reset_index(drop=True) outer_huc4.to_feather(out_dir / "outer_huc4.feather") write_dataframe(outer_huc4, out_dir / "outer_huc4.gpkg") ### Counties - within HUC4 bounds print("Processing counties") fips = sorted(state_df.STATEFIPS.unique()) county_df = (read_dataframe( county_filename, columns=["NAME", "GEOID", "STATEFP"], ).to_crs(CRS).rename(columns={ "NAME": "County", "GEOID": "COUNTYFIPS", "STATEFP": "STATEFIPS" })) # keep only those within the region HUC4 outer boundary
### Add lat / lon (must be done after snapping!)
print("Adding lat / lon fields")
geo = df[["geometry"]].to_crs(GEO_CRS)
geo["lat"] = pg.get_y(geo.geometry.values.data).astype("float32")
geo["lon"] = pg.get_x(geo.geometry.values.data).astype("float32")
df = df.join(geo[["lat", "lon"]])

### All done processing!
print("\n--------------\n")
df = df.reset_index(drop=True)

print("Serializing {:,} dams to master file".format(len(df)))
df.to_feather(master_dir / "dams.feather")
write_dataframe(df, qa_dir / "dams.fgb")

# Extract out only the snapped ones
df = df.loc[df.snapped & (~(df.duplicate | df.dropped | df.excluded))].reset_index(
    drop=True
)
df.lineID = df.lineID.astype("uint32")
df.NHDPlusID = df.NHDPlusID.astype("uint64")

print("Serializing {:,} snapped dams".format(len(df)))
df[
    ["geometry", "id", "HUC2", "lineID", "NHDPlusID", "loop", "intermittent"]
].to_feather(snapped_dir / "dams.feather")
write_dataframe(df, qa_dir / "snapped_dams.fgb")
        regional_connectors_df,
    ]
):
    tree = pg.STRtree(df.geometry.values.data)
    ix = tree.query(bnd, predicate="intersects")
    df = df.iloc[ix].copy()
    df["value"] = i + 1

    if i == 0:
        merged = df
    else:
        merged = merged.append(df, ignore_index=True, sort=False)

df = merged

if DEBUG:
    write_dataframe(df, "/tmp/naturescape.gpkg", driver="GPKG")

### Create cores raster
# cores and important areas don't overlap; just rasterize them
print("Rasterizing cores and other important areas...")

cores = np.ones(shape=data.shape, dtype="uint8") * 255
# set all areas inside the mask as 0
cores = np.where(mask == 1, 0, 255)

for value in [1, 2, 3]:
    print(f"Processing value {value}...")
    shapes = to_dict_all(df.loc[df.value == value].geometry.values.data)
    data = rasterize(
        shapes,
        data.shape,
        transform=transform,
# set the CRS; it is the same as EPSG:5070 but not recognized properly
df = df.set_crs(DATA_CRS)

# drop BOEM lease block groups
df = df.loc[df.Agg_Src != "USGS_PADUS2_0Marine_BOEM_Block_Dissolve"].drop(
    columns=["Agg_Src"]
)

tree = pg.STRtree(df.geometry.values.data)
ix = tree.query(bnd_df.geometry.values.data[0], predicate="intersects")
df = df.iloc[ix].copy()

print("making valid...")
df["geometry"] = pg.make_valid(df.geometry.values.data)

df = explode(df).reset_index()

# there are some geometry errors after the cleanup above; keep only polygons
df = df.loc[pg.get_type_id(df.geometry.values.data) == 3].copy()

print("Writing files")
df.to_feather(out_dir / "ownership.feather")
write_dataframe(df, data_dir / "boundaries/ownership.gpkg", driver="GPKG")

# Write for tiles
print("Writing GeoJSON for tiles")
write_dataframe(
    df[["geometry", "Own_Type", "GAP_Sts"]].to_crs(GEO_CRS),
    tile_dir / "ownership.geojson",
    driver="GeoJSONSeq",
)
{"break_geometry": breaks.take(left)}, index=df.index.take(right) ) grouped = pairs.groupby(level=0).break_geometry.apply( lambda g: pg.multipolygons(g.values.data) ) df.loc[grouped.index, "geometry"] = pg.difference( df.loc[grouped.index].geometry.values.data, grouped.values ) df = explode(df).reset_index(drop=True) # make sure all polygons are valid ix = ~pg.is_valid(df.geometry.values.data) if ix.sum(): print(f"Repairing {ix.sum()} invalid waterbodies") df.loc[ix, "geometry"] = pg.make_valid(df.loc[ix].geometry.values.data) df = explode(explode(df)) df = df.loc[pg.get_type_id(df.geometry.values.data) == 3].reset_index() # assign a new unique wbID df["wbID"] = df.index.values.astype("uint32") + 1 + int(huc2) * 1000000 df["km2"] = pg.area(df.geometry.values.data) / 1e6 df.to_feather(huc2_dir / "waterbodies.feather") write_dataframe(df, huc2_dir / "waterbodies.gpkg") print("--------------------") print(f"HUC2: {huc2} done in {time() - huc2_start:.0f}s\n\n") print(f"Done in {time() - start:.2f}s\n============================")
from pathlib import Path
import os
import warnings

from pyogrio import read_dataframe, write_dataframe

from analysis.constants import DATA_CRS

warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

src_dir = Path("source_data/caribbean")
out_dir = Path("data/inputs/indicators/caribbean")
tile_dir = Path("data/for_tiles")

if not out_dir.exists():
    os.makedirs(out_dir)

df = (
    read_dataframe(
        src_dir / "Watershed_Ranking_PR.shp", columns=["Metric_Ran", "HUC_10"]
    )
    .rename(columns={"Metric_Ran": "carrank", "HUC_10": "HUC10"})
    .to_crs(DATA_CRS)
)

df.to_feather(out_dir / "caribbean.feather")

# for tiles
write_dataframe(
    df[["geometry", "carrank"]],
    tile_dir / "caribbean.geojson",
    driver="GeoJSONSeq",
)
nhd_dir = data_dir / "nhd/raw"
out_dir = Path("/tmp/sarp")

huc2_df = pd.read_feather(data_dir / "boundaries/huc2.feather", columns=["HUC2"])
huc2s = huc2_df.HUC2.sort_values().values

huc2s = [
    # "02",
    # "03",
    # "05",
    # "06",
    # "07",
    # "08",
    # "09",
    # "10",
    # "11",
    # "12",
    # "13",
    "14",
    "15",
    "16",
    "17",
    # "21",
]

for huc2 in huc2s:
    print(f"Exporting {huc2}...")
    flowlines = gp.read_feather(nhd_dir / huc2 / "flowlines.feather")
    write_dataframe(flowlines, out_dir / f"region{huc2}_raw_flowlines.shp")
df["joinID"] = (df.index * 1e6).astype("uint32") df["kind"] = "crossing" merged = barriers[["kind", "geometry"]].append(df[["joinID", "kind", "geometry"]], sort=False, ignore_index=True) merged = mark_duplicates(merged, tolerance=DUPLICATE_TOLERANCE) dup_groups = merged.loc[(merged.dup_count > 1) & (merged.kind == "barrier")].dup_group.unique() remove_ids = merged.loc[merged.dup_group.isin(dup_groups) & (merged.kind == "crossing")].joinID print( f"{len(remove_ids):,} crossings appear to be duplicates of existing barriers" ) df = df.loc[~df.joinID.isin(remove_ids)].drop(columns=["joinID", "kind"]) print(f"Now have {len(df):,} road crossings") # make sure that id is unique of small barriers df["id"] = (barriers.id.max() + 100000 + df.index.astype("uint")).astype("uint") df = df.reset_index(drop=True) df.to_feather(out_dir / "road_crossings.feather") write_dataframe(df, qa_dir / "road_crossings.fgb") print("Done in {:.2f}".format(time() - start))
        columns=[
            "lineID",
            "geometry",
            "intermittent",
            "altered",
            "sizeclass",
            "StreamOrde",
        ],
    )
    .set_index("lineID")
    .rename(columns={"StreamOrde": "streamorder"})
)

flowlines = flowlines.join(segments)

# aggregate to multilinestrings by combinations of networkID, altered, intermittent
networks = (
    aggregate_lines(flowlines, by=["networkID", "altered", "intermittent"])
    .set_index("networkID")
    .join(stats, how="inner")
    .reset_index()
    .sort_values(by="networkID")
)

# Set plotting symbol
networks["symbol"] = "normal"
networks.loc[networks.altered, "symbol"] = "altered"
# currently overrides altered since both come from NHD (mutually exclusive in source data)
networks.loc[networks.intermittent, "symbol"] = "intermittent"
networks.loc[
    networks.intermittent & networks.altered, "symbol"
] = "altered_intermittent"

print("Serializing dissolved networks...")
write_dataframe(networks, out_dir / f"region{huc2}_{barrier_type}_networks.{ext}")
### Join to states
states = gp.read_feather(
    boundaries_dir / "states.feather", columns=["id", "geometry"]
).rename(columns={"id": "state"})
states = states.loc[states.state.isin(STATES.keys())].copy()

print("Joining to states...")
tree = pg.STRtree(drains.geometry.values.data)
left, right = tree.query_bulk(states.geometry.values.data, predicate="intersects")
tmp = (
    pd.DataFrame(
        {"state": states.state.values.take(left)},
        index=drains.index.values.take(right),
    )
    .groupby(level=0)
    .first()
)

# drop any without states (e.g., outside region states)
drains = drains.join(tmp, how="inner")

# cleanup datatypes
for col in ["wb_km2", "TotDASqKm"]:
    drains[col] = drains[col].astype("float32")

print("Saving to shapefile")
write_dataframe(drains, out_dir / "unclaimed_drain_points.shp")
# exclude altered flowlines at low zooms
subset = subset.loc[subset.mapcode < 2].copy()

if simplification:
    subset["geometry"] = pg.simplify(subset.geometry.values.data, simplification)

json_filename = tmp_dir / f"region{huc2}_{minzoom}_{maxzoom}_flowlines.json"
mbtiles_filename = tmp_dir / f"region{huc2}_{minzoom}_{maxzoom}_flowlines.mbtiles"
mbtiles_files.append(mbtiles_filename)

write_dataframe(
    subset.to_crs(GEO_CRS),
    json_filename,
    driver="GeoJSONSeq",
)

del subset

ret = subprocess.run(
    tippecanoe_args
    + ["-Z", str(minzoom), "-z", str(maxzoom)]
    + ["-o", f"{str(mbtiles_filename)}", str(json_filename)]
    + col_types
)
ret.check_returncode()

# remove JSON file
json_filename.unlink()
print("----- {} ------".format(huc2)) huc2_dir = out_dir / huc2 if not huc2_dir.exists(): os.makedirs(huc2_dir) huc4s = units[huc2] if len(huc4s) > MAX_HUC4s: # split into smaller groups to keep output files smaller for counter, i in enumerate(range(0, len(huc4s), MAX_HUC4s)): print(f"Processing subgroup {huc2}_{counter}") df = process_huc4s(src_dir, huc4s[i:i + MAX_HUC4s]) print("serializing {:,} catchments".format(len(df))) df[["NHDPlusID", "geometry" ]].to_feather(huc2_dir / f"catchments_{counter}.feather") write_dataframe(df, tmp_dir / f"region{huc2}_catchments_{counter}.shp") else: df = process_huc4s(src_dir, huc4s) print("serializing {:,} catchments".format(len(df))) df[["NHDPlusID", "geometry"]].to_feather(huc2_dir / f"catchments.feather") write_dataframe(df, tmp_dir / f"region{huc2}_catchments.shp") print("Done in {:.2f}s\n============================".format(time() - start))
inputs.sort()

inputs = pd.DataFrame({"inputs": inputs})
inputs.reset_index().rename(columns={"index": "value", "inputs": "id"}).to_json(
    json_dir / "input_area_values.json", orient="records"
)

df = df.join(
    inputs.reset_index().rename(columns={"index": "value"}).set_index("inputs"),
    on="inputs",
)

write_dataframe(df, bnd_dir / "input_areas.fgb")
df.to_feather(out_dir / "boundaries/input_areas.feather")

# Rasterize to match the blueprint
df = pd.DataFrame(df[["geometry", "value"]].copy())
df.geometry = df.geometry.values.data

# convert to pairs of (GeoJSON, value)
shapes = df.apply(lambda row: (to_dict(row.geometry), row.value), axis=1)

print("Rasterizing inputs...")
with rasterio.open(blueprint_filename) as src:
    data = rasterize(
        shapes.values,
        src.shape,
        transform=src.transform,
mx_df = mx_df.dissolve(by="NUM_EDO")
mx_df["country"] = "MX"

admin_df = (
    us_df[["geometry", "admin1", "admin1_name", "country"]]
    .append(
        ca_df[["geometry", "admin1", "admin1_name", "country"]],
        ignore_index=True,
        sort=False,
    )
    .append(
        mx_df[["geometry", "admin1", "admin1_name", "country"]],
        ignore_index=True,
        sort=False,
    )
)
admin_df["id"] = admin_df.index.astype("uint8") + 1

write_dataframe(admin_df, boundaries_dir / "na_admin1.json")
admin_df.to_feather(boundaries_dir / "na_admin1.feather")

### Process species ranges
print("Processing species ranges...")

# create lookup of species scientific name to code
sci_name_lut = {value["SNAME"]: key for key, value in SPECIES.items()}

range_df = read_dataframe("data/boundaries/src/species_ranges.shp")

# split hoary bat into Hawaiian vs mainland
laci = range_df.loc[range_df.SCI_NAME == "Lasiurus cinereus"]
Hawaii = pg.box(*HAWAII_BOUNDS)
haba = laci.copy()
# add new geometry for haba
haba.geometry = pg.intersection(laci.geometry.values.data, Hawaii)
# make valid
ix = ~pg.is_valid(waterbodies.geometry.values.data)
if ix.sum():
    print(f"Repairing {ix.sum():,} invalid waterbodies")
    waterbodies.loc[ix, "geometry"] = pg.make_valid(
        waterbodies.loc[ix].geometry.values.data
    )

# note: nwi_code, nwi_type are discarded here since they aren't used later
print("Dissolving adjacent waterbodies")
waterbodies = dissolve(waterbodies, by=["altered"])
waterbodies = explode(waterbodies).reset_index(drop=True)
waterbodies["km2"] = pg.area(waterbodies.geometry.values.data) / 1e6

waterbodies.to_feather(huc2_dir / "waterbodies.feather")
write_dataframe(waterbodies, huc2_dir / "waterbodies.gpkg")

### Process riverine
print(f"Extracted {len(rivers):,} NWI altered river polygons")
left, right = tree.query_bulk(rivers.geometry.values.data, predicate="intersects")
rivers = rivers.iloc[np.unique(left)].reset_index(drop=True)
print(f"Kept {len(rivers):,} that intersect flowlines")

rivers = explode(rivers)

# make valid
ix = ~pg.is_valid(rivers.geometry.values.data)
if ix.sum():
    print(f"Repairing {ix.sum():,} invalid rivers")
    rivers.loc[ix, "geometry"] = pg.make_valid(
        rivers.loc[ix].geometry.values.data
    )
layer="Waterbody", force_2d=True, columns=[], ).rename(columns={ "NAME": "name" }).to_crs(CRS)) print("Reading flowlines...") flowlines = gp.read_feather(nhd_dir / huc2 / "flowlines.feather", columns=[]) tree = pg.STRtree(flowlines.geometry.values.data) print(f"Extracted {len(df):,} SC waterbodies") left, right = tree.query_bulk(df.geometry.values.data, predicate="intersects") df = df.iloc[np.unique(left)].reset_index(drop=True) print(f"Kept {len(df):,} that intersect flowlines") df = explode(df) # make valid ix = ~pg.is_valid(df.geometry.values.data) if ix.sum(): print(f"Repairing {ix.sum():,} invalid waterbodies") df.loc[ix, "geometry"] = pg.make_valid(df.loc[ix].geometry.values.data) print("Dissolving adjacent waterbodies...") df["tmp"] = 1 df = dissolve(df, by="tmp").drop(columns=["tmp"]) df = explode(df).reset_index(drop=True) df.to_feather(src_dir / "sc_waterbodies.feather") write_dataframe(df, src_dir / "sc_waterbodies.gpkg")
    state_filename,
    columns=["STUSPS", "STATEFP", "NAME"],
    )
    .to_crs(CRS)
    .rename(columns={"STUSPS": "id", "NAME": "State", "STATEFP": "STATEFIPS"})
)

state_df.geometry = pg.make_valid(state_df.geometry.values.data)

# save all states for spatial joins
state_df.to_feather(out_dir / "states.feather")

state_df = state_df.loc[state_df.id.isin(STATES.keys())].copy()
state_df.to_feather(out_dir / "region_states.feather")
write_dataframe(
    state_df[["State", "geometry"]].rename(columns={"State": "id"}),
    out_dir / "region_states.gpkg",
)

# dissolve to create outer state boundary for total analysis area and regions
bnd_df = gp.GeoDataFrame(
    [
        {
            "geometry": pg.union_all(state_df.geometry.values.data),
            "id": "total",
        },
    ]
    + [
        {
            "geometry": pg.union_all(
                state_df.loc[
                    state_df.id.isin(REGION_STATES[region])
                ].geometry.values.data
            ),
            "id": region,
huc4 = sorted(huc4_df.HUC4.unique())

sarp_huc4_df = gp.read_feather(out_dir / "sarp_huc4.feather")
sarp_huc4 = sorted(sarp_huc4_df.HUC4.unique())

### Extract HUC6 within HUC4
print("Processing HUC6...")
huc6_df = (
    read_dataframe(
        wbd_gdb,
        layer="WBDHU6",
        columns=["huc6", "name"],
        where=f"SUBSTR(huc6, 0, 4) IN {tuple(huc4)}",
    )
    .rename(columns={"huc6": "HUC6"})
    .to_crs(CRS)
)

huc6_df.to_feather(out_dir / "huc6.feather")
write_dataframe(huc6_df.rename(columns={"HUC6": "id"}), out_dir / "huc6.gpkg")

huc6_df["HUC4"] = huc6_df.HUC6.str[:4]
sarp_huc6_df = huc6_df.loc[huc6_df.HUC4.isin(sarp_huc4)].drop(columns=["HUC4"])
write_dataframe(
    sarp_huc6_df.rename(columns={"HUC6": "id"}), out_dir / "sarp_huc6.gpkg"
)
sarp_huc6_df.to_feather(out_dir / "sarp_huc6.feather")

### Extract HUC8 within HUC4
print("Processing HUC8...")
huc8_df = (
    read_dataframe(
        wbd_gdb,
        layer="WBDHU8",
        columns=["huc8", "name"],
        where=f"SUBSTR(huc8, 0, 4) IN {tuple(huc4)}",
    )
    .rename(columns={
        }
    )
).join(
    snapped_df[["geometry", "ManualReview"]].rename(
        columns={
            "geometry": "reviewed_pt",
            "ManualReview": "reviewed_ManualReview",
        }
    )
)

# mark any as snapped / not
df.loc[df.snapped, "cur_pt_src"] = "autosnapped pt"
df.loc[~df.snapped, "cur_pt_src"] = "not snapped"

# any that were manually reviewed in source, use that original point and manual review
# (note: these may be overridden again by manually reviewed dams in snapping dataset)
ix = df.src_ManualReview.notnull()
df.loc[ix, "geometry"] = df.loc[ix].src_pt
df.loc[ix, "ManualReview"] = df.loc[ix].src_ManualReview
df.loc[ix, "cur_pt_src"] = "raw inventory pt"

# any that were manually reviewed in snapping dataset, use that coordinate and manual review
ix = df.reviewed_ManualReview.notnull()
df.loc[ix, "geometry"] = df.loc[ix].reviewed_pt
df.loc[ix, "ManualReview"] = df.loc[ix].reviewed_ManualReview
df.loc[ix, "cur_pt_src"] = "manually reviewed pt"

df = df.drop(
    columns=["src_pt", "reviewed_pt", "src_ManualReview", "reviewed_ManualReview"]
).reset_index()

df.to_feather("data/barriers/qa/snapping_dataset_2022.feather")
write_dataframe(df, out_dir / "snapping_dataset_2022.shp")
if state == "tx": state_df = state_df.drop(columns=["lcon"]) # Reclassify chatrank to match blueprint integration rules. # First shift other values up one ix = state_df.chatrank >= 2 state_df.loc[ix, "chatrank"] = state_df.chatrank + 1 # for any that were previously a chatrank of 2 but with higher values of aquatic or # terrestrial, map them back to 2 ix = ((state_df.chatrank == 3) & state_df.arank.isin([2, 3]) & state_df.trank.isin([2, 3])) state_df.loc[ix, "chatrank"] = 2 # for proofing write_dataframe(state_df, gis_dir / f"{state}chat.gpkg", driver="GPKG") # for tiles write_dataframe(state_df, tile_dir / f"{state}chat.geojson", driver="GeoJSONSeq") # convert attributes to categoricals for analysis for col in chat_fields: if col not in state_df.columns: continue if state == "tx" and col == "chatrank": # IMPORTANT: value 2 is split into values 2/3 and others are shifted up by 1 categories = [0, 1, 2, 3, 4, 5, 6, 7] else:
barriers = read_feathers(
    [barriers_dir / f"{kind}s.feather" for kind in kinds],
    geo=True,
    new_fields={"kind": kinds},
)

for kind, init_id in zip(kinds, kind_ids):
    ix = barriers.kind == kind
    barriers.loc[ix, "barrierID"] = barriers.loc[ix].id + init_id

barriers.barrierID = barriers.barrierID.astype("uint64")

barriers.to_feather(out_dir / "all_barriers.feather")

if DEBUG:
    write_dataframe(barriers, out_dir / "all_barriers.fgb")

### Cut flowlines in each HUC2
for huc2 in huc2s:
    region_start = time()

    print(f"----- {huc2} ------")

    huc2_dir = out_dir / huc2
    if not huc2_dir.exists():
        os.makedirs(huc2_dir)

    # drop any barriers on loops in this region
    huc2_barriers = barriers.loc[
        (barriers.HUC2 == huc2) & (~barriers.loop)
    ].set_index("barrierID", drop=False)
columns=["id", "geometry"]) tree = pg.STRtree(df.geometry.values.data) left, right = tree.query_bulk(states.geometry.values.data, predicate="intersects") state_join = (pd.DataFrame({ "state": states.id.take(left), "drain": df.index.take(right) }).groupby("drain").first()) df = df.join(state_join) # only keep those in the region states df = df.loc[df.state.isin(STATES)].copy() write_dataframe(df.loc[~has_dam], out_dir / "estimated_dam_lines.fgb") write_dataframe(df.loc[~has_dam], tmp_dir / "estimated_dam_lines.shp") write_dataframe(df.loc[has_dam], out_dir / "estimated_dam_lines_with_dam.fgb") write_dataframe(df.loc[has_dam], tmp_dir / "estimated_dam_lines_with_dam.shp") df = (df.join(drains.geometry.rename("drain"), on="drainID").set_geometry("drain").drop(columns=["geometry"])) write_dataframe(df.loc[~has_dam], out_dir / "estimated_dams.fgb") write_dataframe(df.loc[~has_dam], tmp_dir / "estimated_dams.shp") write_dataframe(df.loc[has_dam], out_dir / "estimated_dams_with_dam.fgb") write_dataframe(df.loc[has_dam], tmp_dir / "estimated_dams_with_dam.shp") print(f"Total elapsed {time() - start:,.2f}s")
warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

data_dir = Path("data")

huc2s = sorted(
    pd.read_feather(data_dir / "boundaries/huc2.feather", columns=["HUC2"]).HUC2.values
)

src_dir = Path("data/nhd/raw")
out_dir = Path("data/nhd/merged")

if not out_dir.exists():
    os.makedirs(out_dir)

for group in ["points", "lines", "poly"]:
    merged = None
    for huc2 in huc2s:
        huc2_dir = src_dir / huc2
        filename = huc2_dir / f"nhd_{group}.feather"
        if filename.exists():
            df = gp.read_feather(filename)
            merged = append(merged, df)

    df = merged.reset_index(drop=True)

    df.to_feather(out_dir / f"nhd_{group}.feather")
    write_dataframe(df, out_dir / f"nhd_{group}.gpkg")
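# The append() helper used above is imported from the project's shared utilities;
# a minimal stand-in consistent with how it is called here (an assumption, not the
# project's actual implementation) might look like:
#
#   def append(target, df):
#       # start a new frame on the first iteration, otherwise concatenate
#       if target is None:
#           return df.copy()
#       return target.append(df, ignore_index=True, sort=False)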
print(df.groupby("loop").size()) ### Add lat / lon print("Adding lat / lon fields") geo = df[["geometry"]].to_crs(GEO_CRS) geo["lat"] = pg.get_y(geo.geometry.values.data).astype("float32") geo["lon"] = pg.get_x(geo.geometry.values.data).astype("float32") df = df.join(geo[["lat", "lon"]]) print("\n--------------\n") df = df.reset_index(drop=True) print("Serializing {:,} small barriers".format(len(df))) df.to_feather(master_dir / "small_barriers.feather") write_dataframe(df, qa_dir / "small_barriers.fgb") # Extract out only the snapped ones df = df.loc[df.snapped & (~(df.duplicate | df.dropped | df.excluded))].reset_index( drop=True ) df.lineID = df.lineID.astype("uint32") df.NHDPlusID = df.NHDPlusID.astype("uint64") print("Serializing {:,} snapped small barriers".format(len(df))) df[ ["geometry", "id", "HUC2", "lineID", "NHDPlusID", "loop", "intermittent"] ].to_feather(snapped_dir / "small_barriers.feather",) write_dataframe(df, qa_dir / "snapped_small_barriers.fgb")
)

### cleanup fields
df["SourceState"] = df.SARPID.str[:2]

for column in ("River", "NIDID", "Source", "SourceDBID", "Name", "OtherName"):
    df[column] = df[column].fillna("").str.strip()

for column in (
    "Construction",
    "Condition",
    "Purpose",
    "Recon",
    "PassageFacility",
    "BarrierStatus",
    "ManualReview",
):
    df[column] = df[column].fillna(0).astype("uint8")

for column in ("Year", "Height", "Length", "Feasibility"):
    df[column] = df[column].fillna(0).astype("uint16")

s = df.groupby("SARPID").size()
if s.max() > 1:
    raise ValueError(f"Multiple dams with same SARPID: {s[s > 1].index}")

df.to_feather(src_dir / "dams_outer_huc4.feather")
write_dataframe(df, src_dir / "dams_outer_huc4.gpkg")
        }
    )
)

flowlines = flowlines.join(segments).join(floodplains, on="NHDPlusID")
# convert length from meters to kilometers
flowlines["length"] = flowlines["length_m"] / 1000.0
for col in ["natfldpln", "fldkm2", "natfldkm2"]:
    flowlines[col] = flowlines[col].fillna(-1)
for col in ["interm", "altered"]:
    flowlines[col] = flowlines[col].astype("uint8")

# serialize raw segments
print("Serializing undissolved networks...")
write_dataframe(
    flowlines.reset_index(),
    out_dir / f"region{huc2}_{barrier_type}_segments.{ext}",
)

# aggregate to multilinestrings by combinations of networkID
print("Dissolving networks...")
networks = (
    merge_lines(flowlines[["networkID", "geometry"]], by=["networkID"])
    .set_index("networkID")
    .join(stats, how="inner")
    .reset_index()
)

# this currently takes a very long time for shapefiles on GDAL 3.4.x due to large multilinestrings,
# so write to GPKG and convert to shapefile using Docker GDAL 3.3.x
print("Serializing dissolved networks...")
write_dataframe(
    pd.read_feather(
        data_dir / "boundaries/huc4.feather", columns=["HUC2"]
    ).HUC2.unique()
)

### Merge NHD lines and areas that represent dams and dam-related features
print("Reading NHD points, lines, and areas, and merging...")
nhd_pts = read_feathers(
    [raw_dir / huc2 / "nhd_points.feather" for huc2 in huc2s],
    geo=True,
    new_fields={"HUC2": huc2s},
)
nhd_pts = nhd_pts.loc[nhd_pts.FType.isin([343])].copy()

# write original points for SARP
write_dataframe(nhd_pts, out_dir / "nhd_dam_pts_nhdpoint.fgb")

nhd_pts["source"] = "NHDPoint"

# create circular buffers to merge with others
nhd_pts["geometry"] = pg.buffer(nhd_pts.geometry.values.data, 5)

nhd_lines = read_feathers(
    [raw_dir / huc2 / "nhd_lines.feather" for huc2 in huc2s],
    geo=True,
    new_fields={"HUC2": huc2s},
)
nhd_lines = nhd_lines.loc[
    (nhd_lines.FType.isin([343, 369, 398])) & nhd_lines.geometry.notnull()
].reset_index(drop=True)
percent_overlap = calculate_percent_overlap(
    input_area_mask, [to_dict(geometry)], bounds=pg.total_bounds(geometry)
)
if percent_overlap < 50:
    drop_ids.append(id)

print(f"Dropping {len(drop_ids)} HUC12s that do not sufficiently overlap input areas")
huc12 = huc12.loc[~huc12.id.isin(drop_ids)].copy()

# extract geographic bounds
huc12_wgs84 = huc12.to_crs(GEO_CRS)
huc12 = huc12.join(huc12_wgs84.bounds)

# Save in EPSG:5070 for analysis
huc12.to_feather(analysis_dir / "huc12.feather")
write_dataframe(huc12, bnd_dir / "huc12.gpkg")

### Marine units
print("Reading marine blocks...")

atl = read_dataframe(
    src_dir / "summary_units/marine_blocks/Atlantic/ATL_BLKCLP.shp",
    columns=["PROT_NUMBE", "BLOCK_NUMB"],
)
gulf = read_dataframe(
    src_dir / "summary_units/marine_blocks/Gulf_of_Mexico/blk_clip.shp",
    columns=["PROT_NUMBE", "BLOCK_NUMB"],
)

marine = atl.append(gulf, ignore_index=True)