if not out_dir.exists():
    os.makedirs(out_dir)

start = time()

huc2s = sorted(
    pd.read_feather(
        data_dir / "boundaries/huc4.feather", columns=["HUC2"]
    ).HUC2.unique()
)


### Merge NHD lines and areas that represent dams and dam-related features
print("Reading NHD points, lines, and areas, and merging...")

nhd_pts = read_feathers(
    [raw_dir / huc2 / "nhd_points.feather" for huc2 in huc2s],
    geo=True,
    new_fields={"HUC2": huc2s},
)

# keep only dam-related point features (NHD FType 343: Dam / Weir)
nhd_pts = nhd_pts.loc[nhd_pts.FType.isin([343])].copy()

# write original points for SARP
write_dataframe(nhd_pts, out_dir / "nhd_dam_pts_nhdpoint.fgb")

nhd_pts["source"] = "NHDPoint"

# create circular buffers to merge with others
nhd_pts["geometry"] = pg.buffer(nhd_pts.geometry.values.data, 5)

nhd_lines = read_feathers(
    [raw_dir / huc2 / "nhd_lines.feather" for huc2 in huc2s],
# "14", # "15", # "16", # "17", # "21", # ] start = time() ### Aggregate barriers kinds = ["waterfall", "dam", "small_barrier"] kind_ids = [WATERFALLS_ID, DAMS_ID, SB_ID] barriers = read_feathers( [barriers_dir / f"{kind}s.feather" for kind in kinds], geo=True, new_fields={"kind": kinds}, ) for kind, init_id in zip(kinds, kind_ids): ix = barriers.kind == kind barriers.loc[ix, "barrierID"] = barriers.loc[ix].id + init_id barriers.barrierID = barriers.barrierID.astype("uint64") barriers.to_feather(out_dir / "all_barriers.feather") if DEBUG: write_dataframe(barriers, out_dir / "all_barriers.fgb") ### Cut flowlines in each HUC2 for huc2 in huc2s:
dups.loc[ix, "dup_tolerance"] = DUPLICATE_TOLERANCE["likely duplicate"]

export_duplicate_areas(dups, qa_dir / "dams_duplicate_areas.fgb")


### Join to line atts
flowlines = (
    read_feathers(
        [
            nhd_dir / "clean" / huc2 / "flowlines.feather"
            for huc2 in df.HUC2.unique()
            if huc2
        ],
        columns=[
            "lineID",
            "NHDPlusID",
            "GNIS_Name",
            "sizeclass",
            "StreamOrde",
            "FCode",
            "loop",
        ],
    )
    .rename(columns={"StreamOrde": "StreamOrder"})
    .set_index("lineID")
)

df = df.join(flowlines, on="lineID")
df.StreamOrder = df.StreamOrder.fillna(-1).astype("int8")

# Add name from snapped flowline if not already present
    )
    .set_index("NHDPlusID")
    .rename(columns={"nat_floodplain_km2": "natfldkm2", "floodplain_km2": "fldkm2"})
)

floodplains["natfldpln"] = (100 * floodplains.natfldkm2 / floodplains.fldkm2).astype(
    "float32"
)

# HUC2s that specifically overlap SECAS states (SARP states + WV)

for group in huc2_groups:
    segments = (
        read_feathers(
            [src_dir / "clean" / huc2 / "network_segments.feather" for huc2 in group],
            columns=["lineID", barrier_type],
        )
        .rename(columns={barrier_type: "networkID"})
        .set_index("lineID")
    )

    stats = (
        read_feathers(
            [
                src_dir / "clean" / huc2 / f"{barrier_type}_network_stats.feather"
                for huc2 in group
            ],
            columns=[
                "networkID",
                "total_miles",
                "perennial_miles",
                "intermittent_miles",
# "12", # "13", # "14", # "15", # "16", # "17", # "21", # ] print("Finding connected HUC2s") joins = read_feathers( [src_dir / huc2 / "flowline_joins.feather" for huc2 in huc2s], columns=[ "upstream", "downstream", "type", "marine", "upstream_id", "downstream_id", ], new_fields={"HUC2": huc2s}, ) groups, joins = connect_huc2s(joins) print(f"Found {len(groups)} HUC2 groups in {time() - start:,.2f}s") # persist table of connected HUC2s connected_huc2s = pd.DataFrame({"HUC2": groups}).explode(column="HUC2") connected_huc2s["group"] = connected_huc2s.index.astype("uint8") connected_huc2s.reset_index(drop=True).to_feather( data_dir / "networks/connected_huc2s.feather")
start = time()

huc2_df = pd.read_feather(data_dir / "boundaries/huc2.feather", columns=["HUC2"])
huc2s = huc2_df.HUC2.sort_values().values

drains = (
    read_feathers(
        [
            data_dir / "nhd/clean" / huc2 / "waterbody_drain_points.feather"
            for huc2 in huc2s
        ],
        new_fields={"HUC2": huc2s},
        geo=True,
    )
    .drop(columns=["snap_to_junction", "snap_dist"])
    .rename(
        columns={
            "MaxElevSmo": "maxelev",
            "MinElevSmo": "minelev",
            "Slope": "slope",
            "StreamOrde": "fsorder",
            "km2": "wb_km2",
            "flowlineLength": "flength",
        }
    )
    .set_index("drainID")
)

# flag non-perennial flowlines (NHD FCode 46003: intermittent, 46007: ephemeral)
drains["intermittent"] = drains.lineFCode.isin([46003, 46007])

merged = None
for huc2 in huc2s:
    huc2_start = time()
    print(f"Extracting dams from waterbodies in {huc2}")
def get_network_results(df, network_type, barrier_type=None, rank=True):
    """Read network results, calculate derived metric classes, and calculate tiers.

    Only barriers that are not unranked (invasive spp. barriers) have tiers calculated.

    Parameters
    ----------
    df : DataFrame
        barriers data; must contain State and unranked
    network_type : {"dams", "small_barriers"}
        network scenario
    barrier_type : {"dams", "small_barriers", "waterfalls"}, optional (default: None)
        if present, used to filter barrier kind from network results
    rank : bool, optional (default: True)
        if True, results will include tiers for the Southeast and state level

    Returns
    -------
    DataFrame
        Contains network metrics and tiers
    """
    barrier_type = barrier_type or network_type

    huc2s = [huc2 for huc2 in df.HUC2.unique() if huc2]

    networks = (
        read_feathers(
            [
                Path("data/networks/clean") / huc2 / f"{network_type}_network.feather"
                for huc2 in huc2s
            ],
            columns=NETWORK_COLUMNS,
        )
        .rename(columns=NETWORK_COLUMN_NAMES)
        .set_index("id")
    )

    # FIXME: temporary fix
    networks.PercentPerennialUnaltered = networks.PercentPerennialUnaltered.fillna(0)

    # select barrier type
    networks = networks.loc[networks.kind == barrier_type[:-1]].drop(columns=["kind"])

    # Convert dtypes to allow missing data when joined to barriers later
    # NOTE: upNetID or downNetID may be 0 if there aren't networks on that side, but
    # we set to int dtype instead of uint to allow -1 for missing data later
    for col in ["upNetID", "downNetID"]:
        networks[col] = networks[col].astype("int")

    for col in ["NumBarriersDownstream"]:
        networks[col] = networks[col].astype("int16")

    for column in ("Landcover", "SizeClasses", "FlowsToOcean"):
        networks[column] = networks[column].astype("int8")

    # sanity check to make sure there are no duplicate networks
    if networks.groupby(level=0).size().max() > 1:
        raise Exception(
            f"ERROR: multiple networks found for some {barrier_type} barriers"
        )

    networks = networks.join(df[df.columns.intersection(["unranked", "State"])])

    # update data types and calculate total fields
    # calculate size classes GAINED instead of total;
    # doesn't apply to those that don't have upstream networks
    networks.loc[networks.SizeClasses > 0, "SizeClasses"] -= 1

    # Calculate miles GAINED if the barrier is removed:
    # this is the lesser of the upstream or free downstream lengths.
    # Non-free miles downstream (downstream waterbodies) are omitted from this analysis.
    networks["GainMiles"] = networks[["TotalUpstreamMiles", "FreeDownstreamMiles"]].min(
        axis=1
    )
    networks["PerennialGainMiles"] = networks[
        ["PerennialUpstreamMiles", "FreePerennialDownstreamMiles"]
    ].min(axis=1)

    # TotalNetworkMiles is the sum of upstream and free downstream miles
    networks["TotalNetworkMiles"] = networks[
        ["TotalUpstreamMiles", "FreeDownstreamMiles"]
    ].sum(axis=1)
    networks["TotalPerennialNetworkMiles"] = networks[
        ["PerennialUpstreamMiles", "FreePerennialDownstreamMiles"]
    ].sum(axis=1)

    # Round floating point columns to 3 decimals
    for column in [c for c in networks.columns if c.endswith("Miles")]:
        networks[column] = networks[column].round(3).fillna(-1).astype("float32")

    # Calculate network metric classes
    networks["GainMilesClass"] = classify_gainmiles(networks.GainMiles)
    networks["PerennialGainMilesClass"] = classify_gainmiles(
        networks.PerennialGainMiles
    )

    if not rank:
        return networks.drop(columns=["unranked", "State"], errors="ignore")

    # only calculate ranks / tiers for ranked barriers
    # (exclude unranked invasive spp. barriers)
    to_rank = networks.loc[~networks.unranked]

    ### Calculate regional tiers for the SARP (Southeast) region
    # NOTE: this is limited to the SARP region; other regions are not ranked at the
    # regional level
    # TODO: consider deprecating this
    ix = to_rank.State.isin(SARP_STATE_NAMES)
    sarp_tiers = calculate_tiers(to_rank.loc[ix])
    sarp_tiers = sarp_tiers.rename(
        columns={col: f"SE_{col}" for col in sarp_tiers.columns}
    )

    ### Calculate state tiers for each of total and perennial
    state_tiers = None
    for state in to_rank.State.unique():
        state_tiers = append(
            state_tiers,
            calculate_tiers(to_rank.loc[to_rank.State == state]).reset_index(),
        )

    state_tiers = state_tiers.set_index("id").rename(
        columns={col: f"State_{col}" for col in state_tiers.columns}
    )

    networks = networks.join(sarp_tiers).join(state_tiers)
    for col in [col for col in networks.columns if col.endswith("_tier")]:
        networks[col] = networks[col].fillna(-1).astype("int8")

    return networks.drop(columns=["unranked", "State"])
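# Example usage (illustrative sketch only; the barriers file path and join step below
# are assumptions based on the parameters documented in the docstring, not a fixed API):
#
#   dams = pd.read_feather("data/barriers/master/dams.feather").set_index("id")
#   # df must include HUC2, State, and unranked columns
#   networks = get_network_results(dams, network_type="dams", rank=True)
#   dams = dams.join(networks)
#   # barriers without networks will have missing metrics after the join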
if not out_dir.exists():
    os.makedirs(out_dir)

barrier_type = "small_barriers"
ext = "fgb"

# groups_df = pd.read_feather(src_dir / "connected_huc2s.feather")
# for group in groups_df.groupby("group").HUC2.apply(set).values:
for group in [{"02"}]:
    segments = (
        read_feathers(
            [src_dir / "clean" / huc2 / "network_segments.feather" for huc2 in group],
            columns=["lineID", barrier_type],
        )
        .rename(columns={barrier_type: "networkID"})
        .set_index("lineID")
    )

    # FIXME: remove, debug only
    s = segments.groupby(level=0).size()
    print("dups", s[s > 1])

    stats = read_feathers(
        [
            src_dir / "clean" / huc2 / f"{barrier_type}_network_stats.feather"
            for huc2 in group
        ]
    ).set_index("networkID")

    # use smaller data types for smaller output files
start = time()

groups_df = pd.read_feather(src_dir / "connected_huc2s.feather")

region_tiles = []
for group in groups_df.groupby("group").HUC2.apply(set).values:
    group = sorted(group)
    print(f"\n\n===========================\nProcessing group {group}")

    segments = read_feathers(
        [src_dir / "clean" / huc2 / "network_segments.feather" for huc2 in group],
        columns=[
            "lineID",
            "dams",
            "small_barriers",
        ],
    ).set_index("lineID")

    # create output files by HUC2 based on where the segments occur
    for huc2 in group:
        print(f"----------------------\nProcessing {huc2}")

        huc2_start = time()

        huc2_mbtiles_filename = intermediate_dir / f"region{huc2}_networks.mbtiles"
        region_tiles.append(huc2_mbtiles_filename)
data_dir = Path("data") out_dir = Path("/tmp/sarp") if not out_dir.exists(): os.makedirs(out_dir) huc4_df = pd.read_feather( data_dir / "boundaries/huc4.feather", columns=["HUC2"], ) huc2s = huc4_df.HUC2.unique() df = read_feathers( [ data_dir / "nhd/raw" / huc2 / "nhd_altered_rivers.feather" for huc2 in huc2s ], geo=True, new_fields={"HUC2": huc2s}, ) write_dataframe(df, out_dir / "nhd_altered_rivers.shp") df = read_feathers( [data_dir / "nwi/raw" / huc2 / "altered_rivers.feather" for huc2 in huc2s], geo=True, new_fields={"HUC2": huc2s}, ) write_dataframe(df, out_dir / "nwi_altered_rivers.shp")