def process_huc4s(src_dir, huc4s):
    merged = None

    for HUC4 in huc4s:
        print("\n\n------------------- Reading {} -------------------".format(HUC4))

        gdb = src_dir / HUC4 / "NHDPLUS_H_{HUC4}_HU4_GDB.gdb".format(HUC4=HUC4)

        df = read_dataframe(gdb, layer="NHDPlusCatchment", columns=["NHDPlusID"])
        print(f"Read {len(df):,} catchments")

        df = df.dropna(subset=["NHDPlusID"])
        print("Kept {:,} catchments after dropping those without NHDPlusID".format(len(df)))

        df.NHDPlusID = df.NHDPlusID.astype("uint64")
        df = df.to_crs(CRS)

        merged = append(merged, df)

    df = merged

    # add unique ID
    df["catchID"] = df.index.astype("uint32") + 1

    # add string version of NHDPlusID
    df["NHDIDSTR"] = df.NHDPlusID.astype("str")

    return df
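# A hedged sketch of the append() helper used above (its implementation is not
# shown in this excerpt): concatenate onto an accumulator that starts as None,
# which is the pattern the loop relies on.
import pandas as pd


def append(accumulator, df):
    # first call: nothing accumulated yet
    if accumulator is None:
        return df
    # later calls: stack the new frame below the accumulator with a fresh index
    return pd.concat([accumulator, df], ignore_index=True)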
def extract_altered_rivers(gdb_path, target_crs):
    """Extract NHDArea records that likely indicate altered riverways.

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs : GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.

    Returns
    -------
    GeoDataFrame
    """
    df = read_dataframe(
        gdb_path,
        layer="NHDArea",
        columns=COLS,
        force_2d=True,
        where=f"FType in {tuple(FTYPES)}",
    )

    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df["id"] = df.index.values.astype("uint32") + 1

    if len(df):
        df = df.to_crs(target_crs)
        df.geometry = make_valid(df.geometry.values.data)

    return df
def extract_marine(gdb_path, target_crs):
    """Extract areas from NHDWaterbody and NHDArea that are marine connected.

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs : GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.

    Returns
    -------
    GeoDataFrame
    """
    print("Reading marine areas...")
    area = read_dataframe(
        gdb_path,
        layer="NHDArea",
        columns=COLS,
        force_2d=True,
        where=f"FType in {tuple(AREA_FTYPES)}",
    )

    wb = read_dataframe(
        gdb_path,
        layer="NHDWaterbody",
        columns=COLS,
        force_2d=True,
        # use a join-based expression because tuple formatting breaks when the
        # list has a single element
        where=f"FType in ({','.join([str(t) for t in WB_FTYPES])})",
    )

    df = area.append(wb)

    if len(df):
        df = explode(df.to_crs(target_crs))

    return df
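# A minimal illustration of why the join-based where clause above is used:
# Python renders a single-element tuple with a trailing comma, which is not
# valid SQL, while the join form is. (ftypes here is a hypothetical example.)
ftypes = [312]
print(f"FType in {tuple(ftypes)}")                        # FType in (312,) -> invalid SQL
print(f"FType in ({','.join(str(t) for t in ftypes)})")   # FType in (312)  -> valid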
def __call__(self, part):
    path, row_offset, batch_size = part

    import pyogrio

    df = pyogrio.read_dataframe(
        path,
        layer=self.layer,
        columns=self.columns,
        read_geometry=self.read_geometry,
        skip_features=row_offset,
        max_features=batch_size,
    )
    df.index = RangeIndex(row_offset, row_offset + batch_size)
    return df
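# A minimal sketch of the paging pattern used above: pyogrio's skip_features /
# max_features read a contiguous slice of rows, so successive calls with
# increasing offsets partition the layer. ("example.gpkg" and "roads" are
# hypothetical names.)
import pyogrio

first_100 = pyogrio.read_dataframe("example.gpkg", layer="roads", max_features=100)
next_100 = pyogrio.read_dataframe(
    "example.gpkg", layer="roads", skip_features=100, max_features=100
)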
def extract_waterbodies(gdb_path, target_crs):
    """Extract waterbodies from NHDPlusHR data product that are not one of the
    excluded types (e.g., estuary, playa, swamp/marsh).

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs : GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.

    Returns
    -------
    GeoDataFrame
    """
    print("Reading waterbodies")
    df = read_dataframe(
        gdb_path,
        layer="NHDWaterbody",
        columns=WATERBODY_COLS,
        force_2d=True,
        where=f"FType not in {tuple(WATERBODY_EXCLUDE_FTYPES)}",
    )
    print("Read {:,} waterbodies".format(len(df)))

    # Convert multipolygons to polygons;
    # those we checked that are true multipolygons are errors
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)
    df.geometry = make_valid(df.geometry.values.data)

    print("projecting to target projection")
    df = df.to_crs(target_crs)

    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df.AreaSqKm = df.AreaSqKm.astype("float32")
    df.FType = df.FType.astype("uint16")

    ### Add calculated fields
    df["wbID"] = df.index.values.astype("uint32") + 1

    return df
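# A small illustration of the multipolygon handling above (assuming the pygeos
# API used throughout this code): get_geometry(geom, 0) returns the first part
# of a multi-part geometry, collapsing multipolygons to polygons; single-part
# geometries pass through unchanged when index is 0.
import pygeos as pg

mp = pg.multipolygons([pg.box(0, 0, 1, 1), pg.box(2, 2, 3, 3)])
first = pg.get_geometry(mp, 0)
print(pg.get_type_id(first))  # 3 -> Polygon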
    )

    raw_data = vrt.read()[0]
    raw_data = raw_data.astype("uint16")

    write_raster(
        src_dir / "midse_raw.tif",
        raw_data,
        transform=transform,
        crs=DATA_CRS,
        nodata=nodata,
    )

print("Reclassifying data...")
table = read_dataframe(src_dir / "MidSE_2020_CVI.img.vat.dbf")
table["priority"] = table.M_SEBcode.astype("uint8")

remap_table = table[["Value", "priority"]].values.astype("uint16")
data = remap(raw_data, remap_table, nodata=nodata)
print("Done reclassifying values")

# coerce down to uint8, set nodata to smaller value
data[data == nodata] = 255
nodata = 255

# apply mask
data = np.where(mask == 1, data, nodata).astype("uint8")

write_raster(outfilename,
waterbodies = None
rivers = None
for huc8 in units[huc2]:
    print(f"Reading NWI data for {huc8}")

    filename = src_dir.resolve() / f"{huc8}.zip"
    if not filename.exists():
        print(f"WARNING: {filename} not found")
        continue

    # Extract and merge lakes and wetlands
    df = read_dataframe(
        f"/vsizip/{filename}/HU8_{huc8}_Watershed/HU8_{huc8}_Wetlands.shp",
        columns=["ATTRIBUTE", "WETLAND_TY"],
        where="WETLAND_TY in ('Lake', 'Pond', 'Riverine')",
    ).rename(columns={"ATTRIBUTE": "nwi_code", "WETLAND_TY": "nwi_type"})

    # some geometries are invalid, filter them out
    df = df.loc[pg.is_geometry(df.geometry.values.data)].copy()
    if not len(df):
        continue

    df = df.to_crs(CRS)

    # Mark structurally altered types: codes ending with
    # x (excavated), d (ditched), r (artificial substrate), h (diked);
    # strip any terminal numbers then take last character
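    # A hedged sketch of the step described in the comment above (the original
    # code is truncated here): strip trailing digits from the NWI code, then
    # flag codes whose last remaining character is one of the modifiers.
    modifiers = {"x", "d", "r", "h"}
    base = df.nwi_code.str.rstrip("0123456789")
    df["altered"] = base.str[-1:].isin(modifiers)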
def bounds_to_poly(bounds):
    xmin, ymin, xmax, ymax = bounds
    return Polygon(
        [[xmin, ymin], [xmin, ymax], [xmax, ymax], [xmax, ymin], [xmin, ymin]]
    )


def to_titlecase(text):
    return " ".join([t.capitalize() for t in text.split(" ")])


boundaries_dir = Path("data/boundaries")
src_dir = boundaries_dir / "src"

### Process admin boundaries
print("Extracting admin boundaries...")
us_df = read_dataframe(src_dir / "us_state_wgs84.shp").rename(
    columns={"NAME": "admin1_name"}
)
us_df["admin1"] = "US-" + us_df.STUSPS
us_df["country"] = "US"

ca_df = read_dataframe(src_dir / "canada_province_wgs84.shp").rename(
    columns={"PRENAME": "admin1_name"}
)
ca_df["admin1"] = "CA-" + ca_df.PREABBR.str.replace(r"\.", "", regex=True)
ca_df["country"] = "CA"

# already in EPSG:4326, but needs to be simplified
mx_df = read_dataframe(src_dir / "mexico_state.shp")
mx_df.geometry = mx_df.geometry.simplify(0.001)
mx_df["admin1"] = "MX-" + mx_df.NUM_EDO
mx_df["admin1_name"] = mx_df.ENTIDAD.apply(to_titlecase)
mx_df = mx_df.dissolve(by="NUM_EDO")
async def create_custom_report(ctx, zip_filename, dataset, layer, name=""):
    """Create a Blueprint report for a user-uploaded GIS file contained in a zip.

    Zip must contain either a shapefile or a file geodatabase.

    Parameters
    ----------
    ctx : job context
    zip_filename : str
        full path to zip filename
    dataset : str
        full path to dataset within zip file
    layer : str
        name of layer within dataset
    name : str, optional (default: "")
        Name of area of interest (included in output report)

    Returns
    -------
    (str, list)
        path to output file, list of error messages

    Raises
    ------
    DataError
        Raised if bounds are too large or if area of interest doesn't overlap
        SA region
    """
    errors = []

    await set_progress(ctx["job_id"], 0, "Loading data")

    path = f"/vsizip/{zip_filename}/{dataset}"
    df = read_dataframe(path, layer=layer)
    geometry = pg.make_valid(df.geometry.values.data)

    await set_progress(ctx["job_id"], 5, "Preparing area of interest")

    # dissolve
    geometry = np.asarray([pg.union_all(geometry)])

    geo_geometry = to_crs(geometry, df.crs, GEO_CRS)
    bounds = pg.total_bounds(geo_geometry)

    # estimate area
    extent_area = (
        pg.area(pg.box(*pg.total_bounds(to_crs(geometry, df.crs, DATA_CRS))))
        * M2_ACRES
    )
    if extent_area >= CUSTOM_REPORT_MAX_ACRES:
        raise DataError(
            f"The bounding box of your area of interest is too large ({extent_area:,.0f} acres); it must be < {CUSTOM_REPORT_MAX_ACRES:,.0f} acres."
        )

    await set_progress(
        ctx["job_id"], 10, "Calculating results (this might take a while)"
    )

    # calculate results; data must be in DATA_CRS
    print("Calculating results...")
    results = CustomArea(geometry, df.crs, name).get_results()

    if results is None:
        raise DataError("area of interest does not overlap Southeast Blueprint")

    if name:
        results["name"] = name

    has_urban = "proj_urban" in results and results["proj_urban"][4] > 0
    has_slr = "slr" in results
    has_ownership = "ownership" in results
    has_protection = "protection" in results

    # compile indicator IDs across all inputs
    indicators = []
    for input_area in results["inputs"]:
        for ecosystem in input_area.get("ecosystems", []):
            indicators.extend([i["id"] for i in ecosystem["indicators"]])

    await set_progress(ctx["job_id"], 25, "Creating maps (this might take a while)")

    print("Rendering maps...")
    maps, scale, map_errors = await render_maps(
        bounds,
        geometry=geo_geometry[0],
        input_ids=results["input_ids"],
        indicators=indicators,
        urban=has_urban,
        slr=has_slr,
        ownership=has_ownership,
        protection=has_protection,
    )

    if map_errors:
        log.error(f"Map rendering errors: {map_errors}")
        if "basemap" in map_errors:
            errors.append("Error creating basemap for all maps")

        if "aoi" in map_errors:
            errors.append("Error rendering area of interest on maps")

        if set(map_errors.keys()).difference(["basemap", "aoi"]):
            errors.append("Error creating one or more maps")

    await set_progress(
        ctx["job_id"], 75, "Creating PDF (this might take a while)", errors=errors
    )

    results["scale"] = scale

    pdf = create_report(maps=maps, results=results)

    await set_progress(ctx["job_id"], 95, "Nearly done", errors=errors)

    fp, name = tempfile.mkstemp(suffix=".pdf", dir=TEMP_DIR)
    with open(fp, "wb") as out:
        out.write(pdf)

    await set_progress(ctx["job_id"], 100, "All done!", errors=errors)

    log.debug(f"Created PDF at: {name}")

    return name, errors
outer_huc4 = outer_huc4.loc[
    outer_huc4.HUC4.isin(keep_huc4) & (outer_huc4.clip_km2 >= 2.5)
].copy()
outer_huc4 = dissolve(outer_huc4, by="HUC4", agg={"HUC2": "first"}).reset_index(
    drop=True
)
outer_huc4.to_feather(out_dir / "outer_huc4.feather")
write_dataframe(outer_huc4, out_dir / "outer_huc4.gpkg")

### Counties - within HUC4 bounds
print("Processing counties")
fips = sorted(state_df.STATEFIPS.unique())

county_df = (
    read_dataframe(
        county_filename,
        columns=["NAME", "GEOID", "STATEFP"],
    )
    .to_crs(CRS)
    .rename(columns={"NAME": "County", "GEOID": "COUNTYFIPS", "STATEFP": "STATEFIPS"})
)

# keep only those within the region HUC4 outer boundary
tree = pg.STRtree(county_df.geometry.values.data)
ix = np.unique(
    tree.query_bulk(huc4_df.geometry.values.data, predicate="intersects")[1]
)
ix.sort()

county_df = county_df.iloc[ix].reset_index(drop=True)
county_df.geometry = pg.make_valid(county_df.geometry.values.data)

# keep larger set for spatial joins
county_df.to_feather(out_dir / "counties.feather")
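# A minimal sketch of the query_bulk pattern used above (pygeos API):
# query_bulk returns a 2 x N array of [source index, tree index] pairs, so
# taking row [1] (as above) yields the indices of tree geometries (here:
# counties) that match the predicate against any source geometry.
import numpy as np
import pygeos as pg

demo_tree = pg.STRtree(np.array([pg.box(0, 0, 1, 1), pg.box(5, 5, 6, 6)]))
pairs = demo_tree.query_bulk(
    np.array([pg.box(0.5, 0.5, 2, 2)]), predicate="intersects"
)
print(pairs)  # [[0] [0]] -> source 0 intersects tree geometry 0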
data = vrt.read()[0]

# convert to uint8
data = np.where(data == int(rc.nodata), 255, data)

# clip to mask
data = np.where(mask == 1, data, 255).astype("uint8")

tnc_data = data.copy()

# Reclassify to incremental values based on lookup table
print("Reclassifying TNC data...")
table = read_dataframe(
    src_dir / "Resilient_and_Connected20180308.tif.vat.dbf",
    read_geometry=False,
    columns=["Value"],
)
for i, row in table.iterrows():
    if i == row.Value:
        continue
    data[data == row.Value] = i

write_raster(tnc_outfilename, data, transform=transform, crs=DATA_CRS, nodata=255)
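# An observation (not part of the original script): the in-place loop above
# can misassign values if a later row's Value equals an index i that was
# already written into `data`. A collision-free variant writes into a copy:
remapped = data.copy()
for i, row in table.iterrows():
    if i != row.Value:
        # read from the untouched original, write into the copy
        remapped[data == row.Value] = i
data = remapped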
def convert_census_gdb(
    file,
    year=None,
    layers=None,
    level="bg",
    save_intermediate=True,
    combine=True,
    output_dir=".",
):
    """Convert file geodatabases from Census into (set of) parquet files.

    Parameters
    ----------
    file : str
        path to file geodatabase
    year : str
        year that the data should be named by. If None, will try to infer from
        the filename based on convention from the Census Bureau FTP server
    layers : list, optional
        set of layers to extract from geodatabase. If None (default), all
        layers will be extracted
    level : str, optional
        geographic level of data ('bg' for blockgroups or 'tr' for tract),
        by default "bg"
    save_intermediate : bool, optional
        if True, each layer will be stored separately as a parquet file,
        by default True
    combine : bool, optional
        whether to store and concatenate intermediate dataframes, by default True
    output_dir : str, optional
        path to directory where parquet files will be written, by default "."
    """
    try:
        import pyogrio as ogr
    except ImportError:
        raise ImportError(
            "this function requires the `pyogrio` package\n"
            "`conda install pyogrio`"
        )

    if not layers:  # grab them all except the metadata
        year_suffix = file.split(".")[0].split("_")[1][-2:]
        meta_str = f"{level.upper()}_METADATA_20{year_suffix}"
        layers = [layer[0] for layer in ogr.list_layers(file)]
        if meta_str in layers:
            layers.remove(meta_str)

    if not year:
        # make a strong assumption about the name of the file coming from census
        year = file.split("_")[1]

    tables = []
    for i in layers:
        print(i)
        df = ogr.read_dataframe(file, layer=i).set_index("GEOID")
        if "ACS_" in i:
            df = gpd.GeoDataFrame(df)
        else:
            df = df[df.columns[df.columns.str.contains("e")]]
            df.columns = pd.Series(df.columns).apply(reformat_acs_vars)
        df = df.dropna(axis=1, how="all")
        if combine:
            tables.append(df)
        if save_intermediate:
            df.to_parquet(
                pathlib.PurePath(output_dir, f"acs_{year}_{i}_{level}.parquet")
            )
    if combine:
        df = pd.concat(tables, axis=1)
        if f"ACS_{year}_5YR_{level.upper()}" in layers:
            df = gpd.GeoDataFrame(df)
        df.to_parquet(pathlib.PurePath(output_dir, f"acs_{year}_{level}.parquet"))
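# A hedged sketch of the reformat_acs_vars helper referenced above (its
# implementation is not shown in this excerpt): it plausibly converts
# geodatabase estimate column names such as "B01001e1" into the canonical ACS
# variable form "B01001_001E".
def reformat_acs_vars(col):
    parts = col.split("e")
    # zero-pad the estimate number to three digits and restore the E suffix
    return parts[0] + "_" + parts[1].rjust(3, "0") + "E"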
# NOTE: crossings are already de-duplicated against each other and against
# barriers
crossings = pd.read_feather(
    src_dir / "road_crossings.feather", columns=["id"] + SUMMARY_UNITS
)

# Calculate summary statistics for each type of summary unit
# These are joined to vector tiles
mbtiles_files = []
for unit in SUMMARY_UNITS:
    print(f"processing {unit}")

    if unit == "State":
        units = read_dataframe(
            bnd_dir / "region_states.gpkg", columns=["id"], read_geometry=False
        ).set_index("id")
    elif unit == "COUNTYFIPS":
        units = read_dataframe(
            bnd_dir / "region_counties.gpkg", columns=["id"], read_geometry=False
        ).set_index("id")
    else:
        units = pd.read_feather(
            bnd_dir / f"{unit}.feather", columns=[unit]
        ).set_index(unit)

    dam_stats = (
        dams[[unit, "id", "OnNetwork", "Recon"]]
        .groupby(unit)
        .agg({"id": "count", "OnNetwork": "sum", "Recon": "sum"})
        .rename(
from pathlib import Path
import os
import warnings

from pyogrio import read_dataframe, write_dataframe

from analysis.constants import DATA_CRS

warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

src_dir = Path("source_data/caribbean")
out_dir = Path("data/inputs/indicators/caribbean")
tile_dir = Path("data/for_tiles")

if not out_dir.exists():
    os.makedirs(out_dir)

df = (
    read_dataframe(
        src_dir / "Watershed_Ranking_PR.shp", columns=["Metric_Ran", "HUC_10"]
    )
    .rename(columns={"Metric_Ran": "carrank", "HUC_10": "HUC10"})
    .to_crs(DATA_CRS)
)

df.to_feather(out_dir / "caribbean.feather")

# for tiles
write_dataframe(
    df[["geometry", "carrank"]], tile_dir / "caribbean.geojson", driver="GeoJSONSeq"
)
src_dir = barriers_dir / "source"
master_dir = barriers_dir / "master"
qa_dir = barriers_dir / "qa"
network_dir = data_dir / "networks/21/dams"

gdb = src_dir / "PR_Dec2019.gdb"
dams_layer = "Puerto_Inventory_Dec2019_Indicators"
network_layer = "PR_Functional_River_Network"

if not os.path.exists(network_dir):
    os.makedirs(network_dir)

start = time()

print("Reading Puerto Rico networks...")
networks = pio.read_dataframe(
    gdb, layer=network_layer, as_pygeos=True, columns=NET_COLS
)
src_crs = networks.crs
networks = networks.rename(
    columns={"batNetID": "networkID", "StreamOrde": "streamorder"}
).set_index("networkID")

# convert to LineStrings
networks.geometry = pg.get_geometry(networks.geometry, 0)

# project to CRS
networks.geometry = to_crs(networks.geometry, src_crs, CRS)
networks["length"] = pg.length(networks.geometry)
networks["miles"] = networks.length * 0.000621371
data_dir = Path("data") out_dir = data_dir / "boundaries" wbd_gdb = data_dir / "nhd/source/wbd/WBD_National_GDB/WBD_National_GDB.gdb" huc4_df = gp.read_feather(out_dir / "huc4.feather") huc4 = sorted(huc4_df.HUC4.unique()) sarp_huc4_df = gp.read_feather(out_dir / "sarp_huc4.feather") sarp_huc4 = sorted(sarp_huc4_df.HUC4.unique()) ### Extract HUC6 within HUC4 print("Processing HUC6...") huc6_df = (read_dataframe( wbd_gdb, layer="WBDHU6", columns=["huc6", "name"], where=f"SUBSTR(huc6, 0, 4) IN {tuple(huc4)}", ).rename(columns={ "huc6": "HUC6" }).to_crs(CRS)) huc6_df.to_feather(out_dir / "huc6.feather") write_dataframe(huc6_df.rename(columns={"HUC6": "id"}), out_dir / "huc6.gpkg") huc6_df["HUC4"] = huc6_df.HUC6.str[:4] sarp_huc6_df = huc6_df.loc[huc6_df.HUC4.isin(sarp_huc4)].drop(columns=["HUC4"]) write_dataframe(sarp_huc6_df.rename(columns={"HUC6": "id"}), out_dir / "sarp_huc6.gpkg") sarp_huc6_df.to_feather(out_dir / "sarp_huc6.feather") ### Extract HUC8 within HUC4 print("Processing HUC8...") huc8_df = (read_dataframe(
from analysis.constants import CRS
from analysis.lib.geometry import dissolve, explode

warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

data_dir = Path("data")
nhd_dir = data_dir / "nhd/raw"  # intentionally use raw flowlines
src_dir = data_dir / "states/sc"
huc2 = "03"

print("Reading waterbodies...")
df = (
    read_dataframe(
        src_dir / "SCBreakline.gdb",
        layer="Waterbody",
        force_2d=True,
        columns=[],
    )
    .rename(columns={"NAME": "name"})
    .to_crs(CRS)
)

print("Reading flowlines...")
flowlines = gp.read_feather(nhd_dir / huc2 / "flowlines.feather", columns=[])
tree = pg.STRtree(flowlines.geometry.values.data)

print(f"Extracted {len(df):,} SC waterbodies")

left, right = tree.query_bulk(df.geometry.values.data, predicate="intersects")
df = df.iloc[np.unique(left)].reset_index(drop=True)
print(f"Kept {len(df):,} that intersect flowlines")

df = explode(df)
# make valid
start = time()

merged = None
for huc2 in units.keys():
    print(f"Processing floodplain stats for {huc2}")

    if huc2 == "02":
        filename = region02_gdb_filename
        layer = "Region002_Catchments_Natl_LCStats"
    else:
        filename = gdb_filename
        layer = layers[huc2]

    df = read_dataframe(filename, layer=layer)
    df["HUC2"] = huc2
    df["NHDPlusID"] = df.NHDIDSTR.astype("uint64")

    cols = [c for c in df.columns if c.startswith("VALUE_")]
    natural_cols = [c for c in cols if int(c.split("_")[1]) in NATURAL_TYPES]

    df["floodplain_km2"] = df[cols].sum(axis=1) * 1e-6
    df["nat_floodplain_km2"] = df[natural_cols].sum(axis=1) * 1e-6

    merged = append(
        merged, df[["NHDPlusID", "HUC2", "nat_floodplain_km2", "floodplain_km2"]]
    )

merged.reset_index(drop=True).to_feather(src_dir / "floodplain_stats.feather")
from analysis.constants import MASK_FACTOR
from analysis.lib.pygeos_util import explode, to_dict
from analysis.lib.raster import add_overviews, create_lowres_mask

warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

src_dir = Path("source_data/blueprint")
data_dir = Path("data")
out_dir = data_dir / "inputs"
bnd_dir = data_dir / "boundaries"
json_dir = Path("constants")

blueprint_filename = out_dir / "se_blueprint2021.tif"

df = read_dataframe(
    src_dir / "SE_Blueprint_2021_Vectors.gdb", layer="InputAreas_SECAS_2021_20211117"
)

# some areas are null inputs, drop them
df = df.loc[df.InputOverlapAreasSECAS_InputUsedIn2021.notnull()].copy()

# making valid takes a really long time, and is probably not necessary

df["inputs"] = df.InputOverlapAreasSECAS_InputUsedIn2021.str.lower().apply(
    lambda x: x.replace("tx chat", "txchat")
    .replace("ok chat", "okchat")
    .replace(" ", "")
    .replace(";", ",")
)

# split parts for easier indexing
df = explode(df).reset_index()
df = df[["inputs", "geometry"]].copy()

inputs = df.inputs.unique()
warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

data_dir = Path("data")
out_dir = data_dir / "boundaries"
ui_dir = Path("ui/data")

state_filename = data_dir / "boundaries/source/tl_2019_us_state/tl_2019_us_state.shp"
wbd_gdb = data_dir / "nhd/source/wbd/WBD_National_GDB/WBD_National_GDB.gdb"

### Construct region and SARP boundaries from states
print("Processing states...")
state_df = (
    read_dataframe(
        state_filename,
        columns=["STUSPS", "STATEFP", "NAME"],
    )
    .to_crs(CRS)
    .rename(columns={"STUSPS": "id", "NAME": "State", "STATEFP": "STATEFIPS"})
)
state_df.geometry = pg.make_valid(state_df.geometry.values.data)

# save all states for spatial joins
state_df.to_feather(out_dir / "states.feather")

state_df = state_df.loc[state_df.id.isin(STATES.keys())].copy()
state_df.to_feather(out_dir / "region_states.feather")
write_dataframe(
    state_df[["State", "geometry"]].rename(columns={"State": "id"}),
    out_dir / "region_states.gpkg",
)
out_dir = data_dir / "inputs/indicators/chat" gis_dir = data_dir / "indicators/chat" tile_dir = data_dir / "for_tiles" if not out_dir.exists(): os.makedirs(out_dir) if not gis_dir.exists(): os.makedirs(gis_dir) inputs_df = gp.read_feather(data_dir / "inputs/boundaries/input_areas.feather") # Is in EPSG:5070 but not recognized as such print("Reading CHAT data...") df = (read_dataframe(src_dir / "WAFWA_CHAT_Lower48.shp").set_crs(DATA_CRS).drop( columns=["ls_cond"]).rename(columns=field_map).rename( columns={"chat_rank": "chatrank"})) for col in chat_fields: df[col] = df[col].astype("uint8") df = df.drop(columns=["hexagon_id"]) ### Find the CHAT units that intersect with OK / TX input areas # Use centerpoints, since input area roughly follows edges of hexes points = pg.centroid(df.geometry.values.data) tree = pg.STRtree(points) for state in ["ok", "tx"]: print(f"Processing {state} CHAT...") input_area = pg.union_all(
# else:
#     merged = merged.append(df, ignore_index=True)

# df = merged
# df["geometry"] = from_wkb(df.geometry)

# read specific states
states = ",".join(f"'{s}'" for s in SE_STATES + ["UNKF"])
df = read_dataframe(
    src_dir / "pad_us2_1.gpkg",
    columns=[
        "Category",
        "State_Nm",
        "Own_Type",
        "GAP_Sts",
        "Loc_Nm",
        "Loc_Own",
        "Agg_Src",
    ],
    where=f"State_Nm in ({states})",
)

# set the CRS; it is the same as EPSG:5070 but not recognized properly
df = df.set_crs(DATA_CRS)

# drop BOEM lease block groups
df = df.loc[df.Agg_Src != "USGS_PADUS2_0Marine_BOEM_Block_Dissolve"].drop(
    columns=["Agg_Src"]
)

tree = pg.STRtree(df.geometry.values.data)
print("Reading road crossings") # rename columns to match small barriers # NOTE: tiger2020_feature_names is a combination of multiple road names df = read_dataframe( src_dir / "stream_crossings_united_states_feb_2022.gpkg", layer="stream_crossing_sites", columns=[ "stream_crossing_id", "tiger2020_feature_names", "nhdhr_gnis_stream_name", "crossing_type", ], ).rename( columns={ "tiger2020_feature_names": "Road", "nhdhr_gnis_stream_name": "Stream", "stream_crossing_id": "SARPID", "crossing_type": "crossingtype", } ) print(f"Read {len(df):,} road crossings") # project HUC4 to match crossings huc4 = gp.read_feather(boundaries_dir / "huc4.feather", columns=["geometry"]).to_crs( df.crs )
endangered_df = listed_df.loc[listed_df.official_status == "E"].SNAME.unique()
threatened_df = listed_df.loc[listed_df.official_status == "T"].SNAME.unique()

### Process trout data (not necessarily T/E/SGCN, just used for filtering)
trout = read_dataframe(
    gdb,
    layer=trout_layer,
    read_geometry=False,
    columns=[
        "HUC12_Code",
        "Species_Name",
        "Common_Name",
        "Historical",
        "Federal_Status",
        "State_Status",
        "SGCN_Listing",
        "Regional_SGCN",
    ],
).rename(
    columns={
        "HUC12_Code": "HUC12",
        "Species_Name": "SNAME",
        "Common_Name": "CNAME",
        "Federal_Status": "federal",
        "State_Status": "state",
        "SGCN_Listing": "sgcn",
        "Regional_SGCN": "regional",
    }
)

# drop Trout-perch
trout = trout.loc[trout.CNAME != "Trout-perch"].copy()
def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):
    """Extract flowlines from the NHDPlusHR data product, join to the VAA
    table, extract joins between flowlines, and filter out coastlines.

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs : GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.
    extra_flowline_cols : list
        List of extra field names to extract from NHDFlowline layer

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (flowlines, joins)
    """
    ### Read in flowline data and convert to data frame
    print("Reading flowlines")
    flowline_cols = FLOWLINE_COLS + extra_flowline_cols
    df = read_dataframe(
        gdb_path,
        layer="NHDFlowline",
        force_2d=True,
        columns=flowline_cols,
    )

    # Index on NHDPlusID for easy joins to other NHD data
    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df = df.set_index(["NHDPlusID"], drop=False)

    # convert MultiLineStrings to LineStrings (all have a single linestring)
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)

    print("making valid and projecting to target projection")
    df.geometry = make_valid(df.geometry.values.data)
    df = df.to_crs(target_crs)
    print(f"Read {len(df):,} flowlines")

    ### Read in VAA and convert to data frame
    # NOTE: not all records in Flowlines have corresponding records in VAA;
    # we drop those that do not since we need these fields.
    print("Reading VAA table and joining...")
    vaa_df = read_dataframe(gdb_path, layer="NHDPlusFlowlineVAA", columns=VAA_COLS)
    vaa_df.NHDPlusID = vaa_df.NHDPlusID.astype("uint64")
    vaa_df = vaa_df.set_index(["NHDPlusID"])
    df = df.join(vaa_df, how="inner")
    print(f"{len(df):,} features after join to VAA")

    # Simplify data types for smaller files and faster IO
    df.FType = df.FType.astype("uint16")
    df.FCode = df.FCode.astype("uint16")
    df.StreamOrde = df.StreamOrde.astype("uint8")
    df.Slope = df.Slope.astype("float32")
    df.MinElevSmo = df.MinElevSmo.astype("float32")
    df.MaxElevSmo = df.MaxElevSmo.astype("float32")

    ### Read in flowline joins
    print("Reading flowline joins")
    join_df = read_dataframe(
        gdb_path,
        layer="NHDPlusFlow",
        read_geometry=False,
        columns=["FromNHDPID", "ToNHDPID"],
    ).rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
    join_df.upstream = join_df.upstream.astype("uint64")
    join_df.downstream = join_df.downstream.astype("uint64")

    ### Fix errors in NHD
    # Some valid joins are marked as terminals (downstream == 0) in NHD; we
    # need to backfill the missing join info. To do this, we intersect all
    # terminals back with flowlines, dropping any that are themselves
    # terminals. Then we calculate the distance to the upstream point of the
    # intersected line, and the upstream point of the next segment downstream.
    # We use the ID of whichever one is closer (must be within 100m).
    ix = join_df.loc[join_df.downstream == 0].upstream.unique()

    # get last point, is furthest downstream
    tmp = df.loc[df.index.isin(ix), ["geometry"]].copy()
    tmp["geometry"] = pg.get_point(tmp.geometry.values.data, -1)

    # only search against other flowlines
    target = df.loc[~df.index.isin(ix)]
    tree = pg.STRtree(target.geometry.values.data)

    # search within a tolerance of 0.001; these are very, very close
    left, right = tree.nearest_all(tmp.geometry.values.data, max_distance=0.001)

    pairs = pd.DataFrame(
        {
            "left": tmp.index.take(left),
            "right": target.index.take(right),
            "source": tmp.geometry.values.data.take(left),
            # take upstream / downstream points of matched lines
            "upstream_target": pg.get_point(df.geometry.values.data.take(right), 0),
        }
    )

    # drop any pairs where the other side is also a terminal (these appear as
    # V-shaped tiny networks that need to be left as is)
    pairs = pairs.loc[~pairs.right.isin(ix)]

    # calculate the next segment downstream (only keep the first if multiple;
    # possible logic issue)
    next_downstream = (
        join_df.loc[(join_df.upstream != 0) & (join_df.downstream != 0)]
        .groupby("upstream")
        .downstream.first()
    )
    pairs["next_downstream"] = pairs.right.map(next_downstream)
    pairs.loc[pairs.next_downstream.notnull(), "downstream_target"] = pg.get_point(
        df.loc[
            pairs.loc[pairs.next_downstream.notnull()].next_downstream
        ].geometry.values.data,
        0,
    )

    pairs["upstream_dist"] = pg.distance(pairs.source, pairs.upstream_target)
    ix = pairs.next_downstream.notnull()
    pairs.loc[ix, "downstream_dist"] = pg.distance(
        pairs.loc[ix].source, pairs.loc[ix].downstream_target
    )

    # this ignores any NaN
    pairs["dist"] = pairs[["upstream_dist", "downstream_dist"]].min(axis=1)

    # discard any that are too far (>100m)
    pairs = pairs.loc[pairs.dist <= 100].copy()

    # sort by distance to upstream point of matched flowline; this allows us
    # to sort on those then dedup to calculate a new downstream ID for this
    # source line
    pairs = pairs.sort_values(by=["left", "dist"])

    # set the right value to the next downstream if it is closer
    # (this also ignores NaN)
    ix = pairs.downstream_dist < pairs.upstream_dist
    pairs.loc[ix, "right"] = pairs.loc[ix].next_downstream.astype("uint64")

    ids = pairs.groupby("left").right.first()

    if len(ids):
        # save to send to NHD
        pd.DataFrame({"NHDPlusID": ids.index.unique()}).to_csv(
            f"/tmp/{gdb_path.stem}_bad_joins.csv", index=False
        )

        ix = join_df.upstream.isin(ids.index)
        join_df.loc[ix, "downstream"] = join_df.loc[ix].upstream.map(ids)

        print(
            f"Repaired {len(ids):,} joins marked by NHD as terminals but actually joined to flowlines"
        )

    # set join types to make it easier to track
    join_df["type"] = "internal"  # set default
    # upstream-most origin points
    join_df.loc[join_df.upstream == 0, "type"] = "origin"
    # downstream-most termination points
    join_df.loc[join_df.downstream == 0, "type"] = "terminal"

    ### Filter out coastlines and update joins
    # WARNING: we tried filtering out pipelines (FType == 428). It doesn't
    # work properly; there are many that go through dams and are thus needed
    # to calculate network connectivity and gain of removing a dam.
print("Filtering out coastlines...") coastline_idx = df.loc[df.FType == 566].index df = df.loc[~df.index.isin(coastline_idx)].copy() print(f"{len(df):,} features after removing coastlines") # remove any joins that have coastlines as upstream # these are themselves coastline segments join_df = join_df.loc[~join_df.upstream.isin(coastline_idx)].copy() # set the downstream to 0 for any that join coastlines # this will enable us to mark these as downstream terminals in # the network analysis later join_df["marine"] = join_df.downstream.isin(coastline_idx) join_df.loc[join_df.marine, "downstream"] = 0 join_df.loc[join_df.marine, "type"] = "terminal" # drop any duplicates (above operation sets some joins to upstream and downstream of 0) join_df = join_df.drop_duplicates(subset=["upstream", "downstream"]) ### Filter out underground connectors ix = df.loc[df.FType == 420].index print("Removing {:,} underground conduits".format(len(ix))) df = df.loc[~df.index.isin(ix)].copy() join_df = remove_joins( join_df, ix, downstream_col="downstream", upstream_col="upstream" ) ### Label loops for easier removal later # WARNING: loops may be very problematic from a network processing standpoint. # Include with caution. print("Identifying loops") df["loop"] = (df.StreamOrde != df.StreamCalc) | (df.FlowDir.isnull()) idx = df.loc[df.loop].index join_df["loop"] = join_df.upstream.isin(idx) | join_df.downstream.isin(idx) ### Add calculated fields # Set our internal master IDs to the original index of the file we start from # Assume that we can always fit into a uint32, which is ~400 million records # and probably bigger than anything we could ever read in df["lineID"] = df.index.values.astype("uint32") + 1 join_df = ( join_df.join(df.lineID.rename("upstream_id"), on="upstream") .join(df.lineID.rename("downstream_id"), on="downstream") .fillna(0) ) for col in ("upstream", "downstream"): join_df[col] = join_df[col].astype("uint64") for col in ("upstream_id", "downstream_id"): join_df[col] = join_df[col].astype("uint32") ### Calculate size classes print("Calculating size class") drainage = df.TotDASqKm df.loc[drainage < 10, "sizeclass"] = "1a" df.loc[(drainage >= 10) & (drainage < 100), "sizeclass"] = "1b" df.loc[(drainage >= 100) & (drainage < 518), "sizeclass"] = "2" df.loc[(drainage >= 518) & (drainage < 2590), "sizeclass"] = "3a" df.loc[(drainage >= 2590) & (drainage < 10000), "sizeclass"] = "3b" df.loc[(drainage >= 10000) & (drainage < 25000), "sizeclass"] = "4" df.loc[drainage >= 25000, "sizeclass"] = "5" # Calculate length and sinuosity print("Calculating length and sinuosity") df["length"] = df.geometry.length.astype("float32") df["sinuosity"] = calculate_sinuosity(df.geometry.values.data).astype("float32") # drop columns not useful for later processing steps df = df.drop(columns=["FlowDir", "StreamCalc"]) # calculate incoming joins (have valid upstream, but not in this HUC4) join_df.loc[(join_df.upstream != 0) & (join_df.upstream_id == 0), "type"] = "huc_in" return df, join_df
    CRS,
    DAM_FS_COLS,
)

warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

data_dir = Path("data")
boundaries_dir = data_dir / "boundaries"
src_dir = data_dir / "barriers/source"

# note: drop date fields, they have bogus values anyway
nid = (
    read_dataframe(
        src_dir / "NID_2021.gdb",
    )
    .to_crs(CRS)
    .drop(columns=["EditorDate", "NextFollowUpDate"])
    .set_index("NIDID")
)
cols = [c for c in nid.columns if c in DAM_FS_COLS]
nid = nid[cols + ["geometry"]].rename(columns={"SARPUniqueID": "SARPID"})
nid["ManualReview"] = 0

# load previously snapped dams
prev = gp.read_feather(
    src_dir / "manually_snapped_dams.feather",
)
prev.ManualReview = prev.ManualReview.astype("uint8")
prev = prev.loc[prev.ManualReview.isin([4, 5, 13])].set_index("SARPID")
def read_file(
    path, npartitions=None, chunksize=None, layer=None, columns=None, **kwargs
):
    """
    Read a GIS file into a Dask GeoDataFrame.

    This function requires `pyogrio <https://github.com/geopandas/pyogrio/>`__.

    Parameters
    ----------
    path : str
        The absolute or relative path to the file or URL to be opened.
    npartitions : int, optional
        The number of partitions to create. Either this or `chunksize` should
        be specified.
    chunksize : int, optional
        The number of rows per partition to use. Either this or `npartitions`
        should be specified.
    layer : int or str, optional (default: first layer)
        If an integer is provided, it corresponds to the index of the layer
        within the data source. If a string is provided, it must match the
        name of the layer in the data source. Defaults to first layer in data
        source.
    columns : list-like, optional (default: all columns)
        List of column names to import from the data source. Column names
        must exactly match the names in the data source, and will be returned
        in the order they occur in the data source. To avoid reading any
        columns, pass an empty list-like.
    """
    try:
        import pyogrio
    except ImportError as err:
        raise ImportError(
            "The 'read_file' function requires the 'pyogrio' package, but it is "
            "not installed or does not import correctly."
            f"\nImporting pyogrio resulted in: {str(err)}"
        )

    from dask.layers import DataFrameIOLayer

    # TODO smart inference for a good default partition size?
    if (npartitions is None) == (chunksize is None):
        raise ValueError("Exactly one of npartitions and chunksize must be specified.")

    if "skip_features" in kwargs or "max_features" in kwargs:
        # TODO we currently use those keywords already for reading in each
        # partition (we would need to take those into account for determining
        # the part start/ends)
        raise ValueError(
            "The 'skip_features'/'max_features' keywords are not yet supported"
        )
    if kwargs:
        raise ValueError("Additional pyogrio keywords are not yet supported")

    total_size = pyogrio.read_info(path, layer=layer)["features"]

    if chunksize is None:
        chunksize = int(ceil(total_size / npartitions))

    # TODO this could be inferred from read_info?
    read_geometry = True
    if columns is not None and "geometry" not in columns:
        read_geometry = False
    meta = pyogrio.read_dataframe(
        path, layer=layer, columns=columns, read_geometry=read_geometry, max_features=5
    )

    # Define parts
    parts = []
    row_offset = 0
    divs = [row_offset]

    while row_offset < total_size:
        batch_size = min(chunksize, total_size - row_offset)
        parts.append((path, row_offset, batch_size))
        row_offset += batch_size
        divs.append(row_offset)

    # Set the last division value to be the largest index value in the last partition
    divs[-1] = divs[-1] - 1

    # Create Blockwise layer
    label = "read-file-"
    output_name = label + tokenize(path, chunksize, layer, columns)
    layer = DataFrameIOLayer(
        output_name,
        columns,
        parts,
        FileFunctionWrapper(layer, columns),
        label=label,
    )
    graph = HighLevelGraph({output_name: layer}, {output_name: set()})
    return new_dd_object(graph, output_name, meta, divs)
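# A minimal usage sketch for the function above ("example.gpkg" and "roads"
# are hypothetical names): partition a layer into 4 pieces and compute lazily.
ddf = read_file("example.gpkg", npartitions=4, layer="roads")
print(ddf.npartitions)  # 4
gdf = ddf.compute()     # each partition is read via pyogrio's skip/max features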
nodata = 255

print("Reading and warping Nature's Network...")
with rasterio.open(src_dir / "NaturesNetwork_conservdesign_180625.tif") as src:
    # note: raster does not have nodata set; 0 indicates NODATA (outside
    # extent) and 0 values
    data = extract_window(src, window, transform, nodata=nodata)

# apply input area mask
data = np.where(mask == 1, data, nodata).astype("uint8")

print("Reclassifying data...")
# Remap the raw values to priorities and categories
table = read_dataframe(
    src_dir / "NaturesNetwork_conservdesign_180625.tif.vat.dbf",
    columns=["Value", "Priority", "Descrpt"],
    read_geometry=False,
)
table = table.loc[table.Value > 0].copy()
table.Priority = table.Priority.astype("uint8")
table["category"] = table.Descrpt.str[0].astype("uint8")

remap_table = table[["Value", "Priority"]].values.astype("uint8")
priority_data = remap(data, remap_table, nodata=nodata)
write_raster(outfilename, priority_data, transform, DATA_CRS, nodata=nodata)

print("Adding overviews and masks...")
add_overviews(outfilename)

create_lowres_mask(
tile_dir = data_dir / "for_tiles"

input_area_mask = data_dir / "inputs/input_areas_mask.tif"

if not analysis_dir.exists():
    os.makedirs(analysis_dir)

bnd_df = gp.read_feather(data_dir / "inputs/boundaries/se_boundary.feather")
bnd = bnd_df.geometry.values.data[0]

### Extract HUC12 within boundary
print("Reading source HUC12s...")
merged = None
for huc2 in [2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 21]:
    df = read_dataframe(
        src_dir
        / f"summary_units/huc12/WBD_{huc2:02}_HU2_GDB/WBD_{huc2:02}_HU2_GDB.gdb",
        layer="WBDHU12",
    )[["huc12", "name", "geometry"]].rename(columns={"huc12": "id"})

    if merged is None:
        merged = df
    else:
        merged = merged.append(df, ignore_index=True)

print("Projecting to match SE region data...")
huc12 = merged.to_crs(DATA_CRS)

# select out those within the SE states
print("Selecting HUC12s in region...")