Example #1
def get_dataset(zip):
    """Gets singular geospatial dataset and layer for analysis.

    Validates rules:
    - There must be only one data source (.shp or .gdb) in the zip file.
    - There must be only one data layer in that data source.
    - A shapefile data source must include its .prj and .shx sidecar files (.dbf is not used, so it is not required).

    Parameters
    ----------
    zip : open ZipFile

    Returns
    -------
    (str, str)
        tuple of (name of the geospatial file within the zip file, name of the layer)
    """
    files = set(list_files(zip))
    geo_files = [f for f in list_files(zip) if f.endswith((".shp", ".gdb"))]

    num_files = len(geo_files)

    if num_files == 0:
        log.error("Upload zip file does not contain shp or FGDB files")

        raise ValueError("zip file must include a shapefile or FGDB")

    if num_files > 1:
        log.error(
            f"Upload zip file contains {num_files} shp or FGDB files:\n{geo_files}"
        )

        raise ValueError("zip file must include only one shapefile or FGDB")

    filename = geo_files[0]

    if filename.endswith(".shp"):
        missing = []
        for ext in (".prj", ".shx"):
            if filename.replace(".shp", ext) not in files:
                missing.append(ext)

        if missing:
            log.error(f"Upload zip file contains .shp but not {','.join(missing)}")
            raise ValueError("zip file must include .shp, .prj, and .shx files")

    # Validate that dataset is a polygon and has only a single layer
    layers = pio.list_layers(f"/vsizip/{zip.fp.name}/{filename}")

    if layers.shape[0] > 1:
        log.error(f"Upload data source contains multiple data layers\n{layers}")
        raise ValueError("data source must contain only one data layer")

    if "Polygon" not in layers[0, 1]:
        log.error(f"Upload data source is not a polygon: {layers[0,1]}")
        raise ValueError("data source must be a Polygon type")

    return filename, layers[0, 0]
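
Below is a minimal usage sketch, not taken from the original module: it assumes get_dataset is called on an uploaded zip archive, with pyogrio's read_dataframe available and "upload.zip" standing in as a placeholder path.

import zipfile

from pyogrio import read_dataframe

# Hypothetical caller: validate the uploaded archive, then read its single layer.
# GDAL's /vsizip/ prefix lets pyogrio read directly from inside the zip file.
with zipfile.ZipFile("upload.zip") as zf:  # placeholder path
    filename, layer = get_dataset(zf)
    df = read_dataframe(f"/vsizip/{zf.fp.name}/{filename}", layer=layer)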
Example #2
def test_list_layers(naturalearth_lowres, naturalearth_lowres_vsi,
                     test_fgdb_vsi):
    assert array_equal(list_layers(naturalearth_lowres),
                       [["naturalearth_lowres", "Polygon"]])

    assert array_equal(list_layers(naturalearth_lowres_vsi),
                       [["naturalearth_lowres", "Polygon"]])

    # Measured 3D is downgraded to 2.5D during read
    # Make sure this warning is raised
    with pytest.warns(
            UserWarning,
            match=r"Measured \(M\) geometry types are not supported"):
        fgdb_layers = list_layers(test_fgdb_vsi)
        assert len(fgdb_layers) == 7

        # Make sure that nonspatial layer has None for geometry
        assert array_equal(fgdb_layers[0], ["basetable_2", None])

        # Confirm that measured 3D is downgraded to 2.5D during read
        assert array_equal(fgdb_layers[3],
                           ["test_lines", "2.5D MultiLineString"])
        assert array_equal(fgdb_layers[6], ["test_areas", "2.5D MultiPolygon"])
Example #3
def test_read_layer(test_fgdb_vsi):
    layers = list_layers(test_fgdb_vsi)
    # The first layer is read by default (NOTE: first layer has no features)
    df = read_dataframe(test_fgdb_vsi, read_geometry=False, max_features=1)
    df2 = read_dataframe(test_fgdb_vsi,
                         layer=layers[0][0],
                         read_geometry=False,
                         max_features=1)
    assert_frame_equal(df, df2)

    # Reading a specific layer should return that layer.
    # Detected here by a known column.
    df = read_dataframe(test_fgdb_vsi,
                        layer="test_lines",
                        read_geometry=False,
                        max_features=1)
    assert "RIVER_MILE" in df.columns
Example #4
def test_vsi_read_layers(naturalearth_lowres_vsi):
    assert array_equal(list_layers(naturalearth_lowres_vsi),
                       [["naturalearth_lowres", "Polygon"]])

    meta, geometry, fields = read(naturalearth_lowres_vsi)
    assert geometry.shape == (177, )
Example #5
def convert_census_gdb(
    file,
    year=None,
    layers=None,
    level="bg",
    save_intermediate=True,
    combine=True,
    output_dir=".",
):
    """Convert file geodatabases from Census into (set of) parquet files.

    Parameters
    ----------
    file : str
        path to file geodatabase
    year : str, optional
        year used to name the output data. If None, the year is inferred from the filename,
        following the naming convention of the Census Bureau FTP server
    layers : list, optional
        set of layers to extract from the geodatabase. If None (default), all layers except the metadata layer will be extracted
    level : str, optional
        geographic level of data ('bg' for blockgroups or 'tr' for tract), by default "bg"
    save_intermediate : bool, optional
        if true, each layer will be stored separately as a parquet file, by default True
    combine : bool, optional
        if True, concatenate all layers and write them to a single combined parquet file, by default True
    output_dir : str, optional
        path to directory where parquet files will be written, by default "."
    """
    try:
        import pyogrio as ogr
    except ImportError:
        raise ImportError("this function requires the `pyogrio` package\n"
                          "`conda install pyogrio`")
    if not layers:  # grab them all except the metadata
        year_suffix = file.split(".")[0].split("_")[1][-2:]
        meta_str = f"{level.upper()}_METADATA_20{year_suffix}"
        layers = [layer[0] for layer in ogr.list_layers(file)]
        if meta_str in layers:
            layers.remove(meta_str)
    if not year:
        # make a strong assumption about the name of the file coming from the Census Bureau
        year = file.split("_")[1]
    tables = []
    for i in layers:
        print(i)
        df = ogr.read_dataframe(file, layer=i).set_index("GEOID")
        if "ACS_" in i:
            df = gpd.GeoDataFrame(df)
        else:
            df = df[df.columns[df.columns.str.contains("e")]]
            df.columns = pd.Series(df.columns).apply(reformat_acs_vars)
        df = df.dropna(axis=1, how="all")
        if combine:
            tables.append(df)
        if save_intermediate:
            df.to_parquet(
                pathlib.PurePath(output_dir,
                                 f"acs_{year}_{i}_{level}.parquet"))
    if combine:
        df = pd.concat(tables, axis=1)
        if f"ACS_{year}_5YR_{level.upper()}" in layers:
            df = gpd.GeoDataFrame(df)
        df.to_parquet(
            pathlib.PurePath(output_dir, f"acs_{year}_{level}.parquet"))
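
A hedged invocation sketch, not part of the original module: the geodatabase name below is a placeholder that follows the Census Bureau FTP naming convention the function relies on, and the output directory is likewise hypothetical.

# Hypothetical call: extract all block-group layers from a 2019 ACS geodatabase
# and write a single combined parquet file into ./census_parquet.
convert_census_gdb(
    "ACS_2019_5YR_BG.gdb",  # placeholder filename, Census FTP convention
    level="bg",
    save_intermediate=False,
    combine=True,
    output_dir="census_parquet",
)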
Example #6
import re
from pathlib import Path
from time import time

import pandas as pd
from pyogrio import read_dataframe, list_layers

from analysis.lib.util import append

# NLCD natural landcover classes
# descriptions here: https://www.mrlc.gov/data/legends/national-land-cover-database-2016-nlcd2016-legend
NATURAL_TYPES = {11, 12, 31, 41, 42, 43, 51, 52, 71, 72, 73, 74, 90, 95}

data_dir = Path("data")
src_dir = data_dir / "floodplains"
gdb_filename = src_dir / "NLCD2016_Floodplain_Stats_2020_12072020.gdb"
# fixes for region 02 were run later and have to be spliced in
region02_gdb_filename = src_dir / "Region2FixedStats.gdb"

# layer names vary; build a lookup from the 2-digit HUC2 code to the layer name
layers = list_layers(gdb_filename)[:, 0]
layers = {re.search(r"\d+", layer).group()[-2:]: layer for layer in layers}

huc4_df = pd.read_feather(data_dir / "boundaries/huc4.feather",
                          columns=["HUC2", "HUC4"])
# Convert to dict of sorted HUC4s per HUC2
units = huc4_df.groupby("HUC2").HUC4.unique().apply(sorted).to_dict()

start = time()

merged = None

for huc2 in units.keys():
    print(f"Processing floodplain stats for {huc2}")

    if huc2 == "02":