def test_parquet_roundtrip(tmp_path):
    # basic roundtrip
    df = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)
    basedir = tmp_path / "dataset"
    ddf.to_parquet(basedir)

    # each of the 4 partitions is written as a separate parquet file
    paths = list(basedir.glob("*.parquet"))
    assert len(paths) == 4

    # reading back gives an identical GeoDataFrame
    result = dask_geopandas.read_parquet(basedir)
    assert result.npartitions == 4
    assert_geodataframe_equal(result.compute(), df)

    # the written dataset is also readable by plain geopandas
    result_gpd = geopandas.read_parquet(basedir)
    # the dataset written by dask uses "__null_dask_index__" as the index column name
    result_gpd.index.name = None
    assert_geodataframe_equal(result_gpd, df)

    result_part0 = geopandas.read_parquet(basedir / "part.0.parquet")
    result_part0.index.name = None
    assert_geodataframe_equal(result_part0, df.iloc[:45])
def test_parquet_promote_secondary_geometry(tmpdir):
    """Reading a subset of columns that does not include the primary geometry
    column should promote the first geometry column present.
    """
    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))
    df["geom2"] = df.geometry.copy()

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)
    pq_df = read_parquet(filename, columns=["name", "geom2"])

    assert_geodataframe_equal(df.set_geometry("geom2")[["name", "geom2"]], pq_df)

    df["geom3"] = df.geometry.copy()
    df.to_parquet(filename)

    with pytest.warns(
        UserWarning,
        match="Multiple non-primary geometry columns read from Parquet file.",
    ):
        pq_df = read_parquet(filename, columns=["name", "geom2", "geom3"])

    assert_geodataframe_equal(
        df.set_geometry("geom2")[["name", "geom2", "geom3"]], pq_df
    )
def tracts_2010(self, states=None):
    """Nationwide Census Tracts as drawn in 2010 (cartographic 500k).

    Parameters
    ----------
    states : list-like
        list of state fips codes used to subset the national dataframe

    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        2010 tracts as a geodataframe or as a dataframe with geometry
        stored as well-known binary on the 'wkb' column.
    """
    try:
        t = gpd.read_parquet(pathlib.Path(data_dir, "tracts_2010_500k.parquet"))
    except Exception:
        warn(
            "streaming remote data. Use `geosnap.io.store_census()` to store "
            "the data locally for better performance"
        )
        t = gpd.read_parquet(
            "s3://spatial-ucr/census/tracts_cartographic/tracts_2010_500k.parquet"
        )
    if states:
        t = t[t.geoid.str[:2].isin(states)]
    t["year"] = 2010
    return t
def acs(self, year=2018, level="tract", states=None):
    """American Community Survey Data.

    Parameters
    ----------
    year : str
        vintage of the ACS release.
    level : str
        geographic level
    states : list, optional
        subset of states (as 2-digit fips) to return

    Returns
    -------
    geopandas.GeoDataFrame
        geodataframe of ACS data indexed by FIPS code
    """
    try:
        t = gpd.read_parquet(
            pathlib.Path(data_dir, "acs", f"acs_{year}_{level}.parquet")
        )
    except Exception:
        warn(
            "streaming remote data. Use `geosnap.io.store_acs()` to store "
            "the data locally for better performance"
        )
        t = gpd.read_parquet(
            f"s3://spatial-ucr/census/acs/acs_{year}_{level}.parquet"
        )
    t = t.reset_index().rename(columns={"GEOID": "geoid"})
    if states:
        t = t[t.geoid.str[:2].isin(states)]
    t["year"] = year
    return t
def test_parquet_invalid_metadata(tmpdir, geo_meta, error):
    """Geo metadata with missing required fields should raise a ValueError.

    This requires writing the parquet file directly below, so that we can
    control the metadata that is written for this test.
    """
    from pyarrow import parquet, Table

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    # convert to DataFrame and encode geometry to WKB
    df = DataFrame(df)
    df["geometry"] = to_wkb(df["geometry"].values)

    table = Table.from_pandas(df)
    metadata = table.schema.metadata
    metadata.update(geo_meta)
    table = table.replace_schema_metadata(metadata)

    filename = os.path.join(str(tmpdir), "test.pq")
    parquet.write_table(table, filename)

    with pytest.raises(ValueError, match=error):
        read_parquet(filename)
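# For context: the geo_meta fixture above injects variants of the b"geo"
# schema-metadata key that geopandas writes. A well-formed entry looks
# roughly like the sketch below (illustrative only; the exact required
# fields, e.g. "version" vs. "schema_version", depend on the
# geopandas/GeoParquet version in use).
import json

valid_geo_meta = {
    b"geo": json.dumps(
        {
            "primary_column": "geometry",
            "columns": {"geometry": {"encoding": "WKB", "crs": None}},
            "version": "1.0.0",
        }
    ).encode("utf-8")
}
# Removing a required field such as "primary_column" yields the kind of
# invalid metadata the test exercises.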
def test_fsspec_url():
    fsspec = pytest.importorskip("fsspec")
    import fsspec.implementations.memory

    class MyMemoryFileSystem(fsspec.implementations.memory.MemoryFileSystem):
        # Simple fsspec filesystem that adds a required keyword.
        # Attempting to use this filesystem without the keyword will raise
        # an exception.
        def __init__(self, is_set, *args, **kwargs):
            self.is_set = is_set
            super().__init__(*args, **kwargs)

    fsspec.register_implementation("memory", MyMemoryFileSystem, clobber=True)
    memfs = MyMemoryFileSystem(is_set=True)

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    with memfs.open("data.parquet", "wb") as f:
        df.to_parquet(f)

    result = read_parquet("memory://data.parquet", storage_options=dict(is_set=True))
    assert_geodataframe_equal(result, df)

    result = read_parquet("memory://data.parquet", filesystem=memfs)
    assert_geodataframe_equal(result, df)
def _fetcher(local_path, remote_path, warning_msg):
    try:
        t = gpd.read_parquet(local_path)
    except FileNotFoundError:
        warn(warning_msg)
        t = gpd.read_parquet(remote_path, storage_options={"anon": True})
    return t
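# A minimal usage sketch for _fetcher. The paths and message below are
# hypothetical; real callers pass the package's local cache file and the
# matching remote S3 key (read anonymously via the storage_options above).
import pathlib

states = _fetcher(
    local_path=pathlib.Path("~/.cache/geo/states.parquet").expanduser(),
    remote_path="s3://spatial-ucr/census/administrative/states.parquet",
    warning_msg="streaming remote data; store the file locally for better performance",
)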
def test_parquet_columns_no_geometry(tmpdir):
    """Reading a parquet file with a column subset that excludes all of the
    geometry columns should raise a ValueError."""
    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)

    with pytest.raises(ValueError):
        read_parquet(filename, columns=["name"])
def _open_dataset(self):
    """Open the dataset using geopandas."""
    if self._use_fsspec:
        with fsspec.open_files(self.urlpath, **self.storage_options) as f:
            f = self._resolve_single_file(f) if len(f) > 1 else f[0]
            self._dataframe = geopandas.read_parquet(
                f,
                **self._geopandas_kwargs,
            )
    else:
        self._dataframe = geopandas.read_parquet(
            self.urlpath, **self._geopandas_kwargs
        )
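# _open_dataset reads like a method of an intake-style driver class. A
# minimal sketch of the surrounding class it assumes (attribute names are
# inferred from the method body and are assumptions, not the real driver):
import fsspec
import geopandas


class GeoParquetSource:
    def __init__(self, urlpath, use_fsspec=False, storage_options=None,
                 geopandas_kwargs=None):
        self.urlpath = urlpath
        self._use_fsspec = use_fsspec
        self.storage_options = storage_options or {}
        self._geopandas_kwargs = geopandas_kwargs or {}
        self._dataframe = None

    def _resolve_single_file(self, open_files):
        # placeholder: pick one file when the glob matched several
        return open_files[0]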
def test_parquet_partition_on(tmp_path, write_metadata_file):
    df = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)

    # writing a partitioned dataset based on one of the attribute columns
    basedir = tmp_path / "naturalearth_lowres_by_continent.parquet"
    ddf.to_parquet(
        basedir, partition_on="continent", write_metadata_file=write_metadata_file
    )

    # check for one of the partitions that the file is present and correct
    n_files = 10 if write_metadata_file else 8  # 8 continents (+ 2 metadata files)
    assert len(list(basedir.iterdir())) == n_files
    assert (basedir / "continent=Africa").exists()

    result_africa = geopandas.read_parquet(basedir / "continent=Africa")
    expected = df[df["continent"] == "Africa"].drop(columns=["continent"])
    result_africa.index.name = None
    assert_geodataframe_equal(result_africa, expected)

    # check roundtrip
    result = dask_geopandas.read_parquet(basedir)
    assert result.npartitions >= 8
    assert result.spatial_partitions is not None
    expected = df.copy()
    expected["continent"] = expected["continent"].astype("category")
    assert_geodataframe_equal(result.compute(), expected, check_like=True)
def states(self):
    """States.

    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        US States as a geodataframe or as a dataframe with geometry stored
        as well-known binary on the 'wkb' column.
    """
    try:
        return gpd.read_parquet(pathlib.Path(data_dir, "states.parquet"))
    except Exception:
        return gpd.read_parquet(
            "s3://spatial-ucr/census/administrative/states.parquet"
        )
def test_write_read_parquet_expand_user():
    gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
    test_file = "~/test_file.parquet"
    gdf.to_parquet(test_file)
    pq_df = geopandas.read_parquet(test_file)
    assert_geodataframe_equal(gdf, pq_df, check_crs=True)
    os.remove(os.path.expanduser(test_file))
def download_geoparquet(file_name="my_file.parquet",
                        bucket_name="city-planning-entitlements",
                        local_path="", S3_path=""):
    """
    Download a geoparquet file from S3, read it into memory as a
    GeoDataFrame, and remove the local copy.

    Note: geopandas>=0.8.0 is required to read geoparquet files.

    Parameters
    ==========
    file_name: str, name of the file, such as "census_tracts.parquet"
    bucket_name: str, S3 bucket name.
    local_path: str, the local directory or folder path where the file
        should be stored. Ex: "./data/"
    S3_path: str, the S3 directory or folder path to where the file is
        stored in S3. Ex: "data/"
    """
    s3.download_file(bucket_name, f"{S3_path}{file_name}", f"{local_path}{file_name}")
    gdf = gpd.read_parquet(f"{local_path}{file_name}")
    os.remove(f"{local_path}{file_name}")
    return gdf
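# A minimal usage sketch. download_geoparquet relies on a module-level
# boto3 S3 client named `s3`; the bucket and paths below are illustrative.
import boto3

s3 = boto3.client("s3")

tracts = download_geoparquet(
    file_name="census_tracts.parquet",
    bucket_name="city-planning-entitlements",
    local_path="./data/",
    S3_path="data/",
)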
def test_parquet_index(tmpdir):
    """Setting index=True should preserve the index in the output, and
    setting index=False should drop the index from the output.
    """
    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset)).set_index("iso_a3")

    filename = os.path.join(str(tmpdir), "test_with_index.pq")
    df.to_parquet(filename, index=True)
    pq_df = read_parquet(filename)
    assert_geodataframe_equal(df, pq_df)

    filename = os.path.join(str(tmpdir), "drop_index.pq")
    df.to_parquet(filename, index=False)
    pq_df = read_parquet(filename)
    assert_geodataframe_equal(df.reset_index(drop=True), pq_df)
def test_parquet_missing_metadata2(tmpdir):
    """Missing geo metadata, such as from a parquet file created from a
    pyarrow Table (which will also not contain pandas metadata), will raise
    a ValueError.
    """
    import pyarrow.parquet as pq

    table = pyarrow.table({"a": [1, 2, 3]})
    filename = os.path.join(str(tmpdir), "test.pq")

    # use pyarrow.parquet write_table (no geo metadata, but also no pandas metadata)
    pq.write_table(table, filename)

    # missing metadata will raise a ValueError
    with pytest.raises(
        ValueError, match="Missing geo metadata in Parquet/Feather file."
    ):
        read_parquet(filename)
def region(vector, raster, cmap='rainbow', boundary='red', band=1):
    """Quickly plot a subregion from a raster dataset.

    The subregion is defined by the bounding box of the supplied vector.
    Colormaps are primarily obtained from the colorcet library.

    Parameters
    ----------
    vector : geopandas.GeoDataFrame or path_like object
    raster : rasterio.io.DatasetReader or path_like object
    cmap : colormap
        colormap for the raster data.
        Continuous data: e.g. colorwheel, rainbow, fire
        Categorical data: e.g. glasbey
    boundary : colorname
        color for the vector data
    band : int
        raster band to read

    Returns
    -------
    ax : matplotlib plot
    """
    if isinstance(vector, str):
        if Path(vector).suffix == '.parquet':
            gdf = gpd.read_parquet(vector)
        else:
            gdf = gpd.read_file(vector)
    else:
        gdf = vector

    if isinstance(raster, str):
        rast_file = rasterio.open(raster)
    else:
        rast_file = raster

    if cmap in cc.cm:
        cmap = cc.cm[cmap]

    # matplotlib and geographic packages like rasterio and geopandas use
    # different ordering conventions for their bounding box information:
    #   geographic packages (bounds): (west, south, east, north)
    #   matplotlib (extent):          (west, east, south, north)
    gdf_bounds = gdf.total_bounds
    gdf_extent = gdf_bounds[[0, 2, 1, 3]]

    # Subsetting raster data in rasterio is easiest to do before it is read
    # into memory (although it is possible to do so after read()). This
    # requires a rasterio.windows.Window object describing the area of
    # interest. There are many helper functions for building windows; the
    # simplest is rast_file.window, which accepts the bounds unpacked from
    # a gdf.
    rast_window = rast_file.window(*gdf_bounds)

    # now we can read in our data within the desired region
    rast = rast_file.read(band, window=rast_window)

    plt.imshow(rast, cmap=cmap, extent=gdf_extent)
    gdf.boundary.plot(ax=plt.gca(), color=boundary)
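# A minimal usage sketch for region(); the file names are hypothetical and
# the module-level imports the function relies on (gpd, rasterio, cc, plt,
# Path) are assumed to be in place.
region(
    vector='watershed.parquet',   # read via gpd.read_parquet
    raster='elevation.tif',       # opened via rasterio.open
    cmap='rainbow',
    boundary='red',
    band=1,
)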
def test_parquet_subset_columns(tmpdir):
    """Reading a subset of columns should correctly decode the selected
    geometry columns.
    """
    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)
    pq_df = read_parquet(filename, columns=["name", "geometry"])

    assert_geodataframe_equal(df[["name", "geometry"]], pq_df)

    with pytest.raises(
        ValueError, match="No geometry columns are included in the columns read"
    ):
        read_parquet(filename, columns=[])
def blocks_2010(self, states=None, fips=None):
    """Census blocks for 2010.

    Parameters
    ----------
    states : list-like
        list of state fips codes to return as a dataframe.
    fips : str, optional
        geoid prefix used to subset the blocks.

    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        2010 blocks as a geodataframe or as a dataframe with geometry
        stored as well-known binary on the 'wkb' column.
    """
    if isinstance(states, (str, int)):
        states = [states]
    blks = {}
    for state in states:
        try:
            blks[state] = gpd.read_parquet(
                pathlib.Path(data_dir, "blocks_2010", f"{state}.parquet")
            )
        except Exception:
            warn(
                "Unable to locate local census 2010 block data. Streaming instead.\n"
                "If you plan to use census data repeatedly you can store it locally "
                "with the io.store_blocks_2010 function for better performance"
            )
            blks[state] = gpd.read_parquet(
                f"s3://spatial-ucr/census/blocks_2010/{state}.parquet"
            )
        if fips:
            blks[state] = blks[state][blks[state]["geoid"].str.startswith(fips)]
        blks[state]["year"] = 2010
    blocks = list(blks.values())
    blocks = gpd.GeoDataFrame(pd.concat(blocks, sort=True))
    return blocks
def counties(self):
    """Nationwide counties as drawn in 2010.

    Returns
    -------
    geopandas.GeoDataFrame
        2010 counties as a geodataframe or as a dataframe with geometry
        stored as well-known binary on the 'wkb' column.
    """
    try:
        return gpd.read_parquet(pathlib.Path(data_dir, "counties.parquet"))
    except Exception:
        return gpd.read_parquet(
            "s3://spatial-ucr/census/administrative/counties.parquet"
        )
def main(config):
    incoming_data_path = config['paths']['incoming_data']
    processed_data_path = config['paths']['data']
    output_data_path = config['paths']['output']

    epsg_jamaica = 3448
    baseline_year = 2019
    projection_end_year = 2100
    discounting_rate = 10

    asset_data_details = pd.read_csv(
        os.path.join(
            processed_data_path,
            "networks",
            "network_layers_hazard_intersections_details.csv",
        )
    )
    asset_data_details = asset_data_details[
        asset_data_details["sector"] != "buildings"
    ]

    hazard_asset_intersection_path = os.path.join(
        output_data_path, "hazard_asset_intersection"
    )
    flood_hazards = ["coastal", "fluvial", "surface"]
    flood_threshold = 0.5

    hazard_data_details = pd.read_csv(
        os.path.join(processed_data_path, "hazards", "hazard_layers.csv"),
        encoding="latin1",
    )
    hazard_keys = hazard_data_details[
        hazard_data_details["hazard"].isin(flood_hazards)
    ].key.values.tolist()

    for asset_info in asset_data_details.itertuples():
        asset_id = asset_info.asset_id_column
        index_columns = [asset_id, "damage_cost_unit", "hazard"]
        hazard_intersection_file = os.path.join(
            hazard_asset_intersection_path,
            f"{asset_info.asset_gpkg}_splits__hazard_layers__{asset_info.asset_layer}.geoparquet",
        )
        if os.path.isfile(hazard_intersection_file):
            hazard_df = gpd.read_parquet(hazard_intersection_file)
            hazard_df = hazard_df.to_crs(epsg=epsg_jamaica)
            hazard_df = add_exposure_dimensions(
                hazard_df,
                dataframe_type=asset_info.asset_layer,
                epsg=epsg_jamaica,
            )
            hazard_df = hazard_df[
                [asset_id, "exposure", "exposure_unit"] + hazard_keys
            ]
            # keep only assets whose maximum flood depth across all flood
            # hazard layers exceeds the threshold
            hazard_df["max_flood_depth"] = hazard_df[hazard_keys].max(axis=1)
            hazard_df = hazard_df[hazard_df["max_flood_depth"] > flood_threshold]
            print(
                hazard_df.sort_values(by="max_flood_depth", ascending=False).head(20)
            )
        print(f"* Done with {asset_info.asset_gpkg}")
def msas(self):
    """Metropolitan Statistical Areas as drawn in 2020.

    Data come from the U.S. Census Bureau's most recent TIGER/LINE files
    https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2020&layergroup=Core+Based+Statistical+Areas

    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        2020 MSAs as a geodataframe or as a dataframe with geometry stored
        as well-known binary on the 'wkb' column.
    """
    try:
        return gpd.read_parquet(
            pathlib.Path(data_dir, "msas.parquet")
        ).sort_values(by="name")
    except Exception:
        return gpd.read_parquet(
            "s3://spatial-ucr/census/administrative/msas.parquet"
        ).sort_values(by="name")
def test_parquet_repeat_columns(tmpdir):
    """Reading repeated columns should return the first occurrence of each
    repeated column.
    """
    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)

    columns = ["name", "name", "iso_a3", "name", "geometry"]
    pq_df = read_parquet(filename, columns=columns)

    assert pq_df.columns.tolist() == ["name", "iso_a3", "geometry"]
def test_parquet_compression(compression, tmpdir):
    """Using compression options should not raise errors, and should return
    an identical GeoDataFrame.
    """
    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename, compression=compression)
    pq_df = read_parquet(filename)

    assert isinstance(pq_df, GeoDataFrame)
    assert_geodataframe_equal(df, pq_df)
def transform_sa_geometries(input_filepath: Path, output_filepath: Path) -> None:
    """Transform Small Area geometries.

    Args:
        input_filepath (Path): Path to raw Small Area geometries data
        output_filepath (Path): Path to clean Small Area geometries data
    """
    sa_geometries = (
        gpd.read_parquet(input_filepath)
        .pipe(extract_dublin_local_authorities)
        .to_crs("epsg:4326")
        .loc[:, ["SMALL_AREA", "geometry"]]
        .rename(columns={"SMALL_AREA": "small_area"})
    )
    sa_geometries.to_parquet(output_filepath)
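# A minimal usage sketch; the paths are hypothetical and
# extract_dublin_local_authorities is assumed to exist in the module.
from pathlib import Path

transform_sa_geometries(
    input_filepath=Path("data/raw/sa_geometries.parquet"),
    output_filepath=Path("data/clean/sa_geometries.parquet"),
)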
def test_parquet_missing_metadata(tmpdir):
    """Missing geo metadata, such as from a parquet file created from a
    pandas DataFrame, will raise a ValueError.
    """
    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    # convert to DataFrame
    df = DataFrame(df)

    # convert the geometry column so we can extract it later
    df["geometry"] = to_wkb(df["geometry"].values)

    filename = os.path.join(str(tmpdir), "test.pq")

    # use pandas to_parquet (no geo metadata)
    df.to_parquet(filename)

    # missing metadata will raise a ValueError
    with pytest.raises(
        ValueError, match="Missing geo metadata in Parquet/Feather file."
    ):
        read_parquet(filename)
def map(self, *args, **kwargs):
    """
    Fetches the map of `self.level` given the parameters.

    :param args: positional parameters for the geobr map-reading function
    :param kwargs: keyword parameters for the geobr map-reading function
    :return: GeoDataFrame
    """
    if os.path.exists(f"{self.level}_map.parquet"):
        self.mapdf = gpd.read_parquet(f"{self.level}_map.parquet")
        return self.mapdf
    if self.mapdf is None:
        print("Downloading the map...")
        self.mapdf = LEVELS[self.level](*args, **kwargs)
        self._persist("map")
    return self.mapdf
def test_parquet_missing_crs(tmpdir):
    """If the CRS is None, it should be properly handled and remain None
    when read back from parquet.
    """
    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))
    df.crs = None

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)
    pq_df = read_parquet(filename)

    assert pq_df.crs is None
    assert_geodataframe_equal(df, pq_df, check_crs=True)
def generate_populations(self, scale=0.05):
    """
    Generate a synthetic population of size scale * population for each
    polygon in self.mapdf.

    :param scale: fraction of the real population to synthesize
    """
    if os.path.exists(f"{self.level}_pop.parquet"):
        self.pop = gpd.read_parquet(f"{self.level}_pop.parquet")
        return
    if "population" not in self.mapdf.columns:
        self.demographics()

    # accumulate the synthetic people from every polygon; building the
    # GeoDataFrame inside the loop would keep only the last polygon
    people, sexes, ages = [], [], []
    for row in self.mapdf.itertuples():
        sampled = sample_random_people(int(row.population * scale), row.geometry)
        people.extend(sampled)
        sexes.extend(np.random.randint(0, 2, size=len(sampled)))
        ages.extend(np.random.randint(0, 100, size=len(sampled)))

    self.pop = gpd.GeoDataFrame({"sex": sexes, "age": ages, "geometry": people})
    self.pop["longitude"] = [pt.x for pt in self.pop.geometry]
    self.pop["latitude"] = [pt.y for pt in self.pop.geometry]
    self._persist("pop")
def test_parquet_multiple_geom_cols(tmpdir):
    """If multiple geometry columns are present when written to parquet,
    they should all be returned as such when read from parquet.
    """
    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))
    df["geom2"] = df.geometry.copy()

    filename = os.path.join(str(tmpdir), "test.pq")
    df.to_parquet(filename)

    assert os.path.exists(filename)

    pq_df = read_parquet(filename)

    assert isinstance(pq_df, GeoDataFrame)
    assert_geodataframe_equal(df, pq_df)
    assert_geoseries_equal(df.geom2, pq_df.geom2, check_geom_type=True)
def cache_function(*args, **kwargs):
    if not isinstance(cache_dir, Path):
        raise TypeError("cache_dir should be a pathlib.Path object")

    cache_file = cache_dir / (func.__name__ + ".trc.pqt")
    if hard_reset or (not cache_file.exists()):
        result = func(*args, **kwargs)
        if not isinstance(result, pd.DataFrame):
            raise TypeError(
                f"The result of computing {func.__name__} is not a DataFrame"
            )
        result.to_parquet(cache_file)
        return result

    print(f"{cache_file.name} exists")
    if geoformat:
        import geopandas as gpd

        result = gpd.read_parquet(cache_file)
    else:
        result = pd.read_parquet(cache_file)
    return result
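# cache_function reads like the inner wrapper of a caching decorator:
# cache_dir, hard_reset, func, and geoformat are all free variables from an
# enclosing scope. A sketch of that enclosing decorator, reconstructed from
# those free variables (the decorator's name and signature are assumptions):
import functools
from pathlib import Path

import pandas as pd


def cache_to_parquet(cache_dir, hard_reset=False, geoformat=False):
    def decorator(func):
        @functools.wraps(func)
        def cache_function(*args, **kwargs):
            if not isinstance(cache_dir, Path):
                raise TypeError("cache_dir should be a pathlib.Path object")
            cache_file = cache_dir / (func.__name__ + ".trc.pqt")
            if hard_reset or not cache_file.exists():
                result = func(*args, **kwargs)
                if not isinstance(result, pd.DataFrame):
                    raise TypeError(
                        f"The result of computing {func.__name__} is not a DataFrame"
                    )
                result.to_parquet(cache_file)
                return result
            if geoformat:
                import geopandas as gpd

                return gpd.read_parquet(cache_file)
            return pd.read_parquet(cache_file)

        return cache_function

    return decorator


# usage: repeated calls read the cached parquet instead of recomputing
# @cache_to_parquet(Path("./cache"), geoformat=True)
# def expensive_spatial_query():
#     ...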