Example #1
def elevation_byloc(coord: Tuple[float, float], crs: str = DEF_CRS):
    """Get elevation from USGS 3DEP service for a coordinate.

    Parameters
    ----------
    coord : tuple
        Coordinates of the location as a tuple, (x, y).
    crs : str, optional
        The spatial reference of the input coord, defaults to epsg:4326 (lon, lat)

    Returns
    -------
    float
        Elevation in meters
    """
    if not isinstance(coord, tuple) or len(coord) != 2:
        raise InvalidInputType("coord", "tuple of length 2", "(x, y)")

    lon, lat = MatchCRS.coords(([coord[0]], [coord[1]]), crs, DEF_CRS)

    url = "https://nationalmap.gov/epqs/pqs.php"
    payload = {"output": "json", "x": lon[0], "y": lat[0], "units": "Meters"}
    r = RetrySession().get(url, payload)
    root = r.json()["USGS_Elevation_Point_Query_Service"]
    elevation = float(root["Elevation_Query"]["Elevation"])

    if abs(elevation - (-1000000)) < 1e-3:
        raise ValueError(
            f"The elevation of the requested coordinate ({coord[0]}, {coord[1]}) cannot be found."
        )

    return elevation
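A minimal usage sketch for the function above, assuming elevation_byloc and its dependencies are in scope; the coordinate is illustrative only:

# Query the USGS 3DEP point service for one (lon, lat) pair in EPSG:4326 (the default CRS).
coord = (-89.22, 30.16)  # illustrative coordinate
elev_m = elevation_byloc(coord)
print(f"Elevation at {coord}: {elev_m:.1f} m")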
Example #2
File: pynhd.py Project: jsta/pynhd
def __init__(self, save_dir: Optional[str] = None) -> None:
    self.save_dir = Path(save_dir) if save_dir else Path(tempfile.gettempdir())
    if not self.save_dir.exists():
        os.makedirs(self.save_dir)
    self.session = RetrySession()
    self.nhd_attr_item = "5669a79ee4b08895842a1d47"
    self.char_feather = Path(self.save_dir, "nhdplus_attrs.feather")
Example #3
def ssebopeta_bygeom(
    geometry: Union[Polygon, Tuple[float, float, float, float]],
    dates: Union[Tuple[str, str], Union[int, List[int]]],
    geo_crs: str = DEF_CRS,
    fill_holes: bool = False,
) -> xr.DataArray:
    """Get daily actual ET for a region from SSEBop database.

    Notes
    -----
    Since there's still no web service available for subsetting SSEBop, the data first
    needs to be downloaded for the requested period and is then masked by the
    region of interest locally. Therefore, it's not as fast as other functions, and
    the bottleneck could be the download speed.

    Parameters
    ----------
    geometry : shapely.geometry.Polygon or tuple
        The geometry used for downloading and clipping the data. For a tuple bbox,
        the order should be (west, south, east, north).
    dates : tuple or list, optional
        Start and end dates as a tuple (start, end) or a list of years [2001, 2010, ...].
    geo_crs : str, optional
        The CRS of the input geometry, defaults to epsg:4326.
    fill_holes : bool, optional
        Whether to fill the holes in the geometry's interior (Polygon type), defaults to False.

    Returns
    -------
    xarray.DataArray
        Daily actual ET within a geometry in mm/day at 1 km resolution
    """
    _geometry = geoutils.geo2polygon(geometry, geo_crs, DEF_CRS)
    _geometry = Polygon(_geometry.exterior) if fill_holes else _geometry

    f_list = _get_ssebopeta_urls(dates)

    session = RetrySession()

    with session.onlyipv4():

        def _ssebop(url_stamped):
            dt, url = url_stamped
            resp = session.get(url)
            zfile = zipfile.ZipFile(io.BytesIO(resp.content))
            content = zfile.read(zfile.filelist[0].filename)
            ds = geoutils.gtiff2xarray({"eta": content}, _geometry, DEF_CRS)
            return dt, ds.expand_dims({"time": [dt]})

        resp_list = ogc.utils.threading(_ssebop, f_list, max_workers=4)
        data = xr.merge(
            OrderedDict(sorted(resp_list, key=lambda x: x[0])).values())

    eta = data.eta.copy()
    eta *= 1e-3
    eta.attrs.update({"units": "mm/day", "nodatavals": (np.nan, )})
    return eta
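A hedged usage sketch for ssebopeta_bygeom, assuming the function above is in scope; the bounding box and dates are illustrative only:

# Request daily actual ET for a small bbox (west, south, east, north) in EPSG:4326.
bbox = (-69.77, 45.07, -69.31, 45.45)
eta = ssebopeta_bygeom(bbox, dates=("2005-10-01", "2005-10-05"))
print(eta.dims, eta.attrs["units"])  # xarray.DataArray in mm/day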
Example #4
    def __init__(self) -> None:
        self.base_url = ServiceURL().restful.nldi
        self.session = RetrySession()

        resp = self.session.get("/".join([self.base_url,
                                          "linked-data"])).json()
        self.valid_fsources = {r["source"]: r["sourceName"] for r in resp}

        resp = self.session.get("/".join([self.base_url, "lookups"])).json()
        self.valid_chartypes = {r["type"]: r["typeName"] for r in resp}
Example #5
def __init__(self) -> None:
    self.session = RetrySession()
    self.base_url = "https://nid.sec.usace.army.mil/ords"
    self.headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",
    }
Example #6
def test_ipv4():
    url = (
        "https://edcintl.cr.usgs.gov/downloads/sciweb1/shared/uswem/web/conus"
        + "/eta/modis_eta/daily/downloads/det2004003.modisSSEBopETactual.zip")

    session = RetrySession()
    with session.onlyipv4():
        r = session.get(url)
        z = zipfile.ZipFile(io.BytesIO(r.content))
        fname = z.read(z.filelist[0].filename)

    assert sys.getsizeof(fname) == 4361682
Example #7
def ssebopeta_byloc(
    coords: Tuple[float, float],
    dates: Union[Tuple[str, str], Union[int, List[int]]],
) -> pd.DataFrame:
    """Daily actual ET for a location from SSEBop database in mm/day.

    Parameters
    ----------
    coords : tuple
        Longitude and latitude of the location of interest as a tuple (lon, lat)
    dates : tuple or list, optional
        Start and end dates as a tuple (start, end) or a list of years [2001, 2010, ...].

    Returns
    -------
    pandas.DataFrame
        Daily actual ET for a location
    """
    if isinstance(coords, tuple) and len(coords) == 2:
        lon, lat = coords
    else:
        raise InvalidInputType("coords", "tuple", "(lon, lat)")

    f_list = _get_ssebopeta_urls(dates)
    session = RetrySession()

    with session.onlyipv4():

        def _ssebop(urls):
            dt, url = urls
            r = session.get(url)
            z = zipfile.ZipFile(io.BytesIO(r.content))

            with rio.MemoryFile() as memfile:
                memfile.write(z.read(z.filelist[0].filename))
                with memfile.open() as src:
                    return {
                        "dt": dt,
                        "eta": [e[0] for e in src.sample([(lon, lat)])][0],
                    }

        eta_list = ogc.utils.threading(_ssebop, f_list, max_workers=4)
    eta = pd.DataFrame.from_records(eta_list)
    eta.columns = ["datetime", "eta (mm/day)"]
    eta = eta.set_index("datetime")
    return eta * 1e-3
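A hedged usage sketch for ssebopeta_byloc, assuming the function above is in scope; the point and dates are illustrative only:

# Daily actual ET at a single (lon, lat) point, returned as a pandas DataFrame.
eta = ssebopeta_byloc((-69.77, 45.07), dates=("2005-10-01", "2005-10-05"))
print(eta.head())  # "eta (mm/day)" indexed by datetime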
Example #8
def ssebopeta_bygeom(
    geometry: GTYPE,
    dates: Union[Tuple[str, str], Union[int, List[int]]],
    geo_crs: str = DEF_CRS,
) -> xr.DataArray:
    """Get daily actual ET for a region from SSEBop database.

    Notes
    -----
    Since there's still no web service available for subsetting SSEBop, the data first
    needs to be downloaded for the requested period and is then masked by the
    region of interest locally. Therefore, it's not as fast as other functions, and
    the bottleneck could be the download speed.

    Parameters
    ----------
    geometry : shapely.geometry.Polygon or tuple
        The geometry used for downloading and clipping the data. For a tuple bbox,
        the order should be (west, south, east, north).
    dates : tuple or list, optional
        Start and end dates as a tuple (start, end) or a list of years [2001, 2010, ...].
    geo_crs : str, optional
        The CRS of the input geometry, defaults to epsg:4326.

    Returns
    -------
    xarray.DataArray
        Daily actual ET within a geometry in mm/day at 1 km resolution
    """
    f_list = helpers.get_ssebopeta_urls(dates)
    if isinstance(geometry, (Polygon, MultiPolygon)):
        gtiff2xarray = tlz.partial(geoutils.gtiff2xarray,
                                   geometry=geometry,
                                   geo_crs=geo_crs)
    else:
        gtiff2xarray = tlz.partial(geoutils.gtiff2xarray)

    session = RetrySession()

    with patch("socket.has_ipv6", False):

        def _ssebop(t: pd.Timestamp, url: str) -> xr.DataArray:
            resp = session.get(url)
            zfile = zipfile.ZipFile(io.BytesIO(resp.content))
            content = zfile.read(zfile.filelist[0].filename)
            ds: xr.DataArray = gtiff2xarray(r_dict={"eta": content})
            return ds.expand_dims({"time": [t]})

        data = xr.merge(_ssebop(t, url) for t, url in f_list)
    eta: xr.DataArray = data.where(
        data.eta < data.eta.nodatavals[0]).eta.copy() * 1e-3
    eta.attrs.update({
        "units": "mm/day",
        "nodatavals": (np.nan, ),
        "crs": DEF_CRS,
        "long_name": "Actual ET"
    })
    return eta
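A hedged usage sketch for this variant of ssebopeta_bygeom with a Polygon input, assuming the function above and shapely are available; the geometry and year are illustrative only:

from shapely.geometry import Polygon

# A small illustrative polygon in EPSG:4326; a single year is also accepted for dates.
geom = Polygon([(-69.77, 45.07), (-69.31, 45.07), (-69.31, 45.45), (-69.77, 45.45)])
eta = ssebopeta_bygeom(geom, dates=2005)
print(eta.attrs)  # units, nodatavals, crs, and long_name set by the function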
Example #9
class NID:
    """Retrieve data from the National Inventory of Dams."""

    def __init__(self) -> None:
        self.session = RetrySession()
        self.base_url = "https://nid.sec.usace.army.mil/ords"
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "DNT": "1",
        }

    def get_xlsx(self) -> io.BytesIO:
        """Get the excel file that containes the dam data."""
        self.headers.update({"Cookie": "ORA_WWV_APP_105=ORA_WWV-QM2rrHvxwzROBqNYVD0WIlg2"})
        payload = {"InFileName": "NID2019_U.xlsx"}
        r = self.session.get(
            f"{self.base_url}/NID_R.DOWNLOADFILE", payload=payload, headers=self.headers
        )
        return io.BytesIO(r.content)

    def get_attrs(self, variables: List[str]) -> Dict[str, str]:
        """Get descriptions of the NID variables."""
        self.headers.update({"Cookie": "ORA_WWV_APP_105=ORA_WWV-iaBJjzLW1v3a1s1mXEub0S7R"})

        desc: Dict[str, str] = {}
        for v in variables:
            payload = {"p": f"105:10:10326760693796::NO::P10_COLUMN_NAME:{v}"}
            page = self.session.get(f"{self.base_url}/f", payload=payload, headers=self.headers)
            tables = pd.read_html(page.text)
            desc[v] = tables[0]["Field Definition"].values[0]

        return desc

    def get_codes(self) -> str:
        """Get the definitions of letter codes in NID database."""
        self.headers.update({"Cookie": "ORA_WWV_APP_105=ORA_WWV-Bk16kg_4BwSK2anC36B4XBQn"})
        payload = {"p": "105:21:16137342922753::NO:::"}
        page = self.session.get(f"{self.base_url}/f", payload=payload, headers=self.headers)
        return page.text
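A hedged usage sketch for the NID class above, assuming pandas plus an Excel engine (e.g., openpyxl) are installed; the variable names passed to get_attrs are illustrative only:

import pandas as pd

nid = NID()
dams = pd.read_excel(nid.get_xlsx())  # full inventory as a dataframe
desc = nid.get_attrs(["DAM_HEIGHT", "NID_STORAGE"])  # illustrative NID variable names
print(len(dams), desc)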
Example #10
def elevation_bycoords(coords: List[Tuple[float, float]],
                       crs: str = DEF_CRS) -> List[int]:
    """Get elevation from Airmap for a list of coordinates.

    Parameters
    ----------
    coords : list of tuples
        Coordinates of the locations as a list of (lon, lat) tuples.
    crs : str, optional
        The spatial reference of the input coords, defaults to epsg:4326 (lon, lat)

    Returns
    -------
    list of int
        Elevations in meters
    """
    if not isinstance(coords, (list, Iterator)):
        raise InvalidInputType("coords",
                               "list (or iterator) of tuples of length 2",
                               "[(x, y), ...]")

    if isinstance(coords, list) and any(len(c) != 2 for c in coords):
        raise InvalidInputType("coords", "list of tuples of length 2",
                               "[(x, y), ...]")

    coords_reproj = zip(*MatchCRS.coords(tuple(zip(*coords)), crs, DEF_CRS))
    coords_reproj = tlz.partition_all(100, coords_reproj)

    headers = {"Content-Type": "application/json", "charset": "utf-8"}
    elevations = []
    for chunk in coords_reproj:
        payload = {"points": ",".join(f"{lat},{lon}" for lon, lat in chunk)}
        resp = RetrySession().get(ServiceURL().restful.airmap,
                                  payload=payload,
                                  headers=headers)
        elevations.append(resp.json()["data"])

    return list(tlz.concat(elevations))
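A hedged usage sketch for elevation_bycoords, assuming the function above is in scope; the coordinates are illustrative only:

# Batch elevation lookup for (lon, lat) pairs in EPSG:4326 (requests are chunked 100 points at a time).
coords = [(-89.22, 30.16), (-104.99, 39.74)]
elevations = elevation_bycoords(coords)
print(dict(zip(coords, elevations)))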
Example #11
    def __init__(
        self,
        variables: Optional[Union[List[str], str]] = None,
        pet: bool = False,
    ) -> None:
        self.session = RetrySession()

        vars_table = pd.read_html("https://daymet.ornl.gov/overview")[1]

        self.units = dict(zip(vars_table["Abbr"], vars_table["Units"]))

        valid_variables = vars_table.Abbr.to_list()
        if variables is None:
            self.variables = valid_variables
        else:
            self.variables = variables if isinstance(variables,
                                                     list) else [variables]

            if not set(self.variables).issubset(set(valid_variables)):
                raise InvalidInputValue("variables", valid_variables)

            if pet:
                reqs = ("tmin", "tmax", "vp", "srad", "dayl")
                self.variables = list(set(reqs) | set(self.variables))
Example #12
File: pynhd.py Project: jsta/pynhd
class NLDI:
    """Access the Hydro Network-Linked Data Index (NLDI) service."""
    def __init__(self) -> None:
        self.base_url = ServiceURL().restful.nldi
        self.session = RetrySession()

        resp = self.session.get("/".join([self.base_url,
                                          "linked-data"])).json()
        self.valid_fsources = {r["source"]: r["sourceName"] for r in resp}

        resp = self.session.get("/".join([self.base_url, "lookups"])).json()
        self.valid_chartypes = {r["type"]: r["typeName"] for r in resp}

    @staticmethod
    def _missing_warning(n_miss: int, n_tot: int) -> None:
        """Show a warning if there are misssing features."""
        logger.warning(" ".join([
            f"{n_miss} of {n_tot} inputs didn't return any features.",
            "They are returned as a list.",
        ]))

    def getfeature_byid(
        self, fsource: str, fid: Union[str, List[str]]
    ) -> Union[gpd.GeoDataFrame, Tuple[gpd.GeoDataFrame, List[str]]]:
        """Get feature(s) based ID(s).

        Parameters
        ----------
        fsource : str
            The name of feature(s) source. The valid sources are:
            comid, huc12pp, nwissite, wade, wqp
        fid : str or list
            Feature ID(s).

        Returns
        -------
        geopandas.GeoDataFrame or (geopandas.GeoDataFrame, list)
            NLDI indexed features in EPSG:4326. If some IDs don't return any features,
            a list of the missing ID(s) is returned as well.
        """
        self._validate_fsource(fsource)
        fid = fid if isinstance(fid, list) else [fid]
        urls = {
            f: "/".join([self.base_url, "linked-data", fsource, f])
            for f in fid
        }
        features, not_found = self._get_urls(urls)

        if len(not_found) > 0:
            self._missing_warning(len(not_found), len(fid))
            return features, not_found

        return features

    def comid_byloc(
        self,
        coords: Union[Tuple[float, float], List[Tuple[float, float]]],
        loc_crs: str = DEF_CRS,
    ) -> Union[gpd.GeoDataFrame, Tuple[gpd.GeoDataFrame, List[Tuple[float,
                                                                    float]]]]:
        """Get the closest ComID(s) based on coordinates.

        Parameters
        ----------
        coords : tuple or list
            A tuple of length two (x, y) or a list of them.
        loc_crs : str, optional
            The spatial reference of the input coordinate, defaults to EPSG:4326.

        Returns
        -------
        geopandas.GeoDataFrame or (geopandas.GeoDataFrame, list)
            NLDI indexed ComID(s) in EPSG:4326. If some coords don't return any ComID,
            a list of the missing coords is returned as well.
        """
        coords = coords if isinstance(coords, list) else [coords]
        coords_4326 = list(
            zip(*MatchCRS.coords(tuple(zip(*coords)), loc_crs, DEF_CRS)))

        base_url = "/".join(
            [self.base_url, "linked-data", "comid", "position"])
        urls = {(coords[i][0], coords[i][1]):
                f"{base_url}?coords=POINT({lon} {lat})"
                for i, (lon, lat) in enumerate(coords_4326)}
        comids, not_found = self._get_urls(urls)
        comids = comids.reset_index(level=2, drop=True)

        if len(not_found) > 0:
            self._missing_warning(len(not_found), len(coords))
            return comids, not_found

        return comids

    def get_basins(
        self, station_ids: Union[str, List[str]]
    ) -> Union[gpd.GeoDataFrame, Tuple[gpd.GeoDataFrame, List[str]]]:
        """Get basins for a list of station IDs.

        Parameters
        ----------
        station_ids : str or list
            USGS station ID(s).

        Returns
        -------
        geopandas.GeoDataFrame or (geopandas.GeoDataFrame, list)
            NLDI indexed basins in EPSG:4326. If some IDs don't return any features,
            a list of the missing ID(s) is returned as well.
        """
        station_ids = station_ids if isinstance(station_ids,
                                                list) else [station_ids]
        urls = {
            s: f"{self.base_url}/linked-data/nwissite/USGS-{s}/basin"
            for s in station_ids
        }
        basins, not_found = self._get_urls(urls)
        basins = basins.reset_index(level=1, drop=True)
        basins.index.rename("identifier", inplace=True)

        if len(not_found) > 0:
            self._missing_warning(len(not_found), len(station_ids))
            return basins, not_found

        return basins

    def getcharacteristic_byid(
        self,
        comids: Union[List[str], str],
        char_type: str,
        char_ids: Union[str, List[str]] = "all",
        values_only: bool = True,
    ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
        """Get characteristics using a list ComIDs.

        Parameters
        ----------
        comids : str or list
            The ID of the feature.
        char_type : str
            Type of the characteristic. Valid values are ``local`` for
            individual reach catchments, ``tot`` for network-accumulated values
            using total cumulative drainage area, and ``div`` for network-accumulated
            values using divergence-routed cumulative drainage area.
        char_ids : str or list, optional
            Name(s) of the target characteristics, defaults to ``all``.
        values_only : bool, optional
            Whether to return only ``characteristic_value`` as a series, defaults to True.
            If set to False, ``percent_nodata`` is returned as well.

        Returns
        -------
        pandas.DataFrame or tuple of pandas.DataFrame
            Either only ``characteristic_value`` as a dataframe or,
            if ``values_only`` is False, ``percent_nodata`` as well.
        """
        if char_type not in self.valid_chartypes:
            valids = [
                f'"{s}" for {d}' for s, d in self.valid_chartypes.items()
            ]
            raise InvalidInputValue("char", valids)

        comids = comids if isinstance(comids, list) else [comids]
        v_dict, nd_dict = {}, {}

        if char_ids == "all":
            payload = None
        else:
            _char_ids = char_ids if isinstance(char_ids, list) else [char_ids]
            valid_charids = self.get_validchars(char_type)

            idx = valid_charids.index
            if any(c not in idx for c in _char_ids):
                vids = valid_charids["characteristic_description"]
                raise InvalidInputValue(
                    "char_id", [f'"{s}" for {d}' for s, d in vids.items()])
            payload = {"characteristicId": ",".join(_char_ids)}

        for comid in comids:
            url = "/".join(
                [self.base_url, "linked-data", "comid", comid, char_type])
            rjson = self._get_url(url, payload)
            char = pd.DataFrame.from_dict(rjson["characteristics"],
                                          orient="columns").T
            char.columns = char.iloc[0]
            char = char.drop(index="characteristic_id")

            v_dict[comid] = char.loc["characteristic_value"]
            if values_only:
                continue

            nd_dict[comid] = char.loc["percent_nodata"]

        def todf(df_dict: Dict[str, pd.DataFrame]) -> pd.DataFrame:
            df = pd.DataFrame.from_dict(df_dict, orient="index")
            df[df == ""] = np.nan
            df.index = df.index.astype("int64")
            return df.astype("f4")

        chars = todf(v_dict)
        if values_only:
            return chars

        return chars, todf(nd_dict)

    def get_validchars(self, char_type: str) -> pd.DataFrame:
        """Get all the avialable characteristics IDs for a give characteristics type."""
        resp = self.session.get("/".join(
            [self.base_url, "lookups", char_type, "characteristics"]))
        c_list = ogc.utils.traverse_json(
            resp.json(), ["characteristicMetadata", "characteristic"])
        return pd.DataFrame.from_dict(
            {c.pop("characteristic_id"): c
             for c in c_list}, orient="index")

    def navigate_byid(
        self,
        fsource: str,
        fid: str,
        navigation: str,
        source: str,
        distance: int = 500,
    ) -> gpd.GeoDataFrame:
        """Navigate the NHDPlus databse from a single feature id up to a distance.

        Parameters
        ----------
        fsource : str
            The name of feature source. The valid sources are:
            comid, huc12pp, nwissite, wade, WQP.
        fid : str
            The ID of the feature.
        navigation : str
            The navigation method.
        source : str
            Return the data from another source after navigating
            the features using fsource.
        distance : int, optional
            Limit the search for navigation up to a distance in km,
            defaults to 500 km. Note that this is an expensive request, so you
            have to be mindful of the value that you provide.

        Returns
        -------
        geopandas.GeoDataFrame
            NLDI indexed features in EPSG:4326.
        """
        self._validate_fsource(fsource)

        url = "/".join(
            [self.base_url, "linked-data", fsource, fid, "navigation"])

        valid_navigations = self._get_url(url)
        if navigation not in valid_navigations.keys():
            raise InvalidInputValue("navigation",
                                    list(valid_navigations.keys()))

        url = valid_navigations[navigation]

        r_json = self._get_url(url)
        valid_sources = {s["source"].lower(): s["features"]
                         for s in r_json}  # type: ignore
        if source not in valid_sources:
            raise InvalidInputValue("source", list(valid_sources.keys()))

        url = f"{valid_sources[source]}?distance={int(distance)}"

        return geoutils.json2geodf(self._get_url(url), ALT_CRS, DEF_CRS)

    def navigate_byloc(
        self,
        coords: Tuple[float, float],
        navigation: Optional[str] = None,
        source: Optional[str] = None,
        loc_crs: str = DEF_CRS,
        distance: int = 500,
    ) -> gpd.GeoDataFrame:
        """Navigate the NHDPlus databse from a coordinate.

        Parameters
        ----------
        coords : tuple
            A tuple of length two (x, y).
        navigation : str, optional
            The navigation method, defaults to None which raises an exception.
        source : str, optional
            Return the data from another source after navigating
            the features using fsource, defaults to None which raises an exception.
        loc_crs : str, optional
            The spatial reference of the input coordinate, defaults to EPSG:4326.
        distance : int, optional
            Limit the search for navigation up to a distance in km,
            defaults to 500 km. Note that this is an expensive request, so you
            have to be mindful of the value that you provide. If you want to get
            all the available features you can pass a large distance like 9999999.

        Returns
        -------
        geopandas.GeoDataFrame
            NLDI indexed features in EPSG:4326.
        """
        _coords = MatchCRS().coords(((coords[0], ), (coords[1], )), loc_crs,
                                    DEF_CRS)
        lon, lat = _coords[0][0], _coords[1][0]

        url = "/".join([self.base_url, "linked-data", "comid", "position"])
        payload = {"coords": f"POINT({lon} {lat})"}
        rjson = self._get_url(url, payload)
        comid = geoutils.json2geodf(rjson, ALT_CRS, DEF_CRS).comid.iloc[0]

        if navigation is None or source is None:
            raise MissingItems(["navigation", "source"])

        return self.navigate_byid("comid", comid, navigation, source, distance)

    def _validate_fsource(self, fsource: str) -> None:
        """Check if the given feature source is valid."""
        if fsource not in self.valid_fsources:
            valids = [f'"{s}" for {d}' for s, d in self.valid_fsources.items()]
            raise InvalidInputValue("feature source", valids)

    def _get_urls(self, urls: Dict[Any,
                                   str]) -> Tuple[gpd.GeoDataFrame, List[str]]:
        """Get basins for a list of station IDs.

        Parameters
        ----------
        urls_dict : dict
            A dict with keys as feature ids and values as corresponsing url.

        Returns
        -------
        (geopandas.GeoDataFrame, list)
            NLDI indexed features in EPSG:4326 and list of ID(s) that no feature was found.
        """
        not_found = []
        resp = []
        for f, u in urls.items():
            try:
                rjson = self._get_url(u)
                resp.append((f, geoutils.json2geodf(rjson, ALT_CRS, DEF_CRS)))
            except (ZeroMatched, JSONDecodeError, ConnectionError):
                not_found.append(f)

        if len(resp) == 0:
            raise ZeroMatched("No feature was found with the provided inputs.")

        resp_df = gpd.GeoDataFrame(pd.concat(dict(resp)))

        return resp_df, not_found

    def _get_url(self,
                 url: str,
                 payload: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
        """Send a request to the service using GET method."""
        if payload:
            payload.update({"f": "json"})
        else:
            payload = {"f": "json"}

        try:
            return self.session.get(url, payload).json()
        except JSONDecodeError:
            raise ZeroMatched("No feature was found with the provided inputs.")
        except ConnectionError:
            raise ConnectionError(
                "NLDI server cannot be reached at the moment.")
Example #13
def nlcd_helper() -> Dict[str, Any]:
    """Get legends and properties of the NLCD cover dataset.

    Notes
    -----
    The following references have been used:
        - https://github.com/jzmiller1/nlcd
        - https://www.mrlc.gov/data-services-page
        - https://www.mrlc.gov/data/legends/national-land-cover-database-2016-nlcd2016-legend
    """
    url = ("https://www.mrlc.gov/downloads/sciweb1/shared/mrlc/metadata/" +
           "NLCD_2016_Land_Cover_Science_product_L48.xml")
    r = RetrySession().get(url)

    root = ET.fromstring(r.content)

    clist = root[4][1][1].text.split("\n")[2:]
    _colors = [i.split() for i in clist]
    colors = {int(c): (float(r), float(g), float(b)) for c, r, g, b in _colors}

    classes = {
        root[4][0][3][i][0][0].text:
        root[4][0][3][i][0][1].text.split("-")[0].strip()
        for i in range(3, len(root[4][0][3]))
    }

    nlcd_meta = {
        "impervious_years": [2016, 2011, 2006, 2001],
        "canopy_years": [2016, 2011],
        "cover_years": [2016, 2013, 2011, 2008, 2006, 2004, 2001],
        "classes": classes,
        "categories": {
            "Unclassified": ("0"),
            "Water": ("11", "12"),
            "Developed": ("21", "22", "23", "24"),
            "Barren": ("31", ),
            "Forest": ("41", "42", "43", "45", "46"),
            "Shrubland": ("51", "52"),
            "Herbaceous": ("71", "72", "73", "74"),
            "Planted/Cultivated": ("81", "82"),
            "Wetlands": ("90", "95"),
        },
        "roughness": {
            "11": 0.001,
            "12": 0.022,
            "21": 0.0404,
            "22": 0.0678,
            "23": 0.0678,
            "24": 0.0404,
            "31": 0.0113,
            "41": 0.36,
            "42": 0.32,
            "43": 0.4,
            "45": 0.4,
            "46": 0.24,
            "51": 0.24,
            "52": 0.4,
            "71": 0.368,
            "72": np.nan,
            "81": 0.325,
            "82": 0.16,
            "90": 0.086,
            "95": 0.1825,
        },
        "colors": colors,
    }

    return nlcd_meta
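A hedged usage sketch for nlcd_helper, assuming the function above is in scope; the class codes used below follow the NLCD legend parsed by the function:

meta = nlcd_helper()
print(meta["cover_years"])      # years with land-cover data
print(meta["classes"]["11"])    # description of NLCD class 11 (open water)
print(meta["roughness"]["41"])  # roughness coefficient for class 41 (deciduous forest)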
Example #14
class NLDI:
    """Access the Hydro Network-Linked Data Index (NLDI) service."""
    def __init__(self) -> None:
        self.base_url = ServiceURL().restful.nldi
        self.session = RetrySession()

        resp = self.session.get("/".join([self.base_url,
                                          "linked-data"])).json()
        self.valid_fsources = {r["source"]: r["sourceName"] for r in resp}

        resp = self.session.get("/".join([self.base_url, "lookups"])).json()
        self.valid_chartypes = {r["type"]: r["typeName"] for r in resp}

    def getfeature_byid(self,
                        fsource: str,
                        fid: str,
                        basin: bool = False) -> gpd.GeoDataFrame:
        """Get features of a single id.

        Parameters
        ----------
        fsource : str
            The name of feature source. The valid sources are:
            comid, huc12pp, nwissite, wade, wqp
        fid : str
            The ID of the feature.
        basin : bool
            Whether to return the basin containing the feature.

        Returns
        -------
        geopandas.GeoDataFrame
            NLDI indexed features in EPSG:4326.
        """
        self._validate_fsource(fsource)

        url = "/".join([self.base_url, "linked-data", fsource, fid])
        if basin:
            url += "/basin"

        return geoutils.json2geodf(self._geturl(url), ALT_CRS, DEF_CRS)

    def getcharacteristic_byid(
        self,
        comids: Union[List[str], str],
        char_type: str,
        char_ids: str = "all",
        values_only: bool = True,
    ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
        """Get characteristics using a list ComIDs.

        Parameters
        ----------
        comids : str or list
            The ID of the feature.
        char_type : str
            Type of the characteristic. Valid values are ``local`` for
            individual reach catchments, ``tot`` for network-accumulated values
            using total cumulative drainage area, and ``div`` for network-accumulated
            values using divergence-routed cumulative drainage area.
        char_ids : str or list, optional
            Name(s) of the target characteristics, defaults to ``all``.
        values_only : bool, optional
            Whether to return only ``characteristic_value`` as a series, defaults to True.
            If set to False, ``percent_nodata`` is returned as well.

        Returns
        -------
        pandas.DataFrame or tuple of pandas.DataFrame
            Either only ``characteristic_value`` as a dataframe or,
            if ``values_only`` is False, ``percent_nodata`` as well.
        """
        if char_type not in self.valid_chartypes:
            valids = [
                f'"{s}" for {d}' for s, d in self.valid_chartypes.items()
            ]
            raise InvalidInputValue("char", valids)

        comids = comids if isinstance(comids, list) else [comids]
        v_dict, nd_dict = {}, {}

        if char_ids == "all":
            payload = None
        else:
            _char_ids = char_ids if isinstance(char_ids, list) else [char_ids]
            valid_charids = self.get_validchars(char_type)

            idx = valid_charids.index
            if any(c not in idx for c in _char_ids):
                vids = valid_charids["characteristic_description"]
                raise InvalidInputValue(
                    "char_id", [f'"{s}" for {d}' for s, d in vids.items()])
            payload = {"characteristicId": ",".join(_char_ids)}

        for comid in comids:
            url = "/".join(
                [self.base_url, "linked-data", "comid", comid, char_type])
            rjson = self._geturl(url, payload)
            char = pd.DataFrame.from_dict(rjson["characteristics"],
                                          orient="columns").T
            char.columns = char.iloc[0]
            char = char.drop(index="characteristic_id")

            v_dict[comid] = char.loc["characteristic_value"]
            if values_only:
                continue

            nd_dict[comid] = char.loc["percent_nodata"]

        def todf(df_dict: Dict[str, pd.DataFrame]) -> pd.DataFrame:
            df = pd.DataFrame.from_dict(df_dict, orient="index")
            df[df == ""] = np.nan
            df.index = df.index.astype("int64")
            return df.astype("f4")

        chars = todf(v_dict)
        if values_only:
            return chars

        return chars, todf(nd_dict)

    def get_validchars(self, char_type: str) -> pd.DataFrame:
        """Get all the avialable characteristics IDs for a give characteristics type."""
        resp = self.session.get("/".join(
            [self.base_url, "lookups", char_type, "characteristics"]))
        c_list = ogc.utils.traverse_json(
            resp.json(), ["characteristicMetadata", "characteristic"])
        return pd.DataFrame.from_dict(
            {c.pop("characteristic_id"): c
             for c in c_list}, orient="index")

    def navigate_byid(
        self,
        fsource: str,
        fid: str,
        navigation: str,
        source: str,
        distance: int = 500,
    ) -> gpd.GeoDataFrame:
        """Navigate the NHDPlus databse from a single feature id up to a distance.

        Parameters
        ----------
        fsource : str
            The name of feature source. The valid sources are:
            comid, huc12pp, nwissite, wade, WQP.
        fid : str
            The ID of the feature.
        navigation : str
            The navigation method.
        source : str
            Return the data from another source after navigating
            the features using fsource.
        distance : int, optional
            Limit the search for navigation up to a distance in km,
            defaults to 500 km. Note that this is an expensive request, so you
            have to be mindful of the value that you provide.

        Returns
        -------
        geopandas.GeoDataFrame
            NLDI indexed features in EPSG:4326.
        """
        self._validate_fsource(fsource)

        url = "/".join(
            [self.base_url, "linked-data", fsource, fid, "navigation"])

        valid_navigations = self._geturl(url)
        if navigation not in valid_navigations.keys():
            raise InvalidInputValue("navigation",
                                    list(valid_navigations.keys()))

        url = valid_navigations[navigation]

        r_json = self._geturl(url)
        valid_sources = {s["source"].lower(): s["features"] for s in r_json}
        if source not in valid_sources:
            raise InvalidInputValue("source", list(valid_sources.keys()))

        url = f"{valid_sources[source]}?distance={int(distance)}"

        return geoutils.json2geodf(self._geturl(url), ALT_CRS, DEF_CRS)

    def navigate_byloc(
        self,
        coords: Tuple[float, float],
        navigation: Optional[str] = None,
        source: Optional[str] = None,
        loc_crs: str = DEF_CRS,
        distance: int = 500,
        comid_only: bool = False,
    ) -> gpd.GeoDataFrame:
        """Navigate the NHDPlus databse from a coordinate.

        Parameters
        ----------
        coords : tuple
            A tuple of length two (x, y).
        navigation : str, optional
            The navigation method, defaults to None which throws an exception
            if comid_only is False.
        source : str, optional
            Return the data from another source after navigating
            the features using fsource, defaults to None which throws an exception
            if comid_only is False.
        loc_crs : str, optional
            The spatial reference of the input coordinate, defaults to EPSG:4326.
        distance : int, optional
            Limit the search for navigation up to a distance in km,
            defaults to 500 km. Note that this is an expensive request, so you
            have to be mindful of the value that you provide. If you want to get
            all the available features you can pass a large distance like 9999999.
        comid_only : bool, optional
            Whether to return the nearest comid without navigation.

        Returns
        -------
        geopandas.GeoDataFrame
            NLDI indexed features in EPSG:4326.
        """
        _coords = MatchCRS().coords(((coords[0], ), (coords[1], )), loc_crs,
                                    DEF_CRS)
        lon, lat = _coords[0][0], _coords[1][0]

        url = "/".join([self.base_url, "linked-data", "comid", "position"])
        payload = {"coords": f"POINT({lon} {lat})"}
        rjson = self._geturl(url, payload)
        comid = geoutils.json2geodf(rjson, ALT_CRS, DEF_CRS).comid.iloc[0]

        if comid_only:
            return comid

        if navigation is None or source is None:
            raise MissingItems(["navigation", "source"])

        return self.navigate_byid("comid", comid, navigation, source, distance)

    def characteristics_dataframe(
        self,
        char_type: str,
        char_id: str,
        filename: Optional[str] = None,
        metadata: bool = False,
    ) -> Union[Dict[str, Any], pd.DataFrame]:
        """Get a NHDPlus-based characteristic from sciencebase.gov as dataframe.

        Parameters
        ----------
        char_type : str
            Characteristic type. Valid values are ``local`` for
            individual reach catchments, ``tot`` for network-accumulated values
            using total cumulative drainage area and ``div`` for network-accumulated values
            using divergence-routed.
        char_id : str
            Characteristic ID.
        filename : str, optional
            File name, defaults to None, which raises an error and shows
            a list of available files.
        metadata : bool
            Whether to only return the metadata for the selected characteristic,
            defaults to False. Useful for getting information about the dataset
            such as citation, units, column names, etc.

        Returns
        -------
        pandas.DataFrame or dict
            The requested characteristic as a dataframe or if ``metadata`` is True
            the metadata as a dictionary.
        """
        if char_type not in self.valid_chartypes:
            valids = [
                f'"{s}" for {d}' for s, d in self.valid_chartypes.items()
            ]
            raise InvalidInputValue("char", valids)

        valid_charids = self.get_validchars(char_type)

        if char_id not in valid_charids.index:
            vids = valid_charids["characteristic_description"]
            raise InvalidInputValue(
                "char_id", [f'"{s}" for {d}' for s, d in vids.items()])

        meta = self.session.get(valid_charids.loc[char_id, "dataset_url"], {
            "format": "json"
        }).json()
        if metadata:
            return meta

        flist = {
            f["name"]: f["downloadUri"]
            for f in meta["files"] if f["name"].split(".")[-1] == "zip"
        }
        if filename not in flist:
            raise InvalidInputValue("filename", list(flist.keys()))

        return pd.read_csv(flist[filename], compression="zip")

    def _validate_fsource(self, fsource: str) -> None:
        """Check if the given feature source is valid."""
        if fsource not in self.valid_fsources:
            valids = [f'"{s}" for {d}' for s, d in self.valid_fsources.items()]
            raise InvalidInputValue("feature source", valids)

    def _geturl(self, url: str, payload: Optional[Dict[str, str]] = None):
        """Send a request to the service using GET method."""
        if payload is None:
            payload = {"f": "json"}
        else:
            payload.update({"f": "json"})

        try:
            return self.session.get(url, payload).json()
        except JSONDecodeError:
            raise ZeroMatched("No feature was found with the provided inputs.")
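A hedged usage sketch for the characteristic-related methods of this NLDI variant, assuming the class above is in scope; the ComID and characteristic ID are illustrative only:

nldi = NLDI()

# Catchment-level (local) value of one characteristic for a single ComID.
local_chars = nldi.getcharacteristic_byid("6710923", "local", char_ids="CAT_BFI")
print(local_chars.head())

# Metadata of the same characteristic from its ScienceBase dataset.
meta = nldi.characteristics_dataframe("local", "CAT_BFI", metadata=True)
print(meta.get("title"))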
Example #15
def post_connection_error():
    url = "https://somefailedurl.com"
    s = RetrySession(retries=2)
    s.post(url)
Example #16
def ssebopeta_bycoords(
    coords: pd.DataFrame,
    dates: Union[Tuple[str, str], Union[int, List[int]]],
    crs: str = DEF_CRS,
) -> xr.Dataset:
    """Daily actual ET for a dataframe of coords from SSEBop database in mm/day.

    Parameters
    ----------
    coords : pandas.DataFrame
        A dataframe with ``id``, ``x``, ``y`` columns.
    dates : tuple or list, optional
        Start and end dates as a tuple (start, end) or a list of years [2001, 2010, ...].
    crs : str, optional
        The CRS of the input coordinates, defaults to epsg:4326.

    Returns
    -------
    xarray.Dataset
        Daily actual ET in mm/day as a dataset with ``time`` and ``location_id`` dimensions.
        The ``location_id`` dimension is the same as the ``id`` column in the input dataframe.
    """
    if not isinstance(coords, pd.DataFrame):
        raise InvalidInputType("coords", "pandas.DataFrame")

    req_cols = ["id", "x", "y"]
    if not set(req_cols).issubset(coords.columns):
        raise MissingColumns(req_cols)

    _coords = gpd.GeoSeries(gpd.points_from_xy(coords["x"], coords["y"]),
                            index=coords["id"],
                            crs=crs)
    _coords = _coords.to_crs(DEF_CRS)
    co_list = list(zip(_coords.x, _coords.y))

    f_list = helpers.get_ssebopeta_urls(dates)
    session = RetrySession()

    with patch("socket.has_ipv6", False):

        def _ssebop(url: str) -> List[np.ndarray]:  # type: ignore
            r = session.get(url)
            z = zipfile.ZipFile(io.BytesIO(r.content))

            with rio.MemoryFile() as memfile:
                memfile.write(z.read(z.filelist[0].filename))
                with memfile.open() as src:
                    return list(src.sample(co_list))

        time, eta = zip(*[(t, _ssebop(url)) for t, url in f_list])
    eta_arr = np.array(eta).reshape(len(time), -1)  # type: ignore
    ds = xr.Dataset(
        data_vars={
            "eta": (["time", "location_id"], eta_arr),
            "x": (["location_id"], coords["x"].to_numpy()),
            "y": (["location_id"], coords["y"].to_numpy()),
        },
        coords={
            "time": np.array(time, dtype="datetime64[ns]"),
            "location_id": coords["id"].to_numpy(),
        },
    )
    ds["eta"] = ds["eta"].where(ds["eta"] != 9999, np.nan) * 1e-3
    ds.eta.attrs = {
        "units": "mm/day",
        "long_name": "Actual ET",
        "nodatavals": (np.nan, ),
    }
    ds.x.attrs = {"crs": pyproj.CRS(crs).to_string()}
    ds.y.attrs = {"crs": pyproj.CRS(crs).to_string()}
    return ds
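A hedged usage sketch for ssebopeta_bycoords, assuming the function above and pandas are in scope; the site IDs and coordinates are illustrative only:

import pandas as pd

coords = pd.DataFrame(
    {"id": ["site_a", "site_b"], "x": [-69.77, -69.31], "y": [45.07, 45.45]}
)
ds = ssebopeta_bycoords(coords, dates=("2005-10-01", "2005-10-05"))
print(ds.eta.sel(location_id="site_a"))  # daily ET series for one site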
Example #17
class NWIS:
    """Access NWIS web service."""

    def __init__(self):
        self.session = RetrySession()
        self.url = ServiceURL().restful.nwis

    @staticmethod
    def query_byid(ids: Union[str, List[str]]) -> Dict[str, str]:
        """Generate the geometry keys and values of an ArcGISRESTful query."""
        if not isinstance(ids, (str, list)):
            raise InvalidInputType("ids", "str or list")

        ids = [str(i) for i in ids] if isinstance(ids, list) else [str(ids)]
        query = {"sites": ",".join(ids)}

        return query

    @staticmethod
    def query_bybox(bbox: Tuple[float, float, float, float]) -> Dict[str, str]:
        """Generate the geometry keys and values of an ArcGISRESTful query."""
        geoutils.check_bbox(bbox)
        query = {"bBox": ",".join(f"{b:.06f}" for b in bbox)}

        return query

    def get_info(self, query: Dict[str, str], expanded: bool = False) -> pd.DataFrame:
        """Get NWIS stations by a list of IDs or within a bounding box.

        Only stations that record(ed) daily streamflow data are returned.
        The following columns are included in the dataframe with expanded
        set to False:

        ==================  ==================================
        Name                Description
        ==================  ==================================
        site_no             Site identification number
        station_nm          Site name
        site_tp_cd          Site type
        dec_lat_va          Decimal latitude
        dec_long_va         Decimal longitude
        coord_acy_cd        Latitude-longitude accuracy
        dec_coord_datum_cd  Decimal Latitude-longitude datum
        alt_va              Altitude of Gage/land surface
        alt_acy_va          Altitude accuracy
        alt_datum_cd        Altitude datum
        huc_cd              Hydrologic unit code
        parm_cd             Parameter code
        stat_cd             Statistical code
        ts_id               Internal timeseries ID
        loc_web_ds          Additional measurement description
        medium_grp_cd       Medium group code
        parm_grp_cd         Parameter group code
        srs_id              SRS ID
        access_cd           Access code
        begin_date          Begin date
        end_date            End date
        count_nu            Record count
        hcdn_2009           Whether the site is in HCDN-2009
        ==================  ==================================

        Parameters
        ----------
        query : dict
            A dictionary containing query by IDs or BBOX. Use ``query_byid`` or ``query_bybox``
            class methods to generate the queries.
        expanded : bool, optional
            Whether to get expanded site information, for example drainage area.

        Returns
        -------
        pandas.DataFrame
            NWIS stations
        """
        if not isinstance(query, dict):
            raise InvalidInputType("query", "dict")

        output_type = [{"outputDataTypeCd": "dv"}]
        if expanded:
            output_type.append({"siteOutput": "expanded"})

        site_list = []
        for t in output_type:
            payload = {
                **query,
                **t,
                "format": "rdb",
                "parameterCd": "00060",
                "siteStatus": "all",
                "hasDataTypeCd": "dv",
            }

            resp = self.session.post(f"{self.url}/site", payload).text.split("\n")

            r_list = [txt.split("\t") for txt in resp if "#" not in txt]
            r_dict = [dict(zip(r_list[0], st)) for st in r_list[2:]]

            site_list.append(pd.DataFrame.from_dict(r_dict).dropna())

        if expanded:
            sites = pd.merge(
                *site_list, on="site_no", how="outer", suffixes=("", "_overlap")
            ).filter(regex="^(?!.*_overlap)")
        else:
            sites = site_list[0]

        sites = sites.drop(sites[sites.alt_va == ""].index)
        try:
            sites = sites[sites.parm_cd == "00060"]
            sites["begin_date"] = pd.to_datetime(sites["begin_date"])
            sites["end_date"] = pd.to_datetime(sites["end_date"])
        except AttributeError:
            pass

        float_cols = ["dec_lat_va", "dec_long_va", "alt_va", "alt_acy_va"]
        if expanded:
            float_cols += ["drain_area_va", "contrib_drain_area_va"]

        sites[float_cols] = sites[float_cols].apply(lambda x: pd.to_numeric(x, errors="coerce"))

        sites = sites[sites.site_no.apply(len) == 8]

        gii = WaterData("gagesii", DEF_CRS)
        hcdn = gii.byid("staid", sites.site_no.tolist())
        hcdn_dict = hcdn[["staid", "hcdn_2009"]].set_index("staid").hcdn_2009.to_dict()
        sites["hcdn_2009"] = sites.site_no.apply(
            lambda x: len(hcdn_dict[x]) > 0 if x in hcdn_dict.keys() else False
        )

        return sites

    def get_streamflow(
        self, station_ids: Union[List[str], str], dates: Tuple[str, str], mmd: bool = False
    ) -> pd.DataFrame:
        """Get daily streamflow observations from USGS.

        Parameters
        ----------
        station_ids : str, list
            The gage ID(s) of the USGS station(s).
        dates : tuple
            Start and end dates as a tuple (start, end).
        mmd : bool
            Convert cms to mm/day based on the contributing drainage area of the stations.

        Returns
        -------
        pandas.DataFrame
            Streamflow data observations in cubic meter per second (cms)
        """
        if not isinstance(station_ids, (str, list)):
            raise InvalidInputType("ids", "str or list")

        station_ids = station_ids if isinstance(station_ids, list) else [station_ids]

        if not isinstance(dates, tuple) or len(dates) != 2:
            raise InvalidInputType("dates", "tuple", "(start, end)")

        start = pd.to_datetime(dates[0])
        end = pd.to_datetime(dates[1])

        siteinfo = self.get_info(self.query_byid(station_ids))
        check_dates = siteinfo.loc[
            (
                (siteinfo.stat_cd == "00003")
                & (start < siteinfo.begin_date)
                & (end > siteinfo.end_date)
            ),
            "site_no",
        ].tolist()
        nas = [s for s in station_ids if s in check_dates]
        if len(nas) > 0:
            raise InvalidInputRange(
                "Daily Mean data unavailable for the specified time "
                + "period for the following stations:\n"
                + ", ".join(nas)
            )

        payload = {
            "format": "json",
            "sites": ",".join(station_ids),
            "startDT": start.strftime("%Y-%m-%d"),
            "endDT": end.strftime("%Y-%m-%d"),
            "parameterCd": "00060",
            "statCd": "00003",
            "siteStatus": "all",
        }

        resp = self.session.post(f"{self.url}/dv", payload)

        time_series = resp.json()["value"]["timeSeries"]
        r_ts = {
            t["sourceInfo"]["siteCode"][0]["value"]: t["values"][0]["value"] for t in time_series
        }

        def to_df(col, dic):
            discharge = pd.DataFrame.from_records(dic, exclude=["qualifiers"], index=["dateTime"])
            discharge.index = pd.to_datetime(discharge.index)
            discharge.columns = [col]
            return discharge

        qobs = pd.concat([to_df(f"USGS-{s}", t) for s, t in r_ts.items()], axis=1)

        # Convert cfs to cms
        qobs = qobs.astype("float64") * 0.028316846592

        if mmd:
            nldi = NLDI()
            basins_dict = {
                f"USGS-{s}": nldi.getfeature_byid("nwissite", f"USGS-{s}", basin=True).geometry
                for s in station_ids
            }
            basins = gpd.GeoDataFrame.from_dict(basins_dict, orient="index")
            basins.columns = ["geometry"]
            basins = basins.set_crs(DEF_CRS)
            eck4 = "+proj=eck4 +lon_0=0 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"
            area = basins.to_crs(eck4).area
            ms2mmd = 1000.0 * 24.0 * 3600.0
            qobs = qobs.apply(lambda x: x / area.loc[x.name] * ms2mmd)
        return qobs
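A hedged usage sketch for the NWIS class above, assuming it is in scope; the station ID and dates are illustrative only:

nwis = NWIS()

info = nwis.get_info(nwis.query_byid("01031500"))
qobs = nwis.get_streamflow("01031500", ("2005-01-01", "2005-01-31"))
print(info[["site_no", "station_nm"]].head())
print(qobs.head())  # daily discharge in cms, column "USGS-01031500"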
Example #18
def __init__(self):
    self.session = RetrySession()
    self.url = ServiceURL().restful.nwis
Example #19
File: pynhd.py Project: jsta/pynhd
class ScienceBase:
    """Access NHDPlus V2.1 Attributes from ScienceBase over CONUS.

    More info can be found `here <https://www.sciencebase.gov/catalog/item/5669a79ee4b08895842a1d47>`_.

    Parameters
    ----------
    save_dir : str
        Directory to save the staged data frame containing metadata for the database,
        defaults to the system's temp directory. The metadata dataframe is saved as a feather
        file, nhdplus_attrs.feather, in save_dir and can be loaded with Pandas.
    """
    def __init__(self, save_dir: Optional[str] = None) -> None:
        self.save_dir = Path(save_dir) if save_dir else Path(
            tempfile.gettempdir())
        if not self.save_dir.exists():
            os.makedirs(self.save_dir)
        self.session = RetrySession()
        self.nhd_attr_item = "5669a79ee4b08895842a1d47"
        self.char_feather = Path(self.save_dir, "nhdplus_attrs.feather")

    def get_children(self, item: str) -> Dict[str, Any]:
        """Get childern items of an item."""
        url = "https://www.sciencebase.gov/catalog/items"
        payload = {
            "filter": f"parentIdExcludingLinks={item}",
            "fields": "title,id",
            "format": "json",
        }
        return self.session.get(url, payload=payload).json()

    def get_files(self, item: str) -> Dict[str, Tuple[str, str]]:
        """Get all the available zip files in an item."""
        url = "https://www.sciencebase.gov/catalog/item"
        payload = {"fields": "files,downloadUri", "format": "json"}
        r = self.session.get(f"{url}/{item}", payload=payload).json()
        files_url = zip(tlz.pluck("name", r["files"]),
                        tlz.pluck("url", r["files"]))
        # TODO: Add units
        meta = "".join(tlz.pluck("metadataHtmlViewUri", r["files"],
                                 default=""))
        return {
            f.replace("_CONUS.zip", ""): (u, meta)
            for f, u in files_url if ".zip" in f
        }

    def stage_data(self) -> pd.DataFrame:
        """Stage the NHDPlus Attributes database and save to nhdplus_attrs.feather."""
        r = self.get_children(self.nhd_attr_item)

        titles = tlz.pluck("title", r["items"])
        titles = tlz.concat(
            tlz.map(tlz.partial(re.findall, "Select(.*?)Attributes"), titles))
        titles = tlz.map(str.strip, titles)

        main_items = dict(zip(titles, tlz.pluck("id", r["items"])))

        files = {}
        soil = main_items.pop("Soil")
        for i, item in main_items.items():
            r = self.get_children(item)

            titles = tlz.pluck("title", r["items"])
            titles = tlz.map(
                lambda s: s.split(":")[1].strip() if ":" in s else s, titles)

            child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
            files[i] = {t: self.get_files(c) for t, c in child_items.items()}

        r = self.get_children(soil)
        titles = tlz.pluck("title", r["items"])
        titles = tlz.map(lambda s: s.split(":")[1].strip()
                         if ":" in s else s, titles)

        child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
        stat = child_items.pop("STATSGO Soil Characteristics")
        ssur = child_items.pop("SSURGO Soil Characteristics")
        files["Soil"] = {t: self.get_files(c) for t, c in child_items.items()}

        r = self.get_children(stat)
        titles = tlz.pluck("title", r["items"])
        titles = tlz.map(lambda s: s.split(":")[1].split(",")[1].strip(),
                         titles)
        child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
        files["STATSGO"] = {
            t: self.get_files(c)
            for t, c in child_items.items()
        }

        r = self.get_children(ssur)
        titles = tlz.pluck("title", r["items"])
        titles = tlz.map(lambda s: s.split(":")[1].strip(), titles)
        child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
        files["SSURGO"] = {
            t: self.get_files(c)
            for t, c in child_items.items()
        }

        chars = []
        types = {"CAT": "local", "TOT": "upstream_acc", "ACC": "div_routing"}
        for t, dd in files.items():
            for d, fd in dd.items():
                for f, u in fd.items():
                    chars.append({
                        "name": f,
                        "type": types.get(f[-3:], "other"),
                        "theme": t,
                        "description": d,
                        "url": u[0],
                        "meta": u[1],
                    })
        char_df = pd.DataFrame(chars, dtype="category")
        char_df.to_feather(self.char_feather)
        return char_df
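A hedged usage sketch for the ScienceBase class above, assuming it is in scope:

sb = ScienceBase()  # metadata feather file is written to the system temp directory by default
char_df = sb.stage_data()
print(char_df[["name", "type", "theme"]].head())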