Example #1
def test_urlopen_requests_kwargs():
    """Test that urlopen can pass kwargs to requests."""
    base_url = "https://standards.sensors.ioos.us/erddap/tabledap/"
    timeout_seconds = 1  # request timeout in seconds
    slowwly_milliseconds = (timeout_seconds + 1) * 1000
    slowwly_url = f"https://flash-the-slow-api.herokuapp.com/delay/{slowwly_milliseconds}/url/{base_url}"

    with pytest.raises(httpx.ReadTimeout):
        urlopen(slowwly_url, timeout=timeout_seconds)
Example #2
def test_urlopen_requests_kwargs():
    """Test that urlopen can pass kwargs to requests"""
    base_url = "http://erddap.sensors.ioos.us/erddap/tabledap/"
    timeout_seconds = 1  # request timeout in seconds
    slowwly_milliseconds = (timeout_seconds + 1) * 1000
    slowwly_url = (
        f"https://flash.siwalik.in/delay/{slowwly_milliseconds}/url/{base_url}"
    )

    with pytest.raises(ReadTimeout):
        urlopen(slowwly_url, timeout=timeout_seconds)
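
Both tests above rely on urlopen forwarding extra keyword arguments (such as timeout) to the underlying HTTP client. Below is a minimal sketch of such a wrapper, assuming an httpx backend; the name urlopen_sketch is illustrative and this is not necessarily erddapy's exact implementation.

import io

import httpx


def urlopen_sketch(url, auth=None, **kwargs):
    """Fetch a URL into a BytesIO, forwarding extra kwargs (e.g. timeout) to httpx."""
    response = httpx.get(url, follow_redirects=True, auth=auth, **kwargs)
    response.raise_for_status()
    return io.BytesIO(response.content)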
Example #3
def get_dsinfo(e, stdname, cdm_data_type, min_time, max_time, skip_datasets):
    """This function finds all the datasets with a given standard_name in
    the specified time period, and return GeoJSON"""

    search_url = e.get_search_url(
        response="csv",
        cdm_data_type=cdm_data_type.lower(),
        items_per_page=100000,
        standard_name=stdname,
        min_time=min_time,
        max_time=max_time,
    )
    try:
        df = pd.read_csv(urlopen(search_url))

        for skip_dataset in skip_datasets:
            try:
                row = df.loc[df["Dataset ID"] == skip_dataset].index[0]
                df.drop(row, inplace=True)
            except IndexError:
                pass

    except HTTPError:
        df = pd.DataFrame([])

    return df
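
A hypothetical call to get_dsinfo, assuming an erddapy ERDDAP instance pointed at a tabledap server; the server, standard name, time window, and skip list below are placeholders chosen for illustration.

from erddapy import ERDDAP

e = ERDDAP(server="https://standards.sensors.ioos.us/erddap", protocol="tabledap")
df = get_dsinfo(
    e,
    stdname="sea_water_temperature",
    cdm_data_type="TimeSeries",
    min_time="2017-01-01T00:00:00Z",
    max_time="2017-02-01T00:00:00Z",
    skip_datasets=[],
)
print(df["Dataset ID"].head())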
Example #4
def get_timeseries(e, dataset=None, stdname=None, constraints=None):
    """This function returns the specified dataset time series values as a Pandas dataframe"""

    var = e.get_var_by_attr(
        dataset_id=dataset,
        standard_name=lambda v: str(v).lower() == stdname.lower(),
    )
    if var:
        var = var[0]
    else:
        raise ValueError(f"Cannot get data for {stdname}.")

    download_url = e.get_download_url(
        dataset_id=dataset,
        constraints=constraints,
        variables=["time", var],
        response="csv",
    )

    df = pd.read_csv(
        urlopen(download_url),
        index_col="time",
    )

    unit = df.iloc[0, 0]

    df = df.drop(labels=df.index[0])

    df.index = pd.to_datetime(
        df.index,
        utc=True,
    )
    df[var] = df[var].astype(float)

    return df, var, unit
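
An illustrative usage sketch for the function above, assuming an ERDDAP instance `e` configured for a tabledap server; the dataset id and constraints are placeholders that would need to exist on that server.

constraints = {"time>=": "2017-01-01T00:00:00Z", "time<=": "2017-01-08T00:00:00Z"}
df, var, unit = get_timeseries(
    e,
    dataset="some_dataset_id",
    stdname="sea_water_temperature",
    constraints=constraints,
)
print(df[var].describe(), unit)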
Example #5
def get_timeseries(e, dataset=None, standard_name=None, constraints=None):
    """This function returns the specified dataset time series values as a Pandas dataframe"""

    var = e.get_var_by_attr(
        dataset_id=dataset,
        standard_name=lambda v: str(v).lower() == standard_name.lower(),
    )
    if var:
        var = var[0]
    else:
        raise ValueError(f"Cannot get data for {standard_name}.")
        # We should filter out only valid standard_names for each dataset!
        # df = pd.read_csv(e.get_info_url(response="csv"))
        # df.loc[df["Attribute Name"] == "standard_name"]["Value"].values

    download_url = e.get_download_url(
        dataset_id=dataset,
        constraints=constraints,
        variables=["time", var],
        response="csv",
    )

    df = pd.read_csv(
        urlopen(download_url),
        index_col="time",
        parse_dates=True,
        skiprows=[1],
    )
    return df, var
Example #6
def _multi_urlopen(url: str) -> Optional[BinaryIO]:
    """Simpler urlopen to work with multiprocessing; returns None on HTTP errors."""
    try:
        data = urlopen(url)
    except (httpx.HTTPError, httpx.ConnectError):
        return None
    return data
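
A sketch of how _multi_urlopen could be mapped over several URLs with a process pool; the URL list is illustrative, and failed requests simply come back as None.

from multiprocessing import Pool

urls = [
    "https://standards.sensors.ioos.us/erddap/tabledap/",
    "https://data.ioos.us/gliders/erddap/tabledap/",
]

with Pool(processes=2) as pool:
    responses = pool.map(_multi_urlopen, urls)

responses = [r for r in responses if r is not None]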
Example #7
def get_valid_stdnames(server_name):
    """Find all the `standard_name` attributes that exist on
    this ERDDAP endpoint, using [ERDDAP's "categorize" service]
    (http://www.neracoos.org/erddap/categorize/index.html)"""

    server = servers[server_name]
    server_url = server.get("url")

    # global e
    e = ERDDAP(server=server_url, protocol="tabledap")

    url_standard_names = f"{server_url}/categorize/standard_name/index.csv"
    df = pd.read_csv(urlopen(url_standard_names), skiprows=[1, 2])
    standard_names = list(df["Category"].values)

    standard_names = remove_qcstdnames(standard_names)

    valid_standard_names = []
    count = 0

    print(
        "Checking the variables available for this server. This might take up to a couple of minutes...\n",
    )

    for standard_name in standard_names:

        count += 1

        if count == np.floor(len(standard_names) / 2):
            print("Halfway there...\n")
        elif count == np.floor((len(standard_names) / 4) * 3):
            print("Almost done...\n")
        elif count == (len(standard_names)):
            print("Done!")

        features, datasets = stdname2geojson(
            e,
            standard_name,
            server.get("cdm_data_type"),
            server.get("min_time"),
            server.get("max_time"),
            server.get("skip_datasets"),
        )

        if len(datasets) > 0:  # if there is at least one dataset with this data

            var = e.get_var_by_attr(
                dataset_id=datasets[0],
                standard_name=lambda v: str(v).lower() == standard_name.lower(),
            )

            if var != []:
                valid_standard_names.append(standard_name)

        del features, datasets

    return valid_standard_names, server, e
Example #8
def test__tempnc():
    url = "https://data.ioos.us/gliders/erddap/tabledap/cp_336-20170116T1254.nc"
    data = urlopen(url)
    with _tempnc(data) as tmp:
        # Check that the file exists.
        assert os.path.exists(tmp)
        # Confirm that it is a netCDF file.
        assert tmp.endswith("nc")
    # Check that the file was removed.
    assert not os.path.exists(tmp)
Example #9
def test__tempnc():
    """Test temporary netcdf file."""
    url = "https://podaac-opendap.jpl.nasa.gov/opendap/allData/modis/L3/aqua/11um/v2019.0/4km/daily/2017/365/AQUA_MODIS.20171231.L3m.DAY.NSST.sst.4km.nc"  # noqa
    data = urlopen(url)
    with _tempnc(data) as tmp:
        # Check that the file exists.
        assert os.path.exists(tmp)
        # Confirm that it is a netCDF file.
        assert tmp.endswith("nc")
    # Check that the file was removed.
    assert not os.path.exists(tmp)
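
Both tests exercise a temporary-file helper. Below is a minimal sketch of a _tempnc-style context manager, assuming it only needs to dump the in-memory response to a disposable ".nc" file and delete it afterwards; this is illustrative, not necessarily the library's implementation.

import os
import tempfile
from contextlib import contextmanager


@contextmanager
def _tempnc_sketch(data):
    """Write a file-like object to a temporary .nc file and remove it on exit."""
    tmp = tempfile.NamedTemporaryFile(suffix=".nc", delete=False)
    try:
        tmp.write(data.read())
        tmp.close()
        yield tmp.name
    finally:
        if not tmp.closed:
            tmp.close()
        os.unlink(tmp.name)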
Example #10
def _griddap_get_constraints(
    dataset_url: str,
    step: int,
) -> Tuple[Dict, List, List]:
    """
    Fetch metadata of griddap dataset and set initial constraints
    Step size is applied to all dimensions

    """

    dds_url = f"{dataset_url}.dds"
    url = urlopen(dds_url)
    data = url.read().decode()
    dims, *variables = data.split("GRID")
    dim_list = dims.split("[")[:-1]
    dim_names, variable_names = [], []
    for dim in dim_list:
        dim_name = dim.split(" ")[-1]
        dim_names.append(dim_name)
    for var in variables:
        phrase, *__ = var.split("[")
        var_name = phrase.split(" ")[-1]
        variable_names.append(var_name)
    rows = []
    for dim in dim_names:
        url = f"{dataset_url}.csvp?{dim}"
        data = pd.read_csv(url).values
        if dim == "time":
            data_start = data[-1][0]
        else:
            data_start = data[0][0]
        rows.append(
            {
                "dimension name": dim,
                "min": data_start,
                "max": data[-1][0],
                "length": len(data),
            },
        )
    # DataFrame.append was removed in pandas 2.0, so collect the rows and build the table once.
    table = pd.DataFrame(rows).set_index("dimension name")
    constraints_dict = {}
    for dim, data in table.iterrows():
        constraints_dict[f"{dim}>="] = data["min"]
        constraints_dict[f"{dim}<="] = data["max"]
        constraints_dict[f"{dim}_step"] = step

    return constraints_dict, dim_names, variable_names
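
Hypothetical usage of the returned values: each dimension contributes a ">=", "<=", and "_step" key that can be tightened before requesting data. The dataset URL and dimension names below are placeholders.

dataset_url = "https://some-server/erddap/griddap/some_dataset"  # placeholder
constraints, dim_names, variable_names = _griddap_get_constraints(dataset_url, step=1)

# For a time/latitude/longitude grid the dict would hold keys such as
# "time>=", "time<=", "time_step", "latitude>=", and so on.
constraints["latitude>="] = 40.0
constraints["latitude<="] = 45.0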
Example #11
def _nc_dataset(url, auth, **requests_kwargs: Dict):
    """Return a netCDF4-python Dataset from memory and fallbacks to disk if that fails."""
    from netCDF4 import Dataset

    data = urlopen(url=url, auth=auth, **requests_kwargs)
    try:
        return Dataset(Path(urlparse(url).path).name, memory=data.read())
    except OSError:
        # if libnetcdf is not compiled with in-memory support fallback to a local tmp file
        with _tempnc(data) as _nc:
            return Dataset(_nc)
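
An illustrative example of opening a tabledap ".nc" response with _nc_dataset; the URL is borrowed from the test in Example #8, and extra request kwargs (e.g. timeout) are optional.

url = "https://data.ioos.us/gliders/erddap/tabledap/cp_336-20170116T1254.nc"
nc = _nc_dataset(url, auth=None)
print(list(nc.variables))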
Example #12
def all_datasets_locations(e, cdm_data_type, min_time, max_time):
    """This function returns the lon,lat values from all datasets"""
    url_dset = (f"{e.server}"
                "/tabledap/allDatasets.csv?"
                "datasetID,minLongitude,minLatitude&"
                f'cdm_data_type="{cdm_data_type}"'
                f"&minTime<={max_time.to_datetime_string()}"
                f"&maxTime>={min_time.to_datetime_string()}")

    url_dataset = quote(url_dset, safe=":/?&= ")
    del url_dset
    df = pd.read_csv(urlopen(url_dataset), skiprows=[1])
    return df
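
The to_datetime_string() calls suggest pendulum datetimes. A hypothetical call, assuming an ERDDAP instance `e` as in the surrounding examples; the time window and cdm_data_type are placeholders.

import pendulum

min_time = pendulum.parse("2017-01-01T00:00:00Z")
max_time = pendulum.parse("2017-02-01T00:00:00Z")
df = all_datasets_locations(
    e,
    cdm_data_type="TimeSeries",
    min_time=min_time,
    max_time=max_time,
)
print(df[["datasetID", "minLongitude", "minLatitude"]].head())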
Example #13
    def to_pandas(self, **kw):
        """Save a data request to a pandas.DataFrame.

        Accepts any `pandas.read_csv` keyword arguments.

        This method uses the .csvp [1] response as the default for simplicity;
        please check ERDDAP's documentation for the other csv options available.

        [1] Download an ISO-8859-1 .csv file with line 1: name (units). Times are ISO 8601 strings.

        """
        response = kw.pop("response", "csvp")
        url = self.get_download_url(response=response, **kw)
        data = urlopen(url, auth=self.auth, **self.requests_kwargs)
        return pd.read_csv(data, **kw)
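
An illustrative call to to_pandas on a configured ERDDAP object; the dataset id and variable names are placeholders, and the "time (UTC)" index column follows the .csvp convention of appending units to column names.

from erddapy import ERDDAP

e = ERDDAP(server="https://standards.sensors.ioos.us/erddap", protocol="tabledap")
e.dataset_id = "some_dataset_id"  # placeholder
e.variables = ["time", "sea_water_temperature"]
df = e.to_pandas(index_col="time (UTC)", parse_dates=True)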
Example #14
def get_valid_stdnames(server_name):
    """Find all the `standard_name` attributes that exist on
    this ERDDAP endpoint, using [ERDDAP's "categorize" service]
    (http://www.neracoos.org/erddap/categorize/index.html)"""

    server = servers[server_name]
    server_url = server.get("url")

    e = ERDDAP(server=server_url, protocol="tabledap")

    url_stdnames = f"{server_url}/categorize/standard_name/index.csv"
    df = pd.read_csv(urlopen(url_stdnames), skiprows=[1, 2])
    stdnames = list(df["Category"].values)

    stdnames = remove_qcstdnames(stdnames)

    valid_stdnames = []
    count = 0

    display(pn.Column(pn.panel(progressbar.name), progressbar))

    for stdname in stdnames:

        count += 1

        progressbar.value = int(count / (len(stdnames)) * 100)

        df_stdname = get_datasets(
            e,
            stdname,
            server.get("cdm_data_type"),
            server.get("min_time"),
            server.get("max_time"),
            server.get("skip_datasets"),
        )

        if not df_stdname.empty:

            var = e.get_var_by_attr(
                dataset_id=df_stdname.datasetID.values[0],
                standard_name=lambda v: str(v).lower() == stdname.lower(),
            )

            if var != []:
                valid_stdnames.append(stdname)

    return valid_stdnames, server, e
Example #15
    def to_iris(self, **kw):
        """Load the data request into an iris.CubeList.

        Accepts any `iris.load_raw` keyword arguments.
        """
        import iris

        response = "nc" if self.protocol == "griddap" else "ncCF"
        url = self.get_download_url(response=response, **kw)
        data = urlopen(url, auth=self.auth, **self.requests_kwargs)
        with _tempnc(data) as tmp:
            cubes = iris.load_raw(tmp, **kw)
            try:
                cubes.realise_data()
            except ValueError:
                # Fall back to touching each cube's ``data`` to realise it in memory.
                iris.cube.CubeList([cube.data for cube in cubes])
            return cubes
Example #16
def get_dslocation(e, cdm_data_type, min_time, max_time):
    """This function returns the lon,lat values from all datasets"""
    max_time_str = max_time.strftime("%Y-%m-%d %H:%M:%S")
    min_time_str = min_time.strftime("%Y-%m-%d %H:%M:%S")

    url_dset = (
        f"{e.server}"
        "/tabledap/allDatasets.csv?"
        "datasetID,minLongitude,minLatitude&"
        f'cdm_data_type="{cdm_data_type}"'
        f"&minTime<={max_time_str}"
        f"&maxTime>={min_time_str}"
    )

    url_dataset = quote(url_dset, safe=":/?&= ")
    del url_dset
    df = pd.read_csv(urlopen(url_dataset), skiprows=[1])

    return df
Example #17
    def _get_variables(self, dataset_id: OptionalStr = None) -> Dict:
        if not dataset_id:
            dataset_id = self.dataset_id

        if dataset_id is None:
            raise ValueError(
                f"You must specify a valid dataset_id, got {dataset_id}")

        url = self.get_info_url(dataset_id=dataset_id, response="csv")

        variables = {}
        data = urlopen(url, auth=self.auth, **self.requests_kwargs)
        _df = pd.read_csv(data)
        self._dataset_id = dataset_id
        for variable in set(_df["Variable Name"]):
            attributes = (
                _df.loc[_df["Variable Name"] == variable, ["Attribute Name", "Value"]]
                .set_index("Attribute Name")
                .to_dict()["Value"]
            )
            variables.update({variable: attributes})
        return variables
Example #18
def test_urlopen_raise():
    """Assure that urlopen will raise for bad URLs."""
    url = "https://developer.mozilla.org/en-US/404"
    with pytest.raises(httpx.HTTPError):
        urlopen(url)
Example #19
def test_urlopen():
    """Assure that urlopen is always a BytesIO object."""
    url = "https://standards.sensors.ioos.us/erddap/tabledap/"
    ret = urlopen(url)
    assert isinstance(ret, io.BytesIO)