Example #1
def load_glider(dataset_id='ru32-20190102T1317-profile-sci-rt',
                server="http://slocum-data.marine.rutgers.edu/erddap"):
    ''' Load glider data from an ERDDAP server.
        Input the dataset ID and server URL.
        Returns an xarray Dataset indexed on time. '''

    # TODO: write to_netcdf, then check whether the netCDF file already exists

    e = ERDDAP(
        server=server,
        protocol="tabledap",
        response="nc",
    )

    e.dataset_id = dataset_id

    gds = e.to_xarray()

    # make time (rather than observation number) the dimension
    gds = gds.swap_dims({"obs": "time"})
    gds = gds.sortby("time")

    # drop repeated time values
    gds = gds.sel(time=~gds.indexes['time'].duplicated())

    # get the seafloor depths too

    e2 = ERDDAP(
        server="http://slocum-data.marine.rutgers.edu/erddap",
        protocol="tabledap",
        response="nc",
    )

    # get some of the raw data:
    #     e2.dataset_id = dataset_id[:-14] + 'trajectory-raw-rt'
    e2.dataset_id = dataset_id.replace('profile-sci', 'trajectory-raw')

    e2.variables = ['time', 'm_water_depth', 'm_pitch']

    # connect to the data and load it into an xarray dataset

    gds_raw = e2.to_xarray().drop_dims('trajectory')

    # make time (rather than observation number) the dimension

    gds_raw = gds_raw.swap_dims({"obs": "time"})
    gds_raw = gds_raw.sortby("time")

    gds_raw = gds_raw.sel(time=~gds_raw.indexes['time'].duplicated())

    # remove bad values:
    gds_raw['m_water_depth'] = gds_raw.m_water_depth.where(
        gds_raw.m_water_depth > 10, drop=True)

    gds['bottom_depth'] = gds_raw.m_water_depth.interp_like(gds,
                                                            method='nearest')

    return gds
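
A minimal usage sketch, assuming `erddapy` and `xarray` are installed and `ERDDAP` has been imported as in the snippet above (the dataset ID defaults to the Rutgers glider deployment in the function signature):

glider = load_glider()
print(glider['bottom_depth'])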
Example #2
def get_erddap_dataset(server, protocol, file_type, ds_id, var_list=None):
    e = ERDDAP(server=server, protocol=protocol, response=file_type)
    e.dataset_id = ds_id
    if var_list:
        e.variables = var_list
    ds = e.to_xarray()
    ds = ds.sortby(ds.time)
    return ds
Example #3
def test_erddap_requests_kwargs():
    """ Test that an ERDDAP instance can have requests_kwargs attribute assigned
    and are passed to the underlying methods """

    base_url = "http://www.neracoos.org/erddap"
    timeout_seconds = 1  # request timeout in seconds
    slowwly_milliseconds = (timeout_seconds + 1) * 1000
    slowwly_url = ("http://slowwly.robertomurray.co.uk/delay/" +
                   str(slowwly_milliseconds) + "/url/" + base_url)

    connection = ERDDAP(slowwly_url)
    connection.dataset_id = "M01_sbe37_all"
    connection.protocol = "tabledap"

    connection.requests_kwargs["timeout"] = timeout_seconds

    with pytest.raises(ReadTimeout):
        connection.to_xarray()
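
Outside of a test, the same mechanism can be used to set a timeout for slow servers; a minimal sketch reusing the server and dataset ID from the test above:

e = ERDDAP("http://www.neracoos.org/erddap")
e.protocol = "tabledap"
e.dataset_id = "M01_sbe37_all"
e.requests_kwargs["timeout"] = 60  # seconds, passed through to the underlying request
ds = e.to_xarray()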
Example #4
def get_erddap_dataset(server, ds_id, variables=None, constraints=None):
    variables = variables or None
    constraints = constraints or None

    e = ERDDAP(server=server,
               protocol='tabledap',
               response='nc')
    e.dataset_id = ds_id
    if constraints:
        e.constraints = constraints
    if variables:
        e.variables = variables
    ds = e.to_xarray()
    ds = ds.sortby(ds.time)
    return ds
Example #5
def get_erddap_dataset(ds_id, variables=None, constraints=None, filetype=None):
    """
    Returns a netcdf dataset for a specified dataset ID (or dataframe if dataset cannot be converted to xarray)
    :param ds_id: dataset ID e.g. ng314-20200806T2040
    :param variables: optional list of variables
    :param constraints: optional list of constraints
    :param filetype: optional filetype to return, 'nc' (default) or 'dataframe'
    :return: netcdf dataset
    """
    variables = variables or None
    constraints = constraints or None
    filetype = filetype or 'nc'

    e = ERDDAP(server='NGDAC', protocol='tabledap', response='nc')
    e.dataset_id = ds_id
    if constraints:
        e.constraints = constraints
    if variables:
        e.variables = variables
    if filetype == 'nc':
        try:
            ds = e.to_xarray()
            ds = ds.sortby(ds.time)
        except OSError:
            print('No dataset available for specified constraints: {}'.format(
                ds_id))
            ds = []
        except TypeError:
            print('Cannot convert to xarray, providing dataframe: {}'.format(
                ds_id))
            ds = e.to_pandas().dropna()
    elif filetype == 'dataframe':
        ds = e.to_pandas().dropna()
    else:
        print('Unrecognized filetype: {}. Needs to be "nc" or "dataframe"'.format(
            filetype))
        ds = None  # avoid returning an undefined variable for unrecognized filetypes

    return ds
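
A usage sketch under the snippet's own assumptions (the 'NGDAC' server alias and the example glider ID from the docstring):

df = get_erddap_dataset('ng314-20200806T2040', filetype='dataframe')
df.head()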
Example #6
class ErddapReader(Reader):
    """
    This class searches ERDDAP servers. Two known servers are built in, but
    others can be used as well.

    Attributes
    ----------
    parallel: boolean
        If True, run with simple parallelization using `multiprocessing`.
        If False, run serially.
    known_server: string
        Two ERDDAP servers are built in to be known to this reader: "ioos" and
        "coastwatch".
    e: ERDDAP server instance
    e.protocol: string
        * "tabledap" (pandas, appropriate for reading as csv)
        * "griddap" (xarray, appropriate for reading as netcdf)
    e.server: string
        Return the server name
    columns: list
        Metadata columns
    name: string
        "erddap_ioos", "erddap_coastwatch", or a constructed string if the user
        inputs a new protocol and server.
    reader: string
        reader is defined as "ErddapReader".
    """
    def __init__(self,
                 known_server="ioos",
                 protocol=None,
                 server=None,
                 parallel=True):
        """
        Parameters
        ----------
        known_server: string, optional
            Two ERDDAP servers are built in to be known to this reader:
            "ioos" and "coastwatch".
        protocol, server: string, optional
            For a user-defined ERDDAP server, input the protocol as one of the
            following:
            * "tabledap" (pandas, appropriate for reading as csv)
            * "griddap" (xarray, appropriate for reading as netcdf)
            and the server address (such as
            "http://erddap.sensors.ioos.us/erddap" or
            "http://coastwatch.pfeg.noaa.gov/erddap").
        parallel: boolean
            If True, run with simple parallelization using `multiprocessing`.
            If False, run serially.
        """
        self.parallel = parallel

        # hard wire this for now
        filetype = "netcdf"

        # either select a known server or input protocol and server string
        if known_server == "ioos":
            protocol = "tabledap"
            server = "http://erddap.sensors.ioos.us/erddap"
            filetype = "netcdf"  # other option: "csv"
        elif known_server == "coastwatch":
            protocol = "griddap"
            server = "http://coastwatch.pfeg.noaa.gov/erddap"
            filetype = "netcdf"  # other option: "csv"
        elif known_server is not None:
            statement = (
                "either select a known server or input protocol and server string"
            )
            assert (protocol is not None) & (server is not None), statement
        else:
            known_server = urllib.parse.urlparse(server).netloc
            # known_server = server.strip("/erddap").strip("http://").replace(".", "_")
            statement = (
                "either select a known server or input protocol and server string"
            )
            assert (protocol is not None) & (server is not None), statement

        self.known_server = known_server
        self.e = ERDDAP(server=server)
        self.e.protocol = protocol
        self.e.server = server
        self.filetype = filetype

        # columns for metadata
        self.columns = [
            "geospatial_lat_min",
            "geospatial_lat_max",
            "geospatial_lon_min",
            "geospatial_lon_max",
            "time_coverage_start",
            "time_coverage_end",
            "defaultDataQuery",
            "subsetVariables",  # first works for timeseries sensors, 2nd for gliders
            "keywords",  # for hf radar
            "id",
            "infoUrl",
            "institution",
            "featureType",
            "source",
            "sourceUrl",
        ]

        # name
        self.name = f"erddap_{known_server}"

        self.reader = "ErddapReader"
        self.store = dict()

    def __getitem__(self, key):
        """Redefinition of dict-like behavior.

        This enables user to use syntax `reader[dataset_id]` to read in and
        save dataset into the object.

        Parameters
        ----------
        key: str
            dataset_id for a dataset that is available in the search/reader
            object.

        Returns
        -------
        xarray Dataset of the data associated with key
        """

        returned_data = self.data_by_dataset(key)
        # returned_data = self._return_data(key)
        self.__setitem__(key, returned_data)
        return returned_data

    def find_dataset_id_from_station(self, station):
        """Find dataset_id from station name.

        Parameters
        ----------
        station: string
            Station name for which to search for dataset_id
        """

        if station is None:
            return None
        # for station in self._stations:
        # if station has more than one word, AND will be put between
        # to search for multiple terms together.
        url = self.e.get_search_url(response="csv",
                                    items_per_page=5,
                                    search_for=station)

        try:
            df = pd.read_csv(url)
        except Exception as e:
            logger.exception(e)
            logger.warning(
                f"search url {url} did not work for station {station}.")
            return

        # first try for exact station match
        try:
            # Special case for TABS buoys: don't split the dataset id name
            if "tabs" in station:  # don't split
                dataset_id = [
                    dataset_id for dataset_id in df["Dataset ID"]
                    if station.lower() == dataset_id.lower()
                ][0]
            else:
                # first try as dataset_id then do as station name
                dataset_id = [
                    dataset_id for dataset_id in df["Dataset ID"]
                    if station.lower() in [dataset_id.lower()] +
                    dataset_id.lower().split("_")
                ][0]

        except Exception as e:
            logger.exception(e)
            logger.warning(
                "When searching for a dataset id to match station name %s, the first attempt to match the id did not work."
                % (station))
            # If that doesn't work, return None for dataset_id
            dataset_id = None
            # # if that doesn't work, trying for more general match and just take first returned option
            # dataset_id = df.iloc[0]["Dataset ID"]

        return dataset_id

    @property
    def dataset_ids(self):
        """Find dataset_ids for server.

        Notes
        -----
        The dataset_ids are found by querying the metadata through the ERDDAP server.

        The number of dataset_ids can change if a variable is removed from the
        list of variables and this is rerun.
        """

        if not hasattr(self, "_dataset_ids") or (
                self.variables and
            (len(self.variables) != self.num_variables)):

            # This should be a region search
            if self.approach == "region":

                # find all the dataset ids which we will use to get the data
                # This limits the search to our keyword arguments in kw which should
                # have min/max lon/lat/time values
                dataset_ids = []
                if self.variables is not None:
                    for variable in self.variables:

                        # find and save all dataset_ids associated with variable
                        search_url = self.e.get_search_url(
                            response="csv",
                            **self.kw,
                            variableName=variable,
                            items_per_page=10000,
                        )

                        try:
                            search = pd.read_csv(search_url)
                            dataset_ids.extend(search["Dataset ID"])
                        except Exception as e:
                            logger.exception(e)
                            logger.warning(
                                f"variable {variable} was not found in the search"
                            )
                            logger.warning(f"search_url: {search_url}")

                else:

                    # find and save all dataset_ids associated with variable
                    search_url = self.e.get_search_url(response="csv",
                                                       **self.kw,
                                                       items_per_page=10000)

                    try:
                        search = pd.read_csv(search_url)
                        dataset_ids.extend(search["Dataset ID"])
                    except Exception as e:
                        logger.exception(e)
                        logger.warning("nothing found in the search")
                        logger.warning(f"search_url: {search_url}")

                # only need a dataset id once since we will check them each for all standard_names
                self._dataset_ids = list(set(dataset_ids))

            # This should be a search for the station names
            elif self.approach == "stations":

                # search by station name for each of stations
                if self.parallel:
                    # get metadata for datasets
                    # run in parallel to save time
                    num_cores = multiprocessing.cpu_count()
                    dataset_ids = Parallel(n_jobs=num_cores)(
                        delayed(self.find_dataset_id_from_station)(station)
                        for station in self._stations)

                else:
                    dataset_ids = []
                    for station in self._stations:
                        dataset_ids.append(
                            self.find_dataset_id_from_station(station))

                # remove None from list
                dataset_ids = [i for i in dataset_ids if i]

                # In this case return all dataset_ids so they match 1-1 with
                # the input station list.
                self._dataset_ids = dataset_ids

            else:
                logger.warning(
                    "Neither stations nor region approach were used in function dataset_ids."
                )

            # update number of variables
            if self.variables:
                self.num_variables = len(self.variables)

        return self._dataset_ids

    def meta_by_dataset(self, dataset_id):
        """Return the catalog metadata for a single dataset_id."""

        info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id)
        try:
            info = pd.read_csv(info_url)
        except Exception as e:
            logger.exception(e)
            logger.warning(f"Could not read info from {info_url}")
            return {dataset_id: []}

        items = []

        for col in self.columns:

            try:
                item = info[info["Attribute Name"] == col]["Value"].values[0]
                dtype = info[info["Attribute Name"] ==
                             col]["Data Type"].values[0]
            except Exception:
                if col == "featureType":
                    # this column is not present in HF Radar metadata, but we
                    # want it to map to data_type, so use 'grid' in that case.
                    item = "grid"
                else:
                    item = "NA"
                dtype = "String"  # treat the fallback value as a string

            if dtype == "String":
                pass
            elif dtype == "double":
                item = float(item)
            elif dtype == "int":
                item = int(item)
            items.append(item)

        # include download link ##
        self.e.dataset_id = dataset_id
        if self.e.protocol == "tabledap":
            # set the same time constraints as before
            self.e.constraints = {
                "time<=": self.kw["max_time"],
                "time>=": self.kw["min_time"],
            }
            if self.filetype == "csv":
                download_url = self.e.get_download_url(response="csvp")
            elif self.filetype == "netcdf":
                download_url = self.e.get_download_url(response="ncCf")

        elif self.e.protocol == "griddap":
            # the search terms that can be input for tabledap do not work for griddap
            # in erddapy currently. Instead, put together an opendap link and then
            # narrow the dataset with xarray.
            # get opendap link
            download_url = self.e.get_download_url(response="opendap")

        # check if "prediction" is present in metadata, esp in case of NOAA
        # model predictions
        is_prediction = "Prediction" in " ".join(
            list(info["Value"].replace(np.nan, None).values))

        # add erddap server name
        return {
            dataset_id:
            [self.e.server, download_url, info_url, is_prediction] + items +
            [self.variables]
        }

    @property
    def meta(self):
        """Rearrange the individual metadata into a dataframe.

        Notes
        -----
        This should exclude duplicate entries.
        """

        if not hasattr(self, "_meta"):

            if self.parallel:

                # get metadata for datasets
                # run in parallel to save time
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.meta_by_dataset)(dataset_id)
                    for dataset_id in self.dataset_ids)

            else:

                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.meta_by_dataset(dataset_id))

            # make dict from individual dicts
            from collections import ChainMap

            meta = dict(ChainMap(*downloads))

            # Make dataframe of metadata
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(
                meta,
                orient="index",
                columns=[
                    "database", "download_url", "info_url", "is_prediction"
                ] + self.columns + ["variable names"],
            )

        return self._meta

    def data_by_dataset(self, dataset_id):
        """Return the data for a single dataset_id.

        Returns
        -------
        A tuple of (dataset_id, data), where data type is a pandas DataFrame

        Notes
        -----
        Data is read into memory.
        """

        if self.filetype == "csv":
            # if self.e.protocol == "tabledap":
            try:
                # fetch metadata if not already present
                # found download_url from metadata and use
                self.e.dataset_id = dataset_id
                # dataset_vars gives a list of the variables in the dataset
                dataset_vars = (self.meta.loc[dataset_id]
                                ["defaultDataQuery"].split("&")[0].split(","))
                # vars_present gives the variables in self.variables
                # that are actually in the dataset
                vars_present = []
                for selfvariable in self.variables:
                    vp = [var for var in dataset_vars if var == selfvariable]
                    if len(vp) > 0:
                        vars_present.append(vp[0])
                # If any variables are not present, this doesn't work.
                if self.variables is not None:
                    self.e.variables = [
                        "time",
                        "longitude",
                        "latitude",
                        "station",
                    ] + vars_present
                dd = self.e.to_pandas(response="csvp",
                                      index_col=0,
                                      parse_dates=True)
                # dd = self.e.to_pandas(response='csv', header=[0, 1],
                #                       index_col=0, parse_dates=True)
                # dd = pd.read_csv(
                #     download_url, header=[0, 1], index_col=0, parse_dates=True
                # )

                # Drop cols and rows that are only NaNs.
                dd = dd.dropna(axis="index", how="all").dropna(axis="columns",
                                                               how="all")

                if self.variables is not None:
                    # check to see if there is any actual data
                    # this is a bit convoluted because the column names are the variable names
                    # plus units so can't match 1 to 1.
                    datacols = (
                        0  # number of columns that represent data instead of metadata
                    )
                    for col in dd.columns:
                        datacols += [
                            varname in col for varname in self.variables
                        ].count(True)
                    # if no datacols, we can skip this one.
                    if datacols == 0:
                        dd = None

            except Exception as e:
                logger.exception(e)
                logger.warning("no data to be read in for %s" % dataset_id)
                dd = None

        elif self.filetype == "netcdf":
            # elif self.e.protocol == "griddap":

            if self.e.protocol == "tabledap":

                try:
                    # assume I don't need to narrow in space since time series (tabledap)
                    self.e.dataset_id = dataset_id
                    dd = self.e.to_xarray()
                    # dd = xr.open_dataset(download_url, chunks="auto")
                    dd = dd.swap_dims({"obs": dd.cf["time"].name})
                    dd = dd.sortby(dd.cf["time"], ascending=True)
                    dd = dd.cf.sel(
                        T=slice(self.kw["min_time"], self.kw["max_time"]))
                    # dd = dd.set_coords(
                    #     [dd.cf["longitude"].name, dd.cf["latitude"].name]
                    # )

                    # use variable names to drop other variables (should I do this?)
                    if self.variables is not None:
                        # I don't think this is true with new approach
                        # # ERDDAP prepends variables with 's.' in netcdf files,
                        # # so include those with variables
                        # erd_vars = [f's.{var}' for var in self.variables]
                        # var_list = set(dd.data_vars) - (set(self.variables) | set(erd_vars))
                        var_list = set(dd.data_vars) - set(self.variables)
                        dd = dd.drop_vars(var_list)

                    # the lon/lat are on the 'timeseries' singleton dimension
                    # but the data variables were not, which confuses
                    # cf-xarray. When longitude and latitude are not on a
                    # dimension shared with a variable, the variable can't be
                    # accessed with cf-xarray, e.g. dd.cf['ssh'] won't work.
                    if "timeseries" in dd.dims:
                        for data_var in dd.data_vars:
                            if "timeseries" not in dd[data_var].dims:
                                dd[data_var] = dd[data_var].expand_dims(
                                    dim="timeseries", axis=1)

                except Exception as e:
                    logger.exception(e)
                    logger.warning("no data to be read in for %s" % dataset_id)
                    dd = None

            elif self.e.protocol == "griddap":

                try:
                    # this makes it read in the whole file which might be large
                    self.e.dataset_id = dataset_id
                    # dd = self.e.to_xarray(chunks="auto").sel(
                    #     time=slice(self.kw["min_time"], self.kw["max_time"])
                    # )
                    download_url = self.e.get_download_url(response="opendap")
                    dd = xr.open_dataset(download_url, chunks="auto").sel(
                        time=slice(self.kw["min_time"], self.kw["max_time"]))

                    if ("min_lat" in self.kw) and ("max_lat" in self.kw):
                        dd = dd.sel(latitude=slice(self.kw["min_lat"],
                                                   self.kw["max_lat"]))

                    if ("min_lon" in self.kw) and ("max_lon" in self.kw):
                        dd = dd.sel(longitude=slice(self.kw["min_lon"],
                                                    self.kw["max_lon"]))

                    # use variable names to drop other variables (should I do this?)
                    if self.variables is not None:
                        vars_list = set(dd.data_vars) - set(self.variables)
                        dd = dd.drop_vars(vars_list)

                except Exception as e:
                    logger.exception(e)
                    logger.warning("no data to be read in for %s" % dataset_id)
                    dd = None

        # return (dataset_id, dd)
        return dd

    # @property
    def data(self, dataset_ids=None):
        """Read in data for some or all dataset_ids.

        NOT USED CURRENTLY

        Once data is read in for a dataset_ids, it is remembered.

        See full documentation in `utils.load_data()`.
        """

        output = odg.utils.load_data(self, dataset_ids)
        return output
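
A minimal instantiation sketch using the built-in "ioos" known server; the rest of the surrounding package (the logger, the odg utilities, and the search setup that defines `approach`, `kw`, and `variables`) is assumed to be in place:

reader = ErddapReader(known_server="ioos", parallel=False)
reader.name        # "erddap_ioos"
reader.e.server    # "http://erddap.sensors.ioos.us/erddap"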
Example #7
show_iframe(e.get_download_url(response="html"))

Additionally, the object has `.get_info_url()` and `.get_search_url()` methods that can be used to obtain the info and search URLs, respectively.

show_iframe(e.get_info_url(response="html"))

show_iframe(e.get_search_url(response="html"))

`erddapy` also provides simple methods to download the data in common formats, such as `pandas.DataFrame` and `xarray.Dataset`.

df = e.to_pandas(index_col="time (UTC)", parse_dates=True,).dropna()


df.head()

ds = e.to_xarray(decode_times=False)

ds["temperature"]

Here is a simple scatter plot using the `pandas.DataFrame` loaded above.

%matplotlib inline

import matplotlib.dates as mdates
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(17, 5))
kw = dict(s=15, c=df["temperature (Celsius)"], marker="o", edgecolor="none")
cs = ax.scatter(df.index, df["depth (m)"], **kw)

ax.invert_yaxis()
Example #8
def test_erddap_testing():
    erddap_server = 'https://ferret.pmel.noaa.gov/pmel/erddap'
    dataset_id = 'sd1035_2019'
    dataset_id = 'sd1041_2019'
    #dataset_id = 'sd1055'
    # dataset_id = 'saildrone_arctic_data'
    # dataset_id = 'fisheries_2020_all'
    dataset_id = 'sd1069'

    from erddapy import ERDDAP
    # The QC configuration/stream classes used below are assumed to come from
    # ioos_qc and pocean:
    from ioos_qc.config import Config
    from ioos_qc.config_creator import CreatorConfig, QcConfigCreator, QcVariableConfig
    from ioos_qc.streams import XarrayStream
    from ioos_qc.stores import CFNetCDFStore
    from pocean.dsg import IncompleteMultidimensionalTrajectory

    e = ERDDAP(
        server=erddap_server,
        protocol='tabledap',
    )
    e.response = 'csv'
    e.dataset_id = dataset_id

    ds = e.to_xarray()
    ds

    # Dataset level metadata to drive climatology extraction
    min_t = str(ds.time.min().dt.floor("D").dt.strftime("%Y-%m-%d").data)
    max_t = str(ds.time.max().dt.ceil("D").dt.strftime("%Y-%m-%d").data)
    min_x = float(ds.longitude.min().data)
    min_y = float(ds.latitude.min().data)
    max_x = float(ds.longitude.max().data)
    max_y = float(ds.latitude.max().data)
    bbox = [min_x, min_y, max_x, max_y]

    # Configure how each variable's config will be generated
    default_config = {
        "bbox": bbox,
        "start_time": min_t,
        "end_time": max_t,
        "tests": {
            "spike_test": {
                "suspect_threshold": "1",
                "fail_threshold": "2"
            },
            "gross_range_test": {
                "suspect_min": "min - std * 2",
                "suspect_max": "max + std / 2",
                "fail_min": "mean / std",
                "fail_max": "mean * std"
            }
        }
    }

    # For any variable name or standard_name you can define a custom config
    custom_config = {
        'air_temperature': {
            "variable": "air"
        },
        'air_pressure': {
            "variable": "pres"
        },
        'relative_humidity': {
            "variable": "rhum"
        },
        'sea_water_temperature': {
            "variable": "temperature"
        },
        'sea_water_practical_salinity': {
            "variable": "salinity"
        },
        'eastward_wind': {
            "variable": "uwnd"
        },
        'northward_wind': {
            "variable": "vwnd"
        }
    }

    # Generate climatology configs
    creator_config = {
        "datasets": [
            {
                "name": "ocean_atlas",
                "file_path": "resources/ocean_atlas.nc",
                "variables": {
                    "o2": "o_an",
                    "salinity": "s_an",
                    "temperature": "t_an"
                },
                "3d": "depth"
            },
            {
                "name": "narr",
                "file_path": "resources/narr.nc",
                "variables": {
                    "air": "air",
                    "pres": "slp",
                    "rhum": "rhum",
                    "uwnd": "uwnd",
                    "vwnd": "vwnd"
                }
            }
        ]
    }
    cc = CreatorConfig(creator_config)
    qccc = QcConfigCreator(cc)

    # Break down variable by standard name
    def not_stddev(v):
        return v and not v.endswith(' SD')

    # air_temp_vars = ds.filter_by_attrs(long_name=not_stddev, standard_name='air_temperature')
    # pressure_vars = ds.filter_by_attrs(long_name=not_stddev, standard_name='air_pressure')
    # humidity_vars = ds.filter_by_attrs(long_name=not_stddev, standard_name='relative_humidity')
    # water_temp_vars = ds.filter_by_attrs(long_name=not_stddev, standard_name='sea_water_temperature')
    # salinity_vars = ds.filter_by_attrs(long_name=not_stddev, standard_name='sea_water_practical_salinity')
    # uwind_vars = ds.filter_by_attrs(long_name=not_stddev, standard_name='eastward_wind')
    # vwind_vars = ds.filter_by_attrs(long_name=not_stddev, standard_name='northward_wind')
    # all_vars = [air_temp_vars, pressure_vars, humidity_vars, water_temp_vars, salinity_vars, uwind_vars, vwind_vars]
    # all_vars

    air_temp = ['air_temperature']
    pressure = ['air_pressure']
    humidity = ['relative_humidity']
    water_temp = ['sea_water_temperature']
    salt = ['sea_water_practical_salinity']
    u = ['eastward_wind']
    v = ['northward_wind']

    run_tests = air_temp + pressure + humidity + water_temp + salt + u + v
    final_config = {}

    for v in ds:
        da = ds[v]

        # Don't run tests for unknown variables
        if 'standard_name' not in da.attrs or da.attrs['standard_name'] not in run_tests:
            continue

        # The standard names are identical for the mean and the stddev
        # so ignore the stddev version of the variable
        if v.endswith('_STDDEV'):
            continue

        config = default_config.copy()

        min_t = str(da.time.min().dt.floor("D").dt.strftime("%Y-%m-%d").data)
        max_t = str(da.time.max().dt.ceil("D").dt.strftime("%Y-%m-%d").data)
        min_x = float(da.longitude.min().data)
        min_y = float(da.latitude.min().data)
        max_x = float(da.longitude.max().data)
        max_y = float(da.latitude.max().data)
        bbox = [min_x, min_y, max_x, max_y]

        config["bbox"] = bbox
        config["start_time"] = min_t
        config["end_time"] = max_t

        # Allow custom overrides on a variable name basis
        if v in custom_config:
            config.update(custom_config[v])

        # Allow custom overrides on a standard_name name basis
        if da.attrs['standard_name'] in custom_config:
            config.update(custom_config[da.attrs['standard_name']])

        # Generate the ioos_qc Config object
        qc_var = QcVariableConfig(config)
        qc_config = qccc.create_config(qc_var)

        # Strip off the variable that create_config added
        qc_config = list(qc_config.values())[0]

        # Add it to the final config
        final_config[v] = qc_config

    c = Config(final_config)
    xs = XarrayStream(ds, time='time', lat='latitude', lon='longitude')
    qc_results = xs.run(c)

    # Plotting code
    # all_results = collect_results(qc_results, how='list')
    # # spike tests dont work with nan values so it causes issue
    # # with the shared time coordinate variable. Some variables
    # # only output every 5 readings
    # # https://ferret.pmel.noaa.gov/pmel/erddap/tabledap/sd1069.htmlTable?UWND_MEAN%2CVWND_MEAN%2CTEMP_AIR_MEAN%2Clatitude%2Clongitude%2Ctime&time%3E=2020-10-24&time%3C=2020-10-26T18%3A59%3A00Z
    # new_ds = ds.isel(dict(obs=slice(None, None, 5)))
    # new_xs = XarrayStream(new_ds, time='time', lat='latitude', lon='longitude')
    # new_qc_results = new_xs.run(c)
    # every_five_results = collect_results(new_qc_results, how='list')

    # plots = []
    # for i, lr in enumerate(all_results):
    #     if lr.data.any() and lr.results.any():
    #         if not np.isnan(lr.data[1:101:5]).all():
    #             print(f"plotting all for {lr.stream_id}")
    #             plot = bokeh_plot_collected_result(lr)
    #         else:
    #             print(f"plotting every 5 for {lr.stream_id}")
    #             plot = bokeh_plot_collected_result(every_five_results[i])
    #         plots.append(plot)

    # kwargs = {
    #     'merge_tools': True,
    #     'toolbar_location': 'above',
    #     'sizing_mode': 'scale_width',
    #     'plot_width': 600,
    #     'plot_height': 280,
    #     'ncols': 2
    # }
    # gp = gridplot(plots, **kwargs)
    # plotting.show(gp)

    # Save a netCDF file
    ncd = CFNetCDFStore(qc_results)
    ncd.save(
        'results.nc',
        IncompleteMultidimensionalTrajectory,
        c,
        dsg_kwargs=dict(
            reduce_dims=True,
            unlimited=False,
            unique_dims=True
        )
    )
Example #9
        'longitude<=': lon_lim[1],
    }

variables = [
    'depth', 'latitude', 'longitude', 'time', 'temperature', 'salinity', 'u',
    'v'
]

e = ERDDAP(server=url_erddap_Rutgers, protocol='tabledap', response='nc')

e.dataset_id = dataset_id + '-profile-sci-rt'
e.constraints = constraints
e.variables = variables

# Convert glider data to an xarray Dataset
ds = e.to_xarray()

#%% Get rid of nans

ttu, indt = np.unique(ds['time'].values, return_index=True)

oku = np.isfinite(ds['u'].values[indt])
ug = ds['u'].values[indt][oku]
time_ug = ds['time'].values[indt][oku]

okv = np.isfinite(ds['v'].values[indt])
vg = ds['v'].values[indt][okv]
time_vg = ds['time'].values[indt][okv]

#%% Plot