def query(url, **kw):
    """Run an ERDDAP advanced search against one server and return the hits.

    Parameters
    ----------
    url : str
        Base URL of the ERDDAP server (a trailing '/' is tolerated).
    **kw
        Advanced-search keyword arguments forwarded to
        ``ERDDAP.get_search_url`` (e.g. min_lon/max_lon/min_time/...).

    Returns
    -------
    pandas.DataFrame or None
        Columns ['server', 'Dataset ID', 'tabledap', 'Institution',
        'Summary'] for datasets exposing a tabledap endpoint, or ``None``
        when the search fails or returns nothing usable.
    """
    # rstrip to prevent a '//' in the composed URL:
    url = url.rstrip("/")
    e = ERDDAP(server=url, protocol='tabledap', response='csv')
    try:
        # Build the search URL once and reuse it (the original rebuilt it
        # three times).
        search_url = e.get_search_url(**kw)
        print("Testing ERDDAP {}".format(url))
        # BUG FIX: the original passed headers=headers as a keyword to
        # str.format(), where it was silently ignored — custom headers were
        # never applied to the request. pd.read_csv has no such parameter,
        # so the URL is fetched directly.
        df = pd.read_csv(search_url)
        print("ERDDAP {} returned results from URL: {}".format(url, search_url))
        df['server'] = url
        # Drop rows without a tabledap endpoint (e.g. griddap-only datasets).
        df.dropna(subset=['tabledap'], inplace=True)
        return df[['server', 'Dataset ID', 'tabledap', 'Institution', 'Summary']]
    except Exception as ex:
        # Can happen if the dataset does not have any features within the
        # query window; only HTTP errors are worth logging here. The name
        # check (rather than isinstance) is kept because HTTPError may come
        # from either urllib or requests depending on the caller.
        if type(ex).__name__ == "HTTPError":
            print(ex)
    return None
def retrieve_variable_names_erddap_server(url_erddap, dataset_id):
    """Return the list of variable names exposed by a dataset on an ERDDAP
    glider server (IOOS, Rutgers, ...).

    Parameters
    ----------
    url_erddap : str
        ERDDAP server address, e.g. 'https://data.ioos.us/gliders/erddap'.
    dataset_id : str
        Dataset identifier, e.g. 'ng231-20190901T0000'.

    Returns
    -------
    list of str
        Variable names available for the requested dataset.
    """
    from erddapy import ERDDAP

    connection = ERDDAP(server=url_erddap, protocol='tabledap', response='nc')
    connection.dataset_id = dataset_id

    # Pull the dataset once; the DataFrame columns are the variable names.
    frame = connection.to_pandas()
    variable_names = list(frame.columns)

    print('List of available variables ')
    print(variable_names)
    return variable_names
def active_drifters(bbox=None, time_start=None, time_end=None):
    """Query the OSMC ERDDAP for interpolated drifter positions.

    Defaults to the last 24 hours over a North Atlantic bounding box
    [-100, -40, 18, 60] (west, east, south, north). Returns an empty
    DataFrame when the query yields no rows.
    """
    if bbox is None or not bbox:
        bbox = [-100, -40, 18, 60]
    if not time_end:
        time_end = dt.date.today()
    if not time_start:
        time_start = time_end - dt.timedelta(days=1)

    start_str = time_start.strftime('%Y-%m-%dT%H:%M:%SZ')
    end_str = time_end.strftime('%Y-%m-%dT%H:%M:%SZ')

    conn = ERDDAP(server='OSMC', protocol="tabledap")
    conn.dataset_id = "gdp_interpolated_drifter"

    # Spatio-temporal constraints for the drifter query.
    conn.constraints = {
        "time>=": start_str,
        "time<=": end_str,
        'longitude>=': bbox[0],
        'longitude<=': bbox[1],
        'latitude>=': bbox[2],
        'latitude<=': bbox[3],
    }

    # e.variables = [
    #     "WMO",
    #     "latitude",
    #     "longitude",
    #     "time",
    # ]

    try:
        return conn.to_pandas()
    except ValueError:
        # erddapy raises ValueError when the server returns no matching rows.
        return pd.DataFrame()
def get_valid_stdnames(server_name):
    """Find all the `standard_name` attributes that exist on this ERDDAP
    endpoint, using [ERDDAP's "categorize" service]
    (http://www.neracoos.org/erddap/categorize/index.html)"""
    server = servers[server_name]
    server_url = server.get("url")
    # global e
    e = ERDDAP(server=server_url, protocol="tabledap")

    # The categorize service lists every standard_name the server knows.
    url_standard_names = f"{server_url}/categorize/standard_name/index.csv"
    df = pd.read_csv(urlopen(url_standard_names), skiprows=[1, 2])
    standard_names = remove_qcstdnames(list(df["Category"].values))

    valid_standard_names = []
    total = len(standard_names)
    print(
        "Checking the variables available for this server. This might take up to a couple of minutes...\n",
    )
    for count, standard_name in enumerate(standard_names, start=1):
        # Coarse progress messages at 1/2, 3/4 and completion.
        if count == np.floor(total / 2):
            print("Halfway there...\n")
        elif count == np.floor((total / 4) * 3):
            print("Almost done...\n")
        elif count == total:
            print("Done!")
        features, datasets = stdname2geojson(
            e,
            standard_name,
            server.get("cdm_data_type"),
            server.get("min_time"),
            server.get("max_time"),
            server.get("skip_datasets"),
        )
        # Keep the name only if at least one dataset actually carries a
        # variable with that standard_name attribute.
        if datasets:
            var = e.get_var_by_attr(
                dataset_id=datasets[0],
                standard_name=lambda v: str(v).lower() == standard_name.lower(),
            )
            if var != []:
                valid_standard_names.append(standard_name)
        del features, datasets
    return valid_standard_names, server, e
def get_erddap_dataset(server, protocol, file_type, ds_id, var_list=None):
    """Fetch one ERDDAP dataset as an xarray Dataset sorted by time.

    Parameters
    ----------
    server : str
        ERDDAP server URL.
    protocol : str
        'tabledap' or 'griddap'.
    file_type : str
        Response format passed to erddapy (e.g. 'nc').
    ds_id : str
        Dataset identifier.
    var_list : list of str, optional
        Restrict the request to these variables.
    """
    conn = ERDDAP(server=server, protocol=protocol, response=file_type)
    conn.dataset_id = ds_id
    if var_list:
        conn.variables = var_list
    dataset = conn.to_xarray()
    return dataset.sortby(dataset.time)
def load_glider(dataset_id='ru32-20190102T1317-profile-sci-rt',
                server="http://slocum-data.marine.rutgers.edu/erddap"):
    '''
    Load glider data from erddap. input dataset ID and server
    Returns an xarray dataset indexed on time
    '''
    # should change: write to_netcdf, then check if netcdf exists
    profile_conn = ERDDAP(
        server=server,
        protocol="tabledap",
        response="nc",
    )
    profile_conn.dataset_id = dataset_id
    profile_ds = profile_conn.to_xarray()

    # Re-index on time instead of observation number, sort, and drop
    # repeated time values so later interpolation is well defined.
    profile_ds = profile_ds.swap_dims({"obs": "time"}).sortby("time")
    profile_ds = profile_ds.sel(time=~profile_ds.indexes['time'].duplicated())

    # Second query against the raw trajectory dataset to recover the
    # seafloor depths measured by the glider.
    raw_conn = ERDDAP(
        server="http://slocum-data.marine.rutgers.edu/erddap",
        protocol="tabledap",
        response="nc",
    )
    # e2.dataset_id = dataset_id[:-14] + 'trajectory-raw-rt'
    raw_conn.dataset_id = dataset_id.replace('profile-sci', 'trajectory-raw')
    raw_conn.variables = ['time', 'm_water_depth', 'm_pitch']

    # Connect and load into an xarray dataset; same time re-indexing as above.
    raw_ds = raw_conn.to_xarray().drop_dims('trajectory')
    raw_ds = raw_ds.swap_dims({"obs": "time"}).sortby("time")
    raw_ds = raw_ds.sel(time=~raw_ds.indexes['time'].duplicated())

    # Discard implausible shallow readings before interpolating the
    # bottom depth onto the profile time base.
    raw_ds['m_water_depth'] = raw_ds.m_water_depth.where(
        raw_ds.m_water_depth > 10, drop=True)
    profile_ds['bottom_depth'] = raw_ds.m_water_depth.interp_like(
        profile_ds, method='nearest')

    return profile_ds
def __init__(self, erddap_server, constraints=None, protocol="tabledap", response="csv"):
    """Initialise based on given ERDDAP instance."""
    # Keep the erddapy client private; callers interact through this wrapper.
    self._erddap = ERDDAP(
        server=erddap_server,
        protocol=protocol,
        response=response,
    )
    # Any falsy constraints value (None, {}, []) collapses to an empty list,
    # matching the original `constraints or []` behaviour.
    self._erddap.constraints = constraints if constraints else []
def retrieve_dataset_id_erddap_server(url_erddap, lat_lim, lon_lim, date_ini, date_end):
    """Return glider dataset ids from an ERDDAP server (e.g. the IOOS DAC)
    that fall within the given spatial and temporal bounds.

    Parameters
    ----------
    url_erddap : str
        ERDDAP server address, e.g. 'https://data.ioos.us/gliders/erddap'.
    lat_lim, lon_lim : sequence of two floats
        [min, max] latitude and longitude limits for the search.
    date_ini, date_end : str
        Time-window bounds, e.g. '2018-08-02T00:00:00Z'.

    Returns
    -------
    numpy.ndarray
        Dataset ids matching the constraints.
    """
    from erddapy import ERDDAP
    import pandas as pd

    connection = ERDDAP(server=url_erddap)

    # Advanced-search constraints built straight from the arguments.
    search_kwargs = {
        'min_lon': lon_lim[0],
        'max_lon': lon_lim[1],
        'min_lat': lat_lim[0],
        'max_lat': lat_lim[1],
        'min_time': date_ini,
        'max_time': date_end,
    }

    results = pd.read_csv(connection.get_search_url(response='csv', **search_kwargs))

    # The 'Dataset ID' column holds the glider ids we are after.
    return results['Dataset ID'].values
def __init__(self, erddap_url=None):
    """Set up an ERDDAP glider-DAC client and empty result containers.

    :param erddap_url: ERDDAP server URL; defaults to the IOOS glider DAC.
    """
    # Logger named after this source file.
    self._logger = logging.getLogger(os.path.basename(__file__))
    self._erddap_url = erddap_url or 'https://gliders.ioos.us/erddap'
    self._protocol = 'tabledap'
    self._response_type = 'csv'
    # Effectively disable paging: ask for everything on page 1.
    self._items_per_page = 1e10
    self._page = 1
    self._client = ERDDAP(server=self._erddap_url,
                          protocol=self._protocol,
                          response=self._response_type)
    # Most recent request URL/response, populated by later calls.
    self._last_request = None

    # DataFrame containing the results of ERDDAP advanced search (endpoints, etc.)
    self._datasets_info = pd.DataFrame()
    # DataFrame containing dataset_id, start/end dates, profile count, etc.
    self._datasets_summaries = pd.DataFrame()
    # Per-dataset profile and day tallies, filled in later.
    self._datasets_profiles = pd.DataFrame()
    self._datasets_days = pd.DataFrame()

    # Variables requested when fetching per-profile records.
    self._profiles_variables = ['time', 'latitude', 'longitude', 'profile_id', 'wmo_id']

    # Keyword arguments accepted by ERDDAP's advanced search.
    self._valid_search_kwargs = {'institution',
                                 'ioos_category',
                                 'long_name',
                                 'standard_name',
                                 'variable_name',
                                 'min_lon',
                                 'min_lat',
                                 'max_lon',
                                 'max_lat',
                                 'min_time',
                                 'max_time'}

    # Month labels and calendar groupings used for summary displays.
    self._months = ['January',
                    'February',
                    'March',
                    'April',
                    'May',
                    'June',
                    'July',
                    'August',
                    'September',
                    'October',
                    'November',
                    'December']

    self._calendar_types = ['datasets', 'days', 'profiles']
def return_glider_ids(kwargs):
    """
    Searches an ERDDAP server for datasets and returns dataset IDs

    :param kwargs: dictionary containing coordinate and time limits
    :return: array containing dataset IDs
    """
    e = ERDDAP(server=ioos_url)
    search_url = e.get_search_url(response='csv', **kwargs)
    try:
        search = pd.read_csv(search_url)
        ds_ids = search['Dataset ID'].values
    except Exception:
        # BUG FIX: this was a bare `except:`, which also swallowed
        # KeyboardInterrupt and SystemExit. An empty search window (HTTP 404
        # from ERDDAP) or a malformed response still yields an empty result.
        ds_ids = np.array([])

    return ds_ids
class DatasetList:
    """Search servers for glider dataset ids. Defaults to the string "glider"

    Attributes:
        e: an ERDDAP server instance
        search_terms: A list of terms to search the server for. Multiple terms will be combined as AND
    """

    def __init__(self, server=_server):
        self.e = ERDDAP(
            server=server,
            protocol="tabledap",
        )

    # NOTE(review): lru_cache on an instance method keeps every instance
    # alive for the cache's lifetime (ruff B019); kept for interface
    # compatibility since these objects are few and long-lived.
    @functools.lru_cache(maxsize=None)
    def _get_ids(self, search_terms):
        """Thin wrapper where inputs can be hashed for lru_cache."""
        # BUG FIX: pd.Series.append was deprecated in pandas 1.4 and removed
        # in pandas 2.0 — accumulate the per-term results and concat once.
        per_term = [
            pd.read_csv(self.e.get_search_url(search_for=term, response="csv"))["Dataset ID"]
            for term in search_terms
        ]
        if per_term:
            dataset_ids = pd.concat(per_term, ignore_index=True)
        else:
            dataset_ids = pd.Series(dtype=str)
        # ERDDAP can return ';'-joined ids in one cell; split and dedupe.
        self.dataset_ids = dataset_ids.str.split(";", expand=True).stack().unique()
        return self.dataset_ids

    def get_ids(self, search_terms=("glider",)):
        """Search the database using a user supplied list of comma separated strings

        :return: Unique list of dataset ids
        """
        # BUG FIX: the default was a mutable list (["glider"]); a tuple is
        # equivalent here (it is re-tupled below) and avoids the shared
        # mutable-default pitfall.
        search_terms = tuple(search_terms)
        return self._get_ids(search_terms)
def gliders():
    """Instantiate ERDDAP class for testing."""
    # The gliders server has 1244 datasets at time of writing
    client = ERDDAP(
        server="https://gliders.ioos.us/erddap/",
        response="htmlTable",
    )
    yield client
def _init_erddapy(self):
    """Attach an erddapy client for the Ifremer Argo index and return self."""
    client = ERDDAP(server='http://www.ifremer.fr/erddap', protocol='tabledap')
    client.response = 'csv'
    client.dataset_id = 'ArgoFloats-index'
    self.erddap = client
    return self
def get_valid_stdnames(server_name):
    """Find all the `standard_name` attributes that exist on this ERDDAP
    endpoint, using [ERDDAP's "categorize" service]
    (http://www.neracoos.org/erddap/categorize/index.html)"""
    server = servers[server_name]
    server_url = server.get("url")
    e = ERDDAP(server=server_url, protocol="tabledap")

    # The categorize service lists every standard_name the server knows.
    url_stdnames = f"{server_url}/categorize/standard_name/index.csv"
    df = pd.read_csv(urlopen(url_stdnames), skiprows=[1, 2])
    stdnames = remove_qcstdnames(list(df["Category"].values))

    valid_stdnames = []
    total = len(stdnames)
    display(pn.Column(pn.panel(progressbar.name), progressbar))
    for count, stdname in enumerate(stdnames, start=1):
        progressbar.value = int(count / total * 100)
        df_stdname = get_datasets(
            e,
            stdname,
            server.get("cdm_data_type"),
            server.get("min_time"),
            server.get("max_time"),
            server.get("skip_datasets"),
        )
        # A standard_name is valid only if some dataset actually carries a
        # variable with that attribute.
        if not df_stdname.empty:
            var = e.get_var_by_attr(
                dataset_id=df_stdname.datasetID.values[0],
                standard_name=lambda v: str(v).lower() == stdname.lower(),
            )
            if var != []:
                valid_stdnames.append(stdname)
    return valid_stdnames, server, e
def __init__(self, server=_server):
    """Create the tabledap fetcher and pick a variable set for the server."""
    self.fetcher = ERDDAP(
        server=server,
        protocol="tabledap",
    )
    # Ifremer exposes its own variable naming; every other server gets the
    # common CTD set.
    default_vars = [
        "depth",
        "latitude",
        "longitude",
        "salinity",
        "temperature",
        "time",
    ]
    self.fetcher.variables = ifremer_vars if "ifremer" in self.fetcher.server else default_vars
    # No dataset selected yet; callers assign one before fetching.
    self.fetcher.dataset_id: OptionalStr = None
def _init_erddapy(self):
    """Attach an erddapy client pointed at this instance's server and return self."""
    client = ERDDAP(server=self.server, protocol='tabledap')
    client.response = 'csv'
    client.dataset_id = 'ArgoFloats-index'
    self.erddap = client
    return self
def load_data(self, year='2019'):
    """Download every matching deployment for `year` into self.dfs,
    keyed by dataset id, and return the dict."""
    self.dfs = {}
    for _, row in self.df.iterrows():
        ds_name = row['Dataset ID']
        # Skip datasets not belonging to this glider or year.
        if self.glider_id not in ds_name or year not in ds_name:
            continue
        print(ds_name)
        try:
            conn = ERDDAP(
                server=self.server_url,
                protocol='tabledap',
                response='csv',
            )
            conn.dataset_id = ds_name
            conn.constraints = self.constraints
            conn.variables = self.variables[ds_name]
        except HTTPError:
            print('Failed to generate url {}'.format(ds_name))
            continue
        self.dfs[ds_name] = conn.to_pandas(
            index_col='time (UTC)',
            parse_dates=True,
            skiprows=(1, )  # units information can be dropped.
        )
    return (self.dfs)
def __init__(self, known_server='ioos', protocol=None, server=None, parallel=True):
    """Configure an ERDDAP reader for a known server or an explicit one.

    Parameters
    ----------
    known_server : str or None
        'ioos', 'coastwatch', any other label (then protocol/server are
        required), or None to derive a label from `server`.
    protocol : str, optional
        'tabledap' or 'griddap'; required when known_server is not a preset.
    server : str, optional
        ERDDAP base URL; required when known_server is not a preset.
    parallel : bool
        Whether downstream reads may run in parallel.
    """
    # # run checks for KW
    # self.kw = kw
    self.parallel = parallel

    # either select a known server or input protocol and server string
    if known_server == 'ioos':
        protocol = 'tabledap'
        server = 'http://erddap.sensors.ioos.us/erddap'
    elif known_server == 'coastwatch':
        protocol = 'griddap'
        server = 'http://coastwatch.pfeg.noaa.gov/erddap'
    elif known_server is not None:
        statement = 'either select a known server or input protocol and server string'
        assert (protocol is not None) & (server is not None), statement
    else:
        # BUG FIX: the original used server.strip('/erddap').strip('http://'),
        # but str.strip removes a *character set* from both ends, not a
        # literal prefix/suffix, so hosts that start or end with those
        # characters were mangled. Remove the scheme and '/erddap' suffix
        # explicitly instead.
        name = server
        for prefix in ('https://', 'http://'):
            if name.startswith(prefix):
                name = name[len(prefix):]
                break
        if name.endswith('/erddap'):
            name = name[:-len('/erddap')]
        known_server = name.replace('.', '_')
        statement = 'either select a known server or input protocol and server string'
        assert (protocol is not None) & (server is not None), statement

    self.known_server = known_server
    self.e = ERDDAP(server=server)
    self.e.protocol = protocol
    self.e.server = server

    # columns for metadata
    self.columns = ['geospatial_lat_min', 'geospatial_lat_max',
                    'geospatial_lon_min', 'geospatial_lon_max',
                    'time_coverage_start', 'time_coverage_end',
                    'defaultDataQuery', 'subsetVariables',  # first works for timeseries sensors, 2nd for gliders
                    'keywords',  # for hf radar
                    'id', 'infoUrl', 'institution', 'featureType', 'source', 'sourceUrl']

    # name
    self.name = f'erddap_{known_server}'
    self.reader = 'ErddapReader'
def active_argo_floats(bbox=None, time_start=None, time_end=None, floats=None):
    """
    :param lon_lims: list containing westernmost longitude and easternmost latitude
    :param lat_lims: list containing southernmost latitude and northernmost longitude
    :param time_start: time to start looking for floats
    :param time_end: time to end looking for floats
    :return:
    """
    # Fill in defaults: last 24 hours over the North Atlantic box.
    if not bbox:
        bbox = [-100, -45, 5, 46]
    if not time_end:
        time_end = dt.date.today()
    if not time_start:
        time_start = time_end - dt.timedelta(days=1)
    if not floats:
        floats = False

    constraints = {
        'time>=': str(time_start),
        'time<=': str(time_end),
    }

    if bbox:
        constraints['longitude>='] = bbox[0]
        constraints['longitude<='] = bbox[1]
        constraints['latitude>='] = bbox[2]
        constraints['latitude<='] = bbox[3]

    if floats:
        constraints['platform_number='] = floats

    conn = ERDDAP(
        server='IFREMER',
        protocol='tabledap',
        response='nc'
    )
    conn.dataset_id = 'ArgoFloats'
    conn.constraints = constraints
    conn.variables = [
        'platform_number',
        'time',
        'pres',
        'longitude',
        'latitude',
        'temp',
        'psal',
    ]

    try:
        # Drop rows with missing values; incomplete profiles are not useful here.
        return conn.to_pandas(
            parse_dates=['time (UTC)'],
            skiprows=(1,)  # units information can be dropped.
        ).dropna()
    except HTTPError:
        # No floats matched the constraints.
        return pd.DataFrame()
def load_data(self, year='2019'):
    """Fetch each of this glider's deployments for the given year into
    self.dfs (dataset id -> DataFrame) and return it."""
    self.dfs = {}
    matches = (
        row['Dataset ID']
        for _, row in self.df.iterrows()
        if self.glider_id in row['Dataset ID'] and year in row['Dataset ID']
    )
    for dataset_name in matches:
        print(dataset_name)
        try:
            conn = ERDDAP(server=self.server_url,
                          protocol='tabledap',
                          response='csv',
                          )
            conn.dataset_id = dataset_name
            conn.constraints = self.constraints
            conn.variables = self.variables[dataset_name]
        except HTTPError:
            print('Failed to generate url {}'.format(dataset_name))
            continue
        frame = conn.to_pandas(
            index_col='time (UTC)',
            parse_dates=True,
            skiprows=(1,)  # units information can be dropped.
        )
        self.dfs.update({dataset_name: frame})
    return (self.dfs)
def _init_erddapy(self):
    """Attach an erddapy client for the Ifremer Argo database selected by
    self.dataset_id ('phy', 'ref' or 'bgc') and return self."""
    self.erddap = ERDDAP(server='http://www.ifremer.fr/erddap', protocol='tabledap')
    self.erddap.response = 'csv'

    # Map the short database names onto the ERDDAP dataset ids.
    dataset_map = {
        'phy': 'ArgoFloats',
        'ref': 'ArgoFloats-ref',
        'bgc': 'ArgoFloats-bio',
    }
    if self.dataset_id not in dataset_map:
        raise ValueError(
            "Invalid database short name for Ifremer erddap (use: 'phy', 'bgc' or 'ref')"
        )
    self.erddap.dataset_id = dataset_map[self.dataset_id]
    return self
def _init_erddapy(self):
    """Attach an erddapy client for the Argo database selected by
    self.dataset_id ('phy', 'ref', 'bgc', or the test id 'fail')
    and return self."""
    self.erddap = ERDDAP(server=self.server, protocol='tabledap')
    self.erddap.response = 'nc'  # This is a major change in v0.4, we used to work with csv files

    # Short database names -> ERDDAP dataset ids ('fail' is a test hook).
    mapping = {
        'phy': 'ArgoFloats',
        'ref': 'ArgoFloats-ref',
        'bgc': 'ArgoFloats-bio',
        'fail': 'invalid_db',
    }
    try:
        self.erddap.dataset_id = mapping[self.dataset_id]
    except KeyError:
        raise ValueError(
            "Invalid database short name for Ifremer erddap (use: 'phy', 'bgc' or 'ref')"
        ) from None
    return self
def test_erddap_requests_kwargs():
    """ Test that an ERDDAP instance can have requests_kwargs attribute assigned
    and are passed to the underlying methods """
    base_url = "http://www.neracoos.org/erddap"
    timeout_seconds = 1  # request timeout in seconds
    # Ask the delay service to respond slower than the timeout, forcing a
    # ReadTimeout if requests_kwargs are honoured.
    slowwly_milliseconds = (timeout_seconds + 1) * 1000
    slowwly_url = f"http://slowwly.robertomurray.co.uk/delay/{slowwly_milliseconds}/url/{base_url}"

    connection = ERDDAP(slowwly_url)
    connection.dataset_id = "M01_sbe37_all"
    connection.protocol = "tabledap"
    connection.requests_kwargs["timeout"] = timeout_seconds

    with pytest.raises(ReadTimeout):
        connection.to_xarray()
def _init_erddapy(self):
    """Build the erddapy client for the Argo database named by
    self.dataset_id ('phy', 'ref', 'bgc', or the test id 'fail');
    returns self for chaining."""
    client = ERDDAP(server=self.server, protocol="tabledap")
    client.response = (
        "nc"  # This is a major change in v0.4, we used to work with csv files
    )
    # Translate the short database name to the ERDDAP dataset id.
    if self.dataset_id == "phy":
        client.dataset_id = "ArgoFloats"
    elif self.dataset_id == "ref":
        client.dataset_id = "ArgoFloats-ref"
    elif self.dataset_id == "bgc":
        client.dataset_id = "ArgoFloats-bio"
    elif self.dataset_id == "fail":
        client.dataset_id = "invalid_db"
    else:
        raise ValueError(
            "Invalid database short name for Ifremer erddap (use: 'phy', 'bgc' or 'ref')"
        )
    self.erddap = client
    return self
def get_erddap_dataset(server, ds_id, variables=None, constraints=None):
    """Fetch one tabledap dataset as an xarray Dataset sorted by time.

    Parameters
    ----------
    server : str
        ERDDAP server URL.
    ds_id : str
        Dataset identifier.
    variables : list of str, optional
        Restrict the request to these variables.
    constraints : dict, optional
        erddapy-style constraints (e.g. {'time>=': ...}).
    """
    conn = ERDDAP(server=server, protocol='tabledap', response='nc')
    conn.dataset_id = ds_id
    if constraints:
        conn.constraints = constraints
    if variables:
        conn.variables = variables
    dataset = conn.to_xarray()
    return dataset.sortby(dataset.time)
def get_erddap_dataset(ds_id, variables=None, constraints=None, filetype=None):
    """
    Returns a netcdf dataset for a specified dataset ID (or dataframe if
    dataset cannot be converted to xarray)
    :param ds_id: dataset ID e.g. ng314-20200806T2040
    :param variables: optional list of variables
    :param constraints: optional list of constraints
    :param filetype: optional filetype to return, 'nc' (default) or 'dataframe'
    :return: netcdf dataset (xarray), dataframe fallback, [] when the
        constraints match nothing, or None for an unrecognized filetype
    """
    variables = variables or None
    constraints = constraints or None
    filetype = filetype or 'nc'

    e = ERDDAP(server='NGDAC', protocol='tabledap', response='nc')
    e.dataset_id = ds_id
    if constraints:
        e.constraints = constraints
    if variables:
        e.variables = variables

    if filetype == 'nc':
        try:
            ds = e.to_xarray()
            ds = ds.sortby(ds.time)
        except OSError:
            print('No dataset available for specified constraints: {}'.format(
                ds_id))
            ds = []
        except TypeError:
            print('Cannot convert to xarray, providing dataframe: {}'.format(
                ds_id))
            ds = e.to_pandas().dropna()
    elif filetype == 'dataframe':
        ds = e.to_pandas().dropna()
    else:
        print('Unrecognized filetype: {}. Needs to be "nc" or "dataframe"'.
              format(filetype))
        # BUG FIX: `ds` was never assigned on this branch, so the return
        # below raised UnboundLocalError instead of reporting the problem.
        ds = None
    return ds
def check_dataset_empty(url_erddap, dataset_id, date_ini, date_end, lon_lim, lat_lim):
    """Return True when a glider dataset has (practically) no data inside the
    given time window and bounding box, False otherwise.

    A dataset with fewer than 4 rows is treated as empty.
    """
    from erddapy import ERDDAP

    connection = ERDDAP(
        server=url_erddap,
        protocol='tabledap',
        response='nc'
    )
    connection.dataset_id = dataset_id
    # Spatio-temporal window for the query.
    connection.constraints = {
        'time>=': date_ini,
        'time<=': date_end,
        'latitude>=': lat_lim[0],
        'latitude<=': lat_lim[1],
        'longitude>=': lon_lim[0],
        'longitude<=': lon_lim[1],
    }
    connection.variables = [
        'depth',
        'latitude',
        'longitude',
        'time',
        'temperature',
        'salinity'
    ]

    # Converting glider data to data frame and checking that it has data.
    frame = connection.to_pandas()
    return len(frame) < 4
def get_erddap_data(dataset_id):
    '''
    :param dataset_id: the deployment name example:'ce_311-20200708T1723'
    :return: pandas DataFrame with deployment variable values
    '''
    conn = ERDDAP(
        server='https://gliders.ioos.us/erddap',
        protocol='tabledap',
    )
    conn.response = 'csv'
    conn.dataset_id = dataset_id
    # Standard CTD variable set for glider deployments.
    conn.variables = [
        'depth',
        'latitude',
        'longitude',
        'salinity',
        'temperature',
        'conductivity',
        'density',
        'time',
    ]
    return conn.to_pandas()
ti = datetime.today() - timedelta(1) tini = datetime(ti.year,ti.month,ti.day) ''' #%% #tend = datetime(2019, 7, 27, 0, 0) #tini = datetime(2019, 7, 28, 0, 0) tini = datetime(2019, 9, 14, 0, 0) tend = datetime(2019, 9, 15, 0, 0) #%% Look for datasets in IOOS glider dac print('Looking for glider data sets') e = ERDDAP(server = url_glider) # Grab every dataset available datasets = pd.read_csv(e.get_search_url(response='csv', search_for='all')) # Search constraints kw = { 'min_lon': lon_lim[0], 'max_lon': lon_lim[1], 'min_lat': lat_lim[0], 'max_lat': lat_lim[1], 'min_time': tini.strftime('%Y-%m-%dT%H:%M:%SZ'), 'max_time': tend.strftime('%Y-%m-%dT%H:%M:%SZ'), } search_url = e.get_search_url(response='csv', **kw)
def neracoos():
    """Instantiate ERDDAP class for testing."""
    client = ERDDAP(
        server="http://www.neracoos.org/erddap/",
        response="htmlTable",
    )
    yield client
variables = [ depth, 'latitude', 'longitude', salinity, temperature, 'time', ] # In[2]: e = ERDDAP( server=server, dataset_id=dataset_id, constraints=constraints, variables=variables, protocol='tabledap', response='mat', ) print(e.get_download_url()) # # Obtaining the data # # There are a few methods to obtain the data with *to_pandas()* and *to_xarray()*: # In[3]: df = e.to_pandas( index_col='time',
def sensors():
    """Instantiate ERDDAP class for testing."""
    client = ERDDAP(
        server="https://erddap.sensors.ioos.us/erddap/",
        response="htmlTable",
    )
    yield client
def list_data(self, verbose=False):
    """Populate self.df with the server's search results for this glider id;
    optionally print the matching dataset ids."""
    conn = ERDDAP(server=self.server_url)
    search_url = conn.get_search_url(response='csv', search_for=self.glider_id)
    self.df = pd.read_csv(search_url)
    if verbose:
        print(self.df['Dataset ID'])