Example #1
    def load_data(self, year='2019'):
        self.dfs = {}
        for index, row in self.df.iterrows():
            if (self.glider_id
                    in row['Dataset ID']) and (year in row['Dataset ID']):
                print(row['Dataset ID'])

                try:
                    e = ERDDAP(
                        server=self.server_url,
                        protocol='tabledap',
                        response='csv',
                    )
                    e.dataset_id = row['Dataset ID']
                    e.constraints = self.constraints
                    e.variables = self.variables[row['Dataset ID']]
                except HTTPError:
                    print('Failed to generate url {}'.format(
                        row['Dataset ID']))
                    continue
                self.dfs.update({
                    row['Dataset ID']:
                    e.to_pandas(
                        index_col='time (UTC)',
                        parse_dates=True,
                        skiprows=(1, )  # units information can be dropped.
                    )
                })

        return (self.dfs)
Example #2
def active_drifters(bbox=None, time_start=None, time_end=None):
    bbox = bbox or [-100, -40, 18, 60]
    time_end = time_end or dt.date.today()
    time_start = time_start or (time_end - dt.timedelta(days=1))
    t0 = time_start.strftime('%Y-%m-%dT%H:%M:%SZ')
    t1 = time_end.strftime('%Y-%m-%dT%H:%M:%SZ')

    e = ERDDAP(server='OSMC', protocol="tabledap")
    e.dataset_id = "gdp_interpolated_drifter"

    # Setting constraints
    e.constraints = {
        "time>=": t0,
        "time<=": t1,
        'longitude>=': bbox[0],
        'longitude<=': bbox[1],
        'latitude>=': bbox[2],
        'latitude<=': bbox[3],
    }

    # e.variables = [
    #     "WMO",
    #     "latitude",
    #     "longitude",
    #     "time",
    # ]

    try:
        df = e.to_pandas()
    except ValueError:
        return pd.DataFrame()

    return df
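A minimal usage sketch for the function above (assuming `pandas as pd`, `datetime as dt`, and `erddapy.ERDDAP` are imported at module level, as in the original source):

# Hypothetical call: drifters reporting over the last day in the default box.
drifters = active_drifters()
if not drifters.empty:
    print(drifters.head())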
def retrieve_variable_names_erddap_server(url_erddap, dataset_id):
    """
    Created on Tue Nov  3 11:26:05 2020

    @author: aristizabal

    This function retrieves the variable names from the IOOS
    and Rutgers ERDDAP glider servers.

    Inputs:
    url_erddap: url address of erddap server
                Example: 'https://data.ioos.us/gliders/erddap'
    dataset_id: Example: 'ng231-20190901T0000'

    Outputs:
    variables: list of variables for the requested dataset_id

    """

    from erddapy import ERDDAP

    e = ERDDAP(server=url_erddap, protocol='tabledap', response='nc')

    e.dataset_id = dataset_id

    df = e.to_pandas()

    variable_names = [var for var in df.columns]
    print('List of available variables ')
    print(variable_names)

    return variable_names
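A minimal usage sketch, reusing the server URL and dataset id from the docstring above:

url_erddap = 'https://data.ioos.us/gliders/erddap'
dataset_id = 'ng231-20190901T0000'
variables = retrieve_variable_names_erddap_server(url_erddap, dataset_id)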
Example #4
def active_argo_floats(bbox=None, time_start=None, time_end=None, floats=None):
    """

    :param bbox: list of [westernmost longitude, easternmost longitude, southernmost latitude, northernmost latitude]
    :param time_start: time to start looking for floats
    :param time_end: time to end looking for floats
    :param floats: optional platform number(s) to restrict the search to
    :return: pandas DataFrame of matching Argo profiles (empty on HTTPError)
    """

    bbox = bbox or [-100, -45, 5, 46]
    time_end = time_end or dt.date.today()
    time_start = time_start or (time_end - dt.timedelta(days=1))
    floats = floats or False

    constraints = {
        'time>=': str(time_start),
        'time<=': str(time_end),
    }

    if bbox:
        constraints['longitude>='] = bbox[0]
        constraints['longitude<='] = bbox[1]
        constraints['latitude>='] = bbox[2]
        constraints['latitude<='] = bbox[3]

    if floats:
        constraints['platform_number='] = floats

    variables = [
        'platform_number',
        'time',
        'pres',
        'longitude',
        'latitude',
        'temp',
        'psal',
    ]

    e = ERDDAP(
        server='IFREMER',
        protocol='tabledap',
        response='nc'
    )

    e.dataset_id = 'ArgoFloats'
    e.constraints = constraints
    e.variables = variables

    try:
        df = e.to_pandas(
            parse_dates=['time (UTC)'],
            skiprows=(1,)  # units information can be dropped.
        ).dropna()
    except HTTPError:
        df = pd.DataFrame()

    return df
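A minimal usage sketch for the function above (assuming the module-level imports `datetime as dt`, `pandas as pd`, `erddapy.ERDDAP`, and `HTTPError`, as in the original source):

# Hypothetical call: Argo profiles from the last three days in the default box.
argo = active_argo_floats(time_start=dt.date.today() - dt.timedelta(days=3))
print(len(argo), 'rows returned')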
def get_erddap_dataset(ds_id, variables=None, constraints=None, filetype=None):
    """
    Returns a netcdf dataset for a specified dataset ID (or dataframe if dataset cannot be converted to xarray)
    :param ds_id: dataset ID e.g. ng314-20200806T2040
    :param variables: optional list of variables
    :param constraints: optional dict of constraints
    :param filetype: optional filetype to return, 'nc' (default) or 'dataframe'
    :return: xarray dataset (or pandas DataFrame, depending on filetype)
    """
    variables = variables or None
    constraints = constraints or None
    filetype = filetype or 'nc'

    e = ERDDAP(server='NGDAC', protocol='tabledap', response='nc')
    e.dataset_id = ds_id
    if constraints:
        e.constraints = constraints
    if variables:
        e.variables = variables
    if filetype == 'nc':
        try:
            ds = e.to_xarray()
            ds = ds.sortby(ds.time)
        except OSError:
            print('No dataset available for specified constraints: {}'.format(
                ds_id))
            ds = []
        except TypeError:
            print('Cannot convert to xarray, providing dataframe: {}'.format(
                ds_id))
            ds = e.to_pandas().dropna()
    elif filetype == 'dataframe':
        ds = e.to_pandas().dropna()
    else:
        print('Unrecognized filetype: {}. Needs to be "nc" or "dataframe"'.format(filetype))
        ds = None

    return ds
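A minimal usage sketch, using the dataset id from the docstring above (the default filetype is 'nc', so an xarray dataset is returned when the conversion succeeds):

ds = get_erddap_dataset('ng314-20200806T2040')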
def check_dataset_empty(url_erddap,dataset_id,date_ini,date_end,lon_lim,lat_lim):

    from erddapy import ERDDAP

    constraints = {
        'time>=': date_ini,
        'time<=': date_end,
        'latitude>=': lat_lim[0],
        'latitude<=': lat_lim[1],
        'longitude>=': lon_lim[0],
        'longitude<=': lon_lim[1],
        }

    variable_names = [
            'depth',
            'latitude',
            'longitude',
            'time',
            'temperature',
            'salinity'
            ]

    e = ERDDAP(
            server=url_erddap,
            protocol='tabledap',
            response='nc'
            )

    e.dataset_id = dataset_id
    e.constraints = constraints
    e.variables = variable_names

    # Converting glider data to data frame
    # Checking that the data frame has data
    df = e.to_pandas()
    if len(df) < 4:
        empty_dataset = True
    else:
        empty_dataset = False

    return empty_dataset
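A minimal usage sketch; the server, dataset id, and limits below reuse the example values given in docstrings elsewhere in this listing, and the dates are hypothetical:

empty = check_dataset_empty(
    'https://data.ioos.us/gliders/erddap',
    'ng231-20190901T0000',
    '2019-09-01T00:00:00Z',
    '2019-09-10T00:00:00Z',
    lon_lim=[-75.0, -72.0],
    lat_lim=[38.0, 40.0],
)
print('empty dataset:', empty)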
    def load_data(self, year='2019'):
        self.dfs = {}
        for index, row in self.df.iterrows():
            if (self.glider_id in row['Dataset ID']) and (year in row['Dataset ID']):
                print(row['Dataset ID'])

                try:
                    e = ERDDAP(
                        server=self.server_url,
                        protocol='tabledap',
                        response='csv',
                    )
                    e.dataset_id = row['Dataset ID']
                    e.constraints = self.constraints
                    e.variables = self.variables[row['Dataset ID']]
                except HTTPError:
                    print('Failed to generate url {}'.format(row['Dataset ID']))
                    continue
                self.dfs.update({row['Dataset ID']: e.to_pandas(
                    index_col='time (UTC)',
                    parse_dates=True,
                    skiprows=(1,)  # units information can be dropped.
                )})

        return self.dfs
Example #8
def get_erddap_data(dataset_id):
    '''
    :param dataset_id: the deployment name example:'ce_311-20200708T1723'
    :return: pandas DataFrame with deployment variable values
    '''
    e = ERDDAP(
        server='https://gliders.ioos.us/erddap',
        protocol='tabledap',
    )
    e.response = 'csv'
    e.dataset_id = dataset_id
    e.variables = [
        'depth',
        'latitude',
        'longitude',
        'salinity',
        'temperature',
        'conductivity',
        'density',
        'time',
    ]

    df = e.to_pandas()
    return df
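A minimal usage sketch, using the deployment name from the docstring above:

df = get_erddap_data('ce_311-20200708T1723')
print(df.head())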
Example #9
def get_ndbc(bbox=None, time_start=None, time_end=None, buoy=None):
    bbox = bbox or [-100, -45, 5, 46]
    time_end = time_end or dt.date.today()
    time_start = time_start or (time_end - dt.timedelta(days=1))
    buoy = buoy or False
    time_formatter = '%Y-%m-%dT%H:%M:%SZ'

    e = ERDDAP(
        server='CSWC',
        protocol='tabledap',
        response='csv'
    )

    e.dataset_id = 'cwwcNDBCMet'
    e.constraints = {
        'time>=': time_start.strftime(time_formatter),
        'time<=': time_end.strftime(time_formatter),
    }

    if bbox:
        e.constraints['longitude>='] = bbox[0]
        e.constraints['longitude<='] = bbox[1]
        e.constraints['latitude>='] = bbox[2]
        e.constraints['latitude<='] = bbox[3]

    e.variables = [
        "station",
        "latitude",
        "longitude",
        "time"
    ]

    if buoy:
        e.constraints['station='] = buoy

    df = e.to_pandas(
        parse_dates=['time (UTC)'],
        skiprows=(1,)  # units information can be dropped.
    ).dropna()

    stations = df.station.unique()

    # e.variables = [
    #     "station",
    #     "latitude",
    #     "longitude",
    #     "wd",
    #     "wspd",
    #     "gst",
    #     "wvht",
    #     "dpd",
    #     "apd",
    #     "mwd",
    #     "bar",
    #     "atmp",
    #     "wtmp",
    #     "dewp",
    #     # "vis",
    #     # "ptdy",
    #     # "tide",
    #     "wspu",
    #     "wspv",
    #     "time",
    # ]

    try:
        df = e.to_pandas(
            parse_dates=['time (UTC)'],
            skiprows=(1,)  # units information can be dropped.
        ).dropna()
    except HTTPError:
        df = pd.DataFrame()

    return df
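A minimal usage sketch for the function above (assuming the same module-level imports as the original source):

# Hypothetical call: NDBC stations reporting in the default box over the last day.
ndbc = get_ndbc()
if not ndbc.empty:
    print(ndbc['station'].unique())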
Example #10
class GliderDataFetcher(object):
    """
    Args:
        server: a glider ERDDAP server URL

    Attributes:
        dataset_id: a dataset unique id.
        constraints: download constraints, default None (opendap-like url)

    """
    def __init__(self, server=_server):
        self.fetcher = ERDDAP(
            server=server,
            protocol="tabledap",
        )
        if "ifremer" in self.fetcher.server:
            self.fetcher.variables = ifremer_vars
        else:
            self.fetcher.variables = [
                "depth",
                "latitude",
                "longitude",
                "salinity",
                "temperature",
                "time",
            ]
        self.fetcher.dataset_id: OptionalStr = None

    def to_pandas(self):
        """
        Fetches data from the server and reads into a pandas dataframe

        :return: pandas dataframe with datetime UTC as index
        """
        return self.fetcher.to_pandas(
            index_col="time (UTC)",
            parse_dates=True,
        )

    def query(self, min_lat, max_lat, min_lon, max_lon, start_time, end_time):
        """
        Takes user supplied geographical and time constraints and adds them to the query

        :param min_lat: southernmost lat
        :param max_lat: northernmost lat
        :param min_lon: westernmost lon (-180 to +180)
        :param max_lon: easternmost lon (-180 to +180)
        :param start_time: start time, can be datetime object or string
        :param end_time: end time, can be datetime object or string
        :return: search query with argument constraints applied
        """
        self.fetcher.constraints = {
            "time>=": start_time,
            "time<=": end_time,
            "latitude>=": min_lat,
            "latitude<=": max_lat,
            "longitude>=": min_lon,
            "longitude<=": max_lon,
        }
        return self

    def platform(self, platform):
        """

        :param platform: platform and deployment id from ifremer
        :return: search query with platform constraint applied
        """
        self.fetcher.constraints["platform_deployment="] = platform
        return self
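A minimal usage sketch for the class above, assuming the module-level `_server` points at a glider ERDDAP (for example 'https://gliders.ioos.us/erddap') and a valid dataset id is known; `query()` returns `self`, so constraints can be set before downloading:

# Hypothetical usage: constrain by box and time, pick a dataset, then download.
glider = GliderDataFetcher()
glider.query(
    min_lat=38.0, max_lat=40.0,
    min_lon=-75.0, max_lon=-72.0,
    start_time='2019-09-01T00:00:00Z', end_time='2019-09-10T00:00:00Z',
)
glider.fetcher.dataset_id = 'ng231-20190901T0000'  # hypothetical dataset id
df = glider.to_pandas()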
Example #11
class WireWallMonitor:
    """A class to handle retrieval and plotting of WireWall data."""

    window_time_column = "time (UTC)"
    event_time_column = "event time (UTC)"
    series_column = "wireID (Dmnless)"
    datetime_fields = ["time (UTC)", "gpsTime (UTC)", "timestamp (UTC)"]

    def __init__(self,
                 erddap_server,
                 constraints=None,
                 protocol="tabledap",
                 response="csv"):
        """Initialise based on given ERDDAP instance."""
        self._erddap = ERDDAP(
            server=erddap_server,
            protocol=protocol,
            response=response,
        )

        self._erddap.constraints = constraints or []

    def _add_event_columns(self, df):
        """Add a new columns which apply to events."""
        # calculate the event height with the baseline removed
        df["event depth preferred (cm)"] = df["elMEAN (cm)"] - df[
            "MEDelMEAN (cm)"]
        df["event depth fallback (cm)"] = df["elPTILE_6 (cm)"] - df[
            "MEDelPTILE_2 (cm)"]

        df[self.event_time_column] = df[self.window_time_column].copy()
        time_delta = df["sampleNUM (Dmnless)"] - df["sampleNUM10 (Dmnless)"]

        # events occur at ~400Hz
        time_delta /= 400

        # we need this as an actual timedelta
        time_delta = time_delta.apply(pd.to_timedelta, unit="S")

        # check all events occur in the interval [0, 10] mins from the window start time
        if time_delta.max() > pd.to_timedelta("10m"):
            warn(
                "Data has an event that occurs after the 10min sample window.",
                UserWarning,
            )

        if time_delta.min() < pd.to_timedelta("0m"):
            warn(
                "Data has an event that occurs before the 10min sample window.",
                UserWarning,
            )

        df[self.event_time_column] += time_delta

    def _get_dataframe(self, dataset_id):
        """Retrieve a dataframe for the given dataset_id."""
        self._erddap.dataset_id = dataset_id

        df = self._erddap.to_pandas(parse_dates=self.datetime_fields)
        df[self.series_column] = df[self.series_column].astype(str)
        self._add_event_columns(df)

        self._erddap.dataset_id = None

        return df

    def _plot_dataframe(self, df, x, y):
        """Plot the given columns of the dataframe."""
        fig = px.scatter(df, x=x, y=y, color=self.series_column)

        fig.update_layout(
            yaxis_title=y,
            xaxis=_XAXIS_FORMAT,
            margin=_MARGIN_FORMAT,
        )

        return fig

    def _plot_window_variables(self, df, column_names, column_names_secondary):
        """Plot variables that are constant over a window timespan."""
        # since these variables are constant over a given window, for a given wire
        # we can remove any rows which are duplicated
        df = df.drop_duplicates([self.window_time_column, self.series_column],
                                keep="first")

        figs = [None] * len(column_names)

        # use a loop so we can update each fig
        for i, (name, name_secondary) in enumerate(
                zip(column_names, column_names_secondary)):
            subfig1 = self._plot_dataframe(
                df,
                x=self.window_time_column,
                y=name,
            )

            yaxis_title = name

            if name_secondary is None:
                fig = subfig1
            else:
                fig = make_subplots()

                # get the units from the columns
                units = {s.split(" ")[1] for s in [name, name_secondary]}
                yaxis_title = "value " + " or ".join(units)

                subfig2 = self._plot_dataframe(
                    df,
                    x=self.window_time_column,
                    y=name_secondary,
                )

                # since this plot now has two series, rename them both
                subfig2.for_each_trace(lambda trace: trace.update(
                    name=f"Wire {trace.name} {name_secondary}"))
                subfig1.for_each_trace(lambda trace: trace.update(
                    name=f"Wire {trace.name} {name}"))

                # distinguish the second trace from the first
                subfig2.update_traces(
                    marker_symbol="square",
                    line_dash="dot",
                )

                # combine the traces into one figure
                fig.add_traces(subfig1.data + subfig2.data)

            fig.update_traces(mode="lines+markers",
                              selector=dict(type="scatter"))
            fig.update_layout(
                yaxis_title=yaxis_title,
                xaxis_title=self.window_time_column,
                xaxis=_XAXIS_FORMAT,
                margin=_MARGIN_FORMAT,
            )

            figs[i] = fig

        return figs

    def _plot_event_variables(self, df, column_names):
        """Plot event variables."""
        # some windows don't have any events and so there may be rows without any
        # sample num value. We are only interested in actual events, so remove them
        df = df.dropna(subset=[self.event_time_column]).copy()

        figs = [None] * len(column_names)

        # use a loop so we can update each fig
        for i, name in enumerate(column_names):
            figs[i] = self._plot_dataframe(df,
                                           x=self.event_time_column,
                                           y=name)

        return figs

    def plot_variables(
        self,
        dataset_id,
        window_variables=None,
        window_variables_secondary=None,
        event_variables=None,
    ):
        """Plot all the window and event variables for a given dataset.

        Args:
            dataset_id (str): the string identifier for the ERDDAP dataset.
            window_variables (list): a list of variable names (including units) which are
                constant over each window, to be plotted.
            window_variables_secondary (list): optional secondary variable names, plotted on
                the same axes as the corresponding window variable.
            event_variables (list): a list of variable names (including units) which are
                specific to each event, to be plotted.

        Returns: a list of figures generated, and calls .show() on all of them.
        """
        window_variables = window_variables or []
        window_variables_secondary = window_variables_secondary or [
            None
        ] * len(window_variables)
        event_variables = event_variables or []

        df = self._get_dataframe(dataset_id)
        window_figs = self._plot_window_variables(df, window_variables,
                                                  window_variables_secondary)
        event_figs = self._plot_event_variables(df, event_variables)

        figs = [*window_figs, *event_figs]

        for fig in figs:
            fig.show()

        return figs
    protocol='tabledap',
    response='mat',
)

print(e.get_download_url())


# # Obtaining the data
# 
# There are a few methods to obtain the data, for example *to_pandas()* and *to_xarray()*:

# In[3]:

df = e.to_pandas(
    index_col='time',
    parse_dates=True,
    skiprows=(1,)  # units information can be dropped.
).dropna()


# In[4]:

df.head()
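# The *to_xarray()* path mentioned above would look roughly like this:

ds = e.to_xarray()
ds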


# # Let's plot the data

# # Exploring an ERDDAP server

# In[5]:
Example #13
    'WindSpeed',
    'WaveHeight',
    'WavePeriod',
    'MeanWaveDirection',
    # 'Hmax',
    # 'AirTemperature',
    'SeaTemperature'
]


url = e.get_download_url()

print(url)

df = e.to_pandas(
    index_col='time (UTC)',
    parse_dates=True).dropna()

df.shape

df.columns

cols = ['AtmosphericPressure', 'WindDirection', 'WindSpeed',
        'WaveHeight', 'WavePeriod', 'MeanWaveDirection', 'SeaTemperature']

# rename columns
df.columns = cols

df['Year'] = df.index.year
df['Month'] = df.index.month
df['Day'] = df.index.day
Example #14
    iframe = '<iframe src="{src}" width="100%" height="950"></iframe>'.format
    return HTML(iframe(src=src))


show_iframe(e.get_download_url(response="html"))

Additionally, the object has `.get_info_url()` and `.get_search_url()`, which can be used to obtain the info and search URLs, respectively.

show_iframe(e.get_info_url(response="html"))

show_iframe(e.get_search_url(response="html"))

`erddapy` also brings some simple methods to download the data in some common data formats, like `pandas.DataFrame` and `xarray.Dataset`.

df = e.to_pandas(index_col="time (UTC)", parse_dates=True,).dropna()


df.head()

ds = e.to_xarray(decode_times=False)

ds["temperature"]

Here is a simple plot using the data from `xarray`.

%matplotlib inline

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
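The plotting cell itself is cut off in this excerpt; a minimal sketch of such a plot, assuming the glider dataset `ds` loaded above also exposes `time` and `depth` variables (note that `decode_times=False` leaves the time axis numeric):

fig, ax = plt.subplots(figsize=(11, 3))
cs = ax.scatter(ds["time"], ds["depth"], c=ds["temperature"], s=10, edgecolor="none")
ax.invert_yaxis()
cbar = fig.colorbar(cs)
cbar.ax.set_ylabel("temperature")
ax.set_ylabel("depth")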
    e = ERDDAP(
        server="http://erddap.aoos.org/erddap/",
        protocol="tabledap"
    )
    e.dataset_id = "kotzebue-alaska-water-level"
    e.constraints = {
        "time>=": "2018-09-05T21:00:00Z",
        "time<=": "2019-07-10T19:00:00Z",
    }
    e.variables = [
        variable_name,
        "time",
        "z",
    ]
    data = e.to_pandas(
        index_col="time (UTC)",
        parse_dates=True,
    )
    data["timestamp"] = data.index.astype("int64") // 1e9
    data.to_csv(fname)

data.head()

from ioos_qc.config import QcConfig


qc = QcConfig(qc_config)

qc_results =  qc.run(
    inp=data["sea_surface_height_above_sea_level_geoid_mhhw (m)"],
    tinp=data["timestamp"],
    zinp=data["z (m)"],
def read_glider_variables_erddap_server(url_erddap,dataset_id,\
                                   lat_lim,lon_lim,\
                                   variable_names=['time'],
                                    **kwargs):
    """
    Created on Tue Nov  3 11:26:05 2020

    @author: aristizabal

    This function reads glider variables from the IOOS
    and Rutgers ERDDAP glider servers.

    Inputs:
    url_erddap: url address of erddap server
                Example: 'https://data.ioos.us/gliders/erddap'
    dataset_id: Example: 'ng231-20190901T0000'
    variable_names: list of variable names.
                    Example:
                            variable_names = ['depth',
                                            'latitude',
                                            'longitude',
                                            'time',
                                            'temperature',
                                            'salinity']
                    The default value is variable_names=['time']
    lat_lim: latitude limits for the search.
            Example, lat_lim = [38.0,40.0]
    lon_lim: longitude limits for the search.
            Example, lon_lim = [-75.0,-72.0]
    date_ini: initial date of time window.
        This function accepts the date formats '%Y-%m-%dT%H:%M:%SZ' and '%Y/%m/%d/%H'.
        Example: date_ini = '2018-08-02T00:00:00Z' or '2018/08/02/00'
    date_end: final date of time window.
        This function accepts the date formats '%Y-%m-%dT%H:%M:%SZ' and '%Y/%m/%d/%H'.
        Example: date_end = '2018-08-10T00:00:00Z' or '2018/08/10/00'

    Outputs:
    df: Pandas data frame with all the variables requested as vectors

    """

    from erddapy import ERDDAP
    import numpy as np

    date_ini = kwargs.get('date_ini', None)
    date_end = kwargs.get('date_end', None)

    # Find time window of interest
    if np.logical_or(date_ini == None, date_end == None):
        constraints = {
            'latitude>=': lat_lim[0],
            'latitude<=': lat_lim[1],
            'longitude>=': lon_lim[0],
            'longitude<=': lon_lim[1],
        }
    else:
        constraints = {
            'time>=': date_ini,
            'time<=': date_end,
            'latitude>=': lat_lim[0],
            'latitude<=': lat_lim[1],
            'longitude>=': lon_lim[0],
            'longitude<=': lon_lim[1],
        }

    e = ERDDAP(server=url_erddap, protocol='tabledap', response='nc')

    e.dataset_id = dataset_id
    e.constraints = constraints
    e.variables = variable_names

    # Converting glider data to data frame
    # Checking that the data frame has data
    df = e.to_pandas()
    if len(df) > 3:

        df = e.to_pandas(parse_dates=True)

    return df
Example #17
    'longitude>=': lon_lim[0],
    'longitude<=': lon_lim[-1],
}

variables = ['time', 'latitude', 'longitude']

#%%

e = ERDDAP(server=server, protocol='tabledap', response='nc')

for id in gliders:
    e.dataset_id = id
    e.constraints = constraints
    e.variables = variables

    df = e.to_pandas(parse_dates=True)

    print(id, df.index[-1])

#%% Reading bathymetry data

ncbath = xr.open_dataset(bath_file)
bath_lat = ncbath.variables['lat'][:]
bath_lon = ncbath.variables['lon'][:]
bath_elev = ncbath.variables['elevation'][:]

oklatbath = np.logical_and(bath_lat >= lat_lim[0], bath_lat <= lat_lim[-1])
oklonbath = np.logical_and(bath_lon >= lon_lim[0], bath_lon <= lon_lim[-1])

bath_latsub = bath_lat[oklatbath]
bath_lonsub = bath_lon[oklonbath]
Example #18
        'time>=': str(x.tm_year)+"-"+str(x.tm_mon).zfill(2)+"-"+str(x.tm_mday).zfill(2)+"T"+str(x.tm_hour).zfill(2)+":00:00Z",
        'longitude>=': -80.0,
        'longitude<=': 80.0,
        'platform_type=': "DRIFTING BUOYS (GENERIC)",
        'platform_code=': str(wmo_mb[i]),
    }        
    e.variables = [
        'platform_code',
        'time',
        'latitude',
        'longitude',
        'sst',
        'slp',
    ]
    try:
        df = e.to_pandas()
    except:
        print("Não há dados para o WMO "+str(wmo_mb[i]))

    try:       
        df.columns = ['id', 'tempo','lat','lon','sst','pres']

        df.id=sat[i]
        
        dateparse = lambda x: pd.to_datetime(x, format='%Y-%m-%dT%H:%M:%SZ')
        
        df['tempo'] = pd.to_datetime(df['tempo'])
        
        df = df.set_index('tempo')
        
        df['lat']=df.lat.round(4)
Example #19
class NDBC():
    def __init__(self, station_id, deploy_id, WMO, currentTime, startTime,
                 data_map, name_map):
        self.station_id = station_id
        self.deploy_id = deploy_id
        self.WMO = WMO
        self.now = currentTime
        self.startTime = startTime
        self.data_map = data_map
        self.name_map = name_map

    def adjust_pressure_to_sea_level(self, pres, temp, height):
        """Adjust barometric presure to sea-level."""
        temp = temp + 273.15
        slp = pres / np.exp(-height / (temp * 29.263))
        return slp

    def calculate_wind_speed(self, eastward, northward):
        """Calculate absolute wind speed from component wind vector."""
        u = np.square(eastward)
        v = np.square(northward)
        wind_speed = np.sqrt(u + v)
        return wind_speed

    def calculate_wind_direction(self, eastward, northward):
        """Calculate met wind direction from component wind vectors."""
        u = eastward
        v = northward
        wind_direction = 180 / np.pi * np.arctan2(-u, -v)
        return wind_direction

    def _connect_erddap(self,
                        server="http://ooivm1.whoi.net/erddap",
                        protocol="tabledap"):
        """Connect to the erddap server."""
        self._erddap = ERDDAP(server=server, protocol=protocol)

    def list_datasets(self):
        """Get the available datasets for the ERDDAP server."""
        # First, make the connection
        self._connect_erddap()
        # Next, get the datasets
        datasets = pd.read_csv(
            self._erddap.get_search_url(search_for=self.station_id,
                                        response='csv'))['Dataset ID']
        return datasets

    def get_dataset(self, dataset):
        """Get the data for specified datasets."""
        # First, have to re-establish the erddap connection
        self._connect_erddap()

        # Next, get the data for a dataset
        self._erddap.dataset_id = dataset

        # Only want the variables with standard names
        variables = self._erddap.get_var_by_attr(
            standard_name=lambda v: v is not None)
        self._erddap.variables = variables

        # Limit the data request to the current deployment
        self._erddap.constraints = {
            'deploy_id=': self.deploy_id,
            'time>=': self.startTime.strftime('%Y-%m-%dT%H:%M:%SZ')
        }

        try:
            # Download the data
            data = self._erddap.to_pandas(index_col='time (UTC)',
                                          parse_dates=True)

            # Sometimes it just returns an empty dataframe instead of an error
            if data.size == 0:
                data = self._create_empty_dataset()

        except:
            # If there is no available data in the requested time window, need
            # to create an empty dataframe of the data
            data = self._create_empty_dataset()

        # Return the dataset data
        return data

    def process_METBK_data(self, df, freq='10T'):
        """Process the METBK into the correct format and values for NDBC."""
        # Resample the data
        df_binned = df.resample(freq).mean()

        # Check that barometric pressure
        if 'barometric_pressure (mbar)' in df_binned.columns:
            # Adjust the barometric pressure to sea-level
            df_binned[
                'sea_level_pressure (hPa)'] = self.adjust_pressure_to_sea_level(
                    df_binned['barometric_pressure (mbar)'],
                    df_binned['air_temperature (degree_Celsius)'], 4.05)
        else:
            df_binned['sea_level_pressure (hPa)'] = np.nan

        # Check that the wind vector components are in the dataframe
        if 'eastward_wind_velocity (m s-1)' in df_binned.columns:
            # Calculate the wind speed
            df_binned['wind speed (m/s)'] = self.calculate_wind_speed(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])

            # Calculate the wind direction
            df_binned['wind direction'] = self.calculate_wind_direction(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])
            df_binned['wind direction'] = df_binned["wind direction"].apply(
                lambda x: x + 360 if x < 0 else x)

            # Don't need cardinal direction -> want direction in degrees
            # df_binned["wind direction"] = df_binned["wind direction"].apply(
            #   lambda x: self.get_cardinal_direction(np.round(x, decimals=2)))
        else:
            df_binned['wind speed (m/s)'] = np.nan
            df_binned['wind direction'] = np.nan

        # Return the processed data
        return df_binned

    def process_WAVSS_data(self, df, freq='10T'):
        """Much simpler function for processing the WAVSS data."""
        # Resample the data
        df_binned = df.resample(freq).mean()

        # Return the data
        return df_binned

    def _create_empty_dataset(self):
        """
        Create a dataset of all nans if there is no data available for
        the requested dataset in the given time period.
        """
        # Get the units for the corresponding variables
        info_url = self._erddap.get_info_url(
            dataset_id=self._erddap.dataset_id, response='csv')
        info = pd.read_csv(info_url)
        units = info[info['Attribute Name'] == 'units']

        # Now, add the units to the variable names
        columns = []
        for var in self._erddap.variables:
            unit = units[units['Variable Name'] == var]['Value'].values
            if len(unit) == 0:
                columns.append(f'{var}')
            elif var == 'time':
                pass
            else:
                columns.append(f'{var} ({unit[0]})')

        # Create an array of nans to fill out the empty dataframe
        empty_array = np.empty((2, len(columns)))
        empty_array[:] = np.nan

        # Put the empty array into a dataframe
        empty_df = pd.DataFrame(data=empty_array,
                                columns=columns,
                                index=[self.startTime, self.now])
        empty_df.index.name = 'time (UTC)'

        return empty_df

    def process_datasets(self, datasets):
        """Process the data for individual datasets."""
        self.datasets = datasets

        # Get the data for the individual datasets
        for dset in self.datasets.keys():
            self.datasets.update({dset: self.get_dataset(dset)})

        # Process the data
        for dset in self.datasets.keys():
            if 'METBK' in dset:
                self.datasets[dset] = self.process_METBK_data(
                    self.datasets[dset])
            else:
                self.datasets[dset] = self.process_WAVSS_data(
                    self.datasets[dset])

        # Add a header to the data in the datasets
        for key in self.datasets.keys():
            header = key.split('-', 2)[-1]
            for col in self.datasets.get(key).columns:
                self.datasets.get(key).rename(
                    columns={col: ' '.join((header, col))}, inplace=True)

    def parse_data_to_xml(self, data):
        """
        Function which takes in the 10-minute average buoy data,
        the station name, and two dictionaries which map the buoy
        column names to the xml tags, and outputs an xml file in
        the NDBC format.

        Returns:
            xml - a properly constructed xml file in the NDBC
            format for the given buoy data
        """

        # Start the xml file
        xml = ['<?xml version="1.0" encoding="ISO-8859-1"?>']

        # Iterate through the data
        for index in data.index:

            # Get the data associated with a row in the dataframe
            row = data.loc[index]

            # Reset a dictionary of the data
            xml_data = {}
            for key in self.data_map.keys():
                xml_data.update({key: self.data_map.get(key)})

            # Parse the data into the data dictionary
            for key in xml_data.keys():
                # Get the column name which corresponds to the ndbc tag
                column = self.name_map.get(key)
                # Check that the column was returned from the ERDDAP server
                if column in row.index:
                    value = row[column]
                    # If a nan, just leave it the default -9999
                    if str(value) == 'nan':
                        pass
                    else:
                        xml_data[key] = value
                # If no data, leave it as default -9999
                else:
                    pass

            # Write the parsed data to the xml file
            # Start the message
            xml.append('<message>')

            # Add in the station id
            xml.append(f'  <station>{self.WMO}</station>')

            # Get the time index
            time = row.name.strftime('%m/%d/%Y %H:%M:%S')
            xml.append(f'  <date>{time}</date>')

            # Missing fill value
            missing = str(-9999)
            xml.append(f'  <missing>{missing}</missing>')

            # Roundtime
            xml.append('  <roundtime>no</roundtime>')

            # Start of the data
            xml.append('  <met>')

            # Add in each data piece
            for tag in xml_data.keys():
                # Get the value
                value = xml_data.get(tag)
                value = str(value)
                # Add the data to the xml file
                xml.append(f'    <{tag}>{value}</{tag}>')

            # Finish off the message
            xml.append('  </met>')
            xml.append('</message>')

        # Return the results
        return xml
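A minimal sketch of how the class above might be driven (assuming the same module-level imports as the original source); the station, deployment, and WMO identifiers below are placeholders, and the `data_map`/`name_map` contents depend on the NDBC XML tags used downstream:

from datetime import datetime, timedelta

ndbc = NDBC(
    station_id='CP01CNSM',   # placeholder station identifier
    deploy_id='D0013',       # placeholder deployment identifier
    WMO='44000',             # placeholder WMO id
    currentTime=datetime.utcnow(),
    startTime=datetime.utcnow() - timedelta(hours=6),
    data_map={},             # placeholder: NDBC tag -> default value
    name_map={},             # placeholder: NDBC tag -> ERDDAP column name
)
datasets = ndbc.list_datasets()
print(datasets)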
Example #20
from datetime import date

from erddapy import ERDDAP

server = "http://osmc.noaa.gov/erddap"
e = ERDDAP(server=server, protocol="tabledap")

e.dataset_id = "ioos_obs_counts"
e.variables = ["time", "locationID", "region", "sponsor", "met", "wave"]
e.constraints = {
    "time>=": "2019-09",
    "time<": "2020-11",
}

df = e.to_pandas(parse_dates=True)

df["locationID"] = df["locationID"].str.lower()

df.tail()

The table has all the ingest data for the time window requested above. We can now explore it by grouping the data by IOOS Regional Association (RA).

groups = df.groupby("region")

ax = groups.sum().plot(kind="bar", figsize=(11, 3.75))
ax.yaxis.get_major_formatter().set_scientific(False)
ax.set_ylabel("# observations");

Let us check the monthly sum of data released, both for the individual met and wave observations and for the totals.
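A minimal sketch of that monthly aggregation, assuming the `met` and `wave` columns are numeric counts and that the `time (UTC)` column parses as datetimes:

df["time (UTC)"] = pd.to_datetime(df["time (UTC)"])
monthly = df.set_index("time (UTC)").resample("MS")[["met", "wave"]].sum()
monthly["total"] = monthly["met"] + monthly["wave"]
monthly.plot(kind="bar", figsize=(11, 3.75))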
Example #21
def active_gliders(bbox=None, time_start=None, time_end=None, glider_id=None):
    bbox = bbox or [-100, -40, 18, 60]
    time_end = time_end or dt.date.today()
    time_start = time_start or (time_end - dt.timedelta(days=1))
    t0 = time_start.strftime('%Y-%m-%dT%H:%M:%SZ')
    t1 = time_end.strftime('%Y-%m-%dT%H:%M:%SZ')
    glider_id = glider_id or None

    e = ERDDAP(server='NGDAC')

    # Grab every dataset available
    # datasets = pd.read_csv(e.get_search_url(response='csv', search_for='all'))

    # Search constraints
    kw = dict()
    kw['min_time'] = t0
    kw['max_time'] = t1

    if bbox:
        kw['min_lon'] = bbox[0]
        kw['max_lon'] = bbox[1]
        kw['min_lat'] = bbox[2]
        kw['max_lat'] = bbox[3]

    if glider_id:
        search = glider_id
    else:
        search = None

    search_url = e.get_search_url(search_for=search, response='csv', **kw)

    try:
        # Grab the results
        search = pd.read_csv(search_url)
    except:
        # return empty dataframe if there are no results
        return pd.DataFrame()

    # Extract the IDs
    gliders = search['Dataset ID'].values

    msg = 'Found {} Glider Datasets:\n\n{}'.format
    print(msg(len(gliders), '\n'.join(gliders)))

    # Setting constraints
    constraints = {
            'time>=': t0,
            'time<=': t1,
            'longitude>=': bbox[0],
            'longitude<=': bbox[1],
            'latitude>=': bbox[2],
            'latitude<=': bbox[3],
            }

    variables = [
            'depth',
            'latitude',
            'longitude',
            'time',
            'temperature',
            'salinity',
            ]

    e = ERDDAP(
            server='NGDAC',
            protocol='tabledap',
            response='nc'
    )

    glider_dfs = []

    for id in gliders:
        # print('Reading ' + id)
        e.dataset_id = id
        e.constraints = constraints
        e.variables = variables

        # checking data frame is not empty
        try:
            df = e.to_pandas(
                index_col='time (UTC)',
                parse_dates=True,
                skiprows=(1,)  # units information can be dropped.
            ).dropna()
        except:
            continue
        df = df.reset_index()
        df['dataset_id'] = id
        df = df.set_index(['dataset_id', 'time (UTC)'])
        glider_dfs.append(df)

    try:
        ndf = pd.concat(glider_dfs)
    except ValueError:
        return pd.DataFrame()

    return ndf
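A minimal usage sketch for the function above (assuming the same module-level imports as the original source):

# Hypothetical call: gliders reporting in the default box over the last day.
gliders_df = active_gliders()
if not gliders_df.empty:
    print(gliders_df.index.get_level_values('dataset_id').unique())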
Example #22
def GOFS_RTOFS_vs_Argo_floats(lon_forec_track, lat_forec_track, lon_forec_cone,
                              lat_forec_cone, lon_best_track, lat_best_track,
                              lon_lim, lat_lim, folder_fig):
    #%% User input

    #GOFS3.1 output model location
    url_GOFS_ts = 'http://tds.hycom.org/thredds/dodsC/GLBy0.08/expt_93.0/ts3z'

    # RTOFS files
    folder_RTOFS = '/home/coolgroup/RTOFS/forecasts/domains/hurricanes/RTOFS_6hourly_North_Atlantic/'

    nc_files_RTOFS = ['rtofs_glo_3dz_f006_6hrly_hvr_US_east.nc',\
                      'rtofs_glo_3dz_f012_6hrly_hvr_US_east.nc',\
                      'rtofs_glo_3dz_f018_6hrly_hvr_US_east.nc',\
                      'rtofs_glo_3dz_f024_6hrly_hvr_US_east.nc']

    # COPERNICUS MARINE ENVIRONMENT MONITORING SERVICE (CMEMS)
    url_cmems = 'http://nrt.cmems-du.eu/motu-web/Motu'
    service_id = 'GLOBAL_ANALYSIS_FORECAST_PHY_001_024-TDS'
    product_id = 'global-analysis-forecast-phy-001-024'
    depth_min = '0.493'
    out_dir = '/home/aristizabal/crontab_jobs'

    # Bathymetry file
    #bath_file = '/Users/aristizabal/Desktop/MARACOOS_project/Maria_scripts/nc_files/GEBCO_2014_2D_-100.0_0.0_-60.0_45.0.nc'
    bath_file = '/home/aristizabal/bathymetry_files/GEBCO_2014_2D_-100.0_0.0_-10.0_50.0.nc'

    # Argo floats
    url_Argo = 'http://www.ifremer.fr/erddap'

    #%%

    from matplotlib import pyplot as plt
    import numpy as np
    import xarray as xr
    import netCDF4
    from datetime import datetime, timedelta
    import cmocean
    import matplotlib.dates as mdates
    from erddapy import ERDDAP
    import pandas as pd
    import os

    # Do not produce figures on screen
    plt.switch_backend('agg')

    # Increase fontsize of labels globally
    plt.rc('xtick', labelsize=14)
    plt.rc('ytick', labelsize=14)
    plt.rc('legend', fontsize=14)

    #%% Reading bathymetry data

    ncbath = xr.open_dataset(bath_file)
    bath_lat = ncbath.variables['lat'][:]
    bath_lon = ncbath.variables['lon'][:]
    bath_elev = ncbath.variables['elevation'][:]

    oklatbath = np.logical_and(bath_lat >= lat_lim[0], bath_lat <= lat_lim[-1])
    oklonbath = np.logical_and(bath_lon >= lon_lim[0], bath_lon <= lon_lim[-1])

    bath_latsub = bath_lat[oklatbath]
    bath_lonsub = bath_lon[oklonbath]
    bath_elevs = bath_elev[oklatbath, :]
    bath_elevsub = bath_elevs[:, oklonbath]

    #%% Get time bounds for current day
    #ti = datetime.today()
    ti = datetime.today() - timedelta(1) - timedelta(hours=6)
    tini = datetime(ti.year, ti.month, ti.day)
    te = ti + timedelta(2)
    tend = datetime(te.year, te.month, te.day)

    #%% Look for Argo datasets

    e = ERDDAP(server=url_Argo)

    # Grab every dataset available
    #datasets = pd.read_csv(e.get_search_url(response='csv', search_for='all'))

    kw = {
        'min_lon': lon_lim[0],
        'max_lon': lon_lim[1],
        'min_lat': lat_lim[0],
        'max_lat': lat_lim[1],
        'min_time': str(tini),
        'max_time': str(tend),
    }

    search_url = e.get_search_url(response='csv', **kw)

    # Grab the results
    search = pd.read_csv(search_url)

    # Extract the IDs
    dataset = search['Dataset ID'].values

    msg = 'Found {} Datasets:\n\n{}'.format
    print(msg(len(dataset), '\n'.join(dataset)))

    dataset_type = dataset[0]

    constraints = {
        'time>=': str(tini),
        'time<=': str(tend),
        'latitude>=': lat_lim[0],
        'latitude<=': lat_lim[1],
        'longitude>=': lon_lim[0],
        'longitude<=': lon_lim[1],
    }

    variables = [
        'platform_number',
        'time',
        'pres',
        'longitude',
        'latitude',
        'temp',
        'psal',
    ]

    e = ERDDAP(server=url_Argo, protocol='tabledap', response='nc')

    e.dataset_id = dataset_type
    e.constraints = constraints
    e.variables = variables

    print(e.get_download_url())

    df = e.to_pandas(
        parse_dates=True,
        skiprows=(1, )  # units information can be dropped.
    ).dropna()

    argo_ids = np.asarray(df['platform_number'])
    argo_times = np.asarray(df['time (UTC)'])
    argo_press = np.asarray(df['pres (decibar)'])
    argo_lons = np.asarray(df['longitude (degrees_east)'])
    argo_lats = np.asarray(df['latitude (degrees_north)'])
    argo_temps = np.asarray(df['temp (degree_Celsius)'])
    argo_salts = np.asarray(df['psal (PSU)'])

    #%% GOFS 3.1

    try:
        GOFS_ts = xr.open_dataset(url_GOFS_ts, decode_times=False)

        lt_GOFS = np.asarray(GOFS_ts['lat'][:])
        ln_GOFS = np.asarray(GOFS_ts['lon'][:])
        tt = GOFS_ts['time']
        t_GOFS = netCDF4.num2date(tt[:], tt.units)
        depth_GOFS = np.asarray(GOFS_ts['depth'][:])
    except Exception as err:
        print(err)
        GOFS_ts = np.nan
        lt_GOFS = np.nan
        ln_GOFS = np.nan
        depth_GOFS = np.nan
        t_GOFS = ti

    #%% Map Argo floats

    lev = np.arange(-9000, 9100, 100)
    plt.figure()
    plt.contourf(bath_lonsub,
                 bath_latsub,
                 bath_elevsub,
                 lev,
                 cmap=cmocean.cm.topo)
    plt.plot(lon_forec_track, lat_forec_track, '.-', color='gold')
    plt.plot(lon_forec_cone, lat_forec_cone, '.-b', markersize=1)
    plt.plot(lon_best_track, lat_best_track, 'or', markersize=3)

    argo_idd = np.unique(argo_ids)
    for i, id in enumerate(argo_idd):
        okind = np.where(argo_ids == id)[0]
        plt.plot(np.unique(argo_lons[okind]),
                 np.unique(argo_lats[okind]),
                 's',
                 color='darkorange',
                 markersize=5,
                 markeredgecolor='k')

    plt.title('Argo Floats ' + str(tini)[0:13] + '-' + str(tend)[0:13],
              fontsize=16)
    plt.axis('scaled')
    plt.xlim(lon_lim[0], lon_lim[1])
    plt.ylim(lat_lim[0], lat_lim[1])

    file = folder_fig + 'ARGO_lat_lon'
    #file = folder_fig + 'ARGO_lat_lon_' + str(np.unique(argo_times)[0])[0:10]
    plt.savefig(file, bbox_inches='tight', pad_inches=0.1)

    #%% Figure argo float vs GOFS and vs RTOFS

    argo_idd = np.unique(argo_ids)

    for i, id in enumerate(argo_idd):
        print(id)
        okind = np.where(argo_ids == id)[0]
        argo_time = np.asarray([
            datetime.strptime(t, '%Y-%m-%dT%H:%M:%SZ')
            for t in argo_times[okind]
        ])

        argo_lon = argo_lons[okind]
        argo_lat = argo_lats[okind]
        argo_pres = argo_press[okind]
        argo_temp = argo_temps[okind]
        argo_salt = argo_salts[okind]

        # GOFS
        print('Retrieving variables from GOFS')
        if isinstance(GOFS_ts, float):
            temp_GOFS = np.nan
            salt_GOFS = np.nan
        else:
            #oktt_GOFS = np.where(t_GOFS >= argo_time[0])[0][0]
            ttGOFS = np.asarray([
                datetime(t_GOFS[i].year, t_GOFS[i].month, t_GOFS[i].day,
                         t_GOFS[i].hour) for i in np.arange(len(t_GOFS))
            ])
            tstamp_GOFS = [
                mdates.date2num(ttGOFS[i]) for i in np.arange(len(ttGOFS))
            ]
            oktt_GOFS = np.unique(
                np.round(
                    np.interp(mdates.date2num(argo_time[0]), tstamp_GOFS,
                              np.arange(len(tstamp_GOFS)))).astype(int))[0]
            oklat_GOFS = np.where(lt_GOFS >= argo_lat[0])[0][0]
            oklon_GOFS = np.where(ln_GOFS >= argo_lon[0] + 360)[0][0]
            temp_GOFS = np.asarray(GOFS_ts['water_temp'][oktt_GOFS, :,
                                                         oklat_GOFS,
                                                         oklon_GOFS])
            salt_GOFS = np.asarray(GOFS_ts['salinity'][oktt_GOFS, :,
                                                       oklat_GOFS, oklon_GOFS])

        # RTOFS
        #Time window
        year = int(argo_time[0].year)
        month = int(argo_time[0].month)
        day = int(argo_time[0].day)
        tini = datetime(year, month, day)
        tend = tini + timedelta(days=1)

        # Read RTOFS grid and time
        print('Retrieving coordinates from RTOFS')

        if tini.month < 10:
            if tini.day < 10:
                fol = 'rtofs.' + str(tini.year) + '0' + str(
                    tini.month) + '0' + str(tini.day)
            else:
                fol = 'rtofs.' + str(tini.year) + '0' + str(tini.month) + str(
                    tini.day)
        else:
            if tini.day < 10:
                fol = 'rtofs.' + str(tini.year) + str(tini.month) + '0' + str(
                    tini.day)
            else:
                fol = 'rtofs.' + str(tini.year) + str(tini.month) + str(
                    tini.day)

        ncRTOFS = xr.open_dataset(folder_RTOFS + fol + '/' + nc_files_RTOFS[0])
        latRTOFS = np.asarray(ncRTOFS.Latitude[:])
        lonRTOFS = np.asarray(ncRTOFS.Longitude[:])
        depth_RTOFS = np.asarray(ncRTOFS.Depth[:])

        tRTOFS = []
        for t in np.arange(len(nc_files_RTOFS)):
            ncRTOFS = xr.open_dataset(folder_RTOFS + fol + '/' +
                                      nc_files_RTOFS[t])
            tRTOFS.append(np.asarray(ncRTOFS.MT[:])[0])

        tRTOFS = np.asarray([mdates.num2date(mdates.date2num(tRTOFS[t])) \
                  for t in np.arange(len(nc_files_RTOFS))])

        oktt_RTOFS = np.where(
            mdates.date2num(tRTOFS) >= mdates.date2num(argo_time[0]))[0][0]
        oklat_RTOFS = np.where(latRTOFS[:, 0] >= argo_lat[0])[0][0]
        oklon_RTOFS = np.where(lonRTOFS[0, :] >= argo_lon[0])[0][0]

        nc_file = folder_RTOFS + fol + '/' + nc_files_RTOFS[oktt_RTOFS]
        ncRTOFS = xr.open_dataset(nc_file)
        #time_RTOFS = tRTOFS[oktt_RTOFS]
        temp_RTOFS = np.asarray(ncRTOFS.variables['temperature'][0, :,
                                                                 oklat_RTOFS,
                                                                 oklon_RTOFS])
        salt_RTOFS = np.asarray(ncRTOFS.variables['salinity'][0, :,
                                                              oklat_RTOFS,
                                                              oklon_RTOFS])
        #lon_RTOFS = lonRTOFS[0,oklon_RTOFS]
        #lat_RTOFS = latRTOFS[oklat_RTOFS,0]

        # Downloading and reading Copernicus output
        motuc = 'python -m motuclient --motu ' + url_cmems + \
        ' --service-id ' + service_id + \
        ' --product-id ' + product_id + \
        ' --longitude-min ' + str(argo_lon[0]-2/12) + \
        ' --longitude-max ' + str(argo_lon[0]+2/12) + \
        ' --latitude-min ' + str(argo_lat[0]-2/12) + \
        ' --latitude-max ' + str(argo_lat[0]+2/12) + \
        ' --date-min ' + '"' + str(tini-timedelta(0.5)) + '"' + \
        ' --date-max ' + '"' + str(tend+timedelta(0.5)) + '"' + \
        ' --depth-min ' + depth_min + \
        ' --depth-max ' + str(np.nanmax(argo_pres)+1000) + \
        ' --variable ' + 'thetao' + ' ' + \
        ' --variable ' + 'so'  + ' ' + \
        ' --out-dir ' + out_dir + \
        ' --out-name ' + str(id) + '.nc' + ' ' + \
        ' --user ' + 'maristizabalvar' + ' ' + \
        ' --pwd ' +  'MariaCMEMS2018'

        os.system(motuc)
        # Check if file was downloaded

        COP_file = out_dir + '/' + str(id) + '.nc'
        # Check if file was downloaded
        resp = os.system('ls ' + out_dir + '/' + str(id) + '.nc')
        if resp == 0:
            COP = xr.open_dataset(COP_file)

            latCOP = np.asarray(COP.latitude[:])
            lonCOP = np.asarray(COP.longitude[:])
            depth_COP = np.asarray(COP.depth[:])
            tCOP = np.asarray(mdates.num2date(mdates.date2num(COP.time[:])))
        else:
            latCOP = np.empty(1)
            latCOP[:] = np.nan
            lonCOP = np.empty(1)
            lonCOP[:] = np.nan
            tCOP = np.empty(1)
            tCOP[:] = np.nan

        oktimeCOP = np.where(
            mdates.date2num(tCOP) >= mdates.date2num(tini))[0][0]
        oklonCOP = np.where(lonCOP >= argo_lon[0])[0][0]
        oklatCOP = np.where(latCOP >= argo_lat[0])[0][0]

        temp_COP = np.asarray(COP.variables['thetao'][oktimeCOP, :, oklatCOP,
                                                      oklonCOP])
        salt_COP = np.asarray(COP.variables['so'][oktimeCOP, :, oklatCOP,
                                                  oklonCOP])

        # Figure temp
        plt.figure(figsize=(5, 6))
        plt.plot(argo_temp,
                 -argo_pres,
                 '.-',
                 linewidth=2,
                 label='ARGO Float id ' + str(id))
        plt.plot(temp_GOFS,
                 -depth_GOFS,
                 '.-',
                 linewidth=2,
                 label='GOFS 3.1',
                 color='red')
        plt.plot(temp_RTOFS,
                 -depth_RTOFS,
                 '.-',
                 linewidth=2,
                 label='RTOFS',
                 color='g')
        plt.plot(temp_COP,
                 -depth_COP,
                 '.-',
                 linewidth=2,
                 label='Copernicus',
                 color='darkorchid')
        plt.ylim([-1000, 0])
        plt.title('Temperature Profile on '+ str(argo_time[0])[0:13] +
                  '\n [lon,lat] = [' \
                  + str(np.round(argo_lon[0],3)) +',' +\
                      str(np.round(argo_lat[0],3))+']',\
                      fontsize=16)
        plt.ylabel('Depth (m)', fontsize=14)
        plt.xlabel('$^oC$', fontsize=14)
        plt.legend(loc='lower right', fontsize=14)

        file = folder_fig + 'ARGO_vs_GOFS_RTOFS_COP_temp_' + str(id)
        plt.savefig(file, bbox_inches='tight', pad_inches=0.1)

        # Figure salt
        plt.figure(figsize=(5, 6))
        plt.plot(argo_salt,
                 -argo_pres,
                 '.-',
                 linewidth=2,
                 label='ARGO Float id ' + str(id))
        plt.plot(salt_GOFS,
                 -depth_GOFS,
                 '.-',
                 linewidth=2,
                 label='GOFS 3.1',
                 color='red')
        plt.plot(salt_RTOFS,
                 -depth_RTOFS,
                 '.-',
                 linewidth=2,
                 label='RTOFS',
                 color='g')
        plt.plot(salt_COP,
                 -depth_COP,
                 '.-',
                 linewidth=2,
                 label='Copernicus',
                 color='darkorchid')
        plt.ylim([-1000, 0])
        plt.title('Salinity Profile on '+ str(argo_time[0])[0:13] +
                  '\n [lon,lat] = [' \
                  + str(np.round(argo_lon[0],3)) +',' +\
                      str(np.round(argo_lat[0],3))+']',\
                      fontsize=16)
        plt.ylabel('Depth (m)', fontsize=14)
        plt.legend(loc='lower right', fontsize=14)

        file = folder_fig + 'ARGO_vs_GOFS_RTOFS_COP_salt_' + str(id)
        plt.savefig(file, bbox_inches='tight', pad_inches=0.1)
info_df[info_df['Row Type'] == 'variable']

# Take a look at the variables with standard names:

variables = erd.get_var_by_attr(standard_name=lambda v: v is not None)
variables

# These are the standard variables for the CTDBP instrument - specifically for the CP01CNSM-NSIF-CTDBP. Next, let's query the server for _all_ available data from the CP01CNSM-NSIF-CTDBP.

erd.variables = variables

erd.get_download_url()

# Put it all into a dataframe:

data = erd.to_pandas()

# +
# Plot a basic time-series of the conductivity 
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="darkgrid")
# -

data[data['time (UTC)'].isnull()]

data['time (UTC)'] = data['time (UTC)'].apply(lambda x: pd.to_datetime(x))

data.set_index(keys='time (UTC)', inplace=True)
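# A sketch of the conductivity time series mentioned above; the exact column
# name (with units) depends on the dataset and is assumed here.

fig, ax = plt.subplots(figsize=(12, 4))
data['sea_water_electrical_conductivity (S m-1)'].plot(ax=ax)
ax.set_ylabel('conductivity (S m-1)')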
def read_glider_data_erddap_Rutgers_server(url_erddap,dataset_id,\
                                   lat_lim,lon_lim,scatter_plot,**kwargs):

    from erddapy import ERDDAP
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    import cmocean
    import numpy as np

    date_ini = kwargs.get('date_ini', None)
    date_end = kwargs.get('date_end', None)

    # Find time window of interest
    if np.logical_or(date_ini == None, date_end == None):
        constraints = {
            'latitude>=': lat_lim[0],
            'latitude<=': lat_lim[1],
            'longitude>=': lon_lim[0],
            'longitude<=': lon_lim[1],
        }
    else:
        constraints = {
            'time>=': date_ini,
            'time<=': date_end,
            'latitude>=': lat_lim[0],
            'latitude<=': lat_lim[1],
            'longitude>=': lon_lim[0],
            'longitude<=': lon_lim[1],
        }

    variables = [
        'depth', 'latitude', 'longitude', 'time', 'temperature', 'salinity'
    ]

    e = ERDDAP(server=url_erddap, protocol='tabledap', response='nc')

    e.dataset_id = dataset_id
    e.constraints = constraints
    e.variables = variables

    # Converting glider data to data frame
    # Checking that the data frame has data
    df = e.to_pandas()
    if len(df) != 0:

        df = e.to_pandas(
            index_col='time (UTC)',
            parse_dates=True,
            skiprows=(1, )  # units information can be dropped.
        ).dropna()

    dg = df['depth (m)'].values
    tg = df.index.values
    # latitude/longitude are needed for the return value below
    latg = df['latitude (degrees_north)'].values
    long = df['longitude (degrees_east)'].values
    vg1 = df[df.columns[3]].values
    vg2 = df[df.columns[4]].values

    upcast = np.where(np.diff(dg) < 0)[0]
    oku = np.where(np.diff(upcast) > 1)[0]
    end_upcast = upcast[oku]

    downcast = np.where(np.diff(dg) > 0)[0]
    okd = np.where(np.diff(downcast) > 1)[0]
    end_downcast = downcast[okd]

    ind = np.hstack(
        [0, np.unique(np.hstack([end_upcast, end_downcast])),
         len(dg)])
    zn = np.max(np.diff(ind))

    depthg = np.empty((zn, len(ind)))
    depthg[:] = np.nan
    timeg = np.empty((zn, len(ind)))
    timeg[:] = np.nan
    tempg = np.empty((zn, len(ind)))
    tempg[:] = np.nan
    saltg = np.empty((zn, len(ind)))
    saltg[:] = np.nan

    for i in np.arange(len(ind)):
        if i == 0:
            indd = np.argsort(dg[ind[i]:ind[i + 1] + 2])
            depthg[0:len(dg[ind[i]:ind[i + 1] + 2]),
                   i] = dg[ind[i]:ind[i + 1] + 2][indd]
            timeg[0:len(dg[ind[i]:ind[i + 1] + 2]),
                  i] = mdates.date2num(tg[ind[i]:ind[i + 1] + 2][indd])
            tempg[0:len(vg1[ind[i]:ind[i + 1] + 2]),
                  i] = vg1[ind[i]:ind[i + 1] + 2][indd]
            saltg[0:len(vg2[ind[i]:ind[i + 1] + 2]),
                  i] = vg2[ind[i]:ind[i + 1] + 2][indd]
        elif i < len(ind) - 1:
            indd = np.argsort(dg[ind[i] + 1:ind[i + 1] + 2])
            depthg[0:len(dg[ind[i] + 1:ind[i + 1] + 2]),
                   i] = dg[ind[i] + 1:ind[i + 1] + 2][indd]
            timeg[0:len(dg[ind[i] + 1:ind[i + 1] + 2]),
                  i] = mdates.date2num(tg[ind[i] + 1:ind[i + 1] + 2][indd])
            tempg[0:len(vg1[ind[i] + 1:ind[i + 1] + 2]),
                  i] = vg1[ind[i] + 1:ind[i + 1] + 2][indd]
            saltg[0:len(vg2[ind[i] + 1:ind[i + 1] + 2]),
                  i] = vg2[ind[i] + 1:ind[i + 1] + 2][indd]
        else:
            indd = np.argsort(dg[ind[i] + 1:len(dg)])
            depthg[0:len(dg[ind[i] + 1:len(dg)]),
                   i] = dg[ind[i] + 1:len(dg)][indd]
            timeg[0:len(dg[ind[i] + 1:len(dg)]),
                  i] = mdates.date2num(tg[ind[i] + 1:len(dg)][indd])
            tempg[0:len(vg1[ind[i] + 1:len(vg1)]),
                  i] = vg1[ind[i] + 1:len(vg1)][indd]
            saltg[0:len(vg2[ind[i] + 1:len(vg2)]),
                  i] = vg2[ind[i] + 1:len(vg2)][indd]

    # Scatter plot
    if scatter_plot == 'yes':

        color_map = cmocean.cm.thermal
        varg = tempg
        #timeg_matrix = np.tile(timeg.T,(depthg.shape[0],1))
        ttg = np.ravel(timeg)
        dg = np.ravel(depthg)
        teg = np.ravel(varg)

        kw = dict(c=teg, marker='*', edgecolor='none')

        fig, ax = plt.subplots(figsize=(10, 3))
        cs = ax.scatter(ttg, -dg, cmap=color_map, **kw)
        #fig.colorbar(cs)
        ax.set_xlim(np.nanmin(ttg), np.nanmax(ttg))

        ax.set_ylabel('Depth (m)', fontsize=14)
        cbar = plt.colorbar(cs)
        cbar.ax.set_ylabel('Temperature ($^oC$)', fontsize=14)
        ax.set_title(dataset_id, fontsize=16)
        xfmt = mdates.DateFormatter('%H:%Mh\n%d-%b')
        ax.xaxis.set_major_formatter(xfmt)
        plt.ylim([-np.nanmax(dg), 0])

        color_map = cmocean.cm.haline
        varg = saltg
        #timeg_matrix = np.tile(timeg.T,(depthg.shape[0],1))
        ttg = np.ravel(timeg)
        dg = np.ravel(depthg)
        teg = np.ravel(varg)

        kw = dict(c=teg, marker='*', edgecolor='none')

        fig, ax = plt.subplots(figsize=(10, 3))
        cs = ax.scatter(ttg, -dg, cmap=color_map, **kw)
        #fig.colorbar(cs)
        ax.set_xlim(np.nanmin(ttg), np.nanmax(ttg))

        ax.set_ylabel('Depth (m)', fontsize=14)
        cbar = plt.colorbar(cs)
        cbar.ax.set_ylabel('Salinity', fontsize=14)
        ax.set_title(dataset_id, fontsize=16)
        xfmt = mdates.DateFormatter('%H:%Mh\n%d-%b')
        ax.xaxis.set_major_formatter(xfmt)
        plt.ylim([-np.nanmax(dg), 0])

    return tempg, saltg, timeg, latg, long, depthg
def read_glider_data_erddap_server(url_erddap, dataset_id,
                                   lat_lim, lon_lim, scatter_plot,
                                   **kwargs):
    """
    Created on Tue Feb  5 10:05:37 2019

    @author: aristizabal

    This function reads glider data from the IOOS
    Data Assembly Center (DAC).

    Inputs:
    url_erddap: url address of erddap server
                Example: 'https://data.ioos.us/gliders/erddap'
    dataset_id: this id is retrieved from the glider DAC using the
               function "retrieve_glider_id_erddap_server".
               Example: 'ru30-20180705T1825'
    lat_lim: latitude limits for the search.
            Example, lat_lim = [38.0,40.0]
    lon_lim: longitude limits for the search.
            Example, lon_lim = [-75.0,-72.0]
    date_ini: initial date of the time window.
        This function accepts the date formats '%Y-%m-%dT%H:%M:%SZ' and '%Y/%m/%d/%H'.
        Example: date_ini = '2018-08-02T00:00:00Z' or '2018/08/02/00'
    date_end: final date of the time window.
        This function accepts the date formats '%Y-%m-%dT%H:%M:%SZ' and '%Y/%m/%d/%H'.
        Example: date_end = '2018-08-10T00:00:00Z' or '2018/08/10/00'
    scatter_plot: if equal to 'yes' then a scatter plot
            of the glider transect is plotted

    Outputs:
    tempg: all the glider profiles of temperature within the user defined time window
    saltg: all the glider profiles of salinity within the user defined time window
    latg: latitude within the user defined time window
    long: longitude within the user defined time window
    timeg: time vector for all profiles within the user defined time window
    depthg: depth vector for all profiles
    """

    from erddapy import ERDDAP
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    import cmocean
    import numpy as np

    date_ini = kwargs.get('date_ini', None)
    date_end = kwargs.get('date_end', None)

    # Find time window of interest
    if date_ini is None or date_end is None:
        constraints = {
            'latitude>=': lat_lim[0],
            'latitude<=': lat_lim[1],
            'longitude>=': lon_lim[0],
            'longitude<=': lon_lim[1],
        }
    else:
        constraints = {
            'time>=': date_ini,
            'time<=': date_end,
            'latitude>=': lat_lim[0],
            'latitude<=': lat_lim[1],
            'longitude>=': lon_lim[0],
            'longitude<=': lon_lim[1],
        }

    variables = [
        'depth', 'latitude', 'longitude', 'time', 'temperature', 'salinity'
    ]

    e = ERDDAP(server=url_erddap, protocol='tabledap', response='nc')

    e.dataset_id = dataset_id
    e.constraints = constraints
    e.variables = variables

    # Converting glider data to data frame
    # Checking that the data frame has data
    df = e.to_pandas()
    if len(df) > 3:

        df = e.to_pandas(
            index_col='time (UTC)',
            parse_dates=True,
            skiprows=(1, )  # units information can be dropped.
        ).dropna()

        # Converting glider vectors into arrays
        timeg, ind = np.unique(df.index.values, return_index=True)
        latg = df['latitude (degrees_north)'].values[ind]
        long = df['longitude (degrees_east)'].values[ind]

        dg = df['depth (m)'].values
        vg1 = df[df.columns[3]].values
        vg2 = df[df.columns[4]].values

        zn = int(np.max(np.diff(np.hstack([ind, len(dg)]))))

        depthg = np.empty((zn, len(timeg)))
        depthg[:] = np.nan
        tempg = np.empty((zn, len(timeg)))
        tempg[:] = np.nan
        saltg = np.empty((zn, len(timeg)))
        saltg[:] = np.nan

        for i, ii in enumerate(ind):
            if i < len(timeg) - 1:
                depthg[0:len(dg[ind[i]:ind[i + 1]]), i] = dg[ind[i]:ind[i + 1]]
                tempg[0:len(vg1[ind[i]:ind[i + 1]]),
                      i] = vg1[ind[i]:ind[i + 1]]
                saltg[0:len(vg2[ind[i]:ind[i + 1]]),
                      i] = vg2[ind[i]:ind[i + 1]]
            else:
                depthg[0:len(dg[ind[i]:len(dg)]), i] = dg[ind[i]:len(dg)]
                tempg[0:len(vg1[ind[i]:len(vg1)]), i] = vg1[ind[i]:len(vg1)]
                saltg[0:len(vg2[ind[i]:len(vg2)]), i] = vg2[ind[i]:len(vg2)]

        # Scatter plot
        if scatter_plot == 'yes':

            color_map = cmocean.cm.thermal
            varg = tempg
            timeg_matrix = np.tile(timeg.T, (depthg.shape[0], 1))
            ttg = np.ravel(timeg_matrix)
            dg = np.ravel(depthg)
            teg = np.ravel(varg)

            kw = dict(c=teg, marker='*', edgecolor='none')

            fig, ax = plt.subplots(figsize=(10, 3))
            cs = ax.scatter(ttg, -dg, cmap=color_map, **kw)
            #fig.colorbar(cs)
            ax.set_xlim(timeg[0], timeg[-1])

            ax.set_ylabel('Depth (m)', fontsize=14)
            cbar = plt.colorbar(cs)
            cbar.ax.set_ylabel('Temperature ($^oC$)', fontsize=14)
            ax.set_title(dataset_id, fontsize=16)
            xfmt = mdates.DateFormatter('%H:%Mh\n%d-%b')
            ax.xaxis.set_major_formatter(xfmt)
            plt.ylim([-np.nanmax(dg), 0])

            color_map = cmocean.cm.haline
            varg = saltg
            timeg_matrix = np.tile(timeg.T, (depthg.shape[0], 1))
            ttg = np.ravel(timeg_matrix)
            dg = np.ravel(depthg)
            teg = np.ravel(varg)

            kw = dict(c=teg, marker='*', edgecolor='none')

            fig, ax = plt.subplots(figsize=(10, 3))
            cs = ax.scatter(ttg, -dg, cmap=color_map, **kw)
            #fig.colorbar(cs)
            ax.set_xlim(timeg[0], timeg[-1])

            ax.set_ylabel('Depth (m)', fontsize=14)
            cbar = plt.colorbar(cs)
            cbar.ax.set_ylabel('Salinity', fontsize=14)
            ax.set_title(dataset_id, fontsize=16)
            xfmt = mdates.DateFormatter('%H:%Mh\n%d-%b')
            ax.xaxis.set_major_formatter(xfmt)
            plt.ylim([-np.nanmax(dg), 0])

    else:
        tempg = np.nan
        saltg = np.nan
        timeg = np.nan
        latg = np.nan
        long = np.nan
        depthg = np.nan

    return tempg, saltg, timeg, latg, long, depthg
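# A minimal usage sketch for the function above; the server, dataset_id, bounding box
# and dates are taken from the docstring examples rather than from the original script.
url_glider = 'https://data.ioos.us/gliders/erddap'

tempg, saltg, timeg, latg, long, depthg = read_glider_data_erddap_server(
    url_glider, 'ru30-20180705T1825', [38.0, 40.0], [-75.0, -72.0], 'yes',
    date_ini='2018-08-02T00:00:00Z', date_end='2018-08-10T00:00:00Z')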
Beispiel #26
0
def grid_glider(
        dataset_id,
        varz2d=[
            'potential_temperature', 'salinity', 'cdom', 'chlorophyll_a',
            'beta_700nm'
        ],
        zgrid=np.arange(0, 1000, 5),
):
    '''Grid the glider data from the RUCOOL ERDDAP server. This is a work in progress.'''
    import xarray as xr
    import pandas as pd
    import numpy as np
    from erddapy import ERDDAP

    from scipy.signal import find_peaks
    from scipy import stats
    e = ERDDAP(
        server="http://slocum-data.marine.rutgers.edu/erddap",
        protocol="tabledap",
        response="nc",
    )

    # get the science data:
    e.dataset_id = dataset_id

    # this connects to the data and loads it into a pandas dataframe
    ds = e.to_pandas()
    # drop the units from the column names (keep only the part before the first space)
    ds.columns = ds.columns.str.split(' ').str[0]

    # get the time to be a datetime object
    ds['time'] = pd.to_datetime(ds['time'])

    # put the times in order
    ds = ds.sort_values(by=['time'])

    # fill nans in depth for the profile breakup
    interpd = ds.depth.interpolate()

    # find the top and bottom of each profile
    apogee, prop = find_peaks(interpd.values,
                              threshold=None,
                              distance=None,
                              prominence=50)

    perigee, prop = find_peaks(-1 * interpd.values,
                               threshold=None,
                               distance=None,
                               prominence=50)

    # stack the index of the turning points into one vector
    turns = np.sort(np.append(apogee, perigee))

    # this is the depth grid, configurable via the zgrid argument:
    zgrd = zgrid

    # list of variables to grid in 2d:
    # you choose from the columns of the science data
    dataz = varz2d

    # this is a dict to hold our gridded stuff
    # until we make a dataset later
    d2 = {}

    # loop on the variables you want to bin
    for varz in dataz:
        values = ds[varz]  # grab some data

        # bin this variable onto the (profile, depth) grid
        ret = stats.binned_statistic_2d(ds.index.values,
                                        ds.depth,
                                        values,
                                        statistic='mean',
                                        bins=[turns, zgrd])
        d2[varz] = ret.statistic.T

    # things to bin in the x direction
    oneDvars = ['latitude', 'longitude', 'time', 'u', 'v']

    # NB: u and v are reported only once per dive sequence, so there are roughly
    # half as many values as there are profiles (the exact count varies by deployment).

    # dict to hold our 1d bins
    d1 = {}

    # loop on 1d stuff:
    for thing in oneDvars:
        if thing == 'time':
            bin_means, bin_edges, binnumber = stats.binned_statistic(
                ds.index.values,
                ds[thing].astype(int),
                statistic='mean',
                bins=turns)
            bin_means = pd.to_datetime(bin_means)
        else:

            bin_means, bin_edges, binnumber = stats.binned_statistic(
                ds.index.values,
                ds[thing].values,
                statistic=np.nanmean,
                bins=turns)
        d1[thing] = bin_means

    # need the depth grid centers
    zgrd_ctr = zgrd[:-1] + np.diff(zgrd).mean() / 2

    # create the dataset
    ds_gridded = xr.Dataset(coords={
        'date': d1['time'].values,
        'depth': zgrd_ctr,
        'lat': ('date', d1['latitude']),
        'lon': ('date', d1['longitude'])
    },
                            data_vars={
                                'u': ('date', d1['u']),
                                'v': ('date', d1['v'])
                            })

    # add the other data
    for varz in dataz:
        ds_gridded[varz] = (('depth', 'date'), d2[varz])

    return ds_gridded
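# A minimal usage sketch for grid_glider. The dataset_id below is a placeholder, not a
# value from the original script, and numpy must already be available as np where the
# function is defined because its default zgrid argument calls np.arange at definition time.
import numpy as np

ds_gridded = grid_glider(
    'ru29-20200908T1623-profile-sci-rt',          # placeholder RUCOOL dataset id
    varz2d=['potential_temperature', 'salinity'],
    zgrid=np.arange(0, 500, 5),
)
print(ds_gridded)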
Beispiel #27
0
class ErddapReader(Reader):
    """
    This class searches ERDDAP servers. There are 2 known_servers but
    others can be input too.

    Attributes
    ----------
    parallel: boolean
        If True, run with simple parallelization using `multiprocessing`.
        If False, run serially.
    known_server: string
        Two ERDDAP servers are built in to be known to this reader: "ioos" and
        "coastwatch".
    e: ERDDAP server instance
    e.protocol: string
        * "tabledap" (pandas, appropriate for reading as csv)
        * "griddap" (xarray, appropriate for reading as netcdf)
    e.server: string
        Return the server name
    columns: list
        Metadata columns
    name: string
        "erddap_ioos", "erddap_coastwatch", or a constructed string if the user
        inputs a new protocol and server.
    reader: string
        reader is defined as "ErddapReader".
    """
    def __init__(self,
                 known_server="ioos",
                 protocol=None,
                 server=None,
                 parallel=True):
        """
        Parameters
        ----------
        known_server: string, optional
            Two ERDDAP servers are built in to be known to this reader:
            "ioos" and "coastwatch".
        protocol, server: string, optional
            For a user-defined ERDDAP server, input the protocol as one of the
            following:
            * "tabledap" (pandas, appropriate for reading as csv)
            * "griddap" (xarray, appropriate for reading as netcdf)
            and the server address (such as
            "http://erddap.sensors.ioos.us/erddap" or
            "http://coastwatch.pfeg.noaa.gov/erddap").
        parallel: boolean
            If True, run with simple parallelization using `multiprocessing`.
            If False, run serially.
        """
        self.parallel = parallel

        # hard wire this for now
        filetype = "netcdf"

        # either select a known server or input protocol and server string
        if known_server == "ioos":
            protocol = "tabledap"
            server = "http://erddap.sensors.ioos.us/erddap"
            filetype = "netcdf"  # other option: "csv"
        elif known_server == "coastwatch":
            protocol = "griddap"
            server = "http://coastwatch.pfeg.noaa.gov/erddap"
            filetype = "netcdf"  # other option: "csv"
        elif known_server is not None:
            statement = (
                "either select a known server or input protocol and server string"
            )
            assert (protocol is not None) & (server is not None), statement
        else:
            known_server = urllib.parse.urlparse(server).netloc
            # known_server = server.strip("/erddap").strip("http://").replace(".", "_")
            statement = (
                "either select a known server or input protocol and server string"
            )
            assert (protocol is not None) & (server is not None), statement

        self.known_server = known_server
        self.e = ERDDAP(server=server)
        self.e.protocol = protocol
        self.e.server = server
        self.filetype = filetype

        # columns for metadata
        self.columns = [
            "geospatial_lat_min",
            "geospatial_lat_max",
            "geospatial_lon_min",
            "geospatial_lon_max",
            "time_coverage_start",
            "time_coverage_end",
            "defaultDataQuery",
            "subsetVariables",  # first works for timeseries sensors, 2nd for gliders
            "keywords",  # for hf radar
            "id",
            "infoUrl",
            "institution",
            "featureType",
            "source",
            "sourceUrl",
        ]

        # name
        self.name = f"erddap_{known_server}"

        self.reader = "ErddapReader"
        self.store = dict()

    def __getitem__(self, key):
        """Redefinition of dict-like behavior.

        This enables user to use syntax `reader[dataset_id]` to read in and
        save dataset into the object.

        Parameters
        ----------
        key: str
            dataset_id for a dataset that is available in the search/reader
            object.

        Returns
        -------
        xarray Dataset of the data associated with key
        """

        returned_data = self.data_by_dataset(key)
        # returned_data = self._return_data(key)
        self.__setitem__(key, returned_data)
        return returned_data

    def find_dataset_id_from_station(self, station):
        """Find dataset_id from station name.

        Parameters
        ----------
        station: string
            Station name for which to search for dataset_id
        """

        if station is None:
            return None
        # for station in self._stations:
        # if station has more than one word, AND will be put between
        # to search for multiple terms together.
        url = self.e.get_search_url(response="csv",
                                    items_per_page=5,
                                    search_for=station)

        try:
            df = pd.read_csv(url)
        except Exception as e:
            logger.exception(e)
            logger.warning(
                f"search url {url} did not work for station {station}.")
            return

        # first try for exact station match
        try:
            # Special case for TABS: don't split the id name
            if "tabs" in station:  # don't split
                dataset_id = [
                    dataset_id for dataset_id in df["Dataset ID"]
                    if station.lower() == dataset_id.lower()
                ][0]
            else:
                # first try as dataset_id then do as station name
                dataset_id = [
                    dataset_id for dataset_id in df["Dataset ID"]
                    if station.lower() in [dataset_id.lower()] +
                    dataset_id.lower().split("_")
                ][0]

        except Exception as e:
            logger.exception(e)
            logger.warning(
                "When searching for a dataset id to match station name %s, the first attempt to match the id did not work."
                % (station))
            # If that doesn't work, return None for dataset_id
            dataset_id = None
            # # if that doesn't work, trying for more general match and just take first returned option
            # dataset_id = df.iloc[0]["Dataset ID"]

        return dataset_id

    @property
    def dataset_ids(self):
        """Find dataset_ids for server.

        Notes
        -----
        The dataset_ids are found by querying the metadata through the ERDDAP server.

        The number of dataset_ids can change if a variable is removed from the
        list of variables and this is rerun.
        """

        if not hasattr(self, "_dataset_ids") or (
                self.variables and
            (len(self.variables) != self.num_variables)):

            # This should be a region search
            if self.approach == "region":

                # find all the dataset ids which we will use to get the data
                # This limits the search to our keyword arguments in kw which should
                # have min/max lon/lat/time values
                dataset_ids = []
                if self.variables is not None:
                    for variable in self.variables:

                        # find and save all dataset_ids associated with variable
                        search_url = self.e.get_search_url(
                            response="csv",
                            **self.kw,
                            variableName=variable,
                            items_per_page=10000,
                        )

                        try:
                            search = pd.read_csv(search_url)
                            dataset_ids.extend(search["Dataset ID"])
                        except Exception as e:
                            logger.exception(e)
                            logger.warning(
                                f"variable {variable} was not found in the search"
                            )
                            logger.warning(f"search_url: {search_url}")

                else:

                    # find and save all dataset_ids associated with variable
                    search_url = self.e.get_search_url(response="csv",
                                                       **self.kw,
                                                       items_per_page=10000)

                    try:
                        search = pd.read_csv(search_url)
                        dataset_ids.extend(search["Dataset ID"])
                    except Exception as e:
                        logger.exception(e)
                        logger.warning("nothing found in the search")
                        logger.warning(f"search_url: {search_url}")

                # only need a dataset id once since we will check them each for all standard_names
                self._dataset_ids = list(set(dataset_ids))

            # This should be a search for the station names
            elif self.approach == "stations":

                # search by station name for each of stations
                if self.parallel:
                    # get metadata for datasets
                    # run in parallel to save time
                    num_cores = multiprocessing.cpu_count()
                    dataset_ids = Parallel(n_jobs=num_cores)(
                        delayed(self.find_dataset_id_from_station)(station)
                        for station in self._stations)

                else:
                    dataset_ids = []
                    for station in self._stations:
                        dataset_ids.append(
                            self.find_dataset_id_from_station(station))

                # remove None from list
                dataset_ids = [i for i in dataset_ids if i]

                # In this case return all dataset_ids so they match 1-1 with
                # the input station list.
                self._dataset_ids = dataset_ids

            else:
                logger.warning(
                    "Neither stations nor region approach were used in function dataset_ids."
                )

            # update number of variables
            if self.variables:
                self.num_variables = len(self.variables)

        return self._dataset_ids

    def meta_by_dataset(self, dataset_id):
        """Return the catalog metadata for a single dataset_id."""

        info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id)
        try:
            info = pd.read_csv(info_url)
        except Exception as e:
            logger.exception(e)
            logger.warning(f"Could not read info from {info_url}")
            return {dataset_id: []}

        items = []

        for col in self.columns:

            try:
                item = info[info["Attribute Name"] == col]["Value"].values[0]
                dtype = info[info["Attribute Name"] ==
                             col]["Data Type"].values[0]
            except Exception:
                # dtype must be defined for the type checks below even when the
                # attribute lookup fails, so default it to "String" here.
                dtype = "String"
                if col == "featureType":
                    # this column is not present in HF Radar metadata but want it to
                    # map to data_type, so input 'grid' in that case.
                    item = "grid"
                else:
                    item = "NA"

            if dtype == "String":
                pass
            elif dtype == "double":
                item = float(item)
            elif dtype == "int":
                item = int(item)
            items.append(item)

        # include download link ##
        self.e.dataset_id = dataset_id
        if self.e.protocol == "tabledap":
            # set the same time constraints as before
            self.e.constraints = {
                "time<=": self.kw["max_time"],
                "time>=": self.kw["min_time"],
            }
            if self.filetype == "csv":
                download_url = self.e.get_download_url(response="csvp")
            elif self.filetype == "netcdf":
                download_url = self.e.get_download_url(response="ncCf")

        elif self.e.protocol == "griddap":
            # the search terms that can be input for tabledap do not work for griddap
            # in erddapy currently. Instead, put together an opendap link and then
            # narrow the dataset with xarray.
            # get opendap link
            download_url = self.e.get_download_url(response="opendap")

        # check if "prediction" is present in metadata, esp in case of NOAA
        # model predictions
        is_prediction = "Prediction" in " ".join(
            list(info["Value"].replace(np.nan, None).values))

        # add erddap server name
        return {
            dataset_id:
            [self.e.server, download_url, info_url, is_prediction] + items +
            [self.variables]
        }

    @property
    def meta(self):
        """Rearrange the individual metadata into a dataframe.

        Notes
        -----
        This should exclude duplicate entries.
        """

        if not hasattr(self, "_meta"):

            if self.parallel:

                # get metadata for datasets
                # run in parallel to save time
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.meta_by_dataset)(dataset_id)
                    for dataset_id in self.dataset_ids)

            else:

                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.meta_by_dataset(dataset_id))

            # make dict from individual dicts
            from collections import ChainMap

            meta = dict(ChainMap(*downloads))

            # Make dataframe of metadata
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(
                meta,
                orient="index",
                columns=[
                    "database", "download_url", "info_url", "is_prediction"
                ] + self.columns + ["variable names"],
            )

        return self._meta

    def data_by_dataset(self, dataset_id):
        """Return the data for a single dataset_id.

        Returns
        -------
        A tuple of (dataset_id, data), where data type is a pandas DataFrame

        Notes
        -----
        Data is read into memory.
        """

        if self.filetype == "csv":
            # if self.e.protocol == "tabledap":
            try:
                # fetch metadata if not already present
                # found download_url from metadata and use
                self.e.dataset_id = dataset_id
                # dataset_vars gives a list of the variables in the dataset
                dataset_vars = (self.meta.loc[dataset_id]
                                ["defaultDataQuery"].split("&")[0].split(","))
                # vars_present gives the variables in self.variables
                # that are actually in the dataset
                vars_present = []
                for selfvariable in self.variables:
                    vp = [var for var in dataset_vars if var == selfvariable]
                    if len(vp) > 0:
                        vars_present.append(vp[0])
                # If any variables are not present, this doesn't work.
                if self.variables is not None:
                    self.e.variables = [
                        "time",
                        "longitude",
                        "latitude",
                        "station",
                    ] + vars_present
                dd = self.e.to_pandas(response="csvp",
                                      index_col=0,
                                      parse_dates=True)
                # dd = self.e.to_pandas(response='csv', header=[0, 1],
                #                       index_col=0, parse_dates=True)
                # dd = pd.read_csv(
                #     download_url, header=[0, 1], index_col=0, parse_dates=True
                # )

                # Drop cols and rows that are only NaNs.
                dd = dd.dropna(axis="index", how="all").dropna(axis="columns",
                                                               how="all")

                if self.variables is not None:
                    # check to see if there is any actual data
                    # this is a bit convoluted because the column names are the variable names
                    # plus units so can't match 1 to 1.
                    # number of columns that represent data instead of metadata
                    datacols = 0
                    for col in dd.columns:
                        datacols += [
                            varname in col for varname in self.variables
                        ].count(True)
                    # if no datacols, we can skip this one.
                    if datacols == 0:
                        dd = None

            except Exception as e:
                logger.exception(e)
                logger.warning("no data to be read in for %s" % dataset_id)
                dd = None

        elif self.filetype == "netcdf":
            # elif self.e.protocol == "griddap":

            if self.e.protocol == "tabledap":

                try:
                    # assume I don't need to narrow in space since time series (tabledap)
                    self.e.dataset_id = dataset_id
                    dd = self.e.to_xarray()
                    # dd = xr.open_dataset(download_url, chunks="auto")
                    dd = dd.swap_dims({"obs": dd.cf["time"].name})
                    dd = dd.sortby(dd.cf["time"], ascending=True)
                    dd = dd.cf.sel(
                        T=slice(self.kw["min_time"], self.kw["max_time"]))
                    # dd = dd.set_coords(
                    #     [dd.cf["longitude"].name, dd.cf["latitude"].name]
                    # )

                    # use variable names to drop other variables (should I do this?)
                    if self.variables is not None:
                        # I don't think this is true with new approach
                        # # ERDDAP prepends variables with 's.' in netcdf files,
                        # # so include those with variables
                        # erd_vars = [f's.{var}' for var in self.variables]
                        # var_list = set(dd.data_vars) - (set(self.variables) | set(erd_vars))
                        var_list = set(dd.data_vars) - set(self.variables)
                        dd = dd.drop_vars(var_list)

                    # the lon/lat are on the 'timeseries' singleton dimension
                    # but the data_var variable was not, which messed up
                    # cf-xarray. When longitude and latitude are not on a
                    # dimension shared with a variable, the variable can't be
                    # called with cf-xarray. e.g. dd.cf['ssh'] won't work.
                    if "timeseries" in dd.dims:
                        for data_var in dd.data_vars:
                            if "timeseries" not in dd[data_var].dims:
                                dd[data_var] = dd[data_var].expand_dims(
                                    dim="timeseries", axis=1)

                except Exception as e:
                    logger.exception(e)
                    logger.warning("no data to be read in for %s" % dataset_id)
                    dd = None

            elif self.e.protocol == "griddap":

                try:
                    # this makes it read in the whole file which might be large
                    self.e.dataset_id = dataset_id
                    # dd = self.e.to_xarray(chunks="auto").sel(
                    #     time=slice(self.kw["min_time"], self.kw["max_time"])
                    # )
                    download_url = self.e.get_download_url(response="opendap")
                    dd = xr.open_dataset(download_url, chunks="auto").sel(
                        time=slice(self.kw["min_time"], self.kw["max_time"]))

                    if ("min_lat" in self.kw) and ("max_lat" in self.kw):
                        dd = dd.sel(latitude=slice(self.kw["min_lat"],
                                                   self.kw["max_lat"]))

                    if ("min_lon" in self.kw) and ("max_lon" in self.kw):
                        dd = dd.sel(longitude=slice(self.kw["min_lon"],
                                                    self.kw["max_lon"]))

                    # use variable names to drop other variables (should I do this?)
                    if self.variables is not None:
                        vars_list = set(dd.data_vars) - set(self.variables)
                        dd = dd.drop_vars(vars_list)

                except Exception as e:
                    logger.exception(e)
                    logger.warning("no data to be read in for %s" % dataset_id)
                    dd = None

        # return (dataset_id, dd)
        return dd

    # @property
    def data(self, dataset_ids=None):
        """Read in data for some or all dataset_ids.

        NOT USED CURRENTLY

        Once data is read in for a dataset_ids, it is remembered.

        See full documentation in `utils.load_data()`.
        """

        output = odg.utils.load_data(self, dataset_ids)
        return output
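# A minimal usage sketch for ErddapReader, assuming the module-level imports the class
# relies on (pandas as pd, erddapy's ERDDAP, a configured logger, etc.) are in place.
# The attributes set below (approach, kw, variables) are normally populated by the
# surrounding search machinery, which is not shown here, so their shapes are an assumption.
reader = ErddapReader(known_server="ioos", parallel=False)
reader.approach = "region"
reader.kw = {
    "min_lon": -75.0, "max_lon": -72.0,
    "min_lat": 38.0, "max_lat": 40.0,
    "min_time": "2021-01-01", "max_time": "2021-01-02",
}
reader.variables = ["sea_water_temperature"]
reader.num_variables = len(reader.variables)

print(reader.dataset_ids[:5])   # dataset ids matching the region and variable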
Beispiel #28
0
def load_data_from_erddap(config, station_id=None, station_data=None):
    mcf_template = yaml.load(open(config['static_data']['mcf_template'], 'r'),
                             Loader=yaml.FullLoader)

    es = ERDDAP(
        server=config['dynamic_data']['erddap_server'],
        protocol=config['dynamic_data']['erddap_protocol'],
    )

    if station_id is None:
        # load metadata for all stations into MCF skeletons
        stations = {}
        es.dataset_id = 'allDatasets'

        # filter out "log in" datasets, as the vast majority of their metadata is unavailable
        es.constraints = {'accessible=': 'public'}
        stations_df = es.to_pandas()

        # drop 'allDatasets' row
        stations_df.drop(labels=0, axis='index', inplace=True)
        print(stations_df)

        for index_label, row_series in stations_df.iterrows():
            id = row_series['datasetID']

            # ensure each station has an independent copy of the MCF skeleton
            stations[id] = copy.deepcopy(mcf_template)
            dataset_url = row_series['tabledap'] if row_series[
                'dataStructure'] == 'table' else row_series['griddap']

            stations[id]['metadata']['identifier'] = id
            stations[id]['metadata']['dataseturi'] = dataset_url

            stations[id]['spatial']['datatype'] = 'textTable' if row_series[
                'dataStructure'] == 'table' else 'grid'

            stations[id]['spatial']['geomtype'] = row_series['cdm_data_type']
            stations[id]['spatial']['bbox'] = '%s,%s,%s,%s' % (
                row_series['minLongitude (degrees_east)'],
                row_series['minLatitude (degrees_north)'],
                row_series['maxLongitude (degrees_east)'],
                row_series['maxLatitude (degrees_north)'])

            stations[id]['identification']['title'] = row_series['title']
            stations[id]['identification']['dates']['creation'] = row_series[
                'minTime (UTC)']
            stations[id]['identification']['temporal_begin'] = row_series[
                'minTime (UTC)']
            stations[id]['identification']['temporal_end'] = row_series[
                'maxTime (UTC)']
            stations[id]['identification']['url'] = dataset_url
            stations[id]['identification']['abstract'] = row_series['summary']

            stations[id]['distribution']['erddap']['url'] = dataset_url
            stations[id]['distribution']['erddap']['name'] = row_series[
                'title']

        print('Stations after ERDDAP call...')
        print(stations)

        return_value = stations

    else:
        # load specific station data into the MCF skeleton
        print('Loading ERDDAP metadata for station: %s' % (station_id))

        es.dataset_id = station_id

        metadata_url = es.get_download_url(dataset_id='%s/index' %
                                           (station_id),
                                           response='csv',
                                           protocol='info')
        metadata = pd.read_csv(filepath_or_buffer=metadata_url)
        print(metadata_url)
        print(metadata.head())

        # ERDDAP ISO XML provides a list of dataset field names (long & short), data types and
        # units of measurement; if this becomes useful for the CIOOS metadata standard we can
        # extend the YAML skeleton to include these and the template to export them.
        #
        # Below, most variable attributes from ERDDAP are extracted and pivoted to describe the
        # field; the actual field data types are extracted separately and merged into the
        # pivoted dataframe for completeness.
        columns_pivot = metadata[(metadata['Variable Name'] != 'NC_GLOBAL')
                                 & (metadata['Row Type'] != 'variable')].pivot(
                                     index='Variable Name',
                                     columns='Attribute Name',
                                     values='Value')
        col_data_types = metadata[(metadata['Row Type'] == 'variable')][[
            'Variable Name', 'Data Type'
        ]]
        df_merge = pd.merge(columns_pivot, col_data_types, on='Variable Name')

        station_data['dataset'] = {}

        for index_label, field_series in df_merge.iterrows():
            field_name = field_series['Variable Name']
            station_data['dataset'][field_name] = {}
            station_data['dataset'][field_name]['long_name'] = field_series[
                'long_name']
            station_data['dataset'][field_name]['data_type'] = field_series[
                'Data Type']
            station_data['dataset'][field_name]['units'] = field_series[
                'units']

        station_data['identification']['keywords']['default'][
            'keywords'] = metadata[
                (metadata['Variable Name'] == 'NC_GLOBAL')
                & (metadata['Attribute Name'] == 'keywords')]['Value'].values

        return_value = station_data

    return return_value
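# A minimal sketch of the config dictionary expected by load_data_from_erddap, inferred
# from the keys accessed above; the template path and server URL are placeholders.
config = {
    'static_data': {
        'mcf_template': 'mcf_template.yml',              # placeholder path to the MCF skeleton
    },
    'dynamic_data': {
        'erddap_server': 'https://data.cioos.ca/erddap',  # placeholder ERDDAP server
        'erddap_protocol': 'tabledap',
    },
}

stations = load_data_from_erddap(config)   # MCF metadata skeletons for all public datasets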
ncbath = xr.open_dataset(bath_file)
bath_lat = ncbath.variables['lat'][:]
bath_lon = ncbath.variables['lon'][:]
bath_elev = ncbath.variables['elevation'][:]

#%% Looping through all gliders found

for id in gliders:
    print('Reading ' + id )
    e.dataset_id = id
    e.constraints = constraints
    e.variables = variables
    
    # checking that the data frame is not empty
    df = e.to_pandas()
    if len(df.index) != 0:
    
        # Converting glider data to data frame
        df = e.to_pandas(
                index_col='time (UTC)',
                parse_dates=True,
                skiprows=(1,)  # units information can be dropped.
                ).dropna()

        # Converting glider vectors into arrays
        timeg, ind = np.unique(df.index.values,return_index=True)
        latg = df['latitude (degrees_north)'].values[ind]
        long = df['longitude (degrees_east)'].values[ind]

        dg = df['depth (m)'].values
Beispiel #30
0
        'sea_water_velocity_to_direction',
        'sea_water_speed'
        ]

e = ERDDAP(
        server=url_buoy,
        protocol='tabledap',
        response='nc'
        )

e.dataset_id = datasets[0]
e.constraints = constraints
e.variables = variables

df_vel = e.to_pandas(
            index_col='time (UTC)',
            parse_dates=True,
            )

time_vel, ind = np.unique(df_vel.index,return_index=True)
depth_vel = df_vel['depth (m)'].values
water_speed = df_vel['sea_water_speed (cm/s)'].values

# Reshape velocity and depth into array depth x time
zn = ind[1] # 34 vertical levels
depth_levels = depth_vel[0:zn]

water_speed_matrix = np.empty((zn,len(time_vel)))
water_speed_matrix[:] = np.nan
for i,ii in enumerate(ind):
    if i < len(time_vel)-1:
        water_speed_matrix[0:len(water_speed[ind[i]:ind[i+1]]),i] = water_speed[ind[i]:ind[i+1]]