def get_valid_stdnames(server_name):
    """Find all the `standard_name` attributes that exist on
    this ERDDAP endpoint, using [ERDDAP's "categorize" service]
    (http://www.neracoos.org/erddap/categorize/index.html)"""

    server = servers[server_name]
    server_url = server.get("url")

    # global e
    e = ERDDAP(server=server_url, protocol="tabledap")

    url_standard_names = f"{server_url}/categorize/standard_name/index.csv"
    df = pd.read_csv(urlopen(url_standard_names), skiprows=[1, 2])
    standard_names = list(df["Category"].values)

    standard_names = remove_qcstdnames(standard_names)

    valid_standard_names = []
    count = 0

    print(
        "Checking the variables available for this server. This might take up to a couple of minutes...\n",
    )

    for standard_name in standard_names:

        count += 1

        if count == np.floor(len(standard_names) / 2):
            print("Halfway there...\n")
        elif count == np.floor((len(standard_names) / 4) * 3):
            print("Almost done...\n")
        elif count == (len(standard_names)):
            print("Done!")

        features, datasets = stdname2geojson(
            e,
            standard_name,
            server.get("cdm_data_type"),
            server.get("min_time"),
            server.get("max_time"),
            server.get("skip_datasets"),
        )

        if len(datasets) > 0:  # if there is at least one dataset with this data

            var = e.get_var_by_attr(
                dataset_id=datasets[0],
                standard_name=lambda v: str(v).lower() == standard_name.lower(),
            )

            if var != []:
                valid_standard_names.append(standard_name)

        del features, datasets

    return valid_standard_names, server, e
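
A minimal usage sketch, assuming the module-level `servers` dictionary that the function looks up; the entry below is illustrative only, with the keys the function reads (`url`, `cdm_data_type`, `min_time`, `max_time`, `skip_datasets`):

servers = {
    # hypothetical server entry
    "NERACOOS": {
        "url": "http://www.neracoos.org/erddap",
        "cdm_data_type": "TimeSeries",
        "min_time": "2019-01-01T00:00:00Z",
        "max_time": "2019-12-31T00:00:00Z",
        "skip_datasets": [],
    },
}

valid_standard_names, server, e = get_valid_stdnames("NERACOOS")
print(f"{len(valid_standard_names)} usable standard_name values found")
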
Example #2
def get_valid_stdnames(server_name):
    """Find all the `standard_name` attributes that exist on
    this ERDDAP endpoint, using [ERDDAP's "categorize" service]
    (http://www.neracoos.org/erddap/categorize/index.html)"""

    server = servers[server_name]
    server_url = server.get("url")

    e = ERDDAP(server=server_url, protocol="tabledap")

    url_stdnames = f"{server_url}/categorize/standard_name/index.csv"
    df = pd.read_csv(urlopen(url_stdnames), skiprows=[1, 2])
    stdnames = list(df["Category"].values)

    stdnames = remove_qcstdnames(stdnames)

    valid_stdnames = []
    count = 0

    display(pn.Column(pn.panel(progressbar.name), progressbar))

    for stdname in stdnames:

        count += 1

        progressbar.value = int(count / (len(stdnames)) * 100)

        df_stdname = get_datasets(
            e,
            stdname,
            server.get("cdm_data_type"),
            server.get("min_time"),
            server.get("max_time"),
            server.get("skip_datasets"),
        )

        if not df_stdname.empty:

            var = e.get_var_by_attr(
                dataset_id=df_stdname.datasetID.values[0],
                standard_name=lambda v: str(v).lower() == stdname.lower(),
            )

            if var != []:
                valid_stdnames.append(stdname)

    return valid_stdnames, server, e
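
This variant reports progress with a Panel widget rather than printed messages. Here is a sketch of the globals it assumes (`pn`, `display`, and the `progressbar` widget); the widget label below is an assumption, not part of the original snippet:

from IPython.display import display
import panel as pn

pn.extension()

# Assumed module-level progress bar that get_valid_stdnames() updates
progressbar = pn.widgets.Progress(
    name="Checking the variables available for this server",
    value=0,
    max=100,
)
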
Example #3
info_url = e.get_info_url(dataset_id=gliders[0], response="csv")
info = pd.read_csv(info_url)

info.head()

With the info URL we can filter the data using attributes.

cdm_profile_variables = info.loc[
    info["Attribute Name"] == "cdm_profile_variables", "Value"
]

print("".join(cdm_profile_variables))

In fact, that is such a common operation that `erddapy` brings its own method for filtering data by attributes. In the next three cells we request the variable names that have a `cdm_profile_variables` attribute, a `standard_name` of `sea_water_temperature`, and an axis, respectively.

e.get_var_by_attr(
    dataset_id=gliders[0], cdm_profile_variables=lambda v: v is not None,
)

e.get_var_by_attr(
    dataset_id="whoi_406-20160902T1700", standard_name="sea_water_temperature",
)

axis = e.get_var_by_attr(
    dataset_id="whoi_406-20160902T1700", axis=lambda v: v in ["X", "Y", "Z", "T"],
)
axis

With this method one can, for example, request data from multiple datasets that share the same `standard_name`, as sketched below.
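
Here is a short sketch of that pattern, reusing `e` and the `gliders` list from above; the `"time"` variable and the download via `to_pandas` are assumptions about these particular datasets:

temperature = {}
for glider in gliders:
    # Each dataset may use a different variable name for the same standard_name
    temp_var = e.get_var_by_attr(
        dataset_id=glider, standard_name="sea_water_temperature"
    )[0]
    e.dataset_id = glider
    e.variables = ["time", temp_var]
    temperature[glider] = e.to_pandas(index_col="time (UTC)", parse_dates=True)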

info.head()

# In[11]:

cdm_profile_variables = info.loc[info['Attribute Name'] ==
                                 'cdm_profile_variables', 'Value']

print(''.join(cdm_profile_variables))

# # Selecting variables by attributes

# In[12]:

e.get_var_by_attr(
    dataset_id='CP02PMCI-WFP01-03-CTDPFK000-ctdpf_ckl_wfp_instrument-telemetered-deployment0008-tabledap',
    standard_name='sea_water_temperature')

# # Easy to use CF conventions standards

# In[13]:

t_vars = [
    e.get_var_by_attr(dataset_id=glider,
                      standard_name='sea_water_temperature')[0]
    for glider in gliders
]
t_vars

# In[14]:
# Check what variables are available on the dataset:

info_url = erd.get_info_url(response='html')
show_iframe(info_url)

info_url = erd.get_info_url(response='csv')

info_df = to_df(info_url)
info_df

info_df[info_df['Row Type'] == 'variable']

# Take a look at the variables with standard names:

variables = erd.get_var_by_attr(standard_name=lambda v: v is not None)
variables

# These are the standard variables for the CTDBP instrument - specifically for the CP01CNSM-NSIF-CTDBP. Next, let's query the server for _all_ available data from the CP01CNSM-NSIF-CTDBP.

erd.variables = variables

erd.get_download_url()

# Put it all into a dataframe:

data = erd.to_pandas()

# +
# Plot a basic time-series of the conductivity 
import matplotlib.pyplot as plt
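
A sketch of the conductivity plot this cell sets up; the time index and the conductivity column label are guesses that should be checked against `data.columns`:

# Re-read with a datetime index so the x-axis is time (column names are assumptions)
data = erd.to_pandas(index_col='time (UTC)', parse_dates=True)

fig, ax = plt.subplots(figsize=(11, 3))
data['sea_water_electrical_conductivity (S m-1)'].plot(ax=ax)
ax.set_ylabel('Conductivity (S m-1)')
ax.set_xlabel('Time (UTC)')
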
Example #6
class NDBC():
    def __init__(self, station_id, deploy_id, WMO, currentTime, startTime,
                 data_map, name_map):
        self.station_id = station_id
        self.deploy_id = deploy_id
        self.WMO = WMO
        self.now = currentTime
        self.startTime = startTime
        self.data_map = data_map
        self.name_map = name_map

    def adjust_pressure_to_sea_level(self, pres, temp, height):
        """Adjust barometric presure to sea-level."""
        temp = temp + 273.15
        slp = pres / np.exp(-height / (temp * 29.263))
        return slp

    def calculate_wind_speed(self, eastward, northward):
        """Calculate absolute wind speed from component wind vector."""
        u = np.square(eastward)
        v = np.square(northward)
        wind_speed = np.sqrt(u + v)
        return wind_speed

    def calculate_wind_direction(self, eastward, northward):
        """Calculate met wind direction from component wind vectors."""
        u = eastward
        v = northward
        wind_direction = 180 / np.pi * np.arctan2(-u, -v)
        return wind_direction

    def _connect_erddap(self,
                        server="http://ooivm1.whoi.net/erddap",
                        protocol="tabledap"):
        """Connect to the erddap server."""
        self._erddap = ERDDAP(server=server, protocol=protocol)

    def list_datasets(self):
        """Get the available datasets for the ERDDAP server."""
        # First, make the connection
        self._connect_erddap()
        # Next, get the datasets
        datasets = pd.read_csv(
            self._erddap.get_search_url(search_for=self.station_id,
                                        response='csv'))['Dataset ID']
        return datasets

    def get_dataset(self, dataset):
        """Get the data for specified datasets."""
        # First, have to re-establish the erddap connection
        self._connect_erddap()

        # Next, get the data for a dataset
        self._erddap.dataset_id = dataset

        # Only want the variables with standard names
        variables = self._erddap.get_var_by_attr(
            standard_name=lambda v: v is not None)
        self._erddap.variables = variables

        # Limit the data request to the current deployment
        self._erddap.constraints = {
            'deploy_id=': self.deploy_id,
            'time>=': self.startTime.strftime('%Y-%m-%dT%H:%M:%SZ')
        }

        try:
            # Download the data
            data = self._erddap.to_pandas(index_col='time (UTC)',
                                          parse_dates=True)

            # Sometimes it just returns an empty dataframe instead of an error
            if data.size == 0:
                data = self._create_empty_dataset()

        except Exception:
            # If there is no available data in the requested time window, need
            # to create an empty dataframe of the data
            data = self._create_empty_dataset()

        # Return the dataset data
        return data

    def process_METBK_data(self, df, freq='10T'):
        """Process the METBK into the correct format and values for NDBC."""
        # Resample the data
        df_binned = df.resample(freq).mean()

        # Check that barometric pressure is in the dataframe
        if 'barometric_pressure (mbar)' in df_binned.columns:
            # Adjust the barometric pressure to sea-level
            df_binned['sea_level_pressure (hPa)'] = self.adjust_pressure_to_sea_level(
                df_binned['barometric_pressure (mbar)'],
                df_binned['air_temperature (degree_Celsius)'], 4.05)
        else:
            df_binned['sea_level_pressure (hPa)'] = np.nan

        # Check that the wind vector components are in the dataframe
        if 'eastward_wind_velocity (m s-1)' in df_binned.columns:
            # Calculate the wind speed
            df_binned['wind speed (m/s)'] = self.calculate_wind_speed(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])

            # Calculate the wind direction
            df_binned['wind direction'] = self.calculate_wind_direction(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])
            df_binned['wind direction'] = df_binned["wind direction"].apply(
                lambda x: x + 360 if x < 0 else x)

            # Don't need cardinal direction -> want direction in degrees
            # df_binned["wind direction"] = df_binned["wind direction"].apply(
            #   lambda x: self.get_cardinal_direction(np.round(x, decimals=2)))
        else:
            df_binned['wind speed (m/s)'] = np.nan
            df_binned['wind direction'] = np.nan

        # Return the processed data
        return df_binned

    def process_WAVSS_data(self, df, freq='10T'):
        """Much simpler function for processing the WAVSS data."""
        # Resample the data
        df_binned = df.resample(freq).mean()

        # Return the data
        return df_binned

    def _create_empty_dataset(self):
        """
        Create a dataset of all nans if there is no data available for
        the requested dataset in the given time period.
        """
        # Get the units for the corresponding variables
        info_url = self._erddap.get_info_url(
            dataset_id=self._erddap.dataset_id, response='csv')
        info = pd.read_csv(info_url)
        units = info[info['Attribute Name'] == 'units']

        # Now, add the units to the variable names
        columns = []
        for var in self._erddap.variables:
            unit = units[units['Variable Name'] == var]['Value'].values
            if len(unit) == 0:
                columns.append(f'{var}')
            elif var == 'time':
                pass
            else:
                columns.append(f'{var} ({unit[0]})')

        # Create an array of nans to fill out the empty dataframe
        empty_array = np.empty((2, len(columns)))
        empty_array[:] = np.nan

        # Put the empty array into a dataframe
        empty_df = pd.DataFrame(data=empty_array,
                                columns=columns,
                                index=[self.startTime, self.now])
        empty_df.index.name = 'time (UTC)'

        return empty_df

    def process_datasets(self, datasets):
        """Process the data for individual datasets."""
        self.datasets = datasets

        # Get the data for the individual datasets
        for dset in self.datasets.keys():
            self.datasets.update({dset: self.get_dataset(dset)})

        # Process the data
        for dset in self.datasets.keys():
            if 'METBK' in dset:
                self.datasets[dset] = self.process_METBK_data(
                    self.datasets[dset])
            else:
                self.datasets[dset] = self.process_WAVSS_data(
                    self.datasets[dset])

        # Add a header to the data in the datasets
        for key in self.datasets.keys():
            header = key.split('-', 2)[-1]
            for col in self.datasets.get(key).columns:
                self.datasets.get(key).rename(
                    columns={col: ' '.join((header, col))}, inplace=True)

    def parse_data_to_xml(self, data):
        """
        Function which takes in the 10-minute average buoy data,
        the station name, and two dictionaries which map the buoy
        column names to the xml tags, and outputs an xml file in
        the NDBC format.

        Returns:
            xml - a properly constructed xml file in the NDBC
            format for the given buoy data
        """

        # Start the xml file
        xml = ['<?xml version="1.0" encoding="ISO-8859-1"?>']

        # Iterate through the data
        for index in data.index:

            # Get the data associated with a row in the dataframe
            row = data.loc[index]

            # Reset a dictionary of the data
            xml_data = {}
            for key in self.data_map.keys():
                xml_data.update({key: self.data_map.get(key)})

            # Parse the data into the data dictionary
            for key in xml_data.keys():
                # Get the column name which corresponds to the ndbc tag
                column = self.name_map.get(key)
                # Check that the column was returned from the ERDDAP server
                if column in row.index:
                    value = row[column]
                    # If a nan, just leave it the default -9999
                    if str(value) == 'nan':
                        pass
                    else:
                        xml_data[key] = value
                # If no data, leave it as default -9999
                else:
                    pass

            # Write the parsed data to the xml file
            # Start the message
            xml.append('<message>')

            # Add in the station id
            xml.append(f'  <station>{self.WMO}</station>')

            # Get the time index
            time = row.name.strftime('%m/%d/%Y %H:%M:%S')
            xml.append(f'  <date>{time}</date>')

            # Missing fill value
            missing = str(-9999)
            xml.append(f'  <missing>{missing}</missing>')

            # Roundtime
            xml.append('  <roundtime>no</roundtime>')

            # Start of the data
            xml.append('  <met>')

            # Add in each data piece
            for tag in xml_data.keys():
                # Get the value
                value = xml_data.get(tag)
                value = str(value)
                # Add the data to the xml file
                xml.append(f'    <{tag}>{value}</{tag}>')

            # Finish off the message
            xml.append('  </met>')
            xml.append('</message>')

        # Return the results
        return xml
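
A sketch of how this class might be driven end to end; the station identifiers, deployment, WMO ID, and the two mapping dictionaries are placeholders rather than values from the original script:

from datetime import datetime, timedelta

# Placeholder maps: NDBC XML tag -> default value, and tag -> processed column name
data_map = {'baro1': -9999, 'wspd1': -9999, 'wdir1': -9999}
name_map = {
    'baro1': 'METBK sea_level_pressure (hPa)',
    'wspd1': 'METBK wind speed (m/s)',
    'wdir1': 'METBK wind direction',
}

now = datetime.utcnow()
buoy = NDBC(
    station_id='CP01CNSM',
    deploy_id='D0015',
    WMO='44076',
    currentTime=now,
    startTime=now - timedelta(hours=6),
    data_map=data_map,
    name_map=name_map,
)

# Map each matching dataset ID to its processed 10-minute data, then build the XML
datasets = {dset: None for dset in buoy.list_datasets()}
buoy.process_datasets(datasets)
xml_messages = [buoy.parse_data_to_xml(df) for df in buoy.datasets.values()]
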
# In[11]:

cdm_profile_variables = info.loc[
    info['Attribute Name'] == 'cdm_profile_variables', 'Value'
]

print(''.join(cdm_profile_variables))


# # Selecting variables by attributes

# In[12]:

e.get_var_by_attr(
    dataset_id='CP02PMCI-WFP01-03-CTDPFK000-ctdpf_ckl_wfp_instrument-telemetered-deployment0008-tabledap',
    standard_name='sea_water_temperature'
)


# # Easy to use CF conventions standards

# In[13]:


t_vars = [
    e.get_var_by_attr(
        dataset_id=glider, standard_name='sea_water_temperature'
    )[0] for glider in gliders
]
t_vars
Example #8
def get_standard_variables_and_metadata(server_link, standard_variable_list):

    # Get access to the server and find datasets associated with standard_name variable listed
    e = ERDDAP(server=server_link, protocol='tabledap', response='csv')

    # Define Filter for which datasets to look into
    kw = {
        'standard_name': ','.join(standard_variable_list),
        'min_lon': -180.0,
        'max_lon': 180.0,
        'min_lat': -90.0,
        'max_lat': 90.0,
        'min_time': '',
        'max_time': '',
        'cdm_data_type': ''
    }

    variable_to_groupby = [('latitude', 'degrees_north'),
                           ('longitude', 'degrees_east')]

    # Get available datasets from that server
    search_url = e.get_search_url(response='csv', **kw)
    datasets = pd.read_csv(search_url)

    # Print results
    print(e.server)
    print(f"{len(datasets)} datasets contain {', '.join(standard_variable_list)}")

    # Loop through different data sets and create a metadata dataFrame
    df = pd.DataFrame(columns=['Dataset ID'])

    for index, row in datasets.iterrows():
        # Get Info from dataset (mostly min/max lat/long)
        print(row['Dataset ID'])
        info_url = e.get_info_url(dataset_id=row['Dataset ID'], response='csv')
        info = pd.read_csv(info_url)
        attribute_table = info.set_index(
            ['Row Type', 'Variable Name',
             'Attribute Name']).transpose()['attribute']

        # Try to get the distinct lat/long and time and depth range for that dataset, if it fails rely on the
        # ERDDAP metadata
        try:
            # If dataset is spread out geographically find distinct locations (may not work well for trajectory data)
            latlong_url = e.get_download_url(
                dataset_id=row['Dataset ID'],
                protocol='tabledap',
                variables=['latitude', 'longitude', 'time'])

            # Get add to the url commands to get distinct values and ordered with min and max time for each lat/long
            distinctMinMaxTime_url = latlong_url + '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Ctime%22)'

            # Get lat/long and min/max depth for this dataset
            data = pd.read_csv(distinctMinMaxTime_url, header=[0, 1])

            # Group data by latitude/longitude and get min max values
            data_reduced = data.groupby(by=variable_to_groupby).agg(
                ['min', 'max']).reset_index()

            if info[(info['Variable Name'] == 'depth')].size > 0:
                latlongdepth_url = e.get_download_url(
                    dataset_id=row['Dataset ID'],
                    protocol='tabledap',
                    variables=['latitude', 'longitude', 'depth'])

                # Get add to the url commands to get distinct values and ordered with min and max depth for
                # each lat/long
                distinctMinMaxDepth_url = latlongdepth_url + \
                                          '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Cdepth%22)'

                # Get lat/long and min/max depth for this dataset
                data_depth = pd.read_csv(distinctMinMaxDepth_url,
                                         header=[0, 1])

                # Group depth data by lat/long and get min max values
                data_depth_reduced = data_depth.groupby(
                    by=variable_to_groupby).agg(['min', 'max']).reset_index()

                # Merge depth values with time
                data_reduced = data_reduced.merge(data_depth_reduced,
                                                  on=variable_to_groupby,
                                                  how='left')

            # Merge multi index column names
            data_reduced.columns = data_reduced.columns.map(
                ' '.join).str.strip(' ')

        except Exception as exception_error:

            print('Failed to read: ' + str(exception_error))
            # If there's only one location, it could get the range from metadata

            # Find lat/long range of this dataset, if it's point we don't need to look into it
            min_latitude = float(attribute_table['NC_GLOBAL',
                                                 'geospatial_lat_min'].Value)
            max_latitude = float(attribute_table['NC_GLOBAL',
                                                 'geospatial_lat_max'].Value)
            min_longitude = float(attribute_table['NC_GLOBAL',
                                                  'geospatial_lon_min'].Value)
            max_longitude = float(attribute_table['NC_GLOBAL',
                                                  'geospatial_lon_max'].Value)

            # If min/max lat/long are the same don't go in the dataset
            if (min_latitude == max_latitude) & (min_longitude == max_longitude):
                data_reduced = {}
                data_reduced['latitude degrees_north'] = min_latitude
                data_reduced['longitude degrees_east'] = min_longitude

                if ('depth' in attribute_table.columns
                        and 'actual_range' in attribute_table['depth']
                        and attribute_table['depth', 'units']['Value'] == 'm'):

                    depth_range = np.array(
                        attribute_table['depth', 'actual_range']['Value'].split(',')
                    ).astype(float)  # np.float was removed in NumPy 1.24
                    data_reduced['depth m min'] = depth_range[0]
                    data_reduced['depth m max'] = depth_range[1]

                # Convert to DataFrame
                data_reduced = pd.DataFrame(data_reduced, index=[0])
                print('Retrieved metadata')
            else:
                # Won't handle data with multiple location that it can't retrieve the data
                continue

        # Add Standard Name Variable Name to table info['Attribute Name'] == 'geospatial_lat_min'
        for var in standard_variable_list:
            data_reduced[var] = ','.join(
                e.get_var_by_attr(dataset_id=row['Dataset ID'],
                                  standard_name=var))

        # Add cdm_data_type to table
        data_reduced['cdm_data_type'] = ','.join(
            info[info['Attribute Name'] == 'cdm_data_type']['Value'].values)

        # Add Dataset id to the table
        data_reduced['Dataset ID'] = row['Dataset ID']

        # Merge that dataset ID with previously downloaded data
        df = pd.concat([df, data_reduced])  # DataFrame.append was removed in pandas 2.0

    # Add server to dataFrame
    df['server'] = e.server

    # Save resulting dataframe to a CSV, file name is based on the server address
    file_name = re.sub('https*://', '', e.server)
    file_name = re.sub("[\./]", '_', file_name)
    file_name = 'Server_List_' + file_name + '.csv'

    print('Save result to ' + file_name)
    df.to_csv(file_name)

    return df
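
A hedged usage sketch with an example ERDDAP endpoint and standard_name list; note the function also writes its result to a `Server_List_*.csv` file:

metadata = get_standard_variables_and_metadata(
    'http://www.neracoos.org/erddap',
    ['sea_water_temperature', 'sea_water_practical_salinity'],
)
print(metadata[['Dataset ID', 'latitude degrees_north', 'longitude degrees_east']].head())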