Example #1
print(e.get_download_url())

If we change the response to `html`, we can visualize the page.

def show_iframe(src):
    """Display a URL in the notebook via an inline IFrame."""
    from IPython.display import HTML

    iframe = '<iframe src="{src}" width="100%" height="950"></iframe>'.format
    return HTML(iframe(src=src))


show_iframe(e.get_download_url(response="html"))

Additionally, the object has `.get_info_url()` and `.get_search_url()` methods that return the info and search URLs, respectively.

show_iframe(e.get_info_url(response="html"))

show_iframe(e.get_search_url(response="html"))

`erddapy` also provides simple methods to download the data into common data structures, like `pandas.DataFrame` and `xarray.Dataset`.

df = e.to_pandas(index_col="time (UTC)", parse_dates=True).dropna()


df.head()

ds = e.to_xarray(decode_times=False)

ds["temperature"]

Here is a simple plot using the data from `xarray`.
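
The plotting code is not shown in this snippet; here is a minimal sketch of such a plot, assuming the one-dimensional `temperature` variable loaded above.

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(9, 3.75))
ds["temperature"].plot(ax=ax, marker=".", linestyle="none")
plt.show()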
Example #2
class ErddapReader(Reader):
    """
    This class searches ERDDAP servers. Two known_servers are built in, but
    other servers can be supplied as well.

    Attributes
    ----------
    parallel: boolean
        If True, run with simple parallelization using `multiprocessing`.
        If False, run serially.
    known_server: string
        Two ERDDAP servers are built in to be known to this reader: "ioos" and
        "coastwatch".
    e: ERDDAP server instance
    e.protocol: string
        * "tabledap" (pandas, appropriate for reading as csv)
        * "griddap" (xarray, appropriate for reading as netcdf)
    e.server: string
        The URL of the ERDDAP server.
    columns: list
        Metadata columns
    name: string
        "erddap_ioos", "erddap_coastwatch", or a constructed string if the user
        inputs a new protocol and server.
    reader: string
        reader is defined as "ErddapReader".
    """
    def __init__(self,
                 known_server="ioos",
                 protocol=None,
                 server=None,
                 parallel=True):
        """
        Parameters
        ----------
        known_server: string, optional
            Two ERDDAP servers are built in to be known to this reader:
            "ioos" and "coastwatch".
        protocol, server: string, optional
            For a user-defined ERDDAP server, input the protocol as one of the
            following:
            * "tabledap" (pandas, appropriate for reading as csv)
            * "griddap" (xarray, appropriate for reading as netcdf)
            and the server address (such as
            "http://erddap.sensors.ioos.us/erddap" or
            "http://coastwatch.pfeg.noaa.gov/erddap").
        parallel: boolean
            If True, run with simple parallelization using `multiprocessing`.
            If False, run serially.
        """
        self.parallel = parallel

        # hard wire this for now
        filetype = "netcdf"

        # either select a known server or input protocol and server string
        if known_server == "ioos":
            protocol = "tabledap"
            server = "http://erddap.sensors.ioos.us/erddap"
            filetype = "netcdf"  # other option: "csv"
        elif known_server == "coastwatch":
            protocol = "griddap"
            server = "http://coastwatch.pfeg.noaa.gov/erddap"
            filetype = "netcdf"  # other option: "csv"
        elif known_server is not None:
            statement = (
                "either select a known server or input protocol and server string"
            )
            assert (protocol is not None) & (server is not None), statement
        else:
            known_server = urllib.parse.urlparse(server).netloc
            # known_server = server.strip("/erddap").strip("http://").replace(".", "_")
            statement = (
                "either select a known server or input protocol and server string"
            )
            assert (protocol is not None) & (server is not None), statement

        self.known_server = known_server
        self.e = ERDDAP(server=server)
        self.e.protocol = protocol
        self.e.server = server
        self.filetype = filetype

        # columns for metadata
        self.columns = [
            "geospatial_lat_min",
            "geospatial_lat_max",
            "geospatial_lon_min",
            "geospatial_lon_max",
            "time_coverage_start",
            "time_coverage_end",
            "defaultDataQuery",
            "subsetVariables",  # first works for timeseries sensors, 2nd for gliders
            "keywords",  # for hf radar
            "id",
            "infoUrl",
            "institution",
            "featureType",
            "source",
            "sourceUrl",
        ]

        # name
        self.name = f"erddap_{known_server}"

        self.reader = "ErddapReader"
        self.store = dict()

    def __getitem__(self, key):
        """Redefinition of dict-like behavior.

        This enables user to use syntax `reader[dataset_id]` to read in and
        save dataset into the object.

        Parameters
        ----------
        key: str
            dataset_id for a dataset that is available in the search/reader
            object.

        Returns
        -------
        xarray Dataset of the data associated with key
        """

        returned_data = self.data_by_dataset(key)
        # returned_data = self._return_data(key)
        self.__setitem__(key, returned_data)
        return returned_data

    def find_dataset_id_from_station(self, station):
        """Find dataset_id from station name.

        Parameters
        ----------
        station: string
            Station name for which to search for dataset_id
        """

        if station is None:
            return None
        # for station in self._stations:
        # if the station name has more than one word, AND will be inserted
        # between the words so that all terms are searched for together.
        url = self.e.get_search_url(response="csv",
                                    items_per_page=5,
                                    search_for=station)

        try:
            df = pd.read_csv(url)
        except Exception as e:
            logger.exception(e)
            logger.warning(
                f"search url {url} did not work for station {station}.")
            return

        # first try for exact station match
        try:
            # Special case for TABS: don't split the id name
            if "tabs" in station:  # don't split
                dataset_id = [
                    dataset_id for dataset_id in df["Dataset ID"]
                    if station.lower() == dataset_id.lower()
                ][0]
            else:
                # first try as dataset_id then do as station name
                dataset_id = [
                    dataset_id for dataset_id in df["Dataset ID"]
                    if station.lower() in [dataset_id.lower()] +
                    dataset_id.lower().split("_")
                ][0]

        except Exception as e:
            logger.exception(e)
            logger.warning(
                "When searching for a dataset id to match station name %s, the first attempt to match the id did not work."
                % (station))
            # If that doesn't work, return None for dataset_id
            dataset_id = None
            # # if that doesn't work, trying for more general match and just take first returned option
            # dataset_id = df.iloc[0]["Dataset ID"]

        return dataset_id

    @property
    def dataset_ids(self):
        """Find dataset_ids for server.

        Notes
        -----
        The dataset_ids are found by querying the metadata through the ERDDAP server.

        The number of dataset_ids can change if a variable is removed from the
        list of variables and this is rerun.
        """

        if not hasattr(self, "_dataset_ids") or (
                self.variables and
            (len(self.variables) != self.num_variables)):

            # This should be a region search
            if self.approach == "region":

                # find all the dataset ids which we will use to get the data
                # This limits the search to our keyword arguments in kw which should
                # have min/max lon/lat/time values
                dataset_ids = []
                if self.variables is not None:
                    for variable in self.variables:

                        # find and save all dataset_ids associated with variable
                        search_url = self.e.get_search_url(
                            response="csv",
                            **self.kw,
                            variableName=variable,
                            items_per_page=10000,
                        )

                        try:
                            search = pd.read_csv(search_url)
                            dataset_ids.extend(search["Dataset ID"])
                        except Exception as e:
                            logger.exception(e)
                            logger.warning(
                                f"variable {variable} was not found in the search"
                            )
                            logger.warning(f"search_url: {search_url}")

                else:

                    # find and save all dataset_ids associated with variable
                    search_url = self.e.get_search_url(response="csv",
                                                       **self.kw,
                                                       items_per_page=10000)

                    try:
                        search = pd.read_csv(search_url)
                        dataset_ids.extend(search["Dataset ID"])
                    except Exception as e:
                        logger.exception(e)
                        logger.warning("nothing found in the search")
                        logger.warning(f"search_url: {search_url}")

                # only need a dataset id once since we will check them each for all standard_names
                self._dataset_ids = list(set(dataset_ids))

            # This should be a search for the station names
            elif self.approach == "stations":

                # search by station name for each of stations
                if self.parallel:
                    # get metadata for datasets
                    # run in parallel to save time
                    num_cores = multiprocessing.cpu_count()
                    dataset_ids = Parallel(n_jobs=num_cores)(
                        delayed(self.find_dataset_id_from_station)(station)
                        for station in self._stations)

                else:
                    dataset_ids = []
                    for station in self._stations:
                        dataset_ids.append(
                            self.find_dataset_id_from_station(station))

                # remove None from list
                dataset_ids = [i for i in dataset_ids if i]

                # In this case return all dataset_ids so they match 1-1 with
                # the input station list.
                self._dataset_ids = dataset_ids

            else:
                logger.warning(
                    "Neither stations nor region approach were used in function dataset_ids."
                )

            # update number of variables
            if self.variables:
                self.num_variables = len(self.variables)

        return self._dataset_ids

    def meta_by_dataset(self, dataset_id):
        """Return the catalog metadata for a single dataset_id."""

        info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id)
        try:
            info = pd.read_csv(info_url)
        except Exception as e:
            logger.exception(e)
            logger.warning(f"Could not read info from {info_url}")
            return {dataset_id: []}

        items = []

        for col in self.columns:

            try:
                item = info[info["Attribute Name"] == col]["Value"].values[0]
                dtype = info[info["Attribute Name"] ==
                             col]["Data Type"].values[0]
            except Exception:
                # dtype stays None so no type conversion is attempted below
                dtype = None
                if col == "featureType":
                    # this column is not present in HF Radar metadata but want it to
                    # map to data_type, so input 'grid' in that case.
                    item = "grid"
                else:
                    item = "NA"

            if dtype == "String":
                pass
            elif dtype == "double":
                item = float(item)
            elif dtype == "int":
                item = int(item)
            items.append(item)

        # include download link ##
        self.e.dataset_id = dataset_id
        if self.e.protocol == "tabledap":
            # set the same time constraints as before
            self.e.constraints = {
                "time<=": self.kw["max_time"],
                "time>=": self.kw["min_time"],
            }
            if self.filetype == "csv":
                download_url = self.e.get_download_url(response="csvp")
            elif self.filetype == "netcdf":
                download_url = self.e.get_download_url(response="ncCf")

        elif self.e.protocol == "griddap":
            # the search terms that can be input for tabledap do not work for griddap
            # in erddapy currently. Instead, put together an opendap link and then
            # narrow the dataset with xarray.
            # get opendap link
            download_url = self.e.get_download_url(response="opendap")

        # check if "prediction" is present in metadata, esp in case of NOAA
        # model predictions
        is_prediction = "Prediction" in " ".join(
            list(info["Value"].replace(np.nan, None).values))

        # add erddap server name
        return {
            dataset_id:
            [self.e.server, download_url, info_url, is_prediction] + items +
            [self.variables]
        }

    @property
    def meta(self):
        """Rearrange the individual metadata into a dataframe.

        Notes
        -----
        This should exclude duplicate entries.
        """

        if not hasattr(self, "_meta"):

            if self.parallel:

                # get metadata for datasets
                # run in parallel to save time
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.meta_by_dataset)(dataset_id)
                    for dataset_id in self.dataset_ids)

            else:

                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.meta_by_dataset(dataset_id))

            # make dict from individual dicts
            from collections import ChainMap

            meta = dict(ChainMap(*downloads))

            # Make dataframe of metadata
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(
                meta,
                orient="index",
                columns=[
                    "database", "download_url", "info_url", "is_prediction"
                ] + self.columns + ["variable names"],
            )

        return self._meta

    def data_by_dataset(self, dataset_id):
        """Return the data for a single dataset_id.

        Returns
        -------
        A tuple of (dataset_id, data), where data type is a pandas DataFrame

        Notes
        -----
        Data is read into memory.
        """

        if self.filetype == "csv":
            # if self.e.protocol == "tabledap":
            try:
                # fetch metadata if not already present
                # found download_url from metadata and use
                self.e.dataset_id = dataset_id
                # dataset_vars gives a list of the variables in the dataset
                dataset_vars = (self.meta.loc[dataset_id]
                                ["defaultDataQuery"].split("&")[0].split(","))
                # vars_present gives the variables in self.variables
                # that are actually in the dataset
                vars_present = []
                for selfvariable in self.variables:
                    vp = [var for var in dataset_vars if var == selfvariable]
                    if len(vp) > 0:
                        vars_present.append(vp[0])
                # If any variables are not present, this doesn't work.
                if self.variables is not None:
                    self.e.variables = [
                        "time",
                        "longitude",
                        "latitude",
                        "station",
                    ] + vars_present
                dd = self.e.to_pandas(response="csvp",
                                      index_col=0,
                                      parse_dates=True)
                # dd = self.e.to_pandas(response='csv', header=[0, 1],
                #                       index_col=0, parse_dates=True)
                # dd = pd.read_csv(
                #     download_url, header=[0, 1], index_col=0, parse_dates=True
                # )

                # Drop cols and rows that are only NaNs.
                dd = dd.dropna(axis="index", how="all").dropna(axis="columns",
                                                               how="all")

                if self.variables is not None:
                    # check to see if there is any actual data
                    # this is a bit convoluted because the column names are the variable names
                    # plus units so can't match 1 to 1.
                    # number of columns that represent data instead of metadata
                    datacols = 0
                    for col in dd.columns:
                        datacols += [
                            varname in col for varname in self.variables
                        ].count(True)
                    # if no datacols, we can skip this one.
                    if datacols == 0:
                        dd = None

            except Exception as e:
                logger.exception(e)
                logger.warning("no data to be read in for %s" % dataset_id)
                dd = None

        elif self.filetype == "netcdf":
            # elif self.e.protocol == "griddap":

            if self.e.protocol == "tabledap":

                try:
                    # assume I don't need to narrow in space since time series (tabledap)
                    self.e.dataset_id = dataset_id
                    dd = self.e.to_xarray()
                    # dd = xr.open_dataset(download_url, chunks="auto")
                    dd = dd.swap_dims({"obs": dd.cf["time"].name})
                    dd = dd.sortby(dd.cf["time"], ascending=True)
                    dd = dd.cf.sel(
                        T=slice(self.kw["min_time"], self.kw["max_time"]))
                    # dd = dd.set_coords(
                    #     [dd.cf["longitude"].name, dd.cf["latitude"].name]
                    # )

                    # use variable names to drop other variables (should I do this?)
                    if self.variables is not None:
                        # I don't think this is true with new approach
                        # # ERDDAP prepends variables with 's.' in netcdf files,
                        # # so include those with variables
                        # erd_vars = [f's.{var}' for var in self.variables]
                        # var_list = set(dd.data_vars) - (set(self.variables) | set(erd_vars))
                        var_list = set(dd.data_vars) - set(self.variables)
                        dd = dd.drop_vars(var_list)

                    # the lon/lat are on the 'timeseries' singleton dimension
                    # but the data_var variable was not, which messed up
                    # cf-xarray. When longitude and latitude are not on a
                    # dimension shared with a variable, the variable can't be
                    # called with cf-xarray. e.g. dd.cf['ssh'] won't work.
                    if "timeseries" in dd.dims:
                        for data_var in dd.data_vars:
                            if "timeseries" not in dd[data_var].dims:
                                dd[data_var] = dd[data_var].expand_dims(
                                    dim="timeseries", axis=1)

                except Exception as e:
                    logger.exception(e)
                    logger.warning("no data to be read in for %s" % dataset_id)
                    dd = None

            elif self.e.protocol == "griddap":

                try:
                    # this makes it read in the whole file which might be large
                    self.e.dataset_id = dataset_id
                    # dd = self.e.to_xarray(chunks="auto").sel(
                    #     time=slice(self.kw["min_time"], self.kw["max_time"])
                    # )
                    download_url = self.e.get_download_url(response="opendap")
                    dd = xr.open_dataset(download_url, chunks="auto").sel(
                        time=slice(self.kw["min_time"], self.kw["max_time"]))

                    if ("min_lat" in self.kw) and ("max_lat" in self.kw):
                        dd = dd.sel(latitude=slice(self.kw["min_lat"],
                                                   self.kw["max_lat"]))

                    if ("min_lon" in self.kw) and ("max_lon" in self.kw):
                        dd = dd.sel(longitude=slice(self.kw["min_lon"],
                                                    self.kw["max_lon"]))

                    # use variable names to drop other variables (should I do this?)
                    if self.variables is not None:
                        vars_list = set(dd.data_vars) - set(self.variables)
                        dd = dd.drop_vars(vars_list)

                except Exception as e:
                    logger.exception(e)
                    logger.warning("no data to be read in for %s" % dataset_id)
                    dd = None

        # return (dataset_id, dd)
        return dd

    # @property
    def data(self, dataset_ids=None):
        """Read in data for some or all dataset_ids.

        NOT USED CURRENTLY

        Once data is read in for a dataset_ids, it is remembered.

        See full documentation in `utils.load_data()`.
        """

        output = odg.utils.load_data(self, dataset_ids)
        return output
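
# A hedged sketch of how this reader might be driven; the attributes assigned
# below (approach, kw, variables) are normally filled in by the surrounding
# package, and the region/time values are placeholders:

reader = ErddapReader(known_server="ioos", parallel=False)
reader.approach = "region"
reader.kw = {
    "min_lon": -99, "max_lon": -88,
    "min_lat": 20, "max_lat": 31,
    "min_time": "2021-04-01", "max_time": "2021-04-02",
}
reader.variables = ["sea_water_temperature"]

print(reader.dataset_ids)            # dataset_ids found for the region and variable
print(reader.meta.head())            # metadata table built from meta_by_dataset
dd = reader[reader.dataset_ids[0]]   # dict-style read of one dataset (__getitem__)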
Example #3
# In[9]:

search_url = e.get_search_url(response='csv', **kw)
search = pd.read_csv(search_url)
gliders = search['Dataset ID'].values

msg = 'Found {} Glider Datasets:\n\n{}'.format
print(msg(len(gliders), '\n'.join(gliders)))

# With the Dataset IDs we can explore the metadata with the *get_info_url*

# In[10]:

print(gliders[0])

info_url = e.get_info_url(dataset_id=gliders[0], response='csv')
info = pd.read_csv(info_url)

info.head()

# In[11]:

cdm_profile_variables = info.loc[info['Attribute Name'] ==
                                 'cdm_profile_variables', 'Value']

print(''.join(cdm_profile_variables))

# # Selecting variables by attributes

# In[12]:
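
# The exported notebook is cut off here; a hedged sketch of what this cell
# could look like using erddapy's get_var_by_attr (the standard_name value
# below is only an example):

# variables in the first glider dataset that define any standard_name
e.get_var_by_attr(dataset_id=gliders[0], standard_name=lambda v: v is not None)

# or look up the variable(s) matching one particular standard_name
e.get_var_by_attr(dataset_id=gliders[0], standard_name='sea_water_temperature')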
Example #4
def get_coordinates(df, **kw):
    '''
    Example ERDDAP TableDAP URL:

    dataset_url = '%s/tabledap/%s.csvp?latitude,longitude,time&longitude>=-72.0&longitude<=-69&latitude>=38&latitude<=41&time>=1278720000.0&time<=1470787200.0&distinct()' % (all_datasets['server'].iloc[int(i)],all_datasets['Dataset ID'].iloc[int(i)])
    '''
    df_coords = pd.DataFrame()

    # alternate approach to above is iterate the original DataFrame passed (df), stopping either
    #   at final_dataset_limit (10 currently) or the max # of rows in df (conclusion of for loop)
    #   previous enclosing while loop is unnecessary as a result
    final_dataset_limit = 10
    datasets_found = 0
    if df.shape[0] < final_dataset_limit:
        final_dataset_limit = df.shape[0]

    index_random = random.sample(range(0, df.shape[0]), df.shape[0])
    print("index_random: {}".format(index_random))

    #for i in range(subset_datasets.shape[0]):
    for i in index_random:
        server_url = df['server'].iloc[int(i)]
        dataset_id = df['Dataset ID'].iloc[int(i)]
        institution = df['Institution'].iloc[int(i)]

        # skip some difficult datasets for now:
        if "ROMS" in dataset_id or "DOP" in dataset_id:  # skip ROMS model output
            #print("Skipping %s" % server_url + dataset_id)
            continue

        e = ERDDAP(server=server_url, protocol='tabledap', response='csv')
        try:
            print("datasets_found: {}".format(datasets_found))
            # former config for query, replaced with new code below:
            #e.variables=["latitude","longitude"]#,"time"]
            #e.dataset_id = all_datasets['Dataset ID'].iloc[int(i)]
            #e.constraints = {
            #       "time>=": kw['min_time'],
            #       "time<=": kw['max_time'],
            #       "longitude>=": kw['min_lon'],
            #       "longitude<=": kw['max_lon'],
            #       "latitude>=": kw['min_lat'],
            #       "latitude<=": kw['max_lat'],
            #       "distinct" : ()
            #}

            # Generate a download URL via e.get_download_url and pass to Pandas DataFrame via read_csv
            #   we need to use e.constraints here rather than in e.get_download_url to allow appending '>=' '<=' to the constraints keys to match ERDDAP's API
            #   (parameter signature differs from the search API used above)
            # also add a 'distinct = ()' param, generate a download url, and submit a csv dataset download request to ERDDAP
            #kw["distinct"] = "()"
            e.constraints = {
                "time>=": kw['min_time'],
                "time<=": kw['max_time'],
                "longitude>=": kw['min_lon'],
                "longitude<=": kw['max_lon'],
                "latitude>=": kw['min_lat'],
                "latitude<=": kw['max_lat'],
                "distinct": ()
            }
            url = e.get_download_url(
                #constraints=kw,
                response="csvp",
                dataset_id=df['Dataset ID'].iloc[int(i)],
                variables=["latitude", "longitude"])
            print("Download URL: {}".format(url))

            #coords = pd.read_csv(url, headers=headers)
            coords = pd.read_csv(url)
            coords['dataset_count'] = i
            coords['dataset_download_url'] = url
            coords['Dataset ID'] = dataset_id
            coords['Institution'] = institution

            metadata_url = e.get_info_url(
                dataset_id=df['Dataset ID'].iloc[int(i)], response='csv')

            metadata = pd.read_csv(metadata_url)

            coords['cdm_data_type'] = "".join(
                metadata.loc[metadata["Attribute Name"] == "cdm_data_type",
                             "Value"])

            #get_var_by_attr example (ToDo):
            #e.get_var_by_attr(dataset_id, standard_name='northward_sea_water_velocity')

            print(coords.head())
            df_coords = pd.concat([df_coords, coords])

            # reaching this point in the query means the dataset query was successful, increment
            #   we need to break out of for loop here however if we reach final_dataset_limit to not go over:
            datasets_found += 1
            print("new dataset acquired; datasets_found: {}".format(
                datasets_found))
            if datasets_found == final_dataset_limit: break

        except Exception as ex:
            # can happen if the dataset does not have any features within the query window, just log it here:
            if type(ex).__name__ in ["HTTPError"]:
                print(ex)
            #raise
            pass

    return df_coords
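
# A hedged sketch of calling get_coordinates; `all_datasets` is a hypothetical
# DataFrame of ERDDAP search results with 'server', 'Dataset ID', and
# 'Institution' columns, and the bounding box/time values are placeholders:

kw = {
    'min_time': '2016-07-10T00:00:00Z', 'max_time': '2016-08-10T00:00:00Z',
    'min_lon': -72.0, 'max_lon': -69.0,
    'min_lat': 38.0, 'max_lat': 41.0,
}
df_coords = get_coordinates(all_datasets, **kw)
print(df_coords.head())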
# * BUOY (surface buoy)
# * MFN (multifunction node - on the bottom of the ocean)
# * NSIF (near-surface instrument frame - located at 7 m depth)
#
# First, let's try the CTDBP on the NSIF:

url = erd.get_search_url(search_for='"CP01CNSM NSIF CTDBP"', response='csv')

datasets = to_df(url)['Dataset ID']
datasets

erd.dataset_id = datasets[0]

# Check what variables are available on the dataset:

info_url = erd.get_info_url(response='html')
show_iframe(info_url)

info_url = erd.get_info_url(response='csv')

info_df = to_df(info_url)
info_df

info_df[info_df['Row Type'] == 'variable']

# Take a look at the variables with standard names:

variables = erd.get_var_by_attr(standard_name=lambda v: v is not None)
variables

# These are the standard variables for the CTDBP instrument - specifically for the CP01CNSM-NSIF-CTDBP. Next, let's query the server for _all_ available data from the CP01CNSM-NSIF-CTDBP.
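
# The notebook is cut off here; a hedged sketch of one way to request the data
# with erddapy (the time constraint below is illustrative, not from the source):

erd.variables = variables                             # standard-name variables found above
erd.constraints = {'time>=': '2019-01-01T00:00:00Z'}  # placeholder time window
data = erd.to_pandas(index_col='time (UTC)', parse_dates=True)
data.head()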
Example #6
class ErddapReader:
    

    def __init__(self, known_server='ioos', protocol=None, server=None, parallel=True):
        
#         # run checks for KW 
#         self.kw = kw

        self.parallel = parallel
    
        
        # either select a known server or input protocol and server string
        if known_server == 'ioos':
            protocol = 'tabledap'
            server = 'http://erddap.sensors.ioos.us/erddap'
        elif known_server == 'coastwatch':
            protocol = 'griddap'
            server = 'http://coastwatch.pfeg.noaa.gov/erddap'
        elif known_server is not None:
            statement = 'either select a known server or input protocol and server string'
            assert (protocol is not None) & (server is not None), statement
        else:
            # note: str.strip removes a set of characters rather than a substring;
            # the reader in Example #2 uses urllib.parse.urlparse for this instead
            known_server = server.strip('/erddap').strip('http://').replace('.','_')
            statement = 'either select a known server or input protocol and server string'
            assert (protocol is not None) & (server is not None), statement
        
        self.known_server = known_server
        self.e = ERDDAP(server=server)
        self.e.protocol = protocol
        self.e.server = server
                  
        # columns for metadata
        self.columns = ['geospatial_lat_min', 'geospatial_lat_max', 
               'geospatial_lon_min', 'geospatial_lon_max', 
               'time_coverage_start', 'time_coverage_end',
               'defaultDataQuery', 'subsetVariables',  # first works for timeseries sensors, 2nd for gliders
               'keywords',  # for hf radar
               'id', 'infoUrl', 'institution', 'featureType', 'source', 'sourceUrl']
        
        # name
        self.name = f'erddap_{known_server}'
        
        self.reader = 'ErddapReader'
        
# #         self.data_type = data_type
#         self.standard_names = standard_names
#         # DOESN'T CURRENTLY LIMIT WHICH VARIABLES WILL BE FOUND ON EACH SERVER

    
    
    @property
    def dataset_ids(self):
        '''Find dataset_ids for server.'''
        
        if not hasattr(self, '_dataset_ids'):
            
            # This should be a region search
            if self.approach == 'region':
        
                # find all the dataset ids which we will use to get the data
                # This limits the search to our keyword arguments in kw which should 
                # have min/max lon/lat/time values
                dataset_ids = []
                if self.variables is not None:
                    for variable in self.variables:

                        # find and save all dataset_ids associated with variable
                        search_url = self.e.get_search_url(response="csv", **self.kw, 
                                                           variableName=variable, 
                                                           items_per_page=10000)

                        try:
                            search = pd.read_csv(search_url)
                            dataset_ids.extend(search["Dataset ID"])
                        except Exception as e:
                            logger_erd.exception(e)
                            logger_erd.warning(f"variable {variable} was not found in the search")
                            logger_erd.warning(f'search_url: {search_url}')

                else:
                    
                    # find and save all dataset_ids associated with variable
                    search_url = self.e.get_search_url(response="csv", **self.kw, 
                                                       items_per_page=10000)

                    try:
                        search = pd.read_csv(search_url)
                        dataset_ids.extend(search["Dataset ID"])
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning(f"nothing found in the search")
                        logger_erd.warning(f'search_url: {search_url}')

                    
                # only need a dataset id once since we will check them each for all standard_names
                self._dataset_ids = list(set(dataset_ids))
            
            # This should be a search for the station names
            elif self.approach == 'stations':
#             elif self._stations is not None:
                
                # search by station name for each of stations
                dataset_ids = []
                for station in self._stations:
                    # if the station name has more than one word, AND will be inserted
                    # between the words so that all terms are searched for together
                    url = self.e.get_search_url(response="csv", items_per_page=5, search_for=station)

                    try:
                        df = pd.read_csv(url)
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning(f'search url {url} did not work for station {station}.')
                        continue
    
                    # first try for exact station match
                    try:
                        dataset_id = [dataset_id for dataset_id in df['Dataset ID'] if station.lower() in dataset_id.lower().split('_')][0]

                    # if that doesn't work, try for a more general match and just take the first returned option
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning('When searching for a dataset id to match station name %s, the first attempt to match the id did not work.' % (station))
                        dataset_id = df.iloc[0]['Dataset ID']
        
#                         if 'tabs' in org_id:  # don't split
#                             axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() == axiom_id.lower()]
#                         else:
#                             axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() in axiom_id.lower().split('_')][0]
                
#                     except:
#                         dataset_id = None
                
                    dataset_ids.append(dataset_id)
                    
                self._dataset_ids = list(set(dataset_ids))
                
            else:
                logger_erd.warning('Neither stations nor region approach were used in function dataset_ids.')
                
            
        return self._dataset_ids
        
    
    def meta_by_dataset(self, dataset_id):

        info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id)
        info = pd.read_csv(info_url)

        items = []

        for col in self.columns:

            try:
                item = info[info['Attribute Name'] == col]['Value'].values[0]
                dtype = info[info['Attribute Name'] == col]['Data Type'].values[0]
            except Exception:
                # dtype stays None so no type conversion is attempted below
                dtype = None
                if col == 'featureType':
                    # this column is not present in HF Radar metadata but want it to
                    # map to data_type, so input 'grid' in that case.
                    item = 'grid'
                else:
                    item = 'NA'

            if dtype == 'String':
                pass
            elif dtype == 'double':
                item = float(item)
            elif dtype == 'int':
                item = int(item)
            items.append(item)
            
#         if self.standard_names is not None:
#             # In case the variable is named differently from the standard names, 
#             # we back out the variable names here for each dataset. This also only 
#             # returns those names for which there is data in the dataset.
#             varnames = self.e.get_var_by_attr(
#                 dataset_id=dataset_id,
#                 standard_name=lambda v: v in self.standard_names
#             )
#         else:
#             varnames = None

        ## include download link ##
        self.e.dataset_id = dataset_id
        if self.e.protocol == 'tabledap':
            if self.variables is not None:
                self.e.variables = ["time","longitude", "latitude", "station"] + self.variables
            # set the same time constraints as before
            self.e.constraints = {'time<=': self.kw['max_time'], 'time>=': self.kw['min_time'],}
            download_url = self.e.get_download_url(response='csvp')

        elif self.e.protocol == 'griddap':
            # the search terms that can be input for tabledap do not work for griddap
            # in erddapy currently. Instead, put together an opendap link and then 
            # narrow the dataset with xarray.
            # get opendap link
            download_url = self.e.get_download_url(response='opendap')
        
        # add erddap server name
        return {dataset_id: [self.e.server, download_url] + items + [self.variables]}
    
      
    @property
    def meta(self):
        
        if not hasattr(self, '_meta'):
            
            if self.parallel:
            
                # get metadata for datasets
                # run in parallel to save time
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.meta_by_dataset)(dataset_id) for dataset_id in self.dataset_ids
                )
                
            else:

                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.meta_by_dataset(dataset_id))

            # make dict from individual dicts
            from collections import ChainMap
            meta = dict(ChainMap(*downloads)) 

            # Make dataframe of metadata
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(meta, orient='index', 
                                                columns=['database','download_url'] \
                                                + self.columns + ['variable names'])
           
        return self._meta       
    
    
    def data_by_dataset(self, dataset_id):

        download_url = self.meta.loc[dataset_id, 'download_url']
        # data variables in ds that are not the variables we searched for
#         varnames = self.meta.loc[dataset_id, 'variable names']

        if self.e.protocol == 'tabledap':

            try:

                # fetch metadata if not already present
                # found download_url from metadata and use
                dd = pd.read_csv(download_url, index_col=0, parse_dates=True)
                
                # Drop cols and rows that are only NaNs.
                dd = dd.dropna(axis='index', how='all').dropna(axis='columns', how='all')

                if self.variables is not None:
                    # check to see if there is any actual data
                    # this is a bit convoluted because the column names are the variable names 
                    # plus units so can't match 1 to 1.
                    datacols = 0  # number of columns that represent data instead of metadata
                    for col in dd.columns:
                        datacols += [varname in col for varname in self.variables].count(True)
                    # if no datacols, we can skip this one.
                    if datacols == 0:
                        dd = None
                    
            except Exception as e:
                logger_erd.exception(e)
                logger_erd.warning('no data to be read in for %s' % dataset_id)
                dd = None
        
        elif self.e.protocol == 'griddap':

            try:
                dd = xr.open_dataset(download_url, chunks='auto').sel(time=slice(self.kw['min_time'],self.kw['max_time']))

                if ('min_lat' in self.kw) and ('max_lat' in self.kw):
                    dd = dd.sel(latitude=slice(self.kw['min_lat'],self.kw['max_lat']))

                if ('min_lon' in self.kw) and ('max_lon' in self.kw):
                    dd = dd.sel(longitude=slice(self.kw['min_lon'],self.kw['max_lon']))

                # use variable names to drop other variables (should I do this?)
                if self.variables is not None:
                    var_list = set(dd.data_vars) - set(self.variables)
                    dd = dd.drop_vars(var_list)
                
            except Exception as e:
                logger_erd.exception(e)
                logger_erd.warning('no data to be read in for %s' % dataset_id)
                dd = None
                
        return (dataset_id, dd)


    @property
    def data(self):
        
        if not hasattr(self, '_data'):
            
            if self.parallel:
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.data_by_dataset)(dataset_id) for dataset_id in self.dataset_ids
                )
            else:
                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.data_by_dataset(dataset_id))

#             if downloads is not None:
            dds = {dataset_id: dd for (dataset_id, dd) in downloads}
#             else:
#                 dds = None

            self._data = dds

        return self._data
    
    
    def count(self, url):
        try:
            return len(pd.read_csv(url))
        except Exception:
            return np.nan

    
    def all_variables(self):
        '''Return a list of all possible variables.'''
        
        file_name_counts = f'erddap_variable_list_{self.known_server}.csv'
        
        if os.path.exists(file_name_counts):
            return pd.read_csv(file_name_counts, index_col='variable')
        else:
            # This took 10 min running in parallel for ioos
            # 2 min for coastwatch
            url = f'{self.e.server}/categorize/variableName/index.csv?page=1&itemsPerPage=100000'
            df = pd.read_csv(url)
#             counts = []
#             for url in df.URL:
#                 counts.append(self.count(url))
            num_cores = multiprocessing.cpu_count()
            counts = Parallel(n_jobs=num_cores)(
                delayed(self.count)(url) for url in df.URL
            )
            dfnew = pd.DataFrame()
            dfnew['variable'] = df['Category']
            dfnew['count'] = counts
            dfnew = dfnew.set_index('variable')
            # remove nans
            if (dfnew.isnull().sum() > 0).values:
                dfnew = dfnew[~dfnew.isnull().values].astype(int)
            dfnew.to_csv(file_name_counts)
        
        return dfnew


    def search_variables(self, variables):
        '''Find valid variables names to use.
        
        Call with `search_variables()` to return the list of possible names.
        Call with `search_variables('salinity')` to return relevant names.
        '''
        
        if not isinstance(variables, list):
            variables = [variables]
        
        # set up search for input variables
        search = f"(?i)"
        for variable in variables:
            search += f".*{variable}|"
        search = search.strip('|')

        r = re.compile(search)
        
        # just get the variable names
        df = self.all_variables()
        parameters = df.index

        matches = list(filter(r.match, parameters))

        # return parameters that match input variable strings
        return df.loc[matches].sort_values('count', ascending=False)
    
    
    def check_variables(self, variables, verbose=False):
        
        if not isinstance(variables, list):
            variables = [variables]
            
#         parameters = list(self.all_variables().keys())
        parameters = list(self.all_variables().index)
        
        # for a variable to exactly match a parameter 
        # this should equal 1
        count = []
        for variable in variables:
            count += [parameters.count(variable)]
        
        condition = np.allclose(count,1)
        
        assertion = f'The input variables are not exact matches to ok variables for known_server {self.known_server}. \
                     \nCheck all parameter group values with `ErddapReader().all_variables()` \
                     \nor search parameter group values with `ErddapReader().search_variables({variables})`.\
                     \n\n Try some of the following variables:\n{str(self.search_variables(variables))}'# \
#                      \nor run `ErddapReader().check_variables("{variables}")'
        assert condition, assertion
        
        if condition and verbose:
            print('all variables are matches!')
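
# A hedged sketch of using the variable helpers on this earlier reader
# (the search term is only an example):

reader = ErddapReader(known_server='ioos')
print(reader.all_variables().head())          # full table of variable names and counts
print(reader.search_variables(['salinity']))  # names containing 'salinity', most common first
reader.check_variables('salinity')            # passes silently if 'salinity' is an exact match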
Example #7
class NDBC():
    def __init__(self, station_id, deploy_id, WMO, currentTime, startTime,
                 data_map, name_map):
        self.station_id = station_id
        self.deploy_id = deploy_id
        self.WMO = WMO
        self.now = currentTime
        self.startTime = startTime
        self.data_map = data_map
        self.name_map = name_map

    def adjust_pressure_to_sea_level(self, pres, temp, height):
        """Adjust barometric pressure to sea level."""
        temp = temp + 273.15
        slp = pres / np.exp(-height / (temp * 29.263))
        return slp

    def calculate_wind_speed(self, eastward, northward):
        """Calculate absolute wind speed from component wind vector."""
        u = np.square(eastward)
        v = np.square(northward)
        wind_speed = np.sqrt(u + v)
        return wind_speed

    def calculate_wind_direction(self, eastward, northward):
        """Calculate met wind direction from component wind vectors."""
        u = eastward
        v = northward
        wind_direction = 180 / np.pi * np.arctan2(-u, -v)
        return wind_direction

    def _connect_erddap(self,
                        server="http://ooivm1.whoi.net/erddap",
                        protocol="tabledap"):
        """Connect to the erddap server."""
        self._erddap = ERDDAP(server=server, protocol=protocol)

    def list_datasets(self):
        """Get the available datasets for the ERDDAP server."""
        # First, make the connection
        self._connect_erddap()
        # Next, get the datasets
        datasets = pd.read_csv(
            self._erddap.get_search_url(search_for=self.station_id,
                                        response='csv'))['Dataset ID']
        return datasets

    def get_dataset(self, dataset):
        """Get the data for specified datasets."""
        # First, have to re-establish the erddap connection
        self._connect_erddap()

        # Next, get the data for a dataset
        self._erddap.dataset_id = dataset

        # Only want the variables with standard names
        variables = self._erddap.get_var_by_attr(
            standard_name=lambda v: v is not None)
        self._erddap.variables = variables

        # Limit the data request to the current deployment
        self._erddap.constraints = {
            'deploy_id=': self.deploy_id,
            'time>=': self.startTime.strftime('%Y-%m-%dT%H:%M:%SZ')
        }

        try:
            # Download the data
            data = self._erddap.to_pandas(index_col='time (UTC)',
                                          parse_dates=True)

            # Sometimes it just returns an empty dataframe instead of an error
            if data.size == 0:
                data = self._create_empty_dataset()

        except Exception:
            # If there is no available data in the requested time window, need
            # to create an empty dataframe of the data
            data = self._create_empty_dataset()

        # Return the dataset data
        return data

    def process_METBK_data(self, df, freq='10T'):
        """Process the METBK into the correct format and values for NDBC."""
        # Resample the data
        df_binned = df.resample(freq).mean()

        # Check that barometric pressure is in the dataframe
        if 'barometric_pressure (mbar)' in df_binned.columns:
            # Adjust the barometric pressure to sea-level
            df_binned[
                'sea_level_pressure (hPa)'] = self.adjust_pressure_to_sea_level(
                    df_binned['barometric_pressure (mbar)'],
                    df_binned['air_temperature (degree_Celsius)'], 4.05)
        else:
            df_binned['sea_level_pressure (hPa)'] = np.nan

        # Check that the wind vector components are in the dataframe
        if 'eastward_wind_velocity (m s-1)' in df_binned.columns:
            # Calculate the wind speed
            df_binned['wind speed (m/s)'] = self.calculate_wind_speed(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])

            # Calculate the wind direction
            df_binned['wind direction'] = self.calculate_wind_direction(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])
            df_binned['wind direction'] = df_binned["wind direction"].apply(
                lambda x: x + 360 if x < 0 else x)

            # Don't need cardinal direction -> want direction in degrees
            # df_binned["wind direction"] = df_binned["wind direction"].apply(
            #   lambda x: self.get_cardinal_direction(np.round(x, decimals=2)))
        else:
            df_binned['wind speed (m/s)'] = np.nan
            df_binned['wind direction'] = np.nan

        # Return the processed data
        return df_binned

    def process_WAVSS_data(self, df, freq='10T'):
        """Much simpler function for processing the WAVSS data."""
        # Resample the data
        df_binned = df.resample(freq).mean()

        # Return the data
        return df_binned

    def _create_empty_dataset(self):
        """
        Create a dataset of all nans if there is no data available for
        the requested dataset in the given time period.
        """
        # Get the units for the corresponding variables
        info_url = self._erddap.get_info_url(
            dataset_id=self._erddap.dataset_id, response='csv')
        info = pd.read_csv(info_url)
        units = info[info['Attribute Name'] == 'units']

        # Now, add the units to the variable names
        columns = []
        for var in self._erddap.variables:
            unit = units[units['Variable Name'] == var]['Value'].values
            if len(unit) == 0:
                columns.append(f'{var}')
            elif var == 'time':
                pass
            else:
                columns.append(f'{var} ({unit[0]})')

        # Create an array of nans to fill out the empty dataframe
        empty_array = np.empty((2, len(columns)))
        empty_array[:] = np.nan

        # Put the empty array into a dataframe
        empty_df = pd.DataFrame(data=empty_array,
                                columns=columns,
                                index=[self.startTime, self.now])
        empty_df.index.name = 'time (UTC)'

        return empty_df

    def process_datasets(self, datasets):
        """Process the data for individual datasets."""
        self.datasets = datasets

        # Get the data for the individual datasets
        for dset in self.datasets.keys():
            self.datasets.update({dset: self.get_dataset(dset)})

        # Process the data
        for dset in self.datasets.keys():
            if 'METBK' in dset:
                self.datasets[dset] = self.process_METBK_data(
                    self.datasets[dset])
            else:
                self.datasets[dset] = self.process_WAVSS_data(
                    self.datasets[dset])

        # Add a header to the data in the datasets
        for key in self.datasets.keys():
            header = key.split('-', 2)[-1]
            for col in self.datasets.get(key).columns:
                self.datasets.get(key).rename(
                    columns={col: ' '.join((header, col))}, inplace=True)

    def parse_data_to_xml(self, data):
        """
        Take the 10-minute averaged buoy data and, using the instance's
        name_map and data_map dictionaries (which map the buoy column
        names to xml tags), build an xml document in the NDBC format.

        Returns:
            xml - a list of lines forming a properly constructed xml
            document in the NDBC format for the given buoy data
        """

        # Start the xml file
        xml = ['<?xml version="1.0" encoding="ISO-8859-1"?>']

        # Iterate through the data
        for index in data.index:

            # Get the data associated with a row in the dataframe
            row = data.loc[index]

            # Reset a dictionary of the data
            xml_data = {}
            for key in self.data_map.keys():
                xml_data.update({key: self.data_map.get(key)})

            # Parse the data into the data dictionary
            for key in xml_data.keys():
                # Get the column name which corresponds to the ndbc tag
                column = self.name_map.get(key)
                # Check that the column was returned from the ERDDAP server
                if column in row.index:
                    value = row[column]
                    # If a nan, just leave it the default -9999
                    if str(value) == 'nan':
                        pass
                    else:
                        xml_data[key] = value
                # If no data, leave it as default -9999
                else:
                    pass

            # Write the parsed data to the xml file
            # Start the message
            xml.append('<message>')

            # Add in the station id
            xml.append(f'  <station>{self.WMO}</station>')

            # Get the time index
            time = row.name.strftime('%m/%d/%Y %H:%M:%S')
            xml.append(f'  <date>{time}</date>')

            # Missing fill value
            missing = str(-9999)
            xml.append(f'  <missing>{missing}</missing>')

            # Roundtime
            xml.append('  <roundtime>no</roundtime>')

            # Start of the data
            xml.append('  <met>')

            # Add in each data piece
            for tag in xml_data.keys():
                # Get the value
                value = xml_data.get(tag)
                value = str(value)
                # Add the data to the xml file
                xml.append(f'    <{tag}>{value}</{tag}>')

            # Finish off the message
            xml.append('  </met>')
            xml.append('</message>')

        # Return the results
        return xml
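
# A hedged sketch of the intended workflow; the station/deployment identifiers
# and the tag maps below are placeholders, not real buoy metadata:

from datetime import datetime, timedelta

now = datetime.utcnow()
buoy = NDBC(
    station_id='CP01CNSM',                  # placeholder station id
    deploy_id='D0014',                      # placeholder deployment id
    WMO='44076',                            # placeholder WMO id
    currentTime=now,
    startTime=now - timedelta(hours=12),
    data_map={'baro1': -9999},              # NDBC xml tag -> default value
    name_map={'baro1': 'METBK sea_level_pressure (hPa)'},  # tag -> dataframe column
)

# fetch, bin, and rename the data for each dataset found for the station,
# then serialize each processed dataframe to NDBC-style xml
buoy.process_datasets({dset: None for dset in buoy.list_datasets()})
for dset, data in buoy.datasets.items():
    xml = buoy.parse_data_to_xml(data)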
Example #9
def get_standard_variables_and_metadata(server_link, standard_variable_list):

    # Get access to the server and find datasets associated with standard_name variable listed
    e = ERDDAP(server=server_link, protocol='tabledap', response='csv')

    # Define Filter for which datasets to look into
    kw = {
        'standard_name': ','.join(standard_variable_list),
        'min_lon': -180.0,
        'max_lon': 180.0,
        'min_lat': -90.0,
        'max_lat': 90.0,
        'min_time': '',
        'max_time': '',
        'cdm_data_type': ''
    }

    variable_to_groupby = [('latitude', 'degrees_north'),
                           ('longitude', 'degrees_east')]

    # Get available datasets from that server
    search_url = e.get_search_url(response='csv', **kw)
    datasets = pd.read_csv(search_url)

    # Print results
    print(e.server)
    print(
        str(len(datasets)) + " datasets contain " +
        ', '.join(standard_variable_list))

    # Loop through different data sets and create a metadata dataFrame
    df = pd.DataFrame(columns=['Dataset ID'])

    for index, row in datasets.iterrows():
        # Get Info from dataset (mostly min/max lat/long)
        print(row['Dataset ID'])
        info_url = e.get_info_url(dataset_id=row['Dataset ID'], response='csv')
        info = pd.read_csv(info_url)
        attribute_table = info.set_index(
            ['Row Type', 'Variable Name',
             'Attribute Name']).transpose()['attribute']

        # Try to get the distinct lat/long and time and depth range for that dataset, if it fails rely on the
        # ERDDAP metadata
        try:
            # If dataset is spread out geographically find distinct locations (may not work well for trajectory data)
            latlong_url = e.get_download_url(
                dataset_id=row['Dataset ID'],
                protocol='tabledap',
                variables=['latitude', 'longitude', 'time'])

            # Get add to the url commands to get distinct values and ordered with min and max time for each lat/long
            distinctMinMaxTime_url = latlong_url + '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Ctime%22)'

            # Get lat/long and min/max depth for this dataset
            data = pd.read_csv(distinctMinMaxTime_url, header=[0, 1])

            # Group data by latitude/longitude and get min max values
            data_reduced = data.groupby(by=variable_to_groupby).agg(
                ['min', 'max']).reset_index()

            if info[(info['Variable Name'] == 'depth')].size > 0:
                latlongdepth_url = e.get_download_url(
                    dataset_id=row['Dataset ID'],
                    protocol='tabledap',
                    variables=['latitude', 'longitude', 'depth'])

                # Get add to the url commands to get distinct values and ordered with min and max depth for
                # each lat/long
                distinctMinMaxDepth_url = latlongdepth_url + \
                                          '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Cdepth%22)'

                # Get lat/long and min/max depth for this dataset
                data_depth = pd.read_csv(distinctMinMaxDepth_url,
                                         header=[0, 1])

                # Group depth data by lat/long and get min max values
                data_depth_reduced = data_depth.groupby(
                    by=variable_to_groupby).agg(['min', 'max']).reset_index()

                # Merge depth values with time
                data_reduced = data_reduced.merge(data_depth_reduced,
                                                  on=variable_to_groupby,
                                                  how='left')

            # Merge multi index column names
            data_reduced.columns = data_reduced.columns.map(
                ' '.join).str.strip(' ')

        except Exception as exception_error:

            print('Failed to read: ' + str(exception_error))
            # If there's only one location, it could get the range from metadata

            # Find lat/long range of this dataset; if it's a single point we don't need to look into it
            min_latitude = float(attribute_table['NC_GLOBAL',
                                                 'geospatial_lat_min'].Value)
            max_latitude = float(attribute_table['NC_GLOBAL',
                                                 'geospatial_lat_max'].Value)
            min_longitude = float(attribute_table['NC_GLOBAL',
                                                  'geospatial_lon_min'].Value)
            max_longitude = float(attribute_table['NC_GLOBAL',
                                                  'geospatial_lon_max'].Value)

            # If min/max lat/long are the same don't go in the dataset
            if (min_latitude == max_latitude) & (min_longitude
                                                 == max_longitude):
                data_reduced = pd.DataFrame(columns=['Dataset ID'])
                data_reduced = {}
                data_reduced['latitude degrees_north'] = min_latitude
                data_reduced['longitude degrees_east'] = min_longitude

                if 'depth' in attribute_table.columns and 'actual_range' in attribute_table[
                        'depth'] and ('m'
                                      == attribute_table['depth',
                                                         'units']['Value']):

                    depth_range = np.array(
                        str.split(
                            attribute_table['depth', 'actual_range']['Value'],
                            ',')).astype(float)
                    data_reduced['depth m min'] = depth_range[0]
                    data_reduced['depth m max'] = depth_range[1]

                # Convert to DataFrame
                data_reduced = pd.DataFrame(data_reduced, index=[0])
                print('Retrieved metadata')
            else:
                # Skip datasets with multiple locations whose data could not be retrieved
                continue

        # Add the variable name matching each standard_name to the table
        for var in standard_variable_list:
            data_reduced[var] = ','.join(
                e.get_var_by_attr(dataset_id=row['Dataset ID'],
                                  standard_name=var))

        # Add cdm_data_type to table
        data_reduced['cdm_data_type'] = ','.join(
            info[info['Attribute Name'] == 'cdm_data_type']['Value'].values)

        # Add Dataset id to the table
        data_reduced['Dataset ID'] = row['Dataset ID']

        # Merge that dataset ID with previously downloaded data
        df = pd.concat([df, data_reduced])

    # Add server to dataFrame
    df['server'] = e.server

    # Save resulting dataframe to a CSV, file name is based on the server address
    file_name = re.sub('https*://', '', e.server)
    file_name = re.sub(r'[./]', '_', file_name)
    file_name = 'Server_List_' + file_name + '.csv'

    print('Save result to ' + file_name)
    df.to_csv(file_name)

    return df
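
# A hedged usage sketch; the server URL and standard_name list below are
# examples, not values required by the function:

server_link = 'https://www.smartatlantic.ca/erddap'
standard_variable_list = ['sea_water_temperature',
                          'sea_surface_wave_significant_height']
df_meta = get_standard_variables_and_metadata(server_link, standard_variable_list)
print(df_meta.head())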