import random

import numpy as np
import pandas as pd
from erddapy import ERDDAP


def get_coordinates(df, **kw):
    '''
    df = pd.DataFrame(columns=['server','Dataset ID',...])

    kw = {
     'min_lon': -123.628173,
     'max_lon': -122.02382599999999,
     'min_lat': 47.25972200000001,
     'max_lat': 48.32253399999999,
     'min_time': '2018-01-27T00:00:00Z',
     'max_time': '2019-12-31T00:00:00Z'}

     dataset_url = '%s/tabledap/%s.csvp?latitude,longitude,time&longitude>=-72.0&longitude<=-69&latitude>=38&latitude<=41&time>=1278720000.0&time<=1470787200.0&distinct()' % (all_datasets['server'].iloc[int(i)],all_datasets['Dataset ID'].iloc[int(i)])
    '''
    df_coords = pd.DataFrame()

    # pick 10 random datasets from our search results:
    #count = 10
    #if df.shape[0] > count:
    #    print("Found %i datasets. Reducing return to %i." % (df.shape[0],count))
    #    subset_datasets = df.iloc[random.sample(range(0,df.shape[0]),count+1)]
    #else:
    #    subset_datasets = df
    #    final_dataset_limit = df.shape[0]

    # Alternative to the above: iterate over the original DataFrame passed in (df), stopping either
    #   at final_dataset_limit (currently 10) or at the last row of df (end of the for loop).
    #   The previously enclosing while loop is unnecessary as a result.
    final_dataset_limit = 10
    datasets_found = 0
    if df.shape[0] < final_dataset_limit:
        final_dataset_limit = df.shape[0]

    index_random = random.sample(range(0, df.shape[0]), df.shape[0])
    print("index_random: {}".format(index_random))

    #for i in range(subset_datasets.shape[0]):
    for i in index_random:
        server_url = df['server'].iloc[int(i)]
        dataset_id = df['Dataset ID'].iloc[int(i)]
        institution = df['Institution'].iloc[int(i)]

        # skip some difficult datasets for now:
        if "ROMS" in dataset_id or "DOP" in dataset_id:  # skip ROMS model output
            #print("Skipping %s" % server_url + dataset_id)
            continue

        e = ERDDAP(server=server_url, protocol='tabledap', response='csv')
        try:
            print("datasets_found: {}".format(datasets_found))
            # former config for query, replaced with new code below:
            #e.variables=["latitude","longitude"]#,"time"]
            #e.dataset_id = all_datasets['Dataset ID'].iloc[int(i)]
            #e.constraints = {
            #       "time>=": kw['min_time'],
            #       "time<=": kw['max_time'],
            #       "longitude>=": kw['min_lon'],
            #       "longitude<=": kw['max_lon'],
            #       "latitude>=": kw['min_lat'],
            #       "latitude<=": kw['max_lat'],
            #       "distinct" : ()
            #}

            # Generate a download URL via e.get_download_url and pass it to a pandas DataFrame via read_csv.
            #   We set e.constraints here rather than passing constraints to e.get_download_url so the
            #   '>=' / '<=' suffixes appended to the constraint keys match ERDDAP's API
            #   (the parameter signature differs from the search API used above).
            # Also add a 'distinct': () parameter, generate a download URL, and submit a csv dataset download request to ERDDAP.
            #kw["distinct"] = "()"
            e.constraints = {
                "time>=": kw['min_time'],
                "time<=": kw['max_time'],
                "longitude>=": kw['min_lon'],
                "longitude<=": kw['max_lon'],
                "latitude>=": kw['min_lat'],
                "latitude<=": kw['max_lat'],
                "distinct": ()
            }
            url = e.get_download_url(
                #constraints=kw,
                response="csvp",
                dataset_id=df['Dataset ID'].iloc[int(i)],
                variables=["latitude", "longitude"])
            print("Download URL: {}".format(url))

            #coords = pd.read_csv(url, headers=headers)
            coords = pd.read_csv(url)
            coords['dataset_count'] = i
            coords['dataset_download_url'] = url
            coords['Dataset ID'] = dataset_id
            coords['Institution'] = institution

            #get_var_by_attr example (ToDo):
            #e.get_var_by_attr(dataset_id, standard_name='northward_sea_water_velocity')

            print(coords.head())
            df_coords = pd.concat([df_coords, coords])

            # Reaching this point means the dataset query was successful, so increment the counter.
            #   Break out of the for loop once final_dataset_limit is reached so we don't go over:
            datasets_found += 1
            print("new dataset acquired; datasets_found: {}".format(
                datasets_found))
            # not needed:
            #df.drop([i])
            if datasets_found == final_dataset_limit: break

        except Exception as ex:
            # can happen if the dataset does not have any features within the query window, just log it here:
            if type(ex).__name__ in ["HTTPError"]:
                print(ex)
            #raise
            pass

    return df_coords
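
# A minimal usage sketch (not part of the original snippet); the server URL and search
# bounds below are illustrative assumptions.
kw = {
    'min_lon': -123.63, 'max_lon': -122.02,
    'min_lat': 47.26, 'max_lat': 48.32,
    'min_time': '2018-01-27T00:00:00Z', 'max_time': '2019-12-31T00:00:00Z',
}
e = ERDDAP(server='https://erddap.sensors.ioos.us/erddap')
# run an ERDDAP advanced search and record which server each dataset came from
all_datasets = pd.read_csv(e.get_search_url(response='csv', **kw))
all_datasets['server'] = e.server
df_coords = get_coordinates(all_datasets, **kw)
print(df_coords.head())
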
variables = [
    'platform_number',
    'time',
    'pres',
    'longitude',
    'latitude',
    'temp',
    'psal',
]

e = ERDDAP(server=url_Argo, protocol='tabledap', response='nc')

e.dataset_id = dataset_type
e.constraints = constraints
e.variables = variables

print(e.get_download_url())

df = e.to_pandas(
    parse_dates=True,
    skiprows=(1, )  # units information can be dropped.
).dropna()

argo_ids = np.asarray(df['platform_number'])
argo_times = np.asarray(df['time (UTC)'])
#argo_press = np.asarray(df['pres (decibar)'])
argo_lons = np.asarray(df['longitude (degrees_east)'])
argo_lats = np.asarray(df['latitude (degrees_north)'])
#argo_temps = np.asarray(df['temp (degree_Celsius)'])
#argo_salts = np.asarray(df['psal (PSU)'])

Number_argo_profiles = np.max([np.unique(argo_lons).shape,\
Example 3
import logging
import multiprocessing
import os
import re

import numpy as np
import pandas as pd
import xarray as xr
from erddapy import ERDDAP
from joblib import Parallel, delayed

# module-level logger assumed by the class below (its original definition is not shown)
logger_erd = logging.getLogger(__name__)


class ErddapReader:

    def __init__(self, known_server='ioos', protocol=None, server=None, parallel=True):
        
#         # run checks for KW 
#         self.kw = kw

        self.parallel = parallel
    
        
        # either select a known server or input protocol and server string
        if known_server == 'ioos':
            protocol = 'tabledap'
            server = 'http://erddap.sensors.ioos.us/erddap'
        elif known_server == 'coastwatch':
            protocol = 'griddap'
            server = 'http://coastwatch.pfeg.noaa.gov/erddap'
        elif known_server is not None:
            statement = 'either select a known server or input protocol and server string'
            assert (protocol is not None) & (server is not None), statement
        else:
            # note: str.strip() removes characters, not substrings, so build the name explicitly
            known_server = server.rstrip('/').split('://')[-1].replace('/erddap', '').replace('.', '_')
            statement = 'either select a known server or input protocol and server string'
            assert (protocol is not None) & (server is not None), statement
        
        self.known_server = known_server
        self.e = ERDDAP(server=server)
        self.e.protocol = protocol
        self.e.server = server
                  
        # columns for metadata
        self.columns = ['geospatial_lat_min', 'geospatial_lat_max', 
               'geospatial_lon_min', 'geospatial_lon_max', 
               'time_coverage_start', 'time_coverage_end',
               'defaultDataQuery', 'subsetVariables',  # first works for timeseries sensors, 2nd for gliders
               'keywords',  # for hf radar
               'id', 'infoUrl', 'institution', 'featureType', 'source', 'sourceUrl']
        
        # name
        self.name = f'erddap_{known_server}'
        
        self.reader = 'ErddapReader'
        
# #         self.data_type = data_type
#         self.standard_names = standard_names
#         # DOESN'T CURRENTLY LIMIT WHICH VARIABLES WILL BE FOUND ON EACH SERVER

    
    
    @property
    def dataset_ids(self):
        '''Find dataset_ids for server.'''
        
        if not hasattr(self, '_dataset_ids'):
            
            # This should be a region search
            if self.approach == 'region':
        
                # find all the dataset ids which we will use to get the data
                # This limits the search to our keyword arguments in kw which should 
                # have min/max lon/lat/time values
                dataset_ids = []
                if self.variables is not None:
                    for variable in self.variables:

                        # find and save all dataset_ids associated with variable
                        search_url = self.e.get_search_url(response="csv", **self.kw, 
                                                           variableName=variable, 
                                                           items_per_page=10000)

                        try:
                            search = pd.read_csv(search_url)
                            dataset_ids.extend(search["Dataset ID"])
                        except Exception as e:
                            logger_erd.exception(e)
                            logger_erd.warning(f"variable {variable} was not found in the search")
                            logger_erd.warning(f'search_url: {search_url}')

                else:
                    
                    # find and save all dataset_ids associated with variable
                    search_url = self.e.get_search_url(response="csv", **self.kw, 
                                                       items_per_page=10000)

                    try:
                        search = pd.read_csv(search_url)
                        dataset_ids.extend(search["Dataset ID"])
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning(f"nothing found in the search")
                        logger_erd.warning(f'search_url: {search_url}')

                    
                # only need a dataset id once since we will check them each for all standard_names
                self._dataset_ids = list(set(dataset_ids))
            
            # This should be a search for the station names
            elif self.approach == 'stations':
#             elif self._stations is not None:
                
                # search by station name for each of stations
                dataset_ids = []
                for station in self._stations:
                    # if a station name has more than one word, ERDDAP puts an AND between the terms
                    # so that all of them are searched for together
                    url = self.e.get_search_url(response="csv", items_per_page=5, search_for=station)

                    try:
                        df = pd.read_csv(url)
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning(f'search url {url} did not work for station {station}.')
                        continue
    
                    # first try for exact station match
                    try:
                        dataset_id = [dataset_id for dataset_id in df['Dataset ID'] if station.lower() in dataset_id.lower().split('_')][0]

                    # if that doesn't work, try a more general match and just take the first returned option
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning('When searching for a dataset id to match station name %s, the first attempt to match the id did not work.' % (station))
                        dataset_id = df.iloc[0]['Dataset ID']
        
#                         if 'tabs' in org_id:  # don't split
#                             axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() == axiom_id.lower()]
#                         else:
#                             axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() in axiom_id.lower().split('_')][0]
                
#                     except:
#                         dataset_id = None
                
                    dataset_ids.append(dataset_id)
                    
                self._dataset_ids = list(set(dataset_ids))
                
            else:
                logger_erd.warning('Neither stations nor region approach were used in function dataset_ids.')
                
            
        return self._dataset_ids
        
    
    def meta_by_dataset(self, dataset_id):

        info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id)
        info = pd.read_csv(info_url)

        items = []

        for col in self.columns:

            try:
                item = info[info['Attribute Name'] == col]['Value'].values[0]
                dtype = info[info['Attribute Name'] == col]['Data Type'].values[0]
            except Exception:
                if col == 'featureType':
                    # this column is not present in HF Radar metadata but want it to
                    # map to data_type, so input 'grid' in that case.
                    item = 'grid'
                else:
                    item = 'NA'
                # ensure dtype is defined so the numeric conversion below is skipped
                dtype = 'String'

            if dtype == 'String':
                pass
            elif dtype == 'double':
                item = float(item)
            elif dtype == 'int':
                item = int(item)
            items.append(item)
            
#         if self.standard_names is not None:
#             # In case the variable is named differently from the standard names, 
#             # we back out the variable names here for each dataset. This also only 
#             # returns those names for which there is data in the dataset.
#             varnames = self.e.get_var_by_attr(
#                 dataset_id=dataset_id,
#                 standard_name=lambda v: v in self.standard_names
#             )
#         else:
#             varnames = None

        ## include download link ##
        self.e.dataset_id = dataset_id
        if self.e.protocol == 'tabledap':
            if self.variables is not None:
                self.e.variables = ["time","longitude", "latitude", "station"] + self.variables
            # set the same time constraints as before
            self.e.constraints = {'time<=': self.kw['max_time'], 'time>=': self.kw['min_time'],}
            download_url = self.e.get_download_url(response='csvp')

        elif self.e.protocol == 'griddap':
            # the search terms that can be input for tabledap do not work for griddap
            # in erddapy currently. Instead, put together an opendap link and then 
            # narrow the dataset with xarray.
            # get opendap link
            download_url = self.e.get_download_url(response='opendap')
        
        # add erddap server name
        return {dataset_id: [self.e.server, download_url] + items + [self.variables]}
    
      
    @property
    def meta(self):
        
        if not hasattr(self, '_meta'):
            
            if self.parallel:
            
                # get metadata for datasets
                # run in parallel to save time
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.meta_by_dataset)(dataset_id) for dataset_id in self.dataset_ids
                )
                
            else:

                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.meta_by_dataset(dataset_id))

            # make dict from individual dicts
            from collections import ChainMap
            meta = dict(ChainMap(*downloads)) 

            # Make dataframe of metadata
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(meta, orient='index', 
                                                columns=['database','download_url'] \
                                                + self.columns + ['variable names'])
           
        return self._meta       
    
    
    def data_by_dataset(self, dataset_id):

        download_url = self.meta.loc[dataset_id, 'download_url']
        # data variables in ds that are not the variables we searched for
#         varnames = self.meta.loc[dataset_id, 'variable names']

        if self.e.protocol == 'tabledap':

            try:

                # accessing self.meta above fetches the metadata if it is not already present;
                # the download_url found in that metadata is used here
                dd = pd.read_csv(download_url, index_col=0, parse_dates=True)
                
                # Drop cols and rows that are only NaNs.
                dd = dd.dropna(axis='index', how='all').dropna(axis='columns', how='all')

                if self.variables is not None:
                    # check to see if there is any actual data
                    # this is a bit convoluted because the column names are the variable names 
                    # plus units so can't match 1 to 1.
                    datacols = 0  # number of columns that represent data instead of metadata
                    for col in dd.columns:
                        datacols += [varname in col for varname in self.variables].count(True)
                    # if no datacols, we can skip this one.
                    if datacols == 0:
                        dd = None
                    
            except Exception as e:
                logger_erd.exception(e)
                logger_erd.warning('no data to be read in for %s' % dataset_id)
                dd = None
        
        elif self.e.protocol == 'griddap':

            try:
                dd = xr.open_dataset(download_url, chunks='auto').sel(time=slice(self.kw['min_time'],self.kw['max_time']))

                if ('min_lat' in self.kw) and ('max_lat' in self.kw):
                    dd = dd.sel(latitude=slice(self.kw['min_lat'],self.kw['max_lat']))

                if ('min_lon' in self.kw) and ('max_lon' in self.kw):
                    dd = dd.sel(longitude=slice(self.kw['min_lon'],self.kw['max_lon']))

                # use variable names to drop other variables (should I do this?)
                if self.variables is not None:
                    unused_vars = set(dd.data_vars) - set(self.variables)
                    dd = dd.drop_vars(unused_vars)
                
            except Exception as e:
                logger_erd.exception(e)
                logger_erd.warning('no data to be read in for %s' % dataset_id)
                dd = None
                
        return (dataset_id, dd)


    @property
    def data(self):
        
        if not hasattr(self, '_data'):
            
            if self.parallel:
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.data_by_dataset)(dataset_id) for dataset_id in self.dataset_ids
                )
            else:
                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.data_by_dataset(dataset_id))

#             if downloads is not None:
            dds = {dataset_id: dd for (dataset_id, dd) in downloads}
#             else:
#                 dds = None

            self._data = dds

        return self._data
    
    
    def count(self, url):
        '''Return the number of rows in the csv at url, or NaN if the request fails.'''
        try:
            return len(pd.read_csv(url))
        except Exception:
            return np.nan

    
    def all_variables(self):
        '''Return a list of all possible variables.'''
        
        file_name_counts = f'erddap_variable_list_{self.known_server}.csv'
        
        if os.path.exists(file_name_counts):
            return pd.read_csv(file_name_counts, index_col='variable')
        else:
            # This took 10 min running in parallel for ioos
            # 2 min for coastwatch
            url = f'{self.e.server}/categorize/variableName/index.csv?page=1&itemsPerPage=100000'
            df = pd.read_csv(url)
#             counts = []
#             for url in df.URL:
#                 counts.append(self.count(url))
            num_cores = multiprocessing.cpu_count()
            counts = Parallel(n_jobs=num_cores)(
                delayed(self.count)(url) for url in df.URL
            )
            dfnew = pd.DataFrame()
            dfnew['variable'] = df['Category']
            dfnew['count'] = counts
            dfnew = dfnew.set_index('variable')
            # remove nans
            if (dfnew.isnull().sum() > 0).values:
                dfnew = dfnew[~dfnew.isnull().values].astype(int)
            dfnew.to_csv(file_name_counts)
        
        return dfnew


    def search_variables(self, variables):
        '''Find valid variables names to use.
        
        Call with `search_variables()` to return the list of possible names.
        Call with `search_variables('salinity')` to return relevant names.
        '''
        
        if not isinstance(variables, list):
            variables = [variables]
        
        # build a case-insensitive regex that matches any of the input variable names
        search = "(?i)" + "|".join([f".*{variable}" for variable in variables])

        r = re.compile(search)
        
        # just get the variable names
        df = self.all_variables()
        parameters = df.index

        matches = list(filter(r.match, parameters))

        # return parameters that match input variable strings
        return df.loc[matches].sort_values('count', ascending=False)
    
    
    def check_variables(self, variables, verbose=False):
        
        if not isinstance(variables, list):
            variables = [variables]
            
#         parameters = list(self.all_variables().keys())
        parameters = list(self.all_variables().index)
        
        # for a variable to exactly match a parameter 
        # this should equal 1
        count = []
        for variable in variables:
            count += [parameters.count(variable)]
        
        condition = np.allclose(count,1)
        
        assertion = f'The input variables are not exact matches to ok variables for known_server {self.known_server}. \
                     \nCheck all parameter group values with `ErddapReader().all_variables()` \
                     \nor search parameter group values with `ErddapReader().search_variables({variables})`.\
                     \n\n Try some of the following variables:\n{str(self.search_variables(variables))}'# \
#                      \nor run `ErddapReader().check_variables("{variables}")'
        assert condition, assertion
        
        if condition and verbose:
            print('all variables are matches!')
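
# A minimal usage sketch (not part of the original example). The region, time window, and
# variable name are illustrative assumptions, and `approach`, `kw`, and `variables` are set
# directly here because the code that normally assigns them is not shown in this snippet.
reader = ErddapReader(known_server='ioos', parallel=False)
reader.approach = 'region'
reader.kw = {'min_lon': -124.0, 'max_lon': -122.0,
             'min_lat': 47.0, 'max_lat': 49.0,
             'min_time': '2019-01-01T00:00:00Z', 'max_time': '2019-01-07T00:00:00Z'}
reader.variables = ['sea_water_temperature']
print(reader.dataset_ids)
print(reader.meta.head())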
Example 4
import logging
import os
import re
from math import ceil
from urllib.error import HTTPError  # assumed source of HTTPError for failed HTTP reads
from urllib.parse import quote, urlsplit, urlunsplit

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import urllib3
from erddapy import ERDDAP


class GdacClient(object):
    def __init__(self, erddap_url=None):

        self._logger = logging.getLogger(os.path.basename(__file__))

        self._erddap_url = erddap_url or 'https://gliders.ioos.us/erddap'
        self._protocol = 'tabledap'
        self._response_type = 'csv'
        self._items_per_page = 1e10
        self._page = 1
        self._client = ERDDAP(server=self._erddap_url,
                              protocol=self._protocol,
                              response=self._response_type)
        self._last_request = None

        # DataFrame containing the results of ERDDAP advanced search (endpoints, etc.)
        self._datasets_info = pd.DataFrame()
        # DataFrame containing dataset_id, start/end dates, profile count, etc.
        self._datasets_summaries = pd.DataFrame()
        self._datasets_profiles = pd.DataFrame()
        self._datasets_days = pd.DataFrame()

        self._profiles_variables = [
            'time', 'latitude', 'longitude', 'profile_id', 'wmo_id'
        ]

        self._valid_search_kwargs = {
            'institution', 'ioos_category', 'long_name', 'standard_name',
            'variable_name', 'min_lon', 'min_lat', 'max_lon', 'max_lat',
            'min_time', 'max_time'
        }

        self._months = [
            'January', 'February', 'March', 'April', 'May', 'June', 'July',
            'August', 'September', 'October', 'November', 'December'
        ]

        self._calendar_types = ['datasets', 'days', 'profiles']

    @property
    def datasets_info(self):
        return self._datasets_info

    @property
    def datasets_summaries(self):
        return self._datasets_summaries

    @property
    def datasets_profiles(self):
        return self._datasets_profiles

    @property
    def datasets_days(self):
        return self._datasets_days

    @property
    def dataset_ids(self):
        if self._datasets_summaries.empty:
            self._logger.warning('No data sets found')
            return

        return list(self._datasets_info['dataset_id'].values)

    @property
    def gliders(self):
        if self._datasets_summaries.empty:
            self._logger.warning('No data sets found')
            return

        return list(self._datasets_summaries.glider.unique())

    @property
    def profiles_per_yyyymmdd(self):
        return self._datasets_profiles.sum(axis=1)

    @property
    def profiles_per_year(self):
        return self._datasets_profiles.sum(
            axis=1).groupby(lambda x: x.year).sum()

    @property
    def glider_days_per_yyyymmdd(self):
        return self._datasets_days.sum(axis=1)

    @property
    def glider_days_per_year(self):
        return self._datasets_days.sum(axis=1).groupby(lambda x: x.year).sum()

    @property
    def deployments_per_yyyymmdd(self):
        return self._datasets_days.sum(axis=1)

    @property
    def deployments_per_year(self):
        return self._datasets_days.groupby(lambda x: x.year).any().sum(axis=1)

    @property
    def yearly_counts(self):

        columns = [
            self.deployments_per_year, self.glider_days_per_year,
            self.profiles_per_year
        ]
        totals = pd.DataFrame(columns).transpose().astype('i')
        totals.columns = ['deployments', 'glider days', 'profiles']
        totals.index.name = 'year'

        return totals

    @property
    def e(self):
        """erddapy.ERDDAP client"""
        return self._client

    @property
    def server(self):
        return self._client.server

    @property
    def response_type(self):
        return self._client.response

    @response_type.setter
    def response_type(self, response_type):
        self._client.response = response_type

    @property
    def last_request(self):
        return self._last_request

    def get_glider_datasets(self, glider):

        return self._datasets_summaries[self._datasets_summaries.glider ==
                                        glider].reset_index().drop('index',
                                                                   axis=1)

    def get_deployments_calendar(self, year=None):
        if not year:
            return self._datasets_days.groupby(
                [lambda x: x.year,
                 lambda x: x.month]).any().sum(axis=1).unstack()
        else:
            glider_days_by_yymmdd = self._datasets_days
            years = pd.to_datetime(glider_days_by_yymmdd.index).year.unique()
            if year not in years:
                self._logger.warning(
                    'No glider days found in year {:}'.format(year))
                return pd.DataFrame()
            return glider_days_by_yymmdd[pd.to_datetime(
                glider_days_by_yymmdd.index).year == year].groupby(
                    [lambda x: x.month,
                     lambda x: x.day]).any().sum(axis=1).unstack()

    def get_glider_days_calendar(self, year=None):
        if not year:
            return self._datasets_days.sum(axis=1).groupby(
                [lambda x: x.year, lambda x: x.month]).sum().unstack()
        else:
            glider_days_by_yymmdd = self._datasets_days.sum(axis=1)
            years = pd.to_datetime(glider_days_by_yymmdd.index).year.unique()
            if year not in years:
                self._logger.warning(
                    'No glider days found in year {:}'.format(year))
                return pd.DataFrame()
            return glider_days_by_yymmdd[pd.to_datetime(
                glider_days_by_yymmdd.index).year == year].groupby(
                    [lambda x: x.month, lambda x: x.day]).sum().unstack()

    def get_profiles_calendar(self, year=None):
        if not year:
            return self._datasets_profiles.sum(axis=1).groupby(
                [lambda x: x.year, lambda x: x.month]).sum().unstack()
        else:
            profiles_by_yymmdd = self._datasets_profiles.sum(axis=1)
            years = pd.to_datetime(profiles_by_yymmdd.index).year.unique()
            if year not in years:
                self._logger.warning(
                    'No profiles found in year {:}'.format(year))
                return pd.DataFrame()
            return profiles_by_yymmdd[pd.to_datetime(
                profiles_by_yymmdd.index).year == year].groupby(
                    [lambda x: x.month, lambda x: x.day]).sum().unstack()

    def search_datasets(self, search_for=None, delayedmode=False, **kwargs):
        """Search the ERDDAP server for glider deployment datasets.  Results are stored as pandas DataFrames in:

        self.datasets_info
        self.datasets_summaries

        Equivalent to ERDDAP's Advanced Search.  Searches can be performed by free text, bounding box, time bounds, etc.
        See the erddapy documentation for valid kwargs."""
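        # Hypothetical example (argument values are illustrative):
        #   client.search_datasets(search_for='ng645',
        #                          min_time='2021-06-01T00:00:00Z',
        #                          max_time='2021-09-01T00:00:00Z')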

        url = self._client.get_search_url(search_for=search_for, **kwargs)
        self._last_request = url

        glider_regex = re.compile(r'^(.*)-\d{8}T\d{4}')
        try:
            self._datasets_info = pd.read_csv(url)
            # Drop the allDatasets row
            self._datasets_info.drop(self._datasets_info[
                self._datasets_info['Dataset ID'] == 'allDatasets'].index,
                                     inplace=True)

            # Reset the index to start at 0
            self._datasets_info.reset_index(inplace=True)
            # Drop the index, griddap and wms columns
            self._datasets_info.drop(['index', 'griddap', 'wms'],
                                     axis=1,
                                     inplace=True)

            # rename the columns to friendlier names
            columns = {
                s: s.replace(' ', '_').lower()
                for s in self._datasets_info.columns
            }
            self._datasets_info.rename(columns=columns, inplace=True)

            if not delayedmode:
                self._datasets_info = self._datasets_info[
                    ~self._datasets_info.dataset_id.str.endswith('delayed')]

            # Iterate through each data set (except for allDatasets) and grab the info page
            datasets = []
            daily_profiles = []
            datasets_days = []
            for i, row in self._datasets_info.iterrows():

                if row['dataset_id'] == 'allDatasets':
                    continue

                if delayedmode and not row['dataset_id'].endswith('delayed'):
                    continue
                elif not delayedmode and row['dataset_id'].endswith('delayed'):
                    continue

                self._logger.info('Fetching dataset: {:}'.format(
                    row['dataset_id']))

                # Get the data download url for erddap_vars
                try:
                    data_url = self._client.get_download_url(
                        dataset_id=row['dataset_id'],
                        variables=self._profiles_variables)
                except (ConnectionError, ConnectionRefusedError,
                        urllib3.exceptions.MaxRetryError) as e:
                    self._logger.error('{:} fetch failed: {:}'.format(
                        row['dataset_id'], e))
                    continue

                # Fetch the profiles into a pandas dataframe
                try:
                    profiles = pd.read_csv(data_url,
                                           skiprows=[1],
                                           index_col='time',
                                           parse_dates=True).sort_index()
                except HTTPError as e:
                    self._logger.error(
                        'Failed to fetch profiles: {:}'.format(e))
                    continue

                # Group profiles by yyyy-mm-dd and sum the number of profiles per day
                s = profiles.profile_id.dropna().groupby(
                    lambda x: x.date).count()
                s.name = row['dataset_id']
                daily_profiles.append(s)

                # Create the deployment date range
                d_index = pd.date_range(s.index.min(), s.index.max())
                deployment_days = pd.Series([1 for x in d_index],
                                            index=d_index,
                                            name=row['dataset_id'])
                datasets_days.append(deployment_days)

                glider_match = glider_regex.match(row['dataset_id'])
                glider = glider_match.groups()[0]

                # First profile time
                dt0 = profiles.index.min()
                # Last profile time
                dt1 = profiles.index.max()
                # Deployment length in days
                days = ceil((dt1 - dt0).total_seconds() / 86400)

                dataset_summary = [
                    glider, row['dataset_id'],
                    str(profiles.wmo_id.unique()[0]), dt0, dt1,
                    profiles.iloc[0]['latitude'],
                    profiles.iloc[0]['longitude'],
                    profiles.latitude.min(),
                    profiles.latitude.max(),
                    profiles.longitude.min(),
                    profiles.longitude.max(), profiles.shape[0], days
                ]

                datasets.append(dataset_summary)

            columns = [
                'glider', 'dataset_id', 'wmo_id', 'start_date', 'end_date',
                'deployment_lat', 'deployment_lon', 'lat_min', 'lat_max',
                'lon_min', 'lon_max', 'num_profiles', 'days'
            ]

            self._datasets_summaries = pd.DataFrame(datasets, columns=columns)

            # Create and store the DataFrame containing a 1 on each day the glider was deployed, 0 otherwise
            self._datasets_days = pd.concat(datasets_days, axis=1).sort_index()

            # Create and store the DataFrame containing the number of profiles on each day for each deployment
            self._datasets_profiles = pd.concat(daily_profiles,
                                                axis=1).sort_index()

        except HTTPError as e:
            self._logger.error(e)

        return

    def get_dataset_info(self, dataset_id):
        """Fetch the dataset metadata for the specified dataset_id"""

        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        info = self._datasets_info[self._datasets_info.dataset_id ==
                                   dataset_id]
        info.reset_index(inplace=True)
        return info.drop('index', axis=1).transpose()

    def get_dataset_profiles(self, dataset_id):
        """Fetch all profiles (time, latitude, longitude, profile_id) for the specified dataset.  Profiles are sorted
        by ascending time"""

        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        url = self._client.get_download_url(dataset_id=dataset_id,
                                            variables=self._profiles_variables)

        return pd.read_csv(url,
                           parse_dates=True,
                           skiprows=[1],
                           index_col='time').sort_index()

    def get_dataset_time_coverage(self, dataset_id):
        """Get the time coverage and wmo id (if specified) for specified dataset_id """
        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        return self._datasets_summaries[[
            'dataset_id', 'start_date', 'end_date', 'wmo_id'
        ]].iloc[self.dataset_ids.index(dataset_id)]

    def get_dataset_time_series(self,
                                dataset_id,
                                variables,
                                min_time=None,
                                max_time=None):
        """Fetch the variables time-series for the specified dataset_id.  A time window can be specified using min_time
        and max_time, which must be ISO-8601 formatted date strings (i.e.: 'YYYY-mm-ddTHH:MM')

        Parameters
        dataset_id: valid dataset id from self.datasets
        variables: list of one or more valid variables in the dataset

        Options
        min_time: minimum time value formatted as 'YYYY-mm-ddTHH:MM[:SS]'
        max_time: maximum time value formatted as 'YYYY-mm-ddTHH:mm[:SS]'
        """
        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        if not isinstance(variables, list):
            variables = [variables]

        all_variables = ['precise_time', 'time', 'depth'] + variables
        variables = set(all_variables)

        constraints = {}
        if min_time:
            constraints['precise_time>='] = min_time
        if max_time:
            constraints['precise_time<='] = max_time

        # Not sure why, but pd.read_csv doesn't like unencoded special characters in data request urls,
        # so percent-escape them prior to sending the data request.
        data_url = self.encode_url(
            self._client.get_download_url(dataset_id=dataset_id,
                                          variables=variables,
                                          constraints=constraints))

        return pd.read_csv(
            data_url, skiprows=[1],
            parse_dates=True).set_index('precise_time').sort_index()

    def plot_yearly_totals(self,
                           totals_type=None,
                           palette='Blues_d',
                           **kwargs):
        """Bar chart plot of deployments, glider days and profiles, grouped by year"""
        totals = self.yearly_counts.reset_index()

        if totals_type and totals_type not in totals.columns:
            self._logger.error(
                'Invalid category specified: {:}'.format(totals_type))
            return

        if not totals_type:
            fig, (ax1, ax2, ax3) = plt.subplots(3,
                                                1,
                                                figsize=(8.5, 11),
                                                sharex=True)
            sns.barplot(x='year',
                        y='deployments',
                        ax=ax1,
                        data=totals,
                        palette=palette,
                        **kwargs)
            sns.barplot(x='year',
                        y='glider days',
                        ax=ax2,
                        data=totals,
                        palette=palette,
                        **kwargs)
            sns.barplot(x='year',
                        y='profiles',
                        ax=ax3,
                        data=totals,
                        palette=palette,
                        **kwargs)

            ax2.set_xlabel('')
            ax1.set_xlabel('')

            ax1.set_title('U.S. IOOS Glider Data Assembly Center')

            return fig, ax1, ax2, ax3

        else:
            ax = sns.barplot(x='year',
                             y=totals_type,
                             data=totals,
                             palette=palette,
                             **kwargs)
            ax.set_title('U.S. IOOS Glider Data Assembly Center')

            return ax.figure, ax

    def plot_datasets_calendar(self, calendar_type, year=None, cmap=None):
        """Heatmap of the specified calendar_type"""
        if calendar_type not in self._calendar_types:
            self._logger.error(
                'Invalid calendar type specified: {:}'.format(calendar_type))
            return

        if calendar_type == 'datasets':
            if not year:
                data = self.get_deployments_calendar()
                title = 'Active Real-Time Datasets'
            else:
                data = self.get_deployments_calendar(year)
                title = 'Active Real-Time Datasets: {:}'.format(year)
        elif calendar_type == 'days':
            if not year:
                data = self.get_glider_days_calendar()
                data.columns = self._months
                title = 'Glider In-Water Days'
            else:
                data = self.get_glider_days_calendar(year)
                title = 'Glider In-Water Days: {:}'.format(year)
        elif calendar_type == 'profiles':
            if not year:
                data = self.get_profiles_calendar()
                data.columns = self._months
                title = 'Real-Time Profiles'
            else:
                data = self.get_profiles_calendar(year)
                title = 'Real-Time Profiles: {:}'.format(year)
        else:
            self._logger.error(
                'Unknown calendar type: {:}'.format(calendar_type))
            return

        if data.empty:
            self._logger.warning('No results found')
            return

        if year:
            data.index = self._months
            plt.figure(figsize=(8.5, 4.))
            cb = True
            annotate = False
        else:
            data.columns = self._months
            plt.figure(figsize=(8.5, 8.5))
            cb = False
            annotate = True

        if cmap:
            ax = sns.heatmap(data,
                             annot=annotate,
                             fmt='.0f',
                             square=True,
                             cbar=cb,
                             linewidths=0.5,
                             cmap=cmap)
        else:
            ax = sns.heatmap(data,
                             annot=annotate,
                             fmt='.0f',
                             square=True,
                             cbar=cb,
                             linewidths=0.5)

        ax.invert_yaxis()
        _ = [ytick.set_rotation(0) for ytick in ax.get_yticklabels()]
        ax.set_title(title)

        return ax

    def plot_dataset_profiles_calendar(self, dataset_id, **heatmap_kwargs):
        """Plot the heatmap profiles/day calendar for the specified dataset"""
        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        profiles = self.get_dataset_profiles(dataset_id)
        if profiles.empty:
            self._logger.warning(
                'No profiles found for dataset: {:}'.format(dataset_id))
            return

        pgroup = profiles.latitude.groupby(
            [lambda x: x.year, lambda x: x.month, lambda x: x.day]).count()
        calendar = pgroup.unstack()

        annotate = True
        square = True
        cbar = False
        #annot_kws = {'fontsize': 10}
        annot_kws = {}

        fig = plt.figure(figsize=(11, 8.5))

        ax = sns.heatmap(calendar,
                         annot=annotate,
                         fmt='.0f',
                         square=square,
                         cbar=cbar,
                         linewidths=0.5,
                         annot_kws=annot_kws)

        # Format default y-tick labels to 'mmm YYYY'
        ylabels = [y.get_text() for y in ax.get_yticklabels()]
        new_ylabels = []
        for ylabel in ylabels:
            y, m = ylabel.split('-')
            new_ylabels.append('{:} {:}'.format(self._months[int(m) - 1][0:3],
                                                y))
        ax.set_yticklabels(new_ylabels)

        ax.set_ylabel('')
        ax.invert_yaxis()
        _ = [ytick.set_rotation(0) for ytick in ax.get_yticklabels()]

        ax.set_title('Profiles: {:}'.format(dataset_id))

        return ax

    @staticmethod
    def encode_url(data_url):
        """Percent encode special url characters."""
        url_pieces = list(urlsplit(data_url))
        url_pieces[3] = quote(url_pieces[3])

        return urlunsplit(url_pieces)

    def __repr__(self):
        return "<GdacClient(server='{:}', response='{:}', num_datasets={:})>".format(
            self._client.server, self._client.response,
            len(self._datasets_info))
Example 5

    'time',
]


# In[2]:

e = ERDDAP(
    server=server,
    dataset_id=dataset_id,
    constraints=constraints,
    variables=variables,
    protocol='tabledap',
    response='mat',
)

print(e.get_download_url())


# # Obtaining the data
# 
# There are a few methods to obtain the data with *to_pandas()* and *to_xarray()*:

# In[3]:

df = e.to_pandas(
    index_col='time',
    parse_dates=True,
    skiprows=(1,)  # units information can be dropped.
).dropna()
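
# A sketch that is not part of the original notebook: assuming the erddapy client in use
# provides to_xarray(), the same request can also be loaded as an xarray Dataset.
ds = e.to_xarray()
print(ds)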

Example 6
import re

import numpy as np
import pandas as pd
from erddapy import ERDDAP


def get_standard_variables_and_metadata(server_link, standard_variable_list):

    # Get access to the server and find datasets associated with standard_name variable listed
    e = ERDDAP(server=server_link, protocol='tabledap', response='csv')

    # Define Filter for which datasets to look into
    kw = {
        'standard_name': ','.join(standard_variable_list),
        'min_lon': -180.0,
        'max_lon': 180.0,
        'min_lat': -90.0,
        'max_lat': 90.0,
        'min_time': '',
        'max_time': '',
        'cdm_data_type': ''
    }

    variable_to_groupby = [('latitude', 'degrees_north'),
                           ('longitude', 'degrees_east')]

    # Get available datasets from that server
    search_url = e.get_search_url(response='csv', **kw)
    datasets = pd.read_csv(search_url)

    # Print results
    print(e.server)
    print(
        str(len(datasets)) + " datasets contain " +
        ', '.join(standard_variable_list))

    # Loop through different data sets and create a metadata dataFrame
    df = pd.DataFrame(columns=['Dataset ID'])

    for index, row in datasets.iterrows():
        # Get Info from dataset (mostly min/max lat/long)
        print(row['Dataset ID'])
        info_url = e.get_info_url(dataset_id=row['Dataset ID'], response='csv')
        info = pd.read_csv(info_url)
        attribute_table = info.set_index(
            ['Row Type', 'Variable Name',
             'Attribute Name']).transpose()['attribute']

        # Try to get the distinct lat/long and time and depth range for that dataset, if it fails rely on the
        # ERDDAP metadata
        try:
            # If dataset is spread out geographically find distinct locations (may not work well for trajectory data)
            latlong_url = e.get_download_url(
                dataset_id=row['Dataset ID'],
                protocol='tabledap',
                variables=['latitude', 'longitude', 'time'])

            # Append commands to the url to get distinct values, ordered with min and max time for each lat/long
            distinctMinMaxTime_url = latlong_url + '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Ctime%22)'

            # Get lat/long and min/max depth for this dataset
            data = pd.read_csv(distinctMinMaxTime_url, header=[0, 1])

            # Group data by latitude/longitude and get min max values
            data_reduced = data.groupby(by=variable_to_groupby).agg(
                ['min', 'max']).reset_index()

            if info[(info['Variable Name'] == 'depth')].size > 0:
                latlongdepth_url = e.get_download_url(
                    dataset_id=row['Dataset ID'],
                    protocol='tabledap',
                    variables=['latitude', 'longitude', 'depth'])

                # Append commands to the url to get distinct values, ordered with min and max depth
                # for each lat/long
                distinctMinMaxDepth_url = latlongdepth_url + \
                                          '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Cdepth%22)'

                # Get lat/long and min/max depth for this dataset
                data_depth = pd.read_csv(distinctMinMaxDepth_url,
                                         header=[0, 1])

                # Group depth data by lat/long and get min max values
                data_depth_reduced = data_depth.groupby(
                    by=variable_to_groupby).agg(['min', 'max']).reset_index()

                # Merge depth values with time
                data_reduced = data_reduced.merge(data_depth_reduced,
                                                  on=variable_to_groupby,
                                                  how='left')

            # Merge multi index column names
            data_reduced.columns = data_reduced.columns.map(
                ' '.join).str.strip(' ')

        except Exception as exception_error:

            print('Failed to read: ' + str(exception_error))
            # If there's only one location, the range can be taken from the ERDDAP metadata instead.

            # Find the lat/long range of this dataset; if it's a single point we don't need to query the data
            min_latitude = float(attribute_table['NC_GLOBAL',
                                                 'geospatial_lat_min'].Value)
            max_latitude = float(attribute_table['NC_GLOBAL',
                                                 'geospatial_lat_max'].Value)
            min_longitude = float(attribute_table['NC_GLOBAL',
                                                  'geospatial_lon_min'].Value)
            max_longitude = float(attribute_table['NC_GLOBAL',
                                                  'geospatial_lon_max'].Value)

            # If min/max lat/long are the same it's a single location, so use the metadata rather than querying the dataset
            if (min_latitude == max_latitude) & (min_longitude
                                                 == max_longitude):
                data_reduced = {}
                data_reduced['latitude degrees_north'] = min_latitude
                data_reduced['longitude degrees_east'] = min_longitude

                if 'depth' in attribute_table.columns and 'actual_range' in attribute_table[
                        'depth'] and ('m'
                                      == attribute_table['depth',
                                                         'units']['Value']):

                    depth_range = np.array(
                        str.split(
                            attribute_table['depth', 'actual_range']['Value'],
                            ',')).astype(float)
                    data_reduced['depth m min'] = depth_range[0]
                    data_reduced['depth m max'] = depth_range[1]

                # Convert to DataFrame
                data_reduced = pd.DataFrame(data_reduced, index=[0])
                print('Retrieved metadata')
            else:
                # Skip datasets with multiple locations whose data couldn't be retrieved
                continue

        # Add the variable name matching each standard_name to the table
        for var in standard_variable_list:
            data_reduced[var] = ','.join(
                e.get_var_by_attr(dataset_id=row['Dataset ID'],
                                  standard_name=var))

        # Add cdm_data_type to table
        data_reduced['cdm_data_type'] = ','.join(
            info[info['Attribute Name'] == 'cdm_data_type']['Value'].values)

        # Add Dataset id to the table
        data_reduced['Dataset ID'] = row['Dataset ID']

        # Append this dataset's info to the previously downloaded data
        df = pd.concat([df, data_reduced])

    # Add server to dataFrame
    df['server'] = e.server

    # Save resulting dataframe to a CSV, file name is based on the server address
    file_name = re.sub('https*://', '', e.server)
    file_name = re.sub(r'[./]', '_', file_name)
    file_name = 'Server_List_' + file_name + '.csv'

    print('Save result to ' + file_name)
    df.to_csv(file_name)

    return df
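
# A minimal usage sketch (not part of the original example); the server URL and the
# standard_name list are illustrative assumptions.
df_meta = get_standard_variables_and_metadata(
    'https://erddap.sensors.ioos.us/erddap',
    ['sea_water_temperature', 'sea_water_practical_salinity'])
print(df_meta.head())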
Example 7
def GOFS_RTOFS_vs_Argo_floats(lon_forec_track, lat_forec_track, lon_forec_cone,
                              lat_forec_cone, lon_best_track, lat_best_track,
                              lon_lim, lat_lim, folder_fig):
    #%% User input

    #GOFS3.1 output model location
    url_GOFS_ts = 'http://tds.hycom.org/thredds/dodsC/GLBy0.08/expt_93.0/ts3z'

    # RTOFS files
    folder_RTOFS = '/home/coolgroup/RTOFS/forecasts/domains/hurricanes/RTOFS_6hourly_North_Atlantic/'

    nc_files_RTOFS = ['rtofs_glo_3dz_f006_6hrly_hvr_US_east.nc',\
                      'rtofs_glo_3dz_f012_6hrly_hvr_US_east.nc',\
                      'rtofs_glo_3dz_f018_6hrly_hvr_US_east.nc',\
                      'rtofs_glo_3dz_f024_6hrly_hvr_US_east.nc']

    # COPERNICUS MARINE ENVIRONMENT MONITORING SERVICE (CMEMS)
    url_cmems = 'http://nrt.cmems-du.eu/motu-web/Motu'
    service_id = 'GLOBAL_ANALYSIS_FORECAST_PHY_001_024-TDS'
    product_id = 'global-analysis-forecast-phy-001-024'
    depth_min = '0.493'
    out_dir = '/home/aristizabal/crontab_jobs'

    # Bathymetry file
    #bath_file = '/Users/aristizabal/Desktop/MARACOOS_project/Maria_scripts/nc_files/GEBCO_2014_2D_-100.0_0.0_-60.0_45.0.nc'
    bath_file = '/home/aristizabal/bathymetry_files/GEBCO_2014_2D_-100.0_0.0_-10.0_50.0.nc'

    # Argo floats
    url_Argo = 'http://www.ifremer.fr/erddap'

    #%%

    from matplotlib import pyplot as plt
    import numpy as np
    import xarray as xr
    import netCDF4
    from datetime import datetime, timedelta
    import cmocean
    import matplotlib.dates as mdates
    from erddapy import ERDDAP
    import pandas as pd
    import os

    # Do not produce figures on screen
    plt.switch_backend('agg')

    # Increase fontsize of labels globally
    plt.rc('xtick', labelsize=14)
    plt.rc('ytick', labelsize=14)
    plt.rc('legend', fontsize=14)

    #%% Reading bathymetry data

    ncbath = xr.open_dataset(bath_file)
    bath_lat = ncbath.variables['lat'][:]
    bath_lon = ncbath.variables['lon'][:]
    bath_elev = ncbath.variables['elevation'][:]

    oklatbath = np.logical_and(bath_lat >= lat_lim[0], bath_lat <= lat_lim[-1])
    oklonbath = np.logical_and(bath_lon >= lon_lim[0], bath_lon <= lon_lim[-1])

    bath_latsub = bath_lat[oklatbath]
    bath_lonsub = bath_lon[oklonbath]
    bath_elevs = bath_elev[oklatbath, :]
    bath_elevsub = bath_elevs[:, oklonbath]

    #%% Get time bounds for current day
    #ti = datetime.today()
    ti = datetime.today() - timedelta(1) - timedelta(hours=6)
    tini = datetime(ti.year, ti.month, ti.day)
    te = ti + timedelta(2)
    tend = datetime(te.year, te.month, te.day)

    #%% Look for Argo datasets

    e = ERDDAP(server=url_Argo)

    # Grab every dataset available
    #datasets = pd.read_csv(e.get_search_url(response='csv', search_for='all'))

    kw = {
        'min_lon': lon_lim[0],
        'max_lon': lon_lim[1],
        'min_lat': lat_lim[0],
        'max_lat': lat_lim[1],
        'min_time': str(tini),
        'max_time': str(tend),
    }

    search_url = e.get_search_url(response='csv', **kw)

    # Grab the results
    search = pd.read_csv(search_url)

    # Extract the IDs
    dataset = search['Dataset ID'].values

    msg = 'Found {} Datasets:\n\n{}'.format
    print(msg(len(dataset), '\n'.join(dataset)))

    dataset_type = dataset[0]

    constraints = {
        'time>=': str(tini),
        'time<=': str(tend),
        'latitude>=': lat_lim[0],
        'latitude<=': lat_lim[1],
        'longitude>=': lon_lim[0],
        'longitude<=': lon_lim[1],
    }

    variables = [
        'platform_number',
        'time',
        'pres',
        'longitude',
        'latitude',
        'temp',
        'psal',
    ]

    e = ERDDAP(server=url_Argo, protocol='tabledap', response='nc')

    e.dataset_id = dataset_type
    e.constraints = constraints
    e.variables = variables

    print(e.get_download_url())

    df = e.to_pandas(
        parse_dates=True,
        skiprows=(1, )  # units information can be dropped.
    ).dropna()

    argo_ids = np.asarray(df['platform_number'])
    argo_times = np.asarray(df['time (UTC)'])
    argo_press = np.asarray(df['pres (decibar)'])
    argo_lons = np.asarray(df['longitude (degrees_east)'])
    argo_lats = np.asarray(df['latitude (degrees_north)'])
    argo_temps = np.asarray(df['temp (degree_Celsius)'])
    argo_salts = np.asarray(df['psal (PSU)'])

    #%% GOFS 3.1

    try:
        GOFS_ts = xr.open_dataset(url_GOFS_ts, decode_times=False)

        lt_GOFS = np.asarray(GOFS_ts['lat'][:])
        ln_GOFS = np.asarray(GOFS_ts['lon'][:])
        tt = GOFS_ts['time']
        t_GOFS = netCDF4.num2date(tt[:], tt.units)
        depth_GOFS = np.asarray(GOFS_ts['depth'][:])
    except Exception as err:
        print(err)
        GOFS_ts = np.nan
        lt_GOFS = np.nan
        ln_GOFS = np.nan
        depth_GOFS = np.nan
        t_GOFS = ti
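    # GOFS_ts is left as np.nan on failure so that the isinstance(GOFS_ts, float)
    # check in the profile loop below simply skips the GOFS curves.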

    #%% Map Argo floats

    lev = np.arange(-9000, 9100, 100)
    plt.figure()
    plt.contourf(bath_lonsub,
                 bath_latsub,
                 bath_elevsub,
                 lev,
                 cmap=cmocean.cm.topo)
    plt.plot(lon_forec_track, lat_forec_track, '.-', color='gold')
    plt.plot(lon_forec_cone, lat_forec_cone, '.-b', markersize=1)
    plt.plot(lon_best_track, lat_best_track, 'or', markersize=3)

    argo_idd = np.unique(argo_ids)
    for i, id in enumerate(argo_idd):
        okind = np.where(argo_ids == id)[0]
        plt.plot(np.unique(argo_lons[okind]),
                 np.unique(argo_lats[okind]),
                 's',
                 color='darkorange',
                 markersize=5,
                 markeredgecolor='k')

    plt.title('Argo Floats ' + str(tini)[0:13] + '-' + str(tend)[0:13],
              fontsize=16)
    plt.axis('scaled')
    plt.xlim(lon_lim[0], lon_lim[1])
    plt.ylim(lat_lim[0], lat_lim[1])

    file = folder_fig + 'ARGO_lat_lon'
    #file = folder_fig + 'ARGO_lat_lon_' + str(np.unique(argo_times)[0])[0:10]
    plt.savefig(file, bbox_inches='tight', pad_inches=0.1)

    #%% Figure: Argo float vs GOFS, RTOFS and Copernicus

    argo_idd = np.unique(argo_ids)

    for i, id in enumerate(argo_idd):
        print(id)
        okind = np.where(argo_ids == id)[0]
        argo_time = np.asarray([
            datetime.strptime(t, '%Y-%m-%dT%H:%M:%SZ')
            for t in argo_times[okind]
        ])

        argo_lon = argo_lons[okind]
        argo_lat = argo_lats[okind]
        argo_pres = argo_press[okind]
        argo_temp = argo_temps[okind]
        argo_salt = argo_salts[okind]

        # GOFS
        print('Retrieving variables from GOFS')
        if isinstance(GOFS_ts, float):
            temp_GOFS = np.nan
            salt_GOFS = np.nan
        else:
            #oktt_GOFS = np.where(t_GOFS >= argo_time[0])[0][0]
            ttGOFS = np.asarray([
                datetime(t_GOFS[i].year, t_GOFS[i].month, t_GOFS[i].day,
                         t_GOFS[i].hour) for i in np.arange(len(t_GOFS))
            ])
            tstamp_GOFS = [
                mdates.date2num(ttGOFS[i]) for i in np.arange(len(ttGOFS))
            ]
            oktt_GOFS = np.unique(
                np.round(
                    np.interp(mdates.date2num(argo_time[0]), tstamp_GOFS,
                              np.arange(len(tstamp_GOFS)))).astype(int))[0]
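            # The interp/round construct above picks the GOFS time index closest to
            # the float time; an approximately equivalent, more explicit form
            # (a sketch, not used here) would be:
            #   oktt_GOFS = int(np.argmin(np.abs(np.asarray(tstamp_GOFS)
            #                                    - mdates.date2num(argo_time[0]))))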
            oklat_GOFS = np.where(lt_GOFS >= argo_lat[0])[0][0]
            oklon_GOFS = np.where(ln_GOFS >= argo_lon[0] + 360)[0][0]
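            # The +360 shift above assumes the GOFS grid stores longitude in the
            # 0-360 convention, while the Argo longitudes are negative
            # (degrees east, -180 to 180).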
            temp_GOFS = np.asarray(GOFS_ts['water_temp'][oktt_GOFS, :,
                                                         oklat_GOFS,
                                                         oklon_GOFS])
            salt_GOFS = np.asarray(GOFS_ts['salinity'][oktt_GOFS, :,
                                                       oklat_GOFS, oklon_GOFS])

        # RTOFS
        #Time window
        year = int(argo_time[0].year)
        month = int(argo_time[0].month)
        day = int(argo_time[0].day)
        tini = datetime(year, month, day)
        tend = tini + timedelta(days=1)

        # Read RTOFS grid and time
        print('Retrieving coordinates from RTOFS')

        # RTOFS output folders are named rtofs.YYYYMMDD
        fol = 'rtofs.' + tini.strftime('%Y%m%d')

        ncRTOFS = xr.open_dataset(folder_RTOFS + fol + '/' + nc_files_RTOFS[0])
        latRTOFS = np.asarray(ncRTOFS.Latitude[:])
        lonRTOFS = np.asarray(ncRTOFS.Longitude[:])
        depth_RTOFS = np.asarray(ncRTOFS.Depth[:])

        tRTOFS = []
        for t in np.arange(len(nc_files_RTOFS)):
            ncRTOFS = xr.open_dataset(folder_RTOFS + fol + '/' +
                                      nc_files_RTOFS[t])
            tRTOFS.append(np.asarray(ncRTOFS.MT[:])[0])

        tRTOFS = np.asarray([mdates.num2date(mdates.date2num(tRTOFS[t]))
                             for t in np.arange(len(nc_files_RTOFS))])

        oktt_RTOFS = np.where(
            mdates.date2num(tRTOFS) >= mdates.date2num(argo_time[0]))[0][0]
        oklat_RTOFS = np.where(latRTOFS[:, 0] >= argo_lat[0])[0][0]
        oklon_RTOFS = np.where(lonRTOFS[0, :] >= argo_lon[0])[0][0]
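        # The RTOFS Latitude/Longitude arrays are two-dimensional, so the nearest
        # indices above are taken from the first column and first row respectively
        # (this assumes the grid is approximately rectangular over this region).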

        nc_file = folder_RTOFS + fol + '/' + nc_files_RTOFS[oktt_RTOFS]
        ncRTOFS = xr.open_dataset(nc_file)
        #time_RTOFS = tRTOFS[oktt_RTOFS]
        temp_RTOFS = np.asarray(ncRTOFS.variables['temperature'][0, :,
                                                                 oklat_RTOFS,
                                                                 oklon_RTOFS])
        salt_RTOFS = np.asarray(ncRTOFS.variables['salinity'][0, :,
                                                              oklat_RTOFS,
                                                              oklon_RTOFS])
        #lon_RTOFS = lonRTOFS[0,oklon_RTOFS]
        #lat_RTOFS = latRTOFS[oklat_RTOFS,0]

        # Downloading and reading Copernicus output
        motuc = 'python -m motuclient --motu ' + url_cmems + \
        ' --service-id ' + service_id + \
        ' --product-id ' + product_id + \
        ' --longitude-min ' + str(argo_lon[0]-2/12) + \
        ' --longitude-max ' + str(argo_lon[0]+2/12) + \
        ' --latitude-min ' + str(argo_lat[0]-2/12) + \
        ' --latitude-max ' + str(argo_lat[0]+2/12) + \
        ' --date-min ' + '"' + str(tini-timedelta(0.5)) + '"' + \
        ' --date-max ' + '"' + str(tend+timedelta(0.5)) + '"' + \
        ' --depth-min ' + depth_min + \
        ' --depth-max ' + str(np.nanmax(argo_pres)+1000) + \
        ' --variable ' + 'thetao' + ' ' + \
        ' --variable ' + 'so'  + ' ' + \
        ' --out-dir ' + out_dir + \
        ' --out-name ' + str(id) + '.nc' + ' ' + \
        ' --user ' + 'maristizabalvar' + ' ' + \
        ' --pwd ' +  'MariaCMEMS2018'

        os.system(motuc)
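        # A sketch of an alternative (assuming the same motuclient CLI is available):
        # subprocess exposes the exit code directly, avoiding the extra 'ls' call
        # used below to test that the file exists, e.g.
        #   import subprocess
        #   download_ok = subprocess.run(motuc, shell=True).returncode == 0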
        COP_file = out_dir + '/' + str(id) + '.nc'
        # Check that the file was actually downloaded
        resp = os.system('ls ' + COP_file)
        if resp == 0:
            COP = xr.open_dataset(COP_file)

            latCOP = np.asarray(COP.latitude[:])
            lonCOP = np.asarray(COP.longitude[:])
            depth_COP = np.asarray(COP.depth[:])
            tCOP = np.asarray(mdates.num2date(mdates.date2num(COP.time[:])))

            oktimeCOP = np.where(
                mdates.date2num(tCOP) >= mdates.date2num(tini))[0][0]
            oklonCOP = np.where(lonCOP >= argo_lon[0])[0][0]
            oklatCOP = np.where(latCOP >= argo_lat[0])[0][0]

            temp_COP = np.asarray(COP.variables['thetao'][oktimeCOP, :,
                                                          oklatCOP, oklonCOP])
            salt_COP = np.asarray(COP.variables['so'][oktimeCOP, :,
                                                      oklatCOP, oklonCOP])
        else:
            # Download failed: fall back to NaNs so the profile figures below
            # still render
            latCOP = np.empty(1)
            latCOP[:] = np.nan
            lonCOP = np.empty(1)
            lonCOP[:] = np.nan
            tCOP = np.empty(1)
            tCOP[:] = np.nan
            depth_COP = np.empty(1)
            depth_COP[:] = np.nan
            temp_COP = np.empty(1)
            temp_COP[:] = np.nan
            salt_COP = np.empty(1)
            salt_COP[:] = np.nan

        # Figure temp
        plt.figure(figsize=(5, 6))
        plt.plot(argo_temp,
                 -argo_pres,
                 '.-',
                 linewidth=2,
                 label='ARGO Float id ' + str(id))
        plt.plot(temp_GOFS,
                 -depth_GOFS,
                 '.-',
                 linewidth=2,
                 label='GOFS 3.1',
                 color='red')
        plt.plot(temp_RTOFS,
                 -depth_RTOFS,
                 '.-',
                 linewidth=2,
                 label='RTOFS',
                 color='g')
        plt.plot(temp_COP,
                 -depth_COP,
                 '.-',
                 linewidth=2,
                 label='Copernicus',
                 color='darkorchid')
        plt.ylim([-1000, 0])
        plt.title('Temperature Profile on ' + str(argo_time[0])[0:13] +
                  '\n [lon,lat] = [' + str(np.round(argo_lon[0], 3)) + ',' +
                  str(np.round(argo_lat[0], 3)) + ']',
                  fontsize=16)
        plt.ylabel('Depth (m)', fontsize=14)
        plt.xlabel('$^oC$', fontsize=14)
        plt.legend(loc='lower right', fontsize=14)

        file = folder_fig + 'ARGO_vs_GOFS_RTOFS_COP_temp_' + str(id)
        plt.savefig(file, bbox_inches='tight', pad_inches=0.1)

        # Figure salt
        plt.figure(figsize=(5, 6))
        plt.plot(argo_salt,
                 -argo_pres,
                 '.-',
                 linewidth=2,
                 label='ARGO Float id ' + str(id))
        plt.plot(salt_GOFS,
                 -depth_GOFS,
                 '.-',
                 linewidth=2,
                 label='GOFS 3.1',
                 color='red')
        plt.plot(salt_RTOFS,
                 -depth_RTOFS,
                 '.-',
                 linewidth=2,
                 label='RTOFS',
                 color='g')
        plt.plot(salt_COP,
                 -depth_COP,
                 '.-',
                 linewidth=2,
                 label='Copernicus',
                 color='darkorchid')
        plt.ylim([-1000, 0])
        plt.title('Salinity Profile on ' + str(argo_time[0])[0:13] +
                  '\n [lon,lat] = [' + str(np.round(argo_lon[0], 3)) + ',' +
                  str(np.round(argo_lat[0], 3)) + ']',
                  fontsize=16)
        plt.ylabel('Depth (m)', fontsize=14)
        plt.xlabel('PSU', fontsize=14)
        plt.legend(loc='lower right', fontsize=14)

        file = folder_fig + 'ARGO_vs_GOFS_RTOFS_COP_salt_' + str(id)
        plt.savefig(file, bbox_inches='tight', pad_inches=0.1)
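        # When many floats are found, closing figures at the end of each iteration
        # (e.g. plt.close('all')) may be worth adding to keep memory use bounded.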