Example #1
def globsimScaled2Pandas(ncdf_in, station_nr):
    """
    Read a scaled (or interpolated) globsim netCDF file and return all values
    for one station as a Pandas data frame.

    ncdf_in: full path to a globsim netCDF (by station)

    station_nr: station_number, as given in the stations .csv file to identify
                the station.

    """
    # open file
    ncf = nc.Dataset(ncdf_in, 'r')

    # station mask
    sm = ncf.variables['station'][:] == int(station_nr)
    # list variables
    varlist = [str(x) for x in ncf.variables.keys()]

    # get and convert time
    time = ncf.variables['time'][:]
    t_unit = ncf.variables['time'].units
    t_cal = ncf.variables['time'].calendar
    time = nc.num2date(time, units=t_unit, calendar=t_cal)

    # make data frame with time
    df = pd.DataFrame(data=time, columns=['time'])
    # add variables
    for var in varlist:
        if variables_skip(var):
            continue
        data = ncf.variables[var][:, sm]
        df = pd.concat([df, pd.DataFrame(data=data, columns=[var])], axis=1)

    return df
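
A minimal usage sketch for this reader; the file name and station number below are hypothetical, and it assumes the module-level imports (netCDF4 as nc, pandas as pd) and the variables_skip() helper are available:

import netCDF4 as nc
import pandas as pd

# hypothetical scaled globsim file and station number
df = globsimScaled2Pandas('scaled_erai.nc', station_nr=1)
print(df.head())  # one row per time step, one column per kept variable
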
Example #2

    def netCDF_empty(self, ncfile_out, stations, nc_in):
        # TODO: change data type from f4 to f8 for lat and lon
        '''
        Creates an empty station file to hold interpolated results. The number
        of stations is defined by the argument stations; the variables are
        determined from the gridded source netCDF handle.

        ncfile_out: full name of the file to be created
        stations:   station list read with common_utils.StationListRead()
        nc_in:      gridded source netCDF handle from which variables and
                    pressure levels (if any) are read
        '''
        rootgrp = netcdf_base(ncfile_out, len(stations), None,
                              'hours since 1980-01-01 00:00:00')

        station = rootgrp["station"]
        latitude = rootgrp["latitude"]
        longitude = rootgrp["longitude"]
        height = rootgrp["height"]

        # assign station characteristics
        station[:] = list(stations['station_number'])
        latitude[:] = list(stations['latitude_dd'])
        longitude[:] = list(stations['longitude_dd'])
        height[:] = list(stations['elevation_m'])

        # extra treatment for pressure level files
        try:
            lev = nc_in.variables['level'][:]
            logger.info("Creating empty 3D file (has pressure levels)")
            rootgrp.createDimension('level', len(lev))
            level = rootgrp.createVariable('level', 'i4', ('level',))
            level.long_name = 'pressure_level'
            level.units = 'hPa'
            level[:] = lev
        except Exception:
            logger.info("Creating empty 2D file (without pressure levels)")
            lev = []

        # variable names from the source file
        varlist_merra = [str_encode(x) for x in nc_in.variables.keys()]

        # create and assign variables based on input file
        for n, var in enumerate(varlist_merra):
            if variables_skip(var):
                continue
            logger.debug(f"Add empty variable: {var}")
            # extra treatment for pressure level files
            if len(lev):
                tmp = rootgrp.createVariable(var, 'f4',
                                             ('time', 'level', 'station'))
            else:
                tmp = rootgrp.createVariable(var, 'f4', ('time', 'station'))
            tmp.long_name = str_encode(
                nc_in.variables[var].long_name)  # for merra2
            tmp.units = str_encode(nc_in.variables[var].units)

        # close the file
        rootgrp.close()
        logger.debug(f"Created empty netcdf file {ncfile_out}")
Example #3
    def ERA2station(self,
                    ncfile_in,
                    ncfile_out,
                    points,
                    variables=None,
                    date=None):
        """
        Bilinear interpolation from fields on a regular grid (latitude,
        longitude) to individual point stations (latitude, longitude). This
        works for surface and pressure-level files (all ERA-Interim files).
        The type of variable and file structure are determined from the input.

        This function creates an empty netCDF file to hold the interpolated
        results, by calling ERAIgeneric().netCDF_empty. Then, data is
        interpolated in temporal chunks and appended. The temporal chunking
        can be set in the interpolation parameter file.

        Args:
        ncfile_in: Full path to an ERA-Interim derived netCDF file. This can
                   contain wildcards to point to multiple files if temporal
                   chunking was used.

        ncfile_out: Full path to the output netCDF file to write.

        points: A dataframe of locations. See method StationListRead in
                common_utils.py for more details.

        variables:  List of variable(s) to interpolate such as
                    ['r', 't', 'u','v', 't2m', 'u10', 'v10', 'ssrd', 'strd', 'tp'].
                    Defaults to using all variables available.

        date: Dictionary specifying begin and end time for the derived time
              series. Defaults to using all times available in ncfile_in.

        self.cs: Chunk size, i.e. how many time steps to interpolate at once,
                 read from the instance. This helps to manage overall memory
                 usage (a small cs is slower but less memory intensive).
        """

        # read in multiple netCDF files of one type
        ncf_in = nc.MFDataset(ncfile_in, 'r', aggdim='time')

        # is it a file with pressure levels?
        pl = 'level' in ncf_in.dimensions.keys()

        # build the output of empty netCDF file
        rootgrp = new_interpolated_netcdf(
            ncfile_out,
            self.stations,
            ncf_in,
            time_units='hours since 1900-01-01 00:00:0.0')
        rootgrp.source = 'ERA_Interim, interpolated bilinearly to stations'
        rootgrp.close()

        # open the output netCDF file, set it to be appendable ('a')
        ncf_out = nc.Dataset(ncfile_out, 'a')

        # get time and convert to datetime object
        nctime = ncf_in.variables['time'][:]
        #"hours since 1900-01-01 00:00:0.0"
        t_unit = ncf_in.variables['time'].units
        try:
            t_cal = ncf_in.variables['time'].calendar
        except AttributeError:  # attribute doesn't exist
            t_cal = u"gregorian"  # standard
        time = nc.num2date(nctime, units=t_unit, calendar=t_cal)

        # detect invariant files (topography etc.)
        invariant = len(time) == 1

        # restrict to date/time range if given
        if date is None:
            tmask = time < datetime(3000, 1, 1)
        else:
            tmask = (time < date['end']) * (time >= date['beg'])

        # get time vector for output
        time_in = nctime[tmask]

        # ensure that chunk sizes cover entire period even if
        # len(time_in) is not an integer multiple of cs
        niter = len(time_in) // self.cs
        niter += ((len(time_in) % self.cs) > 0)

        # loop over chunks
        for n in range(niter):
            # indices (relative to index of the output file)
            beg = n * self.cs
            # last index of this chunk (inclusive), clipped to the series end
            end = min(beg + self.cs, len(time_in)) - 1

            # time to make tmask for chunk
            beg_time = nc.num2date(time_in[beg], units=t_unit, calendar=t_cal)
            if invariant:
                # allow topography to work in the same code, len(nctime) = 1
                end_time = nc.num2date(nctime[0], units=t_unit, calendar=t_cal)
            else:
                end_time = nc.num2date(time_in[end],
                                       units=t_unit,
                                       calendar=t_cal)

            # build the time mask for this chunk (inclusive of end_time)
            tmask_chunk = (time <= end_time) * (time >= beg_time)
            if invariant:
                # allow topography to work in same code
                tmask_chunk = [True]

            # get the interpolated variables
            dfield, variables = self.interp2D(ncfile_in,
                                              ncf_in,
                                              self.stations,
                                              tmask_chunk,
                                              variables=variables,
                                              date=None)

            # append time
            ncf_out.variables['time'][:] = np.append(
                ncf_out.variables['time'][:], time_in[beg:end + 1])

            # append variables
            for i, var in enumerate(variables):
                if variables_skip(var):
                    continue

                if pl:
                    if ESMFnew:
                        # dfield dimensions: (station, variable, time, level)
                        vals = dfield.data[:, i, :, :].transpose((1, 2, 0))
                        ncf_out.variables[var][beg:end + 1, :, :] = vals
                    else:
                        # dfield dimensions: (variable, time, level, station)
                        vals = dfield.data[i, :, :, :]
                        ncf_out.variables[var][beg:end + 1, :, :] = vals
                else:
                    if ESMFnew:
                        # dfield dimensions: (station, variable, time)
                        vals = dfield.data[:, i, :].transpose((1, 0))
                        ncf_out.variables[var][beg:end + 1, :] = vals
                    else:
                        # dfield dimensions: (variable, time, station)
                        vals = dfield.data[i, :, :]
                        ncf_out.variables[var][beg:end + 1, :] = vals

        # close the files
        ncf_in.close()
        ncf_out.close()
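
A hedged usage sketch; the instance name, file paths, and the way self.stations and self.cs are populated are assumptions based on the calls above:

from datetime import datetime

# 'interp' is assumed to be an instance of the interpolator class, with
# interp.stations (station dataframe) and interp.cs (chunk size) already set.
date = {'beg': datetime(2000, 1, 1), 'end': datetime(2001, 1, 1)}
interp.ERA2station('era_pl_*.nc', 'era_pl_stations.nc', interp.stations,
                   variables=['t', 'u', 'v'], date=date)
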
Example #4
def new_interpolated_netcdf(ncfile_out, stations, nc_in, time_units):
    """
    Creates an empty station file to hold interpolated reults. The number of
    stations is defined by the variable stations, variables are determined by
    the variable list passed from the gridded original netCDF.

    ncfile_out: full name of the file to be created
    stations:   station list read with common_utils.StationListRead()
    variables:  variables read from netCDF handle
    lev:        list of pressure levels, empty is [] (default)
    """
    logger.info(f"Creating new file {ncfile_out} from ")

    rootgrp = netcdf_base(ncfile_out, len(stations), None, time_units, nc_in)

    station = rootgrp['station']
    latitude = rootgrp['latitude']
    longitude = rootgrp['longitude']
    height = rootgrp['height']

    # assign station characteristics
    station[:]   = list(stations['station_number'])
    latitude[:]  = list(stations['latitude_dd'])
    longitude[:] = list(stations['longitude_dd'])
    height[:]    = list(stations['elevation_m'])

    # extra treatment for pressure level files
    try:
        lev = nc_in.variables['level'][:]
        logger.info(f"Source dataset is 3D (has pressure levels)")
        level           = rootgrp.createDimension('level', len(lev))
        level           = rootgrp.createVariable('level', 'i4', ('level'))
        level.long_name = 'pressure_level'
        level.units     = 'hPa'
        level[:] = lev
    except Exception:
        logger.info(f"Source dataset is 2D (without pressure levels)")
        lev = []

    try:
        num = rootgrp['number'][:]
    except Exception:
        num = []

    # create and assign variables based on input file
    for var in nc_in.variables:
        if variables_skip(var):
            continue
        
        # extra treatment for ensemble ('number') and pressure level files
        if len(num):
            if len(lev):
                tmp = rootgrp.createVariable(var, 'f4', ('time', 'number',
                                                         'level', 'station'))
            else:
                tmp = rootgrp.createVariable(var, 'f4', ('time', 'number',
                                                         'station'))
        else:
            if len(lev):
                tmp = rootgrp.createVariable(var, 'f4', ('time', 'level',
                                                         'station'))
            else:
                tmp = rootgrp.createVariable(var, 'f4', ('time', 'station'))

        # copy attributes
        input_var = nc_in.variables[var]
        for key in input_var.ncattrs():
            if key in ['_FillValue']:
                continue
            tmp.setncattr(key, getattr(input_var, key))
        
        logger.info(f"Created new empty variable: {str_encode(var)} [{tmp.units}]")
    
    return rootgrp
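
A short usage sketch, with hypothetical file names; stations is assumed to be the dataframe returned by common_utils.StationListRead():

import netCDF4 as nc

nc_in = nc.Dataset('gridded_source.nc', 'r')   # hypothetical gridded file
rootgrp = new_interpolated_netcdf('stations_out.nc', stations, nc_in,
                                  time_units='hours since 1900-01-01 00:00:0.0')
rootgrp.source = 'interpolated bilinearly to stations'
rootgrp.close()
nc_in.close()
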
Example #5
    def makeNCF(self, dsi):

        variables = self.getVars(dsi)
        dataLev = self.getDataLev(dsi)
        self.getDimName(dataLev)

        varf = np.sort(
            glob.glob(path.join(self.directory, f'*{variables[0]}*')))
        ncf = nc.MFDataset(varf.tolist(), aggdim='initial_time0_hours')


        Times = ncf[self.timeName][:]
        Lats = ncf[self.latName][:].data
        Lons = ncf[self.lonName][:].data

        file_new = self.getOutFile(ncf, dataLev)

        # initialize new data file and create group
        ncn = nc.Dataset(file_new, 'w', format='NETCDF4_CLASSIC')

        # make dimensions
        if dataLev == 'pl':
            Levs = ncf[self.levName][:].data
            ncn.createDimension('level', len(Levs))
            levels = ncn.createVariable('level', 'i4', ('level', ))
            levels.long_name = 'pressure level'
            levels.units = 'mbar'
            levels[:] = Levs
        ncn.createDimension('time', len(Times))
        ncn.createDimension('latitude', len(Lats))
        ncn.createDimension('longitude', len(Lons))

        # make dimension variables
        times = ncn.createVariable('time', 'd', ('time', ))
        latitudes = ncn.createVariable('latitude', 'f8', ('latitude', ))
        longitudes = ncn.createVariable('longitude', 'f8', ('longitude', ))

        times.standard_name = 'time'
        times.units = ncf[self.timeName].units
        times.calendar = 'standard'
        latitudes.standard_name = ncf[self.latName].long_name
        latitudes.units = ncf[self.latName].units
        longitudes.standard_name = ncf[self.lonName].long_name
        longitudes.units = ncf[self.lonName].units

        ncf.close()

        # assign dimensions
        times[:] = Times
        longitudes[:] = Lons
        latitudes[:] = Lats

        for vari in variables:
            flist = np.sort(glob.glob(path.join(self.directory, f'*{vari}*')))
            ncf = nc.MFDataset(flist.tolist(), aggdim=self.timeName)
            for n, var in enumerate(ncf.variables.keys()):
                if variables_skip(self.ncfVar[var]):
                    continue
                logger.info(f"Creating variable: {var}")
                # use a new name so the outer loop variable 'vari' is not shadowed
                if dataLev == 'pl':
                    ncvar = ncn.createVariable(self.ncfVar[var], 'f4',
                                               ('time', 'level',
                                                'latitude', 'longitude'))
                    ncvar[:, :, :, :] = ncf[var][:, :, :, :]
                else:
                    ncvar = ncn.createVariable(self.ncfVar[var], 'f4',
                                               ('time', 'latitude',
                                                'longitude'))
                    ncvar[:, :, :] = ncf[var][:, :, :]

                ncvar.long_name = ncf[var].long_name
                ncvar.units = ncf[var].units

            ncf.close()
            for f in flist:
                remove(f)

        ncn.close()
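
A hedged sketch of how makeNCF() might be driven; the converter instance and its datasets attribute are assumptions, since getVars(), getDataLev(), getOutFile(), and the raw per-variable files are defined elsewhere in the class:

# 'conv' is assumed to be an instance of the converter class with
# conv.directory pointing at the raw per-variable downloads.
for dsi in conv.datasets:          # hypothetical iterable of dataset keys
    conv.makeNCF(dsi)              # merges files, then deletes the originals
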
Example #6

    def MERRA2station(self,
                      ncfile_in,
                      ncfile_out,
                      points,
                      variables=None,
                      date=None):
        """
        Given the type of variables to interpoalted from MERRA2 downloaded diretory
        Create the empty of netCDF file to hold the interpolated results, by calling
        self.netCDF_empty
        Get the interpolated results from MERRA2station
        Append all variables into the empty netCDF file
        Close all files

        Args:
        ncfile_in: Full path to a MERRA-2 derived netCDF file. This can
                    contain wildcards to point to multiple files if temporal
                    chunking was used.

        ncfile_out: Full path to the output netCDF file to write.

        points: A dataframe of locations. See method StationListRead in
                common_utils.py for more details.

        variables:  List of variable(s) to interpolate such as
                    ['T','RH','U','V',' T2M', 'U2M', 'V2M', 'U10M', 'V10M',
                    'PRECTOT', 'SWGDN','SWGDNCLR','LWGDN', 'LWGDNCLR'].
                    Defaults to using all variables available.

        date: Dictionary specifying begin and end time for the derived time
              series. Defaults to using all times available in ncfile_in.

        """

        # read in multiple netCDF files of one type
        logger.info("Loading reanalysis data into memory")
        ncf_in = nc.MFDataset(ncfile_in, 'r', aggdim='time')

        # Check station bounds
        self.validate_stations_extent(ncf_in)

        # is it a file with pressure levels?
        pl = 'level' in ncf_in.dimensions.keys()

        # build the output of empty netCDF file
        self.netCDF_empty(ncfile_out, self.stations, ncf_in)

        # open the output netCDF file, set it to be appendable ('a')
        ncf_out = nc.Dataset(ncfile_out, 'a')

        # get time and convert to datetime object
        nctime = ncf_in.variables['time'][:]
        # "hours since 1980-01-01 00:00:00"
        t_unit = "hours since 1980-01-01 00:00:00"  # ncf_in.variables['time'].units
        try:
            t_cal = ncf_in.variables['time'].calendar
        except AttributeError:  # Attribute doesn't exist
            t_cal = u"gregorian"  # or standard

        # detect invariant files (topography etc.)
        invariant = len(np.unique(nctime)) <= 2

        # restrict to date/time range if given
        if date is None:
            tmask = nctime < nc.date2num(
                datetime(3000, 1, 1), units=t_unit, calendar=t_cal)
        else:
            beg_num = nc.date2num(date['beg'], units=t_unit, calendar=t_cal)
            end_num = nc.date2num(date['end'], units=t_unit, calendar=t_cal)
            tmask = (nctime < end_num) * (nctime >= beg_num)

        if not any(tmask):
            sys.exit(
                '''\n ERROR: No downloaded data exist within date range specified by interpolation control file.
                     Download new data or change 'beg' / 'end' in interpolation control file'''
            )

        # get time vector for output
        time_in = nctime[tmask]

        # ensure that chunk sizes cover entire period even if
        # len(time_in) is not an integer multiple of cs
        niter = len(time_in) // self.cs
        niter += ((len(time_in) % self.cs) > 0)

        # loop in chunk size cs
        for n in range(niter):
            # indices
            beg = n * self.cs
            # last index of this chunk (inclusive), clipped to the series end
            if invariant:
                end = beg
            else:
                end = min(beg + self.cs, len(time_in)) - 1

            # make tmask for chunk
            beg_time = time_in[beg]
            if invariant:
                # allow topography to work in the same code, len(nctime) = 1
                end_time = nctime[0]
            else:
                end_time = time_in[end]

            # build the time mask for this chunk (inclusive of end_time)
            tmask_chunk = (nctime <= end_time) * (nctime >= beg_time)
            if invariant:
                # allow topography to work in same code
                tmask_chunk = np.array([True])

            # get the interpolated variables
            dfield, variables = self.interp2D(ncfile_in,
                                              ncf_in,
                                              self.stations,
                                              tmask_chunk,
                                              variables=variables,
                                              date=None)

            # append time
            ncf_out.variables['time'][:] = np.append(
                ncf_out.variables['time'][:], time_in[beg:end + 1])
            # append variables
            for i, var in enumerate(variables):
                if variables_skip(var):
                    continue

                # extra treatment for pressure level files
                if pl:
                    # dfield dimensions: (station, variable, time, level)
                    vals = dfield.data[:, i, :, :].transpose(1, 2, 0)
                    ncf_out.variables[var][beg:end + 1, :, :] = vals
                else:
                    # dfield dimensions: (station, variable, time)
                    vals = dfield.data[:, i, :].transpose(1, 0)
                    ncf_out.variables[var][beg:end + 1, :] = vals

        ncf_in.close()
        ncf_out.close()
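
Both *2station methods above size their chunk loop with the same ceiling-division pattern; a standalone illustration of that arithmetic:

# cover n_times steps in chunks of cs, including a short final chunk
def n_chunks(n_times, cs):
    return n_times // cs + (n_times % cs > 0)

assert n_chunks(100, 25) == 4   # exact multiple of cs
assert n_chunks(101, 25) == 5   # one extra chunk holding the final step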