Esempio n. 1
0
def performRegridding(dataset,
                      mode,
                      griddef,
                      dataargs,
                      loverwrite=False,
                      varlist=None,
                      lwrite=True,
                      lreturn=False,
                      ldebug=False,
                      lparallel=False,
                      pidstr='',
                      logger=None):
    ''' worker function to perform regridding for a given dataset and target grid

        Arguments:
          dataset    -- dataset identifier (string); forwarded to getMetaData/getTargetFile and
                        compared with the grid name to decide if regridding is necessary
          mode       -- 'climatology' or 'time-series' (checked only once the source is loaded)
          griddef    -- GridDefinition instance of the target grid
          dataargs   -- dict with dataset arguments; forwarded to getMetaData
          loverwrite -- if True, recompute even if the target file appears to be up-to-date
          varlist    -- forwarded to CentralProcessingUnit
          lwrite     -- write result to a NetCDF file in the dataset folder (via a temporary file,
                        unless the dataset is returned)
          lreturn    -- return the sink dataset instead of an exit code
          ldebug     -- prefix the target file with 'test_' and log source and sink datasets
          lparallel  -- parallel mode; requires lwrite=True and lreturn=False
          pidstr     -- prefix for log messages; pidstr[1:-1] becomes part of the temporary
                        file name in parallel mode
          logger     -- None (use root logger), a logger name, or a logging.Logger instance

        Returns:
          the sink dataset if lreturn is True, otherwise 0 ("exit code"); if the computation is
          skipped, because the target file is up-to-date, nothing is returned (i.e. None)

        Raises:
          TypeError           -- invalid dataset, dataargs, griddef or logger argument
          IOError             -- invalid parallel mode options or missing dataset folder
          DateError           -- period of the source file does not match the requested period
          NotImplementedError -- unrecognized mode
    '''
    # input checking
    if not isinstance(dataset, basestring):
        raise TypeError
    if not isinstance(dataargs, dict):
        raise TypeError  # all dataset arguments are kwargs
    if not isinstance(griddef, GridDefinition):
        raise TypeError
    if lparallel:
        if not lwrite:
            raise IOError('Can only write to disk in parallel mode (i.e. lwrite = True).')
        if lreturn:
            raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')

    # logging
    if logger is None:
        logger = logging.getLogger()  # this is the (shared) root logger
        # N.B.: only attach a handler if there is none yet; otherwise every call would stack another
        #       StreamHandler onto the root logger and each message would be printed multiple times
        if not logger.handlers:
            logger.addHandler(logging.StreamHandler())
    elif isinstance(logger, basestring):
        logger = logging.getLogger(name=logger)  # connect to existing one
    elif not isinstance(logger, logging.Logger):
        raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name
    periodstr = dataargs.periodstr
    avgfolder = dataargs.avgfolder

    # get filename for target dataset and do some checks
    filename = getTargetFile(
        dataset=dataset,
        mode=mode,
        dataargs=dataargs,
        lwrite=lwrite,
        grid=griddef.name.lower(),
    )

    # prepare target dataset
    if ldebug:
        filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            # no temporary file if dataset is passed on (can't rename the file while it is open!)
            tmpfilename = filename
        else:
            if lparallel:
                tmppfx = 'tmp_regrid_{:s}_'.format(pidstr[1:-1])
            else:
                tmppfx = 'tmp_regrid_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath) and not loverwrite:
            age = datetime.fromtimestamp(os.path.getmtime(filepath))
            # skip, if the sink file is newer than the source file and not just a stub
            # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a
            #       previous crashed run
            if age > srcage and os.path.getsize(filepath) > 1e6:
                lskip = True
                # ... but recompute, if the grid definition file is newer than the sink file
                if hasattr(griddef, 'filepath') and griddef.filepath is not None:
                    gridage = datetime.fromtimestamp(os.path.getmtime(griddef.filepath))
                    if age < gridage:
                        lskip = False

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        # N.B.: lskip can only be True if lwrite is True, so filepath is always defined here
        skipmsg = "\n{:s}   >>>   Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(
            pidstr, filename, dataset_name)
        skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:

        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specifed period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(
                periodstr, source.atts.period))

        # print message
        if mode == 'climatology':
            opmsgstr = 'Regridding Climatology ({:s}) to {:s} Grid'.format(periodstr, griddef.name)
        elif mode == 'time-series':
            opmsgstr = 'Regridding Time-series to {:s} Grid'.format(griddef.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info(
            '\n{0:s}   ***   {1:^65s}   ***   \n{0:s}   ***   {2:^65s}   ***   \n'
            .format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug:
            logger.info('\n' + str(source) + '\n')

        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = periodstr
        atts['name'] = dataset_name
        atts['grid'] = griddef.name
        if mode == 'climatology':
            atts['title'] = '{:s} Climatology on {:s} Grid'.format(dataset_name, griddef.name)
        elif mode == 'time-series':
            atts['title'] = '{:s} Time-series on {:s} Grid'.format(dataset_name, griddef.name)

        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath):
                os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder,
                                 filelist=[tmpfilename],
                                 atts=atts,
                                 mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory

        # initialize processing
        CPU = CentralProcessingUnit(source,
                                    sink,
                                    varlist=varlist,
                                    tmp=False,
                                    feedback=ldebug)

        # perform regridding (if target grid is different from native grid!)
        if griddef.name != dataset:
            # reproject and resample (regrid) dataset
            CPU.Regrid(griddef=griddef, flush=True)

        # get results
        CPU.sync(flush=True)

        # add geolocators
        sink = addGeoLocator(sink,
                             griddef=griddef,
                             lgdal=True,
                             lreplace=True,
                             lcheck=True)
        # N.B.: WRF datasets come with their own geolocator arrays - we need to replace those!

        # add length and names of month
        if mode == 'climatology' and not sink.hasVariable(
                'length_of_month') and sink.hasVariable('time'):
            addLengthAndNamesOfMonth(sink, noleap=dataset.upper() in ('WRF', 'CESM'))

        # print dataset
        if not lparallel and ldebug:
            logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s}   >>>   Writing to file '{:s}' in dataset {:s}".format(
                pidstr, filename, dataset_name)
            writemsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)

            # rename file to proper name
            if not lreturn:
                sink.unload()
                sink.close()
                del sink  # destroy all references
                if os.path.exists(filepath):
                    os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)  # this would also overwrite the old file...
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

        # clean up and return
        source.unload()
        del source, CPU
        if lreturn:
            return sink  # return dataset for further use (netcdf file still open!)
        else:
            return 0  # "exit code"
Esempio n. 2
0
def performExtraction(dataset, mode, stnfct, dataargs, loverwrite=False, varlist=None, lwrite=True, lreturn=False,
                      ldebug=False, lparallel=False, pidstr='', logger=None):
  ''' worker function to extract point data from gridded dataset

      Arguments:
        dataset    -- dataset identifier (string); forwarded to getMetaData/getTargetFile
        mode       -- 'climatology' or 'time-series'
        stnfct     -- callable without arguments that loads and returns the station Dataset
                      (used as template for the extraction)
        dataargs   -- dict with dataset arguments; forwarded to getMetaData
        loverwrite -- if True, recompute even if the target file appears to be up-to-date
        varlist    -- forwarded to CentralProcessingUnit
        lwrite     -- write result to a NetCDF file in the dataset folder (via a temporary file,
                      unless the dataset is returned)
        lreturn    -- return the sink dataset instead of an exit code
        ldebug     -- prefix the target file with 'test_' and log source and sink datasets
        lparallel  -- parallel mode; requires lwrite=True and lreturn=False
        pidstr     -- prefix for log messages; pidstr[1:-1] becomes part of the temporary
                      file name in parallel mode
        logger     -- None (use root logger), a logger name, or a logging.Logger instance

      Returns:
        the sink dataset if lreturn is True, otherwise 0 ("exit code"); if the computation is
        skipped, because the target file is up-to-date, nothing is returned (i.e. None)

      Raises:
        TypeError           -- invalid dataset, dataargs, stnfct or logger argument, or stnfct
                               does not return a Dataset
        IOError             -- invalid parallel mode options or missing dataset folder
        DateError           -- period of the source file does not match the requested period
        NotImplementedError -- unrecognized mode
  '''
  # input checking
  if not isinstance(dataset,basestring): raise TypeError
  if not isinstance(dataargs,dict): raise TypeError # all dataset arguments are kwargs
  if not callable(stnfct): raise TypeError # function to load station dataset
  if lparallel:
    if not lwrite: raise IOError('In parallel mode we can only write to disk (i.e. lwrite = True).')
    if lreturn: raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')

  # logging
  if logger is None:
    logger = logging.getLogger() # this is the (shared) root logger
    # N.B.: only attach a handler if there is none yet; otherwise every call would stack another
    #       StreamHandler onto the root logger and each message would be printed multiple times
    if not logger.handlers: logger.addHandler(logging.StreamHandler())
  elif isinstance(logger,basestring):
    logger = logging.getLogger(name=logger) # connect to existing one
  elif not isinstance(logger,logging.Logger):
    raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

  lclim = False; lts = False
  if mode == 'climatology': lclim = True
  elif mode == 'time-series': lts = True
  else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))

  ## extract meta data from arguments
  module, dataargs, loadfct, filepath, datamsgstr = getMetaData(dataset, mode, dataargs)
  dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; avgfolder = dataargs.avgfolder

  # load template dataset
  stndata = stnfct() # load station dataset from function
  if not isinstance(stndata, Dataset): raise TypeError
  # N.B.: the loading function is necessary, because DatasetNetCDF instances do not pickle well

  # determine age of source file (only needed to decide whether the computation can be skipped)
  # N.B.: at this point filepath still refers to the source file returned by getMetaData
  if not loverwrite: sourceage = datetime.fromtimestamp(os.path.getmtime(filepath))

  # get filename for target dataset and do some checks
  filename = getTargetFile(stndata.name, dataset, mode, module, dataargs, lwrite)
  if ldebug: filename = 'test_' + filename
  if not os.path.exists(avgfolder): raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
  lskip = False # else just go ahead
  if lwrite:
    if lreturn:
      tmpfilename = filename # no temporary file if dataset is passed on (can't rename the file while it is open!)
    else:
      if lparallel: tmppfx = 'tmp_exstns_{:s}_'.format(pidstr[1:-1])
      else: tmppfx = 'tmp_exstns_'
      tmpfilename = tmppfx + filename
    filepath = avgfolder + filename # from here on, filepath refers to the target file
    tmpfilepath = avgfolder + tmpfilename
    if os.path.exists(filepath) and not loverwrite:
      age = datetime.fromtimestamp(os.path.getmtime(filepath))
      # skip, if the sink file is newer than the source file and not just a stub
      # N.B.: NetCDF files smaller than 100kB are usually incomplete header fragments from a
      #       previous crashed run
      if age > sourceage and os.path.getsize(filepath) > 1e5: lskip = True
    # N.B.: an existing target file is deliberately not removed here: it is replaced further
    #       down, once the new data has been written, so that a failed run does not destroy
    #       a previously valid file

  # depending on last modification time of file or overwrite setting, start computation, or skip
  if lskip:
    # print message
    skipmsg =  "\n{:s}   >>>   Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr,filename,dataset_name)
    skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,filepath)
    logger.info(skipmsg)
  else:

    ## actually load datasets
    source = loadfct() # load source
    # check period
    if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute
      raise DateError("Specifed period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr,source.atts.period))

    # print message
    if lclim: opmsgstr = "Extracting '{:s}'-type Point Data from Climatology ({:s})".format(stndata.name, periodstr)
    elif lts: opmsgstr = "Extracting '{:s}'-type Point Data from Time-series".format(stndata.name)
    else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
    # print feedback to logger
    logger.info('\n{0:s}   ***   {1:^65s}   ***   \n{0:s}   ***   {2:^65s}   ***   \n'.format(pidstr,datamsgstr,opmsgstr))
    if not lparallel and ldebug: logger.info('\n'+str(source)+'\n')

    ## create new sink/target file
    # set attributes
    atts=source.atts.copy()
    atts['period'] = dataargs.periodstr if dataargs.periodstr else 'time-series'
    atts['name'] = dataset_name; atts['station'] = stndata.name
    atts['title'] = '{:s} (Stations) from {:s} {:s}'.format(stndata.title,dataset_name,mode.title())
    # make new dataset
    if lwrite: # write to NetCDF file
      if os.path.exists(tmpfilepath): os.remove(tmpfilepath) # remove old temp files
      sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
    else: sink = Dataset(atts=atts) # only create dataset in memory

    # initialize processing
    CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)

    # extract data at station locations
    CPU.Extract(template=stndata, flush=True)
    # get results
    CPU.sync(flush=True)

    # print dataset
    if not lparallel and ldebug:
      logger.info('\n'+str(sink)+'\n')
    # write results to file
    if lwrite:
      sink.sync()
      writemsg =  "\n{:s}   >>>   Writing to file '{:s}' in dataset {:s}".format(pidstr,filename,dataset_name)
      writemsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,filepath)
      logger.info(writemsg)

      # rename file to proper name
      if not lreturn:
        sink.unload(); sink.close(); del sink # destroy all references
        if os.path.exists(filepath): os.remove(filepath) # remove old file
        os.rename(tmpfilepath,filepath)
      # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

    # clean up and return
    source.unload(); del source#, CPU
    if lreturn:
      return sink # return dataset for further use (netcdf file still open!)
    else:
      return 0 # "exit code"
Esempio n. 3
0
def performExport(dataset, mode, dataargs, expargs, bcargs, loverwrite=False,
                  ldebug=False, lparallel=False, pidstr='', logger=None):
    ''' worker function to export ASCII rasters for a given dataset

        Arguments:
          dataset    -- dataset identifier (string); forwarded to getMetaData and the file format
          mode       -- 'climatology', 'time-series', or '<season>-mean' (checked only once the
                        source is loaded)
          dataargs   -- dict with dataset arguments; forwarded to getMetaData
          expargs    -- dict with export options; 'lm3', 'format' and 'exp_list' are required,
                        'compute_list' is optional, and all remaining items are forwarded to
                        getFileFormat (the dict is copied, not modified)
          bcargs     -- dict with bias-correction options or an empty value to disable bias
                        correction; 'method' and 'obs_dataset' are required, 'reference', 'grid',
                        'domain', 'varlist', 'varmap', 'tag', 'file_pattern' and 'lgzip' are
                        optional (the dict is copied, not modified)
          loverwrite -- forwarded to the file format's prepareDestination
          ldebug     -- forwarded to the file format; also log source and sink datasets
          lparallel  -- parallel mode; suppresses the debug printing of datasets
          pidstr     -- prefix for log messages
          logger     -- None (use root logger), a logger name, or a logging.Logger instance

        Returns:
          0 ("exit code") after a successful export; if the export is skipped (as decided by
          prepareDestination), nothing is returned (i.e. None)

        Raises:
          TypeError           -- invalid dataset, dataargs or logger argument
          ArgumentError       -- bias-correction requested without method or obs. dataset
          IOError             -- bias-correction pickle file not found
          KeyError            -- a required export option is missing in expargs
          DateError           -- period of the source file does not match the requested period
          NotImplementedError -- unrecognized mode
          VariableError       -- a single variable and a variable list were obtained at once
          GDALError           -- variable is not GDAL-enabled when exporting to ASCII_raster
    '''
    # input checking
    if not isinstance(dataset,basestring): raise TypeError
    if not isinstance(dataargs,dict): raise TypeError # all dataset arguments are kwargs

    # logging
    if logger is None:
        logger = logging.getLogger() # this is the (shared) root logger
        # N.B.: only attach a handler if there is none yet; otherwise every call would stack another
        #       StreamHandler onto the root logger and each message would be printed multiple times
        if not logger.handlers: logger.addHandler(logging.StreamHandler())
    elif isinstance(logger,basestring):
        logger = logging.getLogger(name=logger) # connect to existing one
    elif not isinstance(logger,logging.Logger):
        raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs, lone=False)
    dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; domain = dataargs.domain

    # figure out bias correction parameters
    if bcargs:
        bcargs = bcargs.copy() # first copy, then modify...
        bc_method = bcargs.pop('method',None)
        if bc_method is None: raise ArgumentError("Need to specify bias-correction method to use bias correction!")
        bc_obs = bcargs.pop('obs_dataset',None)
        if bc_obs is None: raise ArgumentError("Need to specify observational dataset to use bias correction!")
        bc_reference = bcargs.pop('reference',None)
        if bc_reference is None: # infer from experiment name
            if dataset_name[-5:] in ('-2050','-2100'): bc_reference = dataset_name[:-5] # cut of period indicator and hope for the best
            else: bc_reference = dataset_name
        bc_grid = bcargs.pop('grid',None)
        if bc_grid is None: bc_grid = dataargs.grid
        bc_domain = bcargs.pop('domain',None)
        if bc_domain is None: bc_domain = domain
        bc_varlist = bcargs.pop('varlist',None)
        bc_varmap = bcargs.pop('varmap',None)
        bc_tag = bcargs.pop('tag',None) # an optional name extension/tag
        bc_pattern = bcargs.pop('file_pattern',None) # usually default in getPickleFile
        lgzip = bcargs.pop('lgzip',None) # if pickle is gzipped (None: auto-detect based on file name extension)
        # get name of pickle file (and folder)
        picklefolder = dataargs.avgfolder.replace(dataset_name,bc_reference)
        picklefile = getPickleFileName(method=bc_method, obs_name=bc_obs, gridstr=bc_grid, domain=bc_domain,
                                       tag=bc_tag, pattern=bc_pattern)
        picklepath = '{:s}/{:s}'.format(picklefolder,picklefile)
        if lgzip:
            picklepath += '.gz' # add extension
            if not os.path.exists(picklepath): raise IOError(picklepath)
        elif lgzip is None:
            # auto-detect: first try the plain file, then fall back to the gzipped one
            lgzip = False
            if not os.path.exists(picklepath):
                lgzip = True # assume gzipped file
                picklepath += '.gz' # try with extension...
                if not os.path.exists(picklepath): raise IOError(picklepath)
        elif not os.path.exists(picklepath): raise IOError(picklepath)
        # determine age of pickle file (compared against source age below)
        pickleage = datetime.fromtimestamp(os.path.getmtime(picklepath))
    else:
        bc_method = False
        pickleage = srcage

    # parse export options
    expargs = expargs.copy() # first copy, then modify...
    lm3 = expargs.pop('lm3') # convert kg/m^2/s to m^3/m^2/s (water flux)
    expformat = expargs.pop('format') # needed to get FileFormat object
    exp_list= expargs.pop('exp_list') # this handled outside of export
    compute_list = expargs.pop('compute_list', []) # variables to be (re-)computed - by default all
    # initialize FileFormat class instance
    fileFormat = getFileFormat(expformat, bc_method=bc_method, **expargs)
    # get folder for target dataset and do some checks
    expname = '{:s}_d{:02d}'.format(dataset_name,domain) if domain else dataset_name
    expfolder = fileFormat.defineDataset(dataset=dataset, mode=mode, dataargs=dataargs, lwrite=True, ldebug=ldebug)

    # prepare destination for new dataset
    lskip = fileFormat.prepareDestination(srcage=max(srcage,pickleage), loverwrite=loverwrite)

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        skipmsg =  "\n{:s}   >>>   Skipping: Format '{:s}' for dataset '{:s}' already exists and is newer than source file.".format(pidstr,expformat,dataset_name)
        skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,expfolder)
        logger.info(skipmsg)
    else:

        ## actually load datasets
        source = loadfct() # load source data
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute
            raise DateError("Specifed period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr,source.atts.period))

        # load BiasCorrection object from pickle
        if bc_method:
            op = gzip.open if lgzip else open
            # N.B.: pickles are binary data, hence 'rb'
            # NOTE(review): unpickling executes code embedded in the file - only use pickle
            #               files from trusted sources
            with op(picklepath, 'rb') as filehandle:
                BC = pickle.load(filehandle)
            # assemble logger entry
            bcmsgstr = "(performing bias-correction using {:s} from {:s} towards {:s})".format(BC.long_name,bc_reference,bc_obs)

        # print message
        if mode == 'climatology': opmsgstr = 'Exporting Climatology ({:s}) to {:s} Format'.format(periodstr, expformat)
        elif mode == 'time-series': opmsgstr = 'Exporting Time-series to {:s} Format'.format(expformat)
        elif mode[-5:] == '-mean': opmsgstr = 'Exporting {:s}-Mean ({:s}) to {:s} Format'.format(mode[:-5], periodstr, expformat)
        else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logmsg = '\n{0:s}   ***   {1:^65s}   ***   \n{0:s}   ***   {2:^65s}   ***   \n'.format(pidstr,datamsgstr,opmsgstr)
        if bc_method:
            logmsg += "{0:s}   ***   {1:^65s}   ***   \n".format(pidstr,bcmsgstr)
        logger.info(logmsg)
        if not lparallel and ldebug: logger.info('\n'+str(source)+'\n')

        # create GDAL-enabled target dataset
        sink = Dataset(axes=(source.xlon,source.ylat), name=expname, title=source.title, atts=source.atts.copy())
        addGDALtoDataset(dataset=sink, griddef=source.griddef)
        assert sink.gdal, sink

        # apply bias-correction
        if bc_method:
            source = BC.correct(source, asNC=False, varlist=bc_varlist, varmap=bc_varmap) # load bias-corrected variables into memory

        # N.B.: for variables that are not bias-corrected, data are not loaded immediately but on demand; this way
        #       I/O and computing can be further disentangled and not all variables are always needed

        # compute intermediate variables, if necessary
        for varname in exp_list:
            variables = None # variable list
            var = None
            # (re-)compute variable, if desired...
            if varname in compute_list:
                if varname == 'precip': var = newvars.computeTotalPrecip(source)
                elif varname == 'waterflx': var = newvars.computeWaterFlux(source)
                elif varname == 'liqwatflx': var = newvars.computeLiquidWaterFlux(source)
                elif varname == 'netrad': var = newvars.computeNetRadiation(source, asVar=True)
                elif varname == 'netrad_bb': var = newvars.computeNetRadiation(source, asVar=True, lrad=False, name='netrad_bb')
                elif varname == 'netrad_bb0': var = newvars.computeNetRadiation(source, asVar=True, lrad=False, lA=False, name='netrad_bb0')
                elif varname == 'vapdef': var = newvars.computeVaporDeficit(source)
                elif varname in ('pet','pet_pm','petrad','petwnd') and 'pet' not in sink:
                    if 'petrad' in exp_list or 'petwnd' in exp_list:
                        variables = newvars.computePotEvapPM(source, lterms=True) # default; returns multiple PET terms
                    else: var = newvars.computePotEvapPM(source, lterms=False) # returns only PET
                elif varname == 'pet_th': var = None # skip for now
            # ... otherwise load from source file
            if var is None and variables is None and varname in source:
                var = source[varname].load() # load data (may not have to load all)
            # for now, skip variables that are None
            if var or variables:
                # handle lists as well
                if var and variables: raise VariableError(var,variables)
                elif var: variables = (var,)
                for var in variables:
                    addGDALtoVar(var=var, griddef=sink.griddef)
                    if not var.gdal and isinstance(fileFormat,ASCII_raster):
                        raise GDALError("Exporting to ASCII_raster format requires GDAL-enabled variables.")
                    # add to new dataset
                    sink += var
        # convert units
        if lm3:
            for var in sink:
                if var.units == 'kg/m^2/s':
                    var /= 1000. # divide to get m^3/m^2/s
                    var.units = 'm^3/m^2/s' # update units

        # compute seasonal mean if we are in mean-mode
        if mode[-5:] == '-mean':
            sink = sink.seasonalMean(season=mode[:-5], lclim=True)
            # N.B.: to remain consistent with other output modes,
            #       we need to prevent renaming of the time axis
            sink = concatDatasets([sink,sink], axis='time', lensembleAxis=True)
            sink.squeeze() # we need the year-axis until now to distinguish constant fields; now remove

        # print dataset
        if not lparallel and ldebug:
            logger.info('\n'+str(sink)+'\n')

        # export new dataset to selected format
        fileFormat.exportDataset(sink)

        # write results to file
        writemsg =  "\n{:s}   >>>   Export of Dataset '{:s}' to Format '{:s}' complete.".format(pidstr,expname, expformat)
        writemsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,expfolder)
        logger.info(writemsg)

        # clean up and return
        source.unload()
        return 0 # "exit code"
Esempio n. 4
0
def performExtraction(dataset,
                      mode,
                      stnfct,
                      dataargs,
                      loverwrite=False,
                      varlist=None,
                      lwrite=True,
                      lreturn=False,
                      ldebug=False,
                      lparallel=False,
                      pidstr='',
                      logger=None):
    ''' worker function to extract point data from gridded dataset

        Arguments:
          dataset    -- dataset identifier (string); forwarded to getMetaData/getTargetFile
          mode       -- 'climatology' or 'time-series'
          stnfct     -- callable without arguments that loads and returns the station Dataset
                        (used as template for the extraction)
          dataargs   -- dict with dataset arguments; forwarded to getMetaData
          loverwrite -- if True, recompute even if the target file appears to be up-to-date
          varlist    -- forwarded to CentralProcessingUnit
          lwrite     -- write result to a NetCDF file in the dataset folder (via a temporary file,
                        unless the dataset is returned)
          lreturn    -- return the sink dataset instead of an exit code
          ldebug     -- prefix the target file with 'test_' and log source and sink datasets
          lparallel  -- parallel mode; requires lwrite=True and lreturn=False
          pidstr     -- prefix for log messages; pidstr[1:-1] becomes part of the temporary
                        file name in parallel mode
          logger     -- None (use root logger), a logger name, or a logging.Logger instance

        Returns:
          the sink dataset if lreturn is True, otherwise 0 ("exit code"); if the computation is
          skipped, because the target file is up-to-date, nothing is returned (i.e. None)

        Raises:
          TypeError           -- invalid dataset, dataargs, stnfct or logger argument, or stnfct
                                 does not return a Dataset
          IOError             -- invalid parallel mode options or missing dataset folder
          DateError           -- period of the source file does not match the requested period
          NotImplementedError -- unrecognized mode
    '''
    # input checking
    if not isinstance(dataset, basestring):
        raise TypeError
    if not isinstance(dataargs, dict):
        raise TypeError  # all dataset arguments are kwargs
    if not callable(stnfct):
        raise TypeError  # function to load station dataset
    if lparallel:
        if not lwrite:
            raise IOError('In parallel mode we can only write to disk (i.e. lwrite = True).')
        if lreturn:
            raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')

    # logging
    if logger is None:
        logger = logging.getLogger()  # this is the (shared) root logger
        # N.B.: only attach a handler if there is none yet; otherwise every call would stack another
        #       StreamHandler onto the root logger and each message would be printed multiple times
        if not logger.handlers:
            logger.addHandler(logging.StreamHandler())
    elif isinstance(logger, basestring):
        logger = logging.getLogger(name=logger)  # connect to existing one
    elif not isinstance(logger, logging.Logger):
        raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

    lclim = False
    lts = False
    if mode == 'climatology':
        lclim = True
    elif mode == 'time-series':
        lts = True
    else:
        raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))

    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name
    periodstr = dataargs.periodstr
    avgfolder = dataargs.avgfolder

    # load template dataset
    stndata = stnfct()  # load station dataset from function
    if not isinstance(stndata, Dataset):
        raise TypeError
    # N.B.: the loading function is necessary, because DatasetNetCDF instances do not pickle well

    # get filename for target dataset and do some checks
    filename = getTargetFile(dataset=dataset,
                             mode=mode,
                             dataargs=dataargs,
                             lwrite=lwrite,
                             station=stndata.name)

    if ldebug:
        filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            # no temporary file if dataset is passed on (can't rename the file while it is open!)
            tmpfilename = filename
        else:
            if lparallel:
                tmppfx = 'tmp_exstns_{:s}_'.format(pidstr[1:-1])
            else:
                tmppfx = 'tmp_exstns_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath) and not loverwrite:
            age = datetime.fromtimestamp(os.path.getmtime(filepath))
            # skip, if the sink file is newer than the source file and not just a stub
            # N.B.: NetCDF files smaller than 100kB are usually incomplete header fragments from a
            #       previous crashed run
            if age > srcage and os.path.getsize(filepath) > 1e5:
                lskip = True

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        # N.B.: lskip can only be True if lwrite is True, so filepath is always defined here
        skipmsg = "\n{:s}   >>>   Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(
            pidstr, filename, dataset_name)
        skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:

        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specifed period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(
                periodstr, source.atts.period))

        # print message
        if lclim:
            opmsgstr = "Extracting '{:s}'-type Point Data from Climatology ({:s})".format(
                stndata.name, periodstr)
        elif lts:
            opmsgstr = "Extracting '{:s}'-type Point Data from Time-series".format(stndata.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info(
            '\n{0:s}   ***   {1:^65s}   ***   \n{0:s}   ***   {2:^65s}   ***   \n'
            .format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug:
            logger.info('\n' + str(source) + '\n')

        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = dataargs.periodstr if dataargs.periodstr else 'time-series'
        atts['name'] = dataset_name
        atts['station'] = stndata.name
        atts['title'] = '{:s} (Stations) from {:s} {:s}'.format(
            stndata.title, dataset_name, mode.title())
        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath):
                os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder,
                                 filelist=[tmpfilename],
                                 atts=atts,
                                 mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory

        # initialize processing
        CPU = CentralProcessingUnit(source,
                                    sink,
                                    varlist=varlist,
                                    tmp=False,
                                    feedback=ldebug)

        # extract data at station locations
        CPU.Extract(template=stndata, flush=True)
        # get results
        CPU.sync(flush=True)

        # print dataset
        if not lparallel and ldebug:
            logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s}   >>>   Writing to file '{:s}' in dataset {:s}".format(
                pidstr, filename, dataset_name)
            writemsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)

            # rename file to proper name
            if not lreturn:
                sink.unload()
                sink.close()
                del sink  # destroy all references
                if os.path.exists(filepath):
                    os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

        # clean up and return
        source.unload()
        del source  #, CPU
        if lreturn:
            return sink  # return dataset for further use (netcdf file still open!)
        else:
            return 0  # "exit code"
Esempio n. 5
0
def performRegridding(dataset, mode, griddef, dataargs, loverwrite=False, varlist=None, lwrite=True, 
                      lreturn=False, ldebug=False, lparallel=False, pidstr='', logger=None):
  ''' Worker function to regrid a single dataset onto a target grid.
  
      The source is loaded through the loader function returned by getMetaData, processed by a 
      CentralProcessingUnit into a new sink dataset and, if lwrite is set, written to a NetCDF file 
      in the dataset's average folder. Unless the dataset is returned, the file is written under a 
      temporary name and only renamed to its final name once it is complete.
      
      Arguments:
        dataset    -- name of the dataset (string); if it equals griddef.name, regridding is omitted
        mode       -- 'climatology' or 'time-series'
        griddef    -- GridDefinition instance describing the target grid
        dataargs   -- dict of dataset arguments, passed on to getMetaData
        loverwrite -- always remove and recompute an existing target file
        varlist    -- passed on to CentralProcessingUnit (presumably the variables to process)
        lwrite     -- write the sink to a NetCDF file; otherwise it is only created in memory
        lreturn    -- return the sink dataset instead of an exit code (NetCDF file remains open)
        ldebug     -- prefix the target file with 'test_'; in serial mode also log source and sink
        lparallel  -- running as parallel worker; requires lwrite=True and lreturn=False
        pidstr     -- process ID string used as prefix for log messages
        logger     -- logging.Logger instance, name of a logger, or None (use the root logger)
      
      Returns the sink dataset if lreturn is set, otherwise 0; if an up-to-date target file already
      exists and the computation is skipped, nothing (None) is returned.
      
      Raises TypeError for invalid argument types, IOError for invalid flag combinations or a 
      missing dataset folder, DateError if the period is inconsistent with the source file, and 
      NotImplementedError for unrecognized modes. 
  '''
  # input checking
  if not isinstance(dataset,basestring): 
    raise TypeError('Expected dataset name as string; got {}'.format(type(dataset)))
  if not isinstance(dataargs,dict): # all dataset arguments are kwargs
    raise TypeError('Expected dataset arguments as dict; got {}'.format(type(dataargs)))  
  if not isinstance(griddef,GridDefinition): 
    raise TypeError('Expected a GridDefinition instance; got {}'.format(type(griddef)))
  if lparallel: 
    if not lwrite: raise IOError('Can only write to disk in parallel mode (i.e. lwrite = True).')
    if lreturn: raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')
  
  # logging
  if logger is None: # fall back to root logger     
    logger = logging.getLogger()
    # N.B.: only attach a stream handler once; otherwise every call would add another handler to 
    #       the root logger and each message would be emitted multiple times
    if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
      logger.addHandler(logging.StreamHandler())
  elif isinstance(logger,basestring): 
    logger = logging.getLogger(name=logger) # connect to existing one
  elif not isinstance(logger,logging.Logger): 
    raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

  ## extract meta data from arguments
  dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
  dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; avgfolder = dataargs.avgfolder

  # get filename for target dataset and do some checks
  filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs, lwrite=lwrite, 
                           grid=griddef.name.lower(), period=None, filetype=None) 
    
  # prepare target dataset
  if ldebug: filename = 'test_' + filename
  if not os.path.exists(avgfolder): 
    raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
  lskip = False # else just go ahead
  if lwrite:
    if lreturn: 
      tmpfilename = filename # no temporary file if dataset is passed on (can't rename the file while it is open!)
    elif lparallel: 
      tmpfilename = 'tmp_regrid_{:s}_'.format(pidstr[1:-1]) + filename # process-specific temporary file
    else: 
      tmpfilename = 'tmp_regrid_' + filename      
    # N.B.: plain concatenation; this assumes avgfolder ends with a path separator - TODO confirm
    filepath = avgfolder + filename
    tmpfilepath = avgfolder + tmpfilename
    if os.path.exists(filepath): 
      if not loverwrite: 
        age = datetime.fromtimestamp(os.path.getmtime(filepath))
        # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
        if age > srcage and os.path.getsize(filepath) > 1e6: 
          lskip = True
          # also recompute, if the grid definition file is newer than the sink file
          if hasattr(griddef, 'filepath') and griddef.filepath is not None:
            gridage = datetime.fromtimestamp(os.path.getmtime(griddef.filepath))
            if age < gridage: lskip = False
        # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
      if not lskip: os.remove(filepath) # recompute
  
  # depending on last modification time of file or overwrite setting, start computation, or skip
  if lskip:        
    # print message (no return value in this case, i.e. None)
    skipmsg =  "\n{:s}   >>>   Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr,filename,dataset_name)
    skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,filepath)
    logger.info(skipmsg)              
  else:
          
    ## actually load datasets
    source = loadfct() # load source 
    # check period
    if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute
      raise DateError("Specifed period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr,source.atts.period))

    # print message
    if mode == 'climatology': opmsgstr = 'Regridding Climatology ({:s}) to {:s} Grid'.format(periodstr, griddef.name)
    elif mode == 'time-series': opmsgstr = 'Regridding Time-series to {:s} Grid'.format(griddef.name)
    else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))        
    # print feedback to logger
    logger.info('\n{0:s}   ***   {1:^65s}   ***   \n{0:s}   ***   {2:^65s}   ***   \n'.format(pidstr,datamsgstr,opmsgstr))
    if not lparallel and ldebug: logger.info('\n'+str(source)+'\n')
    
    ## create new sink/target file
    # set attributes (based on a copy of the source attributes)   
    atts = source.atts.copy()
    atts['period'] = periodstr; atts['name'] = dataset_name; atts['grid'] = griddef.name
    if mode == 'climatology': atts['title'] = '{:s} Climatology on {:s} Grid'.format(dataset_name, griddef.name)
    elif mode == 'time-series':  atts['title'] = '{:s} Time-series on {:s} Grid'.format(dataset_name, griddef.name)
      
    # make new dataset
    if lwrite: # write to NetCDF file 
      if os.path.exists(tmpfilepath): os.remove(tmpfilepath) # remove old temp files 
      sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
    else: sink = Dataset(atts=atts) # only create dataset in memory
    
    # initialize processing
    CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)
  
    # perform regridding (if target grid is different from native grid!)
    if griddef.name != dataset:
      # reproject and resample (regrid) dataset
      CPU.Regrid(griddef=griddef, flush=True)

    # get results    
    CPU.sync(flush=True)
    
    # add geolocators
    sink = addGeoLocator(sink, griddef=griddef, lgdal=True, lreplace=True, lcheck=True)
    # N.B.: WRF datasets come with their own geolocator arrays - we need to replace those!
    
    # add length and names of month
    if mode == 'climatology' and not sink.hasVariable('length_of_month') and sink.hasVariable('time'): 
      addLengthAndNamesOfMonth(sink, noleap=dataset.upper() in ('WRF','CESM')) 
    
    # print dataset
    if not lparallel and ldebug:
      logger.info('\n'+str(sink)+'\n')   
    # write results to file
    if lwrite:
      sink.sync()
      writemsg =  "\n{:s}   >>>   Writing to file '{:s}' in dataset {:s}".format(pidstr,filename,dataset_name)
      writemsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,filepath)
      logger.info(writemsg)      
      
      # rename file to proper name
      if not lreturn:
        sink.unload(); sink.close(); del sink # destroy all references 
        if os.path.exists(filepath): os.remove(filepath) # remove old file
        os.rename(tmpfilepath,filepath) # this would also overwrite the old file...
      # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed
        
    # clean up and return
    source.unload(); del source, CPU
    if lreturn:      
      return sink # return dataset for further use (netcdf file still open!)
    else:            
      return 0 # "exit code"
Esempio n. 6
0
def generateBiasCorrection(dataset,
                           mode,
                           dataargs,
                           obs_dataset,
                           bc_method,
                           bc_args,
                           loverwrite=False,
                           lgzip=None,
                           tag=None,
                           ldebug=False,
                           lparallel=False,
                           pidstr='',
                           logger=None):
    ''' Worker function to generate and pickle a bias-correction object for a given dataset.

        A bias-correction instance is created through getBCmethods, trained on the source
        dataset (loaded via the loader function returned by getMetaData) against obs_dataset,
        and saved as a pickle file in the dataset's average folder.

        Arguments:
          dataset     -- name of the dataset (string)
          mode        -- 'climatology', 'time-series', or a string ending in '-mean'
          dataargs    -- dict of dataset arguments, passed on to getMetaData
          obs_dataset -- observational dataset to train against; its name is used for the
                         pickle file name and its filepath attribute (if any) for the age check
          bc_method   -- bias-correction method, passed on to getBCmethods
          bc_args     -- dict of keyword arguments for getBCmethods (copied, not modified)
          loverwrite  -- always recompute and replace an existing pickle file
          lgzip       -- write a gzipped pickle with '.gz' extension; None is treated like False
          tag         -- passed on to BC.picklefile
          ldebug      -- prefix the pickle file with 'test_'; in serial mode also log the
                         dataset and print validation statistics
          lparallel   -- running as parallel worker (suppresses the debug output)
          pidstr      -- process ID string used as prefix for log messages
          logger      -- logging.Logger instance, name of a logger, or None (use root logger)

        Returns 0 after a pickle was written; if an up-to-date pickle already exists and the
        computation is skipped, nothing (None) is returned.

        Raises TypeError for invalid argument types, IOError if the dataset folder is missing
        or the pickle was not saved, DateError if the period is inconsistent with the source
        file, and NotImplementedError for unrecognized modes.
    '''
    # input checking
    if not isinstance(dataset, basestring):
        raise TypeError('Expected dataset name as string; got {}'.format(
            type(dataset)))
    if not isinstance(dataargs, dict):  # all dataset arguments are kwargs
        raise TypeError('Expected dataset arguments as dict; got {}'.format(
            type(dataargs)))

    # logging
    if logger is None:  # fall back to root logger
        logger = logging.getLogger()
        # N.B.: only attach a stream handler once; otherwise every call would add another
        #       handler to the root logger and each message would be emitted multiple times
        if not any(
                type(handler) is logging.StreamHandler
                for handler in logger.handlers):
            logger.addHandler(logging.StreamHandler())
    elif isinstance(logger, basestring):
        logger = logging.getLogger(name=logger)  # connect to existing one
    elif not isinstance(logger, logging.Logger):
        raise TypeError(
            'Expected logger ID/handle in logger KW; got {}'.format(
                str(logger)))

    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset,
                                                        mode,
                                                        dataargs,
                                                        lone=False)
    dataset_name = dataargs.dataset_name
    periodstr = dataargs.periodstr
    avgfolder = dataargs.avgfolder

    # parse export options
    bc_args = bc_args.copy()  # first copy, then modify...
    # initialize BiasCorrection class instance
    BC = getBCmethods(bc_method, **bc_args)
    # get folder for target dataset and do some checks
    picklefile = BC.picklefile(obs_name=obs_dataset.name,
                               gridstr=dataargs.grid,
                               domain=dataargs.domain,
                               tag=tag)
    if ldebug: picklefile = 'test_' + picklefile
    basepath = '{:s}/{:s}'.format(avgfolder, picklefile)
    # N.B.: the gzipped pickle is saved with a '.gz' extension, so the extension has to be
    #       added before checking for an existing file; otherwise a gzipped pickle is never
    #       found and always recomputed
    picklepath = basepath + '.gz' if lgzip else basepath

    # check if we are overwriting an existing file
    if not os.path.exists(avgfolder):
        raise IOError(
            "Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if os.path.exists(picklepath) and not loverwrite:
        age = datetime.fromtimestamp(os.path.getmtime(picklepath))
        # if source file is newer than sink file, recompute, otherwise skip
        if age > srcage:
            lskip = True
            # also recompute, if the observational dataset is newer than the pickle
            if hasattr(obs_dataset,
                       'filepath') and obs_dataset.filepath is not None:
                obsage = datetime.fromtimestamp(
                    os.path.getmtime(obs_dataset.filepath))
                if age < obsage: lskip = False

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message (no return value in this case, i.e. None)
        skipmsg = "\n{:s}   >>>   Skipping: Bias-correction '{:s} for dataset '{:s}' already exists and is newer than source file.".format(
            pidstr, BC.long_name, dataset_name)
        skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr, picklepath)
        logger.info(skipmsg)
        del BC
    else:

        ## actually load datasets
        source = loadfct()  # load source data
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError(
                "Specifed period is inconsistent with netcdf records: '{:s}' != '{:s}'"
                .format(periodstr, source.atts.period))

        # print message
        if mode == 'climatology':
            opmsgstr = 'Bias-correcting Climatology ({:s}) using {:s}'.format(
                periodstr, BC.long_name)
        elif mode == 'time-series':
            opmsgstr = 'Bias-correcting Time-series using {:s}'.format(
                BC.long_name)
        elif mode[-5:] == '-mean':
            opmsgstr = 'Bias-correcting {:s}-Mean ({:s}) using {:s}'.format(
                mode[:-5], periodstr, BC.long_name)
        else:
            raise NotImplementedError(
                "Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info(
            '\n{0:s}   ***   {1:^65s}   ***   \n{0:s}   ***   {2:^65s}   ***   \n'
            .format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

        # N.B.: data are not loaded immediately but on demand; this way I/O and computing are further
        #       disentangled and not all variables are always needed

        # "train", i.e. optimize fit parameters
        BC.train(source, obs_dataset)

        # print bias-correction
        if not lparallel and ldebug:
            logger.info('\n' + str(BC) + '\n')
            print("Bias-correction Statistics:")
            BC.validate(source, obs_dataset, lprint=True)
            print('')

        ## pickle bias-correction object with trained parameters
        # remove old pickles; an uncompressed pickle of the same name is removed as well, so
        # that no stale copy is left behind next to a new gzipped pickle
        for oldpath in (basepath, picklepath):
            if os.path.exists(oldpath): os.remove(oldpath)
        # open file and save pickle
        op = gzip.open if lgzip else open
        with op(picklepath, 'wb') as filehandle:
            pickle.dump(BC, filehandle,
                        protocol=-1)  # should be new binary protocol
        if not os.path.exists(picklepath):
            raise IOError(
                "Error while saving Pickle to '{0:s}'".format(picklepath))

        # write results to file
        writemsg = "\n{:s}   >>>   Generation of BiasCorrection '{:s}' for Dataset '{:s}' complete.".format(
            pidstr,
            bc_method,
            dataset_name,
        )
        writemsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr, picklepath)
        logger.info(writemsg)

        # clean up and return
        source.unload()
        del source, BC
        return 0  # "exit code"
Esempio n. 7
0
def performExport(dataset, mode, dataargs, expargs, loverwrite=False, 
                  ldebug=False, lparallel=False, pidstr='', logger=None):
  ''' Worker function to export a given dataset into another file format.
  
      The source is loaded through the loader function returned by getMetaData; the requested 
      variables are either taken from the source or computed with functions from newvars, 
      GDAL-enabled, collected in a new sink dataset, and handed to the FileFormat object obtained 
      from getFileFormat for export.
      
      Arguments:
        dataset    -- name of the dataset (string)
        mode       -- 'climatology' or 'time-series'
        dataargs   -- dict of dataset arguments, passed on to getMetaData
        expargs    -- dict of export options (copied, not modified); has to contain 'lm3' (convert 
                      kg/m^2/s to m^3/m^2/s), 'format' and 'varlist' (variables to export); the 
                      remaining items are passed on to getFileFormat
        loverwrite -- passed on to fileFormat.prepareDestination
        ldebug     -- passed on to fileFormat.defineDataset; in serial mode also log source and sink
        lparallel  -- running as parallel worker (suppresses the debug output)
        pidstr     -- process ID string used as prefix for log messages
        logger     -- logging.Logger instance, name of a logger, or None (use the root logger)
      
      Returns 0 after the export; if prepareDestination indicates that the export can be skipped, 
      nothing (None) is returned.
      
      Raises TypeError for invalid argument types, KeyError if a required export option is missing, 
      DateError if the period is inconsistent with the source file, NotImplementedError for 
      unrecognized modes, VariableError for unsupported variables, and GDALError if a variable 
      can not be GDAL-enabled for ASCII raster export. 
  '''
  # input checking
  if not isinstance(dataset,basestring): 
    raise TypeError('Expected dataset name as string; got {}'.format(type(dataset)))
  if not isinstance(dataargs,dict): # all dataset arguments are kwargs
    raise TypeError('Expected dataset arguments as dict; got {}'.format(type(dataargs)))  
  
  # logging
  if logger is None: # fall back to root logger     
    logger = logging.getLogger()
    # N.B.: only attach a stream handler once; otherwise every call would add another handler to 
    #       the root logger and each message would be emitted multiple times
    if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
      logger.addHandler(logging.StreamHandler())
  elif isinstance(logger,basestring): 
    logger = logging.getLogger(name=logger) # connect to existing one
  elif not isinstance(logger,logging.Logger): 
    raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

  ## extract meta data from arguments
  dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs, lone=False)
  dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; domain = dataargs.domain
  
  # parse export options
  expargs = expargs.copy() # first copy, then modify...
  lm3 = expargs.pop('lm3') # convert kg/m^2/s to m^3/m^2/s (water flux)
  expformat = expargs.pop('format') # needed to get FileFormat object
  varlist = expargs.pop('varlist') # this handled outside of export
  # initialize FileFormat class instance
  fileFormat = getFileFormat(expformat, **expargs)
  # get folder for target dataset and do some checks
  expname = '{:s}_d{:02d}'.format(dataset_name,domain) if domain else dataset_name
  expfolder = fileFormat.defineDataset(name=dataset_name, dataset=dataset, mode=mode, dataargs=dataargs, lwrite=True, ldebug=ldebug)

  # prepare destination for new dataset
  lskip = fileFormat.prepareDestination(srcage=srcage, loverwrite=loverwrite)
  
  # depending on last modification time of file or overwrite setting, start computation, or skip
  if lskip:        
    # print message (no return value in this case, i.e. None)
    skipmsg =  "\n{:s}   >>>   Skipping: Format '{:s} for dataset '{:s}' already exists and is newer than source file.".format(pidstr,expformat,dataset_name)
    skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,expfolder)
    logger.info(skipmsg)              
  else:
          
    ## actually load datasets
    source = loadfct() # load source data
    # check period
    if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute
      raise DateError("Specifed period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr,source.atts.period))

    # print message
    if mode == 'climatology': opmsgstr = 'Exporting Climatology ({:s}) to {:s} Format'.format(periodstr, expformat)
    elif mode == 'time-series': opmsgstr = 'Exporting Time-series to {:s} Format'.format(expformat)
    else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))        
    # print feedback to logger
    logger.info('\n{0:s}   ***   {1:^65s}   ***   \n{0:s}   ***   {2:^65s}   ***   \n'.format(pidstr,datamsgstr,opmsgstr))
    if not lparallel and ldebug: logger.info('\n'+str(source)+'\n')
    
    # create GDAL-enabled target dataset
    sink = Dataset(axes=(source.xlon,source.ylat), name=expname, title=source.title)
    addGDALtoDataset(dataset=sink, griddef=source.griddef)
    assert sink.gdal, sink
    
    # N.B.: data are not loaded immediately but on demand; this way I/O and computing are further
    #       disentangled and not all variables are always needed
    
    # Compute intermediate variables, if necessary
    for varname in varlist:
      variables = None # set instead of var, if a computation returns multiple variables
      if varname in source:
        var = source[varname].load() # load data (may not have to load all)
      else:
        var = None
        if varname == 'waterflx': var = newvars.computeWaterFlux(source)
        elif varname == 'liqwatflx': var = newvars.computeLiquidWaterFlux(source)
        elif varname == 'netrad': var = newvars.computeNetRadiation(source, asVar=True)
        elif varname == 'netrad_0': var = newvars.computeNetRadiation(source, asVar=True, lA=False, name='netrad_0')
        elif varname == 'netrad_bb': var = newvars.computeNetRadiation(source, asVar=True, lrad=False, name='netrad_bb')
        elif varname == 'vapdef': var = newvars.computeVaporDeficit(source)
        elif varname == 'pet' or varname == 'pet_pm':
          variables = newvars.computePotEvapPM(source, lterms=True) # default; returns multiple PET terms
        elif varname == 'pet_th': var = None # skip for now
        else: raise VariableError("Unsupported Variable '{:s}'.".format(varname))
      # for now, skip variables that are None
      if var or variables:
        # handle lists as well
        if var and variables: raise VariableError(var,variables)
        if var: variables = (var,)
        for newvar in variables:
          addGDALtoVar(var=newvar, griddef=sink.griddef)
          if not newvar.gdal and isinstance(fileFormat,ASCII_raster):
            raise GDALError("Exporting to ASCII_raster format requires GDAL-enabled variables.")
          # add to new dataset
          sink += newvar
    # convert units
    if lm3:
      for var in sink:
        if var.units == 'kg/m^2/s':
          var /= 1000. # divide to get m^3/m^2/s
          var.units = 'm^3/m^2/s' # update units
    
    # print dataset
    if not lparallel and ldebug:
      logger.info('\n'+str(sink)+'\n')
      
    # export new dataset to selected format
    fileFormat.exportDataset(sink)
      
    # write results to file
    writemsg =  "\n{:s}   >>>   Export of Dataset '{:s}' to Format '{:s}' complete.".format(pidstr,expname, expformat)
    writemsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,expfolder)
    logger.info(writemsg)      
       
    # clean up and return
    source.unload()
    return 0 # "exit code"
Esempio n. 8
0
def generateBiasCorrection(dataset, mode, dataargs, obs_dataset, bc_method, bc_args, loverwrite=False, lgzip=None, tag=None, 
                           ldebug=False, lparallel=False, pidstr='', logger=None):
  ''' Worker function to generate and pickle a bias-correction object for a given dataset.
  
      A bias-correction instance is created through getBCmethods, trained on the source dataset 
      (loaded via the loader function returned by getMetaData) against obs_dataset, and saved as 
      a pickle file in the dataset's average folder.
      
      Arguments:
        dataset     -- name of the dataset (string)
        mode        -- 'climatology', 'time-series', or a string ending in '-mean'
        dataargs    -- dict of dataset arguments, passed on to getMetaData
        obs_dataset -- observational dataset to train against; its name is used for the pickle 
                       file name and its filepath attribute (if any) for the age check
        bc_method   -- bias-correction method, passed on to getBCmethods
        bc_args     -- dict of keyword arguments for getBCmethods (copied, not modified)
        loverwrite  -- always recompute and replace an existing pickle file
        lgzip       -- write a gzipped pickle with '.gz' extension; None is treated like False
        tag         -- passed on to BC.picklefile
        ldebug      -- prefix the pickle file with 'test_'; in serial mode also log the dataset 
                       and print validation statistics
        lparallel   -- running as parallel worker (suppresses the debug output)
        pidstr      -- process ID string used as prefix for log messages
        logger      -- logging.Logger instance, name of a logger, or None (use the root logger)
      
      Returns 0 after a pickle was written; if an up-to-date pickle already exists and the 
      computation is skipped, nothing (None) is returned.
      
      Raises TypeError for invalid argument types, IOError if the dataset folder is missing or the 
      pickle was not saved, DateError if the period is inconsistent with the source file, and 
      NotImplementedError for unrecognized modes. 
  '''
  # input checking
  if not isinstance(dataset,basestring): 
    raise TypeError('Expected dataset name as string; got {}'.format(type(dataset)))
  if not isinstance(dataargs,dict): # all dataset arguments are kwargs
    raise TypeError('Expected dataset arguments as dict; got {}'.format(type(dataargs)))  
  
  # logging
  if logger is None: # fall back to root logger     
    logger = logging.getLogger()
    # N.B.: only attach a stream handler once; otherwise every call would add another handler to 
    #       the root logger and each message would be emitted multiple times
    if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
      logger.addHandler(logging.StreamHandler())
  elif isinstance(logger,basestring): 
    logger = logging.getLogger(name=logger) # connect to existing one
  elif not isinstance(logger,logging.Logger): 
    raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

  ## extract meta data from arguments
  dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs, lone=False)
  dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; avgfolder = dataargs.avgfolder
  
  # parse export options
  bc_args = bc_args.copy() # first copy, then modify...
  # initialize BiasCorrection class instance
  BC = getBCmethods(bc_method, **bc_args)
  # get folder for target dataset and do some checks
  picklefile = BC.picklefile(obs_name=obs_dataset.name, gridstr=dataargs.grid, domain=dataargs.domain, tag=tag)
  if ldebug: picklefile = 'test_' + picklefile 
  basepath = '{:s}/{:s}'.format(avgfolder,picklefile)
  # N.B.: the gzipped pickle is saved with a '.gz' extension, so the extension has to be added 
  #       before checking for an existing file; otherwise a gzipped pickle is never found and 
  #       always recomputed
  picklepath = basepath + '.gz' if lgzip else basepath
  
  # check if we are overwriting an existing file
  if not os.path.exists(avgfolder): 
    raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
  lskip = False # else just go ahead
  if os.path.exists(picklepath) and not loverwrite: 
    age = datetime.fromtimestamp(os.path.getmtime(picklepath))
    # if source file is newer than sink file, recompute, otherwise skip
    if age > srcage: 
      lskip = True
      # also recompute, if the observational dataset is newer than the pickle
      if hasattr(obs_dataset, 'filepath') and obs_dataset.filepath is not None:
        obsage = datetime.fromtimestamp(os.path.getmtime(obs_dataset.filepath))
        if age < obsage: lskip = False

  # depending on last modification time of file or overwrite setting, start computation, or skip
  if lskip:        
    # print message (no return value in this case, i.e. None)
    skipmsg =  "\n{:s}   >>>   Skipping: Bias-correction '{:s} for dataset '{:s}' already exists and is newer than source file.".format(pidstr,BC.long_name,dataset_name)
    skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,picklepath)
    logger.info(skipmsg) 
    del BC             
  else:
          
    ## actually load datasets
    source = loadfct() # load source data
    # check period
    if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute
      raise DateError("Specifed period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr,source.atts.period))

    # print message
    if mode == 'climatology': opmsgstr = 'Bias-correcting Climatology ({:s}) using {:s}'.format(periodstr, BC.long_name)
    elif mode == 'time-series': opmsgstr = 'Bias-correcting Time-series using {:s}'.format(BC.long_name)
    elif mode[-5:] == '-mean': opmsgstr = 'Bias-correcting {:s}-Mean ({:s}) using {:s}'.format(mode[:-5], periodstr, BC.long_name)
    else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))        
    # print feedback to logger
    logger.info('\n{0:s}   ***   {1:^65s}   ***   \n{0:s}   ***   {2:^65s}   ***   \n'.format(pidstr,datamsgstr,opmsgstr))
    if not lparallel and ldebug: logger.info('\n'+str(source)+'\n')
    
    # N.B.: data are not loaded immediately but on demand; this way I/O and computing are further
    #       disentangled and not all variables are always needed
    
    # "train", i.e. optimize fit parameters
    BC.train(source, obs_dataset)
    
    # print bias-correction
    if not lparallel and ldebug:
      logger.info('\n'+str(BC)+'\n')
      print("Bias-correction Statistics:")
      BC.validate(source, obs_dataset, lprint=True)    
      print('')  
      
    ## pickle bias-correction object with trained parameters
    # remove old pickles; an uncompressed pickle of the same name is removed as well, so that no 
    # stale copy is left behind next to a new gzipped pickle
    for oldpath in (basepath, picklepath):
      if os.path.exists(oldpath): os.remove(oldpath)
    # open file and save pickle
    op = gzip.open if lgzip else open
    with op(picklepath, 'wb') as filehandle:
      pickle.dump(BC, filehandle, protocol=-1) # should be new binary protocol
    if not os.path.exists(picklepath):
      raise IOError("Error while saving Pickle to '{0:s}'".format(picklepath))
      
    # write results to file
    writemsg =  "\n{:s}   >>>   Generation of BiasCorrection '{:s}' for Dataset '{:s}' complete.".format(pidstr,bc_method, dataset_name,)
    writemsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,picklepath)
    logger.info(writemsg)      
       
    # clean up and return
    source.unload(); del source, BC
    return 0 # "exit code"
Esempio n. 9
0
def performExport(dataset, mode, dataargs, expargs, bcargs, loverwrite=False, 
                  ldebug=False, lparallel=False, pidstr='', logger=None):
    ''' worker function to export ASCII rasters for a given dataset '''
    # input checking
    if not isinstance(dataset,basestring): raise TypeError
    if not isinstance(dataargs,dict): raise TypeError # all dataset arguments are kwargs 
    
    # logging
    if logger is None: # make new logger     
        logger = logging.getLogger() # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger,basestring): 
            logger = logging.getLogger(name=logger) # connect to existing one
        elif not isinstance(logger,logging.Logger): 
            raise TypeError, 'Expected logger ID/handle in logger KW; got {}'.format(str(logger))
  
    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs, lone=False)
    dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; domain = dataargs.domain
    
    # figure out bias correction parameters
    if bcargs:
        bcargs = bcargs.copy() # first copy, then modify...
        bc_method = bcargs.pop('method',None)
        if bc_method is None: raise ArgumentError("Need to specify bias-correction method to use bias correction!")
        bc_obs = bcargs.pop('obs_dataset',None)
        if bc_obs is None: raise ArgumentError("Need to specify observational dataset to use bias correction!")
        bc_reference = bcargs.pop('reference',None)
        if bc_reference is None: # infer from experiment name
            if dataset_name[-5:] in ('-2050','-2100'): bc_reference = dataset_name[:-5] # cut of period indicator and hope for the best 
            else: bc_reference = dataset_name 
        bc_grid = bcargs.pop('grid',None)
        if bc_grid is None: bc_grid = dataargs.grid
        bc_domain = bcargs.pop('domain',None)
        if bc_domain is None: bc_domain = domain
        bc_varlist = bcargs.pop('varlist',None)
        bc_varmap = bcargs.pop('varmap',None)       
        bc_tag = bcargs.pop('tag',None) # an optional name extension/tag
        bc_pattern = bcargs.pop('file_pattern',None) # usually default in getPickleFile
        lgzip = bcargs.pop('lgzip',None) # if pickle is gzipped (None: auto-detect based on file name extension)
        # get name of pickle file (and folder)
        picklefolder = dataargs.avgfolder.replace(dataset_name,bc_reference)
        picklefile = getPickleFileName(method=bc_method, obs_name=bc_obs, gridstr=bc_grid, domain=bc_domain, 
                                       tag=bc_tag, pattern=bc_pattern)
        picklepath = '{:s}/{:s}'.format(picklefolder,picklefile)
        if lgzip:
            picklepath += '.gz' # add extension
            if not os.path.exists(picklepath): raise IOError(picklepath)
        elif lgzip is None:
            lgzip = False
            if not os.path.exists(picklepath):
                lgzip = True # assume gzipped file
                picklepath += '.gz' # try with extension...
                if not os.path.exists(picklepath): raise IOError(picklepath)
        elif not os.path.exists(picklepath): raise IOError(picklepath)
        pickleage = datetime.fromtimestamp(os.path.getmtime(picklepath))
        # determine age of pickle file and compare against source age
    else:
      bc_method = False 
      pickleage = srcage
    
    # parse export options
    expargs = expargs.copy() # first copy, then modify...
    lm3 = expargs.pop('lm3') # convert kg/m^2/s to m^3/m^2/s (water flux)
    expformat = expargs.pop('format') # needed to get FileFormat object
    exp_list= expargs.pop('exp_list') # this handled outside of export
    compute_list = expargs.pop('compute_list', []) # variables to be (re-)computed - by default all
    # initialize FileFormat class instance
    fileFormat = getFileFormat(expformat, bc_method=bc_method, **expargs)
    # get folder for target dataset and do some checks
    expname = '{:s}_d{:02d}'.format(dataset_name,domain) if domain else dataset_name
    expfolder = fileFormat.defineDataset(dataset=dataset, mode=mode, dataargs=dataargs, lwrite=True, ldebug=ldebug)
  
    # prepare destination for new dataset
    lskip = fileFormat.prepareDestination(srcage=max(srcage,pickleage), loverwrite=loverwrite)
  
    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:        
        # print message
        skipmsg =  "\n{:s}   >>>   Skipping: Format '{:s} for dataset '{:s}' already exists and is newer than source file.".format(pidstr,expformat,dataset_name)
        skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,expfolder)
        logger.info(skipmsg)              
    else:
            
      ## actually load datasets
      source = loadfct() # load source data
      # check period
      if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute
          raise DateError, "Specifed period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr,source.atts.period)
      
      # load BiasCorrection object from pickle
      if bc_method:      
          op = gzip.open if lgzip else open
          with op(picklepath, 'r') as filehandle:
              BC = pickle.load(filehandle) 
          # assemble logger entry
          bcmsgstr = "(performing bias-correction using {:s} from {:s} towards {:s})".format(BC.long_name,bc_reference,bc_obs)
      
      # print message
      if mode == 'climatology': opmsgstr = 'Exporting Climatology ({:s}) to {:s} Format'.format(periodstr, expformat)
      elif mode == 'time-series': opmsgstr = 'Exporting Time-series to {:s} Format'.format(expformat)
      elif mode[-5:] == '-mean': opmsgstr = 'Exporting {:s}-Mean ({:s}) to {:s} Format'.format(mode[:-5], periodstr, expformat)
      else: raise NotImplementedError, "Unrecognized Mode: '{:s}'".format(mode)        
      # print feedback to logger
      logmsg = '\n{0:s}   ***   {1:^65s}   ***   \n{0:s}   ***   {2:^65s}   ***   \n'.format(pidstr,datamsgstr,opmsgstr)
      if bc_method:
          logmsg += "{0:s}   ***   {1:^65s}   ***   \n".format(pidstr,bcmsgstr)
      logger.info(logmsg)
      if not lparallel and ldebug: logger.info('\n'+str(source)+'\n')
      
      # create GDAL-enabled target dataset
      sink = Dataset(axes=(source.xlon,source.ylat), name=expname, title=source.title, atts=source.atts.copy())
      addGDALtoDataset(dataset=sink, griddef=source.griddef)
      assert sink.gdal, sink
      
      # apply bias-correction
      if bc_method:
          source = BC.correct(source, asNC=False, varlist=bc_varlist, varmap=bc_varmap) # load bias-corrected variables into memory
        
      # N.B.: for variables that are not bias-corrected, data are not loaded immediately but on demand; this way 
      #       I/O and computing can be further disentangled and not all variables are always needed
      
      # compute intermediate variables, if necessary
      for varname in exp_list:
          variables = None # variable list
          var = None
          # (re-)compute variable, if desired...
          if varname in compute_list:
              if varname == 'precip': var = newvars.computeTotalPrecip(source)
              elif varname == 'waterflx': var = newvars.computeWaterFlux(source)
              elif varname == 'liqwatflx': var = newvars.computeLiquidWaterFlux(source)
              elif varname == 'netrad': var = newvars.computeNetRadiation(source, asVar=True)
              elif varname == 'netrad_bb': var = newvars.computeNetRadiation(source, asVar=True, lrad=False, name='netrad_bb')
              elif varname == 'netrad_bb0': var = newvars.computeNetRadiation(source, asVar=True, lrad=False, lA=False, name='netrad_bb0')
              elif varname == 'vapdef': var = newvars.computeVaporDeficit(source)
              elif varname in ('pet','pet_pm','petrad','petwnd') and 'pet' not in sink:
                  if 'petrad' in exp_list or 'petwnd' in exp_list:
                      variables = newvars.computePotEvapPM(source, lterms=True) # default; returns mutliple PET terms
                  else: var = newvars.computePotEvapPM(source, lterms=False) # returns only PET
              elif varname == 'pet_th': var = None # skip for now
                  #var = computePotEvapTh(source) # simplified formula (less prerequisites)
          # ... otherwise load from source file
          if var is None and variables is None and varname in source:
              var = source[varname].load() # load data (may not have to load all)
          #else: raise VariableError, "Unsupported Variable '{:s}'.".format(varname)
          # for now, skip variables that are None
          if var or variables:
              # handle lists as well
              if var and variables: raise VariableError, (var,variables)
              elif var: variables = (var,)
              for var in variables:
                  addGDALtoVar(var=var, griddef=sink.griddef)
                  if not var.gdal and isinstance(fileFormat,ASCII_raster):
                      raise GDALError, "Exporting to ASCII_raster format requires GDAL-enabled variables."
                  # add to new dataset
                  sink += var
      # convert units
      if lm3:
          for var in sink:
              if var.units == 'kg/m^2/s':
                  var /= 1000. # divide to get m^3/m^2/s
                  var.units = 'm^3/m^2/s' # update units
      
      # compute seasonal mean if we are in mean-mode
      if mode[-5:] == '-mean': 
          sink = sink.seasonalMean(season=mode[:-5], lclim=True)
          # N.B.: to remain consistent with other output modes, 
          #       we need to prevent renaming of the time axis
          sink = concatDatasets([sink,sink], axis='time', lensembleAxis=True)
          sink.squeeze() # we need the year-axis until now to distinguish constant fields; now remove
      
      # print dataset
      if not lparallel and ldebug:
          logger.info('\n'+str(sink)+'\n')
        
      # export new dataset to selected format
      fileFormat.exportDataset(sink)
        
      # write results to file
      writemsg =  "\n{:s}   >>>   Export of Dataset '{:s}' to Format '{:s}' complete.".format(pidstr,expname, expformat)
      writemsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,expfolder)
      logger.info(writemsg)      
         
      # clean up and return
      source.unload(); #del source
      return 0 # "exit code"